Use checked elements for training a machine learni

2019-02-18 03:20发布

问题:

For training purposes I would like to make a Shiny application where you can to check column names and use those for training a random forest algorithm.

My Shiny Application looks like this:

library(shiny)
library(DT)
library(titanic)
library(randomForest)

ui <- fluidPage(

  DT::dataTableOutput("mytable"),
  checkboxInput("checkbox" , label = "Pclass", value = FALSE),
  checkboxInput("checkbox" , label = "Sex", value = FALSE),
  checkboxInput("checkbox" , label = "Age", value = FALSE),
  actionButton("runRF", "Predict"),
  plotOutput("plotRF")
)

server <- function(input, output, session) {

  output$mytable = DT::renderDataTable({
    titanic_train
  })

  observeEvent(input$runRF, {

    var = c("Pclass")

    fit <- randomForest(as.factor(Survived) ~ var, data = titanic_train, importance = TRUE, ntree=2000)
    prediction <- as.numeric(predict(fit, titanic_test))
    titanic_test$predicted <- prediction  

    output$plotRF <- renderPlot({
      hist(prediction)
    })
  })
}

shinyApp(ui, server)

Basically the code above works when I do something like:

fit <- randomForest(as.factor(Survived) ~ Age, data = titanic_train, importance = TRUE, ntree=2000)

Or

fit <- randomForest(as.factor(Survived) ~ Pclass + Age, data = titanic_train, importance = TRUE, ntree=2000)

However I would like to make the training varibles dependent or the boxes you check. So if you check Age + Pclass it should be:

fit <- randomForest(as.factor(Survived) ~ Pclass + Age, data = titanic_train, importance = TRUE, ntree=2000)

If you check Age:

    fit <- randomForest(as.factor(Survived) ~ Age, data = titanic_train, importance = TRUE, ntree=2000)

I assume I have to make a list where I store the "checked values" like:

var = c(checkElement1)

However this gives me the following erorr:

Warning: Error in model.frame.default: variable lengths differ (found for 'var')
Stack trace (innermost first):
    74: model.frame.default
    73: model.frame
    72: eval
    71: eval
    70: randomForest.formula
    69: randomForest
    68: observeEventHandler [#11]
     4: <Anonymous>
     3: do.call
     2: print.shiny.appobj
     1: <Promise>

Any thoughts where this goes wrong?

回答1:

Here is a possible solution. Instead of creating a formula, it may be easier to pass values for x and y, so we can call the randomForest as:

randomForest(x = titanic_train[,input$myselection,drop=FALSE], 
                          y = as.factor(titanic_train$Survived), 
                          importance = TRUE, 
                          ntree=2000)

Where drop=FALSE makes sure we still pass a data.frame instead of a vector when only one column is selected, and input$myselection is a vector of the selected columns. You could construct that from separate checkBox elements, but instead I created a single checkboxGroupInput that contains all possible columns that the user should be able to select.

Also, it is bad practice to create a reactive or output from within an observer. See this slide and the two after it from a presentation by Joe Cheng. In this case, we can store our predictions along with the test dataset in a reactiveVal called my_prediction, which we can use for our plot and other statistics.

I added some simple preprocessing in the code below, e.g. character columns should become factors, and in that case some columns have too many factor values so i drop those columns. But I assume this dataset is just for illustration purposes and this is no issue since you you have already modified your real dataset to work properly. I am just a little too OCD to present an example that returns errors when used :)

Hope this helps!

library(shiny)
library(DT)
library(titanic)
library(randomForest)

# Replace NA's and replace String with Factor columns
# There may be nicer ways to do this though.
titanic_train[is.na(titanic_train)] <- 0
titanic_test[is.na(titanic_test)] <- 0
titanic_train[sapply(titanic_train, is.character)] <- lapply(titanic_train[sapply(titanic_train, is.character)], 
                                                             as.factor)
titanic_test[sapply(titanic_test, is.character)] <- lapply(titanic_test[sapply(titanic_test, is.character)], 
                                                           as.factor)

# drop columns with too many factor levels
to_drop=sapply(colnames(titanic_train)[sapply(titanic_train,class)=='factor'],function(x) {length(levels(titanic_train[,x]))>52})
if(sum(to_drop)>0){
titanic_train <- titanic_train[,-which(names(titanic_train) %in% names(to_drop)[to_drop])]
titanic_test <- titanic_test[,-which(names(titanic_test) %in% names(to_drop)[to_drop])]
}


ui <- fluidPage(
  DT::dataTableOutput("mytable"),
  checkboxGroupInput('myselection','Select columns:',
                     choices=setdiff(colnames(titanic_train),c('PassengerId','Survived','Name')),
                     inline=T),
  actionButton("runRF", "Predict"),
  plotOutput("plotRF")
)

server <- function(input, output, session) {

  output$mytable = DT::renderDataTable({
    titanic_train
  })

  observeEvent(input$runRF, {
    if(is.null(input$myselection))
    {
      my_prediction(NULL)
      showModal(modalDialog(
        title = "Error!",
        "No variables selected!"
      ))
    }
    else
    {
      fit <- randomForest(x = titanic_train[,input$myselection,drop=FALSE], 
                          y = as.factor(titanic_train$Survived), 
                          importance = TRUE, 
                          ntree=2000)
      prediction <- as.numeric(predict(fit, titanic_test[,input$myselection,drop=FALSE]))
      titanic_test$predicted <- prediction  
      my_prediction(titanic_test) # store our test set with predicted valus in reactiveVal

    }
  })

  # A reactiveVal to store titanic_test with its predictions.
  my_prediction <- reactiveVal()
  output$plotRF <- renderPlot({
    req(my_prediction())
    hist(my_prediction()$predicted)
  })
}

shinyApp(ui, server)


回答2:

We need to create a formula using paste

var = "Pclass"

form <- formula(paste('as.factor(Survived)', var, sep=' ~ '))
fit <- randomForest(form, data = titanic_train, importance = TRUE, ntree=2000)

-full code

library(shiny)
library(DT)
library(titanic)
library(randomForest)

ui <- fluidPage(

  DT::dataTableOutput("mytable"),
  checkboxInput("checkbox" , label = "Pclass", value = FALSE),
  checkboxInput("checkbox" , label = "Sex", value = FALSE),
  checkboxInput("checkbox" , label = "Age", value = FALSE),
  actionButton("runRF", "Predict"),
  plotOutput("plotRF")
)

server <- function(input, output, session) {

  output$mytable = DT::renderDataTable({
    titanic_train
  })

  observeEvent(input$runRF, {

    var = "Pclass"

    form <- formula(paste('as.factor(Survived)', var, sep=' ~ '))
    fit <- randomForest(form, data = titanic_train, importance = TRUE, ntree=2000)
    prediction <- as.numeric(predict(fit, titanic_test))
    titanic_test$predicted <- prediction  

    output$plotRF <- renderPlot({
      hist(prediction)
    })
  })
}

shinyApp(ui, server)

-output