For training purposes I would like to make a Shiny application where you can to check column names and use those for training a random forest algorithm.
My Shiny Application looks like this:
library(shiny)
library(DT)
library(titanic)
library(randomForest)
ui <- fluidPage(
DT::dataTableOutput("mytable"),
checkboxInput("checkbox" , label = "Pclass", value = FALSE),
checkboxInput("checkbox" , label = "Sex", value = FALSE),
checkboxInput("checkbox" , label = "Age", value = FALSE),
actionButton("runRF", "Predict"),
plotOutput("plotRF")
)
server <- function(input, output, session) {
output$mytable = DT::renderDataTable({
titanic_train
})
observeEvent(input$runRF, {
var = c("Pclass")
fit <- randomForest(as.factor(Survived) ~ var, data = titanic_train, importance = TRUE, ntree=2000)
prediction <- as.numeric(predict(fit, titanic_test))
titanic_test$predicted <- prediction
output$plotRF <- renderPlot({
hist(prediction)
})
})
}
shinyApp(ui, server)
Basically the code above works when I do something like:
fit <- randomForest(as.factor(Survived) ~ Age, data = titanic_train, importance = TRUE, ntree=2000)
Or
fit <- randomForest(as.factor(Survived) ~ Pclass + Age, data = titanic_train, importance = TRUE, ntree=2000)
However I would like to make the training varibles dependent or the boxes you check. So if you check Age + Pclass it should be:
fit <- randomForest(as.factor(Survived) ~ Pclass + Age, data = titanic_train, importance = TRUE, ntree=2000)
If you check Age:
fit <- randomForest(as.factor(Survived) ~ Age, data = titanic_train, importance = TRUE, ntree=2000)
I assume I have to make a list where I store the "checked values" like:
var = c(checkElement1)
However this gives me the following erorr:
Warning: Error in model.frame.default: variable lengths differ (found for 'var')
Stack trace (innermost first):
74: model.frame.default
73: model.frame
72: eval
71: eval
70: randomForest.formula
69: randomForest
68: observeEventHandler [#11]
4: <Anonymous>
3: do.call
2: print.shiny.appobj
1: <Promise>
Any thoughts where this goes wrong?
Here is a possible solution. Instead of creating a formula, it may be easier to pass values for x
and y
, so we can call the randomForest
as:
randomForest(x = titanic_train[,input$myselection,drop=FALSE],
y = as.factor(titanic_train$Survived),
importance = TRUE,
ntree=2000)
Where drop=FALSE
makes sure we still pass a data.frame
instead of a vector when only one column is selected, and input$myselection
is a vector of the selected columns. You could construct that from separate checkBox
elements, but instead I created a single checkboxGroupInput
that contains all possible columns that the user should be able to select.
Also, it is bad practice to create a reactive or output from within an observer. See this slide and the two after it from a presentation by Joe Cheng. In this case, we can store our predictions along with the test dataset in a reactiveVal called my_prediction
, which we can use for our plot and other statistics.
I added some simple preprocessing in the code below, e.g. character columns should become factors, and in that case some columns have too many factor values so i drop those columns. But I assume this dataset is just for illustration purposes and this is no issue since you you have already modified your real dataset to work properly. I am just a little too OCD to present an example that returns errors when used :)
Hope this helps!
library(shiny)
library(DT)
library(titanic)
library(randomForest)
# Replace NA's and replace String with Factor columns
# There may be nicer ways to do this though.
titanic_train[is.na(titanic_train)] <- 0
titanic_test[is.na(titanic_test)] <- 0
titanic_train[sapply(titanic_train, is.character)] <- lapply(titanic_train[sapply(titanic_train, is.character)],
as.factor)
titanic_test[sapply(titanic_test, is.character)] <- lapply(titanic_test[sapply(titanic_test, is.character)],
as.factor)
# drop columns with too many factor levels
to_drop=sapply(colnames(titanic_train)[sapply(titanic_train,class)=='factor'],function(x) {length(levels(titanic_train[,x]))>52})
if(sum(to_drop)>0){
titanic_train <- titanic_train[,-which(names(titanic_train) %in% names(to_drop)[to_drop])]
titanic_test <- titanic_test[,-which(names(titanic_test) %in% names(to_drop)[to_drop])]
}
ui <- fluidPage(
DT::dataTableOutput("mytable"),
checkboxGroupInput('myselection','Select columns:',
choices=setdiff(colnames(titanic_train),c('PassengerId','Survived','Name')),
inline=T),
actionButton("runRF", "Predict"),
plotOutput("plotRF")
)
server <- function(input, output, session) {
output$mytable = DT::renderDataTable({
titanic_train
})
observeEvent(input$runRF, {
if(is.null(input$myselection))
{
my_prediction(NULL)
showModal(modalDialog(
title = "Error!",
"No variables selected!"
))
}
else
{
fit <- randomForest(x = titanic_train[,input$myselection,drop=FALSE],
y = as.factor(titanic_train$Survived),
importance = TRUE,
ntree=2000)
prediction <- as.numeric(predict(fit, titanic_test[,input$myselection,drop=FALSE]))
titanic_test$predicted <- prediction
my_prediction(titanic_test) # store our test set with predicted valus in reactiveVal
}
})
# A reactiveVal to store titanic_test with its predictions.
my_prediction <- reactiveVal()
output$plotRF <- renderPlot({
req(my_prediction())
hist(my_prediction()$predicted)
})
}
shinyApp(ui, server)
We need to create a formula
using paste
var = "Pclass"
form <- formula(paste('as.factor(Survived)', var, sep=' ~ '))
fit <- randomForest(form, data = titanic_train, importance = TRUE, ntree=2000)
-full code
library(shiny)
library(DT)
library(titanic)
library(randomForest)
ui <- fluidPage(
DT::dataTableOutput("mytable"),
checkboxInput("checkbox" , label = "Pclass", value = FALSE),
checkboxInput("checkbox" , label = "Sex", value = FALSE),
checkboxInput("checkbox" , label = "Age", value = FALSE),
actionButton("runRF", "Predict"),
plotOutput("plotRF")
)
server <- function(input, output, session) {
output$mytable = DT::renderDataTable({
titanic_train
})
observeEvent(input$runRF, {
var = "Pclass"
form <- formula(paste('as.factor(Survived)', var, sep=' ~ '))
fit <- randomForest(form, data = titanic_train, importance = TRUE, ntree=2000)
prediction <- as.numeric(predict(fit, titanic_test))
titanic_test$predicted <- prediction
output$plotRF <- renderPlot({
hist(prediction)
})
})
}
shinyApp(ui, server)
-output