I have a data frame with 96 observations and 1106 variables.
I would like to run logistic regression on the observations by leaving one out, one at a time. (So for the first set of observations there would be 95 total with the first observation removed, the second set of observations there would be 95 total with the second observation removed, and so forth so that there are 95 sets of observations that each have one observation left out.)
In addition, I would like to run each set of these observations on only one variable at a time. After running the regression for 95 observations on one variable, I would like to extract the p-value (leaving out the intercept p-value).
I have been able to do all of this manually, one at a time. However it is very tedious to do this 96 times and I'm sure there must be a way to automate this with a loop or multiple loops.
Here is a demonstration of how I have been doing this manually for 10 observations.
## Create 10 data frames by removing one observation from each ##
di.1 <- mainDF [-1,]
di.2 <- mainDF [-2,]
di.3 <- mainDF [-3,]
di.4 <- mainDF [-4,]
di.5 <- mainDF [-5,]
di.6 <- mainDF [-6,]
di.7 <- mainDF [-7,]
di.8 <- mainDF [-8,]
di.9 <- mainDF [-9,]
di.10 <- mainDF [-10,]
## Create data frames to put each p-value result in ##
dt.1 <- data.frame(intercept=numeric(), gene=numeric(), stringsAsFactors=FALSE)
dt.2 <- data.frame(intercept=numeric(), gene=numeric(), stringsAsFactors=FALSE)
dt.3 <- data.frame(intercept=numeric(), gene=numeric(), stringsAsFactors=FALSE)
dt.4 <- data.frame(intercept=numeric(), gene=numeric(), stringsAsFactors=FALSE)
dt.5 <- data.frame(intercept=numeric(), gene=numeric(), stringsAsFactors=FALSE)
dt.6 <- data.frame(intercept=numeric(), gene=numeric(), stringsAsFactors=FALSE)
dt.7 <- data.frame(intercept=numeric(), gene=numeric(), stringsAsFactors=FALSE)
dt.8 <- data.frame(intercept=numeric(), gene=numeric(), stringsAsFactors=FALSE)
dt.9 <- data.frame(intercept=numeric(), gene=numeric(), stringsAsFactors=FALSE)
dt.10 <- data.frame(intercept=numeric(), gene=numeric(), stringsAsFactors=FALSE)
## Run logistic regression on each data frame with one one obs. left out ##
## GLM run on one variable at a time##
## Extract p-values and put in separate dfs ##
for (i in 2:1106)
{
formulas <- glm(response ~ di.1[,i], data=di.1, family= "binomial")
dt.1[i,] <- coef(summary(formulas))[,4]
}
for (i in 2:1106)
{
formulas <- glm(response ~ di.2[,i], data=di.2, family= "binomial")
dt.2[i,] <- coef(summary(formulas))[,4]
}
for (i in 2:1106)
{
formulas <- glm(response ~ di.3[,i], data=di.3, family= "binomial")
dt.3[i,] <- coef(summary(formulas))[,4]
}
for (i in 2:1106)
{
formulas <- glm(response ~ di.4[,i], data=di.4, family= "binomial")
dt.4[i,] <- coef(summary(formulas))[,4]
}
for (i in 2:1106)
{
formulas <- glm(response ~ di.5[,i], data=di.5, family= "binomial")
dt.5[i,] <- coef(summary(formulas))[,4]
}
for (i in 2:1106)
{
formulas <- glm(response ~ di.6[,i], data=di.6, family= "binomial")
dt.6[i,] <- coef(summary(formulas))[,4]
}
for (i in 2:1106)
{
formulas <- glm(response ~ di.7[,i], data=di.7, family= "binomial")
dt.7[i,] <- coef(summary(formulas))[,4]
}
for (i in 2:1106)
{
formulas <- glm(response ~ di.8[,i], data=di.8, family= "binomial")
dt.8[i,] <- coef(summary(formulas))[,4]
}
for (i in 2:1106)
{
formulas <- glm(response ~ di.9[,i], data=di.9, family= "binomial")
dt.9[i,] <- coef(summary(formulas))[,4]
}
for (i in 2:1106)
{
formulas <- glm(response ~ di.10[,i], data=di.10, family= "binomial")
dt.10[i,] <- coef(summary(formulas))[,4]
}
## Remove intercept p-values ##
dt.1<- dt.1[-c(1)]
dt.2<- dt.2[-c(1)]
dt.3<- dt.3[-c(1)]
dt.4<- dt.4[-c(1)]
dt.5<- dt.5[-c(1)]
dt.6<- dt.6[-c(1)]
dt.7<- dt.7[-c(1)]
dt.8<- dt.8[-c(1)]
dt.9<- dt.9[-c(1)]
dt.10<- dt.10[-c(1)]
## Export data frames, then manually copy and paste them into one CSV ##
write.csv(dt.1, file = "MyData.csv")
write.csv(dt.2, file = "MyData2.csv")
write.csv(dt.3, file = "MyData3.csv")
write.csv(dt.4, file = "MyData4.csv")
write.csv(dt.5, file = "MyData5.csv")
write.csv(dt.6, file = "MyData6.csv")
write.csv(dt.7, file = "MyData7.csv")
write.csv(dt.8, file = "MyData8.csv")
write.csv(dt.9, file = "MyData9.csv")
write.csv(dt.10, file = "MyData10.csv")
I would like to know how I could do all of this work without having to go through each observation one at a time.
Here is a chunk of the data that I am using:
Response X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
P1 N 1 1 1 0 1 0 1 0 2 2
P2 N 2 1 1 0 2 2 1 2 2 2
P3 N 2 1 2 1 1 0 1 1 0 1
P4 Y 1 1 2 0 1 0 0 1 1 1
P5 N 2 2 1 1 1 0 0 0 1 1
P6 N 2 1 2 1 1 0 0 0 2 1
P7 Y 2 1 1 0 2 0 0 0 2 0
P8 Y 2 1 1 0 2 0 0 1 0 2
P9 N 1 1 1 0 2 0 0 0 1 0
P10 N 2 1 2 1 1 0 1 0 0 2
Thank you very much for your time!