R split dataframe

2020-04-21 07:31发布

问题:

I have a dataset like this

alpha number  fr color
1     a   20 0.8   rot
2     a   21 2.0   rot
3     a    2 0.8   rot
4     a   34 0.8   rot
5     f   42 0.5  grün .......
......................

Now I want to split the observations of this dataset into more observations, based on a condition such as number<20, so that the new dataset looks like

alpha number  fr color
1     a   19 0.8   rot
2     a   1  0.8   rot
3     a   10 2.0   rot
4     a   11 2.0   rot
5     a    2 0.8   rot
6     a   19 0.8   rot
7     a   15 0.8   rot
8     f   7  0.5  grün 
9     f   7  0.5  grün 
10     f   7  0.5  grün 
11    f   7  0.5  grün 
12     f   7  0.5  grün 
13    f   7  0.5  grün 
 .......

or similar. Keep splitting an observation for as long as the condition is not true. How the number is split doesn't matter, but the other variables must keep the same values in every row created from one observation. Thank you.

回答1:

# Example data from the question: five observations, four variables.
df1 <- data.frame(
  alpha = c("a", "a", "a", "a", "f"),
  number = c(20L, 21L, 2L, 34L, 42L),
  fr = c(0.8, 2, 0.8, 0.8, 0.5),
  color = c("rot", "rot", "rot", "rot", "grun"),
  stringsAsFactors = FALSE
)
# Repeat `x` `t` times. A count of zero yields a single NA placeholder
# instead of an empty vector.
rep.rev <- function(x, t) {
  if (t == 0) {
    return(NA_integer_)
  }
  rep(x, t)
}
library(dplyr)
library(tidyr)
# Fix the seed so the random divisors, and with them the output, are
# reproducible.
set.seed(22)
df1 %>%
  # Draw a whole-number divisor between 2 and 18 for every row, so each piece
  # produced from it is below 20.
  mutate(
    divisor = floor(runif(n(), min = 2, max = 19)),
    quotient = number %/% divisor,
    # A zero remainder becomes NA so that it can be filtered out at the end.
    remainder = ifelse(number %% divisor == 0, NA, number %% divisor)
  ) %>%
  # Build the pieces row by row: `quotient` copies of the divisor followed by
  # the remainder, stored as a list column.
  rowwise() %>%
  mutate(number = list(c(rep.rev(divisor, quotient), remainder))) %>%
  ungroup() %>%
  # One output row per piece; the other columns are repeated unchanged.
  # `cols` is named explicitly because calling unnest() without it is
  # deprecated.
  unnest(cols = number) %>%
  select(alpha, number, fr, color) %>%
  # Drop the NA placeholders (zero quotient or zero remainder).
  filter(!is.na(number))
#> # A tibble: 14 x 4
#>    alpha number    fr color
#>    <chr>  <dbl> <dbl> <chr>
#>  1 a          7   0.8 rot  
#>  2 a          7   0.8 rot  
#>  3 a          6   0.8 rot  
#>  4 a         10   2   rot  
#>  5 a         10   2   rot  
#>  6 a          1   2   rot  
#>  7 a          2   0.8 rot  
#>  8 a         10   0.8 rot  
#>  9 a         10   0.8 rot  
#> 10 a         10   0.8 rot  
#> 11 a          4   0.8 rot  
#> 12 f         16   0.5 grun 
#> 13 f         16   0.5 grun 
#> 14 f         10   0.5 grun


回答2:

There are many ways we could split a number >=20, but the method below divides each number in half, i.e. even numbers (m=2n) split into n and n, and odd numbers (m=2n+1) split into (n+1) and n.

> library(dplyr)
# Example data from the question. String columns are kept as character
# vectors regardless of the R version's default.
df <- data.frame(
  alpha = c("a", "a", "a", "a", "f"),
  number = c(20, 21, 2, 34, 42),
  fr = c(0.8, 2.0, 0.8, 0.8, 0.5),
  color = c("rot", "rot", "rot", "rot", "grün"),
  stringsAsFactors = FALSE
)

The function doSplit() takes a dataframe df and an integer threshold as arguments.

# Perform one splitting pass over `df`.
#
# Every row with number >= threshold is replaced by two rows that carry the
# two halves of `number` (n %/% 2 + n %% 2 and n %/% 2); all other columns are
# copied unchanged. Rows with number < threshold are returned as they are.
# Only a single pass is made, so a half can still be >= threshold; call the
# function repeatedly until no such row remains.
#
# df:        data frame with a numeric column `number`.
# threshold: rows with number >= threshold are split.
# Returns a data frame with the same columns as `df`, ordered by the position
# of the originating row in `df`.
doSplit <- function(df, threshold) {
  colNames <- colnames(df)

  # Remember each row's position as a number. Row names are character, and
  # sorting on them would place "10" before "2" once there are ten or more
  # rows.
  df <- df %>% mutate(orig_id = seq_len(nrow(df)))

  dfBelow <- df %>% filter(number < threshold)
  dfAbove <- df %>% filter(number >= threshold)
  dfAbove1 <- dfAbove %>% mutate(number = (number %/% 2) + (number %% 2))
  dfAbove2 <- dfAbove %>% mutate(number = number %/% 2)

  combData <- rbind(dfBelow, dfAbove1, dfAbove2) %>% arrange(orig_id)

  # Drop the helper column again by selecting the original columns by name.
  combData[colNames]
}

Here, we define a threshold of 20. The while loop repeatedly calls the doSplit() function so long as a row exists with number >=20.

myThreshold <- 20
# Halving only shrinks whole numbers of 2 or more: a value of 1 splits into
# 1 and 0, so a threshold of 1 or less would make the loop below run forever.
stopifnot(myThreshold > 1)

splitDf <- df
# Each pass halves every row that is still at or above the threshold; repeat
# until no such row is left.
while (any(splitDf$number >= myThreshold, na.rm = TRUE)) {
  splitDf <- doSplit(splitDf, myThreshold)
}

Here is the split dataframe:

> splitDf
   alpha number  fr color
1      a     10 0.8   rot
2      a     10 0.8   rot
3      a     11 2.0   rot
4      a     10 2.0   rot
5      a      2 0.8   rot
6      a     17 0.8   rot
7      a     17 0.8   rot
8      f     11 0.5  grün
9      f     10 0.5  grün
10     f     11 0.5  grün
11     f     10 0.5  grün