Collapse duplicated rows with different values in

2020-08-01 06:31发布

问题:

In my data frame there are rows with same ID but different values for test year and age. I'd like to collapse the duplicate rows and create new columns for the different values.

I'm new in R and have been struggling with it for a while.

This is the data frame:

>df
    id     project     testyr1      testyr2    age1     age2
1   16S       AS       2008         NA         29       NA 
2   32S       AS       2004         NA         30       NA 
3   37S       AS       NA           2011       NA       36 
4   50S       AS       2004         NA         23       NA 
5   50S       AS       1998         NA         16       NA 
6   55S       AS       2007         NA         28       NA 

testyr1 should have the earliest year and testyr2 the latest year. age1 should be the younger age and age2 the older age.

The output should be:

      id   project    testyr1   testyr2   age1   age2   
1     16S       AS    2008      NA        29     NA  
2     32S       AS    2004      NA        30     NA  
3     37S       AS    NA        2011      NA     36  
4     50S       AS    1998      2004      16     23  
6     55S       AS    2007      NA        28     NA  

I tried to write a loop but don't know how to end it:

df.undup <- c()
df.undup <- c()    
for (i in 1:nrow(df)){   
  if i == i+1    
    df$testyr1 != NA {   

    testyr2 = max(testyr1)   
    testyr1 = min(testyr1)   
    nage2 = max(nage1)   
    nage1 = min(nage1)   
  }   
 else{   
    testyr2 = max(testyr2)   
    testyr1 = min(testyr2)   
    nage2 = max(nage2)   
    nage1 = min(nage2)   
  }   
}   

Any help would be greatly appreciated.

回答1:

library(plyr)

data <- read.csv(textConnection("id,project,testyr1,testyr2,age1,age2
16S,AS,2008,NA,29,NA
32S,AS,2004,NA,30,NA
37S,AS,NA,2011,NA,36
50S,AS,2004,NA,23,NA
50S,AS,1998,NA,16,NA
55S,AS,2007,NA,28,NA"))


new_data <- ddply(data, .(id), function(x) {
  return(data.frame(id = unique(x$id), project = unique(x$project), 
    testyr1 = min(x$testyr1), 
    testyr2 = max(x$testyr2), age1= min(x$age1), age2 = max(x$age2)))
    })

> new_data

    id project testyr1 testyr2 age1 age2
1 16S      AS    2008      NA   29   NA
2 32S      AS    2004      NA   30   NA
3 37S      AS      NA    2011   NA   36
4 50S      AS    2004      NA   23   NA
5 50S      AS    1998      NA   16   NA
6 55S      AS    2007      NA   28   NA

# But your result example suggests you want the lowest 
# of testyr to be in testyr1 and the highest of the combined
# testyrs to be in testyr2. Same logic for ages.
# If so, the one below should work:

new_data <- ddply(data, .(id), function(x) {
    if(dim(x)[1]>1) {
    years <- c(x$testyr1, x$testyr2)
    ages <-  c(x$age1, x$age2)
    return(data.frame(id = unique(x$id), project = unique(x$project), 
        testyr1 = min(years, na.rm=T), testyr2 = max(years , na.rm=T), 
        age1= min(ages, na.rm=T), age2 = max(ages, na.rm=T)))   
    } else {
    return(data.frame(id = unique(x$id), project = unique(x$project), 
        testyr1 = x$testyr1, testyr2 = x$testyr2, 
        age1= x$age1, age2 = x$age2)) 
    }       
    })

> new_data
   id project testyr1 testyr2 age1 age2
1 16S      AS    2008      NA   29   NA
2 32S      AS    2004      NA   30   NA
3 37S      AS      NA    2011   NA   36
4 50S      AS    1998    2004   16   23
5 55S      AS    2007      NA   28   NA


回答2:

I really doubt this is the most effective way to do this, but my brain is not functioning at the moment.

temp = names(which(table(df$id) > 1))
temp1 = vector("list")
for (i in 1:length(temp)) {
  temp1[[i]] = df[df$id == temp[i], ]
  temp1[[i]] = data.frame(temp1[[i]][1, 1:2], 
                     testyr1 = min(temp1[[i]]$testyr1), 
                     testyr2 = max(temp1[[i]]$testyr1), 
                     age1 = min(temp1[[i]]$age1), 
                     age2 = max(temp1[[i]]$age1))
}

rbind(df[-c(which(df$id %in% temp)), ], do.call(rbind, temp1))
#    id project testyr1 testyr2 age1 age2
# 1 16S      AS    2008      NA   29   NA
# 2 32S      AS    2004      NA   30   NA
# 3 37S      AS      NA    2011   NA   36
# 6 55S      AS    2007      NA   28   NA
# 4 50S      AS    1998    2004   16   23

### rm(i, temp, temp1) ### Cleanup the workspace


标签: r