Why doesn't rle accept a factor as input?

2019-06-23 18:46发布

问题:

I'm having trouble applying the rle() function to the columns of a data.frame, even though the same call works fine on another data set. The failing call is:

fgroup <- aggregate(fevents2[,3:14], list(weeks = fevents2[, 1]), function(x) rle(x)$values)

Which yields the error:

Error in rle(x) : 'x' must be an atomic vector

Sample data:

> dput(fevents2[1:20,])
structure(list(weeks = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("1",
"2", "3", "4", "5", "6", "7"), class = "factor"), A1M.Date = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("2012-05-09", "2012-05-10", "2012-05-11",
"2012-05-14", "2012-05-15", "2012-05-17", "2012-05-18", "2012-05-21",
"2012-05-22", "2012-05-24", "2012-05-25", "2012-05-28", "2012-05-29",
"2012-05-30", "2012-05-31", "2012-06-04", "2012-06-05", "2012-06-07",
"2012-06-08", "2012-06-11", "2012-06-12", "2012-06-14", "2012-06-15",
"2012-06-18", "2012-06-19", "2012-06-21", "2012-06-22"), class = "factor"),
    vv = structure(c(8L, 8L, 8L, 20L, 24L, 24L, 24L, 1L, 13L,
    13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 24L), .Label = c("C AA",
    "C AJ", "C BB", "C BV", "C JA", "C JR", "C RJ", "C RR", "C RV",
    "C VB", "C VR", "C VV", "G AA", "G AJ", "G BB", "G BV", "G JA",
    "G JR", "G RJ", "G RR", "G RV", "G VB", "G VR", "nil"), class = "factor"),
    rv = structure(c(25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L,
    10L, 10L, 22L, 22L, 22L, 25L, 10L, 22L, 22L, 22L, 22L, 25L
    ), .Label = c("C AA", "C AJ", "C BB", "C BV", "C JA", "C JR",
    "C RJ", "C RR", "C RV", "C VB", "C VR", "C VV", "G AA", "G AJ",
    "G BB", "G BV", "G JA", "G JR", "G RJ", "G RR", "G RV", "G VB",
    "G VR", "G VV", "nil"), class = "factor"), ja = structure(c(12L,
    12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 25L, 25L,
    12L, 24L, 24L, 24L, 24L, 24L, 24L), .Label = c("C AA", "C AJ",
    "C BB", "C BV", "C JA", "C JR", "C RJ", "C RR", "C RV", "C VB",
    "C VR", "C VV", "G AA", "G AJ", "G BB", "G BV", "G JA", "G JR",
    "G RJ", "G RR", "G RV", "G VB", "G VR", "G VV", "nil"), class = "factor"),
    aa = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 25L, 25L,
    25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L), .Label = c("C AA",
    "C AJ", "C BB", "C BV", "C JA", "C JR", "C RJ", "C RR", "C RV",
    "C VB", "C VR", "C VV", "G AA", "G AJ", "G BB", "G BV", "G JA",
    "G JR", "G RJ", "G RR", "G RV", "G VB", "G VR", "G VV", "nil"
    ), class = "factor"), bv = structure(c(25L, 11L, 11L, 11L,
    23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L,
    23L, 23L, 23L, 23L), .Label = c("C AA", "C AJ", "C BB", "C BV",
    "C JA", "C JR", "C RJ", "C RR", "C RV", "C VB", "C VR", "C VV",
    "G AA", "G AJ", "G BB", "G BV", "G JA", "G JR", "G RJ", "G RR",
    "G RV", "G VB", "G VR", "G VV", "nil"), class = "factor"),
    aj = structure(c(7L, 7L, 7L, 25L, 25L, 25L, 25L, 25L, 9L,
    9L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 25L, 25L), .Label = c("C AA",
    "C AJ", "C BB", "C BV", "C JA", "C JR", "C RJ", "C RR", "C RV",
    "C VB", "C VR", "C VV", "G AA", "G AJ", "G BB", "G BV", "G JA",
    "G JR", "G RJ", "G RR", "G RV", "G VB", "G VR", "G VV", "nil"
    ), class = "factor"), vb = structure(c(1L, 1L, 1L, 25L, 25L,
    25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 2L,
    25L, 2L, 2L), .Label = c("C AA", "C AJ", "C BB", "C BV",
    "C JA", "C JR", "C RJ", "C RR", "C RV", "C VB", "C VR", "C VV",
    "G AA", "G AJ", "G BB", "G BV", "G JA", "G JR", "G RJ", "G RR",
    "G RV", "G VB", "G VR", "G VV", "nil"), class = "factor"),
    rj = structure(c(5L, 5L, 16L, 16L, 16L, 16L, 16L, 16L, 16L,
    16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L), .Label = c("C AA",
    "C AJ", "C BB", "C BV", "C JR", "C RJ", "C RR", "C RV", "C VB",
    "C VR", "C VV", "G AA", "G AJ", "G BB", "G BV", "G JR", "G RJ",
    "G RR", "G RV", "G VB", "G VR", "G VV", "nil"), class = "factor"),
    rr = structure(c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
    3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("C AA",
    "C AJ", "C BB", "C BV", "C JA", "C JR", "C RJ", "C RR", "C RV",
    "C VB", "C VR", "C VV", "G AA", "G AJ", "G BB", "G BV", "G JA",
    "G JR", "G RJ", "G RR", "G RV", "G VB", "G VR", "G VV", "nil"
    ), class = "factor"), vr = structure(c(5L, 5L, 5L, 25L, 25L,
    7L, 7L, 7L, 7L, 7L, 25L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
    7L), .Label = c("C AA", "C AJ", "C BB", "C BV", "C JA", "C JR",
    "C RJ", "C RR", "C RV", "C VB", "C VR", "C VV", "G AA", "G AJ",
    "G BB", "G BV", "G JA", "G JR", "G RJ", "G RR", "G RV", "G VB",
    "G VR", "G VV", "nil"), class = "factor"), bb = structure(c(4L,
    4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
    4L, 4L, 4L, 4L), .Label = c("C AA", "C AJ", "C BB", "C BV",
    "C JA", "C JR", "C RJ", "C RR", "C RV", "C VB", "C VR", "C VV",
    "G AA", "G AJ", "G BB", "G BV", "G JA", "G RJ", "G RR", "G RV",
    "G VB", "G VR", "G VV", "nil"), class = "factor"), jr = structure(c(25L,
    25L, 10L, 10L, 22L, 22L, 25L, 25L, 25L, 25L, 25L, 25L, 25L,
    25L, 25L, 25L, 5L, 5L, 5L, 5L), .Label = c("C AA", "C AJ",
    "C BB", "C BV", "C JA", "C JR", "C RJ", "C RR", "C RV", "C VB",
    "C VR", "C VV", "G AA", "G AJ", "G BB", "G BV", "G JA", "G JR",
    "G RJ", "G RR", "G RV", "G VB", "G VR", "G VV", "nil"), class = "factor")),
.Names = c("weeks",
"A1M.Date", "vv", "rv", "ja", "aa", "bv", "aj", "vb", "rj", "rr",
"vr", "bb", "jr"), row.names = c(NA, 20L), class = "data.frame")

Structure of data:

str(fevents2)
'data.frame':   1430 obs. of  14 variables:
 $ weeks   : Factor w/ 7 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ A1M.Date: Factor w/ 27 levels "2012-05-09","2012-05-10",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ vv      : Factor w/ 24 levels "C AA","C AJ",..: 8 8 8 20 24 24 24 1 13 13 ...
 $ rv      : Factor w/ 25 levels "C AA","C AJ",..: 25 25 25 25 25 25 25 25 10 10 ...
 $ ja      : Factor w/ 25 levels "C AA","C AJ",..: 12 12 12 12 12 12 12 12 12 12 ...
 $ aa      : Factor w/ 25 levels "C AA","C AJ",..: 2 2 2 2 2 2 2 2 25 25 ...
 $ bv      : Factor w/ 25 levels "C AA","C AJ",..: 25 11 11 11 23 23 23 23 23 23 ...
 $ aj      : Factor w/ 25 levels "C AA","C AJ",..: 7 7 7 25 25 25 25 25 9 9 ...
 $ vb      : Factor w/ 25 levels "C AA","C AJ",..: 1 1 1 25 25 25 25 25 25 25 ...
 $ rj      : Factor w/ 23 levels "C AA","C AJ",..: 5 5 16 16 16 16 16 16 16 16 ...
 $ rr      : Factor w/ 25 levels "C AA","C AJ",..: 3 3 3 3 3 3 3 3 3 3 ...
 $ vr      : Factor w/ 25 levels "C AA","C AJ",..: 5 5 5 25 25 7 7 7 7 7 ...
 $ bb      : Factor w/ 24 levels "C AA","C AJ",..: 4 4 4 4 4 4 4 4 4 4 ...
 $ jr      : Factor w/ 25 levels "C AA","C AJ",..: 25 25 10 10 22 22 25 25 25 25 ...
NULL

I understand that I have factors, but converting factors to numeric with

as.numeric(as.character(fevents2))

or:

sapply(fevents2, function(x) as.numeric(as.character(x)))

doesn't solve my issue:

Error in fevents3[, 3:14] : incorrect number of dimensions
In addition: Warning message:
In eval.with.vis(expr, envir, enclos) : NAs introduced by coercion

Here's a sample data.frame on which the rle function works:

    dput(fevents[1:20,])
structure(list(weeks = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1), A1M.Date = c("2012-05-09", "2012-05-09",
"2012-05-09", "2012-05-09", "2012-05-09", "2012-05-09", "2012-05-09",
"2012-05-09", "2012-05-09", "2012-05-09", "2012-05-09", "2012-05-09",
"2012-05-09", "2012-05-09", "2012-05-09", "2012-05-09", "2012-05-09",
"2012-05-09", "2012-05-09", "2012-05-09"), vv = c("C RR", "C RR",
"C RR", "G RR", "nil", "nil", "nil", "C AA", "G AA", "G AA",
"G AA", "G AA", "G AA", "G AA", "G AA", "G AA", "G AA", "G AA",
"G AA", "nil"), rv = c("nil", "nil", "nil", "nil", "nil", "nil",
"nil", "nil", "C VB", "C VB", "G VB", "G VB", "G VB", "nil",
"G VB", "G VB", "G VB", "G VB", "G VB", "nil"), ja = c("C VV",
"C VV", "C VV", "C VV", "C VV", "C VV", "C VV", "C VV", "C VV",
"C VV", "C VV", "nil", "nil", "G VV", "G VV", "G VV", "G VV",
"G VV", "G VV", "G VV"), aa = c("C AJ", "C AJ", "C AJ", "C AJ",
"C AJ", "C AJ", "C AJ", "C AJ", "nil", "nil", "nil", "nil", "nil",
"nil", "nil", "nil", "nil", "nil", "nil", "nil"), bv = c("nil",
"C VR", "C VR", "C VR", "G VR", "G VR", "G VR", "G VR", "G VR",
"G VR", "G VR", "G VR", "G VR", "G VR", "G VR", "G VR", "G VR",
"G VR", "G VR", "G VR"), aj = c("C RJ", "C RJ", "C RJ", "nil",
"nil", "nil", "nil", "nil", "C RV", "C RV", "G RV", "G RV", "G RV",
"G RV", "G RV", "G RV", "G RV", "G RV", "nil", "nil"), vb = c("C AA",
"C AA", "C AA", "nil", "nil", "nil", "nil", "nil", "nil", "nil",
"nil", "nil", "nil", "nil", "nil", "nil", "C AJ", "nil", "C AJ",
"C AJ"), rj = c("C JR", "C JR", "G JR", "G JR", "G JR", "G JR",
"G JR", "G JR", "G JR", "G JR", "G JR", "G JR", "G JR", "G JR",
"G JR", "G JR", "G JR", "G JR", "G JR", "G JR"), rr = c("C BB",
"C BB", "C BB", "C BB", "C BB", "C BB", "C BB", "C BB", "C BB",
"C BB", "C BB", "C BB", "C BB", "C BB", "C BB", "C BB", "C BB",
"C BB", "C BB", "C BB"), vr = c("C JA", "C JA", "C JA", "nil",
"nil", "C RJ", "C RJ", "C RJ", "C RJ", "C RJ", "nil", "C RJ",
"C RJ", "C RJ", "C RJ", "C RJ", "C RJ", "C RJ", "C RJ", "C RJ"
), bb = c("C BV", "C BV", "C BV", "C BV", "C BV", "C BV", "C BV",
"C BV", "C BV", "C BV", "C BV", "C BV", "C BV", "C BV", "C BV",
"C BV", "C BV", "C BV", "C BV", "C BV"), jr = c("nil", "nil",
"C VB", "C VB", "G VB", "G VB", "nil", "nil", "nil", "nil", "nil",
"nil", "nil", "nil", "nil", "nil", "C JA", "C JA", "C JA", "C JA"
)), .Names = c("weeks", "A1M.Date", "vv", "rv", "ja", "aa", "bv",
"aj", "vb", "rj", "rr", "vr", "bb", "jr"), row.names = c(NA,
20L), class = "data.frame")

str(fevents)
'data.frame':   1430 obs. of  14 variables:
 $ weeks   : num  1 1 1 1 1 1 1 1 1 1 ...
 $ A1M.Date: chr  "2012-05-09" "2012-05-09" "2012-05-09" "2012-05-09" ...
 $ vv      : chr  "C RR" "C RR" "C RR" "G RR" ...
 $ rv      : chr  "nil" "nil" "nil" "nil" ...
 $ ja      : chr  "C VV" "C VV" "C VV" "C VV" ...
 $ aa      : chr  "C AJ" "C AJ" "C AJ" "C AJ" ...
 $ bv      : chr  "nil" "C VR" "C VR" "C VR" ...
 $ aj      : chr  "C RJ" "C RJ" "C RJ" "nil" ...
 $ vb      : chr  "C AA" "C AA" "C AA" "nil" ...
 $ rj      : chr  "C JR" "C JR" "G JR" "G JR" ...
 $ rr      : chr  "C BB" "C BB" "C BB" "C BB" ...
 $ vr      : chr  "C JA" "C JA" "C JA" "nil" ...
 $ bb      : chr  "C BV" "C BV" "C BV" "C BV" ...
 $ jr      : chr  "nil" "nil" "C VB" "C VB" ...

I found a really inelegant workaround: writing the data.frame to a CSV file and re-importing it with stringsAsFactors = FALSE. This is not what I want to write in my code... There must be a simpler way to rearrange the structure of the data.frame to please rle?

回答1:

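The problem is that rle() does not accept a factor: as the error says, it wants a plain atomic vector, and a factor (an integer vector carrying levels and class attributes) fails the is.vector() check that rle() performs. Either convert all the factors to characters first (and not by coercing them to numeric!) or do the conversion inside the anonymous function you are applying.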

So this, which implements the second idea, works:

aggregate(fevents2[,3:14], list(weeks = fevents2[, 1]),
          function(x) rle(as.character(x))$values)

after a fashion:

> aggregate(fevents2[,3:14], list(weeks = fevents2[, 1]),
+           function(x) rle(as.character(x))$values)
  weeks vv.1 vv.2 vv.3 vv.4 vv.5 vv.6 rv.1 rv.2 rv.3 rv.4 rv.5 rv.6 rv.7 ja.1
1     1 C RR G RR  nil C AA G AA  nil  nil C VB G VB  nil C VB G VB  nil C VV
  ja.2 ja.3 ja.4 aa.1 aa.2 bv.1 bv.2 bv.3 aj.1 aj.2 aj.3 aj.4 aj.5 vb.1 vb.2
1  nil C VV G VV C AJ  nil  nil C VR G VR C RJ  nil C RV G RV  nil C AA  nil
  vb.3 vb.4 vb.5 rj.1 rj.2   rr vr.1 vr.2 vr.3 vr.4 vr.5   bb jr.1 jr.2 jr.3
1 C AJ  nil C AJ C JR G JR C BB C JA  nil C RJ  nil C RJ C BV  nil C VB G VB
  jr.4 jr.5
1  nil C JA

though I am not sure what you expected to get - there is only one week here and aggregate and rle have stuck all the values together. Did you want separate $values for each of the variables in fevents2 that you are aggregating over?

Another thing:

as.numeric(as.character(fevents2)) can't possibly work, as the data are not numeric! Also, you can't apply those functions to a whole data frame and get anything like what you intended - if they work at all.

The sapply() thing should work. Here is a version that checks whether each variable is a factor or not and coerces it if it is:

fevents3 <- sapply(fevents2,
                   function(x) if(is.factor(x)) { as.character(x) } else { x })

But note sapply() simplifies to a matrix which will change the aggregate() method dispatched:

> class(fevents3)
[1] "matrix"

Instead perhaps

fevents3 <- lapply(fevents2,
                   function(x) if(is.factor(x)) { as.character(x) } else { x })
fevents3 <- data.frame(fevents3, stringsAsFactors = FALSE)

Now, if you wanted to apply rle() to each column of the split-up data and keep the results separate, how about

spl <- split(fevents3, list(weeks = fevents3[, 1]))
res <- lapply(spl, function(x) lapply(x[, 3:14], function(y) rle(y)$values))

which gives

> res
$`1`
$`1`$vv
[1] "C RR" "G RR" "nil"  "C AA" "G AA" "nil" 

$`1`$rv
[1] "nil"  "C VB" "G VB" "nil"  "C VB" "G VB" "nil" 

$`1`$ja
[1] "C VV" "nil"  "C VV" "G VV"

$`1`$aa
[1] "C AJ" "nil" 

$`1`$bv
[1] "nil"  "C VR" "G VR"

$`1`$aj
[1] "C RJ" "nil"  "C RV" "G RV" "nil" 

$`1`$vb
[1] "C AA" "nil"  "C AJ" "nil"  "C AJ"

$`1`$rj
[1] "C JR" "G JR"

$`1`$rr
[1] "C BB"

$`1`$vr
[1] "C JA" "nil"  "C RJ" "nil"  "C RJ"

$`1`$bb
[1] "C BV"

$`1`$jr
[1] "nil"  "C VB" "G VB" "nil"  "C JA"

Which is the same answer as that for aggregate() above, but with each rle() output kept separate:

> unlist(res)
 1.vv1  1.vv2  1.vv3  1.vv4  1.vv5  1.vv6  1.rv1  1.rv2  1.rv3  1.rv4  1.rv5 
"C RR" "G RR"  "nil" "C AA" "G AA"  "nil"  "nil" "C VB" "G VB"  "nil" "C VB" 
 1.rv6  1.rv7  1.ja1  1.ja2  1.ja3  1.ja4  1.aa1  1.aa2  1.bv1  1.bv2  1.bv3 
"G VB"  "nil" "C VV"  "nil" "C VV" "G VV" "C AJ"  "nil"  "nil" "C VR" "G VR" 
 1.aj1  1.aj2  1.aj3  1.aj4  1.aj5  1.vb1  1.vb2  1.vb3  1.vb4  1.vb5  1.rj1 
"C RJ"  "nil" "C RV" "G RV"  "nil" "C AA"  "nil" "C AJ"  "nil" "C AJ" "C JR" 
 1.rj2   1.rr  1.vr1  1.vr2  1.vr3  1.vr4  1.vr5   1.bb  1.jr1  1.jr2  1.jr3 
"G JR" "C BB" "C JA"  "nil" "C RJ"  "nil" "C RJ" "C BV"  "nil" "C VB" "G VB" 
 1.jr4  1.jr5 
 "nil" "C JA" 
> aggregate(fevents2[,3:14], list(weeks = fevents2[, 1]),
+           function(x) rle(as.character(x))$values)
  weeks vv.1 vv.2 vv.3 vv.4 vv.5 vv.6 rv.1 rv.2 rv.3 rv.4 rv.5 rv.6 rv.7 ja.1
1     1 C RR G RR  nil C AA G AA  nil  nil C VB G VB  nil C VB G VB  nil C VV
  ja.2 ja.3 ja.4 aa.1 aa.2 bv.1 bv.2 bv.3 aj.1 aj.2 aj.3 aj.4 aj.5 vb.1 vb.2
1  nil C VV G VV C AJ  nil  nil C VR G VR C RJ  nil C RV G RV  nil C AA  nil
  vb.3 vb.4 vb.5 rj.1 rj.2   rr vr.1 vr.2 vr.3 vr.4 vr.5   bb jr.1 jr.2 jr.3
1 C AJ  nil C AJ C JR G JR C BB C JA  nil C RJ  nil C RJ C BV  nil C VB G VB
  jr.4 jr.5
1  nil C JA

[Note: This is only true here because the data snippet you show has just one week. I can't recall how unlist(res) will look if there is more than one week.]



标签: r r-factor