how to insert missing observations on a data frame

2019-01-15 17:49发布

问题:

I have a data that are observations over time. Unfortunately, some large gaps of time points are missing on a treatment. They are not coded as NA and if I make a plot out of them it becomes apparent.

My data frame looks like this. The number of samples per time points are irregular. (edit: sorry for not making the example reproducible)s

    structure(list(A = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 144L, 144L, 144L, 1809L, 1809L, 1809L, 
1809L, 1809L, 1809L, 1809L, 1809L, 1809L, 1809L, 1809L, 1809L, 
2070L, 2070L, 2070L, 2070L, 2070L, 2070L, 2070L, 2070L, 2070L, 
2070L, 2070L, 2070L, 2070L, 2070L, 2070L, 2070L, 2070L, 2070L, 
2070L, 2070L, 2070L, 2070L, 2070L, 2070L, 2070L, 2070L, 2070L, 
2070L, 2070L, 2070L, 2070L, 2070L, 2070L, 2070L, 2070L, 2070L, 
2070L, 2070L, 2070L, 2070L, 2070L, 2070L, 2070L, 2070L, 2070L, 
2070L, 2070L, 2070L, 2070L, 2757L, 2757L, 2757L, 2909L, 2909L, 
2909L, 2909L, 2909L, 2909L, 2909L, 2909L, 2909L, 2909L, 2975L, 
2975L, 2975L, 2975L, 2975L, 2975L, 2975L, 2975L, 2975L, 2975L, 
2975L, 2975L, 2975L, 2975L, 2975L, 2975L, 2975L, 2975L, 2975L, 
2975L, 2975L, 2975L, 2975L, 2975L, 2975L, 2975L, 2975L, 2975L, 
2975L, 2975L, 2975L, 2975L, 2975L, 2975L, 2975L, 2975L, 2975L, 
2975L, 2975L, 2975L, 2975L, 2975L, 2975L, 2975L, 2975L, 2975L, 
2975L, 2975L, 2975L, 2975L), cond = structure(c(2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Con", 
"Si"), class = "factor"), T = c(416L, 417L, 418L, 419L, 420L, 
423L, 424L, 425L, 426L, 427L, 428L, 429L, 430L, 431L, 432L, 433L, 
434L, 435L, 436L, 437L, 438L, 439L, 440L, 441L, 442L, 443L, 444L, 
445L, 446L, 447L, 448L, 449L, 450L, 451L, 452L, 453L, 454L, 458L, 
503L, 504L, 505L, 506L, 507L, 508L, 509L, 510L, 511L, 512L, 513L, 
514L, 515L, 516L, 517L, 518L, 519L, 520L, 521L, 522L, 523L, 524L, 
525L, 526L, 527L, 528L, 272L, 276L, 277L, 350L, 351L, 352L, 353L, 
354L, 355L, 356L, 357L, 358L, 359L, 360L, 361L, 372L, 373L, 374L, 
375L, 376L, 377L, 378L, 379L, 380L, 381L, 382L, 383L, 384L, 385L, 
386L, 387L, 388L, 389L, 390L, 391L, 392L, 393L, 394L, 395L, 396L, 
397L, 398L, 399L, 400L, 401L, 437L, 438L, 439L, 440L, 441L, 442L, 
443L, 444L, 445L, 446L, 447L, 448L, 449L, 450L, 451L, 452L, 453L, 
454L, 455L, 493L, 494L, 495L, 382L, 383L, 384L, 385L, 386L, 387L, 
388L, 389L, 390L, 391L, 523L, 524L, 525L, 526L, 527L, 528L, 529L, 
530L, 531L, 532L, 533L, 534L, 535L, 536L, 537L, 538L, 539L, 540L, 
541L, 542L, 543L, 544L, 545L, 546L, 547L, 548L, 549L, 550L, 551L, 
552L, 553L, 554L, 555L, 556L, 557L, 582L, 583L, 584L, 585L, 586L, 
587L, 588L, 589L, 590L, 591L, 592L, 593L, 594L, 595L, 596L), 
    Vlog = c(1.199206203, 0.92297866, 0.74831703, 1.180533889, 
    0.846435768, 1.823185531, 1.775303408, 0.9253633, 1.562371106, 
    1.237695416, 1.310507835, 1.431774566, 2.259365243, 1.721204598, 
    0.976929098, 0.673510525, 1.194940048, 0.878373924, 1.399859784, 
    1.04183351, 0.362465228, 1.345074816, 0.839639722, 1.235884973, 
    0.946877821, 0.810708992, 0.620516467, 0.99590939, 0.446167467, 
    0.635246561, 0.508835353, 0.470349764, 0.505083592, 0.363685506, 
    0.841427562, 1.502579534, 1.503814969, 1.962735861, 1.190111689, 
    1.208627789, 1.212606926, 1.3052429, 1.19648953, 1.399151795, 
    1.359988717, 1.530933258, 1.324386434, 1.429685474, 1.550040003, 
    1.209836455, 0.976675012, 1.396991989, 1.309972472, 0.884831368, 
    0.940578242, 0.622109712, 0.196736781, 0, 1.861481047, 1.166587204, 
    1.154778081, 0.750716468, 0.822148942, 0.324409805, 0.810379036, 
    2.218975354, 0.837542999, 1.597505982, 1.34988859, 2.109471773, 
    1.408734988, 1.006914696, 1.680242618, 1.842263128, 2.19564511, 
    1.80944452, 1.194273373, 1.953931263, 1.943781916, 2.136530509, 
    2.174627732, 1.837702354, 1.744745221, 1.744745221, 2.065910366, 
    1.3644043, 1.935629046, 1.327947423, 1.703751191, 1.595793931, 
    2.32443327, 1.815054551, 1.381916487, 1.535930503, 1.762742848, 
    1.214377396, 1.745046639, 0, 0, 1.314421325, 2.12544409, 
    1.961225517, 1.722393773, 1.763882649, 2.246794342, 1.462888398, 
    0, 2.699085109, 0.982206846, 1.678694356, 1.339419526, 1.856762396, 
    1.604863093, 1.439867691, 1.210451327, 0.988645101, 1.581116604, 
    0.868888993, 1.385699365, 1.377180499, 1.584445411, 1.76153307, 
    1.153021042, 1.427814276, 1.867219352, 1.726781152, 2.045476901, 
    1.231462515, 1.282774459, 1.194170351, 1.423430455, 1.813916126, 
    1.697914719, 1.343711186, 1.619115871, 1.590854952, 1.165150441, 
    0.84551636, 0.925836885, 0.0009995, 0, 2.672041587, 1.630536406, 
    2.084775235, 0.879027692, 2.150052605, 1.171591247, 2.589254624, 
    1.09594206, 1.788420568, 0.879027692, 1.768910948, 1.544705476, 
    0.961905249, 2.03675983, 1.189770451, 2.125034005, 1.921180059, 
    1.587902512, 1.113485404, 1.826744807, 0.961905249, 1.423828826, 
    1.392463308, 1.355448604, 1.638531529, 1.158778559, 1.257058585, 
    1.641075408, 1.652573524, 1.435915015, 1.072776171, 1.240686858, 
    1.647779212, 1.089811169, 1.723723056, 2.094419336, 0.544066958, 
    0.894454037, 1.651688305, 1.153416081, 0.961905249, 2.457446983, 
    0.704322704, 1.544705476, 1.970925317, 1.402837317, 1.651688305, 
    1.358923164, 1.153416081, 2.056674373)), .Names = c("A", 
"cond", "T", "Vlog"), row.names = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 
8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 
21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 
34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 
47L, 48L, 49L, 50L, 51L, 52L, 53L, 54L, 55L, 56L, 57L, 58L, 59L, 
60L, 61L, 62L, 63L, 64L, 66L, 67L, 68L, 201L, 202L, 203L, 204L, 
205L, 206L, 207L, 208L, 209L, 210L, 211L, 212L, 213L, 214L, 215L, 
216L, 217L, 218L, 219L, 220L, 221L, 222L, 223L, 224L, 225L, 226L, 
227L, 228L, 229L, 230L, 231L, 232L, 233L, 234L, 235L, 236L, 237L, 
238L, 239L, 240L, 241L, 242L, 243L, 244L, 245L, 246L, 247L, 248L, 
249L, 250L, 251L, 252L, 253L, 254L, 255L, 256L, 257L, 258L, 259L, 
260L, 261L, 695L, 696L, 697L, 698L, 699L, 700L, 701L, 702L, 703L, 
704L, 705L, 706L, 707L, 708L, 709L, 710L, 711L, 712L, 713L, 714L, 
715L, 716L, 717L, 718L, 719L, 720L, 721L, 722L, 723L, 724L, 725L, 
726L, 727L, 728L, 729L, 730L, 731L, 732L, 733L, 734L, 735L, 736L, 
737L, 738L, 739L, 740L, 741L, 742L, 743L, 744L, 745L, 746L, 747L, 
748L, 749L, 750L, 751L, 752L, 753L, 754L, 755L, 756L, 757L), class = "data.frame")

Is there a way of spotting the missing time points and insert n rows to it? What I thought of is to check the missing time points by making a freq table for each time point per treatment and then inserting a row. This is doable with a short time series but not with a large one. I am not sure if someone could help do it a little bit easier? Thanks!

edit: T is sequential but the number of data per T varies. And I want to insert a number of rows for each T. Hope the edits made it clear. :)

回答1:

This largely depends on how general you wish your solution to be. But, if you want a non-general solution you can do #1 pretty simply. Here, I assume that you're using T as your time variable.

insert_miss <- function(df, time_val= "T", by= 1) {
  val <- get(time_val, envir= as.environment(df))
  val_range <- range(val)
  comp <- seq(val_range[1], val_range[2], by=by)
  which_miss <- comp[!comp %in% val]
  # generating a sample row depends a lot on your particular problem
  # also, specifically how to impute the missing values depends on your 
  # specific problem / domain
  ## here's one simple solution which is not generic
  row_samp <- df[1,]
  df2 <- do.call("rbind", replicate(length(which_miss), row_samp, simplify= FALSE))
  df2[[time_val]] <- which_miss
  others <- which(names(df2) != time_val)
  df2[, others] <- NA
  return(df2)
}

run

insert_miss(<your_df>)
R> A cond   T Vlog
1 NA   NA 421   NA
2 NA   NA 422   NA


回答2:

Your example data doesn't match the chart image you posted, but here's a answer with random data

# random x-y series
set.seed(123)
dat <- data.frame(x=1:200,
                  y=cumsum(rnorm(200)))

# punch some holes
dat <- dat[-c(20:40, 90:120), ]

# for each point, find gap to next point
diff2next <- with(dat, x[-1] - x[-nrow(dat)])

# now find position of non consecutive points (i.e. where gap > 1)
holes_start <- which(diff2next > 1)
holes_end <- holes_start + 1 #(by definition the gap ends with the next point)

# that's it. here's a plot of the line and the identified holes
ggplot() + 
  geom_line(data=dat, aes(x, y)) + # the line
  geom_point(data=dat[c(holes_start, holes_end), ], 
             aes(x, y), color='red') # the hole start/ends



回答3:

Assuming that your data frame is called ts.df and T variable is sequential (as in it increases by one and only by one on each and every data point), you can generate data.frame with all T values in range and OUTER JOIN it into your existing data.frame to get NAs filled in automatically:

ids <- data.frame(T=seq(from=min(ts.df$T), to=max(ts.df$T)), A=0, cond="Si")
ts.df <- merge(ts.df, ids, all.y=TRUE)
ggplot(ts.df, aes(T, Vlog)) + geom_line() + geom_point()

This will assign Si value for cond variable for all rows and 0 value for A variable. The first one seems about right, the second one is irrelevant for your chart anyway.

You might need to split entire data.frame by condition, run above code to modify subset for one condition and join data.frames again to get it working on your current ggplot() call, but since you haven't posted reproducible example of your problem, there is only so much I can do.