Drop ID with NA in a conditional group

2019-08-30 01:36发布

问题:

Extending this question:

I have some data prepared using the below code:

# # Data Preparation ----------------------
library(lubridate)
start_date <- "2018-10-30 00:00:00"
start_date <- as.POSIXct(start_date, origin="1970-01-01")
dates <- c(start_date)
for(i in 1:287) {
    dates <- c(dates, start_date + minutes(i * 10))
}
dates <- as.POSIXct(dates, origin="1970-01-01")
date_val <- format(dates, '%d-%m-%Y')

weather.forecast.data <- data.frame(dateTime = dates, date = date_val)
weather.forecast.data <- rbind(weather.forecast.data, weather.forecast.data, weather.forecast.data, weather.forecast.data)
weather.forecast.data$id <- c(rep('GH1', 288), rep('GH2', 288), rep('GH3', 288), rep('GH4', 288))
weather.forecast.data$radiation <- round(runif(nrow(weather.forecast.data)), 2)

weather.forecast.data$hour <- as.integer(format(weather.forecast.data$dateTime, '%H'))
weather.forecast.data$day_night <- ifelse(weather.forecast.data$hour < 6, 'night', ifelse(weather.forecast.data$hour < 19, 'day', 'night'))

# # GH2: Total Morning missing # #
weather.forecast.data$radiation[(weather.forecast.data$id == 'GH2') & (weather.forecast.data$date == '30-10-2018') & (weather.forecast.data$day_night == 'day')] = NA
weather.forecast.data$hour <- NULL
weather.forecast.data$day_night <- NULL

My task is to remove ids from the weather.forecast.data where for each id and each date, morning half (06 hours to 18 hours), the radiation values are missing (NA) using dplyr in R.

I want to eliminate rows for a given id and date which has the entire morning radiation value as missing. i.e. if an id for a date has morning radiation missing. I drop all the rows with that particular id and date. So, we drop all the 144 records because its morning has radiation missing.

We can see that GH2 has entire morning radiation missing on date 30-10-2018. We therefore drop all 144 records with id == 'GH2' and date = '30-10-2018'.

setDT(weather.forecast.data)
weather.forecast.data[, sum(is.na(radiation)), .(id, date)]
    id       date V1
1: GH1 30-10-2018  0
2: GH1 31-10-2018  0
3: GH2 30-10-2018 78
4: GH2 31-10-2018  0
5: GH3 30-10-2018  0
6: GH3 31-10-2018  0
7: GH4 30-10-2018  0
8: GH4 31-10-2018  0

I have the code using data.table:

setDT(weather.forecast.data)
weather.forecast.data[, hour:= hour(dateTime)]
weather.forecast.data[, day_night:=c("night", "day")[(6 <= hour & hour < 19) + 1L]]
weather.forecast.data[, date_id := paste(date, id, sep = "__")]
weather.forecast.data[, all_is_na := all(is.na(radiation)), .(date_id, day_night)]
weather.forecast.data[!(date_id %in% unique(weather.forecast.data[(all_is_na == TRUE) & (day_night == 'day'), date_id]))]

I need the code using dplyr and I have tried the following. It is dropping many rows than required:

library(dplyr)
weather.forecast.data <- weather.forecast.data %>%
    mutate(hour = as.integer(format(dateTime, '%H'))) %>%
    mutate(day_night = ifelse(hour < 6, 'night', ifelse(hour < 19, 'day', 'night'))) %>%
    group_by(date, day_night, id) %>%
    filter((!all(is.na(radiation))) & (day_night == 'day')) %>%
    select (-c(hour, day_night)) %>%
    as.data.frame

Note: Output should return the data by dropping the rows where id = 'GH2' and date = '30-10-2018'

回答1:

I believe you are complicating a bit. The following code does what you describe in the question.

library(lubridate)
library(dplyr)

weather.forecast.data %>%
  mutate(hour = hour(dateTime),
         day_night = c("night", "day")[(6 <= hour & hour < 19) + 1L]) %>%
  group_by(date, id) %>%
  mutate(delete = all(!(is.na(radiation) & day_night == "day"))) %>%
  ungroup() %>%
  filter(delete) %>%
  select(-hour, -day_night, -delete) %>%
  as.data.frame() -> df1

See if it worked giving the expected 144 deleted rows.

nrow(weather.forecast.data) - nrow(df1)
#[1] 144

Data.

I repost the data generation code, simplified in two places and with a call to set.seed.

set.seed(4192)

start_date <- "2018-10-30 00:00:00"
start_date <- as.POSIXct(start_date, origin="1970-01-01")
dates <- start_date + minutes(0:287 * 10)
dates <- as.POSIXct(dates, origin="1970-01-01")
date_val <- format(dates, '%d-%m-%Y')

weather.forecast.data <- data.frame(dateTime = dates, date = date_val)
weather.forecast.data <- rbind(weather.forecast.data, weather.forecast.data, weather.forecast.data, weather.forecast.data)
weather.forecast.data$id <- c(rep('GH1', 288), rep('GH2', 288), rep('GH3', 288), rep('GH4', 288))
weather.forecast.data$radiation <- round(runif(nrow(weather.forecast.data)), 2)

weather.forecast.data$hour <- hour(weather.forecast.data$dateTime)
weather.forecast.data$day_night <- ifelse(weather.forecast.data$hour < 6, 'night', ifelse(weather.forecast.data$hour < 19, 'day', 'night'))

# # GH2: Total Morning missing # #
weather.forecast.data$radiation[(weather.forecast.data$id == 'GH2') & (weather.forecast.data$date == '30-10-2018') & (weather.forecast.data$day_night == 'day')] = NA
weather.forecast.data$hour <- NULL
weather.forecast.data$day_night <- NULL


回答2:

You are filtering for rows that only contain "day" in the day_night column. If I understood you correctly you want the following:

    library(dplyr)
    weather.forecast.data <- weather.forecast.data %>%
      mutate(hour = as.integer(format(dateTime, '%H'))) %>%
      mutate(day_night = ifelse(hour < 6, 'night', ifelse(hour < 19, 'day', 
                                                         'night'))) %>%
      group_by(date, day_night, id) %>%
      filter((!(all(is.na(radiation))) & (day_night == 'day'))) %>%
      select (-c(hour, day_night)) %>%
      as.data.frame

This would remove all IDs that have all NAs during the day.