Extending this question:
I have some data prepared using the below code:
# # Data Preparation ----------------------
library(lubridate)
start_date <- "2018-10-30 00:00:00"
start_date <- as.POSIXct(start_date, origin="1970-01-01")
dates <- c(start_date)
for(i in 1:287) {
dates <- c(dates, start_date + minutes(i * 10))
}
dates <- as.POSIXct(dates, origin="1970-01-01")
date_val <- format(dates, '%d-%m-%Y')
weather.forecast.data <- data.frame(dateTime = dates, date = date_val)
weather.forecast.data <- rbind(weather.forecast.data, weather.forecast.data, weather.forecast.data, weather.forecast.data)
weather.forecast.data$id <- c(rep('GH1', 288), rep('GH2', 288), rep('GH3', 288), rep('GH4', 288))
weather.forecast.data$radiation <- round(runif(nrow(weather.forecast.data)), 2)
weather.forecast.data$hour <- as.integer(format(weather.forecast.data$dateTime, '%H'))
weather.forecast.data$day_night <- ifelse(weather.forecast.data$hour < 6, 'night', ifelse(weather.forecast.data$hour < 19, 'day', 'night'))
# # GH2: Total Morning missing # #
weather.forecast.data$radiation[(weather.forecast.data$id == 'GH2') & (weather.forecast.data$date == '30-10-2018') & (weather.forecast.data$day_night == 'day')] = NA
weather.forecast.data$hour <- NULL
weather.forecast.data$day_night <- NULL
My task is to remove ids from the weather.forecast.data where for each id and each date, morning half (06 hours to 18 hours), the radiation values are missing (NA) using dplyr
in R
.
I want to eliminate rows for a given id
and date
which has the entire morning radiation
value as missing. i.e. if an id for a date
has morning radiation
missing. I drop all the rows with that particular id
and date
. So, we drop all the 144 records because its morning has radiation missing.
We can see that GH2
has entire morning radiation missing on date 30-10-2018
. We therefore drop all 144 records with id == 'GH2'
and date = '30-10-2018'
.
setDT(weather.forecast.data)
weather.forecast.data[, sum(is.na(radiation)), .(id, date)]
id date V1
1: GH1 30-10-2018 0
2: GH1 31-10-2018 0
3: GH2 30-10-2018 78
4: GH2 31-10-2018 0
5: GH3 30-10-2018 0
6: GH3 31-10-2018 0
7: GH4 30-10-2018 0
8: GH4 31-10-2018 0
I have the code using data.table
:
setDT(weather.forecast.data)
weather.forecast.data[, hour:= hour(dateTime)]
weather.forecast.data[, day_night:=c("night", "day")[(6 <= hour & hour < 19) + 1L]]
weather.forecast.data[, date_id := paste(date, id, sep = "__")]
weather.forecast.data[, all_is_na := all(is.na(radiation)), .(date_id, day_night)]
weather.forecast.data[!(date_id %in% unique(weather.forecast.data[(all_is_na == TRUE) & (day_night == 'day'), date_id]))]
I need the code using dplyr
and I have tried the following. It is dropping many rows than required:
library(dplyr)
weather.forecast.data <- weather.forecast.data %>%
mutate(hour = as.integer(format(dateTime, '%H'))) %>%
mutate(day_night = ifelse(hour < 6, 'night', ifelse(hour < 19, 'day', 'night'))) %>%
group_by(date, day_night, id) %>%
filter((!all(is.na(radiation))) & (day_night == 'day')) %>%
select (-c(hour, day_night)) %>%
as.data.frame
Note: Output should return the data by dropping the rows where id = 'GH2'
and date = '30-10-2018'