Create a variable in `df1` depending on one variab

2019-05-28 02:34发布

问题:

I have data frame df1 that summarises fish depths over time. df1$Site tells you the site where the fish was, df1$Ind tells you the individual and df1$Depth tells you the depth where the fish was at a specific df1$Datetime.

On the other hand, I have df2 that summarises the intensity of the currents over time (EVERY THREE HOURS) from the surface to 39 meters depth at intervals of 8 meters (m0-7, m8-15, m16-23, m24-31 and m32-39). As an example:

df1<-data.frame(Datetime=c("2016-08-01 15:34:07","2016-08-01 16:25:16","2016-08-01 17:29:16","2016-08-01 18:33:16","2016-08-01 20:54:16","2016-08-01 22:48:16"),Site=c("BD","HG","BD","BD","BD","BD"),Ind=c(16,17,19,16,17,16), Depth=c(5.3,24,36.4,42,NA,22.1))
df1$Datetime<-as.POSIXct(df1$Datetime, format="%Y-%m-%d %H:%M:%S",tz="UTC")


> df1
             Datetime Site Ind Depth
1 2016-08-01 15:34:07   BD  16   5.3
2 2016-08-01 16:25:16   HG  17  24.0
3 2016-08-01 17:29:16   BD  19  36.4
4 2016-08-01 18:33:16   BD  16  42.0
5 2016-08-01 20:54:16   BD  17    NA
6 2016-08-01 22:48:16   BD  16  22.1

df2<-data.frame(Datetime=c("2016-08-01 12:00:00","2016-08-01 15:00:00","2016-08-01 18:00:00","2016-08-01 21:00:00","2016-08-02 00:00:00"), Site=c("BD","BD","BD","BD","BD"),var1=c(2.75,4,6.75,2.25,4.3),var2=c(3,4,4.75,3,2.1),var3=c(2.75,4,5.75,2.25,1.4),var4=c(3.25,3,6.5,2.75,3.4),var5=c(3,4,4.75,3,1.7))
df2$Datetime<-as.POSIXct(df2$Datetime, format="%Y-%m-%d %H:%M:%S",tz="UTC")
colnames(df2)<-c("Datetime","Site","m0-7","m8-15","m16-23","m24-31","m32-39")

> df2
             Datetime Site m0-7 m8-15 m16-23 m24-31 m32-39
1 2016-08-01 12:00:00   BD 2.75  3.00   2.75   3.25   3.00
2 2016-08-01 15:00:00   BD 4.00  4.00   4.00   3.00   4.00
3 2016-08-01 18:00:00   BD 6.75  4.75   5.75   6.50   4.75
4 2016-08-01 21:00:00   BD 2.25  3.00   2.25   2.75   3.00
5 2016-08-02 00:00:00   BD 4.30  2.10   1.40   3.40   1.70

I want to create a new column in df1 called df1$Current.Int that summarises the current intensity at the depth when and where the fish was according to what df2 says about currents.

I would like to get this:

> df1
             Datetime Site Ind Depth Current.Int
1 2016-08-01 15:34:07   BD  16   5.3        4.00
2 2016-08-01 16:25:16   HG  17  24.0          NA # Currents of this site are not included in df2
3 2016-08-01 17:29:16   BD  19  36.4        4.75
4 2016-08-01 18:33:16   BD  16  42.0        4.75
5 2016-08-01 20:54:16   BD  17    NA          NA
6 2016-08-01 22:48:16   BD  16  22.1        1.40

Just to point out that since the current records are every three hours, every hour indicated in df2$Datetime represents one hour and a half more, and one hour and a half less. That is, the current intensity pointed out in df2 at 21:00:00 reflects the currents between 19:30:00 and 22:30:00. The same with the rest of the hours.

Does anyone know how to do it?

回答1:

The dates didn't match so they are changed for the example. With this approach, you can check exactly how the match has worked and ensure it is as you want it.

df1<-data.frame(Datetime=c("2016-08-18 15:34:07","2016-08-18 16:25:16","2016-08-18 17:29:16","2016-08-18 18:33:16","2016-08-18 20:54:16","2016-08-18 22:48:16"),Site=c("BD","HG","BD","BD","BD","BD"),Ind=c(16,17,19,16,17,16), Depth=c(5.3,24,36.4,42,NA,22.1))
df1$Datetime<-as.POSIXct(df1$Datetime, format="%Y-%m-%d %H:%M:%S",tz="UTC")

df2<-data.frame(Datetime=c("2016-08-18 12:00:00","2016-08-18 15:00:00","2016-08-18 18:00:00","2016-08-18 21:00:00","2016-08-19 00:00:00"), Site=c("BD","BD","BD","BD","BD"),var1=c(2.75,4,6.75,2.25,4.3),var2=c(3,4,4.75,3,2.1),var3=c(2.75,4,5.75,2.25,1.4),var4=c(3.25,3,6.5,2.75,3.4),var5=c(3,4,4.75,3,1.7))
df2$Datetime<-as.POSIXct(df2$Datetime, format="%Y-%m-%d %H:%M:%S",tz="UTC")
colnames(df2)<-c("Datetime","Site","m0-7","m8-15","m16-23","m24-31","m32-39")

library(dplyr)
library(lubridate)

# Round the date and convert the depth to match the look-up. 
df1 = df1 %>% 
  mutate(
    Datetime_rounded = round_date(Datetime, "3 hour"),
    Depth_ind = ifelse(Depth < 8, "m0-7", 
                  ifelse(Depth > 7 & Depth < 16, "m8-15", 
                    ifelse(Depth > 15 & Depth < 24, "m16-23",
                      ifelse(Depth > 23 & Depth < 32, "m24-31",
                        ifelse(Depth > 31 & Depth < 40, "m32-39", NA)
                      )
                    )
                  )
                )
  )

# Wide to long on the intensity columns. 
df2 = df2 %>% 
  tidyr::gather("Depth_ind", "Intensity", 3:7)

# Join
df1 %>% 
  left_join(df2, by = c("Datetime_rounded" = "Datetime", 
                        "Site",
                        "Depth_ind"))

             Datetime Site Ind Depth    Datetime_rounded Depth_ind Intensity
1 2016-08-18 15:34:07   BD  16   5.3 2016-08-18 15:00:00      m0-7      4.00
2 2016-08-18 16:25:16   HG  17  24.0 2016-08-18 15:00:00    m24-31        NA
3 2016-08-18 17:29:16   BD  19  36.4 2016-08-18 18:00:00    m32-39      4.75
4 2016-08-18 18:33:16   BD  16  42.0 2016-08-18 18:00:00      <NA>        NA
5 2016-08-18 20:54:16   BD  17    NA 2016-08-18 21:00:00      <NA>        NA
6 2016-08-18 22:48:16   BD  16  22.1 2016-08-19 00:00:00    m16-23      1.40

# EDIT ----
## As per the request, the width of the final depth range can be adjusted as you wish, e.g. to a max depth of 60 m.

# Round the date and convert the depth to match the look-up. 
df1 = df1 %>% 
  mutate(
    Datetime_rounded = round_date(Datetime, "3 hour"),
    Depth_ind = ifelse(Depth < 8, "m0-7", 
                  ifelse(Depth > 7 & Depth < 16, "m8-15", 
                    ifelse(Depth > 15 & Depth < 24, "m16-23",
                      ifelse(Depth > 23 & Depth < 32, "m24-31",
                        ifelse(Depth > 31 & Depth < 60, "m32-39", NA)
                      )
                    )
                  )
                )
  )


回答2:

This can be done directly in a single SQL statement. We left join df1 to df2 with the indicated on condition grouping by the df1 row. Calculating max(b.Datetime) over the indicated group will pick out the appropriate row of df2. (If a.Datetime, a.Site does not uniquely define a row of df1 then group by a.rowid instead.) At the end we remove that column using [-1].

We used the data shown in the Note at the end since the data in the question did not have corresponding dates in df1 and df2.

library(sqldf)

sqldf("select max(b.Datetime), a.*,
  case when a.Depth <= 7 then b.[m0-7]
       when a.Depth <= 15 then b.[m8-15]
       when a.Depth <= 23 then b.[m16-23]
       when a.Depth <= 31 then b.[m24-31]
       else b.[m32-39]
  end as [Current.Int]
  from df1 a
  left join df2 b on a.Site = b.Site and a.Datetime >= b.Datetime
  group by a.Datetime, a.Site")[-1]

giving:

             Datetime Site Ind Depth Current.Int
1 2016-08-01 15:34:07   BD  16   5.3        4.00
2 2016-08-01 16:25:16   HG  17  24.0          NA
3 2016-08-01 17:29:16   BD  19  36.4        4.00
4 2016-08-01 18:33:16   BD  16  42.0        4.75
5 2016-08-01 20:54:16   BD  17    NA        4.75
6 2016-08-01 22:48:16   BD  16  22.1        2.25

Note

This is the input used and is the same as in the question except:

  1. the UTC time zone has been eliminated. If you want to keep the UTC time zone change your session time zone to UTC using Sys.setenv(TZ='UTC'). Another possibility to deal with timezones is to use character strings rather than POSIXct for the Datetime columns in which case you can't have time zone problems in the first place.

  2. the last line was added to improve the example since the dates did not match.

Here is the input used.

df1<-data.frame(Datetime=c("2016-08-01 15:34:07","2016-08-01 16:25:16","2016-08-01 17:29:16","2016-08-01 18:33:16","2016-08-01 20:54:16","2016-08-01 22:48:16"),Site=c("BD","HG","BD","BD","BD","BD"),Ind=c(16,17,19,16,17,16), Depth=c(5.3,24,36.4,42,NA,22.1))
df1$Datetime<-as.POSIXct(df1$Datetime, format="%Y-%m-%d %H:%M:%S")

df2<-data.frame(Datetime=c("2016-08-18 12:00:00","2016-08-18 15:00:00","2016-08-18 18:00:00","2016-08-18 21:00:00","2016-08-19 00:00:00"), Site=c("BD","BD","BD","BD","BD"),var1=c(2.75,4,6.75,2.25,4.3),var2=c(3,4,4.75,3,2.1),var3=c(2.75,4,5.75,2.25,1.4),var4=c(3.25,3,6.5,2.75,3.4),var5=c(3,4,4.75,3,1.7))
df2$Datetime<-as.POSIXct(df2$Datetime, format="%Y-%m-%d %H:%M:%S")
colnames(df2)<-c("Datetime","Site","m0-7","m8-15","m16-23","m24-31","m32-39")

df2$Datetime <- as.POSIXct(paste("2016-08-01", sub(".* ", "", df2$Datetime)))


回答3:

As long as your data isn't huge, you may not have to embark down the path of conditional joins. Instead, join based only using Site first and then filter out the extra observations afterwards. It's not particularly efficient, but it might be easier than turning to sqldf.

Note I made a few changes to the data you supplied so that the dates would match up.

library(tidyverse)  

df1<-data.frame(Datetime=c("2016-08-01 15:34:07","2016-08-01 16:25:16","2016-08-01 17:29:16","2016-08-01 18:33:16","2016-08-01 20:54:16","2016-08-01 22:48:16"),
                Site=c("BD","HG","BD","BD","BD","BD"),
                Ind=c(16,17,19,16,17,16), 
                Depth=c(5.3,24,36.4,42,NA,22.1),
                stringsAsFactors = FALSE)
df1$Datetime<-as.POSIXct(df1$Datetime, format="%Y-%m-%d %H:%M:%S",tz="UTC")

df2<-data.frame(Datetime=c("2016-08-01 12:00:00","2016-08-01 15:00:00","2016-08-01 18:00:00","2016-08-01 21:00:00","2016-08-02 00:00:00"), 
                Site=c("BD","BD","BD","BD","BD"),
                var1=c(2.75,4,6.75,2.25,4.3),
                var2=c(3,4,4.75,3,2.1),
                var3=c(2.75,4,5.75,2.25,1.4),
                var4=c(3.25,3,6.5,2.75,3.4),
                var5=c(3,4,4.75,3,1.7),
                stringsAsFactors = FALSE)
df2$Datetime<-as.POSIXct(df2$Datetime, format="%Y-%m-%d %H:%M:%S",tz="UTC")
colnames(df2)<-c("Datetime_CI","Site","m0-7","m8-15","m16-23","m24-31","m32-39")



#Tidy the data in df2 so that that we have two columns for min and max Depth
#and a single column for the value of the current intensity
df2 <- df2 %>% 
  gather(-Datetime_CI, -Site, key = Depth, value = Current.Int) %>% 
  separate(Depth, c("minDepth", "maxDepth")) %>% 
  mutate(minDepth = as.numeric(str_sub(minDepth, 2, nchar(minDepth))))

#join df1 and df2 based on the Site alone
df1 %>% 
  inner_join(df2, by = "Site") %>% 
  #now filter out any observations where depth is not between the min and max
  filter(Depth >= minDepth,
         Depth <= maxDepth,
         #now exclude any current intensity observations prior to Datetime
         Datetime > Datetime_CI) %>% 
  #finally, take the first current intensity observation after Datetime
  group_by(Datetime, Site, Ind, Depth) %>% 
  filter(Datetime_CI == max(Datetime_CI))


# A tibble: 6 x 8
# Groups:   Datetime, Site, Ind, Depth [4]
Datetime            Site    Ind Depth Datetime_CI         minDepth maxDepth Current.Int
<dttm>              <chr> <dbl> <dbl> <dttm>                 <dbl> <chr>          <dbl>
1 2016-08-01 15:34:07 BD       16   5.3 2016-08-01 15:00:00        0 7               4   
2 2016-08-01 17:29:16 BD       19  36.4 2016-08-01 15:00:00        0 7               4   
3 2016-08-01 17:29:16 BD       19  36.4 2016-08-01 15:00:00       32 39              4   
4 2016-08-01 18:33:16 BD       16  42   2016-08-01 18:00:00        0 7               6.75
5 2016-08-01 22:48:16 BD       16  22.1 2016-08-01 21:00:00        0 7               2.25
6 2016-08-01 22:48:16 BD       16  22.1 2016-08-01 21:00:00       16 23              2.25