I have given two dataframes below for you to test
# Sample readings for one subject over four calendar days. Each timestamp is
# kept as a single string literal so the snippet parses as valid Python.
df = pd.DataFrame({
    'subject_id': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
    'time_1': [
        '2173-04-03 12:35:00', '2173-04-03 17:00:00', '2173-04-03 20:00:00',
        '2173-04-04 11:00:00', '2173-04-04 11:30:00', '2173-04-04 12:00:00',
        '2173-04-05 16:00:00', '2173-04-05 22:00:00', '2173-04-06 04:00:00',
        '2173-04-06 04:30:00', '2173-04-06 06:30:00',
    ],
    'val': [5, 5, 5, 10, 5, 10, 5, 8, 3, 8, 10],
})
# Second sample frame: readings taken roughly every 15 minutes on the first
# day. Each timestamp is kept as a single string literal so the snippet parses.
df1 = pd.DataFrame({
    'subject_id': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
    'time_1': [
        '2173-04-03 12:35:00', '2173-04-03 12:50:00', '2173-04-03 12:59:00',
        '2173-04-03 13:14:00', '2173-04-03 13:37:00', '2173-04-04 11:30:00',
        '2173-04-05 16:00:00', '2173-04-05 22:00:00', '2173-04-06 04:00:00',
        '2173-04-06 04:30:00', '2173-04-06 08:00:00',
    ],
    'val': [5, 5, 5, 5, 10, 5, 5, 8, 3, 4, 6],
})
What I would like to do is:
1) For each subject_id and each day, find all values (from the val column)
that have stayed the same for more than 1 hour, and take the minimum of
those values.
Please note that values can also be captured every 15 minutes, so you
might have to consider 5 consecutive records to check the > 1 hr
condition. See the sample screenshot below.
2) If there are no values which were same for more than 1 hour
in a day, then just get the minimum of that day for that subject_id
The below screenshot for one subject will help you understand and the code I tried is given below
This is what I tried
df['time_1'] = pd.to_datetime(df['time_1'])
# Timestamp of the next reading (the last row gets NaT).
df['time_2'] = df['time_1'].shift(-1)
# Gap to the next reading, in hours.
df['tdiff'] = (df['time_2'] - df['time_1']).dt.total_seconds() / 3600
# Day-of-month of each reading.
df['reading_day'] = pd.DatetimeIndex(df['time_1']).day
# don't know how to apply if else condition here to check for 1 hr criteria
# Group on the column actually created above ('reading_day'); the key list
# previously named a non-existent column and had an unterminated string.
t1 = df.groupby(['subject_id', 'reading_day', 'tdiff'])['val'].min()
As I have to apply this to millions of records, an elegant and efficient solution would be helpful.
# Sample readings for a single subject spanning three calendar days.
df = pd.DataFrame({
    'subject_id': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
    'time_1': [
        '2173-04-03 12:35:00', '2173-04-03 17:00:00', '2173-04-03 20:00:00',
        '2173-04-04 11:00:00', '2173-04-04 11:30:00', '2173-04-04 12:00:00',
        '2173-04-04 16:00:00', '2173-04-04 22:00:00', '2173-04-05 04:00:00',
        '2173-04-05 06:30:00',
    ],
    'val': [5, 5, 5, 10, 5, 10, 5, 8, 8, 10],
})

# Separate date and time
df['time_1'] = pd.to_datetime(df['time_1'])
df['new_date'] = df['time_1'].dt.date
df['new_time'] = df['time_1'].dt.time

# Previous row's value, used to detect readings whose value did not change.
df['shift_val'] = df['val'].shift()

# Elapsed time since the first reading of the same subject and day.
# transform('first') returns a result aligned to df's index, so it can be
# subtracted directly (groupby.apply may prepend the group keys to the index).
first_of_day = df.groupby(['subject_id', 'new_date'])['time_1'].transform('first')
df1 = df.assign(time_diff=df['time_1'] - first_of_day)

one_hour = pd.Timedelta(hours=1)

# NOTE(review): the reference value is the first reading of the *earliest*
# date only, not of each day -- kept as in the original answer; confirm this
# is intended before using it on other data.
reference_val = df1.groupby('new_date').first()['val'].iloc[0]

# Verify that at least one hour has elapsed and the value equals the reference
df2 = df1.loc[(df1['time_diff'] >= one_hour) & (df1['val'] == reference_val)]
# Within the first hour of the day, keep rows whose value repeats the previous row
df3 = df1.loc[(df1['time_diff'] <= one_hour) & (df1['val'] == df1['shift_val'])]

# Get the minimum within each day. DataFrame.append was removed in pandas 2.0,
# so the two selections are combined with pd.concat. Only the result columns
# are aggregated, which replaces the later drop of the helper columns.
df4 = (
    pd.concat([df2, df3])
    .groupby(['new_date'], sort=False)[['subject_id', 'time_1', 'val']]
    .min()
)
df4
Output
subject_id time_1 val
new_date
2173-04-03 1 2173-04-03 17:00:00 5
2173-04-04 1 2173-04-04 16:00:00 5
2173-04-05 1 2173-04-05 04:00:00 8
Try this.
from datetime import timedelta
def f(x):
    """Return the span of *x* (first element minus last) in whole minutes.

    *x* is a Series of timestamps. The difference is taken as first minus
    last and floor-divided by one minute, so it is zero for a
    single-element Series and negative when the last timestamp is later
    than the first.
    """
    return (x.iloc[0] - x.iloc[-1]) // timedelta(minutes=1)
df1['time_1'] = pd.to_datetime(df1['time_1'])

# Number each run of consecutive equal values: the counter increases whenever
# the value differs from the previous row.
df1['flag'] = (df1['val'].diff() != 0).cumsum()
# Span of each run in minutes, as computed by f, broadcast to every row of it.
df1['t_d'] = df1.groupby('flag')['time_1'].transform(f)
df1['date'] = df1['time_1'].dt.date

# NOTE(review): this selects runs with any non-zero span, not runs longer
# than one hour.
mask = df1['t_d'] != 0

# First row of every (run, day) pair among the runs with a non-zero span ...
dfa = df1.loc[mask].groupby(['flag', 'date']).first().reset_index()
# ... and first row of each day among the remaining rows.
dfb = df1.loc[~mask].groupby('date').first().reset_index().dropna(how='any')

# Combine both selections and keep a single row per day.
df_f = pd.merge(dfa, dfb, how='outer')
df_f = df_f.drop_duplicates(subset='date', keep='first')
df_f = df_f.drop(columns=['flag', 'date', 't_d'])
df_f
Output.
subject_id time_1 val
0 1 2173-04-03 12:35:00 5
1 1 2173-04-04 11:30:00 5
2 1 2173-04-05 16:00:00 5
5 1 2173-04-06 04:00:00 3
Try this
from datetime import timedelta
df1['time_1'] = pd.to_datetime(df1['time_1'])
df1['date'] = df1['time_1'].dt.date

# Gap from each reading to the next reading of the same day. diff() leaves
# the first row of each day as NaT, so after shifting up by one row the last
# reading of each day holds NaT.
df1['t_d'] = df1.groupby('date')['time_1'].diff().shift(-1)

# Readings followed by a gap of more than one hour.
mask = df1['t_d'].gt(pd.Timedelta(hours=1))
dfa = df1.loc[mask]
# First remaining reading of each day, used when no reading qualifies.
dfb = df1.loc[~mask].groupby('date').first().reset_index()

# Combine both selections and keep a single row per day.
df_f = pd.merge(dfa, dfb, how='outer')
df_f = df_f.drop_duplicates(subset='date', keep='first')
df_f = df_f.drop(columns=['date', 't_d'])
df_f.sort_values(by='time_1')
I came up with an approach like below and it is working. Any suggestions are welcome
# Time left until midnight for every reading; used as the duration of the
# last reading of a day, which has no later reading on that day.
s = pd.to_timedelta(24, unit='h') - (df.time_1 - df.time_1.dt.normalize())

# Duration each value was held: the gap to the next reading of the same
# subject on the same day. Grouping by subject_id as well keeps readings of
# different subjects on the same date from being differenced against each other.
df['tdiff'] = (
    df.groupby(['subject_id', df.time_1.dt.date]).time_1.diff().shift(-1).fillna(s)
)
df['t_d'] = df['tdiff'].dt.total_seconds() / 3600
df['hr'] = df['time_1'].dt.hour
df['date'] = df['time_1'].dt.date
df['day'] = pd.DatetimeIndex(df['time_1']).day

# Total duration and number of readings of each val for each day and hour.
# With sort=False the groups keep their order of first appearance.
# Named aggregation is used because dict-based renaming in agg() is rejected
# by pandas >= 1.0.
temp_1 = (
    df.groupby(['subject_id', 'date', 'hr', 'val'], sort=False)['t_d']
    .agg(cumduration='sum', freq='count')
    .reset_index()
)

# Remove the hour component and sum the duration of a value across the hours
# of the same day (for example, 5 seen in the 12th and 13th hour is summed).
temp_2 = (
    temp_1.groupby(['subject_id', 'date', 'val'], sort=False)['cumduration']
    .agg(sum_of_cumduration='sum', freq='count')
    .reset_index()
)

# Values held for more than 1 hour in total on a given day. This is a plain
# row-wise comparison, so no groupby is needed to build the mask.
mask = temp_2['sum_of_cumduration'] > 1
output_1 = temp_2[mask].groupby(['subject_id', 'date'])['val'].min().reset_index()

# Minimum among the values held for 1 hour or less.
output_2 = temp_2[~mask].groupby(['subject_id', 'date'])['val'].min().reset_index()

# Use the fallback minimum only for (subject_id, date) pairs that have no
# value held for more than 1 hour. Matching on both keys keeps days without a
# qualifying value even when the same subject qualifies on another day.
unmatched = output_2.merge(
    output_1[['subject_id', 'date']],
    on=['subject_id', 'date'],
    how='left',
    indicator=True,
)
fallback = unmatched.loc[
    unmatched['_merge'] == 'left_only', ['subject_id', 'date', 'val']
]

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
output = pd.concat([output_1, fallback])
output