Count the number of messages within a date range per interval. I"m using python 2.6.5 only.
For example Start date: 12/11/2014 End date: 12/12/2014 Start time: 02:00 End time: 02:05 Interval: Per 1 min
So this translates to how many messages are between each interval of a minute from start date 12/11 to end date 12/12. So my out put will look like this: (does not need to have strings min and messages)
datetime(2014, 12, 11, 2, 0) min : 0 messages,
datetime(2014, 12, 11, 2, 1) min: 1 message,
datetime(2014, 12, 11, 2, 2) min: 2 messages,
datetime(2014, 12, 11, 2, 3) min: 1 message,
datetime(2014, 12, 11, 2, 4) min : 0 messages,
datetime(2014, 12, 11, 2, 5) min : 0 messages
I believe I accomplish this but its very slow with large datasets. I think because it uses two loops and if the the second loop is extremely large then it takes very long time and does it for each iteration of the first loop. I need better procedure or alrogithm to accomplish this?
Edit: I need to include zero for intervals that do not have messages. I'm also trying to find peak,min and average.
from datetime import date,datetime, timedelta, time
def perdelta(start, end, delta):
curr = start
while curr < end:
yield curr
curr += delta
def rdata(table, fromDate, toDate, fromTime, toTime, interval):
date_to_alert = {}
start_date = datetime(fromDate.year, fromDate.month, fromDate.day, fromTime.hour, fromTime.minute)
end_date = datetime(toDate.year, toDate.month, toDate.day, toTime.hour, toTime.minute)
list_range_of_dates = []
for date_range in perdelta(start_date ,end_date ,interval):
list_range_of_dates.append(date_range)
print list_range_of_dates
index = 0
for date_range in list_range_of_dates:
for row in table:
print('first_alerted_time 1: %s index: %s len: %s' % ( row['first_alerted_time'], index, len(list_range_of_dates)-1))
if row['first_alerted_time'] and row['first_alerted_time'] >= list_range_of_dates[index] and row['first_alerted_time'] < list_range_of_dates[index + 1]:
print('Start date: %s' % list_range_of_dates[index] )
print('first_alerted_time: %s' % row['first_alerted_time'])
print('end date: %s' % list_range_of_dates[index + 1])
if list_range_of_dates[index] in date_to_alert:
date_to_alert[list_range_of_dates[index]].append(row)
else:
date_to_alert[list_range_of_dates[index]] = [row]
elif row['first_alerted_time']:
print('first_alerted_time 2: %s' % row['first_alerted_time'])
index = index + 1
print date_to_alert
for key, value in date_to_alert.items():
date_to_alert[key] = len(value)
print date_to_alert
t1 = []
if date_to_alert:
avg = sum(date_to_alert.values())/len(date_to_alert.keys())
for date_period, num_of_alerts in date_to_alert.items():
#[date_period] = date_to_alert[date_period]
t1.append( [ date_period, num_of_alerts, avg] )
print t1
return t1
def main():
example_table = [
{'first_alerted_time':datetime(2014, 12, 11, 2, 1,45)},
{'first_alerted_time':datetime(2014, 12, 11, 2, 2,33)},
{'first_alerted_time':datetime(2014, 12, 11, 2, 2,45)},
{'first_alerted_time':datetime(2014, 12, 11, 2, 3,45)},
]
example_table.sort()
print example_table
print rdata(example_table, date(2014,12,11), date(2014,12,12), time(00,00,00), time(00,00,00), timedelta(minutes=1))
Update: First attempt for improvement:
Default Dictionary approach
def default_dict_approach(table, fromDate, toDate, fromTime, toTime, interval):
from collections import defaultdict
t1 = []
start_date = datetime.combine(fromDate, fromTime)
end_date = datetime.combine(toDate, toTime)+ interval
times = (d['first_alerted_time'] for d in table)
counter = defaultdict(int)
for dt in times:
if start_date <= dt < end_date:
counter[to_s(dt - start_date) // to_s(interval)] += 1
date_to_alert = {}
date_to_alert = dict((ts*interval + start_date, count) for ts, count in counter.iteritems())
max_num,min_num,avg = 0,0,0
list_of_dates = list(perdelta(start_date,end_date,interval))
if date_to_alert:
freq_values = date_to_alert.values()
size_freq_values = len(freq_values)
avg = sum(freq_values)/ size_freq_values
max_num = max(freq_values)
if size_freq_values == len(list_of_dates):
min_num = min(freq_values)
else:
min_num = 0
for date_period in list_of_dates:
if date_period in date_to_alert:
t1.append([ date_period.strftime("%Y-%m-%d %H:%M"), date_to_alert[date_period], avg, max_num, min_num])
else:
t1.append([ date_period.strftime("%Y-%m-%d %H:%M"), 0, avg, max_num, min_num])
return (t1,max_num,min_num,avg)
numpy approach
def numpy_approach(table, fromDate, toDate, fromTime, toTime, interval):
date_to_alert = {}
start_date = datetime.combine(fromDate, fromTime)
end_date = datetime.combine(toDate, toTime)+ interval
list_range_of_dates = []
for date_range in perdelta(start_date ,end_date ,interval):
list_range_of_dates.append(date_range)
#print list_range_of_dates
index = 0
times = np.fromiter((d['first_alerted_time'] for d in table),
dtype='datetime64[us]', count=len(table))
print times
bins = np.fromiter(list_range_of_dates,
dtype=times.dtype)
print bin
a, bins = np.histogram(times, bins)
print(dict(zip(bins[a.nonzero()].tolist(), a[a.nonzero()])))