tl;dr: how can I skip over periods where there is no data while plotting timeseries?
I'm running a long calculation and I'd like to monitor its progress. Sometimes I interrupt this calculation. The logs are stored in a huge CSV file which looks like this:
2016-01-03T01:36:30.958199,0,0,0,startup
2016-01-03T01:36:32.363749,10000,0,0,regular
...
2016-01-03T11:12:21.082301,51020000,13402105,5749367,regular
2016-01-03T11:12:29.065687,51030000,13404142,5749367,regular
2016-01-03T11:12:37.657022,51040000,13408882,5749367,regular
2016-01-03T11:12:54.236950,51050000,13412824,5749375,shutdown
2016-01-03T19:02:38.293681,51050000,13412824,5749375,startup
2016-01-03T19:02:49.296161,51060000,13419181,5749377,regular
2016-01-03T19:03:00.547644,51070000,13423127,5749433,regular
2016-01-03T19:03:05.599515,51080000,13427189,5750183,regular
...
In reality, there are 41 columns. Each of the columns is a certain indicator of progress. The second column is always incremented in steps of 10000. The last column is self-explanatory.
I would like to plot each column on the same graph while skipping over periods between "shutdown" and "startup". Ideally, I would also like to draw a vertical line on each skip.
Here's what I've got so far:
import matplotlib.pyplot as plt
import pandas as pd
# < ... reading my CSV in a Pandas dataframe `df` ... >
# Draw every progress column against the raw timestamps on one shared axis.
fig, ax = plt.subplots()
timestamps = df.index.values
column_names = ['total']
column_names.extend('%02d' % number for number in range(40))
for name in column_names:
    # '-' asks for a plain line instead of the default point markers.
    ax.plot_date(timestamps, df[name].values, '-')
# Rotate the date labels so neighbouring ticks stay readable.
fig.autofmt_xdate()
plt.show()
I want to get rid of that long flat period and just draw a vertical line instead.
I know about `df.plot()`, but in my experience it's broken (among other things, Pandas converts `datetime` objects into its own format instead of using `date2num` and `num2date`).
It looks like a possible solution is to write a custom scaler, but that seems quite complicated.
As far as I understand, writing a custom `Locator` will only change the positions of ticks (little vertical lines and the associated labels), but not the position of the plot itself. Is that correct?
UPD: an easy solution would be to change the timestamps (say, recalculate them to "time elapsed since start"), but I'd prefer to preserve them.
UPD: the answer at https://stackoverflow.com/a/5657491/1214547 works for me with some modifications. I will write up my solution soon.
Here is a solution that works for me. It does not handle closely located breaks well (the labels may get too crowded), but in my case it doesn't matter.
import bisect
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.scale as mscale
import matplotlib.transforms as mtransforms
import matplotlib.dates as mdates
import pandas as pd
# heavily borrows from http://stackoverflow.com/a/5657491/1214547
def CustomScaleFactory(breaks):
    """Create a matplotlib scale class that collapses the given intervals.

    Every ``(left, right)`` interval in *breaks* is squeezed to zero width:
    points before a break keep their position, points inside a break are
    drawn at the break's left edge, and everything after a break is shifted
    left by the break's width.

    Parameters
    ----------
    breaks : sequence of (left, right) pairs
        Intervals to remove, in axis data units. The transforms assume the
        pairs are sorted in ascending order and do not overlap.

    Returns
    -------
    type
        A ``mscale.ScaleBase`` subclass whose ``name`` is ``'custom'``; hand
        it to ``mscale.register_scale`` and select it with
        ``set_xscale('custom')``.
    """

    class HourSkippingLocator(mdates.HourLocator):
        """Hour locator that skips ticks inside breaks and marks each break."""

        _breaks = breaks

        def _tick_allowed(self, tick):
            # A tick is kept only if it lies outside every break (inclusive).
            return not any(left <= tick <= right
                           for left, right in self._breaks)

        def __call__(self):
            ticks = super(HourSkippingLocator, self).__call__()
            ticks = [tick for tick in ticks if self._tick_allowed(tick)]
            # One extra tick per break, at its right edge. A regular tick
            # sitting exactly on that edge was filtered out above, so this
            # cannot produce duplicates.
            ticks.extend(right for (left, right) in self._breaks)
            return ticks

    class CustomScale(mscale.ScaleBase):
        """Axis scale that removes the intervals listed in *breaks*."""

        name = 'custom'

        def __init__(self, axis, **kwargs):
            # Current matplotlib declares ScaleBase.__init__(self, axis), so
            # the axis has to be forwarded; extra keyword arguments are
            # accepted and ignored, as before.
            mscale.ScaleBase.__init__(self, axis)

        def get_transform(self):
            """Return the forward (data -> compressed axis) transform."""
            return self.CustomTransform()

        def set_default_locators_and_formatters(self, axis):
            """Install a 3-hourly, break-aware locator and a date formatter."""
            axis.set_major_locator(HourSkippingLocator(interval=3))
            axis.set_major_formatter(mdates.DateFormatter("%h %d, %H:%M"))

        class CustomTransform(mtransforms.Transform):
            """Forward transform: cut every break out of the axis."""

            input_dims = 1
            output_dims = 1
            is_separable = True
            has_inverse = True
            _breaks = breaks

            def transform_non_affine(self, a):
                """Return *a* with every break collapsed to zero width.

                Works element-wise, so *a* may be unsorted, of any shape,
                and may contain NaN (which stays NaN) without disturbing
                the other elements.
                """
                values = np.asarray(a, dtype=float)
                pinned = values
                removed = np.zeros_like(values)
                for left, right in self._breaks:
                    # Points inside the break all land on its left edge ...
                    inside = (values >= left) & (values <= right)
                    pinned = np.where(inside, left, pinned)
                    # ... and points past it lose the break's full width.
                    removed = removed + np.where(values > right,
                                                 right - left, 0.0)
                return pinned - removed

            def inverted(self):
                return CustomScale.InvertedCustomTransform()

        class InvertedCustomTransform(mtransforms.Transform):
            """Inverse transform: re-insert the breaks."""

            input_dims = 1
            output_dims = 1
            is_separable = True
            has_inverse = True
            _breaks = breaks

            def transform_non_affine(self, a):
                """Map compressed-axis coordinates back to data coordinates.

                Actually, this transformation isn't exactly invertible: the
                forward transform glues a whole break onto one point, and
                there is no way to separate it back. A coordinate lying
                exactly on a break maps to the *left* side of that break.
                """
                values = np.asarray(a, dtype=float)
                restored = np.zeros_like(values)
                total_shift = 0
                for left, right in self._breaks:
                    # ``left - total_shift`` is where this break sits on the
                    # compressed axis. Widths are *accumulated* per element,
                    # so several breaks left of the same coordinate all
                    # count, whatever the length or ordering of the input.
                    past_break = values > left - total_shift
                    restored = restored + np.where(past_break,
                                                   right - left, 0.0)
                    total_shift += right - left
                return values + restored

            def inverted(self):
                return CustomScale.CustomTransform()

    return CustomScale
# < ... reading my CSV in a Pandas dataframe `df` ... >
# Row positions of the 'startup' and 'shutdown' log entries.
kinds = df['kind']
startup_rows = np.where(kinds == 'startup')[0]
shutdown_rows = np.where(kinds == 'shutdown')[0]

# Pair each shutdown with the startup that follows it; the very first
# startup is skipped because no shutdown precedes it.
breaks_idx = [(stop, start)
              for stop, start in zip(shutdown_rows, startup_rows[1:])]
breaks_dates = [(df.index[stop], df.index[start])
                for stop, start in breaks_idx]
# Same pairs expressed as matplotlib date numbers.
breaks = []
for stopped_at, started_at in breaks_dates:
    breaks.append((mdates.date2num(stopped_at), mdates.date2num(started_at)))

fig, ax = plt.subplots()
x_values = df.index.values
for name in ['total'] + ['%02d' % i for i in range(40)]:
    ax.plot_date(x_values, df[name].values, '-')

# shame on matplotlib: there is no way to unregister a scale
scale_class = CustomScaleFactory(breaks)
mscale.register_scale(scale_class)
ax.set_xscale('custom')

# One vertical line per break, drawn at the break's right edge and reaching
# from zero up to the 'total' value recorded on the restart row.
vlines_x = [right for (_, right) in breaks]
vlines_ymax = [df.iloc[row]['total'] for (_, row) in breaks_idx]
vlines_ymin = np.zeros(len(vlines_x))
plt.vlines(vlines_x, vlines_ymin, vlines_ymax, color='darkgrey')

fig.autofmt_xdate()
plt.ticklabel_format(axis='y', style='plain')
plt.show()
@Pastafarianist provides a good solution. However, I found a bug in `InvertedCustomTransform` when plotting with more than one break. For example, in the following code the crosshair cannot follow the cursor across the second and third breaks.
import bisect
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.scale as mscale
import matplotlib.transforms as mtransforms
import matplotlib.dates as mdates
import pandas as pd
from matplotlib.widgets import Cursor
def CustomScaleFactory(breaks):
    """Create a matplotlib scale class that collapses the given intervals.

    This listing is the reproduction case for the cursor problem described
    in the surrounding text, so the inverse transform is intentionally left
    in its original, faulty form (see the note inside it).

    Parameters
    ----------
    breaks : sequence of (left, right) pairs
        Intervals to remove, in axis data units, sorted in ascending order
        and non-overlapping.

    Returns
    -------
    type
        A ``mscale.ScaleBase`` subclass whose ``name`` is ``'custom'``.
    """

    class CustomScale(mscale.ScaleBase):
        name = 'custom'

        def __init__(self, axis, **kwargs):
            # Current matplotlib declares ScaleBase.__init__(self, axis), so
            # the axis has to be forwarded.
            mscale.ScaleBase.__init__(self, axis)

        def get_transform(self):
            return self.CustomTransform()

        def set_default_locators_and_formatters(self, axis):
            class HourSkippingLocator(mdates.HourLocator):
                _breaks = breaks

                def __init__(self, *args, **kwargs):
                    super(HourSkippingLocator, self).__init__(*args, **kwargs)

                def _tick_allowed(self, tick):
                    # Reject ticks that fall inside any break (inclusive).
                    for left, right in self._breaks:
                        if left <= tick <= right:
                            return False
                    return True

                def __call__(self):
                    ticks = super(HourSkippingLocator, self).__call__()
                    ticks = [tick for tick in ticks
                             if self._tick_allowed(tick)]
                    # Add one tick at the right edge of every break.
                    ticks.extend(right for (left, right) in self._breaks)
                    return ticks

            axis.set_major_locator(HourSkippingLocator(interval=3))
            axis.set_major_formatter(mdates.DateFormatter("%h %d, %H:%M"))

        class CustomTransform(mtransforms.Transform):
            input_dims = 1
            output_dims = 1
            is_separable = True
            has_inverse = True
            _breaks = breaks

            def __init__(self):
                mtransforms.Transform.__init__(self)

            def transform_non_affine(self, a):
                # I have tried to write something smart using np.cumsum(),
                # but failed, since it was too complicated to handle the
                # transformation for points within breaks.
                # On the other hand, these loops are very easily translated
                # in plain C.
                # The single forward-moving index means `a` must be sorted
                # in ascending order.
                result = np.empty_like(a)
                a_idx = 0
                csum = 0
                for left, right in self._breaks:
                    # Points before the break: shift by the width removed
                    # so far.
                    while a_idx < len(a) and a[a_idx] < left:
                        result[a_idx] = a[a_idx] - csum
                        a_idx += 1
                    # Points inside the break: pin to its left edge.
                    while a_idx < len(a) and a[a_idx] <= right:
                        result[a_idx] = left - csum
                        a_idx += 1
                    csum += right - left
                # Points after the last break.
                while a_idx < len(a):
                    result[a_idx] = a[a_idx] - csum
                    a_idx += 1
                return result

            def inverted(self):
                return CustomScale.InvertedCustomTransform()

        class InvertedCustomTransform(mtransforms.Transform):
            input_dims = 1
            output_dims = 1
            is_separable = True
            has_inverse = True
            _breaks = breaks

            def __init__(self):
                mtransforms.Transform.__init__(self)

            def transform_non_affine(self, a):
                # Actually, this transformation isn't exactly invertible.
                # It may glue together some points, and there is no way
                # to separate them back. This implementation maps both
                # points to the *left* side of the break.
                #
                # NOTE: this is the faulty behaviour being demonstrated.
                # When several breaks resolve to the same `pos` (always the
                # case for a length-1 input such as a cursor position), the
                # assignment below overwrites the earlier break's width
                # instead of adding to it, so only the last break counts.
                diff = np.zeros(len(a))
                total_shift = 0
                for left, right in self._breaks:
                    pos = bisect.bisect_right(a, left - total_shift)
                    if pos >= len(diff):
                        break
                    diff[pos] = right - left
                    total_shift += right - left
                return a + diff.cumsum()

            def inverted(self):
                return CustomScale.CustomTransform()

    return CustomScale
# Simulated data: on each of two days, ten samples 30 s apart starting at
# 09:30 and ten samples 30 s apart ending at 15:00.
index1 = pd.date_range(start='2016-01-08 9:30', periods=10, freq='30s')
index2 = pd.date_range(end='2016-01-08 15:00', periods=10, freq='30s')
index = index1.union(index2)
data1 = pd.Series(range(20), index=index.values)
index3 = pd.date_range(start='2016-01-09 9:30', periods=10, freq='30s')
index4 = pd.date_range(end='2016-01-09 15:00', periods=10, freq='30s')
index = index3.union(index4)
data2 = pd.Series(range(20), index=index.values)
data = pd.concat([data1, data2])

# Break boundaries, listed as consecutive (start, end) pairs.
# `pd.datetime` no longer exists in pandas 2.x, so the strings are parsed
# with `pd.Timestamp` instead.
break_points = [
    '2016-01-08 9:35:00',
    '2016-01-08 14:55:00',
    '2016-01-08 15:00:00',
    '2016-01-09 9:30:00',
    '2016-01-09 9:35:00',
    '2016-01-09 14:55:00',
]
breaks_dates = [mdates.date2num(pd.Timestamp(point)) for point in break_points]
# Even positions are break starts, odd positions the matching break ends.
breaks = list(zip(breaks_dates[0::2], breaks_dates[1::2]))

fig, ax = plt.subplots()
ax.plot(data.index.values, data.values)
mscale.register_scale(CustomScaleFactory(breaks))
ax.set_xscale('custom')
# Keep the widget bound to a name so it is not garbage-collected.
cursor = Cursor(ax, useblit=True, color='r', linewidth=2)
plt.show()
enter image description here
If the `transform_non_affine` method of the `InvertedCustomTransform` class is changed as follows, it works well.
def transform_non_affine(self, a):
    """Map compressed-axis coordinates back to original data coordinates.

    Parameters
    ----------
    a : array-like of float
        Coordinates on the axis from which the breaks were removed. May have
        any length (including 1, e.g. a cursor position) and any ordering.

    Returns
    -------
    numpy.ndarray
        Float array shaped like *a*: every element is moved right by the
        combined width of all breaks lying strictly to its left.
    """
    # Actually, this transformation isn't exactly invertible.
    # It may glue together some points, and there is no way
    # to separate them back. This implementation maps both
    # points to the *left* side of the break.
    values = np.asarray(a, dtype=float)
    restored = np.zeros_like(values)
    total_shift = 0
    for left, right in self._breaks:
        # `left - total_shift` is where this break sits on the compressed
        # axis. The width is added to *every* element past that position,
        # not just to one index, so inputs with several points between the
        # same pair of breaks are handled as well as length-1 inputs.
        past_break = values > left - total_shift
        restored = restored + np.where(past_break, right - left, 0.0)
        total_shift += right - left
    return values + restored
The reason may be that the input `a` of the transformation method is not the whole axis; it is only a `numpy.array` of length 1.