matplotlib: plotting timeseries while skipping over periods without data

2019-06-11 06:03发布

站内文章 / Python

63 0

再贱就再见

女 | 书童

私信

可以将文章内容翻译成中文,广告屏蔽插件可能会导致该功能失效(如失效，请关闭广告屏蔽插件后再试):

问题:

tl;dr: how can I skip over periods where there is no data while plotting timeseries?

I'm running a long calculation and I'd like to monitor its progress. Sometimes I interrupt this calculation. The logs are stored in a huge CSV file which looks like this:

2016-01-03T01:36:30.958199,0,0,0,startup
2016-01-03T01:36:32.363749,10000,0,0,regular
...
2016-01-03T11:12:21.082301,51020000,13402105,5749367,regular
2016-01-03T11:12:29.065687,51030000,13404142,5749367,regular
2016-01-03T11:12:37.657022,51040000,13408882,5749367,regular
2016-01-03T11:12:54.236950,51050000,13412824,5749375,shutdown
2016-01-03T19:02:38.293681,51050000,13412824,5749375,startup
2016-01-03T19:02:49.296161,51060000,13419181,5749377,regular
2016-01-03T19:03:00.547644,51070000,13423127,5749433,regular
2016-01-03T19:03:05.599515,51080000,13427189,5750183,regular
...

In reality, there are 41 columns. Each of the columns is a certain indicator of progress. The second column is always incremented in steps of 10000. The last column is self-explanatory.

I would like to plot each column on the same graph while skipping over periods between "shutdown" and "startup". Ideally, I would also like to draw a vertical line on each skip.

Here's what I've got so far:

import matplotlib.pyplot as plt
import pandas as pd

# < ... reading my CSV in a Pandas dataframe `df` ... >

# A single figure/axes pair; every progress column is drawn onto the same axes.
fig, ax = plt.subplots()

# 'total' followed by the forty zero-padded column names '00' .. '39'.
column_names = ['total']
column_names.extend('%02d' % i for i in range(40))

timestamps = df.index.values
for col in column_names:
    # '-' requests a solid line for each series.
    ax.plot_date(timestamps, df[col].values, '-')

# Let matplotlib rotate/align the date labels, then display the window.
fig.autofmt_xdate()
plt.show()

I want to get rid of that long flat period and just draw a vertical line instead.

I know about df.plot(), but in my experience it's broken (among other things, Pandas converts datetime objects into its own format instead of using date2num and num2date).

It looks like a possible solution is to write a custom scaler, but that seems quite complicated.

As far as I understand, writing a custom Locator will only change the positions of ticks (little vertical lines and the associated labels), but not the position of the plot itself. Is that correct?

UPD: an easy solution would be to change the timestamps (say, recalculate them to "time elapsed since start"), but I'd prefer to preserve them.

UPD: the answer at https://stackoverflow.com/a/5657491/1214547 works for me with some modifications. I will write up my solution soon.

回答1:

Here is a solution that works for me. It does not handle closely located breaks well (the labels may get too crowded), but in my case it doesn't matter.

import bisect
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.scale as mscale
import matplotlib.transforms as mtransforms
import matplotlib.dates as mdates
import pandas as pd

# heavily borrows from http://stackoverflow.com/a/5657491/1214547

def CustomScaleFactory(breaks):
    """Build a matplotlib scale class that collapses the given gaps.

    Parameters
    ----------
    breaks : sequence of (left, right) pairs
        Intervals, expressed in the same numeric units as the values handed
        to the transforms, that should be squeezed to zero width.  The pairs
        are expected to be sorted in ascending order and non-overlapping.

    Returns
    -------
    type
        A ``matplotlib.scale.ScaleBase`` subclass whose ``name`` is
        ``'custom'``.
    """
    # Break geometry is computed once and shared by both transforms.
    # reshape(-1, 2) keeps the arrays well-formed when `breaks` is empty.
    edges = np.asarray(breaks, dtype=float).reshape(-1, 2)
    lefts = edges[:, 0]
    rights = edges[:, 1]
    # shifts[k] is the total width removed by the first k breaks.
    shifts = np.concatenate(([0.0], np.cumsum(rights - lefts)))
    # Where each break's left edge ends up once earlier breaks are collapsed.
    collapsed_lefts = lefts - shifts[:-1]

    class CustomScale(mscale.ScaleBase):
        name = 'custom'

        def __init__(self, axis, **kwargs):
            # ScaleBase.__init__ takes the axis as a required argument in
            # current matplotlib; extra keyword arguments are ignored here.
            mscale.ScaleBase.__init__(self, axis)

        def get_transform(self):
            """Return the forward (data -> collapsed) transform."""
            return self.CustomTransform()

        def set_default_locators_and_formatters(self, axis):
            """Install an hourly locator that never ticks inside a break."""
            class HourSkippingLocator(mdates.HourLocator):
                _breaks = breaks

                def _tick_allowed(self, tick):
                    # A tick is dropped when it falls inside (or on the edge
                    # of) any break.
                    return not any(left <= tick <= right
                                   for left, right in self._breaks)

                def __call__(self):
                    ticks = super(HourSkippingLocator, self).__call__()
                    ticks = [tick for tick in ticks
                             if self._tick_allowed(tick)]
                    # Always label the right edge of every break.
                    ticks.extend(right for (left, right) in self._breaks)
                    return ticks

            axis.set_major_locator(HourSkippingLocator(interval=3))
            axis.set_major_formatter(mdates.DateFormatter("%h %d, %H:%M"))

        class CustomTransform(mtransforms.Transform):
            """Forward transform: remove the width of every break."""
            input_dims = 1
            output_dims = 1
            is_separable = True
            has_inverse = True
            _breaks = breaks

            def __init__(self):
                mtransforms.Transform.__init__(self)

            def transform_non_affine(self, a):
                """Map data coordinates to collapsed coordinates.

                Values inside a break (edges included) are mapped to that
                break's left edge; every value is then moved left by the
                total width of the breaks lying entirely before it.  The
                input may have any shape and need not be sorted.
                """
                values = np.asarray(a, dtype=float)
                if lefts.size == 0:
                    return values.copy()

                # Number of breaks whose right edge is strictly below each
                # value, i.e. breaks that are fully behind it.
                passed = np.searchsorted(rights, values, side='left')
                # Index of the break a value could be inside of, clipped so
                # the lookup stays valid for values beyond the last break.
                candidate = np.minimum(passed, lefts.size - 1)
                inside = (passed < lefts.size) & (values >= lefts[candidate])
                return (np.where(inside, lefts[candidate], values)
                        - shifts[passed])

            def inverted(self):
                return CustomScale.InvertedCustomTransform()

        class InvertedCustomTransform(mtransforms.Transform):
            """Inverse transform: re-insert the width of every break."""
            input_dims = 1
            output_dims = 1
            is_separable = True
            has_inverse = True
            _breaks = breaks

            def __init__(self):
                mtransforms.Transform.__init__(self)

            def transform_non_affine(self, a):
                """Map collapsed coordinates back to data coordinates.

                The forward transform is not exactly invertible: it glues a
                whole break onto one point.  That point is mapped back to
                the *left* side of the break; anything strictly beyond it is
                shifted by the break's width.  Widths accumulate per value,
                so the result is correct for inputs of any length or order.
                """
                values = np.asarray(a, dtype=float)
                # Number of breaks lying strictly to the left of each value.
                crossed = np.searchsorted(collapsed_lefts, values, side='left')
                return values + shifts[crossed]

            def inverted(self):
                return CustomScale.CustomTransform()

    return CustomScale


# < ... reading my CSV in a Pandas dataframe `df` ... >

# Row positions of every 'startup' and 'shutdown' entry in the log.
kinds = df['kind']
startups = np.where(kinds == 'startup')[0]
shutdowns = np.where(kinds == 'shutdown')[0]

# Each break runs from a shutdown row to a startup row.  startups[1:] skips
# the first startup so that every shutdown is paired with the startup that
# follows it (this assumes the log begins with a startup).
breaks_idx = list(zip(shutdowns, startups[1:]))
breaks_dates = [(df.index[stop], df.index[start])
                for stop, start in breaks_idx]
breaks = [(mdates.date2num(stop), mdates.date2num(start))
          for stop, start in breaks_dates]

fig, ax = plt.subplots()

# 'total' followed by the forty zero-padded column names '00' .. '39'.
timestamps = df.index.values
for col in ['total'] + ['%02d' % i for i in range(40)]:
    ax.plot_date(timestamps, df[col].values, '-')

# shame on matplotlib: there is no way to unregister a scale
custom_scale = CustomScaleFactory(breaks)
mscale.register_scale(custom_scale)
ax.set_xscale('custom')

# One vertical marker per break, placed at the break's right edge and rising
# from zero to the 'total' value recorded at the corresponding startup row.
vlines_x = [start for _, start in breaks]
vlines_ymin = np.zeros(len(vlines_x))
vlines_ymax = [df.iloc[start]['total'] for _, start in breaks_idx]
plt.vlines(vlines_x, vlines_ymin, vlines_ymax, color='darkgrey')

fig.autofmt_xdate()
plt.ticklabel_format(axis='y', style='plain')

plt.show()

回答2:

@Pastafarianist provides a good solution. However, I found a bug in the InvertedCustomTransform when plotting with more than one break. For example, in the following code the crosshair cannot follow the cursor across the second and third breaks.

import bisect
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.scale as mscale
import matplotlib.transforms as mtransforms
import matplotlib.dates as mdates
import pandas as pd
from matplotlib.widgets import Cursor


def CustomScaleFactory(breaks):
    """Return a ``ScaleBase`` subclass that squeezes each break to zero width.

    ``breaks`` is a sequence of ``(left, right)`` pairs in the numeric units
    of the axis.  The transforms walk the pairs in order, so they must be
    sorted in ascending order and must not overlap.
    """
    class CustomScale(mscale.ScaleBase):
        name = 'custom'

        def __init__(self, axis, **kwargs):
            # ScaleBase.__init__ takes the axis as a required argument in
            # current matplotlib.
            mscale.ScaleBase.__init__(self, axis)

        def get_transform(self):
            return self.CustomTransform()

        def set_default_locators_and_formatters(self, axis):
            class HourSkippingLocator(mdates.HourLocator):
                _breaks = breaks

                def __init__(self, *args, **kwargs):
                    super(HourSkippingLocator, self).__init__(*args, **kwargs)

                def _tick_allowed(self, tick):
                    # Reject ticks that fall inside (or on the edge of) a break.
                    for left, right in self._breaks:
                        if left <= tick <= right:
                            return False
                    return True

                def __call__(self):
                    ticks = super(HourSkippingLocator, self).__call__()
                    ticks = [tick for tick in ticks if self._tick_allowed(tick)
                             ]
                    # Always label the right edge of every break.
                    ticks.extend(right for (left, right) in self._breaks)
                    return ticks

            axis.set_major_locator(HourSkippingLocator(interval=3))
            axis.set_major_formatter(mdates.DateFormatter("%h %d, %H:%M"))

        class CustomTransform(mtransforms.Transform):
            """Forward transform: remove the width of every break."""
            input_dims = 1
            output_dims = 1
            is_separable = True
            has_inverse = True
            _breaks = breaks

            def __init__(self):
                mtransforms.Transform.__init__(self)

            def transform_non_affine(self, a):
                # Single pass over `a` and the breaks together; this relies
                # on `a` being sorted in ascending order.
                result = np.empty_like(a)

                a_idx = 0
                csum = 0  # total width of the breaks already passed
                for left, right in self._breaks:
                    # Points before the break are only shifted.
                    while a_idx < len(a) and a[a_idx] < left:
                        result[a_idx] = a[a_idx] - csum
                        a_idx += 1
                    # Points inside the break collapse onto its left edge.
                    while a_idx < len(a) and a[a_idx] <= right:
                        result[a_idx] = left - csum
                        a_idx += 1
                    csum += right - left

                # Points after the last break.
                while a_idx < len(a):
                    result[a_idx] = a[a_idx] - csum
                    a_idx += 1

                return result

            def inverted(self):
                return CustomScale.InvertedCustomTransform()

        class InvertedCustomTransform(mtransforms.Transform):
            """Inverse transform: re-insert the width of every break."""
            input_dims = 1
            output_dims = 1
            is_separable = True
            has_inverse = True
            _breaks = breaks

            def __init__(self):
                mtransforms.Transform.__init__(self)

            def transform_non_affine(self, a):
                # Actually, this transformation isn't exactly invertible.
                # It may glue together some points, and there is no way
                # to separate them back. This implementation maps both
                # points to the *left* side of the break.
                #
                # NOTE: this is the implementation whose multi-break
                # behaviour this example reproduces.  When `a` holds a
                # single value lying beyond several breaks, every break
                # writes to diff[0] and overwrites the previous width, so
                # only the last break's width is added back.

                diff = np.zeros(len(a))

                total_shift = 0

                for left, right in self._breaks:
                    pos = bisect.bisect_right(a, left - total_shift)
                    if pos >= len(diff):
                        break
                    diff[pos] = right - left
                    total_shift += right - left

                return a + diff.cumsum()

            def inverted(self):
                return CustomScale.CustomTransform()

    return CustomScale

# simulated data: on each of two consecutive days, ten samples 30 s apart
# starting at 9:30 and ten samples 30 s apart ending at 15:00
index1 = pd.date_range(start='2016-01-08 9:30', periods=10, freq='30s')
index2 = pd.date_range(end='2016-01-08 15:00', periods=10, freq='30s')
index = index1.union(index2)
data1 = pd.Series(range(20), index=index.values)
index3 = pd.date_range(start='2016-01-09 9:30', periods=10, freq='30s')
index4 = pd.date_range(end='2016-01-09 15:00', periods=10, freq='30s')
index = index3.union(index4)
data2 = pd.Series(range(20), index=index.values)
data = pd.concat([data1, data2])

# Break edges as plain datetime objects.  `pd.datetime` has been removed from
# pandas, so the timestamps are parsed with pd.Timestamp instead.
breaks_dates = [
    pd.Timestamp('2016-01-08 09:35:00').to_pydatetime(),
    pd.Timestamp('2016-01-08 14:55:00').to_pydatetime(),
    pd.Timestamp('2016-01-08 15:00:00').to_pydatetime(),
    pd.Timestamp('2016-01-09 09:30:00').to_pydatetime(),
    pd.Timestamp('2016-01-09 09:35:00').to_pydatetime(),
    pd.Timestamp('2016-01-09 14:55:00').to_pydatetime(),
]
breaks_dates = [mdates.date2num(point_i) for point_i in breaks_dates]
# Consecutive edges form the three (left, right) breaks.
breaks = [(breaks_dates[i], breaks_dates[i + 1]) for i in [0, 2, 4]]
fig, ax = plt.subplots()
ax.plot(data.index.values, data.values)
mscale.register_scale(CustomScaleFactory(breaks))
ax.set_xscale('custom')
# Crosshair that follows the mouse; it exercises the inverse transform.
cursor = Cursor(ax, useblit=True, color='r', linewidth=2)
plt.show()

[Image omitted.] If you change the `transform_non_affine` method of the `InvertedCustomTransform` class as follows, it works well.

def transform_non_affine(self, a):
    """Map collapsed coordinates back to data coordinates.

    The forward transformation isn't exactly invertible: it glues every
    point of a break onto one position, and there is no way to separate
    them again.  That position is mapped back to the *left* side of the
    break; anything strictly beyond it is shifted right by the widths of
    all breaks it has crossed.

    Parameters
    ----------
    a : array-like
        Collapsed coordinates.  May have any length (matplotlib may pass a
        one-element array as well as whole data arrays) and need not be
        sorted.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``a``.

    Notes
    -----
    ``self._breaks`` must hold ``(left, right)`` pairs sorted in ascending
    order and non-overlapping.
    """
    values = np.asarray(a, dtype=float)
    # reshape(-1, 2) keeps the arrays well-formed when there are no breaks.
    edges = np.asarray(self._breaks, dtype=float).reshape(-1, 2)
    widths = edges[:, 1] - edges[:, 0]
    # shifts[k] is the total width of the first k breaks.
    shifts = np.concatenate(([0.0], np.cumsum(widths)))
    # Where each break's left edge sits once earlier breaks are collapsed.
    collapsed_lefts = edges[:, 0] - shifts[:-1]
    # Number of breaks lying strictly to the left of each value; computed
    # per value, so widths accumulate correctly for any input length.
    crossed = np.searchsorted(collapsed_lefts, values, side='left')
    return values + shifts[crossed]

The reason may be that the input `a` passed to the transformation method is not the whole axis; it is only a numpy array of length 1.