Pandas - Find and index rows that match row sequen

2019-01-25 13:34发布

I would like to find a pattern in a dataframe in a categorical variable going down rows. I can see how to use Series.shift() to look up / down and using boolean logic to find the pattern, however, I want to do this with a grouping variable and also label all rows that are part of the pattern, not just the starting row.

Code:

import pandas as pd
from numpy.random import choice, randn
import string

# df constructor
n_rows = 1000
df = pd.DataFrame({'date_time': pd.date_range('2/9/2018', periods=n_rows, freq='H'),
                   'group_var': choice(list(string.ascii_uppercase), n_rows),
                   'row_pat': choice([0, 1, 2, 3], n_rows),
                   'values': randn(n_rows)})

# sorting 
df.sort_values(by=['group_var', 'date_time'], inplace=True)
df.head(10)

Which returns this: enter image description here

I can find the start of the pattern (with no grouping though) by this:

# the row ordinal pattern to detect
p0, p1, p2, p3 = 1, 2, 2, 0 

# flag the row at the start of the pattern
df['pat_flag'] = \
df['row_pat'].eq(p0) & \
df['row_pat'].shift(-1).eq(p1) & \
df['row_pat'].shift(-2).eq(p2) & \
df['row_pat'].shift(-3).eq(p3)

df.head(10)

enter image description here

What i cant figure out, is how to do this only withing the "group_var", and instead of returning True for the start of the pattern, return true for all rows that are part of the pattern.

Appreciate any tips on how to solve this!

Thanks...

5条回答
Summer. ? 凉城
2楼-- · 2019-01-25 14:06

I think you have 2 ways - simplier and slowier solution or faster complicated.

  • use Rolling.apply and test pattern
  • replace 0s to NaNs by mask
  • use bfill with limit (same as fillna with method='bfill') for repeat 1
  • then fillna NaNs to 0
  • last cast to bool by astype

pat = np.asarray([1, 2, 2, 0])
N = len(pat)


df['rm0'] = (df['row_pat'].rolling(window=N , min_periods=N)
                          .apply(lambda x: (x==pat).all())
                          .mask(lambda x: x == 0) 
                          .bfill(limit=N-1)
                          .fillna(0)
                          .astype(bool)
             )

If is important performance, use strides, solution from link was modify:

  • use rolling window approach
  • compare with pattaern and return Trues for match by all
  • get indices of first occurencies by np.mgrid and indexing
  • create all indices with list comprehension
  • compare by numpy.in1d and create new column

def rolling_window(a, window):
    shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
    strides = a.strides + (a.strides[-1],)
    c = np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
    return c

arr = df['row_pat'].values
b = np.all(rolling_window(arr, N) == pat, axis=1)
c = np.mgrid[0:len(b)][b]

d = [i  for x in c for i in range(x, x+N)]
df['rm2'] = np.in1d(np.arange(len(arr)), d)

Another solution, thanks @divakar:

arr = df['row_pat'].values
b = np.all(rolling_window(arr, N) == pat, axis=1)

m = (rolling_window(arr, len(pat)) == pat).all(1)
m_ext = np.r_[m,np.zeros(len(arr) - len(m), dtype=bool)]
df['rm1'] = binary_dilation(m_ext, structure=[1]*N, origin=-(N//2))

Timings:

np.random.seed(456) 

import pandas as pd
from numpy.random import choice, randn
from scipy.ndimage.morphology import binary_dilation
import string

# df constructor
n_rows = 100000
df = pd.DataFrame({'date_time': pd.date_range('2/9/2018', periods=n_rows, freq='H'),
                   'group_var': choice(list(string.ascii_uppercase), n_rows),
                   'row_pat': choice([0, 1, 2, 3], n_rows),
                   'values': randn(n_rows)})

# sorting 
df.sort_values(by=['group_var', 'date_time'], inplace=True)

def rolling_window(a, window):
    shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
    strides = a.strides + (a.strides[-1],)
    c = np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
    return c


arr = df['row_pat'].values
b = np.all(rolling_window(arr, N) == pat, axis=1)

m = (rolling_window(arr, len(pat)) == pat).all(1)
m_ext = np.r_[m,np.zeros(len(arr) - len(m), dtype=bool)]
df['rm1'] = binary_dilation(m_ext, structure=[1]*N, origin=-(N//2))

arr = df['row_pat'].values
b = np.all(rolling_window(arr, N) == pat, axis=1)
c = np.mgrid[0:len(b)][b]

d = [i  for x in c for i in range(x, x+N)]
df['rm2'] = np.in1d(np.arange(len(arr)), d)

print (df.iloc[460:480])

                date_time group_var  row_pat    values    rm0    rm1    rm2
12045 2019-06-25 21:00:00         A        3 -0.081152  False  False  False
12094 2019-06-27 22:00:00         A        1 -0.818167  False  False  False
12125 2019-06-29 05:00:00         A        0 -0.051088  False  False  False
12143 2019-06-29 23:00:00         A        0 -0.937589  False  False  False
12145 2019-06-30 01:00:00         A        3  0.298460  False  False  False
12158 2019-06-30 14:00:00         A        1  0.647161  False  False  False
12164 2019-06-30 20:00:00         A        3 -0.735538  False  False  False
12210 2019-07-02 18:00:00         A        1 -0.881740  False  False  False
12341 2019-07-08 05:00:00         A        3  0.525652  False  False  False
12343 2019-07-08 07:00:00         A        1  0.311598  False  False  False
12358 2019-07-08 22:00:00         A        1 -0.710150   True   True   True
12360 2019-07-09 00:00:00         A        2 -0.752216   True   True   True
12400 2019-07-10 16:00:00         A        2 -0.205122   True   True   True
12404 2019-07-10 20:00:00         A        0  1.342591   True   True   True
12413 2019-07-11 05:00:00         A        1  1.707748  False  False  False
12506 2019-07-15 02:00:00         A        2  0.319227  False  False  False
12527 2019-07-15 23:00:00         A        3  2.130917  False  False  False
12600 2019-07-19 00:00:00         A        1 -1.314070  False  False  False
12604 2019-07-19 04:00:00         A        0  0.869059  False  False  False
12613 2019-07-19 13:00:00         A        2  1.342101  False  False  False

In [225]: %%timeit
     ...: df['rm0'] = (df['row_pat'].rolling(window=N , min_periods=N)
     ...:                           .apply(lambda x: (x==pat).all())
     ...:                           .mask(lambda x: x == 0) 
     ...:                           .bfill(limit=N-1)
     ...:                           .fillna(0)
     ...:                           .astype(bool)
     ...:              )
     ...: 
1 loop, best of 3: 356 ms per loop

In [226]: %%timeit
     ...: arr = df['row_pat'].values
     ...: b = np.all(rolling_window(arr, N) == pat, axis=1)
     ...: c = np.mgrid[0:len(b)][b]
     ...: d = [i  for x in c for i in range(x, x+N)]
     ...: df['rm2'] = np.in1d(np.arange(len(arr)), d)
     ...: 
100 loops, best of 3: 7.63 ms per loop

In [227]: %%timeit
     ...: arr = df['row_pat'].values
     ...: b = np.all(rolling_window(arr, N) == pat, axis=1)
     ...: 
     ...: m = (rolling_window(arr, len(pat)) == pat).all(1)
     ...: m_ext = np.r_[m,np.zeros(len(arr) - len(m), dtype=bool)]
     ...: df['rm1'] = binary_dilation(m_ext, structure=[1]*N, origin=-(N//2))
     ...: 
100 loops, best of 3: 7.25 ms per loop
查看更多
萌系小妹纸
3楼-- · 2019-01-25 14:07

You can do this by defining a custom aggregate function, then using it in group_by statement, finally merge it back to the original dataframe. Something like this:

Aggregate function:

def pattern_detect(column):
 # define any other pattern to detect here
 p0, p1, p2, p3 = 1, 2, 2, 0       
 column.eq(p0) & \
 column.shift(-1).eq(p1) & \
 column.shift(-2).eq(p2) & \
 column.shift(-3).eq(p3)
 return column.any()

Use group by function next:

grp = df.group_by('group_var').agg([patter_detect])['row_pat']

Now merge it back to the original dataframe:

df = df.merge(grp, left_on='group_var',right_index=True, how='left')
查看更多
Fickle 薄情
4楼-- · 2019-01-25 14:09

This works.
It works like this:
a) For every group, it takes a window of size 4 and scans through the column until it finds the combination (1,2,2,0) in exact sequence. As soon as it finds the sequence, it populates the corresponding index values of new column 'pat_flag' with 1.
b) If it doesn't find the combination, it populates the column with 0.

pattern = [1,2,2,0]
def get_pattern(df):

    df = df.reset_index(drop=True)
    df['pat_flag'] = 0

    get_indexes = [] 
    temp = []

    for index, row in df.iterrows():

        mindex = index +1

        # get the next 4 values
        for j in range(mindex, mindex+4):

            if j == df.shape[0]:
                break
            else:
                get_indexes.append(j)
                temp.append(df.loc[j,'row_pat'])

        # check if sequence is matched
        if temp == pattern:
            df.loc[get_indexes,'pat_flag'] = 1
        else:
            # reset if the pattern is not found in given window
            temp = []
            get_indexes = []

    return df

# apply function to the groups
df = df.groupby('group_var').apply(get_pattern)

## snippet of output 

        date_time       group_var   row_pat     values  pat_flag
41  2018-03-13 21:00:00      C         3       0.731114     0
42  2018-03-14 05:00:00      C         0       1.350164     0
43  2018-03-14 11:00:00      C         1      -0.429754     1
44  2018-03-14 12:00:00      C         2       1.238879     1
45  2018-03-15 17:00:00      C         2      -0.739192     1
46  2018-03-18 06:00:00      C         0       0.806509     1
47  2018-03-20 06:00:00      C         1       0.065105     0
48  2018-03-20 08:00:00      C         1       0.004336     0
查看更多
Anthone
5楼-- · 2019-01-25 14:14

You could make use of the pd.rolling() methods and then simply compare the arrays that it returns with the array that contains the pattern that you are attempting to match on.

pattern = np.asarray([1.0, 2.0, 2.0, 0.0])
n_obs = len(pattern)
df['rolling_match'] = (df['row_pat']
                       .rolling(window=n_obs , min_periods=n_obs)
                       .apply(lambda x: (x==pattern).all())
                       .astype(bool)             # All as bools
                       .shift(-1 * (n_obs - 1))  # Shift back
                       .fillna(False)            # convert NaNs to False
                       )

It is important to specify the min periods here in order to ensure that you only find exact matches (and so the equality check won't fail when the shapes are misaligned). The apply function is doing a pairwise check between the two arrays, and then we use the .all() to ensure all match. We convert to a bool, and then call shift on the function to move it to being a 'forward looking' indicator instead of only occurring after the fact.

Help on the rolling functionality available here - https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.rolling.html

查看更多
Luminary・发光体
6楼-- · 2019-01-25 14:16

Expanding on Emmet02's answer: using the rolling function for all groups and setting match-column to 1 for all matching pattern indices:

pattern = np.asarray([1,2,2,0])

# Create a match column in the main dataframe
df.assign(match=False, inplace=True)

for group_var, group in df.groupby("group_var"):

    # Per group do rolling window matching, the last 
    # values of matching patterns in array 'match'
    # will be True
    match = (
        group['row_pat']
        .rolling(window=len(pattern), min_periods=len(pattern))
        .apply(lambda x: (x==pattern).all())
    )

    # Get indices of matches in current group
    idx = np.arange(len(group))[match == True]

    # Include all indices of matching pattern, 
    # counting back from last index in pattern
    idx = idx.repeat(len(pattern)) - np.tile(np.arange(len(pattern)), len(idx))

    # Update matches
    match.values[idx] = True
    df.loc[group.index, 'match'] = match

df[df.match==True]

edit: Without a for loop

# Do rolling matching in group clause
match = (
    df.groupby("group_var")
    .rolling(len(pattern))
    .row_pat.apply(lambda x: (x==pattern).all())
)

# Convert NaNs
match = (~match.isnull() & match)

# Get indices of matches in current group
idx = np.arange(len(df))[match]
# Include all indices of matching pattern
idx = idx.repeat(len(pattern)) - np.tile(np.arange(len(pattern)), len(idx))

# Mark all indices that are selected by "idx" in match-column
df = df.assign(match=df.index.isin(df.index[idx]))
查看更多
登录 后发表回答