Highlight the outliers area in CDF plot

2019-03-06 18:43发布

问题:

I am trying to highlight the area of the CDF in which the "outliers" fall in my visualization (perhaps a light red shading to differentiate the area).

Can you assist with shading the area where the "outlier" points fall, as per the definition in the code below? For some reason, when I try to look at what the outlier function returns, I get an empty output, whether I use print(outliers_iqr(days)) or print(str(outliers_iqr(days))[1:-1]). It just prints array([], dtype=int64).

This is my current code:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

# Sample observations; wrapped below in a one-column DataFrame named "days".
a = [389, 350, 130, 344, 392, 92, 51, 28, 309, 357, 64, 380, 332, 109, 284, 105, 
 50, 66, 156, 116, 75, 315, 155, 34, 155, 241, 320, 50, 97, 41, 274, 99, 133, 
 95, 306, 62, 187, 56, 110, 338, 102, 285, 386, 231, 238, 145, 216, 148, 105, 
 368, 176, 155, 106, 107, 36, 16, 28, 6, 322, 95, 122, 82, 64, 35, 72, 214, 
 192, 91, 117, 277, 101, 159, 96, 325, 79, 154, 314, 142, 147, 138, 48, 50, 
 178, 146, 224, 282, 141, 75, 151, 93, 135, 82, 125, 111, 49, 113, 165, 19, 
 118, 105, 92, 133, 77, 54, 72, 34]

#create CDF definition
def ecdf(data):
    """Return the empirical CDF of *data* as a pair of 1-D arrays ``(x, y)``.

    Parameters
    ----------
    data : array-like
        Observations. Lists, arrays and DataFrames are accepted; the values
        are flattened before sorting.

    Returns
    -------
    x : numpy.ndarray
        The observations sorted in ascending order.
    y : numpy.ndarray
        Cumulative fractions, ``y[i] = (i + 1) / n``.
    """
    # Flatten first: np.sort works along the last axis, so an (n, 1) column
    # (e.g. a one-column DataFrame) would otherwise be returned unsorted.
    x = np.sort(np.asarray(data).ravel())
    n = x.size
    y = np.arange(1.0, n + 1) / n
    return x, y

#Using +-1.5x IQR method for defining outliers
def outliers_iqr(ys):
    """Locate outliers in *ys* with the 1.5 x IQR rule.

    A value is an outlier when it lies below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR``.

    Parameters
    ----------
    ys : array-like
        Observations; lists, arrays and DataFrames are accepted.

    Returns
    -------
    tuple of numpy.ndarray
        Index arrays in the form returned by ``np.where`` (one array per
        dimension of the input). The arrays are empty when no value falls
        outside the bounds.
    """
    # Coerce once so plain Python sequences can be compared element-wise.
    values = np.asarray(ys)
    if values.size == 0:
        # Percentiles are undefined for empty input; nothing can be an outlier.
        return np.where(np.zeros(values.shape, dtype=bool))

    quartile_1, quartile_3 = np.percentile(values, [25, 75])
    iqr = quartile_3 - quartile_1
    lower_bound = quartile_1 - (iqr * 1.5)
    upper_bound = quartile_3 + (iqr * 1.5)
    return np.where((values > upper_bound) | (values < lower_bound))

# Wrap the sample list in a one-column DataFrame.
days = pd.DataFrame({"days" : a})

# x: the data after np.sort inside ecdf; y: cumulative fractions 1/n .. 1.
x, y = ecdf(days)

# Draw the ECDF as unconnected point markers.
plt.plot(x, y, marker='.', linestyle='none') 
plt.axvline(x.mean(), color='gray', linestyle='dashed', linewidth=2) # dashed vertical line at the mean

# Anchor for the mean annotation: integer mean on x, percentile rank of the
# mean (converted from percent to a 0-1 fraction) on y.
x_m = int(x.mean())
# NOTE(review): DataFrame.as_matrix() is not available in recent pandas
# releases; `.values` looks like the replacement -- confirm for the version in use.
y_m = stats.percentileofscore(days.as_matrix(), x.mean())/100.0

ax=plt.gca()
ax.annotate('(%s,%s)' % (x_m,int(y_m*100)) , xy=(x_m,y_m), 
            xytext=(10,-5), textcoords='offset points')

# Indices of the values outside the 1.5 x IQR bounds (np.where output).
outliers= outliers_iqr(days) 
print(outliers_iqr(days)) # print the outlier indices; the question reports empty arrays here
print(str(outliers_iqr(days))[1:-1]) # same result with the outer brackets of the repr stripped

# Highlight the outlier area in the CDF plot.
# The `?` arguments are placeholders the author has not filled in, so these
# two lines are not valid Python as written.
ax.fill_between(?, ?, ?, where=?, facecolor='red', alpha=0.3) #between 0 and 1st quartile
ax.fill_between(?, ?, ?, where=?, facecolor='red', alpha=0.3) #between 3rd quartile and 1

# Quartile values on x, with the matching cumulative fractions on y.
percentiles= np.array([25,50,75])
x_p = np.percentile(days, percentiles)
y_p = percentiles/100.0

plt.plot(x_p, y_p, marker='D', color='red', linestyle='none') # Overlay quartiles

# Label each quartile marker with its integer value.
# NOTE: this loop rebinds x and y, which held the ECDF arrays above.
for x,y in zip(x_p, y_p):                                        
    ax.annotate('%s' % int(x), xy=(x,y), xytext=(10,-5), textcoords='offset points')

plt.xlabel('Days')
plt.ylabel('ECDF')
plt.legend(('Days', "Mean", 'Quartiles'), loc='lower right')

plt.show()

回答1:

If your array of outliers can sometimes be empty, you have to handle that case with an if statement. Also, as you just want to shade regions of your plot, you can use Axes.axvspan for that. Here is an example that is somewhat modified from the original (all the plotting commands are inside a function, and a second subplot is added with data that actually has outliers):

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

# Sample observations; wrapped below in a one-column DataFrame named "days".
a = [389, 350, 130, 344, 392, 92, 51, 28, 309, 357, 64, 380, 332, 109, 284, 105, 
 50, 66, 156, 116, 75, 315, 155, 34, 155, 241, 320, 50, 97, 41, 274, 99, 133, 
 95, 306, 62, 187, 56, 110, 338, 102, 285, 386, 231, 238, 145, 216, 148, 105, 
 368, 176, 155, 106, 107, 36, 16, 28, 6, 322, 95, 122, 82, 64, 35, 72, 214, 
 192, 91, 117, 277, 101, 159, 96, 325, 79, 154, 314, 142, 147, 138, 48, 50, 
 178, 146, 224, 282, 141, 75, 151, 93, 135, 82, 125, 111, 49, 113, 165, 19, 
 118, 105, 92, 133, 77, 54, 72, 34]


#create CDF definition
def ecdf(data):
    """Return the empirical CDF of *data* as a pair of 1-D arrays ``(x, y)``.

    Parameters
    ----------
    data : array-like
        Observations. Lists, arrays and DataFrames are accepted; the values
        are flattened before sorting.

    Returns
    -------
    x : numpy.ndarray
        The observations sorted in ascending order.
    y : numpy.ndarray
        Cumulative fractions, ``y[i] = (i + 1) / n``.
    """
    # Flatten first: np.sort works along the last axis, so an (n, 1) column
    # (e.g. a one-column DataFrame) would otherwise be returned unsorted.
    x = np.sort(np.asarray(data).ravel())
    n = x.size
    y = np.arange(1.0, n + 1) / n
    return x, y

#Using +-1.5x IQR method for defining outliers
def outliers_iqr(ys):
    """Locate low and high outliers in *ys* with the 1.5 x IQR rule.

    Parameters
    ----------
    ys : array-like
        Observations; lists, arrays and DataFrames are accepted.

    Returns
    -------
    low, high : tuple
        Two ``np.where`` results. ``low`` indexes the values below
        ``Q1 - 1.5 * IQR`` and ``high`` indexes the values above
        ``Q3 + 1.5 * IQR``. Either may hold empty index arrays.
    """
    # Coerce once so plain Python sequences can be compared element-wise.
    values = np.asarray(ys)
    if values.size == 0:
        # Percentiles are undefined for empty input; nothing can be an outlier.
        nothing = np.zeros(values.shape, dtype=bool)
        return np.where(nothing), np.where(nothing)

    quartile_1, quartile_3 = np.percentile(values, [25, 75])
    iqr = quartile_3 - quartile_1
    lower_bound = quartile_1 - (iqr * 1.5)
    upper_bound = quartile_3 + (iqr * 1.5)

    return np.where(values < lower_bound), np.where(values > upper_bound)



def generate_plot(ax, df):
    """Draw an annotated ECDF of the values in *df* on *ax*.

    The plot shows the ECDF points, a dashed line at the mean (annotated with
    its value and percentile rank), red diamonds at the three quartiles, and
    a red band over each group of IQR outliers (if any).

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        Axes to draw on.
    df : pandas.DataFrame
        Frame holding the observations; all of its values are used.
    """
    # Work on a flat 1-D array: it sorts correctly inside ecdf and avoids the
    # DataFrame.as_matrix() call, which recent pandas releases no longer have.
    values = np.asarray(df.values).ravel()

    x, y = ecdf(values)
    mean = x.mean()

    # Artists carry their own labels so the legend cannot be mismatched when
    # the outlier bands add extra artists between the lines.
    ax.plot(x, y, marker='.', linestyle='none', label='Days')
    ax.axvline(mean, color='gray', linestyle='dashed', linewidth=2,
               label='Mean')

    # Annotate the mean with (integer mean, percentile rank in percent).
    x_m = int(mean)
    y_m = stats.percentileofscore(values, mean) / 100.0
    ax.annotate('(%s,%s)' % (x_m, int(y_m * 100)), xy=(x_m, y_m),
                xytext=(10, -5), textcoords='offset points')

    # Highlight the outlier areas: one band for the low side, one for the
    # high side. Each band runs from the smallest to the largest outlier on
    # that side, so a single outlier yields a zero-width band.
    for outl in outliers_iqr(values):
        vals = values[outl]
        if vals.size > 0:
            ax.axvspan(np.min(vals), np.max(vals), alpha=0.5, color='red')

    # Overlay the quartiles and label each marker with its integer value.
    percentiles = np.array([25, 50, 75])
    x_p = np.percentile(values, percentiles)
    y_p = percentiles / 100.0
    ax.plot(x_p, y_p, marker='D', color='red', linestyle='none',
            label='Quartiles')

    for q_x, q_y in zip(x_p, y_p):
        ax.annotate('%s' % int(q_x), xy=(q_x, q_y), xytext=(10, -5),
                    textcoords='offset points')

    ax.set_xlabel('Days')
    ax.set_ylabel('ECDF')
    ax.legend(loc='lower right')


# One row with two side-by-side panels.
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
left_ax, right_ax = axes

# Left panel: the original sample.
days = pd.DataFrame({"days": a})
generate_plot(left_ax, days)

# Right panel: fake data meant to contain outliers -- a wide central cluster
# plus two small, tight clusters placed far out on either side.
centre = np.random.normal(200, 50, 300)
low_cluster = np.random.normal(25, 10, 20)
high_cluster = np.random.normal(375, 10, 20)
b = np.concatenate([centre, low_cluster, high_cluster])
np.random.shuffle(b)
generate_plot(right_ax, pd.DataFrame({"days": b}))

# Title each panel.
for panel, title in zip(axes, ('original data', 'fake data with outliers')):
    panel.set_title(title)

plt.show()

The result looks like this:

Hope this helps.