Highlight the outliers area in CDF plot

I am trying to highlight the area of the CDF in which the "outliers" fall in my visualization (perhaps a light red shading to differentiate the area).

Can you assist with shading the area where the "outlier" points fall, as per the definition in the code below? For some reason, when I try to look at what the outlier definition returns, I get an empty output, whether I use print(outliers_iqr(days)) or print(str(outliers_iqr(days))[1:-1]). It just prints array([], dtype=int64).

This is my current code:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

a = [389, 350, 130, 344, 392, 92, 51, 28, 309, 357, 64, 380, 332, 109, 284, 105, 
 50, 66, 156, 116, 75, 315, 155, 34, 155, 241, 320, 50, 97, 41, 274, 99, 133, 
 95, 306, 62, 187, 56, 110, 338, 102, 285, 386, 231, 238, 145, 216, 148, 105, 
 368, 176, 155, 106, 107, 36, 16, 28, 6, 322, 95, 122, 82, 64, 35, 72, 214, 
 192, 91, 117, 277, 101, 159, 96, 325, 79, 154, 314, 142, 147, 138, 48, 50, 
 178, 146, 224, 282, 141, 75, 151, 93, 135, 82, 125, 111, 49, 113, 165, 19, 
 118, 105, 92, 133, 77, 54, 72, 34]

#create CDF definition
def ecdf(data):
    """Return the empirical CDF of *data* as a pair ``(x, y)``.

    data: any array-like of numbers (list, ndarray, Series, or a
        single-column DataFrame); it is flattened before use.

    Returns:
        x: 1-D array of the values sorted in ascending order.
        y: 1-D array of the same length with y[i] = (i + 1) / n, i.e. the
           fraction of samples that are <= x[i].
    """
    # Flatten first: np.sort works along the last axis, so an (n, 1) input
    # such as a single-column DataFrame would otherwise come back unsorted.
    x = np.sort(np.asarray(data).ravel())
    n = x.size
    y = np.arange(1.0, n + 1) / n
    return x, y

#Using +-1.5x IQR method for defining outliers
def outliers_iqr(ys):
    """Locate the outliers of *ys* using the 1.5 * IQR rule.

    A value is an outlier when it is below Q1 - 1.5 * IQR or above
    Q3 + 1.5 * IQR, where Q1/Q3 are the 25th/75th percentiles.

    ys: array-like of numbers (list, ndarray, DataFrame, ...).

    Returns the ``np.where`` index tuple of the outlying positions: one
    index array per dimension of the input. The arrays are empty when no
    value lies outside the bounds.
    """
    # Coerce once so that plain Python sequences support the element-wise
    # comparisons below.
    ys = np.asarray(ys)
    quartile_1, quartile_3 = np.percentile(ys, [25, 75])
    iqr = quartile_3 - quartile_1
    lower_bound = quartile_1 - (iqr * 1.5)
    upper_bound = quartile_3 + (iqr * 1.5)
    return np.where((ys > upper_bound) | (ys < lower_bound))

# Wrap the raw list in a single-column DataFrame.
days = pd.DataFrame({"days" : a})

# ECDF coordinates for the sample.
x, y = ecdf(days)

# Draw each sample as a dot, without a connecting line.
plt.plot(x, y, marker='.', linestyle='none') 
plt.axvline(x.mean(), color='gray', linestyle='dashed', linewidth=2) # dashed vertical line at the mean

# Position of the mean on the plot: its (truncated) value and its
# percentile rank within the sample, rescaled from 0-100 to 0-1.
x_m = int(x.mean())
y_m = stats.percentileofscore(days.as_matrix(), x.mean())/100.0

# Label that point as "(mean,percentile)".
ax=plt.gca()
ax.annotate('(%s,%s)' % (x_m,int(y_m*100)) , xy=(x_m,y_m), 
            xytext=(10,-5), textcoords='offset points')

# outliers_iqr returns the np.where index arrays of the values outside the
# 1.5*IQR bounds; the arrays are empty when no value lies outside them.
outliers= outliers_iqr(days) 
print(outliers_iqr(days)) # shows the index arrays (empty when there are no outliers)
print(str(outliers_iqr(days))[1:-1]) # same text with the first and last character removed

#highlight the outliers area in the CDF plot
ax.fill_between(?, ?, ?, where=?, facecolor='red', alpha=0.3) #between 0 and 1st quartile
ax.fill_between(?, ?, ?, where=?, facecolor='red', alpha=0.3) #between 3rd quartile and 1

# Quartile positions: x from the data, y as the matching ECDF level.
percentiles= np.array([25,50,75])
x_p = np.percentile(days, percentiles)
y_p = percentiles/100.0

plt.plot(x_p, y_p, marker='D', color='red', linestyle='none') # overlay the quartiles as red diamonds

# Label every quartile marker with its (truncated) x value.
# NOTE: this loop rebinds the names x and y used for the ECDF above.
for x,y in zip(x_p, y_p):                                        
    ax.annotate('%s' % int(x), xy=(x,y), xytext=(10,-5), textcoords='offset points')

# Axis labels and legend, then display the figure.
plt.xlabel('Days')
plt.ylabel('ECDF')
plt.legend(('Days', "Mean", 'Quartiles'), loc='lower right')

plt.show()

标签： python python-2.7 matplotlib ecdf

1条回答

我只想做你的唯一

2楼-- · 2019-03-06 19:29

If your array of outliers can sometimes be empty, you have to handle that case with an if statement. Also, since you just want to shade regions of your plot, you can use Axes.axvspan for that. Here is an example, somewhat modified from the original: all the plotting commands are inside a function, and a second subplot is added with data that actually has outliers:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

a = [389, 350, 130, 344, 392, 92, 51, 28, 309, 357, 64, 380, 332, 109, 284, 105, 
 50, 66, 156, 116, 75, 315, 155, 34, 155, 241, 320, 50, 97, 41, 274, 99, 133, 
 95, 306, 62, 187, 56, 110, 338, 102, 285, 386, 231, 238, 145, 216, 148, 105, 
 368, 176, 155, 106, 107, 36, 16, 28, 6, 322, 95, 122, 82, 64, 35, 72, 214, 
 192, 91, 117, 277, 101, 159, 96, 325, 79, 154, 314, 142, 147, 138, 48, 50, 
 178, 146, 224, 282, 141, 75, 151, 93, 135, 82, 125, 111, 49, 113, 165, 19, 
 118, 105, 92, 133, 77, 54, 72, 34]


#create CDF definition
def ecdf(data):
    """Return the empirical CDF of *data* as a pair ``(x, y)``.

    data: any array-like of numbers (list, ndarray, Series, or a
        single-column DataFrame); it is flattened before use.

    Returns:
        x: 1-D array of the values sorted in ascending order.
        y: 1-D array of the same length with y[i] = (i + 1) / n, i.e. the
           fraction of samples that are <= x[i].
    """
    # Flatten first: np.sort works along the last axis, so an (n, 1) input
    # such as a single-column DataFrame would otherwise come back unsorted.
    x = np.sort(np.asarray(data).ravel())
    n = x.size
    y = np.arange(1.0, n + 1) / n
    return x, y

#Using +-1.5x IQR method for defining outliers
def outliers_iqr(ys):
    """Locate the low and high outliers of *ys* using the 1.5 * IQR rule.

    ys: array-like of numbers (list, ndarray, ...).

    Returns a pair ``(low, high)`` of ``np.where`` index tuples:
        low:  positions of values below Q1 - 1.5 * IQR
        high: positions of values above Q3 + 1.5 * IQR
    Either may contain only empty index arrays when there is no outlier
    on that side.
    """
    # Coerce once so that plain Python sequences support the element-wise
    # comparisons below.
    ys = np.asarray(ys)
    quartile_1, quartile_3 = np.percentile(ys, [25, 75])
    iqr = quartile_3 - quartile_1
    lower_bound = quartile_1 - (iqr * 1.5)
    upper_bound = quartile_3 + (iqr * 1.5)

    return np.where(ys < lower_bound), np.where(ys > upper_bound)



def generate_plot(ax, df):
    """Draw the ECDF of *df* on *ax* and shade the outlier regions.

    ax: matplotlib Axes to draw on.
    df: the sample, e.g. a single-column DataFrame; it is flattened to a
        1-D array before use.

    The plot contains the ECDF points, a dashed line at the mean annotated
    with "(mean,percentile)", red diamonds at the 25/50/75th percentiles
    annotated with their values, and a red span over each non-empty group
    of outliers returned by outliers_iqr (from the smallest to the largest
    outlier of that group). Returns None.
    """
    # Work on a flat 1-D array throughout: DataFrame.as_matrix() is no
    # longer available in current pandas, and 1-D input keeps the sort in
    # ecdf and the percentile rank below well defined.
    values = np.asarray(df).ravel()

    x, y = ecdf(values)
    mean = x.mean()

    # Explicit labels tie each legend entry to its own artist, regardless
    # of what else (e.g. the shaded spans) is drawn on the axes.
    ax.plot(x, y, marker='.', linestyle='none', label='Days')
    ax.axvline(mean, color='gray', linestyle='dashed', linewidth=2,
               label='Mean')

    # Annotate the mean with its truncated value and percentile rank.
    x_m = int(mean)
    y_m = stats.percentileofscore(values, mean) / 100.0
    ax.annotate('(%s,%s)' % (x_m, int(y_m * 100)), xy=(x_m, y_m),
                xytext=(10, -5), textcoords='offset points')

    # Highlight the outlier areas; a side without outliers yields an empty
    # selection and is skipped.
    for outl in outliers_iqr(values):
        vals = values[outl]
        if vals.size > 0:
            ax.axvspan(np.min(vals), np.max(vals), alpha=0.5, color='red')

    # Overlay the quartiles.
    percentiles = np.array([25, 50, 75])
    x_p = np.percentile(values, percentiles)
    y_p = percentiles / 100.0
    ax.plot(x_p, y_p, marker='D', color='red', linestyle='none',
            label='Quartiles')

    # Distinct loop names so the ECDF arrays x and y are not rebound.
    for q_x, q_y in zip(x_p, y_p):
        ax.annotate('%s' % int(q_x), xy=(q_x, q_y), xytext=(10, -5),
                    textcoords='offset points')

    ax.set_xlabel('Days')
    ax.set_ylabel('ECDF')
    ax.legend(loc='lower right')


# One row, two columns: the real sample on the left, synthetic data on the right.
fig, axes = plt.subplots(1, 2, figsize=(10, 5))

# Left panel: the original data.
days = pd.DataFrame({"days": a})
generate_plot(axes[0], days)
axes[0].set_title('original data')

# Right panel: fake data with outliers, drawn as three normal samples
# (mean, standard deviation, sample count) that are then shuffled together.
clusters = [
    np.random.normal(loc, scale, size)
    for loc, scale, size in ((200, 50, 300), (25, 10, 20), (375, 10, 20))
]
b = np.concatenate(clusters)
np.random.shuffle(b)
generate_plot(axes[1], pd.DataFrame({"days": b}))
axes[1].set_title('fake data with outliers')

plt.show()

The result looks like this:

Hope this helps.

0人赞添加讨论(0) 举报

Highlight the outliers area in CDF plot

采纳回答

编辑标签

举报内容

检举类型

检举原因

检举说明(必填)

打开微信“扫一扫”，打开网页后点击屏幕右上角分享按钮

付费偷看金额在0.1-10元之间