I am trying to highlight the area of the CDF in which the "outliers" fall in my visualization (perhaps a light red shading to differentiate the area).
Can you help me shade the area where the "outlier" points fall, per the definition below? For some reason, when I try to inspect what the outlier function returns, I get an empty output, whether I use print(outliers_iqr(days))
or print(str(outliers_iqr(days))[1:-1]).
It just prints array([], dtype=int64).
This is my current code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
a = [389, 350, 130, 344, 392, 92, 51, 28, 309, 357, 64, 380, 332, 109, 284, 105,
50, 66, 156, 116, 75, 315, 155, 34, 155, 241, 320, 50, 97, 41, 274, 99, 133,
95, 306, 62, 187, 56, 110, 338, 102, 285, 386, 231, 238, 145, 216, 148, 105,
368, 176, 155, 106, 107, 36, 16, 28, 6, 322, 95, 122, 82, 64, 35, 72, 214,
192, 91, 117, 277, 101, 159, 96, 325, 79, 154, 314, 142, 147, 138, 48, 50,
178, 146, 224, 282, 141, 75, 151, 93, 135, 82, 125, 111, 49, 113, 165, 19,
118, 105, 92, 133, 77, 54, 72, 34]
#create CDF definition
def ecdf(data):
    """Compute the empirical cumulative distribution function of a sample.

    Parameters
    ----------
    data : array-like
        The observations. Lists, NumPy arrays, pandas Series and
        single-column DataFrames are accepted; the input is flattened to
        one dimension before sorting.

    Returns
    -------
    x : numpy.ndarray
        The observations sorted in ascending order (1-D).
    y : numpy.ndarray
        The ECDF value for each entry of ``x``: ``i / n`` for the i-th
        smallest observation, so the last value is 1.0.
    """
    # Flatten first: np.sort works along the LAST axis by default, so an
    # (n, 1) input such as a single-column DataFrame would be "sorted" one
    # row at a time and come back in its original, unsorted order.
    x = np.sort(np.asarray(data).ravel())
    # Derive n from the flattened sample so x and y always have equal length.
    n = x.size
    y = np.arange(1.0, n + 1) / n
    return x, y
#Using +-1.5x IQR method for defining outliers
def outliers_iqr(ys, multiplier=1.5):
    """Locate outliers using the interquartile-range (IQR) fence rule.

    A value is flagged when it lies below ``Q1 - multiplier * IQR`` or
    above ``Q3 + multiplier * IQR``, where ``Q1``/``Q3`` are the 25th and
    75th percentiles and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    ys : array-like
        The observations (list, NumPy array, pandas Series or DataFrame).
    multiplier : float, optional
        Width of the fences in units of the IQR. Defaults to 1.5.

    Returns
    -------
    tuple of numpy.ndarray
        The result of ``np.where`` on the outlier mask: one index array per
        dimension of the input (a 1-tuple for 1-D input). These are
        POSITIONS of the outliers, not the outlier values themselves.
        Every array is empty when no observation falls outside the
        fences, which is a legitimate result, not an error.
    """
    # Normalise to an ndarray so comparisons are element-wise for any input.
    values = np.asarray(ys)
    quartile_1, quartile_3 = np.percentile(values, [25, 75])
    iqr = quartile_3 - quartile_1
    lower_bound = quartile_1 - (iqr * multiplier)
    upper_bound = quartile_3 + (iqr * multiplier)
    return np.where((values > upper_bound) | (values < lower_bound))
days = pd.DataFrame({"days": a})
# Work on a flat 1-D array from here on. Passing the (n, 1) DataFrame itself
# makes np.sort operate row-by-row, and DataFrame.as_matrix() no longer
# exists in current pandas releases.
values = days["days"].values

x, y = ecdf(values)
plt.plot(x, y, marker='.', linestyle='none', label='Days')

mean = values.mean()
plt.axvline(mean, color='gray', linestyle='dashed', linewidth=2,
            label='Mean')  # Add mean
x_m = int(mean)
y_m = stats.percentileofscore(values, mean) / 100.0
ax = plt.gca()
ax.annotate('(%s,%s)' % (x_m, int(y_m * 100)), xy=(x_m, y_m),
            xytext=(10, -5), textcoords='offset points')

# IQR fences: anything below lower_bound or above upper_bound is an outlier.
quartile_1, quartile_3 = np.percentile(values, [25, 75])
iqr = quartile_3 - quartile_1
lower_bound = quartile_1 - 1.5 * iqr
upper_bound = quartile_3 + 1.5 * iqr

# outliers_iqr returns index positions; look the actual values up.
# For this dataset the fences (about -129.5 and 422.5) lie outside the data
# range, so an empty array here is the expected result, not a bug.
outliers = values[outliers_iqr(values)[0]]
print("IQR fences: [%s, %s]" % (lower_bound, upper_bound))
print("Outliers:", outliers)

# Highlight the outlier regions of the CDF plot. The regions lie BEYOND the
# fences (not between the axis and the quartiles), so widen the x-range far
# enough that both shaded bands are visible.
pad = 0.25 * iqr
x_left = min(values.min(), lower_bound) - pad
x_right = max(values.max(), upper_bound) + pad
ax.fill_between([x_left, lower_bound], 0, 1, facecolor='red', alpha=0.3,
                label='Outlier region')  # below the lower fence
ax.fill_between([upper_bound, x_right], 0, 1, facecolor='red',
                alpha=0.3)  # above the upper fence
ax.set_xlim(x_left, x_right)

percentiles = np.array([25, 50, 75])
x_p = np.percentile(values, percentiles)
y_p = percentiles / 100.0
plt.plot(x_p, y_p, marker='D', color='red', linestyle='none',
         label='Quartiles')  # Overlay quartiles
# Distinct loop names so the ECDF arrays x and y are not overwritten.
for q_x, q_y in zip(x_p, y_p):
    ax.annotate('%s' % int(q_x), xy=(q_x, q_y), xytext=(10, -5),
                textcoords='offset points')

plt.xlabel('Days')
plt.ylabel('ECDF')
# Labels are attached to each artist above, so the legend cannot be
# mis-assigned when artists are added or reordered.
plt.legend(loc='lower right')
plt.show()