Fill area of overlap between two normal distributions

I want to fill the area overlapping between two normal distributions. I've got the x min and max, but I can't figure out how to set the y boundaries.

I've looked at the plt documentation and some examples. I think this related question and this one come close, but no luck. Here's what I have so far.

import numpy as np
import seaborn as sns
import scipy.stats as stats
import matplotlib.pyplot as plt

# Calorie sample plotted under the legend label "Pepe's" further down.
pepe_calories = np.array([361, 291, 263, 284, 311, 284, 282, 228, 328, 263, 354, 302, 293,
       254, 297, 281, 307, 281, 262, 302, 244, 259, 273, 299, 278, 257,
       296, 237, 276, 280, 291, 278, 251, 313, 314, 323, 333, 270, 317,
       321, 307, 256, 301, 264, 221, 251, 307, 283, 300, 292, 344, 239,
       288, 356, 224, 246, 196, 202, 314, 301, 336, 294, 237, 284, 311,
       257, 255, 287, 243, 267, 253, 257, 320, 295, 295, 271, 322, 343,
       313, 293, 298, 272, 267, 257, 334, 276, 337, 325, 261, 344, 298,
       253, 302, 318, 289, 302, 291, 343, 310, 241])



# Calorie sample plotted under the legend label "Modern" further down.
modern_calories = np.array([310, 315, 303, 360, 339, 416, 278, 326, 316, 314, 333, 317, 357,
       304, 363, 387, 279, 350, 367, 321, 366, 311, 308, 303, 299, 363,
       335, 357, 392, 321, 361, 285, 321, 290, 392, 341, 331, 338, 326,
       314, 327, 320, 293, 333, 297, 315, 365, 408, 352, 359, 312, 300,
       263, 358, 345, 360, 336, 378, 315, 354, 318, 300, 372, 305, 336,
       286, 296, 413, 383, 328, 418, 388, 416, 371, 313, 321, 321, 317,
       402, 290, 328, 344, 330, 319, 309, 327, 351, 324, 278, 369, 416,
       359, 381, 324, 306, 350, 385, 335, 395, 308])

# Ask seaborn for a fitted normal curve per sample, with the histogram and
# KDE switched off. Only the axes returned by the second call is kept in `ax`;
# the code below reads both curves from it.
ax = sns.distplot(pepe_calories, fit_kws={"color":"blue"}, kde=False,
        fit=stats.norm, hist=None, label="Pepe's");
ax = sns.distplot(modern_calories, fit_kws={"color":"orange"}, kde=False,
        fit=stats.norm, hist=None, label="Modern");

# Get the two lines from the axes to generate shading
# (index 0 is read as the first curve drawn, index 1 as the second).
l1 = ax.lines[0]
l2 = ax.lines[1]

# Get the xy data from the lines so that we can shade:
# column 0 of get_xydata() holds the x values, column 1 the y values.
x1 = l1.get_xydata()[:,0]
y1 = l1.get_xydata()[:,1]
x2 = l2.get_xydata()[:,0]
y2 = l2.get_xydata()[:,1]

# Horizontal window for the shading: from the smallest x of the second curve
# to the largest x of the first curve.
x2min = np.min(x2)
x1max = np.max(x1)

# Only y1 is passed, so the lower boundary is fill_between's default (y = 0):
# this shades everything under the FIRST curve inside the window. y2 is never
# used, which is why the shaded region is not limited to the overlap.
ax.fill_between(x1,y1, where = ((x1 > x2min) & (x1 < x1max)), color="red", alpha=0.3)
#> <matplotlib.collections.PolyCollection at 0x1a200510b8>

plt.legend()
#> <matplotlib.legend.Legend at 0x1a1ff2e390>
plt.show()

Any ideas?

Created on 2018-12-01 by the reprexpy package

import reprexpy
print(reprexpy.SessionInfo())
#> Session info --------------------------------------------------------------------
#> Platform: Darwin-18.2.0-x86_64-i386-64bit (64-bit)
#> Python: 3.6
#> Date: 2018-12-01
#> Packages ------------------------------------------------------------------------
#> matplotlib==2.1.2
#> numpy==1.15.4
#> reprexpy==0.1.1
#> scipy==1.1.0
#> seaborn==0.9.0

标签： python python-3.x matplotlib seaborn

2条回答

放我归山

2楼-- · 2020-07-18 02:46

While gathering the pdf data from get_xydata is clever, you are now at the mercy of matplotlib's rendering / segmentation algorithm. Having x1 and x2 span different ranges also makes comparing y1 and y2 difficult.

You can avoid these problems by fitting the normals yourself instead of letting sns.distplot do it. Then you have more control over the values you are looking for.

import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
norm = stats.norm

# Calorie sample plotted under the legend label "Pepe's" further down.
pepe_calories = np.array([361, 291, 263, 284, 311, 284, 282, 228, 328, 263, 354, 302, 293,
       254, 297, 281, 307, 281, 262, 302, 244, 259, 273, 299, 278, 257,
       296, 237, 276, 280, 291, 278, 251, 313, 314, 323, 333, 270, 317,
       321, 307, 256, 301, 264, 221, 251, 307, 283, 300, 292, 344, 239,
       288, 356, 224, 246, 196, 202, 314, 301, 336, 294, 237, 284, 311,
       257, 255, 287, 243, 267, 253, 257, 320, 295, 295, 271, 322, 343,
       313, 293, 298, 272, 267, 257, 334, 276, 337, 325, 261, 344, 298,
       253, 302, 318, 289, 302, 291, 343, 310, 241])



# Calorie sample plotted under the legend label "Modern" further down.
modern_calories = np.array([310, 315, 303, 360, 339, 416, 278, 326, 316, 314, 333, 317, 357,
       304, 363, 387, 279, 350, 367, 321, 366, 311, 308, 303, 299, 363,
       335, 357, 392, 321, 361, 285, 321, 290, 392, 341, 331, 338, 326,
       314, 327, 320, 293, 333, 297, 315, 365, 408, 352, 359, 312, 300,
       263, 358, 345, 360, 336, 378, 315, 354, 318, 300, 372, 305, 336,
       286, 296, 413, 383, 328, 418, 388, 416, 371, 313, 321, 321, 317,
       402, 290, 328, 344, 330, 319, 309, 327, 351, 324, 278, 369, 416,
       359, 381, 324, 306, 350, 385, 335, 395, 308])


# Fit a normal distribution to each sample; norm.fit returns (loc, scale).
pepe_loc, pepe_scale = norm.fit(pepe_calories)
modern_loc, modern_scale = norm.fit(modern_calories)

# Evaluate both densities on one shared grid that spans the combined range of
# the two samples, so the curves can be compared point by point.
lower = min(pepe_calories.min(), modern_calories.min())
upper = max(pepe_calories.max(), modern_calories.max())
grid = np.linspace(lower, upper, 100)

pepe_pdf = norm.pdf(grid, loc=pepe_loc, scale=pepe_scale)
modern_pdf = norm.pdf(grid, loc=modern_loc, scale=modern_scale)

# The overlap is bounded above by whichever density is lower at each x.
overlap = np.minimum(modern_pdf, pepe_pdf)

fig, ax = plt.subplots()
for density, name, colour in (
    (pepe_pdf, "Pepe's", 'blue'),
    (modern_pdf, "Modern", 'orange'),
):
    ax.plot(grid, density, label=name, color=colour)
# Shade from y = 0 (the fill_between default) up to the lower envelope.
ax.fill_between(grid, overlap, color='red', alpha=0.3)
plt.legend()
plt.show()

If, let's say, sns.distplot (or some other plotting function) made a plot that you did not want to have to reproduce, then you could use the data from get_xydata this way:

import numpy as np
import seaborn as sns
import scipy.stats as stats
import matplotlib.pyplot as plt

# Calorie sample plotted under the legend label "Pepe's" further down.
pepe_calories = np.array([361, 291, 263, 284, 311, 284, 282, 228, 328, 263, 354, 302, 293,
       254, 297, 281, 307, 281, 262, 302, 244, 259, 273, 299, 278, 257,
       296, 237, 276, 280, 291, 278, 251, 313, 314, 323, 333, 270, 317,
       321, 307, 256, 301, 264, 221, 251, 307, 283, 300, 292, 344, 239,
       288, 356, 224, 246, 196, 202, 314, 301, 336, 294, 237, 284, 311,
       257, 255, 287, 243, 267, 253, 257, 320, 295, 295, 271, 322, 343,
       313, 293, 298, 272, 267, 257, 334, 276, 337, 325, 261, 344, 298,
       253, 302, 318, 289, 302, 291, 343, 310, 241])



# Calorie sample plotted under the legend label "Modern" further down.
modern_calories = np.array([310, 315, 303, 360, 339, 416, 278, 326, 316, 314, 333, 317, 357,
       304, 363, 387, 279, 350, 367, 321, 366, 311, 308, 303, 299, 363,
       335, 357, 392, 321, 361, 285, 321, 290, 392, 341, 331, 338, 326,
       314, 327, 320, 293, 333, 297, 315, 365, 408, 352, 359, 312, 300,
       263, 358, 345, 360, 336, 378, 315, 354, 318, 300, 372, 305, 336,
       286, 296, 413, 383, 328, 418, 388, 416, 371, 313, 321, 321, 317,
       402, 290, 328, 344, 330, 319, 309, 327, 351, 324, 278, 369, 416,
       359, 381, 324, 306, 350, 385, 335, 395, 308])

# Let seaborn fit and draw one normal curve per sample (histogram and KDE off).
ax = sns.distplot(
    pepe_calories,
    hist=None,
    kde=False,
    fit=stats.norm,
    fit_kws={"color": "blue"},
    label="Pepe's",
)
ax = sns.distplot(
    modern_calories,
    hist=None,
    kde=False,
    fit=stats.norm,
    fit_kws={"color": "orange"},
    label="Modern",
)

# Pull the plotted (x, y) samples back out of the first two lines on the axes.
pepe_line, modern_line = ax.lines[0], ax.lines[1]
pepe_xy = pepe_line.get_xydata()
modern_xy = modern_line.get_xydata()
pepe_x, pepe_y = pepe_xy[:, 0], pepe_xy[:, 1]
modern_x, modern_y = modern_xy[:, 0], modern_xy[:, 1]

# The two curves are sampled on different x values, so restrict to the
# interval both of them cover and resample each onto one common grid.
left = max(pepe_x.min(), modern_x.min())
right = min(pepe_x.max(), modern_x.max())
grid = np.linspace(left, right, 100)

# Lower envelope of the two resampled curves = top edge of the overlap.
lower_envelope = np.minimum(
    np.interp(grid, pepe_x, pepe_y),
    np.interp(grid, modern_x, modern_y),
)
ax.fill_between(grid, lower_envelope, color="red", alpha=0.3)

plt.legend()
plt.show()

0人赞添加讨论(0) 举报

ら.Afraid

3楼-- · 2020-07-18 02:54

I suppose not using seaborn in cases where you want to have full control over the resulting plot is often a useful strategy. Hence just calculate the fits, plot them and use fill between the curves up to the point where they cross each other.

import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt

# Placeholders: `...` stands in for the full calorie arrays listed in the
# question; substitute the real values before running this script.
pepe_calories = np.array(...)
modern_calories = np.array(...)

# Fixed evaluation grid shared by both fitted densities.
x = np.linspace(150, 470, 1000)

# Fit a normal to each sample and evaluate its pdf on the grid.
pepe_fit = stats.norm.fit(pepe_calories)
modern_fit = stats.norm.fit(modern_calories)
pepe_pdf = stats.norm.pdf(x, *pepe_fit)
modern_pdf = stats.norm.pdf(x, *modern_fit)

# First grid point at which the Pepe's density is no longer above the
# modern one; indexing with [0] raises IndexError if that never happens.
not_above = (pepe_pdf - modern_pdf) <= 0
cross = x[not_above][0]

fig, ax = plt.subplots()

# Shade the band between the two curves for every x up to the crossing point.
ax.fill_between(
    x,
    pepe_pdf,
    modern_pdf,
    where=(x <= cross),
    color="red",
    alpha=0.3,
)
ax.plot(x, pepe_pdf, label="Pepe's")
ax.plot(x, modern_pdf, label="Modern")

ax.legend()
plt.show()

0人赞添加讨论(0) 举报

Fill area of overlap between two normal distributions

采纳回答

编辑标签

举报内容

检举类型

检举原因

检举说明(必填)

打开微信“扫一扫”，打开网页后点击屏幕右上角分享按钮

付费偷看金额在0.1-10元之间