import matplotlib.pyplot as plt
import numpy as np
boxplot
motivating example
(예제1) 전북고등학교: 평균은 좋은 측정값인가?
- 전북고등학교에서 통계학을 수업하는 A선생님과 B선생님이 있다. A선생님에게서 수업을 들은 학생들의 평균은 79.1이고 B선생님에게서 수업을 들은 학생들의 평균은 78.3이다.
# Exam scores for the two statistics classes of the motivating example.
y1=[75,75,76,76,77,77,79,79,79,98] # scores of the students taught statistics by teacher A
y2=[76,76,77,77,78,78,80,80,80,81] # scores of the students taught statistics by teacher B
# Compare the class means: class A is pulled up by the single score of 98.
np.mean(y1), np.mean(y2)
(79.1, 78.3)
-
의사결정: A선생님에게 배운 학생들의 실력이 평균적으로 더 좋을 것이다.
-
평균은 A반(=A선생님에게 통계학을 배운 반)이 더 높다. 그런데 98점을 받은 학생이 A반에 포함되어서 A반이 전체평균이 높게 나온것이고 나머지 학생들은 전체적으로 B반 학생들이 더 시험을 잘 보았다고 해석할 수 있다.
-
교훈: 단순한 평균 비교보다 학생들이 받은 점수의 분포를 비교해보는 것이 중요하다. 분포를 살펴보는 방법 중 유용한 방법이 박스플랏이다.
matplotlib으로 boxplot 그리기
-
A반 학생들의 박스플랏 그리기
# Draw a box plot of y1 (the class A scores).
plt.boxplot(y1)
{'whiskers': [<matplotlib.lines.Line2D at 0x7f635fdf5f50>,
<matplotlib.lines.Line2D at 0x7f6360351c50>],
'caps': [<matplotlib.lines.Line2D at 0x7f635e60c2d0>,
<matplotlib.lines.Line2D at 0x7f635e60c610>],
'boxes': [<matplotlib.lines.Line2D at 0x7f635fd87c10>],
'medians': [<matplotlib.lines.Line2D at 0x7f635e60c990>],
'fliers': [<matplotlib.lines.Line2D at 0x7f635e60cc90>],
'means': []}
-
B반 학생들의 박스플랏 그리기
# Draw a box plot of y2 (the class B scores).
plt.boxplot(y2)
{'whiskers': [<matplotlib.lines.Line2D at 0x7f635e5ba410>,
<matplotlib.lines.Line2D at 0x7f635e5ba750>],
'caps': [<matplotlib.lines.Line2D at 0x7f635e5baa90>,
<matplotlib.lines.Line2D at 0x7f635e5badd0>],
'boxes': [<matplotlib.lines.Line2D at 0x7f635e5ba110>],
'medians': [<matplotlib.lines.Line2D at 0x7f635e60f190>],
'fliers': [<matplotlib.lines.Line2D at 0x7f635e60f4d0>],
'means': []}
-
A반 학생들의 점수와 B반 학생들의 점수를 나란히 박스플랏으로 그리자.
# Passing a list of datasets draws one box per dataset, side by side.
plt.boxplot([y1,y2])
{'whiskers': [<matplotlib.lines.Line2D at 0x7f635ec47290>,
<matplotlib.lines.Line2D at 0x7f635ec475d0>,
<matplotlib.lines.Line2D at 0x7f635ec4fa10>,
<matplotlib.lines.Line2D at 0x7f635ec4fd10>],
'caps': [<matplotlib.lines.Line2D at 0x7f635ec47910>,
<matplotlib.lines.Line2D at 0x7f635ec47c50>,
<matplotlib.lines.Line2D at 0x7f635ec67090>,
<matplotlib.lines.Line2D at 0x7f635ec673d0>],
'boxes': [<matplotlib.lines.Line2D at 0x7f635ec48f50>,
<matplotlib.lines.Line2D at 0x7f635ec4f6d0>],
'medians': [<matplotlib.lines.Line2D at 0x7f635ec4f050>,
<matplotlib.lines.Line2D at 0x7f635ec67710>],
'fliers': [<matplotlib.lines.Line2D at 0x7f635ec4f350>,
<matplotlib.lines.Line2D at 0x7f635ec67a50>],
'means': []}
boxplot이란?
-
ref: https://github.com/mGalarnyk/Python_Tutorials/blob/master/Statistics/boxplot/box_plot.ipynb
# Annotated "anatomy of a box plot" figure: a horizontal box plot of 1000
# standard-normal draws, labelled with median, quartiles, IQR, whisker limits
# and outliers.
np.random.seed(916170)

# connection path is here: https://stackoverflow.com/questions/6146290/plotting-a-line-over-several-graphs
mu, sigma = 0, 1 # mean and standard deviation
s = np.random.normal(mu, sigma, 1000)

fig, axes = plt.subplots(nrows = 1, ncols = 1, figsize=(10, 5))

# rectangular box plot
bplot = axes.boxplot(s,
                vert=False,
                patch_artist=True,
                showfliers=True, # This would show outliers (the remaining .7% of the data)
                positions = [0],
                boxprops = dict(linestyle='--', linewidth=2, color='Black', facecolor = 'red', alpha = .4),
                medianprops = dict(linestyle='-', linewidth=2, color='Yellow'),
                whiskerprops = dict(linestyle='-', linewidth=2, color='Blue', alpha = .4),
                capprops = dict(linestyle='-', linewidth=2, color='Black'),
                flierprops = dict(marker='o', markerfacecolor='green', markersize=10,
                                  linestyle='none', alpha = .4),
                widths = .3,
                zorder = 1)

axes.set_xlim(-4, 4)
plt.xticks(fontsize = 14)

# The y axis carries no information for a single horizontal box.
axes.set_yticks([])

# Double-ended bar spanning Q1..Q3 to mark the interquartile range.
axes.annotate(r'',
            xy=(-.73, .205), xycoords='data',
            xytext=(.66, .205), textcoords='data',
            arrowprops=dict(arrowstyle="|-|",
                            connectionstyle="arc3")
            );

axes.text(0, .25, "Interquartile Range \n(IQR)", horizontalalignment='center', fontsize=18)
axes.text(0, -.21, r"Median", horizontalalignment='center', fontsize=16);
axes.text(2.65, -.15, "\"Maximum\"", horizontalalignment='center', fontsize=18);
axes.text(-2.65, -.15, "\"Minimum\"", horizontalalignment='center', fontsize=18);
axes.text(-.68, -.24, r"Q1", horizontalalignment='center', fontsize=18);
axes.text(-2.65, -.21, r"(Q1 - 1.5*IQR)", horizontalalignment='center', fontsize=16);
axes.text(.6745, -.24, r"Q3", horizontalalignment='center', fontsize=18);
axes.text(.6745, -.30, r"(75th Percentile)", horizontalalignment='center', fontsize=12);
axes.text(-.68, -.30, r"(25th Percentile)", horizontalalignment='center', fontsize=12);
axes.text(2.65, -.21, r"(Q3 + 1.5*IQR)", horizontalalignment='center', fontsize=16);

# Arrows pointing at the flier markers beyond each whisker.
axes.annotate('Outliers', xy=(2.93,0.015), xytext=(2.52,0.20), fontsize = 18,
            arrowprops={'arrowstyle': '->', 'color': 'black', 'lw': 2},
            va='center');

axes.annotate('Outliers', xy=(-3.01,0.015), xytext=(-3.41,0.20), fontsize = 18,
            arrowprops={'arrowstyle': '->', 'color': 'black', 'lw': 2},
            va='center');
plotly로 boxplot 그리기
-
로컬에서 하기 위해서는 아래를 설치 (코랩에서는 설치할 필요 없음)
!conda env list
# conda environments:
#
base /home/koinup4/anaconda3
py37 * /home/koinup4/anaconda3/envs/py37
py39 /home/koinup4/anaconda3/envs/py39
!pip install plotly
!pip install ipywidgets
!pip install jupyter-dash
!pip install dash
!pip install pandas
Collecting plotly
Downloading plotly-5.13.0-py2.py3-none-any.whl (15.2 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 15.2/15.2 MB 93.1 MB/s eta 0:00:0000:0100:01
Collecting tenacity>=6.2.0
Downloading tenacity-8.2.1-py3-none-any.whl (24 kB)
Installing collected packages: tenacity, plotly
Successfully installed plotly-5.13.0 tenacity-8.2.1
Collecting ipywidgets
Downloading ipywidgets-8.0.4-py3-none-any.whl (137 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 137.8/137.8 kB 5.8 MB/s eta 0:00:00
Requirement already satisfied: ipykernel>=4.5.1 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipywidgets) (5.5.5)
Collecting jupyterlab-widgets~=3.0
Downloading jupyterlab_widgets-3.0.5-py3-none-any.whl (384 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 384.3/384.3 kB 35.3 MB/s eta 0:00:00
Requirement already satisfied: traitlets>=4.3.1 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipywidgets) (5.8.1)
Collecting widgetsnbextension~=4.0
Downloading widgetsnbextension-4.0.5-py3-none-any.whl (2.0 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.0/2.0 MB 75.0 MB/s eta 0:00:00
Requirement already satisfied: ipython>=6.1.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipywidgets) (7.33.0)
Requirement already satisfied: jupyter-client in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipykernel>=4.5.1->ipywidgets) (7.0.6)
Requirement already satisfied: tornado>=4.2 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipykernel>=4.5.1->ipywidgets) (6.1)
Requirement already satisfied: backcall in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipython>=6.1.0->ipywidgets) (0.2.0)
Requirement already satisfied: jedi>=0.16 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipython>=6.1.0->ipywidgets) (0.18.2)
Requirement already satisfied: matplotlib-inline in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipython>=6.1.0->ipywidgets) (0.1.6)
Requirement already satisfied: pexpect>4.3 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipython>=6.1.0->ipywidgets) (4.8.0)
Requirement already satisfied: pickleshare in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipython>=6.1.0->ipywidgets) (0.7.5)
Requirement already satisfied: prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipython>=6.1.0->ipywidgets) (3.0.36)
Requirement already satisfied: pygments in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipython>=6.1.0->ipywidgets) (2.14.0)
Requirement already satisfied: decorator in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipython>=6.1.0->ipywidgets) (5.1.1)
Requirement already satisfied: setuptools>=18.5 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipython>=6.1.0->ipywidgets) (65.6.3)
Requirement already satisfied: parso<0.9.0,>=0.8.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from jedi>=0.16->ipython>=6.1.0->ipywidgets) (0.8.3)
Requirement already satisfied: ptyprocess>=0.5 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from pexpect>4.3->ipython>=6.1.0->ipywidgets) (0.7.0)
Requirement already satisfied: wcwidth in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0->ipython>=6.1.0->ipywidgets) (0.2.6)
Requirement already satisfied: jupyter-core>=4.6.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from jupyter-client->ipykernel>=4.5.1->ipywidgets) (4.11.1)
Requirement already satisfied: pyzmq>=13 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from jupyter-client->ipykernel>=4.5.1->ipywidgets) (19.0.2)
Requirement already satisfied: python-dateutil>=2.1 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from jupyter-client->ipykernel>=4.5.1->ipywidgets) (2.8.2)
Requirement already satisfied: entrypoints in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from jupyter-client->ipykernel>=4.5.1->ipywidgets) (0.4)
Requirement already satisfied: nest-asyncio>=1.5 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from jupyter-client->ipykernel>=4.5.1->ipywidgets) (1.5.6)
Requirement already satisfied: six>=1.5 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from python-dateutil>=2.1->jupyter-client->ipykernel>=4.5.1->ipywidgets) (1.16.0)
Installing collected packages: widgetsnbextension, jupyterlab-widgets, ipywidgets
Successfully installed ipywidgets-8.0.4 jupyterlab-widgets-3.0.5 widgetsnbextension-4.0.5
Collecting jupyter-dash
Downloading jupyter_dash-0.4.2-py3-none-any.whl (23 kB)
Collecting retrying
Downloading retrying-1.3.4-py3-none-any.whl (11 kB)
Collecting dash
Downloading dash-2.8.1-py3-none-any.whl (9.9 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 9.9/9.9 MB 80.9 MB/s eta 0:00:00:00:010:01
Collecting ansi2html
Downloading ansi2html-1.8.0-py3-none-any.whl (16 kB)
Collecting flask
Downloading Flask-2.2.3-py3-none-any.whl (101 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 101.8/101.8 kB 19.6 MB/s eta 0:00:00
Requirement already satisfied: ipykernel in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from jupyter-dash) (5.5.5)
Requirement already satisfied: nest-asyncio in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from jupyter-dash) (1.5.6)
Requirement already satisfied: ipython in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from jupyter-dash) (7.33.0)
Requirement already satisfied: requests in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from jupyter-dash) (2.28.2)
Requirement already satisfied: importlib-metadata in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ansi2html->jupyter-dash) (4.11.4)
Requirement already satisfied: plotly>=5.0.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from dash->jupyter-dash) (5.13.0)
Collecting dash-table==5.0.0
Downloading dash_table-5.0.0-py3-none-any.whl (3.9 kB)
Collecting dash-html-components==2.0.0
Downloading dash_html_components-2.0.0-py3-none-any.whl (4.1 kB)
Collecting dash-core-components==2.0.0
Downloading dash_core_components-2.0.0-py3-none-any.whl (3.8 kB)
Requirement already satisfied: Jinja2>=3.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from flask->jupyter-dash) (3.1.2)
Collecting click>=8.0
Downloading click-8.1.3-py3-none-any.whl (96 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 96.6/96.6 kB 13.5 MB/s eta 0:00:00
Collecting Werkzeug>=2.2.2
Downloading Werkzeug-2.2.3-py3-none-any.whl (233 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 233.6/233.6 kB 44.7 MB/s eta 0:00:00
Collecting itsdangerous>=2.0
Downloading itsdangerous-2.1.2-py3-none-any.whl (15 kB)
Requirement already satisfied: tornado>=4.2 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipykernel->jupyter-dash) (6.1)
Requirement already satisfied: jupyter-client in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipykernel->jupyter-dash) (7.0.6)
Requirement already satisfied: traitlets>=4.1.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipykernel->jupyter-dash) (5.8.1)
Requirement already satisfied: prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipython->jupyter-dash) (3.0.36)
Requirement already satisfied: matplotlib-inline in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipython->jupyter-dash) (0.1.6)
Requirement already satisfied: pexpect>4.3 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipython->jupyter-dash) (4.8.0)
Requirement already satisfied: pygments in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipython->jupyter-dash) (2.14.0)
Requirement already satisfied: jedi>=0.16 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipython->jupyter-dash) (0.18.2)
Requirement already satisfied: decorator in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipython->jupyter-dash) (5.1.1)
Requirement already satisfied: pickleshare in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipython->jupyter-dash) (0.7.5)
Requirement already satisfied: setuptools>=18.5 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipython->jupyter-dash) (65.6.3)
Requirement already satisfied: backcall in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipython->jupyter-dash) (0.2.0)
Requirement already satisfied: charset-normalizer<4,>=2 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from requests->jupyter-dash) (2.1.1)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from requests->jupyter-dash) (1.26.14)
Requirement already satisfied: certifi>=2017.4.17 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from requests->jupyter-dash) (2022.12.7)
Requirement already satisfied: idna<4,>=2.5 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from requests->jupyter-dash) (3.4)
Requirement already satisfied: six>=1.7.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from retrying->jupyter-dash) (1.16.0)
Requirement already satisfied: zipp>=0.5 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from importlib-metadata->ansi2html->jupyter-dash) (3.11.0)
Requirement already satisfied: typing-extensions>=3.6.4 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from importlib-metadata->ansi2html->jupyter-dash) (4.4.0)
Requirement already satisfied: parso<0.9.0,>=0.8.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from jedi>=0.16->ipython->jupyter-dash) (0.8.3)
Requirement already satisfied: MarkupSafe>=2.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from Jinja2>=3.0->flask->jupyter-dash) (2.1.1)
Requirement already satisfied: ptyprocess>=0.5 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from pexpect>4.3->ipython->jupyter-dash) (0.7.0)
Requirement already satisfied: tenacity>=6.2.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from plotly>=5.0.0->dash->jupyter-dash) (8.2.1)
Requirement already satisfied: wcwidth in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0->ipython->jupyter-dash) (0.2.6)
Requirement already satisfied: entrypoints in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from jupyter-client->ipykernel->jupyter-dash) (0.4)
Requirement already satisfied: pyzmq>=13 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from jupyter-client->ipykernel->jupyter-dash) (19.0.2)
Requirement already satisfied: python-dateutil>=2.1 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from jupyter-client->ipykernel->jupyter-dash) (2.8.2)
Requirement already satisfied: jupyter-core>=4.6.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from jupyter-client->ipykernel->jupyter-dash) (4.11.1)
Installing collected packages: dash-table, dash-html-components, dash-core-components, Werkzeug, retrying, itsdangerous, click, ansi2html, flask, dash, jupyter-dash
Successfully installed Werkzeug-2.2.3 ansi2html-1.8.0 click-8.1.3 dash-2.8.1 dash-core-components-2.0.0 dash-html-components-2.0.0 dash-table-5.0.0 flask-2.2.3 itsdangerous-2.1.2 jupyter-dash-0.4.2 retrying-1.3.4
Requirement already satisfied: dash in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (2.8.1)
Requirement already satisfied: dash-table==5.0.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from dash) (5.0.0)
Requirement already satisfied: dash-html-components==2.0.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from dash) (2.0.0)
Requirement already satisfied: Flask>=1.0.4 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from dash) (2.2.3)
Requirement already satisfied: plotly>=5.0.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from dash) (5.13.0)
Requirement already satisfied: dash-core-components==2.0.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from dash) (2.0.0)
Requirement already satisfied: Werkzeug>=2.2.2 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from Flask>=1.0.4->dash) (2.2.3)
Requirement already satisfied: itsdangerous>=2.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from Flask>=1.0.4->dash) (2.1.2)
Requirement already satisfied: Jinja2>=3.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from Flask>=1.0.4->dash) (3.1.2)
Requirement already satisfied: importlib-metadata>=3.6.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from Flask>=1.0.4->dash) (4.11.4)
Requirement already satisfied: click>=8.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from Flask>=1.0.4->dash) (8.1.3)
Requirement already satisfied: tenacity>=6.2.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from plotly>=5.0.0->dash) (8.2.1)
Requirement already satisfied: zipp>=0.5 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from importlib-metadata>=3.6.0->Flask>=1.0.4->dash) (3.11.0)
Requirement already satisfied: typing-extensions>=3.6.4 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from importlib-metadata>=3.6.0->Flask>=1.0.4->dash) (4.4.0)
Requirement already satisfied: MarkupSafe>=2.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from Jinja2>=3.0->Flask>=1.0.4->dash) (2.1.1)
Requirement already satisfied: pandas in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (1.3.5)
Requirement already satisfied: pytz>=2017.3 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from pandas) (2022.7.1)
Requirement already satisfied: python-dateutil>=2.7.3 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from pandas) (2.8.2)
Requirement already satisfied: numpy>=1.17.3 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from pandas) (1.21.6)
Requirement already satisfied: six>=1.5 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from python-dateutil>=2.7.3->pandas) (1.16.0)
import plotly.express as px
import pandas as pd
from IPython.display import HTML
['A']*len(y1) # repeat the label 'A' once per element of y1
['A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A']
# Class labels aligned with the concatenated scores y1+y2.
['A']*len(y1) + ['B']*len(y2)
['A',
'A',
'A',
'A',
'A',
'A',
'A',
'A',
'A',
'A',
'B',
'B',
'B',
'B',
'B',
'B',
'B',
'B',
'B',
'B']
# Long-format table: one row per student with the score and the class label.
df=pd.DataFrame({'score':y1+y2, 'class':['A']*len(y1) + ['B']*len(y2)})
df
score | class | |
---|---|---|
0 | 75 | A |
1 | 75 | A |
2 | 76 | A |
3 | 76 | A |
4 | 77 | A |
5 | 77 | A |
6 | 79 | A |
7 | 79 | A |
8 | 79 | A |
9 | 98 | A |
10 | 76 | B |
11 | 76 | B |
12 | 77 | B |
13 | 77 | B |
14 | 78 | B |
15 | 78 | B |
16 | 80 | B |
17 | 80 | B |
18 | 80 | B |
19 | 81 | B |
# Plotly box plot of score grouped by class.
fig=px.box(df,x='class',y='score')
fig
왜 안나오누..
# Embed the figure as HTML, loading plotly.js from the CDN and omitting MathJax.
HTML(fig.to_html(include_plotlyjs='cdn', include_mathjax=False))
histogram
motivating example
-
전북고예제에서의 소망: 그냥 A반 B반 중에 어떤 반이 공부를 더 잘하냐? - 보통 이러한 질문은 중심경향값 중 하나를 골라서 비교하면 되었다. - 중심경향값이란 데이터 분포의 중심을 보여준 값으로 자료 전체를 대표할 수 있는 값을 말함. 평균, 중앙값 등이 대표적인 중심경향값이다.
-
전북고 예제에서는 “A반 B반 중에서 어떤 반이 공부를 더 잘하냐?” 라는 질문의 대답으로 단순평균비교로는 의미가 없었다. 오히려 결과론적으로 보면 중앙값이 더 타당해 보인다.
-
그런데 사실 생각해보면 중앙값을 기준으로 B반이 공부를 더 잘했다고 주장하는 것도 애매하다. 무튼 가장 공부잘한 학생은 A반에 있으니까!? (한명뿐이니까 빼고 가도 되지않나여? 이지만 2명 3명 점점 늘어난다 생각해보면 합리적인 기준을 제시할 수 있을까?)
-
사실 “A반 B반 중에 누가 더 공부를 잘하냐?” 라는 질문은 굉장히 대답하기 곤란한 질문이다. 왜냐하면 - 이슈1: 단순 평균비교로 이러한 질문에 답을 하기 어렵다. - 이슈2: 박스플랏으로 전체분포를 파악해도 어떠한 반이 더 공부를 잘한다는 기준을 잡는 것이 애매하다.
그런데 특수한 경우에는 “A반 B반 중에 누가 더 공부를 잘하냐?” 라는 질문에 대한 대답을 깔끔하게 할 수 있다.
(예제2) 정규분포 전북고등학교: 평균은 좋은 측정값인가?
-
A반과 B반의 통계학 성적이 아래와 같다고 하자.
# Simulated scores for example 2; the fixed seed makes the draws reproducible.
np.random.seed(43052)
y1 = np.random.randn(10000) # mean 0, variance 1
y2 = np.random.randn(10000) + 0.5 # mean 0.5, variance 1
np.mean(y1), np.mean(y2)
(-0.011790879905079434, 0.4979147460611458)
# Difference of the sample means (class B minus class A).
np.mean(y2) - np.mean(y1)
0.5097056259662253
y2의 값이 y1의 값보다 전체적으로 0.51 정도 높다고 볼수 있다.?
# Side-by-side box plots of y1 and y2 to compare the two distributions.
plt.boxplot([y1,y2])
{'whiskers': [<matplotlib.lines.Line2D at 0x7f638623b0d0>,
<matplotlib.lines.Line2D at 0x7f63861c40d0>,
<matplotlib.lines.Line2D at 0x7f63861cf510>,
<matplotlib.lines.Line2D at 0x7f63861cf810>],
'caps': [<matplotlib.lines.Line2D at 0x7f63861c4410>,
<matplotlib.lines.Line2D at 0x7f63861c4750>,
<matplotlib.lines.Line2D at 0x7f63861cfb50>,
<matplotlib.lines.Line2D at 0x7f63861cfe90>],
'boxes': [<matplotlib.lines.Line2D at 0x7f638623b9d0>,
<matplotlib.lines.Line2D at 0x7f63861cf1d0>],
'medians': [<matplotlib.lines.Line2D at 0x7f63861c4ad0>,
<matplotlib.lines.Line2D at 0x7f63861dc210>],
'fliers': [<matplotlib.lines.Line2D at 0x7f63861c4e10>,
<matplotlib.lines.Line2D at 0x7f63861dc550>],
'means': []}
- 분포의 모양이 거의 비슷, 왼쪽그림을 컨트롤+C 하여 오른쪽에 붙인다음 0.5정도 y축으로 올린느낌이다!
-
이러한 상황에서는 “B반의 성적 \(\approx\) A반의 성적 + 0.5” 라고 주장해도 큰 무리가 없어보인다. 따라서 이 경우에는 “A반 B반 중에 어떤 반이 더 공부를 잘하냐?” 라는 질문에 대하여 “B반이 평균적으로 0.51정도 공부를 잘한다”고 말할 수 있다.
-
결론: 정규분포 가정을 한다면 이슈 1, 2에 대한 문제를 한번에 해결 가능함
-
정규분포 가정은 어떻게 할 수 있나? (=데이터를 보고 어떻게 정규분포라고 알 수 있는가?) : 데이터의 히스토그램을 그려서 종 모양이 되는지 확인해본다. (아직 초보단계라서 이것밖에 모를 수 있다.)
histogram이란?
-
히스토그램: X축이 변수의 구간, Y축은 그 구간에 포함된 빈도를 의미하는 그림
matplotlib으로 histogram 그리기
-
히스토그램의 예시1
# Small sample used in the histogram examples (min 10, max 25).
y=[10,11,12,15,16,20,21,22,23,24,25]
# Histogram of y using matplotlib's default binning.
plt.hist(y)
(array([2., 1., 0., 1., 1., 0., 1., 1., 2., 2.]),
array([10. , 11.5, 13. , 14.5, 16. , 17.5, 19. , 20.5, 22. , 23.5, 25. ]),
<BarContainer object of 10 artists>)
plt.hist(y,bins=10) # bins=10: split the data range into 10 equal-width intervals
(array([2., 1., 0., 1., 1., 0., 1., 1., 2., 2.]),
array([10. , 11.5, 13. , 14.5, 16. , 17.5, 19. , 20.5, 22. , 23.5, 25. ]),
<BarContainer object of 10 artists>)
-
히스토그램 예시2
plt.hist(y,bins=2) # 2 bins
# 5 values fall in the range 10~17.5 and 6 values fall in 17.5~25
(array([5., 6.]),
array([10. , 17.5, 25. ]),
<BarContainer object of 2 artists>)
-
히스토그램 예시3
# 3 equal-width bins over the data range.
plt.hist(y,bins=3)
(array([3., 2., 6.]),
array([10., 15., 20., 25.]),
<BarContainer object of 3 artists>)
- 가장 큰 값은 25, 가장 작은 값은 10이므로 range는 15이다.
- range / bins = 15 / 3 = 5 이므로 각 구간의 간격은 5이다.
- 구간은 [10,15), [15,20), [20,25] 로 나눈다.
- 각 구간에 포함된 자료의 수는 3, 2, 6이다.
-
히스토그램 예시4
# 7 equal-width bins over the data range.
plt.hist(y,bins=7)
(array([3., 0., 2., 0., 1., 2., 3.]),
array([10. , 12.14285714, 14.28571429, 16.42857143, 18.57142857,
20.71428571, 22.85714286, 25. ]),
<BarContainer object of 7 artists>)
- 가장 큰 값은 25, 가장 작은 값은 10이므로 range는 15이다.
- range / bins = 15 / 7 = 2.142857142857143 이므로 각 구간의 간격은 2.142857142857143이다.
- 구간은 [10,12.14285714), [12.14285714,14.28571429), ..., [22.85714286,25] 로 나눈다.
- 각 구간에 포함된 자료의 수는 3,0,2,0,1,2,3 이다.
# Bin width for 7 bins: range (25 - 10 = 15) divided by the number of bins.
_a = 15/7
_a
2.142857142857143
-
히스토그램 예시5
# y1 and y2 are the arrays generated earlier with:
# np.random.seed(43052)
# y1 = np.random.randn(10000)
# y2 = np.random.randn(10000) + 0.5
# Histogram of both samples on shared bins; the trailing ';' hides the cell output.
plt.hist([y1,y2],bins=50);
seaborn으로 histogram 그리기
!pip install seaborn
import seaborn as sns
Collecting seaborn
Downloading seaborn-0.12.2-py3-none-any.whl (293 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 293.3/293.3 kB 13.8 MB/s eta 0:00:00
Requirement already satisfied: matplotlib!=3.6.1,>=3.1 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from seaborn) (3.5.3)
Requirement already satisfied: typing_extensions in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from seaborn) (4.4.0)
Requirement already satisfied: pandas>=0.25 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from seaborn) (1.3.5)
Requirement already satisfied: numpy!=1.24.0,>=1.17 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from seaborn) (1.21.6)
Requirement already satisfied: pillow>=6.2.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (9.4.0)
Requirement already satisfied: cycler>=0.10 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (0.11.0)
Requirement already satisfied: kiwisolver>=1.0.1 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (1.4.4)
Requirement already satisfied: fonttools>=4.22.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (4.38.0)
Requirement already satisfied: python-dateutil>=2.7 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (2.8.2)
Requirement already satisfied: pyparsing>=2.2.1 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (3.0.9)
Requirement already satisfied: packaging>=20.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (23.0)
Requirement already satisfied: pytz>=2017.3 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from pandas>=0.25->seaborn) (2022.7.1)
Requirement already satisfied: six>=1.5 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from python-dateutil>=2.7->matplotlib!=3.6.1,>=3.1->seaborn) (1.16.0)
Installing collected packages: seaborn
Successfully installed seaborn-0.12.2
# Display both sample arrays as the cell output.
y1, y2
(array([ 0.38342049, 1.0841745 , 1.14277825, ..., 1.03232398,
-0.18988252, -0.03578389]),
array([ 1.96391024, 0.31095591, -0.65422978, ..., -0.50052895,
1.26755071, 1.00486301]))
# Long-format table of both samples with a class label per row.
df=pd.DataFrame({'score': np.concatenate([y1,y2]), 'class':['A']*len(y1) + ['B']*len(y2)})
# list(y1)+list(y2) is another way to write the 'score' column above.
df
score | class | |
---|---|---|
0 | 0.383420 | A |
1 | 1.084175 | A |
2 | 1.142778 | A |
3 | 0.307894 | A |
4 | 0.237787 | A |
... | ... | ... |
19995 | 0.493276 | B |
19996 | 0.619512 | B |
19997 | -0.500529 | B |
19998 | 1.267551 | B |
19999 | 1.004863 | B |
20000 rows × 2 columns
# Seaborn histogram of score, coloured by class.
sns.histplot(df, x='score', hue='class')
<AxesSubplot:xlabel='score', ylabel='Count'>
plotnine으로 histogram 그리기
!pip install plotnine
Collecting plotnine
Downloading plotnine-0.8.0-py3-none-any.whl (4.7 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 4.7/4.7 MB 19.9 MB/s eta 0:00:0000:0100:01
Requirement already satisfied: matplotlib>=3.1.1 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from plotnine) (3.5.3)
Requirement already satisfied: numpy>=1.19.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from plotnine) (1.21.6)
Collecting scipy>=1.5.0
Downloading scipy-1.7.3-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (38.1 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 38.1/38.1 MB 60.6 MB/s eta 0:00:0000:0100:01
Collecting mizani>=0.7.3
Downloading mizani-0.7.3-py3-none-any.whl (63 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 63.1/63.1 kB 11.2 MB/s eta 0:00:00
Collecting descartes>=1.1.0
Downloading descartes-1.1.0-py3-none-any.whl (5.8 kB)
Collecting patsy>=0.5.1
Downloading patsy-0.5.3-py2.py3-none-any.whl (233 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 233.8/233.8 kB 38.6 MB/s eta 0:00:00
Collecting statsmodels>=0.12.1
Downloading statsmodels-0.13.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.9 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 9.9/9.9 MB 96.2 MB/s eta 0:00:00ta 0:00:01
Requirement already satisfied: pandas>=1.1.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from plotnine) (1.3.5)
Requirement already satisfied: cycler>=0.10 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from matplotlib>=3.1.1->plotnine) (0.11.0)
Requirement already satisfied: fonttools>=4.22.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from matplotlib>=3.1.1->plotnine) (4.38.0)
Requirement already satisfied: pyparsing>=2.2.1 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from matplotlib>=3.1.1->plotnine) (3.0.9)
Requirement already satisfied: kiwisolver>=1.0.1 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from matplotlib>=3.1.1->plotnine) (1.4.4)
Requirement already satisfied: python-dateutil>=2.7 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from matplotlib>=3.1.1->plotnine) (2.8.2)
Requirement already satisfied: pillow>=6.2.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from matplotlib>=3.1.1->plotnine) (9.4.0)
Requirement already satisfied: packaging>=20.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from matplotlib>=3.1.1->plotnine) (23.0)
Collecting palettable
Downloading palettable-3.3.0-py2.py3-none-any.whl (111 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 111.8/111.8 kB 14.4 MB/s eta 0:00:00
Requirement already satisfied: pytz>=2017.3 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from pandas>=1.1.0->plotnine) (2022.7.1)
Requirement already satisfied: six in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from patsy>=0.5.1->plotnine) (1.16.0)
Requirement already satisfied: typing-extensions in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from kiwisolver>=1.0.1->matplotlib>=3.1.1->plotnine) (4.4.0)
Installing collected packages: palettable, scipy, patsy, statsmodels, mizani, descartes, plotnine
Successfully installed descartes-1.1.0 mizani-0.7.3 palettable-3.3.0 patsy-0.5.3 plotnine-0.8.0 scipy-1.7.3 statsmodels-0.13.5
from plotnine import *
# position='identity': the two classes are drawn overlapping each other.
ggplot(df) + geom_histogram(aes(x='score', fill='class'), position='identity', alpha=0.5)
/home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages/plotnine/stats/stat_bin.py:95: PlotnineWarning: 'stat_bin()' using 'bins = 84'. Pick better value with 'binwidth'.
# Without position='identity' the blue bars are drawn first and the red bars are placed on top of them.
# Shown only for comparison; do not draw the histogram this way.
ggplot(df) + geom_histogram(aes(x='score', fill='class'), alpha=0.5)
/home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages/plotnine/stats/stat_bin.py:95: PlotnineWarning: 'stat_bin()' using 'bins = 84'. Pick better value with 'binwidth'.
plotly로 histogram 그리기
import plotly.figure_factory as ff
# One data series and one legend label per class.
hist_data = [y1, y2]
group_labels = ['A', 'B']

# Create distplot with curve_type set to 'normal'
ff.create_distplot(hist_data, group_labels, bin_size=.2, show_rug=False)