DV 1주차

boxplot
histogram
Author

김보람

Published

September 5, 2022

import matplotlib.pyplot as plt
import numpy as np

boxplot

motivating example

(예제1) 전북고등학교: 평균은 좋은 측정값인가?

  • 전북고등학교에서 통계학을 수업하는 A선생님과 B선생님이 있다. A선생님에게서 수업을 들은 학생들의 평균은 79.1이고 B선생님에게서 수업을 들은 학생들의 평균은 78.3이다.
# Exam scores of the two classes (10 students each).
y1=[75,75,76,76,77,77,79,79,79,98] # scores of the students who learned statistics from teacher A
y2=[76,76,77,77,78,78,80,80,80,81] # scores of the students who learned statistics from teacher B
# Compare the classes by their mean score only.
np.mean(y1), np.mean(y2)
(79.1, 78.3)

- 의사결정: A선생님에게 배운 학생들의 실력이 평균적으로 더 좋을 것이다.

- 평균은 A반(=A선생님에게 통계학을 배운 반)이 더 높다. 그런데 98점을 받은 학생이 A반에 포함되어서 A반의 전체 평균이 높게 나온 것이고, 이 학생을 제외한 나머지 학생들만 보면 전체적으로 B반 학생들이 시험을 더 잘 보았다고 해석할 수 있다.

- 교훈: 단순한 평균 비교보다 학생들이 받은 점수의 분포를 비교해보는 것이 중요하다. 분포를 살펴보는 방법 중 유용한 방법이 박스플랏이다.

matplotlib으로 boxplot 그리기

- A반 학생들의 박스플랏 그리기

plt.boxplot(y1)
{'whiskers': [<matplotlib.lines.Line2D at 0x7f635fdf5f50>,
  <matplotlib.lines.Line2D at 0x7f6360351c50>],
 'caps': [<matplotlib.lines.Line2D at 0x7f635e60c2d0>,
  <matplotlib.lines.Line2D at 0x7f635e60c610>],
 'boxes': [<matplotlib.lines.Line2D at 0x7f635fd87c10>],
 'medians': [<matplotlib.lines.Line2D at 0x7f635e60c990>],
 'fliers': [<matplotlib.lines.Line2D at 0x7f635e60cc90>],
 'means': []}

- B반 학생들의 박스플랏 그리기

plt.boxplot(y2)
{'whiskers': [<matplotlib.lines.Line2D at 0x7f635e5ba410>,
  <matplotlib.lines.Line2D at 0x7f635e5ba750>],
 'caps': [<matplotlib.lines.Line2D at 0x7f635e5baa90>,
  <matplotlib.lines.Line2D at 0x7f635e5badd0>],
 'boxes': [<matplotlib.lines.Line2D at 0x7f635e5ba110>],
 'medians': [<matplotlib.lines.Line2D at 0x7f635e60f190>],
 'fliers': [<matplotlib.lines.Line2D at 0x7f635e60f4d0>],
 'means': []}

- A반 학생들의 점수와 B반 학생들의 점수를 나란히 박스플랏으로 그리자.

plt.boxplot([y1,y2]) # passing a list of datasets draws one box per dataset, side by side in the same axes
{'whiskers': [<matplotlib.lines.Line2D at 0x7f635ec47290>,
  <matplotlib.lines.Line2D at 0x7f635ec475d0>,
  <matplotlib.lines.Line2D at 0x7f635ec4fa10>,
  <matplotlib.lines.Line2D at 0x7f635ec4fd10>],
 'caps': [<matplotlib.lines.Line2D at 0x7f635ec47910>,
  <matplotlib.lines.Line2D at 0x7f635ec47c50>,
  <matplotlib.lines.Line2D at 0x7f635ec67090>,
  <matplotlib.lines.Line2D at 0x7f635ec673d0>],
 'boxes': [<matplotlib.lines.Line2D at 0x7f635ec48f50>,
  <matplotlib.lines.Line2D at 0x7f635ec4f6d0>],
 'medians': [<matplotlib.lines.Line2D at 0x7f635ec4f050>,
  <matplotlib.lines.Line2D at 0x7f635ec67710>],
 'fliers': [<matplotlib.lines.Line2D at 0x7f635ec4f350>,
  <matplotlib.lines.Line2D at 0x7f635ec67a50>],
 'means': []}

boxplot이란?

- ref: https://github.com/mGalarnyk/Python_Tutorials/blob/master/Statistics/boxplot/box_plot.ipynb

# Annotated figure explaining the parts of a boxplot, drawn from a standard-normal sample.
np.random.seed(916170)  # fixed seed so the same sample (and figure) is produced on every run

# connection path is here: https://stackoverflow.com/questions/6146290/plotting-a-line-over-several-graphs
mu, sigma = 0, 1 # mean and standard deviation
s = np.random.normal(mu, sigma, 1000)  # 1000 draws from N(mu, sigma)

fig, axes = plt.subplots(nrows = 1, ncols = 1, figsize=(10, 5))

# rectangular box plot
# vert=False lays the box out horizontally; patch_artist=True draws the box as a
# filled patch, which is what lets boxprops carry a facecolor.
bplot = axes.boxplot(s,
                vert=False,
                patch_artist=True, 
                showfliers=True, # This would show outliers (the remaining .7% of the data)
                positions = [0],
                boxprops = dict(linestyle='--', linewidth=2, color='Black', facecolor = 'red', alpha = .4),
                medianprops = dict(linestyle='-', linewidth=2, color='Yellow'),
                whiskerprops = dict(linestyle='-', linewidth=2, color='Blue', alpha = .4),
                capprops = dict(linestyle='-', linewidth=2, color='Black'),
                flierprops = dict(marker='o', markerfacecolor='green', markersize=10,
                  linestyle='none', alpha = .4),
                widths = .3,
                zorder = 1)   

axes.set_xlim(-4, 4)
plt.xticks(fontsize = 14)

# Hide the y ticks: the single box sits at position 0 and the y-axis carries no information.
axes.set_yticks([])
# Text-less annotation whose "|-|" arrow draws a bracket above the box (the IQR span).
axes.annotate(r'',
            xy=(-.73, .205), xycoords='data',
            xytext=(.66, .205), textcoords='data',
            arrowprops=dict(arrowstyle="|-|",
                            connectionstyle="arc3")
            );

# Labels for each part of the boxplot. All positions are hard-coded in data
# coordinates for this particular sample (NOTE(review): they look tuned to the
# standard-normal quartiles near +/-0.67 -- re-tune if the data or seed changes).
axes.text(0, .25, "Interquartile Range \n(IQR)",  horizontalalignment='center', fontsize=18)
axes.text(0, -.21, r"Median", horizontalalignment='center', fontsize=16);
axes.text(2.65, -.15, "\"Maximum\"", horizontalalignment='center', fontsize=18);
axes.text(-2.65, -.15, "\"Minimum\"", horizontalalignment='center', fontsize=18);
axes.text(-.68, -.24, r"Q1", horizontalalignment='center', fontsize=18);
axes.text(-2.65, -.21, r"(Q1 - 1.5*IQR)", horizontalalignment='center', fontsize=16);
axes.text(.6745, -.24, r"Q3", horizontalalignment='center', fontsize=18);
axes.text(.6745, -.30, r"(75th Percentile)", horizontalalignment='center', fontsize=12);
axes.text(-.68, -.30, r"(25th Percentile)", horizontalalignment='center', fontsize=12);
axes.text(2.65, -.21, r"(Q3 + 1.5*IQR)", horizontalalignment='center', fontsize=16);

# Arrow pointing at the flier points on the right-hand side.
axes.annotate('Outliers', xy=(2.93,0.015), xytext=(2.52,0.20), fontsize = 18,
            arrowprops={'arrowstyle': '->', 'color': 'black', 'lw': 2},
            va='center');

# Arrow pointing at the flier points on the left-hand side.
axes.annotate('Outliers', xy=(-3.01,0.015), xytext=(-3.41,0.20), fontsize = 18,
            arrowprops={'arrowstyle': '->', 'color': 'black', 'lw': 2},
            va='center');

plotly로 boxplot 그리기

- 로컬에서 실행하려면 아래 패키지들을 설치해야 한다. (코랩에서는 설치하지 않아도 된다.)

!conda env list 
# conda environments:
#
base                     /home/koinup4/anaconda3
py37                  *  /home/koinup4/anaconda3/envs/py37
py39                     /home/koinup4/anaconda3/envs/py39
!pip install plotly 
!pip install ipywidgets
!pip install jupyter-dash
!pip install dash 
!pip install pandas 
Collecting plotly
  Downloading plotly-5.13.0-py2.py3-none-any.whl (15.2 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 15.2/15.2 MB 93.1 MB/s eta 0:00:0000:0100:01
Collecting tenacity>=6.2.0
  Downloading tenacity-8.2.1-py3-none-any.whl (24 kB)
Installing collected packages: tenacity, plotly
Successfully installed plotly-5.13.0 tenacity-8.2.1
Collecting ipywidgets
  Downloading ipywidgets-8.0.4-py3-none-any.whl (137 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 137.8/137.8 kB 5.8 MB/s eta 0:00:00
Requirement already satisfied: ipykernel>=4.5.1 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipywidgets) (5.5.5)
Collecting jupyterlab-widgets~=3.0
  Downloading jupyterlab_widgets-3.0.5-py3-none-any.whl (384 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 384.3/384.3 kB 35.3 MB/s eta 0:00:00
Requirement already satisfied: traitlets>=4.3.1 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipywidgets) (5.8.1)
Collecting widgetsnbextension~=4.0
  Downloading widgetsnbextension-4.0.5-py3-none-any.whl (2.0 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.0/2.0 MB 75.0 MB/s eta 0:00:00
Requirement already satisfied: ipython>=6.1.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipywidgets) (7.33.0)
Requirement already satisfied: jupyter-client in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipykernel>=4.5.1->ipywidgets) (7.0.6)
Requirement already satisfied: tornado>=4.2 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipykernel>=4.5.1->ipywidgets) (6.1)
Requirement already satisfied: backcall in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipython>=6.1.0->ipywidgets) (0.2.0)
Requirement already satisfied: jedi>=0.16 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipython>=6.1.0->ipywidgets) (0.18.2)
Requirement already satisfied: matplotlib-inline in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipython>=6.1.0->ipywidgets) (0.1.6)
Requirement already satisfied: pexpect>4.3 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipython>=6.1.0->ipywidgets) (4.8.0)
Requirement already satisfied: pickleshare in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipython>=6.1.0->ipywidgets) (0.7.5)
Requirement already satisfied: prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipython>=6.1.0->ipywidgets) (3.0.36)
Requirement already satisfied: pygments in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipython>=6.1.0->ipywidgets) (2.14.0)
Requirement already satisfied: decorator in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipython>=6.1.0->ipywidgets) (5.1.1)
Requirement already satisfied: setuptools>=18.5 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipython>=6.1.0->ipywidgets) (65.6.3)
Requirement already satisfied: parso<0.9.0,>=0.8.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from jedi>=0.16->ipython>=6.1.0->ipywidgets) (0.8.3)
Requirement already satisfied: ptyprocess>=0.5 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from pexpect>4.3->ipython>=6.1.0->ipywidgets) (0.7.0)
Requirement already satisfied: wcwidth in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0->ipython>=6.1.0->ipywidgets) (0.2.6)
Requirement already satisfied: jupyter-core>=4.6.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from jupyter-client->ipykernel>=4.5.1->ipywidgets) (4.11.1)
Requirement already satisfied: pyzmq>=13 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from jupyter-client->ipykernel>=4.5.1->ipywidgets) (19.0.2)
Requirement already satisfied: python-dateutil>=2.1 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from jupyter-client->ipykernel>=4.5.1->ipywidgets) (2.8.2)
Requirement already satisfied: entrypoints in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from jupyter-client->ipykernel>=4.5.1->ipywidgets) (0.4)
Requirement already satisfied: nest-asyncio>=1.5 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from jupyter-client->ipykernel>=4.5.1->ipywidgets) (1.5.6)
Requirement already satisfied: six>=1.5 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from python-dateutil>=2.1->jupyter-client->ipykernel>=4.5.1->ipywidgets) (1.16.0)
Installing collected packages: widgetsnbextension, jupyterlab-widgets, ipywidgets
Successfully installed ipywidgets-8.0.4 jupyterlab-widgets-3.0.5 widgetsnbextension-4.0.5
Collecting jupyter-dash
  Downloading jupyter_dash-0.4.2-py3-none-any.whl (23 kB)
Collecting retrying
  Downloading retrying-1.3.4-py3-none-any.whl (11 kB)
Collecting dash
  Downloading dash-2.8.1-py3-none-any.whl (9.9 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 9.9/9.9 MB 80.9 MB/s eta 0:00:00:00:010:01
Collecting ansi2html
  Downloading ansi2html-1.8.0-py3-none-any.whl (16 kB)
Collecting flask
  Downloading Flask-2.2.3-py3-none-any.whl (101 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 101.8/101.8 kB 19.6 MB/s eta 0:00:00
Requirement already satisfied: ipykernel in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from jupyter-dash) (5.5.5)
Requirement already satisfied: nest-asyncio in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from jupyter-dash) (1.5.6)
Requirement already satisfied: ipython in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from jupyter-dash) (7.33.0)
Requirement already satisfied: requests in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from jupyter-dash) (2.28.2)
Requirement already satisfied: importlib-metadata in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ansi2html->jupyter-dash) (4.11.4)
Requirement already satisfied: plotly>=5.0.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from dash->jupyter-dash) (5.13.0)
Collecting dash-table==5.0.0
  Downloading dash_table-5.0.0-py3-none-any.whl (3.9 kB)
Collecting dash-html-components==2.0.0
  Downloading dash_html_components-2.0.0-py3-none-any.whl (4.1 kB)
Collecting dash-core-components==2.0.0
  Downloading dash_core_components-2.0.0-py3-none-any.whl (3.8 kB)
Requirement already satisfied: Jinja2>=3.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from flask->jupyter-dash) (3.1.2)
Collecting click>=8.0
  Downloading click-8.1.3-py3-none-any.whl (96 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 96.6/96.6 kB 13.5 MB/s eta 0:00:00
Collecting Werkzeug>=2.2.2
  Downloading Werkzeug-2.2.3-py3-none-any.whl (233 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 233.6/233.6 kB 44.7 MB/s eta 0:00:00
Collecting itsdangerous>=2.0
  Downloading itsdangerous-2.1.2-py3-none-any.whl (15 kB)
Requirement already satisfied: tornado>=4.2 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipykernel->jupyter-dash) (6.1)
Requirement already satisfied: jupyter-client in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipykernel->jupyter-dash) (7.0.6)
Requirement already satisfied: traitlets>=4.1.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipykernel->jupyter-dash) (5.8.1)
Requirement already satisfied: prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipython->jupyter-dash) (3.0.36)
Requirement already satisfied: matplotlib-inline in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipython->jupyter-dash) (0.1.6)
Requirement already satisfied: pexpect>4.3 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipython->jupyter-dash) (4.8.0)
Requirement already satisfied: pygments in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipython->jupyter-dash) (2.14.0)
Requirement already satisfied: jedi>=0.16 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipython->jupyter-dash) (0.18.2)
Requirement already satisfied: decorator in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipython->jupyter-dash) (5.1.1)
Requirement already satisfied: pickleshare in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipython->jupyter-dash) (0.7.5)
Requirement already satisfied: setuptools>=18.5 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipython->jupyter-dash) (65.6.3)
Requirement already satisfied: backcall in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from ipython->jupyter-dash) (0.2.0)
Requirement already satisfied: charset-normalizer<4,>=2 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from requests->jupyter-dash) (2.1.1)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from requests->jupyter-dash) (1.26.14)
Requirement already satisfied: certifi>=2017.4.17 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from requests->jupyter-dash) (2022.12.7)
Requirement already satisfied: idna<4,>=2.5 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from requests->jupyter-dash) (3.4)
Requirement already satisfied: six>=1.7.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from retrying->jupyter-dash) (1.16.0)
Requirement already satisfied: zipp>=0.5 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from importlib-metadata->ansi2html->jupyter-dash) (3.11.0)
Requirement already satisfied: typing-extensions>=3.6.4 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from importlib-metadata->ansi2html->jupyter-dash) (4.4.0)
Requirement already satisfied: parso<0.9.0,>=0.8.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from jedi>=0.16->ipython->jupyter-dash) (0.8.3)
Requirement already satisfied: MarkupSafe>=2.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from Jinja2>=3.0->flask->jupyter-dash) (2.1.1)
Requirement already satisfied: ptyprocess>=0.5 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from pexpect>4.3->ipython->jupyter-dash) (0.7.0)
Requirement already satisfied: tenacity>=6.2.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from plotly>=5.0.0->dash->jupyter-dash) (8.2.1)
Requirement already satisfied: wcwidth in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0->ipython->jupyter-dash) (0.2.6)
Requirement already satisfied: entrypoints in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from jupyter-client->ipykernel->jupyter-dash) (0.4)
Requirement already satisfied: pyzmq>=13 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from jupyter-client->ipykernel->jupyter-dash) (19.0.2)
Requirement already satisfied: python-dateutil>=2.1 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from jupyter-client->ipykernel->jupyter-dash) (2.8.2)
Requirement already satisfied: jupyter-core>=4.6.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from jupyter-client->ipykernel->jupyter-dash) (4.11.1)
Installing collected packages: dash-table, dash-html-components, dash-core-components, Werkzeug, retrying, itsdangerous, click, ansi2html, flask, dash, jupyter-dash
Successfully installed Werkzeug-2.2.3 ansi2html-1.8.0 click-8.1.3 dash-2.8.1 dash-core-components-2.0.0 dash-html-components-2.0.0 dash-table-5.0.0 flask-2.2.3 itsdangerous-2.1.2 jupyter-dash-0.4.2 retrying-1.3.4
Requirement already satisfied: dash in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (2.8.1)
Requirement already satisfied: dash-table==5.0.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from dash) (5.0.0)
Requirement already satisfied: dash-html-components==2.0.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from dash) (2.0.0)
Requirement already satisfied: Flask>=1.0.4 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from dash) (2.2.3)
Requirement already satisfied: plotly>=5.0.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from dash) (5.13.0)
Requirement already satisfied: dash-core-components==2.0.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from dash) (2.0.0)
Requirement already satisfied: Werkzeug>=2.2.2 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from Flask>=1.0.4->dash) (2.2.3)
Requirement already satisfied: itsdangerous>=2.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from Flask>=1.0.4->dash) (2.1.2)
Requirement already satisfied: Jinja2>=3.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from Flask>=1.0.4->dash) (3.1.2)
Requirement already satisfied: importlib-metadata>=3.6.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from Flask>=1.0.4->dash) (4.11.4)
Requirement already satisfied: click>=8.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from Flask>=1.0.4->dash) (8.1.3)
Requirement already satisfied: tenacity>=6.2.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from plotly>=5.0.0->dash) (8.2.1)
Requirement already satisfied: zipp>=0.5 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from importlib-metadata>=3.6.0->Flask>=1.0.4->dash) (3.11.0)
Requirement already satisfied: typing-extensions>=3.6.4 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from importlib-metadata>=3.6.0->Flask>=1.0.4->dash) (4.4.0)
Requirement already satisfied: MarkupSafe>=2.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from Jinja2>=3.0->Flask>=1.0.4->dash) (2.1.1)
Requirement already satisfied: pandas in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (1.3.5)
Requirement already satisfied: pytz>=2017.3 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from pandas) (2022.7.1)
Requirement already satisfied: python-dateutil>=2.7.3 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from pandas) (2.8.2)
Requirement already satisfied: numpy>=1.17.3 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from pandas) (1.21.6)
Requirement already satisfied: six>=1.5 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from python-dateutil>=2.7.3->pandas) (1.16.0)
import plotly.express as px
import pandas as pd
from IPython.display import HTML
['A']*len(y1)  # list repetition: 'A' repeated once per element of y1
['A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A']
['A']*len(y1) + ['B']*len(y2)
['A',
 'A',
 'A',
 'A',
 'A',
 'A',
 'A',
 'A',
 'A',
 'A',
 'B',
 'B',
 'B',
 'B',
 'B',
 'B',
 'B',
 'B',
 'B',
 'B']
# Long-format table: one row per student, holding the score and the class label ('A' or 'B').
# y1 and y2 are plain Python lists here, so y1+y2 concatenates them.
df=pd.DataFrame({'score':y1+y2, 'class':['A']*len(y1) + ['B']*len(y2)})
df
score class
0 75 A
1 75 A
2 76 A
3 76 A
4 77 A
5 77 A
6 79 A
7 79 A
8 79 A
9 98 A
10 76 B
11 76 B
12 77 B
13 77 B
14 78 B
15 78 B
16 80 B
17 80 B
18 80 B
19 81 B
# One box per class: class label on the x-axis, score on the y-axis.
fig=px.box(df,x='class',y='score')
fig

- 위와 같이 `fig`만 실행했을 때 그림이 출력되지 않는 경우에는, 아래처럼 HTML로 변환하여 출력한다.

# Convert the figure to an HTML string (plotly.js referenced from the CDN, no MathJax) and display it.
HTML(fig.to_html(include_plotlyjs='cdn', include_mathjax=False))

histogram

motivating example

- 전북고예제에서의 소망: 그냥 A반 B반 중에 어떤 반이 공부를 더 잘하냐? - 보통 이러한 질문은 중심경향값 중 하나를 골라서 비교하면 되었다. - 중심경향값이란 데이터 분포의 중심을 보여준 값으로 자료 전체를 대표할 수 있는 값을 말함. 평균, 중앙값 등이 대표적인 중심경향값이다.

- 전북고 예제에서는 “A반 B반 중에서 어떤 반이 공부를 더 잘하냐?” 라는 질문의 대답으로 단순평균비교로는 의미가 없었다. 오히려 결과론적으로 보면 중앙값이 더 타당해 보인다.

- 그런데 사실 생각해보면 중앙값을 기준으로 B반이 공부를 더 잘했다고 주장하는 것도 애매하다. 어쨌든 가장 공부를 잘한 학생은 A반에 있으니까! ("한 명뿐이니까 빼고 생각해도 되지 않나?"라고 할 수도 있지만, 그런 학생이 2명, 3명으로 점점 늘어난다고 생각해보면 합리적인 기준을 제시할 수 있을까?)

- 사실 “A반 B반 중에 누가 더 공부를 잘하냐?” 라는 질문은 굉장히 대답하기 곤란한 질문이다. 왜냐하면 - 이슈1: 단순 평균비교로 이러한 질문에 답을 하기 어렵다. - 이슈2: 박스플랏으로 전체분포를 파악해도 어떠한 반이 더 공부를 잘한다는 기준을 잡는 것이 애매하다.

그런데 특수한 경우에는 “A반 B반 중에 누가 더 공부를 잘하냐?” 라는 질문에 대한 대답을 깔끔하게 할 수 있다.

(예제2) 정규분포 전북고등학교: 평균은 좋은 측정값인가?

- A반과 B반의 통계학 성적이 아래와 같다고 하자.

np.random.seed(43052)  # fixed seed so the same samples are drawn on every run
y1 = np.random.randn(10000)         # mean 0, variance 1
y2 = np.random.randn(10000) + 0.5   # mean 0.5, variance 1
np.mean(y1), np.mean(y2)
(-0.011790879905079434, 0.4979147460611458)
np.mean(y2) - np.mean(y1)
0.5097056259662253

y2의 값이 y1의 값보다 전체적으로 0.51 정도 높다고 볼 수 있다.

plt.boxplot([y1,y2])
{'whiskers': [<matplotlib.lines.Line2D at 0x7f638623b0d0>,
  <matplotlib.lines.Line2D at 0x7f63861c40d0>,
  <matplotlib.lines.Line2D at 0x7f63861cf510>,
  <matplotlib.lines.Line2D at 0x7f63861cf810>],
 'caps': [<matplotlib.lines.Line2D at 0x7f63861c4410>,
  <matplotlib.lines.Line2D at 0x7f63861c4750>,
  <matplotlib.lines.Line2D at 0x7f63861cfb50>,
  <matplotlib.lines.Line2D at 0x7f63861cfe90>],
 'boxes': [<matplotlib.lines.Line2D at 0x7f638623b9d0>,
  <matplotlib.lines.Line2D at 0x7f63861cf1d0>],
 'medians': [<matplotlib.lines.Line2D at 0x7f63861c4ad0>,
  <matplotlib.lines.Line2D at 0x7f63861dc210>],
 'fliers': [<matplotlib.lines.Line2D at 0x7f63861c4e10>,
  <matplotlib.lines.Line2D at 0x7f63861dc550>],
 'means': []}

  • 분포의 모양이 거의 비슷하다. 왼쪽 그림을 복사(Ctrl+C)하여 오른쪽에 붙인 다음 y축 방향으로 0.5 정도 올린 느낌이다!

- 이러한 상황에서는 “B반의 성적 \(\approx\) A반의 성적 + 0.5” 라고 주장해도 큰 무리가 없어보인다. 따라서 이 경우에는 “A반 B반 중에 어떤 반이 더 공부를 잘하냐?” 라는 질문에 대하여 “B반이 평균적으로 0.51정도 공부를 잘한다”고 말할 수 있다.

- 결론: 정규분포 가정을 한다면 이슈 1, 2에 대한 문제를 한번에 해결 가능함

- 정규분포 가정은 어떻게 할 수 있나? (=데이터를 보고 어떻게 정규분포라고 알 수 있는가?) : 데이터의 히스토그램을 그려서 종 모양이 되는지 확인해본다. (아직 초보단계라서 이것밖에 모를 수 있다.)

histogram이란?

- 히스토그램: X축이 변수의 구간, Y축은 그 구간에 포함된 빈도를 의미하는 그림

matplotlib으로 histogram 그리기

- 히스토그램의 예시1

# 11 observations ranging from 10 to 25.
y=[10,11,12,15,16,20,21,22,23,24,25]
# No bins argument: the output below shows 10 equal-width bins of width 1.5.
plt.hist(y)
(array([2., 1., 0., 1., 1., 0., 1., 1., 2., 2.]),
 array([10. , 11.5, 13. , 14.5, 16. , 17.5, 19. , 20.5, 22. , 23.5, 25. ]),
 <BarContainer object of 10 artists>)

plt.hist(y,bins=10)  # bins=10: split the data range into 10 equal-width intervals (same output as the call without bins)
(array([2., 1., 0., 1., 1., 0., 1., 1., 2., 2.]),
 array([10. , 11.5, 13. , 14.5, 16. , 17.5, 19. , 20.5, 22. , 23.5, 25. ]),
 <BarContainer object of 10 artists>)

- 히스토그램 예시2

plt.hist(y,bins=2) # bins=2: split the data range into 2 equal-width intervals
# 5 values fall in the interval from 10 to 17.5, and 6 values fall in the interval from 17.5 to 25
(array([5., 6.]),
 array([10. , 17.5, 25. ]),
 <BarContainer object of 2 artists>)

- 히스토그램 예시3

plt.hist(y,bins=3)
(array([3., 2., 6.]),
 array([10., 15., 20., 25.]),
 <BarContainer object of 3 artists>)

  • 가장 큰 값은 25, 가장 작은 값은 10이므로 range는 15이다.
  • range / bins = 15 / 3 = 5 이므로 각 구간의 간격은 5이다.
  • 구간은 [10,15), [15,20), [20,25] 로 나눈다.
  • 각 구간에 포함된 자료의 수는 3, 2, 6이다.

- 히스토그램 예시4

plt.hist(y,bins=7)
(array([3., 0., 2., 0., 1., 2., 3.]),
 array([10.        , 12.14285714, 14.28571429, 16.42857143, 18.57142857,
        20.71428571, 22.85714286, 25.        ]),
 <BarContainer object of 7 artists>)

  • 가장 큰 값은 25, 가장 작은 값은 10이므로 range는 15이다.
  • range / bins = 15 / 7 = 2.142857142857143 이므로 각 구간의 간격은 2.142857142857143이다.
  • 구간은 [10,12.14285714), [12.14285714,14.28571429), ..., [22.85714286,25] 로 나눈다.
  • 각 구간에 포함된 자료의 수는 3,0,2,0,1,2,3 이다.
# Bin width for bins=7: the data range (25 - 10 = 15) divided by the number of bins.
_a = 15/7
_a
2.142857142857143

- 히스토그램 예시5

# Reuses y1 and y2 from example 2; for reference, they were generated as follows:
# np.random.seed(43052)
# y1 = np.random.randn(10000)
# y2 = np.random.randn(10000) + 0.5 
# Passing a list of two datasets draws both histograms in the same axes, using 50 bins.
plt.hist([y1,y2],bins=50);

seaborn으로 histogram 그리기

!pip install seaborn
import seaborn as sns
Collecting seaborn
  Downloading seaborn-0.12.2-py3-none-any.whl (293 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 293.3/293.3 kB 13.8 MB/s eta 0:00:00
Requirement already satisfied: matplotlib!=3.6.1,>=3.1 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from seaborn) (3.5.3)
Requirement already satisfied: typing_extensions in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from seaborn) (4.4.0)
Requirement already satisfied: pandas>=0.25 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from seaborn) (1.3.5)
Requirement already satisfied: numpy!=1.24.0,>=1.17 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from seaborn) (1.21.6)
Requirement already satisfied: pillow>=6.2.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (9.4.0)
Requirement already satisfied: cycler>=0.10 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (0.11.0)
Requirement already satisfied: kiwisolver>=1.0.1 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (1.4.4)
Requirement already satisfied: fonttools>=4.22.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (4.38.0)
Requirement already satisfied: python-dateutil>=2.7 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (2.8.2)
Requirement already satisfied: pyparsing>=2.2.1 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (3.0.9)
Requirement already satisfied: packaging>=20.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (23.0)
Requirement already satisfied: pytz>=2017.3 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from pandas>=0.25->seaborn) (2022.7.1)
Requirement already satisfied: six>=1.5 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from python-dateutil>=2.7->matplotlib!=3.6.1,>=3.1->seaborn) (1.16.0)
Installing collected packages: seaborn
Successfully installed seaborn-0.12.2
y1, y2
(array([ 0.38342049,  1.0841745 ,  1.14277825, ...,  1.03232398,
        -0.18988252, -0.03578389]),
 array([ 1.96391024,  0.31095591, -0.65422978, ..., -0.50052895,
         1.26755071,  1.00486301]))
# y1 and y2 are NumPy arrays at this point, so they are joined with np.concatenate
# (for arrays, + would add element-wise instead of concatenating).
df=pd.DataFrame({'score': np.concatenate([y1,y2]), 'class':['A']*len(y1) + ['B']*len(y2)})
df
# the 'score' column above could also be written as list(y1)+list(y2)
score class
0 0.383420 A
1 1.084175 A
2 1.142778 A
3 0.307894 A
4 0.237787 A
... ... ...
19995 0.493276 B
19996 0.619512 B
19997 -0.500529 B
19998 1.267551 B
19999 1.004863 B

20000 rows × 2 columns

sns.histplot(df, x='score', hue='class')
<AxesSubplot:xlabel='score', ylabel='Count'>

plotnine으로 histogram 그리기

!pip install plotnine
Collecting plotnine
  Downloading plotnine-0.8.0-py3-none-any.whl (4.7 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 4.7/4.7 MB 19.9 MB/s eta 0:00:0000:0100:01
Requirement already satisfied: matplotlib>=3.1.1 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from plotnine) (3.5.3)
Requirement already satisfied: numpy>=1.19.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from plotnine) (1.21.6)
Collecting scipy>=1.5.0
  Downloading scipy-1.7.3-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (38.1 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 38.1/38.1 MB 60.6 MB/s eta 0:00:0000:0100:01
Collecting mizani>=0.7.3
  Downloading mizani-0.7.3-py3-none-any.whl (63 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 63.1/63.1 kB 11.2 MB/s eta 0:00:00
Collecting descartes>=1.1.0
  Downloading descartes-1.1.0-py3-none-any.whl (5.8 kB)
Collecting patsy>=0.5.1
  Downloading patsy-0.5.3-py2.py3-none-any.whl (233 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 233.8/233.8 kB 38.6 MB/s eta 0:00:00
Collecting statsmodels>=0.12.1
  Downloading statsmodels-0.13.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.9 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 9.9/9.9 MB 96.2 MB/s eta 0:00:00ta 0:00:01
Requirement already satisfied: pandas>=1.1.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from plotnine) (1.3.5)
Requirement already satisfied: cycler>=0.10 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from matplotlib>=3.1.1->plotnine) (0.11.0)
Requirement already satisfied: fonttools>=4.22.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from matplotlib>=3.1.1->plotnine) (4.38.0)
Requirement already satisfied: pyparsing>=2.2.1 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from matplotlib>=3.1.1->plotnine) (3.0.9)
Requirement already satisfied: kiwisolver>=1.0.1 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from matplotlib>=3.1.1->plotnine) (1.4.4)
Requirement already satisfied: python-dateutil>=2.7 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from matplotlib>=3.1.1->plotnine) (2.8.2)
Requirement already satisfied: pillow>=6.2.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from matplotlib>=3.1.1->plotnine) (9.4.0)
Requirement already satisfied: packaging>=20.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from matplotlib>=3.1.1->plotnine) (23.0)
Collecting palettable
  Downloading palettable-3.3.0-py2.py3-none-any.whl (111 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 111.8/111.8 kB 14.4 MB/s eta 0:00:00
Requirement already satisfied: pytz>=2017.3 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from pandas>=1.1.0->plotnine) (2022.7.1)
Requirement already satisfied: six in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from patsy>=0.5.1->plotnine) (1.16.0)
Requirement already satisfied: typing-extensions in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from kiwisolver>=1.0.1->matplotlib>=3.1.1->plotnine) (4.4.0)
Installing collected packages: palettable, scipy, patsy, statsmodels, mizani, descartes, plotnine
Successfully installed descartes-1.1.0 mizani-0.7.3 palettable-3.3.0 patsy-0.5.3 plotnine-0.8.0 scipy-1.7.3 statsmodels-0.13.5
from plotnine import *
ggplot(df) + geom_histogram(aes(x='score', fill='class'), position='identity', alpha=0.5)
# position='identity': the two histograms are drawn overlapping each other
/home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages/plotnine/stats/stat_bin.py:95: PlotnineWarning: 'stat_bin()' using 'bins = 84'. Pick better value with 'binwidth'.

ggplot(df) + geom_histogram(aes(x='score', fill='class'), alpha=0.5)
# draws the blue bars and then the red bars on top of them
# (NOTE(review): presumably plotnine's default stacked position -- confirm in the plotnine docs)
# shown only for comparison with the plot above; do not draw the histogram this way
/home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages/plotnine/stats/stat_bin.py:95: PlotnineWarning: 'stat_bin()' using 'bins = 84'. Pick better value with 'binwidth'.

plotly로 histogram 그리기

import plotly.figure_factory as ff

hist_data = [y1, y2]

group_labels = ['A', 'B']

# Create a distplot for the two groups with a bin width of 0.2 and the rug plot hidden.
# NOTE: curve_type is not passed, so plotly's default ('kde') is used;
# pass curve_type='normal' to overlay a fitted normal curve instead.
ff.create_distplot(hist_data, group_labels,bin_size=.2, show_rug=False)