import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
Seaborn 특징
-
특1: 입력으로 데이터프레임을 선호한다. (matplotlib은 array를 선호)
- 그렇다고 해서 데이터프레임이 아닌 경우 그림이 아예 안 그려지지는 않는다.
- 데이터프레임 형태에는 long form과 wide form이 있다. (ref: https://seaborn.pydata.org/tutorial/data_structure.html#long-form-vs-wide-form-data)
- 참고로 long form이 더 우수한 저장형태!
- wide-df = [array1,array2,array3]
- long-df = [array_val, array_cat]
-
특2: matplotlib을 존경함. (ref:https://seaborn.pydata.org/)
sns boxplot
https://seaborn.pydata.org/generated/seaborn.boxplot.html
y1=[75,75,76,76,77,77,79,79,79,98] # A선생님에게 통계학을 배운 학생의 점수들
y2=[76,76,77,77,78,78,80,80,80,81] # B선생님에게 통계학을 배운 학생의 점수들
plt복습
plt.boxplot([y1,y2])
{'whiskers': [<matplotlib.lines.Line2D at 0x7f2a64e7a950>,
<matplotlib.lines.Line2D at 0x7f2a64e7ac90>,
<matplotlib.lines.Line2D at 0x7f2a64e910d0>,
<matplotlib.lines.Line2D at 0x7f2a64e913d0>],
'caps': [<matplotlib.lines.Line2D at 0x7f2a64e83050>,
<matplotlib.lines.Line2D at 0x7f2a64e83350>,
<matplotlib.lines.Line2D at 0x7f2a64e91710>,
<matplotlib.lines.Line2D at 0x7f2a64e91a50>],
'boxes': [<matplotlib.lines.Line2D at 0x7f2a64e7a650>,
<matplotlib.lines.Line2D at 0x7f2a64e83d50>],
'medians': [<matplotlib.lines.Line2D at 0x7f2a64e836d0>,
<matplotlib.lines.Line2D at 0x7f2a64e91d90>],
'fliers': [<matplotlib.lines.Line2D at 0x7f2a64e83a10>,
<matplotlib.lines.Line2D at 0x7f2a64e9e110>],
'means': []}
sns: wide df
df1=pd.DataFrame({1:y1,2:y2})
df1
| | 1 | 2 |
---|---|---|
0 | 75 | 76 |
1 | 75 | 76 |
2 | 76 | 77 |
3 | 76 | 77 |
4 | 77 | 78 |
5 | 77 | 78 |
6 | 79 | 80 |
7 | 79 | 80 |
8 | 79 | 80 |
9 | 98 | 81 |
-
예시1
sns.boxplot(data=df1)
<AxesSubplot:>
sns.boxplot(np.stack([y1,y2],axis=1)) # 잘 쓰진 않는데 된다
<AxesSubplot:>
sns: long df
value를 넣고, 그 value가 어떤 category에 속하는지를 함께 넣는 방법
df2 = pd.DataFrame({'score': y1+y2, 'class': ['A']*len(y1)+['B']*len(y2)})
df2
| | score | class |
---|---|---|
0 | 75 | A |
1 | 75 | A |
2 | 76 | A |
3 | 76 | A |
4 | 77 | A |
5 | 77 | A |
6 | 79 | A |
7 | 79 | A |
8 | 79 | A |
9 | 98 | A |
10 | 76 | B |
11 | 76 | B |
12 | 77 | B |
13 | 77 | B |
14 | 78 | B |
15 | 78 | B |
16 | 80 | B |
17 | 80 | B |
18 | 80 | B |
19 | 81 | B |
-
예시1
sns.boxplot(data=df2, x='class', y='score')
<AxesSubplot:xlabel='class', ylabel='score'>
sns: array
-
예시1
sns.boxplot(data=y1)
<AxesSubplot:>
-
예시2
sns.boxplot(y=y1)
<AxesSubplot:>
-
예시3
sns.boxplot(x=y1)
<AxesSubplot:>
sns histplot
-
데이터
x = np.random.randn(10000)
y = np.random.randn(10000) + 1
plt 복습
plt.hist(x,alpha=0.5)
plt.hist(y,alpha=0.5)
(array([2.000e+00, 1.500e+01, 1.550e+02, 7.670e+02, 2.062e+03, 3.085e+03,
2.479e+03, 1.117e+03, 2.790e+02, 3.900e+01]),
array([-3.5473064 , -2.74724651, -1.94718662, -1.14712673, -0.34706684,
0.45299304, 1.25305293, 2.05311282, 2.85317271, 3.6532326 ,
4.45329248]),
<BarContainer object of 10 artists>)
-
예시2
plt.hist([x,y]);
sns: wide df
df1=pd.DataFrame({'x':x, 'y':y})
df1
| | x | y |
---|---|---|
0 | 0.392340 | -0.520932 |
1 | -0.027382 | 2.332888 |
2 | -0.266977 | 0.973511 |
3 | -0.493336 | 2.801266 |
4 | 0.282255 | 0.433189 |
... | ... | ... |
9995 | -0.752878 | 2.394238 |
9996 | -0.212005 | 2.293700 |
9997 | -1.118235 | 2.660186 |
9998 | 1.558492 | 0.886679 |
9999 | -0.753399 | 1.977537 |
10000 rows × 2 columns
-
예시
sns.histplot(data=df1);
sns.histplot(data=df1,bins=20) # 칸 조정
<AxesSubplot:ylabel='Count'>
sns.histplot(data=df1,bins=20,kde=True); # kde : 곡선
sns.histplot(data=df1,bins=20,kde=True,element="step");
sns.histplot(data=df1,bins=20,kde=True,element="step",lw=5) # mpl에 대한 존경심 확인
<AxesSubplot:ylabel='Count'>
sns: long df
df2=pd.DataFrame({'val' : np.concatenate([x,y]), 'var': ['x']*len(x) + ['y']*len(y)})
df2
| | val | var |
---|---|---|
0 | 0.392340 | x |
1 | -0.027382 | x |
2 | -0.266977 | x |
3 | -0.493336 | x |
4 | 0.282255 | x |
... | ... | ... |
19995 | 2.394238 | y |
19996 | 2.293700 | y |
19997 | 2.660186 | y |
19998 | 0.886679 | y |
19999 | 1.977537 | y |
20000 rows × 2 columns
sns.histplot(data=df2, x='val', hue='var', bins=20, kde=True, lw=0)
# hue: 색깔을 var로 구분하겠다!
<AxesSubplot:xlabel='val', ylabel='Count'>
sns: array
sns.histplot(data=x)
<AxesSubplot:ylabel='Count'>
sns.histplot(x=x)
<AxesSubplot:ylabel='Count'>
sns.histplot(x=x, color='C0', bins=20, lw=0)
sns.histplot(x=y, color='C0', bins=20, lw=0)
<AxesSubplot:ylabel='Count'>
sns lineplot
np.random.seed(43052)
ϵ = np.random.randn(100)
y = np.cumsum(ϵ) # 누적값 더하기
plt복습
plt.plot(ϵ,'--o')
plt.plot(y,'--o')
sns: array
sns.lineplot(data=ϵ)
<AxesSubplot:>
sns.lineplot(data=y)
<AxesSubplot:>
sns: wide df
df4 = pd.DataFrame({'ϵ' : ϵ , 'y' : y})
df4
| | ϵ | y |
---|---|---|
0 | 0.383420 | 0.383420 |
1 | 1.084175 | 1.467595 |
2 | 1.142778 | 2.610373 |
3 | 0.307894 | 2.918267 |
4 | 0.237787 | 3.156054 |
... | ... | ... |
95 | 1.308688 | -10.598788 |
96 | 0.405376 | -10.193412 |
97 | -0.185070 | -10.378481 |
98 | 1.055388 | -9.323094 |
99 | 1.187014 | -8.136079 |
100 rows × 2 columns
sns.lineplot(data=df4)
<AxesSubplot:>
sns.lineplot(data=df4, dashes=False) # dashes : 둘다 실선
<AxesSubplot:>
sns: long df
df5 = pd.DataFrame({'idx':list(range(100))*2,'val':np.concatenate([ϵ,y]),'cat':['eps']*100 + ['y']*100 })
df5
| | idx | val | cat |
---|---|---|---|
0 | 0 | 0.383420 | eps |
1 | 1 | 1.084175 | eps |
2 | 2 | 1.142778 | eps |
3 | 3 | 0.307894 | eps |
4 | 4 | 0.237787 | eps |
... | ... | ... | ... |
195 | 95 | -10.598788 | y |
196 | 96 | -10.193412 | y |
197 | 97 | -10.378481 | y |
198 | 98 | -9.323094 | y |
199 | 99 | -8.136079 | y |
200 rows × 3 columns
sns.lineplot(data=df5, x='idx',y='val',hue='cat')
<AxesSubplot:xlabel='idx', ylabel='val'>
sns.lineplot(data=df5, x='idx',y='val',style='cat',hue='cat',markers=True)
<AxesSubplot:xlabel='idx', ylabel='val'>
sns.lineplot(data=df5, x='idx',y='val',style='cat',hue='cat',dashes=[(3,1),(3,3)],markers=['o','o'])
<AxesSubplot:xlabel='idx', ylabel='val'>
숙제
y1 = np.random.randn(90).cumsum()
y2 = np.random.randn(120).cumsum()
plt.plot(y1,'--o')
plt.plot(y2,'--o')
y = ([y1,y2])
sns.lineplot(data=y, markers=['o','o'])
<AxesSubplot:>