import seaborn as sns
import matplotlib.pyplot as plt 
import numpy as np
import pandas as pd

Seaborn 특징

- 특1: 입력으로 데이터프레임을 선호한다. (matplotlib은 array를 선호)

그렇다고 해서 데이터프레임이 아닌 경우 그림이 아예 안 그려지지는 않는다.
데이터프레임 형태는 long form 과 wide form 이 있다. (ref:https://seaborn.pydata.org/tutorial/data_structure.html#long-form-vs-wide-form-data)
참고로 long form이 더 우수한 저장형태!
wide form 예: 변수마다 열을 하나씩 둔다 → wide-df = [array1, array2, array3]
long form 예: 값 열 하나와 범주 열 하나로 쌓는다 → long-df = [array_val, array_cat]

- 특2: matplotlib을 존중함. matplotlib의 인자(예: lw)를 seaborn 함수에 그대로 넘길 수 있다. (ref:https://seaborn.pydata.org/)

sns boxplot

https://seaborn.pydata.org/generated/seaborn.boxplot.html

# Scores of the students who learned statistics from teacher A.
y1 = [75, 75, 76, 76, 77, 77, 79, 79, 79, 98]
# Scores of the students who learned statistics from teacher B.
y2 = [76, 76, 77, 77, 78, 78, 80, 80, 80, 81]

plt복습

# matplotlib takes the two score lists directly, bundled in one list.
plt.boxplot([y1,y2])

{'whiskers': [<matplotlib.lines.Line2D at 0x7f2a64e7a950>,
  <matplotlib.lines.Line2D at 0x7f2a64e7ac90>,
  <matplotlib.lines.Line2D at 0x7f2a64e910d0>,
  <matplotlib.lines.Line2D at 0x7f2a64e913d0>],
 'caps': [<matplotlib.lines.Line2D at 0x7f2a64e83050>,
  <matplotlib.lines.Line2D at 0x7f2a64e83350>,
  <matplotlib.lines.Line2D at 0x7f2a64e91710>,
  <matplotlib.lines.Line2D at 0x7f2a64e91a50>],
 'boxes': [<matplotlib.lines.Line2D at 0x7f2a64e7a650>,
  <matplotlib.lines.Line2D at 0x7f2a64e83d50>],
 'medians': [<matplotlib.lines.Line2D at 0x7f2a64e836d0>,
  <matplotlib.lines.Line2D at 0x7f2a64e91d90>],
 'fliers': [<matplotlib.lines.Line2D at 0x7f2a64e83a10>,
  <matplotlib.lines.Line2D at 0x7f2a64e9e110>],
 'means': []}

	1	2
0	75	76
1	75	76
2	76	77
3	76	77
4	77	78
5	77	78
6	79	80
7	79	80
8	79	80
9	98	81

sns: long df

모든 값(value)을 한 열에 넣고, 각 값이 어떤 범주(category)에 속하는지를 다른 열에 적는 방법

# Long form: every score goes into one column, and a second column records
# which class ('A' or 'B') the score belongs to.
df2 = pd.DataFrame(
    {
        'score': y1 + y2,
        'class': ['A'] * len(y1) + ['B'] * len(y2),
    }
)
df2

	score	class
0	75	A
1	75	A
2	76	A
3	76	A
4	77	A
5	77	A
6	79	A
7	79	A
8	79	A
9	98	A
10	76	B
11	76	B
12	77	B
13	77	B
14	78	B
15	78	B
16	80	B
17	80	B
18	80	B
19	81	B

- 예시1

# Long-form call: name the category column for x and the value column for y.
sns.boxplot(
    data=df2,
    x='class',
    y='score',
)

<AxesSubplot:xlabel='class', ylabel='score'>

sns histplot

- 데이터

# Two samples of 10000 standard-normal draws; the second is shifted up by 1.
x = np.random.randn(10_000)
y = 1 + np.random.randn(10_000)

plt 복습

# Overlay both histograms on the same axes, each drawn half-transparent.
plt.hist(x, alpha=0.5)
plt.hist(y, alpha=0.5)

(array([2.000e+00, 1.500e+01, 1.550e+02, 7.670e+02, 2.062e+03, 3.085e+03,
        2.479e+03, 1.117e+03, 2.790e+02, 3.900e+01]),
 array([-3.5473064 , -2.74724651, -1.94718662, -1.14712673, -0.34706684,
         0.45299304,  1.25305293,  2.05311282,  2.85317271,  3.6532326 ,
         4.45329248]),
 <BarContainer object of 10 artists>)

- 예시2

# Pass both samples in one list; the trailing ';' hides the returned value.
plt.hist([x, y]);

sns: wide df

# Wide form: each sample gets its own column.
df1 = pd.DataFrame(
    {
        'x': x,
        'y': y,
    }
)
df1

	x	y
0	0.392340	-0.520932
1	-0.027382	2.332888
2	-0.266977	0.973511
3	-0.493336	2.801266
4	0.282255	0.433189
...	...	...
9995	-0.752878	2.394238
9996	-0.212005	2.293700
9997	-1.118235	2.660186
9998	1.558492	0.886679
9999	-0.753399	1.977537

10000 rows × 2 columns

- 예시

# Wide-form call: the whole DataFrame is passed as `data`, with no x/y mapping.
sns.histplot(data=df1);

# Same plot with the number of bins set explicitly.
sns.histplot(
    data=df1,
    bins=20,
)

<AxesSubplot:ylabel='Count'>

# kde=True: draw the curve on top of the histogram.
sns.histplot(data=df1, bins=20, kde=True);

# Same plot with element="step".
sns.histplot(data=df1, bins=20, kde=True, element="step");

# lw is a matplotlib-style argument; this checks that seaborn accepts it.
sns.histplot(
    data=df1,
    bins=20,
    kde=True,
    element="step",
    lw=5,
)

<AxesSubplot:ylabel='Count'>

sns: long df

# Long form: stack both samples into a single 'val' column and record the
# sample each value came from in 'var'.
df2 = pd.DataFrame(
    {
        'val': np.concatenate([x, y]),
        'var': ['x'] * len(x) + ['y'] * len(y),
    }
)
df2

	val	var
0	0.392340	x
1	-0.027382	x
2	-0.266977	x
3	-0.493336	x
4	0.282255	x
...	...	...
19995	2.394238	y
19996	2.293700	y
19997	2.660186	y
19998	0.886679	y
19999	1.977537	y

20000 rows × 2 columns

# hue='var': colour the histogram by the 'var' column.
sns.histplot(
    data=df2,
    x='val',
    hue='var',
    bins=20,
    kde=True,
    lw=0,
)

<AxesSubplot:xlabel='val', ylabel='Count'>

sns: array

# A bare array (no DataFrame) passed through the `data` keyword.
sns.histplot(data=x)

<AxesSubplot:ylabel='Count'>

# The same array passed through the `x` keyword instead of `data`.
sns.histplot(x=x)

<AxesSubplot:ylabel='Count'>

# Overlay the two samples on the same axes. Each call gets its own colour from
# the matplotlib cycle ('C0' for x, 'C1' for y); with the same colour on both
# calls the two distributions could not be told apart.
sns.histplot(x=x, color='C0', bins=20, lw=0)
sns.histplot(x=y, color='C1', bins=20, lw=0)

<AxesSubplot:ylabel='Count'>

sns lineplot

# Fix the seed so the 100 noise draws are reproducible.
np.random.seed(43052)
ϵ = np.random.randn(100)

# y is the running (cumulative) sum of the noise.
y = ϵ.cumsum()

plt복습

# Noise series, format string '--o'.
plt.plot(ϵ, '--o')

# Its cumulative sum, drawn with the same format string.
plt.plot(y, '--o')

sns: wide df

# Wide form: the noise and its cumulative sum side by side, one column each.
df4 = pd.DataFrame(
    {
        'ϵ': ϵ,
        'y': y,
    }
)
df4

	ϵ	y
0	0.383420	0.383420
1	1.084175	1.467595
2	1.142778	2.610373
3	0.307894	2.918267
4	0.237787	3.156054
...	...	...
95	1.308688	-10.598788
96	0.405376	-10.193412
97	-0.185070	-10.378481
98	1.055388	-9.323094
99	1.187014	-8.136079

100 rows × 2 columns

# Wide-form call: the whole DataFrame is passed as `data`, with no x/y mapping.
sns.lineplot(data=df4)

<AxesSubplot:>

# dashes=False: both lines are drawn solid.
sns.lineplot(
    data=df4,
    dashes=False,
)

<AxesSubplot:>

sns: long df

# Long form: 'idx' repeats 0..99 once per series, 'val' stacks both series,
# and 'cat' labels which series each row belongs to.
df5 = pd.DataFrame(
    {
        'idx': list(range(100)) * 2,
        'val': np.concatenate([ϵ, y]),
        'cat': ['eps'] * 100 + ['y'] * 100,
    }
)
df5

	idx	val	cat
0	0	0.383420	eps
1	1	1.084175	eps
2	2	1.142778	eps
3	3	0.307894	eps
4	4	0.237787	eps
...	...	...	...
195	95	-10.598788	y
196	96	-10.193412	y
197	97	-10.378481	y
198	98	-9.323094	y
199	99	-8.136079	y

200 rows × 3 columns

# Long-form call: 'idx' on x, 'val' on y, coloured by the 'cat' column.
sns.lineplot(
    data=df5,
    x='idx',
    y='val',
    hue='cat',
)

<AxesSubplot:xlabel='idx', ylabel='val'>

# Map 'cat' to both style and hue, and turn markers on.
sns.lineplot(
    data=df5,
    x='idx',
    y='val',
    style='cat',
    hue='cat',
    markers=True,
)

<AxesSubplot:xlabel='idx', ylabel='val'>

# Explicit dash patterns and markers, one entry per 'cat' level.
sns.lineplot(
    data=df5,
    x='idx',
    y='val',
    style='cat',
    hue='cat',
    dashes=[(3, 1), (3, 3)],
    markers=['o', 'o'],
)

<AxesSubplot:xlabel='idx', ylabel='val'>

숙제

# Two cumulative-sum series of different lengths (90 and 120 draws).
y1 = np.random.randn(90).cumsum()
y2 = np.random.randn(120).cumsum()

# matplotlib version: both series drawn with the '--o' format string.
plt.plot(y1, '--o')
plt.plot(y2, '--o')

# seaborn version: the two arrays are bundled in a plain list and passed as `data`.
y = [y1, y2]

sns.lineplot(data=y, markers=['o', 'o'])

<AxesSubplot:>