import numpy as np
import pandas as pd
!pip install pandas_datareader
Collecting pandas_datareader
Downloading pandas_datareader-0.10.0-py3-none-any.whl (109 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 109.5/109.5 kB 5.9 MB/s eta 0:00:00
Requirement already satisfied: requests>=2.19.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from pandas_datareader) (2.28.2)
Collecting lxml
Downloading lxml-4.9.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (6.6 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.6/6.6 MB 79.4 MB/s eta 0:00:00:00:010:01
Requirement already satisfied: pandas>=0.23 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from pandas_datareader) (1.3.5)
Requirement already satisfied: numpy>=1.17.3 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from pandas>=0.23->pandas_datareader) (1.21.6)
Requirement already satisfied: python-dateutil>=2.7.3 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from pandas>=0.23->pandas_datareader) (2.8.2)
Requirement already satisfied: pytz>=2017.3 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from pandas>=0.23->pandas_datareader) (2022.7.1)
Requirement already satisfied: idna<4,>=2.5 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from requests>=2.19.0->pandas_datareader) (3.4)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from requests>=2.19.0->pandas_datareader) (1.26.14)
Requirement already satisfied: certifi>=2017.4.17 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from requests>=2.19.0->pandas_datareader) (2022.12.7)
Requirement already satisfied: charset-normalizer<4,>=2 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from requests>=2.19.0->pandas_datareader) (2.1.1)
Requirement already satisfied: six>=1.5 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from python-dateutil>=2.7.3->pandas>=0.23->pandas_datareader) (1.16.0)
Installing collected packages: lxml, pandas_datareader
Successfully installed lxml-4.9.2 pandas_datareader-0.10.0
from pandas_datareader import data as pdr
- 인터넷상에 있는 주가정보를 크롤링해서 저장하는 함수
!pip install yfinance
Collecting yfinance
Downloading yfinance-0.2.12-py2.py3-none-any.whl (59 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 59.2/59.2 kB 4.5 MB/s eta 0:00:00
Collecting html5lib>=1.1
Downloading html5lib-1.1-py2.py3-none-any.whl (112 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 112.2/112.2 kB 12.9 MB/s eta 0:00:00
Requirement already satisfied: requests>=2.26 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from yfinance) (2.28.2)
Requirement already satisfied: cryptography>=3.3.2 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from yfinance) (3.4.8)
Collecting frozendict>=2.3.4
Downloading frozendict-2.3.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (99 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 99.8/99.8 kB 19.9 MB/s eta 0:00:00
Collecting multitasking>=0.0.7
Downloading multitasking-0.0.11-py3-none-any.whl (8.5 kB)
Requirement already satisfied: numpy>=1.16.5 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from yfinance) (1.21.6)
Requirement already satisfied: lxml>=4.9.1 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from yfinance) (4.9.2)
Requirement already satisfied: beautifulsoup4>=4.11.1 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from yfinance) (4.11.1)
Requirement already satisfied: pandas>=1.3.0 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from yfinance) (1.3.5)
Requirement already satisfied: pytz>=2022.5 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from yfinance) (2022.7.1)
Collecting appdirs>=1.4.4
Downloading appdirs-1.4.4-py2.py3-none-any.whl (9.6 kB)
Requirement already satisfied: soupsieve>1.2 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from beautifulsoup4>=4.11.1->yfinance) (2.3.2.post1)
Requirement already satisfied: cffi>=1.12 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from cryptography>=3.3.2->yfinance) (1.15.0)
Requirement already satisfied: webencodings in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from html5lib>=1.1->yfinance) (0.5.1)
Requirement already satisfied: six>=1.9 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from html5lib>=1.1->yfinance) (1.16.0)
Requirement already satisfied: python-dateutil>=2.7.3 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from pandas>=1.3.0->yfinance) (2.8.2)
Requirement already satisfied: certifi>=2017.4.17 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from requests>=2.26->yfinance) (2022.12.7)
Requirement already satisfied: charset-normalizer<4,>=2 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from requests>=2.26->yfinance) (2.1.1)
Requirement already satisfied: idna<4,>=2.5 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from requests>=2.26->yfinance) (3.4)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from requests>=2.26->yfinance) (1.26.14)
Requirement already satisfied: pycparser in /home/koinup4/anaconda3/envs/py37/lib/python3.7/site-packages (from cffi>=1.12->cryptography>=3.3.2->yfinance) (2.21)
Installing collected packages: multitasking, appdirs, html5lib, frozendict, yfinance
Successfully installed appdirs-1.4.4 frozendict-2.3.5 html5lib-1.1 multitasking-0.0.11 yfinance-0.2.12
import yfinance as yf
line
data1: 야후 파이낸스
-
yahoo finance: https://finance.yahoo.com/
# Patch pandas_datareader so that get_data_yahoo is served by yfinance.
yf.pdr_override()

# Tickers and date range to download.
symbols = ['AMZN','AAPL','GOOG','MSFT','NFLX','NVDA','TSLA']
start = '2020-01-01'
end = '2022-10-30'

# Keep only the adjusted close column group: one column per ticker.
df = pdr.get_data_yahoo(symbols,start,end)['Adj Close']
[*********************100%***********************] 7 of 7 completed
# Display the 'Adj Close' price table built above.
df
AAPL | AMZN | GOOG | MSFT | NFLX | NVDA | TSLA | |
---|---|---|---|---|---|---|---|
Date | |||||||
2020-01-02 | 73.449379 | 94.900497 | 68.368500 | 155.761826 | 329.809998 | 59.770554 | 28.684000 |
2020-01-03 | 72.735313 | 93.748497 | 68.032997 | 153.822311 | 325.899994 | 58.813866 | 29.534000 |
2020-01-06 | 73.314880 | 95.143997 | 69.710503 | 154.219925 | 335.829987 | 59.060509 | 30.102667 |
2020-01-07 | 72.970078 | 95.343002 | 69.667000 | 152.813766 | 330.750000 | 59.775528 | 31.270666 |
2020-01-08 | 74.143898 | 94.598503 | 70.216003 | 155.247818 | 339.260010 | 59.887650 | 32.809334 |
... | ... | ... | ... | ... | ... | ... | ... |
2022-10-24 | 148.975021 | 119.820000 | 102.970001 | 245.939163 | 282.450012 | 125.957771 | 211.250000 |
2022-10-25 | 151.855850 | 120.599998 | 104.930000 | 249.331085 | 291.019989 | 132.576080 | 222.419998 |
2022-10-26 | 148.875351 | 115.660004 | 94.820000 | 230.093628 | 298.619995 | 128.927017 | 224.639999 |
2022-10-27 | 144.339813 | 110.959999 | 92.599998 | 225.547836 | 296.940002 | 131.726288 | 225.089996 |
2022-10-28 | 155.245056 | 103.410004 | 96.580002 | 234.619492 | 295.720001 | 138.304611 | 228.520004 |
713 rows × 7 columns
# List the column labels (one per ticker symbol).
df.columns
Index(['AAPL', 'AMZN', 'GOOG', 'MSFT', 'NFLX', 'NVDA', 'TSLA'], dtype='object')
matplotlib: 1개의 y를 그리기
-
예시1: 1개의 y를 그리기
# Move the Date index into an ordinary column; the result is only displayed, df is unchanged.
df.reset_index()
Date | AAPL | AMZN | GOOG | MSFT | NFLX | NVDA | TSLA | |
---|---|---|---|---|---|---|---|---|
0 | 2020-01-02 | 73.449379 | 94.900497 | 68.368500 | 155.761826 | 329.809998 | 59.770554 | 28.684000 |
1 | 2020-01-03 | 72.735313 | 93.748497 | 68.032997 | 153.822311 | 325.899994 | 58.813866 | 29.534000 |
2 | 2020-01-06 | 73.314880 | 95.143997 | 69.710503 | 154.219925 | 335.829987 | 59.060509 | 30.102667 |
3 | 2020-01-07 | 72.970078 | 95.343002 | 69.667000 | 152.813766 | 330.750000 | 59.775528 | 31.270666 |
4 | 2020-01-08 | 74.143898 | 94.598503 | 70.216003 | 155.247818 | 339.260010 | 59.887650 | 32.809334 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
708 | 2022-10-24 | 148.975021 | 119.820000 | 102.970001 | 245.939163 | 282.450012 | 125.957771 | 211.250000 |
709 | 2022-10-25 | 151.855850 | 120.599998 | 104.930000 | 249.331085 | 291.019989 | 132.576080 | 222.419998 |
710 | 2022-10-26 | 148.875351 | 115.660004 | 94.820000 | 230.093628 | 298.619995 | 128.927017 | 224.639999 |
711 | 2022-10-27 | 144.339813 | 110.959999 | 92.599998 | 225.547836 | 296.940002 | 131.726288 | 225.089996 |
712 | 2022-10-28 | 155.245056 | 103.410004 | 96.580002 | 234.619492 | 295.720001 | 138.304611 | 228.520004 |
713 rows × 8 columns
# Reshape to tidy (long) form: columns Date / variable / value.
df.reset_index().melt(id_vars='Date') # tidy data
Date | variable | value | |
---|---|---|---|
0 | 2020-01-02 | AAPL | 73.449379 |
1 | 2020-01-03 | AAPL | 72.735313 |
2 | 2020-01-06 | AAPL | 73.314880 |
3 | 2020-01-07 | AAPL | 72.970078 |
4 | 2020-01-08 | AAPL | 74.143898 |
... | ... | ... | ... |
4986 | 2022-10-24 | TSLA | 211.250000 |
4987 | 2022-10-25 | TSLA | 222.419998 |
4988 | 2022-10-26 | TSLA | 224.639999 |
4989 | 2022-10-27 | TSLA | 225.089996 |
4990 | 2022-10-28 | TSLA | 228.520004 |
4991 rows × 3 columns
# Plot a single series (AMZN) against Date.
df.reset_index().plot(x='Date', y='AMZN')
<AxesSubplot:xlabel='Date'>
-
예시2
# Equivalent to the same call with kind omitted.
df.reset_index().plot(x='Date',y='AMZN', kind='line')
<AxesSubplot:xlabel='Date'>
-
예시3
# Use the plot.line accessor instead of passing kind='line'.
df.reset_index().plot.line(x='Date',y='AMZN')
<AxesSubplot:xlabel='Date'>
matplotlib: 2개의 y를 겹쳐서 그리기
-
2개의 y를 겹쳐 그리기
# Overlay two series (AMZN and AAPL) on one axes.
df.reset_index().plot(x='Date', y=['AMZN','AAPL'])
<AxesSubplot:xlabel='Date'>
matplotlib: 모든 y를 겹쳐서 그리기
-
모든 y를 겹쳐서 그리기
# With y omitted, every remaining column is overlaid on one axes.
df.reset_index().plot(x='Date')
<AxesSubplot:xlabel='Date'>
matplotlib: 그림크기조정
# figsize controls the overall figure size.
df.reset_index().plot(x='Date',figsize=(8,8))
<AxesSubplot:xlabel='Date'>
matplotlib: 서브플랏
-
예시1: 기본 서브플랏
# subplots=True draws each column on its own axes instead of overlaying them.
df.reset_index().plot.line(x='Date',subplots=True,figsize=(10,10))
array([<AxesSubplot:xlabel='Date'>, <AxesSubplot:xlabel='Date'>,
<AxesSubplot:xlabel='Date'>, <AxesSubplot:xlabel='Date'>,
<AxesSubplot:xlabel='Date'>, <AxesSubplot:xlabel='Date'>,
<AxesSubplot:xlabel='Date'>], dtype=object)
-
예시2: 레이아웃 조정
# layout=(4,2) arranges the subplots on a grid of 4 rows by 2 columns.
df.reset_index().plot.line(x='Date',subplots=True,figsize=(15,15),layout=(4,2))
array([[<AxesSubplot:xlabel='Date'>, <AxesSubplot:xlabel='Date'>],
[<AxesSubplot:xlabel='Date'>, <AxesSubplot:xlabel='Date'>],
[<AxesSubplot:xlabel='Date'>, <AxesSubplot:xlabel='Date'>],
[<AxesSubplot:xlabel='Date'>, <AxesSubplot:xlabel='Date'>]],
dtype=object)
matplotlib: 폰트조정
# fontsize sets the tick-label font size on every subplot.
df.reset_index().plot.line(x='Date',subplots=True,figsize=(15,15),layout=(4,2),fontsize=15)
array([[<AxesSubplot:xlabel='Date'>, <AxesSubplot:xlabel='Date'>],
[<AxesSubplot:xlabel='Date'>, <AxesSubplot:xlabel='Date'>],
[<AxesSubplot:xlabel='Date'>, <AxesSubplot:xlabel='Date'>],
[<AxesSubplot:xlabel='Date'>, <AxesSubplot:xlabel='Date'>]],
dtype=object)
matplotlib: 레전드삭제
# legend=False removes the legend from each subplot.
df.reset_index().plot.line(x='Date',subplots=True, layout=(4,2), legend=False)
array([[<AxesSubplot:xlabel='Date'>, <AxesSubplot:xlabel='Date'>],
[<AxesSubplot:xlabel='Date'>, <AxesSubplot:xlabel='Date'>],
[<AxesSubplot:xlabel='Date'>, <AxesSubplot:xlabel='Date'>],
[<AxesSubplot:xlabel='Date'>, <AxesSubplot:xlabel='Date'>]],
dtype=object)
plotly 모든y를 겹쳐서 그리기
-
방법1
# Method 1: build the long form with stack(); the result has columns Date / level_1 / 0.
df.reset_index().set_index('Date').stack().reset_index()
Date | level_1 | 0 | |
---|---|---|---|
0 | 2020-01-02 | AAPL | 73.449379 |
1 | 2020-01-02 | AMZN | 94.900497 |
2 | 2020-01-02 | GOOG | 68.368500 |
3 | 2020-01-02 | MSFT | 155.761826 |
4 | 2020-01-02 | NFLX | 329.809998 |
... | ... | ... | ... |
4986 | 2022-10-28 | GOOG | 96.580002 |
4987 | 2022-10-28 | MSFT | 234.619492 |
4988 | 2022-10-28 | NFLX | 295.720001 |
4989 | 2022-10-28 | NVDA | 138.304611 |
4990 | 2022-10-28 | TSLA | 228.520004 |
4991 rows × 3 columns
-
방법2
# Method 2: melt to long form, then draw one coloured line per ticker with the plotly backend.
df.reset_index().melt(id_vars='Date').plot.line(backend='plotly',x='Date',y='value',color='variable')
bar
data2: 핸드폰점유율
# Load the 2020 mobile-phone market-share table (one row per month) and display it.
df = pd.read_csv('https://raw.githubusercontent.com/kalilurrahman/datasets/main/mobilephonemktshare2020.csv')
df
Date | Samsung | Apple | Huawei | Xiaomi | Oppo | Mobicel | Motorola | LG | Others | Realme | Nokia | Lenovo | OnePlus | Sony | Asus | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2019-10 | 31.49 | 22.09 | 10.02 | 7.79 | 4.10 | 3.15 | 2.41 | 2.40 | 9.51 | 0.54 | 2.35 | 0.95 | 0.96 | 0.70 | 0.84 | 0.74 |
1 | 2019-11 | 31.36 | 22.90 | 10.18 | 8.16 | 4.42 | 3.41 | 2.40 | 2.40 | 9.10 | 0.78 | 0.66 | 0.97 | 0.97 | 0.73 | 0.83 | 0.75 |
2 | 2019-12 | 31.37 | 24.79 | 9.95 | 7.73 | 4.23 | 3.19 | 2.50 | 2.54 | 8.13 | 0.84 | 0.75 | 0.90 | 0.87 | 0.74 | 0.77 | 0.70 |
3 | 2020-01 | 31.29 | 24.76 | 10.61 | 8.10 | 4.25 | 3.02 | 2.42 | 2.40 | 7.55 | 0.88 | 0.69 | 0.88 | 0.86 | 0.79 | 0.80 | 0.69 |
4 | 2020-02 | 30.91 | 25.89 | 10.98 | 7.80 | 4.31 | 2.89 | 2.36 | 2.34 | 7.06 | 0.89 | 0.70 | 0.81 | 0.77 | 0.78 | 0.80 | 0.69 |
5 | 2020-03 | 30.80 | 27.03 | 10.70 | 7.70 | 4.30 | 2.87 | 2.35 | 2.28 | 6.63 | 0.93 | 0.73 | 0.72 | 0.74 | 0.78 | 0.76 | 0.66 |
6 | 2020-04 | 30.41 | 28.79 | 10.28 | 7.60 | 4.20 | 2.75 | 2.51 | 2.28 | 5.84 | 0.90 | 0.75 | 0.69 | 0.71 | 0.80 | 0.76 | 0.70 |
7 | 2020-05 | 30.18 | 26.72 | 10.39 | 8.36 | 4.70 | 3.12 | 2.46 | 2.19 | 6.31 | 1.04 | 0.70 | 0.73 | 0.77 | 0.81 | 0.78 | 0.76 |
8 | 2020-06 | 31.06 | 25.26 | 10.69 | 8.55 | 4.65 | 3.18 | 2.57 | 2.11 | 6.39 | 1.04 | 0.68 | 0.74 | 0.75 | 0.77 | 0.78 | 0.75 |
9 | 2020-07 | 30.95 | 24.82 | 10.75 | 8.94 | 4.69 | 3.46 | 2.45 | 2.03 | 6.41 | 1.13 | 0.65 | 0.76 | 0.74 | 0.76 | 0.75 | 0.72 |
10 | 2020-08 | 31.04 | 25.15 | 10.73 | 8.90 | 4.69 | 3.38 | 2.39 | 1.96 | 6.31 | 1.18 | 0.63 | 0.74 | 0.72 | 0.75 | 0.73 | 0.70 |
11 | 2020-09 | 30.57 | 24.98 | 10.58 | 9.49 | 4.94 | 3.50 | 2.27 | 1.88 | 6.12 | 1.45 | 0.63 | 0.74 | 0.67 | 0.81 | 0.69 | 0.67 |
12 | 2020-10 | 30.25 | 26.53 | 10.44 | 9.67 | 4.83 | 2.54 | 2.21 | 1.79 | 6.04 | 1.55 | 0.63 | 0.69 | 0.65 | 0.85 | 0.67 | 0.64 |
matplotlib: 2개의 y를 겹쳐그리기
-
예시1
# Draw bars for Samsung and Apple per Date.
df.plot.bar(x='Date', y=['Samsung', 'Apple'])
<AxesSubplot:xlabel='Date'>
-
예시2: width옵션으로 폭조정
# width adjusts how wide the bars are drawn.
df.plot.bar(x='Date', y=['Samsung', 'Apple'], width=0.8)
<AxesSubplot:xlabel='Date'>
matplotlib: 2개의 y를 겹쳐그리기 + x,y 플립
-
예시: barh를 이용하여 플립
# barh flips the axes: Date goes on the y axis and bars run horizontally.
df.plot.barh(x='Date', y=['Samsung', 'Apple'], width=0.8)
<AxesSubplot:ylabel='Date'>
plotly: 모든y를 stacked bar로 나타내기
# Melt to long form and draw every brand as a coloured bar series with the plotly backend.
df.melt(id_vars='Date').plot.bar(backend='plotly',x='Date',y='value',color='variable')
plotly: 3개의 y를 겹쳐그리기
# Keep only three brands from the long-form table, then draw them with the plotly backend.
df.melt(id_vars='Date')\
.query('variable=="Samsung" or variable=="Apple" or variable == "Huawei"')\
.plot.bar(backend='plotly', x='Date', y='value', color='variable')
-
barmode=‘group’
# Same three brands; barmode='group' places the bars side by side.
df.melt(id_vars='Date')\
.query('variable=="Samsung" or variable=="Apple" or variable == "Huawei"')\
.plot.bar(backend='plotly', x='Date', y='value', color='variable', barmode='group')
plotly: 3개의 y를 겹쳐그리기 + text
# Grouped bars for three brands; text='value' labels each bar with its value.
df.melt(id_vars='Date')\
.query('variable=="Samsung" or variable=="Apple" or variable == "Huawei"')\
.plot.bar(backend='plotly', x='Date', y='value', color='variable', barmode='group', text='value', height=600)
plotly: 면분할로 subplot그리기 (facet_col)
# facet_col='variable' gives each brand its own panel, laid out as columns.
df.melt(id_vars='Date').query(' variable=="Samsung" or variable=="Apple"')\
.plot.bar(backend='plotly',x='Date',y='value',color='variable',barmode='group',facet_col='variable')
plotly: 면분할로 subplot그리기 (facet_row)
# facet_row='variable' gives each brand its own panel, laid out as rows.
df.melt(id_vars='Date').query(' variable=="Samsung" or variable=="Apple"')\
.plot.bar(backend='plotly',x='Date',y='value',color='variable',barmode='group',facet_row='variable')
boxplot
data3: 팁
import plotly.express as px
# Load the tips sample dataset bundled with plotly express and display it.
df = px.data.tips()
df
total_bill | tip | sex | smoker | day | time | size | |
---|---|---|---|---|---|---|---|
0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
... | ... | ... | ... | ... | ... | ... | ... |
239 | 29.03 | 5.92 | Male | No | Sat | Dinner | 3 |
240 | 27.18 | 2.00 | Female | Yes | Sat | Dinner | 2 |
241 | 22.67 | 2.00 | Male | Yes | Sat | Dinner | 2 |
242 | 17.82 | 1.75 | Male | No | Sat | Dinner | 2 |
243 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 |
244 rows × 7 columns
plotly: 팁의 박스플랏
# Box plot of tip alone.
df.plot.box(backend='plotly',y='tip', width=500, height=500)
plotly: 시간에 따른 팁의 박스플랏
# Box plot of tip, one box per time value.
df.plot.box(backend='plotly',x='time', y='tip', width=500, height=500)
plotly: 시간과 성별에 따른 팁의 박스플랏
-
예시1: y=‘tip’, x=‘time’, color=‘sex’
# Box plot of tip by time, with boxes coloured by sex.
df.plot.box(backend='plotly',x='time', y='tip', color='sex', width=500, height=500)
-
예시2: y=‘tip’, x=‘time’, color=‘sex’, points=‘all’
# points='all' also draws every individual observation next to its box.
df.plot.box(backend='plotly',x='time', y='tip', color='sex', points='all',width=500, height=500)
- 저녁에 손님이 더 많다
plotly: 시간,성별,요일에 따른 팁의 박스플랏
-
예시1: y=‘tip’, x=‘time’, color=‘sex’, facet_col=‘day’
# Tip by time and sex, with one column panel per day.
df.plot.box(backend='plotly',x='time', y='tip', color='sex', facet_col='day', width=500, height=500)
-
예시2: y=‘tip’, color=‘sex’, facet_col=‘time’, facet_row=‘day’
# Grid of panels: columns by time, rows by day; boxes coloured by sex with all points shown.
df.plot.box(backend='plotly',facet_col='time', facet_row='day',y='tip',color='sex',points='all',height=1000)
plotly: 시간,성별,요일,흡연에 따른 팁의 박스플랏
# Same time-by-day grid, with smoker added on the x axis inside each panel.
df.plot.box(backend='plotly',facet_col='time', facet_row='day',x='smoker',y='tip',color='sex',points='all',height=1000)
histogram
data4: 인사자료
# Load the HR dataset (one row per employee) and display it.
df = pd.read_csv('https://raw.githubusercontent.com/guebin/DV2022/master/posts/HRDataset_v14.csv')
df
Employee_Name | EmpID | MarriedID | MaritalStatusID | GenderID | EmpStatusID | DeptID | PerfScoreID | FromDiversityJobFairID | Salary | ... | ManagerName | ManagerID | RecruitmentSource | PerformanceScore | EngagementSurvey | EmpSatisfaction | SpecialProjectsCount | LastPerformanceReview_Date | DaysLateLast30 | Absences | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Adinolfi, Wilson K | 10026 | 0 | 0 | 1 | 1 | 5 | 4 | 0 | 62506 | ... | Michael Albert | 22.0 | Exceeds | 4.60 | 5 | 0 | 1/17/2019 | 0 | 1 | |
1 | Ait Sidi, Karthikeyan | 10084 | 1 | 1 | 1 | 5 | 3 | 3 | 0 | 104437 | ... | Simon Roup | 4.0 | Indeed | Fully Meets | 4.96 | 3 | 6 | 2/24/2016 | 0 | 17 |
2 | Akinkuolie, Sarah | 10196 | 1 | 1 | 0 | 5 | 5 | 3 | 0 | 64955 | ... | Kissy Sullivan | 20.0 | Fully Meets | 3.02 | 3 | 0 | 5/15/2012 | 0 | 3 | |
3 | Alagbe,Trina | 10088 | 1 | 1 | 0 | 1 | 5 | 3 | 0 | 64991 | ... | Elijiah Gray | 16.0 | Indeed | Fully Meets | 4.84 | 5 | 0 | 1/3/2019 | 0 | 15 |
4 | Anderson, Carol | 10069 | 0 | 2 | 0 | 5 | 5 | 3 | 0 | 50825 | ... | Webster Butler | 39.0 | Google Search | Fully Meets | 5.00 | 4 | 0 | 2/1/2016 | 0 | 2 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
306 | Woodson, Jason | 10135 | 0 | 0 | 1 | 1 | 5 | 3 | 0 | 65893 | ... | Kissy Sullivan | 20.0 | Fully Meets | 4.07 | 4 | 0 | 2/28/2019 | 0 | 13 | |
307 | Ybarra, Catherine | 10301 | 0 | 0 | 0 | 5 | 5 | 1 | 0 | 48513 | ... | Brannon Miller | 12.0 | Google Search | PIP | 3.20 | 2 | 0 | 9/2/2015 | 5 | 4 |
308 | Zamora, Jennifer | 10010 | 0 | 0 | 0 | 1 | 3 | 4 | 0 | 220450 | ... | Janet King | 2.0 | Employee Referral | Exceeds | 4.60 | 5 | 6 | 2/21/2019 | 0 | 16 |
309 | Zhou, Julia | 10043 | 0 | 0 | 0 | 1 | 3 | 3 | 0 | 89292 | ... | Simon Roup | 4.0 | Employee Referral | Fully Meets | 5.00 | 3 | 5 | 2/1/2019 | 0 | 11 |
310 | Zima, Colleen | 10271 | 0 | 4 | 0 | 1 | 5 | 3 | 0 | 45046 | ... | David Stanley | 14.0 | Fully Meets | 4.50 | 5 | 0 | 1/30/2019 | 0 | 2 |
311 rows × 36 columns
인종별 급여비교 (단순 groupby)
# Mean salary and head count per RaceDesc group.
df.groupby('RaceDesc').agg({'Salary':[np.mean,"count"]})
Salary | ||
---|---|---|
mean | count | |
RaceDesc | ||
American Indian or Alaska Native | 65806.000000 | 3 |
Asian | 68521.206897 | 29 |
Black or African American | 74431.025000 | 80 |
Hispanic | 83667.000000 | 1 |
Two or more races | 59998.181818 | 11 |
White | 67287.545455 | 187 |
평균은 히스토그램을 그려봤을 때 분포가 어느 정도 정규분포를 띠어야 의미가 있다
급여의 시각화
-
예시1
# Salary histograms for two RaceDesc groups, one panel per group.
df.query('RaceDesc == "Black or African American" or RaceDesc == "White"')\
.plot.hist(backend='plotly', x='Salary', color='RaceDesc', facet_col='RaceDesc')
-
예시2: 비율로 계산
# Same histograms, with histnorm='probability' so bar heights are proportions rather than counts.
df.query('RaceDesc == "Black or African American" or RaceDesc == "White"')\
.plot.hist(backend='plotly',x='Salary',color='RaceDesc',facet_col='RaceDesc',histnorm='probability')