59doit
[Pandas & Numpy] 시계열 데이터 csv 본문
반응형
# Notebook-style setup: imports plus display and reproducibility settings.
import numpy as np
import pandas as pd
# Cap the number of rows pandas shows when displaying a frame.
pd.options.display.max_rows = 20
# Fixed seed so the random values generated later are reproducible.
np.random.seed(12345)
import matplotlib.pyplot as plt
# Default figure size for any plots.
plt.rc('figure', figsize=(10, 6))
# Print NumPy floats with 4 digits of precision; suppress=True avoids
# scientific notation for small values.
np.set_printoptions(precision=4, suppress=True)
시계열 데이터는 일반적으로 시간 순서대로 나열된 긴(long) 형식으로 저장된다.
# Load the macro dataset (the path is specific to the author's machine).
data = pd.read_csv('C:/macrodata.csv')
data.head()
# Build the quarterly index from 'YYYYQn' labels. The field-based form
# PeriodIndex(year=..., quarter=...) is deprecated in recent pandas releases,
# whereas parsing period strings works on old and new versions alike.
# int() assumes the year/quarter columns hold whole numbers with no missing
# values -- TODO confirm against the CSV.
quarter_labels = [f'{int(y)}Q{int(q)}'
                  for y, q in zip(data['year'], data['quarter'])]
periods = pd.PeriodIndex(quarter_labels, freq='Q-DEC', name='date')
# Keep only the three series of interest; naming the column index 'item'
# makes stack() produce an 'item' column below.
columns = pd.Index(['realgdp', 'infl', 'unemp'], name='item')
data = data.reindex(columns=columns)
# Use the end-of-quarter timestamp of each period as the row index.
data.index = periods.to_timestamp('D', 'end')
# Long format: one row per (date, item) pair with the observation in 'value'.
ldata = data.stack().reset_index().rename(columns={0: 'value'})
ldata[:10]
# <출력>
# date item value
# 0 1959-03-31 23:59:59.999999999 realgdp 2710.349
# 1 1959-03-31 23:59:59.999999999 infl 0.000
# 2 1959-03-31 23:59:59.999999999 unemp 5.800
# 3 1959-06-30 23:59:59.999999999 realgdp 2778.801
# 4 1959-06-30 23:59:59.999999999 infl 2.340
# 5 1959-06-30 23:59:59.999999999 unemp 5.100
# 6 1959-09-30 23:59:59.999999999 realgdp 2775.488
# 7 1959-09-30 23:59:59.999999999 infl 2.740
# 8 1959-09-30 23:59:59.999999999 unemp 5.300
# 9 1959-12-31 23:59:59.999999999 realgdp 2785.204
▷
item 값마다 하나의 컬럼을 두고 date 를 색인으로 하는 형태로 바꾸는 변형은 pivot 메서드가 지원한다.
# Long -> wide: one row per date, one column per item, cells taken from 'value'.
# DataFrame.pivot accepts keyword arguments only in pandas 2.0+, and the
# keyword form also works on older releases.
pivoted = ldata.pivot(index='date', columns='item', values='value')
pivoted
# <출력>
# item infl realgdp unemp
# date
# 1959-03-31 23:59:59.999999999 0.00 2710.349 5.8
# 1959-06-30 23:59:59.999999999 2.34 2778.801 5.1
# 1959-09-30 23:59:59.999999999 2.74 2775.488 5.3
# 1959-12-31 23:59:59.999999999 0.27 2785.204 5.6
# 1960-03-31 23:59:59.999999999 2.31 2847.699 5.2
# ... ... ...
# 2008-09-30 23:59:59.999999999 -3.16 13324.600 6.0
# 2008-12-31 23:59:59.999999999 -8.79 13141.920 6.9
# 2009-03-31 23:59:59.999999999 0.94 12925.410 8.1
# 2009-06-30 23:59:59.999999999 3.37 12901.504 9.2
# 2009-09-30 23:59:59.999999999 3.56 12990.341 9.6
# [203 rows x 3 columns]
▷
한 번에 두 개의 값 컬럼을 동시에 변형하는 경우를 살펴보자.
# Attach a second value column of standard-normal draws, one per row.
ldata['value2'] = np.random.standard_normal(ldata.shape[0])
ldata.head(10)
# <출력>
# date item value value2
# 0 1959-03-31 23:59:59.999999999 realgdp 2710.349 -0.204708
# 1 1959-03-31 23:59:59.999999999 infl 0.000 0.478943
# 2 1959-03-31 23:59:59.999999999 unemp 5.800 -0.519439
# 3 1959-06-30 23:59:59.999999999 realgdp 2778.801 -0.555730
# 4 1959-06-30 23:59:59.999999999 infl 2.340 1.965781
# 5 1959-06-30 23:59:59.999999999 unemp 5.100 1.393406
# 6 1959-09-30 23:59:59.999999999 realgdp 2775.488 0.092908
# 7 1959-09-30 23:59:59.999999999 infl 2.740 0.281746
# 8 1959-09-30 23:59:59.999999999 unemp 5.300 0.769023
# 9 1959-12-31 23:59:59.999999999 realgdp 2785.204 1.246435
▷
마지막 인자를 생략해서 계층적 컬럼을 가지는 DataFrame 을 얻을 수 있다.
# With no `values` argument every remaining column is pivoted, giving
# hierarchical columns (value column on the outer level, item on the inner).
# Keyword arguments are required by pandas 2.0+ and accepted by older releases.
pivoted = ldata.pivot(index='date', columns='item')
pivoted[:5]
# <출력>
# value ... value2
# item infl realgdp ... realgdp unemp
# date ...
# 1959-03-31 23:59:59.999999999 0.00 2710.349 ... -0.204708 -0.519439
# 1959-06-30 23:59:59.999999999 2.34 2778.801 ... -0.555730 1.393406
# 1959-09-30 23:59:59.999999999 2.74 2775.488 ... 0.092908 0.769023
# 1959-12-31 23:59:59.999999999 0.27 2785.204 ... 1.246435 -1.296221
# 1960-03-31 23:59:59.999999999 2.31 2847.699 ... 0.274992 1.352917
# Select the 'value' group from the hierarchical columns and show the first five rows.
pivoted['value'].head(5)
# <출력>
# item infl realgdp unemp
# date
# 1959-03-31 23:59:59.999999999 0.00 2710.349 5.8
# 1959-06-30 23:59:59.999999999 2.34 2778.801 5.1
# 1959-09-30 23:59:59.999999999 2.74 2775.488 5.3
# 1959-12-31 23:59:59.999999999 0.27 2785.204 5.6
# 1960-03-31 23:59:59.999999999 2.31 2847.699 5.2
▷
pivot 은 set_index 로 계층적 색인을 만들고 unstack 메서드를 이용해서 형태를 변경하는 단축키 같은 메서드다.
# Move 'date' and 'item' into the row index, then rotate the 'item' level
# out into the columns.
unstacked = ldata.set_index(['date', 'item']).unstack(level='item')
unstacked.head(7)
# <출력>
# value ... value2
# item infl realgdp ... realgdp unemp
# date ...
# 1959-03-31 23:59:59.999999999 0.00 2710.349 ... -0.204708 -0.519439
# 1959-06-30 23:59:59.999999999 2.34 2778.801 ... -0.555730 1.393406
# 1959-09-30 23:59:59.999999999 2.74 2775.488 ... 0.092908 0.769023
# 1959-12-31 23:59:59.999999999 0.27 2785.204 ... 1.246435 -1.296221
# 1960-03-31 23:59:59.999999999 2.31 2847.699 ... 0.274992 1.352917
# 1960-06-30 23:59:59.999999999 0.14 2834.390 ... 0.886429 -0.371843
# 1960-09-30 23:59:59.999999999 2.70 2839.022 ... 1.669025 -0.539741
# [7 rows x 6 columns]
▷
“Wide” to “Long”
# Small wide-format frame: an identifier column 'key' plus value columns A, B, C.
df = pd.DataFrame(
    [('foo', 1, 4, 7), ('bar', 2, 5, 8), ('baz', 3, 6, 9)],
    columns=['key', 'A', 'B', 'C'],
)
df
# <출력>
# key A B C
# 0 foo 1 4 7
# 1 bar 2 5 8
# 2 baz 3 6 9
▷
'key'를 그룹 구분자로 지정
# Wide -> long: keep 'key' as the identifier and fold every other column
# into 'variable'/'value' pairs.
melted = df.melt(id_vars=['key'])
melted
# <출력>
# key variable value
# 0 foo A 1
# 1 bar A 2
# 2 baz A 3
# 3 foo B 4
# 4 bar B 5
# 5 baz B 6
# 6 foo C 7
# 7 bar C 8
# 8 baz C 9
▷
pivot 을 사용하여 원래 모양으로 되돌릴 수 있다.
# Reverse the melt: rows by 'key', one column per 'variable', cells from 'value'.
# DataFrame.pivot accepts keyword arguments only in pandas 2.0+, and the
# keyword form also works on older releases.
reshaped = melted.pivot(index='key', columns='variable', values='value')
reshaped
# <출력>
# variable A B C
# key
# bar 2 5 8
# baz 3 6 9
# foo 1 4 7
▷
reset_index 를 이용해서 데이터를 다시 컬럼으로 되돌려놓는다.
# Turn the 'key' index back into an ordinary column (the index is kept, not dropped).
reshaped.reset_index(drop=False)
# <출력>
# variable key A B C
# 0 bar 2 5 8
# 1 baz 3 6 9
# 2 foo 1 4 7
▷
데이터값으로 사용할 컬럼들의 집합을 지정할 수도 있다.
# Melt only columns A and B, keeping 'key' as the identifier column.
df.melt(id_vars=['key'], value_vars=['A', 'B'])
# <출력>
# key variable value
# 0 foo A 1
# 1 bar A 2
# 2 baz A 3
# 3 foo B 4
# 4 bar B 5
# 5 baz B 6
▷
pandas.melt 는 그룹 구분자 없이도 사용할 수 있다.
# Melt A, B and C with no identifier column at all.
df.melt(value_vars=['A', 'B', 'C'])
# <출력>
# variable value
# 0 A 1
# 1 A 2
# 2 A 3
# 3 B 4
# 4 B 5
# 5 B 6
# 6 C 7
# 7 C 8
# 8 C 9
# 'key' can itself be melted as a value column alongside A and B.
df.melt(value_vars=['key', 'A', 'B'])
# <출력>
# variable value
# 0 key foo
# 1 key bar
# 2 key baz
# 3 A 1
# 4 A 2
# 5 A 3
# 6 B 4
# 7 B 5
# 8 B 6
반응형
'Programming > Python(파이썬)' 카테고리의 다른 글
[ Python - Pandas ] 데이터 집계 & csv (0) | 2022.11.12 |
---|---|
[Python-Pandas] group by (0) | 2022.11.12 |
[Python-Numpy]-combining #3 (0) | 2022.11.11 |
[Python-Numpy] -combining #2 (0) | 2022.11.11 |
[Python-Pandas] combining #1 (0) | 2022.11.11 |
Comments