59doit

[Pandas & Numpy] 시계열 데이터 csv 본문

Programming/Python(파이썬)

[Pandas & Numpy] 시계열 데이터 csv

yul_S2 2022. 11. 12. 06:27
반응형

macrodata.csv
0.02MB

 

 

# Notebook-style setup: imports plus display/reproducibility settings.
import numpy as np
import pandas as pd
# Show at most 20 rows when a DataFrame is displayed.
pd.options.display.max_rows = 20
# Seed NumPy's global random generator so the random columns below are reproducible.
np.random.seed(12345)
import matplotlib.pyplot as plt
# Default figure size for any plots.
plt.rc('figure', figsize=(10, 6))
# Print NumPy arrays with 4 decimal places and without scientific notation.
np.set_printoptions(precision=4, suppress=True)

 

시계열 데이터는 일반적으로 시간 순서대로 나열된 긴(long) 형식으로 저장된다.

# Load the macro data set. The path is machine-specific and must point at the
# downloaded macrodata.csv.
data = pd.read_csv('C:/macrodata.csv')
data.head()
# Build a quarterly PeriodIndex from the 'year' and 'quarter' columns.
# Passing field keywords directly to the PeriodIndex constructor is deprecated
# since pandas 2.2, so prefer PeriodIndex.from_fields when it is available and
# fall back to the constructor on older pandas versions.
if hasattr(pd.PeriodIndex, 'from_fields'):
    periods = pd.PeriodIndex.from_fields(year=data.year, quarter=data.quarter)
    periods = periods.rename('date')
else:
    periods = pd.PeriodIndex(year=data.year, quarter=data.quarter, name='date')
# Keep only the three series of interest and label the column axis 'item'.
columns = pd.Index(['realgdp', 'infl', 'unemp'], name='item')
data = data.reindex(columns=columns)
# Index the rows by the timestamp at the end of each quarter.
data.index = periods.to_timestamp('D', 'end')
# Stack into long format: one row per (date, item) pair with a 'value' column.
ldata = data.stack().reset_index().rename(columns={0: 'value'})

ldata[:10]
# <출력>
#                            date     item     value
# 0 1959-03-31 23:59:59.999999999  realgdp  2710.349
# 1 1959-03-31 23:59:59.999999999     infl     0.000
# 2 1959-03-31 23:59:59.999999999    unemp     5.800
# 3 1959-06-30 23:59:59.999999999  realgdp  2778.801
# 4 1959-06-30 23:59:59.999999999     infl     2.340
# 5 1959-06-30 23:59:59.999999999    unemp     5.100
# 6 1959-09-30 23:59:59.999999999  realgdp  2775.488
# 7 1959-09-30 23:59:59.999999999     infl     2.740
# 8 1959-09-30 23:59:59.999999999    unemp     5.300
# 9 1959-12-31 23:59:59.999999999  realgdp  2785.204

pivot 메서드가 이런 변형(긴 형식 → 넓은 형식)을 지원한다.

# Reshape long -> wide: one row per date, one column per item, cells taken
# from 'value'. The arguments are passed by keyword because positional
# arguments to DataFrame.pivot were deprecated in pandas 1.5 and raise a
# TypeError from pandas 2.0 on.
pivoted = ldata.pivot(index='date', columns='item', values='value')

pivoted
# <출력>
# item                           infl    realgdp  unemp
# date
# 1959-03-31 23:59:59.999999999  0.00   2710.349    5.8
# 1959-06-30 23:59:59.999999999  2.34   2778.801    5.1
# 1959-09-30 23:59:59.999999999  2.74   2775.488    5.3
# 1959-12-31 23:59:59.999999999  0.27   2785.204    5.6
# 1960-03-31 23:59:59.999999999  2.31   2847.699    5.2
#                              ...        ...    ...
# 2008-09-30 23:59:59.999999999 -3.16  13324.600    6.0
# 2008-12-31 23:59:59.999999999 -8.79  13141.920    6.9
# 2009-03-31 23:59:59.999999999  0.94  12925.410    8.1
# 2009-06-30 23:59:59.999999999  3.37  12901.504    9.2
# 2009-09-30 23:59:59.999999999  3.56  12990.341    9.6
# [203 rows x 3 columns]

 

 

한 번에 두 개의 컬럼을 동시에 변형하는 경우

# Add a second value column filled with standard-normal noise, one draw per row.
ldata['value2'] = np.random.randn(ldata.shape[0])

ldata.head(10)
# <출력>
#                            date     item     value    value2
# 0 1959-03-31 23:59:59.999999999  realgdp  2710.349 -0.204708
# 1 1959-03-31 23:59:59.999999999     infl     0.000  0.478943
# 2 1959-03-31 23:59:59.999999999    unemp     5.800 -0.519439
# 3 1959-06-30 23:59:59.999999999  realgdp  2778.801 -0.555730
# 4 1959-06-30 23:59:59.999999999     infl     2.340  1.965781
# 5 1959-06-30 23:59:59.999999999    unemp     5.100  1.393406
# 6 1959-09-30 23:59:59.999999999  realgdp  2775.488  0.092908
# 7 1959-09-30 23:59:59.999999999     infl     2.740  0.281746
# 8 1959-09-30 23:59:59.999999999    unemp     5.300  0.769023
# 9 1959-12-31 23:59:59.999999999  realgdp  2785.204  1.246435

 

 

마지막 인자(values)를 생략해서 계층적 컬럼을 가지는 DataFrame 을 얻을 수 있다.

# Omitting `values` pivots every remaining column, which yields hierarchical
# columns (value / value2 on the top level, item below). Keyword arguments are
# used because DataFrame.pivot no longer accepts positional arguments as of
# pandas 2.0.
pivoted = ldata.pivot(index='date', columns='item')

pivoted[:5]
# <출력>
#                               value            ...    value2
# item                           infl   realgdp  ...   realgdp     unemp
# date                                           ...
# 1959-03-31 23:59:59.999999999  0.00  2710.349  ... -0.204708 -0.519439
# 1959-06-30 23:59:59.999999999  2.34  2778.801  ... -0.555730  1.393406
# 1959-09-30 23:59:59.999999999  2.74  2775.488  ...  0.092908  0.769023
# 1959-12-31 23:59:59.999999999  0.27  2785.204  ...  1.246435 -1.296221
# 1960-03-31 23:59:59.999999999  2.31  2847.699  ...  0.274992  1.352917

# Select the 'value' top-level column group and show its first five rows.
pivoted['value'][:5]
# <출력>
# item                           infl   realgdp  unemp
# date
# 1959-03-31 23:59:59.999999999  0.00  2710.349    5.8
# 1959-06-30 23:59:59.999999999  2.34  2778.801    5.1
# 1959-09-30 23:59:59.999999999  2.74  2775.488    5.3
# 1959-12-31 23:59:59.999999999  0.27  2785.204    5.6
# 1960-03-31 23:59:59.999999999  2.31  2847.699    5.2

 

 

pivot 은 set_index 로 계층적 색인을 만든 뒤 unstack 메서드로 형태를 변경하는 것과 같은 단축 메서드이다.

# Same reshape done by hand: index the rows by (date, item), then move the
# 'item' level from the row index into the columns.
indexed = ldata.set_index(['date', 'item'])
unstacked = indexed.unstack(level='item')

unstacked.head(7)
# <출력>
#                               value            ...    value2
# item                           infl   realgdp  ...   realgdp     unemp
# date                                           ...
# 1959-03-31 23:59:59.999999999  0.00  2710.349  ... -0.204708 -0.519439
# 1959-06-30 23:59:59.999999999  2.34  2778.801  ... -0.555730  1.393406
# 1959-09-30 23:59:59.999999999  2.74  2775.488  ...  0.092908  0.769023
# 1959-12-31 23:59:59.999999999  0.27  2785.204  ...  1.246435 -1.296221
# 1960-03-31 23:59:59.999999999  2.31  2847.699  ...  0.274992  1.352917
# 1960-06-30 23:59:59.999999999  0.14  2834.390  ...  0.886429 -0.371843
# 1960-09-30 23:59:59.999999999  2.70  2839.022  ...  1.669025 -0.539741
# [7 rows x 6 columns]

 

 

“Wide” to “Long”

# Small wide-format example frame: one identifier column and three value columns.
wide_columns = dict(
    key=['foo', 'bar', 'baz'],
    A=[1, 2, 3],
    B=[4, 5, 6],
    C=[7, 8, 9],
)
df = pd.DataFrame(wide_columns)

df
# <출력>
#    key  A  B  C
# 0  foo  1  4  7
# 1  bar  2  5  8
# 2  baz  3  6  9

 

'key'를 그룹 구분자로 지정

# Wide -> long: keep 'key' as the identifier column and fold every other
# column into (variable, value) pairs.
melted = df.melt(id_vars=['key'])

melted
# <출력>
#    key variable  value
# 0  foo        A      1
# 1  bar        A      2
# 2  baz        A      3
# 3  foo        B      4
# 4  bar        B      5
# 5  baz        B      6
# 6  foo        C      7
# 7  bar        C      8
# 8  baz        C      9

 

 

 pivot 을 사용하여 원래 모양으로 되돌릴 수 있다.

# Long -> wide again: 'key' becomes the row index and each 'variable' becomes
# a column. Keyword arguments are required because DataFrame.pivot rejects
# positional arguments from pandas 2.0 on.
reshaped = melted.pivot(index='key', columns='variable', values='value')

reshaped
# <출력>
# variable  A  B  C
# key
# bar       2  5  8
# baz       3  6  9
# foo       1  4  7

 

 

reset_index 를 이용해서 데이터를 다시 컬럼으로 되돌려놓는다.

# Move the 'key' row index back into an ordinary column (returns a new frame;
# `reshaped` itself is not modified).
reshaped.reset_index()
# <출력>
# variable  key  A  B  C
# 0         bar  2  5  8
# 1         baz  3  6  9
# 2         foo  1  4  7

 

데이터값으로 사용할 컬럼들의 집합을 지정할 수도 있다.

# Melt only columns 'A' and 'B'; column 'C' is left out of the result.
pd.melt(df, id_vars=['key'], value_vars=['A', 'B'])
# <출력>
#    key variable  value
# 0  foo        A      1
# 1  bar        A      2
# 2  baz        A      3
# 3  foo        B      4
# 4  bar        B      5
# 5  baz        B      6

 

 

pandas.melt 는 그룹 구분자 없이도 사용할 수 있다.

# Without id_vars the result has only the 'variable' and 'value' columns.
pd.melt(df, value_vars=['A', 'B', 'C'])
# <출력>
#   variable  value
# 0        A      1
# 1        A      2
# 2        A      3
# 3        B      4
# 4        B      5
# 5        B      6
# 6        C      7
# 7        C      8
# 8        C      9


# 'key' can itself be melted as a value column; the resulting 'value' column
# then holds both strings and integers.
pd.melt(df, value_vars=['key', 'A', 'B'])
# <출력>
#   variable value
# 0      key   foo
# 1      key   bar
# 2      key   baz
# 3        A     1
# 4        A     2
# 5        A     3
# 6        B     4
# 7        B     5
# 8        B     6

 

 

 

 

반응형

'Programming > Python(파이썬)' 카테고리의 다른 글

[ Python - Pandas ] 데이터 집계 & csv  (0) 2022.11.12
[Python-Pandas] group by  (0) 2022.11.12
[Python-Numpy]-combining #3  (0) 2022.11.11
[Python-Numpy] -combining #2  (0) 2022.11.11
[Python-Pandas] combining #1  (0) 2022.11.11
Comments