59doit
[ Python - Pandas ] 데이터 집계 & csv 본문
반응형
import numpy as np
import pandas as pd
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_rows = 20
np.random.seed(12345)
import matplotlib.pyplot as plt
plt.rc('figure', figsize=(10, 6))
np.set_printoptions(precision=4, suppress=True)
데이터 집계
df
# <출력>
# key1 key2 data1 data2
# 0 a one 0.981007 -1.006219
# 1 a two -0.873717 -0.902148
# 2 b one -1.015634 0.752769
# 3 b two -0.411244 -0.490509
# 4 a one 1.465621 -0.524672
grouped = df.groupby('key1')
grouped['data1'].quantile(0.9)
# <출력>
# key1
# a 1.368698
# b -0.471683
# Name: data1, dtype: float64
▷
자신의 데이터 집계함수를 사용하려면 배열의 aggregate 나 agg 메서드에 해당 함수를 넘기면 된다.
def peak_to_peak(arr):
return arr.max() - arr.min()
grouped.agg(peak_to_peak)
# <출력>
# data1 data2
# key1
# a 2.339338 0.481547
# b 0.604391 1.243278
▷
describe ; 데이터를 집계하지 않아도 실행가능
grouped.describe()
# <출력>
# data1 ... data2
# count mean std min ... 25% 50% 75% max
# key1 ...
# a 3.0 0.524304 1.234730 -0.873717 ... -0.954183 -0.902148 -0.71341 -0.524672
# b 2.0 -0.713439 0.427369 -1.015634 ... -0.179689 0.131130 0.44195 0.752769
# [2 rows x 16 columns]
column wise
▷
read_csv()함수로 데이터를 불러온 다음 팁의 비율을 담기 위한 컬럼인 tip_pct 를 추가
tips = pd.read_csv('C:/tips.csv')
▷
Add tip percentage of total bill
tips['tip_pct'] = tips['tip'] / tips['total_bill']
tips[:6]
# <출력>
# total_bill tip smoker day time size tip_pct
# 0 16.99 1.01 No Sun Dinner 2 0.059447
# 1 10.34 1.66 No Sun Dinner 3 0.160542
# 2 21.01 3.50 No Sun Dinner 3 0.166587
# 3 23.68 3.31 No Sun Dinner 2 0.139780
# 4 24.59 3.61 No Sun Dinner 4 0.146808
# 5 25.29 4.71 No Sun Dinner 4 0.186240
▷
함수 이름을 문자열로 넘기기
grouped = tips.groupby(['day', 'smoker'])
grouped_pct = grouped['tip_pct']
grouped_pct.agg('mean')
# <출력>
# day smoker
# Fri No 0.151650
# Yes 0.174783
# Sat No 0.158048
# Yes 0.147906
# Sun No 0.160113
# Yes 0.187250
# Thur No 0.160298
# Yes 0.163863
# Name: tip_pct, dtype: float64
▷
함수 이름을 컬럼 이름으로 하는 DataFrame 생성
grouped_pct.agg(['mean', 'std', peak_to_peak])
# <출력>
# mean std peak_to_peak
# day smoker
# Fri No 0.151650 0.028123 0.067349
# Yes 0.174783 0.051293 0.159925
# Sat No 0.158048 0.039767 0.235193
# Yes 0.147906 0.061375 0.290095
# Sun No 0.160113 0.042347 0.193226
# Yes 0.187250 0.154134 0.644685
# Thur No 0.160298 0.038774 0.193350
# Yes 0.163863 0.039389 0.151240
▷
2 개의 튜플을 가지는 리스트가 순서대로 매핑된다.
grouped_pct.agg([('foo', 'mean'), ('bar', np.std)])
# <출력>
# foo bar
# day smoker
# Fri No 0.151650 0.028123
# Yes 0.174783 0.051293
# Sat No 0.158048 0.039767
# Yes 0.147906 0.061375
# Sun No 0.160113 0.042347
# Yes 0.187250 0.154134
# Thur No 0.160298 0.038774
# Yes 0.163863 0.039389
▷
tip_pct 와 total_bill 컬럼에 대해 동일한 세 가지 통계를 계산한다고 가정
▷
튜플의 리스트를 넘기는 것도 가능
Durchschnitt ; 평균 , Abweichung ; 편차
ftuples = [('Durchschnitt', 'mean'), ('Abweichung', np.var)]
grouped['tip_pct', 'total_bill'].agg(ftuples)
# <출력>
# tip_pct total_bill
# Durchschnitt Abweichung Durchschnitt Abweichung
# day smoker
# Fri No 0.151650 0.000791 18.420000 25.596333
# Yes 0.174783 0.002631 16.813333 82.562438
# Sat No 0.158048 0.001581 19.661778 79.908965
# Yes 0.147906 0.003767 21.276667 101.387535
# Sun No 0.160113 0.001793 20.506667 66.099980
# Yes 0.187250 0.023757 24.120000 109.046044
# Thur No 0.160298 0.001503 17.113111 59.625081
# Yes 0.163863 0.001551 19.190588 69.808518
▷
컬럼마다 다른함수 적용하기 ; agg 메서드
grouped.agg({'tip' : np.max, 'size' : 'sum'})
grouped.agg({'tip_pct' : ['min', 'max', 'mean', 'std'], 'size' : 'sum'})
# <출력>
# tip_pct size
# min max mean std sum
# day smoker
# Fri No 0.120385 0.187735 0.151650 0.028123 9
# Yes 0.103555 0.263480 0.174783 0.051293 31
# Sat No 0.056797 0.291990 0.158048 0.039767 115
# Yes 0.035638 0.325733 0.147906 0.061375 104
# Sun No 0.059447 0.252672 0.160113 0.042347 167
# Yes 0.065660 0.710345 0.187250 0.154134 49
# Thur No 0.072961 0.266312 0.160298 0.038774 112
# Yes 0.090014 0.241255 0.163863 0.039389 40
▷
as_index=False ; 색인되지 않도록 함 -> 불필요한 계산 피할 수 있음
tips.groupby(['day', 'smoker'], as_index=False).mean()
# <출력>
# day smoker total_bill tip size tip_pct
# 0 Fri No 18.420000 2.812500 2.250000 0.151650
# 1 Fri Yes 16.813333 2.714000 2.066667 0.174783
# 2 Sat No 19.661778 3.102889 2.555556 0.158048
# 3 Sat Yes 21.276667 2.875476 2.476190 0.147906
# 4 Sun No 20.506667 3.167895 2.929825 0.160113
# 5 Sun Yes 24.120000 3.516842 2.578947 0.187250
# 6 Thur No 17.113111 2.673778 2.488889 0.160298
# 7 Thur Yes 19.190588 3.030000 2.352941 0.163863
apply
▷
def top(df, n=5, column='tip_pct'):
return df.sort_values(by=column)[-n:]
top(tips, n=6)
# <출력>
# total_bill tip smoker day time size tip_pct
# 109 14.31 4.00 Yes Sat Dinner 2 0.279525
# 183 23.17 6.50 Yes Sun Dinner 4 0.280535
# 232 11.61 3.39 No Sat Dinner 2 0.291990
# 67 3.07 1.00 Yes Sat Dinner 1 0.325733
# 178 9.60 4.00 Yes Sun Dinner 2 0.416667
# 172 7.25 5.15 Yes Sun Dinner 2 0.710345
tips.groupby('smoker').apply(top)
# <출력>
# total_bill tip smoker day time size tip_pct
# smoker
# No 88 24.71 5.85 No Thur Lunch 2 0.236746
# 185 20.69 5.00 No Sun Dinner 5 0.241663
# 51 10.29 2.60 No Sun Dinner 2 0.252672
# 149 7.51 2.00 No Thur Lunch 2 0.266312
# 232 11.61 3.39 No Sat Dinner 2 0.291990
# Yes 109 14.31 4.00 Yes Sat Dinner 2 0.279525
# 183 23.17 6.50 Yes Sun Dinner 4 0.280535
# 67 3.07 1.00 Yes Sat Dinner 1 0.325733
# 178 9.60 4.00 Yes Sun Dinner 2 0.416667
# 172 7.25 5.15 Yes Sun Dinner 2 0.710345
▷
apply 메서드로 넘길 함수가 추가적인 인자를 받는다면 함수 이름 뒤에 붙여서 넘겨주면 된다.
tips.groupby(['smoker', 'day']).apply(top, n=1, column='total_bill')
# <출력>
# total_bill tip smoker day time size tip_pct
# smoker day
# No Fri 94 22.75 3.25 No Fri Dinner 2 0.142857
# Sat 212 48.33 9.00 No Sat Dinner 4 0.186220
# Sun 156 48.17 5.00 No Sun Dinner 6 0.103799
# Thur 142 41.19 5.00 No Thur Lunch 5 0.121389
# Yes Fri 95 40.17 4.73 Yes Fri Dinner 4 0.117750
# Sat 170 50.81 10.00 Yes Sat Dinner 3 0.196812
# Sun 182 45.35 3.50 Yes Sun Dinner 3 0.077178
# Thur 197 43.11 5.00 Yes Thur Lunch 4 0.115982
▷
앞에서 GroupBy 객체에 describe 메서드를 호출한 적이 있다.
result = tips.groupby('smoker')['tip_pct'].describe()
result
# <출력>
# count mean std ... 50% 75% max
# smoker ...
# No 151.0 0.159328 0.039910 ... 0.155625 0.185014 0.291990
# Yes 93.0 0.163196 0.085119 ... 0.153846 0.195059 0.710345
result.unstack('smoker')
# <출력>
# smoker
# count No 151.000000
# Yes 93.000000
# mean No 0.159328
# Yes 0.163196
# std No 0.039910
# Yes 0.085119
# min No 0.056797
# Yes 0.035638
# 25% No 0.136906
# Yes 0.106771
# 50% No 0.155625
# Yes 0.153846
# 75% No 0.185014
# Yes 0.195059
# max No 0.291990
# Yes 0.710345
# dtype: float64
▷
describe 같은 메서드를 호출하면 GroupBy 내부적으로 다음과 같은 단계를 수행
f = lambda x: x.describe()
grouped.apply(f)
# <출력>
# total_bill tip size tip_pct
# day smoker
# Fri No count 4.000000 4.000000 4.00 4.000000
# mean 18.420000 2.812500 2.25 0.151650
# std 5.059282 0.898494 0.50 0.028123
# min 12.460000 1.500000 2.00 0.120385
# 25% 15.100000 2.625000 2.00 0.137239
# ... ... ... ...
# Thur Yes min 10.340000 2.000000 2.00 0.090014
# 25% 13.510000 2.000000 2.00 0.148038
# 50% 16.470000 2.560000 2.00 0.153846
# 75% 19.810000 4.000000 2.00 0.194837
# max 43.110000 5.000000 4.00 0.241255
# [64 rows x 4 columns]
▷
groupby 메서드에 group_keys=False 로 설정하여 막을 수 있다.
tips.groupby('smoker', group_keys=False).apply(top)
# <출력>
# total_bill tip smoker day time size tip_pct
# 88 24.71 5.85 No Thur Lunch 2 0.236746
# 185 20.69 5.00 No Sun Dinner 5 0.241663
# 51 10.29 2.60 No Sun Dinner 2 0.252672
# 149 7.51 2.00 No Thur Lunch 2 0.266312
# 232 11.61 3.39 No Sat Dinner 2 0.291990
# 109 14.31 4.00 Yes Sat Dinner 2 0.279525
# 183 23.17 6.50 Yes Sun Dinner 4 0.280535
# 67 3.07 1.00 Yes Sat Dinner 1 0.325733
# 178 9.60 4.00 Yes Sun Dinner 2 0.416667
# 172 7.25 5.15 Yes Sun Dinner 2 0.710345
quntile
frame = pd.DataFrame({'data1': np.random.randn(1000), 'data2': np.random.randn(1000)})
quartiles = pd.cut(frame.data1, 4)
quartiles[:10]
# <출력>
# 0 (-0.242, 1.509]
# 1 (-1.994, -0.242]
# 2 (-0.242, 1.509]
# 3 (-0.242, 1.509]
# 4 (-0.242, 1.509]
# 5 (-1.994, -0.242]
# 6 (1.509, 3.26]
# 7 (-1.994, -0.242]
# 8 (-0.242, 1.509]
# 9 (-1.994, -0.242]
# Name: data1, dtype: category
# Categories (4, interval[float64, right]):
# [(-3.752, -1.994] < (-1.994, -0.242] < (-0.242, 1.509] < (1.509, 3.26]]
def get_stats(group):
return {'min': group.min(), 'max': group.max(), 'count': group.count(), 'mean': group.mean()}
grouped = frame.data2.groupby(quartiles)
grouped.apply(get_stats).unstack()
# <출력>
# min max count mean
# data1
# (-3.752, -1.994] -2.815059 1.766162 17.0 0.210374
# (-1.994, -0.242] -2.925113 2.653656 400.0 -0.011117
# (-0.242, 1.509] -3.184377 2.424667 512.0 -0.010956
# (1.509, 3.26] -3.428254 2.513530 71.0 0.161407
반응형
'Programming > Python(파이썬)' 카테고리의 다른 글
[ Python - Pandas&Numpy ] ex (0) | 2022.11.13 |
---|---|
[ Python - Pandas ] apply & quntile (0) | 2022.11.13 |
[Python-Pandas] group by (0) | 2022.11.12 |
[Pandas & Numpy] 시계열 데이터 csv (0) | 2022.11.12 |
[Python-Numpy]-combining #3 (0) | 2022.11.11 |
Comments