59doit
[ Python - Pandas ] apply & quantile 본문
반응형
apply
▷
def top(df, n=5, column='tip_pct'):
    """Return the ``n`` rows of ``df`` with the largest values in ``column``.

    The rows are returned in ascending order of ``column``, so the row with
    the overall largest value comes last.

    Args:
        df: DataFrame to select rows from.
        n: Number of rows to keep. ``0`` yields an empty frame.
        column: Label of the column to rank by.

    Returns:
        A DataFrame holding the selected rows with their original index
        labels.
    """
    # tail(n) instead of [-n:]: for n == 0 the slice [-0:] is [0:] and would
    # return every row rather than none.
    return df.sort_values(by=column).tail(n)
# Show the six rows of tips with the highest tip_pct; top sorts ascending and
# keeps the last n rows, so the largest value is printed last.
top(tips, n=6)
# <출력>
# total_bill tip smoker day time size tip_pct
# 109 14.31 4.00 Yes Sat Dinner 2 0.279525
# 183 23.17 6.50 Yes Sun Dinner 4 0.280535
# 232 11.61 3.39 No Sat Dinner 2 0.291990
# 67 3.07 1.00 Yes Sat Dinner 1 0.325733
# 178 9.60 4.00 Yes Sun Dinner 2 0.416667
# 172 7.25 5.15 Yes Sun Dinner 2 0.710345
# Run top on each smoker group separately. The per-group results are stacked
# into one frame whose outer index level is the smoker value (see output).
tips.groupby('smoker').apply(top)
# <출력>
# total_bill tip smoker day time size tip_pct
# smoker
# No 88 24.71 5.85 No Thur Lunch 2 0.236746
# 185 20.69 5.00 No Sun Dinner 5 0.241663
# 51 10.29 2.60 No Sun Dinner 2 0.252672
# 149 7.51 2.00 No Thur Lunch 2 0.266312
# 232 11.61 3.39 No Sat Dinner 2 0.291990
# Yes 109 14.31 4.00 Yes Sat Dinner 2 0.279525
# 183 23.17 6.50 Yes Sun Dinner 4 0.280535
# 67 3.07 1.00 Yes Sat Dinner 1 0.325733
# 178 9.60 4.00 Yes Sun Dinner 2 0.416667
# 172 7.25 5.15 Yes Sun Dinner 2 0.710345
▷
apply 메서드로 넘길 함수가 추가적인 인자를 받는다면 함수 이름 뒤에 붙여서 넘겨주면 된다.
# Keyword arguments placed after the function are forwarded by apply to top,
# so each (smoker, day) group yields its single row with the largest total_bill.
tips.groupby(['smoker', 'day']).apply(top, n=1, column='total_bill')
# <출력>
# total_bill tip smoker day time size tip_pct
# smoker day
# No Fri 94 22.75 3.25 No Fri Dinner 2 0.142857
# Sat 212 48.33 9.00 No Sat Dinner 4 0.186220
# Sun 156 48.17 5.00 No Sun Dinner 6 0.103799
# Thur 142 41.19 5.00 No Thur Lunch 5 0.121389
# Yes Fri 95 40.17 4.73 Yes Fri Dinner 4 0.117750
# Sat 170 50.81 10.00 Yes Sat Dinner 3 0.196812
# Sun 182 45.35 3.50 Yes Sun Dinner 3 0.077178
# Thur 197 43.11 5.00 Yes Thur Lunch 4 0.115982
▷
앞에서 GroupBy 객체에 describe 메서드를 호출한 적이 있다.
# Summary statistics of tip_pct per smoker group: one row per group, one
# column per statistic.
result = tips.groupby('smoker')['tip_pct'].describe()
# Bare expression so an interactive session displays the table.
result
# <출력>
# count mean std ... 50% 75% max
# smoker ...
# No 151.0 0.159328 0.039910 ... 0.155625 0.185014 0.291990
# Yes 93.0 0.163196 0.085119 ... 0.153846 0.195059 0.710345
# Pivot the smoker row labels into the inner index level, giving a Series
# keyed by (statistic, smoker).
result.unstack('smoker')
# <출력>
# smoker
# count No 151.000000
# Yes 93.000000
# mean No 0.159328
# Yes 0.163196
# std No 0.039910
# Yes 0.085119
# min No 0.056797
# Yes 0.035638
# 25% No 0.136906
# Yes 0.106771
# 50% No 0.155625
# Yes 0.153846
# 75% No 0.185014
# Yes 0.195059
# max No 0.291990
# Yes 0.710345
# dtype: float64
▷
describe 같은 메서드를 호출하면 GroupBy 내부적으로는 다음과 같은 단계를 수행한다.
# Long form of the describe shortcut: call describe() on every group via apply.
f = lambda x: x.describe()
# NOTE(review): grouped is not defined before this point in the visible text;
# the output below suggests it is tips grouped by ['day', 'smoker'] — confirm.
grouped.apply(f)
# <출력>
# total_bill tip size tip_pct
# day smoker
# Fri No count 4.000000 4.000000 4.00 4.000000
# mean 18.420000 2.812500 2.25 0.151650
# std 5.059282 0.898494 0.50 0.028123
# min 12.460000 1.500000 2.00 0.120385
# 25% 15.100000 2.625000 2.00 0.137239
# ... ... ... ...
# Thur Yes min 10.340000 2.000000 2.00 0.090014
# 25% 13.510000 2.000000 2.00 0.148038
# 50% 16.470000 2.560000 2.00 0.153846
# 75% 19.810000 4.000000 2.00 0.194837
# max 43.110000 5.000000 4.00 0.241255
# [64 rows x 4 columns]
▷
앞의 예제들에서는 결과의 색인에 그룹 키가 원래 색인과 함께 계층적 색인으로 붙는데, groupby 메서드에 group_keys=False 를 넘기면 이를 막을 수 있다.
# group_keys=False: the result keeps only the original row labels; the smoker
# value is not added as an extra index level (compare with the output above).
tips.groupby('smoker', group_keys=False).apply(top)
# <출력>
# total_bill tip smoker day time size tip_pct
# 88 24.71 5.85 No Thur Lunch 2 0.236746
# 185 20.69 5.00 No Sun Dinner 5 0.241663
# 51 10.29 2.60 No Sun Dinner 2 0.252672
# 149 7.51 2.00 No Thur Lunch 2 0.266312
# 232 11.61 3.39 No Sat Dinner 2 0.291990
# 109 14.31 4.00 Yes Sat Dinner 2 0.279525
# 183 23.17 6.50 Yes Sun Dinner 4 0.280535
# 67 3.07 1.00 Yes Sat Dinner 1 0.325733
# 178 9.60 4.00 Yes Sun Dinner 2 0.416667
# 172 7.25 5.15 Yes Sun Dinner 2 0.710345
quantile
# Two columns of 1000 standard-normal draws each; no seed is set, so the
# numbers differ from run to run.
frame = pd.DataFrame({'data1': np.random.randn(1000), 'data2': np.random.randn(1000)})
# pd.cut with an integer splits data1 into 4 equal-width intervals. Despite the
# variable name these are not equal-count quartiles (the final table shows
# group sizes 17 / 400 / 512 / 71).
quartiles = pd.cut(frame.data1, 4)
# Peek at the bin assigned to the first 10 rows.
quartiles[:10]
# <출력>
# 0 (-0.242, 1.509]
# 1 (-1.994, -0.242]
# 2 (-0.242, 1.509]
# 3 (-0.242, 1.509]
# 4 (-0.242, 1.509]
# 5 (-1.994, -0.242]
# 6 (1.509, 3.26]
# 7 (-1.994, -0.242]
# 8 (-0.242, 1.509]
# 9 (-1.994, -0.242]
# Name: data1, dtype: category
# Categories (4, interval[float64, right]):
# [(-3.752, -1.994] < (-1.994, -0.242] < (-0.242, 1.509] < (1.509, 3.26]]
def get_stats(group):
    """Summarise one group as a dict of basic statistics.

    Args:
        group: Object exposing ``min()``, ``max()``, ``count()`` and
            ``mean()``; in this post it is the ``data2`` values of one bin.

    Returns:
        dict with the keys ``'min'``, ``'max'``, ``'count'`` and ``'mean'``,
        in that order.
    """
    return {
        'min': group.min(),
        'max': group.max(),
        'count': group.count(),
        'mean': group.mean(),
    }
# Group the data2 values by the data1 bin each row fell into.
grouped = frame.data2.groupby(quartiles)
# get_stats returns one dict per bin; unstack() moves the statistic names into
# columns, giving one row per bin (see output).
grouped.apply(get_stats).unstack()
# <출력>
# min max count mean
# data1
# (-3.752, -1.994] -2.815059 1.766162 17.0 0.210374
# (-1.994, -0.242] -2.925113 2.653656 400.0 -0.011117
# (-0.242, 1.509] -3.184377 2.424667 512.0 -0.010956
# (1.509, 3.26] -3.428254 2.513530 71.0 0.161407
반응형
'Programming > Python(파이썬)' 카테고리의 다른 글
[ Python - Pandas&Numpy ] ex (0) | 2022.11.13 |
---|---|
[ Python - Pandas ] 데이터 집계 & csv (0) | 2022.11.12 |
[Python-Pandas] group by (0) | 2022.11.12 |
[Pandas & Numpy] 시계열 데이터 csv (0) | 2022.11.12 |
[Python-Numpy]-combining #3 (0) | 2022.11.11 |
Comments