59doit

[ Python - Pandas ] apply & quantile 본문

Programming/Python(파이썬)

[ Python - Pandas ] apply & quantile

yul_S2 2022. 11. 13. 12:08
반응형

apply

 

def top(df, n=5, column='tip_pct'):
    """Return the ``n`` rows of ``df`` with the largest values in ``column``.

    Rows are sorted ascending by ``column`` and the last ``n`` are kept, so
    the selected rows are ordered from smallest to largest.

    Args:
        df: DataFrame to select rows from.
        n: Number of rows to keep. ``n=0`` yields an empty frame.
        column: Name of the column to sort by.

    Returns:
        A DataFrame holding the last ``n`` rows of the sorted frame.
    """
    # tail(n) instead of [-n:]: the slice [-0:] is the same as [0:], so the
    # slice form would return every row for n=0 instead of none.
    return df.sort_values(by=column).tail(n)

# Six rows with the largest tip_pct (top's default column), in ascending order.
# `tips` is assumed to be a DataFrame defined earlier — not shown here.
top(tips, n=6)
# <출력> 
#      total_bill   tip smoker  day    time  size   tip_pct
# 109       14.31  4.00    Yes  Sat  Dinner     2  0.279525
# 183       23.17  6.50    Yes  Sun  Dinner     4  0.280535
# 232       11.61  3.39     No  Sat  Dinner     2  0.291990
# 67         3.07  1.00    Yes  Sat  Dinner     1  0.325733
# 178        9.60  4.00    Yes  Sun  Dinner     2  0.416667
# 172        7.25  5.15    Yes  Sun  Dinner     2  0.710345
# Call top() once per smoker group; the per-group results are concatenated
# with the group key as the outer index level (see the output below).
tips.groupby('smoker').apply(top)
# <출력>     
#             total_bill   tip smoker   day    time  size   tip_pct
# smoker                                                           
# No     88        24.71  5.85     No  Thur   Lunch     2  0.236746
#        185       20.69  5.00     No   Sun  Dinner     5  0.241663
#        51        10.29  2.60     No   Sun  Dinner     2  0.252672
#        149        7.51  2.00     No  Thur   Lunch     2  0.266312
#        232       11.61  3.39     No   Sat  Dinner     2  0.291990
# Yes    109       14.31  4.00    Yes   Sat  Dinner     2  0.279525
#        183       23.17  6.50    Yes   Sun  Dinner     4  0.280535
#        67         3.07  1.00    Yes   Sat  Dinner     1  0.325733
#        178        9.60  4.00    Yes   Sun  Dinner     2  0.416667
#        172        7.25  5.15    Yes   Sun  Dinner     2  0.710345

 

 

apply 메서드로 넘길 함수가 추가적인 인자를 받는다면 함수 이름 뒤에 붙여서 넘겨주면 된다.

# Keyword arguments given to apply() after the function are forwarded to
# top(): here, the single row with the largest total_bill per (smoker, day).
tips.groupby(['smoker', 'day']).apply(top, n=1, column='total_bill')
# <출력>             
#                  total_bill    tip smoker   day    time  size   tip_pct
# smoker day                                                             
# No     Fri  94        22.75   3.25     No   Fri  Dinner     2  0.142857
#        Sat  212       48.33   9.00     No   Sat  Dinner     4  0.186220
#        Sun  156       48.17   5.00     No   Sun  Dinner     6  0.103799
#        Thur 142       41.19   5.00     No  Thur   Lunch     5  0.121389
# Yes    Fri  95        40.17   4.73    Yes   Fri  Dinner     4  0.117750
#        Sat  170       50.81  10.00    Yes   Sat  Dinner     3  0.196812
#        Sun  182       45.35   3.50    Yes   Sun  Dinner     3  0.077178
#        Thur 197       43.11   5.00    Yes  Thur   Lunch     4  0.115982

 

 

 

앞에서 GroupBy 객체에 describe 메서드를 호출한 적이 있다.

# Summary statistics of tip_pct, one row per smoker group.
result = tips.groupby('smoker')['tip_pct'].describe()
result
# <출력>       
#         count      mean       std  ...       50%       75%       max
# smoker                             ...                              
# No      151.0  0.159328  0.039910  ...  0.155625  0.185014  0.291990
# Yes      93.0  0.163196  0.085119  ...  0.153846  0.195059  0.710345

 

# Reshape the describe() table into a Series indexed by (statistic, smoker).
result.unstack('smoker')
# <출력> 
#        smoker
# count  No        151.000000
#        Yes        93.000000
# mean   No          0.159328
#        Yes         0.163196
# std    No          0.039910
#        Yes         0.085119
# min    No          0.056797
#        Yes         0.035638
# 25%    No          0.136906
#        Yes         0.106771
# 50%    No          0.155625
#        Yes         0.153846
# 75%    No          0.185014
#        Yes         0.195059
# max    No          0.291990
#        Yes         0.710345
# dtype: float64

 

 

GroupBy 객체에서 describe 같은 메서드를 호출하는 것은 내부적으로 다음과 같은 단계를 수행하는 것과 같다.

# NOTE(review): `grouped` is not defined in this snippet. The output below has
# (day, smoker) index levels, so presumably
# grouped = tips.groupby(['day', 'smoker']) — confirm.
f = lambda x: x.describe()
grouped.apply(f)
# <출력> 
#                    total_bill       tip  size   tip_pct
# day  smoker                                            
# Fri  No     count    4.000000  4.000000  4.00  4.000000
#             mean    18.420000  2.812500  2.25  0.151650
#             std      5.059282  0.898494  0.50  0.028123
#             min     12.460000  1.500000  2.00  0.120385
#             25%     15.100000  2.625000  2.00  0.137239
#                        ...       ...   ...       ...
# Thur Yes    min     10.340000  2.000000  2.00  0.090014
#             25%     13.510000  2.000000  2.00  0.148038
#             50%     16.470000  2.560000  2.00  0.153846
#             75%     19.810000  4.000000  2.00  0.194837
#             max     43.110000  5.000000  4.00  0.241255
# [64 rows x 4 columns]

 

 

 

앞의 예제들에서 반환된 객체는 원본 객체의 색인과 그룹 키가 합쳐진 계층적 색인을 가진다. 이는 groupby 메서드에 group_keys=False 를 넘겨서 막을 수 있다.

# group_keys=False keeps the group key out of the result index, so only the
# original row labels remain (compare the output below with the earlier call).
tips.groupby('smoker', group_keys=False).apply(top)
# <출력> 
#      total_bill   tip smoker   day    time  size   tip_pct
# 88        24.71  5.85     No  Thur   Lunch     2  0.236746
# 185       20.69  5.00     No   Sun  Dinner     5  0.241663
# 51        10.29  2.60     No   Sun  Dinner     2  0.252672
# 149        7.51  2.00     No  Thur   Lunch     2  0.266312
# 232       11.61  3.39     No   Sat  Dinner     2  0.291990
# 109       14.31  4.00    Yes   Sat  Dinner     2  0.279525
# 183       23.17  6.50    Yes   Sun  Dinner     4  0.280535
# 67         3.07  1.00    Yes   Sat  Dinner     1  0.325733
# 178        9.60  4.00    Yes   Sun  Dinner     2  0.416667
# 172        7.25  5.15    Yes   Sun  Dinner     2  0.710345

 

 

 

 


quantile

# Two columns of 1000 random normal samples each.
# Assumes `pd` (pandas) and `np` (numpy) are already imported — not shown here.
frame = pd.DataFrame({'data1': np.random.randn(1000), 'data2': np.random.randn(1000)})
# Despite the name, pd.cut with an integer splits data1's range into 4
# equal-width bins, not equal-sized quantile buckets (see the unequal counts
# in the output further down).
quartiles = pd.cut(frame.data1, 4)

quartiles[:10]
# <출력> 
# 0     (-0.242, 1.509]
# 1    (-1.994, -0.242]
# 2     (-0.242, 1.509]
# 3     (-0.242, 1.509]
# 4     (-0.242, 1.509]
# 5    (-1.994, -0.242]
# 6       (1.509, 3.26]
# 7    (-1.994, -0.242]
# 8     (-0.242, 1.509]
# 9    (-1.994, -0.242]
# Name: data1, dtype: category
# Categories (4, interval[float64, right]): 
# [(-3.752, -1.994] < (-1.994, -0.242] < (-0.242, 1.509] < (1.509, 3.26]]

 

 

def get_stats(group):
    """Summarize ``group`` as a dict of its min, max, count and mean.

    The keys are inserted in the order 'min', 'max', 'count', 'mean'; each
    value is the result of calling the same-named method on ``group``.
    """
    stat_names = ('min', 'max', 'count', 'mean')
    return {stat: getattr(group, stat)() for stat in stat_names}

# Group data2 by the data1 bins computed above.
grouped = frame.data2.groupby(quartiles)

# Apply get_stats to each bin, then unstack so each statistic is a column.
grouped.apply(get_stats).unstack()
# <출력>
#                        min       max  count      mean
# data1
# (-3.752, -1.994] -2.815059  1.766162   17.0  0.210374
# (-1.994, -0.242] -2.925113  2.653656  400.0 -0.011117
# (-0.242, 1.509]  -3.184377  2.424667  512.0 -0.010956
# (1.509, 3.26]    -3.428254  2.513530   71.0  0.161407

 

 

 

반응형

'Programming > Python(파이썬)' 카테고리의 다른 글

[ Python - Pandas&Numpy ] ex  (0) 2022.11.13
[ Python - Pandas ] 데이터 집계 & csv  (0) 2022.11.12
[Python-Pandas] group by  (0) 2022.11.12
[Pandas & Numpy] 시계열 데이터 csv  (0) 2022.11.12
[Python-Numpy]-combining #3  (0) 2022.11.11
Comments