Programming/Python(파이썬)
[Python-Pandas] group by
yul_S2
2022. 11. 12. 11:34
반응형
import numpy as np
import pandas as pd
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_rows = 20
np.random.seed(12345)
import matplotlib.pyplot as plt
plt.rc('figure', figsize=(10, 6))
np.set_printoptions(precision=4, suppress=True)
group by
df = pd.DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],
'key2' : ['one', 'two', 'one', 'two', 'one'],
'data1' : np.random.randn(5),
'data2' : np.random.randn(5)})
df
# <출력>
# key1 key2 data1 data2
# 0 a one 0.981007 -1.006219
# 1 a two -0.873717 -0.902148
# 2 b one -1.015634 0.752769
# 3 b two -0.411244 -0.490509
# 4 a one 1.465621 -0.524672
▷
이 데이터를 key1 으로 묶고 각 그룹에서 data1 의 평균을 구하기
grouped = df['data1'].groupby(df['key1'])
grouped
grouped.mean()
# <출력>
# key1
# a 0.524304
# b -0.713439
# Name: data1, dtype: float64
means=df['data1'].groupby([df['key1'],df['key2']]).mean()
means
# <출력>
# key1 key2
# a one 1.223314
# two -0.873717
# b one -1.015634
# two -0.411244
# Name: data1, dtype: float64
▷
두개의 색인으로 묶음
means.unstack()
# <출력>
# key2 one two
# key1
# a 1.223314 -0.873717
# b -1.015634 -0.411244
▷
길이만 같다면 어떤 배열도 상관없다.
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])
df['data1'].groupby([states, years]).mean()
# <출력>
# California 2005 -0.873717
# 2006 -1.015634
# Ohio 2005 0.284882
# 2006 1.465621
# Name: data1, dtype: float64
df.groupby('key1').mean()
# <출력>
# data1 data2
# key1
# a 0.524304 -0.811013
# b -0.713439 0.131130
df.groupby(['key1', 'key2']).mean()
# <출력>
# data1 data2
# key1 key2
# a one 1.223314 -0.765446
# two -0.873717 -0.902148
# b one -1.015634 0.752769
# two -0.411244 -0.490509
▷
GroupBy 메서드는 그룹의 크기를 담고 있는 Series 를 반환하는 size 메서드
df.groupby(['key1', 'key2']).size()
# <출력>
# key1 key2
# a one 2
# two 1
# b one 1
# two 1
# dtype: int64
▷
튜플로 반환
for name, group in df.groupby('key1'):
print(name)
print(group)
# <출력>
# a
# key1 key2 data1 data2
# 0 a one 0.981007 -1.006219
# 1 a two -0.873717 -0.902148
# 4 a one 1.465621 -0.524672
# b
# key1 key2 data1 data2
# 2 b one -1.015634 0.752769
# 3 b two -0.411244 -0.490509
▷
색인이 여럿 존재하는 경우 튜플의 첫 번째 원소가 색인값이 된다.
for (k1, k2), group in df.groupby(['key1', 'key2']):
print((k1, k2))
print(group)
# <출력>
# ('a', 'one')
# key1 key2 data1 data2
# 0 a one 0.981007 -1.006219
# 4 a one 1.465621 -0.524672
# ('a', 'two')
# key1 key2 data1 data2
# 1 a two -0.873717 -0.902148
# ('b', 'one')
# key1 key2 data1 data2
# 2 b one -1.015634 0.752769
# ('b', 'two')
# key1 key2 data1 data2
# 3 b two -0.411244 -0.490509
▷
dict 로 반환
pieces = dict(list(df.groupby('key1')))
pieces['b']
# <출력>
# key1 key2 data1 data2
# 2 b one -1.015634 0.752769
# 3 b two -0.411244 -0.490509
▷
axis = 0,1
df.dtypes
grouped = df.groupby(df.dtypes, axis=1)
▷
for dtype, group in grouped:
print(dtype)
print(group)
# <출력>
# float64
# data1 data2
# 0 0.981007 -1.006219
# 1 -0.873717 -0.902148
# 2 -1.015634 0.752769
# 3 -0.411244 -0.490509
# 4 1.465621 -0.524672
# object
# key1 key2
# 0 a one
# 1 a two
# 2 b one
# 3 b two
# 4 a one
selecting
df.groupby('key1')['data1']
df.groupby('key1')[['data2']]
=
df['data1'].groupby(df['key1'])
df[['data2']].groupby(df['key1'])
▷
소수의 컬럼만 집계하고 싶은 경우
df.groupby(['key1', 'key2'])[['data2']].mean()
# <출력>
# data2
# key1 key2
# a one -0.765446
# two -0.902148
# b one 0.752769
# two -0.490509
▷
단일 값으로 하나의 컬럼 이름만 넘겼을 경우
s_grouped = df.groupby(['key1', 'key2'])['data2']
s_grouped
s_grouped.mean()
# <출력>
# key1 key2
# a one -0.765446
# two -0.902148
# b one 0.752769
# two -0.490509
# Name: data2, dtype: float64
grouping
people = pd.DataFrame(np.random.randn(5, 5),
columns=['a', 'b', 'c', 'd', 'e'],
index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
people.iloc[2:3, [1, 2]] = np.nan # Add a few NA values
people
# <출력>
# a b c d e
# Joe -0.699196 0.352361 0.068103 -0.930342 0.845400
# Steve 0.016472 0.844963 1.850834 0.022074 -1.369179
# Wes 0.887204 NaN NaN -0.048565 1.235021
# Jim -0.433295 1.391035 0.820211 -0.247423 0.302271
# Travis 0.543980 -0.942369 -1.266383 0.937250 -0.720102
▷
mapping = {'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f' : 'orange'}
by_column = people.groupby(mapping, axis=1)
by_column.sum()
# <출력>
# blue red
# Joe -0.862239 0.498565
# Steve 1.872908 -0.507744
# Wes -0.048565 2.122225
# Jim 0.572787 1.260011
# Travis -0.329133 -1.118490
map_series = pd.Series(mapping)
map_series
# <출력>
# a red
# b red
# c blue
# d blue
# e red
# f orange
# dtype: object
people.groupby(map_series, axis=1).count()
# <출력>
# blue red
# Joe 2 3
# Steve 2 3
# Wes 1 2
# Jim 2 3
# Travis 2 3
▷
길이별로 그룹묶기
people.groupby(len).sum()
# <출력>
# a b c d e
# 3 -0.245287 1.743396 0.888313 -1.226330 2.382692
# 5 0.016472 0.844963 1.850834 0.022074 -1.369179
# 6 0.543980 -0.942369 -1.266383 0.937250 -0.720102
key_list = ['one', 'one', 'one', 'two', 'two']
people.groupby([len, key_list]).min()
# <출력>
# a b c d e
# 3 one -0.699196 0.352361 0.068103 -0.930342 0.845400
# two -0.433295 1.391035 0.820211 -0.247423 0.302271
# 5 one 0.016472 0.844963 1.850834 0.022074 -1.369179
# 6 two 0.543980 -0.942369 -1.266383 0.937250 -0.720102
▷
하나만 사용해서 집계
columns = pd.MultiIndex.from_arrays([['US', 'US', 'US', 'JP', 'JP'],
[1, 3, 5, 1, 3]], names=['cty', 'tenor'])
hier_df = pd.DataFrame(np.random.randn(4,5),columns=columns)
hier_df
# <출력>
# cty US JP
# tenor 1 3 5 1 3
# 0 -1.593952 -0.375498 -0.958704 0.794336 -1.605108
# 1 0.543710 0.925166 -1.469629 -0.399592 1.417343
# 2 -0.897609 1.844805 1.253168 -1.490932 -0.027734
# 3 1.375236 -0.025208 -0.667880 -2.868018 0.210689
▷
level 예약어 사용
hier_df.groupby(level='cty',axis=1).count()
# <출력>
# cty JP US
# 0 2 3
# 1 2 3
# 2 2 3
# 3 2 3
반응형