59doit

[Python-Pandas] combining #1 본문

Programming/Python(파이썬)

[Python-Pandas] combining #1

yul_S2 2022. 11. 11. 08:38
반응형

combining 

df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'], 'data1': range(7)})
df1
# <출력>
#   key  data1
# 0   b      0
# 1   b      1
# 2   a      2
# 3   c      3
# 4   a      4
# 5   a      5
# 6   b      6
df2 = pd.DataFrame({'key': ['a', 'b', 'd'], 'data2': range(3)})
df2
# <출력>
#   key  data2
# 0   a      0
# 1   b      1
# 2   d      2

merg() 함수

pd.merge(df1, df2)
# <출력>
#   key  data1  data2
# 0   b      0      1
# 1   b      1      1
# 2   b      6      1
# 3   a      2      0
# 4   a      4      0
# 5   a      5      0

 

merge()함수는 중복된 컬럼을 키로 사용('key'컬럼)

pd.merge(df1, df2, on='key')
# <출력>
#   key  data1  data2
# 0   b      0      1
# 1   b      1      1
# 2   b      6      1
# 3   a      2      0
# 4   a      4      0
# 5   a      5      0

 

 

 

 

중복된 컬럼이 없는 경우 따로 지정

df3 = pd.DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'], 'data1': range(7)})
df3
# <출력>
#   lkey  data1
# 0    b      0
# 1    b      1
# 2    a      2
# 3    c      3
# 4    a      4
# 5    a      5
# 6    b      6

df4 = pd.DataFrame({'rkey': ['a', 'b', 'd'], 'data2': range(3)})
df4
# <출력>
#   rkey  data2
# 0    a      0
# 1    b      1
# 2    d      2
pd.merge(df3, df4, left_on='lkey', right_on='rkey')
# <출력>
#   lkey  data1 rkey  data2
# 0    b      0    b      1
# 1    b      1    b      1
# 2    b      6    b      1
# 3    a      2    a      0
# 4    a      4    a      0
# 5    a      5    a      0

 

merge()함수는 기본적으로 inner join 수행

: how 인자로 'left', 'right', 'outer' 설정 가능

pd.merge(df1, df2, how='outer')
# <출력>
#   key  data1  data2
# 0   b    0.0    1.0
# 1   b    1.0    1.0
# 2   b    6.0    1.0
# 3   a    2.0    0.0
# 4   a    4.0    0.0
# 5   a    5.0    0.0
# 6   c    3.0    NaN
# 7   d    NaN    2.0

 

df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'], 'data1': range(6)})
df1
# <출력>
#   key  data1
# 0   b      0
# 1   b      1
# 2   a      2
# 3   c      3
# 4   a      4
# 5   b      5
df2 = pd.DataFrame({'key': ['a', 'b', 'a', 'b', 'd'], 'data2': range(5)})
df2
# <출력>
#   key  data2
# 0   a      0
# 1   b      1
# 2   a      2
# 3   b      3
# 4   d      4

pd.merge(df1, df2, on='key', how='left')
# <출력>
#    key  data1  data2
# 0    b      0    1.0
# 1    b      0    3.0
# 2    b      1    1.0
# 3    b      1    3.0
# 4    a      2    0.0
# 5    a      2    2.0
# 6    c      3    NaN
# 7    a      4    0.0
# 8    a      4    2.0
# 9    b      5    1.0
# 10   b      5    3.0

 

두 로우의 데카르트곱 반환

pd.merge(df1, df2, how='inner')
# <출력>
#   key  data1  data2
# 0   b      0      1
# 1   b      0      3
# 2   b      1      1
# 3   b      1      3
# 4   b      5      1
# 5   b      5      3
# 6   a      2      0
# 7   a      2      2
# 8   a      4      0
# 9   a      4      2

 

 

여러 개의 키를 병합하려면 컬럼 이름이 담긴 리스트 이용

left = pd.DataFrame({'key1': ['foo', 'foo', 'bar'],  'key2': ['one', 'two', 'one'], 'lval': [1, 2, 3]})
right = pd.DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'],  'key2': ['one', 'one', 'one', 'two'], 'rval': [4, 5, 6, 7]})
pd.merge(left, right, on=['key1', 'key2'], how='outer')
# <출력>
#   key1 key2  lval  rval
# 0  foo  one   1.0   4.0
# 1  foo  one   1.0   5.0
# 2  foo  two   2.0   NaN
# 3  bar  one   3.0   6.0
# 4  bar  two   NaN   7.0

 

 

겹치는 컬럼 이름에 대한 처리
#1. 수동으로 컬럼이름을 겹치게 한다.
#2. merge()함수에 있는 suffixes 인자로 두 DataFrame 객체에서 겹치는 컬럼 이름 뒤에 붙일 문자열지정

pd.merge(left, right, on='key1')
# <출력>
#   key1 key2_x  lval key2_y  rval
# 0  foo    one     1    one     4
# 1  foo    one     1    one     5
# 2  foo    two     2    one     4
# 3  foo    two     2    one     5
# 4  bar    one     3    one     6
# 5  bar    one     3    two     7
pd.merge(left, right, on='key1', suffixes=('_left', '_right'))
# <출력>
#   key1 key2_left  lval key2_right  rval
# 0  foo       one     1        one     4
# 1  foo       one     1        one     5
# 2  foo       two     2        one     4
# 3  foo       two     2        one     5
# 4  bar       one     3        one     6
# 5  bar       one     3        two     7

 

 

병합하려는 키가 DataFrame 의 색인일 경우

left1 = pd.DataFrame({'key': ['a', 'b', 'a', 'a', 'b', 'c'], 'value': range(6)})
right1 = pd.DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])
left1
# <출력>
#   key  value
# 0   a      0
# 1   b      1
# 2   a      2
# 3   a      3
# 4   b      4
# 5   c      5

right1
# <출력>
#    group_val
# a        3.5
# b        7.0
pd.merge(left1, right1, left_on='key', right_index=True)
# <출력>
#   key  value  group_val
# 0   a      0        3.5
# 2   a      2        3.5
# 3   a      3        3.5
# 1   b      1        7.0
# 4   b      4        7.0

 

외부조인

pd.merge(left1, right1, left_on='key', right_index=True, how='outer')
# <출력>
#   key  value  group_val
# 0   a      0        3.5
# 2   a      2        3.5
# 3   a      3        3.5
# 1   b      1        7.0
# 4   b      4        7.0
# 5   c      5        NaN

 

계층 색인된 데이터 병합

lefth = pd.DataFrame({'key1': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
                      'key2': [2000, 2001, 2002, 2001, 2002], 'data': np.arange(5.)})
righth = pd.DataFrame(np.arange(12).reshape((6, 2)), 
                      index=[['Nevada', 'Nevada', 'Ohio', 'Ohio', 'Ohio', 'Ohio'], 
                             [2001, 2000, 2000, 2000, 2001, 2002]], 
                      columns=['event1', 'event2'])
lefth
# <출력> 
#      key1  key2  data
# 0    Ohio  2000   0.0
# 1    Ohio  2001   1.0
# 2    Ohio  2002   2.0
# 3  Nevada  2001   3.0
# 4  Nevada  2002   4.0

righth
# <출력> 
#              event1  event2
# Nevada 2001       0       1
#        2000       2       3
# Ohio   2000       4       5
#        2000       6       7
#        2001       8       9
#        2002      10      11

 

 

리스트로 여러 개의 컬럼을 지정하여 병합 (중복되는 색인값을 다룰 때는 how='outer'옵션 사용)

pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True)
# <출력>
#      key1  key2  data  event1  event2
# 0    Ohio  2000   0.0       4       5
# 0    Ohio  2000   0.0       6       7
# 1    Ohio  2001   1.0       8       9
# 2    Ohio  2002   2.0      10      11
# 3  Nevada  2001   3.0       0       1

pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True, how='outer')
# <출력>
#      key1  key2  data  event1  event2
# 0    Ohio  2000   0.0     4.0     5.0
# 0    Ohio  2000   0.0     6.0     7.0
# 1    Ohio  2001   1.0     8.0     9.0
# 2    Ohio  2002   2.0    10.0    11.0
# 3  Nevada  2001   3.0     0.0     1.0
# 4  Nevada  2002   4.0     NaN     NaN
# 4  Nevada  2000   NaN     2.0     3.0

 

 

양쪽에 공통적으로 존재하는 여러 개의 색인을 병합

left2 = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]],
                     index=['a', 'c', 'e'], columns=['Ohio', 'Nevada'])
right2 = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]],
                      index=['b', 'c', 'd', 'e'], columns=['Missouri', 'Alabama'])
left2
# <출력>
#    Ohio  Nevada
# a   1.0     2.0
# c   3.0     4.0
# e   5.0     6.0

right2
# <출력>
#    Missouri  Alabama
# b       7.0      8.0
# c       9.0     10.0
# d      11.0     12.0
# e      13.0     14.0
pd.merge(left2, right2, how='outer', left_index=True, right_index=True)
# <출력>
#    Ohio  Nevada  Missouri  Alabama
# a   1.0     2.0       NaN      NaN
# b   NaN     NaN       7.0      8.0
# c   3.0     4.0       9.0     10.0
# d   NaN     NaN      11.0     12.0
# e   5.0     6.0      13.0     14.0

 

색인으로 병합할때 DataFrame 의 join 메서드 사용

left2.join(right2, how='outer')
# <출력>
#    Ohio  Nevada  Missouri  Alabama
# a   1.0     2.0       NaN      NaN
# b   NaN     NaN       7.0      8.0
# c   3.0     4.0       9.0     10.0
# d   NaN     NaN      11.0     12.0
# e   5.0     6.0      13.0     14.0

 

 

join 메서드를 호출한 DataFrame 의 컬럼 중 하나에 대해 조인을 수행

left1.join(right1, on='key')
# <출력>
#   key  value  group_val
# 0   a      0        3.5
# 1   b      1        7.0
# 2   a      2        3.5
# 3   a      3        3.5
# 4   b      4        7.0
# 5   c      5        NaN

 

 

 

색인 대 색인으로 두 DataFrame 을 병합하려면 DataFrame 의 리스트에 join 메서드 사용

another = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [16., 17.]],
                       index=['a', 'c', 'e', 'f'],
                       columns=['New York', 'Oregon'])
another
# <출력>
#    New York  Oregon
# a       7.0     8.0
# c       9.0    10.0
# e      11.0    12.0
# f      16.0    17.0
left2.join([right2, another])
# <출력>
#    Ohio  Nevada  Missouri  Alabama  New York  Oregon
# a   1.0     2.0       NaN      NaN       7.0     8.0
# c   3.0     4.0       9.0     10.0       9.0    10.0
# e   5.0     6.0      13.0     14.0      11.0    12.0

left2.join([right2, another], how='outer')
# <출력>
#    Ohio  Nevada  Missouri  Alabama  New York  Oregon
# a   1.0     2.0       NaN      NaN       7.0     8.0
# c   3.0     4.0       9.0     10.0       9.0    10.0
# e   5.0     6.0      13.0     14.0      11.0    12.0
# b   NaN     NaN       7.0      8.0       NaN     NaN
# d   NaN     NaN      11.0     12.0       NaN     NaN
# f   NaN     NaN       NaN      NaN      16.0    17.0

 

 

 

반응형

'Programming > Python(파이썬)' 카테고리의 다른 글

[Python-Numpy]-combining #3  (0) 2022.11.11
[Python-Numpy] -combining #2  (0) 2022.11.11
[Python-Pandas] data index  (0) 2022.11.10
[Python-Pandas] 치환  (0) 2022.11.10
[Python-Pandas] 결측치  (0) 2022.11.10
Comments