59doit

[Python-Pandas] 치환 본문

Programming/Python(파이썬)

[Python-Pandas] 치환

yul_S2 2022. 11. 10. 10:20
반응형
import pandas as pd
import numpy as np
# Float Series used by the replace() examples.
data = pd.Series([1, -999, 2, -999, -1000, 3], dtype="float64")
data
# <Output>
# 0       1.0
# 1    -999.0
# 2       2.0
# 3    -999.0
# 4   -1000.0
# 5       3.0
# dtype: float64

값 치환 (replace): 하나의 값 또는 여러 개의 값을 한 번에 치환

# Replace every -999 with NaN; the result is displayed, not stored back in data.
data.replace(-999, np.nan)
# <Output>
# 0       1.0
# 1       NaN
# 2       2.0
# 3       NaN
# 4   -1000.0
# 5       3.0
# dtype: float64

# Passing a list replaces several values at once: -999 and -1000 both become NaN.
data.replace([-999, -1000], np.nan)
# <Output>
# 0    1.0
# 1    NaN
# 2    2.0
# 3    NaN
# 4    NaN
# 5    3.0
# dtype: float64

 

 

 

대응관계 dict 사용도 가능

# Two parallel lists pair each value with its own replacement: -999 -> NaN, -1000 -> 0.
data.replace([-999,-1000],[np.nan,0])
# The same mapping written as a dict.
data.replace({-999:np.nan,-1000:0})
# <Output>
# 0    1.0
# 1    NaN
# 2    2.0
# 3    NaN
# 4    0.0
# 5    3.0
# dtype: float64

두 경우 같은 결과값

 

 

 

 

 

        

# 3x4 frame holding 0..11, with state names as row labels.
data = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    index=["Ohio", "Colorado", "New York"],
    columns=["one", "two", "three", "four"],
)
data
# <Output>
#           one  two  three  four
# Ohio        0    1      2     3
# Colorado    4    5      6     7
# New York    8    9     10    11

 

lambda

def transform(x):
    """Return the first four characters of ``x`` converted to upper case.

    Named-function equivalent of ``lambda x: x[:4].upper()``; a ``def`` gives
    the callable a real name in tracebacks and room for this docstring.

    Args:
        x: A string (e.g. an index label).

    Returns:
        The upper-cased prefix of ``x``, at most four characters long.
    """
    return x[:4].upper()
# Apply transform to every index label; the mapped Index is displayed only.
data.index.map(transform)
# <Output>
# Index(['OHIO', 'COLO', 'NEW '], dtype='object')

 

대문자로 변환한 index를 data.index에 대입하여 축 이름(index)을 변경

# Assign the mapped labels back so the frame's own index is modified.
data.index=data.index.map(transform)
data
# <Output>
#       one  two  three  four
# OHIO    0    1      2     3
# COLO    4    5      6     7
# NEW     8    9     10    11

# Title-case the index labels and upper-case the column names; the renamed
# result is displayed but not stored.
data.rename(index=str.title, columns=str.upper)
# <Output>
#       ONE  TWO  THREE  FOUR
# Ohio    0    1      2     3
# Colo    4    5      6     7
# New     8    9     10    11

 

rename은 변경된 새 객체를 반환하므로, 바뀐 결과를 쓰려면 다른 변수에 저장해서 사용

# Rename one index label and one column via dicts. The result is not assigned,
# so displaying data afterwards shows the frame unchanged.
data.rename(index={'OHIO': 'INDIANA'}, columns={'three': 'peekaboo'})
data
# <Output>
#       one  two  three  four
# OHIO    0    1      2     3
# COLO    4    5      6     7
# NEW     8    9     10    11

저장하지 않으면 data는 원본 그대로 유지됨. 원본을 직접 바꾸는 것은 바람직하지 않음

 

 

 


inplace=True ; 원본 수정됨

# inplace=True applies the rename to data itself, as the output below shows.
data.rename(index={'OHIO': 'INDIANA'}, inplace=True)
data
# <Output>
#          one  two  three  four
# INDIANA    0    1      2     3
# COLO       4    5      6     7
# NEW        8    9     10    11

 

 

        

# Sample list of ages used by the binning examples.
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

 

cut 함수로 나누기 >> 18-25, 26-35, 36-60, 61 이상 그룹으로

# Bin edges; pd.cut assigns each age to the interval it falls in.
# As the output shows, intervals are open on the left and closed on the right.
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
cats
# <Output>
# [(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
# Length: 12
# Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

 

cats.codes & cats.categories

# Integer position of the bin each age was assigned to.
cats.codes
# <Output>
# array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

# The bins themselves, as an IntervalIndex.
cats.categories
# <Output>
# IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')

# Count how many ages fall into each bin.
# The top-level pd.value_counts() is deprecated since pandas 2.1; wrapping the
# Categorical in a Series and calling .value_counts() is the supported form.
pd.Series(cats).value_counts()
# <Output>
# (18, 25]     5
# (25, 35]     3
# (35, 60]     3
# (60, 100]    1
# dtype: int64

 

 

 

right=False 로 오른쪽 경계를 포함하지 않도록 변경 (왼쪽은 닫힌 구간 '[', 오른쪽은 열린 구간 ')')

# right=False makes each interval closed on the left and open on the right,
# as the "[18, 26)" labels in the output show.
pd.cut(ages, [18, 26, 36, 61, 100], right=False)
# <Output>
# [[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
# Length: 12
# Categories (4, interval[int64, left]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

 

 

그룹이름 추가기능

# labels= gives each bin a name, in bin order, instead of the interval notation.
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages, bins, labels=group_names)
# <Output>
# ['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
# Length: 12
# Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']

 

 

 

precision 소수점 자리 표시

# 20 random floats; no seed is set, so the output below differs on every run.
data = np.random.rand(20)
# An integer bin count (4) instead of explicit edges; precision=2 limits the
# number of decimal digits shown in the bin labels.
pd.cut(data, 4, precision=2)
# <Output>
# [(0.73, 0.96], (0.25, 0.49], (0.0074, 0.25], (0.0074, 0.25], (0.49, 0.73], ..., (0.49, 0.73], (0.73, 0.96], (0.73, 0.96], (0.73, 0.96], (0.49, 0.73]]
# Length: 20
# Categories (4, interval[float64, right]): [(0.0074, 0.25] < (0.25, 0.49] < (0.49, 0.73] < (0.73, 0.96]]

 

 

같은 크기(같은 개수)의 그룹으로 나눌 때는 표본 변위치(sample quantile)를 사용하는 qcut() 함수 이용

# No seed is set, so the exact bin edges below differ on every run.
data = np.random.randn(1000) # Normally distributed
cats = pd.qcut(data, 4) # Cut into quartiles
cats
# <Output>
# [(0.626, 3.928], (0.626, 3.928], (-2.9499999999999997, -0.691], (-0.691, -0.0171], (0.626, 3.928], ..., (-0.0171, 0.626], (0.626, 3.928], (-0.691, -0.0171], (-0.691, -0.0171], (-0.0171, 0.626]]
# Length: 1000
# Categories (4, interval[float64, right]): [(-2.9499999999999997, -0.691] < (-0.691, -0.0171] < (-0.0171, 0.626] < (0.626, 3.928]]

 

 

qcut 결과 각 그룹의 개수 확인 (모두 같은 개수). 변위치를 직접 지정하는 것도 가능(변위치는 0~1 사이의 값)

# Count the members of each quantile bin; the output shows 250 in every bin.
# The top-level pd.value_counts() is deprecated since pandas 2.1; wrapping the
# Categorical in a Series and calling .value_counts() is the supported form.
pd.Series(cats).value_counts()
# <Output>
# (-2.9499999999999997, -0.691]    250
# (-0.691, -0.0171]                250
# (-0.0171, 0.626]                 250
# (0.626, 3.928]                   250
# dtype: int64


# Pass explicit quantile edges (values between 0 and 1) instead of a bin count.
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])
# <Output>
# [(1.297, 3.928], (-0.0171, 1.297], (-2.9499999999999997, -1.191], (-1.191, -0.0171], (1.297, 3.928], ..., (-0.0171, 1.297], (-0.0171, 1.297], (-1.191, -0.0171], (-1.191, -0.0171], (-0.0171, 1.297]]
# Length: 1000
# Categories (4, interval[float64, right]): [(-2.9499999999999997, -1.191] < (-1.191, -0.0171] < (-0.0171, 1.297] < (1.297, 3.928]]

 

 

 

이상값을 제외하거나 다른값으로 대체

# 1000x4 frame of random normal values; no seed is set, so the statistics
# below differ on every run.
data = pd.DataFrame(np.random.randn(1000, 4))
data.describe()
# <Output>
#                  0            1            2            3
# count  1000.000000  1000.000000  1000.000000  1000.000000
# mean      0.021425    -0.005165    -0.055670     0.044107
# std       1.008404     0.995532     0.994484     0.996884
# min      -3.184377    -3.745356    -3.428254    -3.645860
# 25%      -0.628122    -0.699383    -0.747478    -0.599807
# 50%      -0.016127    -0.029924    -0.091364     0.043663
# 75%       0.690847     0.694459     0.620197     0.740562
# max       3.525865     2.735527     3.366626     2.653656

 

 

 

컬럼에서 절대값이 3을 초과하는 값 찾기

# Select column 2, then keep only its entries whose absolute value exceeds 3.
col = data[2]
col[np.abs(col) > 3]
# <Output>
# 269   -3.428254
# 646    3.366626
# Name: 2, dtype: float64

 

 

절대값이 3을 초과하는 값이 들어있는 모든 로우를 선택하려면 any 메서드 사용: data[(np.abs(data) > 3).any(axis=1)]
아래 코드는 절대값이 3을 초과하는 값을 -3 또는 3으로 제한(capping)

# Cap the outliers: every entry whose absolute value exceeds 3 is overwritten
# with 3 times its sign, i.e. -3 or 3 (see min/max in the output).
data[np.abs(data)>3]=np.sign(data)*3
data.describe()
# <Output>
#                  0            1            2            3
# count  1000.000000  1000.000000  1000.000000  1000.000000
# mean      0.020880    -0.004021    -0.055608     0.045301
# std       1.005160     0.991718     0.991924     0.992863
# min      -3.000000    -3.000000    -3.000000    -3.000000
# 25%      -0.628122    -0.699383    -0.747478    -0.599807
# 50%      -0.016127    -0.029924    -0.091364     0.043663
# 75%       0.690847     0.694459     0.620197     0.740562
# max       3.000000     2.735527     3.000000     2.653656

 

 

np.sign(data)는 data 값이 양수인지 음수인지에 따라 1이나 -1이 담긴 배열을 반환

# Sign of every entry (1.0 or -1.0 in the output); head() shows the first five rows.
np.sign(data).head()
# <Output>
#      0    1    2    3
# 0  1.0 -1.0  1.0  1.0
# 1 -1.0 -1.0 -1.0 -1.0
# 2  1.0  1.0  1.0 -1.0
# 3  1.0  1.0 -1.0 -1.0
# 4  1.0 -1.0 -1.0 -1.0

 

 

sampling

# 5x4 frame holding 0..19, used to demonstrate row shuffling.
df = pd.DataFrame(np.arange(20).reshape(5, 4))
df
# <Output>
#     0   1   2   3
# 0   0   1   2   3
# 1   4   5   6   7
# 2   8   9  10  11
# 3  12  13  14  15
# 4  16  17  18  19

# A random ordering of the five row positions (differs on every run).
sampler = np.random.permutation(len(df))
sampler
# <Output>
# array([1, 0, 4, 2, 3])

# <df>
#     0   1   2   3
# 0   0   1   2   3
# 1   4   5   6   7
# 2   8   9  10  11
# 3  12  13  14  15
# 4  16  17  18  19

# take() returns the rows in the order given by sampler.
df.take(sampler)
# <Output>
#     0   1   2   3
# 1   4   5   6   7
# 0   0   1   2   3
# 4  16  17  18  19
# 2   8   9  10  11
# 3  12  13  14  15

일부만 임의로 선택

# Pick 3 rows at random; the selection differs on every run.
df.sample(n=3)
# <Output>
#    0  1   2   3
# 0  0  1   2   3
# 2  8  9  10  11
# 1  4  5   6   7

 

 

 

복원 추출(표본 치환 허용): replace=True

# Five candidate values to draw from.
choices = pd.Series([5, 7, -1, 6, 4])
# Sampling with replacement allows drawing more items (10) than exist (5).
draws = choices.sample(replace=True, n=10)
draws
# <Output>
# 0    5
# 3    6
# 1    7
# 4    4
# 0    5
# 3    6
# 0    5
# 3    6
# 2   -1
# 3    6
# dtype: int64

 

get_dummies()의 역할: key 컬럼의 각 고유값을 컬럼으로 만들고, 각 행에서 자신의 key 값에 해당하는 컬럼에만 1(나머지는 0)이 들어간 데이터프레임을 생성

# Frame with a categorical 'key' column and a numeric 'data1' column.
df = pd.DataFrame({'key': ['b', 'b', 'a', 'b', 'c', 'b'], 'data1': range(6)})
# One indicator column per distinct key. dtype=int keeps the 0/1 values shown
# below; without it pandas >= 2.0 returns True/False columns.
pd.get_dummies(df['key'], dtype=int)
# <Output>
#    a  b  c
# 0  0  1  0
# 1  0  1  0
# 2  1  0  0
# 3  0  1  0
# 4  0  0  1
# 5  0  1  0

 

 

다른데이터와 병합하고 싶을때 ; prefix

# prefix='key' names the indicator columns key_a, key_b, key_c so they stay
# recognisable after being joined to other data. dtype=int keeps the 0/1
# values shown below; without it pandas >= 2.0 returns True/False columns.
dummies = pd.get_dummies(df['key'], prefix='key', dtype=int)
# Join the indicators to the remaining column on the shared row index.
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy
# <Output>
#    data1  key_a  key_b  key_c
# 0      0      0      1      0
# 1      1      0      1      0
# 2      2      1      0      0
# 3      3      0      1      0
# 4      4      0      0      1
# 5      5      0      1      0

 

 

 

반응형

'Programming > Python(파이썬)' 카테고리의 다른 글

[Python-Pandas] combining #1  (0) 2022.11.11
[Python-Pandas] data index  (0) 2022.11.10
[Python-Pandas] 결측치  (0) 2022.11.10
[Python-Pandas] csv 파일 불러오기  (0) 2022.11.09
[Python-Pandas] 파일경로설정  (0) 2022.11.09
Comments