59doit

[Python-Pandas] 결측치 본문

Programming/Python(파이썬)

[Python-Pandas] 결측치

yul_S2 2022. 11. 10. 08:40
반응형
import numpy as np
import pandas as pd
# Remember the current display limit so it can be restored below.
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
# Limit how many rows pandas prints for a displayed object.
pd.options.display.max_rows = 20
# Seed NumPy's global random generator so the random examples are repeatable.
np.random.seed(12345)
import matplotlib.pyplot as plt
# Default figure size used for matplotlib plots.
plt.rc('figure', figsize=(10, 6))
# Print NumPy arrays with 4 digits of precision and suppress=True.
np.set_printoptions(precision=4, suppress=True)
# Restore the saved display limit; this undoes the max_rows = 20 setting above.
pd.options.display.max_rows = PREVIOUS_MAX_ROWS

결측치 처리, 중복 데이터 처리, 문자열 처리 및 분석을 위한 데이터 변환 도구 설명

 

 

 

 

NaN == None == null (pandas 는 np.nan 과 None 을 모두 결측치로 취급한다)

# A Series of strings in which one entry is missing (np.nan).
string_data = pd.Series(data=['aardvark', 'artichoke', np.nan, 'avocado'])
string_data
# <output>
# 0     aardvark
# 1    artichoke
# 2          NaN
# 3      avocado
# dtype: object

# Boolean mask marking the missing entries.
pd.isnull(string_data)
# <output>
# 0    False
# 1    False
# 2     True
# 3    False
# dtype: bool

# Python's built-in None is reported as missing as well.
string_data.loc[0] = None

pd.isnull(string_data)
# <output>
# 0     True
# 1    False
# 2     True
# 3    False
# dtype: bool

 

 

 

누락된 데이터 제외시키기

from numpy import nan as NA

null 이 아닌 데이터와 색인값만 추출

dropna() : NA 값을 제외한 결과를 반환한다

# Series with two missing entries; dropna() returns only the non-null
# values together with their index labels.
data = pd.Series(data=[1, NA, 3.5, NA, 7])
data.dropna()

# cleaned: copy of the frame with every row that contains an NA removed.
data = pd.DataFrame([
    [1., 6.5, 3.],
    [1., NA, NA],
    [NA, NA, NA],
    [NA, 6.5, 3.],
])
# axis=0 / how='any' are the defaults, spelled out here for clarity.
cleaned = data.dropna(axis=0, how='any')

data
# <output>
#      0    1    2
# 0  1.0  6.5  3.0
# 1  1.0  NaN  NaN
# 2  NaN  NaN  NaN
# 3  NaN  6.5  3.0

cleaned
# <output>
#      0    1    2
# 0  1.0  6.5  3.0

 

how='all' : 모든 값이 NA 인 로우만 제외

# Drop only the rows in which every value is NA.
data.dropna(axis=0, how='all')
# <output>
#      0    1    2
# 0  1.0  6.5  3.0
# 1  1.0  NaN  NaN
# 3  NaN  6.5  3.0

# Add a new column (label 4) that consists entirely of NA values.
data[4] = NA
data
# <output>
#      0    1    2   4
# 0  1.0  6.5  3.0 NaN
# 1  1.0  NaN  NaN NaN
# 2  NaN  NaN  NaN NaN
# 3  NaN  6.5  3.0 NaN

모든 값이 NaN 인 열만 제외 (axis=1, how='all')

# Drop only the columns in which every value is NA (column 4 here).
data.dropna(axis='columns', how='all')
# <output>
#      0    1    2
# 0  1.0  6.5  3.0
# 1  1.0  NaN  NaN
# 2  NaN  NaN  NaN
# 3  NaN  6.5  3.0

 

 

모든 값이 NaN 인 행만 제외 (axis=0, how='all')

# Drop only the rows in which every value is NA (row 2 here).
data.dropna(axis='index', how='all')
# <output>
#      0    1    2   4
# 0  1.0  6.5  3.0 NaN
# 1  1.0  NaN  NaN NaN
# 3  NaN  6.5  3.0 NaN

 

# 7x3 frame of random values.
df = pd.DataFrame(np.random.randn(7, 3))

# Blank out rows 0-3 of column 1 and rows 0-1 of column 2
# (label-based slices on the default integer index are end-inclusive).
df.loc[:3, 1] = NA
df.loc[:1, 2] = NA
df
# <output>
#           0         1         2
# 0  0.331286       NaN       NaN
# 1  0.246674       NaN       NaN
# 2  1.327195       NaN -1.549106
# 3  0.022185       NaN -0.660524
# 4  0.862580 -0.010032  0.050009
# 5  0.670216  0.852965 -0.955869
# 6 -0.023493 -2.304234 -0.652469

# Keep only the rows without any NA.
df.dropna()
# <output>
#           0         1         2
# 4  0.862580 -0.010032  0.050009
# 5  0.670216  0.852965 -0.955869
# 6 -0.023493 -2.304234 -0.652469

# Keep the rows that have at least two non-NA values.
df.dropna(thresh=2)
# <output>
#           0         1         2
# 2  1.327195       NaN -1.549106
# 3  0.022185       NaN -0.660524
# 4  0.862580 -0.010032  0.050009
# 5  0.670216  0.852965 -0.955869
# 6 -0.023493 -2.304234 -0.652469

 

 

ffill / bfill

# ffill => 바로 직전값으로 채우는 메서드 ex) 3 _ 2  => _ 값에 3 채워짐
# bfill => 다음값을 채우는 메서드 ex) 3 _ 2  => _ 값에 2 채워짐

 

누락된 값을 제외시키지 않고 다른 값으로 대체> fillna

# Replace every missing value with 0; a new frame is returned.
df.fillna(value=0)
# <output>
#           0         1         2
# 0  0.331286  0.000000  0.000000
# 1  0.246674  0.000000  0.000000
# 2  1.327195  0.000000 -1.549106
# 3  0.022185  0.000000 -0.660524
# 4  0.862580 -0.010032  0.050009
# 5  0.670216  0.852965 -0.955869
# 6 -0.023493 -2.304234 -0.652469

 

 

컬럼명지정

# A dict maps column label -> fill value: 0.5 for column 1, 0 for column 2.
df.fillna(value={1: 0.5, 2: 0})
# <output>
#           0         1         2
# 0  0.331286  0.500000  0.000000
# 1  0.246674  0.500000  0.000000
# 2  1.327195  0.500000 -1.549106
# 3  0.022185  0.500000 -0.660524
# 4  0.862580 -0.010032  0.050009
# 5  0.670216  0.852965 -0.955869
# 6 -0.023493 -2.304234 -0.652469

컬럼 1 에는 0.5, 컬럼 2 에는 0 대체

 

 

inplace=True 를 지정하면 새 객체를 반환하지 않고 기존 객체를 직접 변경할 수도 있다

# With inplace=True the frame itself is modified and the call returns None;
# the result is bound to `_` so that nothing is displayed.
_ = df.fillna(value=0, inplace=True)
df
# <output>
#           0         1         2
# 0  0.331286  0.000000  0.000000
# 1  0.246674  0.000000  0.000000
# 2  1.327195  0.000000 -1.549106
# 3  0.022185  0.000000 -0.660524
# 4  0.862580 -0.010032  0.050009
# 5  0.670216  0.852965 -0.955869
# 6 -0.023493 -2.304234 -0.652469

 

 

 

 

재색인에서 사용한 보간메서드는 fillna 메서드에서도 사용가능하다.

# 6x3 frame of random values with the tail of columns 1 and 2 blanked out.
df = pd.DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA
df
# <output>
#           0         1         2
# 0 -1.218302 -1.332610  1.074623
# 1  0.723642  0.690002  1.001543
# 2 -0.503087       NaN -0.921169
# 3 -0.726213       NaN  0.051316
# 4 -1.157719       NaN       NaN
# 5  1.010737       NaN       NaN

# Forward fill: propagate the last valid value down each column.
# DataFrame.ffill() is used instead of fillna(method='ffill') because the
# `method` argument of fillna is deprecated in recent pandas releases.
df.ffill()
# <output>
#           0         1         2
# 0 -1.218302 -1.332610  1.074623
# 1  0.723642  0.690002  1.001543
# 2 -0.503087  0.690002 -0.921169
# 3 -0.726213  0.690002  0.051316
# 4 -1.157719  0.690002  0.051316
# 5  1.010737  0.690002  0.051316

# limit=2 fills at most two consecutive missing values in each gap.
df.ffill(limit=2)
# <output>
#           0         1         2
# 0 -1.218302 -1.332610  1.074623
# 1  0.723642  0.690002  1.001543
# 2 -0.503087  0.690002 -0.921169
# 3 -0.726213  0.690002  0.051316
# 4 -1.157719       NaN  0.051316
# 5  1.010737       NaN  0.051316

 

 

Series 의 평균값이나 중간값을 전달

# Fill the gaps of a Series with the mean of its non-missing values.
data = pd.Series(data=[1., NA, 3.5, NA, 7])
data.fillna(value=data.mean())
# <output>
# 0    1.000000
# 1    3.833333
# 2    3.500000
# 3    3.833333
# 4    7.000000
# dtype: float64

 

 

필터링, 정제 및 다른 변형

 

중복 로우

# Example frame whose last row (index 6) repeats row 5.
data = pd.DataFrame({
    'k1': ['one', 'two', 'one', 'two', 'one', 'two', 'two'],
    'k2': [1, 1, 2, 3, 3, 4, 4],
})
data
# <output>
#     k1  k2
# 0  one   1
# 1  two   1
# 2  one   2
# 3  two   3
# 4  one   3
# 5  two   4
# 6  two   4

 

 

 

로우가 중복인지 아닌지

# Boolean Series telling, for each row, whether it repeats an earlier row
# (keep='first' is the default: the first occurrence is not flagged).
data.duplicated(keep='first')
# <output>
# 0    False
# 1    False
# 2    False
# 3    False
# 4    False
# 5    False
# 6     True
# dtype: bool

 

 

 

duplicated() 결과가 False 인(중복이 아닌) 로우만 담은 DataFrame 추출

# Return the frame without the repeated rows, keeping each first occurrence
# (keep='first' is the default).
data.drop_duplicates(keep='first')
# <output>
#     k1  k2
# 0  one   1
# 1  two   1
# 2  one   2
# 3  two   3
# 4  one   3
# 5  two   4

 

 

중복걸러내기 -> 최초에 발견된 값만 추출

# Add a column of distinct values so that no two full rows are equal.
data['v1'] = range(7)
data
# <output>
#     k1  k2  v1
# 0  one   1   0
# 1  two   1   1
# 2  one   2   2
# 3  two   3   3
# 4  one   3   4
# 5  two   4   5
# 6  two   4   6

# Judge duplicates on column 'k1' only; the first row per value is kept.
data.drop_duplicates(subset=['k1'])
# <output>
#     k1  k2  v1
# 0  one   1   0
# 1  two   1   1

 

 

 

 

keep = 'last' 마지막 발견값만 추출

# Judge duplicates on ('k1', 'k2') and keep the last occurrence of each
# combination instead of the first.
data.drop_duplicates(subset=['k1', 'k2'], keep='last')
# <output>
#     k1  k2  v1
# 0  one   1   0
# 1  two   1   1
# 2  one   2   2
# 3  two   3   3
# 4  one   3   4
# 6  two   4   6

 

DataFrame 의 컬럼이나 Series, 배열 내의 값을 기반으로 데이터의 형태를 변환

# Meat names (with inconsistent capitalisation) and their weights in ounces.
data = pd.DataFrame({
    'food': [
        'bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef',
        'Bacon', 'pastrami', 'honey ham', 'nova lox',
    ],
    'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
})
data
# <output>
#           food  ounces
# 0        bacon     4.0
# 1  pulled pork     3.0
# 2        bacon    12.0
# 3     Pastrami     6.0
# 4  corned beef     7.5
# 5        Bacon     8.0
# 6     pastrami     3.0
# 7    honey ham     5.0
# 8     nova lox     6.0

 

 

각 육류(food)가 어떤 동물의 고기인지 알려주는 컬럼을 추가하기 위해, 먼저 육류 → 동물 매핑을 준비한다

# Lookup table: lower-case meat name -> animal it comes from.
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon',
}
data
# <output>
#           food  ounces
# 0        bacon     4.0
# 1  pulled pork     3.0
# 2        bacon    12.0
# 3     Pastrami     6.0
# 4  corned beef     7.5
# 5        Bacon     8.0
# 6     pastrami     3.0
# 7    honey ham     5.0
# 8     nova lox     6.0

육류별 동물을 담고 있는 사전 데이터 작성

 

 

 

# Lower-case the food names so that 'Bacon' / 'Pastrami' match the
# lower-case keys of the lookup table.
lowercased = data.loc[:, 'food'].str.lower()
lowercased
# <output>
# 0          bacon
# 1    pulled pork
# 2          bacon
# 3       pastrami
# 4    corned beef
# 5          bacon
# 6       pastrami
# 7      honey ham
# 8       nova lox
# Name: food, dtype: object

 

 

# Translate each lower-cased food name through the lookup dict and store
# the result as a new 'animal' column.
data['animal'] = lowercased.map(meat_to_animal)
data
# <output>
#           food  ounces  animal
# 0        bacon     4.0     pig
# 1  pulled pork     3.0     pig
# 2        bacon    12.0     pig
# 3     Pastrami     6.0     cow
# 4  corned beef     7.5     cow
# 5        Bacon     8.0     pig
# 6     pastrami     3.0     cow
# 7    honey ham     5.0     pig
# 8     nova lox     6.0  salmon

 

 

 

 

반응형

'Programming > Python(파이썬)' 카테고리의 다른 글

[Python-Pandas] data index  (0) 2022.11.10
[Python-Pandas] 치환  (0) 2022.11.10
[Python-Pandas] csv 파일 불러오기  (0) 2022.11.09
[Python-Pandas] 파일경로설정  (0) 2022.11.09
[Python-Pandas] 수학메서드  (0) 2022.11.09
Comments