59doit
[ R ] 의사결정나무 #2 본문
반응형
K겹 교차 검정 샘플링으로 분류 분석하기
#1 k겹 교차 검정을 위한 샘플링
library(cvTools) cross <- cvFolds(nrow(iris), K = 3, R = 2) |
#2 K겹 교차 검정 데이터 보기
str(cross) # List of 5 # $ n : num 150 # $ K : num 3 # $ R : num 2 # $ subsets: int [1:150, 1:2] 21 102 134 9 19 22 40 29 109 38 ... # $ which : int [1:150] 1 2 3 1 2 3 1 2 3 1 ... # - attr(*, "class")= chr "cvFolds" cross length(cross$which) # [1] 150 dim(cross$subsets) # [1] 150 2 table(cross$which) # 1 2 3 # 50 50 50 |
#3 K겹 교차 검정 수행 ★★★
R = 1:2 K = 1:3 CNT = 0 ACC <- numeric() for(r in R) { cat('\n R = ', r, '\n') for(k in K) { datas_ids <- cross$subsets[cross$which == k, r] test <- iris[datas_ids, ] cat('test : ', nrow(test), '\n') formual <- Species ~ . # 종속변수 Species, 독립변수 전체 다 train <- iris[-datas_ids, ] cat('train : ', nrow(train), '\n') model <- ctree(Species ~ ., data = train) pred <- predict(model, test) t <- table(pred, test$Species) print(t) CNT <- CNT + 1 ACC[CNT] <- (t[1, 1] + t[2, 2] + t[3, 3]) / sum(t) } } CNT #6 |
#4 교차 검정 모델 평가
ACC # [1] 0.96 0.94 0.92 0.96 0.92 0.88 length(ACC) # [1] 6 result_acc <- mean(ACC, na.rm = T) result_acc # [1] 0.93 |
고속도로 주행거리에 미치는 영향변수 보기
#1 패키지 설치 및 로딩
library(ggplot2) data(mpg) |
#2 학습데이터와 검정데이터 생성
t <- sample(1:nrow(mpg), 0.7*nrow(mpg)) train <- mpg[-t, ] test <- mpg[t, ] dim(train) dim(test) |
#3 formula작성과 분류모델 생성
test$drv <- factor(test$drv) formula <- hwy ~ displ + cyl + drv tree_model <- ctree(formula, data = test) plot(tree_model) ![]() |
Adultuci 데이터 셋을 이용한 분류분석
#1 패키지 설치 및 데이터 셋 구조 보기
install.packages("arules") library(arules) data(AdultUCI) str(AdultUCI) # 'data.frame': 48842 obs. of 15 variables: # $ age : int 39 50 38 53 28 37 49 52 31 42 ... # $ workclass : Factor w/ 8 levels "Federal-gov",..: 7 6 4 4 4 4 4 6 4 4 ... # $ fnlwgt : int 77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ... # $ education : Ord.factor w/ 16 levels "Preschool"<"1st-4th"<..: 14 14 9 7 14 15 5 9 15 14 ... # $ education-num : int 13 13 9 7 13 14 5 9 14 13 ... # $ marital-status: Factor w/ 7 levels "Divorced","Married-AF-spouse",..: 5 3 1 3 3 3 4 3 5 3 ... # $ occupation : Factor w/ 14 levels "Adm-clerical",..: 1 4 6 6 10 4 8 4 10 4 ... # $ relationship : Factor w/ 6 levels "Husband","Not-in-family",..: 2 1 2 1 6 6 2 1 2 1 ... # $ race : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 5 5 5 3 3 5 3 5 5 5 ... # $ sex : Factor w/ 2 levels "Female","Male": 2 2 2 2 1 1 1 2 1 2 ... # $ capital-gain : int 2174 0 0 0 0 0 0 0 14084 5178 ... # $ capital-loss : int 0 0 0 0 0 0 0 0 0 0 ... # $ hours-per-week: int 40 13 40 40 40 40 16 45 50 40 ... # $ native-country: Factor w/ 41 levels "Cambodia","Canada",..: 39 39 39 39 5 39 23 39 39 39 ... # $ income : Ord.factor w/ 2 levels "small"<"large": 1 1 1 1 1 1 1 2 2 2 ... names(AdultUCI) # [1] "age" "workclass" "fnlwgt" "education" # [5] "education-num" "marital-status" "occupation" "relationship" # [9] "race" "sex" "capital-gain" "capital-loss" # [13] "hours-per-week" "native-country" "income" |
#2 데이터 샘플링
set.seed(1234) choice <- sample(1:nrow(AdultUCI), 0.7*nrow(AdultUCI)) choice adult.df <- AdultUCI[choice, ] str(adult.df) # 'data.frame': 34189 obs. of 15 variables: # $ age : int 76 34 44 44 50 36 17 26 43 25 ... # $ workclass : Factor w/ 8 levels "Federal-gov",..: 6 6 4 4 4 4 4 4 4 4 ... # $ fnlwgt : int 106430 201292 318046 368757 115284 207853 158704 147821 160246 135645 ... # $ education : Ord.factor w/ 16 levels "Preschool"<"1st-4th"<..: 5 12 13 13 15 14 6 14 13 15 ... # $ education-num : int 5 11 10 10 14 13 6 13 10 14 ... # $ marital-status: Factor w/ 7 levels "Divorced","Married-AF-spouse",..: 3 3 3 3 3 3 5 5 1 5 ... # $ occupation : Factor w/ 14 levels "Adm-clerical",..: 5 5 12 7 3 12 12 12 10 12 ... # $ relationship : Factor w/ 6 levels "Husband","Not-in-family",..: 1 1 1 1 1 1 4 4 5 2 ... # $ race : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 5 5 5 5 5 5 5 5 3 5 ... # $ sex : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 1 1 2 ... # $ capital-gain : int 0 0 0 0 0 15024 0 0 0 0 ... # $ capital-loss : int 0 0 0 0 0 0 0 0 0 0 ... # $ hours-per-week: int 40 50 35 40 40 65 20 45 40 20 ... # $ native-country: Factor w/ 41 levels "Cambodia","Canada",..: 39 39 39 39 39 39 39 NA 39 39 ... # $ income : Ord.factor w/ 2 levels "small"<"large": NA NA NA 1 NA NA 1 1 1 1 ... |
#3 변수 추출 및 데이터프레임 생성
3-1) 변수 추출
capital <- adult.df$`capital-gain` hours <- adult.df$`hours-per-week` education <- adult.df$`education-num` race <- adult.df$race age <- adult.df$age income <- adult.df$income |
3-2) 데이터프레임 생성
adult_df <- data.frame(capital = capital, age = age, race = race, hours = hours, education = education, income = income) str(adult_df) # 'data.frame': 34189 obs. of 6 variables: # $ capital : int 0 0 0 0 0 15024 0 0 0 0 ... # $ age : int 76 34 44 44 50 36 17 26 43 25 ... # $ race : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 5 5 5 5 5 5 5 5 3 5 ... # $ hours : int 40 50 35 40 40 65 20 45 40 20 ... # $ education: int 5 11 10 10 14 13 6 13 10 14 ... # $ income : Ord.factor w/ 2 levels "small"<"large": NA NA NA 1 NA NA 1 1 1 1 ... |
#4 formula생성 – 자본이득(capital)에 영향을 미치는 변수
formula <- capital ~ income + education + hours + race + age |
#5 분류모델 생성 및 예측
adult_ctree <- ctree(formula, data = adult_df) adult_ctree |
#6 분류모델 플로팅
plot(adult_ctree) |
#7 자본이득(capital) 요약 통계량 보기
adultResult <- subset(adult_df, adult_df$income == 'large' & adult_df$education > 14) length(adultResult$education) # [1] 509 summary(adultResult$capital) # Min. 1st Qu. Median Mean 3rd Qu. Max. # 0 0 0 10718 7688 99999 boxplot(adultResult$capital) ![]() |
반응형
'통계기반 데이터분석' 카테고리의 다른 글
[ R ] 의사결정나무 #4 rpart패키지 이용 분류분석 (0) | 2022.11.29 |
---|---|
[ R ] 의사결정나무 조건부 추론나무 #3 예제 (0) | 2022.11.29 |
[ R ] 의사결정나무 #1 (0) | 2022.11.29 |
[ R ] 시계열분석 #2 (0) | 2022.11.28 |
[ R ] 시계열분석 #1 (1) | 2022.11.28 |
Comments