# Bank target-marketing case study - comprehensive modeling practice
#-----------[ Bank Marketing ]--------
# set path
# setwd("E:/restore_Yong/0_RnModeling20161215/bank/")
# import data
# bnk01 <- read.csv("bank-additional-full1.csv")
bnk01 <- read.csv("https://t1.daumcdn.net/cfile/blog/2235793B588EED2E33?download")
# Ensure the response is a factor: since R 4.0 read.csv() no longer converts
# strings to factors, and glm / ctree / randomForest below all require a
# factor response for classification.  factor() is idempotent, so this is
# safe on older R versions too.
bnk01$y <- factor(bnk01$y)
# understand the data set
dim(bnk01)
names(bnk01)
head(bnk01)
nrow(bnk01)
table(bnk01$y)
# Share of "yes" in the response, in percent.  mean(y == "yes") works for
# factor and character alike; the original indexed into summary(), which
# only yields class counts when y happens to be a factor.
mean(bnk01$y == "yes") * 100
# Explore variables --------------------------------------------------------
# Raw and sorted views of age, a mosaic of housing vs. loan, and a jittered
# age/duration scatter coloured by the response.
plot(bnk01$age)
plot(sort(bnk01$age))
plot(table(bnk01$housing, bnk01$loan), ylab = "loan", xlab = "housing")
point_col <- ifelse(bnk01$y == "yes", "navy", "green")
plot(
  jitter(bnk01$age), bnk01$duration,
  cex.lab = 1.2, cex.axis = 0.7, cex = .3, pch = 20,
  col = point_col
)
# Stacked bar chart of the response within each marital status.
counts <- table(bnk01$y, bnk01$marital)
barplot(counts,
        main = "Y by marital",
        xlab = "Marital Status",
        legend = rownames(counts))
# Logistic regression ------------------------------------------------------
# c.f. :: https://www.r-bloggers.com/how-to-perform-a-logistic-regression-in-r/
# Model P(y = "yes") from a small set of demographic / credit variables.
lm3 <- glm(
  y ~ age + marital + housing + loan,
  family = binomial(link = 'logit'),
  data = bnk01
)
summary(lm3)
# Model interpretation: higher age raises the log odds, and marital status
# "single" also raises the log odds relative to the baseline level.
# Data partitioning - 80/20 random split -----------------------------------
n_total <- nrow(bnk01)
smp_size <- floor(0.8 * n_total)
# Fix the RNG seed so the partition is reproducible.
set.seed(123)
train_ind <- sample(seq_len(n_total), size = smp_size)
bnk01train <- bnk01[train_ind, ]
bnk01test <- bnk01[-train_ind, ]
# Sanity-check the split sizes, numerically and as a bar chart.
nrow(bnk01train)
nrow(bnk01test)
barplot(c(nrow(bnk01train), nrow(bnk01test)))
axis(side = 1, at = 1:2, labels = c("Train", "Test"))
# Modeling on the training set ---------------------------------------------
# lm4 uses a hand-picked predictor subset; lm5 uses every available column.
lm4 <- glm(
  y ~ age + job + marital + housing + loan,
  family = binomial(link = 'logit'),
  data = bnk01train
)
summary(lm4)
lm5 <- glm(y ~ ., family = binomial(link = 'logit'), data = bnk01train)
summary(lm5)
# Accuracy measurement and misclassification table (lm4) --------------------
# Predict on the probability scale (type = "response") instead of the raw
# link scale the original code used.  The probability cutoff equivalent to
# the old link-scale threshold of -1.734452 is plogis(-1.734452) (~0.150),
# so the resulting classification is unchanged but far easier to read, and
# this block is now consistent with the lm5 evaluation below.
fitted.results <- predict(lm4, newdata = bnk01test, type = "response")
plot(sort(fitted.results))
# Keep the raw predicted probabilities alongside the test set.
bnk01test$tmp.fitted.results <- fitted.results
fitted.results <- ifelse(fitted.results > plogis(-1.734452), "yes", "no")
misClasificError <- mean(fitted.results != bnk01test$y)
print(paste('Accuracy', 1 - misClasificError))
# confusion matrix with row/column totals
addmargins(table(fitted.results, bnk01test$y))
# Accuracy measurement and confusion matrix using lm5 -----------------------
fitted.results <- predict(lm5, newdata = bnk01test, type = "response")
plot(sort(fitted.results))
# Ad-hoc probability cutoff; predictions above it are labelled "yes".
fitted.results <- ifelse(fitted.results > 0.3075515, "yes", "no")
misClasificError <- mean(fitted.results != bnk01test$y)
print(paste('Accuracy', 1 - misClasificError))
# confusion matrix
table(fitted.results, bnk01test$y)
#--- Decision tree (conditional inference tree) ----------------------------
# library() errors immediately if the package is missing, unlike require(),
# which only returns FALSE and lets the script fail later.
library(party)
# library(rpart)
# Every column except the target y becomes an explanatory variable.
varNames <- names(bnk01)
varNames <- varNames[!varNames %in% c("y")]
# Build the model formula y ~ x1 + x2 + ... ; reformulate() replaces the
# manual paste()/as.formula() construction.
form1 <- reformulate(varNames, response = "y")
print(form1)
# form1 prints as:
# y ~ age + job + marital + education + default + housing + loan +
#     contact + month + day_of_week + duration + campaign + pdays +
#     previous + poutcome + emp.var.rate + cons.price.idx + cons.conf.idx +
#     euribor3m + nr.employed
# Unconstrained tree - can grow very deep and is hard to read.
t1 <- ctree(form1, data = bnk01train)
plot(t1)
# Limit the depth to 2 for an interpretable overview.
t1 <- ctree(form1, data = bnk01train, controls = ctree_control(maxdepth = 2))
plot(t1)
# Stopping rule: depth <= 3 and every terminal node must hold at least
# 500 observations.
t1 <- ctree(form1, data = bnk01train,
            controls = ctree_control(maxdepth = 3, minbucket = 500))
plot(t1)
# Same minimum-bucket rule with depth <= 4.
t1 <- ctree(form1, data = bnk01train,
            controls = ctree_control(maxdepth = 4, minbucket = 500))
plot(t1)
#----[ randomForest ]-------------------------------------------------------
# Fail fast if the package is absent (require() would silently return FALSE).
library(randomForest)
# 200 trees; do.trace = 50 prints the OOB error every 50 trees;
# importance = TRUE stores permutation importance for varImpPlot().
rf1 <- randomForest(y ~ ., data = bnk01train, do.trace = 50, ntree = 200,
                    importance = TRUE)
plot(rf1)
varImpPlot(rf1)
rf1
# In the trace output, columns 1 and 2 are the class-wise OOB error rates
# for the first and second response classes.
#==========================================================================
#-----------[ Bank Marketing - second data set ]--------
# import data
bnk01 <- read.csv('https://t1.daumcdn.net/cfile/blog/99E6173359C44CE507?download')
# randomForest needs a factor response to run in classification mode; since
# R 4.0 read.csv() leaves strings as character, so convert explicitly.
bnk01$y <- factor(bnk01$y)
# Data partitioning - 75/25 random split
smp_size <- floor(0.75 * nrow(bnk01))
# Fix the RNG seed so the partition is reproducible.
set.seed(121)
train_ind <- sample(seq_len(nrow(bnk01)), size = smp_size)
bnk01train <- bnk01[train_ind, ]
bnk01test <- bnk01[-train_ind, ]
nrow(bnk01train) ; nrow(bnk01test)
#----[ randomForest ]---------
# library() errors immediately if the package is missing, unlike require().
library(randomForest)
rf1 <- randomForest(y ~ ., data = bnk01train, do.trace = 50, ntree = 500,
                    importance = TRUE)
plot(rf1)
varImpPlot(rf1)
rf1
# In the trace output, columns 1 and 2 are the per-class OOB error rates.
# Accuracy measurement and misclassification table --------------------------
# type = "prob" returns a matrix with one column per class.  Index it by the
# class name "yes" rather than by position 2, which would silently break if
# the factor level order ever changed.
fitted.results <- predict(rf1, newdata = bnk01test, type = "prob")
plot(sort(fitted.results[, "yes"]))
bnk01test$tmp.fitted.results <- ifelse(fitted.results[, "yes"] > 0.5, "yes", "no")
misClasificError <- mean(bnk01test$tmp.fitted.results != bnk01test$y)
print(paste('Accuracy', (1 - misClasificError) * 100))
# confusion matrix with row/column totals
addmargins(table(bnk01test$tmp.fitted.results, bnk01test$y))
# accuracy measurement and confusion matrix using lm5 (kept for reference)
# fitted.results <- predict(lm5, newdata=bnk01test, type="response")
# plot(sort(fitted.results))
# --- Blog footer scraped along with the post; commented out so the script
# --- remains valid R.  Other posts in the 'R 데이터 분석' category:
# [kbdaa_bda] 시계열예측 (0) | 2017.09.21 |
# [kbdaa_bda] 데이터 처리 연습 GDA (0) | 2017.09.21 |
# [kbdaa_bda] 빅데이터고객분석 _ 군집 (0) | 2017.09.21 |
# [kbdaa_bda] 빅데이터고객분석 GDA (0) | 2017.09.20 |
# [kbdaa_bda] 빅데이터고객분석 (0) | 2017.09.09 |