>RE::VISION CRM

R 데이터 분석

[kbdaa_bda] 고객빅데이터분석 _은행모델

YONG_X 2017. 9. 21. 13:35

# Bank target-marketing case study — end-to-end modeling practice (은행 타겟 마케팅 사례 - 모델링 종합 연습)



#-----------[ Bank Marketing ]--------


# set path
# setwd("E:/restore_Yong/0_RnModeling20161215/bank/")


# Import the data.
# NOTE: since R 4.0, read.csv() defaults to stringsAsFactors = FALSE, but the
# rest of this script relies on character columns (notably the target `y`)
# being factors: summary() class counts, glm(binomial), ctree and
# randomForest classification. Restore the pre-4.0 behavior explicitly.
# bnk01 <- read.csv("bank-additional-full1.csv", stringsAsFactors = TRUE)
bnk01 <- read.csv(
  "https://t1.daumcdn.net/cfile/blog/2235793B588EED2E33?download",
  stringsAsFactors = TRUE
)


# Understand the data set: dimensions, column names, first rows, row count.
dim(bnk01)
names(bnk01)
head(bnk01)
nrow(bnk01)

# Class distribution of the target variable.
table(bnk01$y)

# Share (%) of "yes" in the target variable.
# prop.table() over a contingency table is clearer and more robust than the
# original summary(y)[2] / (summary(y)[1] + summary(y)[2]) indexing, which
# only works when y is a factor with levels ordered ("no", "yes").
prop.table(table(bnk01$y))["yes"] * 100


# Explore individual variables ----------------------------------------------

# Age: raw record order vs. sorted (the sorted view shows the distribution).
plot(bnk01$age)
plot(sort(bnk01$age))

# Mosaic-style plot of housing-loan vs. personal-loan counts.
plot(table(bnk01$housing, bnk01$loan), ylab = "loan", xlab = "housing")

# Age vs. call duration, colored by outcome (navy = "yes", green = "no").
# jitter() separates overplotted integer ages.
outcome_col <- ifelse(bnk01$y == "yes", "navy", "green")
plot(jitter(bnk01$age), bnk01$duration,
     cex.lab = 1.2, cex.axis = 0.7, cex = .3, pch = 20,
     col = outcome_col)

# Stacked bar chart of outcome counts by marital status.
counts <- table(bnk01$y, bnk01$marital)
barplot(counts,
        main = "Y by marital",
        xlab = "Marital Status",
        legend = rownames(counts))



# Logistic regression --------------------------------------------------------
# c.f. :: https://www.r-bloggers.com/how-to-perform-a-logistic-regression-in-r/

# Model the probability of y == "yes" from basic demographic predictors.
lm3 <- glm(y ~ age + marital + housing + loan,
           data = bnk01,
           family = binomial(link = 'logit'))
summary(lm3)

# Model interpretation:
# higher age -> higher log odds; marital status "single" -> higher log odds.



# Data partitioning: random 80/20 train/test split ---------------------------

train_size <- floor(0.8 * nrow(bnk01))

# Fix the seed so the partition is reproducible.
set.seed(123)
train_rows <- sample(seq_len(nrow(bnk01)), size = train_size)

bnk01train <- bnk01[train_rows, ]
bnk01test <- bnk01[-train_rows, ]

nrow(bnk01train) ; nrow(bnk01test)

# Visual sanity check of the two partition sizes.
barplot(c(nrow(bnk01train), nrow(bnk01test)))
axis(side = 1, at = 1:2, labels = c("Train", "Test"))



# Fit models on the training set only ----------------------------------------

# lm4: logistic regression on selected demographic predictors.
lm4 <- glm(y ~ age + job + marital + housing + loan,
           data = bnk01train,
           family = binomial(link = 'logit'))
summary(lm4)

# lm5: logistic regression on every available predictor.
lm5 <- glm(y ~ ., data = bnk01train, family = binomial(link = 'logit'))
summary(lm5)


# Accuracy measurement and misclassification table for lm4 -------------------

# Predict on the probability scale. The original code predicted on the link
# (log-odds) scale and compared against the opaque cutoff -1.734452;
# plogis(-1.734452) is that same cutoff expressed as a probability (~0.15),
# so the resulting classification is unchanged but the intent is visible and
# the code is consistent with the lm5 section below.
fitted.results <- predict(lm4,
                          newdata = bnk01test[, c("age", "job", "marital",
                                                  "housing", "loan")],
                          type = "response")
plot(sort(fitted.results))

# Keep the raw scores on the test set for later inspection.
# (Now stored as probabilities rather than log-odds; nothing downstream
# reads this column before it is reassigned in the second section.)
bnk01test$tmp.fitted.results <- fitted.results

# Classify at the probability cutoff and measure accuracy.
fitted.results <- ifelse(fitted.results > plogis(-1.734452), "yes", "no")
misClasificError <- mean(fitted.results != bnk01test$y)
print(paste('Accuracy', 1 - misClasificError))

# Confusion matrix with row/column totals.
addmargins(table(fitted.results, bnk01test$y))


# Accuracy measurement and confusion matrix using lm5 ------------------------

# Predicted probabilities of "yes" on the held-out test set.
test_probs <- predict(lm5, newdata = bnk01test, type = "response")
plot(sort(test_probs))

# Classify at a manually chosen probability cutoff, then compute accuracy.
# NOTE(review): the 0.3075515 cutoff looks hand-tuned; confirm how it was
# derived before reusing it elsewhere.
fitted.results <- ifelse(test_probs > 0.3075515, "yes", "no")
misClasificError <- mean(fitted.results != bnk01test$y)
print(paste('Accuracy', 1 - misClasificError))

# Confusion matrix (predicted vs. actual).
table(fitted.results, bnk01test$y)



#--- Decision tree models (conditional inference trees) ---------------------

# library() fails loudly when the package is missing, unlike require(),
# which only returns FALSE and lets the script break later.
library(party)
# library(rpart)

# Build the model formula y ~ <all other columns>.
# reformulate() constructs the formula directly from the predictor names,
# replacing the manual paste(collapse = "+") / as.formula() round-trip.
varNames <- names(bnk01)
varNames <- varNames[!varNames %in% c("y")]   # y is the prediction target

form1 <- reformulate(varNames, response = "y")

print(form1)
# Expected output:
# y ~ age + job + marital + education + default + housing + loan +
#     contact + month + day_of_week + duration + campaign + pdays +
#     previous + poutcome + emp.var.rate + cons.price.idx + cons.conf.idx +
#     euribor3m + nr.employed

# Unconstrained tree (tends to be large and hard to read).
t1 <- ctree(form1, data = bnk01train)
plot(t1)

# Limit the tree depth to 2 for readability.
t1 <- ctree(form1, data = bnk01train,
            controls = ctree_control(maxdepth = 2))
plot(t1)

# Stopping rule: depth at most 3 AND every terminal node holds >= 500 samples.
t1 <- ctree(form1, data = bnk01train,
            controls = ctree_control(maxdepth = 3, minbucket = 500))
plot(t1)

# Slightly deeper tree (depth 4) with the same minimum node size.
t1 <- ctree(form1, data = bnk01train,
            controls = ctree_control(maxdepth = 4, minbucket = 500))
plot(t1)



#----[ randomForest ]--------------------------------------------------------

# library() instead of require(): fail immediately if the package is absent.
library(randomForest)

# 200 trees; do.trace = 50 prints the OOB error every 50 trees;
# importance = TRUE stores variable-importance measures.
# (TRUE instead of T: T is an ordinary variable and can be reassigned.)
rf1 <- randomForest(y ~ ., data = bnk01train,
                    do.trace = 50, ntree = 200, importance = TRUE)

plot(rf1)        # OOB error vs. number of trees
varImpPlot(rf1)  # variable importance
rf1

# In the trace output, columns 1 and 2 are the per-class error rates
# for class 1 and class 2 respectively.



#==============


#-----------[ Bank Marketing — second data set ]--------


# Import the data.
# stringsAsFactors = TRUE keeps the target `y` a factor so that randomForest
# below performs classification rather than silently falling back to
# regression (read.csv() defaults to FALSE since R 4.0).
bnk01 <- read.csv(
  'https://t1.daumcdn.net/cfile/blog/99E6173359C44CE507?download',
  stringsAsFactors = TRUE
)


# Data partitioning: random 75/25 train/test split.
smp_size <- floor(0.75 * nrow(bnk01))

# Fix the seed so the partition is reproducible.
set.seed(121)
train_ind <- sample(seq_len(nrow(bnk01)), size = smp_size)

bnk01train <- bnk01[train_ind, ]
bnk01test <- bnk01[-train_ind, ]

nrow(bnk01train) ; nrow(bnk01test)



#----[ randomForest ]--------------------------------------------------------

# library() instead of require(): fail immediately if the package is absent.
library(randomForest)

# 500 trees this time; do.trace = 50 prints the OOB error every 50 trees;
# importance = TRUE (not T, which is a reassignable variable).
rf1 <- randomForest(y ~ ., data = bnk01train,
                    do.trace = 50, ntree = 500, importance = TRUE)

plot(rf1)        # OOB error vs. number of trees
varImpPlot(rf1)  # variable importance
rf1

# In the trace output, columns 1 and 2 are the per-class error rates
# for class 1 and class 2 respectively.



# Accuracy measurement and misclassification table for rf1 -------------------

# Class-membership probabilities from the forest; one column per class level.
fitted.results <- predict(rf1, newdata = bnk01test, type = "prob")

# Select the "yes" column by name rather than by position ([, 2]), which
# silently breaks if the factor levels happen to be ordered differently.
plot(sort(fitted.results[, "yes"]))

# Classify at the default 0.5 cutoff and attach predictions to the test set.
bnk01test$tmp.fitted.results <- ifelse(fitted.results[, "yes"] > 0.5,
                                       "yes", "no")

# Misclassification rate and accuracy (expressed in percent here,
# unlike the earlier glm sections which print a fraction).
misClasificError <- mean(bnk01test$tmp.fitted.results != bnk01test$y)
print(paste('Accuracy', (1 - misClasificError) * 100))

# Confusion matrix with row/column totals (predicted vs. actual).
addmargins(table(bnk01test$tmp.fitted.results, bnk01test$y))

# Accuracy measurement and confusion matrix using lm5 (kept for reference):
# fitted.results <- predict(lm5, newdata=bnk01test, type="response")
# plot(sort(fitted.results))