>RE::VISION CRM

R 데이터 분석

[CRMAJU2018] 예측분석 Review

YONG_X 2018. 6. 15. 16:27

#---- RFM 분석? 예측분석 프로세스 요약! -------


# data : UCI 머신러닝  data repository

# 총 305일분의 온라인리테일 거래 데이터

# 500명의 고객만 표본으로 추출

mData2 <- read.csv("https://t1.daumcdn.net/cfile/blog/99CE763F5A61942711?download") 

nrow(mData2)

length(unique(mData2$CustomerID))


# 데이터 기간 확인

length(unique(substr(as.character(mData2$InvoiceDate),1,10)))

range(unique(substr(as.character(mData2$InvoiceDate),1,10)))

# 약 1년간의 거래 내역임을 확인



# 2011-11-01 이후 데이터를 구분 (과거기간과 미래기간 구분)

mData3 <- mData2[substr(as.character(mData2$InvoiceDate),1,10)<="2011-10-31",]

mData4 <- mData2[substr(as.character(mData2$InvoiceDate),1,10)>"2011-10-31",]

# 예측모델링을 위해 과거와 미래로 데이터 구분


# get R


agg1 <-aggregate(substr(as.character(InvoiceDate),1,10) ~ CustomerID, 

               FUN=max, na.rm=TRUE, data=mData3)

head(agg1)


# R계산을 위한 최종일 +1일 기준

agg1$R <- as.Date("2011-12-10")- as.Date(agg1[,2])

head(agg1)

hist(as.numeric(agg1$R)) 



# get Frequency


# 날짜가 같다면 (=하루에 여러건 구매했어도) Frequency는 1로 count

df_F <- unique(data.frame(CustomerID = mData3$CustomerID, 

      InvoiceDate = substr(as.character(mData3$InvoiceDate),1,10) ))


agg2 <-aggregate(df_F$InvoiceDate, by=list(df_F$CustomerID), 

   FUN=length)


names(agg2) <- c("CustomerID", "InvoiceFrequency")

head(agg2)

hist(agg2$InvoiceFrequency, breaks=100)

plot(density(agg2$InvoiceFrequency))



# merge R and F


dfbase <- agg1[,names(agg1) %in% c("CustomerID", "R")]

dfbase <- merge(dfbase, agg2[,names(agg2) %in% c("CustomerID", "InvoiceFrequency")], 

    by="CustomerID", all.x=T)

head(dfbase)

dim(dfbase)


plot(dfbase$R, jitter(dfbase$InvoiceFrequency),

   cex=0.5)

abline(lm(dfbase$InvoiceFrequency~dfbase$R), col="red")

lines(lowess(dfbase$InvoiceFrequency~dfbase$R), col="green", lty=2) 



# 국가 변수 생성


# 국가 분포 확인

plot(sort(table(mData3$Country)))


cntr_df<- as.data.frame(table(mData3$CustomerID, mData3$Country))

head(cntr_df)

names(cntr_df) <- c("CustomerID", "country", "Freq")


cntr_df <- cntr_df[cntr_df$Freq>0,]

cntr_df <- unique(cntr_df)

head(cntr_df)


# 한 고객이 복수 국가에서 주문했는가?

plot(sort(table(cntr_df$CustomerID)))


head(sort(-table(cntr_df$CustomerID)))

# 국가가 복수인 고객이 존재하므로 처리 필요


# 고객번호 파악후 국가 리스트 확인

cntr_df[cntr_df$CustomerID %in% c("12422", "12457"),]

cntr_df <- cntr_df[!(cntr_df$CustomerID=="12457" & cntr_df$country=="Cyprus"), ]

cntr_df <- cntr_df[!(cntr_df$CustomerID=="12422" & cntr_df$country=="Switzerland"), ]


# 처리 성공여부  확인

plot(sort(table(cntr_df$CustomerID)))


mean(table(cntr_df$CustomerID)) # 평균이 1이면 모두 1

sd(table(cntr_df$CustomerID))  # 표준편차가가 0라면 모두 동일


dfbase <- merge(dfbase, cntr_df, by="CustomerID", all.x=T)

plot(dfbase$InvoiceFrequency, dfbase$Freq)

head(dfbase)


dfbase <- dfbase[,names(dfbase) !="Freq"]

names(dfbase) <- c("custid", "R", "iDays","country")




# 상품분류코드 생성

mData3$itemGrp <- substr(mData3$Description, 

        nchar(as.character(mData3$Description))-7, 

        nchar(as.character(mData3$Description)))

itemsGrps <- names(tail(sort(table(mData3$itemGrp)),50))

mData3$itemGrp <- ifelse(mData3$itemGrp %in% itemsGrps, mData3$itemGrp,

                           "Etc")

barplot(table(mData3$itemGrp))



# 고객별 상품분류별수량 집계

agg3 <-aggregate(Quantity~CustomerID+itemGrp, data=mData3,

          FUN=sum)

names(agg3) <- c("CustomerID", "itemGrp", "SumQnt")

agg3$isItemT50 <- ifelse(agg3$itemGrp!="Etc", 1, 0)

agg3 <- agg3[agg3$isItemT50==1,]

agg4 <-aggregate(SumQnt~CustomerID, data=agg3,

          FUN=sum)

names(agg4) <- c("custid", "sumT50Qnt")


dfbase <- merge(dfbase, agg4, by="custid", all.x=T)

dfbase$sumT50Qnt[is.na(dfbase$sumT50Qnt)] <- 0




#---- 미래기간 구매행동 집계 ----


dfF1 <- data.frame(custid=mData4$CustomerID,

    iDate = substr(as.character(mData4$InvoiceDate),1,10))

dfF1 <- unique(dfF1)


agg5 <-aggregate(iDate ~ custid, FUN=length, data=dfF1)

head(agg5)

names(agg5) <- c("custid", "iDaysF")

plot(sort(agg5$iDaysF))


dfbase <- merge(dfbase, agg5, by="custid", all.x=T)

head(dfbase)


# NA 처리 (결측치 치환)


dfbase$iDaysF[is.na(dfbase$iDaysF)] <- 0

head(dfbase)


# 타겟변수와 predictor간 관계 확인

plot(jitter(dfbase$iDays), jitter(as.numeric(dfbase$R)), 

      pch=19, cex=0.4,

      col=ifelse(dfbase$iDaysF>=3, "blue", "grey"))


cor(dfbase$iDays, dfbase$iDaysF)

cor(as.numeric(dfbase$R), dfbase$iDaysF)



# 타겟변수를 Binary로 변환

dfbase$isIF <- ifelse(dfbase$iDaysF>=1, "Y","N")

barplot(table(dfbase$isIF))




dfbase$R <- as.numeric(dfbase$R)

dfbase$iDaysF <- NULL # 중복된 타겟변수 후보 제외

# 트리모델에 사용하기 위해 팩터로 변환

dfbase$isIF <- as.factor(dfbase$isIF)




#---

#--- 데이터 분할 partitioning ( random sampling)



smp_size <- floor(0.8 * nrow(dfbase))


## set the seed to make your partition reproducible

set.seed(123)

train_ind <- sample(seq_len(nrow(dfbase)), size = smp_size)

dfbasetrain <- dfbase[train_ind, ]

dfbasetest <- dfbase[-train_ind, ]





#----- Decision Tree --------



library(party)

t1 <- ctree(isIF ~.,

     controls = ctree_control(maxdepth = 5,  minbucket=10),

     data=dfbasetrain)

plot(t1)


# Confusion matrix

pred <- predict(t1, newdata=dfbasetest)

table(pred, dfbasetest$isIF)

sum(table(pred, dfbasetest$isIF))

table(pred, dfbasetest$isIF)[1,1] + table(pred, dfbasetest$isIF)[2,2]

plot(table(pred, dfbasetest$isIF))



#---- Post Modeling Assessment : Why sumT50Qnt does not show up?


boxplot(log(dfbasetrain$sumT50Qnt+1)~dfbasetrain$isIF)

plot(dfbasetrain$sumT50Qnt, jitter(dfbasetrain$iDays), 

   col=ifelse(dfbasetrain$isIF=="Y","red","blue"), cex=0.5)

cor(dfbasetrain$sumT50Qnt, dfbasetrain$iDays)


# Zoom In

plot(dfbasetrain$sumT50Qnt, jitter(dfbasetrain$iDays), 

   col=ifelse(dfbasetrain$isIF=="Y","red","blue"), cex=0.7,

   xlim=c(0,1000), ylim=c(0,10))