#---- RFM 분석? 예측분석 프로세스 요약! -------
# data : UCI 머신러닝 data repository
# 총 305일분의 온라인리테일 거래 데이터
# 500명의 고객만 표본으로 추출
mData2 <- read.csv("https://t1.daumcdn.net/cfile/blog/99CE763F5A61942711?download")
nrow(mData2)
length(unique(mData2$CustomerID))
# 데이터 기간 확인
length(unique(substr(as.character(mData2$InvoiceDate),1,10)))
range(unique(substr(as.character(mData2$InvoiceDate),1,10)))
# 약 1년간의 거래 내역임을 확인
# 2011-11-01 이후 데이터를 구분 (과거기간과 미래기간 구분)
mData3 <- mData2[substr(as.character(mData2$InvoiceDate),1,10)<="2011-10-31",]
mData4 <- mData2[substr(as.character(mData2$InvoiceDate),1,10)>"2011-10-31",]
# 예측모델링을 위해 과거와 미래로 데이터 구분
# get R
agg1 <-aggregate(substr(as.character(InvoiceDate),1,10) ~ CustomerID,
FUN=max, na.rm=TRUE, data=mData3)
head(agg1)
# R계산을 위한 최종일 +1일 기준
agg1$R <- as.Date("2011-12-10")- as.Date(agg1[,2])
head(agg1)
hist(as.numeric(agg1$R))
# get Frequency
# 날짜가 같다면 (=하루에 여러건 구매했어도) Frequency는 1로 count
df_F <- unique(data.frame(CustomerID = mData3$CustomerID,
InvoiceDate = substr(as.character(mData3$InvoiceDate),1,10) ))
agg2 <-aggregate(df_F$InvoiceDate, by=list(df_F$CustomerID),
FUN=length)
names(agg2) <- c("CustomerID", "InvoiceFrequency")
head(agg2)
hist(agg2$InvoiceFrequency, breaks=100)
plot(density(agg2$InvoiceFrequency))
# merge R and F
dfbase <- agg1[,names(agg1) %in% c("CustomerID", "R")]
dfbase <- merge(dfbase, agg2[,names(agg2) %in% c("CustomerID", "InvoiceFrequency")],
by="CustomerID", all.x=T)
head(dfbase)
dim(dfbase)
plot(dfbase$R, jitter(dfbase$InvoiceFrequency),
cex=0.5)
abline(lm(dfbase$InvoiceFrequency~dfbase$R), col="red")
lines(lowess(dfbase$InvoiceFrequency~dfbase$R), col="green", lty=2)
# 국가 변수 생성
# 국가 분포 확인
plot(sort(table(mData3$Country)))
cntr_df<- as.data.frame(table(mData3$CustomerID, mData3$Country))
head(cntr_df)
names(cntr_df) <- c("CustomerID", "country", "Freq")
cntr_df <- cntr_df[cntr_df$Freq>0,]
cntr_df <- unique(cntr_df)
head(cntr_df)
# 한 고객이 복수 국가에서 주문했는가?
plot(sort(table(cntr_df$CustomerID)))
head(sort(-table(cntr_df$CustomerID)))
# 국가가 복수인 고객이 존재하므로 처리 필요
# 고객번호 파악후 국가 리스트 확인
cntr_df[cntr_df$CustomerID %in% c("12422", "12457"),]
cntr_df <- cntr_df[!(cntr_df$CustomerID=="12457" & cntr_df$country=="Cyprus"), ]
cntr_df <- cntr_df[!(cntr_df$CustomerID=="12422" & cntr_df$country=="Switzerland"), ]
# 처리 성공여부 확인
plot(sort(table(cntr_df$CustomerID)))
mean(table(cntr_df$CustomerID)) # 평균이 1이면 모두 1
sd(table(cntr_df$CustomerID)) # 표준편차가가 0라면 모두 동일
dfbase <- merge(dfbase, cntr_df, by="CustomerID", all.x=T)
plot(dfbase$InvoiceFrequency, dfbase$Freq)
head(dfbase)
dfbase <- dfbase[,names(dfbase) !="Freq"]
names(dfbase) <- c("custid", "R", "iDays","country")
# 상품분류코드 생성
mData3$itemGrp <- substr(mData3$Description,
nchar(as.character(mData3$Description))-7,
nchar(as.character(mData3$Description)))
itemsGrps <- names(tail(sort(table(mData3$itemGrp)),50))
mData3$itemGrp <- ifelse(mData3$itemGrp %in% itemsGrps, mData3$itemGrp,
"Etc")
barplot(table(mData3$itemGrp))
# 고객별 상품분류별수량 집계
agg3 <-aggregate(Quantity~CustomerID+itemGrp, data=mData3,
FUN=sum)
names(agg3) <- c("CustomerID", "itemGrp", "SumQnt")
agg3$isItemT50 <- ifelse(agg3$itemGrp!="Etc", 1, 0)
agg3 <- agg3[agg3$isItemT50==1,]
agg4 <-aggregate(SumQnt~CustomerID, data=agg3,
FUN=sum)
names(agg4) <- c("custid", "sumT50Qnt")
dfbase <- merge(dfbase, agg4, by="custid", all.x=T)
dfbase$sumT50Qnt[is.na(dfbase$sumT50Qnt)] <- 0
#---- 미래기간 구매행동 집계 ----
dfF1 <- data.frame(custid=mData4$CustomerID,
iDate = substr(as.character(mData4$InvoiceDate),1,10))
dfF1 <- unique(dfF1)
agg5 <-aggregate(iDate ~ custid, FUN=length, data=dfF1)
head(agg5)
names(agg5) <- c("custid", "iDaysF")
plot(sort(agg5$iDaysF))
dfbase <- merge(dfbase, agg5, by="custid", all.x=T)
head(dfbase)
# NA 처리 (결측치 치환)
dfbase$iDaysF[is.na(dfbase$iDaysF)] <- 0
head(dfbase)
# 타겟변수와 predictor간 관계 확인
plot(jitter(dfbase$iDays), jitter(as.numeric(dfbase$R)),
pch=19, cex=0.4,
col=ifelse(dfbase$iDaysF>=3, "blue", "grey"))
cor(dfbase$iDays, dfbase$iDaysF)
cor(as.numeric(dfbase$R), dfbase$iDaysF)
# 타겟변수를 Binary로 변환
dfbase$isIF <- ifelse(dfbase$iDaysF>=1, "Y","N")
barplot(table(dfbase$isIF))
dfbase$R <- as.numeric(dfbase$R)
dfbase$iDaysF <- NULL # 중복된 타겟변수 후보 제외
# 트리모델에 사용하기 위해 팩터로 변환
dfbase$isIF <- as.factor(dfbase$isIF)
#---
#--- 데이터 분할 partitioning ( random sampling)
smp_size <- floor(0.8 * nrow(dfbase))
## set the seed to make your partition reproducible
set.seed(123)
train_ind <- sample(seq_len(nrow(dfbase)), size = smp_size)
dfbasetrain <- dfbase[train_ind, ]
dfbasetest <- dfbase[-train_ind, ]
#----- Decision Tree --------
library(party)
t1 <- ctree(isIF ~.,
controls = ctree_control(maxdepth = 5, minbucket=10),
data=dfbasetrain)
plot(t1)
# Confusion matrix
pred <- predict(t1, newdata=dfbasetest)
table(pred, dfbasetest$isIF)
sum(table(pred, dfbasetest$isIF))
table(pred, dfbasetest$isIF)[1,1] + table(pred, dfbasetest$isIF)[2,2]
plot(table(pred, dfbasetest$isIF))
#---- Post Modeling Assessment : Why sumT50Qnt does not show up?
boxplot(log(dfbasetrain$sumT50Qnt+1)~dfbasetrain$isIF)
plot(dfbasetrain$sumT50Qnt, jitter(dfbasetrain$iDays),
col=ifelse(dfbasetrain$isIF=="Y","red","blue"), cex=0.5)
cor(dfbasetrain$sumT50Qnt, dfbasetrain$iDays)
# Zoom In
plot(dfbasetrain$sumT50Qnt, jitter(dfbasetrain$iDays),
col=ifelse(dfbasetrain$isIF=="Y","red","blue"), cex=0.7,
xlim=c(0,1000), ylim=c(0,10))
'R 데이터 분석' 카테고리의 다른 글
[CRMAJU2018] 기말고사 후보답안 예시 (0) | 2018.06.20 |
---|---|
[ CRMAJU2018 ] 기말고사 (0) | 2018.06.20 |
rolling 2 (0) | 2018.05.12 |
[R분석] rolling base stat generator for time series data (0) | 2018.05.01 |
[CRMAJU2018] R 프로그래밍 기초 복습 -- 2018.04.25 (0) | 2018.04.23 |