>RE::VISION CRM

R 데이터 분석

retail :: 예측모델링 data prep + tree model

YONG_X 2015. 7. 7. 08:56

 # retail 예측 모델링 데이터 준비 :: ===============



tr01 <- read.csv("tb_tr01.csv")

sku01 <- read.csv("sku_mast01.csv")


tr01$d_date <- as.Date(as.POSIXct(tr01$d_date, origin='1960-01-01',tz="UTC"))


tr01$year <-substr(as.character(tr01$d_date),1,4)



require(sqldf)


# [1] 2002년 구매고객 리스트 추출

tr01$year <- substr(as.character(tr01$d_date ),1,4)

q71 <- sqldf('select distinct customer_id from tr01 where year="2002"')


# [2] 2003년 고객별 구매액 산출

q72 <- sqldf('select customer_id, sum(t_item_amt_sale) as amt2003

from tr01

where customer_id in (select customer_id from q71) 

  and year="2003"

group by customer_id

')


q73 <- sqldf('select a.*, b.amt2003 

from q71 as a left join q72 as b

on a.customer_id=b.customer_id')


# [3] 구매일수 (입력변수 )

q74 <- sqldf('select customer_id, 

count(distinct d_date) as ndays,

sum(t_item_amt_sale) as amt_sale,

count(distinct channel_id) as nchannel,

count(distinct sk_item_id) as nitems

from tr01

where customer_id in (select customer_id from q71) 

  and year="2002"

group by customer_id

')


q75 <- sqldf('select a.*, b.ndays, b.amt_sale, b.nchannel,

b.nitems 

from q73 as a left join q74 as b

on a.customer_id=b.customer_id')


q75$amt2003 <- ifelse(is.na(q75$amt2003), 0, q75$amt2003 )


require(party)


t1 <- ctree(q75$amt2003~q75$ndays+q75$amt_sale+ q75$nitems + q75$nchannel, data=q75, controls = 


ctree_control(maxdepth = 3, mincriterion = 0.95, minsplit = 50) )

t1

summary(t1)

predt1<- predict(t1, q75)


q75$is_amthigh <- as.factor(ifelse(q75$amt2003 > 80, "H","L"))

t1 <- ctree(q75$is_amthigh~q75$ndays+q75$amt_sale+ q75$nitems + q75$nchannel, data=q75, controls 


= ctree_control(maxdepth = 3, mincriterion = 0.95, minsplit = 50) )


# 학습용과 테스트용 구분

q75$rnseq <- rnorm(nrow(q75),1)

trainset1 <- q75[q75$rnseq>1, ] 

testset1 <-  q75[q75$rnseq<=1, ] 


t2 <- ctree(trainset1$is_amthigh~trainset1$ndays+trainset1$amt_sale+ trainset1$nitems + 


trainset1$nchannel, data=trainset1, controls = ctree_control(maxdepth = 3, mincriterion = 0.95, minsplit 


= 50) )


testset1$is_amthighpred <- predict(t2, testset1)

table( testset1$is_amthigh , predt2)

length(testset1$is_amthigh)

length(predt2)


r1 <- lm(trainset1$amt2003~trainset1$ndays+trainset1$amt_sale+ trainset1$nitems +  


trainset1$nchannel, data=trainset1)


# ... maxdsale ----------------------


q76 <- sqldf('select customer_id, d_date, 

sum(t_item_amt_sale) as dsale

from tr01

where year="2002" 

group by customer_id, d_date ')


q77 <- sqldf('select customer_id, 

max(dsale)  as maxdsale

from q76

group by customer_id')


q78 <- sqldf('select a.*, b.maxdsale

from q75 as a left join q77 as b

on a.customer_id=b.customer_id' )


# NULL 처리

q78$maxdsale[is.na(q78$maxdsale)] <- 0



#------- 3차원 시각적 관계 데이터 탐색 예제 ----

data(mtcars)

names(mtcars)

attach(mtcars)

mtcars$wthp <- wt/hp


attach(mtcars)

wthpgrp <- floor(wthp/max(wthp)*9)


require(RColorBrewer)

plot(hp,wt, col=brewer.pal(9, "Blues")[wthpgrp] , pch=20 )


mtcars$mpggrp <- floor(mpg/max(mpg)*9)

attach(mtcars)

plot(hp,wt, col=brewer.pal(9, "Reds")[mpggrp] , pch=20 )


mtcars$carname <- ifelse(hp>250 | wt>4, row.names(mtcars) , NA)

text(hp,wt, mtcars$carname, adj=c(1,1) , cex=0.5)


#----------------- [적용]--------------


require(RColorBrewer)


q78$amt2003grp <- floor(q78$amt2003/max(q78$amt2003)*9)


plot(q78$amt_sale, q78$maxdsale, col=brewer.pal(9, "Blues")[q78$amt2003grp] , pch=20)

plot(q78$amt_sale, q78$maxdsale)


plot(q78$ndays, q78$nitems, col=brewer.pal(9, "Blues")[q78$amt2003grp] , pch=20)


# ... q75$nclass ----------------------------


q79 <- sqldf('select distinct customer_id, sk_item_id 

from tr01 where year="2002" ')


q80 <- sqldf('select a.*, b.sk_class_id

from q79 as a left join sku01 as b

 on a.sk_item_id=b.sk_item_id ')


q81 <- sqldf('select customer_id, 

count(distinct sk_class_id) as nclass

from q80

group by customer_id')


q82 <- merge(q78, q81, by=c("customer_id"), all.x=TRUE) 

nrow(q82[is.na(q82$nclass),])


plot(q82$ndays, q82$nclass, col=brewer.pal(9, "Blues")[q82$amt2003grp] , pch=20)


#--------- create dept level variable ---


q83 <- sqldf('select a.*, b.sk_department_id

from q79 as a left join sku01 as b

 on a.sk_item_id=b.sk_item_id ')


q84 <- sqldf('select customer_id, 

count(distinct sk_department_id) as ndept

from q83

group by customer_id')


q85 <- merge(q82, q84, by=c("customer_id"), all.x=TRUE) 

nrow(q85[is.na(q85$ndept),])


plot(q85$ndays, q85$ndept, col=brewer.pal(9, "Blues")[q85$amt2003grp] , pch=20)


#-----------[ subclass level predictor ] -------


q86 <- sqldf('select a.*, b.sk_subclass_id

from q79 as a left join sku01 as b

 on a.sk_item_id=b.sk_item_id ')


q87 <- sqldf('select customer_id, 

count(distinct sk_subclass_id) as nsubclass

from q86

group by customer_id')


q88 <- merge(q85, q87, by=c("customer_id"), all.x=TRUE) 

nrow(q88[is.na(q88$nsubclass),])


plot(q88$ndays, q88$nsubclass, col=brewer.pal(9, "Blues")[q88$amt2003grp] , pch=20)


require(party)

t1 <- ctree(q88$amt2003 ~ q88$ndays+q88$amt_sale+ q88$nitems + q88$nchannel + q88$maxdsale + 


q88$nclass + q88$ndept+ q88$nsubclass, data=q88, controls = ctree_control(maxdepth = 4, 


mincriterion = 0.95, minsplit = 50) )

plot(t1)


#----------------------------------------

# ... DaysfromLastP  ------------------

# 마지막으로 구매한지 얼마나 지났는가?


tr01$d_datec <- as.character(tr01$d_date)


q89 <- sqldf('select customer_id, 

max(d_datec) as maxddate 

from tr01 

where year="2002" 

group by customer_id ')


q89$maxddate <- as.Date(q89$maxddate)

# 날짜간 거리 계산을 위해서

q89$DaysfromLastP <- as.numeric(as.Date("2002-12-31") - q89$maxddate )


q90 <- merge(q88, q89[,c(1,3)], by=c("customer_id"), all.x=TRUE) 


plot(q90$DaysfromLastP, q90$amt2003)

abline( lm(q90$amt2003~q90$DaysfromLastP, data=q90), col="red" )


#-----------[3개월간 구매만 집계 변수 추가]---------


tr01$monthc <- substr(as.character(tr01$d_date),6,7)


q92 <- sqldf('select * 

from tr01 

where monthc in ("10", "11", "12")' )


# 3개월 기간 중의 구매액과 건수, item종류수, 구매일수

q93 <- sqldf('select customer_id,

sum(t_item_amt_sale) as sale_3m,

count(distinct sk_transaction_id) as ntrx_3m,

count(distinct sk_item_id) as nitem_3m,

count(distinct d_date) as ndays_3m,

count(distinct monthc) as nmonths_3m,

count(distinct channel_id) as nchannel_3m

from q92

group by customer_id')

 

# 6개월 기간 중의 구매액과 건수, item종류수, 구매일수

q94 <- sqldf('select * 

from tr01 

where monthc in ("07", "08","09","10", "11", "12")' )


q95 <- sqldf('select customer_id,

sum(t_item_amt_sale) as sale_6m,

count(distinct sk_transaction_id) as ntrx_6m,

count(distinct sk_item_id) as nitem_6m,

count(distinct d_date) as ndays_6m,

count(distinct monthc) as nmonths_6m,

count(distinct channel_id) as nchannel_6m

from q94

group by customer_id')


q96 <- merge(q90, q93, by=c("customer_id"), all.x=TRUE)  

q97 <- merge(q96, q95, by=c("customer_id"), all.x=TRUE)  


#----------- 요일을 사용한 변수 추가 ---------


require(lubridate)


tr01$wday <- weekdays(tr01$d_date)

tr01$is_wend <-ifelse(tr01$wday=="토요일" | tr01$wday=="일요일", "weekend", "weekday")


q101 <- sqldf('select customer_id,

sum(t_item_amt_sale) as sale_we,

count(distinct sk_transaction_id) as ntrx_we,

count(distinct sk_item_id) as nitem_we,

count(distinct d_date) as ndays_we,

count(distinct monthc) as nmonths_we,

count(distinct channel_id) as nchannel_we

from tr01

where year="2002" and is_wend="weekend"

group by customer_id ')


q102 <- merge(q97, q101, by=c("customer_id"), all.x=TRUE)  




require(party)


t1 <- ctree(q102$amt2003 ~ q102$ndays+q102$amt_sale+ q102$nitems + q102$nchannel + 


q102$maxdsale + q102$nclass + q102$ndept+ q102$nsubclass + q102$DaysfromLastP + q102$sale_3m 


+ q102$ntrx_3m + q102$nitem_3m + q102$ndays_3m + q102$nmonths_3m + q102$nchannel_3m  + 


q102$sale_6m + q102$ntrx_6m + q102$nitem_6m + q102$ndays_6m + q102$nmonths_6m + 


q102$nchannel_6m + q102$sale_we  + q102$ntrx_we + q102$nitem_we + q102$ndays_we + 


q102$nmonths_we + q102$nchannel_we , data=q102, controls =ctree_control(maxdepth = 3, 


mincriterion = 0.95, minsplit = 50) )

plot(t1)


r1 <- lm(q102$amt2003 ~ q102$ndays+q102$amt_sale+ q102$nitems + q102$nchannel + 


q102$maxdsale + q102$nclass + q102$ndept+ q102$nsubclass + q102$DaysfromLastP + q102$sale_3m 


+ q102$ntrx_3m + q102$nitem_3m + q102$ndays_3m + q102$nmonths_3m + q102$nchannel_3m  + 


q102$sale_6m + q102$ntrx_6m + q102$nitem_6m + q102$ndays_6m + q102$nmonths_6m + 


q102$nchannel_6m + q102$sale_we  + q102$ntrx_we + q102$nitem_we + q102$ndays_we + 


q102$nmonths_we + q102$nchannel_we , data=q102  )


plot(q102$ndays_6m, q102$ndays, col=brewer.pal(9, "Blues")[q102$amt2003grp] , pch=20)


#----------- 2차원 분포 확인 -------------------

# 1년 중 구매일수가 많으면 최근 6개월중 구매일수도 

# 많은지


require(hdrcde)


# ndays_6m NULL 처리 -- 6개월중 구매가 없는 고객 경우

q102$ndays_6m <- ifelse(is.na(q102$ndays_6m), 0, q102$ndays_6m )


hdr.boxplot.2d(q102$ndays_6m, q102$ndays)

# 밀집도 등고선 구간 지정

hdr.boxplot.2d(q102$ndays_6m, q102$ndays, prob = c(50, 75, 90) ) 

# 회귀선 추가

abline(lm(q102$ndays~q102$ndays_6m), col="red")



#------------------


data(mtcars)

mtcars$gear1<-ifelse(mtcars$gear>3,"H", "L")

mtcars$gear1<-as.factor(mtcars$gear1)


y <- c(1:nrow(mtcars))

mtcars$samplingkey  <-sample(2,length(y),replace=TRUE,prob=c(0.7,0.3)) 

attach(mtcars)

train1 <- mtcars[samplingkey==1,]

test1 <- mtcars[samplingkey==2,]

nrow(test1)


require(party)

attach(mtcars)

ct1 <- ctree(gear1 ~hp+wt+vs, data=train1)

cfm1 <- table(predict(ct1, newdata= test1),test1$gear1)

class(cfm1)


accuracyct1 <- (cfm1[1,1] + cfm1[2,2] ) / nrow(test1) *100


> cfm1

   

     H L

  H 6 1

  L  0 1

> accuracyct1

[1] 87.5


#----------------------[confusion matrix] 연습 ---------


q102$is_hamt2003 <- as.factor(ifelse(q102$amt2003>300, "H", "L"))


y <- c(1:nrow(q102))

q102$samplingkey  <-sample(2,length(y),replace=TRUE,prob=c(0.7,0.3)) 


tq102 <- q102[q102$samplingkey==1,]

test1 <- q102[q102$samplingkey==2,]

test2 <- head(test1,30)


t1 <- ctree(is_hamt2003 ~ ndays+amt_sale+ nitems + nchannel + maxdsale + nclass + ndept+ 


nsubclass + DaysfromLastP + sale_3m + ntrx_3m + nitem_3m + ndays_3m + nmonths_3m + 


nchannel_3m  + sale_6m + ntrx_6m + nitem_6m + ndays_6m + nmonths_6m + nchannel_6m + sale_we  


+ ntrx_we + nitem_we + ndays_we + nmonths_we + nchannel_we , data=tq102, controls =ctree_control


(maxdepth = 3, mincriterion = 0.95, minsplit = 50) )

plot(t1)


cfm1 <- table(predict(t1, newdata= test1), test1$is_hamt2003 )


       H    L

  H  561  138

  L  328 2330


accuracyct1 <- (cfm1[1,1] + cfm1[2,2] ) / nrow(test1) *100


#------------------------------------


t2 <- ctree(amt2003 ~ ndays+amt_sale+ nitems + nchannel + maxdsale + nclass + ndept+ nsubclass + 


DaysfromLastP + sale_3m + ntrx_3m + nitem_3m + ndays_3m + nmonths_3m + nchannel_3m  + 


sale_6m + ntrx_6m + nitem_6m + ndays_6m + nmonths_6m + nchannel_6m + sale_we  + ntrx_we + 


nitem_we + ndays_we + nmonths_we + nchannel_we , data=tq102, controls =ctree_control(maxdepth = 


3, mincriterion = 0.95, minsplit = 50) )

plot(t2)


predictedamt2003 <- as.data.frame(predict(t2, newdata= test1) )

test1$predictedamt2003 <- predictedamt2003$amt2003


test1$is_error_high <- ifelse(abs(test1$amt2003 - test1$predictedamt2003) <100, 1, 0)


# test1$predictionerror <- abs(test1$amt2003 - test1$predictedamt2003)

 test1$predictionerror <- abs(test1$amt2003 -test1$predictedamt2003) / (test1$amt2003+1)

mean(test1$predictionerror)


#----------- treating predictors with NULLs ---------


q102$sale_3m[is.na(q102$sale_3m)] <- 0

q102$ntrx_3m[is.na(q102$ntrx_3m)] <- 0

q102$nitem_3m[is.na(q102$nitem_3m)] <- 0

q102$ndays_3m[is.na(q102$ndays_3m)] <- 0

q102$nmonths_3m[is.na(q102$nmonths_3m)] <- 0

q102$nchannel_3m[is.na(q102$nchannel_3m)] <- 0

q102$sale_6m[is.na(q102$sale_6m)] <- 0

q102$ntrx_6m[is.na(q102$ntrx_6m)] <- 0

q102$nitem_6m[is.na(q102$nitem_6m)] <- 0

q102$ndays_6m[is.na(q102$ndays_6m)] <- 0

q102$nmonths_6m[is.na(q102$nmonths_6m)] <- 0

q102$nchannel_6m[is.na(q102$nchannel_6m)] <- 0

q102$sale_we[is.na(q102$sale_we)] <- 0

q102$ntrx_we[is.na(q102$ntrx_we)] <- 0

q102$nitem_we[is.na(q102$nitem_we)] <- 0

q102$ndays_we[is.na(q102$ndays_we)] <- 0

q102$nmonths_we[is.na(q102$nmonths_we)] <- 0

q102$nchannel_we[is.na(q102$nchannel_we)] <- 0


#------------------------


[1] "customer_id"      "amt2003"         

 [3] "ndays"            "amt_sale"        

 [5] "nchannel"         "nitems"          

 [7] "maxdsale"         "amt2003grp"      

 [9] "nclass"           "ndept"           

[11] "nsubclass"        "DaysfromLastP"   

[13] "sale_3m"          "ntrx_3m"         

[15] "nitem_3m"         "ndays_3m"        

[17] "nmonths_3m"       "nchannel_3m"     

[19] "sale_6m"          "ntrx_6m"         

[21] "nitem_6m"         "ndays_6m"        

[23] "nmonths_6m"       "nchannel_6m"     

[25] "sale_we"          "ntrx_we"         

[27] "nitem_we"         "ndays_we"        

[29] "nmonths_we"       "nchannel_we"     

[31] "is_hamt2003"      "samplingkey"  


* 전체구매건수중 오프라인에서 구매한 건수는? -- 비율


q111 <- sqldf('select customer_id,

count(distinct sk_transaction_id) as ntrx_rt

from tr01

where year="2002" and channel_id="RT"

group by customer_id')


q112 <- merge(q102, q111, by=c("customer_id"), all.x=TRUE)  

length(q112$ntrx_rt[is.na(q112$ntrx_rt)])

q112$ntrx_rt[is.na(q112$ntrx_rt)] <- 0



# * 전체구매건수중 온라인 채널에서 구매한 건수는? -- 비율


q113 <- sqldf('select customer_id,

count(distinct sk_transaction_id) as ntrx_on

from tr01

where year="2002" and channel_id=on"

group by customer_id')


q114 <- merge(q112, q113, by=c("customer_id"), all.x=TRUE)  

length(q114$ntrx_on[is.na(q114$ntrx_on)])


q114$ntrx_on[is.na(q114$ntrx_on)] <- 0


# 하루구매액이 얼마 이상인 건들을 기준으로 (일 day)  구매일수 추가변수를

# 만들어 볼까?


# 먼저, 한 고객이 하루에 산 금액이 얼마인지? 분포 확인


q115 <- sqldf('select customer_id, d_date,

sum(t_item_amt_sale) as dsale

from tr01

where year="2002" 

group by customer_id, d_date ')


# 150불이상 구매일수

q116 <- sqldf('select customer_id,

count(distinct d_date) as ndays_hamt

from q115

where dsale >= 150

group by customer_id')


q117 <- merge(q114, q116, by=c("customer_id"), all.x=TRUE)  

q117$ndays_hamt[is.na(q117$ndays_hamt)] <- 0

length(q117$ndays_hamt[is.na(q117$ndays_hamt)])


hdr.boxplot.2d(q117$ndays, q117$ndays_hamt) 

# 회귀선 추가

abline(lm(q117$ndays_hamt~q117$ndays), col="red")


#--------------------------------------

# class 를 기준으로 상품을 프로파일

#---------


q201 <- sqldf('select distinct sk_class_id,

class_name

from sku01 ')


q202 <- sqldf('select distinct sk_class_id, class_name,

count(distinct sk_subclass_id) as nsubclass

from sku01

group by sk_class_id ')


# 최대, 최소 값을 확인

range(q202$nsubclass)


# nsubclass를 기준으로 큰 것 부터 3개만

head(q202[order(-q202$nsubclass),],3)


# 클래스별 구매고객수 산출


q203 <- sqldf('select b.sk_class_id, 

count(distinct a.customer_id) as ncust

from tr01 as a left join sku01 as b

on a.sk_item_id = b.sk_item_id

group by b.sk_class_id')


'R 데이터 분석' 카테고리의 다른 글

movie prediction exmple  (0) 2015.07.08
predictive modeling :: algorithms and interpretation  (0) 2015.07.08
:: retail data preparation and EDA 연습 과제  (0) 2015.07.06
0702 빅Labor .... 주제  (0) 2015.07.03
retail example  (0) 2015.07.02