retail :: 예측모델링 data prep + tree model

R 데이터 분석

retail :: 예측모델링 data prep + tree model

YONG_X 2015. 7. 7. 08:56

# retail 예측 모델링 데이터 준비 :: ===============

tr01 <- read.csv("tb_tr01.csv")

sku01 <- read.csv("sku_mast01.csv")

tr01$d_date <- as.Date(as.POSIXct(tr01$d_date, origin='1960-01-01',tz="UTC"))

tr01$year <-substr(as.character(tr01$d_date),1,4)

require(sqldf)

# [1] 2002년 구매고객 리스트 추출

tr01$year <- substr(as.character(tr01$d_date ),1,4)

q71 <- sqldf('select distinct customer_id from tr01 where year="2002"')

# [2] 2003년 고객별 구매액 산출

q72 <- sqldf('select customer_id, sum(t_item_amt_sale) as amt2003

from tr01

where customer_id in (select customer_id from q71)

and year="2003"

group by customer_id

q73 <- sqldf('select a.*, b.amt2003

from q71 as a left join q72 as b

on a.customer_id=b.customer_id')

# [3] 구매일수 (입력변수 )

q74 <- sqldf('select customer_id,

count(distinct d_date) as ndays,

sum(t_item_amt_sale) as amt_sale,

count(distinct channel_id) as nchannel,

count(distinct sk_item_id) as nitems

from tr01

where customer_id in (select customer_id from q71)

and year="2002"

group by customer_id

q75 <- sqldf('select a.*, b.ndays, b.amt_sale, b.nchannel,

b.nitems

from q73 as a left join q74 as b

on a.customer_id=b.customer_id')

q75$amt2003 <- ifelse(is.na(q75$amt2003), 0, q75$amt2003 )

require(party)

t1 <- ctree(q75$amt2003~q75$ndays+q75$amt_sale+ q75$nitems + q75$nchannel, data=q75, controls =

ctree_control(maxdepth = 3, mincriterion = 0.95, minsplit = 50) )

summary(t1)

predt1<- predict(t1, q75)

q75$is_amthigh <- as.factor(ifelse(q75$amt2003 > 80, "H","L"))

t1 <- ctree(q75$is_amthigh~q75$ndays+q75$amt_sale+ q75$nitems + q75$nchannel, data=q75, controls

= ctree_control(maxdepth = 3, mincriterion = 0.95, minsplit = 50) )

# 학습용과 테스트용 구분

q75$rnseq <- rnorm(nrow(q75),1)

trainset1 <- q75[q75$rnseq>1, ]

testset1 <- q75[q75$rnseq<=1, ]

t2 <- ctree(trainset1$is_amthigh~trainset1$ndays+trainset1$amt_sale+ trainset1$nitems +

trainset1$nchannel, data=trainset1, controls = ctree_control(maxdepth = 3, mincriterion = 0.95, minsplit

= 50) )

testset1$is_amthighpred <- predict(t2, testset1)

table( testset1$is_amthigh , predt2)

length(testset1$is_amthigh)

length(predt2)

r1 <- lm(trainset1$amt2003~trainset1$ndays+trainset1$amt_sale+ trainset1$nitems +

trainset1$nchannel, data=trainset1)

# ... maxdsale ----------------------

q76 <- sqldf('select customer_id, d_date,

sum(t_item_amt_sale) as dsale

from tr01

where year="2002"

group by customer_id, d_date ')

q77 <- sqldf('select customer_id,

max(dsale) as maxdsale

from q76

group by customer_id')

q78 <- sqldf('select a.*, b.maxdsale

from q75 as a left join q77 as b

on a.customer_id=b.customer_id' )

# NULL 처리

q78$maxdsale[is.na(q78$maxdsale)] <- 0

#------- 3차원 시각적 관계 데이터 탐색 예제 ----

data(mtcars)

names(mtcars)

attach(mtcars)

mtcars$wthp <- wt/hp

attach(mtcars)

wthpgrp <- floor(wthp/max(wthp)*9)

require(RColorBrewer)

plot(hp,wt, col=brewer.pal(9, "Blues")[wthpgrp] , pch=20 )

mtcars$mpggrp <- floor(mpg/max(mpg)*9)

attach(mtcars)

plot(hp,wt, col=brewer.pal(9, "Reds")[mpggrp] , pch=20 )

mtcars$carname <- ifelse(hp>250 | wt>4, row.names(mtcars) , NA)

text(hp,wt, mtcars$carname, adj=c(1,1) , cex=0.5)

#----------------- [적용]--------------

require(RColorBrewer)

q78$amt2003grp <- floor(q78$amt2003/max(q78$amt2003)*9)

plot(q78$amt_sale, q78$maxdsale, col=brewer.pal(9, "Blues")[q78$amt2003grp] , pch=20)

plot(q78$amt_sale, q78$maxdsale)

plot(q78$ndays, q78$nitems, col=brewer.pal(9, "Blues")[q78$amt2003grp] , pch=20)

# ... q75$nclass ----------------------------

q79 <- sqldf('select distinct customer_id, sk_item_id

from tr01 where year="2002" ')

q80 <- sqldf('select a.*, b.sk_class_id

from q79 as a left join sku01 as b

on a.sk_item_id=b.sk_item_id ')

q81 <- sqldf('select customer_id,

count(distinct sk_class_id) as nclass

from q80

group by customer_id')

q82 <- merge(q78, q81, by=c("customer_id"), all.x=TRUE)

nrow(q82[is.na(q82$nclass),])

plot(q82$ndays, q82$nclass, col=brewer.pal(9, "Blues")[q82$amt2003grp] , pch=20)

#--------- create dept level variable ---

q83 <- sqldf('select a.*, b.sk_department_id

from q79 as a left join sku01 as b

on a.sk_item_id=b.sk_item_id ')

q84 <- sqldf('select customer_id,

count(distinct sk_department_id) as ndept

from q83

group by customer_id')

q85 <- merge(q82, q84, by=c("customer_id"), all.x=TRUE)

nrow(q85[is.na(q85$ndept),])

plot(q85$ndays, q85$ndept, col=brewer.pal(9, "Blues")[q85$amt2003grp] , pch=20)

#-----------[ subclass level predictor ] -------

q86 <- sqldf('select a.*, b.sk_subclass_id

from q79 as a left join sku01 as b

on a.sk_item_id=b.sk_item_id ')

q87 <- sqldf('select customer_id,

count(distinct sk_subclass_id) as nsubclass

from q86

group by customer_id')

q88 <- merge(q85, q87, by=c("customer_id"), all.x=TRUE)

nrow(q88[is.na(q88$nsubclass),])

plot(q88$ndays, q88$nsubclass, col=brewer.pal(9, "Blues")[q88$amt2003grp] , pch=20)

require(party)

t1 <- ctree(q88$amt2003 ~ q88$ndays+q88$amt_sale+ q88$nitems + q88$nchannel + q88$maxdsale +

q88$nclass + q88$ndept+ q88$nsubclass, data=q88, controls = ctree_control(maxdepth = 4,

mincriterion = 0.95, minsplit = 50) )

plot(t1)

#----------------------------------------

# ... DaysfromLastP ------------------

# 마지막으로 구매한지 얼마나 지났는가?

tr01$d_datec <- as.character(tr01$d_date)

q89 <- sqldf('select customer_id,

max(d_datec) as maxddate

from tr01

where year="2002"

group by customer_id ')

q89$maxddate <- as.Date(q89$maxddate)

# 날짜간 거리 계산을 위해서

q89$DaysfromLastP <- as.numeric(as.Date("2002-12-31") - q89$maxddate )

q90 <- merge(q88, q89[,c(1,3)], by=c("customer_id"), all.x=TRUE)

plot(q90$DaysfromLastP, q90$amt2003)

abline( lm(q90$amt2003~q90$DaysfromLastP, data=q90), col="red" )

#-----------[3개월간 구매만 집계 변수 추가]---------

tr01$monthc <- substr(as.character(tr01$d_date),6,7)

q92 <- sqldf('select *

from tr01

where monthc in ("10", "11", "12")' )

# 3개월 기간 중의 구매액과 건수, item종류수, 구매일수

q93 <- sqldf('select customer_id,

sum(t_item_amt_sale) as sale_3m,

count(distinct sk_transaction_id) as ntrx_3m,

count(distinct sk_item_id) as nitem_3m,

count(distinct d_date) as ndays_3m,

count(distinct monthc) as nmonths_3m,

count(distinct channel_id) as nchannel_3m

from q92

group by customer_id')

# 6개월 기간 중의 구매액과 건수, item종류수, 구매일수

q94 <- sqldf('select *

from tr01

where monthc in ("07", "08","09","10", "11", "12")' )

q95 <- sqldf('select customer_id,

sum(t_item_amt_sale) as sale_6m,

count(distinct sk_transaction_id) as ntrx_6m,

count(distinct sk_item_id) as nitem_6m,

count(distinct d_date) as ndays_6m,

count(distinct monthc) as nmonths_6m,

count(distinct channel_id) as nchannel_6m

from q94

group by customer_id')

q96 <- merge(q90, q93, by=c("customer_id"), all.x=TRUE)

q97 <- merge(q96, q95, by=c("customer_id"), all.x=TRUE)

#----------- 요일을 사용한 변수 추가 ---------

require(lubridate)

tr01$wday <- weekdays(tr01$d_date)

tr01$is_wend <-ifelse(tr01$wday=="토요일" | tr01$wday=="일요일", "weekend", "weekday")

q101 <- sqldf('select customer_id,

sum(t_item_amt_sale) as sale_we,

count(distinct sk_transaction_id) as ntrx_we,

count(distinct sk_item_id) as nitem_we,

count(distinct d_date) as ndays_we,

count(distinct monthc) as nmonths_we,

count(distinct channel_id) as nchannel_we

from tr01

where year="2002" and is_wend="weekend"

group by customer_id ')

q102 <- merge(q97, q101, by=c("customer_id"), all.x=TRUE)

require(party)

t1 <- ctree(q102$amt2003 ~ q102$ndays+q102$amt_sale+ q102$nitems + q102$nchannel +

q102$maxdsale + q102$nclass + q102$ndept+ q102$nsubclass + q102$DaysfromLastP + q102$sale_3m

+ q102$ntrx_3m + q102$nitem_3m + q102$ndays_3m + q102$nmonths_3m + q102$nchannel_3m +

q102$sale_6m + q102$ntrx_6m + q102$nitem_6m + q102$ndays_6m + q102$nmonths_6m +

q102$nchannel_6m + q102$sale_we + q102$ntrx_we + q102$nitem_we + q102$ndays_we +

q102$nmonths_we + q102$nchannel_we , data=q102, controls =ctree_control(maxdepth = 3,

mincriterion = 0.95, minsplit = 50) )

plot(t1)

r1 <- lm(q102$amt2003 ~ q102$ndays+q102$amt_sale+ q102$nitems + q102$nchannel +

q102$maxdsale + q102$nclass + q102$ndept+ q102$nsubclass + q102$DaysfromLastP + q102$sale_3m

+ q102$ntrx_3m + q102$nitem_3m + q102$ndays_3m + q102$nmonths_3m + q102$nchannel_3m +

q102$sale_6m + q102$ntrx_6m + q102$nitem_6m + q102$ndays_6m + q102$nmonths_6m +

q102$nchannel_6m + q102$sale_we + q102$ntrx_we + q102$nitem_we + q102$ndays_we +

q102$nmonths_we + q102$nchannel_we , data=q102 )

plot(q102$ndays_6m, q102$ndays, col=brewer.pal(9, "Blues")[q102$amt2003grp] , pch=20)

#----------- 2차원 분포 확인 -------------------

# 1년 중 구매일수가 많으면 최근 6개월중 구매일수도

# 많은지

require(hdrcde)

# ndays_6m NULL 처리 -- 6개월중 구매가 없는 고객 경우

q102$ndays_6m <- ifelse(is.na(q102$ndays_6m), 0, q102$ndays_6m )

hdr.boxplot.2d(q102$ndays_6m, q102$ndays)

# 밀집도 등고선 구간 지정

hdr.boxplot.2d(q102$ndays_6m, q102$ndays, prob = c(50, 75, 90) )

# 회귀선 추가

abline(lm(q102$ndays~q102$ndays_6m), col="red")

#------------------

data(mtcars)

mtcars$gear1<-ifelse(mtcars$gear>3,"H", "L")

mtcars$gear1<-as.factor(mtcars$gear1)

y <- c(1:nrow(mtcars))

mtcars$samplingkey <-sample(2,length(y),replace=TRUE,prob=c(0.7,0.3))

attach(mtcars)

train1 <- mtcars[samplingkey==1,]

test1 <- mtcars[samplingkey==2,]

nrow(test1)

require(party)

attach(mtcars)

ct1 <- ctree(gear1 ~hp+wt+vs, data=train1)

cfm1 <- table(predict(ct1, newdata= test1),test1$gear1)

class(cfm1)

accuracyct1 <- (cfm1[1,1] + cfm1[2,2] ) / nrow(test1) *100

> cfm1

H L

H 6 1

L 0 1

> accuracyct1

[1] 87.5

#----------------------[confusion matrix] 연습 ---------

q102$is_hamt2003 <- as.factor(ifelse(q102$amt2003>300, "H", "L"))

y <- c(1:nrow(q102))

q102$samplingkey <-sample(2,length(y),replace=TRUE,prob=c(0.7,0.3))

tq102 <- q102[q102$samplingkey==1,]

test1 <- q102[q102$samplingkey==2,]

test2 <- head(test1,30)

t1 <- ctree(is_hamt2003 ~ ndays+amt_sale+ nitems + nchannel + maxdsale + nclass + ndept+

nsubclass + DaysfromLastP + sale_3m + ntrx_3m + nitem_3m + ndays_3m + nmonths_3m +

nchannel_3m + sale_6m + ntrx_6m + nitem_6m + ndays_6m + nmonths_6m + nchannel_6m + sale_we

+ ntrx_we + nitem_we + ndays_we + nmonths_we + nchannel_we , data=tq102, controls =ctree_control

(maxdepth = 3, mincriterion = 0.95, minsplit = 50) )

plot(t1)

cfm1 <- table(predict(t1, newdata= test1), test1$is_hamt2003 )

H L

H 561 138

L 328 2330

accuracyct1 <- (cfm1[1,1] + cfm1[2,2] ) / nrow(test1) *100

#------------------------------------

t2 <- ctree(amt2003 ~ ndays+amt_sale+ nitems + nchannel + maxdsale + nclass + ndept+ nsubclass +

DaysfromLastP + sale_3m + ntrx_3m + nitem_3m + ndays_3m + nmonths_3m + nchannel_3m +

sale_6m + ntrx_6m + nitem_6m + ndays_6m + nmonths_6m + nchannel_6m + sale_we + ntrx_we +

nitem_we + ndays_we + nmonths_we + nchannel_we , data=tq102, controls =ctree_control(maxdepth =

3, mincriterion = 0.95, minsplit = 50) )

plot(t2)

predictedamt2003 <- as.data.frame(predict(t2, newdata= test1) )

test1$predictedamt2003 <- predictedamt2003$amt2003

test1$is_error_high <- ifelse(abs(test1$amt2003 - test1$predictedamt2003) <100, 1, 0)

# test1$predictionerror <- abs(test1$amt2003 - test1$predictedamt2003)

test1$predictionerror <- abs(test1$amt2003 -test1$predictedamt2003) / (test1$amt2003+1)

mean(test1$predictionerror)

#----------- treating predictors with NULLs ---------

q102$sale_3m[is.na(q102$sale_3m)] <- 0

q102$ntrx_3m[is.na(q102$ntrx_3m)] <- 0

q102$nitem_3m[is.na(q102$nitem_3m)] <- 0

q102$ndays_3m[is.na(q102$ndays_3m)] <- 0

q102$nmonths_3m[is.na(q102$nmonths_3m)] <- 0

q102$nchannel_3m[is.na(q102$nchannel_3m)] <- 0

q102$sale_6m[is.na(q102$sale_6m)] <- 0

q102$ntrx_6m[is.na(q102$ntrx_6m)] <- 0

q102$nitem_6m[is.na(q102$nitem_6m)] <- 0

q102$ndays_6m[is.na(q102$ndays_6m)] <- 0

q102$nmonths_6m[is.na(q102$nmonths_6m)] <- 0

q102$nchannel_6m[is.na(q102$nchannel_6m)] <- 0

q102$sale_we[is.na(q102$sale_we)] <- 0

q102$ntrx_we[is.na(q102$ntrx_we)] <- 0

q102$nitem_we[is.na(q102$nitem_we)] <- 0

q102$ndays_we[is.na(q102$ndays_we)] <- 0

q102$nmonths_we[is.na(q102$nmonths_we)] <- 0

q102$nchannel_we[is.na(q102$nchannel_we)] <- 0

#------------------------

[1] "customer_id" "amt2003"

[3] "ndays" "amt_sale"

[5] "nchannel" "nitems"

[7] "maxdsale" "amt2003grp"

[9] "nclass" "ndept"

[11] "nsubclass" "DaysfromLastP"

[13] "sale_3m" "ntrx_3m"

[15] "nitem_3m" "ndays_3m"

[17] "nmonths_3m" "nchannel_3m"

[19] "sale_6m" "ntrx_6m"

[21] "nitem_6m" "ndays_6m"

[23] "nmonths_6m" "nchannel_6m"

[25] "sale_we" "ntrx_we"

[27] "nitem_we" "ndays_we"

[29] "nmonths_we" "nchannel_we"

[31] "is_hamt2003" "samplingkey"

* 전체구매건수중 오프라인에서 구매한 건수는? -- 비율

q111 <- sqldf('select customer_id,

count(distinct sk_transaction_id) as ntrx_rt

from tr01

where year="2002" and channel_id="RT"

group by customer_id')

q112 <- merge(q102, q111, by=c("customer_id"), all.x=TRUE)

length(q112$ntrx_rt[is.na(q112$ntrx_rt)])

q112$ntrx_rt[is.na(q112$ntrx_rt)] <- 0

# * 전체구매건수중 온라인 채널에서 구매한 건수는? -- 비율

q113 <- sqldf('select customer_id,

count(distinct sk_transaction_id) as ntrx_on

from tr01

where year="2002" and channel_id=on"

group by customer_id')

q114 <- merge(q112, q113, by=c("customer_id"), all.x=TRUE)

length(q114$ntrx_on[is.na(q114$ntrx_on)])

q114$ntrx_on[is.na(q114$ntrx_on)] <- 0

# 하루구매액이 얼마 이상인 건들을 기준으로 (일 day) 구매일수 추가변수를

# 만들어 볼까?

# 먼저, 한 고객이 하루에 산 금액이 얼마인지? 분포 확인

q115 <- sqldf('select customer_id, d_date,

sum(t_item_amt_sale) as dsale

from tr01

where year="2002"

group by customer_id, d_date ')

# 150불이상 구매일수

q116 <- sqldf('select customer_id,

count(distinct d_date) as ndays_hamt

from q115

where dsale >= 150

group by customer_id')

q117 <- merge(q114, q116, by=c("customer_id"), all.x=TRUE)

q117$ndays_hamt[is.na(q117$ndays_hamt)] <- 0

length(q117$ndays_hamt[is.na(q117$ndays_hamt)])

hdr.boxplot.2d(q117$ndays, q117$ndays_hamt)

# 회귀선 추가

abline(lm(q117$ndays_hamt~q117$ndays), col="red")

#--------------------------------------

# class 를 기준으로 상품을 프로파일

#---------

q201 <- sqldf('select distinct sk_class_id,

class_name

from sku01 ')

q202 <- sqldf('select distinct sk_class_id, class_name,

count(distinct sk_subclass_id) as nsubclass

from sku01

group by sk_class_id ')

# 최대, 최소 값을 확인

range(q202$nsubclass)

# nsubclass를 기준으로 큰 것 부터 3개만

head(q202[order(-q202$nsubclass),],3)

# 클래스별 구매고객수 산출

q203 <- sqldf('select b.sk_class_id,

count(distinct a.customer_id) as ncust

from tr01 as a left join sku01 as b

on a.sk_item_id = b.sk_item_id

group by b.sk_class_id')

저작자표시 비영리 변경금지

'R 데이터 분석' 카테고리의 다른 글

movie prediction exmple (0)	2015.07.08
predictive modeling :: algorithms and interpretation (0)	2015.07.08
:: retail data preparation and EDA 연습 과제 (0)	2015.07.06
0702 빅Labor .... 주제 (0)	2015.07.03
retail example (0)	2015.07.02

현재글retail :: 예측모델링 data prep + tree model

리비젼 CRM ( revisioncrm )

전용준 빅데이터, 리비젼, 디지털마케팅, 머신러닝, GPT, 챗GPT, 리비젼컨설팅, R, 전용준, 빅 데이터, CRM, 프롬프트엔지니어링, 빅데이터, chatGPT, 프롬프트, AI, 데이터 분석, 데이터 사이언티스트, 인공지능, 데이터분석,

Today :
Yesterday :