# retail 예측 모델링 데이터 준비 :: ===============
tr01 <- read.csv("tb_tr01.csv")
sku01 <- read.csv("sku_mast01.csv")
tr01$d_date <- as.Date(as.POSIXct(tr01$d_date, origin='1960-01-01',tz="UTC"))
tr01$year <-substr(as.character(tr01$d_date),1,4)
require(sqldf)
# [1] 2002년 구매고객 리스트 추출
tr01$year <- substr(as.character(tr01$d_date ),1,4)
q71 <- sqldf('select distinct customer_id from tr01 where year="2002"')
# [2] 2003년 고객별 구매액 산출
q72 <- sqldf('select customer_id, sum(t_item_amt_sale) as amt2003
from tr01
where customer_id in (select customer_id from q71)
and year="2003"
group by customer_id
')
q73 <- sqldf('select a.*, b.amt2003
from q71 as a left join q72 as b
on a.customer_id=b.customer_id')
# [3] 구매일수 (입력변수 )
q74 <- sqldf('select customer_id,
count(distinct d_date) as ndays,
sum(t_item_amt_sale) as amt_sale,
count(distinct channel_id) as nchannel,
count(distinct sk_item_id) as nitems
from tr01
where customer_id in (select customer_id from q71)
and year="2002"
group by customer_id
')
q75 <- sqldf('select a.*, b.ndays, b.amt_sale, b.nchannel,
b.nitems
from q73 as a left join q74 as b
on a.customer_id=b.customer_id')
q75$amt2003 <- ifelse(is.na(q75$amt2003), 0, q75$amt2003 )
require(party)
t1 <- ctree(q75$amt2003~q75$ndays+q75$amt_sale+ q75$nitems + q75$nchannel, data=q75, controls =
ctree_control(maxdepth = 3, mincriterion = 0.95, minsplit = 50) )
t1
summary(t1)
predt1<- predict(t1, q75)
q75$is_amthigh <- as.factor(ifelse(q75$amt2003 > 80, "H","L"))
t1 <- ctree(q75$is_amthigh~q75$ndays+q75$amt_sale+ q75$nitems + q75$nchannel, data=q75, controls
= ctree_control(maxdepth = 3, mincriterion = 0.95, minsplit = 50) )
# 학습용과 테스트용 구분
q75$rnseq <- rnorm(nrow(q75),1)
trainset1 <- q75[q75$rnseq>1, ]
testset1 <- q75[q75$rnseq<=1, ]
t2 <- ctree(trainset1$is_amthigh~trainset1$ndays+trainset1$amt_sale+ trainset1$nitems +
trainset1$nchannel, data=trainset1, controls = ctree_control(maxdepth = 3, mincriterion = 0.95, minsplit
= 50) )
testset1$is_amthighpred <- predict(t2, testset1)
table( testset1$is_amthigh , predt2)
length(testset1$is_amthigh)
length(predt2)
r1 <- lm(trainset1$amt2003~trainset1$ndays+trainset1$amt_sale+ trainset1$nitems +
trainset1$nchannel, data=trainset1)
# ... maxdsale ----------------------
q76 <- sqldf('select customer_id, d_date,
sum(t_item_amt_sale) as dsale
from tr01
where year="2002"
group by customer_id, d_date ')
q77 <- sqldf('select customer_id,
max(dsale) as maxdsale
from q76
group by customer_id')
q78 <- sqldf('select a.*, b.maxdsale
from q75 as a left join q77 as b
on a.customer_id=b.customer_id' )
# NULL 처리
q78$maxdsale[is.na(q78$maxdsale)] <- 0
#------- 3차원 시각적 관계 데이터 탐색 예제 ----
data(mtcars)
names(mtcars)
attach(mtcars)
mtcars$wthp <- wt/hp
attach(mtcars)
wthpgrp <- floor(wthp/max(wthp)*9)
require(RColorBrewer)
plot(hp,wt, col=brewer.pal(9, "Blues")[wthpgrp] , pch=20 )
mtcars$mpggrp <- floor(mpg/max(mpg)*9)
attach(mtcars)
plot(hp,wt, col=brewer.pal(9, "Reds")[mpggrp] , pch=20 )
mtcars$carname <- ifelse(hp>250 | wt>4, row.names(mtcars) , NA)
text(hp,wt, mtcars$carname, adj=c(1,1) , cex=0.5)
#----------------- [적용]--------------
require(RColorBrewer)
q78$amt2003grp <- floor(q78$amt2003/max(q78$amt2003)*9)
plot(q78$amt_sale, q78$maxdsale, col=brewer.pal(9, "Blues")[q78$amt2003grp] , pch=20)
plot(q78$amt_sale, q78$maxdsale)
plot(q78$ndays, q78$nitems, col=brewer.pal(9, "Blues")[q78$amt2003grp] , pch=20)
# ... q75$nclass ----------------------------
q79 <- sqldf('select distinct customer_id, sk_item_id
from tr01 where year="2002" ')
q80 <- sqldf('select a.*, b.sk_class_id
from q79 as a left join sku01 as b
on a.sk_item_id=b.sk_item_id ')
q81 <- sqldf('select customer_id,
count(distinct sk_class_id) as nclass
from q80
group by customer_id')
q82 <- merge(q78, q81, by=c("customer_id"), all.x=TRUE)
nrow(q82[is.na(q82$nclass),])
plot(q82$ndays, q82$nclass, col=brewer.pal(9, "Blues")[q82$amt2003grp] , pch=20)
#--------- create dept level variable ---
q83 <- sqldf('select a.*, b.sk_department_id
from q79 as a left join sku01 as b
on a.sk_item_id=b.sk_item_id ')
q84 <- sqldf('select customer_id,
count(distinct sk_department_id) as ndept
from q83
group by customer_id')
q85 <- merge(q82, q84, by=c("customer_id"), all.x=TRUE)
nrow(q85[is.na(q85$ndept),])
plot(q85$ndays, q85$ndept, col=brewer.pal(9, "Blues")[q85$amt2003grp] , pch=20)
#-----------[ subclass level predictor ] -------
q86 <- sqldf('select a.*, b.sk_subclass_id
from q79 as a left join sku01 as b
on a.sk_item_id=b.sk_item_id ')
q87 <- sqldf('select customer_id,
count(distinct sk_subclass_id) as nsubclass
from q86
group by customer_id')
q88 <- merge(q85, q87, by=c("customer_id"), all.x=TRUE)
nrow(q88[is.na(q88$nsubclass),])
plot(q88$ndays, q88$nsubclass, col=brewer.pal(9, "Blues")[q88$amt2003grp] , pch=20)
require(party)
t1 <- ctree(q88$amt2003 ~ q88$ndays+q88$amt_sale+ q88$nitems + q88$nchannel + q88$maxdsale +
q88$nclass + q88$ndept+ q88$nsubclass, data=q88, controls = ctree_control(maxdepth = 4,
mincriterion = 0.95, minsplit = 50) )
plot(t1)
#----------------------------------------
# ... DaysfromLastP ------------------
# 마지막으로 구매한지 얼마나 지났는가?
tr01$d_datec <- as.character(tr01$d_date)
q89 <- sqldf('select customer_id,
max(d_datec) as maxddate
from tr01
where year="2002"
group by customer_id ')
q89$maxddate <- as.Date(q89$maxddate)
# 날짜간 거리 계산을 위해서
q89$DaysfromLastP <- as.numeric(as.Date("2002-12-31") - q89$maxddate )
q90 <- merge(q88, q89[,c(1,3)], by=c("customer_id"), all.x=TRUE)
plot(q90$DaysfromLastP, q90$amt2003)
abline( lm(q90$amt2003~q90$DaysfromLastP, data=q90), col="red" )
#-----------[3개월간 구매만 집계 변수 추가]---------
tr01$monthc <- substr(as.character(tr01$d_date),6,7)
q92 <- sqldf('select *
from tr01
where monthc in ("10", "11", "12")' )
# 3개월 기간 중의 구매액과 건수, item종류수, 구매일수
q93 <- sqldf('select customer_id,
sum(t_item_amt_sale) as sale_3m,
count(distinct sk_transaction_id) as ntrx_3m,
count(distinct sk_item_id) as nitem_3m,
count(distinct d_date) as ndays_3m,
count(distinct monthc) as nmonths_3m,
count(distinct channel_id) as nchannel_3m
from q92
group by customer_id')
# 6개월 기간 중의 구매액과 건수, item종류수, 구매일수
q94 <- sqldf('select *
from tr01
where monthc in ("07", "08","09","10", "11", "12")' )
q95 <- sqldf('select customer_id,
sum(t_item_amt_sale) as sale_6m,
count(distinct sk_transaction_id) as ntrx_6m,
count(distinct sk_item_id) as nitem_6m,
count(distinct d_date) as ndays_6m,
count(distinct monthc) as nmonths_6m,
count(distinct channel_id) as nchannel_6m
from q94
group by customer_id')
q96 <- merge(q90, q93, by=c("customer_id"), all.x=TRUE)
q97 <- merge(q96, q95, by=c("customer_id"), all.x=TRUE)
#----------- 요일을 사용한 변수 추가 ---------
require(lubridate)
tr01$wday <- weekdays(tr01$d_date)
tr01$is_wend <-ifelse(tr01$wday=="토요일" | tr01$wday=="일요일", "weekend", "weekday")
q101 <- sqldf('select customer_id,
sum(t_item_amt_sale) as sale_we,
count(distinct sk_transaction_id) as ntrx_we,
count(distinct sk_item_id) as nitem_we,
count(distinct d_date) as ndays_we,
count(distinct monthc) as nmonths_we,
count(distinct channel_id) as nchannel_we
from tr01
where year="2002" and is_wend="weekend"
group by customer_id ')
q102 <- merge(q97, q101, by=c("customer_id"), all.x=TRUE)
require(party)
t1 <- ctree(q102$amt2003 ~ q102$ndays+q102$amt_sale+ q102$nitems + q102$nchannel +
q102$maxdsale + q102$nclass + q102$ndept+ q102$nsubclass + q102$DaysfromLastP + q102$sale_3m
+ q102$ntrx_3m + q102$nitem_3m + q102$ndays_3m + q102$nmonths_3m + q102$nchannel_3m +
q102$sale_6m + q102$ntrx_6m + q102$nitem_6m + q102$ndays_6m + q102$nmonths_6m +
q102$nchannel_6m + q102$sale_we + q102$ntrx_we + q102$nitem_we + q102$ndays_we +
q102$nmonths_we + q102$nchannel_we , data=q102, controls =ctree_control(maxdepth = 3,
mincriterion = 0.95, minsplit = 50) )
plot(t1)
r1 <- lm(q102$amt2003 ~ q102$ndays+q102$amt_sale+ q102$nitems + q102$nchannel +
q102$maxdsale + q102$nclass + q102$ndept+ q102$nsubclass + q102$DaysfromLastP + q102$sale_3m
+ q102$ntrx_3m + q102$nitem_3m + q102$ndays_3m + q102$nmonths_3m + q102$nchannel_3m +
q102$sale_6m + q102$ntrx_6m + q102$nitem_6m + q102$ndays_6m + q102$nmonths_6m +
q102$nchannel_6m + q102$sale_we + q102$ntrx_we + q102$nitem_we + q102$ndays_we +
q102$nmonths_we + q102$nchannel_we , data=q102 )
plot(q102$ndays_6m, q102$ndays, col=brewer.pal(9, "Blues")[q102$amt2003grp] , pch=20)
#----------- 2차원 분포 확인 -------------------
# 1년 중 구매일수가 많으면 최근 6개월중 구매일수도
# 많은지
require(hdrcde)
# ndays_6m NULL 처리 -- 6개월중 구매가 없는 고객 경우
q102$ndays_6m <- ifelse(is.na(q102$ndays_6m), 0, q102$ndays_6m )
hdr.boxplot.2d(q102$ndays_6m, q102$ndays)
# 밀집도 등고선 구간 지정
hdr.boxplot.2d(q102$ndays_6m, q102$ndays, prob = c(50, 75, 90) )
# 회귀선 추가
abline(lm(q102$ndays~q102$ndays_6m), col="red")
#------------------
data(mtcars)
mtcars$gear1<-ifelse(mtcars$gear>3,"H", "L")
mtcars$gear1<-as.factor(mtcars$gear1)
y <- c(1:nrow(mtcars))
mtcars$samplingkey <-sample(2,length(y),replace=TRUE,prob=c(0.7,0.3))
attach(mtcars)
train1 <- mtcars[samplingkey==1,]
test1 <- mtcars[samplingkey==2,]
nrow(test1)
require(party)
attach(mtcars)
ct1 <- ctree(gear1 ~hp+wt+vs, data=train1)
cfm1 <- table(predict(ct1, newdata= test1),test1$gear1)
class(cfm1)
accuracyct1 <- (cfm1[1,1] + cfm1[2,2] ) / nrow(test1) *100
> cfm1
H L
H 6 1
L 0 1
> accuracyct1
[1] 87.5
#----------------------[confusion matrix] 연습 ---------
q102$is_hamt2003 <- as.factor(ifelse(q102$amt2003>300, "H", "L"))
y <- c(1:nrow(q102))
q102$samplingkey <-sample(2,length(y),replace=TRUE,prob=c(0.7,0.3))
tq102 <- q102[q102$samplingkey==1,]
test1 <- q102[q102$samplingkey==2,]
test2 <- head(test1,30)
t1 <- ctree(is_hamt2003 ~ ndays+amt_sale+ nitems + nchannel + maxdsale + nclass + ndept+
nsubclass + DaysfromLastP + sale_3m + ntrx_3m + nitem_3m + ndays_3m + nmonths_3m +
nchannel_3m + sale_6m + ntrx_6m + nitem_6m + ndays_6m + nmonths_6m + nchannel_6m + sale_we
+ ntrx_we + nitem_we + ndays_we + nmonths_we + nchannel_we , data=tq102, controls =ctree_control
(maxdepth = 3, mincriterion = 0.95, minsplit = 50) )
plot(t1)
cfm1 <- table(predict(t1, newdata= test1), test1$is_hamt2003 )
H L
H 561 138
L 328 2330
accuracyct1 <- (cfm1[1,1] + cfm1[2,2] ) / nrow(test1) *100
#------------------------------------
t2 <- ctree(amt2003 ~ ndays+amt_sale+ nitems + nchannel + maxdsale + nclass + ndept+ nsubclass +
DaysfromLastP + sale_3m + ntrx_3m + nitem_3m + ndays_3m + nmonths_3m + nchannel_3m +
sale_6m + ntrx_6m + nitem_6m + ndays_6m + nmonths_6m + nchannel_6m + sale_we + ntrx_we +
nitem_we + ndays_we + nmonths_we + nchannel_we , data=tq102, controls =ctree_control(maxdepth =
3, mincriterion = 0.95, minsplit = 50) )
plot(t2)
predictedamt2003 <- as.data.frame(predict(t2, newdata= test1) )
test1$predictedamt2003 <- predictedamt2003$amt2003
test1$is_error_high <- ifelse(abs(test1$amt2003 - test1$predictedamt2003) <100, 1, 0)
# test1$predictionerror <- abs(test1$amt2003 - test1$predictedamt2003)
test1$predictionerror <- abs(test1$amt2003 -test1$predictedamt2003) / (test1$amt2003+1)
mean(test1$predictionerror)
#----------- treating predictors with NULLs ---------
q102$sale_3m[is.na(q102$sale_3m)] <- 0
q102$ntrx_3m[is.na(q102$ntrx_3m)] <- 0
q102$nitem_3m[is.na(q102$nitem_3m)] <- 0
q102$ndays_3m[is.na(q102$ndays_3m)] <- 0
q102$nmonths_3m[is.na(q102$nmonths_3m)] <- 0
q102$nchannel_3m[is.na(q102$nchannel_3m)] <- 0
q102$sale_6m[is.na(q102$sale_6m)] <- 0
q102$ntrx_6m[is.na(q102$ntrx_6m)] <- 0
q102$nitem_6m[is.na(q102$nitem_6m)] <- 0
q102$ndays_6m[is.na(q102$ndays_6m)] <- 0
q102$nmonths_6m[is.na(q102$nmonths_6m)] <- 0
q102$nchannel_6m[is.na(q102$nchannel_6m)] <- 0
q102$sale_we[is.na(q102$sale_we)] <- 0
q102$ntrx_we[is.na(q102$ntrx_we)] <- 0
q102$nitem_we[is.na(q102$nitem_we)] <- 0
q102$ndays_we[is.na(q102$ndays_we)] <- 0
q102$nmonths_we[is.na(q102$nmonths_we)] <- 0
q102$nchannel_we[is.na(q102$nchannel_we)] <- 0
#------------------------
[1] "customer_id" "amt2003"
[3] "ndays" "amt_sale"
[5] "nchannel" "nitems"
[7] "maxdsale" "amt2003grp"
[9] "nclass" "ndept"
[11] "nsubclass" "DaysfromLastP"
[13] "sale_3m" "ntrx_3m"
[15] "nitem_3m" "ndays_3m"
[17] "nmonths_3m" "nchannel_3m"
[19] "sale_6m" "ntrx_6m"
[21] "nitem_6m" "ndays_6m"
[23] "nmonths_6m" "nchannel_6m"
[25] "sale_we" "ntrx_we"
[27] "nitem_we" "ndays_we"
[29] "nmonths_we" "nchannel_we"
[31] "is_hamt2003" "samplingkey"
* 전체구매건수중 오프라인에서 구매한 건수는? -- 비율
q111 <- sqldf('select customer_id,
count(distinct sk_transaction_id) as ntrx_rt
from tr01
where year="2002" and channel_id="RT"
group by customer_id')
q112 <- merge(q102, q111, by=c("customer_id"), all.x=TRUE)
length(q112$ntrx_rt[is.na(q112$ntrx_rt)])
q112$ntrx_rt[is.na(q112$ntrx_rt)] <- 0
# * 전체구매건수중 온라인 채널에서 구매한 건수는? -- 비율
q113 <- sqldf('select customer_id,
count(distinct sk_transaction_id) as ntrx_on
from tr01
where year="2002" and channel_id=on"
group by customer_id')
q114 <- merge(q112, q113, by=c("customer_id"), all.x=TRUE)
length(q114$ntrx_on[is.na(q114$ntrx_on)])
q114$ntrx_on[is.na(q114$ntrx_on)] <- 0
# 하루구매액이 얼마 이상인 건들을 기준으로 (일 day) 구매일수 추가변수를
# 만들어 볼까?
# 먼저, 한 고객이 하루에 산 금액이 얼마인지? 분포 확인
q115 <- sqldf('select customer_id, d_date,
sum(t_item_amt_sale) as dsale
from tr01
where year="2002"
group by customer_id, d_date ')
# 150불이상 구매일수
q116 <- sqldf('select customer_id,
count(distinct d_date) as ndays_hamt
from q115
where dsale >= 150
group by customer_id')
q117 <- merge(q114, q116, by=c("customer_id"), all.x=TRUE)
q117$ndays_hamt[is.na(q117$ndays_hamt)] <- 0
length(q117$ndays_hamt[is.na(q117$ndays_hamt)])
hdr.boxplot.2d(q117$ndays, q117$ndays_hamt)
# 회귀선 추가
abline(lm(q117$ndays_hamt~q117$ndays), col="red")
#--------------------------------------
# class 를 기준으로 상품을 프로파일
#---------
q201 <- sqldf('select distinct sk_class_id,
class_name
from sku01 ')
q202 <- sqldf('select distinct sk_class_id, class_name,
count(distinct sk_subclass_id) as nsubclass
from sku01
group by sk_class_id ')
# 최대, 최소 값을 확인
range(q202$nsubclass)
# nsubclass를 기준으로 큰 것 부터 3개만
head(q202[order(-q202$nsubclass),],3)
# 클래스별 구매고객수 산출
q203 <- sqldf('select b.sk_class_id,
count(distinct a.customer_id) as ncust
from tr01 as a left join sku01 as b
on a.sk_item_id = b.sk_item_id
group by b.sk_class_id')
'R 데이터 분석' 카테고리의 다른 글
movie prediction exmple (0) | 2015.07.08 |
---|---|
predictive modeling :: algorithms and interpretation (0) | 2015.07.08 |
:: retail data preparation and EDA 연습 과제 (0) | 2015.07.06 |
0702 빅Labor .... 주제 (0) | 2015.07.03 |
retail example (0) | 2015.07.02 |