>RE::VISION CRM

R 데이터 분석

[kdata 2019 recsys 0030] retail recommender using R[전용준 리비젼 recsys r]

YONG_X 2019. 5. 18. 16:08

유통업의 추천서비스 구현 -- Drill using R


20190521  ~

20190522


[전용준 리비젼 recsys r]

[전용준 리비젼 recsys r]



Sample Data:


mDataAR.csv



Script : 



retail_recsys_YONG_revision_script20190425.txt



_review_R_EDA_scrpt_201910.txt



recsys_YONG_revision_R_script20191023.txt


##############


recsys_YONG_revision_R_script20191112_ABC.txt


##########


recsys_YONG_revision_R_script20191112_ABCD_FX.txt







AGENDA >>

* EDA before building recommender


* Association rule discovery


* Clustering

- item clustering

- customer clustering


* Collaborative filtering




(데이터 가공과 시각화를 통한 고객분석 -- Simple 예제)


[CRMAJU2018] R데이터분석 - 탐색적 고객세분화 :: 서점 데이터 활용

- Grid 기반의 세분화

- Cluster기반의 세분화

링크 >>  http://blog.daum.net/revisioncrm/405


Cubic Clustering Criterion (CCC) 지표

==> 참고 링크 : https://rpubs.com/cardiomoon/249084



(추가로 살펴본 슬라이드 몇장)

including .... [ 추천: map of spectrum ]

Kdata_유통빅데이터추천_전용준_리비젼컨설팅_머신러닝_20190521_sum.pdf




###########

[연습문제1] 2011년 2월 15일부터 2월 19일까지 밤 10시 이후 심야시간대

(새벽 3시까지)에 구매건이 발생한 적이 있는 고객의 수를 구하라


[연습문제2] 2011년 상반기중에 월별 고객별 평균 구매건수를 구하라


[연습문제3] 2011년 구매건 중 구매 총금액이 가장 큰 상위 5명 고객의 번호를 구하라


mDatas <- mData2[substr(as.character(mData2$InvoiceDate),1,10)>="2011-01-01",]
mDatas <- mDatas[,c('CustomerID', 'UnitPrice', 'Quantity')]
mDatas$subtot <- mDatas$UnitPrice * mDatas$Quantity
head(mDatas)
aggitems <-aggregate(mDatas$subtot, by=list(mDatas$CustomerID), 
    FUN=sum)
head(aggitems)
names(aggitems) <- c("CustomerID", "TotAMT")
head(aggitems[order(-aggitems$TotAMT),],5)


#---- randomForest for Prediction ----

aggitem8_4rf <- aggitem8
aggitem8_4rf$Isclst2 <- as.factor(ifelse(aggitem8_4rf$clstKey==2,'Y','N' ))

rf1 <- randomForest(Isclst2 ~ InvoiceCount + MeanPriceSale +
  + InvoiceCountUKRatio + IsCakeCases + IsPaperParasol + IsRed + IsBlue, data=aggitem8_4rf, ntree=1000, 
   do.trace=20, importance=T)
plot(rf1)
varImpPlot(rf1)




###############


# [실습] --------- !!! 

# 신나게몰닷컴의 추천로직을 성별, 연령만을 사용하여 구현해본다면?


# 1) 분포부터 확인 - 성별, 연령 각각

barplot(table((user$gender)))

barplot(table((user$generation)))


# 2) 성별, 연령을 trx 테이블에 붙여주고

dd01 <- merge(d03, user[,c('user_id', 'gender','generation')], by="user_id", all.x=T)

head(dd01)


# 3) 성별X연령대 별로 집계 -- 성별 연령대별 품목별 구매건수


dd02 <- aggregate(dd01$user_id, by=list(dd01$gender, dd01$generation, dd01$item_nm), 

   FUN=length)

names(dd02) <- c('gender', 'generation', 'item_nm','cnt_trx')

dd02


dd03 <- dd02[order(dd02[,1], dd02[,2], -dd02[,4]),]

head(dd03)


dd04 <- aggregate(dd01$user_id, by=list(dd01$gender, dd01$generation), 

   FUN=length)

names(dd04) <- c('gender', 'generation','Freq')

head(dd04)


dd05 <- merge(dd03, dd04, by=c('gender','generation'), all.x=T)

dd05

dd05$r_cnt_trx <- dd05$cnt_trx / dd05$Freq


# 집계가 끝난 성별 연령대별 비율

head(dd05,20)


head(user,3)


# 특정 고객(예: 1번 고객)이 방문하면, 그 고객이 남자 10대인 경우 먼저 해당 고객의

# 구매 품목 목록을 구한 뒤, dd05의 남자 10대 품목별 순위에서 그 고객이 구매한 적

# 없는 품목만 추출한다





######## recommendation using keras DNN ---------


# install anaconda and tensorflow beforehand!

# install.packages("keras")

install.packages("tensorflow")


library(keras)


cars<- read.csv("https://raw.githubusercontent.com/MGCodesandStats/datasets/master/cars.csv")


#Max-Min Normalization

normalize <- function(x) {

  return ((x - min(x)) / (max(x) - min(x)))

}

maxmindf <- as.data.frame(lapply(cars, normalize))

attach(maxmindf)


# Random sample indexes

train_index <- sample(1:nrow(maxmindf), 0.8 * nrow(maxmindf))

test_index <- setdiff(1:nrow(maxmindf), train_index)

# Build X_train, y_train, X_test, y_test

X_train <- as.matrix(maxmindf[train_index, -15])

y_train <- as.matrix(maxmindf[train_index, "sales"])

X_test <- as.matrix(maxmindf[test_index, -15])

y_test <- as.matrix(maxmindf[test_index, "sales"])


model <- keras_model_sequential() 

model %>% 

  layer_dense(units = 12, activation = 'relu', kernel_initializer='RandomNormal', input_shape = c(6)) %>% 

  layer_dense(units = 8, activation = 'relu') %>%

  layer_dense(units = 4, activation = 'relu') %>%

  layer_dense(units = 2, activation = 'relu') %>%

  layer_dense(units = 1, activation = 'linear')

summary(model)


model %>% compile(

  loss = 'mean_squared_error',

  optimizer = 'adam',

  metrics = c('mae')

)


# training

history <- model %>% fit(

  X_train, y_train, 

  epochs = 500, batch_size = 50, 

  validation_split = 0.2

)



model %>% evaluate(X_test, y_test)


pred <- data.frame(y = predict(model, as.matrix(X_test)))

df<-data.frame(pred,X_test)

attach(df)

deviation=((pred-sales)/sales)

mean(deviation$y)*100


plot(df$sales, pred$y, cex=0.5)






######## recommendation using keras DNN ---------


## data prep

mDataK1 <- mData2[substr(as.character(mData2$InvoiceDate),1,10)<="2011-09-31",]


mDataK2 <- mData2[substr(as.character(mData2$InvoiceDate),1,10)>"2011-09-31",]



# 입력기간 집계

# 고객별 품목별 구매건수 산출

mDataK1Top20 <- mDataK1[mDataK1$Description %in% Top20Items,]

aggcustitem_cnt <-aggregate(mDataK1Top20$InvoiceNo, by=list(mDataK1Top20$Description, 

   mDataK1Top20$CustomerID), 

   FUN=length)

names(aggcustitem_cnt) <- c("Description","CustomerID", "InvoiceCount")

head(aggcustitem_cnt)



# pivoting

library(reshape2)

custitem_Top2cnt <- dcast(data = aggcustitem_cnt, formula = CustomerID ~ Description, 

    fun.aggregate = mean, 

    value.var = "InvoiceCount")


head(custitem_Top2cnt)

str(custitem_Top2cnt)


# NA값을 0으로 대체

custitem_Top2cnt[is.na(custitem_Top2cnt)] <- 0


head(custitem_Top2cnt)

str(custitem_Top2cnt)



dfInput <- custitem_Top2cnt 


# 출력기간 집계

# 고객별 품목별 구매건수 산출

mDataK2Top20 <- mDataK2[mDataK2$Description %in% Top20Items,]

aggcustitem_cnt <-aggregate(mDataK2Top20$InvoiceNo, by=list(mDataK2Top20$Description, 

   mDataK2Top20$CustomerID), 

   FUN=length)

names(aggcustitem_cnt) <- c("Description","CustomerID", "InvoiceCount")

head(aggcustitem_cnt)



# pivoting

library(reshape2)

custitem_Top2cnt1 <- dcast(data = aggcustitem_cnt, formula = CustomerID ~ Description, 

    fun.aggregate = mean, 

    value.var = "InvoiceCount")


# NA값을 0으로 대체

custitem_Top2cnt1[is.na(custitem_Top2cnt1)] <- 0


dfOutput <- custitem_Top2cnt1 



head(dfInput,2)

head(dfOutput,2)


names(dfInput) <- c('CustomerID', paste0('if_', as.character(1:20) ))

names(dfOutput) <- c('CustomerID', paste0('of_', as.character(1:20) ))



dfKrs <- merge(dfInput, dfOutput, by='CustomerID', all.x=T)

dfKrs[is.na(dfKrs)] <- 0





#-------------

# install anaconda and tensorflow beforehand!

# install.packages("keras")

install.packages("tensorflow")


library(keras)



#Max-Min Normalization

normalize <- function(x) {

  return ((x - min(x)) / (max(x) - min(x)))

}

maxmindf <- as.data.frame(lapply(dfKrs[,2:41], normalize))

attach(maxmindf)


# Random sample indexes

train_index <- sample(1:nrow(maxmindf), 0.8 * nrow(maxmindf))

test_index <- setdiff(1:nrow(maxmindf), train_index)


# Build X_train, y_train, X_test, y_test

X_train <- as.matrix(maxmindf[train_index, c(1:20)])

y_train <- as.matrix(maxmindf[train_index, c(21:40)])

X_test <- as.matrix(maxmindf[test_index, c(1:20)])

y_test <- as.matrix(maxmindf[test_index,c(21:40)])


model <- keras_model_sequential() 

model %>% 

  layer_dense(units = 20, activation = 'relu', kernel_initializer='RandomNormal', input_shape = c(20)) %>% 

  layer_dense(units = 8, activation = 'relu') %>%

  layer_dense(units = 4, activation = 'relu') %>%

#   layer_dense(units = 2, activation = 'relu') %>%

  layer_dense(units = 20, activation = 'linear')

summary(model)


model %>% compile(

  loss = 'mean_squared_error',

  optimizer = 'adam',

  metrics = c('mae')

)


# training

history <- model %>% fit(

  X_train, y_train, 

  epochs = 200, batch_size = 16, 

  validation_split = 0.1

)



model %>% evaluate(X_test, y_test)


pred <- data.frame(predict(model, as.matrix(X_test)))


plot(jitter(pred$X1), jitter(y_test[,1]))

cor(pred$X1, y_test[,1])

plot(jitter(pred$X2), jitter(y_test[,2]))

cor(pred$X2, y_test[,2])

plot(jitter(pred$X3), jitter(y_test[,3]))

cor(pred$X3, y_test[,3])




# 변수 목록 도출 - 구체화 참조 사례 


영화_리비젼_전용준_20171115.pptx




# 전체를 반복문으로 일괄처리 

for (i in 2:21){

  plot(jitter(custitem_Top2cnt[,i]), 

    jitter(custitem_Top2cnt[,i+1]),  

    main = as.character(i),

    col=fitc$cluster, pch=19, cex=0.5)

}


# 함수형식으로 포장

clstFeat_sctter <- function(i){

  plot(jitter(custitem_Top2cnt[,i]), 

    jitter(custitem_Top2cnt[,i+1]),  

    main = as.character(i),

    col=fitc$cluster, pch=19, cex=0.5)

   }



영화_리비젼_전용준_20171115.pptx
1.24MB
mDataAR.csv
1.89MB
recsys_YONG_revision_R_script20191112_ABCD.txt
0.04MB
recsys_YONG_revision_R_script20191112_ABCD_F.txt
0.04MB
Kdata_유통빅데이터추천_전용준_리비젼컨설팅_머신러닝_20190521_sum.pdf
1.01MB
recsys_YONG_revision_R_script20191112_ABC.txt
0.04MB
recsys_YONG_revision_R_script20191112_ABCD_FX.txt
0.04MB
recsys_YONG_revision_R_script20191023.txt
0.03MB
retail_recsys_YONG_revision_script20190425.txt
0.03MB
_review_R_EDA_scrpt_201910.txt
0.09MB
전용준_리비젼_유통추천_머신러닝_201905_using_R.png
0.68MB