Implementing a Recommendation Service for Retail -- Drill using R
20190521 ~ 20190522
[전용준 리비젼 recsys r]
Sample Data:
Script :
- retail_recsys_YONG_revision_script20190425.txt
- _review_R_EDA_scrpt_201910.txt
- recsys_YONG_revision_R_script20191023.txt
- recsys_YONG_revision_R_script20191112_ABC.txt
- recsys_YONG_revision_R_script20191112_ABCD_FX.txt
AGENDA >>
* EDA before building recommender
* Association rule discovery
* Clustering
- item clustering
- customer clustering
* Collaborative filtering
(Customer analysis through data wrangling and visualization -- a simple example)
[CRMAJU2018] R data analysis - exploratory customer segmentation :: using bookstore data
- Grid-based segmentation
- Cluster-based segmentation
Link >> http://blog.daum.net/revisioncrm/405
Cubic Clustering Criterion (CCC) index
==> Reference link : https://rpubs.com/cardiomoon/249084
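As a minimal sketch, the CCC can be computed in R with the NbClust package; the toy matrix `m` below is illustrative, not from the original script:
# install.packages("NbClust")
library(NbClust)
set.seed(1234)
m <- matrix(rnorm(200), ncol = 4)   # toy stand-in for scaled customer features
res <- NbClust(m, distance = "euclidean",
               min.nc = 2, max.nc = 8,
               method = "kmeans", index = "ccc")
res$All.index   # CCC value for each candidate number of clusters
res$Best.nc     # number of clusters suggested by the CCC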
(A few additional slides reviewed)
including .... [ recommendation: map of spectrum ]
Kdata_유통빅데이터추천_전용준_리비젼컨설팅_머신러닝_20190521_sum.pdf
###########
[Exercise 1] Find the number of customers who made at least one purchase in the
late-night window (from 10 PM up to 3 AM) between February 15 and February 19, 2011.
[Exercise 2] For the first half of 2011, compute the average number of purchases
per customer for each month.
[Exercise 3] From the 2011 purchases, find the IDs of the 5 customers with the
largest total purchase amounts. (Solution sketches follow below.)
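Hedged solution sketches, assuming the Online Retail data frame `mData2` used later in this script (InvoiceDate formatted "YYYY-MM-DD HH:MM:SS"; columns InvoiceNo, CustomerID, Quantity, UnitPrice; adjust names if your data differs):
day <- substr(as.character(mData2$InvoiceDate), 1, 10)
hh  <- as.integer(substr(as.character(mData2$InvoiceDate), 12, 13))
# [Exercise 1] customers buying in the window [22:00, 03:00), Feb 15-19, 2011
nite <- mData2[day >= "2011-02-15" & day <= "2011-02-19" & (hh >= 22 | hh < 3), ]
length(unique(na.omit(nite$CustomerID)))
# [Exercise 2] average number of invoices per customer, by month, H1 2011
h1 <- mData2[day >= "2011-01-01" & day <= "2011-06-30", ]
h1$month <- substr(as.character(h1$InvoiceDate), 1, 7)
agg <- aggregate(h1$InvoiceNo, by = list(h1$month, h1$CustomerID),
                 FUN = function(x) length(unique(x)))
names(agg) <- c("month", "CustomerID", "n_invoice")
aggregate(agg$n_invoice, by = list(month = agg$month), FUN = mean)
# [Exercise 3] top 5 customers by total purchase amount in 2011
y2011 <- mData2[substr(day, 1, 4) == "2011", ]
y2011$amount <- y2011$Quantity * y2011$UnitPrice
tot <- aggregate(y2011$amount, by = list(y2011$CustomerID), FUN = sum)
names(tot) <- c("CustomerID", "total_amount")
head(tot[order(-tot$total_amount), "CustomerID"], 5)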
###############
# [Hands-on] --------- !!!
# What if we implemented 신나게몰닷컴's recommendation logic using only gender and age?
1) Check the distributions first - gender and age group, each separately
barplot(table(user$gender))
barplot(table(user$generation))
2) Attach gender and age group to the trx table
dd01 <- merge(d03, user[,c('user_id', 'gender','generation')], by="user_id", all.x=T)
head(dd01)
3) Aggregate by gender x age group -- purchase counts per item for each gender-age segment
dd02 <- aggregate(dd01$user_id, by=list(dd01$gender, dd01$generation, dd01$item_nm),
                  FUN=length)
names(dd02) <- c('gender', 'generation', 'item_nm','cnt_trx')
dd02
# sort by gender, then age group, then purchase count in descending order
dd03 <- dd02[order(dd02[,1], dd02[,2], -dd02[,4]),]
head(dd03)
# denominator: total number of transaction rows per gender-age segment
dd04 <- aggregate(dd01$user_id, by=list(dd01$gender, dd01$generation),
                  FUN=length)
names(dd04) <- c('gender', 'generation','Freq')
head(dd04)
dd05 <- merge(dd03, dd04, by=c('gender','generation'), all.x=T)
dd05
dd05$r_cnt_trx <- dd05$cnt_trx / dd05$Freq
# aggregation complete: per-item purchase share within each gender-age segment
head(dd05,20)
head(user,3)
# When a specific customer arrives (e.g., customer 1), say a male teen: first
# compute the list of items that customer has bought, then from the male-teen
# item ranking in dd05 extract only the items the customer has never bought
# (a sketch follows below)
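A minimal sketch of that lookup, assuming customer 1 exists in dd01 and using the tables built above:
target <- dd01[dd01$user_id == 1, ]
seg    <- dd05[dd05$gender == target$gender[1] &
               dd05$generation == target$generation[1], ]
seg    <- seg[order(-seg$r_cnt_trx), ]   # the segment's item ranking
bought <- unique(target$item_nm)         # items this customer already bought
recs   <- seg[!(seg$item_nm %in% bought), ]
head(recs$item_nm, 5)   # top-5 candidate items for this customer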
######## warm-up: regression with a keras DNN (cars data) ---------
# install Anaconda and TensorFlow beforehand!
# install.packages("keras")
# install.packages("tensorflow")
library(keras)
cars<- read.csv("https://raw.githubusercontent.com/MGCodesandStats/datasets/master/cars.csv")
# Max-Min normalization to [0, 1]
normalize <- function(x) {
  return((x - min(x)) / (max(x) - min(x)))
}
maxmindf <- as.data.frame(lapply(cars, normalize))
# Random sample indexes for an 80/20 train/test split
set.seed(123)   # for reproducibility (not in the original script)
train_index <- sample(1:nrow(maxmindf), 0.8 * nrow(maxmindf))
test_index  <- setdiff(1:nrow(maxmindf), train_index)
# Build X_train, y_train, X_test, y_test; select the target column by name
# rather than by a hard-coded position, so "sales" never leaks into the inputs
feat <- setdiff(names(maxmindf), "sales")
X_train <- as.matrix(maxmindf[train_index, feat])
y_train <- as.matrix(maxmindf[train_index, "sales"])
X_test  <- as.matrix(maxmindf[test_index, feat])
y_test  <- as.matrix(maxmindf[test_index, "sales"])
model <- keras_model_sequential()
model %>%
  layer_dense(units = 12, activation = 'relu', kernel_initializer = 'RandomNormal',
              input_shape = ncol(X_train)) %>%   # input layer sized to the feature count
  layer_dense(units = 8, activation = 'relu') %>%
  layer_dense(units = 4, activation = 'relu') %>%
  layer_dense(units = 2, activation = 'relu') %>%
  layer_dense(units = 1, activation = 'linear')   # single linear output for regression
summary(model)
model %>% compile(
  loss = 'mean_squared_error',
  optimizer = 'adam',
  metrics = c('mae')
)
# training
history <- model %>% fit(
  X_train, y_train,
  epochs = 500, batch_size = 50,
  validation_split = 0.2
)
model %>% evaluate(X_test, y_test)
pred <- data.frame(y = predict(model, X_test))
# mean percentage deviation of predictions from the held-out target
# (finite-value guard: min-max scaling puts a 0 at the minimum of y_test)
deviation <- (pred$y - y_test) / y_test
mean(deviation[is.finite(deviation)]) * 100
plot(y_test, pred$y, cex = 0.5)   # predicted vs. actual (normalized) sales
######## recommendation using keras DNN ---------
## data prep: split at the end of September 2011 (input period vs. output period)
mDataK1 <- mData2[substr(as.character(mData2$InvoiceDate),1,10) <= "2011-09-30",]
mDataK2 <- mData2[substr(as.character(mData2$InvoiceDate),1,10) >  "2011-09-30",]
# aggregate the input period
# purchase counts per customer per item
# (Top20Items: character vector of the 20 most frequent Descriptions, built earlier)
mDataK1Top20 <- mDataK1[mDataK1$Description %in% Top20Items,]
aggcustitem_cnt <- aggregate(mDataK1Top20$InvoiceNo, by=list(mDataK1Top20$Description,
                                                             mDataK1Top20$CustomerID),
                             FUN=length)
names(aggcustitem_cnt) <- c("Description","CustomerID", "InvoiceCount")
head(aggcustitem_cnt)
# pivoting: one row per customer, one column per item
# (fun.aggregate = mean is effectively a no-op: each pair is already unique)
library(reshape2)
custitem_Top2cnt <- dcast(data = aggcustitem_cnt, formula = CustomerID ~ Description,
                          fun.aggregate = mean,
                          value.var = "InvoiceCount")
head(custitem_Top2cnt)
str(custitem_Top2cnt)
# replace NA values with 0 (customer never bought that item)
custitem_Top2cnt[is.na(custitem_Top2cnt)] <- 0
head(custitem_Top2cnt)
str(custitem_Top2cnt)
dfInput <- custitem_Top2cnt
# aggregate the output period
# purchase counts per customer per item
mDataK2Top20 <- mDataK2[mDataK2$Description %in% Top20Items,]
aggcustitem_cnt <- aggregate(mDataK2Top20$InvoiceNo, by=list(mDataK2Top20$Description,
                                                             mDataK2Top20$CustomerID),
                             FUN=length)
names(aggcustitem_cnt) <- c("Description","CustomerID", "InvoiceCount")
head(aggcustitem_cnt)
# pivoting
library(reshape2)
custitem_Top2cnt1 <- dcast(data = aggcustitem_cnt, formula = CustomerID ~ Description,
                           fun.aggregate = mean,
                           value.var = "InvoiceCount")
# replace NA values with 0
custitem_Top2cnt1[is.na(custitem_Top2cnt1)] <- 0
dfOutput <- custitem_Top2cnt1
head(dfInput,2)
head(dfOutput,2)
# rename: if_* = input-period counts, of_* = output-period counts
# (this assumes all 20 items occur in both periods, so dcast produces the same
#  alphabetical column order and if_i / of_i refer to the same item)
names(dfInput) <- c('CustomerID', paste0('if_', as.character(1:20) ))
names(dfOutput) <- c('CustomerID', paste0('of_', as.character(1:20) ))
dfKrs <- merge(dfInput, dfOutput, by='CustomerID', all.x=T)
dfKrs[is.na(dfKrs)] <- 0   # customers absent from the output period bought nothing
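A quick sanity check on the resulting modeling table can help here (a sketch; the dimensions assume exactly 20 items on each side, as built above):
dim(dfKrs)                 # expect: n_customers x (1 + 20 inputs + 20 outputs)
head(colSums(dfKrs[,-1]))  # per-item purchase totals across the two periods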
#-------------
# install Anaconda and TensorFlow beforehand!
# install.packages("keras")
# install.packages("tensorflow")
library(keras)
# Max-Min normalization to [0, 1] (same helper as above)
normalize <- function(x) {
  return((x - min(x)) / (max(x) - min(x)))
}
maxmindf <- as.data.frame(lapply(dfKrs[,2:41], normalize))
# Random sample indexes for an 80/20 train/test split
set.seed(123)   # for reproducibility (not in the original script)
train_index <- sample(1:nrow(maxmindf), 0.8 * nrow(maxmindf))
test_index  <- setdiff(1:nrow(maxmindf), train_index)
# Build X_train, y_train, X_test, y_test:
# columns 1:20 are the if_* inputs, columns 21:40 the of_* targets
X_train <- as.matrix(maxmindf[train_index, c(1:20)])
y_train <- as.matrix(maxmindf[train_index, c(21:40)])
X_test  <- as.matrix(maxmindf[test_index, c(1:20)])
y_test  <- as.matrix(maxmindf[test_index, c(21:40)])
model <- keras_model_sequential()
model %>%
  # 20 item inputs -> narrowing hidden layers -> 20 item outputs
  layer_dense(units = 20, activation = 'relu', kernel_initializer='RandomNormal', input_shape = c(20)) %>%
  layer_dense(units = 8, activation = 'relu') %>%
  layer_dense(units = 4, activation = 'relu') %>%
  # layer_dense(units = 2, activation = 'relu') %>%
  layer_dense(units = 20, activation = 'linear')
summary(model)
model %>% compile(
  loss = 'mean_squared_error',
  optimizer = 'adam',
  metrics = c('mae')
)
# training
history <- model %>% fit(
  X_train, y_train,
  epochs = 200, batch_size = 16,
  validation_split = 0.1
)
model %>% evaluate(X_test, y_test)
pred <- data.frame(predict(model, X_test))   # X_test is already a matrix
# predicted vs. actual next-period counts, item by item
plot(jitter(pred$X1), jitter(y_test[,1]))
cor(pred$X1, y_test[,1])
plot(jitter(pred$X2), jitter(y_test[,2]))
cor(pred$X2, y_test[,2])
plot(jitter(pred$X3), jitter(y_test[,3]))
cor(pred$X3, y_test[,3])
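The same check can be swept across all 20 items at once; a compact sketch (the pred columns X1..X20 follow predict()'s default data.frame naming):
sapply(1:20, function(i) cor(pred[[i]], y_test[, i]))   # per-item correlations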
# Deriving the variable list - a reference case for further elaboration
# process all adjacent item pairs in one loop
# (fitc: the k-means fit from the earlier clustering step; columns 2:21 of
#  custitem_Top2cnt hold the 20 item counts, so i must stop at 20)
for (i in 2:20){
  plot(jitter(custitem_Top2cnt[,i]),
       jitter(custitem_Top2cnt[,i+1]),
       main = as.character(i),
       col=fitc$cluster, pch=19, cex=0.5)
}
# wrap it up as a function (valid for i in 2:20, as above)
clstFeat_sctter <- function(i){
  plot(jitter(custitem_Top2cnt[,i]),
       jitter(custitem_Top2cnt[,i+1]),
       main = as.character(i),
       col=fitc$cluster, pch=19, cex=0.5)
}
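A quick usage check for the wrapped function:
clstFeat_sctter(2)                   # a single panel: item 2 vs. item 3
for (i in 2:20) clstFeat_sctter(i)   # reproduces the full sweep above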