# >RE::VISION CRM

# R 데이터 분석

# [KDBRBD] retail data analysis practice

# YONG_X 2018. 8. 4. 09:57

# UCI online retail data set link ::

# https://archive.ics.uci.edu/ml/datasets/online+retail 


### recomm MBA



# Work from the course data folder; the relative file names below resolve
# against it.
setwd("C:/YONG/w20180728/")

# Read the raw customer/attribute file and give its two columns short names.
a1 <- setNames(read.csv("custAtt.csv"), c("cust", "att"))

# Re-export with the new header so the file can later be read by column name.
write.csv(a1, "ca1.csv")


# MBA --------- 

# EDA: bar chart of attribute counts, most frequent attribute first.
barplot(sort(table(a1$att), decreasing = TRUE))

library(arules)
library(arulesViz)

# Load the exported CSV as transactions in "single" format: one row per
# (transaction id, item) pair, with the id in `cust` and the item in `att`.
b01 <- read.transactions(
  "ca1.csv",
  format = "single",
  sep = ",",
  cols = c("cust", "att"),
  rm.duplicates = TRUE
)

# Horizontal bar chart of the six most frequent items (absolute counts).
itemFrequencyPlot(
  b01,
  topN = 6,
  type = "absolute",
  horiz = TRUE,
  col = "steelblue3",
  main = "Item frequency, absolute",
  xlab = ""
)


# Frequent itemsets of at least two items with support >= 1%.
itemsets <- apriori(
  b01,
  parameter = list(
    support = 0.01,
    minlen = 2,
    target = "frequent" # to mine for itemsets
  )
)

# Show up to five itemsets with the highest support. head() is used instead
# of `[1:5]` so the call does not fail when fewer than five itemsets exist.
inspect(head(sort(itemsets, by = "support", decreasing = TRUE), n = 5))

# Association rules with at most two items, support >= 3% and
# confidence >= 50%.
rules <- apriori(
  b01,
  parameter = list(
    support = 0.03,
    confidence = 0.5,
    maxlen = 2,
    target = "rules" # to mine for rules
  )
)

# Print numbers with three significant digits from here on.
options(digits = 3)

# Show up to ten rules with the highest confidence. The quality measure is
# spelled out in full rather than relying on partial matching of "conf".
inspect(head(sort(rules, by = "confidence", decreasing = TRUE), n = 10))

# Graph view of the rules whose confidence exceeds 0.5.
plot(subset(rules, subset = confidence > 0.5), method = "graph")





#---- Supermarket Market Basket Analysis example (additional) ----
# https://rpubs.com/sbushmanov/180410

library(arules)
library(arulesViz)

# Supermarket point-of-sale purchase history data
data("Groceries")
str(Groceries)
summary(Groceries)

#--- EDA on data set characteristics --------

# Relative frequencies of the last six items in the item list.
tail(itemFrequency(Groceries))

# The six largest absolute item counts, in ascending order.
tail(sort(itemFrequency(Groceries, type = "absolute")))

# Horizontal bar chart of the six most frequent items (absolute counts).
itemFrequencyPlot(
  Groceries,
  topN = 6,
  type = "absolute",
  horiz = TRUE,
  col = "steelblue3",
  main = "Item frequency, absolute",
  xlab = ""
)


# Frequent itemsets of at least two items with support >= 0.1%.
itemsets <- apriori(
  Groceries,
  parameter = list(
    target = "frequent", # to mine for itemsets
    support = 0.001,
    minlen = 2
  )
)

# Five itemsets with the highest support.
itemsets_by_support <- sort(itemsets, by = "support", decreasing = TRUE)
inspect(itemsets_by_support[1:5])

# Rules of at least two items with support >= 0.2% and confidence >= 70%.
rules <- apriori(
  Groceries,
  parameter = list(
    target = "rules", # to mine for rules
    support = 0.002,
    confidence = 0.7,
    minlen = 2
  )
)

# Print numbers with three significant digits from here on.
options(digits = 3)

# Ten rules with the highest lift.
rules_by_lift <- sort(rules, by = "lift", decreasing = TRUE)
inspect(rules_by_lift[1:10])

# Graph view restricted to rules with confidence above 0.8.
plot(subset(rules, subset = confidence > 0.8), method = "graph")


# Re-mine with looser thresholds but exactly two items per rule
# (minlen = maxlen = 2): support >= 0.1%, confidence >= 20%.
rules <- apriori(
  Groceries,
  parameter = list(
    target = "rules", # to mine for rules
    support = 0.001,
    confidence = 0.2,
    minlen = 2,
    maxlen = 2
  )
)

# Print numbers with three significant digits.
options(digits = 3)

# Ten pairwise rules with the highest lift.
pair_rules_by_lift <- sort(rules, by = "lift", decreasing = TRUE)
inspect(pair_rules_by_lift[1:10])

# Graph views at two lift cut-offs: above 4, then at or above 4.87.
plot(subset(rules, subset = lift > 4), method = "graph")
plot(subset(rules, subset = lift >= 4.87), method = "graph")


# Subsetting: rules whose right-hand side is "bottled beer" and whose
# confidence exceeds 0.7, listed by decreasing lift.
beer_rules <- subset(
  rules,
  subset = rhs %in% "bottled beer" & confidence > 0.7
)
inspect(sort(beer_rules, by = "lift", decreasing = TRUE))

# Extract the transactions matching a rule (i.e. containing specific items):
# among the first 500 transactions, those containing "softener" or "liquor".
first_500 <- Groceries[1:500]
inspect(subset(first_500, items %in% c("softener", "liquor")))




#--- item hierarchy: explore the product classification levels ---

# Roll the items up to their "level2" category and show the six categories
# with the highest relative frequency (ascending order).
# The original line started with a bare `<-` (no assignment target), which is
# a parse error; the expression is now simply evaluated and printed.
Groceries_level2 <- aggregate(Groceries, by = "level2")
tail(sort(itemFrequency(Groceries_level2)))

# Item metadata (labels and hierarchy levels) plus each item's absolute count.
df_item <- as.data.frame(Groceries@itemInfo)
df_item$item_freq <- itemFrequency(Groceries, type = "absolute")

# Items with an absolute frequency of at least 1000.
df_item[df_item$item_freq >= 1000, ]

# Sum of the item counts within each level2 category.
aggregate(df_item$item_freq ~ df_item$level2, FUN = sum)

# Same roll-up at the coarser "level1" category; bar chart of the six
# largest absolute counts.
Groceries_level1 <- aggregate(Groceries, by = "level1")
barplot(tail(sort(itemFrequency(Groceries_level1, type = "absolute"))))




#---------

#=============== T3 :: 13i ==========



# Work from the T3 data folder; "trx.csv" is resolved against it.
setwd("C:/YONG/t3/")

# Read the transaction export, keeping text columns as character vectors.
tr01 <- read.csv("trx.csv", stringsAsFactors = FALSE)

# Drop the last row, which the original author marks as a totals row.
tr01 <- head(tr01, -1)

# Frequency tables, least frequent first, for product name (상품명),
# order number (주문번호), member name (회원명) and category name (카테고리명).
# The order-number table was printed twice in the original; once is enough.
sort(table(tr01$상품명))
sort(table(tr01$주문번호))
sort(table(tr01$회원명))
sort(table(tr01$카테고리명))



# Preview only the order-number, member-name and sales-amount columns.
tr01[, is.element(names(tr01), c("주문번호", "회원명", "판매금액"))]

# Strip the "," characters from each amount column and convert it to numeric:
# sales amount, actual sales amount, shipping fee and coupon amount.
for (amt_col in c("판매금액", "실판매금액", "배송비", "쿠폰액")) {
  tr01[[amt_col]] <- as.numeric(gsub(",", "", tr01[[amt_col]]))
}



# Total sales amount per category; print it largest first, then rename the
# columns (the sort uses the formula-derived column name, so it comes first).
agg1 <- aggregate(tr01$판매금액 ~ tr01$카테고리명, FUN = sum)
agg1[order(agg1[["tr01$판매금액"]], decreasing = TRUE), ]
names(agg1) <- c("category", "sum_sale_amt")

# Number of sales rows per category, handled the same way.
agg2 <- aggregate(tr01$판매금액 ~ tr01$카테고리명, FUN = length)
agg2[order(agg2[["tr01$판매금액"]], decreasing = TRUE), ]
names(agg2) <- c("category", "sale_cnt")

# One row per category holding both measures.
magg <- merge(agg1, agg2, by = "category", all.x = TRUE)

# Sales count against sales amount (in thousands), one labelled point per
# category.
plot(magg$sale_cnt, magg$sum_sale_amt/1000,
     xlim = c(0, 5), ylim = c(0, 500),
     col = "brown", pch = 19)
text(magg$sale_cnt, magg$sum_sale_amt/1000,
     labels = magg$category,
     pos = 4, cex = 0.7)


# Shipping fee against sales amount, with the least-squares line.
plot(tr01$판매금액, tr01$배송비)
abline(lm(배송비 ~ 판매금액, data = tr01))

# Distribution of the shipping fee as a fraction of the sales amount.
plot(density(tr01$배송비/tr01$판매금액))

# Coupon amount against sales amount; rows whose quantity (판매수량) exceeds 1
# are drawn red, the rest blue.
qty_colour <- ifelse(tr01$판매수량 > 1, "red", "blue")
plot(tr01$판매금액, tr01$쿠폰액,
     col = qty_colour, pch = 19)
abline(lm(쿠폰액 ~ 판매금액, data = tr01))




# Total sales amount per member; print it largest first, then rename columns.
agg3 <- aggregate(tr01$판매금액 ~ tr01$회원명, FUN = sum)
agg3[order(agg3[["tr01$판매금액"]], decreasing = TRUE), ]
names(agg3) <- c("member", "sum_sale_amt")

# Disabled earlier variant that counted sales rows per member:
# agg4 <- aggregate(tr01$판매금액~tr01$회원명, FUN=length)
# agg4[order(agg4$'tr01$판매금액', decreasing=T),]
# names(agg4) <- c("member", "sale_cnt")

# Count distinct order dates (주문일자) per member instead: de-duplicate the
# (order date, member) pairs, then count the rows per member.
tr02 <- unique(tr01[, c("주문일자", "회원명")])
agg4 <- aggregate(tr02$주문일자 ~ tr02$회원명, FUN = length)
agg4[order(agg4[["tr02$주문일자"]], decreasing = TRUE), ]
names(agg4) <- c("member", "sale_cnt")

# One row per member with total amount and order-date count.
magg <- merge(agg3, agg4, by = "member", all.x = TRUE)

# Jittered, semi-transparent scatter of order-date count against total amount.
plot(jitter(magg$sale_cnt), jitter(magg$sum_sale_amt),
     xlim = c(0, 4), ylim = c(0, 400000),
     col = rgb(0, 0, 1, 0.2), pch = 19, cex = 1.5)

# Least-squares line plus dashed reference lines at the two means.
abline(lm(sum_sale_amt ~ sale_cnt, data = magg))
abline(h = mean(magg$sum_sale_amt), lty = 2)
abline(v = mean(magg$sale_cnt), lty = 2)





# Number of distinct categories per member: de-duplicate the
# (category, member) pairs, then count the rows per member.
tr03 <- unique(tr01[, c("카테고리명", "회원명")])
agg5 <- setNames(
  aggregate(tr03$카테고리명 ~ tr03$회원명, FUN = length),
  c("member", "cnt_category")
)

# Per-member total amount joined with the category count (replaces `magg`).
magg <- merge(agg3, agg5, by = "member", all.x = TRUE)

# Jittered, semi-transparent scatter of category count against total amount.
plot(jitter(magg$cnt_category), jitter(magg$sum_sale_amt),
     xlim = c(0, 4), ylim = c(0, 400000),
     col = rgb(0, 0, 1, 0.2), pch = 19, cex = 1.5)

# Least-squares line plus dashed reference lines at the two means.
abline(lm(sum_sale_amt ~ cnt_category, data = magg))
abline(h = mean(magg$sum_sale_amt), lty = 2)
abline(v = mean(magg$cnt_category), lty = 2)



# Add the per-member order-date count to the member/category table, so
# `magg1` carries sum_sale_amt, cnt_category and sale_cnt.
magg1 <- merge(magg, agg4, by = "member", all.x = TRUE)

# Blue channel is 1 for members whose total amount is above the mean and 0
# otherwise, so above-average members are drawn blue and the others black.
above_mean_blue <- ifelse(
  magg1$sum_sale_amt > mean(magg1$sum_sale_amt), 1, 0
)

# Jittered, semi-transparent scatter of category count against order-date
# count.
plot(jitter(magg1$cnt_category), jitter(magg1$sale_cnt),
     xlim = c(0, 4), ylim = c(0, 3),
     col = rgb(0, 0, above_mean_blue, 0.2), pch = 19, cex = 1.5)

# Dashed least-squares line and dashed reference lines at the two means.
abline(lm(sale_cnt ~ cnt_category, data = magg1), lty = 2)
abline(h = mean(magg1$sale_cnt), lty = 2)
abline(v = mean(magg1$cnt_category), lty = 2)