# UCI online retail data set link ::
# https://archive.ics.uci.edu/ml/datasets/online+retail
### recomm MBA
# Load the raw two-column file, rename its columns to cust / att, and
# re-export it as ca1.csv, which is read back later as transactions.
# NOTE(review): the hard-coded working directory only exists on the
# author's machine; adjust before running elsewhere.
setwd("C:/YONG/w20180728/")
a1 <- read.csv("custAtt.csv")
names(a1) <- c("cust", "att")
# row.names = FALSE keeps the export to exactly the two named columns;
# the default would prepend an unnamed row-index column to the file.
write.csv(a1, "ca1.csv", row.names = FALSE)
# MBA ---------
# EDA
# Row count per attribute value, most frequent first.
barplot(sort(table(a1$att), decreasing = TRUE))
library(arules)
library(arulesViz)
# Read ca1.csv as transactions in "single" format: each row is one
# (transaction id, item) pair taken from the "cust" and "att" columns.
# rm.duplicates = TRUE drops items repeated within the same transaction.
b01 <- read.transactions(
  "ca1.csv",
  format = "single",
  sep = ",",
  cols = c("cust", "att"),
  rm.duplicates = TRUE
)
# Horizontal bar chart of the six most frequent items (absolute counts).
itemFrequencyPlot(
  b01,
  type = "absolute",
  topN = 6,
  horiz = TRUE,
  col = "steelblue3",
  xlab = "",
  main = "Item frequency, absolute"
)
# Mine frequent itemsets (not rules) that contain at least two items and
# have support of at least 0.01.
itemsets <- apriori(
  b01,
  parameter = list(
    support = 0.01,
    minlen = 2,
    target = "frequent" # to mine for itemsets
  )
)
# head() rather than [1:5]: indexing past the end fails when fewer than
# five itemsets are found, while head() returns whatever is available.
inspect(head(sort(itemsets, by = "support", decreasing = TRUE), n = 5))
# Mine association rules with support >= 0.03, confidence >= 0.5 and at
# most two items per rule.
rules <- apriori(
  b01,
  parameter = list(
    support = 0.03,
    confidence = 0.5,
    maxlen = 2,
    target = "rules" # to mine for rules
  )
)
# Global option: affects number formatting for the rest of the session.
options(digits = 3)
# Full measure name "confidence" instead of the abbreviation "conf", and
# head() instead of [1:10] so a short result set does not raise an
# out-of-bounds error.
inspect(head(sort(rules, by = "confidence", decreasing = TRUE), n = 10))
plot(subset(rules, subset = confidence > 0.5), method = "graph")
#---- Supermarket Market Basket Analysis Example (additional) --------------
# https://rpubs.com/sbushmanov/180410
library(arules)
library(arulesViz)
# Supermarket point-of-sale purchase history (Groceries data set)
data(list = "Groceries")
str(object = Groceries)
summary(object = Groceries)
#--- EDA on data set characteristics --------
# Item frequencies (default type) for the last six items in item order
tail(x = itemFrequency(x = Groceries))
# Absolute counts sorted ascending, so tail() shows the six largest
tail(
  x = sort(
    x = itemFrequency(x = Groceries, type = "absolute")
  )
)
# Horizontal bar chart of the six most frequent items
itemFrequencyPlot(
  x = Groceries,
  topN = 6,
  type = "absolute",
  main = "Item frequency, absolute",
  xlab = "",
  col = "steelblue3",
  horiz = TRUE
)
# get frequent item sets
# Itemsets of at least two items with support >= 0.001.
itemsets <- apriori(
  Groceries,
  parameter = list(
    support = 0.001,
    minlen = 2,
    target = "frequent" # to mine for itemsets
  )
)
# head() rather than [1:5] so that a result with fewer than five
# itemsets is shown instead of raising an out-of-bounds error.
inspect(head(sort(itemsets, by = "support", decreasing = TRUE), n = 5))
# Rules with support >= 0.002, confidence >= 0.7 and at least two items
# (minlen = 2 excludes rules with an empty left-hand side).
rules <- apriori(
  Groceries,
  parameter = list(
    support = 0.002,
    confidence = 0.7,
    minlen = 2,
    target = "rules" # to mine for rules
  )
)
# Global option: affects number formatting for the rest of the session.
options(digits = 3)
# Ten highest-lift rules; head() tolerates fewer than ten results,
# whereas [1:10] would fail with an out-of-bounds index.
inspect(head(sort(rules, by = "lift", decreasing = TRUE), n = 10))
plot(subset(rules, subset = confidence > 0.8), method = "graph")
# Re-mine with minlen = maxlen = 2, i.e. rules of exactly two items
# (one item on each side), at lower support/confidence thresholds.
rules <- apriori(
  Groceries,
  parameter = list(
    support = 0.001,
    confidence = 0.2,
    minlen = 2,
    maxlen = 2,
    target = "rules" # to mine for rules
  )
)
# Global option: affects number formatting for the rest of the session.
options(digits = 3)
# Ten highest-lift rules; head() tolerates fewer than ten results,
# whereas [1:10] would fail with an out-of-bounds index.
inspect(head(sort(rules, by = "lift", decreasing = TRUE), n = 10))
# Graph views restricted to progressively stronger rules.
plot(subset(rules, subset = lift > 4), method = "graph")
plot(subset(rules, subset = lift >= 4.87), method = "graph")
# subsetting
# Rules whose right-hand side is "bottled beer" with confidence above
# 0.7, ordered by decreasing lift.
inspect(
  sort(
    subset(rules, subset = rhs %in% "bottled beer" & confidence > 0.7),
    by = "lift",
    decreasing = TRUE
  )
)
# Extract the transactions that match a rule (= contain specific items):
# among the first 500, those containing "softener" or "liquor".
inspect(subset(Groceries[1:500], items %in% c("softener", "liquor")))
#--- item hierarchy: explore the product category levels ---
# Roll items up to their "level2" category.
Groceries_level2 <- aggregate(Groceries, by = "level2")
# This statement used to begin with a bare `<-` (no assignment target),
# which is a syntax error; the result is only displayed, so no target
# is needed.
tail(sort(itemFrequency(Groceries_level2)))
# Item metadata with an absolute frequency column attached.
df_item <- as.data.frame(itemInfo(Groceries))
df_item$item_freq <- itemFrequency(Groceries, type = "absolute")
# Items that occur in at least 1000 transactions.
df_item[df_item$item_freq >= 1000, ]
# Total item count per level2 category.
aggregate(item_freq ~ level2, data = df_item, FUN = sum)
# Same roll-up at the coarser "level1" category, shown as a bar chart.
Groceries_level1 <- aggregate(Groceries, by = "level1")
barplot(tail(sort(itemFrequency(Groceries_level1, type = "absolute"))))
#---------
#=============== T3 :: 13i ==========
# Load the order-line export and prepare it for analysis.
# NOTE(review): the hard-coded working directory only exists on the
# author's machine; adjust before running elsewhere.
setwd("C:/YONG/t3/")
tr01 <- read.csv("trx.csv", stringsAsFactors = FALSE)
tr01 <- head(tr01, -1) # drop the trailing totals row
# Frequency tables (ascending) for product name, order number, member
# name and category name.
sort(table(tr01$상품명))
sort(table(tr01$주문번호))
sort(table(tr01$회원명))
sort(table(tr01$카테고리명))
# Quick look at order number / member name / sale amount only.
tr01[, names(tr01) %in% c("주문번호", "회원명", "판매금액")]
# Amount columns (sale amount, actual sale amount, shipping fee, coupon
# amount) may contain thousands separators; strip the commas and
# convert to numeric in one pass.
amount_cols <- c("판매금액", "실판매금액", "배송비", "쿠폰액")
tr01[amount_cols] <- lapply(
  tr01[amount_cols],
  function(x) as.numeric(gsub(",", "", x, fixed = TRUE))
)
# Total sale amount per category, shown largest first.
agg1 <- aggregate(판매금액 ~ 카테고리명, data = tr01, FUN = sum)
names(agg1) <- c("category", "sum_sale_amt")
agg1[order(agg1$sum_sale_amt, decreasing = TRUE), ]
# Number of order lines per category, shown largest first.
agg2 <- aggregate(판매금액 ~ 카테고리명, data = tr01, FUN = length)
names(agg2) <- c("category", "sale_cnt")
agg2[order(agg2$sale_cnt, decreasing = TRUE), ]
# One row per category with both measures.
magg <- merge(agg1, agg2, by = "category", all.x = TRUE)
# Count vs. amount (in thousands), each point labelled with its category.
# The axis limits are fixed, so categories outside them are not drawn.
plot(magg$sale_cnt, magg$sum_sale_amt / 1000,
  pch = 19, col = "brown",
  xlim = c(0, 5), ylim = c(0, 500)
)
text(magg$sale_cnt, magg$sum_sale_amt / 1000,
  labels = magg$category,
  cex = 0.7, pos = 4
)
# Shipping fee against sale amount, with a least-squares line.
plot(tr01$판매금액, tr01$배송비)
abline(lm(배송비 ~ 판매금액, data = tr01))
# Distribution of the shipping-fee share of the sale amount.
# density() stops on NA/NaN, which arise when an amount failed to parse
# or when both amounts are zero, so keep finite ratios only.
ship_ratio <- tr01$배송비 / tr01$판매금액
plot(
  density(ship_ratio[is.finite(ship_ratio)]),
  main = "Shipping fee / sale amount"
)
# Coupon amount against sale amount; red marks lines with quantity > 1.
plot(tr01$판매금액, tr01$쿠폰액,
  pch = 19,
  col = ifelse(tr01$판매수량 > 1, "red", "blue")
)
abline(lm(쿠폰액 ~ 판매금액, data = tr01))
# Total sale amount per member, shown largest first.
agg3 <- aggregate(판매금액 ~ 회원명, data = tr01, FUN = sum)
names(agg3) <- c("member", "sum_sale_amt")
agg3[order(agg3$sum_sale_amt, decreasing = TRUE), ]
# Purchase count per member = number of distinct order dates.
# (An earlier variant, left disabled in the script, counted order lines
# per member directly from tr01 instead.)
tr02 <- unique(tr01[, c("주문일자", "회원명")])
agg4 <- aggregate(주문일자 ~ 회원명, data = tr02, FUN = length)
names(agg4) <- c("member", "sale_cnt")
agg4[order(agg4$sale_cnt, decreasing = TRUE), ]
# One row per member with both measures.
magg <- merge(agg3, agg4, by = "member", all.x = TRUE)
# Jittered scatter so overlapping members stay visible; the axis limits
# are fixed, so members outside them are not drawn.
plot(jitter(magg$sale_cnt), jitter(magg$sum_sale_amt),
  pch = 19, col = rgb(0, 0, 1, 0.2),
  xlim = c(0, 4), ylim = c(0, 400000), cex = 1.5
)
abline(lm(sum_sale_amt ~ sale_cnt, data = magg))
# Dashed reference lines at the means; na.rm = TRUE so the lines are
# still drawn if the left join left a member without a count.
abline(h = mean(magg$sum_sale_amt, na.rm = TRUE), lty = 2)
abline(v = mean(magg$sale_cnt, na.rm = TRUE), lty = 2)
# Number of distinct categories each member bought from.
tr03 <- unique(tr01[, c("카테고리명", "회원명")])
agg5 <- aggregate(카테고리명 ~ 회원명, data = tr03, FUN = length)
names(agg5) <- c("member", "cnt_category")
# Attach the category count to the per-member sale totals (agg3).
magg <- merge(agg3, agg5, by = "member", all.x = TRUE)
# Jittered scatter of category breadth vs. total spend; the axis limits
# are fixed, so members outside them are not drawn.
plot(jitter(magg$cnt_category), jitter(magg$sum_sale_amt),
  pch = 19, col = rgb(0, 0, 1, 0.2),
  xlim = c(0, 4), ylim = c(0, 400000), cex = 1.5
)
abline(lm(sum_sale_amt ~ cnt_category, data = magg))
# Dashed reference lines at the means; na.rm = TRUE so the lines are
# still drawn if the left join left a member without a count.
abline(h = mean(magg$sum_sale_amt, na.rm = TRUE), lty = 2)
abline(v = mean(magg$cnt_category, na.rm = TRUE), lty = 2)
# Add the per-member purchase count (agg4) to the member table.
magg1 <- merge(magg, agg4, by = "member", all.x = TRUE)
# Members whose total spend is above the mean are drawn in blue, the
# rest in black (blue channel 1 vs 0), both semi-transparent.
above_avg <- magg1$sum_sale_amt > mean(magg1$sum_sale_amt, na.rm = TRUE)
plot(jitter(magg1$cnt_category), jitter(magg1$sale_cnt),
  pch = 19,
  col = rgb(0, 0, ifelse(above_avg, 1, 0), 0.2),
  xlim = c(0, 4), ylim = c(0, 3), cex = 1.5
)
abline(lm(sale_cnt ~ cnt_category, data = magg1), lty = 2)
# Dashed reference lines at the means; na.rm = TRUE so the lines are
# still drawn if the left join left a member without a count.
abline(h = mean(magg1$sale_cnt, na.rm = TRUE), lty = 2)
abline(v = mean(magg1$cnt_category, na.rm = TRUE), lty = 2)
# --- Blog footer captured together with the script (not R code) ---
# Other posts in the 'R 데이터 분석' (R data analysis) category:
# [DSM1809] statistical data analysis using R (0) | 2018.08.24
# [KDDTprj4] decision tree sample (0) | 2018.08.04
# [CRMAJU2018] 기말고사 후보답안 예시 (0) | 2018.06.20
# [ CRMAJU2018 ] 기말고사 (0) | 2018.06.20
# [CRMAJU2018] 예측분석 Review (0) | 2018.06.15