>RE::VISION CRM

R 데이터 분석

retail example

YONG_X 2015. 7. 2. 14:39

#-----------


tr01 <- read.csv("tb_tr01.csv")


sku01 <- read.csv("sku_mast01.csv")



# sas 시간에서 R 시간으로 변경

tr01$d_date <- as.POSIXct(tr01$d_date,origin='1960-01-01',tz="UTC")


tr01$customer_id <- as.character(tr01$customer_id )

tmpcust_id <- as.character(tr01$customer_id )

tr02 <- tr01

tr01$customer_id <- as.character(tmpcust_id)

q01 <- aggregate(tr01$t_item_amt_sale, by=list(tr01$customer_id), FUN=count)


require(sqldf)


q04 <- sqldf('select * from q02 order by salesum desc')


# 2년간 총 영수증수 집계

q05 <- sqldf('select count(distinct sk_transaction_id) as trcnt from tr01')


q06 <- tr01[substr(as.character(tr01$d_date),1,4)=="2003" ,]

q07 <- sqldf('select count(distinct sk_transaction_id) as trcnt from q06')


q08 <- sqldf('select customer_id, count(*) as cnt from tr01 group by customer_id')


#---------------


q36<- sqldf('select sk_item_id, count(*) as cnt from tr01 group by sk_item_id')


q37 <- sqldf('select sk_item_id, max(cnt) from q36')


q38 <- sqldf('select a.*, b.sku_description 

from q37 as a left join sku01 as b

on a.sk_item_id=b.sk_item_id')



#----------------------

#----------------------


department_id 별 구매고객수 그래프를 작성하라!



q41 <- sqldf('select a.customer_id, b.department_name, 

b.sk_department_id 

from tr01 as a left join sku01 as b

on a.sk_item_id=b.sk_item_id

where a.year="2002" ' )


q41a <- sqldf( 'select sk_department_id, department_name, 

count(distinct customer_id) as ncust

from q41

group by sk_department_id' )


q42 <- sqldf('select sk_department_id, count(distinct customer_id) as ncust from q41  group by sk_department_id')


q43 <- sqldf('select * from q41

where sk_department_id in 

  (select sk_department_id from q42 where ncust>1000 ) ' )


counts <- table(q43$department_name)

barplot(counts)


q51 <- sqldf('select channel_id, count(*) as ntrxdet

from tr01 group by channel_id')


q52 <- sqldf('select channel_id, count(distinct customer_id)  as ncust

from tr01 group by channel_id')


q53 <- sqldf('select a.*, b.ncust, 

a.ntrxdet*100/b.ncust as ntrxdetpercust

from q51 as a left join q52 as b

on a.channel_id=b.channel_id')



q54 <- sqldf('select * from tr01 where t_item_qty_sale >=200')

q55 <- sqldf('select a.*, sku_description 

from q54 as a left join sku01 as b

on a.sk_item_id=b.sk_item_id ')


#--------------

# 가장 간단한 수준의 모델 만들기

(다중선형회귀 분석과 트리 )


#--------------


q61 <- sqldf('select customer_id, 

sum(t_item_amt_sale) as amt_sale,

count(distinct d_date) as ndays

from tr01 

where year="2002" 

group by customer_id ')


q62 <- sqldf('select customer_id, 

sum(t_item_amt_sale) as amt_sale

from tr01 

where year="2003" 

group by customer_id ')


q63 <- sqldf('select a.*, b.amt_sale as amt2003

from q61 as a left join q62 as b

on a.customer_id=b.customer_id')


plot(q63$amt_sale, q63$amt2003)

abline(lm(q63$amt2003~q63$amt_sale), col="red")


plot(q63$ndays, q63$amt2003)

abline(lm(q63$amt2003~q63$ndays), col="blue")


r1 <- lm(q63$amt2003~q63$ndays+q63$amt_sale)

summary(r1)


t1 <- ctree(q63$amt2003~q63$ndays+q63$amt_sale, data=q63, controls = ctree_control


(maxdepth = 3, mincriterion = 0.95, minsplit = 50) )


plot(t1) 


plot(q63$amt_sale[order(-q63$amt_sale)]  )

quantile(q63$amt_sale, c(0.1, 0.5))


q64<- sqldf('select sum(amt_sale) as sum_amt_sale,

sum(amt2003) as sum_amt2003,

sum(amt2003) /sum(amt_sale) as rchange

from q63' )


q63$is_amthigh2003 <- ifelse(q63$amt2003>=500 , "H", "NH")

q63$is_amthigh2003 <- as.factor(q63$is_amthigh2003)



t2 <- ctree(q63$is_amthigh2003~q63$ndays+q63$amt_sale, data=q63, controls = ctree_control


(maxdepth = 3, mincriterion = 0.95, minsplit = 50) )


plot(t2, main="TREE 500") 


q66 <- sqldf('select distinct customer_id, cust_type_id from tr01 order by customer_id')


q67 <- sqldf('select a.*, b.cust_type_id from q63 as a left join q66 as b 

on a.customer_id=b.customer_id' )


t3 <- ctree(q67$is_amthigh2003~q67$ndays+q67$amt_sale + q67$cust_type_id, data=q67, 


controls = ctree_control(maxdepth = 4, mincriterion = 0.95, minsplit = 20) )


plot(t3) 


q68 <- sqldf('select customer_id, d_date, sum(t_item_amt_sale) as dsale

from tr01

where year="2002" 

group by customer_id, d_date ')


q69 <- sqldf('select customer_id, max(dsale)  as maxdsale

from q68

group by customer_id')


q70 <- sqldf('select a.*, b.maxdsale

from q63 as a left join q69 as b

on a.customer_id=b.customer_id' )


# NULL 처리

q70$maxdsale[is.na(q70$maxdsale)] <- 0


t4 <- ctree(q70$is_amthigh2003~q70$ndays+q70$amt_sale + q70$maxdsale, data=q70, controls = 


ctree_control(maxdepth = 4, mincriterion = 0.95, minsplit = 20) )


plot(t4) 


plot(q70$maxdsale, q70$amt2003)


#------------------------------

require(gplots)

ci2d(q70$maxdsale, q70$amt2003)

'R 데이터 분석' 카테고리의 다른 글

:: retail data preparation and EDA 연습 과제  (0) 2015.07.06
0702 빅Labor .... 주제  (0) 2015.07.03
경기도 교통 데이터 추가[Dataset, 작업과제]   (0) 2015.06.30
0630 ... PART 3   (0) 2015.06.30
0630 파트 2  (0) 2015.06.30