#-----------
tr01 <- read.csv("tb_tr01.csv")
sku01 <- read.csv("sku_mast01.csv")
# sas 시간에서 R 시간으로 변경
tr01$d_date <- as.POSIXct(tr01$d_date,origin='1960-01-01',tz="UTC")
tr01$customer_id <- as.character(tr01$customer_id )
tmpcust_id <- as.character(tr01$customer_id )
tr02 <- tr01
tr01$customer_id <- as.character(tmpcust_id)
q01 <- aggregate(tr01$t_item_amt_sale, by=list(tr01$customer_id), FUN=count)
require(sqldf)
q04 <- sqldf('select * from q02 order by salesum desc')
# 2년간 총 영수증수 집계
q05 <- sqldf('select count(distinct sk_transaction_id) as trcnt from tr01')
q06 <- tr01[substr(as.character(tr01$d_date),1,4)=="2003" ,]
q07 <- sqldf('select count(distinct sk_transaction_id) as trcnt from q06')
q08 <- sqldf('select customer_id, count(*) as cnt from tr01 group by customer_id')
#---------------
q36<- sqldf('select sk_item_id, count(*) as cnt from tr01 group by sk_item_id')
q37 <- sqldf('select sk_item_id, max(cnt) from q36')
q38 <- sqldf('select a.*, b.sku_description
from q37 as a left join sku01 as b
on a.sk_item_id=b.sk_item_id')
#----------------------
#----------------------
department_id 별 구매고객수 그래프를 작성하라!
q41 <- sqldf('select a.customer_id, b.department_name,
b.sk_department_id
from tr01 as a left join sku01 as b
on a.sk_item_id=b.sk_item_id
where a.year="2002" ' )
q41a <- sqldf( 'select sk_department_id, department_name,
count(distinct customer_id) as ncust
from q41
group by sk_department_id' )
q42 <- sqldf('select sk_department_id, count(distinct customer_id) as ncust from q41 group by sk_department_id')
q43 <- sqldf('select * from q41
where sk_department_id in
(select sk_department_id from q42 where ncust>1000 ) ' )
counts <- table(q43$department_name)
barplot(counts)
q51 <- sqldf('select channel_id, count(*) as ntrxdet
from tr01 group by channel_id')
q52 <- sqldf('select channel_id, count(distinct customer_id) as ncust
from tr01 group by channel_id')
q53 <- sqldf('select a.*, b.ncust,
a.ntrxdet*100/b.ncust as ntrxdetpercust
from q51 as a left join q52 as b
on a.channel_id=b.channel_id')
q54 <- sqldf('select * from tr01 where t_item_qty_sale >=200')
q55 <- sqldf('select a.*, sku_description
from q54 as a left join sku01 as b
on a.sk_item_id=b.sk_item_id ')
#--------------
# 가장 간단한 수준의 모델 만들기
(다중선형회귀 분석과 트리 )
#--------------
q61 <- sqldf('select customer_id,
sum(t_item_amt_sale) as amt_sale,
count(distinct d_date) as ndays
from tr01
where year="2002"
group by customer_id ')
q62 <- sqldf('select customer_id,
sum(t_item_amt_sale) as amt_sale
from tr01
where year="2003"
group by customer_id ')
q63 <- sqldf('select a.*, b.amt_sale as amt2003
from q61 as a left join q62 as b
on a.customer_id=b.customer_id')
plot(q63$amt_sale, q63$amt2003)
abline(lm(q63$amt2003~q63$amt_sale), col="red")
plot(q63$ndays, q63$amt2003)
abline(lm(q63$amt2003~q63$ndays), col="blue")
r1 <- lm(q63$amt2003~q63$ndays+q63$amt_sale)
summary(r1)
t1 <- ctree(q63$amt2003~q63$ndays+q63$amt_sale, data=q63, controls = ctree_control
(maxdepth = 3, mincriterion = 0.95, minsplit = 50) )
plot(t1)
plot(q63$amt_sale[order(-q63$amt_sale)] )
quantile(q63$amt_sale, c(0.1, 0.5))
q64<- sqldf('select sum(amt_sale) as sum_amt_sale,
sum(amt2003) as sum_amt2003,
sum(amt2003) /sum(amt_sale) as rchange
from q63' )
q63$is_amthigh2003 <- ifelse(q63$amt2003>=500 , "H", "NH")
q63$is_amthigh2003 <- as.factor(q63$is_amthigh2003)
t2 <- ctree(q63$is_amthigh2003~q63$ndays+q63$amt_sale, data=q63, controls = ctree_control
(maxdepth = 3, mincriterion = 0.95, minsplit = 50) )
plot(t2, main="TREE 500")
q66 <- sqldf('select distinct customer_id, cust_type_id from tr01 order by customer_id')
q67 <- sqldf('select a.*, b.cust_type_id from q63 as a left join q66 as b
on a.customer_id=b.customer_id' )
t3 <- ctree(q67$is_amthigh2003~q67$ndays+q67$amt_sale + q67$cust_type_id, data=q67,
controls = ctree_control(maxdepth = 4, mincriterion = 0.95, minsplit = 20) )
plot(t3)
q68 <- sqldf('select customer_id, d_date, sum(t_item_amt_sale) as dsale
from tr01
where year="2002"
group by customer_id, d_date ')
q69 <- sqldf('select customer_id, max(dsale) as maxdsale
from q68
group by customer_id')
q70 <- sqldf('select a.*, b.maxdsale
from q63 as a left join q69 as b
on a.customer_id=b.customer_id' )
# NULL 처리
q70$maxdsale[is.na(q70$maxdsale)] <- 0
t4 <- ctree(q70$is_amthigh2003~q70$ndays+q70$amt_sale + q70$maxdsale, data=q70, controls =
ctree_control(maxdepth = 4, mincriterion = 0.95, minsplit = 20) )
plot(t4)
plot(q70$maxdsale, q70$amt2003)
#------------------------------
require(gplots)
ci2d(q70$maxdsale, q70$amt2003)
'R 데이터 분석' 카테고리의 다른 글
:: retail data preparation and EDA 연습 과제 (0) | 2015.07.06 |
---|---|
0702 빅Labor .... 주제 (0) | 2015.07.03 |
경기도 교통 데이터 추가[Dataset, 작업과제] (0) | 2015.06.30 |
0630 ... PART 3 (0) | 2015.06.30 |
0630 파트 2 (0) | 2015.06.30 |