# SAS 시간 변환
library(timeDate)
tr001$d_date <- as.POSIXct(tr001$d_date, origin = "1960-01-01", tz="Pacific")
head(tr001)
str(tr001)
#
# calculating number of days between 2 columns of dates in data frame
# http://stackoverflow.com/questions/11666172/calculating-number-of-days-between-2-columns-of-dates-in-data-frame
# survey <- data.frame(date=c("2012/07/26","2012/07/25", "2012/01/01"),tx_start=c("2012/01/01","2012/01/01","2012/01/01"))
# survey$date_diff <- as.Date(as.character(survey$date), format="%Y/%m/%d") - as.Date(as.character(survey$tx_start), format="%Y/%m/%d")
tr001$c_date<-substr(as.character(tr001$d_date),1,10)
tr001$c_date <- tr001$c_date
# eliminate unneccessay columns
tr001<-tr001[,!c( names(tr001) %in% c("cust_type_id", "t_item_amt_other", "t_discount_amt_other" , "t_item_qty_other") )]
# date difference computation example ::
#---------------------
tr002tmp01 <- head(tr002,100)
tr002tmp01$a <- as.Date(as.character(tr002tmp01$d_date), format="%Y-%m-%d")
tr002tmp01$b <- as.Date("2003-01-01", format="%Y-%m-%d")
tr002tmp01$rcndays <- as.numeric( tr002tmp01$b - tr002tmp01$a )
#---- end of example ----
# predictor01 : recency
tr00203 <- sqldf('select customer_id, max(c_date) as recent_date from tr002 group by customer_id')
tr00203$rcndays <- as.numeric( as.Date("2003-01-01", format="%Y-%m-%d") - as.Date(as.character(tr00203$recent_date), format="%Y-%m-%d") )
max(tr00203$rcndays)
# predictor02 : frequency
tr00201 <- sqldf('select customer_id, count(distinct d_date) as freqdays from tr002 group by customer_id' )
# predictor03 : monetary
tr00202 <- sqldf('select customer_id, sum(t_item_amt_sale) as mon12M from tr002 group by customer_id' )
# merge RFM predictor
tr00205 <- merge( tr00203, merge(tr00201, tr00202,
by="customer_id"),by="customer_id")
tr00205 <- tr00205[,!(names(tr00205) %in% c("recent_date"))]
# to check if all customers in
nrow(tr00205)
nrow(tr00200)
# target variable creation
#--------------------
tr00301 <- sqldf('select customer_id, sum(t_item_amt_sale) as T_mon from tr003 group by customer_id' )
nrow(tr00301)
# only 14% of customer base made purchase in the next month
# thus we set 1 as target event class when made purchase
# T_mpchr : taget var. made purchase
tr00301$T_mpchr <- "1"
# merge target and predictors
#---------------------------------
tr00401 <- merge( tr00205, tr00301, by="customer_id", all.x=T)
tr00401$T_mpchr[is.na(tr00401$T_mpchr)] <-"0"
tr00401$T_mon[is.na(tr00401$T_mon)] <-"0"
# predictor04 : number of channels used
tr00206 <- sqldf('select customer_id, count(distinct channel_id) as nchannel from tr002 group by customer_id' )
# predictor06 : number of months made purchase
tr00207 <- sqldf('select customer_id, count(distinct substr(d_date,1,7)) as nmonth12M from tr002 group by customer_id' )
# merge p04 and 06
#-----------------
tr00401 <- merge( tr00401, tr00206, by="customer_id", all.x=T)
tr00401 <- merge( tr00401, tr00207, by="customer_id", all.x=T)
tr00401$nchannel[is.na(tr00401$nchannel)] <-0
tr00401$nmonth12M[is.na(tr00401$nmonth12M)] <- 0
# predictor07 : number of days over $50
tr0020701 <- sqldf('select customer_id, d_date, sum(t_item_amt_sale) as dailyamt from tr002 group by customer_id, d_date' )
tr0020702 <- sqldf('select customer_id, count(distinct d_date) as daysov50 from tr0020701 where dailyamt >=50 group by customer_id' )
# predictor08 : number of months over $50
tr0020800 <- tr002
tr0020800$m_date <- substr(tr0020800$d_date, 1,7)
tr0020801 <- sqldf('select customer_id, m_date, sum(t_item_amt_sale) as monthlyamt from tr0020800 group by customer_id, m_date' )
tr0020802 <- sqldf('select customer_id, count(distinct m_date) as monthsov50 from tr0020801 where monthlyamt group by customer_id' )
# merge p07 and 08
#-----------------
tr00401 <- merge( tr00401, tr0020702, by="customer_id", all.x=T)
tr00401 <- merge( tr00401, tr0020802, by="customer_id", all.x=T)
tr00401$daysov50[is.na(tr00401$daysov50 )] <-0
tr00401$monthsov50[is.na(tr00401$monthsov50 )] <- 0
# test tree creation for validation
#---------------------
# to avoid "data class character is not supported" error
tr00401$T_mpchr <- as.factor(tr00401$T_mpchr)
# optional (for a test run) - monetary target
tr00401$T_mon <- as.numeric(tr00401$T_mon)
attach(tr00401)
library(party)
dt001 <- ctree( T_mpchr ~ rcndays + freqdays + mon12M + nchannel + nmonth12M + daysov50 + monthsov50 , data=tr00401, controls = ctree_control(maxdepth = 4))
plot(dt001)
# dt001 <- ctree( T_mon ~ rcndays + freqdays + mon12M + nchannel + nmonth12M + daysov50 + monthsov50 , data=tr00401, controls = ctree_control(maxdepth = 4))
# plot(dt001)
# export and save ~08 predictor set
#----------------------------------
# write.csv(tr00401, file="mset01.csv")
# tr00401 <- read.csv("mset01.csv", header = TRUE, sep = ",", dec=".", fill = TRUE )
# contract sku_master table
#---------------------------------
install.packages("sas7bdat")
library(sas7bdat)
sku_mast <- read.sas7bdat("sku_table.sas7bdat")
sku_mast01 <- sqldf('select sk_item_id, sku_description, sk_subclass_id, subclass_name, sk_class_id, class_name, sk_department_id, department_name from sku_mast')
# predictor09 : number of departments
tr0020901 <- sqldf('select a.customer_id, a.sk_item_id, b.sk_department_id from tr002 as a, sku_mast01 as b where a.sk_item_id=b.sk_item_id')
tr0020902 <- sqldf('select customer_id, count(distinct sk_department_id) as ndepts from tr0020901 group by customer_id')
# predictor10 : catalogue purchase flaghead
tmp000<- sqldf('select distinct channel_id from tr002')
# RT, MO(catalogue) , on
tr0021001 <- sqldf('select distinct customer_id, 1 as cat_flag from tr002')
# export and save ~10 predictor set
#----------------------------------
# write.csv(tr00401, file="mset02.csv")
# tr00401 <- read.csv("mset02.csv", header = TRUE, sep = ",", dec=".", fill = TRUE )
# write.csv(sku_mast01, file="sku_mast01.csv")
# merge p09 and 10
#-----------------
tr00401 <- merge( tr00401, tr0020902, by="customer_id", all.x=T)
tr00401 <- merge( tr00401, tr0021001, by="customer_id", all.x=T)
tr00401$ndepts[is.na(tr00401$ndepts )] <-0
tr00401$cat_flag[is.na(tr00401$cat_flag )] <- 0
attach(tr00401)
library(party)
dt001 <- ctree( T_mpchr ~ rcndays + freqdays + mon12M + nchannel + nmonth12M + daysov50 + monthsov50 + ndepts + cat_flag , data=tr00401, controls = ctree_control(maxdepth = 5))
plot(dt001)
'R 데이터 분석' 카테고리의 다른 글
[RF] 랜덤포리스트 for an Extremely Small Sample Set (0) | 2014.10.23 |
---|---|
bikini forecasting case [EDA sample] (0) | 2014.09.03 |
[R 사용자 컨퍼런스 2014] 랜덤포리스트를 활용한 다차원 시계열 예측: 아이디어와 산업계 응용 방안 (0) | 2014.05.23 |
[빅데이터][스몰 데이터] 발렌타인 데이 분석 연습_ 진행중 (0) | 2014.02.20 |
[R 라인플롯][시계열분석] 페이스북에 대한 관심 시계열 분석 예제 (0) | 2014.02.03 |