>RE::VISION CRM

R 데이터 분석

예측모델링 워크샵용 공유

YONG_X 2014. 8. 26. 14:17

# SAS 시간 변환

library(timeDate)

tr001$d_date <- as.POSIXct(tr001$d_date, origin = "1960-01-01", tz="Pacific")

head(tr001)

str(tr001)


#



# calculating number of days between 2 columns of dates in data frame


#   http://stackoverflow.com/questions/11666172/calculating-number-of-days-between-2-columns-of-dates-in-data-frame



# survey <- data.frame(date=c("2012/07/26","2012/07/25", "2012/01/01"),tx_start=c("2012/01/01","2012/01/01","2012/01/01"))


# survey$date_diff <- as.Date(as.character(survey$date), format="%Y/%m/%d") - as.Date(as.character(survey$tx_start), format="%Y/%m/%d")


tr001$c_date<-substr(as.character(tr001$d_date),1,10)

tr001$c_date <- tr001$c_date


# eliminate unneccessay columns

tr001<-tr001[,!c( names(tr001) %in% c("cust_type_id", "t_item_amt_other", "t_discount_amt_other" , "t_item_qty_other") )]


# date difference computation example ::

#---------------------

tr002tmp01 <- head(tr002,100)

tr002tmp01$a <- as.Date(as.character(tr002tmp01$d_date), format="%Y-%m-%d")

tr002tmp01$b <- as.Date("2003-01-01", format="%Y-%m-%d")

tr002tmp01$rcndays <- as.numeric( tr002tmp01$b - tr002tmp01$a )

#---- end of example ----


# predictor01 : recency

tr00203 <- sqldf('select customer_id, max(c_date) as recent_date from tr002 group by customer_id')

tr00203$rcndays <- as.numeric( as.Date("2003-01-01", format="%Y-%m-%d") - as.Date(as.character(tr00203$recent_date), format="%Y-%m-%d") ) 

max(tr00203$rcndays)



# predictor02 : frequency

tr00201 <- sqldf('select customer_id, count(distinct d_date) as freqdays from tr002 group by customer_id' )


# predictor03 : monetary

tr00202 <- sqldf('select customer_id, sum(t_item_amt_sale) as mon12M from tr002 group by customer_id' )


# merge RFM predictor

tr00205 <- merge( tr00203, merge(tr00201, tr00202,

by="customer_id"),by="customer_id")


tr00205 <- tr00205[,!(names(tr00205) %in% c("recent_date"))]


# to check if all customers in

nrow(tr00205)

nrow(tr00200)


# target variable creation

#--------------------

tr00301 <- sqldf('select customer_id, sum(t_item_amt_sale) as T_mon from tr003 group by customer_id' )

nrow(tr00301) 


# only 14% of customer base made purchase in the next month

# thus we set 1 as target event class when made purchase


# T_mpchr : taget var. made purchase

tr00301$T_mpchr <- "1" 


# merge target and predictors

#---------------------------------

tr00401 <- merge( tr00205,  tr00301, by="customer_id",  all.x=T)

tr00401$T_mpchr[is.na(tr00401$T_mpchr)] <-"0"

tr00401$T_mon[is.na(tr00401$T_mon)] <-"0"





# predictor04 : number of channels used

tr00206 <- sqldf('select customer_id, count(distinct channel_id) as nchannel from tr002 group by customer_id' )


# predictor06 : number of months made purchase

tr00207 <- sqldf('select customer_id, count(distinct substr(d_date,1,7)) as nmonth12M from tr002 group by customer_id' )


# merge p04 and 06

#-----------------

tr00401 <- merge( tr00401,  tr00206, by="customer_id",  all.x=T)

tr00401 <- merge( tr00401,  tr00207, by="customer_id",  all.x=T)

tr00401$nchannel[is.na(tr00401$nchannel)] <-0

tr00401$nmonth12M[is.na(tr00401$nmonth12M)] <- 0




# predictor07 : number of days over $50

tr0020701 <- sqldf('select customer_id, d_date, sum(t_item_amt_sale) as dailyamt from tr002 group by customer_id, d_date' )

tr0020702 <- sqldf('select customer_id, count(distinct d_date) as daysov50 from tr0020701 where dailyamt >=50  group by customer_id' )



# predictor08 : number of months over $50

tr0020800 <- tr002

tr0020800$m_date <- substr(tr0020800$d_date, 1,7) 

tr0020801 <- sqldf('select customer_id, m_date, sum(t_item_amt_sale) as monthlyamt from tr0020800 group by customer_id, m_date' )

tr0020802 <- sqldf('select customer_id, count(distinct m_date) as monthsov50 from tr0020801 where monthlyamt group by customer_id' )



# merge p07 and 08

#-----------------

tr00401 <- merge( tr00401,  tr0020702, by="customer_id",  all.x=T)

tr00401 <- merge( tr00401,  tr0020802, by="customer_id",  all.x=T)

tr00401$daysov50[is.na(tr00401$daysov50 )] <-0

tr00401$monthsov50[is.na(tr00401$monthsov50 )] <- 0



# test tree creation for validation

#---------------------


# to avoid "data class character is not supported" error

tr00401$T_mpchr <- as.factor(tr00401$T_mpchr)


# optional (for a test run) - monetary target

tr00401$T_mon <- as.numeric(tr00401$T_mon)


attach(tr00401)

library(party)

dt001 <- ctree( T_mpchr ~ rcndays + freqdays + mon12M + nchannel + nmonth12M  + daysov50 + monthsov50 , data=tr00401, controls = ctree_control(maxdepth = 4))

plot(dt001)



# dt001 <- ctree( T_mon ~ rcndays + freqdays + mon12M + nchannel + nmonth12M  + daysov50 + monthsov50 , data=tr00401, controls = ctree_control(maxdepth = 4))

# plot(dt001)



# export and save ~08 predictor set

#----------------------------------


# write.csv(tr00401, file="mset01.csv")

# tr00401 <- read.csv("mset01.csv", header = TRUE, sep = ",", dec=".", fill = TRUE )


# contract sku_master table

#---------------------------------

install.packages("sas7bdat")

library(sas7bdat)

sku_mast <- read.sas7bdat("sku_table.sas7bdat")

sku_mast01 <- sqldf('select sk_item_id, sku_description, sk_subclass_id, subclass_name, sk_class_id, class_name, sk_department_id, department_name from sku_mast')



# predictor09 : number of departments 

tr0020901 <- sqldf('select a.customer_id, a.sk_item_id, b.sk_department_id from tr002 as a, sku_mast01 as b where a.sk_item_id=b.sk_item_id')

tr0020902 <- sqldf('select customer_id, count(distinct sk_department_id) as ndepts from tr0020901 group by customer_id')



# predictor10 : catalogue purchase flaghead

 tmp000<- sqldf('select distinct channel_id from tr002')

# RT, MO(catalogue) , on

tr0021001 <- sqldf('select distinct customer_id, 1 as cat_flag from tr002')



# export and save ~10 predictor set

#----------------------------------


# write.csv(tr00401, file="mset02.csv")

# tr00401 <- read.csv("mset02.csv", header = TRUE, sep = ",", dec=".", fill = TRUE )


# write.csv(sku_mast01, file="sku_mast01.csv")



# merge p09 and 10

#-----------------

tr00401 <- merge( tr00401,  tr0020902, by="customer_id",  all.x=T)

tr00401 <- merge( tr00401,  tr0021001, by="customer_id",  all.x=T)

tr00401$ndepts[is.na(tr00401$ndepts )] <-0

tr00401$cat_flag[is.na(tr00401$cat_flag )] <- 0





attach(tr00401)

library(party)

dt001 <- ctree( T_mpchr ~ rcndays + freqdays + mon12M + nchannel + nmonth12M  + daysov50 + monthsov50 + ndepts + cat_flag , data=tr00401, controls = ctree_control(maxdepth = 5))

plot(dt001)