>RE::VISION CRM

R 데이터 분석

[빅데이터][스몰 데이터] 발렌타인 데이 분석 연습_ 진행중

YONG_X 2014. 2. 20. 16:42

[빅데이터][스몰 데이터] 발렌타인 데이 분석 연습_ 진행중


발렌타인데이? 한국에서만 난리일까?

그래서?


구글트렌즈 데이터를 활용해서 발렌타인데이가 가지는 의미를

국내와 전세계를 비교해서 기초 검토에서 출발





먼저 




구글 트렌즈 자체만을 활용한 대략적 흐름 파악 결과는 추가적인 분석으로 이어질 수 있으므로

데이터를 다운로드 받아 R을 활용한 추가 분석 실시


## ----------------
## Plot Google Trends weekly data.
## keyword == "valentineday"
##
## Loads the downloaded Google Trends sheet into `gt_vd`, derives a Date
## column `dweek` from the `Week` text column, and converts the four index
## columns to numeric.

# library() raises an error when a package is missing; require() only
# returns FALSE and lets the script fail later with a less obvious message.
library(xlsx)

# Sheet 1 of the workbook; keep text columns as character, not factor.
gt_vd <- read.xlsx("valentineday_K_02.xlsx", 1, stringsAsFactors = FALSE)

# The first 10 characters of Week are parsed as a date.
# NOTE(review): assumes Week starts with "YYYY-MM-DD" -- confirm against the
# downloaded file.
# Fixed: this previously read from `gt_bd`, an object that is never created.
gt_vd$dweek <- as.Date(substr(gt_vd$Week, 1, 10))

# Convert every index column to numeric in one pass.
trend_cols <- c("valentinesday", "gift", "boyf", "girlf")
gt_vd[trend_cols] <- lapply(gt_vd[trend_cols], as.numeric)

# Plotting packages used by the charts below (scales provides date_format()).
library(ggplot2)
library(scales)


# -------------

# sample code to save output chart 


# jpeg(filename = "big_Data_201401.jpeg", 

# width = 960, height = 480, units = "px", 

# pointsize = 14,

# quality = 100)


# attach(gt_vd)

# ggplot(gt_vd, aes(dweek, valentinesday)) + geom_line() +
#   scale_x_date(labels=date_format("%Y-%m" )) + xlab("") +
#   ylab("google trends index")


# dev.off()


#-----------------------


#-------------------------
# Time-series chart of the valentinesday index.
# Only rows whose dweek year is 2010 or later are used.

gt_vd1 <- gt_vd[as.numeric(substr(gt_vd$dweek, 1, 4)) >= 2010, ]

# attach(gt_vd1) was removed: every column below is resolved through the
# `data` argument of ggplot(), and attach() leaves a stale copy of the data
# frame on the search path that can mask later changes.
ggplot(gt_vd1, aes(dweek, valentinesday)) +
  geom_line() +
  scale_x_date(labels = date_format("%Y-%m")) +
  xlab("") +
  ylab("google trends index")



#-----------
# Yearly overlay comparison.
# Builds `gt_vd2`: one row per (yr, mon) with the mean of each index column.

# sqldf() is called below, so the package must actually be loaded
# (the load was previously commented out).
library(sqldf)

gt_vd1 <- gt_vd[as.numeric(substr(gt_vd$dweek, 1, 4)) >= 2010, ]

# Month ("MM") and year ("YYYY") taken from the dweek date string.
# The earlier ifelse() compared substr(dweek, 23, 24) with 3 inside
# as.numeric(); dweek is only 10 characters long, so that substring was
# always "" and the condition always TRUE. The result was therefore always
# substr(dweek, 6, 7), which is what is written here directly.
gt_vd1$mon <- substr(gt_vd1$dweek, 6, 7)
gt_vd1$yr <- substr(gt_vd1$dweek, 1, 4)

# GROUP BY already yields one row per (yr, mon), so DISTINCT is not needed.
gt_vd2 <- sqldf("
  select yr, mon,
         avg(valentinesday) as valentinesday,
         avg(gift)          as gift,
         avg(boyf)          as boyf,
         avg(girlf)         as girlf
  from gt_vd1
  group by yr, mon
")


library(ggplot2)

# Four overlay charts from gt_vd2: x = month number, one coloured line per
# year. Columns are referenced by bare name inside aes() so they are looked
# up in `data`; `gt_vd2$col` inside aes() bypasses the data argument and is
# discouraged by ggplot2.

# location = south korea
ggplot(gt_vd2, aes(x = as.numeric(mon), y = valentinesday, colour = yr)) +
  geom_line()

#----------

# interpretation: gift fluctuates year by year
ggplot(gt_vd2, aes(x = as.numeric(mon), y = gift, colour = yr)) +
  geom_line() +
  labs(title = "Valentines Day")

# interpretation: girls don't care about BF
ggplot(gt_vd2, aes(x = as.numeric(mon), y = boyf, colour = yr)) +
  geom_line() +
  labs(title = "BF")

# interpretation: boys DO care about GF?
ggplot(gt_vd2, aes(x = as.numeric(mon), y = girlf, colour = yr)) +
  geom_line() +
  labs(title = "GF")



# (pasted console output header) yr mon valentinesday  gift  boyf girlf    ymoni


#-----------------------
# Analysis using modeling:
# use linear regression to investigate relationships, with gift as target.

# Relative gap between the boyf and girlf indices:
# (boyf - girlf) / (boyf + girlf). NaN when both are 0.
gt_vd2$i_bfgf <- (gt_vd2$boyf - gt_vd2$girlf) / (gt_vd2$boyf + gt_vd2$girlf)

# We can consider other variables from other resources too;
# i_bfgf is just one example of how the analysis can be expanded.

# Combined year/month index: year plus month / 120.
gt_vd2$ymoni <- as.numeric(as.character(gt_vd2$yr)) +
  (as.numeric(gt_vd2$mon) / (12 * 10))

# Terms are bare column names resolved through `data`. Writing gt_vd2$col in
# the formula makes `data` inert and breaks predict(..., newdata = ).
fit_gift <- lm(
  gift ~ as.numeric(yr) + ymoni + boyf + girlf + i_bfgf + valentinesday,
  data = gt_vd2
)

summary(fit_gift) # show results


# interpretation: valentinesday is a critical event

# but Valentine's Day is a one-time event each year, so observations are scarce!!

# so the analysis is not that reliable



# Random forest with gift as the target, followed by a variable-importance
# plot.

# randomForest() and varImpPlot() are called below, so the package must be
# loaded (the load was previously commented out).
library(randomForest)

# Numeric year as a plain column, so the formula contains only column names.
rf_gift_data <- transform(gt_vd2, yr_num = as.numeric(as.character(yr)))

# Fixed: the argument was spelled `mrty`, which randomForest() silently
# ignores, so the default mtry was used. The intended value was 10, but the
# model has only 6 predictors and mtry cannot exceed that, so it is set to 6.
rf_gift <- randomForest(
  gift ~ yr_num + ymoni + boyf + girlf + i_bfgf + valentinesday,
  data = rf_gift_data,
  mtry = 6,
  importance = TRUE,
  do.trace = 10,
  ntree = 10000
)

varImpPlot(rf_gift)



# This time valentinesday is the target: linear regression first, then a
# random forest with a variable-importance plot.

library(randomForest)

# Terms are bare column names resolved through `data`.
fit_vday <- lm(
  valentinesday ~ as.numeric(yr) + ymoni + boyf + girlf + i_bfgf + gift,
  data = gt_vd2
)

# Fixed: this previously printed summary(fit_gift), i.e. the other model.
summary(fit_vday) # show results

# Numeric year as a plain column, so the formula contains only column names.
rf_vday_data <- transform(gt_vd2, yr_num = as.numeric(as.character(yr)))

# Fixed: `mrty` was a misspelling of `mtry` and was silently ignored. The
# intended value was 10, but the model has only 6 predictors and mtry cannot
# exceed that, so it is set to 6.
rf_vday <- randomForest(
  valentinesday ~ yr_num + ymoni + boyf + girlf + i_bfgf + gift,
  data = rf_vday_data,
  mtry = 6,
  importance = TRUE,
  do.trace = 10,
  ntree = 10000
)

varImpPlot(rf_vday)




# interpretation: Valentine's Day is all about gifts!!

# people googling "valentinesday" is a signal of gift searching


#-------- 결과 ----------