>RE::VISION CRM

R 데이터 분석

GameLog-In 데이터준비

YONG_X 2018. 10. 16. 16:32

# Load the original data from the blog attachments:
#   usr - user profile table
#   trx - game log-in history
usr <- read.csv("https://t1.daumcdn.net/cfile/blog/215C784B588ED2A133?download")
trx <- read.csv("https://t1.daumcdn.net/cfile/blog/256C0D4B588ED29C26?download")

# Quick sanity check of the table sizes (rows, columns).
dim(usr)
dim(trx)


#--------------------------

# log in history

# (attached file) trx.csv


# user profile

# (attached file) user.csv

#--------------------------


# Arbitrarily transform the distribution of the user profile.
# Works on a copy (usr1) so the downloaded table (usr) stays untouched.
# No seed is set, so every run produces a different random reassignment.
usr1 <- usr

# Overwrite `n` randomly chosen elements of `values` with `replacement`
# and return the modified vector. `replacement` follows ordinary vector
# assignment rules (a single value is recycled; a length-n vector is used
# element by element). Errors if `n` exceeds length(values).
overwrite_random <- function(values, n, replacement) {
  # sample.int() avoids the 1:length(x) trap, which yields c(1, 0) for
  # an empty vector instead of an empty index set.
  values[sample.int(length(values), n)] <- replacement
  values
}

# Work on plain character / numeric columns; convert to factors at the end.
usr1$device_type <- as.character(usr1$device_type)
usr1$gender <- as.character(usr1$gender)

# 1) Move 3300 random male Android users to iOS.
#    which() drops NA comparisons so the indexed assignment cannot fail on them.
rows <- which(usr1$device_type == "Android" & usr1$gender == "M")
usr1$device_type[rows] <- overwrite_random(usr1$device_type[rows], 3300, "iOS")

# 2) Relabel 1200 random female iOS users as male.
rows <- which(usr1$device_type == "iOS" & usr1$gender == "F")
usr1$gender[rows] <- overwrite_random(usr1$gender[rows], 1200, "M")

# 3) Relabel a further 7500 random female users (any device) as male.
rows <- which(usr1$gender == "F")
usr1$gender[rows] <- overwrite_random(usr1$gender[rows], 7500, "M")

# 4) Among the remaining females with generation >= 30, set 1200 random
#    users to generation 10 or 20 (600 of each, alternating).
usr1$generation <- as.numeric(as.character(usr1$generation))
rows <- which(usr1$generation >= 30 & usr1$gender == "F")
usr1$generation[rows] <- overwrite_random(
  usr1$generation[rows], 1200, rep(c(10, 20), 600)
)

# Store the transformed columns as factors, as in the original script.
usr1$device_type <- factor(usr1$device_type)
usr1$gender <- factor(usr1$gender)
usr1$generation <- factor(usr1$generation)

# write.csv(usr1, "C:/YONG/m1710/user.csv", row.names=F)
# head(read.csv("C:/YONG/m1710/user.csv"))


# Build a synthetic `duration` for every log-in record.
# Works on a copy (trx1) so the downloaded table (trx) stays untouched.
trx1 <- trx
# One Binomial(500, 0.04) draw per row (expected value 20). The row count
# is taken from the data instead of being hard-coded.
trx1$duration <- rbinom(nrow(trx1), 500, 0.04)

# Per-user mean duration, plotted in ascending order with the overall
# mean as a horizontal reference line.
a1 <- aggregate(duration ~ user_id, data = trx1, FUN = mean)
names(a1) <- c("user_id", "duration_mean")
plot(sort(a1$duration_mean))
abline(h = mean(trx1$duration))

# Per-user total duration, plotted in ascending order.
a2 <- aggregate(duration ~ user_id, data = trx1, FUN = sum)
names(a2) <- c("user_id", "duration_sum")
plot(sort(a2$duration_sum))

# Both aggregates are grouped by the same user_id on the same data, so
# their rows line up and the sum can be attached directly.
a1$duration_sum <- a2$duration_sum

# freq vs. duration sum
# sum / mean equals the number of log-in records per user.
usr_freq <- a1$duration_sum / a1$duration_mean
plot(usr_freq, a1$duration_sum)
cor(usr_freq, a1$duration_sum)

# Heavy users: more than 40 log-in records.
heavyusr <- a1$user_id[usr_freq > 40]

# Shrink durations by 10% for the first 200 heavy users ...
husmpl <- head(heavyusr, 200)
in_husmpl <- trx1$user_id %in% husmpl
trx1$duration[in_husmpl] <- round(trx1$duration[in_husmpl] * 0.9)

# ... then by a further 5% for the 300 heavy users with the smallest ids.
# Users that fall in both samples receive both reductions.
husmpl1 <- head(sort(heavyusr), 300)
in_husmpl1 <- trx1$user_id %in% husmpl1
trx1$duration[in_husmpl1] <- round(trx1$duration[in_husmpl1] * 0.95)
# NOTE(review): a1 / a2 were aggregated before these reductions and are not
# recomputed here, so they still describe the unadjusted durations.

# write.csv(trx1, "C:/YONG/m1710/trx.csv", row.names=F) 



# Attach the user profile to the per-user duration aggregates.
# all.x = TRUE keeps every row of a1 even when no profile row matches.
uprf1 <- merge(a1, usr1, by = "user_id", all.x = TRUE)

# sum / mean of duration equals the number of log-in records per user.
freq <- uprf1$duration_sum / uprf1$duration_mean
# Numeric gender indicator: 0 for "M", 1 otherwise (NA stays NA).
not_male <- ifelse(uprf1$gender == "M", 0, 1)

# Log-in frequency vs. total duration. Males are drawn in blue, all
# others in red, with 10% opacity so that dense regions remain readable.
plot(jitter(freq), uprf1$duration_sum,
     col = rgb(not_male, 0, 1 - not_male, 0.1),
     pch = 19, cex = 0.4,
     xlab = "log-in frequency (jittered)", ylab = "duration_sum")


# inspect a subset: users whose total duration exceeds 600
uprf2 <- uprf1[uprf1$duration_sum > 600, ]
not_male2 <- ifelse(uprf2$gender == "M", 0, 1)

# Correlation of the gender indicator with log-in frequency ...
cor(uprf2$duration_sum / uprf2$duration_mean, not_male2)

# ... and with total duration.
cor(uprf2$duration_sum, not_male2)




trx.csv
5.32MB
user.csv
2.2MB