# Get the original data for the game log-in history exercise.
#   usr: user table; the code below reads its user_id, device_type, gender
#        and generation columns.
#   trx: log-in records; the code below reads its user_id column.
usr <- read.csv("https://t1.daumcdn.net/cfile/blog/215C784B588ED2A133?download")
trx <- read.csv("https://t1.daumcdn.net/cfile/blog/256C0D4B588ED29C26?download")
# Quick size check (rows, columns) of both tables.
dim(usr)
dim(trx)
#--------------------------
# log in history
# user profile
#--------------------------
# Arbitrarily transform the distribution of the user attributes
# (device_type, gender, generation) by relabelling randomly chosen rows.
# NOTE(review): no seed is set here, so every run produces a different usr1;
# call set.seed() beforehand if a reproducible result is needed.

# Draw `size` random row positions among the rows where `mask` is TRUE.
#   mask: logical vector, one element per row; NA is treated as "no match".
#   size: number of rows to draw (without replacement).
# Returns the chosen row positions in draw order. If fewer than `size` rows
# match, every matching row is returned and a warning is raised, instead of
# sample() stopping the script.
sample_rows <- function(mask, size) {
  candidates <- which(mask)
  if (size > length(candidates)) {
    warning(
      "requested ", size, " rows but only ", length(candidates), " match",
      call. = FALSE
    )
    size <- length(candidates)
  }
  # sample.int() consumes the RNG exactly like sample(1:n, size) does.
  candidates[sample.int(length(candidates), size)]
}

usr1 <- usr

# device_type: move 3300 male Android users to iOS.
usr1$device_type <- as.character(usr1$device_type)
to_ios <- sample_rows(
  usr1$device_type == "Android" & usr1$gender == "M",
  3300
)
usr1$device_type[to_ios] <- "iOS"
usr1$device_type <- factor(usr1$device_type)

# gender: relabel 1200 female iOS users as male, then 7500 of the remaining
# female users (any device) as male.
usr1$gender <- as.character(usr1$gender)
to_male <- sample_rows(usr1$device_type == "iOS" & usr1$gender == "F", 1200)
usr1$gender[to_male] <- "M"
to_male <- sample_rows(usr1$gender == "F", 7500)
usr1$gender[to_male] <- "M"
usr1$gender <- factor(usr1$gender)

# generation: move 1200 female users aged 30+ into the 10 / 20 groups,
# alternating 10, 20, 10, ... in draw order.
usr1$generation <- as.numeric(as.character(usr1$generation))
to_young <- sample_rows(usr1$generation >= 30 & usr1$gender == "F", 1200)
usr1$generation[to_young] <- rep_len(c(10, 20), length(to_young))
usr1$generation <- factor(usr1$generation)
# write.csv(usr1, "C:/YONG/m1710/user.csv", row.names=F)
# head(read.csv("C:/YONG/m1710/user.csv"))
# Attach a synthetic duration to every log-in record and build per-user
# aggregates (a1: mean and sum of duration per user_id).
trx1 <- trx
# One binomial draw per record; the count follows the data instead of being
# hard-coded, so the assignment cannot fail on a different number of rows.
trx1$duration <- rbinom(nrow(trx1), 500, 0.04)

# Per-user mean duration, shown sorted against the overall mean.
a1 <- aggregate(duration ~ user_id, data = trx1, FUN = mean)
names(a1) <- c("user_id", "duration_mean")
plot(sort(a1$duration_mean))
abline(h = mean(trx1$duration))

# Per-user total duration.
a2 <- aggregate(duration ~ user_id, data = trx1, FUN = sum)
names(a2) <- c("user_id", "duration_sum")
plot(sort(a2$duration_sum))
# a1 and a2 come from the same grouping of the same data, so rows line up.
a1$duration_sum <- a2$duration_sum

# freq vs. duration sum
# duration_sum / duration_mean is the number of records per user (up to
# floating-point rounding).
plot(a1$duration_sum / a1$duration_mean, a1$duration_sum)
cor(a1$duration_sum / a1$duration_mean, a1$duration_sum)

# Users with more than 40 records.
heavyusr <- a1$user_id[a1$duration_sum / a1$duration_mean > 40]

# Shrink durations for the first 200 heavy users by 10% ...
husmpl <- head(heavyusr, 200)
in_husmpl <- trx1$user_id %in% husmpl
trx1$duration[in_husmpl] <- round(trx1$duration[in_husmpl] * 0.9)

# ... and for the 300 heavy users with the smallest ids by a further 5%.
# NOTE(review): a1 is not recomputed after these adjustments, so the
# aggregates used later still reflect the unadjusted durations - confirm
# that this is intended.
husmpl1 <- head(sort(heavyusr), 300)
in_husmpl1 <- trx1$user_id %in% husmpl1
trx1$duration[in_husmpl1] <- round(trx1$duration[in_husmpl1] * 0.95)
# write.csv(trx1, "C:/YONG/m1710/trx.csv", row.names=F)
# Join the per-user aggregates with the user profile; all.x keeps every user
# present in a1 even when usr1 has no matching row.
uprf1 <- merge(a1, usr1, by = "user_id", all.x = TRUE)

# Indicator: 0 where gender is "M", 1 otherwise.
not_male <- ifelse(uprf1$gender == "M", 0, 1)

# Record count (sum / mean) vs. total duration; "M" is drawn in blue, every
# other gender in red, with alpha 0.1 to cope with overplotting.
plot(
  jitter(uprf1$duration_sum / uprf1$duration_mean), uprf1$duration_sum,
  col = rgb(not_male, 0, 1 - not_male, 0.1),
  pch = 19, cex = 0.4
)

# inspect a subset: users whose total duration exceeds 600
uprf2 <- uprf1[uprf1$duration_sum > 600, ]
not_male_sub <- ifelse(uprf2$gender == "M", 0, 1)
cor(uprf2$duration_sum / uprf2$duration_mean, not_male_sub)
cor(uprf2$duration_sum, not_male_sub)
# 'R 데이터 분석' 카테고리의 다른 글
# [kdata 2019 recsys 0030] retail recommender using R[전용준 리비젼 recsys r] (0) | 2019.05.18 |
# ---|---|
# [AI Summit workshop] rf anomaly 1206 (0) | 2018.11.30 |
# [R분석] 실전 EDA 탐색적분석 R 팁 3 (0) | 2018.10.05 |
# [R분석] LAD goes wrong. Why? a.f. 20180928 (0) | 2018.09.28 |
# [R분석] 프로야구 KBO 타자 성적과 나이의 관계 (0) | 2018.09.28 |