>RE::VISION CRM

R 데이터 분석

[SCW.VEDAR] Part 2

YONG_X 2016. 8. 12. 11:14




#========= [ 8 ] USING DECISION TREE  ============================



# decision tree를 사용한 변수가 관계 확인 후 다시 플로팅 

require(party)

hit3$AVG_RBI <- hit3$RBI / hit3$AB

t1 <- ctree(AVG_RBI ~ AB + OBP + HR + AVG + SLG + OPS + BB + Pos + Team , data=hit3)

plot(t1)


plot(hit3$SLG, hit3$HR, col=ifelse(hit3$AVG_RBI>=0.2,"red","black"))

# 타점생산은 장타력에 달린 것. HR수만으로는 결정되지 않음

# does this mean any causation?


plot(hit3$SLG, hit3$HR/hit3$AB, col=ifelse(hit3$AVG_RBI>=0.2,"red","black"))

# HR 비율이 높다는 것은 결정적 요인 -- 트리에 파생변수로 추가?


hit3$AVG_HR <- hit3$HR / hit3$AB

t1 <- ctree(AVG_RBI ~ AVG_HR + AB + OBP + HR + AVG + SLG + OPS + BB + Pos + Team , data=hit3)

plot(t1)

plot(hit3$OPS, hit3$AVG_HR, col=ifelse(hit3$AVG_RBI>=0.2,"red","black"))


head(hit3[order(-hit3$AVG_RBI),],12)

# Lee_D AVG_RBI 4위





#========= [ 9 ] CORRELATED VARIABLES  ============================


# plotting correlated variables


plot(hit$SO/hit$AB, hit$AVG)

hit$AVG_SO <- hit$SO/hit$AB

abline(lm(hit$AVG~hit$AVG_SO), col="red")


hit7 <- hit[hit$Pos !="P" & hit$AB>=30,]

nrow(hit) ; nrow(hit7)

table(hit7$Pos)

plot(hit7$AVG_SO, hit7$AVG, main=paste0("excluding pitchers. cor =", as.character(cor(hit7$AVG_SO, hit7$AVG))))

abline(lm(hit7$AVG~hit7$AVG_SO), col="red")


plot(hit7$AVG_SO, hit7$AVG, main=paste0("excluding pitchers. cor =", as.character(cor(hit7$AVG_SO, hit7$AVG))), cex=hit7$AB*2/max(hit7$AB))

abline(lm(hit7$AVG~hit7$AVG_SO), col="red")




hit8 <- hit[hit$Pos =="LF" & hit$AB>=30,]


plot(hit8$AVG_SO, hit8$AVG, main=paste0("LFs. cor =", as.character(cor(hit8$AVG_SO, hit8$AVG))), cex=hit8$AB*2/max(hit8$AB))

abline(lm(hit8$AVG~hit8$AVG_SO), col="red")



plot(hit8$AVG_SO, hit8$AVG, main=paste0(paste0("LFs of (30AB+) [ cor =", substr(as.character(cor(hit8$AVG_SO, hit8$AVG)),1,8)," ]")), cex=hit8$AB*3/max(hit8$AB), xlab="SO rate", ylab="AVG",  ylim=c(0,0.35))

abline(lm(hit8$AVG~hit8$AVG_SO), col="red", lty=2)

abline(0.289*1.1, -0.184, col="darkgrey", lty=2)

abline(0.289*.9, -0.184, col="darkgrey", lty=2)

hit81 <- head(hit8[order(-hit8$AVG),],2)

points(hit81$AVG_SO, hit81$AVG, col="red", pch=20, cex=hit8$AB*3/max(hit8$AB))

text(hit81$AVG_SO, hit81$AVG, labels=hit81$Player, pos=1)

# to get the location

# tmploc <- locator(1)

text(0.34, 0.32, "** Point size shows AB", col="blue")



hit8 <- hit[hit$Pos =="LF" & hit$AB>=120,]

plot(hit8$AVG_SO, hit8$AVG, main=paste0(paste0("LFs(120AB+) [ cor =", substr(as.character(cor(hit8$AVG_SO, hit8$AVG)),1,8)," ]")), cex=hit8$AB*3/max(hit8$AB), xlab="SO rate", ylab="AVG", ylim=c(0,0.35))

abline(lm(hit8$AVG~hit8$AVG_SO), col="red", lty=2)

abline( lm(hit8$AVG~hit8$AVG_SO)$coefficients[1]*1.1, lm(hit8$AVG~hit8$AVG_SO)$coefficients[2], col="darkgrey", lty=2)

abline( lm(hit8$AVG~hit8$AVG_SO)$coefficients[1]*.9, lm(hit8$AVG~hit8$AVG_SO)$coefficients[2], col="darkgrey", lty=2)

hit81 <- head(hit8[order(-hit8$AVG),],2)

points(hit81$AVG_SO, hit81$AVG, col="red", pch=20, cex=hit8$AB*3/max(hit8$AB))

text(hit81$AVG_SO, hit81$AVG, labels=hit81$Player, pos=1)

# to get the location

# tmploc <- locator(1)

text(0.34, 0.32, "** Point size shows AB", col="blue")




plot(hit8$AVG_SO/hit8$AVG, hit8$AVG, main=paste0(paste0("LFs(120AB+) [ cor =", substr(as.character(cor(hit8$AVG_SO/hit8$AVG, hit8$AVG)),1,8)," ]")), cex=hit8$AB*3/max(hit8$AB) ,xlab="SO rate/AVG", ylab="AVG", ylim=c(0,0.35))

hit81 <- head(hit8[order(-hit8$AVG),],2)

hit8$AVG_SO_b_AVG <- hit8$AVG_SO/hit8$AVG

abline(lm(hit8$AVG~hit8$AVG_SO_b_AVG), col="red", lty=2)


points(hit81$AVG_SO_b_AVG, hit81$AVG, col="red", pch=20, cex=hit8$AB*3/max(hit8$AB))

text(hit81$AVG_SO/hit81$AVG, hit81$AVG, labels=hit81$Player, pos=1)

# to get the location

# tmploc <- locator(1)

text(1.7, 0.32, "** Point size shows AB", col="blue")



# cex handling practice ----------

plot(sort(hit8$AB), cex=sort(hit8$AB*3/max(hit8$AB)))

points(sort(hit8$AB),  cex=sort(hit8$AB*3/max(hit8$AB))*2, col="orange")

points(sort(hit8$AB)+20^1.1,  cex=sort((hit8$AB*3)/max(hit8$AB)), col="lightblue")



plot(hit8$AB, hit8$AVG, ylim=c(0,0.35))

abline(lm(hit8$AVG~hit8$AB), col="skyblue")


# --- correlation interpretation --------

# 타율이 높은 타자에게 더 많은 기회가 계속 주어지는 것인가?

# 타수가 늘어날 수록 타율이 높아지는가?




#========= [ 10 ] COLORING  ============================


# semi-transparent plot with bg colors example


plot(hit8$AB, hit8$SLG, ylim=c(0,0.7), col=adjustcolor("blue", alpha=0.5) , pch=20, cex=(hit8$AVG^1.2)*15)

abline(lm(hit8$SLG~hit8$AB), col="skyblue")

hit82 <- hit8[hit8$AB>=180,]

abline(lm(hit82$SLG~hit82$AB), col="darkblue", lty=2)


# semi-transparent plot

hit8$AVG_HR <- hit8$HR/hit8$AB


plot(hit8$AB, hit8$AVG_HR, ylim=c(0,0.1), col=adjustcolor("blue", alpha=0.5) , pch=20, cex=(hit8$AVG^1.3)*15)

points(hit8[hit8$Player=="Kim_H",]$AB, hit8[hit8$Player=="Kim_H",]$AVG_HR, col="red", pch=20, cex=(hit8$AVG^1.2)*15)


abline(lm(hit8$AVG_HR~hit8$AB), col="skyblue")

hit82 <- hit8[hit8$AB>=180,]

abline(lm(hit82$AVG_HR~hit82$AB), col="darkblue", lty=2)



# semi-transparent plot with bg colors

hit8 <- hit[hit$Pos =="LF" & hit$AB>=120,]

plot(hit8$AB, hit8$SLG, ylim=c(0,0.7), main="Who R the Valuable LFs?(120+AB)", xlab="AB", ylab="SLG")

rect(par("usr")[1],par("usr")[3],par("usr")[2],par("usr")[4],col = "skyblue")

rect(180,par("usr")[3],par("usr")[2],par("usr")[4],col = "lightblue")

grid(NA, NULL, lwd=1, lty=3, col="white")

grid(NULL, NA, lwd=1, lty=3, col="white")

points(hit8$AB, hit8$SLG, ylim=c(0,0.7), col=adjustcolor("blue", alpha=0.5) , pch=20, cex=(hit8$AVG^1.2)*15)

abline(lm(hit8$SLG~hit8$AB), col="darkblue", lty=3, lwd=1.5)

abline(v=180,  lty=3, lwd=2, col="darkgrey")

hit82 <- hit8[hit8$AB>=180,]

abline(lm(hit82$SLG~hit82$AB), col="navyblue", lty=2, lwd=1.5)

points(hit8[hit8$HR/hit8$AB>=0.05,]$AB, hit8[hit8$HR/hit8$AB>=0.05,]$SLG, col=adjustcolor("green", alpha=0.5), pch=20, cex=(hit8[hit8$Player=="Kim_H",]$AVG^1.2)*15)

# pointing KH

points(hit8[hit8$Player=="Kim_H",]$AB, hit8[hit8$Player=="Kim_H",]$SLG, col="red", pch=20, cex=(hit8[hit8$Player=="Kim_H",]$AVG^1.2)*15)

text(hit8[hit8$Player=="Kim_H",]$AB, hit8[hit8$Player=="Kim_H",]$SLG, col="navyblue", labels=hit8[hit8$Player=="Kim_H",]$Player, pos=1)

text(140, 0.65, "** Point size shows AVG", pos=4)

text(140, 0.7, "* green :: AVG_HR>=0.05", pos=4)







#========= [ 11 ] ADDITIONAL  ============================

# Basics + PLUS

#------------------------

# histogram


hist(hit3$SLG)

hist(hit3$SLG, breaks=20)

abline(v=0.520,col="red")



#--------

# compare multiple distributions using histograms 

hist(hit3$SLG, breaks=20, xlim=c(0, 1), main="Dist of OBP and SLG")

hist(hit3$OBP, breaks=20, add=T)


hist(hit3$HR/hit3$AB, breaks=20 )

hist(hit7$HR/hit7$AB, breaks=20 )




#-------------

# boxplot by Pos


boxplot(AVG~Pos,data=hit3, main="AVG by Pos", 

  xlab="Position", ylab="AVG")


boxplot(HR~Pos,data=hit3, main="HR by Pos", 

  xlab="Position", ylab="HR")

# DH는 홈런 중요


boxplot(X2B~Pos,data=hit3, main="X2B by Pos", 

  xlab="Position", ylab="X2B")

# 3루수가 빠른 발과 펀치력 겸비


boxplot(HR~Team,data=hit3, main="HR by Team", 

  xlab="Team", ylab="HR", cex.axis=0.6)

abline(a=12, b=0, lty=3)


boxplot(AVG~Team,data=hit3, main="AVG by Team", 

  xlab="Team", ylab="AVG", cex.axis=0.6)

abline(a=0.3, b=0, lty=3)



bymedian <- with(hit3, reorder(Team, -HR, median))

boxplot(HR~bymedian,data=hit3, main="HR by Team - Ordered", 

  xlab="Team", ylab="HR", cex.axis=0.6)


# UDF for quartile 

getQtl <- function(x) {quantile(x, probs = 0.25, na.rm=T)}

bymedian <- with(hit3, reorder(Team, -HR, getQtl))

boxplot(HR~bymedian,data=hit3, main="HR by Team - Ordered", 

  xlab="Team", ylab="HR", cex.axis=0.6)




#===============================

# RF with MLB hitting data

#===============================


# regression case


require(randomForest)

fit <- randomForest(AVG_RBI ~ AB + OBP + HR + AVG + SLG + OPS + BB + 

                      Pos + Team + SO + SB + CS + X2B + X3B + H + R + G,

                      data=hit3, 

                      importance=TRUE, 

                      ntree=10000)


plot(fit)

varImpPlot(fit)


Prediction <- predict(fit, hit3)

submit <- data.frame(Player = hit3$Player, AVG_RBI_P = Prediction, AVG_RBI = hit3$AVG_RBI)

# write.csv(submit, file = "RF_pred.csv", row.names = FALSE)

plot(submit$AVG_RBI, submit$AVG_RBI_P)

# MAPE

mean(abs(submit$AVG_RBI - submit$AVG_RBI_P))/mean(submit$AVG_RBI) *100




require(party)


# forest of conditional inference tree  - often used for classification 

# c.f. http://trevorstephens.com/kaggle-titanic-tutorial/r-part-5-random-forests/


fit <- cforest(AVG_RBI ~ AB + OBP + HR + AVG + SLG + OPS + BB + 

                      Pos + Team + SO + SB + CS + X2B + X3B + H + R + G,

                 data=hit3,  

                 controls=cforest_unbiased(ntree=10000, mtry=3))

Prediction <- predict(fit, hit3, OOB=TRUE , type="response")


submit <- data.frame(Player = hit3$Player, AVG_RBI = hit3$AVG_RBI, 

   AVG_RBI_P = as.numeric(Prediction))

# write.csv(submit, file = "RF_pred.csv", row.names = FALSE)

plot(submit$AVG_RBI, submit$AVG_RBI_P)

# MAPE

mean(abs(submit$AVG_RBI - submit$AVG_RBI_P))/mean(submit$AVG_RBI) *100





#-----------

# handling category variables - barplot


head(table(hit2$Team, hit2$Pos))

barplot(colMeans(table(hit2$Team, hit2$Pos)))


sort(colMeans(table(hit2$Team, hit2$Pos)), decreasing=T)

barplot(sort(colMeans(table(hit2$Team, hit2$Pos)), decreasing=T))


teamPlayers <- aggregate(hit2$Player, by=list(hit2$Team), function(x) {length(x)})

names(teamPlayers) <- c("Team", "Players")

barplot(teamPlayers$Players, names.arg=teamPlayers$Team, cex.names=0.7)

barplot(teamPlayers$Players, names.arg=teamPlayers$Team, cex.axis=0.9, cex.names=0.7)



hit11 <- hit[hit$HR>10,] 

teamHRhitter <- aggregate(hit11$Player, by=list(hit11$Team), function(x) {length(x)})

names(teamHRhitter) <- c("Team", "HRhitters")

barplot(teamHRhitter$HRhitters, names.arg=teamHRhitter$Team, cex.names=0.7,

col=ifelse(teamHRhitter$HRhitters>=5, "red", "grey"))


hit12 <- hit[hit$HR>10,] 

PosHRhitter <- aggregate(hit12$Player, by=list(hit12$Pos), function(x) {length(x)})

names(PosHRhitter) <- c("Pos", "HRhitters")

barplot(PosHRhitter$HRhitters, names.arg=PosHRhitter$Pos, 

col=ifelse(PosHRhitter$HRhitters>=12, "red", "grey"))




#------- color filled line plot ---------------


plot(sort(hit3$HR), type="l", main="HR and X2B")

polygon(c(1:nrow(hit3), rev(1:nrow(hit3))), c(rep(0, nrow(hit3)), rev(sort(hit3$HR))), col=adjustcolor("blue", alpha=0.5))


polygon(c(1:nrow(hit3), rev(1:nrow(hit3))), c(rep(0, nrow(hit3)), rev(hit3[order(hit3$HR),]$X2B)), col=adjustcolor("red", alpha=0.5), border=NA)

mtext("ordered by HR(120+ AB)", 3, line=.8)

abline(lm(hit3[order(hit3$HR),]$X2B~c(1:nrow(hit3))), lty=2)

text(50, 12, "regression line of X2B")



# fill in areas between lines

plot(sort(hit3$HR), type="l", main="HR and X2B")

polygon(c(1:nrow(hit3), rev(1:nrow(hit3))) , c(rev(sort(hit3$HR, decreasing=T)), rev(hit3[order(hit3$HR),]$X2B)), col=adjustcolor("blue", alpha=0.5))

text(250, 3, "X2B above HR line filled")

arrows(200,15,200,11, col=2, lwd=7)


# why? running also is critical to get X2B

summary(lm(X2B~HR+SB, data=hit3))



plot(hit3[hit3$KPlayer==1,]$HR,hit3[hit3$KPlayer==1,]$X2B , ylim=c(0,10))

text(hit3[hit3$KPlayer==1,]$HR, hit3[hit3$KPlayer==1,]$X2B, labels=hit3[hit3$KPlayer==1,]$Player, pos=1)


# color handling practice

plot(hit$OBP, hit$SLG, col=ifelse(ceiling(hit$OBP*50)%%2 ==0 ,"black","darkgrey"), pch=20)


# plotting practice to mimic heatmaps

plot(expand.grid(1:10, 1:10), col="darkgrey", pch=15, cex=5)

points( expand.grid(1:10, 1:10), col=ifelse((expand.grid(1:10, 1:10)[,1]-expand.grid(1:10, 1:10)[,2]) %in% c(1,3), "red", "darkgrey"), pch=15, cex=5)




#---------- TBE analysis using plot and plot based heatmap


hit3$TBE <- (hit3$H - hit3$X2B - hit3$X3B - hit3$HR ) + (hit3$X2B *2 ) + (hit3$X3B *3 ) + (hit3$HR *4) + hit3$BB

hit3$AVG_TBE <- hit3$TBE/hit3$AB


require(party)

hit3$AVG_BB <- hit3$BB / hit3$AB

t1 <- ctree(AVG_TBE ~ AB + OBP + HR + AVG + SLG + AVG_BB + Pos + Team , data=hit3)

plot(t1)

# SLG and AVG_BB are keys



require(randomForest)

fit <- randomForest(AVG_TBE ~ AB + OBP + HR + AVG + SLG + AVG_BB + Pos + Team,

                      data=hit3, 

                      importance=TRUE, 

                      ntree=10000)


plot(fit)

varImpPlot(fit)

# SLG and OBP are keys?



# scatter plot itself -------


plot(hit3$SLG, hit3$AVG_BB, pch=20, col=ifelse(hit3$AVG_TBE>=0.5,"darkgrey", "lightgrey"), 

  main="TBE tiers by key factors",

  xlab="Slugging", ylab="BB/AB")

hit3TBEh <-  hit3[hit3$AVG_TBE>=0.65,]

points(hit3TBEh$SLG, hit3TBEh$AVG_BB, pch=20, col="black")

text(hit3TBEh$SLG, hit3TBEh$AVG_BB, ifelse(hit3TBEh$AVG_TBE>0.67,hit3TBEh$Player,""), col="blue", pos=4, cex=0.6)

text(hit3TBEh$SLG, hit3TBEh$AVG_BB, ifelse(hit3TBEh$AVG_TBE>0.71,hit3TBEh$Player,""), col="darkblue", pos=4, cex=0.6)

hit3TBEk <-  hit3[hit3$KPlayer==1,]

text(hit3TBEk$SLG, hit3TBEk$AVG_BB, hit3TBEk$Player, col="red", pos=4, cex=0.6)

mtext("Koreans are in the middle [120+AB - 20160705]", 3, line=.7, cex=0.8, col="darkgrey")

text(0.25, 0.21, "* Note: Colors represent TBE(Total Bases Earned) tiers", cex=0.6)



plot(hit3$SLG, hit3$OBP, pch=20, col=ifelse(hit3$AVG_TBE>=0.5,"darkgrey", "lightgrey"), 

  main="TBE tiers by key factors",

  xlab="Slugging", ylab="OBP")

hit3TBEh <-  hit3[hit3$AVG_TBE>=0.65,]

points(hit3TBEh$SLG, hit3TBEh$OBP, pch=20, col="black")

text(hit3TBEh$SLG, hit3TBEh$OBP, ifelse(hit3TBEh$AVG_TBE>0.67,hit3TBEh$Player,""), col="blue", pos=4, cex=0.6)

text(hit3TBEh$SLG, hit3TBEh$OBP, ifelse(hit3TBEh$AVG_TBE>0.71,hit3TBEh$Player,""), col="darkblue", pos=4, cex=0.6)

hit3TBEk <-  hit3[hit3$KPlayer==1,]

text(hit3TBEk$SLG, hit3TBEk$OBP, hit3TBEk$Player, col="red", pos=4, cex=0.6)

mtext("Koreans are in the middle [120+AB - 20160705]", 3, line=.7, cex=0.8, col="darkgrey")

text(0.25, 0.21, "* Note: Colors represent TBE(Total Bases Earned) tiers", cex=0.6)



hit3TBEh[hit3TBEh$AVG_TBE>0.71, ]




#--- profile comparison --------


hitKimHarp <- hit3[hit3$Player %in% c("Kim_H","Harper_B"),]

cols <- c("OBP","AVG","SLG","AVG_BB")

hitKim_H <- head(hitKimHarp[,cols],1)

hitHarper_B <- tail(hitKimHarp[,cols],1)

barplot( as.numeric(hitKim_H/hitHarper_B), names.arg=cols, 

  col=ifelse(as.numeric(hitKim_H/hitHarper_B)<=0.7, "red","grey"),

  main="Kim_H compared to Harper_B" )


plot(hit3$AVG, hit3$AVG_BB, pch=20, col=ifelse(hit3$OBP>=0.35,"darkgrey","lightgrey"))

hitKimHarp <- hit3[hit3$Player %in% c("Kim_H","Harper_B"),]

text(hitKimHarp$AVG, hitKimHarp$AVG_BB, c("Kim_H","Harper_B"), col="red", pos=4, cex=0.6)


plot(hit3$SO/hit3$AB, hit3$AVG_BB, pch=20, col=ifelse(hit3$OBP>=0.35,"darkgrey","lightgrey"), main="If Kim_H sees more balls?")

hitKimHarp <- hit3[hit3$Player %in% c("Kim_H","Harper_B"),]

text(hitKimHarp$SO/hitKimHarp$AB, hitKimHarp$AVG_BB, c("Kim_H","Harper_B"), col="red", pos=4, cex=0.6)

arrows(0.155, 0.13,0.205, 0.24, lty=2, lwd=1.5, col="red")

arrows(0.155, 0.13,0.22, 0.22, lty=2, lwd=1.5, col="orange")




plot(hit3$AVG, hit3$SLG-hit3$AVG, main="Kim_H, AVG Vs. ISOP")

rect(.33, par("usr")[3],par("usr")[2],par("usr")[4],col = "lightblue")

points(hit3$AVG, hit3$SLG-hit3$AVG)

mtext("ISOP: Isolated Power(a sabermetric baseball statistic) ; As of 20160704", 3, line=.7, cex=0.8, col="darkgrey")

points(hit3[hit3$Player=="Kim_H",]$AVG, hit3[hit3$Player=="Kim_H",]$SLG-hit3[hit3$Player=="Kim_H",]$AVG, pch=20, col="blue")

points(hit3[hit3$Player=="Suzuki_I",]$AVG, hit3[hit3$Player=="Suzuki_I",]$SLG-hit3[hit3$Player=="Suzuki_I",]$AVG, pch=20, col="red")

text(hit3[hit3$Player %in% c("Kim_H", "Suzuki_I"),]$AVG, hit3[hit3$Player %in% c("Kim_H", "Suzuki_I"),]$SLG-hit3[hit3$Player %in% c("Kim_H", "Suzuki_I"),]$AVG, hit3[hit3$Player %in% c("Kim_H", "Suzuki_I"),]$Player, pos=1, cex=0.6)


text(hit3[hit3$AVG>=0.33,]$AVG, hit3[hit3$AVG>=0.33,]$SLG-hit3[hit3$AVG>=0.33,]$AVG, hit3[hit3$AVG>=0.33,]$Player, pos=1, cex=0.6)

abline(v=0.338, col="darkgrey", lty=3)



# 그 위대한 Ichiro Suzuki, MLB Career AVG .314, OPS .762

# Hyun Soo Kim(a.k.a. Machine) 2016 AVG .328, OPS .856 (as of 20160809)

# Machine이라 불려도 손색없을 상황


#----------- END -------------

'R 데이터 분석' 카테고리의 다른 글

LGUP_BDAS _ 20161006  (0) 2016.10.06
heatmap using scatterplot  (0) 2016.08.22
[SCW.VEDAR] Part 1  (0) 2016.08.12
[R분석] cluster based anomaly detection  (0) 2016.08.11
[SCWHO] 시각적 데이터 분석 EDA 예제 MLB Hitting 2016mid  (0) 2016.07.05