#========= [ 8 ] USING DECISION TREE ============================
# decision tree를 사용한 변수가 관계 확인 후 다시 플로팅
require(party)
hit3$AVG_RBI <- hit3$RBI / hit3$AB
t1 <- ctree(AVG_RBI ~ AB + OBP + HR + AVG + SLG + OPS + BB + Pos + Team , data=hit3)
plot(t1)
plot(hit3$SLG, hit3$HR, col=ifelse(hit3$AVG_RBI>=0.2,"red","black"))
# 타점생산은 장타력에 달린 것. HR수만으로는 결정되지 않음
# does this mean any causation?
plot(hit3$SLG, hit3$HR/hit3$AB, col=ifelse(hit3$AVG_RBI>=0.2,"red","black"))
# HR 비율이 높다는 것은 결정적 요인 -- 트리에 파생변수로 추가?
hit3$AVG_HR <- hit3$HR / hit3$AB
t1 <- ctree(AVG_RBI ~ AVG_HR + AB + OBP + HR + AVG + SLG + OPS + BB + Pos + Team , data=hit3)
plot(t1)
plot(hit3$OPS, hit3$AVG_HR, col=ifelse(hit3$AVG_RBI>=0.2,"red","black"))
head(hit3[order(-hit3$AVG_RBI),],12)
# Lee_D AVG_RBI 4위
#========= [ 9 ] CORRELATED VARIABLES ============================
# plotting correlated variables
plot(hit$SO/hit$AB, hit$AVG)
hit$AVG_SO <- hit$SO/hit$AB
abline(lm(hit$AVG~hit$AVG_SO), col="red")
hit7 <- hit[hit$Pos !="P" & hit$AB>=30,]
nrow(hit) ; nrow(hit7)
table(hit7$Pos)
plot(hit7$AVG_SO, hit7$AVG, main=paste0("excluding pitchers. cor =", as.character(cor(hit7$AVG_SO, hit7$AVG))))
abline(lm(hit7$AVG~hit7$AVG_SO), col="red")
plot(hit7$AVG_SO, hit7$AVG, main=paste0("excluding pitchers. cor =", as.character(cor(hit7$AVG_SO, hit7$AVG))), cex=hit7$AB*2/max(hit7$AB))
abline(lm(hit7$AVG~hit7$AVG_SO), col="red")
hit8 <- hit[hit$Pos =="LF" & hit$AB>=30,]
plot(hit8$AVG_SO, hit8$AVG, main=paste0("LFs. cor =", as.character(cor(hit8$AVG_SO, hit8$AVG))), cex=hit8$AB*2/max(hit8$AB))
abline(lm(hit8$AVG~hit8$AVG_SO), col="red")
plot(hit8$AVG_SO, hit8$AVG, main=paste0(paste0("LFs of (30AB+) [ cor =", substr(as.character(cor(hit8$AVG_SO, hit8$AVG)),1,8)," ]")), cex=hit8$AB*3/max(hit8$AB), xlab="SO rate", ylab="AVG", ylim=c(0,0.35))
abline(lm(hit8$AVG~hit8$AVG_SO), col="red", lty=2)
abline(0.289*1.1, -0.184, col="darkgrey", lty=2)
abline(0.289*.9, -0.184, col="darkgrey", lty=2)
hit81 <- head(hit8[order(-hit8$AVG),],2)
points(hit81$AVG_SO, hit81$AVG, col="red", pch=20, cex=hit8$AB*3/max(hit8$AB))
text(hit81$AVG_SO, hit81$AVG, labels=hit81$Player, pos=1)
# to get the location
# tmploc <- locator(1)
text(0.34, 0.32, "** Point size shows AB", col="blue")
hit8 <- hit[hit$Pos =="LF" & hit$AB>=120,]
plot(hit8$AVG_SO, hit8$AVG, main=paste0(paste0("LFs(120AB+) [ cor =", substr(as.character(cor(hit8$AVG_SO, hit8$AVG)),1,8)," ]")), cex=hit8$AB*3/max(hit8$AB), xlab="SO rate", ylab="AVG", ylim=c(0,0.35))
abline(lm(hit8$AVG~hit8$AVG_SO), col="red", lty=2)
abline( lm(hit8$AVG~hit8$AVG_SO)$coefficients[1]*1.1, lm(hit8$AVG~hit8$AVG_SO)$coefficients[2], col="darkgrey", lty=2)
abline( lm(hit8$AVG~hit8$AVG_SO)$coefficients[1]*.9, lm(hit8$AVG~hit8$AVG_SO)$coefficients[2], col="darkgrey", lty=2)
hit81 <- head(hit8[order(-hit8$AVG),],2)
points(hit81$AVG_SO, hit81$AVG, col="red", pch=20, cex=hit8$AB*3/max(hit8$AB))
text(hit81$AVG_SO, hit81$AVG, labels=hit81$Player, pos=1)
# to get the location
# tmploc <- locator(1)
text(0.34, 0.32, "** Point size shows AB", col="blue")
plot(hit8$AVG_SO/hit8$AVG, hit8$AVG, main=paste0(paste0("LFs(120AB+) [ cor =", substr(as.character(cor(hit8$AVG_SO/hit8$AVG, hit8$AVG)),1,8)," ]")), cex=hit8$AB*3/max(hit8$AB) ,xlab="SO rate/AVG", ylab="AVG", ylim=c(0,0.35))
hit81 <- head(hit8[order(-hit8$AVG),],2)
hit8$AVG_SO_b_AVG <- hit8$AVG_SO/hit8$AVG
abline(lm(hit8$AVG~hit8$AVG_SO_b_AVG), col="red", lty=2)
points(hit81$AVG_SO_b_AVG, hit81$AVG, col="red", pch=20, cex=hit8$AB*3/max(hit8$AB))
text(hit81$AVG_SO/hit81$AVG, hit81$AVG, labels=hit81$Player, pos=1)
# to get the location
# tmploc <- locator(1)
text(1.7, 0.32, "** Point size shows AB", col="blue")
# cex handling practice ----------
plot(sort(hit8$AB), cex=sort(hit8$AB*3/max(hit8$AB)))
points(sort(hit8$AB), cex=sort(hit8$AB*3/max(hit8$AB))*2, col="orange")
points(sort(hit8$AB)+20^1.1, cex=sort((hit8$AB*3)/max(hit8$AB)), col="lightblue")
plot(hit8$AB, hit8$AVG, ylim=c(0,0.35))
abline(lm(hit8$AVG~hit8$AB), col="skyblue")
# --- correlation interpretation --------
# 타율이 높은 타자에게 더 많은 기회가 계속 주어지는 것인가?
# 타수가 늘어날 수록 타율이 높아지는가?
#========= [ 10 ] COLORING ============================
# semi-transparent plot with bg colors example
plot(hit8$AB, hit8$SLG, ylim=c(0,0.7), col=adjustcolor("blue", alpha=0.5) , pch=20, cex=(hit8$AVG^1.2)*15)
abline(lm(hit8$SLG~hit8$AB), col="skyblue")
hit82 <- hit8[hit8$AB>=180,]
abline(lm(hit82$SLG~hit82$AB), col="darkblue", lty=2)
# semi-transparent plot
hit8$AVG_HR <- hit8$HR/hit8$AB
plot(hit8$AB, hit8$AVG_HR, ylim=c(0,0.1), col=adjustcolor("blue", alpha=0.5) , pch=20, cex=(hit8$AVG^1.3)*15)
points(hit8[hit8$Player=="Kim_H",]$AB, hit8[hit8$Player=="Kim_H",]$AVG_HR, col="red", pch=20, cex=(hit8$AVG^1.2)*15)
abline(lm(hit8$AVG_HR~hit8$AB), col="skyblue")
hit82 <- hit8[hit8$AB>=180,]
abline(lm(hit82$AVG_HR~hit82$AB), col="darkblue", lty=2)
# semi-transparent plot with bg colors
hit8 <- hit[hit$Pos =="LF" & hit$AB>=120,]
plot(hit8$AB, hit8$SLG, ylim=c(0,0.7), main="Who R the Valuable LFs?(120+AB)", xlab="AB", ylab="SLG")
rect(par("usr")[1],par("usr")[3],par("usr")[2],par("usr")[4],col = "skyblue")
rect(180,par("usr")[3],par("usr")[2],par("usr")[4],col = "lightblue")
grid(NA, NULL, lwd=1, lty=3, col="white")
grid(NULL, NA, lwd=1, lty=3, col="white")
points(hit8$AB, hit8$SLG, ylim=c(0,0.7), col=adjustcolor("blue", alpha=0.5) , pch=20, cex=(hit8$AVG^1.2)*15)
abline(lm(hit8$SLG~hit8$AB), col="darkblue", lty=3, lwd=1.5)
abline(v=180, lty=3, lwd=2, col="darkgrey")
hit82 <- hit8[hit8$AB>=180,]
abline(lm(hit82$SLG~hit82$AB), col="navyblue", lty=2, lwd=1.5)
points(hit8[hit8$HR/hit8$AB>=0.05,]$AB, hit8[hit8$HR/hit8$AB>=0.05,]$SLG, col=adjustcolor("green", alpha=0.5), pch=20, cex=(hit8[hit8$Player=="Kim_H",]$AVG^1.2)*15)
# pointing KH
points(hit8[hit8$Player=="Kim_H",]$AB, hit8[hit8$Player=="Kim_H",]$SLG, col="red", pch=20, cex=(hit8[hit8$Player=="Kim_H",]$AVG^1.2)*15)
text(hit8[hit8$Player=="Kim_H",]$AB, hit8[hit8$Player=="Kim_H",]$SLG, col="navyblue", labels=hit8[hit8$Player=="Kim_H",]$Player, pos=1)
text(140, 0.65, "** Point size shows AVG", pos=4)
text(140, 0.7, "* green :: AVG_HR>=0.05", pos=4)
#========= [ 11 ] ADDITIONAL ============================
# Basics + PLUS
#------------------------
# histogram
hist(hit3$SLG)
hist(hit3$SLG, breaks=20)
abline(v=0.520,col="red")
#--------
# compare multiple distributions using histograms
hist(hit3$SLG, breaks=20, xlim=c(0, 1), main="Dist of OBP and SLG")
hist(hit3$OBP, breaks=20, add=T)
hist(hit3$HR/hit3$AB, breaks=20 )
hist(hit7$HR/hit7$AB, breaks=20 )
#-------------
# boxplot by Pos
boxplot(AVG~Pos,data=hit3, main="AVG by Pos",
xlab="Position", ylab="AVG")
boxplot(HR~Pos,data=hit3, main="HR by Pos",
xlab="Position", ylab="HR")
# DH는 홈런 중요
boxplot(X2B~Pos,data=hit3, main="X2B by Pos",
xlab="Position", ylab="X2B")
# 3루수가 빠른 발과 펀치력 겸비
boxplot(HR~Team,data=hit3, main="HR by Team",
xlab="Team", ylab="HR", cex.axis=0.6)
abline(a=12, b=0, lty=3)
boxplot(AVG~Team,data=hit3, main="AVG by Team",
xlab="Team", ylab="AVG", cex.axis=0.6)
abline(a=0.3, b=0, lty=3)
bymedian <- with(hit3, reorder(Team, -HR, median))
boxplot(HR~bymedian,data=hit3, main="HR by Team - Ordered",
xlab="Team", ylab="HR", cex.axis=0.6)
# UDF for quartile
getQtl <- function(x) {quantile(x, probs = 0.25, na.rm=T)}
bymedian <- with(hit3, reorder(Team, -HR, getQtl))
boxplot(HR~bymedian,data=hit3, main="HR by Team - Ordered",
xlab="Team", ylab="HR", cex.axis=0.6)
#===============================
# RF with MLB hitting data
#===============================
# regression case
require(randomForest)
fit <- randomForest(AVG_RBI ~ AB + OBP + HR + AVG + SLG + OPS + BB +
Pos + Team + SO + SB + CS + X2B + X3B + H + R + G,
data=hit3,
importance=TRUE,
ntree=10000)
plot(fit)
varImpPlot(fit)
Prediction <- predict(fit, hit3)
submit <- data.frame(Player = hit3$Player, AVG_RBI_P = Prediction, AVG_RBI = hit3$AVG_RBI)
# write.csv(submit, file = "RF_pred.csv", row.names = FALSE)
plot(submit$AVG_RBI, submit$AVG_RBI_P)
# MAPE
mean(abs(submit$AVG_RBI - submit$AVG_RBI_P))/mean(submit$AVG_RBI) *100
require(party)
# forest of conditional inference tree - often used for classification
# c.f. http://trevorstephens.com/kaggle-titanic-tutorial/r-part-5-random-forests/
fit <- cforest(AVG_RBI ~ AB + OBP + HR + AVG + SLG + OPS + BB +
Pos + Team + SO + SB + CS + X2B + X3B + H + R + G,
data=hit3,
controls=cforest_unbiased(ntree=10000, mtry=3))
Prediction <- predict(fit, hit3, OOB=TRUE , type="response")
submit <- data.frame(Player = hit3$Player, AVG_RBI = hit3$AVG_RBI,
AVG_RBI_P = as.numeric(Prediction))
# write.csv(submit, file = "RF_pred.csv", row.names = FALSE)
plot(submit$AVG_RBI, submit$AVG_RBI_P)
# MAPE
mean(abs(submit$AVG_RBI - submit$AVG_RBI_P))/mean(submit$AVG_RBI) *100
#-----------
# handling category variables - barplot
head(table(hit2$Team, hit2$Pos))
barplot(colMeans(table(hit2$Team, hit2$Pos)))
sort(colMeans(table(hit2$Team, hit2$Pos)), decreasing=T)
barplot(sort(colMeans(table(hit2$Team, hit2$Pos)), decreasing=T))
teamPlayers <- aggregate(hit2$Player, by=list(hit2$Team), function(x) {length(x)})
names(teamPlayers) <- c("Team", "Players")
barplot(teamPlayers$Players, names.arg=teamPlayers$Team, cex.names=0.7)
barplot(teamPlayers$Players, names.arg=teamPlayers$Team, cex.axis=0.9, cex.names=0.7)
hit11 <- hit[hit$HR>10,]
teamHRhitter <- aggregate(hit11$Player, by=list(hit11$Team), function(x) {length(x)})
names(teamHRhitter) <- c("Team", "HRhitters")
barplot(teamHRhitter$HRhitters, names.arg=teamHRhitter$Team, cex.names=0.7,
col=ifelse(teamHRhitter$HRhitters>=5, "red", "grey"))
hit12 <- hit[hit$HR>10,]
PosHRhitter <- aggregate(hit12$Player, by=list(hit12$Pos), function(x) {length(x)})
names(PosHRhitter) <- c("Pos", "HRhitters")
barplot(PosHRhitter$HRhitters, names.arg=PosHRhitter$Pos,
col=ifelse(PosHRhitter$HRhitters>=12, "red", "grey"))
#------- color filled line plot ---------------
plot(sort(hit3$HR), type="l", main="HR and X2B")
polygon(c(1:nrow(hit3), rev(1:nrow(hit3))), c(rep(0, nrow(hit3)), rev(sort(hit3$HR))), col=adjustcolor("blue", alpha=0.5))
polygon(c(1:nrow(hit3), rev(1:nrow(hit3))), c(rep(0, nrow(hit3)), rev(hit3[order(hit3$HR),]$X2B)), col=adjustcolor("red", alpha=0.5), border=NA)
mtext("ordered by HR(120+ AB)", 3, line=.8)
abline(lm(hit3[order(hit3$HR),]$X2B~c(1:nrow(hit3))), lty=2)
text(50, 12, "regression line of X2B")
# fill in areas between lines
plot(sort(hit3$HR), type="l", main="HR and X2B")
polygon(c(1:nrow(hit3), rev(1:nrow(hit3))) , c(rev(sort(hit3$HR, decreasing=T)), rev(hit3[order(hit3$HR),]$X2B)), col=adjustcolor("blue", alpha=0.5))
text(250, 3, "X2B above HR line filled")
arrows(200,15,200,11, col=2, lwd=7)
# why? running also is critical to get X2B
summary(lm(X2B~HR+SB, data=hit3))
plot(hit3[hit3$KPlayer==1,]$HR,hit3[hit3$KPlayer==1,]$X2B , ylim=c(0,10))
text(hit3[hit3$KPlayer==1,]$HR, hit3[hit3$KPlayer==1,]$X2B, labels=hit3[hit3$KPlayer==1,]$Player, pos=1)
# color handling practice
plot(hit$OBP, hit$SLG, col=ifelse(ceiling(hit$OBP*50)%%2 ==0 ,"black","darkgrey"), pch=20)
# plotting practice to mimic heatmaps
plot(expand.grid(1:10, 1:10), col="darkgrey", pch=15, cex=5)
points( expand.grid(1:10, 1:10), col=ifelse((expand.grid(1:10, 1:10)[,1]-expand.grid(1:10, 1:10)[,2]) %in% c(1,3), "red", "darkgrey"), pch=15, cex=5)
#---------- TBE analysis using plot and plot based heatmap
hit3$TBE <- (hit3$H - hit3$X2B - hit3$X3B - hit3$HR ) + (hit3$X2B *2 ) + (hit3$X3B *3 ) + (hit3$HR *4) + hit3$BB
hit3$AVG_TBE <- hit3$TBE/hit3$AB
require(party)
hit3$AVG_BB <- hit3$BB / hit3$AB
t1 <- ctree(AVG_TBE ~ AB + OBP + HR + AVG + SLG + AVG_BB + Pos + Team , data=hit3)
plot(t1)
# SLG and AVG_BB are keys
require(randomForest)
fit <- randomForest(AVG_TBE ~ AB + OBP + HR + AVG + SLG + AVG_BB + Pos + Team,
data=hit3,
importance=TRUE,
ntree=10000)
plot(fit)
varImpPlot(fit)
# SLG and OBP are keys?
# scatter plot itself -------
plot(hit3$SLG, hit3$AVG_BB, pch=20, col=ifelse(hit3$AVG_TBE>=0.5,"darkgrey", "lightgrey"),
main="TBE tiers by key factors",
xlab="Slugging", ylab="BB/AB")
hit3TBEh <- hit3[hit3$AVG_TBE>=0.65,]
points(hit3TBEh$SLG, hit3TBEh$AVG_BB, pch=20, col="black")
text(hit3TBEh$SLG, hit3TBEh$AVG_BB, ifelse(hit3TBEh$AVG_TBE>0.67,hit3TBEh$Player,""), col="blue", pos=4, cex=0.6)
text(hit3TBEh$SLG, hit3TBEh$AVG_BB, ifelse(hit3TBEh$AVG_TBE>0.71,hit3TBEh$Player,""), col="darkblue", pos=4, cex=0.6)
hit3TBEk <- hit3[hit3$KPlayer==1,]
text(hit3TBEk$SLG, hit3TBEk$AVG_BB, hit3TBEk$Player, col="red", pos=4, cex=0.6)
mtext("Koreans are in the middle [120+AB - 20160705]", 3, line=.7, cex=0.8, col="darkgrey")
text(0.25, 0.21, "* Note: Colors represent TBE(Total Bases Earned) tiers", cex=0.6)
plot(hit3$SLG, hit3$OBP, pch=20, col=ifelse(hit3$AVG_TBE>=0.5,"darkgrey", "lightgrey"),
main="TBE tiers by key factors",
xlab="Slugging", ylab="OBP")
hit3TBEh <- hit3[hit3$AVG_TBE>=0.65,]
points(hit3TBEh$SLG, hit3TBEh$OBP, pch=20, col="black")
text(hit3TBEh$SLG, hit3TBEh$OBP, ifelse(hit3TBEh$AVG_TBE>0.67,hit3TBEh$Player,""), col="blue", pos=4, cex=0.6)
text(hit3TBEh$SLG, hit3TBEh$OBP, ifelse(hit3TBEh$AVG_TBE>0.71,hit3TBEh$Player,""), col="darkblue", pos=4, cex=0.6)
hit3TBEk <- hit3[hit3$KPlayer==1,]
text(hit3TBEk$SLG, hit3TBEk$OBP, hit3TBEk$Player, col="red", pos=4, cex=0.6)
mtext("Koreans are in the middle [120+AB - 20160705]", 3, line=.7, cex=0.8, col="darkgrey")
text(0.25, 0.21, "* Note: Colors represent TBE(Total Bases Earned) tiers", cex=0.6)
hit3TBEh[hit3TBEh$AVG_TBE>0.71, ]
#--- profile comparison --------
#
hitKimHarp <- hit3[hit3$Player %in% c("Kim_H","Harper_B"),]
cols <- c("OBP","AVG","SLG","AVG_BB")
hitKim_H <- head(hitKimHarp[,cols],1)
hitHarper_B <- tail(hitKimHarp[,cols],1)
barplot( as.numeric(hitKim_H/hitHarper_B), names.arg=cols,
col=ifelse(as.numeric(hitKim_H/hitHarper_B)<=0.7, "red","grey"),
main="Kim_H compared to Harper_B" )
plot(hit3$AVG, hit3$AVG_BB, pch=20, col=ifelse(hit3$OBP>=0.35,"darkgrey","lightgrey"))
hitKimHarp <- hit3[hit3$Player %in% c("Kim_H","Harper_B"),]
text(hitKimHarp$AVG, hitKimHarp$AVG_BB, c("Kim_H","Harper_B"), col="red", pos=4, cex=0.6)
plot(hit3$SO/hit3$AB, hit3$AVG_BB, pch=20, col=ifelse(hit3$OBP>=0.35,"darkgrey","lightgrey"), main="If Kim_H sees more balls?")
hitKimHarp <- hit3[hit3$Player %in% c("Kim_H","Harper_B"),]
text(hitKimHarp$SO/hitKimHarp$AB, hitKimHarp$AVG_BB, c("Kim_H","Harper_B"), col="red", pos=4, cex=0.6)
arrows(0.155, 0.13,0.205, 0.24, lty=2, lwd=1.5, col="red")
arrows(0.155, 0.13,0.22, 0.22, lty=2, lwd=1.5, col="orange")
plot(hit3$AVG, hit3$SLG-hit3$AVG, main="Kim_H, AVG Vs. ISOP")
rect(.33, par("usr")[3],par("usr")[2],par("usr")[4],col = "lightblue")
points(hit3$AVG, hit3$SLG-hit3$AVG)
mtext("ISOP: Isolated Power(a sabermetric baseball statistic) ; As of 20160704", 3, line=.7, cex=0.8, col="darkgrey")
points(hit3[hit3$Player=="Kim_H",]$AVG, hit3[hit3$Player=="Kim_H",]$SLG-hit3[hit3$Player=="Kim_H",]$AVG, pch=20, col="blue")
points(hit3[hit3$Player=="Suzuki_I",]$AVG, hit3[hit3$Player=="Suzuki_I",]$SLG-hit3[hit3$Player=="Suzuki_I",]$AVG, pch=20, col="red")
text(hit3[hit3$Player %in% c("Kim_H", "Suzuki_I"),]$AVG, hit3[hit3$Player %in% c("Kim_H", "Suzuki_I"),]$SLG-hit3[hit3$Player %in% c("Kim_H", "Suzuki_I"),]$AVG, hit3[hit3$Player %in% c("Kim_H", "Suzuki_I"),]$Player, pos=1, cex=0.6)
text(hit3[hit3$AVG>=0.33,]$AVG, hit3[hit3$AVG>=0.33,]$SLG-hit3[hit3$AVG>=0.33,]$AVG, hit3[hit3$AVG>=0.33,]$Player, pos=1, cex=0.6)
abline(v=0.338, col="darkgrey", lty=3)
# 그 위대한 Ichiro Suzuki, MLB Career AVG .314, OPS .762
# Hyun Soo Kim(a.k.a. Machine) 2016 AVG .328, OPS .856 (as of 20160809)
# Machine이라 불려도 손색없을 상황
#----------- END -------------
'R 데이터 분석' 카테고리의 다른 글
LGUP_BDAS _ 20161006 (0) | 2016.10.06 |
---|---|
heatmap using scatterplot (0) | 2016.08.22 |
[SCW.VEDAR] Part 1 (0) | 2016.08.12 |
[R분석] cluster based anomaly detection (0) | 2016.08.11 |
[SCWHO] 시각적 데이터 분석 EDA 예제 MLB Hitting 2016mid (0) | 2016.07.05 |