# >RE::VISION CRM
#
# R 데이터 분석 (Data analysis with R)
#
# [SCW.VEDAR] Part 1
#
# YONG_X 2016. 8. 12. 11:12

# HANDS-ON TUTORIAL: VISUAL AND EXPLORATORY DATA ANALYSIS USING R

# YONG XUNE XON | >RE::VISION CONSULTING

# 전용준 | 리비젼컨설팅 대표




# WHAT IS VISUAL EXPLORATORY DATA ANALYSIS?


# 목적: [1] 데이터 자체의 구조와 의미에 대한 이해 [2] 대상 도메인의 전반적 특성 또는 숨겨진 패턴 이해

#       [3] 이미 알려진 사실을 보다 이해하기 쉽게 표현  [4] 미학적 표현(c.f. InfoGraphic) 


# 방법: 정해진 방법 없음. 다양한 챠트, 그래프, 맵을 사용. 반복적 refinement.



# THIS SESSION: MLB.COM 의 HITTING STAT을 예제로 사용하여 시각적, 탐색적 분석 체험


# R FOR VISUAL DATA ANALYSIS - 데이터 처리와 시각화를 위한 기본, 고급 다양한 기능제공





#========= [ 1 ] READING DATA  ============================

# bring in the data file directly from blog posting attachment

# data as of 20160705

# Load the MLB hitting-stat table (snapshot of 2016-07-05) straight from the
# blog attachment. stringsAsFactors is pinned to FALSE so text columns are
# read as character on every R version (the default was TRUE before R 4.0).
hit <- read.csv(
  "https://t1.daumcdn.net/cfile/blog/230D904A577DCECA34?download",
  stringsAsFactors = FALSE
)

# Newer snapshot (2016-08-18); use this call instead to analyse it.
# hit <- read.csv("https://t1.daumcdn.net/cfile/blog/2575FF4C57B55F5205?download")

# Remove spaces from player names; fixed = TRUE matches a literal space
# instead of compiling a regular expression.
hit$Player <- gsub(" ", "", hit$Player, fixed = TRUE)

# Quick look at the available columns and the first rows.
colnames(hit)
head(hit)




# data definitions

# Switch that (re)builds all working subsets in one go.
hits <- TRUE

if (isTRUE(hits)) {
  # Players with at least 50 at-bats.
  hit1 <- hit[hit$AB >= 50, ]

  # Flag the Korean players: KPlayer is 1 for the listed names, 0 otherwise.
  cKPlayer <- c("Kim_H", "Lee_D", "Choo_S", "Kang_J", "Park_B", "Choi_J")
  hit1$KPlayer <- 0
  hit1[hit1$Player %in% cKPlayer, "KPlayer"] <- 1

  # Stricter at-bat cut-offs, both derived from hit1 so they carry KPlayer.
  hit2 <- hit1[hit1$AB >= 80, ]
  hit3 <- hit1[hit1$AB >= 120, ]
}

# Number of rows and columns in the full table.
dim(hit)




#========= [ 2 ] BASIC PLOT  ============================


# Basic plotting walkthrough on the full table (hit) and the 50+ AB subset
# (hit1). Each statement is meant to be run interactively; bare expressions
# print their value to the console.

# A single vector is plotted against its index.
plot(hit$AVG)

# Count of AVG entries selected by AVG > 0.
length(hit$AVG[hit$AVG>0])

# get basic stat
range(hit$AB)

max(hit$AB)

# sorted plot
plot(sort(hit$HR))

# add main title
plot(sort(hit$HR), main="HR distribution - sorted(as of 0704)")

# scatter plot
plot(hit$AB, hit$HR)

# scaling axis: strikeouts and home runs expressed per at-bat
plot(hit$SO/hit$AB, hit$HR/hit$AB)

# Number of rows selected by AB == 0 (these make the per-AB ratios undefined).
nrow(hit[hit$AB==0,])

# Keep only players with at least 50 at-bats.
# NOTE: this rebuilds hit1 without the KPlayer column; it is added again
# further down in the script.
hit1 <- hit[hit$AB>=50,]

plot(hit1$SO/hit1$AB, hit1$HR/hit1$AB, xlab="AVG_SO", ylab="AVG_HR")

# check correlation
cor(hit1$SO/hit1$AB, hit1$HR/hit1$AB)

# Doubles per AB versus home runs per AB.
plot(hit1$X2B/hit1$AB, hit1$HR/hit1$AB)

cor(hit1$X2B/hit1$AB, hit1$HR/hit1$AB)

# ........................

# Walks per AB versus home runs per AB.
plot(hit1$BB/hit1$AB, hit1$HR/hit1$AB)

cor(hit1$BB/hit1$AB, hit1$HR/hit1$AB)

# Author's observation: a higher HR rate goes with a higher walk rate,
# but the correlation is weak.

plot(hit1$BB/hit1$AB, hit1$AVG)

cor(hit1$BB/hit1$AB, hit1$AVG)

# Author's observation: batting average and walk rate are practically
# unrelated.

plot(hit1$AVG)

plot(sort(hit1$AVG))

# Author's note: (the two plots look alike) because the data was already
# sorted.
mean(hit1$AVG)

median(hit1$AVG)

plot(hit1$HR)

plot(sort(hit1$HR))

median(hit1$HR)

# OPS == OBP+SLG
plot(hit1$OBP, hit1$SLG)

cor(hit1$OBP, hit1$SLG)

# jitter when there are overlaps: first raw, then jitter on y only,
# then jitter on both axes
plot(hit1$X2B, hit1$HR)

plot(hit1$X2B, jitter(hit1$HR))

plot(jitter(hit1$X2B), jitter(hit1$HR))




#-------------


# Tag the Korean players: KPlayer starts at 0 for everybody and is set to 1
# for the names listed in cKPlayer.
cKPlayer <- c("Kim_H", "Lee_D", "Choo_S", "Kang_J", "Park_B", "Choi_J")
hit1$KPlayer <- 0
hit1$KPlayer[hit1$Player %in% cKPlayer] <- 1

# OBP vs SLG, with the Korean players overlaid as filled red dots.
plot(hit1$OBP, hit1$SLG)
with(hit1[hit1$KPlayer == 1, ], points(OBP, SLG, col = "red", pch = 20))

# Print the rows of the Korean players.
hit1[hit1$KPlayer == 1, ]
# Author's note: Choi_J is not in the list.

# Colour (and then also symbol) picked per point from the 0/1 KPlayer flag:
# index 1 is used for flag 0, index 2 for flag 1.
plot(hit1$OBP, hit1$SLG, col = c("grey", "red")[hit1$KPlayer + 1], pch = 20)

plot(
  hit1$OBP, hit1$SLG,
  col = c("black", "red")[hit1$KPlayer + 1],
  pch = c(1, 20)[hit1$KPlayer + 1]
)

# Players with at least 80 at-bats.
hit2 <- hit1[hit1$AB >= 80, ]

# Break down OPS into its two components.
plot(hit2$OBP, hit2$SLG, main = "OBP Vs. SLG - Koreans [80+ ABs - 20160704]")

# Layered colouring: later layers are drawn on top of earlier ones, so
# OPS >= 0.9 (black) overrides OPS >= 0.8 (grey).
with(hit2[hit2$OPS >= 0.8, ], points(OBP, SLG, col = "grey", pch = 20))
with(hit2[hit2$OPS >= 0.9, ], points(OBP, SLG, col = "black", pch = 20))

# Mark the Korean players in red and label them (names below the point).
with(hit2[hit2$KPlayer == 1, ], {
  points(OBP, SLG, col = "red", pch = 20)
  text(OBP, SLG, labels = Player, pos = 1)
})

# Legend with adjusted vertical spacing (one yjust value per entry).
legend(
  x = 0.21, y = 0.7,
  legend = c("OPS<0.8", "OPS>=0.8", "OPS>=0.9", "Korean"),
  pch = c(1, 20, 20, 20),
  col = c("black", "grey", "black", "red"),
  cex = 0.75,
  adj = 0,
  text.width = 0.04,
  bty = "n",
  yjust = c(1, 0.85, 0.7, 0.55)
)

# References: plotting symbols and colour codes
# http://www.endmemo.com/program/R/pchsymbols.php
# http://www.statmethods.net/advgraphs/parameters.html
# http://research.stowers-institute.org/efg/R/Color/Chart/index.htm

# Author's observations:
# - Kim Hyun-soo (Kim_H): OBP among the best in the league.
# - Lee Dae-ho (Lee_D) and Kang Jung-ho (Kang_J): high slugging.
# - Judged by OPS they sit a tier below the very top.




#========= [ 3 ] PLOTTING PRACTICE  ============================


# Plotting practice on the 120+ AB subset. Reference values for Kang_J are
# looked up from the data instead of being pasted in as constants, so the
# section stays correct when a different snapshot is loaded.

# Players with at least 120 at-bats.
hit3 <- hit1[hit1$AB >= 120, ]

# OBP vs SLG with the Korean players marked in red and labelled by name.
plot(hit3$OBP, hit3$SLG, main = "OBP Vs. SLG - Koreans")
points(hit3[hit3$KPlayer == 1, ]$OBP, hit3[hit3$KPlayer == 1, ]$SLG,
       col = "red", pch = 20)
text(hit3[hit3$KPlayer == 1, ]$OBP, hit3[hit3$KPlayer == 1, ]$SLG,
     labels = hit3[hit3$KPlayer == 1, ]$Player, pos = 1)

plot(sort(hit3$SLG), main = "SLG of Kang_J and Lee_D")

# Check Kang_J's slugging percentage (printed), then reuse it below.
# %in% never yields NA, so a missing Player value cannot leak into the result.
kang_slg <- hit3$SLG[hit3$Player %in% "Kang_J"]
kang_slg

# Horizontal guide line at Kang_J's SLG. abline(a = y, b = 0) is an
# equivalent spelling, but h = simply draws nothing when the player is
# absent instead of raising an error.
abline(h = kang_slg, col = "red")

# add guide line
plot(sort(hit3$SLG), main = "SLG of Kang_J and Lee_D")
abline(h = kang_slg, col = "red")

# Top n players by SLG
plot(head(hit3[order(-hit3$SLG), ], 100)$SLG, main = "SLG of Kang_J")
abline(h = kang_slg, col = "red")
# Author's observation: about mid-level among the top 100.

# Top 20 as vertical bars (type = "h").
plot(head(hit3[order(-hit3$SLG), ], 20)$SLG, main = "SLG Top 20", type = "h")

# Points first, then the vertical lines added on top.
plot(head(hit3[order(-hit3$SLG), ], 20)$SLG,
     main = "SLG Top 20 - Point and H line")
lines(head(hit3[order(-hit3$SLG), ], 20)$SLG, type = "h")

# Same again with filled points.
plot(head(hit3[order(-hit3$SLG), ], 20)$SLG,
     main = "SLG Top 20 - Point and H line", pch = 20)
lines(head(hit3[order(-hit3$SLG), ], 20)$SLG, type = "h")

# Players at or above the SLG of Lee_D / Kang_J (thresholds from the notes).
plot(sort(hit3$SLG[hit3$SLG >= 0.515])) # Lee_D
plot(sort(hit3$SLG[hit3$SLG >= 0.520])) # Kang_J

# Count of players at SLG >= 0.520 and their share of the 120+ AB group.
n_slg_520 <- length(hit3$SLG[hit3$SLG >= 0.520])
n_slg_520
n_slg_520 / nrow(hit3)
nrow(hit3)

plot(sort(hit3$AVG[hit3$AVG >= 0.338], decreasing = TRUE))
# Author's observation: Kim_H is short of the qualifying at-bats, but among
# the 306 players with 120+ AB his AVG ranks around 7th in the league.

# Home runs per at-bat, comparable in scale to batting average.
hit2$AVG_HR <- hit2$HR / hit2$AB

# Kang_J's HR/AB, used for the guide line and the titles.
kang_hr_ab <- hit2$AVG_HR[hit2$Player %in% "Kang_J"]
kang_hr_label <- if (length(kang_hr_ab) == 1L) {
  sprintf("%.3f", kang_hr_ab)
} else {
  "NA"
}

plot(sort(hit2$AVG_HR),
     main = paste0("KANG_J ", kang_hr_label, " HR/AB [80+AB - 20160705]"))
abline(h = kang_hr_ab, col = "red")

#--- add subtitle ---------------
# Same plot without a main title; title and subtitle are placed with mtext()
# on side 3 (top) at different line offsets and sizes.
plot(sort(hit2$AVG_HR), main = "")
abline(h = kang_hr_ab, col = "red")
mtext(paste0("KANG_J's HR/AB = ", kang_hr_label, " [80+AB - 20160705]"),
      3, line = .8)
mtext("HR/AB", 3, line = 2, cex = 1.5)




#========= [ 4 ] AGGREGATION  ============================


# aggregation by team


# Per-team totals: hits, at-bats and home runs are each summed by team and
# given readable column names.
teamhit <- setNames(
  aggregate(hit$H, by = list(hit$Team), FUN = sum),
  c("Team", "TeamH")
)
teamAB <- setNames(
  aggregate(hit$AB, by = list(hit$Team), FUN = sum),
  c("Team", "TeamAB")
)
teamHR <- setNames(
  aggregate(hit$HR, by = list(hit$Team), FUN = sum),
  c("Team", "TeamHR")
)

# Left-join the three summaries on Team, in the order H, AB, HR.
teamhitAB <- Reduce(
  function(lhs, rhs) merge(lhs, rhs, by = "Team", all.x = TRUE),
  list(teamhit, teamAB, teamHR)
)

# Author's note: these sums may differ from the official team totals when
# some players are missing from the data.

plot(teamhitAB$TeamH, teamhitAB$TeamAB)

# Games per team, taken as the maximum G of any player on the team.
teamGames <- setNames(
  aggregate(hit$G, by = list(hit$Team), FUN = max),
  c("Team", "TeamG")
)
teamhitAB <- merge(teamhitAB, teamGames, by = "Team", all.x = TRUE)

# Team batting average: total hits over total at-bats.
teamhitAB$AVG_H <- with(teamhitAB, TeamH / TeamAB)

# Leading teams by batting average, then by home runs.
head(teamhitAB[order(-teamhitAB$AVG_H), ])
head(teamhitAB[order(-teamhitAB$TeamHR), ])

#---------
# Basic barplot: home runs of the Korean players (80+ AB), first without and
# then with player names under the bars.
with(hit2[hit2$KPlayer == 1, ], barplot(HR))
with(hit2[hit2$KPlayer == 1, ], barplot(HR, names.arg = Player))




#========= [ 5 ] ADVANCED PLOTTING  ============================



# ggplot2 like plotting with base plotting function

# https://flowingdata.com/2016/03/22/comparing-ggplot2-and-r-base-graphics/


# HR per Games - HR Leader Teams 

# Home runs per game for the leading home-run teams (head() keeps the first
# six rows after sorting by TeamHR, descending).
teamhitAB1 <- head(teamhitAB[order(-teamhitAB$TeamHR), ])
teamhitAB1$THRpG <- with(teamhitAB1, TeamHR / TeamG)

# Horizontal axis labels for the plots that follow.
par(las = 1)

barplot(
  teamhitAB1$THRpG,
  main = "Team HR per Game - Leading Teams",
  names.arg = teamhitAB1$Team,
  border = FALSE,
  col = "#AFC0CB"
)

# Number of players per position (80+ AB subset).
players_by_Pos <- table(hit2$Pos)
barplot(players_by_Pos)

# Change the bar colour.
barplot(players_by_Pos, col = "lightblue")

# Highlight the outfield positions: the bar colour is chosen from the
# position labels of the table.
barplot(
  players_by_Pos,
  col = ifelse(
    names(players_by_Pos) %in% c("CF", "LF", "RF"),
    "orange",
    "lightblue"
  ),
  main = "Number of OFs"
)

# Inspect the table object and how its labels are stored.
str(players_by_Pos)
str(dimnames(players_by_Pos))
str(unlist(dimnames(players_by_Pos)))



# Tag every player as infielder ("IF") or outfielder ("OF"). Any position
# not listed here (e.g. DH) ends up in "OF".
hit$IO <- ifelse(hit$Pos %in% c("P", "1B", "2B", "3B", "SS", "C"), "IF", "OF")

# Home runs summed by team and by infield/outfield tag.
teamHR1 <- aggregate(hit$HR, by = list(hit$Team, hit$IO), FUN = sum)
names(teamHR1) <- c("Team", "InOut", "TeamHR")

# Three teams singled out by the author as the home-run leaders.
top_teams <- c("BAL", "SEA", "TOR")
teamHR2 <- teamHR1[teamHR1$Team %in% top_teams, ]

# barplot :: HR totals by infield/outfield for the three leading teams.
# The matrix is filled by (InOut, Team) name lookup rather than by position,
# so it does not depend on the row order of aggregate() and a missing
# combination stays 0 instead of shifting the other values.
TeamHRIO1mat <- matrix(
  0,
  nrow = 2,
  ncol = length(top_teams),
  dimnames = list(c("IF", "OF"), top_teams)
)
TeamHRIO1mat[cbind(
  as.character(teamHR2$InOut),
  as.character(teamHR2$Team)
)] <- teamHR2$TeamHR
TeamHRIO1mat

mf_col <- c("#3CC3BD", "#FD8210")
barplot(TeamHRIO1mat, beside = TRUE, border = NA, col = mf_col,
        ylim = c(0, 100),
        main = "Infielder Vs. Outfielder HRs - leading Teams")
legend("topleft", row.names(TeamHRIO1mat), pch = 15, col = mf_col,
       yjust = c(1, 0.8),
       ncol = 2, cex = 0.75)

# grid(NA, NULL, lwd=1, lty=1, col="#ffffff")

# Horizontal grid lines only (nx = NA suppresses the vertical ones), drawn
# twice with different line styles.
grid(NA, NULL, lwd = 1, lty = 2, col = "grey")
grid(NA, NULL, lwd = 2, lty = 3, col = 500)

# Author's observations:
# - Lee Dae-ho seems to reinforce SEA's "home-run-centred infield" identity.
# - Whether DH and P are counted as IF also affects the picture.



#----------


# 내외야 구분별 홈런과 출루율 분포

# 외야수의 출루율이 대체로 높음


# 한 화면에 복수의 plot 추가

# Several plots on one screen: one panel per infield/outfield tag.
par(mfrow = c(1, 2))
IOs <- unique(hit$IO)

# Loop over the tags; seq_along() is safe even if IOs is empty.
for (i in seq_along(IOs)) {
  currdata <- hit[hit$IO == IOs[i], ]
  plot(jitter(currdata$OBP), jitter(currdata$HR),
       main = IOs[i], ylim = c(0, max(currdata$HR) * 1.2))
  # Vertical lines: median (blue) and mean (red) OBP.
  abline(v = median(currdata$OBP, na.rm = TRUE), col = "blue")
  abline(v = mean(currdata$OBP, na.rm = TRUE), col = "red")
  # Horizontal dashed line: median HR.
  abline(h = median(currdata$HR, na.rm = TRUE), col = "grey", lty = 2)
}

# v is for vertical line position

# AVG and AB by position on a 3 x 4 grid of panels. The grid cell (i, j)
# maps to position index (i * 4) - 4 + j; cells beyond the number of
# positions are skipped before any data is subset.
par(mfrow = c(3, 4))
Poss <- sort(unique(hit$Pos))

for (i in 1:3) {
  for (j in 1:4) {
    idx <- (i * 4) - 4 + j

    # Progress trace: row, column, position index and position label.
    print(i)
    print(j)
    print(idx)
    print(Poss[idx])

    if (idx > length(Poss)) {
      next
    }

    currdata <- hit[hit$Pos == Poss[idx], ]
    plot(jitter(currdata$AVG), jitter(currdata$AB),
         main = Poss[idx], ylim = c(0, 400), xlim = c(0, 1), las = 1)
    abline(v = median(currdata$AVG, na.rm = TRUE), col = "blue")
  }
}

# Same grid for HR and AB by position.
par(mfrow = c(3, 4))
Poss <- sort(unique(hit$Pos))

for (i in 1:3) {
  for (j in 1:4) {
    idx <- (i * 4) - 4 + j

    print(i)
    print(j)
    print(idx)
    print(Poss[idx])

    if (idx > length(Poss)) {
      next
    }

    currdata <- hit[hit$Pos == Poss[idx], ]
    plot(jitter(currdata$HR), jitter(currdata$AB),
         main = Poss[idx], ylim = c(0, 400), xlim = c(0, 30), las = 1)
    abline(v = median(currdata$HR, na.rm = TRUE), col = "blue")
  }
}

# Back to a single panel per screen.
par(mfrow = c(1, 1))


# Error in plot.new() : figure margins too large [ ??? ]

# RStudio issue ... enlarge chart window




#---------


# Players with at least 120 at-bats, tagged in three groups:
# "PD" for P and DH, "IF" for 1B/2B/3B/SS/C, "OF" for everything else.
hit3 <- hit1[hit1$AB >= 120, ]
hit3$IO <- ifelse(
  hit3$Pos %in% c("P", "DH"),
  "PD",
  ifelse(hit3$Pos %in% c("1B", "2B", "3B", "SS", "C"), "IF", "OF")
)

# Group sizes.
plot(table(hit3$IO))

# AVG vs HR for everyone; infielders and outfielders are then overlaid.
plot(hit3$AVG, hit3$HR, main = "AVG Vs. HR by In-Out Fielder (120+AB)")

hit5 <- hit3[hit3$IO == "IF", ]
hit51 <- hit3[hit3$IO == "OF", ]

# Infielders: grey dots plus a linear regression line of HR on AVG.
with(hit5, points(AVG, HR, col = "grey", pch = 20))
abline(lm(HR ~ AVG, data = hit5), col = "grey")

# Outfielders: blue dots plus a dashed regression line.
with(hit51, points(AVG, HR, col = "blue", pch = 20))
abline(lm(HR ~ AVG, data = hit51), lty = 2, col = "blue")

# Dotted median cross-hairs, infielders first and outfielders second.
with(hit5, {
  abline(v = median(AVG, na.rm = TRUE), col = "grey", lty = 3)
  abline(h = median(HR, na.rm = TRUE), col = "grey", lty = 3)
})
with(hit51, {
  abline(v = median(AVG, na.rm = TRUE), col = "blue", lty = 3)
  abline(h = median(HR, na.rm = TRUE), col = "blue", lty = 3)
})

# Legend; the unfilled black symbol stands for the remaining P & DH points.
legend(
  x = 0.15, y = 24,
  legend = c("IF", "OF", "P&DH"),
  pch = c(20, 20, 1),
  col = c("grey", "blue", "black"),
  cex = 0.75,
  adj = 0,
  text.width = 0.04,
  bty = "n",
  yjust = c(1, 0.85, 0.7)
)





#========= [ 6 ] CLUSTER PROFILING (SKIP) ============================



# ggplot2 is needed for the first set of charts; library() stops with a
# clear error if the package is missing, whereas require() only returns
# FALSE and lets the script fail later.
library(ggplot2)

# k-means starts from random centres; fixing the seed makes the cluster
# assignment (and therefore the colours) reproducible between runs.
set.seed(1)

# Three clusters on seven hitting statistics (120+ AB subset).
k1 <- kmeans(hit3[, c("AVG", "OBP", "SLG", "HR", "SO", "BB", "AB")], 3)
k1$cluster <- as.factor(k1$cluster)

# Scatter plots of several stat pairs, coloured by cluster.
ggplot(hit3, aes(AVG, HR, color = k1$cluster)) + geom_point()
ggplot(hit3, aes(OBP, SLG, color = k1$cluster)) + geom_point()
ggplot(hit3, aes(SO, BB, color = k1$cluster)) + geom_point()
ggplot(hit3, aes(SO, HR, color = k1$cluster)) + geom_point()
ggplot(hit3, aes(BB, HR, color = k1$cluster)) + geom_point()

# With base plot and a new clustering: five clusters on OBP and SLG only.
k1 <- kmeans(hit3[, c("OBP", "SLG")], 5)
k1$cluster <- as.factor(k1$cluster)

# One colour per cluster; indexing by the factor uses its integer codes.
cols <- c("red", "orange", "green", "skyblue", "blue")

plot(hit3$AVG, hit3$SO, col = cols[k1$cluster], pch = 20)
plot(hit3$AVG, hit3$OPS, col = cols[k1$cluster], pch = 20,
     main = "OPS Clusters")
plot(hit3$AVG, hit3$OPS / hit3$AVG, col = cols[k1$cluster], pch = 20,
     main = "OPS Clusters")
plot(hit3$OBP, hit3$SLG, col = cols[k1$cluster], pch = 20,
     main = "OPS Clusters")

# Circle and label Kim_H on the last plot.
with(hit3[hit3$Player == "Kim_H", ], {
  points(OBP, SLG, cex = 2)
  text(OBP, SLG, labels = Player, pos = 1)
})

# Label all Korean players.
with(hit3[hit3$KPlayer == 1, ], text(OBP, SLG, labels = Player, pos = 1))




#========= [ 7 ] APPLICATIONS  ============================


#

# 특정 이슈에 대한 집중 분석

# 왜 Kim_H가 대접받지 못하는가

# LF Position,, BAL and Kim_H




# Focused question: left fielders with 80+ AB (hit6) versus Kim_H (hit7).
hit6 <- hit[hit$Pos == "LF" & hit$AB >= 80, ]
hit7 <- hit[hit$Player == "Kim_H", ]

# Number of such left fielders, and number of teams for comparison.
nrow(hit6)
length(unique(hit$Team))

# AVG vs HR among the left fielders; Kim_H in red.
plot(hit6$AVG, hit6$HR, main = "LFs with AB 80+")
with(hit7, points(AVG, HR, col = "red", pch = 20))

# AB vs HR; the green point scales Kim_H's AB and HR by 1.7.
plot(hit6$AB, hit6$HR)
with(hit7, {
  points(AB, HR, col = "red", pch = 20)
  points(AB * 1.7, HR * 1.7, col = "green", pch = 20)
})

# Author's observations:
# - For a left fielder, Kim_H has very few HRs relative to his at-bats.
# - Even with 1.7 times the at-bats his HR total would stay in the lower tier.

# AB vs AVG; the green point assumes twice the AB and 90% of the AVG.
plot(hit6$AB, hit6$AVG)
with(hit7, {
  points(AB, AVG, col = "red", pch = 20)
  points(AB * 2, AVG * 0.9, col = "green", pch = 20)
})

# Author's observations:
# - Even if AB doubled and AVG dropped 10% to about .300, that would still
#   be top class across MLB.
# - With 80+ AB, more at-bats do not lower AVG; steadier playing time may
#   even raise it.

# AB vs OBP with the same what-if scaling.
plot(hit6$AB, hit6$OBP)
with(hit7, {
  points(AB, OBP, col = "red", pch = 20)
  points(AB * 2, OBP * 0.9, col = "green", pch = 20)
})

# Author's observation: OBP is so far ahead that it would likely remain at
# the top level with more at-bats.

# AB vs SLG; the green point doubles AB at unchanged SLG, joined to the
# actual point by a dashed grey line.
plot(hit6$AB, hit6$SLG, main = "if 2x AB?")
with(hit7, {
  points(AB, SLG, col = "red", pch = 20)
  points(AB * 2, SLG, col = "green", pch = 20)
  lines(c(AB, AB * 2), rep(SLG, 2), lty = 2, col = "grey")
})

# Author's observations:
# - Many extra-base hits other than HRs, so SLG is not particularly low.
# - A typical gap hitter.
# - After more at-bats SLG would also be upper tier,
#   provided there is no weakness against left-handed pitchers (the reason
#   for the platoon usage).
# - The issue is the mismatch with the team's HR-centred offence.


# TBE (Total Bases Earned): bases gained, weighted by how the batter reached
# base. Singles count 1, doubles 2, triples 3, home runs 4, plus walks.
hit6$TBE <- with(
  hit6,
  (H - X2B - X3B - HR) + (X2B * 2) + (X3B * 3) + (HR * 4) + BB
)

# Re-extract Kim_H from hit6 so that hit7 carries the TBE column.
hit7 <- hit6[hit6$Player == "Kim_H", ]

# Sorted TBE of the left fielders; red line at 1.7 times Kim_H's TBE.
plot(sort(hit6$TBE))
with(hit7, abline(a = TBE * 1.7, b = 0, col = "red"))

# Author's observations:
# - Even with more at-bats, the lack of HRs makes a very top TBE unlikely
#   (rank 29).
# - OBP is high, but in terms of contribution to the team it is not top level.
# - In MLB, HRs matter even for a lead-off hitter.
# - If not HRs, then at least more doubles are needed.

# Number of left fielders at or above 1.7 times Kim_H's TBE.
nrow(hit6[hit6$TBE >= hit7$TBE * 1.7, ])
# Author's observation: still around 14th (upper tier).

# Players with the highest TBE. Note: hit6 holds left fielders with 80+ AB
# only, not all positions.
head(hit6[order(-hit6$TBE), ])

# TBE per at-bat, with Kim_H's value as a labelled red line.
plot(sort(hit6$TBE / hit6$AB),
     main = "AVG TBE(Total Bases Earned) of LFs and Kim_H")
with(hit7, {
  abline(a = TBE / AB, b = 0, col = "red")
  text(5, TBE / AB, labels = Player, pos = 3, col = "blue")
})

# Author's observations:
# - He reaches base often, so he gains many bases: about 5th among all LFs.
# - Considering there are 30 teams... ?
# - But the lack of extra-base power keeps him off the very top.

# HR per at-bat.
plot(sort(hit6$HR / hit6$AB))
with(hit7, abline(a = HR / AB, b = 0, col = "red"))
# Author's observation: lower tier in HR per at-bat.

# Doubles per at-bat.
plot(sort(hit6$X2B / hit6$AB))
with(hit7, abline(a = X2B / AB, b = 0, col = "red"))
# Author's observation: upper tier in doubles per at-bat.

# Doubles vs HR; the green point scales both by 1.7, joined by a dashed line.
plot(hit6$X2B, hit6$HR, main = "Kim_H's HR Vs. 2XB - Amng LFs")
with(hit7, {
  points(X2B, HR, col = "red", pch = 20)
  points(X2B * 1.7, HR * 1.7, col = "green", pch = 20)
  lines(c(X2B, X2B * 1.7), c(HR, HR * 1.7), lty = 2, col = "grey")
})
# Author's observation: at 1.7 times the playing time doubles would be upper
# tier, but HRs would clearly remain very short - bottom level.

# The same comparison as rates per at-bat.
plot(hit6$X2B / hit6$AB, hit6$HR / hit6$AB,
     main = "Kim_H's HR Vs. 2XB - Amng LFs")
with(hit7, points(X2B / AB, HR / AB, col = "red", pch = 20))
# Author's observation: HR rate and doubles rate are largely unrelated.

# AVG vs doubles per at-bat.
plot(hit6$AVG, hit6$X2B / hit6$AB, main = "Is Kim_H a Singler?")
with(hit7, points(AVG, X2B / AB, col = "red", pch = 20))
# Author's observation: relative to his very high AVG, the doubles rate does
# not look high.

# AVG vs doubles rate divided by AVG.
plot(hit6$AVG, (hit6$X2B / hit6$AB) / hit6$AVG, main = "Is Kim_H a Singler?")
with(hit7, points(AVG, (X2B / AB) / AVG, col = "red", pch = 20))

# Share of left fielders whose doubles-rate-to-AVG ratio exceeds Kim_H's.
nrow(
  hit6[(hit6$X2B / hit6$AB) / hit6$AVG > (hit7$X2B / hit7$AB) / hit7$AVG, ]
) / nrow(hit6)
# Author's observation: top 43% in doubles rate relative to AVG; the high AVG
# actually works against his image.


# AB vs AVG for all hitters with a linear trend line (blue, dashed);
# Kim_H is marked in red.
plot(hit$AB, hit$AVG, main = "Kim_H's AVG will fall if more games?")
abline(lm(AVG ~ AB, data = hit), col = "blue", lty = 2, lwd = 2)
with(hit7, points(AB, AVG, col = "red", pch = 20))

# Author's observations:
# - More playing time would, if anything, raise the average.
# - The overall AB-AVG relationship suggests AVG would not fall.
# - Caveat: third variables such as injuries or slumps?
# - Analyse how each player's numbers change over the season?
# - Risk of over-interpretation or over-generalisation?

# Same chart with a second trend line fitted on the 120+ AB subset (green).
plot(hit$AB, hit$AVG, main = "Kim_H's AVG will fall if more games?")
abline(lm(AVG ~ AB, data = hit), col = "blue", lty = 2, lwd = 2)
abline(lm(AVG ~ AB, data = hit3), col = "green", lty = 2, lwd = 2)
with(hit7, points(AB, AVG, col = "red", pch = 20))






#-----------------------------

# add quantile grid


# Sorted AVG of the 120+ AB group with guide lines at the 95th, 75th and
# 25th percentiles, labelled as "top x%" tiers.
plot(sort(hit3$AVG), main = "Tiers of AVG - 120+ AB")

local({
  tiers <- quantile(hit3$AVG, probs = c(0.95, 0.75, 0.25), na.rm = TRUE)

  abline(a = tiers[1], b = 0)
  text(10, tiers[1], labels = "5%", pos = 3)

  abline(a = tiers[2], b = 0, lty = 2)
  text(10, tiers[2], labels = "25%", pos = 3)

  abline(a = tiers[3], b = 0, lty = 2, col = "darkgrey")
  text(10, tiers[3], labels = "75%", pos = 3)
})

#----------
# Strikeouts vs walks; hitters with AVG >= 0.3 highlighted in blue.
plot(hit3$SO, hit3$BB)
with(hit3[hit3$AVG >= 0.3, ], points(SO, BB, col = "blue", pch = 20))

# Walk-to-strikeout ratio against AVG.
plot(hit3$BB / hit3$SO, hit3$AVG)

# Strikeouts vs walks; hitters with 15 or more HR highlighted in blue.
plot(hit3$SO, hit3$BB)
with(hit3[hit3$HR >= 15, ], points(SO, BB, col = "blue", pch = 20))

# Walk-to-strikeout ratio against HR.
plot(hit3$BB / hit3$SO, hit3$HR)

#----------
# RBI per at-bat vs OBP; Kim_H in red.
plot(hit3$RBI / hit3$AB, hit3$OBP)
with(hit7, points(RBI / AB, OBP, col = "red", pch = 20))

# Dotted reference lines: 75th percentile of OBP (horizontal, red) and the
# 75th / 25th percentiles of RBI per at-bat (vertical, red / grey).
abline(a = quantile(hit3$OBP, probs = 0.75, na.rm = TRUE), b = 0,
       col = "red", lty = 3)
abline(v = quantile(hit3$RBI / hit3$AB, probs = 0.75, na.rm = TRUE),
       col = "red", lty = 3)
abline(v = quantile(hit3$RBI / hit3$AB, probs = 0.25, na.rm = TRUE),
       col = "grey", lty = 3)

# Author's observations:
# - Kim_H: despite the high OBP, very low run production (RBI) is a weakness;
#   probably related to his spot in the batting order as well.
# - Likely more about how the team uses its players than individual ability.


#--------[계속]------