[SCWHO] 시각적 데이터 분석 EDA 예제 MLB Hitting 2016mid

R 데이터 분석

[SCWHO] 시각적 데이터 분석 EDA 예제 MLB Hitting 2016mid

YONG_X 2016. 7. 5. 16:58

# 전용준 :: 리비젼컨설팅 :: 02-415-7650 :: revision.co.kr :: xyxonxyxon@empal.com

#==================================================

# setwd("E:/restore_Yong/GAMDFM/GT_s/")

# hit <- read.csv("mlb_hitt_20160705.csv")

# attachment: mlb_hitt_20160705.csv

# attachment: mlb_hitt_20160818.csv

# Read the hitting table straight from the blog post's attachment URL.
hit <- read.csv(
  "https://t1.daumcdn.net/cfile/blog/230D904A577DCECA34?download"
)

# Remove every space from the player names so they can be matched exactly.
hit[["Player"]] <- gsub(" ", "", hit[["Player"]])

# KPlayer is 1 for the five players listed below and 0 for everyone else.
cKPlayer <- c("Kim_H", "Lee_D", "Choo_S", "Kang_J", "Park_B")
hit[["KPlayer"]] <- ifelse(hit[["Player"]] %in% cKPlayer, 1, 0)

# .................

# First look at the loaded table: column types and the first rows.
str(hit)

head(hit)

# Scatter plots over all players (index plot of AVG, then pairs of columns).
plot(hit$AVG)

plot(hit$AB, hit$AVG)

plot(hit$AB, hit$HR)

plot(hit$SO, hit$HR)

# Same comparison expressed as per-at-bat rates.
plot(hit$SO/hit$AB, hit$HR/hit$AB)

# hit1: players with at least 50 at-bats, so the per-AB rates are less noisy.
hit1 <- hit[hit$AB>=50,]

plot(hit1$SO/hit1$AB, hit1$HR/hit1$AB)

cor(hit1$SO/hit1$AB, hit1$HR/hit1$AB)

# Doubles rate vs. HR rate, for all players and then for the 50+ AB subset.
plot(hit$X2B/hit$AB, hit$HR/hit$AB)

plot(hit1$X2B/hit1$AB, hit1$HR/hit1$AB)

cor(hit1$X2B/hit1$AB, hit1$HR/hit1$AB)

# AVG against HR count, HR rate and SO rate (50+ AB subset).
plot(hit1$AVG, hit1$HR)

plot(hit1$AVG, hit1$HR/hit1$AB)

cor(hit1$AVG, hit1$HR/hit1$AB)

plot(hit1$AVG, hit1$SO/hit1$AB)

cor(hit1$AVG, hit1$SO/hit1$AB)

# BB rate against HR rate.
plot(hit1$BB/hit1$AB, hit1$HR/hit1$AB)

cor(hit1$BB/hit1$AB, hit1$HR/hit1$AB)

# Author's observation: a higher HR rate goes with a higher walk (BB) rate.

# BB rate against AVG.
plot(hit1$BB/hit1$AB, hit1$AVG)

cor(hit1$BB/hit1$AB, hit1$AVG)

# Author's observation: AVG and the walk (BB) rate are unrelated.

# Distribution of AVG in the 50+ AB subset: index plot, mean and median.
plot(hit1$AVG)

mean(hit1$AVG)

median(hit1$AVG)

# HR counts: raw order, sorted, and the median.
plot(hit1$HR)

plot(sort(hit1$HR))

median(hit1$HR)

# HR per at-bat, sorted, and its median.
plot(sort(hit1$HR/hit1$AB))

median(sort(hit1$HR/hit1$AB))

# Median share of HR within the sum HR + X3B + X2B + H.
# NOTE(review): if H already counts doubles, triples and home runs, this sum
# counts extra-base hits twice -- confirm what the H column holds.
median(sort(hit1$HR/(hit1$HR+hit1$X3B+hit1$X2B+hit1$H)))

# Store the same sum as a column and plot HR against it.
hit1$All_H <- hit1$HR+hit1$X3B+hit1$X2B+hit1$H

plot(hit1$All_H, hit1$HR)

# OBP vs. SLG and their correlation.
plot(hit1$OBP, hit1$SLG)

cor(hit1$OBP, hit1$SLG)

# Redraw the scatter and overlay the flagged (KPlayer == 1) players in red.
plot(hit1$OBP, hit1$SLG)

points(hit1[hit1$KPlayer==1,]$OBP, hit1[hit1$KPlayer==1,]$SLG, col="red", pch=20)

# Print the rows of the flagged players.
hit1[hit1$KPlayer==1,]

# hit2: players with at least 80 at-bats.
hit2 <- hit[hit$AB>=80,]

# OBP vs. SLG; players with OPS >= 0.8 are overdrawn in grey, the flagged
# (KPlayer == 1) players in red with their names printed below the point.
plot(hit2$OBP, hit2$SLG, main="OBP Vs. SLG - Koreans [80+ ABs - 20160705]")

points(hit2[hit2$OPS>=0.8,]$OBP, hit2[hit2$OPS>=0.8,]$SLG, col="grey", pch=20)

points(hit2[hit2$KPlayer==1,]$OBP, hit2[hit2$KPlayer==1,]$SLG, col="red", pch=20)

text(hit2[hit2$KPlayer==1,]$OBP, hit2[hit2$KPlayer==1,]$SLG, labels=hit2[hit2$KPlayer==1,]$Player, pos=1)

# legend with adjusted vertical spacing
# NOTE(review): yjust receives a length-3 vector here although legend()
# documents it as a single justification value; the spacing effect relies on
# that -- confirm it renders as intended (y.intersp is the documented control).

legend(x=0.21, y=0.7, c("OPS<0.8","OPS>=0.8","Korean"),

pch=c(1,20,20), col=c("black", "grey", "red"), cex = 0.75,

adj = 0,

text.width=0.04, bty='n',

yjust=c(1,0.85,0.7) )

# http://www.endmemo.com/program/R/pchsymbols.php

# Author's observation: Kim Hyun-soo has a top-tier OBP in the league.

# Author's observation: Lee Dae-ho and Kang Jung-ho have high SLG.

# hit3: players with at least 120 at-bats.
hit3 <- hit[hit$AB>=120,]

# OBP vs. SLG with the flagged (KPlayer == 1) players in red and labelled.
plot(hit3$OBP, hit3$SLG, main="OBP Vs. SLG - Koreans")
points(hit3[hit3$KPlayer==1,]$OBP, hit3[hit3$KPlayer==1,]$SLG, col="red", pch=20)
text(hit3[hit3$KPlayer==1,]$OBP, hit3[hit3$KPlayer==1,]$SLG, labels=hit3[hit3$KPlayer==1,]$Player, pos=1)

# Sorted SLG with horizontal reference lines at 0.515 and 0.520.
# The plotted quantity is SLG, so the title names SLG.
plot(sort(hit3$SLG), main="SLG of Kang_J and Lee_D")
abline(h=0.515, col="red")
abline(h=0.520, col="red")

# SLG histograms (default and 20 breaks) with the 0.520 mark.
hist(hit3$SLG)
hist(hit3$SLG, breaks=20)
abline(v=0.520,col="red")

# The 100 highest SLG values, again with the 0.520 reference line.
plot(head(hit3[order(-hit3$SLG),],100)$SLG)
abline(h=0.520, col="red")

# Only the players at or above each reference value.
plot(sort(hit3$SLG[hit3$SLG>=0.515]))
plot(sort(hit3$SLG[hit3$SLG>=0.520]))

# Count of players with SLG >= 0.520 and their share of the 120+ AB group.
# The share is computed from the count so it stays correct if the data change.
n_slg_520 <- sum(hit3$SLG >= 0.520, na.rm = TRUE)
n_slg_520
n_slg_520 / nrow(hit3)

# Sorted AVG of players at or above 0.338.
plot(sort(hit3$AVG[hit3$AVG>=0.338]))

# Author's observation: Kim Hyun-soo is short of the qualifying number of
# plate appearances, but among the 306 players with 120+ AB his AVG ranks
# about 7th in the whole league.

# -------------
# Home runs per at-bat for the 80+ AB group, drawn in ascending order, with a
# horizontal red marker at 0.07236842 (the value quoted in the plot title).
hit2[["RHR"]] <- hit2[["HR"]] / hit2[["AB"]]

plot(
  sort(hit2$RHR),
  main = "KANG_J 0.072 HR/AB [80+AB - 20160705]"
)
abline(h = 0.07236842, col = "red")

# Per-player sum HR + X3B + X2B + H, rolled up per team below.
# NOTE(review): if H already counts doubles, triples and home runs, this sum
# counts extra-base hits twice -- confirm what the H column holds.
hit$All_H <- hit$HR+hit$X3B+hit$X2B+hit$H

# Aggregate one column of `data` by Team and return a two-column data frame
# named c("Team", out_name).
#   data     data frame with a Team column
#   column   name of the column to aggregate
#   out_name name given to the aggregated column
#   fun      aggregation function (defaults to sum)
team_total <- function(data, column, out_name, fun = sum) {
  res <- aggregate(data[[column]], by = list(data$Team), FUN = fun)
  names(res) <- c("Team", out_name)
  res
}

teamhit <- team_total(hit, "All_H", "TeamAll_H")
teamAB <- team_total(hit, "AB", "TeamAB")
teamHR <- team_total(hit, "HR", "TeamHR")

# Join the per-team totals on Team, keeping every team from the left table.
teamhitAB <- merge(teamhit, teamAB, by="Team", all.x=TRUE)
teamhitAB <- merge(teamhitAB, teamHR, by="Team", all.x=TRUE)

plot(teamhitAB$TeamAll_H, teamhitAB$TeamAB)

# Team games are taken as the largest G value among the team's players.
teamGames <- team_total(hit, "G", "TeamG", fun = max)
teamhitAB <- merge(teamhitAB, teamGames, by="Team", all.x=TRUE)

# Team-level All_H per at-bat.
teamhitAB$AVG_AH <- teamhitAB$TeamAll_H/ teamhitAB$TeamAB

# Top six teams by All_H per at-bat, then by total home runs.
head(teamhitAB[order(-teamhitAB$AVG_AH),])
head(teamhitAB[order(-teamhitAB$TeamHR),])

# https://t1.daumcdn.net/cfile/blog/25624E46577B68B01C?download

# ggplot2-like plotting with base graphics

# https://flowingdata.com/2016/03/22/comparing-ggplot2-and-r-base-graphics/

# Home runs per game for the six teams with the most home runs.
teamhitAB1 <- head(teamhitAB[order(-teamhitAB[["TeamHR"]]), ], n = 6L)
teamhitAB1[["THRpG"]] <- teamhitAB1[["TeamHR"]] / teamhitAB1[["TeamG"]]

# las = 1 keeps axis labels horizontal; the setting stays in effect afterwards.
par(las = 1)

barplot(
  height = teamhitAB1[["THRpG"]],
  main = "Team HR per Game of Leading Teams",
  names.arg = teamhitAB1[["Team"]],
  border = FALSE,
  col = "#AFC0CB"
)

# Number of players at each position in the 80+ AB group.
players_by_Pos <- table(hit2$Pos)
barplot(players_by_Pos)

# Label each player "IF" when Pos is one of the listed codes, otherwise "OF".
# NOTE(review): every position code not listed here (for example a DH code,
# if the data has one) is labelled "OF" -- confirm that is intended.
hit$IO <- ifelse(hit$Pos %in% c("P","1B","2B","3B", "SS","C" ), "IF","OF")

# Home-run totals per team and IF/OF group.
teamHR1 <- aggregate(hit$HR, by=list(hit$Team, hit$IO), FUN=sum)
names(teamHR1) <- c("Team", "InOut", "TeamHR")

teamHR2 <- teamHR1[teamHR1$Team %in% c("BAL", "SEA", "TOR"),]

# barplot :: total home runs by infield/outfield for the top-3 HR teams
# ...................

# Build the IF/OF x team matrix by name rather than by row position, so the
# cells cannot be misassigned if the row order of teamHR2 ever differs.
hr_by_group <- tapply(teamHR2$TeamHR, list(teamHR2$InOut, teamHR2$Team), sum)
TeamHRIO1mat <- hr_by_group[c("IF", "OF"), c("BAL", "SEA", "TOR"), drop = FALSE]

mf_col <- c("#3CC3BD", "#FD8210")

barplot(TeamHRIO1mat, beside = TRUE, border=NA, col=mf_col, ylim=c(0,100),
        main="Infielder Vs. Outfielder HRs - leading Teams")

# With a keyword position ("topleft") legend() places the box itself, so no
# yjust is passed.
legend("topleft", row.names(TeamHRIO1mat), pch=15, col=mf_col,
       ncol=2, cex=0.75 )

# Author's observation: this figure goes some way to explaining why BAL uses
# Kim Hyun-soo in a platoon.
# BAL outfielders hit considerably more home runs than other teams'. Perhaps
# an on-base-oriented hitter in the outfield does not fit the team's
# style ... ?
# Lee Dae-ho appears to have reinforced SEA's home-run-oriented infield.

#----------
# Distribution of HR and OBP by infield/outfield group.
# Author's observation: outfielders' OBP is generally higher.

par(mfrow=c(1,2))

IOs <- unique(hit$IO)

# One panel per group; seq_along() keeps the loop empty if IOs is empty.
for (i in seq_along(IOs)) {
  currdata <- hit[hit$IO == IOs[i],]

  plot(currdata$OBP, currdata$HR,
       main=IOs[i], ylim=c(0,max(currdata$HR)*1.2))

  abline(v=median(currdata$OBP, na.rm=TRUE), col="blue")
  abline(v=mean(currdata$OBP, na.rm=TRUE), col="red")
  # Horizontal line at the group's median HR, matching the legend note below.
  abline(h=median(currdata$HR, na.rm=TRUE), col="grey")
}

# red line :: mean OBP : blue line :: median OBP

# grey line :: median HR

# AVG and HR by Position

# Draw one panel per position on a 3 x 4 grid: a jittered scatter of `stat`
# (x) against AB (y), with a blue vertical line at the position's median of
# `stat`.
#   data  data frame with Pos, AB and the `stat` column
#   stat  name of the column plotted on the x axis
#   xlim  x-axis limits shared by all panels
# Returns the position labels invisibly. Only positions present in `data` are
# visited, so no panel is requested for an index beyond the last position.
plot_stat_by_position <- function(data, stat, xlim) {
  par(mfrow = c(3, 4))
  positions <- as.character(sort(unique(data$Pos)))

  for (pos in positions) {
    currdata <- data[data$Pos == pos, ]

    plot(jitter(currdata[[stat]]), jitter(currdata$AB),
         main = pos, xlab = stat, ylab = "AB",
         ylim = c(0, 400), xlim = xlim, las = 1)
    abline(v = median(currdata[[stat]], na.rm = TRUE), col = "blue")
  }

  invisible(positions)
}

Poss <- sort(unique(hit$Pos))

# AVG vs. AB per position, then HR vs. AB per position.
plot_stat_by_position(hit, "AVG", xlim = c(0, 1))
plot_stat_by_position(hit, "HR", xlim = c(0, 30))

# Back to a single-panel layout.
par(mfrow=c(1,1))

# Author's observation: home runs are most frequent in the order
# DH > 1B > 3B > RF > LF.

# Author's observation: for CF, 2B and SS, defense matters more.

par(mfrow=c(1,1))

# Rebuild the 120+ AB subset here: hit3 was first created before hit$IO
# existed, so that earlier copy has no IO column and the IF/OF subsets below
# would come out empty.
hit3 <- hit[hit$AB>=120,]

plot(hit3$AVG, hit3$HR, main ="AVG Vs. HR by In-Out Fielder (120+AB)")

hit5 <- hit3[hit3$IO=="IF",]
hit51 <- hit3[hit3$IO=="OF",]

# Infielders: grey points and a solid least-squares line of HR on AVG.
points(hit5$AVG, hit5$HR, col="grey", pch=20)
abline(lm(HR ~ AVG, data = hit5))

# Outfielders: blue points and a dashed least-squares line of HR on AVG.
points(hit51$AVG, hit51$HR, col="blue", pch=20)
abline(lm(HR ~ AVG, data = hit51), lty=2)

# Dotted guides at each group's median AVG (vertical) and median HR
# (horizontal).
abline(v=median(hit5$AVG, na.rm=TRUE), col="grey" , lty=3)
abline(h=median(hit5$HR, na.rm=TRUE), col="grey", lty=3)
abline(v=median(hit51$AVG, na.rm=TRUE), col="blue" , lty=3)
abline(h=median(hit51$HR, na.rm=TRUE), col="blue", lty=3)

# Author's observation: infielders are ahead of outfielders in both AVG and
# HR.

# Author's observation: the relationship between AVG and HR count differs
# little between the two groups.

# grey :: infielder : blue :: outfielder

# clustering using k-means

# library() stops with an error if ggplot2 is missing, instead of returning
# FALSE and failing later at the first ggplot() call.
library(ggplot2)

# kmeans() starts from randomly chosen centres; fixing the seed makes the
# cluster assignment repeatable from run to run.
set.seed(20160705)

# NOTE(review): the columns are clustered on their raw scales, so the count
# columns (HR, SO, BB) presumably outweigh the rate columns (AVG, OBP, SLG) in
# the distance -- consider scale() if that is not intended.
cluster_vars <- c("AVG","OBP","SLG", "HR" , "SO", "BB")
k1 <- kmeans(hit3[, cluster_vars], 3)

k1$cluster <- as.factor(k1$cluster)

# Keep the assignment as a column so aes() reads it from the plotted data.
hit3$cluster <- k1$cluster

ggplot(hit3, aes(AVG, HR, color = cluster)) + geom_point()

ggplot(hit3, aes(OBP, SLG, color = cluster)) + geom_point()

ggplot(hit3, aes(SO, BB, color = cluster)) + geom_point()

ggplot(hit3, aes(SO, HR, color = cluster)) + geom_point()

ggplot(hit3, aes(BB, HR, color = cluster)) + geom_point()

# Author's observation: the clusters separate a high-SO group from the rest,
# and there are differences by AB, but ... (the original note ends here)

#--------------------

# LF Position and Kim_H

# Author's question: why is Kim_H not getting his due?

# hit6: players whose Pos is "LF" with at least 80 at-bats.
hit6 <- hit[hit$Pos == "LF" & hit$AB>= 80, ]

# hit7: the row(s) for Kim_H, taken from the full table.
hit7 <- hit[hit$Player == "Kim_H", ]

# Size of the LF group and the number of distinct teams.
nrow(hit6)

length(unique(hit$Team))

# In the plots below the red point is Kim_H's actual value and the green
# point is a scaled-up what-if value.
plot(hit6$AVG, hit6$HR)

points(hit7$AVG, hit7$HR, col="red", pch=20)

plot(hit6$AB, hit6$HR)

points(hit7$AB, hit7$HR, col="red", pch=20)

# What-if: AB and HR both multiplied by 1.7.
points(hit7$AB*1.7, hit7$HR*1.7, col="green", pch=20)

# Author's observation: Kim_H has very few home runs for his at-bats,
# for an LF.

# Even with 1.7 times the at-bats his home runs would still rank low.

plot(hit6$AB, hit6$AVG)

points(hit7$AB, hit7$AVG, col="red", pch=20)

# What-if: AB doubled and AVG reduced by 10%.
points(hit7$AB*2, hit7$AVG*0.9, col="green", pch=20)

# Author's observation: if at-bats doubled and AVG dropped 10% to about .300,

# that would still be top class across MLB.

# With 80+ at-bats, more at-bats do not lower AVG.

# If anything, steady playing time could raise AVG.

plot(hit6$AB, hit6$OBP)

points(hit7$AB, hit7$OBP, col="red", pch=20)

# What-if: AB doubled and OBP reduced by 10%.
points(hit7$AB*2, hit7$OBP*0.9, col="green", pch=20)

# Author's observation: OBP is overwhelmingly high, so it would likely stay
# at the top level even with more at-bats.

plot(hit6$AB, hit6$SLG)

points(hit7$AB, hit7$SLG, col="red", pch=20)

# What-if: AB doubled with SLG unchanged, joined to the actual point by a
# dashed grey line.
points(hit7$AB*2, hit7$SLG, col="green", pch=20)

lines(c(hit7$AB, hit7$AB*2), c(hit7$SLG, hit7$SLG), lty=2, col="grey")

# Author's observation: he has many extra-base hits other than home runs, so
# SLG is not much lower.

# A typical gap (mid-range) hitter.

# After more at-bats his SLG would still be upper-tier.

# Assuming no weakness against left-handed pitchers (the reason for the
# platoon),

# the problem is that he does not fit the team's current HR-centred offense.

# Build TBE (Total Bases Earned): a weighted sum of bases by type of on-base
# event.
# NOTE(review): if H already counts doubles, triples and home runs, those
# events receive their weight on top of the 1 from H -- confirm the intended
# weighting.

hit6$TBE <- hit6$H + (hit6$X2B *2 ) + (hit6$X3B *3 ) + (hit6$HR *4) + hit6$BB

# Re-select Kim_H from hit6 so that hit7 carries the new TBE column.
hit7 <- hit6[hit6$Player == "Kim_H", ]

# Sorted TBE of the LF group with a red line at 1.7 times Kim_H's TBE.
plot(sort(hit6$TBE))

abline(a=hit7$TBE*1.7, b=0, col="red")

# Author's observation: even with more at-bats, a top-tier TBE looks hard to
# reach because of the lack of home runs.

# OBP is high, but in terms of contribution to the team it is not the highest
# level.

# In MLB home runs matter even for a lead-off hitter.

# If not home runs, then at least doubles (X2B) have to increase.

# TBE per at-bat, sorted, with Kim_H's value as a red line and his name.
plot(sort(hit6$TBE/hit6$AB), main="AVG TBE(Total Bases Earned) of LFs and Kim_H")

abline(a=hit7$TBE/hit7$AB, b=0, col="red")

text(5, hit7$TBE/hit7$AB, labels=hit7$Player, pos=3, col="blue")

# Author's observation: he reaches base often, so he advances far; about 8th
# overall.

# But not the top level, because of the lack of extra-base power.

# HR per at-bat, sorted, with Kim_H's value as a red line.
plot(sort(hit6$HR/hit6$AB))

abline(a=hit7$HR/hit7$AB, b=0, col="red")

# Author's observation: his home runs per at-bat rank low.

# Doubles per at-bat, sorted, with Kim_H's value as a red line.
plot(sort(hit6$X2B/hit6$AB))

abline(a=hit7$X2B/hit7$AB, b=0, col="red")

# Author's observation: his doubles per at-bat rank high.

# Doubles vs. home runs; red is Kim_H, green is both counts multiplied by
# 1.7, joined by a dashed grey line.
plot(hit6$X2B, hit6$HR, main="Kim_H's HR Vs. 2XB - Amng LFs")

points(hit7$X2B, hit7$HR, col="red", pch=20 )

points(hit7$X2B*1.7, hit7$HR*1.7, col="green", pch=20 )

lines(c(hit7$X2B, hit7$X2B*1.7), c(hit7$HR, hit7$HR*1.7), lty=2, col="grey")

# Author's observation: even with 1.7 times the playing time, doubles would
# be upper-tier but home runs are clearly very short -- bottom tier.

# AVG vs. doubles per at-bat.
plot(hit6$AVG, hit6$X2B/hit6$AB , main="Is Kim_H a Singler?")

points(hit7$AVG, hit7$X2B/hit7$AB , col="red", pch=20 )

# Author's observation: for such a high AVG, the doubles rate does not look
# high.

# AVG vs. doubles rate divided by AVG.
plot(hit6$AVG, (hit6$X2B/hit6$AB)/hit6$AVG , main="Is Kim_H a Singler?")

points(hit7$AVG, (hit7$X2B/hit7$AB)/hit7$AVG , col="red", pch=20 )

# Share of the LF group whose doubles-rate-to-AVG ratio exceeds Kim_H's.
nrow(hit6[(hit6$X2B/hit6$AB)/hit6$AVG > (hit7$X2B/hit7$AB)/hit7$AVG , ]) / nrow(hit6)

# Author's observation: on the doubles-rate-to-AVG ratio he sits at about the
# top 43%.

# With a high AVG he is on first base a lot, and the high AVG actually hurts
# his image.

# He is likely to be seen as a singles (slap) hitter.

# AVG against AB for every player, with the least-squares line of AVG on AB
# (dashed blue) and Kim_H's point in red.
plot(hit$AB, hit$AVG, main="Kim_H's AVG will fall if more games?")
abline(lm(AVG ~ AB, data = hit), lwd = 2, lty = 2, col = "blue")
points(hit7$AB, hit7$AVG, pch = 20, col = "red")

# Author's observation: with more playing time his AVG might rise, if
# anything.
# The AB-AVG relationship across all hitters suggests his AVG would not drop.

# Same picture, adding the line fitted on hit3 only (dashed green).
plot(hit$AB, hit$AVG, main="Kim_H's AVG will fall if more games?")
abline(lm(AVG ~ AB, data = hit), lwd = 2, lty = 2, col = "blue")

# hit3 is for 120+ AB Players
abline(lm(AVG ~ AB, data = hit3), lwd = 2, lty = 2, col = "green")
points(hit7$AB, hit7$AVG, pch = 20, col = "red")

# Author's observation: even looking only at hitters with 120+ at-bats, AVG
# looks slightly more likely to rise.

mlb_hitt_20160705.csv

0.06MB

mlb_hitt_20160818.csv

0.07MB

mlb_hitt_20160705.csv

0.06MB

저작자표시 비영리 변경금지

'R 데이터 분석' 카테고리의 다른 글

[SCW.VEDAR] Part 1 (0)	2016.08.12
[R분석] cluster based anomaly detection (0)	2016.08.11
[R분석] 플롯에서 X축변경 예제 reassign x axis value in R plot (0)	2016.06.16
R 연관성 규칙 생성 연습 [Association Rule Discovery in R] (0)	2016.03.03
[R 데이터 처리] 도로명 주소에서 동이름 추출 (0)	2015.12.29

현재글[SCWHO] 시각적 데이터 분석 EDA 예제 MLB Hitting 2016mid

리비젼 CRM ( revisioncrm )

빅 데이터, AI, 리비젼컨설팅, 전용준 빅데이터, chatGPT, 데이터 사이언티스트, 데이터 분석, 빅데이터, CRM, 리비젼, 디지털마케팅, GPT, 프롬프트, 데이터분석, 인공지능, 전용준, 프롬프트엔지니어링, R, 챗GPT, 머신러닝,

Today :
Yesterday :