#---------------
# anomaly detection
#
# example applying a staged (tiered) clustering approach, one of several possible methods
# procedure :: [1] cluster globally [2] pick a target cluster
# [3] cluster the target cluster [4] pick the anomaly subcluster
# bring in the data file directly from blog posting attachment
# Read the CSV attached to the blog post straight from its download URL,
# then remove every space character from the player names.
hit <- read.csv(
  file = "https://t1.daumcdn.net/cfile/blog/230D904A577DCECA34?download"
)
hit$Player <- gsub(pattern = " ", replacement = "", x = hit$Player)
# Print the column names and the first rows for a quick sanity check.
colnames(hit)
head(hit)
# data definitions
# `hits` switches the hitter data preparation on. When it is not TRUE, none of
# hit1 / hit2 / hit3 are created and the rest of the script cannot run.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
hits <- TRUE
if (isTRUE(hits)) {
  # keep only rows with AB >= 50
  hit1 <- hit[hit$AB >= 50, ]
  # add K-player tag: 1 for the listed players, 0 for everyone else
  cKPlayer <- c("Kim_H", "Lee_D", "Choo_S", "Kang_J", "Park_B", "Choi_J")
  hit1$KPlayer <- 0
  hit1[hit1$Player %in% cKPlayer, "KPlayer"] <- 1
  # stricter AB cut-offs; derived from hit1 so they carry the KPlayer column
  hit2 <- hit1[hit1$AB >= 80, ]
  hit3 <- hit1[hit1$AB >= 120, ]
}
set.seed(0)
# with base plot and clustering
# Global k-means with 3 centers on the raw (unscaled) stat columns.
k1 <- kmeans(hit1[, c("AVG", "OBP", "SLG", "HR", "SO", "BB", "AB")], 3)
# Factor labels let plot() dispatch to bar plots / box plots further down and
# still index the colour vector by the underlying integer codes.
k1$cluster <- as.factor(k1$cluster)
cols <- c("red", "green", "blue")
plot(hit1$AVG, hit1$SO, col = cols[k1$cluster], pch = 20)
plot(hit1$AVG, hit1$OPS, col = cols[k1$cluster], pch = 20, main = "global clusters - k=3")
plot(hit1$OBP, hit1$SLG, col = cols[k1$cluster], pch = 20, main = "Clusters")
# Highlight Kim_H and label the K-players (taken from hit3, i.e. AB >= 120).
# These annotations use (OBP, SLG) coordinates, so they must be drawn on the
# OBP-vs-SLG scatter; on the AB-vs-SLG plot they would land outside the axes.
kim_h <- hit3[hit3$Player == "Kim_H", ]
k_players <- hit3[hit3$KPlayer == 1, ]
points(kim_h$OBP, kim_h$SLG, cex = 2)
text(kim_h$OBP, kim_h$SLG, labels = kim_h$Player, pos = 1)
text(k_players$OBP, k_players$SLG, labels = k_players$Player, pos = 1)
# Cluster sizes, then per-cluster profiles (bar plot of counts, box plots of stats).
table(k1$cluster)
hit1$cluster <- k1$cluster
plot(hit1$cluster)
plot(hit1$cluster, hit1$AB, ylab = "AB")
plot(hit1$cluster, hit1$AVG, ylab = "AVG")
plot(hit1$cluster, hit1$OBP, ylab = "OBP")
plot(hit1$cluster, hit1$SLG, ylab = "SLG")
plot(hit1$cluster, hit1$HR, ylab = "HR")
plot(hit1$cluster, hit1$HR / hit1$AB, ylab = "HR/AB")
# Kept as the last plot of this section: the sub-clustering step that follows
# overlays its points on this AB-vs-SLG scatter.
plot(hit1$AB, hit1$SLG, col = cols[k1$cluster], pch = 20, main = "Clusters")
# sub clustering
# Second tier: re-cluster only the rows of global cluster "3" into 4 groups.
# NOTE(review): k-means cluster numbering is arbitrary; the hard-coded label
# "3" presumably reflects the run with set.seed(0) on this data set.
# Re-check the target cluster if the data or the seed change.
hit1c1 <- hit1[hit1$cluster == "3", ]
k1c1 <- kmeans(hit1c1[, c("AVG", "OBP", "SLG", "HR", "SO", "BB", "AB")], 4)
k1c1$cluster <- as.factor(k1c1$cluster)
hit1c1$cluster <- k1c1$cluster
cols1 <- c("black", "darkgrey", "lightgrey", "lightblue")
# points() adds to the plot that is currently open (the AB-vs-SLG scatter
# drawn just before this section), recolouring cluster 3 by sub-cluster.
points(hit1c1$AB, hit1c1$SLG, col = cols1[hit1c1$cluster], pch = 20)
# Sub-cluster sizes and per-sub-cluster box plots.
plot(hit1c1$cluster)
plot(hit1c1$cluster, hit1c1$AVG, ylab = "AVG")
plot(hit1c1$cluster, hit1c1$SLG, ylab = "SLG")
plot(hit1c1$cluster, hit1c1$AB, ylab = "AB")
plot(hit1c1$cluster, hit1c1$HR, ylab = "HR")
# Combined label: global clusters keep their label, rows of cluster 3 get
# "c3_<subcluster>". hit1c1 preserves the row order of hit1, so the
# replacement values line up with the selected rows.
hit1$cluster1 <- as.character(hit1$cluster)
hit1$cluster1[hit1$cluster == "3"] <- paste0("c3_", as.character(hit1c1$cluster))
hit1$cluster1 <- as.factor(hit1$cluster1)
plot(hit1$cluster1)
plot(hit1$cluster1, hit1$AB, ylab = "AB")
plot(hit1$cluster1, hit1$AVG, ylab = "AVG")
plot(hit1$cluster1, hit1$OBP, ylab = "OBP")
plot(hit1$cluster1, hit1$SLG, ylab = "SLG")
plot(hit1$cluster1, hit1$HR, ylab = "HR")
plot(hit1$cluster1, hit1$HR / hit1$AB, ylab = "HR/AB")
plot(hit1$cluster1, hit1$X3B - hit1$HR, ylab = "X3B-HR")
plot(hit1$cluster1, hit1$X2B - hit1$HR, ylab = "X2B-HR")
plot(hit1$cluster1, (hit1$X2B + hit1$X3B - hit1$HR) / (hit1$X2B + hit1$X3B + hit1$HR), ylab = "X2B+X3B-HR/23H")
plot(hit1$cluster1, (hit1$X3B - hit1$HR) / hit1$AB, ylab = "X3B-HR / AB")
# Index the palette by the combined cluster label; passing the bare colour
# vector would recycle colours by row order instead of by cluster.
plot(hit1$AVG, hit1$HR / hit1$AB, ylab = "HR/AB", col = c(cols[1:2], cols1)[hit1$cluster1], pch = 20)
# get key factors from the globe
# library() stops with an error when randomForest is not installed, whereas
# require() would only return FALSE and let the script fail later.
library(randomForest)
# Classify the combined (tiered) cluster label from the stat columns and
# inspect variable importance. Each predictor is listed once; repeated terms
# in a formula are dropped anyway.
fit <- randomForest(cluster1 ~ AB + OBP + HR + SLG + SO + BB + X2B + X3B,
  data = hit1,
  importance = TRUE,
  ntree = 1000
)
plot(fit)
varImpPlot(fit, main = "global clustering - varImp")
# get key factors from a cluster
# Same model restricted to the rows of the sub-clustered cluster; here the
# response is the second-tier sub-cluster label stored in hit1c1$cluster.
fit1 <- randomForest(cluster ~ AB + OBP + HR + SLG + SO + BB + X2B + X3B,
  data = hit1c1,
  importance = TRUE,
  ntree = 1000
)
plot(fit1)
varImpPlot(fit1, main = "2nd tier clustering - varImp")
#----
# initial subcluster profiling
# Palette for the combined label: two colours for the untouched global
# clusters followed by the four sub-cluster colours.
cols2 <- c(cols[1:2], cols1)
# Colour that singles out sub-cluster "c3_4" (red) against everything else (grey).
# NOTE(review): the label "c3_4" is tied to the k-means numbering of this run.
c3_4_col <- ifelse(hit1$cluster1 == "c3_4", "red", "grey")
plot(hit1$cluster1, hit1$SO, ylab = "SO")
plot(hit1$cluster1, hit1$SO / hit1$AB, ylab = "SO/AB")
plot(hit1$AB, hit1$SO, ylab = "SO", col = cols2[hit1$cluster1], pch = 20)
plot(hit1$AB, hit1$X2B + hit1$X3B, ylab = "X23B", col = cols2[hit1$cluster1], pch = 20)
plot(hit1$AB, (hit1$X2B + hit1$X3B) / hit1$AB, ylab = "X23B/AB", col = cols2[hit1$cluster1], pch = 20)
plot(hit1$HR, hit1$X2B + hit1$X3B, ylab = "X23B", col = cols2[hit1$cluster1], pch = 20)
plot(hit1$BB, hit1$SO, ylab = "SO", col = cols2[hit1$cluster1], pch = 20)
plot(hit1$BB, hit1$SO, col = c3_4_col, pch = 20)
plot(hit1$AB, hit1$SO + hit1$BB, col = c3_4_col, pch = 20)
plot(hit1$AB, hit1$SO + hit1$BB + hit1$HR, col = c3_4_col, pch = 20)
# Parenthesised so the sum is divided by AB; without the parentheses only BB
# was divided (`/` binds tighter than `+`), mixing a count with a rate.
plot(hit1$HR / hit1$AB, (hit1$SO + hit1$BB) / hit1$AB, col = c3_4_col, pch = 20)
plot(hit1$AVG, (hit1$SO + hit1$BB) / hit1$AB, col = c3_4_col, pch = 20)
plot(hit1$BB / hit1$AB, hit1$SO / hit1$AB, ylab = "SO/AB", col = cols2[hit1$cluster1], pch = 20)
plot(hit1$BB / hit1$AB, hit1$SO / hit1$AB, col = c3_4_col, pch = 20)
#-----
# refined subcluster profiling
# Each view is drawn with the global 3-cluster colours, with the tiered
# (2 + 4) colours, and/or with sub-cluster "c3_4" highlighted in red.
plot(hit1$AB, hit1$SO, ylab = "SO", col = cols[hit1$cluster], pch = 20, main = "global clusters - k=3")
plot(hit1$X2B, hit1$SO, ylab = "SO", col = cols[hit1$cluster], pch = 20)
plot(hit1$X2B, hit1$SO, ylab = "SO", col = c(cols[1:2], cols1)[hit1$cluster1], pch = 20)
plot(hit1$X2B, hit1$SO, col = ifelse(hit1$cluster1 == "c3_4", "red", "grey"), pch = 20)
plot(hit1$AB, hit1$SO, ylab = "SO", col = c(cols)[hit1$cluster], pch = 20)
plot(hit1$AB, hit1$SO, ylab = "SO", col = c(cols[1:2], cols1)[hit1$cluster1], pch = 20)
plot(hit1$AB, hit1$SO / hit1$AB, ylab = "SO/AB", col = c(cols)[hit1$cluster], pch = 20)
plot(hit1$AB, hit1$SO / hit1$AB, ylab = "SO/AB", col = c(cols[1:2], cols1)[hit1$cluster1], pch = 20)
plot(hit1$AB, (hit1$SO + hit1$HR) / hit1$AB, ylab = "SOHR/AB", col = c(cols[1:2], cols1)[hit1$cluster1], pch = 20)
# Label only Kim_H; every other point gets an empty label.
text(hit1$AB, (hit1$SO + hit1$HR) / hit1$AB, labels = ifelse(hit1$Player == "Kim_H", hit1$Player, ""))
# Title kept on a single line so it matches the companion plot below
# (a line-wrap had put a raw newline inside the string).
plot(hit1$AB, (hit1$SO + hit1$BB + hit1$HR) / hit1$AB, ylab = "SOBBHR/AB", col = c(cols[1:2], cols1)[hit1$cluster1], pch = 20, main = "tiered clustering - k=2 + k=4")
text(hit1$AB, (hit1$SO + hit1$BB + hit1$HR) / hit1$AB, labels = ifelse(hit1$Player == "Kim_H", hit1$Player, ""))
plot(hit1$AB, (hit1$SO + hit1$BB + hit1$HR) / hit1$AB, col = ifelse(hit1$cluster1 == "c3_4", "red", "grey"), pch = 20, main = "tiered clustering - k=2 + k=4")
# A group that lacks not only SO but also BB and HR
plot(hit1$AB, (hit1$X2B - hit1$SO) / hit1$AB, ylab = "X2B-SO/AB", col = c(cols[1:2], cols1)[hit1$cluster1], pch = 20)
plot(hit1$X2B, hit1$SO, col = c(cols[1:2], cols1)[hit1$cluster1], pch = 20)
plot(hit1$X2B, hit1$SO, col = ifelse(hit1$cluster1 == "c3_4", "red", "grey"), pch = 20)
plot(hit1$AB, hit1$BB / hit1$SO, col = c(cols[1:2], cols1)[hit1$cluster1], pch = 20)
plot(hit1$X2B, hit1$SO, col = ifelse(hit1$cluster1 == "c3_4", "red", "grey"), pch = 20)
#-------
# more subcluster profiling
# Palette for the combined label: two global-cluster colours followed by the
# four sub-cluster colours.
cols2 <- c(cols[1:2], cols1)
plot(hit1$SLG, hit1$HR / hit1$AB, ylab = "HR/AB", col = cols[hit1$cluster], pch = 20)
# Keep the tiered colour for rows outside cluster 3 and for sub-cluster
# "c3_4"; grey out the remaining sub-clusters of cluster 3.
plot(hit1$SLG, hit1$HR / hit1$AB, ylab = "HR/AB",
  col = ifelse(hit1$cluster != "3", cols2[hit1$cluster1],
    ifelse(hit1$cluster1 == "c3_4", cols2[hit1$cluster1], "grey")),
  pch = 20)
plot(hit1$SLG, hit1$HR / hit1$AB, ylab = "HR/AB", col = cols[hit1$cluster], pch = 20)
plot(hit1$SLG, hit1$HR / hit1$AB, ylab = "HR/AB",
  col = ifelse(hit1$cluster != "3", cols2[hit1$cluster1],
    ifelse(hit1$cluster1 == "c3_4", cols2[hit1$cluster1], "grey")),
  pch = 20)
plot(hit1$SLG, hit1$HR / hit1$AB, ylab = "HR/AB", col = ifelse(hit1$cluster1 == "c3_4", "red", "grey"), pch = 20)
plot(hit1$SLG - hit1$AVG, hit1$HR / hit1$AB, ylab = "HR/AB", col = ifelse(hit1$cluster1 == "c3_4", "red", "grey"), pch = 20)
plot((hit1$X2B + hit1$X3B) / hit1$AB, hit1$HR / hit1$AB, ylab = "HR/AB", col = ifelse(hit1$cluster == "3", "red", "grey"), pch = 20)
# Overlay on the plot above: filled dot for "c3_4", open circle otherwise.
points((hit1$X2B + hit1$X3B) / hit1$AB, hit1$HR / hit1$AB, col = "red", pch = ifelse(hit1$cluster1 == "c3_4", 20, 1))
plot(hit1$cluster1, hit1$HR / (hit1$X2B + hit1$X3B + hit1$HR), ylab = "HR/X2+B")
plot(hit1$X2B + hit1$X3B, hit1$HR, ylab = "HR", col = ifelse(hit1$cluster == "3", "red", "grey"), pch = 20)
# From here on: colour marks global cluster 3, symbol marks sub-cluster "c3_4".
plot(hit1$X2B + hit1$X3B, hit1$HR, col = ifelse(hit1$cluster == "3", "red", "grey"), pch = ifelse(hit1$cluster1 == "c3_4", 20, 21))
plot(hit1$HR / (hit1$X2B + hit1$X3B + hit1$HR), hit1$HR / hit1$AB, col = ifelse(hit1$cluster == "3", "red", "grey"), pch = ifelse(hit1$cluster1 == "c3_4", 20, 21))
plot(hit1$HR / (hit1$X2B + hit1$X3B + hit1$HR), hit1$SLG, col = ifelse(hit1$cluster == "3", "red", "grey"), pch = ifelse(hit1$cluster1 == "c3_4", 20, 21))
plot(hit1$HR / hit1$AB, hit1$SLG, col = ifelse(hit1$cluster == "3", "red", "grey"), pch = ifelse(hit1$cluster1 == "c3_4", 20, 21))
plot(hit1$X2B / hit1$AB, hit1$SLG, col = ifelse(hit1$cluster == "3", "red", "grey"), pch = ifelse(hit1$cluster1 == "c3_4", 20, 21))
plot(hit1$X3B / hit1$AB, hit1$SLG, col = ifelse(hit1$cluster == "3", "red", "grey"), pch = ifelse(hit1$cluster1 == "c3_4", 20, 21))
plot(hit1$X3B / hit1$AB, hit1$HR / hit1$AB, col = ifelse(hit1$cluster == "3", "red", "grey"), pch = ifelse(hit1$cluster1 == "c3_4", 20, 21))
# jitter() separates points that share the same integer counts.
plot(jitter(hit1$X3B), jitter(hit1$HR), col = ifelse(hit1$cluster == "3", "red", "grey"), pch = ifelse(hit1$cluster1 == "c3_4", 20, 21))
# Index the palette by the global cluster; the bare colour vector would be
# recycled by row order and carry no cluster information.
plot(hit1$SLG - hit1$AVG, hit1$HR / hit1$AB, ylab = "HR/AB", col = cols[hit1$cluster], pch = 20)
plot(hit1$AVG, hit1$HR / hit1$AB, ylab = "HR/AB", col = cols[hit1$cluster], pch = 20)
plot(hit1$cluster1, hit1$X2B / hit1$AB, ylab = "X2B/AB")
plot(hit1$cluster1, hit1$X3B / hit1$AB, ylab = "X3B/AB")
plot(hit1$cluster1, hit1$X3B - hit1$HR, ylab = "X3B-HR")
plot(hit1$cluster1, (hit1$X3B - hit1$HR) / hit1$AB, ylab = "X3B-HR / AB")
plot(hit1$cluster1, hit1$HR, ylab = "HR")
plot(hit1$cluster1, hit1$HR / hit1$AB, ylab = "HR/AB")
plot(jitter(hit1$HR / hit1$AB), jitter(hit1$X3B / hit1$AB), ylab = "X3B/AB", col = ifelse(hit1$cluster1 == "c3_4", cols2[hit1$cluster1], "black"), pch = 20)
# The closing parenthesis of this call was missing, which made the whole
# script unparseable from this point on.
plot(jitter(hit1$HR / hit1$AB), jitter((hit1$X2B + hit1$X3B) / hit1$AB), ylab = "X23B/AB",
  col = ifelse(hit1$cluster1 == "c3_4", cols2[hit1$cluster1], "black"),
  pch = 20)
#--------
# granular clustering - global
# For comparison with the tiered approach: one flat k-means run with 6
# centers on the same columns, starting from the same seed.
set.seed(0)
k2 <- kmeans(
  x = hit1[c("AVG", "OBP", "SLG", "HR", "SO", "BB", "AB")],
  centers = 6
)
k2$cluster <- factor(k2$cluster)
hit1$cluster2 <- k2$cluster
# Cluster sizes, then the AB-vs-SO scatter coloured by the six flat clusters.
plot(hit1$cluster2)
plot(
  hit1$AB, hit1$SO,
  ylab = "SO",
  col = cols2[hit1$cluster2],
  pch = 20,
  main = "global clustering - k=6"
)
# Unlike sub-clustering a single target cluster, the flat run only splits the
# data more finely along the factors that already matter globally.
# Sub-clustering just the target cluster uncovers a group that is not expected
# from the global view -- Anomaly group
#--------
# --- blog footer, not part of the script: other posts in the 'R 데이터 분석' (R data analysis) category ---
# [SCW.VEDAR] Part 2 (0) | 2016.08.12
# [SCW.VEDAR] Part 1 (0) | 2016.08.12
# [SCWHO] 시각적 데이터 분석 EDA 예제 MLB Hitting 2016mid (0) | 2016.07.05
#   (visual data analysis EDA example)
# [R분석] 플롯에서 X축변경 예제 reassign x axis value in R plot (0) | 2016.06.16
# R 연관성 규칙 생성 연습 [Association Rule Discovery in R] (0) | 2016.03.03