>RE::VISION CRM

R 데이터 분석

[R분석] cluster based anomaly detection

YONG_X 2016. 8. 11. 17:02

#---------------

#  anomaly detection

#  

#  example applying a step-wise (tiered) clustering approach, one of several possible methods

# procedure :: [1] cluster globally [2] pick a target cluster

#      [3] cluster the target cluster [4] pick the anomaly subcluster



# Bring in the data file directly from the blog posting attachment.

hit <- read.csv("https://t1.daumcdn.net/cfile/blog/230D904A577DCECA34?download")

# Remove spaces from the player names so the exact-match lookups below work.
hit$Player <- gsub(" ", "", hit$Player)

colnames(hit)

head(hit)


# data definitions

# Switch guarding the derived data sets. Spelled TRUE because T is an
# ordinary variable that can be reassigned.
hits <- TRUE

if (isTRUE(hits)) {

  # Base analysis set: rows with AB >= 50.
  hit1 <- hit[hit$AB >= 50, ]

  # add K-player tag: 1 for the players listed here, 0 for everyone else
  cKPlayer <- c("Kim_H", "Lee_D", "Choo_S", "Kang_J", "Park_B", "Choi_J")
  hit1$KPlayer <- 0
  hit1[hit1$Player %in% cKPlayer, "KPlayer"] <- 1

  # Stricter AB cut-offs; taken from hit1 so they carry the KPlayer column.
  hit2 <- hit1[hit1$AB >= 80, ]
  hit3 <- hit1[hit1$AB >= 120, ]

}



set.seed(0)

# Global clustering with base graphics: k-means with 3 centers on the
# seven columns named in global_vars.
global_vars <- c("AVG", "OBP", "SLG", "HR", "SO", "BB", "AB")
k1 <- kmeans(hit1[, global_vars], centers = 3)
k1$cluster <- factor(k1$cluster)

# One colour per global cluster, looked up through the factor codes.
cols <- c("red", "green", "blue")
k1_col <- cols[k1$cluster]

plot(hit1$AVG, hit1$SO, col = k1_col, pch = 20)
plot(hit1$AVG, hit1$OPS, col = k1_col, pch = 20, main = "global clusters - k=3")
plot(hit1$OBP, hit1$SLG, col = k1_col, pch = 20, main = "Clusters")

# Cluster sizes, then store the assignment on the data frame.
table(k1$cluster)
hit1[["cluster"]] <- k1$cluster

# Bar chart of cluster sizes followed by one box plot per column.
plot(hit1$cluster)
for (v in c("AB", "AVG", "OBP", "SLG", "HR")) {
  plot(hit1$cluster, hit1[[v]], ylab = v)
}
plot(hit1$cluster, hit1$HR / hit1$AB, ylab = "HR/AB")



# AB vs SLG coloured by global cluster. The annotations below are drawn on
# this plot, so their x coordinate must be AB; using OBP would place them
# far outside the AB axis range.
plot(hit1$AB, hit1$SLG, col = cols[k1$cluster], pch = 20, main = "Clusters")

# Rows to annotate, taken from hit3 (the AB >= 120 subset).
kim_h <- hit3[hit3$Player == "Kim_H", ]
k_players <- hit3[hit3$KPlayer == 1, ]

# Circle and label Kim_H, then label every tagged K-player.
points(kim_h$AB, kim_h$SLG, cex = 2)
text(kim_h$AB, kim_h$SLG, labels = kim_h$Player, pos = 1)
text(k_players$AB, k_players$SLG, labels = k_players$Player, pos = 1)



# sub clustering: re-cluster only the rows of global cluster "3"

in_target <- hit1$cluster == "3"
hit1c1 <- hit1[in_target, ]

# k-means with 4 centers on the same seven columns as the global run.
sub_vars <- c("AVG", "OBP", "SLG", "HR", "SO", "BB", "AB")
k1c1 <- kmeans(hit1c1[, sub_vars], centers = 4)
k1c1$cluster <- factor(k1c1$cluster)

# Replace the inherited global label with the sub-cluster label.
hit1c1[["cluster"]] <- k1c1$cluster

cols1 <- c("black", "darkgrey", "lightgrey", "lightblue")

# Added to whichever plot is currently open.
points(hit1c1$AB, hit1c1$SLG, col = cols1[hit1c1$cluster], pch = 20)

# Bar chart of sub-cluster sizes followed by one box plot per column.
plot(hit1c1$cluster)
for (v in c("AVG", "SLG", "AB", "HR")) {
  plot(hit1c1$cluster, hit1c1[[v]], ylab = v)
}



# Build one combined label: rows of global cluster "3" get their sub-cluster
# id prefixed with "c3_", all other rows keep their global cluster id.
hit1$cluster1 <- as.character(hit1$cluster)
# hit1c1 was cut from hit1 with this same condition, so the rows line up.
hit1$cluster1[hit1$cluster == "3"] <- paste0("c3_", as.character(hit1c1$cluster))
hit1$cluster1 <- as.factor(hit1$cluster1)

# Bar chart of label counts followed by box plots per combined label.
plot(hit1$cluster1)
for (v in c("AB", "AVG", "OBP", "SLG", "HR")) {
  plot(hit1$cluster1, hit1[[v]], ylab = v)
}
plot(hit1$cluster1, hit1$HR / hit1$AB, ylab = "HR/AB")
plot(hit1$cluster1, hit1$X3B - hit1$HR, ylab = "X3B-HR")
plot(hit1$cluster1, hit1$X2B - hit1$HR, ylab = "X2B-HR")
plot(hit1$cluster1,
     (hit1$X2B + hit1$X3B - hit1$HR) / (hit1$X2B + hit1$X3B + hit1$HR),
     ylab = "X2B+X3B-HR/23H")
plot(hit1$cluster1, (hit1$X3B - hit1$HR) / hit1$AB, ylab = "X3B-HR / AB")

# Colour each point by its combined label. The palette has to be indexed by
# the factor; passing the bare 6-colour vector recycles it by row position.
plot(hit1$AVG, hit1$HR / hit1$AB, ylab = "HR/AB",
     col = c(cols[1:2], cols1)[hit1$cluster1], pch = 20)


# get key factors from the globe

# library() stops with an error when the package is missing; require() would
# only return FALSE and let the script fail later at randomForest().
library(randomForest)

# Classify the combined label from the batting columns and inspect variable
# importance. Each predictor is listed once (HR and AB were repeated).
fit <- randomForest(cluster1 ~ AB + OBP + HR + SLG + SO + BB + X2B + X3B,
                    data = hit1,
                    importance = TRUE,
                    ntree = 1000)

plot(fit)

varImpPlot(fit, main = "global clustering - varImp")



# get key factors from a cluster

# Same model restricted to hit1c1, where `cluster` holds the sub-cluster id.
fit1 <- randomForest(cluster ~ AB + OBP + HR + SLG + SO + BB + X2B + X3B,
                     data = hit1c1,
                     importance = TRUE,
                     ntree = 1000)

plot(fit1)

varImpPlot(fit1, main = "2nd tier clustering - varImp")



#----
# initial subcluster profiling

# Point colours reused below: one colour per combined label, and a
# red/grey highlight of sub-cluster "c3_4".
tier_cols <- c(cols[1:2], cols1)[hit1$cluster1]
c3_4_cols <- ifelse(hit1$cluster1 == "c3_4", "red", "grey")

plot(hit1$cluster1, hit1$SO, ylab = "SO")
plot(hit1$cluster1, hit1$SO / hit1$AB, ylab = "SO/AB")

plot(hit1$AB, hit1$SO, ylab = "SO", col = tier_cols, pch = 20)
plot(hit1$AB, hit1$X2B + hit1$X3B, ylab = "X23B", col = tier_cols, pch = 20)
plot(hit1$AB, (hit1$X2B + hit1$X3B) / hit1$AB, ylab = "X23B/AB", col = tier_cols, pch = 20)
plot(hit1$HR, hit1$X2B + hit1$X3B, ylab = "X23B", col = tier_cols, pch = 20)
plot(hit1$BB, hit1$SO, ylab = "SO", col = tier_cols, pch = 20)

plot(hit1$BB, hit1$SO, col = c3_4_cols, pch = 20)
plot(hit1$AB, hit1$SO + hit1$BB, col = c3_4_cols, pch = 20)
plot(hit1$AB, hit1$SO + hit1$BB + hit1$HR, col = c3_4_cols, pch = 20)

# Rate per AB of SO and BB together. The sum is parenthesised so the whole
# sum is divided by AB; without it only BB was divided.
plot(hit1$HR / hit1$AB, (hit1$SO + hit1$BB) / hit1$AB, col = c3_4_cols, pch = 20)
plot(hit1$AVG, (hit1$SO + hit1$BB) / hit1$AB, col = c3_4_cols, pch = 20)

plot(hit1$BB / hit1$AB, hit1$SO / hit1$AB, ylab = "SO/AB", col = tier_cols, pch = 20)
plot(hit1$BB / hit1$AB, hit1$SO / hit1$AB, col = c3_4_cols, pch = 20)



#-----
# refined subcluster profiling

# Point colours reused below: global cluster, combined label, and a
# red/grey highlight of sub-cluster "c3_4".
global_cols <- cols[hit1$cluster]
tier_cols <- c(cols[1:2], cols1)[hit1$cluster1]
c3_4_cols <- ifelse(hit1$cluster1 == "c3_4", "red", "grey")

plot(hit1$AB, hit1$SO, ylab = "SO", col = global_cols, pch = 20, main = "global clusters - k=3")

plot(hit1$X2B, hit1$SO, ylab = "SO", col = global_cols, pch = 20)
plot(hit1$X2B, hit1$SO, ylab = "SO", col = tier_cols, pch = 20)
plot(hit1$X2B, hit1$SO, col = c3_4_cols, pch = 20)

plot(hit1$AB, hit1$SO, ylab = "SO", col = global_cols, pch = 20)
plot(hit1$AB, hit1$SO, ylab = "SO", col = tier_cols, pch = 20)

plot(hit1$AB, hit1$SO / hit1$AB, ylab = "SO/AB", col = global_cols, pch = 20)
plot(hit1$AB, hit1$SO / hit1$AB, ylab = "SO/AB", col = tier_cols, pch = 20)

# Label only Kim_H; every other point gets an empty label.
plot(hit1$AB, (hit1$SO + hit1$HR) / hit1$AB, ylab = "SOHR/AB", col = tier_cols, pch = 20)
text(hit1$AB, (hit1$SO + hit1$HR) / hit1$AB, labels = ifelse(hit1$Player == "Kim_H", hit1$Player, ""))

# The title is a single-line string; it was previously split across source
# lines, which put line breaks into the plot title.
plot(hit1$AB, (hit1$SO + hit1$BB + hit1$HR) / hit1$AB, ylab = "SOBBHR/AB", col = tier_cols, pch = 20,
     main = "tiered clustering - k=2 + k=4")
text(hit1$AB, (hit1$SO + hit1$BB + hit1$HR) / hit1$AB, labels = ifelse(hit1$Player == "Kim_H", hit1$Player, ""))

plot(hit1$AB, (hit1$SO + hit1$BB + hit1$HR) / hit1$AB, col = c3_4_cols, pch = 20,
     main = "tiered clustering - k=2 + k=4")


# A group that lacks not only SO but also BB and HR

plot(hit1$AB, (hit1$X2B - hit1$SO) / hit1$AB, ylab = "X2B-SO/AB", col = tier_cols, pch = 20)
plot(hit1$X2B, hit1$SO, col = tier_cols, pch = 20)
plot(hit1$X2B, hit1$SO, col = c3_4_cols, pch = 20)

plot(hit1$AB, hit1$BB / hit1$SO, col = tier_cols, pch = 20)
plot(hit1$X2B, hit1$SO, col = c3_4_cols, pch = 20)



#-------
# more subcluster profiling

# Palette for the combined label hit1$cluster1, indexed by its factor codes.
cols2 <- c(cols[1:2], cols1)

# Point styles reused below.
global_cols <- cols[hit1$cluster]                          # by global cluster
c3_cols <- ifelse(hit1$cluster == "3", "red", "grey")      # highlight cluster "3"
c3_4_cols <- ifelse(hit1$cluster1 == "c3_4", "red", "grey") # highlight "c3_4"
c3_4_pch <- ifelse(hit1$cluster1 == "c3_4", 20, 21)        # filled vs open symbol
# Palette colour outside cluster "3" and for "c3_4"; grey for the rest of "3".
focus_cols <- ifelse(hit1$cluster != "3", cols2[hit1$cluster1],
                     ifelse(hit1$cluster1 == "c3_4", cols2[hit1$cluster1], "grey"))

plot(hit1$SLG, hit1$HR / hit1$AB, ylab = "HR/AB", col = global_cols, pch = 20)
plot(hit1$SLG, hit1$HR / hit1$AB, ylab = "HR/AB", col = focus_cols, pch = 20)

plot(hit1$SLG, hit1$HR / hit1$AB, ylab = "HR/AB", col = global_cols, pch = 20)
plot(hit1$SLG, hit1$HR / hit1$AB, ylab = "HR/AB", col = focus_cols, pch = 20)

plot(hit1$SLG, hit1$HR / hit1$AB, ylab = "HR/AB", col = c3_4_cols, pch = 20)
plot(hit1$SLG - hit1$AVG, hit1$HR / hit1$AB, ylab = "HR/AB", col = c3_4_cols, pch = 20)

plot((hit1$X2B + hit1$X3B) / hit1$AB, hit1$HR / hit1$AB, ylab = "HR/AB", col = c3_cols, pch = 20)
points((hit1$X2B + hit1$X3B) / hit1$AB, hit1$HR / hit1$AB, col = "red",
       pch = ifelse(hit1$cluster1 == "c3_4", 20, 1))

plot(hit1$cluster1, hit1$HR / (hit1$X2B + hit1$X3B + hit1$HR), ylab = "HR/X2+B")

plot(hit1$X2B + hit1$X3B, hit1$HR, ylab = "HR", col = c3_cols, pch = 20)
plot(hit1$X2B + hit1$X3B, hit1$HR, col = c3_cols, pch = c3_4_pch)
plot(hit1$HR / (hit1$X2B + hit1$X3B + hit1$HR), hit1$HR / hit1$AB, col = c3_cols, pch = c3_4_pch)
plot(hit1$HR / (hit1$X2B + hit1$X3B + hit1$HR), hit1$SLG, col = c3_cols, pch = c3_4_pch)
plot(hit1$HR / hit1$AB, hit1$SLG, col = c3_cols, pch = c3_4_pch)
plot(hit1$X2B / hit1$AB, hit1$SLG, col = c3_cols, pch = c3_4_pch)
plot(hit1$X3B / hit1$AB, hit1$SLG, col = c3_cols, pch = c3_4_pch)
plot(hit1$X3B / hit1$AB, hit1$HR / hit1$AB, col = c3_cols, pch = c3_4_pch)
plot(jitter(hit1$X3B), jitter(hit1$HR), col = c3_cols, pch = c3_4_pch)

# Colour by global cluster. The palette has to be indexed by the factor;
# passing the bare vector recycles the three colours by row position.
plot(hit1$SLG - hit1$AVG, hit1$HR / hit1$AB, ylab = "HR/AB", col = global_cols, pch = 20)
plot(hit1$AVG, hit1$HR / hit1$AB, ylab = "HR/AB", col = global_cols, pch = 20)

plot(hit1$cluster1, hit1$X2B / hit1$AB, ylab = "X2B/AB")
plot(hit1$cluster1, hit1$X3B / hit1$AB, ylab = "X3B/AB")
plot(hit1$cluster1, hit1$X3B - hit1$HR, ylab = "X3B-HR")
plot(hit1$cluster1, (hit1$X3B - hit1$HR) / hit1$AB, ylab = "X3B-HR / AB")
plot(hit1$cluster1, hit1$HR, ylab = "HR")
plot(hit1$cluster1, hit1$HR / hit1$AB, ylab = "HR/AB")

# "c3_4" in its palette colour, every other point in black.
c3_4_vs_black <- ifelse(hit1$cluster1 == "c3_4", cols2[hit1$cluster1], "black")
plot(jitter(hit1$HR / hit1$AB), jitter(hit1$X3B / hit1$AB), ylab = "X3B/AB",
     col = c3_4_vs_black, pch = 20)
# This call was missing its closing parenthesis, which made the remainder of
# the script unparseable.
plot(jitter(hit1$HR / hit1$AB), jitter((hit1$X2B + hit1$X3B) / hit1$AB), ylab = "X23B/AB",
     col = c3_4_vs_black, pch = 20)



#--------
# granular clustering - global

set.seed(0)

# k-means on the whole data set again, same seven columns, now with 6 centers.
k2_vars <- c("AVG", "OBP", "SLG", "HR", "SO", "BB", "AB")
k2 <- kmeans(hit1[, k2_vars], centers = 6)
k2$cluster <- factor(k2$cluster)
hit1[["cluster2"]] <- k2$cluster

# Cluster sizes, then AB vs SO coloured by the six-way assignment.
plot(hit1$cluster2)
k2_col <- cols2[hit1$cluster2]
plot(hit1$AB, hit1$SO, ylab = "SO", col = k2_col, pch = 20, main = "global clustering - k=6")


# Unlike the result of sub-clustering only one specific cluster, this merely
# splits the data more finely along the factors that matter globally.
# Sub-clustering only one specific cluster uncovers a group that was not
# expected from the global view -- Anomaly group
#--------