[R분석] cluster based anomaly detection

R 데이터 분석

[R분석] cluster based anomaly detection

YONG_X 2016. 8. 11. 17:02

#---------------
# anomaly detection
# Example applying one of several possible approaches: staged (tiered) clustering
# procedure :: [1] cluster globally [2] pick a target cluster
# [3] cluster the target cluster [4] pick the anomaly subcluster

# Read the CSV straight from the blog-post attachment URL.
hit <- read.csv(
  file = "https://t1.daumcdn.net/cfile/blog/230D904A577DCECA34?download"
)

# Remove every space character from the player names so they can be
# matched against space-free identifiers further down.
hit$Player <- gsub(pattern = " ", replacement = "", x = hit$Player)

# Quick look at the column names and the first rows.
names(hit)
head(hit)

# data definitions

# Switch guarding the construction of the working subsets. The rest of the
# script reads hit1 and hit3, so this has to stay TRUE for the script to run.
# Spelled TRUE (not T) because T is an ordinary variable that can be reassigned.
hits <- TRUE

if (isTRUE(hits)) {
  # rows with AB >= 50
  hit1 <- hit[hit$AB >= 50, ]

  # add K-player tag: 1 for the listed players, 0 for everyone else
  cKPlayer <- c("Kim_H", "Lee_D", "Choo_S", "Kang_J", "Park_B", "Choi_J")
  hit1$KPlayer <- 0
  hit1[hit1$Player %in% cKPlayer, "KPlayer"] <- 1

  # stricter AB thresholds, taken from hit1 so they carry the KPlayer tag
  hit2 <- hit1[hit1$AB >= 80, ]
  hit3 <- hit1[hit1$AB >= 120, ]
}

# Fix the RNG state so the k-means partitions are reproducible.
set.seed(0)

# with base plot and clustering
# Global k-means with 3 centers on the raw (unscaled) columns.
k1 <- kmeans(hit1[,c("AVG","OBP","SLG", "HR" , "SO", "BB", "AB")],3)

# Store the labels as a factor so they can index the colour vector and
# drive the factor-based plots below.
k1$cluster <- as.factor(k1$cluster)

cols <- c("red", "green", "blue")

# Scatter plots coloured by global cluster.
plot(hit1$AVG, hit1$SO, col = cols[k1$cluster], pch=20)
plot(hit1$AVG, hit1$OPS, col = cols[k1$cluster], pch=20, main="global clusters - k=3")
plot(hit1$OBP, hit1$SLG, col = cols[k1$cluster], pch=20, main="Clusters")

# Cluster sizes.
table(k1$cluster)

hit1$cluster <- k1$cluster

# Per-cluster profiles: plotting a factor alone gives a bar chart of counts,
# factor vs numeric gives one box plot per cluster.
plot(hit1$cluster)
plot(hit1$cluster, hit1$AB, ylab="AB")
plot(hit1$cluster, hit1$AVG, ylab="AVG")
plot(hit1$cluster, hit1$OBP, ylab="OBP")
plot(hit1$cluster, hit1$SLG, ylab="SLG")
plot(hit1$cluster, hit1$HR, ylab="HR")
plot(hit1$cluster, hit1$HR/hit1$AB, ylab="HR/AB")

# AB vs SLG coloured by global cluster. The annotations below and the
# sub-cluster overlay drawn later are added on top of this plot.
plot(hit1$AB, hit1$SLG, col = cols[k1$cluster], pch=20, main="Clusters")

# Highlight and label selected players. The x coordinate must be AB to match
# the axes of the plot above (using OBP here would place the markers far
# outside the AB range, so they would never be visible).
points(hit3[hit3$Player=="Kim_H",]$AB, hit3[hit3$Player=="Kim_H",]$SLG, cex=2)
text(hit3[hit3$Player=="Kim_H",]$AB, hit3[hit3$Player=="Kim_H",]$SLG, labels=hit3[hit3$Player=="Kim_H",]$Player, pos=1)
text(hit3[hit3$KPlayer==1,]$AB, hit3[hit3$KPlayer==1,]$SLG, labels=hit3[hit3$KPlayer==1,]$Player, pos=1)

# sub clustering
# NOTE(review): the target cluster label "3" is hard-coded. k-means labels are
# arbitrary, so confirm that cluster 3 is still the intended target whenever
# the data or the seed changes.
hit1c1 <- hit1[hit1$cluster=="3",]

# Second-tier k-means with 4 centers, run only on the target cluster's rows.
k1c1 <- kmeans(hit1c1[,c("AVG","OBP","SLG", "HR" , "SO", "BB", "AB")],4)
k1c1$cluster <- as.factor(k1c1$cluster)

# Inside hit1c1 the cluster column now holds the sub-cluster label.
hit1c1$cluster <- k1c1$cluster

cols1 <- c("black", "darkgrey", "lightgrey", "lightblue")

# Overlay the sub-clusters on the currently open plot.
points(hit1c1$AB, hit1c1$SLG, col = cols1[hit1c1$cluster], pch=20)

# Sub-cluster sizes and per-sub-cluster box plots.
plot(hit1c1$cluster)
plot(hit1c1$cluster, hit1c1$AVG, ylab="AVG")
plot(hit1c1$cluster, hit1c1$SLG, ylab="SLG")
plot(hit1c1$cluster, hit1c1$AB, ylab="AB")
plot(hit1c1$cluster, hit1c1$HR, ylab="HR")

# Combined label: global clusters keep their own label, rows of global
# cluster 3 get "c3_<sub-cluster>". hit1c1 was taken from hit1 with the same
# logical mask, so its rows line up with the rows selected here.
hit1$cluster1 <- as.character(hit1$cluster)
hit1$cluster1[hit1$cluster=="3"] <- paste0("c3_", as.character(hit1c1$cluster))
hit1$cluster1 <- as.factor(hit1$cluster1)

# Profiles by combined label.
plot(hit1$cluster1)
plot(hit1$cluster1, hit1$AB, ylab="AB")
plot(hit1$cluster1, hit1$AVG, ylab="AVG")
plot(hit1$cluster1, hit1$OBP, ylab="OBP")
plot(hit1$cluster1, hit1$SLG, ylab="SLG")
plot(hit1$cluster1, hit1$HR, ylab="HR")
plot(hit1$cluster1, hit1$HR/hit1$AB, ylab="HR/AB")
plot(hit1$cluster1, hit1$X3B-hit1$HR, ylab="X3B-HR")
plot(hit1$cluster1, hit1$X2B-hit1$HR, ylab="X2B-HR")
plot(hit1$cluster1, (hit1$X2B+hit1$X3B-hit1$HR)/(hit1$X2B+hit1$X3B+hit1$HR), ylab="X2B+X3B-HR/23H")
plot(hit1$cluster1, (hit1$X3B-hit1$HR)/hit1$AB, ylab="X3B-HR / AB")

# Colour each point by its combined label. The palette has to be indexed by
# the factor; passing the bare 6-colour vector would recycle colours in row
# order and carry no cluster information.
plot(hit1$AVG, hit1$HR/hit1$AB, ylab="HR/AB", col=c(cols[1:2],cols1)[hit1$cluster1], pch=20)

# get key factors from the globe
# library() stops with an error when the package is missing; require() would
# only return FALSE and let the script fail later at the randomForest() call.
library(randomForest)

# Classify the combined cluster label from the batting columns and use the
# variable importance to see which columns separate the groups.
# Each predictor is listed once (the formula previously repeated HR and AB).
fit <- randomForest(cluster1 ~ AB + OBP + HR + SLG + SO + BB + X2B + X3B,
                    data=hit1,
                    importance=TRUE,
                    ntree=1000)

plot(fit)
varImpPlot(fit, main="global clustering - varImp")

# get key factors from a cluster
# Same model restricted to the target cluster's rows; in hit1c1 the cluster
# column holds the second-tier (sub-cluster) label.
fit1 <- randomForest(cluster ~ AB + OBP + HR + SLG + SO + BB + X2B + X3B,
                     data=hit1c1,
                     importance=TRUE,
                     ntree=1000)

plot(fit1)
varImpPlot(fit1, main="2nd tier clustering - varImp")

#----
# initial subcluster profiling
# c(cols[1:2], cols1) is a 6-colour palette indexed by the combined label
# factor; "c3_4" is the sub-cluster singled out in red below.
plot(hit1$cluster1, hit1$SO, ylab="SO")
plot(hit1$cluster1, hit1$SO/hit1$AB, ylab="SO/AB")

plot(hit1$AB, hit1$SO, ylab="SO", col=c(cols[1:2],cols1)[hit1$cluster1], pch=20)
plot(hit1$AB, hit1$X2B+hit1$X3B, ylab="X23B", col=c(cols[1:2],cols1)[hit1$cluster1], pch=20)
plot(hit1$AB, (hit1$X2B+hit1$X3B)/hit1$AB, ylab="X23B/AB", col=c(cols[1:2],cols1)[hit1$cluster1], pch=20)
plot(hit1$HR, hit1$X2B+hit1$X3B, ylab="X23B", col=c(cols[1:2],cols1)[hit1$cluster1], pch=20)
plot(hit1$BB, hit1$SO, ylab="SO", col=c(cols[1:2],cols1)[hit1$cluster1], pch=20)

# Sub-cluster c3_4 in red against everything else in grey.
plot(hit1$BB, hit1$SO, col=ifelse(hit1$cluster1=="c3_4","red", "grey"), pch=20)
plot(hit1$AB, hit1$SO+hit1$BB, col=ifelse(hit1$cluster1=="c3_4","red", "grey"), pch=20)
plot(hit1$AB, hit1$SO+hit1$BB+hit1$HR, col=ifelse(hit1$cluster1=="c3_4","red", "grey"), pch=20)

# Per-AB rate of SO+BB. The sum is parenthesised because "/" binds tighter
# than "+": without the parentheses only BB would be divided by AB.
plot(hit1$HR/hit1$AB, (hit1$SO+hit1$BB)/hit1$AB, col=ifelse(hit1$cluster1=="c3_4","red", "grey"), pch=20)
plot(hit1$AVG, (hit1$SO+hit1$BB)/hit1$AB, col=ifelse(hit1$cluster1=="c3_4","red", "grey"), pch=20)

plot(hit1$BB/hit1$AB, hit1$SO/hit1$AB, ylab="SO/AB", col=c(cols[1:2],cols1)[hit1$cluster1], pch=20)
plot(hit1$BB/hit1$AB, hit1$SO/hit1$AB, col=ifelse(hit1$cluster1=="c3_4","red", "grey"), pch=20)

#-----
# refined subcluster profiling
# Plots alternate between colouring by the global cluster (cols) and by the
# combined global/sub-cluster label (c(cols[1:2], cols1)).
plot(hit1$AB, hit1$SO, ylab="SO", col=cols[hit1$cluster], pch=20, main="global clusters - k=3")
plot(hit1$X2B, hit1$SO, ylab="SO", col=cols[hit1$cluster], pch=20)
plot(hit1$X2B, hit1$SO, ylab="SO", col=c(cols[1:2],cols1)[hit1$cluster1], pch=20)
plot(hit1$X2B, hit1$SO, col=ifelse(hit1$cluster1=="c3_4","red", "grey"), pch=20)

plot(hit1$AB, hit1$SO, ylab="SO", col=c(cols)[hit1$cluster], pch=20)
plot(hit1$AB, hit1$SO, ylab="SO", col=c(cols[1:2],cols1)[hit1$cluster1], pch=20)
plot(hit1$AB, hit1$SO/hit1$AB, ylab="SO/AB", col=c(cols)[hit1$cluster], pch=20)
plot(hit1$AB, hit1$SO/hit1$AB, ylab="SO/AB", col=c(cols[1:2],cols1)[hit1$cluster1], pch=20)

# (SO + HR) per AB, with Kim_H labelled (all other labels are empty strings).
plot(hit1$AB, (hit1$SO+hit1$HR)/hit1$AB, ylab="SOHR/AB", col=c(cols[1:2],cols1)[hit1$cluster1], pch=20)
text(hit1$AB, (hit1$SO+hit1$HR)/hit1$AB, labels=ifelse(hit1$Player=="Kim_H", hit1$Player, ""))

# (SO + BB + HR) per AB. The title is kept on a single line so that no
# newline characters end up inside the string literal.
plot(hit1$AB, (hit1$SO+hit1$BB+hit1$HR)/hit1$AB, ylab="SOBBHR/AB", col=c(cols[1:2],cols1)[hit1$cluster1], pch=20, main="tiered clustering - k=2 + k=4")
text(hit1$AB, (hit1$SO+hit1$BB+hit1$HR)/hit1$AB, labels=ifelse(hit1$Player=="Kim_H", hit1$Player, ""))
plot(hit1$AB, (hit1$SO+hit1$BB+hit1$HR)/hit1$AB, col=ifelse(hit1$cluster1=="c3_4","red", "grey"), pch=20, main="tiered clustering - k=2 + k=4")

# A group that lacks not only SO but also BB and HR
plot(hit1$AB, (hit1$X2B-hit1$SO)/hit1$AB, ylab="X2B-SO/AB", col=c(cols[1:2],cols1)[hit1$cluster1], pch=20)
plot(hit1$X2B, hit1$SO, col=c(cols[1:2],cols1)[hit1$cluster1], pch=20)
plot(hit1$X2B, hit1$SO, col=ifelse(hit1$cluster1=="c3_4","red", "grey"), pch=20)
plot(hit1$AB, hit1$BB/hit1$SO, col=c(cols[1:2],cols1)[hit1$cluster1], pch=20)
plot(hit1$X2B, hit1$SO, col=ifelse(hit1$cluster1=="c3_4","red", "grey"), pch=20)

#-------
# more subcluster profiling

# Palette for the combined label: first two global colours followed by the
# four sub-cluster colours, indexed by the hit1$cluster1 factor.
cols2 <- c(cols[1:2],cols1)

# SLG vs HR/AB: global colouring, then the same view with only sub-cluster
# c3_4 keeping its colour inside global cluster 3 (the rest of it is grey).
plot(hit1$SLG, hit1$HR/hit1$AB, ylab="HR/AB", col=cols[hit1$cluster], pch=20)
plot(hit1$SLG, hit1$HR/hit1$AB, ylab="HR/AB", col=ifelse(hit1$cluster!="3", cols2[hit1$cluster1], ifelse(hit1$cluster1=="c3_4",cols2[hit1$cluster1], "grey")), pch=20)
plot(hit1$SLG, hit1$HR/hit1$AB, ylab="HR/AB", col=cols[hit1$cluster], pch=20)
plot(hit1$SLG, hit1$HR/hit1$AB, ylab="HR/AB", col=ifelse(hit1$cluster!="3", cols2[hit1$cluster1], ifelse(hit1$cluster1=="c3_4",cols2[hit1$cluster1], "grey")), pch=20)
plot(hit1$SLG, hit1$HR/hit1$AB, ylab="HR/AB", col=ifelse(hit1$cluster1=="c3_4","red", "grey"), pch=20)
plot(hit1$SLG-hit1$AVG, hit1$HR/hit1$AB, ylab="HR/AB", col=ifelse(hit1$cluster1=="c3_4","red", "grey"), pch=20)

# Global cluster 3 in red; the overlay draws c3_4 as filled dots (pch 20) and
# every other row as open circles (pch 1).
plot((hit1$X2B+hit1$X3B)/hit1$AB, hit1$HR/hit1$AB, ylab="HR/AB", col=ifelse(hit1$cluster=="3","red", "grey"), pch=20)
points((hit1$X2B+hit1$X3B)/hit1$AB, hit1$HR/hit1$AB, col="red", pch=ifelse(hit1$cluster1=="c3_4",20,1))

plot(hit1$cluster1, hit1$HR/(hit1$X2B+hit1$X3B+hit1$HR), ylab="HR/X2+B")

# In the plots below colour marks global cluster 3 and the plotting symbol
# marks sub-cluster c3_4 (pch 20) versus all other rows (pch 21).
plot(hit1$X2B+hit1$X3B, hit1$HR, ylab="HR", col=ifelse(hit1$cluster=="3","red", "grey"), pch=20)
plot(hit1$X2B+hit1$X3B, hit1$HR, col=ifelse(hit1$cluster=="3","red", "grey"), pch=ifelse(hit1$cluster1=="c3_4",20,21))
plot(hit1$HR/(hit1$X2B+hit1$X3B+hit1$HR), hit1$HR/hit1$AB, col=ifelse(hit1$cluster=="3","red", "grey"), pch=ifelse(hit1$cluster1=="c3_4",20,21))
plot(hit1$HR/(hit1$X2B+hit1$X3B+hit1$HR), hit1$SLG, col=ifelse(hit1$cluster=="3","red", "grey"), pch=ifelse(hit1$cluster1=="c3_4",20,21))
plot(hit1$HR/hit1$AB, hit1$SLG, col=ifelse(hit1$cluster=="3","red", "grey"), pch=ifelse(hit1$cluster1=="c3_4",20,21))
plot(hit1$X2B/hit1$AB, hit1$SLG, col=ifelse(hit1$cluster=="3","red", "grey"), pch=ifelse(hit1$cluster1=="c3_4",20,21))
plot(hit1$X3B/hit1$AB, hit1$SLG, col=ifelse(hit1$cluster=="3","red", "grey"), pch=ifelse(hit1$cluster1=="c3_4",20,21))
plot(hit1$X3B/hit1$AB, hit1$HR/hit1$AB, col=ifelse(hit1$cluster=="3","red", "grey"), pch=ifelse(hit1$cluster1=="c3_4",20,21))

# jitter() adds small random noise so coincident points do not overplot.
plot(jitter(hit1$X3B), jitter(hit1$HR), col=ifelse(hit1$cluster=="3","red", "grey"), pch=ifelse(hit1$cluster1=="c3_4",20,21))

# Colour by global cluster. The palette is indexed by the cluster factor;
# a bare col=cols would recycle the three colours in row order.
plot(hit1$SLG-hit1$AVG, hit1$HR/hit1$AB, ylab="HR/AB", col=cols[hit1$cluster], pch=20)
plot(hit1$AVG, hit1$HR/hit1$AB, ylab="HR/AB", col=cols[hit1$cluster], pch=20)

# Box plots by combined label.
plot(hit1$cluster1, hit1$X2B/hit1$AB, ylab="X2B/AB")
plot(hit1$cluster1, hit1$X3B/hit1$AB, ylab="X3B/AB")
plot(hit1$cluster1, hit1$X3B-hit1$HR, ylab="X3B-HR")
plot(hit1$cluster1, (hit1$X3B-hit1$HR)/hit1$AB, ylab="X3B-HR / AB")
plot(hit1$cluster1, hit1$HR, ylab="HR")
plot(hit1$cluster1, hit1$HR/hit1$AB, ylab="HR/AB")

# c3_4 in its palette colour, everything else black.
plot(jitter(hit1$HR/hit1$AB), jitter(hit1$X3B/hit1$AB), ylab="X3B/AB", col=ifelse(hit1$cluster1=="c3_4",cols2[hit1$cluster1], "black"), pch=20)

# The closing parenthesis of plot() was missing here, which made this
# statement and everything after it fail to parse.
plot(jitter(hit1$HR/hit1$AB), jitter((hit1$X2B+hit1$X3B)/hit1$AB), ylab="X23B/AB", col=ifelse(hit1$cluster1=="c3_4",cols2[hit1$cluster1], "black"), pch=20)

#--------
# granular clustering - global

# Reset the RNG state before the comparison run.
set.seed(0)

# with base plot and clustering
# One-shot global k-means with 6 centers on the same columns as before.
k2 <- kmeans(
  x = hit1[, c("AVG", "OBP", "SLG", "HR", "SO", "BB", "AB")],
  centers = 6
)

# Keep the labels as a factor, both on the fit object and on the data frame.
k2$cluster <- factor(k2$cluster)
hit1[["cluster2"]] <- k2$cluster

# Cluster sizes.
plot(hit1$cluster2)

# AB vs SO coloured by the six global clusters (cols2 is the 6-colour palette).
plot(
  hit1$AB, hit1$SO,
  main = "global clustering - k=6",
  ylab = "SO",
  pch = 20,
  col = cols2[hit1$cluster2]
)

# Unlike sub-clustering a single target cluster, raising k globally only
# splits the data further along the factors that matter for the whole set.
# Sub-clustering only the target cluster uncovers a group that the global
# view did not suggest -- Anomaly group
#--------

저작자표시 비영리 변경금지 (새창열림)

'R 데이터 분석' 카테고리의 다른 글

[SCW.VEDAR] Part 2 (0)	2016.08.12
[SCW.VEDAR] Part 1 (0)	2016.08.12
[SCWHO] 시각적 데이터 분석 EDA 예제 MLB Hitting 2016mid (0)	2016.07.05
[R분석] 플롯에서 X축변경 예제 reassign x axis value in R plot (0)	2016.06.16
R 연관성 규칙 생성 연습 [Association Rule Discovery in R] (0)	2016.03.03

현재글[R분석] cluster based anomaly detection

리비젼 CRM ( revisioncrm )

인공지능, 리비젼, 전용준 빅데이터, R, AI, 챗GPT, CRM, 데이터 사이언티스트, 프롬프트엔지니어링, 데이터 분석, GPT, 프롬프트, 빅데이터, chatGPT, 빅 데이터, 전용준, 디지털마케팅, 데이터분석, 리비젼컨설팅, 머신러닝,

Today :
Yesterday :

일	월	화	수	목	금	토
					1	2
3	4	5	6	7	8	9
10	11	12	13	14	15	16
17	18	19	20	21	22	23
24	25	26	27	28	29	30
31