[IsolationForest] Anomaly Detection

R 데이터 분석

[IsolationForest] Anomaly Detection

YONG_X 2018. 8. 30. 17:08

# Alternative installation source:
# devtools::install_github("yanyachen/IsolationForest")

# Install IsolationForest from R-Forge only when it is not available yet, so
# that re-running the script does not reinstall the package every time.
if (!requireNamespace("IsolationForest", quietly = TRUE)) {
  install.packages("IsolationForest", repos = "http://R-Forge.R-project.org")
}

library(IsolationForest)

data(stackloss)

# Train an Isolation Forest model on the stackloss data set.
tr <- IsolationTrees(stackloss, rFactor = 0)

# Evaluate the anomaly score of every stackloss row with the trained trees.
as <- AnomalyScore(stackloss, tr)

# Show the anomaly scores.
as$outF

# Scatter plots coloured by score: the red channel is the score and the blue
# channel is its complement, so higher scores appear redder.
plot(stackloss$Air.Flow, stackloss$Water.Temp,
     col = rgb(as$outF, 0, 1 - as$outF), pch = 19, cex = 1.5)

plot(stackloss$Acid.Conc., stackloss$stack.loss,
     col = rgb(as$outF, 0, 1 - as$outF), pch = 19, cex = 1.5)

# Train an Isolation Forest model on mtcars using 10000 trees.
tr <- IsolationTrees(mtcars, ntree = 10000, rFactor = 1)

# Evaluate the anomaly score of every mtcars row.
as <- AnomalyScore(mtcars, tr)

# Show the anomaly scores.
as$outF

# The two highest anomaly scores. TRUE is spelled out because T is an
# ordinary variable that can be reassigned.
sort(as$outF, decreasing = TRUE)[1:2]

# UDF that standardizes an anomaly score to the [0, 1] range (min-max scaling).
#
# x: numeric vector of scores.
# Returns a vector of the same length as x where the smallest non-missing
# value maps to 0 and the largest to 1. Missing values stay missing instead of
# turning the whole result into NA. A constant vector maps to 0 rather than
# NaN (0 / 0), so the result can still be passed to rgb().
stnd <- function(x) {
  # Nothing to scale: empty input or only missing values.
  if (length(x) == 0L || all(is.na(x))) {
    return(x)
  }

  rng <- range(x, na.rm = TRUE)
  span <- rng[2] - rng[1]

  # All non-missing values are equal; x - min is 0 for them and NA for NAs.
  if (isTRUE(span == 0)) {
    return(x - rng[1])
  }

  (x - rng[1]) / span
}

# Rescale the mtcars anomaly scores to [0, 1] so they can be used directly as
# colour intensities.
stndoutF <- stnd(as$outF)

# Scatter plots coloured by standardized score: red channel = score, blue
# channel = 1 - score, so the most anomalous cars are red and the least
# anomalous are blue.
plot(mtcars$carb, mtcars$hp,
     col = rgb(stndoutF, 0, 1 - stndoutF), pch = 19, cex = 1.5)

# The blue channel uses 1 - stndoutF here as well, consistent with the other
# plots in this script.
plot(mtcars$qsec, mtcars$disp,
     col = rgb(stndoutF, 0, 1 - stndoutF), pch = 19, cex = 1.5)

# Check which variables matter most for the anomaly score by modelling the
# score as a function of the original mtcars columns.
library(party)

# Copy of mtcars with the anomaly score attached as the response column.
mtcarsv <- mtcars
mtcarsv$outF <- as$outF

# Conditional inference tree of the score on all other columns.
ctv1 <- ctree(outF ~ ., data = mtcarsv)
plot(ctv1)

# Install randomForest only when it is missing, so re-running the script does
# not reinstall the package every time.
if (!requireNamespace("randomForest", quietly = TRUE)) {
  install.packages("randomForest")
}

library(randomForest)

# Random forest regression of the score; the variable importance plot ranks
# the predictors.
rfv1 <- randomForest(outF ~ ., data = mtcarsv, ntree = 10000)
varImpPlot(rfv1)

#------- mycsv example -------

# Load the example data set from the blog attachment (first row is the header).
mycsv <- read.csv("https://t1.daumcdn.net/cfile/blog/991E7A3359BDDF6837?download",
                  header = TRUE)

# Train an Isolation Forest model on mycsv.
tr <- IsolationTrees(mycsv, ntree = 20000,
                     rFactor = 1, nmin = 3)

# Evaluate the anomaly score of every mycsv row.
as <- AnomalyScore(mycsv, tr)

# Show the distribution of the anomaly scores.
plot(density(as$outF))

# The two highest anomaly scores. TRUE is spelled out because T is an
# ordinary variable that can be reassigned.
sort(as$outF, decreasing = TRUE)[1:2]

# Find the columns that drive the anomaly score: fit a tree and a random
# forest with the score as the response and inspect both.
library(party)

# Working copy of mycsv carrying the score in an extra column named outF.
mycsvv <- mycsv
mycsvv[["outF"]] <- as$outF

# Conditional inference tree of outF against every other column.
ctv1 <- ctree(outF ~ ., data = mycsvv)
plot(ctv1)

# install.packages("randomForest")
library(randomForest)

# Random forest of outF against every other column, followed by its variable
# importance plot.
rfv1 <- randomForest(
  outF ~ .,
  data = mycsvv,
  nodesize = 3,
  ntree = 10000
)
varImpPlot(rfv1)

# Standardize the scores of the current (mycsv) model before colouring.
# Without this, stndoutF would still hold the scores computed earlier for
# mtcars, whose length and meaning do not match the mycsv rows.
stndoutF <- stnd(as$outF)

# Scatter plots coloured by standardized score: red channel = score, blue
# channel = 1 - score, so the most anomalous rows are red.
plot(mycsv$VisITedResources, mycsv$AnnouncementsView,
     col = rgb(stndoutF, 0, 1 - stndoutF), pch = 19, cex = 1.5)

# The blue channel uses 1 - stndoutF here as well, consistent with the other
# plots in this script.
plot(mycsv$raisedhands, mycsv$Discussion,
     col = rgb(stndoutF, 0, 1 - stndoutF), pch = 19, cex = 1.5)

#---- kbo --------------

# Load the KBO hitting table from the blog attachment.
hit0 <- read.csv("https://t1.daumcdn.net/cfile/blog/99B515435B87998B07?download")

# Drop the three columns named below before training. drop = FALSE keeps the
# result a data frame even if only one column remains.
hit0 <- hit0[, !(names(hit0) %in% c("선수명", "순위", "팀명")), drop = FALSE]

# Train an Isolation Forest model on the remaining columns.
tr <- IsolationTrees(hit0, ntree = 20000,
                     rFactor = 0.5, nmin = 3)

# Evaluate the anomaly score of every row.
as <- AnomalyScore(hit0, tr)

# Show the distribution of the anomaly scores.
plot(density(as$outF))

# The two highest anomaly scores. TRUE is spelled out because T is an
# ordinary variable that can be reassigned.
sort(as$outF, decreasing = TRUE)[1:2]

# Standardized scores in [0, 1], used later as colour intensities.
stndoutF <- stnd(as$outF)

# Find the columns that drive the anomaly score: fit a tree and a random
# forest with the score as the response and inspect both.
library(party)

# Working copy of hit0 carrying the score in an extra column named outF.
hit0v <- hit0
hit0v[["outF"]] <- as$outF

# Modelling table without the "선수명" column.
hit0v1 <- hit0v[, names(hit0v) != "선수명"]

# Conditional inference tree of outF against every other column.
ctv1 <- ctree(outF ~ ., data = hit0v1)
plot(ctv1)

# install.packages("randomForest")
library(randomForest)

# Random forest of outF against every other column, followed by its variable
# importance plot.
rfv1 <- randomForest(
  outF ~ .,
  data = hit0v1,
  nodesize = 3,
  ntree = 30000
)
varImpPlot(rfv1)

# Reload the full table (including the columns dropped before training) and
# attach the anomaly score to it.
hit0 <- read.csv("https://t1.daumcdn.net/cfile/blog/99B515435B87998B07?download")
hit0[["outF"]] <- as$outF

# head(hit0[order(hit0$outF, decreasing=T),])

# Values shared by the plots below: point colour (red channel = standardized
# score, blue channel = its complement) and the "선수명" column as text.
score_col <- rgb(stndoutF, 0, 1 - stndoutF)
player_names <- as.character(hit0[["선수명"]])

# Labels shown only for rows whose score exceeds 0.6 / whose TB exceeds 250.
high_score_lab <- ifelse(hit0$outF > 0.6, player_names, "")
high_tb_lab <- ifelse(hit0$TB > 250, player_names, "")

# Two panels side by side.
par(mfrow = c(1, 2))

plot(hit0$X2B, hit0$GO,
     col = score_col, pch = 19, cex = 1.5,
     main = "KBO Hitting (2018-08-16) - Anomaly Detection",
     sub = "red: anomaly")
text(hit0$X2B, hit0$GO, labels = high_score_lab)
grid()

plot(hit0$H, hit0$R,
     col = score_col, pch = 19, cex = 1.5,
     main = "KBO Hitting (2018-08-16) - Anomaly Detection",
     sub = "red: anomaly")
text(hit0$H, hit0$R, labels = high_score_lab)
grid()

# Back to a single panel.
par(mfrow = c(1, 1))

plot(hit0$TB, hit0$RBI, col = score_col, pch = 19, cex = 1.5)
text(hit0$TB, hit0$RBI, labels = high_tb_lab)

plot(hit0$TB, hit0$GO, col = score_col, pch = 19, cex = 1.5)
text(hit0$TB, hit0$GO, labels = high_tb_lab)

# Same TB vs RBI plot again, this time with a dashed lowess curve on top.
plot(hit0$TB, hit0$RBI, col = score_col, pch = 19, cex = 1.5)
text(hit0$TB, hit0$RBI, labels = high_tb_lab)
lines(lowess(hit0$RBI ~ hit0$TB), lty = 2)

# Per-AB ratios; point size is the standardized AB multiplied by 3, and every
# point is labelled.
plot(hit0$TB / hit0$AB, (hit0$RBI + hit0$R) / hit0$AB,
     col = score_col, pch = 19,
     cex = stnd(hit0$AB) * 3)
text(hit0$TB / hit0$AB, (hit0$RBI + hit0$R) / hit0$AB,
     labels = hit0[["선수명"]])

# abline(h=median(hit0$RBI+hit0$R)/hit0$AB, lty=2)

#----- bankmarketing -----------

저작자표시 비영리 변경금지

'R 데이터 분석' 카테고리의 다른 글

[R분석] EDA탐색적분석 base R (mtcars) (0)	2018.09.18
[R분석] Anomaly Detection과 EDA 결합 (IsolationForest) 활용 (0)	2018.09.05
[DSM1809] statistical data analysis using R (0)	2018.08.24
[KDDTprj4] decision tree sample (0)	2018.08.04
[KDBRBD] retail data analysis practice (0)	2018.08.04

현재글[IsolationForest] Anomaly Detection

리비젼 CRM ( revisioncrm )

데이터 분석, 전용준, 머신러닝, 프롬프트, 리비젼, 전용준 빅데이터, GPT, 디지털마케팅, chatGPT, 리비젼컨설팅, CRM, 빅데이터, 프롬프트엔지니어링, R, 인공지능, 챗GPT, AI, 빅 데이터, 데이터 사이언티스트, 데이터분석,

Today :
Yesterday :

일	월	화	수	목	금	토
		1	2	3	4	5
6	7	8	9	10	11	12
13	14	15	16	17	18	19
20	21	22	23	24	25	26
27	28	29	30

내 블로그 - 관리자 홈 전환	`Q` `Q`
새 글 쓰기	`W` `W`

글 수정 (권한 있는 경우)	`E` `E`
댓글 영역으로 이동	`C` `C`

이 페이지의 URL 복사	`S` `S`
맨 위로 이동	`T` `T`
티스토리 홈 이동	`H` `H`
단축키 안내	`Shift` + `/` `⇧` + `/`