>RE::VISION CRM

R 데이터 분석

[IsolationForest] Anomaly Detection

YONG_X 2018. 8. 30. 17:08




# Install IsolationForest from R-Forge only when it is missing, so that
# re-running the script does not trigger a fresh download and install each time.
# Alternative source: devtools::install_github("yanyachen/IsolationForest")
if (!requireNamespace("IsolationForest", quietly = TRUE)) {
  install.packages("IsolationForest", repos = "http://R-Forge.R-project.org")
}

library(IsolationForest)


# ---- stackloss: first Isolation Forest walk-through ----
# Make the built-in stackloss data frame available.
data(stackloss)

# Fit isolation trees to the stackloss data frame.
tr <- IsolationTrees(stackloss, rFactor = 0)

# Score the same rows against the fitted trees.
as <- AnomalyScore(stackloss, tr)

# Print the per-row anomaly scores.
as$outF

# Scatter plots coloured by score: the score feeds the red channel and its
# complement feeds the blue channel.
plot(
  x = stackloss$Air.Flow,
  y = stackloss$Water.Temp,
  col = rgb(red = as$outF, green = 0, blue = 1 - as$outF),
  pch = 19,
  cex = 1.5
)

plot(
  x = stackloss$Acid.Conc.,
  y = stackloss$stack.loss,
  col = rgb(red = as$outF, green = 0, blue = 1 - as$outF),
  pch = 19,
  cex = 1.5
)




# ---- mtcars: Isolation Forest with a larger forest ----
# Train an Isolation Forest model on mtcars (10,000 trees, rFactor = 1).
tr <- IsolationTrees(mtcars, ntree = 10000, rFactor = 1)

# Evaluate the anomaly score of every row.
as <- AnomalyScore(mtcars, tr)

# Show the anomaly scores.
as$outF

# The two highest scores. `TRUE` is spelled out because `T` is an ordinary
# variable that can be reassigned.
sort(as$outF, decreasing = TRUE)[1:2]



# UDF: min-max standardise an anomaly score vector onto [0, 1].
#
# Args:
#   x:     numeric vector of scores.
#   na.rm: if TRUE (default), missing values are ignored when computing the
#          range and remain NA in the result; if FALSE, any missing value
#          makes the whole result NA.
#
# Returns:
#   A numeric vector of the same length as `x`. When all non-missing values
#   are equal the range is zero, and zeros are returned instead of NaN.
stnd <- function(x, na.rm = TRUE) {
  # Empty or all-missing input has no range to scale by.
  if (all(is.na(x))) {
    return(rep(NA_real_, length(x)))
  }

  rng <- range(x, na.rm = na.rm)
  span <- rng[2] - rng[1]

  # na.rm = FALSE with missing values present: the range is unknown.
  if (is.na(span)) {
    return(rep(NA_real_, length(x)))
  }

  # Constant input: avoid 0 / 0 and return zeros (NA positions stay NA).
  if (span == 0) {
    out <- x - rng[1]
    storage.mode(out) <- "double"
    return(out)
  }

  (x - rng[1]) / span
}

# Rescale the mtcars anomaly scores with stnd() so they can be used as
# colour channels.
stndoutF <- stnd(as$outF)

# Score feeds the red channel and its complement feeds the blue channel.
plot(mtcars$carb, mtcars$hp,
  col = rgb(stndoutF, 0, 1 - stndoutF), pch = 19, cex = 1.5
)

# The blue channel uses 1 - stndoutF here as well, so this plot follows the
# same colour convention as the other score plots in the script.
plot(mtcars$qsec, mtcars$disp,
  col = rgb(stndoutF, 0, 1 - stndoutF), pch = 19, cex = 1.5
)



# Check which mtcars variables matter most for the anomaly score by
# modelling the score as a function of the original columns.
library(party)

# Working copy of mtcars with the anomaly score attached as the response.
mtcarsv <- mtcars
mtcarsv$outF <- as$outF

# Conditional inference tree on the score.
ctv1 <- ctree(outF ~ ., data = mtcarsv)
plot(ctv1)

# Install randomForest only when it is missing, rather than reinstalling it
# on every run of the script.
if (!requireNamespace("randomForest", quietly = TRUE)) {
  install.packages("randomForest")
}
library(randomForest)

# Random forest regression on the score, followed by the importance plot.
rfv1 <- randomForest(outF ~ ., data = mtcarsv, ntree = 10000)
varImpPlot(rfv1)



#------- mycsv example -------

# Read the example CSV. stringsAsFactors = TRUE is set explicitly because the
# read.csv default changed to FALSE in R 4.0.0; any text columns are then
# loaded as factors, as they were under the older default.
mycsv <- read.csv(
  "https://t1.daumcdn.net/cfile/blog/991E7A3359BDDF6837?download",
  header = TRUE,
  stringsAsFactors = TRUE
)

# Train an Isolation Forest model.
tr <- IsolationTrees(mycsv, ntree = 20000, rFactor = 1, nmin = 3)

# Evaluate the anomaly score of every row.
as <- AnomalyScore(mycsv, tr)

# Show the distribution of the anomaly scores.
plot(density(as$outF))

# The two highest scores (`TRUE` spelled out; `T` can be reassigned).
sort(as$outF, decreasing = TRUE)[1:2]



# Which mycsv columns explain the anomaly score?
library(party)

# Copy of the data with the score appended as the response column.
mycsvv <- mycsv
mycsvv$outF <- as$outF

# Conditional inference tree fitted on the score.
ctv1 <- ctree(outF ~ ., data = mycsvv)
plot(ctv1)

# install.packages("randomForest")
library(randomForest)

# Random forest regression on the score, then the importance plot.
rfv1 <- randomForest(
  outF ~ .,
  data = mycsvv,
  ntree = 10000,
  nodesize = 3
)
varImpPlot(rfv1)


   


# Recompute the standardised score from the current `as` (the mycsv scores).
# Without this, stndoutF still holds the values derived from the mtcars run
# earlier in the script, so the colours would not correspond to mycsv rows.
stndoutF <- stnd(as$outF)

# Score feeds the red channel and its complement feeds the blue channel.
plot(mycsv$VisITedResources, mycsv$AnnouncementsView,
  col = rgb(stndoutF, 0, 1 - stndoutF), pch = 19, cex = 1.5
)

# Same colour mapping as the plot above (blue channel is 1 - stndoutF).
plot(mycsv$raisedhands, mycsv$Discussion,
  col = rgb(stndoutF, 0, 1 - stndoutF), pch = 19, cex = 1.5
)




#---- kbo --------------

# Read the KBO hitting table. stringsAsFactors = TRUE is set explicitly
# because the read.csv default changed to FALSE in R 4.0.0.
hit0 <- read.csv(
  "https://t1.daumcdn.net/cfile/blog/99B515435B87998B07?download",
  stringsAsFactors = TRUE
)

# Drop the player-name, rank and team-name columns before modelling.
hit0 <- hit0[, !(names(hit0) %in% c("선수명", "순위", "팀명"))]

# Train an Isolation Forest model.
tr <- IsolationTrees(hit0, ntree = 20000, rFactor = 0.5, nmin = 3)

# Evaluate the anomaly score of every row.
as <- AnomalyScore(hit0, tr)

# Show the distribution of the anomaly scores.
plot(density(as$outF))

# The two highest scores (`TRUE` spelled out; `T` can be reassigned).
sort(as$outF, decreasing = TRUE)[1:2]

# Standardised scores, used later as colour channels.
stndoutF <- stnd(as$outF)




# Which hitting statistics explain the anomaly score?
library(party)

# Copy of the modelling table with the score appended as the response.
hit0v <- hit0
hit0v$outF <- as$outF

# Exclude the player-name column from the predictors.
keep_cols <- names(hit0v) != "선수명"
hit0v1 <- hit0v[, keep_cols]

# Conditional inference tree fitted on the score.
ctv1 <- ctree(outF ~ ., data = hit0v1)
plot(ctv1)

# install.packages("randomForest")
library(randomForest)

# Random forest regression on the score, then the importance plot.
rfv1 <- randomForest(
  outF ~ .,
  data = hit0v1,
  ntree = 30000,
  nodesize = 3
)
varImpPlot(rfv1)


   


# Re-read the full table: the working copy used for modelling had the
# player-name column removed, and the names are needed for plot labels.
hit0 <- read.csv("https://t1.daumcdn.net/cfile/blog/99B515435B87998B07?download")

# Attach the anomaly scores computed above to the full table.
hit0$outF <- as$outF

# head(hit0[order(hit0$outF, decreasing=T),])

# Player names as plain character strings, reused by every text() call.
player <- as.character(hit0[["선수명"]])

# Point colour: score in the red channel, its complement in the blue channel.
score_col <- rgb(stndoutF, 0, 1 - stndoutF)

# Two panels side by side. The previous layout is saved and restored
# afterwards instead of being forced back to a fixed 1 x 1 grid.
old_par <- par(mfrow = c(1, 2))

plot(hit0$X2B, hit0$GO,
  col = score_col, pch = 19, cex = 1.5,
  main = "KBO Hitting (2018-08-16) - Anomaly Detection",
  sub = "red: anomaly"
)
# Label only the players whose raw score exceeds 0.6.
text(hit0$X2B, hit0$GO, labels = ifelse(hit0$outF > 0.6, player, ""))
grid()

plot(hit0$H, hit0$R,
  col = score_col, pch = 19, cex = 1.5,
  main = "KBO Hitting (2018-08-16) - Anomaly Detection",
  sub = "red: anomaly"
)
text(hit0$H, hit0$R, labels = ifelse(hit0$outF > 0.6, player, ""))
grid()

par(old_par)

# TB vs RBI, labelling players with TB above 250.
plot(hit0$TB, hit0$RBI, col = score_col, pch = 19, cex = 1.5)
text(hit0$TB, hit0$RBI, labels = ifelse(hit0$TB > 250, player, ""))

# TB vs GO, same labelling rule.
plot(hit0$TB, hit0$GO, col = score_col, pch = 19, cex = 1.5)
text(hit0$TB, hit0$GO, labels = ifelse(hit0$TB > 250, player, ""))

# TB vs RBI again, this time with a lowess trend line overlaid.
plot(hit0$TB, hit0$RBI, col = score_col, pch = 19, cex = 1.5)
text(hit0$TB, hit0$RBI, labels = ifelse(hit0$TB > 250, player, ""))
lines(lowess(hit0$RBI ~ hit0$TB), lty = 2)

# Per-AB rates; point size scales with AB.
# NOTE(review): stnd() maps the smallest AB to 0, so that point gets cex = 0.
plot(hit0$TB / hit0$AB, (hit0$RBI + hit0$R) / hit0$AB,
  col = score_col, pch = 19,
  cex = stnd(hit0$AB) * 3
)
text(hit0$TB / hit0$AB, (hit0$RBI + hit0$R) / hit0$AB, labels = player)

# abline(h=median(hit0$RBI+hit0$R)/hit0$AB, lty=2)




#----- bankmarketing -----------