# Stackloss example: isolation forest on the built-in `stackloss` data set.
# Alternative source: devtools::install_github("yanyachen/IsolationForest")
# Install IsolationForest only when it is missing, so re-running the script
# does not reinstall the package every time.
if (!requireNamespace("IsolationForest", quietly = TRUE)) {
  install.packages("IsolationForest", repos = "http://R-Forge.R-project.org")
}
library(IsolationForest)
data(stackloss)

# Train an isolation forest model.
tr <- IsolationTrees(stackloss, rFactor = 0)
# Evaluate the anomaly score of every row.
as <- AnomalyScore(stackloss, tr)
# Show the anomaly scores.
as$outF

# Colour each point by its score: red channel = score, blue channel = 1 - score.
# NOTE(review): rgb() needs values in [0, 1]; assumes outF is already in that
# range -- confirm against the package documentation.
plot(stackloss$Air.Flow, stackloss$Water.Temp,
     col = rgb(as$outF, 0, 1 - as$outF), pch = 19, cex = 1.5)
plot(stackloss$Acid.Conc., stackloss$stack.loss,
     col = rgb(as$outF, 0, 1 - as$outF), pch = 19, cex = 1.5)
# mtcars example: train an isolation forest model.
tr <- IsolationTrees(mtcars, ntree = 10000, rFactor = 1)
# Evaluate the anomaly score of every row.
as <- AnomalyScore(mtcars, tr)
# Show all anomaly scores, then the two largest.
as$outF
# Spell out TRUE: `T` is an ordinary variable that can be reassigned.
sort(as$outF, decreasing = TRUE)[1:2]
# Min-max standardise a numeric vector to the [0, 1] range.
#
# Args:
#   x: numeric vector (e.g. anomaly scores).
# Returns:
#   A vector the same length as `x` holding (x - min(x)) / (max(x) - min(x)).
#   When every value of `x` is identical the range is zero; zeros are returned
#   instead of the NaN that 0 / 0 would produce, so the result stays usable
#   as an rgb() colour channel.
stnd <- function(x) {
  lo <- min(x)
  span <- max(x) - lo
  shifted <- x - lo
  # Zero range: `shifted` is already all zeros, so skip the 0 / 0 division.
  if (length(x) > 0 && !is.na(span) && span == 0) {
    return(shifted)
  }
  shifted / span
}
# Rescale the mtcars anomaly scores to [0, 1] so they can drive rgb() channels.
stndoutF <- stnd(as$outF)
plot(mtcars$carb, mtcars$hp,
     col = rgb(stndoutF, 0, 1 - stndoutF), pch = 19, cex = 1.5)
# NOTE(review): the blue channel below is stndoutF, not 1 - stndoutF as in the
# other plots -- confirm whether this colour scheme is intended.
plot(mtcars$qsec, mtcars$disp,
     col = rgb(stndoutF, 0, stndoutF), pch = 19, cex = 1.5)

# Check what matters most: model the anomaly score from the original columns.
library(party)
mtcarsv <- mtcars
mtcarsv$outF <- as$outF
ctv1 <- ctree(outF ~ ., data = mtcarsv)
plot(ctv1)

# Install randomForest only when it is missing, so re-running the script does
# not reinstall the package every time.
if (!requireNamespace("randomForest", quietly = TRUE)) {
  install.packages("randomForest")
}
library(randomForest)
rfv1 <- randomForest(outF ~ ., data = mtcarsv, ntree = 10000)
varImpPlot(rfv1)
#------- mycsv example -------
# R >= 4.0 no longer converts strings to factors by default; request it
# explicitly so any text columns are passed to the model functions as factors.
# NOTE(review): assumes the CSV may contain text columns -- harmless if not.
mycsv <- read.csv("https://t1.daumcdn.net/cfile/blog/991E7A3359BDDF6837?download",
                  header = TRUE, stringsAsFactors = TRUE)

# Train an isolation forest model.
tr <- IsolationTrees(mycsv, ntree = 20000,
                     rFactor = 1, nmin = 3)
# Evaluate the anomaly score of every row.
as <- AnomalyScore(mycsv, tr)
# Show the score distribution and the two largest scores.
plot(density(as$outF))
sort(as$outF, decreasing = TRUE)[1:2]
# Rescale THIS data set's scores for colouring; without this the plots below
# would reuse the standardised scores left over from an earlier data set.
stndoutF <- stnd(as$outF)

# Check what matters most: model the anomaly score from the original columns.
library(party)
mycsvv <- mycsv
mycsvv$outF <- as$outF
ctv1 <- ctree(outF ~ ., data = mycsvv)
plot(ctv1)

# install.packages("randomForest")
library(randomForest)
rfv1 <- randomForest(outF ~ ., data = mycsvv,
                     nodesize = 3, ntree = 10000)
varImpPlot(rfv1)

plot(mycsv$VisITedResources, mycsv$AnnouncementsView,
     col = rgb(stndoutF, 0, 1 - stndoutF), pch = 19, cex = 1.5)
# NOTE(review): the blue channel below is stndoutF, not 1 - stndoutF as in the
# plot above -- confirm whether this colour scheme is intended.
plot(mycsv$raisedhands, mycsv$Discussion,
     col = rgb(stndoutF, 0, stndoutF), pch = 19, cex = 1.5)
#---- kbo --------------
hit0 <- read.csv("https://t1.daumcdn.net/cfile/blog/99B515435B87998B07?download")
# Drop the identifier columns (player name, rank, team name) before modelling.
hit0 <- hit0[, !(names(hit0) %in% c("선수명", "순위", "팀명"))]

# Train an isolation forest model.
tr <- IsolationTrees(hit0, ntree = 20000,
                     rFactor = 0.5, nmin = 3)
# Evaluate the anomaly score of every row.
as <- AnomalyScore(hit0, tr)
# Show the score distribution and the two largest scores.
plot(density(as$outF))
sort(as$outF, decreasing = TRUE)[1:2]
# Rescale the scores to [0, 1] for use as colour channels.
stndoutF <- stnd(as$outF)

# Check what matters most: model the anomaly score from the original columns.
library(party)
hit0v <- hit0
hit0v$outF <- as$outF
# The player-name column was already removed from hit0 above, so no further
# column filtering is needed here.
hit0v1 <- hit0v
ctv1 <- ctree(outF ~ ., data = hit0v1)
plot(ctv1)

# install.packages("randomForest")
library(randomForest)
rfv1 <- randomForest(outF ~ ., data = hit0v1,
                     nodesize = 3, ntree = 30000)
varImpPlot(rfv1)
# Reload the unfiltered table so the player-name column is available again,
# then attach the anomaly scores computed earlier.
hit0 <- read.csv("https://t1.daumcdn.net/cfile/blog/99B515435B87998B07?download")
hit0$outF <- as$outF
# head(hit0[order(hit0$outF, decreasing=T),])

# Inputs shared by every plot below: point colour (red channel = standardised
# score, blue channel = 1 - score) and the two label sets.
score_col <- rgb(stndoutF, 0, 1 - stndoutF)
player <- as.character(hit0$'선수명')
label_high_score <- ifelse(hit0$outF > 0.6, player, "")
label_high_tb <- ifelse(hit0$TB > 250, player, "")

# Two panels side by side; players with a score above 0.6 are labelled.
par(mfrow = c(1, 2))
plot(hit0$X2B, hit0$GO,
     col = score_col, pch = 19, cex = 1.5,
     main = "KBO Hitting (2018-08-16) - Anomaly Detection",
     sub = "red: anomaly")
text(hit0$X2B, hit0$GO, labels = label_high_score)
grid()
plot(hit0$H, hit0$R,
     col = score_col, pch = 19, cex = 1.5,
     main = "KBO Hitting (2018-08-16) - Anomaly Detection",
     sub = "red: anomaly")
text(hit0$H, hit0$R, labels = label_high_score)
grid()
par(mfrow = c(1, 1))

# Single-panel plots; players with TB above 250 are labelled.
plot(hit0$TB, hit0$RBI,
     col = score_col, pch = 19, cex = 1.5)
text(hit0$TB, hit0$RBI, labels = label_high_tb)
plot(hit0$TB, hit0$GO,
     col = score_col, pch = 19, cex = 1.5)
text(hit0$TB, hit0$GO, labels = label_high_tb)
# Same TB vs RBI plot again, this time with a dashed lowess trend line.
plot(hit0$TB, hit0$RBI,
     col = score_col, pch = 19, cex = 1.5)
text(hit0$TB, hit0$RBI, labels = label_high_tb)
lines(lowess(hit0$RBI ~ hit0$TB), lty = 2)

# Per-AB rates; point size is the standardised AB times 3, every player labelled.
plot(hit0$TB / hit0$AB, (hit0$RBI + hit0$R) / hit0$AB,
     col = score_col, pch = 19,
     cex = stnd(hit0$AB) * 3)
text(hit0$TB / hit0$AB, (hit0$RBI + hit0$R) / hit0$AB,
     labels = hit0$'선수명')
# abline(h=median(hit0$RBI+hit0$R)/hit0$AB, lty=2)
#----- bankmarketing -----------
# --- Blog footer captured with the script (not R code; commented out so the file parses) ---
# 'R 데이터 분석' 카테고리의 다른 글
# [R분석] EDA탐색적분석 base R (mtcars) (0) | 2018.09.18 |
# ---|---|
# [R분석] Anomaly Detection과 EDA 결합 (IsolationForest) 활용 (0) | 2018.09.05 |
# [DSM1809] statistical data analysis using R (0) | 2018.08.24 |
# [KDDTprj4] decision tree sample (0) | 2018.08.04 |
# [KDBRBD] retail data analysis practice (0) | 2018.08.04 |