# randomForest와 ranger 속도 차이 비교
# randomForest의 가장 큰 약점 중 하나인 모델 생성 속도 문제에 대한 대책으로 ranger 테스트
# ranger :: a fast implementation of Random Forest (Breiman 2001) or recursive partitioning, particularly suited for high dimensional data
# Load the bank sample data and enlarge it so that the timing gap between the
# two random-forest implementations becomes measurable.
# stringsAsFactors = TRUE is spelled out because R >= 4.0 no longer converts
# text columns to factors by default; if `y` or a predictor is text it would
# otherwise stay character, which randomForest() does not accept as a
# classification target.
bnk05 <- read.csv(
  "https://t1.daumcdn.net/cfile/blog/99E6173359C44CE507?download",
  stringsAsFactors = TRUE
)
# Stack 16 copies of the rows (same result as doubling the data four times).
bnk06 <- do.call(rbind, replicate(16L, bnk05, simplify = FALSE))
nrow(bnk06) # row count of the enlarged benchmark data
# Attach both packages. library() stops with an error when a package is
# missing, whereas require() only returns FALSE and lets the script fail later.
library(ranger)
library(randomForest)

# Time the ranger fit (1000 trees, impurity-based variable importance).
start.time <- Sys.time()
rng1 <- ranger(y ~ ., data = bnk06, num.trees = 1000, importance = "impurity")
end.time <- Sys.time()
ranger.learningtime <- end.time - start.time
# sort(rng1$variable.importance, decreasing = TRUE)

# Time the randomForest fit with the same number of trees.
start.time <- Sys.time()
rf1 <- randomForest(y ~ ., data = bnk06, ntree = 1000, importance = TRUE)
end.time <- Sys.time()
rf.learningtime <- end.time - start.time

# Subtracting two Sys.time() values yields a difftime whose unit (secs, mins,
# ...) is picked automatically from the magnitude. Convert both durations to
# seconds explicitly instead of assuming one is in seconds and the other in
# minutes.
learning.secs <- c(
  ranger = as.numeric(ranger.learningtime, units = "secs"),
  rf = as.numeric(rf.learningtime, units = "secs")
)
barplot(
  learning.secs,
  names.arg = c("ranger", "rf"),
  main = "learning time"
)
learning.secs # learning times in seconds
# Time 50 rounds of scoring the full data set with the ranger model.
# predict() for ranger objects receives the rows to score through `data`; it
# has no `newdata` argument, so `newdata =` would leave `data` unset and the
# supplied rows would not be scored as intended.
start.time <- Sys.time()
for (i in seq_len(50L)) {
  pr <- predict(rng1, data = bnk06)
  print(i) # progress indicator
}
end.time <- Sys.time()
ranger.predtime <- end.time - start.time

# Same 50 rounds with the randomForest model (its predict() uses `newdata`).
start.time <- Sys.time()
for (i in seq_len(50L)) {
  pr <- predict(rf1, newdata = bnk06)
  print(i) # progress indicator
}
end.time <- Sys.time()
rf.predtime <- end.time - start.time

# The two difftime values may carry different automatic units (e.g. secs vs.
# mins), so convert both to seconds before plotting them on one axis.
pred.secs <- c(
  ranger = as.numeric(ranger.predtime, units = "secs"),
  rf = as.numeric(rf.predtime, units = "secs")
)
barplot(
  pred.secs,
  names.arg = c("ranger", "rf"),
  main = "prediction time"
)
pred.secs # prediction times in seconds
# Convert every measured duration to plain seconds, rounded to 2 decimals, so
# that all four values share one unit.
ranger.learningtime <- round(as.numeric(ranger.learningtime, units = "secs"), 2)
rf.learningtime <- round(as.numeric(rf.learningtime, units = "secs"), 2)
ranger.predtime <- round(as.numeric(ranger.predtime, units = "secs"), 2)
rf.predtime <- round(as.numeric(rf.predtime, units = "secs"), 2)

# Data dimensions for the plot titles are taken from bnk06 itself rather than
# hard-coded. The column count includes the response `y`. The prediction
# benchmark scores the whole data set 50 times, hence 50 * rows.
n.rows <- nrow(bnk06)
n.vars <- ncol(bnk06)
learn.rows <- format(n.rows, big.mark = ",", scientific = FALSE)
pred.rows <- format(50 * n.rows, big.mark = ",", scientific = FALSE)

# Draw both comparisons side by side. par() returns the previous settings,
# which are restored afterwards instead of forcing a 1 x 1 layout.
old.par <- par(mfrow = c(1, 2))
barplot(
  c(ranger.learningtime, rf.learningtime),
  names.arg = c("ranger", "rf"),
  main = sprintf("learning time (%s rows X %d vars)", learn.rows, n.vars)
)
barplot(
  c(ranger.predtime, rf.predtime),
  names.arg = c("ranger", "rf"),
  main = sprintf("prediction time (%s rows X %d vars)", pred.rows, n.vars)
)
par(old.par)

# comparison of times (unit: seconds)
paste("ranger.learningtime : ", ranger.learningtime)
paste("rf.learningtime : ", rf.learningtime)
paste("ranger.predtime : ", ranger.predtime)
paste("rf.predtime : ", rf.predtime)
# "ranger.learningtime : 24.3938548564911"
# "rf.learningtime : 304.569322109222"
# "ranger.predtime : 0.177471876144409"
# "rf.predtime : 177.684410095215"
# rf.predtime/ranger.predtime == 1001.198
# 모델 빌딩에서 10배 이상, prediction에서도 1000배 가량 ranger가 빠름
#
# 'R 데이터 분석' 카테고리의 다른 글
# [CRMAJU2018] 고객데이터분석 using R (0) | 2018.01.19 |
# ---|---|
# 탐색적분석이란? Exploratory Data Analysis (EDA) (0) | 2017.11.24 |
# [DS_PCBA] 예측적 고객행동 분석 (0) | 2017.10.16 |
# [kbdaa_bda] 블로그 크롤링 후 텍스트 분석 (0) | 2017.09.22 |
# [kbdaa_bda] 은행마케팅 데이터 분석 실습 (0) | 2017.09.22 |