>RE::VISION CRM

R 데이터 분석

[R분석] randomForest와 ranger 속도 차이 비교

YONG_X 2017. 11. 1. 14:59

# randomForest와 ranger 속도 차이 비교

# randomForest의 가장 큰 약점 중 하나인 모델 생성 속도 문제에 대한 대책으로 ranger 테스트


# ranger ::  a fast implementation of Random Forest (Breiman 2001) or recursive partitioning, particularly suited for high dimensional data


# --- Setup: packages and benchmark data ----

# library() stops immediately when a package is missing; require() would only
# warn and return FALSE, deferring the failure to the first model call.
library(ranger)
library(randomForest)

# Read the source data. stringsAsFactors = TRUE is spelled out because the
# default changed to FALSE in R 4.0.0; without it text columns (including the
# response y, if it is textual) arrive as character rather than factor, and
# randomForest() needs a factor response to fit a classification forest.
bnk05 <- read.csv(
  'https://t1.daumcdn.net/cfile/blog/99E6173359C44CE507?download',
  stringsAsFactors = TRUE
)

# Replicate the data 16 times (equivalent to doubling it four times in a row)
# so that the timing differences between the two packages are measurable.
bnk06 <- do.call(rbind, replicate(16, bnk05, simplify = FALSE))

# Show the number of rows used for the benchmark.
nrow(bnk06)


# --- Training-time benchmark ----
# Fit a 1000-tree forest with each package on the same data (bnk06) and record
# the wall-clock time of each fit.
#
# Elapsed times are stored as difftime objects with the unit pinned to
# seconds. With the default "auto" unit, R picks secs/mins/hours depending on
# the magnitude, so two measurements can silently end up in different units;
# the previous plot compensated by multiplying the rf value by 60, which is
# only right when ranger reports seconds and rf reports minutes.

start.time <- Sys.time()
rng1 <- ranger(y ~ ., data = bnk06, num.trees = 1000, importance = "impurity")
end.time <- Sys.time()
ranger.learningtime <- difftime(end.time, start.time, units = "secs")

# To inspect the ranger variable importance:
# sort(rng1$variable.importance, decreasing = TRUE)

start.time <- Sys.time()
rf1 <- randomForest(y ~ ., data = bnk06, ntree = 1000, importance = TRUE)
end.time <- Sys.time()
rf.learningtime <- difftime(end.time, start.time, units = "secs")

# Both bars are in seconds, so no manual unit conversion is needed.
barplot(
  c(as.numeric(ranger.learningtime), as.numeric(rf.learningtime)),
  names.arg = c("ranger", "rf"),
  main = "learning time"
)

# Print both training times (seconds).
c(ranger.learningtime, rf.learningtime)


# --- Prediction-time benchmark ----
# Score the full data set (bnk06) 50 times with each fitted model (rng1, rf1)
# and record the total wall-clock time of the 50 passes.
#
# Elapsed times are stored as difftime objects with the unit pinned to
# seconds. With the default "auto" unit the two measurements could be reported
# in different units (e.g. secs vs mins), which would make the bar heights in
# the plot below incomparable.

start.time <- Sys.time()
for (i in seq_len(50)) {
  pr <- predict(rng1, newdata = bnk06)
  print(i)  # progress indicator
}
end.time <- Sys.time()
ranger.predtime <- difftime(end.time, start.time, units = "secs")

start.time <- Sys.time()
for (i in seq_len(50)) {
  pr <- predict(rf1, newdata = bnk06)
  print(i)  # progress indicator
}
end.time <- Sys.time()
rf.predtime <- difftime(end.time, start.time, units = "secs")

# Both bars are in seconds.
barplot(
  c(as.numeric(ranger.predtime), as.numeric(rf.predtime)),
  names.arg = c("ranger", "rf"),
  main = "prediction time"
)

# Print both prediction times (seconds).
c(ranger.predtime, rf.predtime)




# --- Summary: side-by-side plots and printed times ----

# Convert each elapsed time to a plain number of seconds, rounded to 2 places.
# The explicit units = "secs" makes the conversion independent of whatever
# unit the difftime object currently carries.
ranger.learningtime <- round(as.numeric(ranger.learningtime, units = "secs"), 2)
rf.learningtime <- round(as.numeric(rf.learningtime, units = "secs"), 2)
ranger.predtime <- round(as.numeric(ranger.predtime, units = "secs"), 2)
rf.predtime <- round(as.numeric(rf.predtime, units = "secs"), 2)

# Draw both comparisons next to each other. par() returns the previous
# settings, which are restored afterwards instead of forcing a 1x1 layout.
old_par <- par(mfrow = c(1, 2))

# Bar labels come from the vector names.
# NOTE(review): the row/variable counts in both titles are hard-coded. The
# "4.34M rows" of the prediction title spread over 50 prediction passes would
# imply roughly 86.9K rows per pass, not 8.69K -- confirm against nrow(bnk06).
barplot(
  c(ranger = ranger.learningtime, rf = rf.learningtime),
  main = "learning time (8.69K rows X 23 vars)"
)

barplot(
  c(ranger = ranger.predtime, rf = rf.predtime),
  main = "prediction time (4.34M rows X 23 vars)"
)

par(old_par)

# comparison of times (unit: seconds)
paste("ranger.learningtime : ", ranger.learningtime)
paste("rf.learningtime : ", rf.learningtime)

paste("ranger.predtime : ", ranger.predtime)
paste("rf.predtime : ", rf.predtime)



# "ranger.learningtime :  24.3938548564911"

# "rf.learningtime :  304.569322109222"

# "ranger.predtime :  0.177471876144409"

# "rf.predtime :  177.684410095215"


# rf.predtime/ranger.predtime == 1001.198





# 모델 빌딩에서 10배 이상, prediction에서도 1000배 가량 ranger가 빠름