# Data file used below: CARD_SUBWAY_MONTH_201704_a.csv
# Install R: https://cran.r-project.org/bin/windows/base/
# Install RStudio: https://www.rstudio.com/products/rstudio/download2/
# setwd("E:/restore_Yong/0_RnModeling20161215/metro")
# hr01 <- read.csv("HR_DATA.csv")
# Read the HR dataset straight from the blog-hosted CSV mirror
hr01 <- read.csv("https://t1.daumcdn.net/cfile/blog/23402B3B591A45D631?download")
# Rename column 9 to "department"
names(hr01)[9] <- "department"
names(hr01)
#----- 02. Data handling: query / sort / aggregate / compute ---------
# Querying rows with logical subscripts
head(hr01,3)
View(hr01)
nrow(hr01)
# Employees with near-perfect satisfaction
nrow(hr01[hr01$satisfaction_level>0.99,])
hr01[hr01$satisfaction_level>0.99,]
# Combine conditions with the element-wise & operator
hr01[hr01$satisfaction_level>0.99 & hr01$average_montly_hours>250,]
nrow(hr01[hr01$satisfaction_level>0.99 & hr01$average_montly_hours>250,])
# All rows from the technical department
hr01[hr01$department == "technical",]
nrow(hr01[hr01$department == "technical",])
# Sorting
hr01[hr01$satisfaction_level>0.99 & hr01$average_montly_hours>250 & hr01$department == "technical",]
hr02 <- hr01[hr01$satisfaction_level>0.99 & hr01$average_montly_hours>250 & hr01$department == "technical",]
View(hr02)
# Sorted ascending by last_evaluation
View(hr02[order(hr02$last_evaluation),])
# Computing a derived column: average monthly hours per project
hr02$monthly_hours_per_prj <- hr02$average_montly_hours / hr02$number_project
# Show columns 3, 4 and the new column 11
hr02[,c(3,4,11)]
View(hr02[,c(3,4,11)])
# Aggregation
table(hr01$department)
table(hr01$salary)
# Mean monthly working hours per department
hr03 <- aggregate(hr01$average_montly_hours, list(hr01$department), FUN=mean)
names(hr03) <- c("부서", "평균근무시간")
View(hr03)
# Standard deviation of satisfaction per department
hr04 <- aggregate(hr01$satisfaction_level, list(hr01$department), FUN=sd)
names(hr04) <- c("부서", "만족도표준편차")
View(hr04)
#---- 03. Visual data analysis -----
# Basic charts
plot(hr03) # department vs. mean working hours
plot(hr01$satisfaction_level)
plot(sort(hr01$satisfaction_level))
hist(hr01$satisfaction_level)
# Scatter plot
plot(hr01$satisfaction_level, hr01$average_montly_hours)
# Same scatter, colored by the attrition flag (left: 0/1 -> colors 2/4)
plot(hr01$satisfaction_level, hr01$average_montly_hours, col=(hr01$left+1)*2)
# Box plot of satisfaction per department
boxplot(hr01$satisfaction_level~hr01$department, main="부서별 만족도")
# Bar chart of salary bands
barplot(table(hr01$salary))
#===== Map visualization =======
setwd("E:/restore_Yong/0_RnModeling20161215/metro")
#------ Seoul subway boarding/alighting map --------------
# Station locations per line - Seoul Open Data Plaza
# Boarding/alighting passenger counts per line and station
# http://data.seoul.go.kr/openinf/fileview.jsp?infId=OA-12914
library(ggplot2)
library(ggmap)
station <- read.csv("subway_station.csv")
# Drop stations without WGS coordinates
station <- station[!is.na(station$xwgs) & !is.na(station$ywgs),]
# Keep subway lines 1-9 only
station <- station[station$line %in% factor(1:9),]
plot(station$xwgs, station$ywgs, col=factor(station$line)) # station coordinates colored by line
cent <- c(mean(station$xwgs), mean(station$ywgs)) # map center coordinates
# NOTE(review): the ggplot calls below map x=ywgs, y=xwgs, so ywgs presumably
# holds longitude and xwgs latitude; get_map()'s location expects lon/lat
# order -- confirm the named c(lat=...,lon=...) vector is interpreted correctly
seoul <- get_map(location=c(lat=cent[1],lon=cent[2]), zoom=11,maptype = "roadmap")
ggmap(seoul) +
geom_point(data=station, size=2, alpha=0.7, mapping=aes(x=ywgs, y=xwgs, color=factor(line)))
#---- April 2017: show the boarding/alighting ratio -------------------
station <- read.csv("subway_station.csv")
station <- station[!is.na(station$xwgs) & !is.na(station$ywgs),]
station <- station[station$line %in% factor(1:9),]
stationtrff <- read.csv("CARD_SUBWAY_MONTH_201704_a.csv")
# Keep a single day (2017-04-04)
stationtrff <- stationtrff[stationtrff$date==20170404,]
# Left-join station coordinates with that day's counts (columns 3, 5, 6)
station1 <- merge(station, stationtrff[,c(3,5,6)], by="station", all.x=T)
# Boarding-to-alighting ratio per station
station1$onoffratio <- station1$onpass/station1$offpass
# Label only stations whose ratio exceeds 1.2
station1$onoffrhighname <- ifelse(station1$onoffratio>1.2,as.character(station1$name),"")
# stationtrff$onpass
# stationtrffcv <- aggregate(stationtrff[,c(5,6)], by=list(stationtrff$station), FUN=sd, na.rm=T)
# stationtrffcv <- aggregate(stationtrff[,c(5,6)], list(station=stationtrff$station), FUN=mean, na.rm=T)
# Coefficient of variation (sd/mean) of the single-day on/off counts
stationtrffcv <- aggregate(stationtrff[,c(5,6)], list(station=stationtrff$station),
function(x) {sd(x)/mean(x)})
# Re-read the full month to compute a monthly CV per station
stationtrff04 <- read.csv("CARD_SUBWAY_MONTH_201704_a.csv")
# Nudge zero alighting counts so sd/mean never divides by zero
stationtrff04$offpass[stationtrff04$offpass==0] <- 0.5
stationtrffmcv <- aggregate(stationtrff04[,c(5,6)], list(station=stationtrff04$station),
function(x) c(cv=sd(x)/mean(x)))
# FIX: the original line was missing the opening quote of "onpassmcv",
# a syntax error that aborts the script here
names(stationtrffmcv) <- c("station", "onpassmcv", "offpassmcv")
station1 <- merge(station1, stationtrffmcv, by="station", all.x=T)
# Mark stations with a high boarding/alighting ratio by point shape
# NOTE(review): data=station but the shape aesthetic indexes station1; this
# only lines up if both frames share row order after the merge -- verify
ggmap(seoul) +
geom_point(data=station, size=2, alpha=0.7,
mapping=aes(x=ywgs, y=xwgs, color=factor(line), shape=factor(ifelse(station1$onoffratio>1.2,1,0))))
# Zoom in one level and label the high-ratio stations
seoul <- get_map(location=c(lat=cent[1],lon=cent[2]), zoom=12,maptype = "roadmap")
ggmap(seoul) +
geom_point(data=station1, size=3, alpha=0.7,
mapping=aes(x=ywgs, y=xwgs, color=factor(line), shape=factor(ifelse(station1$onoffratio>1.2,1,0)))) +
geom_text(data=station1, aes(x=ywgs, y=xwgs, label=onoffrhighname) ,hjust=0, vjust=0, size=3)
# Quick base-graphics look at the sorted ratios
plot(sort(station1$onoffratio))
abline(h=1) # ratio = 1: boarding equals alighting
# Emphasize stations with HIGH boarding variability (point size = monthly CV)
plot(sort(station1$onpassmcv)) # CV >= 0.4 reads as high variability
station1$onpassmcvHstn <- ifelse(station1$onpassmcv>=0.4, as.character(station1$name),"")
ggmap(seoul) +
geom_point(data=station1, alpha=0.7,
mapping=aes(x=ywgs, y=xwgs, size=onpassmcv, color=factor(line) )) +
geom_text(data=station1, aes(x=ywgs, y=xwgs, label=onpassmcvHstn) ,hjust=0, vjust=0, size=3)
# Emphasize stations whose ALIGHTING (offpass) variability is low
# Exclude three outlier station codes first
station2 <- station1[!(station1$station %in% c(260, 1022, 1458)),]
plot(sort(station2$offpassmcv)) # below ~0.1 the monthly CV looks very small
# FIX: test offpassmcv here -- the original tested onpassmcv, contradicting
# both the section intent and the variable name offpassmcvLstn
station2$offpassmcvLstn <- ifelse(station2$offpassmcv<0.1, as.character(station2$name),"")
# Inverted, scaled CV: low variability -> large index -> large point.
# na.rm=TRUE guards against NA CVs introduced by the earlier all.x merge
station2$offpassmcvLndx <- 1- (station2$offpassmcv / max(station2$offpassmcv, na.rm=TRUE))
plot(station2$offpassmcv / max(station2$offpassmcv, na.rm=TRUE))
ggmap(seoul) +
geom_point(data=station2, alpha=0.5,
mapping=aes(x=ywgs, y=xwgs, size=offpassmcvLndx/3, color=factor(line) )) +
geom_text(data=station2, aes(x=ywgs, y=xwgs, label=offpassmcvLstn) ,hjust=0, vjust=0, size=3)
# Same low-variability plot on a "toner" basemap
seoul <- get_map(location=c(lat=cent[1],lon=cent[2]), zoom=12,maptype = "toner")
ggmap(seoul) +
geom_point(aes(x = ywgs, y = xwgs, size = offpassmcvLndx, col = factor(line)),
data=station2, alpha=0.5) +
annotate("text", x = station2$ywgs, y = station2$xwgs, label=station2$offpassmcvLstn, hjust=0, vjust=0, size=3)
# Line 2 only: size points by the mean daily on+off passenger total for April
station5 <- station[station$line %in% factor(2),]
stationtrff04_5 <- stationtrff04[stationtrff04$line=="2호선",]
# Daily on+off passenger total per record
stationtrff04_5$onoffpasssum <- stationtrff04_5$onpass + stationtrff04_5$offpass
# Mean of the daily totals per station across the month
stationtrffsum <- aggregate(stationtrff04_5$onoffpasssum, list(station=stationtrff04_5$station),
FUN=mean)
names(stationtrffsum) <- c("station", "daily_onoffsum")
station5 <- merge(station5, stationtrffsum, by="station", all.x=T)
# toner version
cent <- c(mean(station5$xwgs), mean(station5$ywgs)) # center of line-2 stations
seoul <- get_map(location=c(lat=cent[1],lon=cent[2]), zoom=12, maptype = "toner") # or "watercolor"
ggmap(seoul) +
geom_point(aes(x = ywgs, y = xwgs, size = daily_onoffsum),
data=station5, alpha=0.7, col="green") +
annotate("text", x = station5$ywgs, y = station5$xwgs, label=station5$name,
col=ifelse(station5$station %in% c(222, 216, 239), "red", "black"), # top-3 stations in red
hjust=0, vjust=0, size=3)
# roadmap possibly in a wide format (via get_googlemap)
cent <- c(mean(station5$xwgs), mean(station5$ywgs)) # center of line-2 stations
seoul <- get_googlemap(center=c(lon=cent[2], lat=cent[1]), zoom=11, maptype = "roadmap")
# get_googlemap options applied
ggmap(seoul) +
geom_point(aes(x = ywgs, y = xwgs, size = daily_onoffsum),
data=station5, alpha=0.7, col="green") +
annotate("text", x = station5$ywgs, y = station5$xwgs, label=station5$name, hjust=0, vjust=0, size=3)
# Re-read a prepared line-2 dataset (incl. humidity) from the blog
# station5 <- read.csv("station5.csv")
station5 <- read.csv("https://t1.daumcdn.net/cfile/blog/2160164D594C562634?download")
names(station5) <- c("역코드", "역명", "호선", "x좌표", "y좌표", "일일승하차인원합계", "평균습도")
# base plot version: bubble size proportional to the daily on+off total
plot(station5$y좌표, station5$x좌표, col="green", pch=19, cex=station5$일일승하차인원합계/100000,
xlab="", ylab="", main="지하철역 붐비는 정도" ,
xlim=c(126.82,127.13), ylim=c(37.47,37.58))
# FIX: highlight by the station CODE column (역코드); the original compared
# the NAME column (역명) with numeric codes, so the red highlight never fired
text(station5$y좌표, station5$x좌표, labels=station5$역명, col=ifelse(station5$역코드 %in% c(222, 216, 239), "red", "black"), pos=3, cex=0.5)
# Compare passenger totals with mean humidity
plot(station5$일일승하차인원합계, station5$평균습도)
text(station5$일일승하차인원합계, station5$평균습도, labels=station5$역명, col=ifelse(station5$역코드 %in% c(222, 216, 239), "red", "black"), pos=3, cex=0.5)
#---- 04. Predictive analytics basics ---------------
# Correlation
plot(hr01$satisfaction_level, hr01$average_montly_hours)
abline(lm(hr01$average_montly_hours ~ hr01$satisfaction_level), col="red", lwd=2)
cor(hr01$satisfaction_level, hr01$average_montly_hours)
# jitter() spreads the discrete project counts for readability
plot(hr01$satisfaction_level, jitter(hr01$number_project, factor=2))
abline(lm(hr01$number_project~hr01$satisfaction_level), col="red", lwd=2)
cor(hr01$satisfaction_level, hr01$number_project)
# Regression
plot(jitter(hr01$number_project, factor=2), hr01$satisfaction_level)
abline(lm(hr01$satisfaction_level~hr01$number_project), col="red", lwd=2)
lines(lowess(hr01$satisfaction_level~hr01$number_project), col="blue", lwd=2)
summary(lm(hr01$satisfaction_level~hr01$number_project))
# As a formula: satisfaction_level = -0.028839 * number_project + 0.722509
# Multiple regression: left regressed on all other columns
summary(lm(left~., data=hr01))
# Decision tree analysis
library(rpart) #load the rpart package
# minbucket is forwarded through ... to rpart.control()
t1 <- rpart(left ~ ., data = hr01, minbucket = 500)
library(rattle)
library(rpart.plot)
library(RColorBrewer)
fancyRpartPlot(t1)
# ctree version (conditional inference trees, package "party")
library(party)
# Recode the 0/1 target as a factor so ctree fits a classification tree
hr01t <- hr01
hr01t$left <- as.factor(hr01t$left)
t1 <- ctree(left ~ ., data = hr01t, controls = ctree_control(minbucket = 500))
plot(t1)
# Same tree capped at depth 3 for readability
t1 <- ctree(left ~ ., data = hr01t, controls = ctree_control(minbucket = 500, maxdepth=3))
plot(t1)
# Scatter plots colored by attrition
plot(hr01$satisfaction_level, jitter(hr01$time_spend_company, factor=2), col=(hr01$left+1)*3)
plot(hr01$satisfaction_level, jitter(hr01$number_project, factor=2), col=(hr01$left+1)*2)
# Clustering: k-means with k=3 on scaled columns 2, 4, 6, 8
hr06 <- hr01[,c(2,4,6,8)]
k01 <- kmeans(scale(hr06), 3)
plot(hr01$average_montly_hours, hr01$satisfaction_level, col=k01$cluster+3)
plot(jitter(hr01$Work_accident, factor=2), hr01$last_evaluation, col=k01$cluster+3)
#---- Forest fire damage analysis example
# setwd("E:/restore_Yong/GAMDFM/Ffire")
# ff <- read.csv("forestfires.csv")
# Forest-fires dataset mirrored on the blog
ff <-read.csv("https://t1.daumcdn.net/cfile/blog/26021135594B019B01?download")
names(ff)
#------ Column descriptions -------
# Montesinho park(Portugal)
# 1. X - x-axis spatial coordinate within the Montesinho park map: 1 to 9
# 2. Y - y-axis spatial coordinate within the Montesinho park map: 2 to 9
# 3. month - month of the year: "jan" to "dec"
# 4. day - day of the week: "mon" to "sun"
# 5. FFMC - FFMC index from the FWI system: 18.7 to 96.20
# 6. DMC - DMC index from the FWI system: 1.1 to 291.3
# 7. DC - DC index from the FWI system: 7.9 to 860.6
# 8. ISI - ISI index from the FWI system: 0.0 to 56.10
# 9. temp - temperature in Celsius degrees: 2.2 to 33.30
# 10. RH - relative humidity in %: 15.0 to 100
# 11. wind - wind speed in km/h: 0.40 to 9.40
# 12. rain - outside rain in mm/m2 : 0.0 to 6.4
# 13. area - the burned area of the forest (in ha): 0.00 to 1090.84
require(party)
# Regression tree on the raw burned area
ct1 <- ctree(area~ .,
data=ff, controls=ctree_control(maxdepth=3, minbucket=as.integer(nrow(ff)/100)))
plot(ct1)
plot(ff$area, jitter(ff$wind))
plot(sort(ff$area)) # the target is heavily right-skewed
ff1 <- ff
# log(area + 1) tames the skew (area is frequently 0)
ff1$logarea <- log(ff$area +1)
plot(sort(ff1$logarea))
# Drop the raw target plus the DMC and DC columns
ff1 <- ff1[, !(names(ff1) %in% c("area", "DMC", "DC"))]
# Tree on the log-transformed burned area.
# FIX: removed a stray duplicated comma inside ctree_control(), which is a
# syntax error in R
ct2 <- ctree(logarea~ .,
data=ff1, controls=ctree_control(maxdepth=3, minbucket=5, mincriterion = 0.2))
plot(ct2)
# Exclude December rows, then Y coordinate vs temperature; red = any burn
ff11 <- ff1[ff1$month != "dec",]
plot(jitter(ff11$Y), jitter(ff11$temp), col=ifelse(ff11$logarea>0 ,2,1))
abline(h=25.9, lty=2)
abline(v=5, lty=2)
# Park grid (X,Y); red channel proportional to log burned area.
# FIX: the color vector now indexes ff11 (the plotted rows) instead of ff1,
# which has more rows and therefore mis-aligned colors with the points;
# the scale denominator still uses the overall maximum from ff1
plot(jitter(ff11$X), jitter(ff11$Y), col=rgb(ff11$logarea/max(ff1$logarea),0.3,0.3, alpha=0.7), pch=19)
#==========
plot(jitter(ff$wind), jitter(ff$rain), col=ifelse(ff$area>0,2,1))
plot(jitter(ff$wind), jitter(ff$temp), col=ifelse(ff$area>0,2,1))
ff3 <- ff
# Binary target: 1 when burned area >= 2 ha
ff3$isLarea <- ifelse(ff3$area >= 2, 1, 0)
ff3 <- ff3[, !(names(ff3) == "area")]
ct3 <- ctree(isLarea~ .,
data=ff3, controls=ctree_control(maxdepth=5, minbucket=3, mincriterion = 0.9))
plot(ct3)
# Same target restricted to the weather predictors
ct3 <- ctree(isLarea~ temp+wind+RH+rain,
data=ff3, controls=ctree_control(maxdepth=5, minbucket=3, mincriterion = 0.9))
plot(ct3)
#---- Machine failure analysis example
# setwd("E:/restore_Yong/GAMDFM/FAULT")
# dataset source: https://www.kaggle.com/ludobenistant/predictive-maintenance
# m1 <- read.csv("maintenance_data.csv")
m1 <- read.csv("https://t1.daumcdn.net/cfile/blog/27711B35594B019D05?download")
# Temperature vs pressure, broken machines in red
plot(m1$temperatureInd, m1$pressureInd, col=ifelse(m1$broken==1, 2,1))
m2 <- m1
# lifetime/12 -- presumably converts months to years; confirm the unit
m2$lifetimeyear <- as.integer(m2$lifetime/12)
plot(sort(m2$lifetimeyear))
plot(sort(m2$pressureInd))
# Discretize pressure into H (>=120) / M (>=80) / L bands
m2$pressureCla <- ifelse(m2$pressureInd >= 120, "pressure_H",ifelse(m2$pressureInd >= 80, "pressure_M","pressure_L"))
table(m2$pressureCla)
plot(sort(m2$moistureInd))
# Discretize moisture into H (>=110) / M (>=85) / L bands
m2$moistureCla <- ifelse(m2$moistureInd >= 110, "moisture_H",ifelse(m2$moistureInd >= 85, "moisture_M","moisture_L"))
table(m2$moistureCla)
plot(sort(m2$temperatureInd))
# Discretize temperature into H (>=130) / M (>=80) / L bands
m2$temperatureCla <- ifelse(m2$temperatureInd >= 130, "temperature_H",ifelse(m2$temperatureInd >= 80, "temperature_M","temperature_L"))
table(m2$temperatureCla)
# Stacked bars: broken vs normal counts per lifetime year
barplot(table(m2$broken, m2$lifetimeyear ))
# barplot(prop.table(table(m2$broken, m2$lifetimeyear ),2)) # compare as proportions
m2$brokenCla <- ifelse(m2$broken==1,"broken", "normal")
m2$lifetimeyearCla <- paste0("lifetime", as.character(m2$lifetimeyear))
# Keep only the categorical columns for association-rule mining
m3 <- m2[, (names(m2) %in% c("brokenCla", "team", "provider", "lifetimeyearCla", "pressureCla", "moistureCla", "temperatureCla" ))]
#----------
# install.packages() calls commented out for consistency with the rest of the
# file (other sections keep them commented); run once manually if missing.
# "datasets" is a base package and never needs installing.
# install.packages("arules")
# install.packages("arulesViz")
library(arules)
library(arulesViz)
library(datasets)
# Write the categorical table; read.transactions() below skips the header row.
# FIX: dropped col.names=F -- write.csv() ignores col.names and warns
write.csv(m3, "maint_trs.csv", row.names=F)
m4 <- read.transactions("maint_trs.csv", format="basket", sep="," , skip=1)
# FIX: removed a verbatim duplicate of this inspect() call
inspect(head(m4, 5))
itemFrequencyPlot(m4,topN=20,type="absolute")
# Pairwise rules (minlen=maxlen=2), support >= 3%, confidence >= 30%
rules <- apriori(m4, parameter = list(supp = 0.03, conf = 0.3, minlen=2, maxlen=2))
options(digits=3)
# FIX: head() stays safe when fewer than 20 rules exist; rules[1:20] would error
inspect(head(rules, 20))
summary(rules)
# NOTE(review): relies on inspect() returning its data frame invisibly
# (recent arules); older versions returned NULL -- verify installed version
df1 <- as.data.frame(inspect(rules))
# Rules predicting {broken} with positive lift, strongest confidence first
df2 <- df1[df1$rhs=="{broken}" & df1$lift>1 , ]
df2[order(-df2$confidence),]
plot(rules)
# Second rule set: lower support, higher confidence
rules1 <- apriori(m4, parameter = list(supp = 0.01, conf = 0.6, minlen=2, maxlen=2))
# NOTE(review): this sorts `rules` (the first run), not `rules1` -- intended?
rules2 <- head(sort(rules, by="confidence"), 30)
# Rules whose RHS partially matches "broken"
rules3 <- subset(rules1, subset=rhs %pin% "broken")
# Rules whose RHS is exactly broken or normal
rules4 <- subset(rules1, subset=rhs %in% c("broken", "normal"))
inspect(rules4)
plot(rules4, method="graph", interactive=F, shading=NA)
plot(rules4, method="graph", interactive=T, shading=NA)
# NOTE(review): as above, depends on inspect() returning its data frame
df5 <- as.data.frame(inspect(rules4))
df5[order(-df5$confidence),]
#---- 05. 텍스트 분석 이해 ------------
# :: Text mining (+ crawling)
#-----[ crawling :: data harvesting ; scraping ; wrangling ]---
# install.packages("XML")
# install.packages("rvest")
library(XML)
library(rvest)
#-- Naver news for the keyword [지하철] (subway)
# Naver -> news search -> keyword: 지하철
newsURL <- "http://news.naver.com/main/search/search.nhn?query=%C1%F6%C7%CF%C3%B6&st=news.all&q_enc=EUC-KR&r_enc=UTF-8&r_format=xml&rp=none&sm=all.basic&ic=all&so=rel.dsc&detail=0&pd=1&r_cluster2_start=1&r_cluster2_display=10&start=1&display=10&page="
# Naver -> news search -> keyword: 서울 지하철
newsURL <- "http://news.naver.com/main/search/search.nhn?query=%BC%AD%BF%EF+%C1%F6%C7%CF%C3%B6&st=news.all&q_enc=EUC-KR&r_enc=UTF-8&r_format=xml&rp=none&sm=all.basic&ic=all&so=rel.dsc&detail=0&pd=1&r_cluster2_start=1&r_cluster2_display=10&start=1&display=10&page="
# Naver -> news search -> keyword: 서울 지하철 불편; each assignment
# overwrites the previous one, so this last URL is the one actually used
newsURL <- "http://news.naver.com/main/search/search.nhn?query=%BC%AD%BF%EF+%C1%F6%C7%CF%C3%B6+%BA%D2%C6%ED&st=news.all&q_enc=EUC-KR&r_enc=UTF-8&r_format=xml&rp=none&sm=all.basic&ic=all&so=rel.dsc&detail=0&pd=1&r_cluster2_start=1&r_cluster2_display=10&start=1&display=10&page="
# Fetch one page of Naver news search results and return a character matrix
# with columns subject / date / main (summary) / press.
#   num     : result page number appended to newsURL
#   newsURL : search URL prefix ending in "...&page="
GetStockCommentData <- function(num=1, newsURL){
url <- gsub(" ","",paste0(newsURL,as.character(num)))
doc <- htmlTreeParse(url, useInternalNodes = T)
print(url)
subject <- xpathSApply(doc, "//div[@class='ct']/a", xmlValue)#subject
date <- xpathSApply(doc, "//div[@class='ct']/div/span[4]", xmlValue)#publication date
main <- xpathSApply(doc, "//div[@class='ct']/p", xmlValue)#article summary
press <-xpathSApply(doc, "//div[@class='ct']/div/span[2]", xmlValue)#press name
url2 <- xpathSApply(doc, "//div[@class='ct']/div/a", xmlGetAttr,'href')
# Convert scraped text from the page's UTF-8 to EUC-KR
subject <- iconv(subject,"UTF-8","EUC-KR")
date <- iconv(date,"UTF-8","EUC-KR")
main <- iconv(main,"UTF-8","EUC-KR")
press <- iconv(press,"UTF-8","EUC-KR")
# Strip CR / TAB / LF from the summaries
main <- gsub("\r|\t|\n","",main)
# NOTE(review): url2 is scraped but never returned -- dead code or missing column?
stock_data <-cbind(subject,date,main,press)
return(stock_data)
}
gc() # free unused memory before crawling
# FIX: the DATA / StartPage / EndPage initialization was duplicated verbatim
# in the original; a single copy is kept
DATA <- NULL
StartPage <- 1
EndPage <- 5
# Crawl pages StartPage..EndPage and stack the result rows
for (i in StartPage:EndPage) {
getData <- GetStockCommentData(i, newsURL)
print(length(getData))
DATA <- rbind(DATA, getData)
print(paste0("page ", as.character(i), " done"))
}
# write.csv(DATA,"F:/data/aaa.csv")
#------------
library(XML)
library(rvest)
#--- [키워드]에 대한 네이버 블로그 수집
# https ==> http [http로 바꾸어주고 돌려야함]
# Fetch one page of Naver blog search results (URL-encoded keyword: 서울 지하철)
# and return a matrix with subject / date / main / nick / category / href.
# The preceding note in the file says to switch https to http before running.
#   num : 1-based result page number
GetStockCommentData <- function(num){
url = gsub(" ","",paste0("https://section.blog.naver.com/sub/SearchBlog.nhn?type=post&option.keyword=%EC%84%9C%EC%9A%B8%20%EC%A7%80%ED%95%98%EC%B2%A0&term=&option.startDate=&option.endDate=&option.page.currentPage=", as.character(num)))
print(url)
doc = htmlTreeParse(url, useInternalNodes = T, encoding="UTF-8")
# NOTE(review): result of this call is discarded -- leftover debugging?
xpathSApply(doc, "//*[@id='blogSearchForm']/div[2]/ul[3]", xmlValue)
subject <- xpathSApply(doc, "//ul[@class='list_type_1 search_list']/li/h5/a", xmlValue)#subject
print(subject)
date <- xpathSApply(doc, "//span[@class='date']", xmlValue) #post date
main <- xpathSApply(doc, "//div[@class='list_content']", xmlValue) #body summary
main <- gsub("\r|\n","",main)
nick <- xpathSApply(doc, "//div[@class='list_data']/a", xmlValue) #blog nickname
category <- xpathSApply(doc, "//span[@class='category']/a", xmlValue) #blog category
href <- xpathSApply(doc, "//ul[@class='list_type_1 search_list']/li/h5/a", xmlGetAttr,'href')
stock_data <-cbind(subject, date, main, nick, category, href)
print(stock_data)
return(stock_data)
}
gc() # free unused memory before crawling
DATA<-NULL
StartPage <- 1
EndPage <- 5
# Crawl blog-search pages StartPage..EndPage and stack the rows
for (i in StartPage:EndPage) {
getData<-GetStockCommentData(i)
DATA<-rbind(DATA,getData)
}
# write.csv(DATA,"aaablog.csv",row.names = F)
#--------------------
#------ Build the word-frequency table for a word cloud ------------
# install.packages("KoNLP")
library(KoNLP)
useSejongDic() # load the Sejong dictionary
# setwd(".... set your own path")
# DATA <- readLines("abcd.txt") # when reading from a text file instead
# Concatenate subject and body summary of every crawled item
article <- paste0(as.data.frame(DATA)$subject, as.data.frame(DATA)$main )
paste(c("acde","ee"),collapse="") # demo of collapse=, unrelated to the flow
article <- paste0(article,collapse="")
# Extract nouns (KoNLP), flatten, keep words of length >= 2
article1 <- sapply(article,extractNoun,USE.NAMES=F)
article2 <- unlist(article1)
article3 <- Filter(function(x){nchar(x)>=2},article2)
# Strip control characters and noise tokens
article3 <- gsub("\r|\n|\t","", article3)
article3 <- gsub("하기","", article3)
article3 <- gsub("△","", article3)
article3 <- gsub("하게","", article3)
# article3 <- gsub("빅데이터를","빅데이터", article3)
# article3 <- gsub("빅데이터로","빅데이터", article3)
# Frequency table of the remaining nouns
wordcount <- table(article3)
barplot(sort(wordcount), las=2, cex.names =0.7)
barplot(head(tail(wordcount[order(-wordcount)],length(wordcount)-2),20)) # sorted, dropping the top 2
# wc1 <- head(tail(wordcount[order(-wordcount)],length(wordcount)-3),50)
# wc1 <- head(wordcount[order(-wordcount)],50)[c(2,5:50)]
wc1 <- head(wordcount[order(-wordcount)],50) # top-50 words
barplot(wc1, las=2, cex.names =0.7)
head(as.data.frame(wordcount))
df_a3 <- as.data.frame(wordcount)
# Keep words appearing at least 3 times
df_a31 <- df_a3[df_a3$Freq>=3, ]
barplot(df_a31$Freq, names.arg=df_a31$article3, cex.names=0.7)
head(df_a31[order(- df_a31$Freq),],20)
barplot(wordcount[order(wordcount)])
#------ Generate the word cloud --------------------
# install.packages("wordcloud2")
library(wordcloud2)
# windowsFonts(malgun=windowsFont("맑은 고딕"))
# NOTE(review): -pi/0 evaluates to -Inf; -pi/2 was possibly intended for the
# rotation limits -- confirm the desired visual effect
wordcloud2(data=wordcount, minSize = 7, minRotation = -pi/0, maxRotation = -pi/0, rotateRatio = 1, fontFamily='맑은 고딕', size=0.8)
# Word cloud after removing <지하철> -- diamond shape
wordcloud2(data=wc1, minSize = 7, minRotation = -pi/0, maxRotation = -pi/0, rotateRatio = 1,
fontFamily='맑은 고딕', size=0.7, shape='diamond')
wordcloud2(data=wordcount, minSize = 7, minRotation = -pi/0, maxRotation = -pi/0, rotateRatio = 1,
fontFamily='맑은 고딕', size=0.7, shape='diamond')
# Word cloud after removing <빅데이터> and <분석> -- diamond shape
wordcloud2(data=wc1, minSize = 3, minRotation = -pi/0, maxRotation = -pi/0, rotateRatio = 1, fontFamily='맑은 고딕', size=0.4, shape='diamond')
#-------[by article]------
df_DATA <- as.data.frame(DATA)
# Concatenate subject (col 1) and body summary (col 3) per article
a1 <- apply(df_DATA[,c(1,3)], 1, paste, collapse=" ")
a1 <- sapply(a1,extractNoun,USE.NAMES=F)
# Flatten the per-article noun lists into one long (article id, noun) table.
# FIX: replaces the original O(n^2) grow-by-c() loop (seeded with NA) with
# rep()/unlist(); the result after the !is.na(aid) filter is identical
aid <- rep(seq_along(a1), lengths(a1))
a2 <- unlist(a1)
df_article <- data.frame(aid, a2)
df_article <- df_article[!is.na(df_article$aid),]
head(df_article, 60)
# Keep nouns of at least 2 characters
df_article <- df_article[nchar(as.character(df_article$a2))>=2,]
head(df_article, 60)
# Remove control characters and noise tokens, normalize inflected forms
df_article$a2 <- gsub("\r|\n|\t","", df_article$a2)
df_article$a2 <- gsub("“|”|\\[|\\]|오전|오후|뉴시스와","", df_article$a2)
df_article$a2 <- gsub("빅데이터를","빅데이터", df_article$a2)
df_article$a2 <- gsub("빅데이터로","빅데이터", df_article$a2)
# Re-apply the length filter after cleaning
df_article <- df_article[nchar(as.character(df_article$a2))>=2,]
head(df_article, 60)
#------- association rules [single format] -----------
# Co-occurrence patterns between keywords (one row per article/keyword pair)
library(arules)
library(arulesViz)
setwd("E:/restore_Yong/GAMDFM/text_m")
head(df_article)
write.csv(df_article, "dfa.csv", row.names=F)
# "single" format: cols=c(1,2) -> transaction id column, item column
tr <- read.transactions("dfa.csv", format = "single", sep=',',
skip=1, rm.duplicates = FALSE, cols=c(1,2))
inspect(head(tr))
itemFrequencyPlot(tr,topN=20,type="absolute")
# Pairwise keyword rules: support >= 6%, confidence >= 40%
srules <- apriori(tr, parameter = list(supp = 0.06, conf = 0.4, minlen=2, maxlen=2))
# options(digits=3)
inspect(head(srules))
summary(srules)
# Top 50 rules by confidence
rules1 <- head(sort(srules, by="confidence"), 50)
inspect(rules1)
# Keep only positively associated pairs (lift > 1)
rules2 <- subset(rules1, subset=(lift>1))
# rules2 <- subset(rules1, subset=(!(lhs %pin% "서울|지하철|들이")))
# rules2 <- subset(rules2, subset=(!(rhs %pin% "서울|지하철|들이")))
inspect(head(sort(rules2, by="lift"), 30))
plot(rules2, method="graph", interactive=T, shading=NA)
inspect(rules1)
inspect(head(sort(rules2, by="lift"), 10))
#--- 끝 -----
# --- Blog footer: other posts in the 'R 데이터 분석' category ---
# [kbdaa_bda] 빅데이터고객분석 GDA (0) | 2017.09.20
# [kbdaa_bda] 빅데이터고객분석 (0) | 2017.09.09
# [R 분석: DT] rpart를 이용한 트리 모델 만들기 (0) | 2017.05.29
# [SKK_DA1] predictive modeling practice (0) | 2017.05.25
# [SKK_DA1] 시계열 모형 AR-MA-ARIMA 요점 (0) | 2017.05.25