>RE::VISION CRM

R 데이터 분석

[subway] 빅데이터분석

YONG_X 2017. 6. 22. 08:32

 

station5.csv

 


 

CARD_SUBWAY_MONTH_201704.csv


CARD_SUBWAY_MONTH_201704_a.csv


forestfires.csv


HR_DATA.csv


maintenance_data.csv


subway_station.csv


installRnRstudio.txt


met_scrpt_20170607.txt


R설치

https://cran.r-project.org/bin/windows/base/ 


R 스튜디오 설치

https://www.rstudio.com/products/rstudio/download2/



# ---- 01. Load the HR dataset ----
# setwd("E:/restore_Yong/0_RnModeling20161215/metro")
# hr01 <- read.csv("HR_DATA.csv")
# Read the CSV directly from the blog-hosted copy instead of a local file
hr01 <- read.csv("https://t1.daumcdn.net/cfile/blog/23402B3B591A45D631?download")


# Rename column 9 to "department"
# NOTE(review): assumes column 9 of this CSV is the department column
# ("sales" in the original Kaggle HR data) -- confirm against the header
names(hr01)[9] <- "department"
names(hr01)


#----- 02. Data handling: query / sort / aggregate / compute ---------

# Querying rows

head(hr01,3)
View(hr01)

nrow(hr01)

# Employees with near-perfect satisfaction
nrow(hr01[hr01$satisfaction_level>0.99,])
hr01[hr01$satisfaction_level>0.99,]

# ... who also work more than 250 hours a month
hr01[hr01$satisfaction_level>0.99 & hr01$average_montly_hours>250,]
nrow(hr01[hr01$satisfaction_level>0.99 & hr01$average_montly_hours>250,])

hr01[hr01$department == "technical",]
nrow(hr01[hr01$department == "technical",])


# Sorting

hr01[hr01$satisfaction_level>0.99 & hr01$average_montly_hours>250 & hr01$department == "technical",]

hr02 <- hr01[hr01$satisfaction_level>0.99 & hr01$average_montly_hours>250 & hr01$department == "technical",]
View(hr02)
View(hr02[order(hr02$last_evaluation),])

# Derived column: average monthly hours per project
hr02$monthly_hours_per_prj <-  hr02$average_montly_hours / hr02$number_project
# NOTE(review): columns picked by position (3, 4, 11 = the new column) --
# fragile if the CSV layout changes; confirm the indices
hr02[,c(3,4,11)]
View(hr02[,c(3,4,11)])
 

# Aggregation

table(hr01$department)
table(hr01$salary)

# Mean monthly working hours per department
hr03 <- aggregate(hr01$average_montly_hours, list(hr01$department), FUN=mean)
names(hr03) <- c("부서", "평균근무시간")
View(hr03)

# Standard deviation of satisfaction per department
hr04 <- aggregate(hr01$satisfaction_level, list(hr01$department), FUN=sd)
names(hr04) <- c("부서", "만족도표준편차")
View(hr04)


#---- 03. Visual data analysis -----

# Basic charts

plot(hr03)

plot(hr01$satisfaction_level)

plot(sort(hr01$satisfaction_level))

hist(hr01$satisfaction_level)

# Scatter plot

plot(hr01$satisfaction_level, hr01$average_montly_hours)

# Scatter plot colored by attrition: left is 0/1, so colors are 2 and 4
plot(hr01$satisfaction_level, hr01$average_montly_hours, col=(hr01$left+1)*2)

# Box plot of satisfaction by department
boxplot(hr01$satisfaction_level~hr01$department, main="부서별 만족도")

# Bar chart of salary bands
barplot(table(hr01$salary))


#===== Map visualization =======

# NOTE(review): hard-coded setwd() kept from the original workflow; point it
# at the local data directory before running
setwd("E:/restore_Yong/0_RnModeling20161215/metro")


#------ Seoul subway boarding/alighting map --------------
# Station coordinates per subway line - Seoul Open Data Plaza
# Boarding/alighting passenger counts per station and line
# http://data.seoul.go.kr/openinf/fileview.jsp?infId=OA-12914


library(ggplot2)
library(ggmap)


# Keep stations with WGS84 coordinates on lines 1-9 only
station <- read.csv("subway_station.csv")
station <- station[!is.na(station$xwgs) & !is.na(station$ywgs),]
station <- station[station$line %in% factor(1:9),]
plot(station$xwgs, station$ywgs, col=factor(station$line)) # station coords colored by line

# Map center: xwgs holds the latitude and ywgs the longitude
# (see the aes(x=ywgs, y=xwgs) mapping below)
cent <- c(mean(station$xwgs), mean(station$ywgs))

# FIX: get_map() takes a lon/lat pair *in that order*; element names are
# ignored, so the original c(lat=cent[1], lon=cent[2]) centered the map at
# lon=37.5, lat=127 (an invalid location). Pass longitude first, matching
# the get_googlemap() call later in this script.
seoul <- get_map(location=c(lon=cent[2], lat=cent[1]), zoom=11,maptype = "roadmap")

ggmap(seoul) +
geom_point(data=station, size=2, alpha=0.7, mapping=aes(x=ywgs, y=xwgs, color=factor(line)))


#---- Show the 2017-04 boarding/alighting ratio -------------------

# Re-load station coordinates, dropping rows without WGS84 coords and
# keeping only lines 1-9
station <- read.csv("subway_station.csv")
station <- station[!is.na(station$xwgs) & !is.na(station$ywgs),]
station <- station[station$line %in% factor(1:9),]

# April 2017 daily traffic; keep a single day (2017-04-04)
stationtrff <- read.csv("CARD_SUBWAY_MONTH_201704_a.csv")
stationtrff <- stationtrff[stationtrff$date==20170404,]

# NOTE(review): columns 3,5,6 assumed to be station code / onpass / offpass
# -- confirm against the CSV header
station1 <- merge(station, stationtrff[,c(3,5,6)], by="station", all.x=T)
station1$onoffratio <- station1$onpass/station1$offpass

# Label only stations where boardings exceed alightings by 20%+
station1$onoffrhighname <- ifelse(station1$onoffratio>1.2,as.character(station1$name),"")

# stationtrff$onpass
# stationtrffcv <- aggregate(stationtrff[,c(5,6)], by=list(stationtrff$station), FUN=sd, na.rm=T)
# stationtrffcv <- aggregate(stationtrff[,c(5,6)], list(station=stationtrff$station), FUN=mean, na.rm=T)
# Coefficient of variation (sd/mean) per station; the sd/mean experiments
# above are kept for reference.
# NOTE(review): stationtrff was filtered to a single day above, so sd() of
# one value per station is NA here -- presumably the full month
# (stationtrff04, used below) was intended; verify.
stationtrffcv <- aggregate(stationtrff[,c(5,6)], list(station=stationtrff$station),
   function(x) {sd(x)/mean(x)})

# Monthly CV of boarding/alighting counts per station
stationtrff04 <- read.csv("CARD_SUBWAY_MONTH_201704_a.csv")
# Avoid division-by-zero artifacts later: treat zero alightings as 0.5
stationtrff04$offpass[stationtrff04$offpass==0] <- 0.5
stationtrffmcv <- aggregate(stationtrff04[,c(5,6)], list(station=stationtrff04$station),
   function(x) c(cv=sd(x)/mean(x)))
# FIX: the original line was missing the opening quote on "onpassmcv",
# a syntax error.
names(stationtrffmcv) <- c("station", "onpassmcv", "offpassmcv")
station1 <- merge(station1, stationtrffmcv, by="station", all.x=T)


# Mark stations whose boarding/alighting ratio exceeds 1.2
# NOTE(review): shape= is driven by station1 while data=station -- this
# relies on the two frames having identical row order; verify
ggmap(seoul) +
geom_point(data=station, size=2, alpha=0.7,
  mapping=aes(x=ywgs, y=xwgs, color=factor(line), shape=factor(ifelse(station1$onoffratio>1.2,1,0))))


# FIX: get_map() takes a lon/lat pair in that order (element names are
# ignored); the original c(lat=..., lon=...) put latitude first.
seoul <- get_map(location=c(lon=cent[2], lat=cent[1]), zoom=12,maptype = "roadmap")
ggmap(seoul) +
geom_point(data=station1, size=3, alpha=0.7,
  mapping=aes(x=ywgs, y=xwgs, color=factor(line), shape=factor(ifelse(station1$onoffratio>1.2,1,0)))) +
geom_text(data=station1, aes(x=ywgs, y=xwgs, label=onoffrhighname) ,hjust=0, vjust=0, size=3)  

# Quick base-graphics look at the ratio distribution
plot(sort(station1$onoffratio))
abline(h=1)


# Emphasize stations with high boarding variability (point size = CV)
plot(sort(station1$onpassmcv)) # CV >= 0.4 treated as high variability
station1$onpassmcvHstn <- ifelse(station1$onpassmcv>=0.4, as.character(station1$name),"")

ggmap(seoul) +
geom_point(data=station1, alpha=0.7,
  mapping=aes(x=ywgs, y=xwgs, size=onpassmcv, color=factor(line) )) +
geom_text(data=station1, aes(x=ywgs, y=xwgs, label=onpassmcvHstn) ,hjust=0, vjust=0, size=3)  


# Emphasize stations with *low* alighting variability
# (first drop three outlier station codes)
station2 <- station1[!(station1$station %in% c(260, 1022, 1458)),]
plot(sort(station2$offpassmcv)) # below ~0.1 looks like very low variability
# FIX: this section labels low *alighting* variability, so the threshold
# must test offpassmcv -- the original tested onpassmcv.
station2$offpassmcvLstn <- ifelse(station2$offpassmcv<0.1, as.character(station2$name),"")
# Inverted, normalized index: 1 = most stable alighting counts
station2$offpassmcvLndx <- 1- (station2$offpassmcv / max(station2$offpassmcv))

plot(station2$offpassmcv / max(station2$offpassmcv))


ggmap(seoul) +
geom_point(data=station2, alpha=0.5,
  mapping=aes(x=ywgs, y=xwgs, size=offpassmcvLndx/3, color=factor(line) )) +
geom_text(data=station2, aes(x=ywgs, y=xwgs, label=offpassmcvLstn) ,hjust=0, vjust=0, size=3)  

# FIX: lon/lat order for get_map() -- element names are ignored, so the
# original c(lat=..., lon=...) passed latitude first.
seoul <- get_map(location=c(lon=cent[2], lat=cent[1]), zoom=12,maptype = "toner")
ggmap(seoul) +
geom_point(aes(x = ywgs, y = xwgs, size = offpassmcvLndx, col = factor(line)),
  data=station2, alpha=0.5) +
annotate("text", x = station2$ywgs, y = station2$xwgs, label=station2$offpassmcvLstn, hjust=0, vjust=0, size=3)  


# Line 2 only: size each station by its April mean daily
# (boardings + alightings) total

station5 <- station[station$line %in% factor(2),]
stationtrff04_5 <- stationtrff04[stationtrff04$line=="2호선",]

# Daily total of boardings + alightings
stationtrff04_5$onoffpasssum <- stationtrff04_5$onpass + stationtrff04_5$offpass

# April mean of the daily totals, per station
stationtrffsum <- aggregate(stationtrff04_5$onoffpasssum, list(station=stationtrff04_5$station),
   FUN=mean)
names(stationtrffsum) <- c("station", "daily_onoffsum")
station5 <- merge(station5, stationtrffsum, by="station", all.x=T)

# toner version
cent <- c(mean(station5$xwgs), mean(station5$ywgs))  # center (lat, lon)
# FIX: lon/lat order for get_map() -- element names are ignored.
seoul <- get_map(location=c(lon=cent[2], lat=cent[1]), zoom=12, maptype = "toner") # or "watercolor"
ggmap(seoul) +
geom_point(aes(x = ywgs, y = xwgs, size = daily_onoffsum),
  data=station5, alpha=0.7, col="green") +
annotate("text", x = station5$ywgs, y = station5$xwgs, label=station5$name,
  col=ifelse(station5$station %in% c(222, 216, 239), "red", "black"), # top-3 stations in red
  hjust=0, vjust=0, size=3)  


# roadmap, possibly in a wide format
cent <- c(mean(station5$xwgs), mean(station5$ywgs))  # center (lat, lon)
# get_googlemap() takes a named center= argument (lon/lat), unlike get_map()
seoul <- get_googlemap(center=c(lon=cent[2], lat=cent[1]), zoom=11, maptype = "roadmap")
   # get_googlemap options applied
ggmap(seoul) +
geom_point(aes(x = ywgs, y = xwgs, size = daily_onoffsum),
  data=station5, alpha=0.7, col="green") +
annotate("text", x = station5$ywgs, y = station5$xwgs, label=station5$name, hjust=0, vjust=0, size=3)  


# Re-load the prepared line-2 dataset

# station5 <- read.csv("station5.csv")

# Blog-hosted copy of station5.csv
station5 <- read.csv("https://t1.daumcdn.net/cfile/blog/2160164D594C562634?download")


# Korean column names: station code, station name, line, x(lat), y(lon),
# daily boarding+alighting total, mean humidity
names(station5) <- c("역코드", "역명", "호선", "x좌표", "y좌표", "일일승하차인원합계", "평균습도")


# base plot version

# Simple base-graphics bubble plot: how busy each station is
plot(station5$y좌표, station5$x좌표, col="green", pch=19, cex=station5$일일승하차인원합계/100000,
   xlab="", ylab="", main="지하철역 붐비는 정도" ,
   xlim=c(126.82,127.13), ylim=c(37.47,37.58))
# FIX: the top-3 stations are identified by station *code* (222, 216, 239),
# so compare the code column (역코드), not the name column (역명) -- the
# original comparison was always FALSE and nothing was ever drawn in red.
text(station5$y좌표, station5$x좌표, labels=station5$역명, col=ifelse(station5$역코드 %in% c(222, 216, 239), "red", "black"), pos=3, cex=0.5)


# Compare traffic volume with mean humidity

plot(station5$일일승하차인원합계, station5$평균습도)

# FIX: highlight by station *code* (역코드) -- comparing the name column
# (역명) against numeric codes never matched, so nothing was drawn in red.
text(station5$일일승하차인원합계, station5$평균습도, labels=station5$역명, col=ifelse(station5$역코드 %in% c(222, 216, 239), "red", "black"), pos=3, cex=0.5)


#---- 04. Predictive analysis ---------------


# Correlation

plot(hr01$satisfaction_level, hr01$average_montly_hours)
abline(lm(hr01$average_montly_hours ~ hr01$satisfaction_level), col="red", lwd=2)
cor(hr01$satisfaction_level, hr01$average_montly_hours)

plot(hr01$satisfaction_level, jitter(hr01$number_project, factor=2))
abline(lm(hr01$number_project~hr01$satisfaction_level), col="red", lwd=2)
cor(hr01$satisfaction_level, hr01$number_project)

# Linear regression

plot(jitter(hr01$number_project, factor=2), hr01$satisfaction_level)
abline(lm(hr01$satisfaction_level~hr01$number_project), col="red", lwd=2)
lines(lowess(hr01$satisfaction_level~hr01$number_project), col="blue", lwd=2)

summary(lm(hr01$satisfaction_level~hr01$number_project))

# As a formula: satisfaction_level = -0.028839 * number_project + 0.722509

# Multiple regression on all predictors
summary(lm(left~., data=hr01))


# Decision tree

library(rpart) #load the rpart package

# minbucket is forwarded through "..." to rpart.control()
t1 <- rpart(left ~ ., data = hr01, minbucket = 500)

library(rattle)
library(rpart.plot)
library(RColorBrewer)

fancyRpartPlot(t1)


# ctree version

library(party)

# Convert the 0/1 target to a factor so ctree treats this as classification
hr01t <- hr01
hr01t$left <- as.factor(hr01t$left)

t1 <- ctree(left ~ ., data = hr01t, controls = ctree_control(minbucket = 500))
plot(t1)

# Shallower tree (max depth 3) for readability
t1 <- ctree(left ~ ., data = hr01t, controls = ctree_control(minbucket = 500, maxdepth=3))
plot(t1)

# Scatter-plot checks of the main split variables, colored by attrition
plot(hr01$satisfaction_level, jitter(hr01$time_spend_company, factor=2), col=(hr01$left+1)*3)

plot(hr01$satisfaction_level, jitter(hr01$number_project, factor=2), col=(hr01$left+1)*2)


# Clustering

# NOTE(review): columns 2,4,6,8 selected by position -- confirm they are the
# intended numeric features
hr06 <- hr01[,c(2,4,6,8)]
k01 <- kmeans(scale(hr06), 3)

plot(hr01$average_montly_hours,  hr01$satisfaction_level, col=k01$cluster+3)
plot(jitter(hr01$Work_accident, factor=2),  hr01$last_evaluation, col=k01$cluster+3)


#---- Forest-fire damage example

# setwd("E:/restore_Yong/GAMDFM/Ffire")
# ff <- read.csv("forestfires.csv")

# Blog-hosted copy of the UCI forest-fires dataset
ff <-read.csv("https://t1.daumcdn.net/cfile/blog/26021135594B019B01?download")


names(ff)

#------ Column description -------
# Montesinho park (Portugal)
#
# FIX: this block was pasted as plain text, a syntax error when the script
# is sourced; it is kept here as comments.
#    1. X - x-axis spatial coordinate within the Montesinho park map: 1 to 9
#    2. Y - y-axis spatial coordinate within the Montesinho park map: 2 to 9
#    3. month - month of the year: "jan" to "dec"
#    4. day - day of the week: "mon" to "sun"
#    5. FFMC - FFMC index from the FWI system: 18.7 to 96.20
#    6. DMC - DMC index from the FWI system: 1.1 to 291.3
#    7. DC - DC index from the FWI system: 7.9 to 860.6
#    8. ISI - ISI index from the FWI system: 0.0 to 56.10
#    9. temp - temperature in Celsius degrees: 2.2 to 33.30
#    10. RH - relative humidity in %: 15.0 to 100
#    11. wind - wind speed in km/h: 0.40 to 9.40
#    12. rain - outside rain in mm/m2 : 0.0 to 6.4
#    13. area - the burned area of the forest (in ha): 0.00 to 1090.84



# Conditional-inference tree on burned area
# FIX: library() instead of require() -- require() only warns (returns
# FALSE) when the package is missing, and every other package in this file
# is loaded with library().
library(party)
ct1 <- ctree(area~ .,
  data=ff, controls=ctree_control(maxdepth=3, minbucket=as.integer(nrow(ff)/100)))
plot(ct1)

plot(ff$area, jitter(ff$wind))
plot(sort(ff$area))


ff1 <- ff
# Log-transform the heavily skewed burned area (+1 keeps zeros finite)
ff1$logarea <- log(ff$area +1)

plot(sort(ff1$logarea))

# Drop the raw target plus DMC and DC
ff1 <- ff1[, !(names(ff1) %in% c("area", "DMC", "DC"))]

# FIX: removed the stray empty argument (", ,") in ctree_control(), which
# raises an "argument is empty" error.
ct2 <- ctree(logarea~ .,
  data=ff1, controls=ctree_control(maxdepth=3, minbucket=5, mincriterion = 0.2))
plot(ct2)

# Exclude December rows
ff11 <- ff1[ff1$month != "dec",]

plot(jitter(ff11$Y), jitter(ff11$temp), col=ifelse(ff11$logarea>0 ,2,1))
abline(h=25.9, lty=2)
abline(v=5, lty=2)


# FIX: the per-point color vector must come from the same (filtered) frame
# as the plotted coordinates; using ff1 here left the colors misaligned with
# the ff11 points once December rows were removed. max() over ff1 is kept so
# the color scale is unchanged.
plot(jitter(ff11$X), jitter(ff11$Y), col=rgb(ff11$logarea/max(ff1$logarea),0.3,0.3, alpha=0.7), pch=19)

#==========

# Do wind / rain / temperature separate burned vs. unburned fires?
plot(jitter(ff$wind), jitter(ff$rain), col=ifelse(ff$area>0,2,1))

plot(jitter(ff$wind), jitter(ff$temp), col=ifelse(ff$area>0,2,1))


# Binary target: "large" fire = burned area of 2 ha or more
ff3 <- ff
ff3$isLarea <- ifelse(ff3$area >= 2, 1, 0)
ff3 <- ff3[, !(names(ff3) == "area")]

ct3 <- ctree(isLarea~ .,
  data=ff3, controls=ctree_control(maxdepth=5, minbucket=3, mincriterion = 0.9))
plot(ct3)


# Same tree restricted to the weather variables
ct3 <- ctree(isLarea~ temp+wind+RH+rain,
  data=ff3, controls=ctree_control(maxdepth=5, minbucket=3, mincriterion = 0.9))
plot(ct3)



#---- Equipment-failure example

# setwd("E:/restore_Yong/GAMDFM/FAULT")

# dataset source: https://www.kaggle.com/ludobenistant/predictive-maintenance

# m1 <- read.csv("maintenance_data.csv")

# Blog-hosted copy of the maintenance dataset
m1 <- read.csv("https://t1.daumcdn.net/cfile/blog/27711B35594B019D05?download")


plot(m1$temperatureInd, m1$pressureInd, col=ifelse(m1$broken==1, 2,1))

m2 <- m1

# Lifetime in years (integer truncation of months / 12)
m2$lifetimeyear <- as.integer(m2$lifetime/12)
plot(sort(m2$lifetimeyear))

# Discretize pressure / moisture / temperature into L/M/H bands
plot(sort(m2$pressureInd))
m2$pressureCla <- ifelse(m2$pressureInd >= 120, "pressure_H",ifelse(m2$pressureInd >= 80, "pressure_M","pressure_L"))
table(m2$pressureCla)

plot(sort(m2$moistureInd))
m2$moistureCla <- ifelse(m2$moistureInd >= 110, "moisture_H",ifelse(m2$moistureInd >= 85, "moisture_M","moisture_L"))
table(m2$moistureCla)

plot(sort(m2$temperatureInd))
m2$temperatureCla <- ifelse(m2$temperatureInd >= 130, "temperature_H",ifelse(m2$temperatureInd >= 80, "temperature_M","temperature_L"))
table(m2$temperatureCla)

barplot(table(m2$broken, m2$lifetimeyear ))
# barplot(prop.table(table(m2$broken, m2$lifetimeyear ),2)) # compare as proportions

m2$brokenCla <- ifelse(m2$broken==1,"broken", "normal")

m2$lifetimeyearCla <- paste0("lifetime", as.character(m2$lifetimeyear))

# Keep only the categorical columns for market-basket-style analysis
m3 <- m2[, (names(m2) %in% c("brokenCla", "team", "provider", "lifetimeyearCla", "pressureCla", "moistureCla", "temperatureCla" ))]


#----------

# FIX: one-time package installs commented out, matching the convention used
# everywhere else in this script (see the KoNLP / wordcloud2 sections).
# install.packages("arules")
# install.packages("arulesViz")
# install.packages("datasets")


library(arules)
library(arulesViz)
library(datasets)



# FIX: write.csv() ignores col.names (and emits a warning); the header row
# it always writes is skipped on re-read via skip=1 below.
write.csv(m3, "maint_trs.csv", row.names=F)
m4 <- read.transactions("maint_trs.csv", format="basket",  sep="," , skip=1);

inspect(head(m4, 5))


# (duplicate of the line above, kept as-is)
inspect(head(m4, 5))
itemFrequencyPlot(m4,topN=20,type="absolute")

# Pairwise rules with modest support/confidence thresholds
rules <- apriori(m4, parameter = list(supp = 0.03, conf = 0.3, minlen=2, maxlen=2))

options(digits=3)
inspect(rules[1:20])

summary(rules)


# NOTE(review): relies on inspect() returning a data frame; in newer arules
# versions prefer as(rules, "data.frame") / DATAFRAME(rules) -- confirm
df1 <- as.data.frame(inspect(rules))

# Rules implying "broken" with positive lift, strongest confidence first
df2 <- df1[df1$rhs=="{broken}" & df1$lift>1 , ]
df2[order(-df2$confidence),]

plot(rules)

# Lower-support, higher-confidence variant
rules1 <- apriori(m4, parameter = list(supp = 0.01, conf = 0.6, minlen=2, maxlen=2))


rules2 <- head(sort(rules, by="confidence"), 30)

# Rules whose right-hand side mentions "broken" (partial match)
rules3 <- subset(rules1, subset=rhs %pin% "broken")

rules4 <- subset(rules1, subset=rhs %in% c("broken", "normal"))

inspect(rules4)

plot(rules4, method="graph", interactive=F, shading=NA)

# Interactive variant (opens an interactive graph device)
plot(rules4, method="graph", interactive=T, shading=NA)

df5 <- as.data.frame(inspect(rules4))

df5[order(-df5$confidence),]



#---- 05. Text analysis ------------


# FIX: this heading was plain text (a syntax error when the script is
# sourced); kept as a comment.
# :: text mining (+ crawling)

#-----[crawling :: data harvesting ; scraping ; wrangling ]---

# install.packages("XML")
# install.packages("rvest")

library(XML)
library(rvest)

# Each assignment below overwrites the previous one; only the last URL is
# used unless the lines are run selectively.
#-- Naver news for keyword [지하철] (subway)
# Naver -> news search -> keyword: 지하철
newsURL <- "http://news.naver.com/main/search/search.nhn?query=%C1%F6%C7%CF%C3%B6&st=news.all&q_enc=EUC-KR&r_enc=UTF-8&r_format=xml&rp=none&sm=all.basic&ic=all&so=rel.dsc&detail=0&pd=1&r_cluster2_start=1&r_cluster2_display=10&start=1&display=10&page="

# Naver -> news search -> keyword: 서울 지하철
newsURL <- "http://news.naver.com/main/search/search.nhn?query=%BC%AD%BF%EF+%C1%F6%C7%CF%C3%B6&st=news.all&q_enc=EUC-KR&r_enc=UTF-8&r_format=xml&rp=none&sm=all.basic&ic=all&so=rel.dsc&detail=0&pd=1&r_cluster2_start=1&r_cluster2_display=10&start=1&display=10&page="

# Naver -> news search -> keyword: 서울 지하철 불편
newsURL <- "http://news.naver.com/main/search/search.nhn?query=%BC%AD%BF%EF+%C1%F6%C7%CF%C3%B6+%BA%D2%C6%ED&st=news.all&q_enc=EUC-KR&r_enc=UTF-8&r_format=xml&rp=none&sm=all.basic&ic=all&so=rel.dsc&detail=0&pd=1&r_cluster2_start=1&r_cluster2_display=10&start=1&display=10&page="


# Scrape one result page of a Naver news search.
#   num:     result-page number appended to newsURL
#   newsURL: search URL prefix (ends with "...&page=")
# Returns a character matrix with columns subject / date / main / press.
GetStockCommentData <- function(num=1, newsURL){
  url <- gsub(" ","",paste0(newsURL,as.character(num)))
  doc <- htmlTreeParse(url, useInternalNodes = T)
  print(url)

  # NOTE(review): the XPath expressions depend on Naver's 2017 markup --
  # verify against the current page structure before reuse
  subject <- xpathSApply(doc, "//div[@class='ct']/a", xmlValue)#subject
  date <- xpathSApply(doc, "//div[@class='ct']/div/span[4]", xmlValue)#publication date
  main <- xpathSApply(doc, "//div[@class='ct']/p", xmlValue)#article summary
  press <-xpathSApply(doc, "//div[@class='ct']/div/span[2]", xmlValue)#press name
  url2 <- xpathSApply(doc, "//div[@class='ct']/div/a", xmlGetAttr,'href')

  # Re-encode from UTF-8 to EUC-KR (matching the q_enc of the search URL)
  subject <- iconv(subject,"UTF-8","EUC-KR")
  date <- iconv(date,"UTF-8","EUC-KR")
  main <- iconv(main,"UTF-8","EUC-KR")
  press <- iconv(press,"UTF-8","EUC-KR")

  # Strip carriage returns / tabs / newlines from the body text
  main <- gsub("\r|\t|\n","",main)

  # url2 is collected but intentionally not included in the result
  stock_data <-cbind(subject,date,main,press)
  return(stock_data)
}
gc() # free unreferenced memory


# Accumulator and page range for the crawl
DATA<-NULL
StartPage <- 1
EndPage <- 5

# Paste a keyword-search result page URL into newsUrl before running
# (long example kept below, date-ranged search for 2017-04-01..04-15)
# newsUrl <- "http://news.naver.com/main/search/search.nhn?query=%B4%EB%C7%D0%B1%B3&st=news.all&q_enc=EUC-KR&r_enc=UTF-8&r_format=xml&rp=none&sm=all.basic&ic=all&so=rel.dsc&rcnews=exist:032:005:086:020:021:081:022:023:025:028:038:469:421:003:001:422:449:004:215:437:056:214:019:057:096:374:055:448:052:009:008:011:277:018:366:014:015:016:375:079:119:006:047:143:002:138:029:293:031:030:092:145:024:417:242:308:262:140:094:243:007:033:037:053:042:353:105:036:050:&stDate=range:20170401:20170415&detail=0&pd=4&r_cluster2_start=1&r_cluster2_display=10&start=1&display=10&startDate=2017-04-01&endDate=2017-04-15"



DATA<-NULL
StartPage <- 1
EndPage <- 5

# Fetch pages StartPage..EndPage and stack the scraped rows
# NOTE(review): DATA grows via rbind inside the loop -- fine for 5 pages
for (i in StartPage:EndPage) {
  getData<-GetStockCommentData(i, newsURL)
  print(length(getData))
  DATA<-rbind(DATA,getData) 
  print(paste0("page ", as.character(i), " done"))
}

# write.csv(DATA,"F:/data/aaa.csv")
#------------
#------------



library(XML)
library(rvest)

#--- Collect Naver blog posts for a keyword

# https ==> http [per the original note: replace https with http before running]

# Scrape one page of Naver blog search results. The keyword is hard-coded in
# the URL ("서울 지하철", URL-encoded). Returns a character matrix with
# columns subject / date / main / nick / category / href.
GetStockCommentData <- function(num){
  url = gsub(" ","",paste0("https://section.blog.naver.com/sub/SearchBlog.nhn?type=post&option.keyword=%EC%84%9C%EC%9A%B8%20%EC%A7%80%ED%95%98%EC%B2%A0&term=&option.startDate=&option.endDate=&option.page.currentPage=", as.character(num)))
  print(url)
  doc = htmlTreeParse(url, useInternalNodes = T, encoding="UTF-8")
  # NOTE(review): this xpathSApply result is discarded -- looks like a
  # leftover probe of the page structure
  xpathSApply(doc, "//*[@id='blogSearchForm']/div[2]/ul[3]", xmlValue)


  subject <- xpathSApply(doc, "//ul[@class='list_type_1 search_list']/li/h5/a", xmlValue)#subject
  print(subject)
  date <- xpathSApply(doc, "//span[@class='date']", xmlValue)  #post date
  main <- xpathSApply(doc, "//div[@class='list_content']", xmlValue)  #body summary
  main <- gsub("\r|\n","",main)
  nick <- xpathSApply(doc, "//div[@class='list_data']/a", xmlValue)  #blog nickname
  category <- xpathSApply(doc, "//span[@class='category']/a", xmlValue)  #blog category
  href <- xpathSApply(doc, "//ul[@class='list_type_1 search_list']/li/h5/a", xmlGetAttr,'href')

  stock_data <-cbind(subject, date, main, nick, category, href)
  print(stock_data)
  return(stock_data)
}


gc()
# Collect pages 1..5 of the blog search results
DATA<-NULL
StartPage <- 1
EndPage <- 5

for (i in StartPage:EndPage) {
  getData<-GetStockCommentData(i)
  DATA<-rbind(DATA,getData)
}

# write.csv(DATA,"aaablog.csv",row.names = F)
#--------------------



#------ Build a word cloud ------------

# install.packages("KoNLP")
library(KoNLP)
useSejongDic() # load the Sejong dictionary

# setwd(".... set user path ....")
# DATA <- readLines("abcd.txt") # when reading from a text file instead


# Concatenate subject + body of every scraped article into one string
article <- paste0(as.data.frame(DATA)$subject, as.data.frame(DATA)$main )
# (inline demo of collapse= behavior; the result is unused)
paste(c("acde","ee"),collapse="")
article <- paste0(article,collapse="")

# Extract nouns with KoNLP
article1 <- sapply(article,extractNoun,USE.NAMES=F)

article2 <- unlist(article1)
# Keep nouns of 2+ characters
article3 <- Filter(function(x){nchar(x)>=2},article2)

# Remove whitespace and a few noise tokens
article3 <- gsub("\r|\n|\t","", article3)
article3 <- gsub("하기","", article3)
article3 <- gsub("△","", article3)
article3 <- gsub("하게","", article3)

# article3 <- gsub("빅데이터를","빅데이터", article3)
# article3 <- gsub("빅데이터로","빅데이터", article3)

# Word frequency table
wordcount <- table(article3)
barplot(sort(wordcount), las=2, cex.names =0.7)
barplot(head(tail(wordcount[order(-wordcount)],length(wordcount)-2),20)) # sort table

# wc1 <- head(tail(wordcount[order(-wordcount)],length(wordcount)-3),50)
# wc1 <- head(wordcount[order(-wordcount)],50)[c(2,5:50)]
# Top-50 words by frequency
wc1 <- head(wordcount[order(-wordcount)],50)
barplot(wc1, las=2, cex.names =0.7)


head(as.data.frame(wordcount))
df_a3 <- as.data.frame(wordcount)
# Keep words appearing 3+ times
df_a31 <- df_a3[df_a3$Freq>=3, ]
barplot(df_a31$Freq, names.arg=df_a31$article3, cex.names=0.7)
head(df_a31[order(- df_a31$Freq),],20)

barplot(wordcount[order(wordcount)])

#------ wordcloud rendering --------------------

# install.packages("wordcloud2")
library(wordcloud2)
# windowsFonts(malgun=windowsFont("맑은 고딕"))
# NOTE(review): -pi/0 evaluates to -Inf; the usual fixed-rotation recipe
# uses -pi/6 (or 0) -- confirm the intended angle
wordcloud2(data=wordcount, minSize = 7, minRotation = -pi/0, maxRotation = -pi/0,     rotateRatio = 1, fontFamily='맑은 고딕', size=0.8)

# Word cloud excluding <지하철> -- diamond shape
wordcloud2(data=wc1, minSize = 7, minRotation = -pi/0, maxRotation = -pi/0,  rotateRatio = 1,
   fontFamily='맑은 고딕', size=0.7, shape='diamond')

wordcloud2(data=wordcount, minSize = 7, minRotation = -pi/0, maxRotation = -pi/0,  rotateRatio = 1,
   fontFamily='맑은 고딕', size=0.7, shape='diamond')

# Word cloud excluding <빅데이터> and <분석> -- diamond shape
wordcloud2(data=wc1, minSize = 3, minRotation = -pi/0, maxRotation = -pi/0,     rotateRatio = 1, fontFamily='맑은 고딕', size=0.4, shape='diamond')


#-------[per-article noun table]------

df_DATA <- as.data.frame(DATA)
# Subject + body per article (columns 1 and 3)
a1 <- apply(df_DATA[,c(1,3)], 1, paste, collapse=" ")
a1 <- sapply(a1,extractNoun,USE.NAMES=F)

# Flatten into (article id, noun) pairs
# NOTE(review): grows vectors inside the loop -- acceptable for small DATA
a2 <- NA
aid <- NA
for(i in 1:length(a1)) {
  a2 <- c(a2, a1[[i]])
  aid <- c(aid, rep(i, length(a1[[i]])) )
}

df_article <- data.frame(aid, a2)
# Drop the seed NA row
df_article <- df_article[!is.na(df_article$aid),]
head(df_article, 60)
# Keep nouns of 2+ characters
df_article <- df_article[nchar(as.character(df_article$a2))>=2,]
head(df_article, 60)

# Clean whitespace, smart quotes, brackets and a few noise words
df_article$a2 <- gsub("\r|\n|\t","", df_article$a2)
df_article$a2 <- gsub("“|”|\\[|\\]|오전|오후|뉴시스와","", df_article$a2)

# Normalize particle-suffixed variants of "빅데이터"
df_article$a2 <- gsub("빅데이터를","빅데이터", df_article$a2)
df_article$a2 <- gsub("빅데이터로","빅데이터", df_article$a2)
df_article <- df_article[nchar(as.character(df_article$a2))>=2,]
head(df_article, 60)



#------- association rules [single format]-----------
# Co-occurrence associations between keywords

library(arules)
library(arulesViz)

# NOTE(review): hard-coded setwd() kept from the original workflow
setwd("E:/restore_Yong/GAMDFM/text_m")
head(df_article)

write.csv(df_article, "dfa.csv", row.names=F)

# Read back as transactions: one (article id, item) pair per row
tr <- read.transactions("dfa.csv", format = "single", sep=',',
  skip=1, rm.duplicates = FALSE, cols=c(1,2))
inspect(head(tr))

itemFrequencyPlot(tr,topN=20,type="absolute")

# Pairwise keyword rules
srules <- apriori(tr, parameter = list(supp = 0.06, conf = 0.4, minlen=2, maxlen=2))
# options(digits=3)
inspect(head(srules))
summary(srules)

# Top-50 by confidence, then keep positive-lift rules
rules1 <- head(sort(srules, by="confidence"), 50)
inspect(rules1)
rules2 <- subset(rules1, subset=(lift>1))
# rules2 <- subset(rules1, subset=(!(lhs %pin% "서울|지하철|들이")))
# rules2 <- subset(rules2, subset=(!(rhs %pin% "서울|지하철|들이")))
inspect(head(sort(rules2, by="lift"), 30))

plot(rules2, method="graph", interactive=T, shading=NA)

inspect(rules1)
inspect(head(sort(rules2, by="lift"), 10))

#--- 끝 -----

CARD_SUBWAY_MONTH_201704.csv
1.1MB
CARD_SUBWAY_MONTH_201704_a.csv
0.8MB
installRnRstudio.txt
0.0MB
station5.csv
0.0MB
maintenance_data.csv
0.06MB
met_scrpt_20170607.txt
0.02MB
subway_station.csv
0.02MB
HR_DATA.csv
0.54MB
forestfires.csv
0.02MB