# ctree 연습과 해석 이해용
dia<-read.csv("diamonds.csv")
names(dia)
# ctree를 그리기 위한 패키지 불러오기
require(party)
# 아무 변수나 사용해서 한번 그려보기
t1 <- ctree(dia$price ~ dia$x +dia$y+dia$z )
plot(t1)
# 트리의 크기를 조정
t1 <- ctree(dia$price ~ dia$x +dia$y+dia$z, control=ctree_control(minsplit = 100, maxdepth =
3, minbucket = 50) )
plot(t1)
# 만일 한 독립변수 predictor만 사용해서 트리를 그린다면?
t1 <- ctree(dia$price ~ dia$x, control=ctree_control(minsplit = 100, maxdepth = 3, minbucket
= 50) )
plot(t1)
# scatter 플롯과 비교해보면?
plot(dia$price , dia$x)
plot( dia$x, dia$price )
abline(lm(dia$price ~ dia$x), col="red")
# 결론:: 한 변수만을 가지고 트리를 그리는 것은 구간을 자동 설정하는 것과 같음
#--------------------------
data sets from KDD competition
# reading in the movie data set
t <- read.csv("movietrain_v2.csv")
length(unique(t$movie))
[1] 3676
nrow(t)
[1] 750156
aq1 <- sqldf('select movie, count(*) as cnt, avg(rating) as
avg_rat from t group by movie')
aq1 <- aq1[order(- aq1$cnt),]
h1 <- head(aq1,254)
tail(h1)
aq2 <- sum(h1$cnt)
mvttl <- read.csv('mv_title.txt')
h1$mvttl <- mvttl$movietitle
# EDA with movie set =============
plot(mvr$rating)
plot(sort(mvr$rating))
q301 <- sqldf('select user,
count(*) as rcnt
from mvr
group by user')
# rating을 많이 하는 사람일 수록 점수를 짜게?
q302 <- sqldf('select user,
count(*) as rcnt,
avg(rating) as avg_r
from mvr
group by user')
# ranking이 높은 영화가 user로부터 rating 건수가
# 많은가?
q303 <-sqldf('select movie,
count(*) as rcnt_mv
from mvr
group by movie')
q304 <- sqldf('select a.*, b. rcnt_mv
from h1 as a left join q303 as b
on a.movie=b.movie
')
plot(q304$mranking, q304$rcnt_mv)
# ranking이 높으면 rating도 높을까?
q305 <-sqldf('select movie,
avg(rating) as avgr_mv
from mvr
group by movie')
q306 <- sqldf('select a.*, b. avgr_mv
from h1 as a left join q305 as b
on a.movie=b.movie
')
plot(q306$mranking, q306$avgr_mv)
# 연도 추출해오기
h1$myear <- substr(h1$mvttl, nchar(h1$mvttl) -8, nchar(h1$mvttl)-5)
# 출시연도별 raking과 관계는?
# 출시연도 정제
> range(h1$myear)
[1] " Il " "il b"
> max(h1$myear)
[1] "il b"
> min(h1$myear)
[1] " Il "
> head(h1$myear)
[1] "1994" "1972" "1974" "2008" "1994" "1993"
> tail(h1$myear)
[1] "1991" "2004" "1973" "1962" "1960" "1972"
> head(sort(h1$myear))
[1] " Il " " Mon" "001)" "138." "1921" "1925"
> tail(sort(h1$myear))
[1] "2014" "2014" "2015" "2015" "980)" "il b"
q307 <- sqldf('select a.*, b. mranking
from h1 as a left join q306 as b
on a.movie=b.movie
where myear1 > 1900
')
plot(q307$mranking, q307$myear)
q308 <- sqldf('select a.*, b.avgr_mv
from h1 as a left join q306 as b
on a.movie=b.movie
where myear1 > 1900
')
plot(q308$avgr_mv, q308$myear)
abline(lm(q308$myear~q308$avgr_mv), col="red")
#--------------------------
[연습과제 1 ] 호불호가 극단적으로 갈리는 영화는??
호 = *****
불호 = * OR **
q309 <- sqldf('select movie,
count(*) as ncnt_HR
from mvr
where rating =5
group by movie ')
q310 <- sqldf('select movie,
count(*) as ncnt_LR
from mvr
where rating in (1,2 )
group by movie ')
q311 <- sqldf('select movie,
count(*) as ncnt
from mvr
group by movie ')
q312 <- sqldf('select a.*, b.ncnt_HR, c.ncnt_LR
from q311 as a
left join q309 as b
on a.movie=b.movie
left join q310 as c
on a.movie=c.movie ')
q312$ncnt_HR <- ifelse(is.na(q312$ncnt_HR) ,0, q312$ncnt_HR)
q312$ncnt_LR <- ifelse(is.na(q312$ncnt_LR) ,0, q312$ncnt_LR)
q312$extr <- (q312$ncnt_HR+q312$ncnt_LR) / q312$ncnt
q312$extreven <- 1 -( abs(q312$ncnt_HR-q312$ncnt_LR) / (q312$ncnt_HR+q312$ncnt_LR) )
plot(q312$extr, q312$extreven )
q313 <- q312[q312$extr>=0.5 & q312$extreven>0.8 & q312$ncnt>=50,]
plot(q313$extr, q313$extreven )
[연습과제 2 ] rating median이 가장 높은 top 10 영화중 가장 rating count가 작은 영화는?
#=========================
영화 - 250등이내에 들 것으로 보이는 영화 ("YES")
분류 모델
영화의 특성
- 평가회원수
- 평가회원의 평균평점
#-------
- 평가에서 5점을 준 회원수
- 평가에서 3점이라를 준 회원수
- 평가회원의 평균평가수
- 평가회원중 상위 20%의 평가수
- 영화출시후 기간(년) (X)
q320 <- sqldf('select distinct movie
from mvr')
q321 <- sqldf('select a.*, b.myear1 as is_in_250
from q320 as a left join h1 as b
on a.movie=b.movie')
q321$is_in_250 <- ifelse(is.na(q321$is_in_250), "NO","YES")
q321 <- sqldf('select movie, "YES" as is_in_250
from h1 ')
q322 <- sqldf('select a.*, b.is_in_250 as is_in_250
from q320 as a left join q321 as b
on a.movie=b.movie')
q322$is_in_250 <- ifelse(is.na(q322$is_in_250), "NO",q322$is_in_250)
- 평가회원수
q323 <- sqldf('select movie,
count(distinct user) as nusr
from mvr
group by movie')
- 평가회원의 평균평점
q324 <- sqldf('select user,
avg(rating) as avg_rating
from mvr
group by user')
q325 <- sqldf('select a.movie,
avg(b.avg_rating) as avg_u_rating
from mvr as a left join q324 as b
on a.user=b.user
group by a.movie')
q326 <- sqldf('select a.*,
b.nusr,
c.avg_u_rating
from q322 as a
left join q323 as b on a.movie=b.movie
left join q325 as c on a.movie=c.movie ')
q326$is_in_250<-as.factor(q326$is_in_250)
q326$avg_u_rating<-as.numeric(q326$avg_u_rating)
t3 <- ctree(q326$is_in_250~q326$nusr+q326$avg_u_rating)
plot(t3)
#-- sampling ---------
plot(sort(rnorm(100,0,1)))
hist(sort(rnorm(100,0,1)))
sample(rnorm(100,1,2), size = 10, replace = FALSE)
sample(1:5, size=20, prob=c(5,1,1,1,1), replace=TRUE)
sample.int(1e10, 12, replace = TRUE)
runif(5,0,1)
plot((sort(runif(10000,0,1))))
mvr1 <-mvr
mvr1$rndseq <-rnorm(nrow(mvr),0,1)
q326$rndseq <-rnorm(nrow(q326),0,1)
quantile(q326$rndseq, c(0.5))
q326train <- q326[q326$rndseq>= 0.005029539 ,]
q326test <- q326[q326$rndseq< 0.005029539 ,]
t4 <- ctree(is_in_250~ avg_u_rating, data=q326train)
plot(t4)
#---------
q326trainNo <- q326[q326$is_in_250=="NO" ,]
quantile(q326trainNo$rndseq, 0.2)
q326trainNo1 <- q326trainNo[q326trainNo$rndseq< -0.8234932 ,]
q326trainYes <- q326[q326$is_in_250=="YES" ,]
q326trainNew <- rbind(q326trainNo1, q326trainYes)
t5 <- ctree(is_in_250~ avg_u_rating, data=q326trainNew, control=ctree_control(minsplit = 10,
maxdepth = 5, minbucket = 3) )
plot(t5)
cfm1 <- table(predict(t5, newdata=q326test ), q326test$is_in_250)
class(cfm1)
accuracyct1 <- (cfm1[1,1] + cfm1[2,2] ) / nrow(q326test) *100
'R 데이터 분석' 카테고리의 다른 글
[retail example ] R :: Tree, R, Scoring, cumulative gains chart plotting , AIC oversampling, (0) | 2015.07.11 |
---|---|
CVS example (0) | 2015.07.09 |
predictive modeling :: algorithms and interpretation (0) | 2015.07.08 |
retail :: 예측모델링 data prep + tree model (0) | 2015.07.07 |
:: retail data preparation and EDA 연습 과제 (0) | 2015.07.06 |