>RE::VISION CRM

R 데이터 분석

movie prediction exmple

YONG_X 2015. 7. 8. 17:56

# ctree 연습과 해석 이해용


dia<-read.csv("diamonds.csv")


names(dia)


# ctree를 그리기 위한 패키지 불러오기

require(party)


# 아무 변수나 사용해서 한번 그려보기

t1 <- ctree(dia$price ~ dia$x +dia$y+dia$z )

plot(t1)


# 트리의 크기를 조정

t1 <- ctree(dia$price ~ dia$x +dia$y+dia$z, control=ctree_control(minsplit = 100, maxdepth = 


3, minbucket = 50) )

plot(t1)


# 만일 한 독립변수 predictor만 사용해서 트리를 그린다면?

t1 <- ctree(dia$price ~ dia$x, control=ctree_control(minsplit = 100, maxdepth = 3, minbucket 


= 50) )

plot(t1)


# scatter 플롯과 비교해보면?

plot(dia$price , dia$x)

plot( dia$x, dia$price )

abline(lm(dia$price ~ dia$x), col="red")


# 결론:: 한 변수만을 가지고 트리를 그리는 것은 구간을 자동 설정하는 것과 같음



#--------------------------


data sets from KDD competition

# reading in the movie data set

t <- read.csv("movietrain_v2.csv")

length(unique(t$movie))

[1] 3676

nrow(t)

[1] 750156


aq1 <- sqldf('select movie, count(*) as cnt, avg(rating) as

avg_rat from t group by movie')

aq1 <- aq1[order(- aq1$cnt),]

h1 <- head(aq1,254)

tail(h1)

aq2 <- sum(h1$cnt)

mvttl <- read.csv('mv_title.txt')

h1$mvttl <- mvttl$movietitle


# EDA with movie set =============


plot(mvr$rating)

plot(sort(mvr$rating))

q301 <- sqldf('select user,

count(*) as rcnt

from mvr

group by user')


# rating을 많이 하는 사람일 수록 점수를 짜게?

q302 <- sqldf('select user,

count(*) as rcnt,

avg(rating) as avg_r

from mvr

group by user')


# ranking이 높은 영화가 user로부터 rating 건수가

# 많은가?


q303 <-sqldf('select movie,

count(*) as rcnt_mv

from mvr

group by movie')


q304 <- sqldf('select a.*, b. rcnt_mv

from h1 as a left join q303 as b

on a.movie=b.movie

')

plot(q304$mranking, q304$rcnt_mv)



# ranking이 높으면 rating도 높을까?


q305 <-sqldf('select movie,

avg(rating) as avgr_mv

from mvr

group by movie')


q306 <- sqldf('select a.*, b. avgr_mv

from h1 as a left join q305 as b

on a.movie=b.movie

')


plot(q306$mranking, q306$avgr_mv)


# 연도 추출해오기

h1$myear <- substr(h1$mvttl, nchar(h1$mvttl) -8, nchar(h1$mvttl)-5)


# 출시연도별 raking과 관계는?


# 출시연도 정제


> range(h1$myear)

[1] " Il " "il b"

> max(h1$myear)

[1] "il b"

> min(h1$myear)

[1] " Il "

> head(h1$myear)

[1] "1994" "1972" "1974" "2008" "1994" "1993"

> tail(h1$myear)

[1] "1991" "2004" "1973" "1962" "1960" "1972"

> head(sort(h1$myear))

[1] " Il " " Mon" "001)" "138." "1921" "1925"

> tail(sort(h1$myear))

[1] "2014" "2014" "2015" "2015" "980)" "il b"


q307 <- sqldf('select a.*, b. mranking

from h1 as a left join q306 as b

on a.movie=b.movie

where myear1 > 1900

')


plot(q307$mranking, q307$myear)


q308 <- sqldf('select a.*, b.avgr_mv 

from h1 as a left join q306 as b

on a.movie=b.movie

where myear1 > 1900

')


plot(q308$avgr_mv, q308$myear)

abline(lm(q308$myear~q308$avgr_mv), col="red")



#--------------------------


[연습과제 1 ] 호불호가 극단적으로 갈리는 영화는??

호 = *****

불호 = * OR **


q309 <- sqldf('select movie,

count(*) as ncnt_HR

from mvr

where rating =5

group by movie ')


q310 <- sqldf('select movie,

count(*) as ncnt_LR

from mvr

where rating in (1,2 )

group by movie ')


q311 <- sqldf('select movie,

count(*) as ncnt

from mvr

group by movie ')


q312 <- sqldf('select a.*, b.ncnt_HR, c.ncnt_LR

from q311 as a 

  left join q309 as b

  on a.movie=b.movie

  left join q310 as c

  on a.movie=c.movie  ')


q312$ncnt_HR <- ifelse(is.na(q312$ncnt_HR) ,0, q312$ncnt_HR)

q312$ncnt_LR <- ifelse(is.na(q312$ncnt_LR) ,0, q312$ncnt_LR)


q312$extr <- (q312$ncnt_HR+q312$ncnt_LR) / q312$ncnt


q312$extreven <- 1 -( abs(q312$ncnt_HR-q312$ncnt_LR) / (q312$ncnt_HR+q312$ncnt_LR) )


plot(q312$extr, q312$extreven )


q313 <- q312[q312$extr>=0.5 & q312$extreven>0.8 & q312$ncnt>=50,] 


plot(q313$extr, q313$extreven )



[연습과제 2 ] rating median이 가장 높은 top 10 영화중 가장 rating count가 작은 영화는?




#=========================

영화 - 250등이내에 들 것으로 보이는 영화 ("YES")

분류 모델


영화의 특성


- 평가회원수

- 평가회원의 평균평점


#-------


- 평가에서 5점을 준 회원수

- 평가에서 3점이라를 준 회원수

- 평가회원의 평균평가수

- 평가회원중 상위 20%의 평가수


   - 영화출시후 기간(년) (X)



q320 <- sqldf('select distinct movie

from mvr')


q321 <- sqldf('select a.*, b.myear1 as is_in_250

from q320 as a left join h1 as b

on a.movie=b.movie')


q321$is_in_250 <- ifelse(is.na(q321$is_in_250), "NO","YES")


q321 <- sqldf('select movie, "YES" as is_in_250

from h1 ')


q322 <- sqldf('select a.*, b.is_in_250 as is_in_250

from q320 as a left join q321 as b

on a.movie=b.movie')


q322$is_in_250 <- ifelse(is.na(q322$is_in_250), "NO",q322$is_in_250)


- 평가회원수


q323 <- sqldf('select movie,

count(distinct user) as nusr

from mvr

group by movie')


- 평가회원의 평균평점


q324 <- sqldf('select user,

avg(rating) as avg_rating

from mvr

group by user')


q325 <- sqldf('select a.movie,

avg(b.avg_rating) as avg_u_rating

from mvr as a left join q324 as b

on a.user=b.user

group by a.movie')


q326 <- sqldf('select a.*,

b.nusr,

c.avg_u_rating

from q322 as a 

  left join q323 as b on a.movie=b.movie

  left join q325 as c on a.movie=c.movie ')


q326$is_in_250<-as.factor(q326$is_in_250)

q326$avg_u_rating<-as.numeric(q326$avg_u_rating)


t3 <-  ctree(q326$is_in_250~q326$nusr+q326$avg_u_rating)

plot(t3)


#-- sampling ---------


plot(sort(rnorm(100,0,1)))

hist(sort(rnorm(100,0,1)))


sample(rnorm(100,1,2), size = 10, replace = FALSE)

sample(1:5, size=20, prob=c(5,1,1,1,1), replace=TRUE)

sample.int(1e10, 12, replace = TRUE)

runif(5,0,1)


plot((sort(runif(10000,0,1))))


mvr1 <-mvr

mvr1$rndseq <-rnorm(nrow(mvr),0,1)


q326$rndseq <-rnorm(nrow(q326),0,1)

quantile(q326$rndseq, c(0.5))


q326train <- q326[q326$rndseq>= 0.005029539 ,]

q326test <- q326[q326$rndseq< 0.005029539 ,]


t4 <-  ctree(is_in_250~ avg_u_rating, data=q326train)

plot(t4)



#---------

q326trainNo <- q326[q326$is_in_250=="NO" ,]

quantile(q326trainNo$rndseq, 0.2)


q326trainNo1 <- q326trainNo[q326trainNo$rndseq< -0.8234932  ,]

q326trainYes <- q326[q326$is_in_250=="YES" ,]


q326trainNew <- rbind(q326trainNo1, q326trainYes)


t5 <-  ctree(is_in_250~ avg_u_rating, data=q326trainNew, control=ctree_control(minsplit = 10, 


maxdepth = 5, minbucket = 3) )

plot(t5)


cfm1 <- table(predict(t5, newdata=q326test ), q326test$is_in_250)

class(cfm1)

accuracyct1 <- (cfm1[1,1] + cfm1[2,2] ) / nrow(q326test) *100