>RE::VISION CRM

R 데이터 분석

[R분석] EDA탐색적분석 base R (mtcars)

YONG_X 2018. 9. 18. 11:07

# Practicing EDA using Base R Plots

# Exploratory Data Analysis

# 리비젼컨설팅. 전용준. 2018


# Quick look at the data: first six rows, then (rows, columns).
head(mtcars, n = 6L)
c(nrow(mtcars), ncol(mtcars))


# 아래 사진 클릭시 YouTube 동영상으로 연결





# [1] Check the distribution of the target (mpg)

# Index plot of mpg in row order. The dashed line marks the mean and the
# dotted blue line marks the median.
plot(mtcars$mpg)
abline(h = mean(mtcars$mpg), lty = 2)
abline(h = median(mtcars$mpg), lty = 3, col = "blue")

# If the mean and the median do not coincide, the distribution is not
# symmetric (= skewed).

# Same values sorted in ascending order, with the same reference lines.
plot(sort(mtcars$mpg))
abline(h = mean(mtcars$mpg), lty = 2)
abline(h = median(mtcars$mpg), lty = 3, col = "blue")

# Same values sorted in descending order. TRUE is spelled out because T is an
# ordinary variable that can be reassigned; the only visible effect is that
# the auto-generated y-axis label now reads "decreasing = TRUE".
plot(sort(mtcars$mpg, decreasing = TRUE))
abline(h = mean(mtcars$mpg), lty = 2)
abline(h = median(mtcars$mpg), lty = 3, col = "blue")


# Histogram and density plot of the target distribution

# Histogram with the default binning.
hist(mtcars$mpg)

# Histogram with breaks = 10, plus mean (dashed) and median (dotted blue)
# reference lines.
hist(mtcars$mpg, breaks = 10)
abline(v = mean(mtcars$mpg), lty = 2)
abline(v = median(mtcars$mpg), lty = 3, col = "blue")

# Density-scaled histogram so the kernel density estimate shares its y-axis.
# The estimate is computed once and reused for both the outline and the fill.
# `probability = TRUE` is written in full to avoid partial argument matching
# and the reassignable `T`.
mpg_density <- density(mtcars$mpg)
hist(mtcars$mpg, breaks = 10, probability = TRUE)
lines(mpg_density, lwd = 2)
# Semi-transparent dark-blue fill under the density curve.
polygon(mpg_density, col = rgb(0, 0, 0.5, 0.5))
abline(v = mean(mtcars$mpg), lty = 2)
abline(v = median(mtcars$mpg), lty = 3, col = "blue")


# Check the correlation coefficient of every other column with the target.
# Column 1 of mtcars is the target (mpg); the remaining columns are the
# candidate predictors.
target_mat <- as.matrix(mtcars[, 1])
predictor_mat <- as.matrix(mtcars[, -1])
barplot(cor(target_mat, predictor_mat))

# drat = Rear axle ratio
# [ A discussion of R's mtcars dataset variables ]



# [2] Scatterplot (Y ~ X)

# mpg against weight, with the least-squares line overlaid (red, dashed).
plot(mtcars$wt, mtcars$mpg)
abline(lm(mpg ~ wt, data = mtcars), col = "red", lty = 2)

# Pearson correlation and its significance test.
cor(mtcars$wt, mtcars$mpg)
cor.test(mtcars$wt, mtcars$mpg)

# Add locally-weighted polynomial regression smoothing (blue, dotted).
lines(lowess(mtcars$wt, mtcars$mpg), col = "blue", lty = 3)
# ==> the relationship is not linear ==> try transforming the distribution (??)



# [3] Using scatterplots with two predictors (Y ~ X1 + X2)

# Relationship between the two X variables.
plot(mtcars$wt, mtcars$drat)

# Relationship of both X variables with the target: points whose mpg is above
# the 70th percentile are drawn red, all others blue.
mpg_cutoff <- quantile(mtcars$mpg, .7)
mpg_flag_col <- ifelse(mtcars$mpg > mpg_cutoff, "red", "blue")
plot(mtcars$wt, mtcars$drat, col = mpg_flag_col, main = "mpg by wt and drat")

# A little decoration: filled points, both axes starting at 0, explicit
# labels, a 5-cell grid and a lowess trend of drat on weight.
with(
  mtcars,
  plot(
    wt, drat,
    col = mpg_flag_col,
    main = "mpg by weight and drat",
    pch = 19, cex = 1.2,
    xlim = c(0, max(wt)),
    ylim = c(0, max(drat)),
    xlab = "Weight", ylab = "Rear axle ratio(drat)"
  )
)
grid(5)
lines(lowess(mtcars$wt, mtcars$drat), lty = 3, lwd = 2)



# continuous coloring


# User-defined function: min-max rescaling of a score onto the 0~1 range.
#
# Arguments:
#   x      numeric vector to rescale.
#   na.rm  if TRUE, NAs are ignored when finding the minimum and maximum and
#          stay NA in the result; with the default FALSE any NA in `x` makes
#          the whole result NA.
#
# Returns a numeric vector the same length as `x`, computed as
# (x - min) / (max - min).
#
# Edge cases:
#   * empty or all-NA input is returned as-is (as numeric), with no min/max
#     warnings;
#   * a constant vector (max == min) maps to 0 instead of 0/0 = NaN, so the
#     result can still be passed to rgb().
stnd <- function(x, na.rm = FALSE) {
  if (length(x) == 0L || all(is.na(x))) {
    return(as.numeric(x))
  }

  rng <- range(x, na.rm = na.rm)
  span <- rng[2L] - rng[1L]

  # span is NA only when na.rm = FALSE and x contains NA; that case falls
  # through to the formula below and yields all NA.
  if (!is.na(span) && span == 0) {
    return(ifelse(is.na(x), NA_real_, 0))
  }

  (x - rng[1L]) / span
}


# Weight vs. rear axle ratio with a continuous colour scale for mpg: the red
# channel is the rescaled mpg and the blue channel is its complement.
mpg_scaled <- stnd(mtcars$mpg)

# Draws the decorated wt/drat scatterplot with the given point colours, then
# adds a 5-cell grid and the lowess trend of drat on weight.
draw_wt_drat <- function(point_col) {
  plot(
    mtcars$wt, mtcars$drat,
    col = point_col,
    main = "mpg by weight and drat",
    pch = 19, cex = 1.2,
    xlim = c(0, max(mtcars$wt)),
    ylim = c(0, max(mtcars$drat)),
    xlab = "Weight", ylab = "Rear axle ratio(drat)"
  )
  grid(5)
  lines(lowess(mtcars$wt, mtcars$drat), lty = 3, lwd = 2)
}

# Opaque colours.
draw_wt_drat(rgb(mpg_scaled, 0, 1 - mpg_scaled))

# Give transparency (alpha = 0.5).
draw_wt_drat(rgb(mpg_scaled, 0, 1 - mpg_scaled, 0.5))


# Show car names on the current plot.

# Alternatives kept for reference: label every car.
# text(mtcars$wt, mtcars$drat, labels=row.names(mtcars))
# text(mtcars$wt, mtcars$drat, labels=row.names(mtcars),
#     cex=0.5, pos=2)

# Helper view for choosing the cut-offs used below.
# plot(sort(mtcars$mpg))
# abline(h=c(12, 30), lty=3)

# Label only the extreme cars (mpg above 30 or below 12); every other point
# gets an empty label. pos = 2 places the text to the left of the point.
is_extreme <- mtcars$mpg > 30 | mtcars$mpg < 12
car_labels <- ifelse(is_extreme, row.names(mtcars), "")
text(mtcars$wt, mtcars$drat, labels = car_labels, cex = 0.5, pos = 2)


#--------- end of script -------------