# Practicing EDA using Base R Plots
# Exploratory Data Analysis
#
# 리비젼컨설팅. 전용준. 2018
# Quick look at the data: print the first six rows, then the dimensions
# as (number of rows, number of columns).
head(mtcars, n = 6L)
c(nrow(mtcars), ncol(mtcars))
# (Leftover from the original blog post: clicking the image there opened the YouTube video.)
# [1] Check the distribution of the target variable (mpg).
# Every plot below overlays the mean (black dashed line) and the median
# (blue dotted line). If the two lines do not coincide, the distribution is
# not symmetric (= skewed).

# Values in dataset (row) order.
plot(mtcars$mpg, ylab = "mpg")
abline(h = mean(mtcars$mpg), lty = 2)
abline(h = median(mtcars$mpg), lty = 3, col = "blue")

# Values sorted in ascending order.
plot(sort(mtcars$mpg), ylab = "mpg (ascending)")
abline(h = mean(mtcars$mpg), lty = 2)
abline(h = median(mtcars$mpg), lty = 3, col = "blue")

# Values sorted in descending order. `TRUE` is spelled out because `T` is an
# ordinary variable that can be reassigned; the explicit ylab keeps the axis
# label short instead of showing the deparsed sort() call.
plot(sort(mtcars$mpg, decreasing = TRUE), ylab = "mpg (descending)")
abline(h = mean(mtcars$mpg), lty = 2)
abline(h = median(mtcars$mpg), lty = 3, col = "blue")
# Histogram and density plot of the target distribution (mpg).

# Histogram with the default binning.
hist(mtcars$mpg)

# Histogram with roughly 10 bins requested (`breaks` given as a single number
# is only a suggestion), plus the mean (black dashed) and median (blue dotted)
# as vertical reference lines.
hist(mtcars$mpg, breaks = 10)
abline(v = mean(mtcars$mpg), lty = 2)
abline(v = median(mtcars$mpg), lty = 3, col = "blue")

# Histogram drawn on the density scale so a kernel density estimate can be
# overlaid on the same axis. `freq = FALSE` replaces the original `prob=T`,
# which relied on partial matching of `probability` and on the reassignable
# shortcut `T`.
hist(mtcars$mpg, breaks = 10, freq = FALSE)
lines(density(mtcars$mpg), lwd = 2)
# Fill the area under the density curve with a semi-transparent dark blue.
polygon(density(mtcars$mpg), col = rgb(0, 0, 0.5, 0.5))
abline(v = mean(mtcars$mpg), lty = 2)
abline(v = median(mtcars$mpg), lty = 3, col = "blue")
# Check the correlation coefficients with the target: correlate the first
# column of mtcars (the target) with every remaining column and draw one bar
# per column.
barplot(cor(mtcars[[1]], as.matrix(mtcars[-1])))
# Note: drat = Rear axle ratio
# [ A discussion of R's mtcars dataset variables ]
# [2] Scatterplot of the target against a single predictor (Y ~ X): mpg vs. wt.
# The axis labels are passed explicitly so they read exactly as they would
# when plotting the two `mtcars$...` vectors directly.
plot(mpg ~ wt, data = mtcars, xlab = "mtcars$wt", ylab = "mtcars$mpg")
# Least-squares regression line of mpg on wt (red, dashed).
abline(reg = lm(mpg ~ wt, data = mtcars), lty = 2, col = "red")
# Correlation between wt and mpg, then its significance test. The arguments
# of cor.test() are kept as `mtcars$...` expressions because they are echoed
# in the "data:" line of the printed test result.
with(mtcars, cor(wt, mpg))
cor.test(x = mtcars$wt, y = mtcars$mpg)
# Add a locally-weighted polynomial regression (lowess) smoother (blue, dotted).
lines(lowess(x = mtcars$wt, y = mtcars$mpg), lty = 3, col = "blue")
# ==> the relationship is not linear ==> try transforming the distribution (??)
# [3] Using a scatterplot with two predictors (Y ~ X1 + X2).
# First check how the two X variables (wt and drat) relate to each other.
plot(x = mtcars$wt, y = mtcars$drat)

# Then check how both X values relate to the target: points whose mpg is
# above the 70% quantile of mpg are drawn in red, all others in blue.
plot(
  x = mtcars$wt, y = mtcars$drat,
  main = "mpg by wt and drat",
  col = ifelse(mtcars$mpg <= quantile(mtcars$mpg, probs = 0.7), "blue", "red")
)

# Same plot with a little decoration: readable axis labels, both axes
# starting at 0, larger filled points, a grid with 5 cells in each direction
# and a lowess smoother of drat on wt (dotted, double width).
plot(
  x = mtcars$wt, y = mtcars$drat,
  xlab = "Weight", ylab = "Rear axle ratio(drat)",
  xlim = c(0, max(mtcars$wt)),
  ylim = c(0, max(mtcars$drat)),
  main = "mpg by weight and drat",
  cex = 1.2, pch = 19,
  col = ifelse(mtcars$mpg <= quantile(mtcars$mpg, probs = 0.7), "blue", "red")
)
grid(nx = 5)
lines(lowess(x = mtcars$wt, y = mtcars$drat), lwd = 2, lty = 3)
# continuous coloring
# User-defined function that min-max rescales a score: the values of `x` are
# mapped linearly so that the smallest becomes 0 and the largest becomes 1.
#
# Args:
#   x:     Numeric vector to rescale.
#   na.rm: If TRUE, missing values are ignored when finding the minimum and
#          maximum and stay NA in the result. The default FALSE keeps the
#          historical behaviour: any NA in `x` makes the whole result NA.
#
# Returns:
#   A numeric vector of the same length as `x`. Empty input returns
#   numeric(0); input that is entirely NA returns all NA; input whose values
#   are all equal returns zeros (instead of the NaN that 0/0 would produce).
stnd <- function(x, na.rm = FALSE) {
  # Nothing to rescale; also avoids the warnings min()/max() emit on empty input.
  if (length(x) == 0L) {
    return(numeric(0))
  }
  # No finite reference points exist, whatever `na.rm` says.
  if (all(is.na(x))) {
    return(rep(NA_real_, length(x)))
  }
  lo <- min(x, na.rm = na.rm)
  span <- max(x, na.rm = na.rm) - lo
  # A zero range would divide 0 by 0; dividing by 1 maps every value to 0.
  # isTRUE() keeps an NA/NaN span (NA or infinite input) on the normal path.
  if (isTRUE(span == 0)) {
    span <- 1
  }
  (x - lo) / span
}
# wt vs. drat with a continuous colour scale for the target: the red channel
# is the stnd()-rescaled mpg and the blue channel is its complement, so each
# point's colour moves from blue towards red as mpg increases.
plot(
  x = mtcars$wt, y = mtcars$drat,
  xlab = "Weight", ylab = "Rear axle ratio(drat)",
  xlim = c(0, max(mtcars$wt)),
  ylim = c(0, max(mtcars$drat)),
  main = "mpg by weight and drat",
  cex = 1.2, pch = 19,
  col = rgb(red = stnd(mtcars$mpg), green = 0, blue = 1 - stnd(mtcars$mpg))
)
# Grid with 5 cells in each direction and a lowess smoother of drat on wt.
grid(nx = 5)
lines(lowess(x = mtcars$wt, y = mtcars$drat), lwd = 2, lty = 3)
# Give the points transparency: same continuous blue-to-red colouring by
# rescaled mpg, now with an alpha of 0.5 so overlapping points stay visible.
plot(
  x = mtcars$wt, y = mtcars$drat,
  xlab = "Weight", ylab = "Rear axle ratio(drat)",
  xlim = c(0, max(mtcars$wt)),
  ylim = c(0, max(mtcars$drat)),
  main = "mpg by weight and drat",
  cex = 1.2, pch = 19,
  col = rgb(
    red = stnd(mtcars$mpg), green = 0, blue = 1 - stnd(mtcars$mpg),
    alpha = 0.5
  )
)
# Grid with 5 cells in each direction and a lowess smoother of drat on wt.
grid(nx = 5)
lines(lowess(x = mtcars$wt, y = mtcars$drat), lwd = 2, lty = 3)

# Show car names.
# Disabled alternatives kept for reference: label every point (full size, or
# small and to the left), and a sorted-mpg plot with reference lines at the
# 12 and 30 cut-offs used below.
# text(mtcars$wt, mtcars$drat, labels=row.names(mtcars))
# text(mtcars$wt, mtcars$drat, labels=row.names(mtcars),
# cex=0.5, pos=2)
# plot(sort(mtcars$mpg))
# abline(h=c(12, 30), lty=3)

# Label only the extreme cars (mpg above 30 or below 12): every car whose mpg
# lies within [12, 30] gets an empty label. Labels are small and placed to
# the left of the point (pos = 2).
text(
  x = mtcars$wt, y = mtcars$drat,
  labels = replace(
    row.names(mtcars),
    mtcars$mpg >= 12 & mtcars$mpg <= 30,
    ""
  ),
  pos = 2, cex = 0.5
)
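# ---------------------------------------------------------------------------
# Blog navigation left over from the original post (not R code):
# other posts in the 'R 데이터 분석' (R Data Analysis) category.
# - [R분석] 프로야구 KBO 타자 성적과 나이의 관계 (2018.09.28)
# - [R분석] KBO 프로야구 가을야구 stat 예상 분석 (2018.09.20)
# - [R분석] Anomaly Detection과 EDA 결합 (IsolationForest) 활용 (2018.09.05)
# - [IsolationForest] Anomaly Detection (2018.08.30)
# - [DSM1809] statistical data analysis using R (2018.08.24)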