# ---- Cluster analysis: load data ----
# The data file is looked up in two places. The absolute path is listed first
# because the original script read it last, so it took precedence whenever it
# existed; the copy in the working directory is the portable fallback.
csv_candidates <- c(
  "C:/Users/dataanalysis/Desktop/DA/scrpt_data/xAPI-Edu-Data.csv",
  "xAPI-Edu-Data.csv"
)
csv_path <- csv_candidates[file.exists(csv_candidates)][1]
if (is.na(csv_path)) {
  # Fail with one clear message instead of a read.csv() connection error.
  stop(
    "xAPI-Edu-Data.csv not found; looked in: ",
    paste(csv_candidates, collapse = ", "),
    call. = FALSE
  )
}
mycsv <- read.csv(csv_path, header = TRUE)
# Work on a copy so the raw import stays available unchanged.
s1 <- mycsv
head(s1, 3)
names(s1)
# s2 <- na.omit(s2) # listwise deletion of missing
# Apply k-means clustering to the four activity-count columns.
s2 <- s1[, c(
  "raisedhands", "VisITedResources", "AnnouncementsView", "Discussion"
)]
# kmeans() chooses its initial centres at random. Fixing the seed and trying
# several random starts keeps the cluster numbering stable between runs, which
# the per-cluster plots and profiles further down rely on.
set.seed(1234)
fit <- kmeans(s2, centers = 3, nstart = 25)
table(fit$cluster)
# Scatter plots of variable pairs, coloured by assigned cluster.
plot(s2$raisedhands, s2$VisITedResources, col = fit$cluster, pch = 19)
plot(s2$AnnouncementsView, s2$Discussion, col = fit$cluster, pch = 19)
cor(s2) # check the correlations between the variables
plot(s2$VisITedResources, s2$Discussion, col = fit$cluster, pch = 19)
# Check the proportion of Failed (Class == "L"): those rows are drawn with
# plotting symbol 19, all other rows with symbol 22.
plot(
  s2$VisITedResources, s2$Discussion,
  col = fit$cluster,
  pch = ifelse(s1$Class == "L", 19, 22)
)
# Profile each cluster by the mean of every clustering variable.
# aggregate() puts the grouping key in the first column (Group.1), so the
# value columns are selected by name rather than by position.
sp1 <- aggregate(s2, by = list(fit$cluster), FUN = mean)
sp2 <- sp1[, names(s2)]
barplot(as.matrix(sp2), beside = TRUE)
barplot(t(as.matrix(sp2)), beside = TRUE)
# Use the median instead of the mean for profiling.
sp1 <- aggregate(s2, by = list(fit$cluster), FUN = median)
sp2 <- sp1[, names(s2)]
# Build the row labels from the cluster ids actually present, so the labels
# stay correct and consistently formatted if the number of clusters changes.
rownames(sp2) <- paste("Cluster", sp1$Group.1)
barplot(as.matrix(sp2), beside = TRUE, legend.text = rownames(sp2))
# Transposed view: one group of bars per cluster, one bar per variable.
sp21 <- t(as.matrix(sp2))
barplot(sp21, beside = TRUE, legend.text = rownames(sp21))
# To refine the legend: draw it separately with explicit fill colours.
# colors <- c("black", "darkgrey", "grey", "lightgrey")
colors <- c(
  rgb(0.5, 0.5, 0.1), rgb(0.5, 0.5, 0.35),
  rgb(0.5, 0.5, 0.5), rgb(0.5, 0.5, 0.85)
)
barplot(sp21, beside = TRUE, col = colors)
legend("topright", rownames(sp21), fill = colors, bty = "n")
library(ggplot2)
# Attach the k-means assignment to the full data set so the categorical
# columns can be broken down by cluster.
s1$cluster <- fit$cluster
# The cluster id is a label, not a quantity: map it as a factor so the x axis
# is discrete, and keep the axis title as "cluster".
ggplot(s1, aes(factor(cluster))) +
  geom_bar(aes(fill = gender), position = "dodge") +
  labs(x = "cluster")
ggplot(s1, aes(factor(cluster))) +
  geom_bar(aes(fill = NationalITy), position = "dodge") +
  labs(x = "cluster")
ggplot(s1, aes(factor(cluster))) +
  geom_bar(aes(fill = Topic), position = "dodge") +
  labs(x = "cluster")
# Draw the boxes first and the points on top; in the opposite order the
# boxes cover the points that fall inside them.
ggplot(s1, aes(factor(cluster), raisedhands)) +
  geom_boxplot() +
  geom_point() +
  labs(
    title = "Raised hands distribution by cluster",
    x = "student cluster"
  )
# Hierarchical clustering: bottom-up, merging similar observations step by
# step to build a tree structure.
# Number of groups to cut the tree into; used for both the dendrogram
# rectangles and the cluster assignment so the two cannot drift apart.
n_clusters <- 5
clusters <- hclust(dist(s2))
# clusters <- hclust(dist(s2), method = "average")
plot(clusters)
rect.hclust(clusters, k = n_clusters) # mark the n_clusters groups on the tree
clusterCut <- cutree(clusters, k = n_clusters)
barplot(table(clusterCut), main = "Students by Cluster")
# 'R 데이터 분석' 카테고리의 다른 글
# [kbdaa_bda] 데이터 처리 연습 GDA (0) | 2017.09.21 |
# ---|---|
# [kbdaa_bda] 고객빅데이터분석 _은행모델 (0) | 2017.09.21 |
# [kbdaa_bda] 빅데이터고객분석 GDA (0) | 2017.09.20 |
# [kbdaa_bda] 빅데이터고객분석 (0) | 2017.09.09 |
# subway]빅데이터분석 (0) | 2017.06.22 |