# RE::VISION CRM

# R 데이터 분석 (R data analysis)

# [kbdaa_bda] 빅데이터고객분석 _ 군집 (big-data customer analysis: clustering)

# YONG_X 2017. 9. 21. 12:24

# ---- Cluster analysis ------------------

# Load the xAPI-Edu-Data CSV exactly once.
# The file used to be read twice: first relative to the working directory and
# then again from a machine-specific absolute path, which made the script fail
# on any machine where that path does not exist. Prefer the copy in the
# working directory and fall back to the absolute location only if needed.
data_file <- "xAPI-Edu-Data.csv"
if (!file.exists(data_file)) {
  data_file <- "C:/Users/dataanalysis/Desktop/DA/scrpt_data/xAPI-Edu-Data.csv"
}
if (!file.exists(data_file)) {
  stop(
    "Cannot find xAPI-Edu-Data.csv; place it in the working directory.",
    call. = FALSE
  )
}

mycsv <- read.csv(data_file, header = TRUE)

# Keep the raw import in `mycsv`; the rest of the script works on `s1`.
s1 <- mycsv

# Quick look at the first three rows and at the column names.
head(s1, 3)
names(s1)

# Optional: listwise deletion of rows with missing values.
# s1 <- na.omit(s1)


# Apply k-means clustering to the four columns used as clustering features.
s2 <- s1[, c("raisedhands", "VisITedResources", "AnnouncementsView",
             "Discussion")]

# k-means starts from randomly chosen centers, so without a fixed seed the
# assignments (and the cluster numbering used by every later plot) change from
# run to run. Fix the seed and keep the best of several random starts so the
# result is reproducible and less likely to be a poor local optimum.
n_clusters <- 3
set.seed(2017)
fit <- kmeans(s2, centers = n_clusters, nstart = 25)

# Number of students assigned to each cluster.
table(fit$cluster)

# Scatter plots of variable pairs, colored by cluster membership.
# The x/y argument expressions are kept as-is because plot() uses them as the
# default axis labels.
plot(s2$raisedhands, s2$VisITedResources, col = fit$cluster, pch = 19)

plot(s2$AnnouncementsView, s2$Discussion, col = fit$cluster, pch = 19)

# Check the pairwise correlations between the clustering variables.
cor(s2)

plot(s2$VisITedResources, s2$Discussion, col = fit$cluster, pch = 19)

# Check the proportion of Failed (Class == "L") within each cluster:
# students with Class "L" get plotting symbol 19, all others symbol 22.
plot(s2$VisITedResources, s2$Discussion, col = fit$cluster,
     pch = ifelse(s1$Class == "L", 19, 22))


# Profile the clusters: per-cluster mean of every clustering variable.
# aggregate() returns the grouping key as its first column (Group.1), so the
# value columns are selected by name instead of by hard-coded position.
sp1 <- aggregate(s2, by = list(fit$cluster), FUN = mean)
sp2 <- sp1[, names(s2)]

# Bars grouped by variable, one bar per cluster.
barplot(as.matrix(sp2), beside = TRUE)

# Transposed view: bars grouped by cluster, one bar per variable.
barplot(t(as.matrix(sp2)), beside = TRUE)


# Use the median instead of the mean for profiling.
sp1 <- aggregate(s2, by = list(fit$cluster), FUN = median)
sp2 <- sp1[, names(s2)]

# Build the labels from the actual cluster ids. The previous hard-coded vector
# had an inconsistent "Cluster3" label and would fail for any cluster count
# other than three.
rownames(sp2) <- paste("Cluster", sp1$Group.1)

# `legend.text` is spelled out rather than relying on partial matching.
barplot(as.matrix(sp2), beside = TRUE, legend.text = rownames(sp2))

sp21 <- t(as.matrix(sp2))
barplot(sp21, beside = TRUE, legend.text = rownames(sp21))


# Refine the legend: explicit fill colors, one per row of sp21 (one per
# variable), with the legend drawn separately and without a box.
# colors <- c("black", "darkgrey", "grey", "lightgrey")
colors <- c(
  rgb(0.5, 0.5, 0.1),
  rgb(0.5, 0.5, 0.35),
  rgb(0.5, 0.5, 0.5),
  rgb(0.5, 0.5, 0.85)
)

barplot(sp21, beside = TRUE, col = colors)
legend("topright", rownames(sp21), fill = colors, bty = "n")



library(ggplot2)

# Store each student's k-means cluster id on the full data set so the other
# columns can be compared across clusters.
s1[["cluster"]] <- fit$cluster

# Student counts per cluster, split by gender (side-by-side bars).
ggplot(data = s1, mapping = aes(x = cluster)) +
  geom_bar(mapping = aes(fill = gender), position = "dodge")

# Student counts per cluster, split by nationality.
ggplot(data = s1, mapping = aes(x = cluster)) +
  geom_bar(mapping = aes(fill = NationalITy), position = "dodge")

# Student counts per cluster, split by course topic.
ggplot(data = s1, mapping = aes(x = cluster)) +
  geom_bar(mapping = aes(fill = Topic), position = "dodge")

# Distribution of raisedhands within each cluster; the cluster id is converted
# to character so it is treated as a discrete x axis.
ggplot(
  data = s1,
  mapping = aes(x = as.character(cluster), y = raisedhands)
) +
  geom_point() +
  geom_boxplot() +
  labs(
    title = "Raisehands distribution by cluster",
    x = "student cluster"
  )



# Hierarchical clustering: build a tree bottom-up by merging the most similar
# observations step by step.
# The dist() call is kept inline on purpose so the default dendrogram
# annotation stays the same.
clusters <- hclust(dist(s2))

# Alternative linkage:
# clusters <- hclust(dist(s2), method="average")

# Draw the dendrogram.
plot(clusters)

# Mark five clusters on the dendrogram and cut the tree at that same level.
n_groups <- 5
rect.hclust(clusters, k = n_groups)
clusterCut <- cutree(clusters, k = n_groups)

# Number of students in each hierarchical cluster.
group_sizes <- table(clusterCut)
barplot(group_sizes, main = "Students by Cluster")