-
Notifications
You must be signed in to change notification settings - Fork 194
/
TeamEvaluate2015.R
190 lines (158 loc) · 5.68 KB
/
TeamEvaluate2015.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
# Load libraries and read files
packages <- c("dplyr", "fpc", "cluster",
"factoextra", "dendextend",
"psych", "qgraph")
lapply(packages, library, character.only = TRUE)
raw_df <- read.csv("./Team2015season.csv", header=T)
# scale data
scaled_data <- raw_df %>%
remove_rownames() %>%
column_to_rownames("Team") %>%
scale()
#######################################
# Hierarchical Cluster Analysis
# Useful tutorial:
# https://uc-r.github.io/hc_clustering
#######################################
#Eucledian, Ward's method
d_1 <- dist(scaled_data, method="euclidean")
clust_1 <- hclust(d_1, method="ward.D")
#draw the dendrogram
plot(clust_1,
cex=0.7,
xlab="",
ylab="Distance",
main="Clusterings of 60 European teams")
rect.hclust(clust_1, k = 4, border = 2:5)
#get membership vector
cuts <- cutree(clust_1,k=4)
scaled_data %>%
as.data.frame() %>%
mutate(cluster = cuts) %>%
head
# Compute distance matrix
res.dist <- dist(scaled_data, method = "euclidean")
# Compute 2 hierarchical clusterings
hc1 <- hclust(res.dist, method = "complete")
hc2 <- hclust(res.dist, method = "ward.D2")
# Create two dendrograms and compare group partition
dend1 <- as.dendrogram (hc1)
dend2 <- as.dendrogram (hc2)
dend_list <- dendlist(dend1, dend2)
tanglegram(dend1, dend2,
lwd = 1,
edge.lwd = 1,
lab.cex = 0.5,
columns_width = c(8, 3, 8),
highlight_distinct_edges = FALSE, # Turn-off dashed lines
common_subtrees_color_lines = FALSE, # Turn-off line colors
common_subtrees_color_branches = TRUE, # Color common branches
main = paste("entanglement =", round(entanglement(dend_list), 2))
)
###########################################
# K-means clustering
# Useful tutorial:
# https://uc-r.github.io/kmeans_clustering
###########################################
# use 4 centers that Hc clustering suggests
# nstart: attempts multiple initial configurations
# and reports on the best one.
km_results <- kmeans(scaled_data, centers = 4, nstart = 100)
km_results
# fviz_cluster does PCA and plot the data points
# according to the first two PCs that explain the majority of the variance
fviz_cluster(km_results, data = scaled_data)
# Evaluating clustering
# Best number of cluster using scree-plot (elbow method)
# optimal total-wihtin cluster sum of square
set.seed(123)
fviz_nbclust(scaled_data, kmeans, method = "wss")
# Average Silhouette method
# measuring the quality of the clusters
# by how well object lies within a cluster
# try to maximize average silhouette
fviz_nbclust(scaled_data, kmeans, method = "silhouette")
# GAP statistics method
# can apply to both kmeans and HC
# compares the total intracluster variation
# with their expected values
# under null reference distribution of the data
# at various value of k
set.seed(123)
gap_stat <- clusGap(scaled_data,
FUN = kmeans,
nstart = 100,
K.max = 10,
B = 50)
# Print the result
print(gap_stat, method = "firstmax")
fviz_gap_stat(gap_stat)
###################################################################
# Factor analysis
# Useful tutorial:
# http://www.di.fc.ul.pt/~jpn/r/factoranalysis/factoranalysis.html
# https://rpubs.com/aaronsc32/factor-analysis-introduction
###################################################################
# determined the number of factors to use with scree plot
parallel <- fa.parallel(scaled_data,
fm = 'minres',
fa = 'fa')
# factor analysis -- no rotation
# Varimax: assume factors completely uncorrelated
# Oblique: correlations in factors
# Method: factanal only support MaxLikelihood
# In fa (psych), we can use "PAF (pa)" or "mingres",
# the later provide results similar to `MaxLikelihood`
# without assuming multivariate normal distribution
# and derives solutions through iterative eigen decomposition like principal axis.
fa1 <- factanal(scaled_data,
factors=2,
rotation="none",
scores="regression")
fa2 <- fa(scaled_data,
nfactors = 3,
rotate = "oblimin",
fm="minres")
fa1
# biplot
biplot(fa1$scores[,1:2],
loadings(fa1),
cex=c(0.7,0.8))
# qgraph
# a different visualization of biplot
qg.fa1 <- qgraph(fa1)
# NOTE:
# - after Exploratory Factor Analysis (EFA),
# - the next step could be Confirmatory Factor Analysis
# - which is part of a larger subset: Structual Equation Modelling
# - https://socialsciences.mcmaster.ca/jfox/Misc/sem/SEM-paper.pdf
# we can get some flexibility from the "psych" package
fa_analysis <- function(data_set, factor,
rotate = "varimax", fm = "pa"){
res <- fa(data_set, nfactors = factor,
rotate = rotate, fm = fm)
print("Factor Analysis results:")
print(res)
# get loading plot for the first two factors
plot(res$loadings, pch=18, col='red')
abline(h=0)
abline(v=0)
text(res$loadings, labels=names(data_set),cex=0.8)
#get reproduced correlation matrix
repro <- res$loadings%*%t(res$loadings)
#residual correlation matrix
residual <- cor(data_set)-repro
print("Residual correlation matrx")
round(resid2,2)
#get root-mean squared residuals
len <- length(residual[upper.tri(residual)])
RMSR <- sqrt(sum(residual[upper.tri(residual)]^2)/len)
print("Root-mean squared residuals:", RMSR)
#get proportion of residuals greater than 0.05 in absolute value
prop <- sum(rep(1,len)[abs(residual[upper.tri(residual)])>0.05])/len
print("Proportion of residuals greater than 0.05 in absolute value:", prop)
}
# varimax - paf
fa_analysis(soccer, 3)
# quartimax - pag
fa_analysis(soccer, 3, "quartimax", "pa")