Skip to content
Snippets Groups Projects
Commit 4f05e6d9 authored by Jeffrey Post's avatar Jeffrey Post
Browse files

Update with Aziza's original code and data

parent 57ea9f3a
No related branches found
No related tags found
No related merge requests found
Pipeline #88422 passed with stage
in 17 seconds
#================================================================================================================================
# MIT LICENCE
# Copyright (c) 2019, Aziza Merzouki
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files
# (the "Software"), to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
# and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
# OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
#================================================================================================================================
library(clValid)
library(factoextra)
library(FactoMineR)
library(dendsort)
source("./plotIncidenceMap.R")
#####################################################
# Perform hierarchical clustering based on a distance object
hierarchicalClustering = function(d,clusteringMethod){
hc = hclust(d,method = clusteringMethod)
return(hc)
}
#####################################################
# Function computeEuclidianDistance computes the Euclidian Distance between the Data Points
# Each Line is a Data Point, ex. a Vectorized Matrix
computeEuclidianDistance = function(M){
n = dim(M)[1]
euclidDist = array(0,dim=c(n,n))
for (i in c(1:(n-1))){
for (j in c((i+1):n)){
diff = M[i,]-M[j,]
dist = sqrt(sum(diff^2))
euclidDist[i,j] = dist
euclidDist[j,i] = dist
}
}
rownames(euclidDist) = rownames(M)
colnames(euclidDist) = rownames(M)
return(euclidDist)
}
#####################################################
# Draw Map where Countries are coloured based on the Cluster they belong to.
# Inputs:
# 1) Clusters (element name = Names of Country + element value = Cluster),
# 2) Map Region,
# 3) Map Title,
# 4) Name of file where the map is saved
drawMapOfClusters = function (clusters,region,title,fileName){
pdf(fileName)
# quartz()
# Get map from rworldmap
sPDF = getMap(resolution="high")
mapData = data.frame(country = names(clusters), cluster=clusters)
#print(mapData)
spdf = joinCountryData2Map(mapData,joinCode='NAME',nameJoinColumn="country")
myPalette= "heat" #"rainbow"#
mapCountryData(spdf,nameColumnToPlot="cluster",catMethod="categorical", mapRegion="africa",mapTitle=title,colourPalette=myPalette)
# Add Country Labels on the Map
labelCountries(spdf,nameCountryColum="country",cex=0.4,col="black")
dev.off()
}
#####################################################
# Draw Box plots per cluster for a given attribute (given attribute File).
drawAttributeBoxplotsPerCluster = function(attributeFile,attributeName,countries,year,clusters,title,fileName){
nClusters = max(clusters)
print("drawAttributeBoxplotsPerCluster()")
pdf(fileName)
allCountryAttributes = getCountryAttribute(attributeFile, countries, year)
allCountryAttributes = allCountryAttributes[paste("X",year,sep="")] # Ignore "Country" Columns
print("BEFORE FIRST POINT PLOT")
plot(1,1,col="white",xlab="Clusters",ylab=paste("Country",attributeName,year),main=title,xlim=c(1,nClusters),ylim=c(min(allCountryAttributes),max(allCountryAttributes)))
print("BEFORE FIRST POINT PLOT")
for(i in c(1:nClusters)){
clusterCountryNames = names(clusters[clusters == i])
clusterAttributes = getCountryAttribute(attributeFile, clusterCountryNames, year)
print(paste(attributeName,"of Cluster",i))
print(clusterAttributes[c("Country",paste("X",year,sep=""))])
print(paste(attributeName,"summary of Cluster",i))
print(summary(clusterAttributes))
boxplot(clusterAttributes[paste("X",year,sep="")],add=TRUE,at=i)
}
dev.off()
}
#####################################################
# Draw Density Plots per cluster for all variables in matrix M.
draw_density_plots = function(M, clusters,mainDir,cutRefValue=""){
extension = '.pdf'
# extension = '.png'
i = 1
# for (f in rownames(M)){
for (f in colnames(M)){
fileName = paste(mainDir,'/density_var_',i,'_',cutRefValue,'Clusters',extension,sep='')
pdf(fileName)
# png(fileName)
dat_f = c()
dat_c = c()
for (c in unique(clusters)){
# Density
dat_f = c(dat_f, M[clusters==c,f])
dat_c = c(dat_c, rep(toString(c),length(M[clusters==c,f])))
}
dat = data.frame(feature=dat_f,cluster=dat_c)
g = ggplot(dat, aes(x = feature, fill = cluster)) +
# geom_histogram(aes(y=..density..),alpha=0.5,position="identity")+
geom_density(alpha = 0.5) +
labs(x = f) +
# scale_fill_manual(values=c("yellow", "orange", "red")) +
theme(axis.text=element_text(size=14),
axis.title=element_text(size=18),
legend.text=element_text(size=14))
plot(g)
dev.off()
i = i+1
}
}
#####################################################
# Dimensionality Reduction of Data
# ex. the DAGs with N variables are represented by (N*N-N)/2 values => Vectorized DAGs.
# For N = 12, dim = 66. Use PCA to reduce it to 2-3 dimensions.
# reduceDimensions = function(allDAGs,clusters=NULL){
reduceDimensions = function(vectDAGs,names,clusters=NULL,filename){
# Transpose matrix to get "rows=countries" and "columns=variables"
print("Data Matrix BEFORE PCA")
print(vectDAGs)
print(clusters)
# PCA
res.pca = prcomp(vectDAGs)
# Print Rotation Matrix = Eigenvectors (the norm of each column = 1)
print("res.pca = ")
print(res.pca$rotation[,1:2])
# Print Eigenvalues (Explained Variance = Eigenvalue/sum of eigenvalues)
print(get_eigenvalue(res.pca))
txtFilename = gsub('pdf','csv',filename)
# write.table(res.pca,txtFilename)
# Bar Chart of Explained Variance per PC
explainedVar_filename = gsub('.pdf','_explainedVar.pdf',filename)
pdf(explainedVar_filename)
plot(fviz_eig(res.pca,addlabels = TRUE,linecolor = "#FC4E07",barcolor = "#00AFBB", barfill = "#00AFBB"))
dev.off()
print("Clusters = ")
print(clusters)
# Plot data point projections on PCA 2D-space
pdf(filename)
if (is.null(clusters)){
plot(fviz_pca_ind(res.pca, repel=TRUE))
} else{
# plot(fviz_pca_ind(res.pca,col.ind=as.factor(clusters),legend.title = "Groups", addEllipses=FALSE, ellipse.type="confidence", repel=TRUE, palette =c("#FFDF00", "orange", "red")))
plot(fviz_pca_ind(res.pca,col.ind=as.factor(clusters),legend.title = "Groups", addEllipses=TRUE, ellipse.type="confidence", repel=TRUE))
}
dev.off()
# Checking PCs
# see ref: http://www.sthda.com/english/articles/31-principal-component-methods-in-r-practical-guide/112-pca-principal-component-analysis-essentials/
var = get_pca_var(res.pca)
print("eigen values")
eig_val = get_eigenvalue(res.pca)
print(eig_val)
print("Check explained variance = eigenvalue/sum(eigenvalues)")
print(eig_val[,"eigenvalue"]/sum(eig_val[,"eigenvalue"]))
# Loadings = Rotation Matrix = Eigen Vectors
print("Rotation matrix = Loadings")
print(res.pca$rotation[,1:2])
write.xlsx(res.pca$rotation[,1:2],"./Results/loadings.xlsx")
print("Standard Deviations of Components = SQRT(eigenvalue(PC))")
print(sqrt(eig_val[,"eigenvalue"]))
print("var$coord = ")
print(var$coord[,1:2])
print("var$coord/eigenvector = std(PCs) ")
print(var$coord[,1:2]/res.pca$rotation[,1:2])
print("var$cor = ")
print(var$cor[,1:2])
print("var$cos2 = ")
print(var$cos2[,1:2])
print("var$contrib = ")
print(var$contrib[,1:2])
print("Eigen Value 2")
print(eig_val[2,1])
print("Contrib to (C1 * Eig1 + C2 * Eig2)/(Eig1+Eig2)")
print((var$contrib[,1]*eig_val[1,"eigenvalue"] + var$contrib[,2]*eig_val[2,"eigenvalue"])/(eig_val[1,"eigenvalue"]+eig_val[2,"eigenvalue"]))
print("Sum of contributions to PC1 = ")
print(sum(var$contrib[,1]))
# Plot initial variable contributions to PCs on PCA 2D-space
contribution_percent = 2
variablesContribution_filename = gsub('.pdf',paste0('_varContribution_',contribution_percent,'.pdf'),filename)
print(paste("Number of variables with contribution to PC1 or PC2 > ", contribution_percent, "% = "))
nbVar_contrib = sum(var$contrib[,1]>=contribution_percent | var$contrib[,2]>=contribution_percent)
print(nbVar_contrib)
# Visualize contribution of variables to PCs
pdf(variablesContribution_filename)
plot(fviz_pca_var(res.pca, geom = c("point","arrow", "text"), arrowsize= 0.2, labelsize = 3, alpha.var = "contrib", select.var = list(contrib = nbVar_contrib), repel=TRUE))
dev.off()
# Biplot - Top variable contribution to PC1 and PC2
nbVar_biplot = 15
biplot_filename = gsub('.pdf',paste0('_biplot_',nbVar_biplot,'.pdf'),filename)
pdf(biplot_filename)
plot(fviz_pca_biplot(res.pca, geom.ind = c("point","text"), geom.var = c("point","arrow", "text"), arrowsize= 0.3, label="all", labelsize = 3, alpha.var = "contrib", select.var = list(contrib = nbVar_biplot), repel=TRUE))
dev.off()
# Top variable contribution to PC1 and PC2
contribution_top = 21 #26 #48
variablesContribution_filename_top = gsub('.pdf',paste0('_topVarContrib_',contribution_top,'.pdf'),filename)
pdf(variablesContribution_filename_top)
plot(fviz_contrib(res.pca, choice = "var", axes = 1:2, top = contribution_top))
dev.off()
# Top variable contribution to PC1
contribution_top = 21 #26 #48
variablesContribution_filename_top = gsub('.pdf',paste0('_topVarContrib_1_',contribution_top,'.pdf'),filename)
pdf(variablesContribution_filename_top)
plot(fviz_contrib(res.pca, choice = "var", axes = 1, top = contribution_top))
dev.off()
# Top variable contribution to PC2
contribution_top = 21 #26 #48
variablesContribution_filename_top = gsub('.pdf',paste0('_topVarContrib_2_',contribution_top,'.pdf'),filename)
pdf(variablesContribution_filename_top)
plot(fviz_contrib(res.pca, choice = "var", axes = 2, top = contribution_top))
dev.off()
threeD_projection = predict(res.pca,vectDAGs)
if (is.null(clusters)){
Dim1 = threeD_projection[,1]
Dim2 = threeD_projection[,2]
Dim3 = threeD_projection[,3]
scatterplot3d::scatterplot3d(Dim1,Dim2,Dim3,pch = 16,labels=rownames(threeD_projection))
} else {
# Cluster Colors
col <- c("#CD5C5C","#999999", "#E69F00", "#56B4E9","#800080","#FF00FF","#000080","#0000FF","#008080","#00FFFF")
colors <- col[as.numeric(clusters)]
# Plot Projected Data (but keep z=0 for debugging)
threeD_filename = gsub('.pdf','_3D_z0.pdf',filename)
pdf(threeD_filename)
scatterplot3d::scatterplot3d(x=threeD_projection[,1],y=threeD_projection[,2], z = matrix(0,dim(threeD_projection)[1],1),pch = 16,color=colors)
# scatterplot3d::scatterplot3d(x=threeD_projection[,1],y=threeD_projection[,2], z = matrix(0,dim(threeD_projection)[1],1),pch = 16)
legend("bottom", legend = levels(as.factor(clusters)),
col = col,
pch = c(16, 16, 16),
inset = -0.05, xpd = TRUE, horiz = TRUE)
dev.off()
# Plot Projected Data (but keep z=0 for debugging)
threeD_filename = gsub('.pdf','_3D.pdf',filename)
pdf(threeD_filename)
scatterplot3d::scatterplot3d(x=threeD_projection[,1],y=threeD_projection[,2], z = threeD_projection[,2],pch = 16)
# scatterplot3d::scatterplot3d(x=threeD_projection[,1],y=threeD_projection[,2], z = threeD_projection[,2],pch = 16,color=colors)
# legend("bottom", legend = levels(as.factor(clusters)),
# col = col,
# pch = c(16, 16, 16),
# inset = -0.05, xpd = TRUE, horiz = TRUE)
dev.off()
}
print("Done with DIM RED mehod!!!")
return(threeD_projection)
}
#####################################################
# Get Feature Labels from the CSV file given as input.
getFeatureLabels = function(descriptionDir,jointGenders = FALSE){
# Get feature labels
print(descriptionDir)
M=read.csv(descriptionDir)
featureLabels = M[,"label_dhs"]
featureLabels = gsub("\\s+\n\\s+"," ",featureLabels)
if (jointGenders){
otherGenderDir = ""
# Male or Female Datasets
if (grepl("Female",descriptionDir)==TRUE){
featureLabels = paste(featureLabels,'women')
otherGenderDir = gsub("Female","Male", descriptionDir)
M=read.csv(otherGenderDir)
otherGenderFeatureLabels = M[,"label_dhs"]
otherGenderFeatureLabels = paste(otherGenderFeatureLabels,'men')
} else {
if (grepl("Male",descriptionDir)==TRUE){
featureLabels = paste(featureLabels,'men')
otherGenderDir = gsub("Male","Female", descriptionDir)
M=read.csv(otherGenderDir)
otherGenderFeatureLabels = M[,"label_dhs"]
otherGenderFeatureLabels = paste(otherGenderFeatureLabels,'women')
} else {
print("ERROR: No Male or Female Datasets Directory.")
return(-1)
}
}
otherGenderFeatureLabels = gsub("\\s+\n\\s+"," ",otherGenderFeatureLabels)
featureLabels = c(featureLabels,otherGenderFeatureLabels)
}
return(featureLabels)
}
#================================================================================================================================
# MIT LICENCE
# Copyright (c) 2019, Aziza Merzouki
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files
# (the "Software"), to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
# and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
# OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
#================================================================================================================================
source("featuresClustering.R")
#####################################################
# Evaluate clustering using Silhouette Width and Dunn Index
evaluateClustering = function(d, clusters){
# Silhouette width
sil = silhouette(clusters,dist=d)
mean_sil = mean(sil[,"sil_width"])
# Dunn Index
dunnIdx= dunn(d,clusters)
return(list(mean_sil,dunnIdx))
}
#####################################################
# Perform Clustering using different numbers of clusters and Evaluate their results
runClusteringInvestigation = function(additionalDataFile, cutReference, cutRefValues, clusteringMethod="complete",incidenceYear=NULL,scale=FALSE){
print("runClusteringInvestigation()")
print(paste("additionalDataFile = ",additionalDataFile))
print(paste("cutReference = ",cutReference))
print(paste("cutRefValues = ",cutRefValues))
print(paste("clusteringMethod = ",clusteringMethod))
print(paste("incidenceYear = ",incidenceYear))
# Cutting reference values
nRefValues = length(cutRefValues)
allSil = array(dim = nRefValues)
allDunn = array(dim = nRefValues)
# Cluster with different Cutting Reference Values (height or number of clusters)
for (i in c(1: nRefValues)){
cutVal = cutRefValues[i]
clustRes = runFeaturesClustering(additionalDataFile, cutReference, cutVal, clusteringMethod, incidenceYear, scale)
clusters = clustRes[[1]]
d = clustRes[[2]]
DS_id = clustRes[[3]]
nClusters = max(clusters)
nObs = length(clusters)
if (nClusters > 1){
evalRes = evaluateClustering(d, clusters)
allSil[i] = evalRes[[1]]
allDunn[i] = evalRes[[2]]
}
}
max_sil = allSil[which.max(allSil)]
optCutRefVal_sil = cutRefValues[which.max(allSil)]
max_dunn = allDunn[which.max(allDunn)]
optCutRefVal_dunn = cutRefValues[which.max(allDunn)]
# Plot the Evolution of the Average Silhouette Width with respect to the Cutting Reference Value (Height or Number of Clusters)
# Set directory where results must be saved.
mainDir = "./Results"
dataType = "Features"
if (scale){
fileName = paste(mainDir,'/',dataType,'_scale_sil_',DS_id,'_',cutReference,'_',clusteringMethod,'.pdf',sep='')
} else {
fileName = paste(mainDir,'/',dataType,'_sil_',DS_id,'_',cutReference,'_',clusteringMethod,'.pdf',sep='')
}
pdf(fileName)
plot(cutRefValues,allSil,xlab=cutReference,ylab="Average Silhouette Width",main=DS_id)
legend("topright",legend = paste("max_sil = ", format(max_sil,digits=2), " - ", cutReference, " = ", optCutRefVal_sil))
dev.off()
return(list(max_sil, optCutRefVal_sil,max_dunn, optCutRefVal_dunn))
}
#================================================================================================================================
# MIT LICENCE
# Copyright (c) 2019, Aziza Merzouki
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files
# (the "Software"), to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
# and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
# OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
#================================================================================================================================
library('xlsx')
library('ggplot2')
library('corrplot')
source("clusteringTools.R")
source("plotFeatureMaps.R")
#####################################################
# Clustering countries based on features.
runFeaturesClustering = function(additionalDataFile=NULL, cutReference, cutRefValue, clusteringMethod = "complete", incidenceYear=NULL, scale = FALSE, data_sheetName = 'Data'){
print("runFeaturesClustering(...)")
print(paste("additionalDataFile = ",additionalDataFile))
print(paste("cutReference = ",cutReference))
print(paste("cutRefValue = ",cutRefValue))
print(paste("clusteringMethod = ",clusteringMethod))
print(paste("incidenceYear = ",incidenceYear))
dataType="Feature"
DS_id = "Male_Female"
# Get Data MATRIX.
M = read.xlsx(additionalDataFile,sheetName = data_sheetName)
#M = read.xlsx(additionalDataFile)
# Remove Columns with all rows = NA
M = M[,colSums(is.na(M))<nrow(M)]
# Remove Rows with all columns = NA
M = M[rowSums(is.na(M))<ncol(M),]
# Get Country Names to set Row Names and remove Country Column from Data
countryNames = M[,"Country"]
M = M[,2:ncol(M)]
rownames(M) = countryNames
# Set directory where results must be saved.
mainDir = "./Results"
dir.create(mainDir)
if (scale){
M = scale(M)
}
print("M Before PLOT FEATURE MAP")
print(M)
# Used when plotting Biplot (individual projection and variable contribution in PCA)
M = M/100
# Plot Maps of Features.
if (scale){
fileName_spec = 'scale'
plotFeatureMaps(M,mainDir,fileName_spec)
}else{
plotFeatureMaps(M,mainDir)
}
print("DONE PLOTTING FEATURE MAP")
# Dimensionality Reduction
if (scale){
fileName = paste(mainDir,'/',dataType,'_scale_PCA_noClusters_',DS_id,'.pdf',sep='')
} else {
fileName = paste(mainDir,'/',dataType,'_PCA_noClusters_',DS_id,'.pdf',sep='')
}
M_reduced = reduceDimensions(M,countryNames,filename=fileName)
print(M)
print(str(M))
# Cluster Countries based on their Feature Frequencies
# Distances between Countries' Feature Frequencies are computed using Euclidian Distance.
euclidDist = computeEuclidianDistance(M)
d = as.dist(euclidDist)
print(euclidDist)
pdf(paste(mainDir,"/EuclidDissimilarity_matrix.pdf",sep=""))
g = corrplot(1-euclidDist/max(euclidDist), order = "hclust", method="color",is.corr=FALSE,tl.col = "black", addrect = 3)
dev.off()
hc = hierarchicalClustering(d,clusteringMethod = clusteringMethod)
print("DONE Clustering")
# The hclust object takes the node labels as they are ordered in the dendrogram
hc$labels = countryNames
# Plot the dendrogram
if (scale){
fileName = paste(mainDir,'/',dataType,'_scale_dend_',DS_id,'_',clusteringMethod,'.pdf',sep='')
} else{
fileName = paste(mainDir,'/',dataType,'_dend_',DS_id,'_',clusteringMethod,'.pdf',sep='')
}
pdf(fileName)
idx = gregexpr(pattern="Causal_DHS", mainDir)[[1]][1]
figTitle = paste("Cluster dendrogram - ",DS_id," Feature frequencies,\n",clusteringMethod, sep="")
plot(as.dendrogram(hc), main = figTitle, nodePar = list(lab.cex = 0.8, pch = c(NA, NA)))
# plot(hc,labels=countryNames)
dev.off()
cutRef_params = paste(cutReference,'_', cutRefValue,sep='')
# Define clusters based on Height or Number of Clusters
if (cutReference == "height"){
clusters = cutree(hc, h=cutRefValue)
} else {
if (cutReference == "nbClust"){
clusters = cutree(hc, k=cutRefValue)
}
}
print("Clusters BEFORE relabel")
print(clusters)
# Swap Clusters
Relabel = c(2,1,3)
clustersRelabelled = clusters
for (i in c(1:3)){
clustersRelabelled[clusters == i] = Relabel[i]
}
clusters = clustersRelabelled
print("Clusters AFTER relabel")
print(clusters)
nClusters = max(clusters)
mapData = data.frame(country = names(clusters), cluster=clusters)
# Save summary of all variables per cluster
for(i in c(1:nClusters)){
print(paste("Summary for Cluster", i))
clusterCountryNames = names(clusters[clusters == i])
clusterM = M[clusterCountryNames,]
df_clusterM = data.frame(Country=rownames(clusterM), Cluster=i)
print(df_clusterM)
write.csv(df_clusterM,paste0(mainDir,'/countries_cluster',i,'.csv'),row.names = FALSE)
print(clusterM)
print(summary(clusterM))
}
# return(0)
print("M right before plot of feature densities per cluster")
print(M[1:10,1:10])
# Draw Density Plots per Cluster for all Variables
draw_density_plots(M,clusters,mainDir,cutRefValue)
# Dimensionality Reduction
if (scale){
fileName = paste(mainDir,'/',dataType,'_scale_PCA_',DS_id,'_',cutRef_params,'_',clusteringMethod,'.pdf',sep='')
} else {
fileName = paste(mainDir,'/',dataType,'_PCA_',DS_id,'_',cutRef_params,'_',clusteringMethod,'.pdf',sep='')
}
reduceDimensions(M,countryNames,clusters,fileName)
# # Dimensionality Reduction using National HIV incidences (to color country projections)
# incidenceFile = paste(getwd(),'/../data_countries/HIVIncidence_per1000_15_49.csv',sep='/')
# incidences = getCountryAttribute(incidenceFile,countryNames,incidenceYear)
# reduceDimensions(M,countryNames,floor(incidences[,paste0("X",incidenceYear)]),fileName)
# Draw Africa's Map with Countries Coloured based on their Cluster
if (scale){
fileName = paste(mainDir,'/',dataType,'_scale_map_',DS_id,'_',cutRef_params,'_',clusteringMethod,'.pdf',sep='')
} else {
fileName = paste(mainDir,'/',dataType,'_map_',DS_id,'_',cutRef_params,'_',clusteringMethod,'.pdf',sep='')
}
title = paste("Cluster Map - ", DS_id,' Feature frequencies\n',clusteringMethod, sep="")
drawMapOfClusters(clusters,"africa",title,fileName)
# Incidence
if (!is.null(incidenceYear)){
# Draw Incidence Map, to compare to Feature Matrices Clustering Map
incidenceFile = paste(getwd(),'/../data_countries/HIVIncidence_per1000_15_49.csv',sep='/')
plotIncidenceMap(mainDir,incidenceFile, mapData$country, incidenceYear)
prevalenceFile = paste(getwd(),'/../data_countries/HIVPrevalence_15_49.csv',sep='/')
plotPrevalenceMap(mainDir,prevalenceFile, mapData$country, incidenceYear)
# Compute Incidence Box Plot of each cluster.
if (scale){
fileName = paste(mainDir,'/',dataType,'_scale_cluster_IncidDistrib_',DS_id,'_',cutRef_params,'_',clusteringMethod,'.pdf',sep='')
} else {
fileName = paste(mainDir,'/',dataType,'_cluster_IncidDistrib_',DS_id,'_',cutRef_params,'_',clusteringMethod,'.pdf',sep='')
}
drawAttributeBoxplotsPerCluster(incidenceFile,"Incidence",mapData$country,incidenceYear,clusters,title,fileName)
# Compute Prevalence Box Plot of each cluster.
if (scale){
fileName = paste(mainDir,'/',dataType,'_scale_cluster_PrevDistrib_',DS_id,'_',cutRef_params,'_',clusteringMethod,'.pdf',sep='')
} else {
fileName = paste(mainDir,'/',dataType,'_cluster_PrevDistrib_',DS_id,'_',cutRef_params,'_',clusteringMethod,'.pdf',sep='')
}
drawAttributeBoxplotsPerCluster(prevalenceFile,"Prevalence",mapData$country,incidenceYear,clusters,title,fileName)
}
return(list(clusters,d,DS_id))
}
#================================================================================================================================
# MIT LICENCE
# Copyright (c) 2019, Aziza Merzouki
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files
# (the "Software"), to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
# and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
# OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
#================================================================================================================================
source("evaluateClustering.R")
# Set XLSX file with country level data.
file_StatCompiler_data = "../data_countries/all_SSA_var_Religion_ART_2015_GINI_percent_nbPartners_norm_completedData.xlsx"
# Run clustering with given number of clusters.
nbClusters = 3
runFeaturesClustering(file_StatCompiler_data,"nbClust",nbClusters,incidenceYear=2016,data_sheetName = "Data_LongVar")
# Run clustering with a range of cluster numbers
min_nbClusters = 2
max_nbClusters = 4
# runClusteringInvestigation(file_StatCompiler_data,"nbClust",c(min_nbClusters:max_nbClusters),incidenceYear=2016)
#================================================================================================================================
# MIT LICENCE
# Copyright (c) 2019, Aziza Merzouki
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files
# (the "Software"), to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
# and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
# OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
#================================================================================================================================
library(rworldmap)
library(rworldxtra)
library(manipulate)
#####################################################
# Plot Variables Map, Given a Matrix M where Columns = Variables and Rows = Countries.
plotFeatureMaps <- function(M,mainDir,fileName_spec=NULL){
# M = t(M)
print("plotFeaturesMap(...)")
print(M)
Variable = colnames(M)
n = length(Variable)
print(paste("n = ", n))
Country = rownames(M)
# Add the Country Names to the Dataframe of Feature Associations
M=data.frame(M,Country)
# print("M with Country Column = ")
# print(M)
# Plot Map of Features
# # Interactive Map
# sPDF <- getMap(resolution="high")
# spdf <- joinCountryData2Map(M,joinCode='NAME',nameJoinColumn="Country")
# print(Variable[1])
# # manipulate(mapCountryData(spdf,nameColumnToPlot= Variable[i], mapRegion="africa",mapTitle= paste("Feature Map ",Variable[i]),catMethod=seq(floor(min(M[,Variable[i]])),ceiling(max(M[,Variable[i]])),by=(ceiling(max(M[,Variable[i]]))-floor(min(M[,Variable[i]])))/10)), i=slider(1,n))
# manipulate(mapCountryData(spdf,nameColumnToPlot= Variable[i], mapRegion="africa",mapTitle= Variable[i],catMethod=seq(floor(min(M[,Variable[i]])),ceiling(max(M[,Variable[i]])),by=(ceiling(max(M[,Variable[i]]))-floor(min(M[,Variable[i]])))/10)), i=slider(1,n))
# # Add Country Labels on the Map
# labelCountries(spdf,nameCountryColum="Country",cex=0.5,col="black")
# Save Separate Maps
extension = '.pdf'
# extension = '.png'
for (i in c(1:n)){
if (is.null(fileName_spec)){
fileName = paste(mainDir,'/map_var_',i,extension,sep='')
} else{
fileName = paste(mainDir,'/map_var_',i,'_',fileName_spec,extension,sep='')
}
pdf(fileName)
# png(fileName)
# Plot Map of Features
sPDF <- getMap(resolution="high")
spdf <- joinCountryData2Map(M,joinCode='NAME',nameJoinColumn="Country")
print(Variable[1])
# mapCountryData(spdf,nameColumnToPlot= Variable[i], mapRegion="africa",mapTitle= paste("Feature Map ",Variable[i]),catMethod=seq(floor(min(M[,Variable[i]])),ceiling(max(M[,Variable[i]])),by=(ceiling(max(M[,Variable[i]]))-floor(min(M[,Variable[i]])))/10))
mapCountryData(spdf,nameColumnToPlot= Variable[i], mapRegion="africa",mapTitle= Variable[i],catMethod=seq(floor(min(M[,Variable[i]])),ceiling(max(M[,Variable[i]])),by=(ceiling(max(M[,Variable[i]]))-floor(min(M[,Variable[i]])))/10))
# Add Country Labels on the Map
# labelCountries(spdf,nameCountryColum="Country",cex=0.5,col="black")
dev.off()
}
}
#================================================================================================================================
# MIT LICENCE
# Copyright (c) 2019, Aziza Merzouki
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files
# (the "Software"), to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
# and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
# OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
#================================================================================================================================
library(rworldmap)
library(rworldxtra)
library(manipulate)
library(reshape2)
library(plotly)
library(IRdisplay)
#####################################################
# Get the Attribute Value of a list of Countries for given Years
getCountryAttribute <- function(attributeFilePath,countries,years){
# Read Attribute File
M = read.csv(attributeFilePath)
# print(M)
colNames = paste("X",years,sep="")
colIdxs = match(colNames, colnames(M))
country_incidence = M[,c("Country",colNames)]
country_incidence = country_incidence[country_incidence$Country%in%countries,]
for (year in years){
# Identify the column containing the incidence rates for the given year
colName = paste("X",year,sep="")
colIdx = match(colName, colnames(M))
# Keep Country and Incidence, while ignoring unavailable data "... "
# country_incidence = country_incidence[country_incidence[,colIdx]!="... ",c("Country",colName)]
# Convert Incidence from Factor to Numeric, and ignore NA data
country_incidence[,colName] = as.numeric(levels(country_incidence[,colName]))[country_incidence[,colName]]
#country_incidence = country_incidence[is.na(country_incidence$incidenceNum)==FALSE,]
}
return(country_incidence)
}
#####################################################
# Plot Incidence Map for a given set of Countries in a given Year,
# based on a File containing the Indcidence measure for "all" countries over the years.
# ex. countries = read.xlsx('../data_countries/list_SSA_countries.xlsx',sheetName = "Data")
# plotIncidenceMap('../data_countries/HIVIncidence_per1000_15_49.csv', countries[,1], c(2016))
plotIncidenceMap <- function(mainDir, incidenceFilePath, countries, years){
print(countries)
country_incidence = getCountryAttribute(incidenceFilePath, countries, years)
print(country_incidence)
country_incidence = na.omit(country_incidence)
print(country_incidence)
for (year in years){
# Plot Histogram of Incidence
pdf(paste('./',mainDir,'/incidenceHistogram_',year,'.pdf',sep=''))
hist(country_incidence[,paste("X",year,sep="")],breaks = seq(0,50,by=0.5), main=paste("Incidence Histogram", year, sep=" - "),xlab="HIV Incidence per 1000 Population")
dev.off()
# Get map from rworldmap
# quartz()
nbCat = 40
catMeth = "fixedWidth" #"fixedWidth" #"quantiles"
# catMeth = "myCatMethod"
if (catMeth == "myCatMethod"){
catMeth_tmp = c(0,0.1,0.2,0.5,1,2,5,seq(10,50,by=10))
nbCat = length(catMeth_tmp)
fileName = paste('./',mainDir,'/incidenceMap_',year,'_',catMeth,'_',nbCat,'.pdf',sep='')
print(fileName)
catMeth = catMeth_tmp
} else {
fileName = paste('./',mainDir,'/incidenceMap_',year,'_',catMeth,'_', nbCat,'.pdf',sep='')
}
pdf(fileName)
sPDF <- getMap(resolution="high")
spdf <- joinCountryData2Map(country_incidence,joinCode='NAME',nameJoinColumn="Country")
mapCountryData(spdf,nameColumnToPlot=paste("X",year,sep=''), mapRegion="africa",mapTitle= paste("Incidence Map",year),catMethod=catMeth,numCats= nbCat)
quantile(country_incidence[,paste("X",year,sep="")],1:nbCat/nbCat)
# Add Country Labels on the Map
labelCountries(spdf,nameCountryColum="Country",cex=0.4,col="black")
dev.off()
}
if (length(years)>1){
manipulate(mapCountryData(spdf,nameColumnToPlot=paste("X",year,sep=''), mapRegion="africa",mapTitle= paste("Incidence Map",year),catMethod=catMeth,numCats= nbCat),year=slider(min(years),max(years),step=5))
}
}
#####################################################
# Plot Prevalence Map for a given set of Countries in a given Year,
# based on a File containing the Prevalence measure for "all" countries over the years.
# ex. countries = read.xlsx('../data_countries/list_SSA_countries.xlsx',sheetName = "Data")
# plotPrevalenceMap('../data_countries/HIVPrevalence_15_49.csv', countries[,1], c(2015))
plotPrevalenceMap <- function(mainDir, prevalenceFilePath, countries, years){
country_prevalence = getCountryAttribute(prevalenceFilePath, countries, years)
print(country_prevalence)
country_prevalence = na.omit(country_prevalence)
print(country_prevalence)
for (year in years){
# Plot Histogram of Incidence
pdf(paste('./',mainDir,'/prevalenceHistogram_',year,'.pdf',sep=''))
# hist(country_prevalence[,paste("X",year,sep="")],breaks = seq(0,50,by=0.5), main=paste("Prevalence Histogram", year, sep=" - "),xlab="HIV Prevalence")
hist(country_prevalence[,paste("X",year,sep="")], main=paste("Prevalence Histogram", year, sep=" - "),xlab="HIV Prevalence")
dev.off()
# Get map from rworldmap
nbCat = 40
catMeth = "fixedWidth" #"fixedWidth" #"quantiles"
# catMeth = "myCatMethod"
if (catMeth == "myCatMethod"){
catMeth_tmp = c(0,0.1,0.2,0.5,1,2,5,seq(10,50,by=10))
nbCat = length(catMeth_tmp)
fileName = paste('./',mainDir,'/prevalenceMap_',year,'_',catMeth,'_',nbCat,'.pdf',sep='')
print(fileName)
catMeth = catMeth_tmp
} else {
fileName = paste('./',mainDir,'/prevalenceMap_',year,'_',catMeth,'_', nbCat,'.pdf',sep='')
}
pdf(fileName)
sPDF <- getMap(resolution="high")
spdf <- joinCountryData2Map(country_prevalence,joinCode='NAME',nameJoinColumn="Country")
mapCountryData(spdf,nameColumnToPlot=paste("X",year,sep=''), mapRegion="africa",mapTitle= paste("Prevalence Map",year),catMethod=catMeth,numCats= nbCat)
quantile(country_prevalence[,paste("X",year,sep="")],1:nbCat/nbCat)
# Add Country Labels on the Map
# labelCountries(spdf,nameCountryColum="Country",cex=0.4,col="black")
dev.off()
}
if (length(years)>1){
manipulate(mapCountryData(spdf,nameColumnToPlot=paste("X",year,sep=''), mapRegion="africa",mapTitle= paste("Prevalence Map",year),catMethod=catMeth,numCats= nbCat),year=slider(min(years),max(years),step=5))
}
}
This diff is collapsed.
This diff is collapsed.
File added
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment