Unsupervised Analysis Workflow

Packages required for analysis: “dendextend”,“ggdendro”,“ggplot2”,“randomForest”,“caret”,“RColorBrewer”. Their dependencies may also be required.

Function for F-M index calculation

get_clustResults <- function(dataMat, treeInd, refClust, methodInd, colorsMat, classNum) {
    
    # Calculation of row-wise correlation from the input data matrix:
    corMat_pearson <- matrix(ncol = dim(dataMat)[1], nrow = dim(dataMat)[1])
    corMat_kendall <- matrix(ncol = dim(dataMat)[1], nrow = dim(dataMat)[1])
    corMat_spearman <- matrix(ncol = dim(dataMat)[1], nrow = dim(dataMat)[1])
    
    for (i in 1:dim(dataMat)[1]) {
        for (j in 1:dim(dataMat)[1]) {
            corMat_pearson[i, j] <- cor(as.vector(dataMat[i, ]), as.vector(dataMat[j, ]), method = "pearson")
            corMat_kendall[i, j] <- cor(as.vector(dataMat[i, ]), as.vector(dataMat[j, ]), method = "kendall")
            corMat_spearman[i, j] <- cor(as.vector(dataMat[i, ]), as.vector(dataMat[j, ]), method = "spearman")
        }
    }
    
    # Asssigning obtained correlation matrices as input similarity matrices for clustering:
    sMetric <- list()
    sMetric[[1]] <- corMat_kendall
    sMetric[[2]] <- corMat_pearson
    sMetric[[3]] <- corMat_spearman
    
    for (i in 1:3) {
        colnames(sMetric[[i]]) <- rownames(sMetric[[i]]) <- rownames(dataMat)
    }
    
    hclustfun <- list()
    hclustfun[[1]] = function(x) hclust(x, method = "ward.D2")
    hclustfun[[2]] = function(x) hclust(x, method = "average")
    hclustfun[[3]] = function(x) hclust(x, method = "complete")
    
    hc <- list()
    count <- 0
    for (i in 1:length(hclustfun)) {
        for (j in 1:length(sMetric)) {
            count <- count + 1
            hc[[count]] <- hclustfun[[i]](dist(sMetric[[j]]))
        }
    }
    
    methods <- c("Ward_KendallCor", "Ward_PearsonCor", "Ward_SpearmanCor", "Avg_KendallCor", "Avg_PearsonCor", "Avg_SpearmanCor", "Complete_KendallCor", 
        "Complete_PearsonCor", "Complete_SpearmanCor")
    
    fmRefIndex <- matrix(nrow = length(methods), ncol = 1)
    pre_nullFM_Index <- NULL
    pVal <- NULL
    nullFM_Index <- matrix(nrow = length(methods), ncol = 1)
    sdFM_Index <- matrix(nrow = length(methods), ncol = 1)
    
    permNumber <- 1000  #number of permutations
    
    for (i in 1:length(methods)) {
        # comparison is done to refClust[[2]] which is the average linkage on reference data
        fmRefIndex[i] <- FM_index_R(cutree(hc[[i]], k = treeInd), cutree(refClust[[2]], k = treeInd))
        pre_nullFM_Index <- Bk_permutations(hc[[i]], refClust[[2]], k = treeInd, R = permNumber)
        nullFM_Index[i] <- mean(pre_nullFM_Index[[1]])
        sdFM_Index[i] <- sd(pre_nullFM_Index[[1]])
        histAcc <- hist(pre_nullFM_Index[[1]], plot = FALSE)
        pVal[i] <- sum(histAcc$counts[which(histAcc$breaks >= fmRefIndex[i])], na.rm = TRUE)/permNumber
    }
    
    all.par <- NULL
    all.par <- rbind(all.par, data.frame(method = methods, tree_Cut = treeInd, FM_index = round(fmRefIndex, 2), null_FM_index = round(nullFM_Index, 2), 
        standard_dev = round(sdFM_Index, 2), p_value = round(pVal, 2)))
    results <- list(all.par = all.par, hc = hc)
    return(results)
}

Setting the seed for reproducibility

The default kind of the random number generator is changed in R 3.6.0. Setting the sampling kind to rounding will allow consistency with previous versions of R.

RNGkind(sample.kind = "Rounding")
## Warning in RNGkind(sample.kind = "Rounding"): non-uniform 'Rounding' sampler used
set.seed(42)

Input parameter options

  1. classNum: Number of classes /
    Options: 3,9 or 16 classes for SRM grouping

  2. methodInd: Index for the hierarchical clustering methodology
    Options: 1-Kendall correlation with Ward’s method
    2-Pearson correlation with Ward’s method
    3-Spearman correlation with Ward’s method
    4-Kendall correlation with average linkage method
    5-Pearson correlation with average linkage method
    6-Spearman correlation with average linkage method
    7-Kendall correlation with complete linkage method
    8-Pearson correlation with complete linkage method
    9-Spearman correlation with complete linkage method

# For this analysis, we select the following input arguments:
classNum = 3
methodInd = 5

Read GC-MS data of SRM samples

# Reading processed input GC-MS data of SRM samples
unfData <- read.csv("Unfolded_GC-MS_data_of_SRM.csv", header = T)
rownames(unfData) <- unfData[, 1]  #assigning the row names as sample names
unfData <- unfData[, -1]  #excluding the first column that contains sample names
unfData <- as.matrix(unfData)
First 10 rows and 5 columns of the processed GC-MS data
Decalin C1.Decalins C2.Decalins C3.Decalins Naphthalene
petro258 1683524 4176960 2478180 3125211 2101455
petro259 1649766 4104724 2433514 3085597 2060556
petro260 1610183 4043132 2391167 3032684 2059942
petro230 234594070 215240060 149590418 206905029 106635695
petro231 300689337 276231110 191760364 264012867 135649094
petro232 348938684 320796132 222587040 306269193 155050777
petro266 61680447 48228029 33821356 48344642 22031443
petro267 69692453 54506887 38215996 54608205 24832960
petro268 77114253 60267441 42288380 60480097 27432439
petro203 1957888 4406090 2546545 3242664 2670092

Reference categorization of SRM samples

sampleIDs <- read.csv("SRM_categorization_info.csv", header = T)
SRM category information
Name Sample Labels_16_Class Labels_9_Class Labels_3_Class Index_16class Index_9class Index_3class
87 Octane Gasoline petro258 87 Octane Gasoline Gasoline LRP 1 1 1
87 Octane Gasoline petro259 87 Octane Gasoline Gasoline LRP 1 1 1
87 Octane Gasoline petro260 87 Octane Gasoline Gasoline LRP 1 1 1
SRM 2773 petro230 Biodiesel (Animal-based) Biodiesel LRP 2 2 1
SRM 2773 petro231 Biodiesel (Animal-based) Biodiesel LRP 2 2 1
SRM 2773 petro232 Biodiesel (Animal-based) Biodiesel LRP 2 2 1
SRM 2772 petro266 Biodiesel (Soy-based) Biodiesel LRP 3 2 1
SRM 2772 petro267 Biodiesel (Soy-based) Biodiesel LRP 3 2 1
SRM 2772 petro268 Biodiesel (Soy-based) Biodiesel LRP 3 2 1
SRM 2722 petro203 Crude Oil (Heavy-Sweet) Crude Oil Crude 4 3 2
SRM 2722 petro204 Crude Oil (Heavy-Sweet) Crude Oil Crude 4 3 2
SRM 2722 petro205 Crude Oil (Heavy-Sweet) Crude Oil Crude 4 3 2
SRM 2721 petro274 Crude Oil (Light-Sour) Crude Oil Crude 5 3 2
SRM 2721 petro275 Crude Oil (Light-Sour) Crude Oil Crude 5 3 2
SRM 2721 petro276 Crude Oil (Light-Sour) Crude Oil Crude 5 3 2
SRM 1615 petro207 Gas Oil Gas Oil HRP 6 4 3
SRM 1615 petro208 Gas Oil Gas Oil HRP 6 4 3
SRM 1615 petro209 Gas Oil Gas Oil HRP 6 4 3
SRM 2779 petro270 Gulf of Mexico Crude Oil Crude Oil Crude 7 3 2
SRM 2779 petro271 Gulf of Mexico Crude Oil Crude Oil Crude 7 3 2
SRM 2779 petro272 Gulf of Mexico Crude Oil Crude Oil Crude 7 3 2
JP8 petro246 Jet Fuel Jet Fuel LRP 8 5 1
JP8 petro247 Jet Fuel Jet Fuel LRP 8 5 1
JP8 petro248 Jet Fuel Jet Fuel LRP 8 5 1
JP5 petro250 Jet Fuel Jet Fuel LRP 8 5 1
JP5 petro251 Jet Fuel Jet Fuel LRP 8 5 1
JP5 petro252 Jet Fuel Jet Fuel LRP 8 5 1
Jet Fuel A petro254 Jet Fuel Jet Fuel LRP 8 5 1
Jet Fuel A petro255 Jet Fuel Jet Fuel LRP 8 5 1
Jet Fuel A petro256 Jet Fuel Jet Fuel LRP 8 5 1
SRM 2723b petro226 Low S Diesel Diesel LRP 9 6 1
SRM 2723b petro227 Low S Diesel Diesel LRP 9 6 1
SRM 2723b petro228 Low S Diesel Diesel LRP 9 6 1
SRM 1848 petro218 Motor Oil Additive Motor Oil HRP 10 7 3
SRM 1848 petro219 Motor Oil Additive Motor Oil HRP 10 7 3
SRM 1848 petro220 Motor Oil Additive Motor Oil HRP 10 7 3
SRM 2299 petro210 S in gasoline Gasoline LRP 11 1 1
SRM 2299 petro211 S in gasoline Gasoline LRP 11 1 1
SRM 2299 petro212 S in gasoline Gasoline LRP 11 1 1
SRM 1617b petro242 S in Kerosene (High Level) Kerosene LRP 12 8 1
SRM 1617b petro243 S in Kerosene (High Level) Kerosene LRP 12 8 1
SRM 1617b petro244 S in Kerosene (High Level) Kerosene LRP 12 8 1
SRM 1616b petro262 S in Kerosene (Low Level) Kerosene LRP 13 8 1
SRM 1616b petro263 S in Kerosene (Low Level) Kerosene LRP 13 8 1
SRM 1616b petro264 S in Kerosene (Low Level) Kerosene LRP 13 8 1
SRM 2770 petro234 S in Residual Fuel Oil RFO HRP 14 9 3
SRM 2770 petro235 S in Residual Fuel Oil RFO HRP 14 9 3
SRM 2770 petro236 S in Residual Fuel Oil RFO HRP 14 9 3
SRM 1623c petro238 S in Residual Fuel Oil RFO HRP 14 9 3
SRM 1623c petro239 S in Residual Fuel Oil RFO HRP 14 9 3
SRM 1623c petro240 S in Residual Fuel Oil RFO HRP 14 9 3
SRM 1620c petro278 S in Residual Fuel Oil RFO HRP 14 9 3
SRM 1620c petro279 S in Residual Fuel Oil RFO HRP 14 9 3
SRM 1620c petro280 S in Residual Fuel Oil RFO HRP 14 9 3
SRM 1624d petro214 Sulfur in Diesel Diesel LRP 15 6 1
SRM 1624d petro215 Sulfur in Diesel Diesel LRP 15 6 1
SRM 1624d petro216 Sulfur in Diesel Diesel LRP 15 6 1
SRM 2771 petro222 Zero S Diesel Diesel LRP 16 6 1
SRM 2771 petro223 Zero S Diesel Diesel LRP 16 6 1
SRM 2771 petro224 Zero S Diesel Diesel LRP 16 6 1

Missing data imputation with zeros

unfData[is.na(unfData)] <- 0

Data cleaning

Feature columns with zero standard deviation are removed from analysis.

unfData <- unfData[, which(sapply(1:dim(unfData)[2], function(x) sd(unfData[, x])) > 0)]

dataMat <- dummy_dataMat <- matrix(ncol = (dim(unfData)[2]), nrow = dim(unfData)[1])

Data scaling

For no dimensionality reduction option, row-wise min-max scaling is used. For no dimensionality reduction option, row-wise z-score normalization is used.

# For no dimensionality reduction option: Row-wise min-max scaling
for (i in 1:dim(unfData)[2]) {
    for (j in 1:dim(unfData)[1]) {
        minVal <- min(unfData[j, ])
        maxVal <- max(unfData[j, ])
        dataMat[j, i] <- (unfData[j, i] - minVal)/(maxVal - minVal)
    }
}

# For no dimensionality reduction option: Row-wise z-score normalization
for (i in 1:dim(unfData)[2]) {
    for (j in 1:dim(unfData)[1]) {
        meanVal <- mean(unfData[j, ])
        sdVal <- sd(unfData[j, ])
        dummy_dataMat[j, i] <- (unfData[j, i] - meanVal)/sdVal
    }
}

Singular Value Decomposition

Singular Value Decomposition with a threshold of 85% variance

Xmat <- svd(dummy_dataMat)
criteriaInd <- max(which(cumsum(Xmat$d/sum(Xmat$d) * 100) <= 85))
selU_mat <- Xmat$u[, 1:criteriaInd]
selD_mat <- matrix(0, ncol = dim(selU_mat)[2], nrow = dim(selU_mat)[2])
diag(selD_mat) <- as.matrix(sqrt(Xmat$d[1:criteriaInd]))
red_dataMat <- selU_mat %*% selD_mat

Clustering analysis

refObj <- NULL
refObj <- sampleIDs[, paste("Index_", classNum, "class", sep = "")]
names(refObj) <- rownames(dataMat) <- rownames(red_dataMat) <- rownames(unfData)

dMetric_ref <- dist(refObj)  #Euclidean distance
hclustfun <- list()
hclustfun[[1]] = function(x) hclust(x, method = "ward.D2")
hclustfun[[2]] = function(x) hclust(x, method = "average")
hclustfun[[3]] = function(x) hclust(x, method = "complete")

refClust <- list()
count <- 0
for (i in 1:length(hclustfun)) {
    count <- count + 1
    refClust[[count]] <- hclustfun[[i]](dMetric_ref)
}

# Defining colors for the dendrogram
colorsMat <- c("goldenrod", "darkorange4", "darkolivegreen", "blue4", "firebrick4", "plum1", "grey50", "seagreen3", "palevioletred1", "turquoise1", "deeppink4", 
    "dodgerblue", "darkorchid3", "saddlebrown", "maroon2", "lawngreen")

# Defining the number of cuts for the F-M index calculation
treeInd <- classNum

# Calling functions for producing grouping structure based on the selected parameters
results <- get_clustResults(dataMat, treeInd, refClust, methodInd, colorsMat, classNum)
reduced_results <- get_clustResults(red_dataMat, treeInd, refClust, methodInd, colorsMat, classNum)
F-M Indices and p-values (No Dimensionality Reduction)
method tree_Cut FM_index null_FM_index standard_dev p_value
Ward_KendallCor 3 0.53 0.39 0.02 0.00
Ward_PearsonCor 3 0.40 0.41 0.02 0.31
Ward_SpearmanCor 3 0.49 0.44 0.02 0.02
Avg_KendallCor 3 0.57 0.53 0.03 0.04
Avg_PearsonCor 3 0.51 0.50 0.03 0.26
Avg_SpearmanCor 3 0.57 0.53 0.03 0.04
Complete_KendallCor 3 0.64 0.45 0.02 0.00
Complete_PearsonCor 3 0.51 0.51 0.03 0.36
Complete_SpearmanCor 3 0.57 0.53 0.03 0.05
F-M Indices and p-values (Dimensionality Reduction)
method tree_Cut FM_index null_FM_index standard_dev p_value
Ward_KendallCor 3 0.41 0.38 0.01 0.02
Ward_PearsonCor 3 0.50 0.39 0.01 0.00
Ward_SpearmanCor 3 0.41 0.38 0.01 0.01
Avg_KendallCor 3 0.56 0.44 0.02 0.00
Avg_PearsonCor 3 0.54 0.48 0.03 0.04
Avg_SpearmanCor 3 0.48 0.42 0.02 0.00
Complete_KendallCor 3 0.41 0.39 0.02 0.06
Complete_PearsonCor 3 0.60 0.44 0.02 0.00
Complete_SpearmanCor 3 0.41 0.39 0.02 0.07

Generating dendrograms

Dendrogram is generated for the selected clustering methodology (Average linkage & Pearson correlation with no dimensionality reduction)

i <- methodInd  # passing the clustering methodology index for generation of dendrograms
hc <- results$hc
groups <- NULL
groups <- cutree(hc[[i]], k = treeInd)  #cut tree into n clusters, where n is treeInd

dd.row <- as.dendrogram(hc[[i]])
ddata_x <- dendro_data(dd.row, type = "rectangle")
groupMat <- data.frame(label = sampleIDs$Sample, groupInfo = sampleIDs[, paste("Labels_", classNum, "_Class", sep = "")])
labs <- merge(label(ddata_x), groupMat, by.y = "label")
labs_final <- labs[order(labs$x, decreasing = FALSE), ]
Class_Labels <- as.factor(labs_final$groupInfo)
ggplot(segment(ddata_x)) + geom_segment(aes(x = x, y = y, xend = xend, yend = yend), colour = "black") + geom_text(data = label(ddata_x), aes(label = label, 
    x = x, y = 0, colour = Class_Labels), size = 5, fontface = 2, nudge_y = 0.8) + theme(panel.background = element_rect(fill = "white"), panel.grid = element_blank(), 
    axis.text = element_blank(), axis.ticks.x = element_blank(), axis.ticks.y = element_blank()) + labs(x = "", y = "") + scale_color_manual(values = colorsMat) + 
    coord_flip() + scale_y_reverse(expand = c(1, 0))