Unsupervised Analysis Workflow

Packages required for analysis: “dendextend”, “ggdendro”, “ggplot2”, “randomForest”, “caret”, “RColorBrewer”. Their dependencies may also be required.
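
For completeness, the packages can be loaded at the start of the session; a minimal sketch (the comments note the main functions used from each package):

library(dendextend)    # FM_index_R(), Bk_permutations()
library(ggdendro)      # dendro_data(), segment(), label()
library(ggplot2)       # dendrogram plotting, ggsave()
library(randomForest)  # randomForest(), importance()
library(caret)         # train(), trainControl()
library(RColorBrewer)  # color palettes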

Function for Fowlkes-Mallows (F-M) index calculation
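
For reference, this index compares two clusterings cut into the same number of clusters k. With m_{ij} denoting the number of samples assigned to cluster i of the first clustering and cluster j of the second, and n the total number of samples, the index is

B_k = T_k / \sqrt{P_k Q_k},  where  T_k = \sum_{i,j} m_{ij}^2 - n,  P_k = \sum_i (\sum_j m_{ij})^2 - n,  Q_k = \sum_j (\sum_i m_{ij})^2 - n

In the function below, the index is computed with FM_index_R() from dendextend, and Bk_permutations() provides a permutation-based null distribution for assessing its significance.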

get_clustResults <- function(dataMat, treeInd, refClust, methodInd, colorsMat, classNum) {
    
    # Calculation of row-wise correlation from the input data matrix:
    corMat_pearson <- matrix(ncol = dim(dataMat)[1], nrow = dim(dataMat)[1])
    corMat_kendall <- matrix(ncol = dim(dataMat)[1], nrow = dim(dataMat)[1])
    corMat_spearman <- matrix(ncol = dim(dataMat)[1], nrow = dim(dataMat)[1])
    
    for (i in 1:dim(dataMat)[1]) {
        for (j in 1:dim(dataMat)[1]) {
            corMat_pearson[i, j] <- cor(as.vector(dataMat[i, ]), as.vector(dataMat[j, ]), method = "pearson")
            corMat_kendall[i, j] <- cor(as.vector(dataMat[i, ]), as.vector(dataMat[j, ]), method = "kendall")
            corMat_spearman[i, j] <- cor(as.vector(dataMat[i, ]), as.vector(dataMat[j, ]), method = "spearman")
        }
    }
    
    # Assigning the obtained correlation matrices as input similarity matrices for clustering:
    sMetric <- list()
    sMetric[[1]] <- corMat_kendall
    sMetric[[2]] <- corMat_pearson
    sMetric[[3]] <- corMat_spearman
    
    for (i in 1:3) {
        colnames(sMetric[[i]]) <- rownames(sMetric[[i]]) <- rownames(dataMat)
    }
    
    hclustfun <- list()
    hclustfun[[1]] = function(x) hclust(x, method = "ward.D2")
    hclustfun[[2]] = function(x) hclust(x, method = "average")
    hclustfun[[3]] = function(x) hclust(x, method = "complete")
    
    hc <- list()
    count <- 0
    for (i in 1:length(hclustfun)) {
        for (j in 1:length(sMetric)) {
            count <- count + 1
            hc[[count]] <- hclustfun[[i]](dist(sMetric[[j]]))
        }
    }
    
    methods <- c("Ward_KendallCor", "Ward_PearsonCor", "Ward_SpearmanCor", "Avg_KendallCor", "Avg_PearsonCor", "Avg_SpearmanCor", "Complete_KendallCor", 
        "Complete_PearsonCor", "Complete_SpearmanCor")
    
    fmRefIndex <- matrix(nrow = length(methods), ncol = 1)
    pre_nullFM_Index <- NULL
    pVal <- NULL
    nullFM_Index <- matrix(nrow = length(methods), ncol = 1)
    sdFM_Index <- matrix(nrow = length(methods), ncol = 1)
    
    permNumber <- 1000  #number of permutations
    
    for (i in 1:length(methods)) {
        # comparison is done to refClust[[2]] which is the average linkage on reference data
        fmRefIndex[i] <- FM_index_R(cutree(hc[[i]], k = treeInd), cutree(refClust[[2]], k = treeInd))
        pre_nullFM_Index <- Bk_permutations(hc[[i]], refClust[[2]], k = treeInd, R = permNumber)
        nullFM_Index[i] <- mean(pre_nullFM_Index[[1]])
        sdFM_Index[i] <- sd(pre_nullFM_Index[[1]])
        histAcc <- hist(pre_nullFM_Index[[1]], plot = FALSE)
        pVal[i] <- sum(histAcc$counts[which(histAcc$breaks >= fmRefIndex[i])], na.rm = TRUE)/permNumber
    }
    
    all.par <- NULL
    all.par <- rbind(all.par, data.frame(method = methods, tree_Cut = treeInd, FM_index = round(fmRefIndex, 2), null_FM_index = round(nullFM_Index, 2), 
        standard_dev = round(sdFM_Index, 2), p_value = round(pVal, 2)))
    results <- list(all.par = all.par, hc = hc)
    return(results)
}
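
Note that, inside the function, the three pairwise loops can be replaced by single calls to cor(), which computes correlations between the columns of its input; transposing dataMat therefore yields the same row-wise correlation matrices with far less computation. A minimal equivalent sketch:

corMat_pearson <- cor(t(dataMat), method = "pearson")
corMat_kendall <- cor(t(dataMat), method = "kendall")
corMat_spearman <- cor(t(dataMat), method = "spearman")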

Setting the seed for reproducibility

The default kind of the random number generator changed in R 3.6.0. Setting the sample kind to “Rounding” keeps the sampling consistent with results generated under previous versions of R.

RNGkind(sample.kind = "Rounding")
## Warning in RNGkind(sample.kind = "Rounding"): non-uniform 'Rounding' sampler used
set.seed(42)

Input parameter options

  1. classNum: Number of classes
    Options: 3, 9, or 16 classes for SRM grouping

  2. methodInd: Index for the hierarchical clustering methodology
    Options: 1-Kendall correlation with Ward’s method
    2-Pearson correlation with Ward’s method
    3-Spearman correlation with Ward’s method
    4-Kendall correlation with average linkage method
    5-Pearson correlation with average linkage method
    6-Spearman correlation with average linkage method
    7-Kendall correlation with complete linkage method
    8-Pearson correlation with complete linkage method
    9-Spearman correlation with complete linkage method

# For this analysis, we select the following input arguments:
classNum <- 3
methodInd <- 5

Read GC-MS data of SRM samples

# Reading processed input GC-MS data of SRM samples
unfData <- read.csv("Unfolded_GC-MS_data_of_SRM.csv", header = T)
rownames(unfData) <- unfData[, 1]  #assigning the row names as sample names
unfData <- unfData[, -1]  #excluding the first column that contains sample names
unfData <- as.matrix(unfData)
First 10 rows and 5 columns of the processed GC-MS data
               Decalin  C1.Decalins  C2.Decalins  C3.Decalins  Naphthalene
petro258       1683524      4176960      2478180      3125211      2101455
petro259       1649766      4104724      2433514      3085597      2060556
petro260       1610183      4043132      2391167      3032684      2059942
petro230     234594070    215240060    149590418    206905029    106635695
petro231     300689337    276231110    191760364    264012867    135649094
petro232     348938684    320796132    222587040    306269193    155050777
petro266      61680447     48228029     33821356     48344642     22031443
petro267      69692453     54506887     38215996     54608205     24832960
petro268      77114253     60267441     42288380     60480097     27432439
petro203       1957888      4406090      2546545      3242664      2670092

Reference categorization of SRM samples

sampleIDs <- read.csv("SRM_categorization_info.csv", header = T)
SRM category information
Name Sample Labels_16_Class Labels_9_Class Labels_3_Class Index_16class Index_9class Index_3class
87 Octane Gasoline petro258 87 Octane Gasoline Gasoline LRP 1 1 1
87 Octane Gasoline petro259 87 Octane Gasoline Gasoline LRP 1 1 1
87 Octane Gasoline petro260 87 Octane Gasoline Gasoline LRP 1 1 1
SRM 2773 petro230 Biodiesel (Animal-based) Biodiesel LRP 2 2 1
SRM 2773 petro231 Biodiesel (Animal-based) Biodiesel LRP 2 2 1
SRM 2773 petro232 Biodiesel (Animal-based) Biodiesel LRP 2 2 1
SRM 2772 petro266 Biodiesel (Soy-based) Biodiesel LRP 3 2 1
SRM 2772 petro267 Biodiesel (Soy-based) Biodiesel LRP 3 2 1
SRM 2772 petro268 Biodiesel (Soy-based) Biodiesel LRP 3 2 1
SRM 2722 petro203 Crude Oil (Heavy-Sweet) Crude Oil Crude 4 3 2
SRM 2722 petro204 Crude Oil (Heavy-Sweet) Crude Oil Crude 4 3 2
SRM 2722 petro205 Crude Oil (Heavy-Sweet) Crude Oil Crude 4 3 2
SRM 2721 petro274 Crude Oil (Light-Sour) Crude Oil Crude 5 3 2
SRM 2721 petro275 Crude Oil (Light-Sour) Crude Oil Crude 5 3 2
SRM 2721 petro276 Crude Oil (Light-Sour) Crude Oil Crude 5 3 2
SRM 1615 petro207 Gas Oil Gas Oil HRP 6 4 3
SRM 1615 petro208 Gas Oil Gas Oil HRP 6 4 3
SRM 1615 petro209 Gas Oil Gas Oil HRP 6 4 3
SRM 2779 petro270 Gulf of Mexico Crude Oil Crude Oil Crude 7 3 2
SRM 2779 petro271 Gulf of Mexico Crude Oil Crude Oil Crude 7 3 2
SRM 2779 petro272 Gulf of Mexico Crude Oil Crude Oil Crude 7 3 2
JP8 petro246 Jet Fuel Jet Fuel LRP 8 5 1
JP8 petro247 Jet Fuel Jet Fuel LRP 8 5 1
JP8 petro248 Jet Fuel Jet Fuel LRP 8 5 1
JP5 petro250 Jet Fuel Jet Fuel LRP 8 5 1
JP5 petro251 Jet Fuel Jet Fuel LRP 8 5 1
JP5 petro252 Jet Fuel Jet Fuel LRP 8 5 1
Jet Fuel A petro254 Jet Fuel Jet Fuel LRP 8 5 1
Jet Fuel A petro255 Jet Fuel Jet Fuel LRP 8 5 1
Jet Fuel A petro256 Jet Fuel Jet Fuel LRP 8 5 1
SRM 2723b petro226 Low S Diesel Diesel LRP 9 6 1
SRM 2723b petro227 Low S Diesel Diesel LRP 9 6 1
SRM 2723b petro228 Low S Diesel Diesel LRP 9 6 1
SRM 1848 petro218 Motor Oil Additive Motor Oil HRP 10 7 3
SRM 1848 petro219 Motor Oil Additive Motor Oil HRP 10 7 3
SRM 1848 petro220 Motor Oil Additive Motor Oil HRP 10 7 3
SRM 2299 petro210 S in gasoline Gasoline LRP 11 1 1
SRM 2299 petro211 S in gasoline Gasoline LRP 11 1 1
SRM 2299 petro212 S in gasoline Gasoline LRP 11 1 1
SRM 1617b petro242 S in Kerosene (High Level) Kerosene LRP 12 8 1
SRM 1617b petro243 S in Kerosene (High Level) Kerosene LRP 12 8 1
SRM 1617b petro244 S in Kerosene (High Level) Kerosene LRP 12 8 1
SRM 1616b petro262 S in Kerosene (Low Level) Kerosene LRP 13 8 1
SRM 1616b petro263 S in Kerosene (Low Level) Kerosene LRP 13 8 1
SRM 1616b petro264 S in Kerosene (Low Level) Kerosene LRP 13 8 1
SRM 2770 petro234 S in Residual Fuel Oil RFO HRP 14 9 3
SRM 2770 petro235 S in Residual Fuel Oil RFO HRP 14 9 3
SRM 2770 petro236 S in Residual Fuel Oil RFO HRP 14 9 3
SRM 1623c petro238 S in Residual Fuel Oil RFO HRP 14 9 3
SRM 1623c petro239 S in Residual Fuel Oil RFO HRP 14 9 3
SRM 1623c petro240 S in Residual Fuel Oil RFO HRP 14 9 3
SRM 1620c petro278 S in Residual Fuel Oil RFO HRP 14 9 3
SRM 1620c petro279 S in Residual Fuel Oil RFO HRP 14 9 3
SRM 1620c petro280 S in Residual Fuel Oil RFO HRP 14 9 3
SRM 1624d petro214 Sulfur in Diesel Diesel LRP 15 6 1
SRM 1624d petro215 Sulfur in Diesel Diesel LRP 15 6 1
SRM 1624d petro216 Sulfur in Diesel Diesel LRP 15 6 1
SRM 2771 petro222 Zero S Diesel Diesel LRP 16 6 1
SRM 2771 petro223 Zero S Diesel Diesel LRP 16 6 1
SRM 2771 petro224 Zero S Diesel Diesel LRP 16 6 1

Missing data imputation with zeros

unfData[is.na(unfData)] <- 0

Data cleaning

Feature columns with zero standard deviation are removed from analysis.

unfData <- unfData[, which(sapply(1:dim(unfData)[2], function(x) sd(unfData[, x])) > 0)]

dataMat <- dummy_dataMat <- matrix(ncol = (dim(unfData)[2]), nrow = dim(unfData)[1])

Data scaling

When no dimensionality reduction is applied, row-wise min-max scaling is used (dataMat below). For the dimensionality reduction option, row-wise z-score normalization is applied and the result (dummy_dataMat) is passed to the singular value decomposition.

# For no dimensionality reduction option: Row-wise min-max scaling
for (i in 1:dim(unfData)[2]) {
    for (j in 1:dim(unfData)[1]) {
        minVal <- min(unfData[j, ])
        maxVal <- max(unfData[j, ])
        dataMat[j, i] <- (unfData[j, i] - minVal)/(maxVal - minVal)
    }
}

# For the dimensionality reduction option: Row-wise z-score normalization (input to the SVD below)
for (i in 1:dim(unfData)[2]) {
    for (j in 1:dim(unfData)[1]) {
        meanVal <- mean(unfData[j, ])
        sdVal <- sd(unfData[j, ])
        dummy_dataMat[j, i] <- (unfData[j, i] - meanVal)/sdVal
    }
}
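
The same row-wise scaling can also be written compactly with apply(); a minimal sketch equivalent to the two loops above:

# Row-wise min-max scaling (equivalent to the first loop)
dataMat <- t(apply(unfData, 1, function(r) (r - min(r))/(max(r) - min(r))))
# Row-wise z-score normalization (equivalent to the second loop)
dummy_dataMat <- t(apply(unfData, 1, function(r) (r - mean(r))/sd(r)))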

Singular Value Decomposition

Singular value decomposition, retaining components until the cumulative contribution of the singular values reaches the 85% threshold

Xmat <- svd(dummy_dataMat)
criteriaInd <- max(which(cumsum(Xmat$d/sum(Xmat$d) * 100) <= 85))
selU_mat <- Xmat$u[, 1:criteriaInd]
selD_mat <- matrix(0, ncol = dim(selU_mat)[2], nrow = dim(selU_mat)[2])
diag(selD_mat) <- as.matrix(sqrt(Xmat$d[1:criteriaInd]))
red_dataMat <- selU_mat %*% selD_mat
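
As an optional check (not part of the original workflow), the number of retained components and their cumulative contribution can be inspected directly:

round(cumsum(Xmat$d/sum(Xmat$d)) * 100, 1)  #cumulative percentage contribution of the singular values
criteriaInd                                 #number of components retained at the 85% threshold
dim(red_dataMat)                            #reduced data: samples x retained components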

Clustering analysis

refObj <- NULL
refObj <- sampleIDs[, paste("Index_", classNum, "class", sep = "")]
names(refObj) <- rownames(dataMat) <- rownames(red_dataMat) <- rownames(unfData)

dMetric_ref <- dist(refObj)  #Euclidean distance
hclustfun <- list()
hclustfun[[1]] = function(x) hclust(x, method = "ward.D2")
hclustfun[[2]] = function(x) hclust(x, method = "average")
hclustfun[[3]] = function(x) hclust(x, method = "complete")

refClust <- list()
count <- 0
for (i in 1:length(hclustfun)) {
    count <- count + 1
    refClust[[count]] <- hclustfun[[i]](dMetric_ref)
}

# Defining colors for the dendrogram
colorsMat <- c("goldenrod", "darkorange4", "darkolivegreen", "blue4", "firebrick4", "plum1", "grey50", "seagreen3", "palevioletred1", "turquoise1", "deeppink4", 
    "dodgerblue", "darkorchid3", "saddlebrown", "maroon2", "lawngreen")

# Defining the number of cuts for the F-M index calculation
treeInd <- classNum

# Calling functions for producing grouping structure based on the selected parameters
results <- get_clustResults(dataMat, treeInd, refClust, methodInd, colorsMat, classNum)
reduced_results <- get_clustResults(red_dataMat, treeInd, refClust, methodInd, colorsMat, classNum)
F-M Indices and p-values (No Dimensionality Reduction)
method                tree_Cut  FM_index  null_FM_index  standard_dev  p_value
Ward_KendallCor              3      0.53           0.39          0.02     0.00
Ward_PearsonCor              3      0.40           0.41          0.02     0.31
Ward_SpearmanCor             3      0.49           0.44          0.02     0.02
Avg_KendallCor               3      0.57           0.53          0.03     0.04
Avg_PearsonCor               3      0.51           0.50          0.03     0.26
Avg_SpearmanCor              3      0.57           0.53          0.03     0.04
Complete_KendallCor          3      0.64           0.45          0.02     0.00
Complete_PearsonCor          3      0.51           0.51          0.03     0.36
Complete_SpearmanCor         3      0.57           0.53          0.03     0.05
F-M Indices and p-values (Dimensionality Reduction)
method                tree_Cut  FM_index  null_FM_index  standard_dev  p_value
Ward_KendallCor              3      0.41           0.38          0.01     0.02
Ward_PearsonCor              3      0.50           0.39          0.01     0.00
Ward_SpearmanCor             3      0.41           0.38          0.01     0.01
Avg_KendallCor               3      0.56           0.44          0.02     0.00
Avg_PearsonCor               3      0.54           0.48          0.03     0.04
Avg_SpearmanCor              3      0.48           0.42          0.02     0.00
Complete_KendallCor          3      0.41           0.39          0.02     0.06
Complete_PearsonCor          3      0.60           0.44          0.02     0.00
Complete_SpearmanCor         3      0.41           0.39          0.02     0.07

Generating dendrograms

The dendrogram is generated for the selected clustering methodology (average linkage with Pearson correlation, no dimensionality reduction).

i <- methodInd  # passing the clustering methodology index for generation of dendrograms
hc <- results$hc
groups <- NULL
groups <- cutree(hc[[i]], k = treeInd)  #cut tree into n clusters, where n is treeInd

dd.row <- as.dendrogram(hc[[i]])
ddata_x <- dendro_data(dd.row, type = "rectangle")
groupMat <- data.frame(label = sampleIDs$Sample, groupInfo = sampleIDs[, paste("Labels_", classNum, "_Class", sep = "")])
labs <- merge(label(ddata_x), groupMat, by.y = "label")
labs_final <- labs[order(labs$x, decreasing = FALSE), ]
Class_Labels <- as.factor(labs_final$groupInfo)
ggplot(segment(ddata_x)) + geom_segment(aes(x = x, y = y, xend = xend, yend = yend), colour = "black") + geom_text(data = label(ddata_x), aes(label = label, 
    x = x, y = 0, colour = Class_Labels), size = 5, fontface = 2, nudge_y = 0.8) + theme(panel.background = element_rect(fill = "white"), panel.grid = element_blank(), 
    axis.text = element_blank(), axis.ticks.x = element_blank(), axis.ticks.y = element_blank()) + labs(x = "", y = "") + scale_color_manual(values = colorsMat) + 
    coord_flip() + scale_y_reverse(expand = c(1, 0))

The dendrogram is generated for the selected clustering methodology (average linkage with Pearson correlation, after dimensionality reduction).

i <- methodInd  # passing the clustering methodology index for generation of dendrograms
hc <- reduced_results$hc
groups <- NULL
groups <- cutree(hc[[i]], k = treeInd)  #cut tree into n clusters, where n is treeInd

dd.row <- as.dendrogram(hc[[i]])
ddata_x <- dendro_data(dd.row, type = "rectangle")
groupMat <- data.frame(label = sampleIDs$Sample, groupInfo = sampleIDs[, paste("Labels_", classNum, "_Class", sep = "")])
labs <- merge(label(ddata_x), groupMat, by.y = "label")
labs_final <- labs[order(labs$x, decreasing = FALSE), ]
Class_Labels <- as.factor(labs_final$groupInfo)
ggplot(segment(ddata_x)) + geom_segment(aes(x = x, y = y, xend = xend, yend = yend), colour = "black") + geom_text(data = label(ddata_x), aes(label = label, 
    x = x, y = 0, colour = Class_Labels), size = 5, fontface = 2, nudge_y = 0.8) + theme(panel.background = element_rect(fill = "white"), panel.grid = element_blank(), 
    axis.text = element_blank(), axis.ticks.x = element_blank(), axis.ticks.y = element_blank()) + labs(x = "", y = "") + scale_color_manual(values = colorsMat) + 
    coord_flip() + scale_y_reverse(expand = c(1, 0))
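
If the dendrograms need to be written to disk, the most recent plot can be saved with ggsave() from ggplot2. A minimal sketch (the file name and dimensions are arbitrary choices):

ggsave("dendrogram_Avg_PearsonCor_reduced.pdf", width = 8, height = 10)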

Supervised Analysis - Predicting Known Substances

Function for the Random Forest

This is used for building the Random Forest model and retrieving the important features.

"%notin%" <- Negate("%in%")

get_modelResults <- function(mergedTable, optimal_mtry) {
    featRank <- data.frame()
    rf_Model <- randomForest(formula = Status ~ ., data = mergedTable, mtry = optimal_mtry, importance = TRUE)
    featRank <- importance(rf_Model, type = 1, scale = TRUE)  #type = 1: mean decrease in accuracy
    rankedFeats <- rownames(featRank)[order(featRank, decreasing = TRUE)]
    finalResults <- list(model = rf_Model, rankedFeats = rankedFeats, featRank_details = featRank)
    return(finalResults)
}

Setting the seed for reproducibility

set.seed(42)

Read GC-MS data of SRM samples

unfData <- read.csv("Unfolded_GC-MS_data_of_SRM.csv", header = T)
rownames(unfData) <- unfData[, 1]  #assigning the row names as sample names
unfData <- unfData[, -1]  #excluding the first column that contains sample names
unfData <- as.matrix(unfData)

Reference categorization of SRM samples

sampleIDs <- read.csv("SRM_categorization_info.csv", header = T)

Missing data imputation with zeros

unfData[is.na(unfData)] <- 0

Input parameter options

  1. classNum: Number of classes
    Options: 3, 9, or 16 classes for SRM grouping

  2. option_rmDuplicate: Flag to decide whether to remove the replicate samples
    Options: “Y” or “y” for yes / “N” or “n” for no

# For this analysis, we select the following input arguments:
classNum <- 3
option_rmDuplicate <- "n"

Removing replicate samples

if (option_rmDuplicate %in% c("y", "Y")) {
    sampleIDs <- sampleIDs[which(!duplicated(sampleIDs$Name)), ]  #removing 2nd & 3rd duplicates
    # sampleIDs <- sampleIDs[seq(2, dim(sampleIDs)[1], 3), ]  #removing 1st & 3rd duplicates
    # sampleIDs <- sampleIDs[seq(3, dim(sampleIDs)[1], 3), ]  #removing 1st & 2nd duplicates
    unfData <- unfData[rownames(unfData) %in% sampleIDs$File, ]
} else if (option_rmDuplicate %notin% c("n", "N")) {
    print("Invalid input for removal of duplicates option!")
}

Sample labeling

In this part of the code, we attach the relevant labels (based on the input class number) to the data table.

if (classNum == 3) {
    selTable <- as.data.frame(cbind(Status = sampleIDs$Index_3class, file = sampleIDs$File))
}
if (classNum == 9) {
    selTable <- as.data.frame(cbind(Status = sampleIDs$Index_9class, file = sampleIDs$File))
}
if (classNum == 16) {
    selTable <- as.data.frame(cbind(Status = sampleIDs$Index_16class, file = sampleIDs$File))
}

Data scaling and cleaning

# Scales the data row-wise:
pre1_mergedTable <- matrix(ncol = dim(unfData)[2], nrow = dim(unfData)[1])
rownames(pre1_mergedTable) <- rownames(unfData)
colnames(pre1_mergedTable) <- colnames(unfData)
for (i in 1:dim(unfData)[2]) {
    for (j in 1:dim(unfData)[1]) {
        minVal <- min(as.matrix(unfData[j, ]))
        maxVal <- max(as.matrix(unfData[j, ]))
        pre1_mergedTable[j, i] <- (unfData[j, i] - minVal)/(maxVal - minVal)
    }
}

# Removes the columns with sd = 0 :
pre2_mergedTable <- pre1_mergedTable[, as.numeric(which(apply(pre1_mergedTable, 2, sd) > 0))]

# Scales the data column-wise:
pre3_mergedTable <- matrix(nrow = dim(pre2_mergedTable)[1], ncol = dim(pre2_mergedTable)[2])
rownames(pre3_mergedTable) <- rownames(pre2_mergedTable)
colnames(pre3_mergedTable) <- colnames(pre2_mergedTable)
for (j in 1:dim(pre2_mergedTable)[1]) {
    for (i in 1:dim(pre2_mergedTable)[2]) {
        minVal <- min(as.matrix(pre2_mergedTable[, i]))
        maxVal <- max(as.matrix(pre2_mergedTable[, i]))
        pre3_mergedTable[j, i] <- (pre2_mergedTable[j, i] - minVal)/(maxVal - minVal)
    }
}
mergedTable <- as.data.frame(cbind(pre3_mergedTable, Status = selTable$Status))

Formatting the data table

Column names are cleaned.

colnames(mergedTable) <- gsub("-", "", colnames(mergedTable))
colnames(mergedTable) <- gsub("/", "", colnames(mergedTable))
colnames(mergedTable) <- gsub("[(]", "", colnames(mergedTable))
colnames(mergedTable) <- gsub("[)]", "", colnames(mergedTable))
colnames(mergedTable) <- gsub("[,]", "", colnames(mergedTable))
mergedTable$Status <- as.factor(mergedTable$Status)
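
An alternative to the series of gsub() calls is base R's make.names(), which converts arbitrary strings into syntactically valid column names in one step; a minimal sketch (note that it replaces special characters with dots rather than removing them, so the resulting names differ from those produced above):

colnames(mergedTable) <- make.names(colnames(mergedTable), unique = TRUE)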

Tuning Random Forest model

Method: Random Forest Classification
The tuning step finds the optimal mtry parameter, i.e., the number of features randomly sampled as candidates at each split.

seeds <- vector(mode = "list", length = 61)
for (i in 1:60) seeds[[i]] <- sample.int(1000, 60)
seeds[[61]] <- sample.int(1000, 1)
control <- trainControl(method = "LOOCV", search = "grid", seeds = seeds)
tunegrid <- expand.grid(.mtry = c(1:(dim(mergedTable)[2] - 1)))
rf_random <- train(Status ~ ., data = mergedTable, method = "rf", metric = "Accuracy", tuneGrid = tunegrid, trControl = control)
optimal_mtry <- rf_random$bestTune$mtry
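
As an optional check (not shown in the original output), the tuning results can be inspected before model building:

print(rf_random)  #LOOCV accuracy for each candidate mtry value
plot(rf_random)   #accuracy as a function of mtry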


loopNum <- 1
confMat <- vector(mode = "list", length = 1)
modelResults <- get_modelResults(mergedTable, optimal_mtry)
confMat[[1]] <- modelResults$model$confusion
rankedFeats <- modelResults$rankedFeats
featRank_stat <- modelResults$featRank_details

save(confMat, file = paste("confMatrix_", classNum, "class.dat", sep = ""))  #raw format is saved for further use
dummy <- as.data.frame(confMat[[1]])
notPerm_conf <- as.matrix(dummy[, 1:as.numeric(classNum)])
notPerm_acc <- sum(diag(notPerm_conf))/(as.numeric(dim(unfData)[1])) * 100

Formatting and saving the confusion matrix

original_confMat <- t(confMat[[1]][1:as.numeric(classNum), 1:as.numeric(classNum)])  #transpose is taken to put 'true' labels as columns and 'predicted' as rows
if (as.numeric(classNum) == 3) {
    labelNames <- c("LRP", "Crude", "HRP")
}
if (as.numeric(classNum) == 9) {
    labelNames <- c("Gasoline", "Biodiesel", "Crude Oil", "Gas Oil", "Jet Fuel", "Diesel", "Motor Oil", "Kerosene", "RFO")
}
if (as.numeric(classNum) == 16) {
    labelNames <- c("87 Octane Gasoline", "Biodiesel (Animal-based)", "Biodiesel
(Soy-based)", "Crude Oil (Heavy-sweet)", "Crude Oil
(Light-sour)", "Gas Oil", 
        "Gulf of Mexico Crude Oil", "Jet Fuel", "Low S Diesel", "Motor Oil Additive", "S in Gasoline", "S in
Kerosene (High Level)", "S in Kerosene (Low Level)", 
        "S in
Residual Fuel Oil", "Sulfur in Diesel", "Zero S Diesel")
}
rownames(original_confMat) <- colnames(original_confMat) <- labelNames
write.csv(original_confMat, file = paste("confMatrix_", classNum, "class.csv", sep = ""))

Formatting and saving the feature ranking

featRank_stat <- as.data.frame(featRank_stat)
featRank_final <- as.matrix(featRank_stat[order(featRank_stat$MeanDecreaseAccuracy, decreasing = T), 1])
rownames(featRank_final) <- rownames(featRank_stat)[order(featRank_stat$MeanDecreaseAccuracy, decreasing = T)]
colnames(featRank_final) <- c("Mean Decrease in Accuracy")
write.csv(featRank_final, file = paste("featureRanking_", classNum, "_class.csv", sep = ""))

Results

Original Confusion Matrix:

##       LRP Crude HRP
## LRP    36     0   0
## Crude   0     9   0
## HRP     0     0  15
## [1] "Original Confusion Matrix Accuracy: 100"
Ranked Analytical Features
Mean Decrease in Accuracy
C2.Chrysenes.Benzo.a.anthracenes 7.520562
C1.Dibenzothiophenes 7.099796
Dibenzofuran 7.080316
C2.Dibenzothiophenes 6.910357
C3.Phenanthrene.anthracenes 6.901531
C4.Naphthalenes 6.820887
C2.Fluoranthene.pyrenes 6.696374
C4.Phenanthrene.anthracenes 6.574730
C2.Decalins 6.571430
Naphthobenzothiophene 6.540177
C4.Dibenzothiophenes 6.489756
C1.Benzothiophenes 6.489034
C3.Dibenzothiophenes 6.467766
Fluoranthene 6.455721
Acenaphthene 6.453360
C3.Fluorenes 6.311278
Acenaphthylene 6.302357
C3.Naphthalenes 6.300192
Benzo.b.fluoranthene 6.279048
C2.Phenanthrene.anthracenes 6.266151
Benzothiophene 6.248435
Pyrene 6.199187
C2.Fluorenes 6.142463
C2.Naphthobenzothiophenes 6.121350
Naphthalene 6.100423
C1.Decalins 6.092001
Benzo.k.fluoranthene 6.063806
C2.Naphthalenes 6.020790
Benzo.g.h.i.perylene 5.992569
C1.Fluoranthene.pyrenes 5.949770
Indeno.1.2.3.cd.pyrene 5.931836
Anthracene 5.906540
C1.Phenanthrene.anthracenes 5.872676
Decalin 5.861592
Biphenyl 5.848396
Benzo.e.pyrene 5.846148
Benz.a.anthracene 5.808037
Dibenzo.a.h.anthracene 5.797696
Phenanthrene 5.791218
C4.Chrysenes.Benzo.a.anthracenes 5.761235
Dibenzothiophene 5.760049
C1.Naphthalenes 5.593102
C1.Naphthobenzothiophenes 5.399784
Chrysene 5.277497
C3.Benzothiophenes 5.274166
C3.Decalins 5.270193
Benzo.a.pyrene 5.247689
Perylene 5.237808
Fluorene 5.150002
C1.Fluorenes 5.127238
C1.Chrysenes.Benzo.a.anthracenes 5.070513
C3.Naphthobenzothiophenes 4.903954
C3.Chrysenes.Benzo.a.anthracenes 4.861172
C3.Fluoranthene.pyrenes 4.809014
C2.Benzothiophenes 4.767038

Assessing the Statistical Significance of Results

We’ll now use the same workflow to generate the average confusion matrix from 1000 permutations of the sample labels.

# Defines a necessary function to be used within the code:
"%notin%" <- Negate("%in%")

# Define model generation function:
get_modelResults <- function(mergedTable, optimal_mtry) {
    featRank <- data.frame()
    rf_Model <- randomForest(formula = Status ~ ., data = mergedTable, mtry = optimal_mtry, importance = TRUE)
    featRank <- importance(rf_Model, type = 1, scale = TRUE)
    rankedFeats <- rownames(featRank)[order(featRank, decreasing = TRUE)]
    # rankedFeats <- colnames(mergedTable)[order(featRank, decreasing = TRUE)]  #same as the line above
    finalResults <- list(model = rf_Model, rankedFeats = rankedFeats, featRank_details = featRank)
    return(finalResults)
}

# Setting seed for reproducible results:
set.seed(42)

# Reading processed input GC-MS data of SRM samples
unfData <- read.csv("Unfolded_GC-MS_data_of_SRM.csv", header = T)
rownames(unfData) <- unfData[, 1]  #assigning the row names as sample names
unfData <- unfData[, -1]  #excluding the first column that contains sample names
unfData <- as.matrix(unfData)

# Reading reference category information of SRM samples
sampleIDs <- read.csv("SRM_categorization_info.csv", header = T)

# Missing data imputation with zeros:
unfData[is.na(unfData)] <- 0

# Input parameter options:
# 1. classNum: Number of classes / Options: 3, 9, or 16 classes for SRM grouping
# 2. option_rmDuplicate: flag to decide whether to remove the replicates / Options: 'Y' or 'y' for yes, 'N' or 'n' for no

# Input arguments:
classNum <- 3
option_rmDuplicate <- "n"

# To remove duplicates:
if (option_rmDuplicate %in% c("y", "Y")) {
    sampleIDs <- sampleIDs[which(!duplicated(sampleIDs$Name)), ]  #removing 2nd & 3rd duplicates
    # sampleIDs <- sampleIDs[seq(2, dim(sampleIDs)[1], 3), ]  #removing 1st & 3rd duplicates
    # sampleIDs <- sampleIDs[seq(3, dim(sampleIDs)[1], 3), ]  #removing 1st & 2nd duplicates
    unfData <- unfData[rownames(unfData) %in% sampleIDs$File, ]
} else if (option_rmDuplicate %notin% c("n", "N")) {
    print("Invalid input for removal of duplicates option!")
}

# Attaches the relevant labels (based on the input class number) to the data table:
if (classNum == 3) {
    selTable <- as.data.frame(cbind(Status = sampleIDs$Index_3class, file = sampleIDs$File))
}
if (classNum == 9) {
    selTable <- as.data.frame(cbind(Status = sampleIDs$Index_9class, file = sampleIDs$File))
}
if (classNum == 16) {
    selTable <- as.data.frame(cbind(Status = sampleIDs$Index_16class, file = sampleIDs$File))
}

# Scales the data row-wise:
pre1_mergedTable <- matrix(ncol = dim(unfData)[2], nrow = dim(unfData)[1])
rownames(pre1_mergedTable) <- rownames(unfData)
colnames(pre1_mergedTable) <- colnames(unfData)
for (i in 1:dim(unfData)[2]) {
    for (j in 1:dim(unfData)[1]) {
        minVal <- min(as.matrix(unfData[j, ]))
        maxVal <- max(as.matrix(unfData[j, ]))
        pre1_mergedTable[j, i] <- (unfData[j, i] - minVal)/(maxVal - minVal)
    }
}

# Removes the columns with sd = 0 :
pre2_mergedTable <- pre1_mergedTable[, as.numeric(which(apply(pre1_mergedTable, 2, sd) > 0))]
# Scales the data column-wise:
pre3_mergedTable <- matrix(nrow = dim(pre2_mergedTable)[1], ncol = dim(pre2_mergedTable)[2])
rownames(pre3_mergedTable) <- rownames(pre2_mergedTable)
colnames(pre3_mergedTable) <- colnames(pre2_mergedTable)
for (j in 1:dim(pre2_mergedTable)[1]) {
    for (i in 1:dim(pre2_mergedTable)[2]) {
        minVal <- min(as.matrix(pre2_mergedTable[, i]))
        maxVal <- max(as.matrix(pre2_mergedTable[, i]))
        pre3_mergedTable[j, i] <- (pre2_mergedTable[j, i] - minVal)/(maxVal - minVal)
    }
}
mergedTable <- as.data.frame(cbind(pre3_mergedTable, Status = selTable$Status))

# Formatting the data table (specifically - cleaning the column names):
colnames(mergedTable) <- gsub("-", "", colnames(mergedTable))
colnames(mergedTable) <- gsub("/", "", colnames(mergedTable))
colnames(mergedTable) <- gsub("[(]", "", colnames(mergedTable))
colnames(mergedTable) <- gsub("[)]", "", colnames(mergedTable))
colnames(mergedTable) <- gsub("[,]", "", colnames(mergedTable))
mergedTable$Status <- as.factor(mergedTable$Status)

# Tuning the Random Forest model to get the optimal mtry parameter (the number of features randomly sampled as candidates at each split)
# Method: Random Forest Classification
seeds <- vector(mode = "list", length = 61)
for (i in 1:60) seeds[[i]] <- sample.int(1000, 60)
seeds[[61]] <- sample.int(1000, 1)
control <- trainControl(method = "LOOCV", search = "grid", seeds = seeds)
tunegrid <- expand.grid(.mtry = c(1:(dim(mergedTable)[2] - 1)))
rf_random <- train(Status ~ ., data = mergedTable, method = "rf", metric = "Accuracy", tuneGrid = tunegrid, trControl = control)
optimal_mtry <- rf_random$bestTune$mtry

# Permutes the data labels by checking the permutation option input:
permNumber <- 1000
permuted_confMat <- vector(mode = "list", length = permNumber)
loopNum <- permNumber
for (l in 1:loopNum) {
    mergedTable$Status <- as.factor(sample(mergedTable$Status))
    modelResults <- get_modelResults(mergedTable, optimal_mtry)
    permuted_confMat[[l]] <- modelResults$model$confusion
}

save(permuted_confMat, file = paste("permuted_confMatrix_", classNum, "class.dat", sep = ""))

load(paste("confMatrix_", classNum, "class.dat", sep = ""))
load(paste("original_classification_accuracy_", classNum, "class.dat", sep = ""))
dummy <- as.data.frame(confMat[[1]])
notPerm_conf <- as.matrix(dummy[, 1:as.numeric(classNum)])
notPerm_acc <- sum(diag(notPerm_conf))/(as.numeric(dim(unfData)[1])) * 100

dsum <- matrix(ncol = as.numeric(classNum), nrow = as.numeric(classNum), 0)
a <- matrix(ncol = as.numeric(classNum), nrow = as.numeric(classNum))
acc <- NULL
for (i in 1:permNumber) {
    dummy <- as.data.frame(permuted_confMat[[i]])
    a <- as.matrix(dummy[, 1:as.numeric(classNum)])
    acc[i] <- sum(diag(a))/as.numeric(dim(unfData)[1]) * 100
    dsum <- as.table(dsum) + a
}
avgConf <- dsum/permNumber
histAcc <- hist(acc, plot = FALSE)
pVal <- sum(histAcc$counts[which(histAcc$breaks >= notPerm_acc)], na.rm = TRUE)/permNumber
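
The histogram-based p-value above approximates the upper tail of the permutation distribution through histogram bins; as an optional cross-check, a direct empirical estimate is simply the fraction of permuted accuracies at or above the observed accuracy:

pVal_direct <- sum(acc >= notPerm_acc)/permNumber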

if (as.numeric(classNum) == 3) {
    colnames(notPerm_conf) <- rownames(notPerm_conf) <- colnames(avgConf) <- rownames(avgConf) <- c("LRP", "Crude", "HRP")
}
if (as.numeric(classNum) == 9) {
    colnames(notPerm_conf) <- rownames(notPerm_conf) <- colnames(avgConf) <- rownames(avgConf) <- c("Gasoline", "Biodiesel", "Crude Oil", "Gas Oil", "Jet Fuel", 
        "Diesel", "Motor Oil", "Kerosene", "RFO")
}
if (as.numeric(classNum) == 16) {
    colnames(notPerm_conf) <- rownames(notPerm_conf) <- colnames(avgConf) <- rownames(avgConf) <- c("87 Octane Gasoline", "Biodiesel (Animal-based)", "Biodiesel (Soy-based)", 
        "Crude Oil (Heavy-sweet)", "Crude Oil (Light-sour)", "Gas Oil", "Gulf of Mexico Crude Oil", "Jet Fuel", "Low S Diesel", "Motor Oil Additive", "S in Gasoline", 
        "S in Kerosene (High Level)", "S in Kerosene (Low Level)", "S in Residual Fuel Oil", "Sulfur in Diesel", "Zero S Diesel")
}

# Formatting and saving the confusion matrix:
average_confMat <- t(avgConf[1:classNum, 1:classNum])
if (as.numeric(classNum) == 3) {
    labelNames <- c("LRP", "Crude", "HRP")
}
if (as.numeric(classNum) == 9) {
    labelNames <- c("Gasoline", "Biodiesel", "Crude Oil", "Gas Oil", "Jet Fuel", "Diesel", "Motor Oil", "Kerosene", "RFO")
}
if (as.numeric(classNum) == 16) {
    labelNames <- c("87 Octane Gasoline", "Biodiesel (Animal-based)", "Biodiesel
(Soy-based)", "Crude Oil (Heavy-sweet)", "Crude Oil
(Light-sour)", "Gas Oil", 
        "Gulf of Mexico Crude Oil", "Jet Fuel", "Low S Diesel", "Motor Oil Additive", "S in Gasoline", "S in
Kerosene (High Level)", "S in Kerosene (Low Level)", 
        "S in
Residual Fuel Oil", "Sulfur in Diesel", "Zero S Diesel")
}
rownames(average_confMat) <- colnames(average_confMat) <- labelNames
write.csv(average_confMat, file = paste("permutation_confMatrix_", classNum, "class.csv", sep = ""))

Average Confusion Matrix from 1000 permutations:

print(average_confMat)
##          LRP  Crude    HRP
## LRP   22.391  5.773  9.574
## Crude  5.051  1.116  2.078
## HRP    8.558  2.111  3.348
print(paste("Mean Accuracy:", round(mean(acc), 2), sep = " "))
## [1] "Mean Accuracy: 44.76"
print(paste("Std Accuracy:", round(sd(acc), 2), sep = " "))
## [1] "Std Accuracy: 6.98"
print(paste("P-value:", round(pVal, 3), sep = " "))
## [1] "P-value: 0"

Supervised Analysis - Predicting Unknown Substances

Function for the Random Forest

This is used for building the Random Forest model on the training samples and predicting the classes of the held-out test samples; it returns the resulting confusion table.

"%notin%" <- Negate("%in%")
# Define model generation function:
get_modelResults <- function(testTable, trainTable, optimal_mtry) {
    rf_Model <- randomForest(formula = Status ~ ., data = trainTable, mtry = optimal_mtry, importance = TRUE)
    predictedVal <- predict(rf_Model, testTable)
    confMat <- table(testTable$Status, predictedVal)
    return(confMat)
}

Setting the seed for reproducibility

set.seed(42)

Read GC-MS data of SRM samples

unfData <- read.csv("Unfolded_GC-MS_data_of_SRM.csv", header = T)
rownames(unfData) <- unfData[, 1]  #assigning the row names as sample names
unfData <- unfData[, -1]  #excluding the first column that contains sample names
unfData <- as.matrix(unfData)

Reference categorization of SRM samples

sampleIDs <- read.csv("SRM_categorization_info.csv", header = T)

Missing data imputation with zeros

unfData[is.na(unfData)] <- 0

Input parameter options

  1. classNum: Number of classes
    Options: 3, 9, or 16 classes for SRM grouping

  2. option_permutation: Flag to decide whether to permute the sample labels
    Options: “Y” or “y” for yes / “N” or “n” for no

# For this analysis, we select the following input arguments:
classNum <- 3
option_permutation <- "n"

Sample labeling

In this part of the code, we attach the relevant labels (based on the input class number) to the data table.

if (classNum == 3) {
    selTable <- as.data.frame(cbind(Status = sampleIDs$Index_3class, file = sampleIDs$File, material = sampleIDs$Name))
}
if (classNum == 9) {
    selTable <- as.data.frame(cbind(Status = sampleIDs$Index_9class, file = sampleIDs$File, material = sampleIDs$Name))
}
if (classNum == 16) {
    selTable <- as.data.frame(cbind(Status = sampleIDs$Index_16class, file = sampleIDs$File, material = sampleIDs$Name))
}

Data scaling and cleaning

# Scales the data row-wise:
pre1_mergedTable <- matrix(ncol = dim(unfData)[2], nrow = dim(unfData)[1])
rownames(pre1_mergedTable) <- rownames(unfData)
colnames(pre1_mergedTable) <- colnames(unfData)
for (i in 1:dim(unfData)[2]) {
    for (j in 1:dim(unfData)[1]) {
        minVal <- min(as.matrix(unfData[j, ]))
        maxVal <- max(as.matrix(unfData[j, ]))
        pre1_mergedTable[j, i] <- (unfData[j, i] - minVal)/(maxVal - minVal)
    }
}

# Removes the columns with sd = 0 :
pre2_mergedTable <- pre1_mergedTable[, as.numeric(which(apply(pre1_mergedTable, 2, sd) > 0))]

# Scales the data column-wise:
pre3_mergedTable <- matrix(nrow = dim(pre2_mergedTable)[1], ncol = dim(pre2_mergedTable)[2])
rownames(pre3_mergedTable) <- rownames(pre2_mergedTable)
colnames(pre3_mergedTable) <- colnames(pre2_mergedTable)
for (j in 1:dim(pre2_mergedTable)[1]) {
    for (i in 1:dim(pre2_mergedTable)[2]) {
        minVal <- min(as.matrix(pre2_mergedTable[, i]))
        maxVal <- max(as.matrix(pre2_mergedTable[, i]))
        pre3_mergedTable[j, i] <- (pre2_mergedTable[j, i] - minVal)/(maxVal - minVal)
    }
}
mergedTable <- as.data.frame(cbind(pre3_mergedTable, Status = selTable$Status, material = selTable$material))
colNames <- c(colnames(unfData), "Status", "material")
colnames(mergedTable) <- colNames

Formatting the data table

Column names are cleaned.

colnames(mergedTable) <- gsub("-", "", colnames(mergedTable))
colnames(mergedTable) <- gsub("/", "", colnames(mergedTable))
colnames(mergedTable) <- gsub("[(]", "", colnames(mergedTable))
colnames(mergedTable) <- gsub("[)]", "", colnames(mergedTable))
colnames(mergedTable) <- gsub("[,]", "", colnames(mergedTable))
mergedTable$Status <- as.factor(mergedTable$Status)

Training Random Forest Classifier

A Random Forest classifier is trained separately for each SRM substance: all replicates of that substance are held out as the test set and the model is trained on the samples of the remaining substances (leave-one-substance-out validation).

confMat_all <- vector(mode = "list", length = length(unique(mergedTable$material)))

for (m in 1:length(unique(mergedTable$material))) {
    # length(unique(mergedTable$material)) = 20 SRM substances
    trainTable <- mergedTable[mergedTable$material != unique(mergedTable$material)[m], 1:(dim(mergedTable)[2] - 1)]
    testTable <- mergedTable[mergedTable$material == unique(mergedTable$material)[m], 1:(dim(mergedTable)[2] - 1)]
    seeds <- vector(mode = "list", length = 61)
    for (i in 1:60) seeds[[i]] <- sample.int(1000, 60)
    seeds[[61]] <- sample.int(1000, 1)
    control <- trainControl(method = "LOOCV", search = "grid", seeds = seeds)
    tunegrid <- expand.grid(.mtry = c(1:(dim(trainTable)[2] - 1)))
    rf_random <- train(Status ~ ., data = trainTable, method = "rf", metric = "Accuracy", tuneGrid = tunegrid, trControl = control)
    optimal_mtry <- rf_random$bestTune$mtry
    
    # Permutes the data labels if option_permutation is 'y' or 'Y':
    if (option_permutation %in% c("n", "N")) {
        loopNum <- 1
        confMat <- vector(mode = "list", length = 1)
        # getting test results for 3 replicates of each excluded SRM substance
        modelResults <- get_modelResults(testTable, trainTable, optimal_mtry)
        confMat[[1]] <- modelResults
    } else if (option_permutation %in% c("y", "Y")) {
        confMat <- vector(mode = "list", length = permNumber)
        loopNum <- permNumber
        for (l in 1:loopNum) {
            testTable$Status <- as.factor(sample(testTable$Status))
            modelResults <- get_modelResults(testTable, trainTable, optimal_mtry)
            confMat[[l]] <- modelResults
        }
    } else if (option_permutation %notin% c("y", "Y", "n", "N")) {
        print("Invalid input for label permuting option!")
    }
    confMat_all[[m]] <- confMat
}

save(confMat_all, file = paste("original_confMatrix_", classNum, "class.dat", sep = ""))

Assessing the Statistical Significance of Results

We’ll now use the same workflow to generate the average confusion matrix from 1000 permutations of the test-set labels.

# Define model generation function and predicting for unknown substance:
get_modelResults <- function(testTable, trainTable, optimal_mtry) {
    rf_Model <- randomForest(formula = Status ~ ., data = trainTable, mtry = optimal_mtry, importance = TRUE)
    predictedVal <- predict(rf_Model, testTable)
    confMat <- table(testTable$Status, predictedVal)
    return(confMat)
}

# Setting seed for reproducible results:
set.seed(42)

# Reading processed input GC-MS data of SRM samples
unfData <- read.csv("Unfolded_GC-MS_data_of_SRM.csv", header = T)
rownames(unfData) <- unfData[, 1]  #assigning the row names as sample names
unfData <- unfData[, -1]  #excluding the first column that contains sample names
unfData <- as.matrix(unfData)

# Reading reference category information of SRM samples
sampleIDs <- read.csv("SRM_categorization_info.csv", header = T)

# Missing data imputation with zeros:
unfData[is.na(unfData)] <- 0

# Input parameter options:
# 1. classNum: Number of classes / Options: 3, 9, or 16 classes for SRM grouping
# 2. option_permutation: flag to decide whether to permute the sample labels / Options: 'Y' or 'y' for yes, 'N' or 'n' for no

# Input arguments:
classNum <- 3
option_permutation <- "y"  #options: 'Y' or 'y' for yes / 'N' or 'n' for no
permNumber <- 1000

# Attaches the relevant labels (based on the input class number) to the data table:
if (classNum == 3) {
    selTable <- as.data.frame(cbind(Status = sampleIDs$Index_3class, file = sampleIDs$File, material = sampleIDs$Name))
}
if (classNum == 9) {
    selTable <- as.data.frame(cbind(Status = sampleIDs$Index_9class, file = sampleIDs$File, material = sampleIDs$Name))
}
if (classNum == 16) {
    selTable <- as.data.frame(cbind(Status = sampleIDs$Index_16class, file = sampleIDs$File, material = sampleIDs$Name))
}

# Scales the data row-wise:
pre1_mergedTable <- matrix(ncol = dim(unfData)[2], nrow = dim(unfData)[1])
rownames(pre1_mergedTable) <- rownames(unfData)
colnames(pre1_mergedTable) <- colnames(unfData)
for (i in 1:dim(unfData)[2]) {
    for (j in 1:dim(unfData)[1]) {
        minVal <- min(as.matrix(unfData[j, ]))
        maxVal <- max(as.matrix(unfData[j, ]))
        pre1_mergedTable[j, i] <- (unfData[j, i] - minVal)/(maxVal - minVal)
    }
}

# Removes the columns with sd = 0 :
pre2_mergedTable <- pre1_mergedTable[, as.numeric(which(apply(pre1_mergedTable, 2, sd) > 0))]
# Scales the data column-wise:
pre3_mergedTable <- matrix(nrow = dim(pre2_mergedTable)[1], ncol = dim(pre2_mergedTable)[2])
rownames(pre3_mergedTable) <- rownames(pre2_mergedTable)
colnames(pre3_mergedTable) <- colnames(pre2_mergedTable)
for (j in 1:dim(pre2_mergedTable)[1]) {
    for (i in 1:dim(pre2_mergedTable)[2]) {
        minVal <- min(as.matrix(pre2_mergedTable[, i]))
        maxVal <- max(as.matrix(pre2_mergedTable[, i]))
        pre3_mergedTable[j, i] <- (pre2_mergedTable[j, i] - minVal)/(maxVal - minVal)
    }
}
mergedTable <- as.data.frame(cbind(pre3_mergedTable, Status = selTable$Status, material = selTable$material))
colNames <- c(colnames(unfData), "Status", "material")
colnames(mergedTable) <- colNames

# Formatting the data table (specifically - cleaning the column names):
colnames(mergedTable) <- gsub("-", "", colnames(mergedTable))
colnames(mergedTable) <- gsub("/", "", colnames(mergedTable))
colnames(mergedTable) <- gsub("[(]", "", colnames(mergedTable))
colnames(mergedTable) <- gsub("[)]", "", colnames(mergedTable))
colnames(mergedTable) <- gsub("[,]", "", colnames(mergedTable))
mergedTable$Status <- as.factor(mergedTable$Status)

# Training Random Forest classifier for each SRM substance separately
confMat_all <- vector(mode = "list", length = length(unique(mergedTable$material)))

for (m in 1:length(unique(mergedTable$material))) {
    # length(unique(mergedTable$material)) = 20 SRM substances
    trainTable <- mergedTable[mergedTable$material != unique(mergedTable$material)[m], 1:(dim(mergedTable)[2] - 1)]
    testTable <- mergedTable[mergedTable$material == unique(mergedTable$material)[m], 1:(dim(mergedTable)[2] - 1)]
    seeds <- vector(mode = "list", length = 61)
    for (i in 1:60) seeds[[i]] <- sample.int(1000, 60)
    seeds[[61]] <- sample.int(1000, 1)
    control <- trainControl(method = "LOOCV", search = "grid", seeds = seeds)
    tunegrid <- expand.grid(.mtry = c(1:(dim(trainTable)[2] - 1)))
    rf_random <- train(Status ~ ., data = trainTable, method = "rf", metric = "Accuracy", tuneGrid = tunegrid, trControl = control)
    optimal_mtry <- rf_random$bestTune$mtry
    
    # Permutes the data labels if option_permutation is 'y' or 'Y':
    if (option_permutation %in% c("n", "N")) {
        loopNum <- 1
        confMat <- vector(mode = "list", length = 1)
        # getting test results for 3 replicates of each excluded SRM substance
        modelResults <- get_modelResults(testTable, trainTable, optimal_mtry)
        confMat[[1]] <- modelResults
    } else if (option_permutation %in% c("y", "Y")) {
        confMat <- vector(mode = "list", length = permNumber)
        loopNum <- permNumber
        for (l in 1:loopNum) {
            testTable$Status <- as.factor(sample(testTable$Status))
            modelResults <- get_modelResults(testTable, trainTable, optimal_mtry)
            confMat[[l]] <- modelResults
        }
    } else if (option_permutation %notin% c("y", "Y", "n", "N")) {
        print("Invalid input for label permuting option!")
    }
    confMat_all[[m]] <- confMat
}

save(confMat_all, file = paste("permuted_confMatrix_", classNum, "class.dat", sep = ""))

Calculate classification accuracy and p-value

# calculation of accuracy & p-values:
totalSample_size <- 60  #3 replicates of 20 SRM substances

# loading the original confusion matrix results:
load(paste("original_confMatrix_", classNum, "class.dat", sep = ""))
notPerm_conf <- confMat_all[[1]][[1]]  #assigning the confusion matrix of the first SRM substance sample predictions

for (i in 2:20) {
    # appending the confusion matrices of the other SRM substance sample predictions
    notPerm_conf <- notPerm_conf + confMat_all[[i]][[1]]
}
# calculation of classification accuracy from the overall confusion matrix
notPerm_acc <- sum(diag(notPerm_conf))/(as.numeric(totalSample_size)) * 100

# loading the permuted confusion matrix results:
load(paste("permuted_confMatrix_", classNum, "class.dat", sep = ""))
perm_acc <- NULL
each_perm_confMat <- list()

formatted_confMat <- list()
pre_formatted_confMat <- list()

# re-formatting the confusion matrix arrangement:
for (i in 1:1000) {
    for (j in 1:20) {
        dummy <- confMat_all[[j]][[i]]
        pre_formatted_confMat[[j]] <- dummy
    }
    formatted_confMat[[i]] <- pre_formatted_confMat
}
# obtaining the overall confusion matrix at each permutation
total_perm_confMat <- matrix(0, nrow = classNum, ncol = classNum)
for (i in 1:1000) {
    each_perm_confMat[[i]] <- Reduce("+", formatted_confMat[[i]])
    perm_acc[i] <- sum(diag(each_perm_confMat[[i]]))/(as.numeric(totalSample_size)) * 100
    total_perm_confMat <- total_perm_confMat + as.matrix(each_perm_confMat[[i]])
}
# calculation of the p-value:
histAcc <- hist(perm_acc, plot = FALSE)
pVal <- sum(histAcc$counts[which(histAcc$breaks >= notPerm_acc)], na.rm = TRUE)/1000

if (as.numeric(classNum) == 3) {
    colnames(notPerm_conf) <- rownames(notPerm_conf) <- colnames(total_perm_confMat) <- rownames(total_perm_confMat) <- c("LRP", "Crude", "HRP")
}

print("Original Confusion Matrix:")
## [1] "Original Confusion Matrix:"
print(notPerm_conf)
##        predictedVal
##         LRP Crude HRP
##   LRP    36     0   0
##   Crude   3     0   6
##   HRP     5     1   9
print(paste("Original Confusion Matrix Accuracy:", notPerm_acc, sep = " "))
## [1] "Original Confusion Matrix Accuracy: 75"
print("Average Confusion Matrix from 1000 permutations:")
## [1] "Average Confusion Matrix from 1000 permutations:"
print(total_perm_confMat/1000)
##        predictedVal
##            LRP  Crude    HRP
##   LRP   35.980  0.012  0.008
##   Crude  3.000  0.000  6.000
##   HRP    5.604  2.311  7.085
print(paste("Mean Accuracy:", mean(perm_acc), sep = " "))
## [1] "Mean Accuracy: 71.775"
print(paste("Std Accuracy:", sd(perm_acc), sep = " "))
## [1] "Std Accuracy: 3.57046696462193"
print(paste("P-value:", pVal, sep = " "))
## [1] "P-value: 0.092"

Session Information

sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.6
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## Random number generation:
##  RNG:     Mersenne-Twister 
##  Normal:  Inversion 
##  Sample:  Rounding 
##  
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] formatR_1.7         kableExtra_1.1.0    RColorBrewer_1.1-2  caret_6.0-84        lattice_0.20-38     randomForest_4.6-14 ggplot2_3.2.1       ggdendro_0.1-20     dendextend_1.12.0   knitr_1.25         
## 
## loaded via a namespace (and not attached):
##  [1] viridis_0.5.1      httr_1.4.1         viridisLite_0.3.0  splines_3.6.1      foreach_1.4.7      prodlim_2018.04.18 assertthat_0.2.1   highr_0.8          stats4_3.6.1       yaml_2.2.0         ipred_0.9-9       
## [12] pillar_1.4.2       backports_1.1.4    glue_1.3.1         digest_0.6.20      rvest_0.3.4        colorspace_1.4-1   recipes_0.1.7      htmltools_0.3.6    Matrix_1.2-17      plyr_1.8.4         timeDate_3043.102 
## [23] pkgconfig_2.0.2    purrr_0.3.2        scales_1.0.0       webshot_0.5.1      gower_0.2.1        lava_1.6.6         tibble_2.1.3       generics_0.0.2     withr_2.1.2        nnet_7.3-12        lazyeval_0.2.2    
## [34] survival_2.44-1.1  magrittr_1.5       crayon_1.3.4       evaluate_0.14      nlme_3.1-140       MASS_7.3-51.4      xml2_1.2.2         class_7.3-15       tools_3.6.1        data.table_1.12.2  hms_0.5.1         
## [45] stringr_1.4.0      profdpm_3.3        munsell_0.5.0      compiler_3.6.1     e1071_1.7-2        rlang_0.4.0        grid_3.6.1         iterators_1.0.12   rstudioapi_0.10    labeling_0.3       rmarkdown_1.15    
## [56] gtable_0.3.0       ModelMetrics_1.2.2 codetools_0.2-16   reshape2_1.4.3     R6_2.4.0           gridExtra_2.3      lubridate_1.7.4    dplyr_0.8.3        zeallot_0.1.0      readr_1.3.1        stringi_1.4.3     
## [67] Rcpp_1.0.2         vctrs_0.2.0        rpart_4.1-15       tidyselect_0.2.5   xfun_0.9