Packages required for analysis: “dendextend”, “ggdendro”, “ggplot2”, “randomForest”, “caret”, and “RColorBrewer”. Their dependencies may also be required.
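A minimal loading block is shown below (assuming the packages are already installed):
library(dendextend)
library(ggdendro)
library(ggplot2)
library(randomForest)
library(caret)
library(RColorBrewer)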
get_clustResults <- function(dataMat, treeInd, refClust, methodInd, colorsMat, classNum) {
# Calculation of row-wise correlation from the input data matrix:
corMat_pearson <- matrix(ncol = dim(dataMat)[1], nrow = dim(dataMat)[1])
corMat_kendall <- matrix(ncol = dim(dataMat)[1], nrow = dim(dataMat)[1])
corMat_spearman <- matrix(ncol = dim(dataMat)[1], nrow = dim(dataMat)[1])
for (i in 1:dim(dataMat)[1]) {
for (j in 1:dim(dataMat)[1]) {
corMat_pearson[i, j] <- cor(as.vector(dataMat[i, ]), as.vector(dataMat[j, ]), method = "pearson")
corMat_kendall[i, j] <- cor(as.vector(dataMat[i, ]), as.vector(dataMat[j, ]), method = "kendall")
corMat_spearman[i, j] <- cor(as.vector(dataMat[i, ]), as.vector(dataMat[j, ]), method = "spearman")
}
}
# Assigning the obtained correlation matrices as input similarity matrices for clustering:
sMetric <- list()
sMetric[[1]] <- corMat_kendall
sMetric[[2]] <- corMat_pearson
sMetric[[3]] <- corMat_spearman
for (i in 1:3) {
colnames(sMetric[[i]]) <- rownames(sMetric[[i]]) <- rownames(dataMat)
}
hclustfun <- list()
hclustfun[[1]] = function(x) hclust(x, method = "ward.D2")
hclustfun[[2]] = function(x) hclust(x, method = "average")
hclustfun[[3]] = function(x) hclust(x, method = "complete")
hc <- list()
count <- 0
for (i in 1:length(hclustfun)) {
for (j in 1:length(sMetric)) {
count <- count + 1
hc[[count]] <- hclustfun[[i]](dist(sMetric[[j]]))
}
}
methods <- c("Ward_KendallCor", "Ward_PearsonCor", "Ward_SpearmanCor", "Avg_KendallCor", "Avg_PearsonCor", "Avg_SpearmanCor", "Complete_KendallCor",
"Complete_PearsonCor", "Complete_SpearmanCor")
fmRefIndex <- matrix(nrow = length(methods), ncol = 1)
pre_nullFM_Index <- NULL
pVal <- NULL
nullFM_Index <- matrix(nrow = length(methods), ncol = 1)
sdFM_Index <- matrix(nrow = length(methods), ncol = 1)
permNumber <- 1000 #number of permutations
for (i in 1:length(methods)) {
# comparison is done to refClust[[2]] which is the average linkage on reference data
fmRefIndex[i] <- FM_index_R(cutree(hc[[i]], k = treeInd), cutree(refClust[[2]], k = treeInd))
pre_nullFM_Index <- Bk_permutations(hc[[i]], refClust[[2]], k = treeInd, R = permNumber)
nullFM_Index[i] <- mean(pre_nullFM_Index[[1]])
sdFM_Index[i] <- sd(pre_nullFM_Index[[1]])
histAcc <- hist(pre_nullFM_Index[[1]], plot = FALSE)
pVal[i] <- sum(histAcc$counts[which(histAcc$breaks >= fmRefIndex[i])], na.rm = TRUE)/permNumber
}
all.par <- NULL
all.par <- rbind(all.par, data.frame(method = methods, tree_Cut = treeInd, FM_index = round(fmRefIndex, 2), null_FM_index = round(nullFM_Index, 2),
standard_dev = round(sdFM_Index, 2), p_value = round(pVal, 2)))
results <- list(all.par = all.par, hc = hc)
return(results)
}
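FM_index_R and Bk_permutations above come from dendextend. For reference, the Fowlkes-Mallows index compares two clusterings through pairs of samples placed in the same cluster; a minimal sketch with two hypothetical labelings, computing the same quantity FM_index_R reports:
c1 <- c(1, 1, 2, 2, 3, 3) #hypothetical clustering A
c2 <- c(1, 1, 2, 3, 3, 3) #hypothetical clustering B
ct <- table(c1, c2) #contingency table of the two labelings
n <- length(c1)
Tk <- sum(ct^2) - n #sample pairs co-clustered in both labelings
Pk <- sum(rowSums(ct)^2) - n #sample pairs co-clustered in A
Qk <- sum(colSums(ct)^2) - n #sample pairs co-clustered in B
Tk/sqrt(Pk * Qk) #Fowlkes-Mallows index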
The default kind of the random number generator was changed in R 3.6.0. Setting the sample kind to "Rounding" keeps the results consistent with earlier versions of R.
RNGkind(sample.kind = "Rounding")
## Warning in RNGkind(sample.kind = "Rounding"): non-uniform 'Rounding' sampler used
set.seed(42)
classNum: Number of classes / Options: 3, 9, or 16 classes for SRM grouping
methodInd: Index for the hierarchical clustering methodology / Options:
1 - Kendall correlation with Ward's method
2 - Pearson correlation with Ward's method
3 - Spearman correlation with Ward's method
4 - Kendall correlation with average linkage method
5 - Pearson correlation with average linkage method
6 - Spearman correlation with average linkage method
7 - Kendall correlation with complete linkage method
8 - Pearson correlation with complete linkage method
9 - Spearman correlation with complete linkage method
# For this analysis, we select the following input arguments:
classNum = 3
methodInd = 5
# Reading processed input GC-MS data of SRM samples
unfData <- read.csv("Unfolded_GC-MS_data_of_SRM.csv", header = T)
rownames(unfData) <- unfData[, 1] #assigning the row names as sample names
unfData <- unfData[, -1] #excluding the first column that contains sample names
unfData <- as.matrix(unfData)
Sample | Decalin | C1.Decalins | C2.Decalins | C3.Decalins | Naphthalene |
---|---|---|---|---|---|
petro258 | 1683524 | 4176960 | 2478180 | 3125211 | 2101455 |
petro259 | 1649766 | 4104724 | 2433514 | 3085597 | 2060556 |
petro260 | 1610183 | 4043132 | 2391167 | 3032684 | 2059942 |
petro230 | 234594070 | 215240060 | 149590418 | 206905029 | 106635695 |
petro231 | 300689337 | 276231110 | 191760364 | 264012867 | 135649094 |
petro232 | 348938684 | 320796132 | 222587040 | 306269193 | 155050777 |
petro266 | 61680447 | 48228029 | 33821356 | 48344642 | 22031443 |
petro267 | 69692453 | 54506887 | 38215996 | 54608205 | 24832960 |
petro268 | 77114253 | 60267441 | 42288380 | 60480097 | 27432439 |
petro203 | 1957888 | 4406090 | 2546545 | 3242664 | 2670092 |
sampleIDs <- read.csv("SRM_categorization_info.csv", header = T)
Name | Sample | Labels_16_Class | Labels_9_Class | Labels_3_Class | Index_16class | Index_9class | Index_3class |
---|---|---|---|---|---|---|---|
87 Octane Gasoline | petro258 | 87 Octane Gasoline | Gasoline | LRP | 1 | 1 | 1 |
87 Octane Gasoline | petro259 | 87 Octane Gasoline | Gasoline | LRP | 1 | 1 | 1 |
87 Octane Gasoline | petro260 | 87 Octane Gasoline | Gasoline | LRP | 1 | 1 | 1 |
SRM 2773 | petro230 | Biodiesel (Animal-based) | Biodiesel | LRP | 2 | 2 | 1 |
SRM 2773 | petro231 | Biodiesel (Animal-based) | Biodiesel | LRP | 2 | 2 | 1 |
SRM 2773 | petro232 | Biodiesel (Animal-based) | Biodiesel | LRP | 2 | 2 | 1 |
SRM 2772 | petro266 | Biodiesel (Soy-based) | Biodiesel | LRP | 3 | 2 | 1 |
SRM 2772 | petro267 | Biodiesel (Soy-based) | Biodiesel | LRP | 3 | 2 | 1 |
SRM 2772 | petro268 | Biodiesel (Soy-based) | Biodiesel | LRP | 3 | 2 | 1 |
SRM 2722 | petro203 | Crude Oil (Heavy-Sweet) | Crude Oil | Crude | 4 | 3 | 2 |
SRM 2722 | petro204 | Crude Oil (Heavy-Sweet) | Crude Oil | Crude | 4 | 3 | 2 |
SRM 2722 | petro205 | Crude Oil (Heavy-Sweet) | Crude Oil | Crude | 4 | 3 | 2 |
SRM 2721 | petro274 | Crude Oil (Light-Sour) | Crude Oil | Crude | 5 | 3 | 2 |
SRM 2721 | petro275 | Crude Oil (Light-Sour) | Crude Oil | Crude | 5 | 3 | 2 |
SRM 2721 | petro276 | Crude Oil (Light-Sour) | Crude Oil | Crude | 5 | 3 | 2 |
SRM 1615 | petro207 | Gas Oil | Gas Oil | HRP | 6 | 4 | 3 |
SRM 1615 | petro208 | Gas Oil | Gas Oil | HRP | 6 | 4 | 3 |
SRM 1615 | petro209 | Gas Oil | Gas Oil | HRP | 6 | 4 | 3 |
SRM 2779 | petro270 | Gulf of Mexico Crude Oil | Crude Oil | Crude | 7 | 3 | 2 |
SRM 2779 | petro271 | Gulf of Mexico Crude Oil | Crude Oil | Crude | 7 | 3 | 2 |
SRM 2779 | petro272 | Gulf of Mexico Crude Oil | Crude Oil | Crude | 7 | 3 | 2 |
JP8 | petro246 | Jet Fuel | Jet Fuel | LRP | 8 | 5 | 1 |
JP8 | petro247 | Jet Fuel | Jet Fuel | LRP | 8 | 5 | 1 |
JP8 | petro248 | Jet Fuel | Jet Fuel | LRP | 8 | 5 | 1 |
JP5 | petro250 | Jet Fuel | Jet Fuel | LRP | 8 | 5 | 1 |
JP5 | petro251 | Jet Fuel | Jet Fuel | LRP | 8 | 5 | 1 |
JP5 | petro252 | Jet Fuel | Jet Fuel | LRP | 8 | 5 | 1 |
Jet Fuel A | petro254 | Jet Fuel | Jet Fuel | LRP | 8 | 5 | 1 |
Jet Fuel A | petro255 | Jet Fuel | Jet Fuel | LRP | 8 | 5 | 1 |
Jet Fuel A | petro256 | Jet Fuel | Jet Fuel | LRP | 8 | 5 | 1 |
SRM 2723b | petro226 | Low S Diesel | Diesel | LRP | 9 | 6 | 1 |
SRM 2723b | petro227 | Low S Diesel | Diesel | LRP | 9 | 6 | 1 |
SRM 2723b | petro228 | Low S Diesel | Diesel | LRP | 9 | 6 | 1 |
SRM 1848 | petro218 | Motor Oil Additive | Motor Oil | HRP | 10 | 7 | 3 |
SRM 1848 | petro219 | Motor Oil Additive | Motor Oil | HRP | 10 | 7 | 3 |
SRM 1848 | petro220 | Motor Oil Additive | Motor Oil | HRP | 10 | 7 | 3 |
SRM 2299 | petro210 | S in gasoline | Gasoline | LRP | 11 | 1 | 1 |
SRM 2299 | petro211 | S in gasoline | Gasoline | LRP | 11 | 1 | 1 |
SRM 2299 | petro212 | S in gasoline | Gasoline | LRP | 11 | 1 | 1 |
SRM 1617b | petro242 | S in Kerosene (High Level) | Kerosene | LRP | 12 | 8 | 1 |
SRM 1617b | petro243 | S in Kerosene (High Level) | Kerosene | LRP | 12 | 8 | 1 |
SRM 1617b | petro244 | S in Kerosene (High Level) | Kerosene | LRP | 12 | 8 | 1 |
SRM 1616b | petro262 | S in Kerosene (Low Level) | Kerosene | LRP | 13 | 8 | 1 |
SRM 1616b | petro263 | S in Kerosene (Low Level) | Kerosene | LRP | 13 | 8 | 1 |
SRM 1616b | petro264 | S in Kerosene (Low Level) | Kerosene | LRP | 13 | 8 | 1 |
SRM 2770 | petro234 | S in Residual Fuel Oil | RFO | HRP | 14 | 9 | 3 |
SRM 2770 | petro235 | S in Residual Fuel Oil | RFO | HRP | 14 | 9 | 3 |
SRM 2770 | petro236 | S in Residual Fuel Oil | RFO | HRP | 14 | 9 | 3 |
SRM 1623c | petro238 | S in Residual Fuel Oil | RFO | HRP | 14 | 9 | 3 |
SRM 1623c | petro239 | S in Residual Fuel Oil | RFO | HRP | 14 | 9 | 3 |
SRM 1623c | petro240 | S in Residual Fuel Oil | RFO | HRP | 14 | 9 | 3 |
SRM 1620c | petro278 | S in Residual Fuel Oil | RFO | HRP | 14 | 9 | 3 |
SRM 1620c | petro279 | S in Residual Fuel Oil | RFO | HRP | 14 | 9 | 3 |
SRM 1620c | petro280 | S in Residual Fuel Oil | RFO | HRP | 14 | 9 | 3 |
SRM 1624d | petro214 | Sulfur in Diesel | Diesel | LRP | 15 | 6 | 1 |
SRM 1624d | petro215 | Sulfur in Diesel | Diesel | LRP | 15 | 6 | 1 |
SRM 1624d | petro216 | Sulfur in Diesel | Diesel | LRP | 15 | 6 | 1 |
SRM 2771 | petro222 | Zero S Diesel | Diesel | LRP | 16 | 6 | 1 |
SRM 2771 | petro223 | Zero S Diesel | Diesel | LRP | 16 | 6 | 1 |
SRM 2771 | petro224 | Zero S Diesel | Diesel | LRP | 16 | 6 | 1 |
unfData[is.na(unfData)] <- 0
Feature columns with zero standard deviation are removed from analysis.
unfData <- unfData[, which(sapply(1:dim(unfData)[2], function(x) sd(unfData[, x])) > 0)]
dataMat <- dummy_dataMat <- matrix(ncol = (dim(unfData)[2]), nrow = dim(unfData)[1])
For the option without dimensionality reduction, row-wise min-max scaling is used. Row-wise z-score normalization is applied to prepare the input for the dimensionality reduction (SVD) step.
# For no dimensionality reduction option: Row-wise min-max scaling
for (i in 1:dim(unfData)[2]) {
for (j in 1:dim(unfData)[1]) {
minVal <- min(unfData[j, ])
maxVal <- max(unfData[j, ])
dataMat[j, i] <- (unfData[j, i] - minVal)/(maxVal - minVal)
}
}
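The double loop above recomputes the row minimum and maximum for every column; a behavior-equivalent vectorized sketch:
# Vectorized row-wise min-max scaling (same result as the loop above):
dataMat_vec <- t(apply(unfData, 1, function(r) (r - min(r))/(max(r) - min(r))))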
# For the dimensionality reduction option: Row-wise z-score normalization (input to SVD)
for (i in 1:dim(unfData)[2]) {
for (j in 1:dim(unfData)[1]) {
meanVal <- mean(unfData[j, ])
sdVal <- sd(unfData[j, ])
dummy_dataMat[j, i] <- (unfData[j, i] - meanVal)/sdVal
}
}
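Similarly, the row-wise z-score normalization has a compact equivalent, since scale() standardizes columns:
# Vectorized row-wise z-score normalization (same result as the loop above):
dummy_dataMat_vec <- t(scale(t(unfData)))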
Singular Value Decomposition is applied, retaining the leading components up to a cumulative threshold of 85% of the total sum of singular values.
Xmat <- svd(dummy_dataMat)
criteriaInd <- max(which(cumsum(Xmat$d/sum(Xmat$d) * 100) <= 85))
selU_mat <- Xmat$u[, 1:criteriaInd]
selD_mat <- matrix(0, ncol = dim(selU_mat)[2], nrow = dim(selU_mat)[2])
diag(selD_mat) <- as.matrix(sqrt(Xmat$d[1:criteriaInd]))
red_dataMat <- selU_mat %*% selD_mat
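As a quick sanity check, the proportion of the total singular-value sum retained by the selected components can be inspected:
sum(Xmat$d[1:criteriaInd])/sum(Xmat$d) * 100 #should not exceed the 85% threshold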
refObj <- NULL
refObj <- sampleIDs[, paste("Index_", classNum, "class", sep = "")]
names(refObj) <- rownames(dataMat) <- rownames(red_dataMat) <- rownames(unfData)
dMetric_ref <- dist(refObj) #Euclidean distance
hclustfun <- list()
hclustfun[[1]] = function(x) hclust(x, method = "ward.D2")
hclustfun[[2]] = function(x) hclust(x, method = "average")
hclustfun[[3]] = function(x) hclust(x, method = "complete")
refClust <- list()
count <- 0
for (i in 1:length(hclustfun)) {
count <- count + 1
refClust[[count]] <- hclustfun[[i]](dMetric_ref)
}
# Defining colors for the dendrogram
colorsMat <- c("goldenrod", "darkorange4", "darkolivegreen", "blue4", "firebrick4", "plum1", "grey50", "seagreen3", "palevioletred1", "turquoise1", "deeppink4",
"dodgerblue", "darkorchid3", "saddlebrown", "maroon2", "lawngreen")
# Defining the number of cuts for the F-M index calculation
treeInd <- classNum
# Calling functions for producing grouping structure based on the selected parameters
results <- get_clustResults(dataMat, treeInd, refClust, methodInd, colorsMat, classNum)
reduced_results <- get_clustResults(red_dataMat, treeInd, refClust, methodInd, colorsMat, classNum)
method | tree_Cut | FM_index | null_FM_index | standard_dev | p_value |
---|---|---|---|---|---|
Ward_KendallCor | 3 | 0.53 | 0.39 | 0.02 | 0.00 |
Ward_PearsonCor | 3 | 0.40 | 0.41 | 0.02 | 0.31 |
Ward_SpearmanCor | 3 | 0.49 | 0.44 | 0.02 | 0.02 |
Avg_KendallCor | 3 | 0.57 | 0.53 | 0.03 | 0.04 |
Avg_PearsonCor | 3 | 0.51 | 0.50 | 0.03 | 0.26 |
Avg_SpearmanCor | 3 | 0.57 | 0.53 | 0.03 | 0.04 |
Complete_KendallCor | 3 | 0.64 | 0.45 | 0.02 | 0.00 |
Complete_PearsonCor | 3 | 0.51 | 0.51 | 0.03 | 0.36 |
Complete_SpearmanCor | 3 | 0.57 | 0.53 | 0.03 | 0.05 |
method | tree_Cut | FM_index | null_FM_index | standard_dev | p_value |
---|---|---|---|---|---|
Ward_KendallCor | 3 | 0.41 | 0.38 | 0.01 | 0.02 |
Ward_PearsonCor | 3 | 0.50 | 0.39 | 0.01 | 0.00 |
Ward_SpearmanCor | 3 | 0.41 | 0.38 | 0.01 | 0.01 |
Avg_KendallCor | 3 | 0.56 | 0.44 | 0.02 | 0.00 |
Avg_PearsonCor | 3 | 0.54 | 0.48 | 0.03 | 0.04 |
Avg_SpearmanCor | 3 | 0.48 | 0.42 | 0.02 | 0.00 |
Complete_KendallCor | 3 | 0.41 | 0.39 | 0.02 | 0.06 |
Complete_PearsonCor | 3 | 0.60 | 0.44 | 0.02 | 0.00 |
Complete_SpearmanCor | 3 | 0.41 | 0.39 | 0.02 | 0.07 |
A dendrogram is generated for the selected clustering methodology (average linkage with Pearson correlation, no dimensionality reduction).
i <- methodInd # passing the clustering methodology index for generation of dendrograms
hc <- results$hc
groups <- NULL
groups <- cutree(hc[[i]], k = treeInd) #cut tree into n clusters, where n is treeInd
dd.row <- as.dendrogram(hc[[i]])
ddata_x <- dendro_data(dd.row, type = "rectangle")
groupMat <- data.frame(label = sampleIDs$Sample, groupInfo = sampleIDs[, paste("Labels_", classNum, "_Class", sep = "")])
labs <- merge(label(ddata_x), groupMat, by = "label")
labs_final <- labs[order(labs$x, decreasing = FALSE), ]
Class_Labels <- as.factor(labs_final$groupInfo)
ggplot(segment(ddata_x)) + geom_segment(aes(x = x, y = y, xend = xend, yend = yend), colour = "black") + geom_text(data = label(ddata_x), aes(label = label,
x = x, y = 0, colour = Class_Labels), size = 5, fontface = 2, nudge_y = 0.8) + theme(panel.background = element_rect(fill = "white"), panel.grid = element_blank(),
axis.text = element_blank(), axis.ticks.x = element_blank(), axis.ticks.y = element_blank()) + labs(x = "", y = "") + scale_color_manual(values = colorsMat) +
coord_flip() + scale_y_reverse(expand = c(1, 0))
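To export the figure, ggsave() can be called right after plotting (the file name and dimensions below are illustrative):
ggsave("dendrogram_noDimRed.png", width = 8, height = 10, dpi = 300)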
A dendrogram is generated for the selected clustering methodology (average linkage with Pearson correlation, after dimensionality reduction).
i <- methodInd # passing the clustering methodology index for generation of dendrograms
hc <- reduced_results$hc
groups <- NULL
groups <- cutree(hc[[i]], k = treeInd) #cut tree into n clusters, where n is treeInd
dd.row <- as.dendrogram(hc[[i]])
ddata_x <- dendro_data(dd.row, type = "rectangle")
groupMat <- data.frame(label = sampleIDs$Sample, groupInfo = sampleIDs[, paste("Labels_", classNum, "_Class", sep = "")])
labs <- merge(label(ddata_x), groupMat, by = "label")
labs_final <- labs[order(labs$x, decreasing = FALSE), ]
Class_Labels <- as.factor(labs_final$groupInfo)
ggplot(segment(ddata_x)) + geom_segment(aes(x = x, y = y, xend = xend, yend = yend), colour = "black") + geom_text(data = label(ddata_x), aes(label = label,
x = x, y = 0, colour = Class_Labels), size = 5, fontface = 2, nudge_y = 0.8) + theme(panel.background = element_rect(fill = "white"), panel.grid = element_blank(),
axis.text = element_blank(), axis.ticks.x = element_blank(), axis.ticks.y = element_blank()) + labs(x = "", y = "") + scale_color_manual(values = colorsMat) +
coord_flip() + scale_y_reverse(expand = c(1, 0))
The following function builds the Random Forest model and retrieves the important features.
"%notin%" <- Negate("%in%")
get_modelResults <- function(mergedTable, optimal_mtry) {
featRank <- data.frame()
rf_Model <- randomForest(formula = Status ~ ., data = mergedTable, mtry = optimal_mtry, importance = TRUE)
featRank <- importance(rf_Model, type = 1, scale = TRUE)
rankedFeats <- rownames(featRank)[order(featRank, decreasing = TRUE)]
finalResults <- list(model = rf_Model, rankedFeats = rankedFeats, featRank_details = featRank)
return(finalResults)
}
set.seed(42)
unfData <- read.csv("Unfolded_GC-MS_data_of_SRM.csv", header = T)
rownames(unfData) <- unfData[, 1] #assigning the row names as sample names
unfData <- unfData[, -1] #excluding the first column that contains sample names
unfData <- as.matrix(unfData)
sampleIDs <- read.csv("SRM_categorization_info.csv", header = T)
unfData[is.na(unfData)] <- 0
classNum: Number of classes / Options: 3, 9, or 16 classes for SRM grouping
option_rmDuplicate: Character flag deciding whether to remove the replicates / Options: “Y” or “y” for yes / “N” or “n” for no
# For this analysis, we select the following input arguments:
classNum <- 3
option_rmDuplicate <- "n"
if (option_rmDuplicate %in% c("y", "Y")) {
sampleIDs <- sampleIDs[which(!duplicated(sampleIDs$Name)), ] #removing 2nd & 3rd duplicates
# sampleIDs <- sampleIDs[seq(2, dim(sampleIDs)[1], 3), ]  #removing 1st & 3rd duplicates
# sampleIDs <- sampleIDs[seq(3, dim(sampleIDs)[1], 3), ]  #removing 1st & 2nd duplicates
unfData <- unfData[rownames(unfData) %in% sampleIDs$File, ]
} else if (option_rmDuplicate %notin% c("n", "N")) {
print("Invalid input for removal of duplicates option!")
}
In this part of the code, we attach the relevant labels (based on the input class number) to the data table.
if (classNum == 3) {
selTable <- as.data.frame(cbind(Status = sampleIDs$Index_3class, file = sampleIDs$File))
}
if (classNum == 9) {
selTable <- as.data.frame(cbind(Status = sampleIDs$Index_9class, file = sampleIDs$File))
}
if (classNum == 16) {
selTable <- as.data.frame(cbind(Status = sampleIDs$Index_16class, file = sampleIDs$File))
}
# Scales the data row-wise:
pre1_mergedTable <- matrix(ncol = dim(unfData)[2], nrow = dim(unfData)[1])
rownames(pre1_mergedTable) <- rownames(unfData)
colnames(pre1_mergedTable) <- colnames(unfData)
for (i in 1:dim(unfData)[2]) {
for (j in 1:dim(unfData)[1]) {
minVal <- min(as.matrix(unfData[j, ]))
maxVal <- max(as.matrix(unfData[j, ]))
pre1_mergedTable[j, i] <- (unfData[j, i] - minVal)/(maxVal - minVal)
}
}
# Removes the columns with sd = 0 :
pre2_mergedTable <- pre1_mergedTable[, as.numeric(which(apply(pre1_mergedTable, 2, sd) > 0))]
# Scales the data column-wise:
pre3_mergedTable <- matrix(nrow = dim(pre2_mergedTable)[1], ncol = dim(pre2_mergedTable)[2])
rownames(pre3_mergedTable) <- rownames(pre2_mergedTable)
colnames(pre3_mergedTable) <- colnames(pre2_mergedTable)
for (j in 1:dim(pre2_mergedTable)[1]) {
for (i in 1:dim(pre2_mergedTable)[2]) {
minVal <- min(as.matrix(pre2_mergedTable[, i]))
maxVal <- max(as.matrix(pre2_mergedTable[, i]))
pre3_mergedTable[j, i] <- (pre2_mergedTable[j, i] - minVal)/(maxVal - minVal)
}
}
mergedTable <- as.data.frame(cbind(pre3_mergedTable, Status = selTable$Status))
Column names are cleaned.
colnames(mergedTable) <- gsub("-", "", colnames(mergedTable))
colnames(mergedTable) <- gsub("/", "", colnames(mergedTable))
colnames(mergedTable) <- gsub("[(]", "", colnames(mergedTable))
colnames(mergedTable) <- gsub("[)]", "", colnames(mergedTable))
colnames(mergedTable) <- gsub("[,]", "", colnames(mergedTable))
mergedTable$Status <- as.factor(mergedTable$Status)
Method: Random Forest Classification
The tuning step finds the optimal mtry parameter, i.e. the number of features randomly sampled as split candidates when building each tree.
seeds <- vector(mode = "list", length = 61)
for (i in 1:60) seeds[[i]] <- sample.int(1000, 60)
seeds[[61]] <- sample.int(1000, 1)
control <- trainControl(method = "LOOCV", search = "grid", seeds = seeds)
tunegrid <- expand.grid(.mtry = c(1:(dim(mergedTable)[2] - 1)))
rf_random <- train(Status ~ ., data = mergedTable, method = "rf", metric = "Accuracy", tuneGrid = tunegrid, trControl = control)
optimal_mtry <- rf_random$bestTune$mtry
loopNum <- 1
confMat <- vector(mode = "list", length = 1)
modelResults <- get_modelResults(mergedTable, optimal_mtry)
confMat[[1]] <- modelResults$model$confusion
rankedFeats <- modelResults$rankedFeats
featRank_stat <- modelResults$featRank_details
save(confMat, file = paste("confMatrix_", classNum, "class.dat", sep = "")) #raw format is saved for further use
dummy <- as.data.frame(confMat[[1]])
notPerm_conf <- as.matrix(dummy[, 1:as.numeric(classNum)])
notPerm_acc <- sum(diag(notPerm_conf))/(as.numeric(dim(unfData)[1])) * 100
original_confMat <- t(confMat[[1]][1:as.numeric(classNum), 1:as.numeric(classNum)]) #transpose is taken to put 'true' label as columns and 'predicted' as rows
if (as.numeric(classNum) == 3) {
labelNames <- c("LRP", "Crude", "HRP")
}
if (as.numeric(classNum) == 9) {
labelNames <- c("Gasoline", "Biodiesel", "Crude Oil", "Gas Oil", "Jet Fuel", "Diesel", "Motor Oil", "Kerosene", "RFO")
}
if (as.numeric(classNum) == 16) {
labelNames <- c("87 Octane Gasoline", "Biodiesel (Animal-based)", "Biodiesel (Soy-based)", "Crude Oil (Heavy-sweet)", "Crude Oil (Light-sour)", "Gas Oil",
"Gulf of Mexico Crude Oil", "Jet Fuel", "Low S Diesel", "Motor Oil Additive", "S in Gasoline", "S in Kerosene (High Level)", "S in Kerosene (Low Level)",
"S in Residual Fuel Oil", "Sulfur in Diesel", "Zero S Diesel")
}
rownames(original_confMat) <- colnames(original_confMat) <- labelNames
write.csv(original_confMat, file = paste("confMatrix_", classNum, "class.csv", sep = ""))
featRank_stat <- as.data.frame(featRank_stat)
featRank_final <- as.matrix(featRank_stat[order(featRank_stat$MeanDecreaseAccuracy, decreasing = T), 1])
rownames(featRank_final) <- rownames(featRank_stat)[order(featRank_stat$MeanDecreaseAccuracy, decreasing = T)]
colnames(featRank_final) <- c("Mean Decrease in Accuracy")
write.csv(featRank_final, file = paste("featureRanking_", classNum, "_class.csv", sep = ""))
Original Confusion Matrix:
## LRP Crude HRP
## LRP 36 0 0
## Crude 0 9 0
## HRP 0 0 15
## [1] "Original Confusion Matrix Accuracy: 100"
Feature | Mean Decrease in Accuracy |
---|---|
C2.Chrysenes.Benzo.a.anthracenes | 7.520562 |
C1.Dibenzothiophenes | 7.099796 |
Dibenzofuran | 7.080316 |
C2.Dibenzothiophenes | 6.910357 |
C3.Phenanthrene.anthracenes | 6.901531 |
C4.Naphthalenes | 6.820887 |
C2.Fluoranthene.pyrenes | 6.696374 |
C4.Phenanthrene.anthracenes | 6.574730 |
C2.Decalins | 6.571430 |
Naphthobenzothiophene | 6.540177 |
C4.Dibenzothiophenes | 6.489756 |
C1.Benzothiophenes | 6.489034 |
C3.Dibenzothiophenes | 6.467766 |
Fluoranthene | 6.455721 |
Acenaphthene | 6.453360 |
C3.Fluorenes | 6.311278 |
Acenaphthylene | 6.302357 |
C3.Naphthalenes | 6.300192 |
Benzo.b.fluoranthene | 6.279048 |
C2.Phenanthrene.anthracenes | 6.266151 |
Benzothiophene | 6.248435 |
Pyrene | 6.199187 |
C2.Fluorenes | 6.142463 |
C2.Naphthobenzothiophenes | 6.121350 |
Naphthalene | 6.100423 |
C1.Decalins | 6.092001 |
Benzo.k.fluoranthene | 6.063806 |
C2.Naphthalenes | 6.020790 |
Benzo.g.h.i.perylene | 5.992569 |
C1.Fluoranthene.pyrenes | 5.949770 |
Indeno.1.2.3.cd.pyrene | 5.931836 |
Anthracene | 5.906540 |
C1.Phenanthrene.anthracenes | 5.872676 |
Decalin | 5.861592 |
Biphenyl | 5.848396 |
Benzo.e.pyrene | 5.846148 |
Benz.a.anthracene | 5.808037 |
Dibenzo.a.h.anthracene | 5.797696 |
Phenanthrene | 5.791218 |
C4.Chrysenes.Benzo.a.anthracenes | 5.761235 |
Dibenzothiophene | 5.760049 |
C1.Naphthalenes | 5.593102 |
C1.Naphthobenzothiophenes | 5.399784 |
Chrysene | 5.277497 |
C3.Benzothiophenes | 5.274166 |
C3.Decalins | 5.270193 |
Benzo.a.pyrene | 5.247689 |
Perylene | 5.237808 |
Fluorene | 5.150002 |
C1.Fluorenes | 5.127238 |
C1.Chrysenes.Benzo.a.anthracenes | 5.070513 |
C3.Naphthobenzothiophenes | 4.903954 |
C3.Chrysenes.Benzo.a.anthracenes | 4.861172 |
C3.Fluoranthene.pyrenes | 4.809014 |
C2.Benzothiophenes | 4.767038 |
We’ll now use the same workflow to generate an average confusion matrix from 1000 permutations.
# Defines a necessary function to be used within the code:
"%notin%" <- Negate("%in%")
# Define model generation function:
get_modelResults <- function(mergedTable, optimal_mtry) {
featRank <- data.frame()
rf_Model <- randomForest(formula = Status ~ ., data = mergedTable, mtry = optimal_mtry, importance = TRUE)
featRank <- importance(rf_Model, type = 1, scale = TRUE)
rankedFeats <- rownames(featRank)[order(featRank, decreasing = TRUE)]
# rankedFeats <- colnames(mergedTable)[order(featRank, decreasing = TRUE)] #same as the line above
finalResults <- list(model = rf_Model, rankedFeats = rankedFeats, featRank_details = featRank)
return(finalResults)
}
# Setting seed for reproducible results:
set.seed(42)
# Reading processed input GC-MS data of SRM samples
unfData <- read.csv("Unfolded_GC-MS_data_of_SRM.csv", header = T)
rownames(unfData) <- unfData[, 1] #assigning the row names as sample names
unfData <- unfData[, -1] #excluding the first column that contains sample names
unfData <- as.matrix(unfData)
# Reading reference category information of SRM samples
sampleIDs <- read.csv("SRM_categorization_info.csv", header = T)
# Missing data imputation with zeros:
unfData[is.na(unfData)] <- 0
# Input parameter options: 1. classNum: Number of classes / Options: 3, 9 or 16 classes for SRM grouping 2. option_rmDuplicate: Character flag deciding
# whether to remove the replicates / Options: 'Y' or 'y' for yes / 'N' or 'n' for no
# Input arguments:
classNum <- 3
option_rmDuplicate <- "n"
# To remove duplicates:
if (option_rmDuplicate %in% c("y", "Y")) {
sampleIDs <- sampleIDs[which(!duplicated(sampleIDs$Name)), ] #removing 2nd & 3rd duplicates
# sampleIDs <- sampleIDs[seq(2, dim(sampleIDs)[1], 3), ]  #removing 1st & 3rd duplicates
# sampleIDs <- sampleIDs[seq(3, dim(sampleIDs)[1], 3), ]  #removing 1st & 2nd duplicates
unfData <- unfData[rownames(unfData) %in% sampleIDs$File, ]
} else if (option_rmDuplicate %notin% c("n", "N")) {
print("Invalid input for removal of duplicates option!")
}
# Attaches the relevant labels (based on the input class number) to the data table:
if (classNum == 3) {
selTable <- as.data.frame(cbind(Status = sampleIDs$Index_3class, file = sampleIDs$File))
}
if (classNum == 9) {
selTable <- as.data.frame(cbind(Status = sampleIDs$Index_9class, file = sampleIDs$File))
}
if (classNum == 16) {
selTable <- as.data.frame(cbind(Status = sampleIDs$Index_16class, file = sampleIDs$File))
}
# Scales the data row-wise:
pre1_mergedTable <- matrix(ncol = dim(unfData)[2], nrow = dim(unfData)[1])
rownames(pre1_mergedTable) <- rownames(unfData)
colnames(pre1_mergedTable) <- colnames(unfData)
for (i in 1:dim(unfData)[2]) {
for (j in 1:dim(unfData)[1]) {
minVal <- min(as.matrix(unfData[j, ]))
maxVal <- max(as.matrix(unfData[j, ]))
pre1_mergedTable[j, i] <- (unfData[j, i] - minVal)/(maxVal - minVal)
}
}
# Removes the columns with sd = 0 :
pre2_mergedTable <- pre1_mergedTable[, as.numeric(which(apply(pre1_mergedTable, 2, sd) > 0))]
# Scales the data column-wise:
pre3_mergedTable <- matrix(nrow = dim(pre2_mergedTable)[1], ncol = dim(pre2_mergedTable)[2])
rownames(pre3_mergedTable) <- rownames(pre2_mergedTable)
colnames(pre3_mergedTable) <- colnames(pre2_mergedTable)
for (j in 1:dim(pre2_mergedTable)[1]) {
for (i in 1:dim(pre2_mergedTable)[2]) {
minVal <- min(as.matrix(pre2_mergedTable[, i]))
maxVal <- max(as.matrix(pre2_mergedTable[, i]))
pre3_mergedTable[j, i] <- (pre2_mergedTable[j, i] - minVal)/(maxVal - minVal)
}
}
mergedTable <- as.data.frame(cbind(pre3_mergedTable, Status = selTable$Status))
# Formatting the data table (specifically - cleaning the column names):
colnames(mergedTable) <- gsub("-", "", colnames(mergedTable))
colnames(mergedTable) <- gsub("/", "", colnames(mergedTable))
colnames(mergedTable) <- gsub("[(]", "", colnames(mergedTable))
colnames(mergedTable) <- gsub("[)]", "", colnames(mergedTable))
colnames(mergedTable) <- gsub("[,]", "", colnames(mergedTable))
mergedTable$Status <- as.factor(mergedTable$Status)
# Tuning the Random Forest model to get the optimal mtry parameter (the number of features randomly sampled as split candidates) Method: Random Forest
# Classification
seeds <- vector(mode = "list", length = 61)
for (i in 1:60) seeds[[i]] <- sample.int(1000, 60)
seeds[[61]] <- sample.int(1000, 1)
control <- trainControl(method = "LOOCV", search = "grid", seeds = seeds)
tunegrid <- expand.grid(.mtry = c(1:(dim(mergedTable)[2] - 1)))
rf_random <- train(Status ~ ., data = mergedTable, method = "rf", metric = "Accuracy", tuneGrid = tunegrid, trControl = control)
optimal_mtry <- rf_random$bestTune$mtry
# Permutes the data labels and refits the model for each of the permNumber permutations:
permNumber <- 1000
permuted_confMat <- vector(mode = "list", length = permNumber)
loopNum <- permNumber
for (l in 1:loopNum) {
mergedTable$Status <- as.factor(sample(mergedTable$Status))
modelResults <- get_modelResults(mergedTable, optimal_mtry)
permuted_confMat[[l]] <- modelResults$model$confusion
}
save(permuted_confMat, file = paste("permuted_confMatrix_", classNum, "class.dat", sep = ""))
load(paste("confMatrix_", classNum, "class.dat", sep = ""))
load(paste("original_classification_accuracy_", classNum, "class.dat", sep = ""))
dummy <- as.data.frame(confMat[[1]])
notPerm_conf <- as.matrix(dummy[, 1:as.numeric(classNum)])
notPerm_acc <- sum(diag(notPerm_conf))/(as.numeric(dim(unfData)[1])) * 100
dsum <- matrix(ncol = as.numeric(classNum), nrow = as.numeric(classNum), 0)
a <- matrix(ncol = as.numeric(classNum), nrow = as.numeric(classNum))
acc <- NULL
for (i in 1:permNumber) {
dummy <- as.data.frame(permuted_confMat[[i]])
a <- as.matrix(dummy[, 1:as.numeric(classNum)])
acc[i] <- sum(diag(a))/as.numeric(dim(unfData)[1]) * 100
dsum <- as.table(dsum) + a
}
avgConf <- dsum/permNumber
histAcc <- hist(acc, plot = FALSE)
pVal <- sum(histAcc$counts[which(histAcc$breaks >= notPerm_acc)], na.rm = TRUE)/permNumber
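Up to the histogram binning, this is the empirical tail probability of the permuted accuracies; a direct equivalent computation would be:
pVal_direct <- sum(acc >= notPerm_acc)/permNumber #fraction of permuted accuracies at least as large as the observed one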
if (as.numeric(classNum) == 3) {
colnames(notPerm_conf) <- rownames(notPerm_conf) <- colnames(avgConf) <- rownames(avgConf) <- c("LRP", "Crude", "HRP")
}
if (as.numeric(classNum) == 9) {
colnames(notPerm_conf) <- rownames(notPerm_conf) <- colnames(avgConf) <- rownames(avgConf) <- c("Gasoline", "Biodiesel", "Crude Oil", "Gas Oil", "Jet Fuel",
"Diesel", "Motor Oil", "Kerosene", "RFO")
}
if (as.numeric(classNum) == 16) {
colnames(notPerm_conf) <- rownames(notPerm_conf) <- colnames(avgConf) <- rownames(avgConf) <- c("87 Octane Gasoline", "Biodiesel (Animal-based)", "Biodiesel (Soy-based)",
"Crude Oil (Heavy-sweet)", "Crude Oil (Light-sour)", "Gas Oil", "Gulf of Mexico Crude Oil", "Jet Fuel", "Low S Diesel", "Motor Oil Additive", "S in Gasoline",
"S in Kerosene (High Level)", "S in Kerosene (Low Level)", "S in Residual Fuel Oil", "Sulfur in Diesel", "Zero S Diesel")
}
# Formatting and saving the confusion matrix:
average_confMat <- t(avgConf[1:classNum, 1:classNum])
if (as.numeric(classNum) == 3) {
labelNames <- c("LRP", "Crude", "HRP")
}
if (as.numeric(classNum) == 9) {
labelNames <- c("Gasoline", "Biodiesel", "Crude Oil", "Gas Oil", "Jet Fuel", "Diesel", "Motor Oil", "Kerosene", "RFO")
}
if (as.numeric(classNum) == 16) {
labelNames <- c("87 Octane Gasoline", "Biodiesel (Animal-based)", "Biodiesel (Soy-based)", "Crude Oil (Heavy-sweet)", "Crude Oil (Light-sour)", "Gas Oil",
"Gulf of Mexico Crude Oil", "Jet Fuel", "Low S Diesel", "Motor Oil Additive", "S in Gasoline", "S in Kerosene (High Level)", "S in Kerosene (Low Level)",
"S in Residual Fuel Oil", "Sulfur in Diesel", "Zero S Diesel")
}
rownames(average_confMat) <- colnames(average_confMat) <- labelNames
write.csv(average_confMat, file = paste("permutation_confMatrix_", classNum, "class.csv", sep = ""))
Average Confusion Matrix from 1000 permutations:
print(average_confMat)
## LRP Crude HRP
## LRP 22.391 5.773 9.574
## Crude 5.051 1.116 2.078
## HRP 8.558 2.111 3.348
print(paste("Mean Accuracy:", round(mean(acc), 2), sep = " "))
## [1] "Mean Accuracy: 44.76"
print(paste("Std Accuracy:", round(sd(acc), 2), sep = " "))
## [1] "Std Accuracy: 6.98"
print(paste("P-value:", round(pVal, 3), sep = " "))
## [1] "P-value: 0"
The following function builds a Random Forest model on the training samples and returns the confusion matrix of its predictions for the held-out test samples.
"%notin%" <- Negate("%in%")
# Define model generation function:
get_modelResults <- function(testTable, trainTable, optimal_mtry) {
rf_Model <- randomForest(formula = Status ~ ., data = trainTable, mtry = optimal_mtry, importance = TRUE)
predictedVal <- predict(rf_Model, testTable)
confMat <- table(testTable$Status, predictedVal)
return(confMat)
}
set.seed(42)
unfData <- read.csv("Unfolded_GC-MS_data_of_SRM.csv", header = T)
rownames(unfData) <- unfData[, 1] #assigning the row names as sample names
unfData <- unfData[, -1] #excluding the first column that contains sample names
unfData <- as.matrix(unfData)
sampleIDs <- read.csv("SRM_categorization_info.csv", header = T)
unfData[is.na(unfData)] <- 0
classNum: Number of classes / Options: 3, 9, or 16 classes for SRM grouping
option_permutation: Character flag deciding whether to permute the sample labels / Options: “Y” or “y” for yes / “N” or “n” for no
# For this analysis, we select the following input arguments:
classNum <- 3
option_permutation <- "n"
In this part of the code, we attach the relevant labels (based on the input class number) to the data table.
if (classNum == 3) {
selTable <- as.data.frame(cbind(Status = sampleIDs$Index_3class, file = sampleIDs$File, material = sampleIDs$Name))
}
if (classNum == 9) {
selTable <- as.data.frame(cbind(Status = sampleIDs$Index_9class, file = sampleIDs$File, material = sampleIDs$Name))
}
if (classNum == 16) {
selTable <- as.data.frame(cbind(Status = sampleIDs$Index_16class, file = sampleIDs$File, material = sampleIDs$Name))
}
# Scales the data row-wise:
pre1_mergedTable <- matrix(ncol = dim(unfData)[2], nrow = dim(unfData)[1])
rownames(pre1_mergedTable) <- rownames(unfData)
colnames(pre1_mergedTable) <- colnames(unfData)
for (i in 1:dim(unfData)[2]) {
for (j in 1:dim(unfData)[1]) {
minVal <- min(as.matrix(unfData[j, ]))
maxVal <- max(as.matrix(unfData[j, ]))
pre1_mergedTable[j, i] <- (unfData[j, i] - minVal)/(maxVal - minVal)
}
}
# Removes the columns with sd = 0 :
pre2_mergedTable <- pre1_mergedTable[, as.numeric(which(apply(pre1_mergedTable, 2, sd) > 0))]
# Scales the data column-wise:
pre3_mergedTable <- matrix(nrow = dim(pre2_mergedTable)[1], ncol = dim(pre2_mergedTable)[2])
rownames(pre3_mergedTable) <- rownames(pre2_mergedTable)
colnames(pre3_mergedTable) <- colnames(pre2_mergedTable)
for (j in 1:dim(pre2_mergedTable)[1]) {
for (i in 1:dim(pre2_mergedTable)[2]) {
minVal <- min(as.matrix(pre2_mergedTable[, i]))
maxVal <- max(as.matrix(pre2_mergedTable[, i]))
pre3_mergedTable[j, i] <- (pre2_mergedTable[j, i] - minVal)/(maxVal - minVal)
}
}
mergedTable <- as.data.frame(cbind(pre3_mergedTable, Status = selTable$Status, material = selTable$material))
colNames <- c(colnames(pre2_mergedTable), "Status", "material") #using the retained (nonzero-sd) feature names
colnames(mergedTable) <- colNames
Column names are cleaned.
colnames(mergedTable) <- gsub("-", "", colnames(mergedTable))
colnames(mergedTable) <- gsub("/", "", colnames(mergedTable))
colnames(mergedTable) <- gsub("[(]", "", colnames(mergedTable))
colnames(mergedTable) <- gsub("[)]", "", colnames(mergedTable))
colnames(mergedTable) <- gsub("[,]", "", colnames(mergedTable))
mergedTable$Status <- as.factor(mergedTable$Status)
A Random Forest classifier is trained with each SRM substance held out in turn and tested on the held-out substance.
confMat_all <- vector(mode = "list", length = length(unique(mergedTable$material)))
for (m in 1:length(unique(mergedTable$material))) {
# length(unique(mergedTable$material)) = 20 SRM substances
trainTable <- mergedTable[mergedTable$material != unique(mergedTable$material)[m], 1:(dim(mergedTable)[2] - 1)]
testTable <- mergedTable[mergedTable$material == unique(mergedTable$material)[m], 1:(dim(mergedTable)[2] - 1)]
seeds <- vector(mode = "list", length = 61)
for (i in 1:60) seeds[[i]] <- sample.int(1000, 60)
seeds[[61]] <- sample.int(1000, 1)
control <- trainControl(method = "LOOCV", search = "grid", seeds = seeds)
tunegrid <- expand.grid(.mtry = c(1:(dim(trainTable)[2] - 1)))
rf_random <- train(Status ~ ., data = trainTable, method = "rf", metric = "Accuracy", tuneGrid = tunegrid, trControl = control)
optimal_mtry <- rf_random$bestTune$mtry
# Permutes the data labels if option_permutation is 'y' or 'Y':
if (option_permutation %in% c("n", "N")) {
loopNum <- 1
confMat <- vector(mode = "list", length = 1)
# getting test results for 3 replicates of each excluded SRM substance
modelResults <- get_modelResults(testTable, trainTable, optimal_mtry)
confMat[[1]] <- modelResults
} else if (option_permutation %in% c("y", "Y")) {
confMat <- vector(mode = "list", length = permNumber)
loopNum <- permNumber
for (l in 1:loopNum) {
testTable$Status <- as.factor(sample(testTable$Status))
modelResults <- get_modelResults(testTable, trainTable, optimal_mtry)
confMat[[l]] <- modelResults
}
} else if (option_permutation %notin% c("y", "Y", "n", "N")) {
print("Invalid input for label permuting option!")
}
confMat_all[[m]] <- confMat
}
save(confMat_all, file = paste("original_confMatrix_", classNum, "class.dat", sep = ""))
We’ll now use the same workflow to generate an average confusion matrix from 1000 permutations.
# Define model generation function and predicting for unknown substance:
get_modelResults <- function(testTable, trainTable, optimal_mtry) {
rf_Model <- randomForest(formula = Status ~ ., data = trainTable, mtry = optimal_mtry, importance = TRUE)
predictedVal <- predict(rf_Model, testTable)
confMat <- table(testTable$Status, predictedVal)
return(confMat)
}
# Setting seed for reproducible results:
set.seed(42)
# Reading processed input GC-MS data of SRM samples
unfData <- read.csv("Unfolded_GC-MS_data_of_SRM.csv", header = T)
rownames(unfData) <- unfData[, 1] #assigning the row names as sample names
unfData <- unfData[, -1] #excluding the first column that contains sample names
unfData <- as.matrix(unfData)
# Reading reference category information of SRM samples
sampleIDs <- read.csv("SRM_categorization_info.csv", header = T)
# Missing data imputation with zeros:
unfData[is.na(unfData)] <- 0
# Input parameter options: 1. classNum: Number of classes / Options: 3, 9 or 16 classes for SRM grouping 2. option_permutation: Character flag deciding
# whether to permute the sample labels / Options: 'Y' or 'y' for yes / 'N' or 'n' for no
# Input arguments:
classNum <- 3
option_permutation <- "y" #options: 'Y' or 'y' for yes / 'N' or 'n' for no
permNumber <- 1000
# Attaches the relevant labels (based on the input class number) to the data table:
if (classNum == 3) {
selTable <- as.data.frame(cbind(Status = sampleIDs$Index_3class, file = sampleIDs$File, material = sampleIDs$Name))
}
if (classNum == 9) {
selTable <- as.data.frame(cbind(Status = sampleIDs$Index_9class, file = sampleIDs$File, material = sampleIDs$Name))
}
if (classNum == 16) {
selTable <- as.data.frame(cbind(Status = sampleIDs$Index_16class, file = sampleIDs$File, material = sampleIDs$Name))
}
# Scales the data row-wise:
pre1_mergedTable <- matrix(ncol = dim(unfData)[2], nrow = dim(unfData)[1])
rownames(pre1_mergedTable) <- rownames(unfData)
colnames(pre1_mergedTable) <- colnames(unfData)
for (i in 1:dim(unfData)[2]) {
for (j in 1:dim(unfData)[1]) {
minVal <- min(as.matrix(unfData[j, ]))
maxVal <- max(as.matrix(unfData[j, ]))
pre1_mergedTable[j, i] <- (unfData[j, i] - minVal)/(maxVal - minVal)
}
}
# Removes the columns with sd = 0 :
pre2_mergedTable <- pre1_mergedTable[, as.numeric(which(apply(pre1_mergedTable, 2, sd) > 0))]
# Scales the data column-wise:
pre3_mergedTable <- matrix(nrow = dim(pre2_mergedTable)[1], ncol = dim(pre2_mergedTable)[2])
rownames(pre3_mergedTable) <- rownames(pre2_mergedTable)
colnames(pre3_mergedTable) <- colnames(pre2_mergedTable)
for (j in 1:dim(pre2_mergedTable)[1]) {
for (i in 1:dim(pre2_mergedTable)[2]) {
minVal <- min(as.matrix(pre2_mergedTable[, i]))
maxVal <- max(as.matrix(pre2_mergedTable[, i]))
pre3_mergedTable[j, i] <- (pre2_mergedTable[j, i] - minVal)/(maxVal - minVal)
}
}
mergedTable <- as.data.frame(cbind(pre3_mergedTable, Status = selTable$Status, material = selTable$material))
colNames <- c(colnames(pre2_mergedTable), "Status", "material") #using the retained (nonzero-sd) feature names
colnames(mergedTable) <- colNames
# Formatting the data table (specifically - cleaning the column names):
colnames(mergedTable) <- gsub("-", "", colnames(mergedTable))
colnames(mergedTable) <- gsub("/", "", colnames(mergedTable))
colnames(mergedTable) <- gsub("[(]", "", colnames(mergedTable))
colnames(mergedTable) <- gsub("[)]", "", colnames(mergedTable))
colnames(mergedTable) <- gsub("[,]", "", colnames(mergedTable))
mergedTable$Status <- as.factor(mergedTable$Status)
# Training Random Forest classifier for each SRM substance separately
confMat_all <- vector(mode = "list", length = length(unique(mergedTable$material)))
for (m in 1:length(unique(mergedTable$material))) {
# length(unique(mergedTable$material)) = 20 SRM substances
trainTable <- mergedTable[mergedTable$material != unique(mergedTable$material)[m], 1:(dim(mergedTable)[2] - 1)]
testTable <- mergedTable[mergedTable$material == unique(mergedTable$material)[m], 1:(dim(mergedTable)[2] - 1)]
seeds <- vector(mode = "list", length = 61)
for (i in 1:60) seeds[[i]] <- sample.int(1000, 60)
seeds[[61]] <- sample.int(1000, 1)
control <- trainControl(method = "LOOCV", search = "grid", seeds = seeds)
tunegrid <- expand.grid(.mtry = c(1:(dim(trainTable)[2] - 1)))
rf_random <- train(Status ~ ., data = trainTable, method = "rf", metric = "Accuracy", tuneGrid = tunegrid, trControl = control)
optimal_mtry <- rf_random$bestTune$mtry
# Permutes the data labels if option_permutation is 'y' or 'Y':
if (option_permutation %in% c("n", "N")) {
loopNum <- 1
confMat <- vector(mode = "list", length = 1)
# getting test results for 3 replicates of each excluded SRM substance
modelResults <- get_modelResults(testTable, trainTable, optimal_mtry)
confMat[[1]] <- modelResults
} else if (option_permutation %in% c("y", "Y")) {
confMat <- vector(mode = "list", length = permNumber)
loopNum <- permNumber
for (l in 1:loopNum) {
testTable$Status <- as.factor(sample(testTable$Status))
modelResults <- get_modelResults(testTable, trainTable, optimal_mtry)
confMat[[l]] <- modelResults
}
} else if (option_permutation %notin% c("y", "Y", "n", "N")) {
print("Invalid input for label permuting option!")
}
confMat_all[[m]] <- confMat
}
save(confMat_all, file = paste("permuted_confMatrix_", classNum, "class.dat", sep = ""))
# calculation of accuracy & p-values:
totalSample_size <- 60 #3 replicates of 20 SRM substances
# loading the original confusion matrix results:
load(paste("original_confMatrix_", classNum, "class.dat", sep = ""))
notPerm_conf <- confMat_all[[1]][[1]] #assigning the confusion matrix of the first SRM substance sample predictions
for (i in 2:20) {
# appending the confusion matrices of the other SRM substance sample predictions
notPerm_conf <- notPerm_conf + confMat_all[[i]][[1]]
}
# calculation of classification accuracy from the overall confusion matrix
notPerm_acc <- sum(diag(notPerm_conf))/(as.numeric(totalSample_size)) * 100
# loading the permuted confusion matrix results:
load(paste("permuted_confMatrix_", classNum, "class.dat", sep = ""))
perm_acc <- NULL
each_perm_confMat <- list()
formatted_confMat <- list()
pre_formatted_confMat <- list()
# re-formatting the confusion matrix arrangement:
for (i in 1:1000) {
for (j in 1:20) {
dummy <- confMat_all[[j]][[i]]
pre_formatted_confMat[[j]] <- dummy
}
formatted_confMat[[i]] <- pre_formatted_confMat
}
# obtaining the overall confusion matrix at each permutation
total_perm_confMat <- matrix(0, nrow = classNum, ncol = classNum)
for (i in 1:1000) {
each_perm_confMat[[i]] <- Reduce("+", formatted_confMat[[i]])
perm_acc[i] <- sum(diag(each_perm_confMat[[i]]))/(as.numeric(totalSample_size)) * 100
total_perm_confMat <- total_perm_confMat + as.matrix(each_perm_confMat[[i]])
}
# calculation of the p-value:
histAcc <- hist(perm_acc, plot = FALSE)
pVal <- sum(histAcc$counts[which(histAcc$breaks >= notPerm_acc)], na.rm = TRUE)/1000
if (as.numeric(classNum) == 3) {
colnames(notPerm_conf) <- rownames(notPerm_conf) <- colnames(total_perm_confMat) <- rownames(total_perm_confMat) <- c("LRP", "Crude", "HRP")
}
print("Original Confusion Matrix:")
## [1] "Original Confusion Matrix:"
print(notPerm_conf)
## predictedVal
## LRP Crude HRP
## LRP 36 0 0
## Crude 3 0 6
## HRP 5 1 9
print(paste("Original Confusion Matrix Accuracy:", notPerm_acc, sep = " "))
## [1] "Original Confusion Matrix Accuracy: 75"
print("Average Confusion Matrix from 1000 permutations:")
## [1] "Average Confusion Matrix from 1000 permutations:"
print(total_perm_confMat/1000)
## predictedVal
## LRP Crude HRP
## LRP 35.980 0.012 0.008
## Crude 3.000 0.000 6.000
## HRP 5.604 2.311 7.085
print(paste("Mean Accuracy:", mean(perm_acc), sep = " "))
## [1] "Mean Accuracy: 71.775"
print(paste("Std Accuracy:", sd(perm_acc), sep = " "))
## [1] "Std Accuracy: 3.57046696462193"
print(paste("P-value:", pVal, sep = " "))
## [1] "P-value: 0.092"
sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
##
## Random number generation:
## RNG: Mersenne-Twister
## Normal: Inversion
## Sample: Rounding
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] formatR_1.7 kableExtra_1.1.0 RColorBrewer_1.1-2 caret_6.0-84 lattice_0.20-38 randomForest_4.6-14 ggplot2_3.2.1 ggdendro_0.1-20 dendextend_1.12.0 knitr_1.25
##
## loaded via a namespace (and not attached):
## [1] viridis_0.5.1 httr_1.4.1 viridisLite_0.3.0 splines_3.6.1 foreach_1.4.7 prodlim_2018.04.18 assertthat_0.2.1 highr_0.8 stats4_3.6.1 yaml_2.2.0 ipred_0.9-9
## [12] pillar_1.4.2 backports_1.1.4 glue_1.3.1 digest_0.6.20 rvest_0.3.4 colorspace_1.4-1 recipes_0.1.7 htmltools_0.3.6 Matrix_1.2-17 plyr_1.8.4 timeDate_3043.102
## [23] pkgconfig_2.0.2 purrr_0.3.2 scales_1.0.0 webshot_0.5.1 gower_0.2.1 lava_1.6.6 tibble_2.1.3 generics_0.0.2 withr_2.1.2 nnet_7.3-12 lazyeval_0.2.2
## [34] survival_2.44-1.1 magrittr_1.5 crayon_1.3.4 evaluate_0.14 nlme_3.1-140 MASS_7.3-51.4 xml2_1.2.2 class_7.3-15 tools_3.6.1 data.table_1.12.2 hms_0.5.1
## [45] stringr_1.4.0 profdpm_3.3 munsell_0.5.0 compiler_3.6.1 e1071_1.7-2 rlang_0.4.0 grid_3.6.1 iterators_1.0.12 rstudioapi_0.10 labeling_0.3 rmarkdown_1.15
## [56] gtable_0.3.0 ModelMetrics_1.2.2 codetools_0.2-16 reshape2_1.4.3 R6_2.4.0 gridExtra_2.3 lubridate_1.7.4 dplyr_0.8.3 zeallot_0.1.0 readr_1.3.1 stringi_1.4.3
## [67] Rcpp_1.0.2 vctrs_0.2.0 rpart_4.1-15 tidyselect_0.2.5 xfun_0.9