Load Required Libraries

Required Packages: "dendextend", "ggplot2", "ggdendro", "caret", "boot", "randomForest", "statip", "sm", "reshape", "pROC", "stats". Their dependencies may also be required.
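A minimal loading sketch, assuming all of the listed packages are already installed:

library(dendextend)
library(ggplot2)
library(ggdendro)
library(caret)
library(boot)
library(randomForest)
library(statip)
library(sm)
library(reshape)
library(pROC)
library(stats)  # ships with base R and is normally attached by default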

Pre-Processing Functions

This part of the code reads in the name files and pre-processes the experimental dataset. The first pre-processing function removes redundant columns (the ch00 channels) and inactive compounds, and separates out the technical replicates of the media and controls.

data_names <- read.csv("./Reference_ER_Activity.csv", header = T)
feature_names <- read.csv("./Feature_Names.csv", header = T)

ERData_process <- function(data_set) {
    # Order Data Sets based on compound ID
    data_set <- data_set[order(data_set$compound, decreasing = F), ]
    
    # Drop columns with channel 00
    data_set <- data_set[, -grep("ch00", colnames(data_set))]
    
    # Separate out Negative Control - Media
    media <- data_set[data_set$compound %in% "DMSO", ]
    data_set <- data_set[!(data_set$compound %in% "DMSO"), ]
    
    # Separate out Positive Control - E2 (Estradiol)
    e2 <- data_set[data_set$compound %in% "E2", ]
    data_set <- data_set[!(data_set$compound %in% "E2"), ]
    
    # Separate out 4HT
    ht4 <- data_set[data_set$compound %in% "4HT", ]
    data_set <- data_set[!(data_set$compound %in% "4HT"), ]
    
    # Remove Inactive Compounds in one pass (logical indexing avoids the
    # -which() pitfall of dropping every row when an ID is absent)
    inactive_ids <- c("A02", "A03", "B06", "C02", "C05", "C06", "D03", "D06", 
        "E03", "F01", "F02", "H05", "G02")
    data_set <- data_set[!(data_set$compound %in% inactive_ids), ]
    
    # Merge masked compound IDs with the correct names and ER activity calls
    idx <- match(data_set$compound, data_names$Short_ID)
    expand_names <- data.frame(Preferred.Name = as.character(data_names$Preferred_Name[idx]), 
        ER.Activity = as.character(data_names$ER.Activity[idx]), stringsAsFactors = FALSE)
    
    # Create a new data frame with experimental features only
    clean_set <- data_set[, -(1:6)]
    media_set <- media[, -(1:6)]
    e2_set <- e2[, -(1:6)]
    
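    # Swap the original feature names for the shortened names in feature_names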
    for (k in 1:length(colnames(clean_set))) {
        if (colnames(clean_set)[k] == feature_names$Original.Feature.Names[k]) {
            colnames(clean_set)[k] <- as.character(feature_names$Even.Reduced.Names[k])
        }
    }
    
    return(list(clean_set = clean_set, expand_names = expand_names, e2_set = e2_set, media_set = media_set))
}

Data Normalization

This function normalizes the experimental datasets before model building: each feature is centered at the median of the E2 positive-control responses and scaled by the E2 mean absolute deviation (MAD) for that feature.

# Data Normalization with respect to mean absolute deviation
normalize_mad <- function(clean_set, e2_set) {
    # Mean absolute deviation of each feature in the E2 control (around the feature mean)
    mad_e2 <- colMeans(abs(sweep(as.matrix(e2_set), 2, colMeans(e2_set))))
    
    # Center each feature at the E2 median and scale by its E2 MAD
    med_e2 <- apply(e2_set, 2, median)
    df_mad <- as.data.frame(sweep(sweep(as.matrix(clean_set), 2, med_e2), 2, mad_e2, "/"))
    colnames(df_mad) <- colnames(clean_set)
    return(df_mad)
}
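A hypothetical call wiring the two helper functions together (raw_replicate is a placeholder for one replicate's well data, not an object from the original analysis):

# Pre-process one replicate, then normalize its features against the E2 control
proc <- ERData_process(raw_replicate)
norm_features <- normalize_mad(proc$clean_set, proc$e2_set)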

Confidence Interval Calculations in Logistic Regression

This function is used in the confidence interval calculations of the logistic regression model parameters.

bs <- function(formula, data, indices) {
    d <- data[indices, ]  # allows boot to select sample
    fit <- glm(formula, family = binomial(link = "logit"), data = d)
    return(coef(fit))
}
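boot() invokes the statistic as bs(data, indices) by position and forwards formula to it by name, which is why bs takes the formula as its first argument. A hedged usage sketch (train_set and the formula are placeholder assumptions, not objects from the original analysis):

# Bootstrap the coefficients; train_set is a placeholder data frame with a 0/1 Class.Info column
boot_res <- boot(data = train_set, statistic = bs, R = 1000, formula = Class.Info ~ .)
boot.ci(boot_res, type = "bca", index = 2)  # 95% CI for the second coefficient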

Model Performance Calculations

This function calculates model performance metrics: accuracy, sensitivity, specificity, balanced accuracy, and the 95% confidence interval for the prediction accuracy.

mperform <- function(fit_model, valid_data_numeric, valid_data_class, type = "logit") {
    if (type == "logit") {
        # Predicted probabilities from the logistic model, thresholded at 0.5
        predict_test <- round(predict(fit_model, valid_data_numeric, type = "response"), 3)
        predicted_class <- factor(round(predict_test))
    } else if (type == "rf") {
        # Random forest predicts class labels directly
        predicted_class <- predict(fit_model, valid_data_numeric)
    }
    conf_valid <- confusionMatrix(predicted_class, valid_data_class$Class.Info, positive = "0", dnn = c("Predicted", "True"))
    Accuracy <- round(as.numeric(conf_valid$overall[1]), 2)
    Sensitivity <- round(as.numeric(conf_valid$byClass[1]), 2)
    Specificity <- round(as.numeric(conf_valid$byClass[2]), 2)
    BA <- round(as.numeric(conf_valid$byClass[11]), 2)
    CI_Lower <- round(as.numeric(conf_valid$overall[3]), 2)
    CI_Upper <- round(as.numeric(conf_valid$overall[4]), 2)
    summary.res <- data.frame(Accuracy = Accuracy, CI.Lower = CI_Lower, CI.Upper = CI_Upper, Sensitivity = Sensitivity, Specificity = Specificity, Balanced.Accuracy = BA)
    return(summary.res)
}
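A hypothetical call for each model type (all object names are placeholders: fitted glm/randomForest models, a hold-out feature frame, and a class frame with a factor Class.Info column):

logit_metrics <- mperform(logit_fit, valid_x, valid_class, type = "logit")
rf_metrics <- mperform(rf_fit, valid_x, valid_class, type = "rf")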

Reading Experimental Datasets

# Read the 18 replicate files: plate A holds replicates 4-9, plate B 10-15, plate C 16-21
plate <- rep(c("A", "B", "C"), each = 6)
data_sets <- vector("list", 18)
for (j in 1:18) {
    data_sets[[j]] <- read.csv(paste0("./EPA_Reps_CSV/20190531", plate[j], "_Rep_", (j + 3), "_Well_Data_updated_cmpnd.csv"), header = T)
}

Data Pre-processing

Check Missing Data

for (i in 1:length(data_sets)) {
    # data_sets holds the raw data frames at this point, so check them directly
    # (the $clean_set component only exists after ERData_process() is run)
    print(sum(is.na(data_sets[[i]])))
}
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0

Data Cleaning

Call the pre-processing function for data cleaning.

# Apply the pre-processing function to every replicate
processed_data <- lapply(data_sets, ERData_process)
# Replicate 8 (index = 5) is selected for analysis
new_frame <- data.frame(processed_data[[5]]$clean_set, Preferred.Name = processed_data[[5]]$expand_names$Preferred.Name)

First 10 rows and 5 columns of the processed experimental data used for analysis:

Cell.Area  Cell.Circularity  Nucleus.Area  Nucleus.Circularity  Fraction.Localized.In.Nucleus
  7591.96          0.953436       2096.64              1.13147                       0.832444
  7570.25          0.949877       2031.14              1.13205                       0.824702
  6815.69          0.957420       2008.79              1.14522                       0.834381
  6944.24          0.956784       2009.98              1.14160                       0.844161
  7222.53          0.946303       2117.25              1.13752                       0.811180
  6582.49          0.937830       2077.13              1.13671                       0.824058
  6742.16          0.944760       2202.86              1.14404                       0.835256
  6920.76          0.939669       2124.64              1.14398                       0.819195
  6848.19          0.949418       2019.10              1.15171                       0.784999
  7118.88          0.947879       2140.21              1.15036                       0.776418

Outlier Detection

# Average the raw biological replicates per compound for outlier detection
aggregated_res <- aggregate(new_frame[, -dim(new_frame)[2]], by = list(new_frame$Preferred.Name), FUN = mean)
rownames(aggregated_res) <- aggregated_res$Group.1

aggregate_numeric <- aggregated_res[, -1]
# Cluster results and visualize it with a dendrogram
hc <- hclust(dist(aggregate_numeric, method = "euclidean"), method = "complete")
dd.row <- as.dendrogram(hc)
ddata_x <- dendro_data(dd.row, type = "rectangle")

# Vertical Dendrogram
ggplot(segment(ddata_x)) + 
    geom_segment(aes(x = x, y = y, xend = xend, yend = yend)) + 
    geom_text(data = label(ddata_x), aes(label = label, x = x, y = 0), 
        hjust = 0, nudge_y = max(segment(ddata_x)$y) * 0.01, size = 5) + 
    labs(x = "", y = "") + 
    coord_flip() + 
    scale_y_reverse(expand = c(1, 1)) + 
    theme(legend.position = "none", 
        axis.text.y = element_blank(), axis.line.y = element_blank(), 
        axis.ticks.y = element_blank(), axis.ticks.x = element_blank(), 
        axis.text.x = element_blank(), axis.title.x = element_blank(), 
        panel.background = element_rect(fill = "white"), panel.grid = element_blank())
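The dendrogram above is inspected by eye; as one optional sketch of making the outlier call explicit, the tree can be cut with cutree() and singleton clusters flagged (the group count k below is an illustrative assumption, not a choice from the original analysis):

# Cut the complete-linkage tree into k groups; compounds left alone in a cluster are outlier candidates
groups <- cutree(hc, k = 4)  # k = 4 is arbitrary here
sizes <- table(groups)
names(groups)[groups %in% as.integer(names(sizes)[sizes == 1])]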