####################################################### "Package Installation" #######################################################span>
############# Uncomment the following lines if any of the packages are not already installed or need to be reinstalled. #############
# BiocManager packages
# Install required Bioconductor packages
# BiocManager::install("sva", force = TRUE)
# BiocManager::install("GeneMeta", force = TRUE)
# BiocManager::install("ComplexHeatmap", force = TRUE)
# BiocManager::install("fgsea", force = TRUE)
# BiocManager::install("clusterProfiler", force = TRUE)
# Other packages
# Install required CRAN packages:
# install.packages("RColorBrewer")
# install.packages("msigdbr")
# install.packages("ggpubr")
# install.packages("reshape2")
# install.packages("caret")
# install.packages("rgl")
# sva package installation
# Install "sva" package using BiocManager
# if (!require("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
# BiocManager::install("sva")
# VSN package installation
# Install "vsn" package using BiocManager
# if (!require("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
# BiocManager::install("vsn")
# tidyverse package installation
# install.packages("tidyverse")
# GEOquery package installation
# Install "GEOquery" package using BiocManager
# if (!require("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
# BiocManager::install("GEOquery")
# limma package installation
# install.packages("limma")
# plotly package installation
# install.packages("plotly")
# ggvenn package installation
# install.packages("ggvenn")
# ggpubr package installation
# install.packages("ggpubr")
# rgl package installation
# For rgl installation, require remotes package first
# install.packages("remotes")
# require(remotes)
# remotes::install_github("dmurdoch/rgl")
# Package Loading
# Load the required packages for data analysis and visualization.
require(tidyverse) # Package for comprehensive data manipulation and visualization, including dplyr, ggplot2, and other useful packages
require(GEOquery) # Package for accessing and retrieving gene expression data from the Gene Expression Omnibus (GEO) database
require(reshape2) # Package for reshaping data from wide to long format and vice versa, useful for data preprocessing
require(caret) # Package for machine learning and predictive modeling, providing a unified interface for various algorithms and performance evaluation
require(GeneMeta) # Package for meta-analysis of gene expression data, allowing combining results from multiple studies
require(limma) # Package for analyzing microarray data using linear models, including differential expression analysis
require(rgl) # Package for creating interactive 3D visualizations, useful for exploring complex data structures
require(sva) # Package for surrogate variable analysis, used to identify and adjust for unwanted variation in high-dimensional datasets
require(plotly) # Package for creating interactive and dynamic plots, including scatter plots, bar plots, and more
require(ggvenn) # Package for creating Venn diagrams using ggplot2, useful for visualizing set relationships
require(ggpubr) # Package for creating publication-ready plots using ggplot2, providing additional customization options
require(ComplexHeatmap) # Package for creating complex heatmaps, allowing visualization of multiple layers of data and annotations
require(RColorBrewer) # Package for providing color palettes suitable for data visualization
require(msigdbr) # Package for accessing and using the Molecular Signatures Database (MSigDB) for gene set enrichment analysis
require(fgsea) # Package for performing gene set enrichment analysis (GSEA) using fast gene set testing algorithms
require(locfit) # Package for local regression modeling, useful for fitting flexible curves to data
require(vsn) # Package for variance stabilization and normalization, specifically for microarray data analysis
######################################################################################################################################
####################################################### 1- DownloadGEO Function#######################################################
# DownloadGEO: Downloads a Gene Expression Omnibus (GEO) dataset and prepares it for analysis
# Input (Arguments):
# - GSEname: A character vector specifying the GEO dataset name (default: "GSE25065")
# Output:
# - Extracted variables saved as separate files (matrix.csv, phenotype.tsv, annot.tsv)
# Function Description:
# This function downloads a specified GEO dataset from Gene Expression Omnibus (GEO) and prepares it for analysis.
# It creates a folder to store the downloaded dataset, increases memory allocation for efficient processing,
# and downloads the specified GEO dataset. It then extracts the gene expression matrix, phenotype information,
# and annotation data from the downloaded dataset. The extracted variables are saved as separate files.
DownloadGEO <- function(GSEname = "GSE25065"){
# Create a folder to store the downloaded dataset
dir.create(GSEname, showWarnings = FALSE)
# Increase the memory allocation for efficient processing
Sys.setenv("VROOM_CONNECTION_SIZE"=131072 *131072)
#Download dataset
gset <- getGEO(GSEname, GSEmatrix = TRUE, AnnotGPL = TRUE,
destdir = paste0(getwd(), "/", GSEname))
if(!is.null(gset)){
# Extract the gene expression matrix, phenotype information, and annotation data:
# Extract the gene expression matrix from the downloaded dataset
matrix <- gset[[paste0(GSEname, "_series_matrix.txt.gz")]]@assayData[["exprs"]]
# Check if the matrix has undergone log2 transformation
qx <- as.numeric(quantile(matrix, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm= TRUE))
LogC <- (qx[5] > 100) || (qx[6]-qx[1] > 50 && qx[2] > 0)
if(!LogC){
# Reverse the log2 transformation if applied
matrix <- 2^(matrix)
}
# Add row names to the matrix and set the column name for the identifier column
matrix <- cbind(rownames(matrix), matrix)
colnames(matrix)[1] <- "ID"
# Extract the phenotype information from the downloaded dataset
phenotype <- gset[[paste0(GSEname, "_series_matrix.txt.gz")]]@phenoData@data
phenotype <- cbind(rownames(phenotype), phenotype)
colnames(phenotype)[1] <- "ID"
# Extract the annotation data from the downloaded dataset
annot<- gset[[paste0(GSEname, "_series_matrix.txt.gz")]]@featureData@data
# Write the extracted variables to separate files:
# Write the gene expression matrix to a CSV file
write.csv(matrix, paste0(GSEname, "/", "matrix.csv"), quote = FALSE, row.names = FALSE)
# Write the phenotype information to a tab-separated values (TSV) file
write_tsv(phenotype, paste0(GSEname, "/", "phenotype.tsv"))
# Write the annotation data to a TSV file
write_tsv(annot, paste0(GSEname, "/", "annot.tsv"))
# Remove unnecessary variables from memory
rm(gset, matrix, phenotype, annot)
# Display a message indicating that the dataset is ready for analysis
message ("+++++ Datatset is ready to analyse. +++++")
}
}
######################################################################################################################################
######################################################### 2- ReadGEO Function#########################################################
# ReadGEO: Reads the gene expression matrix file for a given project
# Input (Arguments):
# - projname: A character vector specifying the project name
# Output:
# - A data frame containing the gene expression matrix
# Function Description:
# This function reads the gene expression matrix file for a specified project.
# It takes the argument `projname`, which is a character vector specifying the project name.
# The function reads the matrix file in CSV format using the `read_csv` function and stores it in the variable `matrix`.
# The gene expression matrix is assumed to be stored in a file named "matrix.csv" in the project folder.
# Finally, the matrix is converted to a data frame using the `data.frame` function before being returned as the output.
# The function does not modify the original matrix file; it only reads the file and returns the matrix as a data frame.
ReadGEO <- function(projname){
# Reading the gene expression matrix file
matrix <- read_csv (paste0(projname , "/", "matrix.csv"))
# Converting the matrix to a data frame
matrix <- data.frame (matrix)
}
######################################################################################################################################
###################################################### 3- MakePhenotype Function #####################################################
# MakePhenotype: Creates a phenotype file based on project-specific phenotype data
# Input (Arguments):
# - projname: A character vector of project names
# - compare: A string indicating whether to compare "subtype" or "grade"
# Output:
# - A phenotype file containing the selected comparison (subtype or grade)
# Function Description:
# This function creates a phenotype file based on project-specific phenotype data. It takes the argument `projname`,
# which is a character vector of project names. The function reads the phenotype file for each project, converts
# the phenotype data to a data frame, and sets row names. Depending on the value of the `compare` argument, the function
# either compares subtype or grade information. For subtype comparison, it extracts subtype data, formats it, and stores
# it in a new list. For grade comparison, it extracts grade data, handles special cases (such as Greek symbols), and
# formats it. The formatted phenotype data is stored in a new list for each project. The function then combines the
# phenotype data from all projects into a single data frame. Finally, it writes the phenotype data to a CSV file and
# displays a message indicating that the phenotype file is created.
Makephenotype <- function(projname , compare= "grade"){
# Create a folder to store the phenotype file
dir.create("phenotype", showWarnings = FALSE)
# Initialize an empty list to store the phenotype data for each project
pheno <-list()
# Iterate over each project name
for(i in seq_along(projname)){
# Read the phenotype file for the current project
phenotype <- read_tsv(paste0(projname [i], "/", "phenotype.tsv"))
# Convert the phenotype data to a data frame and set row names
phenotype <- data.frame (phenotype)
rownames(phenotype) <- phenotype[,1]
phenotype <- phenotype[,-1]
# Add the phenotype data to the list using the project name as the key
pheno[[projname [i]]] <- phenotype
}
# Choose either subtype or grade for comparison
if(compare == "subtype"){
pheno2 <-list()
# Iterate over each project in the phenotype list
for (i in seq_along(pheno)){
# Find columns related to subtype information (e.g., pam50)
phecoln <- grep("pam50", pheno[[i]])
# Check if any subtype columns were found
if(length(phecoln) > 0){
# Extract the subtype data and format it
pheno1 <- data.frame (pheno[[i]][ ,phecoln])
colnames(pheno1) <- "subtype"
pheno1$subtype <- gsub("pam50.+:", "", pheno1$subtype)
# Prepare the final phenotype data by adding sample identifiers and column names
pheno1 <- cbind(rownames(pheno[[i]]), pheno1)
colnames(pheno1)[1] <- "Sample"
# Store the formatted phenotype data in a new list using the project name as the key
pheno2[[names(pheno)[i]]] <- pheno1
}
}
# Combine the phenotype data from all projects into a single data frame
pheno3 <- do.call(rbind, pheno2)
rownames(pheno3) <- NULL
} elseif(compare == "grade"){
pheno2 <-list()
# Iterate over each project in the phenotype list
for (i in seq_along(pheno)){
# Find columns related to grade information
gradecoln <- grep("grade:", pheno[[i]])
# Check if any grade columns were found
if(length(gradecoln) > 0){
# Extract the grade data and format it
pheno1 <- data.frame (pheno[[i]][ ,gradecoln])
colnames(pheno1) <- "grade"
pheno1$grade <- gsub("grade:|grade: |B-R grade: ", "", pheno1$grade)
pheno1$grade <- gsub("=.+", "", pheno1$grade)
# Handle special cases where grades are represented as Greek symbols
greek <- grep("III", pheno1$grade)
if(length(greek) > 0){
one <- which(factor(pheno1$grade) == "I")
two <- which(factor(pheno1$grade) == "II")
three <- which(factor(pheno1$grade) == "III")
pheno1[one, ] <- "1"
pheno1[two, ] <- "2"
pheno1[three, ] <- "3"
}
# Prepare the final phenotype data by adding sample identifiers and column names
pheno1 <- cbind(rownames(pheno[[i]]), pheno1)
colnames(pheno1)[1] <- "Sample"
# Store the formatted phenotype data in a new list using the project name as the key
pheno2[[names(pheno)[i]]] <- pheno1
}
}
# Combine the phenotype data from all projects into a single data frame
pheno3 <- do.call(rbind, pheno2)
rownames(pheno3) <- NULL
}
# Write the phenotype data to a CSV file
write.csv(pheno3, paste0("phenotype", "/", "phenotype_",compare , ".csv"), row.names = FALSE, quote = FALSE)
# Display a message indicating that the phenotype file is created
message ("phenotype file is created into phenotype folder")
}
######################################################################################################################################
####################################################### 4- MakeBoxPlot Function ######################################################
# MakeBoxPlot: Creates a box plot to visualize gene expression values from a gene expression matrix
# Input (Arguments):
# - GEOmatrix: A gene expression matrix in data frame format
# - projname: A character vector specifying the project name
# Output:
# - A box plot visualizing the gene expression values
# Function Description:
# This function creates a box plot to visualize the gene expression values from a gene expression matrix.
# It takes two arguments: `GEOmatrix`, which is the gene expression matrix in data frame format, and `projname`,
# which is a character vector specifying the project name. The function converts the gene expression matrix into
# a long format data frame using the `melt` function. It then uses `ggplot2` to create the box plot, with the
# x-axis representing the samples and the y-axis representing the gene expression values. The plot is customized
# with color and fill options, axis labels, y-axis scale labels, and theme settings. The resulting box plot
# is displayed, with the subtitle indicating the project name.
MakeBoxPlot <- function(GEOmatrix,projname){
# Convert the gene expression matrix into a long format data frame
melted <- melt(GEOmatrix, id.vars= "ID")
# Define a color palette for the plot
my_pal <- c("#1B9E77")
# Plot the box plot using ggplot2
ggplot(data = melted, aes(x = variable, y = value)) +
# Add the box plot layer with color and fill options
geom_boxplot(aes(color = "#45B39D", fill= "#45B39D"), outlier.alpha = 0.1)+
# Set the x and y axis labels
labs(x= 'SampleID', y= 'Values') +
# Customize the y-axis scale labels
scale_y_continuous(labels = scales::comma) +
# Apply a classic theme to the plot
theme_classic() +
# Set the color scale manually
scale_color_manual(values=c(my_pal)) +
# Set the fill scale manually with a modified transparency value
scale_fill_manual(values=c(paste(my_pal, "66", sep = ""))) +
# Customize the appearance of the axes, text, and title
theme(
# Customize axis labels and text styles:
# Customize axis text font, size, color, and angle
axis.text = element_text(family = "Arial",size = 24 , colour = "black", angle = 90),
# Customize x-axis text font, color, and size
axis.text.x = element_text(family = "Arial",colour = "black", size = 6),
# Customize y-axis text font, color, and size
axis.text.y = element_text(family = "Arial",colour = "black", size = 12),
# Customize plot subtitle font, size, color, and horizontal justification
plot.subtitle = element_text(family = "Arial",size = 24, colour = "black", hjust = 0.5),
# Customize y-axis title font, size, and angle
axis.title.y = element_text(family = "Arial", size = 24, angle = 90),
# Customize x-axis title font, size, and angle
axis.title.x = element_text(family = "Arial", size = 24, angle = 00),
# Remove the legend from the plot
legend.position="none"
) +
# Add a subtitle to the plot indicating the project name
labs(subtitle = paste0("Box plot - ", projname))
}
######################################################################################################################################
####################################################### 5- the MakePCA Function ######################################################
# MakePCA: Performs principal component analysis (PCA) and generates a plot
# Input (Arguments):
# - GEOmatrix: A gene expression matrix in data frame format
# - studies: A list of gene expression matrices from different studies (optional)
# - projname: A character vector specifying the project name(s)
# - compare: A character specifying the comparison type (e.g., "grade")
# Output:
# - A PCA plot or a 3D plot (based on the number of studies)
# Function Description:
# This function performs principal component analysis (PCA) on the gene expression matrix provided.
# It takes the gene expression matrix, `GEOmatrix`, as the primary input. Additionally, it can take a
# list of gene expression matrices from different studies, `studies`, the project name(s), `projname`,
# and the comparison type, `compare` (e.g., "grade"). The function first prepares the data by scaling
# the gene expression matrix and performing PCA using the `preProcess` function from the `caret` package.
# It then plots the PCA results either as a 2D plot for multiple studies or a 3D plot for a single study.
# The plot is customized with colors, legends, axis labels, and theme settings. The resulting PCA plot
# or 3D plot is displayed or saved as an HTML widget based on the number of studies provided.
MakePCA<- function(GEOmatrix, studies = NULL , projname , compare = "grade"){
# Create a directory to store the plots
dir.create("Plots", showWarnings = FALSE)
# Read the gene expression matrix file
set.seed(123)
# Check if the first column of GEOmatrix is "ID"
if(colnames(GEOmatrix)[1] == "ID"){
# Set the row names of GEOmatrix to the first column and remove the ID column
rownames(GEOmatrix) <- GEOmatrix[,1]
GEOmatrix <- GEOmatrix[,-1]
}
# Read the phenotype file
pheno <- read_csv (paste0("phenotype/phenotype_", compare , ".csv"))
pheno <- data.frame (pheno)
# Subset the phenotype data for samples present in the gene expression matrix
paired_pheno <- which(pheno$Sample %in% colnames(GEOmatrix))
pheno <- pheno[paired_pheno, ]
# Perform PCA analysis:
# Convert the gene expression matrix to a data frame and transpose it
GEOmatrix <- data.frame (t(GEOmatrix))
# Scale the data for PCA
pca_df <- scale(GEOmatrix)
# Create a PCA model
pca <- preProcess(x = pca_df, method = 'pca', pcaComp = 3)
# Apply the PCA transformation to the data
pca_df <- data.frame (predict(pca, pca_df))
# Sort the PCA data frame and the phenotype data frame by row names
pca_df <- pca_df[order(rownames(pca_df)), ]
pheno <- pheno[order(pheno$Sample), ]
# Handling missing values in phenotype data:
# Identify rows with NA values in the "grade" column
na_row <- which(is.na(pheno$grade))
# Execute the following code if there are NA values in the row
if(length(na_row) > 0){
# Get the sample names with NA values
na_samp <- pheno[na_row,1]
# Identify the corresponding columns in the PCA data frame
na_colm <- which(rownames(pca_df) %in% na_samp)
# Remove rows with NA values from the phenotype data frame and corresponding columns from the PCA data frame
pheno <- pheno[-na_row, ]
pca_df <- pca_df[-na_colm, ]
}
# Assign class labels to PCA data:
# Create a factor variable for class labels based on the "grade" column
pca_df$Class <- factor(pheno$grade)
# Define a color palette for the 2D plot for multi-studies
my_pal <- c("#7570B3", "#E7298A", "#66A61E", "#E6AB02", "#A6761D", "#666666", "#9A7D0A")
# Execute the following code if the length of projname is greater than 1
if(length(projname)>1){
# Get the row names of the PCA data frame
pca_rownames <- rownames(pca_df)
# Create a batch variable for different studies
study_list <- studies
batch <-list()
# Iterate over each element in study_list
for(i in seq_along(study_list)){
# Determine the number of samples from each study present in the PCA data frame
l <- length(which(colnames(study_list[[i]]) %in% pca_rownames))
# Create a list with repeated project names for each sample from the corresponding study
batch[[i]] <- rep (projname [i], l)
}
# Combine the list into a single vector
batch <- unlist(batch)
# Assign the batch information as a factor variable in the PCA data frame
pca_df$studies <- factor(batch)
# Generate the PCA plot using ggplot2
ggplot(data = pca_df, aes(x = PC1, y = PC2, color = Class, fill = Class)) +
# Add points with shape based on the "Studies" variable
geom_point(size = 3, aes( shape= studies)) +
# Customize color scale for class labels
scale_color_manual(values=c(my_pal)) +
# Customize fill colors for class labels
scale_fill_manual(values=c(paste(my_pal, "66", sep = ""))) +
# Apply a classic theme to the plot
theme_classic() +
# Customize the appearance of the axes, text, and title
theme(
# Customize axis text font, size, and color
axis.text = element_text(family = "Arial",size = 24 , colour = "black"),
# Customize x-axis text font, color, and size
axis.text.x = element_text(family = "Arial",colour = "black", size = 24),
# Customize y-axis text font, color, and size
axis.text.y = element_text(family = "Arial",colour = "black", size = 24),
# Customize plot subtitle text font, size, color, and position
plot.subtitle = element_text(family = "Arial",size = 24, colour = "black", hjust = 0.5),
# Customize y-axis title font, size, and rotation angle
axis.title.y = element_text(family = "Arial", size = 24, angle = 90),
# Customize x-axis title font, size, and rotation angle
axis.title.x = element_text(family = "Arial", size = 24, angle = 00),
# Customize legend text size and font
legend.text = element_text(size = 10, family = "Arial"),
# Customize legend title size and font
legend.title = element_text(size = 20, family = "Arial")
) +
# Set the subtitle of the plot
labs(subtitle = "PCA plot- Merged studies")
} else {
# Plot a 3D plot for a single study:
# Plotting using the plot3d function
plot3d(
# Specify the coordinates for the 3D plot
x=pca_df$PC1, y=pca_df$PC2, z=pca_df$PC3,
# Set color based on the class labels
col = my_pal[pca_df$Class],
# Set the point type as "s" for spheres
type = 's',
# Set the radius of the spheres
radius = 4,
# Set the size of the spheres
size = 3,
# Set the axis labels
xlab="PC1", ylab="PC2", zlab="PC3"
)
# Add a legend to the plot
legend3d(
# Set the position of the legend
"topright",
# Set the legend labels based on the class labels
legend = levels(factor(pca_df$Class)),
# Set the legend title
title = compare ,
# Set the legend marker type
pch = 16,
# Set the legend marker colors
col = my_pal,
# Set the legend text size
cex=1.2,
# Set the inset position of the legend
inset=c(0.02)
)
# Save the 3D plot as an HTML widget:
# Combine project names into a single string
projname <- paste0(projname , collapse = "-")
# Save the 3D plot as an HTML widget using the saveWidget() function
htmlwidgets::saveWidget(
# Specify the dimensions of the widget
rglwidget(width = 800, height = 800),
# Set the file name
file = paste0("Plots/PCA_", projname , ".html"),
# Specify the library directory
libdir = "libs",
# Set the title of the HTML file
title = paste0("PCA - ", projname),
# Specify whether the HTML file should be self-contained
selfcontained = FALSE
)
}
}
######################################################################################################################################
##################################################### 6- VSNQuantilNorm Function######################################################
# VSNQuantilNorm: Performs VSN (Variance Stabilizing Normalization) followed by quantile normalization on the input gene expression matrix
# Input (Arguments):
# - GEOmatrix: The gene expression matrix to be normalized
# Output:
# - A data frame containing the normalized gene expression matrix
# Function Description:
# This function performs Variance Stabilizing Normalization (VSN) followed by quantile normalization on the
# input gene expression matrix, `GEOmatrix`. VSN is used to stabilize the variance of gene expression values
# across samples, and quantile normalization is applied to ensure that the distribution of gene expression values
# is the same across samples. The function first sets the row names of the gene expression matrix and removes the
# ID column. Then, it performs VSN normalization using the `normalizeVSN` function. After VSN normalization, the
# function applies quantile normalization by ranking and sorting each column of the normalized matrix. It calculates
# the mean across rows and applies an indexing function to retrieve the mean values using column indices. Finally,
# the function returns a data frame containing the normalized gene expression matrix, where the first column is
# named "ID" and row names are combined with the normalized matrix.
VSNQuantilNorm <- function(GEOmatrix){
# Set the seed for reproducibility of random processes
set.seed(1234)
# Reading the gene expression matrix and setting row names
rownames(GEOmatrix) <- GEOmatrix[,1]
GEOmatrix <- GEOmatrix[,-1]
# Normalize the gene expression data using the VSN (variance stabilization and normalization) method
vsnnorm <- normalizeVSN(GEOmatrix)
# Quantile normalization:
# Rank each column of the normalized matrix
df_rank <- apply(vsnnorm,2,rank,ties.method="min")
# Sort each column of the normalized matrix
df_sorted <- data.frame (apply(vsnnorm, 2, sort))
# Calculate the mean across rows
df_mean <- apply(df_sorted, 1, mean)
# Function to retrieve mean values using column indices
index_to_mean <- function(my_index, my_mean){
# Return the mean value at the specified index
return(my_mean[my_index])
}
# Apply the index_to_mean function to the ranked matrix to obtain the final normalized matrix
norm_final <- apply(df_rank, 2, index_to_mean, my_mean=df_mean)
# Convert the final normalized matrix into a data frame
norm_final <- data.frame (norm_final)
# Convert all columns of the data frame to numeric type
norm_finall <- data.frame (sapply(norm_final, function(x) as.numeric(as.character(x))), check.names= FALSE)
# Combine the row names of the original matrix with the normalized matrix
norm_finall <- cbind(rownames(GEOmatrix), norm_finall)
# Rename the first column as "ID"
colnames(norm_finall)[1] <- "ID"
# Return the normalized gene expression matrix
return(norm_finall)
}
######################################################################################################################################
###################################################### 7- MergeStudies function ######################################################
# MergeStudies: Merge multiple matrices
# Input (Arguments):
# - studies: A list of matrices to be merged
# Output:
# - The merged matrix
# Function Description:
# This function merges multiple matrices provided as a list, 'studies'. It uses the 'merge' function to combine
# the matrices, ensuring that all rows and columns from each matrix are included in the merged result by setting
# the 'all=TRUE' argument. The function returns the merged matrix, which contains the combined data from all
# input matrices.
Mergestudies <- function(studies){
# Merging two or more matrices using the 'merge' function with 'all=TRUE' to include all rows and columns
study_list <- Reduce(function(x, y) merge(x, y, all= TRUE), studies)
return(study_list)
}
######################################################################################################################################
#################################################### 8- StudyBatchEffect Function ####################################################
# StudyBatchEffect: Perform batch effect correction on merged matrices
# Input (Arguments):
# - studies: A list of matrices with batch effects
# Output:
# - The batch-corrected matrix
# Function Description:
# This function performs batch effect correction on a list of matrices provided in 'studies'.
# It calculates the number of samples for each matrix and creates a vector of batch sizes based on the
# number of samples in each matrix. The function then merges the matrices using the 'merge' function with
# 'all=TRUE' to include all rows and columns. Next, it sets the row names of the merged matrix and performs
# batch effect correction using the ComBat function. The corrected matrix is converted to a data frame,
# ensuring columns are of numeric type. Finally, the row names are combined with the corrected matrix,
# the first column is renamed as "ID", and the batch-corrected matrix is returned as the output of the function.
StudyBatchEffect <- function(studies){
# Calculating the number of samples for each matrix
study_list <- studies
# Create an empty list to store batches
batch <-list()
# Iterate over the indices of the study_list
for(i in seq_along(study_list)){
# Subtract 1 to exclude the column containing row names
batch[[i]] <- ncol(study_list[[i]]) - 1
}
# Flatten the list to create a vector of batch sizes
batch <- unlist(batch)
# Replicate the sequence of study indices based on batch sizes
batch <- rep (seq_along(study_list), batch)
# Merging matrices using the 'merge' function with 'all=TRUE' to include all rows and columns
study_list <- Reduce(function(x, y) merge(x, y, all= TRUE), study_list)
# Setting the row names of the merged matrix to the first column
rownames(study_list) <- study_list[,1]
# Exclude the first column containing row names
study_list <- study_list[,-1]
# Perform batch effect correction using the ComBat function
corrected_batch <- ComBat(
# Input data for batch effect correction
dat=study_list,
# Batch information for correction
batch=batch,
# No covariates specified for batch effect correction
mod=NULL ,
# Estimate empirical Bayes parameters
par.prior= TRUE,
# Disable diagnostic plots
prior.plots= FALSE)
# Convert the corrected matrix to a data frame
corrected_batch <- data.frame (corrected_batch)
# Convert columns of the data frame to numeric type
corrected_batch <- data.frame (
# Convert each column to numeric type using as.numeric(as.character(x))
sapply(corrected_batch, function(x) as.numeric(as.character(x))),
# Set check.names to FALSE to preserve column names
check.names= FALSE,
# Set row.names to the row names of the original corrected_batch object
row.names = rownames(corrected_batch))
# Combine the row names with the corrected matrix
corrected_batch <- cbind(rownames(corrected_batch), corrected_batch)
# Rename the first column as "ID"
colnames(corrected_batch)[1] <- "ID"
# Return the batch-corrected matrix as the output of the function
return(corrected_batch)
}
######################################################################################################################################
#################################################### 9- MakeDensityPlot Function ####################################################
# MakeDensityPlot: Create a density plot of gene expression values
# Input:
# - matrix: The gene expression matrix
# - studies: A list of matrices representing different studies
# Output:
# - None (the plot is displayed)
# Function Description:
# This function creates a density plot to visualize the distribution of gene expression values across different studies.
# It converts the gene expression matrix to a long format data frame, calculates the number of samples for each study,
# and adds a study factor variable to the data frame. The function then plots the density plot using ggplot2,
# with gene expression values on the x-axis and color-coded by studies. It customizes the appearance of the plot
# and adds axis labels and a subtitle. The density plot provides insights into the distribution patterns and
# potential differences in gene expression between studies.
MakeDensityPlot <- function(matrix, studies){
# Set the random seed for reproducibility
set.seed(1234)
# Convert the gene expression matrix to a long format data frame
melted <- melt(matrix, id.vars = "ID")
# Calculating the number of samples for each study:
# Create an empty list named 'st'
st <-list()
# Loop over the sequence of indices of the 'studies' vector
# The variable 'i' represents the current index in each iteration
# The loop iterates over each element in 'studies'
for(i in seq_along(studies)){
# Get the number of rows in the study matrix
row_n <- nrow(studies[[i]])
# Get the number of columns in the study matrix excluding the ID column
col_n <- ncol(studies[[i]]) -1
# Create a vector of study names repeated for each sample
st[[i]] <- rep (names(studies)[i], row_n*col_n)
}
st <- unlist(st)
# Add the study factor variable to the melted data frame
melted$studies <- factor(st)
# Plotting a density plot:
# Define a color palette for the plot
my_pal <- c("#7570B3", "#E7298A", "#66A61E", "#E6AB02", "#A6761D", "#666666", "#9A7D0A")
# Create a ggplot object with the melted data as the input data
# Specify x as gene expression values and color by studies
ggplot(data = melted, aes(x = value, color = studies)) +
# Add density plot with a specified line size
geom_density(size = 1.5) +
# Set axis labels
labs(y= 'Density', x= 'Value') +
# Apply a classic theme to the plot
theme_classic() +
# Set manual color scale based on study names
scale_color_manual(values=c(my_pal)) +
# Set manual fill color scale
scale_fill_manual(values=c(paste(my_pal, "66", sep = ""))) +
# Customize the appearance of the axes, text, and title
theme(
# Customize axis text styles
axis.text = element_text(family = "Arial",size = 24 , colour = "black", angle = 90),
# Customize x-axis text style
axis.text.x = element_text(family = "Arial",colour = "black", size = 12),
# Customize y-axis text style
axis.text.y = element_text(family = "Arial",colour = "black", size = 24),
# Customize subtitle
plot.subtitle = element_text(family = "Arial",size = 24, colour = "black", hjust = 0.5),
# Customize y-axis title
axis.title.y = element_text(family = "Arial", size = 24, angle = 90),
# Customize x-axis title
axis.title.x = element_text(family = "Arial", size = 24, angle = 00)
) +
# Add a subtitle to the plot
labs(subtitle = "Density plot")
}
######################################################################################################################################
###################################################### 10- MakeMDSPlot Function ######################################################
# MakeMDSPlot: Create a multidimensional scaling (MDS) plot
# Input (Arguments):
# - matrix: The input matrix containing gene expression data
# - studies: A list of matrices representing different studies
# - projname: A vector of project names corresponding to each study
# - compare: A comparison variable
# - nstudies: The number of studies
# Output:
# - None (the plot is displayed)
# Function Description:
# This function creates an MDS plot to visualize the relationship between samples based on their gene expression data.
# The plot helps to identify clusters and patterns in the data, providing insights into similarities and differences among samples.
# Define the MakeMDSPlot function with input parameters
MakeMDSPlot <- function(matrix, studies, projname, compare, nstudies){
# Set the random seed for reproducibility
set.seed(1234)
# Check if the matrix has an "ID" column, and if so, set row names and remove the ID column
if(colnames(matrix)[1] == "ID"){
# Set row names to the values in the "ID" column
rownames(matrix) <- matrix[,1]
# Remove the "ID" column from the matrix
matrix <- matrix[,-1]
}
# Transpose the matrix and convert it to a data frame
matrix <- data.frame (t(matrix))
# Compute MDS using the dist() and cmdscale() functions:
# Assign the input matrix to the variable mds
mds <- matrix %>%
# Compute the distance matrix using the dist() function
dist() %>%
# Perform classical multidimensional scaling (MDS) on the distance matrix using the cmdscale() function.
cmdscale() %>%
# Convert the MDS output to a data frame and assign it to the variable mds.
data.frame ()
# Set column names as "Dim1" and "Dim2"
colnames(mds) <- c("Dim1", "Dim2")
# Perform clustering using k-means on the MDS coordinates
# Perform k-means clustering on mds with nstudies clusters and assign the resulting cluster assignments to the variable clust.
clust <- kmeans(mds, nstudies)$cluster %>%
# Convert the cluster assignments to a factor data type.
as.factor()
# Add clustering results to the MDS data frame
# Update the 'mds' data frame
mds <- mds %>%
# Add a new column called "Clusters" and assign cluster assignments from 'clust'
mutate(clusters = clust)
# Retrieve the row names of the MDS data frame
mds_rownames <- rownames(mds)
# Assign study names to each sample based on column names of study matrices:
study_list <- studies
# Create an empty list called 'batch'
batch <-list()
# Loop over the indices of 'study_list'
for(i in seq_along(study_list)){
# Determine the number of samples in the current study
l <- length(which(colnames(study_list[[i]]) %in% mds_rownames))
# Create a vector of project names for the samples in the current study
batch[[i]] <- rep (projname [i], l)
}
# Combine the project names into a single vector
batch <- unlist(batch)
# Add the project names as a factor to the MDS data frame
mds$studies <- factor(batch)
# MDS plot customization:
# This section creates an MDS (Multidimensional Scaling) plot using the ggscatter function from the ggpubr package and customizes various plot elements:
# Define a vector of color codes for plotting
my_pal <- c("#7570B3", "#E7298A", "#66A61E", "#E6AB02", "#A6761D", "#666666", "#9A7D0A")
# Create the scatter plot using ggscatter from the ggpubr package
g <- ggscatter(mds, x = "Dim1", y = "Dim2",
# Specify the label, shape, color, palette, size, and other options for ggscatter:
# Set the labels for the points as row names of the MDS matrix
label = rownames(mds),
# Use the "Studies" variable for shaping the points
shape = "Studies",
# Set the theme of the ggplot to "classic"
ggtheme = theme_classic(),
# Use the "Clusters" variable for coloring the points
color = "Clusters",
# Set the color palette for the points
palette = my_pal,
# Set the size of the points
size = 1.5,
# Display ellipses around the points
ellipse = TRUE,
# Set the type of ellipse to "convex"
ellipse.type = "convex",
# Enable point label repulsion to avoid overlapping labels
repel = TRUE)
# Customize the appearance of the axes, text, and title
# Add the theme and labels to the ggplot object
g + theme(
# Customize axis text
axis.text = element_text(family = "Arial",size = 24 , colour = "black"),
# Customize x-axis text
axis.text.x = element_text(family = "Arial",colour = "black", size = 24),
# Customize y-axis text
axis.text.y = element_text(family = "Arial",colour = "black", size = 24),
# Customize subtitle
plot.subtitle = element_text(family = "Arial",size = 24, colour = "black", hjust = 0.5),
# Customize y-axis title
axis.title.y = element_text(family = "Arial", size = 24, angle = 90),
# Customize x-axis title
axis.title.x = element_text(family = "Arial", size = 24, angle = 00)
) +
# Set the subtitle of the plot as "MDS plot"
labs(subtitle = "MDS plot")
}
######################################################################################################################################
##################################################### 11- CochranQTest Function ######################################################
# CochranQTest: Perform Cochran's Q Test on multiple studies
# Input:
# - studies: A list of matrices representing different studies
# - groups: A vector specifying the comparison groups (e.g., grade1 vs grade3)
# - compare: A string indicating the type of comparison (e.g., "grade")
# Output:
# - A list containing the mean and variance for each comparison mode
# Function Description:
# Cochran's Q Test: Perform statistical analysis to compare proportions across multiple groups
# This function performs Cochran's Q Test on multiple studies to compare proportions across different groups.
# It takes a list of matrices representing different studies, a vector specifying the comparison groups,
# and a string indicating the type of comparison. The function calculates the mean and variance for each comparison mode
# and returns them as a list. Cochran's Q Test is a statistical test used to determine if there are significant differences
# in proportions between multiple groups. It helps to assess the homogeneity of proportions across the groups and
# identify any significant departures from the null hypothesis of equal proportions.
CochranQTest <- function(studies, groups, compare ){
# Set the random seed for reproducibility
set.seed(1234)
# Reading matrix and phenotype:
# Initialize a list to store the mean values for each study
d.adj.Split <-list()
# List to store variance values for each study
var.d.adj.Split <-list()
# Iterate over each study in the 'studies' list
for(i in seq_along(studies)){
# Get the matrix for the current study
matrix <- studies[[i]]
# Check if the matrix has an "ID" column, and if so, handle it accordingly
# (e.g., setting row names and removing the ID column)
if(colnames(matrix)[1] == "ID"){
# Set row names to the values in the first column
rownames(matrix) <- matrix[,1]
# Remove the first column from the matrix
matrix <- matrix[,-1]
}
# Read the phenotype file for the corresponding comparison type:
# Read the phenotype file
pheno <- read_csv (paste0("phenotype/phenotype_", compare , ".csv"))
# Convert the read data into a data frame
pheno <- data.frame (pheno)
# Select the samples from the phenotype file that match the column names in the matrix:
# Identify the matching column indices
paired_pheno <- which(pheno$Sample %in% colnames(matrix))
# Select the rows that correspond to the matched samples
pheno <- pheno[paired_pheno, ]
# Identify the rows with missing values in the "grade" column
na_row <- which(is.na(pheno$grade))
# Check if there are any rows with missing values in the "grade" column
# and handle the missing values accordingly (e.g., removing corresponding rows and columns)
# Check if the length of 'na_row' is greater than zero (if there are any missing values in the 'na_row' variable)
if(length(na_row)> 0){
# Get the sample IDs with missing values
na_samp <- pheno[na_row,1]
# Identify the matching column indices in the matrix
na_colm <- which(colnames(matrix) %in% na_samp)
# Remove the rows with missing values from the phenotype data
pheno <- pheno[-na_row, ]
# Remove the corresponding columns from the matrix
matrix <- matrix[, -na_colm]
}
# Calculating the number of each comparison mode (grade1 vs grade3):
# Empty list to store the factor levels (0 and 1) for each comparison group
g_list <-list()
# Empty list to store the sample IDs for each comparison group
sampleid <-list()
# Iterate over the comparison groups and perform operations for each group
for(j in seq_along(groups)){
# Identify the rows with the specified group
g_row <- which(pheno $grade %in% groups[j])
# Get the sample IDs for the specified group
s_id <- pheno[g_row,1]
# Store the sample IDs for the comparison group
sampleid[[j]] <- s_id
# Create the factor levels for the comparison group
g_list[[j]] <- factor(rep(j - 1, length(g_row)))
}
# Applying those modes and calculating the mean and variance for Cochran' Q Test
# Unlist the factor levels for all comparison groups to create a single vector of factor levels
spl <- unlist(g_list)
# Unlist the sample IDs for all comparison groups
spsam <- unlist(sampleid)
# Select the columns in the matrix corresponding to the sample IDs
matrix <- matrix[,spsam]
# Convert the matrix to a matrix object
matrix <- as.matrix(matrix)
# Calculate the mean difference for each comparison mode
d.Split <- getdF(matrix, spl)
# Calculate the overall mean difference
mean_d <- dstar(d.Split, length(spl))
# Store the mean difference for the current study
d.adj.Split[[i]] <- mean_d
# Calculate the variance
var_d <- sigmad(mean_d, sum(spl==0), sum(spl==1))
# Store the variance for the current study
var.d.adj.Split[[i]] <- var_d
}
# Create a list containing the mean and variance values
output <-list(d.adj.Split, var.d.adj.Split)
# Return the output as a list containing mean differences and variances
return(output)
# Display a message indicating the completion of the analysis
message ("Analysis has completed.")
}
######################################################################################################################################
###################################################### 12- MakeQQPlot Function #######################################################
# MakeQQPlot: Generate a QQ plot for Cochran's Q Test
# Input:
# - cohranq: Output from CochranQTest function (list of means and variances)
# - nstudies: Number of studies
# Output:
# - QQ plot
# Function Description:
# This function generates a QQ plot for Cochran's Q Test. It takes the output from the CochranQTest function,
# which is a list of means and variances, and the number of studies as input. The function calculates the quantiles
# of the chi-square distribution and the quantiles of the sample, and then plots them on a QQ plot.
# A QQ plot (quantile-quantile plot) is used to assess whether a dataset follows a specific theoretical distribution
# by comparing the observed quantiles against the expected quantiles. In this case, the QQ plot helps to visualize
# the goodness-of-fit between the observed Cochran's Q statistics and the expected chi-square distribution.
# Deviations from the diagonal reference line indicate departures from the expected distribution.
MakeQQPlot <- function(cohranq, nstudies){
# Unpack the previous list into data frames for means and variances
# Extract the means from the output list
output1 <- cohranq[[1]]
# Extract the variances from the output list
output2 <- cohranq[[2]]
# Combine the means into a data frame
mymns <- data.frame (do.call(cbind, output1))
# Combine the variances into a data frame
myvars <- data.frame (do.call(cbind, output2))
# Calculate Cochran's Q Test:
# Compute Cochran's Q Test using means and variances
my.Q <- f.Q(mymns, myvars)
# Store the number of studies for later use
nstudies <- nstudies
# Calculate quantiles of the chi-square distribution
chisqq <- qchisq(seq(0, .9999, .001), df = nstudies-1)
# Calculate quantiles of the sample
tmp <- quantile(my.Q, seq(0, .9999, .001))
# Create a data frame for QQ plot data
qq <- data.frame (chisqq = chisqq, tmp = tmp)
# Plotting the QQ plot:
# Set data and aesthetics for the plot
ggplot(data = qq, aes(x = chisqq, y = tmp, color = "#1B9E77")) +
# Add points to the plot
geom_point() +
# Add a diagonal reference line to the plot
geom_abline() +
# Apply a classic theme to the plot
theme_classic() +
# Set labels for x and y axes
labs(x= 'quantiles of Chi square', y= 'quantiles of Sample') +
# Customize the appearance of the axes, text, and title
theme(
# Customize axis text appearance
axis.text = element_text(family = "Arial",size = 24 , colour = "black"),
# Customize x-axis text appearance
axis.text.x = element_text(family = "Arial",colour = "black", size = 24),
# Customize y-axis text appearance
axis.text.y = element_text(family = "Arial",colour = "black", size = 24),
# Customize plot subtitle appearance
plot.subtitle = element_text(family = "Arial",size = 24, colour = "black", hjust = 0.5),
# Customize y-axis title appearance
axis.title.y = element_text(family = "Arial", size = 24, angle = 90),
# Customize x-axis title appearance
axis.title.x = element_text(family = "Arial", size = 24, angle = 00),
# Remove the legend
legend.position="none"
) +
# Set the plot subtitle
labs(subtitle = "QQ Plot")
}
######################################################################################################################################
##################################################### 13- FEMREMAnalysis Function#####################################################
# FEMREMAnalysis: Perform FEM-REM meta-analysis
# Input:
# - studies: A list of expression matrices from different studies
# - projname: Names of the projects or studies
# - compare: Comparison mode (e.g., "grade1_vs_grade3")
# - groups: List of comparison groups
# - model: Model selection ("FEM" for fixed effects model, "REM" for random effects model)
# - FDR: False discovery rate threshold for filtering genes
# - nperm: Number of permutations for z-score calculation
# Output:
# - Meta-analysis results
# Function Description:
# This function performs FEM-REM meta-analysis on expression matrices from different studies.
# It takes as input a list of expression matrices, names of the projects or studies, a comparison mode,
# a list of comparison groups, a model selection (FEM for fixed effects model or REM for random effects model),
# a false discovery rate (FDR) threshold for gene filtering, and the number of permutations for z-score calculation.
# The function first creates folders to store the meta-analysis results. It reads the phenotype file for the comparison
# mode and the annotation files for each project. It then performs data preprocessing, including handling missing values
# and filtering based on comparison groups. The expression matrices, class labels, and annotation data are prepared for analysis.
# Next, the function determines the type of analysis based on the model selection and performs z-score calculation using
# the zScoreFDR function. The two-sided scores are extracted and combined with annotation information.
# Missing gene symbols are removed, and the data frame is subsetted, counted, and checked for duplicated gene symbols.
# For duplicated gene symbols, average expression values and associated information are calculated and stored.
# The averaged genes are combined with unique genes into a single data frame, sorted by z-score in descending order, and written to files.
# The function also performs FDR filtering on the data frame, writes the filtered results to files,
# and displays summary information for each project. The completion message is displayed when the analysis is finished.
FEMREMAnalysis <- function(studies, projname , compare , groups, model, FDR, nperm){
# Set the random seed for reproducibility
set.seed(1234)
# Creating folders for meta-analysis results:
# Create MetaAnalysis folder
dir.create("MetaAnalysis", showWarnings = FALSE)
# Create filtered subfolder
dir.create("MetaAnalysis/filtered", showWarnings = FALSE)
# Create volcano subfolder
dir.create("MetaAnalysis/volcano", showWarnings = FALSE)
# Reading phenotype file
phenotype <- read_csv (paste0("phenotype/phenotype_", compare , ".csv"))
phenotype <- data.frame (phenotype)
# Reading annotation file for each project:
# Initialize an empty list to store annotations for each project
annotations <-list()
# Iterate over each project to process annotations
for(i in seq_along(projname)){
# Read annotation file
annot<- read_tsv(paste0(projname [i], "/", "annot.tsv"))
# Convert to data frame
annot<- data.frame (annot)
# Set row names
rownames(annot) <- annot[,1]
# Remove the first column
annot<- annot[,-1]
# Select specific columns
annot<- annot[ , c("Gene.symbol", "Chromosome.location", "GO.Function")]
# Combine row names and data
annot<- cbind(rownames(annot), annot)
# Rename the first column
colnames(annot)[1] <- "ID"
# Order by ID column
annot<- annot[order(annot$ID), ]
# Store annotation data in a list
annotations[[projname [i]]] <- annot
}
# Perform intersection of annotations
an <- Reduce(intersect, annotations)
# Combine annotations into a data frame
annotations <- data.frame (do.call(cbind, an))
# Create a list to store expression matrices
matrices_list <-list()
# Create a list to store class labels
Class <-list()
# Iterate over each study to process data
for(i in seq_along(studies)){
# Get the expression matrix for the current study
matrix <- studies[[i]]
# Check if the matrix has an "ID" column
if(colnames(matrix)[1] == "ID"){
# Set row names from the first column
rownames(matrix) <- matrix[,1]
# Remove the first column
matrix <- matrix[,-1]
}
# Data preprocessing and cleaning for phenotype and gene expression matrices:
# Identify the indices of samples in the phenotype matching matrix that are present in the column names of the matrix
paired_pheno <- which(phenotype$Sample %in% colnames(matrix))
# Subset the phenotype data frame to include only the rows corresponding to the identified samples
pheno <- phenotype[paired_pheno, ]
# Identify the indices of rows in the phenotype data frame where the grade column contains missing values (NA)
na_row <- which(is.na(pheno$grade))
# Check if there are any missing values in the rows of the phenotype data
if(length(na_row)> 0){
# Get the sample IDs with missing grade values
na_samp <- pheno[na_row,1]
# Get the corresponding column indices
na_colm <- which(colnames(matrix) %in% na_samp)
# Remove rows with missing grade values from phenotype
pheno <- pheno[-na_row, ]
# Remove columns with missing grade values from matrix
matrix <- matrix[, -na_colm]
}
# Calculating the number of each comparison mode (grade1 vs grade3):
# Create a list to store comparison groups
g_list <-list()
# Create a list to store sample IDs
sampleid <-list()
# Iterate over each group to process comparisons
for(j in seq_along(groups)){
# Identify rows matching the current comparison group
g_row <- which(pheno $grade %in% groups[j])
# Get the sample IDs for the current group
s_id <- pheno[g_row,1]
# Store sample IDs in the list
sampleid[[j]] <- s_id
# Create factor levels for the current group
g_list[[j]] <- factor(rep(j - 1, length(g_row)))
}
# Prepare factor levels, sample IDs, and expression matrix for analysis:
# Unlist the factor levels for all comparison groups
spl <- unlist(g_list)
# Convert factor levels to numeric
spl <- as.numeric(as.character(spl))
# Store class labels for the current study
Class[[i]] <- spl
# Unlist the sample IDs for all comparison groups
spsam <- unlist(sampleid)
# Subset the matrix using the selected sample IDs
matrix <- matrix[,spsam]
# Convert to matrix class
matrix <- as.matrix(matrix)
# Create an ExpressionSet object
matrix <- ExpressionSet(assayData=matrix)
# Store the expression matrix in the list
matrices_list[[i]] <- matrix
}
# Determine the type of analysis based on the model type (FEM or REM):
# Check if the model type is FEM for Fixed Effects Model analysis:
# - If the selected model is 'FEM' (Fixed Effects Model), set the useREM flag to FALSE.
# - This flag is used to indicate the use of the Fixed Effects Model in subsequent analysis.
if(model == 'FEM'){
# Set useREM flag to FALSE for FEM model (indicating the use of Fixed Effects Model)
useREM <- FALSE
}
# - If the selected model is 'REM' (Random Effects Model), set the useREM flag to TRUE.
# - This flag is used to indicate the use of the Random Effects Model in subsequent analysis.
if(model == 'REM'){
# Set useREM flag to TRUE for REM model (indicating the use of Random Effects Model)
useREM <- TRUE
}
# Perform z-score calculation
ScoresFDR <- zScoreFDR(matrices_list, Class, useREM = useREM, nperm = nperm)
# Get the two-sided scores
twosided <- data.frame (ScoresFDR[["two.sided"]])
# Sort by row names
twosided <- twosided[order(rownames(twosided)),]
# Identify matching annotations and add annotation information:
# Identify matching annotations
anno_row <- which(annotations$ID %in% rownames(twosided))
# Add annotation information
twosided <- cbind(twosided, annotations[anno_row, c(2, 3, 4)])
# Removing missing gene symbols:
# Identify missing gene symbols
genemissing <- which(is.na(twosided$Gene.symbol) == TRUE)
# Remove rows with missing gene symbols
twosided <- twosided[-genemissing, ]
# Subset, Count, and Identify Duplicated Gene Symbols:
# Subset columns
combination <- twosided[ ,c("zSco", "FDR", "Gene.symbol", "Chromosome.location", "GO.Function")]
# Count duplicated gene symbols
dupgene_freq <- data.frame (table(combination$Gene.symbol))
# Get duplicated gene symbols
only_dupgene <- dupgene_freq[which(dupgene_freq$Freq> 1), 1]
# Identify rows with duplicated gene symbols
only_dupgene_row <- which(combination$Gene.symbol %in% only_dupgene)
# Subset rows with duplicated gene symbols
duplicat_deg <- combination[only_dupgene_row,]
# Calculate and store the average expression values and associated information for each gene symbol with duplicated entries
# Create an empty list to store averaged genes
averaged_genes <-list()
# Iterate over each duplicated gene symbol
for(b in 1:length(only_dupgene)){
# Identify rows for the current duplicated gene
d_row <- which(duplicat_deg$Gene.symbol %in% only_dupgene[b])
# Calculate column means
col_mean <- rbind(colMeans(duplicat_deg[d_row, 1:2]))
# Combine column means with other columns
col_all <- cbind(col_mean, duplicat_deg[d_row[1],3:5])
# Store the averaged gene information
averaged_genes[[b]] <- col_all
}
# Combine averaged genes into a single data frame
averaged_genes <- do.call(rbind, averaged_genes)
# Combine the rows representing unique gene symbols with the rows representing averaged genes:
# Subset rows with unique gene symbols
unique_deg <- combination[-only_dupgene_row, ]
# Combine unique genes and averaged genes
all_unique_genes_combi <- rbind(unique_deg, averaged_genes)
# Reset row names
rownames(all_unique_genes_combi) <- NULL
# Sort the data frame by z-score in descending order
all_unique_genes_combi <- all_unique_genes_combi[order(all_unique_genes_combi[,1], decreasing = TRUE),]
# Write the result to a file
write_tsv(all_unique_genes_combi, "MetaAnalysis/volcano/combination.tsv")
# Filter the data frame based on FDR threshold
all_unique_genes_combi <- all_unique_genes_combi[which(all_unique_genes_combi[, "FDR"] < FDR),]
# Write the filtered result to a file
write_tsv(all_unique_genes_combi, "MetaAnalysis/filtered/combination.tsv")
# Seperating each study, averaging duplicated genes
# Get column indices for FDR values
sfdr <- grep("FDR_Ex_{0-9}", colnames(twosided))
# Iterate over the indices of sfdr
for(i in seq_along(sfdr)){
# Get column indices for gene symbols
gs_col <- grep("Gene.symbol|Chromosome.location|GO.Function", colnames(twosided))
# Subset columns for the current study
common <- twosided[,c(sfdr[i]-1, sfdr[i], gs_col)]
# Rename columns with project names
colnames(common)[1:2] <- gsub("Ex_.", projname [i],colnames(common)[1:2])
# Perform operations on duplicated gene symbols if there are multiple rows in the common data frame:
# Check if there are multiple rows in the common data frame
if(nrow(common)>1){
# Count duplicated gene symbols
dupgene_freq <- data.frame (table(common$Gene.symbol))
# Get duplicated gene symbols
only_dupgene <- dupgene_freq[which(dupgene_freq$Freq> 1), 1]
# Identify rows with duplicated gene symbols
only_dupgene_row <- which(common$Gene.symbol %in% only_dupgene)
# Subset rows with duplicated gene symbols
duplicat_deg <- common [only_dupgene_row,]
# Iterate over each duplicated gene symbol, calculate column means, and store the averaged gene information in a list:
# Create a list to store averaged genes
averaged_genes <-list()
# Iterate over each duplicated gene symbol
for(b in 1:length(only_dupgene)){
# Identify rows for the current duplicated gene
d_row <- which(duplicat_deg$Gene.symbol %in% only_dupgene[b])
# Calculate column means
col_mean <- rbind(colMeans(duplicat_deg[d_row, 1:2]))
# Combine column means with other columns
col_all <- cbind(col_mean, duplicat_deg[d_row[1],3:5])
# Store the averaged gene information
averaged_genes[[b]] <- col_all
}
# Combine averaged genes and unique genes into a single data frame and sort by z-score in descending order:
# Combine averaged genes into a single data frame
averaged_genes <- do.call(rbind, averaged_genes)
# Subset rows with unique gene symbols
unique_deg <- common [-only_dupgene_row, ]
# Combine unique genes and averaged genes
all_unique_genes <- rbind(unique_deg, averaged_genes)
# Reset row names
rownames(all_unique_genes) <- NULL
# Sort the genes by z-score in descending order
all_unique_genes <- all_unique_genes[order(all_unique_genes[,1], decreasing = TRUE),]
# Write the result to a tsv file
write_tsv(all_unique_genes, paste0("MetaAnalysis/volcano/", projname [i], ".tsv"))
# Apply FDR threshold
all_unique_genes <- all_unique_genes[which(all_unique_genes[,2] < FDR),]
# Write the filtered result to a file
write_tsv(all_unique_genes, paste0("MetaAnalysis/filtered/", projname [i], ".tsv"))
}
# Count genes with "///"
eslash_genes <- length(grep("///", all_unique_genes$Gene.symbol))
# Display summary message for the current project
message ("###################### Summary for: ", projname [i], " ############################")
# Display summary information about the genes analysis
message (cat("Number of missing gene symbols:", length(genemissing),
"\nNumber of duplicated genes:", length(only_dupgene),
"\nTotal number of genes after averaging duplicated genes and FDR filter:", NROW(all_unique_genes),
"\nNumber of genes having ///:", eslash_genes))
message ("##########################################################################")
}
# Display completion message for the analysis
message ("+++++++++++++++++++++++ Analysis has completed. +++++++++++++++++++++++")
}
######################################################################################################################################
###################################################### 14- Makevolcano Function#######################################################
# Function to generate a volcano plot
# Input:
# - projname: Name of the project (default: "GSE11121")
# - padjlevel: Adjusted p-value threshold (default: 0.05)
# - pSE_thresh: Positive effect size threshold (default: 1)
# - nSE_thresh: Negative effect size threshold (default: -1)
# - ntop: Number of top genes to label (default: 10)
# - voltype: Type of volcano plot ("ggplot" or "plotly") (default: "ggplot")
# Output:
# - A volcano plot visualization
# - DEG files (upregulated, downregulated, and top genes)
# Function Description:
# This function generates a volcano plot for differential gene expression analysis.
# It takes several arguments to customize the plot, including the project name,
# adjusted p-value threshold, effect size thresholds, number of top genes to label,
# and the type of volcano plot (ggplot or plotly).
# The function reads the volcano plot data, calculates adjusted p-values and assigns DEG labels based on the thresholds.
# It then generates the volcano plot using either ggplot or plotly, visualizing the relationship between effect size and statistical significance.
# The plot includes color-coded points representing upregulated, downregulated, and non-significant genes.
# Additionally, the function saves DEG files containing upregulated, downregulated, and top genes,
# and displays the number of upregulated and downregulated genes.
Makevolcano <- function(projname = "GSE11121",
# padjlevel: Adjusted p-value threshold (default: 0.05)
padjlevel = 0.05,
# pSE_thresh: Positive effect size threshold (default: 1)
pSE_thresh = 1,
# nSE_thresh: Negative effect size threshold (default: -1)
nSE_thresh = -1,
# ntop: Number of top genes to label (default: 10)
ntop = 10,
# voltype: Type of volcano plot ("ggplot" or "plotly") (default: "ggplot")
voltype = "ggplot"){
# Create a directory for storing DEG files
dir.create("MetaAnalysis/DEG", showWarnings = FALSE)
# Reading the DEGs table:
# Read the DEGs table from a TSV file
volcano <- read_tsv(paste0("MetaAnalysis/volcano/", projname , ".tsv"))
# Convert the table to a data frame
volcano <- data.frame (volcano)
# Remove project name from column names
colnames(volcano)[1:2] <- gsub(paste0("_", projname), "", colnames(volcano)[1:2])
# Calculate adjusted p-values and assign DEG labels based on the threshold
volcano <- volcano %>%
# Calculate adjusted p-values using the negative logarithm of FDR values
mutate(AdjustedPvalue = -log10(FDR)) %>%
# Assign initial label of "NotSignificant" to all genes
# (initially, indicating that they are not significantly differentially expressed
# before applying further criteria to determine if a gene should be labeled as "Upregulated" or "Downregulated")
mutate(DEG = "NotSignificant") %>%
# Assign "Upregulated" label to genes that meet adjusted p-value and positive effect size thresholds
mutate(DEG = ifelse(AdjustedPvalue> -log10(padjlevel) & zSco> pSE_thresh, "Upregulated", DEG)) %>%
# Assign "Downregulated" label to genes that meet adjusted p-value and negative effect size thresholds
mutate(DEG = ifelse(AdjustedPvalue> -log10(padjlevel) & zSco < nSE_thresh, "Downregulated", DEG))
# Handle zero FDR values to avoid infinite AdjustedPvalue:
# Identify the indices of rows with FDR equal to 0
zero_fdr <- which(volcano$FDR == 0)
# Calculate the minimum non-zero FDR and adjust it to avoid infinite AdjustedPvalue
m <- -log10(min(volcano[-zero_fdr, "FDR"])) + 0.5
# Replace the AdjustedPvalue of zero FDR rows with the adjusted value
volcano[zero_fdr, "AdjustedPvalue"] <- m
# Define a vector of color codes to be used for plotting
my_pal <- c("#1B9E77","#7570B3", "#E7298A", "#66A61E", "#E6AB02", "#A6761D", "#666666", "#9A7D0A")
Calculating the number of each Upregulated, Downregulated, and NotSignificant probes#
# Select rows from the volcano data frame where DEG is "Upregulated"
volcano_up <- volcano[which(volcano$DEG == "Upregulated"),]
# Select rows from the volcano data frame where DEG is "Downregulated"
volcano_down <- volcano[which(volcano$DEG == "Downregulated"),]
# Sort the volcano_down data frame based on the zSco column in ascending order
volcano_down <- volcano_down[order(volcano_down$zSco, decreasing = FALSE), ]
# Select rows from the volcano data frame where DEG is "NotSignificant"
volcano_NA <- volcano[which(volcano$DEG == "NotSignificant"),]
# Combine the data frames of upregulated, downregulated, and non-significant genes into a single data frame
volcano_all <- rbind(volcano_up, volcano_down, volcano_NA)
# Replace the labels of the top probes with the word "Top"
ntop_rep <- rep ("Top", ntop)
# Create the DEG column in the volcano_all data frame by combining labels from different data frames
volcano_all$DEG <- c(ntop_rep, volcano_up$DEG[-c(1:ntop)], ntop_rep, volcano_down$DEG[-c(1:ntop)], volcano_NA$DEG)
# Create a column 'Top' in the 'volcano_all' data frame
# Assign gene symbols to the 'Top' column for genes labeled as 'Top'
volcano_all$Top <- ifelse(volcano_all$DEG == "Top", volcano_all$Gene.symbol, "")
# Plotting either with plotly or ggplot:
# Check if the specified voltype is "ggplot"
if(voltype == "ggplot"){
# Create a ggplot object for generating the volcano plot using ggplot2
p <- ggplot(data = volcano_all, aes(x = zSco, y= AdjustedPvalue, color= DEG, fill = DEG, label= Top)) +
# Set the axis labels
labs(x= 'Effect size', y= "-log10(FDR)") +
# Add points to the plot
geom_point(size = 1, shape = 21) +
# Add text labels to the plot with overlap checking
geom_text(check_overlap = TRUE,vjust = 0.1, nudge_y = 0.1) +
# Set manual color scale for DEG categories
scale_color_manual(values=c(my_pal)) +
# Set manual fill color scale for DEG categories
scale_fill_manual(values=c(paste(my_pal, "66", sep = ""))) +
# Apply a classic theme to the plot
theme_classic() +
# Customize the appearance of the axes, text, and title
theme(
# Customize axis text appearance
axis.text = element_text(family = "Arial",size = 24 , colour = "black"),
# Customize x-axis text appearance
axis.text.x = element_text(family = "Arial",colour = "black", size = 24),
# Customize y-axis text appearance
axis.text.y = element_text(family = "Arial",colour = "black", size = 24),
# Customize plot subtitle appearance
plot.subtitle = element_text(family = "Arial",size = 24, colour = "black", hjust = 0.5),
# Customize y-axis title appearance
axis.title.y = element_text(family = "Arial", size = 24, angle = 90),
# Customize x-axis title appearance
axis.title.x = element_text(family = "Arial", size = 24, angle = 00),
# Customize legend text appearance
legend.text = element_text(size = 10, family = "Arial"),
# Customize legend title appearance
legend.title = element_text(size = 20, family = "Arial")
) +
# Set the plot subtitle
labs(subtitle = paste0("Volcano plot-", projname))
# Return the ggplot object
p
} else {
# Create a ggplot object, add layers and annotation to the plot
p <- ggplot(
data = volcano_all,
aes(
# Set the x-axis variable as 'zSco' (effect size)
x = zSco,
# Set the y-axis variable as 'AdjustedPvalue' (-log10(FDR))
y= AdjustedPvalue,
# Map the 'DEG' variable to the color aesthetic
color= DEG,
# Map the 'DEG' variable to the fill aesthetic
fill = DEG,
# Create text labels by concatenating gene information
text = paste(
# Include the gene symbol
"<br>Gene: " , Gene.symbol,
# Include the chromosome location
"<br>Chromosome: " , Chromosome.location,
# Include the GO function
"<br>GO function: " , GO.function
)
)
) +
# Set the x and y-axis labels
labs(x= 'Effect size', y= , title = paste0("Volcano plot-", projname)) +
# Add points to the plot
geom_point(size = 1, shape = 21) +
# Set the color aesthetics using DEG
scale_color_manual(values=c(my_pal)) +
# Set the fill aesthetics using DEG
scale_fill_manual(values=c(paste(my_pal, "66", sep = ""))) +
# Apply a classic theme to the plot
theme_classic() +
# Customize the appearance of the axes, text, and title
theme(
# Customize axis text font family, size, and color
axis.text = element_text(family = "Arial",size = 24 , colour = "black"),
# Customize x-axis text font family, color, and size
axis.text.x = element_text(family = "Arial",colour = "black", size = 24),
# Customize y-axis text font family, color, and size
axis.text.y = element_text(family = "Arial",colour = "black", size = 24),
# Customize plot subtitle font family, size, color, and horizontal justification
plot.subtitle = element_text(family = "Arial",size = 24, colour = "black", hjust = 0.5),
# Customize y-axis title font family, size, and angle
axis.title.y = element_text(family = "Arial", size = 24, angle = 90),
# Customize x-axis title font family, size, and angle
axis.title.x = element_text(family = "Arial", size = 24, angle = 00)
)
# Convert the ggplot object to plotly with text tooltip
p <- ggplotly(p, tooltip = "text") %>%
# Customize the layout settings for the plotly plot
layout(
# Set the title of the plot
title = paste0("Volcano plot-", projname),
# Customize font family, color, and size
font =list(family = "Arial", color = "black", size = 24),
# Customize legend settings
legend =list(family = "Arial", color = "black", size = 20, itemclick = "toggle")
)
}
# Calculate the number of up-regulated and down-regulated genes
write_tsv(rbind(volcano_up, volcano_down), paste0("MetaAnalysis/DEG/", projname , "_up_down.tsv"))
# Create the top table (up and down) based on the specified number of top genes (ntop)
Top <- rbind(volcano_up[1:ntop, ], volcano_down[1:ntop, ])
Top <- Top[,-6]
# Save the top table into the project folder
write_tsv(Top, paste0("MetaAnalysis/DEG/", projname , "_top.tsv"))
# Print the number of up-regulated and down-regulated genes
message ("################# Significance #################")
message (cat("Up-regulated genes:", nrow(volcano_up),
"\nDown-regulated genes:", nrow(volcano_down)))
message ("################################################")
# Return the generated plot
return(p)
}
######################################################################################################################################
####################################################### 15- MakeVenna Function########################################################
# Function to generate a volcano plot
# Arguments:
# - projname: Name of the project (default: "GSE11121")
# - padjlevel: Adjusted p-value threshold (default: 0.05)
# - pSE_thresh: Positive effect size threshold (default: 1)
# - nSE_thresh: Negative effect size threshold (default: -1)
# - ntop: Number of top genes to label (default: 10)
# - voltype: Type of volcano plot ("ggplot" or "plotly") (default: "ggplot")
# Input: Dataframe containing the differential expression analysis results
# Output: Volcano plot object (ggplot or plotly)
# Function Description:
# This function generates a volcano plot based on differential expression analysis results.
# It takes several arguments to customize the plot, including the project name, adjusted p-value threshold,
# effect size thresholds, number of top genes to label, and the type of volcano plot (ggplot or plotly).
# The function reads the differential expression analysis results from a dataframe.
# It then identifies the common genes across all projects and creates separate files for common genes and top common genes.
# The function calculates the mean values of z-score and FDR for the common genes and creates a new dataframe.
# The common genes, as well as the Venn diagram showing the overlapping genes between projects, are saved in the "common" folder.
# Finally, the function generates a volcano plot using either ggplot or plotly, visualizing the relationship between effect size and statistical significance of genes.
# The plot includes color-coded points representing upregulated, downregulated, and non-significant genes. The function returns the volcano plot object.
# Function to generate a volcano plot based on differential expression analysis results
MakeVenna <- function(common,
ntop = 10){
# Create a directory named "common" inside the "MetaAnalysis" folder to store common gene files
dir.create("MetaAnalysis/common", showWarnings = FALSE)
# Reading all top-down files for each project
# Create an empty list to store the differential expression gene tables for each project
com_table <-list()
# Iterate over each project in the 'common' vector
for(i in seq_along(common)){
# Read the TSV file for the current project and convert it to a data frame
DEG_table <- data.frame (read_tsv(paste0("MetaAnalysis/DEG/", common [i], "_up_down.tsv")))
# Store the DEG table in the com_table list, using the project name as the list element name
com_table[[common [i]]] <- DEG_table[,-6]
}
# Calculating up and down-regulated genes for each project:
# Create an empty list to store the up and down-regulated genes for each project
DEGslist <-list()
# Iterate over each project in the com_table list
for(i in seq_along(com_table)){
# Store the up and down-regulated genes in the DEGslist
DEGslist[[common [i]]] <- com_table[[i]][[3]]
}
# Finding common genes across all projects
com_prob <- Reduce(intersect, DEGslist)
# Writing common genes into separate files and generating top files
# Create an empty list to store the common genes across all studies
all_com <-list()
# Loop over each study in the com_table list
for(i in seq_along(com_table)){
# Find the indices of common genes in the current project's DEG table
comn <- which(com_table[[names(com_table)[i]]][["Gene.symbol"]] %in% com_prob)
# Extract the common genes from the DEG table
com_final_DEG <- com_table[[names(com_table)[i]]][comn, ]
# Write the common genes into a separate file
write_tsv(com_final_DEG, paste0("MetaAnalysis/common/",names(com_table)[i],"_common_DEGs.tsv"))
# Sort the common genes based on gene symbol
sorted_com_final_DEG <- com_final_DEG[order(com_final_DEG$Gene.symbol),]
# Store the sorted common genes in the all_com list
all_com[[names(com_table)[i]]] <- sorted_com_final_DEG
# Extract the top upregulated genes from the common genes
com_final_up <- com_final_DEG[which(com_final_DEG$DEG == "Upregulated"), ]
# Selecting the top ntop upregulated genes
com_final_up <- com_final_up[1:ntop, ]
# Extract the top downregulated genes from the common genes
com_final_down <- com_final_DEG[which(com_final_DEG$DEG == "Downregulated"), ]
# Select the top ntop rows from the com_final_down dataframe
com_final_down <- com_final_down[1:ntop, ]
# Write the top common genes (upregulated and downregulated) into a separate file
write_tsv(rbind(com_final_up, com_final_down), paste0("MetaAnalysis/common/", names(com_table)[i],"_top_common_DEGs.tsv"))
}
# Separating the first two columns and remaining columns for each project
# Create an empty list to store the first two columns of each dataframe in all_com
firstsecond_col <-list()
# Create an empty list to store the remaining columns (3 to 6) of each dataframe in all_com
remaining <-list()
# Iterate over the indices of all_com list
for(i in seq_along(all_com)){
# Extract the first two columns of the i-th dataframe in all_com and store it in firstsecond_col list
firstsecond_col[[i]] <- data.frame (all_com[[i]][,1:2])
# Extract the first two columns of the i-th dataframe in all_com and store it in firstsecond_col list
remaining[[i]] <- data.frame (all_com[[i]][,3:6])
}
# Combine the first two columns of the data frames in the firstsecond_col list and calculate the mean values of z-score and FDR, and
# create a new data frame (sz_fdr_df) containing the mean values and combine it with the remaining columns:
# Combine the first two columns of the data frames in firstsecond_col list into a single data frame
firstsecond_col_df <- do.call(cbind, firstsecond_col)
# Find the column indices containing "zSco" in their column names
sz <- grep("zSco", colnames(firstsecond_col_df))
# Find the column indices containing "FDR" in their column names
fdr <- grep("FDR", colnames(firstsecond_col_df))
# Calculate the mean of z-score for each row
sz_mean <- rowMeans(firstsecond_col_df[,sz])
# Calculate the mean of FDR for each row
fdr_mean <- rowMeans(firstsecond_col_df[,fdr])
# Create a new data frame with columns for zSco and FDR means
sz_fdr_df <- data.frame (zSco = sz_mean, FDR= fdr_mean)
# Combine the sz_fdr_df with the remaining columns from remaining list
sz_fdr_df <- cbind(sz_fdr_df, remaining[[1]])
# Write the sz_fdr_df to a TSV file
write_tsv(sz_fdr_df, paste0("MetaAnalysis/common/all_mean_common_DEGs.tsv"))
# Display a message indicating that common genes between all datasets have been saved into the common folder
message ("+++++ Common genes between all datasets have saved into common folder. +++++")
# Creating venn diagram:
# Create an empty list to store the DEGs for each project
DEGslist <-list()
# Iterate over the projects in the com_table
for(i in seq_along(com_table)){
# Extract the DEGs for each project and store them in the DEGslist
DEGslist[[common [i]]] <- com_table[[i]][3]
}
# Create an empty list to store the Venn diagram input for each project
venlist <-list()
# Extract the common DEGs for each project and store them in the venlist:
# Iterate over the elements in the DEGslist
for(i in seq_along(DEGslist)){
# Assign the common elements from DEGslist to the corresponding element in venlist
venlist[[common [i]]] <- DEGslist[[i]][[1]]
}
# Set the seed for reproducibility in generating the Venn diagram
set.seed(1234)
# Define the color palette for the Venn diagram
my_pal <- c("#1B9E77","#7570B3", "#E7298A", "#66A61E", "#E6AB02", "#A6761D", "#666666", "#9A7D0A")
# Generate the Venn diagram using the specified parameters
venaa <- ggvenn(venlist,
# Set the fill colors of the Venn diagram circles using the my_pal vector
fill_color = my_pal,
# Set the stroke size of the Venn diagram circles to 0.5
stroke_size = 0.5,
# Set the size of the set names in the Venn diagram to 4
set_name_size = 4)
# Return the Venn diagram object
return(venaa)
}
######################################################################################################################################
################################################### 16- Makenetworkanalyst Function###################################################
# Function to prepare data for NetworkAnalyst platform
# Arguments:
# - matrix: Input matrix containing gene expression data
# - projname: Name of the project
# - compare: Comparison type (default: "grade")
# - groups: List of groups for comparison
# Input: Gene expression matrix, phenotype data, annotation data
# Output: Modified matrix file for NetworkAnalyst
# Function Description:
# This function prepares the gene expression data for the NetworkAnalyst platform.
# It takes a gene expression matrix, project name, comparison type, and groups for comparison as input.
# The function performs several steps to process the data and generate a modified matrix file compatible with NetworkAnalyst.
# The function first creates a directory to store the network analysis files for NetworkAnalyst.
# It then checks if the first column of the matrix is labeled as "ID" and adjusts the matrix accordingly.
# The phenotype file is read and processed based on the specified comparison type.
# The annotation file is also read and processed to ensure compatibility.
# Next, the function finds matching samples based on the provided groups and subsets the matrix accordingly.
# The values in the matrix are rounded to two decimal places. Rows with missing gene symbols are removed from the matrix.
# Group information is extracted from the phenotype data and stored in the grouplis list.
# The function renames the matrix file according to the group names specified by the user.
# It modifies the column names and the second row to adhere to the file format required by NetworkAnalyst.
# Finally, the modified matrix file is saved in the "MetaAnalysis/networkanalyst" directory with the name "projname_network_full.txt".
# The completion message is displayed when the file has been successfully saved.
Makenetworkanalyst <- function(matrix, projname , compare = "grade", groups){
# Creating directory to store network analysis files for NetworkAnalyst
dir.create("MetaAnalysis/networkanalyst", showWarnings = FALSE)
# Check if the first column of the matrix is labeled as "ID"
if(colnames(matrix)[1] == "ID"){
# Assign the values of the first column as row names
rownames(matrix) <- matrix[,1]
# Remove the first column from the matrix
matrix <- matrix[,-1]
}
# Reading and processing phenotype file
# Read the phenotype file based on the comparison type
pheno <- read_csv (paste0("phenotype/phenotype_", compare , ".csv"))
# Convert the read data into a data frame
pheno <- data.frame (pheno)
# Find the indices of the samples that match the columns of the matrix
paired_pheno <- which(pheno$Sample %in% colnames(matrix))
# Subset the phenotype data based on the matched samples
pheno <- pheno[paired_pheno, ]
# Reading and processing annotation file:
# Read the annotation file and create a data frame
annot<- read_tsv(paste0(projname , "/", "annot.tsv"))
annot<- data.frame (annot)
# Set the row names of the annotation data frame
rownames(annot) <- annot[,1]
# Remove the first column from the annotation data frame
annot<- annot[,-1]
# Finding matching grades or subtypes:
# Create an empty list to store sample IDs for each group
sampleid <-list()
# Iterate over the groups and find matching samples in the phenotype data
for(i in seq_along(groups)){
# Find rows in the phenotype data matching the group
g_row <- which(pheno $grade %in% groups[[i]])
# Store the matched samples in the list
sampleid[[i]] <- pheno[g_row, ]
}
# Combine the sample IDs into a single data frame
sampleid <- do.call(rbind, sampleid)
# Sort the sample IDs by the "Sample" column
sampleid <- sampleid[order(sampleid$Sample),]
# Subset the matrix using the sample IDs
matrix <- matrix[, sampleid$Sample]
# Round the values in the matrix to two decimal places
matrix <- round(matrix, 2)
# Remove rows with missing gene symbols
identical(rownames(matrix), rownames(annot))
matrix <- cbind(annot$Gene.symbol, matrix)
# Rename the first column as "Gene.symbol"
colnames(matrix)[1] <- "Gene.symbol"
# Find the indices of missing gene symbols in the matrix
genemissing <- which(is.na(matrix$Gene.symbol) == TRUE)
# Remove rows with missing gene symbols from the matrix
matrix <- matrix[-genemissing, ]
# Create an empty list to store group information
grouplis <-list()
# Iterating over each group in the 'groups' list
for(i in seq_along(groups)){
# Combine group names into a single string separated by "|"
name <- paste0(groups[[i]], collapse = "|")
# Find rows in the phenotype data matching the group
rownum <- grep(name, pheno$grade)
# Store the matched sample names in the grouplis list
grouplis[[names(groups)[i]]] <- pheno$Sample[rownum]
}
# Rename the matrix file according to the group names specified by the user
updown_matrix <- rbind(colnames(matrix), matrix)
# Iterating over each element in the 'grouplis' list
for(i in seq_along(grouplis)){
# Combine sample names for each group into a single string separated by "|"
matname <- paste0(grouplis[[i]], collapse = "|")
# Find column indices in the matrix matching the group
matcoln <- grep(matname, colnames(updown_matrix))
# Rename the corresponding columns with group names
updown_matrix[1,matcoln] <- names(grouplis)[i]
}
# Modify column names and the second row according to the file format for NetworkAnalyst
# Find column indices with group names
changedcol <- grep(paste0(names(grouplis), collapse = "|"), updown_matrix[1,])
# Subset the matrix with the modified column names
updown_matrix <- updown_matrix[, c(1, changedcol)]
# Rename the first cell in the matrix as "#CLASS"
updown_matrix[1,1] <- "#CLASS"
# Rename the first column as "#NAME"
colnames(updown_matrix)[1] <- "#NAME"
# Save the modified matrix file for NetworkAnalyst
write.table(updown_matrix, paste0("MetaAnalysis/networkanalyst/", projname ,"_network_full.txt"), quote = FALSE, row.names = FALSE, sep = "\t" )
# Displaying a message indicating that the file has been saved
message ("+++++ Your file has saved into networkanalyst folder. +++++")
}
######################################################################################################################################
###################################################### 17- MakeHeatmap Function ######################################################
# Function to create a heatmap based on gene expression data
# Arguments:
# - matrix: The gene expression data matrix
# - projname: The project name or identifier
# - compare: The comparison for phenotype
# - groups: A list of groups for sample selection
# - ntop: The number of top genes to consider
# - sorted: A logical value indicating whether the heatmap should be sorted
# Input:
# - Gene expression data matrix
# - Phenotype data
# - Annotation data
# - Differential expression gene (DEG) file
# Output:
# - Heatmap plot visualizing the gene expression data
# Function Description:
# This function generates a heatmap based on gene expression data.
# It takes a gene expression data matrix, project name, comparison for phenotype, groups for sample selection,
# number of top genes to consider, and a logical value indicating whether the heatmap should be sorted as input.
# The function starts by checking if the matrix's first column is named "ID" and adjusts the matrix if necessary.
# It reads the phenotype CSV file and converts it to a data frame.
# The annotation file is also read and processed based on the project name.
# The matrix is adjusted to match the IDs in the annotation data frame.
# Next, the function performs averaging for duplicated genes by calculating the column-wise mean for rows with the same gene symbol.
# The averaged genes are combined with unique non-duplicated genes into a single data frame.
# The DEG file is read, and the top upregulated and downregulated genes are extracted.
# The function selects the genes from the combined data frame that match the selected DEGs.
# The network data frame is created, and the sample IDs are matched with the network data frame columns based on the groups provided.
# The grades are extracted from the network data frame columns.
# The function generates the heatmap plot using the network data frame.
# If the heatmap should be sorted, the columns are ordered based on column names.
# The plot includes row (gene) annotations, a bottom annotation indicating the grades, and a legend for expression levels.
# The row title is positioned on the left side, and the column names are hidden.
# The completion message is displayed when the heatmap plot is generated.
MakeHeatmap <- function(matrix, projname , compare , groups, ntop, sorted){
# Set the random seed for reproducibility
set.seed(1234)
# Check if the first column of the matrix is named "ID"
if(colnames(matrix)[1] == "ID"){
# If so, adjust the matrix to treat the IDs as row names:
# Set row names of the matrix to the values in the first column
rownames(matrix) <- matrix[,1]
# Remove the first column from the matrix
matrix <- matrix[,-1]
# Sort the matrix based on the row names
matrix <- matrix[order(rownames(matrix)),]
}
# Read the phenotype CSV file and convert it to a data frame:
# Read the phenotype CSV file based on the "compare" value
pheno <- read_csv (paste0("phenotype/phenotype_", compare , ".csv"))
# Convert the read data to a data frame
pheno <- data.frame (pheno)
# Read annotation file based on the "projname" value:
# If the condition is true (if projname is equal to "combination"), perform the following operations
if(projname == "combination"){
# For "combination" project, read from "GSE25055/annot.tsv"
annot<- data.frame (read_tsv(paste0("GSE25055", "/", "annot.tsv")))
} else{
# Read the annotation TSV file based on the "projname" value
annot<- data.frame (read_tsv(paste0(projname , "/", "annot.tsv")))
}
# Select only the "ID" and "Gene.symbol" columns from the annotation data frame
annot<- annot[ , c("ID","Gene.symbol")]
# Sort the annotation data frame based on the "ID" column
annot<- annot[order(annot$ID), ]
# Check if the row names of the matrix match the IDs in the annotation
if(identical(rownames(matrix), annot$ID)){
# If so, add the "Gene.symbol" column to the matrix and adjust column names
matrix <- cbind(annot$Gene.symbol, matrix)
# Set the column name of the matrix at index 1 to "Gene.symbol"
colnames(matrix)[1] <- "Gene.symbol"
}
# Perform averaging for duplicated genes:
# Create a data frame with frequency counts of duplicated genes
dupgene_freq <- data.frame (table(matrix$Gene.symbol))
# Create a data frame with frequency counts of duplicated genes
only_dupgene <- dupgene_freq[which(dupgene_freq$Freq > 1), 1]
# Find the row indices of the matrix where the gene symbol matches the duplicated genes
only_dupgene_row <- which(matrix$Gene.symbol %in% only_dupgene)
# Extract the duplicated genes and their corresponding rows from the matrix
duplicat_deg <- matrix[only_dupgene_row,]
# Create an empty list to store the averaged genes
averaged_genes <-list()
# Iterate over each duplicated gene
for(i in 1:length(only_dupgene)){
# Find the row indices of the duplicated gene in the duplicat_deg data frame
d_row <- which(duplicat_deg$Gene.symbol %in% only_dupgene[i])
# Calculate the column-wise mean for the selected rows
col_mean <- rbind(colMeans(duplicat_deg[d_row, -1]))
# Combine the averaged gene data with its corresponding gene symbol
col_all <- cbind(duplicat_deg[d_row[1],1], col_mean)
# Add the averaged gene to the list
averaged_genes[[i]] <- col_all
}
# Combine all the averaged genes into a single data frame
averaged_genes <- data.frame (do.call(rbind, averaged_genes))
# Set the column name of the averaged genes at index 1 to "Gene.symbol"
colnames(averaged_genes)[1] <- "Gene.symbol"
# Extract the unique non-duplicated genes from the matrix
unique_deg <- matrix[-only_dupgene_row, ]
# Combine the unique genes and averaged genes into a single data frame
all_unique_genes <- rbind(unique_deg, averaged_genes)
# Reset the row names of the combined genes data frame
rownames(all_unique_genes) <- NULL
# Reading DEG file
DEG_table <- data.frame (read_tsv(paste0("MetaAnalysis/DEG/", projname , "_up_down.tsv")))
# Finding up and down regulated gene and make original matrix of them
# Extract the upregulated genes from the DEG table based on the "DEG" column value
up <- DEG_table[which(DEG_table$DEG == "Upregulated"),]
# Select the top "ntop" rows and the second and third columns (gene symbol and expression values)
up <- up[1:ntop,2:3]
# Extract the downregulated genes from the DEG table based on the "DEG" column value
down <- DEG_table[which(DEG_table$DEG == "Downregulated"),]
# Select the top "ntop" rows and the second and third columns (gene symbol and expression values)
down <- down[1:ntop, 2:3]
# Combine the upregulated and downregulated genes into a single gene list column
genelist <- rbind(up, down)[,2]
# Find the row indices of the genes in the all_unique_genes data frame
num <- which(all_unique_genes$Gene.symbol %in% genelist)
# Extract the network data frame with only the selected genes
network_df <- data.frame (all_unique_genes[num,])
# Extract the gene symbols column from the network data frame
Gene.symbol <- network_df$Gene.symbol
# Remove the gene symbols column from the network data frame
network_df <- network_df[,-1]
# Convert the network data frame to a numeric matrix
network_df <- as.matrix(sapply(network_df, as.numeric))
# Set the row names of the network matrix to the gene symbols
rownames(network_df) <- Gene.symbol
# Finding grades and matching with colnames:
# Create an empty list to store sample IDs for each group
sampleid <-list()
# Iterate over each group and find the corresponding samples in the phenotype data
for(i in seq_along(groups)){
# Find the row indices in pheno where grade matches the current group
g_row <- which(pheno $grade %in% groups[[i]])
# Store the corresponding sample IDs for the current group in the sampleid list
sampleid[[i]] <- pheno[g_row, ]
}
# Combine the sample IDs from different groups into a single data frame
sampleid <- do.call(rbind, sampleid)
# Find the column indices of the samples in the network_df matrix
sampleid_n <- which(sampleid$Sample %in% colnames(network_df))
# Filter the sample IDs based on the column indices
sampleid <- sampleid[sampleid_n,]
# Sort the sample IDs based on the "grade" column
sampleid <- sampleid[order(sampleid$grade),]
# Add a prefix "Grade_" to the values in the "grade" column
sampleid $grade <- paste0("Grade_", sampleid$grade)
# Extract the corresponding columns from the network_df matrix based on the sample IDs
network_df <- network_df[, sampleid$Sample]
# Set the column names of the network_df matrix to the modified "grade" values
colnames(network_df) <- sampleid $grade
# Grade info
# Extract the column names (grades) of the network_df matrix
type <- colnames(network_df)
# Create a HeatmapAnnotation object with a data frame and annotation height
ha = HeatmapAnnotation(
df = data.frame (Grade = type),
annotation_height = unit(4, "mm")
)
# Heatmap plot
# If the heatmap should be sorted, provide additional parameters for ordering
if(sorted == TRUE){
Heatmap(network_df,
# Set the properties for row names (font size)
row_names_gp = gpar(fontsize = 7),
# Add the HeatmapAnnotation object as the bottom annotation
bottom_annotation= ha,
# Define the order of columns based on column names
column_order = colnames(network_df),
# Set the color of the border around the heatmap
border_gp = gpar(col = "black"),
# Set the title for the row (gene) annotations
row_title = "DEGs",
# Position the row title on the left side
row_title_side = "left",
# Hide the column names in the heatmap
show_column_names = FALSE,
heatmap_legend_param =list(
# Set the orientation of the legend (vertical)
legend_direction = "vertical",
# Set the height of the legend
legend_height = unit(50, "mm"),
# Set the width of each grid in the legend
grid_width = unit(6, "mm"),
# Set the height of each grid in the legend
grid_height = unit(50, "cm"),
# Set the title of the legend
title = "Expression"))
} else {
# Create the heatmap plot without sorting if "sorted" is FALSE
Heatmap(network_df,
# Set the properties for row names (font size)
row_names_gp = gpar(fontsize = 7),
# Add the HeatmapAnnotation object as the bottom annotation
bottom_annotation= ha,
# Set the color of the border around the heatmap
border_gp = gpar(col = "black"),
# Set the title for the row (gene) annotations
row_title = "DEGs",
# Position the row title on the left side
row_title_side = "left",
# Hide the column names in the heatmap
show_column_names = FALSE,
heatmap_legend_param =list(
# Set the orientation of the legend (vertical)
legend_direction = "vertical",
# Set the height of the legend
legend_height = unit(50, "mm"),
# Set the width of each grid in the legend
grid_width = unit(6, "mm"),
# Set the height of each grid in the legend
grid_height = unit(50, "cm"),
# Set the title of the legend
title = "Expression"))
}
}
######################################################################################################################################
########################################################## 18- GSEAAnalysis ##########################################################
# Function to perform Gene Set Enrichment Analysis (GSEA)
# Arguments:
# - projname: The project name or identifier
# - FDR: The false discovery rate (FDR) threshold for significance
# Input:
# - DEG_table: Data frame containing differential expression gene (DEG) information
# - H: Data frame containing hallmark pathway gene sets
# - FC: Data frame containing gene names and logFC values
# Output:
# - GSEA plot visualizing the enriched gene sets
# Function Description:
# This function performs Gene Set Enrichment Analysis (GSEA) using the fgseaSimple function.
# It takes the project name and FDR threshold as input and generates a GSEA plot of enriched gene sets.
# The function reads the DEG table specific to the project name and prepares gene scores and gene set lookup table.
# GSEA is performed using fgseaSimple, and the results are visualized in a GSEA plot.
# The significance of gene sets is determined based on FDR and NES values.
# The plot includes pathway names, NES values, and colored bars representing significance.
# Function to perform Gene Set Enrichment Analysis (GSEA)
GSEAAnalysis <- function(projname , FDR){
# Read the DEG table file specific to the given project name and create a data frame
DEG_table <- data.frame (read_tsv(paste0("MetaAnalysis/DEG/", projname , "_up_down.tsv")))
# Removing /// from some genes
DEG_table$Gene.symbol <- gsub("///.*", "", DEG_table$Gene.symbol)
# Retrieve hallmark pathway gene sets specific to the "Homo sapiens" species
H <- msigdbr(species = "Homo sapiens", category = "H")
# Store the result of the following operations in H.symbol.ls
H.symbol.ls <- H %>%
# Select the gs_name and gene_symbol columns from H
select(gs_name, gene_symbol) %>%
# Group the data by gs_name
group_by(gs_name) %>%
# Summarize the unique gene_symbols for each gs_name
summarise(all.genes =list(unique(gene_symbol))) %>%
# Convert the summarized data into a lookup table format
deframe()
# Create a new data frame FC with columns 3 and 1 from DEG_table (gene names and logFC)
FC <- DEG_table[,c(3,1)]
# Create a vector FC.vec containing the values from the zSco column of the FC data frame
FC.vec <- FC$zSco
# Assign the Gene.symbol column values from the FC data frame as the names of the FC.vec vector
names(FC.vec) <- FC$Gene.symbol
# Perform Gene Set Enrichment Analysis (GSEA) using the fgseaSimple function
gsea.H <- fgseaSimple(
# - The pathways argument specifies the gene set lookup table H.symbol.ls
pathways = H.symbol.ls,
# - The stats argument provides the FC.vec vector with gene scores
stats = FC.vec,
# - The scoreType is set to "std" for standard scoring
scoreType = "std",
# - The nperm parameter is set to 1000 for the number of permutations to perform
# in order to estimate the significance of the enrichment scores
nperm=1000)
# Create a vector my_pal with hexadecimal color codes representing a color palette
my_pal <- c("#1B9E77","#7570B3", "#E7298A", "#66A61E", "#E6AB02", "#A6761D", "#666666", "#9A7D0A")
# Clean up pathway names in the gsea.H object:
# Remove the prefix "HALLMARK_" from the pathway column in the gsea.H object
gsea.H$pathway <- gsub("HALLMARK_","", gsea.H$pathway)
# Replace underscores "_" with spaces " " in the pathway column of the gsea.H object
gsea.H$pathway <- gsub("_"," ", gsea.H$pathway)
# Calculating the Significance based on FDR and NES
gsea.H <- gsea.H %>%
# Update the Significance column in the gsea.H object with initial values of "NotSignificance"
mutate(Significance = "NotSignificance") %>%
# Update the Significance values based on the conditions:
# - If padj is less than or equal to FDR and NES is greater than 0, assign "Upregulated"
mutate(Significance = ifelse(padj <= FDR & NES > 0, "Upregulated", Significance)) %>%
# - If padj is less than or equal to FDR and NES is less than 0, assign "Downregulated"
mutate(Significance = ifelse(padj <= FDR & NES < 0, "Downregulated", Significance))
# Filter the gsea.H object to retain only the rows where Significance is not equal to "NotSignificance
gsea.H <- gsea.H[which(gsea.H$Significance != "NotSignificance"), ]
# Create a ggplot visualization using the gsea.H object
gsea.H %>%
# - The x-axis represents the pathway names reordered by NES values
# - The y-axis represents the NES values
ggplot(aes(x=reorder(pathway, NES), y=NES)) +
# - Display the bars with a width of 0.5, coloring them based on the Significance column
geom_col(width = 0.5, aes(color= Significance, fill = Significance)) +
# - Add labels for the x-axis ("Gene set") and y-axis ("Normalized enrichment score (NES)")
labs(x= 'Gene set', y= 'Gene set') +
# - Apply a classic theme to the plot
theme_classic() +
# - Customize the colors of the bars and legend using the "blue" and "red" values:
# Set the colors for data points using a manual color scale
scale_color_manual(values=c("blue", "red")) +
# Set the colors for filled areas using a manual fill scale
scale_fill_manual(values=c("blue", "red")) +
# - Flip the coordinates to have horizontal bars instead of vertical bars
coord_flip() +
# Customize the appearance of various plot elements (the axes, text, and title) using the theme function
theme(
# - Modify the font family, size, and color of the axis text
axis.text = element_text(family = "Arial",size = 24 , colour = "black"),
# - Customize the font family, color, and size of the x-axis text
axis.text.x = element_text(family = "Arial",colour = "black", size = 24),
# - Customize the font family, color, and size of the y-axis text
axis.text.y = element_text(family = "Arial",colour = "black", size = 9),
# - Adjust the appearance of the plot subtitle, including font family, size, color, and horizontal justification
plot.subtitle = element_text(family = "Arial",size = 24, colour = "black", hjust = 0.5),
# - Customize the appearance of the y-axis title, including font family, size, and angle (90-degree rotation)
axis.title.y = element_text(family = "Arial", size = 24, angle = 90),
# - Customize the appearance of the x-axis title, including font family, size, and angle (0-degree rotation)
axis.title.x = element_text(family = "Arial", size = 24, angle = 00),
# - Modify the size and font family of the legend text
legend.text = element_text(size = 10, family = "Arial"),
# - Modify the size and font family of the legend title
legend.title = element_text(size = 20, family = "Arial")) +
# - Set the subtitle of the plot to include the project name (projname)
labs(subtitle = paste0("GSEA - ", projname))
}
######################################################################################################################################
################################################### Running all functions ############################################################
# Downloading GEO data for specific GEO accession numbers
DownloadGEO("GSE25065")
DownloadGEO("GSE11121")
DownloadGEO("GSE7390")
DownloadGEO("GSE25055")
# Read the GEO matrices into the R environment for the specified GEO accession numbers
GSE11121 <- ReadGEO ("GSE11121")
GSE25055 <- ReadGEO ("GSE25055")
GSE25065 <- ReadGEO ("GSE25065")
GSE7390 <- ReadGEO ("GSE7390")
# Call the MakePhenotype function to create phenotype data for the specified GEO datasets
Makephenotype (projname = c("GSE11121", "GSE25055", "GSE25065", "GSE7390"),
compare = "grade")
# Call the MakeBoxPlot function to create box plots for the specified GEO matrices and project names
MakeBoxPlot(GEOmatrix = GSE11121, projname = "GSE11121")
MakeBoxPlot(GEOmatrix = GSE25065, projname = "GSE25065")
MakeBoxPlot(GEOmatrix = GSE25055, projname = "GSE25055")
MakeBoxPlot(GEOmatrix = GSE7390, projname = "GSE7390")
# Call the MakePCA function to perform Principal Component Analysis (PCA) for the specified GEO matrices
# and project names, using the "grade" comparison
MakePCA(GEOmatrix = GSE11121, projname = "GSE11121", compare = "grade")
MakePCA(GEOmatrix = GSE25065, projname = "GSE25065", compare = "grade")
MakePCA(GEOmatrix = GSE25055, projname = "GSE25055", compare = "grade")
MakePCA(GEOmatrix = GSE7390, projname = "GSE7390", compare = "grade")
# Apply VSN (Variance Stabilizing Normalization) quantile normalization to the specified GEO matrices
normalized_GSE11121 <- VSNQuantilNorm(GEOmatrix = GSE11121)
normalized_GSE25055 <- VSNQuantilNorm(GEOmatrix = GSE25055)
normalized_GSE25065 <- VSNQuantilNorm(GEOmatrix = GSE25065)
normalized_GSE7390 <- VSNQuantilNorm(GEOmatrix = GSE7390)
# Generate box plots and perform PCA analysis for the specified normalized GEO matrices and project names after normalization
MakeBoxPlot(GEOmatrix = normalized_GSE11121, projname = "GSE11121")
MakePCA(GEOmatrix = normalized_GSE11121, projname = "GSE11121", compare = "grade")
MakeBoxPlot(GEOmatrix = normalized_GSE25055, projname = "GSE25055")
MakePCA(GEOmatrix = normalized_GSE25055, projname = "GSE25055", compare = "grade")
MakeBoxPlot(GEOmatrix = normalized_GSE25065, projname = "GSE25065")
MakePCA(GEOmatrix = normalized_GSE25065, projname = "GSE25065", compare = "grade")
MakeBoxPlot(GEOmatrix = normalized_GSE7390, projname = "GSE7390")
MakePCA(GEOmatrix = normalized_GSE7390, projname = "GSE7390", compare = "grade")
# Merging studies without batch correction (just for comparison)
merge_studies <- Mergestudies(list(normalized_GSE25055, normalized_GSE11121, normalized_GSE25065, normalized_GSE7390))
# Perform PCA (before batch effect) on merged gene expression matrix using multiple studies and compare based on "grade"
# Important note : studies should match projname
MakePCA(GEOmatrix = merge_studies,
studies = list(normalized_GSE25055, normalized_GSE11121, normalized_GSE25065, normalized_GSE7390),
projname = c("GSE25055", "GSE11121", "GSE25065", "GSE7390"),
compare = "grade")
# Batch effect correction
corbatch <- StudyBatchEffect(studies = list(normalized_GSE25055, normalized_GSE11121, normalized_GSE25065, normalized_GSE7390))
# Perform PCA on batch-corrected data and compare based on "grade"
# Important note : studies should match projname
MakePCA(GEOmatrix = corbatch,
studies = list(normalized_GSE25055, normalized_GSE11121, normalized_GSE25065, normalized_GSE7390),
projname = c("GSE25055", "GSE11121", "GSE25065", "GSE7390"), compare = "grade")
# Create density plots before batch effect correction for gene expression in each study
MakeDensityPlot(matrix = merge_studies,
studies = list(GSE25055 = normalized_GSE25055,
GSE11121 = normalized_GSE11121,
GSE25065 = normalized_GSE25065,
GSE7390 = normalized_GSE7390))
# Density plot after batch effect correction
MakeDensityPlot(matrix = corbatch,
studies = list(GSE25055 = normalized_GSE25055,
GSE11121 = normalized_GSE11121,
GSE25065 = normalized_GSE25065,
GSE7390 = normalized_GSE7390))
# MDS plot before batch effect correction
# Important note : studies should match projname
MakeMDSPlot(matrix = merge_studies,
studies = list(normalized_GSE25055, normalized_GSE11121, normalized_GSE25065, normalized_GSE7390),
projname = c("GSE25055", "GSE11121", "GSE25065", "GSE7390"),
compare = "grade",
nstudies = 4)
# Create MDS (Multidimensional Scaling) plot, after batch effect correction, using merged gene expression data from 4 studies, comparing based on "grade"
# Important note : studies should match projname
MakeMDSPlot(matrix = corbatch,
studies = list(normalized_GSE25055, normalized_GSE11121, normalized_GSE25065, normalized_GSE7390),
projname = c("GSE25055", "GSE11121", "GSE25065", "GSE7390"),
compare = "grade",
nstudies = 4)
# Perform Cochran's Q test on the studies, comparing groups "3" and "1" based on "grade"
cohranqtest <- CochranQTest(studies = list(GSE11121, GSE25065,GSE25055, GSE7390),
groups = c("3", "1"),
compare = "grade")
# Create QQ plot using Cochran's Q test results from 4 studies
MakeQQPlot(cohranq = cohranqtest, nstudies = 4)
# Perform REM (Random Effects Model) Analysis on the studies, comparing groups "3" and "1" based on "grade" with FDR of 0.05 and 50 permutations.
FEMREMAnalysis(studies = list(normalized_GSE25055, normalized_GSE11121, normalized_GSE25065, normalized_GSE7390),
projname = c("GSE25055", "GSE11121","GSE25065", "GSE7390"),
compare = "grade",
model = "REM",
groups = c("3", "1"),
FDR = 0.05,
nperm = 50)
# Create a volcano plot for project "GSE11121" with an adjusted p-value threshold of 0.05, positive SE threshold of 1, negative SE threshold of -1, displaying the top 10 results, using the "ggplot" style
volcano_GSE11121 <- Makevolcano(projname = "GSE11121",
padjlevel = 0.05,
pSE_thresh = 1,
nSE_thresh = -1,
ntop = 10,
voltype = "ggplot")
# Create a volcano plot for project "GSE7390" with an adjusted p-value threshold of 0.05, positive SE threshold of 1, negative SE threshold of -1, displaying the top 10 results, using the "ggplot" style
volcano_GSE7390 <- Makevolcano(projname = "GSE7390",
padjlevel = 0.05,
pSE_thresh = 1,
nSE_thresh = -1,
ntop = 10,
voltype = "ggplot")
# Create a volcano plot for project "GSE25065" with an adjusted p-value threshold of 0.05, positive SE threshold of 1, negative SE threshold of -1, displaying the top 10 results, using the "ggplot" style
volcano_GSE25065 <- Makevolcano(projname = "GSE25065",
padjlevel = 0.05,
pSE_thresh = 1,
nSE_thresh = -1,
ntop = 10,
voltype = "ggplot")
# Create a volcano plot for project "GSE25055" with an adjusted p-value threshold of 0.05, positive SE threshold of 1, negative SE threshold of -1, displaying the top 10 results, using the "ggplot" style
volcano_GSE25055 <- Makevolcano(projname = "GSE25055",
padjlevel = 0.05,
pSE_thresh = 1,
nSE_thresh = -1,
ntop = 10,
voltype = "ggplot")
# Combination
# Create a volcano plot for the combined data "combination" with an adjusted p-value threshold of 0.05, positive SE threshold of 1, negative SE threshold of -1, displaying the top 10 results, using the "ggplot" style
volcano_combination <- Makevolcano(projname = "combination",
padjlevel = 0.05,
pSE_thresh = 1,
nSE_thresh = -1,
ntop = 10,
voltype = "ggplot")
# Create a Venn diagram for the common elements among "GSE25055", "GSE11121", "GSE25065", and "GSE7390" studies, considering the top 10 elements
MakeVenna(common = c("GSE25055", "GSE11121","GSE25065", "GSE7390"), ntop = 10)
# Create a heatmap using the gene expression matrix from "normalized_GSE25055" for project "GSE25055", comparing based on "grade" and grouping samples into "grade1" and "grade3". Display the top 25 elements and do not sort the heatmap.
MakeHeatmap(matrix = normalized_GSE25055,
projname = "GSE25055",
compare = "grade",
groups =list(grade1 = "1", grade3 = "3"),
ntop = 25,
sorted = FALSE)
# Create a heatmap using the gene expression matrix from "normalized_GSE11121" for project "GSE11121", comparing based on "grade" and grouping samples into "grade1" and "grade3". Display the top 25 elements and do not sort the heatmap.
MakeHeatmap(matrix = normalized_GSE11121,
projname = "GSE11121",
compare = "grade",
groups =list(grade1 = "1", grade3 = "3"),
ntop = 25,
sorted = FALSE)
# Create a heatmap using the gene expression matrix from "normalized_GSE25065" for project "GSE25065", comparing based on "grade" and grouping samples into "grade1" and "grade3". Display the top 25 elements and do not sort the heatmap.
MakeHeatmap(matrix = normalized_GSE25065,
projname = "GSE25065",
compare = "grade",
groups =list(grade1 = "1", grade3 = "3"),
ntop = 25,
sorted = FALSE)
# Create a heatmap using the gene expression matrix from "normalized_GSE7390" for project "GSE7390", comparing based on "grade" and grouping samples into "grade1" and "grade3". Display the top 25 elements and do not sort the heatmap.
MakeHeatmap(matrix = normalized_GSE7390,
projname = "GSE7390",
compare = "grade",
groups =list(grade1 = "1", grade3 = "3"),
ntop = 25,
sorted = FALSE)
# Combination
# Create a heatmap using the merged gene expression matrix for the "combination" project, comparing based on "grade" and grouping samples into "grade1" and "grade3". Display the top 25 elements and do not sort the heatmap.
MakeHeatmap(matrix = merge_studies,
projname = "combination",
compare = "grade",
groups =list(grade1 = "1", grade3 = "3"),
ntop = 25,
sorted = FALSE)
# (Sorted grade) Create a heatmap using the merged gene expression matrix for the "combination" project, comparing based on "grade" and grouping samples into "grade1" and "grade3". Display the top 25 elements and sort the heatmap based on the values of "grade".
MakeHeatmap(matrix = merge_studies,
projname = "combination",
compare = "grade",
groups =list(grade1 = "1", grade3 = "3"),
ntop = 25,
sorted = TRUE)
# Perform Gene Set Enrichment Analysis (GSEA) for project "GSE25055" with a false discovery rate (FDR) of 0.05.
GSEAAnalysis(projname = "GSE25055", FDR = 0.05)
# Perform Gene Set Enrichment Analysis (GSEA) for project "GSE11121" with a false discovery rate (FDR) of 0.05.
GSEAAnalysis(projname = "GSE11121", FDR = 0.05)
# Perform Gene Set Enrichment Analysis (GSEA) for project "GSE25065" with a false discovery rate (FDR) of 0.05.
GSEAAnalysis(projname = "GSE25065", FDR = 0.05)
# Perform Gene Set Enrichment Analysis (GSEA) for project "GSE7390" with a false discovery rate (FDR) of 0.05.
GSEAAnalysis(projname = "GSE7390", FDR = 0.05)
# (Combination) Perform Gene Set Enrichment Analysis (GSEA) for the combined project "combination" with a false discovery rate (FDR) of 0.05.
GSEAAnalysis(projname = "combination", FDR = 0.05)
# Perform network analysis using the gene expression matrix from "GSE11121" for project "GSE11121", comparing based on "grade" and grouping samples into "grade1" and "grade3".
Makenetworkanalyst(matrix = GSE11121,
projname = "GSE11121",
compare = "grade",
groups =list(grade1 = "1", grade3 = "3"))
# Perform network analysis using the gene expression matrix from "GSE25055" for project "GSE25055", comparing based on "grade" and grouping samples into "grade1" and "grade3".
Makenetworkanalyst(matrix =GSE25055,
projname = "GSE25055",
compare = "grade",
groups =list(grade1 = "1", grade3 = "3"))
# Perform network analysis using the gene expression matrix from "GSE25065" for project "GSE25065", comparing based on "grade" and grouping samples into "grade1" and "grade3".
Makenetworkanalyst(matrix = GSE25065,
projname = "GSE25065",
compare = "grade",
groups =list(grade1 = "1", grade3 = "3"))
# Perform network analysis using the gene expression matrix from "GSE7390" for project "GSE7390", comparing based on "grade" and grouping samples into "grade1" and "grade3".
Makenetworkanalyst(matrix = GSE7390,
projname = "GSE7390",
compare = "grade",
groups =list(grade1 = "1", grade3 = "3"))