Phase 2 <br> R Code (4-Dataset Analysis

R Code For 4 Datasets (FC/FDR)



			
			

# Load the GEOquery library for accessing GEO data			
require(GEOquery)
# Load the limma library for differential expression analysis			
require(limma)
# Load the tidyverse library, which includes several useful packages for data manipulation and visualization		
require(tidyverse)
# Load the plotly library for interactive plots			
require(plotly)
# Load the ggvenn library for creating Venn diagrams using ggplot2			
require(ggvenn)
# Uncomment the line below if devtools package is not installed
# if (!require(devtools)) install.packages("devtools")
# devtools::install_github("yanlinlin82/ggvenn")
require(ggvenn)

# Function to download the specified GEO dataset 
DownloadGEO <- function(GSEname = "GSE11121") {
  # Create a folder for the dataset
  dir.create(GSEname, showWarnings = FALSE)

  # Increase memory allocation for efficient data processing
  Sys.setenv("VROOM_CONNECTION_SIZE" = 131072 * 10)
  
  # Download the dataset from the GEO database
  gset <- getGEO(GSEname, GSEMatrix = TRUE, AnnotGPL = TRUE,
                 destdir = paste0(getwd(), "/", GSEname))
  
  # Check if the dataset was successfully downloaded
  if (!is.null(gset)) {  
    # Extract matrix data from the downloaded dataset
    matrix <- gset[[paste0(GSEname, "_series_matrix.txt.gz")]]@assayData[["exprs"]]
    
    # Perform log2 transformation if the matrix is not already transformed
    # Determine if log2 transformation is required based on the quantiles of the matrix values
    qx <- as.numeric(quantile(matrix, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm = TRUE))
    LogC <- (qx[5] > 100) || (qx[6] - qx[1] > 50 && qx[2] > 0)
    
    # Check if log2 transformation is required
    if (LogC) {
      # Replace non-positive values with NA
      matrix[matrix <= 0] <- NA
      # Perform log2 transformation on the matrix
      matrix <- log2(matrix)
    }
    
    # Add row names to the matrix:
    # Add row names as the first column of the matrix using cbind()
    matrix <- cbind(rownames(matrix), matrix)
    # Set the column name of the first column as "ID" using colnames()
    colnames(matrix)[1] <- "ID"
    
    # Extract phenotype information from the dataset:
    # Extract the phenotype information from the dataset
    phenotype <- gset[[paste0(GSEname, "_series_matrix.txt.gz")]]@phenoData@data  
    # Add row names as the first column of the phenotype data using cbind()
    phenotype <- cbind(rownames(phenotype), phenotype)  
    # Set the column name of the first column as "ID" using colnames()
    colnames(phenotype)[1] <- "ID"  
    
    # Extract annotation information from the dataset
    annot <- gset[[paste0(GSEname, "_series_matrix.txt.gz")]]@featureData@data
    
    # Write the extracted variables to files:
    # Write the 'matrix' variable to a CSV file
    write.csv(matrix, paste0(GSEname, "/", "matrix.csv"), quote = FALSE, row.names = FALSE)
    # Write the 'phenotype' variable to a TSV file
    write_tsv(phenotype, paste0(GSEname, "/", "phenotype.tsv"))
    # Write the 'annot' variable to a TSV file
    write_tsv(annot, paste0(GSEname, "/", "annot.tsv"))
    
    # Remove unnecessary variables to free up memory
    rm(gset, matrix, phenotype, annot)

    # Display a completion message
    message("+++++ Dataset is ready to be analyzed. +++++")
  }
}


# Function to perform differential expression analysis on the specified GEO dataset.
DEGanalysis <- function(projname,
                        compare = "subtype",
                        groups,
                        adjust = "fdr") {
  # Reading the files:
  # Read the matrix file and store it in a variable
  matrix <- read_csv(paste0(projname, "/", "matrix.csv"), )
  # Convert the matrix into a data frame
  matrix <- data.frame(matrix)
  # Set row names of the matrix
  rownames(matrix) <- matrix[, 1]
  # Remove the first column of the matrix
  matrix <- matrix[, -1]
  
  # Read the phenotype file and store it in a variable
  phenotype <- read_tsv(paste0(projname, "/", "phenotype.tsv"))
  # Convert the phenotype into a data frame
  phenotype <- data.frame(phenotype)
  # Set row names of the phenotype
  rownames(phenotype) <- phenotype[, ]
  # Remove the first column of the phenotype
  phenotype <- phenotype[, -1]
  
  # Read the annotation file and store it in a variable
  annot <- read_tsv(paste0(projname, "/", "annot.tsv"))
  # Convert the annotation into a data frame
  annot <- data.frame(annot)
  # Set row names of the annotation
  rownames(annot) <- annot[, 1]
  # Remove the first column of the annotation
  annot <- annot[, -1]
  
  # Extract sample IDs based on user-specified groups for differential analysis
  if (compare == "subtype") {
    
    # Preparing the breast cancer subtype
    phecoln <- grep("pam50", phenotype)
    if (length(phecoln) > 0) {
      # Prepare breast cancer subtypes into a dataframe
      pheno <- data.frame(phenotype[, phecoln])
      # Extract relevant columns from the phenotype file for breast cancer subtypes and rename them
      colnames(pheno) <- "subtype"
      pheno$subtype <- gsub("pam50.+:", "", pheno$subtype)
      rownames(pheno) <- rownames(phenotype)
      # Store the extracted subtypes in a list, grouped by user-specified group names
      grouplis <- list()
      # Iterate over each group specified by the user
      for (i in seq_along(groups)) {
        # Create a regular expression pattern by concatenating group elements with a "|"
        name <- paste0(groups[[i]], collapse = "|")
        # Find the row numbers in the "subtype" column of the phenotype data that match the pattern
        rownum <- grep(name, pheno$subtype)
        # Store the row names corresponding to the matched rows in the grouplis list
        grouplis[[names(groups)[i]]] <- rownames(pheno)[rownum]
      }
    }
  } else {
    
    # Preparing the breast cancer grades:
    # Find columns containing "grade:" in the 'phenotype' data frame
    gradecoln <- grep("grade:", phenotype)
    
    # Check if there are any columns with "grade:" found
    if (length(gradecoln) > 0) {
      # Prepare breast cancer grades into a dataframe
      grade <- data.frame(phenotype[, gradecoln])
      
      # Extract relevant columns from phenotype file for breast cancer grades and rename them:
      # Rename the column in 'grade' data frame to "grade"
      colnames(grade) <- "grade"
      # Remove unwanted text patterns from the 'grade' column using regular expressions
      grade$grade <- gsub("grade:|grade: |B-R grade: ", "", grade$grade)
      grade$grade <- gsub("=.+", "", grade$grade)
      
      # Identify rows containing the pattern "III" in the 'grade' column
      greek <- grep("III", grade$grade)
      if (length(greek) > 0) {
        
        # Convert grade names to numeric values:
        # Identify rows with grade "I"
        one <- which(factor(grade$grade) == "I")
        # Identify rows with grade "II"
        two <- which(factor(grade$grade) == "II")
        # Identify rows with grade "III"
        three <- which(factor(grade$grade) == "III")
        
        # Convert grade values to numeric: "I" -> 1, "II" -> 2, "III" -> 3
        grade[one, ] <- "1"
        grade[two, ] <- "2"
        grade[three, ] <- "3"
      }
      
      # Set row names of 'grade' data frame to row names of 'phenotype'
      rownames(grade) <- rownames(phenotype)
      
      # Store the extracted grades in a list, grouped by user-specified group names
      grouplis <- list()
      for (i in seq_along(groups)) {
        # Generate a group name by collapsing the elements of 'groups' with "|"
        name <- paste0(groups[[i]], collapse = "|")
        # Find the row numbers in 'grade' where the group name is present in the 'grade' column
        rownum <- grep(name, grade$grade)
        # Store the corresponding row names in the 'grouplis' list under the appropriate group name
        grouplis[[names(groups)[i]]] <- rownames(grade)[rownum]
      }
    }
  }
  
  # Rename the matrix file according to the groups name specified by the user:
  # Iterate over each group in grouplis		
  for (i in seq_along(grouplis)) {
    # Generate a group name by collapsing the elements of 'grouplis[[i]]' with "|"
    matname <- paste0(grouplis[[i]], collapse = "|")
    # Find the column numbers in 'matrix' where the group name is present in the column names
    matcoln <- grep(matname, colnames(matrix))
    # Rename the columns of 'matrix' with the group name followed by a numeric index
    colnames(matrix)[matcoln] <- paste0(names(grouplis)[i], 1:length(grouplis[[i]]))
  }
   
  # Keep only the renamed samples in the matrix and order them:
  # Subset and reorder the 'matrix' data frame based on the group names
  matnames <- grep(paste0(names(grouplis), collapse = "|"), colnames(matrix))
  # Subset the 'matrix' data frame to include only the columns corresponding to the renamed samples
  matrix <- matrix[, matnames]
  # Reorder the columns of the 'matrix' data frame based on the column names
  matrix <- matrix[, order(colnames(matrix))]
  
  # Creating a factor of 0 and 1 according to the length of groups specified by the user:
  # Create an empty list to store the factors
  gname <- list()
  # Iterate over each group in grouplis
  for (i in seq_along(grouplis)) {
    # Count the number of columns in matrix corresponding to the group
    number <- length(grep(names(grouplis)[i], colnames(matrix)))
    # Create a factor variable with values i - 1 repeated number times
    gname[[i]] <- factor(rep(i - 1, number))
  }
  # Convert the list of factors to a single vector
  grouping <- unlist(gname)
  
  # Create a design matrix for the linear model:
  # Create a design matrix with the factor variables as predictors
  design <- model.matrix(~ i - 0 + grouping)
  # Assign column names to the design matrix based on the group names
  colnames(design) <-names(grouplis)
  
  # Fit the linear model to the data:
  # Fit the linear model using the matrix and design matrix
  fit1 <- lmFit(matrix, design)
  # Get the number of group names
  l <- length(names(grouplis))
  # Create a string representation of group names for contrasts
  cts <- paste0(names(grouplis)[l:1], collapse = "-")
  # Create contrasts for pairwise comparisons of groups
  cont.matrix <- makeContrasts(contrasts = cts, levels = design)
  # Fit the contrasts to the model
  fit2 <- contrasts.fit(fit1, cont.matrix)
  # Perform empirical Bayes moderation on the fitted model
  fit2 <- eBayes(fit2)
  
  # Generate a top table of differentially expressed genes
  DEGs <- topTable(fit2, adjust = adjust, number = Inf)
  # Add row names as a separate column in the DEGs table
  DEGs <- cbind(rownames(DEGs), DEGs)
  # Rename the first column as "ID"
  colnames(DEGs)[1] <- <- "ID"
  
  # Preparing the annotation variable:
  # Extract relevant columns from the 'annot' data frame
  annotation <- annot[, c("Gene.symbol", "Chromosome.location", "GO.Function")]
  # Add row names as a separate column in the 'annotation' data frame
  annotation <- cbind(rownames(annotation), annotation)
  # Rename the first column as "ID"
  colnames(annotation)[1] <- <- "ID"
  
  # Making sure that both DEGs and annotation row names are the same:
  # Sort the 'annotation' data frame by the "ID" column
  annotation <- annotation[order(annotation$ID), ]
  # Sort the 'DEGs' data frame by the "ID" column
  DEGs <- DEGs[order(DEGs$ID), ]
  
  # Merge the top table with the annotation data 
  DEGs <- cbind(DEGs, annotation[, c(2, 3, 4)])
  
  # Removing missing gene symbols:
  # Find the indices of rows with missing gene symbols
  genemissing <- which(is.na(DEGs$Gene.symbol) == TRUE)
  # Remove the rows with missing gene symbols from the 'DEGs' data frame
  DEGs <- DEGs[-genemissing, ]
  
  # Write the resulting top table to a file
  write_tsv(DEGs, paste0(projname, "/", "DEGs.tsv"))
  
  # Display a completion message
  message("+++++ Analysis has completed. +++++")
}



# Function to generates volcano plots for the specified GEO datasets.
Makevolcano <- function(projname,
                        padjlevel = 0.05,
                        uplogFC = 1,
                        downlogFC = -1,
                        ntop = 10,
                        voltype = "plotly") {
  # Read the tab-separated DEGs file into a data frame
  table <- read_tsv(paste0(projname, "/", "DEGs.tsv"))
  table <- data.frame(table)
  
  # Define a vector of color codes
  my_pal <- c("#1B9E77", "#7570B3", "#E7298A", "#66A61E", "#E6AB02", "#A6761D", "#666666", "#9A7D0A")
  
  # Add DEG column based on the value of adj.P.Val and logFC:
  volcano <- table %>%
    # Calculate AdjustedPvalue as the negative logarithm (base 10) of adj.P.Val
    mutate(AdjustedPvalue = -log10(adj.P.Val)) %>%
    # Initialize DEG column with "Non-Significant" for all rows
    mutate(DEG = "NotSignificant") %>%
    # Update DEG based on conditions for upregulation
    mutate(DEG = ifelse(AdjustedPvalue > -log10(padjlevel) & logFC > uplogFC, "Upregulated", DEG)) %>%
    # Update DEG based on conditions for downregulation
    mutate(DEG = ifelse(AdjustedPvalue > -log10(padjlevel) & logFC < downlogFC, "Downregulated", DEG))
  
  # Separate the DEGs into three categories: Upregulated, Downregulated, Non-Significant:
  # Subset the 'volcano' data frame to select only the rows where the 'DEG' column is equal to "Upregulated".
  volcano_up <- volcano[which(volcano$DEG == "Upregulated"), ]
  # Sort the 'volcano_up' data frame in descending order based on the 'logFC' column.
  volcano_up <- volcano_up[order(volcano_up$logFC, decreasing = T), ]
  # Subset the 'volcano' data frame to select only the rows where the 'DEG' column is equal to "Downregulated".
  volcano_down <- volcano[which(volcano$DEG == "Downregulated"), ]
  # Sort the 'volcano_down' data frame in ascending order based on the 'logFC' column.
  volcano_down <- volcano_down[order(volcano_down$logFC, decreasing = F), ]
  # Subset the 'volcano' data frame to select only the rows where the 'DEG' column is equal to "Non-Significant".
  volcano_NA <- volcano[which(volcano$DEG == "NotSignificant"), ]
  
  # Combine the DEGs into a single data frame:
  # Combine the 'volcano_up', 'volcano_down', and 'volcano_NA' data frames vertically using the 'rbind' function.
  volcano_all <- rbind(volcano_up, volcano_down, volcano_NA)
  
  # Replace the top probes with the word "Top" in the DEGs column:
  # Create a vector 'ntop_rep' with length 'ntop' where each element is "Top".
  ntop_rep <- rep("Top", ntop)
  # Create a new column 'DEGs' in the 'volcano_all' data frame.
  volcano_all$DEGs <- c(ntop_rep, volcano_up$DEG[-c(1:ntop)], ntop_rep, volcano_down$DEG[-c(1:ntop)], volcano_NA$DEG)
  
  # Label only top genes:
  # Assign the gene symbol as a label for top genes
  volcano_all$Top <- ifelse(volcano_all$DEGs == "Top", volcano_all$Gene.symbol, "")
  
  # Plotting either with plotly or ggplot
  if (voltype == "plotly") {
    # Create a ggplot object with layers and annotations
    p <- ggplot(data = volcano_all, aes(x = logFC, y = AdjustedPvalue, color = DEGs, fill = DEGs, text = paste("ID: ", ID,
													       # Tooltip text for gene symbol
	                                                                                                       "<br> Gene: ", Gene.symbol,
	                                                                                                       # Tooltip text for chromosome location
	                                                                                                       "<br> Chromosome: ", Chromosome.location,
	                                                                                                       # Tooltip text for GO function
	                                                                                                       "<br> GO function: ", GO.Function))) +
      # Set the x-axis label, y-axis label, and plot title
      labs(x = 'log2 (Fold Change)', y = "-log10(Adjusted P-value)", title = paste0("Volcano plot-", projname)) +
      # Add points to the plot with specified size and shape
      geom_point(size = 1, shape = 21) +
      # Set the color scale of the plot manually
      scale_color_manual(values = c(my_pal)) +
      # Set the fill color scale of the plot manually
      scale_fill_manual(values = c(paste(my_pal, "66", sep = ""))) +
      # Apply the "classic" theme to the plot
      theme_classic() +
      # Set axis text font and size
      theme(axis.text = element_text(family = "Arial", size = 24, colour = "black"),
            # Set x-axis text color and size
            axis.text.x = element_text(family = "Arial", colour = "black", size = 24),
            # Set y-axis text color and size
            axis.text.y = element_text(family = "Arial", colour = "black", size = 24),
            # Set subtitle font, size, and color
            plot.subtitle = element_text(family = "Arial", size = 24, colour = "black", hjust = 0.5),
            # Set y-axis title font, size, and angle
            axis.title.y = element_text(family = "Arial", size = 24, angle = 90),
            # Set x-axis title font, size, and angle
            axis.title.x = element_text(family = "Arial", size = 24, angle = 00))
    
    # Convert ggplot object to plotly object and customize layout
    p <- ggplotly(p, tooltip = "text") %>%
      # Set the title of the plot
      layout(title = paste0("Volcano plot-", projname),
             # Customize font properties
             font = list(family = "Arial", color = "black", size = 24),
             # Customize legend properties
             legend = list(family = "Arial", color = "black", size = 20, itemclick = "toggle"))
  } else {
    # Create a ggplot object with layers and annotations:
    p <- ggplot(data = volcano_all, aes(x = logFC, y = AdjustedPvalue, color = DEGs, fill = DEGs, label = Top)) + 
      # Set the labels for x and y axes
      labs(x = 'log2 (Fold Change)', y = "-log10(Adjusted P-value)") +
      # Add points to the plot
      geom_point(size = 1, shape = 21) +
      # Add text annotations to the plot
      geom_text(check_overlap = F, vjust = 0.1, nudge_y = 0.4) +
      # Set manual color scale for DEGs
      scale_color_manual(values = c(my_pal)) +
      # Set manual fill scale for DEGs
      scale_fill_manual(values = c(paste(my_pal, "66", sep = ""))) +
      # Use classic theme for the plot
      theme_classic() +
     
      theme(
        # Customize axis text properties
        axis.text = element_text(family = "Arial", size = 24, colour = "black"),
        # Customize x-axis text properties    
        axis.text.x = element_text(family = "Arial", colour = "black", size = 24),
        # Customize y-axis text properties    
        axis.text.y = element_text(family = "Arial", colour = "black", size = 24),
        # Customize subtitle text properties    
        plot.subtitle = element_text(family = "Arial", size = 24, colour = "black", hjust = 0.5),
        # Customize y-axis title properties    
        axis.title.y = element_text(family = "Arial", size = 24, angle = 90),
        # Customize x-axis title properties    
        axis.title.x = element_text(family = "Arial", size = 24, angle = 00),
        # Customize legend text properties    
        legend.text = element_text(size = 10, family = "Arial"), 
        # Customize legend title properties    
        legend.title = element_text(size = 20, family = "Arial")
      ) + 
      # Set the subtitle of the plot
      labs(subtitle = paste0("Volcano plot-", projname))
  }
  
  # Calculate the number of up-regulated and down-regulated genes:
  # Write the up-regulated and down-regulated genes to a TSV file
  write_tsv(rbind(volcano_up, volcano_down), paste0(projname, "/", "all_top_down.tsv"))
  
  # Create the top table containing the top up-regulated and down-regulated genes
  top <- rbind(volcano_up[1:ntop, ], volcano_down[1:ntop, ])
  # Reorder columns in the top table
  top <- top[, c(1:6, 11, 7, 8, 9, 10, 12)]
  # Rename the 7th column as "n.log10(adj.P.Val)"
  colnames(top)[7] <- "n.log10(adj.P.Val)"
  
  # Save the top table as a CSV file in the project folder
  write.csv(top, paste0(projname, "/", "top.csv"), quote = FALSE, row.names = FALSE)
  
  # Display the number of up-regulated and down-regulated genes
  message(sprintf("Up-regulated genes: %s; Down-regulated genes: %s", nrow(volcano_up), nrow(volcano_down)))
  
  # Return the generated plot
  return(p)
}


# Function to creates a Venn diagram to visualize common differentially expressed genes among specified GEO datasets.
MakeVenna <- function(common, ntop = 10) {
  
  # Create a directory to store the output files
  dir.create("common", showWarnings = FALSE)
  
  # Reading all all_top_down files and processing them
  com_table <- list()
  
  # Process each dataset
  for (i in seq_along(common)) {
    # Read the all_top_down file for the current dataset
    DEG_table <- data.frame(read_tsv(paste0(common[i], "/", "all_top_down.tsv")))
    
    # Finding duplicated genes
    # Find the indices of duplicated genes
    dup_genes_n <- which(duplicated(DEG_table$Gene.symbol))
    # Get unique duplicated genes
    dup_uniq_genes <- unique(DEG_table[dup_genes_n, "Gene.symbol"])
    # Find the indices of all duplicated genes
    find_dup <- which(DEG_table$Gene.symbol %in% dup_uniq_genes)
    # Extract all duplicated genes
    all_dup <- DEG_table[find_dup, ]
    # Remove duplicated genes from the DEG table
    DEG_table <- DEG_table[-find_dup, -11]
    
    # Calculation the mean for logFC/AveExpr/t value/P-value/adjusted P-value/B value of duplicated genes
    uniq_genes_list <- list()
    
    # Process each duplicated gene
    for (j in seq_along(dup_uniq_genes)) {
      # Find the indices of the current duplicated gene
      c <- which(all_dup$Gene.symbol %in% dup_uniq_genes[j])
      # Calculate the mean logFC
      b <- c(mean(all_dup$logFC[c]),
             # Calculate the mean AveExpr
             mean(all_dup$AveExpr[c]),
             # Calculate the mean t value
             mean(all_dup$t[c]),
             # Calculate the mean P-value
             mean(all_dup$P.Value[c]),
             # Calculate the mean adjusted P-value
             mean(all_dup$adj.P.Val[c]),
             # Calculate the mean B value
             mean(all_dup$B[c]))
      
      # Assign column names and create a new data frame with the mean values
      # Assign column names to the mean values
     names(b) <- c("logFC", "AveExpr", "t", "P.Value", "adj.P.Val", "B")
      # Create a new data frame with the mean values
      d <- data.frame(c(all_dup[c[1], -c(2, 3, 4, 5, 6, 7, 11)], b))
      # Reorder the columns of the data frame
      d <- d[, c(1, 6, 7, 8, 9, 10, 11, 2, 3, 4, 5)]
      # Store the data frame in the list for unique genes
      uniq_genes_list[[j]] <- d
    }
    
    # Joining the mean calculated and unique genes with the rest of genes
    # Combine the data frames in the list into one data frame
    uniq_genes_df <- do.call(rbind, uniq_genes_list)
    # Append the unique genes data frame to the DEG_table
    DEG_table <- rbind(DEG_table, uniq_genes_df)
    
    # Store the processed DEG table for the current dataset
    com_table[[common[i]]] <- DEG_table
  }
  
  # Calculating up and down-regulated genes for each dataset
  # Create an empty list to store the up and down-regulated genes
  DEGslist <- list()
  
  for (i in seq_along(com_table)) {
    # Extract the 8th column (DEGs) and store it in the list
    DEGslist[[common[i]]] <- com_table[[i]][8]
  }
  
  # Find common genes among all datasets
  com_prob <- Reduce(intersect, DEGslist)
  
  # Writing all common genes into separated files as well as the top files
  # Create an empty list to store all common genes
  all_com <- list()
  # Create an empty list to store all common genes (including all samples)
  all_com_all <- list()
  
  for (i in seq_along(com_table)) {
    # Extract the common differentially expressed genes for the current dataset
    comn <- which(com_table[[names(com_table)[i]]][["Gene.symbol"]] %in% com_prob$Gene.symbol)
    # Subset the current dataset with the common DEGs
    com_final_DEG  <- com_table[[names(com_table)[i]]][comn, ]
    # Sort the subsetted data frame based on the "DEG" column
    com_final_DEG <- com_final_DEG[order(com_final_DEG$DEG), ]
    
    # Save the common differentially expressed genes as a TSV file
    write_tsv(com_final_DEG, paste0("common", "/",names(com_table)[i], "_common_DEGs.tsv"))
    
    # Extract the top up-regulated genes for the current dataset
    com_final_up <- com_final_DEG[which(com_final_DEG$DEG == "Upregulated"), ]
    # Sort the subsetted data frame based on the "logFC" column in decreasing order
    com_final_up <- com_final_up[order(com_final_up$logFC, decreasing = TRUE), ]
    
    # Extract the top down-regulated genes for the current dataset
    com_final_down <- com_final_DEG[which(com_final_DEG$DEG == "Downregulated"), ]
    # Sort the subsetted data frame based on the "logFC" column in increasing order
    com_final_down <- com_final_down[order(com_final_down$logFC, decreasing = FALSE), ]
    
    # Save the top common differentially expressed genes as a TSV file
    write_tsv(rbind(com_final_up[1:ntop, ], com_final_down[1:ntop, ]), paste0("common", "/",names(com_table)[i], "_top_common_DEGs.tsv"))
    
    # Store the common differentially expressed genes and all genes for each dataset
    # Store the subsetted com_final_DEG data frame in all_com list
    all_com[[names(com_table)[i]]] <- com_final_DEG[, c(2:7)]
    # Store the complete com_final_DEG data frame in all_com_all list
    all_com_all[[names(com_table)[i]]] <- com_final_DEG
  }
  
  # Calculate the mean of logFC for all common genes in each dataset 
  # Combine the data frames in all_com list into a single data frame, all_com_df
  all_com_df <- do.call(cbind, all_com)
  # Find the column indices of columns containing "logFC" in their names
  logfc_n <- grep("logFC", colnames(all_com_df))
  
  # Extract the columns containing logFC values from all_com_all_df
  logfc_df <- all_com_df[, logfc_n]
  
  # Calculate the row means of logFC values to obtain the mean logFC for each common gene
  logfc_df <- data.frame(logFC = rowMeans(logfc_df))
  
  # Extract columns containing adj.P.Val values from all_com_df:
  # Find the column indices of columns containing "adj.P.Val" in their names
  adj.P.Val_n <- grep("adj.P.Val", colnames(all_com_df))
  # Extract the columns containing adj.P.Val values
  adj.P.Val_df <- all_com_df[, adj.P.Val_n]
  # Calculate the row means of adj.P.Val values to obtain the mean adj.P.Val for each common gene:
  # Create a data frame with a single column "adj.P.Val" containing the mean adj.P.Val values
  adj.P.Val_df <- data.frame(adj.P.Val = rowMeans(adj.P.Val_df))
  
  # Extract columns containing B values from all_com_df
  B_n <- grep("B", colnames(all_com_df))
  B_df <- all_com_df[, B_n]
  # Calculate the row means of B values to obtain the mean B for each common gene
  B_df <- data.frame(B = rowMeans(B_df))
  
  # Extract columns containing t values from all_com_df
  t_n <- grep(".t", colnames(all_com_df))
  t_df <- all_com_df[, t_n]
  # Calculate the row means of t values to obtain the mean t for each common gene
  t_df <- data.frame(t = rowMeans(t_df))
  
  # Extract columns containing P.Value values from all_com_df
  P.Value_n <- grep("P.Value", colnames(all_com_df))
  P.Value_df <- all_com_df[, P.Value_n]
  # Calculate the row means of P.Value values to obtain the mean P.Value for each common gene
  P.Value_df <- data.frame(P.Value = rowMeans(P.Value_df))
  
  # Extract columns containing AveExpr values from all_com_df:
  # Find the column indices of columns containing "AveExpr" in their names
  AveExpr_n <- grep("AveExpr", colnames(all_com_df))
  # Extract the columns containing AveExpr values
  AveExpr_df <- all_com_df[, AveExpr_n]
  # Calculate the row means of AveExpr values to obtain the mean AveExpr for each common gene:
  # Create a data frame with a single column "AveExpr" containing the mean AveExpr values
  AveExpr_df <- data.frame(AveExpr = rowMeans(AveExpr_df))
  
  # Combining all calculated means into one data frame 
  all_com_all_df <- data.frame(do.call(cbind, all_com_all[[1]]))
  # Removing unnecessary columns from all_com_all_df
  all_com_all_df <- all_com_all_df[, -c(2:7)]
  # Combining the calculated means with all_com_all_df
  all_com_all_df <- cbind(all_com_all_df, logfc_df, adj.P.Val_df, B_df, t_df, AveExpr_df, P.Value_df)
  # Reordering the columns of all_com_all_df
  all_com_all_df <- all_com_all_df[, c(1, 6, 10, 9, 11, 7, 8, 2, 3, 4, 5)]
  
  # Writing the combined data frame of all calculated means to a TSV file
  write_tsv(all_com_all_df, paste0("common", "/", "all_mean_common_DEGs.tsv"))
  
  # Displaying a message to indicate that the common genes have been saved
  message("+++++ Common genes between all datasets have been saved into the 'common' folder. +++++")
  
  # Creating a list of DEGs for each dataset to generate a Venn diagram
  venlist <- list()
  # Iterating over the DEGslist to populate the venlist
  for (i in seq_along(DEGslist)) {
    venlist[[common[i]]] <- DEGslist[[i]][[1]]
  }
  
  # Generate the Venn diagram using the logFC mean values for common genes
  # Set the seed for random number generation
  set.seed(1234)
  # Define color palette for the Venn diagram
  my_pal <- c("#1B9E77", "#7570B3", "#E7298A", "#66A61E", "#E6AB02", "#A6761D", "#666666", "#9A7D0A")
  # Create the Venn diagram using ggvenn package
  venaa <- ggvenn(venlist, 
                  fill_color = my_pal,
                  stroke_size = 0.5,
                  set_name_size = 4)
  
  # Return the generated Venn diagram
  return(venaa)
}



# Function to prepares files for network analysis using the specified GEO datasets.
Makenetworkanalyst <- function(projname,
                               compare = "grade",
                               groups) {
  # Reading files
  # Create 'networkanalyst' directory to store the output files
  dir.create("networkanalyst", showWarnings = FALSE)
  
  # Read the 'matrix.csv' file
  matrix <- read_csv(paste0(projname, "/", "matrix.csv"))
  matrix <- data.frame(matrix)
  
  # Read the 'phenotype.tsv' file
  phenotype <- read_tsv(paste0(projname, "/", "phenotype.tsv"))
  phenotype <- data.frame(phenotype)
  
  # Set the row names of the 'phenotype' dataframe
  rownames(phenotype) <- phenotype[, 1]
  
  # Calculating DEGs:
  # Read the 'all_top_down.tsv' file containing up and down genes
  updown <- data.frame(read_tsv(paste0(projname, "/", "all_top_down.tsv")))
  
  # Creating a file that contains up and down genes based on groups specified:
  
  # Identify the rows in 'matrix' corresponding to up and down genes
  updown_genes <- which(matrix$ID %in% updown$ID)
  # Create a new matrix 'updown_matrix' containing the identified rows
  updown_matrix <- matrix[updown_genes, ]
 
  # Add gene symbols to the 'updown_matrix' dataframe
  updown_matrix <- cbind(updown$Gene.symbol, updown_matrix)
  # Rename the first column as "Symbol"
  colnames(updown_matrix)[1] <-  "Symbol" 
  # Remove the second column from the dataframe
  updown_matrix <- updown_matrix[, -2]
  
  # Extract the gene symbols from the 'updown_matrix' dataframe
  genes <- updown_matrix$Symbol
  # Remove the first column (gene symbols) from 'updown_matrix', round the remaining expression values to 2 decimal places
  updown_matrix <- round(updown_matrix[-1], 2)
  # Add the gene symbols column back to the 'updown_matrix' dataframe
  updown_matrix <- cbind(genes, updown_matrix)
  
  if (compare ==  "subtype") {
    # Prepare data based on subtype comparison
    
    # Find the column index for 'pam50' in 'phenotype'
    phecoln <- grep( "pam50", phenotype)
    
    # Check if the length of 'phecoln' is greater than 0 (if there are subtype columns present in 'phenotype' dataframe.)
    if (length(phecoln) > 0) {
      # Extract the subtype column from 'phenotype'
      pheno <- data.frame(phenotype[, phecoln])
      # Rename the column to "subtype"
      colnames(pheno) <-  "subtype"
      
      # Clean up the subtype values by removing the "pam50" prefix
      pheno$subtype <- gsub( "pam50.+:",  "", pheno$subtype)
      # Assign the row names of 'phenotype' as the row names of 'pheno'
      rownames(pheno) <- rownames(phenotype)
      
      # Creating groups of samples based on specified conditions:
      # Create an empty list to store groups of samples
      grouplis <- list()
      for (i in seq_along(groups)) {
        # Combine the group names using the '|' separator
        name <- paste0(groups[[i]], collapse = "|")
        # Find the rows in 'pheno$subtype' that match the group name
        rownum <- grep(name, pheno$subtype)
        # Store the matching row names in 'grouplis' with the group name as the key
        grouplis[[names(groups)[i]]] <- rownames(pheno)[rownum]
      }
    }
  } else {
    # Preparing the breast cancer grades into a dataframe:
    # Find the column index containing "grade:" in the phenotype data
    gradecoln <- grep( "grade:", phenotype)
    # Check if any column contains "grade:"
    if (length(gradecoln) > 0) {
      # Extract the column with "grade:" and create a data frame
      grade <- data.frame(phenotype[, gradecoln])
      # Rename the column to "grade"
      colnames(grade) <-  "grade"
      # Remove "grade:", "grade: ", and "B-R grade: " from the values in the column
      grade$grade <- gsub( "grade:|grade: |B-R grade: ",  "", grade$grade)
      # Remove any characters after "=" in the values
      grade$grade <- gsub( "=.+",  "", grade$grade)
      # Find the rows with "III" in the grade column
      greek <- grep( "III", grade$grade)
      # Check if any rows contain "III"
      if (length(greek) > 0) {
        # Find the rows with "I", "II", "III" in the grade column
        one <- which(factor(grade$grade) ==  "I")
        two <- which(factor(grade$grade) ==  "II")
        three <- which(factor(grade$grade) ==  "III")
        # Replace the rows with "I" with "1"
        grade[one, ] <-  "1"
        # Replace the rows with "II" with "2"
        grade[two, ] <-  "2"
        # Replace the rows with "III" with "3"
        grade[three, ] <-  "3"
      }
      # Set the row names of the 'grade' data frame to match the row names of the 'phenotype' data frame
      rownames(grade) <- rownames(phenotype)
      # Create an empty list to store the group information
      grouplis <- list()
      for (i in seq_along(groups)) {
        # Concatenate the elements of 'groups' with a '|' separator
        name <- paste0(groups[[i]], collapse = "|")
        # Find the rows in the 'grade' data frame where the 'grade' column matches the group name
        rownum <- grep(name, grade$grade)
        # Store the row names corresponding to the group in the 'grouplis' list
        grouplis[[names(groups)[i]]] <- rownames(grade)[rownum]
      }
    }
  }
  
  # Renaming the matrix file according to the groups name specified by the user:
  # Bind the column names of 'updown_matrix' as the first row of 'updown_matrix'
  updown_matrix <- rbind(colnames(updown_matrix), updown_matrix)
  for (i in seq_along(grouplis)) {
    # Concatenate the elements of 'grouplis[[i]]' with a '|' separator
    matname <- paste0(grouplis[[i]], collapse = "|")
    # Find the columns in 'updown_matrix' that match the group name specified by 'matname'
    matcoln <- grep(matname, colnames(updown_matrix))
    # Replace the corresponding columns in the first row of 'updown_matrix' with the group name
    updown_matrix[1, matcoln] <-names(grouplis)[i]
  }
  
  # Adding column names and the second row according to the file for networkanalyst:
  # Find the columns in the first row of 'updown_matrix' that match the names of 'grouplis'
  changedcol <- grep(paste0(names(grouplis), collapse = "|"), updown_matrix[1, ])
  # Keep the first column and the columns selected by 'changedcol' in 'updown_matrix'
  updown_matrix <- updown_matrix[, c(1, changedcol)]
  # Replace the first cell of 'updown_matrix' with "#CLASS"
  updown_matrix[1, 1] <-  "#CLASS"
  # Replace the column name of the first column in 'updown_matrix' with "#NAME"
  colnames(updown_matrix)[1] <-  "#NAME"
  
  # Write 'updown_matrix' to a tab-separated file
  write.table(updown_matrix, paste0( "networkanalyst", "/", projname,  "_dataanlys.txt"), quote = FALSE, row.names = FALSE, sep = "\t")
  
  # Display a message indicating that the file has been saved
  message("+++++ Your file has been saved in the networkanalyst folder. +++++")
}


	  
#===================================================== Q1 =====================================================#
# other adjust methods ---> "holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr", "none"

################################################# GSE25055 ################################################# 
# Download the GEO dataset GSE25055
DownloadGEO( "GSE25055")

# Perform DEG analysis on GSE25055 dataset
DEGanalysis(projname = "GSE25055",
            compare = "grade",
            groups = list(normal = c("1"), tumor = c("3")),
            adjust = "fdr")

# Create a volcano plot for GSE25055 dataset
volcano_GSE25055 <- Makevolcano(projname = "GSE25055",
                                padjlevel = 0.05,
                                uplogFC = 1,
                                downlogFC = -1,
                                ntop = 10,
                                voltype = "ggplot")
volcano_GSE25055

# Prepare files for network analysis on GSE25055 dataset
Makenetworkanalyst(projname = "GSE25055",
                   compare = "grade",
                   groups = list(normal = c("1"), tumor = c("3")))

################################################# GSE7390 #################################################
# Download the GEO dataset GSE7390
DownloadGEO("GSE7390")

# Perform DEG analysis on GSE7390 dataset
DEGanalysis(projname = "GSE7390",
            compare = "grade",
            groups = list(normal = c("1"), tumor = c("3")),
            adjust = "fdr")

# Create a volcano plot for GSE7390 dataset
volcano_GSE7390 <- Makevolcano(projname = "GSE7390",
                               padjlevel = 0.05,
                               uplogFC = 1,
                               downlogFC = -1,
                               ntop = 10,
                               voltype = "ggplot")
volcano_GSE7390

# Prepare files for network analysis on GSE7390 dataset
Makenetworkanalyst(projname = "GSE7390",
                   compare = "grade",
                   groups = list(normal = c("1"), tumor = c("3")))

################################################# GSE11121 #################################################
# Download the GEO dataset GSE11121
DownloadGEO("GSE11121")

# Perform DEG analysis on GSE11121 dataset
DEGanalysis(projname = "GSE11121",
            compare = "grade",
            groups = list(normal = c("1"), tumor = c("3")),
            adjust = "fdr")

# Create a volcano plot for GSE11121 dataset
volcano_GSE11121 <- Makevolcano(projname = "GSE11121",
                                padjlevel = 0.05,
                                uplogFC = 1,
                                downlogFC = -1,
                                ntop = 10,
                                voltype = "ggplot")
volcano_GSE11121

# Prepare files for network analysis on GSE11121 dataset
Makenetworkanalyst(projname = "GSE11121",
                   compare = "grade",
                   groups = list(normal = c("1"), tumor = c("3")))

################################################# GSE25065 #################################################
# Download the GEO dataset GSE25065
DownloadGEO("GSE25065")

# Perform DEG analysis on GSE25065 dataset
DEGanalysis(projname = "GSE25065",
            compare = "grade",
            groups = list(normal = c("1"), tumor = c("3")),
            adjust = "fdr")

# Create a volcano plot for GSE25065 dataset
volcano_GSE25065 <- Makevolcano(projname = "GSE25065",
                                padjlevel = 0.05,
                                uplogFC = 1,
                                downlogFC = -1,
                                ntop = 10,
                                voltype = "ggplot")
volcano_GSE25065

# Prepare files for network analysis on GSE25065 dataset
Makenetworkanalyst(projname = "GSE25065",
                   compare = "grade",
                   groups = list(normal = c("1"), tumor = c("3")))


# Create a Venn diagram for the common differentially expressed genes across the datasets
MakeVenna(common = c("GSE25055", "GSE7390", "GSE11121", ), ntop = 10)