Phase 1 <br> R Code (1-Dataset Analysis

R Code(One Dataset)


			
			

# Load required packages
require(GEOquery)    # For downloading and processing GEO datasets
require(limma)       # For performing differential expression analysis
require(tidyverse)   # For data manipulation and visualization
require(plotly)      # For interactive plots


# Function to download a GEO dataset
DownloadGEO <- function(GSEname) {
  # Create a folder for the dataset
  dir.create(GSEname, showWarnings = FALSE)
  
  # Increase memory allocation
  Sys.setenv("VROOM_CONNECTION_SIZE" = 131072 * 10) 
  
  # Download the dataset
  gset <- getGEO(
    GSEname,
    GSEMatrix = TRUE,
    AnnotGPL = TRUE,
    destdir = paste0(getwd(), "/", GSEname)
  )
 
  if(!is.null(gset)) {
    # Extract matrix, phenotype, and annotation
    matrix <- gset[[paste0(GSEname, "_series_matrix.txt.gz")]]@assayData[["exprs"]]
    matrix <- cbind(rownames(matrix), matrix)
    colnames(matrix)[1] <- "ID"
    
    phenotype <- gset[[paste0(GSEname, "_series_matrix.txt.gz")]]@phenoData@data
    phenotype <- cbind(rownames(phenotype), phenotype)
    colnames(phenotype)[1] <- "ID"
    
    annot <- gset[[paste0(GSEname, "_series_matrix.txt.gz")]]@featureData@data
    
    # Write the extracted variables to files
    write.csv(matrix, paste0(GSEname, "/", "matrix.csv"), quote = FALSE, row.names = FALSE)
    write_tsv(phenotype, paste0(GSEname, "/", "phenotype.tsv")) 
    write_tsv(annot, paste0(GSEname, "/", "annot.tsv")) 
    
    # Remove variables from memory
    rm(gset, matrix, phenotype, annot)
    
    message("+++++ Dataset is ready to analyze. +++++")
  }
}

# # Function for performing differential expression analysis
DEGanalysis <- function(projname,
                        compare = "subtype",
                        groups,
                        adjust = "fdr") {
  # Load the matrix data from the CSV file
  matrix <- read_csv(paste0(projname, "/", "matrix.csv"))
  matrix <- data.frame(matrix)
  rownames(matrix) <- matrix[,1]
  matrix <- matrix[,-1]

  # Load the phenotype data from the TSV file
  phenotype <- read_tsv(paste0(projname, "/", "phenotype.tsv"))
  phenotype <- data.frame(phenotype)
  rownames(phenotype) <- phenotype[,1]
  phenotype <- phenotype[,-1]

  # Load the annotation data from the TSV file
  annot <- read_tsv(paste0(projname, "/", "annot.tsv"))
  annot <- data.frame(annot)
  rownames(annot) <- annot[,1]
  annot <- annot[,-1]
   
  (subtype) Extract the relevant column from the phenotype data and rename it as "subtype"
  phecoln <- grep("pam50", phenotype)
  pheno <- data.frame(phenotype[ ,phecoln])
  colnames(pheno) <- "subtype"
  pheno$subtype <- gsub("pam50.+:", "", pheno$subtype)
  rownames(pheno) <- rownames(phenotype)

  # (grade) Extract the relevant column from the phenotype data and rename it as "grade"
  gradecoln <- grep("grade:", phenotype)
  grade <- data.frame(phenotype[ ,gradecoln])
  colnames(grade) <- "grade"
  grade$grade <- gsub("grade:", "", grade$grade)
  grade$grade <- gsub("=.+", "", grade$grade)
  rownames(grade) <- rownames(phenotype)
  
  # Perform log2 transformation if the matrix is not already transformed
  qx <- as.numeric(quantile(matrix, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))
  LogC <- (qx[5] > 100) || (qx[6]-qx[1] > 50 && qx[2] > 0)
  if(LogC){
    matrix[which(matrix <= 0)] <- NaN
    matrix <- log2(matrix)
  }

  # Create a list to store the sample IDs for each group
  if(compare == "subtype"){
    grouplis <- list() 
    for(i in seq_along(groups)){
      name <- paste0(groups[[i]], collapse = "|")
      rownum <- grep(name, pheno$subtype)
      grouplis[[names(groups)[i]]] <- rownames(pheno)[rownum]
    }
  } else{
    grouplis <- list()
    for(i in seq_along(groups)){
      name <- paste0(groups[[i]], collapse = "|")
      rownum <- grep(name, grade$grade)
      grouplis[[names(groups)[i]]] <- rownames(grade)[rownum]
    }
  }
  
  # Rename the matrix file according to the group names specified by the use 
  for(i in seq_along(grouplis)){
    matname <- paste0(grouplis[[i]], collapse = "|")
    matcoln <- grep(matname, colnames(matrix))
    colnames(matrix)[matcoln] <- paste0(names(grouplis)[i], 1:length(grouplis[[i]]))
  } 
  
  # Keep only the samples that are renamed and order them
  matnames <- grep(paste0(names(grouplis), collapse = "|"), colnames(matrix))
  matrix <- matrix[, matnames]
  matrix <- matrix[, order(colnames(matrix))]
  
  # Create a factor variable with 0 and 1 based on the length of groups specified by user
  gname <- list()
  for(i in seq_along(grouplis)){
    number <- length(grep(names(grouplis)[i], colnames(matrix)))
    gname[[i]] <- factor(rep(i - 1, number))
  }
  grouping <- unlist(gname)
  
  # Create the design matrix for the linear model
  design <- model.matrix(~ 0 + grouping)
  colnames(design) <- names(grouplis)
  
  # Fit the linear model
  fit1 <- lmFit(matrix, design)
  l <- length(names(grouplis))
  cts <- paste0(names(grouplis)[l:1], collapse = "-")
  cont.matrix <- makeContrasts(contrasts = cts, levels = design)
  fit2 <- contrasts.fit(fit1, cont.matrix)
  fit2 <- eBayes(fit2)
  
  # Create a top table with all genes
  DEGs <- topTable(fit2, adjust = adjust, number = Inf)
  DEGs <- cbind(rownames(DEGs), DEGs)
  colnames(DEGs)[1] <- "ID" 
  
  # Prepare the annotation variable
  annotation <- annot[, c("Gene.symbol", "Chromosome.location", "GO.Function")]
  annotation <- cbind(rownames(annotation), annotation)
  colnames(annotation)[1] <- "ID" 
  
  # Ensure that the row names of DEGs and annotation are the same
  annotation <- annotation[order(annotation$ID), ]
  DEGs <- DEGs[order(DEGs$ID), ]
  
  # Add the annotation to the top table
  DEGs <- cbind(DEGs,  annotation[, c(2, 3, 4)])
  
  # Removing missing gene symbols
  genemissing <- which(is.na(DEGs$Gene.symbol) == TRUE)
  DEGs <- DEGs[-genemissing, ]
  
  # Write the top table to a TSV file
  write_tsv(DEGs, paste0(projname, "/", "DEGs.tsv"))
  
  message("+++++ Analysis has completed. +++++")
}



# Function for generating a volcano plot
Makevolcano <- function(projname,
                        padjlevel = 0.05,
                        uplogFC = 1,
                        downlogFC = -1 ,
                        ntop = 10) {

  # Read the top table from the TSV file
  table <- read_tsv(paste0(projname, "/", "DEGs.tsv"))
  table <- data.frame(table)
	   
  # Define color palette for the plot
  my_pal <- c("#1B9E77","#7570B3", "#E7298A", "#66A61E", "#E6AB02", "#A6761D", "#666666", "#9A7D0A")

  # Add a "DEGs" column based on the adjusted p-value and log fold change
  volcano <- table %>%
    mutate(AdjustedPvalue = -log10(adj.P.Val)) %>%
    mutate(DEGs = "Not") %>%
    mutate(DEGs = ifelse(AdjustedPvalue > -log10(padjlevel) & logFC > uplogFC, "Upregulated", DEGs)) %>%
    mutate(DEGs = ifelse(AdjustedPvalue > -log10(padjlevel) & logFC < downlogFC, "Downregulated", DEGs))

  # Create a ggplot object, add layers, and annotate the plot
  p <- ggplot(data = volcano, aes(x = logFC, y = AdjustedPvalue, color = DEGs, fill = DEGs, text = paste("ID: ", ID,
                                                                                                       "<br>Gene: ", Gene.symbol,
                                                                                                       "<br>Chromosome: ", Chromosome.location,
                                                                                                       "<br>GO function: ", GO.Function))) +
    labs(x= 'log2 (Fold Change)', y = "-log10(Adjusted P-value)") + 
    geom_point(size = 1, shape = 21) +
    scale_color_manual(values = c(my_pal)) +
    scale_fill_manual(values = c(paste(my_pal, "66" , sep = "" ))) +
    theme_classic() +
    theme(axis.text = element_text(family = "Times", size = 15 , colour = "black"),
          axis.text.x = element_text(family = "Times", colour = "black", size = 15),
          axis.text.y = element_text(family = "Times", colour = "black", size = 15),
          plot.subtitle = element_text(family = "Times", size = 20, colour = "black", hjust = 0.5),
          axis.title.y = element_text(family = "Times", size = rel(1.8), angle = 90),
          axis.title.x = element_text(family = "Times", size = rel(1.8), angle = 00)) +
    labs(subtitle = "Volcano plot")

  # Convert ggplot to plotly for interactivity
  p <- ggplotly(p, tooltip = "text")

  # Calculate the number of up-regulated and down-regulated genes
  up <- volcano[which(volcano$DEGs == "Upregulated"), ]
  up <- up[order(up$logFC, decreasing = T), ]
  down <- volcano[which(volcano$DEGs == "Downregulated"), ]
  down <- down[order(down$logFC, decreasing = F), ]

  # Creating the top table (up and down) based on the number specified (ntop)
  top <- rbind(up[1:ntop, ], down[1:ntop, ])
  top <- top[, c(1:6, 11, 7, 8, 9, 10, 12)]
  colnames(top)[7] <- "n.log10(adj.P.Val)"

  # Save the top table as a CSV file in the project folder
  write.csv(top, paste0(projname, "/", "top.csv"), quote = FALSE, row.names = FALSE)

  # Display a message with the number of up-regulated and down-regulated genes
  message(sprintf("Up-regulated genes: %s; Down-regulated genes: %s", nrow(up), nrow(down)))

# Return the interactive volcano plot
  return(p)
}




# Download GEO dataset with the accession number "GSE25055"
DownloadGEO ("GSE25055")

#============================ Q1 =========================#
# other adjust methods ---> "holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr", "none"

# (Lum A, Normal-like subtypes) VS (Basal-like, HER2, Lum B)
DEGanalysis(projname = "GSE25055",
            compare = "subtype",
            groups = list(normal = c("LumA", "Normal"), tumor = c("Basal", "Her2", "LumB")),
            adjust = "fdr")

# alpha = 0.01 - logFC 0
volcano1 <- Makevolcano(projname = "GSE25055",
                        padjlevel = 0.01,
                        uplogFC = 1.5,
                        downlogFC = -1.5,
                        ntop = 10)
volcano1

# alpha = 0.01 - logFC 1
volcano2 <- Makevolcano(projname = "GSE25055",
                        padjlevel = 0.00001,
                        uplogFC = 1,
                        downlogFC = -1)
volcano2

# alpha = 0.05 - logFC 0
volcano3 <- Makevolcano(projname = "GSE25055",
                        padjlevel = 0.05,
                        uplogFC = 0,
                        downlogFC = 0)
volcano3

# alpha = 0.05 - logFC 1
volcano4 <- Makevolcano(projname = "GSE25055",
                        padjlevel = 0.05,
                        uplogFC = 1,
                        downlogFC = -1
volcano4

#============================ Q2 =========================#
# Grade 1 vs Grade 3
DEGanalysis(projname = "GSE25055",
            compare = "grade",
            groups = list(normal = c("1"), tumor = c("3")),
            adjust = "fdr")

# alpha = 0.01 - logFC 0
volcano5 <- Makevolcano(projname = "GSE25055",
                        padjlevel = 0.01,
                        uplogFC = 0,
                        downlogFC = 0)
volcano5

# alpha = 0.01 - logFC 1.5
volcano6 <- Makevolcano(projname = "GSE25055",
                        padjlevel = 0.01,
                        uplogFC = 1.5,
                        downlogFC = -1.5)
volcano6

# alpha = 0.05 - logFC 0
volcano7 <- Makevolcano(projname = "GSE25055",
                        padjlevel = 0.05,
                        uplogFC = 0,
                        downlogFC = 0)
volcano7

# alpha = 0.01 - logFC 1
volcano8 <- Makevolcano(projname = "GSE25055",
                        padjlevel = 0.01,
                        uplogFC = 1,
                        downlogFC = -1)
volcano8