Before starting the analysis, we need to load the essential R libraries required for data manipulation, visualization, and performing differential gene expression (DEG) analysis.
# Load necessary libraries for RNA-seq analysis and visualization
library(GEOquery) # Load the GEOquery package to access GEO datasets
library(DESeq2) # For differential expression analysis
library(data.table) # Efficient data manipulation
library(umap) # Dimensionality reduction for visualization
library(ggplot2) # General-purpose plotting
library(clusterProfiler) # For functional enrichment analysis
library(org.Hs.eg.db) # Human genome annotations
library(VennDiagram) # Venn diagram visualization
library(knitr) # Report generation and table formatting (for kable)
library(tinytex) # LaTeX support for report generation
Code Block 2.1. Retrieving Data from GEO
# ===================================================================
# Step 1. Download Supplementary Files for the Count Matrix
# ===================================================================
# Define the GEO accession ID
gse_id <- "GSE157194"
# Download the supplementary files for this GEO dataset
getGEOSuppFiles(gse_id)
# Check the list of files downloaded
list.files(gse_id)
# ================================
# Path to the downloaded file
supp_file_path <- paste0(gse_id, "/", "GSE157194_Raw_gene_counts_matrix.txt.gz")
# Unzip and load the file into R as a data frame (counts matrix)
counts_matrix <- read.table(gzfile(supp_file_path), header = TRUE, row.names = 1)
# ===================================================================
# Step 2. Download Phenotype Data
# ===================================================================
# Specify the GEO accession number
gse_id <- "GSE157194"
# Download the phenotype data (GEO series matrix)
gse_data <- getGEO(gse_id, GSEMatrix = TRUE, AnnotGPL = TRUE)
# If multiple series are available, choose the first one
gse_data <- gse_data[[1]]
# Extract metadata from the GEO dataset
# We access the metadata (also known as phenotype data) stored in the 'phenoData' slot of the GEO object.
metadata <- gse_data@phenoData@data
# ===================================================================
# Step 3. Print Summary Information about the Count Matrix and Metadata
# ===================================================================
cat("\n\n# Summary Information of the Count Matrix and Metadata (Post-GEO Download)\n")
cat("=========================================================\n")
# Count matrix summary
cat("\nCount Matrix Summary:\n")
cat("---------------------------------------------------------\n")
cat("Number of Genes (Rows): ", nrow(counts_matrix), "\n")
cat("Number of Samples (Columns): ", ncol(counts_matrix), "\n\n")
# Metadata summary
cat("\nMetadata Summary:\n")
cat("---------------------------------------------------------\n")
cat("Number of Samples (Rows): ", nrow(metadata), "\n")
cat("Number of Features (Columns): ", ncol(metadata), "\n\n")
cat("\n=========================================================\n")
Code Block 2.2.1: Identifying and Removing Outlier Samples, and Visualizing the Impact
# ========================================================
# Load necessary libraries
# ========================================================
library(ggplot2)
library(gridExtra) # For arranging multiple plots in one figure
# ========================================================
# Step 1: Calculate Library Size and Identify Outliers
# ========================================================
# Calculate library sizes
library_sizes <- colSums(counts_matrix)
# Optionally, print summary to check for anomalies
summary(library_sizes)
# Identify potential outlier samples based on library size
# Step 1.1: Set thresholds for outlier identification
# Set the lower threshold to identify samples with low counts
lower_threshold <- quantile(library_sizes, 0.25) - 1.5 * IQR(library_sizes)
# Set the upper threshold to identify samples with very high counts
upper_threshold <- quantile(library_sizes, 0.75) + 1.5 * IQR(library_sizes)
# Step 1.2: Find outlier samples
# Uncomment the corresponding line depending on whether you want to use:
# - Only the lower threshold
# - Only the upper threshold
# - Both lower and upper thresholds
# Use only the lower threshold:
# outlier_samples <- colnames(counts_matrix)[library_sizes < lower_threshold]
# Use both lower and upper thresholds:
outlier_samples <- colnames(counts_matrix)[library_sizes < lower_threshold | library_sizes > upper_threshold]
# ========================================================
# Step 2: Store Original Dimensions Before Filtering (First part for Plotting Dimensions of Count Matrix and Metadata)
# ========================================================
# Store original dimensions before filtering
counts_matrix_before <- dim(counts_matrix) # [number of genes, number of samples]
metadata_before <- dim(metadata) # [number of samples, number of features]
# ========================================================
# Step 3: Remove Outlier Samples from Data
# ========================================================
# Step 3.1: Remove outlier samples from counts matrix
counts_matrix_filtered <- counts_matrix[, !(colnames(counts_matrix) %in% outlier_samples)]
# Step 3.2: Remove outlier samples from metadata
metadata_filtered <- metadata[colnames(counts_matrix_filtered), ]
# Step 3.3: Store dimensions after filtering (Second part for Plotting Dimensions of Count Matrix and Metadata)
counts_matrix_after <- dim(counts_matrix_filtered)
metadata_after <- dim(metadata_filtered)
# Step 3.4: Calculate filtered library sizes
filtered_library_sizes <- colSums(counts_matrix_filtered)
# ========================================================
# Step 4: Plot the Filtered Library Sizes (Box, Violin, Dot, Bar, Density plots)
# ========================================================
# Step 4.1: Prepare Data for Plotting:
# Combine original and filtered library sizes into a data frame for comparison
comparison_df <- data.frame(
Sample = c(names(library_sizes), names(filtered_library_sizes)),
Library_Size = c(library_sizes, filtered_library_sizes),
Group = c(rep("Original", length(library_sizes)), rep("Filtered", length(filtered_library_sizes)))
)
# Step 4.2: Boxplot of Library Sizes Before and After Filtering
box_plot <- ggplot(comparison_df, aes(x = Group, y = Library_Size, fill = Group)) +
geom_boxplot() +
theme_minimal() +
labs(title = "Boxplot of Library Sizes Before and After Removing Outliers",
x = "Group", y = "Total Counts per Sample") +
theme(legend.position = "none")
ggsave("Figures/2.2.1.Outlier_Samples/Figure2.2.1.3.Box_Plot_Library_Sizes.png", plot = box_plot, width = 8, height = 5, bg = "white")
# Step 4.3: Violin Plot of Library Sizes Before and After Filtering
violin_plot <- ggplot(comparison_df, aes(x = Group, y = Library_Size, fill = Group)) +
geom_violin(trim = FALSE, alpha = 0.6) +
theme_minimal() +
labs(title = "Violin Plot of Library Sizes Before and After Removing Outliers",
x = "Group", y = "Total Counts per Sample") +
theme(legend.position = "none")
ggsave("Figures/2.2.1.Outlier_Samples/Figure2.2.1.5.Violin_Plot_Library_Sizes.png", plot = violin_plot, width = 8, height = 5, bg = "white")
# Step 4.4: Dot Plot with Annotation of Outliers
dot_plot <- ggplot(comparison_df, aes(x = Sample, y = Library_Size, color = Group)) +
geom_point() +
theme_minimal() +
labs(title = "Dot Plot of Library Sizes with Outlier Annotation",
x = "Sample", y = "Total Counts per Sample") +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
geom_text(data = subset(comparison_df, Group == "Original" & Sample %in% outlier_samples),
aes(label = Sample), vjust = -1, size = 3, color = "red", angle = 5)
ggsave("Figures/2.2.1.Outlier_Samples/Figure2.2.1.1.Dot_Plot_Library_Sizes.png", plot = dot_plot, width = 10, height = 5, bg = "white")
# Step 4.5: Comparative Bar Plot Before and After Filtering
bar_plot <- ggplot(comparison_df, aes(x = Sample, y = Library_Size, fill = Group)) +
geom_bar(stat = "identity", position = "dodge") +
theme_minimal() +
labs(title = "Comparative Bar Plot of Library Sizes Before and After Removing Outliers",
x = "Sample", y = "Total Counts per Sample") +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
ggsave("Figures/2.2.1.Outlier_Samples/Figure2.2.1.2.Bar_Plot_Library_Sizes.png", plot = bar_plot, width = 10, height = 5, bg = "white")
# Step 4.6: Side-by-Side Density Plot Before and After Filtering
density_plot <- ggplot(comparison_df, aes(x = Library_Size, fill = Group)) +
geom_density(alpha = 0.5) +
theme_minimal() +
labs(title = "Density Plot of Library Sizes Before and After Removing Outliers",
x = "Total Counts per Sample", y = "Density") +
scale_x_log10() +
theme(legend.position = "top")
ggsave("Figures/2.2.1.Outlier_Samples/Figure2.2.1.4.Density_Plot_Library_Sizes.png", plot = density_plot, width = 8, height = 5, bg = "white")
# ========================================================
# Step 5: Plot Dimensions of Count Matrix and Metadata (Third part for Plotting Dimensions of Count Matrix and Metadata)
# ========================================================
# Step 5.1: Create Data Frames for Count Matrix Dimensions
counts_matrix_dims <- data.frame(
Measure = rep(c("Samples", "Genes"), each = 2),
Stage = rep(c("Before Filtering", "After Filtering"), 2),
Count = c(counts_matrix_before[2], counts_matrix_after[2],
counts_matrix_before[1], counts_matrix_after[1])
)
# Step 5.2: Create Data Frames for Count Matrix and Metadata Dimensions
metadata_dims <- data.frame(
Measure = rep(c("Samples", "Features"), each = 2),
Stage = rep(c("Before Filtering", "After Filtering"), 2),
Count = c(metadata_before[1], metadata_after[1],
metadata_before[2], metadata_after[2])
)
# Step 5.3: Generate and Combine Plots for Dimensions
# Plot for count matrix samples before and after filtering
count_samples_plot <- ggplot(subset(counts_matrix_dims, Measure == "Samples"), aes(x = Stage, y = Count, fill = Stage)) +
geom_bar(stat = "identity") +
geom_text(aes(label = Count), vjust = -0.5) + # Add count labels on top of bars
theme_minimal() +
labs(title = "Count Matrix: Number of Samples (Columns)", y = "Samples", x = "") +
theme(legend.position = "none") +
scale_fill_manual(values = c("Before Filtering" = "#4682B4", "After Filtering" = "#8A2BE2")) # Change colors
# Plot for count matrix genes (transcripts) before and after filtering
count_genes_plot <- ggplot(subset(counts_matrix_dims, Measure == "Genes"), aes(x = Stage, y = Count, fill = Stage)) +
geom_bar(stat = "identity") +
geom_text(aes(label = Count), vjust = -0.5) + # Add count labels on top of bars
theme_minimal() +
labs(title = "Count Matrix: Number of Genes (Rows)", y = "Genes (Transcripts)", x = "") +
theme(legend.position = "none") +
scale_fill_manual(values = c("Before Filtering" = "#4682B4", "After Filtering" = "#8A2BE2")) # Change colors
# Plot for metadata samples before and after filtering
metadata_samples_plot <- ggplot(subset(metadata_dims, Measure == "Samples"), aes(x = Stage, y = Count, fill = Stage)) +
geom_bar(stat = "identity") +
geom_text(aes(label = Count), vjust = -0.5) + # Add count labels on top of bars
theme_minimal() +
labs(title = "Metadata: Number of Samples (Rows)", y = "Count", x = "") +
theme(legend.position = "none") +
scale_fill_manual(values = c("Before Filtering" = "#32CD32", "After Filtering" = "#FFD700")) # Change colors
# Plot for metadata features before and after filtering
metadata_features_plot <- ggplot(subset(metadata_dims, Measure == "Features"), aes(x = Stage, y = Count, fill = Stage)) +
geom_bar(stat = "identity") +
geom_text(aes(label = Count), vjust = -0.5) + # Add count labels on top of bars
theme_minimal() +
labs(title = "Metadata: Number of Features (Columns)", y = "Sample Attributes", x = "") +
theme(legend.position = "none") +
scale_fill_manual(values = c("Before Filtering" = "#32CD32", "After Filtering" = "#FFD700")) # Change colors
# Arrange the four plots in a grid
combined_plot <- grid.arrange(count_samples_plot, count_genes_plot,
metadata_samples_plot, metadata_features_plot,
ncol = 2)
# Save the combined plot
ggsave("Figures/2.2.1.Outlier_Samples/Figure2.2.1.6.Dimensions_Comparison.png", plot = combined_plot, width = 12, height = 10, bg = "white")
# ========================================================
# Step 6: Reassign Filtered Data
# ========================================================
# Reassign filtered versions back to counts_matrix and metadata
counts_matrix <- counts_matrix_filtered
metadata <- metadata_filtered
# ======================================
# Step 7. Print Summary Information about the Count Matrix and Metadata
# ======================================
cat("\n\n# Summary Information of the Count Matrix and Metadata (Post-Outlier Removal)\n")
cat("=========================================================\n")
# Count matrix summary
cat("\nCount Matrix Summary:\n")
cat("---------------------------------------------------------\n")
cat("Number of Genes (Rows): ", nrow(counts_matrix), "\n")
cat("Number of Samples (Columns): ", ncol(counts_matrix), "\n\n")
# Metadata summary
cat("\nMetadata Summary:\n")
cat("---------------------------------------------------------\n")
cat("Number of Samples (Rows): ", nrow(metadata), "\n")
cat("Number of Features (Columns): ", ncol(metadata), "\n\n")
cat("\n=========================================================\n")
ncol(counts_matrix) / 2
). This step ensures that the filtered dataset retains genes with meaningful levels of expression across a significant number of samples. After identifying the genes to keep, we filter the counts_matrix
to retain only those genes, reassign the filtered matrix back to counts_matrix
(Step 2), and then check the dimensions of the filtered dataset (Step 3) to confirm the number of genes and samples that remain (Figure 2.2.2).Code Block 2.2.2: Pre-filtering Low-Count Genes to Improve Data Quality
# ========================================================
# Step 1: Pre-filter low-count genes
# ========================================================
# Step 1: Create a logical matrix to check if each value in counts_matrix is >= 10 (sufficient expression).
# Step 2: Retain genes that have counts >= 10 in at least half of the samples (ncol(counts_matrix) / 2).
keep_genes <- rowSums(counts_matrix >= 10) >= (ncol(counts_matrix) / 2)
# Filter the counts matrix to retain only the genes that meet the filtering criteria
filtered_counts_matrix <- counts_matrix[keep_genes, ]
# ========================================================
# Step 2: Reassign filtered matrix to counts_matrix
# ========================================================
# Assign the filtered counts matrix back to counts_matrix for consistency in later steps
counts_matrix <- filtered_counts_matrix
# ======================================
# Step 3. Check dimensions of the filtered matrix (Summary Information after Low-Count Filtering)
# ======================================
cat("\n\n# Summary Information of the Count Matrix and Metadata (Post-Low-Count Filtering)\n")
cat("=========================================================\n")
# Count matrix summary
cat("\nCount Matrix Summary:\n")
cat("---------------------------------------------------------\n")
cat("Number of Genes (Rows): ", nrow(counts_matrix), "\n")
cat("Number of Samples (Columns): ", ncol(counts_matrix), "\n\n")
# Metadata summary
cat("\nMetadata Summary:\n")
cat("---------------------------------------------------------\n")
cat("Number of Samples (Rows): ", nrow(metadata), "\n")
cat("Number of Features (Columns): ", ncol(metadata), "\n\n")
cat("\n=========================================================\n")
Code Block 3.1.1. Metadata Exploration
# View all column names in the metadata to select relevant columns for analysis
# This step helps us to explore the structure of the metadata and understand which columns are available.
colnames(metadata)
# Select relevant columns for analysis based on the experiment
selected_metadata <- metadata[, c("geo_accession", "title", "source_name_ch1", "characteristics_ch1.2", "characteristics_ch1.3", "timepoint:ch1", "therapy:ch1", "patient id:ch1", "platform_id", "data_processing")]
# Open the selected metadata in a viewer for further exploration
View(selected_metadata)
Code Block 3.1.2.1: Sample Distribution Table Generation
# Table of samples by patient group (e.g., lesional vs non-lesional)
group_table <- table(metadata$source_name_ch1)
kable(group_table, format = "html", col.names = c("Patient Group", "Count"))
# Table of samples by timepoints (e.g., baseline vs post-treatment)
timepoint_table <- table(metadata$characteristics_ch1.2)
kable(timepoint_table, format = "html", col.names = c("Timepoint", "Count"))
Code Block 3.1.2.2. Assessing Sample Characteristics for Experimental Design
# ======================
# List of all relevant columns
# These columns are selected based on the characteristics of the experiment,
# including metadata features like source names, patient IDs, and data processing details.
columns <- c("source_name_ch1", # Source of the sample
"characteristics_ch1.2", # Experimental characteristic 1
"characteristics_ch1.3", # Experimental characteristic 2
"therapy:ch1", # Therapy information
"patient id:ch1", # Patient ID
"platform_id", # Platform used for data generation
"extract_protocol_ch1", # Sample extraction protocol 1
"extract_protocol_ch1.1", # Sample extraction protocol 2
"data_processing", # Data processing detail 1
"data_processing.1", # Data processing detail 2
"data_processing.2", # Data processing detail 3
"data_processing.3", # Data processing detail 4
"data_processing.4", # Data processing detail 5
"data_processing.5", # Data processing detail 6
"data_processing.6" # Data processing detail 7
)
# ==========
# Loop through each column and generate a summary table for each
# The loop checks if each column exists in the 'metadata' dataframe and prints a table of its unique values and frequencies.
for (col in columns) {
if (col %in% colnames(metadata)) { # Ensure the column exists in the metadata dataframe
group_table <- table(metadata[[col]]) # Access and count the occurrences of each unique value in the column
print(kable(group_table)) # Display the table using kable for a nicely formatted output
} else {
print(paste("Column", col, "does not exist in the metadata")) # Handle cases where the column doesn't exist
}
}
source_name_ch1
, to display the sample distribution by experimental condition (Figure 3.1.3.1). However, the variable can be easily substituted with others if needed. (Step 1): The ggplot2
library is loaded for creating the plot. (Step 2): A bar plot is constructed using geom_bar()
to create the bars and geom_text()
to add count labels, where ..count..
refers to the computed count statistic. The plot is styled using theme_minimal()
and color is applied with scale_fill_brewer()
. (Step 3): Finally, the ggsave()
function saves the plot to a specified location.
Code Block 3.1.3.1. Visualizing Sample Distribution by Experimental Condition
# ====================================
# Step 1: Load the necessary library
# ====================================
library(ggplot2)
# ============================================
# Step 2: Create a bar plot for sample counts
# ============================================
p <- ggplot(metadata, aes(x = `source_name_ch1`, fill = `source_name_ch1`)) +
geom_bar() + # Add bars representing sample counts
geom_text(stat = "count", aes(label = ..count..), vjust = -0.5, size = 3) + # Add count labels above bars
theme_minimal() + # Apply a minimal theme for better readability
labs(title = "Sample Distribution by Experimental Condition",
x = "Source Name",
y = "Number of Samples") +
theme(axis.text.x = element_text(angle = 60, vjust = 0.5, hjust = 0.6, size = 8)) + # Adjust axis label size and angle
scale_fill_brewer(palette = "Set3") # Apply a color palette for the bars
# The variable "source_name_ch1" in this code can be manually replaced by other variables
# such as "characteristics_ch1.2", "characteristics_ch1.3", or "patient id:ch1" to visualize their distributions.
# ==================================
# Step 3: Save the plot to a file
# ==================================
ggsave("Figures/3.1.3.1.Bar_Chart_Metadata_Variables/3.1.3.1.Sample_Distribution_by_Condition.png",
plot = p, width = 10, height = 6, dpi = 300, bg = "white")
Code Block 3.1.3.2. Visualizing Sample Distribution for Multiple Metadata Variables Using Automated Bar Charts
# =========================== Step 1: Install and Load Necessary Packages =========================== #
# Load the required packages for plotting
library(ggplot2) # For creating the bar plots
library(viridis) # For using a color palette that supports many unique colors
# =========================== Step 2: Define Columns to Visualize =========================== #
# List of columns to visualize (these correspond to the metadata columns you want to plot)
columns_to_plot <- c("source_name_ch1", "characteristics_ch1.2", "characteristics_ch1.3", "patient id:ch1")
# =========================== Step 3: Loop Through Each Column and Create Plots =========================== #
# Loop through each column and create a bar plot for sample distribution
for (col in columns_to_plot) {
# --------------------- Convert Column Name for Tidy Evaluation --------------------- #
# Convert the column name (a string) to a symbol that ggplot can interpret within aes()
col_sym <- sym(col)
# --------------------- Sanitize Column Name for Saving Files --------------------- #
# Replace ":" with "_" to create a valid filename (since ":" is not allowed in filenames)
sanitized_col <- gsub("[:]", "_", col)
# =========================== Step 4: Create Bar Plot =========================== #
# Create the bar plot for the current column
p <- ggplot(metadata, aes(x = !!col_sym, fill = !!col_sym)) + # Fill colors based on the column
geom_bar() + # Create the bar chart
geom_text(stat = "count", aes(label = ..count..), vjust = -0.5, size = 3) + # Add text labels above bars to show the count
theme_minimal() + # Use a minimal theme for a clean appearance
labs(title = paste("Sample Distribution by", col), x = col, y = "Number of Samples") + # Add title and axis labels
theme(axis.text.x = element_text(angle = 60, vjust = 0.5, hjust = 0.6, size = 8)) + # Rotate x-axis labels and adjust their size
scale_fill_viridis_d(option = "C") # Use the "viridis" color palette to ensure enough distinct colors for each category
# --------------------- Display the Plot --------------------- #
# Print the plot so it displays in the R console or notebook
print(p)
# =========================== Step 5: Save the Plot as PNG =========================== #
# Save the plot to a file in the specified directory with a relevant name
ggsave(filename = paste0("Figures/3.1.3.2.Bar_Chart_Metadata_Variables/Figure3.1.3.2_", sanitized_col, "_Sample_Distribution.png"),
plot = p, width = 10, height = 6, dpi = 300, bg = "white")
}
Code Block 3.2.1. Viewing the count matrix to inspect the gene expression data for each sample.
# View the first few rows of the count matrix
# This allows us to inspect the raw gene expression data for each sample.
View(counts_matrix)
Code Block 3.2.2.1. Boxplot of Raw Counts (Uncolored)
# ============================
# Step 1: Load necessary library
# ============================
library(ggplot2)
# ============================
# Step 2: Prepare Data for Plotting
# ============================
# Convert counts_matrix to a data frame and log-transform it
df <- as.data.frame(log2(counts_matrix + 1)) # Log2 transform the counts matrix
# Add a sample column (rownames will become a new column for plotting)
df$Sample <- rownames(df)
# Reshape the data for ggplot2 (long format)
df_long <- reshape2::melt(df, id.vars = "Sample")
# ============================
# Step 3: Create Boxplot
# ============================
p <- ggplot(df_long, aes(x = variable, y = value)) +
geom_boxplot() + # Create boxplot
theme_minimal() + # Apply a minimal theme for better readability
labs(title = "Log2 Raw Counts Across Samples", x = "Samples", y = "Log2 Counts") +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 4),
plot.background = element_rect(fill = "white", colour = NA), # Set white background
panel.background = element_rect(fill = "white", colour = NA)) # Set white background for the panel
# ============================
# Step 4: Save the Plot 3.2.2.1.Uncolored_Boxplot_Raw_Counts
# ============================
ggsave("Figures/3.2.2.Boxplot_Raw_Counts/3.2.2.1.Uncolored_Boxplot_Raw_Counts.png"",
plot = p, width = 13, height = 4)
Code Block 3.2.2.2. Boxplot of Raw Counts Colored by Experimental Condition
# ============================
# Load necessary libraries
# ============================
library(ggplot2)
library(reshape2)
# ============================
# Step 1: Align Sample IDs and Metadata
# ============================
# Set the row names of metadata to the sample names (stored in metadata$title)
rownames(metadata) <- metadata$title
# Extract the sample IDs from counts_matrix (columns represent samples)
sample_ids <- colnames(counts_matrix)
# Filter metadata to contain only rows that match the sample IDs in the count matrix
metadata_filtered <- metadata[metadata$title %in% sample_ids, ]
# Reorder metadata to match the order of sample IDs in the count matrix
metadata_filtered <- metadata_filtered[match(sample_ids, rownames(metadata_filtered)), ]
# ============================
# Step 2: Prepare Data for Plotting
# ============================
# Transpose the counts matrix to make samples the rows
counts_matrix_t <- t(counts_matrix)
# Convert the transposed counts matrix to a data frame and log-transform the counts
# Add 1 to avoid log(0), which is undefined
df <- as.data.frame(log2(counts_matrix_t + 1))
# Add a sample column (rownames will become a new column for plotting)
df$Sample <- rownames(df)
# Add the group information from metadata (e.g., source_name_ch1)
df$group <- metadata_filtered$source_name_ch1[match(df$Sample, rownames(metadata_filtered))]
# Ensure `group` is a factor for proper grouping in the plot
df$group <- as.factor(df$group)
# ============================
# Step 3: Reshape the Data for ggplot2 (Long Format)
# ============================
# Melt the data frame to create a long format for ggplot2
df_long <- melt(df, id.vars = c("Sample", "group"))
# ============================
# Step 4: Define Colors for Each Unique Group and Add Sample Counts to the Labels
# ============================
# Calculate the number of samples for each group
group_counts <- table(df$group)
# Calculate the total number of samples
total_samples <- sum(group_counts)
# Create the color palette for the unique groups
unique_groups <- unique(df$group)
color_palette <- c("#FF6347", "#4682B4", "#32CD32", "#FFD700", "#8A2BE2", "#00CED1")
# Ensure the color palette has enough colors for all groups
group_colors <- setNames(color_palette[1:length(unique_groups)], unique_groups)
# ============================
# Step 5: Create the Plot with Updated Group Labels
# ============================
p <- ggplot(df_long, aes(x = factor(Sample, levels = sample_ids), y = value, fill = group)) +
geom_boxplot() +
theme_minimal() +
labs(title = "Log2 Raw Counts Across Samples by Source Name", x = "Samples", y = "Log2 Counts",
fill = paste0("group (Total n=", total_samples, ")")) + # Add total number to the legend title
theme(axis.text.x = element_text(angle = 30, hjust = 1, size = 4), # Rotate and adjust x-axis labels
plot.background = element_rect(fill = "white", colour = NA), # Set white background
panel.background = element_rect(fill = "white", colour = NA)) + # Set white background for the panel
scale_fill_manual(values = group_colors, labels = paste0(levels(df$group), " (n=", group_counts[levels(df$group)], ")")) # Use the custom color vector and add counts in the legend
# ============================
# Step 6: Save the Plot
# ============================
ggsave("Figures/3.2.2.Boxplot_Raw_Counts/3.2.2.2.Colored_Boxplot_Raw_Counts.png",
plot = p, width = 20, height = 6)
hist()
function. (Step 4): To enhance the visualization, the gene count values are added as labels above each bar. Finally, (Step 5): The PNG device is closed, and the plot is saved.
Code Block 3.2.3. Histogram of Total Gene Counts Per Sample
# ============================
# Step 1: Define Output File Path
# ============================
# Define the file path where you want to save the image
output_file <- "C:/Users/omidm/OneDrive/Desktop/RNA_Seq_Task1/3/3_Count_Matrix_Exploration/3.2.3.a.histogram_Of_total_GeneCounts_Per_Sample_Before_Removing_LowCountGenes_AND_Outlier_Samples.png"
# ============================
# Step 2: Open PNG Device
# ============================
# Open the PNG device to save the plot
png(filename = output_file, width = 1200, height = 1000)
# ============================
# Step 3: Create Histogram of Log-Transformed Gene Counts
# ============================
# Calculate log10 of total counts per gene
log_gene_counts <- log10(rowSums(counts_matrix))
# Create the histogram and get the counts for each bin
hist_data <- hist(log_gene_counts,
breaks = 50, # Specify number of bins in the histogram
main = "Gene Count Distribution", # Title of the plot
xlab = "Log10 Counts", # Label for the x-axis
col = "steelblue") # Add color to the bars
# Update the y-axis limit to make room for labels
ylim <- c(0, max(hist_data$counts) * 1.1)
plot(hist_data, col = "steelblue", main = "Gene Count Distribution", xlab = "Log10 Counts", ylim = ylim) # cex: to change the font size of the labels on the bars
# ============================
# Step 4: Add Numbers Above Bars
# ============================
# Add the text labels above the histogram bars
text(hist_data$mids, hist_data$counts, labels = hist_data$counts, pos = 3, cex = 1, srt = 45)
# ============================
# Step 5: Close the PNG Device
# ============================
# Close the PNG device
dev.off()
matrixStats
and ggplot2
are loaded for variance calculations and plotting. (Step 2): The gene expression data in counts_matrix
is log2-transformed to reduce its dynamic range, making it easier to interpret. (Step 3): Using the rowVars()
function, the variance of each gene across all samples is calculated, identifying highly variable genes for further analysis. (Step 4): The cumulative variance is computed by sorting these variances and calculating their cumulative sum, which helps assess how much variability is captured by the top genes. (Step 5): Finally, a line plot is generated (Figure 3.2.4) using ggplot2
to visualize the cumulative variance explained by an increasing number of genes, aiding in the selection of the optimal number of genes for further analysis.
Code Block 3.2.4. Cumulative Variance Plot to Determine Optimal Gene Count for Heatmap
# =========== Step 1: Load Required Libraries =============
library(matrixStats) # Or DelayedMatrixStats for variance calculations
library(ggplot2) # For enhanced plotting capabilities
# =========== Step 2: Log2 Transformation =============
# Convert counts_matrix to a matrix and apply log2 transformation
# Log transformation reduces the dynamic range of the data, making it easier to interpret
counts_matrix_log <- log2(as.matrix(counts_matrix) + 1)
# =========== Step 3: Calculate Gene Variances =============
# Use the rowVars() function to calculate the variance for each gene across samples
# Gene variance is used to identify the most variable genes for further analysis
gene_variances <- rowVars(counts_matrix_log)
# =========== Step 4: Calculate Cumulative Variance =============
# Sort the variances in descending order and calculate the cumulative sum
cum_variance <- cumsum(sort(gene_variances, decreasing = TRUE)) / sum(gene_variances)
# =========== Step 5: Plot Cumulative Variance using ggplot2 for enhanced plotting =============
# Plot the cumulative proportion of variance explained as the number of genes increases
# This plot will help visualize how many top genes are needed to explain a high proportion of variance
ggplot(data.frame(Genes = 1:length(cum_variance), CumulativeVariance = cum_variance), aes(x = Genes, y = CumulativeVariance)) +
geom_line() +
labs(title = "Cumulative Variance Explained by Top Genes",
x = "Number of Genes",
y = "Cumulative Proportion of Variance Explained") +
theme_minimal()
Code Block 3.2.5.1. Unsupervised Heatmap for All Samples (160 samples)
# ============ Load Required Libraries ============
# Load the required libraries for matrix operations and heatmap generation
library(matrixStats) # Ensure matrixStats is loaded for row variance calculation
library(pheatmap) # For generating heatmaps
# ============ Step 1: Convert counts_matrix to Matrix ============
# Convert counts_matrix to a matrix if it is currently a data frame
counts_matrix <- as.matrix(counts_matrix) # Ensures compatibility with downstream functions
# ============ Step 2: Log2 Transformation of Counts ============
# Apply a log2 transformation to reduce the dynamic range of the count data
counts_matrix_log <- log2(counts_matrix + 1) # Adding 1 to avoid log(0) issues
# ============ Step 3: Identify Top Variable Genes ============
# Recalculate the most variable genes based on the log-transformed counts
# We select the top 300 most variable genes using row variance
top_var_genes_log <- head(order(rowVars(counts_matrix_log), decreasing = TRUE), 300)
# ============ Step 4: Generate Heatmap and Save ============
# Create the heatmap for all 166 samples based on the top 300 most variable genes
# Adjust clustering and visual parameters such as label angles and font sizes
pheatmap(counts_matrix_log[top_var_genes_log, ],
cluster_rows = TRUE, cluster_cols = TRUE,
show_rownames = FALSE, show_colnames = TRUE,
angle_col = 90, # Rotate x-axis labels for better readability
fontsize_col = 4, # Reduce font size for the column labels
filename = "Figures/3.2.5.Exploratory_Heatmaps/3.2.5.1.Unsupervised_Heatmap_160_Samples.png") # Save the heatmap as an image
matrixStats
and pheatmap
libraries are loaded for matrix operations and heatmap generation. (Step 1): The metadata is subset to include only the baseline lesional (AL_m0) and non-lesional (AN_m0) samples by filtering based on metadata$title
. These subsets are combined into one dataset for comparison. (Step 2): The counts_matrix
is filtered to include only the lesional and non-lesional samples based on the sample names in metadata$title
. (Step 3): A log2 transformation is applied to the filtered count matrix using log2()
to reduce the dynamic range. (Step 4): The top 300 most variable genes are identified based on variance using rowVars()
. (Step 5): A heatmap is generated (Figure 3.2.5.2) using pheatmap
, with rows and columns clustered to visualize the relationship between samples and gene expression. Adjustments are made for label angles and font size, and the heatmap is saved as an image file.
Code Block 3.2.5.2. Unsupervised Heatmap for Baseline Lesional and Non-lesional Samples (109 samples)
# ============ Load Required Libraries ============
# Load the required libraries for matrix operations and heatmap generation
library(matrixStats) # Ensure matrixStats is loaded for row variance calculation
library(pheatmap) # For generating heatmaps
# ============ Step 1: Subset Metadata for Lesional and Non-lesional Samples ============
# Subset the metadata to include only the baseline lesional (AL_m0) and non-lesional (AN_m0) samples
lesional_samples <- metadata[grepl("AL_m0", metadata$title), ] # Filter metadata for lesional samples
non_lesional_samples <- metadata[grepl("AN_m0", metadata$title), ] # Filter metadata for non-lesional samples
# Combine lesional and non-lesional samples into one dataset for comparison
selected_samples <- rbind(lesional_samples, non_lesional_samples)
# ============ Step 2: Subset the Count Matrix ============
# Subset the count matrix to include only the lesional and non-lesional samples
# Assuming the sample names (columns) in counts_matrix match those in metadata$title
selected_sample_names <- selected_samples$title
counts_matrix_filtered <- counts_matrix[, selected_sample_names] # Filter the count matrix for the selected samples
# ============ Step 3: Log2 Transformation of Filtered Counts ============
# Apply a log2 transformation to the filtered count matrix to reduce the dynamic range
counts_matrix_log <- log2(as.matrix(counts_matrix_filtered) + 1) # Adding 1 to avoid log(0) issues
# ============ Step 4: Identify Top Variable Genes ============
# Recalculate the most variable genes based on the log-transformed counts
# Select the top 300 most variable genes
top_var_genes_log <- head(order(rowVars(counts_matrix_log), decreasing = TRUE), 300)
# ============ Step 5: Generate Heatmap and Save ============
# Create a heatmap for the lesional and non-lesional samples based on the top 300 most variable genes
# Adjust clustering and visual parameters such as label angles and font sizes
pheatmap(counts_matrix_log[top_var_genes_log, ],
cluster_rows = TRUE, cluster_cols = TRUE,
show_rownames = FALSE, show_colnames = TRUE,
angle_col = 90, # Rotate x-axis labels for better readability
fontsize_col = 4, # Reduce font size for the column labels
filename = "Figures/3.2.5.Exploratory_Heatmaps/3.2.5.2.Unsupervised_Heatmap_Baseline_Samples.png") # Save the heatmap as an image
Code Block 3.2.5.3. Grouped Heatmap for All Groups
# =========== Load Required Libraries ===========
library(matrixStats)
library(pheatmap)
# =========== Step 1: Apply Log2 Transformation ===========
counts_matrix_log <- log2(as.matrix(counts_matrix) + 1)
# =========== Step 2: Extract Sample Grouping Information ===========
sample_groups <- metadata$source_name_ch1
# =========== Step 3: Define Descriptive Labels ===========
new_labels <- c(
"m0_AN" = "Baseline Non-lesional",
"cyclosporine_m3_AN" = "Cyclosporine Non-lesional",
"dupilumab_m3_AN" = "Dupilumab Non-lesional",
"m0_AL" = "Baseline Lesional",
"cyclosporine_m3_AL" = "Cyclosporine Lesional",
"dupilumab_m3_AL" = "Dupilumab Lesional"
)
# =================================================================================
# =========== Step 3.1: Select Groups for Analysis ===========
selected_groups <- c("m0_AN", "cyclosporine_m3_AN", "dupilumab_m3_AN", "m0_AL", "cyclosporine_m3_AL", "dupilumab_m3_AL")
# ===== we can replace the above selected_groups with one of the bellow =====
# 1. Heatmap for all groups:
# c("m0_AN", "cyclosporine_m3_AN", "dupilumab_m3_AN", "m0_AL", "cyclosporine_m3_AL", "dupilumab_m3_AL")
# 2. (Lesional vs. Non-lesional) Compare "Baseline Lesional" with "Baseline Non-lesional":
# c("m0_AN", "m0_AL")
# 3. (Lesional vs. Non-lesional) Compare Cyclosporine Lesional with Cyclosporine Non-lesional:
# c("cyclosporine_m3_AN", "cyclosporine_m3_AL")
# 4. (Lesional vs. Non-lesional) Compare Dupilumab Lesional with Dupilumab Non-lesional:
# c("dupilumab_m3_AN", "dupilumab_m3_AL")
# 5. (Therapy Comparisons) Compare Baseline Lesional, Cyclosporine Lesional, and Dupilumab Lesional (compared to Baseline Non-Lesional).
# c("m0_AN", "m0_AL", "cyclosporine_m3_AL", "dupilumab_m3_AL")
# 6. (Therapy Comparisons) Compare Dupilumab Lesional and Cyclosporine Lesional.
# c("dupilumab_m3_AL", "cyclosporine_m3_AL")
# 7. (Pre-treatment vs. Post-treatment) Compare Baseline Lesional & Cyclosporine Lesional.
# c("m0_AL", "cyclosporine_m3_AL")
# 8. (Pre-treatment vs. Post-treatment) Compare Baseline Lesional & Dupilumab Lesional.
# c("m0_AL", "dupilumab_m3_AL")
# =================================================================================
# =========== Step 4: Reorder Columns Based on Group Selection ===========
desired_order <- selected_groups
reordered_columns <- match(desired_order, sample_groups)
counts_matrix_log <- counts_matrix_log[, reordered_columns]
colnames(counts_matrix_log) <- new_labels[sample_groups[reordered_columns]]
# =========== Step 5: Calculate Top Variable Genes ===========
top_var_genes_log <- head(order(rowVars(counts_matrix_log), decreasing = TRUE), 300)
# =========== Step 6: Generate the Heatmap ===========
pheatmap(counts_matrix_log[top_var_genes_log, ],
cluster_rows = TRUE, cluster_cols = TRUE,
show_rownames = FALSE, show_colnames = TRUE,
angle = 45, vjust = 0.5, hjust = 0.6,
fontsize_col = 8,
filename = "Figures/3.2.5.Exploratory_Heatmaps/3.2.5.3.Grouped_Heatmap_All_Groups.png")
Code Block 3.2.5.4. Grouped Heatmap: Compare Baseline Lesional with Baseline Non-lesional
# ============================
selected_groups <- c("m0_AN", "m0_AL")
# ============================
Code Block 3.2.5.5. Grouped Heatmap: Compare Cyclosporine Lesional vs. Non-lesional
# ============================
selected_groups <- c("cyclosporine_m3_AN", "cyclosporine_m3_AL")
# ============================
Code Block 3.2.5.6. Grouped Heatmap: Compare Dupilumab Lesional vs. Dupilumab Non-lesional
# ============================
selected_groups <- c("dupilumab_m3_AN", "dupilumab_m3_AL")
# ============================
Code Block 3.2.5.7. Grouped Heatmap: Compare Baseline Non-Lesional vs. Baseline Lesional, Cyclosporine Lesional, and Dupilumab Lesional
# ============================
selected_groups <- c("m0_AN", "m0_AL", "cyclosporine_m3_AL", "dupilumab_m3_AL")
# ============================
Code Block 3.2.5.8. Grouped Heatmap: Compare Dupilumab Lesional vs. Cyclosporine Lesional
# ============================
selected_groups <- c("dupilumab_m3_AL", "cyclosporine_m3_AL")
# ============================
Code Block 3.2.5.9. Grouped Heatmap: Compare Pre-treatment vs. Post-treatment (Baseline Lesional & Cyclosporine Lesional)
# ============================
selected_groups <- c("m0_AL", "cyclosporine_m3_AL")
# ============================
Code Block 3.2.5.10. Grouped Heatmap: Compare Pre-treatment vs. Post-treatment (Baseline Lesional & Dupilumab Lesional)
# ============================
selected_groups <- c("m0_AL", "dupilumab_m3_AL")
# ============================
We begin by loading the essential R libraries needed for data manipulation, DEG analysis, and visualization.
# Load Required Libraries
library(DESeq2)
library(data.table)
library(umap)
library(ggplot2)
library(clusterProfiler)
library(org.Hs.eg.db)
library(VennDiagram)
library(knitr)
library(tinytex)
This step involves downloading the count data and gene annotations from GEO. We convert these into suitable formats for further analysis.
# Load counts table from GEO
urld <- "https://www.ncbi.nlm.nih.gov/geo/download/?format=file&type=rnaseq_counts"
path <- paste(urld, "acc=GSE157194", "file=GSE157194_raw_counts_GRCh38.p13_NCBI.tsv.gz", sep="&")
tbl <- as.matrix(data.table::fread(path, header=T, colClasses="integer"), rownames="GeneID")
# Load gene annotations
apath <- paste(urld, "type=rnaseq_counts", "file=Human.GRCh38.p13.annot.tsv.gz", sep="&")
annot <- data.table::fread(apath, header=T, quote="", stringsAsFactors=F, data.table=F)
rownames(annot) <- annot$GeneID
This step filters the samples and assigns them to the Lesional or Non-Lesional groups based on predefined group labels.
# Sample selection
gsms <- paste0("01101001010101011010011001011001011001101001011010",
"01100101011010101001011001010101010010101010101001",
"01100100110XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
"XXXXXXXXXXXXXXXX")
sml <- strsplit(gsms, split="")[[1]]
# Filter out excluded samples (marked as "X")
sel <- which(sml != "X")
sml <- sml[sel]
tbl <- tbl[ ,sel]
# Group membership for samples
gs <- factor(sml)
groups <- make.names(c("Lesional","Non-Lesional"))
levels(gs) <- groups
sample_info <- data.frame(Group = gs, row.names = colnames(tbl))
To improve analysis accuracy, we filter out genes with low counts across samples.
# Pre-filter low count genes
keep <- rowSums(tbl >= 10) >= min(table(gs))
tbl <- tbl[keep, ]
Before proceeding with differential expression analysis, we perform a PCA to explore the variance in the dataset and ensure that the samples group as expected.
# PCA Plot
vsd <- vst(ds, blind=FALSE)
pcaData <- plotPCA(vsd, intgroup="Group", returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color=Group)) +
geom_point(size=3) +
xlab(paste0("PC1: ", percentVar[1], "% variance")) +
ylab(paste0("PC2: ", percentVar[2], "% variance")) +
coord_fixed() +
theme_minimal()
We create a DESeqDataSet
object, normalize the data, and run the differential expression analysis using DESeq2. This model tests for differences in gene expression between the Lesional and Non-Lesional groups.
# Create DESeqDataSet and run DESeq2
ds <- DESeqDataSetFromMatrix(countData=tbl, colData=sample_info, design= ~Group)
ds <- DESeq(ds, test="Wald", sfType="poscount")
We extract the results from DESeq2, separating the differentially expressed genes (DEGs) into upregulated and downregulated categories, and prepare the top genes for further analysis.
# Extract results for top genes
r <- results(ds, contrast = c("Group", groups[1], groups[2]), alpha = 0.01, pAdjustMethod = "fdr")
# Separate downregulated and upregulated DEGs
downregulated <- r[r$log2FoldChange < 0 & r$padj < 0.01, ]
upregulated <- r[r$log2FoldChange > 0 & r$padj < 0.01, ]
# Take the top 10 genes for downregulated and upregulated
top_downregulated <- downregulated[order(downregulated$log2FoldChange)[1:10], ]
top_upregulated <- upregulated[order(upregulated$log2FoldChange, decreasing = TRUE)[1:10], ]
# Merge with gene annotations
top_downregulated <- merge(as.data.frame(top_downregulated), annot, by = 0, sort = FALSE)
top_upregulated <- merge(as.data.frame(top_upregulated), annot, by = 0, sort = FALSE)
# Round "padj" and "log2FoldChange" columns for readability
top_downregulated <- within(top_downregulated, {
padj <- sprintf("%.4f", round(padj, 2))
log2FoldChange <- sprintf("%.4f", round(log2FoldChange, 2))
})
top_upregulated <- within(top_upregulated, {
padj <- sprintf("%.4f", round(padj, 2))
log2FoldChange <- sprintf("%.4f", round(log2FoldChange, 2))
})
# Print the top_downregulated table using kable
kable(top_downregulated, format = "markdown")
kable(top_upregulated, format = "markdown")
We generate various plots to visualize the data and the results of the differential expression analysis.
# Box-and-whisker plot
dat <- log10(counts(ds, normalized = T) + 1)
ord <- order(gs)
palette(c("#1B9E77", "#7570B3"))
par(mar=c(7,4,2,1))
boxplot(dat[,ord], boxwex=0.6, notch=T, main="GSE157194", ylab="lg(norm.counts)", outline=F, las=2, col=gs[ord])
legend("topleft", groups, fill=palette(), bty="n")
# Dispersion estimates plot
plotDispEsts(ds, main="GSE157194 Dispersion Estimates")
# UMAP plot
dat <- dat[!duplicated(dat), ]
par(mar=c(3,3,2,6), xpd=TRUE, cex.main=1.5)
ump <- umap(t(dat), n_neighbors = 15, random_state = 123)
plot(ump$layout, main="UMAP plot, nbrs=15", xlab="", ylab="", col=gs, pch=20, cex=1.5)
legend("topright", inset=c(-0.15,0), legend=groups, pch=20, col=1:length(groups), title="Group", pt.cex=1.5)
# Histogram of adjusted p-values
hist(r$padj, breaks=seq(0, 1, length = 21), col = "grey", border = "white",
xlab = "", ylab = "", main = "GSE157194 Frequencies of padj-values")
# Volcano plot
old.pal <- palette(c("#00BFFF", "#FF3030"))
par(mar=c(4,4,2,1), cex.main=1.5)
plot(r$log2FoldChange, -log10(r$padj), main=paste(groups[1], "vs", groups[2]),
xlab="log2FC", ylab="-log10(Padj)", pch=20, cex=0.5)
with(subset(r, padj<0.01 & abs(log2FoldChange) >= 1.5),
points(log2FoldChange, -log10(padj), pch=20, col=(sign(log2FoldChange) + 3)/2, cex=1))
legend("bottomleft", title=paste("Padj<", 0.01, sep=""), legend=c("down", "up"), pch=20,col=1:2)
# MA plot
par(mar=c(4,4,2,1), cex.main=1.5)
plot(log10(r$baseMean), r$log2FoldChange, main=paste(groups[1], "vs", groups[2]),
xlab="log10(mean of normalized counts)", ylab="log2FoldChange", pch=20, cex=0.5)
with(subset(r, padj<0.01 & abs(log2FoldChange) >= 1.5),
points(log10(baseMean), log2FoldChange, pch=20, col=(sign(log2FoldChange) + 3)/2, cex=1))
legend("bottomleft", title=paste("Padj<", 0.01, sep=""), legend=c("down", "up"), pch=20,col=1:2)
abline(h=0)
palette(old.pal)
After identifying DEGs, we perform GO and KEGG pathway enrichment analyses to understand the biological significance of the results.
# GO enrichment analysis for upregulated and downregulated genes
up_genes <- top_upregulated$GeneID
down_genes <- top_downregulated$GeneID
# GO enrichment for upregulated genes
ego_up <- enrichGO(gene = up_genes, OrgDb = org.Hs.eg.db, keyType = "ENSEMBL",
ont = "BP", pAdjustMethod = "fdr", qvalueCutoff = 0.05)
# GO enrichment for downregulated genes
ego_down <- enrichGO(gene = down_genes, OrgDb = org.Hs.eg.db, keyType = "ENSEMBL",
ont = "BP", pAdjustMethod = "fdr", qvalueCutoff = 0.05)
# Plotting the top GO terms
barplot(ego_up, showCategory=10, title="Top GO terms for Upregulated Genes", drop=TRUE)
barplot(ego_down, showCategory=10, title="Top GO terms for Downregulated Genes", drop=TRUE)
# KEGG pathway enrichment analysis for upregulated and downregulated genes
kegg_up <- enrichKEGG(gene = up_genes, organism = "hsa", pAdjustMethod = "fdr", qvalueCutoff = 0.05)
kegg_down <- enrichKEGG(gene = down_genes, organism = "hsa", pAdjustMethod = "fdr", qvalueCutoff = 0.05)
# Plotting the top KEGG pathways
barplot(kegg_up, showCategory=10, title="Top KEGG Pathways for Upregulated Genes")
barplot(kegg_down, showCategory=10, title="Top KEGG Pathways for Downregulated Genes")
We visualize the overlap between different gene sets (total, upregulated, downregulated) using a Venn diagram.
# Venn diagram for gene sets
total_genes <- rownames(tbl)
upregulated_genes <- rownames(top_upregulated)
downregulated_genes <- rownames(top_downregulated)
venn_data <- list(
Total = total_genes,
Upregulated = upregulated_genes,
Downregulated = downregulated_genes
)
venn_plot <- venn.diagram(
x = venn_data,
category.names = c("Total", "Upregulated", "Downregulated"),
filename = NULL,
margin = 0,
cex.prop = 0.8,
offset.prop = 0.2
)
grid.draw(venn_plot)
Finally, we extract the top 20 differentially expressed genes from both categories and print the results.
# Extract results for top genes table (rounded)
r <- results(ds, contrast = c("Group", groups[1], groups[2]), alpha = 0.01, pAdjustMethod = "fdr")
# Separate downregulated and upregulated DEGs
downregulated <- r[r$log2FoldChange < 0 & r$padj < 0.01, ]
upregulated <- r[r$log2FoldChange > 0 & r$padj < 0.01, ]
# Take the top 20 genes for downregulated and upregulated
top_downregulated <- downregulated[order(downregulated$log2FoldChange)[1:20], ]
top_upregulated <- upregulated[order(upregulated$log2FoldChange, decreasing = TRUE)[1:20], ]
# Merge with gene annotations
top_downregulated <- merge(as.data.frame(top_downregulated), annot, by = 0, sort = FALSE)
top_upregulated <- merge(as.data.frame(top_upregulated), annot, by = 0, sort = FALSE)
# Round "padj" and "log2FoldChange" columns
top_downregulated <- within(top_downregulated, {
padj <- sprintf("%.4f", round(padj, 4))
log2FoldChange <- sprintf("%.4f", round(log2FoldChange, 4))
})
top_upregulated <- within(top_upregulated, {
padj <- sprintf("%.4f", round(padj, 4))
log2FoldChange <- sprintf("%.4f", round(log2FoldChange, 4))
})
# Print the top_downregulated table using kable
kable(top_downregulated, format = "markdown")
kable(top_upregulated, format = "markdown")
This complete pipeline guides you through performing differential gene expression analysis on the GSE157194 dataset using DESeq2. The analysis identifies significant gene expression changes between lesional and non-lesional skin samples from Atopic Dermatitis patients, and includes essential data visualizations and enrichment analyses to interpret the results biologically.