GSVA plots & tables for RNA-seq and NanoString

gene_x 0 like s 880 view s

Tags: processing, pipeline

  1. preparing gene expression matrix: calculate DESeq2 results

    1. #Input: merged_gene_counts.txt
    2. setwd("/home/jhuang/DATA/Data_Susanne_Carotis_RNASeq/run_2023_GSVA_table/")
    3. library("AnnotationDbi")
    4. library("clusterProfiler")
    5. library("ReactomePA")
    6. #BiocManager::install("org.Hs.eg.db")
    7. library("org.Hs.eg.db")
    8. library(DESeq2)
    9. library(gplots)
    10. d.raw<- read.delim2("merged_gene_counts.txt",sep="\t", header=TRUE, row.names=1)
    11. colnames(d.raw)<-c("gene_name", "leer_mock_2h_r2", "Ace2_mock_2h_r2", "leer_inf_24h_r1", "Ace2_inf_2h_r1", "leer_inf_24h_r2", "leer_inf_2h_r1", "leer_mock_2h_r1", "leer_inf_2h_r2", "Ace2_inf_2h_r2", "Ace2_mock_2h_r1", "Ace2_inf_24h_r2", "Ace2_inf_24h_r1")
    12. col_order <- c("gene_name", "leer_mock_2h_r1","leer_mock_2h_r2","leer_inf_2h_r1","leer_inf_2h_r2","leer_inf_24h_r1","leer_inf_24h_r2","Ace2_mock_2h_r1","Ace2_mock_2h_r2","Ace2_inf_2h_r1","Ace2_inf_2h_r2","Ace2_inf_24h_r1","Ace2_inf_24h_r2")
    13. reordered.raw <- d.raw[,col_order]
    14. reordered.raw$gene_name <- NULL
    15. #d <- d.raw[rowSums(reordered.raw>3)>2,]
    16. condition = as.factor(c("leer_mock_2h","leer_mock_2h","leer_inf_2h","leer_inf_2h","leer_inf_24h","leer_inf_24h","Ace2_mock_2h","Ace2_mock_2h","Ace2_inf_2h","Ace2_inf_2h","Ace2_inf_24h","Ace2_inf_24h"))
    17. ids = as.factor(c("leer_mock_2h_r1","leer_mock_2h_r2","leer_inf_2h_r1","leer_inf_2h_r2","leer_inf_24h_r1","leer_inf_24h_r2","Ace2_mock_2h_r1","Ace2_mock_2h_r2","Ace2_inf_2h_r1","Ace2_inf_2h_r2","Ace2_inf_24h_r1","Ace2_inf_24h_r2"))
    18. #cData = data.frame(row.names=colnames(reordered.raw), condition=condition, batch=batch, ids=ids)
    19. #dds<-DESeqDataSetFromMatrix(countData=reordered.raw, colData=cData, design=~batch+condition)
    20. cData = data.frame(row.names=colnames(reordered.raw), condition=condition, ids=ids)
    21. dds<-DESeqDataSetFromMatrix(countData=reordered.raw, colData=cData, design=~condition)
    22. #----more detailed and specific with the following code!----
    23. dds$condition <- relevel(dds$condition, "Ace2_mock_2h")
    24. dds = DESeq(dds, betaPrior=FALSE) # betaPrior default value is FALSE
    25. resultsNames(dds)
  2. preparing selected_geneSets in gsva(exprs, selected_geneSets, method="gsva"). Note that methods are different than methods for nanoString, here are ENSEMBL listed.

    1. #Input: "Signatures.xls" + "Signatures_additional.xls"
    2. library(readxl)
    3. library(gridExtra)
    4. library(ggplot2)
    5. library(GSVA)
    6. # Paths to the Excel files
    7. file_paths <- list("Signatures.xls", "Signatures_additional.xls")
    8. # Get sheet names for each file
    9. sheet_names_list <- lapply(file_paths, excel_sheets)
    10. # Initialize an empty list to hold gene sets
    11. geneSets <- list()
    12. # Loop over each file path and its corresponding sheet names
    13. for (i in 1:length(file_paths)) {
    14. file_path <- file_paths[[i]]
    15. sheet_names <- sheet_names_list[[i]]
    16. # Loop over each sheet, extract the ENSEMBL IDs, and add to the list
    17. for (sheet in sheet_names) {
    18. # Read the sheet
    19. data <- read_excel(file_path, sheet = sheet)
    20. # Process the GeneSet names (replacing spaces with underscores, for example)
    21. gene_set_name <- gsub(" ", "_", unique(data$GeneSet)[1])
    22. # Add ENSEMBL IDs to the list
    23. geneSets[[gene_set_name]] <- as.character(data$ENSEMBL)
    24. }
    25. }
    26. # Print the result to check
    27. #print(geneSets)
    28. #summary(geneSets)
    29. ##desired_geneSets <- c("Monocytes", "Plasma_cells", "T_regs", "Cyt._act._T_cells", "Neutrophils", "Inflammatory_neutrophils", "Suppressive_neutrophils", "LDG", "CD40_activated")
    30. #desired_geneSets <- c("IFN", "TNF", "IL-6R_complex", "IL-1_cytokines", "Pro-inflam._IL-1", "Monocyte_secreted", "Apoptosis", "NFkB_complex", "NLRP3_inflammasome")
    31. #selected_geneSets <- geneSets[desired_geneSets]
    32. selected_geneSets <- geneSets
    33. # Print the selected gene sets
    34. print(selected_geneSets)
  3. prepare violin plots and p-value table for RNA-seq data

    1. # 0. for RNAseq, the GSVA input requires a gene expression matrix where rows are genes and columns are samples. This matrix must be in non-log space.
    2. exprs <- counts(dds, normalized=TRUE)
    3. # 1. Compute GSVA scores:
    4. gsva_scores <- gsva(exprs, selected_geneSets, method="gsva")
    5. # 2. Convert to data.frame for ggplot:
    6. gsva_df <- as.data.frame(t(gsva_scores))
    7. # 3. Add conditions to gsva_df:
    8. gsva_df$Condition <- dds$condition
    9. # 4. Filter the gsva_df to retain only the desired conditions:
    10. #group 1 vs. group 3 in the nanostring data
    11. gsva_df_filtered <- gsva_df[gsva_df$Condition %in% c("Ace2_mock_2h", "Ace2_inf_24h"), ]
    12. # 5. Define a function to plot violin plots:
    13. # Update the condition levels in gsva_df_filtered to ensure the desired order on x-axis:
    14. gsva_df_filtered$Condition <- gsub("Ace2_mock_2h", "Group3", gsva_df_filtered$Condition) #group3=mock
    15. gsva_df_filtered$Condition <- gsub("Ace2_inf_24h", "Group1a", gsva_df_filtered$Condition) #group1a=infection
    16. gsva_df_filtered$Condition <- factor(gsva_df_filtered$Condition, levels = c("Group1a", "Group3"))
    17. #plot_violin <- function(data, gene_name) {
    18. # # Calculate the t-test p-value for the two conditions
    19. # condition1_data <- data[data$Condition == "Group1a", gene_name]
    20. # condition2_data <- data[data$Condition == "Group3", gene_name]
    21. # p_value <- t.test(condition1_data, condition2_data)$p.value
    22. # # Convert p-value to annotation
    23. # p_annotation <- ifelse(p_value < 0.01, "**", ifelse(p_value < 0.05, "*", ""))
    24. # rounded_p_value <- paste0("p = ", round(p_value, 2))
    25. # plot_title <- gsub("_", " ", gene_name)
    26. # p <- ggplot(data, aes(x=Condition, y=!!sym(gene_name), fill=Condition)) +
    27. # geom_violin(linewidth=1.2) +
    28. # scale_fill_manual(values = custom_colors) +
    29. # labs(title=plot_title, y="GSVA Score") +
    30. # ylim(-1, 1) +
    31. # theme_light() +
    32. # theme(
    33. # axis.title.x = element_text(size=12),
    34. # axis.title.y = element_text(size=12),
    35. # axis.text.x = element_text(size=10),
    36. # axis.text.y = element_text(size=10),
    37. # plot.title = element_text(size=12, hjust=0.5),
    38. # legend.position = "none" # Hide legend since the colors are self-explanatory
    39. # )
    40. # # Add p-value annotation to the plot
    41. # p <- p + annotate("text", x=1.5, y=0.9, label=paste0(p_annotation, " ", rounded_p_value), size=5, hjust=0.5)
    42. # return(p)
    43. #}
    44. ## 6. Generate the list of plots in a predefined order:
    45. #genes <- colnames(gsva_df_filtered)[!colnames(gsva_df_filtered) %in% "Condition"]
    46. #genes <- genes[match(desired_order, genes)]
    47. #genes <- genes[!is.na(genes)]
    48. #first_row_plots <- lapply(genes, function(gene) plot_violin(gsva_df_filtered, gene))
    49. # -- This following code does not have the colors in the figure --
    50. plot_violin <- function(data, gene_name) {
    51. # Calculate the t-test p-value for the two conditions
    52. condition1_data <- data[data$Condition == "Group1a", gene_name]
    53. condition2_data <- data[data$Condition == "Group3", gene_name]
    54. p_value <- t.test(condition1_data, condition2_data)$p.value
    55. # Convert p-value to annotation
    56. p_annotation <- ifelse(p_value < 0.01, "**", ifelse(p_value < 0.05, "*", ""))
    57. rounded_p_value <- paste0("p = ", round(p_value, 2))
    58. plot_title <- gsub("_", " ", gene_name)
    59. p <- ggplot(data, aes(x=Condition, y=!!sym(gene_name))) +
    60. geom_violin(linewidth=1.2) +
    61. labs(title=plot_title, y="GSVA Score") +
    62. ylim(-1, 1) +
    63. theme_light() +
    64. theme(
    65. axis.title.x = element_text(size=12),
    66. axis.title.y = element_text(size=12),
    67. axis.text.x = element_text(size=10),
    68. axis.text.y = element_text(size=10),
    69. plot.title = element_text(size=12, hjust=0.5)
    70. )
    71. # Add p-value annotation to the plot
    72. p <- p + annotate("text", x=1.5, y=0.9, label=paste0(p_annotation, " ", rounded_p_value), size=5, hjust=0.5)
    73. #return(p)
    74. # Return both the plot and the p-value
    75. list(plot = p, p_value = p_value)
    76. }
    77. # 6. Generate the list of plots in a predefined order:
    78. #desired_order <- c("IFN", "TNF", "IL-6R_complex", "IL-1_cytokines", "Pro-inflam._IL-1", "Monocyte_secreted", "Apoptosis", "NFkB_complex", "NLRP3_inflammasome")
    79. genes <- colnames(gsva_df_filtered)[!colnames(gsva_df_filtered) %in% "Condition"]
    80. #genes <- genes[match(desired_order, genes)]
    81. #first_row_plots <- lapply(genes, function(gene) plot_violin(gsva_df_filtered, gene))
    82. # Correct the creation of p_values_df
    83. p_values_list <- lapply(genes, function(gene) plot_violin(gsva_df_filtered, gene)$p_value)
    84. p_values_df <- data.frame(Gene = genes, P_Value = unlist(p_values_list))
    85. # Calculate adjusted p-values
    86. #p_values_df$Adjusted_P_Value <- p.adjust(p_values_df$P_Value, method = "fdr")
    87. write.xlsx(p_values_df, "p_values_df.xlsx")

5.1. preparing selected_geneSets in gsva(exprs, selected_geneSets, method="gsva") for NanoString

  1. #Input: Signatures.xls
  2. library(readxl)
  3. library(gridExtra)
  4. library(ggplot2)
  5. library(GSVA)
  6. # Paths to the Excel files
  7. file_paths <- list("Signatures.xls", "Signatures_additional.xls")
  8. # Get sheet names for each file
  9. sheet_names_list <- lapply(file_paths, excel_sheets)
  10. # Initialize an empty list to hold gene sets
  11. geneSets <- list()
  12. # Loop over each file path and its corresponding sheet names
  13. for (i in 1:length(file_paths)) {
  14. file_path <- file_paths[[i]]
  15. sheet_names <- sheet_names_list[[i]]
  16. # Loop over each sheet, extract the ENSEMBL IDs, and add to the list
  17. for (sheet in sheet_names) {
  18. # Read the sheet
  19. data <- read_excel(file_path, sheet = sheet)
  20. # Process the GeneSet names (replacing spaces with underscores, for example)
  21. gene_set_name <- gsub(" ", "_", unique(data$GeneSet)[1])
  22. # Add ENSEMBL IDs to the list
  23. geneSets[[gene_set_name]] <- unique(as.character(data$geneSymbol))
  24. }
  25. }
  26. # Print the result to check
  27. summary(geneSets)
  28. #desired_geneSets <- c("Monocytes", "Plasma_cells", "T_regs", "Cyt._act._T_cells", "Neutrophils", "Inflammatory_neutrophils", "Suppressive_neutrophils", "LDG", "CD40_activated")
  29. desired_geneSets <- c("IFN", "TNF", "IL-6R_complex", "IL-1_cytokines", "Pro-inflam._IL-1", "Monocyte_secreted", "Apoptosis", "NFkB_complex", "NLRP3_inflammasome", "T_cells", "Monocytes","Plasma_cells","T_regs","Cyt._act._T_cells","Neutrophils","Inflammatory_neutrophils","Suppressive_neutrophils","LDG","CD40_activated")
  30. selected_geneSets <- geneSets[desired_geneSets]
  31. # Print the selected gene sets
  32. print(selected_geneSets)

5.2. prepare violin plots and p-value table for NanoString data (Group 3 vs Group 1)

  1. #library(rmarkdown); render("run.Rmd", c("html_document")) under jhuang@hamburg:~/DATA/Data_Susanne_Carotis_spatialRNA_PUBLISHING/run_2023_2_GSVA_table with R
  2. # 0. for Nanostring, the GSVA input requires a gene expression matrix 'exprs' where rows are genes and columns are samples. This matrix must be in non-log space.
  3. exprs <- exprs(target_m666Data) #18677 45
  4. #exprs <- exprs(filtered_or_neg_target_m666Data) #2274 45
  5. # 1. Compute GSVA scores:
  6. gsva_scores <- gsva(exprs, selected_geneSets, method="gsva")
  7. # 2. Convert to data.frame for ggplot:
  8. gsva_df <- as.data.frame(t(gsva_scores))
  9. # 3. Add conditions to gsva_df:
  10. identical(rownames(pData(target_m666Data)), rownames(gsva_df))
  11. gsva_df$Condition <- pData(target_m666Data)$Grp
  12. #identical(rownames(gsva_df_filtered), rownames(pData(target_m666Data)) )
  13. gsva_df$SampleID <- pData(target_m666Data)$SampleID
  14. # 4. Filter the gsva_df to retain only the desired conditions:
  15. #group 1 vs. group 3 in the nanostring data
  16. gsva_df_filtered <- gsva_df[gsva_df$Condition %in% c("1", "3"), ]
  17. # 5. Define a function to plot violin plots:
  18. # Define custom colors
  19. custom_colors <- c("Group1" = "lightblue", "Group1a" = "red", "Group3" = "grey")
  20. #To implement the custom colors, and make the adjustments to abbreviate "Inflammatory" and "Suppressive", as well as increase the font size for the groups on the x-axis, we can modify the plot_violin function as follows:
  21. gsva_df_filtered$Condition <- gsub("1", "Group1", gsva_df_filtered$Condition)
  22. gsva_df_filtered$Condition <- gsub("3", "Group3", gsva_df_filtered$Condition)
  23. gsva_df_filtered$Condition <- factor(gsva_df_filtered$Condition, levels = c("Group1", "Group3"))
  24. plot_violin <- function(data, gene_name) {
  25. # Calculate the t-test p-value for the two conditions
  26. condition1_data <- data[data$Condition == "Group1", gene_name]
  27. condition2_data <- data[data$Condition == "Group3", gene_name]
  28. p_value <- t.test(condition1_data, condition2_data)$p.value
  29. # Convert p-value to annotation
  30. p_annotation <- ifelse(p_value < 0.01, "**", ifelse(p_value < 0.05, "*", ""))
  31. rounded_p_value <- paste0("p = ", round(p_value, 2))
  32. plot_title <- gsub("_", " ", gene_name)
  33. p <- ggplot(data, aes(x=Condition, y=!!sym(gene_name), fill=Condition)) +
  34. geom_violin(linewidth=1.2) +
  35. scale_fill_manual(values = custom_colors) +
  36. labs(title=plot_title, y="GSVA Score") +
  37. ylim(-1, 1) +
  38. theme_light() +
  39. theme(
  40. axis.title.x = element_text(size=12),
  41. axis.title.y = element_text(size=12),
  42. axis.text.x = element_text(size=10),
  43. axis.text.y = element_text(size=10),
  44. plot.title = element_text(size=12, hjust=0.5),
  45. legend.position = "none" # Hide legend since the colors are self-explanatory
  46. )
  47. # Add p-value annotation to the plot
  48. p <- p + annotate("text", x=1.5, y=0.9, label=paste0(p_annotation, " ", rounded_p_value), size=5, hjust=0.5)
  49. #return(p)
  50. # Return both the plot and the p-value
  51. list(plot = p, p_value = p_value)
  52. }
  53. library(openxlsx)
  54. desired_order <- c("IFN", "TNF", "IL-6R_complex", "IL-1_cytokines", "Pro-inflam._IL-1", "Monocyte_secreted", "Apoptosis", "NFkB_complex", "NLRP3_inflammasome", "T_cells", "Monocytes","Plasma_cells","T_regs","Cyt._act._T_cells","Neutrophils","Inflammatory_neutrophils","Suppressive_neutrophils","LDG","CD40_activated")
  55. genes <- colnames(gsva_df_filtered)[!colnames(gsva_df_filtered) %in% "Condition"]
  56. genes <- genes[match(desired_order, genes)]
  57. genes <- genes[!is.na(genes)]
  58. #second_row_plots <- lapply(genes, function(gene) plot_violin(gsva_df_filtered, gene))
  59. # Correct the creation of p_values_df
  60. p_values_list <- lapply(genes, function(gene) plot_violin(gsva_df_filtered, gene)$p_value)
  61. p_values_df <- data.frame(Gene = genes, P_Value = unlist(p_values_list))
  62. write.xlsx(p_values_df, "p_values_df_Group3_vs_Group1_19sig.xlsx")

5.3. prepare violin plots and p-value table for NanoString data (Group 3 vs Group 1a)

  1. # 2. Convert to data.frame for ggplot:
  2. gsva_df <- as.data.frame(t(gsva_scores))
  3. # 3. Add conditions to gsva_df:
  4. identical(rownames(pData(target_m666Data)), rownames(gsva_df))
  5. gsva_df$Condition <- pData(target_m666Data)$Group
  6. #identical(rownames(gsva_df_filtered), rownames(pData(target_m666Data)) )
  7. gsva_df$SampleID <- pData(target_m666Data)$SampleID
  8. # 4. Filter the gsva_df to retain only the desired conditions:
  9. #group 1 vs. group 3 in the nanostring data
  10. gsva_df_filtered <- gsva_df[gsva_df$Condition %in% c("1a", "3"), ]
  11. # 5. Define a function to plot violin plots:
  12. # Update the condition levels in gsva_df_filtered to ensure the desired order on x-axis:
  13. gsva_df_filtered$Condition <- gsub("1a", "Group1a", gsva_df_filtered$Condition)
  14. gsva_df_filtered$Condition <- gsub("3", "Group3", gsva_df_filtered$Condition)
  15. gsva_df_filtered$Condition <- factor(gsva_df_filtered$Condition, levels = c("Group1a", "Group3"))
  16. plot_violin <- function(data, gene_name) {
  17. # Calculate the t-test p-value for the two conditions
  18. condition1_data <- data[data$Condition == "Group1a", gene_name]
  19. condition2_data <- data[data$Condition == "Group3", gene_name]
  20. p_value <- t.test(condition1_data, condition2_data)$p.value
  21. # Convert p-value to annotation
  22. p_annotation <- ifelse(p_value < 0.01, "**", ifelse(p_value < 0.05, "*", ""))
  23. rounded_p_value <- paste0("p = ", round(p_value, 2))
  24. plot_title <- gsub("_", " ", gene_name)
  25. p <- ggplot(data, aes(x=Condition, y=!!sym(gene_name), fill=Condition)) +
  26. geom_violin(linewidth=1.2) +
  27. scale_fill_manual(values = custom_colors) +
  28. labs(title=plot_title, y="GSVA Score") +
  29. ylim(-1, 1) +
  30. theme_light() +
  31. theme(
  32. axis.title.x = element_text(size=12),
  33. axis.title.y = element_text(size=12),
  34. axis.text.x = element_text(size=10),
  35. axis.text.y = element_text(size=10),
  36. plot.title = element_text(size=12, hjust=0.5),
  37. legend.position = "none" # Hide legend since the colors are self-explanatory
  38. )
  39. # Add p-value annotation to the plot
  40. p <- p + annotate("text", x=1.5, y=0.9, label=paste0(p_annotation, " ", rounded_p_value), size=5, hjust=0.5)
  41. #return(p)
  42. # Return both the plot and the p-value
  43. list(plot = p, p_value = p_value)
  44. }
  45. # 6. Generate the list of plots in a predefined order:
  46. genes <- colnames(gsva_df_filtered)[!colnames(gsva_df_filtered) %in% "Condition"]
  47. genes <- genes[match(desired_order, genes)]
  48. genes <- genes[!is.na(genes)]
  49. # Correct the creation of p_values_df
  50. p_values_list <- lapply(genes, function(gene) plot_violin(gsva_df_filtered, gene)$p_value)
  51. p_values_df <- data.frame(Gene = genes, P_Value = unlist(p_values_list))
  52. write.xlsx(p_values_df, "p_values_df_Group3_vs_Group1a_19sig.xlsx")

like unlike

点赞本文的读者

还没有人对此文章表态


本文有评论

没有评论

看文章,发评论,不要沉默


© 2023 XGenes.com Impressum