Human RNA-seq processing for Data_Ben_RNAseq_2025

gene_x 0 like s 14 view s

Tags: processing, human, RNA-seq

Related posts:

http://xgenes.com/article/article-content/242/rna-seq-data-analysis-of-yersinia-on-grch38/

http://xgenes.com/article/article-content/209/rna-seq-skin-organoids-on-grch38-chrhsv1-final/

  1. Create raw_data

    mkdir raw_data; cd raw_data
    
    ln -s ../20250805_AV243904_0023_A/1_Palpha_2_5h_418_3/1_Palpha_2_5h_418_3_R1.fastq.gz Palpha_2_5h_3_R1.fastq.gz
    ln -s ../20250805_AV243904_0023_A/1_Palpha_2_5h_418_3/1_Palpha_2_5h_418_3_R2.fastq.gz Palpha_2_5h_3_R2.fastq.gz
    ln -s ../20250805_AV243904_0023_A/1_Palpha_2_5h_418_4/1_Palpha_2_5h_418_4_R1.fastq.gz Palpha_2_5h_4_R1.fastq.gz
    ln -s ../20250805_AV243904_0023_A/1_Palpha_2_5h_418_4/1_Palpha_2_5h_418_4_R2.fastq.gz Palpha_2_5h_4_R2.fastq.gz
    ln -s ../20250805_AV243904_0023_A/2_SA_2_5h_418_3/2_SA_2_5h_418_3_R1.fastq.gz SA_2_5h_3_R1.fastq.gz
    ln -s ../20250805_AV243904_0023_A/2_SA_2_5h_418_3/2_SA_2_5h_418_3_R2.fastq.gz SA_2_5h_3_R2.fastq.gz
    ln -s ../20250805_AV243904_0023_A/2_SA_2_5h_418_4/2_SA_2_5h_418_4_R1.fastq.gz SA_2_5h_4_R1.fastq.gz
    ln -s ../20250805_AV243904_0023_A/2_SA_2_5h_418_4/2_SA_2_5h_418_4_R2.fastq.gz SA_2_5h_4_R2.fastq.gz
    ln -s ../20250805_AV243904_0023_A/3_Pminus_2_5h_418_3/3_Pminus_2_5h_418_3_R1.fastq.gz Pminus_2_5h_3_R1.fastq.gz
    ln -s ../20250805_AV243904_0023_A/3_Pminus_2_5h_418_3/3_Pminus_2_5h_418_3_R2.fastq.gz Pminus_2_5h_3_R2.fastq.gz
    ln -s ../20250805_AV243904_0023_A/3_Pminus_2_5h_418_4/3_Pminus_2_5h_418_4_R1.fastq.gz Pminus_2_5h_4_R1.fastq.gz
    ln -s ../20250805_AV243904_0023_A/3_Pminus_2_5h_418_4/3_Pminus_2_5h_418_4_R2.fastq.gz Pminus_2_5h_4_R2.fastq.gz
    
    ln -s ../20250805_AV243904_0023_A/4_Palpha_5h_418_3/4_Palpha_5h_418_3_R1.fastq.gz Palpha_5h_3_R1.fastq.gz
    ln -s ../20250805_AV243904_0023_A/4_Palpha_5h_418_3/4_Palpha_5h_418_3_R2.fastq.gz Palpha_5h_3_R2.fastq.gz
    ln -s ../20250805_AV243904_0023_A/4_Palpha_5h_418_4/4_Palpha_5h_418_4_R1.fastq.gz Palpha_5h_4_R1.fastq.gz
    ln -s ../20250805_AV243904_0023_A/4_Palpha_5h_418_4/4_Palpha_5h_418_4_R2.fastq.gz Palpha_5h_4_R2.fastq.gz
    ln -s ../20250805_AV243904_0023_A/5_SA_5h_418_3/5_SA_5h_418_3_R1.fastq.gz SA_5h_3_R1.fastq.gz
    ln -s ../20250805_AV243904_0023_A/5_SA_5h_418_3/5_SA_5h_418_3_R2.fastq.gz SA_5h_3_R2.fastq.gz
    ln -s ../20250805_AV243904_0023_A/5_SA_5h_418_4/5_SA_5h_418_4_R1.fastq.gz SA_5h_4_R1.fastq.gz
    ln -s ../20250805_AV243904_0023_A/5_SA_5h_418_4/5_SA_5h_418_4_R2.fastq.gz SA_5h_4_R2.fastq.gz
    ln -s ../20250805_AV243904_0023_A/6_Pminus_5h_418_3/6_Pminus_5h_418_3_R1.fastq.gz Pminus_5h_3_R1.fastq.gz
    ln -s ../20250805_AV243904_0023_A/6_Pminus_5h_418_3/6_Pminus_5h_418_3_R2.fastq.gz Pminus_5h_3_R2.fastq.gz
    ln -s ../20250805_AV243904_0023_A/6_Pminus_5h_418_4/6_Pminus_5h_418_4_R1.fastq.gz Pminus_5h_4_R1.fastq.gz
    ln -s ../20250805_AV243904_0023_A/6_Pminus_5h_418_4/6_Pminus_5h_418_4_R2.fastq.gz Pminus_5h_4_R2.fastq.gz
    
    ln -s ../20250805_AV243904_0023_A/7_uninf_5h_418_3/7_uninf_5h_418_3_R1.fastq.gz uninf_5h_3_R1.fastq.gz
    ln -s ../20250805_AV243904_0023_A/7_uninf_5h_418_3/7_uninf_5h_418_3_R2.fastq.gz uninf_5h_3_R2.fastq.gz
    ln -s ../20250805_AV243904_0023_A/7_uninf_5h_418_4/7_uninf_5h_418_4_R1.fastq.gz uninf_5h_4_R1.fastq.gz
    ln -s ../20250805_AV243904_0023_A/7_uninf_5h_418_4/7_uninf_5h_418_4_R2.fastq.gz uninf_5h_4_R2.fastq.gz
    
    mkdir trimmed trimmed_unpaired;
    for sample_id in Palpha_2_5h_3 Palpha_2_5h_4 SA_2_5h_3 SA_2_5h_4 Pminus_2_5h_3 Pminus_2_5h_4 Palpha_5h_3 Palpha_5h_4 SA_5h_3 SA_5h_4 Pminus_5h_3 Pminus_5h_4 uninf_5h_3 uninf_5h_4; do
            java -jar /home/jhuang/Tools/Trimmomatic-0.36/trimmomatic-0.36.jar PE -threads 100 raw_data/${sample_id}_R1.fastq.gz raw_data/${sample_id}_R2.fastq.gz trimmed/${sample_id}_R1.fastq.gz trimmed_unpaired/${sample_id}_R1.fastq.gz trimmed/${sample_id}_R2.fastq.gz trimmed_unpaired/${sample_id}_R2.fastq.gz ILLUMINACLIP:/home/jhuang/Tools/Trimmomatic-0.36/adapters/TruSeq3-PE-2.fa:2:30:10:8:TRUE LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36 AVGQUAL:20; done 2> trimmomatic_pe.log;
    done
    mv trimmed/*.fastq.gz .
    
  2. Create samplesheet.csv

    sample,fastq_1,fastq_2,strandedness
    Palpha_2_5h_3,Palpha_2_5h_3_R1.fastq.gz,Palpha_2_5h_3_R2.fastq.gz,auto
    Palpha_2_5h_4,Palpha_2_5h_4_R1.fastq.gz,Palpha_2_5h_4_R2.fastq.gz,auto
    SA_2_5h_3,SA_2_5h_3_R1.fastq.gz,SA_2_5h_3_R2.fastq.gz,auto
    SA_2_5h_4,SA_2_5h_4_R1.fastq.gz,SA_2_5h_4_R2.fastq.gz,auto
    ...
    
  3. Run nextflow

    ln -s ~/Tools/nf-core-rnaseq-3.12.0/ rnaseq
    #Adapt the genome path in ~/Tools/nf-core-rnaseq-3.12.0/conf/igenomes.config
    #Note that the current path is "/mnt/nvme1n1p1/Homo_sapiens/Ensembl/GRCh38/..."
    
    #---- SUCCESSFUL -----
    (host_env) /usr/local/bin/nextflow run rnaseq/main.nf --input samplesheet.csv --outdir results    --genome GRCh38         -profile docker -resume  --max_cpus 60 --max_memory 512.GB --max_time 2400.h --save_reference --aligner star_salmon  --skip_deseq2_qc --skip_fastqc
    
  4. Import data and construct DESeqDataSet object

    # Import the required libraries
    library("AnnotationDbi")
    library("clusterProfiler")
    library("ReactomePA")
    library(gplots)
    library(tximport)
    library(DESeq2)
    library("org.Hs.eg.db")
    library(dplyr)
    library(tidyverse)
    setwd("~/DATA/Data_Ben_RNAseq_2025/results/star_salmon")
    # Define paths to your Salmon output quantification files
    files <- c("Palpha_2_5h_r3" = "./Palpha_2_5h_3/quant.sf",
            "Palpha_2_5h_r4" = "./Palpha_2_5h_4/quant.sf",
            "SA_2_5h_r3" = "./SA_2_5h_3/quant.sf",
            "SA_2_5h_r4" = "./SA_2_5h_4/quant.sf",
            "Pminus_2_5h_r3" = "./Pminus_2_5h_3/quant.sf",
            "Pminus_2_5h_r4" = "./Pminus_2_5h_4/quant.sf",
            "Palpha_5h_r3" = "./Palpha_5h_3/quant.sf",
            "Palpha_5h_r4" = "./Palpha_5h_4/quant.sf",
            "SA_5h_r3" = "./SA_5h_3/quant.sf",
            "SA_5h_r4" = "./SA_5h_4/quant.sf",
            "Pminus_5h_r3" = "./Pminus_5h_3/quant.sf",
            "Pminus_5h_r4" = "./Pminus_5h_4/quant.sf",
            "uninf_5h_r3" = "./uninf_5h_3/quant.sf",
            "uninf_5h_r4" = "./uninf_5h_4/quant.sf")
    # Import the transcript abundance data with tximport
    txi <- tximport(files, type = "salmon", txIn = TRUE, txOut = TRUE)
    # Define the replicates and condition of the samples
    replicate <- factor(c("r3", "r4", "r3", "r4", "r3", "r4", "r3", "r4", "r3", "r4", "r3", "r4", "r3", "r4"))
    condition <- factor(c("Palpha_2_5h","Palpha_2_5h","SA_2_5h","SA_2_5h","Pminus_2_5h","Pminus_2_5h",    "Palpha_5h","Palpha_5h","SA_5h","SA_5h","Pminus_5h","Pminus_5h",    "uninf_5h","uninf_5h"))
    
    # Define the colData for DESeq2
    colData <- data.frame(condition=condition, replicate=replicate, row.names=names(files))
    # -- transcript-level count data (x2) --
    # Create DESeqDataSet object
    dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition)
    write.csv(counts(dds), file="transcript_counts.csv")
    
    # -- gene-level count data (x2) --
    # Read in the tx2gene map from salmon_tx2gene.tsv
    tx2gene <- read.table("salmon_tx2gene.tsv", header=FALSE, stringsAsFactors=FALSE)
    # Set the column names
    colnames(tx2gene) <- c("transcript_id", "gene_id", "gene_name")
    # Remove the gene_name column if not needed
    tx2gene <- tx2gene[,1:2]
    # Import and summarize the Salmon data with tximport
    txi <- tximport(files, type = "salmon", tx2gene = tx2gene, txOut = FALSE)
    # Continue with the DESeq2 workflow as before...
    colData <- data.frame(condition=condition, replicate=replicate, row.names=names(files))
    dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition)
    #dds <- dds[rowSums(counts(dds) > 3) > 2, ]    #60605-->26543
    write.csv(counts(dds, normalized=FALSE), file="gene_counts.csv")
    
    d.raw <- read.csv("gene_counts.csv", header=TRUE, row.names=1)
    dds <- DESeqDataSetFromMatrix(countData=d.raw, colData=colData, design=~condition+replicate)
    #dds <- DESeqDataSetFromMatrix(countData=d.raw, colData=colData, design=~condition)
    dim(counts(dds))
    head(counts(dds), 10)
    rld <- rlogTransformation(dds)
    
    #We don't need to run DESeq(dds) before estimateSizeFactors(dds). In fact, the typical workflow in DESeq2 is the opposite: we usually run estimateSizeFactors(dds) (and other preprocessing functions) before running the main DESeq(dds) function.
    #The estimateSizeFactors function is used to calculate size factors for normalization, which corrects for differences in library size (i.e., the number of read counts) between samples. This normalization step is crucial to ensure that differences in gene expression aren't merely due to differences in sequencing depth between samples.
    #The DESeq function, on the other hand, performs the main differential expression analysis, comparing gene expression between different conditions or groups.
    #So, the typical workflow is:
    #  - Create the DESeqDataSet object.
    #  - Use estimateSizeFactors to normalize for library size.
    #  - (Optionally, estimate dispersion with estimateDispersions if not using the full DESeq function later.)
    #  - Use DESeq for the differential expression analysis.
    #  - However, it's worth noting that if you run the main DESeq function directly after creating the DESeqDataSet object, it will automatically perform the normalization (using estimateSizeFactors) and dispersion estimation steps for you. In that case, there's no need to run estimateSizeFactors separately before DESeq.
    
    # draw simple pca and heatmap
    library(gplots)
    library("RColorBrewer")
    mat <- assay(rld)
    mm <- model.matrix(~condition, colData(rld))
    mat <- limma::removeBatchEffect(mat, batch=rld$replicate, design=mm)
    assay(rld) <- mat
    # -- pca --
    png("pca.png", 1200, 800)
    plotPCA(rld, intgroup=c("condition"))
    dev.off()
    png("pca_rep.png", 1200, 800)
    plotPCA(rld, intgroup=c("replicate"))
    dev.off()
    # -- heatmap --
    png("heatmap.png", 1200, 800)
    distsRL <- dist(t(assay(rld)))
    mat <- as.matrix(distsRL)
    hc <- hclust(distsRL)
    hmcol <- colorRampPalette(brewer.pal(9,"GnBu"))(100)
    heatmap.2(mat, Rowv=as.dendrogram(hc),symm=TRUE, trace="none",col = rev(hmcol), margin=c(13, 13))
    dev.off()
    
  5. (Optional, possibly has error) Size Factors

    sizeFactors(dds)
    #NULL
    # Estimate size factors
    dds <- estimateSizeFactors(dds)
    # Estimate dispersions
    dds <- estimateDispersions(dds)
    
  6. Select the differentially expressed genes

    #CONSOLE: mkdir degenes
    setwd("/mnt/md1/DATA/Data_Ben_RNAseq_2025/results/star_salmon/degenes")
    
    dds$condition <- relevel(dds$condition, "SA_2_5h")
    dds = DESeq(dds, betaPrior=FALSE)
    resultsNames(dds)
    clist <- c("Palpha_2_5h_vs_SA_2_5h")
    
    dds$condition <- relevel(dds$condition, "SA_5h")
    dds = DESeq(dds, betaPrior=FALSE)
    resultsNames(dds)
    clist <- c("Palpha_5h_vs_SA_5h")
    
    dds$condition <- relevel(dds$condition, "Pminus_2_5h")
    dds = DESeq(dds, betaPrior=FALSE)
    resultsNames(dds)
    clist <- c("Palpha_2_5h_vs_Pminus_2_5h", "SA_2_5h_vs_Pminus_2_5h")
    
    dds$condition <- relevel(dds$condition, "Pminus_5h")
    dds = DESeq(dds, betaPrior=FALSE)
    resultsNames(dds)
    clist <- c("Palpha_5h_vs_Pminus_5h", "SA_5h_vs_Pminus_5h")
    
    #dds$condition <- relevel(dds$condition, "uninf_2_5h")
    #dds = DESeq(dds, betaPrior=FALSE)
    #resultsNames(dds)
    #clist <- c("Palpha_2_5h_vs_uninf_2_5h", "SA_2_5h_vs_uninf_2_5h", "Pminus_2_5h_vs_uninf_2_5h")
    
    dds$condition <- relevel(dds$condition, "uninf_5h")
    dds = DESeq(dds, betaPrior=FALSE)
    resultsNames(dds)
    clist <- c("Palpha_5h_vs_uninf_5h", "SA_5h_vs_uninf_5h", "Pminus_5h_vs_uninf_5h")
    
    library(biomaRt)
    listEnsembl()
    listMarts()
    #--> total 69, 27  GRCh38.p7 and 39  GRCm38.p4
    ensembl <- useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl", version="104")
    datasets <- listDatasets(ensembl)
    
    for (i in clist) {
        i<-clist[1]
        contrast = paste("condition", i, sep="_")
        res = results(dds, name=contrast)
        res <- res[!is.na(res$log2FoldChange),]
        res_df <- as.data.frame(res)
        write.csv(as.data.frame(res_df[order(res_df$pvalue),]), file = paste(i, "all.txt", sep="-"))
        up <- subset(res_df, padj<=0.05 & log2FoldChange>=2)
        down <- subset(res_df, padj<=0.05 & log2FoldChange<=-2)
        write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
        write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
    }
    
    # -- Save annotated DEGs: annotate human genes 'geness_res', note that the virus genes in that is ignored in the process since they are not in the database --
    for (i in clist) {
        #i<-clist[1]
        contrast = paste("condition", i, sep="_")
        res = results(dds, name=contrast)
        res <- res[!is.na(res$log2FoldChange),]
        #geness <- AnnotationDbi::select(edb86, keys = rownames(res), keytype = "GENEID", columns = c("ENTREZID","EXONID","GENEBIOTYPE","GENEID","GENENAME","PROTEINDOMAINSOURCE","PROTEINID","SEQNAME","SEQSTRAND","SYMBOL","TXBIOTYPE","TXID","TXNAME","UNIPROTID"))
        #geness <- AnnotationDbi::select(edb86, keys = rownames(res), keytype = "GENEID", columns = c("GENEID", "ENTREZID", "SYMBOL", "GENENAME","GENEBIOTYPE","TXBIOTYPE","SEQSTRAND","UNIPROTID"))
        # In the ENSEMBL-database, GENEID is ENSEMBL-ID.
        #geness <- AnnotationDbi::select(edb86, keys = rownames(res), keytype = "GENEID", columns = c("GENEID", "SYMBOL", "GENEBIOTYPE"))  #  "ENTREZID", "TXID","TXBIOTYPE","TXSEQSTART","TXSEQEND"
        #geness <- geness[!duplicated(geness$GENEID), ]
        #using getBM replacing AnnotationDbi::select
        #filters = 'ensembl_gene_id' means the records should always have a valid ensembl_gene_ids.
        geness <- getBM(attributes = c('ensembl_gene_id', 'external_gene_name', 'gene_biotype', 'entrezgene_id', 'chromosome_name', 'start_position', 'end_position', 'strand', 'description'),
            filters = 'ensembl_gene_id',
            values = rownames(res),
            mart = ensembl)
        geness_uniq <- distinct(geness, ensembl_gene_id, .keep_all= TRUE)
        #merge by column by common colunmn name, in the case "GENEID"
        res$ENSEMBL = rownames(res)
        identical(rownames(res), rownames(geness_uniq))
        res_df <- as.data.frame(res)
        geness_res <- merge(geness_uniq, res_df, by.x="ensembl_gene_id", by.y="ENSEMBL")
        dim(geness_res)
        rownames(geness_res) <- geness_res$ensembl_gene_id
        geness_res$ensembl_gene_id <- NULL
    
        df_sorted <- as.data.frame(geness_res[order(geness_res$padj),])
        write.csv(df_sorted, file = paste(i, "all_annotated.txt", sep="-"))
        up <- subset(df_sorted, padj<=0.05 & log2FoldChange>=2)
        down <- subset(df_sorted, padj<=0.05 & log2FoldChange<=-2)
        write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up_annotated.txt", sep="-"))
        write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down_annotated.txt", sep="-"))
    }
    
    #Under KONSOLE
    for i in Palpha_2_5h_vs_Pminus_2_5h Palpha_2_5h_vs_SA_2_5h Palpha_5h_vs_Pminus_5h Palpha_5h_vs_SA_5h Palpha_5h_vs_uninf_5h Pminus_5h_vs_uninf_5h SA_2_5h_vs_Pminus_2_5h SA_5h_vs_Pminus_5h SA_5h_vs_uninf_5h; do echo "~/Tools/csv2xls-0.4/csv_to_xls.py ${i}-all_annotated.txt ${i}-up_annotated.txt ${i}-down_annotated.txt -d$',' -o ${i}.xls;"; done
    
  7. Generate volcano-plots including highlighting the pre-defined genes

    # -- Identify the Ensembl IDs for the genes mentioned in the provided text (TNF-Alpha, IKBK/CHUK, IKBKG, MAP3K7, MAPK9, MAPK10, MAPK11, MAPK12, MAPK13, MAPK14) --
    using reliable genomic databases. Then, I’ll integrate these IDs into the provided R code to mark these specific genes in green on the volcano plot. I’ll ensure the code remains functional and compatible with your requirements, maintaining the non-compressed volcano plot structure from the previous response.
    Step 1: Finding Ensembl IDs for the Genes
    The genes mentioned are part of the TNF-Alpha pathway and related signaling pathways (e.g., NF-κB and MAPK cascades). Below, I list each gene with its corresponding Ensembl ID, sourced from the Ensembl database (GRCh38/hg38 assembly, accessed via web resources like Ensembl.org and GeneCards).
    
    TNF-Alpha (TNF):
    
    Official gene symbol: TNF
    Ensembl ID: ENSG00000232810
    Description: Tumor necrosis factor, a cytokine involved in inflammation and immune response.
    
    IKBK/CHUK:
    
    Official gene symbol: CHUK (also known as IKK-alpha, IKK1)
    Ensembl ID: ENSG00000213341
    Description: Component of inhibitor of nuclear factor kappa B kinase complex, part of the IKK complex in NF-κB signaling.
    
    IKBKG:
    
    Official gene symbol: IKBKG (also known as NEMO, IKK-gamma)
    Ensembl ID: ENSG00000269335
    Description: Regulatory subunit of the IKK complex, critical for NF-κB activation.
    
    MAP3K7:
    
    Official gene symbol: MAP3K7 (also known as TAK1)
    Ensembl ID: ENSG00000135341
    Description: Mitogen-activated protein kinase kinase kinase 7, involved in MAPK and NF-κB pathways.
    
    MAPK9:
    
    Official gene symbol: MAPK9 (also known as JNK2)
    Ensembl ID: ENSG00000050748
    Description: Mitogen-activated protein kinase 9, part of the JNK signaling pathway.
    
    MAPK10:
    
    Official gene symbol: MAPK10 (also known as JNK3)
    Ensembl ID: ENSG00000109339
    Description: Mitogen-activated protein kinase 10, involved in JNK signaling.
    
    MAPK11:
    
    Official gene symbol: MAPK11 (also known as p38-beta)
    Ensembl ID: ENSG00000185386
    Description: Mitogen-activated protein kinase 11, part of the p38 MAPK pathway.
    
    MAPK12:
    
    Official gene symbol: MAPK12 (also known as p38-gamma)
    Ensembl ID: ENSG00000188130
    Description: Mitogen-activated protein kinase 12, involved in p38 MAPK signaling.
    
    MAPK13:
    
    Official gene symbol: MAPK13 (also known as p38-delta)
    Ensembl ID: ENSG00000156711
    Description: Mitogen-activated protein kinase 13, part of the p38 MAPK pathway.
    
    MAPK14:
    
    Official gene symbol: MAPK14 (also known as p38-alpha)
    Ensembl ID: ENSG00000112062
    Description: Mitogen-activated protein kinase 14, a key component of the p38 MAPK pathway.
    
    Note on IKBK: The text mentions “IKBK/CHUK,” which likely refers to CHUK (IKK-alpha). There is no gene explicitly named “IKBK” in standard databases, but it may be a typo or shorthand for IKBKB (IKK-beta, Ensembl ID: ENSG00000104365). Since the text explicitly lists CHUK, I include only CHUK here, but I can add IKBKB (ENSG00000104365) if you confirm it’s needed.
    
    #grep "ENSG00000232810" Palpha_5h_vs_SA_5h-all_annotated.txt
    #grep "ENSG00000213341" Palpha_5h_vs_SA_5h-all_annotated.txt
    #grep "ENSG00000269335" Palpha_5h_vs_SA_5h-all_annotated.txt
    #grep "ENSG00000135341" Palpha_5h_vs_SA_5h-all_annotated.txt
    #grep "ENSG00000050748" Palpha_5h_vs_SA_5h-all_annotated.txt
    #grep "ENSG00000109339" Palpha_5h_vs_SA_5h-all_annotated.txt
    #grep "ENSG00000185386" Palpha_5h_vs_SA_5h-all_annotated.txt
    #grep "ENSG00000188130" Palpha_5h_vs_SA_5h-all_annotated.txt
    #grep "ENSG00000156711" Palpha_5h_vs_SA_5h-all_annotated.txt
    #grep "ENSG00000112062" Palpha_5h_vs_SA_5h-all_annotated.txt
    #"ENSG00000232810","TNF","protein_coding",7124,"6",31575565,31578336,1,"tumor necrosis factor [Source:HGNC Symbol;Acc:HGNC:11892]",17354.5202707282,-0.0316703649357798,1.51550966895126,-0.0208975010748007,0.983327420043257,0.999911624451988
    #"ENSG00000213341","CHUK","protein_coding",1147,"10",100188300,100229596,-1,"component of inhibitor of nuclear factor kappa B kinase complex [Source:HGNC Symbol;Acc:HGNC:1974]",250.048543715369,0.0180137792628405,0.29151899644714,0.0617928144730933,0.950727825773712,0.999911624451988
    #"ENSG00000269335","IKBKG","protein_coding",8517,"X",154541199,154565046,1,"inhibitor of nuclear factor kappa B kinase regulatory subunit gamma [Source:HGNC Symbol;Acc:HGNC:5961]",264.16016859718,-0.0596284805706239,0.317465418784091,-0.187826695578385,0.851012508150622,0.999911624451988
    #"ENSG00000135341","MAP3K7","protein_coding",6885,"6",90513573,90587072,-1,"mitogen-activated protein kinase kinase kinase 7 [Source:HGNC Symbol;Acc:HGNC:6859]",469.168500006136,0.102648288503779,0.230048819356802,0.446202196519746,0.655451196420353,0.999911624451988
    #"ENSG00000050748","MAPK9","protein_coding",5601,"5",180233143,180292099,-1,"mitogen-activated protein kinase 9 [Source:HGNC Symbol;Acc:HGNC:6886]",249.338619945305,0.0631383542055737,0.2271647559144,0.2779408009461,0.781057802736406,0.999911624451988
    #"ENSG00000109339","MAPK10","protein_coding",5602,"4",85990007,86594625,-1,"mitogen-activated protein kinase 10 [Source:HGNC Symbol;Acc:HGNC:6872]",3.75355645627412,-2.0901236921552,2.18774228429073,-0.955379300004171,0.339385918189734,0.999911624451988
    #"ENSG00000185386","MAPK11","protein_coding",5600,"22",50263713,50270767,-1,"mitogen-activated protein kinase 11 [Source:HGNC Symbol;Acc:HGNC:6873]",8.66154083806627,-0.057183741613102,0.905513595426625,-0.0631506162932433,0.949646568726793,0.999911624451988
    #"ENSG00000188130","MAPK12","protein_coding",6300,"22",50245450,50261716,-1,"mitogen-activated protein kinase 12 [Source:HGNC Symbol;Acc:HGNC:6874]",11.0165465721666,-0.068123411110505,0.771289101509761,-0.0883240940098294,0.929619089537679,0.999911624451988
    #"ENSG00000156711","MAPK13","protein_coding",5603,"6",36127809,36144524,1,"mitogen-activated protein kinase 13 [Source:HGNC Symbol;Acc:HGNC:6875]",2429.62022870584,0.0304563554360708,0.161662995832214,0.188394105152429,0.850567720370941,0.999911624451988
    #"ENSG00000112062","MAPK14","protein_coding",1432,"6",36027677,36111236,1,"mitogen-activated protein kinase 14 [Source:HGNC Symbol;Acc:HGNC:6876]",571.205891076513,0.166160923432922,0.215977780993788,0.769342673437788,0.44168991057234,0.999911624451988
    
    # 批量火山图绘制脚本
    
    # 加载库
    library(ggplot2)
    library(ggrepel)
    
    # 预定义基因
    predefined_genes <- c("TNF", "CHUK", "IKBKG", "MAP3K7", "MAPK9",
                          "MAPK10", "MAPK11", "MAPK12", "MAPK13", "MAPK14")
    
    # 输入文件列表
    files <- c(
      "./Palpha_2_5h_vs_Pminus_2_5h-all_annotated.txt",
      "./Palpha_2_5h_vs_SA_2_5h-all_annotated.txt",
      "./Palpha_5h_vs_Pminus_5h-all_annotated.txt",
      "./Palpha_5h_vs_SA_5h-all_annotated.txt",
      "./Palpha_5h_vs_uninf_5h-all_annotated.txt",
      "./Pminus_5h_vs_uninf_5h-all_annotated.txt",
      "./SA_2_5h_vs_Pminus_2_5h-all_annotated.txt",
      "./SA_5h_vs_Pminus_5h-all_annotated.txt",
      "./SA_5h_vs_uninf_5h-all_annotated.txt"
    )
    
    # 循环绘制
    for (f in files) {
    
      # 读取数据
      geness_res <- read.csv(file = f, sep = ",", row.names = 1)
    
      # 颜色分类
      geness_res$Color <- ifelse(geness_res$padj > 0.05 | abs(geness_res$log2FoldChange) < 2, "gray",
                                ifelse(geness_res$log2FoldChange > 0, "red", "blue"))
    
      # 特定基因绿色
      geness_res$Color[geness_res$external_gene_name %in% predefined_genes] <- "green"
    
      # 点大小
      geness_res$PointSize <- ifelse(geness_res$external_gene_name %in% predefined_genes, 5, 3)
    
      # 排序选前50上下调
      geness_res$invert_Padj <- (-log10(geness_res$padj)) * sign(geness_res$log2FoldChange)
      top_g <- unique(c(
        geness_res[order(geness_res$invert_Padj, decreasing = TRUE), 'external_gene_name'][1:50],
        geness_res[order(geness_res$invert_Padj, decreasing = FALSE), 'external_gene_name'][1:50]
      ))
    
      # y轴刻度
      y_breaks <- seq(0, ceiling(max(-log10(geness_res$padj), na.rm = TRUE)), by = 5)
      y_labels <- y_breaks
    
      # 计算 -log10(padj)
      geness_res$adjusted_pvalue <- -log10(geness_res$padj)
    
      # 输出文件名
      out_png <- paste0(sub("-all_annotated.txt", "", basename(f)), ".png")
    
      # 绘图
      png(out_png, width = 1000, height = 1000)
      p <- ggplot(geness_res, aes(x = log2FoldChange, y = adjusted_pvalue,
                                  color = Color, size = PointSize, label = external_gene_name)) +
        geom_vline(xintercept = c(2, -2), lty = "dashed", size = 1.5) +
        geom_hline(yintercept = -log10(0.05), lty = "dashed", size = 1.5) +
        geom_point(data = subset(geness_res, !external_gene_name %in% predefined_genes), size = 3) +
        geom_point(data = subset(geness_res, external_gene_name %in% predefined_genes), size = 5) +
        labs(x = "log2(Fold change)", y = "-log10(P-adj)", color = "Significance") +
        scale_color_identity() +
        scale_size_identity() +
        geom_text_repel(
          data = subset(geness_res,
                        (external_gene_name %in% top_g & padj < 0.05 & abs(log2FoldChange) >= 2) |
                        external_gene_name %in% predefined_genes),
          size = 7,
          point.padding = 0.3,
          color = "black",
          min.segment.length = 0.1,
          box.padding = 0.3,
          max.overlaps = 20
        ) +
        theme_bw(base_size = 24) +
        theme(legend.position = "bottom") +
        scale_y_continuous(breaks = y_breaks, labels = y_labels)
    
      print(p)
      dev.off()
    
      message("图已生成: ", out_png)
    }
    
    # -- Alternative R Code (Using Ensembl IDs) --
    
    geness_res <- read.csv(file = "Pminus_5h_vs_uninf_5h-all_annotated.txt", sep=",", row.names=1)
    #geness_res <- df_sorted
    # Color setting
    geness_res$Color <- ifelse(geness_res$padj > 0.05 | abs(geness_res$log2FoldChange) < 2, "gray",
                            ifelse(geness_res$log2FoldChange > 0, "red", "blue"))
    # Predefined genes colored in green
    predefined_genes <- c("")
    geness_res$Color[geness_res$external_gene_name %in% predefined_genes] <- "green"
    geness_res$invert_Padj <- (-log10(geness_res$padj)) * sign(geness_res$log2FoldChange)
    top_g <- unique(c(geness_res[order(geness_res$invert_Padj, decreasing = TRUE), 'external_gene_name'][1:100],
                    geness_res[order(geness_res$invert_Padj, decreasing = FALSE), 'external_gene_name'][1:100]))
    
    # Calculate breaks for the y-axis based on the data
    y_breaks <- seq(0, ceiling(max(-log10(geness_res$padj), na.rm = TRUE)), by = 5)
    y_labels <- y_breaks
    
    # Use original -log10(padj) values without compression
    geness_res$adjusted_pvalue <- -log10(geness_res$padj)
    
    # Create the plot
    png(paste(i, "png", sep="."), width=1000, height=1000)
    ggplot(geness_res, aes(x = log2FoldChange, y = adjusted_pvalue, color = Color, label = external_gene_name)) +
      geom_vline(xintercept = c(2, -2), lty = "dashed", size = 1.5) +
      geom_hline(yintercept = -log10(0.05), lty = "dashed", size = 1.5) +
      geom_point(size = 3) +
      labs(x = "log2(Fold change)", y = "-log10(P-adj)", color = "Significance") +
      scale_color_identity() +
      geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & padj < 0.05 & (abs(log2FoldChange) >= 2)),
                      size = 7,
                      point.padding = 0.15,
                      color = "black",
                      min.segment.length = .1,
                      box.padding = .2,
                      lwd = 2) +
      theme_bw(base_size = 24) +
      theme(legend.position = "bottom") +
      scale_y_continuous(breaks = y_breaks, labels = y_labels)
    dev.off()
    

like unlike

点赞本文的读者

还没有人对此文章表态


本文有评论

没有评论

看文章,发评论,不要沉默


© 2023 XGenes.com Impressum