Human RNA-seq processing for Data_Ben_RNAseq_2025

gene_x 0 like s 14 view s

Tags: processing, human, RNA-seq

http://xgenes.com/article/article-content/242/rna-seq-data-analysis-of-yersinia-on-grch38/

http://xgenes.com/article/article-content/209/rna-seq-skin-organoids-on-grch38-chrhsv1-final/

Create raw_data

mkdir raw_data; cd raw_data

ln -s ../20250805_AV243904_0023_A/1_Palpha_2_5h_418_3/1_Palpha_2_5h_418_3_R1.fastq.gz Palpha_2_5h_3_R1.fastq.gz
ln -s ../20250805_AV243904_0023_A/1_Palpha_2_5h_418_3/1_Palpha_2_5h_418_3_R2.fastq.gz Palpha_2_5h_3_R2.fastq.gz
ln -s ../20250805_AV243904_0023_A/1_Palpha_2_5h_418_4/1_Palpha_2_5h_418_4_R1.fastq.gz Palpha_2_5h_4_R1.fastq.gz
ln -s ../20250805_AV243904_0023_A/1_Palpha_2_5h_418_4/1_Palpha_2_5h_418_4_R2.fastq.gz Palpha_2_5h_4_R2.fastq.gz
ln -s ../20250805_AV243904_0023_A/2_SA_2_5h_418_3/2_SA_2_5h_418_3_R1.fastq.gz SA_2_5h_3_R1.fastq.gz
ln -s ../20250805_AV243904_0023_A/2_SA_2_5h_418_3/2_SA_2_5h_418_3_R2.fastq.gz SA_2_5h_3_R2.fastq.gz
ln -s ../20250805_AV243904_0023_A/2_SA_2_5h_418_4/2_SA_2_5h_418_4_R1.fastq.gz SA_2_5h_4_R1.fastq.gz
ln -s ../20250805_AV243904_0023_A/2_SA_2_5h_418_4/2_SA_2_5h_418_4_R2.fastq.gz SA_2_5h_4_R2.fastq.gz
ln -s ../20250805_AV243904_0023_A/3_Pminus_2_5h_418_3/3_Pminus_2_5h_418_3_R1.fastq.gz Pminus_2_5h_3_R1.fastq.gz
ln -s ../20250805_AV243904_0023_A/3_Pminus_2_5h_418_3/3_Pminus_2_5h_418_3_R2.fastq.gz Pminus_2_5h_3_R2.fastq.gz
ln -s ../20250805_AV243904_0023_A/3_Pminus_2_5h_418_4/3_Pminus_2_5h_418_4_R1.fastq.gz Pminus_2_5h_4_R1.fastq.gz
ln -s ../20250805_AV243904_0023_A/3_Pminus_2_5h_418_4/3_Pminus_2_5h_418_4_R2.fastq.gz Pminus_2_5h_4_R2.fastq.gz

ln -s ../20250805_AV243904_0023_A/4_Palpha_5h_418_3/4_Palpha_5h_418_3_R1.fastq.gz Palpha_5h_3_R1.fastq.gz
ln -s ../20250805_AV243904_0023_A/4_Palpha_5h_418_3/4_Palpha_5h_418_3_R2.fastq.gz Palpha_5h_3_R2.fastq.gz
ln -s ../20250805_AV243904_0023_A/4_Palpha_5h_418_4/4_Palpha_5h_418_4_R1.fastq.gz Palpha_5h_4_R1.fastq.gz
ln -s ../20250805_AV243904_0023_A/4_Palpha_5h_418_4/4_Palpha_5h_418_4_R2.fastq.gz Palpha_5h_4_R2.fastq.gz
ln -s ../20250805_AV243904_0023_A/5_SA_5h_418_3/5_SA_5h_418_3_R1.fastq.gz SA_5h_3_R1.fastq.gz
ln -s ../20250805_AV243904_0023_A/5_SA_5h_418_3/5_SA_5h_418_3_R2.fastq.gz SA_5h_3_R2.fastq.gz
ln -s ../20250805_AV243904_0023_A/5_SA_5h_418_4/5_SA_5h_418_4_R1.fastq.gz SA_5h_4_R1.fastq.gz
ln -s ../20250805_AV243904_0023_A/5_SA_5h_418_4/5_SA_5h_418_4_R2.fastq.gz SA_5h_4_R2.fastq.gz
ln -s ../20250805_AV243904_0023_A/6_Pminus_5h_418_3/6_Pminus_5h_418_3_R1.fastq.gz Pminus_5h_3_R1.fastq.gz
ln -s ../20250805_AV243904_0023_A/6_Pminus_5h_418_3/6_Pminus_5h_418_3_R2.fastq.gz Pminus_5h_3_R2.fastq.gz
ln -s ../20250805_AV243904_0023_A/6_Pminus_5h_418_4/6_Pminus_5h_418_4_R1.fastq.gz Pminus_5h_4_R1.fastq.gz
ln -s ../20250805_AV243904_0023_A/6_Pminus_5h_418_4/6_Pminus_5h_418_4_R2.fastq.gz Pminus_5h_4_R2.fastq.gz

ln -s ../20250805_AV243904_0023_A/7_uninf_5h_418_3/7_uninf_5h_418_3_R1.fastq.gz uninf_5h_3_R1.fastq.gz
ln -s ../20250805_AV243904_0023_A/7_uninf_5h_418_3/7_uninf_5h_418_3_R2.fastq.gz uninf_5h_3_R2.fastq.gz
ln -s ../20250805_AV243904_0023_A/7_uninf_5h_418_4/7_uninf_5h_418_4_R1.fastq.gz uninf_5h_4_R1.fastq.gz
ln -s ../20250805_AV243904_0023_A/7_uninf_5h_418_4/7_uninf_5h_418_4_R2.fastq.gz uninf_5h_4_R2.fastq.gz

mkdir trimmed trimmed_unpaired;
for sample_id in Palpha_2_5h_3 Palpha_2_5h_4 SA_2_5h_3 SA_2_5h_4 Pminus_2_5h_3 Pminus_2_5h_4 Palpha_5h_3 Palpha_5h_4 SA_5h_3 SA_5h_4 Pminus_5h_3 Pminus_5h_4 uninf_5h_3 uninf_5h_4; do
        java -jar /home/jhuang/Tools/Trimmomatic-0.36/trimmomatic-0.36.jar PE -threads 100 raw_data/${sample_id}_R1.fastq.gz raw_data/${sample_id}_R2.fastq.gz trimmed/${sample_id}_R1.fastq.gz trimmed_unpaired/${sample_id}_R1.fastq.gz trimmed/${sample_id}_R2.fastq.gz trimmed_unpaired/${sample_id}_R2.fastq.gz ILLUMINACLIP:/home/jhuang/Tools/Trimmomatic-0.36/adapters/TruSeq3-PE-2.fa:2:30:10:8:TRUE LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36 AVGQUAL:20; done 2> trimmomatic_pe.log;
done
mv trimmed/*.fastq.gz .

Create samplesheet.csv

sample,fastq_1,fastq_2,strandedness
Palpha_2_5h_3,Palpha_2_5h_3_R1.fastq.gz,Palpha_2_5h_3_R2.fastq.gz,auto
Palpha_2_5h_4,Palpha_2_5h_4_R1.fastq.gz,Palpha_2_5h_4_R2.fastq.gz,auto
SA_2_5h_3,SA_2_5h_3_R1.fastq.gz,SA_2_5h_3_R2.fastq.gz,auto
SA_2_5h_4,SA_2_5h_4_R1.fastq.gz,SA_2_5h_4_R2.fastq.gz,auto
...

Run nextflow

ln -s ~/Tools/nf-core-rnaseq-3.12.0/ rnaseq
#Adapt the genome path in ~/Tools/nf-core-rnaseq-3.12.0/conf/igenomes.config
#Note that the current path is "/mnt/nvme1n1p1/Homo_sapiens/Ensembl/GRCh38/..."

#---- SUCCESSFUL -----
(host_env) /usr/local/bin/nextflow run rnaseq/main.nf --input samplesheet.csv --outdir results    --genome GRCh38         -profile docker -resume  --max_cpus 60 --max_memory 512.GB --max_time 2400.h --save_reference --aligner star_salmon  --skip_deseq2_qc --skip_fastqc

Import data and construct DESeqDataSet object

# Import the required libraries
library("AnnotationDbi")
library("clusterProfiler")
library("ReactomePA")
library(gplots)
library(tximport)
library(DESeq2)
library("org.Hs.eg.db")
library(dplyr)
library(tidyverse)
setwd("~/DATA/Data_Ben_RNAseq_2025/results/star_salmon")
# Define paths to your Salmon output quantification files
files <- c("Palpha_2_5h_r3" = "./Palpha_2_5h_3/quant.sf",
        "Palpha_2_5h_r4" = "./Palpha_2_5h_4/quant.sf",
        "SA_2_5h_r3" = "./SA_2_5h_3/quant.sf",
        "SA_2_5h_r4" = "./SA_2_5h_4/quant.sf",
        "Pminus_2_5h_r3" = "./Pminus_2_5h_3/quant.sf",
        "Pminus_2_5h_r4" = "./Pminus_2_5h_4/quant.sf",
        "Palpha_5h_r3" = "./Palpha_5h_3/quant.sf",
        "Palpha_5h_r4" = "./Palpha_5h_4/quant.sf",
        "SA_5h_r3" = "./SA_5h_3/quant.sf",
        "SA_5h_r4" = "./SA_5h_4/quant.sf",
        "Pminus_5h_r3" = "./Pminus_5h_3/quant.sf",
        "Pminus_5h_r4" = "./Pminus_5h_4/quant.sf",
        "uninf_5h_r3" = "./uninf_5h_3/quant.sf",
        "uninf_5h_r4" = "./uninf_5h_4/quant.sf")
# Import the transcript abundance data with tximport
txi <- tximport(files, type = "salmon", txIn = TRUE, txOut = TRUE)
# Define the replicates and condition of the samples
replicate <- factor(c("r3", "r4", "r3", "r4", "r3", "r4", "r3", "r4", "r3", "r4", "r3", "r4", "r3", "r4"))
condition <- factor(c("Palpha_2_5h","Palpha_2_5h","SA_2_5h","SA_2_5h","Pminus_2_5h","Pminus_2_5h",    "Palpha_5h","Palpha_5h","SA_5h","SA_5h","Pminus_5h","Pminus_5h",    "uninf_5h","uninf_5h"))

# Define the colData for DESeq2
colData <- data.frame(condition=condition, replicate=replicate, row.names=names(files))
# -- transcript-level count data (x2) --
# Create DESeqDataSet object
dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition)
write.csv(counts(dds), file="transcript_counts.csv")

# -- gene-level count data (x2) --
# Read in the tx2gene map from salmon_tx2gene.tsv
tx2gene <- read.table("salmon_tx2gene.tsv", header=FALSE, stringsAsFactors=FALSE)
# Set the column names
colnames(tx2gene) <- c("transcript_id", "gene_id", "gene_name")
# Remove the gene_name column if not needed
tx2gene <- tx2gene[,1:2]
# Import and summarize the Salmon data with tximport
txi <- tximport(files, type = "salmon", tx2gene = tx2gene, txOut = FALSE)
# Continue with the DESeq2 workflow as before...
colData <- data.frame(condition=condition, replicate=replicate, row.names=names(files))
dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition)
#dds <- dds[rowSums(counts(dds) > 3) > 2, ]    #60605-->26543
write.csv(counts(dds, normalized=FALSE), file="gene_counts.csv")

d.raw <- read.csv("gene_counts.csv", header=TRUE, row.names=1)
dds <- DESeqDataSetFromMatrix(countData=d.raw, colData=colData, design=~condition+replicate)
#dds <- DESeqDataSetFromMatrix(countData=d.raw, colData=colData, design=~condition)
dim(counts(dds))
head(counts(dds), 10)
rld <- rlogTransformation(dds)

#We don't need to run DESeq(dds) before estimateSizeFactors(dds). In fact, the typical workflow in DESeq2 is the opposite: we usually run estimateSizeFactors(dds) (and other preprocessing functions) before running the main DESeq(dds) function.
#The estimateSizeFactors function is used to calculate size factors for normalization, which corrects for differences in library size (i.e., the number of read counts) between samples. This normalization step is crucial to ensure that differences in gene expression aren't merely due to differences in sequencing depth between samples.
#The DESeq function, on the other hand, performs the main differential expression analysis, comparing gene expression between different conditions or groups.
#So, the typical workflow is:
#  - Create the DESeqDataSet object.
#  - Use estimateSizeFactors to normalize for library size.
#  - (Optionally, estimate dispersion with estimateDispersions if not using the full DESeq function later.)
#  - Use DESeq for the differential expression analysis.
#  - However, it's worth noting that if you run the main DESeq function directly after creating the DESeqDataSet object, it will automatically perform the normalization (using estimateSizeFactors) and dispersion estimation steps for you. In that case, there's no need to run estimateSizeFactors separately before DESeq.

# draw simple pca and heatmap
library(gplots)
library("RColorBrewer")
mat <- assay(rld)
mm <- model.matrix(~condition, colData(rld))
mat <- limma::removeBatchEffect(mat, batch=rld$replicate, design=mm)
assay(rld) <- mat
# -- pca --
png("pca.png", 1200, 800)
plotPCA(rld, intgroup=c("condition"))
dev.off()
png("pca_rep.png", 1200, 800)
plotPCA(rld, intgroup=c("replicate"))
dev.off()
# -- heatmap --
png("heatmap.png", 1200, 800)
distsRL <- dist(t(assay(rld)))
mat <- as.matrix(distsRL)
hc <- hclust(distsRL)
hmcol <- colorRampPalette(brewer.pal(9,"GnBu"))(100)
heatmap.2(mat, Rowv=as.dendrogram(hc),symm=TRUE, trace="none",col = rev(hmcol), margin=c(13, 13))
dev.off()

(Optional, possibly has error) Size Factors

sizeFactors(dds)
#NULL
# Estimate size factors
dds <- estimateSizeFactors(dds)
# Estimate dispersions
dds <- estimateDispersions(dds)

Select the differentially expressed genes

#CONSOLE: mkdir degenes
setwd("/mnt/md1/DATA/Data_Ben_RNAseq_2025/results/star_salmon/degenes")

dds$condition <- relevel(dds$condition, "SA_2_5h")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("Palpha_2_5h_vs_SA_2_5h")

dds$condition <- relevel(dds$condition, "SA_5h")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("Palpha_5h_vs_SA_5h")

dds$condition <- relevel(dds$condition, "Pminus_2_5h")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("Palpha_2_5h_vs_Pminus_2_5h", "SA_2_5h_vs_Pminus_2_5h")

dds$condition <- relevel(dds$condition, "Pminus_5h")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("Palpha_5h_vs_Pminus_5h", "SA_5h_vs_Pminus_5h")

#dds$condition <- relevel(dds$condition, "uninf_2_5h")
#dds = DESeq(dds, betaPrior=FALSE)
#resultsNames(dds)
#clist <- c("Palpha_2_5h_vs_uninf_2_5h", "SA_2_5h_vs_uninf_2_5h", "Pminus_2_5h_vs_uninf_2_5h")

dds$condition <- relevel(dds$condition, "uninf_5h")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("Palpha_5h_vs_uninf_5h", "SA_5h_vs_uninf_5h", "Pminus_5h_vs_uninf_5h")

library(biomaRt)
listEnsembl()
listMarts()
#--> total 69, 27  GRCh38.p7 and 39  GRCm38.p4
ensembl <- useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl", version="104")
datasets <- listDatasets(ensembl)

for (i in clist) {
    i<-clist[1]
    contrast = paste("condition", i, sep="_")
    res = results(dds, name=contrast)
    res <- res[!is.na(res$log2FoldChange),]
    res_df <- as.data.frame(res)
    write.csv(as.data.frame(res_df[order(res_df$pvalue),]), file = paste(i, "all.txt", sep="-"))
    up <- subset(res_df, padj<=0.05 & log2FoldChange>=2)
    down <- subset(res_df, padj<=0.05 & log2FoldChange<=-2)
    write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
    write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
}

# -- Save annotated DEGs: annotate human genes 'geness_res', note that the virus genes in that is ignored in the process since they are not in the database --
for (i in clist) {
    #i<-clist[1]
    contrast = paste("condition", i, sep="_")
    res = results(dds, name=contrast)
    res <- res[!is.na(res$log2FoldChange),]
    #geness <- AnnotationDbi::select(edb86, keys = rownames(res), keytype = "GENEID", columns = c("ENTREZID","EXONID","GENEBIOTYPE","GENEID","GENENAME","PROTEINDOMAINSOURCE","PROTEINID","SEQNAME","SEQSTRAND","SYMBOL","TXBIOTYPE","TXID","TXNAME","UNIPROTID"))
    #geness <- AnnotationDbi::select(edb86, keys = rownames(res), keytype = "GENEID", columns = c("GENEID", "ENTREZID", "SYMBOL", "GENENAME","GENEBIOTYPE","TXBIOTYPE","SEQSTRAND","UNIPROTID"))
    # In the ENSEMBL-database, GENEID is ENSEMBL-ID.
    #geness <- AnnotationDbi::select(edb86, keys = rownames(res), keytype = "GENEID", columns = c("GENEID", "SYMBOL", "GENEBIOTYPE"))  #  "ENTREZID", "TXID","TXBIOTYPE","TXSEQSTART","TXSEQEND"
    #geness <- geness[!duplicated(geness$GENEID), ]
    #using getBM replacing AnnotationDbi::select
    #filters = 'ensembl_gene_id' means the records should always have a valid ensembl_gene_ids.
    geness <- getBM(attributes = c('ensembl_gene_id', 'external_gene_name', 'gene_biotype', 'entrezgene_id', 'chromosome_name', 'start_position', 'end_position', 'strand', 'description'),
        filters = 'ensembl_gene_id',
        values = rownames(res),
        mart = ensembl)
    geness_uniq <- distinct(geness, ensembl_gene_id, .keep_all= TRUE)
    #merge by column by common colunmn name, in the case "GENEID"
    res$ENSEMBL = rownames(res)
    identical(rownames(res), rownames(geness_uniq))
    res_df <- as.data.frame(res)
    geness_res <- merge(geness_uniq, res_df, by.x="ensembl_gene_id", by.y="ENSEMBL")
    dim(geness_res)
    rownames(geness_res) <- geness_res$ensembl_gene_id
    geness_res$ensembl_gene_id <- NULL

    df_sorted <- as.data.frame(geness_res[order(geness_res$padj),])
    write.csv(df_sorted, file = paste(i, "all_annotated.txt", sep="-"))
    up <- subset(df_sorted, padj<=0.05 & log2FoldChange>=2)
    down <- subset(df_sorted, padj<=0.05 & log2FoldChange<=-2)
    write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up_annotated.txt", sep="-"))
    write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down_annotated.txt", sep="-"))
}

#Under KONSOLE
for i in Palpha_2_5h_vs_Pminus_2_5h Palpha_2_5h_vs_SA_2_5h Palpha_5h_vs_Pminus_5h Palpha_5h_vs_SA_5h Palpha_5h_vs_uninf_5h Pminus_5h_vs_uninf_5h SA_2_5h_vs_Pminus_2_5h SA_5h_vs_Pminus_5h SA_5h_vs_uninf_5h; do echo "~/Tools/csv2xls-0.4/csv_to_xls.py ${i}-all_annotated.txt ${i}-up_annotated.txt ${i}-down_annotated.txt -d$',' -o ${i}.xls;"; done

Generate volcano-plots including highlighting the pre-defined genes

# -- Identify the Ensembl IDs for the genes mentioned in the provided text (TNF-Alpha, IKBK/CHUK, IKBKG, MAP3K7, MAPK9, MAPK10, MAPK11, MAPK12, MAPK13, MAPK14) --
using reliable genomic databases. Then, I’ll integrate these IDs into the provided R code to mark these specific genes in green on the volcano plot. I’ll ensure the code remains functional and compatible with your requirements, maintaining the non-compressed volcano plot structure from the previous response.
Step 1: Finding Ensembl IDs for the Genes
The genes mentioned are part of the TNF-Alpha pathway and related signaling pathways (e.g., NF-κB and MAPK cascades). Below, I list each gene with its corresponding Ensembl ID, sourced from the Ensembl database (GRCh38/hg38 assembly, accessed via web resources like Ensembl.org and GeneCards).

TNF-Alpha (TNF):

Official gene symbol: TNF
Ensembl ID: ENSG00000232810
Description: Tumor necrosis factor, a cytokine involved in inflammation and immune response.

IKBK/CHUK:

Official gene symbol: CHUK (also known as IKK-alpha, IKK1)
Ensembl ID: ENSG00000213341
Description: Component of inhibitor of nuclear factor kappa B kinase complex, part of the IKK complex in NF-κB signaling.

IKBKG:

Official gene symbol: IKBKG (also known as NEMO, IKK-gamma)
Ensembl ID: ENSG00000269335
Description: Regulatory subunit of the IKK complex, critical for NF-κB activation.

MAP3K7:

Official gene symbol: MAP3K7 (also known as TAK1)
Ensembl ID: ENSG00000135341
Description: Mitogen-activated protein kinase kinase kinase 7, involved in MAPK and NF-κB pathways.

MAPK9:

Official gene symbol: MAPK9 (also known as JNK2)
Ensembl ID: ENSG00000050748
Description: Mitogen-activated protein kinase 9, part of the JNK signaling pathway.

MAPK10:

Official gene symbol: MAPK10 (also known as JNK3)
Ensembl ID: ENSG00000109339
Description: Mitogen-activated protein kinase 10, involved in JNK signaling.

MAPK11:

Official gene symbol: MAPK11 (also known as p38-beta)
Ensembl ID: ENSG00000185386
Description: Mitogen-activated protein kinase 11, part of the p38 MAPK pathway.

MAPK12:

Official gene symbol: MAPK12 (also known as p38-gamma)
Ensembl ID: ENSG00000188130
Description: Mitogen-activated protein kinase 12, involved in p38 MAPK signaling.

MAPK13:

Official gene symbol: MAPK13 (also known as p38-delta)
Ensembl ID: ENSG00000156711
Description: Mitogen-activated protein kinase 13, part of the p38 MAPK pathway.

MAPK14:

Official gene symbol: MAPK14 (also known as p38-alpha)
Ensembl ID: ENSG00000112062
Description: Mitogen-activated protein kinase 14, a key component of the p38 MAPK pathway.

Note on IKBK: The text mentions “IKBK/CHUK,” which likely refers to CHUK (IKK-alpha). There is no gene explicitly named “IKBK” in standard databases, but it may be a typo or shorthand for IKBKB (IKK-beta, Ensembl ID: ENSG00000104365). Since the text explicitly lists CHUK, I include only CHUK here, but I can add IKBKB (ENSG00000104365) if you confirm it’s needed.

#grep "ENSG00000232810" Palpha_5h_vs_SA_5h-all_annotated.txt
#grep "ENSG00000213341" Palpha_5h_vs_SA_5h-all_annotated.txt
#grep "ENSG00000269335" Palpha_5h_vs_SA_5h-all_annotated.txt
#grep "ENSG00000135341" Palpha_5h_vs_SA_5h-all_annotated.txt
#grep "ENSG00000050748" Palpha_5h_vs_SA_5h-all_annotated.txt
#grep "ENSG00000109339" Palpha_5h_vs_SA_5h-all_annotated.txt
#grep "ENSG00000185386" Palpha_5h_vs_SA_5h-all_annotated.txt
#grep "ENSG00000188130" Palpha_5h_vs_SA_5h-all_annotated.txt
#grep "ENSG00000156711" Palpha_5h_vs_SA_5h-all_annotated.txt
#grep "ENSG00000112062" Palpha_5h_vs_SA_5h-all_annotated.txt
#"ENSG00000232810","TNF","protein_coding",7124,"6",31575565,31578336,1,"tumor necrosis factor [Source:HGNC Symbol;Acc:HGNC:11892]",17354.5202707282,-0.0316703649357798,1.51550966895126,-0.0208975010748007,0.983327420043257,0.999911624451988
#"ENSG00000213341","CHUK","protein_coding",1147,"10",100188300,100229596,-1,"component of inhibitor of nuclear factor kappa B kinase complex [Source:HGNC Symbol;Acc:HGNC:1974]",250.048543715369,0.0180137792628405,0.29151899644714,0.0617928144730933,0.950727825773712,0.999911624451988
#"ENSG00000269335","IKBKG","protein_coding",8517,"X",154541199,154565046,1,"inhibitor of nuclear factor kappa B kinase regulatory subunit gamma [Source:HGNC Symbol;Acc:HGNC:5961]",264.16016859718,-0.0596284805706239,0.317465418784091,-0.187826695578385,0.851012508150622,0.999911624451988
#"ENSG00000135341","MAP3K7","protein_coding",6885,"6",90513573,90587072,-1,"mitogen-activated protein kinase kinase kinase 7 [Source:HGNC Symbol;Acc:HGNC:6859]",469.168500006136,0.102648288503779,0.230048819356802,0.446202196519746,0.655451196420353,0.999911624451988
#"ENSG00000050748","MAPK9","protein_coding",5601,"5",180233143,180292099,-1,"mitogen-activated protein kinase 9 [Source:HGNC Symbol;Acc:HGNC:6886]",249.338619945305,0.0631383542055737,0.2271647559144,0.2779408009461,0.781057802736406,0.999911624451988
#"ENSG00000109339","MAPK10","protein_coding",5602,"4",85990007,86594625,-1,"mitogen-activated protein kinase 10 [Source:HGNC Symbol;Acc:HGNC:6872]",3.75355645627412,-2.0901236921552,2.18774228429073,-0.955379300004171,0.339385918189734,0.999911624451988
#"ENSG00000185386","MAPK11","protein_coding",5600,"22",50263713,50270767,-1,"mitogen-activated protein kinase 11 [Source:HGNC Symbol;Acc:HGNC:6873]",8.66154083806627,-0.057183741613102,0.905513595426625,-0.0631506162932433,0.949646568726793,0.999911624451988
#"ENSG00000188130","MAPK12","protein_coding",6300,"22",50245450,50261716,-1,"mitogen-activated protein kinase 12 [Source:HGNC Symbol;Acc:HGNC:6874]",11.0165465721666,-0.068123411110505,0.771289101509761,-0.0883240940098294,0.929619089537679,0.999911624451988
#"ENSG00000156711","MAPK13","protein_coding",5603,"6",36127809,36144524,1,"mitogen-activated protein kinase 13 [Source:HGNC Symbol;Acc:HGNC:6875]",2429.62022870584,0.0304563554360708,0.161662995832214,0.188394105152429,0.850567720370941,0.999911624451988
#"ENSG00000112062","MAPK14","protein_coding",1432,"6",36027677,36111236,1,"mitogen-activated protein kinase 14 [Source:HGNC Symbol;Acc:HGNC:6876]",571.205891076513,0.166160923432922,0.215977780993788,0.769342673437788,0.44168991057234,0.999911624451988

# 批量火山图绘制脚本

# 加载库
library(ggplot2)
library(ggrepel)

# 预定义基因
predefined_genes <- c("TNF", "CHUK", "IKBKG", "MAP3K7", "MAPK9",
                      "MAPK10", "MAPK11", "MAPK12", "MAPK13", "MAPK14")

# 输入文件列表
files <- c(
  "./Palpha_2_5h_vs_Pminus_2_5h-all_annotated.txt",
  "./Palpha_2_5h_vs_SA_2_5h-all_annotated.txt",
  "./Palpha_5h_vs_Pminus_5h-all_annotated.txt",
  "./Palpha_5h_vs_SA_5h-all_annotated.txt",
  "./Palpha_5h_vs_uninf_5h-all_annotated.txt",
  "./Pminus_5h_vs_uninf_5h-all_annotated.txt",
  "./SA_2_5h_vs_Pminus_2_5h-all_annotated.txt",
  "./SA_5h_vs_Pminus_5h-all_annotated.txt",
  "./SA_5h_vs_uninf_5h-all_annotated.txt"
)

# 循环绘制
for (f in files) {

  # 读取数据
  geness_res <- read.csv(file = f, sep = ",", row.names = 1)

  # 颜色分类
  geness_res$Color <- ifelse(geness_res$padj > 0.05 | abs(geness_res$log2FoldChange) < 2, "gray",
                            ifelse(geness_res$log2FoldChange > 0, "red", "blue"))

  # 特定基因绿色
  geness_res$Color[geness_res$external_gene_name %in% predefined_genes] <- "green"

  # 点大小
  geness_res$PointSize <- ifelse(geness_res$external_gene_name %in% predefined_genes, 5, 3)

  # 排序选前50上下调
  geness_res$invert_Padj <- (-log10(geness_res$padj)) * sign(geness_res$log2FoldChange)
  top_g <- unique(c(
    geness_res[order(geness_res$invert_Padj, decreasing = TRUE), 'external_gene_name'][1:50],
    geness_res[order(geness_res$invert_Padj, decreasing = FALSE), 'external_gene_name'][1:50]
  ))

  # y轴刻度
  y_breaks <- seq(0, ceiling(max(-log10(geness_res$padj), na.rm = TRUE)), by = 5)
  y_labels <- y_breaks

  # 计算 -log10(padj)
  geness_res$adjusted_pvalue <- -log10(geness_res$padj)

  # 输出文件名
  out_png <- paste0(sub("-all_annotated.txt", "", basename(f)), ".png")

  # 绘图
  png(out_png, width = 1000, height = 1000)
  p <- ggplot(geness_res, aes(x = log2FoldChange, y = adjusted_pvalue,
                              color = Color, size = PointSize, label = external_gene_name)) +
    geom_vline(xintercept = c(2, -2), lty = "dashed", size = 1.5) +
    geom_hline(yintercept = -log10(0.05), lty = "dashed", size = 1.5) +
    geom_point(data = subset(geness_res, !external_gene_name %in% predefined_genes), size = 3) +
    geom_point(data = subset(geness_res, external_gene_name %in% predefined_genes), size = 5) +
    labs(x = "log2(Fold change)", y = "-log10(P-adj)", color = "Significance") +
    scale_color_identity() +
    scale_size_identity() +
    geom_text_repel(
      data = subset(geness_res,
                    (external_gene_name %in% top_g & padj < 0.05 & abs(log2FoldChange) >= 2) |
                    external_gene_name %in% predefined_genes),
      size = 7,
      point.padding = 0.3,
      color = "black",
      min.segment.length = 0.1,
      box.padding = 0.3,
      max.overlaps = 20
    ) +
    theme_bw(base_size = 24) +
    theme(legend.position = "bottom") +
    scale_y_continuous(breaks = y_breaks, labels = y_labels)

  print(p)
  dev.off()

  message("图已生成: ", out_png)
}

# -- Alternative R Code (Using Ensembl IDs) --

geness_res <- read.csv(file = "Pminus_5h_vs_uninf_5h-all_annotated.txt", sep=",", row.names=1)
#geness_res <- df_sorted
# Color setting
geness_res$Color <- ifelse(geness_res$padj > 0.05 | abs(geness_res$log2FoldChange) < 2, "gray",
                        ifelse(geness_res$log2FoldChange > 0, "red", "blue"))
# Predefined genes colored in green
predefined_genes <- c("")
geness_res$Color[geness_res$external_gene_name %in% predefined_genes] <- "green"
geness_res$invert_Padj <- (-log10(geness_res$padj)) * sign(geness_res$log2FoldChange)
top_g <- unique(c(geness_res[order(geness_res$invert_Padj, decreasing = TRUE), 'external_gene_name'][1:100],
                geness_res[order(geness_res$invert_Padj, decreasing = FALSE), 'external_gene_name'][1:100]))

# Calculate breaks for the y-axis based on the data
y_breaks <- seq(0, ceiling(max(-log10(geness_res$padj), na.rm = TRUE)), by = 5)
y_labels <- y_breaks

# Use original -log10(padj) values without compression
geness_res$adjusted_pvalue <- -log10(geness_res$padj)

# Create the plot
png(paste(i, "png", sep="."), width=1000, height=1000)
ggplot(geness_res, aes(x = log2FoldChange, y = adjusted_pvalue, color = Color, label = external_gene_name)) +
  geom_vline(xintercept = c(2, -2), lty = "dashed", size = 1.5) +
  geom_hline(yintercept = -log10(0.05), lty = "dashed", size = 1.5) +
  geom_point(size = 3) +
  labs(x = "log2(Fold change)", y = "-log10(P-adj)", color = "Significance") +
  scale_color_identity() +
  geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & padj < 0.05 & (abs(log2FoldChange) >= 2)),
                  size = 7,
                  point.padding = 0.15,
                  color = "black",
                  min.segment.length = .1,
                  box.padding = .2,
                  lwd = 2) +
  theme_bw(base_size = 24) +
  theme(legend.position = "bottom") +
  scale_y_continuous(breaks = y_breaks, labels = y_labels)
dev.off()

like unlike

点赞本文的读者

还没有人对此文章表态

本文有评论

没有评论

Human RNA-seq processing for Data_Ben_RNAseq_2025

本文有评论

看文章，发评论，不要沉默

最受欢迎文章

最新文章

最多评论文章

推荐相似文章