Workflow for RNA-Binding Protein Enrichment and RNA Type Distribution Analysis (Ute’s Project)

gene_x 0 like s 158 view s

Tags: pipeline

Download RBP motifs (PWM) from ATtRACT_DB: ATtRACT_db.txt and pwm.txt; Convert to MEME format (if needed) (Under ~/DATA/Data_Ute/RBP_enrichments/)

# Both generate_named_meme.py and generate_attract_human_meme.py generate the attract_human.meme inkl. Gene_name
# Note that "grep + generate_named_meme.py" = generate_attract_human_meme.py
#grep "Homo_sapiens" ATtRACT_db.txt > attract_human.txt  #3256
#python ~/Scripts/generate_named_meme.py pwm.txt attract_human.txt  #OUTPUT: attract_human_named.meme
#python ~/Scripts/generate_attract_human_meme.py pwm.txt ATtRACT_db.txt #OUTPUT: attract_human.meme and it is the same to attract_human_named.meme, both are named motifs, in total 1583 MOTIFs, for example "MOTIF 904_HNRNPH2", only for human-reading, not for pipeline

#cut -f12 attract_human.txt | sort | uniq > valid_ids.txt
python ~/Scripts/convert_attract_pwm_to_meme.py  #Input is "pwm.txt" #OUTPUT: "attract_human.meme", in total 1583 MOTIFs, for example "MOTIF 904"

Download GENCODE (Under ~/REFs/)

#Visit and Download: GENCODE FTP site https://www.gencodegenes.org/human/
    * GTF annotation file (e.g., gencode.v48.annotation.gtf.gz)
    * Corresponding genome FASTA (e.g., GRCh38.primary_assembly.genome.fa.gz)
wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_48/gencode.v48.annotation.gtf.gz
wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_48/GRCh38.primary_assembly.genome.fa.gz
gunzip gencode.v48.annotation.gtf.gz
gunzip GRCh38.primary_assembly.genome.fa.gz

Get 3UTR.fasta, 5UTR.fasta, CDS.fasta and transcripts.fasta based on GENCODE-files (Under ~/DATA/Data_Ute/RBP_enrichments/)

grep ",\"protein_coding\"," ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/MKL-1_wt.EV_vs_parental-up.txt > MKL-1_wt.EV_vs_parental-up_protein_coding.txt
grep ",\"protein_coding\"," ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/MKL-1_wt.EV_vs_parental-down.txt > MKL-1_wt.EV_vs_parental-down_protein_coding.txt
grep ",\"protein_coding\"," ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/MKL-1_wt.EV_vs_parental-all.txt > MKL-1_wt.EV_vs_parental-all_protein_coding.txt
grep ",\"protein_coding\"," ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/WaGa_wt.EV_vs_parental-up.txt > WaGa_wt.EV_vs_parental-up_protein_coding.txt
grep ",\"protein_coding\"," ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/WaGa_wt.EV_vs_parental-down.txt > WaGa_wt.EV_vs_parental-down_protein_coding.txt
grep ",\"protein_coding\"," ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/WaGa_wt.EV_vs_parental-all.txt > WaGa_wt.EV_vs_parental-all_protein_coding.txt

#Usage: python extract_transcript_parts.py <gene_list.txt> <gencode.gtf> <genome.fa> <out_prefix>, the script generates <out_prefix>.[transcripts|CDS|5UTR|3UTR].fasta files.
python ~/Scripts/extract_transcript_parts.py MKL-1_wt.EV_vs_parental-down_protein_coding.txt ~/REFs/gencode.v48.annotation.gtf ~/REFs/GRCh38.primary_assembly.genome.fa MKL-1_down  #112
python ~/Scripts/extract_transcript_parts.py MKL-1_wt.EV_vs_parental-up_protein_coding.txt ~/REFs/gencode.v48.annotation.gtf ~/REFs/GRCh38.primary_assembly.genome.fa MKL-1_up  #5988
python ~/Scripts/extract_transcript_parts.py MKL-1_wt.EV_vs_parental-all_protein_coding.txt ~/REFs/gencode.v48.annotation.gtf ~/REFs/GRCh38.primary_assembly.genome.fa MKL-1_background  #19239
python ~/Scripts/extract_transcript_parts.py WaGa_wt.EV_vs_parental-down_protein_coding.txt ~/REFs/gencode.v48.annotation.gtf ~/REFs/GRCh38.primary_assembly.genome.fa WaGa_down  #93
python ~/Scripts/extract_transcript_parts.py WaGa_wt.EV_vs_parental-up_protein_coding.txt ~/REFs/gencode.v48.annotation.gtf ~/REFs/GRCh38.primary_assembly.genome.fa WaGa_up  #6538
python ~/Scripts/extract_transcript_parts.py WaGa_wt.EV_vs_parental-all_protein_coding.txt ~/REFs/gencode.v48.annotation.gtf ~/REFs/GRCh38.primary_assembly.genome.fa WaGa_background  #19239

python ~/Scripts/filter_short_fasta.py MKL-1_up.3UTR.fasta MKL-1_up.filtered.3UTR.fasta
python ~/Scripts/filter_short_fasta.py MKL-1_down.3UTR.fasta MKL-1_down.filtered.3UTR.fasta
python ~/Scripts/filter_short_fasta.py MKL-1_background.3UTR.fasta MKL-1_background.filtered.3UTR.fasta
# 检查背景文件中有多少序列：
#grep -c "^>" MKL-1_background.filtered.3UTR.fasta
#68890
# 检查背景 FIMO 命中的总序列数：
#cut -f3 fimo_background_MKL-1_background/fimo.tsv | sort | uniq | wc -l
#67841
python ~/Scripts/filter_short_fasta.py WaGa_background.3UTR.fasta WaGa_background.filtered.3UTR.fasta
python ~/Scripts/filter_short_fasta.py WaGa_up.3UTR.fasta WaGa_up.filtered.3UTR.fasta
python ~/Scripts/filter_short_fasta.py WaGa_down.3UTR.fasta WaGa_down.filtered.3UTR.fasta

FIMO for motif scan on 3UTR sequences

# Generate fimo_foreground_MKL-1_down/fimo.tsv
mamba activate meme_env
fimo --thresh 1e-4 --oc fimo_foreground_MKL-1_down attract_human_named.meme MKL-1_down.3UTR.fasta
fimo --thresh 1e-4 --oc fimo_foreground_MKL-1_up attract_human.meme MKL-1_up.3UTR.fasta
fimo --thresh 1e-4 --oc fimo_background_MKL-1_background attract_human.meme MKL-1_background.3UTR.fasta
fimo --thresh 1e-4 --oc fimo_foreground_WaGa_down attract_human.meme WaGa_down.3UTR.fasta
fimo --thresh 1e-4 --oc fimo_foreground_WaGa_up attract_human.meme WaGa_up.3UTR.fasta
fimo --thresh 1e-4 --oc fimo_background_WaGa_background attract_human.meme WaGa_background.3UTR.fasta

#Keep only one match per gene (based on Ensembl Gene ID like ENSG00000134871) for each RBP motif, even if multiple transcripts have hits.
#python ~/Scripts/filter_fimo_best_per_gene.py --input fimo_foreground/fimo.tsv --output fimo_foreground/fimo.filtered.tsv

python ~/Scripts/convert_gtf_to_Gene_annotation_TSV_file.py ~/REFs/Homo_sapiens.GRCh38.114.gtf Homo_sapiens.GRCh38.gene_annotation.tsv

python ~/Scripts/filter_fimo_best_per_gene_annotated.py \
--input fimo_foreground_MKL-1_down/fimo.tsv \
--annot Homo_sapiens.GRCh38.gene_annotation.tsv \
--output_filtered fimo_foreground_MKL-1_down/fimo.filtered.tsv \
--output_annotated fimo_foreground_MKL-1_down/fimo.filtered.annotated.tsv
#21559
python ~/Scripts/filter_fimo_best_per_gene_annotated.py \
--input fimo_foreground_MKL-1_up/fimo.tsv \
--annot Homo_sapiens.GRCh38.gene_annotation.tsv \
--output_filtered fimo_foreground_MKL-1_up/fimo.filtered.tsv \
--output_annotated fimo_foreground_MKL-1_up/fimo.filtered.annotated.tsv
#(736661 rows)
python ~/Scripts/filter_fimo_best_per_gene_annotated.py \
--input fimo_background_MKL-1_background/fimo.tsv \
--annot Homo_sapiens.GRCh38.gene_annotation.tsv \
--output_filtered fimo_background_MKL-1_background/fimo.filtered.tsv \
--output_annotated fimo_background_MKL-1_background/fimo.filtered.annotated.tsv
#(1869075 rows)
python ~/Scripts/filter_fimo_best_per_gene_annotated.py \
--input fimo_foreground_WaGa_down/fimo.tsv \
--annot Homo_sapiens.GRCh38.gene_annotation.tsv \
--output_filtered fimo_foreground_WaGa_down/fimo.filtered.tsv \
--output_annotated fimo_foreground_WaGa_down/fimo.filtered.annotated.tsv
#(20364 rows)
python ~/Scripts/filter_fimo_best_per_gene_annotated.py \
--input fimo_foreground_WaGa_up/fimo.tsv \
--annot Homo_sapiens.GRCh38.gene_annotation.tsv \
--output_filtered fimo_foreground_WaGa_up/fimo.filtered.tsv \
--output_annotated fimo_foreground_WaGa_up/fimo.filtered.annotated.tsv
#(805634 rows)
python ~/Scripts/filter_fimo_best_per_gene_annotated.py \
--input fimo_background_WaGa_background/fimo.tsv \
--annot Homo_sapiens.GRCh38.gene_annotation.tsv \
--output_filtered fimo_background_WaGa_background/fimo.filtered.tsv \
--output_annotated fimo_background_WaGa_background/fimo.filtered.annotated.tsv
#(1811615 rows)

python ~/Scripts/run_enrichment.py \
    --attract ATtRACT_db.txt \
    --fimo_fg fimo_foreground_MKL-1_up/fimo.filtered.tsv \
    --fimo_bg fimo_background_MKL-1_background/fimo.filtered.tsv \
    --output rbp_enrichment_MKL-1_up.csv \
    --strategy inclusive
python ~/Scripts/run_enrichment.py \
    --attract ATtRACT_db.txt \
    --fimo_fg fimo_foreground_MKL-1_down/fimo.filtered.tsv \
    --fimo_bg fimo_background_MKL-1_background/fimo.filtered.tsv \
    --output rbp_enrichment_MKL-1_down.csv
python ~/Scripts/run_enrichment.py \
    --attract ATtRACT_db.txt \
    --fimo_fg fimo_foreground_WaGa_up/fimo.filtered.tsv \
    --fimo_bg fimo_background_WaGa_background/fimo.filtered.tsv \
    --output rbp_enrichment_WaGa_up.csv
python ~/Scripts/run_enrichment.py \
    --attract ATtRACT_db.txt \
    --fimo_fg fimo_foreground_WaGa_down/fimo.filtered.tsv \
    --fimo_bg fimo_background_WaGa_background/fimo.filtered.tsv \
    --output rbp_enrichment_WaGa_down.csv

#Column Meaning in rbp_enrichment_*.[csv|xlsx]
#a  Number of unique foreground UTRs hit by the RBP
#b  Number of unique background UTRs hit by the RBP
#c  Total number of foreground UTRs
#d  Total number of background UTRs (⬅️ this is the value you're asking about)
#p_value, fdr   From Fisher's exact test on enrichment

# The following pdf-files are not used and sent!
python ~/Scripts/plot_volcano.py --csv rbp_enrichment_MKL-1_up.csv --output MKL-1_volcano_up.pdf --title "Upregulated MKL-1"
python ~/Scripts/plot_rbp_heatmap.py \
--csvs rbp_enrichment_MKL-1_up.csv rbp_enrichment_MKL-1_down.csv \
--labels Upregulated Downregulated \
--output MKL-1_rbp_enrichment_heatmap.pdf

#-- Get all genes the number 1621 refers to --
#AGO2,1621,5050,5732,12987,1.0,1.0   #MKL-1_up
#motif_ids are 414 and 399
grep "^414" fimo.filtered.annotated.tsv > AGO2.txt
grep "^399" fimo.filtered.annotated.tsv >> AGO2.txt
cut -d$'\t' -f11 AGO2.txt | sort -u > AGO2_uniq.txt
wc -l AGO2_uniq.txt
#1621 AGO2_uniq.txt

miRNAs motif analysis using ATtRACT + FIMO

    #* Extract their sequences
    #* Generate a background set
    #* Run RBP enrichment (e.g., with RBPmap or FIMO)
    #* Get p-adjusted enrichment stats (e.g., Fisher + BH)

    #Input_1. DE results (differential expression file from smallRNA-seq)
        #Input: up- and down-, all-regulated files
        #~/DATA/Data_Ute/Data_Ute_smallRNA_7/summaries_exo7/miRNAs/untreated_vs_parental_cells-up.txt  #66
        #~/DATA/Data_Ute/Data_Ute_smallRNA_7/summaries_exo7/miRNAs/untreated_vs_parental_cells-down.txt  #38
        #~/DATA/Data_Ute/Data_Ute_smallRNA_7/summaries_exo7/miRNAs/untreated_vs_parental_cells-all.txt  #1304
        #Format: 1st column = miRNA ID (e.g., hsa-miR-21-5p), optionally with other stats.

        cp ~/DATA/Data_Ute/Data_Ute_smallRNA_7/summaries_exo7/miRNAs/untreated_vs_parental_cells-*.txt .
        #"hsa-miR-3180|hsa-miR-3180-3p"
        #>hsa-miR-3180 MIMAT0018178 Homo sapiens miR-3180
        #UGGGGCGGAGCUUCCGGAG
        #>hsa-miR-3180-3p MIMAT0015058 Homo sapiens miR-3180-3p
        #UGGGGCGGAGCUUCCGGAGGCC

    #Input_2. Reference FASTA (Reference sequences from miRBase or GENCODE)
        #From miRBase: https://mirbase.org/download/  https://mirbase.org/download/CURRENT/
        ##miRBase_v21
        #mature.fa.gz → contains mature miRNA sequences
        #hairpin.fa.gz → for pre-miRNAs
        mv ../RBP_enrichments_OLD_DEL/mature_v21.fa .

    # -- Extract Sequences + Background Set --

    #Inputs:
    #    * up_miRNA.txt and down_miRNA.txt: DE results (first column = miRNA name, e.g., hsa-miR-21-5p)
    #    * mature.fa or hairpin.fa from miRBase

    #Outputs:
    #    * mirna_up.fa
    #    * mirna_down.fa
    #    * mirna_background.fa

    #Use all remaining miRNAs as background:
    python ~/Scripts/prepare_miRNA_sets.py untreated_vs_parental_cells-up.txt untreated_vs_parental_cells-down.txt mature_v21.fa mirna --full-background
    mv mirna_background.fa mirna_full-background.fa

    #Use random subset background. Note that the generated background has the number of maxsize(up, down), in the case is up (84 records):
    #python ~/Scripts/prepare_miRNA_sets.py untreated_vs_parental_cells-up.txt untreated_vs_parental_cells-down.txt mature_v21.fa mirna
    # grep ">" mature_v21.fa | wc -l  #35828
    # grep ">" mirna_full-background.fa | wc -l  #35710-->35723
    # grep ">" mirna_up.fa | wc -l  #84
    # grep ">" mirna_down.fa | wc -l  #34
    # grep ">" mirna_background.fa | wc -l  #84-->67
    # #35,710 + 84 + 34 = 35,828

    fimo --thresh 1e-4 --oc fimo_mirna_down attract_human.meme mirna_down.fa
    fimo --thresh 1e-4 --oc fimo_mirna_up attract_human.meme mirna_up.fa
    fimo --thresh 1e-4 --oc fimo_mirna_full-background attract_human.meme mirna_full-background.fa
    #fimo --thresh 1e-4 --oc fimo_mirna_background attract_human.meme mirna_background.fa

    python ~/Scripts/filter_fimo_best_per_gene_annotated.py \
    --input fimo_mirna_down/fimo.tsv \
    --annot Homo_sapiens.GRCh38.gene_annotation.tsv \
    --output_filtered fimo_mirna_down/fimo.filtered.tsv \
    --output_annotated fimo_mirna_down/fimo.filtered.annotated.tsv  #21
    python ~/Scripts/filter_fimo_best_per_gene_annotated.py \
    --input fimo_mirna_up/fimo.tsv \
    --annot Homo_sapiens.GRCh38.gene_annotation.tsv \
    --output_filtered fimo_mirna_up/fimo.filtered.tsv \
    --output_annotated fimo_mirna_up/fimo.filtered.annotated.tsv  #48
    python ~/Scripts/filter_fimo_best_per_gene_annotated.py \
    --input fimo_mirna_full-background/fimo.tsv \
    --annot Homo_sapiens.GRCh38.gene_annotation.tsv \
    --output_filtered fimo_mirna_full-background/fimo.filtered.tsv \
    --output_annotated fimo_mirna_full-background/fimo.filtered.annotated.tsv  #896
    #python ~/Scripts/filter_fimo_best_per_gene_annotated.py \
    #--input fimo_mirna_background/fimo.tsv \
    #--annot Homo_sapiens.GRCh38.gene_annotation.tsv \
    #--output_filtered fimo_mirna_background/fimo.filtered.tsv \
    #--output_annotated fimo_mirna_background/fimo.filtered.annotated.tsv  #57

    python ~/Scripts/run_enrichment_miRNAs.py \
        --attract ATtRACT_db.txt \
        --fimo_fg fimo_mirna_up/fimo.filtered.tsv \
        --fimo_bg fimo_mirna_full-background/fimo.filtered.tsv \
        --output rbp_enrichment_mirna_up.csv \
        --strategy inclusive
    python ~/Scripts/run_enrichment_miRNAs.py \
        --attract ATtRACT_db.txt \
        --fimo_fg fimo_mirna_down/fimo.filtered.tsv \
        --fimo_bg fimo_mirna_full-background/fimo.filtered.tsv \
        --output rbp_enrichment_mirna_down.csv \
        --strategy inclusive
    #python ~/Scripts/run_enrichment_miRNAs.py \
    #    --attract ATtRACT_db.txt \
    #    --fimo_fg fimo_mirna_up/fimo.filtered.tsv \
    #    --fimo_bg fimo_mirna_background/fimo.filtered.tsv \
    #    --output rbp_enrichment_mirna_up_on_subset-background.csv \
    #    --strategy inclusive
    #python ~/Scripts/run_enrichment_miRNAs.py \
    #    --attract ATtRACT_db.txt \
    #    --fimo_fg fimo_mirna_down/fimo.filtered.tsv \
    #    --fimo_bg fimo_mirna_background/fimo.filtered.tsv \
    #    --output rbp_enrichment_mirna_down_on_subset-background.csv \
    #    --strategy inclusive

    #FXR2   1 (hsa-miR-92b-5p)  1   1   118 0.0168067226890756  0.365546218487395
    #ORB2   1 (hsa-miR-4748)    1   1   118 0.0168067226890756  0.365546218487395

    #-- Get all genes the number 1621 refers to --
    grep "^FXR2" ATtRACT_db.txt
    #motif_ids is M020_0.6
    grep "^M020_0.6" fimo_mirna_up/fimo.filtered.annotated.tsv > FXR2.txt
    grep "^M020_0.6" fimo_mirna_up/fimo.filtered.annotated.tsv
    #cut -d$'\t' -f11 AGO2.txt | sort -u > AGO2_uniq.txt
    #wc -l AGO2_uniq.txt (1621 records)

    grep "^ORB2" ATtRACT_db.txt
    grep "^M120_0.6" fimo_mirna_up/fimo.filtered.annotated.tsv

Reports for RBP enrichment results

Please find attached the results of the RNA-binding protein (RBP) enrichment analysis using FIMO and the ATtRACT motif database, along with a brief description of the procedures used for both the 3′ UTR-based analysis (RNA-seq) and the miRNA-based analysis (small RNA-seq).

1. RBP Motif Enrichment from RNA-seq (3′ UTRs)

We focused on 3′ UTRs, as they are key regulatory regions for RBPs. Sequences shorter than 16 nucleotides were excluded. Using FIMO (from the MEME suite) with motifs from the ATtRACT database, we scanned both foreground and background 3′ UTR sets to identify motif occurrences.

Foreground: Differentially expressed transcripts (e.g., MKL-1 up/down, WaGa up/down)
Background: All non-differentially expressed transcripts

Analysis: Fisher’s exact test was used to assess motif enrichment; p-values were adjusted using the Benjamini–Hochberg method.

Output files (RNA-seq):

    * rbp_enrichment_MKL-1_down.xlsx / .png
    * rbp_enrichment_MKL-1_up.xlsx / .png
    * rbp_enrichment_WaGa_down.xlsx / .png
    * rbp_enrichment_WaGa_up.xlsx / .png

2. RBP Motif Enrichment from Small RNA-seq (miRNAs)

This analysis focused on differentially expressed miRNAs, using either mature miRNA sequences from miRBase. We scanned for RBP binding motifs within these sequences using FIMO and assessed motif enrichment relative to background sets.

Foreground: DE miRNAs (up/down) from small RNA-seq comparisons
Background: All other miRNAs from miRBase

Analysis: FIMO was used with --thresh 1e-4, followed by annotation and filtering. Enrichment was assessed using Fisher’s test + BH correction.

Output files (miRNAs):

    * rbp_enrichment_mirna_down.xlsx
    * rbp_enrichment_mirna_up.xlsx

How to Interpret the Numbers
Each row in the result tables represents one RBP and its enrichment statistics:

a: foreground genes/sequences with the motif
b: background genes/sequences with the motif
c: total number of foreground genes/sequences
d: total number of background genes/sequences

These values are used to compute p-values and FDRs.

For example, in rbp_enrichment_MKL-1_up.xlsx, AGO2 has a = 1621, meaning FIMO detected AGO2 motifs in 1,621 genes in the MKL-1 upregulated set. These genes are listed in AGO2_uniq.txt.

Similarly, for the miRNA analysis (e.g., rbp_enrichment_mirna_up.xlsx and rbp_enrichment_mirna_down.xlsx), the numbers represent counts of unique miRNAs with at least one significant motif hit. As examples, I calculated the detailed membership for FXR2 and ORB2.

diff RBP_enrichments/rbp_enrichment_MKL-1_up.csv  RBP_enrichments_OLD_DEL/rbp_enrichment_MKL-1_up.csv
#diff RBP_enrichments/rbp_enrichment_MKL-1_up.xlsx RBP_enrichments_OLD_DEL/rbp_enrichment_MKL-1_up.xlsx
#diff RBP_enrichments/rbp_enrichment_MKL-1_up.png   RBP_enrichments_OLD_DEL/rbp_enrichment_MKL-1_up.png
diff RBP_enrichments/rbp_enrichment_MKL-1_down.csv RBP_enrichments_OLD_DEL/rbp_enrichment_MKL-1_down.csv
#diff RBP_enrichments/rbp_enrichment_MKL-1_down.xlsx RBP_enrichments_OLD_DEL/rbp_enrichment_MKL-1_down.xlsx
#diff RBP_enrichments/rbp_enrichment_MKL-1_down.png RBP_enrichments_OLD_DEL/rbp_enrichment_MKL-1_down.png
diff RBP_enrichments/rbp_enrichment_WaGa_up.csv RBP_enrichments_OLD_DEL/rbp_enrichment_WaGa_up.csv
#diff RBP_enrichments/rbp_enrichment_WaGa_up.xlsx RBP_enrichments_OLD_DEL/rbp_enrichment_WaGa_up.xlsx
#diff RBP_enrichments/rbp_enrichment_WaGa_up.png RBP_enrichments_OLD_DEL/rbp_enrichment_WaGa_up.png
diff RBP_enrichments/rbp_enrichment_WaGa_down.csv RBP_enrichments_OLD_DEL/rbp_enrichment_WaGa_down.csv
#diff RBP_enrichments/rbp_enrichment_WaGa_down.xlsx RBP_enrichments_OLD_DEL/rbp_enrichment_WaGa_down.xlsx
#diff RBP_enrichments/rbp_enrichment_WaGa_down.png RBP_enrichments_OLD_DEL/rbp_enrichment_WaGa_down.png
okular RBP_enrichments/MKL-1_volcano_up.pdf
okular RBP_enrichments_OLD_DEL/MKL-1_volcano_up.pdf
okular RBP_enrichments/MKL-1_rbp_enrichment_heatmap.pdf
okular RBP_enrichments_OLD_DEL/MKL-1_rbp_enrichment_heatmap.pdf

diff RBP_enrichments/rbp_enrichment_mirna_up.csv RBP_enrichments_OLD_DEL/rbp_enrichment_mirna_up_on_full-background.csv
#diff RBP_enrichments/rbp_enrichment_mirna_up.xlsx RBP_enrichments_OLD_DEL/rbp_enrichment_mirna_up.xlsx
#diff RBP_enrichments/rbp_enrichment_mirna_up.png RBP_enrichments_OLD_DEL/rbp_enrichment_mirna_up_on_full-background.png
diff RBP_enrichments/rbp_enrichment_mirna_down.csv RBP_enrichments_OLD_DEL/rbp_enrichment_mirna_down_on_full-background.csv
#diff RBP_enrichments/rbp_enrichment_mirna_down.xlsx RBP_enrichments_OLD_DEL/rbp_enrichment_mirna_down.xlsx
#diff RBP_enrichments/rbp_enrichment_mirna_down.png RBP_enrichments_OLD_DEL/rbp_enrichment_mirna_down_on_full-background.png

Generate the sequence logos for the enriched RBP motives

import os
import pandas as pd
import matplotlib.pyplot as plt
import logomaker
from pathlib import Path
import re

# --------------
# Config
# --------------
motif_table_path = "ATtRACT_db.txt"
pwm_file_path = "pwm.txt"
input_files = [
    "rbp_enrichment_MKL-1_up.csv",
    "rbp_enrichment_MKL-1_down.csv",
    "rbp_enrichment_WaGa_up.csv",
    "rbp_enrichment_WaGa_down.csv",
    "rbp_enrichment_mirna_up.csv",
    "rbp_enrichment_mirna_down.csv"
]
output_dir = "sequence_logos"
os.makedirs(output_dir, exist_ok=True)

# --------------
# Helper Functions
# --------------
def load_pwm_file(pwm_path):
    pwm_dict = {}
    current_id = None
    pwm_matrix = []
    with open(pwm_path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            if line.startswith('>'):
                if current_id and pwm_matrix:
                    pwm_dict[current_id] = pwm_matrix
                parts = line[1:].split()
                current_id = parts[0]
                pwm_matrix = []
            else:
                row = [float(x) for x in line.split()]
                if len(row) == 4:
                    pwm_matrix.append(row)
    if current_id and pwm_matrix:
        pwm_dict[current_id] = pwm_matrix
    return pwm_dict

def sanitize_filename(text):
    return re.sub(r'[^\w\-_.]', '_', text)

def plot_logo(pwm_df, title, output_path):
    fig, ax = plt.subplots(figsize=(len(pwm_df)*0.5, 2))
    logo = logomaker.Logo(pwm_df, ax=ax, color_scheme='classic')
    logo.style_spines(visible=False)
    logo.style_spines(spines=['left', 'bottom'], visible=True)
    ax.set_ylabel("Information")
    ax.set_title(title)
    fig.tight_layout()
    fig.savefig(output_path, bbox_inches='tight')
    plt.close(fig)

# --------------
# Load Motif Table and Filter per your strategy
# --------------
motif_table = pd.read_csv(motif_table_path, sep='\t')

# Filter rows where Score ends with '**'
motif_table = motif_table[motif_table['Score'].str.endswith('**')].copy()

# Remove '**' and convert Score to float
motif_table['Score'] = motif_table['Score'].str.replace(r'\*\*$', '', regex=True).astype(float)

# Function to keep Homo sapiens rows if exist, otherwise all
def keep_human_if_exists(group):
    human_rows = group[group['Organism'] == 'Homo_sapiens']
    return human_rows if not human_rows.empty else group

motif_table = motif_table.groupby('Gene_name', group_keys=False).apply(keep_human_if_exists)

# If multiple remain per RBP, pick one randomly (seeded for reproducibility)
motif_table = motif_table.groupby('Gene_name').apply(lambda g: g.sample(1, random_state=42)).reset_index(drop=True)

# Build motif map for quick lookup
motif_map = motif_table[['Gene_name', 'Matrix_id']].drop_duplicates()

# Load PWM dictionary
pwm_dict = load_pwm_file(pwm_file_path)

# --------------
# Process Each Enrichment File
# --------------
for file in input_files:
    print(f"\n📂 Processing {file}")
    df = pd.read_csv(file)

    if 'fdr' not in df.columns:
        print(f"⚠️ Skipping {file}: no 'fdr' column")
        continue

    sig_df = df[df['fdr'] <= 0.05].copy()

    for _, row in sig_df.iterrows():
        rbp_name = row['RBP']
        matches = motif_map[motif_map['Gene_name'] == rbp_name]

        if matches.empty:
            print(f"  ⚠️ No motif entry for RBP: {rbp_name}")
            continue

        # Should be exactly one row per RBP now
        motif_row = matches.iloc[0]
        matrix_id = motif_row['Matrix_id']

        if matrix_id not in pwm_dict:
            print(f"  ⚠️ PWM not found for matrix ID {matrix_id} (RBP: {rbp_name})")
            continue

        pwm = pwm_dict[matrix_id]
        pwm_df = pd.DataFrame(pwm, columns=list("ACGT"))

        title = f"{rbp_name} ({matrix_id})"
        safe_name = sanitize_filename(f"{Path(file).stem}_{rbp_name}_{matrix_id}.png")
        out_path = os.path.join(output_dir, safe_name)

        plot_logo(pwm_df, title, out_path)

print("\n✅ Sequence logo generation complete.")

Plot pie-chart for RNA-seq results

mamba activate plot-numpy1
(plot-numpy1) jhuang@WS-2290C:/media/jhuang/Elements1/Data_Ute$ python rna_type_piecharts.py /mnt/nvme1n1p1/Homo_sapiens/Ensembl/GRCh38/Annotation/Genes/genes.gtf Data_RNA-Seq_MKL-1+WaGa/merged_gene_counts_MKL-1_human.txt Data_RNA-Seq_MKL-1+WaGa/merged_gene_counts_WaGa_human.txt

Code of rna_type_piecharts.py

import sys
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np

def parse_gtf(gtf_file):
    biotype_dict = {}
    with open(gtf_file) as f:
        for line in f:
            if line.startswith('#'):
                continue
            fields = line.strip().split('\t')
            if fields[2] != 'gene':
                continue
            attr_field = fields[8]
            attrs = {}
            for attr in attr_field.split(';'):
                attr = attr.strip()
                if attr == '':
                    continue
                key, val = attr.split(' ', 1)
                attrs[key] = val.strip('"')
            gene_id = attrs.get('gene_id')
            gene_biotype = attrs.get('gene_biotype') or attrs.get('gene_type')  # some GTFs use gene_type
            if gene_id and gene_biotype:
                biotype_dict[gene_id] = gene_biotype
    return biotype_dict

def load_counts(mkl_file, waga_file):
    df_mkl = pd.read_csv(mkl_file, sep='\t')
    df_waga = pd.read_csv(waga_file, sep='\t')
    # Check for Geneid/gene_name match
    if not (df_mkl['Geneid'].equals(df_waga['Geneid']) and df_mkl['gene_name'].equals(df_waga['gene_name'])):
        mismatch = df_mkl.loc[
            (df_mkl['Geneid'] != df_waga['Geneid']) | (df_mkl['gene_name'] != df_waga['gene_name']),
            ['Geneid', 'gene_name']
        ]
        print("⚠️ Mismatched rows found between Geneid/gene_name columns:")
        print(mismatch)
        raise ValueError("Mismatch in Geneid/gene_name columns between MKL-1 and WaGa count files.")

    # Drop duplicated Geneid/gene_name from WaGa counts before merge
    df_waga_samples = df_waga.drop(columns=['Geneid', 'gene_name'])

    # Merge horizontally
    df = pd.concat([df_mkl, df_waga_samples], axis=1)
    return df, df_mkl, df_waga

def map_biotypes(df, biotype_dict):
    # Map gene biotypes by Geneid; if not found, assign 'unknown'
    df['gene_biotype'] = df['Geneid'].map(biotype_dict).fillna('unknown')
    return df

def save_raw_data_for_pie(df, output_excel):
    # Sum counts by biotype
    # Sum all sample columns (exclude Geneid, gene_name, gene_biotype)
    sample_cols = [c for c in df.columns if c not in ['Geneid', 'gene_name', 'gene_biotype']]
    df['total_counts'] = df[sample_cols].sum(axis=1)

    # Aggregate counts per biotype
    pie_data = df.groupby('gene_biotype')['total_counts'].sum().reset_index()

    with pd.ExcelWriter(output_excel) as writer:
        df.to_excel(writer, sheet_name='All_gene_counts', index=False)
        pie_data.to_excel(writer, sheet_name='Pie_data', index=False)

    return pie_data

def plot_pie(pie_data, output_png):
    pie_data = pie_data[pie_data['total_counts'] > 0].sort_values('total_counts', ascending=False)

    N = 10  # Top N biotypes
    if len(pie_data) > N:
        top_data = pie_data.nlargest(N, 'total_counts')
        others = pd.DataFrame([{
            'gene_biotype': 'Other',
            'total_counts': pie_data['total_counts'].sum() - top_data['total_counts'].sum()
        }])
        pie_data = pd.concat([top_data, others], ignore_index=True)

    sizes = pie_data['total_counts']
    labels = pie_data['gene_biotype']

    # Use updated color map syntax
    cmap = plt.colormaps['Pastel1']
    colors = [cmap(i % cmap.N) for i in range(len(labels))]

    fig, ax = plt.subplots(figsize=(10, 10))

    # Draw pie chart without labels/autopct
    wedges, _ = ax.pie(
        sizes,
        startangle=90,
        colors=colors,
        radius=1.2
    )

    total = sum(sizes)

    for i, wedge in enumerate(wedges):
        angle = (wedge.theta2 + wedge.theta1) / 2.
        x = np.cos(np.deg2rad(angle))
        y = np.sin(np.deg2rad(angle))

        pct = sizes.iloc[i] / total * 100
        if pct < 1:
            continue  # Skip small slices

        ha = 'left' if x > 0 else 'right'
        ax.annotate(
            f"{labels.iloc[i]} ({pct:.1f}%)",
            xy=(x, y),
            xytext=(1.4 * x, 1.4 * y),
            ha=ha, va='center',
            fontsize=9,
            #arrowprops=dict(arrowstyle='-', color='gray')
            arrowprops=dict(arrowstyle='-', connectionstyle='angle3,angleA=0,angleB=90', color='gray')
        )

    # Build detailed legend: biotype (count, %)
    legend_labels = [
        f"{lbl} ({int(cnt):,}, {cnt / total:.1%})"
        for lbl, cnt in zip(labels, sizes)
    ]

    ax.legend(
        wedges,
        legend_labels,
        title="Gene Biotype",
        loc="center left",
        bbox_to_anchor=(1, 0.5),
        fontsize=9,
        title_fontsize=10
    )

    ax.set_title("", fontsize=14)
    ax.axis('equal')
    plt.tight_layout()
    plt.savefig(output_png, bbox_inches='tight')
    plt.close()

def main():
    if len(sys.argv) != 4:
        print("Usage: python rna_type_piecharts.py <genes.gtf> <MKL-1_counts.txt> <WaGa_counts.txt>")
        sys.exit(1)

    gtf_file = sys.argv[1]
    mkl_counts_file = sys.argv[2]
    waga_counts_file = sys.argv[3]

    print("🔍 Parsing gene biotypes from GTF...")
    biotype_dict = parse_gtf(gtf_file)

    print("📊 Loading and merging count matrices...")
    df, df_mkl, df_waga = load_counts(mkl_counts_file, waga_counts_file)

    print("🧬 Mapping gene biotypes...")
    df = map_biotypes(df, biotype_dict)
    df_mkl = map_biotypes(df_mkl, biotype_dict)
    df_waga = map_biotypes(df_waga, biotype_dict)

    print("💾 Saving raw data for pie charts to Excel...")
    pie_data = save_raw_data_for_pie(df, 'rna_biotype_pie_data.xlsx')
    pie_mkl_data = save_raw_data_for_pie(df_mkl, 'rna_biotype_pie_data_MKL-1.xlsx')
    pie_waga_data = save_raw_data_for_pie(df_waga, 'rna_biotype_pie_data_WaGa.xlsx')

    print("📈 Plotting pie chart...")
    plot_pie(pie_data, 'rna_biotype_pie_chart.png')
    plot_pie(pie_mkl_data, 'rna_biotype_pie_chart_MKL-1.png')
    plot_pie(pie_waga_data, 'rna_biotype_pie_chart_WaGa.png')

    print("✅ Done! Excel data saved to 'rna_biotype_pie*_data.xlsx' and pie chart saved to 'rna_biotype_pie*_chart.png'.")

if __name__ == "__main__":
    main()

like unlike

点赞本文的读者

还没有人对此文章表态

本文有评论

没有评论

Workflow for RNA-Binding Protein Enrichment and RNA Type Distribution Analysis (Ute’s Project)

本文有评论

看文章，发评论，不要沉默

最受欢迎文章

最新文章

最多评论文章

推荐相似文章