Workflow for RNA-Binding Protein Enrichment and RNA Type Distribution Analysis (Ute’s Project)

gene_x 0 like s 158 view s

Tags: pipeline

  1. Download RBP motifs (PWM) from ATtRACT_DB: ATtRACT_db.txt and pwm.txt; Convert to MEME format (if needed) (Under ~/DATA/Data_Ute/RBP_enrichments/)

    # Both generate_named_meme.py and generate_attract_human_meme.py generate the attract_human.meme inkl. Gene_name
    # Note that "grep + generate_named_meme.py" = generate_attract_human_meme.py
    #grep "Homo_sapiens" ATtRACT_db.txt > attract_human.txt  #3256
    #python ~/Scripts/generate_named_meme.py pwm.txt attract_human.txt  #OUTPUT: attract_human_named.meme
    #python ~/Scripts/generate_attract_human_meme.py pwm.txt ATtRACT_db.txt #OUTPUT: attract_human.meme and it is the same to attract_human_named.meme, both are named motifs, in total 1583 MOTIFs, for example "MOTIF 904_HNRNPH2", only for human-reading, not for pipeline
    
    #cut -f12 attract_human.txt | sort | uniq > valid_ids.txt
    python ~/Scripts/convert_attract_pwm_to_meme.py  #Input is "pwm.txt" #OUTPUT: "attract_human.meme", in total 1583 MOTIFs, for example "MOTIF 904"
    
  2. Download GENCODE (Under ~/REFs/)

    #Visit and Download: GENCODE FTP site https://www.gencodegenes.org/human/
        * GTF annotation file (e.g., gencode.v48.annotation.gtf.gz)
        * Corresponding genome FASTA (e.g., GRCh38.primary_assembly.genome.fa.gz)
    wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_48/gencode.v48.annotation.gtf.gz
    wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_48/GRCh38.primary_assembly.genome.fa.gz
    gunzip gencode.v48.annotation.gtf.gz
    gunzip GRCh38.primary_assembly.genome.fa.gz
    
  3. Get 3UTR.fasta, 5UTR.fasta, CDS.fasta and transcripts.fasta based on GENCODE-files (Under ~/DATA/Data_Ute/RBP_enrichments/)

    grep ",\"protein_coding\"," ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/MKL-1_wt.EV_vs_parental-up.txt > MKL-1_wt.EV_vs_parental-up_protein_coding.txt
    grep ",\"protein_coding\"," ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/MKL-1_wt.EV_vs_parental-down.txt > MKL-1_wt.EV_vs_parental-down_protein_coding.txt
    grep ",\"protein_coding\"," ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/MKL-1_wt.EV_vs_parental-all.txt > MKL-1_wt.EV_vs_parental-all_protein_coding.txt
    grep ",\"protein_coding\"," ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/WaGa_wt.EV_vs_parental-up.txt > WaGa_wt.EV_vs_parental-up_protein_coding.txt
    grep ",\"protein_coding\"," ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/WaGa_wt.EV_vs_parental-down.txt > WaGa_wt.EV_vs_parental-down_protein_coding.txt
    grep ",\"protein_coding\"," ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/WaGa_wt.EV_vs_parental-all.txt > WaGa_wt.EV_vs_parental-all_protein_coding.txt
    
    #Usage: python extract_transcript_parts.py <gene_list.txt> <gencode.gtf> <genome.fa> <out_prefix>, the script generates <out_prefix>.[transcripts|CDS|5UTR|3UTR].fasta files.
    python ~/Scripts/extract_transcript_parts.py MKL-1_wt.EV_vs_parental-down_protein_coding.txt ~/REFs/gencode.v48.annotation.gtf ~/REFs/GRCh38.primary_assembly.genome.fa MKL-1_down  #112
    python ~/Scripts/extract_transcript_parts.py MKL-1_wt.EV_vs_parental-up_protein_coding.txt ~/REFs/gencode.v48.annotation.gtf ~/REFs/GRCh38.primary_assembly.genome.fa MKL-1_up  #5988
    python ~/Scripts/extract_transcript_parts.py MKL-1_wt.EV_vs_parental-all_protein_coding.txt ~/REFs/gencode.v48.annotation.gtf ~/REFs/GRCh38.primary_assembly.genome.fa MKL-1_background  #19239
    python ~/Scripts/extract_transcript_parts.py WaGa_wt.EV_vs_parental-down_protein_coding.txt ~/REFs/gencode.v48.annotation.gtf ~/REFs/GRCh38.primary_assembly.genome.fa WaGa_down  #93
    python ~/Scripts/extract_transcript_parts.py WaGa_wt.EV_vs_parental-up_protein_coding.txt ~/REFs/gencode.v48.annotation.gtf ~/REFs/GRCh38.primary_assembly.genome.fa WaGa_up  #6538
    python ~/Scripts/extract_transcript_parts.py WaGa_wt.EV_vs_parental-all_protein_coding.txt ~/REFs/gencode.v48.annotation.gtf ~/REFs/GRCh38.primary_assembly.genome.fa WaGa_background  #19239
    
    python ~/Scripts/filter_short_fasta.py MKL-1_up.3UTR.fasta MKL-1_up.filtered.3UTR.fasta
    python ~/Scripts/filter_short_fasta.py MKL-1_down.3UTR.fasta MKL-1_down.filtered.3UTR.fasta
    python ~/Scripts/filter_short_fasta.py MKL-1_background.3UTR.fasta MKL-1_background.filtered.3UTR.fasta
    # 检查背景文件中有多少序列:
    #grep -c "^>" MKL-1_background.filtered.3UTR.fasta
    #68890
    # 检查背景 FIMO 命中的总序列数:
    #cut -f3 fimo_background_MKL-1_background/fimo.tsv | sort | uniq | wc -l
    #67841
    python ~/Scripts/filter_short_fasta.py WaGa_background.3UTR.fasta WaGa_background.filtered.3UTR.fasta
    python ~/Scripts/filter_short_fasta.py WaGa_up.3UTR.fasta WaGa_up.filtered.3UTR.fasta
    python ~/Scripts/filter_short_fasta.py WaGa_down.3UTR.fasta WaGa_down.filtered.3UTR.fasta
    
  4. FIMO for motif scan on 3UTR sequences

    # Generate fimo_foreground_MKL-1_down/fimo.tsv
    mamba activate meme_env
    fimo --thresh 1e-4 --oc fimo_foreground_MKL-1_down attract_human_named.meme MKL-1_down.3UTR.fasta
    fimo --thresh 1e-4 --oc fimo_foreground_MKL-1_up attract_human.meme MKL-1_up.3UTR.fasta
    fimo --thresh 1e-4 --oc fimo_background_MKL-1_background attract_human.meme MKL-1_background.3UTR.fasta
    fimo --thresh 1e-4 --oc fimo_foreground_WaGa_down attract_human.meme WaGa_down.3UTR.fasta
    fimo --thresh 1e-4 --oc fimo_foreground_WaGa_up attract_human.meme WaGa_up.3UTR.fasta
    fimo --thresh 1e-4 --oc fimo_background_WaGa_background attract_human.meme WaGa_background.3UTR.fasta
    
    #Keep only one match per gene (based on Ensembl Gene ID like ENSG00000134871) for each RBP motif, even if multiple transcripts have hits.
    #python ~/Scripts/filter_fimo_best_per_gene.py --input fimo_foreground/fimo.tsv --output fimo_foreground/fimo.filtered.tsv
    
    python ~/Scripts/convert_gtf_to_Gene_annotation_TSV_file.py ~/REFs/Homo_sapiens.GRCh38.114.gtf Homo_sapiens.GRCh38.gene_annotation.tsv
    
    python ~/Scripts/filter_fimo_best_per_gene_annotated.py \
    --input fimo_foreground_MKL-1_down/fimo.tsv \
    --annot Homo_sapiens.GRCh38.gene_annotation.tsv \
    --output_filtered fimo_foreground_MKL-1_down/fimo.filtered.tsv \
    --output_annotated fimo_foreground_MKL-1_down/fimo.filtered.annotated.tsv
    #21559
    python ~/Scripts/filter_fimo_best_per_gene_annotated.py \
    --input fimo_foreground_MKL-1_up/fimo.tsv \
    --annot Homo_sapiens.GRCh38.gene_annotation.tsv \
    --output_filtered fimo_foreground_MKL-1_up/fimo.filtered.tsv \
    --output_annotated fimo_foreground_MKL-1_up/fimo.filtered.annotated.tsv
    #(736661 rows)
    python ~/Scripts/filter_fimo_best_per_gene_annotated.py \
    --input fimo_background_MKL-1_background/fimo.tsv \
    --annot Homo_sapiens.GRCh38.gene_annotation.tsv \
    --output_filtered fimo_background_MKL-1_background/fimo.filtered.tsv \
    --output_annotated fimo_background_MKL-1_background/fimo.filtered.annotated.tsv
    #(1869075 rows)
    python ~/Scripts/filter_fimo_best_per_gene_annotated.py \
    --input fimo_foreground_WaGa_down/fimo.tsv \
    --annot Homo_sapiens.GRCh38.gene_annotation.tsv \
    --output_filtered fimo_foreground_WaGa_down/fimo.filtered.tsv \
    --output_annotated fimo_foreground_WaGa_down/fimo.filtered.annotated.tsv
    #(20364 rows)
    python ~/Scripts/filter_fimo_best_per_gene_annotated.py \
    --input fimo_foreground_WaGa_up/fimo.tsv \
    --annot Homo_sapiens.GRCh38.gene_annotation.tsv \
    --output_filtered fimo_foreground_WaGa_up/fimo.filtered.tsv \
    --output_annotated fimo_foreground_WaGa_up/fimo.filtered.annotated.tsv
    #(805634 rows)
    python ~/Scripts/filter_fimo_best_per_gene_annotated.py \
    --input fimo_background_WaGa_background/fimo.tsv \
    --annot Homo_sapiens.GRCh38.gene_annotation.tsv \
    --output_filtered fimo_background_WaGa_background/fimo.filtered.tsv \
    --output_annotated fimo_background_WaGa_background/fimo.filtered.annotated.tsv
    #(1811615 rows)
    
    python ~/Scripts/run_enrichment.py \
        --attract ATtRACT_db.txt \
        --fimo_fg fimo_foreground_MKL-1_up/fimo.filtered.tsv \
        --fimo_bg fimo_background_MKL-1_background/fimo.filtered.tsv \
        --output rbp_enrichment_MKL-1_up.csv \
        --strategy inclusive
    python ~/Scripts/run_enrichment.py \
        --attract ATtRACT_db.txt \
        --fimo_fg fimo_foreground_MKL-1_down/fimo.filtered.tsv \
        --fimo_bg fimo_background_MKL-1_background/fimo.filtered.tsv \
        --output rbp_enrichment_MKL-1_down.csv
    python ~/Scripts/run_enrichment.py \
        --attract ATtRACT_db.txt \
        --fimo_fg fimo_foreground_WaGa_up/fimo.filtered.tsv \
        --fimo_bg fimo_background_WaGa_background/fimo.filtered.tsv \
        --output rbp_enrichment_WaGa_up.csv
    python ~/Scripts/run_enrichment.py \
        --attract ATtRACT_db.txt \
        --fimo_fg fimo_foreground_WaGa_down/fimo.filtered.tsv \
        --fimo_bg fimo_background_WaGa_background/fimo.filtered.tsv \
        --output rbp_enrichment_WaGa_down.csv
    
    #Column Meaning in rbp_enrichment_*.[csv|xlsx]
    #a  Number of unique foreground UTRs hit by the RBP
    #b  Number of unique background UTRs hit by the RBP
    #c  Total number of foreground UTRs
    #d  Total number of background UTRs (⬅️ this is the value you're asking about)
    #p_value, fdr   From Fisher's exact test on enrichment
    
    # The following pdf-files are not used and sent!
    python ~/Scripts/plot_volcano.py --csv rbp_enrichment_MKL-1_up.csv --output MKL-1_volcano_up.pdf --title "Upregulated MKL-1"
    python ~/Scripts/plot_rbp_heatmap.py \
    --csvs rbp_enrichment_MKL-1_up.csv rbp_enrichment_MKL-1_down.csv \
    --labels Upregulated Downregulated \
    --output MKL-1_rbp_enrichment_heatmap.pdf
    
    #-- Get all genes the number 1621 refers to --
    #AGO2,1621,5050,5732,12987,1.0,1.0   #MKL-1_up
    #motif_ids are 414 and 399
    grep "^414" fimo.filtered.annotated.tsv > AGO2.txt
    grep "^399" fimo.filtered.annotated.tsv >> AGO2.txt
    cut -d$'\t' -f11 AGO2.txt | sort -u > AGO2_uniq.txt
    wc -l AGO2_uniq.txt
    #1621 AGO2_uniq.txt
    
  5. miRNAs motif analysis using ATtRACT + FIMO

        #* Extract their sequences
        #* Generate a background set
        #* Run RBP enrichment (e.g., with RBPmap or FIMO)
        #* Get p-adjusted enrichment stats (e.g., Fisher + BH)
    
        #Input_1. DE results (differential expression file from smallRNA-seq)
            #Input: up- and down-, all-regulated files
            #~/DATA/Data_Ute/Data_Ute_smallRNA_7/summaries_exo7/miRNAs/untreated_vs_parental_cells-up.txt  #66
            #~/DATA/Data_Ute/Data_Ute_smallRNA_7/summaries_exo7/miRNAs/untreated_vs_parental_cells-down.txt  #38
            #~/DATA/Data_Ute/Data_Ute_smallRNA_7/summaries_exo7/miRNAs/untreated_vs_parental_cells-all.txt  #1304
            #Format: 1st column = miRNA ID (e.g., hsa-miR-21-5p), optionally with other stats.
    
            cp ~/DATA/Data_Ute/Data_Ute_smallRNA_7/summaries_exo7/miRNAs/untreated_vs_parental_cells-*.txt .
            #"hsa-miR-3180|hsa-miR-3180-3p"
            #>hsa-miR-3180 MIMAT0018178 Homo sapiens miR-3180
            #UGGGGCGGAGCUUCCGGAG
            #>hsa-miR-3180-3p MIMAT0015058 Homo sapiens miR-3180-3p
            #UGGGGCGGAGCUUCCGGAGGCC
    
        #Input_2. Reference FASTA (Reference sequences from miRBase or GENCODE)
            #From miRBase: https://mirbase.org/download/  https://mirbase.org/download/CURRENT/
            ##miRBase_v21
            #mature.fa.gz → contains mature miRNA sequences
            #hairpin.fa.gz → for pre-miRNAs
            mv ../RBP_enrichments_OLD_DEL/mature_v21.fa .
    
        # -- Extract Sequences + Background Set --
    
        #Inputs:
        #    * up_miRNA.txt and down_miRNA.txt: DE results (first column = miRNA name, e.g., hsa-miR-21-5p)
        #    * mature.fa or hairpin.fa from miRBase
    
        #Outputs:
        #    * mirna_up.fa
        #    * mirna_down.fa
        #    * mirna_background.fa
    
        #Use all remaining miRNAs as background:
        python ~/Scripts/prepare_miRNA_sets.py untreated_vs_parental_cells-up.txt untreated_vs_parental_cells-down.txt mature_v21.fa mirna --full-background
        mv mirna_background.fa mirna_full-background.fa
    
        #Use random subset background. Note that the generated background has the number of maxsize(up, down), in the case is up (84 records):
        #python ~/Scripts/prepare_miRNA_sets.py untreated_vs_parental_cells-up.txt untreated_vs_parental_cells-down.txt mature_v21.fa mirna
        # grep ">" mature_v21.fa | wc -l  #35828
        # grep ">" mirna_full-background.fa | wc -l  #35710-->35723
        # grep ">" mirna_up.fa | wc -l  #84
        # grep ">" mirna_down.fa | wc -l  #34
        # grep ">" mirna_background.fa | wc -l  #84-->67
        # #35,710 + 84 + 34 = 35,828
    
        fimo --thresh 1e-4 --oc fimo_mirna_down attract_human.meme mirna_down.fa
        fimo --thresh 1e-4 --oc fimo_mirna_up attract_human.meme mirna_up.fa
        fimo --thresh 1e-4 --oc fimo_mirna_full-background attract_human.meme mirna_full-background.fa
        #fimo --thresh 1e-4 --oc fimo_mirna_background attract_human.meme mirna_background.fa
    
        python ~/Scripts/filter_fimo_best_per_gene_annotated.py \
        --input fimo_mirna_down/fimo.tsv \
        --annot Homo_sapiens.GRCh38.gene_annotation.tsv \
        --output_filtered fimo_mirna_down/fimo.filtered.tsv \
        --output_annotated fimo_mirna_down/fimo.filtered.annotated.tsv  #21
        python ~/Scripts/filter_fimo_best_per_gene_annotated.py \
        --input fimo_mirna_up/fimo.tsv \
        --annot Homo_sapiens.GRCh38.gene_annotation.tsv \
        --output_filtered fimo_mirna_up/fimo.filtered.tsv \
        --output_annotated fimo_mirna_up/fimo.filtered.annotated.tsv  #48
        python ~/Scripts/filter_fimo_best_per_gene_annotated.py \
        --input fimo_mirna_full-background/fimo.tsv \
        --annot Homo_sapiens.GRCh38.gene_annotation.tsv \
        --output_filtered fimo_mirna_full-background/fimo.filtered.tsv \
        --output_annotated fimo_mirna_full-background/fimo.filtered.annotated.tsv  #896
        #python ~/Scripts/filter_fimo_best_per_gene_annotated.py \
        #--input fimo_mirna_background/fimo.tsv \
        #--annot Homo_sapiens.GRCh38.gene_annotation.tsv \
        #--output_filtered fimo_mirna_background/fimo.filtered.tsv \
        #--output_annotated fimo_mirna_background/fimo.filtered.annotated.tsv  #57
    
        python ~/Scripts/run_enrichment_miRNAs.py \
            --attract ATtRACT_db.txt \
            --fimo_fg fimo_mirna_up/fimo.filtered.tsv \
            --fimo_bg fimo_mirna_full-background/fimo.filtered.tsv \
            --output rbp_enrichment_mirna_up.csv \
            --strategy inclusive
        python ~/Scripts/run_enrichment_miRNAs.py \
            --attract ATtRACT_db.txt \
            --fimo_fg fimo_mirna_down/fimo.filtered.tsv \
            --fimo_bg fimo_mirna_full-background/fimo.filtered.tsv \
            --output rbp_enrichment_mirna_down.csv \
            --strategy inclusive
        #python ~/Scripts/run_enrichment_miRNAs.py \
        #    --attract ATtRACT_db.txt \
        #    --fimo_fg fimo_mirna_up/fimo.filtered.tsv \
        #    --fimo_bg fimo_mirna_background/fimo.filtered.tsv \
        #    --output rbp_enrichment_mirna_up_on_subset-background.csv \
        #    --strategy inclusive
        #python ~/Scripts/run_enrichment_miRNAs.py \
        #    --attract ATtRACT_db.txt \
        #    --fimo_fg fimo_mirna_down/fimo.filtered.tsv \
        #    --fimo_bg fimo_mirna_background/fimo.filtered.tsv \
        #    --output rbp_enrichment_mirna_down_on_subset-background.csv \
        #    --strategy inclusive
    
        #FXR2   1 (hsa-miR-92b-5p)  1   1   118 0.0168067226890756  0.365546218487395
        #ORB2   1 (hsa-miR-4748)    1   1   118 0.0168067226890756  0.365546218487395
    
        #-- Get all genes the number 1621 refers to --
        grep "^FXR2" ATtRACT_db.txt
        #motif_ids is M020_0.6
        grep "^M020_0.6" fimo_mirna_up/fimo.filtered.annotated.tsv > FXR2.txt
        grep "^M020_0.6" fimo_mirna_up/fimo.filtered.annotated.tsv
        #cut -d$'\t' -f11 AGO2.txt | sort -u > AGO2_uniq.txt
        #wc -l AGO2_uniq.txt (1621 records)
    
        grep "^ORB2" ATtRACT_db.txt
        grep "^M120_0.6" fimo_mirna_up/fimo.filtered.annotated.tsv
    
  6. Reports for RBP enrichment results

    Please find attached the results of the RNA-binding protein (RBP) enrichment analysis using FIMO and the ATtRACT motif database, along with a brief description of the procedures used for both the 3′ UTR-based analysis (RNA-seq) and the miRNA-based analysis (small RNA-seq).
    
    1. RBP Motif Enrichment from RNA-seq (3′ UTRs)
    
    We focused on 3′ UTRs, as they are key regulatory regions for RBPs. Sequences shorter than 16 nucleotides were excluded. Using FIMO (from the MEME suite) with motifs from the ATtRACT database, we scanned both foreground and background 3′ UTR sets to identify motif occurrences.
    
    Foreground: Differentially expressed transcripts (e.g., MKL-1 up/down, WaGa up/down)
    Background: All non-differentially expressed transcripts
    
    Analysis: Fisher’s exact test was used to assess motif enrichment; p-values were adjusted using the Benjamini–Hochberg method.
    
    Output files (RNA-seq):
    
        * rbp_enrichment_MKL-1_down.xlsx / .png
        * rbp_enrichment_MKL-1_up.xlsx / .png
        * rbp_enrichment_WaGa_down.xlsx / .png
        * rbp_enrichment_WaGa_up.xlsx / .png
    
    2. RBP Motif Enrichment from Small RNA-seq (miRNAs)
    
    This analysis focused on differentially expressed miRNAs, using either mature miRNA sequences from miRBase. We scanned for RBP binding motifs within these sequences using FIMO and assessed motif enrichment relative to background sets.
    
    Foreground: DE miRNAs (up/down) from small RNA-seq comparisons
    Background: All other miRNAs from miRBase
    
    Analysis: FIMO was used with --thresh 1e-4, followed by annotation and filtering. Enrichment was assessed using Fisher’s test + BH correction.
    
    Output files (miRNAs):
    
        * rbp_enrichment_mirna_down.xlsx
        * rbp_enrichment_mirna_up.xlsx
    
    How to Interpret the Numbers
    Each row in the result tables represents one RBP and its enrichment statistics:
    
    a: foreground genes/sequences with the motif
    b: background genes/sequences with the motif
    c: total number of foreground genes/sequences
    d: total number of background genes/sequences
    
    These values are used to compute p-values and FDRs.
    
    For example, in rbp_enrichment_MKL-1_up.xlsx, AGO2 has a = 1621, meaning FIMO detected AGO2 motifs in 1,621 genes in the MKL-1 upregulated set. These genes are listed in AGO2_uniq.txt.
    
    Similarly, for the miRNA analysis (e.g., rbp_enrichment_mirna_up.xlsx and rbp_enrichment_mirna_down.xlsx), the numbers represent counts of unique miRNAs with at least one significant motif hit. As examples, I calculated the detailed membership for FXR2 and ORB2.
    
    diff RBP_enrichments/rbp_enrichment_MKL-1_up.csv  RBP_enrichments_OLD_DEL/rbp_enrichment_MKL-1_up.csv
    #diff RBP_enrichments/rbp_enrichment_MKL-1_up.xlsx RBP_enrichments_OLD_DEL/rbp_enrichment_MKL-1_up.xlsx
    #diff RBP_enrichments/rbp_enrichment_MKL-1_up.png   RBP_enrichments_OLD_DEL/rbp_enrichment_MKL-1_up.png
    diff RBP_enrichments/rbp_enrichment_MKL-1_down.csv RBP_enrichments_OLD_DEL/rbp_enrichment_MKL-1_down.csv
    #diff RBP_enrichments/rbp_enrichment_MKL-1_down.xlsx RBP_enrichments_OLD_DEL/rbp_enrichment_MKL-1_down.xlsx
    #diff RBP_enrichments/rbp_enrichment_MKL-1_down.png RBP_enrichments_OLD_DEL/rbp_enrichment_MKL-1_down.png
    diff RBP_enrichments/rbp_enrichment_WaGa_up.csv RBP_enrichments_OLD_DEL/rbp_enrichment_WaGa_up.csv
    #diff RBP_enrichments/rbp_enrichment_WaGa_up.xlsx RBP_enrichments_OLD_DEL/rbp_enrichment_WaGa_up.xlsx
    #diff RBP_enrichments/rbp_enrichment_WaGa_up.png RBP_enrichments_OLD_DEL/rbp_enrichment_WaGa_up.png
    diff RBP_enrichments/rbp_enrichment_WaGa_down.csv RBP_enrichments_OLD_DEL/rbp_enrichment_WaGa_down.csv
    #diff RBP_enrichments/rbp_enrichment_WaGa_down.xlsx RBP_enrichments_OLD_DEL/rbp_enrichment_WaGa_down.xlsx
    #diff RBP_enrichments/rbp_enrichment_WaGa_down.png RBP_enrichments_OLD_DEL/rbp_enrichment_WaGa_down.png
    okular RBP_enrichments/MKL-1_volcano_up.pdf
    okular RBP_enrichments_OLD_DEL/MKL-1_volcano_up.pdf
    okular RBP_enrichments/MKL-1_rbp_enrichment_heatmap.pdf
    okular RBP_enrichments_OLD_DEL/MKL-1_rbp_enrichment_heatmap.pdf
    
    diff RBP_enrichments/rbp_enrichment_mirna_up.csv RBP_enrichments_OLD_DEL/rbp_enrichment_mirna_up_on_full-background.csv
    #diff RBP_enrichments/rbp_enrichment_mirna_up.xlsx RBP_enrichments_OLD_DEL/rbp_enrichment_mirna_up.xlsx
    #diff RBP_enrichments/rbp_enrichment_mirna_up.png RBP_enrichments_OLD_DEL/rbp_enrichment_mirna_up_on_full-background.png
    diff RBP_enrichments/rbp_enrichment_mirna_down.csv RBP_enrichments_OLD_DEL/rbp_enrichment_mirna_down_on_full-background.csv
    #diff RBP_enrichments/rbp_enrichment_mirna_down.xlsx RBP_enrichments_OLD_DEL/rbp_enrichment_mirna_down.xlsx
    #diff RBP_enrichments/rbp_enrichment_mirna_down.png RBP_enrichments_OLD_DEL/rbp_enrichment_mirna_down_on_full-background.png
    
  7. Generate the sequence logos for the enriched RBP motives

    import os
    import pandas as pd
    import matplotlib.pyplot as plt
    import logomaker
    from pathlib import Path
    import re
    
    # --------------
    # Config
    # --------------
    motif_table_path = "ATtRACT_db.txt"
    pwm_file_path = "pwm.txt"
    input_files = [
        "rbp_enrichment_MKL-1_up.csv",
        "rbp_enrichment_MKL-1_down.csv",
        "rbp_enrichment_WaGa_up.csv",
        "rbp_enrichment_WaGa_down.csv",
        "rbp_enrichment_mirna_up.csv",
        "rbp_enrichment_mirna_down.csv"
    ]
    output_dir = "sequence_logos"
    os.makedirs(output_dir, exist_ok=True)
    
    # --------------
    # Helper Functions
    # --------------
    def load_pwm_file(pwm_path):
        pwm_dict = {}
        current_id = None
        pwm_matrix = []
        with open(pwm_path, 'r') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                if line.startswith('>'):
                    if current_id and pwm_matrix:
                        pwm_dict[current_id] = pwm_matrix
                    parts = line[1:].split()
                    current_id = parts[0]
                    pwm_matrix = []
                else:
                    row = [float(x) for x in line.split()]
                    if len(row) == 4:
                        pwm_matrix.append(row)
        if current_id and pwm_matrix:
            pwm_dict[current_id] = pwm_matrix
        return pwm_dict
    
    def sanitize_filename(text):
        return re.sub(r'[^\w\-_.]', '_', text)
    
    def plot_logo(pwm_df, title, output_path):
        fig, ax = plt.subplots(figsize=(len(pwm_df)*0.5, 2))
        logo = logomaker.Logo(pwm_df, ax=ax, color_scheme='classic')
        logo.style_spines(visible=False)
        logo.style_spines(spines=['left', 'bottom'], visible=True)
        ax.set_ylabel("Information")
        ax.set_title(title)
        fig.tight_layout()
        fig.savefig(output_path, bbox_inches='tight')
        plt.close(fig)
    
    # --------------
    # Load Motif Table and Filter per your strategy
    # --------------
    motif_table = pd.read_csv(motif_table_path, sep='\t')
    
    # Filter rows where Score ends with '**'
    motif_table = motif_table[motif_table['Score'].str.endswith('**')].copy()
    
    # Remove '**' and convert Score to float
    motif_table['Score'] = motif_table['Score'].str.replace(r'\*\*$', '', regex=True).astype(float)
    
    # Function to keep Homo sapiens rows if exist, otherwise all
    def keep_human_if_exists(group):
        human_rows = group[group['Organism'] == 'Homo_sapiens']
        return human_rows if not human_rows.empty else group
    
    motif_table = motif_table.groupby('Gene_name', group_keys=False).apply(keep_human_if_exists)
    
    # If multiple remain per RBP, pick one randomly (seeded for reproducibility)
    motif_table = motif_table.groupby('Gene_name').apply(lambda g: g.sample(1, random_state=42)).reset_index(drop=True)
    
    # Build motif map for quick lookup
    motif_map = motif_table[['Gene_name', 'Matrix_id']].drop_duplicates()
    
    # Load PWM dictionary
    pwm_dict = load_pwm_file(pwm_file_path)
    
    # --------------
    # Process Each Enrichment File
    # --------------
    for file in input_files:
        print(f"\n📂 Processing {file}")
        df = pd.read_csv(file)
    
        if 'fdr' not in df.columns:
            print(f"⚠️ Skipping {file}: no 'fdr' column")
            continue
    
        sig_df = df[df['fdr'] <= 0.05].copy()
    
        for _, row in sig_df.iterrows():
            rbp_name = row['RBP']
            matches = motif_map[motif_map['Gene_name'] == rbp_name]
    
            if matches.empty:
                print(f"  ⚠️ No motif entry for RBP: {rbp_name}")
                continue
    
            # Should be exactly one row per RBP now
            motif_row = matches.iloc[0]
            matrix_id = motif_row['Matrix_id']
    
            if matrix_id not in pwm_dict:
                print(f"  ⚠️ PWM not found for matrix ID {matrix_id} (RBP: {rbp_name})")
                continue
    
            pwm = pwm_dict[matrix_id]
            pwm_df = pd.DataFrame(pwm, columns=list("ACGT"))
    
            title = f"{rbp_name} ({matrix_id})"
            safe_name = sanitize_filename(f"{Path(file).stem}_{rbp_name}_{matrix_id}.png")
            out_path = os.path.join(output_dir, safe_name)
    
            plot_logo(pwm_df, title, out_path)
    
    print("\n✅ Sequence logo generation complete.")
    
  8. Plot pie-chart for RNA-seq results

    mamba activate plot-numpy1
    (plot-numpy1) jhuang@WS-2290C:/media/jhuang/Elements1/Data_Ute$ python rna_type_piecharts.py /mnt/nvme1n1p1/Homo_sapiens/Ensembl/GRCh38/Annotation/Genes/genes.gtf Data_RNA-Seq_MKL-1+WaGa/merged_gene_counts_MKL-1_human.txt Data_RNA-Seq_MKL-1+WaGa/merged_gene_counts_WaGa_human.txt
    
  9. Code of rna_type_piecharts.py

    import sys
    import pandas as pd
    import matplotlib.pyplot as plt
    from matplotlib import cm
    import numpy as np
    
    def parse_gtf(gtf_file):
        biotype_dict = {}
        with open(gtf_file) as f:
            for line in f:
                if line.startswith('#'):
                    continue
                fields = line.strip().split('\t')
                if fields[2] != 'gene':
                    continue
                attr_field = fields[8]
                attrs = {}
                for attr in attr_field.split(';'):
                    attr = attr.strip()
                    if attr == '':
                        continue
                    key, val = attr.split(' ', 1)
                    attrs[key] = val.strip('"')
                gene_id = attrs.get('gene_id')
                gene_biotype = attrs.get('gene_biotype') or attrs.get('gene_type')  # some GTFs use gene_type
                if gene_id and gene_biotype:
                    biotype_dict[gene_id] = gene_biotype
        return biotype_dict
    
    def load_counts(mkl_file, waga_file):
        df_mkl = pd.read_csv(mkl_file, sep='\t')
        df_waga = pd.read_csv(waga_file, sep='\t')
        # Check for Geneid/gene_name match
        if not (df_mkl['Geneid'].equals(df_waga['Geneid']) and df_mkl['gene_name'].equals(df_waga['gene_name'])):
            mismatch = df_mkl.loc[
                (df_mkl['Geneid'] != df_waga['Geneid']) | (df_mkl['gene_name'] != df_waga['gene_name']),
                ['Geneid', 'gene_name']
            ]
            print("⚠️ Mismatched rows found between Geneid/gene_name columns:")
            print(mismatch)
            raise ValueError("Mismatch in Geneid/gene_name columns between MKL-1 and WaGa count files.")
    
        # Drop duplicated Geneid/gene_name from WaGa counts before merge
        df_waga_samples = df_waga.drop(columns=['Geneid', 'gene_name'])
    
        # Merge horizontally
        df = pd.concat([df_mkl, df_waga_samples], axis=1)
        return df, df_mkl, df_waga
    
    def map_biotypes(df, biotype_dict):
        # Map gene biotypes by Geneid; if not found, assign 'unknown'
        df['gene_biotype'] = df['Geneid'].map(biotype_dict).fillna('unknown')
        return df
    
    def save_raw_data_for_pie(df, output_excel):
        # Sum counts by biotype
        # Sum all sample columns (exclude Geneid, gene_name, gene_biotype)
        sample_cols = [c for c in df.columns if c not in ['Geneid', 'gene_name', 'gene_biotype']]
        df['total_counts'] = df[sample_cols].sum(axis=1)
    
        # Aggregate counts per biotype
        pie_data = df.groupby('gene_biotype')['total_counts'].sum().reset_index()
    
        with pd.ExcelWriter(output_excel) as writer:
            df.to_excel(writer, sheet_name='All_gene_counts', index=False)
            pie_data.to_excel(writer, sheet_name='Pie_data', index=False)
    
        return pie_data
    
    def plot_pie(pie_data, output_png):
        pie_data = pie_data[pie_data['total_counts'] > 0].sort_values('total_counts', ascending=False)
    
        N = 10  # Top N biotypes
        if len(pie_data) > N:
            top_data = pie_data.nlargest(N, 'total_counts')
            others = pd.DataFrame([{
                'gene_biotype': 'Other',
                'total_counts': pie_data['total_counts'].sum() - top_data['total_counts'].sum()
            }])
            pie_data = pd.concat([top_data, others], ignore_index=True)
    
        sizes = pie_data['total_counts']
        labels = pie_data['gene_biotype']
    
        # Use updated color map syntax
        cmap = plt.colormaps['Pastel1']
        colors = [cmap(i % cmap.N) for i in range(len(labels))]
    
        fig, ax = plt.subplots(figsize=(10, 10))
    
        # Draw pie chart without labels/autopct
        wedges, _ = ax.pie(
            sizes,
            startangle=90,
            colors=colors,
            radius=1.2
        )
    
        total = sum(sizes)
    
        for i, wedge in enumerate(wedges):
            angle = (wedge.theta2 + wedge.theta1) / 2.
            x = np.cos(np.deg2rad(angle))
            y = np.sin(np.deg2rad(angle))
    
            pct = sizes.iloc[i] / total * 100
            if pct < 1:
                continue  # Skip small slices
    
            ha = 'left' if x > 0 else 'right'
            ax.annotate(
                f"{labels.iloc[i]} ({pct:.1f}%)",
                xy=(x, y),
                xytext=(1.4 * x, 1.4 * y),
                ha=ha, va='center',
                fontsize=9,
                #arrowprops=dict(arrowstyle='-', color='gray')
                arrowprops=dict(arrowstyle='-', connectionstyle='angle3,angleA=0,angleB=90', color='gray')
            )
    
        # Build detailed legend: biotype (count, %)
        legend_labels = [
            f"{lbl} ({int(cnt):,}, {cnt / total:.1%})"
            for lbl, cnt in zip(labels, sizes)
        ]
    
        ax.legend(
            wedges,
            legend_labels,
            title="Gene Biotype",
            loc="center left",
            bbox_to_anchor=(1, 0.5),
            fontsize=9,
            title_fontsize=10
        )
    
        ax.set_title("", fontsize=14)
        ax.axis('equal')
        plt.tight_layout()
        plt.savefig(output_png, bbox_inches='tight')
        plt.close()
    
    def main():
        if len(sys.argv) != 4:
            print("Usage: python rna_type_piecharts.py <genes.gtf> <MKL-1_counts.txt> <WaGa_counts.txt>")
            sys.exit(1)
    
        gtf_file = sys.argv[1]
        mkl_counts_file = sys.argv[2]
        waga_counts_file = sys.argv[3]
    
        print("🔍 Parsing gene biotypes from GTF...")
        biotype_dict = parse_gtf(gtf_file)
    
        print("📊 Loading and merging count matrices...")
        df, df_mkl, df_waga = load_counts(mkl_counts_file, waga_counts_file)
    
        print("🧬 Mapping gene biotypes...")
        df = map_biotypes(df, biotype_dict)
        df_mkl = map_biotypes(df_mkl, biotype_dict)
        df_waga = map_biotypes(df_waga, biotype_dict)
    
        print("💾 Saving raw data for pie charts to Excel...")
        pie_data = save_raw_data_for_pie(df, 'rna_biotype_pie_data.xlsx')
        pie_mkl_data = save_raw_data_for_pie(df_mkl, 'rna_biotype_pie_data_MKL-1.xlsx')
        pie_waga_data = save_raw_data_for_pie(df_waga, 'rna_biotype_pie_data_WaGa.xlsx')
    
        print("📈 Plotting pie chart...")
        plot_pie(pie_data, 'rna_biotype_pie_chart.png')
        plot_pie(pie_mkl_data, 'rna_biotype_pie_chart_MKL-1.png')
        plot_pie(pie_waga_data, 'rna_biotype_pie_chart_WaGa.png')
    
        print("✅ Done! Excel data saved to 'rna_biotype_pie*_data.xlsx' and pie chart saved to 'rna_biotype_pie*_chart.png'.")
    
    if __name__ == "__main__":
        main()
    

like unlike

点赞本文的读者

还没有人对此文章表态


本文有评论

没有评论

看文章,发评论,不要沉默


© 2023 XGenes.com Impressum