Processing RNAseq_2025_WT_vs_ΔIJ_on_ATCC19606

gene_x 0 like s 9 view s

Tags: KEGG, GO, pathway, pipeline, RNA-seq

  1. Vorgabe

    #perform PCA analysis, Venn diagram analysis, as well as KEGG and GO annotations. We would also appreciate it if you could include CPM calculations for this dataset (gene_cpm_counts.xlsx). For comparative analysis, we are particularly interested in identifying DEGs between WT and ΔIJ across the different treatments and time points.
    
  2. Preparing raw data

    mkdir raw_data; cd raw_data
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/WT-17-1/WT-17-1_1.fq.gz WT-17-r1_R1.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/WT-17-1/WT-17-1_2.fq.gz WT-17-r1_R2.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/WT-17-2/WT-17-2_1.fq.gz WT-17-r2_R1.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/WT-17-2/WT-17-2_2.fq.gz WT-17-r2_R2.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/WT-17-3/WT-17-3_1.fq.gz WT-17-r3_R1.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/WT-17-3/WT-17-3_2.fq.gz WT-17-r3_R2.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/WT-24-1/WT-24-1_1.fq.gz WT-24-r1_R1.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/WT-24-1/WT-24-1_2.fq.gz WT-24-r1_R2.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/WT-24-2/WT-24-2_1.fq.gz WT-24-r2_R1.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/WT-24-2/WT-24-2_2.fq.gz WT-24-r2_R2.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/WT-24-3/WT-24-3_1.fq.gz WT-24-r3_R1.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/WT-24-3/WT-24-3_2.fq.gz WT-24-r3_R2.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/ΔIJ-17-1/ΔIJ-17-1_1.fq.gz deltaIJ-17-r1_R1.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/ΔIJ-17-1/ΔIJ-17-1_2.fq.gz deltaIJ-17-r1_R2.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/ΔIJ-17-2/ΔIJ-17-2_1.fq.gz deltaIJ-17-r2_R1.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/ΔIJ-17-2/ΔIJ-17-2_2.fq.gz deltaIJ-17-r2_R2.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/ΔIJ-17-3/ΔIJ-17-3_1.fq.gz deltaIJ-17-r3_R1.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/ΔIJ-17-3/ΔIJ-17-3_2.fq.gz deltaIJ-17-r3_R2.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/ΔIJ-24-1/ΔIJ-24-1_1.fq.gz deltaIJ-24-r1_R1.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/ΔIJ-24-1/ΔIJ-24-1_2.fq.gz deltaIJ-24-r1_R2.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/ΔIJ-24-2/ΔIJ-24-2_1.fq.gz deltaIJ-24-r2_R1.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/ΔIJ-24-2/ΔIJ-24-2_2.fq.gz deltaIJ-24-r2_R2.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/ΔIJ-24-3/ΔIJ-24-3_1.fq.gz deltaIJ-24-r3_R1.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/ΔIJ-24-3/ΔIJ-24-3_2.fq.gz deltaIJ-24-r3_R2.fq.gz
    
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/preWT-17-1/preWT-17-1_1.fq.gz pre_WT-17-r1_R1.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/preWT-17-1/preWT-17-1_2.fq.gz pre_WT-17-r1_R2.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/preWT-17-2/preWT-17-2_1.fq.gz pre_WT-17-r2_R1.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/preWT-17-2/preWT-17-2_2.fq.gz pre_WT-17-r2_R2.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/preWT-17-3/preWT-17-3_1.fq.gz pre_WT-17-r3_R1.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/preWT-17-3/preWT-17-3_2.fq.gz pre_WT-17-r3_R2.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/preWT-24-1/preWT-24-1_1.fq.gz pre_WT-24-r1_R1.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/preWT-24-1/preWT-24-1_2.fq.gz pre_WT-24-r1_R2.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/preWT-24-2/preWT-24-2_1.fq.gz pre_WT-24-r2_R1.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/preWT-24-2/preWT-24-2_2.fq.gz pre_WT-24-r2_R2.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/preWT-24-3/preWT-24-3_1.fq.gz pre_WT-24-r3_R1.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/preWT-24-3/preWT-24-3_2.fq.gz pre_WT-24-r3_R2.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/preΔIJ-17-1/preΔIJ-17-1_1.fq.gz pre_deltaIJ-17-r1_R1.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/preΔIJ-17-1/preΔIJ-17-1_2.fq.gz pre_deltaIJ-17-r1_R2.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/preΔIJ-17-2/preΔIJ-17-2_1.fq.gz pre_deltaIJ-17-r2_R1.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/preΔIJ-17-2/preΔIJ-17-2_2.fq.gz pre_deltaIJ-17-r2_R2.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/preΔIJ-17-3/preΔIJ-17-3_1.fq.gz pre_deltaIJ-17-r3_R1.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/preΔIJ-17-3/preΔIJ-17-3_2.fq.gz pre_deltaIJ-17-r3_R2.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/preΔIJ-24-1/preΔIJ-24-1_1.fq.gz pre_deltaIJ-24-r1_R1.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/preΔIJ-24-1/preΔIJ-24-1_2.fq.gz pre_deltaIJ-24-r1_R2.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/preΔIJ-24-2/preΔIJ-24-2_1.fq.gz pre_deltaIJ-24-r2_R1.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/preΔIJ-24-2/preΔIJ-24-2_2.fq.gz pre_deltaIJ-24-r2_R2.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/preΔIJ-24-3/preΔIJ-24-3_1.fq.gz pre_deltaIJ-24-r3_R1.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/preΔIJ-24-3/preΔIJ-24-3_2.fq.gz pre_deltaIJ-24-r3_R2.fq.gz
    
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/WT0_5-17-1/WT0_5-17-1_1.fq.gz 0_5_WT-17-r1_R1.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/WT0_5-17-1/WT0_5-17-1_2.fq.gz 0_5_WT-17-r1_R2.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/WT0_5-17-2/WT0_5-17-2_1.fq.gz 0_5_WT-17-r2_R1.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/WT0_5-17-2/WT0_5-17-2_2.fq.gz 0_5_WT-17-r2_R2.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/WT0_5-17-3/WT0_5-17-3_1.fq.gz 0_5_WT-17-r3_R1.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/WT0_5-17-3/WT0_5-17-3_2.fq.gz 0_5_WT-17-r3_R2.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/WT0_5-24-1/WT0_5-24-1_1.fq.gz 0_5_WT-24-r1_R1.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/WT0_5-24-1/WT0_5-24-1_2.fq.gz 0_5_WT-24-r1_R2.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/WT0_5-24-2/WT0_5-24-2_1.fq.gz 0_5_WT-24-r2_R1.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/WT0_5-24-2/WT0_5-24-2_2.fq.gz 0_5_WT-24-r2_R2.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/WT0_5-24-3/WT0_5-24-3_1.fq.gz 0_5_WT-24-r3_R1.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/WT0_5-24-3/WT0_5-24-3_2.fq.gz 0_5_WT-24-r3_R2.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/0_5ΔIJ-17-1/0_5ΔIJ-17-1_1.fq.gz 0_5_deltaIJ-17-r1_R1.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/0_5ΔIJ-17-1/0_5ΔIJ-17-1_2.fq.gz 0_5_deltaIJ-17-r1_R2.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/0_5ΔIJ-17-2/0_5ΔIJ-17-2_1.fq.gz 0_5_deltaIJ-17-r2_R1.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/0_5ΔIJ-17-2/0_5ΔIJ-17-2_2.fq.gz 0_5_deltaIJ-17-r2_R2.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/0_5ΔIJ-17-3/0_5ΔIJ-17-3_1.fq.gz 0_5_deltaIJ-17-r3_R1.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/0_5ΔIJ-17-3/0_5ΔIJ-17-3_2.fq.gz 0_5_deltaIJ-17-r3_R2.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/0_5ΔIJ-24-1/0_5ΔIJ-24-1_1.fq.gz 0_5_deltaIJ-24-r1_R1.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/0_5ΔIJ-24-1/0_5ΔIJ-24-1_2.fq.gz 0_5_deltaIJ-24-r1_R2.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/0_5ΔIJ-24-2/0_5ΔIJ-24-2_1.fq.gz 0_5_deltaIJ-24-r2_R1.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/0_5ΔIJ-24-2/0_5ΔIJ-24-2_2.fq.gz 0_5_deltaIJ-24-r2_R2.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/0_5ΔIJ-24-3/0_5ΔIJ-24-3_1.fq.gz 0_5_deltaIJ-24-r3_R1.fq.gz
    ln -s ../RSMR00204/X101SC25062155-Z01/X101SC25062155-Z01-J001/01.RawData/0_5ΔIJ-24-3/0_5ΔIJ-24-3_2.fq.gz 0_5_deltaIJ-24-r3_R2.fq.gz
    
  3. (Done) Downloading CP059040.fasta and CP059040.gff from GenBank

  4. Preparing the directory trimmed

    mkdir trimmed trimmed_unpaired;
    for sample_id in WT-17-r1 WT-17-r2 WT-17-r3 WT-24-r1 WT-24-r2 WT-24-r3 deltaIJ-17-r1 deltaIJ-17-r2 deltaIJ-17-r3 deltaIJ-24-r1 deltaIJ-24-r2 deltaIJ-24-r3  pre_WT-17-r1 pre_WT-17-r2 pre_WT-17-r3 pre_WT-24-r1 pre_WT-24-r2 pre_WT-24-r3 pre_deltaIJ-17-r1 pre_deltaIJ-17-r2 pre_deltaIJ-17-r3 pre_deltaIJ-24-r1 pre_deltaIJ-24-r2 pre_deltaIJ-24-r3  0_5_WT-17-r1 0_5_WT-17-r2 0_5_WT-17-r3 0_5_WT-24-r1 0_5_WT-24-r2 0_5_WT-24-r3 0_5_deltaIJ-17-r1 0_5_deltaIJ-17-r2 0_5_deltaIJ-17-r3 0_5_deltaIJ-24-r1 0_5_deltaIJ-24-r2 0_5_deltaIJ-24-r3; do \
            java -jar /home/jhuang/Tools/Trimmomatic-0.36/trimmomatic-0.36.jar PE -threads 100 raw_data/${sample_id}_R1.fq.gz raw_data/${sample_id}_R2.fq.gz trimmed/${sample_id}_R1.fq.gz trimmed_unpaired/${sample_id}_R1.fq.gz trimmed/${sample_id}_R2.fq.gz trimmed_unpaired/${sample_id}_R2.fq.gz ILLUMINACLIP:/home/jhuang/Tools/Trimmomatic-0.36/adapters/TruSeq3-PE-2.fa:2:30:10:8:TRUE LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36 AVGQUAL:20; done 2> trimmomatic_pe.log;
    done
    
  5. Preparing samplesheet.csv

    sample,fastq_1,fastq_2,strandedness
    WT_17_r1,WT-17-r1_R1.fq.gz,WT-17-r1_R2.fq.gz,auto
    WT_17_r2,WT-17-r2_R1.fq.gz,WT-17-r2_R2.fq.gz,auto
    WT_17_r3,WT-17-r3_R1.fq.gz,WT-17-r3_R2.fq.gz,auto
    WT_24_r1,WT-24-r1_R1.fq.gz,WT-24-r1_R2.fq.gz,auto
    WT_24_r2,WT-24-r2_R1.fq.gz,WT-24-r2_R2.fq.gz,auto
    WT_24_r3,WT-24-r3_R1.fq.gz,WT-24-r3_R2.fq.gz,auto
    deltaIJ_17_r1,deltaIJ-17-r1_R1.fq.gz,deltaIJ-17-r1_R2.fq.gz,auto
    deltaIJ_17_r2,deltaIJ-17-r2_R1.fq.gz,deltaIJ-17-r2_R2.fq.gz,auto
    deltaIJ_17_r3,deltaIJ-17-r3_R1.fq.gz,deltaIJ-17-r3_R2.fq.gz,auto
    deltaIJ_24_r1,deltaIJ-24-r1_R1.fq.gz,deltaIJ-24-r1_R2.fq.gz,auto
    deltaIJ_24_r2,deltaIJ-24-r2_R1.fq.gz,deltaIJ-24-r2_R2.fq.gz,auto
    deltaIJ_24_r3,deltaIJ-24-r3_R1.fq.gz,deltaIJ-24-r3_R2.fq.gz,auto
    pre_WT_17_r1,pre_WT-17-r1_R1.fq.gz,pre_WT-17-r1_R2.fq.gz,auto
    pre_WT_17_r2,pre_WT-17-r2_R1.fq.gz,pre_WT-17-r2_R2.fq.gz,auto
    pre_WT_17_r3,pre_WT-17-r3_R1.fq.gz,pre_WT-17-r3_R2.fq.gz,auto
    pre_WT_24_r1,pre_WT-24-r1_R1.fq.gz,pre_WT-24-r1_R2.fq.gz,auto
    pre_WT_24_r2,pre_WT-24-r2_R1.fq.gz,pre_WT-24-r2_R2.fq.gz,auto
    pre_WT_24_r3,pre_WT-24-r3_R1.fq.gz,pre_WT-24-r3_R2.fq.gz,auto
    pre_deltaIJ_17_r1,pre_deltaIJ-17-r1_R1.fq.gz,pre_deltaIJ-17-r1_R2.fq.gz,auto
    pre_deltaIJ_17_r2,pre_deltaIJ-17-r2_R1.fq.gz,pre_deltaIJ-17-r2_R2.fq.gz,auto
    pre_deltaIJ_17_r3,pre_deltaIJ-17-r3_R1.fq.gz,pre_deltaIJ-17-r3_R2.fq.gz,auto
    pre_deltaIJ_24_r1,pre_deltaIJ-24-r1_R1.fq.gz,pre_deltaIJ-24-r1_R2.fq.gz,auto
    pre_deltaIJ_24_r2,pre_deltaIJ-24-r2_R1.fq.gz,pre_deltaIJ-24-r2_R2.fq.gz,auto
    pre_deltaIJ_24_r3,pre_deltaIJ-24-r3_R1.fq.gz,pre_deltaIJ-24-r3_R2.fq.gz,auto
    0_5_WT_17_r1,0_5_WT-17-r1_R1.fq.gz,0_5_WT-17-r1_R2.fq.gz,auto
    0_5_WT_17_r2,0_5_WT-17-r2_R1.fq.gz,0_5_WT-17-r2_R2.fq.gz,auto
    0_5_WT_17_r3,0_5_WT-17-r3_R1.fq.gz,0_5_WT-17-r3_R2.fq.gz,auto
    0_5_WT_24_r1,0_5_WT-24-r1_R1.fq.gz,0_5_WT-24-r1_R2.fq.gz,auto
    0_5_WT_24_r2,0_5_WT-24-r2_R1.fq.gz,0_5_WT-24-r2_R2.fq.gz,auto
    0_5_WT_24_r3,0_5_WT-24-r3_R1.fq.gz,0_5_WT-24-r3_R2.fq.gz,auto
    0_5_deltaIJ_17_r1,0_5_deltaIJ-17-r1_R1.fq.gz,0_5_deltaIJ-17-r1_R2.fq.gz,auto
    0_5_deltaIJ_17_r2,0_5_deltaIJ-17-r2_R1.fq.gz,0_5_deltaIJ-17-r2_R2.fq.gz,auto
    0_5_deltaIJ_17_r3,0_5_deltaIJ-17-r3_R1.fq.gz,0_5_deltaIJ-17-r3_R2.fq.gz,auto
    0_5_deltaIJ_24_r1,0_5_deltaIJ-24-r1_R1.fq.gz,0_5_deltaIJ-24-r1_R2.fq.gz,auto
    0_5_deltaIJ_24_r2,0_5_deltaIJ-24-r2_R1.fq.gz,0_5_deltaIJ-24-r2_R2.fq.gz,auto
    0_5_deltaIJ_24_r3,0_5_deltaIJ-24-r3_R1.fq.gz,0_5_deltaIJ-24-r3_R2.fq.gz,auto
    
  6. nextflow run

    #Example1: http://xgenes.com/article/article-content/157/prepare-virus-gtf-for-nextflow-run/
    
    docker pull nfcore/rnaseq
    ln -s /home/jhuang/Tools/nf-core-rnaseq-3.12.0/ rnaseq
    
    #Default: --gtf_group_features 'gene_id'  --gtf_extra_attributes 'gene_name' --featurecounts_group_type 'gene_biotype' --featurecounts_feature_type 'exon'
    #(host_env) !NOT_WORKING! jhuang@WS-2290C:~/DATA/Data_Tam_RNAseq_2024$ /usr/local/bin/nextflow run rnaseq/main.nf --input samplesheet.csv --outdir results    --fasta "/home/jhuang/DATA/Data_Tam_RNAseq_2024/CP059040.fasta" --gff "/home/jhuang/DATA/Data_Tam_RNAseq_2024/CP059040.gff"        -profile docker -resume  --max_cpus 55 --max_memory 512.GB --max_time 2400.h    --save_align_intermeds --save_unaligned --save_reference    --aligner 'star_salmon'    --gtf_group_features 'gene_id'  --gtf_extra_attributes 'gene_name' --featurecounts_group_type 'gene_biotype' --featurecounts_feature_type 'transcript'
    
    # -- DEBUG_1 (CDS --> exon in CP059040.gff) --
    #Checking the record (see below) in results/genome/CP059040.gtf
    #In ./results/genome/CP059040.gtf e.g. "CP059040.1      Genbank transcript      1       1398    .       +       .       transcript_id "gene-H0N29_00005"; gene_id "gene-H0N29_00005"; gene_name "dnaA"; Name "dnaA"; gbkey "Gene"; gene "dnaA"; gene_biotype "protein_coding"; locus_tag "H0N29_00005";"
    #--featurecounts_feature_type 'transcript' returns only the tRNA results
    #Since the tRNA records have "transcript and exon". In gene records, we have "transcript and CDS". replace the CDS with exon
    
    grep -P "\texon\t" CP059040.gff | sort | wc -l    #96
    grep -P "cmsearch\texon\t" CP059040.gff | wc -l    #=10  ignal recognition particle sRNA small typ, transfer-messenger RNA, 5S ribosomal RNA
    grep -P "Genbank\texon\t" CP059040.gff | wc -l    #=12  16S and 23S ribosomal RNA
    grep -P "tRNAscan-SE\texon\t" CP059040.gff | wc -l    #tRNA 74
    wc -l star_salmon/AUM_r3/quant.genes.sf  #--featurecounts_feature_type 'transcript' results in 96 records!
    
    grep -P "\tCDS\t" CP059040.gff | wc -l  #3701
    sed 's/\tCDS\t/\texon\t/g' CP059040.gff > CP059040_m.gff
    grep -P "\texon\t" CP059040_m.gff | sort | wc -l  #3797
    
    # -- DEBUG_2: combination of 'CP059040_m.gff' and 'exon' results in ERROR, using 'transcript' instead!
    --gff "/home/jhuang/DATA/Data_Tam_RNAseq_2024/CP059040_m.gff" --featurecounts_feature_type 'transcript'
    
    # ---- SUCCESSFUL with directly downloaded gff3 and fasta from NCBI using docker after replacing 'CDS' with 'exon' ----
    mv trimmed/*.fq.gz .; rmdir trimmed
    (host_env) /usr/local/bin/nextflow run rnaseq/main.nf --input samplesheet.csv --outdir results    --fasta "/home/jhuang/DATA/Data_Tam_RNAseq_2024_AUM_MHB_Urine_ATCC19606/CP059040.fasta" --gff "/home/jhuang/DATA/Data_Tam_RNAseq_2024_AUM_MHB_Urine_ATCC19606/CP059040_m.gff"        -profile docker -resume  --max_cpus 90 --max_memory 900.GB --max_time 2400.h    --save_align_intermeds --save_unaligned --save_reference    --aligner 'star_salmon'    --gtf_group_features 'gene_id'  --gtf_extra_attributes 'gene_name' --featurecounts_group_type 'gene_biotype' --featurecounts_feature_type 'transcript'
    
    # -- DEBUG_3: make sure the header of fasta is the same to the *_m.gff file
    
  7. Import data and pca-plot

    #mamba activate r_env
    
    #install.packages("ggfun")
    # Import the required libraries
    library("AnnotationDbi")
    library("clusterProfiler")
    library("ReactomePA")
    library(gplots)
    library(tximport)
    library(DESeq2)
    #library("org.Hs.eg.db")
    library(dplyr)
    library(tidyverse)
    #install.packages("devtools")
    #devtools::install_version("gtable", version = "0.3.0")
    library(gplots)
    library("RColorBrewer")
    #install.packages("ggrepel")
    library("ggrepel")
    # install.packages("openxlsx")
    library(openxlsx)
    library(EnhancedVolcano)
    library(DESeq2)
    library(edgeR)
    
    setwd("~/DATA/Data_Tam_RNAseq_2025_WT_deltaIJ_ATCC19606/results/star_salmon")
    # Define paths to your Salmon output quantification files
    files <- c("WT_17_r1" = "./WT_17_r1/quant.sf",
               "WT_17_r2" = "./WT_17_r2/quant.sf",
               "WT_17_r3" = "./WT_17_r3/quant.sf",
               "WT_24_r1" = "./WT_24_r1/quant.sf",
               "WT_24_r2" = "./WT_24_r2/quant.sf",
               "WT_24_r3" = "./WT_24_r3/quant.sf",
               "deltaIJ_17_r1" = "./deltaIJ_17_r1/quant.sf",
               "deltaIJ_17_r2" = "./deltaIJ_17_r2/quant.sf",
               "deltaIJ_17_r3" = "./deltaIJ_17_r3/quant.sf",
               "deltaIJ_24_r1" = "./deltaIJ_24_r1/quant.sf",
               "deltaIJ_24_r2" = "./deltaIJ_24_r2/quant.sf",
               "deltaIJ_24_r3" = "./deltaIJ_24_r3/quant.sf",
               "pre_WT_17_r1" = "./pre_WT_17_r1/quant.sf",
               "pre_WT_17_r2" = "./pre_WT_17_r2/quant.sf",
               "pre_WT_17_r3" = "./pre_WT_17_r3/quant.sf",
               "pre_WT_24_r1" = "./pre_WT_24_r1/quant.sf",
               "pre_WT_24_r2" = "./pre_WT_24_r2/quant.sf",
               "pre_WT_24_r3" = "./pre_WT_24_r3/quant.sf",
               "pre_deltaIJ_17_r1" = "./pre_deltaIJ_17_r1/quant.sf",
               "pre_deltaIJ_17_r2" = "./pre_deltaIJ_17_r2/quant.sf",
               "pre_deltaIJ_17_r3" = "./pre_deltaIJ_17_r3/quant.sf",
               "pre_deltaIJ_24_r1" = "./pre_deltaIJ_24_r1/quant.sf",
               "pre_deltaIJ_24_r2" = "./pre_deltaIJ_24_r2/quant.sf",
               "pre_deltaIJ_24_r3" = "./pre_deltaIJ_24_r3/quant.sf",
               "0_5_WT_17_r1" = "./0_5_WT_17_r1/quant.sf",
               "0_5_WT_17_r2" = "./0_5_WT_17_r2/quant.sf",
               "0_5_WT_17_r3" = "./0_5_WT_17_r3/quant.sf",
               "0_5_WT_24_r1" = "./0_5_WT_24_r1/quant.sf",
               "0_5_WT_24_r2" = "./0_5_WT_24_r2/quant.sf",
               "0_5_WT_24_r3" = "./0_5_WT_24_r3/quant.sf",
               "0_5_deltaIJ_17_r1" = "./0_5_deltaIJ_17_r1/quant.sf",
               "0_5_deltaIJ_17_r2" = "./0_5_deltaIJ_17_r2/quant.sf",
               "0_5_deltaIJ_17_r3" = "./0_5_deltaIJ_17_r3/quant.sf",
               "0_5_deltaIJ_24_r1" = "./0_5_deltaIJ_24_r1/quant.sf",
               "0_5_deltaIJ_24_r2" = "./0_5_deltaIJ_24_r2/quant.sf",
               "0_5_deltaIJ_24_r3" = "./0_5_deltaIJ_24_r3/quant.sf")
    # Import the transcript abundance data with tximport
    txi <- tximport(files, type = "salmon", txIn = TRUE, txOut = TRUE)
    # Define the replicates and condition of the samples
    replicate <- factor(c("r1", "r2", "r3", "r1", "r2", "r3", "r1", "r2", "r3", "r1", "r2", "r3",     "r1", "r2", "r3", "r1", "r2", "r3", "r1", "r2", "r3", "r1", "r2", "r3",      "r1", "r2", "r3", "r1", "r2", "r3", "r1", "r2", "r3", "r1", "r2", "r3"))
    condition <- factor(c("WT_17","WT_17","WT_17","WT_24","WT_24","WT_24", "deltaIJ_17","deltaIJ_17","deltaIJ_17","deltaIJ_24","deltaIJ_24","deltaIJ_24",   "pre_WT_17","pre_WT_17","pre_WT_17","pre_WT_24","pre_WT_24","pre_WT_24", "pre_deltaIJ_17","pre_deltaIJ_17","pre_deltaIJ_17","pre_deltaIJ_24","pre_deltaIJ_24","pre_deltaIJ_24",   "0_5_WT_17","0_5_WT_17","0_5_WT_17","0_5_WT_24","0_5_WT_24","0_5_WT_24", "0_5_deltaIJ_17","0_5_deltaIJ_17","0_5_deltaIJ_17","0_5_deltaIJ_24","0_5_deltaIJ_24","0_5_deltaIJ_24"))
    # Define the colData for DESeq2
    colData <- data.frame(condition=condition, replicate=replicate, row.names=names(files))
    
    # ------------------------
    # 1️⃣ Setup and input files
    # ------------------------
    
    # Read in transcript-to-gene mapping
    tx2gene <- read.table("salmon_tx2gene.tsv", header=FALSE, stringsAsFactors=FALSE)
    colnames(tx2gene) <- c("transcript_id", "gene_id", "gene_name")
    
    # Prepare tx2gene for gene-level summarization (remove gene_name if needed)
    tx2gene_geneonly <- tx2gene[, c("transcript_id", "gene_id")]
    
    # -------------------------------
    # 2️⃣ Transcript-level counts
    # -------------------------------
    # Create DESeqDataSet directly from tximport (transcript-level)
    dds_tx <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition)
    write.csv(counts(dds_tx), file="transcript_counts.csv")
    
    # --------------------------------
    # 3️⃣ Gene-level summarization
    # --------------------------------
    # Re-import Salmon data summarized at gene level
    txi_gene <- tximport(files, type="salmon", tx2gene=tx2gene_geneonly, txOut=FALSE)
    
    # Create DESeqDataSet for gene-level counts
    dds <- DESeqDataSetFromTximport(txi_gene, colData=colData, design=~condition+replicate)
    
    # --------------------------------
    # 4️⃣ Raw counts table (with gene names)
    # --------------------------------
    # Extract raw gene-level counts
    counts_data <- as.data.frame(counts(dds, normalized=FALSE))
    counts_data$gene_id <- rownames(counts_data)
    
    # Add gene names
    tx2gene_unique <- unique(tx2gene[, c("gene_id", "gene_name")])
    counts_data <- merge(counts_data, tx2gene_unique, by="gene_id", all.x=TRUE)
    
    # Reorder columns: gene_id, gene_name, then counts
    count_cols <- setdiff(colnames(counts_data), c("gene_id", "gene_name"))
    counts_data <- counts_data[, c("gene_id", "gene_name", count_cols)]
    
    # --------------------------------
    # 5️⃣ Calculate CPM
    # --------------------------------
    library(edgeR)
    library(openxlsx)
    
    # Prepare count matrix for CPM calculation
    count_matrix <- as.matrix(counts_data[, !(colnames(counts_data) %in% c("gene_id", "gene_name"))])
    
    # Calculate CPM
    #cpm_matrix <- cpm(count_matrix, normalized.lib.sizes=FALSE)
    total_counts <- colSums(count_matrix)
    cpm_matrix <- t(t(count_matrix) / total_counts) * 1e6
    cpm_matrix <- as.data.frame(cpm_matrix)
    
    # Add gene_id and gene_name back to CPM table
    cpm_counts <- cbind(counts_data[, c("gene_id", "gene_name")], cpm_matrix)
    
    # --------------------------------
    # 6️⃣ Save outputs (CPM calculations required to send!)
    # --------------------------------
    write.csv(counts_data, "gene_raw_counts.csv", row.names=FALSE)
    write.xlsx(counts_data, "gene_raw_counts.xlsx", row.names=FALSE)
    write.xlsx(cpm_counts, "gene_cpm_counts.xlsx", row.names=FALSE)
    
  8. PCA

    dim(counts(dds))
    head(counts(dds), 10)
    
    library(DESeq2)
    library(RColorBrewer)
    library(gplots)
    library(ggplot2)
    
    # Load or generate DESeqDataSet object: dds
    # dds <- DESeqDataSetFromMatrix(...)  # <- already assumed
    
    # Apply rlog transformation
    rld <- rlogTransformation(dds)
    
    # Define condition names in correct order
    condition <- factor(c(
      "WT_17","WT_17","WT_17",
      "WT_24","WT_24","WT_24",
      "deltaIJ_17","deltaIJ_17","deltaIJ_17",
      "deltaIJ_24","deltaIJ_24","deltaIJ_24",
      "pre_WT_17","pre_WT_17","pre_WT_17",
      "pre_WT_24","pre_WT_24","pre_WT_24",
      "pre_deltaIJ_17","pre_deltaIJ_17","pre_deltaIJ_17",
      "pre_deltaIJ_24","pre_deltaIJ_24","pre_deltaIJ_24",
      "0_5_WT_17","0_5_WT_17","0_5_WT_17",
      "0_5_WT_24","0_5_WT_24","0_5_WT_24",
      "0_5_deltaIJ_17","0_5_deltaIJ_17","0_5_deltaIJ_17",
      "0_5_deltaIJ_24","0_5_deltaIJ_24","0_5_deltaIJ_24"
    ))
    
    # Replace with descriptive condition names
    condition <- factor(condition,
      levels = c(
        "WT_17", "deltaIJ_17", "WT_24", "deltaIJ_24",
        "pre_WT_17", "pre_deltaIJ_17", "pre_WT_24", "pre_deltaIJ_24",
        "0_5_WT_17", "0_5_deltaIJ_17", "0_5_WT_24", "0_5_deltaIJ_24"
      ),
      labels = c(
        "WT-17", "ΔIJ-17",
        "WT-24", "ΔIJ-24",
        "preWT-17", "preΔIJ-17",
        "preWT-24", "preΔIJ-24",
        "0_5WT-17", "0_5ΔIJ-17",
        "0_5WT-24", "0_5ΔIJ-24"
      )
    )
    
    # Assign to rld
    colData(rld)$condition <- condition
    
    # Define colors (12 distinct ones)
    condition_colors <- c(
      "#1f78b4", "#33a02c", "#a6cee3", "#b2df8a",
      "#fb9a99", "#e31a1c", "#fdbf6f", "#ff7f00",
      "#cab2d6", "#6a3d9a", "#ffff99", "#b15928"
    )
    
    names(condition_colors) <- levels(condition)
    
    # Plot PCA
    png("pca_colored.png", width=1200, height=800)
    pcaData <- plotPCA(rld, intgroup="condition", returnData=TRUE)
    percentVar <- round(100 * attr(pcaData, "percentVar"))
    
    ggplot(pcaData, aes(PC1, PC2, color=condition)) +
      geom_point(size=4) +
      scale_color_manual(values=condition_colors) +
      xlab(paste0("PC1: ", percentVar[1], "% variance")) +
      ylab(paste0("PC2: ", percentVar[2], "% variance")) +
      theme_bw() +
      theme(axis.text = element_text(size=12), legend.text = element_text(size=10))
    dev.off()
    
    # Heatmap of sample distances
    png("heatmap.png", width=1200, height=800)
    distsRL <- dist(t(assay(rld)))
    mat <- as.matrix(distsRL)
    hc <- hclust(distsRL)
    hmcol <- colorRampPalette(brewer.pal(9,"GnBu"))(100)
    heatmap.2(mat, Rowv=as.dendrogram(hc), Colv=as.dendrogram(hc),
              trace="none", symm=TRUE, col=rev(hmcol),
              margin=c(13, 13), labRow=condition, labCol=condition)
    dev.off()
    
  9. Select the differentially expressed genes

    #https://galaxyproject.eu/posts/2020/08/22/three-steps-to-galaxify-your-tool/
    #https://www.biostars.org/p/282295/
    #https://www.biostars.org/p/335751/
    #> dds$condition
    #CONSOLE: mkdir star_salmon/degenes
    
    setwd("degenes")
    #---- relevel to control ----
    dds$condition <- relevel(dds$condition, "WT_17")
    dds = DESeq(dds, betaPrior=FALSE)
    resultsNames(dds)
    clist <- c("deltaIJ_17_vs_WT_17")
    
    dds$condition <- relevel(dds$condition, "WT_24")
    dds = DESeq(dds, betaPrior=FALSE)
    resultsNames(dds)
    clist <- c("deltaIJ_24_vs_WT_24")
    
    dds$condition <- relevel(dds$condition, "pre_WT_17")
    dds = DESeq(dds, betaPrior=FALSE)
    resultsNames(dds)
    clist <- c("pre_deltaIJ_17_vs_pre_WT_17")
    
    dds$condition <- relevel(dds$condition, "pre_WT_24")
    dds = DESeq(dds, betaPrior=FALSE)
    resultsNames(dds)
    clist <- c("pre_deltaIJ_24_vs_pre_WT_24")
    
    dds$condition <- relevel(dds$condition, "0_5_WT_17")
    dds = DESeq(dds, betaPrior=FALSE)
    resultsNames(dds)
    clist <- c("0_5_deltaIJ_17_vs_0_5_WT_17")
    
    dds$condition <- relevel(dds$condition, "0_5_WT_24")
    dds = DESeq(dds, betaPrior=FALSE)
    resultsNames(dds)
    clist <- c("0_5_deltaIJ_24_vs_0_5_WT_24")
    
    for (i in clist) {
      contrast = paste("condition", i, sep="_")
      res = results(dds, name=contrast)
      res <- res[!is.na(res$log2FoldChange),]
      res_df <- as.data.frame(res)
    
      write.csv(as.data.frame(res_df[order(res_df$pvalue),]), file = paste(i, "all.txt", sep="-"))
      up <- subset(res_df, padj<=0.05 & log2FoldChange>=2)
      down <- subset(res_df, padj<=0.05 & log2FoldChange<=-2)
      write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
      write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
    }
    
    # -- Under host-env --
    grep -P "\tgene\t" CP059040.gff > CP059040_gene.gff
    python3 ~/Scripts/replace_gene_names.py /home/jhuang/DATA/Data_Tam_RNAseq_2024_AUM_MHB_Urine_ATCC19606/CP059040_gene.gff deltaIJ_17_vs_WT_17-all.txt deltaIJ_17_vs_WT_17-all.csv
    python3 ~/Scripts/replace_gene_names.py /home/jhuang/DATA/Data_Tam_RNAseq_2024_AUM_MHB_Urine_ATCC19606/CP059040_gene.gff deltaIJ_17_vs_WT_17-up.txt deltaIJ_17_vs_WT_17-up.csv
    python3 ~/Scripts/replace_gene_names.py /home/jhuang/DATA/Data_Tam_RNAseq_2024_AUM_MHB_Urine_ATCC19606/CP059040_gene.gff deltaIJ_17_vs_WT_17-down.txt deltaIJ_17_vs_WT_17-down.csv
    python3 ~/Scripts/replace_gene_names.py /home/jhuang/DATA/Data_Tam_RNAseq_2024_AUM_MHB_Urine_ATCC19606/CP059040_gene.gff deltaIJ_24_vs_WT_24-all.txt deltaIJ_24_vs_WT_24-all.csv
    python3 ~/Scripts/replace_gene_names.py /home/jhuang/DATA/Data_Tam_RNAseq_2024_AUM_MHB_Urine_ATCC19606/CP059040_gene.gff deltaIJ_24_vs_WT_24-up.txt deltaIJ_24_vs_WT_24-up.csv
    python3 ~/Scripts/replace_gene_names.py /home/jhuang/DATA/Data_Tam_RNAseq_2024_AUM_MHB_Urine_ATCC19606/CP059040_gene.gff deltaIJ_24_vs_WT_24-down.txt deltaIJ_24_vs_WT_24-down.csv
    
    python3 ~/Scripts/replace_gene_names.py /home/jhuang/DATA/Data_Tam_RNAseq_2024_AUM_MHB_Urine_ATCC19606/CP059040_gene.gff pre_deltaIJ_17_vs_pre_WT_17-all.txt pre_deltaIJ_17_vs_pre_WT_17-all.csv
    python3 ~/Scripts/replace_gene_names.py /home/jhuang/DATA/Data_Tam_RNAseq_2024_AUM_MHB_Urine_ATCC19606/CP059040_gene.gff pre_deltaIJ_17_vs_pre_WT_17-up.txt pre_deltaIJ_17_vs_pre_WT_17-up.csv
    python3 ~/Scripts/replace_gene_names.py /home/jhuang/DATA/Data_Tam_RNAseq_2024_AUM_MHB_Urine_ATCC19606/CP059040_gene.gff pre_deltaIJ_17_vs_pre_WT_17-down.txt pre_deltaIJ_17_vs_pre_WT_17-down.csv
    python3 ~/Scripts/replace_gene_names.py /home/jhuang/DATA/Data_Tam_RNAseq_2024_AUM_MHB_Urine_ATCC19606/CP059040_gene.gff pre_deltaIJ_24_vs_pre_WT_24-all.txt pre_deltaIJ_24_vs_pre_WT_24-all.csv
    python3 ~/Scripts/replace_gene_names.py /home/jhuang/DATA/Data_Tam_RNAseq_2024_AUM_MHB_Urine_ATCC19606/CP059040_gene.gff pre_deltaIJ_24_vs_pre_WT_24-up.txt pre_deltaIJ_24_vs_pre_WT_24-up.csv
    python3 ~/Scripts/replace_gene_names.py /home/jhuang/DATA/Data_Tam_RNAseq_2024_AUM_MHB_Urine_ATCC19606/CP059040_gene.gff pre_deltaIJ_24_vs_pre_WT_24-down.txt pre_deltaIJ_24_vs_pre_WT_24-down.csv
    
    python3 ~/Scripts/replace_gene_names.py /home/jhuang/DATA/Data_Tam_RNAseq_2024_AUM_MHB_Urine_ATCC19606/CP059040_gene.gff 0_5_deltaIJ_17_vs_0_5_WT_17-all.txt 0_5_deltaIJ_17_vs_0_5_WT_17-all.csv
    python3 ~/Scripts/replace_gene_names.py /home/jhuang/DATA/Data_Tam_RNAseq_2024_AUM_MHB_Urine_ATCC19606/CP059040_gene.gff 0_5_deltaIJ_17_vs_0_5_WT_17-up.txt 0_5_deltaIJ_17_vs_0_5_WT_17-up.csv
    python3 ~/Scripts/replace_gene_names.py /home/jhuang/DATA/Data_Tam_RNAseq_2024_AUM_MHB_Urine_ATCC19606/CP059040_gene.gff 0_5_deltaIJ_17_vs_0_5_WT_17-down.txt 0_5_deltaIJ_17_vs_0_5_WT_17-down.csv
    python3 ~/Scripts/replace_gene_names.py /home/jhuang/DATA/Data_Tam_RNAseq_2024_AUM_MHB_Urine_ATCC19606/CP059040_gene.gff 0_5_deltaIJ_24_vs_0_5_WT_24-all.txt 0_5_deltaIJ_24_vs_0_5_WT_24-all.csv
    python3 ~/Scripts/replace_gene_names.py /home/jhuang/DATA/Data_Tam_RNAseq_2024_AUM_MHB_Urine_ATCC19606/CP059040_gene.gff 0_5_deltaIJ_24_vs_0_5_WT_24-up.txt 0_5_deltaIJ_24_vs_0_5_WT_24-up.csv
    python3 ~/Scripts/replace_gene_names.py /home/jhuang/DATA/Data_Tam_RNAseq_2024_AUM_MHB_Urine_ATCC19606/CP059040_gene.gff 0_5_deltaIJ_24_vs_0_5_WT_24-down.txt 0_5_deltaIJ_24_vs_0_5_WT_24-down.csv
    
    res <- read.csv("deltaIJ_17_vs_WT_17-all.csv")
    res <- read.csv("deltaIJ_24_vs_WT_24-all.csv")
    res <- read.csv("pre_deltaIJ_17_vs_pre_WT_17-all.csv")
    res <- read.csv("pre_deltaIJ_24_vs_pre_WT_24-all.csv")
    res <- read.csv("0_5_deltaIJ_17_vs_0_5_WT_17-all.csv")
    res <- read.csv("0_5_deltaIJ_24_vs_0_5_WT_24-all.csv")
    # Replace empty GeneName with modified GeneID
    res$GeneName <- ifelse(
      res$GeneName == "" | is.na(res$GeneName),
      gsub("gene-", "", res$GeneID),
      res$GeneName
    )
    duplicated_genes <- res[duplicated(res$GeneName), "GeneName"]
    
    res <- res %>%
      group_by(GeneName) %>%
      slice_min(padj, with_ties = FALSE) %>%
      ungroup()
    res <- as.data.frame(res)
    # Sort res first by padj (ascending) and then by log2FoldChange (descending)
    res <- res[order(res$padj, -res$log2FoldChange), ]
    
    up_regulated <- res[res$log2FoldChange >= 2 & res$padj <= 5e-2, ]
    down_regulated <- res[res$log2FoldChange <= -2 & res$padj <= 5e-2, ]
    wb <- createWorkbook()
    addWorksheet(wb, "Complete_Data")
    writeData(wb, "Complete_Data", res)
    addWorksheet(wb, "Up_Regulated")
    writeData(wb, "Up_Regulated", up_regulated)
    addWorksheet(wb, "Down_Regulated")
    writeData(wb, "Down_Regulated", down_regulated)
    # Save the workbook to a file
    saveWorkbook(wb, "Gene_Expression_ΔIJ-17_vs_WT-17.xlsx", overwrite = TRUE)
    saveWorkbook(wb, "Gene_Expression_ΔIJ-24_vs_WT-24.xlsx", overwrite = TRUE)
    saveWorkbook(wb, "Gene_Expression_preΔIJ-17_vs_preWT-17.xlsx", overwrite = TRUE)
    saveWorkbook(wb, "Gene_Expression_preΔIJ-24_vs_preWT-24.xlsx", overwrite = TRUE)
    saveWorkbook(wb, "Gene_Expression_0_5ΔIJ-17_vs_0_5WT-17.xlsx", overwrite = TRUE)
    saveWorkbook(wb, "Gene_Expression_0_5ΔIJ-24_vs_0_5WT-24.xlsx", overwrite = TRUE)
    
    # Set the 'GeneName' column as row.names
    rownames(res) <- res$GeneName
    # Drop the 'GeneName' column since it's now the row names
    res$GeneName <- NULL
    head(res)
    
    ## Ensure the data frame matches the expected format
    ## For example, it should have columns: log2FoldChange, padj, etc.
    #res <- as.data.frame(res)
    ## Remove rows with NA in log2FoldChange (if needed)
    #res <- res[!is.na(res$log2FoldChange),]
    
    # Replace padj = 0 with a small value
    #res$padj[res$padj == 0] <- 1e-305
    
    #library(EnhancedVolcano)
    # Assuming res is already sorted and processed
    png("ΔIJ-17_vs_WT-17.png", width=1000, height=1200)
    png("ΔIJ-24_vs_WT-24.png", width=1000, height=1200)
    png("preΔIJ-17_vs_preWT-17.png", width=1000, height=1200)
    png("preΔIJ-24_vs_preWT-24.png", width=1000, height=1200)
    png("0_5ΔIJ-17_vs_0_5WT-17.png", width=1000, height=1200)
    png("0_5ΔIJ-24_vs_0_5WT-24.png", width=1000, height=1200)
    #max.overlaps = 10
    EnhancedVolcano(res,
                    lab = rownames(res),
                    x = 'log2FoldChange',
                    y = 'padj',
                    pCutoff = 5e-2,
                    FCcutoff = 2,
                    title = '',
                    subtitleLabSize = 18,
                    pointSize = 3.0,
                    labSize = 5.0,
                    colAlpha = 1,
                    legendIconSize = 4.0,
                    drawConnectors = TRUE,
                    widthConnectors = 0.5,
                    colConnectors = 'black',
                    subtitle = expression("0_5ΔIJ-24 versus 0_5WT-24"))
    dev.off()
    
  10. Venn diagram

        #To visualize gene expression overlaps across your conditions, Venn diagrams are useful — but only when comparing 2–5 groups at a time. Given your conditions and comparisons, here’s the best strategy:
    
        #✅ Best Venn Diagram Setup Options
        #You’re comparing wild-type (WT) and ΔIJ mutant strains under different conditions (no treatment, treatment A, treatment B) and time points (17h, 24h). To avoid overcrowded or unreadable plots, group comparisons by specific contrasts:
    
        #Option 1: Treatment Effect at One Time Point (ΔIJ vs WT)
        #Compare ΔIJ vs WT at 17h or 24h, under all 3 treatments (None, A, B):
    
        # Venn: “Treatment-dependent differences (ΔIJ vs WT) at 17h”
        #* ΔIJ vs WT (no treatment) – ΔIJ-17 vs WT-17
        #* ΔIJ vs WT (treatment A) – preΔIJ-17 vs preWT-17
        #* ΔIJ vs WT (treatment B) – 0_5ΔIJ-17 vs WT0_5-17
    
        #👉 3-way Venn diagram: Shows overlap in DEGs between different treatment conditions for the ΔIJ effect at a single time point.
    
        # Install and load required packages
        if (!require("VennDiagram")) install.packages("VennDiagram")
        if (!require("openxlsx")) install.packages("openxlsx")
        library(VennDiagram)
        library(openxlsx)
    
        # Set working directory
        setwd("/mnt/md1/DATA/Data_Tam_RNAseq_2025_WT_deltaIJ_ATCC19606/results/star_salmon/degenes")
    
        # Read upregulated gene lists at 17h
        df_no_treatment <- read.csv("deltaIJ_17_vs_WT_17-up.txt", header = TRUE)
        genes_no_treatment <- df_no_treatment[[1]]
    
        df_treatA <- read.csv("pre_deltaIJ_17_vs_pre_WT_17-up.txt", header = TRUE)
        genes_treatA <- df_treatA[[1]]
    
        df_treatB <- read.csv("0_5_deltaIJ_17_vs_0_5_WT_17-up.txt", header = TRUE)
        genes_treatB <- df_treatB[[1]]
    
        # Clean gene names (optional, in case of extra characters like quotes)
        genes_no_treatment <- gsub("^\"|\"$", "", genes_no_treatment)
        genes_treatA <- gsub("^\"|\"$", "", genes_treatA)
        genes_treatB <- gsub("^\"|\"$", "", genes_treatB)
    
        # Create a list for Venn
        venn_list <- list(
          "No_Treatment" = genes_no_treatment,
          "Treatment_A" = genes_treatA,
          "Treatment_B" = genes_treatB
        )
    
        # Save Venn diagram
        venn.diagram(
          x = venn_list,
          filename = "venn_17h_upregulated_treatments.png",
          imagetype = "png",
          output = TRUE,
          col = "transparent",
          fill = c("#66c2a5", "#fc8d62", "#8da0cb"),
          alpha = 0.5,
          cex = 1.5,
          cat.cex = 1.4,
          cat.pos = 0,
          cat.dist = 0.05,
          main = "Upregulated Genes (ΔIJ vs WT, 17h)",
          main.cex = 1.5
        )
    
        # Intersections
        only_no <- setdiff(genes_no_treatment, union(genes_treatA, genes_treatB))
        only_A <- setdiff(genes_treatA, union(genes_no_treatment, genes_treatB))
        only_B <- setdiff(genes_treatB, union(genes_no_treatment, genes_treatA))
    
        no_A <- intersect(genes_no_treatment, genes_treatA)
        no_B <- intersect(genes_no_treatment, genes_treatB)
        A_B <- intersect(genes_treatA, genes_treatB)
    
        no_A_B <- Reduce(intersect, list(genes_no_treatment, genes_treatA, genes_treatB))
    
        # Remove overlapping from pairwise (keep only those not in 3-way)
        no_A <- setdiff(no_A, no_A_B)
        no_B <- setdiff(no_B, no_A_B)
        A_B <- setdiff(A_B, no_A_B)
    
        # Write to Excel
        wb <- createWorkbook()
        addWorksheet(wb, "Only_No_Treatment")
        addWorksheet(wb, "Only_Treatment_A")
        addWorksheet(wb, "Only_Treatment_B")
        addWorksheet(wb, "No_Treatment_AND_Treatment_A")
        addWorksheet(wb, "No_Treatment_AND_Treatment_B")
        addWorksheet(wb, "Treatment_A_AND_Treatment_B")
        addWorksheet(wb, "All_Three")
    
        writeData(wb, "Only_No_Treatment", only_no)
        writeData(wb, "Only_Treatment_A", only_A)
        writeData(wb, "Only_Treatment_B", only_B)
        writeData(wb, "No_Treatment_AND_Treatment_A", no_A)
        writeData(wb, "No_Treatment_AND_Treatment_B", no_B)
        writeData(wb, "Treatment_A_AND_Treatment_B", A_B)
        writeData(wb, "All_Three", no_A_B)
    
        saveWorkbook(wb, "upregulated_17h_intersections.xlsx", overwrite = TRUE)
    
        #--
    
        if (!require("VennDiagram")) install.packages("VennDiagram")
        if (!require("openxlsx")) install.packages("openxlsx")
        library(VennDiagram)
        library(openxlsx)
    
        setwd("/mnt/md1/DATA/Data_Tam_RNAseq_2025_WT_deltaIJ_ATCC19606/results/star_salmon/degenes")
    
        # Helper function
        process_and_save_venn <- function(label, files, outfile_prefix) {
          gene_lists <- list()
    
          # Read gene lists
          for (name in names(files)) {
            df <- read.csv(files[[name]], header = TRUE)
            genes <- gsub("^\"|\"$", "", df[[1]])
            gene_lists[[name]] <- genes
          }
    
          # Plot Venn
          venn.diagram(
            x = gene_lists,
            filename = paste0(outfile_prefix, ".png"),
            imagetype = "png",
            output = TRUE,
            col = "transparent",
            fill = c("#66c2a5", "#fc8d62", "#8da0cb"),
            alpha = 0.5,
            cex = 1.5,
            cat.cex = 1.4,
            #cat.pos = 0,
            #cat.dist = 0.05,
            cat.pos = c(-30, 30, 135),   # Move labels around the circles
            cat.dist = c(0.04, 0.04, 0.04),  # Push labels further outside
            main = label,
            main.cex = 1.5
          )
    
          # Intersections
          A <- gene_lists[[1]]
          B <- gene_lists[[2]]
          C <- gene_lists[[3]]
    
          only_A <- setdiff(A, union(B, C))
          only_B <- setdiff(B, union(A, C))
          only_C <- setdiff(C, union(A, B))
    
          AB <- setdiff(intersect(A, B), intersect(intersect(A, B), C))
          AC <- setdiff(intersect(A, C), intersect(intersect(A, C), B))
          BC <- setdiff(intersect(B, C), intersect(intersect(A, B), C))
    
          ABC <- Reduce(intersect, list(A, B, C))
    
          # Save Excel
          wb <- createWorkbook()
          addWorksheet(wb, "Only_A")
          addWorksheet(wb, "Only_B")
          addWorksheet(wb, "Only_C")
          addWorksheet(wb, "A_and_B")
          addWorksheet(wb, "A_and_C")
          addWorksheet(wb, "B_and_C")
          addWorksheet(wb, "All_Three")
    
          writeData(wb, "Only_A", only_A)
          writeData(wb, "Only_B", only_B)
          writeData(wb, "Only_C", only_C)
          writeData(wb, "A_and_B", AB)
          writeData(wb, "A_and_C", AC)
          writeData(wb, "B_and_C", BC)
          writeData(wb, "All_Three", ABC)
    
          saveWorkbook(wb, paste0(outfile_prefix, ".xlsx"), overwrite = TRUE)
        }
    
        ### === UPREGULATED GENES === ###
        process_and_save_venn(
          label = "Upregulated Genes (ΔIJ vs WT, 17h)",
          files = list(
            "No_Treatment" = "deltaIJ_17_vs_WT_17-up.txt",
            "Treatment_A" = "pre_deltaIJ_17_vs_pre_WT_17-up.txt",
            "Treatment_B" = "0_5_deltaIJ_17_vs_0_5_WT_17-up.txt"
          ),
          outfile_prefix = "venn_upregulated_17h"
        )
    
        process_and_save_venn(
          label = "Upregulated Genes (ΔIJ vs WT, 24h)",
          files = list(
            "No_Treatment" = "deltaIJ_24_vs_WT_24-up.txt",
            "Treatment_A" = "pre_deltaIJ_24_vs_pre_WT_24-up.txt",
            "Treatment_B" = "0_5_deltaIJ_24_vs_0_5_WT_24-up.txt"
          ),
          outfile_prefix = "venn_upregulated_24h"
        )
    
        ### === DOWNREGULATED GENES === ###
        process_and_save_venn(
          label = "Downregulated Genes (ΔIJ vs WT, 17h)",
          files = list(
            "No_Treatment" = "deltaIJ_17_vs_WT_17-down.txt",
            "Treatment_A" = "pre_deltaIJ_17_vs_pre_WT_17-down.txt",
            "Treatment_B" = "0_5_deltaIJ_17_vs_0_5_WT_17-down.txt"
          ),
          outfile_prefix = "venn_downregulated_17h"
        )
    
        process_and_save_venn(
          label = "Downregulated Genes (ΔIJ vs WT, 24h)",
          files = list(
            "No_Treatment" = "deltaIJ_24_vs_WT_24-down.txt",
            "Treatment_A" = "pre_deltaIJ_24_vs_pre_WT_24-down.txt",
            "Treatment_B" = "0_5_deltaIJ_24_vs_0_5_WT_24-down.txt"
          ),
          outfile_prefix = "venn_downregulated_24h"
        )
    

KEGG and GO annotations in non-model organisms

https://www.biobam.com/functional-analysis/

  1. Assign KEGG and GO Terms (see diagram above)

    Since your organism is non-model, standard R databases (org.Hs.eg.db, etc.) won’t work. You’ll need to manually retrieve KEGG and GO annotations.

    • Preparing file 1 eggnog_out.emapper.annotations.txt for the R-code below: (KEGG Terms): EggNog based on orthology and phylogenies

      EggNOG-mapper assigns both KEGG Orthology (KO) IDs and GO terms.

      Install EggNOG-mapper:

      mamba create -n eggnog_env python=3.8 eggnog-mapper -c conda-forge -c bioconda  #eggnog-mapper_2.1.12
      mamba activate eggnog_env
      

      Run annotation:

      #diamond makedb --in eggnog6.prots.faa -d eggnog_proteins.dmnd
      mkdir /home/jhuang/mambaforge/envs/eggnog_env/lib/python3.8/site-packages/data/
      download_eggnog_data.py --dbname eggnog.db -y --data_dir /home/jhuang/mambaforge/envs/eggnog_env/lib/python3.8/site-packages/data/
      #NOT_WORKING: emapper.py -i CP059040_gene.fasta -o eggnog_dmnd_out --cpu 60 -m diamond[hmmer,mmseqs] --dmnd_db /home/jhuang/REFs/eggnog_data/data/eggnog_proteins.dmnd
      python ~/Scripts/update_fasta_header.py CP059040_protein_.fasta CP059040_protein.fasta
      emapper.py -i CP059040_protein.fasta -o eggnog_out --cpu 60 --resume
      #----> result annotations.tsv: Contains KEGG, GO, and other functional annotations.
      #---->  470.IX87_14445:
          * 470 likely refers to the organism or strain (e.g., Acinetobacter baumannii ATCC 19606 or another related strain).
          * IX87_14445 would refer to a specific gene or protein within that genome.
      

      Extract KEGG KO IDs from annotations.emapper.annotations.

    • Preparing file 2 blast2go_annot.annot2_ for the R-code below:

    • Basic (GO Terms from 'Blast2GO 5 Basic', saved in blast2go_annot.annot): Using Blast/Diamond + Blast2GO_GUI based on sequence alignment + GO mapping

      • 'Load protein sequences' (Tags: NONE, generated columns: Nr, SeqName) -->
      • Buttons 'blast' (Tags: BLASTED, generated columns: Description, Length, #Hits, e-Value, sim mean),
      • Button 'mapping' (Tags: MAPPED, generated columns: #GO, GO IDs, GO Names), "Mapping finished - Please proceed now to annotation."
      • Button 'annot' (Tags: ANNOTATED, generated columns: Enzyme Codes, Enzyme Names), "Annotation finished." * Used parameter 'Annotation CutOff': The Blast2GO Annotation Rule seeks to find the most specific GO annotations with a certain level of reliability. An annotation score is calculated for each candidate GO which is composed by the sequence similarity of the Blast Hit, the evidence code of the source GO and the position of the particular GO in the Gene Ontology hierarchy. This annotation score cutoff select the most specific GO term for a given GO branch which lies above this value. * Used parameter 'GO Weight' is a value which is added to Annotation Score of a more general/abstract Gene Ontology term for each of its more specific, original source GO terms. In this case, more general GO terms which summarise many original source terms (those ones directly associated to the Blast Hits) will have a higher Annotation Score.
    • Advanced (GO Terms from 'Blast2GO 5 Basic'): Interpro based protein families / domains --> Button interpro

      • Button 'interpro' (Tags: INTERPRO, generated columns: InterPro IDs, InterPro GO IDs, InterPro GO Names) --> "InterProScan Finished - You can now merge the obtained GO Annotations."
    • MERGE the results of InterPro GO IDs (advanced) to GO IDs (basic) and generate final GO IDs, saved in blast2go_annot.annot2

      • Button 'interpro'/'Merge InterProScan GOs to Annotation' --> "Merge (add and validate) all GO terms retrieved via InterProScan to the already existing GO annotation." --> "Finished merging GO terms from InterPro with annotations. Maybe you want to run ANNEX (Annotation Augmentation)."
      • (NOT_USED) Button 'annot'/'ANNEX' --> "ANNEX finished. Maybe you want to do the next step: Enzyme Code Mapping."
    • PREPARING go_terms and ec_terms: annot_* file:

      cut -f1-2 -d$'\t' blast2go_annot.annot2 > blast2go_annot.annot2_

  2. Perform KEGG and GO Enrichment in R

        #BiocManager::install("GO.db")
        #BiocManager::install("AnnotationDbi")
    
        # Load required libraries
        library(openxlsx)  # For Excel file handling
        library(dplyr)     # For data manipulation
        library(tidyr)
        library(stringr)
        library(clusterProfiler)  # For KEGG and GO enrichment analysis
        #library(org.Hs.eg.db)  # Replace with appropriate organism database
        library(GO.db)
        library(AnnotationDbi)
    
        setwd("~/DATA/Data_Tam_RNAseq_2025_WT_deltaIJ_ATCC19606//results/star_salmon/degenes")
        # Step 1: Load the blast2go annotation file with a check for missing columns
        annot_df <- read.table("/home/jhuang/b2gWorkspace_Tam_RNAseq_2024/blast2go_annot.annot2_",
                            header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE)
    
        # If the structure is inconsistent, we can make sure there are exactly 3 columns:
        colnames(annot_df) <- c("GeneID", "Term")
        # Step 2: Filter and aggregate GO and EC terms as before
        go_terms <- annot_df %>%
        filter(grepl("^GO:", Term)) %>%
        group_by(GeneID) %>%
        summarize(GOs = paste(Term, collapse = ","), .groups = "drop")
        ec_terms <- annot_df %>%
        filter(grepl("^EC:", Term)) %>%
        group_by(GeneID) %>%
        summarize(EC = paste(Term, collapse = ","), .groups = "drop")
    
        # Load the results
        #res <- read.csv("deltaIJ_17_vs_WT_17-all.csv")   #up11, down3
        #res <- read.csv("deltaIJ_24_vs_WT_24-all.csv")   #up0, down2
        #res <- read.csv("pre_deltaIJ_17_vs_pre_WT_17-all.csv")  #up238, down90
        #res <- read.csv("pre_deltaIJ_24_vs_pre_WT_24-all.csv")  #up83, down64
        #res <- read.csv("0_5_deltaIJ_17_vs_0_5_WT_17-all.csv")  #up74, down14
        res <- read.csv("0_5_deltaIJ_24_vs_0_5_WT_24-all.csv")  #up1, down3
    
        # Replace empty GeneName with modified GeneID
        res$GeneName <- ifelse(
            res$GeneName == "" | is.na(res$GeneName),
            gsub("gene-", "", res$GeneID),
            res$GeneName
        )
    
        # Remove duplicated genes by selecting the gene with the smallest padj
        duplicated_genes <- res[duplicated(res$GeneName), "GeneName"]
    
        res <- res %>%
        group_by(GeneName) %>%
        slice_min(padj, with_ties = FALSE) %>%
        ungroup()
    
        res <- as.data.frame(res)
        # Sort res first by padj (ascending) and then by log2FoldChange (descending)
        res <- res[order(res$padj, -res$log2FoldChange), ]
        # Read eggnog annotations
        eggnog_data <- read.delim("~/DATA/Data_Tam_RNAseq_2024_AUM_MHB_Urine_ATCC19606/eggnog_out.emapper.annotations.txt", header = TRUE, sep = "\t")
        # Remove the "gene-" prefix from GeneID in res to match eggnog 'query' format
        res$GeneID <- gsub("gene-", "", res$GeneID)
        # Merge eggnog data with res based on GeneID
        res <- res %>%
        left_join(eggnog_data, by = c("GeneID" = "query"))
    
        # DEBUG: NOT_NECESSARY, since res has already GeneName
        ##Convert row names to a new column 'GeneName' in res
        #res_with_geneName <- res %>%
        #mutate(GeneName = rownames(res)) %>%
        #as.data.frame()  # Ensure that it's a regular data frame without row names
        ## View the result
        #head(res_with_geneName)
    
        # Merge with the res dataframe
        # Perform the left joins and rename columns
        res_updated <- res %>%
        left_join(go_terms, by = "GeneID") %>%
        left_join(ec_terms, by = "GeneID") %>% dplyr::select(-EC.x, -GOs.x) %>% dplyr::rename(EC = EC.y, GOs = GOs.y)
    
        # DEBUG: NOT_NECESSARY, since 'GeneName' is already the first column.
        ## Reorder columns to move 'GeneName' as the first column in res_updated
        #res_updated <- res_updated %>%
        #select(GeneName, everything())
    
        ## Count the number of rows in the KEGG_ko, GOs, EC columns that have non-missing values
        #num_non_missing_KEGG_ko <- sum(res_updated$KEGG_ko != "-" & !is.na(res_updated$KEGG_ko))
        #print(num_non_missing_KEGG_ko)
        ##[1] 2030
        #num_non_missing_GOs <- sum(res_updated$GOs != "-" & !is.na(res_updated$GOs))
        #print(num_non_missing_GOs)
        ##[1] 2865 --> 2875
        #num_non_missing_EC <- sum(res_updated$EC != "-" & !is.na(res_updated$EC))
        #print(num_non_missing_EC)
        ##[1] 1701
    
        # Filter up-regulated genes
        up_regulated <- res_updated[res_updated$log2FoldChange > 2 & res_updated$padj < 0.05, ]
        # Filter down-regulated genes
        down_regulated <- res_updated[res_updated$log2FoldChange < -2 & res_updated$padj < 0.05, ]
    
        # Create a new workbook
        wb <- createWorkbook()
        # Add the complete dataset as the first sheet (with annotations)
        addWorksheet(wb, "Complete_Data")
        writeData(wb, "Complete_Data", res_updated)
        # Add the up-regulated genes as the second sheet (with annotations)
        addWorksheet(wb, "Up_Regulated")
        writeData(wb, "Up_Regulated", up_regulated)
        # Add the down-regulated genes as the third sheet (with annotations)
        addWorksheet(wb, "Down_Regulated")
        writeData(wb, "Down_Regulated", down_regulated)
        # Save the workbook to a file
        saveWorkbook(wb, "Gene_Expression_with_Annotations_0_5ΔIJ-24_vs_0_5WT-24.xlsx", overwrite = TRUE)
    
        # Set GeneName as row names after the join
        rownames(res_updated) <- res_updated$GeneName
        res_updated <- res_updated %>% dplyr::select(-GeneName)
        ## Set the 'GeneName' column as row.names
        #rownames(res_updated) <- res_updated$GeneName
        ## Drop the 'GeneName' column since it's now the row names
        #res_updated$GeneName <- NULL
    
        # ---- Perform KEGG enrichment analysis (up_regulated) ----
        gene_list_kegg_up <- up_regulated$KEGG_ko
        gene_list_kegg_up <- gsub("ko:", "", gene_list_kegg_up)
        kegg_enrichment_up <- enrichKEGG(gene = gene_list_kegg_up, organism = 'ko')
        # -- convert the GeneID (Kxxxxxx) to the true GeneID --
        # Step 0: Create KEGG to GeneID mapping
        kegg_to_geneid_up <- up_regulated %>%
        dplyr::select(KEGG_ko, GeneID) %>%
        filter(!is.na(KEGG_ko)) %>%  # Remove missing KEGG KO entries
        mutate(KEGG_ko = str_remove(KEGG_ko, "ko:"))  # Remove 'ko:' prefix if present
        # Step 1: Clean KEGG_ko values (separate multiple KEGG IDs)
        kegg_to_geneid_clean <- kegg_to_geneid_up %>%
        mutate(KEGG_ko = str_remove_all(KEGG_ko, "ko:")) %>%  # Remove 'ko:' prefixes
        separate_rows(KEGG_ko, sep = ",") %>%  # Ensure each KEGG ID is on its own row
        filter(KEGG_ko != "-") %>%  # Remove invalid KEGG IDs ("-")
        distinct()  # Remove any duplicate mappings
        # Step 2.1: Expand geneID column in kegg_enrichment_up
        expanded_kegg <- kegg_enrichment_up %>%
        as.data.frame() %>%
        separate_rows(geneID, sep = "/") %>%  # Split multiple KEGG IDs (Kxxxxx)
        left_join(kegg_to_geneid_clean, by = c("geneID" = "KEGG_ko"), relationship = "many-to-many") %>%  # Explicitly handle many-to-many
        distinct() %>%  # Remove duplicate matches
        group_by(ID) %>%
        summarise(across(everything(), ~ paste(unique(na.omit(.)), collapse = "/")), .groups = "drop")  # Re-collapse results
        #dplyr::glimpse(expanded_kegg)
        # Step 3.1: Replace geneID column in the original dataframe
        kegg_enrichment_up_df <- as.data.frame(kegg_enrichment_up)
        # Remove old geneID column and merge new one
        kegg_enrichment_up_df <- kegg_enrichment_up_df %>%
        dplyr::select(-geneID) %>%  # Remove old geneID column
        left_join(expanded_kegg %>% dplyr::select(ID, GeneID), by = "ID") %>%  # Merge new GeneID column
        dplyr::rename(geneID = GeneID)  # Rename column back to geneID
    
        # ---- Perform KEGG enrichment analysis (down_regulated) ----
        # Step 1: Extract KEGG KO terms from down-regulated genes
        gene_list_kegg_down <- down_regulated$KEGG_ko
        gene_list_kegg_down <- gsub("ko:", "", gene_list_kegg_down)
        # Step 2: Perform KEGG enrichment analysis
        kegg_enrichment_down <- enrichKEGG(gene = gene_list_kegg_down, organism = 'ko')
        # --- Convert KEGG gene IDs (Kxxxxxx) to actual GeneIDs ---
        # Step 3: Create KEGG to GeneID mapping from down_regulated dataset
        kegg_to_geneid_down <- down_regulated %>%
        dplyr::select(KEGG_ko, GeneID) %>%
        filter(!is.na(KEGG_ko)) %>%  # Remove missing KEGG KO entries
        mutate(KEGG_ko = str_remove(KEGG_ko, "ko:"))  # Remove 'ko:' prefix if present
        # Step 4: Clean KEGG_ko values (handle multiple KEGG IDs)
        kegg_to_geneid_down_clean <- kegg_to_geneid_down %>%
        mutate(KEGG_ko = str_remove_all(KEGG_ko, "ko:")) %>%  # Remove 'ko:' prefixes
        separate_rows(KEGG_ko, sep = ",") %>%  # Ensure each KEGG ID is on its own row
        filter(KEGG_ko != "-") %>%  # Remove invalid KEGG IDs ("-")
        distinct()  # Remove duplicate mappings
        # Step 5: Expand geneID column in kegg_enrichment_down
        expanded_kegg_down <- kegg_enrichment_down %>%
        as.data.frame() %>%
        separate_rows(geneID, sep = "/") %>%  # Split multiple KEGG IDs (Kxxxxx)
        left_join(kegg_to_geneid_down_clean, by = c("geneID" = "KEGG_ko"), relationship = "many-to-many") %>%  # Handle many-to-many mappings
        distinct() %>%  # Remove duplicate matches
        group_by(ID) %>%
        summarise(across(everything(), ~ paste(unique(na.omit(.)), collapse = "/")), .groups = "drop")  # Re-collapse results
        # Step 6: Replace geneID column in the original kegg_enrichment_down dataframe
        kegg_enrichment_down_df <- as.data.frame(kegg_enrichment_down) %>%
        dplyr::select(-geneID) %>%  # Remove old geneID column
        left_join(expanded_kegg_down %>% dplyr::select(ID, GeneID), by = "ID") %>%  # Merge new GeneID column
        dplyr::rename(geneID = GeneID)  # Rename column back to geneID
        # View the updated dataframe
        head(kegg_enrichment_down_df)
    
        # Create a new workbook
        wb <- createWorkbook()
        # Save enrichment results to the workbook
        addWorksheet(wb, "KEGG_Enrichment_Up")
        writeData(wb, "KEGG_Enrichment_Up", as.data.frame(kegg_enrichment_up_df))
        # Save enrichment results to the workbook
        addWorksheet(wb, "KEGG_Enrichment_Down")
        writeData(wb, "KEGG_Enrichment_Down", as.data.frame(kegg_enrichment_down_df))
        #saveWorkbook(wb, "KEGG_Enrichment.xlsx", overwrite = TRUE)
    
        # ---- Perform GO enrichment analysis (TODO: extract the merged GO IDs from 'Blast2GO 5 Basic' and adapt the code below!)----
    
        # Define gene list (up-regulated genes)
        gene_list_go_up <- up_regulated$GeneID  # Extract the 149 up-regulated genes
        gene_list_go_down <- down_regulated$GeneID  # Extract the 65 down-regulated genes
    
        # Define background gene set (all genes in res)
        background_genes <- res_updated$GeneID  # Extract the 3646 background genes
    
        # Prepare GO annotation data from res
        go_annotation <- res_updated[, c("GOs","GeneID")]  # Extract relevant columns
        go_annotation <- go_annotation %>%
        tidyr::separate_rows(GOs, sep = ",")  # Split multiple GO terms into separate rows
    
        # Perform GO enrichment analysis, where pAdjustMethod is one of "holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr", "none"
        go_enrichment_up <- enricher(
            gene = gene_list_go_up,                # Up-regulated genes
            TERM2GENE = go_annotation,       # Custom GO annotation
            pvalueCutoff = 1.0,             # Significance threshold
            pAdjustMethod = "BH",
            universe = background_genes      # Define the background gene set
        )
        go_enrichment_up <- as.data.frame(go_enrichment_up)
    
        go_enrichment_down <- enricher(
            gene = gene_list_go_down,                # Up-regulated genes
            TERM2GENE = go_annotation,       # Custom GO annotation
            pvalueCutoff = 1.0,             # Significance threshold
            pAdjustMethod = "BH",
            universe = background_genes      # Define the background gene set
        )
        go_enrichment_down <- as.data.frame(go_enrichment_down)
    
        ## Remove the 'p.adjust' column since no adjusted methods have been applied!
        #go_enrichment_up <- go_enrichment_up[, !names(go_enrichment_up) %in% "p.adjust"]
        # Update the Description column with the term descriptions
        go_enrichment_up$Description <- sapply(go_enrichment_up$ID, function(go_id) {
        # Using select to get the term description
        term <- tryCatch({
            AnnotationDbi::select(GO.db, keys = go_id, columns = "TERM", keytype = "GOID")
        }, error = function(e) {
            message(paste("Error for GO term:", go_id))  # Print which GO ID caused the error
            return(data.frame(TERM = NA))  # In case of error, return NA
        })
    
        if (nrow(term) > 0) {
            return(term$TERM)
        } else {
            return(NA)  # If no description found, return NA
        }
        })
        ## Print the updated data frame
        #print(go_enrichment_up)
    
        ## Remove the 'p.adjust' column since no adjusted methods have been applied!
        #go_enrichment_down <- go_enrichment_down[, !names(go_enrichment_down) %in% "p.adjust"]
        # Update the Description column with the term descriptions
        go_enrichment_down$Description <- sapply(go_enrichment_down$ID, function(go_id) {
        # Using select to get the term description
        term <- tryCatch({
            AnnotationDbi::select(GO.db, keys = go_id, columns = "TERM", keytype = "GOID")
        }, error = function(e) {
            message(paste("Error for GO term:", go_id))  # Print which GO ID caused the error
            return(data.frame(TERM = NA))  # In case of error, return NA
        })
    
        if (nrow(term) > 0) {
            return(term$TERM)
        } else {
            return(NA)  # If no description found, return NA
        }
        })
    
        addWorksheet(wb, "GO_Enrichment_Up")
        writeData(wb, "GO_Enrichment_Up", as.data.frame(go_enrichment_up))
    
        addWorksheet(wb, "GO_Enrichment_Down")
        writeData(wb, "GO_Enrichment_Down", as.data.frame(go_enrichment_down))
    
        # Save the workbook with enrichment results
        saveWorkbook(wb, "KEGG_and_GO_Enrichments_0_5ΔIJ-24_vs_0_5WT-24.xlsx", overwrite = TRUE)
    
        #Error for GO term: GO:0006807: replace GO:0006807  obsolete nitrogen compound metabolic process
        #TODO: marked the color as yellow if the p.adjusted <= 0.05 in GO_enrichment!
    
  3. Finalizing the KEGG and GO Enrichment table

        1. NOTE: geneIDs in KEGG_Enrichment have been already translated from ko to geneID in H0N29_*-format;
        2. NEED_MANUAL_DELETION: p.adjust values have been calculated, we have to filter all records in GO_Enrichment-results by |p.adjust|<=0.05.
        3. DELETE_ALL_q-values, since sometimes the qvalues missing!: If only one record --> no q-values: the missing qvalue is expected here — you can't calculate q-values with only one p-value. The p.adjust (e.g. Benjamini-Hochberg FDR) is still valid because it technically works even with a single p-value, but qvalue requires more data.
    

like unlike

点赞本文的读者

还没有人对此文章表态


本文有评论

没有评论

看文章,发评论,不要沉默


© 2023 XGenes.com Impressum