Processing transposon insertion-site deep sequencing (Tn-seq) data

gene_x 0 like s 86 view s

Tags: processing, pipeline

  1. run tpp https://transit.readthedocs.io/en/latest/transit_running.html

    #-maxreads 10000 or not_given for take all!
    #-primer AGATGTGTATAAGAGACAG     the default primer of Tn5 is TAAGAGACAG!
    #-primer-start-window 0,159  set 0,159 as default!
    
    python3 ~/.local/bin/tpp -bwa /usr/bin/bwa -protocol Tn5 -ref WA-314_m.fna -reads1 ./240405_VH00358_89_AAFC5MTM5/kr1/initial_mutants_rep1_S25_R1_001.fastq -reads2 ./240405_VH00358_89_AAFC5MTM5/kr1/initial_mutants_rep1_S25_R2_001.fastq -output 10_chimera -mismatches initial_mutants_rep1 -bwa-alg mem -primer ACCTACCCCNCCGCTCTC -replicon-ids contig_2_10,contig_2_9,contig_2_8,contig_2_7,contig_2_6,contig_2_5,contig_2_3,contig_2_2,contig_5_10,contig_5_11,contig_5_12,contig_5_13,contig_5_15,contig_5_16,contig_5_17,contig_5_18,contig_5_9,contig_5_8,contig_5_7,contig_5_6,contig_5_5,contig_5_4,contig_5_3,contig_5_2,contig_5_1,contig_4_2,contig_4_1,contig_3_59,contig_3_58,contig_3_57,contig_3_56,contig_3_55,contig_3_54,contig_3_53,contig_3_52,contig_3_51,contig_3_50,contig_3_49,contig_3_48,contig_3_47,contig_3_46,contig_3_44,contig_3_43,contig_3_42,contig_3_41,contig_3_40,contig_3_39,contig_3_38,contig_3_37,contig_3_36,contig_3_35,contig_3_34,contig_3_33,contig_3_32,contig_3_31,contig_3_30,contig_3_29,contig_3_28,contig_3_27,contig_3_26,contig_3_25,contig_3_24,contig_3_23,contig_3_22,contig_3_21,contig_3_20,contig_3_17,contig_3_16,contig_3_15,contig_3_14,contig_3_13,contig_3_12,contig_3_11,contig_3_9,contig_3_8,contig_3_7,contig_3_6,contig_3_5,contig_3_3,contig_3_2,contig_3_1,contig_1_48,contig_1_47,contig_1_46,contig_1_45,contig_1_44,contig_1_43,contig_1_42,contig_1_41,contig_1_40,contig_1_39,contig_1_38,contig_1_37,contig_1_34,contig_1_33,contig_1_32,contig_1_31,contig_1_28,contig_1_27,contig_1_26,contig_1_25,contig_1_24,contig_1_22,contig_1_20,contig_1_19,contig_1_18,contig_1_17,contig_1_16,contig_1_15,contig_1_14,contig_1_13,contig_1_12,contig_1_10,contig_1_8,contig_1_7,contig_1_6,contig_1_5,contig_1_4,contig_1_3,contig_1_2,contig_1_1,contig_C8715,contig_C8943,contig_C9371,contig_C8939,contig_C9357,contig_C8991,contig_C9445,contig_C8689
    mv tpp.cfg initial_mutants_rep1.tpp.cfg
    
    python3 ~/.local/bin/tpp -bwa /usr/bin/bwa -protocol Tn5 -ref WA-314_m.fna -reads1 ./240405_VH00358_89_AAFC5MTM5/kr1/initial_mutants_rep1_S25_R1_001.fastq -reads2 ./240405_VH00358_89_AAFC5MTM5/kr1/initial_mutants_rep1_S25_R2_001.fastq -output 10 -mismatches initial_mutants_rep1 -bwa-alg mem  -replicon-ids contig_2_10,contig_2_9,contig_2_8,contig_2_7,contig_2_6,contig_2_5,contig_2_3,contig_2_2,contig_5_10,contig_5_11,contig_5_12,contig_5_13,contig_5_15,contig_5_16,contig_5_17,contig_5_18,contig_5_9,contig_5_8,contig_5_7,contig_5_6,contig_5_5,contig_5_4,contig_5_3,contig_5_2,contig_5_1,contig_4_2,contig_4_1,contig_3_59,contig_3_58,contig_3_57,contig_3_56,contig_3_55,contig_3_54,contig_3_53,contig_3_52,contig_3_51,contig_3_50,contig_3_49,contig_3_48,contig_3_47,contig_3_46,contig_3_44,contig_3_43,contig_3_42,contig_3_41,contig_3_40,contig_3_39,contig_3_38,contig_3_37,contig_3_36,contig_3_35,contig_3_34,contig_3_33,contig_3_32,contig_3_31,contig_3_30,contig_3_29,contig_3_28,contig_3_27,contig_3_26,contig_3_25,contig_3_24,contig_3_23,contig_3_22,contig_3_21,contig_3_20,contig_3_17,contig_3_16,contig_3_15,contig_3_14,contig_3_13,contig_3_12,contig_3_11,contig_3_9,contig_3_8,contig_3_7,contig_3_6,contig_3_5,contig_3_3,contig_3_2,contig_3_1,contig_1_48,contig_1_47,contig_1_46,contig_1_45,contig_1_44,contig_1_43,contig_1_42,contig_1_41,contig_1_40,contig_1_39,contig_1_38,contig_1_37,contig_1_34,contig_1_33,contig_1_32,contig_1_31,contig_1_28,contig_1_27,contig_1_26,contig_1_25,contig_1_24,contig_1_22,contig_1_20,contig_1_19,contig_1_18,contig_1_17,contig_1_16,contig_1_15,contig_1_14,contig_1_13,contig_1_12,contig_1_10,contig_1_8,contig_1_7,contig_1_6,contig_1_5,contig_1_4,contig_1_3,contig_1_2,contig_1_1,contig_C8715,contig_C8943,contig_C9371,contig_C8939,contig_C9357,contig_C8991,contig_C9445,contig_C8689
    mv tpp.cfg initial_mutants_rep1.tpp.cfg
    
    python3 ~/.local/bin/tpp -bwa /usr/bin/bwa -protocol Tn5 -ref WA-314_m.fna -reads1 ./240405_VH00358_89_AAFC5MTM5/kr3/LB_culture_rep1_S26_R1_001.fastq.gz -reads2 ./240405_VH00358_89_AAFC5MTM5/kr3/LB_culture_rep1_S26_R2_001.fastq.gz -output LB_culture_rep1  -mismatches 1 -bwa-alg mem -replicon-ids contig_2_10,contig_2_9,contig_2_8,contig_2_7,contig_2_6,contig_2_5,contig_2_3,contig_2_2,contig_5_10,contig_5_11,contig_5_12,contig_5_13,contig_5_15,contig_5_16,contig_5_17,contig_5_18,contig_5_9,contig_5_8,contig_5_7,contig_5_6,contig_5_5,contig_5_4,contig_5_3,contig_5_2,contig_5_1,contig_4_2,contig_4_1,contig_3_59,contig_3_58,contig_3_57,contig_3_56,contig_3_55,contig_3_54,contig_3_53,contig_3_52,contig_3_51,contig_3_50,contig_3_49,contig_3_48,contig_3_47,contig_3_46,contig_3_44,contig_3_43,contig_3_42,contig_3_41,contig_3_40,contig_3_39,contig_3_38,contig_3_37,contig_3_36,contig_3_35,contig_3_34,contig_3_33,contig_3_32,contig_3_31,contig_3_30,contig_3_29,contig_3_28,contig_3_27,contig_3_26,contig_3_25,contig_3_24,contig_3_23,contig_3_22,contig_3_21,contig_3_20,contig_3_17,contig_3_16,contig_3_15,contig_3_14,contig_3_13,contig_3_12,contig_3_11,contig_3_9,contig_3_8,contig_3_7,contig_3_6,contig_3_5,contig_3_3,contig_3_2,contig_3_1,contig_1_48,contig_1_47,contig_1_46,contig_1_45,contig_1_44,contig_1_43,contig_1_42,contig_1_41,contig_1_40,contig_1_39,contig_1_38,contig_1_37,contig_1_34,contig_1_33,contig_1_32,contig_1_31,contig_1_28,contig_1_27,contig_1_26,contig_1_25,contig_1_24,contig_1_22,contig_1_20,contig_1_19,contig_1_18,contig_1_17,contig_1_16,contig_1_15,contig_1_14,contig_1_13,contig_1_12,contig_1_10,contig_1_8,contig_1_7,contig_1_6,contig_1_5,contig_1_4,contig_1_3,contig_1_2,contig_1_1,contig_C8715,contig_C8943,contig_C9371,contig_C8939,contig_C9357,contig_C8991,contig_C9445,contig_C8689
    mv tpp.cfg LB_culture_rep1.tpp.cfg
    
    python3 ~/.local/bin/tpp -bwa /usr/bin/bwa -protocol Tn5 -ref WA-314_m.fna -reads1 ./240405_VH00358_89_AAFC5MTM5/kr9/intracellular_mutants_24h_rep1_S29_R1_001.fastq.gz -reads2 ./240405_VH00358_89_AAFC5MTM5/kr9/intracellular_mutants_24h_rep1_S29_R2_001.fastq.gz -output intracellular_mutants_24h_rep1 -mismatches 1 -bwa-alg mem -replicon-ids contig_2_10,contig_2_9,contig_2_8,contig_2_7,contig_2_6,contig_2_5,contig_2_3,contig_2_2,contig_5_10,contig_5_11,contig_5_12,contig_5_13,contig_5_15,contig_5_16,contig_5_17,contig_5_18,contig_5_9,contig_5_8,contig_5_7,contig_5_6,contig_5_5,contig_5_4,contig_5_3,contig_5_2,contig_5_1,contig_4_2,contig_4_1,contig_3_59,contig_3_58,contig_3_57,contig_3_56,contig_3_55,contig_3_54,contig_3_53,contig_3_52,contig_3_51,contig_3_50,contig_3_49,contig_3_48,contig_3_47,contig_3_46,contig_3_44,contig_3_43,contig_3_42,contig_3_41,contig_3_40,contig_3_39,contig_3_38,contig_3_37,contig_3_36,contig_3_35,contig_3_34,contig_3_33,contig_3_32,contig_3_31,contig_3_30,contig_3_29,contig_3_28,contig_3_27,contig_3_26,contig_3_25,contig_3_24,contig_3_23,contig_3_22,contig_3_21,contig_3_20,contig_3_17,contig_3_16,contig_3_15,contig_3_14,contig_3_13,contig_3_12,contig_3_11,contig_3_9,contig_3_8,contig_3_7,contig_3_6,contig_3_5,contig_3_3,contig_3_2,contig_3_1,contig_1_48,contig_1_47,contig_1_46,contig_1_45,contig_1_44,contig_1_43,contig_1_42,contig_1_41,contig_1_40,contig_1_39,contig_1_38,contig_1_37,contig_1_34,contig_1_33,contig_1_32,contig_1_31,contig_1_28,contig_1_27,contig_1_26,contig_1_25,contig_1_24,contig_1_22,contig_1_20,contig_1_19,contig_1_18,contig_1_17,contig_1_16,contig_1_15,contig_1_14,contig_1_13,contig_1_12,contig_1_10,contig_1_8,contig_1_7,contig_1_6,contig_1_5,contig_1_4,contig_1_3,contig_1_2,contig_1_1,contig_C8715,contig_C8943,contig_C9371,contig_C8939,contig_C9357,contig_C8991,contig_C9445,contig_C8689
    mv tpp.cfg intracellular_mutants_24h_rep1.tpp.cfg
    
    python3 ~/.local/bin/tpp -bwa /usr/bin/bwa -protocol Tn5 -ref WA-314_m.fna -reads1 ./240405_VH00358_89_AAFC5MTM5/kr8/extracellular_mutants_24h_rep2_S28_R1_001.fastq.gz -reads2 ./240405_VH00358_89_AAFC5MTM5/kr8/extracellular_mutants_24h_rep2_S28_R2_001.fastq.gz -output extracellular_mutants_24h_rep2 -mismatches 1 -bwa-alg mem -replicon-ids contig_2_10,contig_2_9,contig_2_8,contig_2_7,contig_2_6,contig_2_5,contig_2_3,contig_2_2,contig_5_10,contig_5_11,contig_5_12,contig_5_13,contig_5_15,contig_5_16,contig_5_17,contig_5_18,contig_5_9,contig_5_8,contig_5_7,contig_5_6,contig_5_5,contig_5_4,contig_5_3,contig_5_2,contig_5_1,contig_4_2,contig_4_1,contig_3_59,contig_3_58,contig_3_57,contig_3_56,contig_3_55,contig_3_54,contig_3_53,contig_3_52,contig_3_51,contig_3_50,contig_3_49,contig_3_48,contig_3_47,contig_3_46,contig_3_44,contig_3_43,contig_3_42,contig_3_41,contig_3_40,contig_3_39,contig_3_38,contig_3_37,contig_3_36,contig_3_35,contig_3_34,contig_3_33,contig_3_32,contig_3_31,contig_3_30,contig_3_29,contig_3_28,contig_3_27,contig_3_26,contig_3_25,contig_3_24,contig_3_23,contig_3_22,contig_3_21,contig_3_20,contig_3_17,contig_3_16,contig_3_15,contig_3_14,contig_3_13,contig_3_12,contig_3_11,contig_3_9,contig_3_8,contig_3_7,contig_3_6,contig_3_5,contig_3_3,contig_3_2,contig_3_1,contig_1_48,contig_1_47,contig_1_46,contig_1_45,contig_1_44,contig_1_43,contig_1_42,contig_1_41,contig_1_40,contig_1_39,contig_1_38,contig_1_37,contig_1_34,contig_1_33,contig_1_32,contig_1_31,contig_1_28,contig_1_27,contig_1_26,contig_1_25,contig_1_24,contig_1_22,contig_1_20,contig_1_19,contig_1_18,contig_1_17,contig_1_16,contig_1_15,contig_1_14,contig_1_13,contig_1_12,contig_1_10,contig_1_8,contig_1_7,contig_1_6,contig_1_5,contig_1_4,contig_1_3,contig_1_2,contig_1_1,contig_C8715,contig_C8943,contig_C9371,contig_C8939,contig_C9357,contig_C8991,contig_C9445,contig_C8689
    mv tpp.cfg extracellular_mutants_24h_rep2.tpp.cfg
    
    python3 ~/.local/bin/tpp -bwa /usr/bin/bwa -protocol Tn5 -ref WA-314_m.fna -reads1 ./240405_VH00358_89_AAFC5MTM5/kr6/growthout_control_24h_rep2_S27_R1_001.fastq.gz -reads2 ./240405_VH00358_89_AAFC5MTM5/kr6/growthout_control_24h_rep2_S27_R2_001.fastq.gz -output growthout_control_24h_rep2 -mismatches 1 -bwa-alg mem -replicon-ids contig_2_10,contig_2_9,contig_2_8,contig_2_7,contig_2_6,contig_2_5,contig_2_3,contig_2_2,contig_5_10,contig_5_11,contig_5_12,contig_5_13,contig_5_15,contig_5_16,contig_5_17,contig_5_18,contig_5_9,contig_5_8,contig_5_7,contig_5_6,contig_5_5,contig_5_4,contig_5_3,contig_5_2,contig_5_1,contig_4_2,contig_4_1,contig_3_59,contig_3_58,contig_3_57,contig_3_56,contig_3_55,contig_3_54,contig_3_53,contig_3_52,contig_3_51,contig_3_50,contig_3_49,contig_3_48,contig_3_47,contig_3_46,contig_3_44,contig_3_43,contig_3_42,contig_3_41,contig_3_40,contig_3_39,contig_3_38,contig_3_37,contig_3_36,contig_3_35,contig_3_34,contig_3_33,contig_3_32,contig_3_31,contig_3_30,contig_3_29,contig_3_28,contig_3_27,contig_3_26,contig_3_25,contig_3_24,contig_3_23,contig_3_22,contig_3_21,contig_3_20,contig_3_17,contig_3_16,contig_3_15,contig_3_14,contig_3_13,contig_3_12,contig_3_11,contig_3_9,contig_3_8,contig_3_7,contig_3_6,contig_3_5,contig_3_3,contig_3_2,contig_3_1,contig_1_48,contig_1_47,contig_1_46,contig_1_45,contig_1_44,contig_1_43,contig_1_42,contig_1_41,contig_1_40,contig_1_39,contig_1_38,contig_1_37,contig_1_34,contig_1_33,contig_1_32,contig_1_31,contig_1_28,contig_1_27,contig_1_26,contig_1_25,contig_1_24,contig_1_22,contig_1_20,contig_1_19,contig_1_18,contig_1_17,contig_1_16,contig_1_15,contig_1_14,contig_1_13,contig_1_12,contig_1_10,contig_1_8,contig_1_7,contig_1_6,contig_1_5,contig_1_4,contig_1_3,contig_1_2,contig_1_1,contig_C8715,contig_C8943,contig_C9371,contig_C8939,contig_C9357,contig_C8991,contig_C9445,contig_C8689
    mv tpp.cfg growthout_control_24h_rep2.tpp.cfg
    
    #END
    
    cp initial_mutants_rep1.tn_stats initial_mutants_rep1.tn_stats_
    #Delete all general statistics before the table data in initial_mutants_rep1.tn_stats_; delete the content after "# FR_corr (Fwd templates vs. Rev templates):"
    sed -i 's/read_count (TA sites only, for Himar1)/read_counts/g' *.tn_stats_
    sed -i 's/NZ_mean (among templates)/NZ_mean (mean template count over non-zero TA sites)/g' *.tn_stats_
    
    python3 ../parse_tn_stats.py initial_mutants_rep1.tn_stats_ initial_mutants_rep1.tn_stats.xlsx
    python3 ../parse_tn_stats.py LB_culture_rep1.tn_stats_ LB_culture_rep1.tn_stats.xlsx
    
    #calculate the sum of the first and second columns by "=SUM(B2:B130)" and "=SUM(C2:C130)"  441057 and 276060
    
    mkdir initial_mutants_rep1_wig
    mv *.wig initial_mutants_rep1_wig/
    zip -r initial_mutants_rep1_wig.zip initial_mutants_rep1_wig/
    
    #The counts-files are too big, not nessasary to send:
    #~/Tools/csv2xls-0.4/csv_to_xls.py initial_mutants_rep1.tn_stats_ *.counts -d$',' -o initial_mutants_rep1.stats.xls;
    
    #   contig_1_10 699 411
    #   contig_1_8  3206    2031
    #   contig_1_7  3787    2376
    #   contig_1_6  4604    2871
    #   contig_1_5  2794    1765
    #   contig_1_4  83  58
    #   contig_1_3  2944    1882
    #   contig_1_2  15446   9678
    #   contig_1_1  14391   8954
    
  2. delete PCR-duplicate for checking if the template counts in tpp are correct

    To address the issue of PCR duplicates in paired-end sequencing data (which you referred to as "PCA-duplicate," but I believe you meant "PCR duplicate"), you can use tools designed for post-alignment processing of SAM/BAM files. These tools identify and remove or mark duplicates where both reads of a pair are identical to another pair in the library, often indicating that they are duplicates resulting from PCR amplification rather than unique sequencing events.
    
    Step 0. Install Samtools and Picard Tools for Removing PCR Duplicates: Both Samtools and Picard are widely used for manipulating SAM/BAM files, including removing duplicates. Here's how you can use Picard to remove PCR duplicates:
    
        conda install -c bioconda picard samtools
    
    Step 1: Convert SAM to BAM and sort the BAM file
    If your file is in SAM format, you need to convert it to BAM format first using Samtools; PCR duplicate removal requires that the BAM file be sorted by coordinate. This can be done using Samtools:
    
        samtools view -Sb initial_mutants_rep1.sam > initial_mutants_rep1.bam
        samtools sort initial_mutants_rep1.bam -o initial_mutants_rep1_sorted.bam
        #557566 + 0 read1
        #557566 + 0 read2
        samtools index initial_mutants_rep1_sorted.bam
    
        #   contig_1_1  14391   8954
        #@SQ     SN:gi|420257081|ref|NZ_AKKR01000009.1|contig_1_1        LN:84292
    
        # Extract reads from contig_1_1
        samtools view -b initial_mutants_rep1_sorted.bam "gi|420257081|ref|NZ_AKKR01000009.1|contig_1_1" > contig_1_1.bam
        # Run flagstat on the filtered BAM file
        samtools flagstat contig_1_1.bam
        #16589 + 0 read1
        #16579 + 0 read2
    
    Step 3: Mark or Remove Duplicates Using Picard
    Picard Tools can be used to mark duplicates. Here, I'll show you how to remove them:
    
    # -Xmx4g between java and -jar
    # java -jar /usr/local/bin/picard.jar
            picard MarkDuplicates \
            I=contig_1_1.bam \
            O=contig_1_1_no_duplicates.bam \
            M=marked_dup_metrics.txt \
            REMOVE_DUPLICATES=true
    # ## METRICS CLASS        picard.sam.DuplicationMetrics
    #LIBRARY UNPAIRED_READS_EXAMINED READ_PAIRS_EXAMINED     SECONDARY_OR_SUPPLEMENTARY_RDS  UNMAPPED_READS  UNPAIRED_READ_DUPLICATES        READ_PAIR_DUPLICATES    READ_PAIR_OPTICAL_DUPLICATES    PERCENT_DUPLICATION     ESTIMATED_LIBRARY_SIZE
    #Unknown Library 111     16473   81      111     90      6114    0       0.372629        16270
    
        samtools flagstat contig_1_1_no_duplicates.bam
        #10405 + 0 read1
        #10445 + 0 read2
    
    This command marks duplicates (PCR and optical) and removes them, outputting a file without duplicates. The M option specifies where to write metrics about the duplicates.
    
    Step 3 (Alternatively). Using samtools to Remove Duplicates Directly
    
    Alternatively, if you prefer a simple and fast tool, samtools has a rmdup utility, but it's less versatile compared to Picard:
    
        samtools rmdup -S input_sorted.bam output_no_duplicates.bam
    
    Note that samtools rmdup is deprecated in recent versions of samtools because it does not handle all edge cases as well as Picard.
    
    Considerations
    Read Alignment: Ensure that your reads are aligned correctly and that the SAM/BAM files are error-free before duplicate removal.
    Optical vs. PCR Duplicates: Picard differentiates between PCR duplicates (originating from the same fragment of DNA) and optical duplicates (artifacts from the sequencing platform). Ensure that this distinction is clear if relevant for your analysis.
    Quality Checks: After removing duplicates, it's wise to perform quality control checks to understand how much of your data was affected by duplicates.
    
  3. Check how the failed trimmed reads not contain the the transposon:genomic boundary TAAGAGACAG

    #-3.1- find read sequences
    >VH00358:89:AAFC5MTM5:1:1101:63544:1019_:N:0:TTTCTCTA+CTCGACG
    ACCTACCCCNCCGCTCTCATCAACCCAATAACGCAGGCAATCAAGCACCCACTGCATCACATAAGGTTGGCTAAGGCGCAATGTATTGCCACAACCGGTCATGTTGTCATATTCACCCTCAGAGGTGAGCCAGTAGTAGCTGGCATTGTCGATCCCACG
    >VH00358:89:AAFC5MTM5:1:1101:63960:1019_:N:0:TTTCTCTA+CTCGACG
    CAAAGAGGGGGGCGAAAAGATTTTAAACGATCTTGGCGAAATGAATTTTGAGTTTGTCGTGTGTGACTCACCTGCCGGTATCGAAAGCGGTGCGTTGATGGCACTGTATTTTGCTGACG
    >VH00358:89:AAFC5MTM5:1:1101:64150:1019_:N:0:TTTCTCTA+CTCGACG
    TCTTTACCGGGGATGGGACGCAAGATCTGCGCGCACTGGAACCGGCTTATGTTTCCTCCGTTGACAGTGGGAATCTGGCTGGGCATTTGATTGTACTGGCCAATACCTGTGAAGAGTGGGCCGCAGAACCCTTAGCGGCCAACGGGGCCAAGGGATTG
    >VH00358:89:AAFC5MTM5:1:1101:65400:1019_:N:0:TTTCTCTA+CTCGACG
    CACCTACCCCCCAGCTCTCATCAACCACAATTGACGCAACATCAGCTGGCGCCATGGCATTAATAACAAACTGGATGAATGGCCCCTGGGATCAACAGCACGAACAAACCGCCGTGTGGTGCCATCAGTTGCGCGCCGAAGGCCATTGACAGAGCACC
    >VH00358:89:AAFC5MTM5:1:1101:62048:1038_:N:0:TTTCTCTA+CTCGACG
    CCTACACCCCCGCTCTCATCAACCGCAGCAAAGAAAACGAAAATAATGAAGTCATAAAACTCAAGCGCTCCGCCTAAAGCCGCCAGAGTGAGGGTTTTATAATCTTGCTTATTCAGCCGACGGTTATGAT
    >VH00358:89:AAFC5MTM5:1:1101:64623:1038_:N:0:TTTCTCTA+CTCGACG
    CCTACACCCCAGCTCTCATCAACCAATAACCAGTCAACATCACTGACATCATGTTGCTGGCAATATTCCAGCAGTGCATCTGGCGCCCAAAAACGCGCGCGATCCCCTGCTCGCTGACTTAATACTTGAGTGGCGGCCTTGCCCGGCCCAGTAACCCA
    >VH00358:89:AAFC5MTM5:1:1101:65343:1038_:N:0:TTTCTCTA+CTCGACG
    ACCTACCACCAAGCTCTCATCAACCTGAATGGATTGAGGGCTACGCTCCCAGATCTGTTCTGCCAATTGGCGCAGATGGTTATCGGTGCAATACAAATGCACAAATGACAAATCGTCTCTTACATCTCAGCTCGATAAACTGCCCTTTGGCATAATGC
    >VH00358:89:AAFC5MTM5:1:1101:63317:1057_:N:0:GTTCTCTA+CTCGACG
    GTTGGCGTCCGGCATCTGCATATCACTCATAAAGCGCCTCGATAATCCCACGGATATCCTGGTCGCTAGCTGGGCGCGGGTTGGTGCGCAATGTGAGCTCGGCCAGGGCTGCAGCAATCATATCGGGCAGATGTTGCTGCAATTGGCGCTCATTAACTT
    >VH00358:89:AAFC5MTM5:1:1101:63430:1057_:N:0:TTTCTCTA+CTCGACG
    GCACTGGAAGAGCCGACTAGCCTCAATACTCTTGAACTGCTACCGGAATTATTTGCCGCCAATATTGCCTCGGTGAAAATTGAAGGGCGTCAGCGCAGCCCGGCTTATGTCAGCCAGGTGGCGAAAGTGTGGCGGCAGGCAATTGACCGCTATCTGGC
    >VH00358:89:AAFC5MTM5:1:1101:58299:1076_:N:0:TTTCTCTA+CTCGACG
    TAAATAGGCCAGACTTGAAATCACACGATCCGGCCAGCGATT
    
    #-3.2- generate the genomic sequences
    blastn -db ../WA-314_m.fna -query initial_mutants_rep1.trimmed1_failed_trim_n20 -out failed_trim_n20_on_yersinia.blastn -evalue 10000 -num_threads 124 -outfmt 6 -max_target_seqs 1
    
    VH00358:89:AAFC5MTM5:1:1101:63544:1019_:N:0:TTTCTCTA+CTCGACG    gi|420260421|ref|NZ_AKKR01000094.1|contig_3_51  100.000 141     0       0       19      159     20552   20692   1.20e-70        261
    VH00358:89:AAFC5MTM5:1:1101:63960:1019_:N:0:TTTCTCTA+CTCGACG    gi|420258256|ref|NZ_AKKR01000038.1|contig_1_38  98.319  119     2       0       1       119     43674   43556   3.14e-55        209
    VH00358:89:AAFC5MTM5:1:1101:64150:1019_:N:0:TTTCTCTA+CTCGACG    gi|420257402|ref|NZ_AKKR01000018.1|contig_1_12  98.101  158     3       0       1       158     59515   59672   4.26e-75        276
    VH00358:89:AAFC5MTM5:1:1101:65400:1019_:N:0:TTTCTCTA+CTCGACG    gi|420260713|ref|NZ_AKKR01000106.1|contig_5_2   100.000 57      0       0       18      74      78361   78417   5.91e-24        106
    VH00358:89:AAFC5MTM5:1:1101:62048:1038_:N:0:TTTCTCTA+CTCGACG    gi|420257081|ref|NZ_AKKR01000009.1|contig_1_1   100.000 110     0       0       21      130     78749   78640   1.62e-53        204
    VH00358:89:AAFC5MTM5:1:1101:64623:1038_:N:0:TTTCTCTA+CTCGACG    gi|420259377|ref|NZ_AKKR01000067.1|contig_3_23  99.329  149     0       1       10      158     54935   55082   7.13e-73        268
    VH00358:89:AAFC5MTM5:1:1101:65343:1038_:N:0:TTTCTCTA+CTCGACG    gi|420258810|ref|NZ_AKKR01000051.1|contig_3_3   99.301  143     1       0       16      158     46439   46581   4.29e-70        259
    VH00358:89:AAFC5MTM5:1:1101:63317:1057_:N:0:GTTCTCTA+CTCGACG    gi|420257889|ref|NZ_AKKR01000029.1|contig_1_25  96.226  159     6       0       1       159     36032   36190   1.20e-70        261
    VH00358:89:AAFC5MTM5:1:1101:63430:1057_:N:0:TTTCTCTA+CTCGACG    gi|420260713|ref|NZ_AKKR01000106.1|contig_5_2   100.000 158     0       0       1       158     54938   55095   4.23e-80        292
    VH00358:89:AAFC5MTM5:1:1101:58299:1076_:N:0:TTTCTCTA+CTCGACG    gi|420259608|ref|NZ_AKKR01000075.1|contig_3_31  97.619  42      1       0       1       42      25535   25576   1.04e-14        73.1
    
    samtools faidx WA-314_m.fna "gi|420260421|ref|NZ_AKKR01000094.1|contig_3_51":20534-20692 > genome_rg_for_read1.fasta
    samtools faidx WA-314_m.fna "gi|420258256|ref|NZ_AKKR01000038.1|contig_1_38":43556-43674 > genome_rg_for_read2.fasta
    samtools faidx WA-314_m.fna "gi|420257402|ref|NZ_AKKR01000018.1|contig_1_12":59515-59672 > genome_rg_for_read3.fasta
    samtools faidx WA-314_m.fna "gi|420260713|ref|NZ_AKKR01000106.1|contig_5_2":78344-78501 > genome_rg_for_read4.fasta
    samtools faidx WA-314_m.fna "gi|420257081|ref|NZ_AKKR01000009.1|contig_1_1":78640-78769 > genome_rg_for_read5.fasta
    samtools faidx WA-314_m.fna "gi|420259377|ref|NZ_AKKR01000067.1|contig_3_23":54926-55082 > genome_rg_for_read6.fasta
    samtools faidx WA-314_m.fna "gi|420258810|ref|NZ_AKKR01000051.1|contig_3_3":46424-46581 > genome_rg_for_read7.fasta
    samtools faidx WA-314_m.fna "gi|420257889|ref|NZ_AKKR01000029.1|contig_1_25":36032-36190 > genome_rg_for_read8.fasta
    samtools faidx WA-314_m.fna "gi|420260713|ref|NZ_AKKR01000106.1|contig_5_2":54938-55095 > genome_rg_for_read9.fasta
    samtools faidx WA-314_m.fna "gi|420259608|ref|NZ_AKKR01000075.1|contig_3_31":25535-25576 > genome_rg_for_read10.fasta
    
    #-3.3- transposon amplicon start sequence
    >Transposon_ampli_start
    ACCTACAACAAAGCTCTCATCAAC CGTGGCGGGGATCCTCTAGAGTCGACCTGCAGGCATGCAAGCTTCAGGGTTGAGATGTGTA TAAGAGACAG
    
    #-3.4- generate 10 failed_${read}_group.fasta and multiple align them
    for read in read1 read2 read3 read4 read5 read6 read7 read8 read9 read10; do
    cat failed_${read}.fasta genome_rg_for_${read}.fasta transposon_ampli_start.fasta > failed_${read}_group.fasta
    mafft --adjustdirection --clustalout failed_${read}_group.fasta > failed_${read}_group.aln
    done
    
    cat failed_read1_group.aln failed_read2_group.aln failed_read3_group.aln failed_read4_group.aln failed_read5_group.aln failed_read6_group.aln failed_read7_group.aln failed_read8_group.aln failed_read9_group.aln failed_read10_group.aln > failed_reads_group.aln
    
    #-3.5- show the results in checking_failed_reads.pdf
    
  4. Source code of parse_tn_stats.py used in the point 1.

    import argparse
    import pandas as pd
    
    def parse_tn_stats_detailed(file_path):
        with open(file_path, 'r') as file:
            lines = file.readlines()
    
        print("Number of lines read:", len(lines))
        if lines:
            print("First few lines:", lines[:10])
    
        metrics_df = pd.DataFrame()
        current_metric = None
        metric_data = {}
    
        for line in lines:
            if line.startswith('# ') and not line.startswith('#  '):  # Ensure it's not a contig data line  # Metric definition
                if current_metric and metric_data:
                    # Save data before starting new metric
                    for contig, value in metric_data.items():
                        metrics_df.at[contig, current_metric] = value
                    metric_data = {}
                # Extract the metric name
                current_metric = line.split(':')[0].strip('# ')
                print("Processing new metric:", current_metric)  # Debug output
            else:  # Data lines for contigs under the current metric
                parts = line.strip().split(':')
                contig = parts[0].strip()
                value = parts[1].strip()
                try:
                    value = float(value) if value.lower() != 'nan' else pd.NA
                except ValueError:
                    value = pd.NA  # In case of conversion failure
                metric_data[contig] = value
    
        # Capture the last metric data after finishing all lines
        if current_metric and metric_data:
            for contig, value in metric_data.items():
                metrics_df.at[contig, current_metric] = value
    
        metrics_df.fillna(value=pd.NA, inplace=True)
        return metrics_df
    
    def main():
        parser = argparse.ArgumentParser(description="Parse detailed TN stats from a file and output to Excel.")
        parser.add_argument("input_file", help="Input file path for TN stats.")
        parser.add_argument("output_file", help="Output Excel file path.")
        args = parser.parse_args()
    
        # Parse the detailed metrics from the provided file path
        detailed_metrics_df = parse_tn_stats_detailed(args.input_file)
    
        # Save the DataFrame to an Excel file
        with pd.ExcelWriter(args.output_file, engine='openpyxl') as writer:
            detailed_metrics_df.to_excel(writer, sheet_name='Detailed Metrics')
    
        print(f"Final DataFrame saved to Excel: {args.output_file}")
    
    if __name__ == "__main__":
        main()
    

like unlike

点赞本文的读者

还没有人对此文章表态


本文有评论

没有评论

看文章,发评论,不要沉默


© 2023 XGenes.com Impressum