Processing for Data_Tam_DNAseq_2025_ATCC19606

gene_x 0 like s 9 view s

Tags: pipeline

  1. Download the raw data.

    see README_Downloads

  2. Call variant calling using snippy

    ln -s ~/Tools/bacto/db/ .;
    ln -s ~/Tools/bacto/envs/ .;
    ln -s ~/Tools/bacto/local/ .;
    cp ~/Tools/bacto/Snakefile .;
    cp ~/Tools/bacto/bacto-0.1.json .;
    cp ~/Tools/bacto/cluster.json .;
    
    mkdir raw_data; cd raw_data;
    
    # Note that the names must be ending with fastq.gz
    ln -s ../RSMD00304/X101SC24065637-Z01/X101SC24065637-Z01-J002/01.RawData/W2/W2_1.fq.gz W2_R1.fastq.gz
    ln -s ../RSMD00304/X101SC24065637-Z01/X101SC24065637-Z01-J002/01.RawData/W2/W2_2.fq.gz W2_R2.fastq.gz
    ln -s ../RSMD00304/X101SC24065637-Z01/X101SC24065637-Z01-J002/01.RawData/W3/W3_1.fq.gz W3_R1.fastq.gz
    ln -s ../RSMD00304/X101SC24065637-Z01/X101SC24065637-Z01-J002/01.RawData/W3/W3_2.fq.gz W3_R2.fastq.gz
    ln -s ../RSMD00304/X101SC24065637-Z01/X101SC24065637-Z01-J002/01.RawData/W4/W4_1.fq.gz W4_R1.fastq.gz
    ln -s ../RSMD00304/X101SC24065637-Z01/X101SC24065637-Z01-J002/01.RawData/W4/W4_2.fq.gz W4_R2.fastq.gz
    ln -s ../RSMD00304/X101SC24065637-Z01/X101SC24065637-Z01-J002/01.RawData/Y2/Y2_1.fq.gz Y2_R1.fastq.gz
    ln -s ../RSMD00304/X101SC24065637-Z01/X101SC24065637-Z01-J002/01.RawData/Y2/Y2_2.fq.gz Y2_R2.fastq.gz
    ln -s ../RSMD00304/X101SC24065637-Z01/X101SC24065637-Z01-J002/01.RawData/Y3/Y3_1.fq.gz Y3_R1.fastq.gz
    ln -s ../RSMD00304/X101SC24065637-Z01/X101SC24065637-Z01-J002/01.RawData/Y3/Y3_2.fq.gz Y3_R2.fastq.gz
    ln -s ../RSMD00304/X101SC24065637-Z01/X101SC24065637-Z01-J002/01.RawData/Y4/Y4_1.fq.gz Y4_R1.fastq.gz
    ln -s ../RSMD00304/X101SC24065637-Z01/X101SC24065637-Z01-J002/01.RawData/Y4/Y4_2.fq.gz Y4_R2.fastq.gz
    
    #download CP059040.gb from GenBank
    mv ~/Downloads/sequence\(1\).gb db/CP059040.gb
    #setting the following in bacto-0.1.json
    
        "fastqc": false,
        "taxonomic_classifier": false,
        "assembly": true,
        "typing_ariba": false,
        "typing_mlst": true,
        "pangenome": true,
        "variants_calling": true,
        "phylogeny_fasttree": true,
        "phylogeny_raxml": true,
        "recombination": false, (due to gubbins-error set false)
    
        "genus": "Acinetobacter",
        "kingdom": "Bacteria",
        "species": "baumannii",  (in both prokka and mykrobe)
        "reference": "db/CP059040.gb"
    
    mamba activate /home/jhuang/miniconda3/envs/bengal3_ac3
    (bengal3_ac3) /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    
  3. Download complete_genome_470_ena_taxon_descendants_count.fasta (Done in ~/REFs)

    #Acinetobacter baumannii Taxonomy ID: 470
    #esearch -db nucleotide -query "txid470[Organism:exp]" | efetch -format fasta -email j.huang@uke.de > genome_470_ncbi.fasta
    #python ~/Scripts/filter_fasta.py genome_470_ncbi.fasta complete_genome_470_ncbi.fasta  #
    
    # ---- Download related genomes from ENA ----
    https://www.ebi.ac.uk/ena/browser/view/470
    #Click "Sequence" and download "Counts" (13059) and "Taxon descendants count" (16091) if there is enough time! Downloading time points is 28.02.2025.
    python ~/Scripts/filter_fasta.py  ena_470_sequence.fasta complete_genome_470_ena_taxon_descendants_count.fasta  #16091-->920
    #python ~/Scripts/filter_fasta.py ena_470_sequence_Counts.fasta complete_genome_470_ena_Counts.fasta  #xxx, 5.8G
    
  4. Run vrap

    #replace --virus to the specific taxonomy (e.g. Acinetobacter baumannii) --> change virus_user_db --> specific_bacteria_user_db
    cp trimmed/Y2_trimmed_P*.fastq .
    cp trimmed/W2_trimmed_P*.fastq .
    gzip *.fastq
    ln -s ~/Tools/vrap/ .
    mamba activate /home/jhuang/miniconda3/envs/vrap
    (vrap) vrap/vrap.py  -1 Y2_trimmed_P_1.fastq.gz -2 Y2_trimmed_P_2.fastq.gz -o vrap_Y2 --bt2idx=/home/jhuang/REFs/genome --host=/home/jhuang/REFs/genome.fa --virus=/home/jhuang/REFs/complete_genome_470_ena_taxon_descendants_count.fasta --nt=/mnt/nvme1n1p1/blast/nt --nr=/mnt/nvme1n1p1/blast/nr  -t 100 -l 200  -g
    (vrap) vrap/vrap.py  -1 W2_trimmed_P_1.fastq.gz -2 W2_trimmed_P_2.fastq.gz -o vrap_W2 --bt2idx=/home/jhuang/REFs/genome --host=/home/jhuang/REFs/genome.fa --virus=/home/jhuang/REFs/complete_genome_470_ena_taxon_descendants_count.fasta --nt=/mnt/nvme1n1p1/blast/nt --nr=/mnt/nvme1n1p1/blast/nr  -t 100 -l 200  -g
    
  5. Using spandx calling variants (almost the same results to the one from viral-ngs!)

    #DONE
    mkdir ~/miniconda3/envs/spandx/share/snpeff-5.1-2/data/CP059040
    cp db/CU459141.gb  ~/miniconda3/envs/spandx/share/snpeff-5.1-2/data/CP059040/genes.gbk
    vim ~/miniconda3/envs/spandx/share/snpeff-5.1-2/snpEff.config
    #CU459141.chromosome : CU459141
    /home/jhuang/miniconda3/envs/spandx/bin/snpEff build -c ~/miniconda3/envs/spandx/share/snpeff-5.1-2/snpEff.config -genbank CU459141 -v   #-d
    #00:00:02 Saving sequences for chromosome 'CU459141.1 ...
    mamba activate /home/jhuang/miniconda3/envs/bengal3_ac3
    /home/jhuang/miniconda3/envs/bengal3_ac3/bin/snpEff build -c ~/miniconda3/envs/spandx/share/snpeff-5.1-2/snpEff.config -genbank CP059040 -v
    
    ~/Scripts/genbank2fasta.py CP059040.gb
    mv CP059040.gb_converted.fna CP059040.fasta    #rename "CP059040.1 xxxxx" to "CP059040" in the fasta-file
    
    ln -s /home/jhuang/Tools/spandx/ spandx
    mamba activate /home/jhuang/miniconda3/envs/spandx
    (spandx) nextflow run spandx/main.nf --fastq "trimmed/*_P_{1,2}.fastq" --ref CP059040.fasta --annotation --database CP059040 -resume  # 386 records in Outputs/Phylogeny_and_annotation/All_SNPs_indels_annotated.txt
    
    #DEBUG the results of spandx
    cd Outputs/Master_vcf
    (spandx) cp -r ../../snippy/Y2/reference .
    (spandx) cp ../../spandx/bin/SNP_matrix.sh ./
    #Adapt "snpEff eff -no-downstream -no-intergenic -ud 100 -formatEff -v ${variant_genome_path} out.vcf > out.annotated.vcf" to
    "/home/jhuang/miniconda3/envs/bengal3_ac3/bin/snpEff eff -no-downstream -no-intergenic -ud 100 -formatEff -c reference/snpeff.config -dataDir . ref out.vcf > out.annotated.vcf" in SNP_matrix.sh
    (spandx) bash SNP_matrix.sh CP059040 .
    
    python3 ~/Scripts/summarize_snippy_res.py snippy
    grep -v "None,,,,,,None,None" summary_snps_indels.csv > summary_snps_indels_.csv  # 29 records
    
    diff snippy/id2.txt Outputs/Master_vcf/id2.txt
    
  6. Draw the BRIC plots of Y2, Y3, Y4, W2, W3, W4 on reference.

    mkdir ~/brigWorkspace_Tam_DNAseq_2025_ATCC19606
    cp CP059040.gb ~/brigWorkspace_Tam_DNAseq_2025_ATCC19606
    cp ~/DATA/Data_Tam_DNAseq_2025_ATCC19606/shovill/W2/contigs.fa ~/brigWorkspace_Tam_DNAseq_2025_ATCC19606/gen/W2.fasta
    cp ~/DATA/Data_Tam_DNAseq_2025_ATCC19606/shovill/W3/contigs.fa ~/brigWorkspace_Tam_DNAseq_2025_ATCC19606/gen/W3.fasta
    cp ~/DATA/Data_Tam_DNAseq_2025_ATCC19606/shovill/W4/contigs.fa ~/brigWorkspace_Tam_DNAseq_2025_ATCC19606/gen/W4.fasta
    cp ~/DATA/Data_Tam_DNAseq_2025_ATCC19606/shovill/Y2/contigs.fa ~/brigWorkspace_Tam_DNAseq_2025_ATCC19606/gen/Y2.fasta
    cp ~/DATA/Data_Tam_DNAseq_2025_ATCC19606/shovill/Y3/contigs.fa ~/brigWorkspace_Tam_DNAseq_2025_ATCC19606/gen/Y3.fasta
    cp ~/DATA/Data_Tam_DNAseq_2025_ATCC19606/shovill/Y4/contigs.fa ~/brigWorkspace_Tam_DNAseq_2025_ATCC19606/gen/Y4.fasta
    
    #!!!!IMPORTANT_CRITICAL_DEBUG!!!!: we should use jdk1.6, otherwise results in ERROR!
    ~/Tools/Java1.6/jdk1.6.0_45/bin/java -Xmx25000M -jar BRIG.jar
    #DEBUG2
    Output-file should be "/home/jhuang/CP059040" to avoid to cover CP059040.gb.
    
    mv W2.fastaVsCP059040.gb.fna.tab W2.fastaVsCP059040.gb.fna.tab_
    mv W3.fastaVsCP059040.gb.fna.tab W3.fastaVsCP059040.gb.fna.tab_
    mv W4.fastaVsCP059040.gb.fna.tab W4.fastaVsCP059040.gb.fna.tab_
    mv Y2.fastaVsCP059040.gb.fna.tab Y2.fastaVsCP059040.gb.fna.tab_
    mv Y3.fastaVsCP059040.gb.fna.tab Y3.fastaVsCP059040.gb.fna.tab_
    mv Y4.fastaVsCP059040.gb.fna.tab Y4.fastaVsCP059040.gb.fna.tab_
    python3 ~/Scripts/filter_blast.py W2.fastaVsCP059040.gb.fna.tab_ W2.fastaVsCP059040.gb.fna.tab --evalue 1e-10 --min_length 200 --min_identity 95
    python3 ~/Scripts/filter_blast.py W3.fastaVsCP059040.gb.fna.tab_ W3.fastaVsCP059040.gb.fna.tab --evalue 1e-10 --min_length 200 --min_identity 95
    python3 ~/Scripts/filter_blast.py W4.fastaVsCP059040.gb.fna.tab_ W4.fastaVsCP059040.gb.fna.tab --evalue 1e-10 --min_length 200 --min_identity 95
    python3 ~/Scripts/filter_blast.py Y2.fastaVsCP059040.gb.fna.tab_ Y2.fastaVsCP059040.gb.fna.tab --evalue 1e-10 --min_length 200 --min_identity 95
    python3 ~/Scripts/filter_blast.py Y3.fastaVsCP059040.gb.fna.tab_ Y3.fastaVsCP059040.gb.fna.tab --evalue 1e-10 --min_length 200 --min_identity 95
    python3 ~/Scripts/filter_blast.py Y4.fastaVsCP059040.gb.fna.tab_ Y4.fastaVsCP059040.gb.fna.tab --evalue 1e-10 --min_length 200 --min_identity 95
    
    #Rerun without 'Re-do BLAST'
    
  7. Detect the gap positions

    blastn -query gen/Y4.fasta -subject scratch/CP059040.gb.fna -evalue 1e-10 -outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore" -out Y4_vs_ref.tsv
    python3 ~/Scripts/filter_blast.py Y4_vs_ref.tsv Y4_vs_ref.tsv_ --evalue 1e-10 --min_length 200 --min_identity 95
    awk '{start=($9<$10)?$9:$10; end=($9>$10)?$9:$10; print $0, start, end}' Y4_vs_ref.tsv_ | sort -k13,13n > Y4_sorted.tsv
    awk '
    {
    if (NR > 1) {
            gap_size = $13 - prev_end - 1
            if (gap_size > 200) {
            print "Gap in reference: " prev_end+1 " to " $13-1 ", size: " gap_size " nt"
            }
    }
    prev_end = $14
    }' Y4_sorted.tsv
    
    Gap in reference: 17584 to 17854, size: 271 nt
    Gap in reference: 737225 to 741666, size: 4442 nt * (adeI and adeJ)
    Gap in reference: 864814 to 867580, size: 2767 nt * (H0N29_04000)
    Gap in reference: 1326848 to 1327449, size: 602 nt
    Gap in reference: 1328557 to 1328922, size: 366 nt
    Gap in reference: 1481945 to 1482157, size: 213 nt
    Gap in reference: 1615852 to 1616118, size: 267 nt
    Gap in reference: 1641267 to 1641756, size: 490 nt
    Gap in reference: 1679492 to 1692554, size: 13063 nt
    Gap in reference: 1725690 to 1806693, size: 81004 nt
    Gap in reference: 1817484 to 1817867, size: 384 nt
    Gap in reference: 1851794 to 1852056, size: 263 nt
    Gap in reference: 1864443 to 1864741, size: 299 nt
    Gap in reference: 2123729 to 2123929, size: 201 nt
    Gap in reference: 2146439 to 2147201, size: 763 nt
    Gap in reference: 2269547 to 2270147, size: 601 nt
    Gap in reference: 2382428 to 2383507, size: 1080 nt
    Gap in reference: 2451609 to 2452242, size: 634 nt
    #between H0N29_13300 and H0N29_13660
    Gap in reference: 2810875 to 2813304, size: 2430 nt *
    Gap in reference: 2814695 to 2858885, size: 44191 nt *
    Gap in reference: 2860666 to 2862228, size: 1563 nt *
    Gap in reference: 2862967 to 2863469, size: 503 nt *
    Gap in reference: 2899740 to 2900190, size: 451 nt
    
    Gap in reference: 3010504 to 3010847, size: 344 nt
    Gap in reference: 3833923 to 3834384, size: 462 nt
    Gap in reference: 3834592 to 3860147, size: 25556 nt
    Gap in reference: 3862342 to 3862920, size: 579 nt
    Gap in reference: 3892085 to 3913973, size: 21889 nt
    Gap in reference: 3914242 to 3942704, size: 28463 nt
    
    blastn -query gen/Y3.fasta -subject scratch/CP059040.gb.fna -evalue 1e-10 -outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore" -out Y3_vs_ref.tsv
    python3 ~/Scripts/filter_blast.py Y3_vs_ref.tsv Y3_vs_ref.tsv_ --evalue 1e-10 --min_length 200 --min_identity 95
    awk '{start=($9<$10)?$9:$10; end=($9>$10)?$9:$10; print $0, start, end}' Y3_vs_ref.tsv_ | sort -k13,13n > Y3_sorted.tsv
    awk '
    {
    if (NR > 1) {
            gap_size = $13 - prev_end - 1
            if (gap_size > 200) {
            print "Gap in reference: " prev_end+1 " to " $13-1 ", size: " gap_size " nt"
            }
    }
    prev_end = $14
    }' Y3_sorted.tsv
    
    Gap in reference: 737225 to 741666, size: 4442 nt *
    Gap in reference: 792096 to 792573, size: 478 nt
    
    Gap in reference: 1679492 to 1724344, size: 44853 nt
    Gap in reference: 1725690 to 2190107, size: 464418 nt
    
    Gap in reference: 2810875 to 2813304, size: 2430 nt *
    Gap in reference: 2814695 to 2858885, size: 44191 nt *
    Gap in reference: 2860666 to 2862228, size: 1563 nt *
    Gap in reference: 2862967 to 2863469, size: 503 nt *
    
    Gap in reference: 3833923 to 3834384, size: 462 nt
    Gap in reference: 3834592 to 3881864, size: 47273 nt
    Gap in reference: 3892085 to 3913973, size: 21889 nt
    Gap in reference: 3914242 to 3942704, size: 28463 nt
    
    blastn -query gen/Y2.fasta -subject scratch/CP059040.gb.fna -evalue 1e-10 -outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore" -out Y2_vs_ref.tsv
    python3 ~/Scripts/filter_blast.py Y2_vs_ref.tsv Y2_vs_ref.tsv_ --evalue 1e-10 --min_length 200 --min_identity 95
    awk '{start=($9<$10)?$9:$10; end=($9>$10)?$9:$10; print $0, start, end}' Y2_vs_ref.tsv_ | sort -k13,13n > Y2_sorted.tsv
    awk '
    {
    if (NR > 1) {
            gap_size = $13 - prev_end - 1
            if (gap_size > 200) {
            print "Gap in reference: " prev_end+1 " to " $13-1 ", size: " gap_size " nt"
            }
    }
    prev_end = $14
    }' Y2_sorted.tsv
    Gap in reference: 737225 to 741666, size: 4442 nt *
    Gap in reference: 794508 to 1327449, size: 532942 nt
    Gap in reference: 1679492 to 1724344, size: 44853 nt
    Gap in reference: 1725690 to 2190107, size: 464418 nt
    Gap in reference: 2810875 to 2813304, size: 2430 nt *
    Gap in reference: 2814695 to 2858885, size: 44191 nt *
    Gap in reference: 2860666 to 2862228, size: 1563 nt *
    Gap in reference: 2862967 to 2863469, size: 503 nt *
    Gap in reference: 3288370 to 3642512, size: 354143 nt
    Gap in reference: 3833923 to 3834384, size: 462 nt
    Gap in reference: 3834592 to 3881863, size: 47272 nt
    Gap in reference: 3892085 to 3913973, size: 21889 nt
    Gap in reference: 3914242 to 3942704, size: 28463 nt
    
    blastn -query gen/W4.fasta -subject scratch/CP059040.gb.fna -evalue 1e-10 -outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore" -out W4_vs_ref.tsv
    python3 ~/Scripts/filter_blast.py W4_vs_ref.tsv W4_vs_ref.tsv_ --evalue 1e-10 --min_length 200 --min_identity 95
    awk '{start=($9<$10)?$9:$10; end=($9>$10)?$9:$10; print $0, start, end}' W4_vs_ref.tsv_ | sort -k13,13n > W4_sorted.tsv
    awk '
    {
    if (NR > 1) {
            gap_size = $13 - prev_end - 1
            if (gap_size > 200) {
            print "Gap in reference: " prev_end+1 " to " $13-1 ", size: " gap_size " nt"
            }
    }
    prev_end = $14
    }' W4_sorted.tsv
    
    Gap in reference: 737225 to 741666, size: 4442 nt *
    Gap in reference: 792438 to 793713, size: 1276 nt
    Gap in reference: 1679492 to 1724344, size: 44853 nt
    Gap in reference: 1725690 to 1817596, size: 91907 nt
    Gap in reference: 2810875 to 2813304, size: 2430 nt *
    Gap in reference: 2814695 to 2858885, size: 44191 nt *
    Gap in reference: 2860666 to 2862228, size: 1563 nt *
    Gap in reference: 2862967 to 2863469, size: 503 nt *
    Gap in reference: 3288370 to 3642512, size: 354143 nt
    Gap in reference: 3833923 to 3834384, size: 462 nt
    Gap in reference: 3834592 to 3881863, size: 47272 nt
    Gap in reference: 3892085 to 3913973, size: 21889 nt
    Gap in reference: 3914242 to 3942704, size: 28463 nt
    
    blastn -query gen/W3.fasta -subject scratch/CP059040.gb.fna -evalue 1e-10 -outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore" -out W3_vs_ref.tsv
    python3 ~/Scripts/filter_blast.py W3_vs_ref.tsv W3_vs_ref.tsv_ --evalue 1e-10 --min_length 200 --min_identity 95
    awk '{start=($9<$10)?$9:$10; end=($9>$10)?$9:$10; print $0, start, end}' W3_vs_ref.tsv_ | sort -k13,13n > W3_sorted.tsv
    awk '
    {
    if (NR > 1) {
            gap_size = $13 - prev_end - 1
            if (gap_size > 200) {
            print "Gap in reference: " prev_end+1 " to " $13-1 ", size: " gap_size " nt"
            }
    }
    prev_end = $14
    }' W3_sorted.tsv
    
    Gap in reference: 737225 to 741666, size: 4442 nt *
    Gap in reference: 792096 to 792573, size: 478 nt
    Gap in reference: 1679492 to 1724344, size: 44853 nt
    Gap in reference: 1725690 to 2190107, size: 464418 nt
    Gap in reference: 2810875 to 2813304, size: 2430 nt *
    Gap in reference: 2814695 to 2858885, size: 44191 nt *
    Gap in reference: 2860666 to 2862228, size: 1563 nt *
    Gap in reference: 2862967 to 2863469, size: 503 nt *
    Gap in reference: 3288370 to 3642512, size: 354143 nt
    Gap in reference: 3833923 to 3834384, size: 462 nt
    Gap in reference: 3834592 to 3853883, size: 19292 nt
    Gap in reference: 3892085 to 3913973, size: 21889 nt
    Gap in reference: 3914242 to 3942704, size: 28463 nt
    
    blastn -query gen/W2.fasta -subject scratch/CP059040.gb.fna -evalue 1e-10 -outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore" -out W2_vs_ref.tsv
    python3 ~/Scripts/filter_blast.py W2_vs_ref.tsv W2_vs_ref.tsv_ --evalue 1e-10 --min_length 200 --min_identity 95
    awk '{start=($9<$10)?$9:$10; end=($9>$10)?$9:$10; print $0, start, end}' W2_vs_ref.tsv_ | sort -k13,13n > W2_sorted.tsv
    awk '
    {
    if (NR > 1) {
            gap_size = $13 - prev_end - 1
            if (gap_size > 200) {
            print "Gap in reference: " prev_end+1 " to " $13-1 ", size: " gap_size " nt"
            }
    }
    prev_end = $14
    }' W2_sorted.tsv
    
    Gap in reference: 737225 to 741666, size: 4442 nt *
    Gap in reference: 1679492 to 1724344, size: 44853 nt
    Gap in reference: 1725690 to 2190107, size: 464418 nt
    Gap in reference: 2810875 to 2813304, size: 2430 nt *
    Gap in reference: 2814695 to 2858885, size: 44191 nt *
    Gap in reference: 2860666 to 2862228, size: 1563 nt *
    Gap in reference: 2862967 to 2863469, size: 503 nt *
    Gap in reference: 3288370 to 3642512, size: 354143 nt
    Gap in reference: 3833923 to 3834384, size: 462 nt
    Gap in reference: 3834592 to 3881863, size: 47272 nt
    Gap in reference: 3892085 to 3913973, size: 21889 nt
    Gap in reference: 3914242 to 3942704, size: 28463 nt
    

like unlike

点赞本文的读者

还没有人对此文章表态


本文有评论

没有评论

看文章,发评论,不要沉默


© 2023 XGenes.com Impressum