Call and merge SNP and Indel results from two variant calling methods

gene_x · 0 likes · 880 views

Tags: variation, pipeline

  1. call variant calling using snippy
    1. mv snippy_HDRNA_01 snippy_CP133676
    2. mv snippy_HDRNA_03 snippy_CP133677
    3. cp -r snippy_HDRNA_06 snippy_CP133678
    4. mv snippy_HDRNA_06 snippy_CP133679
    5. cp -r snippy_HDRNA_07 snippy_CP133680
    6. mv snippy_HDRNA_07 snippy_CP133681
    7. cp -r snippy_HDRNA_08 snippy_CP133682
    8. mv snippy_HDRNA_08 snippy_CP133683
    9. cp -r snippy_HDRNA_12 snippy_CP133684
    10. cp -r snippy_HDRNA_12 snippy_CP133685
    11. cp -r snippy_HDRNA_12 snippy_CP133686
    12. mv snippy_HDRNA_12 snippy_CP133687
    13. cp -r snippy_HDRNA_16 snippy_CP133688
    14. cp -r snippy_HDRNA_16 snippy_CP133689
    15. cp -r snippy_HDRNA_16 snippy_CP133690
    16. cp -r snippy_HDRNA_16 snippy_CP133691
    17. mv snippy_HDRNA_16 snippy_CP133692
    18. cp -r snippy_HDRNA_17 snippy_CP133693
    19. cp -r snippy_HDRNA_17 snippy_CP133694
    20. mv snippy_HDRNA_17 snippy_CP133695
    21. cp -r snippy_HDRNA_19 snippy_CP133696
    22. cp -r snippy_HDRNA_19 snippy_CP133697
    23. cp -r snippy_HDRNA_19 snippy_CP133698
    24. mv snippy_HDRNA_19 snippy_CP133699
    25. cp -r snippy_HDRNA_20 snippy_CP133700
    26. mv snippy_HDRNA_20 snippy_CP133701
    27. #git clone https://github.com/huang/bacto
    28. #mv bacto/* ./
    29. #rm -rf bacto
    30. for sample in snippy_CP133678 snippy_CP133679 snippy_CP133680 snippy_CP133681 snippy_CP133682 snippy_CP133683 snippy_CP133684 snippy_CP133685 snippy_CP133686 snippy_CP133687 snippy_CP133688 snippy_CP133689 snippy_CP133690 snippy_CP133691 snippy_CP133692 snippy_CP133693 snippy_CP133694 snippy_CP133695 snippy_CP133696 snippy_CP133697 snippy_CP133698 snippy_CP133699 snippy_CP133700 snippy_CP133701; do
    31. cd ${sample};
    32. ln -s ~/Tools/bacto/db/ .;
    33. ln -s ~/Tools/bacto/envs/ .;
    34. ln -s ~/Tools/bacto/local/ .;
    35. cp ~/Tools/bacto/Snakefile .;
    36. cp ~/Tools/bacto/bacto-0.1.json .;
    37. cp ~/Tools/bacto/cluster.json .;
    38. cd ..;
    39. done
    40. #prepare raw_data and bacto-0.1.json
    41. #set "reference": "db/CP133***.gb" in bacto-0.1.json
    42. conda activate bengal3_ac3
    43. #cd snippy_CP133676
    44. #/home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    45. #cd snippy_CP133677
    46. #/home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    47. cd snippy_CP133678
    48. /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    49. cd ../snippy_CP133679
    50. /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    51. cd ../snippy_CP133680
    52. /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    53. cd ../snippy_CP133681
    54. /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    55. cd ../snippy_CP133682
    56. /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    57. cd ../snippy_CP133683
    58. /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    59. cd ../snippy_CP133684
    60. /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    61. cd ../snippy_CP133685
    62. /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    63. cd ../snippy_CP133686
    64. /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    65. cd ../snippy_CP133687
    66. /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    67. cd ../snippy_CP133688
    68. /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    69. cd ../snippy_CP133689
    70. /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    71. cd ../snippy_CP133690
    72. /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    73. cd ../snippy_CP133691
    74. /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    75. cd ../snippy_CP133692
    76. /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    77. cd ../snippy_CP133693
    78. /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    79. cd ../snippy_CP133694
    80. /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    81. cd ../snippy_CP133695
    82. /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    83. cd ../snippy_CP133696
    84. /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    85. cd ../snippy_CP133697
    86. /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    87. cd ../snippy_CP133698
    88. /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    89. cd ../snippy_CP133699
    90. /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    91. cd ../snippy_CP133700
    92. /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    93. cd ../snippy_CP133701
    94. /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    95. cd ..

2, using spandx to call variants (almost the same results as those from viral-ngs!)

  1. mkdir ~/miniconda3/envs/spandx/share/snpeff-5.1-2/data/PP810610
  2. cp PP810610.1.gb ~/miniconda3/envs/spandx/share/snpeff-5.1-2/data/PP810610/genes.gbk
  3. vim ~/miniconda3/envs/spandx/share/snpeff-5.1-2/snpEff.config
  4. /home/jhuang/miniconda3/envs/spandx/bin/snpEff build PP810610 -d
  5. gzip hCoV229E_Rluc_trimmed_P_1.fastq hCoV229E_Rluc_trimmed_P_2.fastq p10_DMSO_trimmed_P_1.fastq p10_DMSO_trimmed_P_2.fastq p10_K22_trimmed_P_1.fastq p10_K22_trimmed_P_2.fastq p10_K7523_trimmed_P_1.fastq p10_K7523_trimmed_P_2.fastq
  6. ln -s /home/jhuang/Tools/spandx/ spandx
  7. (spandx) nextflow run spandx/main.nf --fastq "trimmed/*_P_{1,2}.fastq.gz" --ref PP810610.fasta --annotation --database PP810610 -resume

3, summarize all SNPs and Indels from the snippy result directory.

  1. #Output: snippy_CP133676/snippy/summary_snps_indels.csv
  2. python3 summarize_snippy_res.py snippy_CP133676/snippy
  3. #-- post-processing of Outputs_CP133676 produced by SPANDx (temporary not necessary) --
  4. #cd Outputs_CP133676/Phylogeny_and_annotation
  5. #awk '{if($3!=$7) print}' < All_SNPs_indels_annotated.txt > All_SNPs_indels_annotated_.txt
  6. #cut -d$'\t' -f1-7 All_SNPs_indels_annotated_.txt > f1_7
  7. #grep -v "/" f1_7 > f1_7_
  8. #grep -v "\." f1_7_ > f1_7__
  9. #grep -v "*" f1_7__ > f1_7___
  10. #grep -v "INDEL" f1_7___ > f1_7_SNPs
  11. #cut -d$'\t' -f2-2 f1_7_SNPs > f1_7_SNPs_f2
  12. #grep -wFf <(awk 'NR>0 {print $1}' f1_7_SNPs_f2) All_SNPs_indels_annotated_.txt > All_SNPs_annotated.txt
  13. #grep -v "SNP" f1_7___ > f1_7_indels
  14. #cut -d$'\t' -f2-2 f1_7_indels > f1_7_indels_f2
  15. #grep -wFf <(awk 'NR>0 {print $1}' f1_7_indels_f2) All_SNPs_indels_annotated_.txt > All_indels_annotated.txt

4, merge the following two files summary_snps_indels.csv (192) and All_SNPs_indels_annotated.txt (248) --> merged_variants_CP133676.csv (94)

  1. python3 merge_snps_indels.py snippy_CP133676/snippy/summary_snps_indels.csv Outputs_CP133676/Phylogeny_and_annotation/All_SNPs_indels_annotated.txt merged_variants_CP133676.csv
  2. #check if the number of the output file is correct?
  3. comm -12 <(cut -d, -f2 snippy_CP133676/snippy/summary_snps_indels.csv | sort | uniq) <(cut -f2 Outputs_CP133676/Phylogeny_and_annotation/All_SNPs_indels_annotated.txt | sort | uniq) | wc -l
  4. comm -12 <(cut -d, -f2 snippy_CP133676/snippy/summary_snps_indels.csv | sort | uniq) <(cut -f2 Outputs_CP133676/Phylogeny_and_annotation/All_SNPs_indels_annotated.txt | sort | uniq)

5, (optional, since it should not happen) filter rows without change in snippy_CP133676/snippy/summary_snps_indels.csv (94)

  1. awk -F, '
  2. NR == 1 {print; next} # Print the header line
  3. {
  4. ref = $3; # Reference base (assuming REF is in the 3rd column)
  5. same = 1; # Flag to check if all bases are the same as reference
  6. samples = ""; # Initialize variable to hold sample data
  7. for (i = 6; i <= NF - 8; i++) { # Loop through all sample columns, adjusting for the number of fixed columns
  8. samples = samples $i " "; # Collect sample data
  9. if ($i != ref) {
  10. same = 0;
  11. }
  12. }
  13. if (!same) {
  14. print $0; # Print the entire line if not all bases are the same as the reference
  15. #print "Samples: " samples; # Print all sample data for checking
  16. }
  17. }
  18. ' merged_variants_CP133676.csv > merged_variants_CP133676_.csv
  19. #Explanation:
  20. # -F, sets the field separator to a comma.
  21. # NR == 1 {print; next} prints the header line and skips to the next line.
  22. # ref = $3 sets the reference base (assumed to be in the 3rd column).
  23. # same = 1 initializes a flag to check if all sample bases are the same as the reference.
  24. # samples = ""; initializes a string to collect sample data.
  25. # for (i = 6; i <= NF - 8; i++) loops through the sample columns. This assumes the first 5 columns are fixed (CHROM, POS, REF, ALT, TYPE), and the last 8 columns are fixed (Effect, Impact, Functional_Class, Codon_change, Protein_and_nucleotide_change, Amino_Acid_Length, Gene_name, Biotype).
  26. # samples = samples $i " "; collects sample data.
  27. # if ($i != ref) { same = 0; } checks if any sample base is different from the reference base. If found, it sets same to 0.
  28. # if (!same) { print $0; print "Samples: " samples; } prints the entire line and the sample data if not all sample bases are the same as the reference.

6, improve the header

  1. sed -i '1s/_trimmed_P//g' merged_variants_CP133676.csv

7, check the REF and K1 have the same base and delete those records with difference.

  1. cut -f3 -d',' merged_variants_CP133676.csv > f3
  2. cut -f6 -d',' merged_variants_CP133676.csv > f6
  3. diff f3 f6
  4. awk -F, '$3 == $6 || NR==1' merged_variants_CP133676.csv > filtered_merged_variants_CP133676.csv #(93)
  5. cut -f3 -d',' filtered_merged_variants_CP133676.csv > f3
  6. cut -f6 -d',' filtered_merged_variants_CP133676.csv > f6
  7. diff f3 f6

8, MANUALLY REMOVE the column f6 in filtered_merged_variants_CP133676.csv, and rename CHROM to HDRNA_01_K01 in the header, summarize chr and plasmids SNPs of a sample together to a single list, save as an Excel-file.

9, code of summarize_snippy_res.py

  1. import pandas as pd
  2. import glob
  3. import argparse
  4. import os
  5. #python3 summarize_snps_indels.py snippy_HDRNA_01/snippy
  6. #The following record for 2365295 is wrong, since I am sure in the HDRNA_01_K010, it should be a 'G', since in HDRNA_01_K010.csv
  7. #CP133676,2365295,snp,A,G,G:178 A:0
  8. #
  9. #The current output is as follows:
  10. #CP133676,2365295,A,snp,A,A,A,A,A,A,A,A,A,A,None,,,,,,None,None
  11. #CP133676,2365295,A,snp,A,A,A,A,A,A,A,A,A,A,nan,,,,,,nan,nan
  12. #grep -v "None,,,,,,None,None" summary_snps_indels.csv > summary_snps_indels_.csv
  13. #BUG: CP133676,2365295,A,snp,A,A,A,A,A,A,A,A,A,A,nan,,,,,,nan,nan
  14. import pandas as pd
  15. import glob
  16. import argparse
  17. import os
  18. def main(base_directory):
  19. # List of isolate identifiers
  20. isolates = [f"HDRNA_01_K0{i}" for i in range(1, 11)]
  21. expected_columns = ["CHROM", "POS", "REF", "ALT", "TYPE", "EFFECT", "LOCUS_TAG", "GENE", "PRODUCT"]
  22. # Find all CSV files in the directory and its subdirectories
  23. csv_files = glob.glob(os.path.join(base_directory, '**', '*.csv'), recursive=True)
  24. # Create an empty DataFrame to store the summary
  25. summary_df = pd.DataFrame()
  26. # Iterate over each CSV file
  27. for file_path in csv_files:
  28. # Extract isolate identifier from the file name
  29. isolate = os.path.basename(file_path).replace('.csv', '')
  30. df = pd.read_csv(file_path)
  31. # Ensure all expected columns are present, adding missing ones as empty columns
  32. for col in expected_columns:
  33. if col not in df.columns:
  34. df[col] = None
  35. # Extract relevant columns
  36. df = df[expected_columns]
  37. # Ensure consistent data types
  38. df = df.astype({"CHROM": str, "POS": int, "REF": str, "ALT": str, "TYPE": str, "EFFECT": str, "LOCUS_TAG": str, "GENE": str, "PRODUCT": str})
  39. # Add the isolate column with the ALT allele
  40. df[isolate] = df["ALT"]
  41. # Drop columns that are not needed for the summary
  42. df = df.drop(["ALT"], axis=1)
  43. if summary_df.empty:
  44. summary_df = df
  45. else:
  46. summary_df = pd.merge(summary_df, df, on=["CHROM", "POS", "REF", "TYPE", "EFFECT", "LOCUS_TAG", "GENE", "PRODUCT"], how="outer")
  47. # Fill missing values with the REF allele for each isolate column
  48. for isolate in isolates:
  49. if isolate in summary_df.columns:
  50. summary_df[isolate] = summary_df[isolate].fillna(summary_df["REF"])
  51. else:
  52. summary_df[isolate] = summary_df["REF"]
  53. # Rename columns to match the required format
  54. summary_df = summary_df.rename(columns={
  55. "CHROM": "CHROM",
  56. "POS": "POS",
  57. "REF": "REF",
  58. "TYPE": "TYPE",
  59. "EFFECT": "Effect",
  60. "LOCUS_TAG": "Gene_name",
  61. "GENE": "Biotype",
  62. "PRODUCT": "Product"
  63. })
  64. # Replace any remaining None or NaN values in the non-isolate columns with empty strings
  65. summary_df = summary_df.fillna("")
  66. # Add empty columns for Impact, Functional_Class, Codon_change, Protein_and_nucleotide_change, Amino_Acid_Length
  67. summary_df["Impact"] = ""
  68. summary_df["Functional_Class"] = ""
  69. summary_df["Codon_change"] = ""
  70. summary_df["Protein_and_nucleotide_change"] = ""
  71. summary_df["Amino_Acid_Length"] = ""
  72. # Reorder columns
  73. cols = ["CHROM", "POS", "REF", "TYPE"] + isolates + ["Effect", "Impact", "Functional_Class", "Codon_change", "Protein_and_nucleotide_change", "Amino_Acid_Length", "Gene_name", "Biotype"]
  74. summary_df = summary_df.reindex(columns=cols)
  75. # Remove duplicate rows
  76. summary_df = summary_df.drop_duplicates()
  77. # Save the summary DataFrame to a CSV file
  78. output_file = os.path.join(base_directory, "summary_snps_indels.csv")
  79. summary_df.to_csv(output_file, index=False)
  80. print("Summary CSV file created successfully at:", output_file)
  81. if __name__ == "__main__":
  82. parser = argparse.ArgumentParser(description="Summarize SNPs and Indels from CSV files.")
  83. parser.add_argument("directory", type=str, help="Base directory containing the CSV files in subdirectories.")
  84. args = parser.parse_args()
  85. main(args.directory)

10, code of merge_snps_indels.py

  1. import pandas as pd
  2. import argparse
  3. import os
  4. def merge_files(input_file1, input_file2, output_file):
  5. # Read the input files
  6. df1 = pd.read_csv(input_file1)
  7. df2 = pd.read_csv(input_file2, sep='\t')
  8. # Merge the dataframes on the 'POS' column, keeping only the rows that have common 'POS' values
  9. merged_df = pd.merge(df2, df1[['POS']], on='POS', how='inner')
  10. # Remove duplicate rows
  11. merged_df.drop_duplicates(inplace=True)
  12. # Save the merged dataframe to the output file
  13. merged_df.to_csv(output_file, index=False)
  14. print("Merged file created successfully at:", output_file)
  15. if __name__ == "__main__":
  16. parser = argparse.ArgumentParser(description="Merge two SNP and Indel files based on the 'POS' column.")
  17. parser.add_argument("input_file1", type=str, help="Path to the first input file (summary_snps_indels.csv).")
  18. parser.add_argument("input_file2", type=str, help="Path to the second input file (All_SNPs_indels_annotated.txt).")
  19. parser.add_argument("output_file", type=str, help="Path to the output file.")
  20. args = parser.parse_args()
  21. merge_files(args.input_file1, args.input_file2, args.output_file)

like unlike

点赞本文的读者

还没有人对此文章表态


本文有评论

没有评论

看文章,发评论,不要沉默


© 2023 XGenes.com Impressum