Merge pull request #2 from ANSES-Ploufragan/dev

Dev
ANSES-Ploufragan · Mar 27, 2024 · 8e0d10a · 8e0d10a
2 parents d642b34 + f461895
commit 8e0d10a
Show file tree

Hide file tree

Showing 9 changed files with 696 additions and 237 deletions.
diff --git a/CHANGELOG b/CHANGELOG
@@ -1,5 +1,18 @@
-0.1.10-beta:
-        - now horizontal line colors are for proteins, dot shape is for genes. Added vertical dotted lines to show contig limits
+0.2.0:
+  - update dependencies versions
+  - correct bug in PYTHON_SCRIPTS/convert_vcffile_to_readablefile2.py that missed some variants in text summary file
+	- add boxes for genes under graph (to fit traditional display of viral genomes)
+	- add [optional] coverage depth display in final png (if no cov depth provided, part disabled) aligned of variant graph
+	  for positions
+0.1.11:
+  - display prot instead of genes and genes instead of prot
+  - add num in front of genes to ensure to keep apparition order
+  - replace fstrings in vvv2_display.py to allow better compatibility
+  - add src/correct_covdepth_f.py script to convert cov depth files of samtools depth so as to have cumulative positions for display
+  - prepare vvv2_display to coverage depth addition in input
+  - add new script to setup.py file and change version to 0.1.11
+0.1.10:
+  - now horizontal line colors are for proteins, dot shape is for genes. Added vertical dotted lines to show contig limits
 	- correction in vadr parsing using different virus models
 	- tested on calicivirus, dengue, flavivirus, coronavirus, sarscov2, hcv, norovirus
 	- better parse stemp_loops

diff --git a/README.md b/README.md
@@ -3,7 +3,7 @@
 # Description
 
 Tools to create:
-- a .png image file describing all variants (obtained from vardict-java variant caller) alongside a genome/assembly (to provide) with their proportion (ordinates), with CDS descriptions (obtained from vadr annotator).
+- a .png image file describing all variants (obtained from vardict-java variant caller) alongside a genome/assembly (to provide) with their proportion (ordinates), with CDS descriptions (obtained from vadr annotator). At the top of the figure can be displayed the coverage depth repartition (if `-o cov_depth_f` option is provided).
 
 Python/R scripts and Galaxy wrapper to use them.
 
@@ -26,12 +26,13 @@ Convert ```vardict-java``` variant calling vcf file to human readable txt file
 - ```PYTHON_SCRIPTS/correct_multicontig_vardict_vcf.py```: 
 Correct ```vadr``` annotation output .tbl file for contigs positions when the assembly provided is composed of more than one contig.
 
-
 <!-- - ```R_SCRIPTS/visualize_coverage_depth.R```: -->
 <!-- Create a .png file showing coverage depth alongside the genome, from a bam alignment file. -->
 
 - ```R_SCRIPTS/visualize_snp_v4.R```:
-Create a .png file showing variant proportions alongside the genome/assembly and CDS positions.
+Create a .png file showing on the same png figure:
+  - coverage depth repartition alongside the genome/assembly (if `-o cov_depth_d` option provided)
+  - variant proportions alongside the genome/assembly and CDS positions.
 
 # Installation
 
@@ -50,8 +51,17 @@ vvv2_display.py -h
 
 Typical usage:
 ```
-vvv2_display.py -p res_vadr_pass.tsv -f res_vadr_fail.tsv -s res_vadr_seqstat.txt -n res_vardict_all.vcf -m contig_limits.txt -r res_vvv2_display.png 
+vvv2_display.py -p res_vadr_pass.tsv -f res_vadr_fail.tsv -s res_vadr_seqstat.txt -n res_vardict_all.vcf -r res_vvv2_display.png -o cov_depth_f.txt
 ```
+where:
+  - `res_vadr_pass.tsv` is the 'pass' file of vadr annotation program run on the genome/assembly
+  - `res_vadr_fail.tsv` is the 'fail' file of vadr annotation program
+  - `res_vadr_seqstat.txt` is the 'seqstat' file of vadr annotation program
+  - `res_vardict_all.vcf` is the result of vardict-java variant caller
+  - `res_vvv2_display.png` is the name of the main output file (will be created)
+  - `cov_depth_f.txt` is the coverage depth by position, provided by `samtools depth` run on the bam alignement file.
+
+> All other options are for Galaxy wrapper compatibility (these are intermediate temporary files that must appear as parameter for Galaxy wrapper but are not used in a usual command line call)
 
 # Galaxy wrapper
 

diff --git a/setup.py b/setup.py
@@ -26,7 +26,7 @@
 
 setuptools.setup(
     name="vvv2_display",  # Required
-    version="0.1.10",  # Required
+    version="0.2.0",  # Required
     description="Viral Variant Visualizer 2 display",  # Optional
     long_description=long_description,
     long_description_content_type="text/markdown",  # Optional (see note above)
@@ -59,6 +59,7 @@
                   "convert_tbl2json.py",
                   "convert_vcffile_to_readablefile2.py",
                   "correct_multicontig_vardict_vcf.py",
+                  "correct_covdepth_f.py",
                   "visualize_coverage_depth.R",
                   "visualize_snp_v4.R"
          ]
@@ -68,14 +69,19 @@
     include_package_data=True,
     install_requires=[
         "python>=3.9",
-        "r-ggplot2==3.6.6",
+        "r-ggplot2>=3.4.4",
+        "r-gridextra>=2.3",
+        "r-cowplot>=1.1.1",
+        "conda-forge::r-stringr>=1.5.1",
+        "conda-forge::r-jsonlite>=1.8.8",
         "pysam==0.19.1",
-        "numpy==1.23.1"],  # Optional
+        "numpy>=1.23.1"],  # Optional
     scripts=[
         "src/vvv2_display.py",
         "src/convert_tbl2json.py",
         "src/convert_vcffile_to_readablefile2.py",
         "src/correct_multicontig_vardict_vcf.py",
+        "correct_covdepth_f.py",
         "src/visualize_coverage_depth.R",
         "src/visualize_snp_v4.R"
         ],

diff --git a/src/convert_vcffile_to_readablefile2.py b/src/convert_vcffile_to_readablefile2.py
@@ -16,6 +16,7 @@
 # You should have received a copy of the GNU General Public License 
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
+# FT - correct regexp part for description parsing: simplified, now do not miss variant: March 27th 2024
 # FT - correct gene identif/name (position provided instead sometimes): June 15th 2023
 # FT - last modification: September 19th 2022 to replace pyvcf by pysam
 # AF - last modification: August 21st 2018
@@ -96,7 +97,7 @@ def find_key_proteins(dico_json, genomepos, gene_res):
     """ This function retrieves the protein name and the start and end position of
 this protein.
     """
-    protein = 'non coding RNA'
+    protein = 'untranslated RNA'
     base_inf_allproteins = ''
     base_sup_allproteins = ''
     # print("find_key_proteins call pos ("+str(genomepos)+", "+gene_res+")")    
@@ -112,7 +113,7 @@ def find_key_proteins(dico_json, genomepos, gene_res):
                         protein = 'intergene'
                         base_inf_allproteins = str(base_inf)
                         base_sup_allproteins = str(base_sup)
-                    elif protein == 'non coding RNA':
+                    elif protein == 'untranslated RNA':
                         protein = key
                         base_inf_allproteins = str(base_inf)
                         base_sup_allproteins = str(base_sup)
@@ -284,7 +285,7 @@ def is_homopolymer(lseq, rseq, ref, alt):
     A = 0 # this flag is used in the write_line function in order to add indices to line where variants are upper than threshold
     with open(args.out, "w") as filout:
         summary_list = []
-        regex = '([0-9]+)\t1\t([A-Z\>\<]+)\t([A-Z\>\<]+)\t([0-9\.]+)\t[A-Z\>\<]+\t[A-Z\>\<]+\t([A-Z0-9a-z\-_\, /]+)\t([A-Z0-9a-z\-_\, /\]\[]*)\t1\t([0-9]+)\t([A-Z]+\t[A-Z]+\t(no|yes))\n'
+        regex = '([0-9]+)\t1\t([A-Z\>\<]+)\t([A-Z\>\<]+)\t([0-9\.]+)\t[A-Z\>\<]+\t[A-Z\>\<]+\t([A-Z0-9a-z\-_\, /]+)\t([^\t]*)\t1\t([0-9]+)\t([A-Z]+\t[A-Z]+\t(no|yes))\n'
         REGEX = re.compile(regex)    
         filout.write("position\tSNP\tref\talt\tvariant_percent\tadd_ref\tadd_alt\tgene_id\tprotein_id\tsize_point\tindice\tlseq\trseq\tisHomo\n")
         a = 0 # flag to find the first genomic region after initialization of i
@@ -318,7 +319,7 @@ def is_homopolymer(lseq, rseq, ref, alt):
             filout.write(line)
         filout.write("""
 *NB: an homopolymer region is set to 'yes' if there is a succession of at least 3 identical nucleotides.
-     it looks like a restrictive measure, but Ion Torrent sequencing is very bad on such region, so make sure you verify these variants.""")
+     it looks like a restrictive measure, but Ion Torrent and Nanopore sequencing are very bad on such region, so make sure you verify these variants.""")
     print(prog_tag + ' '+ args.out +" file created")
     print(prog_tag + ' '+ args.outs +" file created")
 ## ~ end of script ~ ##
diff --git a/src/correct_covdepth_f.py b/src/correct_covdepth_f.py
@@ -0,0 +1,151 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# This file is part of the vvv2_display distribution (https://github.com/ANSES-Ploufragan/vvv2_display).
+# Copyright (c) 2023 Fabrice Touzain.
+# 
+# This program is free software: you can redistribute it and/or modify  
+# it under the terms of the GNU General Public License as published by  
+# the Free Software Foundation, version 3.
+#
+# This program is distributed in the hope that it will be useful, but 
+# WITHOUT ANY WARRANTY; without even the implied warranty of 
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License 
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+###
+# USE PYTHON3
+# from vcf file of vardict and bed file with contig positions, creates a vcf file with
+# contigs shifted position (to create a picture of variants and annotations)
+###
+import argparse, os, sys, csv, re, warnings
+from os import path
+import subprocess
+
+# to be able to report line number in error messages
+import inspect
+frame = inspect.currentframe()
+
+# debug
+b_test_cov_depth_corr_f = False #  2023 12 19
+b_test = False
+
+prog_tag = '[' + os.path.basename(__file__) + ']'
+
+cov_depth_f    = '' # in, cov_depth file provided by samtools depth -aa file.bam
+cov_depth_corr_f = '' # out same as previous but positions on contigs are shifted according to previous contigs
+
+parser = argparse.ArgumentParser()
+parser.add_argument("-s", "--cov_depth_f", dest='cov_depth_f',
+                    help="in: coverage depth file provided by samtools -aa in.bam (all contigs)",
+                    metavar="FILE")
+parser.add_argument("-c", "--cov_depth_corr_f", dest='cov_depth_corr_f',
+                    help="out: coverage depth file corrected for position (cumulative position in case of several contigs)",
+                    metavar="FILE")
+parser.add_argument("-z", "--test_all", dest='b_test_all',
+                    help="[Optional] run all tests",
+                    action='store_true')
+parser.add_argument("-v", "--verbose", dest='b_verbose',
+                    help="[Optional] To have details on records when running",
+                    action='store_true')
+parser.set_defaults(b_test_all=False)
+parser.set_defaults(b_verbose=False)
+
+# get absolute path in case of files
+args = parser.parse_args()
+
+# -------------------------------------------
+# check arguments
+b_test_all = args.b_test_all
+
+if b_test_all:
+    b_test_cov_depth_corr_f = True
+    b_test = True
+else:
+    b_test = b_test_cov_depth_corr_f
+
+if ((not b_test)and
+    ((len(sys.argv) < 2) or (len(sys.argv) > 10))):
+    parser.print_help()
+    print(prog_tag + "[Error] we found "+str(len(sys.argv)) +
+          " arguments, exit line "+str(frame.f_lineno))
+    sys.exit(0)
+
+# print('args:', args)
+if args.cov_depth_f is not None:
+    cov_depth_f = os.path.abspath(args.cov_depth_f)
+elif(not b_test):
+    sys.exit(prog_tag + "[Error] You must provide cov_depth_f")
+if args.cov_depth_corr_f is not None:
+    cov_depth_corr_f = os.path.abspath(args.cov_depth_corr_f)
+elif(not b_test):
+    sys.exit(prog_tag + "[Error] You must provide cov_depth_corr_f")
+
+if args.b_verbose is not None:
+    b_verbose = args.b_verbose
+
+if b_test_cov_depth_corr_f:
+    test_dir = '../test_vvv2_display/'
+    test_name = [
+        'res'
+        # 'res2'
+    ]
+    for resn in test_name:
+        cov_depth_f         = test_dir + "res2_covdepth.txt"
+        cov_depth_corr_f = test_dir + "res2_covdepth_corrected.txt"
+        cmd = ' '.join(['./correct_covdepth_f.py',
+                        "-s", cov_depth_f,
+                        "-c", cov_depth_corr_f
+                        ])
+        if b_verbose:
+            cmd = cmd + " -v"
+        print(prog_tag + " cmd:"+cmd)
+        print(prog_tag + " START")
+        os.system(cmd)
+        print(prog_tag + " END")
+    sys.exit()
+
+# ----------------------------------------------------------------------
+# reads seqstat file to get info on genome_length and on contigs lengths
+# ----------------------------------------------------------------------
+print(prog_tag + " Reads "+cov_depth_f+" file")
+print(prog_tag + " Creates "+cov_depth_corr_f+" file")
+o = open(cov_depth_corr_f, 'w+')
+
+# init var
+prev_contig_nr = 1
+contig_num   = 0
+
+pos = 0
+curr_shift = 0
+with open(cov_depth_f) as cdf:
+    for line in cdf:
+        line_fields = line.split("\t")
+        # print("line_fields:"+','.join(line_fields))
+        contig_nr = line_fields[0]
+        contig_pos = line_fields[1]
+        cov_depth = line_fields[2]
+        # print("line_fields:"+','.join(line_fields))
+        if b_verbose:
+            print(prog_tag + " contig:"+contig_name+" length:"+str(contig_length))
+
+        # write in output file
+        # if contig change, record shift to apply
+        if contig_nr != prev_contig_nr:
+            curr_shift = pos
+        # get position WITH SHIFT
+        pos = curr_shift + int(contig_pos)
+        pos_str = str(pos)
+        # write pos and related coverage depth
+        o.write("\t".join([pos_str, cov_depth]))
+        if b_verbose:            
+            print("record pos:",pos_str,", cov_depth:", cov_depth)
+
+        prev_contig_nr = contig_nr
+cdf.close()
+o.close()
+
+print(prog_tag + ' '+ cov_depth_corr_f + " file created")