From ad2f3b980205f1093e855ecfdc0a5ec95f8168af Mon Sep 17 00:00:00 2001 From: "f.touzain" Date: Tue, 19 Dec 2023 11:33:59 +0100 Subject: [PATCH 01/42] increase number shift to avoid overlap of number and dot --- src/visualize_snp_v4.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/visualize_snp_v4.R b/src/visualize_snp_v4.R index ff37070..a08c33a 100755 --- a/src/visualize_snp_v4.R +++ b/src/visualize_snp_v4.R @@ -160,7 +160,7 @@ p6bis = p6 +guides(shape = guide_legend(order=1, direction="horizontal", title=" # p7 = p6 + theme(legend.position = "bottom") # modify the legend position p8 = p6bis + ylim(-0.06,1.2) # modify the scale -p9 = p8 + geom_text(aes(x = position, y = variant_percent + 0.01, label = indice, angle = 0)) # add indice to the grapĥ +p9 = p8 + geom_text(aes(x = position, y = variant_percent + 0.03, label = indice, angle = 0)) # add indice to the grapĥ #p10 = p9 + geom_text(x = 0, y = threshold + 0.01, label = t) # add the threshold text #p11 = p10 + geom_line(aes(x = position, y = 0.5), color = "red") p10 = p9 + geom_line(aes(x = position, y = 0.5), color = "red") From f264245f804082622b2a8221a72c80483f65d931 Mon Sep 17 00:00:00 2001 From: "f.touzain" Date: Tue, 19 Dec 2023 11:34:31 +0100 Subject: [PATCH 02/42] replace f-string to improve compatibility with old python3 --- src/vvv2_display.py | 227 ++++++++++++++++++++++---------------------- 1 file changed, 116 insertions(+), 111 deletions(-) diff --git a/src/vvv2_display.py b/src/vvv2_display.py index b46b958..4b88a62 100755 --- a/src/vvv2_display.py +++ b/src/vvv2_display.py @@ -76,8 +76,8 @@ def __main__(): ######################################### # directories fo sub programs ######################################### - PYTHON_SCRIPTS = f"{dir_path}/" # PYTHON_SCRIPTS/" - R_SCRIPTS = f"{dir_path}/" # R_SCRIPTS/" + PYTHON_SCRIPTS = dir_path + "/" # PYTHON_SCRIPTS/" + R_SCRIPTS = dir_path + "/" # R_SCRIPTS/" ######################################### parser = argparse.ArgumentParser() @@ -224,58 +224,58 @@ def __main__(): # ------------------------------------------------------------------ # TEST for vvv2_display # ------------------------------------------------------------------ - test_dir = f"{dir_path}/../test_vvv2_display" + test_dir = dir_path + "/../test_vvv2_display" if b_test_vvv2_display: # -------------------------------------------------------------- # COMPLETE GENOME # in files - pass_annot_f = f"{test_dir}/res2_vadr_pass.tbl" # from vadr results - fail_annot_f = f"{test_dir}/res2_vadr_fail.tbl" # from vadr results - seq_stat_f = f"{test_dir}/res2_vadr.seqstat" # from vadr results - vardict_vcf_f = f"{test_dir}/res2_vardict.vcf" # from lofreq results + pass_annot_f = test_dir + "/res2_vadr_pass.tbl" # from vadr results + fail_annot_f = test_dir + "/res2_vadr_fail.tbl" # from vadr results + seq_stat_f = test_dir + "/res2_vadr.seqstat" # from vadr results + vardict_vcf_f = test_dir + "/res2_vardict.vcf" # from lofreq results # tmp out files - json_annot_f = f"{test_dir}/res2_vadr.json" # from convert_tbl2json.py - contig_limits_f= f"{test_dir}/contig_limits.txt" + json_annot_f = test_dir + "/res2_vadr.json" # from convert_tbl2json.py + contig_limits_f= test_dir + "/contig_limits.txt" # final out file - png_var_f = f"{test_dir}/res2_vvv2.png" # from ... - cmd = ' '.join([f"{dir_path}/vvv2_display.py", - f"--pass_tbl_f {pass_annot_f}", - f"--fail_tbl_f {fail_annot_f}", - f"--seq_stat_f {seq_stat_f}", - f"--vcf_f {vardict_vcf_f}", - f"--contig_limits_f {contig_limits_f}", - f"--png_var_f {png_var_f}" + png_var_f = test_dir + "/res2_vvv2.png" # from ... + cmd = ' '.join([ dir_path + "/vvv2_display.py", + "--pass_tbl_f", pass_annot_f, + "--fail_tbl_f", fail_annot_f, + "--seq_stat_f", seq_stat_f, + "--vcf_f", vardict_vcf_f, + "--contig_limits_f", contig_limits_f, + "--png_var_f", png_var_f ]) - print(f"{prog_tag} START") - print(f"{prog_tag} cmd:{cmd}") + print(prog_tag + " START") + print(prog_tag + " cmd:" + cmd) os.system(cmd) - print(f"{prog_tag} END") + print(prog_tag + " END") # -------------------------------------------------------------- # -------------------------------------------------------------- # CONTIGS # in files - pass_annot_f = f"{test_dir}/res_vadr_pass.tbl" # from vadr results - fail_annot_f = f"{test_dir}/res_vadr_fail.tbl" # from vadr results - seq_stat_f = f"{test_dir}/res_vadr.seqstat" # from vadr results - vardict_vcf_f = f"{test_dir}/res_vardict.vcf" # from lofreq results + pass_annot_f = test_dir + "/res_vadr_pass.tbl" # from vadr results + fail_annot_f = test_dir + "/res_vadr_fail.tbl" # from vadr results + seq_stat_f = test_dir + "/res_vadr.seqstat" # from vadr results + vardict_vcf_f = test_dir + "/res_vardict.vcf" # from lofreq results # tmp out files - json_annot_f = f"{test_dir}/res_vadr.json" # from convert_tbl2json. - contig_limits_f = f"{test_dir}/contig_limits.txt" # from lofreq results py + json_annot_f = test_dir + "/res_vadr.json" # from convert_tbl2json. + contig_limits_f = test_dir + "/contig_limits.txt" # from lofreq results py # final out file - png_var_f = f"{test_dir}/res_vvv2.png" # from ... - cmd = ' '.join([f"{dir_path}/vvv2_display.py", - f"--pass_tbl_f {pass_annot_f}", - f"--fail_tbl_f {fail_annot_f}", - f"--seq_stat_f {seq_stat_f}", - f"--vcf_f {vardict_vcf_f}", - f"--contig_limits_f {contig_limits_f}", - f"--png_var_f {png_var_f}" + png_var_f = test_dir + "/res_vvv2.png" # from ... + cmd = ' '.join([ dir_path + "/vvv2_display.py", + "--pass_tbl_f", pass_annot_f, + "--fail_tbl_f", fail_annot_f, + "--seq_stat_f", seq_stat_f, + "--vcf_f", vardict_vcf_f, + "--contig_limits_f", contig_limits_f, + "--png_var_f", png_var_f ]) - print(f"{prog_tag} START") - print(f"{prog_tag} cmd:{cmd}") + print(prog_tag + " START") + print(prog_tag + " cmd:" + cmd) os.system(cmd) - print(f"{prog_tag} END") + print(prog_tag + " END") # -------------------------------------------------------------- sys.exit() @@ -287,19 +287,19 @@ def __main__(): # ------------------------------------------------------------------ if b_test_convert_tbl2json: # # COMPLETE GENOME - # pass_annot_f = f"{test_dir}/res2_vadr_pass.tbl" # from vadr results - # fail_annot_f = f"{test_dir}/res2_vadr_fail.tbl" # from vadr results - # seq_stat_f = f"{test_dir}/res2_vadr.seqstat" # from vadr results - # json_annot_f = f"{test_dir}/res2_vadr.json" - ## bed_annot_f = f"{test_dir}/res2_vadr.bed" - # bed_vardict_annot_f = f"{test_dir}/res2_vadr.4vardict.bed" + # pass_annot_f = test_dir + "/res2_vadr_pass.tbl" # from vadr results + # fail_annot_f = test_dir + "/res2_vadr_fail.tbl" # from vadr results + # seq_stat_f = test_dir + "/res2_vadr.seqstat" # from vadr results + # json_annot_f = test_dir + "/res2_vadr.json" + ## bed_annot_f = test_dir + "/res2_vadr.bed" + # bed_vardict_annot_f = test_dir + "/res2_vadr.4vardict.bed" # CONTIGS - pass_annot_f = f"{test_dir}/res_vadr_pass.tbl" # from vadr results - fail_annot_f = f"{test_dir}/res_vadr_fail.tbl" # from vadr results - seq_stat_f = f"{test_dir}/res_vadr.seqstat" # from vadr results - json_annot_f = f"{test_dir}/res_vadr.json" - # bed_annot_f = f"{test_dir}/res_vadr.bed" - bed_vardict_annot_f = f"{test_dir}/res_vadr.4vardict.bed" + pass_annot_f = test_dir + "/res_vadr_pass.tbl" # from vadr results + fail_annot_f = test_dir + "/res_vadr_fail.tbl" # from vadr results + seq_stat_f = test_dir + "/res_vadr.seqstat" # from vadr results + json_annot_f = test_dir + "/res_vadr.json" + # bed_annot_f = test_dir + "/res_vadr.bed" + bed_vardict_annot_f = test_dir + "/res_vadr.4vardict.bed" if(json_annot_f == ''): json_annot_f = vardict_vcf_f @@ -310,22 +310,22 @@ def __main__(): # bed_vardict_annot_f = bed_vardict_annot_f.replace('.vcf', '.vardict.bed') bed_vardict_annot_f = re.sub('\.[^\.]+$', '_vardict.bed', bed_vardict_annot_f) - p_script = f"{PYTHON_SCRIPTS}convert_tbl2json.py" - cmd = ' '.join([f"{p_script}", - f"--pass_annot_f {pass_annot_f}", - f"--fail_annot_f {fail_annot_f}", - f"--seq_stat_f {seq_stat_f}", - f"--json_out_f {json_annot_f}", - # f"--bed_out_f {bed_annot_f}", - f"--bed_vardict_out_f {bed_vardict_annot_f}" + p_script = PYTHON_SCRIPTS + "convert_tbl2json.py" + cmd = ' '.join([ p_script, + "--pass_annot_f", pass_annot_f, + "--fail_annot_f", fail_annot_f, + "--seq_stat_f", seq_stat_f, + "--json_out_f", json_annot_f, + # "--bed_out_f", bed_annot_f, + "--bed_vardict_out_f", bed_vardict_annot_f ]) - print(f"{prog_tag} cmd:{cmd}") + print(prog_tag + " cmd:" + cmd) if b_test_convert_tbl2json: - print(f"{prog_tag} [test_convert_tbl2json] START") - print(f"{prog_tag} cmd:{cmd}") + print(prog_tag + " [test_convert_tbl2json] START") + print(prog_tag + " cmd:" + cmd) os.system(cmd) - print(f"{prog_tag} [test_convert_tbl2json] END") + print(prog_tag + " [test_convert_tbl2json] END") sys.exit() else: os.system(cmd) @@ -335,31 +335,31 @@ def __main__(): # in the final created picture # ------------------------------------------------------------------ if b_test_correct_multicontig_vardict_vcf: - seq_stat_f = f"{test_dir}/res_vadr.seqstat" # from vadr results - vardict_vcf_f = f"{test_dir}/res_vardict.vcf" # from vardict results - correct_vcf_f = f"{test_dir}/res_correct.vcf" # corrected out results - contig_limits_f = f"{test_dir}/contig_limits.txt" # contig limits + seq_stat_f = test_dir + "/res_vadr.seqstat" # from vadr results + vardict_vcf_f = test_dir + "/res_vardict.vcf" # from vardict results + correct_vcf_f = test_dir + "/res_correct.vcf" # corrected out results + contig_limits_f = test_dir + "/contig_limits.txt" # contig limits if(correct_vcf_f == ''): correct_vcf_f = vardict_vcf_f # correct_vcf_f = correct_vcf_f.replace('vardict.vcf', '_correct.vcf') correct_vcf_f = re.sub('\.[^\.]+$', '_correct.vcf', correct_vcf_f) - p_script = f"{PYTHON_SCRIPTS}correct_multicontig_vardict_vcf.py" - print(f"p_script:{p_script}") - cmd = ' '.join([f"{p_script}", - f"--seq_stat_f {seq_stat_f}", - f"--vardict_vcf_f {vardict_vcf_f}", - f"--correct_vcf_f {correct_vcf_f}", - f"--contig_limits_f {contig_limits_f}" + p_script = PYTHON_SCRIPTS + "correct_multicontig_vardict_vcf.py" + print("p_script:" + p_script) + cmd = ' '.join([p_script, + "--seq_stat_f", seq_stat_f, + "--vardict_vcf_f", vardict_vcf_f, + "--correct_vcf_f", correct_vcf_f, + "--contig_limits_f", contig_limits_f ]) - print(f"{prog_tag} cmd:{cmd}") + print(prog_tag + " cmd:" + cmd) if b_test_correct_multicontig_vardict_vcf: - print(f"{prog_tag} [test_correct_multicontig_vardict_vcf] START") - print(f"{prog_tag} cmd:{cmd}") + print(prog_tag + " [test_correct_multicontig_vardict_vcf] START") + print(prog_tag + " cmd:" + cmd) os.system(cmd) - print(f"{prog_tag} [test_correct_multicontig_vardict_vcf] END") + print(prog_tag + " [test_correct_multicontig_vardict_vcf] END") sys.exit() else: os.system(cmd) @@ -369,17 +369,17 @@ def __main__(): # ------------------------------------------------------------------ if b_test_convert_vcffile_to_readable: # # COMPLETE GENOME - # vardict_vcf_f = f"{test_dir}/res2_vardict.vcf" # from vardict results - # correct_vcf_f = f"{test_dir}/res2_correct.vcf" # corrected vardict results - # json_annot_f = f"{test_dir}/res2_vadr.json" - # snp_loc_f = f"{test_dir}/res2_snp.txt" - # snp_loc_summary_f = f"{test_dir}/res2_snp_summary.txt" + # vardict_vcf_f = test_dir + "/res2_vardict.vcf" # from vardict results + # correct_vcf_f = test_dir + "/res2_correct.vcf" # corrected vardict results + # json_annot_f = test_dir + "/res2_vadr.json" + # snp_loc_f = test_dir + "/res2_snp.txt" + # snp_loc_summary_f = test_dir + "/res2_snp_summary.txt" # CONTIGS - # vardict_vcf_f = f"{test_dir}/res_vardict.vcf" # from vardict results - correct_vcf_f = f"{test_dir}/res_correct.vcf" - json_annot_f = f"{test_dir}/res_vadr.json" - snp_loc_f = f"{test_dir}/res_snp.txt" - snp_loc_summary_f = f"{test_dir}/res_snp_summary.txt" + # vardict_vcf_f = test_dir + "/res_vardict.vcf" # from vardict results + correct_vcf_f = test_dir + "/res_correct.vcf" + json_annot_f = test_dir + "/res_vadr.json" + snp_loc_f = test_dir + "/res_snp.txt" + snp_loc_summary_f = test_dir + "/res_snp_summary.txt" if(snp_loc_f == ''): snp_loc_f = vardict_vcf_f @@ -393,21 +393,21 @@ def __main__(): # json_annot_f = f"{ech}_gene_position_viral_consensus.json" # snp_loc_f = f"{ech}_snp_location" # p_script = f"{PYTHON_SCRIPTS}convert_vcffile_to_readablefile.py" # use pyvcf - p_script = f"{PYTHON_SCRIPTS}convert_vcffile_to_readablefile2.py" # use pysam + p_script = PYTHON_SCRIPTS + "convert_vcffile_to_readablefile2.py" # use pysam threshold = "0.07" - cmd = ' '.join([f"{p_script}", - f"--vcfs {correct_vcf_f}", - f"--json {json_annot_f}", - f"--out {snp_loc_f}", - f"--outs {snp_loc_summary_f}", - f"--threshold {threshold}"]) - print(f"{prog_tag} cmd:{cmd}") + cmd = ' '.join([p_script, + "--vcfs", correct_vcf_f, + "--json", json_annot_f, + "--out", snp_loc_f, + "--outs", snp_loc_summary_f, + "--threshold", threshold]) + print(prog_tag + " cmd:" + cmd) if b_test_convert_vcffile_to_readable: - print(f"{prog_tag} [test_convert_vcffile_to_readable] START") - print(f"{prog_tag} cmd:{cmd}") + print(prog_tag + " [test_convert_vcffile_to_readable] START") + print(prog_tag + " cmd:" + cmd) os.system(cmd) - print(f"{prog_tag} [test_convert_vcffile_to_readable] END") + print(prog_tag + " [test_convert_vcffile_to_readable] END") sys.exit() else: os.system(cmd) @@ -419,35 +419,40 @@ def __main__(): # ------------------------------------------------------------------ if b_test_visualize_snp_v4: # # COMPLETE GENOME - # snp_loc_f = f"{test_dir}/res2_snp.txt" - # snp_loc_summary_f = f"{test_dir}/res2_snp_summary.txt" - # png_var_f = f"{test_dir}/res2_snp.png" + # snp_loc_f = test_dir + "/res2_snp.txt" + # snp_loc_summary_f = test_dir + "/res2_snp_summary.txt" + # png_var_f = test_dir + "/res2_snp.png" # CONTIGS - snp_loc_f = f"{test_dir}/res_snp.txt" - snp_loc_summary_f = f"{test_dir}/res_snp_summary.txt" - png_var_f = f"{test_dir}/res_snp.png" - contig_limits_f = f"{test_dir}/contig_limits.txt" + snp_loc_f = test_dir + "/res_snp.txt" + snp_loc_summary_f = test_dir + "/res_snp_summary.txt" + png_var_f = test_dir + "/res_snp.png" + contig_limits_f = test_dir + "/contig_limits.txt" if(png_var_f == ''): png_var_f = snp_loc_f png_var_f = re.sub('\.[^\.]+$', '.png', png_var_f) # env r-env.yaml - r_script = f"{R_SCRIPTS}visualize_snp_v4.R" + r_script = R_SCRIPTS + "visualize_snp_v4.R" threshold = "0.07" # png_var_f = f"{ech}_graphic_variant.png" - cmd = f"R --vanilla --quiet --args {snp_loc_f} {contig_limits_f} {threshold} {png_var_f} < {r_script} > /dev/null" - print(f"{prog_tag} cmd:{cmd}") + cmd = " ".join(["R --vanilla --quiet --args", + snp_loc_f, + contig_limits_f, + threshold, + png_var_f, + " < ", r_script, " > /dev/null"]) + print(prog_tag + " cmd:" + cmd) if b_test_visualize_snp_v4: - print(f"{prog_tag} [test_visualize_snp_v4] START") - print(f"{prog_tag} cmd:{cmd}") + print(prog_tag + " [test_visualize_snp_v4] START") + print(prog_tag + " cmd:" + cmd) os.system(cmd) - print(f"{prog_tag} [test_visualize_snp_v4] END") + print(prog_tag + " [test_visualize_snp_v4] END") else: os.system(cmd) # ------------------------------------------------------------------ - print(f"{png_var_f} file created") + print(png_var_f + " file created") ##### MAIN END if __name__=="__main__":__main__() From ebfbaa2cb7fbc6a450800b6b2eb20de4d3701ef9 Mon Sep 17 00:00:00 2001 From: "f.touzain" Date: Tue, 19 Dec 2023 16:02:12 +0100 Subject: [PATCH 03/42] show prot as genes were, show genes as prot were. Add number to sort protein labels to better see legend --- src/visualize_snp_v4.R | 67 +++++++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 30 deletions(-) diff --git a/src/visualize_snp_v4.R b/src/visualize_snp_v4.R index a08c33a..2804dfd 100755 --- a/src/visualize_snp_v4.R +++ b/src/visualize_snp_v4.R @@ -1,5 +1,6 @@ #!/usr/bin/env Rscript # +# FT; last modification December 19th 2023 # AF; last modification February 16th 2019 # # Description: This script has been written in order to generate a graph @@ -37,7 +38,7 @@ if(is.null(density)){ # means no snp found } # defined a wide color palet manually to ensure better visibility, the other colors defined later will not be so distinctive -protcols=c('#99CC00','#CC9900','#FFCC33','#FF9900','#FF6600','#FF3300','#CC3300','#990033','#FF3366','#FF9999','#FFCCFF','#CC99CC','#996699','#993399','#660066','#990066','#660099','#CC00CC','#6600CC','#9966FF','#6633FF','#330099','#0033CC','#3366FF','#0033FF','#00CCFF','#006699','#00CC99','#009966','#33FFCC','#33CC99','#66FFCC','#33CCCC','#99FFFF','#66CC99','#339966','#99CC99','#99FF99','#66CC66','#66FF66','#669933','#336600','#00FF00','#66CC33','#CCFF66','#CCCC99','#FFFFCC','#FFCC99','#FFCCCC','#FF99CC','#CC99FF','#9999CC','#CCCCFF','#333333','#999999','#666666','#CCCCCC','#000000','#FF0000','#33D033','#CCD066','#CC00FF','#9900FF','#0099FF','#003333','#0099CC','#00FF99','#CCFF00') +genecols=c('#99CC00','#CC9900','#FFCC33','#FF9900','#FF6600','#FF3300','#CC3300','#990033','#FF3366','#FF9999','#FFCCFF','#CC99CC','#996699','#993399','#660066','#990066','#660099','#CC00CC','#6600CC','#9966FF','#6633FF','#330099','#0033CC','#3366FF','#0033FF','#00CCFF','#006699','#00CC99','#009966','#33FFCC','#33CC99','#66FFCC','#33CCCC','#99FFFF','#66CC99','#339966','#99CC99','#99FF99','#66CC66','#66FF66','#669933','#336600','#00FF00','#66CC33','#CCFF66','#CCCC99','#FFFFCC','#FFCC99','#FFCCCC','#FF99CC','#CC99FF','#9999CC','#CCCCFF','#333333','#999999','#666666','#CCCCCC','#000000','#FF0000','#33D033','#CCD066','#CC00FF','#9900FF','#0099FF','#003333','#0099CC','#00FF99','#CCFF00') # ,'#','#','#','#','#','#','#','#','#','#','#','#', # '#','#','#','#','#','#','#','#','#','#','#','#','#','#','#','#','#','#','#','#','#','#','#','#','#','#','#','#','#','#','#','#','#','#','#','#','#', @@ -45,44 +46,44 @@ protcols=c('#99CC00','#CC9900','#FFCC33','#FF9900','#FF6600','#FF3300','#CC3300' # '#','#','#','#','#','#','#','#','#','#','#','#','#','#','#','#','#','#','#','#','#','#','#','#','#','#','#','#','#','#','#','#','#','#','#','#','#') # check color redundancy -protcols_unique = sort(unique(protcols)) +genecols_unique = sort(unique(genecols)) print("max_col_number_unique:") -print(length(protcols_unique)) +print(length(genecols_unique)) -if( length(protcols) != length(protcols_unique) ){ - protcols = sort(protcols) - for( icol in 1:length(protcols_unique) ){ - print("protcols_unique de ") +if( length(genecols) != length(genecols_unique) ){ + genecols = sort(genecols) + for( icol in 1:length(genecols_unique) ){ + print("genecols_unique de ") print(icol) print(":") - print(protcols_unique[icol]) - if( protcols_unique[icol] != protcols[icol] ){ - redund_vals = protcols[icol] + print(genecols_unique[icol]) + if( genecols_unique[icol] != genecols[icol] ){ + redund_vals = genecols[icol] print("Color found several times:") print(redund_vals) stop() } } - print("first values identical, added values in protcols:") - for( icol in length(protcols_unique)+1:length(protcols) ){ + print("first values identical, added values in genecols:") + for( icol in length(genecols_unique)+1:length(genecols) ){ print("Color found several times:") - print(protcols[icol]) + print(genecols[icol]) } stop() } -# add missing colors of rainbow 4000 to protcols, to be sure to always have enough colors (mimmivirus =~ 3000 genes) +# add missing colors of rainbow 4000 to genecols, to be sure to always have enough colors (mimmivirus =~ 3000 genes) complete_col_set=rainbow(n=10000) -indexes_to_rm = match(protcols, complete_col_set) +indexes_to_rm = match(genecols, complete_col_set) for(i in indexes_to_rm){ complete_col_set[ -i ] } -protcols = append(protcols, complete_col_set) +genecols = append(genecols, complete_col_set) -# check how many color we can manage -print("max_col_number:") -print(length(protcols)) +# # check how many color we can manage +# print("max_col_number:") +# print(length(genecols)) # print("genes:") @@ -111,6 +112,9 @@ print(length(protcols)) # } # needed to have color labels NOT ORDERED and to choose colors +gene_id_labels = unique(density$gene_id) +density$gene_id_num = paste( sprintf("%03d", match(density$gene_id, gene_id_labels) ), density$gene_id) + protein_id_labels = unique(density$protein_id) density$protein_id_num = paste( sprintf("%03d", match(density$protein_id, protein_id_labels) ), density$protein_id) @@ -118,21 +122,24 @@ density$protein_id_num = paste( sprintf("%03d", match(density$protein_id, prote # stem_loops_labels = unique(density$stem_loops) # density$stem_loops_num = paste( sprintf("%03d", match(density$stem_loops, stem_loops_labels) ), density$stem_loops) -# print("protein_id_num:") -# print(density$protein_id_num) +# print("gene_id_num:") +# print(density$gene_id_num) -# print("protein_id_labels:") -# print(protein_id_labels) +print("gene_id_labels:") +print(gene_id_labels) -p1 = p + geom_point(aes(x = position, y = -0.05, colour = density$protein_id_num, shape = density$gene_id), size = 1, show.legend = F, shape = 15) # add the consensus points, and remove the legend "size" for proteins +p1 = p + geom_point(aes(x = position, y = -0.05, colour = density$gene_id_num, shape = density$protein_id_num), size = 1, show.legend = F, shape = 15) # add the consensus points, and remove the legend "size" for proteins # needed to have more than 6 symbols for gene_id legend -p1bis = p1 + scale_shape_manual(values=c(1:25)) +p1bis = p1 + scale_shape_manual(values=c(1:25,1:25)) # use color more easily distinguishable -p1ter = p1bis + scale_color_manual(values=protcols[1:length(protein_id_labels)]) +p1ter = p1bis + scale_color_manual(values=genecols[1:length(gene_id_labels)]) -p2 = p1ter + geom_point(aes(x = position, y = as.numeric(variant_percent), color = density$protein_id_num, shape = density$gene_id)) # add the variant points +print("length of density protein_id:") +print(length(density$protein_id_num)) +p2 = p1ter + geom_point(aes(x = position, y = as.numeric(variant_percent), color = density$gene_id_num, shape = density$protein_id_num)) # add the variant points +# p2 = p1ter + geom_point(aes(x = position, y = as.numeric(variant_percent), color = density$gene_id_num, shape = density$gene_id)) # add the variant points p3 = p2 + geom_line(aes(x=position, y=threshold)) # add the threshold line @@ -155,15 +162,15 @@ p4 = p3bis + labs(title = t) # add graph title p5 = p4 + xlab("Base Position") # add x axe title p6 = p5 + ylab("Variant Frequency") # add axes and graph titles -p6bis = p6 +guides(shape = guide_legend(order=1, direction="horizontal", title="gene"), - color = guide_legend(order=2, direction="vertical", title="protein")) +p6bis = p6 +guides(shape = guide_legend(order=1, direction="vertical", title="protein"), + color = guide_legend(order=2, direction="horizontal", title="gene")) # p7 = p6 + theme(legend.position = "bottom") # modify the legend position p8 = p6bis + ylim(-0.06,1.2) # modify the scale p9 = p8 + geom_text(aes(x = position, y = variant_percent + 0.03, label = indice, angle = 0)) # add indice to the grapĥ #p10 = p9 + geom_text(x = 0, y = threshold + 0.01, label = t) # add the threshold text #p11 = p10 + geom_line(aes(x = position, y = 0.5), color = "red") -p10 = p9 + geom_line(aes(x = position, y = 0.5), color = "red") +p10 = p9 + geom_line(aes(x = position, y = 0.5), color = "red", legend.position="top") p11 = p10 + theme(plot.title = element_text(hjust=0.5)) From 2946b6e8abb2fd2136bb788ab534e51766df671b Mon Sep 17 00:00:00 2001 From: "f.touzain" Date: Tue, 19 Dec 2023 16:41:20 +0100 Subject: [PATCH 04/42] change 'non coding RNA' to 'non translated RNA' --- src/convert_vcffile_to_readablefile2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/convert_vcffile_to_readablefile2.py b/src/convert_vcffile_to_readablefile2.py index 5979ca9..1ff8f2a 100755 --- a/src/convert_vcffile_to_readablefile2.py +++ b/src/convert_vcffile_to_readablefile2.py @@ -96,7 +96,7 @@ def find_key_proteins(dico_json, genomepos, gene_res): """ This function retrieves the protein name and the start and end position of this protein. """ - protein = 'non coding RNA' + protein = 'non translated RNA' base_inf_allproteins = '' base_sup_allproteins = '' # print("find_key_proteins call pos ("+str(genomepos)+", "+gene_res+")") @@ -112,7 +112,7 @@ def find_key_proteins(dico_json, genomepos, gene_res): protein = 'intergene' base_inf_allproteins = str(base_inf) base_sup_allproteins = str(base_sup) - elif protein == 'non coding RNA': + elif protein == 'non translated RNA': protein = key base_inf_allproteins = str(base_inf) base_sup_allproteins = str(base_sup) From c6d6e8d9d62ddb59dcac75b9e21c2864005cafeb Mon Sep 17 00:00:00 2001 From: "f.touzain" Date: Tue, 19 Dec 2023 16:42:01 +0100 Subject: [PATCH 05/42] add ':' after numbers used to sort genes and prot and keep order of apparition in the graph --- src/visualize_snp_v4.R | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/visualize_snp_v4.R b/src/visualize_snp_v4.R index 2804dfd..a32ac88 100755 --- a/src/visualize_snp_v4.R +++ b/src/visualize_snp_v4.R @@ -113,10 +113,10 @@ genecols = append(genecols, complete_col_set) # needed to have color labels NOT ORDERED and to choose colors gene_id_labels = unique(density$gene_id) -density$gene_id_num = paste( sprintf("%03d", match(density$gene_id, gene_id_labels) ), density$gene_id) +density$gene_id_num = paste( sprintf("%03d", match(density$gene_id, gene_id_labels) ), ":", density$gene_id) protein_id_labels = unique(density$protein_id) -density$protein_id_num = paste( sprintf("%03d", match(density$protein_id, protein_id_labels) ), density$protein_id) +density$protein_id_num = paste( sprintf("%03d", match(density$protein_id, protein_id_labels) ), ":", density$protein_id) # # TODO stem_loop # stem_loops_labels = unique(density$stem_loops) @@ -136,8 +136,6 @@ p1bis = p1 + scale_shape_manual(values=c(1:25,1:25)) # use color more easily distinguishable p1ter = p1bis + scale_color_manual(values=genecols[1:length(gene_id_labels)]) -print("length of density protein_id:") -print(length(density$protein_id_num)) p2 = p1ter + geom_point(aes(x = position, y = as.numeric(variant_percent), color = density$gene_id_num, shape = density$protein_id_num)) # add the variant points # p2 = p1ter + geom_point(aes(x = position, y = as.numeric(variant_percent), color = density$gene_id_num, shape = density$gene_id)) # add the variant points @@ -162,15 +160,15 @@ p4 = p3bis + labs(title = t) # add graph title p5 = p4 + xlab("Base Position") # add x axe title p6 = p5 + ylab("Variant Frequency") # add axes and graph titles -p6bis = p6 +guides(shape = guide_legend(order=1, direction="vertical", title="protein"), - color = guide_legend(order=2, direction="horizontal", title="gene")) +p6bis = p6 +guides(shape = guide_legend(order=1, direction="vertical", title="protein", legend.position="top"), + color = guide_legend(order=2, direction="horizontal", title="gene", legend.position="top")) # p7 = p6 + theme(legend.position = "bottom") # modify the legend position p8 = p6bis + ylim(-0.06,1.2) # modify the scale p9 = p8 + geom_text(aes(x = position, y = variant_percent + 0.03, label = indice, angle = 0)) # add indice to the grapĥ #p10 = p9 + geom_text(x = 0, y = threshold + 0.01, label = t) # add the threshold text #p11 = p10 + geom_line(aes(x = position, y = 0.5), color = "red") -p10 = p9 + geom_line(aes(x = position, y = 0.5), color = "red", legend.position="top") +p10 = p9 + geom_line(aes(x = position, y = 0.5), color = "red") p11 = p10 + theme(plot.title = element_text(hjust=0.5)) From 61da32f2e494b78de5711cc6641dc22f39145e17 Mon Sep 17 00:00:00 2001 From: "f.touzain" Date: Tue, 19 Dec 2023 19:29:58 +0100 Subject: [PATCH 06/42] modified to add parameter for cov display file to allow cov depth display in graph above variants/annotations --- src/visualize_snp_v4.R | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/visualize_snp_v4.R b/src/visualize_snp_v4.R index a32ac88..a555087 100755 --- a/src/visualize_snp_v4.R +++ b/src/visualize_snp_v4.R @@ -28,6 +28,13 @@ if(inherits(contig_limits,"try-error")) threshold = as.numeric(args[3]) # define threshold +outfile = args[4] + +# to prepare coverage depth graph above variant/annotation graph +coverage_depth <- try( read.table(args[5], h=F, sep = "\t") ) # read the dataframe +if(inherits(coverage_depth,"try-error")) + coverage_depth <- NULL + t1 = as.character(threshold) # define threshold as a character t = paste("Nucleotide Variation - threshold = ", t1, sep = ' ') @@ -113,11 +120,13 @@ genecols = append(genecols, complete_col_set) # needed to have color labels NOT ORDERED and to choose colors gene_id_labels = unique(density$gene_id) -density$gene_id_num = paste( sprintf("%03d", match(density$gene_id, gene_id_labels) ), ":", density$gene_id) +density$gene_id_num = paste( sprintf("%02d", match(density$gene_id, gene_id_labels) ), ":", density$gene_id) protein_id_labels = unique(density$protein_id) density$protein_id_num = paste( sprintf("%03d", match(density$protein_id, protein_id_labels) ), ":", density$protein_id) +# protein_id_ordered = fct_reorder(protein_id_labels, density$protein_id_num) + # # TODO stem_loop # stem_loops_labels = unique(density$stem_loops) # density$stem_loops_num = paste( sprintf("%03d", match(density$stem_loops, stem_loops_labels) ), density$stem_loops) @@ -160,8 +169,8 @@ p4 = p3bis + labs(title = t) # add graph title p5 = p4 + xlab("Base Position") # add x axe title p6 = p5 + ylab("Variant Frequency") # add axes and graph titles -p6bis = p6 +guides(shape = guide_legend(order=1, direction="vertical", title="protein", legend.position="top"), - color = guide_legend(order=2, direction="horizontal", title="gene", legend.position="top")) +p6bis = p6 +guides(shape = guide_legend(order=1, direction="vertical", title="proteins (order: protein names)"), + color = guide_legend(order=2, direction="horizontal", title="genes (order: gene names)")) # p7 = p6 + theme(legend.position = "bottom") # modify the legend position p8 = p6bis + ylim(-0.06,1.2) # modify the scale @@ -172,6 +181,6 @@ p10 = p9 + geom_line(aes(x = position, y = 0.5), color = "red") p11 = p10 + theme(plot.title = element_text(hjust=0.5)) -ggsave(args[4], device = "png", plot = last_plot(), width = 20, dpi = 600) # save the graph +ggsave(outfile, device = "png", plot = last_plot(), width = 20, dpi = 600) # save the graph # ~ end of script ~ From 2a3939dff0bc74ea35de17330bef51e7dc8ec8eb Mon Sep 17 00:00:00 2001 From: "f.touzain" Date: Tue, 19 Dec 2023 19:30:29 +0100 Subject: [PATCH 07/42] add call to correct_covdepth_f.py, TODO:test --- src/vvv2_display.py | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/src/vvv2_display.py b/src/vvv2_display.py index 4b88a62..7d95a07 100755 --- a/src/vvv2_display.py +++ b/src/vvv2_display.py @@ -44,6 +44,9 @@ def __main__(): prog_tag = '[' + os.path.basename(__file__) + ']' + # to record if we display cov depth in graph or not (depends on provided intputs) + b_cov_depth_display = False + # -------------------------------------- # input files # -------------------------------------- @@ -96,6 +99,12 @@ def __main__(): parser.add_argument("-r", "--png_var_f", dest='png_var_f', help="out: png file with variant proportions and annotations", metavar="FILE") + parser.add_argument("-o", "--cov_depth_f", dest='cov_depth_f', + help="[optional] in: text file of coverage depths (given by samtools depth)", + metavar="FILE") + parser.add_argument("-e", "--cov_depth_corr_f", dest='cov_depth_corr_f', + help="[optional] out: text file of coverage depths with cumulated position in case of several contigs, for display (tmp file, for galaxy compatibility)", + metavar="FILE") parser.add_argument("-t", "--snp_loc_f", dest='snp_loc_f', help="[optional] out: variant description for relevant positions, txt file (if not provided, file name deduced from png name)", metavar="FILE") @@ -205,6 +214,9 @@ def __main__(): snp_loc_f = os.path.abspath(args.snp_loc_f) if args.snp_loc_summary_f is not None: snp_loc_summary_f = os.path.abspath(args.snp_loc_summary_f) + if args.cov_depth_f is not None: + cov_depth_f = os.path.abspath(args.cov_depth_f) + b_cov_depth_display = True # ---------------------------------------------------------------- # optional arguments only for Galaxy compatibility if args.json_annot_f is not None: @@ -215,6 +227,8 @@ def __main__(): correct_vcf_f = os.path.abspath(args.correct_vcf_f) if args.contig_limits_f is not None: contig_limits_f = os.path.abspath(args.contig_limits_f) + if args.cov_depth_corr_f is not None: + cov_depth_corr_f = os.path.abspath(args.cov_depth_corr_f) # ---------------------------------------------------------------- if args.b_verbose is not None: @@ -431,7 +445,17 @@ def __main__(): if(png_var_f == ''): png_var_f = snp_loc_f png_var_f = re.sub('\.[^\.]+$', '.png', png_var_f) - + + # creates corrected cov depth file + if b_cov_depth_display: + cmd = cmd + " ".join([ + " --cov_depth_f", cov_depth_f, + " --cov_depth_corr_f", cov_depth_corr_f, + " " + ]) + print(prog_tag + " cmd:" + cmd) + os.system(cmd) + # env r-env.yaml r_script = R_SCRIPTS + "visualize_snp_v4.R" threshold = "0.07" @@ -440,8 +464,13 @@ def __main__(): snp_loc_f, contig_limits_f, threshold, - png_var_f, - " < ", r_script, " > /dev/null"]) + png_var_f]) + + # parameter to allow coverage depth display above variants/annotations + if b_cov_depth_display: + cmd = cmd + cov_depth_corr_f + " " + + cmd = cmd + " ".join([" < ", r_script, " > /dev/null"]) print(prog_tag + " cmd:" + cmd) if b_test_visualize_snp_v4: From 51ee90d35690cd087b00b536ec2edb97b88904f4 Mon Sep 17 00:00:00 2001 From: "f.touzain" Date: Tue, 19 Dec 2023 19:30:56 +0100 Subject: [PATCH 08/42] script to correct cov depth position if several contigs --- src/correct_covdepth_f.py | 151 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 151 insertions(+) create mode 100755 src/correct_covdepth_f.py diff --git a/src/correct_covdepth_f.py b/src/correct_covdepth_f.py new file mode 100755 index 0000000..ea6afd1 --- /dev/null +++ b/src/correct_covdepth_f.py @@ -0,0 +1,151 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# This file is part of the vvv2_display distribution (https://github.com/ANSES-Ploufragan/vvv2_display). +# Copyright (c) 2023 Fabrice Touzain. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, version 3. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +### +# USE PYTHON3 +# from vcf file of vardict and bed file with contig positions, creates a vcf file with +# contigs shifted position (to create a picture of variants and annotations) +### +import argparse, os, sys, csv, re, warnings +from os import path +import subprocess + +# to be able to report line number in error messages +import inspect +frame = inspect.currentframe() + +# debug +b_test_correct_covdepth_f = False # 2023 12 19 +b_test = False + +prog_tag = '[' + os.path.basename(__file__) + ']' + +covdepth_f = '' # in, covdepth file provided by samtools depth -aa file.bam +correct_covdepth_f = '' # out same as previous but positions on contigs are shifted according to previous contigs + +parser = argparse.ArgumentParser() +parser.add_argument("-s", "--covdepth_f", dest='covdepth_f', + help="in: coverage depth file provided by samtools -aa in.bam (all contigs)", + metavar="FILE") +parser.add_argument("-c", "--correct_covdepth_f", dest='correct_covdepth_f', + help="out: coverage depth file corrected for position (cumulative position in case of several contigs)", + metavar="FILE") +parser.add_argument("-z", "--test_all", dest='b_test_all', + help="[Optional] run all tests", + action='store_true') +parser.add_argument("-v", "--verbose", dest='b_verbose', + help="[Optional] To have details on records when running", + action='store_true') +parser.set_defaults(b_test_all=False) +parser.set_defaults(b_verbose=False) + +# get absolute path in case of files +args = parser.parse_args() + +# ------------------------------------------- +# check arguments +b_test_all = args.b_test_all + +if b_test_all: + b_test_correct_covdepth_f = True + b_test = True +else: + b_test = b_test_correct_covdepth_f + +if ((not b_test)and + ((len(sys.argv) < 2) or (len(sys.argv) > 10))): + parser.print_help() + print(prog_tag + "[Error] we found "+str(len(sys.argv)) + + " arguments, exit line "+str(frame.f_lineno)) + sys.exit(0) + +# print('args:', args) +if args.covdepth_f is not None: + covdepth_f = os.path.abspath(args.covdepth_f) +elif(not b_test): + sys.exit(prog_tag + "[Error] You must provide covdepth_f") +if args.correct_covdepth_f is not None: + correct_covdepth_f = os.path.abspath(args.correct_covdepth_f) +elif(not b_test): + sys.exit(prog_tag + "[Error] You must provide correct_covdepth_f") + +if args.b_verbose is not None: + b_verbose = args.b_verbose + +if b_test_correct_covdepth_f: + test_dir = '../test_vvv2_display/' + test_name = [ + 'res' + # 'res2' + ] + for resn in test_name: + covdepth_f = test_dir + "res_vvv2_covdepth.txt" + correct_covdepth_f = test_dir + "res_vvv2_covdepth_corrected.txt" + cmd = ' '.join(['./correct_covdepth_f.py', + "-s", covdepth_f, + "-c", correct_covdepth_f + ]) + if b_verbose: + cmd = cmd + " -v" + print(prog_tag + " cmd:"+cmd) + print(prog_tag + " START") + os.system(cmd) + print(prog_tag + " END") + sys.exit() + +# ---------------------------------------------------------------------- +# reads seqstat file to get info on genome_length and on contigs lengths +# ---------------------------------------------------------------------- +print(prog_tag + " Reads "+covdepth_f+" file") +print(prog_tag + " Creates "+correct_covdepth_f+" file") +o = open(correct_covdepth_f, 'w+') + +# init var +prev_contig_nr = 1 +contig_num = 0 + +pos = 0 +curr_shift = 0 +with open(covdepth_f) as cdf: + for line in cdf: + line_fields = line.split("\t") + # print("line_fields:"+','.join(line_fields)) + contig_nr = line_fields[0] + contig_pos = line_fields[1] + covdepth = line_fields[2] + # print("line_fields:"+','.join(line_fields)) + if b_verbose: + print(prog_tag + " contig:"+contig_name+" length:"+str(contig_length)) + + # write in output file + # if contig change, record shift to apply + if contig_nr != prev_contig_nr: + curr_shift = pos + # get position WITH SHIFT + pos = curr_shift + int(contig_pos) + pos_str = str(pos) + # write pos and related coverage depth + o.write("\t".join([pos_str, covdepth])) + if b_verbose: + print("record pos:",pos_str,", covdepth:", covdepth) + + prev_contig_nr = contig_nr +cdf.close() +o.close() + +print(prog_tag + ' '+ correct_covdepth_f + " file created") From f37ecab50cc82457077838dd61b20ec0b2a6a587 Mon Sep 17 00:00:00 2001 From: "f.touzain" Date: Wed, 20 Dec 2023 14:03:37 +0100 Subject: [PATCH 09/42] var name changed for homogeneity, file names too --- src/correct_covdepth_f.py | 52 +++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/src/correct_covdepth_f.py b/src/correct_covdepth_f.py index ea6afd1..7ce917e 100755 --- a/src/correct_covdepth_f.py +++ b/src/correct_covdepth_f.py @@ -30,19 +30,19 @@ frame = inspect.currentframe() # debug -b_test_correct_covdepth_f = False # 2023 12 19 +b_test_cov_depth_corr_f = False # 2023 12 19 b_test = False prog_tag = '[' + os.path.basename(__file__) + ']' -covdepth_f = '' # in, covdepth file provided by samtools depth -aa file.bam -correct_covdepth_f = '' # out same as previous but positions on contigs are shifted according to previous contigs +cov_depth_f = '' # in, cov_depth file provided by samtools depth -aa file.bam +cov_depth_corr_f = '' # out same as previous but positions on contigs are shifted according to previous contigs parser = argparse.ArgumentParser() -parser.add_argument("-s", "--covdepth_f", dest='covdepth_f', +parser.add_argument("-s", "--cov_depth_f", dest='cov_depth_f', help="in: coverage depth file provided by samtools -aa in.bam (all contigs)", metavar="FILE") -parser.add_argument("-c", "--correct_covdepth_f", dest='correct_covdepth_f', +parser.add_argument("-c", "--cov_depth_corr_f", dest='cov_depth_corr_f', help="out: coverage depth file corrected for position (cumulative position in case of several contigs)", metavar="FILE") parser.add_argument("-z", "--test_all", dest='b_test_all', @@ -62,10 +62,10 @@ b_test_all = args.b_test_all if b_test_all: - b_test_correct_covdepth_f = True + b_test_cov_depth_corr_f = True b_test = True else: - b_test = b_test_correct_covdepth_f + b_test = b_test_cov_depth_corr_f if ((not b_test)and ((len(sys.argv) < 2) or (len(sys.argv) > 10))): @@ -75,30 +75,30 @@ sys.exit(0) # print('args:', args) -if args.covdepth_f is not None: - covdepth_f = os.path.abspath(args.covdepth_f) +if args.cov_depth_f is not None: + cov_depth_f = os.path.abspath(args.cov_depth_f) elif(not b_test): - sys.exit(prog_tag + "[Error] You must provide covdepth_f") -if args.correct_covdepth_f is not None: - correct_covdepth_f = os.path.abspath(args.correct_covdepth_f) + sys.exit(prog_tag + "[Error] You must provide cov_depth_f") +if args.cov_depth_corr_f is not None: + cov_depth_corr_f = os.path.abspath(args.cov_depth_corr_f) elif(not b_test): - sys.exit(prog_tag + "[Error] You must provide correct_covdepth_f") + sys.exit(prog_tag + "[Error] You must provide cov_depth_corr_f") if args.b_verbose is not None: b_verbose = args.b_verbose -if b_test_correct_covdepth_f: +if b_test_cov_depth_corr_f: test_dir = '../test_vvv2_display/' test_name = [ 'res' # 'res2' ] for resn in test_name: - covdepth_f = test_dir + "res_vvv2_covdepth.txt" - correct_covdepth_f = test_dir + "res_vvv2_covdepth_corrected.txt" + cov_depth_f = test_dir + "res2_covdepth.txt" + cov_depth_corr_f = test_dir + "res2_covdepth_corrected.txt" cmd = ' '.join(['./correct_covdepth_f.py', - "-s", covdepth_f, - "-c", correct_covdepth_f + "-s", cov_depth_f, + "-c", cov_depth_corr_f ]) if b_verbose: cmd = cmd + " -v" @@ -111,9 +111,9 @@ # ---------------------------------------------------------------------- # reads seqstat file to get info on genome_length and on contigs lengths # ---------------------------------------------------------------------- -print(prog_tag + " Reads "+covdepth_f+" file") -print(prog_tag + " Creates "+correct_covdepth_f+" file") -o = open(correct_covdepth_f, 'w+') +print(prog_tag + " Reads "+cov_depth_f+" file") +print(prog_tag + " Creates "+cov_depth_corr_f+" file") +o = open(cov_depth_corr_f, 'w+') # init var prev_contig_nr = 1 @@ -121,13 +121,13 @@ pos = 0 curr_shift = 0 -with open(covdepth_f) as cdf: +with open(cov_depth_f) as cdf: for line in cdf: line_fields = line.split("\t") # print("line_fields:"+','.join(line_fields)) contig_nr = line_fields[0] contig_pos = line_fields[1] - covdepth = line_fields[2] + cov_depth = line_fields[2] # print("line_fields:"+','.join(line_fields)) if b_verbose: print(prog_tag + " contig:"+contig_name+" length:"+str(contig_length)) @@ -140,12 +140,12 @@ pos = curr_shift + int(contig_pos) pos_str = str(pos) # write pos and related coverage depth - o.write("\t".join([pos_str, covdepth])) + o.write("\t".join([pos_str, cov_depth])) if b_verbose: - print("record pos:",pos_str,", covdepth:", covdepth) + print("record pos:",pos_str,", cov_depth:", cov_depth) prev_contig_nr = contig_nr cdf.close() o.close() -print(prog_tag + ' '+ correct_covdepth_f + " file created") +print(prog_tag + ' '+ cov_depth_corr_f + " file created") From b4d0b989274d9b37977a03ba3d1d0853d03d4ac3 Mon Sep 17 00:00:00 2001 From: "f.touzain" Date: Wed, 20 Dec 2023 14:04:38 +0100 Subject: [PATCH 10/42] add test for src/correct_covdepth_f.py --- src/vvv2_display.py | 54 ++++++++++++++++++++++++++++++++------------- 1 file changed, 39 insertions(+), 15 deletions(-) diff --git a/src/vvv2_display.py b/src/vvv2_display.py index 7d95a07..e2a6b5c 100755 --- a/src/vvv2_display.py +++ b/src/vvv2_display.py @@ -37,6 +37,7 @@ def __main__(): b_test_convert_tbl2json = False # ok 2022 04 26 complete tc, b_test_correct_multicontig_vardict_vcf = False # ok 2022 04 29 partial tc b_test_convert_vcffile_to_readable = False # ok 2022 04 28 complete tc, + b_test_correct_covdepth_f = True # 2023 12 20 tc, b_test_visualize_snp_v4 = False # ok 2022 04 28 complete tc, b_test = False dir_path = os.path.dirname(os.path.abspath(__file__)) # dir of current script @@ -55,6 +56,8 @@ def __main__(): seq_stat_f = '' # in, seq stat file from vadr vardict_vcf_f = '' # in, vcf file from VarDict correct_vcf_f = '' # in, vcf file corrected for pos (when multicontigs) + cov_depth_f = '' # in, cov depth file provided by samtools depth -aa + cov_depth_corr_f= '' # in, cov depth corrected for pos (when pulticontigs) # -------------------------------------- # -------------------------------------- @@ -138,6 +141,9 @@ def __main__(): parser.add_argument("-d", "--test_visualize_snp_v4", dest='b_test_visualize_snp_v4', help="[Optional] run test to visualize snp in a png", action='store_true') + parser.add_argument("-g", "--test_correct_covdepth_f", dest='b_test_correct_covdepth_f', + help="[Optional] run test to correct position in cov depth file", + action='store_true') parser.add_argument("-v", "--verbose", dest='b_verbose', help="[Optional] To have details on records when running", action='store_true') @@ -158,24 +164,27 @@ def __main__(): b_test_correct_multicontig_vardict_vcf = args.b_test_correct_multicontig_vardict_vcf b_test_convert_vcffile_to_readable = args.b_test_convert_vcffile_to_readable b_test_visualize_snp_v4 = args.b_test_visualize_snp_v4 + b_test_correct_covdepth_f = args.b_test_correct_covdepth_f if b_test_vvv2_display: b_test_convert_tbl2json = True b_test_correct_multicontig_vardict_vcf = True b_test_convert_vcffile_to_readable = True b_test_visualize_snp_v4 = True + b_test_correct_covdepth_f = True b_test = True else: b_test = (b_test_vvv2_display or b_test_convert_tbl2json or b_test_correct_multicontig_vardict_vcf or b_test_convert_vcffile_to_readable or + b_test_correct_covdepth_f or b_test_visualize_snp_v4) # print(f"b_test:{b_test}") # print(f"b_test_convert_tbl2json:{b_test_convert_tbl2json}") if ((not b_test)and - ((len(sys.argv) < 9) or (len(sys.argv) > 23))): + ((len(sys.argv) < 9) or (len(sys.argv) > 25))): print("\n".join([prog_tag, "Aim: Display of SNP proportions, annotations, for an assembly", "in:", @@ -246,10 +255,12 @@ def __main__(): pass_annot_f = test_dir + "/res2_vadr_pass.tbl" # from vadr results fail_annot_f = test_dir + "/res2_vadr_fail.tbl" # from vadr results seq_stat_f = test_dir + "/res2_vadr.seqstat" # from vadr results - vardict_vcf_f = test_dir + "/res2_vardict.vcf" # from lofreq results + vardict_vcf_f = test_dir + "/res2_vardict.vcf" # from lofreq results + cov_depth_f = test_dir + "/res2_covdepth.txt" # from samtools results # tmp out files json_annot_f = test_dir + "/res2_vadr.json" # from convert_tbl2json.py contig_limits_f= test_dir + "/contig_limits.txt" + cov_depth_corr_f= test_dir + "/res2_covdepth_corr.txt" # final out file png_var_f = test_dir + "/res2_vvv2.png" # from ... cmd = ' '.join([ dir_path + "/vvv2_display.py", @@ -257,7 +268,9 @@ def __main__(): "--fail_tbl_f", fail_annot_f, "--seq_stat_f", seq_stat_f, "--vcf_f", vardict_vcf_f, - "--contig_limits_f", contig_limits_f, + "--contig_limits_f", contig_limits_f, + "--cov_depth_f", cov_depth_f, + "--cov_depth_corr_f", cov_depth_corr_f, "--png_var_f", png_var_f ]) print(prog_tag + " START") @@ -283,7 +296,9 @@ def __main__(): "--fail_tbl_f", fail_annot_f, "--seq_stat_f", seq_stat_f, "--vcf_f", vardict_vcf_f, - "--contig_limits_f", contig_limits_f, + "--contig_limits_f", contig_limits_f, + "--cov_depth_f", cov_depth_f, + "--cov_depth_corr_f", cov_depth_corr_f, "--png_var_f", png_var_f ]) print(prog_tag + " START") @@ -427,6 +442,25 @@ def __main__(): os.system(cmd) # ------------------------------------------------------------------ + # creates corrected cov depth file + if b_cov_depth_display: + if b_test_correct_covdepth_f and (not b_test_vvv2_display): + cov_depth_f = test_dir + "/res_vvv2_covdepth.txt" + cov_depth_corr_f = test_dir + "/res_vvv2_covdepth_corrected.txt" + cmd = " ".join([ + PYTHON_SCRIPTS + "correct_covdepth_f.py", + "--cov_depth_f", cov_depth_f, + "--cov_depth_corr_f", cov_depth_corr_f + ]) + if b_test_correct_covdepth_f and (not b_test_vvv2_display): + print(prog_tag + " [test_correct_covdepth_f] START") + print(prog_tag + " cmd:" + cmd) + os.system(cmd) + print(prog_tag + " [test_correct_covdepth_f] END") + sys.exit() + else: + print(prog_tag + " cmd:" + cmd) + os.system(cmd) # ------------------------------------------------------------------ # creates png graphic of variants from snp file and threshold @@ -446,16 +480,6 @@ def __main__(): png_var_f = snp_loc_f png_var_f = re.sub('\.[^\.]+$', '.png', png_var_f) - # creates corrected cov depth file - if b_cov_depth_display: - cmd = cmd + " ".join([ - " --cov_depth_f", cov_depth_f, - " --cov_depth_corr_f", cov_depth_corr_f, - " " - ]) - print(prog_tag + " cmd:" + cmd) - os.system(cmd) - # env r-env.yaml r_script = R_SCRIPTS + "visualize_snp_v4.R" threshold = "0.07" @@ -468,7 +492,7 @@ def __main__(): # parameter to allow coverage depth display above variants/annotations if b_cov_depth_display: - cmd = cmd + cov_depth_corr_f + " " + cmd = cmd + " " + cov_depth_corr_f + " " cmd = cmd + " ".join([" < ", r_script, " > /dev/null"]) print(prog_tag + " cmd:" + cmd) From eb0a9edf0f26115e55beef73b30ecf57340ea4b6 Mon Sep 17 00:00:00 2001 From: "f.touzain" Date: Wed, 20 Dec 2023 18:14:53 +0100 Subject: [PATCH 11/42] add R dependencies --- src/vvv2_display.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vvv2_display.yaml b/src/vvv2_display.yaml index f60f383..b9d51e0 100644 --- a/src/vvv2_display.yaml +++ b/src/vvv2_display.yaml @@ -10,4 +10,4 @@ dependencies: - ca-certificates - openssl - r-ggplot2=3.3.6 - + - r-gridextra From 0a5d8672e02c0a07f980bc54babd24842ad99a19 Mon Sep 17 00:00:00 2001 From: "f.touzain" Date: Wed, 20 Dec 2023 18:15:04 +0100 Subject: [PATCH 12/42] add R dependencies --- setup.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 1eef2b7..94fb47f 100644 --- a/setup.py +++ b/setup.py @@ -26,7 +26,7 @@ setuptools.setup( name="vvv2_display", # Required - version="0.1.10", # Required + version="0.1.11", # Required description="Viral Variant Visualizer 2 display", # Optional long_description=long_description, long_description_content_type="text/markdown", # Optional (see note above) @@ -59,6 +59,7 @@ "convert_tbl2json.py", "convert_vcffile_to_readablefile2.py", "correct_multicontig_vardict_vcf.py", + "correct_covdepth_f.py", "visualize_coverage_depth.R", "visualize_snp_v4.R" ] @@ -69,6 +70,7 @@ install_requires=[ "python>=3.9", "r-ggplot2==3.6.6", + "r-gridextra=2.3" "pysam==0.19.1", "numpy==1.23.1"], # Optional scripts=[ @@ -76,6 +78,7 @@ "src/convert_tbl2json.py", "src/convert_vcffile_to_readablefile2.py", "src/correct_multicontig_vardict_vcf.py", + "correct_covdepth_f.py", "src/visualize_coverage_depth.R", "src/visualize_snp_v4.R" ], From 4c60a209a75e5a777911d64e254449fe898799e5 Mon Sep 17 00:00:00 2001 From: "f.touzain" Date: Wed, 20 Dec 2023 18:15:29 +0100 Subject: [PATCH 13/42] cov depth chgmt --- CHANGELOG | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG b/CHANGELOG index e554c43..c5d5840 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,4 +1,11 @@ -0.1.10-beta: +0.1.11: + - display prot instead of genes and genes instead of prot + - add num in front of genes to ensure to keep apparition order + - replace fstrings in vvv2_display.py to allow better compatibility + - add src/correct_covdepth_f.py script to convert cov depth files of samtools depth so as to have cumulative positions for display + - prepare vvv2_display to coverage depth addition in input + - add new script to setup.py file and change version to 0.1.11 +0.1.10: - now horizontal line colors are for proteins, dot shape is for genes. Added vertical dotted lines to show contig limits - correction in vadr parsing using different virus models - tested on calicivirus, dengue, flavivirus, coronavirus, sarscov2, hcv, norovirus From 74dd436f38855542448aed351d25d06504ed7bfa Mon Sep 17 00:00:00 2001 From: "f.touzain" Date: Wed, 20 Dec 2023 18:15:50 +0100 Subject: [PATCH 14/42] add coverage depth --- src/visualize_snp_v4.R | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/src/visualize_snp_v4.R b/src/visualize_snp_v4.R index a555087..06b6b50 100755 --- a/src/visualize_snp_v4.R +++ b/src/visualize_snp_v4.R @@ -16,6 +16,8 @@ # install.packages("viridis") # library("viridis") library(ggplot2) # import the ggplot2 library +library(gridExtra) # multi graph on the same figure + args <- commandArgs(TRUE) # all arguments are character types density <- try( read.table(args[1], h=T, sep = "\t") ) # read the dataframe @@ -38,10 +40,22 @@ if(inherits(coverage_depth,"try-error")) t1 = as.character(threshold) # define threshold as a character t = paste("Nucleotide Variation - threshold = ", t1, sep = ' ') +if(! is.null(coverage_depth)){ # means covdepth provided, we prepare var for graph + maxi = max(coverage_depth$V2) + mini = min(coverage_depth$V2) + med = median(coverage_depth$V2) + m = paste("Coverage - median_coverage =", med, "[", mini, ":", maxi, "]", sep = " ") + cd = ggplot(coverage_depth, aes(x = V1, y = V2)) + geom_line(color = "black", size = 0.5) + # + labs(list(title = m, x = "Base Position", y = "Number of Reads") + cd1 = cd + labs(title = m) # add graph title + cd2 = cd1 + xlab("Base Position") # add x axe title + cd3 = cd2 + ylab("Number of reads") # add axes and graph titles +} + if(is.null(density)){ # means no snp found - p = ggplot() # plot initialization + p = ggplot() # plot initialization }else{ - p = ggplot(density) # plot initialization + p = ggplot(density) # plot initialization } # defined a wide color palet manually to ensure better visibility, the other colors defined later will not be so distinctive @@ -169,8 +183,7 @@ p4 = p3bis + labs(title = t) # add graph title p5 = p4 + xlab("Base Position") # add x axe title p6 = p5 + ylab("Variant Frequency") # add axes and graph titles -p6bis = p6 +guides(shape = guide_legend(order=1, direction="vertical", title="proteins (order: protein names)"), - color = guide_legend(order=2, direction="horizontal", title="genes (order: gene names)")) +p6bis = p6 +guides(shape = guide_legend(order=1, direction="vertical", title="proteins (order: protein names)"), color = guide_legend(order=2, direction="horizontal", title="genes (order: gene names)")) # p7 = p6 + theme(legend.position = "bottom") # modify the legend position p8 = p6bis + ylim(-0.06,1.2) # modify the scale @@ -178,9 +191,13 @@ p9 = p8 + geom_text(aes(x = position, y = variant_percent + 0.03, label = indice #p10 = p9 + geom_text(x = 0, y = threshold + 0.01, label = t) # add the threshold text #p11 = p10 + geom_line(aes(x = position, y = 0.5), color = "red") p10 = p9 + geom_line(aes(x = position, y = 0.5), color = "red") -p11 = p10 + theme(plot.title = element_text(hjust=0.5)) - +p11 = p10 + theme(plot.title = element_text(hjust=0.5),legend.position="bottom") # -ggsave(outfile, device = "png", plot = last_plot(), width = 20, dpi = 600) # save the graph +grid.arrange(cd, p11, nrow=2 + , widths = c(2, 2), heights = c(1.0, 3.0) +) +g <- arrangeGrob(cd3, p11) +ggsave(outfile, device = "png", plot = g, width = 21, units="cm", height=30, dpi = 600) # save the graph +# 1 inch = 2.54cm # ~ end of script ~ From 4a29163fa15c0e7347236d9efef6e695bb062b82 Mon Sep 17 00:00:00 2001 From: "f.touzain" Date: Thu, 21 Dec 2023 21:55:34 +0100 Subject: [PATCH 15/42] improve covdepth and variant display. Still problematic: legend of variants not in the width of the page --- src/visualize_snp_v4.R | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/src/visualize_snp_v4.R b/src/visualize_snp_v4.R index 06b6b50..0ba06f5 100755 --- a/src/visualize_snp_v4.R +++ b/src/visualize_snp_v4.R @@ -17,6 +17,7 @@ # library("viridis") library(ggplot2) # import the ggplot2 library library(gridExtra) # multi graph on the same figure +library(cowplot) args <- commandArgs(TRUE) # all arguments are character types @@ -38,20 +39,22 @@ if(inherits(coverage_depth,"try-error")) coverage_depth <- NULL t1 = as.character(threshold) # define threshold as a character -t = paste("Nucleotide Variation - threshold = ", t1, sep = ' ') +# t = paste("Nucleotide Variation - threshold = ", t1, sep = ' ') if(! is.null(coverage_depth)){ # means covdepth provided, we prepare var for graph maxi = max(coverage_depth$V2) mini = min(coverage_depth$V2) med = median(coverage_depth$V2) m = paste("Coverage - median_coverage =", med, "[", mini, ":", maxi, "]", sep = " ") - cd = ggplot(coverage_depth, aes(x = V1, y = V2)) + geom_line(color = "black", size = 0.5) + options(repr.plot.width = 5, repr.plot.height =1) + cd = ggplot(coverage_depth, aes(x = V1, y = V2)) + geom_line(color = "black", linewidth = 0.5) # + labs(list(title = m, x = "Base Position", y = "Number of Reads") cd1 = cd + labs(title = m) # add graph title - cd2 = cd1 + xlab("Base Position") # add x axe title - cd3 = cd2 + ylab("Number of reads") # add axes and graph titles + cd2 = cd1 + xlab("") # add x axe title + cd3 = cd2 + ylab("Number of reads") # add axes and graph titles } +options(repr.plot.width = 5, repr.plot.height =5) if(is.null(density)){ # means no snp found p = ggplot() # plot initialization }else{ @@ -151,7 +154,7 @@ density$protein_id_num = paste( sprintf("%03d", match(density$protein_id, prote print("gene_id_labels:") print(gene_id_labels) -p1 = p + geom_point(aes(x = position, y = -0.05, colour = density$gene_id_num, shape = density$protein_id_num), size = 1, show.legend = F, shape = 15) # add the consensus points, and remove the legend "size" for proteins +p1 = p + geom_point(aes(x = position, y = -0.05, colour = density$gene_id_num, shape = density$protein_id_num), size = 1, show.legend = F, shape = 15) # add the consensus points, and remove the legend "linewidth" for proteins # needed to have more than 6 symbols for gene_id legend p1bis = p1 + scale_shape_manual(values=c(1:25,1:25)) @@ -179,25 +182,23 @@ if(! is.null(contig_limits)){ # means more than 1 contig # } -p4 = p3bis + labs(title = t) # add graph title -p5 = p4 + xlab("Base Position") # add x axe title +# p4 = p3bis + labs(title = t) # add graph title +p5 = p3bis + xlab("Base Position") # add x axe title p6 = p5 + ylab("Variant Frequency") # add axes and graph titles -p6bis = p6 +guides(shape = guide_legend(order=1, direction="vertical", title="proteins (order: protein names)"), color = guide_legend(order=2, direction="horizontal", title="genes (order: gene names)")) +p6bis = p6 +guides(shape = guide_legend(order=1, direction="vertical", title="proteins (order: protein names)", nrow=5), color = guide_legend(order=2, direction="horizontal", title="genes (order: gene names)"),nrow=3) # p7 = p6 + theme(legend.position = "bottom") # modify the legend position p8 = p6bis + ylim(-0.06,1.2) # modify the scale -p9 = p8 + geom_text(aes(x = position, y = variant_percent + 0.03, label = indice, angle = 0)) # add indice to the grapĥ +p9 = p8 + geom_text(aes(x = position, y = variant_percent + 0.03, label = indice, angle = 0)) # add indice to the graph #p10 = p9 + geom_text(x = 0, y = threshold + 0.01, label = t) # add the threshold text #p11 = p10 + geom_line(aes(x = position, y = 0.5), color = "red") p10 = p9 + geom_line(aes(x = position, y = 0.5), color = "red") -p11 = p10 + theme(plot.title = element_text(hjust=0.5),legend.position="bottom") # +minus_maxlength_over6 = - max(contig_limits) / 50 +p10bis = p10 + geom_text(aes(x = minus_maxlength_over6, y = 0.07, label = t1, angle = 0)) # add indice to the graph +p11 = p10bis + theme(plot.title = element_text(hjust=0.5),legend.position="bottom") # -grid.arrange(cd, p11, nrow=2 - , widths = c(2, 2), heights = c(1.0, 3.0) -) -g <- arrangeGrob(cd3, p11) +g <- plot_grid(cd3, p11, align = "v", nrow = 2, rel_heights = c(1/6, 5/6)) -ggsave(outfile, device = "png", plot = g, width = 21, units="cm", height=30, dpi = 600) # save the graph -# 1 inch = 2.54cm +ggsave(outfile, device = "png", plot = g, width = 21, units="cm", height=29.7, dpi = 600) # save the graph # ~ end of script ~ From 21cddf777f637c333e2728301963a27ff5ceca89 Mon Sep 17 00:00:00 2001 From: "f.touzain" Date: Mon, 8 Jan 2024 17:08:41 +0100 Subject: [PATCH 16/42] add R packages dependencies --- setup.py | 3 ++- src/vvv2_display.yaml | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 94fb47f..cb41480 100644 --- a/setup.py +++ b/setup.py @@ -70,7 +70,8 @@ install_requires=[ "python>=3.9", "r-ggplot2==3.6.6", - "r-gridextra=2.3" + "r-gridextra==2.3", + "r-cowplot==1.1.1", "pysam==0.19.1", "numpy==1.23.1"], # Optional scripts=[ diff --git a/src/vvv2_display.yaml b/src/vvv2_display.yaml index b9d51e0..851959e 100644 --- a/src/vvv2_display.yaml +++ b/src/vvv2_display.yaml @@ -11,3 +11,4 @@ dependencies: - openssl - r-ggplot2=3.3.6 - r-gridextra + - r-cowplot From 9cd863715a63c96fd66afe37f40ad5d98e9542ea Mon Sep 17 00:00:00 2001 From: "f.touzain" Date: Mon, 8 Jan 2024 17:18:25 +0100 Subject: [PATCH 17/42] update versions --- src/vvv2_display.xml | 4 ++-- src/vvv2_display.yaml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/vvv2_display.xml b/src/vvv2_display.xml index 3368b72..3bd7f9c 100644 --- a/src/vvv2_display.xml +++ b/src/vvv2_display.xml @@ -1,10 +1,10 @@ - + - vvv2_display + vvv2_display Date: Mon, 8 Jan 2024 17:47:13 +0100 Subject: [PATCH 18/42] adjust text size relatively --- src/visualize_snp_v4.R | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/visualize_snp_v4.R b/src/visualize_snp_v4.R index 0ba06f5..a1d1984 100755 --- a/src/visualize_snp_v4.R +++ b/src/visualize_snp_v4.R @@ -187,16 +187,20 @@ p5 = p3bis + xlab("Base Position") # add x axe title p6 = p5 + ylab("Variant Frequency") # add axes and graph titles p6bis = p6 +guides(shape = guide_legend(order=1, direction="vertical", title="proteins (order: protein names)", nrow=5), color = guide_legend(order=2, direction="horizontal", title="genes (order: gene names)"),nrow=3) - -# p7 = p6 + theme(legend.position = "bottom") # modify the legend position -p8 = p6bis + ylim(-0.06,1.2) # modify the scale + +# for legend text size +# p7 = p6bis + theme(legend.position = "bottom") # modify the legend position +p7 = p6bis + theme(legend.text = element_text(size=rel(0.5)), legend.title=element_text(size=rel(0.8))) # modify the legend position + + +p8 = p7 + ylim(-0.06,1.2) # modify the scale p9 = p8 + geom_text(aes(x = position, y = variant_percent + 0.03, label = indice, angle = 0)) # add indice to the graph #p10 = p9 + geom_text(x = 0, y = threshold + 0.01, label = t) # add the threshold text #p11 = p10 + geom_line(aes(x = position, y = 0.5), color = "red") p10 = p9 + geom_line(aes(x = position, y = 0.5), color = "red") minus_maxlength_over6 = - max(contig_limits) / 50 p10bis = p10 + geom_text(aes(x = minus_maxlength_over6, y = 0.07, label = t1, angle = 0)) # add indice to the graph -p11 = p10bis + theme(plot.title = element_text(hjust=0.5),legend.position="bottom") # +p11 = p10bis + theme(plot.title = element_text(hjust=0.5),legend.position="bottom") g <- plot_grid(cd3, p11, align = "v", nrow = 2, rel_heights = c(1/6, 5/6)) From 0677297cdf64396fabcd8ca8a00fd540243ee232 Mon Sep 17 00:00:00 2001 From: "f.touzain" Date: Tue, 9 Jan 2024 22:04:27 +0100 Subject: [PATCH 19/42] add dependencies to r-ggh4x --- setup.py | 1 + src/vvv2_display.yaml | 1 + 2 files changed, 2 insertions(+) diff --git a/setup.py b/setup.py index cb41480..86cf694 100644 --- a/setup.py +++ b/setup.py @@ -72,6 +72,7 @@ "r-ggplot2==3.6.6", "r-gridextra==2.3", "r-cowplot==1.1.1", + "conda-forge::r-ggh4x", "pysam==0.19.1", "numpy==1.23.1"], # Optional scripts=[ diff --git a/src/vvv2_display.yaml b/src/vvv2_display.yaml index a145ef4..1500d2c 100644 --- a/src/vvv2_display.yaml +++ b/src/vvv2_display.yaml @@ -12,3 +12,4 @@ dependencies: - r-ggplot2=3.3.6 - r-gridextra=2.3 - r-cowplot=1.1.1 + - conda-forge::r-ggh4x From 6bc413bf755a9b7820b8cce77ce2bc3e4d806959 Mon Sep 17 00:00:00 2001 From: "f.touzain" Date: Tue, 9 Jan 2024 22:05:04 +0100 Subject: [PATCH 20/42] light cleaning of code, start to use ggh4x, to confirm on not --- src/visualize_snp_v4.R | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/src/visualize_snp_v4.R b/src/visualize_snp_v4.R index a1d1984..583054a 100755 --- a/src/visualize_snp_v4.R +++ b/src/visualize_snp_v4.R @@ -18,6 +18,7 @@ library(ggplot2) # import the ggplot2 library library(gridExtra) # multi graph on the same figure library(cowplot) +library(ggh4x) # to adjust legend width to the entire plot (and not go further hopefully) args <- commandArgs(TRUE) # all arguments are character types @@ -186,11 +187,25 @@ if(! is.null(contig_limits)){ # means more than 1 contig p5 = p3bis + xlab("Base Position") # add x axe title p6 = p5 + ylab("Variant Frequency") # add axes and graph titles -p6bis = p6 +guides(shape = guide_legend(order=1, direction="vertical", title="proteins (order: protein names)", nrow=5), color = guide_legend(order=2, direction="horizontal", title="genes (order: gene names)"),nrow=3) - -# for legend text size -# p7 = p6bis + theme(legend.position = "bottom") # modify the legend position -p7 = p6bis + theme(legend.text = element_text(size=rel(0.5)), legend.title=element_text(size=rel(0.8))) # modify the legend position +p6bis = p6 +guides( shape = guide_legend(order=1, + direction="vertical", + title="proteins (order: protein names)", + nrow=5 + ), + color = guide_legend(order=2, + direction="horizontal", + title="genes (order: gene names)"), + nrow=3 + ) + +# modify the legend text sizes end position +p7 = p6bis + theme( plot.title = element_text(hjust=0.5), + legend.position="bottom", + # legend.spacing.x=unit(1,"cm"), + legend.text = element_text(size=rel(0.5)), + legend.title=element_text(size=rel(0.8)), + + ) # modify the legend position p8 = p7 + ylim(-0.06,1.2) # modify the scale @@ -199,8 +214,7 @@ p9 = p8 + geom_text(aes(x = position, y = variant_percent + 0.03, label = indice #p11 = p10 + geom_line(aes(x = position, y = 0.5), color = "red") p10 = p9 + geom_line(aes(x = position, y = 0.5), color = "red") minus_maxlength_over6 = - max(contig_limits) / 50 -p10bis = p10 + geom_text(aes(x = minus_maxlength_over6, y = 0.07, label = t1, angle = 0)) # add indice to the graph -p11 = p10bis + theme(plot.title = element_text(hjust=0.5),legend.position="bottom") +p11 = p10 + geom_text(aes(x = minus_maxlength_over6, y = 0.07, label = t1, angle = 0)) # add indice to the graph g <- plot_grid(cd3, p11, align = "v", nrow = 2, rel_heights = c(1/6, 5/6)) From 37c8691589880b456e4db97ff984af3b510dd89e Mon Sep 17 00:00:00 2001 From: "f.touzain" Date: Wed, 10 Jan 2024 15:07:52 +0100 Subject: [PATCH 21/42] update R dependencies --- setup.py | 2 +- src/vvv2_display.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 86cf694..c1dd236 100644 --- a/setup.py +++ b/setup.py @@ -72,7 +72,7 @@ "r-ggplot2==3.6.6", "r-gridextra==2.3", "r-cowplot==1.1.1", - "conda-forge::r-ggh4x", + "conda-forge::r-stringr=1.5.1", "pysam==0.19.1", "numpy==1.23.1"], # Optional scripts=[ diff --git a/src/vvv2_display.yaml b/src/vvv2_display.yaml index 1500d2c..2cfd83f 100644 --- a/src/vvv2_display.yaml +++ b/src/vvv2_display.yaml @@ -12,4 +12,4 @@ dependencies: - r-ggplot2=3.3.6 - r-gridextra=2.3 - r-cowplot=1.1.1 - - conda-forge::r-ggh4x + - conda-forge::r-stringr=1.5.1 From 1eeee767fc80267c72e274ed6f76441d382e14c7 Mon Sep 17 00:00:00 2001 From: "f.touzain" Date: Wed, 10 Jan 2024 15:08:59 +0100 Subject: [PATCH 22/42] suppress similarity description in protein legends, better placement of legends --- src/visualize_snp_v4.R | 40 ++++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/src/visualize_snp_v4.R b/src/visualize_snp_v4.R index 583054a..bb4ebff 100755 --- a/src/visualize_snp_v4.R +++ b/src/visualize_snp_v4.R @@ -18,7 +18,8 @@ library(ggplot2) # import the ggplot2 library library(gridExtra) # multi graph on the same figure library(cowplot) -library(ggh4x) # to adjust legend width to the entire plot (and not go further hopefully) +library(stringr) +# library(ggh4x) # to adjust legend width to the entire plot (and not go further hopefully) args <- commandArgs(TRUE) # all arguments are character types @@ -39,6 +40,8 @@ coverage_depth <- try( read.table(args[5], h=F, sep = "\t") ) # read the datafra if(inherits(coverage_depth,"try-error")) coverage_depth <- NULL +width <- unit(21, "cm") + t1 = as.character(threshold) # define threshold as a character # t = paste("Nucleotide Variation - threshold = ", t1, sep = ' ') @@ -141,6 +144,25 @@ gene_id_labels = unique(density$gene_id) density$gene_id_num = paste( sprintf("%02d", match(density$gene_id, gene_id_labels) ), ":", density$gene_id) protein_id_labels = unique(density$protein_id) + +# truncate long legend +replacement <- function(x){ + replaced = str_replace_all(x, ":[[:alnum:] \\-]+,","") + replaced = str_replace_all(replaced, ":[[:alnum:] \\-]+:",":") + replaced = str_replace_all(replaced, ":[[:alnum:] \\-]+$","") + return( replaced ) +} + +# protein_id_labels = lapply(protein_id_labels, FUN = function(x) str_replace_all(x, ":[A-Za-z0-9 ]+,","")) +protein_id_labels = lapply(protein_id_labels, replacement) +print("protein_id_labels shortened:") +print(protein_id_labels) + +density$protein_id = lapply(density$protein_id, replacement) +print("protein_id shortened:") +print(density$protein_id) + + density$protein_id_num = paste( sprintf("%03d", match(density$protein_id, protein_id_labels) ), ":", density$protein_id) # protein_id_ordered = fct_reorder(protein_id_labels, density$protein_id_num) @@ -170,7 +192,7 @@ p3 = p2 + geom_line(aes(x=position, y=threshold)) # add the threshold line p3bis = p3 if(! is.null(contig_limits)){ # means more than 1 contig - for(i in contig_limits){ + for(i in contig_limits){ p3bis = p3bis + geom_vline(xintercept=i,linetype="dotted") # add the contig limits (vertical dotted line) } } @@ -187,13 +209,13 @@ if(! is.null(contig_limits)){ # means more than 1 contig p5 = p3bis + xlab("Base Position") # add x axe title p6 = p5 + ylab("Variant Frequency") # add axes and graph titles -p6bis = p6 +guides( shape = guide_legend(order=1, +p6bis = p6 +guides( shape = guide_legend(order=2, direction="vertical", title="proteins (order: protein names)", - nrow=5 + nrow=15 ), - color = guide_legend(order=2, - direction="horizontal", + color = guide_legend(order=1, + direction="vertical", title="genes (order: gene names)"), nrow=3 ) @@ -201,12 +223,10 @@ p6bis = p6 +guides( shape = guide_legend(order=1, # modify the legend text sizes end position p7 = p6bis + theme( plot.title = element_text(hjust=0.5), legend.position="bottom", - # legend.spacing.x=unit(1,"cm"), legend.text = element_text(size=rel(0.5)), legend.title=element_text(size=rel(0.8)), - ) # modify the legend position - + ) # modify the legend position p8 = p7 + ylim(-0.06,1.2) # modify the scale p9 = p8 + geom_text(aes(x = position, y = variant_percent + 0.03, label = indice, angle = 0)) # add indice to the graph @@ -218,5 +238,5 @@ p11 = p10 + geom_text(aes(x = minus_maxlength_over6, y = 0.07, label = t1, angle g <- plot_grid(cd3, p11, align = "v", nrow = 2, rel_heights = c(1/6, 5/6)) -ggsave(outfile, device = "png", plot = g, width = 21, units="cm", height=29.7, dpi = 600) # save the graph +ggsave(outfile, device = "png", plot = g, width = width, units="cm", height=29.7, dpi = 600) # save the graph # ~ end of script ~ From 5fa688b1b82f35b1fce5374b7482e9b957172bcd Mon Sep 17 00:00:00 2001 From: "f.touzain" Date: Wed, 10 Jan 2024 17:54:17 +0100 Subject: [PATCH 23/42] correct graph alignment, remove useless annot, increase slighly legend text size, set percentage for variants, better display treshold on y axis --- src/visualize_snp_v4.R | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/visualize_snp_v4.R b/src/visualize_snp_v4.R index bb4ebff..8c00113 100755 --- a/src/visualize_snp_v4.R +++ b/src/visualize_snp_v4.R @@ -1,6 +1,6 @@ #!/usr/bin/env Rscript # -# FT; last modification December 19th 2023 +# FT; last modification January 10th 2024 # AF; last modification February 16th 2019 # # Description: This script has been written in order to generate a graph @@ -147,9 +147,11 @@ protein_id_labels = unique(density$protein_id) # truncate long legend replacement <- function(x){ - replaced = str_replace_all(x, ":[[:alnum:] \\-]+,","") - replaced = str_replace_all(replaced, ":[[:alnum:] \\-]+:",":") - replaced = str_replace_all(replaced, ":[[:alnum:] \\-]+$","") + replaced = str_replace_all(x, ":[[:alnum:] \\-]+,",",") # replace :...., by , + replaced = str_replace_all(replaced, ":[[:alnum:] \\-]+:",":") # replace :....: by : + replaced = str_replace_all(replaced, ":[[:alnum:] \\-]+$","") # remove last :.... + replaced = str_replace_all(replaced, "\\[[[:alnum:] \\-]+\\]","") # remove [...] + replaced = str_replace_all(replaced, " (?:putative|growth) [[:alnum:] \\-]+","") # remove useless annotations return( replaced ) } @@ -212,7 +214,7 @@ p6 = p5 + ylab("Variant Frequency") # add axes and graph titles p6bis = p6 +guides( shape = guide_legend(order=2, direction="vertical", title="proteins (order: protein names)", - nrow=15 + nrow=16 ), color = guide_legend(order=1, direction="vertical", @@ -223,19 +225,21 @@ p6bis = p6 +guides( shape = guide_legend(order=2, # modify the legend text sizes end position p7 = p6bis + theme( plot.title = element_text(hjust=0.5), legend.position="bottom", - legend.text = element_text(size=rel(0.5)), + legend.text = element_text(size=rel(0.6)), legend.title=element_text(size=rel(0.8)), ) # modify the legend position -p8 = p7 + ylim(-0.06,1.2) # modify the scale -p9 = p8 + geom_text(aes(x = position, y = variant_percent + 0.03, label = indice, angle = 0)) # add indice to the graph +p9 = p7 + geom_text(aes(x = position, y = variant_percent + 0.03, label = indice, angle = 0)) # add indice to the graph #p10 = p9 + geom_text(x = 0, y = threshold + 0.01, label = t) # add the threshold text #p11 = p10 + geom_line(aes(x = position, y = 0.5), color = "red") p10 = p9 + geom_line(aes(x = position, y = 0.5), color = "red") minus_maxlength_over6 = - max(contig_limits) / 50 -p11 = p10 + geom_text(aes(x = minus_maxlength_over6, y = 0.07, label = t1, angle = 0)) # add indice to the graph +# give scale and breaks of the y axis +p11 = p10 + scale_y_continuous(limits=c(-0.06,1.1), labels = scales::percent, breaks=c(0.0, 0.07, 0.2, 0.4, 0.6, 0.8, 1.0)) + +# gives proportions for covdepth graph (cd3) and variant graph (p11) in the grid plot g <- plot_grid(cd3, p11, align = "v", nrow = 2, rel_heights = c(1/6, 5/6)) ggsave(outfile, device = "png", plot = g, width = width, units="cm", height=29.7, dpi = 600) # save the graph From f4d9827130c2b039830784d48b16dfffb5918c48 Mon Sep 17 00:00:00 2001 From: "f.touzain" Date: Wed, 10 Jan 2024 17:58:10 +0100 Subject: [PATCH 24/42] remove space after 'growth' when removing useless annotation --- src/visualize_snp_v4.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/visualize_snp_v4.R b/src/visualize_snp_v4.R index 8c00113..0cc8ed6 100755 --- a/src/visualize_snp_v4.R +++ b/src/visualize_snp_v4.R @@ -151,7 +151,7 @@ replacement <- function(x){ replaced = str_replace_all(replaced, ":[[:alnum:] \\-]+:",":") # replace :....: by : replaced = str_replace_all(replaced, ":[[:alnum:] \\-]+$","") # remove last :.... replaced = str_replace_all(replaced, "\\[[[:alnum:] \\-]+\\]","") # remove [...] - replaced = str_replace_all(replaced, " (?:putative|growth) [[:alnum:] \\-]+","") # remove useless annotations + replaced = str_replace_all(replaced, " (?:putative|growth)[[:alnum:] \\-]+","") # remove useless annotations return( replaced ) } From 39ef2df45a9f67c3ad2b78fa0e137fac9cf59fb7 Mon Sep 17 00:00:00 2001 From: "f.touzain" Date: Thu, 11 Jan 2024 14:49:46 +0100 Subject: [PATCH 25/42] changes 'non translated RNA' by 'untranslated RNA' --- src/convert_vcffile_to_readablefile2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/convert_vcffile_to_readablefile2.py b/src/convert_vcffile_to_readablefile2.py index 1ff8f2a..1539acd 100755 --- a/src/convert_vcffile_to_readablefile2.py +++ b/src/convert_vcffile_to_readablefile2.py @@ -96,7 +96,7 @@ def find_key_proteins(dico_json, genomepos, gene_res): """ This function retrieves the protein name and the start and end position of this protein. """ - protein = 'non translated RNA' + protein = 'untranslated RNA' base_inf_allproteins = '' base_sup_allproteins = '' # print("find_key_proteins call pos ("+str(genomepos)+", "+gene_res+")") @@ -112,7 +112,7 @@ def find_key_proteins(dico_json, genomepos, gene_res): protein = 'intergene' base_inf_allproteins = str(base_inf) base_sup_allproteins = str(base_sup) - elif protein == 'non translated RNA': + elif protein == 'untranslated RNA': protein = key base_inf_allproteins = str(base_inf) base_sup_allproteins = str(base_sup) From cb573930cc036d127ef38cd8c075c0243a1f247a Mon Sep 17 00:00:00 2001 From: "f.touzain" Date: Thu, 11 Jan 2024 14:50:24 +0100 Subject: [PATCH 26/42] minor correction in legend titles --- src/visualize_snp_v4.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/visualize_snp_v4.R b/src/visualize_snp_v4.R index 0cc8ed6..3959b4b 100755 --- a/src/visualize_snp_v4.R +++ b/src/visualize_snp_v4.R @@ -213,12 +213,12 @@ p6 = p5 + ylab("Variant Frequency") # add axes and graph titles p6bis = p6 +guides( shape = guide_legend(order=2, direction="vertical", - title="proteins (order: protein names)", + title="proteins (order: protein positions)", nrow=16 ), color = guide_legend(order=1, direction="vertical", - title="genes (order: gene names)"), + title="genes (order: gene positions)"), nrow=3 ) From 2ba73d447c7d6543df4836750dabf7423742e7cf Mon Sep 17 00:00:00 2001 From: "f.touzain" Date: Mon, 29 Jan 2024 16:51:37 +0100 Subject: [PATCH 27/42] correct argument usage depending on using covdepth or not. Adapt dsplay to using coveragedepth graph or not --- src/visualize_snp_v4.R | 55 +++++++++++++++++++++++++++--------------- 1 file changed, 36 insertions(+), 19 deletions(-) diff --git a/src/visualize_snp_v4.R b/src/visualize_snp_v4.R index 3959b4b..64d3a73 100755 --- a/src/visualize_snp_v4.R +++ b/src/visualize_snp_v4.R @@ -1,6 +1,6 @@ #!/usr/bin/env Rscript # -# FT; last modification January 10th 2024 +# FT; last modification January 29th 2024 # AF; last modification February 16th 2019 # # Description: This script has been written in order to generate a graph @@ -35,33 +35,44 @@ threshold = as.numeric(args[3]) # define threshold outfile = args[4] -# to prepare coverage depth graph above variant/annotation graph -coverage_depth <- try( read.table(args[5], h=F, sep = "\t") ) # read the dataframe -if(inherits(coverage_depth,"try-error")) - coverage_depth <- NULL +if( length(args) > 4 ) +{ + b_covdepth <- TRUE + # to prepare coverage depth graph above variant/annotation graph + coverage_depth <- try( read.table(args[5], h=F, sep = "\t") ) # read the dataframe + if(inherits(coverage_depth,"try-error")) + coverage_depth <- NULL +}else +{ + b_covdepth <- FALSE +} width <- unit(21, "cm") t1 = as.character(threshold) # define threshold as a character # t = paste("Nucleotide Variation - threshold = ", t1, sep = ' ') -if(! is.null(coverage_depth)){ # means covdepth provided, we prepare var for graph - maxi = max(coverage_depth$V2) - mini = min(coverage_depth$V2) - med = median(coverage_depth$V2) - m = paste("Coverage - median_coverage =", med, "[", mini, ":", maxi, "]", sep = " ") - options(repr.plot.width = 5, repr.plot.height =1) - cd = ggplot(coverage_depth, aes(x = V1, y = V2)) + geom_line(color = "black", linewidth = 0.5) - # + labs(list(title = m, x = "Base Position", y = "Number of Reads") - cd1 = cd + labs(title = m) # add graph title - cd2 = cd1 + xlab("") # add x axe title - cd3 = cd2 + ylab("Number of reads") # add axes and graph titles +if(b_covdepth){ + + if(! is.null(coverage_depth)){ # means covdepth provided, we prepare var for graph + maxi = max(coverage_depth$V2) + mini = min(coverage_depth$V2) + med = median(coverage_depth$V2) + m = paste("Coverage - median_coverage =", med, "[", mini, ":", maxi, "]", sep = " ") + options(repr.plot.width = 5, repr.plot.height =1) + cd = ggplot(coverage_depth, aes(x = V1, y = V2)) + geom_line(color = "black", linewidth = 0.5) + # + labs(list(title = m, x = "Base Position", y = "Number of Reads") + cd1 = cd + labs(title = m) # add graph title + cd2 = cd1 + xlab("") # add x axe title + cd3 = cd2 + ylab("Number of reads") # add axes and graph titles + } } options(repr.plot.width = 5, repr.plot.height =5) if(is.null(density)){ # means no snp found p = ggplot() # plot initialization -}else{ +}else +{ p = ggplot(density) # plot initialization } @@ -239,8 +250,14 @@ minus_maxlength_over6 = - max(contig_limits) / 50 # give scale and breaks of the y axis p11 = p10 + scale_y_continuous(limits=c(-0.06,1.1), labels = scales::percent, breaks=c(0.0, 0.07, 0.2, 0.4, 0.6, 0.8, 1.0)) -# gives proportions for covdepth graph (cd3) and variant graph (p11) in the grid plot -g <- plot_grid(cd3, p11, align = "v", nrow = 2, rel_heights = c(1/6, 5/6)) +if( b_covdepth ){ + # gives proportions for covdepth graph (cd3) and variant graph (p11) in the grid plot + g <- plot_grid(cd3, p11, align = "v", nrow = 2, rel_heights = c(1/6, 5/6)) +}else +{ + # gives proportions for covdepth graph (cd3) and variant graph (p11) in the grid plot + g <- plot(p11) +} ggsave(outfile, device = "png", plot = g, width = width, units="cm", height=29.7, dpi = 600) # save the graph # ~ end of script ~ From 56a8b06980c64f860c2d27438864e47d847ef0e1 Mon Sep 17 00:00:00 2001 From: "f.touzain" Date: Mon, 29 Jan 2024 16:52:48 +0100 Subject: [PATCH 28/42] inactivate option parsing if test activated, adapt script launch depending on using coervage depth info or not --- src/vvv2_display.py | 120 +++++++++++++++++++++++++++----------------- 1 file changed, 73 insertions(+), 47 deletions(-) diff --git a/src/vvv2_display.py b/src/vvv2_display.py index e2a6b5c..2ee6d53 100755 --- a/src/vvv2_display.py +++ b/src/vvv2_display.py @@ -21,7 +21,9 @@ # vvv2_display script: from vardict (variant calling) and vadr (annotator) results, # creates a picture of variants alongside the detected viral genome ### -import argparse, os, sys, warnings, re +import argparse, os, sys, warnings, re, subprocess +# hashlib: no, needs python 3.11 to have sha256 on files, trigger installation problème +# we use ssytem sha256sum function instead from os import path import subprocess @@ -37,7 +39,7 @@ def __main__(): b_test_convert_tbl2json = False # ok 2022 04 26 complete tc, b_test_correct_multicontig_vardict_vcf = False # ok 2022 04 29 partial tc b_test_convert_vcffile_to_readable = False # ok 2022 04 28 complete tc, - b_test_correct_covdepth_f = True # 2023 12 20 tc, + b_test_correct_covdepth_f = False # ok 2024 01 20 tc, b_test_visualize_snp_v4 = False # ok 2022 04 28 complete tc, b_test = False dir_path = os.path.dirname(os.path.abspath(__file__)) # dir of current script @@ -156,6 +158,7 @@ def __main__(): # get absolute path in case of files args = parser.parse_args() + print(prog_tag + " arguments obtained, checking... line "+str(frame.f_lineno)) # ------------------------------------------- # check arguments @@ -198,50 +201,67 @@ def __main__(): " arguments, exit line "+str(frame.f_lineno)) sys.exit(0) - # print('args:', args) - if args.pass_annot_f is not None: - pass_annot_f = os.path.abspath(args.pass_annot_f) - elif(not b_test): - sys.exit("[Error] You must provide pass_annot_f") - if args.fail_annot_f is not None: - fail_annot_f = os.path.abspath(args.fail_annot_f) - elif(not b_test): - sys.exit("[Error] You must provide fail_annot_f") - if args.seq_stat_f is not None: - seq_stat_f = os.path.abspath(args.seq_stat_f) - elif(not b_test): - sys.exit("[Error] You must provide seq_stat_f") - if args.vardict_vcf_f is not None: - vardict_vcf_f = os.path.abspath(args.vardict_vcf_f) - elif(not b_test): - sys.exit("[Error] You must provide vcf_f") - if args.png_var_f is not None: - png_var_f = os.path.abspath(args.png_var_f) - elif(not b_test): - sys.exit("[Error] You must provide png_var_f name for output") - if args.snp_loc_f is not None: - snp_loc_f = os.path.abspath(args.snp_loc_f) - if args.snp_loc_summary_f is not None: - snp_loc_summary_f = os.path.abspath(args.snp_loc_summary_f) - if args.cov_depth_f is not None: - cov_depth_f = os.path.abspath(args.cov_depth_f) - b_cov_depth_display = True - # ---------------------------------------------------------------- - # optional arguments only for Galaxy compatibility - if args.json_annot_f is not None: - json_annot_f = os.path.abspath(args.json_annot_f) - if args.bed_vardict_annot_f is not None: - bed_vardict_annot_f = os.path.abspath(args.bed_vardict_annot_f) - if args.correct_vcf_f is not None: - correct_vcf_f = os.path.abspath(args.correct_vcf_f) - if args.contig_limits_f is not None: - contig_limits_f = os.path.abspath(args.contig_limits_f) - if args.cov_depth_corr_f is not None: - cov_depth_corr_f = os.path.abspath(args.cov_depth_corr_f) - # ---------------------------------------------------------------- - - if args.b_verbose is not None: - b_verbose = args.b_verbose + if not b_test_vvv2_display: + # print('args:', args) + if args.pass_annot_f is not None: + pass_annot_f = os.path.abspath(args.pass_annot_f) + elif(not b_test): + sys.exit("[Error] You must provide pass_annot_f") + if args.fail_annot_f is not None: + fail_annot_f = os.path.abspath(args.fail_annot_f) + elif(not b_test): + sys.exit("[Error] You must provide fail_annot_f") + if args.seq_stat_f is not None: + seq_stat_f = os.path.abspath(args.seq_stat_f) + elif(not b_test): + sys.exit("[Error] You must provide seq_stat_f") + if args.vardict_vcf_f is not None: + vardict_vcf_f = os.path.abspath(args.vardict_vcf_f) + elif(not b_test): + sys.exit("[Error] You must provide vcf_f") + if args.png_var_f is not None: + png_var_f = os.path.abspath(args.png_var_f) + elif(not b_test): + sys.exit("[Error] You must provide png_var_f name for output") + if args.snp_loc_f is not None: + snp_loc_f = os.path.abspath(args.snp_loc_f) + if args.snp_loc_summary_f is not None: + snp_loc_summary_f = os.path.abspath(args.snp_loc_summary_f) + + if( (args.cov_depth_f is not None) and (os.path.isfile(args.cov_depth_f)) ): + cov_depth_f = os.path.abspath(args.cov_depth_f) + b_cov_depth_display = True + else: + b_cov_depth_display = False + # ---------------------------------------------------------------- + # optional arguments only for Galaxy compatibility + if args.json_annot_f is not None: + json_annot_f = os.path.abspath(args.json_annot_f) + if args.bed_vardict_annot_f is not None: + bed_vardict_annot_f = os.path.abspath(args.bed_vardict_annot_f) + if args.correct_vcf_f is not None: + correct_vcf_f = os.path.abspath(args.correct_vcf_f) + + + if args.contig_limits_f is None: + # if name of contig_limits file is not provided by user, deduce a name file with a part + # deduced from pass_annot_f using sha256 checksum + # to avoid bad interferences of serveral vvv2_display runs results + + cmd = "/usr/bin/sha256sum "+pass_annot_f + # print(prog_tag + " cmd:"+cmd) + sha256_f = subprocess.getoutput(cmd).split()[0] + contig_limits_f = str(sha256_f) + '_contig_limits.txt' + # print(prog_tag + " contig_limits file name created:"+ contig_limits_f) + else: + contig_limits_f = os.path.abspath(args.contig_limits_f) + if args.cov_depth_corr_f is not None: + cov_depth_corr_f = os.path.abspath(args.cov_depth_corr_f) + # ---------------------------------------------------------------- + + if args.b_verbose is not None: + b_verbose = args.b_verbose + print(prog_tag + " arguments checked... line "+str(frame.f_lineno)) # ------------------------------------------------------------------ @@ -373,7 +393,7 @@ def __main__(): correct_vcf_f = vardict_vcf_f # correct_vcf_f = correct_vcf_f.replace('vardict.vcf', '_correct.vcf') correct_vcf_f = re.sub('\.[^\.]+$', '_correct.vcf', correct_vcf_f) - + p_script = PYTHON_SCRIPTS + "correct_multicontig_vardict_vcf.py" print("p_script:" + p_script) cmd = ' '.join([p_script, @@ -391,6 +411,7 @@ def __main__(): print(prog_tag + " [test_correct_multicontig_vardict_vcf] END") sys.exit() else: + print(prog_tag + " cmd:" + cmd) os.system(cmd) # ------------------------------------------------------------------ @@ -444,6 +465,7 @@ def __main__(): # creates corrected cov depth file if b_cov_depth_display: + print("b_cov_depth_display:"+str(b_cov_depth_display)+ ", line " + str(frame.f_lineno)) if b_test_correct_covdepth_f and (not b_test_vvv2_display): cov_depth_f = test_dir + "/res_vvv2_covdepth.txt" cov_depth_corr_f = test_dir + "/res_vvv2_covdepth_corrected.txt" @@ -506,6 +528,10 @@ def __main__(): os.system(cmd) # ------------------------------------------------------------------ print(png_var_f + " file created") + + # remove useless file + if os.path.isfile(contig_limits_f): + os.unlink(contig_limits_f) ##### MAIN END if __name__=="__main__":__main__() From b9d9bc713cd5b1cd4a2b096f50ff51b07ba107f9 Mon Sep 17 00:00:00 2001 From: "f.touzain" Date: Mon, 29 Jan 2024 17:40:48 +0100 Subject: [PATCH 29/42] change note for homopolymers by mentioning now Iont-Torrent AND Nanopore, not only Ion-Torrent --- src/convert_vcffile_to_readablefile2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/convert_vcffile_to_readablefile2.py b/src/convert_vcffile_to_readablefile2.py index 1539acd..de77118 100755 --- a/src/convert_vcffile_to_readablefile2.py +++ b/src/convert_vcffile_to_readablefile2.py @@ -318,7 +318,7 @@ def is_homopolymer(lseq, rseq, ref, alt): filout.write(line) filout.write(""" *NB: an homopolymer region is set to 'yes' if there is a succession of at least 3 identical nucleotides. - it looks like a restrictive measure, but Ion Torrent sequencing is very bad on such region, so make sure you verify these variants.""") + it looks like a restrictive measure, but Ion Torrent and Nanopore sequencing are very bad on such region, so make sure you verify these variants.""") print(prog_tag + ' '+ args.out +" file created") print(prog_tag + ' '+ args.outs +" file created") ## ~ end of script ~ ## From eb1d1116ae3b641911cbe978cb0c69c85bea9845 Mon Sep 17 00:00:00 2001 From: "f.touzain" Date: Fri, 9 Feb 2024 11:06:10 +0100 Subject: [PATCH 30/42] correct bug when providing snp_loc_summary_f but not snp_loc_f --- src/vvv2_display.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/vvv2_display.py b/src/vvv2_display.py index 2ee6d53..d22c09d 100755 --- a/src/vvv2_display.py +++ b/src/vvv2_display.py @@ -433,9 +433,9 @@ def __main__(): if(snp_loc_f == ''): snp_loc_f = vardict_vcf_f - snp_loc_summary_f = vardict_vcf_f - # snp_loc_f = snp_loc_f.replace('.vcf', '_snp.txt') snp_loc_f = re.sub('\.[^\.]+$', '_snp.txt', snp_loc_f) + if(snp_loc_summary_f == ''): + snp_loc_summary_f = vardict_vcf_f snp_loc_summary_f = re.sub('\.[^\.]+$', '_snp_summary.txt', snp_loc_summary_f) # vcf file from vardict From 382d469850e023c41e5fd34c29fb37fec2ca4264 Mon Sep 17 00:00:00 2001 From: "f.touzain" Date: Tue, 27 Feb 2024 22:19:56 +0100 Subject: [PATCH 31/42] dvpmt to display genes as boxes --- src/visualize_snp_v4.R | 112 ++++++++++++++++++++++++++++++----------- src/vvv2_display.py | 17 +++++-- src/vvv2_display.yaml | 2 + 3 files changed, 98 insertions(+), 33 deletions(-) diff --git a/src/visualize_snp_v4.R b/src/visualize_snp_v4.R index 64d3a73..0cb71cf 100755 --- a/src/visualize_snp_v4.R +++ b/src/visualize_snp_v4.R @@ -13,13 +13,12 @@ ################################################################################ # ~ start of script ~ -# install.packages("viridis") -# library("viridis") library(ggplot2) # import the ggplot2 library library(gridExtra) # multi graph on the same figure library(cowplot) library(stringr) -# library(ggh4x) # to adjust legend width to the entire plot (and not go further hopefully) +library(jsonlite) # to read json +b_verbose <- FALSE args <- commandArgs(TRUE) # all arguments are character types @@ -33,13 +32,18 @@ if(inherits(contig_limits,"try-error")) threshold = as.numeric(args[3]) # define threshold -outfile = args[4] +# added 2024 02 27 +json_genes = read_json(args[4]) # json file with genes limits in "genes"->"name" [start,end] +#str(json_genes$genes) -if( length(args) > 4 ) + +outfile = args[5] + +if( length(args) > 5 ) { b_covdepth <- TRUE # to prepare coverage depth graph above variant/annotation graph - coverage_depth <- try( read.table(args[5], h=F, sep = "\t") ) # read the dataframe + coverage_depth <- try( read.table(args[6], h=F, sep = "\t") ) # read the dataframe if(inherits(coverage_depth,"try-error")) coverage_depth <- NULL }else @@ -86,21 +90,26 @@ genecols=c('#99CC00','#CC9900','#FFCC33','#FF9900','#FF6600','#FF3300','#CC3300' # check color redundancy genecols_unique = sort(unique(genecols)) -print("max_col_number_unique:") -print(length(genecols_unique)) + +if( b_verbose ){ + print("max_col_number_unique:") + print(length(genecols_unique)) +} if( length(genecols) != length(genecols_unique) ){ genecols = sort(genecols) for( icol in 1:length(genecols_unique) ){ - print("genecols_unique de ") - print(icol) - print(":") - print(genecols_unique[icol]) - if( genecols_unique[icol] != genecols[icol] ){ - redund_vals = genecols[icol] - print("Color found several times:") - print(redund_vals) - stop() + if( b_verbose ){ + print("genecols_unique de ") + print(icol) + print(":") + print(genecols_unique[icol]) + } + if( genecols_unique[icol] != genecols[icol] ){ + redund_vals = genecols[icol] + print("Color found several times:") + print(redund_vals) + stop() } } print("first values identical, added values in genecols:") @@ -166,15 +175,18 @@ replacement <- function(x){ return( replaced ) } + # protein_id_labels = lapply(protein_id_labels, FUN = function(x) str_replace_all(x, ":[A-Za-z0-9 ]+,","")) protein_id_labels = lapply(protein_id_labels, replacement) -print("protein_id_labels shortened:") -print(protein_id_labels) - +if( b_verbose ){ + print("protein_id_labels shortened:") + print(protein_id_labels) +} density$protein_id = lapply(density$protein_id, replacement) -print("protein_id shortened:") -print(density$protein_id) - +if( b_verbose ){ + print("protein_id shortened:") + print(density$protein_id) +} density$protein_id_num = paste( sprintf("%03d", match(density$protein_id, protein_id_labels) ), ":", density$protein_id) @@ -187,16 +199,60 @@ density$protein_id_num = paste( sprintf("%03d", match(density$protein_id, prote # print("gene_id_num:") # print(density$gene_id_num) -print("gene_id_labels:") -print(gene_id_labels) +if( b_verbose ){ + print("gene_id_labels:") + print(gene_id_labels) +} + +# add the consensus points, and remove the legend "linewidth" for proteins +p1 = p + geom_point(aes(x = position, y = -0.05, colour = density$gene_id_num, shape = density$protein_id_num), size = 1, show.legend = F, shape = 15) +# add consensus gene boxes +p1bis = p1 + +gnames = names(json_genes$genes) +y1 = -0.04 +y2 = -0.02 + +nrange = c(1:length(names)) +df <- data.frame( + xmin = lapply(nrange, FUN=function(x) json_genes$genes[[x]][[1]] ), # gene_start + xmax = lapply(nrange, FUN=function(x) json_genes$genes[[x]][[2]] ), # gene_end + ymin = lapply(nrange, FUN=function(x) -0.02 * (x %% 2) ), # lower box line + ymax = lapply(nrange, FUN=function(x) -0.02 * (1 + (x %% 2)) ), # upper box line + color = rep("black", length(names)), + fill = rep("white", length(names))) + +# p1bis = p1 + geom_rect(data=df, +# mapping = aes(xmin = df$xmin, xmax = df$xmax, +# ymin = df$ymin, ymax = df$ymax, color=df$color, fill = df$fill)) + +# ggplot(df) + + # sapply(df, FUN=function(xmin, xmax, ymin, ymax, color, fill) geom_rect(X, aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax, fill = fill))) + + +# for( i in 1:length(json_genes$genes) ){ + +# # get the names of the lists in genes, here genes names +# gname = gnames[i] +# gstart = json_genes$genes[[i]][[1]] +# gend = json_genes$genes[[i]][[2]] +# if((i %% 2) == 0){ +# y1 = -0.04 +# y2 = -0.02 +# }else{ +# y1 = -0.02 +# y2 = 0.0 +# } +# print(c('gene no:', i, gname, gstart, gend, y1, y2)) +# p1bis = p1bis + geom_rect(aes(xmin = gstart, xmax = gend, ymin = y1, ymax = y2), color = "black", fill = "white", show.legend = F) +# } -p1 = p + geom_point(aes(x = position, y = -0.05, colour = density$gene_id_num, shape = density$protein_id_num), size = 1, show.legend = F, shape = 15) # add the consensus points, and remove the legend "linewidth" for proteins # needed to have more than 6 symbols for gene_id legend -p1bis = p1 + scale_shape_manual(values=c(1:25,1:25)) +p1ter = p1bis + scale_shape_manual(values=c(1:25,1:25)) # use color more easily distinguishable -p1ter = p1bis + scale_color_manual(values=genecols[1:length(gene_id_labels)]) +p1quad = p1ter + scale_color_manual(values=genecols[1:length(gene_id_labels)]) p2 = p1ter + geom_point(aes(x = position, y = as.numeric(variant_percent), color = density$gene_id_num, shape = density$protein_id_num)) # add the variant points # p2 = p1ter + geom_point(aes(x = position, y = as.numeric(variant_percent), color = density$gene_id_num, shape = density$gene_id)) # add the variant points diff --git a/src/vvv2_display.py b/src/vvv2_display.py index d22c09d..93f1bda 100755 --- a/src/vvv2_display.py +++ b/src/vvv2_display.py @@ -43,6 +43,8 @@ def __main__(): b_test_visualize_snp_v4 = False # ok 2022 04 28 complete tc, b_test = False dir_path = os.path.dirname(os.path.abspath(__file__)) # dir of current script + + b_verbose = True # allow to run tests from everywhere prog_tag = '[' + os.path.basename(__file__) + ']' @@ -493,10 +495,11 @@ def __main__(): # snp_loc_summary_f = test_dir + "/res2_snp_summary.txt" # png_var_f = test_dir + "/res2_snp.png" # CONTIGS - snp_loc_f = test_dir + "/res_snp.txt" - snp_loc_summary_f = test_dir + "/res_snp_summary.txt" - png_var_f = test_dir + "/res_snp.png" - contig_limits_f = test_dir + "/contig_limits.txt" + snp_loc_f = test_dir + "/res_snp.txt" + snp_loc_summary_f = test_dir + "/res_snp_summary.txt" + json_annot_f = test_dir + "/res_vadr.json" + png_var_f = test_dir + "/res_snp.png" + contig_limits_f = test_dir + "/contig_limits.txt" if(png_var_f == ''): png_var_f = snp_loc_f @@ -510,13 +513,17 @@ def __main__(): snp_loc_f, contig_limits_f, threshold, + json_annot_f, png_var_f]) # parameter to allow coverage depth display above variants/annotations if b_cov_depth_display: cmd = cmd + " " + cov_depth_corr_f + " " - cmd = cmd + " ".join([" < ", r_script, " > /dev/null"]) + if b_verbose: + cmd = cmd + " ".join([" < ", r_script, " > /dev/null"]) + else: + cmd = cmd + " ".join([" < ", r_script]) print(prog_tag + " cmd:" + cmd) if b_test_visualize_snp_v4: diff --git a/src/vvv2_display.yaml b/src/vvv2_display.yaml index 2cfd83f..167a05a 100644 --- a/src/vvv2_display.yaml +++ b/src/vvv2_display.yaml @@ -13,3 +13,5 @@ dependencies: - r-gridextra=2.3 - r-cowplot=1.1.1 - conda-forge::r-stringr=1.5.1 + - conda-forge::r-jsonlite=1.8.8 + From f206f9ecf16c78e699d21cf4b8a5b39e7cf17b06 Mon Sep 17 00:00:00 2001 From: "f.touzain" Date: Thu, 29 Feb 2024 14:37:00 +0100 Subject: [PATCH 32/42] add gene boxes bellow abscissia, add gene names in box when box space is long enough --- src/visualize_snp_v4.R | 116 +++++++++++++++++++++++++++-------------- 1 file changed, 76 insertions(+), 40 deletions(-) diff --git a/src/visualize_snp_v4.R b/src/visualize_snp_v4.R index 0cb71cf..5f62d06 100755 --- a/src/visualize_snp_v4.R +++ b/src/visualize_snp_v4.R @@ -1,6 +1,6 @@ #!/usr/bin/env Rscript # -# FT; last modification January 29th 2024 +# FT; last modification February 29th 2024 # AF; last modification February 16th 2019 # # Description: This script has been written in order to generate a graph @@ -205,48 +205,84 @@ if( b_verbose ){ } # add the consensus points, and remove the legend "linewidth" for proteins -p1 = p + geom_point(aes(x = position, y = -0.05, colour = density$gene_id_num, shape = density$protein_id_num), size = 1, show.legend = F, shape = 15) +p1 = p + geom_point(aes(x = position, y = -0.1, colour = density$gene_id_num, shape = density$protein_id_num), size = 1, show.legend = F, shape = 15) # add consensus gene boxes p1bis = p1 gnames = names(json_genes$genes) -y1 = -0.04 -y2 = -0.02 - -nrange = c(1:length(names)) -df <- data.frame( - xmin = lapply(nrange, FUN=function(x) json_genes$genes[[x]][[1]] ), # gene_start - xmax = lapply(nrange, FUN=function(x) json_genes$genes[[x]][[2]] ), # gene_end - ymin = lapply(nrange, FUN=function(x) -0.02 * (x %% 2) ), # lower box line - ymax = lapply(nrange, FUN=function(x) -0.02 * (1 + (x %% 2)) ), # upper box line - color = rep("black", length(names)), - fill = rep("white", length(names))) - -# p1bis = p1 + geom_rect(data=df, -# mapping = aes(xmin = df$xmin, xmax = df$xmax, -# ymin = df$ymin, ymax = df$ymax, color=df$color, fill = df$fill)) - -# ggplot(df) + - # sapply(df, FUN=function(xmin, xmax, ymin, ymax, color, fill) geom_rect(X, aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax, fill = fill))) - - -# for( i in 1:length(json_genes$genes) ){ - -# # get the names of the lists in genes, here genes names -# gname = gnames[i] -# gstart = json_genes$genes[[i]][[1]] -# gend = json_genes$genes[[i]][[2]] -# if((i %% 2) == 0){ -# y1 = -0.04 -# y2 = -0.02 -# }else{ -# y1 = -0.02 -# y2 = 0.0 -# } -# print(c('gene no:', i, gname, gstart, gend, y1, y2)) -# p1bis = p1bis + geom_rect(aes(xmin = gstart, xmax = gend, ymin = y1, ymax = y2), color = "black", fill = "white", show.legend = F) -# } - +ybase = -0.04 +yshift = -0.00 +xshift = -0.03 +# minimal length of gene to consider to display name in gene boxes in the graph (bellow the absissia) +min_glength = 600 + +nrange = 1:length(gnames) + +xmi=lapply(nrange, FUN=function(x){ json_genes$genes[[x]][[1]] } ) +xmi=as.numeric(unlist(xmi)) + +xma=lapply(nrange, FUN=function(x){ json_genes$genes[[x]][[2]] } ) +xma=as.numeric(unlist(xma)) + +# average x to write text into +xmm=lapply(nrange, FUN=function(x){ (xma[x] + xmi[x]) / 2 + xshift } ) +xmm=as.numeric(unlist(xmm)) + +ymi=lapply(nrange, FUN=function(x){ ybase * (x %% 2) + yshift } ) +ymi=as.numeric(unlist(ymi)) # c(-0.04, -0.04,-0.04, -0.04,-0.04, -0.04,-0.04, -0.04,-0.04, -0.04,-0.04, -0.04,-0.04) + +yma=lapply(nrange, FUN=function(x){ ybase * (1 + (x %% 2)) + yshift } ) +yma=as.numeric(unlist(yma)) # c(-0.02, -0.02,-0.02, -0.02,-0.02, -0.02,-0.02, -0.02,-0.02, -0.02,-0.02, -0.02,-0.02) + +# average y to write text into +ymm=lapply(nrange, FUN=function(x){ (yma[x] + ymi[x]) / 2 + yshift } ) +ymm=as.numeric(unlist(ymm)) + +gnames_label=lapply(nrange, FUN=function(x){ + glabel = "" + if( (xma[x] - xmi[x]) > min_glength){ + glabel=gnames[x] + } + glabel + } + ) +# colrect=rep("black", length(gnames)) +# filrect=rep("white", length(gnames)) + +df=data.frame(xmi,xma,ymi,yma) # ,colrect,filrect) + +print(gnames) +print(gnames_label) +print(json_genes$genes[[1]][[1]]) +class(json_genes$genes[[1]][[1]]) +print(nrange) +print(xmi) +print(xma) +print(xmm) +print(ymi) +print(yma) +print(ymm) +# print(colrect) +# print(filrect) + +class(xmi) +class(ymi) +# class(colrect) + +p1bis = p1 + geom_rect(data=df, mapping=aes(xmin=xmi, + xmax=xma, + ymin=ymi, + ymax=yma), + # color=colrect, + # fill=filrect), + color="black", + fill="white", + show.legend = FALSE, + inherit.aes = FALSE) + geom_text( + data = df, + aes(x = xmm, y = ymm, label = gnames_label), + size = 3, check_overlap = TRUE # vjust = 0, hjust = 0, + ) # needed to have more than 6 symbols for gene_id legend p1ter = p1bis + scale_shape_manual(values=c(1:25,1:25)) @@ -304,7 +340,7 @@ p10 = p9 + geom_line(aes(x = position, y = 0.5), color = "red") minus_maxlength_over6 = - max(contig_limits) / 50 # give scale and breaks of the y axis -p11 = p10 + scale_y_continuous(limits=c(-0.06,1.1), labels = scales::percent, breaks=c(0.0, 0.07, 0.2, 0.4, 0.6, 0.8, 1.0)) +p11 = p10 + scale_y_continuous(limits=c(-0.1,1.1), labels = scales::percent, breaks=c(0.0, 0.07, 0.2, 0.4, 0.6, 0.8, 1.0)) if( b_covdepth ){ # gives proportions for covdepth graph (cd3) and variant graph (p11) in the grid plot From 28267ae8aa5f2ad7a4128260a31dc5737a700374 Mon Sep 17 00:00:00 2001 From: "f.touzain" Date: Tue, 12 Mar 2024 08:56:46 +0100 Subject: [PATCH 33/42] add option to provide cov depth file for using cov depth display at the top of the graphical output --- src/vvv2_display.xml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/vvv2_display.xml b/src/vvv2_display.xml index 3bd7f9c..59f3e1f 100644 --- a/src/vvv2_display.xml +++ b/src/vvv2_display.xml @@ -1,19 +1,16 @@ - - - vvv2_display + + + vvv2_display + + @@ -76,6 +74,8 @@ optional arguments: in: vcf variant file provided by vardict -r FILE, --png_var_f FILE out: png file with variant proportions and annotations + -o FILE, --cov_depth_f FILE + [optional] in: text file of coverage depths (given by samtools depth) ]]> From ec70fe5568b1a7cb62a57f36c696dd4fd42bb1c9 Mon Sep 17 00:00:00 2001 From: "f.touzain" Date: Tue, 12 Mar 2024 08:57:30 +0100 Subject: [PATCH 34/42] update version to 0.2.0, add dependency for json parsing in R --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index c1dd236..083c7c6 100644 --- a/setup.py +++ b/setup.py @@ -26,7 +26,7 @@ setuptools.setup( name="vvv2_display", # Required - version="0.1.11", # Required + version="0.2.0", # Required description="Viral Variant Visualizer 2 display", # Optional long_description=long_description, long_description_content_type="text/markdown", # Optional (see note above) @@ -73,6 +73,7 @@ "r-gridextra==2.3", "r-cowplot==1.1.1", "conda-forge::r-stringr=1.5.1", + "conda-forge::r-jsonlite=1.8.8", "pysam==0.19.1", "numpy==1.23.1"], # Optional scripts=[ From 0ee1bbe6ef674fd79aef021251091a97c155156f Mon Sep 17 00:00:00 2001 From: "f.touzain" Date: Wed, 27 Mar 2024 12:05:04 +0100 Subject: [PATCH 35/42] correct bug missing variants in summary.txt: simplify regexp for comment (avoid to fill all possible characters in descriptions) --- src/convert_vcffile_to_readablefile2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/convert_vcffile_to_readablefile2.py b/src/convert_vcffile_to_readablefile2.py index de77118..12b5c71 100755 --- a/src/convert_vcffile_to_readablefile2.py +++ b/src/convert_vcffile_to_readablefile2.py @@ -284,7 +284,7 @@ def is_homopolymer(lseq, rseq, ref, alt): A = 0 # this flag is used in the write_line function in order to add indices to line where variants are upper than threshold with open(args.out, "w") as filout: summary_list = [] - regex = '([0-9]+)\t1\t([A-Z\>\<]+)\t([A-Z\>\<]+)\t([0-9\.]+)\t[A-Z\>\<]+\t[A-Z\>\<]+\t([A-Z0-9a-z\-_\, /]+)\t([A-Z0-9a-z\-_\, /\]\[]*)\t1\t([0-9]+)\t([A-Z]+\t[A-Z]+\t(no|yes))\n' + regex = '([0-9]+)\t1\t([A-Z\>\<]+)\t([A-Z\>\<]+)\t([0-9\.]+)\t[A-Z\>\<]+\t[A-Z\>\<]+\t([A-Z0-9a-z\-_\, /]+)\t([^\t]*)\t1\t([0-9]+)\t([A-Z]+\t[A-Z]+\t(no|yes))\n' REGEX = re.compile(regex) filout.write("position\tSNP\tref\talt\tvariant_percent\tadd_ref\tadd_alt\tgene_id\tprotein_id\tsize_point\tindice\tlseq\trseq\tisHomo\n") a = 0 # flag to find the first genomic region after initialization of i From 0be1619c44e7cdd48a1a15fe8863a9c6428b1b71 Mon Sep 17 00:00:00 2001 From: "f.touzain" Date: Wed, 27 Mar 2024 13:54:43 +0100 Subject: [PATCH 36/42] verbose mode set to false, update test dates info --- src/vvv2_display.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/vvv2_display.py b/src/vvv2_display.py index 93f1bda..8b8016d 100755 --- a/src/vvv2_display.py +++ b/src/vvv2_display.py @@ -38,13 +38,13 @@ def __main__(): b_test_vvv2_display = False # ok 2022 05 05 complet, partial tc b_test_convert_tbl2json = False # ok 2022 04 26 complete tc, b_test_correct_multicontig_vardict_vcf = False # ok 2022 04 29 partial tc - b_test_convert_vcffile_to_readable = False # ok 2022 04 28 complete tc, + b_test_convert_vcffile_to_readable = False # ok 2024 03 27 complete tc, b_test_correct_covdepth_f = False # ok 2024 01 20 tc, - b_test_visualize_snp_v4 = False # ok 2022 04 28 complete tc, + b_test_visualize_snp_v4 = False # ok 2024 03 27 complete tc, b_test = False dir_path = os.path.dirname(os.path.abspath(__file__)) # dir of current script - b_verbose = True + b_verbose = False # allow to run tests from everywhere prog_tag = '[' + os.path.basename(__file__) + ']' From 6c16da833f99961b160e737842bf98e85635cfa8 Mon Sep 17 00:00:00 2001 From: "f.touzain" Date: Wed, 27 Mar 2024 14:15:24 +0100 Subject: [PATCH 37/42] update modif log and date --- src/convert_vcffile_to_readablefile2.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/convert_vcffile_to_readablefile2.py b/src/convert_vcffile_to_readablefile2.py index 12b5c71..a53a0cc 100755 --- a/src/convert_vcffile_to_readablefile2.py +++ b/src/convert_vcffile_to_readablefile2.py @@ -16,6 +16,7 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . # +# FT - correct regexp part for description parsing: simplified, now do not miss variant: March 27th 2024 # FT - correct gene identif/name (position provided instead sometimes): June 15th 2023 # FT - last modification: September 19th 2022 to replace pyvcf by pysam # AF - last modification: August 21st 2018 From 102a4c54ac225f835d07bd946c5e70b08b5b6f5a Mon Sep 17 00:00:00 2001 From: "f.touzain" Date: Wed, 27 Mar 2024 14:16:12 +0100 Subject: [PATCH 38/42] update dependencies according to version observed in current env --- setup.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/setup.py b/setup.py index 083c7c6..7d536f5 100644 --- a/setup.py +++ b/setup.py @@ -69,13 +69,13 @@ include_package_data=True, install_requires=[ "python>=3.9", - "r-ggplot2==3.6.6", - "r-gridextra==2.3", - "r-cowplot==1.1.1", - "conda-forge::r-stringr=1.5.1", - "conda-forge::r-jsonlite=1.8.8", + "r-ggplot2>=3.4.4", + "r-gridextra>=2.3", + "r-cowplot>=1.1.1", + "conda-forge::r-stringr>=1.5.1", + "conda-forge::r-jsonlite>=1.8.8", "pysam==0.19.1", - "numpy==1.23.1"], # Optional + "numpy>=1.23.1"], # Optional scripts=[ "src/vvv2_display.py", "src/convert_tbl2json.py", From cc7bde58af0a9c35980ecb69b2b12485b96f9307 Mon Sep 17 00:00:00 2001 From: "f.touzain" Date: Wed, 27 Mar 2024 14:22:23 +0100 Subject: [PATCH 39/42] record modifications for version 0.2.0 --- CHANGELOG | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index c5d5840..1609ca0 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,12 +1,18 @@ +0.2.0: + - update dependencies versions + - correct bug in PYTHON_SCRIPTS/convert_vcffile_to_readablefile2.py that missed some variants in text summary file + - add boxes for genes under graph (to fit traditional display of viral genomes) + - add [optional] coverage depth display in final png (if no cov depth provided, part disabled) aligned of variant graph + for positions 0.1.11: - - display prot instead of genes and genes instead of prot - - add num in front of genes to ensure to keep apparition order - - replace fstrings in vvv2_display.py to allow better compatibility - - add src/correct_covdepth_f.py script to convert cov depth files of samtools depth so as to have cumulative positions for display - - prepare vvv2_display to coverage depth addition in input - - add new script to setup.py file and change version to 0.1.11 + - display prot instead of genes and genes instead of prot + - add num in front of genes to ensure to keep apparition order + - replace fstrings in vvv2_display.py to allow better compatibility + - add src/correct_covdepth_f.py script to convert cov depth files of samtools depth so as to have cumulative positions for display + - prepare vvv2_display to coverage depth addition in input + - add new script to setup.py file and change version to 0.1.11 0.1.10: - - now horizontal line colors are for proteins, dot shape is for genes. Added vertical dotted lines to show contig limits + - now horizontal line colors are for proteins, dot shape is for genes. Added vertical dotted lines to show contig limits - correction in vadr parsing using different virus models - tested on calicivirus, dengue, flavivirus, coronavirus, sarscov2, hcv, norovirus - better parse stemp_loops From 622fdbb1a60c1ac7feaa0a8cb9f1a65a087d508e Mon Sep 17 00:00:00 2001 From: "f.touzain" Date: Wed, 27 Mar 2024 14:27:49 +0100 Subject: [PATCH 40/42] update version requirement, replacing = by >= for most of them --- src/vvv2_display.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/vvv2_display.yaml b/src/vvv2_display.yaml index 167a05a..29005e8 100644 --- a/src/vvv2_display.yaml +++ b/src/vvv2_display.yaml @@ -6,12 +6,12 @@ channels: - r dependencies: - pysam=0.19.1 - - numpy=1.23.3 + - numpy>=1.23.1 - ca-certificates - openssl - - r-ggplot2=3.3.6 - - r-gridextra=2.3 - - r-cowplot=1.1.1 - - conda-forge::r-stringr=1.5.1 - - conda-forge::r-jsonlite=1.8.8 + - r-ggplot2>=3.4.4 + - r-gridextra>=2.3 + - r-cowplot>=1.1.1 + - conda-forge::r-stringr>=1.5.1 + - conda-forge::r-jsonlite>=1.8.8 From fe4789bebd5fc88e801909477a0014fb5b99ee5c Mon Sep 17 00:00:00 2001 From: "f.touzain" Date: Wed, 27 Mar 2024 16:30:33 +0100 Subject: [PATCH 41/42] pysam version requirement relax --- src/vvv2_display.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vvv2_display.yaml b/src/vvv2_display.yaml index 29005e8..34ec02a 100644 --- a/src/vvv2_display.yaml +++ b/src/vvv2_display.yaml @@ -5,7 +5,7 @@ channels: - defaults - r dependencies: - - pysam=0.19.1 + - pysam>=0.19.1 - numpy>=1.23.1 - ca-certificates - openssl From f46189528b868d1887f0d18395e07d4f2d8dfda7 Mon Sep 17 00:00:00 2001 From: "f.touzain" Date: Wed, 27 Mar 2024 16:53:46 +0100 Subject: [PATCH 42/42] update documentation --- README.md | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 75420c5..bcf3092 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ # Description Tools to create: -- a .png image file describing all variants (obtained from vardict-java variant caller) alongside a genome/assembly (to provide) with their proportion (ordinates), with CDS descriptions (obtained from vadr annotator). +- a .png image file describing all variants (obtained from vardict-java variant caller) alongside a genome/assembly (to provide) with their proportion (ordinates), with CDS descriptions (obtained from vadr annotator). At the top of the figure can be displayed the coverage depth repartition (if `-o cov_depth_f` option is provided). Python/R scripts and Galaxy wrapper to use them. @@ -26,12 +26,13 @@ Convert ```vardict-java``` variant calling vcf file to human readable txt file - ```PYTHON_SCRIPTS/correct_multicontig_vardict_vcf.py```: Correct ```vadr``` annotation output .tbl file for contigs positions when the assembly provided is composed of more than one contig. - - ```R_SCRIPTS/visualize_snp_v4.R```: -Create a .png file showing variant proportions alongside the genome/assembly and CDS positions. +Create a .png file showing on the same png figure: + - coverage depth repartition alongside the genome/assembly (if `-o cov_depth_d` option provided) + - variant proportions alongside the genome/assembly and CDS positions. # Installation @@ -50,8 +51,17 @@ vvv2_display.py -h Typical usage: ``` -vvv2_display.py -p res_vadr_pass.tsv -f res_vadr_fail.tsv -s res_vadr_seqstat.txt -n res_vardict_all.vcf -m contig_limits.txt -r res_vvv2_display.png +vvv2_display.py -p res_vadr_pass.tsv -f res_vadr_fail.tsv -s res_vadr_seqstat.txt -n res_vardict_all.vcf -r res_vvv2_display.png -o cov_depth_f.txt ``` +where: + - `res_vadr_pass.tsv` is the 'pass' file of vadr annotation program run on the genome/assembly + - `res_vadr_fail.tsv` is the 'fail' file of vadr annotation program + - `res_vadr_seqstat.txt` is the 'seqstat' file of vadr annotation program + - `res_vardict_all.vcf` is the result of vardict-java variant caller + - `res_vvv2_display.png` is the name of the main output file (will be created) + - `cov_depth_f.txt` is the coverage depth by position, provided by `samtools depth` run on the bam alignement file. + +> All other options are for Galaxy wrapper compatibility (these are intermediate temporary files that must appear as parameter for Galaxy wrapper but are not used in a usual command line call) # Galaxy wrapper