diff --git a/.gitattributes b/.gitattributes
index 7142dc0..53ad104 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,4 +1,202 @@
-# Auto detect text files and perform LF normalization
-* text=auto
+## GITATTRIBUTES FOR WEB PROJECTS
+#
+# These settings are for any web project.
+#
+# Details per file setting:
+# text    These files should be normalized (i.e. convert CRLF to LF).
+# binary  These files are binary and should be left untouched.
+#
+# Note that binary is a macro for -text -diff.
+######################################################################
-* text eol=lf
+# Auto detect
+## Handle line endings automatically for files detected as
+## text and leave all files detected as binary untouched.
+## This will handle all files NOT defined below.
+* text=auto
+
+# Source code
+*.bash text eol=lf
+*.bat text eol=crlf
+*.cmd text eol=crlf
+*.coffee text
+*.css text diff=css
+*.htm text diff=html
+*.html text diff=html
+*.inc text
+*.ini text
+*.js text
+*.json text
+*.jsx text
+*.less text
+*.ls text
+*.map text -diff
+*.od text
+*.onlydata text
+*.php text diff=php
+*.pl text
+*.ps1 text eol=crlf
+*.py text eol=lf
+*.rb text diff=ruby
+*.sass text
+*.scm text
+*.scss text diff=css
+*.sh text eol=lf
+*.sql text
+*.styl text
+*.tag text
+*.ts text
+*.tsx text
+*.xml text
+*.xhtml text diff=html
+
+# Docker
+Dockerfile text
+
+# Documentation
+*.ipynb text
+*.markdown text diff=markdown
+*.md text diff=markdown
+*.mdwn text diff=markdown
+*.mdown text diff=markdown
+*.mkd text diff=markdown
+*.mkdn text diff=markdown
+*.mdtxt text
+*.mdtext text
+*.txt text
+AUTHORS text
+CHANGELOG text
+CHANGES text
+CONTRIBUTING text
+COPYING text
+copyright text
+*COPYRIGHT* text
+INSTALL text
+license text
+LICENSE text
+NEWS text
+readme text
+*README* text
+TODO text
+
+# Templates
+*.dot text
+*.ejs text
+*.haml text
+*.handlebars text
+*.hbs text
+*.hbt text
+*.jade text
+*.latte text
+*.mustache text
+*.njk text
+*.phtml text
+*.tmpl text
+*.tpl text
+*.twig text
+*.vue text
+
+# Configs
+*.cnf text
+*.conf text
+*.config text
+.editorconfig text
+.env text
+.gitattributes text
+.gitconfig text
+.htaccess text
+*.lock text -diff
+package.json text eol=lf
+package-lock.json text -diff
+pnpm-lock.yaml text eol=lf -diff
+yarn.lock text -diff
+*.toml text
+*.yaml text
+*.yml text
+browserslist text
+Makefile text
+makefile text
+
+# Heroku
+Procfile text
+
+# Graphics
+*.ai binary
+*.bmp binary
+*.eps binary
+*.gif binary
+*.gifv binary
+*.ico binary
+*.jng binary
+*.jp2 binary
+*.jpg binary
+*.jpeg binary
+*.jpx binary
+*.jxr binary
+*.pdf binary
+*.png binary
+*.psb binary
+*.psd binary
+# SVG treated as text by default.
+*.svg text
+# If you want to treat it as binary,
+# use the following line instead.
+# *.svg binary +*.svgz binary +*.tif binary +*.tiff binary +*.wbmp binary +*.webp binary + +# Audio +*.kar binary +*.m4a binary +*.mid binary +*.midi binary +*.mp3 binary +*.ogg binary +*.ra binary + +# Video +*.3gpp binary +*.3gp binary +*.as binary +*.asf binary +*.asx binary +*.fla binary +*.flv binary +*.m4v binary +*.mng binary +*.mov binary +*.mp4 binary +*.mpeg binary +*.mpg binary +*.ogv binary +*.swc binary +*.swf binary +*.webm binary + +# Archives +*.7z binary +*.gz binary +*.jar binary +*.rar binary +*.tar binary +*.zip binary + +# Fonts +*.ttf binary +*.eot binary +*.otf binary +*.woff binary +*.woff2 binary + +# Executables +*.exe binary +*.pyc binary + +# RC files (like .babelrc or .eslintrc) +*.*rc text + +# Ignore files (like .npmignore or .gitignore) +*.*ignore text diff --git a/.gitignore b/.gitignore index a68d843..e012ea4 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,6 @@ Gencode/* Cache/* *nohup* Annotations/* +PAMs/* +samplesIDs/* +guides/* \ No newline at end of file diff --git a/PostProcess/CRISPRme_plots.py b/PostProcess/CRISPRme_plots.py index 0b68ea7..39b65fe 100644 --- a/PostProcess/CRISPRme_plots.py +++ b/PostProcess/CRISPRme_plots.py @@ -18,13 +18,28 @@ #import seaborn as sns import sys import matplotlib +import warnings + +# ignore all warnings +warnings.filterwarnings("ignore") # set matplotlib to not use X11 server matplotlib.use('Agg') +# # Read file +# df = pd.read_csv(sys.argv[1], sep="\t", +# index_col=False, na_values=['n'], nrows=1200) +# out_folder = sys.argv[2] + +# # Make index column that numbers the OTs starting from 1 +# df = df.reset_index() +# df["index"] += 1 + # Read file -df = pd.read_csv(sys.argv[1], sep="\t", na_values=['n'], nrows=1200) +df = pd.read_csv(sys.argv[1], sep="\t", + index_col=False, na_values=['n'], nrows=1200) out_folder = sys.argv[2] +guide = sys.argv[3] # Make index column that numbers the OTs starting from 1 df = df.reset_index() @@ -110,7 +125,8 @@ # Save plt.tight_layout() -plt.savefig(out_folder+"CRISPRme_top_100_linear_annotated_for_supplement.png") +plt.savefig( + out_folder+f"CRISPRme_top_100_linear_annotated_for_supplement_{guide}.png") plt.clf() @@ -163,5 +179,5 @@ # Save plt.tight_layout() -plt.savefig(out_folder+"CRISPRme_top_1000_log_for_main_text.png") +plt.savefig(out_folder+f"CRISPRme_top_1000_log_for_main_text_{guide}.png") plt.clf() diff --git a/PostProcess/CRISPRme_plots_personal.py b/PostProcess/CRISPRme_plots_personal.py new file mode 100644 index 0000000..f2134cd --- /dev/null +++ b/PostProcess/CRISPRme_plots_personal.py @@ -0,0 +1,183 @@ +""" +Linda Lin +1/15/2021 + +Input: CRISPRme best file (top 1000 sites or more, without on-target) +Output: top OT plots for CRISPRme manuscript + +(Note: will get runtime warnings but all are fine to ignore) +""" + +import pandas as pd +import numpy as np +import math +import matplotlib.pyplot as plt +import matplotlib.patches as mpatches +import matplotlib.lines as mlines +import matplotlib.colors as mcolors +#import seaborn as sns +import sys +import matplotlib +import warnings + +# ignore all warnings +warnings.filterwarnings("ignore") + +# set matplotlib to not use X11 server +matplotlib.use('Agg') + +# # Read file +# df = pd.read_csv(sys.argv[1], sep="\t", +# index_col=False, na_values=['n'], nrows=1200) +# out_folder = sys.argv[2] + +# # Make index column that numbers the OTs starting from 1 +# df = df.reset_index() +# df["index"] += 1 + +# Read file +df = pd.read_csv(sys.argv[1], sep="\t", + index_col=False, na_values=['n'], nrows=100) +out_folder = 
sys.argv[2] +guide = sys.argv[3] + +# Make index column that numbers the OTs starting from 1 +df = df.reset_index() +df["index"] += 1 + +# If prim_AF = 'n', then it's a ref-nominated site, so we enter a fake numerical AF +# This will cause a warning of invalid sqrt later on, but that's fine to ignore +df["prim_AF"] = df["prim_AF"].fillna(-1) + +# If multiple AFs (haplotype with multiple SNPs), take min AF +# Approximation until we have haplotype frequencies +df["AF"] = df["prim_AF"].astype(str).str.split(',') +df["AF"] = df["AF"].apply(lambda x: min(x)) +df["AF"] = pd.to_numeric(df["AF"]) + +# Adjustments for plotting purposes +# so haplotypes that got rounded down to AF = 0 (min AF = 0.01) still appear in the plot +df["plot_AF"] = df["AF"] + 0.001 +df["plot_AF"] *= 1000 # make points larger +df["plot_AF"] = np.sqrt(df["plot_AF"]) # so size increase is linear + +# Calculate ref_AF as (1 – alt_AF) +# Not precisely correct because there can be other non-ref haplotypes, but approximation should be accurate in most cases +df["ref_AF"] = 1 - df["AF"] +df["ref_AF"] *= 1000 # make points larger +df["ref_AF"] = np.sqrt(df["ref_AF"]) # so size increase is linear + +# Transparent colors +transparent_red = mcolors.colorConverter.to_rgba("red", alpha=0.5) +transparent_blue = mcolors.colorConverter.to_rgba("blue", alpha=0.5) +transparent_gray = mcolors.colorConverter.to_rgba("gray", alpha=0.5) + +# Color by annotation, with coding > accessible > neither +# Note that coding & accessible categories are not mutually exclusive, but coding is more important than accessible, so if both, it's colored as coding +df["color_str"] = np.where(df["gene_annotation"] == + "CDS", "red", "gray") # coding +df["color_str"] = np.where(df["annotation"].str.contains( + "HSC-1", na=False), "blue", df["color_str"]) # DHS, accessible in HSCs +df["color"] = df["color_str"].apply(lambda x: transparent_red if x == "red" else ( + transparent_blue if x == "blue" else transparent_gray)) + +""" +Linear, annotated, top 100 (for supplement) +""" +# Plot data +ax = df.plot.scatter(x="index", y="highest_CFD_score(ref)", + s="ref_AF", c="color", marker='*', zorder=1) +df.plot.scatter(x="index", y="highest_CFD_score(alt)", + s="plot_AF", c="color", zorder=2, ax=ax) +# plt.title("Top CRISPRme-identified sites for sgRNA 1617") +plt.xlabel("Candidate off-target site") +plt.ylabel("CFD score") + +# Boundaries +plt.xlim(xmin=0.01, xmax=100) +plt.ylim(ymin=0, ymax=1) + +# Arrows +for x, y, z in zip(df["index"], df["highest_CFD_score(ref)"], df["highest_CFD_score(alt)"]-df["highest_CFD_score(ref)"]): + plt.arrow(x, y+0.01, 0, z-0.02, color='green', head_width=0.75, + head_length=0.015, length_includes_head=True, alpha=0.5, zorder=0) + # +/- to avoid overlap of arrow w/ points + +# Size legend +s1 = mlines.Line2D([], [], marker='o', label='1', linestyle='None', + markersize=math.sqrt(math.sqrt((1+0.001)*1000)), color='black') +s01 = mlines.Line2D([], [], marker='o', label='0.1', linestyle='None', + markersize=math.sqrt(math.sqrt((0.1+0.001)*1000)), color='black') +s001 = mlines.Line2D([], [], marker='o', label='0.01', linestyle='None', + markersize=math.sqrt(math.sqrt((0.01+0.001)*1000)), color='black') +plt.gca().add_artist(plt.legend( + handles=[s1, s01, s001], title="Allele frequency", loc=9)) + +# Shape & color legend +star = mlines.Line2D([], [], marker='*', label='Reference', + linestyle='None', markersize=10, color='black') +circle = mlines.Line2D([], [], marker='o', label='Alternative', + linestyle='None', markersize=10, color='black') 
+red = mpatches.Patch(color=transparent_red, label='Coding') +blue = mpatches.Patch(color=transparent_blue, label='DHS in HSC-1') +gray = mpatches.Patch(color=transparent_gray, label='Neither') +plt.legend(handles=[star, circle, red, blue, gray]) + +# Save +plt.tight_layout() +plt.savefig( + out_folder+f"CRISPRme_top_100_linear_annotated_for_supplement_{guide}.png") +plt.clf() + + +""" +Log, ref/alt, top 1000: for main text +""" +# matplotlib plot settings +plt.rcParams["figure.dpi"] = 600 +plt.rcParams["figure.figsize"] = 4.5, 2.5 +plt.rcParams.update({'font.size': 7}) +plt.rcParams['pdf.fonttype'] = 42 +plt.rcParams['ps.fonttype'] = 42 + +# seaborn plot settings - also need to comment out line 114 and uncomment line 115 +# sns.set_style('white') +# sns.set_context('talk') # alternatively, 'poster' +# k = 2 # scale factor +# fig = plt.figure(figsize=(7*k, 2*k)) +# ax = fig.add_subplot(1,1,1) + +# Plot data +ax = df.plot.scatter(x="index", y="highest_CFD_score(ref)", + s="ref_AF", c=transparent_red, zorder=1) +# ax = df.plot.scatter(x="index", y="highest_CFD_score(ref)", s="ref_AF", c=transparent_red, zorder=1, ax=ax) +df.plot.scatter(x="index", y="highest_CFD_score(alt)", + s="plot_AF", c=transparent_blue, zorder=2, ax=ax) +ax.set_xscale("log") +# plt.title("Top CRISPRme-identified sites for sgRNA 1617") +plt.xlabel("Candidate off-target site") +plt.ylabel("CFD score") + +# Boundaries +plt.xlim(xmin=0.9, xmax=100) +plt.ylim(ymin=0, ymax=1) + +# Arrows +for x, y, z in zip(df["index"], df["highest_CFD_score(ref)"], df["highest_CFD_score(alt)"]-df["highest_CFD_score(ref)"]): + plt.arrow(x, y+0.02, 0, z-0.04, color='gray', head_width=(x*(10**0.005-10**(-0.005))), + head_length=0.02, length_includes_head=True, zorder=0, alpha=0.5) + # +/- to avoid overlap of arrow w/ points, head_width calculated to remain constant despite log scale of x-axis + +# Size legend +plt.gca().add_artist(plt.legend( + handles=[s1, s01, s001], title="Allele frequency", ncol=3, loc=9)) + +# Color legend +red = mpatches.Patch(color=transparent_red, label="Reference") +blue = mpatches.Patch(color=transparent_blue, label="Alternative") +plt.legend(handles=[red, blue]) + +# Save +plt.tight_layout() +plt.savefig(out_folder+f"CRISPRme_top_1000_log_for_main_text_{guide}.png") +plt.clf() diff --git a/PostProcess/analisi_indels_NNN.py b/PostProcess/analisi_indels_NNN.py index 3e3bedf..e0ef3ed 100644 --- a/PostProcess/analisi_indels_NNN.py +++ b/PostProcess/analisi_indels_NNN.py @@ -888,6 +888,7 @@ def reverse_complement_table(seq): # t = t[::-1] guide_no_pam = final_result[1][pos_beg:pos_end] list_t = list(t) + tmp_pos_mms = None for position_t, char_t in enumerate(final_result[2][pos_beg:pos_end]): if char_t.upper() != guide_no_pam[position_t]: mm_new_t += 1 diff --git a/PostProcess/analisi_indels_NNN.sh b/PostProcess/analisi_indels_NNN.sh index 808af1f..6c3c2fa 100644 --- a/PostProcess/analisi_indels_NNN.sh +++ b/PostProcess/analisi_indels_NNN.sh @@ -44,7 +44,7 @@ output_folder=${12} touch $REFtargets.corrected # 1) Rimozione duplicati, estrazione semicommon e unique e creazione file total -echo 'Creazione file .total.txt' +#echo 'Creazione file .total.txt' ./extraction.sh "$REFtargets.corrected" "$ENRtargets" "$jobid" # OUTPUT $jobid.common_targets.txt -> Non usato # $jobid.semi_common_targets.txt # $jobid.unique_targets.txt @@ -66,7 +66,7 @@ rm "$jobid.semi_common_targets.minmaxdisr.txt" #awk '{real_guide=$2; gsub("-","",real_guide); print $0"\tn\tn\tn\tn\t"real_guide"\tn\tn\tn"}' $ENRtargets.corrected > $jobid.total.txt 
-echo 'Creazione cluster del file .total.txt'
+#echo 'Creazione cluster del file .total.txt'
 # 3) Clustering
 ./cluster.dict.py "$jobid.total.txt" 'no' 'True' 'True' "$guide_file" 'total' 'orderChr' # OUTPUT $jobid.total.cluster.txt
@@ -95,7 +95,7 @@ rm "$jobid.total.txt"
 # I file .bestCFD.txt si possono unire (attenzione che avrò 3 header all'interno del file) per ottenere il file completo
 # I file .CFDGraph.txt vanno sommati tra loro per ottenere il file finale -> AL MOMENTO NON NECESSARIO PER QUESTA ANALISI (TENERE COMUNQUE I FILE)
 
-echo 'Estrazione sample dal file .total.cluster.txt'
+#echo 'Estrazione sample dal file .total.cluster.txt'
 ./analisi_indels_NNN.py "$annotationfile" "$jobid.total.cluster.txt" "$jobid" "$dictionaries" "$pam_file" "$mismatch" "$referencegenome" "$guide_file" $bulgesDNA $bulgesRNA
 # OUTPUT $jobid.bestCFD.txt
diff --git a/PostProcess/convert_gnomAD.py b/PostProcess/convert_gnomAD.py
index 091cc39..12065df 100644
--- a/PostProcess/convert_gnomAD.py
+++ b/PostProcess/convert_gnomAD.py
@@ -2,40 +2,85 @@
 import gzip
 import sys
-import re
-inVCF = gzip.open(sys.argv[1], 'rt')
+import glob
+import os
+import multiprocessing
+
+vcfDIR = sys.argv[1]
 inPop = open(sys.argv[2], 'r').readlines()
-outVCF = gzip.open(sys.argv[1].replace('bgz', 'gz'), 'wt')
-
-header = list()
-popDict = dict()
-for pop in inPop:
-    popDict[pop.strip()] = '0|0'
-
-for line in inVCF:
-    if '##' in line:
-        header.append(line.strip())
-    else:
-        popheader = '\t'.join(popDict.keys())
-        header.append(line.strip()+'\t'+popheader+'\n')
-        break
-
-# print('\n'.join(header))
-outVCF.write('\n'.join(header))
-
-
-for line in inVCF:
-    split = line.strip().split('\t')
-    info = split[7].strip().split(';')
-    for pop in popDict:
-        popDict[pop] = '0|0'
-        for index, data in enumerate(info):
-            if 'AC-'+str(pop)+'=' in data:
-                ACvalue = int(data.strip().split('=')[1])
-                if ACvalue > 0:
-                    popDict[pop] = '0|1'
-    outVCF.write('\t'.join(split[:8])+'\t'+'\t'.join(popDict.values())+'\n')
-
-inVCF.close()
-outVCF.close()
-# print('\t'.join(split[:8]), '\t', '\t'.join(popDict.values()))
+threads = 0
+try:
+    threads = int(sys.argv[3])
+except:
+    threads = len(os.sched_getaffinity(0))-2
+
+
+def convertVCF(inVCF):
+    # open VCF file
+    outVCFname = inVCF.replace('bgz', 'gz')
+    outVCF = gzip.open(inVCF.replace('bgz', 'gz'), 'wt')
+    inVCF = gzip.open(inVCF, 'rt')
+
+    # read samplesID from file
+    header = list()
+    popDict = dict()
+    for pop in inPop:
+        if '#' in pop:
+            continue
+        popDict[pop.split()[0].strip()] = '0|0'
+
+    # read header from original VCF
+    for line in inVCF:
+        if '##' in line:
+            header.append(line.strip())
+        else:
+            popheader = '\t'.join(popDict.keys())
+            header.append('##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">')
+            header.append(line.strip()+'\tFORMAT'+'\t'+popheader+'\n')
+            break
+    # insert header into new VCF
+    outVCF.write('\n'.join(header))
+
+    # read each variant in the VCF
+    for line in inVCF:
+        split = line.strip().split('\t')
+        if split[6] != 'PASS':  # skip rows with no PASS in FILTER
+            continue
+        info = split[7].strip().split(';')
+        for pop in popDict:
+            popDict[pop] = '0|0'
+            for index, data in enumerate(info):  # read AC for each gnomAD population and insert a fake GT for each sample
+                if 'AC_'+str(pop)+'=' in data:
+                    ACvalue = int(data.strip().split('=')[1])
+                    if ACvalue > 0:
+                        popDict[pop] = '0|1'
+        split[7] = info[2]
+        # write each line passing the filtering into the new VCF
+        outVCF.write('\t'.join(split[:8])+'\tGT\t'+'\t'.join(popDict.values())+'\n')
+
+    inVCF.close()
+    outVCF.close()
+    return outVCFname
+
+def bcftools_merging(outVCFname):
+    # finalOutVCF = outVCFname.replace('genomes','genomes.collapsed')
+    tempName = outVCFname.strip().split('.')
+    tempName[-3] = tempName[-3]+'.collapsed'
+    finalOutVCF = '.'.join(tempName)
+    os.system(f"bcftools norm -m+ -O z -o {finalOutVCF} {outVCFname}")
+
+
+def full_process(inVCF):
+    # function to process each VCF from initial conversion to collapsing multi-variant alleles with shared reference into one entry
+    outVCFname = convertVCF(inVCF)  # convert VCF from gnomAD v3.1 to VCF 4.2 compatible with CRISPRme
+    bcftools_merging(outVCFname)  # merge multi-variant alleles into a single entry
+    os.system(f"rm -f {outVCFname}")  # remove unused temp VCF file
+
+
+if __name__ == '__main__':
+    pool = multiprocessing.Pool(threads)
+    # call convertVCF for each VCF in the directory
+    listVCF = glob.glob(vcfDIR+'/*.vcf.bgz')
+    for inVCF in listVCF:
+        pool.apply_async(full_process, args=(inVCF,))
+    # wait until all workers are completed, then join
+    pool.close()
+    pool.join()
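For context on what convertVCF() emits: every PASS record keeps its first eight columns, gains a FORMAT column set to GT, and gets one pseudo-sample column per gnomAD population, marked heterozygous whenever that population's allele count is positive. A minimal sketch of that rule follows; the population names and the INFO string are illustrative, not taken from a real gnomAD file:

# Sketch of the per-record conversion rule used above (illustrative data).
populations = ["afr", "amr", "nfe"]

def fake_genotypes(info_field):
    """Mark a population 0|1 when its allele count (AC_<pop>) is positive."""
    genotypes = {pop: "0|0" for pop in populations}
    for entry in info_field.split(";"):
        for pop in populations:
            if entry.startswith("AC_" + pop + "="):
                if int(entry.split("=")[1]) > 0:
                    genotypes[pop] = "0|1"
    return genotypes

print(fake_genotypes("AC_afr=3;AC_amr=0;AC_nfe=12"))
# {'afr': '0|1', 'amr': '0|0', 'nfe': '0|1'}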
diff --git a/PostProcess/generate_img_radar_chart.py b/PostProcess/generate_img_radar_chart.py
new file mode 100644
index 0000000..a40b7a0
--- /dev/null
+++ b/PostProcess/generate_img_radar_chart.py
@@ -0,0 +1,223 @@
+#!/usr/bin/env python
+
+# Libraries
+from operator import itemgetter
+from os.path import isfile, join
+from os import listdir
+import os
+import warnings
+import glob
+from itertools import islice
+import sys
+
+from pandas.core.tools.numeric import to_numeric
+import numpy as np
+import scipy.spatial.distance as sp
+from math import pi
+import pandas as pd
+from matplotlib import patches as mpatches
+from matplotlib import pyplot as plt
+import math
+import matplotlib
+import time
+import random
+import multiprocessing
+import json
+
+# matplotlib.use("TkAgg")
+matplotlib.use('Agg')
+
+warnings.filterwarnings("ignore")
+
+plt.style.use('seaborn-poster')
+matplotlib.rcParams['pdf.fonttype'] = 42
+matplotlib.rcParams['ps.fonttype'] = 42
+random.seed(a=None, version=2)
+
+guide = sys.argv[1]  # guide file used during search
+try:
+    guideDictFile = open(sys.argv[2])
+    motifDictFile = open(sys.argv[3])
+except:
+    sys.exit()
+mismatch = sys.argv[4]
+bulge = sys.argv[5]
+elem = sys.argv[6]
+outDir = sys.argv[7]  # directory to output the figures
+# threads = int(sys.argv[6]) # number of concurrent execution of image creation
+
+web_server = False
+population = False
+# file_extension = 'pdf'
+file_extension = 'png'
+
+if '-ws' in sys.argv[:]:
+    web_server = True
+if web_server:
+    file_extension = 'png'
+
+
+def generatePlot(guide, guideDict, motifDict, mismatch, bulge, source):
+
+    # check if no targets are found for that combination source/totalcount and skip the execution
+    if guideDict['General'] == 0:
+        return
+    percentage_list = []
+    for elem in guideDict:
+        if float(guideDict['General']) != 0:
+            percentage_list.append(
+                float(str(float(guideDict[elem])/float(guideDict['General']))[0:5]))
+        else:
+            percentage_list.append(float(0))
+
+    guideDataFrame = pd.DataFrame.from_dict(guideDict, orient='index')
+    guideDataFrame['Percentage'] = percentage_list
+    guideDataFrame.columns = ['Total', 'Percentage']
+    # convert to int total column
+    # guideDataFrame['Total'] = guideDataFrame['Total'].astype('int32')
+    guideDataFrame = guideDataFrame.T
+    # number of variables
+    categories = list(guideDataFrame)[0:]
+    N = len(categories)
+
+    # We are going to plot the first line of the data frame.
+    # But we need to repeat the first value to close the circular graph:
+    values = guideDataFrame.loc['Percentage'].values.flatten().tolist()
+    values += values[:1]
+
+    # What will be the angle of each axis in the plot? (we divide the plot / number of variables)
+    angles = [n / float(N) * 2 * pi for n in range(N)]
+    angles += angles[:1]
+
+    # Initialise the spider plot
+    ax = plt.subplot(2, 2, 1, polar=True)
+
+    for label, rot in zip(ax.get_xticklabels(), angles):
+        if (rot == 0):
+            label.set_horizontalalignment("center")
+        if (rot > 0):
+            label.set_horizontalalignment("left")
+        if (rot > 3):
+            label.set_horizontalalignment("center")
+        if (rot > 4):
+            label.set_horizontalalignment("right")
+
+    # Draw one axis per variable and add the category labels
+    plt.xticks(angles[:-1], categories, color='black', size=14)
+
+    # Draw ylabels
+    ax.set_rlabel_position(0)
+    plt.yticks([0, 0.25, 0.50, 0.75], ["0", "0.25",
+               "0.50", "0.75"], color="black", size=12)
+    plt.ylim(0, 1)
+
+    # Fill area
+    ax.fill(angles, values, 'b', alpha=0.1)
+
+    # offset the y-axis position
+    ax.set_theta_offset(pi / 2)
+    ax.set_theta_direction(-1)
+    # Plot data
+    ax.plot(angles, values, linewidth=1, linestyle='solid')
+
+    plt.subplot(2, 2, 2)
+    transpose_list = []
+    guideDataFrame = guideDataFrame.T
+    # guideDataFrame['Total'] = to_numeric(
+    #     guideDataFrame['Total'], downcast='integer')
+    for elem in categories:
+        transpose_list.append(list(guideDataFrame.loc[elem]))
+    # transpose_list.append(
+    #     [guideDataFrame.loc[elem, 'Total'], guideDataFrame.loc[elem, 'Percentage']])
+    # print(transpose_list)
+    templist = list()
+    for couple in transpose_list:
+        couple[0] = int(couple[0])
+        templist.append(couple)
+    # replace rows with the int-converted Total values
+    transpose_list = templist
+
+    plt.axis('off')
+    table = plt.table(cellText=transpose_list, rowLabels=categories, colLabels=['Total', 'Percentage'],
+                      loc='best', colWidths=[0.25, 0.25])
+    table.auto_set_font_size(False)
+    table.set_fontsize(13)
+
+    totalMotif = [0]*len(guide)
+    for count in range(len(guide)):
+        for nuc in motifDict:
+            totalMotif[count] += motifDict[nuc][count]
+
+    maxmax = max(totalMotif)
+    for count in range(len(guide)):
+        for nuc in motifDict:
+            if maxmax != 0:
+                motifDict[nuc][count] = float(
+                    motifDict[nuc][count]/float(maxmax))
+                # motifDict[nuc][count] = float(
+                #     str(float(motifDict[nuc][count])/float(maxmax))[0:5])
+
+    # ind = np.arange(0, len(guide), 1) + 0.15
+    ind = np.arange(0, len(guide), 1)
+    width = 0.7  # the width of the bars: can also be len(x) sequence
+
+    motif = plt.subplot(2, 1, 2, frameon=False)
+
+    A = np.array(motifDict['A'], dtype=float)
+    C = np.array(motifDict['C'], dtype=float)
+    G = np.array(motifDict['G'], dtype=float)
+    T = np.array(motifDict['T'], dtype=float)
+    RNA = np.array(motifDict['RNA'], dtype=float)
+    DNA = np.array(motifDict['DNA'], dtype=float)
+
+    p1 = plt.bar(ind, A, width, align='center')
+    p2 = plt.bar(ind, C, width, bottom=A, align='center')
+    p3 = plt.bar(ind, G, width, bottom=A+C, align='center')
+    p4 = plt.bar(ind, T, width, bottom=C+G+A, align='center')
+    p5 = plt.bar(ind, RNA, width, bottom=C+G+A+T, align='center')
+    p6 = plt.bar(ind, DNA, width, bottom=C+G+A+T+RNA,
+                 align='center')
+
+    # plt.xlim(0, len(guide))
+    # strArray = np.array([list(guide)])
+    plt.xticks(ticks=ind, labels=list(guide))
+
+    plt.legend((p1[0], p2[0], p3[0], p4[0], p5[0], p6[0]),
+               ('A', 'C', 'G', 'T', 'bRNA', 'bDNA'), fontsize=13, loc='upper left', ncol=6)
+
+    # strArray = 
np.array([list(guide)]) + # table = plt.table(cellText=strArray, loc='bottom', + # cellLoc='center', rowLoc='bottom') + # table.auto_set_font_size(False) + # table.set_fontsize(12) + # # table.scale(1, 1.6) + # table.xticks = ([]) + # table.yticks = ([]) + # plt.xticks + + plt.suptitle(str(mismatch)+" Mismatches + "+str(bulge)+" Bulge "+str(source), + horizontalalignment='center', color='black', size=25) + + plt.tight_layout() + plt.subplots_adjust(top=0.85, bottom=0.05, left=0.05, + right=0.95, wspace=0.1) + plt.savefig(outDir+"/summary_single_guide_" + str(guide) + "_" + str(mismatch) + + "."+str(bulge) + '_' + str(source) + "." + file_extension, format=file_extension) + + plt.close('all') + + +guide = guide.strip() +guideDict = json.load(guideDictFile) +motifDict = json.load(motifDictFile) +if __name__ == '__main__': + # skip creation if no target in global category + if guideDict[mismatch][bulge]['TOTAL']['General'] == 0: + sys.exit() + + generatePlot(guide, guideDict[mismatch][bulge][elem], + motifDict[mismatch][bulge][elem], mismatch, bulge, elem) diff --git a/PostProcess/merge_1000G_HGDP.py b/PostProcess/merge_1000G_HGDP.py index f0992e8..ea05dbe 100644 --- a/PostProcess/merge_1000G_HGDP.py +++ b/PostProcess/merge_1000G_HGDP.py @@ -24,28 +24,46 @@ def init_summary_by_sample(path_samplesID): def get_best_targets(cluster, fileOut, fileOut_disc, cfd, snp_info): list_1000G = [] list_HGDP = [] - + list_ref = [] + for ele in cluster: if ele[-1] == 'HGDP': list_HGDP.append(ele) - else: + elif ele[-1] == '1000G': list_1000G.append(ele) + else: + list_ref.append(ele) list_HGDP.sort(key = lambda x : (float(x[cfd]), -int(x[total])), reverse=True) #taaaac list_1000G.sort(key = lambda x : (float(x[cfd]), -int(x[total])), reverse=True) + list_ref.sort(key = lambda x : (float(x[cfd]), -int(x[total])), reverse=True) best_HGDP = [] best_1000G = [] best_cfd_1000G_or_ref = 0 + cfd_var = -1 if len(list_HGDP) > 0: best_HGDP = list_HGDP[0] + cfd_var = float(best_HGDP[cfd]) best_cfd_1000G_or_ref = float(best_HGDP[cfd+1]) if len(list_1000G) > 0: best_1000G = list_1000G[0] + if cfd_var < float(best_1000G[cfd]): + cfd_var = float(best_1000G[cfd]) + #print('prev 1000G best cfd',best_cfd_1000G_or_ref) if best_cfd_1000G_or_ref < float(best_1000G[cfd]): best_cfd_1000G_or_ref = float(best_1000G[cfd]) if best_cfd_1000G_or_ref < float(best_1000G[cfd+1]): best_cfd_1000G_or_ref = float(best_1000G[cfd+1]) + ref_is_best = False + #print(list_ref) + if len(list_ref) > 0: + best_ref = list_ref[0] + #print('prev ref best cfd',best_cfd_1000G_or_ref) + if best_cfd_1000G_or_ref <= float(best_ref[cfd]): + best_cfd_1000G_or_ref = float(best_ref[cfd]) + if best_cfd_1000G_or_ref >= cfd_var: + ref_is_best = True #append diff_cfd from best 1000G in cluster for ele in list_HGDP: @@ -54,14 +72,27 @@ def get_best_targets(cluster, fileOut, fileOut_disc, cfd, snp_info): for ele in list_1000G: ele.append(str(float(ele[cfd]) - best_cfd_1000G_or_ref)) + for ele in list_ref: + ele.append(str(float(ele[cfd]) - best_cfd_1000G_or_ref)) #write best target in bestFile populations = [] - n_seq_in_cluster = str(len(list_1000G) + len(list_HGDP) - 1) - if len(best_HGDP) > 0 and float(best_HGDP[cfd]) - best_cfd_1000G_or_ref > 0: + n_seq_in_cluster = str(len(list_1000G) + len(list_HGDP) + len(list_ref) - 1) + #print(ref_is_best) + if ref_is_best: + best_ref[cfd-1] = n_seq_in_cluster + best_ref[2*cfd+1] = n_seq_in_cluster + + best_ref.append("NO_POP") + fileOut.write('\t'.join(best_ref)+"\n") + list_ref.pop(0) + elif len(best_HGDP) > 0 and 
float(best_HGDP[cfd]) - best_cfd_1000G_or_ref > 0:
         best_HGDP[cfd-1] = n_seq_in_cluster
         best_HGDP[2*cfd+1] = n_seq_in_cluster
+        if len(list_1000G) == 0:  # this is when HGDP generates a new target not found in 1000G
+            best_HGDP[-2] = "HGDP_only"
+
         for sample in best_HGDP[true_guide-2].split(','):
             populations.append(dict_samples_HGDP[sample][2])
         pops = Counter(populations)
@@ -88,20 +119,26 @@
         list_1000G.pop(0)
 
     list_1000G.extend(list_HGDP)
+    list_1000G.extend(list_ref)
     list_1000G.sort(key = lambda x : (float(x[cfd]), -int(x[total])), reverse=True)
 
     for ele in list_1000G:
         populations = []
-        if ele[-2] == "HGDP":
+        if ele[-2] == "HGDP" or ele[-2] == "HGDP_only":
+            #print(ele)
             for sample in ele[true_guide-2].split(','):
                 populations.append(dict_samples_HGDP[sample][2])
-        else:
+        elif ele[-2] == "1000G":
             for sample in ele[true_guide-2].split(','):
                 populations.append(dict_samples_1000G[sample][2])
-        pops = Counter(populations)
-        string_populations = []
-        for pop in pops:
-            string_populations.append(str(pops[pop])+"_"+str(pop))
+        else:
+            string_populations = ["NO_POP"]
+
+        if ele[-2] != "REF":
+            pops = Counter(populations)
+            string_populations = []
+            for pop in pops:
+                string_populations.append(str(pops[pop])+"_"+str(pop))
         ele[cfd-1] = n_seq_in_cluster
         ele[2*cfd+1] = n_seq_in_cluster
         ele.append(','.join(string_populations))
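The new_simple_analysis.py hunk below sorts candidate targets by CFD score descending while breaking ties ascending on the mismatch/bulge counts, by wrapping the tie-break keys in the file's reversor class (its __lt__ is visible in the hunk header). A minimal sketch of that pattern, with made-up rows:

# Sketch of the reversor trick used in new_simple_analysis.py: wrapping a
# value inverts its comparison, so one sort() can mix sort directions.
class reversor:
    def __init__(self, obj):
        self.obj = obj

    def __eq__(self, other):
        return other.obj == self.obj

    def __lt__(self, other):
        return other.obj < self.obj  # reversed comparison

# sort by score descending, ties broken by mismatch count ascending
rows = [("t1", 0.9, 3), ("t2", 0.9, 1), ("t3", 1.0, 2)]
rows.sort(key=lambda r: (r[1], reversor(r[2])), reverse=True)
print(rows)  # [('t3', 1.0, 2), ('t2', 0.9, 1), ('t1', 0.9, 3)]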
diff --git a/PostProcess/new_simple_analysis.py b/PostProcess/new_simple_analysis.py
index a770a1b..ab45799 100644
--- a/PostProcess/new_simple_analysis.py
+++ b/PostProcess/new_simple_analysis.py
@@ -104,7 +104,7 @@ def __lt__(self, other):
 def calc_cfd(guide_seq, sg, pam, mm_scores, pam_scores, do_scores):
     if do_scores == False:
-        print("NON CALCOLO")
+        # print("NON CALCOLO")
         score = 0
         return score
     score = 1
@@ -495,6 +495,84 @@ def retrieveFromDict(chr_pos):
         cluster_to_save.append(final_line)
 
 
+for t in cluster_to_save:
+
+    if t[0] == 'DNA':
+        cfd_score = calc_cfd(t[1][int(t[bulge_pos]):], t[2].upper()[int(
+            t[bulge_pos]):-3], t[2].upper()[-2:], mm_scores, pam_scores, do_scores)
+        t.append(str(cfd_score))
+    else:
+        cfd_score = calc_cfd(t[1], t[2].upper()[
+            :-3], t[2].upper()[-2:], mm_scores, pam_scores, do_scores)
+        t.append(str(cfd_score))
+    # print(cfd_score)
+
+cluster_to_save.sort(key=lambda x: (
+    float(x[-1]), reversor(int(x[9])), reversor(int(x[-2]))), reverse=True)
+
+cluster_to_save_mmbl = cluster_to_save.copy()
+cluster_to_save_mmbl.sort(key=lambda x: (int(x[8]), int(x[7])))
+
+keys_seen = []
+saved = False
+
+if cluster_to_save:
+    c = cluster_to_save[0]
+    c.pop(-2)
+    cfd_clus_key = c[3] + " " + c[5] + " " + c[6] + " " + c[17]
+
+    if c[12] == 'n':
+        cfd_for_graph['ref'][round(float(c[-1]) * 100)] += 1
+    else:
+        cfd_for_graph['var'][round(float(c[-1]) * 100)] += 1
+
+    cfd_best.write('\t'.join(c))
+    cluster_to_save.pop(0)
+    keys_seen.append(cfd_clus_key)
+    saved = True
+
+list_for_alt = []
+for c in cluster_to_save:
+    c.pop(-2)
+    new_cfd_clus_key = c[3] + " " + c[5] + " " + c[6] + " " + c[17]
+    if new_cfd_clus_key not in keys_seen:
+        keys_seen.append(new_cfd_clus_key)
+        if c[12] == 'n':
+            cfd_for_graph['ref'][round(float(c[-1]) * 100)] += 1
+        else:
+            cfd_for_graph['var'][round(float(c[-1]) * 100)] += 1
+        list_for_alt.append('\t'.join(c))
+if saved:
+    cfd_best.write('\t' + str(len(list_for_alt)) + '\n')
+for ele in list_for_alt:
+    cfd_best.write(ele+'\t'+str(len(list_for_alt)) + '\n')
+
+keys_seen = []
+saved = False
+if cluster_to_save_mmbl:
+    c = cluster_to_save_mmbl[0]
+    cfd_clus_key = c[3] + " " + c[5] + " " + c[6] + " " + c[17]
+
+    mmblg_best.write('\t'.join(c))
+    cluster_to_save_mmbl.pop(0)
+    keys_seen.append(cfd_clus_key)
+    saved = True
+
+list_for_alt = []
+for c in cluster_to_save_mmbl:
+    new_cfd_clus_key = c[3] + " " + c[5] + " " + c[6] + " " + c[17]
+    if new_cfd_clus_key not in keys_seen:
+        keys_seen.append(new_cfd_clus_key)
+        list_for_alt.append('\t'.join(c))
+
+if saved:
+    mmblg_best.write('\t' + str(len(list_for_alt)) + '\n')
+for ele in list_for_alt:
+    mmblg_best.write(ele+'\t'+str(len(list_for_alt)) + '\n')
+
+cluster_to_save = []
+
+
 cfd_best.close()
 mmblg_best.close()
diff --git a/PostProcess/pool_post_analisi_indel.py b/PostProcess/pool_post_analisi_indel.py
index d5234c5..320327c 100644
--- a/PostProcess/pool_post_analisi_indel.py
+++ b/PostProcess/pool_post_analisi_indel.py
@@ -4,21 +4,21 @@
 import sys
 from multiprocessing import Pool
 
-output_folder=sys.argv[1]
-ref_folder=sys.argv[2]
-vcf_folder=sys.argv[3]
-guide_file=sys.argv[4]
-mm=sys.argv[5]
-bDNA=sys.argv[6]
-bRNA=sys.argv[7]
-annotation_file=sys.argv[8]
-pam_file=sys.argv[9]
+output_folder = sys.argv[1]
+ref_folder = sys.argv[2]
+vcf_folder = sys.argv[3]
+guide_file = sys.argv[4]
+mm = sys.argv[5]
+bDNA = sys.argv[6]
+bRNA = sys.argv[7]
+annotation_file = sys.argv[8]
+pam_file = sys.argv[9]
 # sampleID=sys.argv[10]
-dict_folder=sys.argv[10]
-final_res=sys.argv[11]
-final_res_alt=sys.argv[12]
-ncpus=int(sys.argv[13])
-
+dict_folder = sys.argv[10]
+final_res = sys.argv[11]
+final_res_alt = sys.argv[12]
+# ncpus=int(sys.argv[13])
+ncpus = 4
 
 def start_analysis(f):
@@ -26,7 +26,8 @@ def start_analysis(f):
     for elem in splitted:
         if "chr" in elem:
             chrom = elem
-    os.system(f"./post_analisi_indel.sh \"{output_folder}\" \"{ref_folder}\" \"{vcf_folder}\" \"{guide_file}\" \"{mm}\" \"{bDNA}\" \"{bRNA}\" {annotation_file} {pam_file} {dict_folder} {final_res} {final_res_alt} {chrom}")
+    os.system(
+        f"./post_analisi_indel.sh \"{output_folder}\" \"{ref_folder}\" \"{vcf_folder}\" \"{guide_file}\" \"{mm}\" \"{bDNA}\" \"{bRNA}\" {annotation_file} {pam_file} {dict_folder} {final_res} {final_res_alt} {chrom}")
 
 
 chrs = []
@@ -37,6 +38,5 @@ def start_analysis(f):
 t = 6
 if ncpus < 6:
     t = ncpus
-with Pool(processes = t) as pool:
+with Pool(processes=t) as pool:
     pool.map(start_analysis, chrs)
-
diff --git a/PostProcess/post_process.sh b/PostProcess/post_process.sh
index f80de26..cf20bc9 100644
--- a/PostProcess/post_process.sh
+++ b/PostProcess/post_process.sh
@@ -29,6 +29,6 @@ LC_ALL=C sort -T $dir -k4,4rg $1.found.bed -o $1.found.bed
 echo 'Starting integration with empirical data (this may take a while)'
 "$starting_dir"./resultIntegrator.py $1 $3 $1.found.bed $4 $6/ true $5 $7
 echo 'Removing unnecessary files'
-rm $1.bed $1.found.bed $1.redirectFile.out $1.temp.bed
+rm -f $1.bed $1.found.bed $1.redirectFile.out $1.temp.bed
 sed -i 1i"#Bulge_type\tcrRNA\tDNA\tReference\tChromosome\tPosition\tCluster_Position\tDirection\tMismatches\tBulge_Size\tTotal\tPAM_gen\tVar_uniq\tSamples\tAnnotation_Type\tReal_Guide\trsID\tAF\tSNP\t#Seq_in_cluster\tCFD\tCFD_ref\tHighest_CFD_Risk_Score\tHighest_CFD_Absolute_Risk_Score\tMMBLG_#Bulge_type\tMMBLG_crRNA\tMMBLG_DNA\tMMBLG_Reference\tMMBLG_Chromosome\tMMBLG_Position\tMMBLG_Cluster_Position\tMMBLG_Direction\tMMBLG_Mismatches\tMMBLG_Bulge_Size\tMMBLG_Total\tMMBLG_PAM_gen\tMMBLG_Var_uniq\tMMBLG_Samples\tMMBLG_Annotation_Type\tMMBLG_Real_Guide\tMMBLG_rsID\tMMBLG_AF\tMMBLG_SNP\tMMBLG_#Seq_in_cluster\tMMBLG_CFD\tMMBLG_CFD_ref\tMMBLG_CFD_Risk_Score\tMMBLG_CFD_Absolute_Risk_Score" "$1"
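The process_summaries.py change below replaces per-sample increments of the population counters with per-target set-based deduplication, so each population and superpopulation is counted at most once per target no matter how many of its samples carry it. In isolation the pattern looks like this (the sample-to-population mapping is invented for illustration):

# Count each population once per target, even when several samples from the
# same population share it (illustrative mapping, not the real samplesID file).
sample_to_pop = {"HG0001": "EUR", "HG0002": "EUR", "HG0003": "AFR"}
pop_counts = {"EUR": 0, "AFR": 0}

target_samples = "HG0001,HG0002,HG0003".split(",")
seen_pops = {sample_to_pop[s] for s in target_samples}  # dedup per target
for pop in seen_pops:
    pop_counts[pop] += 1

print(pop_counts)  # {'EUR': 1, 'AFR': 1} (EUR counted once, not twice)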
diff --git a/PostProcess/process_summaries.py b/PostProcess/process_summaries.py
index 1496cae..0431c39 100644
--- a/PostProcess/process_summaries.py
+++ b/PostProcess/process_summaries.py
@@ -41,12 +41,13 @@ def init_summary_by_sample(path_samplesID):
 general_table = {}
 with open(f"{path_guides}", "r") as guide_file:
     for line in guide_file:
-        stripped = line.strip()
-        guides.append(stripped)
+        if line.strip():
+            stripped = line.strip()
+            guides.append(stripped)
 
-        general_table[stripped] = dict()
-        general_table[stripped]['ref'] = np.zeros((bulge+1, mms+1), dtype=int)
-        general_table[stripped]['var'] = np.zeros((bulge+1, mms+1), dtype=int)
+            general_table[stripped] = dict()
+            general_table[stripped]['ref'] = np.zeros((bulge+1, mms+1), dtype=int)
+            general_table[stripped]['var'] = np.zeros((bulge+1, mms+1), dtype=int)
 
 add_to_general_table = {}
 count_superpop = {}
@@ -106,19 +107,22 @@ def init_summary_by_sample(path_samplesID):
             dict_summary_by_guide[bulge_type][3] += 1
 
         if var:
-            samples = splitted[13].split(",")
+            samples = set(splitted[13].split(","))
             seen_superpop = set()
+            seen_pop = set()
             for sample in samples:
-                if sample != "NO_SAMPLES":
+                if sample != "NO_SAMPLES" and sample != '':
                     dict_samples[sample][3] += 1
-                    dict_pop[dict_samples[sample][1]] += 1
-                    dict_superpop[dict_samples[sample][2]] += 1
+                    seen_pop.add(dict_samples[sample][1])
                     seen_superpop.add(dict_samples[sample][2])
                     if pam_creation:
                         dict_samples[sample][6] += 1
             for superpop in seen_superpop:
+                dict_superpop[superpop] += 1
                 count_superpop[guide][superpop]['distributions'][int(
                     splitted[8]) + int(splitted[9])][int(splitted[9])] += 1
+            for pop in seen_pop:
+                dict_pop[pop] += 1
         else:
             add_to_general_table[guide]['distributions'][int(
                 splitted[8]) + int(splitted[9])][int(splitted[9])] += 1
diff --git a/PostProcess/radar_chart.py b/PostProcess/radar_chart.py
index 6fdfe78..e538f51 100644
--- a/PostProcess/radar_chart.py
+++ b/PostProcess/radar_chart.py
@@ -267,6 +267,7 @@ def guideDictCreation():
 
 def fillDict(guide, guideDict, motifDict):
     # fill dictionary with info read from the final file
+    inFinalFile.seek(0)
     if '#' in inFinalFile.readline():
         print('SKIP HEADER')
     else:
@@ -275,6 +276,8 @@ def fillDict(guide, guideDict, motifDict):
     for line in inFinalFile:
         # print(line)
         split = line.strip().split('\t')
+        if guide not in split[15]:
+            continue
         alignedSequence = split[2]  # aligned target with the guide
         mismatch = int(split[8])  # mm extracted from target file
         bulge = int(split[9])  # bul extracted from target file
diff --git a/PostProcess/radar_chart_dict_generator.py b/PostProcess/radar_chart_dict_generator.py
new file mode 100644
index 0000000..cfd2006
--- /dev/null
+++ b/PostProcess/radar_chart_dict_generator.py
@@ -0,0 +1,458 @@
+#!/usr/bin/env python
+
+# Libraries
+from operator import itemgetter
+from os.path import isfile, join
+from os import listdir
+import os
+import warnings
+import glob
+from itertools import islice
+import sys
+import numpy as np
+import scipy.spatial.distance as sp
+from math import pi
+import pandas as pd
+from matplotlib import patches as mpatches
+from matplotlib import pyplot as plt
+import math
+import matplotlib
+import time
+import random
+import multiprocessing
+import json
+
+# matplotlib.use("TkAgg")
+matplotlib.use('Agg')
+
+warnings.filterwarnings("ignore")
+
+plt.style.use('seaborn-poster')
+matplotlib.rcParams['pdf.fonttype'] = 42
+matplotlib.rcParams['ps.fonttype'] = 42
+random.seed(a=None, version=2)
+inGuideFile = open(sys.argv[1],
'r') # guide file used during search +inFinalFile = open(sys.argv[2], 'r') # final result file from search +inSamplesIDFile = open(sys.argv[3], 'r').readlines() # sampleID file +inSamplesIDFile.pop(0) # pop header from sampleID file +# annotation file used during search +inAnnotationsFile = open(sys.argv[4], 'r') +outDir = sys.argv[5] # directory to output the figures +threads = int(sys.argv[6]) # number of concurrent execution of image creation + +web_server = False +population = False +# file_extension = 'pdf' +file_extension = 'png' + +if '-ws' in sys.argv[:]: + web_server = True +if web_server: + file_extension = 'png' + + +# def generatePlot(guide, guideDict, motifDict, mismatch, bulge, source): + +# # check if no targets are found for that combination source/totalcount and skip the execution +# if guideDict['General'] == 0: +# return +# percentage_list = [] +# for elem in guideDict: +# if float(guideDict['General']) != 0: +# percentage_list.append( +# float(str(float(guideDict[elem])/float(guideDict['General']))[0:5])) +# else: +# percentage_list.append(float(0)) + +# guideDataFrame = pd.DataFrame.from_dict(guideDict, orient='index') +# guideDataFrame['Percentage'] = percentage_list +# guideDataFrame.columns = ['Total', 'Percentage'] +# guideDataFrame = guideDataFrame.T +# # number of variable +# categories = list(guideDataFrame)[0:] +# N = len(categories) + +# # We are going to plot the first line of the data frame. +# # But we need to repeat the first value to close the circular graph: +# values = guideDataFrame.loc['Percentage'].values.flatten().tolist() +# values += values[:1] + +# # What will be the angle of each axis in the plot? (we divide the plot / number of variable) +# angles = [n / float(N) * 2 * pi for n in range(N)] +# angles += angles[:1] + +# # Initialise the spider plot +# ax = plt.subplot(2, 2, 1, polar=True) + +# for label, rot in zip(ax.get_xticklabels(), angles): +# if (rot == 0): +# label.set_horizontalalignment("center") +# if (rot > 0): +# label.set_horizontalalignment("left") +# if (rot > 3): +# label.set_horizontalalignment("center") +# if (rot > 4): +# label.set_horizontalalignment("right") + +# # Draw one axe per variable + add labels labels yet +# plt.xticks(angles[:-1], categories, color='black', size=14) + +# # Draw ylabels +# # # # Draw ylabels +# ax.set_rlabel_position(0) +# plt.yticks([0, 0.25, 0.50, 0.75], ["0", "0.25", +# "0.50", "0.75"], color="black", size=12) +# plt.ylim(0, 1) + +# # Fill area +# ax.fill(angles, values, 'b', alpha=0.1) + +# # # # offset posizione y-axis +# ax.set_theta_offset(pi / 2) +# ax.set_theta_direction(-1) +# # Plot data +# ax.plot(angles, values, linewidth=1, linestyle='solid') + +# plt.subplot(2, 2, 2) +# transpose_list = [] +# guideDataFrame = guideDataFrame.T +# for elem in categories: +# transpose_list.append(list(guideDataFrame.loc[elem])) +# # transpose_list.append( +# # [guideDataFrame.loc[elem, 'Total'], guideDataFrame.loc[elem, 'Percentage']]) + +# plt.axis('off') +# table = plt.table(cellText=transpose_list, rowLabels=categories, colLabels=['Total', 'Percentage'], +# loc='best', colWidths=[0.25, 0.25]) +# table.auto_set_font_size(False) +# table.set_fontsize(13) + +# totalMotif = [0]*len(guide) +# for count in range(len(guide)): +# for nuc in motifDict: +# totalMotif[count] += motifDict[nuc][count] + +# maxmax = max(totalMotif) +# for count in range(len(guide)): +# for nuc in motifDict: +# if maxmax != 0: +# motifDict[nuc][count] = float( +# motifDict[nuc][count]/float(maxmax)) +# # motifDict[nuc][count] = 
float( +# # str(float(motifDict[nuc][count])/float(maxmax))[0:5]) + +# # ind = np.arange(0, len(guide), 1) + 0.15 +# ind = np.arange(0, len(guide), 1) +# width = 0.7 # the width of the bars: can also be len(x) sequence + +# motif = plt.subplot(2, 1, 2, frameon=False) + +# A = np.array(motifDict['A'], dtype=float) +# C = np.array(motifDict['C'], dtype=float) +# G = np.array(motifDict['G'], dtype=float) +# T = np.array(motifDict['T'], dtype=float) +# RNA = np.array(motifDict['RNA'], dtype=float) +# DNA = np.array(motifDict['DNA'], dtype=float) + +# p1 = plt.bar(ind, A, width, align='center') +# p2 = plt.bar(ind, C, width, bottom=A, align='center') +# p3 = plt.bar(ind, G, width, bottom=A+C, align='center') +# p4 = plt.bar(ind, T, width, bottom=C+G+A, align='center') +# p5 = plt.bar(ind, RNA, width, bottom=C+G+A+T, align='center') +# p6 = plt.bar(ind, DNA, width, bottom=C+G+A+T+RNA, +# align='center') + +# # plt.xlim(0, len(guide)) +# # strArray = np.array([list(guide)]) +# plt.xticks(ticks=ind, labels=list(guide)) + +# plt.legend((p1[0], p2[0], p3[0], p4[0], p5[0], p6[0]), +# ('A', 'C', 'G', 'T', 'bRNA', 'bDNA'), fontsize=13, loc='upper right', ncol=6) + +# # strArray = np.array([list(guide)]) +# # table = plt.table(cellText=strArray, loc='bottom', +# # cellLoc='center', rowLoc='bottom') +# # table.auto_set_font_size(False) +# # table.set_fontsize(12) +# # # table.scale(1, 1.6) +# # table.xticks = ([]) +# # table.yticks = ([]) +# # plt.xticks + +# plt.suptitle(str(mismatch)+" Mismatches+ "+str(bulge)+" Bulge "+str(source), +# horizontalalignment='center', color='black', size=25) + +# plt.tight_layout() +# plt.subplots_adjust(top=0.85, bottom=0.05, left=0.05, +# right=0.95, wspace=0.1) +# plt.savefig(outDir+"/summary_single_guide_" + str(guide) + "_" + str(mismatch) + +# "."+str(bulge) + '_' + str(source) + "." 
+ file_extension, format=file_extension) + +# plt.close('all') + + +def motifDictCreation(guide): + # create time stamp for motif dict + motifDict = dict() + for mismatch in range(0, 11): + motifDict[mismatch] = dict() + for bulge in range(0, 6): + motifDict[mismatch][bulge] = dict() + motifDict[mismatch][bulge]['TOTAL'] = dict() + motifDict[mismatch][bulge]['TOTAL']['A'] = [0]*len(guide) + motifDict[mismatch][bulge]['TOTAL']['C'] = [0]*len(guide) + motifDict[mismatch][bulge]['TOTAL']['G'] = [0]*len(guide) + motifDict[mismatch][bulge]['TOTAL']['T'] = [0]*len(guide) + motifDict[mismatch][bulge]['TOTAL']['RNA'] = [0]*len(guide) + motifDict[mismatch][bulge]['TOTAL']['DNA'] = [0]*len(guide) + for sample in populationDict: + superpop = populationDict[sample][0] + pop = populationDict[sample][1] + motifDict[mismatch][bulge][sample] = dict() + motifDict[mismatch][bulge][sample]['A'] = [0]*len(guide) + motifDict[mismatch][bulge][sample]['C'] = [0]*len(guide) + motifDict[mismatch][bulge][sample]['G'] = [0]*len(guide) + motifDict[mismatch][bulge][sample]['T'] = [0]*len(guide) + motifDict[mismatch][bulge][sample]['RNA'] = [0]*len(guide) + motifDict[mismatch][bulge][sample]['DNA'] = [0]*len(guide) + if superpop not in motifDict[mismatch][bulge]: + motifDict[mismatch][bulge][superpop] = dict() + motifDict[mismatch][bulge][superpop]['A'] = [0]*len(guide) + motifDict[mismatch][bulge][superpop]['C'] = [0]*len(guide) + motifDict[mismatch][bulge][superpop]['G'] = [0]*len(guide) + motifDict[mismatch][bulge][superpop]['T'] = [0]*len(guide) + motifDict[mismatch][bulge][superpop]['RNA'] = [ + 0]*len(guide) + motifDict[mismatch][bulge][superpop]['DNA'] = [ + 0]*len(guide) + if pop not in motifDict[mismatch][bulge]: + motifDict[mismatch][bulge][pop] = dict() + motifDict[mismatch][bulge][pop]['A'] = [0]*len(guide) + motifDict[mismatch][bulge][pop]['C'] = [0]*len(guide) + motifDict[mismatch][bulge][pop]['G'] = [0]*len(guide) + motifDict[mismatch][bulge][pop]['T'] = [0]*len(guide) + motifDict[mismatch][bulge][pop]['RNA'] = [0]*len(guide) + motifDict[mismatch][bulge][pop]['DNA'] = [0]*len(guide) + return motifDict + + +def guideDictCreation(): + # create one stamp for guideDict + guideDict = dict() + for mismatch in range(0, 11): + guideDict[mismatch] = dict() + for bulge in range(0, 6): + guideDict[mismatch][bulge] = dict() + guideDict[mismatch][bulge]['TOTAL'] = dict() + guideDict[mismatch][bulge]['TOTAL']['General'] = 0 + for annot in annotationsSet: + guideDict[mismatch][bulge]['TOTAL'][annot] = 0 + for sample in populationDict: + superpop = populationDict[sample][0] + pop = populationDict[sample][1] + guideDict[mismatch][bulge][sample] = dict() + guideDict[mismatch][bulge][sample]['General'] = 0 + if superpop not in guideDict[mismatch][bulge]: + guideDict[mismatch][bulge][superpop] = dict() + guideDict[mismatch][bulge][superpop]['General'] = 0 + if pop not in guideDict[mismatch][bulge]: + guideDict[mismatch][bulge][pop] = dict() + guideDict[mismatch][bulge][pop]['General'] = 0 + for annot in annotationsSet: + guideDict[mismatch][bulge][superpop][annot] = 0 + guideDict[mismatch][bulge][pop][annot] = 0 + guideDict[mismatch][bulge][sample][annot] = 0 + return guideDict + + +def fillDict(guide, guideDict, motifDict): + # fill dictionary with info read from the final file + inFinalFile.seek(0) + if '#' in inFinalFile.readline(): + print('SKIP HEADER') + else: + inFinalFile.seek(0) + + for line in inFinalFile: + # print(line) + split = line.strip().split('\t') + if guide not in split[15]: + continue + alignedSequence 
= split[2] # aligned target with the guide + mismatch = int(split[8]) # mm extracted from target file + bulge = int(split[9]) # bul extracted from target file + + # add target to TOTAL dict + # print(mismatch, bulge, guideDict[mismatch][bulge]['TOTAL']['General']) + guideDict[mismatch][bulge]['TOTAL']['General'] += 1 + # find annotations and add data to the TOTAL dict + if split[14] != 'n': + annotationsList = split[14].strip().split(',') + for annotation in annotationsList: + guideDict[mismatch][bulge]['TOTAL'][annotation] += 1 + # find motif in X and RNA/DNA targets + if 'DNA' not in split[0]: + for count, nucleotide in enumerate(alignedSequence): + if nucleotide.islower(): + motifDict[mismatch][bulge]['TOTAL'][nucleotide.upper() + ][count] += 1 + elif nucleotide == '-': + motifDict[mismatch][bulge]['TOTAL'][split[0]][count] += 1 + if guide[count] == 'N': + motifDict[mismatch][bulge]['TOTAL'][nucleotide.upper() + ][count] += 1 + else: + alignedGuide = split[1] + for count, nucleotide in enumerate(alignedGuide[bulge:]): + if nucleotide == '-': + motifDict[mismatch][bulge]['TOTAL'][split[0]][count] += 1 + for count, nucleotide in enumerate(alignedSequence[bulge:]): + if nucleotide.islower(): + motifDict[mismatch][bulge]['TOTAL'][nucleotide.upper() + ][count] += 1 + if guide[count] == 'N': + motifDict[mismatch][bulge]['TOTAL'][nucleotide.upper() + ][count] += 1 + # check if samples are available and take them if yes + samplePresent = False + if 'n' not in split[13] and 'NO_SAMPLES' not in split[13]: + samplePresent = split[13].strip().split(',') + if samplePresent != False: # if samples are present analyze them + alreadySampled = set() + for sample in samplePresent: + superpop = populationDict[sample][0] + pop = populationDict[sample][1] + if superpop not in alreadySampled: # check if superpop is already been updated for the current target to avoid duplicate count + alreadySampled.add(superpop) + guideDict[mismatch][bulge][superpop]['General'] += 1 + # find motif in X and RNA/DNA targets + if 'DNA' not in split[0]: + for count, nucleotide in enumerate(alignedSequence): + if nucleotide.islower(): + motifDict[mismatch][bulge][superpop][nucleotide.upper() + ][count] += 1 + elif nucleotide == '-': + motifDict[mismatch][bulge][superpop][split[0] + ][count] += 1 + if guide[count] == 'N': + motifDict[mismatch][bulge][superpop][nucleotide.upper() + ][count] += 1 + else: + alignedGuide = split[1] + for count, nucleotide in enumerate(alignedGuide[bulge:]): + if nucleotide == '-': + motifDict[mismatch][bulge][superpop][split[0] + ][count] += 1 + for count, nucleotide in enumerate(alignedSequence[bulge:]): + if nucleotide.islower(): + motifDict[mismatch][bulge][superpop][nucleotide.upper() + ][count] += 1 + if guide[count] == 'N': + motifDict[mismatch][bulge][superpop][nucleotide.upper() + ][count] += 1 + if split[14] != 'n': + annotationsList = split[14].strip().split(',') + for annotation in annotationsList: + guideDict[mismatch][bulge][superpop][annotation] += 1 + if pop not in alreadySampled: # check if pop is already been updated for the current target to avoid duplicate count + alreadySampled.add(pop) + guideDict[mismatch][bulge][pop]['General'] += 1 + # find motif in X and RNA/DNA targets + if 'DNA' not in split[0]: + for count, nucleotide in enumerate(alignedSequence): + if nucleotide.islower(): + motifDict[mismatch][bulge][pop][nucleotide.upper() + ][count] += 1 + elif nucleotide == '-': + motifDict[mismatch][bulge][pop][split[0] + ][count] += 1 + if guide[count] == 'N': + 
motifDict[mismatch][bulge][pop][nucleotide.upper() + ][count] += 1 + else: + alignedGuide = split[1] + for count, nucleotide in enumerate(alignedGuide[bulge:]): + if nucleotide == '-': + motifDict[mismatch][bulge][pop][split[0] + ][count] += 1 + for count, nucleotide in enumerate(alignedSequence[bulge:]): + if nucleotide.islower(): + motifDict[mismatch][bulge][pop][nucleotide.upper() + ][count] += 1 + if guide[count] == 'N': + motifDict[mismatch][bulge][pop][nucleotide.upper() + ][count] += 1 + if split[14] != 'n': + annotationsList = split[14].strip().split(',') + for annotation in annotationsList: + guideDict[mismatch][bulge][pop][annotation] += 1 + # add target to single sample, in this case duplicate are allowed + guideDict[mismatch][bulge][sample]['General'] += 1 + # find motif in X and RNA/DNA targets + if 'DNA' not in split[0]: + for count, nucleotide in enumerate(alignedSequence): + if nucleotide.islower(): + motifDict[mismatch][bulge][sample][nucleotide.upper() + ][count] += 1 + elif nucleotide == '-': + motifDict[mismatch][bulge][sample][split[0] + ][count] += 1 + if guide[count] == 'N': + motifDict[mismatch][bulge][sample][nucleotide.upper() + ][count] += 1 + else: + alignedGuide = split[1] + for count, nucleotide in enumerate(alignedGuide[bulge:]): + if nucleotide == '-': + motifDict[mismatch][bulge][sample][split[0] + ][count] += 1 + for count, nucleotide in enumerate(alignedSequence[bulge:]): + if nucleotide.islower(): + motifDict[mismatch][bulge][sample][nucleotide.upper() + ][count] += 1 + if guide[count] == 'N': + motifDict[mismatch][bulge][sample][nucleotide.upper() + ][count] += 1 + if split[14] != 'n': + annotationsList = split[14].strip().split(',') + for annotation in annotationsList: + guideDict[mismatch][bulge][sample][annotation] += 1 + + +# data containing populations and annotations +annotationsSet = set() +populationDict = dict() + +# read all the annotations +for line in inAnnotationsFile: + for elem in line.strip().split('\t')[3].strip().split(','): + annotationsSet.add(elem) +annotationsSet = sorted(annotationsSet) + +# read all the population, superpop and samples +for line in inSamplesIDFile: + split = line.strip().split('\t') + populationDict[split[0]] = [split[2], split[1]] + +for guide in inGuideFile: + # guide = guide.strip().replace('N', '') + guide = guide.strip() + guideDict = guideDictCreation() + motifDict = motifDictCreation(guide) + fillDict(guide, guideDict, motifDict) + json.dump(guideDict, open(outDir+f"/guide_dict_{guide}.json", 'w')) + json.dump(motifDict, open(outDir+f"/motif_dict_{guide}.json", 'w')) + # print('Starting image creation for guide: ', guide) + # if __name__ == '__main__': + # pool = multiprocessing.Pool(threads) + # for mismatch in range(0, 8): + # for bulge in range(0, 3): + # # skip creation if no target in global category + # if guideDict[mismatch][bulge]['TOTAL']['General'] == 0: + # continue + # for elem in guideDict[mismatch][bulge]: + # pool.apply_async(generatePlot, args=( + # guide, guideDict[mismatch][bulge][elem], motifDict[mismatch][bulge][elem], mismatch, bulge, elem)) + # pool.close() + # pool.join() diff --git a/PostProcess/remove_contiguous_samples_cfd.py b/PostProcess/remove_contiguous_samples_cfd.py index 0fcf7ce..68879a4 100644 --- a/PostProcess/remove_contiguous_samples_cfd.py +++ b/PostProcess/remove_contiguous_samples_cfd.py @@ -56,10 +56,12 @@ def get_best_targets(cluster, fileOut, fileOut_disc, cfd, snp_info): if ele[snp_info] == 'n': list_ref.append(ele) else: - if ele[snp_info] in dict_var.keys(): - 
dict_var[ele[snp_info]].append(ele)
+                if (ele[pos], ele[snp_info]) in dict_var.keys():  # merge samples of identical targets (coming from different VCF datasets)
+                    dict_var[(ele[pos], ele[snp_info])][0][true_guide - 2] = dict_var[(ele[pos], ele[snp_info])][0][true_guide - 2] + "," + ele[true_guide - 2]  # true_guide - 2 points to samples column
+                    dict_var[(ele[pos], ele[snp_info])][0][snp_info - 2] = dict_var[(ele[pos], ele[snp_info])][0][snp_info - 2] + "," + ele[snp_info - 2]  # snp_info - 2 points to rsID column
+                    dict_var[(ele[pos], ele[snp_info])][0][snp_info - 1] = dict_var[(ele[pos], ele[snp_info])][0][snp_info - 1] + "," + ele[snp_info - 1]  # snp_info - 1 points to AF column
             else:
-                dict_var[ele[snp_info]] = [ele]
+                dict_var[(ele[pos], ele[snp_info])] = [ele]
 
     list_ref.sort(key = lambda x : int(x[total]))
     if len(list_ref) > 1:
diff --git a/PostProcess/resultIntegrator.py b/PostProcess/resultIntegrator.py
index d167117..65a861d 100644
--- a/PostProcess/resultIntegrator.py
+++ b/PostProcess/resultIntegrator.py
@@ -1,12 +1,10 @@
 #!/usr/bin/env python
 
-from intervaltree import Interval, IntervalTree
+from intervaltree import IntervalTree
 import sys
 import time
-import concurrent.futures
 import glob
 import subprocess
-import pandas as pd
 
 
 def rev_comp(a):
@@ -95,7 +93,7 @@ def createBedforMultiAlternative(variantList, samples):
 # annotation data open
 inAnnotationFile = open(annotationFile, 'r')
 # guide file
-realguide = open(guideFile, 'r').readlines()
+# realguide = open(guideFile, 'r').readlines()
 # open outputDir to write results
 originFileName = crispritzResultFile.split('/')
 originFileName = originFileName[len(originFileName)-1]
@@ -118,16 +116,24 @@ def createBedforMultiAlternative(variantList, samples):
 genomeDict = {}
 empiricalDict = {}
 valueDict = {}
+# saveDict = {"real_guide": 'n', "genome": 'n', "chr": 'n', "prim_pos": 'n', "strand": 'n', "highest_CFD_guide_alignment": 'n', "highest_CFD_alignment(ref)": 'n',
+#             "highest_CFD_alignment(alt)": 'n', "ref_seq_length": 'n', "ref_pos_alt(aligned_strand)": 'n', "pam": 'n', "annotation": 'n', "highest_CFD_score": 'n',
+#             "highest_CFD_score(ref)": 'n', "highest_CFD_score(alt)": 'n', "risk_score": 'n', "absolute_risk_score": 'n', "highest_CFD_mismatch": 'n',
+#             "highest_CFD_bulge": 'n', "highest_CFD_mismatch+bulge": 'n', "fewest_mm+bulge_guide_alignment": 'n', "fewest_mm+bulge_alignment(ref)": 'n',
+#             "fewest_mm+bulge_alignment(alt)": 'n', "fewest_mm+bulge_CFD_score(ref)": 'n', "fewest_mm+bulge_CFD_score(alt)": 'n', "fewest_mismatch": 'n',
+#             "fewest_bulge": 'n', "fewest_mismatch+bulge": 'n', "alt_haplotypes": 'n', "prim_origin": 'n', "prim_AF": 'n', "prim_samples": 'n',
+#             "prim_SNP_ID(positive_strand)": 'n', "gene_name": 'n', "gene_ID": 'n', "gene_annotation": 'n', "gene_distance(kb)": 'n', "lowest_empirical": 'n',
+#             "Nature2019": 'n', "Nature2019_mm+bul": 'n', "CHANGEseq": 'n', "CHANGEseq_mm+bul": 'n', "CIRCLEseq": 'n', "CIRCLEseq_mm+bul": 'n', "ONEseq": 'n',
+#             "ONEseq_mm+bul": 'n', "GUIDEseq_293": 'n', "GUIDEseq_293_mm+bul": 'n', "GUIDEseq_CD34": 'n', "GUIDEseq_CD34_mm+bul": 'n', "GUIDEseq": 'n',
+#             "GUIDEseq_mm+bul": 'n'}
+
 saveDict = {"real_guide": 'n', "genome": 'n', "chr": 'n', "prim_pos": 'n', "strand": 'n', "highest_CFD_guide_alignment": 'n', "highest_CFD_alignment(ref)": 'n',
             "highest_CFD_alignment(alt)": 'n', "ref_seq_length": 'n', "ref_pos_alt(aligned_strand)": 'n', "pam": 'n', "annotation": 'n', "highest_CFD_score": 'n',
             "highest_CFD_score(ref)": 'n', "highest_CFD_score(alt)": 'n', "risk_score": 'n', "absolute_risk_score": 'n', "highest_CFD_mismatch": 'n',
             "highest_CFD_bulge": 'n', "highest_CFD_mismatch+bulge": 'n', "fewest_mm+bulge_guide_alignment": 'n', "fewest_mm+bulge_alignment(ref)": 'n',
             "fewest_mm+bulge_alignment(alt)": 'n', "fewest_mm+bulge_CFD_score(ref)": 'n', "fewest_mm+bulge_CFD_score(alt)": 'n', "fewest_mismatch": 'n',
             "fewest_bulge": 'n', "fewest_mismatch+bulge": 'n', "alt_haplotypes": 'n', "prim_origin": 'n', "prim_AF": 'n', "prim_samples": 'n',
-            "prim_SNP_ID(positive_strand)": 'n', "gene_name": 'n', "gene_ID": 'n', "gene_annotation": 'n', "gene_distance(kb)": 'n', "lowest_empirical": 'n',
-            "Nature2019": 'n', "Nature2019_mm+bul": 'n', "CHANGEseq": 'n', "CHANGEseq_mm+bul": 'n', "CIRCLEseq": 'n', "CIRCLEseq_mm+bul": 'n', "ONEseq": 'n',
-            "ONEseq_mm+bul": 'n', "GUIDEseq_293": 'n', "GUIDEseq_293_mm+bul": 'n', "GUIDEseq_CD34": 'n', "GUIDEseq_CD34_mm+bul": 'n', "GUIDEseq": 'n',
-            "GUIDEseq_mm+bul": 'n'}
+            "prim_SNP_ID(positive_strand)": 'n', "gene_name": 'n', "gene_ID": 'n', "gene_annotation": 'n', "gene_distance(kb)": 'n', "lowest_empirical": 'n'}
 
 start_time = time.time()
@@ -139,13 +145,20 @@ def createBedforMultiAlternative(variantList, samples):
         empList.append(count)
     # adding empirical data to the tree
     empiricalTree[int(empList[2]):int(empList[3])] = empList
+    # generate temp empirical dict to save empirical information per row
     empiricalDict[str(empList[5])] = 50
+    # value of the empirical row, keeping info about mm+bul for each empirical origin (seq data, in silico, in vitro, etc.)
     valueDict[str(empList[5])] = 'n'
+    # update save dict with user-defined names from empirical data
+    saveDict[str(empList[5])] = 'n'
+    newkey = str(empList[5])+'_mm+bul'
+    saveDict[newkey] = 'n'
 
 # writing header in file
 save = '#'
-for key in saveDict:
-    save += str(key)+'\t'
+save += '\t'.join(list(saveDict.keys()))
+# for key in saveDict:
+#     save += str(key)+'\t'
 save += '\n'
 outFile.write(save)
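With the hard-coded empirical columns removed, resultIntegrator.py now grows saveDict with one value column and one mm+bul column per dataset name found in the empirical input, then joins the keys into the output header. A minimal sketch of that idiom; the fixed keys are abbreviated and the dataset names are illustrative:

# Sketch of the dynamic header construction: fixed columns first, then one
# pair of columns per empirical dataset name discovered in the input.
saveDict = {"real_guide": 'n', "chr": 'n', "lowest_empirical": 'n'}

for dataset in ["GUIDEseq", "CIRCLEseq"]:  # illustrative dataset names
    saveDict[dataset] = 'n'
    saveDict[dataset + '_mm+bul'] = 'n'

header = '#' + '\t'.join(saveDict.keys()) + '\n'
print(header)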
"absolute_risk_score": 'n', "highest_CFD_mismatch": 'n', "highest_CFD_bulge": 'n', "highest_CFD_mismatch+bulge": 'n', "fewest_mm+bulge_guide_alignment": 'n', "fewest_mm+bulge_alignment(ref)": 'n', "fewest_mm+bulge_alignment(alt)": 'n', "fewest_mm+bulge_CFD_score(ref)": 'n', "fewest_mm+bulge_CFD_score(alt)": 'n', "fewest_mismatch": 'n', "fewest_bulge": 'n', "fewest_mismatch+bulge": 'n', "alt_haplotypes": 'n', "prim_origin": 'n', "prim_AF": 'n', "prim_samples": 'n', - "prim_SNP_ID(positive_strand)": 'n', "gene_name": 'n', "gene_ID": 'n', "gene_annotation": 'n', "gene_distance(kb)": 'n', "lowest_empirical": 'n', - "Nature2019": 'n', "Nature2019_mm+bul": 'n', "CHANGEseq": 'n', "CHANGEseq_mm+bul": 'n', "CIRCLEseq": 'n', "CIRCLEseq_mm+bul": 'n', "ONEseq": 'n', - "ONEseq_mm+bul": 'n', "GUIDEseq_293": 'n', "GUIDEseq_293_mm+bul": 'n', "GUIDEseq_CD34": 'n', "GUIDEseq_CD34_mm+bul": 'n', "GUIDEseq": 'n', - "GUIDEseq_mm+bul": 'n'} + "prim_SNP_ID(positive_strand)": 'n', "gene_name": 'n', "gene_ID": 'n', "gene_annotation": 'n', "gene_distance(kb)": 'n', "lowest_empirical": 'n'} start_time = time.time() @@ -139,13 +145,20 @@ def createBedforMultiAlternative(variantList, samples): empList.append(count) # adding empirical data to the tree empiricalTree[int(empList[2]):int(empList[3])] = empList + # generate temp empirical dict to save empirical information per row empiricalDict[str(empList[5])] = 50 + # value of empirical row, keeping info about mm+bul for each empirical origin (seq data, in-silico, vitro, ecc) valueDict[str(empList[5])] = 'n' + # update save dict with user-defined names from empirical data + saveDict[str(empList[5])] = 'n' + newkey = str(empList[5])+'_mm+bul' + saveDict[newkey] = 'n' # writing header in file save = '#' -for key in saveDict: - save += str(key)+'\t' +save += '\t'.join(list(saveDict.keys())) +# for key in saveDict: +# save += str(key)+'\t' save += '\n' outFile.write(save) diff --git a/PostProcess/scriptAnalisiNNN_v3.sh b/PostProcess/scriptAnalisiNNN_v3.sh index 282927c..fb79b03 100644 --- a/PostProcess/scriptAnalisiNNN_v3.sh +++ b/PostProcess/scriptAnalisiNNN_v3.sh @@ -38,7 +38,7 @@ output_folder=${12} echo $jobid # 1) Rimozione duplicati, estrazione semicommon e unique e creazione file total -echo 'Creazione file .total.txt' +#echo 'Creazione file .total.txt' ./extraction.sh $REFtargets $ENRtargets $jobid # OUTPUT $jobid.common_targets.txt -> Non usato # $jobid.semi_common_targets.txt # $jobid.unique_targets.txt @@ -55,7 +55,7 @@ cat $jobid.unique_targets.pamcreation.txt $jobid.semi_common_targets.minmaxdisr. 
rm $jobid.unique_targets.pamcreation.txt rm $jobid.semi_common_targets.minmaxdisr.txt -echo 'Clustering the .total.txt file' +#echo 'Clustering the .total.txt file' # 3) Clustering ./cluster.dict.py $jobid.total.txt 'no' 'True' 'True' "$guide_file" 'total' 'orderChr' # OUTPUT $jobid.total.cluster.txt rm $jobid.total.txt @@ -83,7 +83,7 @@ rm $jobid.total.txt # The .bestCFD.txt files can be concatenated (note that the merged file will contain 3 headers) to obtain the complete file # The .CFDGraph.txt files must be summed together to obtain the final file -> CURRENTLY NOT NEEDED FOR THIS ANALYSIS (KEEP THE FILES ANYWAY) -echo 'Extracting samples from the .total.cluster.txt file' +#echo 'Extracting samples from the .total.cluster.txt file' # ./simpleAnalysis_v3.py "$annotationfile" "$jobid.total.cluster.txt" "$jobid" "$dictionaries" "$pam_file" $mismatch "$referencegenome" "$guide_file" $bulgesDNA $bulgesRNA ./new_simple_analysis.py "$referencegenome" "$dictionaries" "$jobid.total.cluster.txt" "${pam_file}" "$jobid" "$mismatch" diff --git a/PostProcess/submit_job_automated_new_multiple_vcfs.sh b/PostProcess/submit_job_automated_new_multiple_vcfs.sh index 32a224d..a8090e4 100644 --- a/PostProcess/submit_job_automated_new_multiple_vcfs.sh +++ b/PostProcess/submit_job_automated_new_multiple_vcfs.sh @@ -36,15 +36,15 @@ touch $log output=$output_folder/output.txt touch $output -rm $output_folder/queue.txt +rm -f $output_folder/queue.txt #for vcf_f in "${vcf_list[@]}"; if [ $2 == "_" ]; then echo -e "_" >> $output_folder/tmp_list_vcf.txt vcf_list=$output_folder/tmp_list_vcf.txt fi -echo "" >> $vcf_list +echo >> $vcf_list if [ $6 != "_" ]; then - echo "" >> $6 + echo >> $6 fi while read vcf_f; @@ -52,7 +52,7 @@ do if [ -z "$vcf_f" ]; then continue fi - vcf_folder=$(realpath ${vcf_f}) + vcf_folder="${current_working_directory}/VCFs/${vcf_f}" ref_name=$(basename $1) #folder_of_folders=$(dirname $1) vcf_name=$(basename $vcf_f) @@ -64,6 +64,7 @@ do echo -e 'Job\tStart\t'$(date) > $log + echo -e 'Job\tStart\t'$(date) >&2 unset real_chroms declare -a real_chroms @@ -133,6 +134,7 @@ do cd "$current_working_directory/Genomes" if ! [ -d "$current_working_directory/Genomes/${ref_name}+${vcf_name}" ]; then echo -e 'Add-variants\tStart\t'$(date) >> $log + echo -e 'Add-variants\tStart\t'$(date) >&2 echo -e "Adding variants" crispritz.py add-variants "$vcf_folder/" "$ref_folder/" "true" #if ! [ -d "${ref_name}+${vcf_name}" ]; then @@ -152,9 +154,12 @@ do mkdir "genome_library/${true_pam}_2_${ref_name}+${vcf_name}_INDELS" fi echo -e 'Add-variants\tEnd\t'$(date) >> $log + echo -e 'Add-variants\tEnd\t'$(date) >&2 echo -e 'Indexing Indels\tStart\t'$(date) >> $log + echo -e 'Indexing Indels\tStart\t'$(date) >&2 ${starting_dir}/./pool_index_indels.py "$current_working_directory/Genomes/variants_genome/" "$pam_file" $true_pam $ref_name $vcf_name $ncpus echo -e 'Indexing Indels\tEnd\t'$(date) >> $log + echo -e 'Indexing Indels\tEnd\t'$(date) >&2 if ! [ -d $current_working_directory/Genomes/${ref_name}+${vcf_name}_INDELS ]; then mkdir $current_working_directory/Genomes/${ref_name}+${vcf_name}_INDELS fi @@ -172,8 +177,10 @@ do if [ "$vcf_name" != "_" ]; then if ! 
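From this point on, the submit script mirrors every `>> $log` timing line with an identical `echo ... >&2`, so the stage markers land both in the job log and on stderr (stdout is redirected to log_verbose.txt by crisprme.py, as shown later in this patch). A hypothetical Python equivalent of the dual-sink pattern:

```python
import sys
from datetime import datetime

def log_stage(log_path, stage, state):
    # write the timing line to the job log AND to stderr, mimicking the
    # paired `>> $log` / `>&2` echoes added throughout the script
    line = f"{stage}\t{state}\t{datetime.now()}"
    with open(log_path, "a") as log:
        print(line, file=log)
    print(line, file=sys.stderr)

log_stage("log.txt", "Search Reference", "Start")
```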
[ -d "$current_working_directory/genome_library/${true_pam}_2_${ref_name}+${vcf_name}_INDELS" ]; then echo -e 'Indexing Indels\tStart\t'$(date) >> $log + echo -e 'Indexing Indels\tStart\t'$(date) >&2 ${starting_dir}/./pool_index_indels.py "$current_working_directory/Genomes/${ref_name}+${vcf_name}_INDELS/" "$pam_file" $true_pam $ref_name $vcf_name $ncpus echo -e 'Indexing Indels\tEnd\t'$(date) >> $log + echo -e 'Indexing Indels\tEnd\t'$(date) >&2 fi fi @@ -187,11 +194,13 @@ do if ! [ $bMax -gt 1 ]; then if ! [ -d "$current_working_directory/genome_library/${true_pam}_1_${ref_name}" ]; then echo -e 'Index-genome Reference\tStart\t'$(date) >> $log + echo -e 'Index-genome Reference\tStart\t'$(date) >&2 echo -e 'Indexing_Reference' > $output echo -e "Indexing reference genome" crispritz.py index-genome "$ref_name" "$ref_folder/" "$pam_file" -bMax $bMax -th $ncpus pid_index_ref=$! echo -e 'Index-genome Reference\tEnd\t'$(date) >> $log + echo -e 'Index-genome Reference\tEnd\t'$(date) >&2 idx_ref="$current_working_directory/genome_library/${true_pam}_${bMax}_${ref_name}" else echo -e "Reference Index already present" @@ -199,21 +208,25 @@ do fi else echo -e 'Index-genome Reference\tStart\t'$(date) >> $log + echo -e 'Index-genome Reference\tStart\t'$(date) >&2 echo -e 'Indexing_Reference' > $output echo -e "Indexing reference genome" crispritz.py index-genome "$ref_name" "$ref_folder/" "$pam_file" -bMax $bMax -th $ncpus pid_index_ref=$! echo -e 'Index-genome Reference\tEnd\t'$(date) >> $log + echo -e 'Index-genome Reference\tEnd\t'$(date) >&2 idx_ref="$current_working_directory/genome_library/${true_pam}_${bMax}_${ref_name}" fi else echo -e "Reference Index already present" echo -e 'Index-genome Reference\tEnd\t'$(date) >> $log + echo -e 'Index-genome Reference\tEnd\t'$(date) >&2 idx_ref="$current_working_directory/genome_library/${true_pam}_2_${ref_name}" fi else echo -e "Reference Index already present" echo -e 'Index-genome Reference\tEnd\t'$(date) >> $log + echo -e 'Index-genome Reference\tEnd\t'$(date) >&2 idx_ref="$current_working_directory/genome_library/${true_pam}_${bMax}_${ref_name}" fi @@ -224,11 +237,13 @@ do if ! [ $bMax -gt 1 ]; then if ! [ -d "$current_working_directory/genome_library/${true_pam}_1_${ref_name}+${vcf_name}" ]; then echo -e 'Index-genome Variant\tStart\t'$(date) >> $log + echo -e 'Index-genome Variant\tStart\t'$(date) >&2 echo -e 'Indexing_Enriched' > $output echo -e "Indexing variant genome" crispritz.py index-genome "${ref_name}+${vcf_name}" "$current_working_directory/Genomes/${ref_name}+${vcf_name}/" "$pam_file" -bMax $bMax -th $ncpus #${ref_folder%/}+${vcf_name}/ pid_index_var=$! echo -e 'Index-genome Variant\tEnd\t'$(date) >> $log + echo -e 'Index-genome Variant\tEnd\t'$(date) >&2 idx_var="$current_working_directory/genome_library/${true_pam}_${bMax}_${ref_name}+${vcf_name}" else echo -e "Variant Index already present" @@ -236,21 +251,25 @@ do fi else echo -e 'Index-genome Variant\tStart\t'$(date) >> $log + echo -e 'Index-genome Variant\tStart\t'$(date) >&2 echo -e 'Indexing_Enriched' > $output echo -e "Indexing variant genome" crispritz.py index-genome "${ref_name}+${vcf_name}" "$current_working_directory/Genomes/${ref_name}+${vcf_name}/" "$pam_file" -bMax $bMax -th $ncpus pid_index_ref=$! 
echo -e 'Index-genome Variant\tEnd\t'$(date) >> $log + echo -e 'Index-genome Variant\tEnd\t'$(date) >&2 idx_var="$current_working_directory/genome_library/${true_pam}_${bMax}_${ref_name}+${vcf_name}" fi else echo -e "Variant Index already present" echo -e 'Index-genome Variant\tEnd\t'$(date) >> $log + echo -e 'Index-genome Variant\tEnd\t'$(date) >&2 idx_var="$current_working_directory/genome_library/${true_pam}_2_${ref_name}+${vcf_name}" fi else echo -e "Variant Index already present" echo -e 'Index-genome Variant\tEnd\t'$(date) >> $log + echo -e 'Index-genome Variant\tEnd\t'$(date) >&2 idx_var="$current_working_directory/genome_library/${true_pam}_${bMax}_${ref_name}+${vcf_name}" fi fi @@ -259,6 +278,7 @@ do echo $idx_ref if ! [ -f "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" ]; then echo -e 'Search Reference\tStart\t'$(date) >> $log + echo -e 'Search Reference\tStart\t'$(date) >&2 echo -e 'Search Reference' > $output crispritz.py search $idx_ref "$pam_file" "$guide_file" "${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}" -index -mm $mm -bDNA $bDNA -bRNA $bRNA -t -th $(expr $ncpus / 4) & pid_search_ref=$! @@ -269,10 +289,12 @@ do if [ "$vcf_name" != "_" ]; then if ! [ -f "$output_folder/crispritz_targets/${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" ]; then echo -e 'Search Variant\tStart\t'$(date) >> $log + echo -e 'Search Variant\tStart\t'$(date) >&2 echo -e 'Search Variant' > $output crispritz.py search "$idx_var" "$pam_file" "$guide_file" "${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}" -index -mm $mm -bDNA $bDNA -bRNA $bRNA -t -th $(expr $ncpus / 4) -var mv "${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" "$output_folder/crispritz_targets" echo -e 'Search Variant\tEnd\t'$(date) >> $log + echo -e 'Search Variant\tEnd\t'$(date) >&2 else echo -e "Search for variant already done" fi @@ -280,11 +302,13 @@ do if ! [ -f "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" ]; then echo -e "Search INDELs Start" echo -e 'Search INDELs\tStart\t'$(date) >> $log + echo -e 'Search INDELs\tStart\t'$(date) >&2 cd $starting_dir ./pool_search_indels.py "$ref_folder" "$vcf_folder" "$vcf_name" "$guide_file" "$pam_file" $bMax $mm $bDNA $bRNA "$output_folder" $true_pam "$current_working_directory/" mv "$output_folder/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" "$output_folder/crispritz_targets" echo -e "Search INDELs End" echo -e 'Search INDELs\tEnd\t'$(date) >> $log + echo -e 'Search INDELs\tEnd\t'$(date) >&2 else echo -e "Search INDELs already done" fi @@ -295,6 +319,7 @@ do sleep 100 done echo -e 'Search Reference\tEnd\t'$(date) >> $log + echo -e 'Search Reference\tEnd\t'$(date) >&2 if [ -f "$output_folder/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" ]; then mv "$output_folder/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" "$output_folder/crispritz_targets" @@ -312,6 +337,7 @@ do echo -e 'Post analysis' > $output if [ "$vcf_name" != "_" ]; then echo -e 'Post-analysis SNPs\tStart\t'$(date) >> $log + echo -e 'Post-analysis SNPs\tStart\t'$(date) >&2 final_res="$output_folder/final_results_$(basename ${output_folder}).bestMerge.txt" final_res_alt="$output_folder/final_results_$(basename ${output_folder}).altMerge.txt" if ! 
[ -f "$final_res" ]; then @@ -324,6 +350,7 @@ do ./pool_post_analisi_snp.py $output_folder $ref_folder $vcf_name $guide_file $mm $bDNA $bRNA $annotation_file $pam_file $dict_folder $final_res $final_res_alt $ncpus echo -e 'Post-analysis SNPs\tEnd\t'$(date) >> $log + echo -e 'Post-analysis SNPs\tEnd\t'$(date) >&2 for key in "${real_chroms[@]}" do echo "Concatenating $key" @@ -334,6 +361,7 @@ do done else echo -e 'Post-analysis\tStart\t'$(date) >> $log + echo -e 'Post-analysis\tStart\t'$(date) >&2 final_res="$output_folder/final_results_$(basename ${output_folder}).bestMerge.txt" final_res_alt="$output_folder/final_results_$(basename ${output_folder}).altMerge.txt" if ! [ -f "$final_res" ]; then @@ -353,6 +381,7 @@ do # rm "$output_folder/${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${annotation_name}_${mm}_${bDNA}_${bRNA}_$key.altMerge.txt" done echo -e 'Post-analysis\tEnd\t'$(date) >> $log + echo -e 'Post-analysis\tEnd\t'$(date) >&2 fi @@ -361,8 +390,10 @@ do cd "$starting_dir" echo -e 'Post-analysis INDELs\tStart\t'$(date) >> $log + echo -e 'Post-analysis INDELs\tStart\t'$(date) >&2 ./pool_post_analisi_indel.py $output_folder $ref_folder $vcf_folder $guide_file $mm $bDNA $bRNA $annotation_file $pam_file "$current_working_directory/Dictionaries/" $final_res $final_res_alt $ncpus echo -e 'Post-analysis INDELs\tEnd\t'$(date) >> $log + echo -e 'Post-analysis INDELs\tEnd\t'$(date) >&2 for key in "${array_fake_chroms[@]}" do echo "Concatenating $key" @@ -382,7 +413,8 @@ do if [ -z "$samples" ]; then continue fi - tail -n +2 $samples >> "$output_folder/sampleID.txt" + # tail -n +2 $samples >> "$output_folder/sampleID.txt" + grep -v '#' "${current_working_directory}/samplesIDs/$samples" >> "$output_folder/sampleID.txt" done < $sampleID sed -i 1i"#SAMPLE_ID\tPOPULATION_ID\tSUPERPOPULATION_ID\tGENDER" "$output_folder/sampleID.txt" @@ -391,16 +423,20 @@ sampleID=$output_folder/sampleID.txt echo -e 'Merging targets' > $output echo -e 'Merging Close Targets\tStart\t'$(date) >> $log +echo -e 'Merging Close Targets\tStart\t'$(date) >&2 ./merge_close_targets_cfd.sh $final_res $final_res.trimmed $merge_t mv $final_res.trimmed $final_res mv $final_res.trimmed.discarded_samples $final_res_alt echo -e 'Merging Close Targets\tEnd\t'$(date) >> $log +echo -e 'Merging Close Targets\tEnd\t'$(date) >&2 echo -e 'Merging Alternative Chromosomes\tStart\t'$(date) >> $log +echo -e 'Merging Alternative Chromosomes\tStart\t'$(date) >&2 ./merge_alt_chr.sh $final_res $final_res.chr_merged echo -e 'Merging Alternative Chromosomes\tEnd\t'$(date) >> $log +echo -e 'Merging Alternative Chromosomes\tEnd\t'$(date) >&2 mv $final_res.chr_merged $final_res @@ -424,6 +460,7 @@ mv $output_folder/indels.CFDGraph.txt $output_folder/cfd_graphs echo -e 'Creating images' > $output echo -e 'Creating images\tStart\t'$(date) >> $log +echo -e 'Creating images\tStart\t'$(date) >&2 echo -e "Adding risk score" ./add_risk_score.py $final_res $final_res.risk mv "$final_res.risk" "${output_folder}/$(basename ${output_folder}).bestMerge.txt" @@ -463,13 +500,16 @@ fi cd $starting_dir if [ "$vcf_name" != "_" ]; then - ./radar_chart.py $guide_file "${output_folder}/$(basename ${output_folder}).bestMerge.txt" $sampleID $annotation_file "$output_folder/imgs" $ncpus + #./radar_chart.py $guide_file "${output_folder}/$(basename ${output_folder}).bestMerge.txt" $sampleID $annotation_file "$output_folder/imgs" $ncpus + ./radar_chart_dict_generator.py $guide_file "${output_folder}/$(basename ${output_folder}).bestMerge.txt" $sampleID $annotation_file 
"$output_folder" $ncpus else echo -e "dummy_file" > dummy.txt - ./radar_chart.py $guide_file "${output_folder}/$(basename ${output_folder}).bestMerge.txt" dummy.txt $annotation_file "$output_folder/imgs" $ncpus + #./radar_chart.py $guide_file "${output_folder}/$(basename ${output_folder}).bestMerge.txt" dummy.txt $annotation_file "$output_folder/imgs" $ncpus + ./radar_chart_dict_generator.py $guide_file "${output_folder}/$(basename ${output_folder}).bestMerge.txt" dummy.txt $annotation_file "$output_folder" $ncpus rm dummy.txt fi echo -e 'Creating images\tEnd\t'$(date) >> $log +echo -e 'Creating images\tEnd\t'$(date) >&2 # if [ "$vcf_name" != "_" ]; then # cp $sampleID $output_folder/sampleID.txt @@ -477,24 +517,53 @@ echo -e 'Creating images\tEnd\t'$(date) >> $log echo -e 'Building database' echo -e 'Creating database\tStart\t'$(date) >> $log +echo -e 'Creating database\tStart\t'$(date) >&2 if [ -f "${output_folder}/$(basename ${output_folder}).db" ]; then rm "${output_folder}/$(basename ${output_folder}).db" fi python $starting_dir/db_creation.py "${output_folder}/$(basename ${output_folder}).bestMerge.txt" "${output_folder}/$(basename ${output_folder})" echo -e 'Creating database\tEnd\t'$(date) >> $log +echo -e 'Creating database\tEnd\t'$(date) >&2 echo $gene_proximity echo -e 'Integrating results\tStart\t'$(date) >> $log +echo -e 'Integrating results\tStart\t'$(date) >&2 +echo >> $guide_file +# while read guide; +# do +# if [ -z "$guide" ]; then +# continue +# fi +# touch "${output_folder}/imgs/CRISPRme_top_1000_log_for_main_text_${guide}.png" +# done < $guide_file if [ $gene_proximity != "_" ]; then touch "${output_folder}/dummy.txt" genome_version=$(echo ${ref_name} | sed 's/_ref//' | sed -e 's/\n//') #${output_folder}/Params.txt | awk '{print $2}' | sed 's/_ref//' | sed -e 's/\n//') echo $genome_version bash $starting_dir/post_process.sh "${output_folder}/$(basename ${output_folder}).bestMerge.txt" "${gene_proximity}" "${output_folder}/dummy.txt" "${guide_file}" $genome_version "${output_folder}" "vuota" rm "${output_folder}/dummy.txt" - python $starting_dir/CRISPRme_plots.py "${output_folder}/$(basename ${output_folder}).bestMerge.txt.integrated_results.tsv" "${output_folder}/imgs/" + while read guide; + do + if [ -z "$guide" ]; then + continue + fi + head -1 "${output_folder}/$(basename ${output_folder}).bestMerge.txt.integrated_results.tsv" >> "${output_folder}/tmp_linda_plot_file_${guide}.txt" + fgrep "$guide" "${output_folder}/$(basename ${output_folder}).bestMerge.txt.integrated_results.tsv" >> "${output_folder}/tmp_linda_plot_file_${guide}.txt" + python $starting_dir/CRISPRme_plots.py "${output_folder}/tmp_linda_plot_file_${guide}.txt" "${output_folder}/imgs/" $guide &> "${output_folder}/warnings.txt" + rm -f "${output_folder}/warnings.txt" + rm "${output_folder}/tmp_linda_plot_file_${guide}.txt" + done < $guide_file fi +truncate -s -1 $guide_file +truncate -s -1 $vcf_list +if [ $6 != "_" ]; then + truncate -s -1 $6 +fi + echo -e 'Integrating results\tEnd\t'$(date) >> $log +echo -e 'Integrating results\tEnd\t'$(date) >&2 echo -e 'Job\tDone\t'$(date) >> $log +echo -e 'Job\tDone\t'$(date) >&2 echo -e 'Job End' > $output echo -e "JOB END" diff --git a/assets/favicon.png b/assets/favicon.png new file mode 100644 index 0000000..c4f1480 Binary files /dev/null and b/assets/favicon.png differ diff --git a/assets/helpPage/advOpt.PNG b/assets/helpPage/advOpt.PNG index 271d47c..1c92810 100644 Binary files a/assets/helpPage/advOpt.PNG and b/assets/helpPage/advOpt.PNG differ diff --git 
a/assets/helpPage/allowed.PNG b/assets/helpPage/allowed.PNG deleted file mode 100644 index b53795a..0000000 Binary files a/assets/helpPage/allowed.PNG and /dev/null differ diff --git a/assets/helpPage/crRNA.PNG b/assets/helpPage/crRNA.PNG deleted file mode 100644 index f8ef73c..0000000 Binary files a/assets/helpPage/crRNA.PNG and /dev/null differ diff --git a/assets/helpPage/email.PNG b/assets/helpPage/email.PNG deleted file mode 100644 index 9e5ef0a..0000000 Binary files a/assets/helpPage/email.PNG and /dev/null differ diff --git a/assets/helpPage/genome.png b/assets/helpPage/genome.png index 99e0b40..7710333 100644 Binary files a/assets/helpPage/genome.png and b/assets/helpPage/genome.png differ diff --git a/assets/helpPage/guides.PNG b/assets/helpPage/guides.PNG index 5fe5b33..ab1d238 100644 Binary files a/assets/helpPage/guides.PNG and b/assets/helpPage/guides.PNG differ diff --git a/assets/helpPage/homeCRISPRme.PNG b/assets/helpPage/homeCRISPRme.PNG deleted file mode 100644 index 97a64e2..0000000 Binary files a/assets/helpPage/homeCRISPRme.PNG and /dev/null differ diff --git a/assets/helpPage/nuclease.png b/assets/helpPage/nuclease.png new file mode 100644 index 0000000..ed89251 Binary files /dev/null and b/assets/helpPage/nuclease.png differ diff --git a/assets/helpPage/pam.png b/assets/helpPage/pam.png index 6a1c7da..d9894a7 100644 Binary files a/assets/helpPage/pam.png and b/assets/helpPage/pam.png differ diff --git a/assets/helpPage/sequence.PNG b/assets/helpPage/sequence.PNG index 9188ec6..9605e0b 100644 Binary files a/assets/helpPage/sequence.PNG and b/assets/helpPage/sequence.PNG differ diff --git a/assets/helpPage/submit.png b/assets/helpPage/submit.png deleted file mode 100644 index 6c1f856..0000000 Binary files a/assets/helpPage/submit.png and /dev/null differ diff --git a/assets/helpPage/thresholds.png b/assets/helpPage/thresholds.png new file mode 100644 index 0000000..ec60c7a Binary files /dev/null and b/assets/helpPage/thresholds.png differ diff --git a/assets/helpPage/warning.png b/assets/helpPage/warning.png index 5bdde73..5ab437d 100644 Binary files a/assets/helpPage/warning.png and b/assets/helpPage/warning.png differ diff --git a/assets/main_page.png b/assets/main_page.png index cdd588c..4058248 100644 Binary files a/assets/main_page.png and b/assets/main_page.png differ diff --git a/assets/resultPage/customRanking.png b/assets/resultPage/customRanking.png new file mode 100644 index 0000000..f8b130b Binary files /dev/null and b/assets/resultPage/customRanking.png differ diff --git a/assets/resultPage/personalCard.png b/assets/resultPage/personalCard.png new file mode 100644 index 0000000..9674618 Binary files /dev/null and b/assets/resultPage/personalCard.png differ diff --git a/assets/resultPage/populationDistribution.png b/assets/resultPage/populationDistribution.png deleted file mode 100644 index 1dc8904..0000000 Binary files a/assets/resultPage/populationDistribution.png and /dev/null differ diff --git a/assets/resultPage/resultsSummary.png b/assets/resultPage/resultsSummary.png index b5d7041..15f9ea1 100644 Binary files a/assets/resultPage/resultsSummary.png and b/assets/resultPage/resultsSummary.png differ diff --git a/assets/resultPage/summaryByGraphic.PNG b/assets/resultPage/summaryByGraphic.PNG deleted file mode 100644 index f253b36..0000000 Binary files a/assets/resultPage/summaryByGraphic.PNG and /dev/null differ diff --git a/assets/resultPage/summaryByGraphic_population.PNG b/assets/resultPage/summaryByGraphic_population.PNG index 388a1e8..14ec36e 100644 
Binary files a/assets/resultPage/summaryByGraphic_population.PNG and b/assets/resultPage/summaryByGraphic_population.PNG differ diff --git a/assets/resultPage/summaryByGraphic_sample.PNG b/assets/resultPage/summaryByGraphic_sample.PNG index 208a658..47ab7be 100644 Binary files a/assets/resultPage/summaryByGraphic_sample.PNG and b/assets/resultPage/summaryByGraphic_sample.PNG differ diff --git a/assets/resultPage/summaryByGraphicaGecko.png b/assets/resultPage/summaryByGraphicaGecko.png deleted file mode 100644 index 37285d9..0000000 Binary files a/assets/resultPage/summaryByGraphicaGecko.png and /dev/null differ diff --git a/assets/resultPage/summaryByGuide.png b/assets/resultPage/summaryByGuide.png index 27f4175..6793052 100644 Binary files a/assets/resultPage/summaryByGuide.png and b/assets/resultPage/summaryByGuide.png differ diff --git a/assets/resultPage/summaryByPosition.png b/assets/resultPage/summaryByPosition.png index 2cb5a22..d70fe97 100644 Binary files a/assets/resultPage/summaryByPosition.png and b/assets/resultPage/summaryByPosition.png differ diff --git a/assets/resultPage/summaryBySamples.png b/assets/resultPage/summaryBySamples.png index ac70560..38dd980 100644 Binary files a/assets/resultPage/summaryBySamples.png and b/assets/resultPage/summaryBySamples.png differ diff --git a/assets/waitPage/jobDone.png b/assets/waitPage/jobDone.png index 863f99b..363a3da 100644 Binary files a/assets/waitPage/jobDone.png and b/assets/waitPage/jobDone.png differ diff --git a/assets/waitPage/loadPage.png b/assets/waitPage/loadPage.png index 1c80120..affdc1a 100644 Binary files a/assets/waitPage/loadPage.png and b/assets/waitPage/loadPage.png differ diff --git a/crisprme.py b/crisprme.py index 0ba7ad6..628d9c0 100644 --- a/crisprme.py +++ b/crisprme.py @@ -791,11 +791,13 @@ def complete_search(): p.close() os.system(f'cp {guidefile} {outputfolder}/guides.txt') if variant: - subprocess.run([script_path+'./submit_job_automated_new_multiple_vcfs.sh', str(genomedir), str(vcfdir), str(guidefile), str(pamfile), str(annotationfile), str( - samplefile), str(bMax), str(mm), str(bDNA), str(bRNA), str(merge_t), str(outputfolder), str(script_path), str(thread), str(current_working_directory), str(gene_annotation)]) + with open(f"{outputfolder}/log_verbose.txt", 'w') as log_verbose: + subprocess.run([script_path+'./submit_job_automated_new_multiple_vcfs.sh', str(genomedir), str(vcfdir), str(outputfolder)+"/guides.txt", str(pamfile), str(annotationfile), str( + samplefile), str(bMax), str(mm), str(bDNA), str(bRNA), str(merge_t), str(outputfolder), str(script_path), str(thread), str(current_working_directory), str(gene_annotation)], stdout=log_verbose) else: - subprocess.run([script_path+'./submit_job_automated_new_multiple_vcfs.sh', str(genomedir), '_', str(guidefile), str(pamfile), str(annotationfile), str(script_path+'vuoto.txt'), - str(bMax), str(mm), str(bDNA), str(bRNA), str(merge_t), str(outputfolder), str(script_path), str(thread), str(current_working_directory), str(gene_annotation)]) + with open(f"{outputfolder}/log_verbose.txt", 'w') as log_verbose: + subprocess.run([script_path+'./submit_job_automated_new_multiple_vcfs.sh', str(genomedir), '_', str(outputfolder)+"/guides.txt", str(pamfile), str(annotationfile), str(script_path+'vuoto.txt'), + str(bMax), str(mm), str(bDNA), str(bRNA), str(merge_t), str(outputfolder), str(script_path), str(thread), str(current_working_directory), str(gene_annotation)], stdout=log_verbose) def target_integration(): @@ -809,7 +811,7 @@ def target_integration(): 
print( "\t--empirical_data, used to specify the file that contains gencode annotation to find nearest gene to any target [OPTIONAL]") print( - "\t--vcf_dir, used to specify the directory containing vcf files used in the search phase, necessary to obtain haplotype frequency in multi-variant targets [OPTIONAL]") + "\t--vcf_dir, used to specify the directory containing vcf files used in the search phase, necessary to obtain haplotype frequency in multi-variant targets [OPTIONAL][BETA]") print("\t--output, used to specify the output folder for the results") exit(0) @@ -915,8 +917,72 @@ def target_integration(): os.system(script_path+"./post_process.sh "+target_file+" "+gencode_file + " "+empiricalfile+" "+guidefile+" "+str(genome_version)+" "+outputfolder+" "+vcf_dir+" "+script_path) +def gnomAD_converter(): + if "--help" in input_args: + print("This is the VCF gnomAD converter provided to convert all gnomADv3.1 VCFs into CRISPRme-supported VCFs") + print("These are the flags that must be used in order to run this function:") + print("\t--gnomAD_VCFdir, used to specify the directory containing gnomADv3.1 original VCFs") + print("\t--samplesID, used to specify the pre-generated samplesID file necessary to introduce samples into gnomAD variants") + print("\t--thread, used to specify the number of cores used to process VCFs in parallel (DEFAULT is ALL available minus 2) [OPTIONAL]") + exit(0) + + if "--gnomAD_VCFdir" not in input_args: + print("--gnomAD_VCFdir not in input, MANDATORY TO CONVERT DATA") + # vcf_dir = script_path+'vuota/' + exit(1) + else: + try: + vcf_dir = os.path.abspath( + input_args[input_args.index("--gnomAD_VCFdir")+1]) + except IndexError: + print("Please input some parameter for flag --gnomAD_VCFdir") + exit(1) + if not os.path.isdir(vcf_dir): + print("The folder specified for --gnomAD_VCFdir does not exist") + exit(1) + + if "--thread" not in input_args: + # print("--thread must be contained in the input") + # exit(1) + thread = len(os.sched_getaffinity(0))-2 + else: + try: + thread = input_args[input_args.index("--thread")+1] + except IndexError: + print("Please input some parameter for flag --thread") + exit(1) + try: + thread = int(thread) + except ValueError: + print("Please input a number for flag thread") + exit(1) + if thread <= 0 or thread > len(os.sched_getaffinity(0))-2: + print("thread is set to default (ALL available minus 2)") + thread = len(os.sched_getaffinity(0))-2 + # exit(1) + + if "--samplesID" not in input_args: + print("--samplesID not in input, MANDATORY TO CONVERT DATA") + exit(1) + elif "--samplesID" in input_args: + try: + samplefile = os.path.abspath( + input_args[input_args.index("--samplesID")+1]) + except IndexError: + print("Please input some parameter for flag --samplesID") + exit(1) + if not os.path.isfile(samplefile): + print("The file specified for --samplesID does not exist") + exit(1) + + os.system(script_path+"./convert_gnomAD.py "+vcf_dir+" "+samplefile+" "+str(thread)) + def web_interface(): + if "--help" in input_args: + print("This function must be launched without input; it starts a local server to use the web interface.") + print("Open your web browser and write 127.0.0.1:8080 in the search bar if you are executing locally; if you are executing on an external server, write <server_address>:8080 in the search bar") + exit(0) # print(corrected_web_path) subprocess.run(corrected_web_path+'/./index.py') @@ -925,18 +991,22 @@ def web_interface(): def callHelp(): print("help:\n", "\nALL FASTA FILEs USED BY THE SOFTWARE MUST BE UNZIPPED AND CHROMOSOME SEPARATED, ALL 
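The new gnomAD_converter above repeats the same look-up-the-token-after-the-flag idiom for every option. A hypothetical helper (not part of crisprme.py) that captures the pattern in one place:

```python
import sys

def flag_value(args, flag, mandatory=False):
    # mimic the input_args.index(flag) + 1 lookups used by gnomAD_converter
    if flag not in args:
        if mandatory:
            print(f"{flag} not in input, MANDATORY TO CONVERT DATA")
            sys.exit(1)
        return None
    try:
        return args[args.index(flag) + 1]
    except IndexError:
        print(f"Please input some parameter for flag {flag}")
        sys.exit(1)

vcf_dir = flag_value(sys.argv, "--gnomAD_VCFdir", mandatory=True)
```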
VCFs USED BY THE SOFTWARE MUST BE ZIPPED AND CHROMOSOME SEPARATED", - "\ncrisprme complete-search FUNCTION SEARCHING THE WHOLE GENOME (REFERENCE AND VARIANT IF REQUESTED) AND PERFORMING CFD ANALYSIS AND TARGET SELECTION", + "\n\ncrisprme.py complete-search FUNCTION SEARCHING THE WHOLE GENOME (REFERENCE AND VARIANT IF REQUESTED) AND PERFORMING CFD ANALYSIS AND TARGET SELECTION", #"\ncrisprme search-only FUNCTION SEARCHING THE WHOLE GENOME (REFERENCE AND VARIANT IF REQUESTED) PRODUCING RESULTS FOR POST-ANALYSIS", #"\ncrisprme post-analysis-only FUNCTION THAT PERFORMS CFD ANALYSIS AND TARGET SELECTION STARTING FROM SEARCH RESULTS", - "\ncrisprme targets-integration FUNCTION THAT INTEGRATES IN-SILICO TARGETS WITH EMPIRICAL DATA GENERATING A USABLE PANEL", - "\ncrisprme web-interface FUNCTION TO ACTIVATE WEB INTERFACE OF CRISPRme" - "\n\nADD help TO ANY FUNCTION TO VISUALIZE A BRIEF HELP PAGE (example: crisprme complete-search --help)\n") + "\n\ncrisprme.py targets-integration FUNCTION THAT INTEGRATES IN-SILICO TARGETS WITH EMPIRICAL DATA GENERATING A USABLE PANEL", + "\n\ncrisprme.py gnomAD-converter FUNCTION THAT CONVERTS ALL gnomADv3.1 vcf.bgz FILES INTO COMPATIBLE VCFs", + "\n\ncrisprme.py web-interface FUNCTION TO ACTIVATE WEB INTERFACE OF CRISPRme TO USE WITH A BROWSER LOCALLY" + "\n\nADD help TO ANY FUNCTION TO VISUALIZE A BRIEF HELP PAGE (example: crisprme.py complete-search --help)\n") if len(sys.argv) < 2: + directoryCheck() callHelp() elif sys.argv[1] == 'complete-search': complete_search() +elif sys.argv[1] == 'gnomAD-converter': + gnomAD_converter() # elif sys.argv[1] == 'search-only': # search_only() # elif sys.argv[1] == 'post-analysis-only': diff --git a/folder_structure.png b/folder_structure.png deleted file mode 100644 index 9593d98..0000000 Binary files a/folder_structure.png and /dev/null differ diff --git a/index.py b/index.py index e684f73..1b39750 100644 --- a/index.py +++ b/index.py @@ -37,7 +37,8 @@ def directoryCheck(): # function to check the main directory status, if some directory is missing, create it - directoryList = ['Genomes', 'Results', 'Dictionaries', 'VCFs', 'Annotations', 'Gencode', 'PAMs', 'samplesIDs'] + directoryList = ['Genomes', 'Results', 'Dictionaries', + 'VCFs', 'Annotations', 'Gencode', 'PAMs', 'samplesIDs'] for directory in directoryList: if not os.path.exists(current_working_directory+directory): os.makedirs(current_working_directory+directory) diff --git a/pages/help_page.py b/pages/help_page.py index 9019ff8..a5644c6 100644 --- a/pages/help_page.py +++ b/pages/help_page.py @@ -9,7 +9,7 @@ import dash_bootstrap_components as dbc import base64 # for decoding upload content import io # for decoding upload content - +from app import app_main_directory def helpPage(): final_list = [] @@ -20,7 +20,7 @@ def helpPage(): 'CRISPRme performs predictive analysis and result assessment on population and individual specific CRISPR/Cas experiments.' + ' CRISPRme enumerates on- and off-target accounting simultaneously for substitutions, DNA/RNA bulges and common genetic variants from the 1000 genomes project.' ]), - html.P(['Open this ', html.A('example', href='http://crispritz.di.univr.it/result?job=Q47PXDTBC8', + html.P(['Open this ', html.A('example', href='http://crisprme.di.univr.it/load?job=6FDKYQS472', target='_blank'), ' to navigate the results we show in this page']) ]) @@ -33,71 +33,132 @@ def helpPage(): html.Div([ html.P( ['In the main page of CRISPRme, users can select a wide range of options to personalize their searches. 
The input phase is divided into three main steps:', - html.Img(src='data:image/png;base64,{}'.format(base64.b64encode(open('assets/main_page.png', 'rb').read()).decode()), width="100%", height="auto")] + html.Img(src='data:image/png;base64,{}'.format(base64.b64encode(open(app_main_directory+'/assets/main_page.png', 'rb').read()).decode()), width="100%", height="auto")] ), html.Ul( [ html.Li( [ - 'STEP 1: Genome and PAM selection', + 'STEP 1: Guide, Nuclease and PAM selection', html.Ul( [ html.Img(src='data:image/png;base64,{}'.format(base64.b64encode( - open('assets/helpPage/genome.png', 'rb').read()).decode()), width='30%'), + open(app_main_directory+'/assets/helpPage/guides.png', 'rb').read()).decode()), width='30%'), html.Li( - 'Genome: here you can select a genome from the ones present, some genomes have also the variant version enriched (indicated with a \'+\' symbol) with genetic variant from the 1000genome project'), + 'Individual Protospacer(s): a list of crRNAs sequences, consisting in 1 or more sequences (max 1000 sequences) to search on the genome'), + html.Img(src='data:image/png;base64,{}'.format(base64.b64encode( + open(app_main_directory+'/assets/helpPage/sequence.png', 'rb').read()).decode()), width='40%'), + html.Li('Genomic sequence(s): one or more genetic sequences (max 1000 characters), each sequence MUST BE separated with the header \'>name\'. The sequence can be also submitted with a ' + + 'chromosome range, also provided with an header. The region will be extracted from the Genome selected in STEP 1'), html.Img(src='data:image/png;base64,{}'.format(base64.b64encode( - open('assets/helpPage/pam.png', 'rb').read()).decode()), width='30%'), + open(app_main_directory+'/assets/helpPage/nuclease.png', 'rb').read()).decode()), width='30%'), html.Li( - 'PAM: here you can select a Protospacer Adjacent Motif for a specific Cas protein.'), + 'Nuclease: here you can select a specific Cas protein.'), + html.Img(src='data:image/png;base64,{}'.format(base64.b64encode( + open(app_main_directory+'/assets/helpPage/pam.png', 'rb').read()).decode()), width='30%'), + html.Li( + 'PAM: here you can select a Protospacer Adjacent Motif for the specified Cas protein.'), ], style={'padding': '15px'} ) ] ), + # html.Li( + # [ + # 'STEP 1: Genome and PAM selection', + # html.Ul( + # [ + # html.Img(src='data:image/png;base64,{}'.format(base64.b64encode( + # open(app_main_directory+'/assets/helpPage/genome.png', 'rb').read()).decode()), width='30%'), + # html.Li( + # 'Genome: here you can select a genome from the ones present, some genomes have also the variant version enriched (indicated with a \'+\' symbol) with genetic variant from the 1000genome project'), + # html.Img(src='data:image/png;base64,{}'.format(base64.b64encode( + # open(app_main_directory+'/assets/helpPage/pam.png', 'rb').read()).decode()), width='30%'), + # html.Li( + # 'PAM: here you can select a Protospacer Adjacent Motif for a specific Cas protein.'), + # ], style={'padding': '15px'} + # ) + # ] + # ), html.Li( [ - 'STEP 2: Guide insertion and threshold configuration', + 'STEP 2: Genome selection and threshold configuration', html.Ul( [ html.Img(src='data:image/png;base64,{}'.format(base64.b64encode( - open('assets/helpPage/guides.PNG', 'rb').read()).decode()), width='40%'), + open(app_main_directory+'/assets/helpPage/genome.png', 'rb').read()).decode()), width='40%'), html.Li( - 'Guides: a list of crRNAs sequences, consisting in 1 or more sequences (max 1000 sequences) to search on the genome'), - 
html.Img(src='data:image/png;base64,{}'.format(base64.b64encode( - open('assets/helpPage/sequence.PNG', 'rb').read()).decode()), width='40%'), - html.Li('Sequence: one or more genetic sequences (max 1000 characters), each sequence MUST BE separated with the header \'>name\'. The sequence can be also submitted with a ' + - 'chromosome range, also provided with an header. The region will be extracted from the Genome selected in STEP 1'), + 'Genome: here you can select a genome from the ones present and combine it with one or more VCF datasets (1000G, HGDP, personal variants)'), html.Img(src='data:image/png;base64,{}'.format(base64.b64encode( - open('assets/helpPage/crRNA.PNG', 'rb').read()).decode()), width='20%'), + open(app_main_directory+'/assets/helpPage/thresholds.png', 'rb').read()).decode()), width='20%'), html.Li( 'Allowed mismatches: number of tolerated mismatches in a target'), html.Li( 'Bulge DNA size: size of bubbles tolerated on the DNA sequence (can be consecutive(AA--AA) or interleaved(AA-A-AA)).'), html.Li( 'Bulge RNA size: size of bubbles tolerated on the RNA sequence (can be consecutive(AA--AA) or interleaved(AA-A-AA))'), - # html.Img(src = 'data:image/png;base64,{}'.format(base64.b64encode(open('assets/helpPage/crRNA.PNG', 'rb').read()).decode()), width='20%' ), - html.Li( - 'crRNA length: available only when a genetic sequence is given as input, represents the length of the guides (without PAM) that you want to extract from the sequence.') ], style={'padding': '15px'} ) ] ), + # html.Li( + # [ + # 'STEP 2: Guide insertion and threshold configuration', + # html.Ul( + # [ + # html.Img(src='data:image/png;base64,{}'.format(base64.b64encode( + # open(app_main_directory+'/assets/helpPage/guides.png', 'rb').read()).decode()), width='40%'), + # html.Li( + # 'Guides: a list of crRNAs sequences, consisting in 1 or more sequences (max 1000 sequences) to search on the genome'), + # html.Img(src='data:image/png;base64,{}'.format(base64.b64encode( + # open(app_main_directory+'/assets/helpPage/sequence.png', 'rb').read()).decode()), width='40%'), + # html.Li('Sequence: one or more genetic sequences (max 1000 characters), each sequence MUST BE separated with the header \'>name\'. The sequence can be also submitted with a ' + + # 'chromosome range, also provided with an header. 
The region will be extracted from the Genome selected in STEP 1'), + # html.Img(src='data:image/png;base64,{}'.format(base64.b64encode( + # open(app_main_directory+'/assets/helpPage/crRNA.png', 'rb').read()).decode()), width='20%'), + # html.Li( + # 'Allowed mismatches: number of tolerated mismatches in a target'), + # html.Li( + # 'Bulge DNA size: size of bubbles tolerated on the DNA sequence (can be consecutive(AA--AA) or interleaved(AA-A-AA)).'), + # html.Li( + # 'Bulge RNA size: size of bubbles tolerated on the RNA sequence (can be consecutive(AA--AA) or interleaved(AA-A-AA))'), + # # html.Img(src = 'data:image/png;base64,{}'.format(base64.b64encode(open(app_main_directory+'/assets/helpPage/crRNA.png', 'rb').read()).decode()), width='20%' ), + # html.Li( + # 'crRNA length: available only when a genetic sequence is given as input, represents the length of the guides (without PAM) that you want to extract from the sequence.') + # ], style={'padding': '15px'} + # ) + # ] + # ), html.Li( [ - 'STEP 3 (Advanced options): Select various comparisons', + 'STEP 3 Select Annotation(s) and notify by email', html.Ul( [ html.Img(src='data:image/png;base64,{}'.format(base64.b64encode( - open('assets/helpPage/advOpt.PNG', 'rb').read()).decode()), width='40%'), + open(app_main_directory+'/assets/helpPage/advOpt.png', 'rb').read()).decode()), width='40%'), html.Li( - 'Compare your results with the GeCKO v2 library: selected by default, compares the results of your guides with the results obtained in a previous search with guides from the well-known GeCKO library.'), - html.Li('Compare your results with the corresponding reference genome: selected by default when an enriched genome is chosen, compares the results with the respective reference genome to evaluate differences when variant are added.'), + 'The user can choose the annotation to use, either only the provided set of annotations or combine it with a personal annotation file.'), html.Li( 'Notify me by email: if selected, let you insert an email to receive a notification when your job is terminated.'), ], style={'padding': '15px'} ) ] ) + # html.Li( + # [ + # 'STEP 3 (Advanced options): Select various comparisons', + # html.Ul( + # [ + # html.Img(src='data:image/png;base64,{}'.format(base64.b64encode( + # open(app_main_directory+'/assets/helpPage/advOpt.png', 'rb').read()).decode()), width='40%'), + # html.Li( + # 'Compare your results with the GeCKO v2 library: selected by default, compares the results of your guides with the results obtained in a previous search with guides from the well-known GeCKO library.'), + # html.Li('Compare your results with the corresponding reference genome: selected by default when an enriched genome is chosen, compares the results with the respective reference genome to evaluate differences when variant are added.'), + # html.Li( + # 'Notify me by email: if selected, let you insert an email to receive a notification when your job is terminated.'), + # ], style={'padding': '15px'} + # ) + # ] + # ) ], style={'padding': '15px'} ) ]) @@ -116,7 +177,7 @@ def helpPage(): [ 'WARNING: If some inputs are missing, a warning popup will be displayed', html.P(), html.Img(src='data:image/png;base64,{}'.format(base64.b64encode( - open('assets/helpPage/warning.png', 'rb').read()).decode()), width='100%'), + open(app_main_directory+'/assets/helpPage/warning.png', 'rb').read()).decode()), width='100%'), ], color='warning', fade=False, style={'width': '70%'} ) ] @@ -128,10 +189,10 @@ def helpPage(): [ 'After the submission, the status 
of the search will be displayed in a new page', html.Img(src='data:image/png;base64,{}'.format(base64.b64encode( - open('assets/waitPage/loadPage.png', 'rb').read()).decode()), width='100%'), + open(app_main_directory+'/assets/waitPage/loadPage.png', 'rb').read()).decode()), width='100%'), 'When the job is complete, the result link will appear at the end of the status report', html.P(html.Img(src='data:image/png;base64,{}'.format(base64.b64encode( - open('assets/waitPage/jobDone.png', 'rb').read()).decode()), width='20%')) + open(app_main_directory+'/assets/waitPage/jobDone.png', 'rb').read()).decode()), width='20%')) ] ) ) @@ -140,41 +201,47 @@ def helpPage(): final_list.append( html.P( [ - 'At the top of the page, you find a table with the list of sgRNAs used during the search phase. This table summarizes the results obtained for each input guide.', - html.P(html.Img(src='data:image/png;base64,{}'.format(base64.b64encode(open( - 'assets/resultPage/resultsSummary.png', 'rb').read()).decode()), width='100%')), + 'At the top of the page, you find a table with the list of gRNAs used during the search phase. This table summarizes the results obtained for each input guide.', + html.P(html.Img(src='data:image/png;base64,{}'.format(base64.b64encode(open(app_main_directory+ + '/assets/resultPage/resultsSummary.png', 'rb').read()).decode()), width='100%')), html.Ul( [ html.Li('CFD: Off-Target Cutting Frequency Determination Score, calculates how much is the affinity of the guides with the off-targets, basically tells you the likelihood of the guide to perform cut in off-target regions.'), - html.Li('Doench 2016: On-Target Efficacy Scoring (Azimuth 2.0), it’s a trained machine learning model that gives you the likelihood of on-target activity for the selected guide.'), - html.Li( - 'On-Targets Reference: shows how many possible On-Targets the guide can have in the Reference Genome.'), - html.Li(['Samples in Class 0 - 0+ - 1 - 1+: shows the number of samples grouped by Sample Class:', - html.Ul([ - html.Li( - 'Class 0: Samples that does not have any On-Targets'), - html.Li( - 'Class 0+: Samples that have a subset of the Reference Genome On-Targets'), - html.Li( - 'Class 1: Samples that have the same On-Targets as the Reference Genome'), - html.Li( - 'Class 1+: Samples that creates at least a new On-Target, that is not present in the Reference Genome') - ]) - ]), + # html.Li('Doench 2016: On-Target Efficacy Scoring (Azimuth 2.0), it’s a trained machine learning model that gives you the likelihood of on-target activity for the selected guide.'), + # html.Li( + # 'On-Targets Reference: shows how many possible On-Targets the guide can have in the Reference Genome.'), + # html.Li(['Samples in Class 0 - 0+ - 1 - 1+: shows the number of samples grouped by Sample Class:', + # html.Ul([ + # html.Li( + # 'Class 0: Samples that does not have any On-Targets'), + # html.Li( + # 'Class 0+: Samples that have a subset of the Reference Genome On-Targets'), + # html.Li( + # 'Class 1: Samples that have the same On-Targets as the Reference Genome'), + # html.Li( + # 'Class 1+: Samples that creates at least a new On-Target, that is not present in the Reference Genome') + # ]) + # ]), html.Li('Off-Targets Reference (0 - n Mismatches + Bulges): shows how many possible Off-Targets the guide can have in the Reference Genome. Targets are also grouped by Mismatch + Bulge value.'), - html.Li('Off-Targets Enriched (0 - n Mismatches + Bulges): shows how many possible Off-Targets the guide can have in the Enriched Genome. 
Targets are also grouped by Mismatch + Bulge value.') + html.Li('Off-Targets Variant (0 - n Mismatches + Bulges): shows how many possible Off-Targets the guide can have in the Variant Genome. Targets are also grouped by Mismatch + Bulge value.') ], style={'padding': '15px'} ), - html.P(html.Img(src='data:image/png;base64,{}'.format(base64.b64encode(open( - 'assets/resultPage/populationDistribution.png', 'rb').read()).decode()), width='100%')), - 'The Show Target Distribution in Populations button opens a section where informations about the number of targets found in each Superpopulation (EAS, EUR, AFR, AMR, SAS) are provided by means of a barplot for each Mismatch + Bulge value. ', - html.P('In the middle of the page there are four tabs:'), + # html.P(html.Img(src='data:image/png;base64,{}'.format(base64.b64encode(open(app_main_directory+ + # '/assets/resultPage/populationDistribution.png', 'rb').read()).decode()), width='100%')), + # 'The Show Target Distribution in Populations button opens a section where informations about the number of targets found in each Superpopulation (EAS, EUR, AFR, AMR, SAS) are provided by means of a barplot for each Mismatch + Bulge value. ', + html.P('In the middle of the page there are six tabs:'), html.Ul( [ - html.Li([html.Span('Summary by Guide: ', style={ + html.Li([html.Span('Custom ranking: ', style={ + 'color': 'red'}), 'This page allows the user to sort the results based on various values like number of mismatches, bulges, CFD, etc...']), + html.Img(src='data:image/png;base64,{}'.format(base64.b64encode(open(app_main_directory+ + '/assets/resultPage/customRanking.png', 'rb').read()).decode()), width='100%'), + + + html.Li([html.Span('Summary by Mismatches/Bulges: ', style={ 'color': 'red'}), 'This table collects all the possible On-/Off- Targets grouped by mismatch/bulge couples.']), - html.Img(src='data:image/png;base64,{}'.format(base64.b64encode(open( - 'assets/resultPage/summaryByGuide.png', 'rb').read()).decode()), width='100%'), + html.Img(src='data:image/png;base64,{}'.format(base64.b64encode(open(app_main_directory+ + '/assets/resultPage/summaryByGuide.png', 'rb').read()).decode()), width='100%'), html.Ul( [ html.Li( @@ -186,68 +253,91 @@ def helpPage(): html.Li( 'Targets in Reference: number of targets found in the Reference Genome for that combination of mismatch/bulge'), html.Li( - 'Targets in Enriched: number of targets found in the Enriched Genome for that combination of mismatch/bulge. Each of these targets is associated with at least one sample'), + 'Targets in Variant: number of targets found in the Variant Genome for that combination of mismatch/bulge. 
Each of these targets is associated with at least one sample'), html.Li( 'PAM Creation: number of possible created PAMs due to variants addition'), html.Li( 'Show Targets: open a new page to display all the targets of the row of interest') ] ), + + html.Li([html.Span('Summary by Sample: ', style={ - 'color': 'red'}), 'This table collects all the possible On-/Off- Targets grouped by sample name.']), - html.Img(src='data:image/png;base64,{}'.format(base64.b64encode(open( - 'assets/resultPage/summaryBySamples.png', 'rb').read()).decode()), width='100%'), + 'color': 'red'}), 'This table collects all the possible On-/Off- Targets grouped by sample ID.']), + html.Img(src='data:image/png;base64,{}'.format(base64.b64encode(open(app_main_directory+ + '/assets/resultPage/summaryBySamples.png', 'rb').read()).decode()), width='100%'), html.Ul( [ + html.Li( + 'Gender: the sample gender'), html.Li( 'Population: population which the sample belong to'), html.Li( 'Super Population: continent which the sample belong to'), html.Li( - 'Targets in Enriched: number of targets found in the Enriched Genome that are generated by that sample'), + 'Targets in Variant: number of targets found in the Variant Genome that are generated by that sample'), html.Li( - 'Targets in Population: number of targets found in the Enriched Genome that are generated by all the sample of the population'), + 'Targets in Population: number of targets found in the Variant Genome that are generated by all the sample of the population'), html.Li( - 'Targets in Super Population: number of targets found in the Enriched Genome that are generated by all the populations'), + 'Targets in Super Population: number of targets found in the Variant Genome that are generated by all the populations'), html.Li( 'PAM Creation: number of possible created PAMs due to variants addition'), - html.Li( - 'Class: Sample Class (0 - 0+ - 1 - 1+) associated with the sample'), + # html.Li( + # 'Class: Sample Class (0 - 0+ - 1 - 1+) associated with the sample'), html.Li( 'Show Targets: open a new page to display all the targets of the row of interest') ] ), - html.Li([html.Span('Summary by Position: ', style={ + + html.Li([html.Span('Query Genomic Region: ', style={ 'color': 'red'}), 'This table collects all the possible On-/Off- Targets grouped by position in the genome (composed by chromosome and relative position)']), - html.Img(src='data:image/png;base64,{}'.format(base64.b64encode(open( - 'assets/resultPage/summaryByPosition.png', 'rb').read()).decode()), width='100%'), - html.Ul( - [ - html.Li( - 'Position: chromosome relative position of the first letter of the guide'), - html.Li( - 'Best Target: best target found in that position'), - html.Li( - 'Min Mismatch: minimum number of mismatches present in the targets in that position'), - html.Li( - 'Min Bulge: minimum number of bulges present in the targets in that position'), - html.Li( - 'Bulge: number of bulges present in the targets in that position'), - html.Li( - 'Targets in Cluster by Mismatch Value: Matrix showing the distribution of the targets grouped by mismatch/bulge count'), - ] + html.Img(src='data:image/png;base64,{}'.format(base64.b64encode(open(app_main_directory+ + '/assets/resultPage/summaryByPosition.png', 'rb').read()).decode()), width='100%'), + # html.Ul( + # [ + # html.Li( + # 'Position: chromosome relative position of the first letter of the guide'), + # html.Li( + # 'Best Target: best target found in that position'), + # html.Li( + # 'Min Mismatch: minimum number of mismatches present in 
the targets in that position'), + # html.Li( + # 'Min Bulge: minimum number of bulges present in the targets in that position'), + # html.Li( + # 'Bulge: number of bulges present in the targets in that position'), + # html.Li( + # 'Targets in Cluster by Mismatch Value: Matrix showing the distribution of the targets grouped by mismatch/bulge count'), + # ] - ), - html.Li([html.Span('Graphical Report: ', style={ + # ), + html.Li([html.Span('Graphical Reports: ', style={ 'color': 'red'}), 'This page shows graphics about a specific guide, including genomic annotation and motif logos. The main feature introduced is the possibility to visualize graphical reports at individual level.']), - html.Img(src='data:image/png;base64,{}'.format(base64.b64encode(open( - 'assets/resultPage/summaryByGraphicaGecko.png', 'rb').read()).decode()), width='100%'), + html.Img(src='data:image/png;base64,{}'.format(base64.b64encode(open(app_main_directory+ + '/assets/resultPage/summaryByGraphic_population.png', 'rb').read()).decode()), width='100%'), + html.Li( + 'Select a Mismatch and Bulge Value: generate graphics with the specified mismatch+bulge value'), + html.Img(src='data:image/png;base64,{}'.format(base64.b64encode(open(app_main_directory+ + '/assets/resultPage/summaryByGraphic_sample.png', 'rb').read()).decode()), width='100%'), + html.Li( + 'Select Individual Data: generate individual data, by selecting Super Population, Population and Sample'), + + html.Li([html.Span('Personal Risk Cards: ', style={ + 'color': 'red'}), 'This page shows at individual level the most important data for a given sample.']), + html.Img(src='data:image/png;base64,{}'.format(base64.b64encode(open(app_main_directory+ + '/assets/resultPage/personalCard.png', 'rb').read()).decode()), width='100%'), html.Ul( [ html.Li( - 'Select a Mismatch Value: generate graphics with the specified mismatch value'), + 'Plot showing the difference in terms of CFD score in a specific genetic region, when accounting for variants in targets in which the selected sample appear.'), + + html.Li( + 'Plot showing the difference in terms of CFD score in a specific genetic region, when accounting for variants in targets unique to the selected sample.'), html.Li( - 'Select Individual Data: generate individual data, by selecting Super Population, Population and Sample') + 'Set of tables reporting the personal information for the selected sample. \n \ + The first table reports personal targets, PAM creation target and private targets. \n \ + The second table lists for each target the crRNA and DNA sequences, the position and cluster position, \ + the chromosome, direction, mismatches, bulge size and total. 
It also reports the CFD score for the reference and variant target, \ + the annotation and eventually the new PAM generation due to substitution or insertion/deletion (not showed in the figure).') ] ) ], style={'padding': '15px'} @@ -257,41 +347,41 @@ def helpPage(): ) ) - final_list.append( - html.Div( - [ - html.H3('Browser Compatibility'), - html.Div([ - dash_table.DataTable( - data=[{'OS': 'Linux', 'V': 'Ubuntu 18.04.2 LTS', 'Ch': '79.0', 'S': 'n/a', 'ME': 'n/a', 'F': '71.0'}, - {'OS': 'MacOS', 'V': 'Mojave', 'Ch': ' 79.0', - 'S': '13.0.4', 'ME': 'n/a', 'F': '71.0'}, - {'OS': 'Windows', 'V': '10', 'Ch': '79.0', 'S': 'n/a', 'ME': '44.18362.449.0', 'F': '71.0'}], + # final_list.append( + # html.Div( + # [ + # html.H3('Browser Compatibility'), + # html.Div([ + # dash_table.DataTable( + # data=[{'OS': 'Linux', 'V': 'Ubuntu 18.04.2 LTS', 'Ch': '79.0', 'S': 'n/a', 'ME': 'n/a', 'F': '71.0'}, + # {'OS': 'MacOS', 'V': 'Mojave', 'Ch': ' 79.0', + # 'S': '13.0.4', 'ME': 'n/a', 'F': '71.0'}, + # {'OS': 'Windows', 'V': '10', 'Ch': '79.0', 'S': 'n/a', 'ME': '44.18362.449.0', 'F': '71.0'}], - columns=[{'id': 'OS', 'name': 'Operative System'}, {'id': 'V', 'name': 'Version'}, {'id': 'Ch', 'name': 'Chrome'}, - {'id': 'S', 'name': 'Safari'}, {'id': 'ME', 'name': 'Microsoft Edge'}, { - 'id': 'F', 'name': 'Firefox'} - ], + # columns=[{'id': 'OS', 'name': 'Operative System'}, {'id': 'V', 'name': 'Version'}, {'id': 'Ch', 'name': 'Chrome'}, + # {'id': 'S', 'name': 'Safari'}, {'id': 'ME', 'name': 'Microsoft Edge'}, { + # 'id': 'F', 'name': 'Firefox'} + # ], - style_cell={ - 'textAlign': 'center', - 'width': '20' - }, + # style_cell={ + # 'textAlign': 'center', + # 'width': '20' + # }, - style_data_conditional=[ - { - 'if': {'row_index': 'odd'}, - 'backgroundColor': 'rgb(248, 248, 248)' - } - ], - style_header={ - 'backgroundColor': 'rgb(230, 230, 230)', - 'fontWeight': 'bold' - } - ) - ], style={'display': 'inline-block', 'width': '48%'}) - ] - ) - ) + # style_data_conditional=[ + # { + # 'if': {'row_index': 'odd'}, + # 'backgroundColor': 'rgb(248, 248, 248)' + # } + # ], + # style_header={ + # 'backgroundColor': 'rgb(230, 230, 230)', + # 'fontWeight': 'bold' + # } + # ) + # ], style={'display': 'inline-block', 'width': '48%'}) + # ] + # ) + # ) page = html.Div(final_list, style={'margin': '1%'}) return page diff --git a/pages/history_page.py b/pages/history_page.py index 65ae9f0..2fa257f 100644 --- a/pages/history_page.py +++ b/pages/history_page.py @@ -121,8 +121,13 @@ def get_results(): n_guides = 'NA' resultParamDataframe = resultParamDataframe.append({'Job': job, 'Genome_Selected': genome_selected, 'Variant_Selected': genome_idx, 'Mismatches': mms, 'DNA_bulge': dna, 'RNA_bulge': rna, 'PAM': pam, 'Number_Guides': n_guides, 'Start': job_start}, ignore_index=True) - resultParamDataframe = resultParamDataframe.sort_values( - ['Mismatches', 'DNA_bulge', 'RNA_bulge'], ascending=[True, True, True]) + try: + resultParamDataframe['Start'] = pd.to_datetime(resultParamDataframe['Start']) + resultParamDataframe.sort_values(by=['Start'], inplace=True, ascending=False) + except: + pass + # resultParamDataframe = resultParamDataframe.sort_values( + # ['Mismatches', 'DNA_bulge', 'RNA_bulge'], ascending=[True, True, True]) return resultParamDataframe diff --git a/pages/main_page.py b/pages/main_page.py index 7112771..59a178c 100644 --- a/pages/main_page.py +++ b/pages/main_page.py @@ -165,15 +165,20 @@ def changeUrl(n, href, nuclease, genome_selected, ref_var, annotation_var, vcf_i genome_selected = 'hg38_ref' if pam is 
None or pam == '': pam = '20bp-NGG-SpCas9' + len_guide_sequence = 20 + else: + for elem in pam.split('-'): + if 'bp' in elem: + len_guide_sequence = int(elem.replace('bp', '')) if text_guides is None or text_guides == '': text_guides = 'GAGTCCGAGCAGAAGAAGAA\nCCATCGGTGGCCGTTTGCCC' - else: + elif guide_type != 'GS': text_guides = text_guides.strip() if (not all(len(elem) == len(text_guides.split('\n')[0]) for elem in text_guides.split('\n'))): text_guides = selectSameLenGuides(text_guides) # if (len_guide_sequence is None or str(len_guide_sequence) == '') and ('sequence-tab' in active_tab): # len_guide_sequence = 20 - len_guide_sequence = 20 + # print('guide len', len_guide_sequence) # if (text_sequence is None or text_sequence == '') and ('sequence-tab' in active_tab): # text_sequence = '>sequence\nTACCCCAAACGCGGAGGCGCCTCGGGAAGGCGAGGTGGGCAAGTTCAATGCCAAGCGTGACGGGGGA' @@ -257,29 +262,33 @@ def changeUrl(n, href, nuclease, genome_selected, ref_var, annotation_var, vcf_i vcf_folder = "_" vcf_file.write(vcf_folder+"\n") if '1000G' in ref_var: - vcf_folder = current_working_directory+"/VCFs/hg38_1000G/" + # vcf_folder = current_working_directory+"/VCFs/hg38_1000G/" + vcf_folder = "hg38_1000G/" vcf_file.write(vcf_folder+"\n") - sample_list.append(current_working_directory + \ - 'samplesIDs/hg38_1000G.samplesID.txt') + # sample_list.append(current_working_directory + 'samplesIDs/hg38_1000G.samplesID.txt') + sample_list.append('hg38_1000G.samplesID.txt') # dictionary_directory = current_working_directory + \ # 'dictionaries/dictionary_VCFs_1000_genome_project' if 'HGDP' in ref_var: - vcf_folder = current_working_directory+"/VCFs/hg38_HGDP/" + # vcf_folder = current_working_directory+"/VCFs/hg38_HGDP/" + vcf_folder = "hg38_HGDP/" vcf_file.write(vcf_folder+"\n") - sample_list.append(current_working_directory + \ - 'samplesIDs/hg38_HGDP.samplesID.txt') + # sample_list.append(current_working_directory +'samplesIDs/hg38_HGDP.samplesID.txt') + sample_list.append('hg38_HGDP.samplesID.txt') # dictionary_directory = current_working_directory + \ # 'dictionaries/dictionary_VCFs_1000_genome_project' if "PV" in ref_var: # genome_selected = data_personal_genome[sel_cel_genome['row'] # ]["Personal Genomes"] - vcf_folder = current_working_directory+"/VCFs/" + vcf_input + # vcf_folder = current_working_directory+"/VCFs/" + vcf_input + vcf_folder = vcf_input vcf_file.write(vcf_folder+"\n") # genome_ref = genome_selected.split("+")[0].replace(" ", "_") # sample_list = current_working_directory + 'samplesID/' + vcf_input # sample_list.append(current_working_directory + "/samplesID/" + vcf_input + "/" + [f for f in listdir( # current_working_directory + 'samplesID/' + vcf_input) if isfile(join(current_working_directory + 'samplesID/' + vcf_input, f))][0]) - sample_list.append(current_working_directory + "/samplesIDs/" + vcf_input + ".samplesID.txt") + # sample_list.append(current_working_directory + "/samplesIDs/" + vcf_input + ".samplesID.txt") + sample_list.append(vcf_input + ".samplesID.txt") # dictionary_directory = current_working_directory + \ # 'dictionaries/'+genome_ref+'+'+vcf_input @@ -320,26 +329,49 @@ def changeUrl(n, href, nuclease, genome_selected, ref_var, annotation_var, vcf_i if guide_type == 'GS': text_sequence = text_guides + # print(text_sequence) + # exit() # Extract sequence and create the guides guides = [] + # extracted_seqs = list() + # for lines in text_sequence: + # print('linea', lines) for name_and_seq in text_sequence.split('>'): if '' == name_and_seq: continue name = 
name_and_seq[:name_and_seq.find('\n')] seq = name_and_seq[name_and_seq.find('\n'):] - seq = seq.strip().split() - seq = ''.join(seq) + # seq = seq.strip().split() + # seq = ''.join(seq) + seq = seq.strip() # name, seq = name_and_seq.strip().split('\n') if 'chr' in seq: - extracted_seq = extract_seq.extractSequence( - name, seq, genome_ref.replace(' ', '_')) + # extracted_seq = extract_seq.extractSequence( + # name, seq, genome_ref.replace(' ', '_')) + for single_row in seq.split('\n'): + if '' == single_row: + continue + pieces_of_row = single_row.strip().split() + seq_to_extract = pieces_of_row[0]+":" + \ + pieces_of_row[1]+"-"+pieces_of_row[2] + extracted_seq = extract_seq.extractSequence( + name, seq_to_extract, genome_ref.replace(' ', '_')) + guides.extend(convert_pam.getGuides( + extracted_seq, pam_char, len_guide_sequence, pam_begin)) else: + seq = seq.split() + seq = ''.join(seq) extracted_seq = seq.strip() - - guides.extend(convert_pam.getGuides( - extracted_seq, pam_char, len_guide_sequence, pam_begin)) - text_guides = '\n'.join(guides).strip() - + guides.extend(convert_pam.getGuides( + extracted_seq, pam_char, len_guide_sequence, pam_begin)) + # print('extracted seq', extracted_seq) + # guides.extend(convert_pam.getGuides( + # extracted_seq, pam_char, len_guide_sequence, pam_begin)) + if not guides: + guides = ["GAGTCCGAGCAGAAGAAGAA"]  # keep the fallback guide in a list, otherwise '\n'.join(guides) would split the string into single characters + text_guides = '\n'.join(guides).strip() + # print(text_guides, 'and', guides, 'and', pam_char) + # exit() text_guides = text_guides.upper() for g in text_guides.split('\n'): for c in g: @@ -348,6 +380,7 @@ def changeUrl(n, href, nuclease, genome_selected, ref_var, annotation_var, vcf_i if len(text_guides.split('\n')) > 1000: text_guides = '\n'.join(text_guides.split('\n')[:1000]).strip() len_guides = len(text_guides.split('\n')[0]) + # Adjust guides by adding Ns to make them compatible with CRISPRitz if (pam_begin): pam_to_file = pam_char + ('N' * len_guides) + ' ' + index_pam_value @@ -408,25 +441,25 @@ def changeUrl(n, href, nuclease, genome_selected, ref_var, annotation_var, vcf_i # genome_idx = '' # genome_idx_ref = '' # for gidx in genome_indices_created: - # if pam_char in gidx and genome_selected in gidx: - # if int(gidx.split('_')[1]) >= int(max_bulges): - # if '+' in gidx: - # genome_idx = gidx - # genome_idx_ref = gidx.split('+')[0] + # if pam_char in gidx and genome_selected in gidx: + # if int(gidx.split('_')[1]) >= int(max_bulges): + # if '+' in gidx: + # genome_idx = gidx + # genome_idx_ref = gidx.split('+')[0] # if genome_idx == '': - # if int(max_bulges) > 0: - # generate_index_enr = True - # with open(result_dir + '/pam_indexing.txt', 'w+') as pam_id_file: - # pam_id_file.write(pam_to_indexing) - # genome_idx = pam_char + '_' + str(max_bulges) + '_' + genome_selected + # if int(max_bulges) > 0: + # generate_index_enr = True + # with open(result_dir + '/pam_indexing.txt', 'w+') as pam_id_file: + # pam_id_file.write(pam_to_indexing) + # genome_idx = pam_char + '_' + str(max_bulges) + '_' + genome_selected # if genome_idx_ref == '': - # if int(max_bulges) > 0: - # generate_index_ref = True - # with open(result_dir + '/pam_indexing.txt', 'w+') as pam_id_file: - # pam_id_file.write(pam_to_indexing) - # genome_idx_ref = pam_char + '_' + \ - # str(max_bulges) + '_' +
genome_selected.split('+')[0] # if ref_var != '': # generate_index_enr = False @@ -544,8 +577,11 @@ def changeUrl(n, href, nuclease, genome_selected, ref_var, annotation_var, vcf_i ' ' + genome_type + ' ' + app_main_directory + ' ' + str(dictionary_directory) + ' ' + str(sample_list) + ' ' + str(generate_index_ref) + ' ' + str(generate_index_enr) + ' ' + current_working_directory)) ''' # il 3 è il merge threshold - command = f"{app_main_directory}/PostProcess/./submit_job_automated_new_multiple_vcfs.sh {current_working_directory}/Genomes/{genome_ref} {result_dir}/list_vcfs.txt {guides_file} {pam} {current_working_directory}/Annotations/{annotation_name} {result_dir}/samplesID.txt {max([int(dna), int(rna)])} {mms} {dna} {rna} {3} {result_dir} {app_main_directory}/PostProcess {8} {current_working_directory} {current_working_directory}/Gencode/gencode.protein_coding.bed {dest_email}" - exeggutor.submit(subprocess.run, command, shell=True) + command = f"{app_main_directory}/PostProcess/./submit_job_automated_new_multiple_vcfs.sh {current_working_directory}/Genomes/{genome_ref} {result_dir}/list_vcfs.txt {guides_file} {pam} {current_working_directory}/Annotations/{annotation_name} {result_dir}/samplesID.txt {max([int(dna), int(rna)])} {mms} {dna} {rna} {3} {result_dir} {app_main_directory}/PostProcess {8} {current_working_directory} {current_working_directory}/Gencode/gencode.protein_coding.bed {dest_email} 1> {result_dir}/log_verbose.txt" + #with open(f"{result_dir}/log_verbose.txt", 'w') as log_verbose: + # log_verbose = open(f"{result_dir}/log_verbose.txt", 'w') + exeggutor.submit(subprocess.run, command, shell=True)#, stdout=log_verbose) + #subprocess.run(command, shell=True, stdout=log_verbose) return '/load', '?job=' + job_id @@ -841,7 +877,7 @@ def changePlaceholderGuideTextArea(value): if value == 'IP': return ['GAGTCCGAGCAGAAGAAGAA\nCCATCGGTGGCCGTTTGCCC'] elif value == 'GS': - return ['>sequence 1\nAAGTCCCAGGACTTCAGAAGagctgtgagaccttggc\n>sequence2\nchr1:11,130,540-11,130,751'] + return ['>sequence1\nAAGTCCCAGGACTTCAGAAGagctgtgagaccttggc\n>sequence_bed\nchr1 11130540 11130751\nchr1 1023000 1024000'] def selectSameLenGuides(list_guides): diff --git a/pages/navbar_creation.py b/pages/navbar_creation.py index 34f4dfc..e4040f8 100644 --- a/pages/navbar_creation.py +++ b/pages/navbar_creation.py @@ -3,7 +3,7 @@ import dash_core_components as dcc from app import URL, DISPLAY_OFFLINE # from index import DISPLAY_OFFLINE -PLOTLY_LOGO = 'assets/37143442.png' +PLOTLY_LOGO = 'assets/favicon.png' # DISPLAY_OFFLINE = '' search_bar = dbc.Row( @@ -55,7 +55,7 @@ def Navbar(): align="center", no_gutters=True, ), - href=URL, + href=URL + '/index', ), dbc.NavbarToggler(id="navbar-toggler"), dbc.Collapse(search_bar, id="navbar-collapse", navbar=True), diff --git a/pages/results_page.py b/pages/results_page.py index 3b63604..1591c29 100644 --- a/pages/results_page.py +++ b/pages/results_page.py @@ -64,6 +64,11 @@ def resultPage(job_id): if (not isdir(job_directory)): return html.Div(dbc.Alert("The selected result does not exist", color="danger")) + count_guides = 0 + with open(current_working_directory + 'Results/' + value + '/guides.txt') as g: + for line in g: + count_guides += 1 + # Load mismatches with open(current_working_directory + 'Results/' + value + '/Params.txt') as p: all_params = p.read() @@ -125,8 +130,8 @@ def resultPage(job_id): {'name': ['', 'CFD'], 'id':'CFD', 'type':'text'}, # {'name': ['', 'Doench 2016'], 'id':'Doench 2016', # 'type':'text'}, # Doench, only for REF or VAR - {'name': ['', 
'Doench 2016', ''], 'id':'Doench 2016', - 'type':'text'}, # REF Doench, only for Both + # {'name': ['', 'Doench 2016', ''], 'id':'Doench 2016', + # 'type':'text'}, # REF Doench, only for Both # {'name': ['Doench 2016', 'Enriched'], 'id':'Enriched', # 'type':'text'}, # VAR Doench, only for Both {'name': [ @@ -180,30 +185,30 @@ def resultPage(job_id): ) add_to_description = html.P( - 'General summary for the given guides. For each guide, the number of Off-Targets found for each Mismatch + Bulge value is shown.' + 'General summary for input guides. For each guide, the count of targets found in the reference and variant genomes is reported, grouped by mismatch count and bulge size.' ) if genome_type == 'both': add_to_description = html.P( [ - 'General summary for the given guides. For each guide, the number of ', - html.Span( - "Samples for each Class is provided", - id="tooltip-sample-class", - style={"textDecoration": "underline", "cursor": "pointer"} - ), - ', along with the number of Off-Targets found for each Mismatch + Bulge value, for both Reference and Enriched Genomes.', - dbc.Tooltip( - [ - html.Div([html.P([html.B('Class 0:'), ' Samples that does not have any On-Targets']), - html.P([html.B( - 'Class 0+:'), ' Samples that have a subset of the Reference Genome On-Targets']), - html.P([html.B( - 'Class 1:'), ' Samples that have the same On-Targets as the Reference Genome']), - html.P([html.B('Class 1+:'), ' Samples that creates at least a new On-Target, that is not present in the Reference Genome'])], - style={'display': 'inline-block'}) - ], - target="tooltip-sample-class", style={'font-size': '12px'} - ) + 'General summary for input guides. For each guide, the count of targets found in the reference and variant genomes is reported, grouped by mismatch count and bulge size.', + # html.Span( + # "Samples for each Class is provided", + # id="tooltip-sample-class", + # style={"textDecoration": "underline", "cursor": "pointer"} + # ), + # ', along with the number of Off-Targets found for each Mismatch + Bulge value, for both Reference and Enriched Genomes.', + # dbc.Tooltip( + # [ + # html.Div([html.P([html.B('Class 0:'), ' Samples that does not have any On-Targets']), + # html.P([html.B( + # 'Class 0+:'), ' Samples that have a subset of the Reference Genome On-Targets']), + # html.P([html.B( + # 'Class 1:'), ' Samples that have the same On-Targets as the Reference Genome']), + # html.P([html.B('Class 1+:'), ' Samples that creates at least a new On-Target, that is not present in the Reference Genome'])], + # style={'display': 'inline-block'}) + # ], + # target="tooltip-sample-class", style={'font-size': '12px'} + # ) ] ) final_list.append(add_to_description) @@ -211,6 +216,7 @@ html.Div( html.Div( dash_table.DataTable( + export_format='csv', id='general-profile-table', # page_size=PAGE_SIZE, columns=columns_profile_table, @@ -233,7 +239,7 @@ style_table={ # 'margin-left': "10%", 'max-height': '260px', - 'overflowY': 'scroll', + 'overflowY': 'auto', # 'overflowX': 'hidden', }, style_data={ @@ -276,7 +282,7 @@ # {'if': {'column_id': 'Reference'}, # 'width': '10%', # }], - ), id='div-general-profile-table', style={"margin-left": "10%", "margin-right": "10%"}) + ), id='div-general-profile-table', style={"margin-left": "5%", "margin-right": "5%"}) ) ) @@ -285,12 +291,12 @@ if genome_type == 'ref': final_list.append( dcc.Tabs(id="tabs-reports", value='tab-query-table', children=[ - dcc.Tab(label='Querying summary',
value='tab-query-table'), - dcc.Tab(label='Summary by gRNA', + dcc.Tab(label='Custom Ranking', value='tab-query-table'), + dcc.Tab(label='Summary by Mismatches/Bulges', value='tab-summary-by-guide'), - dcc.Tab(label='Summary by Position', + dcc.Tab(label='Query Genomic Region', value='tab-summary-by-position'), - dcc.Tab(label='Graphical Summary', + dcc.Tab(label='Graphical Reports', value='tab-summary-graphical'), @@ -314,22 +320,22 @@ def resultPage(job_id): )), id="collapse-populations", ), - ] + ], hidden=True ) ) final_list.append(html.Br()) final_list.append( dcc.Tabs(id="tabs-reports", value='tab-query-table', children=[ - dcc.Tab(label='Querying summary', value='tab-query-table'), - dcc.Tab(label='Summary by gRNA', + dcc.Tab(label='Custom Ranking', value='tab-query-table'), + dcc.Tab(label='Summary by Mismatches/Bulges', value='tab-summary-by-guide'), dcc.Tab(label='Summary by Sample', value='tab-summary-by-sample'), - dcc.Tab(label='Summary by Position', + dcc.Tab(label='Query Genomic Region', value='tab-summary-by-position'), - dcc.Tab(label='Graphical Summary', + dcc.Tab(label='Graphical Reports', value='tab-summary-graphical'), - dcc.Tab(label='Personal Card', + dcc.Tab(label='Personal Risk Cards', value='tab-graphical-sample-card'), ]) ) @@ -341,6 +347,27 @@ def resultPage(job_id): return result_page +# Generate download link summary_by_sample +@app.callback( + [Output('download-link-summary_by_sample', 'children'), + Output('interval-summary_by_sample', 'disabled')], + [Input('interval-summary_by_sample', 'n_intervals')], + [State('div-info-summary_by_sample', 'children'), + State('url', 'search')] +) +def downloadLinkSample(n, file_to_load, search): # file to load = + if n is None: + raise PreventUpdate + job_id = search.split('=')[-1] + # file_to_load = file_to_load + '.zip' + file_to_load = file_to_load + '.txt' + file_to_load = file_to_load.strip().split('/')[-1] + # print(file_to_load) + if os.path.exists(current_working_directory + 'Results/' + job_id + '/' + file_to_load): + return html.A('Download file', href=URL+'/Results/' + job_id + '/' + file_to_load, target='_blank'), True + + return 'Generating download link, Please wait...', False + # Generate download link sumbyposition @app.callback( [Output('download-link-sumbyposition', 'children'), @@ -379,8 +406,6 @@ def downloadLinkSample(n, file_to_load, search): # file to load = job_id.HG001. return 'Generating download link, Please wait...', False # Generate download link sumbyguide - - @app.callback( [Output('download-link-sumbyguide', 'children'), Output('interval-sumbyguide', 'disabled')], @@ -683,7 +708,7 @@ def clusterPage(job_id, hash): # print('qui cluster in grep') #NOTE HEADER NON SALVATO os.popen(put_header + 'LC_ALL=C fgrep ' + guide + ' ' + current_working_directory + 'Results/' + job_id + '/' + job_id + file_to_grep + ' | awk \'$6==' + position + ' && $4==\"' + chromosome + '\"\' > ' + cluster_grep_result).read() # NOTE top1 will have sample and annotation, other targets will have '.'-> 18/03 all samples and annotation are already writter for all targets - os.system('zip ' + cluster_grep_result.replace('.txt', + os.system('zip '+'-j ' + cluster_grep_result.replace('.txt', '.zip') + ' ' + cluster_grep_result+" &") final_list.append( html.Div(job_id + '.' + chromosome + '_' + position + '.' 
+ guide, @@ -937,7 +962,6 @@ def samplePage(job_id, hash): dcc.Interval(interval=5*1000, id='interval-sumbysample') ] - ) ] ) @@ -954,7 +978,7 @@ def samplePage(job_id, hash): os.system('LC_ALL=C fgrep ' + guide + ' ' + file_to_grep + ' | awk \'$14~\"' + sample + '\"\' > ' + sample_grep_result) - os.system('zip ' + sample_grep_result.replace('.txt', + os.system('zip '+'-j ' + sample_grep_result.replace('.txt', '.zip') + ' ' + sample_grep_result + " &") cols = [{"name": i, "id": i, 'type': t, 'hideable': True} @@ -1437,7 +1461,7 @@ def guidePagev3(job_id, hash): os.system('LC_ALL=C fgrep ' + guide + ' ' + file_to_grep + ' | LC_ALL=C fgrep ' + bulge_t + ' | awk \'$9==' + mms + ' && $10==' + bulge_s + '\'> ' + guide_grep_result) - os.system('zip ' + guide_grep_result.replace('.txt', '.zip') + + os.system('zip '+'-j ' + guide_grep_result.replace('.txt', '.zip') + ' ' + guide_grep_result + " &") # , shell = True) global_store_subset(job_id, bulge_t, bulge_s, mms, guide) @@ -1530,7 +1554,7 @@ def loadDistributionPopulations(sel_cel, all_guides, job_id): '\n') if 'Max_bulges' in s)).split('\t')[-1]) distributions = [dbc.Row(html.P( - 'On- and Off-Targets distributions in the Reference and Enriched Genome. For the Enriched Genome, the targets are divided into 5 SuperPopulations (EAS, EUR, AFR, AMR, SAS).', style={'margin-left': '0.75rem'}))] + 'On- and Off-Targets distributions in the Reference and Variant Genome. For the Variant Genome, the targets are divided into 5 SuperPopulations (EAS, EUR, AFR, AMR, SAS).', style={'margin-left': '0.75rem'}))] for i in range(math.ceil((mms + max_bulges + 1) / BARPLOT_LEN)): all_images = [] @@ -1854,10 +1878,17 @@ def filterPositionTable(filter_q, n, search, sel_cel, all_guides, current_page, if chrom == 'None': raise PreventUpdate # chrom = None - pos = filter_q[1] - if pos == 'None': + # pos = filter_q[1] + # if pos == 'None': + # raise PreventUpdate + # # pos = None + start = filter_q[1] + if start == 'None': + raise PreventUpdate + + end = filter_q[2] + if end == 'None': raise PreventUpdate - # pos = None current_page = current_page.split('/')[0] current_page = int(current_page) @@ -1871,11 +1902,14 @@ def filterPositionTable(filter_q, n, search, sel_cel, all_guides, current_page, file_to_grep = job_directory + job_id + '.bestMerge.txt' # file_to_grep_alt = job_directory + job_id +'.altMerge.txt' pos_grep_result = current_working_directory + \ - 'Results/' + job_id + '/' + job_id + '.' + pos + '.txt' + 'Results/' + job_id + '/' + job_id + '.' + start + "." 
+ end + '.txt' + # os.system( + # f'LC_ALL=C fgrep {guide} {file_to_grep} | LC_ALL=C grep -P \"{chrom}\t{pos}\t\" > {pos_grep_result}') + # os.system( + # f'LC_ALL=C fgrep {guide} {file_to_grep} | awk \'$5 == \"{chrom}\" && ($6>={start} && $6<={end})\' | sort -k6,6n > {pos_grep_result}') os.system( - f'LC_ALL=C fgrep {guide} {file_to_grep} | LC_ALL=C grep -P \"{chrom}\t{pos}\t\" > {pos_grep_result}') - # os.popen(f'LC_ALL=C fgrep {guide} {file_to_grep_alt} | LC_ALL=C grep -P \"{chrom}\t{pos}\t\" >> {pos_grep_result}').read() + f'awk \'$16 == \"{guide}\" && $5 == \"{chrom}\" && ($6>={start} && $6<={end})\' {file_to_grep} | sort -k6,6n > {pos_grep_result}') with open(file_to_grep, 'r') as ftg: header = ftg.readline().split('\t')[:24] @@ -1896,6 +1930,7 @@ def filterPositionTable(filter_q, n, search, sel_cel, all_guides, current_page, out_1 = [ dash_table.DataTable( id="table-position", + export_format="csv", columns=[{"name": i, "id": i} for i in df.columns], data=df.to_dict('records'), style_cell_conditional=[{ @@ -1903,8 +1938,9 @@ def filterPositionTable(filter_q, n, search, sel_cel, all_guides, current_page, 'textAlign': 'left' }], style_table={ - 'overflowX': 'scroll' - } + 'overflowX': 'scroll', + }, + page_size=10, ) ] @@ -1918,18 +1954,19 @@ def filterPositionTable(filter_q, n, search, sel_cel, all_guides, current_page, Output('div-position-filter-query', 'children'), [Input('button-filter-position', 'n_clicks')], [State('dropdown-chr-table-position', 'value'), - State('input-position', 'value')] + State('input-position-start', 'value'), + State('input-position-end', 'value')] ) -def updatePositionFilter(n, chrom, pos_start): # , pos_end +def updatePositionFilter(n, chrom, pos_start, pos_end): # , pos_end if n is None: raise PreventUpdate if pos_start == '': pos_start = 'None' - # if pos_end == '': - # pos_end = 'None' - return str(chrom) + ',' + str(pos_start) # + ',' + str(pos_end) + if pos_end == '': + pos_end = 'None' + return str(chrom) + ',' + str(pos_start) + ',' + str(pos_end) # Callback to view next/prev page on sample table @@ -1994,16 +2031,16 @@ def filterSampleTable(nPrev, nNext, filter_q, n, search, sel_cel, all_guides, cu guide = all_guides[int(sel_cel[0]['row'])]['Guide'] if genome_type == 'both': col_names_sample = ['Sample', 'Gender', 'Population', 'Super Population', 'Targets in Reference', - 'Targets in Enriched', 'Targets in Population', 'Targets in Super Population', 'PAM Creation', 'Class'] + 'Targets in Variant', 'Targets in Population', 'Targets in Super Population', 'PAM Creation', 'Class'] else: col_names_sample = ['Sample', 'Gender', 'Population', 'Super Population', 'Targets in Reference', - 'Targets in Enriched', 'Targets in Population', 'Targets in Super Population', 'PAM Creation', 'Class'] + 'Targets in Variant', 'Targets in Population', 'Targets in Super Population', 'PAM Creation', 'Class'] # Last button pressed is filtering, return the first page of the filtered table if max(btn_sample_section) == n: if genome_type == 'both': df = pd.read_csv(job_directory + job_id + '.summary_by_samples.' + guide + '.txt', sep='\t', names=col_names_sample, skiprows=1) - df = df.sort_values('Targets in Enriched', ascending=False) + df = df.sort_values('Targets in Variant', ascending=False) df.drop(['Targets in Reference'], axis=1, inplace=True) else: df = pd.read_csv(job_directory + job_id + '.summary_by_samples.' 
+ @@ -2036,7 +2073,7 @@ def filterSampleTable(nPrev, nNext, filter_q, n, search, sel_cel, all_guides, cu if genome_type == 'both': df = pd.read_csv(job_directory + job_id + '.summary_by_samples.' + guide + '.txt', sep='\t', names=col_names_sample, skiprows=1) - df = df.sort_values('Targets in Enriched', ascending=False) + df = df.sort_values('Targets in Variant', ascending=False) df.drop(['Targets in Reference'], axis=1, inplace=True) else: df = pd.read_csv(job_directory + job_id + '.summary_by_samples.' + @@ -2075,7 +2112,7 @@ def filterSampleTable(nPrev, nNext, filter_q, n, search, sel_cel, all_guides, cu if genome_type == 'both': df = pd.read_csv(job_directory + job_id + '.summary_by_samples.' + guide + '.txt', sep='\t', names=col_names_sample, skiprows=1) - df = df.sort_values('Targets in Enriched', ascending=False) + df = df.sort_values('Targets in Variant', ascending=False) df.drop(['Targets in Reference'], axis=1, inplace=True) else: df = pd.read_csv(job_directory + job_id + '.summary_by_samples.' + @@ -2258,7 +2295,7 @@ def generate_table(dataframe, id_table, genome_type, guide='', job_id='', max_ro # 'Bulge Type' 'Mismatches' 'Bulge Size' 'Targets in Reference' 'Targets in Enriched' 'Combined' 'PAM Creation' '' header.append(html.Tr([html.Th(x, style={ - 'vertical-align': 'middle', 'text-align': 'center'}) for x in ['Reference', 'Enriched', 'Combined']])) + 'vertical-align': 'middle', 'text-align': 'center'}) for x in ['Reference', 'Variant', 'Combined']])) # else: # header = [html.Tr([html.Th(col, style = {'vertical-align':'middle', 'text-align':'center'}) for col in dataframe.columns])] return html.Table( @@ -2284,8 +2321,12 @@ def check_existance_sample(job_directory, job_id, sample): @app.callback( - [Output('div-guide-image', 'children'), - Output('div-sample-image', 'children')], + # [Output('div-guide-image', 'children'), + # Output('div-sample-image', 'children')], + [Output('div-radar-chart-total', 'children'), + Output('div-population-barplot', 'children'), + Output('div-sample-image', 'children'), + Output('div-radar-chart-sample', 'children')], [Input('mm-dropdown', 'value'), Input('blg-dropdown', 'value'), Input('dropdown-superpopulation-sample', 'value'), @@ -2296,21 +2337,84 @@ def check_existance_sample(job_directory, job_id, sample): State('general-profile-table', 'data')] ) def updateImagesTabs(mm, bulge, superpopulation, population, sample, sel_cel, search, all_guides): - if sel_cel is None: - raise PreventUpdate + # if sel_cel is None: + # raise PreventUpdate + # print('entro update tab') job_id = search.split('=')[-1] job_directory = current_working_directory + 'Results/' + job_id + '/' guide = all_guides[int(sel_cel[0]['row'])]['Guide'] # search for getting job id # get guide with sel_cel and all_data + radar_chart_images = [] + population_barplots = [] guide_images = [] sample_images = [] + try: + population_barplots.extend( + [ + html.A( + html.Img( + src='data:image/png;base64,{}'.format(base64.b64encode(open( + current_working_directory + 'Results/' + job_id + '/imgs/populations_distribution_' + guide + '_' + str(int(mm)+int(bulge)) + 'total.png', 'rb').read()).decode()), + id='distribution-population' + str(int(mm)+int(bulge)), width="100%", height="auto" + ), + target="_blank", + href='/Results/' + job_id + '/imgs/' + 'populations_distribution_' + + guide + '_' + + str(int(mm)+int(bulge)) + 'total.png' + ), + html.Div(html.P('Distribution ' + str(int(mm)+int(bulge)) + ' Mismatches + Bulges ', style={ + 'display': 'inline-block'}), style={'text-align': 
'center'}) + ] + ) + except: + population_barplots = [ + html.Div( + html.H2( + "No result found for this combination of mismatches and bulges" + ) + ) + ] + + # try: + # guide_images.extend( # population barplot + # [ + # html.A( + # html.Img( + # src='data:image/png;base64,{}'.format(base64.b64encode(open( + # current_working_directory + 'Results/' + job_id + '/imgs/populations_distribution_' + guide + '_' + str(int(mm)+int(bulge)) + 'total.png', 'rb').read()).decode()), + # id='distribution-population' + str(int(mm)+int(bulge)), width="100%", height="auto" + # ), + # target="_blank", + # href='/Results/' + job_id + '/imgs/' + 'populations_distribution_' + + # guide + '_' + + # str(int(mm)+int(bulge)) + 'total.png' + # ), + # html.Div(html.P('Distribution ' + str(int(mm)+int(bulge)) + ' Mismatches + Bulges ', style={ + # 'display': 'inline-block'}), style={'text-align': 'center'}) + # ] + # ) + # except: + # guide_images.append( + # html.Div(html.P('No Targets found with ' + str(int(mm)+int(bulge)) + ' Mismatches + Bulges', style={ + # 'display': 'inline-block'}), style={'text-align': 'center'}), + # # html.Div(html.P('Distribution ' + str(mm) + ' Mismatches + Bulges ', style = {'display':'inline-block'} ),style = {'text-align':'center'}) + # ) + + # guide = guide.replace("N", "") radar_img = '/imgs/summary_single_guide_' + \ - guide.replace("N", "") + '_' + str(mm) + \ + guide + '_' + str(mm) + \ '.' + str(bulge) + '_TOTAL.png' + if not os.path.isfile(f"{job_directory}/{radar_img}"): + # try: + # print('faccio radar chart') + os.system(f"python {app_main_directory}/PostProcess/generate_img_radar_chart.py {guide} {job_directory}/guide_dict_{guide}.json {job_directory}/motif_dict_{guide}.json {mm} {bulge} TOTAL {job_directory}/imgs/") + # except: + # pass + img_found = False try: radar_src = 'data:image/png;base64,{}'.format(base64.b64encode(open( @@ -2325,113 +2429,93 @@ def updateImagesTabs(mm, bulge, superpopulation, population, sample, sel_cel, se radar_href = '' if img_found: - guide_images.extend([ - - dbc.Row(html.Br()), - dbc.Row( - [ - dbc.Col( - html.A( - html.Img(src=radar_src, id='radar-img-guide', - width="100%", height="auto"), - target="_blank", - href=radar_href - ), - # width=10 - ) - ] - ), - ]) + radar_chart_images.append( + html.A( + html.Img(src=radar_src, id='radar-img-guide', + width="100%", height="auto"), + target="_blank", + href=radar_href + ) + ) else: - guide_images.extend([ - - dbc.Row(html.Br()), - dbc.Row( - [ - dbc.Col( - html.H2( - "No result found for this combination of mismatches and bulges" - ), - # width=10 - ) - ] - ), - ]) + radar_chart_images.append( + html.H2( + "No result found for this combination of mismatches and bulges" + ) + ) class_images = [(sample, 'Samples'), (population, 'Population'), (superpopulation, 'Superpopulation')] for c in class_images: img_found = False if c[0]: + current_img = job_directory + '/imgs/summary_single_guide_' +\ + guide + '_' + str(mm) + '.'+str(bulge) + '_' + c[0] + '.png' + if not os.path.isfile(current_img): + try: + os.system( + f"python {app_main_directory}/PostProcess/generate_img_radar_chart.py {guide} {job_directory}/guide_dict_{guide}.json {job_directory}/motif_dict_{guide}.json {mm} {bulge} {c[0]} {job_directory}/imgs/") + except: + pass try: - first_img_source = 'data:image/png;base64,{}'.format(base64.b64encode(open(job_directory + '/imgs/summary_single_guide_' + guide.replace( - "N", "") + '_' + str(mm) + '.'+str(bulge) + '_' + c[0] + '.png', 'rb').read()).decode()) + first_img_source = 
'data:image/png;base64,{}'.format( + base64.b64encode(open(current_img, 'rb').read()).decode()) img_found = True except: first_img_source = 'data:image/png;base64,{}'.format(base64.b64encode( open(current_working_directory+'/assets/placeholder.png', 'rb').read()).decode()) try: - first_img_href = 'Results/' + job_id + '/imgs/summary_single_guide_' + \ - guide.replace("N", "") + '_' + str(mm) + "." + str(bulge) + \ + first_img_href = 'Results/' + job_id + '/imgs/summary_single_guide_' +\ + guide + '_' + str(mm) + "." + str(bulge) +\ '_' + c[0] + '.png' except: first_img_href = '' - sample_images.append(dbc.Row(html.Br())) + # sample_images.append(dbc.Row(html.Br())) if img_found: sample_images.append( - dbc.Row( - [ - dbc.Col( - html.A( - html.Img(src=first_img_source, - width="100%", height="auto"), - target="_blank", - href=first_img_href - ), - # width=10 - ), - ], - no_gutters=True - ), + dbc.Col( + html.A( + html.Img(src=first_img_source, + width="100%", height="auto"), + target="_blank", + href=first_img_href + ) + ) ) else: sample_images.append( - dbc.Row( - [ - dbc.Col( - html.H2( - "No result found for this combination of mismatches and bulges" - ), - # width=10 - ), - ], - no_gutters=True - ), + dbc.Col( + html.H2( + "No result found for this combination of mismatches and bulges" + ) + ) ) - return guide_images, sample_images + return radar_chart_images, population_barplots, guide_images, sample_images # Open in browser the result directory -@app.callback( - Output('div-open-result-directory', 'children'), - [Input('button-open-result-directory', 'n_clicks')], - [State('url', 'search'), - State('div-open-result-directory', 'children')] -) -def openResultDirectory(n, search, guide): - if n is None: - raise PreventUpdate - # TODO decidere se aprire tutta la cartella, solo il file, e nel secondo caso creare copia submit job che non rimuova .targets.GUIDE.txt - job_id = search.split('=')[-1] - wb.open_new_tab(current_working_directory + 'Results/' + - job_id + '/' + job_id + '.targets.' + guide + '.txt') - raise PreventUpdate +# @app.callback( +# Output('div-open-result-directory', 'children'), +# [Input('button-open-result-directory', 'n_clicks')], +# [State('url', 'search'), +# State('div-open-result-directory', 'children')] +# ) +# def openResultDirectory(n, search, guide): +# if n is None: +# raise PreventUpdate +# # TODO decidere se aprire tutta la cartella, solo il file, e nel secondo caso creare copia submit job che non rimuova .targets.GUIDE.txt +# job_id = search.split('=')[-1] +# wb.open_new_tab(current_working_directory + 'Results/' + +# job_id + '/' + job_id + '.targets.' + guide + '.txt') +# raise PreventUpdate -@app.callback( +@ app.callback( [Output('download-link-personal-card', 'children'), Output('download-link-personal-card', 'hidden'), + Output('div-personal-plot', 'children'), + Output('div-private-plot', 'children'), Output('div-table-sample-card', 'children'), Output('div-top-target-sample-card', 'children')], [Input('button-sample-card', 'n_clicks')], @@ -2454,25 +2538,43 @@ def generate_sample_card(n, sample, sel_cel, all_guides, search): pam_creation = df.loc[sample, 8] file_to_grep = job_directory + job_id + '.bestMerge.txt' + integrated_to_grep = job_directory+job_id + \ + '.bestMerge.txt.integrated_results.tsv' + integrated_personal = job_directory + job_id + '.' + \ + sample + '.' + guide + '.integrated.personal.txt' + integrated_private = job_directory + job_id + '.' + \ + sample + '.' 
+ guide + '.integrated.private.txt' # file_to_grep_alt = job_directory + job_id +'.altMerge.txt' sample_grep_result = current_working_directory + 'Results/' + \ job_id + '/' + job_id + '.' + sample + '.' + guide + '.private.txt' + # copy header from integrated results into sample files + os.system(f"head -1 {integrated_to_grep} > {integrated_personal}") + os.system(f"head -1 {integrated_to_grep} > {integrated_private}") + os.system(f"LC_ALL=C fgrep {guide} {integrated_to_grep} | fgrep {sample} >> {integrated_personal}") + os.system(f"LC_ALL=C awk \'$32==\"{sample}\"\' {integrated_personal} >> {integrated_private}") + os.system(f"LC_ALL=C fgrep {guide} {file_to_grep} | awk \'$14==\"{sample}\"\' > {sample_grep_result}") + + # plot for images in personal card os.system( - f"LC_ALL=C fgrep {guide} {file_to_grep} | awk \'$14==\"{sample}\"\' > {sample_grep_result}") + f"python {app_main_directory}/PostProcess/CRISPRme_plots_personal.py {integrated_personal} {current_working_directory}/Results/{job_id}/imgs/ {guide}.{sample}.personal > {current_working_directory}/Results/{job_id}/warnings.txt 2>&1") + os.system( + f"python {app_main_directory}/PostProcess/CRISPRme_plots_personal.py {integrated_private} {current_working_directory}/Results/{job_id}/imgs/ {guide}.{sample}.private > {current_working_directory}/Results/{job_id}/warnings.txt 2>&1") + os.system(f"rm -f {current_working_directory}/Results/{job_id}/warnings.txt {integrated_private} {integrated_personal}") + private = 0 for line in open(sample_grep_result): private += 1 # print(personal, pam_creation, private) results_table = pd.DataFrame([[personal, pam_creation, private]], columns=[ - 'Personal', 'PAM Creation', 'Private']).astype(str) + 'Personal', 'PAM Creation', 'Private']).astype(str) if int(private) > 0: tmp_file = current_working_directory + 'Results/' + \ job_id + '/' + job_id + '.' + sample + ".tmp_card.txt" tmp_file_2 = current_working_directory + 'Results/' + \ job_id + '/' + job_id + '.' + sample + ".tmp_card_2.txt" os.system( - f"LC_ALL=C sort -k24rg \"{sample_grep_result}\" > \"{tmp_file}\" ; head -5 {tmp_file} > \"{tmp_file_2}\"") + f"LC_ALL=C sort -k21,21rg \"{sample_grep_result}\" > \"{tmp_file}\" ; head -5 {tmp_file} > \"{tmp_file_2}\"") ans = pd.read_csv(tmp_file_2, sep='\t', header=None, usecols=range(0, 23)) with open(file_to_grep) as f_: @@ -2481,8 +2583,8 @@ def generate_sample_card(n, sample, sel_cel, all_guides, search): ans = ans.astype(str) # os.system(f"rm {tmp_file} &") #do not delete temp file until zip is created os.system(f"rm {tmp_file_2} &") - os.system('zip ' + tmp_file.replace('.txt', - '.zip') + ' ' + tmp_file) # create zip file to download result card /blocking operation on the system + # create zip file to download result card /blocking operation on the system to avoid updating the page before the zip is created + os.system('zip '+'-j ' + tmp_file.replace('.txt','.zip') + ' ' + tmp_file) # do not delete temp file until zip is created os.system(f"rm {tmp_file} &") @@ -2502,7 +2604,7 @@ def generate_sample_card(n, sample, sel_cel, all_guides, search): with open(current_working_directory + 'Results/' + job_id + '/' + job_id + '.' + sample + '.' 
+ guide + '.sample_card.txt', "r") as file_in: infos = file_in.readline().strip().split('\t') results_table = pd.DataFrame([[infos[0], infos[1], infos[2]]], columns=[ - 'Personal', 'PAM Creation', 'Private']) + 'Personal', 'PAM Creation', 'Private']) if int(infos[2]) > 0: targets = [] for line in file_in: @@ -2514,8 +2616,14 @@ def generate_sample_card(n, sample, sel_cel, all_guides, search): c = f_.readline().strip() ans.columns = c.split('\t')[:23] - # image_sample_card = 'data:image/png;base64,{}'.format(base64.b64encode(open( - # current_working_directory + 'Results/' + job_id + '/' + sample + "_personal_card.png", 'rb').read()).decode()) + # image for personal and private + image_personal_top = 'data:image/png;base64,{}'.format(base64.b64encode(open( + current_working_directory + 'Results/' + job_id + f'/imgs/CRISPRme_top_1000_log_for_main_text_{guide}.{sample}.personal.png', 'rb').read()).decode()) + image_private_top = 'data:image/png;base64,{}'.format(base64.b64encode(open( + current_working_directory + 'Results/' + job_id + f'/imgs/CRISPRme_top_1000_log_for_main_text_{guide}.{sample}.private.png', 'rb').read()).decode()) + + # os.system( + # f"rm -f {current_working_directory}/Results/{job_id}/warnings.txt {integrated_private} {integrated_personal}") try: file_to_load = job_id + '.' + sample + '.tmp_card.zip' ans[''] = [''] * ans.shape[0] # taaaaaaaaaac @@ -2526,18 +2634,19 @@ def generate_sample_card(n, sample, sel_cel, all_guides, search): ans_cols.insert(0, '') ans = ans[ans_cols] out_1 = [ - # dbc.Col( - html.A('Download private targets', href=URL+'/Results/' + \ + html.A('Download private targets', href=URL+'/Results/' + job_id + '/' + file_to_load, target='_blank'), False, - # html.A( - # html.Img(src=image_sample_card, id='sample-card-img', - # width="100%", height="auto"), - # target="_blank", - - # ), - # width=10 - # ), + html.A( + html.Img(src=image_personal_top, id='sample-personal-top', + width="100%", height="auto"), + target="_blank" + ), + html.A( + html.Img(src=image_private_top, id='sample-private-top', + width="100%", height="auto"), + target="_blank" + ), dash_table.DataTable( id="results-table", columns=[{"name": i, "id": i} for i in results_table.columns], @@ -2569,9 +2678,11 @@ def generate_sample_card(n, sample, sel_cel, all_guides, search): # ), # width=10 # ), - html.A('Download private targets', href=URL+'/Results/' + \ + html.A('Download private targets', href=URL+'/Results/' + job_id + '/' + file_to_load, target='_blank'), False, + [], + [], dash_table.DataTable( id="results-table", columns=[{"name": i, "id": i} for i in results_table.columns], @@ -2585,13 +2696,15 @@ def generate_sample_card(n, sample, sel_cel, all_guides, search): # Load the table/children under the tab value -@app.callback( +@ app.callback( Output('div-tab-content', 'children'), [Input('tabs-reports', 'value'), Input('general-profile-table', 'selected_cells')], [State('general-profile-table', 'data'), State('url', 'search'), State('div-genome-type', 'children')] + + ) def updateContentTab(value, sel_cel, all_guides, search, genome_type): if value is None or sel_cel is None or not sel_cel or not all_guides: @@ -2618,17 +2731,17 @@ def updateContentTab(value, sel_cel, all_guides, search, genome_type): # Show Summary by Guide table fl.append( html.P( - ['Summary table counting the number of targets found in the Enriched Genome for each combination of Bulge Type, Bulge Size and Mismatch. Select \'Show Targets\' to view the corresponding list of targets. 
', + ['Summary table counting the number of targets found in the Reference and Variant Genome for each combination of Bulge Type, Bulge Size and Mismatch. Select \'Show Targets\' to view the corresponding list of targets. ', # html.A('Click here', href = URL + '/data/' + job_id + '/' + job_id + '.targets.' + guide + '.zip' ,target = '_blank', id = 'download-full-list' ), ' to download the full list of targets.' ] ) ) # fl.append(html.Button(html.A('Download Full list of targets', href = URL + '/data/' + job_id + '/' + job_id + '.targets.' + guide + '.zip' ,target = '_blank', style = {'text-decoration':'none', 'color':'black'} ))) - fl.append(html.Button('Open Result Directory', - id='button-open-result-directory')) - fl.append(html.Div(guide, id='div-open-result-directory', - style={'display': 'none'})) + # fl.append(html.Button('Open Result Directory', + # id='button-open-result-directory')) + # fl.append(html.Div(guide, id='div-open-result-directory', + # style={'display': 'none'})) fl.append(html.Br()) df = pd.read_csv(job_directory + job_id + '.summary_by_guide.' + guide + '.txt', sep='\t') @@ -2659,23 +2772,23 @@ def updateContentTab(value, sel_cel, all_guides, search, genome_type): elif value == 'tab-summary-by-sample': # Show Summary by Sample table fl.append( - html.P('Summary table counting the number of targets found in the Enriched Genome for each sample. Filter the table by selecting the Population or Superpopulation desired from the dropdowns.') + html.P('Summary table counting the number of targets found in the Variant Genome for each sample. Filter the table by selecting the Population or Superpopulation desired from the dropdowns.') ) if genome_type == 'both': # col_names_sample = ['Sample', 'Gender', 'Population', 'Super Population', 'Targets in Reference', 'Targets in Enriched', 'Targets in Population', 'Targets in Super Population', 'PAM Creation', 'Class'] col_names_sample = ['Sample', 'Gender', 'Population', 'Super Population', 'Targets in Reference', - 'Targets in Enriched', 'Targets in Population', 'Targets in Super Population', 'PAM Creation'] + 'Targets in Variant', 'Targets in Population', 'Targets in Super Population', 'PAM Creation'] df = pd.read_csv(job_directory + job_id + '.summary_by_samples.' + guide + '.txt', sep='\t', names=col_names_sample, skiprows=1) - df = df.sort_values('Targets in Enriched', ascending=False) + df = df.sort_values('Targets in Variant', ascending=False) df.drop(['Targets in Reference'], axis=1, inplace=True) else: # col_names_sample = ['Sample', 'Gender', 'Population', 'Super Population', 'Targets in Reference', 'Targets in Enriched', 'Targets in Population', 'Targets in Super Population', 'PAM Creation', 'Class'] col_names_sample = ['Sample', 'Gender', 'Population', 'Super Population', 'Targets in Reference', - 'Targets in Enriched', 'Targets in Population', 'Targets in Super Population', 'PAM Creation'] + 'Targets in Variant', 'Targets in Population', 'Targets in Super Population', 'PAM Creation'] df = pd.read_csv(job_directory + job_id + '.summary_by_samples.' + guide + '.txt', sep='\t', names=col_names_sample, skiprows=1) - df = df.sort_values('Targets in Enriched', ascending=False) + df = df.sort_values('Targets in Variant', ascending=False) df.drop(['Targets in Reference'], axis=1, inplace=True) df.drop(['Class'], axis=1, inplace=True) more_info_col = [] @@ -2695,6 +2808,8 @@ def updateContentTab(value, sel_cel, all_guides, search, genome_type): html.Div ( [ + html.Div(job_directory + job_id + '.summary_by_samples.' 
+ guide, + style={'display': 'none'}, id='div-info-summary_by_sample'), dbc.Row( [ dbc.Col(html.Div(dcc.Dropdown( @@ -2708,6 +2823,16 @@ def updateContentTab(value, sel_cel, all_guides, search, genome_type): 'Filter', id='button-filter-population-sample'))) ] ), + dbc.Row( + dbc.Col( + html.Div( + [ + html.P('Generating download link, Please wait...', id='download-link-summary_by_sample'), + dcc.Interval(interval=1*1000, id='interval-summary_by_sample'), + ] + ) + ) + ) ], style={'width': '50%'} ) @@ -2735,12 +2860,12 @@ def updateContentTab(value, sel_cel, all_guides, search, genome_type): elif value == 'tab-summary-by-position': # Show Summary by position table fl.append( - html.P('Summary table containing all the targets found in a specific position of the genome. For each position, the enriched target with the lowest Mismatch + Bulge count is shown (if no target was found in the Enriched Genome, the correspondig reference one is shown), along with his Mismatch and Bulge Size values.' + - ' The subtable \'Targets in Cluster by Mismatch Value\' represents the number of targets found in that position for a particular Mismatch-Bulge Size pair.') + html.P( + 'Summary table containing all the targets found in a specific range of positions (chr, start, end) of the genome.') ) fl.append( - html.P('Filter the table by selecting the chromosome of interest and writing the start and/or end position of the region to view.') + html.P('Filter the table by selecting the chromosome of interest and writing the start and end position of the region to view.') ) # Dropdown chromosomes try: @@ -2771,7 +2896,7 @@ def updateContentTab(value, sel_cel, all_guides, search, genome_type): for chr_name in chr_file] # Colonne tabella: chr, pos, target migliore, min mm, min bulges, num target per ogni categoria di mm e bulge, show targets; ordine per total, poi mm e poi bulge - start_time = time.time() + # start_time = time.time() # df = pd.read_csv( job_directory + job_id + '.summary_by_position.' 
+ guide +'.txt', sep = '\t') # df.rename(columns = {'#Chromosome':'Chromosome'}, inplace = True) # more_info_col = [] @@ -2787,12 +2912,15 @@ def updateContentTab(value, sel_cel, all_guides, search, genome_type): [ dbc.Col(html.Div(dcc.Dropdown( options=chr_file, id='dropdown-chr-table-position', placeholder='Select a chromosome'))), + # dbc.Col( + # html.Div(dcc.Input(placeholder='Position', id='input-position'))), + dbc.Col( + html.Div(dcc.Input(placeholder='Start Position', id='input-position-start'))), dbc.Col( - html.Div(dcc.Input(placeholder='Position', id='input-position'))), - # dbc.Col(html.Div(dcc.Input(placeholder = 'Start Position', id = 'input-position-start'))), - # dbc.Col(html.Div(dcc.Input(placeholder = 'End Position', id = 'input-position-end'))), + html.Div(dcc.Input(placeholder='End Position', id='input-position-end'))), dbc.Col(html.Div(html.Button( 'Filter', id='button-filter-position'))) + # ) ] ), ], @@ -2803,7 +2931,7 @@ def updateContentTab(value, sel_cel, all_guides, search, genome_type): # Folr keep current filter: chr,pos_start,pos_end fl.append(html.Div('None,None,None', id='div-position-filter-query', style={'display': 'none'})) - start_time = time.time() + # start_time = time.time() fl.append(html.Div( style={'text-align': 'center'}, id='div-table-position' ) @@ -2820,7 +2948,7 @@ def updateContentTab(value, sel_cel, all_guides, search, genome_type): samples = df.iloc[:, 0] fl.append( html.P( - 'In this page single Sample Risk Card can be generated and inspected') + 'Summary page containing the single Personal Risk card to be inspected and downloaded') ) fl.append( html.Div @@ -2840,7 +2968,16 @@ def updateContentTab(value, sel_cel, all_guides, search, genome_type): style={'width': '50%'} ) ) - + fl.append(html.Div( + [ + dbc.Row( + [ + dbc.Col(html.Div('', id='div-personal-plot')), + dbc.Col(html.Div('', id='div-private-plot')) + ] + ) + ] + )) fl.append(html.Div( '', id='div-table-sample-card', style={'text-align': 'center', 'margin-left': '1%', 'margin-right': '1%'} ) @@ -2852,10 +2989,12 @@ def updateContentTab(value, sel_cel, all_guides, search, genome_type): fl.append(html.Div('', id='div-sample-card')) return fl elif value == 'tab-query-table': + fl.append(html.P( + 'Summary page to query the final result file selecting one/two column to group by the table and extract requested targets')) path = current_working_directory+"/Results/"+job_id+"/"+job_id+".db" conn = sqlite3.connect(path) c = conn.cursor() - dff = pd.DataFrame(columns=['', 'crRNA', 'Reference', 'DNA', 'Chromosome', + dff = pd.DataFrame(columns=[' ', 'crRNA', 'Reference', 'DNA', 'Chromosome', 'Position', 'Direction', 'Mismatches', 'Bulge_Size', 'PAM_gen', 'SNP', 'CFD', 'CFD_ref', 'Highest_CFD_Risk_Score', @@ -2864,8 +3003,8 @@ def updateContentTab(value, sel_cel, all_guides, search, genome_type): 'Target2 :with lowest Mismatches + Bulge Count': ['Mismatches', 'Bulge_Size', 'Total', 'CFD', 'CFD_Risk_Score', 'CFD_Absolute_Risk_Score']} # target_options = {'Mismatches': ['Bulge_Size', 'Total', 'CFD'], 'Bulge_Size': ['Mismatches', 'Total', 'CFD'], 'Total': ['Mismatches', 'Bulge_Size', 'CFD'], 'CFD': [ # 'Mismatches', 'Bulge_Size', 'Total'], 'Highest_CFD_Risk_Score': [], 'Highest_CFD_Absolute_Risk_Score': [], 'CFD_Risk_Score': [], 'CFD_Absolute_Risk_Score': []} - all_options = {'Target1 :with highest CFD': [' Mismatches', ' Bulges', ' Mismatch+Bulges', ' CFD', ' Risk_Score', ' Absolute_Risk_Score'], - 'Target2 :with lowest Mismatches + Bulges Count': [' Mismatches', ' Bulges', ' 
Mismatch+Bulges', ' CFD', ' Risk_Score', ' Absolute_Risk_Score']} + all_options = {'Target1 :with highest CFD': [' Mismatches', ' Bulges', ' Mismatch+Bulges', ' CFD', ' Risk Score', ' Absolute Risk Score'], + 'Target2 :with lowest Mismatches + Bulges Count': [' Mismatches', ' Bulges', ' Mismatch+Bulges', ' CFD', ' Risk Score', ' Absolute Risk Score']} # target_o # all_options = {'Target1 :with highest CFD': [' Mismatches', ' Bulges', ' Mismatch+Bulges', ' CFD'], @@ -2875,20 +3014,31 @@ def updateContentTab(value, sel_cel, all_guides, search, genome_type): label = [{'label': lab} for lab in all_options.keys()] value = [{'value': val} for val in all_value.keys()] target_opt = [label, value] + # try: + # img_panel = dbc.Row( # row with plot + # [ + # dbc.Col( + # [ + # html.A(html.Img(src='data:image/png;base64,{}'.format(base64.b64encode(open( + # current_working_directory + 'Results/' + job_id + f'/imgs/CRISPRme_top_1000_log_for_main_text_{guide}.png', 'rb').read()).decode()), + # id='top-1000-score', width="80%", height="auto"), + # target="_blank") + # ], width={"size": 10, "offset": 2} + # ) + # ] + # ) + # except: + # img_panel = dbc.Row( # row with plot + # [ + # dbc.Col( + # [html.Div()] + # ) + # ] + # ) + query_tab_content = html.Div( [ - dbc.Row( # row with plot - [ - dbc.Col( - [ - html.A(html.Img(src='data:image/png;base64,{}'.format(base64.b64encode(open( - current_working_directory + 'Results/' + job_id + '/imgs/CRISPRme_top_1000_log_for_main_text.png', 'rb').read()).decode()), - id='top-1000-score', width="80%", height="auto"), - target="_blank") - ], width={"size": 10, "offset": 2} - ) - ] - ), + # img_panel, dbc.Row( # row with main group by, secondo group by and thresholds [ dbc.Col( # col0 phantom target select @@ -3098,13 +3248,15 @@ def updateContentTab(value, sel_cel, all_guides, search, genome_type): # fl.append( return fl - else: + else: # tab-graphical # Show Report images samp_style = {} if genome_type == 'ref': samp_style = {'display': 'none'} - fl.append(html.Br()) + # fl.append(html.Br()) + fl.append(html.P( + 'Summary Graphical report collecting all the plots and images produced during the search')) opt_mm = [] for i in range(int(mms)+1): @@ -3112,93 +3264,41 @@ def updateContentTab(value, sel_cel, all_guides, search, genome_type): opt_blg = [] for i in range(int(max_bulges)+1): opt_blg.append({'label': str(i), 'value': str(i)}) - fl_mm_blg = [html.Div([html.P("Select Mismatches"), - dcc.Dropdown(id='mm-dropdown', - options=opt_mm, - value='0', - clearable=False, - style={'width': '60px'} - ) - ], style={'display': 'inline-block', "margin-right": "20px"}), - html.Div([html.P("Select Bulges"), - dcc.Dropdown(id='blg-dropdown', - options=opt_blg, - value='0', - clearable=False, - style={'width': '60px'} - ) - ], style={'display': 'inline-block', "margin-right": "20px"}) - ] - ''' - fl_buttons_0 = [] - fl_buttons_1 = [] - fl_buttons_2 = [] - if int(max_bulges) >= 0: - for i in range(10): - if (i <= int(mms)): # TODO change into (i <= (int(mms) + int(max_bulges))) - fl_buttons_0.append( - html.Button(str(i) + ' MM + 0 BUL', - id='btn' + str(i) + '.0'), - ) - else: - fl_buttons_0.append( - html.Button(str(i) + ' total', id='btn' + - str(i), style={'display': 'none'}), - ) - - if int(max_bulges) >= 1: - for i in range(10): - if (i <= int(mms)): # TODO change into (i <= (int(mms) + int(max_bulges))) - fl_buttons_1.append( - html.Button(str(i) + ' MM + 1 BUL', - id='btn' + str(i) + '.1'), - ) - else: - fl_buttons_1.append( - html.Button(str(i) + ' total', id='btn' + - 
str(i), style={'display': 'none'}), - ) - - if int(max_bulges) >= 2: - for i in range(10): - if (i <= int(mms)): # TODO change into (i <= (int(mms) + int(max_bulges))) - fl_buttons_2.append( - html.Button(str(i) + ' MM + 2 BUL', - id='btn' + str(i) + '.2'), - ) - else: - fl_buttons_2.append( - html.Button(str(i) + ' total', id='btn' + - str(i), style={'display': 'none'}), - ) - ''' - fl.append(html.Br()) - - radar_img = '/imgs/summary_single_guide_' + guide.replace("N", "") + '_' + str( - 0) + '_total_0.0_TOTAL.png' # TODO choose between 0 mm and max used mms - # barplot_img = 'summary_histogram_' + guide + '_' + str(0) + '_total.png' - # try: #NOTE serve per non generare errori se il barplot non è stato fatto - # barplot_src = 'data:image/png;base64,{}'.format(base64.b64encode(open(current_working_directory + 'Results/' + job_id + '/' + barplot_img, 'rb').read()).decode()) + # fl.append(html.Br()) + + # guide = guide.strip() + # radar_img = '/imgs/summary_single_guide_' + guide + '_' + str( + # 0) + '_total_0.0_TOTAL.png' # TODO choose between 0 mm and max used mms + # if not os.path.isfile(f"{job_directory}/{radar_img}"): + # try: + # os.system(f"python {app_main_directory}/PostProcess/generate_img_radar_chart.py {guide} {job_directory}/guide_dict_{guide}.json {job_directory}/motif_dict_{guide}.json 0 0 TOTAL {job_directory}/imgs/") + # except: + # pass + # barplot_img = 'summary_histogram_' + \ + # guide + '_' + str(0) + '_total.png' + # try: # NOTE serve per non generare errori se il barplot non è stato fatto + # barplot_src = 'data:image/png;base64,{}'.format(base64.b64encode(open( + # current_working_directory + 'Results/' + job_id + '/' + barplot_img, 'rb').read()).decode()) # except: - # barplot_src = '' + # barplot_src = '' # try: - # barplot_href = 'assets/Img/' + job_id + '/' + barplot_img + # barplot_href = 'assets/Img/' + job_id + '/' + barplot_img # except: - # barplot_href = '' + # barplot_href = '' - img_found = False - try: - radar_src = 'data:image/png;base64,{}'.format(base64.b64encode(open( - current_working_directory + 'Results/' + job_id + '/' + radar_img, 'rb').read()).decode()) - img_found = True - except: - radar_src = 'data:image/png;base64,{}'.format(base64.b64encode( - open(current_working_directory+'/assets/placeholder.png', 'rb').read()).decode()) - try: - radar_href = 'assets/Img/' + job_id + '/' + radar_img - except: - radar_href = '' + # img_found = False + # try: + # radar_src = 'data:image/png;base64,{}'.format(base64.b64encode(open( + # current_working_directory + 'Results/' + job_id + '/' + radar_img, 'rb').read()).decode()) + # img_found = True + # except: + # radar_src = 'data:image/png;base64,{}'.format(base64.b64encode( + # open(current_working_directory+'/assets/placeholder.png', 'rb').read()).decode()) + # try: + # radar_href = 'assets/Img/' + job_id + '/' + radar_img + # except: + # radar_href = '' if genome_type != 'ref': population_1000gp = associateSample.loadSampleAssociation( @@ -3214,125 +3314,212 @@ def updateContentTab(value, sel_cel, all_guides, search, genome_type): super_populations = [] populations = [] # fl.append(html.P('Select Mismatch Value')) - fl.append( - html.Div( - [dbc.Row( - [ - dbc.Col( - [ - # html.P('Select Mismatch Value'), - html.Div(fl_mm_blg) - # html.Div(fl_buttons_0), - # html.Div(fl_buttons_1), - # html.Div(fl_buttons_2), + top1000_image = html.Div( + html.A(html.Img(src='data:image/png;base64,{}'.format(base64.b64encode(open( + current_working_directory + 'Results/' + job_id + 
+                f'/imgs/CRISPRme_top_1000_log_for_main_text_{guide}.png', 'rb').read()).decode()),
+                id='top-1000-score', width="80%", height="auto"),
+                target="_blank")
+        )
-                            ]
-                        ),
-                        dbc.Col(
-                            [
-                                html.P('Select Individual Data',
-                                       style=samp_style),
-                                dbc.Row([
-                                    dbc.Col(html.Div(dcc.Dropdown(
-                                        options=super_populations, id='dropdown-superpopulation-sample', placeholder='Select a Super Population', style=samp_style))),
-                                    dbc.Col(html.Div(dcc.Dropdown(
-                                        options=populations, id='dropdown-population-sample', placeholder='Select a Population', style=samp_style))),
-                                    dbc.Col(html.Div(dcc.Dropdown(
-                                        id='dropdown-sample', placeholder='Select a Sample', style=samp_style))),
-                                ])
-                            ]
-                        )
-                    ]
-                ),
-                ]
-            )
-        )
-
-        fl.append(html.Br())
-        fl.append(html.Br())
-        fl.append(html.Hr())
-        if img_found:
-            fl.append(
-                html.Div(
-                    [
-                        dbc.Row(
-                            [
-                                dbc.Col([  # Guide part
-                                    html.Div(
-                                        [
-
-                                            dbc.Row(html.Br()),
-                                            dbc.Row(
-                                                [
-                                                    dbc.Col(
-                                                        html.A(
-                                                            html.Img(
-                                                                src=radar_src, id='barplot-img-guide', width="100%", height="auto"),
-                                                            target="_blank",
-                                                            href=radar_href
-                                                        ),
-                                                        # width=10
-                                                    )
-                                                ]
-                                            ),
-                                        ],
-                                        id='div-guide-image'
-                                    )
-                                ]),
-                                dbc.Col([  # Sample part
-                                    html.Div(
-                                        [
-
-                                        ],
-                                        id='div-sample-image'
-                                    )
-                                ])
-                            ]
-                        )
-                    ]
-
-                )
-            )
-        else:
-            fl.append(
-                html.Div(
-                    [
-                        dbc.Row(
-                            [
-                                dbc.Col([  # Guide part
-                                    html.Div(
-                                        [
-
-                                            dbc.Row(html.Br()),
-                                            dbc.Row(
-                                                [
-                                                    dbc.Col(
-                                                        html.H2(
-                                                            "No result found for this combination of mismatches and bulges"
-                                                        ),
-                                                        # width=10
-                                                    )
-                                                ]
-                                            ),
-                                        ],
-                                        id='div-guide-image'
-                                    )
-                                ]),
-                                dbc.Col([  # Sample part
-                                    html.Div(
-                                        [
-
-                                        ],
-                                        id='div-sample-image'
-                                    )
-                                ])
-                            ]
-                        )
-                    ]
-
-                )
-            )
-
-        fl.append(html.Br())
-        fl.append(html.Br())
+        total_buttons = [
+            dbc.Col(
+                html.Div(
+                    [
+                        html.P("Select Mismatches"),
+                        dcc.Dropdown(id='mm-dropdown',
+                                     options=opt_mm,
+                                     value='0',
+                                     clearable=False,
+                                     )
+                    ]
+                ), width=4
+            ),
+            dbc.Col(
+                html.Div(
+                    [
+                        html.P("Select Bulges"),
+                        dcc.Dropdown(id='blg-dropdown',
+                                     options=opt_blg,
+                                     value='0',
+                                     clearable=False,
+                                     )
+                    ]
+                ), width=4
+            )
+        ]
+        # TO BE MOVED AFTER THE BARPLOT AND THE TOTAL RADAR CHART
+        sample_buttons = [
+            dbc.Col(
+                html.Div(
+                    [
+                        html.P("Select a Superpopulation"),
+                        html.Div(
+                            dcc.Dropdown(
+                                options=super_populations,
+                                id='dropdown-superpopulation-sample',
+                                placeholder='SuperPopulation',
+                                style=samp_style),
+                        )
+                    ]
+                ), md=4
+            ),
+            dbc.Col(
+                html.Div(
+                    [
+                        html.P("Select a Population"),
+                        html.Div(dcc.Dropdown(
+                            options=populations,
+                            id='dropdown-population-sample',
+                            placeholder='Population',
+                            style=samp_style),
+                        )
+                    ]
+                ), md=4
+            ),
+            dbc.Col(
+                html.Div(
+                    [
+                        html.P("Select a Sample"),
+                        html.Div(dcc.Dropdown(
+                            id='dropdown-sample',
+                            placeholder='Sample',
+                            style=samp_style),
+                        )
+                    ]
+                ), md=4
+            )
+        ]
+        fl.append(
+            html.Div(
+                [
+                    dbc.Row(
+                        dbc.Col(top1000_image, width={"size": 10, "offset": 2})
+                    ),
+                    dbc.Row(
+                        total_buttons, justify='center'
+                    ),
+                    html.Br()
+                ]
+            )
+        )
+        radar_chart_total_content = html.Div(id='div-radar-chart-total')
+        populations_barplots = html.Div(id='div-population-barplot')
+        radar_chart_sample_content = html.Div(id='div-radar-chart-sample')
+        sample_image_content = html.Div(id='div-sample-image')
+        # [Output('div-radar-chart-total', 'children'),
+        #  Output('div-population-barplot', 'children'),
+        #  Output('div-radar-chart-sample', 'children'),
+        #  Output('div-sample-image', 'children')],
+        fl.append(
+            html.Div(
+                [
+                    dbc.Row(
+                        [
+                            dbc.Col(populations_barplots),
+                            dbc.Col(radar_chart_total_content)
+                        ]
+                    ),
+                    html.Br(),
+                    dbc.Row(
+                        sample_buttons
+                    ),
+                    dbc.Row(radar_chart_sample_content)
+                ]
+            )
+        )
+        # fl.append(
+        #     dbc.Row(
+        #         dbc.Col(radar_chart_content)
+        #     )
+        # )
+        # fl.append(html.Hr())
+        # if img_found:
+        #     fl.append(
+        #         html.Div(
+        #             [
+        #                 dbc.Row(
+        #                     [
+        #                         dbc.Col([  # Guide part
+        #                             html.Div(
+        #                                 [
+
+        #                                     # dbc.Row(html.Br()),
+        #                                     dbc.Row(
+        #                                         [
+        #                                             dbc.Col(
+        #                                                 html.A(
+        #                                                     html.Img(
+        #                                                         src=radar_src, id='barplot-img-guide', width="100%", height="auto"),
+        #                                                     target="_blank",
+        #                                                     href=radar_href
+        #                                                 ),
+        #                                                 # width=10
+        #                                             )
+        #                                         ]
+        #                                     ),
+        #                                 ],
+        #                                 id='div-guide-image'
+        #                             )
+        #                         ]),
+        #                         # dbc.Col(
+        #                         #     [  # Sample part
+        #                         #         html.Div(
+        #                         #             [
+
+        #                         #             ],
+        #                         #             id='div-sample-image'
+        #                         #         )
+        #                         #     ]
+        #                         # )
+        #                     ]
+        #                 )
+        #             ]
+
+        #         )
+        #     )
+        # else:
+        #     fl.append(
+        #         html.Div(
+        #             [
+        #                 dbc.Row(
+        #                     [
+        #                         dbc.Col([  # Guide part
+        #                             html.Div(
+        #                                 [
+
+        #                                     dbc.Row(html.Br()),
+        #                                     dbc.Row(
+        #                                         [
+        #                                             dbc.Col(
+        #                                                 html.H2(
+        #                                                     "No result found for this combination of mismatches and bulges"
+        #                                                 ),
+        #                                                 # width=10
+        #                                             )
+        #                                         ]
+        #                                     ),
+        #                                 ],
+        #                                 id='div-guide-image'
+        #                             )
+        #                         ]),
+        #                         dbc.Col([  # Sample part
+        #                             html.Div(
+        #                                 [
+
+        #                                 ],
+        #                                 id='div-sample-image'
+        #                             )
+        #                         ])
+        #                     ]
+        #                 )
+        #             ]
+
+        #         )
+        #     )
+
+        # fl.append(html.Br())
+        # fl.append(html.Br())

         # TODO code for the CFD graph integration. When .CFDGraph.txt is integrated, remove the try/except
@@ -3546,10 +3733,10 @@ def update_output(n_clicks, page_current, page_size, sel_cel, target, radio_orde
             data = data[sub_cols]
             data.columns = [x[:-2] for x in sub_cols]
-            data[''] = [''] * data.shape[0]
+            data[' '] = [' '] * data.shape[0]
             data_cols = data.columns.tolist()
-            data_cols.remove('')
-            data_cols.insert(0, '')
+            data_cols.remove(' ')
+            data_cols.insert(0, ' ')
             data = data.to_dict('records')
         else:
@@ -3579,8 +3766,8 @@ def set_columns_options(selected_target):
                       'Target2 :with lowest Mismatches + Bulge Count': ['Mismatches', 'Bulge_Size', 'Total', 'CFD', 'CFD_Risk_Score', 'CFD_Absolute_Risk_Score']}
     # target_options = {'Mismatches': ['Bulge_Size', 'Total', 'CFD'], 'Bulge_Size': ['Mismatches', 'Total', 'CFD'], 'Total': ['Mismatches', 'Bulge_Size', 'CFD'], 'CFD': [
     #     'Mismatches', 'Bulge_Size', 'Total'], 'Highest_CFD_Risk_Score': [], 'Highest_CFD_Absolute_Risk_Score': [], 'CFD_Risk_Score': [], 'CFD_Absolute_Risk_Score': []}
-    all_options = {'Target1 :with highest CFD': [' Mismatches', ' Bulges', ' Mismatch+Bulges', ' CFD', ' Risk_Score', ' Absolute_Risk_Score'],
-                   'Target2 :with lowest Mismatches + Bulges Count': [' Mismatches', ' Bulges', ' Mismatch+Bulges', ' CFD', ' Risk_Score', ' Absolute_Risk_Score']}
+    all_options = {'Target1 :with highest CFD': [' Mismatches', ' Bulges', ' Mismatch+Bulges', ' CFD', ' Risk Score', ' Absolute Risk Score'],
+                   'Target2 :with lowest Mismatches + Bulges Count': [' Mismatches', ' Bulges', ' Mismatch+Bulges', ' CFD', ' Risk Score', ' Absolute Risk Score']}
     # target_options = {' Mismatches': [' Bulges', ' Mismatch+Bulges', ' CFD'], ' Bulges': [' Mismatches', ' Mismatch+Bulges', ' CFD'], ' Mismatch+Bulges': [' Mismatches', ' Bulges', ' CFD'], ' CFD': [
     #     ' Mismatches', ' Bulges', ' Mismatch+Bulges'], ' Risk_Score': [], ' Absolute_Risk_Score': [], ' Risk_Score': [], ' Risk_Score': []}
     # main_order_dict = dict()
@@ -3635,7 +3822,7 @@ def set_display_children(selected_order):
     target_value = {'Mismatches': ['Bulge_Size', 'Total', 'CFD'], 'Bulge_Size': ['Mismatches', 'Total', 'CFD'], 'Total': ['Mismatches', 'Bulge_Size', 'CFD'], 'CFD': [
         'Mismatches', 'Bulge_Size', 'Total'], 'Highest_CFD_Risk_Score': [], 'Highest_CFD_Absolute_Risk_Score': [], 'CFD_Risk_Score': [], 'CFD_Absolute_Risk_Score': []}
     target_label = {'Mismatches': [' Bulges', ' Mismatch+Bulges', ' CFD'], 'Bulge_Size': [' Mismatches', ' Mismatch+Bulges', ' CFD'], 'Total': [' Mismatches', ' Bulges', ' CFD'],
-                    'CFD': [' Mismatches', ' Bulges', ' Mismatch+Bulges'], 'Highest_CFD_Risk_Score': [], 'Highest_CFD_Absolute_Risk_Score': [], 'CFD_Risk_Score': [], 'CFD_Absolute_Risk_Score': []}
+                    'CFD': [' Mismatches', ' Bulges', ' Mismatch+Bulges'], 'Highest CFD Risk Score': [], 'Highest CFD Absolute Risk Score': [], 'CFD Risk Score': [], 'CFD Absolute Risk Score': []}
     gi = []
     if selected_order is not None:
diff --git a/seq_script/extract_seq.py b/seq_script/extract_seq.py
index 0eb1b11..549db87 100644
--- a/seq_script/extract_seq.py
+++ b/seq_script/extract_seq.py
@@ -6,6 +6,7 @@
 #Input chr1:11,130,540-11,130,751
 def extractSequence(name, input_range, genome_selected):
+    name = '_'.join(name.split())
     current_working_directory = os.getcwd() + '/'
     chrom = input_range.split(':')[0]
     start_position = input_range.split(':')[1].split('-')[0].replace(',','').replace('.','').replace(' ','')
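
A minimal, self-contained sketch of the input handling that the extractSequence() hunk above performs, for readers who want to exercise the behavior in isolation. The helper name parse_region is hypothetical and not part of the patch, and the patched function keeps the coordinates as strings at this point rather than casting them.

# Hypothetical helper mirroring the patched extractSequence() input handling.
def parse_region(name, input_range):
    # The patch collapses any whitespace in the name to single underscores,
    # so a name like "my guide" becomes the filesystem-safe "my_guide".
    name = '_'.join(name.split())
    # A range string looks like "chr1:11,130,540-11,130,751".
    chrom = input_range.split(':')[0]
    coords = input_range.split(':')[1]
    start, end = coords.split('-')
    # Strip thousands separators, dots, and stray spaces, as the patch does
    # for start_position (symmetric handling of end is assumed here).
    def clean(s):
        return s.replace(',', '').replace('.', '').replace(' ', '')
    return name, chrom, clean(start), clean(end)

# Example:
#   parse_region('my guide', 'chr1:11,130,540-11,130,751')
#   -> ('my_guide', 'chr1', '11130540', '11130751')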