diff --git a/LDlink/LDassoc.py b/LDlink/LDassoc.py index cb87c964..392df64d 100755 --- a/LDlink/LDassoc.py +++ b/LDlink/LDassoc.py @@ -77,7 +77,7 @@ def calculate_assoc(file, region, pop, request, genome_build, web, myargs): mongo_host = 'localhost' client = MongoClient('mongodb://' + mongo_username + ':' + mongo_password + '@' + mongo_host+'/admin', mongo_port) db = client["LDLink"] - + def get_coords_var(db, rsid): rsid = rsid.strip("rs") query_results = db.dbsnp.find_one({"id": rsid}) diff --git a/LDlink/LDcommon.py b/LDlink/LDcommon.py index 5484a49f..52616d57 100644 --- a/LDlink/LDcommon.py +++ b/LDlink/LDcommon.py @@ -48,7 +48,7 @@ }, "grch38_high_coverage": { "title": "GRCh38 High Coverage", - "title_hg": "hg38", + "title_hg": "hg38_HC", "chromosome": "chromosome_grch38", "position": "position_grch38", "gene_begin": "begin_grch38", @@ -110,11 +110,14 @@ def connectMongoDBReadOnly(web): def retrieveTabix1000GData(query_file, coords, query_dir): export_s3_keys = retrieveAWSCredentials() - tabix_snps = export_s3_keys + " cd {2}; tabix -fhD {0}{1} | grep -v -e END".format( + tabix_snps = export_s3_keys + " cd {2}; tabix -fhD --separate-regions {0}{1} | grep -v -e END".format( query_file, coords, query_dir) # print("tabix_snps", tabix_snps) vcf = [x.decode('utf-8') for x in subprocess.Popen(tabix_snps, shell=True, stdout=subprocess.PIPE).stdout.readlines()] - return vcf + h = 0 + while vcf[h][0:2] == "##": + h += 1 + return vcf,h # Query genomic coordinates def get_rsnum(db, coord, genome_build): @@ -214,3 +217,43 @@ def getRecomb(db, filename, chromosome, begin, end, genome_build): genome_build_vars[genome_build]['position']: recomb_obj[genome_build_vars[genome_build]['position']] }) + '\n') return recomb_results_sanitized + +def parse_vcf(vcf,snp_coords): + delimiter = "#" + snp_lists = str('**'.join(vcf)).split(delimiter) + snp_dict = {} + snp_rs_dict = {} + missing_snp = [] + missing_rs = [] + snp_found_list = [] + #print(vcf) + #print(snp_lists) + + for snp in snp_lists[1:]: + snp_tuple = snp.split("**") + snp_key = snp_tuple[0].split("-")[-1].strip() + vcf_list = [] + #print(snp_tuple) + match_v = '' + for v in snp_tuple[1:]:#choose the matched one for dup; if no matched, choose first + if len(v) > 0: + match_v = v + geno = v.strip().split() + if geno[1] == snp_key: + match_v = v + if len(match_v) > 0: + vcf_list.append(match_v) + snp_found_list.append(snp_key) + + #vcf_list.append(snp_tuple.pop()) #always use the last one, even dup + #create snp_key as chr7:pos_rs4 + snp_dict[snp_key] = vcf_list + + for snp_coord in snp_coords: + if snp_coord[-1] not in snp_found_list: + missing_rs.append(snp_coord[0]) + else: + s_key = "chr"+snp_coord[1]+":"+snp_coord[2]+"_"+snp_coord[0] + snp_rs_dict[s_key] = snp_dict[snp_coord[2]] + del snp_dict + return snp_rs_dict," ".join(missing_rs) diff --git a/LDlink/LDhap.py b/LDlink/LDhap.py index f1ef9a68..510b3771 100755 --- a/LDlink/LDhap.py +++ b/LDlink/LDhap.py @@ -6,7 +6,7 @@ from bson import json_util import subprocess import sys -from LDcommon import checkS3File, connectMongoDBReadOnly, genome_build_vars, retrieveTabix1000GData +from LDcommon import checkS3File, connectMongoDBReadOnly, genome_build_vars, retrieveTabix1000GData,parse_vcf # Create LDhap function def calculate_hap(snplst, pop, request, web, genome_build): @@ -196,14 +196,13 @@ def replace_coords_rsid(db, snp_lst): snp_coord_str = [genome_build_vars[genome_build]['1000G_chr_prefix'] + snp_coords[0][1] + ":" + str(i) + "-" + str(i) for i in snp_pos_int] tabix_coords = " " + " ".join(snp_coord_str) - print("tabix_coords", tabix_coords) + #print("tabix_coords", tabix_coords) # # Extract 1000 Genomes phased genotypes vcf_filePath = "%s/%s%s/%s" % (config['aws']['data_subfolder'], genotypes_dir, genome_build_vars[genome_build]['1000G_dir'], genome_build_vars[genome_build]['1000G_file'] % (snp_coords[0][1])) vcf_query_snp_file = "s3://%s/%s" % (config['aws']['bucket'], vcf_filePath) checkS3File(aws_info, config['aws']['bucket'], vcf_filePath) - vcf = retrieveTabix1000GData(vcf_query_snp_file, tabix_coords, data_dir + genotypes_dir + genome_build_vars[genome_build]['1000G_dir']) - + vcf,h = retrieveTabix1000GData(vcf_query_snp_file, tabix_coords, data_dir + genotypes_dir + genome_build_vars[genome_build]['1000G_dir']) # Define function to correct indel alleles def set_alleles(a1, a2): if len(a1) == 1 and len(a2) == 1: @@ -220,15 +219,10 @@ def set_alleles(a1, a2): a2_n = a2[1:] return(a1_n, a2_n) - # Make sure there are genotype data in VCF file - if vcf[-1][0:6] == "#CHROM": - output["error"] = "No query variants were found in 1000G VCF file. " + str(output["warning"] if "warning" in output else "") - return(json.dumps(output, sort_keys=True, indent=2)) - - h = 0 - while vcf[h][0:2] == "##": - h += 1 + #if vcf[-1][0:6] == "#CHROM": + # output["error"] = "No query variants were found in 1000G VCF file. " + str(output["warning"] if "warning" in output else "") + # return(json.dumps(output, sort_keys=True, indent=2)) head = vcf[h].strip().split() @@ -245,169 +239,90 @@ def set_alleles(a1, a2): for i in range(len(index)-1): hap2.append([]) + # parse vcf + snp_dict,missing_snp = parse_vcf(vcf[h+1:],snp_coords) + # throw error if no data is returned from 1000G + if len(missing_snp.split()) == len(snp_pos): + output["error"] = "Input variant list does not contain any valid RS numbers or coordinates. " + str(output["warning"] if "warning" in output else "") + return(json.dumps(output, sort_keys=True, indent=2)) + + if len(missing_snp) > 0: + output["warning"] = "Query variant " + str(missing_snp) + " is missing from 1000G (" + genome_build_vars[genome_build]['title'] + ") data. " + str(output["warning"] if "warning" in output else "") + rsnum_lst = [] allele_lst = [] pos_lst = [] - - unique_vcf = [] - dup_vcf = [] - for g in range(h+1, len(vcf)): - geno = vcf[g].strip().split() - geno[0] = geno[0].lstrip('chr') - temp = geno[0]+geno[1] - if temp not in unique_vcf: - unique_vcf.append(temp) - else: - dup_vcf.append(temp) - if snp_pos.count(geno[1]) == 1: - rs_query = rs_nums[snp_pos.index(geno[1])] - warningmsg = "Variant " + rs_query + " is not biallelic, variant removed. " + + for s_key in snp_dict: + # parse snp_key such as chr7:pos_rs4 + snp_keys = s_key.split("_") + snp_key = snp_keys[0].split(':')[1] + rs_input = snp_keys[1] + geno_list = snp_dict[s_key] + g = -1 + for geno in geno_list: + g = g+1 + geno = geno.strip().split() + geno[0] = geno[0].lstrip('chr') + # if 1000G position does not match dbSNP position for variant, use dbSNP position + if geno[1] != snp_key: + mismatch_msg = "Genomic position ("+geno[1]+") in 1000G data does not match dbSNP" + \ + dbsnp_version + " (" + genome_build_vars[genome_build]['title'] + ") search coordinates for query variant " +\ + rs_input + ". " if "warning" in output: - output["warning"] = output["warning"] + warningmsg + output["warning"] = output["warning"] + mismatch_msg else: - output["warning"] = warningmsg - - - counter_dups = 0 - vcf_pos_no_dup = [] - # find if query SNPs yield duplicate results from 1000G data - for g in range(h+1, len(vcf)): - geno = vcf[g - counter_dups].strip().split() - geno[0] = geno[0].lstrip('chr') - temp = geno[0]+geno[1] - if temp in dup_vcf: - counter_dups = counter_dups + 1 - vcf.pop(g - counter_dups) - if geno[1] not in vcf_pos_no_dup: - vcf_pos_no_dup.append(geno[1]) - else: - vcf_pos_no_dup.append(geno[1]) - - # throw error if no data is returned from 1000G - if len(vcf[h+1:]) == 0: - output["error"] = "Input variant list does not contain any valid RS numbers or coordinates. " + str(output["warning"] if "warning" in output else "") - return(json.dumps(output, sort_keys=True, indent=2)) - - for g in range(h+1, len(vcf)): # 2 rows - geno = vcf[g].strip().split() - geno[0] = geno[0].lstrip('chr') - # if 1000G position does not match dbSNP position for variant, use dbSNP position - if geno[1] not in snp_pos: - snp_pos_index = rs_snp_pos[vcf_pos_no_dup.index(geno[1])] - if "warning" in output: - output["warning"] = output["warning"] + "Genomic position ("+geno[1]+") in VCF file does not match dbSNP" + \ - dbsnp_version + " (" + genome_build_vars[genome_build]['title'] + ") search coordinates for query variant. " - else: - output["warning"] = "Genomic position ("+geno[1]+") in VCF file does not match dbSNP" + \ - dbsnp_version + " (" + genome_build_vars[genome_build]['title'] + ") search coordinates for query variant. " - # throw an error in the event of missing query SNPs in 1000G data - if len(vcf_pos_no_dup) == len(snp_pos): - geno[1] = snp_pos[snp_pos_index] - else: - output["error"] = "One or more query variants were not found in 1000G VCF file. " - return(json.dumps(output, sort_keys=True, indent=2)) - - if snp_pos.count(geno[1]) == 1: - rs_query = rs_nums[snp_pos.index(geno[1])] - - else: - pos_index = [] - for p in range(len(snp_pos)): - if snp_pos[p] == geno[1]: - pos_index.append(p) - for p in pos_index: - if rs_nums[p] not in rsnum_lst: - rs_query = rs_nums[p] - break - - if rs_query in rsnum_lst: - continue - - rs_1000g = geno[2] - - if rs_query == rs_1000g: - rsnum = rs_1000g - else: - count = -2 - found = "false" - while count <= 2 and count+g < len(vcf): - geno_next = vcf[g+count].strip().split() - geno_next[0] = geno_next[0].lstrip('chr') - if len(geno_next) >= 3 and rs_query == geno_next[2]: - found = "true" - break - count += 1 - - if found == "false": - if "rs" in rs_1000g: - if "warning" in output: - output["warning"] = output["warning"] + \ - "Genomic position for query variant ("+rs_query + \ - ") does not match RS number at 1000G position (chr" + \ - geno[0]+":"+geno[1]+" = "+rs_1000g+"). " + output["warning"] = mismatch_msg + # throw an error in the event of missing query SNPs in 1000G data + geno[1] = snp_key + + if "," not in geno[3] and "," not in geno[4]: + a1, a2 = set_alleles(geno[3], geno[4]) + count0 = 0 + count1 = 0 + #print(geno) + for i in range(len(index)): + if geno[index[i]] == "0|0": + hap1[i].append(a1) + hap2[i].append(a1) + count0 += 2 + elif geno[index[i]] == "0|1": + hap1[i].append(a1) + hap2[i].append(a2) + count0 += 1 + count1 += 1 + elif geno[index[i]] == "1|0": + hap1[i].append(a2) + hap2[i].append(a1) + count0 += 1 + count1 += 1 + elif geno[index[i]] == "1|1": + hap1[i].append(a2) + hap2[i].append(a2) + count1 += 2 + elif geno[index[i]] == "0": + hap1[i].append(a1) + hap2[i].append(".") + count0 += 1 + elif geno[index[i]] == "1": + hap1[i].append(a2) + hap2[i].append(".") + count1 += 1 else: - output["warning"] = "Genomic position for query variant ("+rs_query + \ - ") does not match RS number at 1000G position (chr" + \ - geno[0]+":"+geno[1]+" = "+rs_1000g+"). " - - indx = [i[0] for i in snps].index(rs_query) - # snps[indx][0]=geno[2] - # rsnum=geno[2] - snps[indx][0] = rs_query - rsnum = rs_query - else: - continue - - if "," not in geno[3] and "," not in geno[4]: - a1, a2 = set_alleles(geno[3], geno[4]) - count0 = 0 - count1 = 0 - for i in range(len(index)): - if geno[index[i]] == "0|0": - hap1[i].append(a1) - hap2[i].append(a1) - count0 += 2 - elif geno[index[i]] == "0|1": - hap1[i].append(a1) - hap2[i].append(a2) - count0 += 1 - count1 += 1 - elif geno[index[i]] == "1|0": - hap1[i].append(a2) - hap2[i].append(a1) - count0 += 1 - count1 += 1 - elif geno[index[i]] == "1|1": - hap1[i].append(a2) - hap2[i].append(a2) - count1 += 2 - elif geno[index[i]] == "0": - hap1[i].append(a1) - hap2[i].append(".") - count0 += 1 - elif geno[index[i]] == "1": - hap1[i].append(a2) - hap2[i].append(".") - count1 += 1 + hap1[i].append(".") + hap2[i].append(".") + rsnum_lst.append(rs_input) + position = "chr"+geno[0]+":"+geno[1] + pos_lst.append(position) + f0 = round(float(count0)/(count0+count1), 4) + f1 = round(float(count1)/(count0+count1), 4) + if f0 >= f1: + alleles = a1+"="+str(round(f0, 3))+", " + \ + a2+"="+str(round(f1, 3)) else: - hap1[i].append(".") - hap2[i].append(".") - - rsnum_lst.append(rsnum) - - position = "chr"+geno[0]+":"+geno[1] - pos_lst.append(position) - - f0 = round(float(count0)/(count0+count1), 4) - f1 = round(float(count1)/(count0+count1), 4) - if f0 >= f1: - alleles = a1+"="+str(round(f0, 3))+", " + \ - a2+"="+str(round(f1, 3)) - else: - alleles = a2+"="+str(round(f1, 3))+", " + \ - a1+"="+str(round(f0, 3)) - allele_lst.append(alleles) - + alleles = a2+"="+str(round(f1, 3))+", " + \ + a1+"="+str(round(f0, 3)) + allele_lst.append(alleles) haps = {} for i in range(len(index)): diff --git a/LDlink/LDlink-5.3.0.css b/LDlink/LDlink-5.3.1.css similarity index 100% rename from LDlink/LDlink-5.3.0.css rename to LDlink/LDlink-5.3.1.css diff --git a/LDlink/LDlink-5.3.0.js b/LDlink/LDlink-5.3.1.js similarity index 99% rename from LDlink/LDlink-5.3.0.js rename to LDlink/LDlink-5.3.1.js index 7535bdfb..a563be80 100755 --- a/LDlink/LDlink-5.3.0.js +++ b/LDlink/LDlink-5.3.1.js @@ -67,7 +67,7 @@ $(document).ready(function() { }); // Load news text from news.html to news-container div - $.get("news-5.3.0.html", function (data) { + $.get("news-5.3.1.html", function (data) { let tmpData = data.split("
") let i = 0; var newsHTMLList = []; @@ -1810,7 +1810,7 @@ function populateHeaderValues(event, numFiles, label) { } function loadHelp() { - $('#help-tab').load('help-5.3.0.html'); + $('#help-tab').load('help-5.3.1.html'); } function calculate(e) { diff --git a/LDlink/LDmatrix.py b/LDlink/LDmatrix.py index 0a5288e6..271a5423 100755 --- a/LDlink/LDmatrix.py +++ b/LDlink/LDmatrix.py @@ -11,7 +11,7 @@ import subprocess import sys from LDcommon import checkS3File, retrieveAWSCredentials, genome_build_vars, getRefGene - +from LDcommon import connectMongoDBReadOnly,retrieveTabix1000GData,parse_vcf # Create LDmatrix function def calculate_matrix(snplst, pop, request, web, request_method, genome_build, r2_d="r2", collapseTranscript=True): # Set data directories using config.yml @@ -90,18 +90,7 @@ def calculate_matrix(snplst, pop, request, web, request_method, genome_build, r2 pop_ids = list(set(ids)) # Connect to Mongo snp database - if env == 'local' or connect_external: - mongo_host = api_mongo_addr - else: - mongo_host = 'localhost' - if web: - client = MongoClient('mongodb://' + mongo_username + ':' + mongo_password + '@' + mongo_host+'/admin', mongo_port) - else: - if env == 'local' or connect_external: - client = MongoClient('mongodb://' + mongo_username + ':' + mongo_password + '@' + mongo_host+'/admin', mongo_port) - else: - client = MongoClient('localhost', mongo_port) - db = client["LDLink"] + db = connectMongoDBReadOnly(web) def get_coords(db, rsid): rsid = rsid.strip("rs") @@ -249,7 +238,8 @@ def replace_coords_rsid(db, snp_lst): vcf_query_snp_file = "s3://%s/%s" % (config['aws']['bucket'], vcf_filePath) checkS3File(aws_info, config['aws']['bucket'], vcf_filePath) - + vcf,h = retrieveTabix1000GData(vcf_query_snp_file, tabix_coords, data_dir + genotypes_dir + genome_build_vars[genome_build]['1000G_dir']) + # Define function to correct indel alleles def set_alleles(a1, a2): if len(a1) == 1 and len(a2) == 1: @@ -266,21 +256,14 @@ def set_alleles(a1, a2): a2_n = a2[1:] return(a1_n, a2_n) - # Import SNP VCF files - tabix_snps = export_s3_keys + " cd {2}; tabix -fhD {0}{1} | grep -v -e END".format(vcf_query_snp_file, tabix_coords, data_dir + genotypes_dir + genome_build_vars[genome_build]['1000G_dir']) - vcf = [x.decode('utf-8') for x in subprocess.Popen(tabix_snps, shell=True, stdout=subprocess.PIPE).stdout.readlines()] - + # Make sure there are genotype data in VCF file - if vcf[-1][0:6] == "#CHROM": - output["error"] = "No query variants were found in 1000G VCF file" - json_output = json.dumps(output, sort_keys=True, indent=2) - print(json_output, file=out_json) - out_json.close() - return("", "") - - h = 0 - while vcf[h][0:2] == "##": - h += 1 + #if vcf[-1][0:6] == "#CHROM": + # output["error"] = "No query variants were found in 1000G VCF file" + # json_output = json.dumps(output, sort_keys=True, indent=2) + # print(json_output, file=out_json) + # out_json.close() + # return("", "") head = vcf[h].strip().split() @@ -296,153 +279,80 @@ def set_alleles(a1, a2): hap2 = [[]] for i in range(len(index) - 1): hap2.append([]) + + snp_dict,missing_snp = parse_vcf(vcf[h+1:],snp_coords) - rsnum_lst = [] - allele_lst = [] - pos_lst = [] - - unique_vcf = [] - dup_vcf = [] - for g in range(h+1, len(vcf)): - geno = vcf[g].strip().split() - geno[0] = geno[0].lstrip('chr') - temp = geno[0]+geno[1] - if temp not in unique_vcf: - unique_vcf.append(temp) - else: - dup_vcf.append(temp) - if snp_pos.count(geno[1]) == 1: - rs_query = rs_nums[snp_pos.index(geno[1])] - warningmsg = "Variant " + rs_query + " is not biallelic, variant removed. " - if "warning" in output: - output["warning"] = output["warning"] + warningmsg - else: - output["warning"] = warningmsg - - counter_dups = 0 - vcf_pos_no_dup = [] - # find if query SNPs yield duplicate results from 1000G data - for g in range(h+1, len(vcf)): - geno = vcf[g - counter_dups].strip().split() - geno[0] = geno[0].lstrip('chr') - temp = geno[0]+geno[1] - if temp in dup_vcf: - counter_dups = counter_dups + 1 - vcf.pop(g - counter_dups) - if geno[1] not in vcf_pos_no_dup: - vcf_pos_no_dup.append(geno[1]) - else: - vcf_pos_no_dup.append(geno[1]) - - # throw error if no data is returned from 1000G - if len(vcf[h+1:]) == 0: + # all lists does not contain data which is returned from 1000G + if len(missing_snp.split()) == len(snp_pos): output["error"] = "Input variant list does not contain any valid RS numbers or coordinates. " + str(output["warning"] if "warning" in output else "") json_output = json.dumps(output, sort_keys=True, indent=2) print(json_output, file=out_json) out_json.close() return("", "") - - for g in range(h + 1, len(vcf)): - geno = vcf[g].strip().split() - geno[0] = geno[0].lstrip('chr') - # if 1000G position does not match dbSNP position for variant, use dbSNP position - if geno[1] not in snp_pos: - snp_pos_index = rs_snp_pos[vcf_pos_no_dup.index(geno[1])] - if "warning" in output: - output["warning"] = output["warning"] + "Genomic position ("+geno[1]+") in VCF file does not match dbSNP" + \ - dbsnp_version + " (" + genome_build_vars[genome_build]['title'] + ") search coordinates for query variant. " - else: - output["warning"] = "Genomic position ("+geno[1]+") in VCF file does not match dbSNP" + \ - dbsnp_version + " (" + genome_build_vars[genome_build]['title'] + ") search coordinates for query variant. " - # throw an error in the event of missing query SNPs in 1000G data - if len(vcf_pos_no_dup) == len(snp_pos): - geno[1] = snp_pos[snp_pos_index] - else: - output["error"] = "One or more query variants were not found in 1000G VCF file. " - json_output = json.dumps(output, sort_keys=True, indent=2) - print(json_output, file=out_json) - out_json.close() - return("", "") - if snp_pos.count(geno[1]) == 1: - rs_query = rs_nums[snp_pos.index(geno[1])] + if len(missing_snp) > 0 : + output["warning"] = "Query variant " + str(missing_snp) + " is missing from 1000G (" + genome_build_vars[genome_build]['title'] + ") data. " + str(output["warning"] if "warning" in output else "") + - else: - pos_index = [] - for p in range(len(snp_pos)): - if snp_pos[p] == geno[1]: - pos_index.append(p) - for p in pos_index: - if rs_nums[p] not in rsnum_lst: - rs_query = rs_nums[p] - break - - if rs_query in rsnum_lst: - continue - - rs_1000g = geno[2] - - if rs_query == rs_1000g: - rsnum = rs_1000g - else: - count = -2 - found = "false" - while count <= 2 and count + g < len(vcf): - geno_next = vcf[g + count].strip().split() - geno_next[0] = geno_next[0].lstrip('chr') - if len(geno_next) >= 3 and rs_query == geno_next[2]: - found = "true" - break - count += 1 - - if found == "false": - if "rs" in rs_1000g: - if "warning" in output: - output["warning"] = output[ - "warning"] + ". Genomic position for query variant (" + rs_query + ") does not match RS number at 1000G position (chr"+geno[0]+":"+geno[1]+" = "+rs_1000g+")" - else: - output[ - "warning"] = "Genomic position for query variant (" + rs_query + ") does not match RS number at 1000G position (chr"+geno[0]+":"+geno[1]+" = "+rs_1000g+")" - - indx = [i[0] for i in snps].index(rs_query) - # snps[indx][0] = geno[2] - # rsnum = geno[2] - snps[indx][0] = rs_query - rsnum = rs_query - else: - continue - - if "," not in geno[3] and "," not in geno[4]: - a1, a2 = set_alleles(geno[3], geno[4]) - for i in range(len(index)): - if geno[index[i]] == "0|0": - hap1[i].append(a1) - hap2[i].append(a1) - elif geno[index[i]] == "0|1": - hap1[i].append(a1) - hap2[i].append(a2) - elif geno[index[i]] == "1|0": - hap1[i].append(a2) - hap2[i].append(a1) - elif geno[index[i]] == "1|1": - hap1[i].append(a2) - hap2[i].append(a2) - elif geno[index[i]] == "0": - hap1[i].append(a1) - hap2[i].append(".") - elif geno[index[i]] == "1": - hap1[i].append(a2) - hap2[i].append(".") + rsnum_lst = [] + allele_lst = [] + pos_lst = [] + #print(snp_dict) + for s_key in snp_dict: + # parse snp_key such as chr7:pos_rs4 + snp_keys = s_key.split("_") + snp_key = snp_keys[0].split(':')[1] + rs_input = snp_keys[1] + geno_list = snp_dict[s_key] + g = -1 + for geno in geno_list: + g = g+1 + geno = geno.strip().split() + geno[0] = geno[0].lstrip('chr') + #print(geno) + # if 1000G position does not match dbSNP position for variant, use dbSNP position + if geno[1] != snp_key: + mismatch_msg = "Genomic position ("+geno[1]+") in 1000G data does not match dbSNP" + \ + dbsnp_version + " (" + genome_build_vars[genome_build]['title'] + ") search coordinates for query variant " + \ + rs_input + ". " + if "warning" in output: + output["warning"] = output["warning"] + mismatch_msg else: - hap1[i].append(".") - hap2[i].append(".") + output["warning"] = mismatch_msg + # throw an error in the event of missing query SNPs in 1000G data + geno[1] = snp_key + + if "," not in geno[3] and "," not in geno[4]: + a1, a2 = set_alleles(geno[3], geno[4]) + for i in range(len(index)): + if geno[index[i]] == "0|0": + hap1[i].append(a1) + hap2[i].append(a1) + elif geno[index[i]] == "0|1": + hap1[i].append(a1) + hap2[i].append(a2) + elif geno[index[i]] == "1|0": + hap1[i].append(a2) + hap2[i].append(a1) + elif geno[index[i]] == "1|1": + hap1[i].append(a2) + hap2[i].append(a2) + elif geno[index[i]] == "0": + hap1[i].append(a1) + hap2[i].append(".") + elif geno[index[i]] == "1": + hap1[i].append(a2) + hap2[i].append(".") + else: + hap1[i].append(".") + hap2[i].append(".") - rsnum_lst.append(rsnum) + rsnum_lst.append(rs_input) - position = "chr" + geno[0] + ":" + geno[1] + "-" + geno[1] - pos_lst.append(position) - alleles = a1 + "/" + a2 - allele_lst.append(alleles) + position = "chr" + geno[0] + ":" + geno[1] + "-" + geno[1] + pos_lst.append(position) + alleles = a1 + "/" + a2 + allele_lst.append(alleles) # Calculate Pairwise LD Statistics @@ -642,7 +552,7 @@ def set_alleles(a1, a2): # Generate error if less than two SNPs if len(x) < 2: - output["error"] = "Less than two variants to plot." + output["error"] = "Less than two variants to plot. " + str(output["warning"] if "warning" in output else "") json_output = json.dumps(output, sort_keys=True, indent=2) print(json_output, file=out_json) out_json.close() diff --git a/LDlink/LDmatrix_plot_sub.py b/LDlink/LDmatrix_plot_sub.py index 03931caf..b3839a15 100644 --- a/LDlink/LDmatrix_plot_sub.py +++ b/LDlink/LDmatrix_plot_sub.py @@ -11,7 +11,7 @@ from bson import json_util, ObjectId import subprocess from LDcommon import checkS3File, retrieveAWSCredentials, genome_build_vars - +from LDcommon import retrieveTabix1000GData,parse_vcf # LDmatrix subprocess to export bokeh to high quality images in the background def calculate_matrix_svg(snplst, pop, request, genome_build, r2_d="r2", collapseTranscript=True): @@ -155,7 +155,8 @@ def replace_coords_rsid(db, snp_lst): vcf_query_snp_file = "s3://%s/%s" % (config['aws']['bucket'], vcf_filePath) checkS3File(aws_info, config['aws']['bucket'], vcf_filePath) - + vcf,h = retrieveTabix1000GData(vcf_query_snp_file, tabix_coords, data_dir + genotypes_dir + genome_build_vars[genome_build]['1000G_dir']) + # Define function to correct indel alleles def set_alleles(a1, a2): if len(a1) == 1 and len(a2) == 1: @@ -172,14 +173,7 @@ def set_alleles(a1, a2): a2_n = a2[1:] return(a1_n, a2_n) - # Import SNP VCF files - tabix_snps = export_s3_keys + " cd {2}; tabix -fhD {0}{1} | grep -v -e END".format(vcf_query_snp_file, tabix_coords, data_dir + genotypes_dir + genome_build_vars[genome_build]['1000G_dir']) - vcf = [x.decode('utf-8') for x in subprocess.Popen(tabix_snps, shell=True, stdout=subprocess.PIPE).stdout.readlines()] - - h = 0 - while vcf[h][0:2] == "##": - h += 1 - + head = vcf[h].strip().split() # Extract haplotypes @@ -195,120 +189,59 @@ def set_alleles(a1, a2): for i in range(len(index) - 1): hap2.append([]) + snp_dict,missing_snp = parse_vcf(vcf[h+1:],snp_coords) + rsnum_lst = [] allele_lst = [] pos_lst = [] - - unique_vcf = [] - dup_vcf = [] - for g in range(h+1, len(vcf)): - geno = vcf[g].strip().split() - geno[0] = geno[0].lstrip('chr') - temp = geno[0]+geno[1] - if temp not in unique_vcf: - unique_vcf.append(temp) - else: - dup_vcf.append(temp) - if snp_pos.count(geno[1]) == 1: - rs_query = rs_nums[snp_pos.index(geno[1])] - - counter_dups = 0 - vcf_pos_no_dup = [] - for g in range(h+1, len(vcf)): - geno = vcf[g - counter_dups].strip().split() - geno[0] = geno[0].lstrip('chr') - temp = geno[0]+geno[1] - if temp in dup_vcf: - counter_dups = counter_dups + 1 - vcf.pop(g - counter_dups) - if geno[1] not in vcf_pos_no_dup: - vcf_pos_no_dup.append(geno[1]) - else: - vcf_pos_no_dup.append(geno[1]) - for g in range(h + 1, len(vcf)): - geno = vcf[g].strip().split() - geno[0] = geno[0].lstrip('chr') - # if 1000G position does not match dbSNP position for variant, use dbSNP position - if geno[1] not in snp_pos: - snp_pos_index = rs_snp_pos[vcf_pos_no_dup.index(geno[1])] - # throw an error in the event of missing query SNPs in 1000G data - if len(vcf_pos_no_dup) == len(snp_pos): - geno[1] = snp_pos[snp_pos_index] - else: - return - if snp_pos.count(geno[1]) == 1: - rs_query = rs_nums[snp_pos.index(geno[1])] - - else: - pos_index = [] - for p in range(len(snp_pos)): - if snp_pos[p] == geno[1]: - pos_index.append(p) - for p in pos_index: - if rs_nums[p] not in rsnum_lst: - rs_query = rs_nums[p] - break - - if rs_query in rsnum_lst: - continue - - rs_1000g = geno[2] - - if rs_query == rs_1000g: - rsnum = rs_1000g - else: - count = -2 - found = "false" - while count <= 2 and count + g < len(vcf): - geno_next = vcf[g + count].strip().split() - geno_next[0] = geno_next[0].lstrip('chr') - if len(geno_next) >= 3 and rs_query == geno_next[2]: - found = "true" - break - count += 1 - - if found == "false": - indx = [i[0] for i in snps].index(rs_query) - # snps[indx][0] = geno[2] - # rsnum = geno[2] - snps[indx][0] = rs_query - rsnum = rs_query - else: - continue - - if "," not in geno[3] and "," not in geno[4]: - a1, a2 = set_alleles(geno[3], geno[4]) - for i in range(len(index)): - if geno[index[i]] == "0|0": - hap1[i].append(a1) - hap2[i].append(a1) - elif geno[index[i]] == "0|1": - hap1[i].append(a1) - hap2[i].append(a2) - elif geno[index[i]] == "1|0": - hap1[i].append(a2) - hap2[i].append(a1) - elif geno[index[i]] == "1|1": - hap1[i].append(a2) - hap2[i].append(a2) - elif geno[index[i]] == "0": - hap1[i].append(a1) - hap2[i].append(".") - elif geno[index[i]] == "1": - hap1[i].append(a2) - hap2[i].append(".") - else: - hap1[i].append(".") - hap2[i].append(".") + for s_key in snp_dict: + # parse snp_key such as chr7:pos_rs4 + snp_keys = s_key.split("_") + snp_key = snp_keys[0].split(':')[1] + rs_input = snp_keys[1] + geno_list = snp_dict[s_key] + g = -1 + for geno in geno_list: + g = g+1 + geno = geno.strip().split() + geno[0] = geno[0].lstrip('chr') + # if 1000G position does not match dbSNP position for variant, use dbSNP position + if geno[1] != snp_key: + geno[1] = snp_key + + if "," not in geno[3] and "," not in geno[4]: + a1, a2 = set_alleles(geno[3], geno[4]) + for i in range(len(index)): + if geno[index[i]] == "0|0": + hap1[i].append(a1) + hap2[i].append(a1) + elif geno[index[i]] == "0|1": + hap1[i].append(a1) + hap2[i].append(a2) + elif geno[index[i]] == "1|0": + hap1[i].append(a2) + hap2[i].append(a1) + elif geno[index[i]] == "1|1": + hap1[i].append(a2) + hap2[i].append(a2) + elif geno[index[i]] == "0": + hap1[i].append(a1) + hap2[i].append(".") + elif geno[index[i]] == "1": + hap1[i].append(a2) + hap2[i].append(".") + else: + hap1[i].append(".") + hap2[i].append(".") - rsnum_lst.append(rsnum) + rsnum_lst.append(rs_input) - position = "chr" + geno[0] + ":" + geno[1] + "-" + geno[1] - pos_lst.append(position) - alleles = a1 + "/" + a2 - allele_lst.append(alleles) + position = "chr" + geno[0] + ":" + geno[1] + "-" + geno[1] + pos_lst.append(position) + alleles = a1 + "/" + a2 + allele_lst.append(alleles) # Calculate Pairwise LD Statistics all_haps = hap1 + hap2 diff --git a/LDlink/SNPclip.py b/LDlink/SNPclip.py index b67a4afc..c3110ec9 100755 --- a/LDlink/SNPclip.py +++ b/LDlink/SNPclip.py @@ -11,7 +11,7 @@ import subprocess import sys import collections -from LDcommon import checkS3File, retrieveAWSCredentials, retrieveTabix1000GData, genome_build_vars, get_rsnum +from LDcommon import checkS3File, retrieveAWSCredentials, retrieveTabix1000GData, genome_build_vars, get_rsnum,parse_vcf,connectMongoDBReadOnly ########### # SNPclip # @@ -96,18 +96,7 @@ def calculate_clip(snplst, pop, request, web, genome_build, r2_threshold=0.1, ma pop_ids = list(set(ids)) # Connect to Mongo snp database - if env == 'local' or connect_external: - mongo_host = api_mongo_addr - else: - mongo_host = 'localhost' - if web: - client = MongoClient('mongodb://' + mongo_username + ':' + mongo_password + '@' + mongo_host+'/admin', mongo_port) - else: - if env == 'local' or connect_external: - client = MongoClient('mongodb://' + mongo_username + ':' + mongo_password + '@' + mongo_host+'/admin', mongo_port) - else: - client = MongoClient('localhost', mongo_port) - db = client["LDLink"] + db = connectMongoDBReadOnly(web) def get_coords(db, rsid): rsid = rsid.strip("rs") @@ -240,7 +229,7 @@ def replace_coords_rsid(db, snp_lst): checkS3File(aws_info, config['aws']['bucket'], vcf_filePath) - vcf = retrieveTabix1000GData(vcf_query_snp_file, tabix_coords, data_dir + genotypes_dir + genome_build_vars[genome_build]['1000G_dir']) + vcf,h = retrieveTabix1000GData(vcf_query_snp_file, tabix_coords, data_dir + genotypes_dir + genome_build_vars[genome_build]['1000G_dir']) # Make MAF function def calc_maf(genos): @@ -308,10 +297,6 @@ def calc_r2(var1, var2): # Import SNP VCF file hap_dict = {} - h = 0 - while vcf[h][0:2] == "##": - h += 1 - head = vcf[h].strip().split() # Extract population specific haplotypes @@ -320,156 +305,71 @@ def calc_r2(var1, var2): if head[i] in pop_ids: pop_index.append(i) - rsnum_lst = [] - - unique_vcf = [] - dup_vcf = [] - for g in range(h+1, len(vcf)): - geno = vcf[g].strip().split() - geno[0] = geno[0].lstrip('chr') - temp = geno[0]+geno[1] - if temp not in unique_vcf: - unique_vcf.append(temp) - else: - dup_vcf.append(temp) - if snp_pos.count(geno[1]) == 1: - rs_query = rs_nums[snp_pos.index(geno[1])] - warningmsg = "Variant " + rs_query + " is not biallelic, variant removed. " - if "warning" in output and geno[1] not in output["warning"]: - output["warning"] = output["warning"]+warningmsg - else: - output["warning"] = warningmsg - - counter_dups = 0 - vcf_pos_no_dup = [] - # find if query SNPs yield duplicate results from 1000G data - for g in range(h+1, len(vcf)): - geno = vcf[g - counter_dups].strip().split() - geno[0] = geno[0].lstrip('chr') - temp = geno[0]+geno[1] - if temp in dup_vcf: - counter_dups = counter_dups + 1 - vcf.pop(g - counter_dups) - if geno[1] not in vcf_pos_no_dup: - vcf_pos_no_dup.append(geno[1]) - else: - vcf_pos_no_dup.append(geno[1]) - - # throw error if no data is returned from 1000G - if len(vcf[h+1:]) == 0: + snp_dict,missing_snp = parse_vcf(vcf[h+1:],snp_coords) + + # throw error if no data is returned from 1000G + if len(missing_snp.split()) == len(snp_pos): output["error"] = "Input variant list does not contain any valid RS numbers or coordinates. " + str(output["warning"] if "warning" in output else "") json_output = json.dumps(output, sort_keys=True, indent=2) print(json_output, file=out_json) out_json.close() return("", "", "") + + if len(missing_snp) > 0: + output["warning"] = "Query variant " + str(missing_snp) + " is missing from 1000G (" + genome_build_vars[genome_build]['title'] + ") data. " + str(output["warning"] if "warning" in output else "") + + rsnum_lst = [] - for g in range(h+1, len(vcf)): - geno = vcf[g].strip().split() - geno[0] = geno[0].lstrip('chr') - # if 1000G position does not match dbSNP position for variant, use dbSNP position - if geno[1] not in snp_pos: - if "warning" in output: - output["warning"] = output["warning"]+". Genomic position ("+geno[1]+") in VCF file does not match db" + \ - dbsnp_version + " (" + genome_build_vars[genome_build]['title'] + ") search coordinates for query variant" - else: - output["warning"] = "Genomic position ("+geno[1]+") in VCF file does not match db" + \ - dbsnp_version + " (" + genome_build_vars[genome_build]['title'] + ") search coordinates for query variant" - if len(vcf_pos_no_dup) == len(snp_pos): - geno[1] = snp_pos[g-h-1] - else: - output["error"] = "One or more query variants were not found in 1000G VCF file. " - json_output = json.dumps(output, sort_keys=True, indent=2) - print(json_output, file=out_json) - out_json.close() - return("", "", "") - - if snp_pos.count(geno[1]) == 1: - rs_query = rs_nums[snp_pos.index(geno[1])] - - else: - pos_index = [] - for p in range(len(snp_pos)): - if snp_pos[p] == geno[1]: - pos_index.append(p) - for p in pos_index: - if rs_nums[p] not in rsnum_lst: - rs_query = rs_nums[p] - break - - if rs_query in rsnum_lst: - continue - - rs_1000g = geno[2] - - if rs_query == rs_1000g: - rsnum = rs_1000g - else: - count = -2 - found = "false" - while count <= 2 and count+g < len(vcf): - geno_next = vcf[g+count].strip().split() - geno_next[0] = geno_next[0].lstrip('chr') - if len(geno_next) >= 3 and rs_query == geno_next[2]: - found = "true" - break - count += 1 - - if found == "false": - if "rs" in rs_1000g: - if "warning" in output: - output["warning"] = output["warning"] + \ - ". Genomic position for query variant ("+rs_query + \ - ") does not match RS number at 1000G position (chr" + \ - geno[0]+":"+geno[1]+" = "+rs_1000g+")" - else: - output["warning"] = "Genomic position for query variant ("+rs_query + \ - ") does not match RS number at 1000G position (chr" + \ - geno[0]+":"+geno[1]+" = "+rs_1000g+")" - - indx = [i[0] for i in snps].index(rs_query) - # snps[indx][0]=geno[2] - # rsnum=geno[2] - snps[indx][0] = rs_query - rsnum = rs_query - # try: - # indx=[i[0] for i in snps].index(rs_query) - # snps[indx][0]=geno[2] - # rsnum=geno[2] - # except ValueError: - # print("List does not contain value:") - # print "#####" - # print "variable rs_query " + rs_query - # print "variable snps " + str(snps) - # print "#####" - else: - continue - - details[rsnum] = ["chr"+geno[0]+":"+geno[1]] - - if "," not in geno[3] and "," not in geno[4]: - temp_genos = [] - for i in range(len(pop_index)): - temp_genos.append(geno[pop_index[i]]) - f0, f1, maf = calc_maf(temp_genos) - a0, a1 = set_alleles(geno[3], geno[4]) - details[rsnum].append( - a0+"="+str(round(f0, 3))+", "+a1+"="+str(round(f1, 3))) - if maf_threshold <= maf: - hap_dict[rsnum] = [temp_genos] - rsnum_lst.append(rsnum) + + + for s_key in snp_dict: + # parse snp_key such as chr7:pos_rs4 + snp_keys = s_key.split("_") + snp_key = snp_keys[0].split(':')[1] + rs_input = snp_keys[1] + geno_list = snp_dict[s_key] + g = -1 + for geno in geno_list: + g = g+1 + geno = geno.strip().split() + geno[0] = geno[0].lstrip('chr') + # if 1000G position does not match dbSNP position for variant, use dbSNP position + if geno[1] != snp_key: + mismatch_msg = "Genomic position ("+geno[1]+") in 1000G data does not match db" + \ + dbsnp_version + " (" + genome_build_vars[genome_build]['title'] + ") search coordinates for query variant " + \ + rs_input + ". " + if "warning" in output: + output["warning"] = output["warning"]+ mismatch_msg + else: + output["warning"] = mismatch_msg + geno[1] = snp_key + + details[rs_input] = ["chr"+geno[0]+":"+geno[1]] + + if "," not in geno[3] and "," not in geno[4]: + temp_genos = [] + for i in range(len(pop_index)): + temp_genos.append(geno[pop_index[i]]) + f0, f1, maf = calc_maf(temp_genos) + a0, a1 = set_alleles(geno[3], geno[4]) + details[rs_input].append( + a0+"="+str(round(f0, 3))+", "+a1+"="+str(round(f1, 3))) + if maf_threshold <= maf: + hap_dict[rs_input] = [temp_genos] + rsnum_lst.append(rs_input) + else: + details[rs_input].append( + "Variant MAF is "+str(round(maf, 4))+", variant removed.") else: - details[rsnum].append( - "Variant MAF is "+str(round(maf, 4))+", variant removed.") - else: - details[rsnum].append(geno[3]+"=NA, "+geno[4]+"=NA") - details[rsnum].append("Variant is not biallelic, variant removed.") + details[rs_input].append(geno[3]+"=NA, "+geno[4]+"=NA") + details[rs_input].append("Variant is not biallelic, variant removed.") for i in rs_nums: if i not in rsnum_lst: if i not in details: index_i = rs_nums.index(i) details[i] = ["chr"+snp_coords[index_i][1]+":"+snp_coords[index_i][2]+"-" + - snp_coords[index_i][2], "NA", "Variant not in 1000G VCF file, variant removed."] + snp_coords[index_i][2], "NA", "Variant not in 1000G data, variant removed."] # Thin the SNPs # sup_2=u"\u00B2" diff --git a/LDlink/help-5.3.0.html b/LDlink/help-5.3.1.html similarity index 100% rename from LDlink/help-5.3.0.html rename to LDlink/help-5.3.1.html diff --git a/LDlink/index.html b/LDlink/index.html index 39b5d310..fbc5416c 100755 --- a/LDlink/index.html +++ b/LDlink/index.html @@ -11,6 +11,9 @@ + + + @@ -24,7 +27,7 @@ - + @@ -2820,7 +2823,7 @@
LDlink 5.3.1 Release (04/28/2022)
+LDlink 5.3 Release (04/05/2022)