Skip to content

Commit

Permalink
Merge pull request #181 from CBIIT/ldlink-5.3.1
Browse files Browse the repository at this point in the history
Ldlink 5.3.1
  • Loading branch information
kvnjng authored Apr 28, 2022
2 parents 1c1bb31 + 445327a commit a374f1b
Show file tree
Hide file tree
Showing 11 changed files with 322 additions and 613 deletions.
2 changes: 1 addition & 1 deletion LDlink/LDassoc.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def calculate_assoc(file, region, pop, request, genome_build, web, myargs):
mongo_host = 'localhost'
client = MongoClient('mongodb://' + mongo_username + ':' + mongo_password + '@' + mongo_host+'/admin', mongo_port)
db = client["LDLink"]

def get_coords_var(db, rsid):
rsid = rsid.strip("rs")
query_results = db.dbsnp.find_one({"id": rsid})
Expand Down
49 changes: 46 additions & 3 deletions LDlink/LDcommon.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
},
"grch38_high_coverage": {
"title": "GRCh38 High Coverage",
"title_hg": "hg38",
"title_hg": "hg38_HC",
"chromosome": "chromosome_grch38",
"position": "position_grch38",
"gene_begin": "begin_grch38",
Expand Down Expand Up @@ -110,11 +110,14 @@ def connectMongoDBReadOnly(web):

def retrieveTabix1000GData(query_file, coords, query_dir):
export_s3_keys = retrieveAWSCredentials()
tabix_snps = export_s3_keys + " cd {2}; tabix -fhD {0}{1} | grep -v -e END".format(
tabix_snps = export_s3_keys + " cd {2}; tabix -fhD --separate-regions {0}{1} | grep -v -e END".format(
query_file, coords, query_dir)
# print("tabix_snps", tabix_snps)
vcf = [x.decode('utf-8') for x in subprocess.Popen(tabix_snps, shell=True, stdout=subprocess.PIPE).stdout.readlines()]
return vcf
h = 0
while vcf[h][0:2] == "##":
h += 1
return vcf,h

# Query genomic coordinates
def get_rsnum(db, coord, genome_build):
Expand Down Expand Up @@ -214,3 +217,43 @@ def getRecomb(db, filename, chromosome, begin, end, genome_build):
genome_build_vars[genome_build]['position']: recomb_obj[genome_build_vars[genome_build]['position']]
}) + '\n')
return recomb_results_sanitized

def parse_vcf(vcf,snp_coords):
delimiter = "#"
snp_lists = str('**'.join(vcf)).split(delimiter)
snp_dict = {}
snp_rs_dict = {}
missing_snp = []
missing_rs = []
snp_found_list = []
#print(vcf)
#print(snp_lists)

for snp in snp_lists[1:]:
snp_tuple = snp.split("**")
snp_key = snp_tuple[0].split("-")[-1].strip()
vcf_list = []
#print(snp_tuple)
match_v = ''
for v in snp_tuple[1:]:#choose the matched one for dup; if no matched, choose first
if len(v) > 0:
match_v = v
geno = v.strip().split()
if geno[1] == snp_key:
match_v = v
if len(match_v) > 0:
vcf_list.append(match_v)
snp_found_list.append(snp_key)

#vcf_list.append(snp_tuple.pop()) #always use the last one, even dup
#create snp_key as chr7:pos_rs4
snp_dict[snp_key] = vcf_list

for snp_coord in snp_coords:
if snp_coord[-1] not in snp_found_list:
missing_rs.append(snp_coord[0])
else:
s_key = "chr"+snp_coord[1]+":"+snp_coord[2]+"_"+snp_coord[0]
snp_rs_dict[s_key] = snp_dict[snp_coord[2]]
del snp_dict
return snp_rs_dict," ".join(missing_rs)
251 changes: 83 additions & 168 deletions LDlink/LDhap.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from bson import json_util
import subprocess
import sys
from LDcommon import checkS3File, connectMongoDBReadOnly, genome_build_vars, retrieveTabix1000GData
from LDcommon import checkS3File, connectMongoDBReadOnly, genome_build_vars, retrieveTabix1000GData,parse_vcf

# Create LDhap function
def calculate_hap(snplst, pop, request, web, genome_build):
Expand Down Expand Up @@ -196,14 +196,13 @@ def replace_coords_rsid(db, snp_lst):

snp_coord_str = [genome_build_vars[genome_build]['1000G_chr_prefix'] + snp_coords[0][1] + ":" + str(i) + "-" + str(i) for i in snp_pos_int]
tabix_coords = " " + " ".join(snp_coord_str)
print("tabix_coords", tabix_coords)
#print("tabix_coords", tabix_coords)
# # Extract 1000 Genomes phased genotypes
vcf_filePath = "%s/%s%s/%s" % (config['aws']['data_subfolder'], genotypes_dir, genome_build_vars[genome_build]['1000G_dir'], genome_build_vars[genome_build]['1000G_file'] % (snp_coords[0][1]))
vcf_query_snp_file = "s3://%s/%s" % (config['aws']['bucket'], vcf_filePath)

checkS3File(aws_info, config['aws']['bucket'], vcf_filePath)
vcf = retrieveTabix1000GData(vcf_query_snp_file, tabix_coords, data_dir + genotypes_dir + genome_build_vars[genome_build]['1000G_dir'])

vcf,h = retrieveTabix1000GData(vcf_query_snp_file, tabix_coords, data_dir + genotypes_dir + genome_build_vars[genome_build]['1000G_dir'])
# Define function to correct indel alleles
def set_alleles(a1, a2):
if len(a1) == 1 and len(a2) == 1:
Expand All @@ -220,15 +219,10 @@ def set_alleles(a1, a2):
a2_n = a2[1:]
return(a1_n, a2_n)


# Make sure there are genotype data in VCF file
if vcf[-1][0:6] == "#CHROM":
output["error"] = "No query variants were found in 1000G VCF file. " + str(output["warning"] if "warning" in output else "")
return(json.dumps(output, sort_keys=True, indent=2))

h = 0
while vcf[h][0:2] == "##":
h += 1
#if vcf[-1][0:6] == "#CHROM":
# output["error"] = "No query variants were found in 1000G VCF file. " + str(output["warning"] if "warning" in output else "")
# return(json.dumps(output, sort_keys=True, indent=2))

head = vcf[h].strip().split()

Expand All @@ -245,169 +239,90 @@ def set_alleles(a1, a2):
for i in range(len(index)-1):
hap2.append([])

# parse vcf
snp_dict,missing_snp = parse_vcf(vcf[h+1:],snp_coords)
# throw error if no data is returned from 1000G
if len(missing_snp.split()) == len(snp_pos):
output["error"] = "Input variant list does not contain any valid RS numbers or coordinates. " + str(output["warning"] if "warning" in output else "")
return(json.dumps(output, sort_keys=True, indent=2))

if len(missing_snp) > 0:
output["warning"] = "Query variant " + str(missing_snp) + " is missing from 1000G (" + genome_build_vars[genome_build]['title'] + ") data. " + str(output["warning"] if "warning" in output else "")

rsnum_lst = []
allele_lst = []
pos_lst = []

unique_vcf = []
dup_vcf = []
for g in range(h+1, len(vcf)):
geno = vcf[g].strip().split()
geno[0] = geno[0].lstrip('chr')
temp = geno[0]+geno[1]
if temp not in unique_vcf:
unique_vcf.append(temp)
else:
dup_vcf.append(temp)
if snp_pos.count(geno[1]) == 1:
rs_query = rs_nums[snp_pos.index(geno[1])]
warningmsg = "Variant " + rs_query + " is not biallelic, variant removed. "

for s_key in snp_dict:
# parse snp_key such as chr7:pos_rs4
snp_keys = s_key.split("_")
snp_key = snp_keys[0].split(':')[1]
rs_input = snp_keys[1]
geno_list = snp_dict[s_key]
g = -1
for geno in geno_list:
g = g+1
geno = geno.strip().split()
geno[0] = geno[0].lstrip('chr')
# if 1000G position does not match dbSNP position for variant, use dbSNP position
if geno[1] != snp_key:
mismatch_msg = "Genomic position ("+geno[1]+") in 1000G data does not match dbSNP" + \
dbsnp_version + " (" + genome_build_vars[genome_build]['title'] + ") search coordinates for query variant " +\
rs_input + ". "
if "warning" in output:
output["warning"] = output["warning"] + warningmsg
output["warning"] = output["warning"] + mismatch_msg
else:
output["warning"] = warningmsg


counter_dups = 0
vcf_pos_no_dup = []
# find if query SNPs yield duplicate results from 1000G data
for g in range(h+1, len(vcf)):
geno = vcf[g - counter_dups].strip().split()
geno[0] = geno[0].lstrip('chr')
temp = geno[0]+geno[1]
if temp in dup_vcf:
counter_dups = counter_dups + 1
vcf.pop(g - counter_dups)
if geno[1] not in vcf_pos_no_dup:
vcf_pos_no_dup.append(geno[1])
else:
vcf_pos_no_dup.append(geno[1])

# throw error if no data is returned from 1000G
if len(vcf[h+1:]) == 0:
output["error"] = "Input variant list does not contain any valid RS numbers or coordinates. " + str(output["warning"] if "warning" in output else "")
return(json.dumps(output, sort_keys=True, indent=2))

for g in range(h+1, len(vcf)): # 2 rows
geno = vcf[g].strip().split()
geno[0] = geno[0].lstrip('chr')
# if 1000G position does not match dbSNP position for variant, use dbSNP position
if geno[1] not in snp_pos:
snp_pos_index = rs_snp_pos[vcf_pos_no_dup.index(geno[1])]
if "warning" in output:
output["warning"] = output["warning"] + "Genomic position ("+geno[1]+") in VCF file does not match dbSNP" + \
dbsnp_version + " (" + genome_build_vars[genome_build]['title'] + ") search coordinates for query variant. "
else:
output["warning"] = "Genomic position ("+geno[1]+") in VCF file does not match dbSNP" + \
dbsnp_version + " (" + genome_build_vars[genome_build]['title'] + ") search coordinates for query variant. "
# throw an error in the event of missing query SNPs in 1000G data
if len(vcf_pos_no_dup) == len(snp_pos):
geno[1] = snp_pos[snp_pos_index]
else:
output["error"] = "One or more query variants were not found in 1000G VCF file. "
return(json.dumps(output, sort_keys=True, indent=2))

if snp_pos.count(geno[1]) == 1:
rs_query = rs_nums[snp_pos.index(geno[1])]

else:
pos_index = []
for p in range(len(snp_pos)):
if snp_pos[p] == geno[1]:
pos_index.append(p)
for p in pos_index:
if rs_nums[p] not in rsnum_lst:
rs_query = rs_nums[p]
break

if rs_query in rsnum_lst:
continue

rs_1000g = geno[2]

if rs_query == rs_1000g:
rsnum = rs_1000g
else:
count = -2
found = "false"
while count <= 2 and count+g < len(vcf):
geno_next = vcf[g+count].strip().split()
geno_next[0] = geno_next[0].lstrip('chr')
if len(geno_next) >= 3 and rs_query == geno_next[2]:
found = "true"
break
count += 1

if found == "false":
if "rs" in rs_1000g:
if "warning" in output:
output["warning"] = output["warning"] + \
"Genomic position for query variant ("+rs_query + \
") does not match RS number at 1000G position (chr" + \
geno[0]+":"+geno[1]+" = "+rs_1000g+"). "
output["warning"] = mismatch_msg
# throw an error in the event of missing query SNPs in 1000G data
geno[1] = snp_key

if "," not in geno[3] and "," not in geno[4]:
a1, a2 = set_alleles(geno[3], geno[4])
count0 = 0
count1 = 0
#print(geno)
for i in range(len(index)):
if geno[index[i]] == "0|0":
hap1[i].append(a1)
hap2[i].append(a1)
count0 += 2
elif geno[index[i]] == "0|1":
hap1[i].append(a1)
hap2[i].append(a2)
count0 += 1
count1 += 1
elif geno[index[i]] == "1|0":
hap1[i].append(a2)
hap2[i].append(a1)
count0 += 1
count1 += 1
elif geno[index[i]] == "1|1":
hap1[i].append(a2)
hap2[i].append(a2)
count1 += 2
elif geno[index[i]] == "0":
hap1[i].append(a1)
hap2[i].append(".")
count0 += 1
elif geno[index[i]] == "1":
hap1[i].append(a2)
hap2[i].append(".")
count1 += 1
else:
output["warning"] = "Genomic position for query variant ("+rs_query + \
") does not match RS number at 1000G position (chr" + \
geno[0]+":"+geno[1]+" = "+rs_1000g+"). "

indx = [i[0] for i in snps].index(rs_query)
# snps[indx][0]=geno[2]
# rsnum=geno[2]
snps[indx][0] = rs_query
rsnum = rs_query
else:
continue

if "," not in geno[3] and "," not in geno[4]:
a1, a2 = set_alleles(geno[3], geno[4])
count0 = 0
count1 = 0
for i in range(len(index)):
if geno[index[i]] == "0|0":
hap1[i].append(a1)
hap2[i].append(a1)
count0 += 2
elif geno[index[i]] == "0|1":
hap1[i].append(a1)
hap2[i].append(a2)
count0 += 1
count1 += 1
elif geno[index[i]] == "1|0":
hap1[i].append(a2)
hap2[i].append(a1)
count0 += 1
count1 += 1
elif geno[index[i]] == "1|1":
hap1[i].append(a2)
hap2[i].append(a2)
count1 += 2
elif geno[index[i]] == "0":
hap1[i].append(a1)
hap2[i].append(".")
count0 += 1
elif geno[index[i]] == "1":
hap1[i].append(a2)
hap2[i].append(".")
count1 += 1
hap1[i].append(".")
hap2[i].append(".")
rsnum_lst.append(rs_input)
position = "chr"+geno[0]+":"+geno[1]
pos_lst.append(position)
f0 = round(float(count0)/(count0+count1), 4)
f1 = round(float(count1)/(count0+count1), 4)
if f0 >= f1:
alleles = a1+"="+str(round(f0, 3))+", " + \
a2+"="+str(round(f1, 3))
else:
hap1[i].append(".")
hap2[i].append(".")

rsnum_lst.append(rsnum)

position = "chr"+geno[0]+":"+geno[1]
pos_lst.append(position)

f0 = round(float(count0)/(count0+count1), 4)
f1 = round(float(count1)/(count0+count1), 4)
if f0 >= f1:
alleles = a1+"="+str(round(f0, 3))+", " + \
a2+"="+str(round(f1, 3))
else:
alleles = a2+"="+str(round(f1, 3))+", " + \
a1+"="+str(round(f0, 3))
allele_lst.append(alleles)

alleles = a2+"="+str(round(f1, 3))+", " + \
a1+"="+str(round(f0, 3))
allele_lst.append(alleles)

haps = {}
for i in range(len(index)):
Expand Down
File renamed without changes.
4 changes: 2 additions & 2 deletions LDlink/LDlink-5.3.0.js → LDlink/LDlink-5.3.1.js
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ $(document).ready(function() {
});

// Load news text from news.html to news-container div
$.get("news-5.3.0.html", function (data) {
$.get("news-5.3.1.html", function (data) {
let tmpData = data.split("<p>")
let i = 0;
var newsHTMLList = [];
Expand Down Expand Up @@ -1810,7 +1810,7 @@ function populateHeaderValues(event, numFiles, label) {
}

function loadHelp() {
$('#help-tab').load('help-5.3.0.html');
$('#help-tab').load('help-5.3.1.html');
}

function calculate(e) {
Expand Down
Loading

0 comments on commit a374f1b

Please sign in to comment.