Merge pull request #181 from CBIIT/ldlink-5.3.1

Ldlink 5.3.1
CBIIT · Apr 28, 2022 · a374f1b · a374f1b
2 parents 1c1bb31 + 445327a
commit a374f1b
Show file tree

Hide file tree

Showing 11 changed files with 322 additions and 613 deletions.
diff --git a/LDlink/LDassoc.py b/LDlink/LDassoc.py
@@ -77,7 +77,7 @@ def calculate_assoc(file, region, pop, request, genome_build, web, myargs):
 				mongo_host = 'localhost'
 			client = MongoClient('mongodb://' + mongo_username + ':' + mongo_password + '@' + mongo_host+'/admin', mongo_port)
 			db = client["LDLink"]
-
+			
 			def get_coords_var(db, rsid):
 				rsid = rsid.strip("rs")
 				query_results = db.dbsnp.find_one({"id": rsid})

diff --git a/LDlink/LDcommon.py b/LDlink/LDcommon.py
@@ -48,7 +48,7 @@
     },
     "grch38_high_coverage": {
         "title": "GRCh38 High Coverage",
-        "title_hg": "hg38",
+        "title_hg": "hg38_HC",
         "chromosome": "chromosome_grch38",
         "position": "position_grch38",
         "gene_begin": "begin_grch38",
@@ -110,11 +110,14 @@ def connectMongoDBReadOnly(web):
 
 def retrieveTabix1000GData(query_file, coords, query_dir):
     export_s3_keys = retrieveAWSCredentials()
-    tabix_snps = export_s3_keys + " cd {2}; tabix -fhD {0}{1} | grep -v -e END".format(
+    tabix_snps = export_s3_keys + " cd {2}; tabix -fhD --separate-regions {0}{1} | grep -v -e END".format(
         query_file, coords, query_dir)
     # print("tabix_snps", tabix_snps)
     vcf = [x.decode('utf-8') for x in subprocess.Popen(tabix_snps, shell=True, stdout=subprocess.PIPE).stdout.readlines()]
-    return vcf
+    h = 0
+    while vcf[h][0:2] == "##":
+        h += 1
+    return vcf,h
 
 # Query genomic coordinates
 def get_rsnum(db, coord, genome_build):
@@ -214,3 +217,43 @@ def getRecomb(db, filename, chromosome, begin, end, genome_build):
                 genome_build_vars[genome_build]['position']: recomb_obj[genome_build_vars[genome_build]['position']]
             }) + '\n')
     return recomb_results_sanitized
+
+def parse_vcf(vcf,snp_coords):
+    delimiter = "#"
+    snp_lists = str('**'.join(vcf)).split(delimiter)
+    snp_dict = {}
+    snp_rs_dict = {}
+    missing_snp = []
+    missing_rs = []    
+    snp_found_list = [] 
+    #print(vcf)
+    #print(snp_lists)
+
+    for snp in snp_lists[1:]:
+        snp_tuple = snp.split("**")
+        snp_key = snp_tuple[0].split("-")[-1].strip()
+        vcf_list = [] 
+        #print(snp_tuple)
+        match_v = ''
+        for v in snp_tuple[1:]:#choose the matched one for dup; if no matched, choose first
+            if len(v) > 0:
+                match_v = v
+                geno = v.strip().split()
+                if geno[1] == snp_key:
+                    match_v = v
+        if len(match_v) > 0:
+            vcf_list.append(match_v)
+            snp_found_list.append(snp_key)   
+
+        #vcf_list.append(snp_tuple.pop()) #always use the last one, even dup
+        #create snp_key as chr7:pos_rs4
+        snp_dict[snp_key] = vcf_list
+
+    for snp_coord in snp_coords:
+        if snp_coord[-1] not in snp_found_list:
+            missing_rs.append(snp_coord[0])
+        else:
+            s_key = "chr"+snp_coord[1]+":"+snp_coord[2]+"_"+snp_coord[0]
+            snp_rs_dict[s_key] = snp_dict[snp_coord[2]]
+    del snp_dict
+    return snp_rs_dict," ".join(missing_rs)
diff --git a/LDlink/LDhap.py b/LDlink/LDhap.py
@@ -6,7 +6,7 @@
 from bson import json_util
 import subprocess
 import sys
-from LDcommon import checkS3File, connectMongoDBReadOnly, genome_build_vars, retrieveTabix1000GData
+from LDcommon import checkS3File, connectMongoDBReadOnly, genome_build_vars, retrieveTabix1000GData,parse_vcf
 
 # Create LDhap function
 def calculate_hap(snplst, pop, request, web, genome_build):
@@ -196,14 +196,13 @@ def replace_coords_rsid(db, snp_lst):
 
     snp_coord_str = [genome_build_vars[genome_build]['1000G_chr_prefix'] + snp_coords[0][1] + ":" + str(i) + "-" + str(i) for i in snp_pos_int]
     tabix_coords = " " + " ".join(snp_coord_str)
-    print("tabix_coords", tabix_coords)
+    #print("tabix_coords", tabix_coords)
     # # Extract 1000 Genomes phased genotypes
     vcf_filePath = "%s/%s%s/%s" % (config['aws']['data_subfolder'], genotypes_dir, genome_build_vars[genome_build]['1000G_dir'], genome_build_vars[genome_build]['1000G_file'] % (snp_coords[0][1]))
     vcf_query_snp_file = "s3://%s/%s" % (config['aws']['bucket'], vcf_filePath)
 
     checkS3File(aws_info, config['aws']['bucket'], vcf_filePath)
-    vcf = retrieveTabix1000GData(vcf_query_snp_file, tabix_coords, data_dir + genotypes_dir + genome_build_vars[genome_build]['1000G_dir'])
-
+    vcf,h = retrieveTabix1000GData(vcf_query_snp_file, tabix_coords, data_dir + genotypes_dir + genome_build_vars[genome_build]['1000G_dir'])
     # Define function to correct indel alleles
     def set_alleles(a1, a2):
         if len(a1) == 1 and len(a2) == 1:
@@ -220,15 +219,10 @@ def set_alleles(a1, a2):
             a2_n = a2[1:]
         return(a1_n, a2_n)
 
-
     # Make sure there are genotype data in VCF file
-    if vcf[-1][0:6] == "#CHROM":
-        output["error"] = "No query variants were found in 1000G VCF file. " + str(output["warning"] if "warning" in output else "")
-        return(json.dumps(output, sort_keys=True, indent=2))
-
-    h = 0
-    while vcf[h][0:2] == "##":
-        h += 1
+    #if vcf[-1][0:6] == "#CHROM":
+    #    output["error"] = "No query variants were found in 1000G VCF file. " + str(output["warning"] if "warning" in output else "")
+    #    return(json.dumps(output, sort_keys=True, indent=2))
 
     head = vcf[h].strip().split()
 
@@ -245,169 +239,90 @@ def set_alleles(a1, a2):
     for i in range(len(index)-1):
         hap2.append([])
 
+    # parse vcf
+    snp_dict,missing_snp = parse_vcf(vcf[h+1:],snp_coords)
+   # throw error if no data is returned from 1000G
+    if len(missing_snp.split()) == len(snp_pos):
+        output["error"] = "Input variant list does not contain any valid RS numbers or coordinates. " + str(output["warning"] if "warning" in output else "")
+        return(json.dumps(output, sort_keys=True, indent=2))
+
+    if len(missing_snp) > 0:
+        output["warning"] = "Query variant " + str(missing_snp) + " is missing from 1000G (" + genome_build_vars[genome_build]['title'] + ") data. " + str(output["warning"] if "warning" in output else "")
+
     rsnum_lst = []
     allele_lst = []
     pos_lst = []
-
-    unique_vcf = []
-    dup_vcf = []
-    for g in range(h+1, len(vcf)):
-        geno = vcf[g].strip().split()
-        geno[0] = geno[0].lstrip('chr')
-        temp = geno[0]+geno[1]
-        if temp not in unique_vcf:
-            unique_vcf.append(temp)
-        else:
-            dup_vcf.append(temp)
-            if snp_pos.count(geno[1]) == 1:
-                rs_query = rs_nums[snp_pos.index(geno[1])]
-                warningmsg = "Variant " + rs_query + " is not biallelic, variant removed. " 
+
+    for s_key in snp_dict:
+        # parse snp_key such as chr7:pos_rs4
+        snp_keys = s_key.split("_")
+        snp_key = snp_keys[0].split(':')[1]
+        rs_input = snp_keys[1]
+        geno_list = snp_dict[s_key] 
+        g = -1
+        for geno in geno_list:
+            g = g+1
+            geno = geno.strip().split()
+            geno[0] = geno[0].lstrip('chr')
+            # if 1000G position does not match dbSNP position for variant, use dbSNP position
+            if geno[1] != snp_key:
+                mismatch_msg = "Genomic position ("+geno[1]+") in 1000G data does not match dbSNP" + \
+                        dbsnp_version + " (" + genome_build_vars[genome_build]['title'] + ") search coordinates for query variant " +\
+                        rs_input + ". "
                 if "warning" in output:
-                    output["warning"] = output["warning"] + warningmsg
+                    output["warning"] = output["warning"] + mismatch_msg
                 else:
-                    output["warning"] = warningmsg
-
-
-    counter_dups = 0
-    vcf_pos_no_dup = []
-    # find if query SNPs yield duplicate results from 1000G data
-    for g in range(h+1, len(vcf)):
-        geno = vcf[g - counter_dups].strip().split()
-        geno[0] = geno[0].lstrip('chr')
-        temp = geno[0]+geno[1]
-        if temp in dup_vcf:
-            counter_dups = counter_dups + 1
-            vcf.pop(g - counter_dups)
-            if geno[1] not in vcf_pos_no_dup:
-                vcf_pos_no_dup.append(geno[1])
-        else:
-            vcf_pos_no_dup.append(geno[1])
-
-    # throw error if no data is returned from 1000G
-    if len(vcf[h+1:]) == 0:
-        output["error"] = "Input variant list does not contain any valid RS numbers or coordinates. " + str(output["warning"] if "warning" in output else "")
-        return(json.dumps(output, sort_keys=True, indent=2))
-
-    for g in range(h+1, len(vcf)): # 2 rows
-        geno = vcf[g].strip().split()
-        geno[0] = geno[0].lstrip('chr')
-        # if 1000G position does not match dbSNP position for variant, use dbSNP position
-        if geno[1] not in snp_pos:
-            snp_pos_index = rs_snp_pos[vcf_pos_no_dup.index(geno[1])]
-            if "warning" in output:
-                output["warning"] = output["warning"] + "Genomic position ("+geno[1]+") in VCF file does not match dbSNP" + \
-                    dbsnp_version + " (" + genome_build_vars[genome_build]['title'] + ") search coordinates for query variant. "
-            else:
-                output["warning"] = "Genomic position ("+geno[1]+") in VCF file does not match dbSNP" + \
-                    dbsnp_version + " (" + genome_build_vars[genome_build]['title'] + ") search coordinates for query variant. "
-            # throw an error in the event of missing query SNPs in 1000G data
-            if len(vcf_pos_no_dup) == len(snp_pos):
-                geno[1] = snp_pos[snp_pos_index]
-            else:
-                output["error"] = "One or more query variants were not found in 1000G VCF file. "
-                return(json.dumps(output, sort_keys=True, indent=2))
-
-        if snp_pos.count(geno[1]) == 1:
-            rs_query = rs_nums[snp_pos.index(geno[1])]
-
-        else:
-            pos_index = []
-            for p in range(len(snp_pos)):
-                if snp_pos[p] == geno[1]:
-                    pos_index.append(p)
-            for p in pos_index:
-                if rs_nums[p] not in rsnum_lst:
-                    rs_query = rs_nums[p]
-                    break
-
-        if rs_query in rsnum_lst:
-            continue
-
-        rs_1000g = geno[2]
-
-        if rs_query == rs_1000g:
-            rsnum = rs_1000g
-        else:
-            count = -2
-            found = "false"
-            while count <= 2 and count+g < len(vcf):
-                geno_next = vcf[g+count].strip().split()
-                geno_next[0] = geno_next[0].lstrip('chr')
-                if len(geno_next) >= 3 and rs_query == geno_next[2]:
-                    found = "true"
-                    break
-                count += 1
-
-            if found == "false":
-                if "rs" in rs_1000g:
-                    if "warning" in output:
-                        output["warning"] = output["warning"] + \
-                            "Genomic position for query variant ("+rs_query + \
-                            ") does not match RS number at 1000G position (chr" + \
-                            geno[0]+":"+geno[1]+" = "+rs_1000g+"). "
+                    output["warning"] = mismatch_msg
+                # throw an error in the event of missing query SNPs in 1000G data
+                geno[1] = snp_key
+
+            if "," not in geno[3] and "," not in geno[4]:
+                a1, a2 = set_alleles(geno[3], geno[4])
+                count0 = 0
+                count1 = 0
+                #print(geno)
+                for i in range(len(index)):
+                    if geno[index[i]] == "0|0":
+                        hap1[i].append(a1)
+                        hap2[i].append(a1)
+                        count0 += 2
+                    elif geno[index[i]] == "0|1":
+                        hap1[i].append(a1)
+                        hap2[i].append(a2)
+                        count0 += 1
+                        count1 += 1
+                    elif geno[index[i]] == "1|0":
+                        hap1[i].append(a2)
+                        hap2[i].append(a1)
+                        count0 += 1
+                        count1 += 1
+                    elif geno[index[i]] == "1|1":
+                        hap1[i].append(a2)
+                        hap2[i].append(a2)
+                        count1 += 2
+                    elif geno[index[i]] == "0":
+                        hap1[i].append(a1)
+                        hap2[i].append(".")
+                        count0 += 1
+                    elif geno[index[i]] == "1":
+                        hap1[i].append(a2)
+                        hap2[i].append(".")
+                        count1 += 1
                     else:
-                        output["warning"] = "Genomic position for query variant ("+rs_query + \
-                            ") does not match RS number at 1000G position (chr" + \
-                            geno[0]+":"+geno[1]+" = "+rs_1000g+"). "
-
-                indx = [i[0] for i in snps].index(rs_query)
-                # snps[indx][0]=geno[2]
-                # rsnum=geno[2]
-                snps[indx][0] = rs_query
-                rsnum = rs_query
-            else:
-                continue
-
-        if "," not in geno[3] and "," not in geno[4]:
-            a1, a2 = set_alleles(geno[3], geno[4])
-            count0 = 0
-            count1 = 0
-            for i in range(len(index)):
-                if geno[index[i]] == "0|0":
-                    hap1[i].append(a1)
-                    hap2[i].append(a1)
-                    count0 += 2
-                elif geno[index[i]] == "0|1":
-                    hap1[i].append(a1)
-                    hap2[i].append(a2)
-                    count0 += 1
-                    count1 += 1
-                elif geno[index[i]] == "1|0":
-                    hap1[i].append(a2)
-                    hap2[i].append(a1)
-                    count0 += 1
-                    count1 += 1
-                elif geno[index[i]] == "1|1":
-                    hap1[i].append(a2)
-                    hap2[i].append(a2)
-                    count1 += 2
-                elif geno[index[i]] == "0":
-                    hap1[i].append(a1)
-                    hap2[i].append(".")
-                    count0 += 1
-                elif geno[index[i]] == "1":
-                    hap1[i].append(a2)
-                    hap2[i].append(".")
-                    count1 += 1
+                        hap1[i].append(".")
+                        hap2[i].append(".")
+                rsnum_lst.append(rs_input)
+                position = "chr"+geno[0]+":"+geno[1]
+                pos_lst.append(position)
+                f0 = round(float(count0)/(count0+count1), 4)
+                f1 = round(float(count1)/(count0+count1), 4)
+                if f0 >= f1:
+                    alleles = a1+"="+str(round(f0, 3))+", " + \
+                        a2+"="+str(round(f1, 3))
                 else:
-                    hap1[i].append(".")
-                    hap2[i].append(".")
-
-            rsnum_lst.append(rsnum)
-
-            position = "chr"+geno[0]+":"+geno[1]
-            pos_lst.append(position)
-
-            f0 = round(float(count0)/(count0+count1), 4)
-            f1 = round(float(count1)/(count0+count1), 4)
-            if f0 >= f1:
-                alleles = a1+"="+str(round(f0, 3))+", " + \
-                    a2+"="+str(round(f1, 3))
-            else:
-                alleles = a2+"="+str(round(f1, 3))+", " + \
-                    a1+"="+str(round(f0, 3))
-            allele_lst.append(alleles)
-
+                    alleles = a2+"="+str(round(f1, 3))+", " + \
+                        a1+"="+str(round(f0, 3))
+                allele_lst.append(alleles)
 
     haps = {}
     for i in range(len(index)):

diff --git a/LDlink/LDlink-5.3.0.css → LDlink/LDlink-5.3.1.css b/LDlink/LDlink-5.3.0.css → LDlink/LDlink-5.3.1.css
diff --git a/LDlink/LDlink-5.3.0.js → LDlink/LDlink-5.3.1.js b/LDlink/LDlink-5.3.0.js → LDlink/LDlink-5.3.1.js
@@ -67,7 +67,7 @@ $(document).ready(function() {
     });
 
     // Load news text from news.html to news-container div
-    $.get("news-5.3.0.html", function (data) {
+    $.get("news-5.3.1.html", function (data) {
         let tmpData = data.split("<p>")
         let i = 0;
         var newsHTMLList = [];
@@ -1810,7 +1810,7 @@ function populateHeaderValues(event, numFiles, label) {
 }
 
 function loadHelp() {
-    $('#help-tab').load('help-5.3.0.html');
+    $('#help-tab').load('help-5.3.1.html');
 }
 
 function calculate(e) {