From e9e81a0f64e9ded77d0a7997cd0a768e79674ce2 Mon Sep 17 00:00:00 2001
From: Rohan Juneja <rohanj2006@gmail.com>
Date: Fri, 9 Feb 2024 18:44:36 -0800
Subject: [PATCH] add split_colon; split_chembl

---
 parser.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/parser.py b/parser.py
index 8873c09..384e3ee 100644
--- a/parser.py
+++ b/parser.py
@@ -68,7 +68,7 @@
 COLUMN_DATA stores where each column should go in the actual document. Each key corresponds to a column name in the csv. For each of these, four fields must be defined:
 
 - "location": defines the location of the column in the document, nested layers should be separated with a "."
-- "type": The type of data in the column. Supported types include "int", "string", "split_comma" (ie. "a,b,c"), and "split_semicolon" (ie "a;b;c")
+- "type": The type of data in the column. Supported types include "int", "string", "split_comma" (e.g. "a,b,c"), "split_semicolon" (e.g. "a; b; c"), "split_chembl" (e.g. "CHEMBL1CHEMBL2CHEMBL3"), and "split_colon" (e.g. "a::b::c")
 - "uniprot_type": This indicates whether the data is specifically applicable for "swissprot" or "trembl" documents, if it is applicable for both use "all"
 - "relation": This simply should store a boolean of whether this field is in the relation of the document, as the relation is treated a bit differently in terms of merging
 """
@@ -105,13 +105,13 @@
     },
     "BindingDB Ligand Name": {
       "location": "object.name",
-      "type": "string",
+      "type": "split_colon",
       "uniprot_type": "all",
       "relation": False
     },
     "Target Name Assigned by Curator or DataSource": {
       "location": "subject.name",
-      "type": "string",
+      "type": "split_colon",
       "uniprot_type": "all",
       "relation": False
     },
@@ -261,7 +261,7 @@
     },
     "ChEMBL ID of Ligand": {
       "location": "object.chembl",
-      "type": "string",
+      "type": "split_chembl",
       "uniprot_type": "all",
       "relation": False
     },
@@ -377,7 +377,7 @@ def append_field(doc: dict, key: str, value: any):
     for i in keys[:len(keys)-1]:
         key_ref = key_ref[i]
 
-    if COLUMN_DATA[key]['type'] == "split_comma" or COLUMN_DATA[key]['type'] == "split_semicolon":
+    if COLUMN_DATA[key]['type'] == "split_comma" or COLUMN_DATA[key]['type'] == "split_semicolon" or COLUMN_DATA[key]['type'] == "split_chembl" or COLUMN_DATA[key]['type'] == "split_colon":
         if isinstance(key_ref[keys[-1]][0], list) and value not in key_ref[keys[-1]]:
             key_ref[keys[-1]].append(value)
         if not isinstance(key_ref[keys[-1]][0], list) and value != key_ref[keys[-1]]:
@@ -427,6 +427,10 @@ def process_field(field_name: str, value: str):
         return value.split(',')
     if field_type == "split_semicolon":
         return value.split('; ')
+    if field_type == "split_colon":
+        return value.split('::')
+    if field_type == "split_chembl":
+        return ['CHEMBL' + x for x in value.split('CHEMBL') if x != '']
     return value