From f1bcc2074e6eccb66df6853caa02fb809f27368f Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Mon, 27 Feb 2023 19:55:58 -0500
Subject: [PATCH 1/2] Add function to yield Terms from file

---
 gilda/grounder.py | 45 ++++++++++++++++++++++++++++++++-------------
 1 file changed, 32 insertions(+), 13 deletions(-)

diff --git a/gilda/grounder.py b/gilda/grounder.py
index c186ba3..da0ab71 100644
--- a/gilda/grounder.py
+++ b/gilda/grounder.py
@@ -7,7 +7,7 @@
 from pathlib import Path
 from collections import defaultdict, Counter
 from textwrap import dedent
-from typing import List, Mapping, Optional, Set, Tuple, Union
+from typing import Iterator, List, Mapping, Optional, Set, Tuple, Union
 from adeft.disambiguate import load_disambiguator
 from adeft.modeling.classify import load_model_info
 from adeft import available_shortforms as available_adeft_models
@@ -23,6 +23,7 @@
     "GrounderInput",
     "ScoredMatch",
     "load_terms_file",
+    "load_entries_from_terms_file",
     "filter_for_organism",
     "load_adeft_models",
     "load_gilda_models",
@@ -542,20 +543,19 @@ def get_grounding_dict(self) -> Mapping[str, str]:
         }
 
 
-def load_terms_file(terms_file: Union[str, Path]) -> Mapping[str, List[Term]]:
-    """Load a TSV file containing terms into a lookup dictionary.
+def load_entries_from_terms_file(terms_file: Union[str, Path]) -> Iterator[Term]:
+    """Yield Terms from a gzip-compressed terms TSV file.
 
     Parameters
     ----------
     terms_file :
-        Path to a TSV terms file with columns corresponding to the serialized
-        elements of a Term.
+        Path to a compressed TSV terms file with columns corresponding to the
+        serialized elements of a Term.
 
     Returns
     -------
     :
-        A lookup dictionary whose keys are normalized entity texts, and values
-        are lists of Terms with that normalized entity text.
+        A generator that yields each Term parsed from the file, one per row.
     """
     with gzip.open(terms_file, 'rt', encoding='utf-8') as fh:
         entries = {}
@@ -564,12 +564,31 @@ def load_terms_file(terms_file: Union[str, Path]) -> Mapping[str, List[Term]]:
         next(reader)
         for row in reader:
             row_nones = [r if r else None for r in row]
-            entry = Term(*row_nones)
-            if row[0] in entries:
-                entries[row[0]].append(entry)
-            else:
-                entries[row[0]] = [entry]
-        return entries
+            yield Term(*row_nones)
+
+
+def load_terms_file(terms_file: Union[str, Path]) -> Mapping[str, List[Term]]:
+    """Load a TSV file containing terms into a lookup dictionary.
+
+    Parameters
+    ----------
+    terms_file :
+        Path to a compressed TSV terms file with columns corresponding to the
+        serialized elements of a Term.
+
+    Returns
+    -------
+    :
+        A lookup dictionary whose keys are normalized entity texts, and values
+        are lists of Terms with that normalized entity text.
+    """
+    entries = {}
+    for term in load_entries_from_terms_file(terms_file):
+        if term.norm_text in entries:
+            entries[term.norm_text].append(term)
+        else:
+            entries[term.norm_text] = [term]
+    return entries
 
 
 def filter_for_organism(terms, organisms):

From 1741bb70ee82a800981fbea12c3c87d2df0b0e30 Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Tue, 28 Feb 2023 09:47:02 -0500
Subject: [PATCH 2/2] Add a function to dump terms

---
 gilda/generate_terms.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/gilda/generate_terms.py b/gilda/generate_terms.py
index 1b8c16b..993d6f5 100644
--- a/gilda/generate_terms.py
+++ b/gilda/generate_terms.py
@@ -694,9 +694,8 @@ def get_all_terms():
     return terms
 
 
-def main():
-    terms = get_all_terms()
-    from .resources import GROUNDING_TERMS_PATH as fname
+def dump_terms(terms, fname):
+    """Dump a list of terms to a tsv.gz file."""
     logger.info('Dumping into %s' % fname)
     header = ['norm_text', 'text', 'db', 'id', 'entry_name', 'status',
               'source', 'organism', 'source_db', 'source_id']
@@ -706,5 +705,11 @@ def main():
         writer.writerows(t.to_list() for t in terms)
 
 
+def main():
+    from .resources import GROUNDING_TERMS_PATH as fname
+    terms = get_all_terms()
+    dump_terms(terms, fname)
+
+
 if __name__ == '__main__':
     main()