diff --git a/gilda/generate_terms.py b/gilda/generate_terms.py
index 1b8c16b..993d6f5 100644
--- a/gilda/generate_terms.py
+++ b/gilda/generate_terms.py
@@ -694,9 +694,8 @@ def get_all_terms():
     return terms


-def main():
-    terms = get_all_terms()
-    from .resources import GROUNDING_TERMS_PATH as fname
+def dump_terms(terms, fname):
+    """Dump a list of terms to a tsv.gz file."""
     logger.info('Dumping into %s' % fname)
     header = ['norm_text', 'text', 'db', 'id', 'entry_name', 'status',
               'source', 'organism', 'source_db', 'source_id']
@@ -706,5 +705,11 @@ def main():
         writer.writerows(t.to_list() for t in terms)


+def main():
+    from .resources import GROUNDING_TERMS_PATH as fname
+    terms = get_all_terms()
+    dump_terms(terms, fname)
+
+
 if __name__ == '__main__':
     main()
diff --git a/gilda/grounder.py b/gilda/grounder.py
index c186ba3..da0ab71 100644
--- a/gilda/grounder.py
+++ b/gilda/grounder.py
@@ -7,7 +7,7 @@
 from pathlib import Path
 from collections import defaultdict, Counter
 from textwrap import dedent
-from typing import List, Mapping, Optional, Set, Tuple, Union
+from typing import Iterator, List, Mapping, Optional, Set, Tuple, Union
 from adeft.disambiguate import load_disambiguator
 from adeft.modeling.classify import load_model_info
 from adeft import available_shortforms as available_adeft_models
@@ -23,6 +23,7 @@
     "GrounderInput",
     "ScoredMatch",
     "load_terms_file",
+    "load_entries_from_terms_file",
     "filter_for_organism",
     "load_adeft_models",
     "load_gilda_models",
@@ -542,20 +543,19 @@ def get_grounding_dict(self) -> Mapping[str, str]:
         }


-def load_terms_file(terms_file: Union[str, Path]) -> Mapping[str, List[Term]]:
-    """Load a TSV file containing terms into a lookup dictionary.
+def load_entries_from_terms_file(terms_file: Union[str, Path]) -> Iterator[Term]:
+    """Yield Terms from a compressed terms TSV file path.

     Parameters
     ----------
     terms_file :
-        Path to a TSV terms file with columns corresponding to the serialized
-        elements of a Term.
+        Path to a compressed TSV terms file with columns corresponding to the
+        serialized elements of a Term.

     Returns
     -------
     :
-        A lookup dictionary whose keys are normalized entity texts, and values
-        are lists of Terms with that normalized entity text.
+        Terms loaded from the file yielded by a generator.
     """
     with gzip.open(terms_file, 'rt', encoding='utf-8') as fh:
         entries = {}
@@ -564,12 +564,31 @@ def load_terms_file(terms_file: Union[str, Path]) -> Mapping[str, List[Term]]:
         next(reader)
         for row in reader:
             row_nones = [r if r else None for r in row]
-            entry = Term(*row_nones)
-            if row[0] in entries:
-                entries[row[0]].append(entry)
-            else:
-                entries[row[0]] = [entry]
-    return entries
+            yield Term(*row_nones)
+
+
+def load_terms_file(terms_file: Union[str, Path]) -> Mapping[str, List[Term]]:
+    """Load a TSV file containing terms into a lookup dictionary.
+
+    Parameters
+    ----------
+    terms_file :
+        Path to a compressed TSV terms file with columns corresponding to the
+        serialized elements of a Term.
+
+    Returns
+    -------
+    :
+        A lookup dictionary whose keys are normalized entity texts, and values
+        are lists of Terms with that normalized entity text.
+    """
+    entries = {}
+    for term in load_entries_from_terms_file(terms_file):
+        if term.norm_text in entries:
+            entries[term.norm_text].append(term)
+        else:
+            entries[term.norm_text] = [term]
+    return entries


 def filter_for_organism(terms, organisms):
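# Not part of the patch: a minimal usage sketch of the refactored helpers,
# assuming the patch is applied and gilda is importable. The Term field
# values and the my_terms.tsv.gz path are illustrative only; the positional
# Term arguments are assumed to mirror the TSV columns written by dump_terms
# (norm_text, text, db, id, entry_name, status, source, ...).
from gilda.term import Term
from gilda.generate_terms import dump_terms
from gilda.grounder import load_entries_from_terms_file, load_terms_file

terms = [Term('braf', 'BRAF', 'HGNC', '1097', 'BRAF', 'name', 'hgnc')]

# Write terms with the new standalone dumper instead of going through main().
dump_terms(terms, 'my_terms.tsv.gz')

# Stream the terms back lazily without building the lookup dictionary...
for term in load_entries_from_terms_file('my_terms.tsv.gz'):
    print(term.norm_text, term.entry_name)

# ...or build the norm_text -> [Term, ...] lookup dictionary as before.
lookup = load_terms_file('my_terms.tsv.gz')
print(lookup['braf'])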