From 0e2bfcb6e0c3d91f32df082d4137a858a0660151 Mon Sep 17 00:00:00 2001 From: davidmezzetti <561939+davidmezzetti@users.noreply.github.com> Date: Mon, 6 Jan 2025 08:33:15 -0500 Subject: [PATCH] Add keyword filter to PMB and modify filtering logic, closes #60 --- src/python/paperetl/file/pmb.py | 52 +++++++++++++++++---------------- test/python/testfiledatabase.py | 14 +++++---- 2 files changed, 36 insertions(+), 30 deletions(-) diff --git a/src/python/paperetl/file/pmb.py b/src/python/paperetl/file/pmb.py index 3f1bfe4..1a92806 100644 --- a/src/python/paperetl/file/pmb.py +++ b/src/python/paperetl/file/pmb.py @@ -52,8 +52,8 @@ def parse(stream, source, config): config: path to config directory """ - # Load id and MeSH filters, if available - ids, codes = PMB.load(config, "ids"), PMB.load(config, "codes") + # Load id, MeSH and keyword filters, if available + ids, codes, keywords = PMB.load(config, "ids"), PMB.load(config, "codes"), PMB.load(config, "keywords") # Convert ids to ints ids = set(int(x) for x in ids) if ids else None @@ -65,19 +65,29 @@ def parse(stream, source, config): for event, element in document: if event == "end" and element.tag == "PubmedArticle": - yield PMB.process(element, source, ids, codes) + yield PMB.process(element, source, ids, codes, keywords) root.clear() @staticmethod - def process(element, source, ids, codes): + def process(element, source, ids, codes, keywords): """ Processes a single XML article element into an Article. + This method applies the following logic for filters. + + - Match on ids + - Match on MeSH codes + - Match on keywords + + Filters only fail if there is a list of terms to check. When there are multiple filters active, only one + filter has to pass. + Args: element: XML element source: text string describing stream source, can be None ids: List of ids to select, can be None codes: List of MeSH codes to select, can be None + keywords: List of keywords to search for, can be None Returns: Article or None if Article not parsed @@ -90,13 +100,15 @@ def process(element, source, ids, codes): # General fields uid = int(citation.find("PMID").text) - # If ids is set, check before processing rest of record - if ids and uid not in ids: + # Apply ids filter, skip article if ids filter fails and no other filters active + idsfail = ids and uid not in ids + if idsfail and not codes and not keywords: return None - # If MeSH codes is set and codes were parsed, check before processing rest of record + # Apply MeSH codes filter, skip article if all active filters fail mesh = PMB.mesh(citation) - if mesh and codes and not any(x for x in mesh if x in codes): + codesfail = codes and not any(x for x in mesh if x in codes) + if (not ids or idsfail) and codesfail and not keywords: return None source = source if source else "PMB" @@ -117,24 +129,14 @@ def process(element, source, ids, codes): # Abstract text sections = PMB.sections(article, title) - # Require title and at least one section - if len(sections) > 1: - # Article metadata - id, source, published, publication, authors, affiliations, affiliation, title, - # tags, reference, entry date - metadata = ( - str(uid), - source, - published, - publication, - authors, - affiliations, - affiliation, - title, - tags, - reference, - entry, - ) + # Apply keywords filter, skip article if all active filters fail + wordsfail = keywords and not any(k for k in keywords if any(x for _, x in sections if k.lower() in x.lower())) + if (not ids or idsfail) and (not codes or codesfail) and wordsfail: + return None + # Require title (title is in sections) and at least one other section + if len(sections) > 1: + metadata = (str(uid), source, published, publication, authors, affiliations, affiliation, title, tags, reference, entry) return Article(metadata, sections) return None diff --git a/test/python/testfiledatabase.py b/test/python/testfiledatabase.py index db21e8b..c01c13d 100644 --- a/test/python/testfiledatabase.py +++ b/test/python/testfiledatabase.py @@ -37,16 +37,20 @@ def setUpClass(cls): """ # Build articles database - Execute.run(Utils.FILE + "/data", Utils.FILE + "/models", None, True) + Execute.run(Utils.FILE + "/data", Utils.FILE + "/models", None, True, 1) - # Generate filter files - with open(Utils.FILE + "/models/codes", "w", encoding="utf-8") as output: + # Generate ids filter file + with open(Utils.FILE + "/models/ids", "w", encoding="utf-8") as output: output.write("0\n") - with open(Utils.FILE + "/models/ids", "w", encoding="utf-8") as output: + # Run again with replace=False and ids filtering + Execute.run(Utils.FILE + "/data", Utils.FILE + "/models", Utils.FILE + "/models") + + # Generate codes filter file + with open(Utils.FILE + "/models/codes", "w", encoding="utf-8") as output: output.write("0\n") - # Run again with replace=False and filtering + # Run again with replace=False and ids + codes filtering Execute.run(Utils.FILE + "/data", Utils.FILE + "/models", Utils.FILE + "/models") def setUp(self):