Skip to content

Commit

Permalink
Add keyword filter to PMB and modify filtering logic, closes #60
Browse files (browse the repository at this point in the history)
  • Loading branch information
davidmezzetti committed Jan 6, 2025
1 parent 32e0f30 commit 0e2bfcb
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 30 deletions.
52 changes: 27 additions & 25 deletions src/python/paperetl/file/pmb.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,8 @@ def parse(stream, source, config):
config: path to config directory
"""

# Load id and MeSH filters, if available
ids, codes = PMB.load(config, "ids"), PMB.load(config, "codes")
# Load id, MeSH and keyword filters, if available
ids, codes, keywords = PMB.load(config, "ids"), PMB.load(config, "codes"), PMB.load(config, "keywords")

# Convert ids to ints
ids = set(int(x) for x in ids) if ids else None
Expand All @@ -65,19 +65,29 @@ def parse(stream, source, config):

for event, element in document:
if event == "end" and element.tag == "PubmedArticle":
yield PMB.process(element, source, ids, codes)
yield PMB.process(element, source, ids, codes, keywords)
root.clear()

@staticmethod
def process(element, source, ids, codes):
def process(element, source, ids, codes, keywords):
"""
Processes a single XML article element into an Article.
This method applies the following logic for filters.
- Match on ids
- Match on MeSH codes
- Match on keywords
A filter can only fail when it has a list of terms to check; a filter with no terms is inactive and ignored.
When multiple filters are active, an article is kept if at least one of them passes.
Args:
element: XML element
source: text string describing stream source, can be None
ids: List of ids to select, can be None
codes: List of MeSH codes to select, can be None
keywords: List of keywords to search for, can be None
Returns:
Article or None if Article not parsed
Expand All @@ -90,13 +100,15 @@ def process(element, source, ids, codes):
# General fields
uid = int(citation.find("PMID").text)

# If ids is set, check before processing rest of record
if ids and uid not in ids:
# Apply ids filter, skip article if ids filter fails and no other filters active
idsfail = ids and uid not in ids
if idsfail and not codes and not keywords:
return None

# If MeSH codes is set and codes were parsed, check before processing rest of record
# Apply MeSH codes filter, skip article if all active filters fail
mesh = PMB.mesh(citation)
if mesh and codes and not any(x for x in mesh if x in codes):
codesfail = codes and not any(x for x in mesh if x in codes)
if (not ids or idsfail) and codesfail and not keywords:
return None

source = source if source else "PMB"
Expand All @@ -117,24 +129,14 @@ def process(element, source, ids, codes):
# Abstract text
sections = PMB.sections(article, title)

# Require title and at least one section
if len(sections) > 1:
# Article metadata - id, source, published, publication, authors, affiliations, affiliation, title,
# tags, reference, entry date
metadata = (
str(uid),
source,
published,
publication,
authors,
affiliations,
affiliation,
title,
tags,
reference,
entry,
)
# Apply keywords filter, skip article if all active filters fail
wordsfail = keywords and not any(k for k in keywords if any(x for _, x in sections if k.lower() in x.lower()))
if (not ids or idsfail) and (not codes or codesfail) and wordsfail:
return None

# Require title (title is in sections) and at least one other section
if len(sections) > 1:
metadata = (str(uid), source, published, publication, authors, affiliations, affiliation, title, tags, reference, entry)
return Article(metadata, sections)

return None
Expand Down
14 changes: 9 additions & 5 deletions test/python/testfiledatabase.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,16 +37,20 @@ def setUpClass(cls):
"""

# Build articles database
Execute.run(Utils.FILE + "/data", Utils.FILE + "/models", None, True)
Execute.run(Utils.FILE + "/data", Utils.FILE + "/models", None, True, 1)

# Generate filter files
with open(Utils.FILE + "/models/codes", "w", encoding="utf-8") as output:
# Generate ids filter file
with open(Utils.FILE + "/models/ids", "w", encoding="utf-8") as output:
output.write("0\n")

with open(Utils.FILE + "/models/ids", "w", encoding="utf-8") as output:
# Run again with replace=False and ids filtering
Execute.run(Utils.FILE + "/data", Utils.FILE + "/models", Utils.FILE + "/models")

# Generate codes filter file
with open(Utils.FILE + "/models/codes", "w", encoding="utf-8") as output:
output.write("0\n")

# Run again with replace=False and filtering
# Run again with replace=False and ids + codes filtering
Execute.run(Utils.FILE + "/data", Utils.FILE + "/models", Utils.FILE + "/models")

def setUp(self):
Expand Down

0 comments on commit 0e2bfcb

Please sign in to comment.