Skip to content

Commit

Permalink
Merge pull request #105 from indralab/depluralize
Browse files Browse the repository at this point in the history
Improve depluralization
  • Loading branch information
bgyori authored Jan 3, 2023
2 parents a189a7b + 78cdc66 commit be74335
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 23 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.6", "3.10"]
python-version: ["3.7", "3.10"]
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
Expand Down
4 changes: 2 additions & 2 deletions gilda/grounder.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,8 +123,8 @@ def _generate_lookups(self, raw_str: str) -> Set[str]:
roman_arabic = normalize(replace_roman_arabic(raw_str))
lookups.add(roman_arabic)
# Finally, we attempt to depluralize the word
depluralized = normalize(depluralize(raw_str)[0])
lookups.add(depluralized)
for singular, rule in depluralize(raw_str):
lookups.add(normalize(singular))

logger.debug('Looking up the following strings: %s' %
', '.join(lookups))
Expand Down
31 changes: 18 additions & 13 deletions gilda/process.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
"""Module containing various string processing functions used for grounding."""
from typing import List, Tuple

import regex as re
import unidecode

Expand Down Expand Up @@ -168,7 +170,7 @@ def get_capitalization_pattern(word, beginning_of_sentence=False):
return 'mixed'


def depluralize(word):
def depluralize(word: str) -> List[Tuple[str, str]]:
"""Return the depluralized version of the word, along with a status flag.
Parameters
Expand All @@ -178,43 +180,46 @@ def depluralize(word):
Returns
-------
str
list of str pairs:
The original word, if it is detected to be non-plural, or the
depluralized version of the word.
str
A status flag representing the detected pluralization status of the
depluralized version of the word, and a status flag representing the
detected pluralization status of the
word, with non_plural (e.g., BRAF), plural_oes (e.g., mosquitoes),
plural_ies (e.g., antibodies), plural_es (e.g., switches),
plural_caps_s (e.g., MAPKs), and plural_s (e.g., receptors).
"""
# If the word doesn't end in s, we assume it's not plural
if not word.endswith('s'):
return word, 'non_plural'
return [(word, 'non_plural')]
# Another case is words ending in -sis (e.g., apoptosis), these are almost
# exclusively non plural so we return here too
elif word.endswith('sis'):
return word, 'non_plural'
return [(word, 'non_plural')]
# This is the case when the word ends with an o which is pluralized as oes
# e.g., mosquitoes
elif word.endswith('oes'):
return word[:-2], 'plural_oes'
return [(word[:-2], 'plural_oes'),
(word[:-1], 'plural_s')]
# This is the case when the word ends with a y which is pluralized as ies,
# e.g., antibodies
elif word.endswith('ies'):
return word[:-3] + 'y', 'plural_ies'
return [(word[:-3] + 'y', 'plural_ies'),
(word[:-1], 'plural_s')]
# These are the cases where words form plurals by adding -es so we
# return by stripping it off
# return by stripping it off. However, it's not possible to determine
# if the word doesn't end in e.g., -xe or -se in a singular form, and
# so we also return a variant to account for this.
elif word.endswith(('xes', 'ses', 'ches', 'shes')):
return word[:-2], 'plural_es'
return [(word[:-2], 'plural_es'), (word[:-1], 'plural_s')]
# If the word is all caps and the last letter is an s, then it's a very
# strong signal that it is pluralized so we have a custom return value
# for that
elif re.match(r'^\p{Lu}+$', word[:-1]):
return word[:-1], 'plural_caps_s'
return [(word[:-1], 'plural_caps_s')]
# Otherwise, we just go with the assumption that the last s is the
# plural marker
else:
return word[:-1], 'plural_s'
return [(word[:-1], 'plural_s')]
# Note: there don't seem to be any compelling examples of -f or -fe -> ves
# so it is not implemented

Expand Down
19 changes: 12 additions & 7 deletions gilda/tests/test_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,18 @@


def test_depluralize():
assert depluralize('BRAF') == ('BRAF', 'non_plural')
assert depluralize('apoptosis') == ('apoptosis', 'non_plural')
assert depluralize('mosquitoes') == ('mosquito', 'plural_oes')
assert depluralize('antibodies') == ('antibody', 'plural_ies')
assert depluralize('branches') == ('branch', 'plural_es')
assert depluralize('CDs') == ('CD', 'plural_caps_s')
assert depluralize('receptors') == ('receptor', 'plural_s')
assert depluralize('BRAF') == [('BRAF', 'non_plural')]
assert depluralize('apoptosis') == [('apoptosis', 'non_plural')]
assert depluralize('mosquitoes') == [('mosquito', 'plural_oes'),
('mosquitoe', 'plural_s')]
assert depluralize('antibodies') == [('antibody', 'plural_ies'),
('antibodie', 'plural_s')]
assert depluralize('branches') == [('branch', 'plural_es'),
('branche', 'plural_s')]
assert depluralize('CDs') == [('CD', 'plural_caps_s')]
assert depluralize('receptors') == [('receptor', 'plural_s')]
assert depluralize('kinases') == [('kinas', 'plural_es'),
('kinase', 'plural_s')]


def test_greek():
Expand Down

0 comments on commit be74335

Please sign in to comment.