Skip to content

Commit

Permalink
Move over NLTK code from Coaster; revise Ruff config
Browse files Browse the repository at this point in the history
  • Loading branch information
jace committed May 13, 2024
1 parent 8b4cd7d commit 43aa155
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 66 deletions.
32 changes: 31 additions & 1 deletion hasjob/tagging.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
from collections import defaultdict
from collections.abc import Iterable
from urllib.parse import urljoin

import nltk
import requests

from coaster.nlp import extract_named_entities
from coaster.utils import text_blocks

from . import app, rq
Expand All @@ -25,6 +26,35 @@


# NOTE(review): ``@rq.job('hasjob')`` used to sit directly above this function.
# This helper was inserted between that decorator and ``tag_locations`` (defined
# next), so the decorator ended up wrapping this helper and ``tag_locations`` lost
# it. It is removed here; restore it on ``tag_locations``.
def extract_named_entities(text_blocks: Iterable[str]) -> set[str]:
    """
    Return a set of named entities extracted from the provided text blocks.

    Each text block is split into sentences, each sentence is word-tokenized and
    part-of-speech tagged, and the tagged sentences are chunked with NLTK's named
    entity chunker in binary mode. Every subtree labelled ``NE`` contributes one
    entity, formed by joining its tokens with single spaces.

    :param text_blocks: Iterable of plain-text strings to analyse. The name
        shadows the module-level ``text_blocks`` import inside this function; it
        is kept as-is because it is part of the call signature.
    :return: Unique entity names; an empty set if no entities are found.
    """
    sentences: list[str] = []
    for text in text_blocks:
        sentences.extend(nltk.sent_tokenize(text))

    tagged_sentences = [
        nltk.pos_tag(nltk.word_tokenize(sentence)) for sentence in sentences
    ]
    # ``binary=True`` is paired with the "NE" label check in the helper below.
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

    def extract_entity_names(tree: nltk.Tree) -> list[str]:
        """Recursively collect the text of every ``NE`` subtree under ``tree``."""
        # Nodes without a ``label`` attribute contribute nothing; presumably these
        # are the (word, tag) leaves of the chunk tree.
        if not hasattr(tree, "label"):
            return []
        if tree.label() == "NE":
            # The first item of each child is joined into the entity text.
            return [" ".join(child[0] for child in tree)]
        names: list[str] = []
        for child in tree:
            names.extend(extract_entity_names(child))
        return names

    entity_names: set[str] = set()
    for tree in chunked_sentences:
        entity_names.update(extract_entity_names(tree))
    return entity_names


def tag_locations(jobpost_id):
with app.test_request_context():
post = JobPost.query.get(jobpost_id)
Expand Down
102 changes: 38 additions & 64 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -86,64 +86,6 @@ exclude_dirs = ['node_modules', 'build/lib']
skips = ['*/*_test.py', '*/test_*.py']

[tool.ruff]
# This is a slight customisation of the default rules
# 1. Hasjob still supports Python 3.7 pending its EOL
# 2. Rule E402 (module-level import not top-level) is disabled as isort handles it
# 3. Rule E501 (line too long) is left to Black; some strings are worse for wrapping

# Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
select = ["E", "F"]
ignore = ["E402", "E501"]

# Allow autofix for all enabled rules (when `--fix` is provided).
fixable = [
"A",
"B",
"C",
"D",
"E",
"F",
"G",
"I",
"N",
"Q",
"S",
"T",
"W",
"ANN",
"ARG",
"BLE",
"COM",
"DJ",
"DTZ",
"EM",
"ERA",
"EXE",
"FBT",
"ICN",
"INP",
"ISC",
"NPY",
"PD",
"PGH",
"PIE",
"PL",
"PT",
"PTH",
"PYI",
"RET",
"RSE",
"RUF",
"SIM",
"SLF",
"TCH",
"TID",
"TRY",
"UP",
"YTT",
]
unfixable = []

# Exclude a variety of commonly ignored directories.
exclude = [
".bzr",
Expand Down Expand Up @@ -171,23 +113,48 @@ exclude = [
# Same as Black.
line-length = 88

# Target Python 3.11
target-version = "py311"

[tool.ruff.format]
docstring-code-format = true
quote-style = "preserve"

[tool.ruff.lint]
# Allow unused variables when underscore-prefixed.
dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"

# Target Python 3.11
target-version = "py311"
# Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
select = ["E", "F"]
ignore = ["E402", "E501"]

[tool.ruff.mccabe]
# Allow autofix for all enabled rules (when `--fix` is provided).
fixable = ["ALL"]
unfixable = []

# Allow these characters in strings
allowed-confusables = ["", ""]

[tool.ruff.lint.mccabe]
# Unlike Flake8, default to a complexity level of 10.
max-complexity = 10

[tool.ruff.isort]
[tool.ruff.lint.extend-per-file-ignores]
"__init__.py" = ["E402"] # Allow non-top-level imports
"tests/**.py" = [
"S101", # Allow assert
"ANN001", # Args don't need types (usually fixtures)
"N802", # Fixture returning a class may be named per class name convention
"N803", # Args don't require naming convention (fixture could be a class)
]

[tool.ruff.lint.isort]
# These config options should match isort config above under [tool.isort]
combine-as-imports = true
extra-standard-library = ['typing_extensions']
split-on-trailing-comma = false
relative-imports-order = 'furthest-to-closest'
known-first-party = ['coaster']
known-first-party = ['coaster', 'baseframe', 'flask_lastuser']
section-order = [
'future',
'standard-library',
Expand All @@ -197,5 +164,12 @@ section-order = [
'local-folder',
]

[tool.ruff.isort.sections]
[tool.ruff.lint.isort.sections]
repo = ['hasjob']

[tool.ruff.lint.flake8-pytest-style]
fixture-parentheses = false
mark-parentheses = false

[tool.ruff.lint.pyupgrade]
keep-runtime-typing = true
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,14 @@ Flask-Migrate
Flask-Redis
Flask-RQ2
Flask-SQLAlchemy
Flask-Testing
git+https://github.com/maxcountryman/flask-uploads.git#egg=Flask-Uploads
Flask-WTF
geoip2
gunicorn
html2text
jsmin
langid
nltk
Pillow
premailer
progressbar2
Expand Down

0 comments on commit 43aa155

Please sign in to comment.