From 0b84f87b517888d152e5fb68df18bcec993be1be Mon Sep 17 00:00:00 2001 From: Kiran Jonnalagadda Date: Tue, 14 May 2024 00:37:25 +0530 Subject: [PATCH] Move over NLTK code from Coaster; revise Ruff config (#768) --- hasjob/__init__.py | 4 +- hasjob/forms/board.py | 2 +- hasjob/forms/domain.py | 2 +- hasjob/forms/filterset.py | 2 +- hasjob/forms/jobpost.py | 5 +- hasjob/models/board.py | 3 +- hasjob/models/campaign.py | 3 +- hasjob/models/jobpost.py | 3 +- hasjob/models/user.py | 2 +- hasjob/tagging.py | 32 +++++++++++- hasjob/views/campaign.py | 3 +- hasjob/views/helper.py | 5 +- hasjob/views/index.py | 2 +- hasjob/views/listing.py | 2 +- hasjob/views/login.py | 2 +- pyproject.toml | 104 ++++++++++++++------------------------ requirements.txt | 2 +- 17 files changed, 94 insertions(+), 84 deletions(-) diff --git a/hasjob/__init__.py b/hasjob/__init__.py index abf1d872..c0434221 100644 --- a/hasjob/__init__.py +++ b/hasjob/__init__.py @@ -3,8 +3,6 @@ import geoip2.database from flask import Flask from flask_assets import Bundle -from flask_lastuser import Lastuser -from flask_lastuser.sqlalchemy import UserManager from flask_mail import Mail from flask_migrate import Migrate from flask_redis import FlaskRedis @@ -13,6 +11,8 @@ import coaster.app from baseframe import Version, assets, baseframe from coaster.assets import WebpackManifest +from flask_lastuser import Lastuser +from flask_lastuser.sqlalchemy import UserManager from ._version import __version__ from .uploads import configure as uploads_configure diff --git a/hasjob/forms/board.py b/hasjob/forms/board.py index 05c954f1..6552007e 100644 --- a/hasjob/forms/board.py +++ b/hasjob/forms/board.py @@ -1,5 +1,5 @@ import tldextract -from flask import Markup +from markupsafe import Markup import baseframe.forms as forms from baseframe import _, __ diff --git a/hasjob/forms/domain.py b/hasjob/forms/domain.py index 50fd31ed..4069546b 100644 --- a/hasjob/forms/domain.py +++ b/hasjob/forms/domain.py @@ -1,4 +1,4 @@ -# from flask import Markup +# from markupsafe import Markup import baseframe.forms as forms from baseframe import __ diff --git a/hasjob/forms/filterset.py b/hasjob/forms/filterset.py index 4792771d..c67fcff5 100644 --- a/hasjob/forms/filterset.py +++ b/hasjob/forms/filterset.py @@ -74,7 +74,7 @@ class FiltersetForm(forms.Form): __("Tags"), autocomplete_endpoint='/api/1/tag/autocomplete', results_key='tags' ) - def set_queries(self): + def __post_init__(self): if not self.edit_parent: self.edit_parent = g.board self.types.query = ( diff --git a/hasjob/forms/jobpost.py b/hasjob/forms/jobpost.py index c627136b..59b3ce53 100644 --- a/hasjob/forms/jobpost.py +++ b/hasjob/forms/jobpost.py @@ -2,13 +2,14 @@ from decimal import Decimal, InvalidOperation from difflib import SequenceMatcher -from flask import Markup, g, request -from flask_lastuser import LastuserResourceError +from flask import g, request +from markupsafe import Markup import baseframe.forms as forms from baseframe import _, __ from baseframe.utils import is_public_email_domain from coaster.utils import get_email_domain, getbool +from flask_lastuser import LastuserResourceError from .. import app, lastuser from ..models import CURRENCY, PAY_TYPE, Domain, JobApplication, JobType, User diff --git a/hasjob/models/board.py b/hasjob/models/board.py index 8a7e424c..1dfb3240 100644 --- a/hasjob/models/board.py +++ b/hasjob/models/board.py @@ -1,6 +1,7 @@ from __future__ import annotations -from flask import Markup, url_for +from flask import url_for +from markupsafe import Markup from pytz import timezone from sqlalchemy.ext.associationproxy import association_proxy from werkzeug.utils import cached_property diff --git a/hasjob/models/campaign.py b/hasjob/models/campaign.py index 94cebfdc..af722443 100644 --- a/hasjob/models/campaign.py +++ b/hasjob/models/campaign.py @@ -2,7 +2,8 @@ from datetime import timedelta -from flask import Markup, request +from flask import request +from markupsafe import Markup from sqlalchemy import event from sqlalchemy.ext.associationproxy import association_proxy from sqlalchemy.ext.orderinglist import ordering_list diff --git a/hasjob/models/jobpost.py b/hasjob/models/jobpost.py index 7de32245..145421fe 100644 --- a/hasjob/models/jobpost.py +++ b/hasjob/models/jobpost.py @@ -3,8 +3,9 @@ from datetime import timedelta import tldextract -from flask import Markup, escape, url_for +from flask import url_for from flask_babel import format_datetime +from markupsafe import Markup, escape from sqlalchemy import event from sqlalchemy.dialects.postgresql import TSVECTOR from sqlalchemy.ext.associationproxy import association_proxy diff --git a/hasjob/models/user.py b/hasjob/models/user.py index cbb98645..1411cb71 100644 --- a/hasjob/models/user.py +++ b/hasjob/models/user.py @@ -4,11 +4,11 @@ from uuid import uuid4 from flask import request -from flask_lastuser.sqlalchemy import UserBase2 from baseframe import _, cache from coaster.sqlalchemy import JsonDict, UuidMixin from coaster.utils import utcnow +from flask_lastuser.sqlalchemy import UserBase2 from . import BaseMixin, Model, backref, db, relationship, sa diff --git a/hasjob/tagging.py b/hasjob/tagging.py index c2a5b156..ed174916 100644 --- a/hasjob/tagging.py +++ b/hasjob/tagging.py @@ -1,9 +1,10 @@ from collections import defaultdict +from collections.abc import Iterable from urllib.parse import urljoin +import nltk import requests -from coaster.nlp import extract_named_entities from coaster.utils import text_blocks from . import app, rq @@ -25,6 +26,35 @@ @rq.job('hasjob') +def extract_named_entities(text_blocks: Iterable[str]) -> set[str]: + """Return a set of named entities extracted from the provided text blocks.""" + sentences = [] + for text in text_blocks: + sentences.extend(nltk.sent_tokenize(text)) + + tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences] + tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences] + chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True) + + def extract_entity_names(tree: nltk.Tree) -> list[str]: + entity_names = [] + + if hasattr(tree, "label"): + if tree.label() == "NE": + entity_names.append(" ".join(child[0] for child in tree)) + else: + for child in tree: + entity_names.extend(extract_entity_names(child)) + + return entity_names + + entity_names = [] + for tree in chunked_sentences: + entity_names.extend(extract_entity_names(tree)) + + return set(entity_names) + + def tag_locations(jobpost_id): with app.test_request_context(): post = JobPost.query.get(jobpost_id) diff --git a/hasjob/views/campaign.py b/hasjob/views/campaign.py index 151f264a..69b2e12e 100644 --- a/hasjob/views/campaign.py +++ b/hasjob/views/campaign.py @@ -4,7 +4,8 @@ from functools import wraps from io import StringIO -from flask import Markup, abort, flash, g, redirect, render_template, request, url_for +from flask import abort, flash, g, redirect, render_template, request, url_for +from markupsafe import Markup from pytz import UTC from baseframe import __ diff --git a/hasjob/views/helper.py b/hasjob/views/helper.py index 7a9dfaa5..4d4fc7b8 100644 --- a/hasjob/views/helper.py +++ b/hasjob/views/helper.py @@ -7,9 +7,9 @@ from uuid import uuid4 import bleach -from flask import Markup, copy_current_request_context, g, request, session -from flask_lastuser import signal_user_looked_up +from flask import copy_current_request_context, g, request, session from geoip2.errors import AddressNotFoundError +from markupsafe import Markup from pytz import UTC from sqlalchemy.exc import IntegrityError @@ -17,6 +17,7 @@ from baseframe.signals import form_validation_error, form_validation_success from coaster.sqlalchemy import failsafe_add from coaster.utils import utcnow +from flask_lastuser import signal_user_looked_up from .. import app, lastuser, redis_store, rq from ..extapi import location_geodata diff --git a/hasjob/views/index.py b/hasjob/views/index.py index e6419f8b..57cd3f61 100644 --- a/hasjob/views/index.py +++ b/hasjob/views/index.py @@ -3,7 +3,6 @@ from uuid import uuid4 from flask import ( - Markup, Response, abort, flash, @@ -14,6 +13,7 @@ request, url_for, ) +from markupsafe import Markup from baseframe import _, request_is_xhr from coaster.utils import ParseError, getbool, parse_isoformat, utcnow diff --git a/hasjob/views/listing.py b/hasjob/views/listing.py index 8168bfbf..847d9671 100644 --- a/hasjob/views/listing.py +++ b/hasjob/views/listing.py @@ -3,7 +3,6 @@ import bleach from flask import ( - Markup, abort, flash, g, @@ -16,6 +15,7 @@ ) from flask_mail import Message from html2text import html2text +from markupsafe import Markup from premailer import transform as email_transform from sqlalchemy.exc import IntegrityError from sqlalchemy.orm.exc import StaleDataError diff --git a/hasjob/views/login.py b/hasjob/views/login.py index 63a46a5f..526d83d6 100644 --- a/hasjob/views/login.py +++ b/hasjob/views/login.py @@ -1,8 +1,8 @@ from flask import Response, flash, g, redirect -from flask_lastuser import signal_user_session_refreshed from sqlalchemy.exc import IntegrityError from coaster.views import get_next_url +from flask_lastuser import signal_user_session_refreshed from .. import app, lastuser from ..models import UserActiveAt, db diff --git a/pyproject.toml b/pyproject.toml index 12c2a135..3b9a81e8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,7 +35,7 @@ use_parentheses = true combine_as_imports = true split_on_trailing_comma = false extra_standard_library = ['typing_extensions'] -known_first_party = ['baseframe', 'coaster'] +known_first_party = ['baseframe', 'coaster', 'flask_lastuser'] known_repo = ['hasjob'] default_section = 'THIRDPARTY' sections = [ @@ -86,64 +86,6 @@ exclude_dirs = ['node_modules', 'build/lib'] skips = ['*/*_test.py', '*/test_*.py'] [tool.ruff] -# This is a slight customisation of the default rules -# 1. Hasjob still supports Python 3.7 pending its EOL -# 2. Rule E402 (module-level import not top-level) is disabled as isort handles it -# 3. Rule E501 (line too long) is left to Black; some strings are worse for wrapping - -# Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default. -select = ["E", "F"] -ignore = ["E402", "E501"] - -# Allow autofix for all enabled rules (when `--fix`) is provided. -fixable = [ - "A", - "B", - "C", - "D", - "E", - "F", - "G", - "I", - "N", - "Q", - "S", - "T", - "W", - "ANN", - "ARG", - "BLE", - "COM", - "DJ", - "DTZ", - "EM", - "ERA", - "EXE", - "FBT", - "ICN", - "INP", - "ISC", - "NPY", - "PD", - "PGH", - "PIE", - "PL", - "PT", - "PTH", - "PYI", - "RET", - "RSE", - "RUF", - "SIM", - "SLF", - "TCH", - "TID", - "TRY", - "UP", - "YTT", -] -unfixable = [] - # Exclude a variety of commonly ignored directories. exclude = [ ".bzr", @@ -171,23 +113,48 @@ exclude = [ # Same as Black. line-length = 88 +# Target Python 3.11 +target-version = "py311" + +[tool.ruff.format] +docstring-code-format = true +quote-style = "preserve" + +[tool.ruff.lint] # Allow unused variables when underscore-prefixed. dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" -# Target Python 3.11 -target-version = "py311" +# Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default. +select = ["E", "F"] +ignore = ["E402", "E501"] -[tool.ruff.mccabe] +# Allow autofix for all enabled rules (when `--fix`) is provided. +fixable = ["ALL"] +unfixable = [] + +# Allow these characters in strings +allowed-confusables = ["‘", "’"] + +[tool.ruff.lint.mccabe] # Unlike Flake8, default to a complexity level of 10. max-complexity = 10 -[tool.ruff.isort] +[tool.ruff.lint.extend-per-file-ignores] +"__init__.py" = ["E402"] # Allow non-top-level imports +"tests/**.py" = [ + "S101", # Allow assert + "ANN001", # Args don't need types (usually fixtures) + "N802", # Fixture returning a class may be named per class name convention + "N803", # Args don't require naming convention (fixture could be a class) +] + +[tool.ruff.lint.isort] # These config options should match isort config above under [tool.isort] combine-as-imports = true extra-standard-library = ['typing_extensions'] split-on-trailing-comma = false relative-imports-order = 'furthest-to-closest' -known-first-party = ['coaster'] +known-first-party = ['coaster', 'baseframe', 'flask_lastuser'] section-order = [ 'future', 'standard-library', @@ -197,5 +164,12 @@ section-order = [ 'local-folder', ] -[tool.ruff.isort.sections] +[tool.ruff.lint.isort.sections] repo = ['hasjob'] + +[tool.ruff.lint.flake8-pytest-style] +fixture-parentheses = false +mark-parentheses = false + +[tool.ruff.lint.pyupgrade] +keep-runtime-typing = true diff --git a/requirements.txt b/requirements.txt index 8eee7d60..1bf9cc8b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,7 +11,6 @@ Flask-Migrate Flask-Redis Flask-RQ2 Flask-SQLAlchemy -Flask-Testing git+https://github.com/maxcountryman/flask-uploads.git#egg=Flask-Uploads Flask-WTF geoip2 @@ -19,6 +18,7 @@ gunicorn html2text jsmin langid +nltk Pillow premailer progressbar2