diff --git a/.github/workflows/build_deploy_doc.yml b/.github/workflows/build_deploy_doc.yml index abb7acc..45abce2 100644 --- a/.github/workflows/build_deploy_doc.yml +++ b/.github/workflows/build_deploy_doc.yml @@ -29,7 +29,7 @@ jobs: # https://github.com/actions/setup-python - uses: actions/setup-python@v4 with: - python-version: "3.8" + python-version: "3.9" - name: Install run: | diff --git a/.github/workflows/mypy_check.yml b/.github/workflows/mypy_check.yml index e7ae6b5..9dd71c5 100644 --- a/.github/workflows/mypy_check.yml +++ b/.github/workflows/mypy_check.yml @@ -28,7 +28,7 @@ jobs: # https://github.com/actions/setup-python - uses: actions/setup-python@v4 with: - python-version: "3.8" + python-version: "3.9" - name: Install run: | diff --git a/.github/workflows/pypi_upload.yml b/.github/workflows/pypi_upload.yml index 317aaab..c17b1fb 100644 --- a/.github/workflows/pypi_upload.yml +++ b/.github/workflows/pypi_upload.yml @@ -15,7 +15,7 @@ jobs: # https://github.com/actions/setup-python - uses: actions/setup-python@v4 with: - python-version: "3.8" + python-version: "3.9" - name: Install run: | diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index ff8a835..ebe8c51 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -18,7 +18,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.8", "3.9", "3.10", "3.11"] + python-version: ["3.9", "3.10", "3.11", "3.12"] steps: # https://github.com/actions/checkout diff --git a/.github/workflows/ruff_check.yml b/.github/workflows/ruff_check.yml index 21969f2..ff4fc7d 100644 --- a/.github/workflows/ruff_check.yml +++ b/.github/workflows/ruff_check.yml @@ -28,7 +28,7 @@ jobs: # https://github.com/actions/setup-python - uses: actions/setup-python@v4 with: - python-version: "3.8" + python-version: "3.9" - name: Install run: | diff --git a/.github/workflows/static_checks.yml b/.github/workflows/static_checks.yml index 54cb1e8..9b383b2 100644 --- a/.github/workflows/static_checks.yml +++ b/.github/workflows/static_checks.yml @@ -28,7 +28,7 @@ jobs: # https://github.com/actions/setup-python - uses: actions/setup-python@v4 with: - python-version: "3.8" + python-version: "3.9" - name: Install run: | diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 5e51b4b..6828b19 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -177,16 +177,16 @@ First [install pyenv](https://github.com/pyenv/pyenv#installation) if you do not Next install the appropriate Python version. We recommend the development on the oldest still permitted Python version of the project. This version number can be found in the `pyproject.toml` file in the setting called -`tool.poetry.dependencies.python`. If this is set like `python = "^3.8"` -we use pyenv to install Python 3.8: -`pyenv install 3.8` -This installs the latest 3.8 Python version. +`tool.poetry.dependencies.python`. If this is set like `python = "^3.9"` +we use pyenv to install Python 3.9: +`pyenv install 3.9` +This installs the latest 3.9 Python version. If the Python installation was successful we use `pyenv versions` to see which exact Version is installed. Then we activate this version with `pyenv local `. This command will create a `.python-version` file in the project directory. Make sure that you are still in the project directory. -For example execute: `pyenv local 3.8.17` +For example execute: `pyenv local 3.9` ### 5. 
Install the Project with Poetry diff --git a/docs/source/conf.py b/docs/source/conf.py index 6561499..d5e3248 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -21,7 +21,7 @@ # import os # import sys # sys.path.insert(0, os.path.abspath('.')) -from typing import List + # -- Project information ----------------------------------------------------- @@ -54,7 +54,7 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. -exclude_patterns: List[str] = [] +exclude_patterns: list[str] = [] # -- Options for HTML output ------------------------------------------------- diff --git a/mltb2/arangodb.py b/mltb2/arangodb.py index 19e4d70..a983aee 100644 --- a/mltb2/arangodb.py +++ b/mltb2/arangodb.py @@ -12,9 +12,10 @@ import gzip from argparse import ArgumentParser +from collections.abc import Sequence from contextlib import closing from dataclasses import dataclass -from typing import Any, Dict, Optional, Sequence, Union +from typing import Any, Optional, Union import jsonlines from arango import ArangoClient @@ -26,7 +27,7 @@ from mltb2.db import AbstractBatchDataManager -def _check_config_keys(config: Dict[str, Optional[str]], expected_config_keys: Sequence[str]) -> None: +def _check_config_keys(config: dict[str, Optional[str]], expected_config_keys: Sequence[str]) -> None: """Check if all expected keys are in config. This is useful to check if a config file contains all necessary keys. @@ -211,9 +212,10 @@ def arango_collection_backup() -> None: output_file_name = f"./{args.col}_backup.jsonl.gz" print(f"Writing backup to '{output_file_name}'...") - with closing(ArangoClient(hosts=arango_config["hosts"])) as arango_client, gzip.open( # type: ignore[arg-type] - output_file_name, "w" - ) as gzip_out: + with ( + closing(ArangoClient(hosts=arango_config["hosts"])) as arango_client, # type: ignore[arg-type] + gzip.open(output_file_name, "w") as gzip_out, + ): connection = arango_client.db( arango_config["db_name"], # type: ignore[arg-type] arango_config["username"], # type: ignore[arg-type] @@ -288,7 +290,7 @@ def from_config_file(cls, config_file_name): ) def import_dicts( - self, dicts: Sequence[Dict[str, Any]], collection_name: str, create_collection: bool = False + self, dicts: Sequence[dict[str, Any]], collection_name: str, create_collection: bool = False ) -> None: """Import data to ArangoDB. diff --git a/mltb2/bs.py b/mltb2/bs.py index 5170deb..6b209bf 100644 --- a/mltb2/bs.py +++ b/mltb2/bs.py @@ -9,7 +9,7 @@ ``pip install mltb2[bs]`` """ -from typing import Any, Dict, Optional +from typing import Any, Optional import mdformat from bs4 import BeautifulSoup @@ -35,7 +35,7 @@ def extract_text(soup: BeautifulSoup, join_str: Optional[str] = None) -> str: return result -def extract_one(soup: BeautifulSoup, name=None, attrs: Optional[dict] = None, **kwargs: Dict[str, Any]) -> Any: +def extract_one(soup: BeautifulSoup, name=None, attrs: Optional[dict] = None, **kwargs: dict[str, Any]) -> Any: """Extract exactly one specified element from a BeautifulSoup object. This function expects that exactly one result is found.
@@ -60,7 +60,7 @@ def extract_one(soup: BeautifulSoup, name=None, attrs: Optional[dict] = None, ** return result -def extract_all(soup: BeautifulSoup, name=None, attrs: Optional[dict] = None, **kwargs: Dict[str, Any]) -> Any: +def extract_all(soup: BeautifulSoup, name=None, attrs: Optional[dict] = None, **kwargs: dict[str, Any]) -> Any: """Extract all specified elements from a BeautifulSoup object. Args: @@ -77,7 +77,7 @@ def extract_all(soup: BeautifulSoup, name=None, attrs: Optional[dict] = None, ** return result -def remove_all(soup: BeautifulSoup, name=None, attrs: Optional[dict] = None, **kwargs: Dict[str, Any]) -> None: +def remove_all(soup: BeautifulSoup, name=None, attrs: Optional[dict] = None, **kwargs: dict[str, Any]) -> None: """Remove all specified elements from a BeautifulSoup object. The removal is done in place. Nothing is returned. diff --git a/mltb2/data.py b/mltb2/data.py index ae0996b..9914ad5 100644 --- a/mltb2/data.py +++ b/mltb2/data.py @@ -24,7 +24,7 @@ import os from hashlib import sha256 from io import StringIO -from typing import Optional, Tuple +from typing import Optional import joblib import numpy as np @@ -101,7 +101,7 @@ def _load_colon_label() -> pd.Series: return label_series -def load_colon(mltb2_base_data_dir: Optional[str] = None) -> Tuple[pd.Series, pd.DataFrame]: +def load_colon(mltb2_base_data_dir: Optional[str] = None) -> tuple[pd.Series, pd.DataFrame]: """Load colon data. The data is loaded and parsed from the internet. @@ -128,7 +128,7 @@ def load_colon(mltb2_base_data_dir: Optional[str] = None) -> Tuple[pd.Series, pd return result -def load_prostate(mltb2_base_data_dir: Optional[str] = None) -> Tuple[pd.Series, pd.DataFrame]: +def load_prostate(mltb2_base_data_dir: Optional[str] = None) -> tuple[pd.Series, pd.DataFrame]: """Load prostate data. The data is loaded and parsed from ``_. @@ -177,7 +177,7 @@ def load_prostate(mltb2_base_data_dir: Optional[str] = None) -> Tuple[pd.Series, return result -def load_leukemia_big(mltb2_base_data_dir: Optional[str] = None) -> Tuple[pd.Series, pd.DataFrame]: +def load_leukemia_big(mltb2_base_data_dir: Optional[str] = None) -> tuple[pd.Series, pd.DataFrame]: """Load leukemia (big) data. The data is loaded and parsed from the internet. diff --git a/mltb2/db.py b/mltb2/db.py index 9e2b83c..9bb15c0 100644 --- a/mltb2/db.py +++ b/mltb2/db.py @@ -5,8 +5,9 @@ """Database utils module.""" from abc import ABC, abstractmethod +from collections.abc import Sequence from dataclasses import dataclass -from typing import Callable, Sequence +from typing import Callable class AbstractBatchDataManager(ABC): diff --git a/mltb2/fasttext.py b/mltb2/fasttext.py index ee3c65b..71922e8 100644 --- a/mltb2/fasttext.py +++ b/mltb2/fasttext.py @@ -11,7 +11,7 @@ import os from dataclasses import dataclass, field -from typing import List, Optional +from typing import Optional import fasttext from fasttext.FastText import _FastText @@ -52,7 +52,7 @@ def get_model_path_and_download() -> str: return model_full_path - def __call__(self, text: str, num_lang: int = 10, always_detect_lang: Optional[List[str]] = None): + def __call__(self, text: str, num_lang: int = 10, always_detect_lang: Optional[list[str]] = None): """Identify languages of a given text. 
Args: diff --git a/mltb2/files.py b/mltb2/files.py index b043257..11e20cb 100644 --- a/mltb2/files.py +++ b/mltb2/files.py @@ -16,9 +16,10 @@ import contextlib import os import random +from collections.abc import Sequence from dataclasses import dataclass, field from pathlib import Path -from typing import Any, Dict, List, Optional, Sequence, Set +from typing import Any, Optional from uuid import uuid4 import joblib @@ -83,12 +84,12 @@ class FileBasedRestartableBatchDataProcessor: result_dir: The directory where the results are stored. """ - data: List[Dict[str, Any]] + data: list[dict[str, Any]] batch_size: int uuid_name: str result_dir: str _result_dir_path: Path = field(init=False, repr=False) - _own_lock_uuids: Set[str] = field(init=False, repr=False, default_factory=set) + _own_lock_uuids: set[str] = field(init=False, repr=False, default_factory=set) def __post_init__(self) -> None: """Do post init.""" @@ -99,7 +100,7 @@ def __post_init__(self) -> None: if not len(self.data) > 0: raise ValueError("data must not be empty!") - uuids: Set[str] = set() + uuids: set[str] = set() # check uuid_name for idx, d in enumerate(self.data): @@ -134,8 +135,8 @@ def _get_uuid_from_filename(filename: str) -> Optional[str]: uuid = filename[: filename.rindex("_")] return uuid - def _get_locked_or_done_uuids(self) -> Set[str]: - locked_or_done_uuids: Set[str] = set() + def _get_locked_or_done_uuids(self) -> set[str]: + locked_or_done_uuids: set[str] = set() for child_path in self._result_dir_path.iterdir(): if child_path.is_file(): filename = child_path.name @@ -144,20 +145,20 @@ def _get_locked_or_done_uuids(self) -> Set[str]: locked_or_done_uuids.add(uuid) return locked_or_done_uuids - def _write_lock_files(self, batch: Sequence[Dict[str, Any]]) -> None: + def _write_lock_files(self, batch: Sequence[dict[str, Any]]) -> None: for d in batch: uuid = d[self.uuid_name] (self._result_dir_path / f"{uuid}.lock").touch() self._own_lock_uuids.add(uuid) - def _get_remaining_data(self) -> List[Dict[str, Any]]: - locked_or_done_uuids: Set[str] = self._get_locked_or_done_uuids() + def _get_remaining_data(self) -> list[dict[str, Any]]: + locked_or_done_uuids: set[str] = self._get_locked_or_done_uuids() remaining_data = [d for d in self.data if d[self.uuid_name] not in locked_or_done_uuids] return remaining_data - def read_batch(self) -> Sequence[Dict[str, Any]]: + def read_batch(self) -> Sequence[dict[str, Any]]: """Read the next batch of data.""" - remaining_data: List[Dict[str, Any]] = self._get_remaining_data() + remaining_data: list[dict[str, Any]] = self._get_remaining_data() # if we think we are done, delete all lock files and check again # this is because lock files might be orphaned @@ -172,7 +173,7 @@ def read_batch(self) -> Sequence[Dict[str, Any]]: self._write_lock_files(next_batch) return next_batch - def _save_batch_data(self, batch: Sequence[Dict[str, Any]]) -> None: + def _save_batch_data(self, batch: Sequence[dict[str, Any]]) -> None: for d in batch: uuid = d[self.uuid_name] if uuid not in self._own_lock_uuids: @@ -180,19 +181,19 @@ def _save_batch_data(self, batch: Sequence[Dict[str, Any]]) -> None: filename = self._result_dir_path / f"{uuid}_{str(uuid4())}.pkl.gz" # noqa: RUF010 joblib.dump(d, filename, compress=("gzip", 3)) - def _remove_lock_files(self, batch: Sequence[Dict[str, Any]]) -> None: + def _remove_lock_files(self, batch: Sequence[dict[str, Any]]) -> None: for d in batch: uuid = d[self.uuid_name] (self._result_dir_path / f"{uuid}.lock").unlink(missing_ok=True) 
self._own_lock_uuids.discard(uuid) - def save_batch(self, batch: Sequence[Dict[str, Any]]) -> None: + def save_batch(self, batch: Sequence[dict[str, Any]]) -> None: """Save the batch of data.""" self._save_batch_data(batch) self._remove_lock_files(batch) @staticmethod - def load_data(result_dir: str, ignore_load_error: bool = False) -> List[Dict[str, Any]]: + def load_data(result_dir: str, ignore_load_error: bool = False) -> list[dict[str, Any]]: """Load all data. After all data is processed, this method can be used to load all data. diff --git a/mltb2/md.py b/mltb2/md.py index a8397ed..a6b97f0 100644 --- a/mltb2/md.py +++ b/mltb2/md.py @@ -12,7 +12,7 @@ import re from dataclasses import dataclass -from typing import Final, List +from typing import Final from tqdm import tqdm @@ -21,7 +21,7 @@ _HEADLINE_REGEX: Final = re.compile(r"^#+ .*", flags=re.MULTILINE) -def _chunk_md_by_headline(md_text: str) -> List[str]: +def _chunk_md_by_headline(md_text: str) -> list[str]: """Chunk Markdown by headlines. Args: @@ -30,7 +30,7 @@ def _chunk_md_by_headline(md_text: str) -> List[str]: Returns: The list of Markdown chunks. """ - positions: List[int] = [m.start() for m in re.finditer(_HEADLINE_REGEX, md_text)] + positions: list[int] = [m.start() for m in re.finditer(_HEADLINE_REGEX, md_text)] # extend positions if 0 not in positions: @@ -41,7 +41,7 @@ def _chunk_md_by_headline(md_text: str) -> List[str]: return result -def chunk_md(md_text: str) -> List[str]: +def chunk_md(md_text: str) -> list[str]: """Chunk Markdown by headlines and merge isolated headlines. Merges isolated headlines with their corresponding subsequent paragraphs. @@ -85,7 +85,7 @@ class MdTextSplitter: transformers_token_counter: TransformersTokenCounter show_progress_bar: bool = False - def __call__(self, md_text: str) -> List[str]: + def __call__(self, md_text: str) -> list[str]: """Split the Markdown text into sections. Args: @@ -98,8 +98,8 @@ def __call__(self, md_text: str) -> List[str]: assert len(md_chunks) == len(counts) # type: ignore[arg-type] - result_merges: List[str] = [] - temp_merges: List[str] = [] + result_merges: list[str] = [] + temp_merges: list[str] = [] current_count: int = 0 for md_chunk, count in zip( diff --git a/mltb2/openai.py b/mltb2/openai.py index 6453daa..f950bad 100644 --- a/mltb2/openai.py +++ b/mltb2/openai.py @@ -13,8 +13,9 @@ import os +from collections.abc import Iterable from dataclasses import dataclass, field -from typing import Any, Dict, Iterable, List, Optional, Union +from typing import Any, Optional, Union import tiktoken import yaml @@ -48,7 +49,7 @@ def __post_init__(self) -> None: """Do post init.""" self.encoding = tiktoken.encoding_for_model(self.model_name) - def __call__(self, text: Union[str, Iterable]) -> Union[int, List[int]]: + def __call__(self, text: Union[str, Iterable]) -> Union[int, list[int]]: """Count tokens for text. Args: @@ -106,13 +107,13 @@ class OpenAiChatResult: completion_tokens: Optional[int] = None total_tokens: Optional[int] = None finish_reason: Optional[str] = None - completion_args: Optional[Dict[str, Any]] = None + completion_args: Optional[dict[str, Any]] = None @classmethod def from_chat_completion( cls, chat_completion: ChatCompletion, - completion_kwargs: Optional[Dict[str, Any]] = None, + completion_kwargs: Optional[dict[str, Any]] = None, ): """Construct this class from an OpenAI ``ChatCompletion`` object. 
@@ -141,7 +142,7 @@ def from_chat_completion( return cls(**result) # type: ignore[arg-type] -def remove_openai_tokens(messages: List[Dict[str, str]]) -> List[Dict[str, str]]: +def remove_openai_tokens(messages: list[dict[str, str]]) -> list[dict[str, str]]: """Remove OpenAI special tokens from the messages. These tokens are ``<|im_start|>`` and ``<|im_end|>`` and they can cause problems when passed to the OpenAI API. @@ -210,8 +211,8 @@ def from_yaml(cls, yaml_file, api_key: Optional[str] = None, **kwargs): def create_completions( self, - prompt: Union[str, List[Dict[str, str]]], - completion_kwargs: Optional[Dict[str, Any]] = None, + prompt: Union[str, list[dict[str, str]]], + completion_kwargs: Optional[dict[str, Any]] = None, clean_openai_tokens: bool = False, ) -> OpenAiChatResult: """Create a model response for the given prompt (chat conversation). @@ -269,8 +270,8 @@ def create_completions( async def create_completions_async( self, - prompt: Union[str, List[Dict[str, str]]], - completion_kwargs: Optional[Dict[str, Any]] = None, + prompt: Union[str, list[dict[str, str]]], + completion_kwargs: Optional[dict[str, Any]] = None, clean_openai_tokens: bool = False, ) -> OpenAiChatResult: """Create a model response for the given prompt (chat conversation). diff --git a/mltb2/plot.py b/mltb2/plot.py index 6478874..c63b900 100644 --- a/mltb2/plot.py +++ b/mltb2/plot.py @@ -115,9 +115,9 @@ def boxplot( if ylabel is not None: ax.set(ylabel=ylabel) - ax.boxplot(values, labels=labels, vert=vert) + ax.boxplot(values, labels=labels, vert=vert) # type: ignore[call-arg] grid_axis = "y" if vert else "x" - plt.grid(b=True, axis=grid_axis, linestyle="--") + plt.grid(b=True, axis=grid_axis, linestyle="--") # type: ignore[arg-type] plt.xticks(rotation=90) diff --git a/mltb2/somajo.py b/mltb2/somajo.py index 7d17677..a0eb87e 100644 --- a/mltb2/somajo.py +++ b/mltb2/somajo.py @@ -11,8 +11,9 @@ from abc import ABC +from collections.abc import Iterable from dataclasses import dataclass, field -from typing import Dict, Iterable, List, Literal, Optional, Set, Tuple, Union +from typing import Literal, Optional, Union from somajo import SoMaJo from tqdm import tqdm @@ -62,7 +63,7 @@ def detokenize(tokens) -> str: return result -def extract_token_class_set(sentences: Iterable, keep_token_classes: Optional[str] = None) -> Set[str]: +def extract_token_class_set(sentences: Iterable, keep_token_classes: Optional[str] = None) -> set[str]: """Extract token from sentences by token class. Args: @@ -91,7 +92,7 @@ class SoMaJoSentenceSplitter(SoMaJoBaseClass): show_progress_bar: bool = False - def __call__(self, text: str) -> List[str]: + def __call__(self, text: str) -> list[str]: """Split the text into a list of sentences. Args: @@ -118,7 +119,7 @@ class JaccardSimilarity(SoMaJoBaseClass): language: The language. ``de_CMC`` for German or ``en_PTB`` for English. """ - def get_token_set(self, text: str) -> Set[str]: + def get_token_set(self, text: str) -> set[str]: """Get token set for text. Args: @@ -157,7 +158,7 @@ class TokenExtractor(SoMaJoBaseClass): language: The language. ``de_CMC`` for German or ``en_PTB`` for English. """ - def extract_url_set(self, text: Union[Iterable, str]) -> Set[str]: + def extract_url_set(self, text: Union[Iterable, str]) -> set[str]: """Extract URLs from text. 
An example: @@ -187,7 +188,7 @@ def extract_url_set(self, text: Union[Iterable, str]) -> Set[str]: result = extract_token_class_set(sentences, keep_token_classes="URL") return result - def extract_token_set(self, text: Union[Iterable, str], keep_token_classes: Optional[str] = None) -> Set[str]: + def extract_token_set(self, text: Union[Iterable, str], keep_token_classes: Optional[str] = None) -> set[str]: """Extract tokens from text. Args: @@ -214,7 +215,7 @@ class UrlSwapper: token_extractor: TokenExtractor url_pattern: str = "https://link-{}.com" - _url_map: Dict[str, str] = field(init=False, repr=False) # map from real url to swapped url + _url_map: dict[str, str] = field(init=False, repr=False) # map from real url to swapped url def __post_init__(self): """Do post init.""" @@ -229,7 +230,7 @@ def swap_urls(self, text: str) -> str: text = text.replace(url, self._url_map[url]) # replace return text - def reverse_swap_urls(self, text: str) -> Tuple[str, Set[str]]: + def reverse_swap_urls(self, text: str) -> tuple[str, set[str]]: """Revert the url swap. Returns: diff --git a/mltb2/somajo_transformers.py b/mltb2/somajo_transformers.py index 18f835a..b407bf6 100644 --- a/mltb2/somajo_transformers.py +++ b/mltb2/somajo_transformers.py @@ -16,7 +16,6 @@ from dataclasses import dataclass -from typing import List from tqdm import tqdm @@ -46,7 +45,7 @@ class TextSplitter: show_progress_bar: bool = False ignore_overly_long_sentences: bool = False - def __call__(self, text: str) -> List[str]: + def __call__(self, text: str) -> list[str]: """Split the text into sections. Args: @@ -59,8 +58,8 @@ def __call__(self, text: str) -> List[str]: assert len(sentences) == len(counts) # type: ignore[arg-type] - result_splits: List[str] = [] - current_sentences: List[str] = [] + result_splits: list[str] = [] + current_sentences: list[str] = [] current_count: int = 0 for sentence, count in zip(tqdm(sentences, disable=not self.show_progress_bar), counts): # type: ignore[arg-type] if count > self.max_token: diff --git a/mltb2/text.py b/mltb2/text.py index ac018f4..ebdb831 100644 --- a/mltb2/text.py +++ b/mltb2/text.py @@ -14,13 +14,14 @@ import re from collections import Counter, defaultdict +from collections.abc import Iterable from dataclasses import dataclass, field -from typing import Dict, Final, Iterable, Optional, Pattern, Set, Tuple, Union +from typing import Final, Optional, Union from scipy.spatial.distance import cityblock from tqdm import tqdm -INVISIBLE_CHARACTERS: Final[Tuple[str, ...]] = ( +INVISIBLE_CHARACTERS: Final[tuple[str, ...]] = ( "\u200b", # Zero Width Space (ZWSP) https://www.compart.com/en/unicode/U+200b "\u00ad", # Soft Hyphen (SHY) https://www.compart.com/en/unicode/U+00ad # TODO: what about: @@ -28,9 +29,9 @@ # https://www.compart.com/en/unicode/U+2029 ) -INVISIBLE_CHARACTERS_TRANS: Final[Dict[int, None]] = str.maketrans({char: None for char in INVISIBLE_CHARACTERS}) +INVISIBLE_CHARACTERS_TRANS: Final[dict[int, None]] = str.maketrans({char: None for char in INVISIBLE_CHARACTERS}) -SPECIAL_WHITESPACES: Final[Tuple[str, ...]] = ( +SPECIAL_WHITESPACES: Final[tuple[str, ...]] = ( # unicode block "General Punctuation": https://www.compart.com/en/unicode/block/U+2000 "\u2000", # En Quad "\u2001", # Em Quad @@ -48,13 +49,13 @@ "\u00a0", # No-Break Space (NBSP) https://www.compart.com/en/unicode/U+00a0 ) -SPECIAL_WHITESPACES_TRANS: Final[Dict[int, str]] = str.maketrans({char: " " for char in SPECIAL_WHITESPACES}) +SPECIAL_WHITESPACES_TRANS: Final[dict[int, str]] = str.maketrans({char: " " for 
char in SPECIAL_WHITESPACES}) INVISIBLE_CHARACTERS_AND_SPECIAL_WHITESPACES_TRANS = {**SPECIAL_WHITESPACES_TRANS, **INVISIBLE_CHARACTERS_TRANS} -MULTI_SPACE_PATTERN: Pattern = re.compile(r" {2,}") +MULTI_SPACE_PATTERN: re.Pattern = re.compile(r" {2,}") -XML_TAG_PATTERN: Pattern = re.compile(r"<\/?[\w:]+( \/|\/|)>") +XML_TAG_PATTERN: re.Pattern = re.compile(r"<\/?[\w:]+( \/|\/|)>") def has_xml_tag(text: str) -> bool: @@ -241,7 +242,7 @@ class TextDistance: _normalized_char_counts: Optional[defaultdict] = field(default=None, init=False) # set of all counted characters - see _normalize_char_counter - _counted_char_set: Optional[Set[str]] = field(default=None, init=False) + _counted_char_set: Optional[set[str]] = field(default=None, init=False) # flag if fit was called _fit_called: bool = field(default=False, init=False) diff --git a/mltb2/transformers.py b/mltb2/transformers.py index 882e37f..4aa23b1 100644 --- a/mltb2/transformers.py +++ b/mltb2/transformers.py @@ -10,8 +10,9 @@ """ import os +from collections.abc import Iterable from dataclasses import dataclass, field -from typing import Iterable, List, Union +from typing import Union import sklearn import torch @@ -39,7 +40,7 @@ def __post_init__(self): """Do post init.""" self.tokenizer = AutoTokenizer.from_pretrained(self.pretrained_model_name_or_path) - def __call__(self, text: Union[str, Iterable]) -> Union[int, List[int]]: + def __call__(self, text: Union[str, Iterable]) -> Union[int, list[int]]: """Count tokens for text. Args: diff --git a/pyproject.toml b/pyproject.toml index 297e06e..2db15c2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "mltb2" -version = "1.0.1rc4" +version = "1.0.1rc5" description = "Machine Learning Toolbox 2" authors = ["PhilipMay "] readme = "README.md" @@ -33,10 +33,11 @@ classifiers = [ "License :: OSI Approved :: MIT License", "Natural Language :: German", "Natural Language :: English", - "Programming Language :: Python :: 3.8", + # "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3 :: Only", "Topic :: Scientific/Engineering :: Mathematics", "Topic :: Scientific/Engineering :: Artificial Intelligence", @@ -55,8 +56,8 @@ arango-col-backup = 'mltb2.arangodb:arango_collection_backup' "Bug Tracker" = "https://github.com/telekom/mltb2/issues" [tool.poetry.dependencies] -python = "^3.8" -numpy = "*" +python = "^3.9" +numpy = "^1" # restriction by fasttext scipy = "*" tqdm = "*" platformdirs = {version = "*", optional = true} @@ -122,11 +123,11 @@ addopts = "--random-order-bucket=global" [tool.black] line-length = 119 -target-version = ["py38", "py39", "py310", "py311"] +target-version = ["py39", "py310", "py311", "py312"] [tool.ruff] line-length = 119 -target-version = "py38" +target-version = "py39" [tool.ruff.lint] diff --git a/tests/ori_data_loader.py b/tests/ori_data_loader.py index ce1684b..8caaaa0 100644 --- a/tests/ori_data_loader.py +++ b/tests/ori_data_loader.py @@ -9,15 +9,13 @@ """Data loader module.""" -from typing import Tuple - import numpy as np import pandas as pd import requests from bs4 import BeautifulSoup -def load_colon_data() -> Tuple[pd.Series, pd.DataFrame]: +def load_colon_data() -> tuple[pd.Series, pd.DataFrame]: """Load colon data. The data is loaded and parsed from the internet. 
@@ -72,7 +70,7 @@ def load_colon_data() -> Tuple[pd.Series, pd.DataFrame]: # TODO append random features and shuffle -def load_prostate_data() -> Tuple[pd.Series, pd.DataFrame]: +def load_prostate_data() -> tuple[pd.Series, pd.DataFrame]: """Load prostate data. The data is loaded and parsed from @@ -99,7 +97,7 @@ def load_prostate_data() -> Tuple[pd.Series, pd.DataFrame]: return pd.Series(labels), data -def load_leukemia_data() -> Tuple[pd.Series, pd.DataFrame]: +def load_leukemia_data() -> tuple[pd.Series, pd.DataFrame]: """Load leukemia data. The data is loaded and parsed from the internet. diff --git a/tests/test_openai.py b/tests/test_openai.py index 6595e67..53c3562 100644 --- a/tests/test_openai.py +++ b/tests/test_openai.py @@ -5,7 +5,6 @@ import os from pathlib import Path -from typing import List import pytest import yaml @@ -36,7 +35,7 @@ def test_OpenAiTokenCounter_call_string(): @settings(max_examples=1000) @given(texts=lists(text())) -def test_OpenAiTokenCounter_list_hypothesis(texts: List[str], gpt_4_open_ai_token_counter: OpenAiTokenCounter): +def test_OpenAiTokenCounter_list_hypothesis(texts: list[str], gpt_4_open_ai_token_counter: OpenAiTokenCounter): token_count = gpt_4_open_ai_token_counter(texts) assert len(token_count) == len(texts) # type: ignore[arg-type] assert all(count >= 0 for count in token_count) # type: ignore[union-attr]
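Note on the typing hunks above: raising the minimum Python version to 3.9 makes the PEP 585 built-in generics available, which is why the patch replaces typing.List/Dict/Set/Tuple with the lowercase built-ins and imports Sequence/Iterable from collections.abc. The sketch below only illustrates that target style; the function names are made up and are not part of mltb2. Optional and Union are still imported from typing because the "X | Y" syntax requires Python 3.10.

from collections.abc import Iterable, Sequence
from typing import Optional, Union  # still needed on 3.9; PEP 604 "X | Y" arrives in 3.10


def count_words(text: Union[str, Iterable[str]]) -> Union[int, list[int]]:
    """Count whitespace-separated words for a single text or an iterable of texts."""
    if isinstance(text, str):
        return len(text.split())
    return [len(t.split()) for t in text]


def check_config_keys(config: dict[str, Optional[str]], expected_keys: Sequence[str]) -> None:
    """Raise if an expected key is missing; uses built-in generics (PEP 585) instead of typing.Dict."""
    missing: set[str] = {key for key in expected_keys if key not in config}
    if missing:
        raise ValueError(f"missing config keys: {sorted(missing)}")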
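The arangodb.py hunk also rewrites the backup routine to group its two context managers in parentheses instead of wrapping the line inside the gzip.open() call. A minimal, self-contained sketch of that pattern follows; the dummy client class and output file name are invented for illustration only. This grouping syntax was formally added in Python 3.10, although the PEG parser used by CPython since 3.9 already accepts it.

import gzip
from contextlib import closing


class DummyClient:
    """Stand-in for ArangoClient; closing() only needs an object with a close() method."""

    def close(self) -> None:
        print("client closed")


# Parenthesized context managers keep each manager on its own line,
# as in the reformatted arango_collection_backup() above.
with (
    closing(DummyClient()) as client,
    gzip.open("example_backup.jsonl.gz", "w") as gzip_out,
):
    gzip_out.write(b'{"_key": "example"}\n')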