Skip to content

Commit

Permalink
Update Python Versions (#166)
Browse files Browse the repository at this point in the history
* update poetry config

* update github worker configs

* update contribute guide

* fix fasttext issue

* update ruff target version to 3.9

* fix code format

* change to py 3.9 typing

* fix mypy issues

* bump to 1.0.1rc5
  • Loading branch information
PhilipMay authored Oct 22, 2024
1 parent e946764 commit 25ee24d
Show file tree
Hide file tree
Showing 24 changed files with 103 additions and 98 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build_deploy_doc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ jobs:
# https://github.com/actions/setup-python
- uses: actions/setup-python@v4
with:
python-version: "3.8"
python-version: "3.9"

- name: Install
run: |
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/mypy_check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ jobs:
# https://github.com/actions/setup-python
- uses: actions/setup-python@v4
with:
python-version: "3.8"
python-version: "3.9"

- name: Install
run: |
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/pypi_upload.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ jobs:
# https://github.com/actions/setup-python
- uses: actions/setup-python@v4
with:
python-version: "3.8"
python-version: "3.9"

- name: Install
run: |
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/pytest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.8", "3.9", "3.10", "3.11"]
python-version: ["3.9", "3.10", "3.11", "3.12"]

steps:
# https://github.com/actions/checkout
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/ruff_check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ jobs:
# https://github.com/actions/setup-python
- uses: actions/setup-python@v4
with:
python-version: "3.8"
python-version: "3.9"

- name: Install
run: |
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/static_checks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ jobs:
# https://github.com/actions/setup-python
- uses: actions/setup-python@v4
with:
python-version: "3.8"
python-version: "3.9"

- name: Install
run: |
Expand Down
10 changes: 5 additions & 5 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -177,16 +177,16 @@ First [install pyenv](https://github.com/pyenv/pyenv#installation) if you do not
Next install the appropriate Python version.
We recommend the development on the oldest still permitted Python version of the project.
This version number can be found in the `pyproject.toml` file in the setting called
`tool.poetry.dependencies.python`. If this is set like `python = "^3.8"`
we use pyenv to install Python 3.8:
`pyenv install 3.8`
This installs the latest 3.8 Python version.
`tool.poetry.dependencies.python`. If this is set like `python = "^3.9"`
we use pyenv to install Python 3.9:
`pyenv install 3.9`
This installs the latest 3.9 Python version.

If the Python installation was successful we use `pyenv versions` to see which exact version is installed.
Then we activate this version with `pyenv local <version>`.
This command will create a `.python-version` file in the project directory.
Make sure that you are still in the project directory.
For example execute: `pyenv local 3.8.17`
For example execute: `pyenv local 3.9`

### 5. Install the Project with Poetry

Expand Down
4 changes: 2 additions & 2 deletions docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))
from typing import List


# -- Project information -----------------------------------------------------

Expand Down Expand Up @@ -54,7 +54,7 @@
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns: List[str] = []
exclude_patterns: list[str] = []


# -- Options for HTML output -------------------------------------------------
Expand Down
14 changes: 8 additions & 6 deletions mltb2/arangodb.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,10 @@

import gzip
from argparse import ArgumentParser
from collections.abc import Sequence
from contextlib import closing
from dataclasses import dataclass
from typing import Any, Dict, Optional, Sequence, Union
from typing import Any, Optional, Union

import jsonlines
from arango import ArangoClient
Expand All @@ -26,7 +27,7 @@
from mltb2.db import AbstractBatchDataManager


def _check_config_keys(config: Dict[str, Optional[str]], expected_config_keys: Sequence[str]) -> None:
def _check_config_keys(config: dict[str, Optional[str]], expected_config_keys: Sequence[str]) -> None:
"""Check if all expected keys are in config.
This is useful to check if a config file contains all necessary keys.
Expand Down Expand Up @@ -211,9 +212,10 @@ def arango_collection_backup() -> None:
output_file_name = f"./{args.col}_backup.jsonl.gz"
print(f"Writing backup to '{output_file_name}'...")

with closing(ArangoClient(hosts=arango_config["hosts"])) as arango_client, gzip.open( # type: ignore[arg-type]
output_file_name, "w"
) as gzip_out:
with (
closing(ArangoClient(hosts=arango_config["hosts"])) as arango_client, # type: ignore[arg-type]
gzip.open(output_file_name, "w") as gzip_out,
):
connection = arango_client.db(
arango_config["db_name"], # type: ignore[arg-type]
arango_config["username"], # type: ignore[arg-type]
Expand Down Expand Up @@ -288,7 +290,7 @@ def from_config_file(cls, config_file_name):
)

def import_dicts(
self, dicts: Sequence[Dict[str, Any]], collection_name: str, create_collection: bool = False
self, dicts: Sequence[dict[str, Any]], collection_name: str, create_collection: bool = False
) -> None:
"""Import data to ArangoDB.
Expand Down
8 changes: 4 additions & 4 deletions mltb2/bs.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
``pip install mltb2[bs]``
"""

from typing import Any, Dict, Optional
from typing import Any, Optional

import mdformat
from bs4 import BeautifulSoup
Expand All @@ -35,7 +35,7 @@ def extract_text(soup: BeautifulSoup, join_str: Optional[str] = None) -> str:
return result


def extract_one(soup: BeautifulSoup, name=None, attrs: Optional[dict] = None, **kwargs: Dict[str, Any]) -> Any:
def extract_one(soup: BeautifulSoup, name=None, attrs: Optional[dict] = None, **kwargs: dict[str, Any]) -> Any:
"""Extract exactly one specified element from a BeautifulSoup object.
This function expects that exactly one result is found.
Expand All @@ -60,7 +60,7 @@ def extract_one(soup: BeautifulSoup, name=None, attrs: Optional[dict] = None, **
return result


def extract_all(soup: BeautifulSoup, name=None, attrs: Optional[dict] = None, **kwargs: Dict[str, Any]) -> Any:
def extract_all(soup: BeautifulSoup, name=None, attrs: Optional[dict] = None, **kwargs: dict[str, Any]) -> Any:
"""Extract all specified elements from a BeautifulSoup object.
Args:
Expand All @@ -77,7 +77,7 @@ def extract_all(soup: BeautifulSoup, name=None, attrs: Optional[dict] = None, **
return result


def remove_all(soup: BeautifulSoup, name=None, attrs: Optional[dict] = None, **kwargs: Dict[str, Any]) -> None:
def remove_all(soup: BeautifulSoup, name=None, attrs: Optional[dict] = None, **kwargs: dict[str, Any]) -> None:
"""Remove all specified elements from a BeautifulSoup object.
The removal is done in place. Nothing is returned.
Expand Down
8 changes: 4 additions & 4 deletions mltb2/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
import os
from hashlib import sha256
from io import StringIO
from typing import Optional, Tuple
from typing import Optional

import joblib
import numpy as np
Expand Down Expand Up @@ -101,7 +101,7 @@ def _load_colon_label() -> pd.Series:
return label_series


def load_colon(mltb2_base_data_dir: Optional[str] = None) -> Tuple[pd.Series, pd.DataFrame]:
def load_colon(mltb2_base_data_dir: Optional[str] = None) -> tuple[pd.Series, pd.DataFrame]:
"""Load colon data.
The data is loaded and parsed from the internet.
Expand All @@ -128,7 +128,7 @@ def load_colon(mltb2_base_data_dir: Optional[str] = None) -> Tuple[pd.Series, pd
return result


def load_prostate(mltb2_base_data_dir: Optional[str] = None) -> Tuple[pd.Series, pd.DataFrame]:
def load_prostate(mltb2_base_data_dir: Optional[str] = None) -> tuple[pd.Series, pd.DataFrame]:
"""Load prostate data.
The data is loaded and parsed from `<https://web.stanford.edu/~hastie/CASI_files/DATA/prostate.html>`_.
Expand Down Expand Up @@ -177,7 +177,7 @@ def load_prostate(mltb2_base_data_dir: Optional[str] = None) -> Tuple[pd.Series,
return result


def load_leukemia_big(mltb2_base_data_dir: Optional[str] = None) -> Tuple[pd.Series, pd.DataFrame]:
def load_leukemia_big(mltb2_base_data_dir: Optional[str] = None) -> tuple[pd.Series, pd.DataFrame]:
"""Load leukemia (big) data.
The data is loaded and parsed from the internet.
Expand Down
3 changes: 2 additions & 1 deletion mltb2/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,9 @@
"""Database utils module."""

from abc import ABC, abstractmethod
from collections.abc import Sequence
from dataclasses import dataclass
from typing import Callable, Sequence
from typing import Callable


class AbstractBatchDataManager(ABC):
Expand Down
4 changes: 2 additions & 2 deletions mltb2/fasttext.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

import os
from dataclasses import dataclass, field
from typing import List, Optional
from typing import Optional

import fasttext
from fasttext.FastText import _FastText
Expand Down Expand Up @@ -52,7 +52,7 @@ def get_model_path_and_download() -> str:

return model_full_path

def __call__(self, text: str, num_lang: int = 10, always_detect_lang: Optional[List[str]] = None):
def __call__(self, text: str, num_lang: int = 10, always_detect_lang: Optional[list[str]] = None):
"""Identify languages of a given text.
Args:
Expand Down
31 changes: 16 additions & 15 deletions mltb2/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,10 @@
import contextlib
import os
import random
from collections.abc import Sequence
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence, Set
from typing import Any, Optional
from uuid import uuid4

import joblib
Expand Down Expand Up @@ -83,12 +84,12 @@ class FileBasedRestartableBatchDataProcessor:
result_dir: The directory where the results are stored.
"""

data: List[Dict[str, Any]]
data: list[dict[str, Any]]
batch_size: int
uuid_name: str
result_dir: str
_result_dir_path: Path = field(init=False, repr=False)
_own_lock_uuids: Set[str] = field(init=False, repr=False, default_factory=set)
_own_lock_uuids: set[str] = field(init=False, repr=False, default_factory=set)

def __post_init__(self) -> None:
"""Do post init."""
Expand All @@ -99,7 +100,7 @@ def __post_init__(self) -> None:
if not len(self.data) > 0:
raise ValueError("data must not be empty!")

uuids: Set[str] = set()
uuids: set[str] = set()

# check uuid_name
for idx, d in enumerate(self.data):
Expand Down Expand Up @@ -134,8 +135,8 @@ def _get_uuid_from_filename(filename: str) -> Optional[str]:
uuid = filename[: filename.rindex("_")]
return uuid

def _get_locked_or_done_uuids(self) -> Set[str]:
locked_or_done_uuids: Set[str] = set()
def _get_locked_or_done_uuids(self) -> set[str]:
locked_or_done_uuids: set[str] = set()
for child_path in self._result_dir_path.iterdir():
if child_path.is_file():
filename = child_path.name
Expand All @@ -144,20 +145,20 @@ def _get_locked_or_done_uuids(self) -> Set[str]:
locked_or_done_uuids.add(uuid)
return locked_or_done_uuids

def _write_lock_files(self, batch: Sequence[Dict[str, Any]]) -> None:
def _write_lock_files(self, batch: Sequence[dict[str, Any]]) -> None:
for d in batch:
uuid = d[self.uuid_name]
(self._result_dir_path / f"{uuid}.lock").touch()
self._own_lock_uuids.add(uuid)

def _get_remaining_data(self) -> List[Dict[str, Any]]:
locked_or_done_uuids: Set[str] = self._get_locked_or_done_uuids()
def _get_remaining_data(self) -> list[dict[str, Any]]:
locked_or_done_uuids: set[str] = self._get_locked_or_done_uuids()
remaining_data = [d for d in self.data if d[self.uuid_name] not in locked_or_done_uuids]
return remaining_data

def read_batch(self) -> Sequence[Dict[str, Any]]:
def read_batch(self) -> Sequence[dict[str, Any]]:
"""Read the next batch of data."""
remaining_data: List[Dict[str, Any]] = self._get_remaining_data()
remaining_data: list[dict[str, Any]] = self._get_remaining_data()

# if we think we are done, delete all lock files and check again
# this is because lock files might be orphaned
Expand All @@ -172,27 +173,27 @@ def read_batch(self) -> Sequence[Dict[str, Any]]:
self._write_lock_files(next_batch)
return next_batch

def _save_batch_data(self, batch: Sequence[Dict[str, Any]]) -> None:
def _save_batch_data(self, batch: Sequence[dict[str, Any]]) -> None:
for d in batch:
uuid = d[self.uuid_name]
if uuid not in self._own_lock_uuids:
raise ValueError(f"uuid '{uuid}' not locked by me!")
filename = self._result_dir_path / f"{uuid}_{str(uuid4())}.pkl.gz" # noqa: RUF010
joblib.dump(d, filename, compress=("gzip", 3))

def _remove_lock_files(self, batch: Sequence[Dict[str, Any]]) -> None:
def _remove_lock_files(self, batch: Sequence[dict[str, Any]]) -> None:
for d in batch:
uuid = d[self.uuid_name]
(self._result_dir_path / f"{uuid}.lock").unlink(missing_ok=True)
self._own_lock_uuids.discard(uuid)

def save_batch(self, batch: Sequence[Dict[str, Any]]) -> None:
def save_batch(self, batch: Sequence[dict[str, Any]]) -> None:
"""Save the batch of data."""
self._save_batch_data(batch)
self._remove_lock_files(batch)

@staticmethod
def load_data(result_dir: str, ignore_load_error: bool = False) -> List[Dict[str, Any]]:
def load_data(result_dir: str, ignore_load_error: bool = False) -> list[dict[str, Any]]:
"""Load all data.
After all data is processed, this method can be used to load all data.
Expand Down
14 changes: 7 additions & 7 deletions mltb2/md.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

import re
from dataclasses import dataclass
from typing import Final, List
from typing import Final

from tqdm import tqdm

Expand All @@ -21,7 +21,7 @@
_HEADLINE_REGEX: Final = re.compile(r"^#+ .*", flags=re.MULTILINE)


def _chunk_md_by_headline(md_text: str) -> List[str]:
def _chunk_md_by_headline(md_text: str) -> list[str]:
"""Chunk Markdown by headlines.
Args:
Expand All @@ -30,7 +30,7 @@ def _chunk_md_by_headline(md_text: str) -> List[str]:
Returns:
The list of Markdown chunks.
"""
positions: List[int] = [m.start() for m in re.finditer(_HEADLINE_REGEX, md_text)]
positions: list[int] = [m.start() for m in re.finditer(_HEADLINE_REGEX, md_text)]

# extend positions
if 0 not in positions:
Expand All @@ -41,7 +41,7 @@ def _chunk_md_by_headline(md_text: str) -> List[str]:
return result


def chunk_md(md_text: str) -> List[str]:
def chunk_md(md_text: str) -> list[str]:
"""Chunk Markdown by headlines and merge isolated headlines.
Merges isolated headlines with their corresponding subsequent paragraphs.
Expand Down Expand Up @@ -85,7 +85,7 @@ class MdTextSplitter:
transformers_token_counter: TransformersTokenCounter
show_progress_bar: bool = False

def __call__(self, md_text: str) -> List[str]:
def __call__(self, md_text: str) -> list[str]:
"""Split the Markdown text into sections.
Args:
Expand All @@ -98,8 +98,8 @@ def __call__(self, md_text: str) -> List[str]:

assert len(md_chunks) == len(counts) # type: ignore[arg-type]

result_merges: List[str] = []
temp_merges: List[str] = []
result_merges: list[str] = []
temp_merges: list[str] = []
current_count: int = 0

for md_chunk, count in zip(
Expand Down
Loading

0 comments on commit 25ee24d

Please sign in to comment.