Skip to content

Commit

Permalink
Update Python Versions (#166)
Browse files Browse the repository at this point in the history
* update poetry config

* update github worker configs

* update contribute guide

* fix fasttext issue

* update ruff target version to 3.9

* fix code format

* change to py 3.9 typing

* fix mypy issues

* bump to 1.0.1rc5
  • Loading branch information
PhilipMay authored Oct 22, 2024
1 parent e946764 commit 25ee24d
Show file tree
Hide file tree
Showing 24 changed files with 103 additions and 98 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build_deploy_doc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ jobs:
# https://github.com/actions/setup-python
- uses: actions/setup-python@v4
with:
python-version: "3.8"
python-version: "3.9"

- name: Install
run: |
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/mypy_check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ jobs:
# https://github.com/actions/setup-python
- uses: actions/setup-python@v4
with:
python-version: "3.8"
python-version: "3.9"

- name: Install
run: |
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/pypi_upload.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ jobs:
# https://github.com/actions/setup-python
- uses: actions/setup-python@v4
with:
python-version: "3.8"
python-version: "3.9"

- name: Install
run: |
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/pytest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.8", "3.9", "3.10", "3.11"]
python-version: ["3.9", "3.10", "3.11", "3.12"]

steps:
# https://github.com/actions/checkout
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/ruff_check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ jobs:
# https://github.com/actions/setup-python
- uses: actions/setup-python@v4
with:
python-version: "3.8"
python-version: "3.9"

- name: Install
run: |
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/static_checks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ jobs:
# https://github.com/actions/setup-python
- uses: actions/setup-python@v4
with:
python-version: "3.8"
python-version: "3.9"

- name: Install
run: |
Expand Down
10 changes: 5 additions & 5 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -177,16 +177,16 @@ First [install pyenv](https://github.com/pyenv/pyenv#installation) if you do not
Next install the appropriate Python version.
We recommend the development on the oldest still permitted Python version of the project.
This version number can be found in the `pyproject.toml` file in the setting called
`tool.poetry.dependencies.python`. If this is set like `python = "^3.8"`
we use pyenv to install Python 3.8:
`pyenv install 3.8`
This installs the latest 3.8 Python version.
`tool.poetry.dependencies.python`. If this is set like `python = "^3.9"`
we use pyenv to install Python 3.9:
`pyenv install 3.9`
This installs the latest 3.9 Python version.

If the Python installation was successful we use `pyenv versions` to see which exact version is installed.
Then we activate this version with `pyenv local <version>`.
This command will create a `.python-version` file in the project directory.
Make sure that you are still in the project directory.
For example execute: `pyenv local 3.8.17`
For example execute: `pyenv local 3.9`

### 5. Install the Project with Poetry

Expand Down
4 changes: 2 additions & 2 deletions docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))
from typing import List


# -- Project information -----------------------------------------------------

Expand Down Expand Up @@ -54,7 +54,7 @@
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns: List[str] = []
exclude_patterns: list[str] = []


# -- Options for HTML output -------------------------------------------------
Expand Down
14 changes: 8 additions & 6 deletions mltb2/arangodb.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,10 @@

import gzip
from argparse import ArgumentParser
from collections.abc import Sequence
from contextlib import closing
from dataclasses import dataclass
from typing import Any, Dict, Optional, Sequence, Union
from typing import Any, Optional, Union

import jsonlines
from arango import ArangoClient
Expand All @@ -26,7 +27,7 @@
from mltb2.db import AbstractBatchDataManager


def _check_config_keys(config: Dict[str, Optional[str]], expected_config_keys: Sequence[str]) -> None:
def _check_config_keys(config: dict[str, Optional[str]], expected_config_keys: Sequence[str]) -> None:
"""Check if all expected keys are in config.
This is useful to check if a config file contains all necessary keys.
Expand Down Expand Up @@ -211,9 +212,10 @@ def arango_collection_backup() -> None:
output_file_name = f"./{args.col}_backup.jsonl.gz"
print(f"Writing backup to '{output_file_name}'...")

with closing(ArangoClient(hosts=arango_config["hosts"])) as arango_client, gzip.open( # type: ignore[arg-type]
output_file_name, "w"
) as gzip_out:
with (
closing(ArangoClient(hosts=arango_config["hosts"])) as arango_client, # type: ignore[arg-type]
gzip.open(output_file_name, "w") as gzip_out,
):
connection = arango_client.db(
arango_config["db_name"], # type: ignore[arg-type]
arango_config["username"], # type: ignore[arg-type]
Expand Down Expand Up @@ -288,7 +290,7 @@ def from_config_file(cls, config_file_name):
)

def import_dicts(
self, dicts: Sequence[Dict[str, Any]], collection_name: str, create_collection: bool = False
self, dicts: Sequence[dict[str, Any]], collection_name: str, create_collection: bool = False
) -> None:
"""Import data to ArangoDB.
Expand Down
8 changes: 4 additions & 4 deletions mltb2/bs.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
``pip install mltb2[bs]``
"""

from typing import Any, Dict, Optional
from typing import Any, Optional

import mdformat
from bs4 import BeautifulSoup
Expand All @@ -35,7 +35,7 @@ def extract_text(soup: BeautifulSoup, join_str: Optional[str] = None) -> str:
return result


def extract_one(soup: BeautifulSoup, name=None, attrs: Optional[dict] = None, **kwargs: Dict[str, Any]) -> Any:
def extract_one(soup: BeautifulSoup, name=None, attrs: Optional[dict] = None, **kwargs: dict[str, Any]) -> Any:
"""Extract exactly one specified element from a BeautifulSoup object.
This function expects that exactly one result is found.
Expand All @@ -60,7 +60,7 @@ def extract_one(soup: BeautifulSoup, name=None, attrs: Optional[dict] = None, **
return result


def extract_all(soup: BeautifulSoup, name=None, attrs: Optional[dict] = None, **kwargs: Dict[str, Any]) -> Any:
def extract_all(soup: BeautifulSoup, name=None, attrs: Optional[dict] = None, **kwargs: dict[str, Any]) -> Any:
"""Extract all specified elements from a BeautifulSoup object.
Args:
Expand All @@ -77,7 +77,7 @@ def extract_all(soup: BeautifulSoup, name=None, attrs: Optional[dict] = None, **
return result


def remove_all(soup: BeautifulSoup, name=None, attrs: Optional[dict] = None, **kwargs: Dict[str, Any]) -> None:
def remove_all(soup: BeautifulSoup, name=None, attrs: Optional[dict] = None, **kwargs: dict[str, Any]) -> None:
"""Remove all specified elements from a BeautifulSoup object.
The removal is done in place. Nothing is returned.
Expand Down
8 changes: 4 additions & 4 deletions mltb2/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
import os
from hashlib import sha256
from io import StringIO
from typing import Optional, Tuple
from typing import Optional

import joblib
import numpy as np
Expand Down Expand Up @@ -101,7 +101,7 @@ def _load_colon_label() -> pd.Series:
return label_series


def load_colon(mltb2_base_data_dir: Optional[str] = None) -> Tuple[pd.Series, pd.DataFrame]:
def load_colon(mltb2_base_data_dir: Optional[str] = None) -> tuple[pd.Series, pd.DataFrame]:
"""Load colon data.
The data is loaded and parsed from the internet.
Expand All @@ -128,7 +128,7 @@ def load_colon(mltb2_base_data_dir: Optional[str] = None) -> Tuple[pd.Series, pd
return result


def load_prostate(mltb2_base_data_dir: Optional[str] = None) -> Tuple[pd.Series, pd.DataFrame]:
def load_prostate(mltb2_base_data_dir: Optional[str] = None) -> tuple[pd.Series, pd.DataFrame]:
"""Load prostate data.
The data is loaded and parsed from `<https://web.stanford.edu/~hastie/CASI_files/DATA/prostate.html>`_.
Expand Down Expand Up @@ -177,7 +177,7 @@ def load_prostate(mltb2_base_data_dir: Optional[str] = None) -> Tuple[pd.Series,
return result


def load_leukemia_big(mltb2_base_data_dir: Optional[str] = None) -> Tuple[pd.Series, pd.DataFrame]:
def load_leukemia_big(mltb2_base_data_dir: Optional[str] = None) -> tuple[pd.Series, pd.DataFrame]:
"""Load leukemia (big) data.
The data is loaded and parsed from the internet.
Expand Down
3 changes: 2 additions & 1 deletion mltb2/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,9 @@
"""Database utils module."""

from abc import ABC, abstractmethod
from collections.abc import Sequence
from dataclasses import dataclass
from typing import Callable, Sequence
from typing import Callable


class AbstractBatchDataManager(ABC):
Expand Down
4 changes: 2 additions & 2 deletions mltb2/fasttext.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

import os
from dataclasses import dataclass, field
from typing import List, Optional
from typing import Optional

import fasttext
from fasttext.FastText import _FastText
Expand Down Expand Up @@ -52,7 +52,7 @@ def get_model_path_and_download() -> str:

return model_full_path

def __call__(self, text: str, num_lang: int = 10, always_detect_lang: Optional[List[str]] = None):
def __call__(self, text: str, num_lang: int = 10, always_detect_lang: Optional[list[str]] = None):
"""Identify languages of a given text.
Args:
Expand Down
31 changes: 16 additions & 15 deletions mltb2/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,10 @@
import contextlib
import os
import random
from collections.abc import Sequence
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence, Set
from typing import Any, Optional
from uuid import uuid4

import joblib
Expand Down Expand Up @@ -83,12 +84,12 @@ class FileBasedRestartableBatchDataProcessor:
result_dir: The directory where the results are stored.
"""

data: List[Dict[str, Any]]
data: list[dict[str, Any]]
batch_size: int
uuid_name: str
result_dir: str
_result_dir_path: Path = field(init=False, repr=False)
_own_lock_uuids: Set[str] = field(init=False, repr=False, default_factory=set)
_own_lock_uuids: set[str] = field(init=False, repr=False, default_factory=set)

def __post_init__(self) -> None:
"""Do post init."""
Expand All @@ -99,7 +100,7 @@ def __post_init__(self) -> None:
if not len(self.data) > 0:
raise ValueError("data must not be empty!")

uuids: Set[str] = set()
uuids: set[str] = set()

# check uuid_name
for idx, d in enumerate(self.data):
Expand Down Expand Up @@ -134,8 +135,8 @@ def _get_uuid_from_filename(filename: str) -> Optional[str]:
uuid = filename[: filename.rindex("_")]
return uuid

def _get_locked_or_done_uuids(self) -> Set[str]:
locked_or_done_uuids: Set[str] = set()
def _get_locked_or_done_uuids(self) -> set[str]:
locked_or_done_uuids: set[str] = set()
for child_path in self._result_dir_path.iterdir():
if child_path.is_file():
filename = child_path.name
Expand All @@ -144,20 +145,20 @@ def _get_locked_or_done_uuids(self) -> Set[str]:
locked_or_done_uuids.add(uuid)
return locked_or_done_uuids

def _write_lock_files(self, batch: Sequence[Dict[str, Any]]) -> None:
def _write_lock_files(self, batch: Sequence[dict[str, Any]]) -> None:
for d in batch:
uuid = d[self.uuid_name]
(self._result_dir_path / f"{uuid}.lock").touch()
self._own_lock_uuids.add(uuid)

def _get_remaining_data(self) -> List[Dict[str, Any]]:
locked_or_done_uuids: Set[str] = self._get_locked_or_done_uuids()
def _get_remaining_data(self) -> list[dict[str, Any]]:
locked_or_done_uuids: set[str] = self._get_locked_or_done_uuids()
remaining_data = [d for d in self.data if d[self.uuid_name] not in locked_or_done_uuids]
return remaining_data

def read_batch(self) -> Sequence[Dict[str, Any]]:
def read_batch(self) -> Sequence[dict[str, Any]]:
"""Read the next batch of data."""
remaining_data: List[Dict[str, Any]] = self._get_remaining_data()
remaining_data: list[dict[str, Any]] = self._get_remaining_data()

# if we think we are done, delete all lock files and check again
# this is because lock files might be orphaned
Expand All @@ -172,27 +173,27 @@ def read_batch(self) -> Sequence[Dict[str, Any]]:
self._write_lock_files(next_batch)
return next_batch

def _save_batch_data(self, batch: Sequence[Dict[str, Any]]) -> None:
def _save_batch_data(self, batch: Sequence[dict[str, Any]]) -> None:
for d in batch:
uuid = d[self.uuid_name]
if uuid not in self._own_lock_uuids:
raise ValueError(f"uuid '{uuid}' not locked by me!")
filename = self._result_dir_path / f"{uuid}_{str(uuid4())}.pkl.gz" # noqa: RUF010
joblib.dump(d, filename, compress=("gzip", 3))

def _remove_lock_files(self, batch: Sequence[Dict[str, Any]]) -> None:
def _remove_lock_files(self, batch: Sequence[dict[str, Any]]) -> None:
for d in batch:
uuid = d[self.uuid_name]
(self._result_dir_path / f"{uuid}.lock").unlink(missing_ok=True)
self._own_lock_uuids.discard(uuid)

def save_batch(self, batch: Sequence[Dict[str, Any]]) -> None:
def save_batch(self, batch: Sequence[dict[str, Any]]) -> None:
"""Save the batch of data."""
self._save_batch_data(batch)
self._remove_lock_files(batch)

@staticmethod
def load_data(result_dir: str, ignore_load_error: bool = False) -> List[Dict[str, Any]]:
def load_data(result_dir: str, ignore_load_error: bool = False) -> list[dict[str, Any]]:
"""Load all data.
After all data is processed, this method can be used to load all data.
Expand Down
14 changes: 7 additions & 7 deletions mltb2/md.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

import re
from dataclasses import dataclass
from typing import Final, List
from typing import Final

from tqdm import tqdm

Expand All @@ -21,7 +21,7 @@
_HEADLINE_REGEX: Final = re.compile(r"^#+ .*", flags=re.MULTILINE)


def _chunk_md_by_headline(md_text: str) -> List[str]:
def _chunk_md_by_headline(md_text: str) -> list[str]:
"""Chunk Markdown by headlines.
Args:
Expand All @@ -30,7 +30,7 @@ def _chunk_md_by_headline(md_text: str) -> List[str]:
Returns:
The list of Markdown chunks.
"""
positions: List[int] = [m.start() for m in re.finditer(_HEADLINE_REGEX, md_text)]
positions: list[int] = [m.start() for m in re.finditer(_HEADLINE_REGEX, md_text)]

# extend positions
if 0 not in positions:
Expand All @@ -41,7 +41,7 @@ def _chunk_md_by_headline(md_text: str) -> List[str]:
return result


def chunk_md(md_text: str) -> List[str]:
def chunk_md(md_text: str) -> list[str]:
"""Chunk Markdown by headlines and merge isolated headlines.
Merges isolated headlines with their corresponding subsequent paragraphs.
Expand Down Expand Up @@ -85,7 +85,7 @@ class MdTextSplitter:
transformers_token_counter: TransformersTokenCounter
show_progress_bar: bool = False

def __call__(self, md_text: str) -> List[str]:
def __call__(self, md_text: str) -> list[str]:
"""Split the Markdown text into sections.
Args:
Expand All @@ -98,8 +98,8 @@ def __call__(self, md_text: str) -> List[str]:

assert len(md_chunks) == len(counts) # type: ignore[arg-type]

result_merges: List[str] = []
temp_merges: List[str] = []
result_merges: list[str] = []
temp_merges: list[str] = []
current_count: int = 0

for md_chunk, count in zip(
Expand Down
Loading

0 comments on commit 25ee24d

Please sign in to comment.