Switch to ruff, include new rules and adapt code (#34)

* Switch to ruff and adapt * More linting rules and adaptations * Even more linting and adjustments * Remove unnecessary test
dobraczka · Jan 2, 2024 · e97e28e · e97e28e
1 parent 95c6852
commit e97e28e
Show file tree

Hide file tree

Showing 33 changed files with 750 additions and 1,064 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,9 +1,24 @@
 repos:
-  - repo: local
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.5.0
     hooks:
-      - id: nox
-        name: Nox
-        entry: nox -rs lint --
-        language: system
-        types: [python]
-        require_serial: true
+      - id: check-added-large-files
+      - id: check-case-conflict
+      - id: check-merge-conflict
+      - id: check-symlinks
+      - id: check-yaml
+      - id: debug-statements
+      - id: end-of-file-fixer
+      - id: mixed-line-ending
+      - id: requirements-txt-fixer
+      - id: trailing-whitespace
+  - repo: https://github.com/tox-dev/pyproject-fmt
+    rev: "1.3.0"
+    hooks:
+      - id: pyproject-fmt
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.1.3
+    hooks:
+      - id: ruff
+        args: ["--fix", "--show-fixes"]
+      - id: ruff-format
diff --git a/docs/conf.py b/docs/conf.py
@@ -13,9 +13,10 @@
 import os
 import sys
 
-from kiez import __version__
 from sphinx.ext.autodoc import between
 
+from kiez import __version__
+
 sys.path.insert(0, os.path.abspath("."))
 
 
@@ -30,7 +31,7 @@ def setup(app):
 # -- Project information -----------------------------------------------------
 
 project = "kiez"
-copyright = "2021, Daniel Obraczka"
+copyright = "2021, Daniel Obraczka"  # noqa: A001
 author = "Daniel Obraczka"
 
 # The full version, including alpha/beta/rc tags

diff --git a/docs/source/installation.rst b/docs/source/installation.rst
@@ -31,11 +31,10 @@ You can also get other specific libraries with e.g.:
 Other options to get specific libraries are ``nmslib``,``annoy``, ``ngt``. However faiss is the recommended library, which provides the most accurate and fastest results.
 
 
-To build kiez from source use `poetry <https://python-poetry.org/>`_ 
+To build kiez from source use `poetry <https://python-poetry.org/>`_
 
 .. code-block:: bash
 
-   git clone git@github.com:dobraczka/kiez.git 
+   git clone git@github.com:dobraczka/kiez.git
    cd kiez
    poetry install
-
diff --git a/kiez/analysis/__init__.py b/kiez/analysis/__init__.py
@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
 # SPDX-License-Identifier: BSD-3-Clause
 
 from .estimation import hubness_score

diff --git a/kiez/analysis/estimation.py b/kiez/analysis/estimation.py
@@ -1,17 +1,14 @@
 #!/usr/bin/env python3
-# -*- coding: utf-8 -*-
 # SPDX-License-Identifier: BSD-3-Clause
 # adapted from skhubness: https://github.com/VarIr/scikit-hubness/
-"""
-Estimate hubness in datasets
-"""
+"""Estimate hubness in datasets."""
 
 
 from __future__ import annotations
 
 import logging
 import warnings
-from typing import Optional, Tuple, Union
+from typing import Optional, Union
 
 import numpy as np
 from scipy import stats
@@ -34,6 +31,8 @@
     "k_occurrence",
 ]
 
+_SPACE_LIMIT = 10000
+
 
 def _calc_skewness_truncnorm(k_occurrence: np.ndarray) -> float:
     """Hubness measure; corrected for non-negativity of k-occurrence.
@@ -56,8 +55,7 @@ def _calc_skewness_truncnorm(k_occurrence: np.ndarray) -> float:
     k_occurrence_std = k_occurrence.std(ddof=1)
     a = (clip_left - k_occurrence_mean) / k_occurrence_std
     b = (clip_right - k_occurrence_mean) / k_occurrence_std
-    skew_truncnorm = stats.truncnorm(a, b).moment(3)
-    return skew_truncnorm
+    return stats.truncnorm(a, b).moment(3)
 
 
 def _calc_gini_index(
@@ -151,7 +149,7 @@ def _calc_atkinson_index(k_occurrence: np.ndarray, eps: float = 0.5) -> float:
     return float(1.0 - 1.0 / k_occurrence.mean() * term)
 
 
-def _calc_antihub_occurrence(k_occurrence: np.ndarray) -> Tuple[np.ndarray, float]:
+def _calc_antihub_occurrence(k_occurrence: np.ndarray) -> tuple[np.ndarray, float]:
     """Proportion of antihubs in data set.
 
     Antihubs are objects that are never among the nearest neighbors
@@ -173,7 +171,7 @@ def _calc_antihub_occurrence(k_occurrence: np.ndarray) -> Tuple[np.ndarray, floa
 
 def _calc_hub_occurrence(
     k: int, k_occurrence: np.ndarray, n_test: int, hub_size: float = 2
-) -> Tuple[np.ndarray, float]:
+) -> tuple[np.ndarray, float]:
     """Proportion of nearest neighbor slots occupied by hubs.
 
     Parameters
@@ -202,8 +200,6 @@ def hubness_score(
     *,
     k: Optional[int] = None,
     hub_size: float = 2.0,
-    shuffle_equal: bool = True,
-    random_state=None,
     verbose: int = 0,
     return_value: str = "all_but_gini",
     store_k_occurrence: bool = False,
@@ -222,16 +218,6 @@ def hubness_score(
         number of k for k-nearest neighbor
     hub_size : float
         Hubs are defined as objects with k-occurrence > hub_size * k.
-    shuffle_equal : bool
-        If true shuffle neighbors with identical distances
-        to avoid artifact hubness.
-        NOTE: This is especially useful for secondary distance measures
-        with a finite number of possible values
-    random_state: int, RandomState instance or None, optional
-        If int, random_state is the seed used by the random number generator;
-        If RandomState instance, random_state is the random number generator;
-        If None, the random number generator is the RandomState instance used
-        by `np.random`.
     verbose : int
         Level of output messages
     return_value : str
@@ -288,12 +274,11 @@ def hubness_score(
     k_neighbors = nn_ind.copy()
     if k is None:
         k = nn_ind.shape[1]
-    else:
-        if k < k_neighbors.shape[1]:
-            k_neighbors = k_neighbors[:, :k]
-        elif k > k_neighbors.shape[1]:
-            k = nn_ind.shape[1]
-            warnings.warn(f"k > nn_ind.shape[1], k will be set to {k}")
+    elif k < k_neighbors.shape[1]:
+        k_neighbors = k_neighbors[:, :k]
+    elif k > k_neighbors.shape[1]:
+        k = nn_ind.shape[1]
+        warnings.warn(f"k > nn_ind.shape[1], k will be set to {k}", stacklevel=2)
     assert k is not None
 
     # Negative indices can occur, when ANN does not find enough neighbors,
@@ -317,7 +302,7 @@ def hubness_score(
 
     # Gini index
     if return_value in ["gini", "all"]:
-        limiting = "space" if k_occurrence.shape[0] > 10_000 else "time"
+        limiting = "space" if k_occurrence.shape[0] > _SPACE_LIMIT else "time"
         gini_index = _calc_gini_index(k_occurrence, limiting, verbose=verbose)
     else:
         gini_index = np.nan
@@ -360,8 +345,7 @@ def hubness_score(
         hubness_measures["k_occurrence"] = k_occurrence
     if return_value == "all":
         return hubness_measures
-    elif return_value == "all_but_gini":
+    if return_value == "all_but_gini":
         del hubness_measures["gini"]
         return hubness_measures
-    else:
-        return hubness_measures[return_value]
+    return hubness_measures[return_value]
diff --git a/kiez/evaluate/eval_metrics.py b/kiez/evaluate/eval_metrics.py
@@ -1,6 +1,4 @@
-"""
-Calculate evaluation metrics such as hits@k
-"""
+"""Calculate evaluation metrics such as hits@k."""
 from typing import Any, Dict, List, Union
 
 import numpy as np
@@ -27,7 +25,7 @@ def hits(
     gold: Dict[Any, Any],  # source -> target
     k=None,
 ) -> Dict[int, float]:
-    """Show hits@k
+    """Show hits@k.
 
     Parameters
     ----------

diff --git a/kiez/hubness_reduction/base.py b/kiez/hubness_reduction/base.py
@@ -46,13 +46,15 @@ def transform(self, neigh_dist, neigh_ind, query) -> Tuple[np.ndarray, np.ndarra
     def _set_k_if_needed(self, k: Optional[int] = None) -> int:
         if k is None:
             warnings.warn(
-                f"No k supplied, setting to n_candidates = {self.nn_algo.n_candidates}"
+                f"No k supplied, setting to n_candidates = {self.nn_algo.n_candidates}",
+                stacklevel=2,
             )
             return self.nn_algo.n_candidates
         if k > self.nn_algo.n_candidates:
             warnings.warn(
                 "k > n_candidates supplied! Setting to n_candidates ="
-                f" {self.nn_algo.n_candidates}"
+                f" {self.nn_algo.n_candidates}",
+                stacklevel=2,
             )
             return self.nn_algo.n_candidates
         return k

diff --git a/kiez/hubness_reduction/csls.py b/kiez/hubness_reduction/csls.py
@@ -1,10 +1,7 @@
 from __future__ import annotations
 
-import warnings
-from typing import Tuple
-
 import numpy as np
-from sklearn.utils.validation import check_consistent_length, check_is_fitted
+from sklearn.utils.validation import check_is_fitted
 from tqdm.auto import tqdm
 
 from .base import HubnessReduction
@@ -62,7 +59,7 @@ def transform(
         neigh_dist,
         neigh_ind,
         query,
-    ) -> Tuple[np.ndarray, np.ndarray]:
+    ) -> tuple[np.ndarray, np.ndarray]:
         """Transform distance between test and training data with CSLS.
 
         Parameters

diff --git a/kiez/hubness_reduction/dis_sim.py b/kiez/hubness_reduction/dis_sim.py
@@ -1,19 +1,18 @@
-# -*- coding: utf-8 -*-
 # SPDX-License-Identifier: BSD-3-Clause
 # adapted from skhubness: https://github.com/VarIr/scikit-hubness/
 
 from __future__ import annotations
 
-import warnings
-from typing import Tuple
-
 import numpy as np
 from sklearn.metrics import euclidean_distances
 from sklearn.utils.extmath import row_norms
-from sklearn.utils.validation import check_consistent_length, check_is_fitted
+from sklearn.utils.validation import check_is_fitted
 
 from .base import HubnessReduction
 
+_DESIRED_P_VALUE = 2
+_MINIMUM_DIST = 0.0
+
 
 class DisSimLocal(HubnessReduction):
     """Hubness reduction with DisSimLocal.
@@ -31,7 +30,7 @@ class DisSimLocal(HubnessReduction):
     ----------
     .. [1] Hara K, Suzuki I, Kobayashi K, Fukumizu K, Radovanović M (2016)
            Flattening the density gradient for eliminating spatial centrality to reduce hubness.
-           In: Proceedings of the 30th AAAI conference on artificial intelligence, pp 1659–1665.
+           In: Proceedings of the 30th AAAI conference on artificial intelligence, pp 1659-1665.
            https://www.aaai.org/ocs/index.php/AAAI/AAAI16/paper/viewPaper/12055
     """
 
@@ -40,13 +39,12 @@ def __init__(self, squared: bool = True, **kwargs):
         self.squared = squared
         if self.nn_algo.metric in ["euclidean", "minkowski"]:
             self.squared = False
-            if hasattr(self.nn_algo, "p"):
-                if self.nn_algo.p != 2:
-                    raise ValueError(
-                        "DisSimLocal only supports squared Euclidean distances. If"
-                        " the provided NNAlgorithm has a `p` parameter it must be"
-                        f" set to p=2. Now it is p={self.nn_algo.p}"
-                    )
+            if hasattr(self.nn_algo, "p") and self.nn_algo.p != _DESIRED_P_VALUE:
+                raise ValueError(
+                    "DisSimLocal only supports squared Euclidean distances. If"
+                    " the provided NNAlgorithm has a `p` parameter it must be"
+                    f" set to p=2. Now it is p={self.nn_algo.p}"
+                )
         elif self.nn_algo.metric in ["sqeuclidean"]:
             self.squared = True
         else:
@@ -102,7 +100,7 @@ def transform(
         neigh_dist: np.ndarray,
         neigh_ind: np.ndarray,
         query: np.ndarray,
-    ) -> Tuple[np.ndarray, np.ndarray]:
+    ) -> tuple[np.ndarray, np.ndarray]:
         """Transform distance between test and training data with DisSimLocal.
 
         Parameters
@@ -156,7 +154,7 @@ def transform(
         # certain scikit-learn routines (e.g. in metric='precomputed' usages).
         # We, therefore, shift dissimilarities to non-negative values, if necessary.
         min_dist = hub_reduced_dist.min()
-        if min_dist < 0.0:
+        if min_dist < _MINIMUM_DIST:
             hub_reduced_dist += -min_dist
 
         # Return Euclidean or squared Euclidean distances?

diff --git a/kiez/hubness_reduction/local_scaling.py b/kiez/hubness_reduction/local_scaling.py
@@ -1,14 +1,10 @@
-# -*- coding: utf-8 -*-
 # SPDX-License-Identifier: BSD-3-Clause
 # adapted from skhubness: https://github.com/VarIr/scikit-hubness/
 
 from __future__ import annotations
 
-import warnings
-from typing import Tuple
-
 import numpy as np
-from sklearn.utils.validation import check_consistent_length, check_is_fitted
+from sklearn.utils.validation import check_is_fitted
 from tqdm.auto import tqdm
 
 from .base import HubnessReduction
@@ -34,7 +30,7 @@ class LocalScaling(HubnessReduction):
     ----------
     .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012).
            Local and global scaling reduce hubs in space. The Journal of Machine
-           Learning Research, 13(1), 2871–2902.
+           Learning Research, 13(1), 2871-2902.
     """
 
     def __init__(self, method: str = "standard", **kwargs):
@@ -86,7 +82,7 @@ def transform(
         neigh_dist,
         neigh_ind,
         query=None,
-    ) -> Tuple[np.ndarray, np.ndarray]:
+    ) -> tuple[np.ndarray, np.ndarray]:
         """Transform distance between test and training data with Mutual Proximity.
 
         Parameters
@@ -146,7 +142,7 @@ def transform(
             r_s_to_t = r_dist_s_to_t.mean(axis=1)
             for i in range_n_test:
                 hub_reduced_dist[i, :] = neigh_dist[i] / np.sqrt(
-                    (r_s_to_t[i] * r_t_to_s[neigh_ind[i]])
+                    r_s_to_t[i] * r_t_to_s[neigh_ind[i]]
                 )
 
         # Return the hubness reduced distances