Even more linting and adjustments

dobraczka · Dec 22, 2023 · a4dc4c3 · a4dc4c3
1 parent 91c14b7
commit a4dc4c3
Show file tree

Hide file tree

Showing 19 changed files with 80 additions and 107 deletions.
diff --git a/kiez/analysis/estimation.py b/kiez/analysis/estimation.py
@@ -31,6 +31,8 @@
     "k_occurrence",
 ]
 
+_SPACE_LIMIT = 10000
+
 
 def _calc_skewness_truncnorm(k_occurrence: np.ndarray) -> float:
     """Hubness measure; corrected for non-negativity of k-occurrence.
@@ -53,8 +55,7 @@ def _calc_skewness_truncnorm(k_occurrence: np.ndarray) -> float:
     k_occurrence_std = k_occurrence.std(ddof=1)
     a = (clip_left - k_occurrence_mean) / k_occurrence_std
     b = (clip_right - k_occurrence_mean) / k_occurrence_std
-    skew_truncnorm = stats.truncnorm(a, b).moment(3)
-    return skew_truncnorm
+    return stats.truncnorm(a, b).moment(3)
 
 
 def _calc_gini_index(
@@ -199,8 +200,6 @@ def hubness_score(
     *,
     k: Optional[int] = None,
     hub_size: float = 2.0,
-    shuffle_equal: bool = True,
-    random_state=None,
     verbose: int = 0,
     return_value: str = "all_but_gini",
     store_k_occurrence: bool = False,
@@ -219,16 +218,6 @@ def hubness_score(
         number of k for k-nearest neighbor
     hub_size : float
         Hubs are defined as objects with k-occurrence > hub_size * k.
-    shuffle_equal : bool
-        If true shuffle neighbors with identical distances
-        to avoid artifact hubness.
-        NOTE: This is especially useful for secondary distance measures
-        with a finite number of possible values
-    random_state: int, RandomState instance or None, optional
-        If int, random_state is the seed used by the random number generator;
-        If RandomState instance, random_state is the random number generator;
-        If None, the random number generator is the RandomState instance used
-        by `np.random`.
     verbose : int
         Level of output messages
     return_value : str
@@ -285,12 +274,11 @@ def hubness_score(
     k_neighbors = nn_ind.copy()
     if k is None:
         k = nn_ind.shape[1]
-    else:
-        if k < k_neighbors.shape[1]:
-            k_neighbors = k_neighbors[:, :k]
-        elif k > k_neighbors.shape[1]:
-            k = nn_ind.shape[1]
-            warnings.warn(f"k > nn_ind.shape[1], k will be set to {k}", stacklevel=2)
+    elif k < k_neighbors.shape[1]:
+        k_neighbors = k_neighbors[:, :k]
+    elif k > k_neighbors.shape[1]:
+        k = nn_ind.shape[1]
+        warnings.warn(f"k > nn_ind.shape[1], k will be set to {k}", stacklevel=2)
     assert k is not None
 
     # Negative indices can occur, when ANN does not find enough neighbors,
@@ -314,7 +302,7 @@ def hubness_score(
 
     # Gini index
     if return_value in ["gini", "all"]:
-        limiting = "space" if k_occurrence.shape[0] > 10_000 else "time"
+        limiting = "space" if k_occurrence.shape[0] > _SPACE_LIMIT else "time"
         gini_index = _calc_gini_index(k_occurrence, limiting, verbose=verbose)
     else:
         gini_index = np.nan
@@ -357,8 +345,7 @@ def hubness_score(
         hubness_measures["k_occurrence"] = k_occurrence
     if return_value == "all":
         return hubness_measures
-    elif return_value == "all_but_gini":
+    if return_value == "all_but_gini":
         del hubness_measures["gini"]
         return hubness_measures
-    else:
-        return hubness_measures[return_value]
+    return hubness_measures[return_value]
diff --git a/kiez/hubness_reduction/dis_sim.py b/kiez/hubness_reduction/dis_sim.py
@@ -10,6 +10,9 @@
 
 from .base import HubnessReduction
 
+_DESIRED_P_VALUE = 2
+_MINIMUM_DIST = 0.0
+
 
 class DisSimLocal(HubnessReduction):
     """Hubness reduction with DisSimLocal.
@@ -27,7 +30,7 @@ class DisSimLocal(HubnessReduction):
     ----------
     .. [1] Hara K, Suzuki I, Kobayashi K, Fukumizu K, Radovanović M (2016)
            Flattening the density gradient for eliminating spatial centrality to reduce hubness.
-           In: Proceedings of the 30th AAAI conference on artificial intelligence, pp 1659–1665.
+           In: Proceedings of the 30th AAAI conference on artificial intelligence, pp 1659-1665.
            https://www.aaai.org/ocs/index.php/AAAI/AAAI16/paper/viewPaper/12055
     """
 
@@ -36,13 +39,12 @@ def __init__(self, squared: bool = True, **kwargs):
         self.squared = squared
         if self.nn_algo.metric in ["euclidean", "minkowski"]:
             self.squared = False
-            if hasattr(self.nn_algo, "p"):
-                if self.nn_algo.p != 2:
-                    raise ValueError(
-                        "DisSimLocal only supports squared Euclidean distances. If"
-                        " the provided NNAlgorithm has a `p` parameter it must be"
-                        f" set to p=2. Now it is p={self.nn_algo.p}"
-                    )
+            if hasattr(self.nn_algo, "p") and self.nn_algo.p != _DESIRED_P_VALUE:
+                raise ValueError(
+                    "DisSimLocal only supports squared Euclidean distances. If"
+                    " the provided NNAlgorithm has a `p` parameter it must be"
+                    f" set to p=2. Now it is p={self.nn_algo.p}"
+                )
         elif self.nn_algo.metric in ["sqeuclidean"]:
             self.squared = True
         else:
@@ -152,7 +154,7 @@ def transform(
         # certain scikit-learn routines (e.g. in metric='precomputed' usages).
         # We, therefore, shift dissimilarities to non-negative values, if necessary.
         min_dist = hub_reduced_dist.min()
-        if min_dist < 0.0:
+        if min_dist < _MINIMUM_DIST:
             hub_reduced_dist += -min_dist
 
         # Return Euclidean or squared Euclidean distances?

diff --git a/kiez/hubness_reduction/local_scaling.py b/kiez/hubness_reduction/local_scaling.py
@@ -30,7 +30,7 @@ class LocalScaling(HubnessReduction):
     ----------
     .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012).
            Local and global scaling reduce hubs in space. The Journal of Machine
-           Learning Research, 13(1), 2871–2902.
+           Learning Research, 13(1), 2871-2902.
     """
 
     def __init__(self, method: str = "standard", **kwargs):

diff --git a/kiez/hubness_reduction/mutual_proximity.py b/kiez/hubness_reduction/mutual_proximity.py
@@ -29,7 +29,7 @@ class MutualProximity(HubnessReduction):
     ----------
     .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012).
            Local and global scaling reduce hubs in space. The Journal of Machine
-           Learning Research, 13(1), 2871–2902.
+           Learning Research, 13(1), 2871-2902.
     """
 
     def __init__(self, method: str = "normal", **kwargs):
@@ -39,7 +39,8 @@ def __init__(self, method: str = "normal", **kwargs):
                 f'Mutual proximity method "{method}" not recognized. Try "normal"'
                 ' or "empiric".'
             )
-        elif method in ["exact", "empiric"]:
+
+        if method in ["exact", "empiric"]:
             self.method = "empiric"
         elif method in ["normal", "gaussi"]:
             self.method = "normal"

diff --git a/kiez/io/temp_file_handling.py b/kiez/io/temp_file_handling.py
@@ -36,10 +36,10 @@ def create_tempfile_preferably_in_dir(
     """
     temp_file = mkstemp if persistent else NamedTemporaryFile
     try:
-        handle = temp_file(suffix=suffix, prefix=prefix, dir=directory)  # type: ignore
+        handle = temp_file(suffix=suffix, prefix=prefix, dir=directory)  # type: ignore[operator]
         warn = False
     except FileNotFoundError:
-        handle = temp_file(suffix=suffix, prefix=prefix, dir=None)  # type: ignore
+        handle = temp_file(suffix=suffix, prefix=prefix, dir=None)  # type: ignore[operator]
         warn = True
 
     # Extract the path (as string)

diff --git a/kiez/kiez.py b/kiez/kiez.py
@@ -88,7 +88,8 @@ def __init__(
                 f"n_neighbors does not take {type(n_candidates)} value, enter"
                 " integer value"
             )
-        elif n_candidates <= 0:
+
+        if n_candidates <= 0:
             raise ValueError(f"Expected n_candidates > 0. Got {n_candidates}")
         if algorithm_kwargs is None:
             algorithm_kwargs = {"n_candidates": n_candidates}

diff --git a/kiez/neighbors/approximate/faiss.py b/kiez/neighbors/approximate/faiss.py
@@ -54,8 +54,8 @@ class Faiss(NNAlgorithm):
     For details about configuring faiss consult their wiki: https://github.com/facebookresearch/faiss/wiki
     """
 
-    valid_metrics = ["l2", "euclidean"]
-    valid_spaces = ["l2"]
+    valid_metrics = ("l2", "euclidean")
+    valid_spaces = ("l2",)  # one-element tuple; a bare "l2" string would iterate per character
 
     def __init__(
         self,
@@ -117,5 +117,4 @@ def _kneighbors(self, k, query, index, return_distance, is_self_querying):
             if self.metric == "euclidean":
                 dist = np.sqrt(dist)
             return dist, ind
-        else:
-            return ind
+        return ind
diff --git a/kiez/neighbors/approximate/nmslib.py b/kiez/neighbors/approximate/nmslib.py
@@ -14,6 +14,8 @@
 except ImportError:  # pragma: no cover
     nmslib = None
 
+_VERBOSE_THRESH = 2
+
 
 class NMSLIB(NNAlgorithm):
     """Wrapper for hierarchical navigable small world graphs based approximate nearest neighbor search implementation from NMSLIB.
@@ -44,22 +46,22 @@ class NMSLIB(NNAlgorithm):
     See the nmslib documentation for more details: https://github.com/nmslib/nmslib/blob/master/manual/methods.md
     """
 
-    valid_metrics = [
+    valid_metrics = (
         "euclidean",
         "l2",
         "minkowski",
         "squared_euclidean",
         "sqeuclidean",
         "cosine",
         "cosinesimil",
-    ]
+    )
 
     def __init__(
         self,
         n_candidates: int = 5,
         metric: str = "euclidean",
         method: str = "hnsw",
-        M: int = 16,  # noqa: N803
+        M: int = 16,
         post_processing: int = 2,
         ef_construction: int = 200,
         n_jobs: int = 1,
@@ -97,7 +99,7 @@ def __init__(
         super().__init__(n_candidates=n_candidates, metric=metric, n_jobs=n_jobs)
         self.verbose = verbose
         self.method = method
-        self.M = M  # noqa: N803
+        self.M = M
         self.post_processing = post_processing
         self.ef_construction = ef_construction
 
@@ -128,7 +130,7 @@ def _fit(self, data, is_source: bool):
                 "post": post_processing,
                 "indexThreadQty": self.n_jobs,
             },
-            print_progress=(self.verbose >= 2),
+            print_progress=(self.verbose >= _VERBOSE_THRESH),
         )
         return hnsw_index
 
@@ -158,5 +160,4 @@ def _kneighbors(self, k, query, index, return_distance, is_self_querying):
 
         if return_distance:
             return neigh_dist, neigh_ind
-        else:
-            return neigh_ind
+        return neigh_ind
diff --git a/kiez/neighbors/approximate/nng.py b/kiez/neighbors/approximate/nng.py
@@ -6,6 +6,7 @@
 from __future__ import annotations
 
 import logging
+from types import MappingProxyType
 
 import numpy as np
 from tqdm.auto import tqdm
@@ -62,7 +63,7 @@ class NNG(NNAlgorithmWithJoblib):
     when required.
     """
 
-    valid_metrics = [
+    valid_metrics = (
         "manhattan",
         "L1",
         "euclidean",
@@ -75,13 +76,15 @@ class NNG(NNAlgorithmWithJoblib):
         "Normalized Cosine",
         "Hamming",
         "Jaccard",
-    ]
-    _internal_distance_type = {
-        "manhattan": "L1",
-        "euclidean": "L2",
-        "minkowski": "L2",
-        "sqeuclidean": "L2",
-    }
+    )
+    _internal_distance_type = MappingProxyType(
+        {
+            "manhattan": "L1",
+            "euclidean": "L2",
+            "minkowski": "L2",
+            "sqeuclidean": "L2",
+        }
+    )
 
     def __init__(
         self,
@@ -147,10 +150,7 @@ def _index_dir_plausibility_check(self):
             )
 
     def _fit(self, data, is_source: bool):
-        if is_source:
-            prefix = "kiez_source"
-        else:
-            prefix = "kiez_target"
+        prefix = "kiez_source" if is_source else "kiez_target"
 
         index_path = None
         # Set up a directory to save the index to
@@ -264,5 +264,4 @@ def _kneighbors_part(self, k, query, index, return_distance, is_self_querying):
 
         if return_distance:
             return neigh_dist, neigh_ind
-        else:
-            return neigh_ind
+        return neigh_ind
diff --git a/kiez/neighbors/approximate/random_projection_trees.py b/kiez/neighbors/approximate/random_projection_trees.py
@@ -54,14 +54,14 @@ class Annoy(NNAlgorithmWithJoblib):
     See more details in the annoy documentation: https://github.com/spotify/annoy#full-python-api
     """
 
-    valid_metrics = [
+    valid_metrics = (
         "angular",
         "euclidean",
         "manhattan",
         "hamming",
         "dot",
         "minkowski",
-    ]
+    )
 
     def __init__(
         self,
@@ -239,5 +239,4 @@ def _kneighbors_part(self, k, query, index, return_distance, is_self_querying):
 
         if return_distance:
             return neigh_dist, neigh_ind
-        else:
-            return neigh_ind
+        return neigh_ind
diff --git a/kiez/neighbors/util.py b/kiez/neighbors/util.py
@@ -17,6 +17,6 @@ def available_nn_algorithms() -> List[Type[NNAlgorithm]]:
         try:
             nn_algorithm_resolver.make(ann)
             available.append(nn_algorithm_resolver.lookup(ann))
-        except ImportError:
+        except ImportError:  # noqa: PERF203
             pass
     return available
diff --git a/noxfile.py b/noxfile.py
@@ -109,7 +109,6 @@ def lint(session: Session) -> None:
         "pre-commit",
         "run",
         "--all-files",
-        "--show-diff-on-failure",
         "--hook-stage=manual",
         *session.posargs,
     )
@@ -118,18 +117,15 @@ def lint(session: Session) -> None:
 @session()
 def style_checking(session: Session) -> None:
     args = session.posargs or locations
-    session.install(
-        "pyproject-flake8",
-        "flake8-eradicate",
-        "flake8-isort",
-        "flake8-debugger",
-        "flake8-comprehensions",
-        "flake8-print",
-        "flake8-black",
-        "flake8-bugbear",
-        "pydocstyle",
-    )
-    session.run("pflake8", *args)
+    session.install("ruff")
+    session.run("ruff", "check", *args)
+
+
+@session()
+def pedantic_checking(session: Session) -> None:  # ruff check with extra rule selectors on top of the configured ones
+    args = session.posargs or locations
+    session.install("ruff")
+    session.run("ruff", "check", "--extend-select=ARG,TID,PLR0913,PLR0912", *args)  # no inner quotes: the argument reaches ruff verbatim
 
 
 @session()

diff --git a/pyproject.toml b/pyproject.toml
@@ -68,8 +68,8 @@ indent-width = 4
 target-version = "py38"
 
 [tool.ruff.lint]
-ignore = ["E111", "E114", "E117", "E501", "D1", "D203", "D213", "D206", "D300", "Q000", "Q001", "Q002", "Q003", "COM812", "COM819", "ISC001","ISC002", "B905", "W191"]
-select = ["B", "C", "E", "F", "W", "B", "I", "D", "UP", "A", "C4", "T10", "ICN", "PIE", "PYI", "PT"]
+ignore = ["E111", "E114", "E117", "E501", "D1", "D203", "D213", "D206", "D300", "Q000", "Q001", "Q002", "Q003", "COM812", "COM819", "ISC001","ISC002", "B905", "W191", "PLR0913", "PLR0912"]
+select = ["B", "C", "E", "F", "W", "B", "I", "D", "UP", "A", "C4", "T10", "ICN", "PIE", "PYI", "PT", "RET", "SIM", "ERA", "PD", "PGH", "PL", "NPY", "PERF", "RUF"]
 
 [tool.ruff.lint.mccabe]
 max-complexity = 18