# Origin: git patch 9ef79da4d8fc4ab3d9eb408a43c9129105691179
# Author: Nils Vollroth, Fri, 5 May 2023 18:23:17 +0200
# Subject: feat: extraction of boundary types from various docstrings (#111)
# Closes #48, #36, #35, #32, #31, #30, #27, #8.
# Summary: spaCy rules were generated to recognize named examples and extract the resulting boundaries.
# Manual testing: run `pytest` for `test_extract_boundary_values.py` and check the results.
# Co-authored-by: megalinter-bot, Lars Reimann
#
# File: src/library_analyzer/processing/api/_extract_boundary_values.py (new file)
from dataclasses import dataclass, field
from typing import Any, TypeAlias

import spacy
from numpy import inf
from spacy.matcher import Matcher
from spacy.tokens import Doc, Span

from .model import BoundaryType

_Numeric: TypeAlias = int | float


@dataclass
class BoundaryList:
    """Collect the BoundaryType objects created for matched rules."""

    # Set of boundaries created so far; duplicates collapse because a set is used.
    _boundaries: set[BoundaryType] = field(default_factory=set[BoundaryType])

    def add_boundary(self, match_label: str, type_: str, match_string: Span | None = None) -> None:
        """Add a boundary according to the matched rule.

        Labels that are not listed below (for example "BOUNDARY_TYPE" or an empty string) add nothing.

        Parameters
        ----------
        match_label
            Label of the matched rule
        type_
            Base type of the boundary to be created
        match_string
            Span containing the string matched by the corresponding rule.
            This parameter is not required for every rule: the four sign rules
            (positive, negative, non-positive, non-negative) ignore it.

        """
        match match_label:
            # Sign rules: the value range is fixed by the label alone.
            case "BOUNDARY_NON_POSITIVE":
                self._boundaries.add(_create_non_positive_boundary(type_))
            case "BOUNDARY_POSITIVE":
                self._boundaries.add(_create_positive_boundary(type_))
            case "BOUNDARY_NON_NEGATIVE":
                self._boundaries.add(_create_non_negative_boundary(type_))
            case "BOUNDARY_NEGATIVE":
                self._boundaries.add(_create_negative_boundary(type_))
            # Remaining rules: the extrema are read from the matched span.
            case "BOUNDARY_BETWEEN":
                self._boundaries.add(_create_between_boundary(match_string, type_))
            case "BOUNDARY_INTERVAL":
                self._boundaries.add(_create_interval_boundary(match_string, type_))
            case "BOUNDARY_AT_LEAST":
                self._boundaries.add(_create_at_least_boundary(match_string, type_))
            case "BOUNDARY_INTERVAL_RELATIONAL":
                self._boundaries.add(_create_interval_relational_boundary(match_string, type_))
            case "BOUNDARY_TYPE_REL_VAL":
                self._boundaries.add(_create_type_rel_val_boundary(match_string, type_))
            case "BOUNDARY_INTERVAL_IN_BRACKETS":
                self._boundaries.add(_create_interval_in_brackets_boundary(match_string, type_))

    def get_boundaries(self) -> set[BoundaryType]:
        """Return the set of boundaries collected so far (the internal set itself, not a copy)."""
        return self._boundaries


# Maps a base type name to the callable used to convert matched text into a number.
type_funcs = {"float": float, "int": int}

# The spaCy pipeline and the rule matcher are created once, at import time.
_nlp = spacy.load("en_core_web_sm")
_matcher = Matcher(_nlp.vocab)

# Two-token operator: "<" or ">" immediately followed by "=".
_geq_leq_op = [{"ORTH": {"IN": ["<", ">"]}}, {"ORTH": "="}]

# "at least <number>"
_boundary_at_least = [{"LOWER": "at"}, {"LOWER": "least"}, {"LIKE_NUM": True}]

# "min . <number>"
_boundary_min = [{"LOWER": "min"}, {"ORTH": "."}, {"LIKE_NUM": True}]

# "in|within [the] [range|interval] [of] (|[ <token> , <token> )|]"
# An empty dict matches any single token.
_boundary_interval = [
    {"LOWER": {"IN": ["in", "within"]}},
    {"LOWER": "the", "OP": "?"},
    {"LOWER": {"IN": ["range", "interval"]}, "OP": "?"},
    {"LOWER": "of", "OP": "?"},
    {"ORTH": {"IN": ["(", "["]}},
    {},
    {"ORTH": ","},
    {},
    {"ORTH": {"IN": [")", "]"]}},
]


# "<something close to 'value'> is|in (|[ <token> , <token> )|]"
_boundary_value_in = [
    {"LOWER": {"FUZZY": "value"}},
    {"LOWER": {"IN": ["is", "in"]}},
    {"ORTH": {"IN": ["(", "["]}},
    {},
    {"ORTH": ","},
    {},
    {"ORTH": {"IN": [")", "]"]}},
]


# "non|not [-|_] negative"
_boundary_non_negative = [
    {"LOWER": {"IN": ["non", "not"]}},
    {"ORTH": {"IN": ["-", "_"]}, "OP": "?"},
    {"LOWER": "negative"},
]

# "[strictly] positive"
_boundary_positive = [{"LOWER": "strictly", "OP": "?"}, {"LOWER": "positive"}]

# "non|not [-|_] positive"
_boundary_non_positive = [
    {"LOWER": {"IN": ["non", "not"]}},
    {"ORTH": {"IN": ["-", "_"]}, "OP": "?"},
    {"LOWER": "positive"},
]

# "[strictly] negative"
_boundary_negative = [{"LOWER": "strictly", "OP": "?"}, {"LOWER": "negative"}]

# "between <number> and <number>"
_boundary_between = [{"LOWER": "between"}, {"LIKE_NUM": True}, {"LOWER": "and"}, {"LIKE_NUM": True}]


# Chained comparisons "<number> op <token> op <number>", one pattern per operator combination.
_boundary_gtlt_gtlt = [
    {"LIKE_NUM": True},
    {"ORTH": {"IN": ["<", ">"]}},
    {},
    {"ORTH": {"IN": ["<", ">"]}},
    {"LIKE_NUM": True},
]


_boundary_geqleq_geqleq = [{"LIKE_NUM": True}, *_geq_leq_op, {}, *_geq_leq_op, {"LIKE_NUM": True}]

_boundary_gtlt_geqleq = [{"LIKE_NUM": True}, {"ORTH": {"IN": ["<", ">"]}}, {}, *_geq_leq_op, {"LIKE_NUM": True}]

_boundary_geqleq_gtlt = [{"LIKE_NUM": True}, *_geq_leq_op, {}, {"ORTH": {"IN": ["<", ">"]}}, {"LIKE_NUM": True}]

# Joined comparisons "op <number> and|or op <number>", one pattern per operator combination.
_boundary_and_gtlt_gtlt = [
    {"ORTH": {"IN": ["<", ">"]}},
    {"LIKE_NUM": True},
    {"ORTH": {"IN": ["and", "or"]}},
    {"ORTH": {"IN": ["<", ">"]}},
    {"LIKE_NUM": True},
]

_boundary_and_geqleq_geqleq = [
    *_geq_leq_op,
    {"LIKE_NUM": True},
    {"ORTH": {"IN": ["and", "or"]}},
    *_geq_leq_op,
    {"LIKE_NUM": True},
]

_boundary_and_gtlt_geqleq = [
    {"ORTH": {"IN": ["<", ">"]}},
    {"LIKE_NUM": True},
    {"ORTH": {"IN": ["and", "or"]}},
    *_geq_leq_op,
    {"LIKE_NUM": True},
]

_boundary_and_geqleq_gtlt = [
    *_geq_leq_op,
    {"LIKE_NUM": True},
    {"ORTH": {"IN": ["and", "or"]}},
    {"ORTH": {"IN": ["<", ">"]}},
    {"LIKE_NUM": True},
]

# A bare numeric base type name: "float" or "int" (case-insensitive).
_boundary_type = [{"LOWER": {"IN": ["float", "int"]}}]

# "<type> <|> <number>"
_boundary_type_gtlt_val = [*_boundary_type, {"ORTH": {"IN": ["<", ">"]}}, {"LIKE_NUM": True}]

# "<type> <=|>= <number>"
_boundary_type_geqleq_val = [*_boundary_type, *_geq_leq_op, {"LIKE_NUM": True}]

# "<type> ( (|[ <token> , <token> )|] )", e.g. "float ([0, 1])"
_boundary_interval_in_brackets = [
    *_boundary_type,
    {"ORTH": "("},
    {"ORTH": {"IN": ["(", "["]}},
    {},
    {"ORTH": ","},
    {},
    {"ORTH": {"IN": [")", "]"]}},
    {"ORTH": ")"},
]


def _check_negative_pattern(
    matcher: Matcher,  # noqa: ARG001
    doc: Doc,  # noqa: ARG001
    i: int,
    matches: list[tuple[Any, ...]],
) -> Any | None:
    """on-match function for the spaCy Matcher, registered for BOUNDARY_NEGATIVE.

    Remove the current match (index ``i``) from ``matches`` if the entry directly before it is labelled
    BOUNDARY_NON_NEGATIVE, so that "non-negative" is not additionally reported as "negative".

    Parameters
    ----------
    matcher
        Parameter is ignored.
    doc
        Parameter is ignored.
    i
        Index of the match that was recognized by the rule.
    matches
        List of matches found by the matcher. The list is modified in place.

    Returns
    -------
    None
        Always.
    """
    # NOTE(review): for i == 0 the index i - 1 is -1, i.e. the *last* entry of the list is inspected.
    previous_id, _, _ = matches[i - 1]
    if _nlp.vocab.strings[previous_id] == "BOUNDARY_NON_NEGATIVE":
        matches.remove(matches[i])

    return None


def _check_positive_pattern(
    matcher: Matcher,  # noqa: ARG001
    doc: Doc,  # noqa: ARG001
    i: int,
    matches: list[tuple[Any, ...]],
) -> Any | None:
    """on-match function for the spaCy Matcher, registered for BOUNDARY_POSITIVE.

    Remove the current match (index ``i``) from ``matches`` if the entry directly before it is labelled
    BOUNDARY_NON_POSITIVE, so that "non-positive" is not additionally reported as "positive".

    Parameters
    ----------
    matcher
        Parameter is ignored.
    doc
        Parameter is ignored.
    i
        Index of the match that was recognized by the rule.
    matches
        List of matches found by the matcher. The list is modified in place.

    Returns
    -------
    None
        Always.
    """
    # NOTE(review): for i == 0 the index i - 1 is -1, i.e. the *last* entry of the list is inspected.
    previous_id, _, _ = matches[i - 1]
    if _nlp.vocab.strings[previous_id] == "BOUNDARY_NON_POSITIVE":
        matches.remove(matches[i])

    return None


def _check_interval_relational_pattern(
    matcher: Matcher,  # noqa: ARG001
    doc: Doc,  # noqa: ARG001
    i: int,
    matches: list[tuple[Any, ...]],
) -> Any | None:
    """on-match function for the spaCy Matcher, registered for BOUNDARY_INTERVAL_RELATIONAL.

    Remove the *previous* entry (index ``i - 1``) from ``matches`` if it is labelled BOUNDARY_TYPE_REL_VAL.
    The current BOUNDARY_INTERVAL_RELATIONAL match is kept.

    Parameters
    ----------
    matcher
        Parameter is ignored.
    doc
        Parameter is ignored.
    i
        Index of the match that was recognized by the rule.
    matches
        List of matches found by the matcher. The list is modified in place.

    Returns
    -------
    None
        Always.
    """
    # NOTE(review): for i == 0 the index i - 1 is -1, i.e. the *last* entry of the list is inspected.
    previous_id, _, _ = matches[i - 1]
    if _nlp.vocab.strings[previous_id] == "BOUNDARY_TYPE_REL_VAL":
        matches.remove(matches[i - 1])

    return None


def _check_interval(
    matcher: Matcher,  # noqa: ARG001
    doc: Doc,  # noqa: ARG001
    i: int,
    matches: list[tuple[Any, ...]],
) -> Any | None:
    """on-match function for the spaCy Matcher, registered for BOUNDARY_INTERVAL.

    Remove the *previous* entry (index ``i - 1``) from ``matches`` if it is also labelled BOUNDARY_INTERVAL
    and the list holds more than one match. The current match is kept.

    Parameters
    ----------
    matcher
        Parameter is ignored.
    doc
        Parameter is ignored.
    i
        Index of the match that was recognized by the rule.
    matches
        List of matches found by the matcher. The list is modified in place.

    Returns
    -------
    None
        Always.
    """
    # The length check keeps a single match from being compared with (and removed as) itself:
    # with one entry and i == 0, matches[i - 1] is that same entry.
    previous_id, _, _ = matches[i - 1]
    if _nlp.vocab.strings[previous_id] == "BOUNDARY_INTERVAL" and (len(matches) > 1):
        matches.remove(matches[i - 1])

    return None


# All token patterns that are registered under the BOUNDARY_INTERVAL_RELATIONAL label.
relational_patterns = [
    _boundary_gtlt_gtlt,
    _boundary_geqleq_geqleq,
    _boundary_geqleq_gtlt,
    _boundary_gtlt_geqleq,
    _boundary_and_gtlt_gtlt,
    _boundary_and_geqleq_geqleq,
    _boundary_and_geqleq_gtlt,
    _boundary_and_gtlt_geqleq,
]

# Rule registration. The labels are the strings that BoundaryList.add_boundary dispatches on;
# rules with an on_match callback prune overlapping matches from the result list.
_matcher.add("BOUNDARY_AT_LEAST", [_boundary_at_least, _boundary_min])
_matcher.add("BOUNDARY_INTERVAL", [_boundary_interval, _boundary_value_in], on_match=_check_interval)
_matcher.add("BOUNDARY_POSITIVE", [_boundary_positive], on_match=_check_positive_pattern)
_matcher.add("BOUNDARY_NON_NEGATIVE", [_boundary_non_negative])
_matcher.add("BOUNDARY_NEGATIVE", [_boundary_negative], on_match=_check_negative_pattern)
_matcher.add("BOUNDARY_NON_POSITIVE", [_boundary_non_positive])
_matcher.add("BOUNDARY_BETWEEN", [_boundary_between])
_matcher.add("BOUNDARY_INTERVAL_RELATIONAL", relational_patterns, on_match=_check_interval_relational_pattern)
_matcher.add("BOUNDARY_TYPE", [_boundary_type])
_matcher.add("BOUNDARY_TYPE_REL_VAL", [_boundary_type_gtlt_val, _boundary_type_geqleq_val])
_matcher.add("BOUNDARY_INTERVAL_IN_BRACKETS", [_boundary_interval_in_brackets])


def _get_type_value(type_: str, value: _Numeric | str) -> _Numeric:
    """Transform the passed value to the value matching type_.

    Parameters
    ----------
    type_
        Type to be transformed to. Must be a key of ``type_funcs``.
    value
        Value to be transformed.

    Returns
    -------
    Numeric
        Transformed value.

    Raises
    ------
    KeyError
        If ``type_`` is not a key of ``type_funcs``.
    ValueError
        If ``value`` cannot be converted by the selected type function.
    """
    return type_funcs[type_](value)


def _create_non_positive_boundary(type_: str) -> BoundaryType:
    """Create a BoundaryType with predefined extrema.

    Create a BoundaryType that describes the non-positive value range of the given type:
    from negative infinity (exclusive) up to and including zero.

    Parameters
    ----------
    type_
        Base type of Boundary

    Returns
    -------
    BoundaryType

    """
    return BoundaryType(
        type_,
        min=BoundaryType.NEGATIVE_INFINITY,
        max=_get_type_value(type_, 0),
        min_inclusive=False,
        max_inclusive=True,
    )


def _create_positive_boundary(type_: str) -> BoundaryType:
    """Create a BoundaryType with predefined extrema.

    Create a BoundaryType that describes the positive value range of the given type:
    from zero (exclusive) to infinity (exclusive).

    Parameters
    ----------
    type_
        Base type of Boundary

    Returns
    -------
    BoundaryType

    """
    return BoundaryType(
        type_,
        min=_get_type_value(type_, 0),
        max=BoundaryType.INFINITY,
        min_inclusive=False,
        max_inclusive=False,
    )


def _create_non_negative_boundary(type_: str) -> BoundaryType:
    """Create a BoundaryType with predefined extrema.

    Create a BoundaryType that describes the non-negative value range of the given type:
    from zero (inclusive) to infinity (exclusive).

    Parameters
    ----------
    type_
        Base type of Boundary

    Returns
    -------
    BoundaryType

    """
    return BoundaryType(
        type_,
        min=_get_type_value(type_, 0),
        max=BoundaryType.INFINITY,
        min_inclusive=True,
        max_inclusive=False,
    )


def _create_negative_boundary(type_: str) -> BoundaryType:
    """Create a BoundaryType with predefined extrema.

    Create a BoundaryType that describes the negative value range of the given type:
    from negative infinity (exclusive) up to zero (exclusive).

    Parameters
    ----------
    type_
        Base type of Boundary

    Returns
    -------
    BoundaryType

    """
    return BoundaryType(
        type_,
        min=BoundaryType.NEGATIVE_INFINITY,
        max=_get_type_value(type_, 0),
        min_inclusive=False,
        max_inclusive=False,
    )


def _create_between_boundary(match_string: Span, type_: str) -> BoundaryType:
    """Create a BoundaryType with individual extrema.

    Create a BoundaryType whose extrema are the smallest and the largest number-like token of the passed
    match string. Both extrema are inclusive.

    Parameters
    ----------
    match_string
        Match string containing the extrema of the value range.
    type_
        Base type of Boundary

    Returns
    -------
    BoundaryType

    """
    values = [_get_type_value(type_, token.text) for token in match_string if token.like_num]
    return BoundaryType(type_, min=min(values), max=max(values), min_inclusive=True, max_inclusive=True)


def _create_at_least_boundary(match_string: Span, type_: str) -> BoundaryType:
    """Create a BoundaryType with individual minimum.

    Create a BoundaryType whose (inclusive) minimum is extracted from the passed match string.
    The maximum is infinity.

    Parameters
    ----------
    match_string
        Match string containing the minimum of the value range.
    type_
        Base type of Boundary

    Returns
    -------
    BoundaryType

    """
    # Falls back to 0 when the span holds no number-like token; otherwise the last such token wins.
    value: _Numeric = 0
    for token in match_string:
        if token.like_num:
            value = _get_type_value(type_, token.text)
    return BoundaryType(type_, min=value, max=BoundaryType.INFINITY, min_inclusive=True, max_inclusive=False)


def _create_interval_boundary(match_string: Span, type_: str) -> BoundaryType:
    """Create a BoundaryType with individual extrema.

    Create a BoundaryType whose extrema are extracted from the passed match string. The bracket tokens
    decide whether an extremum is inclusive ("[" / "]") or exclusive ("(" / ")"). Infinite extrema are
    always exclusive.

    Parameters
    ----------
    match_string
        Match string containing the extrema of the value range.
    type_
        Base type of Boundary

    Returns
    -------
    BoundaryType

    Raises
    ------
    ValueError
        If the span contains neither a number-like token nor an infinity token, or if a number-like token
        cannot be converted to ``type_``.
    """
    values = []
    brackets = []
    for token in match_string:
        if token.text in ["(", "[", ")", "]"]:
            brackets.append(token.text)
        if token.like_num:
            values.append(_get_type_value(type_, token.text))

        # "infinity" was missing from this list (only the misspelling "infinty" was present);
        # the misspelling is kept so that previously accepted input is still accepted.
        if token.text in ["inf", "infty", "infinty", "infinity"]:
            values.append(inf)
        # NOTE(review): these literals contain a space and are compared with a single token's text,
        # so this branch presumably never fires - confirm against the tokenizer.
        elif token.text in ["negative inf", "negative infty", "negative infinity"]:
            values.append(-inf)

    type_func = type_funcs[type_]
    if -inf in values:
        minimum = BoundaryType.NEGATIVE_INFINITY
        min_incl = False
    else:
        minimum = type_func(min(values))
        min_incl = brackets[0] == "["  # first collected bracket is the opening one

    if inf in values:
        maximum = BoundaryType.INFINITY
        max_incl = False
    else:
        maximum = type_func(max(values))
        max_incl = brackets[1] == "]"  # second collected bracket is the closing one

    return BoundaryType(type_, min=minimum, max=maximum, min_inclusive=min_incl, max_inclusive=max_incl)


def _create_interval_relational_boundary(match_string: Span, type_: str) -> BoundaryType:
    """Create a BoundaryType with individual extrema.

    Create a BoundaryType whose extrema are extracted from a relational expression such as
    ``0 <= x < 1`` or ``> 0 and <= 1``.

    Parameters
    ----------
    match_string
        Match string containing the extrema of the value range.
    type_
        Base type of Boundary

    Returns
    -------
    BoundaryType

    Raises
    ------
    ValueError
        If a number-like token cannot be converted to ``type_``.
    """
    type_func = type_funcs[type_]
    relational_ops: list[str] = []
    values: list[_Numeric] = []
    and_or_found = False

    for token in match_string:
        if token.text in ["<", ">"]:
            relational_ops.append(token.text)
        elif token.text == "=":
            # "<=" / ">=" arrive as two tokens; glue "=" onto the operator collected just before.
            relational_ops[-1] += token.text
        elif token.like_num:
            # Convert before comparing. Comparing the raw token texts would order them
            # lexicographically ("10" < "2") and swap minimum and maximum.
            values.append(type_func(token.text))
        elif token.text in ["and", "or"]:
            and_or_found = True

    minimum = min(values)
    maximum = max(values)

    if not and_or_found:
        # Chained form "a op x op b": which operator belongs to the minimum depends on the direction.
        min_incl = (relational_ops[0] == "<=") or (relational_ops[1] == ">=")
        max_incl = (relational_ops[1] == "<=") or (relational_ops[0] == ">=")
    else:
        # Joined form "op a and|or op b": ">=" can only bound the minimum, "<=" only the maximum.
        min_incl = ">=" in relational_ops
        max_incl = "<=" in relational_ops

    return BoundaryType(type_, min=minimum, max=maximum, min_inclusive=min_incl, max_inclusive=max_incl)


def _create_type_rel_val_boundary(match_string: Span, type_: str) -> BoundaryType:
    """Create a BoundaryType with individual minimum or maximum.

    Create a BoundaryType whose minimum or maximum is extracted from the passed match string, which has
    the form ``<type> <operator> <value>``. The opposite extremum is set to (negative) infinity.

    Parameters
    ----------
    match_string
        Match string containing the extrema of the value range.
    type_
        Base type of Boundary

    Returns
    -------
    BoundaryType

    """
    val: _Numeric = 0
    min_: _Numeric | str = 0
    max_: _Numeric | str = 0

    rel_op = ""
    type_func = type_funcs[type_]
    min_incl = False
    max_incl = False

    for token in match_string:
        if token.like_num:
            val = type_func(token.text)
        if token.text in [">", "<", "="]:
            # Operator tokens are concatenated, so "<" followed by "=" becomes "<=".
            rel_op += token.text

    # type (< | <=) val
    if rel_op in ["<", "<="]:
        min_ = BoundaryType.NEGATIVE_INFINITY
        max_ = val
        if rel_op == "<=":
            max_incl = True

    # type (> | >=) val
    elif rel_op in [">", ">="]:
        min_ = val
        max_ = BoundaryType.INFINITY
        if rel_op == ">=":
            min_incl = True

    return BoundaryType(type_, min=min_, max=max_, min_inclusive=min_incl, max_inclusive=max_incl)


def _create_interval_in_brackets_boundary(match_string: Span, type_: str) -> BoundaryType:
    """Create a BoundaryType from an interval that is wrapped in parentheses after a type name.

    The first two tokens and the last token of the span are cut off and the remainder is handed to
    ``_create_interval_boundary``.

    Parameters
    ----------
    match_string
        Match string of the form ``<type> ( <interval> )``.
    type_
        Base type of Boundary

    Returns
    -------
    BoundaryType

    """
    span_ = match_string[2:-1]

    return _create_interval_boundary(span_, type_)


def _analyze_matches(matches: list[tuple[str, Span]], boundaries: BoundaryList) -> None:
    """Analyze the passed match list for boundaries to be created.

    Every BOUNDARY_TYPE match and every other match receives a running id. A type match and a
    restriction match that end up with the same id are treated as belonging together and are turned
    into a boundary.

    Parameters
    ----------
    matches
        Matches found by spaCy Matcher, as (label, span) pairs.

    boundaries
        BoundaryList object that creates and contains the matching boundary objects.

    """
    type_id = 0
    other_id = 0
    processed_matches = []
    found_type = False

    # Assignment of the found boundaries to the corresponding data type
    for match_label, match_string in matches:
        if match_label == "BOUNDARY_TYPE":
            # Two type matches in a row: the earlier one has no restriction, so skip its slot.
            if found_type:
                other_id += 1
            processed_matches.append({"id": type_id, "match_label": match_label, "match_string": match_string})
            type_id += 1
            found_type = True

        else:
            processed_matches.append({"id": other_id, "match_label": match_label, "match_string": match_string})
            other_id += 1
            if found_type:
                found_type = False

    # Creation of the matching BoundaryTypes
    for i in range(max(type_id, other_id)):
        same_id = [match for match in processed_matches if match["id"] == i]
        if len(same_id) == 2:
            type_ = ""
            match_string = ""
            match_label = ""

            for match in same_id:
                if match["match_label"] == "BOUNDARY_TYPE":
                    type_ = match["match_string"].text
                else:
                    match_label = match["match_label"]
                    match_string = match["match_string"]

            boundaries.add_boundary(match_label, type_, match_string)


def extract_boundary(description: str, type_string: str) -> set[BoundaryType]:
    """Extract valid BoundaryTypes.

    Extract valid BoundaryTypes described by predefined rules. Nothing is extracted if the type string
    does not mention a numeric base type.

    Parameters
    ----------
    description
        Description string of the parameter to be examined.

    type_string
        Type string of the parameter to be examined.

    Returns
    -------
    set[BoundaryType]
        A set containing valid BoundaryTypes. The set is empty if no rule applies.
    """
    boundaries = BoundaryList()

    type_doc = _nlp(type_string)
    type_matches = _matcher(type_doc)
    type_matches = [(_nlp.vocab.strings[match_id], type_doc[start:end]) for match_id, start, end in type_matches]

    description_doc = _nlp(description)
    desc_matches = _matcher(description_doc)
    desc_matches = [(_nlp.vocab.strings[match_id], description_doc[start:end]) for match_id, start, end in desc_matches]

    if type_matches:
        type_list = []  # Possible numeric data types that may be used with the parameter to be examined.
        restriction_list = []  # Restrictions of the type such as non-negative
        match_label = ""

        for match in type_matches:
            if match[0] == "BOUNDARY_TYPE":
                type_list.append(match[1].text)
            else:
                restriction_list.append(match)

        type_length = len(type_list)

        # If the length of the found types is 1, the boundary type is described only in the type string
        # and the value range only in the description string.

        if type_length == 1:
            type_text = type_list[0]
            match_string: Span | None = None

            if len(restriction_list) == 1:
                match_label = restriction_list[0][0]
                match_string = restriction_list[0][1]

            # Checking the description for boundaries if no restriction was found in the type string
            elif desc_matches:
                match_label, match_string = desc_matches[0]
                if match_label == "BOUNDARY_TYPE":
                    type_text = match_string.text
                    if len(desc_matches) > 1:
                        match_label, match_string = desc_matches[1]
                    else:
                        # The description only names a type and gives no value range. Indexing the
                        # second match would raise IndexError; an empty label adds no boundary.
                        match_label, match_string = "", None

            boundaries.add_boundary(match_label, type_text, match_string)

        elif type_length > 1:
            found_type_rel_val = any(match[0] == "BOUNDARY_TYPE_REL_VAL" for match in type_matches)

            # Several types: pair each type with its restriction, taken from the type string if it
            # already contains a "<type> <operator> <value>" restriction, else from the description.
            if found_type_rel_val:
                _analyze_matches(type_matches, boundaries)
            else:
                _analyze_matches(desc_matches, boundaries)

    return boundaries.get_boundaries()
# File: tests/library_analyzer/processing/api/test_extract_boundary_values.py (new file, same patch)
from typing import TypeAlias

import pytest
from library_analyzer.processing.api._extract_boundary_values import extract_boundary
from library_analyzer.processing.api.model import BoundaryType

_Numeric: TypeAlias = int | float
# Compact notation for an expected boundary: (base type, (min, min_inclusive), (max, max_inclusive)).
BoundaryValueType = tuple[str, tuple[_Numeric | str, bool], tuple[_Numeric | str, bool]]


# @pytest.mark.skip(reason="Currently not testing this")
@pytest.mark.parametrize(
    ("type_string", "description", "expected_boundary"),
    [
        # Interval notation in the description
        (
            "float",
            (
                "Damping factor in the range [0.5, 1.0) is the extent to which the current value is maintained relative"
                " to incoming values (weighted 1 - damping). This in order to avoid numerical oscillations when"
                " updating these values (messages)."
            ),
            [("float", (0.5, True), (1.0, False))],
        ),
        (
            "float",
            (
                "An upper bound on the fraction of training errors and a lower bound of the fraction of support"
                " vectors. Should be in the interval (0, 1]. By default 0.5 will be taken."
            ),
            [("float", (0.0, False), (1.0, True))],
        ),
        # Sign restrictions, in the type string or in the description
        (
            "non-negative float",
            (
                "Complexity parameter used for Minimal Cost-Complexity Pruning. The subtree with the largest cost "
                "complexity that is smaller than ccp_alpha will be chosen. By default, no pruning is performed. See "
                ":ref:minimal_cost_complexity_pruning for details."
            ),
            [("float", (0.0, True), ("Infinity", False))],
        ),
        (
            "{'scale', 'auto'} or float",
            (
                "Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.\n\nif gamma='scale' (default) is passed then it"
                " uses 1 / (n_features * X.var()) as value of gamma,\nif 'auto', uses 1 / n_features\nif float, must be"
                " non-negative.\n\n.. versionchanged: 0.22 The default value of gamma changed from 'auto' to 'scale'."
            ),
            [("float", (0.0, True), ("Infinity", False))],
        ),
        (
            "int",
            "Degree of the polynomial kernel function ('poly'). Must be non-negative. Ignored by all other kernels.",
            [("int", (0, True), ("Infinity", False))],
        ),
        # Infinite upper bound and "at least"
        (
            "int",
            "The verbosity level. The default, zero, means silent mode. Range of values is [0, inf].",
            [("int", (0, True), ("Infinity", False))],
        ),
        (
            "int",
            "The verbosity level. The default, zero, means silent mode. Range of values is at least 3.",
            [("int", (3, True), ("Infinity", False))],
        ),
        # "between ... and ..."
        (
            "float",
            "Momentum for gradient descent update. Should be between 0 and 1. Only used when solver='sgd'.",
            [("float", (0.0, True), (1.0, True))],
        ),
        (
            "float between 0 and 1",
            (
                "Determines the minimum steepness on the reachability plot that constitutes a cluster boundary. For "
                "example, an upwards point in the reachability plot is defined by the ratio from one point to its "
                "successor being at most 1-xi. Used only when cluster_method='xi'."
            ),
            [("float", (0.0, True), (1.0, True))],
        ),
        (
            "float",
            "Momentum for gradient descent update. Should be non-positive. Only used when solver='sgd'.",
            [("float", ("NegativeInfinity", False), (0.0, True))],
        ),
        (
            "float",
            (
                "Regularization parameter. The strength of the regularization is inversely proportional to C. Must be "
                "strictly positive."
            ),
            [("float", (0.0, False), ("Infinity", False))],
        ),
        # Several base types in the type string
        (
            "int or float",
            (
                "If bootstrap is True, the number of samples to draw from X to train each base estimator.\n\nIf None ("
                "default), then draw X.shape[0] samples.\nIf int, then draw max_samples samples.\n If float, "
                "then draw max_samples * X.shape[0] samples. Thus, max_samples should be in the interval (0.0, "
                "1.0].\n\n.. versionadded: 0.22"
            ),
            [("float", (0.0, False), (1.0, True))],
        ),
        (
            "int or float",
            (
                "If bootstrap is True, the number of samples to draw from X to train each base estimator.\n\nIf None ("
                "default), then draw X.shape[0] samples.\nIf int, then max_samples values in [0, 10].\n If float, "
                "then draw max_samples * X.shape[0] samples. Thus, max_samples should be in the interval (0.0, "
                "1.0].\n\n.. versionadded: 0.22"
            ),
            [("int", (0, True), (10, True)), ("float", (0.0, False), (1.0, True))],
        ),
        # Relational expressions
        (
            "float",
            (
                "The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1. l1_ratio=0 corresponds to L2 penalty, "
                "l1_ratio=1 to L1. Only used if penalty is 'elasticnet'."
            ),
            [("float", (0.0, True), (1.0, True))],
        ),
        (
            "float",
            (
                "The Elastic Net mixing parameter, with 0 < l1_ratio <= 1. l1_ratio=0 corresponds to L2 penalty, "
                "l1_ratio=1 to L1. Only used if penalty is 'elasticnet'."
            ),
            [("float", (0.0, False), (1.0, True))],
        ),
        (
            "float",
            (
                "The Elastic Net mixing parameter, with 0 <= l1_ratio < 1. l1_ratio=0 corresponds to L2 penalty, "
                "l1_ratio=1 to L1. Only used if penalty is 'elasticnet'."
            ),
            [("float", (0.0, True), (1.0, False))],
        ),
        (
            "float",
            (
                "The Elastic Net mixing parameter, with 0 < l1_ratio < 1. l1_ratio=0 corresponds to L2 penalty, "
                "l1_ratio=1 to L1. Only used if penalty is 'elasticnet'."
            ),
            [("float", (0.0, False), (1.0, False))],
        ),
        (
            "float",
            (
                "The Elastic Net mixing parameter, with 1 > l1_ratio > 0. l1_ratio=0 corresponds to L2 penalty, "
                "l1_ratio=1 to L1. Only used if penalty is 'elasticnet'."
            ),
            [("float", (0.0, False), (1.0, False))],
        ),
        (
            "float",
            (
                "The Elastic Net mixing parameter, with l1_ratio > 0 and < 1. l1_ratio=0 corresponds to L2 penalty, "
                "l1_ratio=1 to L1. Only used if penalty is 'elasticnet'."
            ),
            [("float", (0.0, False), (1.0, False))],
        ),
        (
            "float",
            (
                "The Elastic Net mixing parameter, with l1_ratio >= 0 and < 1. l1_ratio=0 corresponds to L2 penalty, "
                "l1_ratio=1 to L1. Only used if penalty is 'elasticnet'."
            ),
            [("float", (0.0, True), (1.0, False))],
        ),
        # Restrictions that are given in the type string itself
        (
            "int > 1 or float between 0 and 1",
            (
                "Minimum number of samples in an OPTICS cluster, expressed as an absolute number or a fraction of the "
                "number of samples (rounded to be at least 2). If None, the value of min_samples is used instead. Used "
                "only when cluster_method='xi'."
            ),
            [("int", (1, False), ("Infinity", False)), ("float", (0.0, True), (1.0, True))],
        ),
        ("float ([0, 1])", "abc", [("float", (0.0, True), (1.0, True))]),
        # Inputs for which no boundary is expected
        ("bool", "Whether to allow array.ndim > 2", []),
        (
            'dict, list of dicts, "balanced", or None',
            (
                "Weights associated with classes in the form {class_label: weight}. If not given, all classes are"
                " supposed to have weight one. For multi-output problems, a list of dicts can be provided in the same"
                " order as the columns of y.\n\nNote that for multioutput (including multilabel) weights should be"
                " defined for each class of every column in its own dict. For example, for four-class multilabel"
                " classification weights should be [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of"
                ' [{1:1}, {2:5}, {3:1}, {4:1}].\n\nThe "balanced" mode uses the values of y to automatically adjust'
                " weights inversely proportional to class frequencies in the input data: n_samples / (n_classes *"
                " np.bincount(y)).\n\nFor multi-output, the weights of each column of y will be multiplied."
            ),
            [],
        ),
        (
            "int, RandomState instance or None",
            (
                "Controls the randomness of the estimator. The features are always randomly permuted at each split,"
                ' even if splitter is set to "best". When max_features < n_features, the algorithm will select'
                " max_features at random at each split before finding the best split among them. But the best found"
                " split may vary across different runs, even if max_features=n_features. That is the case, if the"
                " improvement of the criterion is identical for several splits and one split has to be selected at"
                " random. To obtain a deterministic behaviour during fitting, random_state has to be fixed to an"
                " integer. See :term:Glossary for details."
            ),
            [],
        ),
        (
            "{'ovo', 'ovr'}",
            (
                "Whether to return a one-vs-rest ('ovr') decision function of shape (n_samples, n_classes) as all other"
                " classifiers, or the original one-vs-one ('ovo') decision function of libsvm which has shape"
                " (n_samples, n_classes * (n_classes - 1) / 2). However, note that internally, one-vs-one ('ovo') is"
                " always used as a multi-class strategy to train models; an ovr matrix is only constructed from the ovo"
                " matrix. The parameter is ignored for binary classification.\n\n.. versionchanged: 0.19"
                " decision_function_shape is 'ovr' by default.\n\n.. versionadded: 0.17 decision_function_shape='ovr'"
                " is recommended.\n\n.. versionchanged: 0.17 Deprecated decision_function_shape='ovo' and None."
            ),
            [],
        ),
    ],
)
def test_extract_boundaries(type_string: str, description: str, expected_boundary: list[BoundaryValueType]) -> None:
    """Check that extract_boundary returns exactly the expected set of BoundaryTypes.

    Each expected tuple is expanded into a BoundaryType; the comparison is done on sets, so the order of
    the expected boundaries does not matter.
    """
    expected = [
        BoundaryType(base_type=type_, min=min_[0], max=max_[0], min_inclusive=min_[1], max_inclusive=max_[1])
        for type_, min_, max_ in expected_boundary
    ]
    assert extract_boundary(description, type_string) == set(expected)