From 3f23cd4d58ad860826cedb5c9978fa2012a75ae8 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Fri, 1 Jul 2022 16:43:06 +0200 Subject: [PATCH 1/8] Update to latest versions --- .pre-commit-config.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e13aa2fd0..b4ad7509d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,11 +1,11 @@ repos: - repo: https://github.com/psf/black - rev: 19.10b0 + rev: 22.6.0 hooks: - id: black args: [--line-length=100] - repo: https://github.com/pre-commit/mirrors-mypy - rev: v0.761 + rev: v0.961 hooks: - id: mypy name: mypy openml @@ -14,15 +14,15 @@ repos: name: mypy tests files: tests/.* - repo: https://gitlab.com/pycqa/flake8 - rev: 3.8.3 + rev: 4.0.1 hooks: - id: flake8 name: flake8 openml files: openml/.* additional_dependencies: - - flake8-print==3.1.4 + - flake8-print==5.0.0 - id: flake8 name: flake8 tests files: tests/.* additional_dependencies: - - flake8-print==3.1.4 + - flake8-print==5.0.0 From c2e83072283769f530e7c71ccc31bab82a92f167 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Fri, 1 Jul 2022 16:44:22 +0200 Subject: [PATCH 2/8] Updated Black formatting Black was bumped from 19.10b0 to 22.6.0. Changes in the files are reduced to: - No whitespace at the start and end of a docstring. - All comma separated "lists" (for example in function calls) are now one item per line, regardless if they would fit on one line. --- examples/30_extended/custom_flow_.py | 9 +- .../30_extended/fetch_runtimes_tutorial.py | 10 +- .../30_extended/flows_and_runs_tutorial.py | 6 +- examples/30_extended/run_setup_tutorial.py | 12 ++- examples/30_extended/study_tutorial.py | 4 +- .../task_manual_iteration_tutorial.py | 43 ++++++-- openml/_api_calls.py | 55 ++++++++--- openml/base.py | 34 +++---- openml/cli.py | 16 ++- openml/config.py | 31 +++--- openml/datasets/dataset.py | 18 ++-- openml/datasets/functions.py | 28 +++--- openml/evaluations/functions.py | 2 +- openml/exceptions.py | 16 +-- openml/extensions/extension_interface.py | 8 +- openml/extensions/functions.py | 8 +- openml/extensions/sklearn/extension.py | 58 +++++++---- openml/flows/flow.py | 10 +- openml/flows/functions.py | 13 ++- openml/runs/functions.py | 25 +++-- openml/runs/run.py | 22 +++-- openml/runs/trace.py | 19 +++- openml/setups/functions.py | 2 +- openml/study/functions.py | 11 ++- openml/study/study.py | 6 +- openml/tasks/functions.py | 14 ++- openml/tasks/split.py | 10 +- openml/tasks/task.py | 94 ++++++++++-------- openml/testing.py | 4 +- openml/utils.py | 2 +- setup.py | 10 +- tests/conftest.py | 2 +- tests/test_datasets/test_dataset_functions.py | 37 ++++--- tests/test_extensions/test_functions.py | 6 +- .../test_sklearn_extension.py | 42 +++++--- tests/test_flows/test_flow.py | 13 ++- tests/test_flows/test_flow_functions.py | 14 ++- tests/test_openml/test_api_calls.py | 3 +- tests/test_openml/test_config.py | 12 +-- tests/test_openml/test_openml.py | 11 ++- tests/test_runs/test_run.py | 21 +++- tests/test_runs/test_run_functions.py | 98 ++++++++++++++----- tests/test_runs/test_trace.py | 11 ++- tests/test_setups/test_setup_functions.py | 4 +- tests/test_study/test_study_functions.py | 6 +- tests/test_tasks/test_split.py | 12 ++- tests/test_tasks/test_task_functions.py | 25 ++++- tests/test_utils/test_utils.py | 3 +- 48 files changed, 635 insertions(+), 285 deletions(-) diff --git a/examples/30_extended/custom_flow_.py b/examples/30_extended/custom_flow_.py index ae5f37631..513d445ba 100644 --- a/examples/30_extended/custom_flow_.py +++ b/examples/30_extended/custom_flow_.py @@ -85,7 +85,9 @@ # but that does not matter for this demonstration. autosklearn_flow = openml.flows.get_flow(9313) # auto-sklearn 0.5.1 -subflow = dict(components=OrderedDict(automl_tool=autosklearn_flow),) +subflow = dict( + components=OrderedDict(automl_tool=autosklearn_flow), +) #################################################################################################### # With all parameters of the flow defined, we can now initialize the OpenMLFlow and publish. @@ -98,7 +100,10 @@ # the model of the flow to `None`. autosklearn_amlb_flow = openml.flows.OpenMLFlow( - **general, **flow_hyperparameters, **subflow, model=None, + **general, + **flow_hyperparameters, + **subflow, + model=None, ) autosklearn_amlb_flow.publish() print(f"autosklearn flow created: {autosklearn_amlb_flow.flow_id}") diff --git a/examples/30_extended/fetch_runtimes_tutorial.py b/examples/30_extended/fetch_runtimes_tutorial.py index 3d5183613..535f3607d 100644 --- a/examples/30_extended/fetch_runtimes_tutorial.py +++ b/examples/30_extended/fetch_runtimes_tutorial.py @@ -72,7 +72,10 @@ n_repeats, n_folds, n_samples = task.get_split_dimensions() print( "Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format( - task_id, n_repeats, n_folds, n_samples, + task_id, + n_repeats, + n_folds, + n_samples, ) ) @@ -97,7 +100,10 @@ def print_compare_runtimes(measures): clf = RandomForestClassifier(n_estimators=10) run1 = openml.runs.run_model_on_task( - model=clf, task=task, upload_flow=False, avoid_duplicate_runs=False, + model=clf, + task=task, + upload_flow=False, + avoid_duplicate_runs=False, ) measures = run1.fold_evaluations diff --git a/examples/30_extended/flows_and_runs_tutorial.py b/examples/30_extended/flows_and_runs_tutorial.py index 714ce7b55..05b8c8cce 100644 --- a/examples/30_extended/flows_and_runs_tutorial.py +++ b/examples/30_extended/flows_and_runs_tutorial.py @@ -176,7 +176,11 @@ # The following lines can then be executed offline: run = openml.runs.run_model_on_task( - pipe, task, avoid_duplicate_runs=False, upload_flow=False, dataset_format="array", + pipe, + task, + avoid_duplicate_runs=False, + upload_flow=False, + dataset_format="array", ) # The run may be stored offline, and the flow will be stored along with it: diff --git a/examples/30_extended/run_setup_tutorial.py b/examples/30_extended/run_setup_tutorial.py index 1bb123aad..a2bc3a4df 100644 --- a/examples/30_extended/run_setup_tutorial.py +++ b/examples/30_extended/run_setup_tutorial.py @@ -57,10 +57,18 @@ # easy as you want it to be -cat_imp = make_pipeline(OneHotEncoder(handle_unknown="ignore", sparse=False), TruncatedSVD(),) +cat_imp = make_pipeline( + OneHotEncoder(handle_unknown="ignore", sparse=False), + TruncatedSVD(), +) cont_imp = SimpleImputer(strategy="median") ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)]) -model_original = Pipeline(steps=[("transform", ct), ("estimator", RandomForestClassifier()),]) +model_original = Pipeline( + steps=[ + ("transform", ct), + ("estimator", RandomForestClassifier()), + ] +) # Let's change some hyperparameters. Of course, in any good application we # would tune them using, e.g., Random Search or Bayesian Optimization, but for diff --git a/examples/30_extended/study_tutorial.py b/examples/30_extended/study_tutorial.py index b66c49096..d5bfcd88a 100644 --- a/examples/30_extended/study_tutorial.py +++ b/examples/30_extended/study_tutorial.py @@ -51,7 +51,9 @@ # And we can use the evaluation listing functionality to learn more about # the evaluations available for the conducted runs: evaluations = openml.evaluations.list_evaluations( - function="predictive_accuracy", output_format="dataframe", study=study.study_id, + function="predictive_accuracy", + output_format="dataframe", + study=study.study_id, ) print(evaluations.head()) diff --git a/examples/30_extended/task_manual_iteration_tutorial.py b/examples/30_extended/task_manual_iteration_tutorial.py index c30ff66a3..676a742a1 100644 --- a/examples/30_extended/task_manual_iteration_tutorial.py +++ b/examples/30_extended/task_manual_iteration_tutorial.py @@ -44,7 +44,10 @@ print( "Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format( - task_id, n_repeats, n_folds, n_samples, + task_id, + n_repeats, + n_folds, + n_samples, ) ) @@ -53,7 +56,11 @@ # samples (indexing is zero-based). Usually, one would loop over all repeats, folds and sample # sizes, but we can neglect this here as there is only a single repetition. -train_indices, test_indices = task.get_train_test_split_indices(repeat=0, fold=0, sample=0,) +train_indices, test_indices = task.get_train_test_split_indices( + repeat=0, + fold=0, + sample=0, +) print(train_indices.shape, train_indices.dtype) print(test_indices.shape, test_indices.dtype) @@ -69,7 +76,10 @@ print( "X_train.shape: {}, y_train.shape: {}, X_test.shape: {}, y_test.shape: {}".format( - X_train.shape, y_train.shape, X_test.shape, y_test.shape, + X_train.shape, + y_train.shape, + X_test.shape, + y_test.shape, ) ) @@ -82,7 +92,10 @@ n_repeats, n_folds, n_samples = task.get_split_dimensions() print( "Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format( - task_id, n_repeats, n_folds, n_samples, + task_id, + n_repeats, + n_folds, + n_samples, ) ) @@ -92,7 +105,9 @@ for fold_idx in range(n_folds): for sample_idx in range(n_samples): train_indices, test_indices = task.get_train_test_split_indices( - repeat=repeat_idx, fold=fold_idx, sample=sample_idx, + repeat=repeat_idx, + fold=fold_idx, + sample=sample_idx, ) X_train = X.iloc[train_indices] y_train = y.iloc[train_indices] @@ -121,7 +136,10 @@ n_repeats, n_folds, n_samples = task.get_split_dimensions() print( "Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format( - task_id, n_repeats, n_folds, n_samples, + task_id, + n_repeats, + n_folds, + n_samples, ) ) @@ -131,7 +149,9 @@ for fold_idx in range(n_folds): for sample_idx in range(n_samples): train_indices, test_indices = task.get_train_test_split_indices( - repeat=repeat_idx, fold=fold_idx, sample=sample_idx, + repeat=repeat_idx, + fold=fold_idx, + sample=sample_idx, ) X_train = X.iloc[train_indices] y_train = y.iloc[train_indices] @@ -160,7 +180,10 @@ n_repeats, n_folds, n_samples = task.get_split_dimensions() print( "Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format( - task_id, n_repeats, n_folds, n_samples, + task_id, + n_repeats, + n_folds, + n_samples, ) ) @@ -170,7 +193,9 @@ for fold_idx in range(n_folds): for sample_idx in range(n_samples): train_indices, test_indices = task.get_train_test_split_indices( - repeat=repeat_idx, fold=fold_idx, sample=sample_idx, + repeat=repeat_idx, + fold=fold_idx, + sample=sample_idx, ) X_train = X.iloc[train_indices] y_train = y.iloc[train_indices] diff --git a/openml/_api_calls.py b/openml/_api_calls.py index 12b283738..959cad51a 100644 --- a/openml/_api_calls.py +++ b/openml/_api_calls.py @@ -69,15 +69,20 @@ def _perform_api_call(call, request_method, data=None, file_elements=None): __check_response(response, url, file_elements) logging.info( - "%.7fs taken for [%s] request for the URL %s", time.time() - start, request_method, url, + "%.7fs taken for [%s] request for the URL %s", + time.time() - start, + request_method, + url, ) return response.text def _download_minio_file( - source: str, destination: Union[str, pathlib.Path], exists_ok: bool = True, + source: str, + destination: Union[str, pathlib.Path], + exists_ok: bool = True, ) -> None: - """ Download file ``source`` from a MinIO Bucket and store it at ``destination``. + """Download file ``source`` from a MinIO Bucket and store it at ``destination``. Parameters ---------- @@ -103,7 +108,9 @@ def _download_minio_file( try: client.fget_object( - bucket_name=bucket, object_name=object_name, file_path=str(destination), + bucket_name=bucket, + object_name=object_name, + file_path=str(destination), ) except minio.error.S3Error as e: if e.message.startswith("Object does not exist"): @@ -120,7 +127,7 @@ def _download_text_file( exists_ok: bool = True, encoding: str = "utf8", ) -> Optional[str]: - """ Download the text file at `source` and store it in `output_path`. + """Download the text file at `source` and store it in `output_path`. By default, do nothing if a file already exists in `output_path`. The downloaded file can be checked against an expected md5 checksum. @@ -156,7 +163,10 @@ def _download_text_file( if output_path is None: logging.info( - "%.7fs taken for [%s] request for the URL %s", time.time() - start, "get", source, + "%.7fs taken for [%s] request for the URL %s", + time.time() - start, + "get", + source, ) return downloaded_file @@ -165,7 +175,10 @@ def _download_text_file( fh.write(downloaded_file) logging.info( - "%.7fs taken for [%s] request for the URL %s", time.time() - start, "get", source, + "%.7fs taken for [%s] request for the URL %s", + time.time() - start, + "get", + source, ) del downloaded_file @@ -174,8 +187,8 @@ def _download_text_file( def _file_id_to_url(file_id, filename=None): """ - Presents the URL how to download a given file id - filename is optional + Presents the URL how to download a given file id + filename is optional """ openml_url = config.server.split("/api/") url = openml_url[0] + "/data/download/%s" % file_id @@ -194,7 +207,12 @@ def _read_url_files(url, data=None, file_elements=None): file_elements = {} # Using requests.post sets header 'Accept-encoding' automatically to # 'gzip,deflate' - response = _send_request(request_method="post", url=url, data=data, files=file_elements,) + response = _send_request( + request_method="post", + url=url, + data=data, + files=file_elements, + ) return response @@ -258,7 +276,9 @@ def _send_request(request_method, url, data, files=None, md5_checksum=None): raise OpenMLServerError( "Unexpected server error when calling {}. Please contact the " "developers!\nStatus code: {}\n{}".format( - url, response.status_code, response.text, + url, + response.status_code, + response.text, ) ) if retry_counter >= n_retries: @@ -290,7 +310,9 @@ def __check_response(response, url, file_elements): def __parse_server_exception( - response: requests.Response, url: str, file_elements: Dict, + response: requests.Response, + url: str, + file_elements: Dict, ) -> OpenMLServerError: if response.status_code == 414: @@ -319,12 +341,17 @@ def __parse_server_exception( # 512 for runs, 372 for datasets, 500 for flows # 482 for tasks, 542 for evaluations, 674 for setups - return OpenMLServerNoResult(code=code, message=full_message,) + return OpenMLServerNoResult( + code=code, + message=full_message, + ) # 163: failure to validate flow XML (https://www.openml.org/api_docs#!/flow/post_flow) if code in [163] and file_elements is not None and "description" in file_elements: # file_elements['description'] is the XML file description of the flow full_message = "\n{}\n{} - {}".format( - file_elements["description"], message, additional_information, + file_elements["description"], + message, + additional_information, ) else: full_message = "{} - {}".format(message, additional_information) diff --git a/openml/base.py b/openml/base.py index 1b6e5ccc7..35a9ce58f 100644 --- a/openml/base.py +++ b/openml/base.py @@ -13,7 +13,7 @@ class OpenMLBase(ABC): - """ Base object for functionality that is shared across entities. """ + """Base object for functionality that is shared across entities.""" def __repr__(self): body_fields = self._get_repr_body_fields() @@ -22,32 +22,32 @@ def __repr__(self): @property @abstractmethod def id(self) -> Optional[int]: - """ The id of the entity, it is unique for its entity type. """ + """The id of the entity, it is unique for its entity type.""" pass @property def openml_url(self) -> Optional[str]: - """ The URL of the object on the server, if it was uploaded, else None. """ + """The URL of the object on the server, if it was uploaded, else None.""" if self.id is None: return None return self.__class__.url_for_id(self.id) @classmethod def url_for_id(cls, id_: int) -> str: - """ Return the OpenML URL for the object of the class entity with the given id. """ + """Return the OpenML URL for the object of the class entity with the given id.""" # Sample url for a flow: openml.org/f/123 return "{}/{}/{}".format(openml.config.get_server_base_url(), cls._entity_letter(), id_) @classmethod def _entity_letter(cls) -> str: - """ Return the letter which represents the entity type in urls, e.g. 'f' for flow.""" + """Return the letter which represents the entity type in urls, e.g. 'f' for flow.""" # We take advantage of the class naming convention (OpenMLX), # which holds for all entities except studies and tasks, which overwrite this method. return cls.__name__.lower()[len("OpenML") :][0] @abstractmethod def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: - """ Collect all information to display in the __repr__ body. + """Collect all information to display in the __repr__ body. Returns ------ @@ -60,13 +60,13 @@ def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: pass def _apply_repr_template(self, body_fields: List[Tuple[str, str]]) -> str: - """ Generates the header and formats the body for string representation of the object. + """Generates the header and formats the body for string representation of the object. - Parameters - ---------- - body_fields: List[Tuple[str, str]] - A list of (name, value) pairs to display in the body of the __repr__. - """ + Parameters + ---------- + body_fields: List[Tuple[str, str]] + A list of (name, value) pairs to display in the body of the __repr__. + """ # We add spaces between capitals, e.g. ClassificationTask -> Classification Task name_with_spaces = re.sub( r"(\w)([A-Z])", r"\1 \2", self.__class__.__name__[len("OpenML") :] @@ -81,7 +81,7 @@ def _apply_repr_template(self, body_fields: List[Tuple[str, str]]) -> str: @abstractmethod def _to_dict(self) -> "OrderedDict[str, OrderedDict]": - """ Creates a dictionary representation of self. + """Creates a dictionary representation of self. Uses OrderedDict to ensure consistent ordering when converting to xml. The return value (OrderedDict) will be used to create the upload xml file. @@ -98,7 +98,7 @@ def _to_dict(self) -> "OrderedDict[str, OrderedDict]": pass def _to_xml(self) -> str: - """ Generate xml representation of self for upload to server. """ + """Generate xml representation of self for upload to server.""" dict_representation = self._to_dict() xml_representation = xmltodict.unparse(dict_representation, pretty=True) @@ -108,7 +108,7 @@ def _to_xml(self) -> str: return xml_body def _get_file_elements(self) -> Dict: - """ Get file_elements to upload to the server, called during Publish. + """Get file_elements to upload to the server, called during Publish. Derived child classes should overwrite this method as necessary. The description field will be populated automatically if not provided. @@ -117,7 +117,7 @@ def _get_file_elements(self) -> Dict: @abstractmethod def _parse_publish_response(self, xml_response: Dict): - """ Parse the id from the xml_response and assign it to self. """ + """Parse the id from the xml_response and assign it to self.""" pass def publish(self) -> "OpenMLBase": @@ -136,7 +136,7 @@ def publish(self) -> "OpenMLBase": return self def open_in_browser(self): - """ Opens the OpenML web page corresponding to this object in your default browser. """ + """Opens the OpenML web page corresponding to this object in your default browser.""" webbrowser.open(self.openml_url) def push_tag(self, tag: str): diff --git a/openml/cli.py b/openml/cli.py index cfd453e9f..039ac227c 100644 --- a/openml/cli.py +++ b/openml/cli.py @@ -26,7 +26,7 @@ def looks_like_url(url: str) -> bool: def wait_until_valid_input( prompt: str, check: Callable[[str], str], sanitize: Union[Callable[[str], str], None] ) -> str: - """ Asks `prompt` until an input is received which returns True for `check`. + """Asks `prompt` until an input is received which returns True for `check`. Parameters ---------- @@ -252,7 +252,7 @@ def configure_field( input_message: str, sanitize: Union[Callable[[str], str], None] = None, ) -> None: - """ Configure `field` with `value`. If `value` is None ask the user for input. + """Configure `field` with `value`. If `value` is None ask the user for input. `value` and user input are first corrected/auto-completed with `convert_value` if provided, then validated with `check_with_message` function. @@ -288,13 +288,15 @@ def configure_field( else: print(intro_message) value = wait_until_valid_input( - prompt=input_message, check=check_with_message, sanitize=sanitize, + prompt=input_message, + check=check_with_message, + sanitize=sanitize, ) verbose_set(field, value) def configure(args: argparse.Namespace): - """ Calls the right submenu(s) to edit `args.field` in the configuration file. """ + """Calls the right submenu(s) to edit `args.field` in the configuration file.""" set_functions = { "apikey": configure_apikey, "server": configure_server, @@ -348,7 +350,11 @@ def main() -> None: ) parser_configure.add_argument( - "value", type=str, default=None, nargs="?", help="The value to set the FIELD to.", + "value", + type=str, + default=None, + nargs="?", + help="The value to set the FIELD to.", ) args = parser.parse_args() diff --git a/openml/config.py b/openml/config.py index 8593ad484..09359d33d 100644 --- a/openml/config.py +++ b/openml/config.py @@ -23,7 +23,7 @@ def _create_log_handlers(create_file_handler=True): - """ Creates but does not attach the log handlers. """ + """Creates but does not attach the log handlers.""" global console_handler, file_handler if console_handler is not None or file_handler is not None: logger.debug("Requested to create log handlers, but they are already created.") @@ -36,7 +36,7 @@ def _create_log_handlers(create_file_handler=True): console_handler.setFormatter(output_formatter) if create_file_handler: - one_mb = 2 ** 20 + one_mb = 2**20 log_path = os.path.join(cache_directory, "openml_python.log") file_handler = logging.handlers.RotatingFileHandler( log_path, maxBytes=one_mb, backupCount=1, delay=True @@ -45,7 +45,7 @@ def _create_log_handlers(create_file_handler=True): def _convert_log_levels(log_level: int) -> Tuple[int, int]: - """ Converts a log level that's either defined by OpenML/Python to both specifications. """ + """Converts a log level that's either defined by OpenML/Python to both specifications.""" # OpenML verbosity level don't match Python values directly: openml_to_python = {0: logging.WARNING, 1: logging.INFO, 2: logging.DEBUG} python_to_openml = { @@ -62,7 +62,7 @@ def _convert_log_levels(log_level: int) -> Tuple[int, int]: def _set_level_register_and_store(handler: logging.Handler, log_level: int): - """ Set handler log level, register it if needed, save setting to config file if specified. """ + """Set handler log level, register it if needed, save setting to config file if specified.""" oml_level, py_level = _convert_log_levels(log_level) handler.setLevel(py_level) @@ -74,13 +74,13 @@ def _set_level_register_and_store(handler: logging.Handler, log_level: int): def set_console_log_level(console_output_level: int): - """ Set console output to the desired level and register it with openml logger if needed. """ + """Set console output to the desired level and register it with openml logger if needed.""" global console_handler _set_level_register_and_store(cast(logging.Handler, console_handler), console_output_level) def set_file_log_level(file_output_level: int): - """ Set file output to the desired level and register it with openml logger if needed. """ + """Set file output to the desired level and register it with openml logger if needed.""" global file_handler _set_level_register_and_store(cast(logging.Handler, file_handler), file_output_level) @@ -90,7 +90,14 @@ def set_file_log_level(file_output_level: int): "apikey": "", "server": "https://www.openml.org/api/v1/xml", "cachedir": ( - os.environ.get("XDG_CACHE_HOME", os.path.join("~", ".cache", "openml",)) + os.environ.get( + "XDG_CACHE_HOME", + os.path.join( + "~", + ".cache", + "openml", + ), + ) if platform.system() == "Linux" else os.path.join("~", ".openml") ), @@ -144,7 +151,7 @@ def set_retry_policy(value: str, n_retries: Optional[int] = None) -> None: class ConfigurationForExamples: - """ Allows easy switching to and from a test configuration, used for examples. """ + """Allows easy switching to and from a test configuration, used for examples.""" _last_used_server = None _last_used_key = None @@ -154,7 +161,7 @@ class ConfigurationForExamples: @classmethod def start_using_configuration_for_example(cls): - """ Sets the configuration to connect to the test server with valid apikey. + """Sets the configuration to connect to the test server with valid apikey. To configuration as was before this call is stored, and can be recovered by using the `stop_use_example_configuration` method. @@ -181,7 +188,7 @@ def start_using_configuration_for_example(cls): @classmethod def stop_using_configuration_for_example(cls): - """ Return to configuration as it was before `start_use_example_configuration`. """ + """Return to configuration as it was before `start_use_example_configuration`.""" if not cls._start_last_called: # We don't want to allow this because it will (likely) result in the `server` and # `apikey` variables being set to None. @@ -281,7 +288,7 @@ def _get(config, key): def set_field_in_config_file(field: str, value: Any): - """ Overwrites the `field` in the configuration file with the new `value`. """ + """Overwrites the `field` in the configuration file with the new `value`.""" if field not in _defaults: return ValueError(f"Field '{field}' is not valid and must be one of '{_defaults.keys()}'.") @@ -302,7 +309,7 @@ def set_field_in_config_file(field: str, value: Any): def _parse_config(config_file: str): - """ Parse the config file, set up defaults. """ + """Parse the config file, set up defaults.""" config = configparser.RawConfigParser(defaults=_defaults) # The ConfigParser requires a [SECTION_HEADER], which we do not expect in our config file. diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index 8f1ce612b..8d5606912 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -239,7 +239,7 @@ def id(self) -> Optional[int]: return self.dataset_id def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: - """ Collect all information to display in the __repr__ body. """ + """Collect all information to display in the __repr__ body.""" fields = { "Name": self.name, "Version": self.version, @@ -297,7 +297,7 @@ def __eq__(self, other): return all(self.__dict__[key] == other.__dict__[key] for key in self_keys) def _download_data(self) -> None: - """ Download ARFF data file to standard cache directory. Set `self.data_file`. """ + """Download ARFF data file to standard cache directory. Set `self.data_file`.""" # import required here to avoid circular import. from .functions import _get_dataset_arff, _get_dataset_parquet @@ -363,7 +363,7 @@ def decode_arff(fh): def _parse_data_from_arff( self, arff_file_path: str ) -> Tuple[Union[pd.DataFrame, scipy.sparse.csr_matrix], List[bool], List[str]]: - """ Parse all required data from arff file. + """Parse all required data from arff file. Parameters ---------- @@ -473,7 +473,7 @@ def _compressed_cache_file_paths(self, data_file: str) -> Tuple[str, str, str]: def _cache_compressed_file_from_file( self, data_file: str ) -> Tuple[Union[pd.DataFrame, scipy.sparse.csr_matrix], List[bool], List[str]]: - """ Store data from the local file in compressed format. + """Store data from the local file in compressed format. If a local parquet file is present it will be used instead of the arff file. Sets cache_format to 'pickle' if data is sparse. @@ -519,7 +519,7 @@ def _cache_compressed_file_from_file( return data, categorical, attribute_names def _load_data(self): - """ Load data from compressed format or arff. Download data if not present on disk. """ + """Load data from compressed format or arff. Download data if not present on disk.""" need_to_create_pickle = self.cache_format == "pickle" and self.data_pickle_file is None need_to_create_feather = self.cache_format == "feather" and self.data_feather_file is None @@ -675,7 +675,7 @@ def get_data( List[bool], List[str], ]: - """ Returns dataset content as dataframes or sparse matrices. + """Returns dataset content as dataframes or sparse matrices. Parameters ---------- @@ -863,7 +863,7 @@ def get_features_by_type( return result def _get_file_elements(self) -> Dict: - """ Adds the 'dataset' to file elements. """ + """Adds the 'dataset' to file elements.""" file_elements = {} path = None if self.data_file is None else os.path.abspath(self.data_file) @@ -882,11 +882,11 @@ def _get_file_elements(self) -> Dict: return file_elements def _parse_publish_response(self, xml_response: Dict): - """ Parse the id from the xml_response and assign it to self. """ + """Parse the id from the xml_response and assign it to self.""" self.dataset_id = int(xml_response["oml:upload_data_set"]["oml:id"]) def _to_dict(self) -> "OrderedDict[str, OrderedDict]": - """ Creates a dictionary representation of self. """ + """Creates a dictionary representation of self.""" props = [ "id", "name", diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index d92d7d515..fb2e201f6 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -36,12 +36,12 @@ def _get_cache_directory(dataset: OpenMLDataset) -> str: - """ Return the cache directory of the OpenMLDataset """ + """Return the cache directory of the OpenMLDataset""" return _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, dataset.dataset_id) def list_qualities() -> List[str]: - """ Return list of data qualities available. + """Return list of data qualities available. The function performs an API call to retrieve the entire list of data qualities that are computed on the datasets uploaded. @@ -236,7 +236,8 @@ def _validated_data_attributes( def check_datasets_active( - dataset_ids: List[int], raise_error_if_not_exist: bool = True, + dataset_ids: List[int], + raise_error_if_not_exist: bool = True, ) -> Dict[int, bool]: """ Check if the dataset ids provided are active. @@ -274,7 +275,7 @@ def check_datasets_active( def _name_to_id( dataset_name: str, version: Optional[int] = None, error_if_multiple: bool = False ) -> int: - """ Attempt to find the dataset id of the dataset with the given name. + """Attempt to find the dataset id of the dataset with the given name. If multiple datasets with the name exist, and ``error_if_multiple`` is ``False``, then return the least recent still active dataset. @@ -354,7 +355,7 @@ def get_dataset( cache_format: str = "pickle", download_qualities: bool = True, ) -> OpenMLDataset: - """ Download the OpenML dataset representation, optionally also download actual data file. + """Download the OpenML dataset representation, optionally also download actual data file. This function is thread/multiprocessing safe. This function uses caching. A check will be performed to determine if the information has @@ -407,7 +408,10 @@ def get_dataset( "`dataset_id` must be one of `str` or `int`, not {}.".format(type(dataset_id)) ) - did_cache_dir = _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, dataset_id,) + did_cache_dir = _create_cache_directory_for_id( + DATASETS_CACHE_DIR_NAME, + dataset_id, + ) remove_dataset_cache = True try: @@ -450,7 +454,7 @@ def get_dataset( def attributes_arff_from_df(df): - """ Describe attributes of the dataframe according to ARFF specification. + """Describe attributes of the dataframe according to ARFF specification. Parameters ---------- @@ -746,7 +750,7 @@ def edit_dataset( original_data_url=None, paper_url=None, ) -> int: - """ Edits an OpenMLDataset. + """Edits an OpenMLDataset. In addition to providing the dataset id of the dataset to edit (through data_id), you must specify a value for at least one of the optional function arguments, @@ -886,7 +890,7 @@ def _topic_add_dataset(data_id: int, topic: str): id of the dataset for which the topic needs to be added topic : str Topic to be added for the dataset - """ + """ if not isinstance(data_id, int): raise TypeError("`data_id` must be of type `int`, not {}.".format(type(data_id))) form_data = {"data_id": data_id, "topic": topic} @@ -907,7 +911,7 @@ def _topic_delete_dataset(data_id: int, topic: str): topic : str Topic to be deleted - """ + """ if not isinstance(data_id, int): raise TypeError("`data_id` must be of type `int`, not {}.".format(type(data_id))) form_data = {"data_id": data_id, "topic": topic} @@ -959,7 +963,7 @@ def _get_dataset_description(did_cache_dir, dataset_id): def _get_dataset_parquet( description: Union[Dict, OpenMLDataset], cache_directory: str = None ) -> Optional[str]: - """ Return the path to the local parquet file of the dataset. If is not cached, it is downloaded. + """Return the path to the local parquet file of the dataset. If is not cached, it is downloaded. Checks if the file is in the cache, if yes, return the path to the file. If not, downloads the file and caches it, then returns the file path. @@ -1007,7 +1011,7 @@ def _get_dataset_parquet( def _get_dataset_arff(description: Union[Dict, OpenMLDataset], cache_directory: str = None) -> str: - """ Return the path to the local arff file of the dataset. If is not cached, it is downloaded. + """Return the path to the local arff file of the dataset. If is not cached, it is downloaded. Checks if the file is in the cache, if yes, return the path to the file. If not, downloads the file and caches it, then returns the file path. diff --git a/openml/evaluations/functions.py b/openml/evaluations/functions.py index b3fdd0aa0..30d376c04 100644 --- a/openml/evaluations/functions.py +++ b/openml/evaluations/functions.py @@ -253,7 +253,7 @@ def __list_evaluations(api_call, output_format="object"): def list_evaluation_measures() -> List[str]: - """ Return list of evaluation measures available. + """Return list of evaluation measures available. The function performs an API call to retrieve the entire list of evaluation measures that are available. diff --git a/openml/exceptions.py b/openml/exceptions.py index 781784ee2..a5f132128 100644 --- a/openml/exceptions.py +++ b/openml/exceptions.py @@ -9,7 +9,7 @@ def __init__(self, message: str): class OpenMLServerError(PyOpenMLError): """class for when something is really wrong on the server - (result did not parse to dict), contains unparsed error.""" + (result did not parse to dict), contains unparsed error.""" def __init__(self, message: str): super().__init__(message) @@ -17,7 +17,7 @@ def __init__(self, message: str): class OpenMLServerException(OpenMLServerError): """exception for when the result of the server was - not 200 (e.g., listing call w/o results). """ + not 200 (e.g., listing call w/o results).""" # Code needs to be optional to allow the exceptino to be picklable: # https://stackoverflow.com/questions/16244923/how-to-make-a-custom-exception-class-with-multiple-init-args-pickleable # noqa: E501 @@ -28,11 +28,15 @@ def __init__(self, message: str, code: int = None, url: str = None): super().__init__(message) def __str__(self): - return "%s returned code %s: %s" % (self.url, self.code, self.message,) + return "%s returned code %s: %s" % ( + self.url, + self.code, + self.message, + ) class OpenMLServerNoResult(OpenMLServerException): - """exception for when the result of the server is empty. """ + """exception for when the result of the server is empty.""" pass @@ -51,14 +55,14 @@ class OpenMLHashException(PyOpenMLError): class OpenMLPrivateDatasetError(PyOpenMLError): - """ Exception thrown when the user has no rights to access the dataset. """ + """Exception thrown when the user has no rights to access the dataset.""" def __init__(self, message: str): super().__init__(message) class OpenMLRunsExistError(PyOpenMLError): - """ Indicates run(s) already exists on the server when they should not be duplicated. """ + """Indicates run(s) already exists on the server when they should not be duplicated.""" def __init__(self, run_ids: set, message: str): if len(run_ids) < 1: diff --git a/openml/extensions/extension_interface.py b/openml/extensions/extension_interface.py index 4529ad163..f33ef7543 100644 --- a/openml/extensions/extension_interface.py +++ b/openml/extensions/extension_interface.py @@ -204,7 +204,9 @@ def _run_model_on_fold( @abstractmethod def obtain_parameter_values( - self, flow: "OpenMLFlow", model: Any = None, + self, + flow: "OpenMLFlow", + model: Any = None, ) -> List[Dict[str, Any]]: """Extracts all parameter settings required for the flow from the model. @@ -247,7 +249,9 @@ def check_if_model_fitted(self, model: Any) -> bool: @abstractmethod def instantiate_model_from_hpo_class( - self, model: Any, trace_iteration: "OpenMLTraceIteration", + self, + model: Any, + trace_iteration: "OpenMLTraceIteration", ) -> Any: """Instantiate a base model which can be searched over by the hyperparameter optimization model. diff --git a/openml/extensions/functions.py b/openml/extensions/functions.py index 52bb03961..a080e1004 100644 --- a/openml/extensions/functions.py +++ b/openml/extensions/functions.py @@ -30,7 +30,8 @@ def register_extension(extension: Type[Extension]) -> None: def get_extension_by_flow( - flow: "OpenMLFlow", raise_if_no_extension: bool = False, + flow: "OpenMLFlow", + raise_if_no_extension: bool = False, ) -> Optional[Extension]: """Get an extension which can handle the given flow. @@ -66,7 +67,10 @@ def get_extension_by_flow( ) -def get_extension_by_model(model: Any, raise_if_no_extension: bool = False,) -> Optional[Extension]: +def get_extension_by_model( + model: Any, + raise_if_no_extension: bool = False, +) -> Optional[Extension]: """Get an extension which can handle the given flow. Iterates all registered extensions and checks whether they can handle the presented model. diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py index d49a9a9c5..c2a1d6bde 100644 --- a/openml/extensions/sklearn/extension.py +++ b/openml/extensions/sklearn/extension.py @@ -66,8 +66,8 @@ class SklearnExtension(Extension): """Connect scikit-learn to OpenML-Python. - The estimators which use this extension must be scikit-learn compatible, - i.e needs to be a subclass of sklearn.base.BaseEstimator". + The estimators which use this extension must be scikit-learn compatible, + i.e needs to be a subclass of sklearn.base.BaseEstimator". """ ################################################################################################ @@ -107,7 +107,7 @@ def can_handle_model(cls, model: Any) -> bool: def trim_flow_name( cls, long_name: str, extra_trim_length: int = 100, _outer: bool = True ) -> str: - """ Shorten generated sklearn flow name to at most ``max_length`` characters. + """Shorten generated sklearn flow name to at most ``max_length`` characters. Flows are assumed to have the following naming structure: ``(model_selection)? (pipeline)? (steps)+`` @@ -223,7 +223,7 @@ def remove_all_in_parentheses(string: str) -> str: @classmethod def _min_dependency_str(cls, sklearn_version: str) -> str: - """ Returns a string containing the minimum dependencies for the sklearn version passed. + """Returns a string containing the minimum dependencies for the sklearn version passed. Parameters ---------- @@ -858,7 +858,9 @@ def _get_tags(self) -> List[str]: ] def _get_external_version_string( - self, model: Any, sub_components: Dict[str, OpenMLFlow], + self, + model: Any, + sub_components: Dict[str, OpenMLFlow], ) -> str: # Create external version string for a flow, given the model and the # already parsed dictionary of sub_components. Retrieves the external @@ -874,7 +876,8 @@ def _get_external_version_string( module = importlib.import_module(model_package_name) model_package_version_number = module.__version__ # type: ignore external_version = self._format_external_version( - model_package_name, model_package_version_number, + model_package_name, + model_package_version_number, ) external_versions.add(external_version) @@ -890,7 +893,9 @@ def _get_external_version_string( return ",".join(list(sorted(external_versions))) def _check_multiple_occurence_of_component_in_flow( - self, model: Any, sub_components: Dict[str, OpenMLFlow], + self, + model: Any, + sub_components: Dict[str, OpenMLFlow], ) -> None: to_visit_stack = [] # type: List[OpenMLFlow] to_visit_stack.extend(sub_components.values()) @@ -910,7 +915,8 @@ def _check_multiple_occurence_of_component_in_flow( to_visit_stack.extend(visitee.components.values()) def _extract_information_from_model( - self, model: Any, + self, + model: Any, ) -> Tuple[ "OrderedDict[str, Optional[str]]", "OrderedDict[str, Optional[Dict]]", @@ -936,7 +942,7 @@ def _extract_information_from_model( rval = self._serialize_sklearn(v, model) def flatten_all(list_): - """ Flattens arbitrary depth lists of lists (e.g. [[1,2],[3,[1]]] -> [1,2,3,1]). """ + """Flattens arbitrary depth lists of lists (e.g. [[1,2],[3,[1]]] -> [1,2,3,1]).""" for el in list_: if isinstance(el, (list, tuple)) and len(el) > 0: yield from flatten_all(el) @@ -1381,7 +1387,9 @@ def _deserialize_cross_validator( return model_class(**parameters) def _format_external_version( - self, model_package_name: str, model_package_version_number: str, + self, + model_package_name: str, + model_package_version_number: str, ) -> str: return "%s==%s" % (model_package_name, model_package_version_number) @@ -1530,7 +1538,7 @@ def _seed_current_object(current_value): # statement) this way we guarantee that if a different set of # subflows is seeded, the same number of the random generator is # used - new_value = rs.randint(0, 2 ** 16) + new_value = rs.randint(0, 2**16) if _seed_current_object(current_value): random_states[param_name] = new_value @@ -1540,7 +1548,7 @@ def _seed_current_object(current_value): continue current_value = model_params[param_name].random_state - new_value = rs.randint(0, 2 ** 16) + new_value = rs.randint(0, 2**16) if _seed_current_object(current_value): model_params[param_name].random_state = new_value @@ -1777,7 +1785,8 @@ def _prediction_to_probabilities( # for class 3 because the rest of the library expects that the # probabilities are ordered the same way as the classes are ordered). message = "Estimator only predicted for {}/{} classes!".format( - proba_y.shape[1], len(task.class_labels), + proba_y.shape[1], + len(task.class_labels), ) warnings.warn(message) openml.config.logger.warning(message) @@ -1815,7 +1824,9 @@ def _prediction_to_probabilities( return pred_y, proba_y, user_defined_measures, trace def obtain_parameter_values( - self, flow: "OpenMLFlow", model: Any = None, + self, + flow: "OpenMLFlow", + model: Any = None, ) -> List[Dict[str, Any]]: """Extracts all parameter settings required for the flow from the model. @@ -2019,7 +2030,9 @@ def is_subcomponent_specification(values): return parameters def _openml_param_name_to_sklearn( - self, openml_parameter: openml.setups.OpenMLParameter, flow: OpenMLFlow, + self, + openml_parameter: openml.setups.OpenMLParameter, + flow: OpenMLFlow, ) -> str: """ Converts the name of an OpenMLParameter into the sklean name, given a flow. @@ -2068,7 +2081,9 @@ def _is_hpo_class(self, model: Any) -> bool: return isinstance(model, sklearn.model_selection._search.BaseSearchCV) def instantiate_model_from_hpo_class( - self, model: Any, trace_iteration: OpenMLTraceIteration, + self, + model: Any, + trace_iteration: OpenMLTraceIteration, ) -> Any: """Instantiate a ``base_estimator`` which can be searched over by the hyperparameter optimization model. @@ -2114,7 +2129,11 @@ def _extract_trace_data(self, model, rep_no, fold_no): arff_tracecontent.append(arff_line) return arff_tracecontent - def _obtain_arff_trace(self, model: Any, trace_content: List,) -> "OpenMLRunTrace": + def _obtain_arff_trace( + self, + model: Any, + trace_content: List, + ) -> "OpenMLRunTrace": """Create arff trace object from a fitted model and the trace content obtained by repeatedly calling ``run_model_on_task``. @@ -2176,4 +2195,7 @@ def _obtain_arff_trace(self, model: Any, trace_content: List,) -> "OpenMLRunTrac attribute = (PREFIX + key[6:], type) trace_attributes.append(attribute) - return OpenMLRunTrace.generate(trace_attributes, trace_content,) + return OpenMLRunTrace.generate( + trace_attributes, + trace_content, + ) diff --git a/openml/flows/flow.py b/openml/flows/flow.py index 2a340e625..b9752e77c 100644 --- a/openml/flows/flow.py +++ b/openml/flows/flow.py @@ -174,7 +174,7 @@ def extension(self): ) def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: - """ Collect all information to display in the __repr__ body. """ + """Collect all information to display in the __repr__ body.""" fields = { "Flow Name": self.name, "Flow Description": self.description, @@ -203,7 +203,7 @@ def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: return [(key, fields[key]) for key in order if key in fields] def _to_dict(self) -> "OrderedDict[str, OrderedDict]": - """ Creates a dictionary representation of self. """ + """Creates a dictionary representation of self.""" flow_container = OrderedDict() # type: 'OrderedDict[str, OrderedDict]' flow_dict = OrderedDict( [("@xmlns:oml", "http://openml.org/openml")] @@ -297,7 +297,7 @@ def _from_dict(cls, xml_dict): Calls itself recursively to create :class:`OpenMLFlow` objects of subflows (components). - + XML definition of a flow is available at https://github.com/openml/OpenML/blob/master/openml_OS/views/pages/api_new/v1/xsd/openml.implementation.upload.xsd @@ -400,11 +400,11 @@ def from_filesystem(cls, input_directory) -> "OpenMLFlow": return OpenMLFlow._from_dict(xmltodict.parse(xml_string)) def _parse_publish_response(self, xml_response: Dict): - """ Parse the id from the xml_response and assign it to self. """ + """Parse the id from the xml_response and assign it to self.""" self.flow_id = int(xml_response["oml:upload_flow"]["oml:id"]) def publish(self, raise_error_if_exists: bool = False) -> "OpenMLFlow": - """ Publish this flow to OpenML server. + """Publish this flow to OpenML server. Raises a PyOpenMLError if the flow exists on the server, but `self.flow_id` does not match the server known flow id. diff --git a/openml/flows/functions.py b/openml/flows/functions.py index 28d49b691..85546a0a3 100644 --- a/openml/flows/functions.py +++ b/openml/flows/functions.py @@ -122,7 +122,8 @@ def _get_flow_description(flow_id: int) -> OpenMLFlow: except OpenMLCacheException: xml_file = os.path.join( - openml.utils._create_cache_directory_for_id(FLOWS_CACHE_DIR_NAME, flow_id), "flow.xml", + openml.utils._create_cache_directory_for_id(FLOWS_CACHE_DIR_NAME, flow_id), + "flow.xml", ) flow_xml = openml._api_calls._perform_api_call("flow/%d" % flow_id, request_method="get") @@ -253,7 +254,9 @@ def flow_exists(name: str, external_version: str) -> Union[int, bool]: raise ValueError("Argument 'version' should be a non-empty string") xml_response = openml._api_calls._perform_api_call( - "flow/exists", "get", data={"name": name, "external_version": external_version}, + "flow/exists", + "get", + data={"name": name, "external_version": external_version}, ) result_dict = xmltodict.parse(xml_response) @@ -265,7 +268,9 @@ def flow_exists(name: str, external_version: str) -> Union[int, bool]: def get_flow_id( - model: Optional[Any] = None, name: Optional[str] = None, exact_version=True, + model: Optional[Any] = None, + name: Optional[str] = None, + exact_version=True, ) -> Union[int, bool, List[int]]: """Retrieves the flow id for a model or a flow name. @@ -357,7 +362,7 @@ def __list_flows(api_call: str, output_format: str = "dict") -> Union[Dict, pd.D def _check_flow_for_server_id(flow: OpenMLFlow) -> None: - """ Raises a ValueError if the flow or any of its subflows has no flow id. """ + """Raises a ValueError if the flow or any of its subflows has no flow id.""" # Depth-first search to check if all components were uploaded to the # server before parsing the parameters diff --git a/openml/runs/functions.py b/openml/runs/functions.py index 8bbe3b956..08b2fe972 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -353,7 +353,10 @@ def initialize_model_from_run(run_id: int) -> Any: def initialize_model_from_trace( - run_id: int, repeat: int, fold: int, iteration: Optional[int] = None, + run_id: int, + repeat: int, + fold: int, + iteration: Optional[int] = None, ) -> Any: """ Initialize a model based on the parameters that were set @@ -461,7 +464,12 @@ def _run_task_get_arffcontent( jobs = [] for n_fit, (rep_no, fold_no, sample_no) in enumerate( - itertools.product(range(num_reps), range(num_folds), range(num_samples),), start=1 + itertools.product( + range(num_reps), + range(num_folds), + range(num_samples), + ), + start=1, ): jobs.append((n_fit, rep_no, fold_no, sample_no)) @@ -537,7 +545,8 @@ def _calculate_local_measure(sklearn_fn, openml_name): if add_local_measures: _calculate_local_measure( - sklearn.metrics.accuracy_score, "predictive_accuracy", + sklearn.metrics.accuracy_score, + "predictive_accuracy", ) elif isinstance(task, OpenMLRegressionTask): @@ -557,7 +566,8 @@ def _calculate_local_measure(sklearn_fn, openml_name): if add_local_measures: _calculate_local_measure( - sklearn.metrics.mean_absolute_error, "mean_absolute_error", + sklearn.metrics.mean_absolute_error, + "mean_absolute_error", ) elif isinstance(task, OpenMLClusteringTask): @@ -921,7 +931,10 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None): def _get_cached_run(run_id): """Load a run from the cache.""" - run_cache_dir = openml.utils._create_cache_directory_for_id(RUNS_CACHE_DIR_NAME, run_id,) + run_cache_dir = openml.utils._create_cache_directory_for_id( + RUNS_CACHE_DIR_NAME, + run_id, + ) try: run_file = os.path.join(run_cache_dir, "description.xml") with io.open(run_file, encoding="utf8") as fh: @@ -1144,7 +1157,7 @@ def format_prediction( sample: Optional[int] = None, proba: Optional[Dict[str, float]] = None, ) -> List[Union[str, int, float]]: - """ Format the predictions in the specific order as required for the run results. + """Format the predictions in the specific order as required for the run results. Parameters ---------- diff --git a/openml/runs/run.py b/openml/runs/run.py index 5c93e9518..58367179e 100644 --- a/openml/runs/run.py +++ b/openml/runs/run.py @@ -121,7 +121,7 @@ def __init__( @property def predictions(self) -> pd.DataFrame: - """ Return a DataFrame with predictions for this run """ + """Return a DataFrame with predictions for this run""" if self._predictions is None: if self.data_content: arff_dict = self._generate_arff_dict() @@ -140,7 +140,7 @@ def id(self) -> Optional[int]: return self.run_id def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: - """ Collect all information to display in the __repr__ body. """ + """Collect all information to display in the __repr__ body.""" fields = { "Uploader Name": self.uploader_name, "Metric": self.task_evaluation_measure, @@ -251,7 +251,11 @@ def from_filesystem(cls, directory: str, expect_model: bool = True) -> "OpenMLRu return run - def to_filesystem(self, directory: str, store_model: bool = True,) -> None: + def to_filesystem( + self, + directory: str, + store_model: bool = True, + ) -> None: """ The inverse of the from_filesystem method. Serializes a run on the filesystem, to be uploaded later. @@ -408,7 +412,8 @@ def get_metric_fn(self, sklearn_fn, kwargs=None): predictions_arff = self._generate_arff_dict() elif "predictions" in self.output_files: predictions_file_url = openml._api_calls._file_id_to_url( - self.output_files["predictions"], "predictions.arff", + self.output_files["predictions"], + "predictions.arff", ) response = openml._api_calls._download_text_file(predictions_file_url) predictions_arff = arff.loads(response) @@ -516,11 +521,11 @@ def _attribute_list_to_dict(attribute_list): return np.array(scores) def _parse_publish_response(self, xml_response: Dict): - """ Parse the id from the xml_response and assign it to self. """ + """Parse the id from the xml_response and assign it to self.""" self.run_id = int(xml_response["oml:upload_run"]["oml:run_id"]) def _get_file_elements(self) -> Dict: - """ Get file_elements to upload to the server. + """Get file_elements to upload to the server. Derived child classes should overwrite this method as necessary. The description field will be populated automatically if not provided. @@ -544,7 +549,8 @@ def _get_file_elements(self) -> Dict: if self.flow is None: self.flow = openml.flows.get_flow(self.flow_id) self.parameter_settings = self.flow.extension.obtain_parameter_values( - self.flow, self.model, + self.flow, + self.model, ) file_elements = {"description": ("description.xml", self._to_xml())} @@ -559,7 +565,7 @@ def _get_file_elements(self) -> Dict: return file_elements def _to_dict(self) -> "OrderedDict[str, OrderedDict]": - """ Creates a dictionary representation of self. """ + """Creates a dictionary representation of self.""" description = OrderedDict() # type: 'OrderedDict' description["oml:run"] = OrderedDict() description["oml:run"]["@xmlns:oml"] = "http://openml.org/openml" diff --git a/openml/runs/trace.py b/openml/runs/trace.py index 0c05b9dc8..e6885260e 100644 --- a/openml/runs/trace.py +++ b/openml/runs/trace.py @@ -331,7 +331,12 @@ def trace_from_xml(cls, xml): ) current = OpenMLTraceIteration( - repeat, fold, iteration, setup_string, evaluation, selected, + repeat, + fold, + iteration, + setup_string, + evaluation, + selected, ) trace[(repeat, fold, iteration)] = current @@ -372,7 +377,8 @@ def merge_traces(cls, traces: List["OpenMLRunTrace"]) -> "OpenMLRunTrace": def __repr__(self): return "[Run id: {}, {} trace iterations]".format( - -1 if self.run_id is None else self.run_id, len(self.trace_iterations), + -1 if self.run_id is None else self.run_id, + len(self.trace_iterations), ) def __iter__(self): @@ -410,7 +416,14 @@ class OpenMLTraceIteration(object): """ def __init__( - self, repeat, fold, iteration, setup_string, evaluation, selected, parameters=None, + self, + repeat, + fold, + iteration, + setup_string, + evaluation, + selected, + parameters=None, ): if not isinstance(selected, bool): diff --git a/openml/setups/functions.py b/openml/setups/functions.py index b418a6106..675172738 100644 --- a/openml/setups/functions.py +++ b/openml/setups/functions.py @@ -175,7 +175,7 @@ def _list_setups(setup=None, output_format="object", **kwargs): Returns ------- dict or dataframe - """ + """ api_call = "setup/list" if setup is not None: diff --git a/openml/study/functions.py b/openml/study/functions.py index 26cb9bd55..ae257dd9c 100644 --- a/openml/study/functions.py +++ b/openml/study/functions.py @@ -30,7 +30,8 @@ def get_suite(suite_id: Union[int, str]) -> OpenMLBenchmarkSuite: def get_study( - study_id: Union[int, str], arg_for_backwards_compat: Optional[str] = None, + study_id: Union[int, str], + arg_for_backwards_compat: Optional[str] = None, ) -> OpenMLStudy: # noqa F401 """ Retrieves all relevant information of an OpenML study from the server. @@ -83,7 +84,8 @@ def _get_study(id_: Union[int, str], entity_type) -> BaseStudy: if entity_type != main_entity_type: raise ValueError( "Unexpected entity type '{}' reported by the server, expected '{}'".format( - main_entity_type, entity_type, + main_entity_type, + entity_type, ) ) benchmark_suite = ( @@ -207,7 +209,10 @@ def create_study( def create_benchmark_suite( - name: str, description: str, task_ids: List[int], alias: Optional[str] = None, + name: str, + description: str, + task_ids: List[int], + alias: Optional[str] = None, ) -> OpenMLBenchmarkSuite: """ Creates an OpenML benchmark suite (collection of entity types, where diff --git a/openml/study/study.py b/openml/study/study.py index dbbef6e89..0cdc913f9 100644 --- a/openml/study/study.py +++ b/openml/study/study.py @@ -99,7 +99,7 @@ def id(self) -> Optional[int]: return self.study_id def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: - """ Collect all information to display in the __repr__ body. """ + """Collect all information to display in the __repr__ body.""" fields = { "Name": self.name, "Status": self.status, @@ -138,11 +138,11 @@ def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: return [(key, fields[key]) for key in order if key in fields] def _parse_publish_response(self, xml_response: Dict): - """ Parse the id from the xml_response and assign it to self. """ + """Parse the id from the xml_response and assign it to self.""" self.study_id = int(xml_response["oml:study_upload"]["oml:id"]) def _to_dict(self) -> "OrderedDict[str, OrderedDict]": - """ Creates a dictionary representation of self. """ + """Creates a dictionary representation of self.""" # some can not be uploaded, e.g., id, creator, creation_date simple_props = ["alias", "main_entity_type", "name", "description"] # maps from attribute name (which is used as outer tag name) to immer diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py index 2c5a56ad7..75731d01f 100644 --- a/openml/tasks/functions.py +++ b/openml/tasks/functions.py @@ -354,7 +354,10 @@ def get_task( except (ValueError, TypeError): raise ValueError("Dataset ID is neither an Integer nor can be cast to an Integer.") - tid_cache_dir = openml.utils._create_cache_directory_for_id(TASKS_CACHE_DIR_NAME, task_id,) + tid_cache_dir = openml.utils._create_cache_directory_for_id( + TASKS_CACHE_DIR_NAME, + task_id, + ) try: task = _get_task_description(task_id) @@ -371,7 +374,8 @@ def get_task( task.download_split() except Exception as e: openml.utils._remove_cache_dir_for_id( - TASKS_CACHE_DIR_NAME, tid_cache_dir, + TASKS_CACHE_DIR_NAME, + tid_cache_dir, ) raise e @@ -384,7 +388,11 @@ def _get_task_description(task_id): return _get_cached_task(task_id) except OpenMLCacheException: xml_file = os.path.join( - openml.utils._create_cache_directory_for_id(TASKS_CACHE_DIR_NAME, task_id,), "task.xml", + openml.utils._create_cache_directory_for_id( + TASKS_CACHE_DIR_NAME, + task_id, + ), + "task.xml", ) task_xml = openml._api_calls._perform_api_call("task/%d" % task_id, "get") diff --git a/openml/tasks/split.py b/openml/tasks/split.py index 515be895a..e5fafedc5 100644 --- a/openml/tasks/split.py +++ b/openml/tasks/split.py @@ -14,11 +14,11 @@ class OpenMLSplit(object): """OpenML Split object. - Parameters - ---------- - name : int or str - description : str - split : dict + Parameters + ---------- + name : int or str + description : str + split : dict """ def __init__(self, name, description, split): diff --git a/openml/tasks/task.py b/openml/tasks/task.py index 6a1f2a4c5..7f2e53a65 100644 --- a/openml/tasks/task.py +++ b/openml/tasks/task.py @@ -34,16 +34,16 @@ class TaskType(Enum): class OpenMLTask(OpenMLBase): """OpenML Task object. - Parameters - ---------- - task_type_id : TaskType - Refers to the type of task. - task_type : str - Refers to the task. - data_set_id: int - Refers to the data. - estimation_procedure_id: int - Refers to the type of estimates used. + Parameters + ---------- + task_type_id : TaskType + Refers to the type of task. + task_type : str + Refers to the task. + data_set_id: int + Refers to the data. + estimation_procedure_id: int + Refers to the type of estimates used. """ def __init__( @@ -82,7 +82,7 @@ def id(self) -> Optional[int]: return self.task_id def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: - """ Collect all information to display in the __repr__ body. """ + """Collect all information to display in the __repr__ body.""" fields = { "Task Type Description": "{}/tt/{}".format( openml.config.get_server_base_url(), self.task_type_id @@ -120,14 +120,21 @@ def get_dataset(self) -> datasets.OpenMLDataset: return datasets.get_dataset(self.dataset_id) def get_train_test_split_indices( - self, fold: int = 0, repeat: int = 0, sample: int = 0, + self, + fold: int = 0, + repeat: int = 0, + sample: int = 0, ) -> Tuple[np.ndarray, np.ndarray]: # Replace with retrieve from cache if self.split is None: self.split = self.download_split() - train_indices, test_indices = self.split.get(repeat=repeat, fold=fold, sample=sample,) + train_indices, test_indices = self.split.get( + repeat=repeat, + fold=fold, + sample=sample, + ) return train_indices, test_indices def _download_split(self, cache_file: str): @@ -137,14 +144,15 @@ def _download_split(self, cache_file: str): except (OSError, IOError): split_url = self.estimation_procedure["data_splits_url"] openml._api_calls._download_text_file( - source=str(split_url), output_path=cache_file, + source=str(split_url), + output_path=cache_file, ) def download_split(self) -> OpenMLSplit: - """Download the OpenML split for a given task. - """ + """Download the OpenML split for a given task.""" cached_split_file = os.path.join( - _create_cache_directory_for_id("tasks", self.task_id), "datasplits.arff", + _create_cache_directory_for_id("tasks", self.task_id), + "datasplits.arff", ) try: @@ -164,7 +172,7 @@ def get_split_dimensions(self) -> Tuple[int, int, int]: return self.split.repeats, self.split.folds, self.split.samples def _to_dict(self) -> "OrderedDict[str, OrderedDict]": - """ Creates a dictionary representation of self. """ + """Creates a dictionary representation of self.""" task_container = OrderedDict() # type: OrderedDict[str, OrderedDict] task_dict = OrderedDict( [("@xmlns:oml", "http://openml.org/openml")] @@ -192,17 +200,17 @@ def _to_dict(self) -> "OrderedDict[str, OrderedDict]": return task_container def _parse_publish_response(self, xml_response: Dict): - """ Parse the id from the xml_response and assign it to self. """ + """Parse the id from the xml_response and assign it to self.""" self.task_id = int(xml_response["oml:upload_task"]["oml:id"]) class OpenMLSupervisedTask(OpenMLTask, ABC): """OpenML Supervised Classification object. - Parameters - ---------- - target_name : str - Name of the target feature (the class variable). + Parameters + ---------- + target_name : str + Name of the target feature (the class variable). """ def __init__( @@ -233,7 +241,8 @@ def __init__( self.target_name = target_name def get_X_and_y( - self, dataset_format: str = "array", + self, + dataset_format: str = "array", ) -> Tuple[ Union[np.ndarray, pd.DataFrame, scipy.sparse.spmatrix], Union[np.ndarray, pd.Series] ]: @@ -257,7 +266,10 @@ def get_X_and_y( TaskType.LEARNING_CURVE, ): raise NotImplementedError(self.task_type) - X, y, _, _ = dataset.get_data(dataset_format=dataset_format, target=self.target_name,) + X, y, _, _ = dataset.get_data( + dataset_format=dataset_format, + target=self.target_name, + ) return X, y def _to_dict(self) -> "OrderedDict[str, OrderedDict]": @@ -291,10 +303,10 @@ def estimation_parameters(self, est_parameters): class OpenMLClassificationTask(OpenMLSupervisedTask): """OpenML Classification object. - Parameters - ---------- - class_labels : List of str (optional) - cost_matrix: array (optional) + Parameters + ---------- + class_labels : List of str (optional) + cost_matrix: array (optional) """ def __init__( @@ -333,8 +345,7 @@ def __init__( class OpenMLRegressionTask(OpenMLSupervisedTask): - """OpenML Regression object. - """ + """OpenML Regression object.""" def __init__( self, @@ -366,11 +377,11 @@ def __init__( class OpenMLClusteringTask(OpenMLTask): """OpenML Clustering object. - Parameters - ---------- - target_name : str (optional) - Name of the target feature (class) that is not part of the - feature set for the clustering task. + Parameters + ---------- + target_name : str (optional) + Name of the target feature (class) that is not part of the + feature set for the clustering task. """ def __init__( @@ -401,7 +412,8 @@ def __init__( self.target_name = target_name def get_X( - self, dataset_format: str = "array", + self, + dataset_format: str = "array", ) -> Union[np.ndarray, pd.DataFrame, scipy.sparse.spmatrix]: """Get data associated with the current task. @@ -417,7 +429,10 @@ def get_X( """ dataset = self.get_dataset() - data, *_ = dataset.get_data(dataset_format=dataset_format, target=None,) + data, *_ = dataset.get_data( + dataset_format=dataset_format, + target=None, + ) return data def _to_dict(self) -> "OrderedDict[str, OrderedDict]": @@ -442,8 +457,7 @@ def _to_dict(self) -> "OrderedDict[str, OrderedDict]": class OpenMLLearningCurveTask(OpenMLClassificationTask): - """OpenML Learning Curve object. - """ + """OpenML Learning Curve object.""" def __init__( self, diff --git a/openml/testing.py b/openml/testing.py index 922d373b2..56445a253 100644 --- a/openml/testing.py +++ b/openml/testing.py @@ -114,7 +114,7 @@ def tearDown(self): @classmethod def _mark_entity_for_removal(self, entity_type, entity_id): - """ Static record of entities uploaded to test server + """Static record of entities uploaded to test server Dictionary of lists where the keys are 'entity_type'. Each such dictionary is a list of integer IDs. @@ -128,7 +128,7 @@ def _mark_entity_for_removal(self, entity_type, entity_id): @classmethod def _delete_entity_from_tracker(self, entity_type, entity): - """ Deletes entity records from the static file_tracker + """Deletes entity records from the static file_tracker Given an entity type and corresponding ID, deletes all entries, including duplicate entries of the ID for the entity type. diff --git a/openml/utils.py b/openml/utils.py index a482bf0bc..8ab238463 100644 --- a/openml/utils.py +++ b/openml/utils.py @@ -71,7 +71,7 @@ def extract_xml_tags(xml_tag_name, node, allow_none=True): def _get_rest_api_type_alias(oml_object: "OpenMLBase") -> str: - """ Return the alias of the openml entity as it is defined for the REST API. """ + """Return the alias of the openml entity as it is defined for the REST API.""" rest_api_mapping = [ (openml.datasets.OpenMLDataset, "data"), (openml.flows.OpenMLFlow, "flow"), diff --git a/setup.py b/setup.py index f5e70abb5..9f3cdd0e6 100644 --- a/setup.py +++ b/setup.py @@ -40,7 +40,8 @@ # Make sure to remove stale files such as the egg-info before updating this: # https://stackoverflow.com/a/26547314 packages=setuptools.find_packages( - include=["openml.*", "openml"], exclude=["*.tests", "*.tests.*", "tests.*", "tests"], + include=["openml.*", "openml"], + exclude=["*.tests", "*.tests.*", "tests.*", "tests"], ), package_data={"": ["*.txt", "*.md", "py.typed"]}, python_requires=">=3.6", @@ -84,7 +85,12 @@ "seaborn", ], "examples_unix": ["fanova"], - "docs": ["sphinx>=3", "sphinx-gallery", "sphinx_bootstrap_theme", "numpydoc",], + "docs": [ + "sphinx>=3", + "sphinx-gallery", + "sphinx_bootstrap_theme", + "numpydoc", + ], }, test_suite="pytest", classifiers=[ diff --git a/tests/conftest.py b/tests/conftest.py index c1f728a72..cf3f33834 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -38,7 +38,7 @@ def worker_id() -> str: - """ Returns the name of the worker process owning this function call. + """Returns the name of the worker process owning this function call. :return: str Possible outputs from the set of {'master', 'gw0', 'gw1', ..., 'gw(n-1)'} diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index 9d67ee177..878b2288a 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -58,7 +58,8 @@ def _remove_pickle_files(self): self.lock_path = os.path.join(openml.config.get_cache_directory(), "locks") for did in ["-1", "2"]: with lockutils.external_lock( - name="datasets.functions.get_dataset:%s" % did, lock_path=self.lock_path, + name="datasets.functions.get_dataset:%s" % did, + lock_path=self.lock_path, ): pickle_path = os.path.join( openml.config.get_cache_directory(), "datasets", did, "dataset.pkl.py3" @@ -175,7 +176,10 @@ def test_list_datasets_empty(self): def test_check_datasets_active(self): # Have to test on live because there is no deactivated dataset on the test server. openml.config.server = self.production_server - active = openml.datasets.check_datasets_active([2, 17, 79], raise_error_if_not_exist=False,) + active = openml.datasets.check_datasets_active( + [2, 17, 79], + raise_error_if_not_exist=False, + ) self.assertTrue(active[2]) self.assertFalse(active[17]) self.assertIsNone(active.get(79)) @@ -188,7 +192,7 @@ def test_check_datasets_active(self): openml.config.server = self.test_server def _datasets_retrieved_successfully(self, dids, metadata_only=True): - """ Checks that all files for the given dids have been downloaded. + """Checks that all files for the given dids have been downloaded. This includes: - description @@ -229,24 +233,24 @@ def _datasets_retrieved_successfully(self, dids, metadata_only=True): ) def test__name_to_id_with_deactivated(self): - """ Check that an activated dataset is returned if an earlier deactivated one exists. """ + """Check that an activated dataset is returned if an earlier deactivated one exists.""" openml.config.server = self.production_server # /d/1 was deactivated self.assertEqual(openml.datasets.functions._name_to_id("anneal"), 2) openml.config.server = self.test_server def test__name_to_id_with_multiple_active(self): - """ With multiple active datasets, retrieve the least recent active. """ + """With multiple active datasets, retrieve the least recent active.""" openml.config.server = self.production_server self.assertEqual(openml.datasets.functions._name_to_id("iris"), 61) def test__name_to_id_with_version(self): - """ With multiple active datasets, retrieve the least recent active. """ + """With multiple active datasets, retrieve the least recent active.""" openml.config.server = self.production_server self.assertEqual(openml.datasets.functions._name_to_id("iris", version=3), 969) def test__name_to_id_with_multiple_active_error(self): - """ With multiple active datasets, retrieve the least recent active. """ + """With multiple active datasets, retrieve the least recent active.""" openml.config.server = self.production_server self.assertRaisesRegex( ValueError, @@ -257,7 +261,7 @@ def test__name_to_id_with_multiple_active_error(self): ) def test__name_to_id_name_does_not_exist(self): - """ With multiple active datasets, retrieve the least recent active. """ + """With multiple active datasets, retrieve the least recent active.""" self.assertRaisesRegex( RuntimeError, "No active datasets exist with name does_not_exist", @@ -266,7 +270,7 @@ def test__name_to_id_name_does_not_exist(self): ) def test__name_to_id_version_does_not_exist(self): - """ With multiple active datasets, retrieve the least recent active. """ + """With multiple active datasets, retrieve the least recent active.""" self.assertRaisesRegex( RuntimeError, "No active datasets exist with name iris and version 100000", @@ -356,7 +360,7 @@ def test_get_dataset_lazy(self): self.assertRaises(OpenMLPrivateDatasetError, openml.datasets.get_dataset, 45, False) def test_get_dataset_lazy_all_functions(self): - """ Test that all expected functionality is available without downloading the dataset. """ + """Test that all expected functionality is available without downloading the dataset.""" dataset = openml.datasets.get_dataset(1, download_data=False) # We only tests functions as general integrity is tested by test_get_dataset_lazy @@ -537,10 +541,14 @@ def test__get_dataset_skip_download(self): def test_deletion_of_cache_dir(self): # Simple removal - did_cache_dir = _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, 1,) + did_cache_dir = _create_cache_directory_for_id( + DATASETS_CACHE_DIR_NAME, + 1, + ) self.assertTrue(os.path.exists(did_cache_dir)) openml.utils._remove_cache_dir_for_id( - DATASETS_CACHE_DIR_NAME, did_cache_dir, + DATASETS_CACHE_DIR_NAME, + did_cache_dir, ) self.assertFalse(os.path.exists(did_cache_dir)) @@ -1526,7 +1534,10 @@ def test_data_fork(self): self.assertNotEqual(did, result) # Check server exception when unknown dataset is provided self.assertRaisesRegex( - OpenMLServerException, "Unknown dataset", fork_dataset, data_id=999999, + OpenMLServerException, + "Unknown dataset", + fork_dataset, + data_id=999999, ) def test_get_dataset_parquet(self): diff --git a/tests/test_extensions/test_functions.py b/tests/test_extensions/test_functions.py index 85361cc02..791e815e1 100644 --- a/tests/test_extensions/test_functions.py +++ b/tests/test_extensions/test_functions.py @@ -73,7 +73,8 @@ def test_get_extension_by_flow(self): self.assertIsInstance(get_extension_by_flow(DummyFlow()), DummyExtension1) register_extension(DummyExtension1) with self.assertRaisesRegex( - ValueError, "Multiple extensions registered which can handle flow:", + ValueError, + "Multiple extensions registered which can handle flow:", ): get_extension_by_flow(DummyFlow()) @@ -87,6 +88,7 @@ def test_get_extension_by_model(self): self.assertIsInstance(get_extension_by_model(DummyModel()), DummyExtension1) register_extension(DummyExtension1) with self.assertRaisesRegex( - ValueError, "Multiple extensions registered which can handle model:", + ValueError, + "Multiple extensions registered which can handle model:", ): get_extension_by_model(DummyModel()) diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index e45eeea53..0264b965d 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -736,10 +736,18 @@ def test_serialize_feature_union_switched_names(self): fu2 = sklearn.pipeline.FeatureUnion(transformer_list=[("scaler", ohe), ("ohe", scaler)]) fu1_serialization, _ = self._serialization_test_helper( - fu1, X=None, y=None, subcomponent_parameters=(), dependencies_mock_call_count=(3, 6), + fu1, + X=None, + y=None, + subcomponent_parameters=(), + dependencies_mock_call_count=(3, 6), ) fu2_serialization, _ = self._serialization_test_helper( - fu2, X=None, y=None, subcomponent_parameters=(), dependencies_mock_call_count=(3, 6), + fu2, + X=None, + y=None, + subcomponent_parameters=(), + dependencies_mock_call_count=(3, 6), ) # OneHotEncoder was moved to _encoders module in 0.20 @@ -1104,7 +1112,8 @@ def test_serialize_advanced_grid_fails(self): } clf = sklearn.model_selection.GridSearchCV( - sklearn.ensemble.BaggingClassifier(), param_grid=param_grid, + sklearn.ensemble.BaggingClassifier(), + param_grid=param_grid, ) with self.assertRaisesRegex( TypeError, re.compile(r".*OpenML.*Flow.*is not JSON serializable", flags=re.DOTALL) @@ -1513,7 +1522,9 @@ def test_obtain_parameter_values_flow_not_from_server(self): self.extension.obtain_parameter_values(flow) model = sklearn.ensemble.AdaBoostClassifier( - base_estimator=sklearn.linear_model.LogisticRegression(solver="lbfgs",) + base_estimator=sklearn.linear_model.LogisticRegression( + solver="lbfgs", + ) ) flow = self.extension.model_to_flow(model) flow.flow_id = 1 @@ -1546,14 +1557,14 @@ def test_obtain_parameter_values(self): self.assertEqual(parameter["oml:component"], 2) def test_numpy_type_allowed_in_flow(self): - """ Simple numpy types should be serializable. """ + """Simple numpy types should be serializable.""" dt = sklearn.tree.DecisionTreeClassifier( max_depth=np.float64(3.0), min_samples_leaf=np.int32(5) ) self.extension.model_to_flow(dt) def test_numpy_array_not_allowed_in_flow(self): - """ Simple numpy arrays should not be serializable. """ + """Simple numpy arrays should not be serializable.""" bin = sklearn.preprocessing.MultiLabelBinarizer(classes=np.asarray([1, 2, 3])) with self.assertRaises(TypeError): self.extension.model_to_flow(bin) @@ -1772,7 +1783,8 @@ def test_run_model_on_fold_classification_2(self): y_test = y[test_indices] pipeline = sklearn.model_selection.GridSearchCV( - sklearn.tree.DecisionTreeClassifier(), {"max_depth": [1, 2]}, + sklearn.tree.DecisionTreeClassifier(), + {"max_depth": [1, 2]}, ) # TODO add some mocking here to actually test the innards of this function, too! res = self.extension._run_model_on_fold( @@ -1947,7 +1959,11 @@ def test_run_model_on_fold_clustering(self): ) # TODO add some mocking here to actually test the innards of this function, too! res = self.extension._run_model_on_fold( - model=pipeline, task=task, fold_no=0, rep_no=0, X_train=X, + model=pipeline, + task=task, + fold_no=0, + rep_no=0, + X_train=X, ) y_hat, y_hat_proba, user_defined_measures, trace = res @@ -1984,7 +2000,9 @@ def test__extract_trace_data(self): num_iters = 10 task = openml.tasks.get_task(20) # balance-scale; crossvalidation clf = sklearn.model_selection.RandomizedSearchCV( - sklearn.neural_network.MLPClassifier(), param_grid, num_iters, + sklearn.neural_network.MLPClassifier(), + param_grid, + num_iters, ) # just run the task on the model (without invoking any fancy extension & openml code) train, _ = task.get_train_test_split_indices(0, 0) @@ -2149,7 +2167,8 @@ def test_run_on_model_with_empty_steps(self): self.assertEqual(flow.components["prep"].class_name, "sklearn.pipeline.Pipeline") self.assertIsInstance(flow.components["prep"].components["columntransformer"], OpenMLFlow) self.assertIsInstance( - flow.components["prep"].components["columntransformer"].components["cat"], OpenMLFlow, + flow.components["prep"].components["columntransformer"].components["cat"], + OpenMLFlow, ) self.assertEqual( flow.components["prep"].components["columntransformer"].components["cat"].name, "drop" @@ -2189,8 +2208,7 @@ def test_sklearn_serialization_with_none_step(self): reason="columntransformer introduction in 0.20.0", ) def test_failed_serialization_of_custom_class(self): - """Test to check if any custom class inherited from sklearn expectedly fails serialization - """ + """Test to check if any custom class inherited from sklearn expectedly fails serialization""" try: from sklearn.impute import SimpleImputer except ImportError: diff --git a/tests/test_flows/test_flow.py b/tests/test_flows/test_flow.py index 8d08f4eaf..50d152192 100644 --- a/tests/test_flows/test_flow.py +++ b/tests/test_flows/test_flow.py @@ -176,7 +176,8 @@ def test_publish_flow(self): parameters=collections.OrderedDict(), parameters_meta_info=collections.OrderedDict(), external_version=self.extension._format_external_version( - "sklearn", sklearn.__version__, + "sklearn", + sklearn.__version__, ), tags=[], language="English", @@ -368,7 +369,10 @@ def test_existing_flow_exists(self): steps = [ ("imputation", SimpleImputer(strategy="median")), ("hotencoding", sklearn.preprocessing.OneHotEncoder(**ohe_params)), - ("variencethreshold", sklearn.feature_selection.VarianceThreshold(),), + ( + "variencethreshold", + sklearn.feature_selection.VarianceThreshold(), + ), ("classifier", sklearn.tree.DecisionTreeClassifier()), ] complicated = sklearn.pipeline.Pipeline(steps=steps) @@ -387,7 +391,10 @@ def test_existing_flow_exists(self): # check if flow exists can find it flow = openml.flows.get_flow(flow.flow_id) - downloaded_flow_id = openml.flows.flow_exists(flow.name, flow.external_version,) + downloaded_flow_id = openml.flows.flow_exists( + flow.name, + flow.external_version, + ) self.assertEqual(downloaded_flow_id, flow.flow_id) def test_sklearn_to_upload_to_flow(self): diff --git a/tests/test_flows/test_flow_functions.py b/tests/test_flows/test_flow_functions.py index a65dcbf70..eb80c2861 100644 --- a/tests/test_flows/test_flow_functions.py +++ b/tests/test_flows/test_flow_functions.py @@ -112,10 +112,14 @@ def test_are_flows_equal(self): new_flow = copy.deepcopy(flow) setattr(new_flow, attribute, new_value) self.assertNotEqual( - getattr(flow, attribute), getattr(new_flow, attribute), + getattr(flow, attribute), + getattr(new_flow, attribute), ) self.assertRaises( - ValueError, openml.flows.functions.assert_flows_equal, flow, new_flow, + ValueError, + openml.flows.functions.assert_flows_equal, + flow, + new_flow, ) # Test that the API ignores several keys when comparing flows @@ -134,7 +138,8 @@ def test_are_flows_equal(self): new_flow = copy.deepcopy(flow) setattr(new_flow, attribute, new_value) self.assertNotEqual( - getattr(flow, attribute), getattr(new_flow, attribute), + getattr(flow, attribute), + getattr(new_flow, attribute), ) openml.flows.functions.assert_flows_equal(flow, new_flow) @@ -370,7 +375,8 @@ def test_get_flow_id(self): name=flow.name, exact_version=True ) flow_ids_exact_version_False = openml.flows.get_flow_id( - name=flow.name, exact_version=False, + name=flow.name, + exact_version=False, ) self.assertEqual(flow_ids_exact_version_True, flow_ids_exact_version_False) self.assertIn(flow.flow_id, flow_ids_exact_version_True) diff --git a/tests/test_openml/test_api_calls.py b/tests/test_openml/test_api_calls.py index 16bdbc7df..ecc7111fa 100644 --- a/tests/test_openml/test_api_calls.py +++ b/tests/test_openml/test_api_calls.py @@ -7,7 +7,8 @@ class TestConfig(openml.testing.TestBase): def test_too_long_uri(self): with self.assertRaisesRegex( - openml.exceptions.OpenMLServerError, "URI too long!", + openml.exceptions.OpenMLServerError, + "URI too long!", ): openml.datasets.list_datasets(data_id=list(range(10000))) diff --git a/tests/test_openml/test_config.py b/tests/test_openml/test_config.py index 638f02420..ba70689a1 100644 --- a/tests/test_openml/test_config.py +++ b/tests/test_openml/test_config.py @@ -37,7 +37,7 @@ def side_effect(path_): openml.config._setup() def test_get_config_as_dict(self): - """ Checks if the current configuration is returned accurately as a dict. """ + """Checks if the current configuration is returned accurately as a dict.""" config = openml.config.get_config_as_dict() _config = dict() _config["apikey"] = "610344db6388d9ba34f6db45a3cf71de" @@ -51,7 +51,7 @@ def test_get_config_as_dict(self): self.assertDictEqual(config, _config) def test_setup_with_config(self): - """ Checks if the OpenML configuration can be updated using _setup(). """ + """Checks if the OpenML configuration can be updated using _setup().""" _config = dict() _config["apikey"] = "610344db6388d9ba34f6db45a3cf71de" _config["server"] = "https://www.openml.org/api/v1/xml" @@ -68,7 +68,7 @@ def test_setup_with_config(self): class TestConfigurationForExamples(openml.testing.TestBase): def test_switch_to_example_configuration(self): - """ Verifies the test configuration is loaded properly. """ + """Verifies the test configuration is loaded properly.""" # Below is the default test key which would be used anyway, but just for clarity: openml.config.apikey = "610344db6388d9ba34f6db45a3cf71de" openml.config.server = self.production_server @@ -79,7 +79,7 @@ def test_switch_to_example_configuration(self): self.assertEqual(openml.config.server, self.test_server) def test_switch_from_example_configuration(self): - """ Verifies the previous configuration is loaded after stopping. """ + """Verifies the previous configuration is loaded after stopping.""" # Below is the default test key which would be used anyway, but just for clarity: openml.config.apikey = "610344db6388d9ba34f6db45a3cf71de" openml.config.server = self.production_server @@ -91,14 +91,14 @@ def test_switch_from_example_configuration(self): self.assertEqual(openml.config.server, self.production_server) def test_example_configuration_stop_before_start(self): - """ Verifies an error is raised is `stop_...` is called before `start_...`. """ + """Verifies an error is raised is `stop_...` is called before `start_...`.""" error_regex = ".*stop_use_example_configuration.*start_use_example_configuration.*first" self.assertRaisesRegex( RuntimeError, error_regex, openml.config.stop_using_configuration_for_example ) def test_example_configuration_start_twice(self): - """ Checks that the original config can be returned to if `start..` is called twice. """ + """Checks that the original config can be returned to if `start..` is called twice.""" openml.config.apikey = "610344db6388d9ba34f6db45a3cf71de" openml.config.server = self.production_server diff --git a/tests/test_openml/test_openml.py b/tests/test_openml/test_openml.py index 80f5e67f0..93d2e6925 100644 --- a/tests/test_openml/test_openml.py +++ b/tests/test_openml/test_openml.py @@ -15,7 +15,11 @@ class TestInit(TestBase): @mock.patch("openml.flows.functions.get_flow") @mock.patch("openml.runs.functions.get_run") def test_populate_cache( - self, run_mock, flow_mock, dataset_mock, task_mock, + self, + run_mock, + flow_mock, + dataset_mock, + task_mock, ): openml.populate_cache(task_ids=[1, 2], dataset_ids=[3, 4], flow_ids=[5, 6], run_ids=[7, 8]) self.assertEqual(run_mock.call_count, 2) @@ -27,7 +31,10 @@ def test_populate_cache( self.assertEqual(argument[0], fixture) self.assertEqual(dataset_mock.call_count, 2) - for argument, fixture in zip(dataset_mock.call_args_list, [(3,), (4,)],): + for argument, fixture in zip( + dataset_mock.call_args_list, + [(3,), (4,)], + ): self.assertEqual(argument[0], fixture) self.assertEqual(task_mock.call_count, 2) diff --git a/tests/test_runs/test_run.py b/tests/test_runs/test_run.py index dd0da5c00..88c998bc3 100644 --- a/tests/test_runs/test_run.py +++ b/tests/test_runs/test_run.py @@ -79,8 +79,14 @@ def _check_array(array, type_): int_part_prime = [line[:3] for line in run_prime_trace_content] _check_array(int_part_prime, int) - float_part = np.array(np.array(run_trace_content)[:, 3:4], dtype=float,) - float_part_prime = np.array(np.array(run_prime_trace_content)[:, 3:4], dtype=float,) + float_part = np.array( + np.array(run_trace_content)[:, 3:4], + dtype=float, + ) + float_part_prime = np.array( + np.array(run_prime_trace_content)[:, 3:4], + dtype=float, + ) bool_part = [line[4] for line in run_trace_content] bool_part_prime = [line[4] for line in run_prime_trace_content] for bp, bpp in zip(bool_part, bool_part_prime): @@ -113,7 +119,11 @@ def test_to_from_filesystem_vanilla(self): upload_flow=True, ) - cache_path = os.path.join(self.workdir, "runs", str(random.getrandbits(128)),) + cache_path = os.path.join( + self.workdir, + "runs", + str(random.getrandbits(128)), + ) run.to_filesystem(cache_path) run_prime = openml.runs.OpenMLRun.from_filesystem(cache_path) @@ -146,7 +156,10 @@ def test_to_from_filesystem_search(self): task = openml.tasks.get_task(119) # diabetes; crossvalidation run = openml.runs.run_model_on_task( - model=model, task=task, add_local_measures=False, avoid_duplicate_runs=False, + model=model, + task=task, + add_local_measures=False, + avoid_duplicate_runs=False, ) cache_path = os.path.join(self.workdir, "runs", str(random.getrandbits(128))) diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index 8eafb0a7b..7a860dab3 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -143,7 +143,9 @@ def _compare_predictions(self, predictions, predictions_prime): val_2 = predictions_prime["data"][idx][col_idx] if type(val_1) == float or type(val_2) == float: self.assertAlmostEqual( - float(val_1), float(val_2), places=6, + float(val_1), + float(val_2), + places=6, ) else: self.assertEqual(val_1, val_2) @@ -165,11 +167,17 @@ def _rerun_model_and_compare_predictions(self, run_id, model_prime, seed, create if create_task_obj: task = openml.tasks.get_task(run.task_id) run_prime = openml.runs.run_model_on_task( - model=model_prime, task=task, avoid_duplicate_runs=False, seed=seed, + model=model_prime, + task=task, + avoid_duplicate_runs=False, + seed=seed, ) else: run_prime = openml.runs.run_model_on_task( - model=model_prime, task=run.task_id, avoid_duplicate_runs=False, seed=seed, + model=model_prime, + task=run.task_id, + avoid_duplicate_runs=False, + seed=seed, ) predictions_prime = run_prime._generate_arff_dict() @@ -277,7 +285,9 @@ def _remove_random_state(flow): # test the initialize setup function run_id = run_.run_id run_server = openml.runs.get_run(run_id) - clf_server = openml.setups.initialize_model(setup_id=run_server.setup_id,) + clf_server = openml.setups.initialize_model( + setup_id=run_server.setup_id, + ) flow_local = self.extension.model_to_flow(clf) flow_server = self.extension.model_to_flow(clf_server) @@ -299,7 +309,9 @@ def _remove_random_state(flow): openml.flows.assert_flows_equal(flow_local, flow_server) # and test the initialize setup from run function - clf_server2 = openml.runs.initialize_model_from_run(run_id=run_server.run_id,) + clf_server2 = openml.runs.initialize_model_from_run( + run_id=run_server.run_id, + ) flow_server2 = self.extension.model_to_flow(clf_server2) if flow.class_name not in classes_without_random_state: self.assertEqual(flow_server2.parameters["random_state"], flow_expected_rsv) @@ -382,7 +394,10 @@ def test_run_regression_on_classif_task(self): AttributeError, "'LinearRegression' object has no attribute 'classes_'" ): openml.runs.run_model_on_task( - model=clf, task=task, avoid_duplicate_runs=False, dataset_format="array", + model=clf, + task=task, + avoid_duplicate_runs=False, + dataset_format="array", ) def test_check_erronous_sklearn_flow_fails(self): @@ -396,7 +411,8 @@ def test_check_erronous_sklearn_flow_fails(self): r"Penalty term must be positive; got \(C=u?'abc'\)", # u? for 2.7/3.4-6 compability ): openml.runs.run_model_on_task( - task=task, model=clf, + task=task, + model=clf, ) ########################################################################### @@ -474,7 +490,9 @@ def determine_grid_size(param_grid): self._wait_for_processed_run(run.run_id, 600) try: model_prime = openml.runs.initialize_model_from_trace( - run_id=run.run_id, repeat=0, fold=0, + run_id=run.run_id, + repeat=0, + fold=0, ) except openml.exceptions.OpenMLServerException as e: e.message = "%s; run_id %d" % (e.message, run.run_id) @@ -815,8 +833,8 @@ def test_learning_curve_task_2(self): RandomizedSearchCV( DecisionTreeClassifier(), { - "min_samples_split": [2 ** x for x in range(1, 8)], - "min_samples_leaf": [2 ** x for x in range(0, 7)], + "min_samples_split": [2**x for x in range(1, 8)], + "min_samples_leaf": [2**x for x in range(0, 7)], }, cv=3, n_iter=10, @@ -858,7 +876,10 @@ def test_initialize_cv_from_run(self): task = openml.tasks.get_task(11) # kr-vs-kp; holdout run = openml.runs.run_model_on_task( - model=randomsearch, task=task, avoid_duplicate_runs=False, seed=1, + model=randomsearch, + task=task, + avoid_duplicate_runs=False, + seed=1, ) run_ = run.publish() TestBase._mark_entity_for_removal("run", run.run_id) @@ -896,7 +917,10 @@ def _test_local_evaluations(self, run): else: tests.append((sklearn.metrics.jaccard_score, {})) for test_idx, test in enumerate(tests): - alt_scores = run.get_metric_fn(sklearn_fn=test[0], kwargs=test[1],) + alt_scores = run.get_metric_fn( + sklearn_fn=test[0], + kwargs=test[1], + ) self.assertEqual(len(alt_scores), 10) for idx in range(len(alt_scores)): self.assertGreaterEqual(alt_scores[idx], 0) @@ -909,7 +933,10 @@ def test_local_run_swapped_parameter_order_model(self): # task and clf are purposely in the old order run = openml.runs.run_model_on_task( - task, clf, avoid_duplicate_runs=False, upload_flow=False, + task, + clf, + avoid_duplicate_runs=False, + upload_flow=False, ) self._test_local_evaluations(run) @@ -935,7 +962,10 @@ def test_local_run_swapped_parameter_order_flow(self): # invoke OpenML run run = openml.runs.run_flow_on_task( - task, flow, avoid_duplicate_runs=False, upload_flow=False, + task, + flow, + avoid_duplicate_runs=False, + upload_flow=False, ) self._test_local_evaluations(run) @@ -960,7 +990,10 @@ def test_local_run_metric_score(self): # invoke OpenML run run = openml.runs.run_model_on_task( - model=clf, task=task, avoid_duplicate_runs=False, upload_flow=False, + model=clf, + task=task, + avoid_duplicate_runs=False, + upload_flow=False, ) self._test_local_evaluations(run) @@ -1013,7 +1046,11 @@ def test_initialize_model_from_run(self): TestBase.logger.info("collected from test_run_functions: {}".format(task_id)) task = openml.tasks.get_task(task_id) - run = openml.runs.run_model_on_task(model=clf, task=task, avoid_duplicate_runs=False,) + run = openml.runs.run_model_on_task( + model=clf, + task=task, + avoid_duplicate_runs=False, + ) run_ = run.publish() TestBase._mark_entity_for_removal("run", run_.run_id) TestBase.logger.info("collected from test_run_functions: {}".format(run_.run_id)) @@ -1098,7 +1135,9 @@ def test_run_with_illegal_flow_id(self): ) with self.assertRaisesRegex(openml.exceptions.PyOpenMLError, expected_message_regex): openml.runs.run_flow_on_task( - task=task, flow=flow, avoid_duplicate_runs=True, + task=task, + flow=flow, + avoid_duplicate_runs=True, ) def test_run_with_illegal_flow_id_after_load(self): @@ -1113,7 +1152,11 @@ def test_run_with_illegal_flow_id_after_load(self): task=task, flow=flow, avoid_duplicate_runs=False, upload_flow=False ) - cache_path = os.path.join(self.workdir, "runs", str(random.getrandbits(128)),) + cache_path = os.path.join( + self.workdir, + "runs", + str(random.getrandbits(128)), + ) run.to_filesystem(cache_path) loaded_run = openml.runs.OpenMLRun.from_filesystem(cache_path) @@ -1144,7 +1187,9 @@ def test_run_with_illegal_flow_id_1(self): expected_message_regex = "Local flow_id does not match server flow_id: " "'-1' vs '[0-9]+'" with self.assertRaisesRegex(openml.exceptions.PyOpenMLError, expected_message_regex): openml.runs.run_flow_on_task( - task=task, flow=flow_new, avoid_duplicate_runs=True, + task=task, + flow=flow_new, + avoid_duplicate_runs=True, ) def test_run_with_illegal_flow_id_1_after_load(self): @@ -1167,7 +1212,11 @@ def test_run_with_illegal_flow_id_1_after_load(self): task=task, flow=flow_new, avoid_duplicate_runs=False, upload_flow=False ) - cache_path = os.path.join(self.workdir, "runs", str(random.getrandbits(128)),) + cache_path = os.path.join( + self.workdir, + "runs", + str(random.getrandbits(128)), + ) run.to_filesystem(cache_path) loaded_run = openml.runs.OpenMLRun.from_filesystem(cache_path) @@ -1488,7 +1537,10 @@ def test_run_flow_on_task_downloaded_flow(self): downloaded_flow = openml.flows.get_flow(flow.flow_id) task = openml.tasks.get_task(self.TEST_SERVER_TASK_SIMPLE["task_id"]) run = openml.runs.run_flow_on_task( - flow=downloaded_flow, task=task, avoid_duplicate_runs=False, upload_flow=False, + flow=downloaded_flow, + task=task, + avoid_duplicate_runs=False, + upload_flow=False, ) run.publish() @@ -1573,7 +1625,7 @@ def test_format_prediction_task_regression(self): ) @unittest.mock.patch("openml.extensions.sklearn.SklearnExtension._prevent_optimize_n_jobs") def test__run_task_get_arffcontent_2(self, parallel_mock): - """ Tests if a run executed in parallel is collated correctly. """ + """Tests if a run executed in parallel is collated correctly.""" task = openml.tasks.get_task(7) # Supervised Classification on kr-vs-kp x, y = task.get_X_and_y(dataset_format="dataframe") num_instances = x.shape[0] @@ -1626,7 +1678,7 @@ def test__run_task_get_arffcontent_2(self, parallel_mock): ) @unittest.mock.patch("openml.extensions.sklearn.SklearnExtension._prevent_optimize_n_jobs") def test_joblib_backends(self, parallel_mock): - """ Tests evaluation of a run using various joblib backends and n_jobs. """ + """Tests evaluation of a run using various joblib backends and n_jobs.""" task = openml.tasks.get_task(7) # Supervised Classification on kr-vs-kp x, y = task.get_X_and_y(dataset_format="dataframe") num_instances = x.shape[0] diff --git a/tests/test_runs/test_trace.py b/tests/test_runs/test_trace.py index 96724d139..0b4b64359 100644 --- a/tests/test_runs/test_trace.py +++ b/tests/test_runs/test_trace.py @@ -25,19 +25,22 @@ def test_get_selected_iteration(self): # This next one should simply not fail self.assertEqual(trace.get_selected_iteration(2, 2), 2) with self.assertRaisesRegex( - ValueError, "Could not find the selected iteration for rep/fold 3/3", + ValueError, + "Could not find the selected iteration for rep/fold 3/3", ): trace.get_selected_iteration(3, 3) def test_initialization(self): - """Check all different ways to fail the initialization """ + """Check all different ways to fail the initialization""" with self.assertRaisesRegex( - ValueError, "Trace content not available.", + ValueError, + "Trace content not available.", ): OpenMLRunTrace.generate(attributes="foo", content=None) with self.assertRaisesRegex( - ValueError, "Trace attributes not available.", + ValueError, + "Trace attributes not available.", ): OpenMLRunTrace.generate(attributes=None, content="foo") with self.assertRaisesRegex(ValueError, "Trace content is empty."): diff --git a/tests/test_setups/test_setup_functions.py b/tests/test_setups/test_setup_functions.py index 538b08821..464431b94 100644 --- a/tests/test_setups/test_setup_functions.py +++ b/tests/test_setups/test_setup_functions.py @@ -87,7 +87,9 @@ def side_effect(self): self.priors = None with unittest.mock.patch.object( - sklearn.naive_bayes.GaussianNB, "__init__", side_effect, + sklearn.naive_bayes.GaussianNB, + "__init__", + side_effect, ): # Check a flow with zero hyperparameters nb = sklearn.naive_bayes.GaussianNB() diff --git a/tests/test_study/test_study_functions.py b/tests/test_study/test_study_functions.py index 904df4d3a..3d7811f6e 100644 --- a/tests/test_study/test_study_functions.py +++ b/tests/test_study/test_study_functions.py @@ -44,7 +44,8 @@ def test_get_study_error(self): openml.config.server = self.production_server with self.assertRaisesRegex( - ValueError, "Unexpected entity type 'task' reported by the server, expected 'run'", + ValueError, + "Unexpected entity type 'task' reported by the server, expected 'run'", ): openml.study.get_study(99) @@ -62,7 +63,8 @@ def test_get_suite_error(self): openml.config.server = self.production_server with self.assertRaisesRegex( - ValueError, "Unexpected entity type 'run' reported by the server, expected 'task'", + ValueError, + "Unexpected entity type 'run' reported by the server, expected 'task'", ): openml.study.get_suite(123) diff --git a/tests/test_tasks/test_split.py b/tests/test_tasks/test_split.py index 7c3dcf9aa..7d8004a91 100644 --- a/tests/test_tasks/test_split.py +++ b/tests/test_tasks/test_split.py @@ -82,8 +82,16 @@ def test_get_split(self): self.assertEqual(train_split.shape[0], 808) self.assertEqual(test_split.shape[0], 90) self.assertRaisesRegex( - ValueError, "Repeat 10 not known", split.get, 10, 2, + ValueError, + "Repeat 10 not known", + split.get, + 10, + 2, ) self.assertRaisesRegex( - ValueError, "Fold 10 not known", split.get, 2, 10, + ValueError, + "Fold 10 not known", + split.get, + 2, + 10, ) diff --git a/tests/test_tasks/test_task_functions.py b/tests/test_tasks/test_task_functions.py index 418b21b65..be5b0c9bd 100644 --- a/tests/test_tasks/test_task_functions.py +++ b/tests/test_tasks/test_task_functions.py @@ -143,7 +143,15 @@ def test_get_task(self): self.assertIsInstance(task, OpenMLTask) self.assertTrue( os.path.exists( - os.path.join(self.workdir, "org", "openml", "test", "tasks", "1", "task.xml",) + os.path.join( + self.workdir, + "org", + "openml", + "test", + "tasks", + "1", + "task.xml", + ) ) ) self.assertTrue( @@ -162,7 +170,15 @@ def test_get_task_lazy(self): self.assertIsInstance(task, OpenMLTask) self.assertTrue( os.path.exists( - os.path.join(self.workdir, "org", "openml", "test", "tasks", "2", "task.xml",) + os.path.join( + self.workdir, + "org", + "openml", + "test", + "tasks", + "2", + "task.xml", + ) ) ) self.assertEqual(task.class_labels, ["1", "2", "3", "4", "5", "U"]) @@ -230,7 +246,10 @@ def test_download_split(self): def test_deletion_of_cache_dir(self): # Simple removal - tid_cache_dir = openml.utils._create_cache_directory_for_id("tasks", 1,) + tid_cache_dir = openml.utils._create_cache_directory_for_id( + "tasks", + 1, + ) self.assertTrue(os.path.exists(tid_cache_dir)) openml.utils._remove_cache_dir_for_id("tasks", tid_cache_dir) self.assertFalse(os.path.exists(tid_cache_dir)) diff --git a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py index 4fa08e1ab..a5add31c8 100644 --- a/tests/test_utils/test_utils.py +++ b/tests/test_utils/test_utils.py @@ -98,6 +98,7 @@ def test__create_cache_directory(self, config_mock): os.chmod(subdir, 0o444) config_mock.return_value = subdir with self.assertRaisesRegex( - openml.exceptions.OpenMLCacheException, r"Cannot create cache directory", + openml.exceptions.OpenMLCacheException, + r"Cannot create cache directory", ): openml.utils._create_cache_directory("ghi") From 9a740e6e8ebc42e4f021e44c6da45552401e826b Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Fri, 1 Jul 2022 16:54:24 +0200 Subject: [PATCH 3/8] Update error code for "print" Changed in flake8-print 5.0.0: https://pypi.org/project/flake8-print/ --- .flake8 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.flake8 b/.flake8 index 211234f22..2d17eec10 100644 --- a/.flake8 +++ b/.flake8 @@ -5,7 +5,7 @@ select = C,E,F,W,B,T ignore = E203, E402, W503 per-file-ignores = *__init__.py:F401 - *cli.py:T001 + *cli.py:T201 exclude = venv examples From 3618131293a225ce78a1ae7c3ad7a0133b21afaf Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Fri, 1 Jul 2022 16:56:34 +0200 Subject: [PATCH 4/8] Shorten comment to observe line length codestyle --- .../test_sklearn_extension/test_sklearn_extension.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index 0264b965d..a906d7ebd 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -2208,7 +2208,7 @@ def test_sklearn_serialization_with_none_step(self): reason="columntransformer introduction in 0.20.0", ) def test_failed_serialization_of_custom_class(self): - """Test to check if any custom class inherited from sklearn expectedly fails serialization""" + """Check if any custom class inherited from sklearn expectedly fails serialization""" try: from sklearn.impute import SimpleImputer except ImportError: From 6cd8c688305ae5323216ddc23e0ac1fa2e3bea42 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Fri, 1 Jul 2022 16:59:34 +0200 Subject: [PATCH 5/8] Install stubs for requests for mypy --- .pre-commit-config.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b4ad7509d..95cd660ab 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -10,9 +10,13 @@ repos: - id: mypy name: mypy openml files: openml/.* + additional_dependencies: + - types-requests - id: mypy name: mypy tests files: tests/.* + additional_dependencies: + - types-requests - repo: https://gitlab.com/pycqa/flake8 rev: 4.0.1 hooks: From ac0364c11be202f589e5db0b2471209bc0549024 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Fri, 1 Jul 2022 17:30:23 +0200 Subject: [PATCH 6/8] Add dependency for mypy dateutil type stubs --- .pre-commit-config.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 95cd660ab..ebea5251e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -12,11 +12,13 @@ repos: files: openml/.* additional_dependencies: - types-requests + - types-python-dateutil - id: mypy name: mypy tests files: tests/.* additional_dependencies: - types-requests + - types-python-dateutil - repo: https://gitlab.com/pycqa/flake8 rev: 4.0.1 hooks: From 0a80835477564be8ad67816332a88aa5ffeee0ba Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Fri, 1 Jul 2022 17:30:46 +0200 Subject: [PATCH 7/8] Resolve mypy warnings --- openml/datasets/dataset.py | 4 ++-- openml/extensions/sklearn/extension.py | 6 +++--- openml/flows/functions.py | 7 +++++-- openml/tasks/task.py | 2 +- 4 files changed, 11 insertions(+), 8 deletions(-) diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index 8d5606912..6f3f66853 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -354,8 +354,8 @@ def decode_arff(fh): return decoder.decode(fh, encode_nominal=True, return_type=return_type) if filename[-3:] == ".gz": - with gzip.open(filename) as fh: - return decode_arff(fh) + with gzip.open(filename) as zipfile: + return decode_arff(zipfile) else: with open(filename, encoding="utf8") as fh: return decode_arff(fh) diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py index c2a1d6bde..f8936b0db 100644 --- a/openml/extensions/sklearn/extension.py +++ b/openml/extensions/sklearn/extension.py @@ -11,7 +11,7 @@ from re import IGNORECASE import sys import time -from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union, cast +from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union, cast, Sized import warnings import numpy as np @@ -499,7 +499,7 @@ def _serialize_sklearn(self, o: Any, parent_model: Optional[Any] = None) -> Any: rval = tuple(rval) elif isinstance(o, SIMPLE_TYPES) or o is None: if isinstance(o, tuple(SIMPLE_NUMPY_TYPES)): - o = o.item() + o = o.item() # type: ignore # base parameter values rval = o elif isinstance(o, dict): @@ -1357,7 +1357,7 @@ def _serialize_cross_validator(self, o: Any) -> "OrderedDict[str, Union[str, Dic # if the parameter is deprecated, don't show it continue - if not (hasattr(value, "__len__") and len(value) == 0): + if not (isinstance(value, Sized) and len(value) == 0): value = json.dumps(value) parameters[key] = value else: diff --git a/openml/flows/functions.py b/openml/flows/functions.py index 85546a0a3..73c2b1d3a 100644 --- a/openml/flows/functions.py +++ b/openml/flows/functions.py @@ -434,6 +434,9 @@ def assert_flows_equal( attr1 = getattr(flow1, key, None) attr2 = getattr(flow2, key, None) if key == "components": + if not (isinstance(attr1, Dict) and isinstance(attr2, Dict)): + raise TypeError("Cannot compare components because they are not dictionary.") + for name in set(attr1.keys()).union(attr2.keys()): if name not in attr1: raise ValueError( @@ -495,8 +498,8 @@ def assert_flows_equal( # dictionary with keys specifying the parameter's 'description' and 'data_type' # checking parameter descriptions can be ignored since that might change # data type check can also be ignored if one of them is not defined, i.e., None - params1 = set(flow1.parameters_meta_info.keys()) - params2 = set(flow2.parameters_meta_info.keys()) + params1 = set(flow1.parameters_meta_info) + params2 = set(flow2.parameters_meta_info) if params1 != params2: raise ValueError( "Parameter list in meta info for parameters differ " "in the two flows." diff --git a/openml/tasks/task.py b/openml/tasks/task.py index 7f2e53a65..095730645 100644 --- a/openml/tasks/task.py +++ b/openml/tasks/task.py @@ -176,7 +176,7 @@ def _to_dict(self) -> "OrderedDict[str, OrderedDict]": task_container = OrderedDict() # type: OrderedDict[str, OrderedDict] task_dict = OrderedDict( [("@xmlns:oml", "http://openml.org/openml")] - ) # type: OrderedDict[str, Union[List, str, TaskType]] + ) # type: OrderedDict[str, Union[List, str, int]] task_container["oml:task_inputs"] = task_dict task_dict["oml:task_type_id"] = self.task_type_id.value From 881615fc8a5db600e231bb8f347e28824fda7b6a Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Fri, 1 Jul 2022 17:32:30 +0200 Subject: [PATCH 8/8] Add update pre-commit dependencies notice --- doc/progress.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/progress.rst b/doc/progress.rst index 02dd78086..88b0dd29d 100644 --- a/doc/progress.rst +++ b/doc/progress.rst @@ -12,6 +12,7 @@ Changelog * FIX#1110: Make arguments to ``create_study`` and ``create_suite`` that are defined as optional by the OpenML XSD actually optional. * FIX#1147: ``openml.flow.flow_exists`` no longer requires an API key. * MAIN#1088: Do CI for Windows on Github Actions instead of Appveyor. + * MAIN#1146: Update the pre-commit dependencies. * ADD#1103: Add a ``predictions`` property to OpenMLRun for easy accessibility of prediction data.