diff --git a/.github/workflows/pre_commit.yaml b/.github/workflows/pre_commit.yaml new file mode 100644 index 00000000..04566544 --- /dev/null +++ b/.github/workflows/pre_commit.yaml @@ -0,0 +1,23 @@ +name: Pre-commit Hooks + + +on: [pull_request] + +jobs: + run-pre-commit: + runs-on: ubuntu-latest + timeout-minutes: 2 + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.9' + + - name: Install pre-commit + run: pip install pre-commit + + - name: Run pre-commit Hooks + run: pre-commit run --all-files diff --git a/.gitignore b/.gitignore index 00b31a50..16d241f5 100644 --- a/.gitignore +++ b/.gitignore @@ -104,4 +104,4 @@ data/ .DS_Store .virtualenvs/ test.env -run_tests_single.sh \ No newline at end of file +run_tests_single.sh diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..73fd0ca9 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,27 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.5.0 + hooks: + - id: trailing-whitespace + files: '\.py$' + - id: end-of-file-fixer + files: '\.py$' + - id: check-yaml + - id: check-added-large-files + files: '\.py$' + + + + + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.4.1 + hooks: + + - id: ruff-format + args: + - '--exclude=import_specifications/clients/baseclient.py' + - id: ruff + args: + - '--exclude=import_specifications/clients/baseclient.py' + + diff --git a/.travis.yml b/.travis.yml index b891d07f..02b8120c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,4 +13,4 @@ after_success: env: global: - secure: "mH9OrGjZYV021EfQPjx0hnwi7/UlGVNTnE6kC81O1/9/lOWihnDKBb/GXuWyj7hbiCZgA2ofDblkm+bIZQxU/+40k8ZrQyk4xe9Bm3jM9MCZ7y6r3g3HZnKXGbzdQTACLoJwdHRWEnYWcBC+aEnDsJjm/uh+gFE3WFXABHO5nOBVBJ1vnZoqczPnLIqOtTWwybV58eYxXOm7J6GvxrTWxdqoEOKsLcjdbm9dej/i9xsxHr6x0I/fiB8mFsIgl1QS9WobpTjs1AQghShz8oFO4YI70mJF4b9tU0FbfF1Wm6K4fe69ff65RwUhT9o8ZgwBt/mzrDFTLMI9f/3J8v6Bu5UxQ1yEPc9QP78IWW/g2G20WxjLzmbKogsGZkSyPEz1TLq8CgEGvU5IOoDSW1NjQJr9XQaE8A1ILDEfi1xVG/d6o8uhEMEiloveRa8iKdNFoGv5O0fbS8RC6KVs9SPP0nkApmv29MhWf0injEROZNXiDJr6Da4HiU+RkDwdM8rbIOS0586t/czVIRremmon5Xb9NDZapLbqdIX/zd//IC6WGM0ytX36t8FgU0Du+V+KfaAc7XP3ff+GLUL/sxMGGjdl+gCCdUCrTPCl2+D9P54/HRB5Q5xKNnO/OVou9WOe1HanthH5n2QGelzLisU2OxCiWl/iZ15pl+CGntF9+Ek=" - - secure: "sHLRR/xLA6fiPkJB6IolHUaJxNJJYcfSKG3MmRAU7v0LxxoqUkneBfk9jyk2GdDdtOR+RocOQmwie0Vf6ik5bIxcs/Bse1tR0+yhmg9EpgClokfgZN8815yNVclGkVFWZS7BhRjm7mt0K3Vo7Of0mKikp5L/tYmwQy1hesJ+mTa83igeQKpFU3QgkRJB2ALgSI54ruhO88x+9HtLqkhN8KqHj5ykGzS7ShagFywflFDVFWJoBK76NideU0gTU7+2dNV0+EPmnRWs9QoAh5LSUDa8v7fo+I2WHFCOLfiKzaNMc85+Gc72qyxvkpWl2UCJpiFVK1X4DFGvHYU9FzM0W9VuhrBAa3h1XFncYk52WzKNGVvQnOZE965iNrl/xSh2oTUOfyVIfX5JuldEWffU/vhAjvnqfsoq17geRxLyqbRx/4ZvNq6jf3Ef6kLwKlnPzcw85zZs2jsy3qC5UGJf0xqchWfsqxACvmGhVyv+KxHJxaFq48UHp6Ezb/X2DweD6/6WPfwXI6cOwXy6Uj3uvkYSsKE7HJu3K9aMeh158ZMrW/JeyH5dCRAv1qeAyLN0yOw/lPUapdrbB9nXkLbHIOhV8b6A2cOtkkuM6HXXBxICJ5BIHmVzO+4W7VXueXfVjAVY2+xs0ouFsPIOINfR5hgsglncDY+r/wU8LrUHxHg=" \ No newline at end of file + - secure: 
"sHLRR/xLA6fiPkJB6IolHUaJxNJJYcfSKG3MmRAU7v0LxxoqUkneBfk9jyk2GdDdtOR+RocOQmwie0Vf6ik5bIxcs/Bse1tR0+yhmg9EpgClokfgZN8815yNVclGkVFWZS7BhRjm7mt0K3Vo7Of0mKikp5L/tYmwQy1hesJ+mTa83igeQKpFU3QgkRJB2ALgSI54ruhO88x+9HtLqkhN8KqHj5ykGzS7ShagFywflFDVFWJoBK76NideU0gTU7+2dNV0+EPmnRWs9QoAh5LSUDa8v7fo+I2WHFCOLfiKzaNMc85+Gc72qyxvkpWl2UCJpiFVK1X4DFGvHYU9FzM0W9VuhrBAa3h1XFncYk52WzKNGVvQnOZE965iNrl/xSh2oTUOfyVIfX5JuldEWffU/vhAjvnqfsoq17geRxLyqbRx/4ZvNq6jf3Ef6kLwKlnPzcw85zZs2jsy3qC5UGJf0xqchWfsqxACvmGhVyv+KxHJxaFq48UHp6Ezb/X2DweD6/6WPfwXI6cOwXy6Uj3uvkYSsKE7HJu3K9aMeh158ZMrW/JeyH5dCRAv1qeAyLN0yOw/lPUapdrbB9nXkLbHIOhV8b6A2cOtkkuM6HXXBxICJ5BIHmVzO+4W7VXueXfVjAVY2+xs0ouFsPIOINfR5hgsglncDY+r/wU8LrUHxHg=" diff --git a/.vscode/launch.json b/.vscode/launch.json index 4abd742d..09873874 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -75,4 +75,4 @@ "host": "localhost" } ] -} \ No newline at end of file +} diff --git a/.vscode/settings.json b/.vscode/settings.json index a7adfa62..f01bd180 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -4,4 +4,4 @@ "files.autoSave": "afterDelay", "files.autoSaveDelay": 500, "python.pythonPath": "python3" -} \ No newline at end of file +} diff --git a/README.md b/README.md index c85dc418..28ad89b1 100644 --- a/README.md +++ b/README.md @@ -632,7 +632,7 @@ cannot decompress a file ## Add Globus ACL -After authenticating at this endpoint, AUTH is queried to get your filepath and globus id file for +After authenticating at this endpoint, AUTH is queried to get your filepath and globus id file for linking to globus. **URL** : `ci.kbase.us/services/staging_service/add-acl` @@ -675,7 +675,7 @@ Error Connecting to auth service ... **Content** ``` { - 'success': False, + 'success': False, 'error_type': 'TransferAPIError', 'error': "Can't create ACL rule; it already exists", 'error_code': 'Exists', 'shared_directory_basename': '/username/' @@ -684,7 +684,7 @@ Error Connecting to auth service ... ## Remove Globus ACL -After authenticating at this endpoint, AUTH is queried to get your filepath and globus id file for +After authenticating at this endpoint, AUTH is queried to get your filepath and globus id file for linking to globus. **URL** : `ci.kbase.us/services/staging_service/remove-acl` @@ -704,7 +704,7 @@ linking to globus. ``` { "message": "{\n \"DATA_TYPE\": \"result\",\n \"code\": \"Deleted\", - "message\": \"Access rule 'KBASE-examplex766ada0-x8aa-x1e8-xc7b-xa1d4c5c824a' deleted successfully\", + "message\": \"Access rule 'KBASE-examplex766ada0-x8aa-x1e8-xc7b-xa1d4c5c824a' deleted successfully\", "request_id\": \"x2KFzfop05\",\n \"resource\": \"/endpoint/KBaseExample2a-5e5b-11e6-8309-22000b97daec/access/KBaseExample-ada0-d8aa-11e8-8c7b-0a1d4c5c824a\"}", "Success": true } @@ -727,7 +727,7 @@ Error Connecting to auth service ... **Content** ``` { - 'success': False, + 'success': False, 'error_type': 'TransferAPIError', 'error': "Can't create ACL rule; it already exists", 'error_code': 'Exists', 'shared_directory_basename': '/username/' @@ -801,7 +801,7 @@ Reponse: in the data file. Each data file row is provided in order for each type. Each row is provided in a mapping of `spec.json` ID to the data for the row. Lines > 3 in the templates are user-provided data, and each line corresponds to a single import or analysis. - + ### Error Response Error reponses are of the general form: @@ -985,7 +985,7 @@ Reponse: * `files` contains a mapping of each provided data type to the output template file for that type. 
In the case of Excel, all the file paths will be the same since the data types are all written to different tabs in the same file. - + ### Error Response Method specific errors have the form: @@ -1012,7 +1012,7 @@ This endpoint returns: For example, * if we pass in nothing we get a response with no mappings * if we pass in a list of files, such as ["file1.fasta", "file2.fq", "None"], we would get back a - response that maps to Fasta Importers and FastQ Importers, with a weight of 0 to 1 + response that maps to Fasta Importers and FastQ Importers, with a weight of 0 to 1 which represents the probability that this is the correct importer for you. * for files for which there is no predicted app, the return is a null value * this endpoint is used to power the dropdowns for the staging service window in the Narrative @@ -1071,7 +1071,7 @@ Response: **Content** ``` -must provide file_list field +must provide file_list field ``` ## Get importer filetypes diff --git a/build/push2dockerhub.sh b/build/push2dockerhub.sh index dce8b698..a1badb80 100755 --- a/build/push2dockerhub.sh +++ b/build/push2dockerhub.sh @@ -1,5 +1,5 @@ #!/bin/bash -# +# # This script is intended to be run in the deploy stage of a travis build # It checks to make sure that this is a not a PR, and that we have the secure # environment variables available and then checks if this is either the master diff --git a/deployment/bin/entrypoint.sh b/deployment/bin/entrypoint.sh index 787b2446..739fe1e0 100755 --- a/deployment/bin/entrypoint.sh +++ b/deployment/bin/entrypoint.sh @@ -16,4 +16,4 @@ if [ -d "kb/deployment/lib/staging_service" ]; then PYTHONPATH="kb/deployment/lib/staging_service" # environment variable for KB_DEPLOYMENT_CONFIG set in docker-compose.yml fi -python3 -m staging_service \ No newline at end of file +python3 -m staging_service diff --git a/deployment/conf/deployment.cfg b/deployment/conf/deployment.cfg index be7851a4..1da09aaa 100644 --- a/deployment/conf/deployment.cfg +++ b/deployment/conf/deployment.cfg @@ -3,4 +3,4 @@ META_DIR = /kb/deployment/lib/src/data/metadata/ DATA_DIR = /kb/deployment/lib/src/data/bulk/ AUTH_URL = https://ci.kbase.us/services/auth/api/V2/token CONCIERGE_PATH = /kbaseconcierge -FILE_EXTENSION_MAPPINGS = /kb/deployment/conf/supported_apps_w_extensions.json \ No newline at end of file +FILE_EXTENSION_MAPPINGS = /kb/deployment/conf/supported_apps_w_extensions.json diff --git a/deployment/conf/local.cfg b/deployment/conf/local.cfg index b0033d2b..3f1794ec 100644 --- a/deployment/conf/local.cfg +++ b/deployment/conf/local.cfg @@ -2,4 +2,4 @@ META_DIR = ./data/metadata/ DATA_DIR = ./data/bulk/ AUTH_URL = https://ci.kbase.us/services/auth/api/V2/token -CONCIERGE_PATH = /kbaseconcierge \ No newline at end of file +CONCIERGE_PATH = /kbaseconcierge diff --git a/deployment/conf/supported_apps_w_extensions.json b/deployment/conf/supported_apps_w_extensions.json index 205fc0b0..545a862d 100644 --- a/deployment/conf/supported_apps_w_extensions.json +++ b/deployment/conf/supported_apps_w_extensions.json @@ -1126,4 +1126,4 @@ ] } } -} \ No newline at end of file +} diff --git a/deployment/conf/testing.cfg b/deployment/conf/testing.cfg index 432e99b7..eaf9f62a 100644 --- a/deployment/conf/testing.cfg +++ b/deployment/conf/testing.cfg @@ -4,4 +4,4 @@ DATA_DIR = ./data/bulk/ AUTH_URL = https://ci.kbase.us/services/auth/api/V2/token CONCIERGE_PATH = /kbaseconcierge FILE_EXTENSION_MAPPINGS = ./deployment/conf/supported_apps_w_extensions.json -;FILE_EXTENSION_MAPPINGS_PYCHARM = 
../deployment/conf/supported_apps_w_extensions.json \ No newline at end of file +;FILE_EXTENSION_MAPPINGS_PYCHARM = ../deployment/conf/supported_apps_w_extensions.json diff --git a/docs/import_specifications.ADR.md b/docs/import_specifications.ADR.md index dbd04386..c49bbd1e 100644 --- a/docs/import_specifications.ADR.md +++ b/docs/import_specifications.ADR.md @@ -13,7 +13,7 @@ to import one or more files in the staging area to KBase as KBase data types. ## Front end changes -The design introduces a new StS data type, `import_specification`. The FE's current +The design introduces a new StS data type, `import_specification`. The FE's current behavior is to display any data types returned from the StS in the file dropdown, but silently ignore user-selected files for which the selected data type is unknown to the narrative, a bug. The FE will be updated to ignore unknown data types returned from the StS, allowing for phased, @@ -47,7 +47,7 @@ The file, by row, is: * The version allows us to update the file format and increment the version, allowing backwards compatibility - the staging service can process the file appropriately depending on the version number. -2. The IDs of the app inputs from the `spec.json` file. +2. The IDs of the app inputs from the `spec.json` file. 3. The corresponding human readable names of the app inputs from the `display.yaml` file. 4. (and beyond) Import specifications. Each line corresponds to a single import. @@ -89,7 +89,7 @@ The front end will be expected to either ## User operations * The user uploads the import specification files to the staging area along with all the files - inluded in the specification. + inluded in the specification. * The user selects the `Import Specification` type for the specification files. * The user may also select other files in the staging area to include in the import along with the files listed in the specification. @@ -277,16 +277,16 @@ Note in this case the service MUST log the stack trace along with the filename f Dynamic scientific name to taxon lookup may be added to the Genbank (and the currently out of scope, but trivial to add GFF/FASTA Genome) importer in the near future. If that occurs, for the purpose of xSV import the user will be expected to provide the entire, correct, -scientific name as returned from the taxon API. +scientific name as returned from the taxon API. -* The user could get this name by starting a genome import and running the query from the +* The user could get this name by starting a genome import and running the query from the import app cell configuration screen. * This will be documented in the README.md for the template files. * As part of the UI work we could theoretically provide a landing page for looking up valid scientific names. * Presumably the UI would need to run the dynamic query and report an error to the user if the dynamic service returns 0 or > 1 entries. -* Providing the scientific name vs. the taxon ID seems simpler because the machinery already +* Providing the scientific name vs. the taxon ID seems simpler because the machinery already exists to perform the query and is part of the spec. * Expect these plans to change as it becomes more clear how dynamic fields will work in the - context of bulk import. \ No newline at end of file + context of bulk import. 
diff --git a/globus.cfg b/globus.cfg index 9dcfeb94..c9e3b026 100644 --- a/globus.cfg +++ b/globus.cfg @@ -3,4 +3,3 @@ transfer_token = "Get this refresh token" auth_token = "Get this refresh token" endpoint_id = c3c0a65f-5827-4834-b6c9-388b0b19953a client_id = 26d64c4c-fcc2-4f7c-b056-62f185875af6 - diff --git a/import_specifications/clients/authclient.py b/import_specifications/clients/authclient.py index 844f9b0c..a7a354bb 100644 --- a/import_specifications/clients/authclient.py +++ b/import_specifications/clients/authclient.py @@ -1,18 +1,20 @@ -''' +""" Created on Aug 1, 2016 A very basic KBase auth client for the Python server. @author: gaprice@lbl.gov -''' +""" + +import hashlib +import threading as _threading import time as _time + import requests as _requests -import threading as _threading -import hashlib class TokenCache(object): - ''' A basic cache for tokens. ''' + """A basic cache for tokens.""" _MAX_TIME_SEC = 5 * 60 # 5 min @@ -24,7 +26,7 @@ def __init__(self, maxsize=2000): self._halfmax = maxsize / 2 # int division to round down def get_user(self, token): - token = hashlib.sha256(token.encode('utf-8')).hexdigest() + token = hashlib.sha256(token.encode("utf-8")).hexdigest() with self._lock: usertime = self._cache.get(token) if not usertime: @@ -37,17 +39,14 @@ def get_user(self, token): def add_valid_token(self, token, user): if not token: - raise ValueError('Must supply token') + raise ValueError("Must supply token") if not user: - raise ValueError('Must supply user') - token = hashlib.sha256(token.encode('utf-8')).hexdigest() + raise ValueError("Must supply user") + token = hashlib.sha256(token.encode("utf-8")).hexdigest() with self._lock: self._cache[token] = [user, _time.time()] if len(self._cache) > self._maxsize: - sorted_items = sorted( - list(self._cache.items()), - key=(lambda v: v[1][1]) - ) + sorted_items = sorted(list(self._cache.items()), key=(lambda v: v[1][1])) for i, (t, _) in enumerate(sorted_items): if i <= self._halfmax: del self._cache[t] @@ -56,16 +55,16 @@ def add_valid_token(self, token, user): class KBaseAuth(object): - ''' + """ A very basic KBase auth client for the Python server. 
- ''' + """ - _LOGIN_URL = 'https://kbase.us/services/auth/api/legacy/KBase/Sessions/Login' + _LOGIN_URL = "https://kbase.us/services/auth/api/legacy/KBase/Sessions/Login" def __init__(self, auth_url=None): - ''' + """ Constructor - ''' + """ self._authurl = auth_url if not self._authurl: self._authurl = self._LOGIN_URL @@ -73,22 +72,24 @@ def __init__(self, auth_url=None): def get_user(self, token): if not token: - raise ValueError('Must supply token') + raise ValueError("Must supply token") user = self._cache.get_user(token) if user: return user - d = {'token': token, 'fields': 'user_id'} + d = {"token": token, "fields": "user_id"} ret = _requests.post(self._authurl, data=d) if not ret.ok: try: err = ret.json() - except Exception as e: + except Exception: ret.raise_for_status() - raise ValueError('Error connecting to auth service: {} {}\n{}' - .format(ret.status_code, ret.reason, - err['error']['message'])) + raise ValueError( + "Error connecting to auth service: {} {}\n{}".format( + ret.status_code, ret.reason, err["error"]["message"] + ) + ) - user = ret.json()['user_id'] + user = ret.json()["user_id"] self._cache.add_valid_token(token, user) return user diff --git a/import_specifications/clients/baseclient.py b/import_specifications/clients/baseclient.py index 7dc1ce12..8bac630e 100644 --- a/import_specifications/clients/baseclient.py +++ b/import_specifications/clients/baseclient.py @@ -8,10 +8,11 @@ from __future__ import print_function import json as _json -import requests as _requests -import random as _random import os as _os +import random as _random import traceback as _traceback + +import requests as _requests from requests.exceptions import ConnectionError from urllib3.exceptions import ProtocolError @@ -26,9 +27,9 @@ from urlparse import urlparse as _urlparse # py2 import time -_CT = 'content-type' -_AJ = 'application/json' -_URL_SCHEME = frozenset(['http', 'https']) +_CT = "content-type" +_AJ = "application/json" +_URL_SCHEME = frozenset(["http", "https"]) _CHECK_JOB_RETRYS = 3 @@ -38,23 +39,29 @@ def _get_token(user_id, password, auth_svc): # note that currently globus usernames, and therefore kbase usernames, # cannot contain non-ascii characters. In python 2, quote doesn't handle # unicode, so if this changes this client will need to change. 
- body = ('user_id=' + _requests.utils.quote(user_id) + '&password=' + - _requests.utils.quote(password) + '&fields=token') + body = ( + "user_id=" + + _requests.utils.quote(user_id) + + "&password=" + + _requests.utils.quote(password) + + "&fields=token" + ) ret = _requests.post(auth_svc, data=body, allow_redirects=True) status = ret.status_code if status >= 200 and status <= 299: tok = _json.loads(ret.text) elif status == 403: - raise Exception('Authentication failed: Bad user_id/password ' + - 'combination for user %s' % (user_id)) + raise Exception( + "Authentication failed: Bad user_id/password " + "combination for user %s" % (user_id) + ) else: raise Exception(ret.text) - return tok['token'] + return tok["token"] -def _read_inifile(file=_os.environ.get( # @ReservedAssignment - 'KB_DEPLOYMENT_CONFIG', _os.environ['HOME'] + - '/.kbase_config')): +def _read_inifile( + file=_os.environ.get("KB_DEPLOYMENT_CONFIG", _os.environ["HOME"] + "/.kbase_config"), +): # @ReservedAssignment # Another bandaid to read in the ~/.kbase_config file if one is present authdata = None if _os.path.exists(file): @@ -62,33 +69,40 @@ def _read_inifile(file=_os.environ.get( # @ReservedAssignment config = _ConfigParser() config.read(file) # strip down whatever we read to only what is legit - authdata = {x: config.get('authentication', x) - if config.has_option('authentication', x) - else None for x in ('user_id', 'token', - 'client_secret', 'keyfile', - 'keyfile_passphrase', 'password')} + authdata = { + x: ( + config.get("authentication", x) + if config.has_option("authentication", x) + else None + ) + for x in ( + "user_id", + "token", + "client_secret", + "keyfile", + "keyfile_passphrase", + "password", + ) + } except Exception as e: - print('Error while reading INI file {}: {}'.format(file, e)) + print("Error while reading INI file {}: {}".format(file, e)) return authdata class ServerError(Exception): - def __init__(self, name, code, message, data=None, error=None): super(Exception, self).__init__(message) self.name = name self.code = code - self.message = '' if message is None else message - self.data = data or error or '' + self.message = "" if message is None else message + self.data = data or error or "" # data = JSON RPC 2.0, error = 1.1 def __str__(self): - return self.name + ': ' + str(self.code) + '. ' + self.message + \ - '\n' + self.data + return self.name + ": " + str(self.code) + ". " + self.message + "\n" + self.data class _JSONObjectEncoder(_json.JSONEncoder): - def default(self, obj): if isinstance(obj, set): return list(obj) @@ -98,7 +112,7 @@ def default(self, obj): class BaseClient(object): - ''' + """ The KBase base client. Required initialization arguments (positional): url - the url of the the service to contact: @@ -120,18 +134,25 @@ class BaseClient(object): lookup_url - set to true when contacting KBase dynamic services. async_job_check_time_ms - the wait time between checking job state for asynchronous jobs run with the run_job method. 
- ''' + """ + def __init__( - self, url=None, timeout=30 * 60, user_id=None, - password=None, token=None, ignore_authrc=False, - trust_all_ssl_certificates=False, - auth_svc='https://kbase.us/services/auth/api/legacy/KBase/Sessions/Login', - lookup_url=False, - async_job_check_time_ms=100, - async_job_check_time_scale_percent=150, - async_job_check_max_time_ms=300000): + self, + url=None, + timeout=30 * 60, + user_id=None, + password=None, + token=None, + ignore_authrc=False, + trust_all_ssl_certificates=False, + auth_svc="https://kbase.us/services/auth/api/legacy/KBase/Sessions/Login", + lookup_url=False, + async_job_check_time_ms=100, + async_job_check_time_scale_percent=150, + async_job_check_max_time_ms=300000, + ): if url is None: - raise ValueError('A url is required') + raise ValueError("A url is required") scheme, _, _, _, _, _ = _urlparse(url) if scheme not in _URL_SCHEME: raise ValueError(url + " isn't a valid http url") @@ -141,93 +162,96 @@ def __init__( self.trust_all_ssl_certificates = trust_all_ssl_certificates self.lookup_url = lookup_url self.async_job_check_time = async_job_check_time_ms / 1000.0 - self.async_job_check_time_scale_percent = ( - async_job_check_time_scale_percent) + self.async_job_check_time_scale_percent = async_job_check_time_scale_percent self.async_job_check_max_time = async_job_check_max_time_ms / 1000.0 # token overrides user_id and password if token is not None: - self._headers['AUTHORIZATION'] = token + self._headers["AUTHORIZATION"] = token elif user_id is not None and password is not None: - self._headers['AUTHORIZATION'] = _get_token( - user_id, password, auth_svc) - elif 'KB_AUTH_TOKEN' in _os.environ: - self._headers['AUTHORIZATION'] = _os.environ.get('KB_AUTH_TOKEN') + self._headers["AUTHORIZATION"] = _get_token(user_id, password, auth_svc) + elif "KB_AUTH_TOKEN" in _os.environ: + self._headers["AUTHORIZATION"] = _os.environ.get("KB_AUTH_TOKEN") elif not ignore_authrc: authdata = _read_inifile() if authdata is not None: - if authdata.get('token') is not None: - self._headers['AUTHORIZATION'] = authdata['token'] - elif(authdata.get('user_id') is not None and - authdata.get('password') is not None): - self._headers['AUTHORIZATION'] = _get_token( - authdata['user_id'], authdata['password'], auth_svc) + if authdata.get("token") is not None: + self._headers["AUTHORIZATION"] = authdata["token"] + elif authdata.get("user_id") is not None and authdata.get("password") is not None: + self._headers["AUTHORIZATION"] = _get_token( + authdata["user_id"], authdata["password"], auth_svc + ) if self.timeout < 1: - raise ValueError('Timeout value must be at least 1 second') + raise ValueError("Timeout value must be at least 1 second") def _call(self, url, method, params, context=None): - arg_hash = {'method': method, - 'params': params, - 'version': '1.1', - 'id': str(_random.random())[2:] - } + arg_hash = { + "method": method, + "params": params, + "version": "1.1", + "id": str(_random.random())[2:], + } if context: if type(context) is not dict: - raise ValueError('context is not type dict as required.') - arg_hash['context'] = context + raise ValueError("context is not type dict as required.") + arg_hash["context"] = context body = _json.dumps(arg_hash, cls=_JSONObjectEncoder) - ret = _requests.post(url, data=body, headers=self._headers, - timeout=self.timeout, - verify=not self.trust_all_ssl_certificates) - ret.encoding = 'utf-8' + ret = _requests.post( + url, + data=body, + headers=self._headers, + timeout=self.timeout, + verify=not 
self.trust_all_ssl_certificates, + ) + ret.encoding = "utf-8" if ret.status_code == 500: if ret.headers.get(_CT) == _AJ: err = ret.json() - if 'error' in err: - raise ServerError(**err['error']) + if "error" in err: + raise ServerError(**err["error"]) else: - raise ServerError('Unknown', 0, ret.text) + raise ServerError("Unknown", 0, ret.text) else: - raise ServerError('Unknown', 0, ret.text) + raise ServerError("Unknown", 0, ret.text) if not ret.ok: ret.raise_for_status() resp = ret.json() - if 'result' not in resp: - raise ServerError('Unknown', 0, 'An unknown server error occurred') - if not resp['result']: + if "result" not in resp: + raise ServerError("Unknown", 0, "An unknown server error occurred") + if not resp["result"]: return - if len(resp['result']) == 1: - return resp['result'][0] - return resp['result'] + if len(resp["result"]) == 1: + return resp["result"][0] + return resp["result"] def _get_service_url(self, service_method, service_version): if not self.lookup_url: return self.url - service, _ = service_method.split('.') + service, _ = service_method.split(".") service_status_ret = self._call( - self.url, 'ServiceWizard.get_service_status', - [{'module_name': service, 'version': service_version}]) - return service_status_ret['url'] + self.url, + "ServiceWizard.get_service_status", + [{"module_name": service, "version": service_version}], + ) + return service_status_ret["url"] def _set_up_context(self, service_ver=None, context=None): if service_ver: if not context: context = {} - context['service_ver'] = service_ver + context["service_ver"] = service_ver return context def _check_job(self, service, job_id): - return self._call(self.url, service + '._check_job', [job_id]) + return self._call(self.url, service + "._check_job", [job_id]) - def _submit_job(self, service_method, args, service_ver=None, - context=None): + def _submit_job(self, service_method, args, service_ver=None, context=None): context = self._set_up_context(service_ver, context) - mod, meth = service_method.split('.') - return self._call(self.url, mod + '._' + meth + '_submit', - args, context) + mod, meth = service_method.split(".") + return self._call(self.url, mod + "._" + meth + "_submit", args, context) def run_job(self, service_method, args, service_ver=None, context=None): - ''' + """ Run a SDK method asynchronously. Required arguments: service_method - the service and method to run, e.g. myserv.mymeth. @@ -236,16 +260,16 @@ def run_job(self, service_method, args, service_ver=None, context=None): service_ver - the version of the service to run, e.g. a git hash or dev/beta/release. context - the rpc context dict. 
- ''' - mod, _ = service_method.split('.') + """ + mod, _ = service_method.split(".") job_id = self._submit_job(service_method, args, service_ver, context) async_job_check_time = self.async_job_check_time check_job_failures = 0 while check_job_failures < _CHECK_JOB_RETRYS: time.sleep(async_job_check_time) - async_job_check_time = (async_job_check_time * - self.async_job_check_time_scale_percent / - 100.0) + async_job_check_time = ( + async_job_check_time * self.async_job_check_time_scale_percent / 100.0 + ) if async_job_check_time > self.async_job_check_max_time: async_job_check_time = self.async_job_check_max_time @@ -256,18 +280,18 @@ def run_job(self, service_method, args, service_ver=None, context=None): check_job_failures += 1 continue - if job_state['finished']: - if not job_state['result']: + if job_state["finished"]: + if not job_state["result"]: return - if len(job_state['result']) == 1: - return job_state['result'][0] - return job_state['result'] - raise RuntimeError("_check_job failed {} times and exceeded limit".format( - check_job_failures)) + if len(job_state["result"]) == 1: + return job_state["result"][0] + return job_state["result"] + raise RuntimeError( + "_check_job failed {} times and exceeded limit".format(check_job_failures) + ) - def call_method(self, service_method, args, service_ver=None, - context=None): - ''' + def call_method(self, service_method, args, service_ver=None, context=None): + """ Call a standard or dynamic service synchronously. Required arguments: service_method - the service and method to run, e.g. myserv.mymeth. @@ -276,7 +300,7 @@ def call_method(self, service_method, args, service_ver=None, service_ver - the version of the service to run, e.g. a git hash or dev/beta/release. context - the rpc context dict. - ''' + """ url = self._get_service_url(service_method, service_ver) context = self._set_up_context(service_ver, context) return self._call(url, service_method, args, context) diff --git a/import_specifications/clients/narrative_method_store_client.py b/import_specifications/clients/narrative_method_store_client.py index 16e681e0..250830b4 100644 --- a/import_specifications/clients/narrative_method_store_client.py +++ b/import_specifications/clients/narrative_method_store_client.py @@ -7,6 +7,7 @@ ############################################################ from __future__ import print_function + # the following is a hack to get the baseclient to import whether we're in a # package or not. This makes pep8 unhappy hence the annotations. 
try: @@ -18,28 +19,37 @@ class NarrativeMethodStore(object): - def __init__( - self, url=None, timeout=30 * 60, user_id=None, - password=None, token=None, ignore_authrc=False, - trust_all_ssl_certificates=False, - auth_svc='https://ci.kbase.us/services/auth/api/legacy/KBase/Sessions/Login'): + self, + url=None, + timeout=30 * 60, + user_id=None, + password=None, + token=None, + ignore_authrc=False, + trust_all_ssl_certificates=False, + auth_svc="https://ci.kbase.us/services/auth/api/legacy/KBase/Sessions/Login", + ): if url is None: - url = 'https://kbase.us/services/narrative_method_store/rpc' + url = "https://kbase.us/services/narrative_method_store/rpc" self._service_ver = None self._client = _BaseClient( - url, timeout=timeout, user_id=user_id, password=password, - token=token, ignore_authrc=ignore_authrc, + url, + timeout=timeout, + user_id=user_id, + password=password, + token=token, + ignore_authrc=ignore_authrc, trust_all_ssl_certificates=trust_all_ssl_certificates, - auth_svc=auth_svc) + auth_svc=auth_svc, + ) def ver(self, context=None): """ Returns the current running version of the NarrativeMethodStore. :returns: instance of String """ - return self._client.call_method('NarrativeMethodStore.ver', - [], self._service_ver, context) + return self._client.call_method("NarrativeMethodStore.ver", [], self._service_ver, context) def status(self, context=None): """ @@ -50,8 +60,9 @@ def status(self, context=None): parameter "git_spec_commit" of String, parameter "update_interval" of String """ - return self._client.call_method('NarrativeMethodStore.status', - [], self._service_ver, context) + return self._client.call_method( + "NarrativeMethodStore.status", [], self._service_ver, context + ) def list_categories(self, params, context=None): """ @@ -103,8 +114,9 @@ def list_categories(self, params, context=None): String to String, parameter "landing_page_url_prefix" of String, parameter "loading_error" of String """ - return self._client.call_method('NarrativeMethodStore.list_categories', - [params], self._service_ver, context) + return self._client.call_method( + "NarrativeMethodStore.list_categories", [params], self._service_ver, context + ) def get_category(self, params, context=None): """ @@ -116,8 +128,9 @@ def get_category(self, params, context=None): String, parameter "parent_ids" of list of String, parameter "loading_error" of String """ - return self._client.call_method('NarrativeMethodStore.get_category', - [params], self._service_ver, context) + return self._client.call_method( + "NarrativeMethodStore.get_category", [params], self._service_ver, context + ) def list_methods(self, params, context=None): """ @@ -144,8 +157,9 @@ def list_methods(self, params, context=None): list of String, parameter "output_types" of list of String, parameter "app_type" of String """ - return self._client.call_method('NarrativeMethodStore.list_methods', - [params], self._service_ver, context) + return self._client.call_method( + "NarrativeMethodStore.list_methods", [params], self._service_ver, context + ) def list_methods_full_info(self, params, context=None): """ @@ -185,8 +199,12 @@ def list_methods_full_info(self, params, context=None): structure: parameter "pmid" of String, parameter "display_text" of String, parameter "link" of type "url" """ - return self._client.call_method('NarrativeMethodStore.list_methods_full_info', - [params], self._service_ver, context) + return self._client.call_method( + "NarrativeMethodStore.list_methods_full_info", + [params], + self._service_ver, + context, + ) 
def list_methods_spec(self, params, context=None): """ @@ -261,7 +279,7 @@ def list_methods_spec(self, params, context=None): [0,1]), parameter "ui_class" of String, parameter "default_values" of list of String, parameter "valid_file_types" of list of String, parameter "text_options" of type "TextOptions" (valid_ws_types - - list of valid ws types that can be used for input validate_as + list of valid ws types that can be used for input validate_as - int | float | nonnumeric | none is_output_name - true if the user is specifying an output name, false otherwise, default is false) -> structure: parameter "valid_ws_types" of list of String, @@ -333,10 +351,10 @@ def list_methods_spec(self, params, context=None): selection items data structure must be a list of mappings or structures. As an example of correctly specifying where the selection items are within the data structure returned from the - dynamic service, if the data structure is: [ "foo", + dynamic service, if the data structure is: [ "foo", # return array position 0 { # return array position 1 "interesting_data": [ "baz", "boo", [ {"id": 1, "name": - "foo" }, ... {"id": 42, "name": "wowbagger" } ], "bat" ] }, "bar" + "foo" }, ... {"id": 42, "name": "wowbagger" } ], "bat" ] }, "bar" # return array position 2 ] Note that KBase dynamic services all return an array of values, even for single-value returns, as the KIDL spec allows specifying multiple return values per function. @@ -534,8 +552,12 @@ def list_methods_spec(self, params, context=None): "target_property" of String, parameter "target_type_transform" of String, parameter "job_id_output_field" of String """ - return self._client.call_method('NarrativeMethodStore.list_methods_spec', - [params], self._service_ver, context) + return self._client.call_method( + "NarrativeMethodStore.list_methods_spec", + [params], + self._service_ver, + context, + ) def list_method_ids_and_names(self, params, context=None): """ @@ -544,8 +566,12 @@ def list_method_ids_and_names(self, params, context=None): 'release').) 
-> structure: parameter "tag" of String :returns: instance of mapping from String to String """ - return self._client.call_method('NarrativeMethodStore.list_method_ids_and_names', - [params], self._service_ver, context) + return self._client.call_method( + "NarrativeMethodStore.list_method_ids_and_names", + [params], + self._service_ver, + context, + ) def list_apps(self, params, context=None): """ @@ -564,8 +590,9 @@ def list_apps(self, params, context=None): parameter "categories" of list of String, parameter "loading_error" of String """ - return self._client.call_method('NarrativeMethodStore.list_apps', - [params], self._service_ver, context) + return self._client.call_method( + "NarrativeMethodStore.list_apps", [params], self._service_ver, context + ) def list_apps_full_info(self, params, context=None): """ @@ -591,8 +618,12 @@ def list_apps_full_info(self, params, context=None): parameter "screenshots" of list of type "ScreenShot" -> structure: parameter "url" of type "url" """ - return self._client.call_method('NarrativeMethodStore.list_apps_full_info', - [params], self._service_ver, context) + return self._client.call_method( + "NarrativeMethodStore.list_apps_full_info", + [params], + self._service_ver, + context, + ) def list_apps_spec(self, params, context=None): """ @@ -630,15 +661,20 @@ def list_apps_spec(self, params, context=None): [0,1]), parameter "from" of String, parameter "to" of String, parameter "description" of String """ - return self._client.call_method('NarrativeMethodStore.list_apps_spec', - [params], self._service_ver, context) + return self._client.call_method( + "NarrativeMethodStore.list_apps_spec", [params], self._service_ver, context + ) def list_app_ids_and_names(self, context=None): """ :returns: instance of mapping from String to String """ - return self._client.call_method('NarrativeMethodStore.list_app_ids_and_names', - [], self._service_ver, context) + return self._client.call_method( + "NarrativeMethodStore.list_app_ids_and_names", + [], + self._service_ver, + context, + ) def list_types(self, params, context=None): """ @@ -662,8 +698,9 @@ def list_types(self, params, context=None): "landing_page_url_prefix" of String, parameter "loading_error" of String """ - return self._client.call_method('NarrativeMethodStore.list_types', - [params], self._service_ver, context) + return self._client.call_method( + "NarrativeMethodStore.list_types", [params], self._service_ver, context + ) def get_method_brief_info(self, params, context=None): """ @@ -687,8 +724,12 @@ def get_method_brief_info(self, params, context=None): list of String, parameter "output_types" of list of String, parameter "app_type" of String """ - return self._client.call_method('NarrativeMethodStore.get_method_brief_info', - [params], self._service_ver, context) + return self._client.call_method( + "NarrativeMethodStore.get_method_brief_info", + [params], + self._service_ver, + context, + ) def get_method_full_info(self, params, context=None): """ @@ -725,8 +766,12 @@ def get_method_full_info(self, params, context=None): structure: parameter "pmid" of String, parameter "display_text" of String, parameter "link" of type "url" """ - return self._client.call_method('NarrativeMethodStore.get_method_full_info', - [params], self._service_ver, context) + return self._client.call_method( + "NarrativeMethodStore.get_method_full_info", + [params], + self._service_ver, + context, + ) def get_method_spec(self, params, context=None): """ @@ -798,7 +843,7 @@ def get_method_spec(self, params, context=None): 
[0,1]), parameter "ui_class" of String, parameter "default_values" of list of String, parameter "valid_file_types" of list of String, parameter "text_options" of type "TextOptions" (valid_ws_types - - list of valid ws types that can be used for input validate_as + list of valid ws types that can be used for input validate_as - int | float | nonnumeric | none is_output_name - true if the user is specifying an output name, false otherwise, default is false) -> structure: parameter "valid_ws_types" of list of String, @@ -870,10 +915,10 @@ def get_method_spec(self, params, context=None): selection items data structure must be a list of mappings or structures. As an example of correctly specifying where the selection items are within the data structure returned from the - dynamic service, if the data structure is: [ "foo", + dynamic service, if the data structure is: [ "foo", # return array position 0 { # return array position 1 "interesting_data": [ "baz", "boo", [ {"id": 1, "name": - "foo" }, ... {"id": 42, "name": "wowbagger" } ], "bat" ] }, "bar" + "foo" }, ... {"id": 42, "name": "wowbagger" } ], "bat" ] }, "bar" # return array position 2 ] Note that KBase dynamic services all return an array of values, even for single-value returns, as the KIDL spec allows specifying multiple return values per function. @@ -1071,8 +1116,9 @@ def get_method_spec(self, params, context=None): "target_property" of String, parameter "target_type_transform" of String, parameter "job_id_output_field" of String """ - return self._client.call_method('NarrativeMethodStore.get_method_spec', - [params], self._service_ver, context) + return self._client.call_method( + "NarrativeMethodStore.get_method_spec", [params], self._service_ver, context + ) def get_app_brief_info(self, params, context=None): """ @@ -1086,8 +1132,12 @@ def get_app_brief_info(self, params, context=None): parameter "categories" of list of String, parameter "loading_error" of String """ - return self._client.call_method('NarrativeMethodStore.get_app_brief_info', - [params], self._service_ver, context) + return self._client.call_method( + "NarrativeMethodStore.get_app_brief_info", + [params], + self._service_ver, + context, + ) def get_app_full_info(self, params, context=None): """ @@ -1108,8 +1158,12 @@ def get_app_full_info(self, params, context=None): parameter "screenshots" of list of type "ScreenShot" -> structure: parameter "url" of type "url" """ - return self._client.call_method('NarrativeMethodStore.get_app_full_info', - [params], self._service_ver, context) + return self._client.call_method( + "NarrativeMethodStore.get_app_full_info", + [params], + self._service_ver, + context, + ) def get_app_spec(self, params, context=None): """ @@ -1142,8 +1196,9 @@ def get_app_spec(self, params, context=None): [0,1]), parameter "from" of String, parameter "to" of String, parameter "description" of String """ - return self._client.call_method('NarrativeMethodStore.get_app_spec', - [params], self._service_ver, context) + return self._client.call_method( + "NarrativeMethodStore.get_app_spec", [params], self._service_ver, context + ) def get_type_info(self, params, context=None): """ @@ -1162,8 +1217,9 @@ def get_type_info(self, params, context=None): "landing_page_url_prefix" of String, parameter "loading_error" of String """ - return self._client.call_method('NarrativeMethodStore.get_type_info', - [params], self._service_ver, context) + return self._client.call_method( + "NarrativeMethodStore.get_type_info", [params], self._service_ver, context + ) def 
validate_method(self, params, context=None): """ @@ -1307,7 +1363,7 @@ def validate_method(self, params, context=None): [0,1]), parameter "ui_class" of String, parameter "default_values" of list of String, parameter "valid_file_types" of list of String, parameter "text_options" of type "TextOptions" (valid_ws_types - - list of valid ws types that can be used for input validate_as + list of valid ws types that can be used for input validate_as - int | float | nonnumeric | none is_output_name - true if the user is specifying an output name, false otherwise, default is false) -> structure: parameter "valid_ws_types" of list of String, @@ -1379,10 +1435,10 @@ def validate_method(self, params, context=None): selection items data structure must be a list of mappings or structures. As an example of correctly specifying where the selection items are within the data structure returned from the - dynamic service, if the data structure is: [ "foo", + dynamic service, if the data structure is: [ "foo", # return array position 0 { # return array position 1 "interesting_data": [ "baz", "boo", [ {"id": 1, "name": - "foo" }, ... {"id": 42, "name": "wowbagger" } ], "bat" ] }, "bar" + "foo" }, ... {"id": 42, "name": "wowbagger" } ], "bat" ] }, "bar" # return array position 2 ] Note that KBase dynamic services all return an array of values, even for single-value returns, as the KIDL spec allows specifying multiple return values per function. @@ -1592,8 +1648,9 @@ def validate_method(self, params, context=None): "landing_page_url_prefix" of String, parameter "loading_error" of String """ - return self._client.call_method('NarrativeMethodStore.validate_method', - [params], self._service_ver, context) + return self._client.call_method( + "NarrativeMethodStore.validate_method", [params], self._service_ver, context + ) def validate_app(self, params, context=None): """ @@ -1735,7 +1792,7 @@ def validate_app(self, params, context=None): [0,1]), parameter "ui_class" of String, parameter "default_values" of list of String, parameter "valid_file_types" of list of String, parameter "text_options" of type "TextOptions" (valid_ws_types - - list of valid ws types that can be used for input validate_as + list of valid ws types that can be used for input validate_as - int | float | nonnumeric | none is_output_name - true if the user is specifying an output name, false otherwise, default is false) -> structure: parameter "valid_ws_types" of list of String, @@ -1807,10 +1864,10 @@ def validate_app(self, params, context=None): selection items data structure must be a list of mappings or structures. As an example of correctly specifying where the selection items are within the data structure returned from the - dynamic service, if the data structure is: [ "foo", + dynamic service, if the data structure is: [ "foo", # return array position 0 { # return array position 1 "interesting_data": [ "baz", "boo", [ {"id": 1, "name": - "foo" }, ... {"id": 42, "name": "wowbagger" } ], "bat" ] }, "bar" + "foo" }, ... {"id": 42, "name": "wowbagger" } ], "bat" ] }, "bar" # return array position 2 ] Note that KBase dynamic services all return an array of values, even for single-value returns, as the KIDL spec allows specifying multiple return values per function. 
@@ -2020,8 +2077,9 @@ def validate_app(self, params, context=None): "landing_page_url_prefix" of String, parameter "loading_error" of String """ - return self._client.call_method('NarrativeMethodStore.validate_app', - [params], self._service_ver, context) + return self._client.call_method( + "NarrativeMethodStore.validate_app", [params], self._service_ver, context + ) def validate_type(self, params, context=None): """ @@ -2163,7 +2221,7 @@ def validate_type(self, params, context=None): [0,1]), parameter "ui_class" of String, parameter "default_values" of list of String, parameter "valid_file_types" of list of String, parameter "text_options" of type "TextOptions" (valid_ws_types - - list of valid ws types that can be used for input validate_as + list of valid ws types that can be used for input validate_as - int | float | nonnumeric | none is_output_name - true if the user is specifying an output name, false otherwise, default is false) -> structure: parameter "valid_ws_types" of list of String, @@ -2235,10 +2293,10 @@ def validate_type(self, params, context=None): selection items data structure must be a list of mappings or structures. As an example of correctly specifying where the selection items are within the data structure returned from the - dynamic service, if the data structure is: [ "foo", + dynamic service, if the data structure is: [ "foo", # return array position 0 { # return array position 1 "interesting_data": [ "baz", "boo", [ {"id": 1, "name": - "foo" }, ... {"id": 42, "name": "wowbagger" } ], "bat" ] }, "bar" + "foo" }, ... {"id": 42, "name": "wowbagger" } ], "bat" ] }, "bar" # return array position 2 ] Note that KBase dynamic services all return an array of values, even for single-value returns, as the KIDL spec allows specifying multiple return values per function. 
@@ -2448,8 +2506,9 @@ def validate_type(self, params, context=None): "landing_page_url_prefix" of String, parameter "loading_error" of String """ - return self._client.call_method('NarrativeMethodStore.validate_type', - [params], self._service_ver, context) + return self._client.call_method( + "NarrativeMethodStore.validate_type", [params], self._service_ver, context + ) def load_widget_java_script(self, params, context=None): """ @@ -2464,8 +2523,12 @@ def load_widget_java_script(self, params, context=None): parameter "tag" of String :returns: instance of String """ - return self._client.call_method('NarrativeMethodStore.load_widget_java_script', - [params], self._service_ver, context) + return self._client.call_method( + "NarrativeMethodStore.load_widget_java_script", + [params], + self._service_ver, + context, + ) def register_repo(self, params, context=None): """ @@ -2474,24 +2537,27 @@ def register_repo(self, params, context=None): ******************************) -> structure: parameter "git_url" of String, parameter "git_commit_hash" of String """ - return self._client.call_method('NarrativeMethodStore.register_repo', - [params], self._service_ver, context) + return self._client.call_method( + "NarrativeMethodStore.register_repo", [params], self._service_ver, context + ) def disable_repo(self, params, context=None): """ :param params: instance of type "DisableRepoParams" -> structure: parameter "module_name" of String """ - return self._client.call_method('NarrativeMethodStore.disable_repo', - [params], self._service_ver, context) + return self._client.call_method( + "NarrativeMethodStore.disable_repo", [params], self._service_ver, context + ) def enable_repo(self, params, context=None): """ :param params: instance of type "EnableRepoParams" -> structure: parameter "module_name" of String """ - return self._client.call_method('NarrativeMethodStore.enable_repo', - [params], self._service_ver, context) + return self._client.call_method( + "NarrativeMethodStore.enable_repo", [params], self._service_ver, context + ) def push_repo_to_tag(self, params, context=None): """ @@ -2499,5 +2565,9 @@ def push_repo_to_tag(self, params, context=None): two values: 'beta' or 'release'.) -> structure: parameter "module_name" of String, parameter "tag" of String """ - return self._client.call_method('NarrativeMethodStore.push_repo_to_tag', - [params], self._service_ver, context) + return self._client.call_method( + "NarrativeMethodStore.push_repo_to_tag", + [params], + self._service_ver, + context, + ) diff --git a/import_specifications/generate_import_template.py b/import_specifications/generate_import_template.py index d62f6835..86e58339 100755 --- a/import_specifications/generate_import_template.py +++ b/import_specifications/generate_import_template.py @@ -13,6 +13,7 @@ import argparse import json import sys + from clients.narrative_method_store_client import NarrativeMethodStore _HEADER_SEP = ";" @@ -20,45 +21,60 @@ _NMS_URLS = { # including the entire url vs constructing it makes this easy to update for local NMS # instances or whatever - 'prod': 'https://kbase.us/services/narrative_method_store/rpc', + "prod": "https://kbase.us/services/narrative_method_store/rpc", # prod and appdev point to the same service... 
*shrug* - 'appdev': 'https://appdev.kbase.us/services/narrative_method_store/rpc', - 'next': 'https://next.kbase.us/services/narrative_method_store/rpc', - 'ci': 'https://ci.kbase.us/services/narrative_method_store/rpc', + "appdev": "https://appdev.kbase.us/services/narrative_method_store/rpc", + "next": "https://next.kbase.us/services/narrative_method_store/rpc", + "ci": "https://ci.kbase.us/services/narrative_method_store/rpc", } _FORMAT_VERSION = 1 # evolve the format by making changes and incrementing the version + def parse_args(): - parser = argparse.ArgumentParser(description='Generate a bulk import template for an app') - parser.add_argument('app_id', help= - 'The app ID to process, for example kb_uploadmethods/import_sra_as_reads_from_staging') - parser.add_argument('data_type', help= - 'The datatype corresponding to the the app. This id is shared between the ' + - 'staging service and the narrative, for example sra_reads') - parser.add_argument('--tsv', action='store_true', help= - 'Create a TSV file rather than a CSV file (the default)') - parser.add_argument('--env', choices=['prod', 'appdev', 'next', 'ci'], default='prod', help= - 'The KBase environment to query, default prod') - parser.add_argument('--print-spec', action='store_true', help= - 'Print the input specification for the app to stderr') + parser = argparse.ArgumentParser(description="Generate a bulk import template for an app") + parser.add_argument( + "app_id", + help="The app ID to process, for example kb_uploadmethods/import_sra_as_reads_from_staging", + ) + parser.add_argument( + "data_type", + help="The datatype corresponding to the the app." + + "This id is shared between the staging service and the narrative, for example sra_reads", + ) + parser.add_argument( + "--tsv", + action="store_true", + help="Create a TSV file rather than a CSV file (the default)", + ) + parser.add_argument( + "--env", + choices=["prod", "appdev", "next", "ci"], + default="prod", + help="The KBase environment to query, default prod", + ) + parser.add_argument( + "--print-spec", + action="store_true", + help="Print the input specification for the app to stderr", + ) return parser.parse_args() def is_file_input(param): - if param['field_type'] != 'dynamic_dropdown': + if param["field_type"] != "dynamic_dropdown": return False - if 'dynamic_dropdown_options' not in param: - raise ValueError('Missing dynamic_dropdown_options field for dynamic_dropdown input') - return param['dynamic_dropdown_options'].get('data_source') == 'ftp_staging' + if "dynamic_dropdown_options" not in param: + raise ValueError("Missing dynamic_dropdown_options field for dynamic_dropdown input") + return param["dynamic_dropdown_options"].get("data_source") == "ftp_staging" def is_object_name(param): - return 'text_options' in param and param['text_options'].get('is_output_name') + return "text_options" in param and param["text_options"].get("is_output_name") def is_advanced(param): - return bool(param.get('advanced')) + return bool(param.get("advanced")) def parameter_order(param): @@ -75,26 +91,29 @@ def sort_params(params): new_params = [] for i, p in enumerate(params): p = dict(p) # make a copy, don't change the input - p['i'] = i + p["i"] = i new_params.append(p) - return sorted(new_params, key=lambda p: (parameter_order(p), p['i'])) + return sorted(new_params, key=lambda p: (parameter_order(p), p["i"])) def main(): args = parse_args() nms = NarrativeMethodStore(_NMS_URLS[args.env]) - spec = nms.get_method_spec({'ids': [args.app_id]}) + spec = 
nms.get_method_spec({"ids": [args.app_id]}) if args.print_spec: - print(json.dumps(spec[0]['parameters'], indent=4), file=sys.stderr) - params = sort_params(spec[0]['parameters']) - sep = '\t' if args.tsv else ', ' - print(f'Data type: {args.data_type}{_HEADER_SEP} ' - + f'Columns: {len(params)}{_HEADER_SEP} Version: {_FORMAT_VERSION}') + print(json.dumps(spec[0]["parameters"], indent=4), file=sys.stderr) + params = sort_params(spec[0]["parameters"]) + sep = "\t" if args.tsv else ", " + print( + f"Data type: {args.data_type}{_HEADER_SEP} " + + f"Columns: {len(params)}{_HEADER_SEP} Version: {_FORMAT_VERSION}" + ) # we could theoretically use the parameter order to note for the users the type of each # column - e.g. file input, output name, params, advanced params # That's not in scope for now - print(sep.join([p['id'] for p in params])) - print(sep.join([p['ui_name'] for p in params])) + print(sep.join([p["id"] for p in params])) + print(sep.join([p["ui_name"] for p in params])) + -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/import_specifications/readme.md b/import_specifications/readme.md index 22f8b717..afb37710 100644 --- a/import_specifications/readme.md +++ b/import_specifications/readme.md @@ -1,2 +1,2 @@ This directory contains templates for bulk import specifications to be parsed by the -import specfication endpoint, and example code for how to generate them automatically. \ No newline at end of file +import specfication endpoint, and example code for how to generate them automatically. diff --git a/import_specifications/templates/README.md b/import_specifications/templates/README.md index 5a3433e6..8525f907 100644 --- a/import_specifications/templates/README.md +++ b/import_specifications/templates/README.md @@ -71,4 +71,4 @@ We are considering ways to make this easier in the future. ## Notes 1. Allowing templates to specify all the parameters for each import also enables us to support - this behavior at a later date without changing the template format. \ No newline at end of file + this behavior at a later date without changing the template format. diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..38fb2ca3 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,79 @@ +[tool.black] +# don't format the installed clients or the SDK-generated files. +#extend-exclude = ''' +#lib/( +# installed_clients/* +#| +# combinatrix/combinatrix(Impl|Server).py +#)''' +line-length = 100 + + + +[tool.ruff] +lint.select = [ + # core + "F", # Pyflakes + "E", # pycodestyle errors + "W", # pycodestyle warnings +# "C90", # mccabe + #TODO disable complexity checks +# "I", # isort #TODO enable sort of imports +# "N", # pep8-naming #Todo enable proper naming + # "D", # pydocstyle - disabled for now +# "UP", # pyupgrade # TODO Enable Type Annotations +] + + +# E203: whitespace before ‘,’, ‘;’, or ‘:’ +# E501: line length +# W503: line break after binary operator +lint.ignore = [ + "E203", + "E501", + "S101", +] + +# Allow autofix for all enabled rules (when `--fix`) is provided. +lint.fixable = ["ALL"] +lint.unfixable = [] + +# Exclude a variety of commonly ignored directories. 
+exclude = [ + "__pypackages__", + "_build", + ".bzr", + ".direnv", + ".eggs", + ".git-rewrite", + ".git", + ".github", + ".hg", + ".mypy_cache", + ".nox", + ".pants.d", + ".pytype", + ".ruff_cache", + ".svn", + ".tox", + ".venv", + "*.pyc", + "buck-out", + "build", + "deps", + "dist", + "node_modules", + "other_schema", + "python-coverage", + "sample_data", + "venv", +] +lint.per-file-ignores = { } + +# Same as Black. +line-length = 100 + +# Assume Python 3.12. +target-version = "py312" + +[tool.ruff.pydocstyle] +convention = "google" diff --git a/requirements.txt b/requirements.txt index e637e064..17f782c3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,4 +14,4 @@ pandas==1.3.2 xlrd==2.0.1 openpyxl==3.0.7 defusedxml==0.7.1 -python-magic==0.4.24 \ No newline at end of file +python-magic==0.4.24 diff --git a/run_tests.sh b/run_tests.sh index 43de2167..f4ddd44a 100755 --- a/run_tests.sh +++ b/run_tests.sh @@ -2,4 +2,4 @@ DIR="$( cd "$( dirname "$0" )" && pwd )" export KB_DEPLOYMENT_CONFIG="$DIR/deployment/conf/testing.cfg" export FILE_LIFETIME="90" -python3 -m pytest -s -vv --cov=staging_service \ No newline at end of file +python3 -m pytest -s -vv --cov=staging_service diff --git a/scripts/prune_acls.py b/scripts/prune_acls.py index 5902ff15..44934e9f 100644 --- a/scripts/prune_acls.py +++ b/scripts/prune_acls.py @@ -3,18 +3,17 @@ """ Deletes ACLS from globus, and then clears out directories older than THRESHOLD (60) days """ + from __future__ import print_function # for python 2 +import configparser import logging import time -import shutil from collections import namedtuple - from os.path import getmtime import globus_sdk from globus_sdk import TransferAPIError -import configparser """ Setup clients and read token @@ -22,19 +21,22 @@ current_time = time.time() THRESHOLD_DAYS = 60 -admin_acls = ['9cb619d0-4417-11e8-8e06-0a6d4e044368', '580118b2-dc53-11e6-9d02-22000a1e3b52'] -admin_names = ['dolsonadmin', 'dolson'] +admin_acls = [ + "9cb619d0-4417-11e8-8e06-0a6d4e044368", + "580118b2-dc53-11e6-9d02-22000a1e3b52", +] +admin_names = ["dolsonadmin", "dolson"] config = configparser.ConfigParser() config.read("globus.cfg") -cf = config['globus'] -endpoint_id = cf['endpoint_id'] +cf = config["globus"] +endpoint_id = cf["endpoint_id"] -client = globus_sdk.NativeAppAuthClient(cf['client_id']) +client = globus_sdk.NativeAppAuthClient(cf["client_id"]) try: - transfer_authorizer = globus_sdk.RefreshTokenAuthorizer(cf['transfer_token'], client) + transfer_authorizer = globus_sdk.RefreshTokenAuthorizer(cf["transfer_token"], client) globus_transfer_client = globus_sdk.TransferClient(authorizer=transfer_authorizer) - auth_authorizer = globus_sdk.RefreshTokenAuthorizer(cf['auth_token'], client) + auth_authorizer = globus_sdk.RefreshTokenAuthorizer(cf["auth_token"], client) globus_auth_client = globus_sdk.AuthClient(authorizer=auth_authorizer) except globus_sdk.GlobusAPIError as error: logging.error(str(error.code) + error.raw_text) @@ -48,7 +50,7 @@ def remove_directory(directory): """ try: logging.info("About to delete {}".format(directory)) - #shutil.rmtree(directory) + # shutil.rmtree(directory) except OSError as error: logging.error("Couldn't delete {} {} {}".format(directory, error.message, error.filename)) @@ -59,16 +61,18 @@ def remove_acl(acl): :return: Logs success or failure of deleting this ACL to the log """ logging.info( - "{}:About to remove ACL {} for {} (> {} days)".format(current_time, acl['id'], acl['path'], - THRESHOLD_DAYS)) + "{}:About to remove ACL {} for {} (> {} 
days)".format( + current_time, acl["id"], acl["path"], THRESHOLD_DAYS + ) + ) try: - resp = globus_transfer_client.delete_endpoint_acl_rule(endpoint_id, acl['id']) + globus_transfer_client.delete_endpoint_acl_rule(endpoint_id, acl["id"]) except TransferAPIError as error: logging.error(error.raw_text) def main(): - logging.basicConfig(filename='prune_acl.log', level=logging.INFO) + logging.basicConfig(filename="prune_acl.log", level=logging.INFO) logging.info("{}:BEGIN RUN".format(current_time)) old_acls = get_old_acls() @@ -85,7 +89,7 @@ def get_endpoint_acls(): :return: Return a dictionary of endpoint ACLS using the Globus API """ try: - return globus_transfer_client.endpoint_acl_list(endpoint_id)['DATA'] + return globus_transfer_client.endpoint_acl_list(endpoint_id)["DATA"] except TransferAPIError as error: print(error) @@ -117,13 +121,13 @@ def get_old_acls(): old_acls = [] old_acl_and_dir = namedtuple("old_acl_and_dir", "acl dir") for acl in acls: - directory = "/dtn/disk0/bulk" + acl['path'] - if directory_is_old(directory) and acl['id'] not in admin_acls: + directory = "/dtn/disk0/bulk" + acl["path"] + if directory_is_old(directory) and acl["id"] not in admin_acls: oad = old_acl_and_dir(acl, directory) old_acls.append(oad) return old_acls -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/scripts/refresh_token.py b/scripts/refresh_token.py index 8d9b225d..89fd2e5a 100644 --- a/scripts/refresh_token.py +++ b/scripts/refresh_token.py @@ -1,24 +1,23 @@ #!/usr/bin/env python2.7 -#This needs to be placed in cron on the api host to keep the globus token refreshed - # for python 2 -from globus_sdk import TransferClient -from globus_sdk import AuthClient -from globus_sdk import TransferAPIError -import globus_sdk -import traceback -import argparse -import os import pickle +from configparser import SafeConfigParser + +import globus_sdk -CLIENT_ID = '26d64c4c-fcc2-4f7c-b056-62f185875af6' +# for python 2 +# This needs to be placed in cron on the api host to keep the globus token refreshed + + +CLIENT_ID = "26d64c4c-fcc2-4f7c-b056-62f185875af6" client = globus_sdk.NativeAppAuthClient(CLIENT_ID) ########################## -#If we ever need to re-request the refreshable tokens, uncomment this section and use the globus account with admin permissions +# If we ever need to re-request the refreshable tokens, +# uncomment this section and use the globus account with admin permissions # on the share to complete the web browser step. ########################## -''' +""" client.oauth2_start_flow_native_app(refresh_tokens=True) # authorize_url = client.oauth2_get_authorize_url() @@ -36,52 +35,54 @@ with open('/opt/kb-ftp-api/f', 'wb') as f: pickle.dump(token_response, f) # -''' -#read token from file +""" +# read token from file ######################## # End refresh steps ####################### -#open the globus cfg file for writing -from configparser import SafeConfigParser +# open the globus cfg file for writing + parser = SafeConfigParser() -parser.read('/root/.globus.cfg') -#general.auth_token general.transfer_token +parser.read("/root/.globus.cfg") +# general.auth_token general.transfer_token # The refresh token is stored in this file.. 
protect this file -with open('/opt/kb-ftp-api/f', 'rb') as f: +with open("/opt/kb-ftp-api/f", "rb") as f: token_response = pickle.load(f) print(token_response) # let's get stuff for the Globus Transfer service -globus_transfer_data = token_response.by_resource_server['transfer.api.globus.org'] +globus_transfer_data = token_response.by_resource_server["transfer.api.globus.org"] # the refresh token and access token, often abbr. as RT and AT -transfer_rt = globus_transfer_data['refresh_token'] -transfer_at = globus_transfer_data['access_token'] -expires_at_s = globus_transfer_data['expires_at_seconds'] +transfer_rt = globus_transfer_data["refresh_token"] +transfer_at = globus_transfer_data["access_token"] +expires_at_s = globus_transfer_data["expires_at_seconds"] authorizer = globus_sdk.RefreshTokenAuthorizer( - transfer_rt, client, access_token=None, expires_at=expires_at_s) + transfer_rt, client, access_token=None, expires_at=expires_at_s +) print((authorizer.access_token)) -parser.set('general', 'transfer_token', authorizer.access_token) +parser.set("general", "transfer_token", authorizer.access_token) -#New let's get stuff for the Globus Auth service -globus_transfer_data = token_response.by_resource_server['auth.globus.org'] +# New let's get stuff for the Globus Auth service +globus_transfer_data = token_response.by_resource_server["auth.globus.org"] # the refresh token and access token, often abbr. as RT and AT -transfer_rt = globus_transfer_data['refresh_token'] -transfer_at = globus_transfer_data['access_token'] -expires_at_s = globus_transfer_data['expires_at_seconds'] +transfer_rt = globus_transfer_data["refresh_token"] +transfer_at = globus_transfer_data["access_token"] +expires_at_s = globus_transfer_data["expires_at_seconds"] authorizer = globus_sdk.RefreshTokenAuthorizer( - transfer_rt, client, access_token=None, expires_at=expires_at_s) + transfer_rt, client, access_token=None, expires_at=expires_at_s +) print((authorizer.access_token)) -parser.set('general', 'auth_token', authorizer.access_token) +parser.set("general", "auth_token", authorizer.access_token) # Writing our configuration file -#with open('/root/.globus.cfg', 'wb') as configfile: -with open('/root/.globus.cfg', 'w') as configfile: +# with open('/root/.globus.cfg', 'wb') as configfile: +with open("/root/.globus.cfg", "w") as configfile: parser.write(configfile) diff --git a/staging_service/AutoDetectUtils.py b/staging_service/AutoDetectUtils.py index 97766b72..6047df69 100644 --- a/staging_service/AutoDetectUtils.py +++ b/staging_service/AutoDetectUtils.py @@ -2,6 +2,7 @@ This class is in charge of determining possible importers by determining the suffix of the filepath pulled in, and by looking up the appropriate mappings in the supported_apps_w_extensions.json file """ + from typing import Optional, Tuple, Dict @@ -9,7 +10,9 @@ class AutoDetectUtils: _MAPPINGS = None # expects to be set by config @staticmethod - def determine_possible_importers(filename: str) -> Tuple[Optional[list], Dict[str, object]]: + def determine_possible_importers( + filename: str, + ) -> Tuple[Optional[list], Dict[str, object]]: """ Given a filename, come up with a reference to all possible apps. 
:param filename: The filename to find applicable apps for @@ -18,7 +21,7 @@ def determine_possible_importers(filename: str) -> Tuple[Optional[list], Dict[st The fileinfo dict, containing: the file prefix the file suffix, if a suffix matched a mapping - the file types, if a suffix matched a mapping, otherwise an empty list + the file types, if a suffix matched a mapping, otherwise an empty list """ dotcount = filename.count(".") if dotcount: @@ -32,10 +35,11 @@ def determine_possible_importers(filename: str) -> Tuple[Optional[list], Dict[st prefix = ".".join(parts[0:i]) return ( m["types"][suffix]["mappings"], - {"prefix": prefix, - "suffix": parts[-1], - "file_ext_type": m["types"][suffix]["file_ext_type"], - } + { + "prefix": prefix, + "suffix": parts[-1], + "file_ext_type": m["types"][suffix]["file_ext_type"], + }, ) return None, {"prefix": filename, "suffix": None, "file_ext_type": []} diff --git a/staging_service/JGIMetadata.py b/staging_service/JGIMetadata.py index 643be600..fdeac095 100644 --- a/staging_service/JGIMetadata.py +++ b/staging_service/JGIMetadata.py @@ -1,9 +1,11 @@ -from .utils import Path +import os from json import JSONDecoder + import aiofiles -import os from aiohttp import web +from .utils import Path + decoder = JSONDecoder() @@ -18,5 +20,3 @@ async def read_metadata_for(path: Path): path=path.user_path ) ) - - diff --git a/staging_service/__main__.py b/staging_service/__main__.py index e1f31b14..f917b9dc 100644 --- a/staging_service/__main__.py +++ b/staging_service/__main__.py @@ -1,9 +1,11 @@ -from .app import app_factory -from aiohttp import web import asyncio +import configparser import os + import uvloop -import configparser +from aiohttp import web + +from .app import app_factory asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) # for speed of event loop diff --git a/staging_service/app.py b/staging_service/app.py index ac04fc16..85dcbd1e 100644 --- a/staging_service/app.py +++ b/staging_service/app.py @@ -4,33 +4,32 @@ import shutil import sys from collections import defaultdict -from urllib.parse import parse_qs from pathlib import Path as PathPy +from urllib.parse import parse_qs import aiohttp_cors from aiohttp import web -from .app_error_formatter import format_import_spec_errors from .AutoDetectUtils import AutoDetectUtils from .JGIMetadata import read_metadata_for +from .app_error_formatter import format_import_spec_errors from .auth2Client import KBaseAuth2 +from .autodetect.Mappings import CSV, TSV, EXCEL from .globus import assert_globusid_exists, is_globusid -from .metadata import some_metadata, dir_info, add_upa, similar -from .utils import Path, run_command, AclManager from .import_specifications.file_parser import ( ErrorType, FileTypeResolution, parse_import_specifications, ) -from .import_specifications.individual_parsers import parse_csv, parse_tsv, parse_excel from .import_specifications.file_writers import ( write_csv, write_tsv, write_excel, ImportSpecWriteException, ) -from .autodetect.Mappings import CSV, TSV, EXCEL - +from .import_specifications.individual_parsers import parse_csv, parse_tsv, parse_excel +from .metadata import some_metadata, dir_info, add_upa, similar +from .utils import Path, run_command, AclManager logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) routes = web.RouteTableDef() @@ -80,7 +79,7 @@ async def importer_mappings(request: web.Request) -> web.json_response: if len(file_list) == 0: raise web.HTTPBadRequest( text=f"must provide file_list field. 
Your provided qs: {request.query_string}", - ) + ) mappings = AutoDetectUtils.get_mappings(file_list) return web.json_response(data=mappings) @@ -90,11 +89,11 @@ def _file_type_resolver(path: PathPy) -> FileTypeResolution: fi = AutoDetectUtils.get_mappings([str(path)])["fileinfo"][0] # Here we assume that the first entry in the file_ext_type field is the entry # we want. Presumably secondary entries are less general. - ftype = fi['file_ext_type'][0] if fi['file_ext_type'] else None + ftype = fi["file_ext_type"][0] if fi["file_ext_type"] else None if ftype in _IMPSPEC_FILE_TO_PARSER: return FileTypeResolution(parser=_IMPSPEC_FILE_TO_PARSER[ftype]) else: - ext = fi['suffix'] if fi['suffix'] else path.suffix[1:] if path.suffix else path.name + ext = fi["suffix"] if fi["suffix"] else path.suffix[1:] if path.suffix else path.name return FileTypeResolution(unsupported_type=ext) @@ -119,11 +118,14 @@ async def bulk_specification(request: web.Request) -> web.json_response: res = parse_import_specifications( tuple(list(paths)), _file_type_resolver, - lambda e: logging.error("Unexpected error while parsing import specs", exc_info=e)) + lambda e: logging.error("Unexpected error while parsing import specs", exc_info=e), + ) if res.results: types = {dt: result.result for dt, result in res.results.items()} - files = {dt: {"file": str(paths[result.source.file]), "tab": result.source.tab} - for dt, result in res.results.items()} + files = { + dt: {"file": str(paths[result.source.file]), "tab": result.source.tab} + for dt, result in res.results.items() + } return web.json_response({"types": types, "files": files}) errtypes = {e.error for e in res.errors} errtext = json.dumps({"errors": format_import_spec_errors(res.errors, paths)}) @@ -168,18 +170,20 @@ async def write_bulk_specification(request: web.Request) -> web.json_response: # There should be a way to get aiohttp to handle this but I can't find it return _createJSONErrorResponse( f"Required content-type is {_APP_JSON}", - error_class=web.HTTPUnsupportedMediaType) + error_class=web.HTTPUnsupportedMediaType, + ) if not request.content_length: return _createJSONErrorResponse( "The content-length header is required and must be > 0", - error_class=web.HTTPLengthRequired) + error_class=web.HTTPLengthRequired, + ) # No need to check the max content length; the server already does that. 
See tests data = await request.json() - if type(data) != dict: + if not isinstance(data, dict): return _createJSONErrorResponse("The top level JSON element must be a mapping") - folder = data.get('output_directory') - type_ = data.get('output_file_type') - if type(folder) != str: + folder = data.get("output_directory") + type_ = data.get("output_file_type") + if not isinstance(data, dict): return _createJSONErrorResponse("output_directory is required and must be a string") writer = _IMPSPEC_FILE_TO_WRITER.get(type_) if not writer: @@ -205,15 +209,11 @@ async def add_acl_concierge(request: web.Request): user_dir = Path.validate_path(username).full_path concierge_path = f"{Path._CONCIERGE_PATH}/{username}/" aclm = AclManager() - result = aclm.add_acl_concierge( - shared_directory=user_dir, concierge_path=concierge_path + result = aclm.add_acl_concierge(shared_directory=user_dir, concierge_path=concierge_path) + result["msg"] = f"Requesting Globus Perms for the following globus dir: {concierge_path}" + result["link"] = ( + f"https://app.globus.org/file-manager?destination_id={aclm.endpoint_id}&destination_path={concierge_path}" ) - result[ - "msg" - ] = f"Requesting Globus Perms for the following globus dir: {concierge_path}" - result[ - "link" - ] = f"https://app.globus.org/file-manager?destination_id={aclm.endpoint_id}&destination_path={concierge_path}" return web.json_response(result) @@ -260,7 +260,7 @@ async def file_exists(request: web.Request): show_hidden = True else: show_hidden = False - except KeyError as no_query: + except KeyError: show_hidden = False # this scans the entire directory recursively just to see if one file exists... why? results = await dir_info(user_dir, show_hidden, query) @@ -284,9 +284,7 @@ async def list_files(request: web.Request): username = await authorize_request(request) path = Path.validate_path(username, request.match_info.get("path", "")) if not os.path.exists(path.full_path): - raise web.HTTPNotFound( - text="path {path} does not exist".format(path=path.user_path) - ) + raise web.HTTPNotFound(text="path {path} does not exist".format(path=path.user_path)) elif os.path.isfile(path.full_path): raise web.HTTPBadRequest( text="{path} is a file not a directory".format(path=path.full_path) @@ -297,7 +295,7 @@ async def list_files(request: web.Request): show_hidden = True else: show_hidden = False - except KeyError as no_query: + except KeyError: show_hidden = False data = await dir_info(path, show_hidden, recurse=True) return web.json_response(data) @@ -311,17 +309,13 @@ async def download_files(request: web.Request): username = await authorize_request(request) path = Path.validate_path(username, request.match_info.get("path", "")) if not os.path.exists(path.full_path): - raise web.HTTPNotFound( - text="path {path} does not exist".format(path=path.user_path) - ) + raise web.HTTPNotFound(text="path {path} does not exist".format(path=path.user_path)) elif not os.path.isfile(path.full_path): raise web.HTTPBadRequest( text="{path} is a directory not a file".format(path=path.full_path) ) # hard coding the mime type to force download - return web.FileResponse( - path.full_path, headers={"content-type": "application/octet-stream"} - ) + return web.FileResponse(path.full_path, headers={"content-type": "application/octet-stream"}) @routes.get("/similar/{path:.+}") @@ -332,9 +326,7 @@ async def similar_files(request: web.Request): username = await authorize_request(request) path = Path.validate_path(username, request.match_info["path"]) if not 
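+            # the expected file is missing after the write, so report the interrupted upload as a 404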
os.path.exists(path.full_path): - raise web.HTTPNotFound( - text="path {path} does not exist".format(path=path.user_path) - ) + raise web.HTTPNotFound(text="path {path} does not exist".format(path=path.user_path)) elif os.path.isdir(path.full_path): raise web.HTTPBadRequest( text="{path} is a directory not a file".format(path=path.full_path) @@ -370,7 +362,7 @@ async def search(request: web.Request): show_hidden = True else: show_hidden = False - except KeyError as no_query: + except KeyError: show_hidden = False results = await dir_info(user_dir, show_hidden, query) results.sort(key=lambda x: x["mtime"], reverse=True) @@ -386,9 +378,7 @@ async def get_metadata(request: web.Request): username = await authorize_request(request) path = Path.validate_path(username, request.match_info["path"]) if not os.path.exists(path.full_path): - raise web.HTTPNotFound( - text="path {path} does not exist".format(path=path.user_path) - ) + raise web.HTTPNotFound(text="path {path} does not exist".format(path=path.user_path)) return web.json_response(await some_metadata(path)) @@ -416,9 +406,7 @@ async def upload_files_chunked(request: web.Request): counter = 0 user_file = None destPath = None - while ( - counter < 100 - ): # TODO this is arbitrary to keep an attacker from creating infinite loop + while counter < 100: # TODO this is arbitrary to keep an attacker from creating infinite loop # This loop handles the null parts that come in inbetween destpath and file part = await reader.next() @@ -435,19 +423,19 @@ async def upload_files_chunked(request: web.Request): filename: str = user_file.filename if filename.lstrip() != filename: - raise web.HTTPForbidden( # forbidden isn't really the right code, should be 400 + raise web.HTTPForbidden( text="cannot upload file with name beginning with space" - ) + ) # forbidden isn't really the right code, should be 400 if "," in filename: - raise web.HTTPForbidden( # for consistency we use 403 again + raise web.HTTPForbidden( text="cannot upload file with ',' in name" - ) + ) # for consistency we use 403 again # may want to make this configurable if we ever decide to add a hidden files toggle to # the staging area UI if filename.startswith("."): - raise web.HTTPForbidden( # for consistency we use 403 again + raise web.HTTPForbidden( text="cannot upload file with name beginning with '.'" - ) + ) # for consistency we use 403 again size = 0 destPath = os.path.join(destPath, filename) @@ -462,9 +450,8 @@ async def upload_files_chunked(request: web.Request): f.write(chunk) if not os.path.exists(path.full_path): - error_msg = "We are sorry but upload was interrupted. Please try again.".format( - path=path.full_path - ) + # TODO add in path.full_path to error message + error_msg = "We are sorry but the upload was interrupted. Please try again." 
raise web.HTTPNotFound(text=error_msg) response = await some_metadata( @@ -484,21 +471,17 @@ async def define_UPA(request: web.Request): path = Path.validate_path(username, request.match_info["path"]) if not os.path.exists(path.full_path or not os.path.isfile(path.full_path)): # TODO the security model here is to not care if someone wants to put in a false upa - raise web.HTTPNotFound( - text="no file found found on path {}".format(path.user_path) - ) + raise web.HTTPNotFound(text="no file found found on path {}".format(path.user_path)) if not request.has_body: raise web.HTTPBadRequest(text="must provide UPA field in body") body = await request.post() try: UPA = body["UPA"] - except KeyError as wrong_key: + except KeyError: raise web.HTTPBadRequest(text="must provide UPA field in body") await add_upa(path, UPA) return web.Response( - text="succesfully updated UPA {UPA} for file {path}".format( - UPA=UPA, path=path.user_path - ) + text="succesfully updated UPA {UPA} for file {path}".format(UPA=UPA, path=path.user_path) ) @@ -523,9 +506,7 @@ async def delete(request: web.Request): if os.path.exists(path.metadata_path): shutil.rmtree(path.metadata_path) else: - raise web.HTTPNotFound( - text="could not delete {path}".format(path=path.user_path) - ) + raise web.HTTPNotFound(text="could not delete {path}".format(path=path.user_path)) return web.Response(text="successfully deleted {path}".format(path=path.user_path)) @@ -544,7 +525,7 @@ async def rename(request: web.Request): body = await request.post() try: new_path = body["newPath"] - except KeyError as wrong_key: + except KeyError: raise web.HTTPBadRequest(text="must provide newPath field in body") new_path = Path.validate_path(username, new_path) if os.path.exists(path.full_path): @@ -577,13 +558,9 @@ async def decompress(request: web.Request): # 2 could try again after doign an automatic rename scheme (add nubmers to end) # 3 just overwrite and force destination = os.path.dirname(path.full_path) - if ( - upper_file_extension == ".tar" and file_extension == ".gz" - ) or file_extension == ".tgz": + if (upper_file_extension == ".tar" and file_extension == ".gz") or file_extension == ".tgz": await run_command("tar", "xzf", path.full_path, "-C", destination) - elif upper_file_extension == ".tar" and ( - file_extension == ".bz" or file_extension == ".bz2" - ): + elif upper_file_extension == ".tar" and (file_extension == ".bz" or file_extension == ".bz2"): await run_command("tar", "xjf", path.full_path, "-C", destination) elif file_extension == ".zip" or file_extension == ".ZIP": await run_command("unzip", path.full_path, "-d", destination) @@ -594,9 +571,7 @@ async def decompress(request: web.Request): elif file_extension == ".bz2" or file_extension == "bzip2": await run_command("bzip2", "-d", path.full_path) else: - raise web.HTTPBadRequest( - text="cannot decompress a {ext} file".format(ext=file_extension) - ) + raise web.HTTPBadRequest(text="cannot decompress a {ext} file".format(ext=file_extension)) return web.Response(text="succesfully decompressed " + path.user_path) @@ -664,8 +639,8 @@ def inject_config_dependencies(config): # if we start using the file ext type array for anything else this might need changes filetype = val["file_ext_type"][0] extensions[filetype].add(fileext) - for m in val['mappings']: - datatypes[m['id']].add(filetype) + for m in val["mappings"]: + datatypes[m["id"]].add(filetype) global _DATATYPE_MAPPINGS _DATATYPE_MAPPINGS = { "datatype_to_filetype": {k: sorted(datatypes[k]) for k in datatypes}, diff --git 
a/staging_service/app_error_formatter.py b/staging_service/app_error_formatter.py index 3d61fb41..70a67918 100644 --- a/staging_service/app_error_formatter.py +++ b/staging_service/app_error_formatter.py @@ -25,13 +25,13 @@ "type": "cannot_parse_file", "message": msg, "file": file1, - "tab": tab1 + "tab": tab1, }, ErrorType.INCORRECT_COLUMN_COUNT: lambda msg, file1, tab1, file2, tab2: { "type": "incorrect_column_count", "message": msg, "file": file1, - "tab": tab1 + "tab": tab1, }, ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE: lambda msg, file1, tab1, file2, tab2: { "type": "multiple_specifications_for_data_type", @@ -43,7 +43,9 @@ }, } -def format_import_spec_errors(errors: list[Error], path_translations: dict[Path, Path] + +def format_import_spec_errors( + errors: list[Error], path_translations: dict[Path, Path] ) -> list[dict[str, str]]: """ Formats a list of bulk import specification errors into a list of str->str dicts. @@ -67,4 +69,3 @@ def format_import_spec_errors(errors: list[Error], path_translations: dict[Path, tab2 = e.source_2.tab errs.append(_IMPORT_SPEC_ERROR_FORMATTERS[e.error](e.message, file1, tab1, file2, tab2)) return errs - diff --git a/staging_service/auth2Client.py b/staging_service/auth2Client.py index 1f390a5e..414ba946 100644 --- a/staging_service/auth2Client.py +++ b/staging_service/auth2Client.py @@ -4,13 +4,15 @@ @author: gaprice@lbl.gov modified for python3 and authV2 """ + +import hashlib import time as _time + import aiohttp -import hashlib class TokenCache(object): - """ A basic cache for tokens. """ + """A basic cache for tokens.""" _MAX_TIME_SEC = 5 * 60 # 5 min @@ -39,9 +41,7 @@ def add_valid_token(self, token, user, expire_time): token = hashlib.sha256(token.encode("utf8")).hexdigest() self._cache[token] = [user, _time.time(), expire_time] if len(self._cache) > self._maxsize: - for i, (t, _) in enumerate( - sorted(self._cache.items(), key=lambda v: v[1][1]) - ): + for i, (t, _) in enumerate(sorted(self._cache.items(), key=lambda v: v[1][1])): if i <= self._halfmax: del self._cache[t] else: @@ -67,9 +67,7 @@ async def get_user(self, token): if user: return user async with aiohttp.ClientSession() as session: - async with session.get( - self._authurl, headers={"Authorization": token} - ) as resp: + async with session.get(self._authurl, headers={"Authorization": token}) as resp: ret = await resp.json() if not resp.reason == "OK": raise aiohttp.web.HTTPUnauthorized( diff --git a/staging_service/autodetect/GenerateMappings.py b/staging_service/autodetect/GenerateMappings.py index 399bc3c4..f5af89d6 100644 --- a/staging_service/autodetect/GenerateMappings.py +++ b/staging_service/autodetect/GenerateMappings.py @@ -22,9 +22,41 @@ * Note: We should serve the generated content from memory * Note: This doesn't handle if we want to have different output types based on file extensions feeding into the same app """ + from collections import defaultdict -from staging_service.autodetect.Mappings import * +from staging_service.autodetect.Mappings import ( + sra_reads_id, + fastq_reads_noninterleaved_id, + fastq_reads_interleaved_id, + assembly_id, + gff_genome_id, + gff_metagenome_id, + genbank_genome_id, + decompress_id, + sample_set_id, + media_id, + expression_matrix_id, + metabolic_annotations_id, + metabolic_annotations_bulk_id, + fba_model_id, + phenotype_set_id, + escher_map_id, + import_specification, + SRA, + FASTQ, + FASTA, + GENBANK, + GFF, + ZIP, + CSV, + TSV, + EXCEL, + JSON, + SBML, + extension_to_file_format_mapping, + file_format_to_extension_mapping, 
+) # Note that some upload apps are not included - in particular batch apps, which are now # redundant, and MSAs and attribute mappings because they're out of scope at the current time. @@ -53,16 +85,34 @@ file_format_to_app_mapping = {} file_format_to_app_mapping[SRA] = [sra_reads_id] -file_format_to_app_mapping[FASTQ] = [fastq_reads_interleaved_id, fastq_reads_noninterleaved_id] -file_format_to_app_mapping[FASTA] = [assembly_id, gff_genome_id, gff_metagenome_id] +file_format_to_app_mapping[FASTQ] = [ + fastq_reads_interleaved_id, + fastq_reads_noninterleaved_id, +] +file_format_to_app_mapping[FASTA] = [ + assembly_id, + gff_genome_id, + gff_metagenome_id, +] file_format_to_app_mapping[GENBANK] = [genbank_genome_id] file_format_to_app_mapping[GFF] = [gff_genome_id, gff_metagenome_id] file_format_to_app_mapping[ZIP] = [decompress_id] file_format_to_app_mapping[CSV] = [sample_set_id, import_specification] file_format_to_app_mapping[TSV] = [ - media_id, expression_matrix_id, metabolic_annotations_id, metabolic_annotations_bulk_id, - fba_model_id, phenotype_set_id, import_specification] -file_format_to_app_mapping[EXCEL] = [sample_set_id, media_id, fba_model_id, import_specification] + media_id, + expression_matrix_id, + metabolic_annotations_id, + metabolic_annotations_bulk_id, + fba_model_id, + phenotype_set_id, + import_specification, +] +file_format_to_app_mapping[EXCEL] = [ + sample_set_id, + media_id, + fba_model_id, + import_specification, +] file_format_to_app_mapping[JSON] = [escher_map_id] file_format_to_app_mapping[SBML] = [fba_model_id] @@ -88,7 +138,6 @@ # and 1 being a perfect weight score of 100% extensions_mapping = {} for app_id in app_id_to_extensions: - perfect_match_weight = 1 for extension in app_id_to_extensions[app_id]: if extension not in extensions_mapping: @@ -98,7 +147,7 @@ # detection. For backwards compatibilily, we'd leave the current FASTQ type and # add a FASTQ-FWD or FWD type or something. 
"file_ext_type": [extension_to_file_format_mapping[extension]], - "mappings": [] + "mappings": [], } extensions_mapping[extension]["mappings"].append( { @@ -116,6 +165,7 @@ # this is currently unused by the code base, but we include it to make it easy to # see what file extensions are registered for each app "app_to_ext": app_id_to_extensions, - "types": extensions_mapping} + "types": extensions_mapping, + } with open("supported_apps_w_extensions.json", "w") as f: json.dump(obj=data, fp=f, indent=2) diff --git a/staging_service/autodetect/Mappings.py b/staging_service/autodetect/Mappings.py index b30d128a..51d27091 100644 --- a/staging_service/autodetect/Mappings.py +++ b/staging_service/autodetect/Mappings.py @@ -55,15 +55,23 @@ attribute_mapping_id = "attribute_mapping" escher_map_id = "escher_map" + def _flatten(some_list): return list(itertools.chain.from_iterable(some_list)) -_COMPRESSION_EXT = ["", ".gz", ".gzip"] # empty string to keep the uncompressed extension + +_COMPRESSION_EXT = [ + "", + ".gz", + ".gzip", +] # empty string to keep the uncompressed extension + # longer term there's probably a better way to do this but this is quick def _add_gzip(extension_list): return _flatten([[ext + comp for comp in _COMPRESSION_EXT] for ext in extension_list]) + file_format_to_extension_mapping = { FASTA: _add_gzip(["fna", "fa", "faa", "fsa", "fasta"]), FASTQ: _add_gzip(["fq", "fastq"]), @@ -88,8 +96,8 @@ def _add_gzip(extension_list): # "phylip", # "stockholm", # ], - TSV: ["tsv"], # See Note 1 below - CSV: ["csv"], # See Note 1 below + TSV: ["tsv"], # See Note 1 below + CSV: ["csv"], # See Note 1 below JSON: ["json"], EXCEL: ["xls", "xlsx"], # See Note 1 below ZIP: ["zip", "tar", "tgz", "tar.gz", "7z", "gz", "gzip", "rar"], @@ -105,4 +113,4 @@ def _add_gzip(extension_list): if ext in extension_to_file_format_mapping: type2 = extension_to_file_format_mapping[ext] raise ValueError(f"Duplicate entry for extension {ext} in {type_} and {type2}") - extension_to_file_format_mapping[ext] = type_ \ No newline at end of file + extension_to_file_format_mapping[ext] = type_ diff --git a/staging_service/globus.py b/staging_service/globus.py index 54b41fb4..4838fbef 100644 --- a/staging_service/globus.py +++ b/staging_service/globus.py @@ -1,8 +1,10 @@ -from .utils import Path -import aiohttp -import aiofiles -import os import configparser +import os + +import aiofiles +import aiohttp + +from .utils import Path def _get_authme_url(): @@ -45,7 +47,7 @@ def is_globusid(path: Path, username: str): async def assert_globusid_exists(username, token): - """ ensures that a globus id exists if there is a valid one for user""" + """ensures that a globus id exists if there is a valid one for user""" # make root dir root = Path.validate_path(username, "") diff --git a/staging_service/import_specifications/file_parser.py b/staging_service/import_specifications/file_parser.py index 68dcaa6b..c9cf7651 100644 --- a/staging_service/import_specifications/file_parser.py +++ b/staging_service/import_specifications/file_parser.py @@ -6,10 +6,11 @@ from collections.abc import Callable from dataclasses import dataclass from enum import Enum +from pathlib import Path +from typing import Union, Optional as Opt + # TODO update to C impl when fixed: https://github.com/Marco-Sulla/python-frozendict/issues/26 from frozendict.core import frozendict -from pathlib import Path -from typing import Union, Optional as O # TODO should get mypy working at some point @@ -20,6 +21,7 @@ class ErrorType(Enum): """ The type of an error 
encountered when trying to parse import specifications. """ + FILE_NOT_FOUND = 1 PARSE_FAIL = 2 INCORRECT_COLUMN_COUNT = 3 @@ -37,27 +39,33 @@ class SpecificationSource: tab - the name of the spreadsheet file tab from which the import specification was obtained, if any. """ + file: Path - tab: O[str] = None + tab: Opt[str] = None def __post_init__(self): if not self.file: raise ValueError("file is required") -_ERR_MESSAGE = 'message' -_ERR_SOURCE_1 = 'source_1' -_ERR_SOURCE_2 = 'source_2' +_ERR_MESSAGE = "message" +_ERR_SOURCE_1 = "source_1" +_ERR_SOURCE_2 = "source_2" _ERRTYPE_TO_REQ_ARGS = { ErrorType.FILE_NOT_FOUND: (_ERR_SOURCE_1,), ErrorType.PARSE_FAIL: (_ERR_MESSAGE, _ERR_SOURCE_1), ErrorType.INCORRECT_COLUMN_COUNT: (_ERR_MESSAGE, _ERR_SOURCE_1), - ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE: (_ERR_MESSAGE, _ERR_SOURCE_1, _ERR_SOURCE_2), + ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE: ( + _ERR_MESSAGE, + _ERR_SOURCE_1, + _ERR_SOURCE_2, + ), ErrorType.NO_FILES_PROVIDED: tuple(), ErrorType.OTHER: (_ERR_MESSAGE,), } + @dataclass(frozen=True) class Error: f""" @@ -80,9 +88,9 @@ class Error: """ error: ErrorType - message: O[str] = None - source_1: O[SpecificationSource] = None - source_2: O[SpecificationSource] = None + message: Opt[str] = None + source_1: Opt[SpecificationSource] = None + source_2: Opt[SpecificationSource] = None def __post_init__(self): if not self.error: @@ -111,6 +119,7 @@ class ParseResult: expected that the class creator do that error checking. Users should use the parse_import_specifications method to create instances of this class. """ + source: SpecificationSource result: tuple[frozendict[str, PRIMITIVE_TYPE], ...] @@ -137,8 +146,9 @@ class ParseResults: expected that the class creator do that error checking. Users should use the parse_import_specifications method to create an instance of this class. """ - results: O[frozendict[str, ParseResult]] = None - errors: O[tuple[Error, ...]] = None + + results: Opt[frozendict[str, ParseResult]] = None + errors: Opt[tuple[Error, ...]] = None def __post_init__(self): if not (bool(self.results) ^ bool(self.errors)): # xnor @@ -156,8 +166,9 @@ class FileTypeResolution: parser - a parser for the file unsupported_type - the file type if the type is not a supported type. """ - parser: O[Callable[[Path], ParseResults]] = None - unsupported_type: O[str] = None + + parser: Opt[Callable[[Path], ParseResults]] = None + unsupported_type: Opt[str] = None def __post_init__(self): if not (bool(self.parser) ^ bool(self.unsupported_type)): # xnor @@ -167,7 +178,7 @@ def __post_init__(self): def parse_import_specifications( paths: tuple[Path, ...], file_type_resolver: Callable[[Path], FileTypeResolution], - log_error: Callable[[Exception], None] + log_error: Callable[[Exception], None], ) -> ParseResults: """ Parse a set of import specification files and return the results. 
@@ -187,6 +198,7 @@ def parse_import_specifications( errors = tuple([Error(ErrorType.OTHER, str(e))]) return ParseResults(errors=errors) + def _parse( paths: tuple[Path, ...], file_type_resolver: Callable[[Path], FileTypeResolution], @@ -196,12 +208,14 @@ def _parse( for p in paths: file_type = file_type_resolver(p) if file_type.unsupported_type: - errors.append(Error( - ErrorType.PARSE_FAIL, - f"{file_type.unsupported_type} " + errors.append( + Error( + ErrorType.PARSE_FAIL, + f"{file_type.unsupported_type} " + "is not a supported file type for import specifications", - SpecificationSource(p) - )) + SpecificationSource(p), + ) + ) continue res = file_type.parser(p) if res.errors: @@ -209,12 +223,14 @@ def _parse( else: for data_type in res.results: if data_type in results: - errors.append(Error( - ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE, - f"Data type {data_type} appears in two importer specification sources", - results[data_type].source, - res.results[data_type].source - )) + errors.append( + Error( + ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE, + f"Data type {data_type} appears in two importer specification sources", + results[data_type].source, + res.results[data_type].source, + ) + ) else: results[data_type] = res.results[data_type] if errors: diff --git a/staging_service/import_specifications/file_writers.py b/staging_service/import_specifications/file_writers.py index 160b32e2..d18efeb5 100644 --- a/staging_service/import_specifications/file_writers.py +++ b/staging_service/import_specifications/file_writers.py @@ -25,18 +25,19 @@ Leave the `data` list empty to write an empty template. :returns: A mapping of the data types to the files to which they were written. """ + # note that we can't use an f string here to interpolate the variables below, e.g. # order_and_display, etc. import collections import csv import numbers +from pathlib import Path +from typing import Any from openpyxl import Workbook -from openpyxl.worksheet.worksheet import Worksheet from openpyxl.utils import get_column_letter -from pathlib import Path -from typing import Any +from openpyxl.worksheet.worksheet import Worksheet # this version is synonymous to the versions in individual_parsers.py. However, this module # should only ever write the most recent format for import specifictions, while the parsers @@ -60,6 +61,7 @@ _SEP_CSV = "," _SEP_TSV = "\t" + def _check_import_specification(types: dict[str, dict[str, list[Any]]]): f""" Check the structure of an import specification data structure. If the input is empty the @@ -87,24 +89,24 @@ def _check_import_specification(types: dict[str, dict[str, list[Any]]]): # replace this with jsonschema? 
don't worry about it for now _check_string(datatype, "A data type") spec = types[datatype] - if type(spec) != dict: + if not isinstance(spec, dict): # noqa: E721 raise ImportSpecWriteException(f"The value for data type {datatype} must be a mapping") if _ORDER_AND_DISPLAY not in spec: - raise ImportSpecWriteException( - f"Data type {datatype} missing {_ORDER_AND_DISPLAY} key") + raise ImportSpecWriteException(f"Data type {datatype} missing {_ORDER_AND_DISPLAY} key") _check_is_sequence( - spec[_ORDER_AND_DISPLAY], f"Data type {datatype} {_ORDER_AND_DISPLAY} value") + spec[_ORDER_AND_DISPLAY], f"Data type {datatype} {_ORDER_AND_DISPLAY} value" + ) if not len(spec[_ORDER_AND_DISPLAY]): raise ImportSpecWriteException( - f"At least one entry is required for {_ORDER_AND_DISPLAY} for type {datatype}") + f"At least one entry is required for {_ORDER_AND_DISPLAY} for type {datatype}" + ) if _DATA not in spec: raise ImportSpecWriteException(f"Data type {datatype} missing {_DATA} key") _check_is_sequence(spec[_DATA], f"Data type {datatype} {_DATA} value") param_ids = set() for i, id_display in enumerate(spec[_ORDER_AND_DISPLAY]): - err = (f"Invalid {_ORDER_AND_DISPLAY} entry for datatype {datatype} " - + f"at index {i} ") + err = f"Invalid {_ORDER_AND_DISPLAY} entry for datatype {datatype} " + f"at index {i} " _check_is_sequence(id_display, err + "- the entry") if len(id_display) != 2: raise ImportSpecWriteException(err + "- expected 2 item list") @@ -114,21 +116,24 @@ def _check_import_specification(types: dict[str, dict[str, list[Any]]]): param_ids.add(pid) for i, datarow in enumerate(spec[_DATA]): err = f"Data type {datatype} {_DATA} row {i}" - if type(datarow) != dict: + if not isinstance(datarow, dict): raise ImportSpecWriteException(err + " is not a mapping") if datarow.keys() != param_ids: raise ImportSpecWriteException( - err + f" does not have the same keys as {_ORDER_AND_DISPLAY}") + err + f" does not have the same keys as {_ORDER_AND_DISPLAY}" + ) for pid, v in datarow.items(): if v is not None and not isinstance(v, numbers.Number) and not isinstance(v, str): raise ImportSpecWriteException( - err + f"'s value for parameter {pid} is not a number or a string") + err + f"'s value for parameter {pid} is not a number or a string" + ) def _check_string(tocheck: Any, errprefix: str): if not isinstance(tocheck, str) or not tocheck.strip(): raise ImportSpecWriteException( - errprefix + " cannot be a non-string or a whitespace only string") + errprefix + " cannot be a non-string or a whitespace only string" + ) def _check_is_sequence(tocheck: Any, errprefix: str): @@ -159,10 +164,14 @@ def _write_xsv(folder: Path, types: dict[str, dict[str, list[Any]]], ext: str, s filename = datatype + "." 
+ ext dt = types[datatype] cols = len(dt[_ORDER_AND_DISPLAY]) - with open(folder / filename, "w", newline='') as f: + with open(folder / filename, "w", newline="") as f: csvw = csv.writer(f, delimiter=sep) # handle sep escaping - csvw.writerow([f"{_DATA_TYPE} {datatype}{_HEADER_SEP} " - + f"{_COLUMN_STR} {cols}{_HEADER_SEP} {_VERSION_STR} {_VERSION}"]) + csvw.writerow( + [ + f"{_DATA_TYPE} {datatype}{_HEADER_SEP} " + + f"{_COLUMN_STR} {cols}{_HEADER_SEP} {_VERSION_STR} {_VERSION}" + ] + ) pids = [i[0] for i in dt[_ORDER_AND_DISPLAY]] csvw.writerow(pids) csvw.writerow([i[1] for i in dt[_ORDER_AND_DISPLAY]]) @@ -177,7 +186,8 @@ def _check_write_args(folder: Path, types: dict[str, dict[str, list[Any]]]): # this is a programming error, not a user input error, so not using the custom # exception here raise ValueError("The folder cannot be null") - if type(types) != dict: + # This is checking the type hierarchy, do not use isinstance here + if type(types) != dict: # noqa: E721 raise ImportSpecWriteException("The types value must be a mapping") _check_import_specification(types) @@ -203,8 +213,10 @@ def write_excel(folder: Path, types: dict[str, dict[str, list[Any]]]) -> dict[st _write_excel_row(sheet, xlrow, [row[pid] for pid in pids]) _expand_excel_columns_to_max_width(sheet) # Add the hidden data *after* expanding the columns - sheet['A1'] = (f"{_DATA_TYPE} {datatype}{_HEADER_SEP} " - + f"{_COLUMN_STR} {cols}{_HEADER_SEP} {_VERSION_STR} {_VERSION}") + sheet["A1"] = ( + f"{_DATA_TYPE} {datatype}{_HEADER_SEP} " + + f"{_COLUMN_STR} {cols}{_HEADER_SEP} {_VERSION_STR} {_VERSION}" + ) _write_excel_row(sheet, 2, pids) sheet.row_dimensions[1].hidden = True sheet.row_dimensions[2].hidden = True @@ -220,6 +232,7 @@ def _write_excel_row(sheet: Worksheet, row: int, contents: list[Any]): for col, val in enumerate(contents, start=1): sheet.cell(row=row, column=col).value = val + def _expand_excel_columns_to_max_width(sheet: Worksheet): # https://stackoverflow.com/a/40935194/643675 for column_cells in sheet.columns: @@ -235,4 +248,5 @@ class ImportSpecWriteException(Exception): """ An exception thrown when writing an import specification fails. 
""" + pass diff --git a/staging_service/import_specifications/individual_parsers.py b/staging_service/import_specifications/individual_parsers.py index 61bf84b5..8b6ffeff 100644 --- a/staging_service/import_specifications/individual_parsers.py +++ b/staging_service/import_specifications/individual_parsers.py @@ -3,15 +3,16 @@ """ import csv -import magic import math -import pandas import re +from pathlib import Path +from typing import Optional as Opt, Union, Any + +import magic +import pandas # TODO update to C impl when fixed: https://github.com/Marco-Sulla/python-frozendict/issues/26 from frozendict.core import frozendict -from pathlib import Path -from typing import Optional as O, Union, Any from staging_service.import_specifications.file_parser import ( PRIMITIVE_TYPE, @@ -31,10 +32,14 @@ _VERSION_STR = "Version:" _COLUMN_STR = "Columns:" _HEADER_SEP = ";" -_EXPECTED_HEADER = (f"{_DATA_TYPE} {_HEADER_SEP} " - + f"{_COLUMN_STR} {_HEADER_SEP} {_VERSION_STR} ") -_HEADER_REGEX = re.compile(f"{_DATA_TYPE} (\\w+){_HEADER_SEP} " - + f"{_COLUMN_STR} (\\d+){_HEADER_SEP} {_VERSION_STR} (\\d+)") +_EXPECTED_HEADER = ( + f"{_DATA_TYPE} {_HEADER_SEP} " + + f"{_COLUMN_STR} {_HEADER_SEP} {_VERSION_STR} " +) +_HEADER_REGEX = re.compile( + f"{_DATA_TYPE} (\\w+){_HEADER_SEP} " + + f"{_COLUMN_STR} (\\d+){_HEADER_SEP} {_VERSION_STR} (\\d+)" +) _MAGIC_TEXT_FILES = {"text/plain", "inode/x-empty", "application/csv", "text/csv"} @@ -60,24 +65,29 @@ class _ParseException(Exception): pass -def _parse_header(header: str, spec_source: SpecificationSource, maximum_version: int +def _parse_header( + header: str, spec_source: SpecificationSource, maximum_version: int ) -> tuple[str, int]: # return is (data type, column count) match = _HEADER_REGEX.fullmatch(header) if not match: - raise _ParseException(Error( - ErrorType.PARSE_FAIL, - f'Invalid header; got "{header}", expected "{_EXPECTED_HEADER}"', - spec_source - )) + raise _ParseException( + Error( + ErrorType.PARSE_FAIL, + f'Invalid header; got "{header}", expected "{_EXPECTED_HEADER}"', + spec_source, + ) + ) version = int(match[3]) if version > maximum_version: - raise _ParseException(Error( - ErrorType.PARSE_FAIL, - f"Schema version {version} is larger than maximum processable " - + f"version {maximum_version}", - spec_source - )) + raise _ParseException( + Error( + ErrorType.PARSE_FAIL, + f"Schema version {version} is larger than maximum processable " + + f"version {maximum_version}", + spec_source, + ) + ) return match[1], int(match[2]) @@ -86,23 +96,26 @@ def _csv_next( line_number: int, expected_line_count: Union[None, int], # None = skip columns check spec_source: SpecificationSource, - error: str + error: str, ) -> list[str]: try: line = next(input_) except StopIteration: raise _ParseException(Error(ErrorType.PARSE_FAIL, error, spec_source)) if expected_line_count and len(line) != expected_line_count: - raise _ParseException(Error( - ErrorType.INCORRECT_COLUMN_COUNT, - f"Incorrect number of items in line {line_number}, " - + f"expected {expected_line_count}, got {len(line)}", - spec_source)) + raise _ParseException( + Error( + ErrorType.INCORRECT_COLUMN_COUNT, + f"Incorrect number of items in line {line_number}, " + + f"expected {expected_line_count}, got {len(line)}", + spec_source, + ) + ) return line def _error(error: Error) -> ParseResults: - return ParseResults(errors = tuple([error])) + return ParseResults(errors=tuple([error])) def _normalize_pandas(val: PRIMITIVE_TYPE) -> PRIMITIVE_TYPE: @@ -141,18 +154,22 @@ def _normalize_headers( ret = 
[str(s).strip() if not pandas.isna(s) else None for s in headers] for i, name in enumerate(ret, start=1): if not name: - raise _ParseException(Error( - ErrorType.PARSE_FAIL, - f"Missing header entry in row {line_number}, position {i}", - spec_source - )) + raise _ParseException( + Error( + ErrorType.PARSE_FAIL, + f"Missing header entry in row {line_number}, position {i}", + spec_source, + ) + ) if name in seen: - raise _ParseException(Error( - ErrorType.PARSE_FAIL, - f"Duplicate header name in row {line_number}: {name}", - spec_source - )) + raise _ParseException( + Error( + ErrorType.PARSE_FAIL, + f"Duplicate header name in row {line_number}: {name}", + spec_source, + ) + ) seen.add(name) return ret @@ -163,7 +180,7 @@ def _parse_xsv(path: Path, sep: str) -> ParseResults: filetype = magic.from_file(str(path), mime=True) if filetype not in _MAGIC_TEXT_FILES: return _error(Error(ErrorType.PARSE_FAIL, "Not a text file: " + filetype, spcsrc)) - with open(path, newline='') as input_: + with open(path, newline="") as input_: rdr = csv.reader(input_, delimiter=sep) # let parser handle quoting dthd = _csv_next(rdr, 1, None, spcsrc, "Missing data type / version header") datatype, columns = _parse_header(dthd[0], spcsrc, _VERSION) @@ -176,19 +193,20 @@ def _parse_xsv(path: Path, sep: str) -> ParseResults: if len(row) != columns: # could collect errors (first 10?) and throw an exception with a list # lets wait and see if that's really needed - raise _ParseException(Error( - ErrorType.INCORRECT_COLUMN_COUNT, - f"Incorrect number of items in line {i}, " - + f"expected {columns}, got {len(row)}", - spcsrc)) - results.append(frozendict( - {param_ids[j]: _normalize_xsv(row[j]) for j in range(len(row))})) + raise _ParseException( + Error( + ErrorType.INCORRECT_COLUMN_COUNT, + f"Incorrect number of items in line {i}, " + + f"expected {columns}, got {len(row)}", + spcsrc, + ) + ) + results.append( + frozendict({param_ids[j]: _normalize_xsv(row[j]) for j in range(len(row))}) + ) if not results: - raise _ParseException(Error( - ErrorType.PARSE_FAIL, "No non-header data in file", spcsrc)) - return ParseResults(frozendict( - {datatype: ParseResult(spcsrc, tuple(results))} - )) + raise _ParseException(Error(ErrorType.PARSE_FAIL, "No non-header data in file", spcsrc)) + return ParseResults(frozendict({datatype: ParseResult(spcsrc, tuple(results))})) except FileNotFoundError: return _error(Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc)) except IsADirectoryError: @@ -198,12 +216,12 @@ def _parse_xsv(path: Path, sep: str) -> ParseResults: def parse_csv(path: Path) -> ParseResults: - """ Parse the provided CSV file. """ + """Parse the provided CSV file.""" return _parse_xsv(path, ",") def parse_tsv(path: Path) -> ParseResults: - """ Parse the provided TSV file. """ + """Parse the provided TSV file.""" return _parse_xsv(path, "\t") @@ -214,16 +232,20 @@ def _process_excel_row( if pandas.isna(row[-1]): # inefficient, but premature optimization... 
row = row[0:-1] else: - raise _ParseException(Error( - ErrorType.INCORRECT_COLUMN_COUNT, - f"Incorrect number of items in line {rownum}, " - + f"expected {expected_columns}, got {len(row)}", - spcsrc)) + raise _ParseException( + Error( + ErrorType.INCORRECT_COLUMN_COUNT, + f"Incorrect number of items in line {rownum}, " + + f"expected {expected_columns}, got {len(row)}", + spcsrc, + ) + ) return row - -def _process_excel_tab(excel: pandas.ExcelFile, spcsrc: SpecificationSource -) -> (O[str], O[ParseResult]): + +def _process_excel_tab( + excel: pandas.ExcelFile, spcsrc: SpecificationSource +) -> (Opt[str], Opt[ParseResult]): df = excel.parse(sheet_name=spcsrc.tab, na_values=_EXCEL_MISSING_VALUES, keep_default_na=False) if df.shape[0] < 3: # might as well not error check headers in sheets with no data return (None, None) @@ -239,8 +261,9 @@ def _process_excel_tab(excel: pandas.ExcelFile, spcsrc: SpecificationSource for i, row in enumerate(it, start=4): row = _process_excel_row(row, i, columns, spcsrc) if any(map(lambda x: not pandas.isna(x), row)): # skip empty rows - results.append(frozendict( - {param_ids[j]: _normalize_pandas(row[j]) for j in range(len(row))})) + results.append( + frozendict({param_ids[j]: _normalize_pandas(row[j]) for j in range(len(row))}) + ) return datatype, ParseResult(spcsrc, tuple(results)) @@ -262,12 +285,14 @@ def parse_excel(path: Path) -> ParseResults: if not datatype: continue elif datatype in results: - errors.append(Error( - ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE, - f"Found datatype {datatype} in multiple tabs", - SpecificationSource(path, datatype_to_tab[datatype]), - spcsrc_tab, - )) + errors.append( + Error( + ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE, + f"Found datatype {datatype} in multiple tabs", + SpecificationSource(path, datatype_to_tab[datatype]), + spcsrc_tab, + ) + ) else: datatype_to_tab[datatype] = tab results[datatype] = result @@ -279,8 +304,13 @@ def parse_excel(path: Path) -> ParseResults: return _error(Error(ErrorType.PARSE_FAIL, "The given path is a directory", spcsrc)) except ValueError as e: if "Excel file format cannot be determined" in str(e): - return _error(Error( - ErrorType.PARSE_FAIL, "Not a supported Excel file type", source_1=spcsrc)) + return _error( + Error( + ErrorType.PARSE_FAIL, + "Not a supported Excel file type", + source_1=spcsrc, + ) + ) raise e # bail out, not sure what's wrong, not sure how to test either if errors: return ParseResults(errors=tuple(errors)) @@ -288,4 +318,3 @@ def parse_excel(path: Path) -> ParseResults: return ParseResults(frozendict(results)) else: return _error(Error(ErrorType.PARSE_FAIL, "No non-header data in file", spcsrc)) - \ No newline at end of file diff --git a/staging_service/metadata.py b/staging_service/metadata.py index 110a7716..64820bdc 100644 --- a/staging_service/metadata.py +++ b/staging_service/metadata.py @@ -1,11 +1,12 @@ +import hashlib +import os +from difflib import SequenceMatcher from json import JSONDecoder, JSONEncoder + import aiofiles -from .utils import run_command, Path -import os from aiohttp import web -import hashlib -from difflib import SequenceMatcher -from itertools import islice + +from .utils import Path decoder = JSONDecoder() encoder = JSONEncoder() @@ -52,7 +53,7 @@ async def _generate_metadata(path: Path, source: str): data["source"] = source try: md5 = hashlib.md5(open(path.full_path, "rb").read()).hexdigest() - except: + except Exception: md5 = "n/a" data["md5"] = md5.split()[0] @@ -61,7 +62,7 @@ async def _generate_metadata(path: Path, 
source: str): try: # all things that expect a text file to decode output should be in this block data["head"] = _file_read_from_head(path.full_path) data["tail"] = _file_read_from_tail(path.full_path) - except: + except Exception: data["head"] = "not text file" data["tail"] = "not text file" async with aiofiles.open(path.metadata_path, mode="w") as f: @@ -100,7 +101,7 @@ async def _only_source(path: Path): try: data = await extant.read() data = decoder.decode(data) - except: + except Exception: data = {} else: data = {} @@ -114,9 +115,7 @@ async def _only_source(path: Path): return data["source"] -async def dir_info( - path: Path, show_hidden: bool, query: str = "", recurse=True -) -> list: +async def dir_info(path: Path, show_hidden: bool, query: str = "", recurse=True) -> list: """ only call this on a validated full path """ @@ -125,15 +124,13 @@ async def dir_info( specific_path = Path.from_full_path(entry.path) # maybe should never show the special .globus_id file ever? # or moving it somewhere outside the user directory would be better - if not show_hidden and entry.name.startswith('.'): + if not show_hidden and entry.name.startswith("."): continue if entry.is_dir(): if query == "" or specific_path.user_path.find(query) != -1: response.append(await stat_data(specific_path)) if recurse: - response.extend( - await dir_info(specific_path, show_hidden, query, recurse) - ) + response.extend(await dir_info(specific_path, show_hidden, query, recurse)) if entry.is_file(): if query == "" or specific_path.user_path.find(query) != -1: data = await stat_data(specific_path) diff --git a/staging_service/utils.py b/staging_service/utils.py index 79841d65..8a33f9e2 100644 --- a/staging_service/utils.py +++ b/staging_service/utils.py @@ -98,21 +98,13 @@ def __init__(self): client = globus_sdk.NativeAppAuthClient(cf["client_id"]) try: - transfer_authorizer = globus_sdk.RefreshTokenAuthorizer( - cf["transfer_token"], client - ) - self.globus_transfer_client = globus_sdk.TransferClient( - authorizer=transfer_authorizer - ) - auth_authorizer = globus_sdk.RefreshTokenAuthorizer( - cf["auth_token"], client - ) + transfer_authorizer = globus_sdk.RefreshTokenAuthorizer(cf["transfer_token"], client) + self.globus_transfer_client = globus_sdk.TransferClient(authorizer=transfer_authorizer) + auth_authorizer = globus_sdk.RefreshTokenAuthorizer(cf["auth_token"], client) self.globus_auth_client = globus_sdk.AuthClient(authorizer=auth_authorizer) except globus_sdk.GlobusAPIError as error: logging.error(str(error.code) + error.raw_text) - raise HTTPInternalServerError( - text=str("Invalid Token Specified in globus.cfg file") - ) + raise HTTPInternalServerError(text=str("Invalid Token Specified in globus.cfg file")) def _get_globus_identities(self, shared_directory: str): """ @@ -122,18 +114,14 @@ def _get_globus_identities(self, shared_directory: str): globus_id_filename = "{}.globus_id".format(shared_directory) with open(globus_id_filename, "r") as fp: ident = fp.read() - return self.globus_auth_client.get_identities( - usernames=ident.split("\n")[0] - ) + return self.globus_auth_client.get_identities(usernames=ident.split("\n")[0]) def _get_globus_identity(self, globus_id_filename: str): """ Get the first identity for the username in the .globus_id file """ try: - return self._get_globus_identities(globus_id_filename)["identities"][0][ - "id" - ] + return self._get_globus_identities(globus_id_filename)["identities"][0]["id"] except FileNotFoundError as error: response = { "success": False, @@ -167,7 +155,7 @@ def 
_add_acl(self, user_identity_id: str, shared_directory_basename: str): Attempt to add acl for the given user id and directory """ try: - resp = self.globus_transfer_client.add_endpoint_acl_rule( + self.globus_transfer_client.add_endpoint_acl_rule( self.endpoint_id, dict( DATA_TYPE="access", @@ -186,9 +174,7 @@ def _add_acl(self, user_identity_id: str, shared_directory_basename: str): } logging.info(response) - logging.info( - "Shared %s with %s\n" % (shared_directory_basename, user_identity_id) - ) + logging.info("Shared %s with %s\n" % (shared_directory_basename, user_identity_id)) logging.info(response) return response @@ -205,18 +191,14 @@ def _add_acl(self, user_identity_id: str, shared_directory_basename: str): if error.code == "Exists": raise HTTPOk(text=json.dumps(response), content_type="application/json") - raise HTTPInternalServerError( - text=json.dumps(response), content_type="application/json" - ) + raise HTTPInternalServerError(text=json.dumps(response), content_type="application/json") def _remove_acl(self, user_identity_id: str): """ Get all ACLS and attempt to remove the correct ACL for the given user_identity """ try: - acls = self.globus_transfer_client.endpoint_acl_list(self.endpoint_id)[ - "DATA" - ] + acls = self.globus_transfer_client.endpoint_acl_list(self.endpoint_id)["DATA"] for acl in acls: if user_identity_id == acl["principal"]: if "id" in acl and acl["id"] is not None: @@ -240,7 +222,7 @@ def _remove_acl(self, user_identity_id: str): text=json.dumps(response), content_type="application/json" ) - except globus_sdk.GlobusAPIError as error: + except globus_sdk.GlobusAPIError: response = { "success": False, "error_type": "GlobusAPIError", diff --git a/tests/import_specifications/test_file_parser.py b/tests/import_specifications/test_file_parser.py index 857be960..ecb49ddf 100644 --- a/tests/import_specifications/test_file_parser.py +++ b/tests/import_specifications/test_file_parser.py @@ -1,13 +1,13 @@ # not much point in testing data classes unless there's custom logic in them from collections.abc import Callable -from frozendict import frozendict -from pytest import raises -from tests.test_utils import assert_exception_correct from pathlib import Path -from typing import Optional as O +from typing import Optional as Opt from unittest.mock import Mock, call +from frozendict import frozendict +from pytest import raises + from staging_service.import_specifications.file_parser import ( PRIMITIVE_TYPE, ErrorType, @@ -16,11 +16,12 @@ Error, ParseResult, ParseResults, - parse_import_specifications + parse_import_specifications, ) +from tests.test_utils import assert_exception_correct -def spcsrc(path: str, tab: O[str]=None): +def spcsrc(path: str, tab: Opt[str] = None): return SpecificationSource(Path(path), tab) @@ -43,17 +44,19 @@ def test_SpecificationSource_init_fail(): specificationSource_init_fail(None, ValueError("file is required")) -def specificationSource_init_fail(file_: O[str], expected: Exception): +def specificationSource_init_fail(file_: Opt[str], expected: Exception): with raises(Exception) as got: SpecificationSource(file_) assert_exception_correct(got.value, expected) def test_FileTypeResolution_init_w_parser_success(): - p = lambda path: ParseResults(errors=(Error(ErrorType.OTHER, "foo"),)) + def p(path): + return ParseResults(errors=(Error(ErrorType.OTHER, "foo"),)) + ftr = FileTypeResolution(p) - assert ftr.parser is p # Here only identity equality makes sense + assert ftr.parser is p # Check for identity equality assert ftr.unsupported_type is 
None @@ -72,9 +75,9 @@ def test_FileTypeResolution_init_fail(): def fileTypeResolution_init_fail( - parser: O[Callable[[Path], ParseResults]], - unexpected_type: O[str], - expected: Exception + parser: Opt[Callable[[Path], ParseResults]], + unexpected_type: Opt[str], + expected: Exception, ): with raises(Exception) as got: FileTypeResolution(parser, unexpected_type) @@ -119,7 +122,10 @@ def test_Error_init_w_INCORRECT_COLUMN_COUNT_success(): def test_Error_init_w_MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE_success(): e = Error( - ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE, "foo", spcsrc("foo2"), spcsrc("yay") + ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE, + "foo", + spcsrc("foo2"), + spcsrc("yay"), ) assert e.error == ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE @@ -158,8 +164,13 @@ def test_Error_init_w_OTHER_success(): def test_Error_init_fail(): # arguments are error type, message string, 1st source, 2nd source, exception error_init_fail(None, None, None, None, ValueError("error is required")) - error_init_fail(ErrorType.FILE_NOT_FOUND, None, None, None, ValueError( - "source_1 is required for a FILE_NOT_FOUND error")) + error_init_fail( + ErrorType.FILE_NOT_FOUND, + None, + None, + None, + ValueError("source_1 is required for a FILE_NOT_FOUND error"), + ) err = "message, source_1 is required for a PARSE_FAIL error" error_init_fail(ErrorType.PARSE_FAIL, None, spcsrc("wooo"), None, ValueError(err)) error_init_fail(ErrorType.PARSE_FAIL, "msg", None, None, ValueError(err)) @@ -167,22 +178,28 @@ def test_Error_init_fail(): error_init_fail(ErrorType.INCORRECT_COLUMN_COUNT, None, spcsrc("whee"), None, ValueError(err)) error_init_fail(ErrorType.INCORRECT_COLUMN_COUNT, "msg", None, None, ValueError(err)) ms = ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE - err = ("message, source_1, source_2 is required for a " - + "MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE error") + err = ( + "message, source_1, source_2 is required for a MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE error" + ) error_init_fail(ms, None, None, None, ValueError(err)) error_init_fail(ms, None, spcsrc("foo"), spcsrc("bar"), ValueError(err)) error_init_fail(ms, "msg", None, spcsrc("bar"), ValueError(err)) error_init_fail(ms, "msg", spcsrc("foo"), None, ValueError(err)) - error_init_fail(ErrorType.OTHER, None, None, None, ValueError( - "message is required for a OTHER error")) + error_init_fail( + ErrorType.OTHER, + None, + None, + None, + ValueError("message is required for a OTHER error"), + ) def error_init_fail( - errortype: O[ErrorType], - message: O[str], - source_1: O[SpecificationSource], - source_2: O[SpecificationSource], - expected: Exception + errortype: Opt[ErrorType], + message: Opt[str], + source_1: Opt[SpecificationSource], + source_2: Opt[SpecificationSource], + expected: Exception, ): with raises(Exception) as got: Error(errortype, message, source_1, source_2) @@ -203,25 +220,30 @@ def test_ParseResult_init_fail(): def parseResult_init_fail( - source: O[SpecificationSource], - result: O[tuple[frozendict[str, PRIMITIVE_TYPE], ...]], - expected: Exception + source: Opt[SpecificationSource], + result: Opt[tuple[frozendict[str, PRIMITIVE_TYPE], ...]], + expected: Exception, ): with raises(Exception) as got: ParseResult(source, result) assert_exception_correct(got.value, expected) -PR_RESULTS = frozendict({"data_type": ParseResult( - spcsrc("some_file", "tab"), - (frozendict({"fasta_file": "foo.fa", "do_thing": 1}),) # make a tuple! 
-)}) +PR_RESULTS = frozendict( + { + "data_type": ParseResult( + spcsrc("some_file", "tab"), + (frozendict({"fasta_file": "foo.fa", "do_thing": 1}),), + ) + } +) # make a tuple! PR_ERROR = ( Error(ErrorType.OTHER, message="foo"), - Error(ErrorType.PARSE_FAIL, message="bar", source_1=spcsrc("some_file", "tab3")) + Error(ErrorType.PARSE_FAIL, message="bar", source_1=spcsrc("some_file", "tab3")), ) + def test_ParseResults_init_w_results_success(): results_copy = frozendict(PR_RESULTS) # prevent identity equality @@ -233,7 +255,7 @@ def test_ParseResults_init_w_results_success(): def test_ParseResults_init_w_error_success(): - errors_copy = tuple(PR_ERROR) # prevent identity equality + errors_copy = tuple(PR_ERROR) # prevent identity equality pr = ParseResults(errors=PR_ERROR) assert pr.results is None @@ -243,24 +265,25 @@ def test_ParseResults_init_w_error_success(): def test_ParseResults_init_fail(): - err = 'Exectly one of results or errors must be supplied' + err = "Exectly one of results or errors must be supplied" parseResults_init_fail(None, None, ValueError(err)) parseResults_init_fail(PR_RESULTS, PR_ERROR, ValueError(err)) def parseResults_init_fail( - results: O[frozendict[str, ParseResult]], - errors: O[tuple[Error, ...]], - expected: Exception + results: Opt[frozendict[str, ParseResult]], + errors: Opt[tuple[Error, ...]], + expected: Exception, ): with raises(Exception) as got: ParseResults(results, errors) assert_exception_correct(got.value, expected) -def _ftr(parser: Callable[[Path], ParseResults] = None, notype: str=None) -> FileTypeResolution: +def _ftr(parser: Callable[[Path], ParseResults] = None, notype: str = None) -> FileTypeResolution: return FileTypeResolution(parser, notype) + def _get_mocks(count: int) -> tuple[Mock, ...]: return (Mock() for _ in range(count)) @@ -270,44 +293,50 @@ def test_parse_import_specifications_success(): resolver.side_effect = [_ftr(parser1), _ftr(parser2)] - parser1.return_value = ParseResults(frozendict( - {"type1": ParseResult( - spcsrc("myfile.xlsx", "tab1"), - (frozendict({"foo": "bar"}), frozendict({"baz": "bat"})) - ), - "type2": ParseResult( - spcsrc("myfile.xlsx", "tab2"), - (frozendict({"whee": "whoo"}),) # tuple! - ) - } - )) - - parser2.return_value = ParseResults(frozendict( - {"type_other": ParseResult( - spcsrc("somefile.csv"), - (frozendict({"foo": "bar2"}), frozendict({"baz": "bat2"})) - ) - } - )) + parser1.return_value = ParseResults( + frozendict( + { + "type1": ParseResult( + spcsrc("myfile.xlsx", "tab1"), + (frozendict({"foo": "bar"}), frozendict({"baz": "bat"})), + ), + "type2": ParseResult( + spcsrc("myfile.xlsx", "tab2"), (frozendict({"whee": "whoo"}),) + ), # tuple! + } + ) + ) - res = parse_import_specifications( - (Path("myfile.xlsx"), Path("somefile.csv")), resolver, logger + parser2.return_value = ParseResults( + frozendict( + { + "type_other": ParseResult( + spcsrc("somefile.csv"), + (frozendict({"foo": "bar2"}), frozendict({"baz": "bat2"})), + ) + } + ) ) - assert res == ParseResults(frozendict({ - "type1": ParseResult( - spcsrc("myfile.xlsx", "tab1"), - (frozendict({"foo": "bar"}), frozendict({"baz": "bat"})) - ), - "type2": ParseResult( - spcsrc("myfile.xlsx", "tab2"), - (frozendict({"whee": "whoo"}),) # tuple! 
- ), - "type_other": ParseResult( - spcsrc("somefile.csv"), - (frozendict({"foo": "bar2"}), frozendict({"baz": "bat2"})) - ) - })) + res = parse_import_specifications((Path("myfile.xlsx"), Path("somefile.csv")), resolver, logger) + + assert res == ParseResults( + frozendict( + { + "type1": ParseResult( + spcsrc("myfile.xlsx", "tab1"), + (frozendict({"foo": "bar"}), frozendict({"baz": "bat"})), + ), + "type2": ParseResult( + spcsrc("myfile.xlsx", "tab2"), (frozendict({"whee": "whoo"}),) + ), # tuple! + "type_other": ParseResult( + spcsrc("somefile.csv"), + (frozendict({"foo": "bar2"}), frozendict({"baz": "bat2"})), + ), + } + ) + ) resolver.assert_has_calls([call(Path("myfile.xlsx")), call(Path("somefile.csv"))]) parser1.assert_called_once_with(Path("myfile.xlsx")) @@ -330,11 +359,9 @@ def test_parse_import_specification_resolver_exception(): resolver.side_effect = [_ftr(parser1), ArithmeticError("crapsticks")] # test that other errors aren't included in the result - parser1.return_value = ParseResults(errors=tuple([Error(ErrorType.OTHER, 'foo')])) + parser1.return_value = ParseResults(errors=tuple([Error(ErrorType.OTHER, "foo")])) - res = parse_import_specifications( - (Path("myfile.xlsx"), Path("somefile.csv")), resolver, logger - ) + res = parse_import_specifications((Path("myfile.xlsx"), Path("somefile.csv")), resolver, logger) assert res == ParseResults(errors=tuple([Error(ErrorType.OTHER, "crapsticks")])) @@ -360,32 +387,43 @@ def test_parse_import_specification_unsupported_type_and_parser_error(): resolver.side_effect = [_ftr(parser1), _ftr(parser2), _ftr(notype="JPEG")] # check that other errors are also returned, and the results are ignored - parser1.return_value = ParseResults(errors=tuple([ - Error(ErrorType.OTHER, 'foo'), - Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc("foo.csv")) - ])) + parser1.return_value = ParseResults( + errors=tuple( + [ + Error(ErrorType.OTHER, "foo"), + Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc("foo.csv")), + ] + ) + ) parser2.return_value = ParseResults( - frozendict({"foo": ParseResult(spcsrc("a"), tuple([frozendict({"a": "b"})]))})) + frozendict({"foo": ParseResult(spcsrc("a"), tuple([frozendict({"a": "b"})]))}) + ) res = parse_import_specifications( (Path("myfile.xlsx"), Path("somefile.csv"), Path("x.jpeg")), resolver, logger ) - assert res == ParseResults(errors=tuple([ - Error(ErrorType.OTHER, "foo"), - Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc("foo.csv")), - Error( - ErrorType.PARSE_FAIL, - "JPEG is not a supported file type for import specifications", - spcsrc(Path("x.jpeg")) + assert res == ParseResults( + errors=tuple( + [ + Error(ErrorType.OTHER, "foo"), + Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc("foo.csv")), + Error( + ErrorType.PARSE_FAIL, + "JPEG is not a supported file type for import specifications", + spcsrc(Path("x.jpeg")), + ), + ] ) - ])) + ) - resolver.assert_has_calls([ - call(Path("myfile.xlsx")), - call(Path("somefile.csv")), - call(Path("x.jpeg")), - ]) + resolver.assert_has_calls( + [ + call(Path("myfile.xlsx")), + call(Path("somefile.csv")), + call(Path("x.jpeg")), + ] + ) parser1.assert_called_once_with(Path("myfile.xlsx")) parser2.assert_called_once_with(Path("somefile.csv")) logger.assert_not_called() @@ -406,50 +444,66 @@ def test_parse_import_specification_multiple_specs_and_parser_error(): resolver.side_effect = [_ftr(parser1), _ftr(parser2), _ftr(parser3)] # check that other errors are also returned, and the results are ignored - parser1.return_value = ParseResults(errors=tuple([ - 
Error(ErrorType.OTHER, "other"), - Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc("myfile.xlsx")) - ])) - parser2.return_value = ParseResults(frozendict( - {"foo": ParseResult(spcsrc("a1"), tuple([frozendict({"a": "b"})])), - "bar": ParseResult(spcsrc("b1"), tuple([frozendict({"a": "b"})])), - "baz": ParseResult(spcsrc("c1"), tuple([frozendict({"a": "b"})])) - }, - )) - parser3.return_value = ParseResults(frozendict( - {"foo2": ParseResult(spcsrc("a2"), tuple([frozendict({"a": "b"})])), - "bar": ParseResult(spcsrc("b2"), tuple([frozendict({"a": "b"})])), - "baz": ParseResult(spcsrc("c2"), tuple([frozendict({"a": "b"})])) - }, - )) + parser1.return_value = ParseResults( + errors=tuple( + [ + Error(ErrorType.OTHER, "other"), + Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc("myfile.xlsx")), + ] + ) + ) + parser2.return_value = ParseResults( + frozendict( + { + "foo": ParseResult(spcsrc("a1"), tuple([frozendict({"a": "b"})])), + "bar": ParseResult(spcsrc("b1"), tuple([frozendict({"a": "b"})])), + "baz": ParseResult(spcsrc("c1"), tuple([frozendict({"a": "b"})])), + }, + ) + ) + parser3.return_value = ParseResults( + frozendict( + { + "foo2": ParseResult(spcsrc("a2"), tuple([frozendict({"a": "b"})])), + "bar": ParseResult(spcsrc("b2"), tuple([frozendict({"a": "b"})])), + "baz": ParseResult(spcsrc("c2"), tuple([frozendict({"a": "b"})])), + }, + ) + ) res = parse_import_specifications( (Path("myfile.xlsx"), Path("somefile.csv"), Path("x.tsv")), resolver, logger ) - assert res == ParseResults(errors=tuple([ - Error(ErrorType.OTHER, "other"), - Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc("myfile.xlsx")), - Error( - ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE, - "Data type bar appears in two importer specification sources", - spcsrc("b1"), - spcsrc("b2") - ), - Error( - ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE, - "Data type baz appears in two importer specification sources", - spcsrc("c1"), - spcsrc("c2") + assert res == ParseResults( + errors=tuple( + [ + Error(ErrorType.OTHER, "other"), + Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc("myfile.xlsx")), + Error( + ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE, + "Data type bar appears in two importer specification sources", + spcsrc("b1"), + spcsrc("b2"), + ), + Error( + ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE, + "Data type baz appears in two importer specification sources", + spcsrc("c1"), + spcsrc("c2"), + ), + ] ) - ])) + ) - resolver.assert_has_calls([ - call(Path("myfile.xlsx")), - call(Path("somefile.csv")), - call(Path("x.tsv")), - ]) + resolver.assert_has_calls( + [ + call(Path("myfile.xlsx")), + call(Path("somefile.csv")), + call(Path("x.tsv")), + ] + ) parser1.assert_called_once_with(Path("myfile.xlsx")) parser2.assert_called_once_with(Path("somefile.csv")) parser3.assert_called_once_with(Path("x.tsv")) - logger.assert_not_called() \ No newline at end of file + logger.assert_not_called() diff --git a/tests/import_specifications/test_file_writers.py b/tests/import_specifications/test_file_writers.py index 7f2cf26f..d0195567 100644 --- a/tests/import_specifications/test_file_writers.py +++ b/tests/import_specifications/test_file_writers.py @@ -1,13 +1,10 @@ import os import uuid - -import openpyxl - from collections.abc import Generator from pathlib import Path -from pytest import raises, fixture -from tests.test_app import FileUtil +import openpyxl +from pytest import raises, fixture from staging_service.import_specifications.file_writers import ( write_csv, @@ -15,7 +12,7 @@ write_excel, ImportSpecWriteException, 
) - +from tests.test_app import FileUtil from tests.test_utils import ( assert_exception_correct, check_file_contents, @@ -43,7 +40,7 @@ def temp_dir() -> Generator[Path, None, None]: "data": [ {"id3": "comma,comma", "id1": "yay!\ttab", "id2": 42}, {"id3": 56.78, "id1": "boo!", "id2": None}, - ] + ], }, "type2": { "order_and_display": [ @@ -52,7 +49,7 @@ def temp_dir() -> Generator[Path, None, None]: "data": [ {"id1": "foo"}, {"id1": 0}, - ] + ], }, "type3": { "order_and_display": [ @@ -60,8 +57,8 @@ def temp_dir() -> Generator[Path, None, None]: ("tab\tid", "xsv is only"), ("comma,id", "a template"), ], - "data": [] - } + "data": [], + }, } @@ -80,7 +77,7 @@ def test_write_csv(temp_dir: Path): '"thing,with,comma",other,thing\twith\ttabs\n', 'yay!\ttab,42,"comma,comma"\n', "boo!,,56.78\n", - ] + ], ) check_file_contents( temp_dir / "type2.csv", @@ -90,7 +87,7 @@ def test_write_csv(temp_dir: Path): "oh no I only have one column\n", "foo\n", "0\n", - ] + ], ) check_file_contents( temp_dir / "type3.csv", @@ -98,7 +95,7 @@ def test_write_csv(temp_dir: Path): "Data type: type3; Columns: 3; Version: 1\n", 'some_id,tab\tid,"comma,id"\n', "hey this,xsv is only,a template\n", - ] + ], ) @@ -117,7 +114,7 @@ def test_write_tsv(temp_dir: Path): 'thing,with,comma\tother\t"thing\twith\ttabs"\n', '"yay!\ttab"\t42\tcomma,comma\n', "boo!\t\t56.78\n", - ] + ], ) check_file_contents( temp_dir / "type2.tsv", @@ -127,7 +124,7 @@ def test_write_tsv(temp_dir: Path): "oh no I only have one column\n", "foo\n", "0\n", - ] + ], ) check_file_contents( temp_dir / "type3.tsv", @@ -135,7 +132,7 @@ def test_write_tsv(temp_dir: Path): "Data type: type3; Columns: 3; Version: 1\n", 'some_id\t"tab\tid"\tcomma,id\n', "hey this\txsv is only\ta template\n", - ] + ], ) @@ -179,7 +176,7 @@ def test_write_excel(temp_dir: Path): "type3", [ ["Data type: type3; Columns: 3; Version: 1", None, None], - ["some_id","tab\tid","comma,id"], + ["some_id", "tab\tid", "comma,id"], ["hey this", "xsv is only", "a template"], ], [8.0, 11.0, 10.0], @@ -193,81 +190,128 @@ def test_file_writers_fail(): file_writers_fail(p, None, E("The types value must be a mapping")) file_writers_fail(p, {}, E("At least one data type must be specified")) file_writers_fail( - p, {None: 1}, E("A data type cannot be a non-string or a whitespace only string")) + p, + {None: 1}, + E("A data type cannot be a non-string or a whitespace only string"), + ) file_writers_fail( p, {" \t ": 1}, - E("A data type cannot be a non-string or a whitespace only string")) + E("A data type cannot be a non-string or a whitespace only string"), + ) file_writers_fail(p, {"t": []}, E("The value for data type t must be a mapping")) file_writers_fail(p, {"t": 1}, E("The value for data type t must be a mapping")) file_writers_fail(p, {"t": {}}, E("Data type t missing order_and_display key")) file_writers_fail( p, {"t": {"order_and_display": {}, "data": []}}, - E("Data type t order_and_display value is not a list")) + E("Data type t order_and_display value is not a list"), + ) file_writers_fail( p, {"t": {"order_and_display": [], "data": []}}, - E("At least one entry is required for order_and_display for type t")) + E("At least one entry is required for order_and_display for type t"), + ) file_writers_fail( p, {"t": {"order_and_display": [["foo", "bar"]]}}, - E("Data type t missing data key")) + E("Data type t missing data key"), + ) file_writers_fail( p, {"t": {"order_and_display": [["foo", "bar"]], "data": "foo"}}, - E("Data type t data value is not a list")) + E("Data type t data value is 
not a list"), + ) file_writers_fail( p, - {"t": {"order_and_display": [["foo", "bar"], "baz"] , "data": []}}, - E("Invalid order_and_display entry for datatype t at index 1 - " - + "the entry is not a list")) + {"t": {"order_and_display": [["foo", "bar"], "baz"], "data": []}}, + E( + "Invalid order_and_display entry for datatype t at index 1 - " + + "the entry is not a list" + ), + ) file_writers_fail( p, - {"t": {"order_and_display": [("foo", "bar"), ["whee", "whoo"], ["baz"]] , "data": []}}, - E("Invalid order_and_display entry for datatype t at index 2 - " - + "expected 2 item list")) + { + "t": { + "order_and_display": [("foo", "bar"), ["whee", "whoo"], ["baz"]], + "data": [], + } + }, + E("Invalid order_and_display entry for datatype t at index 2 - " + "expected 2 item list"), + ) file_writers_fail( p, - {"t": {"order_and_display": [("foo", "bar", "whee"), ["whee", "whoo"]] , "data": []}}, - E("Invalid order_and_display entry for datatype t at index 0 - " - + "expected 2 item list")) + { + "t": { + "order_and_display": [("foo", "bar", "whee"), ["whee", "whoo"]], + "data": [], + } + }, + E("Invalid order_and_display entry for datatype t at index 0 - " + "expected 2 item list"), + ) for parm in [None, " \t ", 1]: file_writers_fail( p, {"t": {"order_and_display": [(parm, "foo"), ["whee", "whoo"]], "data": []}}, - E("Invalid order_and_display entry for datatype t at index 0 - " - + "parameter ID cannot be a non-string or a whitespace only string")) + E( + "Invalid order_and_display entry for datatype t at index 0 - " + + "parameter ID cannot be a non-string or a whitespace only string" + ), + ) file_writers_fail( p, {"t": {"order_and_display": [("bar", "foo"), ["whee", parm]], "data": []}}, - E("Invalid order_and_display entry for datatype t at index 1 - " - + "parameter display name cannot be a non-string or a whitespace only string")) + E( + "Invalid order_and_display entry for datatype t at index 1 - " + + "parameter display name cannot be a non-string or a whitespace only string" + ), + ) file_writers_fail( p, - {"t": {"order_and_display": [("bar", "foo"), ["whee", "whoo"]], "data": ["foo"]}}, - E("Data type t data row 0 is not a mapping")) + { + "t": { + "order_and_display": [("bar", "foo"), ["whee", "whoo"]], + "data": ["foo"], + } + }, + E("Data type t data row 0 is not a mapping"), + ) file_writers_fail( p, {"t": {"order_and_display": [("bar", "foo")], "data": [{"bar": 1}, []]}}, - E("Data type t data row 1 is not a mapping")) + E("Data type t data row 1 is not a mapping"), + ) file_writers_fail( p, - {"t": {"order_and_display": [("foo", "bar"), ["whee", "whoo"]], - "data": [{"foo": 1, "whee": 2}, {"foo": 2}]}}, - E("Data type t data row 1 does not have the same keys as order_and_display")) + { + "t": { + "order_and_display": [("foo", "bar"), ["whee", "whoo"]], + "data": [{"foo": 1, "whee": 2}, {"foo": 2}], + } + }, + E("Data type t data row 1 does not have the same keys as order_and_display"), + ) file_writers_fail( p, - {"ty": {"order_and_display": [("foo", "bar"), ["whee", "whoo"]], - "data": [{"foo": 2, "whee": 3, 5: 4}, {"foo": 1, "whee": 2}]}}, - E("Data type ty data row 0 does not have the same keys as order_and_display")) + { + "ty": { + "order_and_display": [("foo", "bar"), ["whee", "whoo"]], + "data": [{"foo": 2, "whee": 3, 5: 4}, {"foo": 1, "whee": 2}], + } + }, + E("Data type ty data row 0 does not have the same keys as order_and_display"), + ) file_writers_fail( p, - {"ty": {"order_and_display": [("foo", "bar"), ["whee", "whoo"]], - "data": [{"foo": 2, "whee": 3}, 
{"foo": 1, "whee": []}]}}, - E("Data type ty data row 1's value for parameter whee " - + "is not a number or a string")) - + { + "ty": { + "order_and_display": [("foo", "bar"), ["whee", "whoo"]], + "data": [{"foo": 2, "whee": 3}, {"foo": 1, "whee": []}], + } + }, + E("Data type ty data row 1's value for parameter whee " + "is not a number or a string"), + ) def file_writers_fail(path: Path, types: dict, expected: Exception): diff --git a/tests/import_specifications/test_individual_parsers.py b/tests/import_specifications/test_individual_parsers.py index fe73a309..2c4a8a24 100644 --- a/tests/import_specifications/test_individual_parsers.py +++ b/tests/import_specifications/test_individual_parsers.py @@ -1,13 +1,12 @@ import os import uuid - from collections.abc import Callable, Generator +from pathlib import Path + # TODO update to C impl when fixed: https://github.com/Marco-Sulla/python-frozendict/issues/26 from frozendict.core import frozendict -from pathlib import Path from pytest import fixture - from staging_service.import_specifications.individual_parsers import ( parse_csv, parse_tsv, @@ -16,9 +15,8 @@ Error, SpecificationSource, ParseResult, - ParseResults + ParseResults, ) - from tests.test_app import FileUtil _TEST_DATA_DIR = (Path(__file__).parent / "test_data").resolve() @@ -33,45 +31,83 @@ def temp_dir() -> Generator[Path, None, None]: # FileUtil will auto delete after exiting + ########################################## # xSV tests ########################################## def test_xsv_parse_success(temp_dir: Path): - # Tests a special case where if an xSV file is opened by Excel and then resaved, + # Tests a special case where if an xSV file is opened by Excel and then resaved, # Excel will add separators to the end of the 1st header line. This previously caused # the parse to fail. 
- _xsv_parse_success(temp_dir, ',', parse_csv) - _xsv_parse_success(temp_dir, '\t', parse_tsv) + _xsv_parse_success(temp_dir, ",", parse_csv) + _xsv_parse_success(temp_dir, "\t", parse_tsv) def _xsv_parse_success(temp_dir: Path, sep: str, parser: Callable[[Path], ParseResults]): s = sep input_ = temp_dir / str(uuid.uuid4()) with open(input_, "w") as test_file: - test_file.writelines([ - f"Data type: some_type; Columns: 4; Version: 1{s}{s}{s}\n", - f"spec1{s} spec2{s} spec3 {s} spec4\n", # test trimming - f"Spec 1{s} Spec 2{s} Spec 3{s} Spec 4\n", - f"val1 {s} val2 {s} 7 {s} 3.2\n", # test trimming - f"val3 {s} val4{s} 1{s} 8.9\n", - f"val5 {s}{s}{s} 42.42\n", # test missing values w/o whitespace - f"val6 {s} {s} {s} 3.14\n" # test missing values w/ whitespace - ]) + test_file.writelines( + [ + f"Data type: some_type; Columns: 4; Version: 1{s}{s}{s}\n", + f"spec1{s} spec2{s} spec3 {s} spec4\n", # test trimming + f"Spec 1{s} Spec 2{s} Spec 3{s} Spec 4\n", + f"val1 {s} val2 {s} 7 {s} 3.2\n", # test trimming + f"val3 {s} val4{s} 1{s} 8.9\n", + f"val5 {s}{s}{s} 42.42\n", # test missing values w/o whitespace + f"val6 {s} {s} {s} 3.14\n", # test missing values w/ whitespace + ] + ) res = parser(input_) - assert res == ParseResults(frozendict( - {"some_type": ParseResult(SpecificationSource(input_), - tuple([ - frozendict({"spec1": "val1", "spec2": "val2", "spec3": 7, "spec4": 3.2}), - frozendict({"spec1": "val3", "spec2": "val4", "spec3": 1, "spec4": 8.9}), - frozendict({"spec1": "val5", "spec2": None, "spec3": None, "spec4": 42.42}), - frozendict({"spec1": "val6", "spec2": None, "spec3": None, "spec4": 3.14}), - ]) - )} - )) + assert res == ParseResults( + frozendict( + { + "some_type": ParseResult( + SpecificationSource(input_), + tuple( + [ + frozendict( + { + "spec1": "val1", + "spec2": "val2", + "spec3": 7, + "spec4": 3.2, + } + ), + frozendict( + { + "spec1": "val3", + "spec2": "val4", + "spec3": 1, + "spec4": 8.9, + } + ), + frozendict( + { + "spec1": "val5", + "spec2": None, + "spec3": None, + "spec4": 42.42, + } + ), + frozendict( + { + "spec1": "val6", + "spec2": None, + "spec3": None, + "spec4": 3.14, + } + ), + ] + ), + ) + } + ) + ) def test_xsv_parse_success_nan_inf(temp_dir: Path): @@ -79,34 +115,64 @@ def test_xsv_parse_success_nan_inf(temp_dir: Path): # they will be returned from the service as barewords in JSON and will cause errors in # some parsers as the JSON spec does not support NaN and Inf (which it should but...) 
# See https://kbase-jira.atlassian.net/browse/PTV-1866 - _xsv_parse_success_nan_inf(temp_dir, ',', parse_csv) - _xsv_parse_success_nan_inf(temp_dir, '\t', parse_tsv) + _xsv_parse_success_nan_inf(temp_dir, ",", parse_csv) + _xsv_parse_success_nan_inf(temp_dir, "\t", parse_tsv) def _xsv_parse_success_nan_inf(temp_dir: Path, sep: str, parser: Callable[[Path], ParseResults]): s = sep input_ = temp_dir / str(uuid.uuid4()) with open(input_, "w") as test_file: - test_file.writelines([ - f"Data type: some_nan_type; Columns: 4; Version: 1{s}{s}{s}\n", - f"spec1{s} -Inf{s} nan {s} inf\n", - f"Spec 1{s} inf{s} Spec 3{s} -inf\n", - f"inf {s} val2 {s} NaN {s} 3.2\n", - f"Inf {s} val4{s} -inf{s} 8.9\n", - f"val5 {s}-Inf{s}{s} nan\n", - ]) + test_file.writelines( + [ + f"Data type: some_nan_type; Columns: 4; Version: 1{s}{s}{s}\n", + f"spec1{s} -Inf{s} nan {s} inf\n", + f"Spec 1{s} inf{s} Spec 3{s} -inf\n", + f"inf {s} val2 {s} NaN {s} 3.2\n", + f"Inf {s} val4{s} -inf{s} 8.9\n", + f"val5 {s}-Inf{s}{s} nan\n", + ] + ) res = parser(input_) - assert res == ParseResults(frozendict( - {"some_nan_type": ParseResult(SpecificationSource(input_), - tuple([ - frozendict({"spec1": "inf", "-Inf": "val2", "nan": "NaN", "inf": 3.2}), - frozendict({"spec1": "Inf", "-Inf": "val4", "nan": "-inf", "inf": 8.9}), - frozendict({"spec1": "val5", "-Inf": "-Inf", "nan": None, "inf": "nan"}), - ]) - )} - )) + assert res == ParseResults( + frozendict( + { + "some_nan_type": ParseResult( + SpecificationSource(input_), + tuple( + [ + frozendict( + { + "spec1": "inf", + "-Inf": "val2", + "nan": "NaN", + "inf": 3.2, + } + ), + frozendict( + { + "spec1": "Inf", + "-Inf": "val4", + "nan": "-inf", + "inf": 8.9, + } + ), + frozendict( + { + "spec1": "val5", + "-Inf": "-Inf", + "nan": None, + "inf": "nan", + } + ), + ] + ), + ) + } + ) + ) def test_xsv_parse_success_with_numeric_headers(temp_dir: Path): @@ -114,8 +180,8 @@ def test_xsv_parse_success_with_numeric_headers(temp_dir: Path): Not a use case we expect but good to check numeric headers don't cause an unexpected error. 
""" - _xsv_parse_success_with_numeric_headers(temp_dir, ',', parse_csv) - _xsv_parse_success_with_numeric_headers(temp_dir, '\t', parse_tsv) + _xsv_parse_success_with_numeric_headers(temp_dir, ",", parse_csv) + _xsv_parse_success_with_numeric_headers(temp_dir, "\t", parse_tsv) def _xsv_parse_success_with_numeric_headers( @@ -124,21 +190,31 @@ def _xsv_parse_success_with_numeric_headers( s = sep input_ = temp_dir / str(uuid.uuid4()) with open(input_, "w") as test_file: - test_file.writelines([ - "Data type: some_type; Columns: 4; Version: 1\n", - f"1{s} 2.0{s} 3{s} 4.1\n", # test trimming - f"Spec 1{s} Spec 2{s} Spec 3{s} Spec 4\n", - f"val3 {s} val4{s} 1{s} 8.9\n", - ]) + test_file.writelines( + [ + "Data type: some_type; Columns: 4; Version: 1\n", + f"1{s} 2.0{s} 3{s} 4.1\n", # test trimming + f"Spec 1{s} Spec 2{s} Spec 3{s} Spec 4\n", + f"val3 {s} val4{s} 1{s} 8.9\n", + ] + ) res = parser(input_) - assert res == ParseResults(frozendict( - {"some_type": ParseResult(SpecificationSource(input_), - tuple([frozendict({"1": "val3", "2.0": "val4", "3": 1, "4.1": 8.9}), - ]) - )} - )) + assert res == ParseResults( + frozendict( + { + "some_type": ParseResult( + SpecificationSource(input_), + tuple( + [ + frozendict({"1": "val3", "2.0": "val4", "3": 1, "4.1": 8.9}), + ] + ), + ) + } + ) + ) def test_xsv_parse_success_with_internal_and_trailing_empty_lines(temp_dir: Path): @@ -146,8 +222,8 @@ def test_xsv_parse_success_with_internal_and_trailing_empty_lines(temp_dir: Path Test that leaving one or more empty lines in a csv/tsv file does not cause the parse to fail. This is easy to do accidentally and so will annoy users. """ - _xsv_parse_success_with_internal_and_trailing_empty_lines(temp_dir, ',', parse_csv) - _xsv_parse_success_with_internal_and_trailing_empty_lines(temp_dir, '\t', parse_tsv) + _xsv_parse_success_with_internal_and_trailing_empty_lines(temp_dir, ",", parse_csv) + _xsv_parse_success_with_internal_and_trailing_empty_lines(temp_dir, "\t", parse_tsv) def _xsv_parse_success_with_internal_and_trailing_empty_lines( @@ -156,46 +232,71 @@ def _xsv_parse_success_with_internal_and_trailing_empty_lines( s = sep input_ = temp_dir / str(uuid.uuid4()) with open(input_, "w") as test_file: - test_file.writelines([ - "Data type: other_type; Columns: 4; Version: 1\n", - f"spec1{s} spec2{s} spec3{s} spec4\n", - f"Spec 1{s} Spec 2{s} Spec 3{s} Spec 4\n", - f"val3 {s} val4{s} 1{s} 8.9\n", - "\n", - f"val1 {s} val2{s} 7 {s} 3.2\n", - "\n", - "\n", - "\n", - ]) + test_file.writelines( + [ + "Data type: other_type; Columns: 4; Version: 1\n", + f"spec1{s} spec2{s} spec3{s} spec4\n", + f"Spec 1{s} Spec 2{s} Spec 3{s} Spec 4\n", + f"val3 {s} val4{s} 1{s} 8.9\n", + "\n", + f"val1 {s} val2{s} 7 {s} 3.2\n", + "\n", + "\n", + "\n", + ] + ) res = parser(input_) - assert res == ParseResults(frozendict( - {"other_type": ParseResult(SpecificationSource(input_), - tuple([ - frozendict({"spec1": "val3", "spec2": "val4", "spec3": 1, "spec4": 8.9}), - frozendict({"spec1": "val1", "spec2": "val2", "spec3": 7, "spec4": 3.2}), - ]) - )} - )) + assert res == ParseResults( + frozendict( + { + "other_type": ParseResult( + SpecificationSource(input_), + tuple( + [ + frozendict( + { + "spec1": "val3", + "spec2": "val4", + "spec3": 1, + "spec4": 8.9, + } + ), + frozendict( + { + "spec1": "val1", + "spec2": "val2", + "spec3": 7, + "spec4": 3.2, + } + ), + ] + ), + ) + } + ) + ) def test_xsv_parse_fail_no_file(temp_dir: Path): input_ = temp_dir / str(uuid.uuid4()) with open(input_, "w") as test_file: - test_file.writelines([ - 
"Data type: other_type; Columns: 4; Version: 1\n", - "spec1, spec2, spec3, spec4\n", - "Spec 1, Spec 2, Spec 3, Spec 4\n", - "val1 , val2, 7 , 3.2\n", - ]) + test_file.writelines( + [ + "Data type: other_type; Columns: 4; Version: 1\n", + "spec1, spec2, spec3, spec4\n", + "Spec 1, Spec 2, Spec 3, Spec 4\n", + "val1 , val2, 7 , 3.2\n", + ] + ) input_ = Path(str(input_)[-1]) res = parse_csv(input_) - assert res == ParseResults(errors=tuple([ - Error(ErrorType.FILE_NOT_FOUND, source_1=SpecificationSource(input_)) - ])) + assert res == ParseResults( + errors=tuple([Error(ErrorType.FILE_NOT_FOUND, source_1=SpecificationSource(input_))]) + ) def test_xsv_parse_fail_binary_file(temp_dir: Path): @@ -203,12 +304,17 @@ def test_xsv_parse_fail_binary_file(temp_dir: Path): res = parse_csv(test_file) - assert res == ParseResults(errors=tuple([ - Error( - ErrorType.PARSE_FAIL, - "Not a text file: application/vnd.ms-excel", - source_1=SpecificationSource(test_file)) - ])) + assert res == ParseResults( + errors=tuple( + [ + Error( + ErrorType.PARSE_FAIL, + "Not a text file: application/vnd.ms-excel", + source_1=SpecificationSource(test_file), + ) + ] + ) + ) def test_xsv_parse_fail_directory(temp_dir: Path): @@ -217,9 +323,17 @@ def test_xsv_parse_fail_directory(temp_dir: Path): res = parse_tsv(test_file) - assert res == ParseResults(errors=tuple([Error( - ErrorType.PARSE_FAIL, "The given path is a directory", SpecificationSource(test_file) - )])) + assert res == ParseResults( + errors=tuple( + [ + Error( + ErrorType.PARSE_FAIL, + "The given path is a directory", + SpecificationSource(test_file), + ) + ] + ) + ) def _xsv_parse_fail( @@ -244,8 +358,10 @@ def test_xsv_parse_fail_empty_file(temp_dir: Path): def test_xsv_parse_fail_bad_datatype_header(temp_dir: Path): - err = ('Invalid header; got "This is the wrong header", expected "Data type: ' - + '; Columns: ; Version: "') + err = ( + 'Invalid header; got "This is the wrong header", expected "Data type: ' + + '; Columns: ; Version: "' + ) _xsv_parse_fail(temp_dir, ["This is the wrong header"], parse_csv, err) @@ -275,8 +391,7 @@ def test_xsv_parse_fail_missing_column_header_entries(temp_dir: Path): def test_xsv_parse_fail_missing_data(temp_dir: Path): err = "No non-header data in file" lines = [ - "Data type: foo; Columns: 3; Version: 1\n" - "head1, head2, head3\n", + "Data type: foo; Columns: 3; Version: 1\n" "head1, head2, head3\n", "Head 1, Head 2, Head 3\n", ] _xsv_parse_fail(temp_dir, lines, parse_csv, err) @@ -285,40 +400,35 @@ def test_xsv_parse_fail_missing_data(temp_dir: Path): def test_xsv_parse_fail_unequal_rows(temp_dir: Path): err = "Incorrect number of items in line 3, expected 3, got 2" lines = [ - "Data type: foo; Columns: 3; Version: 1\n" - "head1, head2, head3\n", + "Data type: foo; Columns: 3; Version: 1\n" "head1, head2, head3\n", "Head 1, Head 2\n", ] _xsv_parse_fail(temp_dir, lines, parse_csv, err, ErrorType.INCORRECT_COLUMN_COUNT) err = "Incorrect number of items in line 3, expected 2, got 3" lines = [ - "Data type: foo; Columns: 2; Version: 1\n" - "head1\thead2\n", + "Data type: foo; Columns: 2; Version: 1\n" "head1\thead2\n", "Head 1\tHead 2\tHead 3\n", ] _xsv_parse_fail(temp_dir, lines, parse_tsv, err, ErrorType.INCORRECT_COLUMN_COUNT) err = "Incorrect number of items in line 2, expected 2, got 3" lines = [ - "Data type: foo; Columns: 2; Version: 1\n" - "head1, head2, head3\n", + "Data type: foo; Columns: 2; Version: 1\n" "head1, head2, head3\n", "Head 1, Head 2\n", ] _xsv_parse_fail(temp_dir, lines, parse_csv, err, 
ErrorType.INCORRECT_COLUMN_COUNT) err = "Incorrect number of items in line 2, expected 3, got 2" lines = [ - "Data type: foo; Columns: 3; Version: 1\n" - "head1\thead2\n", + "Data type: foo; Columns: 3; Version: 1\n" "head1\thead2\n", "Head 1\tHead 2\tHead 3\n", ] _xsv_parse_fail(temp_dir, lines, parse_tsv, err, ErrorType.INCORRECT_COLUMN_COUNT) err = "Incorrect number of items in line 5, expected 3, got 4" lines = [ - "Data type: foo; Columns: 3; Version: 1\n" - "head1, head2, head 3\n", + "Data type: foo; Columns: 3; Version: 1\n" "head1, head2, head 3\n", "Head 1, Head 2, Head 3\n", "1, 2, 3\n", "1, 2, 3, 4\n", @@ -328,8 +438,7 @@ def test_xsv_parse_fail_unequal_rows(temp_dir: Path): err = "Incorrect number of items in line 6, expected 3, got 2" lines = [ - "Data type: foo; Columns: 3; Version: 1\n" - "head1\thead2\thead 3\n", + "Data type: foo; Columns: 3; Version: 1\n" "head1\thead2\thead 3\n", "Head 1\tHead 2\tHead 3\n", "1\t2\t3\n", "1\t2\t3\n", @@ -342,16 +451,14 @@ def test_xsv_parse_fail_unequal_rows(temp_dir: Path): def test_xsv_parse_fail_duplicate_headers(temp_dir: Path): err = "Duplicate header name in row 2: head3" lines = [ - "Data type: foo; Columns: 3; Version: 1\n" - "head3, head2, head3\n", + "Data type: foo; Columns: 3; Version: 1\n" "head3, head2, head3\n", "Head 1, Head 2, Head 3\n", ] _xsv_parse_fail(temp_dir, lines, parse_csv, err) # test with duplicate dual headers lines = [ - "Data type: foo; Columns: 3; Version: 1\n" - "head3, head2, head3\n", + "Data type: foo; Columns: 3; Version: 1\n" "head3, head2, head3\n", "Head 3, Head 2, Head 3\n", ] _xsv_parse_fail(temp_dir, lines, parse_csv, err) @@ -389,20 +496,29 @@ def test_excel_parse_success(): res = parse_excel(ex) - assert res == ParseResults(frozendict({ - "type1": ParseResult(SpecificationSource(ex, "tab1"), ( - frozendict({"header1": "foo", "header2": 1, "header3": 6.7}), - frozendict({"header1": "bar", "header2": 2, "header3": 8.9}), - frozendict({"header1": "baz", "header2": None, "header3": 3.4}), - frozendict({"header1": "bat", "header2": 4, "header3": None}), - )), - "type2": ParseResult(SpecificationSource(ex, "tab2"), ( - frozendict({"h1": "golly gee", "2": 42, "h3": "super"}), - )), - "type3": ParseResult(SpecificationSource(ex, "tab3"), ( - frozendict({"head1": "some data", "head2": 1}), - )), - })) + assert res == ParseResults( + frozendict( + { + "type1": ParseResult( + SpecificationSource(ex, "tab1"), + ( + frozendict({"header1": "foo", "header2": 1, "header3": 6.7}), + frozendict({"header1": "bar", "header2": 2, "header3": 8.9}), + frozendict({"header1": "baz", "header2": None, "header3": 3.4}), + frozendict({"header1": "bat", "header2": 4, "header3": None}), + ), + ), + "type2": ParseResult( + SpecificationSource(ex, "tab2"), + (frozendict({"h1": "golly gee", "2": 42, "h3": "super"}),), + ), + "type3": ParseResult( + SpecificationSource(ex, "tab3"), + (frozendict({"head1": "some data", "head2": 1}),), + ), + } + ) + ) def test_excel_parse_success_nan_inf(): @@ -417,29 +533,36 @@ def test_excel_parse_success_nan_inf(): res = parse_excel(ex) - assert res == ParseResults(frozendict({ - "nan_type": ParseResult(SpecificationSource(ex, "tab1"), ( - frozendict({"header1": 1, "header2": None}), - frozendict({"header1": 2, "header2": None}), - frozendict({"header1": 3, "header2": None}), - frozendict({"header1": 4, "header2": "-1.#IND"}), - frozendict({"header1": 5, "header2": "-1.#QNAN"}), - frozendict({"header1": 6, "header2": "-NaN"}), - frozendict({"header1": 7, "header2": "-nan"}), - 
frozendict({"header1": 8, "header2": "1.#IND"}), - frozendict({"header1": 9, "header2": "1.#QNAN"}), - frozendict({"header1": 10, "header2": None}), - frozendict({"header1": 11, "header2": None}), - frozendict({"header1": 12, "header2": None}), - frozendict({"header1": 13, "header2": None}), - frozendict({"header1": 14, "header2": "NaN"}), - frozendict({"header1": 15, "header2": None}), - frozendict({"header1": 16, "header2": "nan"}), - frozendict({"header1": 17, "header2": None}), - frozendict({"header1": 18, "header2": None}), - frozendict({"header1": 19, "header2": "some stuff"}), - )), - })) + assert res == ParseResults( + frozendict( + { + "nan_type": ParseResult( + SpecificationSource(ex, "tab1"), + ( + frozendict({"header1": 1, "header2": None}), + frozendict({"header1": 2, "header2": None}), + frozendict({"header1": 3, "header2": None}), + frozendict({"header1": 4, "header2": "-1.#IND"}), + frozendict({"header1": 5, "header2": "-1.#QNAN"}), + frozendict({"header1": 6, "header2": "-NaN"}), + frozendict({"header1": 7, "header2": "-nan"}), + frozendict({"header1": 8, "header2": "1.#IND"}), + frozendict({"header1": 9, "header2": "1.#QNAN"}), + frozendict({"header1": 10, "header2": None}), + frozendict({"header1": 11, "header2": None}), + frozendict({"header1": 12, "header2": None}), + frozendict({"header1": 13, "header2": None}), + frozendict({"header1": 14, "header2": "NaN"}), + frozendict({"header1": 15, "header2": None}), + frozendict({"header1": 16, "header2": "nan"}), + frozendict({"header1": 17, "header2": None}), + frozendict({"header1": 18, "header2": None}), + frozendict({"header1": 19, "header2": "some stuff"}), + ), + ), + } + ) + ) def _excel_parse_fail( @@ -458,16 +581,22 @@ def _excel_parse_fail( if errors: assert res == ParseResults(errors=tuple(errors)) else: - assert res == ParseResults(errors=tuple([ - Error(ErrorType.PARSE_FAIL, message, source_1=SpecificationSource(test_file)) - ])) + assert res == ParseResults( + errors=tuple( + [ + Error( + ErrorType.PARSE_FAIL, + message, + source_1=SpecificationSource(test_file), + ) + ] + ) + ) def test_excel_parse_fail_no_file(): f = _get_test_file("testtabs3full2nodata1empty0.xls") - _excel_parse_fail(f, errors=[ - Error(ErrorType.FILE_NOT_FOUND, source_1=SpecificationSource(f)) - ]) + _excel_parse_fail(f, errors=[Error(ErrorType.FILE_NOT_FOUND, source_1=SpecificationSource(f))]) def test_excel_parse_fail_directory(temp_dir): @@ -484,13 +613,17 @@ def test_excel_parse_fail_empty_file(temp_dir: Path): def test_excel_parse_fail_non_excel_file(temp_dir: Path): lines = [ - "Data type: foo; Version: 1\n" - "head1, head2, head 3\n", + "Data type: foo; Version: 1\n" "head1, head2, head 3\n", "Head 1, Head 2, Head 3\n", "1, 2, 3\n", ] _xsv_parse_fail( - temp_dir, lines, parse_excel, "Not a supported Excel file type", extension=".xlsx") + temp_dir, + lines, + parse_excel, + "Not a supported Excel file type", + extension=".xlsx", + ) def test_excel_parse_1emptytab(): @@ -499,13 +632,18 @@ def test_excel_parse_1emptytab(): def test_excel_parse_fail_bad_datatype_header(): f = _get_test_file("testbadinitialheader.xls") - err1 = ('Invalid header; got "This header is wack, yo", expected "Data type: ' - + '; Columns: ; Version: "') + err1 = ( + 'Invalid header; got "This header is wack, yo", expected "Data type: ' + + '; Columns: ; Version: "' + ) err2 = "Schema version 2 is larger than maximum processable version 1" - _excel_parse_fail(f, errors=[ - Error(ErrorType.PARSE_FAIL, err1, SpecificationSource(f, "badheader1")), - 
Error(ErrorType.PARSE_FAIL, err2, SpecificationSource(f, "badheader2")), - ]) + _excel_parse_fail( + f, + errors=[ + Error(ErrorType.PARSE_FAIL, err1, SpecificationSource(f, "badheader1")), + Error(ErrorType.PARSE_FAIL, err2, SpecificationSource(f, "badheader2")), + ], + ) def test_excel_parse_fail_headers_only(): @@ -513,34 +651,76 @@ def test_excel_parse_fail_headers_only(): _excel_parse_fail(f, "No non-header data in file") +def format_message(t): + return f"Found datatype {t} in multiple tabs" + + def test_excel_parse_fail_colliding_datatypes(): f = _get_test_file("testdatatypecollisions.xls") - l = lambda t: f"Found datatype {t} in multiple tabs" + msg = format_message + err = ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE - _excel_parse_fail(f, errors=[ - Error(err, l("type2"), SpecificationSource(f, "dt2"), SpecificationSource(f, "dt2_2")), - Error(err, l("type3"), SpecificationSource(f, "dt3"), SpecificationSource(f, "dt3_2")), - Error(err, l("type2"), SpecificationSource(f, "dt2"), SpecificationSource(f, "dt2_3")), - ]) + _excel_parse_fail( + f, + errors=[ + Error( + err, + msg("type2"), + SpecificationSource(f, "dt2"), + SpecificationSource(f, "dt2_2"), + ), + Error( + err, + msg("type3"), + SpecificationSource(f, "dt3"), + SpecificationSource(f, "dt3_2"), + ), + Error( + err, + msg("type2"), + SpecificationSource(f, "dt2"), + SpecificationSource(f, "dt2_3"), + ), + ], + ) + + +def duplicate_header_message(h): + return f"Duplicate header name in row 2: {h}" def test_excel_parse_fail_duplicate_headers(): f = _get_test_file("testduplicateheaders.xlsx") - l = lambda h: f"Duplicate header name in row 2: {h}" - _excel_parse_fail(f, errors=[ - Error(ErrorType.PARSE_FAIL, l("head1"), SpecificationSource(f, "dt2")), - Error(ErrorType.PARSE_FAIL, l("head2"), SpecificationSource(f, "dt3")), - ]) + msg = duplicate_header_message + _excel_parse_fail( + f, + errors=[ + Error(ErrorType.PARSE_FAIL, msg("head1"), SpecificationSource(f, "dt2")), + Error(ErrorType.PARSE_FAIL, msg("head2"), SpecificationSource(f, "dt3")), + ], + ) def test_excel_parse_fail_missing_header_item(): f = _get_test_file("testmissingheaderitem.xlsx") err1 = "Missing header entry in row 2, position 3" err2 = "Missing header entry in row 2, position 2" - _excel_parse_fail(f, errors=[ - Error(ErrorType.PARSE_FAIL, err1, SpecificationSource(f, "missing header item error")), - Error(ErrorType.PARSE_FAIL, err2, SpecificationSource(f, "whitespace header item")), - ]) + _excel_parse_fail( + f, + errors=[ + Error( + ErrorType.PARSE_FAIL, + err1, + SpecificationSource(f, "missing header item error"), + ), + Error( + ErrorType.PARSE_FAIL, + err2, + SpecificationSource(f, "whitespace header item"), + ), + ], + ) + def test_excel_parse_fail_unequal_rows(): """ @@ -550,20 +730,23 @@ def test_excel_parse_fail_unequal_rows(): worry about off by one errors when filling in separator characters. 
""" f = _get_test_file("testunequalrows.xlsx") - _excel_parse_fail(f, errors=[ - Error( - ErrorType.INCORRECT_COLUMN_COUNT, - "Incorrect number of items in line 3, expected 2, got 3", - SpecificationSource(f, "2 cols, 3 human readable") - ), - Error( - ErrorType.INCORRECT_COLUMN_COUNT, - "Incorrect number of items in line 2, expected 2, got 3", - SpecificationSource(f, "2 cols, 3 spec IDs") - ), - Error( - ErrorType.INCORRECT_COLUMN_COUNT, - "Incorrect number of items in line 5, expected 3, got 4", - SpecificationSource(f, "3 cols, 4 data") - ), - ]) + _excel_parse_fail( + f, + errors=[ + Error( + ErrorType.INCORRECT_COLUMN_COUNT, + "Incorrect number of items in line 3, expected 2, got 3", + SpecificationSource(f, "2 cols, 3 human readable"), + ), + Error( + ErrorType.INCORRECT_COLUMN_COUNT, + "Incorrect number of items in line 2, expected 2, got 3", + SpecificationSource(f, "2 cols, 3 spec IDs"), + ), + Error( + ErrorType.INCORRECT_COLUMN_COUNT, + "Incorrect number of items in line 5, expected 3, got 4", + SpecificationSource(f, "3 cols, 4 data"), + ), + ], + ) diff --git a/tests/pyproject.toml b/tests/pyproject.toml index 8bc18712..7ac2121e 100644 --- a/tests/pyproject.toml +++ b/tests/pyproject.toml @@ -3,4 +3,4 @@ minversion = "6.0" addopts = "-ra -q" testpaths = [ "tests", -] \ No newline at end of file +] diff --git a/tests/test.env.example b/tests/test.env.example index 37b78383..b9b678d9 100644 --- a/tests/test.env.example +++ b/tests/test.env.example @@ -1,3 +1,3 @@ export KB_DEPLOYMENT_CONFIG_DIR=/Users///staging_service/deployment/conf export KB_DEPLOYMENT_CONFIG=${KB_DEPLOYMENT_CONFIG_DIR}/testing.cfg -export FILE_EXTENSION_MAPPINGS=${KB_DEPLOYMENT_CONFIG_DIR}/supported_apps_w_extensions.json \ No newline at end of file +export FILE_EXTENSION_MAPPINGS=${KB_DEPLOYMENT_CONFIG_DIR}/supported_apps_w_extensions.json diff --git a/tests/test_app.py b/tests/test_app.py index eabec07e..de147af5 100644 --- a/tests/test_app.py +++ b/tests/test_app.py @@ -1,19 +1,18 @@ import asyncio import configparser import hashlib -import openpyxl import os -import pandas import shutil import string import time +from io import BytesIO from json import JSONDecoder from pathlib import Path -from urllib.parse import urlencode,unquote -from io import BytesIO from typing import Any +from urllib.parse import urlencode, unquote - +import openpyxl +import pandas from aiohttp import test_utils, FormData from hypothesis import given, settings from hypothesis import strategies as st @@ -22,7 +21,6 @@ import staging_service.globus as globus import staging_service.utils as utils from staging_service.AutoDetectUtils import AutoDetectUtils - from tests.test_utils import check_file_contents, check_excel_contents if os.environ.get("KB_DEPLOYMENT_CONFIG") is None: @@ -73,9 +71,7 @@ async def mock_auth(*args, **kwargs): async def mock_globus_id(*args, **kwargs): return ["testuser@globusid.org"] - globus._get_globus_ids = ( - mock_globus_id # TODO this doesn't allow testing of this fn does it - ) + globus._get_globus_ids = mock_globus_id # TODO this doesn't allow testing of this fn does it return application @@ -137,22 +133,11 @@ def remove_dir(self, path): @given(username_first_strat, username_strat) def test_path_cases(username_first, username_rest): username = username_first + username_rest - assert ( - username + "/foo/bar" == utils.Path.validate_path(username, "foo/bar").user_path - ) - assert ( - username + "/baz" - == utils.Path.validate_path(username, "foo/../bar/../baz").user_path - ) - assert ( - username + 
"/bar" - == utils.Path.validate_path(username, "foo/../../../../bar").user_path - ) + assert username + "/foo/bar" == utils.Path.validate_path(username, "foo/bar").user_path + assert username + "/baz" == utils.Path.validate_path(username, "foo/../bar/../baz").user_path + assert username + "/bar" == utils.Path.validate_path(username, "foo/../../../../bar").user_path assert username + "/foo" == utils.Path.validate_path(username, "./foo").user_path - assert ( - username + "/foo/bar" - == utils.Path.validate_path(username, "../foo/bar").user_path - ) + assert username + "/foo/bar" == utils.Path.validate_path(username, "../foo/bar").user_path assert username + "/foo" == utils.Path.validate_path(username, "/../foo").user_path assert username + "/" == utils.Path.validate_path(username, "/foo/..").user_path assert username + "/foo" == utils.Path.validate_path(username, "/foo/.").user_path @@ -164,10 +149,7 @@ def test_path_cases(username_first, username_rest): assert username + "/" == utils.Path.validate_path(username, "").user_path assert username + "/" == utils.Path.validate_path(username, "foo/..").user_path assert username + "/" == utils.Path.validate_path(username, "/..../").user_path - assert ( - username + "/stuff.ext" - == utils.Path.validate_path(username, "/stuff.ext").user_path - ) + assert username + "/stuff.ext" == utils.Path.validate_path(username, "/stuff.ext").user_path @given(username_first_strat, username_strat, st.text()) @@ -222,11 +204,9 @@ async def test_jbi_metadata(): async with AppClient(config, username) as cli: with FileUtil() as fs: - d = fs.make_dir(os.path.join(username, "test")) - f = fs.make_file(os.path.join(username, "test", "test_jgi.fastq"), txt) - f_jgi = fs.make_file( - os.path.join(username, "test", ".test_jgi.fastq.jgi"), jbi_metadata - ) + fs.make_dir(os.path.join(username, "test")) + fs.make_file(os.path.join(username, "test", "test_jgi.fastq"), txt) + fs.make_file(os.path.join(username, "test", ".test_jgi.fastq.jgi"), jbi_metadata) res1 = await cli.get( os.path.join("jgi-metadata", "test", "test_jgi.fastq"), headers={"Authorization": ""}, @@ -252,8 +232,8 @@ async def test_metadata(): username = "testuser" async with AppClient(config, username) as cli: with FileUtil() as fs: - d = fs.make_dir(os.path.join(username, "test")) - f = fs.make_file(os.path.join(username, "test", "test_file_1"), txt) + fs.make_dir(os.path.join(username, "test")) + fs.make_file(os.path.join(username, "test", "test_file_1"), txt) res1 = await cli.get( os.path.join("metadata", "test", "test_file_1"), headers={"Authorization": ""}, @@ -352,8 +332,8 @@ async def test_define_UPA(): username = "testuser" async with AppClient(config, username) as cli: with FileUtil() as fs: - d = fs.make_dir(os.path.join(username, "test")) - f = fs.make_file(os.path.join(username, "test", "test_file_1"), txt) + fs.make_dir(os.path.join(username, "test")) + fs.make_file(os.path.join(username, "test", "test_file_1"), txt) # generating metadata file res1 = await cli.get( os.path.join("metadata", "test", "test_file_1"), @@ -433,13 +413,11 @@ async def test_mv(): username = "testuser" async with AppClient(config, username) as cli: with FileUtil() as fs: - d = fs.make_dir(os.path.join(username, "test")) - f = fs.make_file(os.path.join(username, "test", "test_file_1"), txt) + fs.make_dir(os.path.join(username, "test")) + fs.make_file(os.path.join(username, "test", "test_file_1"), txt) # list current test directory - res1 = await cli.get( - os.path.join("list", "test"), headers={"Authorization": ""} - ) + res1 
= await cli.get(os.path.join("list", "test"), headers={"Authorization": ""}) assert res1.status == 200 json_text = await res1.text() json = decoder.decode(json_text) @@ -456,9 +434,7 @@ async def test_mv(): assert "successfully moved" in json_text # relist test directory - res3 = await cli.get( - os.path.join("list", "test"), headers={"Authorization": ""} - ) + res3 = await cli.get(os.path.join("list", "test"), headers={"Authorization": ""}) assert res3.status == 200 json_text = await res3.text() json = decoder.decode(json_text) @@ -507,13 +483,11 @@ async def test_delete(): username = "testuser" async with AppClient(config, username) as cli: with FileUtil() as fs: - d = fs.make_dir(os.path.join(username, "test")) - f = fs.make_file(os.path.join(username, "test", "test_file_1"), txt) + fs.make_dir(os.path.join(username, "test")) + fs.make_file(os.path.join(username, "test", "test_file_1"), txt) # list current test directory - res1 = await cli.get( - os.path.join("list", "test"), headers={"Authorization": ""} - ) + res1 = await cli.get(os.path.join("list", "test"), headers={"Authorization": ""}) assert res1.status == 200 json_text = await res1.text() json = decoder.decode(json_text) @@ -529,9 +503,7 @@ async def test_delete(): assert "successfully deleted" in json_text # relist test directory - res3 = await cli.get( - os.path.join("list", "test"), headers={"Authorization": ""} - ) + res3 = await cli.get(os.path.join("list", "test"), headers={"Authorization": ""}) assert res3.status == 200 json_text = await res3.text() json = decoder.decode(json_text) @@ -555,12 +527,10 @@ async def test_list(): username = "testuser" async with AppClient(config, username) as cli: with FileUtil() as fs: - d = fs.make_dir(os.path.join(username, "test")) - f = fs.make_file(os.path.join(username, "test", "test_file_1"), txt) - d2 = fs.make_dir(os.path.join(username, "test", "test_sub_dir")) - f3 = fs.make_file( - os.path.join(username, "test", "test_sub_dir", "test_file_2"), txt - ) + fs.make_dir(os.path.join(username, "test")) + fs.make_file(os.path.join(username, "test", "test_file_1"), txt) + fs.make_dir(os.path.join(username, "test", "test_sub_dir")) + fs.make_file(os.path.join(username, "test", "test_sub_dir", "test_file_2"), txt) res1 = await cli.get("list/..", headers={"Authorization": ""}) assert res1.status == 404 res2 = await cli.get( @@ -596,9 +566,7 @@ async def test_list(): assert sum(file_folder_count) == 2 # testing sub-directory - res5 = await cli.get( - os.path.join("list", "test"), headers={"Authorization": ""} - ) + res5 = await cli.get(os.path.join("list", "test"), headers={"Authorization": ""}) assert res5.status == 200 json_text = await res5.text() json = decoder.decode(json_text) @@ -608,16 +576,14 @@ async def test_list(): assert sum(file_folder_count) == 1 # testing list dot-files - f4 = fs.make_file(os.path.join(username, "test", ".test_file_1"), txt) + fs.make_file(os.path.join(username, "test", ".test_file_1"), txt) # f5 = fs.make_file(os.path.join(username, 'test', '.globus_id'), txt) res6 = await cli.get("/list/", headers={"Authorization": ""}) assert res6.status == 200 json_text = await res6.text() json = decoder.decode(json_text) - file_names = [ - file_json["name"] for file_json in json if not file_json["isFolder"] - ] + file_names = [file_json["name"] for file_json in json if not file_json["isFolder"]] assert ".test_file_1" not in file_names assert ".globus_id" not in file_names assert len(file_names) == 2 @@ -629,9 +595,7 @@ async def test_list(): assert res7.status == 200 
json_text = await res7.text() json = decoder.decode(json_text) - file_names = [ - file_json["name"] for file_json in json if not file_json["isFolder"] - ] + file_names = [file_json["name"] for file_json in json if not file_json["isFolder"]] assert ".test_file_1" in file_names assert ".globus_id" in file_names assert len(file_names) == 4 @@ -642,8 +606,8 @@ async def test_download(txt): username = "testuser" async with AppClient(config, username) as cli: with FileUtil() as fs: - d = fs.make_dir(os.path.join(username, "test")) - f = fs.make_file(os.path.join(username, "test", "test_file_1"), txt) + fs.make_dir(os.path.join(username, "test")) + fs.make_file(os.path.join(username, "test", "test_file_1"), txt) res = await cli.get( os.path.join("download", "test", "test_file_1"), @@ -658,7 +622,7 @@ async def test_download_errors(): username = "testuser" async with AppClient(config, username) as cli: with FileUtil() as fs: - d = fs.make_dir(os.path.join(username, "test")) + fs.make_dir(os.path.join(username, "test")) res1 = await cli.get("dwnload", headers={"Authorization": ""}) assert res1.status == 404 @@ -673,24 +637,18 @@ async def test_similar(): username = "testuser" async with AppClient(config, username) as cli: with FileUtil() as fs: - d = fs.make_dir(os.path.join(username, "test")) - f = fs.make_file(os.path.join(username, "test", "test_file_1.fq"), txt) - d1 = fs.make_dir(os.path.join(username, "test", "test_sub_dir")) - f1 = fs.make_file( - os.path.join(username, "test", "test_sub_dir", "test_file_2.fq"), txt - ) - f2 = fs.make_file( + fs.make_dir(os.path.join(username, "test")) + fs.make_file(os.path.join(username, "test", "test_file_1.fq"), txt) + fs.make_dir(os.path.join(username, "test", "test_sub_dir")) + fs.make_file(os.path.join(username, "test", "test_sub_dir", "test_file_2.fq"), txt) + fs.make_file( os.path.join(username, "test", "test_sub_dir", "test_file_right.fq"), txt, ) - f3 = fs.make_file( - os.path.join(username, "test", "test_sub_dir", "my_files"), txt - ) + fs.make_file(os.path.join(username, "test", "test_sub_dir", "my_files"), txt) # testing similar file name - res1 = await cli.get( - "similar/test/test_file_1.fq", headers={"Authorization": ""} - ) + res1 = await cli.get("similar/test/test_file_1.fq", headers={"Authorization": ""}) assert res1.status == 200 json_text = await res1.text() json = decoder.decode(json_text) @@ -699,9 +657,7 @@ async def test_similar(): assert json[1].get("name") in ["test_file_2.fq", "test_file_right.fq"] # testing non-existing file - res2 = await cli.get( - "similar/test/non-existing", headers={"Authorization": ""} - ) + res2 = await cli.get("similar/test/non-existing", headers={"Authorization": ""}) assert res2.status == 404 # testing path is a directory @@ -714,22 +670,14 @@ async def test_existence(): username = "testuser" async with AppClient(config, username) as cli: with FileUtil() as fs: - d = fs.make_dir(os.path.join(username, "test")) - f = fs.make_file(os.path.join(username, "test", "test_file_1"), txt) - d2 = fs.make_dir(os.path.join(username, "test", "test_sub_dir")) - f3 = fs.make_file( - os.path.join(username, "test", "test_sub_dir", "test_file_2"), txt - ) - d3 = fs.make_dir( - os.path.join(username, "test", "test_sub_dir", "test_file_1") - ) - d4 = fs.make_dir( - os.path.join(username, "test", "test_sub_dir", "test_sub_dir") - ) - f4 = fs.make_file( - os.path.join( - username, "test", "test_sub_dir", "test_sub_dir", "test_file_1" - ), + fs.make_dir(os.path.join(username, "test")) + fs.make_file(os.path.join(username, 
"test", "test_file_1"), txt) + fs.make_dir(os.path.join(username, "test", "test_sub_dir")) + fs.make_file(os.path.join(username, "test", "test_sub_dir", "test_file_2"), txt) + fs.make_dir(os.path.join(username, "test", "test_sub_dir", "test_file_1")) + fs.make_dir(os.path.join(username, "test", "test_sub_dir", "test_sub_dir")) + fs.make_file( + os.path.join(username, "test", "test_sub_dir", "test_sub_dir", "test_file_1"), txt, ) @@ -750,9 +698,7 @@ async def test_existence(): assert json["isFolder"] is False # testing existence of folder - res3 = await cli.get( - "existence/test_sub_dir", headers={"Authorization": ""} - ) + res3 = await cli.get("existence/test_sub_dir", headers={"Authorization": ""}) assert res3.status == 200 json_text = await res3.text() json = decoder.decode(json_text) @@ -780,10 +726,10 @@ async def test_search(): username = "testuser" async with AppClient(config, username) as cli: with FileUtil() as fs: - d = fs.make_dir(os.path.join(username, "test")) - f = fs.make_file(os.path.join(username, "test", "test1"), txt) - d2 = fs.make_dir(os.path.join(username, "test", "test2")) - f3 = fs.make_file(os.path.join(username, "test", "test2", "test3"), txt) + fs.make_dir(os.path.join(username, "test")) + fs.make_file(os.path.join(username, "test", "test1"), txt) + fs.make_dir(os.path.join(username, "test", "test2")) + fs.make_file(os.path.join(username, "test", "test2", "test3"), txt) res1 = await cli.get("search/", headers={"Authorization": ""}) assert res1.status == 200 json_text = await res1.text() @@ -810,14 +756,12 @@ async def test_upload(): assert res1.status == 400 with FileUtil() as fs: - d = fs.make_dir(os.path.join(username, "test")) + fs.make_dir(os.path.join(username, "test")) f = fs.make_file(os.path.join(username, "test", "test_file_1"), txt) files = {"destPath": "/", "uploads": open(f, "rb")} - res2 = await cli.post( - os.path.join("upload"), headers={"Authorization": ""}, data=files - ) + res2 = await cli.post(os.path.join("upload"), headers={"Authorization": ""}, data=files) assert res2.status == 200 @@ -826,7 +770,6 @@ async def _upload_file_fail_filename(filename: str, err: str): # Note two file uploads in a row causes a test error: # https://github.com/aio-libs/aiohttp/issues/3968 async with AppClient(config, "fake") as cli: # username is ignored by AppClient - formdata = FormData() formdata.add_field("destPath", "/") formdata.add_field("uploads", BytesIO(b"sometext"), filename=filename) @@ -839,17 +782,18 @@ async def _upload_file_fail_filename(filename: str, err: str): async def test_upload_fail_leading_space(): await _upload_file_fail_filename( - " test_file", "cannot upload file with name beginning with space") + " test_file", "cannot upload file with name beginning with space" + ) async def test_upload_fail_dotfile(): await _upload_file_fail_filename( - ".test_file", "cannot upload file with name beginning with '.'") + ".test_file", "cannot upload file with name beginning with '.'" + ) async def test_upload_fail_comma_in_file(): - await _upload_file_fail_filename( - "test,file", "cannot upload file with ',' in name") + await _upload_file_fail_filename("test,file", "cannot upload file with ',' in name") @settings(deadline=None) @@ -898,9 +842,7 @@ async def test_directory_decompression(contents): assert not os.path.exists(d2) assert not os.path.exists(f3) assert os.path.exists(compressed) - resp = await cli.patch( - "/decompress/" + name, headers={"Authorization": ""} - ) + resp = await cli.patch("/decompress/" + name, headers={"Authorization": ""}) 
assert resp.status == 200 text = await resp.text() assert "succesfully decompressed" in text @@ -1020,38 +962,46 @@ async def test_bulk_specification_success(): base = Path(fu.base_dir) / "testuser" tsv = "genomes.tsv" with open(base / tsv, "w") as f: - f.writelines([ - "Data type: genomes; Columns: 3; Version: 1\n", - "spec1\tspec2\t spec3 \n", - "Spec 1\t Spec 2\t Spec 3\n", - "val1 \t ꔆ \t 7\n", - "val3\tval4\t1\n", - ]) + f.writelines( + [ + "Data type: genomes; Columns: 3; Version: 1\n", + "spec1\tspec2\t spec3 \n", + "Spec 1\t Spec 2\t Spec 3\n", + "val1 \t ꔆ \t 7\n", + "val3\tval4\t1\n", + ] + ) csv = "somefolder/breakfastcereals.csv" with open(base / csv, "w") as f: - f.writelines([ - "Data type: breakfastcereals; Columns: 3; Version: 1\n", - "s1,s2,s3\n", - "S 1,S 2,S 3\n", - "froot loops , puffin , gross\n", - "grape nuts , dietary fiber, also gross\n", - ]) + f.writelines( + [ + "Data type: breakfastcereals; Columns: 3; Version: 1\n", + "s1,s2,s3\n", + "S 1,S 2,S 3\n", + "froot loops , puffin , gross\n", + "grape nuts , dietary fiber, also gross\n", + ] + ) excel = "importspec.xlsx" with pandas.ExcelWriter(base / excel) as exw: - df = pandas.DataFrame([ - ["Data type: fruit_bats; Columns: 2; Version: 1"], - ["bat_name", "wing_count"], - ["Name of Bat", "Number of wings"], - ["George", 42], - ["Fred", 1.5] - ]) + df = pandas.DataFrame( + [ + ["Data type: fruit_bats; Columns: 2; Version: 1"], + ["bat_name", "wing_count"], + ["Name of Bat", "Number of wings"], + ["George", 42], + ["Fred", 1.5], + ] + ) df.to_excel(exw, sheet_name="bats", header=False, index=False) - df = pandas.DataFrame([ - ["Data type: tree_sloths; Columns: 2; Version: 1"], - ["entity_id", "preferred_food"], - ["Entity ID", "Preferred Food"], - ["That which ends all", "ꔆ"], - ]) + df = pandas.DataFrame( + [ + ["Data type: tree_sloths; Columns: 2; Version: 1"], + ["entity_id", "preferred_food"], + ["Entity ID", "Preferred Food"], + ["That which ends all", "ꔆ"], + ] + ) df.to_excel(exw, sheet_name="sloths", header=False, index=False) resp = await cli.get(f"bulk_specification/?files={tsv} , {csv}, {excel} ") @@ -1060,7 +1010,7 @@ async def test_bulk_specification_success(): "types": { "genomes": [ {"spec1": "val1", "spec2": "ꔆ", "spec3": 7}, - {"spec1": "val3", "spec2": "val4", "spec3": 1} + {"spec1": "val3", "spec2": "val4", "spec3": 1}, ], "breakfastcereals": [ {"s1": "froot loops", "s2": "puffin", "s3": "gross"}, @@ -1070,18 +1020,20 @@ async def test_bulk_specification_success(): {"bat_name": "George", "wing_count": 42}, {"bat_name": "Fred", "wing_count": 1.5}, ], - "tree_sloths": [ - {"entity_id": "That which ends all", "preferred_food": "ꔆ"} - ] + "tree_sloths": [{"entity_id": "That which ends all", "preferred_food": "ꔆ"}], }, "files": { "genomes": {"file": "testuser/genomes.tsv", "tab": None}, "breakfastcereals": { "file": "testuser/somefolder/breakfastcereals.csv", - "tab": None}, + "tab": None, + }, "fruit_bats": {"file": "testuser/importspec.xlsx", "tab": "bats"}, - "tree_sloths": {"file": "testuser/importspec.xlsx", "tab": "sloths"}, - } + "tree_sloths": { + "file": "testuser/importspec.xlsx", + "tab": "sloths", + }, + }, } assert resp.status == 200 @@ -1102,17 +1054,19 @@ async def test_bulk_specification_fail_not_found(): base = Path(fu.base_dir) / "testuser" tsv = "otherfolder/genomes.tsv" with open(base / tsv, "w") as f: - f.writelines([ - "Data type: genomes; Columns: 3; Version: 1\n", - "spec1\tspec2\t spec3 \n", - "Spec 1\t Spec 2\t Spec 3\n", - "val1 \t ꔆ \t 7\n", - ]) + f.writelines( + [ + "Data 
type: genomes; Columns: 3; Version: 1\n", + "spec1\tspec2\t spec3 \n", + "Spec 1\t Spec 2\t Spec 3\n", + "val1 \t ꔆ \t 7\n", + ] + ) resp = await cli.get(f"bulk_specification/?files={tsv},somefile.csv") jsn = await resp.json() - assert jsn == {"errors": [ - {"type": "cannot_find_file", "file": "testuser/somefile.csv"} - ]} + assert jsn == { + "errors": [{"type": "cannot_find_file", "file": "testuser/somefile.csv"}] + } assert resp.status == 404 @@ -1128,83 +1082,102 @@ async def test_bulk_specification_fail_parse_fail(): tsv = "otherfolder/genomes.tsv" # this one is fine with open(base / tsv, "w") as f: - f.writelines([ - "Data type: genomes; Columns: 3; Version: 1\n", - "spec1\tspec2\t spec3 \n", - "Spec 1\t Spec 2\t Spec 3\n", - "val1 \t ꔆ \t 7\n", - ]) + f.writelines( + [ + "Data type: genomes; Columns: 3; Version: 1\n", + "spec1\tspec2\t spec3 \n", + "Spec 1\t Spec 2\t Spec 3\n", + "val1 \t ꔆ \t 7\n", + ] + ) csv = "otherfolder/thing.csv" # this one has a misspelling in the header with open(base / csv, "w") as f: - f.writelines([ - "Dater type: breakfastcereals; Columns: 3; Version: 1\n", - "s1,s2,s3\n", - "S 1,S 2,S 3\n", - "froot loops , puffin , gross\n", - ]) + f.writelines( + [ + "Dater type: breakfastcereals; Columns: 3; Version: 1\n", + "s1,s2,s3\n", + "S 1,S 2,S 3\n", + "froot loops , puffin , gross\n", + ] + ) excel = "stuff.xlsx" with pandas.ExcelWriter(base / excel) as exw: # this one is fine - df = pandas.DataFrame([ - ["Data type: fruit_bats; Columns: 2; Version: 1"], - ["bat_name", "wing_count"], - ["Name of Bat", "Number of wings"], - ["George", 42], - ]) + df = pandas.DataFrame( + [ + ["Data type: fruit_bats; Columns: 2; Version: 1"], + ["bat_name", "wing_count"], + ["Name of Bat", "Number of wings"], + ["George", 42], + ] + ) df.to_excel(exw, sheet_name="bats", header=False, index=False) # this one is missing a parameter ID - df = pandas.DataFrame([ - ["Data type: tree_sloths; Columns: 2; Version: 1"], - ["", "preferred_food"], - ["ID", "Foods I like"], - ["Kevin Garibaldi", "Beeeaaaaans!"], - ]) + df = pandas.DataFrame( + [ + ["Data type: tree_sloths; Columns: 2; Version: 1"], + ["", "preferred_food"], + ["ID", "Foods I like"], + ["Kevin Garibaldi", "Beeeaaaaans!"], + ] + ) df.to_excel(exw, sheet_name="sloths", header=False, index=False) # this also tests a number of bad file extensions - no need to create files - resp = await cli.get(f"bulk_specification/?files={tsv},{csv},{excel}" - + ",badfile,badfile.fasta.gz,badfile.sra,badfile.sys,badfile.") + resp = await cli.get( + f"bulk_specification/?files={tsv},{csv},{excel}" + + ",badfile,badfile.fasta.gz,badfile.sra,badfile.sys,badfile." 
+ ) jsn = await resp.json() - assert jsn == {"errors": [ - {"type": "cannot_parse_file", - "message": 'Invalid header; got "Dater type: breakfastcereals; ' - + 'Columns: 3; Version: 1", expected "Data type: ; ' - + 'Columns: ; Version: "', - "file": "testuser/otherfolder/thing.csv", - "tab": None, - }, - {"type": "cannot_parse_file", - "message": "Missing header entry in row 2, position 1", - "file": "testuser/stuff.xlsx", - "tab": "sloths", - }, - {"type": "cannot_parse_file", - "message": "badfile is not a supported file type for import specifications", - "file": "testuser/badfile", - "tab": None, - }, - {"type": "cannot_parse_file", - "message": "fasta.gz is not a supported file type for import specifications", - "file": "testuser/badfile.fasta.gz", - "tab": None, - }, - {"type": "cannot_parse_file", - "message": "sra is not a supported file type for import specifications", - "file": "testuser/badfile.sra", - "tab": None, - }, - {"type": "cannot_parse_file", - "message": "sys is not a supported file type for import specifications", - "file": "testuser/badfile.sys", - "tab": None, - }, - {"type": "cannot_parse_file", - "message": "badfile. is not a supported file type for import specifications", - "file": "testuser/badfile.", - "tab": None, - }, - ]} + assert jsn == { + "errors": [ + { + "type": "cannot_parse_file", + "message": 'Invalid header; got "Dater type: breakfastcereals; ' + + 'Columns: 3; Version: 1", expected "Data type: ; ' + + 'Columns: ; Version: "', + "file": "testuser/otherfolder/thing.csv", + "tab": None, + }, + { + "type": "cannot_parse_file", + "message": "Missing header entry in row 2, position 1", + "file": "testuser/stuff.xlsx", + "tab": "sloths", + }, + { + "type": "cannot_parse_file", + "message": "badfile is not a supported file type for import specifications", + "file": "testuser/badfile", + "tab": None, + }, + { + "type": "cannot_parse_file", + "message": "fasta.gz is not a supported file type for import specifications", + "file": "testuser/badfile.fasta.gz", + "tab": None, + }, + { + "type": "cannot_parse_file", + "message": "sra is not a supported file type for import specifications", + "file": "testuser/badfile.sra", + "tab": None, + }, + { + "type": "cannot_parse_file", + "message": "sys is not a supported file type for import specifications", + "file": "testuser/badfile.sys", + "tab": None, + }, + { + "type": "cannot_parse_file", + "message": "badfile. 
is not a supported file type for import specifications", + "file": "testuser/badfile.", + "tab": None, + }, + ] + } assert resp.status == 400 @@ -1216,55 +1189,66 @@ async def test_bulk_specification_fail_column_count(): tsv = "genomes.tsv" # this one is fine with open(base / tsv, "w") as f: - f.writelines([ - "Data type: genomes; Columns: 3; Version: 1\n", - "spec1\tspec2\t spec3 \n", - "Spec 1\t Spec 2\t Spec 3\n", - "val1 \t ꔆ \t 7\n", - ]) + f.writelines( + [ + "Data type: genomes; Columns: 3; Version: 1\n", + "spec1\tspec2\t spec3 \n", + "Spec 1\t Spec 2\t Spec 3\n", + "val1 \t ꔆ \t 7\n", + ] + ) csv = "thing.csv" # this one is missing a column in the last row with open(base / csv, "w") as f: - f.writelines([ - "Data type: breakfastcereals; Columns: 3; Version: 1\n", - "s1,s2,s3\n", - "S 1,S 2,S 3\n", - "froot loops , puffin\n", - ]) + f.writelines( + [ + "Data type: breakfastcereals; Columns: 3; Version: 1\n", + "s1,s2,s3\n", + "S 1,S 2,S 3\n", + "froot loops , puffin\n", + ] + ) excel = "stuff.xlsx" with pandas.ExcelWriter(base / excel) as exw: # this one has an extra column in the last row - df = pandas.DataFrame([ - ["Data type: fruit_bats; Columns: 2; Version: 1"], - ["bat_name", "wing_count"], - ["Name of Bat", "Number of wings"], - ["George", 42, 56], - ]) + df = pandas.DataFrame( + [ + ["Data type: fruit_bats; Columns: 2; Version: 1"], + ["bat_name", "wing_count"], + ["Name of Bat", "Number of wings"], + ["George", 42, 56], + ] + ) df.to_excel(exw, sheet_name="bats", header=False, index=False) # this one is fine - df = pandas.DataFrame([ - ["Data type: tree_sloths; Columns: 2; Version: 1"], - ["entity_id", "preferred_food"], - ["Entity ID", "Preferred Food"], - ["That which ends all", "ꔆ"], - ]) + df = pandas.DataFrame( + [ + ["Data type: tree_sloths; Columns: 2; Version: 1"], + ["entity_id", "preferred_food"], + ["Entity ID", "Preferred Food"], + ["That which ends all", "ꔆ"], + ] + ) df.to_excel(exw, sheet_name="sloths", header=False, index=False) - resp = await cli.get( - f"bulk_specification/?files={tsv},{csv},{excel}") + resp = await cli.get(f"bulk_specification/?files={tsv},{csv},{excel}") jsn = await resp.json() - assert jsn == {"errors": [ - {"type": "incorrect_column_count", - "message": "Incorrect number of items in line 4, expected 3, got 2", - "file": "testuser/thing.csv", - "tab": None, - }, - {"type": "incorrect_column_count", - "message": "Incorrect number of items in line 4, expected 2, got 3", - "file": "testuser/stuff.xlsx", - "tab": "bats", - }, - ]} + assert jsn == { + "errors": [ + { + "type": "incorrect_column_count", + "message": "Incorrect number of items in line 4, expected 3, got 2", + "file": "testuser/thing.csv", + "tab": None, + }, + { + "type": "incorrect_column_count", + "message": "Incorrect number of items in line 4, expected 2, got 3", + "file": "testuser/stuff.xlsx", + "tab": "bats", + }, + ] + } assert resp.status == 400 @@ -1276,69 +1260,82 @@ async def test_bulk_specification_fail_multiple_specs_per_type(): tsv = "genomes.tsv" # this one is fine with open(base / tsv, "w") as f: - f.writelines([ - "Data type: genomes; Columns: 3; Version: 1\n", - "spec1\tspec2\t spec3 \n", - "Spec 1\t Spec 2\t Spec 3\n", - "val1 \t ꔆ \t 7\n", - ]) + f.writelines( + [ + "Data type: genomes; Columns: 3; Version: 1\n", + "spec1\tspec2\t spec3 \n", + "Spec 1\t Spec 2\t Spec 3\n", + "val1 \t ꔆ \t 7\n", + ] + ) csv1 = "thing.csv" # this is the first of the breakfastcereals data sources, so fine with open(base / csv1, "w") as f: - f.writelines([ - "Data type: 
breakfastcereals; Columns: 3; Version: 1\n", - "s1,s2,s3\n", - "S 1,S 2,S 3\n", - "froot loops , puffin, whee\n", - ]) + f.writelines( + [ + "Data type: breakfastcereals; Columns: 3; Version: 1\n", + "s1,s2,s3\n", + "S 1,S 2,S 3\n", + "froot loops , puffin, whee\n", + ] + ) csv2 = "thing2.csv" # this data type is also breakfastcereals, so will cause an error with open(base / csv2, "w") as f: - f.writelines([ - "Data type: breakfastcereals; Columns: 2; Version: 1\n", - "s1,s2\n", - "S 1,S 2\n", - "froot loops , puffin\n", - ]) + f.writelines( + [ + "Data type: breakfastcereals; Columns: 2; Version: 1\n", + "s1,s2\n", + "S 1,S 2\n", + "froot loops , puffin\n", + ] + ) excel = "stuff.xlsx" with pandas.ExcelWriter(base / excel) as exw: # this data type is also breakfastcereals, so will cause an error - df = pandas.DataFrame([ - ["Data type: breakfastcereals; Columns: 2; Version: 1"], - ["bat_name", "wing_count"], - ["Name of Bat", "Number of wings"], - ["George", 42], - ]) + df = pandas.DataFrame( + [ + ["Data type: breakfastcereals; Columns: 2; Version: 1"], + ["bat_name", "wing_count"], + ["Name of Bat", "Number of wings"], + ["George", 42], + ] + ) df.to_excel(exw, sheet_name="bats", header=False, index=False) # this one is fine - df = pandas.DataFrame([ - ["Data type: tree_sloths; Columns: 2; Version: 1"], - ["entity_id", "preferred_food"], - ["Entity ID", "Preferred Food"], - ["That which ends all", "ꔆ"], - ]) + df = pandas.DataFrame( + [ + ["Data type: tree_sloths; Columns: 2; Version: 1"], + ["entity_id", "preferred_food"], + ["Entity ID", "Preferred Food"], + ["That which ends all", "ꔆ"], + ] + ) df.to_excel(exw, sheet_name="sloths", header=False, index=False) - resp = await cli.get( - f"bulk_specification/?files={tsv},{csv1},{csv2},{excel}") + resp = await cli.get(f"bulk_specification/?files={tsv},{csv1},{csv2},{excel}") jsn = await resp.json() err = "Data type breakfastcereals appears in two importer specification sources" - assert jsn == {"errors": [ - {"type": "multiple_specifications_for_data_type", - "message": err, - "file_1": "testuser/thing.csv", - "tab_1": None, - "file_2": "testuser/thing2.csv", - "tab_2": None - }, - {"type": "multiple_specifications_for_data_type", - "message": err, - "file_1": "testuser/thing.csv", - "tab_1": None, - "file_2": "testuser/stuff.xlsx", - "tab_2": "bats" - }, - ]} + assert jsn == { + "errors": [ + { + "type": "multiple_specifications_for_data_type", + "message": err, + "file_1": "testuser/thing.csv", + "tab_1": None, + "file_2": "testuser/thing2.csv", + "tab_2": None, + }, + { + "type": "multiple_specifications_for_data_type", + "message": err, + "file_1": "testuser/thing.csv", + "tab_1": None, + "file_2": "testuser/stuff.xlsx", + "tab_2": "bats", + }, + ] + } assert resp.status == 400 @@ -1353,62 +1350,68 @@ async def test_bulk_specification_fail_multiple_specs_per_type_excel(): base = Path(fu.base_dir) / "testuser" excel = "stuff.xlsx" with pandas.ExcelWriter(base / excel) as exw: - df = pandas.DataFrame([ - ["Data type: breakfastcereals; Columns: 2; Version: 1"], - ["bat_name", "wing_count"], - ["Name of Bat", "Number of wings"], - ["George", 42], - ]) + df = pandas.DataFrame( + [ + ["Data type: breakfastcereals; Columns: 2; Version: 1"], + ["bat_name", "wing_count"], + ["Name of Bat", "Number of wings"], + ["George", 42], + ] + ) df.to_excel(exw, sheet_name="bats", header=False, index=False) - df = pandas.DataFrame([ - ["Data type: tree_sloths; Columns: 2; Version: 1"], - ["entity_id", "preferred_food"], - ["Entity ID", "Preferred Food"], 
- ["That which ends all", "ꔆ"], - ]) + df = pandas.DataFrame( + [ + ["Data type: tree_sloths; Columns: 2; Version: 1"], + ["entity_id", "preferred_food"], + ["Entity ID", "Preferred Food"], + ["That which ends all", "ꔆ"], + ] + ) df.to_excel(exw, sheet_name="sloths", header=False, index=False) - df = pandas.DataFrame([ - ["Data type: breakfastcereals; Columns: 2; Version: 1"], - ["bat_name", "wing_count"], - ["Name of Bat", "Number of wings"], - ["George", 42], - ]) + df = pandas.DataFrame( + [ + ["Data type: breakfastcereals; Columns: 2; Version: 1"], + ["bat_name", "wing_count"], + ["Name of Bat", "Number of wings"], + ["George", 42], + ] + ) df.to_excel(exw, sheet_name="otherbats", header=False, index=False) resp = await cli.get(f"bulk_specification/?files={excel}") jsn = await resp.json() - assert jsn == {"errors": [ - {"type": "multiple_specifications_for_data_type", - "message": "Found datatype breakfastcereals in multiple tabs", - "file_1": "testuser/stuff.xlsx", - "tab_1": "bats", - "file_2": "testuser/stuff.xlsx", - "tab_2": "otherbats" - }, - ]} + assert jsn == { + "errors": [ + { + "type": "multiple_specifications_for_data_type", + "message": "Found datatype breakfastcereals in multiple tabs", + "file_1": "testuser/stuff.xlsx", + "tab_1": "bats", + "file_2": "testuser/stuff.xlsx", + "tab_2": "otherbats", + }, + ] + } assert resp.status == 400 _IMPORT_SPEC_TEST_DATA = { "genome": { - "order_and_display": [ - ["id1", "display1"], - ["id2", "display2"] - ], + "order_and_display": [["id1", "display1"], ["id2", "display2"]], "data": [ {"id1": 54, "id2": "boo"}, {"id1": 32, "id2": "yikes"}, - ] + ], }, "reads": { "order_and_display": [ ["name", "Reads File Name"], - ["inseam", "Reads inseam measurement in km"] + ["inseam", "Reads inseam measurement in km"], ], "data": [ {"name": "myreads.fa", "inseam": 0.1}, - ] - } + ], + }, } @@ -1417,19 +1420,21 @@ async def test_write_bulk_specification_success_csv(): async with AppClient(config) as cli: with FileUtil() as fu: fu.make_dir("testuser") # testuser is hardcoded in the auth mock - resp = await cli.post("write_bulk_specification/", json= - { + resp = await cli.post( + "write_bulk_specification/", + json={ "output_directory": "specs", "output_file_type": "CSV", "types": _IMPORT_SPEC_TEST_DATA, - }) + }, + ) js = await resp.json() assert js == { "output_file_type": "CSV", "files_created": { "genome": "testuser/specs/genome.csv", - "reads": "testuser/specs/reads.csv" - } + "reads": "testuser/specs/reads.csv", + }, } base = Path(fu.base_dir) / "testuser" check_file_contents( @@ -1440,7 +1445,7 @@ async def test_write_bulk_specification_success_csv(): "display1,display2\n", "54,boo\n", "32,yikes\n", - ] + ], ) check_file_contents( base / "specs/reads.csv", @@ -1449,7 +1454,7 @@ async def test_write_bulk_specification_success_csv(): "name,inseam\n", "Reads File Name,Reads inseam measurement in km\n", "myreads.fa,0.1\n", - ] + ], ) @@ -1459,21 +1464,23 @@ async def test_write_bulk_specification_success_tsv(): with FileUtil() as fu: fu.make_dir("testuser") # testuser is hardcoded in the auth mock types = dict(_IMPORT_SPEC_TEST_DATA) - types['reads'] = dict(types['reads']) - types['reads']['data'] = [] - resp = await cli.post("write_bulk_specification", json= - { + types["reads"] = dict(types["reads"]) + types["reads"]["data"] = [] + resp = await cli.post( + "write_bulk_specification", + json={ "output_directory": "tsvspecs", "output_file_type": "TSV", "types": types, - }) + }, + ) js = await resp.json() assert js == { "output_file_type": "TSV", 
"files_created": { "genome": "testuser/tsvspecs/genome.tsv", - "reads": "testuser/tsvspecs/reads.tsv" - } + "reads": "testuser/tsvspecs/reads.tsv", + }, } base = Path(fu.base_dir) / "testuser" check_file_contents( @@ -1484,7 +1491,7 @@ async def test_write_bulk_specification_success_tsv(): "display1\tdisplay2\n", "54\tboo\n", "32\tyikes\n", - ] + ], ) check_file_contents( base / "tsvspecs/reads.tsv", @@ -1492,7 +1499,7 @@ async def test_write_bulk_specification_success_tsv(): "Data type: reads; Columns: 2; Version: 1\n", "name\tinseam\n", "Reads File Name\tReads inseam measurement in km\n", - ] + ], ) @@ -1501,19 +1508,21 @@ async def test_write_bulk_specification_success_excel(): async with AppClient(config) as cli: with FileUtil() as fu: fu.make_dir("testuser") # testuser is hardcoded in the auth mock - resp = await cli.post("write_bulk_specification/", json= - { + resp = await cli.post( + "write_bulk_specification/", + json={ "output_directory": "", "output_file_type": "EXCEL", "types": _IMPORT_SPEC_TEST_DATA, - }) + }, + ) js = await resp.json() assert js == { "output_file_type": "EXCEL", "files_created": { "genome": "testuser/import_specification.xlsx", "reads": "testuser/import_specification.xlsx", - } + }, } wb = openpyxl.load_workbook(Path(fu.base_dir) / "testuser/import_specification.xlsx") assert wb.sheetnames == ["genome", "reads"] @@ -1553,7 +1562,8 @@ async def test_write_bulk_specification_fail_wrong_data_type(): async def test_write_bulk_specification_fail_no_content_length(): async with AppClient(config) as cli: resp = await cli.post( - "write_bulk_specification", headers={"content-type": "application/json"}) + "write_bulk_specification", headers={"content-type": "application/json"} + ) js = await resp.json() assert js == {"error": "The content-length header is required and must be > 0"} assert resp.status == 411 @@ -1578,33 +1588,39 @@ async def _write_bulk_specification_json_fail(json: Any, err: str): async def test_write_bulk_specification_fail_not_dict(): await _write_bulk_specification_json_fail( - ["foo"], "The top level JSON element must be a mapping") + ["foo"], "The top level JSON element must be a mapping" + ) async def test_write_bulk_specification_fail_no_output_dir(): await _write_bulk_specification_json_fail( - {}, "output_directory is required and must be a string") + {}, "output_directory is required and must be a string" + ) async def test_write_bulk_specification_fail_wrong_type_for_output_dir(): await _write_bulk_specification_json_fail( - {"output_directory": 4}, "output_directory is required and must be a string") + {"output_directory": 4}, "output_directory is required and must be a string" + ) async def test_write_bulk_specification_fail_no_file_type(): await _write_bulk_specification_json_fail( - {"output_directory": "foo"}, "Invalid output_file_type: None") + {"output_directory": "foo"}, "Invalid output_file_type: None" + ) async def test_write_bulk_specification_fail_wrong_file_type(): await _write_bulk_specification_json_fail( - {"output_directory": "foo", "output_file_type": "XSV"}, "Invalid output_file_type: XSV") + {"output_directory": "foo", "output_file_type": "XSV"}, + "Invalid output_file_type: XSV", + ) async def test_write_bulk_specification_fail_invalid_type_value(): await _write_bulk_specification_json_fail( {"output_directory": "foo", "output_file_type": "CSV", "types": {"a": "fake"}}, - "The value for data type a must be a mapping" + "The value for data type a must be a mapping", ) diff --git a/tests/test_app_error_formatter.py 
b/tests/test_app_error_formatter.py index a700fd9b..ace418f0 100644 --- a/tests/test_app_error_formatter.py +++ b/tests/test_app_error_formatter.py @@ -1,7 +1,11 @@ from pathlib import Path from staging_service.app_error_formatter import format_import_spec_errors -from staging_service.import_specifications.file_parser import ErrorType, Error, SpecificationSource +from staging_service.import_specifications.file_parser import ( + ErrorType, + Error, + SpecificationSource, +) def _ss(file: str, tab: str = None) -> SpecificationSource: @@ -15,10 +19,11 @@ def test_format_import_spec_errors_no_input(): def test_format_import_spec_errors_one_error(): errors = [Error(ErrorType.OTHER, "foobar")] assert format_import_spec_errors(errors, {}) == [ - {"type": "unexpected_error", - "message": "foobar", - "file": None, - } + { + "type": "unexpected_error", + "message": "foobar", + "file": None, + } ] @@ -31,7 +36,7 @@ def test_format_import_spec_errors_all_the_errors_no_tabs(): ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE, "foobar4", _ss("file4"), - _ss("file5") + _ss("file5"), ), Error(ErrorType.NO_FILES_PROVIDED), Error(ErrorType.FILE_NOT_FOUND, source_1=_ss("file6")), @@ -45,31 +50,31 @@ def test_format_import_spec_errors_all_the_errors_no_tabs(): Path("file6"): Path("f6"), } assert format_import_spec_errors(errors, paths) == [ - {"type": "unexpected_error", - "message": "foobar1", - "file": "f1", - }, - {"type": "cannot_parse_file", - "message": "foobar2", - "file": "f2", - "tab": None - }, - {"type": "incorrect_column_count", - "message": "foobar3", - "file": "f3", - "tab": None, - }, - {"type": "multiple_specifications_for_data_type", - "message": "foobar4", - "file_1": "f4", - "tab_1": None, - "file_2": "f5", - "tab_2": None, - }, + { + "type": "unexpected_error", + "message": "foobar1", + "file": "f1", + }, + {"type": "cannot_parse_file", "message": "foobar2", "file": "f2", "tab": None}, + { + "type": "incorrect_column_count", + "message": "foobar3", + "file": "f3", + "tab": None, + }, + { + "type": "multiple_specifications_for_data_type", + "message": "foobar4", + "file_1": "f4", + "tab_1": None, + "file_2": "f5", + "tab_2": None, + }, {"type": "no_files_provided"}, - {"type": "cannot_find_file", - "file": "f6", - }, + { + "type": "cannot_find_file", + "file": "f6", + }, ] @@ -81,7 +86,7 @@ def test_format_import_spec_errors_all_the_errors_with_tabs(): ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE, "foobar3", _ss("file3", "tab3"), - _ss("file4", "tab4") + _ss("file4", "tab4"), ), ] paths = { @@ -91,21 +96,24 @@ def test_format_import_spec_errors_all_the_errors_with_tabs(): Path("file4"): Path("f4"), } assert format_import_spec_errors(errors, paths) == [ - {"type": "cannot_parse_file", - "message": "foobar1", - "file": "f1", - "tab": "tab1" - }, - {"type": "incorrect_column_count", - "message": "foobar2", - "file": "f2", - "tab": "tab2", - }, - {"type": "multiple_specifications_for_data_type", - "message": "foobar3", - "file_1": "f3", - "tab_1": "tab3", - "file_2": "f4", - "tab_2": "tab4", - }, - ] \ No newline at end of file + { + "type": "cannot_parse_file", + "message": "foobar1", + "file": "f1", + "tab": "tab1", + }, + { + "type": "incorrect_column_count", + "message": "foobar2", + "file": "f2", + "tab": "tab2", + }, + { + "type": "multiple_specifications_for_data_type", + "message": "foobar3", + "file_1": "f3", + "tab_1": "tab3", + "file_2": "f4", + "tab_2": "tab4", + }, + ] diff --git a/tests/test_autodetect.py b/tests/test_autodetect.py index 21bbbc82..2cc4fda3 100644 --- 
a/tests/test_autodetect.py +++ b/tests/test_autodetect.py @@ -1,10 +1,11 @@ import pytest + +from staging_service.AutoDetectUtils import AutoDetectUtils +from staging_service.app import inject_config_dependencies from staging_service.autodetect.GenerateMappings import ( file_format_to_extension_mapping, extensions_mapping, ) -from staging_service.AutoDetectUtils import AutoDetectUtils -from staging_service.app import inject_config_dependencies from tests.test_utils import bootstrap_config @@ -36,11 +37,7 @@ def test_bad_filenames(): filename=filename ) assert possible_importers is None - assert fileinfo == { - "prefix": filename, - "suffix": None, - "file_ext_type": [] - } + assert fileinfo == {"prefix": filename, "suffix": None, "file_ext_type": []} def test_reasonable_filenames(): @@ -53,11 +50,13 @@ def test_reasonable_filenames(): for heading in file_format_to_extension_mapping.keys(): extensions = file_format_to_extension_mapping[heading] for extension in extensions: - good_filenames.append(( - f"{heading}.{extension}", - heading.count("."), - extensions_mapping[extension]['file_ext_type'] - )) + good_filenames.append( + ( + f"{heading}.{extension}", + heading.count("."), + extensions_mapping[extension]["file_ext_type"], + ) + ) for filename, heading_dotcount, ext in good_filenames: for filename_variant in [ @@ -71,12 +70,14 @@ def test_reasonable_filenames(): ) assert possible_importers is not None expected_suffix = filename_variant.split(".", heading_dotcount + 1)[-1] - assert possible_importers == AutoDetectUtils._MAPPINGS["types"][ - expected_suffix.lower()]["mappings"], filename_variant + assert ( + possible_importers + == AutoDetectUtils._MAPPINGS["types"][expected_suffix.lower()]["mappings"] + ), filename_variant assert fileinfo == { - "prefix": filename_variant[:-len(expected_suffix) - 1], + "prefix": filename_variant[: -len(expected_suffix) - 1], "suffix": expected_suffix, - "file_ext_type": ext + "file_ext_type": ext, } @@ -85,68 +86,101 @@ def test_specific_filenames(): Test some made up filenames to check that multi-dot extensions are handled correctly """ test_data = [ - ("filename", (None, {"prefix": "filename", "suffix": None, "file_ext_type": []})), - ("file.name", (None, {"prefix": "file.name", "suffix": None, "file_ext_type": []})), - ("fil.en.ame", (None, {"prefix": "fil.en.ame", "suffix": None, "file_ext_type": []})), - ("file.gZ", ( - [{ - 'app_weight': 1, - 'id': 'decompress', - 'title': 'Decompress/Unpack', - }], - {"prefix": "file" , "suffix": "gZ", "file_ext_type": ['CompressedFileFormatArchive']}, - ) - ), - ("file.name.gZ", ( - [{ - 'app_weight': 1, - 'id': 'decompress', - 'title': 'Decompress/Unpack', - }], - {"prefix": "file.name", - "suffix": "gZ", - "file_ext_type": ['CompressedFileFormatArchive'] - }, - ) - ), - ("oscar_the_grouch_does_meth.FaStA.gz", ( - [ - {'app_weight': 1, - 'id': 'assembly', - 'title': 'Assembly', + ( + "filename", + (None, {"prefix": "filename", "suffix": None, "file_ext_type": []}), + ), + ( + "file.name", + (None, {"prefix": "file.name", "suffix": None, "file_ext_type": []}), + ), + ( + "fil.en.ame", + (None, {"prefix": "fil.en.ame", "suffix": None, "file_ext_type": []}), + ), + ( + "file.gZ", + ( + [ + { + "app_weight": 1, + "id": "decompress", + "title": "Decompress/Unpack", + } + ], + { + "prefix": "file", + "suffix": "gZ", + "file_ext_type": ["CompressedFileFormatArchive"], }, - {'app_weight': 1, - 'id': 'gff_genome', - 'title': 'GFF/FASTA Genome', + ), + ), + ( + "file.name.gZ", + ( + [ + { + "app_weight": 1, + "id": 
"decompress", + "title": "Decompress/Unpack", + } + ], + { + "prefix": "file.name", + "suffix": "gZ", + "file_ext_type": ["CompressedFileFormatArchive"], }, - {'app_weight': 1, - 'id': 'gff_metagenome', - 'title': 'GFF/FASTA MetaGenome', - } - ], - {"prefix": "oscar_the_grouch_does_meth", - "suffix": "FaStA.gz", - "file_ext_type": ["FASTA"] - }, - ) - ), - ("look.at.all.these.frigging.dots.gff2.gzip", ( - [ - {'app_weight': 1, - 'id': 'gff_genome', - 'title': 'GFF/FASTA Genome', - }, - {'app_weight': 1, - 'id': 'gff_metagenome', - 'title': 'GFF/FASTA MetaGenome', - } - ], - {"prefix": "look.at.all.these.frigging.dots", - "suffix": "gff2.gzip", - "file_ext_type": ["GFF"] - }, - ) - ) + ), + ), + ( + "oscar_the_grouch_does_meth.FaStA.gz", + ( + [ + { + "app_weight": 1, + "id": "assembly", + "title": "Assembly", + }, + { + "app_weight": 1, + "id": "gff_genome", + "title": "GFF/FASTA Genome", + }, + { + "app_weight": 1, + "id": "gff_metagenome", + "title": "GFF/FASTA MetaGenome", + }, + ], + { + "prefix": "oscar_the_grouch_does_meth", + "suffix": "FaStA.gz", + "file_ext_type": ["FASTA"], + }, + ), + ), + ( + "look.at.all.these.frigging.dots.gff2.gzip", + ( + [ + { + "app_weight": 1, + "id": "gff_genome", + "title": "GFF/FASTA Genome", + }, + { + "app_weight": 1, + "id": "gff_metagenome", + "title": "GFF/FASTA MetaGenome", + }, + ], + { + "prefix": "look.at.all.these.frigging.dots", + "suffix": "gff2.gzip", + "file_ext_type": ["GFF"], + }, + ), + ), ] for filename, importers in test_data: @@ -159,13 +193,14 @@ def test_sra_mappings(): :return: """ sra_file = "test.sra" - possible_importers, fileinfo = AutoDetectUtils.determine_possible_importers( - filename=sra_file) - assert possible_importers == [{ - 'id': 'sra_reads', - 'app_weight': 1, - 'title': 'SRA Reads', - }] + possible_importers, fileinfo = AutoDetectUtils.determine_possible_importers(filename=sra_file) + assert possible_importers == [ + { + "id": "sra_reads", + "app_weight": 1, + "title": "SRA Reads", + } + ] assert fileinfo == {"prefix": "test", "suffix": "sra", "file_ext_type": ["SRA"]} @@ -175,17 +210,18 @@ def test_zip_mappings(): :return: """ gz_file = "test.tar.gz" - possible_importers, fileinfo = AutoDetectUtils.determine_possible_importers( - filename=gz_file) - assert possible_importers == [{ - 'id': 'decompress', - 'app_weight': 1, - 'title': 'Decompress/Unpack', - }] + possible_importers, fileinfo = AutoDetectUtils.determine_possible_importers(filename=gz_file) + assert possible_importers == [ + { + "id": "decompress", + "app_weight": 1, + "title": "Decompress/Unpack", + } + ] assert fileinfo == { "prefix": "test", "suffix": "tar.gz", - 'file_ext_type': ['CompressedFileFormatArchive'] + "file_ext_type": ["CompressedFileFormatArchive"], } @@ -197,28 +233,33 @@ def test_get_mappings(): assert AutoDetectUtils.get_mappings(["filename", "file.name.Gz", "some.dots.gff3.gz"]) == { "mappings": [ None, - [{ - 'app_weight': 1, - 'id': 'decompress', - 'title': 'Decompress/Unpack', - }], [ - {'app_weight': 1, - 'id': 'gff_genome', - 'title': 'GFF/FASTA Genome', - }, - {'app_weight': 1, - 'id': 'gff_metagenome', - 'title': 'GFF/FASTA MetaGenome', + { + "app_weight": 1, + "id": "decompress", + "title": "Decompress/Unpack", } ], + [ + { + "app_weight": 1, + "id": "gff_genome", + "title": "GFF/FASTA Genome", + }, + { + "app_weight": 1, + "id": "gff_metagenome", + "title": "GFF/FASTA MetaGenome", + }, + ], ], "fileinfo": [ {"prefix": "filename", "suffix": None, "file_ext_type": []}, - {"prefix": "file.name", - "suffix": "Gz", - 
"file_ext_type": ['CompressedFileFormatArchive'] - }, - {"prefix": "some.dots", "suffix": "gff3.gz", "file_ext_type": ['GFF']}, - ] + { + "prefix": "file.name", + "suffix": "Gz", + "file_ext_type": ["CompressedFileFormatArchive"], + }, + {"prefix": "some.dots", "suffix": "gff3.gz", "file_ext_type": ["GFF"]}, + ], } diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 599f0cef..81e544ae 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -1,17 +1,17 @@ -""" Unit tests for the metadata handling routines. """ +"""Unit tests for the metadata handling routines.""" import json import uuid - from collections.abc import Generator from pathlib import Path as PyPath + from pytest import fixture -from staging_service.utils import Path from staging_service.metadata import some_metadata - +from staging_service.utils import Path from tests.test_app import FileUtil + # TODO TEST add more unit tests here. @@ -34,16 +34,11 @@ async def test_incomplete_metadata_file_update(temp_dir: Path): See https://kbase-jira.atlassian.net/browse/PTV-1767 """ await _incomplete_metadata_file_update( - temp_dir, - {"source": "some place", "UPA": "1/2/3"}, - "some place" - ) + temp_dir, {"source": "some place", "UPA": "1/2/3"}, "some place" + ) + + await _incomplete_metadata_file_update(temp_dir, {"UPA": "1/2/3"}, "Unknown") - await _incomplete_metadata_file_update( - temp_dir, - {"UPA": "1/2/3"}, - "Unknown" - ) async def _incomplete_metadata_file_update(temp_dir, metadict, source): target = Path( @@ -51,7 +46,8 @@ async def _incomplete_metadata_file_update(temp_dir, metadict, source): str(temp_dir / "meta"), "user_path", "myfilename", - "super_fake_jgi_path") + "super_fake_jgi_path", + ) with open(target.full_path, "w") as p: p.writelines(make_test_lines(1, 6)) @@ -73,8 +69,9 @@ async def _incomplete_metadata_file_update(temp_dir, metadict, source): "lineCount": "5", "name": "myfilename", "path": "user_path", - "size": 1280 - } + "size": 1280, + } + def make_test_lines(start, stop): return [str(i) + "a" * (256 - len(str(i)) - 1) + "\n" for i in range(start, stop)] diff --git a/tests/test_utils.py b/tests/test_utils.py index 94143456..c8b03754 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,12 +1,12 @@ import configparser +import os import traceback - -from dotenv import load_dotenv from pathlib import Path from typing import Any -import os import openpyxl +from dotenv import load_dotenv + def bootstrap(): test_env_0 = "../test.env" @@ -34,21 +34,23 @@ def bootstrap_config(): def assert_exception_correct(got: Exception, expected: Exception): err = "".join(traceback.TracebackException.from_exception(got).format()) assert got.args == expected.args, err - assert type(got) == type(expected) + assert type(got) == type(expected) # noqa: E721 + def check_file_contents(file: Path, lines: list[str]): with open(file) as f: assert f.readlines() == lines + def check_excel_contents( wb: openpyxl.Workbook, sheetname: str, contents: list[list[Any]], - column_widths: list[int] + column_widths: list[int], ): sheet = wb[sheetname] for i, row in enumerate(sheet.iter_rows()): assert [cell.value for cell in row] == contents[i] # presumably there's an easier way to do this, but it works so f it dims = [sheet.column_dimensions[dim].width for dim in sheet.column_dimensions] - assert dims == column_widths \ No newline at end of file + assert dims == column_widths diff --git a/tox.ini b/tox.ini index 51b50a04..7da1f960 100644 --- a/tox.ini +++ b/tox.ini @@ -1,2 +1,2 @@ [flake8] -max-line-length = 100 \ 
No newline at end of file
+max-line-length = 100