diff --git a/OPERATOR.rst b/OPERATOR.rst index 212aa31f50..81da597319 100644 --- a/OPERATOR.rst +++ b/OPERATOR.rst @@ -239,22 +239,55 @@ Change the target branch of the blocked PR to ``develop`` and remove the ``chain label from that PR. Remove the ``base`` label from the blocking PR. Lastly, remove the blocking relationship. +Updating the AMI for GitLab instances +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Once a month, operators must check for updates to the AMI for the root volume of +the EC2 instance running GitLab. There are ways to dynamically determine the +latest AMI for Amazon Linux 2 but in the spirit of reproducible builds, we would +rather pin the AMI ID and adopt updates at our own discretion to avoid +unexpected failures. To obtain the latest compatible AMI ID, select the desired +``….gitlab`` component, say, ``_select dev.gitlab`` and run + + :: + + aws ssm get-parameters \ + --names \ + $(aws ssm get-parameters-by-path \ + --path /aws/service/ami-amazon-linux-latest \ + --query "Parameters[].Name" \ + | jq -r .[] \ + | grep -F amzn2 \ + | grep -Fv minimal \ + | grep -Fv kernel-5.10 \ + | grep -F x86_64 \ + | grep -F ebs) \ + | jq -r .Parameters[].Value + +This will print the ID of the most recent Amazon Linux 2 AMI. Update the value +of the ``ami_id`` variable in ``terraform/gitlab/gitlab.tf.json.template.py``. +The variable holds a dictionary with one entry per region, because AMIs are +specific to a region. If there are ``….gitlab`` components in more than one AWS +region (uncommon), you need to select at least one ``….gitlab`` component in +each of these regions, rerun the command above for each such component, and add +or update the ``ami_id`` entry for the respective region. + Upgrading GitLab & ClamAV ^^^^^^^^^^^^^^^^^^^^^^^^^ -Operators must check for updates to GitLab and ClamAV on a monthly basis in -addition to triaging GitLab security releases that occur during the month. -An email notification is sent to ``azul-group@ucsc.edu`` when a GitLab security +Operators must check for updates to the Docker images for GitLab and ClamAV at +least once a month, and whenever a GitLab security releases requires it. An +email notification is sent to ``azul-group@ucsc.edu`` when a GitLab security release is available. Discuss with the lead the **Table of Fixes** referenced in the release blog post to determine the urgency of the update. An email notification should also be received when ClamAV releases become available. The current version of GitLab installed can be found on the ``/help`` endpoint of -`GitLab dev`_, and the available releases can be found on the -`GitLab Docker image`_ page. When updating the GitLab instance, check if there -are applicable updates to the `GitLab runner image`_. Use the latest runner -image whose major and minor version match that of the GitLab image. Similarly, -check for available releases to ClamAV in the `ClamAV image`_. The current -version of ClamAV image being used can be found by running:: +`GitLab dev`_, and the available releases can be found on the `GitLab Docker +image`_ page. When updating the GitLab instance, check if there are applicable +updates to the `GitLab runner image`_. Use the latest runner image whose major +and minor version match that of the GitLab image. Similarly, check for available +releases to ClamAV in the `ClamAV image`_. 
The current version of ClamAV image +being used can be found by running:: cat $project_root/terraform/gitlab/gitlab.tf.json.template.py | grep 'clamav_image =' @@ -313,6 +346,19 @@ For GitLab or ClamAV updates, use the ``--no-restart`` flag in order to leave the instance stopped after the snapshot has been created. There is no point in starting the instance only to have the update terminate it again. +Updating software packages on GitLab instances +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Once a week, operators must update all Linux packages installed on the root +volume of each GitLab instance. SSH access to the instances is necessary to +perform these instructions but on production instances this access is +unavailable, even to operators. In these cases the operator must request the +help of the system administrator via Slack to perform these steps. + +SSH into the instance, and run ``sudo yum update`` followed by ``sudo reboot``. +Wait for the GitLab web application to become available again and perform a +``git pull`` from one of the Git repositories hosted on that instance. + Adding snapshots to ``dev`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/UPGRADING.rst b/UPGRADING.rst index f04a1d4387..e0ec5587e1 100644 --- a/UPGRADING.rst +++ b/UPGRADING.rst @@ -19,6 +19,39 @@ branch that does not have the listed changes, the steps would need to be reverted. This is all fairly informal and loosely defined. Hopefully we won't have too many entries in this file. +#5207 Fix: Partition sizing ignores supplementary bundles +========================================================= + +Subgraph counts have been updated for `anvildev` and `anvilbox`. If you have any +personal deployments that index these snapshots, update the subgraph counts +accordingly. + + +#4022 Encrypt GitLab data and root volume and snapshots +======================================================= + +Operator +~~~~~~~~ + +Prior to pushing the merge commit to a GitLab instance, login to the AWS +Console and navigate to `EC2` -> `Instances` -> select the GitLab instance -> +`Storage` to confirm that root volume is encrypted. + +If the root volume is not encrypted, manually deploy the ``gitlab`` component of +a deployment just before pushing the merge commit to the GitLab instance in that +deployment. + + +#5133 Trigger an alarm on absence of logs +========================================= + +Operator +~~~~~~~~ + +Manually deploy the ``shared`` component of any main deployment just before +pushing the merge commit to the GitLab instance in that deployment. 
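+
+Deploying the ``shared`` component creates a CloudWatch alarm on the absence of
+CloudTrail log events. To confirm that the alarm exists after the deployment, a
+short boto3 check along the following lines can be used. This is only a sketch:
+the alarm name is derived from ``config.qualified_resource_name('trail_logs',
+suffix='.alarm')``, so the prefix below is an assumption and may need to be
+adjusted for the deployment at hand::
+
+    import boto3
+
+    cloudwatch = boto3.client('cloudwatch')
+    # List alarms whose name starts with the assumed prefix and show their state
+    response = cloudwatch.describe_alarms(AlarmNamePrefix='azul-trail_logs')
+    for alarm in response['MetricAlarms']:
+        print(alarm['AlarmName'], alarm['StateValue'])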
+ + #5110 Update GitLab IAM policy for FedRAMP inventory ==================================================== diff --git a/deployments/anvilbox/environment.py b/deployments/anvilbox/environment.py index 892585afb2..1bedfb9d5d 100644 --- a/deployments/anvilbox/environment.py +++ b/deployments/anvilbox/environment.py @@ -68,17 +68,17 @@ def mkdict(previous_catalog: dict[str, str], anvil_sources = mkdict({}, 11, mkdelta([ - mksrc('datarepo-1ba591a6', 'ANVIL_1000G_high_coverage_2019_20221019_ANV5_202303081523', 6404), - mksrc('datarepo-aa67671a', 'ANVIL_CMG_UWASH_DS_BAV_IRB_PUB_RD_20221020_ANV5_202303081451', 177), + mksrc('datarepo-1ba591a6', 'ANVIL_1000G_high_coverage_2019_20221019_ANV5_202303081523', 6804), + mksrc('datarepo-aa67671a', 'ANVIL_CMG_UWASH_DS_BAV_IRB_PUB_RD_20221020_ANV5_202303081451', 181), mksrc('datarepo-b4e0bfd5', 'ANVIL_CMG_UWASH_DS_BDIS_20221020_ANV5_202303081501', 10), - mksrc('datarepo-333dd883', 'ANVIL_CMG_UWASH_DS_HFA_20221020_ANV5_202303081456', 83), - mksrc('datarepo-b968cbdb', 'ANVIL_CMG_UWASH_DS_NBIA_20221020_ANV5_202303081459', 107), - mksrc('datarepo-6a5b13ea', 'ANVIL_CMG_UWASH_HMB_20221020_ANV5_202303081455', 419), - mksrc('datarepo-3d4c42f7', 'ANVIL_CMG_UWASH_HMB_IRB_20221020_ANV5_202303081454', 41), - mksrc('datarepo-080b2c9e', 'ANVIL_CMG_UWash_DS_EP_20221020_ANV5_202303081452', 49), - mksrc('datarepo-4f75c9e3', 'ANVIL_CMG_UWash_GRU_20230308_ANV5_202303081731', 2113), - mksrc('datarepo-ec9365be', 'ANVIL_CMG_UWash_GRU_IRB_20221020_ANV5_202303081458', 559), - mksrc('datarepo-8392ac2c', 'ANVIL_GTEx_V8_hg38_20221013_ANV5_202303081502', 18361), + mksrc('datarepo-333dd883', 'ANVIL_CMG_UWASH_DS_HFA_20221020_ANV5_202303081456', 198), + mksrc('datarepo-b968cbdb', 'ANVIL_CMG_UWASH_DS_NBIA_20221020_ANV5_202303081459', 110), + mksrc('datarepo-6a5b13ea', 'ANVIL_CMG_UWASH_HMB_20221020_ANV5_202303081455', 423), + mksrc('datarepo-3d4c42f7', 'ANVIL_CMG_UWASH_HMB_IRB_20221020_ANV5_202303081454', 45), + mksrc('datarepo-080b2c9e', 'ANVIL_CMG_UWash_DS_EP_20221020_ANV5_202303081452', 53), + mksrc('datarepo-4f75c9e3', 'ANVIL_CMG_UWash_GRU_20230308_ANV5_202303081731', 5861), + mksrc('datarepo-ec9365be', 'ANVIL_CMG_UWash_GRU_IRB_20221020_ANV5_202303081458', 563), + mksrc('datarepo-8392ac2c', 'ANVIL_GTEx_V8_hg38_20221013_ANV5_202303081502', 101205) ])) diff --git a/deployments/anvildev/environment.py b/deployments/anvildev/environment.py index a4ed5db957..ac68e34301 100644 --- a/deployments/anvildev/environment.py +++ b/deployments/anvildev/environment.py @@ -56,17 +56,17 @@ def mkdict(previous_catalog: dict[str, str], anvil_sources = mkdict({}, 11, mkdelta([ - mksrc('datarepo-1ba591a6', 'ANVIL_1000G_high_coverage_2019_20221019_ANV5_202303081523', 6404), - mksrc('datarepo-aa67671a', 'ANVIL_CMG_UWASH_DS_BAV_IRB_PUB_RD_20221020_ANV5_202303081451', 177), + mksrc('datarepo-1ba591a6', 'ANVIL_1000G_high_coverage_2019_20221019_ANV5_202303081523', 6804), + mksrc('datarepo-aa67671a', 'ANVIL_CMG_UWASH_DS_BAV_IRB_PUB_RD_20221020_ANV5_202303081451', 181), mksrc('datarepo-b4e0bfd5', 'ANVIL_CMG_UWASH_DS_BDIS_20221020_ANV5_202303081501', 10), - mksrc('datarepo-333dd883', 'ANVIL_CMG_UWASH_DS_HFA_20221020_ANV5_202303081456', 83), - mksrc('datarepo-b968cbdb', 'ANVIL_CMG_UWASH_DS_NBIA_20221020_ANV5_202303081459', 107), - mksrc('datarepo-6a5b13ea', 'ANVIL_CMG_UWASH_HMB_20221020_ANV5_202303081455', 419), - mksrc('datarepo-3d4c42f7', 'ANVIL_CMG_UWASH_HMB_IRB_20221020_ANV5_202303081454', 41), - mksrc('datarepo-080b2c9e', 'ANVIL_CMG_UWash_DS_EP_20221020_ANV5_202303081452', 49), - mksrc('datarepo-4f75c9e3', 
'ANVIL_CMG_UWash_GRU_20230308_ANV5_202303081731', 2113), - mksrc('datarepo-ec9365be', 'ANVIL_CMG_UWash_GRU_IRB_20221020_ANV5_202303081458', 559), - mksrc('datarepo-8392ac2c', 'ANVIL_GTEx_V8_hg38_20221013_ANV5_202303081502', 18361), + mksrc('datarepo-333dd883', 'ANVIL_CMG_UWASH_DS_HFA_20221020_ANV5_202303081456', 198), + mksrc('datarepo-b968cbdb', 'ANVIL_CMG_UWASH_DS_NBIA_20221020_ANV5_202303081459', 110), + mksrc('datarepo-6a5b13ea', 'ANVIL_CMG_UWASH_HMB_20221020_ANV5_202303081455', 423), + mksrc('datarepo-3d4c42f7', 'ANVIL_CMG_UWASH_HMB_IRB_20221020_ANV5_202303081454', 45), + mksrc('datarepo-080b2c9e', 'ANVIL_CMG_UWash_DS_EP_20221020_ANV5_202303081452', 53), + mksrc('datarepo-4f75c9e3', 'ANVIL_CMG_UWash_GRU_20230308_ANV5_202303081731', 5861), + mksrc('datarepo-ec9365be', 'ANVIL_CMG_UWash_GRU_IRB_20221020_ANV5_202303081458', 563), + mksrc('datarepo-8392ac2c', 'ANVIL_GTEx_V8_hg38_20221013_ANV5_202303081502', 101205) ])) diff --git a/deployments/anvilprod/environment.py b/deployments/anvilprod/environment.py index 7168f7d687..ce75730a78 100644 --- a/deployments/anvilprod/environment.py +++ b/deployments/anvilprod/environment.py @@ -56,9 +56,9 @@ def mkdict(previous_catalog: dict[str, str], anvil_sources = mkdict({}, 3, mkdelta([ - mksrc('datarepo-dev-43738c90', 'ANVIL_1000G_2019_Dev_20230302_ANV5_202303032342', 6404), - mksrc('datarepo-dev-42c70e6a', 'ANVIL_CCDG_Sample_1_20230228_ANV5_202302281520', 25), - mksrc('datarepo-dev-97ad270b', 'ANVIL_CMG_Sample_1_20230225_ANV5_202302281509', 25), + mksrc('datarepo-dev-43738c90', 'ANVIL_1000G_2019_Dev_20230302_ANV5_202303032342', 22814), + mksrc('datarepo-dev-42c70e6a', 'ANVIL_CCDG_Sample_1_20230228_ANV5_202302281520', 28), + mksrc('datarepo-dev-97ad270b', 'ANVIL_CMG_Sample_1_20230225_ANV5_202302281509', 25) ])) diff --git a/scripts/create_gitlab_snapshot.py b/scripts/create_gitlab_snapshot.py index 8a8870cb0d..8ab979811e 100644 --- a/scripts/create_gitlab_snapshot.py +++ b/scripts/create_gitlab_snapshot.py @@ -69,8 +69,9 @@ def gitlab_volume_info() -> JSON: def shutdown_instance(instance: JSON): instance_id = instance['InstanceId'] - log.info('Preparing to stop GitLab instance for %r, waiting 10 seconds ' - 'before proceeding. Hit Ctrl-C to abort …', + log.info('Preparing to stop GitLab instance in %r. ' + 'Waiting 10 seconds before proceeding. 
' + 'Hit Ctrl-C to abort …', config.deployment_stage) sleep(10) log.info('Stopping instance %r …', instance_id) @@ -83,7 +84,7 @@ def shutdown_instance(instance: JSON): def start_instance(instance: JSON): instance_id = instance['InstanceId'] - log.info('Starting instance %r …', config.deployment_stage) + log.info('Starting instance %r …', instance_id) aws.ec2.start_instances(InstanceIds=[instance_id]) waiter = aws.ec2.get_waiter('instance_status_ok') waiter.wait(InstanceIds=[instance_id], @@ -105,7 +106,7 @@ def create_snapshot(volume: JSON): 'service': 'azul', } date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M') - response = aws.ec2.create_snapshot(Description=f'{date} GitLab Update', + response = aws.ec2.create_snapshot(Description=f'{date} snapshot of GitLab data volume', VolumeId=volume['VolumeId'], TagSpecifications=[ dict(ResourceType='snapshot', @@ -118,7 +119,7 @@ def create_snapshot(volume: JSON): waiter.wait(SnapshotIds=[snapshot_id], WaiterConfig=dict(MaxAttempts=9999, Delay=15)) log.info('Snapshot %r of volume %r is complete', - volume['VolumeId'], config.deployment_stage) + snapshot_id, volume['VolumeId']) if __name__ == '__main__': diff --git a/scripts/update_subgraph_counts.py b/scripts/update_subgraph_counts.py new file mode 100644 index 0000000000..73d5e208b5 --- /dev/null +++ b/scripts/update_subgraph_counts.py @@ -0,0 +1,89 @@ +""" +Count the number of subgraphs per configured source and produce output to +expedite updated source configurations. +""" + +import argparse +import sys + +import attr + +from azul import ( + config, +) +from azul.args import ( + AzulArgumentHelpFormatter, +) +from azul.azulclient import ( + AzulClient, +) +from azul.indexer import ( + Prefix, +) +from azul.openapi import ( + format_description, +) +from azul.terra import ( + TDRSourceSpec, +) + + +@attr.s(auto_attribs=True, frozen=True, kw_only=True) +class SourceSpecArgs: + project: str + snapshot: str + subgraph_count: int + + def __str__(self) -> str: + return f'mksrc({self.project!r}, {self.snapshot!r}, {self.subgraph_count!r})' + + +def generate_sources(catalog: str) -> list[SourceSpecArgs]: + plugin = AzulClient().repository_plugin(catalog) + sources = [] + for spec in plugin.sources: + spec: TDRSourceSpec = attr.evolve(spec, prefix=Prefix.of_everything) + ref = plugin.resolve_source(str(spec)) + partitions = plugin.list_partitions(ref) + sources.append(SourceSpecArgs(project=spec.project, + snapshot=spec.name, + subgraph_count=sum(partitions.values()))) + return sources + + +def main(args: list[str]): + parser = argparse.ArgumentParser(description=__doc__, + formatter_class=AzulArgumentHelpFormatter) + + parser.add_argument('--catalogs', + nargs='+', + metavar='NAME', + default=[ + c for c in config.catalogs + if c not in config.integration_test_catalogs + ], + help='The names of the catalogs to determine source specs for.') + + args = parser.parse_args(args) + + for catalog in args.catalogs: + print(catalog) + print('-' * len(catalog)) + spec_args_list = generate_sources(catalog) + spec_args_list.sort(key=lambda spec_args: spec_args.snapshot) + print(',\n'.join(map(str, spec_args_list)), end='\n\n') + + print(format_description(''' + ----------------- + !!! IMPORTANT !!! + ----------------- + + This script does *not* populate the `ma` or `pop` flags for the source + specs. Do not copy/paste the above output without checking whether these + flags should be applied. If `mksrc` generates a common prefix, manual + adjustment of the generated common prefix may be required. 
+ ''')) + + +if __name__ == '__main__': + main(sys.argv[1:]) diff --git a/src/azul/chalice.py b/src/azul/chalice.py index a3154745c9..0c73fe98b2 100644 --- a/src/azul/chalice.py +++ b/src/azul/chalice.py @@ -1,7 +1,6 @@ from collections.abc import ( Iterable, ) -import html import json from json import ( JSONEncoder, @@ -37,12 +36,6 @@ from furl import ( furl, ) -from more_itertools import ( - only, -) -from werkzeug.http import ( - parse_accept_header, -) from azul import ( config, @@ -79,6 +72,10 @@ class GoneError(ChaliceViewError): # Chalice does not define any exceptions for 5xx status codes besides 500 +class BadGatewayError(ChaliceViewError): + STATUS_CODE = 502 + + class ServiceUnavailableError(ChaliceViewError): STATUS_CODE = 503 @@ -110,7 +107,6 @@ def __init__(self, self._specs: Optional[MutableJSON] = None super().__init__(app_name, debug=config.debug > 0, configure_logs=False) # Middleware is invoked in order of registration - self.register_middleware(self._html_wrapping_middleware, 'http') self.register_middleware(self._logging_middleware, 'http') self.register_middleware(self._hsts_header_middleware, 'http') self.register_middleware(self._lambda_context_middleware, 'all') @@ -145,32 +141,6 @@ def patched_event_source_handler(self_, event, context): if old_handler.__code__ != patched_event_source_handler.__code__: chalice.app.EventSourceHandler.__call__ = patched_event_source_handler - def _html_wrapping_middleware(self, event, get_response): - """ - Embed a `text/plain` response in HTML if the request favors `text/html`. - Any HTML fragments in the original response are escaped to counteract - HTML injection attacks. This doesn't fully prevent those attacks because - a broken, ancient user agent might still request `text/plain`, `*/*` or - nothing, ignore the `text/plain` content type, sniff the HTML fragment - and render it. It does, however, handle vulnerability scanners because - those do prefer `text/html`. - """ - response = get_response(event) - ct_key = only(k for k in response.headers if k.casefold() == 'content-type') - if ct_key and response.headers[ct_key] == 'text/plain': - parsed = parse_accept_header(event.headers.get('accept')) - text_html = parsed.find('text/html') - star_star = parsed.find('*/*') - if 0 <= text_html and (star_star < 0 or text_html < star_star): - response.body = ( - '
' - f'{html.escape(json.dumps(response.body), quote=False)}' - '' - ) - response.headers[ct_key] = 'text/html' - return response - def _logging_middleware(self, event, get_response): self._log_request() response = get_response(event) diff --git a/src/azul/indexer/index_service.py b/src/azul/indexer/index_service.py index 2be20ad64a..728766006b 100644 --- a/src/azul/indexer/index_service.py +++ b/src/azul/indexer/index_service.py @@ -629,7 +629,11 @@ def _aggregate(self, contributions: list[CataloguedContribution]) -> list[Aggreg ] # FIXME: Replace hard coded limit with a config property # https://github.com/DataBiosphere/azul/issues/3725 - bundles = bundles[:100] + max_bundles = 100 + if len(bundles) > max_bundles: + log.warning('Only aggregating %i out of %i bundles for outer entity %r', + max_bundles, len(bundles), entity) + bundles = bundles[:max_bundles] sources = set(c.source for c in contributions) aggregate_cls = self.aggregate_class(entity.catalog) aggregate = aggregate_cls(coordinates=AggregateCoordinates(entity=entity), diff --git a/src/azul/plugins/__init__.py b/src/azul/plugins/__init__.py index 488dda1952..73ccfd3a79 100644 --- a/src/azul/plugins/__init__.py +++ b/src/azul/plugins/__init__.py @@ -434,12 +434,28 @@ def list_sources(self, authentication: Optional[Authentication] ) -> Iterable[SOURCE_REF]: """ - The sources the plugin is configured to read metadata from. + The sources the plugin is configured to read metadata from that are + accessible using the provided authentication. Retrieving this + information may require a round-trip to the underlying repository. + Implementations should raise PermissionError if the provided + authentication is insufficient to access the repository. + """ + raise NotImplementedError + + def list_source_ids(self, + authentication: Optional[Authentication] + ) -> set[str]: + """ + List source IDs in the underlying repository that are accessible using + the provided authentication. Sources may be included even if they are + not configured to be read from. Subclasses should override this method + if it can be implemented more efficiently than `list_sources`. + Retrieving this information may require a round-trip to the underlying repository. Implementations should raise PermissionError if the provided authentication is insufficient to access the repository. 
""" - raise NotImplementedError + return {source.id for source in self.list_sources(authentication)} @cached_property def _generic_params(self) -> tuple: diff --git a/src/azul/plugins/repository/tdr.py b/src/azul/plugins/repository/tdr.py index b88418cc24..36e271f7b3 100644 --- a/src/azul/plugins/repository/tdr.py +++ b/src/azul/plugins/repository/tdr.py @@ -9,8 +9,10 @@ import logging import time from typing import ( + Callable, Optional, Type, + TypeVar, ) import attr @@ -25,7 +27,6 @@ CatalogName, cache_per_thread, config, - reject, require, ) from azul.auth import ( @@ -86,6 +87,9 @@ def _parse_drs_path(self, drs_uri: str) -> str: return str(drs_uri.path).strip('/') +T = TypeVar('T') + + @attr.s(kw_only=True, auto_attribs=True, frozen=True) class TDRPlugin(RepositoryPlugin[SOURCE_SPEC, SOURCE_REF, BUNDLE_FQID]): _sources: Set[TDRSourceSpec] @@ -113,10 +117,25 @@ def _user_authenticated_tdr(self, type(authentication)) return tdr + def _auth_fallback(self, + authentication: Optional[Authentication], + tdr_callback: Callable[[TDRClient], T] + ) -> T: + tdr = self._user_authenticated_tdr(authentication) + try: + return tdr_callback(tdr) + except UnauthorizedError: + if authentication is None or tdr.is_registered(): + raise + else: + # Fall back to anonymous access if the request is authenticated + # using an unregistered account. + tdr = self._user_authenticated_tdr(None) + return tdr_callback(tdr) + def list_sources(self, authentication: Optional[Authentication] ) -> list[TDRSourceRef]: - tdr = self._user_authenticated_tdr(authentication) configured_specs_by_name = {spec.name: spec for spec in self.sources} # Filter by prefix of snapshot names in an attempt to speed up the # listing by limiting the number of irrelevant snapshots returned. Note @@ -124,19 +143,8 @@ def list_sources(self, # the longest common substring is complicated and, as of yet, I haven't # found a trustworthy, reusable implementation. filter = longest_common_prefix(configured_specs_by_name.keys()) - try: - snapshots = tdr.snapshot_names_by_id(filter=filter) - except UnauthorizedError: - if tdr.is_registered(): - raise - else: - # Fall back to anonymous access if the user has authenticated - # using an unregistered account. The call to `reject` protects - # against infinite recursion in the event that the public - # service account erroneously isn't registered. 
- reject(authentication is None) - return self.list_sources(None) - + snapshots = self._auth_fallback(authentication, + lambda tdr: tdr.snapshot_names_by_id(filter=filter)) snapshot_ids_by_name = { name: id for id, name in snapshots.items() @@ -148,6 +156,12 @@ def list_sources(self, for name, id in snapshot_ids_by_name.items() ] + def list_source_ids(self, + authentication: Optional[Authentication] + ) -> set[str]: + return self._auth_fallback(authentication, + lambda tdr: tdr.snapshot_ids()) + @property def tdr(self): return self._tdr() diff --git a/src/azul/plugins/repository/tdr_anvil/__init__.py b/src/azul/plugins/repository/tdr_anvil/__init__.py index 88fe19e5b5..cc57acb528 100644 --- a/src/azul/plugins/repository/tdr_anvil/__init__.py +++ b/src/azul/plugins/repository/tdr_anvil/__init__.py @@ -277,12 +277,17 @@ def list_partitions(self, for partition_prefix in prefix.partition_prefixes() ] assert prefixes, prefix - entity_type = BundleEntityType.primary.value - pk_column = entity_type + '_id' + primary = BundleEntityType.primary.value + supplementary = BundleEntityType.supplementary.value rows = self._run_sql(f''' - SELECT prefix, COUNT({pk_column}) AS subgraph_count - FROM {backtick(self._full_table_name(source.spec, entity_type))} - JOIN UNNEST({prefixes}) AS prefix ON STARTS_WITH({pk_column}, prefix) + SELECT prefix, COUNT(datarepo_row_id) AS subgraph_count + FROM ( + SELECT datarepo_row_id FROM {backtick(self._full_table_name(source.spec, primary))} + UNION ALL + SELECT datarepo_row_id FROM {backtick(self._full_table_name(source.spec, supplementary))} + WHERE is_supplementary + ) + JOIN UNNEST({prefixes}) AS prefix ON STARTS_WITH(datarepo_row_id, prefix) GROUP BY prefix ''') return {row['prefix']: row['subgraph_count'] for row in rows} diff --git a/src/azul/service/source_controller.py b/src/azul/service/source_controller.py index dbf46494a0..2148e37bee 100644 --- a/src/azul/service/source_controller.py +++ b/src/azul/service/source_controller.py @@ -1,3 +1,4 @@ +import logging from typing import ( Optional, ) @@ -14,6 +15,7 @@ Authentication, ) from azul.chalice import ( + BadGatewayError, ServiceUnavailableError, ) from azul.service import ( @@ -30,6 +32,8 @@ JSONs, ) +log = logging.getLogger(__name__) + class SourceController(ServiceAppController): @@ -48,6 +52,16 @@ def list_sources(self, except TerraTimeoutException as e: raise ServiceUnavailableError(*e.args) else: + authoritative_source_ids = {source.id for source in sources} + cached_source_ids = self._list_source_ids(catalog, authentication) + # For optimized performance, the cache may include source IDs that + # are accessible but are not configured for indexing. Therefore, we + # expect the set of actual sources to be a subset of the cached + # sources. 
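+            # For example (toy IDs, not real sources): a cache of
+            # {'a', 'b', 'c'} and an authoritative set of {'a', 'b'} is
+            # consistent, while an authoritative set of {'a', 'd'} is not,
+            # because 'd' is absent from the cache. In the latter case we fail
+            # with a 502 below instead of serving a potentially incomplete
+            # source list.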
+ diff = authoritative_source_ids - cached_source_ids + if diff: + log.debug(diff) + raise BadGatewayError('Inconsistent response from repository') return [ {'sourceId': source.id, 'sourceSpec': str(source.spec)} for source in sources @@ -57,8 +71,14 @@ def _list_source_ids(self, catalog: CatalogName, authentication: Optional[Authentication] ) -> set[str]: - sources = self.list_sources(catalog, authentication) - return {source['sourceId'] for source in sources} + try: + source_ids = self._source_service.list_source_ids(catalog, authentication) + except PermissionError: + raise UnauthorizedError + except TerraTimeoutException as e: + raise ServiceUnavailableError(*e.args) + else: + return source_ids def get_filters(self, catalog: CatalogName, diff --git a/src/azul/service/source_service.py b/src/azul/service/source_service.py index 9613ab9dbf..50ff6e4350 100644 --- a/src/azul/service/source_service.py +++ b/src/azul/service/source_service.py @@ -4,8 +4,8 @@ time, ) from typing import ( + Iterable, Optional, - Sequence, ) from azul import ( @@ -20,12 +20,14 @@ aws, ) from azul.indexer import ( - SourceJSON, SourceRef, ) from azul.plugins import ( RepositoryPlugin, ) +from azul.types import ( + AnyJSON, +) log = logging.getLogger(__name__) @@ -52,10 +54,10 @@ class SourceService: def _repository_plugin(self, catalog: CatalogName) -> RepositoryPlugin: return RepositoryPlugin.load(catalog).create(catalog) - def list_sources(self, - catalog: CatalogName, - authentication: Optional[Authentication] - ) -> list[SourceRef]: + def list_source_ids(self, + catalog: CatalogName, + authentication: Optional[Authentication] + ) -> set[str]: plugin = self._repository_plugin(catalog) cache_key = ( @@ -66,16 +68,26 @@ def list_sources(self, assert not any(joiner in c for c in cache_key), cache_key cache_key = joiner.join(cache_key) try: - sources = self._get(cache_key) + source_ids = self._get(cache_key) except CacheMiss: - sources = list(plugin.list_sources(authentication)) - self._put(cache_key, [source.to_json() for source in sources]) - return sources + pass else: - return [ - plugin.source_from_json(source) - for source in sources - ] + # FIXME: Remove safety net from source cache format transition + # https://github.com/DataBiosphere/azul/issues/5204 + try: + return set(source_ids) + except TypeError as e: + if e.args != ("unhashable type: 'dict'",): + raise + source_ids = plugin.list_source_ids(authentication) + self._put(cache_key, list(source_ids)) + return source_ids + + def list_sources(self, + catalog: CatalogName, + authentication: Optional[Authentication] + ) -> Iterable[SourceRef]: + return self._repository_plugin(catalog).list_sources(authentication) table_name = config.dynamo_sources_cache_table_name @@ -84,15 +96,13 @@ def list_sources(self, ttl_attribute = 'expiration' # Timespan in seconds that sources persist in the cache - # FIXME: Streamline cache expiration - # https://github.com/DataBiosphere/azul/issues/3094 expiration = 60 @property def _dynamodb(self): return aws.dynamodb - def _get(self, key: str) -> list[SourceJSON]: + def _get(self, key: str) -> list[AnyJSON]: response = self._dynamodb.get_item(TableName=self.table_name, Key={self.key_attribute: {'S': key}}, ProjectionExpression=','.join([self.value_attribute, self.ttl_attribute])) @@ -108,7 +118,7 @@ def _get(self, key: str) -> list[SourceJSON]: else: return json.loads(result[self.value_attribute]['S']) - def _put(self, key: str, sources: Sequence[SourceJSON]) -> None: + def _put(self, key: str, sources: list[AnyJSON]) -> None: 
item = { self.key_attribute: {'S': key}, self.value_attribute: {'S': json.dumps(sources)}, diff --git a/src/azul/terra.py b/src/azul/terra.py index 9bb0dac506..38f11361cb 100644 --- a/src/azul/terra.py +++ b/src/azul/terra.py @@ -571,6 +571,16 @@ def _check_response(self, page_size: ClassVar[int] = 1000 + def snapshot_ids(self) -> set[str]: + """ + List the IDs of the TDR snapshots accessible to the current credentials. + Much faster than listing the snapshots' names. + """ + endpoint = self._repository_endpoint('snapshots', 'roleMap') + response = self._request('GET', endpoint) + response = self._check_response(endpoint, response) + return set(response['roleMap'].keys()) + def snapshot_names_by_id(self, *, filter: Optional[str] = None diff --git a/terraform/cloudwatch.tf.json.template.py b/terraform/cloudwatch.tf.json.template.py index b478efb496..a629c4ded3 100644 --- a/terraform/cloudwatch.tf.json.template.py +++ b/terraform/cloudwatch.tf.json.template.py @@ -85,31 +85,94 @@ def prod_qualified_resource_name(name: str) -> str: 'resource': [ *( ( - { - 'aws_cloudwatch_metric_alarm': { - f'{lambda_}_5xx': { - 'alarm_name': config.qualified_resource_name(lambda_ + '_5xx'), - 'comparison_operator': 'GreaterThanThreshold', - # This alarm catches persistent 5XX errors occurring over - # one hour, specifically when more than one occurrence is - # sampled in a ten-minute period for six consecutive periods. - 'evaluation_periods': 6, - 'period': 60 * 10, - 'metric_name': '5XXError', - 'namespace': 'AWS/ApiGateway', - 'statistic': 'Sum', - 'threshold': 1, - 'treat_missing_data': 'notBreaching', - 'dimensions': { - 'ApiName': config.qualified_resource_name(lambda_), - 'Stage': config.deployment_stage, - }, - 'alarm_actions': ['${data.aws_sns_topic.monitoring.arn}'], - 'ok_actions': ['${data.aws_sns_topic.monitoring.arn}'], + *( + { + 'aws_cloudwatch_metric_alarm': { + f'{lambda_}_5xx': { + 'alarm_name': config.qualified_resource_name(lambda_ + '_5xx'), + 'comparison_operator': 'GreaterThanThreshold', + # This alarm catches persistent 5XX errors occurring over + # one hour, specifically when more than one occurrence is + # sampled in a ten-minute period for six consecutive periods. 
+ 'evaluation_periods': 6, + 'period': 60 * 10, + 'metric_name': '5XXError', + 'namespace': 'AWS/ApiGateway', + 'statistic': 'Sum', + 'threshold': 1, + 'treat_missing_data': 'notBreaching', + 'dimensions': { + 'ApiName': config.qualified_resource_name(lambda_), + 'Stage': config.deployment_stage, + }, + 'alarm_actions': ['${data.aws_sns_topic.monitoring.arn}'], + 'ok_actions': ['${data.aws_sns_topic.monitoring.arn}'], + } } } - } - for lambda_ in config.lambda_names() + for lambda_ in config.lambda_names() + ), + *( + { + 'aws_cloudwatch_log_metric_filter': { + f'{lambda_}cachehealth': { + 'name': config.qualified_resource_name(f'{lambda_}cachehealth', suffix='.filter'), + 'pattern': '', + 'log_group_name': ( + '/aws/lambda/' + + config.qualified_resource_name(lambda_) + + f'-{lambda_}cachehealth' + ), + 'metric_transformation': { + 'name': config.qualified_resource_name(f'{lambda_}cachehealth'), + 'namespace': 'LogMetrics', + 'value': 1, + 'default_value': 0, + } + } + } + } + for lambda_ in config.lambda_names() + ), + *( + { + 'aws_cloudwatch_metric_alarm': { + f'{lambda_}cachehealth': { + 'alarm_name': config.qualified_resource_name(f'{lambda_}cachehealth', suffix='.alarm'), + 'comparison_operator': 'LessThanThreshold', + 'threshold': 1, + 'datapoints_to_alarm': 1, + 'evaluation_periods': 1, + 'treat_missing_data': 'breaching', + 'alarm_actions': ['${data.aws_sns_topic.monitoring.arn}'], + 'ok_actions': ['${data.aws_sns_topic.monitoring.arn}'], + # CloudWatch uses an unconfigurable "evaluation range" when missing + # data is involved. In practice this means that an alarm on the + # absence of logs with an evaluation period of ten minutes would + # require thirty minutes of no logs before the alarm is raised. + # Using a metric query we can fill in missing datapoints with a + # value of zero and avoid the need for the evaluation range. + 'metric_query': [ + { + 'id': 'log_count_filled', + 'expression': 'FILL(log_count_raw, 0)', + 'return_data': True, + }, + { + 'id': 'log_count_raw', + 'metric': { + 'metric_name': config.qualified_resource_name(f'{lambda_}cachehealth'), + 'namespace': 'LogMetrics', + 'period': 10 * 60, + 'stat': 'Sum', + } + } + ] + } + } + } + for lambda_ in config.lambda_names() + ), ) if config.enable_monitoring else () diff --git a/terraform/gitlab/gitlab.tf.json.template.py b/terraform/gitlab/gitlab.tf.json.template.py index 4dea5d4448..8a880d751e 100644 --- a/terraform/gitlab/gitlab.tf.json.template.py +++ b/terraform/gitlab/gitlab.tf.json.template.py @@ -124,17 +124,18 @@ # missing after `terraform apply`. # The name of an EBS volume to attach to the instance. This EBS volume must -# exist and be formatted with ext4. We don't manage the volume in Terraform -# because that would require formatting it once after creation. That can only -# be one after attaching it to an EC2 instance but before mounting it. This -# turns out to be difficult and risks overwriting existing data on the volume. -# We'd also have to prevent the volume from being deleted during `terraform -# destroy`. +# exist, be encrypted, and be formatted with ext4. We don't manage the volume in +# Terraform because that would require formatting it once after creation. That +# can only be one after attaching it to an EC2 instance but before mounting it. +# This turns out to be difficult and risks overwriting existing data on the +# volume. We'd also have to prevent the volume from being deleted during +# `terraform destroy`. 
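+#
+# To confirm that an existing volume is encrypted, one could, for example, run
+#
+#     aws ec2 describe-volumes \
+#         --filters Name=tag:Name,Values=azul-gitlab \
+#         --query 'Volumes[].Encrypted'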
# # If this EBS volume does not exist you must create it with the desired size # before running Terraform. For example: # # aws ec2 create-volume \ +# --encrypted \ # --size 100 \ # --availability-zone "${AWS_DEFAULT_REGION}a" \ # --tag-specifications 'ResourceType=volume,Tags=[{Key=Name,Value=azul-gitlab},{Key=owner,Value=hannes@ucsc.edu}]' @@ -297,29 +298,8 @@ def remove_inconsequential_statements(statements: list[JSON]) -> list[JSON]: gitlab_image = 'gitlab/gitlab-ce:15.11.2-ce.0' runner_image = 'gitlab/gitlab-runner:v15.11.0' -# There are ways to dynamically determine the latest Amazon Linux AMI but in the -# spirit of reproducable builds we would rather pin the AMI and adopt updates at -# our own discretion so as to avoid unexpected failures due to AMI changes. To -# determine the latest AMI for Amazon Linux 2, I used the following commands: -# -# _select dev.gitlab -# aws ssm get-parameters \ -# --names \ -# $(aws ssm get-parameters-by-path \ -# --path /aws/service/ami-amazon-linux-latest \ -# --query "Parameters[].Name" \ -# | jq -r .[] \ -# | grep -F amzn2 \ -# | grep -Fv minimal \ -# | grep -Fv kernel-5.10 \ -# | grep -F x86_64 \ -# | grep -F ebs) \ -# | jq -r .Parameters[].Value -# -# The AMI ID is specific to a region. If there are ….gitlab components in more -# than one AWS region, you need to select at least one ….gitlab component in -# each of these regions, rerun the command for each such component and add or -# update the entry for the respective region in the dictionary literal below. +# For instructions on finding the latest Amazon Linux AMI ID, see +# OPERATOR.rst#upgrading-linux-ami # ami_id = { 'us-east-1': 'ami-0bb85ecb87fe01c2f' @@ -1312,6 +1292,7 @@ def qq(*words): 'http_put_response_hop_limit': 3 }, 'root_block_device': { + 'encrypted': True, 'volume_size': 20 }, 'key_name': '${aws_key_pair.gitlab.key_name}', diff --git a/terraform/shared/shared.tf.json.template.py b/terraform/shared/shared.tf.json.template.py index f8d924f341..6ff7371ac0 100644 --- a/terraform/shared/shared.tf.json.template.py +++ b/terraform/shared/shared.tf.json.template.py @@ -364,35 +364,84 @@ def conformance_pack(name: str) -> str: } }, 'aws_cloudwatch_log_metric_filter': { - a.name: { - 'name': config.qualified_resource_name(a.name, suffix='.filter'), - 'pattern': a.filter_pattern, + **{ + a.name: { + 'name': config.qualified_resource_name(a.name, suffix='.filter'), + 'pattern': a.filter_pattern, + 'log_group_name': '${aws_cloudwatch_log_group.trail.name}', + 'metric_transformation': { + 'name': a.metric_name, + 'namespace': 'LogMetrics', + 'value': 1 + } + } + for a in cis_alarms + }, + 'trail_logs': { + 'name': config.qualified_resource_name('trail_logs', suffix='.filter'), + 'pattern': '', 'log_group_name': '${aws_cloudwatch_log_group.trail.name}', 'metric_transformation': { - 'name': a.metric_name, + 'name': config.qualified_resource_name('trail_logs'), 'namespace': 'LogMetrics', - 'value': 1 + 'value': 1, + 'default_value': 0, } } - for a in cis_alarms }, 'aws_cloudwatch_metric_alarm': { - a.name: { - 'alarm_name': config.qualified_resource_name(a.name, suffix='.alarm'), - 'comparison_operator': 'GreaterThanOrEqualToThreshold', - 'evaluation_periods': 1, - 'metric_name': a.metric_name, - 'namespace': 'LogMetrics', - 'statistic': a.statistic, - 'treat_missing_data': 'notBreaching', - 'threshold': 1, - # The CIS documentation does not specify a period. 5 minutes is - # the default value when creating the alarm via the console UI. 
- 'period': 5 * 60, - 'alarm_actions': ['${aws_sns_topic.monitoring.arn}'], - 'ok_actions': ['${aws_sns_topic.monitoring.arn}'] + **{ + a.name: { + 'alarm_name': config.qualified_resource_name(a.name, suffix='.alarm'), + 'comparison_operator': 'GreaterThanOrEqualToThreshold', + 'evaluation_periods': 1, + 'metric_name': a.metric_name, + 'namespace': 'LogMetrics', + 'statistic': a.statistic, + 'treat_missing_data': 'notBreaching', + 'threshold': 1, + # The CIS documentation does not specify a period. 5 minutes is + # the default value when creating the alarm via the console UI. + 'period': 5 * 60, + 'alarm_actions': ['${aws_sns_topic.monitoring.arn}'], + 'ok_actions': ['${aws_sns_topic.monitoring.arn}'] + } + for a in cis_alarms + }, + **{ + 'trail_logs': { + 'alarm_name': config.qualified_resource_name('trail_logs', suffix='.alarm'), + 'comparison_operator': 'LessThanThreshold', + 'threshold': 1, + 'datapoints_to_alarm': 1, + 'evaluation_periods': 1, + 'treat_missing_data': 'breaching', + 'alarm_actions': ['${aws_sns_topic.monitoring.arn}'], + 'ok_actions': ['${aws_sns_topic.monitoring.arn}'], + # CloudWatch uses an unconfigurable "evaluation range" when missing + # data is involved. In practice this means that an alarm on the + # absence of logs with an evaluation period of ten minutes would + # require thirty minutes of no logs before the alarm is raised. + # Using a metric query we can fill in missing datapoints with a + # value of zero and avoid the need for the evaluation range. + 'metric_query': [ + { + 'id': 'log_count_filled', + 'expression': 'FILL(log_count_raw, 0)', + 'return_data': True + }, + { + 'id': 'log_count_raw', + 'metric': { + 'metric_name': config.qualified_resource_name('trail_logs'), + 'namespace': 'LogMetrics', + 'period': 10 * 60, + 'stat': 'Sum', + } + } + ] + } } - for a in cis_alarms }, 'aws_iam_role': { 'api_gateway': { diff --git a/test/app_test_case.py b/test/app_test_case.py index 9c1a06d9c8..280ef8c4fa 100644 --- a/test/app_test_case.py +++ b/test/app_test_case.py @@ -6,10 +6,6 @@ Thread, ) import time -from typing import ( - Any, - ClassVar, -) # noinspection PyPackageRequirements from chalice.config import ( @@ -73,8 +69,6 @@ class LocalAppTestCase(CatalogTestCase, metaclass=ABCMeta): ElasticsearchTestCase. """ - app_module: ClassVar[Any] - @classmethod @abstractmethod def lambda_name(cls) -> str: diff --git a/test/azul_test_case.py b/test/azul_test_case.py index dbfcbfc905..f6f7db68d4 100644 --- a/test/azul_test_case.py +++ b/test/azul_test_case.py @@ -470,7 +470,7 @@ def setUpClass(cls): from service import ( patch_source_cache, ) - cls._source_cache_patch = patch_source_cache(hit=[cls.source.to_json()]) + cls._source_cache_patch = patch_source_cache(hit=[cls.source.id]) cls._source_cache_patch.start() @classmethod diff --git a/test/integration_test.py b/test/integration_test.py index e5d80698fd..f6a53864eb 100644 --- a/test/integration_test.py +++ b/test/integration_test.py @@ -445,19 +445,7 @@ def _wait_for_indexer(): self.azul_client.queue_notifications(catalog.notifications) _wait_for_indexer() for catalog in catalogs: - if config.is_hca_enabled(catalog.name): - entity_type = 'files' - elif config.is_anvil_enabled(catalog.name): - # While the files index does exist for AnVIL, it's possible - # for a bundle entity not to contain any files and - # thus be absent from the files response. The only entity - # type that is linked to both primary and supplementary - # bundles is datasets. 
- entity_type = 'datasets' - else: - assert False, catalog self._assert_catalog_complete(catalog=catalog.name, - entity_type=entity_type, bundle_fqids=catalog.bundles) self._test_single_entity_response(catalog=catalog.name) @@ -943,14 +931,49 @@ def update(source: SourceRef, notifications.extend(duplicate_bundles) return notifications, bundle_fqids + def _get_indexed_bundles(self, + catalog: CatalogName, + ) -> set[SourcedBundleFQID]: + indexed_fqids = set() + with self._service_account_credentials: + # FIXME: Use `bundles` index for `catalog_complete` subtest + # https://github.com/DataBiosphere/azul/issues/5214 + hits = self._get_entities(catalog, 'files') + if config.is_anvil_enabled(catalog): + # Primary bundles may not contain any files, and supplementary + # bundles contain only a file and a dataset. We can't use + # datasets to find all the indexed bundles because the number of + # bundles per dataset often exceeds the inner entity aggregation + # limit. Hence, we need to collect bundles separately for files + # and biosamples to cover supplementary and primary bundles, + # respectively. + hits.extend(self._get_entities(catalog, 'biosamples')) + for hit in hits: + source = one(hit['sources']) + for bundle in hit.get('bundles', ()): + bundle_fqid = dict( + source=dict(id=source['sourceId'], spec=source['sourceSpec']), + uuid=bundle['bundleUuid'], + version=bundle['bundleVersion'] + ) + if config.is_anvil_enabled(catalog): + for file in hit['files']: + is_supplementary = file['is_supplementary'] + if isinstance(is_supplementary, list): + is_supplementary = one(is_supplementary) + if is_supplementary: + bundle_fqid['entity_type'] = BundleEntityType.supplementary.value + break + else: + bundle_fqid['entity_type'] = BundleEntityType.primary.value + bundle_fqid = self.repository_plugin(catalog).resolve_bundle(bundle_fqid) + indexed_fqids.add(bundle_fqid) + return indexed_fqids + def _assert_catalog_complete(self, catalog: CatalogName, - entity_type: str, - bundle_fqids: Set[SourcedBundleFQID]) -> None: - fqid_by_uuid: Mapping[str, SourcedBundleFQID] = { - fqid.uuid: fqid for fqid in bundle_fqids - } - self.assertEqual(len(bundle_fqids), len(fqid_by_uuid)) + bundle_fqids: Set[SourcedBundleFQID] + ) -> None: with self.subTest('catalog_complete', catalog=catalog): expected_fqids = set(self.azul_client.filter_obsolete_bundle_versions(bundle_fqids)) obsolete_fqids = bundle_fqids - expected_fqids @@ -958,23 +981,13 @@ def _assert_catalog_complete(self, log.debug('Ignoring obsolete bundle versions %r', obsolete_fqids) num_bundles = len(expected_fqids) timeout = 600 - indexed_fqids = set() log.debug('Expecting bundles %s ', sorted(expected_fqids)) retries = 0 deadline = time.time() + timeout while True: - with self._service_account_credentials: - hits = self._get_entities(catalog, entity_type) - indexed_fqids.update( - # FIXME: We should use the source from the index rather than - # looking it up from the expectation. 
- # https://github.com/DataBiosphere/azul/issues/2625 - fqid_by_uuid[bundle['bundleUuid']] - for hit in hits - for bundle in hit.get('bundles', ()) - ) - log.info('Detected %i of %i bundles in %i hits for entity type %s on try #%i.', - len(indexed_fqids), num_bundles, len(hits), entity_type, retries) + indexed_fqids = self._get_indexed_bundles(catalog) + log.info('Detected %i of %i bundles on try #%i.', + len(indexed_fqids), num_bundles, retries) if len(indexed_fqids) == num_bundles: log.info('Found the expected %i bundles.', num_bundles) break diff --git a/test/service/__init__.py b/test/service/__init__.py index f51d572e9f..dfa87141ce 100644 --- a/test/service/__init__.py +++ b/test/service/__init__.py @@ -11,7 +11,6 @@ Callable, ClassVar, Optional, - Sequence, Union, ) from unittest.mock import ( @@ -39,7 +38,6 @@ from azul.indexer import ( Bundle, BundleUUID, - SourceJSON, SourcedBundleFQID, ) from azul.logging import ( @@ -54,6 +52,7 @@ StorageService, ) from azul.types import ( + AnyJSON, JSONs, ) from indexer import ( @@ -198,7 +197,7 @@ def storage_service(self) -> StorageService: 'mix in the appropriate subclass of CatalogTestCase.') def patch_source_cache(target: Union[None, type, Callable] = None, /, - hit: Optional[Sequence[SourceJSON]] = None): + hit: Optional[list[AnyJSON]] = None): """ Patch the cache access methods of SourceService to emulate a cache miss or return a given set of sources. diff --git a/test/service/test_cache_poisoning.py b/test/service/test_cache_poisoning.py index 9f2bcbc34a..18298ed459 100644 --- a/test/service/test_cache_poisoning.py +++ b/test/service/test_cache_poisoning.py @@ -1,6 +1,9 @@ from abc import ( ABCMeta, ) +from unittest.mock import ( + patch, +) import requests @@ -10,6 +13,9 @@ from azul.logging import ( configure_test_logging, ) +from azul.terra import ( + TDRClient, +) from azul_test_case import ( AnvilTestCase, DCP1TestCase, @@ -22,6 +28,21 @@ def setUpModule(): class CachePoisoningTestCase(LocalAppTestCase, metaclass=ABCMeta): + snapshot_mock = None + + @classmethod + def setUpClass(cls): + super().setUpClass() + cls.snapshot_mock = patch.object(TDRClient, + 'snapshot_names_by_id', + return_value={}) + cls.snapshot_mock.start() + + @classmethod + def tearDownClass(cls): + cls.snapshot_mock.stop() + cls.snapshot_mock = None + super().tearDownClass() @classmethod def lambda_name(cls) -> str: diff --git a/test/service/test_repository_proxy.py b/test/service/test_repository_proxy.py index 64bf44ac3b..18e782b6c3 100644 --- a/test/service/test_repository_proxy.py +++ b/test/service/test_repository_proxy.py @@ -61,6 +61,7 @@ SourceService, ) from azul.terra import ( + TDRClient, TDRSourceSpec, TerraClient, ) @@ -182,7 +183,9 @@ def test_repository_files_proxy(self, mock_get_cached_sources): response = client.request('GET', str(azul_url), redirect=False) self.assertEqual(response.status, 404) + @mock.patch.object(TDRClient, 'snapshot_names_by_id') def test_list_sources(self, + mock_list_snapshots, mock_get_cached_sources, ): # Includes extra sources to check that the endpoint only returns results @@ -192,14 +195,7 @@ def test_list_sources(self, str(i): source_name for i, source_name in enumerate(self.mock_source_names + extra_sources) } - mock_source_jsons = [ - { - 'id': id, - 'spec': str(TDRSourceSpec.parse(self.make_mock_source_spec(name))) - } - for id, name in mock_source_names_by_id.items() - if name not in extra_sources - ] + mock_list_snapshots.return_value = mock_source_names_by_id client = http_client() azul_url = 
furl(url=self.base_url, path='/repository/sources', @@ -219,20 +215,21 @@ def _test(*, authenticate: bool, cache: bool): self.assertEqual(response, { 'sources': [ { - 'sourceId': source['id'], - 'sourceSpec': source['spec'] + 'sourceId': id, + 'sourceSpec': str(TDRSourceSpec.parse(self.make_mock_source_spec(name))) } - for source in mock_source_jsons + for id, name in mock_source_names_by_id.items() + if name not in extra_sources ] }) - mock_get_cached_sources.return_value = mock_source_jsons + mock_get_cached_sources.return_value = list(mock_source_names_by_id.keys()) _test(authenticate=True, cache=True) _test(authenticate=False, cache=True) mock_get_cached_sources.return_value = None mock_get_cached_sources.side_effect = NotFound('foo_token') - with mock.patch('azul.terra.TDRClient.snapshot_names_by_id', - return_value=mock_source_names_by_id): + with mock.patch('azul.terra.TDRClient.snapshot_ids', + return_value=mock_source_names_by_id.keys() | {'not_indexed'}): _test(authenticate=True, cache=False) _test(authenticate=False, cache=False) diff --git a/test/test_content_type.py b/test/test_content_type.py deleted file mode 100644 index 32a1eae7ae..0000000000 --- a/test/test_content_type.py +++ /dev/null @@ -1,245 +0,0 @@ -import re -from typing import ( - Any, - Callable, -) -from unittest import ( - mock, -) - -from chalice import ( - Chalice, - ChaliceUnhandledError, - NotFoundError, - Response, -) -import requests - -from app_test_case import ( - LocalAppTestCase, -) -from azul_test_case import ( - DCP2TestCase, -) - - -class TestContentType(LocalAppTestCase, DCP2TestCase): - - @classmethod - def lambda_name(cls) -> str: - return 'service' - - @classmethod - def setUpClass(cls): - super().setUpClass() - - @cls.app_module.app.route('/test') - def route(): - pass - - def _replace_handler(self, handler: Callable[[None], Any]): - """ - Replace the current handler for route `/test` with the provided handler - """ - route = '/test' - app = self.__class__.app_module.app - app.routes.pop(route) - app._register_handler(handler_type='route', - name='route', - user_handler=handler, - wrapped_handler=handler, - kwargs={'path': route, 'kwargs': {}}, - options=None) - - def _shrink_traceback(self, s: str) -> str: - """ - Return a modified version of the given traceback. The first and last - lines are kept, and everything inbetween is replaced with a single line - of '...'. - """ - if s.startswith('Traceback'): - lines = s.split('\n') - assert lines[-1] == '', s # since traceback ends with a '\n' - s = '\n'.join([lines[0], '...', lines[-2]]) - elif s.startswith(''): - # Assumes traceback is a json dumped string inside tags - pattern = re.compile(r'(
)' - r'("Traceback.*?\\n)' # 1st line of traceback - r'.*\\n' # middle lines - r'(.*)\\n' # last line of traceback - r'(")') - s = re.sub(pattern, r'\1\2...\\n\3\4', s) - return s - - def _test_route(self, - handler_fn: Callable[[None], Any], - expected_fn: Callable[[bool, bool], tuple[str, str]] - ): - """ - Verify the response against expected for requests made with various - types of `accept` header values. - """ - self._replace_handler(handler_fn) - for debug in (False, True): - with mock.patch.object(Chalice, 'debug', 1 if debug else 0): - for accept, expect_wrapped in [ - (None, False), - ('*/*', False), - ('*/*,text/html', False), - ('text/html', True), - ('text/html,*/*', True), - ('*/*;q=0.9,text/html', True), - ('text/html;q=0.9,*/*;q=1.0', False), - ]: - with self.subTest(debug=debug, accept=accept): - url = self.base_url.set(path=('test',)) - headers = {'accept': accept} - response = requests.get(url, headers=headers) - response_text = self._shrink_traceback(response.text) - expected_text, expected_content_type = expected_fn(debug, expect_wrapped) - self.assertEqual(expected_text, response_text) - self.assertEqual(expected_content_type, response.headers['Content-Type']) - - def test_string(self): - - def route(): - return '' - - def expected(_debug: bool, _wrapped: bool) -> tuple[str, str]: - text = '' - content_type = 'application/json' - return text, content_type - - self._test_route(route, expected) - - def test_json(self): - - def route(): - return {'': ''} - - def expected(_debug: bool, _wrapped: bool) -> tuple[str, str]: - text = '{"":""}' - content_type = 'application/json' - return text, content_type - - self._test_route(route, expected) - - def test_response_200(self): - - def route(): - return Response(status_code=200, body='') - - def expected(_debug: bool, _wrapped: bool) -> tuple[str, str]: - text = '' - content_type = 'application/json' - return text, content_type - - self._test_route(route, expected) - - def test_response_200_text_plain(self): - - def route(): - return Response(status_code=200, - headers={'Content-Type': 'text/plain'}, - body='') - - def expected(_debug: bool, wrapped: bool) -> tuple[str, str]: - if wrapped: - text = ( - '' - '
"<script />"' - '' - ) - content_type = 'text/html' - else: - text = '' - content_type = 'text/plain' - return text, content_type - - self._test_route(route, expected) - - def test_response_200_text_html(self): - - def route(): - return Response(status_code=200, - headers={'Content-Type': 'text/html'}, - body='') - - def expected(_debug: bool, _wrapped: bool) -> tuple[str, str]: - text = '' - content_type = 'text/html' - return text, content_type - - self._test_route(route, expected) - - def test_NotFoundError(self): - - def route(): - raise NotFoundError('') - - def expected(_debug: bool, _wrapped: bool) -> tuple[str, str]: - text = '{"Code":"NotFoundError","Message":""}' - content_type = 'application/json' - return text, content_type - - self._test_route(route, expected) - - def test_ChaliceUnhandledError(self): - - def route(): - raise ChaliceUnhandledError('') - - def expected(debug: bool, _wrapped: bool) -> tuple[str, str]: - if debug: - text = ( - 'Traceback (most recent call last):\n' - '...\n' - 'chalice.app.ChaliceUnhandledError: ' - ) - content_type = 'text/plain' - else: - text = ( - '{"Code":"InternalServerError",' - '"Message":"An internal server error occurred."}' - ) - content_type = 'application/json' - return text, content_type - - self._test_route(route, expected) - - def test_exception(self): - - def route(): - raise Exception('') - - def expected(debug: bool, wrapped: bool) -> tuple[str, str]: - # Chalice's `_unhandled_exception_to_response` returns a - # stacktrace if debug is enabled - if debug: - if wrapped: - text = ( - '' - '
"Traceback (most recent call last):\\n' - '...\\n' - 'Exception: <script />"' - '' - ) - content_type = 'text/html' - else: - text = ( - 'Traceback (most recent call last):\n' - '...\n' - 'Exception: ' - ) - content_type = 'text/plain' - else: - text = ( - '{"Code":"InternalServerError",' - '"Message":"An internal server error occurred."}' - ) - content_type = 'application/json' - return text, content_type - - self._test_route(route, expected)