From 435853fcdd04abe82da93ca345b27ae68052b37d Mon Sep 17 00:00:00 2001 From: Daniel Thorn Date: Tue, 16 Apr 2024 10:06:41 -0700 Subject: [PATCH 1/8] bug-1886021: Implement GCS storage classes for GCP --- bin/gcs_cli.py | 41 ++ bin/process_crashes.sh | 17 +- socorro/external/boto/connection_context.py | 9 +- socorro/external/boto/crashstorage.py | 13 +- socorro/external/gcs/__init__.py | 3 + socorro/external/gcs/crashstorage.py | 383 ++++++++++++++ socorro/mozilla_settings.py | 82 ++- socorro/tests/conftest.py | 113 ++++- socorro/tests/external/gcs/__init__.py | 3 + socorro/tests/external/gcs/test_crash_data.py | 140 ++++++ .../tests/external/gcs/test_crashstorage.py | 473 ++++++++++++++++++ webapp/crashstats/api/tests/test_views.py | 38 +- .../management/commands/updatemissing.py | 6 +- .../management/commands/verifyprocessed.py | 29 +- webapp/crashstats/crashstats/models.py | 6 + .../crashstats/tests/test_models.py | 26 +- .../crashstats/tests/test_updatemissing.py | 25 +- .../crashstats/tests/test_verifyprocessed.py | 68 +-- .../crashstats/crashstats/tests/test_views.py | 112 +++-- 19 files changed, 1397 insertions(+), 190 deletions(-) create mode 100644 socorro/external/gcs/__init__.py create mode 100644 socorro/external/gcs/crashstorage.py create mode 100644 socorro/tests/external/gcs/__init__.py create mode 100644 socorro/tests/external/gcs/test_crash_data.py create mode 100644 socorro/tests/external/gcs/test_crashstorage.py diff --git a/bin/gcs_cli.py b/bin/gcs_cli.py index d050995c92..d51b51e1c3 100755 --- a/bin/gcs_cli.py +++ b/bin/gcs_cli.py @@ -9,6 +9,7 @@ # Usage: ./bin/gcs_cli.py CMD import os +from pathlib import Path import click @@ -119,6 +120,46 @@ def list_objects(bucket_name, details): click.echo("No objects in bucket.") +@gcs_group.command("upload") +@click.argument("source") +@click.argument("destination") +def upload(source, destination): + """Upload files to a bucket""" + + client = get_client() + + # remove protocol from destination if present + destination = destination.split("://", 1)[-1] + bucket_name, _, prefix = destination.partition("/") + + try: + bucket = client.get_bucket(bucket_name) + except NotFound: + click.error(f"GCS bucket {bucket_name!r} does not exist.") + return + + source_path = Path(source) + if not source_path.exists(): + click.error("local path {source!r} does not exist.") + if source_path.is_dir(): + prefix_path = Path(prefix) + sources = [p for p in source_path.rglob("*") if not p.is_dir()] + else: + sources = [source_path] + if not sources: + click.echo("No files in directory {source!r}.") + return + for path in sources: + if path == source_path: + key_path = prefix_path + else: + key_path = prefix_path / path.relative_to(source_path) + key = "/".join(key_path.parts) + blob = bucket.blob(key) + blob.upload_from_filename(path) + click.echo(f"Uploaded gs://{bucket_name}/{key}") + + def main(argv=None): argv = argv or [] gcs_group(argv) diff --git a/bin/process_crashes.sh b/bin/process_crashes.sh index 3ff76f36ed..975df02a9a 100755 --- a/bin/process_crashes.sh +++ b/bin/process_crashes.sh @@ -4,8 +4,8 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. -# Pulls down crash data for specified crash ids, syncs to the S3 bucket, and -# sends the crash ids to the Pub/Sub queue. +# Pulls down crash data for specified crash ids, syncs to the cloud storage +# bucket, and sends the crash ids to the queue. # # Usage: ./bin/process_crashes.sh # @@ -47,9 +47,16 @@ mkdir "${DATADIR}" || echo "${DATADIR} already exists." ./socorro-cmd fetch_crash_data "${DATADIR}" $@ # Make the bucket and sync contents -./bin/socorro_aws_s3.sh mb s3://dev-bucket/ -./bin/socorro_aws_s3.sh cp --recursive "${DATADIR}" "s3://${CRASHSTORAGE_S3_BUCKET}/" -./bin/socorro_aws_s3.sh ls --recursive "s3://${CRASHSTORAGE_S3_BUCKET}/" +# ^^ returns CLOUD_PROVIDER value as uppercase +if [[ "${CLOUD_PROVIDER^^}" == "GCP" ]]; then + ./socorro-cmd gcs create "${CRASHSTORAGE_GCS_BUCKET}" + ./socorro-cmd gcs upload "${DATADIR}" "${CRASHSTORAGE_GCS_BUCKET}" + ./socorro-cmd gcs list_objects "${CRASHSTORAGE_GCS_BUCKET}" +else + ./bin/socorro_aws_s3.sh mb "s3://${CRASHSTORAGE_S3_BUCKET}/" + ./bin/socorro_aws_s3.sh cp --recursive "${DATADIR}" "s3://${CRASHSTORAGE_S3_BUCKET}/" + ./bin/socorro_aws_s3.sh ls --recursive "s3://${CRASHSTORAGE_S3_BUCKET}/" +fi # Add crash ids to queue # ^^ returns CLOUD_PROVIDER value as uppercase diff --git a/socorro/external/boto/connection_context.py b/socorro/external/boto/connection_context.py index d2f031688f..14a1d40ed8 100644 --- a/socorro/external/boto/connection_context.py +++ b/socorro/external/boto/connection_context.py @@ -204,17 +204,22 @@ def load_file(self, bucket, path): f"(bucket={bucket!r} key={path}) not found, no value returned" ) from exc - def list_objects_paginator(self, bucket, prefix): + def list_objects_paginator(self, bucket, prefix, page_size=None): """Returns S3 client paginator of objects with key prefix in bucket :arg bucket: the name of the bucket :arg prefix: the key prefix + :arg page_size: the size of pages to request :returns: S3 paginator """ paginator = self.client.get_paginator("list_objects_v2") - page_iterator = paginator.paginate(Bucket=bucket, Prefix=prefix) + page_iterator = paginator.paginate( + Bucket=bucket, + Prefix=prefix, + PaginationConfig={} if page_size is None else {"PageSize": page_size}, + ) return page_iterator def head_object(self, bucket, key): diff --git a/socorro/external/boto/crashstorage.py b/socorro/external/boto/crashstorage.py index 2ee5330f71..67ab2955c2 100644 --- a/socorro/external/boto/crashstorage.py +++ b/socorro/external/boto/crashstorage.py @@ -195,15 +195,18 @@ def save_processed_crash(self, raw_crash, processed_crash): path = build_keys("processed_crash", crash_id)[0] self.save_file(path, data) - def list_objects_paginator(self, prefix): - """Return generator of objects in the bucket that have a specified key prefix + def list_objects_paginator(self, prefix, page_size=None): + """Yield pages of object keys in the bucket that have a specified key prefix :arg prefix: the prefix to look at + :arg page_size: the number of results to return per page - :returns: generator of keys - + :returns: generator of pages (lists) of object keys """ - return self.connection.list(bucket=self.bucket, prefix=prefix) + for page in self.connection.list_objects_paginator( + bucket=self.bucket, prefix=prefix, page_size=page_size + ): + yield [item["Key"] for item in page.get("Contents", [])] def exists_object(self, key): """Returns whether the object exists in the bucket diff --git a/socorro/external/gcs/__init__.py b/socorro/external/gcs/__init__.py new file mode 100644 index 0000000000..448bb8652d --- /dev/null +++ b/socorro/external/gcs/__init__.py @@ -0,0 +1,3 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. diff --git a/socorro/external/gcs/crashstorage.py b/socorro/external/gcs/crashstorage.py new file mode 100644 index 0000000000..18d0adb9d3 --- /dev/null +++ b/socorro/external/gcs/crashstorage.py @@ -0,0 +1,383 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. + +import json +import os + +import markus +from google.auth.credentials import AnonymousCredentials +from google.api_core.exceptions import NotFound +from google.cloud import storage +from more_itertools import chunked + +from socorro.external.crashstorage_base import ( + CrashStorageBase, + CrashIDNotFound, + MemoryDumpsMapping, +) +from socorro.external.boto.crashstorage import ( + build_keys, + dict_to_str, + list_to_str, + str_to_list, +) +from socorro.lib import external_common, MissingArgumentError, BadArgumentError, libooid +from socorro.lib.libjsonschema import JsonSchemaReducer +from socorro.lib.libsocorrodataschema import ( + get_schema, + permissions_transform_function, + SocorroDataReducer, + transform_schema, +) +from socorro.schemas import TELEMETRY_SOCORRO_CRASH_SCHEMA + + +class GcsCrashStorage(CrashStorageBase): + """Saves and loads crash data to GCS""" + + def __init__( + self, + bucket="crashstats", + dump_file_suffix=".dump", + metrics_prefix="processor.gcs", + ): + """ + :arg bucket: the GCS bucket to save to + :arg dump_file_suffix: the suffix used to identify a dump file (for use in temp + files) + :arg region: the AWS region to use + :arg access_key: the AWS access_key to use + :arg secret_access_key: the AWS secret_access_key to use + :arg endpoint_url: the endpoint url to use when in a local development + environment + + """ + super().__init__() + + if emulator := os.environ.get("STORAGE_EMULATOR_HOST"): + self.logger.debug( + "STORAGE_EMULATOR_HOST detected, connecting to emulator: %s", + emulator, + ) + self.client = storage.Client( + credentials=AnonymousCredentials(), + project=os.environ.get("STORAGE_PROJECT_ID"), + ) + else: + self.client = storage.Client() + + self.bucket = bucket + self.dump_file_suffix = dump_file_suffix + + self.metrics = markus.get_metrics(metrics_prefix) + + def load_file(self, path): + bucket = self.client.get_bucket(self.bucket) + blob = bucket.blob(path) + return blob.download_as_bytes() + + def save_file(self, path, data): + bucket = self.client.get_bucket(self.bucket) + blob = bucket.blob(path) + blob.upload_from_string(data) + + def save_raw_crash(self, raw_crash, dumps, crash_id): + """Save raw crash data to GCS bucket. + + A raw crash consists of the raw crash annotations and all the dumps that came in + the crash report. We need to save the raw crash file, a dump names file listing + the dumps that came in the crash report, and then each of the dumps. + + """ + if dumps is None: + dumps = MemoryDumpsMapping() + + path = build_keys("raw_crash", crash_id)[0] + raw_crash_data = dict_to_str(raw_crash).encode("utf-8") + self.save_file(path, raw_crash_data) + + path = build_keys("dump_names", crash_id)[0] + dump_names_data = list_to_str(dumps.keys()).encode("utf-8") + self.save_file(path, dump_names_data) + + # We don't know what type of dumps mapping we have. We do know, + # however, that by calling the memory_dump_mapping method, we will get + # a MemoryDumpMapping which is exactly what we need. + dumps = dumps.as_memory_dumps_mapping() + for dump_name, dump in dumps.items(): + if dump_name in (None, "", "upload_file_minidump"): + dump_name = "dump" + path = build_keys(dump_name, crash_id)[0] + self.save_file(path, dump) + + def save_processed_crash(self, raw_crash, processed_crash): + """Save the processed crash file.""" + crash_id = processed_crash["uuid"] + data = dict_to_str(processed_crash).encode("utf-8") + path = build_keys("processed_crash", crash_id)[0] + self.save_file(path, data) + + def list_objects_paginator(self, prefix, page_size=1000): + """Yield pages of object keys in the bucket that have a specified key prefix + + :arg prefix: the prefix to look at + :arg page_size: the number of results to return per page + + :returns: generator of pages (lists) of object keys + + """ + for page in chunked( + self.client.list_blobs( + bucket_or_name=self.bucket, prefix=prefix, page_size=page_size + ), + page_size, + ): + yield [blob.name for blob in page] + + def exists_object(self, key): + """Returns whether the object exists in the bucket + + :arg key: the key to check + + :returns: bool + + """ + try: + bucket = self.client.get_bucket(self.bucket) + bucket.get_blob(key) + return True + except NotFound: + return False + + def get_raw_crash(self, crash_id): + """Get the raw crash file for the given crash id + + :returns: dict + + :raises CrashIDNotFound: if the crash doesn't exist + + """ + for path in build_keys("raw_crash", crash_id): + try: + raw_crash_as_string = self.load_file(path) + data = json.loads(raw_crash_as_string) + return data + except NotFound: + continue + + raise CrashIDNotFound(f"{crash_id} not found") + + def get_raw_dump(self, crash_id, name=None): + """Get a specified dump file for the given crash id. + + :returns: dump as bytes + + :raises CrashIDNotFound: if file does not exist + + """ + try: + if name in (None, "", "upload_file_minidump"): + name = "dump" + path = build_keys(name, crash_id)[0] + a_dump = self.load_file(path) + return a_dump + except NotFound as exc: + raise CrashIDNotFound(f"{crash_id} not found: {exc}") from exc + + def get_dumps(self, crash_id): + """Get all the dump files for a given crash id. + + :returns MemoryDumpsMapping: + + :raises CrashIDNotFound: if file does not exist + + """ + try: + path = build_keys("dump_names", crash_id)[0] + dump_names_as_string = self.load_file(path) + dump_names = str_to_list(dump_names_as_string) + + dumps = MemoryDumpsMapping() + for dump_name in dump_names: + if dump_name in (None, "", "upload_file_minidump"): + dump_name = "dump" + path = build_keys(dump_name, crash_id)[0] + dumps[dump_name] = self.load_file(path) + return dumps + except NotFound as exc: + raise CrashIDNotFound(f"{crash_id} not found: {exc}") from exc + + def get_dumps_as_files(self, crash_id, tmpdir): + """Get the dump files for given crash id and save them to tmp. + + :returns: dict of dumpname -> file path + + :raises CrashIDNotFound: if file does not exist + + """ + in_memory_dumps = self.get_dumps(crash_id) + # convert our native memory dump mapping into a file dump mapping. + return in_memory_dumps.as_file_dumps_mapping( + crash_id, + tmpdir, + self.dump_file_suffix, + ) + + def get_processed_crash(self, crash_id): + """Get the processed crash. + + :returns: dict + + :raises CrashIDNotFound: if file does not exist + + """ + path = build_keys("processed_crash", crash_id)[0] + try: + processed_crash_as_string = self.load_file(path) + except NotFound as exc: + raise CrashIDNotFound(f"{crash_id} not found: {exc}") from exc + return json.loads(processed_crash_as_string) + + def get(self, **kwargs): + """Return JSON data of a crash report, given its uuid.""" + filters = [ + ("uuid", None, str), + ("datatype", None, str), + ("name", None, str), # only applicable if datatype == 'raw' + ] + params = external_common.parse_arguments(filters, kwargs, modern=True) + + if not params["uuid"]: + raise MissingArgumentError("uuid") + + if not libooid.is_crash_id_valid(params["uuid"]): + raise BadArgumentError("uuid") + + if not params["datatype"]: + raise MissingArgumentError("datatype") + + datatype_method_mapping = { + # Minidumps + "raw": "get_raw_dump", + # Raw Crash + "meta": "get_raw_crash", + # Redacted processed crash + "processed": "get_processed_crash", + } + if params["datatype"] not in datatype_method_mapping: + raise BadArgumentError(params["datatype"]) + get = self.__getattribute__(datatype_method_mapping[params["datatype"]]) + try: + if params["datatype"] == "raw": + return get(params["uuid"], name=params["name"]) + else: + return get(params["uuid"]) + except CrashIDNotFound as cidnf: + self.logger.warning("%s not found: %s", params["datatype"], cidnf) + # The CrashIDNotFound exception that happens inside the + # crashstorage is too revealing as exception message + # contains information about buckets and prefix keys. + # Re-wrap it here so the message is just the crash ID. + raise CrashIDNotFound(params["uuid"]) from cidnf + + +class TelemetryGcsCrashStorage(GcsCrashStorage): + """Sends a subset of the processed crash to a GCS bucket + + The subset of the processed crash is based on the JSON Schema which is + derived from "socorro/external/es/super_search_fields.py". + + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # Create a reducer that traverses documents and reduces them down to the + # structure of the specified schema + self.build_reducers() + + def build_reducers(self): + processed_crash_schema = get_schema("processed_crash.schema.yaml") + only_public = permissions_transform_function( + permissions_have=["public"], + default_permissions=processed_crash_schema["default_permissions"], + ) + public_processed_crash_schema = transform_schema( + schema=processed_crash_schema, + transform_function=only_public, + ) + self.processed_crash_reducer = SocorroDataReducer( + schema=public_processed_crash_schema + ) + + self.telemetry_reducer = JsonSchemaReducer( + schema=TELEMETRY_SOCORRO_CRASH_SCHEMA + ) + + # List of source -> target keys which have different names for historical reasons + HISTORICAL_MANUAL_KEYS = [ + # processed crash source key, crash report target key + ("build", "build_id"), + ("date_processed", "date"), + ("os_pretty_version", "platform_pretty_version"), + ("os_name", "platform"), + ("os_version", "platform_version"), + ] + + def save_processed_crash(self, raw_crash, processed_crash): + """Save processed crash data. + + For Telemetry, we reduce the processed crash into a crash report that matches + the telemetry_socorro_crash.json schema. + + For historical reasons, we then add some additional fields manually. + + """ + # Reduce processed crash to public-only fields + public_data = self.processed_crash_reducer.traverse(document=processed_crash) + + # Reduce public processed_crash to telemetry schema fields + telemetry_data = self.telemetry_reducer.traverse(document=public_data) + + # Add additional fields that have different names for historical reasons + for source_key, target_key in self.HISTORICAL_MANUAL_KEYS: + if source_key in public_data: + telemetry_data[target_key] = public_data[source_key] + + crash_id = telemetry_data["uuid"] + data = dict_to_str(telemetry_data).encode("utf-8") + path = build_keys("crash_report", crash_id)[0] + self.save_file(path, data) + + def get_processed_crash(self, crash_id): + """Get a crash report from the GCS bucket. + + :returns: dict + + :raises CrashIDNotFound: if file does not exist + + """ + path = build_keys("crash_report", crash_id)[0] + try: + crash_report_as_str = self.load_file(path) + except NotFound as exc: + raise CrashIDNotFound(f"{crash_id} not found: {exc}") from exc + return json.loads(crash_report_as_str) + + def get(self, **kwargs): + """Return JSON data of a crash report, given its uuid.""" + filters = [("uuid", None, str)] + params = external_common.parse_arguments(filters, kwargs, modern=True) + + if not params["uuid"]: + raise MissingArgumentError("uuid") + + try: + return self.get_processed_crash(params["uuid"]) + except CrashIDNotFound as cidnf: + self.logger.warning("telemetry crash not found: %s", cidnf) + # The CrashIDNotFound exception that happens inside the + # crashstorage is too revealing as exception message contains + # information about buckets and prefix keys. Re-wrap it here so the + # message is just the crash ID. + raise CrashIDNotFound(params["uuid"]) from cidnf diff --git a/socorro/mozilla_settings.py b/socorro/mozilla_settings.py index 991526c48e..b13ab2b630 100644 --- a/socorro/mozilla_settings.py +++ b/socorro/mozilla_settings.py @@ -210,27 +210,7 @@ def _or_none(val): }, } - -def cloud_provider_parser(val): - """Return 'AWS' or 'GCP'.""" - normalized = val.strip().upper() - if normalized in ("AWS", "GCP"): - return normalized - raise ValueError(f"cloud provider not supported, must be AWS or GCP: {val}") - - -# Cloud provider specific configuration -CLOUD_PROVIDER = _config( - "CLOUD_PROVIDER", - default="AWS", - parser=cloud_provider_parser, - doc="The cloud provider to use for queueing and blob storage. Must be AWS or GCP.", -) -if CLOUD_PROVIDER == "AWS": - QUEUE = QUEUE_SQS -elif CLOUD_PROVIDER == "GCP": - QUEUE = QUEUE_PUBSUB - +# Crash report storage configuration if CLOUD_PROVIDER == AWS S3_STORAGE = { "class": "socorro.external.boto.crashstorage.BotoS3CrashStorage", "options": { @@ -258,6 +238,18 @@ def cloud_provider_parser(val): "endpoint_url": LOCAL_DEV_AWS_ENDPOINT_URL, }, } +# Crash report storage configuration if CLOUD_PROVIDER == GCP +GCS_STORAGE = { + "class": "socorro.external.gcs.crashstorage.GcsCrashStorage", + "options": { + "metrics_prefix": "processor.gcs", + "bucket": _config( + "CRASHSTORAGE_GCS_BUCKET", + default="", + doc="GCS bucket name for crash report data.", + ), + }, +} ES_STORAGE = { "class": "socorro.external.es.crashstorage.ESCrashStorage", @@ -277,7 +269,8 @@ def cloud_provider_parser(val): }, } -TELEMETRY_STORAGE = { +# Telemetry crash report storage configuration if CLOUD_PROVIDER == AWS +TELEMETRY_S3_STORAGE = { "class": "socorro.external.boto.crashstorage.TelemetryBotoS3CrashStorage", "options": { "metrics_prefix": "processor.telemetry", @@ -300,15 +293,52 @@ def cloud_provider_parser(val): "endpoint_url": LOCAL_DEV_AWS_ENDPOINT_URL, }, } +# Telemetry crash report storage configuration if CLOUD_PROVIDER == GCP +TELEMETRY_GCS_STORAGE = { + "class": "socorro.external.gcs.crashstorage.TelemetryGcsCrashStorage", + "options": { + "metrics_prefix": "processor.telemetry", + "bucket": _config( + "TELEMETRY_GCS_BUCKET", + default="", + doc="GCS bucket name for telemetry data export.", + ), + }, +} + + +def cloud_provider_parser(val): + """Return 'AWS' or 'GCP'.""" + normalized = val.strip().upper() + if normalized in ("AWS", "GCP"): + return normalized + raise ValueError(f"cloud provider not supported, must be AWS or GCP: {val}") + + +# Cloud provider specific configuration +CLOUD_PROVIDER = _config( + "CLOUD_PROVIDER", + default="AWS", + parser=cloud_provider_parser, + doc="The cloud provider to use for queueing and blob storage. Must be AWS or GCP.", +) +if CLOUD_PROVIDER == "AWS": + QUEUE = QUEUE_SQS + STORAGE = S3_STORAGE + TELEMETRY_STORAGE = TELEMETRY_S3_STORAGE +elif CLOUD_PROVIDER == "GCP": + QUEUE = QUEUE_PUBSUB + STORAGE = GCS_STORAGE + TELEMETRY_STORAGE = TELEMETRY_GCS_STORAGE -# Crash report storage source pulls from S3 -CRASH_SOURCE = S3_STORAGE +# Crash report storage source pulls from S3 or GCS +CRASH_SOURCE = STORAGE # Each key in this list corresponds to a key in this dict containing a crash report data # destination configuration -CRASH_DESTINATIONS_ORDER = ["s3", "es", "telemetry"] +CRASH_DESTINATIONS_ORDER = ["storage", "es", "telemetry"] CRASH_DESTINATIONS = { - "s3": S3_STORAGE, + "storage": STORAGE, "es": ES_STORAGE, "telemetry": TELEMETRY_STORAGE, } diff --git a/socorro/tests/conftest.py b/socorro/tests/conftest.py index 12459f3b3b..b77dff4bd0 100644 --- a/socorro/tests/conftest.py +++ b/socorro/tests/conftest.py @@ -18,6 +18,8 @@ from botocore.client import ClientError, Config from elasticsearch_dsl import Search from google.api_core.exceptions import AlreadyExists, NotFound +from google.auth.credentials import AnonymousCredentials +from google.cloud import storage from google.cloud.pubsub_v1 import PublisherClient, SubscriberClient from google.cloud.pubsub_v1.types import BatchSettings, PublisherOptions from markus.testing import MetricsMock @@ -157,12 +159,18 @@ def upload_fileobj(self, bucket_name, key, data): self.create_bucket(bucket_name) self.conn.upload_fileobj(Fileobj=io.BytesIO(data), Bucket=bucket_name, Key=key) + def upload(self, bucket_name, key, data): + self.upload_fileobj(bucket_name, key, data) + def download_fileobj(self, bucket_name, key): """Fetches an object from the specified bucket""" self.create_bucket(bucket_name) resp = self.conn.get_object(Bucket=bucket_name, Key=key) return resp["Body"].read() + def download(self, bucket_name, key): + return self.download_fileobj(bucket_name, key) + def list(self, bucket_name): """Return list of keys for objects in bucket.""" self.create_bucket(bucket_name) @@ -179,8 +187,8 @@ def s3_helper(): * ``get_client()`` * ``get_crashstorage_bucket()`` * ``create_bucket(bucket_name)`` - * ``upload_fileobj(bucket_name, key, value)`` - * ``download_fileobj(bucket_name, key)`` + * ``upload(bucket_name, key, data)`` + * ``download(bucket_name, key)`` * ``list(bucket_name)`` """ @@ -188,6 +196,87 @@ def s3_helper(): yield s3_helper +class GcsHelper: + """GCS helper class. + + When used in a context, this will clean up any buckets created. + + """ + + def __init__(self): + self._buckets_seen = None + if os.environ.get("STORAGE_EMULATOR_HOST"): + self.client = storage.Client( + credentials=AnonymousCredentials(), + project="test", + ) + else: + self.client = storage.Client() + + def __enter__(self): + self._buckets_seen = set() + return self + + def __exit__(self, exc_type, exc_value, exc_tb): + for bucket_name in self._buckets_seen: + try: + bucket = self.client.get_bucket(bucket_or_name=bucket_name) + bucket.delete(force=True) + except NotFound: + pass + self._buckets_seen = None + + def get_crashstorage_bucket(self): + return os.environ["CRASHSTORAGE_GCS_BUCKET"] + + def get_telemetry_bucket(self): + return os.environ["TELEMETRY_GCS_BUCKET"] + + def create_bucket(self, bucket_name): + """Create specified bucket if it doesn't exist.""" + try: + bucket = self.client.get_bucket(bucket_or_name=bucket_name) + except NotFound: + bucket = self.client.create_bucket(bucket_or_name=bucket_name) + if self._buckets_seen is not None: + self._buckets_seen.add(bucket_name) + return bucket + + def upload(self, bucket_name, key, data): + """Puts an object into the specified bucket.""" + bucket = self.create_bucket(bucket_name) + bucket.blob(blob_name=key).upload_from_string(data) + + def download(self, bucket_name, key): + """Fetches an object from the specified bucket""" + bucket = self.create_bucket(bucket_name) + return bucket.blob(blob_name=key).download_as_bytes() + + def list(self, bucket_name): + """Return list of keys for objects in bucket.""" + self.create_bucket(bucket_name) + blobs = list(self.client.list_blobs(bucket_or_name=bucket_name)) + return [blob.name for blob in blobs] + + +@pytest.fixture +def gcs_helper(): + """Returns an GcsHelper for automating repetitive tasks in GCS setup. + + Provides: + + * ``get_crashstorage_bucket()`` + * ``get_telemetry_bucket()`` + * ``create_bucket(bucket_name)`` + * ``upload(bucket_name, key, data)`` + * ``download(bucket_name, key)`` + * ``list(bucket_name)`` + + """ + with GcsHelper() as gcs_helper: + yield gcs_helper + + class ElasticsearchHelper: """Elasticsearch helper class. @@ -548,3 +637,23 @@ def queue_helper(cloud_provider): with helper as _helper: yield _helper + + +@pytest.fixture +def storage_helper(cloud_provider): + """Generate and return a queue helper using env config.""" + actual_backend = "s3" + if os.environ.get("CLOUD_PROVIDER", "").strip().upper() == "GCP": + actual_backend = "gcs" + + expect_backend = "gcs" if cloud_provider == "gcp" else "s3" + if actual_backend != expect_backend: + pytest.fail(f"test requires {expect_backend} but found {actual_backend}") + + if actual_backend == "gcs": + helper = GcsHelper() + else: + helper = S3Helper() + + with helper as _helper: + yield _helper diff --git a/socorro/tests/external/gcs/__init__.py b/socorro/tests/external/gcs/__init__.py new file mode 100644 index 0000000000..448bb8652d --- /dev/null +++ b/socorro/tests/external/gcs/__init__.py @@ -0,0 +1,3 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. diff --git a/socorro/tests/external/gcs/test_crash_data.py b/socorro/tests/external/gcs/test_crash_data.py new file mode 100644 index 0000000000..e465b2b988 --- /dev/null +++ b/socorro/tests/external/gcs/test_crash_data.py @@ -0,0 +1,140 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. + +import json +import os + +import pytest + +from socorro.external.crashstorage_base import CrashIDNotFound +from socorro.lib import MissingArgumentError, BadArgumentError +from socorro.libclass import build_instance_from_settings +from socorro.lib.libooid import create_new_ooid + + +CRASHDATA_SETTINGS = { + "class": "socorro.external.gcs.crashstorage.GcsCrashStorage", + "options": { + "bucket": os.environ["CRASHSTORAGE_GCS_BUCKET"], + }, +} + +TELEMETRY_SETTINGS = { + "class": "socorro.external.gcs.crashstorage.TelemetryGcsCrashStorage", + "options": { + "bucket": os.environ["TELEMETRY_GCS_BUCKET"], + }, +} + + +class TestSimplifiedCrashData: + def test_get_processed(self, gcs_helper): + crashdata = build_instance_from_settings(CRASHDATA_SETTINGS) + + bucket = CRASHDATA_SETTINGS["options"]["bucket"] + crash_id = create_new_ooid() + gcs_helper.create_bucket(bucket) + + gcs_helper.upload( + bucket_name=bucket, + key=f"v1/processed_crash/{crash_id}", + data=json.dumps({"foo": "bar"}).encode("utf-8"), + ) + + result = crashdata.get(uuid=crash_id, datatype="processed") + assert result == {"foo": "bar"} + + def test_get_processed_not_found(self, gcs_helper): + crashdata = build_instance_from_settings(CRASHDATA_SETTINGS) + + bucket = CRASHDATA_SETTINGS["options"]["bucket"] + crash_id = create_new_ooid() + gcs_helper.create_bucket(bucket) + + with pytest.raises(CrashIDNotFound): + crashdata.get(uuid=crash_id, datatype="processed") + + def test_get_raw_dump(self, gcs_helper): + crashdata = build_instance_from_settings(CRASHDATA_SETTINGS) + + bucket = CRASHDATA_SETTINGS["options"]["bucket"] + crash_id = create_new_ooid() + gcs_helper.create_bucket(bucket) + + gcs_helper.upload( + bucket_name=bucket, + key=f"v1/dump/{crash_id}", + data=b"\xa0", + ) + + result = crashdata.get(uuid=crash_id, datatype="raw") + assert result == b"\xa0" + + def test_get_raw_dump_not_found(self, gcs_helper): + crashdata = build_instance_from_settings(CRASHDATA_SETTINGS) + + bucket = CRASHDATA_SETTINGS["options"]["bucket"] + crash_id = create_new_ooid() + gcs_helper.create_bucket(bucket) + + with pytest.raises(CrashIDNotFound): + crashdata.get(uuid=crash_id, datatype="raw") + + def test_get_raw_crash_not_found(self, gcs_helper): + crashdata = build_instance_from_settings(CRASHDATA_SETTINGS) + + bucket = CRASHDATA_SETTINGS["options"]["bucket"] + crash_id = create_new_ooid() + gcs_helper.create_bucket(bucket) + + with pytest.raises(CrashIDNotFound): + crashdata.get(uuid=crash_id, datatype="meta") + + def test_bad_arguments(self): + crashdata = build_instance_from_settings(CRASHDATA_SETTINGS) + + crash_id = create_new_ooid() + + with pytest.raises(MissingArgumentError): + crashdata.get() + + with pytest.raises(MissingArgumentError): + crashdata.get(uuid=crash_id) + + with pytest.raises(BadArgumentError): + crashdata.get(uuid=crash_id, datatype="junk") + + +class TestTelemetryCrashData: + def test_get_data(self, gcs_helper): + crashdata = build_instance_from_settings(TELEMETRY_SETTINGS) + + bucket = TELEMETRY_SETTINGS["options"]["bucket"] + crash_id = create_new_ooid() + + gcs_helper.create_bucket(bucket) + gcs_helper.upload( + bucket_name=bucket, + key=f"v1/crash_report/20{crash_id[-6:]}/{crash_id}", + data=json.dumps({"foo": "bar"}).encode("utf-8"), + ) + + result = crashdata.get(uuid=crash_id) + assert result == {"foo": "bar"} + + def test_get_data_not_found(self, gcs_helper): + crashdata = build_instance_from_settings(TELEMETRY_SETTINGS) + + bucket = TELEMETRY_SETTINGS["options"]["bucket"] + crash_id = create_new_ooid() + + gcs_helper.create_bucket(bucket) + with pytest.raises(CrashIDNotFound): + crashdata.get(uuid=crash_id) + + def test_bad_arguments(self): + crashdata = build_instance_from_settings(TELEMETRY_SETTINGS) + + with pytest.raises(MissingArgumentError): + crashdata.get() diff --git a/socorro/tests/external/gcs/test_crashstorage.py b/socorro/tests/external/gcs/test_crashstorage.py new file mode 100644 index 0000000000..a2b3f19044 --- /dev/null +++ b/socorro/tests/external/gcs/test_crashstorage.py @@ -0,0 +1,473 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. + +import json +import os.path +import os + +import pytest + +from socorro.external.gcs.crashstorage import build_keys, dict_to_str +from socorro.external.crashstorage_base import CrashIDNotFound, MemoryDumpsMapping +from socorro.libclass import build_instance_from_settings +from socorro.lib.libdatetime import date_to_string, utc_now +from socorro.lib.libooid import create_new_ooid + + +CRASHSTORAGE_SETTINGS = { + "class": "socorro.external.gcs.crashstorage.GcsCrashStorage", + "options": { + "bucket": os.environ["CRASHSTORAGE_GCS_BUCKET"], + }, +} + + +TELEMETRY_SETTINGS = { + "class": "socorro.external.gcs.crashstorage.TelemetryGcsCrashStorage", + "options": { + "bucket": os.environ["TELEMETRY_GCS_BUCKET"], + }, +} + + +@pytest.mark.parametrize( + "kind, crashid, expected", + [ + ( + "raw_crash", + "0bba929f-8721-460c-dead-a43c20071027", + [ + "v1/raw_crash/20071027/0bba929f-8721-460c-dead-a43c20071027", + ], + ), + ( + "dump_names", + "0bba929f-8721-460c-dead-a43c20071027", + [ + "v1/dump_names/0bba929f-8721-460c-dead-a43c20071027", + ], + ), + ( + "processed_crash", + "0bba929f-8721-460c-dead-a43c20071027", + [ + "v1/processed_crash/0bba929f-8721-460c-dead-a43c20071027", + ], + ), + # For telemetry + ( + "crash_report", + "0bba929f-8721-460c-dead-a43c20071027", + [ + "v1/crash_report/20071027/0bba929f-8721-460c-dead-a43c20071027", + ], + ), + ], +) +def test_build_keys(kind, crashid, expected): + assert build_keys(kind, crashid) == expected + + +class TestGcsCrashStorage: + def test_save_raw_crash_no_dumps(self, gcs_helper): + crashstorage = build_instance_from_settings(CRASHSTORAGE_SETTINGS) + bucket = CRASHSTORAGE_SETTINGS["options"]["bucket"] + now = utc_now() + crash_id = create_new_ooid(timestamp=now) + original_raw_crash = {"submitted_timestamp": date_to_string(now)} + gcs_helper.create_bucket(bucket) + + # Run save_raw_crash + crashstorage.save_raw_crash( + raw_crash=original_raw_crash, + # This is an empty set of dumps--no dumps! + dumps=MemoryDumpsMapping(), + crash_id=crash_id, + ) + + # Verify the raw_crash made it to the right place and has the right + # contents + raw_crash = gcs_helper.download( + bucket_name=bucket, + key=f"v1/raw_crash/20{crash_id[-6:]}/{crash_id}", + ) + + assert json.loads(raw_crash) == original_raw_crash + + # Verify dump_names made it to the right place and has the right contents + dump_names = gcs_helper.download( + bucket_name=bucket, + key=f"v1/dump_names/{crash_id}", + ) + assert json.loads(dump_names) == [] + + def test_save_raw_crash_with_dumps(self, gcs_helper): + crashstorage = build_instance_from_settings(CRASHSTORAGE_SETTINGS) + bucket = CRASHSTORAGE_SETTINGS["options"]["bucket"] + now = utc_now() + crash_id = create_new_ooid(timestamp=now) + original_raw_crash = {"submitted_timestamp": date_to_string(now)} + gcs_helper.create_bucket(bucket) + + # Run save_raw_crash + crashstorage.save_raw_crash( + raw_crash=original_raw_crash, + dumps=MemoryDumpsMapping( + {"dump": b"fake dump", "content_dump": b"fake content dump"} + ), + crash_id=crash_id, + ) + + # Verify the raw_crash made it to the right place and has the right contents + raw_crash = gcs_helper.download( + bucket_name=bucket, + key=f"v1/raw_crash/20{crash_id[-6:]}/{crash_id}", + ) + + assert json.loads(raw_crash) == original_raw_crash + + # Verify dump_names made it to the right place and has the right + # contents + dump_names = gcs_helper.download( + bucket_name=bucket, + key=f"v1/dump_names/{crash_id}", + ) + assert sorted(json.loads(dump_names)) == ["content_dump", "dump"] + + # Verify dumps + dump = gcs_helper.download( + bucket_name=bucket, + key=f"v1/dump/{crash_id}", + ) + assert dump == b"fake dump" + + content_dump = gcs_helper.download( + bucket_name=bucket, + key=f"v1/content_dump/{crash_id}", + ) + assert content_dump == b"fake content dump" + + def test_save_processed_crash(self, gcs_helper): + crashstorage = build_instance_from_settings(CRASHSTORAGE_SETTINGS) + bucket = CRASHSTORAGE_SETTINGS["options"]["bucket"] + now = utc_now() + crash_id = create_new_ooid(timestamp=now) + original_raw_crash = {"submitted_timestamp": date_to_string(now)} + original_processed_crash = { + "uuid": crash_id, + "completed_datetime": date_to_string(now), + "signature": "now_this_is_a_signature", + } + + gcs_helper.create_bucket(bucket) + crashstorage.save_processed_crash( + raw_crash=original_raw_crash, + processed_crash=original_processed_crash, + ) + + # Verify processed crash is saved + processed_crash = gcs_helper.download( + bucket_name=bucket, + key=f"v1/processed_crash/{crash_id}", + ) + assert json.loads(processed_crash) == original_processed_crash + # Verify nothing else got saved + assert gcs_helper.list(bucket_name=bucket) == [f"v1/processed_crash/{crash_id}"] + + def test_get_raw_crash(self, gcs_helper): + crashstorage = build_instance_from_settings(CRASHSTORAGE_SETTINGS) + bucket = CRASHSTORAGE_SETTINGS["options"]["bucket"] + now = utc_now() + crash_id = create_new_ooid(timestamp=now) + original_raw_crash = {"submitted_timestamp": date_to_string(now)} + + gcs_helper.create_bucket(bucket) + gcs_helper.upload( + bucket_name=bucket, + key=f"v1/raw_crash/20{crash_id[-6:]}/{crash_id}", + data=dict_to_str(original_raw_crash).encode("utf-8"), + ) + + result = crashstorage.get_raw_crash(crash_id) + expected = { + "submitted_timestamp": original_raw_crash["submitted_timestamp"], + } + assert result == expected + + def test_get_raw_crash_not_found(self, gcs_helper): + crashstorage = build_instance_from_settings(CRASHSTORAGE_SETTINGS) + bucket = CRASHSTORAGE_SETTINGS["options"]["bucket"] + crash_id = create_new_ooid() + + gcs_helper.create_bucket(bucket) + with pytest.raises(CrashIDNotFound): + crashstorage.get_raw_crash(crash_id) + + def test_get_raw_dump(self, gcs_helper): + """test fetching the raw dump without naming it""" + crashstorage = build_instance_from_settings(CRASHSTORAGE_SETTINGS) + bucket = CRASHSTORAGE_SETTINGS["options"]["bucket"] + crash_id = create_new_ooid() + + gcs_helper.create_bucket(bucket) + gcs_helper.upload( + bucket_name=bucket, + key=f"v1/dump/{crash_id}", + data=b"this is a raw dump", + ) + + result = crashstorage.get_raw_dump(crash_id) + assert result == b"this is a raw dump" + + def test_get_raw_dump_not_found(self, gcs_helper): + crashstorage = build_instance_from_settings(CRASHSTORAGE_SETTINGS) + bucket = CRASHSTORAGE_SETTINGS["options"]["bucket"] + crash_id = create_new_ooid() + + gcs_helper.create_bucket(bucket) + with pytest.raises(CrashIDNotFound): + crashstorage.get_raw_dump(crash_id) + + def test_get_raw_dump_upload_file_minidump(self, gcs_helper): + """test fetching the raw dump, naming it 'upload_file_minidump'""" + crashstorage = build_instance_from_settings(CRASHSTORAGE_SETTINGS) + bucket = CRASHSTORAGE_SETTINGS["options"]["bucket"] + crash_id = create_new_ooid() + + gcs_helper.create_bucket(bucket) + gcs_helper.upload( + bucket_name=bucket, + key=f"v1/dump/{crash_id}", + data=b"this is a raw dump", + ) + + result = crashstorage.get_raw_dump(crash_id, name="upload_file_minidump") + assert result == b"this is a raw dump" + + def test_get_raw_dump_empty_string(self, gcs_helper): + """test fetching the raw dump, naming it with empty string""" + crashstorage = build_instance_from_settings(CRASHSTORAGE_SETTINGS) + bucket = CRASHSTORAGE_SETTINGS["options"]["bucket"] + crash_id = create_new_ooid() + + gcs_helper.create_bucket(bucket) + gcs_helper.upload( + bucket_name=bucket, + key=f"v1/dump/{crash_id}", + data=b"this is a raw dump", + ) + + result = crashstorage.get_raw_dump(crash_id, name="") + assert result == b"this is a raw dump" + + def test_get_dumps(self, gcs_helper): + crashstorage = build_instance_from_settings(CRASHSTORAGE_SETTINGS) + bucket = CRASHSTORAGE_SETTINGS["options"]["bucket"] + crash_id = create_new_ooid() + + gcs_helper.create_bucket(bucket) + gcs_helper.upload( + bucket_name=bucket, + key=f"v1/dump_names/{crash_id}", + data=b'["dump", "content_dump", "city_dump"]', + ) + gcs_helper.upload( + bucket_name=bucket, + key=f"v1/dump/{crash_id}", + data=b'this is "dump", the first one', + ) + gcs_helper.upload( + bucket_name=bucket, + key=f"v1/content_dump/{crash_id}", + data=b'this is "content_dump", the second one', + ) + gcs_helper.upload( + bucket_name=bucket, + key=f"v1/city_dump/{crash_id}", + data=b'this is "city_dump", the last one', + ) + + result = crashstorage.get_dumps(crash_id) + assert result == { + "dump": b'this is "dump", the first one', + "content_dump": b'this is "content_dump", the second one', + "city_dump": b'this is "city_dump", the last one', + } + + def test_get_dumps_not_found(self, gcs_helper): + crashstorage = build_instance_from_settings(CRASHSTORAGE_SETTINGS) + bucket = CRASHSTORAGE_SETTINGS["options"]["bucket"] + crash_id = create_new_ooid() + + gcs_helper.create_bucket(bucket) + with pytest.raises(CrashIDNotFound): + crashstorage.get_dumps(crash_id) + + def test_get_dumps_as_files(self, gcs_helper, tmp_path): + crashstorage = build_instance_from_settings(CRASHSTORAGE_SETTINGS) + bucket = CRASHSTORAGE_SETTINGS["options"]["bucket"] + crash_id = create_new_ooid() + + gcs_helper.create_bucket(bucket) + gcs_helper.upload( + bucket_name=bucket, + key=f"v1/dump_names/{crash_id}", + data=b'["dump", "content_dump", "city_dump"]', + ) + gcs_helper.upload( + bucket_name=bucket, + key=f"v1/dump/{crash_id}", + data=b'this is "dump", the first one', + ) + gcs_helper.upload( + bucket_name=bucket, + key=f"v1/content_dump/{crash_id}", + data=b'this is "content_dump", the second one', + ) + gcs_helper.upload( + bucket_name=bucket, + key=f"v1/city_dump/{crash_id}", + data=b'this is "city_dump", the last one', + ) + + result = crashstorage.get_dumps_as_files( + crash_id=crash_id, tmpdir=str(tmp_path) + ) + + # We don't care much about the mocked internals as the bulk of that function is + # tested elsewhere. We just need to be concerned about the file writing worked. + expected = { + "content_dump": os.path.join( + str(tmp_path), + f"{crash_id}.content_dump.TEMPORARY.dump", + ), + "city_dump": os.path.join( + str(tmp_path), + f"{crash_id}.city_dump.TEMPORARY.dump", + ), + "upload_file_minidump": os.path.join( + str(tmp_path), + f"{crash_id}.upload_file_minidump.TEMPORARY.dump", + ), + } + assert result == expected + + def test_get_processed_crash(self, gcs_helper): + crashstorage = build_instance_from_settings(CRASHSTORAGE_SETTINGS) + bucket = CRASHSTORAGE_SETTINGS["options"]["bucket"] + crash_id = create_new_ooid() + gcs_helper.create_bucket(bucket) + + processed_crash = { + "a": {"b": {"c": 11}}, + "sensitive": {"x": 2}, + "not_url": "not a url", + # These keys do not survive redaction + "url": "http://example.com", + "json_dump": {"sensitive": 22}, + } + + gcs_helper.upload( + bucket_name=bucket, + key=f"v1/processed_crash/{crash_id}", + data=dict_to_str(processed_crash).encode("utf-8"), + ) + + result = crashstorage.get_processed_crash(crash_id) + assert result == processed_crash + + def test_get_processed_not_found(self, gcs_helper): + crashstorage = build_instance_from_settings(CRASHSTORAGE_SETTINGS) + bucket = CRASHSTORAGE_SETTINGS["options"]["bucket"] + crash_id = create_new_ooid() + + gcs_helper.create_bucket(bucket) + with pytest.raises(CrashIDNotFound): + crashstorage.get_processed_crash(crash_id) + + +class TestTelemetryGcsCrashStorage: + def test_save_processed_crash(self, gcs_helper): + crashstorage = build_instance_from_settings(TELEMETRY_SETTINGS) + bucket = TELEMETRY_SETTINGS["options"]["bucket"] + now = utc_now() + crash_id = create_new_ooid(timestamp=now) + original_raw_crash = {"submitted_timestamp": date_to_string(now)} + gcs_helper.create_bucket(bucket) + + # Run save_processed_crash + crashstorage.save_processed_crash( + raw_crash=original_raw_crash, + processed_crash={ + "uuid": crash_id, + "completed_datetime": date_to_string(now), + "signature": "now_this_is_a_signature", + "os_name": "Linux", + "some_random_key": "should not appear", + "json_dump": { + "crash_info": { + "address": "0x6357737b", + "some_random_key": "should not appear", + }, + "crashing_thread": { + "frames": [ + { + "frame": 0, + "module": "xul.dll", + "function": None, + "some_random_key": "should not appear", + }, + ], + }, + }, + }, + ) + + # Get the crash data we just saved from the bucket and verify it's contents + crash_data = gcs_helper.download( + bucket_name=bucket, + key=f"v1/crash_report/20{crash_id[-6:]}/{crash_id}", + ) + assert json.loads(crash_data) == { + "platform": "Linux", + "signature": "now_this_is_a_signature", + "uuid": crash_id, + "json_dump": { + "crash_info": { + "address": "0x6357737b", + }, + "crashing_thread": { + "frames": [ + { + "frame": 0, + "function": None, + "module": "xul.dll", + }, + ], + }, + }, + } + + def test_get_processed_crash(self, gcs_helper): + crashstorage = build_instance_from_settings(TELEMETRY_SETTINGS) + bucket = TELEMETRY_SETTINGS["options"]["bucket"] + crash_id = create_new_ooid() + gcs_helper.create_bucket(bucket) + + crash_data = { + "platform": "Linux", + "signature": "now_this_is_a_signature", + "uuid": crash_id, + } + + # Save the data to GCS so we have something to get + gcs_helper.upload( + bucket_name=bucket, + key=f"v1/crash_report/20{crash_id[-6:]}/{crash_id}", + data=json.dumps(crash_data).encode("utf-8"), + ) + + # Get the crash and assert it's the same data + data = crashstorage.get_processed_crash(crash_id=crash_id) + assert data == crash_data diff --git a/webapp/crashstats/api/tests/test_views.py b/webapp/crashstats/api/tests/test_views.py index 821a2fdb92..547e1fad55 100644 --- a/webapp/crashstats/api/tests/test_views.py +++ b/webapp/crashstats/api/tests/test_views.py @@ -598,11 +598,11 @@ def mocked_supersearch_get(**params): mock_ss.return_value.get.side_effect = mocked_supersearch_get yield - def create_s3_buckets(self, s3_helper): - bucket = s3_helper.get_crashstorage_bucket() - s3_helper.create_bucket(bucket) - telemetry_bucket = s3_helper.get_telemetry_bucket() - s3_helper.create_bucket(telemetry_bucket) + def create_storage_buckets(self, storage_helper): + bucket = storage_helper.get_crashstorage_bucket() + storage_helper.create_bucket(bucket) + telemetry_bucket = storage_helper.get_telemetry_bucket() + storage_helper.create_bucket(telemetry_bucket) def test_bad_uuid(self, client): url = reverse("api:crash_verify") @@ -612,8 +612,8 @@ def test_bad_uuid(self, client): data = json.loads(resp.content) assert data == {"error": "unknown crash id"} - def test_elastcsearch_has_crash(self, s3_helper, client): - self.create_s3_buckets(s3_helper) + def test_elastcsearch_has_crash(self, storage_helper, client): + self.create_storage_buckets(storage_helper) uuid = create_new_ooid() @@ -632,15 +632,15 @@ def test_elastcsearch_has_crash(self, s3_helper, client): "s3_telemetry_crash": False, } - def test_raw_crash_has_crash(self, s3_helper, client): - self.create_s3_buckets(s3_helper) + def test_raw_crash_has_crash(self, storage_helper, client): + self.create_storage_buckets(storage_helper) uuid = create_new_ooid() crash_data = {"submitted_timestamp": "2018-03-14-09T22:21:18.646733+00:00"} - bucket = s3_helper.get_crashstorage_bucket() + bucket = storage_helper.get_crashstorage_bucket() raw_crash_key = "v1/raw_crash/20%s/%s" % (uuid[-6:], uuid) - s3_helper.upload_fileobj( + storage_helper.upload( bucket_name=bucket, key=raw_crash_key, data=json.dumps(crash_data).encode("utf-8"), @@ -661,8 +661,8 @@ def test_raw_crash_has_crash(self, s3_helper, client): "s3_telemetry_crash": False, } - def test_processed_has_crash(self, s3_helper, client): - self.create_s3_buckets(s3_helper) + def test_processed_has_crash(self, storage_helper, client): + self.create_storage_buckets(storage_helper) uuid = create_new_ooid() crash_data = { @@ -671,8 +671,8 @@ def test_processed_has_crash(self, s3_helper, client): "completed_datetime": "2022-03-14 10:56:50.902884", } - bucket = s3_helper.get_crashstorage_bucket() - s3_helper.upload_fileobj( + bucket = storage_helper.get_crashstorage_bucket() + storage_helper.upload( bucket_name=bucket, key="v1/processed_crash/%s" % uuid, data=json.dumps(crash_data, cls=DateTimeEncoder).encode("utf-8"), @@ -693,8 +693,8 @@ def test_processed_has_crash(self, s3_helper, client): "s3_telemetry_crash": False, } - def test_telemetry_has_crash(self, s3_helper, client): - self.create_s3_buckets(s3_helper) + def test_telemetry_has_crash(self, storage_helper, client): + self.create_storage_buckets(storage_helper) uuid = create_new_ooid() crash_data = { @@ -703,8 +703,8 @@ def test_telemetry_has_crash(self, s3_helper, client): "uuid": uuid, } - telemetry_bucket = s3_helper.get_telemetry_bucket() - s3_helper.upload_fileobj( + telemetry_bucket = storage_helper.get_telemetry_bucket() + storage_helper.upload( bucket_name=telemetry_bucket, key="v1/crash_report/20%s/%s" % (uuid[-6:], uuid), data=json.dumps(crash_data).encode("utf-8"), diff --git a/webapp/crashstats/crashstats/management/commands/updatemissing.py b/webapp/crashstats/crashstats/management/commands/updatemissing.py index edc011c2eb..dc902185e0 100644 --- a/webapp/crashstats/crashstats/management/commands/updatemissing.py +++ b/webapp/crashstats/crashstats/management/commands/updatemissing.py @@ -9,7 +9,7 @@ from django.core.management.base import BaseCommand from crashstats.crashstats.management.commands.verifyprocessed import ( - is_in_s3, + is_in_storage, check_elasticsearch, ) from crashstats.supersearch.models import SuperSearchUnredacted @@ -23,7 +23,7 @@ class Command(BaseCommand): def check_past_missing(self): """Check the table for missing crashes and check to see if they exist.""" - s3_crash_dest = build_instance_from_settings(socorro_settings.S3_STORAGE) + crash_dest = build_instance_from_settings(socorro_settings.STORAGE) supersearch = SuperSearchUnredacted() @@ -36,7 +36,7 @@ def check_past_missing(self): no_longer_missing = [] for crash_id in crash_ids: - if is_in_s3(s3_crash_dest, crash_id): + if is_in_storage(crash_dest, crash_id): missing = check_elasticsearch(supersearch, crash_id) if not missing: no_longer_missing.append(crash_id) diff --git a/webapp/crashstats/crashstats/management/commands/verifyprocessed.py b/webapp/crashstats/crashstats/management/commands/verifyprocessed.py index 874cef8529..781d50eb5c 100644 --- a/webapp/crashstats/crashstats/management/commands/verifyprocessed.py +++ b/webapp/crashstats/crashstats/management/commands/verifyprocessed.py @@ -28,9 +28,6 @@ from socorro.libclass import build_instance_from_settings -RAW_CRASH_PREFIX_TEMPLATE = "v1/raw_crash/%s/%s" -PROCESSED_CRASH_TEMPLATE = "v1/processed_crash/%s" - # Number of seconds until we decide a worker has stalled WORKER_TIMEOUT = 15 * 60 @@ -41,10 +38,9 @@ metrics = markus.get_metrics("cron.verifyprocessed") -def is_in_s3(s3_crash_dest, crash_id): - """Is the processed crash in S3.""" - key = PROCESSED_CRASH_TEMPLATE % crash_id - return s3_crash_dest.exists_object(key) +def is_in_storage(crash_dest, crash_id): + """Is the processed crash in storage.""" + return crash_dest.exists_object(f"v1/processed_crash/{crash_id}") def check_elasticsearch(supersearch, crash_ids): @@ -76,8 +72,8 @@ def check_elasticsearch(supersearch, crash_ids): def check_crashids_for_date(firstchars_chunk, date): """Check crash ids for a given firstchars and date""" - s3_crash_source = build_instance_from_settings(socorro_settings.CRASH_SOURCE) - s3_crash_dest = build_instance_from_settings(socorro_settings.S3_STORAGE) + crash_source = build_instance_from_settings(socorro_settings.CRASH_SOURCE) + crash_dest = build_instance_from_settings(socorro_settings.STORAGE) supersearch = SuperSearchUnredacted() @@ -85,26 +81,21 @@ def check_crashids_for_date(firstchars_chunk, date): for firstchars in firstchars_chunk: # Grab all the crash ids at the given date directory - raw_crash_key_prefix = RAW_CRASH_PREFIX_TEMPLATE % (date, firstchars) - - page_iterator = s3_crash_source.connection.list_objects_paginator( - bucket=s3_crash_source.bucket, - prefix=raw_crash_key_prefix, + page_iterator = crash_source.list_objects_paginator( + prefix=f"v1/raw_crash/{date}/{firstchars}", ) for page in page_iterator: # NOTE(willkg): Keys here look like /v2/raw_crash/ENTROPY/DATE/CRASHID or # /v1/raw_crash/DATE/CRASHID - crash_ids = [ - item["Key"].split("/")[-1] for item in page.get("Contents", []) - ] + crash_ids = [item.split("/")[-1] for item in page] if not crash_ids: continue - # Check S3 first + # Check storage first for crash_id in crash_ids: - if not is_in_s3(s3_crash_dest, crash_id): + if not is_in_storage(crash_dest, crash_id): missing.append(crash_id) # Check Elasticsearch in batches diff --git a/webapp/crashstats/crashstats/models.py b/webapp/crashstats/crashstats/models.py index 0a36f4f524..0b53940c7a 100644 --- a/webapp/crashstats/crashstats/models.py +++ b/webapp/crashstats/crashstats/models.py @@ -542,6 +542,8 @@ class TelemetryCrash(SocorroMiddleware): delete = None def get_implementation(self): + if socorro_settings.CLOUD_PROVIDER == "GCP": + return build_instance_from_settings(socorro_settings.TELEMETRY_STORAGE) s3_settings = socorro_settings.TELEMETRY_STORAGE["options"] # FIXME(willkg): change this to BotoS3CrashStorage return TelemetryCrashData(**s3_settings) @@ -627,6 +629,8 @@ class ProcessedCrash(SocorroMiddleware): API_ALLOWLIST = None def get_implementation(self): + if socorro_settings.CLOUD_PROVIDER == "GCP": + return build_instance_from_settings(socorro_settings.STORAGE) # FIXME(willkg): change this to BotoS3CrashStorage s3_settings = socorro_settings.S3_STORAGE["options"] return SimplifiedCrashData(**s3_settings) @@ -736,6 +740,8 @@ class RawCrash(SocorroMiddleware): API_BINARY_PERMISSIONS = ("crashstats.view_rawdump",) def get_implementation(self): + if socorro_settings.CLOUD_PROVIDER == "GCP": + return build_instance_from_settings(socorro_settings.STORAGE) s3_settings = socorro_settings.S3_STORAGE["options"] # FIXME(willkg): change this to BotoS3CrashStorage return SimplifiedCrashData(**s3_settings) diff --git a/webapp/crashstats/crashstats/tests/test_models.py b/webapp/crashstats/crashstats/tests/test_models.py index 991d74148c..2b043df76b 100644 --- a/webapp/crashstats/crashstats/tests/test_models.py +++ b/webapp/crashstats/crashstats/tests/test_models.py @@ -17,7 +17,7 @@ from crashstats.crashstats.tests.conftest import Response from crashstats.crashstats.tests.testbase import DjangoTestCase from socorro import settings as socorro_settings -from socorro.external.boto.crashstorage import dict_to_str, build_keys as s3_build_keys +from socorro.external.boto.crashstorage import dict_to_str, build_keys from socorro.lib import BadArgumentError from socorro.libclass import build_instance_from_settings from socorro.lib.libooid import create_new_ooid, date_from_ooid @@ -397,7 +397,7 @@ def mocked_get(url, **options): class TestProcessedCrash: - def test_api(self, s3_helper): + def test_api(self, storage_helper): api = models.ProcessedCrash() crash_id = create_new_ooid() @@ -417,10 +417,10 @@ def test_api(self, s3_helper): ], } - key = s3_build_keys("processed_crash", crash_id)[0] - crashstorage = build_instance_from_settings(socorro_settings.S3_STORAGE) + key = build_keys("processed_crash", crash_id)[0] + crashstorage = build_instance_from_settings(socorro_settings.STORAGE) data = dict_to_str(processed_crash).encode("utf-8") - s3_helper.upload_fileobj(bucket_name=crashstorage.bucket, key=key, data=data) + storage_helper.upload(bucket_name=crashstorage.bucket, key=key, data=data) ret = api.get(crash_id=crash_id) assert ret == { @@ -442,7 +442,7 @@ def test_api(self, s3_helper): class TestRawCrash: - def test_api(self, s3_helper): + def test_api(self, storage_helper): api = models.RawCrash() crash_id = create_new_ooid() @@ -454,10 +454,10 @@ def test_api(self, s3_helper): "version": 2, } - key = s3_build_keys("raw_crash", crash_id)[0] - crashstorage = build_instance_from_settings(socorro_settings.S3_STORAGE) + key = build_keys("raw_crash", crash_id)[0] + crashstorage = build_instance_from_settings(socorro_settings.STORAGE) data = dict_to_str(raw_crash).encode("utf-8") - s3_helper.upload_fileobj(bucket_name=crashstorage.bucket, key=key, data=data) + storage_helper.upload(bucket_name=crashstorage.bucket, key=key, data=data) ret = api.get(crash_id=crash_id) assert ret == { @@ -473,15 +473,15 @@ def test_invalid_id(self): with pytest.raises(BadArgumentError): api.get(crash_id="821fcd0c-d925-4900-85b6-687250180607docker/as_me.sh") - def test_raw_data(self, s3_helper): + def test_raw_data(self, storage_helper): api = models.RawCrash() crash_id = create_new_ooid() dump = b"abcde" - key = s3_build_keys("dump", crash_id)[0] - crashstorage = build_instance_from_settings(socorro_settings.S3_STORAGE) - s3_helper.upload_fileobj(bucket_name=crashstorage.bucket, key=key, data=dump) + key = build_keys("dump", crash_id)[0] + crashstorage = build_instance_from_settings(socorro_settings.STORAGE) + storage_helper.upload(bucket_name=crashstorage.bucket, key=key, data=dump) r = api.get(crash_id=crash_id, format="raw", name="upload_file_minidump") assert r == dump diff --git a/webapp/crashstats/crashstats/tests/test_updatemissing.py b/webapp/crashstats/crashstats/tests/test_updatemissing.py index 1d2dcd76e2..6303ae1ec1 100644 --- a/webapp/crashstats/crashstats/tests/test_updatemissing.py +++ b/webapp/crashstats/crashstats/tests/test_updatemissing.py @@ -2,7 +2,6 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at https://mozilla.org/MPL/2.0/. -import os from crashstats.crashstats.models import MissingProcessedCrash from crashstats.crashstats.management.commands.updatemissing import Command @@ -14,15 +13,15 @@ class TestUpdateMissing: - def create_raw_crash_in_s3(self, s3_helper, bucket_name, crash_id): - s3_helper.upload_fileobj( + def create_raw_crash_in_storage(self, storage_helper, bucket_name, crash_id): + storage_helper.upload( bucket_name=bucket_name, key=f"v1/raw_crash/{TODAY}/{crash_id}", data=b"test", ) - def create_processed_crash_in_s3(self, s3_helper, bucket_name, crash_id): - s3_helper.upload_fileobj( + def create_processed_crash_in_storage(self, storage_helper, bucket_name, crash_id): + storage_helper.upload( bucket_name=bucket_name, key=f"v1/processed_crash/{crash_id}", data=b"test", @@ -51,18 +50,22 @@ def test_past_missing_still_missing(self, capsys, db): mpe = MissingProcessedCrash.objects.get(crash_id=crash_id) assert mpe.is_processed is False - def test_past_missing_no_longer_missing(self, capsys, db, es_helper, s3_helper): - # Create a MissingProcessedCrash row and put the processed crash in the S3 + def test_past_missing_no_longer_missing( + self, capsys, db, es_helper, storage_helper + ): + # Create a MissingProcessedCrash row and put the processed crash in the # bucket. After check_past_missing() runs, the MissingProcessedCrash should # have is_processed=True. crash_id = create_new_ooid() mpe = MissingProcessedCrash(crash_id=crash_id, is_processed=False) mpe.save() - bucket = os.environ["CRASHSTORAGE_S3_BUCKET"] - self.create_raw_crash_in_s3(s3_helper, bucket_name=bucket, crash_id=crash_id) - self.create_processed_crash_in_s3( - s3_helper, bucket_name=bucket, crash_id=crash_id + bucket = storage_helper.get_crashstorage_bucket() + self.create_raw_crash_in_storage( + storage_helper, bucket_name=bucket, crash_id=crash_id + ) + self.create_processed_crash_in_storage( + storage_helper, bucket_name=bucket, crash_id=crash_id ) self.create_processed_crash_in_es(es_helper, crash_id) diff --git a/webapp/crashstats/crashstats/tests/test_verifyprocessed.py b/webapp/crashstats/crashstats/tests/test_verifyprocessed.py index 6a72510178..4376a2e4cb 100644 --- a/webapp/crashstats/crashstats/tests/test_verifyprocessed.py +++ b/webapp/crashstats/crashstats/tests/test_verifyprocessed.py @@ -29,15 +29,15 @@ def fetch_crashids(self): "crash_id", flat=True ) - def create_raw_crash_in_s3(self, s3_helper, bucket_name, crash_id): - s3_helper.upload_fileobj( + def create_raw_crash_in_storage(self, storage_helper, bucket_name, crash_id): + storage_helper.upload( bucket_name=bucket_name, key=f"v1/raw_crash/{TODAY}/{crash_id}", data=b"test", ) - def create_processed_crash_in_s3(self, s3_helper, bucket_name, crash_id): - s3_helper.upload_fileobj( + def create_processed_crash_in_storage(self, storage_helper, bucket_name, crash_id): + storage_helper.upload( bucket_name=bucket_name, key=f"v1/processed_crash/{crash_id}", data=b"test", @@ -62,23 +62,23 @@ def test_get_threechars(self): assert threechars[0] == "000" assert threechars[-1] == "fff" - def test_no_crashes(self, s3_helper, monkeypatch): + def test_no_crashes(self, storage_helper, monkeypatch): """Verify no crashes in bucket result in no missing crashes.""" monkeypatch.setattr(Command, "get_threechars", get_threechars_subset) - bucket = os.environ["CRASHSTORAGE_S3_BUCKET"] - s3_helper.create_bucket(bucket) + bucket = storage_helper.get_crashstorage_bucket() + storage_helper.create_bucket(bucket) cmd = Command() missing = cmd.find_missing(num_workers=1, date=TODAY) assert missing == [] - def test_no_missing_crashes(self, s3_helper, es_helper, monkeypatch): + def test_no_missing_crashes(self, storage_helper, es_helper, monkeypatch): """Verify raw crashes with processed crashes result in no missing crashes.""" monkeypatch.setattr(Command, "get_threechars", get_threechars_subset) - bucket = os.environ["CRASHSTORAGE_S3_BUCKET"] - s3_helper.create_bucket(bucket) + bucket = storage_helper.get_crashstorage_bucket() + storage_helper.create_bucket(bucket) # Create a few raw and processed crashes crashids = [ @@ -87,11 +87,11 @@ def test_no_missing_crashes(self, s3_helper, es_helper, monkeypatch): "000" + create_new_ooid()[3:], ] for crash_id in crashids: - self.create_raw_crash_in_s3( - s3_helper, bucket_name=bucket, crash_id=crash_id + self.create_raw_crash_in_storage( + storage_helper, bucket_name=bucket, crash_id=crash_id ) - self.create_processed_crash_in_s3( - s3_helper, bucket_name=bucket, crash_id=crash_id + self.create_processed_crash_in_storage( + storage_helper, bucket_name=bucket, crash_id=crash_id ) self.create_processed_crash_in_es(es_helper, crash_id=crash_id) @@ -101,49 +101,57 @@ def test_no_missing_crashes(self, s3_helper, es_helper, monkeypatch): missing = cmd.find_missing(num_workers=1, date=TODAY) assert missing == [] - def test_missing_crashes(self, s3_helper, es_helper, monkeypatch): + def test_missing_crashes(self, storage_helper, es_helper, monkeypatch): """Verify it finds a missing crash.""" monkeypatch.setattr(Command, "get_threechars", get_threechars_subset) - bucket = os.environ["CRASHSTORAGE_S3_BUCKET"] - s3_helper.create_bucket(bucket) + bucket = storage_helper.get_crashstorage_bucket() + storage_helper.create_bucket(bucket) # Create a raw and processed crash crash_id_1 = "000" + create_new_ooid()[3:] - self.create_raw_crash_in_s3(s3_helper, bucket_name=bucket, crash_id=crash_id_1) - self.create_processed_crash_in_s3( - s3_helper, bucket_name=bucket, crash_id=crash_id_1 + self.create_raw_crash_in_storage( + storage_helper, bucket_name=bucket, crash_id=crash_id_1 + ) + self.create_processed_crash_in_storage( + storage_helper, bucket_name=bucket, crash_id=crash_id_1 ) self.create_processed_crash_in_es(es_helper, crash_id=crash_id_1) # Create a raw crash crash_id_2 = "000" + create_new_ooid()[3:] - self.create_raw_crash_in_s3(s3_helper, bucket_name=bucket, crash_id=crash_id_2) + self.create_raw_crash_in_storage( + storage_helper, bucket_name=bucket, crash_id=crash_id_2 + ) cmd = Command() missing = cmd.find_missing(num_workers=1, date=TODAY) assert missing == [crash_id_2] - def test_missing_crashes_es(self, s3_helper, es_helper, monkeypatch): + def test_missing_crashes_es(self, storage_helper, es_helper, monkeypatch): """Verify it finds a processed crash missing in ES.""" monkeypatch.setattr(Command, "get_threechars", get_threechars_subset) - bucket = os.environ["CRASHSTORAGE_S3_BUCKET"] - s3_helper.create_bucket(bucket) + bucket = storage_helper.get_crashstorage_bucket() + storage_helper.create_bucket(bucket) # Create a raw and processed crash crash_id_1 = "000" + create_new_ooid()[3:] - self.create_raw_crash_in_s3(s3_helper, bucket_name=bucket, crash_id=crash_id_1) - self.create_processed_crash_in_s3( - s3_helper, bucket_name=bucket, crash_id=crash_id_1 + self.create_raw_crash_in_storage( + storage_helper, bucket_name=bucket, crash_id=crash_id_1 + ) + self.create_processed_crash_in_storage( + storage_helper, bucket_name=bucket, crash_id=crash_id_1 ) self.create_processed_crash_in_es(es_helper, crash_id=crash_id_1) # Create a raw crash crash_id_2 = "000" + create_new_ooid()[3:] - self.create_raw_crash_in_s3(s3_helper, bucket_name=bucket, crash_id=crash_id_2) - self.create_processed_crash_in_s3( - s3_helper, bucket_name=bucket, crash_id=crash_id_2 + self.create_raw_crash_in_storage( + storage_helper, bucket_name=bucket, crash_id=crash_id_2 + ) + self.create_processed_crash_in_storage( + storage_helper, bucket_name=bucket, crash_id=crash_id_2 ) cmd = Command() diff --git a/webapp/crashstats/crashstats/tests/test_views.py b/webapp/crashstats/crashstats/tests/test_views.py index 6560b707ab..88a857173f 100644 --- a/webapp/crashstats/crashstats/tests/test_views.py +++ b/webapp/crashstats/crashstats/tests/test_views.py @@ -88,20 +88,20 @@ def build_crash_data(): return crash_id, raw_crash, processed_crash -def upload_crash_data(s3_helper, raw_crash, processed_crash): +def upload_crash_data(storage_helper, raw_crash, processed_crash): """Validate crash data and upload it to crash bucket""" crash_id = processed_crash["uuid"] - bucket = s3_helper.get_crashstorage_bucket() + bucket = storage_helper.get_crashstorage_bucket() validate_instance(raw_crash, RAW_CRASH_SCHEMA) raw_key = build_keys("raw_crash", crash_id)[0] - s3_helper.upload_fileobj( + storage_helper.upload( bucket_name=bucket, key=raw_key, data=dict_to_str(raw_crash).encode("utf-8") ) validate_instance(processed_crash, PROCESSED_CRASH_SCHEMA) processed_key = build_keys("processed_crash", crash_id)[0] - s3_helper.upload_fileobj( + storage_helper.upload( bucket_name=bucket, key=processed_key, data=dict_to_str(processed_crash).encode("utf-8"), @@ -405,7 +405,7 @@ def test_quick_search_metrics(self, client): class Test_report_index: - def test_report_index(self, client, db, s3_helper, user_helper): + def test_report_index(self, client, db, storage_helper, user_helper): json_dump = { "system_info": { "os": "Mac OS X", @@ -419,7 +419,7 @@ def test_report_index(self, client, db, s3_helper, user_helper): crash_id, raw_crash, processed_crash = build_crash_data() processed_crash["json_dump"] = json_dump upload_crash_data( - s3_helper, raw_crash=raw_crash, processed_crash=processed_crash + storage_helper, raw_crash=raw_crash, processed_crash=processed_crash ) models.BugAssociation.objects.create(bug_id=222222, signature="FakeSignature1") @@ -494,13 +494,13 @@ def test_report_index(self, client, db, s3_helper, user_helper): assert processed_crash["user_comments"] not in content assert raw_crash["URL"] not in content - def test_raw_crash_unicode_key(self, client, db, s3_helper, user_helper): + def test_raw_crash_unicode_key(self, client, db, storage_helper, user_helper): crash_id, raw_crash, processed_crash = build_crash_data() # NOTE(willkg): The collector doesn't remove non-ascii keys currently. At some # point, it probably should. raw_crash["Pr\u00e9nom"] = "Peter" upload_crash_data( - s3_helper, raw_crash=raw_crash, processed_crash=processed_crash + storage_helper, raw_crash=raw_crash, processed_crash=processed_crash ) # Log in with protected data access to view all data @@ -512,7 +512,7 @@ def test_raw_crash_unicode_key(self, client, db, s3_helper, user_helper): assert response.status_code == 200 assert "Pr\u00e9nom" in smart_str(response.content) - def test_additional_raw_dump_links(self, client, db, s3_helper, user_helper): + def test_additional_raw_dump_links(self, client, db, storage_helper, user_helper): json_dump = { "system_info": { "os": "Mac OS X", @@ -533,7 +533,7 @@ def test_additional_raw_dump_links(self, client, db, s3_helper, user_helper): } processed_crash["json_dump"] = json_dump upload_crash_data( - s3_helper, raw_crash=raw_crash, processed_crash=processed_crash + storage_helper, raw_crash=raw_crash, processed_crash=processed_crash ) # Expect these urls @@ -588,7 +588,7 @@ def test_additional_raw_dump_links(self, client, db, s3_helper, user_helper): ) assert bar_dump_url in smart_str(response.content) - def test_symbol_url_in_modules(self, client, db, s3_helper, user_helper): + def test_symbol_url_in_modules(self, client, db, storage_helper, user_helper): json_dump = { "status": "OK", "threads": [], @@ -621,7 +621,7 @@ def test_symbol_url_in_modules(self, client, db, s3_helper, user_helper): raw_crash["additional_minidumps"] = "foo, bar," processed_crash["json_dump"] = json_dump upload_crash_data( - s3_helper, raw_crash=raw_crash, processed_crash=processed_crash + storage_helper, raw_crash=raw_crash, processed_crash=processed_crash ) url = reverse("crashstats:report_index", args=(crash_id,)) @@ -633,7 +633,7 @@ def test_symbol_url_in_modules(self, client, db, s3_helper, user_helper): response.content ) - def test_cert_subject_in_modules(self, client, db, s3_helper): + def test_cert_subject_in_modules(self, client, db, storage_helper): json_dump = { "status": "OK", "threads": [], @@ -667,7 +667,7 @@ def test_cert_subject_in_modules(self, client, db, s3_helper): crash_id, raw_crash, processed_crash = build_crash_data() processed_crash["json_dump"] = json_dump upload_crash_data( - s3_helper, raw_crash=raw_crash, processed_crash=processed_crash + storage_helper, raw_crash=raw_crash, processed_crash=processed_crash ) url = reverse("crashstats:report_index", args=(crash_id,)) @@ -683,7 +683,7 @@ def test_cert_subject_in_modules(self, client, db, s3_helper): smart_str(response.content), ) - def test_unloaded_modules(self, client, db, s3_helper): + def test_unloaded_modules(self, client, db, storage_helper): json_dump = { "status": "OK", "threads": [], @@ -708,7 +708,7 @@ def test_unloaded_modules(self, client, db, s3_helper): crash_id, raw_crash, processed_crash = build_crash_data() processed_crash["json_dump"] = json_dump upload_crash_data( - s3_helper, raw_crash=raw_crash, processed_crash=processed_crash + storage_helper, raw_crash=raw_crash, processed_crash=processed_crash ) url = reverse("crashstats:report_index", args=(crash_id,)) @@ -721,7 +721,7 @@ def test_unloaded_modules(self, client, db, s3_helper): # Assert that the second unloaded module cert subject shows up assert "Microsoft Windows" in smart_str(response.content) - def test_shutdownhang_signature(self, client, db, s3_helper): + def test_shutdownhang_signature(self, client, db, storage_helper): json_dump = { "crash_info": {"crashing_thread": 2}, "status": "OK", @@ -739,7 +739,7 @@ def test_shutdownhang_signature(self, client, db, s3_helper): processed_crash["json_dump"] = json_dump processed_crash["signature"] = "shutdownhang | foo::bar()" upload_crash_data( - s3_helper, raw_crash=raw_crash, processed_crash=processed_crash + storage_helper, raw_crash=raw_crash, processed_crash=processed_crash ) url = reverse("crashstats:report_index", args=(crash_id,)) @@ -749,7 +749,7 @@ def test_shutdownhang_signature(self, client, db, s3_helper): assert "Crashing Thread (2)" not in smart_str(response.content) assert "Crashing Thread (0)" in smart_str(response.content) - def test_no_crashing_thread(self, client, db, s3_helper): + def test_no_crashing_thread(self, client, db, storage_helper): # If the json_dump has no crashing thread available, do not display a # specific crashing thread, but instead display all threads. json_dump = { @@ -767,7 +767,7 @@ def test_no_crashing_thread(self, client, db, s3_helper): processed_crash["json_dump"] = json_dump processed_crash["signature"] = "foo::bar()" upload_crash_data( - s3_helper, raw_crash=raw_crash, processed_crash=processed_crash + storage_helper, raw_crash=raw_crash, processed_crash=processed_crash ) url = reverse("crashstats:report_index", args=(crash_id,)) @@ -779,7 +779,7 @@ def test_no_crashing_thread(self, client, db, s3_helper): assert "Thread 1" in smart_str(response.content) assert "Thread 2" in smart_str(response.content) - def test_crashing_thread_table(self, client, db, s3_helper): + def test_crashing_thread_table(self, client, db, storage_helper): json_dump = { "crash_info": {"crashing_thread": 0}, "status": "OK", @@ -843,7 +843,7 @@ def test_crashing_thread_table(self, client, db, s3_helper): processed_crash["crashing_thread"] = json_dump["crash_info"]["crashing_thread"] processed_crash["signature"] = "shutdownhang | foo::bar()" upload_crash_data( - s3_helper, raw_crash=raw_crash, processed_crash=processed_crash + storage_helper, raw_crash=raw_crash, processed_crash=processed_crash ) url = reverse("crashstats:report_index", args=(crash_id,)) @@ -854,7 +854,7 @@ def test_crashing_thread_table(self, client, db, s3_helper): assert "context" in smart_str(response.content) assert "frame_pointer" in smart_str(response.content) - def test_inlines(self, client, db, s3_helper): + def test_inlines(self, client, db, storage_helper): json_dump = { "crash_info": {"crashing_thread": 0}, "status": "OK", @@ -909,7 +909,7 @@ def test_inlines(self, client, db, s3_helper): processed_crash["signature"] = "shutdownhang | foo::bar()" upload_crash_data( - s3_helper, raw_crash=raw_crash, processed_crash=processed_crash + storage_helper, raw_crash=raw_crash, processed_crash=processed_crash ) url = reverse("crashstats:report_index", args=(crash_id,)) @@ -922,7 +922,7 @@ def test_inlines(self, client, db, s3_helper): assert "InlineFunction2" in smart_str(response.content) assert "inlinefile2.cpp:374" in smart_str(response.content) - def test_java_exception_table_not_logged_in(self, client, db, s3_helper): + def test_java_exception_table_not_logged_in(self, client, db, storage_helper): java_exception = { "exception": { "values": [ @@ -949,7 +949,7 @@ def test_java_exception_table_not_logged_in(self, client, db, s3_helper): crash_id, raw_crash, processed_crash = build_crash_data() processed_crash["java_exception"] = java_exception upload_crash_data( - s3_helper, raw_crash=raw_crash, processed_crash=processed_crash + storage_helper, raw_crash=raw_crash, processed_crash=processed_crash ) url = reverse("crashstats:report_index", args=(crash_id,)) @@ -962,7 +962,9 @@ def test_java_exception_table_not_logged_in(self, client, db, s3_helper): # Make sure "PII" is not in the crash report assert "[PII]" not in smart_str(response.content) - def test_java_exception_table_logged_in(self, client, db, s3_helper, user_helper): + def test_java_exception_table_logged_in( + self, client, db, storage_helper, user_helper + ): java_exception = { "exception": { "values": [ @@ -989,7 +991,7 @@ def test_java_exception_table_logged_in(self, client, db, s3_helper, user_helper crash_id, raw_crash, processed_crash = build_crash_data() processed_crash["java_exception"] = java_exception upload_crash_data( - s3_helper, raw_crash=raw_crash, processed_crash=processed_crash + storage_helper, raw_crash=raw_crash, processed_crash=processed_crash ) user = user_helper.create_protected_user() @@ -1005,7 +1007,7 @@ def test_java_exception_table_logged_in(self, client, db, s3_helper, user_helper # Make sure "PII" is in the crash report assert "[PII]" in smart_str(response.content) - def test_last_error_value(self, client, db, s3_helper): + def test_last_error_value(self, client, db, storage_helper): json_dump = { "crash_info": { "crashing_thread": 0, @@ -1022,7 +1024,7 @@ def test_last_error_value(self, client, db, s3_helper): processed_crash["json_dump"] = json_dump processed_crash["crashing_thread"] = json_dump["crash_info"]["crashing_thread"] upload_crash_data( - s3_helper, raw_crash=raw_crash, processed_crash=processed_crash + storage_helper, raw_crash=raw_crash, processed_crash=processed_crash ) url = reverse("crashstats:report_index", args=(crash_id,)) @@ -1032,7 +1034,7 @@ def test_last_error_value(self, client, db, s3_helper): # Assert it's not in the content assert "Last Error Value" in smart_str(response.content) - def test_unpaired_surrogate(self, client, db, s3_helper): + def test_unpaired_surrogate(self, client, db, storage_helper): """An unpaired surrogate like \udf03 can't be encoded in UTF-8, so it is escaped.""" json_dump = { "crash_info": {"crashing_thread": 0}, @@ -1056,7 +1058,7 @@ def test_unpaired_surrogate(self, client, db, s3_helper): processed_crash["crashing_thread"] = json_dump["crash_info"]["crashing_thread"] processed_crash["signature"] = "shutdownhang | foo::bar()" upload_crash_data( - s3_helper, raw_crash=raw_crash, processed_crash=processed_crash + storage_helper, raw_crash=raw_crash, processed_crash=processed_crash ) url = reverse("crashstats:report_index", args=(crash_id,)) @@ -1066,7 +1068,7 @@ def test_unpaired_surrogate(self, client, db, s3_helper): # The escaped surrogate appears in the page assert "surrogate@example.com.xpi\\udf03" in smart_str(response.content) - def test_telemetry_environment(self, client, db, s3_helper): + def test_telemetry_environment(self, client, db, storage_helper): telemetry_environment = { "build": { "applicationId": "{ec8030f7-c20a-464f-9b0e-13a3a9e97384}", @@ -1083,7 +1085,7 @@ def test_telemetry_environment(self, client, db, s3_helper): raw_crash["TelemetryEnvironment"] = json.dumps(telemetry_environment) processed_crash["telemetry_environment"] = telemetry_environment upload_crash_data( - s3_helper, raw_crash=raw_crash, processed_crash=processed_crash + storage_helper, raw_crash=raw_crash, processed_crash=processed_crash ) url = reverse("crashstats:report_index", args=(crash_id,)) @@ -1095,7 +1097,7 @@ def test_telemetry_environment(self, client, db, s3_helper): # does it so let's just check the data attribute is there. assert 'id="telemetryenvironment-json"' in smart_str(response.content) - def test_odd_product_and_version(self, client, db, s3_helper): + def test_odd_product_and_version(self, client, db, storage_helper): # If the processed JSON references an unfamiliar product and version it should # not use that to make links in the nav to reports for that unfamiliar product # and version. @@ -1106,7 +1108,7 @@ def test_odd_product_and_version(self, client, db, s3_helper): processed_crash["version"] = "99.9" upload_crash_data( - s3_helper, raw_crash=raw_crash, processed_crash=processed_crash + storage_helper, raw_crash=raw_crash, processed_crash=processed_crash ) url = reverse("crashstats:report_index", args=[crash_id]) @@ -1123,11 +1125,11 @@ def test_odd_product_and_version(self, client, db, s3_helper): bad_url = reverse("crashstats:product_home", args=("WaterWolf",)) assert bad_url not in smart_str(response.content) - def test_no_dump(self, client, db, s3_helper): + def test_no_dump(self, client, db, storage_helper): crash_id, raw_crash, processed_crash = build_crash_data() del processed_crash["json_dump"] upload_crash_data( - s3_helper, raw_crash=raw_crash, processed_crash=processed_crash + storage_helper, raw_crash=raw_crash, processed_crash=processed_crash ) url = reverse("crashstats:report_index", args=[crash_id]) @@ -1135,7 +1137,7 @@ def test_no_dump(self, client, db, s3_helper): assert response.status_code == 200 assert "No dump available" in smart_str(response.content) - def test_invalid_crash_id(self, client, db, s3_helper): + def test_invalid_crash_id(self, client, db, storage_helper): # Last 6 digits indicate 30th Feb 2012 which doesn't exist so this is an invalid # crash_id url = reverse( @@ -1146,11 +1148,11 @@ def test_invalid_crash_id(self, client, db, s3_helper): assert "Invalid crash ID" in smart_str(response.content) assert response["Content-Type"] == "text/html; charset=utf-8" - def test_valid_install_time(self, client, db, s3_helper): + def test_valid_install_time(self, client, db, storage_helper): crash_id, raw_crash, processed_crash = build_crash_data() raw_crash["InstallTime"] = "1461170304" upload_crash_data( - s3_helper, raw_crash=raw_crash, processed_crash=processed_crash + storage_helper, raw_crash=raw_crash, processed_crash=processed_crash ) url = reverse("crashstats:report_index", args=[crash_id]) @@ -1159,13 +1161,13 @@ def test_valid_install_time(self, client, db, s3_helper): # This is what 1461170304 is in human friendly format. assert "2016-04-20 16:38:24" in smart_str(response.content) - def test_invalid_install_time(self, client, db, s3_helper): + def test_invalid_install_time(self, client, db, storage_helper): # NOTE(willkg): this is no longer an issue when we switch the template to # render the install time from the processed crash crash_id, raw_crash, processed_crash = build_crash_data() raw_crash["InstallTime"] = "bad number" upload_crash_data( - s3_helper, raw_crash=raw_crash, processed_crash=processed_crash + storage_helper, raw_crash=raw_crash, processed_crash=processed_crash ) url = reverse("crashstats:report_index", args=[crash_id]) @@ -1178,26 +1180,26 @@ def test_invalid_install_time(self, client, db, s3_helper): if pyquery.PyQuery(row).find("th").text() == "Install Time": assert pyquery.PyQuery(row).find("td").text() == "" - def test_empty_json_dump(self, client, db, s3_helper): + def test_empty_json_dump(self, client, db, storage_helper): json_dump = {"stackwalker_version": "minidump_stackwalk 0.10.3 ..."} crash_id, raw_crash, processed_crash = build_crash_data() processed_crash["json_dump"] = json_dump upload_crash_data( - s3_helper, raw_crash=raw_crash, processed_crash=processed_crash + storage_helper, raw_crash=raw_crash, processed_crash=processed_crash ) url = reverse("crashstats:report_index", args=[crash_id]) response = client.get(url) assert response.status_code == 200 - def test_raw_crash_not_found(self, client, db, s3_helper): + def test_raw_crash_not_found(self, client, db, storage_helper): crash_id, raw_crash, processed_crash = build_crash_data() - bucket = s3_helper.get_crashstorage_bucket() + bucket = storage_helper.get_crashstorage_bucket() validate_instance(processed_crash, PROCESSED_CRASH_SCHEMA) processed_key = build_keys("processed_crash", crash_id)[0] - s3_helper.upload_fileobj( + storage_helper.upload( bucket_name=bucket, key=processed_key, data=dict_to_str(processed_crash).encode("utf-8"), @@ -1209,13 +1211,13 @@ def test_raw_crash_not_found(self, client, db, s3_helper): assert response.status_code == 404 assert "Crash Report Not Found" in smart_str(response.content) - def test_processed_crash_not_found(self, client, db, s3_helper, queue_helper): + def test_processed_crash_not_found(self, client, db, storage_helper, queue_helper): crash_id, raw_crash, processed_crash = build_crash_data() - bucket = s3_helper.get_crashstorage_bucket() + bucket = storage_helper.get_crashstorage_bucket() validate_instance(raw_crash, RAW_CRASH_SCHEMA) raw_key = build_keys("raw_crash", crash_id)[0] - s3_helper.upload_fileobj( + storage_helper.upload( bucket_name=bucket, key=raw_key, data=dict_to_str(raw_crash).encode("utf-8") ) @@ -1227,10 +1229,10 @@ def test_processed_crash_not_found(self, client, db, s3_helper, queue_helper): assert "Please wait..." in content assert "Processing this crash report only takes a few seconds" in content - def test_redirect_by_prefix(self, client, db, s3_helper): + def test_redirect_by_prefix(self, client, db, storage_helper): crash_id, raw_crash, processed_crash = build_crash_data() upload_crash_data( - s3_helper, raw_crash=raw_crash, processed_crash=processed_crash + storage_helper, raw_crash=raw_crash, processed_crash=processed_crash ) url = reverse( @@ -1240,7 +1242,7 @@ def test_redirect_by_prefix(self, client, db, s3_helper): expected = reverse("crashstats:report_index", kwargs={"crash_id": crash_id}) assert response.url == expected - def test_thread_name(self, client, db, s3_helper): + def test_thread_name(self, client, db, storage_helper): # Some threads now have a name. If there is one, verify that name is displayed # next to that thread's number. json_dump = { @@ -1264,7 +1266,7 @@ def test_thread_name(self, client, db, s3_helper): processed_crash["json_dump"] = json_dump processed_crash["crashing_thread"] = json_dump["crash_info"]["crashing_thread"] upload_crash_data( - s3_helper, raw_crash=raw_crash, processed_crash=processed_crash + storage_helper, raw_crash=raw_crash, processed_crash=processed_crash ) url = reverse("crashstats:report_index", args=[crash_id]) From a818bf0a645234d2c5d0ee10da112164ea911bd1 Mon Sep 17 00:00:00 2001 From: Daniel Thorn Date: Wed, 17 Apr 2024 15:04:17 -0700 Subject: [PATCH 2/8] fix gcs upload and add tests --- bin/gcs_cli.py | 36 ++++++++++++++++++------------ socorro/tests/test_gcs_cli.py | 42 +++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 14 deletions(-) diff --git a/bin/gcs_cli.py b/bin/gcs_cli.py index d51b51e1c3..3a858522cb 100755 --- a/bin/gcs_cli.py +++ b/bin/gcs_cli.py @@ -9,7 +9,7 @@ # Usage: ./bin/gcs_cli.py CMD import os -from pathlib import Path +from pathlib import Path, PurePosixPath import click @@ -124,37 +124,45 @@ def list_objects(bucket_name, details): @click.argument("source") @click.argument("destination") def upload(source, destination): - """Upload files to a bucket""" + """Upload files to a bucket + + SOURCE is a path to a file or directory of files. will recurse on directory trees + + DESTINATION is a path to a file or directory in the bucket. If SOURCE is a + directory or DESTINATION ends with "/", then DESTINATION is treated as a directory. + """ client = get_client() # remove protocol from destination if present destination = destination.split("://", 1)[-1] bucket_name, _, prefix = destination.partition("/") + prefix_path = PurePosixPath(prefix) try: bucket = client.get_bucket(bucket_name) - except NotFound: - click.error(f"GCS bucket {bucket_name!r} does not exist.") - return + except NotFound as e: + raise click.ClickException(f"GCS bucket {bucket_name!r} does not exist.") from e source_path = Path(source) if not source_path.exists(): - click.error("local path {source!r} does not exist.") - if source_path.is_dir(): - prefix_path = Path(prefix) + raise click.ClickException(f"local path {source!r} does not exist.") + source_is_dir = source_path.is_dir() + if source_is_dir: sources = [p for p in source_path.rglob("*") if not p.is_dir()] else: sources = [source_path] if not sources: - click.echo("No files in directory {source!r}.") - return + raise click.ClickException(f"No files in directory {source!r}.") for path in sources: - if path == source_path: - key_path = prefix_path + if source_is_dir: + # source is a directory so treat destination as a directory + key = str(prefix_path / path.relative_to(source_path)) + elif prefix == "" or prefix.endswith("/"): + # source is a file but destination is a directory, preserve file name + key = str(prefix_path / path.name) else: - key_path = prefix_path / path.relative_to(source_path) - key = "/".join(key_path.parts) + key = prefix blob = bucket.blob(key) blob.upload_from_filename(path) click.echo(f"Uploaded gs://{bucket_name}/{key}") diff --git a/socorro/tests/test_gcs_cli.py b/socorro/tests/test_gcs_cli.py index 9947096ae4..09d06152b2 100644 --- a/socorro/tests/test_gcs_cli.py +++ b/socorro/tests/test_gcs_cli.py @@ -2,6 +2,8 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at https://mozilla.org/MPL/2.0/. +from uuid import uuid4 + from click.testing import CliRunner from gcs_cli import gcs_group @@ -12,3 +14,43 @@ def test_it_runs(): runner = CliRunner() result = runner.invoke(gcs_group, ["--help"]) assert result.exit_code == 0 + + +def test_upload_file_to_root(gcs_helper, tmp_path): + """Test whether the module loads and spits out help.""" + bucket = gcs_helper.create_bucket("test").name + path = tmp_path / uuid4().hex + path.write_text(path.name) + result = CliRunner().invoke( + gcs_group, ["upload", str(path.absolute()), f"gs://{bucket}"] + ) + assert result.exit_code == 0 + assert gcs_helper.download(bucket, path.name) == path.name.encode("utf-8") + + +def test_upload_file_to_dir(gcs_helper, tmp_path): + """Test whether the module loads and spits out help.""" + bucket = gcs_helper.create_bucket("test").name + path = tmp_path / uuid4().hex + path.write_text(path.name) + result = CliRunner().invoke( + gcs_group, ["upload", str(path.absolute()), f"gs://{bucket}/{path.name}/"] + ) + assert result.exit_code == 0 + assert gcs_helper.download(bucket, f"{path.name}/{path.name}") == path.name.encode( + "utf-8" + ) + + +def test_upload_dir_to_dir(gcs_helper, tmp_path): + """Test whether the module loads and spits out help.""" + bucket = gcs_helper.create_bucket("test").name + path = tmp_path / uuid4().hex + path.write_text(path.name) + result = CliRunner().invoke( + gcs_group, ["upload", str(tmp_path.absolute()), f"gs://{bucket}/{path.name}"] + ) + assert result.exit_code == 0 + assert gcs_helper.download(bucket, f"{path.name}/{path.name}") == path.name.encode( + "utf-8" + ) From f17cf2e0e1c2ead1145aad01d674fddfed233510 Mon Sep 17 00:00:00 2001 From: Daniel Thorn Date: Wed, 17 Apr 2024 15:11:23 -0700 Subject: [PATCH 3/8] fix copypasta docstring --- socorro/external/gcs/crashstorage.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/socorro/external/gcs/crashstorage.py b/socorro/external/gcs/crashstorage.py index 18d0adb9d3..8f5335533b 100644 --- a/socorro/external/gcs/crashstorage.py +++ b/socorro/external/gcs/crashstorage.py @@ -46,11 +46,7 @@ def __init__( :arg bucket: the GCS bucket to save to :arg dump_file_suffix: the suffix used to identify a dump file (for use in temp files) - :arg region: the AWS region to use - :arg access_key: the AWS access_key to use - :arg secret_access_key: the AWS secret_access_key to use - :arg endpoint_url: the endpoint url to use when in a local development - environment + :arg metrics_prefix: the metrics prefix for markus """ super().__init__() From bc25d375d829b077b103525cee52b7f6923d917d Mon Sep 17 00:00:00 2001 From: Daniel Thorn Date: Tue, 7 May 2024 13:53:00 -0700 Subject: [PATCH 4/8] address review --- socorro/external/gcs/crashstorage.py | 4 ++++ socorro/tests/external/gcs/test_crashstorage.py | 3 --- socorro/tests/test_gcs_cli.py | 6 +++--- .../crashstats/management/commands/verifyprocessed.py | 3 +-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/socorro/external/gcs/crashstorage.py b/socorro/external/gcs/crashstorage.py index 8f5335533b..6567117f7e 100644 --- a/socorro/external/gcs/crashstorage.py +++ b/socorro/external/gcs/crashstorage.py @@ -237,6 +237,8 @@ def get_processed_crash(self, crash_id): def get(self, **kwargs): """Return JSON data of a crash report, given its uuid.""" + # FIXME(relud): This method is used by the webapp API middleware nonsense. It + # shouldn't exist here. We should move it to the webapp. filters = [ ("uuid", None, str), ("datatype", None, str), @@ -362,6 +364,8 @@ def get_processed_crash(self, crash_id): def get(self, **kwargs): """Return JSON data of a crash report, given its uuid.""" + # FIXME(relud): This method is used by the webapp API middleware nonsense. It + # shouldn't exist here. We should move it to the webapp. filters = [("uuid", None, str)] params = external_common.parse_arguments(filters, kwargs, modern=True) diff --git a/socorro/tests/external/gcs/test_crashstorage.py b/socorro/tests/external/gcs/test_crashstorage.py index a2b3f19044..895d0a0d2a 100644 --- a/socorro/tests/external/gcs/test_crashstorage.py +++ b/socorro/tests/external/gcs/test_crashstorage.py @@ -335,8 +335,6 @@ def test_get_dumps_as_files(self, gcs_helper, tmp_path): crash_id=crash_id, tmpdir=str(tmp_path) ) - # We don't care much about the mocked internals as the bulk of that function is - # tested elsewhere. We just need to be concerned about the file writing worked. expected = { "content_dump": os.path.join( str(tmp_path), @@ -363,7 +361,6 @@ def test_get_processed_crash(self, gcs_helper): "a": {"b": {"c": 11}}, "sensitive": {"x": 2}, "not_url": "not a url", - # These keys do not survive redaction "url": "http://example.com", "json_dump": {"sensitive": 22}, } diff --git a/socorro/tests/test_gcs_cli.py b/socorro/tests/test_gcs_cli.py index 09d06152b2..38db2b4c70 100644 --- a/socorro/tests/test_gcs_cli.py +++ b/socorro/tests/test_gcs_cli.py @@ -17,7 +17,7 @@ def test_it_runs(): def test_upload_file_to_root(gcs_helper, tmp_path): - """Test whether the module loads and spits out help.""" + """Test uploading one file to a bucket root.""" bucket = gcs_helper.create_bucket("test").name path = tmp_path / uuid4().hex path.write_text(path.name) @@ -29,7 +29,7 @@ def test_upload_file_to_root(gcs_helper, tmp_path): def test_upload_file_to_dir(gcs_helper, tmp_path): - """Test whether the module loads and spits out help.""" + """Test uploading one file to a directory inside a bucket.""" bucket = gcs_helper.create_bucket("test").name path = tmp_path / uuid4().hex path.write_text(path.name) @@ -43,7 +43,7 @@ def test_upload_file_to_dir(gcs_helper, tmp_path): def test_upload_dir_to_dir(gcs_helper, tmp_path): - """Test whether the module loads and spits out help.""" + """Test uploading a whole directory to a directory inside a bucket.""" bucket = gcs_helper.create_bucket("test").name path = tmp_path / uuid4().hex path.write_text(path.name) diff --git a/webapp/crashstats/crashstats/management/commands/verifyprocessed.py b/webapp/crashstats/crashstats/management/commands/verifyprocessed.py index 781d50eb5c..5c50f2fc92 100644 --- a/webapp/crashstats/crashstats/management/commands/verifyprocessed.py +++ b/webapp/crashstats/crashstats/management/commands/verifyprocessed.py @@ -86,8 +86,7 @@ def check_crashids_for_date(firstchars_chunk, date): ) for page in page_iterator: - # NOTE(willkg): Keys here look like /v2/raw_crash/ENTROPY/DATE/CRASHID or - # /v1/raw_crash/DATE/CRASHID + # NOTE(willkg): Keys here look like /v1/raw_crash/DATE/CRASHID crash_ids = [item.split("/")[-1] for item in page] if not crash_ids: From 9e8eddade9ff0e14ebbc95763f0b412831340e61 Mon Sep 17 00:00:00 2001 From: Daniel Thorn Date: Tue, 7 May 2024 14:01:50 -0700 Subject: [PATCH 5/8] move and copy utility functions --- socorro/external/boto/crashstorage.py | 44 +++------------------------ socorro/external/crashstorage_base.py | 42 +++++++++++++++++++++++++ socorro/external/gcs/crashstorage.py | 34 +++++++++++++++++++-- 3 files changed, 77 insertions(+), 43 deletions(-) diff --git a/socorro/external/boto/crashstorage.py b/socorro/external/boto/crashstorage.py index 67ab2955c2..cff4013bc6 100644 --- a/socorro/external/boto/crashstorage.py +++ b/socorro/external/boto/crashstorage.py @@ -2,7 +2,6 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at https://mozilla.org/MPL/2.0/. -import datetime import json import logging @@ -12,6 +11,10 @@ CrashStorageBase, CrashIDNotFound, MemoryDumpsMapping, + get_datestamp, + dict_to_str, + list_to_str, + str_to_list, ) from socorro.external.boto.connection_context import S3Connection from socorro.lib.libjsonschema import JsonSchemaReducer @@ -21,7 +24,6 @@ SocorroDataReducer, transform_schema, ) -from socorro.lib.libooid import date_from_ooid from socorro.schemas import TELEMETRY_SOCORRO_CRASH_SCHEMA @@ -32,25 +34,6 @@ def wait_time_generator(): yield from [1, 1, 1, 1, 1] -class CrashIDMissingDatestamp(Exception): - """Indicates the crash id is invalid and missing a datestamp.""" - - -def get_datestamp(crashid): - """Parses out datestamp from a crashid. - - :returns: datetime - - :raises CrashIDMissingDatestamp: if the crash id has no datestamp at the end - - """ - datestamp = date_from_ooid(crashid) - if datestamp is None: - # We should never hit this situation unless the crashid is not valid - raise CrashIDMissingDatestamp(f"{crashid} is missing datestamp") - return datestamp - - def build_keys(name_of_thing, crashid): """Builds a list of s3 pseudo-filenames @@ -81,25 +64,6 @@ def build_keys(name_of_thing, crashid): return [f"v1/{name_of_thing}/{crashid}"] -class JSONISOEncoder(json.JSONEncoder): - def default(self, obj): - if isinstance(obj, datetime.date): - return obj.isoformat() - raise NotImplementedError(f"Don't know about {obj!r}") - - -def dict_to_str(a_mapping): - return json.dumps(a_mapping, cls=JSONISOEncoder) - - -def list_to_str(a_list): - return json.dumps(list(a_list)) - - -def str_to_list(a_string): - return json.loads(a_string) - - class BotoS3CrashStorage(CrashStorageBase): """Saves and loads crash data to S3""" diff --git a/socorro/external/crashstorage_base.py b/socorro/external/crashstorage_base.py index 6eae670a4c..9bba54239e 100644 --- a/socorro/external/crashstorage_base.py +++ b/socorro/external/crashstorage_base.py @@ -5,9 +5,13 @@ """Base classes for crashstorage system.""" from contextlib import suppress +import datetime +import json import logging import os +from socorro.lib.libooid import date_from_ooid + class MemoryDumpsMapping(dict): """there has been a bifurcation in the crash storage data throughout the @@ -262,3 +266,41 @@ def remove(self, crash_id): with suppress(KeyError): del self._processed_crash_data[crash_id] + + +class CrashIDMissingDatestamp(Exception): + """Indicates the crash id is invalid and missing a datestamp.""" + + +def get_datestamp(crashid): + """Parses out datestamp from a crashid. + + :returns: datetime + + :raises CrashIDMissingDatestamp: if the crash id has no datestamp at the end + + """ + datestamp = date_from_ooid(crashid) + if datestamp is None: + # We should never hit this situation unless the crashid is not valid + raise CrashIDMissingDatestamp(f"{crashid} is missing datestamp") + return datestamp + + +class JSONISOEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, datetime.date): + return obj.isoformat() + raise NotImplementedError(f"Don't know about {obj!r}") + + +def dict_to_str(a_mapping): + return json.dumps(a_mapping, cls=JSONISOEncoder) + + +def list_to_str(a_list): + return json.dumps(list(a_list)) + + +def str_to_list(a_string): + return json.loads(a_string) diff --git a/socorro/external/gcs/crashstorage.py b/socorro/external/gcs/crashstorage.py index 6567117f7e..957baa0c31 100644 --- a/socorro/external/gcs/crashstorage.py +++ b/socorro/external/gcs/crashstorage.py @@ -15,9 +15,7 @@ CrashStorageBase, CrashIDNotFound, MemoryDumpsMapping, -) -from socorro.external.boto.crashstorage import ( - build_keys, + get_datestamp, dict_to_str, list_to_str, str_to_list, @@ -33,6 +31,36 @@ from socorro.schemas import TELEMETRY_SOCORRO_CRASH_SCHEMA +def build_keys(name_of_thing, crashid): + """Builds a list of pseudo-filenames + + When using keys for saving a crash, always use the first one given. + + When using keys for loading a crash, try each key in order. This lets us change our + key scheme and continue to access things saved using the old key. + + :arg name_of_thing: the kind of thing we're building a filename for; e.g. + "raw_crash" + :arg crashid: the crash id for the thing being stored + + :returns: list of keys to try in order + + :raises CrashIDMissingDatestamp: if the crash id is missing a datestamp at the + end + + """ + if name_of_thing == "raw_crash": + date = get_datestamp(crashid).strftime("%Y%m%d") + return [f"v1/{name_of_thing}/{date}/{crashid}"] + + elif name_of_thing == "crash_report": + # Crash data from the TelemetryBotoS3CrashStorage + date = get_datestamp(crashid).strftime("%Y%m%d") + return [f"v1/{name_of_thing}/{date}/{crashid}"] + + return [f"v1/{name_of_thing}/{crashid}"] + + class GcsCrashStorage(CrashStorageBase): """Saves and loads crash data to GCS""" From 61ebf61d20d469e446b4fbfe13cf9e2b8527bb28 Mon Sep 17 00:00:00 2001 From: Daniel Thorn Date: Tue, 7 May 2024 14:05:27 -0700 Subject: [PATCH 6/8] remove _fileobj suffix from s3_helper methods --- socorro/tests/conftest.py | 10 +---- .../external/boto/test_connection_context.py | 6 +-- .../tests/external/boto/test_crash_data.py | 6 +-- .../tests/external/boto/test_crashstorage.py | 44 +++++++++---------- socorro/tests/test_upload_telemetry_schema.py | 2 +- 5 files changed, 31 insertions(+), 37 deletions(-) diff --git a/socorro/tests/conftest.py b/socorro/tests/conftest.py index b77dff4bd0..fc25daad4a 100644 --- a/socorro/tests/conftest.py +++ b/socorro/tests/conftest.py @@ -154,23 +154,17 @@ def create_bucket(self, bucket_name): if self._buckets_seen is not None: self._buckets_seen.add(bucket_name) - def upload_fileobj(self, bucket_name, key, data): + def upload(self, bucket_name, key, data): """Puts an object into the specified bucket.""" self.create_bucket(bucket_name) self.conn.upload_fileobj(Fileobj=io.BytesIO(data), Bucket=bucket_name, Key=key) - def upload(self, bucket_name, key, data): - self.upload_fileobj(bucket_name, key, data) - - def download_fileobj(self, bucket_name, key): + def download(self, bucket_name, key): """Fetches an object from the specified bucket""" self.create_bucket(bucket_name) resp = self.conn.get_object(Bucket=bucket_name, Key=key) return resp["Body"].read() - def download(self, bucket_name, key): - return self.download_fileobj(bucket_name, key) - def list(self, bucket_name): """Return list of keys for objects in bucket.""" self.create_bucket(bucket_name) diff --git a/socorro/tests/external/boto/test_connection_context.py b/socorro/tests/external/boto/test_connection_context.py index 94119af460..0333f1dbe3 100644 --- a/socorro/tests/external/boto/test_connection_context.py +++ b/socorro/tests/external/boto/test_connection_context.py @@ -35,12 +35,12 @@ def test_save_file(self, s3_helper): objects = s3_helper.list(bucket) assert objects == ["/test/testfile.txt"] - assert s3_helper.download_fileobj(bucket, path) == file_data + assert s3_helper.download(bucket, path) == file_data # Stomp on that file with a new one file_data2 = b"test file contents 2" conn.save_file(bucket=bucket, path=path, data=file_data2) - assert s3_helper.download_fileobj(bucket, path) == file_data2 + assert s3_helper.download(bucket, path) == file_data2 def test_load_file_doesnt_exist(self, s3_helper): """Test loading a file that isn't there.""" @@ -62,6 +62,6 @@ def test_load_file(self, s3_helper): file_data = b"test file contents" s3_helper.create_bucket(bucket) - s3_helper.upload_fileobj(bucket, path, file_data) + s3_helper.upload(bucket, path, file_data) data = conn.load_file(bucket=bucket, path=path) assert data == file_data diff --git a/socorro/tests/external/boto/test_crash_data.py b/socorro/tests/external/boto/test_crash_data.py index 262c2acaf4..e09159cc77 100644 --- a/socorro/tests/external/boto/test_crash_data.py +++ b/socorro/tests/external/boto/test_crash_data.py @@ -44,7 +44,7 @@ def test_get_processed(self, s3_helper): crash_id = create_new_ooid() s3_helper.create_bucket(bucket) - s3_helper.upload_fileobj( + s3_helper.upload( bucket_name=bucket, key=f"v1/processed_crash/{crash_id}", data=json.dumps({"foo": "bar"}).encode("utf-8"), @@ -70,7 +70,7 @@ def test_get_raw_dump(self, s3_helper): crash_id = create_new_ooid() s3_helper.create_bucket(bucket) - s3_helper.upload_fileobj( + s3_helper.upload( bucket_name=bucket, key=f"v1/dump/{crash_id}", data=b"\xa0", @@ -122,7 +122,7 @@ def test_get_data(self, s3_helper): crash_id = create_new_ooid() s3_helper.create_bucket(bucket) - s3_helper.upload_fileobj( + s3_helper.upload( bucket_name=bucket, key=f"v1/crash_report/20{crash_id[-6:]}/{crash_id}", data=json.dumps({"foo": "bar"}).encode("utf-8"), diff --git a/socorro/tests/external/boto/test_crashstorage.py b/socorro/tests/external/boto/test_crashstorage.py index 82405608e6..3f841ad479 100644 --- a/socorro/tests/external/boto/test_crashstorage.py +++ b/socorro/tests/external/boto/test_crashstorage.py @@ -96,7 +96,7 @@ def test_save_raw_crash_no_dumps(self, s3_helper): # Verify the raw_crash made it to the right place and has the right # contents - raw_crash = s3_helper.download_fileobj( + raw_crash = s3_helper.download( bucket_name=bucket, key=f"v1/raw_crash/20{crash_id[-6:]}/{crash_id}", ) @@ -104,7 +104,7 @@ def test_save_raw_crash_no_dumps(self, s3_helper): assert json.loads(raw_crash) == original_raw_crash # Verify dump_names made it to the right place and has the right contents - dump_names = s3_helper.download_fileobj( + dump_names = s3_helper.download( bucket_name=bucket, key=f"v1/dump_names/{crash_id}", ) @@ -128,7 +128,7 @@ def test_save_raw_crash_with_dumps(self, s3_helper): ) # Verify the raw_crash made it to the right place and has the right contents - raw_crash = s3_helper.download_fileobj( + raw_crash = s3_helper.download( bucket_name=bucket, key=f"v1/raw_crash/20{crash_id[-6:]}/{crash_id}", ) @@ -137,20 +137,20 @@ def test_save_raw_crash_with_dumps(self, s3_helper): # Verify dump_names made it to the right place and has the right # contents - dump_names = s3_helper.download_fileobj( + dump_names = s3_helper.download( bucket_name=bucket, key=f"v1/dump_names/{crash_id}", ) assert sorted(json.loads(dump_names)) == ["content_dump", "dump"] # Verify dumps - dump = s3_helper.download_fileobj( + dump = s3_helper.download( bucket_name=bucket, key=f"v1/dump/{crash_id}", ) assert dump == b"fake dump" - content_dump = s3_helper.download_fileobj( + content_dump = s3_helper.download( bucket_name=bucket, key=f"v1/content_dump/{crash_id}", ) @@ -175,7 +175,7 @@ def test_save_processed_crash(self, s3_helper): ) # Verify processed crash is saved - processed_crash = s3_helper.download_fileobj( + processed_crash = s3_helper.download( bucket_name=bucket, key=f"v1/processed_crash/{crash_id}", ) @@ -191,7 +191,7 @@ def test_get_raw_crash(self, s3_helper): original_raw_crash = {"submitted_timestamp": date_to_string(now)} s3_helper.create_bucket(bucket) - s3_helper.upload_fileobj( + s3_helper.upload( bucket_name=bucket, key=f"v1/raw_crash/20{crash_id[-6:]}/{crash_id}", data=dict_to_str(original_raw_crash).encode("utf-8"), @@ -219,7 +219,7 @@ def test_get_raw_dump(self, s3_helper): crash_id = create_new_ooid() s3_helper.create_bucket(bucket) - s3_helper.upload_fileobj( + s3_helper.upload( bucket_name=bucket, key=f"v1/dump/{crash_id}", data=b"this is a raw dump", @@ -244,7 +244,7 @@ def test_get_raw_dump_upload_file_minidump(self, s3_helper): crash_id = create_new_ooid() s3_helper.create_bucket(bucket) - s3_helper.upload_fileobj( + s3_helper.upload( bucket_name=bucket, key=f"v1/dump/{crash_id}", data=b"this is a raw dump", @@ -260,7 +260,7 @@ def test_get_raw_dump_empty_string(self, s3_helper): crash_id = create_new_ooid() s3_helper.create_bucket(bucket) - s3_helper.upload_fileobj( + s3_helper.upload( bucket_name=bucket, key=f"v1/dump/{crash_id}", data=b"this is a raw dump", @@ -275,22 +275,22 @@ def test_get_dumps(self, s3_helper): crash_id = create_new_ooid() s3_helper.create_bucket(bucket) - s3_helper.upload_fileobj( + s3_helper.upload( bucket_name=bucket, key=f"v1/dump_names/{crash_id}", data=b'["dump", "content_dump", "city_dump"]', ) - s3_helper.upload_fileobj( + s3_helper.upload( bucket_name=bucket, key=f"v1/dump/{crash_id}", data=b'this is "dump", the first one', ) - s3_helper.upload_fileobj( + s3_helper.upload( bucket_name=bucket, key=f"v1/content_dump/{crash_id}", data=b'this is "content_dump", the second one', ) - s3_helper.upload_fileobj( + s3_helper.upload( bucket_name=bucket, key=f"v1/city_dump/{crash_id}", data=b'this is "city_dump", the last one', @@ -318,22 +318,22 @@ def test_get_dumps_as_files(self, s3_helper, tmp_path): crash_id = create_new_ooid() s3_helper.create_bucket(bucket) - s3_helper.upload_fileobj( + s3_helper.upload( bucket_name=bucket, key=f"v1/dump_names/{crash_id}", data=b'["dump", "content_dump", "city_dump"]', ) - s3_helper.upload_fileobj( + s3_helper.upload( bucket_name=bucket, key=f"v1/dump/{crash_id}", data=b'this is "dump", the first one', ) - s3_helper.upload_fileobj( + s3_helper.upload( bucket_name=bucket, key=f"v1/content_dump/{crash_id}", data=b'this is "content_dump", the second one', ) - s3_helper.upload_fileobj( + s3_helper.upload( bucket_name=bucket, key=f"v1/city_dump/{crash_id}", data=b'this is "city_dump", the last one', @@ -376,7 +376,7 @@ def test_get_processed_crash(self, s3_helper): "json_dump": {"sensitive": 22}, } - s3_helper.upload_fileobj( + s3_helper.upload( bucket_name=bucket, key=f"v1/processed_crash/{crash_id}", data=dict_to_str(processed_crash).encode("utf-8"), @@ -433,7 +433,7 @@ def test_save_processed_crash(self, s3_helper): ) # Get the crash data we just saved from the bucket and verify it's contents - crash_data = s3_helper.download_fileobj( + crash_data = s3_helper.download( bucket_name=bucket, key=f"v1/crash_report/20{crash_id[-6:]}/{crash_id}", ) @@ -470,7 +470,7 @@ def test_get_processed_crash(self, s3_helper): } # Save the data to S3 so we have something to get - s3_helper.upload_fileobj( + s3_helper.upload( bucket_name=bucket, key=f"v1/crash_report/20{crash_id[-6:]}/{crash_id}", data=json.dumps(crash_data).encode("utf-8"), diff --git a/socorro/tests/test_upload_telemetry_schema.py b/socorro/tests/test_upload_telemetry_schema.py index 51385772fa..00a2262975 100644 --- a/socorro/tests/test_upload_telemetry_schema.py +++ b/socorro/tests/test_upload_telemetry_schema.py @@ -30,7 +30,7 @@ def test_upload(s3_helper): assert result.exit_code == 0 # Get the crash data we just saved from the bucket and verify it's contents - crash_data = s3_helper.download_fileobj( + crash_data = s3_helper.download( bucket_name=TELEMETRY_BUCKET, key="telemetry_socorro_crash.json", ) From 7e291ae95b45f3fd675bbf2ec3a72f9144cfc0bd Mon Sep 17 00:00:00 2001 From: Daniel Thorn Date: Tue, 7 May 2024 14:12:51 -0700 Subject: [PATCH 7/8] merge test_crash_data into test_crashstorage --- socorro/tests/external/gcs/test_crash_data.py | 140 ------------------ .../tests/external/gcs/test_crashstorage.py | 113 ++++++++++++++ 2 files changed, 113 insertions(+), 140 deletions(-) delete mode 100644 socorro/tests/external/gcs/test_crash_data.py diff --git a/socorro/tests/external/gcs/test_crash_data.py b/socorro/tests/external/gcs/test_crash_data.py deleted file mode 100644 index e465b2b988..0000000000 --- a/socorro/tests/external/gcs/test_crash_data.py +++ /dev/null @@ -1,140 +0,0 @@ -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. If a copy of the MPL was not distributed with this -# file, You can obtain one at https://mozilla.org/MPL/2.0/. - -import json -import os - -import pytest - -from socorro.external.crashstorage_base import CrashIDNotFound -from socorro.lib import MissingArgumentError, BadArgumentError -from socorro.libclass import build_instance_from_settings -from socorro.lib.libooid import create_new_ooid - - -CRASHDATA_SETTINGS = { - "class": "socorro.external.gcs.crashstorage.GcsCrashStorage", - "options": { - "bucket": os.environ["CRASHSTORAGE_GCS_BUCKET"], - }, -} - -TELEMETRY_SETTINGS = { - "class": "socorro.external.gcs.crashstorage.TelemetryGcsCrashStorage", - "options": { - "bucket": os.environ["TELEMETRY_GCS_BUCKET"], - }, -} - - -class TestSimplifiedCrashData: - def test_get_processed(self, gcs_helper): - crashdata = build_instance_from_settings(CRASHDATA_SETTINGS) - - bucket = CRASHDATA_SETTINGS["options"]["bucket"] - crash_id = create_new_ooid() - gcs_helper.create_bucket(bucket) - - gcs_helper.upload( - bucket_name=bucket, - key=f"v1/processed_crash/{crash_id}", - data=json.dumps({"foo": "bar"}).encode("utf-8"), - ) - - result = crashdata.get(uuid=crash_id, datatype="processed") - assert result == {"foo": "bar"} - - def test_get_processed_not_found(self, gcs_helper): - crashdata = build_instance_from_settings(CRASHDATA_SETTINGS) - - bucket = CRASHDATA_SETTINGS["options"]["bucket"] - crash_id = create_new_ooid() - gcs_helper.create_bucket(bucket) - - with pytest.raises(CrashIDNotFound): - crashdata.get(uuid=crash_id, datatype="processed") - - def test_get_raw_dump(self, gcs_helper): - crashdata = build_instance_from_settings(CRASHDATA_SETTINGS) - - bucket = CRASHDATA_SETTINGS["options"]["bucket"] - crash_id = create_new_ooid() - gcs_helper.create_bucket(bucket) - - gcs_helper.upload( - bucket_name=bucket, - key=f"v1/dump/{crash_id}", - data=b"\xa0", - ) - - result = crashdata.get(uuid=crash_id, datatype="raw") - assert result == b"\xa0" - - def test_get_raw_dump_not_found(self, gcs_helper): - crashdata = build_instance_from_settings(CRASHDATA_SETTINGS) - - bucket = CRASHDATA_SETTINGS["options"]["bucket"] - crash_id = create_new_ooid() - gcs_helper.create_bucket(bucket) - - with pytest.raises(CrashIDNotFound): - crashdata.get(uuid=crash_id, datatype="raw") - - def test_get_raw_crash_not_found(self, gcs_helper): - crashdata = build_instance_from_settings(CRASHDATA_SETTINGS) - - bucket = CRASHDATA_SETTINGS["options"]["bucket"] - crash_id = create_new_ooid() - gcs_helper.create_bucket(bucket) - - with pytest.raises(CrashIDNotFound): - crashdata.get(uuid=crash_id, datatype="meta") - - def test_bad_arguments(self): - crashdata = build_instance_from_settings(CRASHDATA_SETTINGS) - - crash_id = create_new_ooid() - - with pytest.raises(MissingArgumentError): - crashdata.get() - - with pytest.raises(MissingArgumentError): - crashdata.get(uuid=crash_id) - - with pytest.raises(BadArgumentError): - crashdata.get(uuid=crash_id, datatype="junk") - - -class TestTelemetryCrashData: - def test_get_data(self, gcs_helper): - crashdata = build_instance_from_settings(TELEMETRY_SETTINGS) - - bucket = TELEMETRY_SETTINGS["options"]["bucket"] - crash_id = create_new_ooid() - - gcs_helper.create_bucket(bucket) - gcs_helper.upload( - bucket_name=bucket, - key=f"v1/crash_report/20{crash_id[-6:]}/{crash_id}", - data=json.dumps({"foo": "bar"}).encode("utf-8"), - ) - - result = crashdata.get(uuid=crash_id) - assert result == {"foo": "bar"} - - def test_get_data_not_found(self, gcs_helper): - crashdata = build_instance_from_settings(TELEMETRY_SETTINGS) - - bucket = TELEMETRY_SETTINGS["options"]["bucket"] - crash_id = create_new_ooid() - - gcs_helper.create_bucket(bucket) - with pytest.raises(CrashIDNotFound): - crashdata.get(uuid=crash_id) - - def test_bad_arguments(self): - crashdata = build_instance_from_settings(TELEMETRY_SETTINGS) - - with pytest.raises(MissingArgumentError): - crashdata.get() diff --git a/socorro/tests/external/gcs/test_crashstorage.py b/socorro/tests/external/gcs/test_crashstorage.py index 895d0a0d2a..783d882aa9 100644 --- a/socorro/tests/external/gcs/test_crashstorage.py +++ b/socorro/tests/external/gcs/test_crashstorage.py @@ -10,6 +10,7 @@ from socorro.external.gcs.crashstorage import build_keys, dict_to_str from socorro.external.crashstorage_base import CrashIDNotFound, MemoryDumpsMapping +from socorro.lib import MissingArgumentError, BadArgumentError from socorro.libclass import build_instance_from_settings from socorro.lib.libdatetime import date_to_string, utc_now from socorro.lib.libooid import create_new_ooid @@ -384,6 +385,84 @@ def test_get_processed_not_found(self, gcs_helper): crashstorage.get_processed_crash(crash_id) +class TestGetMethod: + def test_get_processed(self, gcs_helper): + crashdata = build_instance_from_settings(CRASHSTORAGE_SETTINGS) + + bucket = CRASHSTORAGE_SETTINGS["options"]["bucket"] + crash_id = create_new_ooid() + gcs_helper.create_bucket(bucket) + + gcs_helper.upload( + bucket_name=bucket, + key=f"v1/processed_crash/{crash_id}", + data=json.dumps({"foo": "bar"}).encode("utf-8"), + ) + + result = crashdata.get(uuid=crash_id, datatype="processed") + assert result == {"foo": "bar"} + + def test_get_processed_not_found(self, gcs_helper): + crashdata = build_instance_from_settings(CRASHSTORAGE_SETTINGS) + + bucket = CRASHSTORAGE_SETTINGS["options"]["bucket"] + crash_id = create_new_ooid() + gcs_helper.create_bucket(bucket) + + with pytest.raises(CrashIDNotFound): + crashdata.get(uuid=crash_id, datatype="processed") + + def test_get_raw_dump(self, gcs_helper): + crashdata = build_instance_from_settings(CRASHSTORAGE_SETTINGS) + + bucket = CRASHSTORAGE_SETTINGS["options"]["bucket"] + crash_id = create_new_ooid() + gcs_helper.create_bucket(bucket) + + gcs_helper.upload( + bucket_name=bucket, + key=f"v1/dump/{crash_id}", + data=b"\xa0", + ) + + result = crashdata.get(uuid=crash_id, datatype="raw") + assert result == b"\xa0" + + def test_get_raw_dump_not_found(self, gcs_helper): + crashdata = build_instance_from_settings(CRASHSTORAGE_SETTINGS) + + bucket = CRASHSTORAGE_SETTINGS["options"]["bucket"] + crash_id = create_new_ooid() + gcs_helper.create_bucket(bucket) + + with pytest.raises(CrashIDNotFound): + crashdata.get(uuid=crash_id, datatype="raw") + + def test_get_raw_crash_not_found(self, gcs_helper): + crashdata = build_instance_from_settings(CRASHSTORAGE_SETTINGS) + + bucket = CRASHSTORAGE_SETTINGS["options"]["bucket"] + crash_id = create_new_ooid() + gcs_helper.create_bucket(bucket) + + with pytest.raises(CrashIDNotFound): + crashdata.get(uuid=crash_id, datatype="meta") + + def test_bad_arguments(self): + crashdata = build_instance_from_settings(CRASHSTORAGE_SETTINGS) + + crash_id = create_new_ooid() + + with pytest.raises(MissingArgumentError): + crashdata.get() + + with pytest.raises(MissingArgumentError): + crashdata.get(uuid=crash_id) + + with pytest.raises(BadArgumentError): + crashdata.get(uuid=crash_id, datatype="junk") + + class TestTelemetryGcsCrashStorage: def test_save_processed_crash(self, gcs_helper): crashstorage = build_instance_from_settings(TELEMETRY_SETTINGS) @@ -468,3 +547,37 @@ def test_get_processed_crash(self, gcs_helper): # Get the crash and assert it's the same data data = crashstorage.get_processed_crash(crash_id=crash_id) assert data == crash_data + + +class TestTelemetryGetMethod: + def test_get_data(self, gcs_helper): + crashdata = build_instance_from_settings(TELEMETRY_SETTINGS) + + bucket = TELEMETRY_SETTINGS["options"]["bucket"] + crash_id = create_new_ooid() + + gcs_helper.create_bucket(bucket) + gcs_helper.upload( + bucket_name=bucket, + key=f"v1/crash_report/20{crash_id[-6:]}/{crash_id}", + data=json.dumps({"foo": "bar"}).encode("utf-8"), + ) + + result = crashdata.get(uuid=crash_id) + assert result == {"foo": "bar"} + + def test_get_data_not_found(self, gcs_helper): + crashdata = build_instance_from_settings(TELEMETRY_SETTINGS) + + bucket = TELEMETRY_SETTINGS["options"]["bucket"] + crash_id = create_new_ooid() + + gcs_helper.create_bucket(bucket) + with pytest.raises(CrashIDNotFound): + crashdata.get(uuid=crash_id) + + def test_bad_arguments(self): + crashdata = build_instance_from_settings(TELEMETRY_SETTINGS) + + with pytest.raises(MissingArgumentError): + crashdata.get() From 7792931c04fb48d0b17626c3c8ad18cddbf4867e Mon Sep 17 00:00:00 2001 From: Daniel Thorn Date: Tue, 7 May 2024 14:13:57 -0700 Subject: [PATCH 8/8] only require objectUser permissions --- socorro/external/gcs/crashstorage.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/socorro/external/gcs/crashstorage.py b/socorro/external/gcs/crashstorage.py index 957baa0c31..82d9fb6bf3 100644 --- a/socorro/external/gcs/crashstorage.py +++ b/socorro/external/gcs/crashstorage.py @@ -97,12 +97,12 @@ def __init__( self.metrics = markus.get_metrics(metrics_prefix) def load_file(self, path): - bucket = self.client.get_bucket(self.bucket) + bucket = self.client.bucket(self.bucket) blob = bucket.blob(path) return blob.download_as_bytes() def save_file(self, path, data): - bucket = self.client.get_bucket(self.bucket) + bucket = self.client.bucket(self.bucket) blob = bucket.blob(path) blob.upload_from_string(data) @@ -167,12 +167,8 @@ def exists_object(self, key): :returns: bool """ - try: - bucket = self.client.get_bucket(self.bucket) - bucket.get_blob(key) - return True - except NotFound: - return False + bucket = self.client.bucket(self.bucket) + return bucket.blob(key).exists() def get_raw_crash(self, crash_id): """Get the raw crash file for the given crash id