diff --git a/Makefile b/Makefile index f3779003..8d49d064 100644 --- a/Makefile +++ b/Makefile @@ -101,7 +101,7 @@ mongorestore-nmdc-db: mkdir -p /tmp/remote-mongodump/nmdc # SSH into the remote server, stream the dump directory as a gzipped tar archive, and extract it locally. ssh -i ~/.ssh/nersc ${NERSC_USERNAME}@dtn01.nersc.gov \ - 'tar -czf - -C /global/cfs/projectdirs/m3408/nmdc-mongodumps/dump_nmdc-prod_2024-07-29_20-12-07/nmdc .' \ + 'tar -czf - -C /global/cfs/projectdirs/m3408/nmdc-mongodumps/dump_nmdc-prod_2024-11-25_20-12-02/nmdc .' \ | tar -xzv -C /tmp/remote-mongodump/nmdc mongorestore -v -h localhost:27018 -u admin -p root --authenticationDatabase=admin \ --drop --nsInclude='nmdc.*' --dir /tmp/remote-mongodump diff --git a/docs/nb/bulk_validation_referential_integrity_check.ipynb b/docs/nb/bulk_validation_referential_integrity_check.ipynb index 06a01ec8..9defa5e5 100644 --- a/docs/nb/bulk_validation_referential_integrity_check.ipynb +++ b/docs/nb/bulk_validation_referential_integrity_check.ipynb @@ -37,7 +37,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "mongodb://localhost:27018\n" + "localhost:27018\n" ] } ], @@ -93,7 +93,7 @@ { "data": { "text/plain": [ - "'11.0.0rc22'" + "'11.1.0'" ] }, "execution_count": 3, @@ -140,15 +140,7 @@ "id": "bcb5802b-8205-49b7-8784-dc137baff1a0", "metadata": {}, "source": [ - "## Check for errors in the database" - ] - }, - { - "cell_type": "markdown", - "id": "1ab96cda-30ab-4e93-a0b1-3a936599305d", - "metadata": {}, - "source": [ - "The `nmdc_schema_collection_names` function returns the populated (having at least one document) set-intersection of (a) the set of collection names present in the Mongo database and (b) the set of Database slots in the schema that correspond to a collection (defined as being multivalued and values being inlined as a list)." + "## Create slot mappings" ] }, { @@ -156,18 +148,9 @@ "execution_count": 5, "id": "1d76b70e-4412-4b17-9db9-322ac791859a", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'study_set', 'workflow_execution_set', 'material_processing_set', 'instrument_set', 'data_object_set', 'configuration_set', 'biosample_set', 'functional_annotation_agg', 'calibration_set', 'processed_sample_set', 'field_research_site_set', 'data_generation_set'}\n" - ] - } - ], + "outputs": [], "source": [ - "collection_names = get_nonempty_nmdc_schema_collection_names(mdb)\n", - "print(collection_names)" + "collection_names = populated_schema_collection_names_with_id_field(mdb) # `get_nonempty_nmdc_schema_collection_names` to include \"functional_annotation_agg\"" ] }, { @@ -211,107 +194,6 @@ "}" ] }, - { - "cell_type": "code", - "execution_count": 8, - "id": "12e7d00e-0ec4-45de-b0da-1b618ef7e80b", - "metadata": {}, - "outputs": [], - "source": [ - "def collect_errors(note_doc_field_errors):\n", - " errors = {\"bad_type\": [], \"no_type\": [], \"bad_slot\": [], \"is_null\": []}\n", - " n_docs_total = sum(mdb[coll_name].estimated_document_count() for coll_name in collection_names)\n", - " pbar = tqdm(total=n_docs_total)\n", - " n_errors_cache = 0\n", - " for coll_name in sorted(collection_names):\n", - " cls_names = collection_name_to_class_names[coll_name]\n", - " pbar.set_description(f\"processing {coll_name}...\")\n", - " # Iterate over each document (as a dictionary) in this collection.\n", - " for doc in mdb[coll_name].find():\n", - " doc = dissoc(doc, \"_id\")\n", - " \n", - " # Ensure we know the document's type.\n", - " cls_name = None\n", - " cls_type_match = re.match(r\"^nmdc:(?P.+)\", doc.get(\"type\", \"\"))\n", - " if cls_type_match is not None:\n", - " cls_name = cls_type_match.group(\"name\")\n", - " if cls_name not in cls_names:\n", - " errors[\"bad_type\"].append(f\"{coll_name} doc {doc['id']}: doc type {cls_name} not in those allowed for {coll_name}, i.e. {cls_names}.\")\n", - " cls_name = None\n", - " elif len(cls_names) == 1:\n", - " cls_name = cls_names[0]\n", - " else:\n", - " errors[\"no_type\"].append(f\"{coll_name} doc {doc['id']}: 'type' not set.\")\n", - "\n", - " if cls_name is not None: \n", - " slot_map = cls_slot_map[cls_name]\n", - " # Iterate over each key/value pair in the dictionary (document).\n", - " for field, value in doc.items():\n", - " if field in slot_map:\n", - " if not isinstance(value, list):\n", - " value = [value]\n", - " for v in value:\n", - " note_doc_field_errors(value=v,field=field,doc=doc,coll_name=coll_name,errors=errors) \n", - " else:\n", - " errors[\"bad_slot\"].append(f\"{coll_name} doc {doc['id']}: field '{field}' not a valid slot\")\n", - " pbar.update(1)\n", - " n_errors = sum([len(v) for v in errors.values()])\n", - " if n_errors > n_errors_cache:\n", - " print(f\"{n_errors} errors so far...\")\n", - " n_errors_cache = n_errors\n", - " pbar.close()\n", - " return errors" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "14afb4c6-b0b7-4fd7-8e2f-13c682c74409", - "metadata": {}, - "outputs": [], - "source": [ - "def note_doc_field_errors(value=None, field=None, doc=None, coll_name=None, errors=None):\n", - " # No fields should be null-valued.\n", - " # Example of how this may happen: JSON serialization from pydantic models may set optional fields to `null`.\n", - " if value is None:\n", - " errors[\"is_null\"].append(f\"{coll_name} doc {doc['id']}: field {field} is null.\")" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "829039e5-7abe-4c50-ba44-e384b45b7535", - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "6c88577a3a9342808d3bbc0e3707a95a", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/2351449 [00:00 List[str]:\n", + " r\"\"\"\n", + " Determine the slot's \"effective\" range, by taking into account its `any_of` constraints (if defined).\n", + "\n", + " Note: The `any_of` constraints constrain the slot's \"effective\" range beyond that described by the\n", + " induced slot definition's `range` attribute. `SchemaView` does not seem to provide the result\n", + " of applying those additional constraints, so we do it manually here (if any are defined).\n", + " Reference: https://github.com/orgs/linkml/discussions/2101#discussion-6625646\n", + "\n", + " Reference: https://linkml.io/linkml-model/latest/docs/any_of/\n", + " \"\"\"\n", + "\n", + " # Initialize the list to be empty.\n", + " names_of_eligible_target_classes = []\n", + "\n", + " # If the `any_of` constraint is defined on this slot, use that instead of the `range`.\n", + " if \"any_of\" in slot_definition and len(slot_definition.any_of) > 0:\n", + " for slot_expression in slot_definition.any_of:\n", + " # Use the slot expression's `range` to get the specified eligible class name\n", + " # and the names of all classes that inherit from that eligible class.\n", + " if slot_expression.range in schema_view.all_classes():\n", + " own_and_descendant_class_names = schema_view.class_descendants(slot_expression.range)\n", + " names_of_eligible_target_classes.extend(own_and_descendant_class_names)\n", + " else:\n", + " # Use the slot's `range` to get the specified eligible class name\n", + " # and the names of all classes that inherit from that eligible class.\n", + " if slot_definition.range in schema_view.all_classes():\n", + " own_and_descendant_class_names = schema_view.class_descendants(slot_definition.range)\n", + " names_of_eligible_target_classes.extend(own_and_descendant_class_names)\n", + "\n", + " # Remove duplicate class names.\n", + " names_of_eligible_target_classes = list(set(names_of_eligible_target_classes))\n", + "\n", + " return names_of_eligible_target_classes" ] }, { "cell_type": "code", - "execution_count": 12, - "id": "d253c567-533f-440f-8376-03a6e1e905cf", + "execution_count": 9, + "id": "98fbfdff-51d6-42c5-9448-3b4616a2c9cb", "metadata": {}, "outputs": [], "source": [ - "def doc_cls(doc, coll_name=None):\n", - " \"\"\"Return unprefixed name of document class.\n", + "# Any ancestor of a document class is a document-referenceable range, i.e., a valid range of a document-reference-ranged slot.\n", + "document_referenceable_ranges = set(chain.from_iterable(schema_view.class_ancestors(cls_name) for cls_name in document_class_names))\n", "\n", - " Try to get from doc['type'] (lopping off 'nmdc:' prefix).\n", - " Else, if can unambiguously infer type given coll_name, use that.\n", - " Else, return None.\n", - " \"\"\"\n", - " if 'type' in doc:\n", - " return doc['type'][5:] # lop off \"nmdc:\" prefix\n", - " elif coll_name and len(collection_name_to_class_names[coll_name]) == 1:\n", - " return collection_name_to_class_names[coll_name][0]" + "document_reference_ranged_slots = defaultdict(list)\n", + "for cls_name, slot_map in cls_slot_map.items():\n", + " for slot_name, slot in slot_map.items():\n", + " if set(get_names_of_classes_in_effective_range_of_slot(schema_view, slot)) & document_referenceable_ranges:\n", + " document_reference_ranged_slots[cls_name].append(slot_name)" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 10, "id": "b2e618f3-78b9-42b6-8ea9-63d080b1b0f6", "metadata": { "scrolled": true @@ -380,12 +292,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "f63a4ce942bc4278b3e99a5a87b0155c", + "model_id": "47b7ed79df384cf98c527651021fe2cd", "version_major": 2, "version_minor": 0 }, "text/plain": [ - " 0%| | 0/2351449 [00:00