From aae9723dad39c2c08072ffe20daffa49292fd603 Mon Sep 17 00:00:00 2001
From: Donny Winston <donny@polyneme.xyz>
Date: Thu, 21 Nov 2024 20:35:18 +0100
Subject: [PATCH 1/7] todo: progress

---
 Makefile                                      |   2 +-
 ...lidation_referential_integrity_check.ipynb | 187 +++++++++++-------
 2 files changed, 122 insertions(+), 67 deletions(-)

diff --git a/Makefile b/Makefile
index f3779003..85ba107b 100644
--- a/Makefile
+++ b/Makefile
@@ -101,7 +101,7 @@ mongorestore-nmdc-db:
 	mkdir -p /tmp/remote-mongodump/nmdc
 	# SSH into the remote server, stream the dump directory as a gzipped tar archive, and extract it locally.
 	ssh -i ~/.ssh/nersc ${NERSC_USERNAME}@dtn01.nersc.gov \
-		'tar -czf - -C /global/cfs/projectdirs/m3408/nmdc-mongodumps/dump_nmdc-prod_2024-07-29_20-12-07/nmdc .' \
+		'tar -czf - -C /global/cfs/projectdirs/m3408/nmdc-mongodumps/dump_nmdc-prod_2024-11-20_20-12-02/nmdc .' \
 		| tar -xzv -C /tmp/remote-mongodump/nmdc
 	mongorestore -v -h localhost:27018 -u admin -p root --authenticationDatabase=admin \
 		--drop --nsInclude='nmdc.*' --dir /tmp/remote-mongodump
diff --git a/docs/nb/bulk_validation_referential_integrity_check.ipynb b/docs/nb/bulk_validation_referential_integrity_check.ipynb
index 06a01ec8..b616da32 100644
--- a/docs/nb/bulk_validation_referential_integrity_check.ipynb
+++ b/docs/nb/bulk_validation_referential_integrity_check.ipynb
@@ -37,7 +37,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "mongodb://localhost:27018\n"
+      "localhost:27018\n"
      ]
     }
    ],
@@ -93,7 +93,7 @@
     {
      "data": {
       "text/plain": [
-       "'11.0.0rc22'"
+       "'11.1.0'"
       ]
      },
      "execution_count": 3,
@@ -161,7 +161,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "{'study_set', 'workflow_execution_set', 'material_processing_set', 'instrument_set', 'data_object_set', 'configuration_set', 'biosample_set', 'functional_annotation_agg', 'calibration_set', 'processed_sample_set', 'field_research_site_set', 'data_generation_set'}\n"
+      "{'data_object_set', 'functional_annotation_agg', 'material_processing_set', 'workflow_execution_set', 'calibration_set', 'data_generation_set', 'configuration_set', 'processed_sample_set', 'instrument_set', 'biosample_set', 'study_set', 'field_research_site_set'}\n"
      ]
     }
    ],
@@ -279,7 +279,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 11,
    "id": "829039e5-7abe-4c50-ba44-e384b45b7535",
    "metadata": {
     "scrolled": true
@@ -288,12 +288,12 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "6c88577a3a9342808d3bbc0e3707a95a",
+       "model_id": "d798cf56b8b541598d246c023543d29a",
        "version_major": 2,
        "version_minor": 0
       },
       "text/plain": [
-       "  0%|          | 0/2351449 [00:00<?, ?it/s]"
+       "  0%|          | 0/27762723 [00:00<?, ?it/s]"
       ]
      },
      "metadata": {},
@@ -334,7 +334,57 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 15,
+   "id": "11ac53f2-057e-471b-a260-f5a65e3361af",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# From https://github.com/microbiomedata/refscan/blob/af092b0e068b671849fe0f323fac2ed54b81d574/refscan/lib/helpers.py#L141-L176\n",
+    "\n",
+    "from typing import List\n",
+    "from linkml_runtime import linkml_model\n",
+    "\n",
+    "def get_names_of_classes_in_effective_range_of_slot(\n",
+    "    schema_view: SchemaView, slot_definition: linkml_model.SlotDefinition\n",
+    ") -> List[str]:\n",
+    "    r\"\"\"\n",
+    "    Determine the slot's \"effective\" range, by taking into account its `any_of` constraints (if defined).\n",
+    "\n",
+    "    Note: The `any_of` constraints constrain the slot's \"effective\" range beyond that described by the\n",
+    "          induced slot definition's `range` attribute. `SchemaView` does not seem to provide the result\n",
+    "          of applying those additional constraints, so we do it manually here (if any are defined).\n",
+    "          Reference: https://github.com/orgs/linkml/discussions/2101#discussion-6625646\n",
+    "\n",
+    "    Reference: https://linkml.io/linkml-model/latest/docs/any_of/\n",
+    "    \"\"\"\n",
+    "\n",
+    "    # Initialize the list to be empty.\n",
+    "    names_of_eligible_target_classes = []\n",
+    "\n",
+    "    # If the `any_of` constraint is defined on this slot, use that instead of the `range`.\n",
+    "    if \"any_of\" in slot_definition and len(slot_definition.any_of) > 0:\n",
+    "        for slot_expression in slot_definition.any_of:\n",
+    "            # Use the slot expression's `range` to get the specified eligible class name\n",
+    "            # and the names of all classes that inherit from that eligible class.\n",
+    "            if slot_expression.range in schema_view.all_classes():\n",
+    "                own_and_descendant_class_names = schema_view.class_descendants(slot_expression.range)\n",
+    "                names_of_eligible_target_classes.extend(own_and_descendant_class_names)\n",
+    "    else:\n",
+    "        # Use the slot's `range` to get the specified eligible class name\n",
+    "        # and the names of all classes that inherit from that eligible class.\n",
+    "        if slot_definition.range in schema_view.all_classes():\n",
+    "            own_and_descendant_class_names = schema_view.class_descendants(slot_definition.range)\n",
+    "            names_of_eligible_target_classes.extend(own_and_descendant_class_names)\n",
+    "\n",
+    "    # Remove duplicate class names.\n",
+    "    names_of_eligible_target_classes = list(set(names_of_eligible_target_classes))\n",
+    "\n",
+    "    return names_of_eligible_target_classes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
    "id": "98fbfdff-51d6-42c5-9448-3b4616a2c9cb",
    "metadata": {},
    "outputs": [],
@@ -345,13 +395,13 @@
     "document_reference_ranged_slots = defaultdict(list)\n",
     "for cls_name, slot_map in cls_slot_map.items():\n",
     "    for slot_name, slot in slot_map.items():\n",
-    "        if str(slot.range) in document_referenceable_ranges:\n",
+    "        if set(get_names_of_classes_in_effective_range_of_slot(schema_view, slot)) & document_referenceable_ranges:\n",
     "            document_reference_ranged_slots[cls_name].append(slot_name)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 17,
    "id": "d253c567-533f-440f-8376-03a6e1e905cf",
    "metadata": {},
    "outputs": [],
@@ -371,7 +421,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 40,
    "id": "b2e618f3-78b9-42b6-8ea9-63d080b1b0f6",
    "metadata": {
     "scrolled": true
@@ -380,12 +430,12 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "f63a4ce942bc4278b3e99a5a87b0155c",
+       "model_id": "158c79049a2c43c6b04904bc325946ec",
        "version_major": 2,
        "version_minor": 0
       },
       "text/plain": [
-       "  0%|          | 0/2351449 [00:00<?, ?it/s]"
+       "  0%|          | 0/27762723 [00:00<?, ?it/s]"
       ]
      },
      "metadata": {},
@@ -412,9 +462,9 @@
     "    requests = []\n",
     "    for doc in mdb[coll_name].find():\n",
     "        doc_type = doc_cls(doc, coll_name=coll_name)\n",
-    "        slots_to_include = [\"id\"] + document_reference_ranged_slots[doc_type]\n",
+    "        slots_to_include = [\"id\", \"type\"] + document_reference_ranged_slots[doc_type]\n",
     "        new_doc = pick(slots_to_include, doc)\n",
-    "        new_doc[\"type\"] = schema_view.class_ancestors(doc_type)\n",
+    "        new_doc[\"_type_and_ancestors\"] = schema_view.class_ancestors(doc_type)\n",
     "        requests.append(InsertOne(new_doc))\n",
     "        if len(requests) == 1000: # ensure bulk-write batches aren't too huge\n",
     "            result = mdb.alldocs.bulk_write(requests, ordered=False)\n",
@@ -467,7 +517,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 27,
    "id": "d0374653-c074-4a87-aef8-24323a5a63b3",
    "metadata": {},
    "outputs": [],
@@ -482,7 +532,10 @@
     "        for field, value in doc.items():\n",
     "            if field in (\"_id\", \"id\", \"type\"):\n",
     "                continue\n",
-    "            slot_range = str(cls_slot_map[doc[\"type\"][0]][field].range) # assumes upstream doc type is listed first.\n",
+    "            acceptable_slot_classes = get_names_of_classes_in_effective_range_of_slot(\n",
+    "                schema_view,\n",
+    "                cls_slot_map[doc[\"type\"][field],\n",
+    "            )\n",
     "            if not isinstance(value, list):\n",
     "                value = [value]\n",
     "            for v in value:\n",
@@ -491,7 +544,7 @@
     "                    \"id_is_nmdc_id\": \"id\" in doc,\n",
     "                    \"field\": field,\n",
     "                    \"value\": v,\n",
-    "                    \"slot_range\": slot_range,\n",
+    "                    \"acceptable_slot_classes\": acceptable_slot_classes,\n",
     "                })\n",
     "                if len(rv) == 1000:\n",
     "                    yield rv\n",
@@ -503,7 +556,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 29,
    "id": "103d70b6-24ab-41bd-8b7f-d2faaa028bdf",
    "metadata": {
     "scrolled": true
@@ -512,12 +565,12 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "c8c75b6bd622470f9ded8e3813fc1d64",
+       "model_id": "483900e4d6bf4e46ab4a36ae0fb3f8a1",
        "version_major": 2,
        "version_minor": 0
       },
       "text/plain": [
-       "  0%|          | 0/3039449 [00:00<?, ?it/s]"
+       "  0%|          | 0/27762723 [00:00<?, ?it/s]"
       ]
      },
      "metadata": {},
@@ -529,9 +582,11 @@
     "\n",
     "def doc_field_value_errors(assertions):\n",
     "    errors = {\"not_found\": [], \"invalid_type\": []}\n",
+    "    # collect a list of assertions that concern a specific referenced \"id\".\n",
     "    assertions_by_referenced_id_value = defaultdict(list)\n",
     "    for a in assertions:\n",
     "        assertions_by_referenced_id_value[a[\"value\"]].append(a)\n",
+    "    # find the claimed type for every document id that is referenced by another document.\n",
     "    doc_id_types = {}\n",
     "    for d in list(mdb.alldocs.find({\"id\": {\"$in\": list(assertions_by_referenced_id_value.keys())}}, {\"_id\": 0, \"id\": 1, \"type\": 1})):\n",
     "        doc_id_types[d[\"id\"]] = d[\"type\"]\n",
@@ -541,7 +596,9 @@
     "            errors[\"not_found\"].extend(id_value_assertions)\n",
     "        else:\n",
     "            for a in id_value_assertions:\n",
-    "                if a[\"slot_range\"] not in doc_id_types[a[\"value\"]]:\n",
+    "                # check that the observed type (or any of its ancestors) for this id reference\n",
+    "                # is in fact allowed by the referring slot's schema definition.\n",
+    "                if not (set(a[\"acceptable_slot_classes\"]) & set(doc_id_types[a[\"value\"]])):\n",
     "                    errors[\"invalid_type\"].append(a)\n",
     "\n",
     "    return errors\n",
@@ -576,17 +633,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 30,
    "id": "e01450d1-3369-4fc5-80be-9787e00a6597",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "(5, 45604)"
+       "(4, 0)"
       ]
      },
-     "execution_count": 16,
+     "execution_count": 30,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -608,17 +665,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 31,
    "id": "afd25543-1cb3-4887-9aba-0086d4b998a6",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "{'nmdc:dobj-11-cvcxxr53', 'nmdc:dobj-11-fg28a080', 'nmdc:dobj-11-gxgpbv06'}"
+       "{'nmdc:dobj-11-achfhn33', 'nmdc:dobj-11-dpnhb305'}"
       ]
      },
-     "execution_count": 17,
+     "execution_count": 31,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -629,47 +686,24 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 38,
    "id": "a25857f4-e26e-4896-9e5f-607e7b4bb07c",
    "metadata": {},
    "outputs": [
     {
-     "data": {
-      "text/plain": [
-       "[{'id': 'nmdc:wfmgan-11-w1d6gy98.1',\n",
-       "  'id_is_nmdc_id': True,\n",
-       "  'field': 'has_input',\n",
-       "  'value': 'nmdc:dobj-11-cvcxxr53',\n",
-       "  'slot_range': 'NamedThing'},\n",
-       " {'id': 'nmdc:wfmgan-11-fmymf551.1',\n",
-       "  'id_is_nmdc_id': True,\n",
-       "  'field': 'has_input',\n",
-       "  'value': 'nmdc:dobj-11-fg28a080',\n",
-       "  'slot_range': 'NamedThing'},\n",
-       " {'id': 'nmdc:wfmgan-11-3nkefn97.1',\n",
-       "  'id_is_nmdc_id': True,\n",
-       "  'field': 'has_input',\n",
-       "  'value': 'nmdc:dobj-11-gxgpbv06',\n",
-       "  'slot_range': 'NamedThing'},\n",
-       " {'id': 'nmdc:wfmgan-11-fmymf551.1',\n",
-       "  'id_is_nmdc_id': True,\n",
-       "  'field': 'has_input',\n",
-       "  'value': 'nmdc:dobj-11-fg28a080',\n",
-       "  'slot_range': 'NamedThing'},\n",
-       " {'id': 'nmdc:wfmgan-11-3nkefn97.1',\n",
-       "  'id_is_nmdc_id': True,\n",
-       "  'field': 'has_input',\n",
-       "  'value': 'nmdc:dobj-11-gxgpbv06',\n",
-       "  'slot_range': 'NamedThing'}]"
-      ]
-     },
-     "execution_count": 18,
-     "metadata": {},
-     "output_type": "execute_result"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "nmdc:wfmgan-11-c516q834.1 has_input nmdc:dobj-11-dpnhb305\n",
+      "nmdc:wfmgan-11-yzp9eq74.1 has_input nmdc:dobj-11-achfhn33\n",
+      "nmdc:wfmgan-11-c516q834.1 has_input nmdc:dobj-11-dpnhb305\n",
+      "nmdc:wfmgan-11-yzp9eq74.1 has_input nmdc:dobj-11-achfhn33\n"
+     ]
     }
    ],
    "source": [
-    "errors[\"not_found\"][:5]"
+    "for e in errors[\"not_found\"]:\n",
+    "    print(e[\"id\"], e['field'], e['value'])"
    ]
   },
   {
@@ -682,7 +716,28 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 39,
+   "id": "f8788551-a9b1-4915-a23d-74cfcbe62ec1",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[]"
+      ]
+     },
+     "execution_count": 39,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "errors[\"invalid_type\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
    "id": "33516e3c-f10d-4c30-942b-0d01d06082f9",
    "metadata": {},
    "outputs": [
@@ -690,7 +745,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "{'id': 'nmdc:dobj-11-xt088e26', 'id_is_nmdc_id': True, 'field': 'was_generated_by', 'value': 'nmdc:omprc-11-ymxzx274', 'slot_range': 'WorkflowExecution'}\n"
+      "{'id': 'nmdc:dobj-11-enyrng31', 'id_is_nmdc_id': True, 'field': 'was_generated_by', 'value': 'nmdc:omprc-11-2et99h53', 'slot_range': 'WorkflowExecution'}\n"
      ]
     }
    ],
@@ -769,9 +824,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "nmdc-runtime",
    "language": "python",
-   "name": "python3"
+   "name": "nmdc-runtime"
   },
   "language_info": {
    "codemirror_mode": {
@@ -783,7 +838,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.14"
+   "version": "3.10.12"
   }
  },
  "nbformat": 4,

From 3aa160f089bc52942766154b0777fe2e58863a8f Mon Sep 17 00:00:00 2001
From: Donny Winston <donny@polyneme.xyz>
Date: Thu, 21 Nov 2024 21:58:02 +0100
Subject: [PATCH 2/7] todo: progress

---
 ...lidation_referential_integrity_check.ipynb | 183 +++---------------
 1 file changed, 26 insertions(+), 157 deletions(-)

diff --git a/docs/nb/bulk_validation_referential_integrity_check.ipynb b/docs/nb/bulk_validation_referential_integrity_check.ipynb
index b616da32..bfe16603 100644
--- a/docs/nb/bulk_validation_referential_integrity_check.ipynb
+++ b/docs/nb/bulk_validation_referential_integrity_check.ipynb
@@ -517,7 +517,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 55,
    "id": "d0374653-c074-4a87-aef8-24323a5a63b3",
    "metadata": {},
    "outputs": [],
@@ -530,11 +530,11 @@
     "    for doc in mdb.alldocs.find(limit=limit):\n",
     "        # Iterate over each key/value pair in the dictionary (document).\n",
     "        for field, value in doc.items():\n",
-    "            if field in (\"_id\", \"id\", \"type\"):\n",
+    "            if field.startswith(\"_\") or field in (\"id\", \"type\"):\n",
     "                continue\n",
     "            acceptable_slot_classes = get_names_of_classes_in_effective_range_of_slot(\n",
     "                schema_view,\n",
-    "                cls_slot_map[doc[\"type\"][field],\n",
+    "                cls_slot_map[doc[\"type\"][5:]][field],\n",
     "            )\n",
     "            if not isinstance(value, list):\n",
     "                value = [value]\n",
@@ -556,7 +556,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 56,
    "id": "103d70b6-24ab-41bd-8b7f-d2faaa028bdf",
    "metadata": {
     "scrolled": true
@@ -565,7 +565,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "483900e4d6bf4e46ab4a36ae0fb3f8a1",
+       "model_id": "bebb0ddb0055428f875494dbc0412d88",
        "version_major": 2,
        "version_minor": 0
       },
@@ -580,25 +580,26 @@
    "source": [
     "from pprint import pprint\n",
     "\n",
+    "alldocs_ids = set(mdb.alldocs.distinct(\"id\"))\n",
+    "\n",
     "def doc_field_value_errors(assertions):\n",
     "    errors = {\"not_found\": [], \"invalid_type\": []}\n",
-    "    # collect a list of assertions that concern a specific referenced \"id\".\n",
+    "    # group assertions by referenced \"id\" value.\n",
     "    assertions_by_referenced_id_value = defaultdict(list)\n",
     "    for a in assertions:\n",
     "        assertions_by_referenced_id_value[a[\"value\"]].append(a)\n",
-    "    # find the claimed type for every document id that is referenced by another document.\n",
+    "    # associate each referenced document id with its type.\n",
     "    doc_id_types = {}\n",
     "    for d in list(mdb.alldocs.find({\"id\": {\"$in\": list(assertions_by_referenced_id_value.keys())}}, {\"_id\": 0, \"id\": 1, \"type\": 1})):\n",
     "        doc_id_types[d[\"id\"]] = d[\"type\"]\n",
     "\n",
     "    for id_value, id_value_assertions in assertions_by_referenced_id_value.items():\n",
-    "        if id_value not in doc_id_types:\n",
+    "        if id_value not in alldocs_ids:\n",
     "            errors[\"not_found\"].extend(id_value_assertions)\n",
     "        else:\n",
     "            for a in id_value_assertions:\n",
-    "                # check that the observed type (or any of its ancestors) for this id reference\n",
-    "                # is in fact allowed by the referring slot's schema definition.\n",
-    "                if not (set(a[\"acceptable_slot_classes\"]) & set(doc_id_types[a[\"value\"]])):\n",
+    "                # check that the document-reported type for this id reference is kosher as per the referring slot's schema definition.\n",
+    "                if doc_id_types[a[\"value\"]][5:] not in a[\"acceptable_slot_classes\"]:\n",
     "                    errors[\"invalid_type\"].append(a)\n",
     "\n",
     "    return errors\n",
@@ -633,17 +634,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": 57,
    "id": "e01450d1-3369-4fc5-80be-9787e00a6597",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "(4, 0)"
+       "(12, 0)"
       ]
      },
-     "execution_count": 30,
+     "execution_count": 57,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -655,170 +656,38 @@
     "# results with v10.5.5: (33, 6900)"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "id": "54a560df",
-   "metadata": {},
-   "source": [
-    "Display a few errors from one of the lists, as an example."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 31,
-   "id": "afd25543-1cb3-4887-9aba-0086d4b998a6",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'nmdc:dobj-11-achfhn33', 'nmdc:dobj-11-dpnhb305'}"
-      ]
-     },
-     "execution_count": 31,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "{e[\"value\"] for e in errors[\"not_found\"]}"
-   ]
-  },
   {
    "cell_type": "code",
-   "execution_count": 38,
+   "execution_count": 73,
    "id": "a25857f4-e26e-4896-9e5f-607e7b4bb07c",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "nmdc:wfmgan-11-c516q834.1 has_input nmdc:dobj-11-dpnhb305\n",
-      "nmdc:wfmgan-11-yzp9eq74.1 has_input nmdc:dobj-11-achfhn33\n",
-      "nmdc:wfmgan-11-c516q834.1 has_input nmdc:dobj-11-dpnhb305\n",
-      "nmdc:wfmgan-11-yzp9eq74.1 has_input nmdc:dobj-11-achfhn33\n"
-     ]
-    }
-   ],
-   "source": [
-    "for e in errors[\"not_found\"]:\n",
-    "    print(e[\"id\"], e['field'], e['value'])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "2bd191cd",
-   "metadata": {},
-   "source": [
-    "Display an example `invalid_type` errors for each of the set of expected types that are not being found:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 39,
-   "id": "f8788551-a9b1-4915-a23d-74cfcbe62ec1",
-   "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "[]"
+       "{'nmdc:wfmag-11-8s9xk838.1 / has_input / nmdc:dobj-11-gxgpbv06',\n",
+       " 'nmdc:wfmag-11-8s9xk838.1 / has_input / nmdc:dobj-11-kr8ev105',\n",
+       " 'nmdc:wfmag-11-8s9xk838.1 / has_input / nmdc:dobj-11-whq9ph06',\n",
+       " 'nmdc:wfmag-11-dchy6q29.1 / has_input / nmdc:dobj-11-1wzar939',\n",
+       " 'nmdc:wfmag-11-dchy6q29.1 / has_input / nmdc:dobj-11-fg28a080',\n",
+       " 'nmdc:wfmag-11-dchy6q29.1 / has_input / nmdc:dobj-11-s4hp2x64'}"
       ]
      },
-     "execution_count": 39,
+     "execution_count": 73,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "errors[\"invalid_type\"]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 24,
-   "id": "33516e3c-f10d-4c30-942b-0d01d06082f9",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "{'id': 'nmdc:dobj-11-enyrng31', 'id_is_nmdc_id': True, 'field': 'was_generated_by', 'value': 'nmdc:omprc-11-2et99h53', 'slot_range': 'WorkflowExecution'}\n"
-     ]
-    }
-   ],
-   "source": [
-    "slot_range_examples = {}\n",
-    "for e in errors[\"invalid_type\"]:\n",
-    "    slot_range_examples[e[\"slot_range\"]] = e\n",
-    "\n",
-    "for ex in slot_range_examples.values():\n",
-    "    print(ex)"
+    "set(f\"{e['id']} / {e['field']} / {e['value']}\" for e in errors[\"not_found\"])"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "d4abec53",
+   "id": "b149872d-5814-4a73-ac5e-cc75fb578a01",
    "metadata": {},
    "source": [
-    "Spot check one of those errors."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 20,
-   "id": "29ec7e82-d079-4525-bd7b-d770fd69d788",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'_id': ObjectId('66edad78007ef07eb670a09d'),\n",
-       " 'id': 'nmdc:omprc-11-sxze4w22',\n",
-       " 'has_input': ['nmdc:bsm-11-978cs285'],\n",
-       " 'has_output': ['nmdc:dobj-11-1epz0d53'],\n",
-       " 'associated_studies': ['nmdc:sty-11-28tm5d36'],\n",
-       " 'instrument_used': ['nmdc:inst-14-mwrrj632'],\n",
-       " 'type': ['MassSpectrometry',\n",
-       "  'DataGeneration',\n",
-       "  'PlannedProcess',\n",
-       "  'NamedThing']}"
-      ]
-     },
-     "execution_count": 20,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# OmicsProcessing is not subclass of Activity\n",
-    "mdb.alldocs.find_one({\"id\": \"nmdc:omprc-11-sxze4w22\"})"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 21,
-   "id": "802290e0-58dd-4fbd-835a-c9928006819d",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'_id': ObjectId('66edad78007ef07eb67078c8'),\n",
-       " 'id': 'nmdc:procsm-11-v5sykd35',\n",
-       " 'type': ['ProcessedSample', 'MaterialEntity', 'NamedThing']}"
-      ]
-     },
-     "execution_count": 21,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# ProcessedSample is not subclass of Biosample\n",
-    "mdb.alldocs.find_one({\"id\": \"nmdc:procsm-11-v5sykd35\"})"
+    "TODO: why the double assertions?"
    ]
   }
  ],

From 700c27c505e7adf7d5aed48d46fe2aad7760710d Mon Sep 17 00:00:00 2001
From: Donny Winston <donny@polyneme.xyz>
Date: Tue, 26 Nov 2024 20:07:00 +0100
Subject: [PATCH 3/7] fix: ThreadPoolExecutor is evil?

closes #576
---
 Makefile                                      |   2 +-
 ...lidation_referential_integrity_check.ipynb | 234 ++----------------
 2 files changed, 28 insertions(+), 208 deletions(-)

diff --git a/Makefile b/Makefile
index 85ba107b..8d49d064 100644
--- a/Makefile
+++ b/Makefile
@@ -101,7 +101,7 @@ mongorestore-nmdc-db:
 	mkdir -p /tmp/remote-mongodump/nmdc
 	# SSH into the remote server, stream the dump directory as a gzipped tar archive, and extract it locally.
 	ssh -i ~/.ssh/nersc ${NERSC_USERNAME}@dtn01.nersc.gov \
-		'tar -czf - -C /global/cfs/projectdirs/m3408/nmdc-mongodumps/dump_nmdc-prod_2024-11-20_20-12-02/nmdc .' \
+		'tar -czf - -C /global/cfs/projectdirs/m3408/nmdc-mongodumps/dump_nmdc-prod_2024-11-25_20-12-02/nmdc .' \
 		| tar -xzv -C /tmp/remote-mongodump/nmdc
 	mongorestore -v -h localhost:27018 -u admin -p root --authenticationDatabase=admin \
 		--drop --nsInclude='nmdc.*' --dir /tmp/remote-mongodump
diff --git a/docs/nb/bulk_validation_referential_integrity_check.ipynb b/docs/nb/bulk_validation_referential_integrity_check.ipynb
index bfe16603..9defa5e5 100644
--- a/docs/nb/bulk_validation_referential_integrity_check.ipynb
+++ b/docs/nb/bulk_validation_referential_integrity_check.ipynb
@@ -140,15 +140,7 @@
    "id": "bcb5802b-8205-49b7-8784-dc137baff1a0",
    "metadata": {},
    "source": [
-    "## Check for errors in the database"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "1ab96cda-30ab-4e93-a0b1-3a936599305d",
-   "metadata": {},
-   "source": [
-    "The `nmdc_schema_collection_names` function returns the populated (having at least one document) set-intersection of (a) the set of collection names present in the Mongo database and (b) the set of Database slots in the schema that correspond to a collection (defined as being multivalued and values being inlined as a list)."
+    "## Create slot mappings"
    ]
   },
   {
@@ -156,18 +148,9 @@
    "execution_count": 5,
    "id": "1d76b70e-4412-4b17-9db9-322ac791859a",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "{'data_object_set', 'functional_annotation_agg', 'material_processing_set', 'workflow_execution_set', 'calibration_set', 'data_generation_set', 'configuration_set', 'processed_sample_set', 'instrument_set', 'biosample_set', 'study_set', 'field_research_site_set'}\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "collection_names = get_nonempty_nmdc_schema_collection_names(mdb)\n",
-    "print(collection_names)"
+    "collection_names = populated_schema_collection_names_with_id_field(mdb) # `get_nonempty_nmdc_schema_collection_names` to include \"functional_annotation_agg\""
    ]
   },
   {
@@ -211,107 +194,6 @@
     "}"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "id": "12e7d00e-0ec4-45de-b0da-1b618ef7e80b",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def collect_errors(note_doc_field_errors):\n",
-    "    errors = {\"bad_type\": [], \"no_type\": [], \"bad_slot\": [], \"is_null\": []}\n",
-    "    n_docs_total = sum(mdb[coll_name].estimated_document_count() for coll_name in collection_names)\n",
-    "    pbar = tqdm(total=n_docs_total)\n",
-    "    n_errors_cache = 0\n",
-    "    for coll_name in sorted(collection_names):\n",
-    "        cls_names = collection_name_to_class_names[coll_name]\n",
-    "        pbar.set_description(f\"processing {coll_name}...\")\n",
-    "        # Iterate over each document (as a dictionary) in this collection.\n",
-    "        for doc in mdb[coll_name].find():\n",
-    "            doc = dissoc(doc, \"_id\")\n",
-    "            \n",
-    "            # Ensure we know the document's type.\n",
-    "            cls_name = None\n",
-    "            cls_type_match = re.match(r\"^nmdc:(?P<name>.+)\", doc.get(\"type\", \"\"))\n",
-    "            if cls_type_match is not None:\n",
-    "                cls_name = cls_type_match.group(\"name\")\n",
-    "                if cls_name not in cls_names:\n",
-    "                    errors[\"bad_type\"].append(f\"{coll_name} doc {doc['id']}: doc type {cls_name} not in those allowed for {coll_name}, i.e. {cls_names}.\")\n",
-    "                    cls_name = None\n",
-    "            elif len(cls_names) == 1:\n",
-    "                cls_name = cls_names[0]\n",
-    "            else:\n",
-    "                errors[\"no_type\"].append(f\"{coll_name} doc {doc['id']}: 'type' not set.\")\n",
-    "\n",
-    "            if cls_name is not None:        \n",
-    "                slot_map = cls_slot_map[cls_name]\n",
-    "                # Iterate over each key/value pair in the dictionary (document).\n",
-    "                for field, value in doc.items():\n",
-    "                    if field in slot_map:\n",
-    "                        if not isinstance(value, list):\n",
-    "                            value = [value]\n",
-    "                        for v in value:\n",
-    "                            note_doc_field_errors(value=v,field=field,doc=doc,coll_name=coll_name,errors=errors)                \n",
-    "                    else:\n",
-    "                        errors[\"bad_slot\"].append(f\"{coll_name} doc {doc['id']}: field '{field}' not a valid slot\")\n",
-    "            pbar.update(1)\n",
-    "            n_errors = sum([len(v) for v in errors.values()])\n",
-    "            if n_errors > n_errors_cache:\n",
-    "                print(f\"{n_errors} errors so far...\")\n",
-    "                n_errors_cache = n_errors\n",
-    "    pbar.close()\n",
-    "    return errors"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "id": "14afb4c6-b0b7-4fd7-8e2f-13c682c74409",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def note_doc_field_errors(value=None, field=None, doc=None, coll_name=None, errors=None):\n",
-    "    # No fields should be null-valued.\n",
-    "    # Example of how this may happen: JSON serialization from pydantic models may set optional fields to `null`.\n",
-    "    if value is None:\n",
-    "        errors[\"is_null\"].append(f\"{coll_name} doc {doc['id']}: field {field} is null.\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "id": "829039e5-7abe-4c50-ba44-e384b45b7535",
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "d798cf56b8b541598d246c023543d29a",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "  0%|          | 0/27762723 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "{'bad_type': [], 'no_type': [], 'bad_slot': [], 'is_null': []}\n"
-     ]
-    }
-   ],
-   "source": [
-    "errors = collect_errors(note_doc_field_errors)\n",
-    "print(errors)"
-   ]
-  },
   {
    "cell_type": "markdown",
    "id": "21c2f771-b8da-466a-90e8-2c17ac5e6388",
@@ -334,7 +216,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 8,
    "id": "11ac53f2-057e-471b-a260-f5a65e3361af",
    "metadata": {},
    "outputs": [],
@@ -384,7 +266,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 9,
    "id": "98fbfdff-51d6-42c5-9448-3b4616a2c9cb",
    "metadata": {},
    "outputs": [],
@@ -401,27 +283,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
-   "id": "d253c567-533f-440f-8376-03a6e1e905cf",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def doc_cls(doc, coll_name=None):\n",
-    "    \"\"\"Return unprefixed name of document class.\n",
-    "\n",
-    "    Try to get from doc['type'] (lopping off 'nmdc:' prefix).\n",
-    "    Else, if can unambiguously infer type given coll_name, use that.\n",
-    "    Else, return None.\n",
-    "    \"\"\"\n",
-    "    if 'type' in doc:\n",
-    "        return doc['type'][5:] # lop off \"nmdc:\" prefix\n",
-    "    elif coll_name and len(collection_name_to_class_names[coll_name]) == 1:\n",
-    "        return collection_name_to_class_names[coll_name][0]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 40,
+   "execution_count": 10,
    "id": "b2e618f3-78b9-42b6-8ea9-63d080b1b0f6",
    "metadata": {
     "scrolled": true
@@ -430,12 +292,12 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "158c79049a2c43c6b04904bc325946ec",
+       "model_id": "47b7ed79df384cf98c527651021fe2cd",
        "version_major": 2,
        "version_minor": 0
       },
       "text/plain": [
-       "  0%|          | 0/27762723 [00:00<?, ?it/s]"
+       "  0%|          | 0/163175 [00:00<?, ?it/s]"
       ]
      },
      "metadata": {},
@@ -453,6 +315,8 @@
     "# Drop any existing `alldocs` collection (e.g. from previous use of this notebook).\n",
     "mdb.alldocs.drop()\n",
     "\n",
+    "BULK_WRITE_BATCH_SIZE = 2_000 # ensure bulk-write batches aren't too huge\n",
+    "\n",
     "# Set up progress bar\n",
     "n_docs_total = sum(mdb[name].estimated_document_count() for name in collection_names)\n",
     "pbar = tqdm(total=n_docs_total)\n",
@@ -461,12 +325,12 @@
     "    pbar.set_description(f\"processing {coll_name}...\")\n",
     "    requests = []\n",
     "    for doc in mdb[coll_name].find():\n",
-    "        doc_type = doc_cls(doc, coll_name=coll_name)\n",
+    "        doc_type = doc['type'][5:] # lop off \"nmdc:\" prefix\n",
     "        slots_to_include = [\"id\", \"type\"] + document_reference_ranged_slots[doc_type]\n",
     "        new_doc = pick(slots_to_include, doc)\n",
     "        new_doc[\"_type_and_ancestors\"] = schema_view.class_ancestors(doc_type)\n",
     "        requests.append(InsertOne(new_doc))\n",
-    "        if len(requests) == 1000: # ensure bulk-write batches aren't too huge\n",
+    "        if len(requests) == BULK_WRITE_BATCH_SIZE: \n",
     "            result = mdb.alldocs.bulk_write(requests, ordered=False)\n",
     "            pbar.update(result.inserted_count)\n",
     "            requests.clear()\n",
@@ -517,13 +381,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 55,
+   "execution_count": 11,
    "id": "d0374653-c074-4a87-aef8-24323a5a63b3",
    "metadata": {},
    "outputs": [],
    "source": [
-    "def doc_assertions(limit=0):\n",
-    "    \"\"\"Yields batches of 1000 assertions to greatly speed up processing.\"\"\"\n",
+    "def doc_assertions(limit=0, batch_size=2_000):\n",
+    "    \"\"\"Yields batches of assertions to greatly speed up processing.\"\"\"\n",
     "    # Initialize progress bar.\n",
     "    pbar = tqdm(total=(mdb.alldocs.estimated_document_count() if limit == 0 else limit))\n",
     "    rv = []\n",
@@ -546,7 +410,7 @@
     "                    \"value\": v,\n",
     "                    \"acceptable_slot_classes\": acceptable_slot_classes,\n",
     "                })\n",
-    "                if len(rv) == 1000:\n",
+    "                if len(rv) == batch_size:\n",
     "                    yield rv\n",
     "                    rv.clear()\n",
     "        pbar.update(1)\n",
@@ -556,7 +420,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 56,
+   "execution_count": 12,
    "id": "103d70b6-24ab-41bd-8b7f-d2faaa028bdf",
    "metadata": {
     "scrolled": true
@@ -565,12 +429,12 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "bebb0ddb0055428f875494dbc0412d88",
+       "model_id": "412f98740ae7449399a874b46adae748",
        "version_major": 2,
        "version_minor": 0
       },
       "text/plain": [
-       "  0%|          | 0/27762723 [00:00<?, ?it/s]"
+       "  0%|          | 0/163175 [00:00<?, ?it/s]"
       ]
      },
      "metadata": {},
@@ -608,18 +472,10 @@
     "# Initialize \"global\" error lists.\n",
     "errors = {\"not_found\": [], \"invalid_type\": []}\n",
     "\n",
-    "# Use a with statement to ensure threads are cleaned up promptly\n",
-    "with concurrent.futures.ThreadPoolExecutor(max_workers=None) as executor:\n",
-    "    future_to_errors = {executor.submit(doc_field_value_errors, das): das for das in doc_assertions()}\n",
-    "    for future in concurrent.futures.as_completed(future_to_errors):\n",
-    "        doc_asserts = future_to_errors[future]\n",
-    "        try:\n",
-    "            data = future.result()\n",
-    "        except Exception as exc:\n",
-    "            print(\"exception:\", str(exc))\n",
-    "        else:\n",
-    "            errors[\"not_found\"].extend(data[\"not_found\"])\n",
-    "            errors[\"invalid_type\"].extend(data[\"invalid_type\"])"
+    "for das in doc_assertions(batch_size=2_000):\n",
+    "    rv = doc_field_value_errors(das)\n",
+    "    errors[\"not_found\"].extend(rv[\"not_found\"])\n",
+    "    errors[\"invalid_type\"].extend(rv[\"invalid_type\"])"
    ]
   },
   {
@@ -634,60 +490,24 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 57,
+   "execution_count": 13,
    "id": "e01450d1-3369-4fc5-80be-9787e00a6597",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "(12, 0)"
+       "(33, 0)"
       ]
      },
-     "execution_count": 57,
+     "execution_count": 13,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "len(errors[\"not_found\"]), len(errors[\"invalid_type\"])\n",
-    "# results prior to re-id-ing: (4857, 23503)\n",
-    "# results prior to v10.5.5: (33, 20488)\n",
-    "# results with v10.5.5: (33, 6900)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 73,
-   "id": "a25857f4-e26e-4896-9e5f-607e7b4bb07c",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'nmdc:wfmag-11-8s9xk838.1 / has_input / nmdc:dobj-11-gxgpbv06',\n",
-       " 'nmdc:wfmag-11-8s9xk838.1 / has_input / nmdc:dobj-11-kr8ev105',\n",
-       " 'nmdc:wfmag-11-8s9xk838.1 / has_input / nmdc:dobj-11-whq9ph06',\n",
-       " 'nmdc:wfmag-11-dchy6q29.1 / has_input / nmdc:dobj-11-1wzar939',\n",
-       " 'nmdc:wfmag-11-dchy6q29.1 / has_input / nmdc:dobj-11-fg28a080',\n",
-       " 'nmdc:wfmag-11-dchy6q29.1 / has_input / nmdc:dobj-11-s4hp2x64'}"
-      ]
-     },
-     "execution_count": 73,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "set(f\"{e['id']} / {e['field']} / {e['value']}\" for e in errors[\"not_found\"])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "b149872d-5814-4a73-ac5e-cc75fb578a01",
-   "metadata": {},
-   "source": [
-    "TODO: why the double assertions?"
+    "# results with v11.1.0 on `/global/cfs/projectdirs/m3408/nmdc-mongodumps/dump_nmdc-prod_2024-11-25_20-12-02/nmdc`: (33, 0)"
    ]
   }
  ],

From 5369b1ad6b9981cdffec4cf3a2ee024c2b2a90a4 Mon Sep 17 00:00:00 2001
From: shreddd <shreyas@gmail.com>
Date: Wed, 20 Nov 2024 14:44:08 -0800
Subject: [PATCH 4/7] Drop the old tests2 dir

---
 tests2/__init__.py                         |  0
 tests2/domain/__init__.py                  |  0
 tests2/domain/service/__init__.py          |  0
 tests2/domain/service/test_user_service.py | 57 ----------------------
 4 files changed, 57 deletions(-)
 delete mode 100644 tests2/__init__.py
 delete mode 100644 tests2/domain/__init__.py
 delete mode 100644 tests2/domain/service/__init__.py
 delete mode 100644 tests2/domain/service/test_user_service.py

diff --git a/tests2/__init__.py b/tests2/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/tests2/domain/__init__.py b/tests2/domain/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/tests2/domain/service/__init__.py b/tests2/domain/service/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/tests2/domain/service/test_user_service.py b/tests2/domain/service/test_user_service.py
deleted file mode 100644
index 529ac81c..00000000
--- a/tests2/domain/service/test_user_service.py
+++ /dev/null
@@ -1,57 +0,0 @@
-from typing import Any, List
-
-import pytest
-
-from nmdc_runtime.domain.users.queriesInterface import IUserQueries
-from nmdc_runtime.domain.users.userSchema import (
-    UserAuth,
-    UserUpdate,
-    UserOut,
-)
-
-from nmdc_runtime.domain.users.userService import UserService
-
-USER_OUT = UserOut(
-    email="test+email@test.com",
-)
-
-
-class UserQueriesDummy(IUserQueries):
-    async def create(self, user: Any) -> UserOut:
-        return USER_OUT
-
-    async def update(self, old_user: Any, new_user: Any) -> UserOut:
-        return USER_OUT
-
-
-@pytest.fixture
-def user_out() -> UserOut:
-    return USER_OUT
-
-
-@pytest.fixture
-def user_schema() -> UserAuth:
-    return UserAuth(
-        username="bob",
-        password="test",
-    )
-
-
-@pytest.fixture
-def user_update_schema() -> UserUpdate:
-    return UserUpdate(
-        email="test@test.com",
-        full_name="test",
-        password="test",
-    )
-
-
-class TestUserService:
-    @pytest.mark.asyncio
-    async def test_user_create_valid(
-        self, user_out: UserOut, user_schema: UserAuth
-    ) -> None:
-        user_service = UserService(UserQueriesDummy())
-
-        result = await user_service.create_user(user_schema)
-        assert result == UserOut(email="test+email@test.com")

From 39e04448db96fd7e9096395ed2f16800a7ae00f5 Mon Sep 17 00:00:00 2001
From: Shreyas Cholia <scholia@lbl.gov>
Date: Mon, 25 Nov 2024 03:59:55 -0800
Subject: [PATCH 5/7] Update to the newer syntax for getting utcnow() (#788)

---
 demo/metadata_migration/notebooks/bookkeeper.py | 4 ++--
 nmdc_runtime/api/core/auth.py                   | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/demo/metadata_migration/notebooks/bookkeeper.py b/demo/metadata_migration/notebooks/bookkeeper.py
index ad3cb918..97ccfb49 100644
--- a/demo/metadata_migration/notebooks/bookkeeper.py
+++ b/demo/metadata_migration/notebooks/bookkeeper.py
@@ -1,6 +1,6 @@
 from typing import Optional
 from enum import Enum
-from datetime import datetime
+from datetime import datetime, timezone
 
 from pymongo import MongoClient
 from nmdc_schema.migrators.migrator_base import MigratorBase
@@ -47,7 +47,7 @@ def __init__(
     @staticmethod
     def get_current_timestamp() -> str:
         r"""Returns an ISO 8601 timestamp (string) representing the current time in UTC."""
-        utc_now = datetime.utcnow()
+        utc_now = datetime.now(timezone.utc)
         iso_utc_now = utc_now.isoformat()
-        return iso_utc_now  # e.g. "2024-02-21T04:31:03.115107"
+        return iso_utc_now  # e.g. "2024-02-21T04:31:03.115107+00:00"
 
diff --git a/nmdc_runtime/api/core/auth.py b/nmdc_runtime/api/core/auth.py
index 94685d2e..820f3dc0 100644
--- a/nmdc_runtime/api/core/auth.py
+++ b/nmdc_runtime/api/core/auth.py
@@ -1,5 +1,5 @@
 import os
-from datetime import datetime, timedelta
+from datetime import datetime, timedelta, timezone
 from typing import Optional, Dict
 
 from fastapi import Depends
@@ -101,9 +101,9 @@ def get_password_hash(password):
 def create_access_token(data: dict, expires_delta: Optional[timedelta] = None):
     to_encode = data.copy()
     if expires_delta:
-        expire = datetime.utcnow() + expires_delta
+        expire = datetime.now(timezone.utc) + expires_delta
     else:
-        expire = datetime.utcnow() + timedelta(minutes=15)
+        expire = datetime.now(timezone.utc) + timedelta(minutes=15)
     to_encode.update({"exp": expire})
     encoded_jwt = jwt.encode(to_encode, SECRET_KEY, algorithm=ALGORITHM)
     return encoded_jwt

From 2da475c22da6de932d2692886142833759952413 Mon Sep 17 00:00:00 2001
From: eecavanna <134325062+eecavanna@users.noreply.github.com>
Date: Sat, 23 Nov 2024 11:30:35 -0800
Subject: [PATCH 6/7] Add "Translators" subsystem as option in PR template
 (#767)

---
 .github/pull_request_template.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
index f6f96b9f..b0c8634b 100644
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -55,6 +55,7 @@ In this branch, I...
 - [ ] Minter
 - [ ] Dagster
 - [ ] Project documentation (in the `docs` directory)
+- [ ] Translators (metadata ingest pipelines)
 - [ ] MongoDB migrations
 - [ ] Other
 

From d79f6eb06a3e5341620f11ab22dec3bdf700c90b Mon Sep 17 00:00:00 2001
From: eecavanna <134325062+eecavanna@users.noreply.github.com>
Date: Mon, 25 Nov 2024 10:45:17 -0800
Subject: [PATCH 7/7] Remove reference to nonexistent `tests2` directory

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 5da4e771..964124d4 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@
     url="https://github.com/microbiomedata/nmdc-runtime",
     packages=find_packages(
         include=["nmdc_runtime*", "components*"],
-        exclude=["tests", "tests2"],
+        exclude=["tests"],
     ),
     use_scm_version=True,
     setup_requires=["setuptools_scm"],