Adding the reprocess update type. (#154)

* Adding the reprocess update type.
* Bumping the version.
* Add the doc update config block.
* Adding documentation and removing commented out update types.

---------

Co-authored-by: Mark <[email protected]>
climatepolicyradar · Dec 9, 2024 · 8b5f905 · 8b5f905
1 parent 4431ff9
commit 8b5f905
Show file tree

Hide file tree

Showing 2 changed files with 38 additions and 11 deletions.
diff --git a/src/cpr_sdk/pipeline_general_models.py b/src/cpr_sdk/pipeline_general_models.py
@@ -1,6 +1,6 @@
 from datetime import datetime
 from enum import Enum
-from typing import Mapping, Any, List, Optional, Sequence, Union
+from typing import Any, List, Mapping, Optional, Sequence, Union
 
 from pydantic import BaseModel, field_validator
 
@@ -68,22 +68,39 @@ class InputData(BaseModel):
 
 
 class UpdateTypes(str, Enum):
-    """Document types supported by the backend API."""
+    """
+    UpdateTypes that are recognised and have resulting actions in the pipeline.
+
+    A mapping of the update type to the action can be found in the ingest repo:
+    https://github.com/climatepolicyradar/navigator-data-ingest/blob/main/src/
+    navigator_data_ingest/base/updated_document_actions.py#L490
+
+    Attributes:
+        NAME (str): Represents the name of the document; causes embeddings generation to
+            be re-triggered for a document.
+        DESCRIPTION (str): Represents the description of the document; causes embeddings
+            generation to be re-triggered for a document.
+        SLUG (str): Represents the slug (a URL-friendly version of the name) of the
+            document; triggers an update of the field in the related S3 objects so
+            that the new data is reflected in Vespa.
+        SOURCE_URL (str): Represents the source URL of the document and triggers full
+            reprocessing and download from source of the document.
+        METADATA (str): Represents the metadata associated with the document and
+            indicates that the metadata of the S3 objects relating to the document
+            should be updated.
+        REPARSE (str): Indicates that the document should be reparsed, including full
+            reprocessing but not redownload from source.
+        REPROCESS (str): Indicates that the document should be reprocessed, including
+            redownload from source and reparse.
+    """
 
     NAME = "name"
     DESCRIPTION = "description"
-    # IMPORT_ID = "import_id"
     SLUG = "slug"
-    # PUBLICATION_TS = "publication_ts"
     SOURCE_URL = "source_url"
-    # TYPE = "type"
-    # SOURCE = "source"
-    # CATEGORY = "category"
-    # GEOGRAPHY = "geography"
-    # LANGUAGES = "languages"
-    # DOCUMENT_STATUS = "document_status"
     METADATA = "metadata"
     REPARSE = "reparse"
+    REPROCESS = "reprocess"
 
 
 class Update(BaseModel):
@@ -109,3 +126,13 @@ class ExecutionData(BaseModel):
     """Data unique to a step functions execution that is required at later stages."""
 
     input_dir_path: str
+
+
+class DocUpdateConfig(BaseModel):
+    """
+    Config for updates not defined as part of IdentifyUpdates.
+
+    reprocess_updates: list of document ids to reprocess.
+    """
+
+    reprocess_updates: list[str]
diff --git a/src/cpr_sdk/version.py b/src/cpr_sdk/version.py
@@ -1,6 +1,6 @@
 _MAJOR = "1"
 _MINOR = "9"
-_PATCH = "6"
+_PATCH = "7"
 _SUFFIX = ""
 
 VERSION_SHORT = "{0}.{1}".format(_MAJOR, _MINOR)