diff --git a/SECURITY.md b/SECURITY.md deleted file mode 100644 index c0e649fd..00000000 --- a/SECURITY.md +++ /dev/null @@ -1,12 +0,0 @@ -# Security Policy - -SSB takes the security of our software products and services seriously, which -includes all source code repositories managed through our GitHub organization. - -We believe that responsible disclosure of security vulnerabilities helps us ensure -the security and privacy of all our users. - -## Reporting a Vulnerability - -If you believe you have found a security vulnerability in any of SSB's GitHub -repositories, please report it to us using the [Github Private vulnerability reporting tool](https://github.com/statisticsnorway/datadoc/security/advisories). diff --git a/datadoc/backend/datadoc_metadata.py b/datadoc/backend/datadoc_metadata.py index 5e2a0cee..6d4b02a2 100644 --- a/datadoc/backend/datadoc_metadata.py +++ b/datadoc/backend/datadoc_metadata.py @@ -23,26 +23,6 @@ logger = logging.getLogger(__name__) -OBLIGATORY_DATASET_METADATA = [ - m.identifier - for m in display_dataset.DISPLAY_DATASET.values() - if m.obligatory and m.editable -] - -OBLIGATORY_VARIABLES_METADATA = [ - m.identifier - for m in display_variables.DISPLAY_VARIABLES.values() - if m.obligatory and m.editable -] - -# These don't vary at runtime so we calculate them as constants here -NUM_OBLIGATORY_DATASET_FIELDS = len( - [k for k in model.Dataset().model_dump() if k in OBLIGATORY_DATASET_METADATA], -) -NUM_OBLIGATORY_VARIABLES_FIELDS = len( - [k for k in model.Variable().model_dump() if k in OBLIGATORY_VARIABLES_METADATA], -) - METADATA_DOCUMENT_FILE_SUFFIX = "__DOC.json" PLACEHOLDER_USERNAME = "default_user@ssb.no" @@ -53,14 +33,36 @@ class DataDocMetadata: def __init__( self: t.Self @ DataDocMetadata, - dataset: str | None, + dataset_path: str | os.PathLike | None = None, + metadata_document_path: str | os.PathLike | None = None, ) -> None: """Read in a dataset if supplied, otherwise naively instantiate the class.""" - self.dataset: str = dataset - if self.dataset: + self.dataset: str = dataset_path + self.metadata_document: StorageAdapter | None = None + self.container: model.MetadataContainer | None = None + + self.dataset_state: DatasetState | None = None + self.short_name: str | None = None + self.current_user: str | None = None + self.meta: model.DatadocJsonSchema = model.DatadocJsonSchema( + percentage_complete=0, + dataset=model.Dataset(), + variables=[], + ) + + self.variables_lookup: dict[str, model.Variable] = {} + + if metadata_document_path: + # In this case the user has specified an independent metadata document for editing + # without a dataset. + self.metadata_document = StorageAdapter.for_path(metadata_document_path) + self.extract_metadata_from_existing_document() + + elif self.dataset: + # The short_name is set as the dataset filename without file extension self.short_name: str = pathlib.Path( self.dataset, - ).stem # filename without file ending + ).stem self.metadata_document: StorageAdapter = StorageAdapter.for_path( StorageAdapter.for_path(self.dataset).parent(), ) @@ -68,6 +70,9 @@ def __init__( self.short_name + METADATA_DOCUMENT_FILE_SUFFIX, ) self.dataset_state: DatasetState = self.get_dataset_state(self.dataset) + + self.extract_metadata_from_files() + try: self.current_user = os.environ["JUPYTERHUB_USER"] except KeyError: @@ -77,17 +82,6 @@ def __init__( self.current_user, ) - self.meta: model.DatadocJsonSchema = model.DatadocJsonSchema( - percentage_complete=0, - dataset=model.Dataset(), - variables=[], - ) - - self.variables_lookup: dict[str, model.Variable] = {} - - if self.dataset: - self.extract_metadata_from_files() - def get_dataset_state( self: t.Self @ DataDocMetadata, dataset: str, @@ -134,55 +128,77 @@ def get_dataset_version( return None def extract_metadata_from_files(self: t.Self @ DataDocMetadata) -> None: - """Read metadata from a dataset. + """Read metadata from an existing metadata document. - If a metadata document already exists, read in the metadata from that instead. + If no metadata document exists, create one from scratch by extracting metadata + from the dataset file. """ - fresh_metadata = {} if self.metadata_document.exists(): - try: - with self.metadata_document.open(mode="r", encoding="utf-8") as file: - fresh_metadata = json.load(file) - logger.info( - "Opened existing metadata file %s", - self.metadata_document.location, - ) - - fresh_metadata = upgrade_metadata( - fresh_metadata, - model.DatadocJsonSchema().document_version, - ) - - variables_list = fresh_metadata.pop("variables", None) - - self.meta.variables = [model.Variable(**v) for v in variables_list] - self.meta.dataset = model.Dataset( - **fresh_metadata.pop("dataset", None), - ) - except json.JSONDecodeError: - logger.warning( - "Could not open existing metadata file %s. \ - Falling back to collecting data from the dataset", - self.metadata_document.location, - exc_info=True, - ) - self.extract_metadata_from_dataset() + self.extract_metadata_from_existing_document() else: self.extract_metadata_from_dataset() - if self.meta.dataset.id is None: self.meta.dataset.id = uuid.uuid4() - # Set default values for variables where appropriate - v: model.Variable - for v in self.meta.variables: - if v.variable_role is None: - v.variable_role = VariableRole.MEASURE - if v.direct_person_identifying is None: - v.direct_person_identifying = False + # Set default values for variables where appropriate + v: model.Variable + for v in self.meta.variables: + if v.variable_role is None: + v.variable_role = VariableRole.MEASURE + if v.direct_person_identifying is None: + v.direct_person_identifying = False + + if not self.meta.dataset.id: + self.meta.dataset.id = uuid.uuid4() self.variables_lookup = {v.short_name: v for v in self.meta.variables} + def extract_metadata_from_existing_document(self: t.Self @ DataDocMetadata) -> None: + """There's an existing metadata document, so read in the metadata from that.""" + fresh_metadata = {} + try: + with self.metadata_document.open(mode="r", encoding="utf-8") as file: + fresh_metadata = json.load(file) + logger.info( + "Opened existing metadata file %s", + self.metadata_document.location, + ) + + if self.is_metadata_in_container_structure(fresh_metadata): + self.container = model.MetadataContainer.model_validate_json( + json.dumps(fresh_metadata), + ) + datadoc_metadata = fresh_metadata["datadoc"] + else: + datadoc_metadata = fresh_metadata + + datadoc_metadata = upgrade_metadata( + datadoc_metadata, + ) + + self.meta = model.DatadocJsonSchema.model_validate_json( + json.dumps(datadoc_metadata), + ) + + except json.JSONDecodeError: + logger.warning( + "Could not open existing metadata file %s. \ + Falling back to collecting data from the dataset", + self.metadata_document.location, + exc_info=True, + ) + + def is_metadata_in_container_structure( + self: t.Self @ DataDocMetadata, + metadata: dict, + ) -> bool: + """At a certain point a metadata 'container' was introduced. + + The container provides a structure for different 'types' of metadata, such as 'datadoc', 'pseudonymization' etc. + This method returns True if the metadata is in the container structure, False otherwise. + """ + return "datadoc" in metadata and "dataset" in metadata["datadoc"] + def extract_metadata_from_dataset(self: t.Self @ DataDocMetadata) -> None: """Obtain what metadata we can from the dataset itself. @@ -210,7 +226,13 @@ def write_metadata_document(self: t.Self @ DataDocMetadata) -> None: self.meta.dataset.metadata_created_by = self.current_user self.meta.dataset.metadata_last_updated_date = timestamp self.meta.dataset.metadata_last_updated_by = self.current_user - self.metadata_document.write_text(self.meta.model_dump_json(indent=4)) + + if self.container: + self.container.datadoc = self.meta + else: + self.container = model.MetadataContainer(datadoc=self.meta) + + self.metadata_document.write_text(self.container.model_dump_json(indent=4)) logger.info("Saved metadata document %s", self.metadata_document.location) @property @@ -221,22 +243,23 @@ def percent_complete(self: t.Self @ DataDocMetadata) -> int: assigned. Used for a live progress bar in the UI, as well as being saved in the datadoc as a simple quality indicator. """ - num_all_fields = NUM_OBLIGATORY_DATASET_FIELDS + num_all_fields = len(display_dataset.OBLIGATORY_DATASET_METADATA) num_set_fields = len( [ k for k, v in self.meta.dataset.model_dump().items() - if k in OBLIGATORY_DATASET_METADATA and v is not None + if k in display_dataset.OBLIGATORY_DATASET_METADATA and v is not None ], ) for variable in self.meta.variables: - num_all_fields += NUM_OBLIGATORY_VARIABLES_FIELDS + num_all_fields += len(display_variables.OBLIGATORY_VARIABLES_METADATA) num_set_fields += len( [ k for k, v in variable.model_dump().items() - if k in OBLIGATORY_VARIABLES_METADATA and v is not None + if k in display_variables.OBLIGATORY_VARIABLES_METADATA + and v is not None ], ) diff --git a/datadoc/backend/model_backwards_compatibility.py b/datadoc/backend/model_backwards_compatibility.py index 216725a4..50ab7602 100644 --- a/datadoc/backend/model_backwards_compatibility.py +++ b/datadoc/backend/model_backwards_compatibility.py @@ -12,7 +12,11 @@ from __future__ import annotations import typing as t +from collections import OrderedDict from dataclasses import dataclass +from datetime import datetime, timezone + +from datadoc_model.model import LanguageStringType if t.TYPE_CHECKING: from collections.abc import Callable @@ -37,7 +41,7 @@ def __str__(self: t.Self @ UnknownModelVersionError) -> str: return f"Document Version ({self.supplied_version}) of discovered file is not supported" -SUPPORTED_VERSIONS: dict[str, BackwardsCompatibleVersion] = {} +SUPPORTED_VERSIONS: OrderedDict[str, BackwardsCompatibleVersion] = OrderedDict() @dataclass() @@ -57,6 +61,30 @@ def handle_current_version(supplied_metadata: dict) -> dict: return supplied_metadata +def handle_version_1_0_0(supplied_metadata: dict) -> dict: + """Handle breaking changes for v1.0.0.""" + datetime_fields = [ + ("metadata_created_date"), + ("metadata_last_updated_date"), + ] + for field in datetime_fields: + if supplied_metadata["dataset"][field]: + supplied_metadata["dataset"][field] = datetime.isoformat( + datetime.fromisoformat(supplied_metadata["dataset"][field]).astimezone( + tz=timezone.utc, + ), + timespec="seconds", + ) + + if isinstance(supplied_metadata["dataset"]["data_source"], str): + supplied_metadata["dataset"]["data_source"] = LanguageStringType( + en=supplied_metadata["dataset"]["data_source"], + ) + supplied_metadata["document_version"] = "2.0.0" + + return supplied_metadata + + def handle_version_0_1_1(supplied_metadata: dict) -> dict: """Handle breaking changes for v0.1.1. @@ -80,26 +108,33 @@ def handle_version_0_1_1(supplied_metadata: dict) -> dict: return supplied_metadata -# Register all the supported versions and their handlers +# Register all the supported versions and their handlers. +# MUST be ordered from oldest to newest. BackwardsCompatibleVersion(version="0.1.1", handler=handle_version_0_1_1) BackwardsCompatibleVersion( - version="1", # Some documents exist with incorrect version specification - handler=handle_version_0_1_1, + version="1.0.0", + handler=handle_version_1_0_0, +) +BackwardsCompatibleVersion( + version="2.0.0", + handler=handle_current_version, ) -def upgrade_metadata(fresh_metadata: dict, current_model_version: str) -> dict: +def upgrade_metadata(fresh_metadata: dict) -> dict: """Run the handler for this version to upgrade the document to the latest version.""" # Special case for current version, we expose the current_model_version parameter for test purposes - SUPPORTED_VERSIONS[current_model_version] = BackwardsCompatibleVersion( - current_model_version, - handle_current_version, - ) supplied_version = fresh_metadata[VERSION_FIELD_NAME] - try: - # Retrieve the upgrade function for this version - upgrade = SUPPORTED_VERSIONS[supplied_version].handler - except KeyError as e: - raise UnknownModelVersionError(supplied_version) from e - else: - return upgrade(fresh_metadata) + start_running_handlers = False + + # Run all the handlers in order from the supplied version onwards + for k, v in SUPPORTED_VERSIONS.items(): + if k == supplied_version: + start_running_handlers = True + if start_running_handlers: + fresh_metadata = v.handler(fresh_metadata) + + if not start_running_handlers: + raise UnknownModelVersionError(supplied_version) + + return fresh_metadata diff --git a/datadoc/frontend/callbacks/dataset.py b/datadoc/frontend/callbacks/dataset.py index d51266c0..e9c1f24b 100644 --- a/datadoc/frontend/callbacks/dataset.py +++ b/datadoc/frontend/callbacks/dataset.py @@ -10,7 +10,10 @@ from pydantic import ValidationError from datadoc import state -from datadoc.backend.datadoc_metadata import DataDocMetadata +from datadoc.backend.datadoc_metadata import ( + METADATA_DOCUMENT_FILE_SUFFIX, + DataDocMetadata, +) from datadoc.frontend.callbacks.utils import ( MetadataInputTypes, find_existing_language_string, @@ -49,25 +52,29 @@ def get_dataset_path() -> str | Path | None: return path_from_env -def open_dataset(dataset_path: str | Path | None = None) -> None: +def open_file(file_path: str | Path | None = None) -> None: """Load the given dataset into an DataDocMetadata instance.""" - dataset = dataset_path or get_dataset_path() - state.metadata = DataDocMetadata(dataset) - logger.info("Opened dataset %s", dataset) + if file_path and file_path.endswith(METADATA_DOCUMENT_FILE_SUFFIX): + state.metadata = DataDocMetadata(metadata_document_path=file_path) + logger.info("Opened existing metadata document %s", file_path) + else: + dataset = file_path or get_dataset_path() + state.metadata = DataDocMetadata(dataset_path=dataset) + logger.info("Opened dataset %s", dataset) def open_dataset_handling( n_clicks: int, - dataset_path: str, + file_path: str, ) -> tuple[bool, bool, str, SupportedLanguages]: """Handle errors and other logic around opening a dataset file.""" try: - open_dataset(dataset_path) + open_file(file_path) except FileNotFoundError: return ( False, True, - f"Datasettet '{dataset_path}' finnes ikke.", + f"Filen '{file_path}' finnes ikke.", state.current_metadata_language.value, ) except Exception as e: # noqa: BLE001 diff --git a/datadoc/frontend/components/control_bars.py b/datadoc/frontend/components/control_bars.py index 42ba65b7..b0b885f8 100644 --- a/datadoc/frontend/components/control_bars.py +++ b/datadoc/frontend/components/control_bars.py @@ -84,7 +84,7 @@ def build_controls_bar() -> dbc.CardBody: ), dbc.Col( build_ssb_button( - text="Åpne datasett", + text="Åpne fil", icon_class="bi bi-folder2-open", button_id="open-button", ), diff --git a/datadoc/frontend/fields/display_base.py b/datadoc/frontend/fields/display_base.py index d3e1067c..8fd1bf2a 100644 --- a/datadoc/frontend/fields/display_base.py +++ b/datadoc/frontend/fields/display_base.py @@ -81,6 +81,7 @@ class DisplayMetadata: description: str obligatory: bool = False editable: bool = True + url: bool = False multiple_language_support: bool = False diff --git a/datadoc/frontend/fields/display_dataset.py b/datadoc/frontend/fields/display_dataset.py index aa2bf7f6..3f947ed6 100644 --- a/datadoc/frontend/fields/display_dataset.py +++ b/datadoc/frontend/fields/display_dataset.py @@ -261,3 +261,7 @@ class DatasetIdentifiers(str, Enum): DISPLAYED_DROPDOWN_DATASET_ENUMS = [ typing.get_args(types[m.identifier])[0] for m in DISPLAYED_DROPDOWN_DATASET_METADATA ] + +OBLIGATORY_DATASET_METADATA = [ + m.identifier for m in DISPLAY_DATASET.values() if m.obligatory and m.editable +] diff --git a/datadoc/frontend/fields/display_variables.py b/datadoc/frontend/fields/display_variables.py index 62ffc7d4..5abd51b2 100644 --- a/datadoc/frontend/fields/display_variables.py +++ b/datadoc/frontend/fields/display_variables.py @@ -66,6 +66,7 @@ class VariableIdentifiers(str, Enum): identifier=VariableIdentifiers.DEFINITION_URI.value, display_name="Definition URI", description="En lenke (URI) til variabelens definisjon i SSB (Vardok/VarDef)", + url=True, obligatory=True, ), VariableIdentifiers.DIRECT_PERSON_IDENTIFYING: DisplayVariablesMetadata( @@ -114,11 +115,13 @@ class VariableIdentifiers(str, Enum): identifier=VariableIdentifiers.CLASSIFICATION_URI.value, display_name="Kodeverkets URI", description="Lenke (URI) til gyldige kodeverk (klassifikasjon eller kodeliste) i KLASS", + url=True, ), VariableIdentifiers.SENTINEL_VALUE_URI: DisplayVariablesMetadata( identifier=VariableIdentifiers.SENTINEL_VALUE_URI.value, display_name="Spesialverdienes URI", description="En lenke (URI) til en oversikt over 'spesialverdier' som inngår i variabelen.", + url=True, ), VariableIdentifiers.INVALID_VALUE_DESCRIPTION: DisplayVariablesMetadata( identifier=VariableIdentifiers.INVALID_VALUE_DESCRIPTION.value, @@ -149,6 +152,8 @@ class VariableIdentifiers(str, Enum): m.identifier for m in DISPLAY_VARIABLES.values() if m.multiple_language_support ] +URL_VARIABLES_METADATA = [m.identifier for m in DISPLAY_VARIABLES.values() if m.url] + DISPLAYED_DROPDOWN_VARIABLES_METADATA = [ m.identifier for m in DISPLAY_VARIABLES.values() if m.presentation == "dropdown" ] @@ -162,3 +167,8 @@ class VariableIdentifiers(str, Enum): field_type = typing.get_args(types[m.identifier])[0] # if issubclass(field_type, LanguageStringsEnum) or field_type is bool: DISPLAYED_DROPDOWN_VARIABLES_TYPES.append(field_type) + + +OBLIGATORY_VARIABLES_METADATA = [ + m.identifier for m in DISPLAY_VARIABLES.values() if m.obligatory and m.editable +] diff --git a/datadoc/tests/conftest.py b/datadoc/tests/conftest.py index 0c29e3fb..f77d0135 100644 --- a/datadoc/tests/conftest.py +++ b/datadoc/tests/conftest.py @@ -20,7 +20,6 @@ TEST_EXISTING_METADATA_FILE_NAME, TEST_EXISTING_METADATA_WITH_VALID_ID_DIRECTORY, TEST_PARQUET_FILEPATH, - TEST_RESOURCES_DIRECTORY, TEST_RESOURCES_METADATA_DOCUMENT, ) @@ -45,10 +44,10 @@ def metadata(_mock_timestamp: None) -> DataDocMetadata: @pytest.fixture() def remove_document_file() -> None: - yield None # Dummy value, No need to return anything in particular here - full_path = TEST_RESOURCES_DIRECTORY / TEST_EXISTING_METADATA_FILE_NAME + # Yield so we only run teardown + yield None try: - full_path.unlink() + TEST_RESOURCES_METADATA_DOCUMENT.unlink() except FileNotFoundError as e: print("File not deleted on teardown, exception caught:") # noqa: T201 traceback.print_exception(type(e), e) diff --git a/klargjorte_data/person_data_v1__DOC_example_1.json b/datadoc/tests/resources/existing_metadata_file/compatibility/v1_0_0/person_data_v1__DOC.json similarity index 72% rename from klargjorte_data/person_data_v1__DOC_example_1.json rename to datadoc/tests/resources/existing_metadata_file/compatibility/v1_0_0/person_data_v1__DOC.json index 51c9cf1e..6f3d695f 100644 --- a/klargjorte_data/person_data_v1__DOC_example_1.json +++ b/datadoc/tests/resources/existing_metadata_file/compatibility/v1_0_0/person_data_v1__DOC.json @@ -1,51 +1,71 @@ { - "percentage_complete": 67, - "document_version": "1", + "percentage_complete": 100, + "document_version": "1.0.0", "dataset": { "short_name": "person_data_v1", - "assessment": null, + "assessment": "SENSITIVE", "dataset_status": "DRAFT", "dataset_state": "PROCESSED_DATA", "name": { - "en": "Citizens' tax data", + "en": "successfully_read_existing_file", "nn": "", "nb": "Persondata" }, - "data_source": null, - "population_description": null, + "data_source": { + "en": "", + "nn": "", + "nb": "Skatteetaten" + }, + "population_description": { + "en": "", + "nn": "", + "nb": "Norsk befolkning" + }, "version": "1", + "version_description": null, "unit_type": "PERSON", - "temporality_type": null, + "temporality_type": "STATUS", "description": { - "en": "Test data about Norwegian citizens. To be used for demonstration and testing only.", + "en": "", "nn": "", - "nb": "Testdata personer for demo og test." + "nb": "" }, - "subject_field": null, + "subject_field": { + "en": "", + "nn": "", + "nb": "Skatt" + }, + "keyword": [ + "Skatt", + "Person", + "Helsepenger" + ], "spatial_coverage_description": { - "en": "Norway", + "en": "", "nn": "", "nb": "Norge" }, - "id": null, + "id": "143fca77-ef56-419c-a1e1-d69c4199f020", "owner": null, - "data_source_path": "klargjorte_data/person_data_v1.parquet", - "created_date": "2022-08-08 13:44:14.768463", - "created_by": "default_user@ssb.no", - "last_updated_date": null, - "last_updated_by": null + "data_source_path": "/Users/mmwinther/code/datadoc/datadoc/../klargjorte_data/person_data_v1.parquet", + "metadata_created_date": "2022-09-05T13:07:14.066023", + "metadata_created_by": "default_user@ssb.no", + "metadata_last_updated_date": "2022-10-06T15:51:36.106543", + "metadata_last_updated_by": "default_user@ssb.no", + "contains_data_from": "2010-09-05", + "contains_data_until": "2022-09-05" }, "variables": [ { "short_name": "pers_id", "name": { - "en": "Personal number", + "en": "", "nn": "", - "nb": "F\u00f8dselsnummer" + "nb": "g" }, "data_type": "STRING", "variable_role": "IDENTIFIER", - "definition_uri": null, + "definition_uri": "https://www.ssb.no/a/metadata/conceptvariable/vardok/26/nb", "direct_person_identifying": true, "data_source": null, "population_description": null, @@ -63,9 +83,9 @@ { "short_name": "tidspunkt", "name": { - "en": "Timestamp", + "en": "", "nn": "", - "nb": "Tidspunkt" + "nb": "g" }, "data_type": "DATETIME", "variable_role": "START_TIME", @@ -87,13 +107,13 @@ { "short_name": "sivilstand", "name": { - "en": "Marital status", + "en": "", "nn": "", - "nb": "Sivilstand" + "nb": "g" }, "data_type": "STRING", "variable_role": "MEASURE", - "definition_uri": null, + "definition_uri": "https://www.ssb.no/a/metadata/conceptvariable/vardok/91/nb", "direct_person_identifying": false, "data_source": null, "population_description": null, @@ -111,13 +131,13 @@ { "short_name": "alm_inntekt", "name": { - "en": "Gross income", + "en": "", "nn": "", - "nb": "Almennlig inntekt" + "nb": "g" }, "data_type": "INTEGER", "variable_role": "MEASURE", - "definition_uri": null, + "definition_uri": "https://www.ssb.no/a/metadata/conceptvariable/vardok/1438/nb", "direct_person_identifying": false, "data_source": null, "population_description": null, @@ -135,13 +155,13 @@ { "short_name": "sykepenger", "name": { - "en": "Sick pay", + "en": "", "nn": "", - "nb": "Sykepenger" + "nb": "g" }, "data_type": "INTEGER", "variable_role": "MEASURE", - "definition_uri": null, + "definition_uri": "https://www.ssb.no/metadata/conceptvariable/vardok/3366/nb", "direct_person_identifying": false, "data_source": null, "population_description": null, @@ -159,13 +179,13 @@ { "short_name": "ber_bruttoformue", "name": { - "en": "Gross worth", + "en": "", "nn": "", - "nb": "Beregnet bruttoformue" + "nb": "g" }, "data_type": "INTEGER", "variable_role": "MEASURE", - "definition_uri": null, + "definition_uri": "https://www.ssb.no/metadata/conceptvariable/vardok/3327/nb", "direct_person_identifying": false, "data_source": null, "population_description": null, @@ -183,13 +203,13 @@ { "short_name": "fullf_utdanning", "name": { - "en": "Completed education", + "en": "", "nn": "", - "nb": "Fullf\u00f8rt utdanning" + "nb": "g" }, "data_type": "STRING", "variable_role": "MEASURE", - "definition_uri": null, + "definition_uri": "https://www.ssb.no/metadata/conceptvariable/vardok/3242/nb", "direct_person_identifying": false, "data_source": null, "population_description": null, @@ -207,13 +227,13 @@ { "short_name": "hoveddiagnose", "name": { - "en": "Primary diagnosis", + "en": "", "nn": "", - "nb": "Hoveddiagnose" + "nb": "g" }, "data_type": "STRING", "variable_role": "MEASURE", - "definition_uri": null, + "definition_uri": "https://www.ssb.no/metadata/conceptvariable/vardok/2578/nb", "direct_person_identifying": false, "data_source": null, "population_description": null, diff --git a/datadoc/tests/resources/existing_metadata_file/invalid_id_field/person_data_v1__DOC.json b/datadoc/tests/resources/existing_metadata_file/invalid_id_field/person_data_v1__DOC.json index 736971d5..1d3a86fb 100644 --- a/datadoc/tests/resources/existing_metadata_file/invalid_id_field/person_data_v1__DOC.json +++ b/datadoc/tests/resources/existing_metadata_file/invalid_id_field/person_data_v1__DOC.json @@ -1,192 +1,201 @@ { - "percentage_complete": 39, - "document_version": "1", - "dataset": { - "short_name": "person_data_v1", - "assessment": "OPEN", - "dataset_status": "INTERNAL", - "dataset_state": "PROCESSED_DATA", - "name": { - "en": "successfully_read_existing_file", - "nn": "", - "nb": "Bokm\u00e5l navn" - }, - "data_source": "", - "population_description": null, - "version": "1", - "unit_type": "PERSON", - "temporality_type": null, - "description": null, - "subject_field": null, - "spatial_coverage_description": null, - "id": null, - "owner": null, - "data_source_path": "klargjorte_data/person_data_v1.parquet", - "created_date": "2022-07-13T17:28:01.617657", - "created_by": "default_user@ssb.no", - "last_updated_date": null, - "last_updated_by": null - }, - "variables": [ - { - "short_name": "pers_id", - "name": null, - "datatype": "STRING", - "variable_role": null, - "definition_uri": null, - "direct_person_identifying": null, - "data_source": null, - "population_description": null, - "comment": null, - "temporality_type": null, - "measurement_unit": null, - "format": null, - "classification_uri": null, - "sentinel_value_uri": null, - "invalid_value_description": null, - "id": null, - "contains_data_from": null, - "contains_data_until": null - }, - { - "short_name": "tidspunkt", - "name": null, - "datatype": "DATETIME", - "variable_role": null, - "definition_uri": null, - "direct_person_identifying": null, - "data_source": null, - "population_description": null, - "comment": null, - "temporality_type": null, - "measurement_unit": null, - "format": null, - "classification_uri": null, - "sentinel_value_uri": null, - "invalid_value_description": null, - "id": null, - "contains_data_from": null, - "contains_data_until": null - }, - { - "short_name": "sivilstand", - "name": null, - "datatype": "STRING", - "variable_role": null, - "definition_uri": null, - "direct_person_identifying": null, + "document_version": "0.0.1", + "datadoc": { + "percentage_complete": 6, + "document_version": "2.0.0", + "dataset": { + "short_name": "person_data_v1", + "assessment": "OPEN", + "dataset_status": "INTERNAL", + "dataset_state": "PROCESSED_DATA", + "name": { + "en": "successfully_read_existing_file", + "nn": "", + "nb": "Bokmål navn" + }, + "description": null, "data_source": null, + "register_uri": null, "population_description": null, - "comment": null, + "version": "1", + "version_description": null, + "unit_type": "PERSON", "temporality_type": null, - "measurement_unit": null, - "format": null, - "classification_uri": null, - "sentinel_value_uri": null, - "invalid_value_description": null, + "subject_field": null, + "keyword": null, + "spatial_coverage_description": null, "id": null, + "owner": null, + "file_path": null, + "metadata_created_date": "2022-07-13T15:28:01Z", + "metadata_created_by": "default_user@ssb.no", + "metadata_last_updated_date": "2024-01-08T15:20:29.788264Z", + "metadata_last_updated_by": "default_user@ssb.no", "contains_data_from": null, "contains_data_until": null }, - { - "short_name": "alm_inntekt", - "name": null, - "datatype": "INTEGER", - "variable_role": null, - "definition_uri": null, - "direct_person_identifying": null, - "data_source": null, - "population_description": null, - "comment": null, - "temporality_type": null, - "measurement_unit": null, - "format": null, - "classification_uri": null, - "sentinel_value_uri": null, - "invalid_value_description": null, - "id": null, - "contains_data_from": null, - "contains_data_until": null - }, - { - "short_name": "sykepenger", - "name": null, - "datatype": "INTEGER", - "variable_role": null, - "definition_uri": null, - "direct_person_identifying": null, - "data_source": null, - "population_description": null, - "comment": null, - "temporality_type": null, - "measurement_unit": null, - "format": null, - "classification_uri": null, - "sentinel_value_uri": null, - "invalid_value_description": null, - "id": null, - "contains_data_from": null, - "contains_data_until": null - }, - { - "short_name": "ber_bruttoformue", - "name": null, - "datatype": "INTEGER", - "variable_role": null, - "definition_uri": null, - "direct_person_identifying": null, - "data_source": null, - "population_description": null, - "comment": null, - "temporality_type": null, - "measurement_unit": null, - "format": null, - "classification_uri": null, - "sentinel_value_uri": null, - "invalid_value_description": null, - "id": null, - "contains_data_from": null, - "contains_data_until": null - }, - { - "short_name": "fullf_utdanning", - "name": null, - "datatype": "STRING", - "variable_role": null, - "definition_uri": null, - "direct_person_identifying": null, - "data_source": null, - "population_description": null, - "comment": null, - "temporality_type": null, - "measurement_unit": null, - "format": null, - "classification_uri": null, - "sentinel_value_uri": null, - "invalid_value_description": null, - "id": null, - "contains_data_from": null, - "contains_data_until": null - }, - { - "short_name": "hoveddiagnose", - "name": null, - "datatype": "STRING", - "variable_role": null, - "definition_uri": null, - "direct_person_identifying": null, - "data_source": null, - "population_description": null, - "comment": null, - "temporality_type": null, - "measurement_unit": null, - "format": null, - "classification_uri": null, - "sentinel_value_uri": null, - "invalid_value_description": null, - "id": null, - "contains_data_from": null, - "contains_data_until": null - } - ] + "variables": [ + { + "short_name": "pers_id", + "name": null, + "data_type": null, + "variable_role": null, + "definition_uri": null, + "direct_person_identifying": null, + "data_source": null, + "population_description": null, + "comment": null, + "temporality_type": null, + "measurement_unit": null, + "format": null, + "classification_uri": null, + "sentinel_value_uri": null, + "invalid_value_description": null, + "id": null, + "contains_data_from": null, + "contains_data_until": null + }, + { + "short_name": "tidspunkt", + "name": null, + "data_type": null, + "variable_role": null, + "definition_uri": null, + "direct_person_identifying": null, + "data_source": null, + "population_description": null, + "comment": null, + "temporality_type": null, + "measurement_unit": null, + "format": null, + "classification_uri": null, + "sentinel_value_uri": null, + "invalid_value_description": null, + "id": null, + "contains_data_from": null, + "contains_data_until": null + }, + { + "short_name": "sivilstand", + "name": null, + "data_type": null, + "variable_role": null, + "definition_uri": null, + "direct_person_identifying": null, + "data_source": null, + "population_description": null, + "comment": null, + "temporality_type": null, + "measurement_unit": null, + "format": null, + "classification_uri": null, + "sentinel_value_uri": null, + "invalid_value_description": null, + "id": null, + "contains_data_from": null, + "contains_data_until": null + }, + { + "short_name": "alm_inntekt", + "name": null, + "data_type": null, + "variable_role": null, + "definition_uri": null, + "direct_person_identifying": null, + "data_source": null, + "population_description": null, + "comment": null, + "temporality_type": null, + "measurement_unit": null, + "format": null, + "classification_uri": null, + "sentinel_value_uri": null, + "invalid_value_description": null, + "id": null, + "contains_data_from": null, + "contains_data_until": null + }, + { + "short_name": "sykepenger", + "name": null, + "data_type": null, + "variable_role": null, + "definition_uri": null, + "direct_person_identifying": null, + "data_source": null, + "population_description": null, + "comment": null, + "temporality_type": null, + "measurement_unit": null, + "format": null, + "classification_uri": null, + "sentinel_value_uri": null, + "invalid_value_description": null, + "id": null, + "contains_data_from": null, + "contains_data_until": null + }, + { + "short_name": "ber_bruttoformue", + "name": null, + "data_type": null, + "variable_role": null, + "definition_uri": null, + "direct_person_identifying": null, + "data_source": null, + "population_description": null, + "comment": null, + "temporality_type": null, + "measurement_unit": null, + "format": null, + "classification_uri": null, + "sentinel_value_uri": null, + "invalid_value_description": null, + "id": null, + "contains_data_from": null, + "contains_data_until": null + }, + { + "short_name": "fullf_utdanning", + "name": null, + "data_type": null, + "variable_role": null, + "definition_uri": null, + "direct_person_identifying": null, + "data_source": null, + "population_description": null, + "comment": null, + "temporality_type": null, + "measurement_unit": null, + "format": null, + "classification_uri": null, + "sentinel_value_uri": null, + "invalid_value_description": null, + "id": null, + "contains_data_from": null, + "contains_data_until": null + }, + { + "short_name": "hoveddiagnose", + "name": null, + "data_type": null, + "variable_role": null, + "definition_uri": null, + "direct_person_identifying": null, + "data_source": null, + "population_description": null, + "comment": null, + "temporality_type": null, + "measurement_unit": null, + "format": null, + "classification_uri": null, + "sentinel_value_uri": null, + "invalid_value_description": null, + "id": null, + "contains_data_from": null, + "contains_data_until": null + } + ] + }, + "pseudonymization": null } diff --git a/datadoc/tests/resources/existing_metadata_file/person_data_v1__DOC.json b/datadoc/tests/resources/existing_metadata_file/person_data_v1__DOC.json index 02439abd..663e0d1f 100644 --- a/datadoc/tests/resources/existing_metadata_file/person_data_v1__DOC.json +++ b/datadoc/tests/resources/existing_metadata_file/person_data_v1__DOC.json @@ -1,252 +1,257 @@ { - "percentage_complete": 100, - "document_version": "1.0.0", - "dataset": { - "short_name": "person_data_v1", - "assessment": "SENSITIVE", - "dataset_status": "DRAFT", - "dataset_state": "PROCESSED_DATA", - "name": { - "en": "successfully_read_existing_file", - "nn": "", - "nb": "Persondata" - }, - "data_source": { - "en": "", - "nn": "", - "nb": "Skatteetaten" - }, - "population_description": { - "en": "", - "nn": "", - "nb": "Norsk befolkning" - }, - "version": "1", - "version_description": null, - "unit_type": "PERSON", - "temporality_type": "STATUS", - "description": { - "en": "", - "nn": "", - "nb": "" - }, - "subject_field": { - "en": "", - "nn": "", - "nb": "Skatt" - }, - "keyword": [ - "Skatt", - "Person", - "Helsepenger" - ], - "spatial_coverage_description": { - "en": "", - "nn": "", - "nb": "Norge" - }, - "id": "143fca77-ef56-419c-a1e1-d69c4199f020", - "owner": null, - "data_source_path": "/Users/mmwinther/code/datadoc/datadoc/../klargjorte_data/person_data_v1.parquet", - "metadata_created_date": "2022-09-05T13:07:14.066023", - "metadata_created_by": "default_user@ssb.no", - "metadata_last_updated_date": "2022-10-06T15:51:36.106543", - "metadata_last_updated_by": "default_user@ssb.no", - "contains_data_from": "2010-09-05", - "contains_data_until": "2022-09-05" - }, - "variables": [ - { - "short_name": "pers_id", + "document_version": "0.0.1", + "datadoc": { + "percentage_complete": 98, + "document_version": "2.0.0", + "dataset": { + "short_name": "person_data_v1", + "assessment": "SENSITIVE", + "dataset_status": "DRAFT", + "dataset_state": "PROCESSED_DATA", "name": { - "en": "", + "en": "successfully_read_existing_file", "nn": "", - "nb": "g" + "nb": "Persondata" }, - "data_type": "STRING", - "variable_role": "IDENTIFIER", - "definition_uri": "ssb.no", - "direct_person_identifying": true, - "data_source": null, - "population_description": null, - "comment": null, - "temporality_type": null, - "measurement_unit": null, - "format": null, - "classification_uri": null, - "sentinel_value_uri": null, - "invalid_value_description": null, - "id": null, - "contains_data_from": null, - "contains_data_until": null - }, - { - "short_name": "tidspunkt", - "name": { + "description": { "en": "", "nn": "", - "nb": "g" + "nb": "" }, - "data_type": "DATETIME", - "variable_role": "START_TIME", - "definition_uri": "ssb.no", - "direct_person_identifying": false, - "data_source": null, - "population_description": null, - "comment": null, - "temporality_type": null, - "measurement_unit": null, - "format": null, - "classification_uri": null, - "sentinel_value_uri": null, - "invalid_value_description": null, - "id": null, - "contains_data_from": null, - "contains_data_until": null - }, - { - "short_name": "sivilstand", - "name": { + "data_source": { "en": "", "nn": "", - "nb": "g" + "nb": "Skatteetaten" }, - "data_type": "STRING", - "variable_role": "MEASURE", - "definition_uri": "ssb.no", - "direct_person_identifying": false, - "data_source": null, - "population_description": null, - "comment": null, - "temporality_type": null, - "measurement_unit": null, - "format": null, - "classification_uri": null, - "sentinel_value_uri": null, - "invalid_value_description": null, - "id": null, - "contains_data_from": null, - "contains_data_until": null - }, - { - "short_name": "alm_inntekt", - "name": { + "register_uri": null, + "population_description": { "en": "", "nn": "", - "nb": "g" + "nb": "Norsk befolkning" }, - "data_type": "INTEGER", - "variable_role": "MEASURE", - "definition_uri": "ssb.no", - "direct_person_identifying": false, - "data_source": null, - "population_description": null, - "comment": null, - "temporality_type": null, - "measurement_unit": null, - "format": null, - "classification_uri": null, - "sentinel_value_uri": null, - "invalid_value_description": null, - "id": null, - "contains_data_from": null, - "contains_data_until": null - }, - { - "short_name": "sykepenger", - "name": { + "version": "1", + "version_description": null, + "unit_type": "PERSON", + "temporality_type": "STATUS", + "subject_field": { "en": "", "nn": "", - "nb": "g" + "nb": "Skatt" }, - "data_type": "INTEGER", - "variable_role": "MEASURE", - "definition_uri": "ssb.no", - "direct_person_identifying": false, - "data_source": null, - "population_description": null, - "comment": null, - "temporality_type": null, - "measurement_unit": null, - "format": null, - "classification_uri": null, - "sentinel_value_uri": null, - "invalid_value_description": null, - "id": null, - "contains_data_from": null, - "contains_data_until": null - }, - { - "short_name": "ber_bruttoformue", - "name": { + "keyword": [ + "Skatt", + "Person", + "Helsepenger" + ], + "spatial_coverage_description": { "en": "", "nn": "", - "nb": "g" + "nb": "Norge" }, - "data_type": "INTEGER", - "variable_role": "MEASURE", - "definition_uri": "ssb.no", - "direct_person_identifying": false, - "data_source": null, - "population_description": null, - "comment": null, - "temporality_type": null, - "measurement_unit": null, - "format": null, - "classification_uri": null, - "sentinel_value_uri": null, - "invalid_value_description": null, - "id": null, - "contains_data_from": null, - "contains_data_until": null + "id": "143fca77-ef56-419c-a1e1-d69c4199f020", + "owner": null, + "file_path": null, + "metadata_created_date": "2022-09-05T11:07:14Z", + "metadata_created_by": "default_user@ssb.no", + "metadata_last_updated_date": "2024-01-08T15:41:05.681664Z", + "metadata_last_updated_by": "default_user@ssb.no", + "contains_data_from": "2010-09-05", + "contains_data_until": "2022-09-05" }, - { - "short_name": "fullf_utdanning", - "name": { - "en": "", - "nn": "", - "nb": "g" + "variables": [ + { + "short_name": "pers_id", + "name": { + "en": "", + "nn": "", + "nb": "g" + }, + "data_type": "STRING", + "variable_role": "IDENTIFIER", + "definition_uri": "https://www.ssb.no/a/metadata/conceptvariable/vardok/26/nb", + "direct_person_identifying": true, + "data_source": null, + "population_description": null, + "comment": null, + "temporality_type": null, + "measurement_unit": null, + "format": null, + "classification_uri": null, + "sentinel_value_uri": null, + "invalid_value_description": null, + "id": null, + "contains_data_from": null, + "contains_data_until": null }, - "data_type": "STRING", - "variable_role": "MEASURE", - "definition_uri": "ssb.no", - "direct_person_identifying": false, - "data_source": null, - "population_description": null, - "comment": null, - "temporality_type": null, - "measurement_unit": null, - "format": null, - "classification_uri": null, - "sentinel_value_uri": null, - "invalid_value_description": null, - "id": null, - "contains_data_from": null, - "contains_data_until": null - }, - { - "short_name": "hoveddiagnose", - "name": { - "en": "", - "nn": "", - "nb": "g" + { + "short_name": "tidspunkt", + "name": { + "en": "", + "nn": "", + "nb": "g" + }, + "data_type": "DATETIME", + "variable_role": "START_TIME", + "definition_uri": null, + "direct_person_identifying": false, + "data_source": null, + "population_description": null, + "comment": null, + "temporality_type": null, + "measurement_unit": null, + "format": null, + "classification_uri": null, + "sentinel_value_uri": null, + "invalid_value_description": null, + "id": null, + "contains_data_from": null, + "contains_data_until": null + }, + { + "short_name": "sivilstand", + "name": { + "en": "", + "nn": "", + "nb": "g" + }, + "data_type": "STRING", + "variable_role": "MEASURE", + "definition_uri": "https://www.ssb.no/a/metadata/conceptvariable/vardok/91/nb", + "direct_person_identifying": false, + "data_source": null, + "population_description": null, + "comment": null, + "temporality_type": null, + "measurement_unit": null, + "format": null, + "classification_uri": null, + "sentinel_value_uri": null, + "invalid_value_description": null, + "id": null, + "contains_data_from": null, + "contains_data_until": null + }, + { + "short_name": "alm_inntekt", + "name": { + "en": "", + "nn": "", + "nb": "g" + }, + "data_type": "INTEGER", + "variable_role": "MEASURE", + "definition_uri": "https://www.ssb.no/a/metadata/conceptvariable/vardok/1438/nb", + "direct_person_identifying": false, + "data_source": null, + "population_description": null, + "comment": null, + "temporality_type": null, + "measurement_unit": null, + "format": null, + "classification_uri": null, + "sentinel_value_uri": null, + "invalid_value_description": null, + "id": null, + "contains_data_from": null, + "contains_data_until": null + }, + { + "short_name": "sykepenger", + "name": { + "en": "", + "nn": "", + "nb": "g" + }, + "data_type": "INTEGER", + "variable_role": "MEASURE", + "definition_uri": "https://www.ssb.no/metadata/conceptvariable/vardok/3366/nb", + "direct_person_identifying": false, + "data_source": null, + "population_description": null, + "comment": null, + "temporality_type": null, + "measurement_unit": null, + "format": null, + "classification_uri": null, + "sentinel_value_uri": null, + "invalid_value_description": null, + "id": null, + "contains_data_from": null, + "contains_data_until": null }, - "data_type": "STRING", - "variable_role": "MEASURE", - "definition_uri": "ssb.no", - "direct_person_identifying": false, - "data_source": null, - "population_description": null, - "comment": null, - "temporality_type": null, - "measurement_unit": null, - "format": null, - "classification_uri": null, - "sentinel_value_uri": null, - "invalid_value_description": null, - "id": null, - "contains_data_from": null, - "contains_data_until": null - } - ] + { + "short_name": "ber_bruttoformue", + "name": { + "en": "", + "nn": "", + "nb": "g" + }, + "data_type": "INTEGER", + "variable_role": "MEASURE", + "definition_uri": "https://www.ssb.no/metadata/conceptvariable/vardok/3327/nb", + "direct_person_identifying": false, + "data_source": null, + "population_description": null, + "comment": null, + "temporality_type": null, + "measurement_unit": null, + "format": null, + "classification_uri": null, + "sentinel_value_uri": null, + "invalid_value_description": null, + "id": null, + "contains_data_from": null, + "contains_data_until": null + }, + { + "short_name": "fullf_utdanning", + "name": { + "en": "", + "nn": "", + "nb": "g" + }, + "data_type": "STRING", + "variable_role": "MEASURE", + "definition_uri": "https://www.ssb.no/metadata/conceptvariable/vardok/3242/nb", + "direct_person_identifying": false, + "data_source": null, + "population_description": null, + "comment": null, + "temporality_type": null, + "measurement_unit": null, + "format": null, + "classification_uri": null, + "sentinel_value_uri": null, + "invalid_value_description": null, + "id": null, + "contains_data_from": null, + "contains_data_until": null + }, + { + "short_name": "hoveddiagnose", + "name": { + "en": "", + "nn": "", + "nb": "g" + }, + "data_type": "STRING", + "variable_role": "MEASURE", + "definition_uri": "https://www.ssb.no/metadata/conceptvariable/vardok/2578/nb", + "direct_person_identifying": false, + "data_source": null, + "population_description": null, + "comment": null, + "temporality_type": null, + "measurement_unit": null, + "format": null, + "classification_uri": null, + "sentinel_value_uri": null, + "invalid_value_description": null, + "id": null, + "contains_data_from": null, + "contains_data_until": null + } + ] + }, + "pseudonymization": null } diff --git a/datadoc/tests/resources/existing_metadata_file/valid_id_field/person_data_v1__DOC.json b/datadoc/tests/resources/existing_metadata_file/valid_id_field/person_data_v1__DOC.json index 8f830f7c..fd42a80c 100644 --- a/datadoc/tests/resources/existing_metadata_file/valid_id_field/person_data_v1__DOC.json +++ b/datadoc/tests/resources/existing_metadata_file/valid_id_field/person_data_v1__DOC.json @@ -1,192 +1,201 @@ { - "percentage_complete": 39, - "document_version": "1", - "dataset": { - "short_name": "person_data_v1", - "assessment": "OPEN", - "dataset_status": "INTERNAL", - "dataset_state": "PROCESSED_DATA", - "name": { - "en": "successfully_read_existing_file", - "nn": "", - "nb": "Bokm\u00e5l navn" - }, - "data_source": "", - "population_description": null, - "version": "1", - "unit_type": "PERSON", - "temporality_type": null, - "description": null, - "subject_field": null, - "spatial_coverage_description": null, - "id": "143fca77-ef56-419c-a1e1-d69c4199f020", - "owner": null, - "data_source_path": "klargjorte_data/person_data_v1.parquet", - "created_date": "2022-07-13T17:28:01.617657", - "created_by": "default_user@ssb.no", - "last_updated_date": null, - "last_updated_by": null - }, - "variables": [ - { - "short_name": "pers_id", - "name": null, - "datatype": "STRING", - "variable_role": null, - "definition_uri": null, - "direct_person_identifying": null, - "data_source": null, - "population_description": null, - "comment": null, - "temporality_type": null, - "measurement_unit": null, - "format": null, - "classification_uri": null, - "sentinel_value_uri": null, - "invalid_value_description": null, - "id": null, - "contains_data_from": null, - "contains_data_until": null - }, - { - "short_name": "tidspunkt", - "name": null, - "datatype": "DATETIME", - "variable_role": null, - "definition_uri": null, - "direct_person_identifying": null, - "data_source": null, - "population_description": null, - "comment": null, - "temporality_type": null, - "measurement_unit": null, - "format": null, - "classification_uri": null, - "sentinel_value_uri": null, - "invalid_value_description": null, - "id": null, - "contains_data_from": null, - "contains_data_until": null - }, - { - "short_name": "sivilstand", - "name": null, - "datatype": "STRING", - "variable_role": null, - "definition_uri": null, - "direct_person_identifying": null, - "data_source": null, - "population_description": null, - "comment": null, - "temporality_type": null, - "measurement_unit": null, - "format": null, - "classification_uri": null, - "sentinel_value_uri": null, - "invalid_value_description": null, - "id": null, - "contains_data_from": null, - "contains_data_until": null - }, - { - "short_name": "alm_inntekt", - "name": null, - "datatype": "INTEGER", - "variable_role": null, - "definition_uri": null, - "direct_person_identifying": null, - "data_source": null, - "population_description": null, - "comment": null, - "temporality_type": null, - "measurement_unit": null, - "format": null, - "classification_uri": null, - "sentinel_value_uri": null, - "invalid_value_description": null, - "id": null, - "contains_data_from": null, - "contains_data_until": null - }, - { - "short_name": "sykepenger", - "name": null, - "datatype": "INTEGER", - "variable_role": null, - "definition_uri": null, - "direct_person_identifying": null, - "data_source": null, - "population_description": null, - "comment": null, - "temporality_type": null, - "measurement_unit": null, - "format": null, - "classification_uri": null, - "sentinel_value_uri": null, - "invalid_value_description": null, - "id": null, - "contains_data_from": null, - "contains_data_until": null - }, - { - "short_name": "ber_bruttoformue", - "name": null, - "datatype": "INTEGER", - "variable_role": null, - "definition_uri": null, - "direct_person_identifying": null, - "data_source": null, - "population_description": null, - "comment": null, - "temporality_type": null, - "measurement_unit": null, - "format": null, - "classification_uri": null, - "sentinel_value_uri": null, - "invalid_value_description": null, - "id": null, - "contains_data_from": null, - "contains_data_until": null - }, - { - "short_name": "fullf_utdanning", - "name": null, - "datatype": "STRING", - "variable_role": null, - "definition_uri": null, - "direct_person_identifying": null, + "document_version": "0.0.1", + "datadoc": { + "percentage_complete": 6, + "document_version": "2.0.0", + "dataset": { + "short_name": "person_data_v1", + "assessment": "OPEN", + "dataset_status": "INTERNAL", + "dataset_state": "PROCESSED_DATA", + "name": { + "en": "successfully_read_existing_file", + "nn": "", + "nb": "Bokmål navn" + }, + "description": null, "data_source": null, + "register_uri": null, "population_description": null, - "comment": null, + "version": "1", + "version_description": null, + "unit_type": "PERSON", "temporality_type": null, - "measurement_unit": null, - "format": null, - "classification_uri": null, - "sentinel_value_uri": null, - "invalid_value_description": null, - "id": null, + "subject_field": null, + "keyword": null, + "spatial_coverage_description": null, + "id": "143fca77-ef56-419c-a1e1-d69c4199f020", + "owner": null, + "file_path": null, + "metadata_created_date": "2022-07-13T15:28:01Z", + "metadata_created_by": "default_user@ssb.no", + "metadata_last_updated_date": "2024-01-08T15:21:19.977721Z", + "metadata_last_updated_by": "default_user@ssb.no", "contains_data_from": null, "contains_data_until": null }, - { - "short_name": "hoveddiagnose", - "name": null, - "datatype": "STRING", - "variable_role": null, - "definition_uri": null, - "direct_person_identifying": null, - "data_source": null, - "population_description": null, - "comment": null, - "temporality_type": null, - "measurement_unit": null, - "format": null, - "classification_uri": null, - "sentinel_value_uri": null, - "invalid_value_description": null, - "id": null, - "contains_data_from": null, - "contains_data_until": null - } - ] + "variables": [ + { + "short_name": "pers_id", + "name": null, + "data_type": null, + "variable_role": null, + "definition_uri": null, + "direct_person_identifying": null, + "data_source": null, + "population_description": null, + "comment": null, + "temporality_type": null, + "measurement_unit": null, + "format": null, + "classification_uri": null, + "sentinel_value_uri": null, + "invalid_value_description": null, + "id": null, + "contains_data_from": null, + "contains_data_until": null + }, + { + "short_name": "tidspunkt", + "name": null, + "data_type": null, + "variable_role": null, + "definition_uri": null, + "direct_person_identifying": null, + "data_source": null, + "population_description": null, + "comment": null, + "temporality_type": null, + "measurement_unit": null, + "format": null, + "classification_uri": null, + "sentinel_value_uri": null, + "invalid_value_description": null, + "id": null, + "contains_data_from": null, + "contains_data_until": null + }, + { + "short_name": "sivilstand", + "name": null, + "data_type": null, + "variable_role": null, + "definition_uri": null, + "direct_person_identifying": null, + "data_source": null, + "population_description": null, + "comment": null, + "temporality_type": null, + "measurement_unit": null, + "format": null, + "classification_uri": null, + "sentinel_value_uri": null, + "invalid_value_description": null, + "id": null, + "contains_data_from": null, + "contains_data_until": null + }, + { + "short_name": "alm_inntekt", + "name": null, + "data_type": null, + "variable_role": null, + "definition_uri": null, + "direct_person_identifying": null, + "data_source": null, + "population_description": null, + "comment": null, + "temporality_type": null, + "measurement_unit": null, + "format": null, + "classification_uri": null, + "sentinel_value_uri": null, + "invalid_value_description": null, + "id": null, + "contains_data_from": null, + "contains_data_until": null + }, + { + "short_name": "sykepenger", + "name": null, + "data_type": null, + "variable_role": null, + "definition_uri": null, + "direct_person_identifying": null, + "data_source": null, + "population_description": null, + "comment": null, + "temporality_type": null, + "measurement_unit": null, + "format": null, + "classification_uri": null, + "sentinel_value_uri": null, + "invalid_value_description": null, + "id": null, + "contains_data_from": null, + "contains_data_until": null + }, + { + "short_name": "ber_bruttoformue", + "name": null, + "data_type": null, + "variable_role": null, + "definition_uri": null, + "direct_person_identifying": null, + "data_source": null, + "population_description": null, + "comment": null, + "temporality_type": null, + "measurement_unit": null, + "format": null, + "classification_uri": null, + "sentinel_value_uri": null, + "invalid_value_description": null, + "id": null, + "contains_data_from": null, + "contains_data_until": null + }, + { + "short_name": "fullf_utdanning", + "name": null, + "data_type": null, + "variable_role": null, + "definition_uri": null, + "direct_person_identifying": null, + "data_source": null, + "population_description": null, + "comment": null, + "temporality_type": null, + "measurement_unit": null, + "format": null, + "classification_uri": null, + "sentinel_value_uri": null, + "invalid_value_description": null, + "id": null, + "contains_data_from": null, + "contains_data_until": null + }, + { + "short_name": "hoveddiagnose", + "name": null, + "data_type": null, + "variable_role": null, + "definition_uri": null, + "direct_person_identifying": null, + "data_source": null, + "population_description": null, + "comment": null, + "temporality_type": null, + "measurement_unit": null, + "format": null, + "classification_uri": null, + "sentinel_value_uri": null, + "invalid_value_description": null, + "id": null, + "contains_data_from": null, + "contains_data_until": null + } + ] + }, + "pseudonymization": null } diff --git a/klargjorte_data/befolkning/person_testdata_p2021-12-31_p2021-12-31_v1.parquet b/datadoc/tests/resources/klargjorte_data/befolkning/person_testdata_p2021-12-31_p2021-12-31_v1.parquet similarity index 100% rename from klargjorte_data/befolkning/person_testdata_p2021-12-31_p2021-12-31_v1.parquet rename to datadoc/tests/resources/klargjorte_data/befolkning/person_testdata_p2021-12-31_p2021-12-31_v1.parquet diff --git a/datadoc/tests/resources/klargjorte_data/befolkning/person_testdata_p2021-12-31_p2021-12-31_v1__DOC.json b/datadoc/tests/resources/klargjorte_data/befolkning/person_testdata_p2021-12-31_p2021-12-31_v1__DOC.json new file mode 100644 index 00000000..2bf5e75b --- /dev/null +++ b/datadoc/tests/resources/klargjorte_data/befolkning/person_testdata_p2021-12-31_p2021-12-31_v1__DOC.json @@ -0,0 +1,220 @@ +{ + "document_version": "0.0.1", + "datadoc": { + "percentage_complete": 97, + "document_version": "2.0.0", + "dataset": { + "short_name": "person_testdata_p2021-12-31_p2021-12-31", + "assessment": "PROTECTED", + "dataset_status": "DRAFT", + "dataset_state": "PROCESSED_DATA", + "name": { + "en": "", + "nn": "", + "nb": "Persondata demo datasett" + }, + "description": { + "en": "", + "nn": "", + "nb": "Person-data til demo og testing av DataDoc" + }, + "data_source": { + "en": "", + "nn": "", + "nb": "Skatteetaten, FREG og personskatt" + }, + "register_uri": null, + "population_description": { + "en": "", + "nn": "", + "nb": "Alle bosatte i Norge" + }, + "version": "1", + "version_description": { + "en": "", + "nn": "", + "nb": "Opprettelse" + }, + "unit_type": "PERSON", + "temporality_type": "STATUS", + "subject_field": { + "en": "", + "nn": "", + "nb": "befolkning" + }, + "keyword": [ + "befolkning", + "skatt" + ], + "spatial_coverage_description": { + "en": "", + "nn": "", + "nb": "Norge" + }, + "id": "2f72477a-f051-43ee-bf8b-0d8f47b5e0a7", + "owner": { + "en": "", + "nn": "", + "nb": "Seksjon NNN" + }, + "file_path": null, + "metadata_created_date": "2022-10-07T07:35:01Z", + "metadata_created_by": "default_user@ssb.no", + "metadata_last_updated_date": "2024-01-08T15:49:17.489872Z", + "metadata_last_updated_by": "default_user@ssb.no", + "contains_data_from": "2021-12-31", + "contains_data_until": "2021-12-31" + }, + "variables": [ + { + "short_name": "fnr", + "name": { + "en": "Personal number", + "nn": "", + "nb": "Fødselsnummer" + }, + "data_type": "STRING", + "variable_role": "IDENTIFIER", + "definition_uri": "https://www.ssb.no/a/metadata/conceptvariable/vardok/26/nb", + "direct_person_identifying": true, + "data_source": null, + "population_description": null, + "comment": null, + "temporality_type": null, + "measurement_unit": null, + "format": null, + "classification_uri": null, + "sentinel_value_uri": null, + "invalid_value_description": null, + "id": null, + "contains_data_from": null, + "contains_data_until": null + }, + { + "short_name": "sivilstand", + "name": { + "en": "Marital status", + "nn": "", + "nb": "Sivilstand" + }, + "data_type": "STRING", + "variable_role": "MEASURE", + "definition_uri": "https://www.ssb.no/a/metadata/conceptvariable/vardok/91/nb", + "direct_person_identifying": false, + "data_source": null, + "population_description": null, + "comment": null, + "temporality_type": null, + "measurement_unit": null, + "format": null, + "classification_uri": null, + "sentinel_value_uri": null, + "invalid_value_description": null, + "id": null, + "contains_data_from": null, + "contains_data_until": null + }, + { + "short_name": "bostedskommune", + "name": { + "en": "Residential district", + "nn": "", + "nb": "Bostedskommune" + }, + "data_type": "STRING", + "variable_role": "MEASURE", + "definition_uri": "https://www.ssb.no/a/metadata/conceptvariable/vardok/94/nb", + "direct_person_identifying": false, + "data_source": null, + "population_description": null, + "comment": null, + "temporality_type": null, + "measurement_unit": null, + "format": null, + "classification_uri": null, + "sentinel_value_uri": null, + "invalid_value_description": null, + "id": null, + "contains_data_from": null, + "contains_data_until": null + }, + { + "short_name": "inntekt", + "name": { + "en": "Income", + "nn": "", + "nb": "Alminnelig inntekt" + }, + "data_type": "INTEGER", + "variable_role": "MEASURE", + "definition_uri": "https://www.ssb.no/a/metadata/conceptvariable/vardok/3495/nb", + "direct_person_identifying": false, + "data_source": null, + "population_description": null, + "comment": null, + "temporality_type": null, + "measurement_unit": "NOK", + "format": null, + "classification_uri": null, + "sentinel_value_uri": null, + "invalid_value_description": null, + "id": null, + "contains_data_from": null, + "contains_data_until": null + }, + { + "short_name": "bankinnskudd", + "name": { + "en": "Bank transfer (total)", + "nn": "", + "nb": "Bankinnskudd totalt" + }, + "data_type": "INTEGER", + "variable_role": "MEASURE", + "definition_uri": "https://www.ssb.no/a/metadata/conceptvariable/vardok/591/nb", + "direct_person_identifying": false, + "data_source": null, + "population_description": null, + "comment": null, + "temporality_type": null, + "measurement_unit": "NOK", + "format": null, + "classification_uri": null, + "sentinel_value_uri": null, + "invalid_value_description": null, + "id": null, + "contains_data_from": null, + "contains_data_until": null + }, + { + "short_name": "dato", + "name": { + "en": "Date of registration", + "nn": "", + "nb": "Registreringsdato" + }, + "data_type": "DATETIME", + "variable_role": "START_TIME", + "definition_uri": null, + "direct_person_identifying": false, + "data_source": null, + "population_description": null, + "comment": { + "en": "", + "nn": "", + "nb": "Måletidspunkt for data" + }, + "temporality_type": null, + "measurement_unit": null, + "format": "YYYY-MM-DD", + "classification_uri": null, + "sentinel_value_uri": null, + "invalid_value_description": null, + "id": null, + "contains_data_from": null, + "contains_data_until": null + } + ] + }, + "pseudonymization": null +} diff --git a/datadoc/tests/test_datadoc_metadata.py b/datadoc/tests/test_datadoc_metadata.py index 55aa5e7e..ae18c83c 100644 --- a/datadoc/tests/test_datadoc_metadata.py +++ b/datadoc/tests/test_datadoc_metadata.py @@ -78,7 +78,6 @@ def test_metadata_document_percent_complete(metadata: DataDocMetadata): variable_2 = Variable(data_type=DataType.INTEGER) document = DatadocJsonSchema( percentage_complete=0, - document_version="1.0.0", dataset=dataset, variables=[variable_1, variable_2], ) @@ -110,27 +109,24 @@ def test_write_metadata_document( with Path.open(written_document) as f: written_metadata = json.loads(f.read()) + datadoc_metadata = written_metadata["datadoc"]["dataset"] assert ( # Use our pydantic model to read in the datetime string so we get the correct format Dataset( - metadata_created_date=written_metadata["dataset"]["metadata_created_date"], + metadata_created_date=datadoc_metadata["metadata_created_date"], ).metadata_created_date == dummy_timestamp ) - assert written_metadata["dataset"]["metadata_created_by"] == PLACEHOLDER_USERNAME + assert datadoc_metadata["metadata_created_by"] == PLACEHOLDER_USERNAME assert ( # Use our pydantic model to read in the datetime string so we get the correct format Dataset( - metadata_last_updated_date=written_metadata["dataset"][ - "metadata_last_updated_date" - ], + metadata_last_updated_date=datadoc_metadata["metadata_last_updated_date"], ).metadata_last_updated_date == dummy_timestamp ) - assert ( - written_metadata["dataset"]["metadata_last_updated_by"] == PLACEHOLDER_USERNAME - ) + assert datadoc_metadata["metadata_last_updated_by"] == PLACEHOLDER_USERNAME @pytest.mark.usefixtures("existing_metadata_file", "remove_document_file") @@ -163,12 +159,12 @@ def test_existing_metadata_none_id( pre_open_id = "" post_write_id = "" with Path.open(Path(existing_metadata_file)) as f: - pre_open_id = json.load(f)["dataset"]["id"] + pre_open_id = json.load(f)["datadoc"]["dataset"]["id"] assert pre_open_id is None assert isinstance(metadata.meta.dataset.id, UUID) metadata.write_metadata_document() with Path.open(Path(existing_metadata_file)) as f: - post_write_id = json.load(f)["dataset"]["id"] + post_write_id = json.load(f)["datadoc"]["dataset"]["id"] assert post_write_id == str(metadata.meta.dataset.id) @@ -184,13 +180,13 @@ def test_existing_metadata_valid_id( pre_open_id = "" post_write_id = "" with Path.open(Path(existing_metadata_file)) as f: - pre_open_id = json.load(f)["dataset"]["id"] + pre_open_id = json.load(f)["datadoc"]["dataset"]["id"] assert pre_open_id is not None assert isinstance(metadata.meta.dataset.id, UUID) assert str(metadata.meta.dataset.id) == pre_open_id metadata.write_metadata_document() with Path.open(Path(existing_metadata_file)) as f: - post_write_id = json.load(f)["dataset"]["id"] + post_write_id = json.load(f)["datadoc"]["dataset"]["id"] assert post_write_id == pre_open_id diff --git a/datadoc/tests/test_model_backwards_compatibility.py b/datadoc/tests/test_model_backwards_compatibility.py index ef7268a3..1da89468 100644 --- a/datadoc/tests/test_model_backwards_compatibility.py +++ b/datadoc/tests/test_model_backwards_compatibility.py @@ -22,17 +22,16 @@ def test_existing_metadata_current_model_version(): - current_model_version = "1.0.0" + current_model_version = "2.0.0" fresh_metadata = {"document_version": current_model_version} - upgraded_metadata = upgrade_metadata(fresh_metadata, current_model_version) + upgraded_metadata = upgrade_metadata(fresh_metadata) assert upgraded_metadata == fresh_metadata def test_existing_metadata_unknown_model_version(): - current_model_version = "1.0.0" fresh_metadata = {"document_version": "0.27.65"} with pytest.raises(UnknownModelVersionError): - upgrade_metadata(fresh_metadata, current_model_version) + upgrade_metadata(fresh_metadata) @pytest.mark.parametrize( @@ -45,20 +44,8 @@ def test_backwards_compatibility( existing_metadata_file: str, metadata: DataDocMetadata, ): - # Parameterise with all known backwards compatible versions with Path.open(Path(existing_metadata_file)) as f: file_metadata = json.loads(f.read()) - in_file_values = [ - v for v in file_metadata["dataset"].values() if v not in ["", None] - ] - read_in_values = json.loads( - metadata.meta.dataset.model_dump_json(exclude_none=True), - ).values() - - missing_values = [v for v in in_file_values if v not in read_in_values] - if missing_values: - msg = f"Some values were not successfully read in! {missing_values = }" - raise AssertionError( - msg, - ) + # Just test a single value to make sure we have a working model + assert metadata.meta.dataset.name.en == file_metadata["dataset"]["name"]["en"] diff --git a/datadoc/utils.py b/datadoc/utils.py index 30fae564..f56c8964 100644 --- a/datadoc/utils.py +++ b/datadoc/utils.py @@ -3,6 +3,7 @@ import datetime from datadoc_model import model +from pydantic import AnyUrl from datadoc.enums import SupportedLanguages @@ -33,6 +34,8 @@ def get_display_values( for field_name, value in variable: if isinstance(value, model.LanguageStringType): return_dict[field_name] = value.model_dump()[current_language.value] + elif isinstance(value, AnyUrl): + return_dict[field_name] = str(value) else: return_dict[field_name] = value return return_dict diff --git a/klargjorte_data/befolkning/person_testdata_p2021-12-31_p2021-12-31_v1__DOC.json b/klargjorte_data/befolkning/person_testdata_p2021-12-31_p2021-12-31_v1__DOC.json deleted file mode 100644 index aafdeeed..00000000 --- a/klargjorte_data/befolkning/person_testdata_p2021-12-31_p2021-12-31_v1__DOC.json +++ /dev/null @@ -1,219 +0,0 @@ -{ - "percentage_complete": 97, - "document_version": "1.0.0", - "dataset": { - "short_name": "person_testdata_p2021-12-31_p2021-12-31", - "assessment": "PROTECTED", - "dataset_status": "DRAFT", - "dataset_state": "PROCESSED_DATA", - "name": { - "en": "", - "nn": "", - "nb": "Persondata demo datasett" - }, - "data_source": { - "en": "", - "nn": "", - "nb": "Skatteetaten, FREG og personskatt" - }, - "population_description": { - "en": "", - "nn": "", - "nb": "Alle bosatte i Norge" - }, - "version": "1", - "version_description": "Opprettelse", - "unit_type": "PERSON", - "temporality_type": "STATUS", - "description": { - "en": "", - "nn": "", - "nb": "Person-data til demo og testing av DataDoc" - }, - "subject_field": { - "en": "", - "nn": "", - "nb": "befolkning" - }, - "keyword": [ - "befolkning", - "skatt" - ], - "spatial_coverage_description": { - "en": "", - "nn": "", - "nb": "Norge" - }, - "id": "2f72477a-f051-43ee-bf8b-0d8f47b5e0a7", - "owner": { - "en": "", - "nn": "", - "nb": "Seksjon NNN" - }, - "data_source_path": "/klargjorte_data/befolkning/person_testdata_p2021-12-31_p2021-12-31_v1.parquet", - "metadata_created_date": "2022-10-07T09:35:01.192898", - "metadata_created_by": "default_user@ssb.no", - "metadata_last_updated_date": "2022-10-10T13:00:12.259945", - "metadata_last_updated_by": "default_user@ssb.no", - "contains_data_from": "2021-12-31", - "contains_data_until": "2021-12-31" - }, - "variables": [ - { - "short_name": "fnr", - "name": { - "en": "Personal number", - "nn": "", - "nb": "F\u00f8dselsnummer" - }, - "data_type": "STRING", - "variable_role": "IDENTIFIER", - "definition_uri": "https://www.ssb.no/a/metadata/conceptvariable/vardok/26/nb", - "direct_person_identifying": true, - "data_source": null, - "population_description": null, - "comment": null, - "temporality_type": null, - "measurement_unit": null, - "format": null, - "classification_uri": null, - "sentinel_value_uri": null, - "invalid_value_description": null, - "id": null, - "contains_data_from": null, - "contains_data_until": null - }, - { - "short_name": "sivilstand", - "name": { - "en": "Marital status", - "nn": "", - "nb": "Sivilstand" - }, - "data_type": "STRING", - "variable_role": "MEASURE", - "definition_uri": "https://www.ssb.no/a/metadata/conceptvariable/vardok/91/nb", - "direct_person_identifying": false, - "data_source": null, - "population_description": null, - "comment": null, - "temporality_type": null, - "measurement_unit": null, - "format": null, - "classification_uri": null, - "sentinel_value_uri": null, - "invalid_value_description": null, - "id": null, - "contains_data_from": null, - "contains_data_until": null - }, - { - "short_name": "bostedskommune", - "name": { - "en": "Residential district", - "nn": "", - "nb": "Bostedskommune" - }, - "data_type": "STRING", - "variable_role": "MEASURE", - "definition_uri": "https://www.ssb.no/a/metadata/conceptvariable/vardok/94/nb", - "direct_person_identifying": false, - "data_source": null, - "population_description": null, - "comment": null, - "temporality_type": null, - "measurement_unit": null, - "format": null, - "classification_uri": null, - "sentinel_value_uri": null, - "invalid_value_description": null, - "id": null, - "contains_data_from": null, - "contains_data_until": null - }, - { - "short_name": "inntekt", - "name": { - "en": "Income", - "nn": "", - "nb": "Alminnelig inntekt" - }, - "data_type": "INTEGER", - "variable_role": "MEASURE", - "definition_uri": "https://www.ssb.no/a/metadata/conceptvariable/vardok/3495/nb", - "direct_person_identifying": false, - "data_source": null, - "population_description": null, - "comment": null, - "temporality_type": null, - "measurement_unit": { - "en": "", - "nn": "", - "nb": "NOK" - }, - "format": null, - "classification_uri": null, - "sentinel_value_uri": null, - "invalid_value_description": null, - "id": null, - "contains_data_from": null, - "contains_data_until": null - }, - { - "short_name": "bankinnskudd", - "name": { - "en": "Bank transfer (total)", - "nn": "", - "nb": "Bankinnskudd totalt" - }, - "data_type": "INTEGER", - "variable_role": "MEASURE", - "definition_uri": "https://www.ssb.no/a/metadata/conceptvariable/vardok/591/nb", - "direct_person_identifying": false, - "data_source": null, - "population_description": null, - "comment": null, - "temporality_type": null, - "measurement_unit": { - "en": "", - "nn": "", - "nb": "NOK" - }, - "format": null, - "classification_uri": null, - "sentinel_value_uri": null, - "invalid_value_description": null, - "id": null, - "contains_data_from": null, - "contains_data_until": null - }, - { - "short_name": "dato", - "name": { - "en": "Date of registration", - "nn": "", - "nb": "Registreringsdato" - }, - "data_type": "DATETIME", - "variable_role": "START_TIME", - "definition_uri": null, - "direct_person_identifying": false, - "data_source": null, - "population_description": null, - "comment": { - "en": "", - "nn": "", - "nb": "M\u00e5letidspunkt for data" - }, - "temporality_type": null, - "measurement_unit": null, - "format": "YYYY-MM-DD", - "classification_uri": null, - "sentinel_value_uri": null, - "invalid_value_description": null, - "id": null, - "contains_data_from": null, - "contains_data_until": null - } - ] -} diff --git a/klargjorte_data/person_data_v1.parquet b/klargjorte_data/person_data_v1.parquet deleted file mode 100644 index e69b6837..00000000 Binary files a/klargjorte_data/person_data_v1.parquet and /dev/null differ