diff --git a/docs/source/api/stac_generator/stac_generator.rst b/docs/source/api/stac_generator/stac_generator.rst index 4811e4a..3b78202 100644 --- a/docs/source/api/stac_generator/stac_generator.rst +++ b/docs/source/api/stac_generator/stac_generator.rst @@ -4,10 +4,10 @@ STAC Generator :fa:`github` `View on Github `_ -.. automodule:: stac_generator.core.processor +.. automodule:: stac_generator.core.extraction_method :members: .. autoclass:: stac_generator.core.generator.BaseGenerator -.. automodule:: stac_generator.core.collection_describer +.. automodule:: stac_generator.core.baker :members: diff --git a/docs/source/collection_descriptions/building_a_workflow.rst b/docs/source/collection_descriptions/building_a_workflow.rst deleted file mode 100644 index 538d1e3..0000000 --- a/docs/source/collection_descriptions/building_a_workflow.rst +++ /dev/null @@ -1,110 +0,0 @@ -********************************** -Building an Collection Description -********************************** - -Building an STAC catalog workflow consists of 4 mains steps: - 1. Write an :ref:`item_description ` file to describe the workflow - 2. Test the workflow on a subset of data - 3. Index that subset of data to check it works as expected - 4. Index full dataset - -Parts 1 and 2 will likely go round in a loop, whilst you are developing the -workflow file, with several iterations until you are happy. - -1. Write a collection-description -================================= - -A basic collection-description consists of 5 sections: - 1. ``paths`` - 2. ``collection`` - 3. ``item`` - 4. ``asset`` - -An example collection-description can be found :ref:`here ` - -The extraction methods section describes how the facets are extracted from the data. - -Assets are aggregated into items based on the item ID - -.. warning:: - - All assets you want to end up together, should have the same item ID. - If you generate your item ID from the filename, and not all assets you want to group - together have the same filename convention (e.g. metadata files) then they will end up independent. - -To check your collection-description works as expected, you will need to run it. - -2. Running the collection-description on a subset of data -========================================================= - -To run your workflow, you will need to create a config file. -This will define an input path and output to standard out. - -Example configuration ---------------------- - -.. include:: ../stac_generator/user_guide/example_config.rst - -You should choose a filepath with a relatively small number of files to -make iteration quick and allow you to make tweaks. - -You can then run your workflow using: - -``stac_generator `` - -.. program-output:: stac_generator -h - -.. note:: - - It is likely that this will be an iterative process to make sure that the correct - assets end up together and that all the facets are extracted as desired. - -3. Indexing the data -==================== - -.. caution:: - - Have you indexed the assets? Things may not work fully if the assets have - not been indexed as well. - -This step is as simple as changing your output plugin to point to the final destination. - -Here is an example for the elasticsearch output making use of additional kwargs: - -.. 
code-block:: - - - method: elasticsearch - connection_kwargs: - hosts: [host1] - headers: - x-api-key: - use_ssl: true - verify_certs: false - ssl_show_warn: false - index: - name: ceda-items-2021-06-09 - - method: elasticsearch - connection_kwargs: - hosts: [host1] - headers: - x-api-key: - use_ssl: true - verify_certs: false - ssl_show_warn: false - index: - name: ceda-assets-2021-06-09 - -Once this works as expected... - -4. Indexing the full dataset -============================ - -This is done by increasing the scope of the input plugin. -In the example we used the path ``/badc/faam/data/2005/b069-jan-05``. If our -description file covered ``/badc/faam/data`` we could now expand our input to cover -``/badc/faam/data``. - -.. note:: - - The higher up the tree you put the input, the longer it will take. You might - wish to consider splitting the run into smaller segments and running in parallel. diff --git a/docs/source/collection_descriptions/collection_descriptions.rst b/docs/source/collection_descriptions/collection_descriptions.rst deleted file mode 100644 index b0ac526..0000000 --- a/docs/source/collection_descriptions/collection_descriptions.rst +++ /dev/null @@ -1,130 +0,0 @@ - -*********************** -Collection Descriptions -*********************** - -.. toctree:: - :hidden: - - building_a_workflow - - -These documents describe how to process the files within a dataset and extract facets. -These documents are aggregated along the branch with the information lower in the tree taking -precendence. -Having multiple files at different points in the tree allow for a narrowing of information. -i.e. A default set at a higher level, if not overwritten, will exist at all points down the tree. - -Example Description File -======================== - -This file applies to all files listed under: ``/badc/faam/data``. - -It uses the regex processor in two forms. The first, uses a pre-processor -to reduce the filepath to just the filename and then matches a regex pattern to -extract a date. This is then passed to the ISO Date post-processor which converts -the date into the ISO8601 format. - -The second, extracts platform and flight number from the filepath. -No pre or post processing is done in this case. - -.. code-block:: yaml - - paths: - - /badc/faam/data - asset: - extraction_methods: - - method: regex - inputs: - regex: '^(?:[^_]*_){2}(?P\d*)' - -Description file sections -========================= - -The description file consists of 5 top level keys: - -A :ref:`Full JSON Schema` references below. - -.. list-table:: - :header-rows: 1 - - * - Key - - Type - - Description - - Example - * - ``paths`` - - list - - List of paths this workflow applies to. - - :ref:`Paths` - * - ``asset`` - - :ref:`Collections ` - - Defines the extraction methods to create the assets matching the given path. - * - ``item`` - - :ref:`Collections ` - - Defines the ID and extraction methods to create the items matching the given path. - * - ``collection`` - - :ref:`Collections ` - - Defines the ID and extraction methods to create the collection matching the given path. - * - ``Categories`` - - list - - Used by the asset generator to assign categories to files. - By default, all files are given the category data. - - :ref:`Categories ` - -Paths ------ - -Describes the paths where this file applies. Can be multiple locations. -The path references all points below it in the hierarchy. - -.. code-block:: yaml - - paths: - - /badc/faam/data - -Asset, Item, & Collection -------------------------- - -.. 
list-table::
-    :header-rows: 1
-
-    * - Key
-      - Type
-      - Description
-      - Example
-    * - ``id``
-      - :ref:`ID extration methods `
-      - ID for the generated object
-      - `sentinel3`
-    * - ``extration_methods``
-      - :ref:`Extration methods `
-      - A list of functions to run, and their arguments.
-
-.. code-block:: yaml
-
-    collections:
-      id:
-        method: default
-        inputs:
-          value: sentinel3
-      extraction_methods:
-        - method: regex
-          inputs:
-            regex: '^\/(?:[^/]*/)(?P\w*)\/(?:[^/]*/){2}(?P\w\d{3})'
-
-Categories
------------
-
-Used by the asset extractor to assign categories to files.
-By default, all files are given the category data.
-
-.. code-block:: yaml
-
-    categories:
-      label: metadata
-      regex: 00README
-
-Schema
--------
-
-.. program-output:: python -c "from stac_generator.core.item_describer import ItemDescription; import json; print(json.dumps(ItemDescription.schema(), indent=4))"
diff --git a/docs/source/index.rst b/docs/source/index.rst
index b89ae7f..eab3a7a 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -10,12 +10,11 @@ change the source of the files, the output of the metadata and the processing ch
 which extracts the metadata. The framework leverages a modular, plugin architecture to allow users to
 modify the workflow to fit their needs.
 
-The process expects a stream of "assets" (an asset being a file, zarr object, etc.).
+The process expects a stream of "messages" against which the recipes can be run.
 The source of this stream is configured with `input plugins `_ which could
 be as simple as listing directories on a file system or using message queues as part of
 a complex ingest system. The `generators `_ operate on this stream and
-pass to `output plugins `_. The output is at the level
-of an "asset" so higher level aggregated objects may require an aggregation step.
+pass to `output plugins `_.
 These outputs are also configurable so could dump to the terminal (for debugging), file, a data store
 (postgres, elasticsearch, etc.) or even a message queue for onward processing.
 
@@ -36,22 +35,17 @@ in a certain space and time.
 Generators
 ==========
 
-The different generators are designed to extract different levels of metadata to build the assets, items, and collections of the STAC Catalog.
+The different generators are designed to extract different levels of metadata to build the items and collections of the STAC Catalog.
 
 .. list-table::
     :header-rows: 1
 
     * - Name
       - Description
-    * - :ref:`Asset Generator `
-      - Generates STAC Assets via extraction methods specified in the :ref:`colelction descriptions `
-        focusing on file metadata (name, location, size, etc.)
-    * - :ref:`Item Generator `
-      - Generates STAC Items via extraction methods specified in the :ref:`colelction descriptions `
-        focusing on aggregation from asset metadata.
-    * - :ref:`Collection Generator `
-      - Generates STAC Collections via extraction methods specified in the :ref:`colelction descriptions `
-        focusing on aggregation from item metadata.
+    * - :ref:`Item Generator `
+      - Generates STAC Items via extraction methods specified in the relevant :ref:`recipe `.
+    * - :ref:`Collection Generator `
+      - Generates STAC Collections via extraction methods specified in the relevant :ref:`recipe `.
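+
+For example, a minimal global configuration pairing the ``file_system`` input
+and the ``standard_out`` output with the item generator (a sketch assembled
+from the sample configuration shown in the STAC Generator guide):
+
+.. code-block:: yaml
+
+    generator: item
+    recipes_root: recipes/
+    inputs:
+      - method: file_system
+        path: /badc/faam/data/2005/b069-jan-05
+    outputs:
+      - method: standard_out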
diff --git a/docs/source/recipes/building_a_workflow.rst b/docs/source/recipes/building_a_workflow.rst
new file mode 100644
index 0000000..aa718df
--- /dev/null
+++ b/docs/source/recipes/building_a_workflow.rst
@@ -0,0 +1,81 @@
+*****************
+Building a Recipe
+*****************
+
+Building a STAC catalog workflow consists of 4 main steps:
+    1. Write a :ref:`recipe ` file for each STAC level to describe the workflow
+    2. Test the workflow on a subset of data
+    3. Index that subset of data to check it works as expected
+    4. Index the full dataset
+
+Steps 1 and 2 will likely go round in a loop while you are developing the
+recipe file, with several iterations until you are happy.
+
+1. Write a recipe
+=================
+
+A basic recipe consists of up to 5 sections:
+    1. ``paths``
+    2. ``type``
+    3. ``id``
+    4. ``extraction_methods``
+    5. ``member_of``
+
+An example recipe can be found :ref:`here `.
+
+The extraction methods section describes how the facets are extracted from the data.
+
+To check your recipe works as expected, you will need to run it.
+
+2. Running the recipe on a subset of data
+=========================================
+
+To run your workflow, you will need to create a config file.
+This will define an input path and output to standard out.
+
+Example configuration
+---------------------
+
+.. include:: ../stac_generator/user_guide/example_config.rst
+
+You should choose a filepath with a relatively small number of files to
+make iteration quick and allow you to make tweaks.
+
+You can then run your workflow using:
+
+``stac_generator -c ``
+
+.. program-output:: stac_generator -h
+
+.. note::
+
+    It is likely that this will be an iterative process to make sure that the correct
+    facets are extracted and the output is as desired.
+
+3. Indexing the data
+====================
+
+This step is as simple as changing your output plugin to point to the final destination.
+
+Here is an example for the stac-fastapi output making use of additional kwargs:
+
+.. code-block:: yaml
+
+    - method: stac_fastapi
+      api_url:
+      verify: False
+
+Once this works as expected...
+
+4. Indexing the full dataset
+============================
+
+This is done by increasing the scope of the input plugin.
+In the example we used the path ``/badc/faam/data/2005/b069-jan-05``. If our
+recipe covered ``/badc/faam/data`` we could now expand our input to cover
+``/badc/faam/data``.
+
+.. note::
+
+    The higher up the tree you put the input, the longer it will take. You might
+    wish to consider splitting the run into smaller segments and running in parallel.
diff --git a/docs/source/recipes/recipes.rst b/docs/source/recipes/recipes.rst
new file mode 100644
index 0000000..276cf0d
--- /dev/null
+++ b/docs/source/recipes/recipes.rst
@@ -0,0 +1,132 @@
+
+*******
+Recipes
+*******
+
+.. toctree::
+    :hidden:
+
+    building_a_workflow
+
+
+These documents describe how to process the files within a dataset and extract facets.
+These documents are aggregated along the branch with the information lower in the tree taking
+precedence.
+Having multiple files at different points in the tree allows for a narrowing of information,
+i.e. a default set at a higher level, if not overwritten, will exist at all points down the tree.
+
+Example Recipe
+==============
+
+This file applies to all files listed under: ``/badc/faam/data``.
+
+It uses the regex extraction method to match a pattern against the filename
+and extract a date facet.
+
+.. code-block:: yaml
+
+    paths:
+      - /badc/faam/data
+    type: item
+    extraction_methods:
+      - method: regex
+        inputs:
+          regex: '^(?:[^_]*_){2}(?P\d*)'
+
+Recipe sections
+===============
+
+A recipe file consists of 5 top level keys.
+
+A :ref:`Full JSON Schema` is referenced below.
+
+.. list-table::
+    :header-rows: 1
+
+    * - Key
+      - Type
+      - Description
+    * - ``paths``
+      - list[str]
+      - List of paths this workflow applies to.
+    * - ``type``
+      - str
+      - The type of STAC record to create: ``item`` or ``collection``.
+    * - ``id``
+      - list[:ref:`Extraction Methods `]
+      - Defines the extraction methods used to generate the ID of the STAC record; run after ``extraction_methods``.
+    * - ``extraction_methods``
+      - list[:ref:`Extraction Methods `]
+      - Defines the extraction methods to generate the metadata for the STAC record.
+    * - ``member_of``
+      - list[str]
+      - Defines the recipes for the Collections the generated Item or Collection is a member of.
+
+Paths
+-----
+
+Describes the paths where this file applies. Can be multiple locations.
+The path references all points below it in the hierarchy.
+
+.. code-block:: yaml
+
+    paths:
+      - /badc/faam/data
+
+Type
+----
+
+Defines the type of STAC record to create.
+
+.. code-block:: yaml
+
+    type: item
+
+ID
+---
+
+Defines a list of extraction methods used to generate the record's ID.
+This is run after the extraction methods.
+
+.. code-block:: yaml
+
+    id:
+      # Use the extracted instance_id as the record ID
+      - method: default
+        inputs:
+          defaults:
+            item_id: $instance_id
+
+Extraction Methods
+------------------
+
+Defines a list of extraction methods used to generate the record's metadata.
+
+.. code-block:: yaml
+
+    extraction_methods:
+      - method: lambda
+        inputs:
+          function: 'lambda uri: uri.replace("/badc/cmip6/data", "").strip("/").replace("/", ".")'
+          input_args:
+            - $uri
+          output_key: instance_id
+
+Member Of
+---------
+
+Defines a list of paths to recipes that this record is a member of.
+
+.. code-block:: yaml
+
+    member_of:
+      - recipes/collection/cmip6.yaml
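+
+Complete Example
+----------------
+
+Putting the five sections together, a complete item recipe might look like the
+following (a sketch assembled from the snippets above; the paths and captured
+facets are illustrative):
+
+.. code-block:: yaml
+
+    paths:
+      - /badc/cmip6/data
+
+    type: item
+
+    id:
+      - method: default
+        inputs:
+          defaults:
+            item_id: $instance_id
+
+    extraction_methods:
+      - method: lambda
+        inputs:
+          function: 'lambda uri: uri.replace("/badc/cmip6/data", "").strip("/").replace("/", ".")'
+          input_args:
+            - $uri
+          output_key: instance_id
+
+    member_of:
+      - recipes/collection/cmip6.yaml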
+Schema
+-------
+
+.. program-output:: python -c "from stac_generator.core.baker import Recipe; import json; print(json.dumps(Recipe.schema(), indent=4))"
diff --git a/docs/source/stac_generator/extraction_methods.rst b/docs/source/stac_generator/extraction_methods.rst
new file mode 100644
index 0000000..c45577f
--- /dev/null
+++ b/docs/source/stac_generator/extraction_methods.rst
@@ -0,0 +1,36 @@
+
+.. _extraction-methods:
+
+******************
+Extraction Methods
+******************
+
+Extraction methods take in a dictionary and perform an operation to manipulate, add, or remove
+data from the dictionary before returning it. They can be chained, one after the other, and the
+results are merged such that arrays are appended to and key:value pairs are overwritten by
+subsequent write operations.
+
+See :ref:`CEDA Extraction Methods ` for
+CEDA developed extraction methods.
+
+Third-Party Extraction Methods
+==============================
+
+The plugin nature lends itself to third-party plugins. If you develop a plugin which might
+be useful for others' workflows, please make a pull request to add it to this table.
+
+.. list-table::
+    :header-rows: 1
+
+    * - Extraction Method Name
+      - Description
+      - Vendor
+    * -
+      -
+      -
diff --git a/docs/source/stac_generator/filters.rst b/docs/source/stac_generator/filters.rst
deleted file mode 100644
index a2913a6..0000000
--- a/docs/source/stac_generator/filters.rst
+++ /dev/null
@@ -1,15 +0,0 @@
-
-Plugin Filters
-==============
-
-.. automodule:: stac_generator.plugins.filters
-
-.. list-table::
-    :header-rows: 1
-
-    * - Plugin Name
-      - Description
-    * - :ref:`Path Regex `
-      - Can be used to pattern match against the path to either include/exclude.
-
-.. automodule:: stac_generator.plugins.filters.path_regex
diff --git a/docs/source/stac_generator/generators.rst b/docs/source/stac_generator/generators.rst
index 3243983..228829d 100644
--- a/docs/source/stac_generator/generators.rst
+++ b/docs/source/stac_generator/generators.rst
@@ -9,15 +9,11 @@ Genertators
 
     * - Generator Name
       - Description
-    * - :ref:`asset `
-      - Used to generate STAC assets.
-    * - :ref:`item `
+    * - :ref:`item `
       - Used to generate STAC items.
-    * - :ref:`collection `
+    * - :ref:`collection `
       - Used to generate STAC collections.
 
-.. automodule:: stac_generator.plugins.generators.asset
-
 .. automodule:: stac_generator.plugins.generators.item
 
-.. automodule:: stac_generator.plugins.generators.item
+.. automodule:: stac_generator.plugins.generators.collection
diff --git a/docs/source/stac_generator/index.rst b/docs/source/stac_generator/index.rst
index 1efe91b..c3f4e4c 100644
--- a/docs/source/stac_generator/index.rst
+++ b/docs/source/stac_generator/index.rst
@@ -4,17 +4,16 @@ STAC Generator
 
 :fa:`github` `View on Github `_
 
-This library aims to be a generic tool for generating JSON documents which are `STAC `_-like.
-You should be able to generate fully STAC compliant JSON or generate content which contains
-all the relevant information to allow you to construct a valid `STAC item `_.
+This library aims to be a generic tool for extracting metadata. This metadata can then be mapped into the STAC framework to generate
+`STAC `_-like items and collections.
 
-This library works on the premise that you can build a processing chain for each of your datasets
-by chaining together different processors to extract the relevant information. The core facet
-extraction chain works on an atomic basis, where input plugins provide a single object for inspection
-and output a single JSON object. Item IDs can be generated based on selected facets.
-Downstream processing can then be used to aggregate this information together.
+You should be able to generate content which contains all the relevant information to allow you to
+construct a valid `STAC item `_.
 
-Datastores such as Elasticsearch can make use of upserts which will merge the JSON documents in indexing.
+This library works on the premise that you can build a processing workflow for each of your datasets
+by chaining together different extraction methods to extract and manipulate the relevant information.
+The core facet extraction chain works on an atomic basis, where input plugins provide a single object
+for inspection and output a single object.
 
 Read the :ref:`Orientation ` guide as a introduction into the framework.
@@ -33,12 +32,12 @@ extractors ``stac_generator``.
 .. code-block:: console
 
-    usage: stac_generator [-h] conf
+    usage: stac_generator [-h] -c path/to/conf
 
     Run the STAC Generator as configured
 
-    positional arguments:
-      conf        Path to a yaml configuration file
+    required arguments:
+      -c          Path to a yaml configuration file
 
     optional arguments:
       -h, --help  show this help message and exit
@@ -59,13 +58,15 @@ Base configuration options:
 
     * - Option
       - Description
     * - ``generator``
-      - The name of the generator ``asset``, ``item``, ``collection``
-    * - ``collection_descriptions``
-      - ``REQUIRED`` Path to the root directory for the collection descriptions. Used to describe workflows.
+      - The generator type: ``item`` or ``collection``
+    * - ``recipes_root``
+      - ``REQUIRED`` Path to the root directory for the recipes. Used to describe workflows.
     * - ``inputs``
      - ``REQUIRED`` Must have at least one `input `_.
     * - ``outputs``
-      - ``REQUIRED`` Must have at least one `output `_
+      - ``REQUIRED`` Must have at least one `output `_.
+    * - ``extraction_methods``
+      - ``OPTIONAL`` Default inputs for any `extraction methods `_ that are being used.
     * - ``logging``
      - Kwargs passed to the `logging.basicConfig `_ setup method
@@ -81,8 +82,7 @@ Sample configuration
 .. code-block:: yaml
 
     generator: item
-    item_descriptions:
-      root_directory: /home/users/rsmith013/search_futures/collections-descriptions/descriptions
+    recipes_root: /home/users/rsmith013/search_futures/collections-descriptions/descriptions
     inputs:
       - method: file_system
        path: /badc/faam/data/2005/b069-jan-05
@@ -96,12 +96,8 @@ Plugins
 =======
 
 Plugins are used to add modular components and allow extension of the base
-capabilities to fit your needs. The Asset Scanner holds the Inputs/Outputs,
-filters to modify these plugins and "processors" which are used to extract values
-from the files.
-
-The processors are used to either extract content from the filename/path, headers
-or third-party sources.
+capabilities to fit your needs. A range of different Inputs, Outputs,
+Extraction Methods, and Mappings has been developed to meet different use cases.
 
 .. toctree::
 
     inputs
     outputs
-    filters
-    processors
+    extraction_methods
+    mappings
diff --git a/docs/source/stac_generator/inputs.rst b/docs/source/stac_generator/inputs.rst
index 3a3c0e9..4975a1d 100644
--- a/docs/source/stac_generator/inputs.rst
+++ b/docs/source/stac_generator/inputs.rst
@@ -4,34 +4,5 @@ Inputs
 
 .. automodule:: stac_generator.plugins.inputs
 
-.. list-table::
-    :header-rows: 1
-
-    * - Plugin Name
-      - Description
-      - Required packages
-    * - :ref:`file_system `
-      - Works with POSIX style file systems and performs a python `os.walk `_.
-      - ``None``
-    * - :ref:`object_store `
-      - Works with S3 endpoints.
-      - ``pip install stac-generator[boto3]``
-    * - :ref:`intake_esm `
-      - Use/search and intake ESM catalog to provide a source of paths.
-      - ``pip install stac-generator[intake-esm]``
-    * - :ref:`rabbit_mq `
-      - Connect to a RabbitMQ message queue.
-      - ``pip install stac-generator[rabbitmq]``
-    * - :ref:`thredds `
-      - Use a THREDDS catalog as a source
-      - ``pip install stac-generator[thredds]``
-
-.. automodule:: stac_generator.plugins.inputs.file_system
-
-.. automodule:: stac_generator.plugins.inputs.object_store
-
-.. automodule:: stac_generator.plugins.inputs.intake_esm
-
-.. automodule:: stac_generator.plugins.inputs.rabbit_mq
-
-.. automodule:: stac_generator.plugins.inputs.thredds
+.. automodule:: stac_generator.plugins.inputs
+    :members:
diff --git a/docs/source/stac_generator/mappings.rst b/docs/source/stac_generator/mappings.rst
new file mode 100644
index 0000000..aa4186c
--- /dev/null
+++ b/docs/source/stac_generator/mappings.rst
@@ -0,0 +1,5 @@
+Mappings
+========
+
+.. automodule:: stac_generator.plugins.mappings
+    :members:
diff --git a/docs/source/stac_generator/outputs.rst b/docs/source/stac_generator/outputs.rst
index ab7b5e1..4646da1 100644
--- a/docs/source/stac_generator/outputs.rst
+++ b/docs/source/stac_generator/outputs.rst
@@ -4,39 +4,5 @@ Outputs
 
 .. automodule:: stac_generator.plugins.outputs
 
-.. list-table::
-    :header-rows: 1
-
-    * - Plugin Name
-      - Description
-      - Required packages
-    * - :ref:`standard_out `
-      - Useful for debugging and preparing workflows. A simple ``print()``.
-      - ``None``
-    * - :ref:`elasticsearch `
-      - Outputs the metadata directly to Elasticsearch.
-      - ``pip install stac-generator[elasticsearch]``
-    * - :ref:`text_file `
-      - Outputs the metadata directly to file.
-      - ``None``
-    * - :ref:`rabbitmq `
-      - Outputs the metadata directly to Elasticsearch.
-      - ``pip install stac-generator[pika]``
-    * - :ref:`intake_ems `
-      - Outputs the metadata directly to Elasticsearch.
-      - ``pip install stac-generator[pika]``
-    * - :ref:`json_file `
-      - Outputs the metadata directly to JSON file.
-      - ``None``
-
-.. automodule:: stac_generator.plugins.outputs.standard_out
-
-.. automodule:: stac_generator.plugins.outputs.elasticsearch
-
-.. automodule:: stac_generator.plugins.outputs.text_file
-
-.. automodule:: stac_generator.plugins.outputs.rabbit_mq
-
-.. automodule:: stac_generator.plugins.outputs.intake_ems
-
-.. automodule:: stac_generator.plugins.outputs.json_file
+.. automodule:: stac_generator.plugins.outputs
+    :members:
diff --git a/docs/source/stac_generator/processors.rst b/docs/source/stac_generator/processors.rst
deleted file mode 100644
index 0e0b5f2..0000000
--- a/docs/source/stac_generator/processors.rst
+++ /dev/null
@@ -1,309 +0,0 @@
-
-**********
-Processors
-**********
-
-Processors take a uri and return a dictionary of extracted information. They
-can be chained, one after the other and the results are merged such that arrays
-are appended to and key:value pairs are overwritten by subsequent write operations.
-
-Some processors can also take :ref:`pre-processors` and :ref:`post-processors`. Pre-processors modify
-the input arguments whilst post-processors modify the output from the main processor.
-
-.. _extraction-methods:
-
-Extraction Methods
-==================
-
-.. list-table::
-    :header-rows: 1
-
-    * - Processor Name
-      - Description
-    * - :ref:`header `
-      - Takes a uri string and a list of attributes and returns a dictionary of the values extracted from the file header.
-    * - :ref:`regex `
-      - Takes an input string and a regex with named capture groups and returns a dictionary of the values extracted using the named capture groups.
-    * - :ref:`iso19115 `
-      - Extracts attributes from an formatted ISO19115 record at a given URL. Supports URL templating.
-    * - :ref:`xml `
-      - Extracts attributes from an xml record at a given URL. Supports URL templating.
-    * - :ref:`default `
-      - Takes input dict of attributes and values to add generated object.
-    * - :ref:`elasticsearch `
-      - Aggregates attributes for a given id from a elasticsearch endpoint.
-    * - :ref:`json_file `
-      - Aggregates attributes for a given id from a json file.
- * - :ref:`posix_stats ` - - Extracts file stats (name, size, mod_time, etc.). - * - :ref:`object_store_stats ` - - Extracts file stats (name, size, mod_time, etc.). - -.. _header-extract: - -Header ------- - -.. automodule:: stac_generator.plugins.extraction_methods.header - -.. autoclass:: stac_generator.plugins.extraction_methods.header.HeaderExtract - -.. _regex-extract: - -Regex ------ - -.. automodule:: stac_generator.plugins.extraction_methods.regex_extract - :members: - -.. _iso19115-extract: - -ISO19115 --------- - -.. automodule:: stac_generator.plugins.extraction_methods.iso19115_extract - :members: - -.. _xml-extract: - -XML ---- - -.. automodule:: stac_generator.plugins.extraction_methods.xml_extract - :members: - -.. _default-extract: - -Default -------- - -.. automodule:: stac_generator.plugins.extraction_methods.default_extract - :members: - -.. _elasticsearch-extract: - -Elasticsearch -------------- - -.. automodule:: stac_generator.plugins.extraction_methods.elasticsearch_extract - :members: - -.. _json-file-extract: - -JSON File ---------- - -.. automodule:: stac_generator.plugins.extraction_methods.json_file_extract - :members: - -.. _posix-stats-extract: - -POSIX Stats ------------ - -.. automodule:: stac_generator.plugins.extraction_methods.posix_stats_extract - :members: - -.. _object-store-stats-extract: - -Object Store Stats ------------------- - -.. automodule:: stac_generator.plugins.extraction_methods.object_store_stats_extract - :members: - -.. _pre-processors: - -Pre Processors -============== - -.. automodule:: stac_generator.plugins.preprocessors - -.. list-table:: - :header-rows: 1 - - * - Processor Name - - Description - * - :ref:`basename ` - - Takes a file path and returns the filename using ``os.path.basename``. - * - :ref:`ceda_observation ` - - Takes a file path and returns the uuid from the `CEDA Catalogue `_. - -.. _basename-pre: - -Basename --------- - -.. automodule:: stac_generator.plugins.preprocessors.basename - -.. _ceda-observation-pre: - -CEDA Observation ----------------- - -.. automodule:: stac_generator.plugins.preprocessors.ceda_observation - -.. _post-processors: - -Post Processors -=============== - -.. list-table:: - :header-rows: 1 - - * - Processor Name - - Description - * - :ref:`facet_map ` - - In some cases, you may wish to map the header attributes to different facets. This method takes a map and converts the facet labels into those specified. - * - :ref:`isodate ` - - Takes the source dict and the key to access the date and converts the date to ISO 8601 Format. - * - :ref:`date_combinator ` - - Automatically converts year, month, day, hour, minunte, second keys into an ISO 8601 date. - * - :ref:`bbox ` - - Converts coordinates from a dictionary into `RFC 7946, section 5 `_ - formatted coordinates - * - :ref:`geometry_point ` - - Converts coordinates from a dictionary into `RFC 7946, section 3.1.2 `_ - formatted coordinates - * - :ref:`geometry_line ` - - Converts coordinates from a dictionary into `RFC 7946, section 3.1.4 `_ - formatted coordinates - * - :ref:`geometry_polygon ` - - Converts coordinates from a dictionary into `RFC 7946, section 3.1.6 `_ - formatted coordinates - * - :ref:`string_join ` - - Join facets together to create a new value. - * - :ref:`facet_prefix ` - - Add prefix to given atributes. - -.. _facet-map-post: - -Facet Map ---------- - -.. automodule:: stac_generator.plugins.postprocessors.facet_map - -.. _iso-date-post: - -ISO Date --------- -.. 
automodule:: stac_generator.plugins.postprocessors.isodate - -.. _date-combinator-post: - -Date Combinator ---------------- - -.. automodule:: stac_generator.plugins.postprocessors.date_combinator - -.. _bbox-post: - -BBOX ----- -.. automodule:: stac_generator.plugins.postprocessors.bbox - -.. _geometry-point-post: - -Geometry Point --------------- -.. automodule:: stac_generator.plugins.postprocessors.geometry_point - -.. _geometry-line-post: - -Geometry Line -------------- -.. automodule:: stac_generator.plugins.postprocessors.geometry_line - -.. _geometry-polygon-post: - -Geometry Polygon ----------------- -.. automodule:: stac_generator.plugins.postprocessors.geometry_polygon - -.. _string-join-post: - -String Join ------------ -.. automodule:: stac_generator.plugins.postprocessors.string_join - -.. _date-combinator-post: - -Facet Prefix ------------- -.. automodule:: stac_generator.plugins.postprocessors.facet_prefix - -.. _post-extraction-methods-post: - -Post Extraction Methods -======================= - -.. automodule:: stac_generator.plugins.post_extraction_methods - -.. list-table:: - :header-rows: 1 - - * - Processor Name - - Description - * - :ref:`controlled_vocabulary ` - - Compare properties to a controlled vocabulary defined by a ``pydantic.BaseModel``. - * - :ref:`ceda_vocabulary ` - - Validates and sorts properties into vocabs and generates the ``general`` vocab for specified properties. - -.. _controlled-vocab-post-extract: - -Controlled Vocabulary ---------------------- -.. automodule:: stac_generator.plugins.post_extraction_methods.controlled_vocabulary - -.. _ceda-vocab-post-extract: - -CEDA Vocabulary ---------------- -.. automodule:: stac_generator.plugins.post_extraction_methods.ceda_vocabulary - -.. _id-extraction-methods: - -ID Extraction Methods -===================== - -.. automodule:: stac_generator.plugins.id_extraction_methods - -.. list-table:: - :header-rows: 1 - - * - Processor Name - - Description - * - :ref:`default ` - - Sets the ID to the given value. - * - :ref:`hash ` - - Sets the ID to the hash of the given terms. - -.. _default-id-extract: - -Default -------- -.. automodule:: stac_generator.plugins.id_extraction_methods.default - -.. _hash-id-extract: - -Hash ----- -.. automodule:: stac_generator.plugins.id_extraction_methods.hash - -.. _third-party: - -Third-Party Processors -====================== - -The plugin nature lends itself to third-party plugins. If you develop a plugin which might -be useful for others' workflows. Please make a pull request to add it to this table. - -.. list-table:: - :header-rows: 1 - - * - Processor Name - - Description - - Vendor - * - - - - - diff --git a/docs/source/stac_generator/user_guide/example_config.rst b/docs/source/stac_generator/user_guide/example_config.rst index b7b1d6c..0be4f19 100644 --- a/docs/source/stac_generator/user_guide/example_config.rst +++ b/docs/source/stac_generator/user_guide/example_config.rst @@ -1,7 +1,6 @@ .. 
code-block:: yaml
 
-    collection_descriptions:
-        root_directory: /etc/collection_descriptions/descriptions
+    recipes_root: /etc/recipes
     inputs:
       - method: file_system
         path: /badc/faam/data/2005/b069-jan-05
diff --git a/docs/source/stac_generator/user_guide/orientation.rst b/docs/source/stac_generator/user_guide/orientation.rst
index cc35332..45adb7b 100644
--- a/docs/source/stac_generator/user_guide/orientation.rst
+++ b/docs/source/stac_generator/user_guide/orientation.rst
@@ -10,16 +10,14 @@ There are various pluggable pieces:
     - Inputs
     - Outputs
     - Extraction Methods
-    - Pre/Post Processors
-    - Post Extraction Methods
-    - ID Extraction Methods
+    - Mappings
 
 These pieces should allow you to construct a workflow which works for your use case and provide
 python entry points to allow you to write your own plugins.
 
-The STAC Generator package stores some :ref:`inputs ` which can be used to read from a range of different sources messages of STAC objects to genertate.
+The STAC Generator package stores some :ref:`inputs ` which can be used to read, from a range of different sources, the messages describing the STAC objects to generate.
-The :ref:`asset `, :ref:`item `, and :ref:`collection `
-generators take these messages and extract the required facets to buil the relevant STAC object using a variety of :ref:`processors `.
+The :ref:`item ` and :ref:`collection `
+generators take these messages and extract the required facets to build the relevant STAC object using a variety of :ref:`extraction methods `.
 These generated objects can then be passed to a range of :ref:`outputs `.
 
 The generators have two levels of configuration. Global configuration, passed at the command line on
@@ -27,100 +25,300 @@ invocation, which defines the inputs, ouputs, logging etc.
 
 An example can be found :ref:`here `.
 
-The second level of configuration comes in the form of collection-descriptions. These YAML files
-describe the workflow for extracting facets and other metadata to build the assets, items, and collections of the STAC Catalog.
-Background for collection-descriptions can be found `here `_
-and a guide for how to build, and test these files is :ref:`here `.
+The second level of configuration comes in the form of recipes. These YAML files
+describe the workflow for extracting facets and other metadata to build the items and collections of the STAC Catalog.
+Background for recipes can be found `here `_
+and a guide for how to build and test these files is :ref:`here `.
 
-The different available processors which can construct these workflows are found :ref:`here `.
+The different available extraction methods which can construct these workflows are found :ref:`here `.
 
-The `CEDA repository containing these collection-descriptions `_ can
+The `CEDA repository containing these recipes `_ can
 be used as an example. An example which includes extracting metadata from the NetCDF header is
-`sentinel5 `_
+`sentinel5 `_. The recipe below, for the Sentinel-2 ARD dataset, instead extracts
+facets from each scene's XML metadata and manifest files:
 
 .. 
code-block:: yaml - paths: - - /neodc/sentinel5p/data - - asset: - extraction_methods: - - method: regex - description: Extract facets from the file path - inputs: - regex: '^\/(?:[^/]*/)(?P\w*)(?:[^/]*/){3}(?P[0-9v.]+)/' - - method: regex - description: Extract facets from the filename - inputs: - regex: '^(?:[^_]*_){2}(?P[^_]+)__(?P[^_]+)_{4}(?P[0-9T]+)_(?P[0-9T]+)_(?P\d+)(?:[^_]*_){3}(?P[0-9T]+)' - - method: header - description: Extract header metadata - inputs: - attributes: - - institution - - sensor - - item: - id: - method: hash - inputs: - terms: - - platform - - processing_level - - variable - - product_version - - datetime - extration_methods: - - elasticsearch: - default: - - platform - - processing_level - - variable - - product_version - - datetime - min: - - start_datetime - max: - - end_datetime - list: - - orbit - - institution - - sensor - - collection: - id: - method: default - inputs: - value: Ic93XnsBhuk7QqVbSFwS - extration_methods: - - elasticsearch: - default: - - platform - - processing_level - - variable - - product_version - - datetime - min: - - start_datetime - max: - - end_datetime - list: - - orbit - - institution - - sensor - -The “extraction_methods” are the workflow. In the example above I extract some facets from the file path, -some from the file name and some from the header. -To run regex on the filename, I use the ``filename_reducer`` and to convert my extracted dates to ISO 8601 -format, I run the ``isodate_processor``. - -As all of these “assets” are treated individually, but are grouped using the item id. So for the linked example, all assets -which return the same value for ``platform``, ``processing_level``, ``variable``, ``product_version`` and ``datetime``, -will be considered 1 STAC Item and be assigned the same ID. - -The same can be said for items and the collection id. - -This works in Elasticsearch because each individual elasticsearch document has the same id and are -merged in an upsert. If you are using another storage system, it will require an aggregation step -to join these together. Even with elasticsearch, lists are not merged in an upsert, but we have -not had to deal with this yet. 
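+    # Item recipe for Sentinel-2 ARD scenes: the extraction methods run in
+    # the order listed, each adding facets to (or cleaning up) the record.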
+ paths: + - /neodc/sentinel_ard/data/sentinel_2 + + type: item + + # This will be run over the meta files, example: neodc/sentinel_ard/data/sentinel_2/2018/07/05/S2A_20180705_lat57lon375_T30VVJ_ORB123_utm30n_osgb_vmsk_sharp_rad_srefdem_stdsref_meta.xml + id: + # Use full path minus the extension for ID + - method: default + inputs: + defaults: + item_id: $instance_id + + extraction_methods: + # Extract information from the meta file + - method: xml + inputs: + extraction_keys: + - name: east + key: .//gmd:eastBoundLongitude/gco:Decimal + - name: west + key: .//gmd:westBoundLongitude/gco:Decimal + - name: north + key: .//gmd:northBoundLatitude/gco:Decimal + - name: south + key: .//gmd:southBoundLatitude/gco:Decimal + - name: start_datetime + key: .//gml:beginPosition + - name: end_datetime + key: .//gml:beginPosition + - name: supInfo + key: .//gmd:supplementalInformation/gco:CharacterString + - name: EPSG + key: .//gmd:referenceSystemInfo/gmd:MD_ReferenceSystem/gmd:referenceSystemIdentifier/gmd:RS_Identifier/gmd:code/gco:CharacterString + namespaces: + gmd: http://www.isotc211.org/2005/gmd + gml: http://www.opengis.net/gml + gco: http://www.isotc211.org/2005/gco + + # Extract the variables from the supInfo field + - method: regex + inputs: + regex: 'ESA file name: (?P.*)' + input_term: supInfo + + - method: regex + inputs: + regex: 'Mean_Sun_Angle_Zenith: (?P.*)' + input_term: supInfo + + - method: regex + inputs: + regex: 'Mean_Sun_Angle_Azimuth: (?P.*)' + input_term: supInfo + + # Extract the manifest path info + - method: regex + inputs: + regex: 'neodc\/sentinel_ard\/data\/sentinel_2\/(?P\d{4})\/(?P\d{2})\/(?P\d{2})\/S2(?P[abAB]{1}).*' + input_term: uri + + - method: lambda + inputs: + function: 'lambda satellite: satellite.lower()' + input_args: + - $satellite + output_key: satellite + + # Generate path to the manifest file + - method: string_template + inputs: + template: '/neodc/sentinel2{satellite}/data/L1C_MSI/{year}/{month}/{day}/{esa_file_name}.manifest' + output_key: manifest_file + + # Extract information from the manifest file + - method: xml + inputs: + input_term: manifest_file + extraction_keys: + - name: Instrument Family Name + key: .//safe:platform/safe:instrument/safe:familyName + - name: Instrument Family Name Abbreviation + key: .//safe:platform/safe:instrument/safe:familyName + attribute: abbreviation + - name: Platform Number + key: .//safe:platform/safe:number + - name: NSSDC Identifier + key: .//safe:platform/safe:nssdcIdentifier + - name: Start Relative Orbit Number + key: .//safe:orbitReference/safe:relativeOrbitNumber + - name: Start Orbit Number + key: .//safe:orbitReference/safe:orbitNumber + - name: Ground Tracking Direction + key: .//safe:orbitReference/safe:orbitNumber + attribute: groundTrackDirection + - name: Instrument Mode + key: .//safe:platform/safe:instrument/safe:mode + - name: Coordinates + key: .//safe:frameSet/safe:footPrint/gml:coordinates + namespaces: + safe: http://www.esa.int/safe/sentinel/1.1 + gml: http://www.opengis.net/gml + + - method: regex + inputs: + regex: '(?P.+?)_vmsk_sharp_rad_srefdem_stdsref_meta\.' 
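+
+      # The regex above captures the path root of the scene; the next steps
+      # build the geometry: a lambda pairs up the whitespace-separated
+      # coordinate string (reversing the order of each pair), then
+      # geometry_polygon and geometry_to_bbox derive the polygon and bbox.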
+ + - method: lambda + inputs: + function: 'lambda coords_string: [[float(i), float(k)]for i,k in zip(coords_string.strip().split()[1::2], coords_string.strip().split()[0::2])]' + input_args: + - $Coordinates + output_key: coords + + - method: geometry_polygon + inputs: + coordinates_term: coords + + - method: geometry_to_bbox + inputs: + type: polygon + + - method: string_template + inputs: + template: '{esa_file_name}.SAFE/MTD_MSIL1C.xml' + output_key: inner_file + + - method: string_template + inputs: + template: '/neodc/sentinel2{satellite}/data/L1C_MSI/{year}/{month}/{day}/{esa_file_name}.zip' + output_key: zip_file + + - method: open_zip + inputs: + zip_file: $zip_file + inner_file: $inner_file + output_key: esa_product + + - method: xml + inputs: + input_term: esa_product + extraction_keys: + - name: Cloud Coverage Assessment + key: .//psd-14:Quality_Indicators_Info/Cloud_Coverage_Assessment + - name: Product Type + key: .//psd-14:General_Info/Product_Info/PRODUCT_TYPE + - name: Datatake Type + key: .//psd-14:General_Info/Product_Info/Datatake/DATATAKE_TYPE + namespaces: + psd-14: https://psd-14.sentinel2.eo.esa.int/PSD/User_Product_Level-1C.xsd + + + - method: string_template + inputs: + template: '{path_root}.*.tif' + output_key: data_regex + + - method: string_template + inputs: + template: '{path_root}.*_thumbnail.jpg' + output_key: thumbnail_regex + + - method: string_template + inputs: + template: '{path_root}.*_meta.xml' + output_key: metadata_regex + + - method: elasticsearch_assets + inputs: + search_field: path + regex_term: data_regex + fields: + - name: size + - name: location + extraction_methods: + - method: default + inputs: + defaults: + roles: ["data"] + + - method: elasticsearch_assets + inputs: + search_field: path + regex_term: thumbnail_regex + fields: + - name: size + - name: location + extraction_methods: + - method: default + inputs: + defaults: + roles: ["thumbnail"] + + - method: elasticsearch_assets + inputs: + search_field: path + regex_term: metadata_regex + fields: + - name: size + - name: location + extraction_methods: + - method: default + inputs: + defaults: + roles: ["metadata"] + + - method: rename_assets + inputs: + rename: + - name: cog + regex: '.*_stdsref.tif' + - name: cloud + regex: '.*_clouds.tif' + - name: cloud_probability + regex: '.*_clouds_prob.tif' + - name: topographic_shadow + regex: '.*_toposhad.tif' + - name: metadata + regex: '.*_meta.xml' + - name: thumbnail + regex: '.*_thumbnail.jpg' + - name: saturated_pixels + regex: '.*_sat.tif' + - name: valid_pixels + regex: '.*_valid.tif' + output_key: data_regex + + - method: lambda + inputs: + function: 'lambda assets: {asset_key: asset_value | {"href": "https://dap.ceda.ac.uk" + asset_value["href"]} for asset_key, asset_value in sorted(assets.items())}' + input_args: + - $assets + output_key: assets + + - method: lambda + inputs: + function: 'lambda path_root: path_root.replace("/badc/sentinel1b/data", "").replace("/badc/sentinel1a/data", "").strip("/").replace("/", ".")' + input_args: + - $path_root + output_key: instance_id + + - method: iso_date + inputs: + date_keys: + - start_datetime + - end_datetime + formats: + - '%Y-%m-%dT%H%M%SZ' + + - method: datetime_bound_to_centroid + + # Clean up unneeded terms + - method: remove + inputs: + keys: + - supInfo + - year + - month + - day + - manifest_file + - west + - south + - east + - north + - path_root + - data_regex + - thumbnail_regex + - metadata_regex + - Coordinates + - coords + - satellite + - zip_file + - inner_file + - 
esa_product
+          - uri
+
+    member_of:
+      - recipes/collection/sentinel2_ARD.yaml
+
+The ``extraction_methods`` section is the workflow. The example shows the xml extraction method being used to extract
+some facets from a metadata file; this information is then manipulated by several other extraction methods, including
+retrieving a list of assets from CEDA's Elasticsearch index.
+
+The extraction methods can also be used for collection generation, but typically this will be an aggregation of the
+collection's items.
diff --git a/stac_generator/core/baker.py b/stac_generator/core/baker.py
index 3664a96..21e2ac2 100644
--- a/stac_generator/core/baker.py
+++ b/stac_generator/core/baker.py
@@ -94,7 +94,6 @@ class Recipes:
 
     def __init__(self, root_path: str):
         """
-
         :param root_path: Path to the root of the yaml files
         """
         self.recipes = {"asset": {}, "item": {}, "collection": {}}
@@ -155,6 +154,7 @@ def get(self, path: str, stac_type: str) -> Recipe:
         Get the most relevant recipe for a given path.
 
         :param path: Path for which to retrieve the recipe
+        :param stac_type: Type of recipe to return
         """
         if path in self.recipes[stac_type]:
             return self.load_recipe(path, stac_type)
diff --git a/stac_generator/core/bulk_output.py b/stac_generator/core/bulk_output.py
index 909646c..6a363d6 100644
--- a/stac_generator/core/bulk_output.py
+++ b/stac_generator/core/bulk_output.py
@@ -48,7 +48,6 @@ def export(self, data_list: list) -> None:
         Output the data.
 
         :param data: list of data from processor to be output.
-        :param kwargs:
         """
 
     def data_to_cache(self, data: dict) -> None:
diff --git a/stac_generator/core/generator.py b/stac_generator/core/generator.py
index 0f5141c..e9f5d28 100644
--- a/stac_generator/core/generator.py
+++ b/stac_generator/core/generator.py
@@ -65,6 +65,7 @@ def _load_extraction_method(
         Load the given extraction method
 
         :param extraction_method_conf: Configuration for the extraction method
+        :param kwargs:
 
         :return: extraction method
         """
@@ -100,7 +101,15 @@ def _load_extraction_method(
     def _run_extraction_method(
         self, body: dict, extraction_method_conf: dict, **kwargs
     ) -> dict:
-        """Run the specified extraction method."""
+        """
+        Run the specified extraction method.
+
+        :param body: The current body of data
+        :param extraction_method_conf: Configuration for the extraction method
+        :param kwargs:
+
+        :return: body post extraction method
+        """
 
         extraction_method = self._load_extraction_method(
             extraction_method_conf, **kwargs
@@ -116,6 +125,7 @@ def run_extraction_methods(
 
         :param body: current extracted meta data
         :param recipe: Recipe
+        :param kwargs:
 
         :return: result from the processing
         """
@@ -132,7 +142,9 @@ def run_member_of_methods(
         Extract the raw facets from the listed extraction methods
 
         :param body: Dict of current extracted data
-        :param recipe: Recipe
+        :param member_of: list of membership
+        :param kwargs:
+
         :return: updated body
         """
@@ -158,6 +170,7 @@ def output(self, body: dict, recipe: Recipe, **kwargs) -> None:
         Run all configured outputs export methods.
 
         :param data: data to be output
+        :param kwargs:
         """
         for output in self.outputs:
             output.run(body, recipe, **kwargs)
@@ -176,6 +189,7 @@ def _process(self, body: dict, **kwargs) -> None:
         Run generator.
 
         :param body: body for object
+        :param kwargs:
         """
 
     def process(self, uri: str, **kwargs) -> None:
         """
         Run generator.
        :param uri: uri for object
+        :param kwargs:
         """
 
         kwargs["TYPE"] = self.TYPE
diff --git a/stac_generator/core/handler_picker.py b/stac_generator/core/handler_picker.py
index 54ca5de..65afb55 100644
--- a/stac_generator/core/handler_picker.py
+++ b/stac_generator/core/handler_picker.py
@@ -45,9 +45,11 @@ def __init__(self, entry_point_key: Union[list, str]):
 
     @staticmethod
     def _get_entrypoints(group) -> dict:
-        """Get entrypoints for given group
+        """
+        Get entrypoints for given group
 
         :param group: The named entry group
+
+        :return: dict of entrypoints
         """
         entry_points = {}
diff --git a/stac_generator/core/mapping.py b/stac_generator/core/mapping.py
index 77ed438..ba544ef 100644
--- a/stac_generator/core/mapping.py
+++ b/stac_generator/core/mapping.py
@@ -15,7 +15,7 @@ class BaseMapping(ABC):
     """
-    Class to act as a base for all processors. Defines the basic method signature
+    Class to act as a base for all mappings. Defines the basic method signature
     and ensure compliance by all subclasses.
     """
 
@@ -36,4 +36,13 @@ def _set_attrs(self, conf: dict) -> None:
 
     @abstractmethod
     def run(self, body: dict, recipe: Recipe, **kwargs) -> dict:
+        """
+        Run the mapping
+
+        :param body: The current body of data
+        :param recipe: Recipe
+        :param kwargs:
+
+        :return: body
+        """
         pass
diff --git a/stac_generator/plugins/bulk_outputs/elasticsearch.py b/stac_generator/plugins/bulk_outputs/elasticsearch.py
index 69a2b29..badcf00 100644
--- a/stac_generator/plugins/bulk_outputs/elasticsearch.py
+++ b/stac_generator/plugins/bulk_outputs/elasticsearch.py
@@ -6,7 +6,7 @@
 An output backend which outputs the content generated to elasticsearch
 using the Elasticsearch API
 
-**Plugin name:** ``elasticsearch``
+**Plugin name:** ``elasticsearch_bulk``
 
 .. list-table::
     :header-rows: 1
@@ -28,7 +28,7 @@
 .. code-block:: yaml
 
     outputs:
-      - method: elasticsearch
+      - method: elasticsearch_bulk
         connection_kwargs:
           hosts: ['host1','host2']
         index:
diff --git a/stac_generator/plugins/generators/asset.py b/stac_generator/plugins/generators/asset.py
index 98b67ba..5ad1808 100644
--- a/stac_generator/plugins/generators/asset.py
+++ b/stac_generator/plugins/generators/asset.py
@@ -1,6 +1,15 @@
 # encoding: utf-8
 """
+Generator to create STAC Assets
+
+Configuration
+-------------
+
+.. code-block:: yaml
+
+    generator: asset
+    recipes_root: recipes/
 """
 __author__ = "Richard Smith"
 __date__ = "01 Jun 2021"
diff --git a/stac_generator/plugins/generators/collection.py b/stac_generator/plugins/generators/collection.py
index 49e158b..5631e9d 100644
--- a/stac_generator/plugins/generators/collection.py
+++ b/stac_generator/plugins/generators/collection.py
@@ -1,14 +1,15 @@
 # encoding: utf-8
 """
+Generator to create STAC Collections
 
 Configuration
 -------------
 
 .. code-block:: yaml
 
-    item_descriptions:
-        root_directory: /path/to/root/descriptions
+    generator: collection
+    recipes_root: recipes/
 """
 
 __author__ = "Richard Smith"
diff --git a/stac_generator/plugins/generators/item.py b/stac_generator/plugins/generators/item.py
index 7b3f653..fefa193 100644
--- a/stac_generator/plugins/generators/item.py
+++ b/stac_generator/plugins/generators/item.py
@@ -1,14 +1,15 @@
 # encoding: utf-8
 """
+Generator to create STAC Items
 
 Configuration
 -------------
 
 .. code-block:: yaml
 
-    item_descriptions:
-        root_directory: /path/to/root/descriptions
+    generator: item
+    recipes_root: recipes/
 """
 
 __author__ = "Richard Smith"
diff --git a/stac_generator/plugins/inputs/__init__.py b/stac_generator/plugins/inputs/__init__.py
index 0a4cd23..5cc44c0 100644
--- a/stac_generator/plugins/inputs/__init__.py
+++ b/stac_generator/plugins/inputs/__init__.py
@@ -1,7 +1,6 @@
 # encoding: utf-8
 """
-The input plugins determine the source of the list. Files are processed
-atomically and the input plugins provide this atomic action.
+The input plugins generate a stream of dictionaries to be passed to the generator for processing.
 
 You can configure more than one input plugin, if you wanted to input the content
 from more than one place.
diff --git a/stac_generator/plugins/inputs/file_system.py b/stac_generator/plugins/inputs/file_system.py
index 3b3ff39..9e6f159 100644
--- a/stac_generator/plugins/inputs/file_system.py
+++ b/stac_generator/plugins/inputs/file_system.py
@@ -53,7 +53,7 @@ class FileSystemInput(BaseInput):
 
     """
-    Performs an os.walk to provide a stream of paths for procesing.
+    Performs an os.walk to provide a stream of messages for processing.
     """
 
     def __init__(self, **kwargs):
diff --git a/stac_generator/plugins/mappings/__init__.py b/stac_generator/plugins/mappings/__init__.py
new file mode 100644
index 0000000..39e569b
--- /dev/null
+++ b/stac_generator/plugins/mappings/__init__.py
@@ -0,0 +1,16 @@
+# encoding: utf-8
+"""
+The mappings are run before an output and map the data into the
+correct format.
+
+You can configure more than one mapping per output. Mappings run
+in the order they are given.
+
+Mappings are loaded as named entry points with the namespace:
+``stac_generator.mappings``
+"""
+__author__ = "Rhys Evans"
+__date__ = "26 Feb 2024"
+__copyright__ = "Copyright 2018 United Kingdom Research and Innovation"
+__license__ = "BSD - see LICENSE file in top-level package directory"
+__contact__ = "rhys.r.evans@stfc.ac.uk"
diff --git a/stac_generator/plugins/mappings/stac.py b/stac_generator/plugins/mappings/stac.py
index 9af19d5..7a1656f 100644
--- a/stac_generator/plugins/mappings/stac.py
+++ b/stac_generator/plugins/mappings/stac.py
@@ -29,7 +29,7 @@ class STACMapping(BaseMapping):
 
     .. code-block:: yaml
 
-        - method: stac_observation
+        - method: stac_mapping
 
     """
diff --git a/stac_generator/plugins/outputs/stac_fastapi.py b/stac_generator/plugins/outputs/stac_fastapi.py
index 5711c8c..6870c59 100644
--- a/stac_generator/plugins/outputs/stac_fastapi.py
+++ b/stac_generator/plugins/outputs/stac_fastapi.py
@@ -3,10 +3,10 @@
-Elasticsearch
--------------
+STAC FastAPI
+------------
 
-An output backend which outputs the content generated to elasticsearch
-using the Elasticsearch API
+An output backend which outputs the content generated to a STAC FastAPI
+using the Transaction endpoint extension
 
-**Plugin name:** ``elasticsearch``
+**Plugin name:** ``stac_fastapi``
 
 .. list-table::
     :header-rows: 1
@@ -14,25 +14,19 @@
     * - Option
       - Value Type
       - Description
-    * - ``connection_kwargs``
-      - ``dict``
-      - ``REQUIRED`` Connection kwargs passed to the `elasticsearch client `_
-    * - ``index.name``
-      - ``str``
-      - ``REQUIRED`` The index to output the content.
-    * - ``index.mapping``
-      - ``str``
-      - Path to a yaml file which defines the mapping for the index
+    * - ``api_url``
+      - ``str``
+      - ``REQUIRED`` Root URL of the STAC API
+    * - ``verify``
+      - ``bool``
+      - Whether to verify the server's SSL certificate
 
 Example Configuration:
 
 .. code-block:: yaml
 
     outputs:
-      - method: elasticsearch
-        connection_kwargs:
-          hosts: ['host1','host2']
-        index:
-          name: 'assets-2021-06-02'
+      - method: stac_fastapi
+        api_url: https://localhost
 """
 __author__ = "Richard Smith"
__date__ = "01 Jun 2021"
diff --git a/stac_generator/plugins/outputs/stacapi_backend.py b/stac_generator/plugins/outputs/stacapi_backend.py
index 30d4813..5fa9d02 100644
--- a/stac_generator/plugins/outputs/stacapi_backend.py
+++ b/stac_generator/plugins/outputs/stacapi_backend.py
@@ -91,92 +91,6 @@ def __init__(self, **kwargs):
         if r.status_code == 404:
             r.raise_for_status()
 
-    def export(self, data, **kwargs):
-        # todo avoid processing second json object
-        if "body" not in data:
-            return
-
-        json_data = self.create_stac_item(data)
-        self.post_collection_item(self.stac_host, self.collection_id, json_data)
-
-    def create_stac_item(self, data):
-        # TODO : not hardcoded bbox and footprint
-        bounds = {
-            "left": -140.99778,
-            "bottom": 41.6751050889,
-            "right": -52.6480987209,
-            "top": 83.23324,
-        }
-        bbox = [bounds["left"], bounds["bottom"], bounds["right"], bounds["top"]]
-        footprint = Polygon(
-            [
-                [bounds["left"], bounds["bottom"]],
-                [bounds["left"], bounds["top"]],
-                [bounds["right"], bounds["top"]],
-                [bounds["right"], bounds["bottom"]],
-            ]
-        )
-
-        stac_item = pystac.Item(
-            id=data["body"]["item_id"],
-            geometry=mapping(footprint),
-            bbox=bbox,
-            datetime=datetime.datetime.utcnow(),
-            properties={},
-            collection=self.collection_id,
-        )
-
-        properties_list = dict()
-
-        for k, v in data["body"]["properties"].items():
-            if k not in self.drop_properties:
-                properties_list[k] = v
-
-        stac_item.properties = properties_list
-
-        link = pystac.Link(
-            "self",
-            "{}/collections/{}/items/{}".format(
-                self.stac_host, self.collection_id, stac_item.id
-            ),
-        )
-        stac_item.add_link(link)
-
-        # TODO : hardcoded url path replacements
-        asset = pystac.Asset(
-            href=data["body"]["properties"]["uri"].replace("dodsC", "fileServer"),
-            media_type="application/netcdf",
-            title=data["body"]["properties"]["filename"],
-            roles=["data"],
-        )
-        stac_item.add_asset("metadata_http", asset)
-
-        asset = pystac.Asset(
-            href=data["body"]["properties"]["uri"].replace("dodsC", "iso"),
-            media_type="application/xml",
-            title="ISO",
-            roles=["metadata"],
-        )
-        stac_item.add_asset("metadata_iso", asset)
-
-        asset = pystac.Asset(
-            href=data["body"]["properties"]["uri"].replace("dodsC", "ncml"),
-            media_type="application/xml",
-            title="NcML",
-            roles=["metadata"],
-        )
-        stac_item.add_asset("metadata_ncml", asset)
-
-        asset = pystac.Asset(
-            href=data["body"]["properties"]["uri"],
-            media_type="application/netcdf",
-            title="OPeNDAP",
-            roles=["data"],
-        )
-        stac_item.add_asset("metadata_opendap", asset)
-
-        return stac_item.to_dict()
-
     def post_collection_item(self, stac_host, collection_id, json_data):
         """
         Post an item to a collection.
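
For reference, mappings attach to outputs: the mappings package docstring above
notes that they run, in order, before an output. A sketch of an outputs section
combining the pieces documented in this changeset — the nested ``mappings`` key
is an assumption based on that docstring, not confirmed by the code shown here:

.. code-block:: yaml

    outputs:
      - method: stac_fastapi
        api_url: https://localhost
        verify: False
        mappings:
          - method: stac_mapping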