From 0a13f96099dae3015a340c2be771533872147715 Mon Sep 17 00:00:00 2001 From: Marko Malenic Date: Thu, 19 Dec 2024 09:27:02 +1100 Subject: [PATCH 1/2] docs: update docs for path-based and regex config --- htsget-config/README.md | 669 ++++++++---------- htsget-config/examples/config-files/c4gh.toml | 24 +- .../examples/config-files/default.toml | 71 +- .../examples/config-files/s3_storage.toml | 23 +- .../config-files/tls_data_server.toml | 17 +- .../config-files/tls_ticket_server.toml | 18 +- .../examples/config-files/url_storage.toml | 27 +- htsget-config/src/storage/local.rs | 180 ----- 8 files changed, 370 insertions(+), 659 deletions(-) delete mode 100644 htsget-config/src/storage/local.rs diff --git a/htsget-config/README.md b/htsget-config/README.md index 4f9ec7d27..560448f5e 100644 --- a/htsget-config/README.md +++ b/htsget-config/README.md @@ -8,142 +8,145 @@ [actions-badge]: https://github.com/umccr/htsget-rs/actions/workflows/action.yml/badge.svg [actions-url]: https://github.com/umccr/htsget-rs/actions?query=workflow%3Atests+branch%3Amain +## Overview + Configuration for [htsget-rs]. [htsget-rs]: https://github.com/umccr/htsget-rs -## Overview +## Quickstart +The simplest way to use htsget-rs is to create a [toml] config file and specify a storage location: -This crate is used to configure htsget-rs using a config file or environment variables. +```toml +locations = "file://data" +``` -## Usage +Then launch the server using the config file: -To configure htsget-rs, a TOML config file can be defined. There is also support for reading config from environment variables. -Any config options set by environment variables override values in the config file. +```sh +cargo run --all-features -p htsget-axum -- --config +``` -The configuration consists of TOML tables, such as config for the ticket server, data server, service-info, or resolvers. 
+This will serve files under the [`data`][data] directory: -As a starting point, see the [basic TOML][basic] example file which should work for many use-cases. +```sh +curl 'http://localhost:8080/reads/bam/htsnexus_test_NA12878' +``` -#### Ticket server config +Locations allow htsget-rs access to bioinformatics files and indexes. Instead of local files, htsget-rs can access +files on s3, which returns pre-signed URLs for tickets: -The ticket server responds to htsget requests by returning a set of URL tickets that the client must fetch and concatenate. -To configure the ticket server, set the following options: +```toml +locations = "s3://bucket" +``` -| Option | Description | Type | Default | -|-----------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------|-----------------------------| -| `ticket_server_addr` | The address for the ticket server. | Socket address | `'127.0.0.1:8080'` | -| `ticket_server_tls` | Enable TLS for the ticket server. See [TLS](#tls) for more details. | TOML table | Not enabled | -| `ticket_server_cors_allow_credentials` | Controls the CORS Access-Control-Allow-Credentials for the ticket server. | Boolean | `false` | -| `ticket_server_cors_allow_origins` | Set the CORS Access-Control-Allow-Origin returned by the ticket server, this can be set to `All` to send a wildcard, `Mirror` to echo back the request sent by the client, or a specific array of origins. | `'All'`, `'Mirror'` or a array of origins | `['http://localhost:8080']` | -| `ticket_server_cors_allow_headers` | Set the CORS Access-Control-Allow-Headers returned by the ticket server, this can be set to `All` to allow all headers, or a specific array of headers. 
| `'All'`, or a array of headers | `'All'` | -| `ticket_server_cors_allow_methods` | Set the CORS Access-Control-Allow-Methods returned by the ticket server, this can be set to `All` to allow all methods, or a specific array of methods. | `'All'`, or a array of methods | `'All'` | -| `ticket_server_cors_max_age` | Set the CORS Access-Control-Max-Age for the ticket server which controls how long a preflight request can be cached for. | Seconds | `86400` | -| `ticket_server_cors_expose_headers` | Set the CORS Access-Control-Expose-Headers returned by the ticket server, this can be set to `All` to expose all headers, or a specific array of headers. | `'All'`, or a array of headers | `[]` | +or on a remote HTTP server (either `http://` or `https://`): -TLS is supported by setting the `ticket_server_key` and `ticket_server_cert` options. An example of config for the ticket server: ```toml -ticket_server_addr = '127.0.0.1:8080' -ticket_server_cors_allow_credentials = false -ticket_server_cors_allow_origins = 'Mirror' -ticket_server_cors_allow_headers = ['Content-Type'] -ticket_server_cors_allow_methods = ['GET', 'POST'] -ticket_server_cors_max_age = 86400 -ticket_server_cors_expose_headers = [] +locations = "https://example.com" ``` -#### Data server config - -The local data server responds to tickets produced by the ticket server by serving local filesystem data. -To configure the data server, set the following options: - -| Option | Description | Type | Default | -|-------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------|-----------------------------| -| `data_server_addr` | The address for the data server. 
| Socket address | `'127.0.0.1:8081'` | -| `data_server_local_path` | The local path which the data server can access to serve files. | Filesystem path | `'./'` | -| `data_server_serve_at` | The path which the data server will prefix to all response URLs for tickets. | URL path | `''` | -| `data_server_tls` | Enable TLS for the data server. See [TLS](#tls) for more details. | TOML table | Not enabled | -| `data_server_cors_allow_credentials` | Controls the CORS Access-Control-Allow-Credentials for the data server. | Boolean | `false` | -| `data_server_cors_allow_origins` | Set the CORS Access-Control-Allow-Origin returned by the data server, this can be set to `All` to send a wildcard, `Mirror` to echo back the request sent by the client, or a specific array of origins. | `'All'`, `'Mirror'` or a array of origins | `['http://localhost:8080']` | -| `data_server_cors_allow_headers` | Set the CORS Access-Control-Allow-Headers returned by the data server, this can be set to `All` to allow all headers, or a specific array of headers. | `'All'`, or a array of headers | `'All'` | -| `data_server_cors_allow_methods` | Set the CORS Access-Control-Allow-Methods returned by the data server, this can be set to `All` to allow all methods, or a specific array of methods. | `'All'`, or a array of methods | `'All'` | -| `data_server_cors_max_age` | Set the CORS Access-Control-Max-Age for the data server which controls how long a preflight request can be cached for. | Seconds | `86400` | -| `data_server_cors_expose_headers` | Set the CORS Access-Control-Expose-Headers returned by the data server, this can be set to `All` to expose all headers, or a specific array of headers. | `'All'`, or a array of headers | `[]` | - -TLS is supported by setting the `data_server_key` and `data_server_cert` options. 
An example of config for the data server: +Multiple locations can be specified by providing a list and an id prefix after the location: + ```toml -data_server_addr = '127.0.0.1:8081' -data_server_local_path = './' -data_server_serve_at = '' -data_server_key = 'key.pem' -data_server_cert = 'cert.pem' -data_server_cors_allow_credentials = false -data_server_cors_allow_origins = 'Mirror' -data_server_cors_allow_headers = ['Content-Type'] -data_server_cors_allow_methods = ['GET', 'POST'] -data_server_cors_max_age = 86400 -data_server_cors_expose_headers = [] +locations = ["file://data/bam", "file://data/cram"] ``` -Sometimes it may be useful to disable the data server as all responses to the ticket server will be handled elsewhere, such as with an AWS S3 data server. +This allows htsget-rs to serve data only when the request also contains the prefix: -To disable the data server, set the following option: +```sh +curl 'http://localhost:8080/reads/bam/htsnexus_test_NA12878' +curl 'http://localhost:8080/reads/cram/htsnexus_test_NA12878?format=CRAM' +``` -
-data_server_enabled = false
-
+Locations can be mixed, and don't all need to have the same directory or resource: -#### Service info config +```toml +data_server.local_path = "root" +locations = ["file://dir_two/bam", "file://dir_one/cram", "s3://bucket/vcf"] +``` -The service info config controls what is returned when the [`service-info`][service-info] path is queried.
-To configure the service-info, set the following options: +htsget-rs spawns a separate server process to respond to htsget tickets for file locations, +so setting `data_server.local_path` to the root directory which contains all subdirectories is +required to give this server access to the local directory. -| Option | Description | Type | Default | -|---------------------------------------------------------|---------------------------------------------|-----------|----------| -| `id` | Service ID. | String | Not set | -| `name` | Service name. | String | Not set | -| `version` | Service version. | String | Not set | -| `organization_name` | Organization name. | String | Not set | -| `organization_url` | Organization URL. | String | Not set | -| `contact_url` | Service contact URL | String | Not set | -| `documentation_url` | Service documentation URL. | String | Not set | -| `created_at` | When the service was created. | String | Not set | -| `updated_at` | When the service was last updated. | String | Not set | -| `environment` | The environment the service is running in. | String | Not set | +The data server process can be disabled by setting it to `None` if no file locations are being used: -An example of config for the service info: ```toml -id = 'id' -name = 'name' -version = '0.1' -organization_name = 'name' -organization_url = 'https://example.com/' -contact_url = 'mailto:nobody@example.com' -documentation_url = 'https://example.com/' -created_at = '2022-01-01T12:00:00Z' -updated_at = '2022-01-01T12:00:00Z' -environment = 'dev' +data_server = "None" ``` -#### Resolvers +> [!NOTE] +> For S3 locations, the bucket is not included in the request to htsget-rs. To include the bucket as well, +> see deriving the bucket from the first capture group in [advanced config](#bucket). + +> [!IMPORTANT] +> Some parts of htsget-rs require extra feature flags for conditional compilation, that's why the examples specify +> using `--all-features`. 
Notably, `--features s3-storage` enables the `S3` location type, and `--features url-storage` +> enables the remote HTTP server location type. If using a subset of features, for example S3 locations only, then +> a single feature can be enabled instead of using `--all-features`. -The resolvers component of htsget-rs is used to map query IDs to the location of the resource. This is the component of the -code that takes the [`id`][id], which is everything after `reads/` or `variants/` in the http path, and maps it to a data location. +### Server config -For example, if the request to htsget-rs is: +htsget-rs spawns up to two server instances - the ticket server responds to the initial htsget request, and optionally, +the data server, which responds to the htsget tickets. -```sh -curl 'http://localhost:8080/reads/some_id/file' +The socket address of the servers can be changed by specifying `addr`: + +```toml +ticket_server.addr = "127.0.0.1:8000" +data_server.addr = "127.0.0.1:8001" ``` -Then the resolvers controls how the server finds `some_id/file`, which may be stored locally, in the cloud, or at an arbitrary URL location. -The resolvers maps `some_id/file` to a location using regexes and substitution strings. The location of the file does not -need to have the same name as the id. +TLS can be configured to enable HTTPS support by providing a certificate and private key: -A query ID is matched with a regex, and is then mapped with a substitution string that has access to the regex capture groups. -Resolvers are configured in an array, where the first matching resolver is resolver used to map the ID. +```toml +ticket_server.tls.key = "key.pem" +ticket_server.tls.cert = "cert.pem" + +data_server.tls.key = "key.pem" +data_server.tls.cert = "cert.pem" +``` + +### Service info config + +The service info config controls what is returned when the [`service-info`][service-info] path is queried. 
The following +option accepts any nested value, which gets converted to a JSON response: + +```toml +service_info.environment = "dev" +service_info.organization = { name = "name", url = "https://example.com/" } +``` + +### Environment variables -To create a resolver, add a `[[resolvers]]` array of tables, and set the following options: +Most options can also be set using environment variables. Any environment variables will override options set in the +config file. Arrays are delimited with `[` and `]`, and items are separated by commas: + +| Variable | Description | Example | +|---------------------------------|----------------------------------------------------------------|----------------------------------------------------| +| `HTSGET_TICKET_SERVER_ADDR` | Set the ticket server socket address. | "127.0.0.1:8080" | +| `HTSGET_TICKET_SERVER_TLS_KEY` | See [server config](#server-config) | "key.pem" | +| `HTSGET_TICKET_SERVER_TLS_CERT` | See [server config](#server-config) | "cert.pem" | +| `HTSGET_DATA_SERVER_ADDR` | Set the data server socket address. | "127.0.0.1:8081" | +| `HTSGET_DATA_SERVER_LOCAL_PATH` | Set the path that the data server has access to. | "dir/path" | +| `HTSGET_DATA_SERVER_TLS_KEY` | See [server config](#server-config) | "key.pem" | +| `HTSGET_DATA_SERVER_TLS_CERT` | See [server config](#server-config) | "cert.pem" | +| `HTSGET_SERVICE_INFO` | Set the service info, see [service info](#service-info-config) | "{ organization = { name = name, url = url }}" | +| `HTSGET_LOCATIONS` | Set the locations. | "[file://data/prefix_one, s3://bucket/prefix_two]" | +| `HTSGET_CONFIG` | Set the config file location. | "dir/config.toml" | + +## Advanced config + +The following section describes advanced configuration which is more flexible, but adds complexity. + +### Regex-based location + +Instead of the simple path-based locations described above, htsget-rs supports arbitrary regex-based id resolution. 
+This allows matching an [`id`][id], which is everything after `reads/` or `variants/` in the http path, and mapping +it to a location using regex substitution. + +To create a regex location, add a `[[locations]]` array of tables, and set the following options: | Option | Description | Type | Default | |-----------------------|-------------------------------------------------------------------------------------------------------------------------|---------------------------------------|---------| @@ -151,106 +154,114 @@ To create a resolver, add a `[[resolvers]]` array of tables, and set the followi | `substitution_string` | The replacement expression used to map the matched query ID. This has access to the match groups in the `regex` option. | String with access to capture groups | `'$0'` | For example, below is a `regex` option which matches a `/` between two groups, and inserts an additional `data` -in between the groups with the `substitution_string`. +in between the groups with the `substitution_string`: ```toml -[[resolvers]] +[[locations]] regex = '(?P<group1>.*?)/(?P<group2>.*)' substitution_string = '$group1/data/$group2' ``` -This would mean that a request to `http://localhost:8080/reads/some_id/file` would search for files at `some_id/data/file.bam` and `some_id/data/file.bam.bai`. +This would mean that a request to `http://localhost:8080/reads/some_id/file` would search for files at `some_id/data/file.bam`. -For more information about regex options see the [regex crate](https://docs.rs/regex/). +The regex locations also have access to further configuration of storage locations for `file://`, `s3://`, or `http://` +locations. These are called `File`, `S3`, and `Url` respectively. -Each resolver also maps to a certain storage backend. This storage backend can be used to set query IDs which are served from local storage, from S3-style bucket storage, or from HTTP URLs. -To set the storage backend for a resolver, add a `[resolvers.storage]` table. 
Some storage backends require feature flags to be set when compiling htsget-rs. +To manually configure `File` locations, set `backend.kind = "File"`, and specify any additional options from below under the `backend` table: -To use `LocalStorage`, set `backend = 'Local'` under `[resolvers.storage]`, and specify any additional options from below: +| Option | Description | Type | Default | +|--------------------------|------------------------------------------------------------------------------------------------------------------------------------|------------------------------|--------------------| +| `scheme` | The scheme present on URL tickets. | Either `'Http'` or `'Https'` | `'Http'` | +| `authority` | The authority present on URL tickets. This should likely match the `data_server.addr`. | URL authority | `'127.0.0.1:8081'` | +| `local_path` | The local filesystem path which the data server uses to respond to tickets. This should likely match the `data_server.local_path`. | Filesystem path | `'./'` | -| Option | Description | Type | Default | -|--------------------------|-------------------------------------------------------------------------------------------------------------------------------------|------------------------------|--------------------| -| `scheme` | The scheme present on URL tickets. | Either `'Http'` or `'Https'` | `'Http'` | -| `authority` | The authority present on URL tickets. This should likely match the `data_server_addr`. | URL authority | `'127.0.0.1:8081'` | -| `local_path` | The local filesystem path which the data server uses to respond to tickets. This should likely match the `data_server_local_path`. | Filesystem path | `'./'` | -| `path_prefix` | The path prefix which the URL tickets will have. This should likely match the `data_server_serve_at` path. | URL path | `''` | -| `use_data_server_config` | Whether to use the data server config to fill in the above values. This overrides any other options specified from this table. 
| Boolean | `false` | - -By default, if the above options are left unspecified, they inherit values from the [`data_server`][data-server] config. -For example, the following sets the `scheme`, `authority`, `local_path` and `path_prefix` to values used by the `data_server`. +For example: ```toml -[[resolvers]] -regex = '.*' -substitution_string = '$0' +data_server.addr = "127.0.0.1:8000" -[resolvers.storage] -backend = 'Local' +[[locations]] +regex = ".*" +substitution_string = "$0" + +backend.kind = "Local" +backend.scheme = "Http" +backend.authority = "127.0.0.1:8000" +backend.local_path = "path" ``` -To use `S3Storage`, build htsget-rs with the `s3-storage` feature enabled, set `backend = 'S3'` under `[resolvers.storage]`, and specify: +To manually configure `S3` locations, set `backend.kind = "S3"`, and specify options from below under the `backend` table: + +| Option | Description | Type | Default | +|------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------|--------------------------------------------------------------------------------------------------------------------------| +| `bucket` | The AWS S3 bucket where resources can be retrieved from. | String | Derived from the `location` `regex` property if empty. This uses the first capture group in the `regex` as the `bucket`. | +| `endpoint` | A custom endpoint to override the default S3 service address. This is useful for using S3 locally or with storage backends such as MinIO. See [MinIO](#minio). | String | Not set, uses regular AWS S3 services. | +| `path_style` | The S3 path style to request from the storage backend. If `true`, "path style" is used, e.g. `host.com/bucket/object.bam`, otherwise `bucket.host.com/object` style is used. 
| Boolean | `false` | -| Option | Description | Type | Default | -|--------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------|---------------------------------------------------------------------------------------------------------------------------| -| `bucket` | The AWS S3 bucket where resources can be retrieved from. | String | Derived from the `resolvers` `regex` property if empty. This uses the first capture group in the `regex` as the `bucket`. | -| `endpoint` | A custom endpoint to override the default S3 service address. This is useful for using S3 locally or with storage backends such as MinIO. See [MinIO](#minio). | String | Not set, uses regular AWS S3 services. | -| `path_style` | The S3 path style to request from the storage backend. If `true`, "path style" is used, e.g. `host.com/bucket/object.bam`, otherwise `bucket.host.com/object` style is used. | Boolean | `false` | +For example, the following backend manually sets the `bucket` and uses path style requests: -For example, a `resolvers` value of: ```toml -[[resolvers]] -regex = '^(example_bucket)/(?P.*)$' -substitution_string = '$key' +[[locations]] +regex = "prefix/(?P.*)$" +substitution_string = "$key" -[resolvers.storage] -backend = 'S3' -# Uses the first capture group in the regex as the bucket. +backend.kind = "S3" +backend.bucket = "bucket" +backend.path_style = true ``` -Will use "example_bucket" as the S3 bucket if that resolver matches, because this is the first capture group in the `regex`. -Note, to use this feature, at least one capture group must be defined in the `regex`. +To manually configure `Url` locations, set `backend.kind = "Url"`, specify any additional options from below under the `backend` table: -`UrlStorage` is a storage backend which can be used to serve data from a remote HTTP URL. 
When using this storage backend, htsget-rs will fetch data from a `url` which is set in the config. It will also forward any headers received with the initial query, which is useful for authentication. -To use `UrlStorage`, build htsget-rs with the `url-storage` feature enabled, set `backend = 'Url'` under `[resolvers.storage]`, and specify any additional options from below: +| Option | Description | Type | Default | +|--------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------|-----------------------------------------------------------------------------------------------------------------| +| `url` | The URL to fetch data from. | HTTP URL | `"https://127.0.0.1:8081/"` | +| `response_url` | The URL to return to the client for fetching tickets. | HTTP URL | `"https://127.0.0.1:8081/"` | +| `forward_headers` | When constructing the URL tickets, copy HTTP headers received in the initial query. | Boolean | `true` | +| `header_blacklist` | List of headers that should not be forwarded. | Array of headers | `[]` | +| `tls` | Additionally enables client authentication, or sets non-native root certificates for TLS. See [server configuration](#server-configuration) for more details. | TOML table | TLS is always allowed, however the default performs no client authentication and uses native root certificates. | -| Option | Description | Type | Default | -|--------------------------------------|-----------------------------------------------------------------------------------------------------------------------------|--------------------------|-----------------------------------------------------------------------------------------------------------------| -| `url` | The URL to fetch data from. | HTTP URL | `"https://127.0.0.1:8081/"` | -| `response_url` | The URL to return to the client for fetching tickets. 
| HTTP URL | `"https://127.0.0.1:8081/"` | -| `forward_headers` | When constructing the URL tickets, copy HTTP headers received in the initial query. | Boolean | `true` | -| `header_blacklist` | List of headers that should not be forwarded. | Array of headers | `[]` | -| `tls` | Additionally enables client authentication, or sets non-native root certificates for TLS. See [TLS](#tls) for more details. | TOML table | TLS is always allowed, however the default performs no client authentication and uses native root certificates. | +For example, the following forwards all headers to response tickets except `Host`, and constructs tickets using `https://example.com` instead of `http://localhost:8080`: -When using `UrlStorage`, the following requests will be made to the `url`. -* `GET` request to fetch only the headers of the data file (e.g. `GET /data.bam`, with `Range: bytes=0-`). -* `GET` request to fetch the entire index file (e.g. `GET /data.bam.bai`). -* `HEAD` request on the data file to get its length (e.g. `HEAD /data.bam`). +```toml +[[locations]] +regex = ".*" +substitution_string = "$0" -By default, all headers received in the initial query will be included when making these requests. To exclude certain headers from being forwarded, set the `header_blacklist` option. Note that the blacklisted headers are removed from the requests made to `url` and from the URL tickets as well. 
+backend.kind = "Url" +backend.url = "http://localhost:8080" +backend.response_url = "https://example.com" +backend.forward_headers = true +backend.header_blacklist = ["Host"] +``` -Example of a resolver with `UrlStorage`: +Regex-based locations also support multiple locations: ```toml -[[resolvers]] +[[locations]] +regex = "prefix/(?P<key>.*)$" +substitution_string = "$key" +backend.kind = "S3" +backend.bucket = "bucket" +backend.path_style = true + +[[locations]] regex = ".*" substitution_string = "$0" - -[resolvers.storage] -backend = 'Url' -url = "http://localhost:8080" -response_url = "https://example.com" -forward_headers = true -header_blacklist = ["Host"] +backend.kind = "Url" +backend.url = "http://localhost:8080" +backend.forward_headers = false ``` -There are additional examples of config files located under [`examples/config-files`][examples-config-files]. +If there is an overlap in regex matches, the first location specified will be the one used. + +Additional config file examples are available under [`examples/config-files`][examples-config-files]. -#### Allow guard -Additionally, the resolver component has a feature, which allows resolving IDs based on the other fields present in a query. -This is useful as it allows the resolver to match an ID only if a particular set of query parameters are also present. For example, -a resolver can be set to only resolve IDs if the format is also BAM. +### Allow guard -This component can be configured by setting the `[resolver.allow_guard]` table with. The following options are available to restrict which queries are resolved by a resolver: +Additionally, locations support resolving IDs based on the other fields present in a query. +This is useful to allow the location to match an ID only if a particular set of query parameters are also present. 
+ +This component can be configured by setting the `guard` table with: | Option | Description | Type | Default | |-------------------------|-----------------------------------------------------------------------------------------|-----------------------------------------------------------------------|-------------------------------------| @@ -259,59 +270,42 @@ This component can be configured by setting the `[resolver.allow_guard]` table w | `allow_tags` | Resolve the query ID if the query also contains the tags set by this option. | Array of tags or `'All'` | `'All'` | | `allow_formats` | Resolve the query ID if the query is one of the formats specified by this option. | An array of formats containing `'BAM'`, `'CRAM'`, `'VCF'`, or `'BCF'` | `['BAM', 'CRAM', 'VCF', 'BCF']` | | `allow_classes` | Resolve the query ID if the query is one of the classes specified by this option. | An array of classes containing eithr `'body'` or `'header'` | `['body', 'header']` | -| `allow_interval_start` | Resolve the query ID if the query reference start position is at least this option. | Unsigned 32-bit integer start position, 0-based, inclusive | Not set, allows all start positions | -| `allow_interval_end` | Resolve the query ID if the query reference end position is at most this option. | Unsigned 32-bit integer end position, 0-based exclusive. | Not set, allows all end positions | +| `allow_interval.start` | Resolve the query ID if the query reference start position is at least this option. | Unsigned 32-bit integer start position, 0-based, inclusive | Not set, allows all start positions | +| `allow_interval.end` | Resolve the query ID if the query reference end position is at most this option. 
| Unsigned 32-bit integer end position, 0-based exclusive | Not set, allows all end positions | -An example of a fully configured resolver: +For example, match only if the request queries `chr1` with positions between `100` and `1000`: ```toml -[[resolvers]] -regex = '.*' -substitution_string = '$0' - -[resolvers.storage] -backend = 'S3' -bucket = 'bucket' - -[resolvers.allow_guard] -allow_reference_names = ['chr1'] -allow_fields = ['QNAME'] -allow_tags = ['RG'] -allow_formats = ['BAM'] -allow_classes = ['body'] -allow_interval_start = 100 -allow_interval_end = 1000 -``` - -In this example, the resolver will only match the query ID if the query is for `chr1` with positions between `100` and `1000`. +[[locations]] +regex = ".*" +substitution_string = "$0" -#### TLS +backend.kind = "S3" +backend.bucket = "bucket" -TLS can be configured for the ticket server, data server, or the url storage client. These options read private keys and -certificates from PEM-formatted files. Certificates must be in X.509 format and private keys can be RSA, PKCS8, or SEC1 (EC) encoded. -The following options are available: +guard.allow_reference_names = ["chr1"] +guard.allow_interval.start = 100 +guard.allow_interval.end = 1000 +``` -| Option | Description | Type | Default | -|------------------------|----------------------------------------------------------------------------------------------------------------------------------------------|-------------------|---------| -| `key` | The path to the PEM formatted X.509 certificate. Specifies TLS for servers or client authentication for clients. | Filesystem path | Not Set | -| `cert` | The path to the PEM formatted RSA, PKCS8, or SEC1 encoded EC private key. Specifies TLS for servers or client authentication for clients. | Filesystem path | Not Set | -| `root_store` | The path to the PEM formatted root certificate store. Only used to specify non-native root certificates for the HTTP client in `UrlStorage`. 
| Filesystem path | Not Set | +### Server configuration -When used by the ticket and data servers, `key` and `cert` enable TLS, and when used with the url storage client, they enable client authentication. -The root store is only used by the url storage client. Note, the url storage client always allows TLS, however the default configuration performs no client authentication -and uses the native root certificate store. +To use custom root certificates for `Url` locations, set the following: -For example, TLS for the ticket server can be enabled by specifying the key and cert options: ```toml -ticket_server_tls.cert = "cert.pem" -ticket_server_tls.key = "key.pem" +[[locations]] +regex = ".*" +substitution_string = "$0" + +backend.kind = "Url" +backend.tls.root_store = "root.crt" ``` This project uses [rustls] for all TLS logic, and it does not depend on OpenSSL. The rustls library can be more strict when accepting certificates and keys. If generating certificates for `root_store` using OpenSSL, the correct extensions, such as `subjectAltName` should be included. -An example of generating a custom root CA and certificates for a `UrlStorage` backend: +An example of generating a custom root CA and certificates for a `Url` backend: ```sh # Create a root CA @@ -320,173 +314,43 @@ openssl req -x509 -noenc -subj '/CN=localhost' -newkey rsa -keyout root.key -out # Create a certificate signing request openssl req -noenc -newkey rsa -keyout server.key -out server.csr -subj '/CN=localhost' -addext subjectAltName=DNS:localhost -# Create the `UrlStorage` server's certificate +# Create the `Url` server's certificate openssl x509 -req -in server.csr -CA root.crt -CAkey root.key -days 365 -out server.crt -copy_extensions copy # An additional client certificate signing request and certificate can be created in the same way as the server # certificate if using client authentication. 
``` -The `root.crt` can then be used in htsget-rs to allow authenticating to a `UrlStorage` backend using `server.crt`: +CORS can also be configured for the data and ticket servers by specifying the `cors` option: ```toml -# Trust the root CA that signed the server's certificate. -tls.root_store = "root.crt" -``` - -Alternatively, projects such as [mkcert] can be used to simplify this process. - -Further TLS examples are available under [`examples/config-files`][examples-config-files]. - -[examples-config-files]: examples/config-files -[rustls]: https://github.com/rustls/rustls -[mkcert]: https://github.com/FiloSottile/mkcert - -#### Config file location - -The htsget-rs binaries ([htsget-axum], [htsget-actix] and [htsget-lambda]) support some command line options. The config file location can -be specified by setting the `--config` option: - -```shell -cargo run -p htsget-axum -- --config "config.toml" +ticket_server.cors.allow_credentials = false +ticket_server.cors.allow_origins = "Mirror" +ticket_server.cors.allow_headers = "All" +ticket_server.cors.allow_methods = ["GET", "POST"] +ticket_server.cors.max_age = 86400 +ticket_server.cors.expose_headers = [] ``` -The config can also be read from an environment variable: - -```shell -export HTSGET_CONFIG="config.toml" -``` - -If no config file is specified, the default configuration is used. Further, the default configuration file can be printed to stdout by passing -the `--print-default-config` flag: - -```shell -cargo run -p htsget-axum -- --print-default-config -``` - -Use the `--help` flag to see more details on command line options. - -[htsget-actix]: ../htsget-actix -[htsget-axum]: ../htsget-axum -[htsget-lambda]: ../htsget-lambda - -#### Log formatting - -The [Tracing][tracing] crate is used by htsget-rs is for logging functionality. The `RUST_LOG` variable is -read to configure the level that trace logs are emitted. 
- -For example, the following indicates trace level for all htsget crates, and info level for all other crates: - -```sh -export RUST_LOG='info,htsget_lambda=trace,htsget_lambda=trace,htsget_config=trace,htsget_http=trace,htsget_search=trace,htsget_test=trace' -``` - -See [here][rust-log] for more information on setting this variable. - -The style of formatting can be configured by setting the following option: - -| Option | Description | Type | Default | -|---------------------------------------------------------|--------------------------------------|--------------------------------------------------------|----------| -| `formatting_style` | The style of log formatting to use. | One of `'Full'`, `'Compact'`, `'Pretty'`, or `'Json'` | `'Full'` | - -See [here][formatting-style] for more information on how these values look. - -[tracing]: https://github.com/tokio-rs/tracing -[rust-log]: https://rust-lang-nursery.github.io/rust-cookbook/development_tools/debugging/config_log.html -[formatting-style]: https://docs.rs/tracing-subscriber/latest/tracing_subscriber/fmt/index.html#formatters - -#### Environment variables - -All the htsget-rs config options can be set using environment variables, which is convenient for runtimes such as AWS Lambda. -The ticket server, data server and service info options are flattened and can be set directly using -environment variable. It is not recommended to set the resolvers using environment variables, however it can be done by setting a single environment variable which -contains a list of structures, where a key name and value pair is used to set the nested options. - -Environment variables will override options set in the config file. Note, arrays are delimited with `[` and `]` in environment variables, and items are separated by commas. 
- -The following environment variables - corresponding to the TOML config - are available: - -| Variable | Description | -|-----------------------------------------------|-------------------------------------------------------------------------------------| -| `HTSGET_TICKET_SERVER_ADDR` | See [`ticket_server_addr`](#ticket_server_addr) | -| `HTSGET_TICKET_SERVER_TLS_KEY` | See [`TLS`](#tls) | -| `HTSGET_TICKET_SERVER_TLS_CERT` | See [`TLS`](#tls) | -| `HTSGET_TICKET_SERVER_CORS_ALLOW_CREDENTIALS` | See [`ticket_server_cors_allow_credentials`](#ticket_server_cors_allow_credentials) | -| `HTSGET_TICKET_SERVER_CORS_ALLOW_ORIGINS` | See [`ticket_server_cors_allow_origins`](#ticket_server_cors_allow_origins) | -| `HTSGET_TICKET_SERVER_CORS_ALLOW_HEADERS` | See [`ticket_server_cors_allow_headers`](#ticket_server_cors_allow_headers) | -| `HTSGET_TICKET_SERVER_CORS_ALLOW_METHODS` | See [`ticket_server_cors_allow_methods`](#ticket_server_cors_allow_methods) | -| `HTSGET_TICKET_SERVER_CORS_MAX_AGE` | See [`ticket_server_cors_max_age`](#ticket_server_cors_max_age) | -| `HTSGET_TICKET_SERVER_CORS_EXPOSE_HEADERS` | See [`ticket_server_cors_expose_headers`](#ticket_server_cors_expose_headers) | -| `HTSGET_DATA_SERVER_ADDR` | See [`data_server_addr`](#data_server_addr) | -| `HTSGET_DATA_SERVER_LOCAL_PATH` | See [`data_server_local_path`](#data_server_local_path) | -| `HTSGET_DATA_SERVER_SERVE_AT` | See [`data_server_serve_at`](#data_server_serve_at) | -| `HTSGET_DATA_SERVER_TLS_KEY` | See [`TLS`](#tls) | -| `HTSGET_DATA_SERVER_TLS_CERT` | See [`TLS`](#tls) | -| `HTSGET_DATA_SERVER_CORS_ALLOW_CREDENTIALS` | See [`data_server_cors_allow_credentials`](#data_server_cors_allow_credentials) | -| `HTSGET_DATA_SERVER_CORS_ALLOW_ORIGINS` | See [`data_server_cors_allow_origins`](#data_server_cors_allow_origins) | -| `HTSGET_DATA_SERVER_CORS_ALLOW_HEADERS` | See [`data_server_cors_allow_headers`](#data_server_cors_allow_headers) | -| `HTSGET_DATA_SERVER_CORS_ALLOW_METHODS` | See 
[`data_server_cors_allow_methods`](#data_server_cors_allow_methods) | -| `HTSGET_DATA_SERVER_CORS_MAX_AGE` | See [`data_server_cors_max_age`](#data_server_cors_max_age) | -| `HTSGET_DATA_SERVER_CORS_EXPOSE_HEADERS` | See [`data_server_cors_expose_headers`](#data_server_cors_expose_headers) | -| `HTSGET_ID` | See [`id`](#id) | -| `HTSGET_NAME` | See [`name`](#name) | -| `HTSGET_VERSION` | See [`version`](#version) | -| `HTSGET_ORGANIZATION_NAME` | See [`organization_name`](#organization_name) | -| `HTSGET_ORGANIZATION_URL` | See [`organization_url`](#organization_url) | -| `HTSGET_CONTACT_URL` | See [`contact_url`](#contact_url) | -| `HTSGET_DOCUMENTATION_URL` | See [`documentation_url`](#documentation_url) | -| `HTSGET_CREATED_AT` | See [`created_at`](#created_at) | -| `HTSGET_UPDATED_AT` | See [`updated_at`](#updated_at) | -| `HTSGET_ENVIRONMENT` | See [`environment`](#environment) | -| `HTSGET_RESOLVERS` | See [`resolvers`](#resolvers) | -| `HTSGET_FORMATTING_STYLE` | See [`formatting_style`](#formatting_style) | - -In order to use `HTSGET_RESOLVERS`, the entire resolver config array must be set. The nested array of resolvers structure can be set using name key and value pairs, for example: - -```shell -export HTSGET_RESOLVERS="[{ - regex=regex, - substitution_string=substitution_string, - storage={ - type=S3, - bucket=bucket - }, - allow_guard={ - allow_reference_names=[chr1], - allow_fields=[QNAME], - allow_tags=[RG], - allow_formats=[BAM], - allow_classes=[body], - allow_interval_start=100, - allow_interval_end=1000 - } -}]" -``` - -Similar to the [data_server](#data_server) option, the data server can be disabled by setting the equivalent environment variable: - -```shell -export HTSGET_DATA_SERVER_ENABLED=false -``` -[service-info]: https://samtools.github.io/hts-specs/htsget.html#ga4gh-service-info +Use `"Mirror"` to mirror CORS requests, and `"All"` to allow all methods, headers, or origins. 
The `ticket_server` table +above can be replaced with `data_server` to configure CORS for the data server. ### MinIO -Operating a local object storage like [MinIO][minio] can be achieved by leveraging the `endpoint` directive as shown below: +Operating a local object storage like [MinIO][minio] can be achieved by using `endpoint` under `"S3"` locations as shown below: ```toml -[[resolvers]] -regex = '.*' -substitution_string = '$0' - -[resolvers.storage] -backend = 'S3' -bucket = 'bucket' -endpoint = 'http://127.0.0.1:9000' -path_style = true +[[locations]] +regex = ".*" +substitution_string = "$0" + +backend.kind = 'S3' +backend.bucket = 'bucket' +backend.endpoint = 'http://127.0.0.1:9000' +backend.path_style = true ``` -Care must be taken to ensure that the [correct][env-variables] `AWS_DEFAULT_REGION`, `AWS_ACCESS_KEY` and `AWS_SECRET_ACCESS_KEY` is set to allow +Care must be taken to ensure that the [correct][env-variables] `AWS_DEFAULT_REGION`, `AWS_ACCESS_KEY` and `AWS_SECRET_ACCESS_KEY` are set to allow the AWS sdk to reach the endpoint. Additional configuration of the MinIO server is required to use [virtual-hosted][virtual-addressing] style addressing by setting the `MINIO_DOMAIN` environment variable. [Path][path-addressing] style addressing can be forced using `path_style = true`. @@ -494,73 +358,110 @@ See the MinIO deployment [example][minio-deployment] for more information on how ### Crypt4GH -There is experimental support for serving [Crypt4GH][c4gh] encrypted files. This can be enabled by compiling with the -`experimental` feature flag. +There is experimental support for serving [Crypt4GH][c4gh] encrypted files. This allows htsget-rs to read Crypt4GH files and serve them encrypted, directly to the client. In the process of serving the data, htsget-rs will decrypt the headers of the Crypt4GH files and re-encrypt them so that the client can read them. 
When the client receives byte ranges from htsget-rs and concatenates them, the output bytes will be Crypt4GH encrypted, and will need to be decrypted before they can be read. All file formats (BAM, CRAM, VCF, and BCF) are supported using Crypt4GH. -To use this feature, set `location = 'Local'` under `resolvers.storage.keys` to specify the private and public keys: +To use this feature, set `keys.kind = "File"` under the `location` table to specify the private and public keys: -| Option | Description | Type | Default | -|------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------|---------| -| `private_key` | The path to PEM formatted private key which htsget-rs uses to decrypt Crypt4GH data. | Filesystem path | Not Set | -| `recipient_public_key` | The path to the PEM formatted public key which the recipient of the data will use. This is what the client will use to decrypt the returned data, using the corresponding private key. | Filesystem path | Not Set | +| Option | Description | Type | Default | +|-----------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------|---------| +| `private` | The path to PEM formatted private key which htsget-rs uses to decrypt Crypt4GH data. | Filesystem path | Not Set | +| `public` | The path to the PEM formatted public key which the recipient of the data will use. This is what the client will use to decrypt the returned data, using the corresponding private key. 
| Filesystem path | Not Set | For example: ```toml [[resolvers]] -regex = '.*' -substitution_string = '$0' +regex = ".*" +substitution_string = "$0" -[resolvers.storage.keys] -location = 'Local' -private_key = 'data/c4gh/keys/bob.sec' # pragma: allowlist secret -recipient_public_key = 'data/c4gh/keys/alice.pub' +location.keys.kind = "File" +location.keys.private = "data/c4gh/keys/bob.sec" # pragma: allowlist secret +location.keys.public = "data/c4gh/keys/alice.pub" ``` -Keys can also be retrieved from [AWS Secrets Manager][secrets-manager]. Compile with the `s3-storage` feature flag and specify `location = 'SecretsManager'` under -`resolvers.storage.keys` to fetch keys from Secrets Manager. When using Secrets Manager, the `private_key` and `recipient_public_key` +Keys can also be retrieved from [AWS Secrets Manager][secrets-manager]. Compile with the `s3-storage` feature flag and specify `keys.kind = "SecretsManager"` under +`location` to fetch keys from Secrets Manager. When using Secrets Manager, the `private` and `public` correspond to ARNs or secret names in Secrets Manager storing PEM formatted keys. For example: ```toml -[[resolvers]] -regex = '.*' -substitution_string = '$0' +[[locations]] +regex = ".*" +substitution_string = "$0" -[resolvers.storage.keys] -location = 'SecretsManager' -private_key = 'private_key_secret_name' # pragma: allowlist secret -recipient_public_key = 'public_key_secret_name' +location.keys.kind = "SecretsManager" +location.keys.private = "private_key_secret_name" # pragma: allowlist secret +location.keys.public = "public_key_secret_name" ``` The htsget-rs server expects the Crypt4GH file to end with `.c4gh`, and the index file to be unencrypted. See the [`data/c4gh`][data-c4gh] for examples of file structure. Any of the storage types are supported, i.e. `Local`, `S3`, or `Url`. +### Log formatting + +The `RUST_LOG` variable is read to configure the level that trace logs are emitted. 
+ +For example, the following indicates trace level for all htsget crates, and info level for all other crates: + +```sh +export RUST_LOG='info,htsget_lambda=trace,htsget_lambda=trace,htsget_config=trace,htsget_http=trace,htsget_search=trace,htsget_test=trace' +``` + +See [here][rust-log] for more information on setting this variable. + +The style of formatting can be configured by setting the following option: + +| Option | Description | Type | Default | +|---------------------------------------------------------|--------------------------------------|--------------------------------------------------------|----------| +| `formatting_style` | The style of log formatting to use. | One of `'Full'`, `'Compact'`, `'Pretty'`, or `'Json'` | `'Full'` | + +See [here][formatting-style] for more information on how these values look. + +### Environment variables + +Advanced configuration options also support environment variables. Generally, options separated by `.` in a config file +are separated by `_` in the corresponding environment variable. For example, to set the ticket server allow origins, +use `HTSGET_TICKET_SERVER_CORS_ALLOW_ORIGINS`. It is not recommended to set regex-based locations using environment +variables because the variables need to contain the nested array structure of storage backends. + ### As a library This crate reads config files and environment variables using [figment], and accepts command-line arguments using clap. The main function for this is `from_config`, -which is used to obtain the `Config` struct. The crate also contains the `regex_resolver` abstraction, which is used for matching a query ID with -regex, and changing it by using a substitution string. +which is used to obtain the `Config` struct. The crate also contains the `resolver` abstraction, which is used for matching a query ID with +regex, and changing it by using a substitution string. Advanced configuration options are specified in the [`advanced.rs`][advanced] submodule. 
+[advanced]: src/config/advanced/mod.rs [figment]: https://github.com/SergioBenitez/Figment -#### Feature flags +### Feature flags This crate has the following features: -* `s3-storage`: used to enable `S3Storage` functionality. -* `url-storage`: used to enable `UrlStorage` functionality. +* `s3-storage`: used to enable `S3` location functionality. +* `url-storage`: used to enable `Url` location functionality. * `experimental`: used to enable experimental features that aren't necessarily part of the htsget spec, such as Crypt4GH support through `C4GHStorage`. ## License This project is licensed under the [MIT license][license]. +[tracing]: https://github.com/tokio-rs/tracing +[rust-log]: https://rust-lang-nursery.github.io/rust-cookbook/development_tools/debugging/config_log.html +[formatting-style]: https://docs.rs/tracing-subscriber/latest/tracing_subscriber/fmt/index.html#formatters +[examples-config-files]: examples/config-files +[rustls]: https://github.com/rustls/rustls +[htsget-actix]: ../htsget-actix +[htsget-axum]: ../htsget-axum +[htsget-lambda]: ../htsget-lambda +[tracing]: https://github.com/tokio-rs/tracing +[rust-log]: https://rust-lang-nursery.github.io/rust-cookbook/development_tools/debugging/config_log.html +[formatting-style]: https://docs.rs/tracing-subscriber/latest/tracing_subscriber/fmt/index.html#formatters +[service-info]: https://samtools.github.io/hts-specs/htsget.html#ga4gh-service-info [path-addressing]: https://docs.aws.amazon.com/AmazonS3/latest/userguide/VirtualHosting.html#path-style-access [env-variables]: https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-envvars.html [virtual-addressing]: https://docs.aws.amazon.com/AmazonS3/latest/userguide/VirtualHosting.html#virtual-hosted-style-access @@ -571,5 +472,5 @@ This project is licensed under the [MIT license][license]. 
[data-c4gh]: ../data/c4gh [secrets-manager]: https://docs.aws.amazon.com/secretsmanager/latest/userguide/intro.html [id]: https://samtools.github.io/hts-specs/htsget.html#url-parameters -[basic]: examples/config-files/basic.toml -[data-server]: README.md#data-server-config \ No newline at end of file +[toml]: https://toml.io/en/ +[data]: ../data \ No newline at end of file diff --git a/htsget-config/examples/config-files/c4gh.toml b/htsget-config/examples/config-files/c4gh.toml index 4a350fec5..00ccd1e17 100644 --- a/htsget-config/examples/config-files/c4gh.toml +++ b/htsget-config/examples/config-files/c4gh.toml @@ -1,23 +1,23 @@ # An example of running htsget-rs with Crypt4GH enabled. # Run with `cargo run -p htsget-axum --features experimental -- --config htsget-config/examples/config-files/c4gh.toml` -ticket_server_addr = "127.0.0.1:8080" -data_server_addr = "127.0.0.1:8081" +ticket_server.addr = "127.0.0.1:8080" +data_server.addr = "127.0.0.1:8081" [[resolvers]] regex = ".*" substitution_string = "$0" -[resolvers.storage] -backend = 'Local' +[locations.backend] +kind = "File" -[resolvers.storage.keys] -location = "Local" -private_key = "data/c4gh/keys/bob.sec" # pragma: allowlist secret -recipient_public_key = "data/c4gh/keys/alice.pub" +[locations.backend.keys] +kind = "Local" +private = "data/c4gh/keys/bob.sec" # pragma: allowlist secret +public = "data/c4gh/keys/alice.pub" # Or, use AWS secrets manager to store keys. 
-#[resolvers.storage.keys] -#location = "SecretsManager" -#private_key = "htsget/test_c4gh_private_key" # pragma: allowlist secret -#recipient_public_key = "htsget/test_c4gh_public_key" +#[locations.backend.keys] +#kind = "SecretsManager" +#private = "htsget/test_c4gh_private_key" # pragma: allowlist secret +#public = "htsget/test_c4gh_public_key" diff --git a/htsget-config/examples/config-files/default.toml b/htsget-config/examples/config-files/default.toml index 93ea36080..6845e7905 100644 --- a/htsget-config/examples/config-files/default.toml +++ b/htsget-config/examples/config-files/default.toml @@ -1,40 +1,37 @@ # Config generated by running `cargo run -p htsget-axum -- -p` formatting_style = "Full" -ticket_server_addr = "127.0.0.1:8080" -ticket_server_cors_allow_credentials = false -ticket_server_cors_allow_origins = ["http://localhost:8080"] -ticket_server_cors_allow_headers = "All" -ticket_server_cors_allow_methods = "All" -ticket_server_cors_max_age = 86400 -ticket_server_cors_expose_headers = [] -data_server_enabled = true -data_server_addr = "127.0.0.1:8081" -data_server_local_path = "./" -data_server_serve_at = "" -data_server_cors_allow_credentials = false -data_server_cors_allow_origins = ["http://localhost:8080"] -data_server_cors_allow_headers = "All" -data_server_cors_allow_methods = "All" -data_server_cors_max_age = 86400 -data_server_cors_expose_headers = [] - -[[resolvers]] -regex = ".*" -substitution_string = "$0" -storage = "Local" - -[resolvers.allow_guard] -allow_reference_names = "All" -allow_fields = "All" -allow_tags = "All" -allow_formats = [ - "BAM", - "CRAM", - "VCF", - "BCF", -] -allow_classes = [ - "body", - "header", -] + +[ticket_server] +addr = "127.0.0.1:8080" + +[ticket_server.cors] +allow_credentials = false +allow_origins = "Mirror" +allow_headers = "Mirror" +allow_methods = "Mirror" +max_age = 2592000 +expose_headers = "All" + +[data_server] +addr = "127.0.0.1:8081" +local_path = "./" + +[data_server.cors] +allow_credentials 
= false +allow_origins = "Mirror" +allow_headers = "Mirror" +allow_methods = "Mirror" +max_age = 2592000 +expose_headers = "All" + +[service_info] + +[[locations]] +prefix = "" + +[locations.backend] +kind = "File" +scheme = "HTTP" +authority = "127.0.0.1:8081" +local_path = "./" diff --git a/htsget-config/examples/config-files/s3_storage.toml b/htsget-config/examples/config-files/s3_storage.toml index 5cde4dffe..7948e4576 100644 --- a/htsget-config/examples/config-files/s3_storage.toml +++ b/htsget-config/examples/config-files/s3_storage.toml @@ -1,19 +1,18 @@ # An example for a server which uses s3 storage with data located in "bucket". # Run with `cargo run -p htsget-axum --features s3-storage -- --config htsget-config/examples/config-files/s3_storage.toml` -ticket_server_cors_allow_headers = "All" -ticket_server_cors_allow_methods = "All" -ticket_server_cors_allow_credentials = true -ticket_server_cors_max_age = 300 +ticket_server.cors.allow_headers = "All" +ticket_server.cors.allow_methods = "All" +ticket_server.cors.allow_credentials = true +ticket_server.cors.max_age = 300 -data_server_enabled = false +data_server = "None" -[[resolvers]] -regex = '^(bucket)/(?P.*)$' -substitution_string = '$key' -storage.backend = 'S3' +[[locations]] +regex = "^(bucket)/(?P.*)$" +substitution_string = "$key" +backend.kind = "S3" # Or, set the bucket manually -#[resolvers.storage] -#backend = 'S3' -#bucket = 'bucket' +#backend.kind = "S3" +#backend.bucket = "bucket" diff --git a/htsget-config/examples/config-files/tls_data_server.toml b/htsget-config/examples/config-files/tls_data_server.toml index d2e4316e5..5d8987d85 100644 --- a/htsget-config/examples/config-files/tls_data_server.toml +++ b/htsget-config/examples/config-files/tls_data_server.toml @@ -1,16 +1,13 @@ # An example config file for a TLS data server that uses a local storage backend. 
# Run with `cargo run -p htsget-axum -- --config htsget-config/examples/config-files/tls_data_server.toml` -ticket_server_addr = "0.0.0.0:8080" -data_server_addr = "0.0.0.0:8081" -data_server_cors_allow_origins = "All" -data_server_tls.cert = "cert.pem" -data_server_tls.key = "key.pem" +ticket_server.addr = "0.0.0.0:8080" +data_server.addr = "0.0.0.0:8081" +data_server.cors.allow_origins = "All" +data_server.tls.cert = "cert.pem" +data_server.tls.key = "key.pem" -[[resolvers]] +[[locations]] regex = ".*" substitution_string = "$0" - -[resolvers.storage] -backend = 'Local' -use_data_server_config = true +backend.kind = "Local" diff --git a/htsget-config/examples/config-files/tls_ticket_server.toml b/htsget-config/examples/config-files/tls_ticket_server.toml index 9bd196ffa..d73bb5968 100644 --- a/htsget-config/examples/config-files/tls_ticket_server.toml +++ b/htsget-config/examples/config-files/tls_ticket_server.toml @@ -1,16 +1,14 @@ # An example config file for a TLS ticket server that uses S3 as a storage backend. 
# Run with `cargo run -p htsget-axum --features s3-storage -- --config htsget-config/examples/config-files/tls_ticket_server.toml` -ticket_server_addr = "0.0.0.0:8080" -ticket_server_cors_allow_origins = "All" -ticket_server_tls.cert = "cert.pem" -ticket_server_tls.key = "key.pem" -data_server_addr = "0.0.0.0:8081" +ticket_server.addr = "0.0.0.0:8080" +ticket_server.cors.allow_origins = "All" +ticket_server.tls.cert = "cert.pem" +ticket_server.tls.key = "key.pem" +data_server.addr = "0.0.0.0:8081" -[[resolvers]] +[[locations]] regex = ".*" substitution_string = "$0" - -[resolvers.storage] -backend = 'S3' -bucket = "bucket" +backend.kind = "S3" +backend.bucket = "bucket" diff --git a/htsget-config/examples/config-files/url_storage.toml b/htsget-config/examples/config-files/url_storage.toml index 372b00800..078a41b4c 100644 --- a/htsget-config/examples/config-files/url_storage.toml +++ b/htsget-config/examples/config-files/url_storage.toml @@ -3,27 +3,26 @@ # `cargo run -p htsget-axum --features url-storage -- --config htsget-config/examples/config-files/url_storage.toml` # in the project directory. 
-ticket_server_addr = "127.0.0.1:8082" -ticket_server_cors_allow_origins = "All" +ticket_server.addr = "127.0.0.1:8082" +ticket_server.cors.allow_origins = "All" -ticket_server_cert = "cert.pem" -ticket_server_key = "key.pem" +ticket_server.cert = "cert.pem" +ticket_server.key = "key.pem" -data_server_enabled = false +data_server = "None" -[[resolvers]] +[[locations]] regex = ".*" substitution_string = "$0" -[resolvers.storage] -backend = 'Url' -url = "http://127.0.0.1:8081" -response_url = "https://127.0.0.1:8081" -forward_headers = true +backend.kind = "Url" +backend.url = "http://127.0.0.1:8081" +backend.response_url = "https://127.0.0.1:8081" +backend.forward_headers = true # Set client authentication -#tls.key = "key.pem" -#tls.cert = "cert.pem" +#backend.tls.key = "key.pem" +#backend.tls.cert = "cert.pem" # Set root certificates -#tls.root_store = "cert.pem" +#backend.tls.root_store = "cert.pem" diff --git a/htsget-config/src/storage/local.rs b/htsget-config/src/storage/local.rs deleted file mode 100644 index a514d916f..000000000 --- a/htsget-config/src/storage/local.rs +++ /dev/null @@ -1,180 +0,0 @@ -use std::str::FromStr; - -use http::uri::Authority; -use serde::{Deserialize, Serialize}; - -use crate::config::{default_localstorage_addr, default_path, DataServerConfig}; -#[cfg(feature = "experimental")] -use crate::storage::c4gh::C4GHKeys; -use crate::tls::KeyPairScheme; -use crate::types::Scheme; - -pub(crate) fn default_authority() -> Authority { - Authority::from_static(default_localstorage_addr()) -} - -fn default_local_path() -> String { - default_path().into() -} - -#[derive(Serialize, Deserialize, Debug, Clone)] -#[serde(default)] -pub struct Local { - scheme: Scheme, - #[serde(with = "http_serde::authority")] - authority: Authority, - local_path: String, - path_prefix: String, - use_data_server_config: bool, - #[serde(skip_serializing)] - #[cfg(feature = "experimental")] - keys: Option, -} - -impl Local { - /// Create a new local storage. 
- pub fn new( - scheme: Scheme, - authority: Authority, - local_path: String, - path_prefix: String, - use_data_server_config: bool, - ) -> Self { - Self { - scheme, - authority, - local_path, - path_prefix, - use_data_server_config, - #[cfg(feature = "experimental")] - keys: None, - } - } - - /// Get the scheme. - pub fn scheme(&self) -> Scheme { - self.scheme - } - - /// Get the authority. - pub fn authority(&self) -> &Authority { - &self.authority - } - - /// Get the local path. - pub fn local_path(&self) -> &str { - &self.local_path - } - - /// Get the path prefix. - pub fn path_prefix(&self) -> &str { - &self.path_prefix - } - - /// Get whether config should be inherited from the data server config. - pub fn use_data_server_config(&self) -> bool { - self.use_data_server_config - } - - #[cfg(feature = "experimental")] - /// Set the C4GH keys. - pub fn set_keys(mut self, keys: Option) -> Self { - self.keys = keys; - self - } - - #[cfg(feature = "experimental")] - /// Get the C4GH keys. 
- pub fn keys(&self) -> Option<&C4GHKeys> { - self.keys.as_ref() - } -} - -impl Default for Local { - fn default() -> Self { - Self::new( - Scheme::Http, - default_authority(), - default_local_path(), - Default::default(), - false, - ) - } -} - -impl From<&DataServerConfig> for Local { - fn from(config: &DataServerConfig) -> Self { - Self::new( - config.tls().get_scheme(), - Authority::from_str(&config.addr().to_string()).expect("expected valid authority"), - config.local_path().to_string_lossy().to_string(), - config.serve_at().to_string(), - true, - ) - } -} - -#[cfg(test)] -mod tests { - use std::net::SocketAddr; - use std::path::PathBuf; - - use crate::config::cors::CorsConfig; - use crate::config::tests::test_config_from_file; - use crate::storage::Storage; - use crate::types::Scheme::Http; - - use super::*; - - #[test] - fn config_storage_local_file() { - test_config_from_file( - r#" - [[resolvers]] - regex = "regex" - - [resolvers.storage] - backend = "Local" - local_path = "path" - scheme = "HTTPS" - path_prefix = "path" - "#, - |config| { - println!("{:?}", config.resolvers().first().unwrap().storage()); - assert!(matches!( - config.resolvers().first().unwrap().storage(), - Storage::Local(local_storage) if local_storage.local_path() == "path" && local_storage.scheme() == Scheme::Https && local_storage.path_prefix() == "path" - )); - }, - ); - } - - #[test] - fn local_storage_from_data_server_config() { - let data_server_config = DataServerConfig::new( - true, - SocketAddr::from_str("127.0.0.1:8080").unwrap(), - PathBuf::from("data"), - "/data".to_string(), - None, - CorsConfig::default(), - ); - let result: Local = (&data_server_config).into(); - let expected = Local::new( - Http, - Authority::from_static("127.0.0.1:8080"), - "data".to_string(), - "/data".to_string(), - true, - ); - - assert_eq!(result.scheme(), expected.scheme()); - assert_eq!(result.authority(), expected.authority()); - assert_eq!(result.local_path(), expected.local_path()); - 
assert_eq!(result.path_prefix(), expected.path_prefix()); - assert_eq!( - result.use_data_server_config(), - expected.use_data_server_config() - ); - } -} From 0916b0cad53bf843dc22895e57eb6ff44f5e6184 Mon Sep 17 00:00:00 2001 From: Marko Malenic Date: Wed, 18 Dec 2024 14:08:04 +1100 Subject: [PATCH 2/2] fix: service info group, artifact and version, and add flexibility in configuration --- htsget-actix/src/handlers/service_info.rs | 2 +- htsget-axum/src/handlers/service_info.rs | 2 +- htsget-config/src/config/service_info.rs | 80 ++++++++++ htsget-config/src/storage/local.rs | 180 ---------------------- htsget-http/src/service_info.rs | 163 +++++++------------- 5 files changed, 141 insertions(+), 286 deletions(-) create mode 100644 htsget-config/src/config/service_info.rs delete mode 100644 htsget-config/src/storage/local.rs diff --git a/htsget-actix/src/handlers/service_info.rs b/htsget-actix/src/handlers/service_info.rs index 8df7c6dc0..f69e2a6b1 100644 --- a/htsget-actix/src/handlers/service_info.rs +++ b/htsget-actix/src/handlers/service_info.rs @@ -21,7 +21,7 @@ pub fn get_service_info_json( PrettyJson(get_base_service_info_json( endpoint, app_state.htsget.clone(), - &app_state.config_service_info, + app_state.config_service_info.clone(), )) } diff --git a/htsget-axum/src/handlers/service_info.rs b/htsget-axum/src/handlers/service_info.rs index 737203ba6..89dc77168 100644 --- a/htsget-axum/src/handlers/service_info.rs +++ b/htsget-axum/src/handlers/service_info.rs @@ -16,7 +16,7 @@ pub fn get_service_info_json( ErasedJson::pretty(get_base_service_info_json( endpoint, app_state.htsget, - &app_state.service_info, + app_state.service_info, )) } diff --git a/htsget-config/src/config/service_info.rs b/htsget-config/src/config/service_info.rs new file mode 100644 index 000000000..d0d4d259f --- /dev/null +++ b/htsget-config/src/config/service_info.rs @@ -0,0 +1,80 @@ +//! Service info configuration. +//! 
+ +use serde::de::Error; +use serde::{Deserialize, Deserializer, Serialize}; +use serde_json::Value; +use std::collections::HashMap; + +/// Service info config. +#[derive(Serialize, Debug, Clone, Default, PartialEq, Eq)] +#[serde(default)] +pub struct ServiceInfo(HashMap); + +impl ServiceInfo { + /// Create a service info. + pub fn new(fields: HashMap) -> Self { + Self(fields) + } + + /// Get the inner value. + pub fn into_inner(self) -> HashMap { + self.0 + } +} + +impl AsRef> for ServiceInfo { + fn as_ref(&self) -> &HashMap { + &self.0 + } +} + +impl<'de> Deserialize<'de> for ServiceInfo { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + let fields: HashMap = HashMap::::deserialize(deserializer)? + .into_iter() + .map(|(key, value)| (key.to_lowercase(), value)) + .collect(); + + let err_msg = |invalid_key| format!("reserved service info field `{}`", invalid_key); + + if fields.contains_key("type") { + return Err(Error::custom(err_msg("type"))); + } + + if fields.contains_key("htsget") { + return Err(Error::custom(err_msg("htsget"))); + } + + Ok(ServiceInfo::new(fields)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::config::tests::test_serialize_and_deserialize; + use crate::config::Config; + use serde_json::json; + + #[test] + fn service_info() { + test_serialize_and_deserialize( + r#" + service_info.environment = "dev" + service_info.organization = { name = "name", url = "https://example.com/" } + "#, + HashMap::from_iter(vec![ + ("environment".to_string(), json!("dev")), + ( + "organization".to_string(), + json!({ "name": "name", "url": "https://example.com/" }), + ), + ]), + |result: Config| result.service_info.0, + ); + } +} diff --git a/htsget-config/src/storage/local.rs b/htsget-config/src/storage/local.rs deleted file mode 100644 index a514d916f..000000000 --- a/htsget-config/src/storage/local.rs +++ /dev/null @@ -1,180 +0,0 @@ -use std::str::FromStr; - -use http::uri::Authority; -use serde::{Deserialize, 
Serialize}; - -use crate::config::{default_localstorage_addr, default_path, DataServerConfig}; -#[cfg(feature = "experimental")] -use crate::storage::c4gh::C4GHKeys; -use crate::tls::KeyPairScheme; -use crate::types::Scheme; - -pub(crate) fn default_authority() -> Authority { - Authority::from_static(default_localstorage_addr()) -} - -fn default_local_path() -> String { - default_path().into() -} - -#[derive(Serialize, Deserialize, Debug, Clone)] -#[serde(default)] -pub struct Local { - scheme: Scheme, - #[serde(with = "http_serde::authority")] - authority: Authority, - local_path: String, - path_prefix: String, - use_data_server_config: bool, - #[serde(skip_serializing)] - #[cfg(feature = "experimental")] - keys: Option, -} - -impl Local { - /// Create a new local storage. - pub fn new( - scheme: Scheme, - authority: Authority, - local_path: String, - path_prefix: String, - use_data_server_config: bool, - ) -> Self { - Self { - scheme, - authority, - local_path, - path_prefix, - use_data_server_config, - #[cfg(feature = "experimental")] - keys: None, - } - } - - /// Get the scheme. - pub fn scheme(&self) -> Scheme { - self.scheme - } - - /// Get the authority. - pub fn authority(&self) -> &Authority { - &self.authority - } - - /// Get the local path. - pub fn local_path(&self) -> &str { - &self.local_path - } - - /// Get the path prefix. - pub fn path_prefix(&self) -> &str { - &self.path_prefix - } - - /// Get whether config should be inherited from the data server config. - pub fn use_data_server_config(&self) -> bool { - self.use_data_server_config - } - - #[cfg(feature = "experimental")] - /// Set the C4GH keys. - pub fn set_keys(mut self, keys: Option) -> Self { - self.keys = keys; - self - } - - #[cfg(feature = "experimental")] - /// Get the C4GH keys. 
- pub fn keys(&self) -> Option<&C4GHKeys> { - self.keys.as_ref() - } -} - -impl Default for Local { - fn default() -> Self { - Self::new( - Scheme::Http, - default_authority(), - default_local_path(), - Default::default(), - false, - ) - } -} - -impl From<&DataServerConfig> for Local { - fn from(config: &DataServerConfig) -> Self { - Self::new( - config.tls().get_scheme(), - Authority::from_str(&config.addr().to_string()).expect("expected valid authority"), - config.local_path().to_string_lossy().to_string(), - config.serve_at().to_string(), - true, - ) - } -} - -#[cfg(test)] -mod tests { - use std::net::SocketAddr; - use std::path::PathBuf; - - use crate::config::cors::CorsConfig; - use crate::config::tests::test_config_from_file; - use crate::storage::Storage; - use crate::types::Scheme::Http; - - use super::*; - - #[test] - fn config_storage_local_file() { - test_config_from_file( - r#" - [[resolvers]] - regex = "regex" - - [resolvers.storage] - backend = "Local" - local_path = "path" - scheme = "HTTPS" - path_prefix = "path" - "#, - |config| { - println!("{:?}", config.resolvers().first().unwrap().storage()); - assert!(matches!( - config.resolvers().first().unwrap().storage(), - Storage::Local(local_storage) if local_storage.local_path() == "path" && local_storage.scheme() == Scheme::Https && local_storage.path_prefix() == "path" - )); - }, - ); - } - - #[test] - fn local_storage_from_data_server_config() { - let data_server_config = DataServerConfig::new( - true, - SocketAddr::from_str("127.0.0.1:8080").unwrap(), - PathBuf::from("data"), - "/data".to_string(), - None, - CorsConfig::default(), - ); - let result: Local = (&data_server_config).into(); - let expected = Local::new( - Http, - Authority::from_static("127.0.0.1:8080"), - "data".to_string(), - "/data".to_string(), - true, - ); - - assert_eq!(result.scheme(), expected.scheme()); - assert_eq!(result.authority(), expected.authority()); - assert_eq!(result.local_path(), expected.local_path()); - 
assert_eq!(result.path_prefix(), expected.path_prefix()); - assert_eq!( - result.use_data_server_config(), - expected.use_data_server_config() - ); - } -} diff --git a/htsget-http/src/service_info.rs b/htsget-http/src/service_info.rs index 5a529dea1..b8af5d2a9 100644 --- a/htsget-http/src/service_info.rs +++ b/htsget-http/src/service_info.rs @@ -1,42 +1,33 @@ +use htsget_config::config; +use htsget_config::types::Format; +use htsget_search::HtsGet; use serde::{Deserialize, Serialize}; +use serde_json::Value; +use std::collections::HashMap; use tracing::debug; use tracing::instrument; -use htsget_config::types::Format; -use htsget_search::HtsGet; - -use crate::ConfigServiceInfo; use crate::Endpoint; const READS_FORMATS: [&str; 2] = ["BAM", "CRAM"]; const VARIANTS_FORMATS: [&str; 2] = ["VCF", "BCF"]; +const HTSGET_GROUP: &str = "org.ga4gh"; +const HTSGET_ARTIFACT: &str = "htsget"; +const HTSGET_VERSION: &str = "1.3.0"; + /// A struct representing the information that should be present in a service-info response. 
#[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Default)] #[serde(rename_all = "camelCase")] pub struct ServiceInfo { - pub id: String, - pub name: String, - pub version: String, - pub organization: Organisation, + #[serde(flatten)] + pub fields: HashMap, #[serde(rename = "type")] pub service_type: Type, pub htsget: Htsget, - pub contact_url: String, - pub documentation_url: String, - pub created_at: String, - pub updated_at: String, - pub environment: String, } -#[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Default)] -#[serde(rename_all = "camelCase")] -pub struct Organisation { - pub name: String, - pub url: String, -} - -#[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Default)] +#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] #[serde(rename_all = "camelCase")] pub struct Type { pub group: String, @@ -44,6 +35,16 @@ pub struct Type { pub version: String, } +impl Default for Type { + fn default() -> Self { + Self { + group: HTSGET_GROUP.to_string(), + artifact: HTSGET_ARTIFACT.to_string(), + version: HTSGET_VERSION.to_string(), + } + } +} + #[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Default)] #[serde(rename_all = "camelCase")] pub struct Htsget { @@ -53,42 +54,37 @@ pub struct Htsget { pub tags_parameters_effective: bool, } -pub fn get_service_info_with( - endpoint: Endpoint, - supported_formats: &[Format], - fields_effective: bool, - tags_effective: bool, -) -> ServiceInfo { - let htsget_info = Htsget { - datatype: match endpoint { - Endpoint::Reads => "reads", - Endpoint::Variants => "variants", - } - .to_string(), - formats: supported_formats - .iter() - .map(|format| format.to_string()) - .filter(|format| match endpoint { - Endpoint::Reads => READS_FORMATS.contains(&format.as_str()), - Endpoint::Variants => VARIANTS_FORMATS.contains(&format.as_str()), - }) - .collect(), - fields_parameters_effective: fields_effective, - tags_parameters_effective: tags_effective, - }; +impl ServiceInfo { + pub fn new( + endpoint: 
Endpoint, + supported_formats: &[Format], + fields_effective: bool, + tags_effective: bool, + fields: HashMap, + ) -> Self { + let htsget_info = Htsget { + datatype: match endpoint { + Endpoint::Reads => "reads", + Endpoint::Variants => "variants", + } + .to_string(), + formats: supported_formats + .iter() + .map(|format| format.to_string()) + .filter(|format| match endpoint { + Endpoint::Reads => READS_FORMATS.contains(&format.as_str()), + Endpoint::Variants => VARIANTS_FORMATS.contains(&format.as_str()), + }) + .collect(), + fields_parameters_effective: fields_effective, + tags_parameters_effective: tags_effective, + }; - ServiceInfo { - id: "".to_string(), - name: "".to_string(), - version: "".to_string(), - organization: Default::default(), - service_type: Default::default(), - htsget: htsget_info, - contact_url: "".to_string(), - documentation_url: "".to_string(), - created_at: "".to_string(), - updated_at: "".to_string(), - environment: "".to_string(), + Self { + fields, + service_type: Default::default(), + htsget: htsget_info, + } } } @@ -96,55 +92,14 @@ pub fn get_service_info_with( pub fn get_service_info_json( endpoint: Endpoint, searcher: impl HtsGet + Send + Sync + 'static, - config: &ConfigServiceInfo, + config: config::service_info::ServiceInfo, ) -> ServiceInfo { debug!(endpoint = ?endpoint,"getting service-info response for endpoint"); - fill_out_service_info_json( - get_service_info_with( - endpoint, - &searcher.get_supported_formats(), - searcher.are_field_parameters_effective(), - searcher.are_tag_parameters_effective(), - ), - config, + ServiceInfo::new( + endpoint, + &searcher.get_supported_formats(), + searcher.are_field_parameters_effective(), + searcher.are_tag_parameters_effective(), + config.into_inner(), ) } - -/// Fills the service-info json with the data from the server config -fn fill_out_service_info_json( - mut service_info_json: ServiceInfo, - config: &ConfigServiceInfo, -) -> ServiceInfo { - if let Some(id) = config.id() { - 
service_info_json.id = id.to_string(); - } - if let Some(name) = config.name() { - service_info_json.name = name.to_string(); - } - if let Some(version) = config.version() { - service_info_json.version = version.to_string(); - } - if let Some(organization_name) = config.organization_name() { - service_info_json.organization.name = organization_name.to_string(); - } - if let Some(organization_url) = config.organization_url() { - service_info_json.organization.url = organization_url.to_string(); - } - if let Some(contact_url) = config.contact_url() { - service_info_json.contact_url = contact_url.to_string(); - } - if let Some(documentation_url) = config.documentation_url() { - service_info_json.documentation_url = documentation_url.to_string(); - } - if let Some(created_at) = config.created_at() { - service_info_json.created_at = created_at.to_string(); - } - if let Some(updated_at) = config.updated_at() { - service_info_json.updated_at = updated_at.to_string(); - } - if let Some(environment) = config.environment() { - service_info_json.environment = environment.to_string(); - } - - service_info_json -}