diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 815687fa17..61a535fe91 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1 +1 @@ -* @kolchfa-aws @Naarcha-AWS @vagimeli @AMoo-Miki @natebower @dlvenable @stephen-crawford @epugh +* @kolchfa-aws @Naarcha-AWS @AMoo-Miki @natebower @dlvenable @epugh diff --git a/.github/ISSUE_TEMPLATE/issue_template.md b/.github/ISSUE_TEMPLATE/issue_template.md index 0985529940..b3e03c0ffe 100644 --- a/.github/ISSUE_TEMPLATE/issue_template.md +++ b/.github/ISSUE_TEMPLATE/issue_template.md @@ -15,7 +15,6 @@ assignees: '' **Tell us about your request.** Provide a summary of the request. -***Version:** List the OpenSearch version to which this issue applies, e.g. 2.14, 2.12--2.14, or all. +**Version:** List the OpenSearch version to which this issue applies, e.g. 2.14, 2.12--2.14, or all. **What other resources are available?** Provide links to related issues, POCs, steps for testing, etc. - diff --git a/.github/vale/styles/Vocab/OpenSearch/Products/accept.txt b/.github/vale/styles/Vocab/OpenSearch/Products/accept.txt index e33ac09744..bf4c8a0a29 100644 --- a/.github/vale/styles/Vocab/OpenSearch/Products/accept.txt +++ b/.github/vale/styles/Vocab/OpenSearch/Products/accept.txt @@ -85,6 +85,7 @@ Python PyTorch Querqy Query Workbench +RankLib RCF Summarize RPM Package Manager Ruby @@ -97,4 +98,5 @@ TorchScript Tribuo VisBuilder Winlogbeat -Zstandard \ No newline at end of file +XGBoost +Zstandard diff --git a/.github/vale/styles/Vocab/OpenSearch/Words/accept.txt b/.github/vale/styles/Vocab/OpenSearch/Words/accept.txt index c6d129c2c5..0a1d2c557c 100644 --- a/.github/vale/styles/Vocab/OpenSearch/Words/accept.txt +++ b/.github/vale/styles/Vocab/OpenSearch/Words/accept.txt @@ -85,6 +85,8 @@ p\d{2} [Pp]erformant [Pp]laintext [Pp]luggable +[Pp]reaggregate(s|d)? +[Pp]recompute(s|d)? [Pp]reconfigure [Pp]refetch [Pp]refilter diff --git a/.github/workflows/jekyll-spec-insert.yml b/.github/workflows/jekyll-spec-insert.yml new file mode 100644 index 0000000000..cefd477be2 --- /dev/null +++ b/.github/workflows/jekyll-spec-insert.yml @@ -0,0 +1,20 @@ +name: Lint and Test Jekyll Spec Insert +on: + push: + paths: + - 'spec-insert/**' + pull_request: + paths: + - 'spec-insert/**' +jobs: + lint-and-test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: ruby/setup-ruby@v1 + with: { ruby-version: 3.3.0 } + - run: bundle install + - working-directory: spec-insert + run: | + bundle exec rubocop + bundle exec rspec diff --git a/.github/workflows/pr_checklist.yml b/.github/workflows/pr_checklist.yml index b56174793e..e34d0cecb2 100644 --- a/.github/workflows/pr_checklist.yml +++ b/.github/workflows/pr_checklist.yml @@ -29,7 +29,7 @@ jobs: with: script: | let assignee = context.payload.pull_request.user.login; - const prOwners = ['Naarcha-AWS', 'kolchfa-aws', 'vagimeli', 'natebower']; + const prOwners = ['Naarcha-AWS', 'kolchfa-aws', 'natebower']; if (!prOwners.includes(assignee)) { assignee = 'kolchfa-aws' @@ -40,4 +40,4 @@ jobs: owner: context.repo.owner, repo: context.repo.repo, assignees: [assignee] - }); \ No newline at end of file + }); diff --git a/.github/workflows/update-api-components.yml b/.github/workflows/update-api-components.yml new file mode 100644 index 0000000000..42cc1d2827 --- /dev/null +++ b/.github/workflows/update-api-components.yml @@ -0,0 +1,52 @@ +name: Update API Components +on: + workflow_dispatch: + schedule: + - cron: "0 0 * * 0" # Every Sunday at midnight GMT +jobs: + update-api-components: + if: ${{ github.repository == 'opensearch-project/documentation-website' }} + runs-on: ubuntu-latest + permissions: + contents: write + pull-requests: write + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + fetch-depth: 0 + + - run: git config --global pull.rebase true + + - uses: ruby/setup-ruby@v1 + with: { ruby-version: 3.3.0 } + + - run: bundle install + + - name: Download spec and insert into documentation + run: bundle exec jekyll spec-insert + + - name: Get current date + id: date + run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_ENV + + - name: GitHub App token + id: github_app_token + uses: tibdex/github-app-token@v2.1.0 + with: + app_id: ${{ secrets.APP_ID }} + private_key: ${{ secrets.APP_PRIVATE_KEY }} + + - name: Create pull request + uses: peter-evans/create-pull-request@v6 + with: + token: ${{ steps.github_app_token.outputs.token }} + commit-message: "Updated API components to reflect the latest OpenSearch API spec (${{ env.date }})" + title: "[AUTOCUT] Update API components to reflect the latest OpenSearch API spec (${{ env.date }})" + body: | + Update API components to reflect the latest [OpenSearch API spec](https://github.com/opensearch-project/opensearch-api-specification/releases/download/main-latest/opensearch-openapi.yaml). + Date: ${{ env.date }} + branch: update-api-components-${{ env.date }} + base: main + signoff: true + labels: autocut \ No newline at end of file diff --git a/.ruby-version b/.ruby-version deleted file mode 100644 index 4772543317..0000000000 --- a/.ruby-version +++ /dev/null @@ -1 +0,0 @@ -3.3.2 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8ab3c2bd4f..f6d9b87ee8 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -78,13 +78,13 @@ Follow these steps to set up your local copy of the repository: 1. Navigate to your cloned repository. -##### Building using locally installed packages +##### Building by using locally installed packages 1. Install [Ruby](https://www.ruby-lang.org/en/) if you don't already have it. We recommend [RVM](https://rvm.io/), but you can use any method you prefer: ``` curl -sSL https://get.rvm.io | bash -s stable - rvm install 3.2.4 + rvm install 3.3.2 ruby -v ``` @@ -100,7 +100,7 @@ Follow these steps to set up your local copy of the repository: bundle install ``` -##### Building using containerization +##### Building by using containerization Assuming you have `docker-compose` installed, run the following command: @@ -158,6 +158,23 @@ To ensure that our documentation adheres to the [OpenSearch Project Style Guidel Optionally, you can install the [Vale VSCode](https://github.com/chrischinchilla/vale-vscode) extension, which integrates Vale with Visual Studio Code. By default, only _errors_ and _warnings_ are underlined. To change the minimum alert level to include _suggestions_, go to **Vale VSCode** > **Extension Settings** and select **suggestion** in the **Vale > Vale CLI: Min Alert Level** dropdown list. +## Troubleshooting + +This section provides information about potential solutions for known issues. + +### Installing Ruby on an Apple silicon machine + +If you're having trouble installing Ruby with `rvm` on an Apple silicon machine, it could be because of an OpenSSL version misalignment. To fix this issue, use the following command, replacing `` with your [desired version](https://github.com/ruby/openssl/blob/master/README.md): + +``` +# Assumes Brew is installed +curl -sSL https://get.rvm.io | bash -s stable +rvm install 3.2.4 --with-openssl-dir=$(brew --prefix openssl@) +ruby -v +``` + ## Getting help For help with the contribution process, reach out to one of the [points of contact](README.md#points-of-contact). + + diff --git a/DEVELOPER_GUIDE.md b/DEVELOPER_GUIDE.md new file mode 100644 index 0000000000..9b0ec1c79d --- /dev/null +++ b/DEVELOPER_GUIDE.md @@ -0,0 +1,135 @@ +# Developer guide +- [Introduction](#introduction) +- [Starting the Jekyll server locally](#starting-the-jekyll-server-locally) +- [Using the spec-insert Jekyll plugin](#using-the-spec-insert-jekyll-plugin) + - [Ignoring files and folders](#ignoring-files-and-folders) +- [CI/CD](#cicd) +- [Spec insert components](#spec-insert-components) + - [Query parameters](#query-parameters) + - [Path parameters](#path-parameters) + - [Paths and HTTP methods](#paths-and-http-methods) + +## Introduction + +The `.md` documents in this repository are rendered into HTML pages using [Jekyll](https://jekyllrb.com/). These HTML pages are hosted on [opensearch.org](https://opensearch.org/docs/latest/). + +## Starting the Jekyll server locally +You can run the Jekyll server locally to view the rendered HTML pages using the following steps: + +1. Install [Ruby](https://www.ruby-lang.org/en/documentation/installation/) 3.1.0 or later for your operating system. +2. Install the required gems by running `bundle install`. +3. Run `bundle exec jekyll serve` to start the Jekyll server locally (this can take several minutes to complete). +4. Open your browser and navigate to `http://localhost:4000` to view the rendered HTML pages. + +## Using the `spec-insert` Jekyll plugin +The `spec-insert` Jekyll plugin is used to insert API components into Markdown files. The plugin downloads the [latest OpenSearch specification](https://github.com/opensearch-project/opensearch-api-specification) and renders the API components from the spec. This aims to reduce the manual effort required to keep the documentation up to date. + +To use this plugin, make sure that you have installed Ruby 3.1.0 or later and the required gems by running `bundle install`. + +Edit your Markdown file and insert the following snippet where you want render an API component: + +```markdown + + +This is where the API component will be inserted. +Everything between the `spec_insert_start` and `spec_insert_end` tags will be overwritten. + + +``` + +Then run the following Jekyll command to render the API components: +```shell +bundle exec jekyll spec-insert +``` + +If you are working on multiple Markdown files and do not want to keep running the `jekyll spec-insert` command, you can add the `--watch` (or `-W`) flag to the command to watch for changes in the Markdown files and automatically render the API components: + +```shell +bundle exec jekyll spec-insert --watch +``` + +Depending on the text editor you are using, you may need to manually reload the file from disk to see the changes applied by the plugin if the editor does not automatically reload the file periodically. + +The plugin will pull the newest OpenSearch API spec from its [repository](https://github.com/opensearch-project/opensearch-api-specification) if the spec file does not exist locally or if it is older than 24 hours. To tell the plugin to always pull the newest spec, you can add the `--refresh-spec` (or `-R`) flag to the command: + +```shell +bundle exec jekyll spec-insert --refresh-spec +``` + +### Ignoring files and folders +The `spec-insert` plugin ignores all files and folders listed in the [./_config.yml#exclude](./_config.yml) list, which is also the list of files and folders that Jekyll ignores. + +## CI/CD +The `spec-insert` plugin is run as part of the CI/CD pipeline to ensure that the API components are up to date in the documentation. This is performed through the [update-api-components.yml](.github/workflows/update-api-components.yml) GitHub Actions workflow, which creates a pull request containing the updated API components every Sunday. + +## Spec insert components +All spec insert components accept the following arguments: +- `api` (String; required): The name of the API to render the component from. This is equivalent to the `x-operation-group` field in the OpenSearch OpenAPI Spec. +- `component` (String; required): The name of the component to render, such as `query_parameters`, `path_parameters`, or `paths_and_http_methods`. +- `omit_header` (Boolean; Default is `false`): If set to `true`, the markdown header of the component will not be rendered. + +### Paths and HTTP methods +To insert paths and HTTP methods for the `search` API, use the following snippet: +```markdown + + +``` + +### Path parameters + +To insert a path parameters table of the `indices.create` API, use the following snippet. Use the `x-operation-group` field from OpenSearch OpenAPI Spec for the `api` value: + +```markdown + + +``` +This table accepts the same arguments as the query parameters table except the `include_global` argument. + +### Query parameters +To insert the API query parameters table of the `cat.indices` API, use the following snippet: +```markdown + + +``` + +This will insert the query parameters of the `cat.indices` API into the `.md` file with three default columns: `Parameter`, `Type`, and `Description`. You can customize the query parameters table by adding the `columns` argument which accepts a comma-separated list of column names. The available column names are: + +- `Parameter` +- `Type` +- `Description` +- `Required` +- `Default` + +_When `Required`/`Default` is not chosen, the information will be written in the `Description` column._ + +You can also customize this component with the following settings: + +- `include_global` (Boolean; default is `false`): Includes global query parameters in the table. +- `include_deprecated` (Boolean; default is `true`): Includes deprecated parameters in the table. +- `pretty` (Boolean; default is `false`): Renders the table in the pretty format instead of the compact format. + +The following snippet inserts the specified columns into the query parameters table: + +```markdown + + +``` diff --git a/Gemfile b/Gemfile index 7825dcd02b..fee04f3c48 100644 --- a/Gemfile +++ b/Gemfile @@ -1,4 +1,9 @@ -source "http://rubygems.org" +# frozen_string_literal: true + +source 'https://rubygems.org' + +# Manually add csv gem since Ruby 3.4.0 no longer includes it +gem 'csv', '~> 3.0' # Hello! This is where you manage which Jekyll version is used to run. # When you want to use a different version, change it below, save the @@ -8,12 +13,12 @@ source "http://rubygems.org" # # This will help ensure the proper Jekyll version is running. # Happy Jekylling! -gem "jekyll", "~> 4.3.2" +gem 'jekyll', '~> 4.3.2' # This is the default theme for new Jekyll sites. You may change this to anything you like. -gem "just-the-docs", "~> 0.3.3" -gem "jekyll-remote-theme", "~> 0.4" -gem "jekyll-redirect-from", "~> 0.16" +gem 'jekyll-redirect-from', '~> 0.16' +gem 'jekyll-remote-theme', '~> 0.4' +gem 'just-the-docs', '~> 0.3.3' # If you want to use GitHub Pages, remove the "gem "jekyll"" above and # uncomment the line below. To upgrade, run `bundle update github-pages`. @@ -22,21 +27,31 @@ gem "jekyll-redirect-from", "~> 0.16" # If you have any plugins, put them here! group :jekyll_plugins do - gem "jekyll-last-modified-at" - gem "jekyll-sitemap" + gem 'jekyll-last-modified-at' + gem 'jekyll-sitemap' + gem 'jekyll-spec-insert', :path => './spec-insert' end # Windows does not include zoneinfo files, so bundle the tzinfo-data gem -gem "tzinfo-data", platforms: [:mingw, :mswin, :x64_mingw, :jruby] +gem 'tzinfo-data', platforms: %i[mingw mswin x64_mingw jruby] # Performance-booster for watching directories on Windows -gem "wdm", "~> 0.1.0" if Gem.win_platform? +gem 'wdm', '~> 0.1.0' if Gem.win_platform? # Installs webrick dependency for building locally -gem "webrick", "~> 1.7" - +gem 'webrick', '~> 1.7' # Link checker -gem "typhoeus" -gem "ruby-link-checker" -gem "ruby-enum" +gem 'ruby-enum' +gem 'ruby-link-checker' +gem 'typhoeus' + +# Spec Insert +gem 'activesupport', '~> 7' +gem 'mustache', '~> 1' + +group :development, :test do + gem 'rspec' + gem 'rubocop', '~> 1.44', require: false + gem 'rubocop-rake', require: false +end diff --git a/MAINTAINERS.md b/MAINTAINERS.md index 55b908e027..b06d367e21 100644 --- a/MAINTAINERS.md +++ b/MAINTAINERS.md @@ -9,14 +9,14 @@ This document lists the maintainers in this repo. See [opensearch-project/.githu | Fanit Kolchina | [kolchfa-aws](https://github.com/kolchfa-aws) | Amazon | | Nate Archer | [Naarcha-AWS](https://github.com/Naarcha-AWS) | Amazon | | Nathan Bower | [natebower](https://github.com/natebower) | Amazon | -| Melissa Vagi | [vagimeli](https://github.com/vagimeli) | Amazon | | Miki Barahmand | [AMoo-Miki](https://github.com/AMoo-Miki) | Amazon | | David Venable | [dlvenable](https://github.com/dlvenable) | Amazon | -| Stephen Crawford | [stephen-crawford](https://github.com/stephen-crawford) | Amazon | | Eric Pugh | [epugh](https://github.com/epugh) | OpenSource Connections | ## Emeritus -| Maintainer | GitHub ID | Affiliation | -| ---------------- | ----------------------------------------------- | ----------- | -| Heather Halter | [hdhalter](https://github.com/hdhalter) | Amazon | +| Maintainer | GitHub ID | Affiliation | +| ---------------- | ------------------------------------------------------- | ----------- | +| Heather Halter | [hdhalter](https://github.com/hdhalter) | Amazon | +| Melissa Vagi | [vagimeli](https://github.com/vagimeli) | Amazon | +| Stephen Crawford | [stephen-crawford](https://github.com/stephen-crawford) | Amazon | \ No newline at end of file diff --git a/README.md b/README.md index 66beb1948c..807e106309 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,7 @@ # About the OpenSearch documentation repo The `documentation-website` repository contains the user documentation for OpenSearch. You can find the rendered documentation at [opensearch.org/docs](https://opensearch.org/docs). +The markdown files in this repository are rendered into HTML pages using [Jekyll](https://jekyllrb.com/). Check the [DEVELOPER_GUIDE](DEVELOPER_GUIDE.md) for more information about how to use Jekyll for this repository. ## Contributing @@ -23,7 +24,6 @@ If you encounter problems or have questions when contributing to the documentati - [kolchfa-aws](https://github.com/kolchfa-aws) - [Naarcha-AWS](https://github.com/Naarcha-AWS) -- [vagimeli](https://github.com/vagimeli) ## Code of conduct diff --git a/TERMS.md b/TERMS.md index 7de56f9275..146f3c1049 100644 --- a/TERMS.md +++ b/TERMS.md @@ -588,6 +588,8 @@ Use % in headlines, quotations, and tables or in technical copy. An agent and REST API that allows you to query numerous performance metrics for your cluster, including aggregations of those metrics, independent of the Java Virtual Machine (JVM). +**performant** + **plaintext, plain text** Use *plaintext* only to refer to nonencrypted or decrypted text in content about encryption. Use *plain text* to refer to ASCII files. @@ -602,6 +604,10 @@ Tools inside of OpenSearch that can be customized to enhance OpenSearch's functi **pop-up** +**preaggregate** + +**precompute** + **premise, premises** With reference to property and buildings, always form as plural. diff --git a/_about/version-history.md b/_about/version-history.md index b8b9e99309..bfbd8e9f55 100644 --- a/_about/version-history.md +++ b/_about/version-history.md @@ -9,6 +9,7 @@ permalink: /version-history/ OpenSearch version | Release highlights | Release date :--- | :--- | :--- +[2.18.0](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.18.0.md) | Adds a redesigned home page, updated Discover interface, and collaborative workspaces to OpenSearch Dashboards. Includes improvements to ML inference processor and query grouping. Introduces reranking by field and paginated CAT APIs. Includes experimental OpenSearch Dashboards Assistant capabilities. For a full list of release highlights, see the Release Notes. | 05 November 2024 [2.17.1](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.17.1.md) | Includes bug fixes for ML Commons, anomaly detection, k-NN, and security analytics. Adds various infrastructure and maintenance updates. For a full list of release highlights, see the Release Notes. | 1 October 2024 [2.17.0](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.17.0.md) | Includes disk-optimized vector search, binary quantization, and byte vector encoding in k-NN. Adds asynchronous batch ingestion for ML tasks. Provides search and query performance enhancements and a new custom trace source in trace analytics. Includes application-based configuration templates. For a full list of release highlights, see the Release Notes. | 17 September 2024 [2.16.0](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.16.0.md) | Includes built-in byte vector quantization and binary vector support in k-NN. Adds new sort, split, and ML inference search processors for search pipelines. Provides application-based configuration templates and additional plugins to integrate multiple data sources in OpenSearch Dashboards. Includes an experimental Batch Predict ML Commons API. For a full list of release highlights, see the Release Notes. | 06 August 2024 diff --git a/_aggregations/bucket/auto-interval-date-histogram.md b/_aggregations/bucket/auto-interval-date-histogram.md new file mode 100644 index 0000000000..b7a95a3b89 --- /dev/null +++ b/_aggregations/bucket/auto-interval-date-histogram.md @@ -0,0 +1,377 @@ +--- +layout: default +title: Auto-interval date histogram +parent: Bucket aggregations +grand_parent: Aggregations +nav_order: 12 +--- + +# Auto-interval date histogram + +Similar to the [date histogram aggregation]({{site.url}}{{site.baseurl}}/aggregations/bucket/date-histogram/), in which you must specify an interval, the `auto_date_histogram` is a multi-bucket aggregation that automatically creates date histogram buckets based on the number of buckets you provide and the time range of your data. The actual number of buckets returned is always less than or equal to the number of buckets you specify. This aggregation is particularly useful when you are working with time-series data and want to visualize or analyze data over different time intervals without manually specifying the interval size. + +## Intervals + +The bucket interval is chosen based on the collected data to ensure that the number of returned buckets is less than or equal to the requested number. + +The following table lists the possible returned intervals for each time unit. + +| Unit | Intervals | +| :--- | :---| +| Seconds| Multiples of 1, 5, 10, and 30 | +| Minutes| Multiples of 1, 5, 10, and 30 | +| Hours | Multiples of 1, 3, and 12 | +| Days | Multiples of 1 and 7 | +| Months | Multiples of 1 and 3 | +| Years | Multiples of 1, 5, 10, 20, 50, and 100 | + +If an aggregation returns too many buckets (for example, daily buckets), OpenSearch will automatically reduce the number of buckets to ensure a manageable result. Instead of returning the exact number of requested daily buckets, it will reduce them by a factor of about 1/7. For example, if you ask for 70 buckets but the data contains too many daily intervals, OpenSearch might return only 10 buckets, grouping the data into larger intervals (such as weeks) to avoid an overwhelming number of results. This helps optimize the aggregation and prevent excessive detail when too much data is available. + +## Example + +In the following example, you'll search an index containing blog posts. + +First, create a mapping for this index and specify the `date_posted` field as the `date` type: + +```json +PUT blogs +{ + "mappings" : { + "properties" : { + "date_posted" : { + "type" : "date", + "format" : "yyyy-MM-dd" + } + } + } +} +``` +{% include copy-curl.html %} + +Next, index the following documents into the `blogs` index: + +```json +PUT blogs/_doc/1 +{ + "name": "Semantic search in OpenSearch", + "date_posted": "2022-04-17" +} +``` +{% include copy-curl.html %} + +```json +PUT blogs/_doc/2 +{ + "name": "Sparse search in OpenSearch", + "date_posted": "2022-05-02" +} +``` +{% include copy-curl.html %} + +```json +PUT blogs/_doc/3 +{ + "name": "Distributed tracing with Data Prepper", + "date_posted": "2022-04-25" +} +``` +{% include copy-curl.html %} + +```json +PUT blogs/_doc/4 +{ + "name": "Observability in OpenSearch", + "date_posted": "2023-03-23" +} + +``` +{% include copy-curl.html %} + +To use the `auto_date_histogram` aggregation, specify the field containing the date or timestamp values. For example, to aggregate blog posts by `date_posted` into two buckets, send the following request: + +```json +GET /blogs/_search +{ + "size": 0, + "aggs": { + "histogram": { + "auto_date_histogram": { + "field": "date_posted", + "buckets": 2 + } + } + } +} +``` +{% include copy-curl.html %} + +The response shows that the blog posts were aggregated into two buckets. The interval was automatically set to 1 year, with all three 2022 blog posts collected in one bucket and the 2023 blog post in another: + +```json +{ + "took": 20, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 4, + "relation": "eq" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "histogram": { + "buckets": [ + { + "key_as_string": "2022-01-01", + "key": 1640995200000, + "doc_count": 3 + }, + { + "key_as_string": "2023-01-01", + "key": 1672531200000, + "doc_count": 1 + } + ], + "interval": "1y" + } + } +} +``` + +## Returned buckets + +Each bucket contains the following information: + +```json +{ + "key_as_string": "2023-01-01", + "key": 1672531200000, + "doc_count": 1 +} +``` + +In OpenSearch, dates are internally stored as 64-bit integers representing timestamps in milliseconds since the epoch. In the aggregation response, each bucket `key` is returned as such a timestamp. The `key_as_string` value shows the same timestamp but formatted as a date string based on the [`format`](#date-format) parameter. The `doc_count` field contains the number of documents in the bucket. + +## Parameters + +Auto-interval date histogram aggregations accept the following parameters. + +Parameter | Data type | Description +:--- | :--- | :--- +`field` | String | The field on which to aggregate. The field must contain the date or timestamp values. Either `field` or `script` is required. +`buckets` | Integer | The desired number of buckets. The returned number of buckets is less than or equal to the desired number. Optional. Default is `10`. +`minimum_interval` | String | The minimum interval to be used. Specifying a minimum interval can make the aggregation process more efficient. Valid values are `year`, `month`, `day`, `hour`, `minute`, and `second`. Optional. +`time_zone` | String | Specifies to use a time zone other than the default (UTC) for bucketing and rounding. You can specify the `time_zone` parameter as a [UTC offset](https://en.wikipedia.org/wiki/UTC_offset), such as `-04:00`, or an [IANA time zone ID](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones), such as `America/New_York`. Optional. Default is `UTC`. For more information, see [Time zone](#time-zone). +`format` | String | The format for returning dates representing bucket keys. Optional. Default is the format specified in the field mapping. For more information, see [Date format](#date-format). +`script` | String | A document-level or value-level script for aggregating values into buckets. Either `field` or `script` is required. +`missing` | String | Specifies how to handle documents in which the field value is missing. By default, such documents are ignored. If you specify a date value in the `missing` parameter, all documents in which the field value is missing are collected into the bucket with the specified date. + +## Date format + +If you don't specify the `format` parameter, the format defined in the field mapping is used (as seen in the preceding response). To modify the format, specify the `format` parameter: + +```json +GET /blogs/_search +{ + "size": 0, + "aggs": { + "histogram": { + "auto_date_histogram": { + "field": "date_posted", + "format": "yyyy-MM-dd HH:mm:ss" + } + } + } +} +``` +{% include copy-curl.html %} + +The `key_as_string` field is now returned in the specified format: + +```json +{ + "key_as_string": "2023-01-01 00:00:00", + "key": 1672531200000, + "doc_count": 1 +} +``` + +Alternatively, you can specify one of the built-in date [formats]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/date/#formats): + +```json +GET /blogs/_search +{ + "size": 0, + "aggs": { + "histogram": { + "auto_date_histogram": { + "field": "date_posted", + "format": "basic_date_time_no_millis" + } + } + } +} +``` +{% include copy-curl.html %} + +The `key_as_string` field is now returned in the specified format: + +```json +{ + "key_as_string": "20230101T000000Z", + "key": 1672531200000, + "doc_count": 1 +} +``` + +## Time zone + +By default, dates are stored and processed in UTC. The `time_zone` parameter allows you to specify a different time zone for bucketing. You can specify the `time_zone` parameter as a [UTC offset](https://en.wikipedia.org/wiki/UTC_offset), such as `-04:00`, or an [IANA time zone ID](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones), such as `America/New_York`. + +As an example, index the following documents into an index: + +```json +PUT blogs1/_doc/1 +{ + "name": "Semantic search in OpenSearch", + "date_posted": "2022-04-17T01:00:00.000Z" +} +``` +{% include copy-curl.html %} + +```json +PUT blogs1/_doc/2 +{ + "name": "Sparse search in OpenSearch", + "date_posted": "2022-04-17T04:00:00.000Z" +} +``` +{% include copy-curl.html %} + +First, run an aggregation without specifying a time zone: + +```json +GET /blogs1/_search +{ + "size": 0, + "aggs": { + "histogram": { + "auto_date_histogram": { + "field": "date_posted", + "buckets": 2, + "format": "yyyy-MM-dd HH:mm:ss" + } + } + } +} +``` +{% include copy-curl.html %} + +The response contains two 3-hour buckets, starting at midnight UTC on April 17, 2022: + +```json +{ + "took": 6, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 2, + "relation": "eq" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "histogram": { + "buckets": [ + { + "key_as_string": "2022-04-17 01:00:00", + "key": 1650157200000, + "doc_count": 1 + }, + { + "key_as_string": "2022-04-17 04:00:00", + "key": 1650168000000, + "doc_count": 1 + } + ], + "interval": "3h" + } + } +} +``` + +Now, specify a `time_zone` of `-02:00`: + +```json +GET /blogs1/_search +{ + "size": 0, + "aggs": { + "histogram": { + "auto_date_histogram": { + "field": "date_posted", + "buckets": 2, + "format": "yyyy-MM-dd HH:mm:ss", + "time_zone": "-02:00" + } + } + } +} +``` + +The response contains two buckets in which the start time is shifted by 2 hours and starts at 23:00 on April 16, 2022: + +```json +{ + "took": 17, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 2, + "relation": "eq" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "histogram": { + "buckets": [ + { + "key_as_string": "2022-04-16 23:00:00", + "key": 1650157200000, + "doc_count": 1 + }, + { + "key_as_string": "2022-04-17 02:00:00", + "key": 1650168000000, + "doc_count": 1 + } + ], + "interval": "3h" + } + } +} +``` + +When using time zones with daylight saving time (DST) changes, the sizes of buckets that are near the transition may differ slightly from the sizes of neighboring buckets. +{: .note} diff --git a/_aggregations/bucket/children.md b/_aggregations/bucket/children.md new file mode 100644 index 0000000000..1f493c4620 --- /dev/null +++ b/_aggregations/bucket/children.md @@ -0,0 +1,172 @@ +--- +layout: default +title: Children +parent: Bucket aggregations +grand_parent: Aggregations +nav_order: 15 +--- + +# Children + +The `children` aggregation connects parent documents with their related child documents. This allows you to analyze relationships between different types of data in a single query, rather than needing to run multiple queries and combine the results manually. + +--- + +## Example index, sample data, and children aggregation query + +For example, if you have a parent-child relationship between authors, posts, and comments, you can analyze the relationships between the different data types (`authors`, `posts`, and `comments`) in a single query. + +The `authors` aggregation groups the documents by the `author.keyword` field. This allows you to see the number of documents associates with each author. + +In each author group, the `children` aggregation retrieves the associated posts. This gives you a breakdown of the posts written by each author. + +In the `posts` aggregation, another `children` aggregation fetches the comments associated with each post. This provides you a way to see the comments for each individual post. + +In the `comments` aggregation, the `value_count` aggregation counts the number of comments on each post. This allows you to gauge the engagement level for each post by seeing the number of comments it has received. + +#### Example index + +```json +PUT /blog-sample +{ + "mappings": { + "properties": { + "type": { "type": "keyword" }, + "name": { "type": "keyword" }, + "title": { "type": "text" }, + "content": { "type": "text" }, + "author": { "type": "keyword" }, + "post_id": { "type": "keyword" }, + "join_field": { + "type": "join", + "relations": { + "author": "post", + "post": "comment" + } + } + } + } +} +``` +{% include copy-curl.html %} + +#### Sample documents + +```json +POST /blog-sample/_doc/1?routing=1 +{ + "type": "author", + "name": "John Doe", + "join_field": "author" +} + +POST /blog-sample/_doc/2?routing=1 +{ + "type": "post", + "title": "Introduction to OpenSearch", + "content": "OpenSearch is a powerful search and analytics engine...", + "author": "John Doe", + "join_field": { + "name": "post", + "parent": "1" + } +} + +POST /blog-sample/_doc/3?routing=1 +{ + "type": "comment", + "content": "Great article! Very informative.", + "join_field": { + "name": "comment", + "parent": "2" + } +} + +POST /blog-sample/_doc/4?routing=1 +{ + "type": "comment", + "content": "Thanks for the clear explanation.", + "join_field": { + "name": "comment", + "parent": "2" + } +} +``` +{% include copy-curl.html %} + +#### Example children aggregation query + +```json +GET /blog-sample/_search +{ + "size": 0, + "aggs": { + "authors": { + "terms": { + "field": "name.keyword" + }, + "aggs": { + "posts": { + "children": { + "type": "post" + }, + "aggs": { + "post_titles": { + "terms": { + "field": "title.keyword" + }, + "aggs": { + "comments": { + "children": { + "type": "comment" + }, + "aggs": { + "comment_count": { + "value_count": { + "field": "_id" + } + } + } + } + } + } + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +#### Example response + +The response should appear similar to the following example: + +```json +{ + "took": 30, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 4, + "relation": "eq" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "authors": { + "doc_count_error_upper_bound": 0, + "sum_other_doc_count": 0, + "buckets": [] + } + } +} +``` diff --git a/_aggregations/bucket/index.md b/_aggregations/bucket/index.md index 1658c06ea5..e1a02b890d 100644 --- a/_aggregations/bucket/index.md +++ b/_aggregations/bucket/index.md @@ -22,6 +22,7 @@ You can use bucket aggregations to implement faceted navigation (usually placed OpenSearch supports the following bucket aggregations: - [Adjacency matrix]({{site.url}}{{site.baseurl}}/aggregations/bucket/adjacency-matrix/) +- [Children]({{site.url}}{{site.baseurl}}/aggregations/bucket/children) - [Date histogram]({{site.url}}{{site.baseurl}}/aggregations/bucket/date-histogram/) - [Date range]({{site.url}}{{site.baseurl}}/aggregations/bucket/date-range/) - [Diversified sampler]({{site.url}}{{site.baseurl}}/aggregations/bucket/diversified-sampler/) diff --git a/_aggregations/bucket/nested.md b/_aggregations/bucket/nested.md index 89c44c6457..affda8e437 100644 --- a/_aggregations/bucket/nested.md +++ b/_aggregations/bucket/nested.md @@ -96,8 +96,8 @@ GET logs/_search "aggregations" : { "pages" : { "doc_count" : 2, - "min_price" : { - "value" : 200.0 + "min_load_time" : { + "value" : 200 } } } diff --git a/_analyzers/character-filters/html-character-filter.md b/_analyzers/character-filters/html-character-filter.md new file mode 100644 index 0000000000..ef55930bdf --- /dev/null +++ b/_analyzers/character-filters/html-character-filter.md @@ -0,0 +1,124 @@ +--- +layout: default +title: HTML strip +parent: Character filters +nav_order: 100 +--- + +# HTML strip character filter + +The `html_strip` character filter removes HTML tags, such as `
`, `

`, and ``, from the input text and renders plain text. The filter can be configured to preserve certain tags or decode specific HTML entities, such as ` `, into spaces. + +## Example: HTML analyzer + +```json +GET /_analyze +{ + "tokenizer": "keyword", + "char_filter": [ + "html_strip" + ], + "text": "

Commonly used calculus symbols include α, β and θ

" +} +``` +{% include copy-curl.html %} + +Using the HTML analyzer, you can convert the HTML character entity references into their corresponding symbols. The processed text would read as follows: + +``` +Commonly used calculus symbols include α, β and θ +``` + +## Example: Custom analyzer with lowercase filter + +The following example query creates a custom analyzer that strips HTML tags and converts the plain text to lowercase by using the `html_strip` analyzer and `lowercase` filter: + +```json +PUT /html_strip_and_lowercase_analyzer +{ + "settings": { + "analysis": { + "char_filter": { + "html_filter": { + "type": "html_strip" + } + }, + "analyzer": { + "html_strip_analyzer": { + "type": "custom", + "char_filter": ["html_filter"], + "tokenizer": "standard", + "filter": ["lowercase"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +### Testing `html_strip_and_lowercase_analyzer` + +You can run the following request to test the analyzer: + +```json +GET /html_strip_and_lowercase_analyzer/_analyze +{ + "analyzer": "html_strip_analyzer", + "text": "

Welcome to OpenSearch!

" +} +``` +{% include copy-curl.html %} + +In the response, the HTML tags have been removed and the plain text has been converted to lowercase: + +``` +welcome to opensearch! +``` + +## Example: Custom analyzer that preserves HTML tags + +The following example request creates a custom analyzer that preserves HTML tags: + +```json +PUT /html_strip_preserve_analyzer +{ + "settings": { + "analysis": { + "char_filter": { + "html_filter": { + "type": "html_strip", + "escaped_tags": ["b", "i"] + } + }, + "analyzer": { + "html_strip_analyzer": { + "type": "custom", + "char_filter": ["html_filter"], + "tokenizer": "keyword" + } + } + } + } +} +``` +{% include copy-curl.html %} + +### Testing `html_strip_preserve_analyzer` + +You can run the following request to test the analyzer: + +```json +GET /html_strip_preserve_analyzer/_analyze +{ + "analyzer": "html_strip_analyzer", + "text": "

This is a bold and italic text.

" +} +``` +{% include copy-curl.html %} + +In the response, the `italic` and `bold` tags have been retained, as specified in the custom analyzer request: + +``` +This is a bold and italic text. +``` diff --git a/_analyzers/character-filters/index.md b/_analyzers/character-filters/index.md new file mode 100644 index 0000000000..0e2ce01b8c --- /dev/null +++ b/_analyzers/character-filters/index.md @@ -0,0 +1,19 @@ +--- +layout: default +title: Character filters +nav_order: 90 +has_children: true +has_toc: false +--- + +# Character filters + +Character filters process text before tokenization to prepare it for further analysis. + +Unlike token filters, which operate on tokens (words or terms), character filters process the raw input text before tokenization. They are especially useful for cleaning or transforming structured text containing unwanted characters, such as HTML tags or special symbols. Character filters help to strip or replace these elements so that text is properly formatted for analysis. + +Use cases for character filters include: + +- **HTML stripping:** Removes HTML tags from content so that only the plain text is indexed. +- **Pattern replacement:** Replaces or removes unwanted characters or patterns in text, for example, converting hyphens to spaces. +- **Custom mappings:** Substitutes specific characters or sequences with other values, for example, to convert currency symbols into their textual equivalents. diff --git a/_analyzers/custom-analyzer.md b/_analyzers/custom-analyzer.md new file mode 100644 index 0000000000..b808268f66 --- /dev/null +++ b/_analyzers/custom-analyzer.md @@ -0,0 +1,312 @@ +--- +layout: default +title: Creating a custom analyzer +nav_order: 90 +parent: Analyzers +--- + +# Creating a custom analyzer + +To create a custom analyzer, specify a combination of the following components: + +- Character filters (zero or more) + +- Tokenizer (one) + +- Token filters (zero or more) + +## Configuration + +The following parameters can be used to configure a custom analyzer. + +| Parameter | Required/Optional | Description | +|:--- | :--- | :--- | +| `type` | Optional | The analyzer type. Default is `custom`. You can also specify a prebuilt analyzer using this parameter. | +| `tokenizer` | Required | A tokenizer to be included in the analyzer. | +| `char_filter` | Optional | A list of character filters to be included in the analyzer. | +| `filter` | Optional | A list of token filters to be included in the analyzer. | +| `position_increment_gap` | Optional | The extra spacing applied between values when indexing text fields that have multiple values. For more information, see [Position increment gap](#position-increment-gap). Default is `100`. | + +## Examples + +The following examples demonstrate various custom analyzer configurations. + +### Custom analyzer with a character filter for HTML stripping + +The following example analyzer removes HTML tags from text before tokenization: + +```json +PUT simple_html_strip_analyzer_index +{ + "settings": { + "analysis": { + "analyzer": { + "html_strip_analyzer": { + "type": "custom", + "char_filter": ["html_strip"], + "tokenizer": "whitespace", + "filter": ["lowercase"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET simple_html_strip_analyzer_index/_analyze +{ + "analyzer": "html_strip_analyzer", + "text": "

OpenSearch is awesome!

" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "opensearch", + "start_offset": 3, + "end_offset": 13, + "type": "word", + "position": 0 + }, + { + "token": "is", + "start_offset": 14, + "end_offset": 16, + "type": "word", + "position": 1 + }, + { + "token": "awesome!", + "start_offset": 25, + "end_offset": 42, + "type": "word", + "position": 2 + } + ] +} +``` + +### Custom analyzer with a mapping character filter for synonym replacement + +The following example analyzer replaces specific characters and patterns before applying the synonym filter: + +```json +PUT mapping_analyzer_index +{ + "settings": { + "analysis": { + "analyzer": { + "synonym_mapping_analyzer": { + "type": "custom", + "char_filter": ["underscore_to_space"], + "tokenizer": "standard", + "filter": ["lowercase", "stop", "synonym_filter"] + } + }, + "char_filter": { + "underscore_to_space": { + "type": "mapping", + "mappings": ["_ => ' '"] + } + }, + "filter": { + "synonym_filter": { + "type": "synonym", + "synonyms": [ + "quick, fast, speedy", + "big, large, huge" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET mapping_analyzer_index/_analyze +{ + "analyzer": "synonym_mapping_analyzer", + "text": "The slow_green_turtle is very large" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "slow","start_offset": 4,"end_offset": 8,"type": "","position": 1}, + {"token": "green","start_offset": 9,"end_offset": 14,"type": "","position": 2}, + {"token": "turtle","start_offset": 15,"end_offset": 21,"type": "","position": 3}, + {"token": "very","start_offset": 25,"end_offset": 29,"type": "","position": 5}, + {"token": "large","start_offset": 30,"end_offset": 35,"type": "","position": 6}, + {"token": "big","start_offset": 30,"end_offset": 35,"type": "SYNONYM","position": 6}, + {"token": "huge","start_offset": 30,"end_offset": 35,"type": "SYNONYM","position": 6} + ] +} +``` + +### Custom analyzer with a custom pattern-based character filter for number normalization + +The following example analyzer normalizes phone numbers by removing dashes and spaces and applies edge n-grams to the normalized text to support partial matches: + +```json +PUT advanced_pattern_replace_analyzer_index +{ + "settings": { + "analysis": { + "analyzer": { + "phone_number_analyzer": { + "type": "custom", + "char_filter": ["phone_normalization"], + "tokenizer": "standard", + "filter": ["lowercase", "edge_ngram"] + } + }, + "char_filter": { + "phone_normalization": { + "type": "pattern_replace", + "pattern": "[-\\s]", + "replacement": "" + } + }, + "filter": { + "edge_ngram": { + "type": "edge_ngram", + "min_gram": 3, + "max_gram": 10 + } + } + } + } +} +``` +{% include copy-curl.html %} + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET advanced_pattern_replace_analyzer_index/_analyze +{ + "analyzer": "phone_number_analyzer", + "text": "123-456 7890" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "123","start_offset": 0,"end_offset": 12,"type": "","position": 0}, + {"token": "1234","start_offset": 0,"end_offset": 12,"type": "","position": 0}, + {"token": "12345","start_offset": 0,"end_offset": 12,"type": "","position": 0}, + {"token": "123456","start_offset": 0,"end_offset": 12,"type": "","position": 0}, + {"token": "1234567","start_offset": 0,"end_offset": 12,"type": "","position": 0}, + {"token": "12345678","start_offset": 0,"end_offset": 12,"type": "","position": 0}, + {"token": "123456789","start_offset": 0,"end_offset": 12,"type": "","position": 0}, + {"token": "1234567890","start_offset": 0,"end_offset": 12,"type": "","position": 0} + ] +} +``` + +## Position increment gap + +The `position_increment_gap` parameter sets a positional gap between terms when indexing multi-valued fields, such as arrays. This gap ensures that phrase queries don't match terms across separate values unless explicitly allowed. For example, a default gap of 100 specifies that terms in different array entries are 100 positions apart, preventing unintended matches in phrase searches. You can adjust this value or set it to `0` in order to allow phrases to span across array values. + +The following example demonstrates the effect of `position_increment_gap` using a `match_phrase` query. + +1. Index a document in a `test-index`: + + ```json + PUT test-index/_doc/1 + { + "names": [ "Slow green", "turtle swims"] + } + ``` + {% include copy-curl.html %} + +1. Query the document using a `match_phrase` query: + + ```json + GET test-index/_search + { + "query": { + "match_phrase": { + "names": { + "query": "green turtle" + } + } + } + } + ``` + {% include copy-curl.html %} + + The response returns no hits because the distance between the terms `green` and `turtle` is `100` (the default `position_increment_gap`). + +1. Now query the document using a `match_phrase` query with a `slop` parameter that is higher than the `position_increment_gap`: + + ```json + GET test-index/_search + { + "query": { + "match_phrase": { + "names": { + "query": "green turtle", + "slop": 101 + } + } + } + } + ``` + {% include copy-curl.html %} + + The response contains the matching document: + + ```json + { + "took": 4, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 0.010358453, + "hits": [ + { + "_index": "test-index", + "_id": "1", + "_score": 0.010358453, + "_source": { + "names": [ + "Slow green", + "turtle swims" + ] + } + } + ] + } + } + ``` diff --git a/_analyzers/index.md b/_analyzers/index.md index fec61792b2..1dc38b2cd4 100644 --- a/_analyzers/index.md +++ b/_analyzers/index.md @@ -51,7 +51,7 @@ For a list of supported analyzers, see [Analyzers]({{site.url}}{{site.baseurl}}/ ## Custom analyzers -If needed, you can combine tokenizers, token filters, and character filters to create a custom analyzer. +If needed, you can combine tokenizers, token filters, and character filters to create a custom analyzer. For more information, see [Creating a custom analyzer]({{site.url}}{{site.baseurl}}/analyzers/custom-analyzer/). ## Text analysis at indexing time and query time @@ -160,14 +160,16 @@ The response provides information about the analyzers for each field: ``` ## Normalizers + Tokenization divides text into individual terms, but it does not address variations in token forms. Normalization resolves these issues by converting tokens into a standard format. This ensures that similar terms are matched appropriately, even if they are not identical. ### Normalization techniques The following normalization techniques can help address variations in token forms: + 1. **Case normalization**: Converts all tokens to lowercase to ensure case-insensitive matching. For example, "Hello" is normalized to "hello". -2. **Stemming**: Reduces words to their root form. For instance, "cars" is stemmed to "car", and "running" is normalized to "run". +2. **Stemming**: Reduces words to their root form. For instance, "cars" is stemmed to "car" and "running" is normalized to "run". 3. **Synonym handling:** Treats synonyms as equivalent. For example, "jogging" and "running" can be indexed under a common term, such as "run". diff --git a/_analyzers/language-analyzers.md b/_analyzers/language-analyzers.md deleted file mode 100644 index ca4ba320dd..0000000000 --- a/_analyzers/language-analyzers.md +++ /dev/null @@ -1,44 +0,0 @@ ---- -layout: default -title: Language analyzers -nav_order: 100 -parent: Analyzers -redirect_from: - - /query-dsl/analyzers/language-analyzers/ ---- - -# Language analyzers - -OpenSearch supports the following language analyzers: -`arabic`, `armenian`, `basque`, `bengali`, `brazilian`, `bulgarian`, `catalan`, `czech`, `danish`, `dutch`, `english`, `estonian`, `finnish`, `french`, `galician`, `german`, `greek`, `hindi`, `hungarian`, `indonesian`, `irish`, `italian`, `latvian`, `lithuanian`, `norwegian`, `persian`, `portuguese`, `romanian`, `russian`, `sorani`, `spanish`, `swedish`, `turkish`, and `thai`. - -To use the analyzer when you map an index, specify the value within your query. For example, to map your index with the French language analyzer, specify the `french` value for the analyzer field: - -```json - "analyzer": "french" -``` - -#### Example request - -The following query specifies the `french` language analyzer for the index `my-index`: - -```json -PUT my-index -{ - "mappings": { - "properties": { - "text": { - "type": "text", - "fields": { - "french": { - "type": "text", - "analyzer": "french" - } - } - } - } - } -} -``` - - diff --git a/_analyzers/language-analyzers/arabic.md b/_analyzers/language-analyzers/arabic.md new file mode 100644 index 0000000000..e61c684cbb --- /dev/null +++ b/_analyzers/language-analyzers/arabic.md @@ -0,0 +1,182 @@ +--- +layout: default +title: Arabic +parent: Language analyzers +grand_parent: Analyzers +nav_order: 10 +--- + +# Arabic analyzer + +The built-in `arabic` analyzer can be applied to a text field using the following command: + +```json +PUT /arabic-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "arabic" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_arabic +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_arabic_analyzer":{ + "type":"arabic", + "stem_exclusion":["تكنولوجيا","سلطة "] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Arabic analyzer internals + +The `arabic` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - decimal_digit + - stop (Arabic) + - normalization (Arabic) + - keyword + - stemmer (Arabic) + +## Custom Arabic analyzer + +You can create a custom Arabic analyzer using the following command: + +```json +PUT /arabic-index +{ + "settings": { + "analysis": { + "filter": { + "arabic_stop": { + "type": "stop", + "stopwords": "_arabic_" + }, + "arabic_stemmer": { + "type": "stemmer", + "language": "arabic" + }, + "arabic_normalization": { + "type": "arabic_normalization" + }, + "decimal_digit": { + "type": "decimal_digit" + }, + "arabic_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "arabic_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "arabic_normalization", + "decimal_digit", + "arabic_stop", + "arabic_keywords", + "arabic_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "arabic_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /arabic-index/_analyze +{ + "field": "content", + "text": "الطلاب يدرسون في الجامعات العربية. أرقامهم ١٢٣٤٥٦." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "طلاب", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "يدرس", + "start_offset": 7, + "end_offset": 13, + "type": "", + "position": 1 + }, + { + "token": "جامع", + "start_offset": 17, + "end_offset": 25, + "type": "", + "position": 3 + }, + { + "token": "عرب", + "start_offset": 26, + "end_offset": 33, + "type": "", + "position": 4 + }, + { + "token": "ارقامهم", + "start_offset": 35, + "end_offset": 42, + "type": "", + "position": 5 + }, + { + "token": "123456", + "start_offset": 43, + "end_offset": 49, + "type": "", + "position": 6 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/armenian.md b/_analyzers/language-analyzers/armenian.md new file mode 100644 index 0000000000..9bd0549c80 --- /dev/null +++ b/_analyzers/language-analyzers/armenian.md @@ -0,0 +1,137 @@ +--- +layout: default +title: Armenian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 20 +--- + +# Armenian analyzer + +The built-in `armenian` analyzer can be applied to a text field using the following command: + +```json +PUT /arabic-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "armenian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_armenian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_armenian_analyzer": { + "type": "armenian", + "stem_exclusion": ["բարև", "խաղաղություն"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Armenian analyzer internals + +The `armenian` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Armenian) + - keyword + - stemmer (Armenian) + +## Custom Armenian analyzer + +You can create a custom Armenian analyzer using the following command: + +```json +PUT /armenian-index +{ + "settings": { + "analysis": { + "filter": { + "armenian_stop": { + "type": "stop", + "stopwords": "_armenian_" + }, + "armenian_stemmer": { + "type": "stemmer", + "language": "armenian" + }, + "armenian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "armenian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "armenian_stop", + "armenian_keywords", + "armenian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "armenian_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET armenian-index/_analyze +{ + "analyzer": "stem_exclusion_armenian_analyzer", + "text": "բարև բոլորին, մենք խաղաղություն ենք ուզում և նոր օր ենք սկսել" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "բարև","start_offset": 0,"end_offset": 4,"type": "","position": 0}, + {"token": "բոլոր","start_offset": 5,"end_offset": 12,"type": "","position": 1}, + {"token": "խաղաղություն","start_offset": 19,"end_offset": 31,"type": "","position": 3}, + {"token": "ուզ","start_offset": 36,"end_offset": 42,"type": "","position": 5}, + {"token": "նոր","start_offset": 45,"end_offset": 48,"type": "","position": 7}, + {"token": "օր","start_offset": 49,"end_offset": 51,"type": "","position": 8}, + {"token": "սկսել","start_offset": 56,"end_offset": 61,"type": "","position": 10} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/basque.md b/_analyzers/language-analyzers/basque.md new file mode 100644 index 0000000000..e73510cc66 --- /dev/null +++ b/_analyzers/language-analyzers/basque.md @@ -0,0 +1,137 @@ +--- +layout: default +title: Basque +parent: Language analyzers +grand_parent: Analyzers +nav_order: 30 +--- + +# Basque analyzer + +The built-in `basque` analyzer can be applied to a text field using the following command: + +```json +PUT /basque-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "basque" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_basque_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_basque_analyzer": { + "type": "basque", + "stem_exclusion": ["autoritate", "baldintza"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Basque analyzer internals + +The `basque` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Basque) + - keyword + - stemmer (Basque) + +## Custom Basque analyzer + +You can create a custom Basque analyzer using the following command: + +```json +PUT /basque-index +{ + "settings": { + "analysis": { + "filter": { + "basque_stop": { + "type": "stop", + "stopwords": "_basque_" + }, + "basque_stemmer": { + "type": "stemmer", + "language": "basque" + }, + "basque_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "basque_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "basque_stop", + "basque_keywords", + "basque_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "basque_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /basque-index/_analyze +{ + "field": "content", + "text": "Ikasleek euskal unibertsitateetan ikasten dute. Haien zenbakiak 123456 dira." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "ikasle","start_offset": 0,"end_offset": 8,"type": "","position": 0}, + {"token": "euskal","start_offset": 9,"end_offset": 15,"type": "","position": 1}, + {"token": "unibertsi","start_offset": 16,"end_offset": 33,"type": "","position": 2}, + {"token": "ikas","start_offset": 34,"end_offset": 41,"type": "","position": 3}, + {"token": "haien","start_offset": 48,"end_offset": 53,"type": "","position": 5}, + {"token": "zenba","start_offset": 54,"end_offset": 63,"type": "","position": 6}, + {"token": "123456","start_offset": 64,"end_offset": 70,"type": "","position": 7} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/bengali.md b/_analyzers/language-analyzers/bengali.md new file mode 100644 index 0000000000..af913a01ef --- /dev/null +++ b/_analyzers/language-analyzers/bengali.md @@ -0,0 +1,142 @@ +--- +layout: default +title: Bengali +parent: Language analyzers +grand_parent: Analyzers +nav_order: 40 +--- + +# Bengali analyzer + +The built-in `bengali` analyzer can be applied to a text field using the following command: + +```json +PUT /bengali-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "bengali" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_bengali_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_bengali_analyzer": { + "type": "bengali", + "stem_exclusion": ["কর্তৃপক্ষ", "অনুমোদন"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Bengali analyzer internals + +The `bengali` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - decimal_digit + - indic_normalization + - normalization (Bengali) + - stop (Bengali) + - keyword + - stemmer (Bengali) + +## Custom Bengali analyzer + +You can create a custom Bengali analyzer using the following command: + +```json +PUT /bengali-index +{ + "settings": { + "analysis": { + "filter": { + "bengali_stop": { + "type": "stop", + "stopwords": "_bengali_" + }, + "bengali_stemmer": { + "type": "stemmer", + "language": "bengali" + }, + "bengali_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "bengali_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "decimal_digit", + "indic_normalization", + "bengali_normalization", + "bengali_stop", + "bengali_keywords", + "bengali_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "bengali_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /bengali-index/_analyze +{ + "field": "content", + "text": "ছাত্ররা বিশ্ববিদ্যালয়ে পড়াশোনা করে। তাদের নম্বরগুলি ১২৩৪৫৬।" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "ছাত্র","start_offset": 0,"end_offset": 7,"type": "","position": 0}, + {"token": "বিসসবিদালয়","start_offset": 8,"end_offset": 23,"type": "","position": 1}, + {"token": "পরাসোন","start_offset": 24,"end_offset": 32,"type": "","position": 2}, + {"token": "তা","start_offset": 38,"end_offset": 43,"type": "","position": 4}, + {"token": "নমমর","start_offset": 44,"end_offset": 53,"type": "","position": 5}, + {"token": "123456","start_offset": 54,"end_offset": 60,"type": "","position": 6} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/brazilian.md b/_analyzers/language-analyzers/brazilian.md new file mode 100644 index 0000000000..67db2b92bc --- /dev/null +++ b/_analyzers/language-analyzers/brazilian.md @@ -0,0 +1,137 @@ +--- +layout: default +title: Brazilian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 50 +--- + +# Brazilian analyzer + +The built-in `brazilian` analyzer can be applied to a text field using the following command: + +```json +PUT /brazilian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "brazilian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_brazilian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_brazilian_analyzer": { + "type": "brazilian", + "stem_exclusion": ["autoridade", "aprovação"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Brazilian analyzer internals + +The `brazilian` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Brazilian) + - keyword + - stemmer (Brazilian) + +## Custom Brazilian analyzer + +You can create a custom Brazilian analyzer using the following command: + +```json +PUT /brazilian-index +{ + "settings": { + "analysis": { + "filter": { + "brazilian_stop": { + "type": "stop", + "stopwords": "_brazilian_" + }, + "brazilian_stemmer": { + "type": "stemmer", + "language": "brazilian" + }, + "brazilian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "brazilian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "brazilian_stop", + "brazilian_keywords", + "brazilian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "brazilian_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /brazilian-index/_analyze +{ + "field": "content", + "text": "Estudantes estudam em universidades brasileiras. Seus números são 123456." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "estudant","start_offset": 0,"end_offset": 10,"type": "","position": 0}, + {"token": "estud","start_offset": 11,"end_offset": 18,"type": "","position": 1}, + {"token": "univers","start_offset": 22,"end_offset": 35,"type": "","position": 3}, + {"token": "brasileir","start_offset": 36,"end_offset": 47,"type": "","position": 4}, + {"token": "numer","start_offset": 54,"end_offset": 61,"type": "","position": 6}, + {"token": "sao","start_offset": 62,"end_offset": 65,"type": "","position": 7}, + {"token": "123456","start_offset": 66,"end_offset": 72,"type": "","position": 8} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/bulgarian.md b/_analyzers/language-analyzers/bulgarian.md new file mode 100644 index 0000000000..42d5794e18 --- /dev/null +++ b/_analyzers/language-analyzers/bulgarian.md @@ -0,0 +1,137 @@ +--- +layout: default +title: Bulgarian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 60 +--- + +# Bulgarian analyzer + +The built-in `bulgarian` analyzer can be applied to a text field using the following command: + +```json +PUT /bulgarian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "bulgarian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_bulgarian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_bulgarian_analyzer": { + "type": "bulgarian", + "stem_exclusion": ["авторитет", "одобрение"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Bulgarian analyzer internals + +The `bulgarian` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Bulgarian) + - keyword + - stemmer (Bulgarian) + +## Custom Bulgarian analyzer + +You can create a custom Bulgarian analyzer using the following command: + +```json +PUT /bulgarian-index +{ + "settings": { + "analysis": { + "filter": { + "bulgarian_stop": { + "type": "stop", + "stopwords": "_bulgarian_" + }, + "bulgarian_stemmer": { + "type": "stemmer", + "language": "bulgarian" + }, + "bulgarian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "bulgarian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "bulgarian_stop", + "bulgarian_keywords", + "bulgarian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "bulgarian_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /bulgarian-index/_analyze +{ + "field": "content", + "text": "Студентите учат в българските университети. Техните номера са 123456." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "студент","start_offset": 0,"end_offset": 10,"type": "","position": 0}, + {"token": "учат","start_offset": 11,"end_offset": 15,"type": "","position": 1}, + {"token": "българск","start_offset": 18,"end_offset": 29,"type": "","position": 3}, + {"token": "университят","start_offset": 30,"end_offset": 42,"type": "","position": 4}, + {"token": "техн","start_offset": 44,"end_offset": 51,"type": "","position": 5}, + {"token": "номер","start_offset": 52,"end_offset": 58,"type": "","position": 6}, + {"token": "123456","start_offset": 62,"end_offset": 68,"type": "","position": 8} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/catalan.md b/_analyzers/language-analyzers/catalan.md new file mode 100644 index 0000000000..89762da094 --- /dev/null +++ b/_analyzers/language-analyzers/catalan.md @@ -0,0 +1,143 @@ +--- +layout: default +title: Catalan +parent: Language analyzers +grand_parent: Analyzers +nav_order: 70 +--- + +# Catalan analyzer + +The built-in `catalan` analyzer can be applied to a text field using the following command: + +```json +PUT /catalan-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "catalan" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_catalan_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_catalan_analyzer": { + "type": "catalan", + "stem_exclusion": ["autoritat", "aprovació"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Catalan analyzer internals + +The `catalan` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - elision (Catalan) + - lowercase + - stop (Catalan) + - keyword + - stemmer (Catalan) + +## Custom Catalan analyzer + +You can create a custom Catalan analyzer using the following command: + +```json +PUT /catalan-index +{ + "settings": { + "analysis": { + "filter": { + "catalan_stop": { + "type": "stop", + "stopwords": "_catalan_" + }, + "catalan_elision": { + "type": "elision", + "articles": [ "d", "l", "m", "n", "s", "t"], + "articles_case": true + }, + "catalan_stemmer": { + "type": "stemmer", + "language": "catalan" + }, + "catalan_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "catalan_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "catalan_elision", + "lowercase", + "catalan_stop", + "catalan_keywords", + "catalan_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "catalan_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /catalan-index/_analyze +{ + "field": "content", + "text": "Els estudiants estudien a les universitats catalanes. Els seus números són 123456." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "estud","start_offset": 4,"end_offset": 14,"type": "","position": 1}, + {"token": "estud","start_offset": 15,"end_offset": 23,"type": "","position": 2}, + {"token": "univer","start_offset": 30,"end_offset": 42,"type": "","position": 5}, + {"token": "catalan","start_offset": 43,"end_offset": 52,"type": "","position": 6}, + {"token": "numer","start_offset": 63,"end_offset": 70,"type": "","position": 9}, + {"token": "123456","start_offset": 75,"end_offset": 81,"type": "","position": 11} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/cjk.md b/_analyzers/language-analyzers/cjk.md new file mode 100644 index 0000000000..aed7e6da22 --- /dev/null +++ b/_analyzers/language-analyzers/cjk.md @@ -0,0 +1,142 @@ +--- +layout: default +title: CJK +parent: Language analyzers +grand_parent: Analyzers +nav_order: 80 +--- + +# CJK analyzer + +The built-in `cjk` analyzer can be applied to a text field using the following command: + +```json +PUT /cjk-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "cjk" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_cjk_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_cjk_analyzer": { + "type": "cjk", + "stem_exclusion": ["example", "words"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## CJK analyzer internals + +The `cjk` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - cjk_width + - lowercase + - cjk_bigram + - stop (similar to English) + +## Custom CJK analyzer + +You can create a custom CJK analyzer using the following command: + +```json +PUT /cjk-index +{ + "settings": { + "analysis": { + "filter": { + "english_stop": { + "type": "stop", + "stopwords": [ + "a", "and", "are", "as", "at", "be", "but", "by", "for", + "if", "in", "into", "is", "it", "no", "not", "of", "on", + "or", "s", "such", "t", "that", "the", "their", "then", + "there", "these", "they", "this", "to", "was", "will", + "with", "www" + ] + } + }, + "analyzer": { + "cjk_custom_analyzer": { + "tokenizer": "standard", + "filter": [ + "cjk_width", + "lowercase", + "cjk_bigram", + "english_stop" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "cjk_custom_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /cjk-index/_analyze +{ + "field": "content", + "text": "学生们在中国、日本和韩国的大学学习。123456" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "学生","start_offset": 0,"end_offset": 2,"type": "","position": 0}, + {"token": "生们","start_offset": 1,"end_offset": 3,"type": "","position": 1}, + {"token": "们在","start_offset": 2,"end_offset": 4,"type": "","position": 2}, + {"token": "在中","start_offset": 3,"end_offset": 5,"type": "","position": 3}, + {"token": "中国","start_offset": 4,"end_offset": 6,"type": "","position": 4}, + {"token": "日本","start_offset": 7,"end_offset": 9,"type": "","position": 5}, + {"token": "本和","start_offset": 8,"end_offset": 10,"type": "","position": 6}, + {"token": "和韩","start_offset": 9,"end_offset": 11,"type": "","position": 7}, + {"token": "韩国","start_offset": 10,"end_offset": 12,"type": "","position": 8}, + {"token": "国的","start_offset": 11,"end_offset": 13,"type": "","position": 9}, + {"token": "的大","start_offset": 12,"end_offset": 14,"type": "","position": 10}, + {"token": "大学","start_offset": 13,"end_offset": 15,"type": "","position": 11}, + {"token": "学学","start_offset": 14,"end_offset": 16,"type": "","position": 12}, + {"token": "学习","start_offset": 15,"end_offset": 17,"type": "","position": 13}, + {"token": "123456","start_offset": 18,"end_offset": 24,"type": "","position": 14} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/czech.md b/_analyzers/language-analyzers/czech.md new file mode 100644 index 0000000000..c1778cd0f4 --- /dev/null +++ b/_analyzers/language-analyzers/czech.md @@ -0,0 +1,172 @@ +--- +layout: default +title: Czech +parent: Language analyzers +grand_parent: Analyzers +nav_order: 90 +--- + +# Czech analyzer + +The built-in `czech` analyzer can be applied to a text field using the following command: + +```json +PUT /czech-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "czech" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_czech_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_czech_analyzer": { + "type": "czech", + "stem_exclusion": ["autorita", "schválení"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Czech analyzer internals + +The `czech` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Czech) + - keyword + - stemmer (Czech) + +## Custom Czech analyzer + +You can create a custom Czech analyzer using the following command: + +```json +PUT /czech-index +{ + "settings": { + "analysis": { + "filter": { + "czech_stop": { + "type": "stop", + "stopwords": "_czech_" + }, + "czech_stemmer": { + "type": "stemmer", + "language": "czech" + }, + "czech_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "czech_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "czech_stop", + "czech_keywords", + "czech_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "czech_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /czech-index/_analyze +{ + "field": "content", + "text": "Studenti studují na českých univerzitách. Jejich čísla jsou 123456." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "student", + "start_offset": 0, + "end_offset": 8, + "type": "", + "position": 0 + }, + { + "token": "studuj", + "start_offset": 9, + "end_offset": 16, + "type": "", + "position": 1 + }, + { + "token": "česk", + "start_offset": 20, + "end_offset": 27, + "type": "", + "position": 3 + }, + { + "token": "univerzit", + "start_offset": 28, + "end_offset": 40, + "type": "", + "position": 4 + }, + { + "token": "čísl", + "start_offset": 49, + "end_offset": 54, + "type": "", + "position": 6 + }, + { + "token": "123456", + "start_offset": 60, + "end_offset": 66, + "type": "", + "position": 8 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/danish.md b/_analyzers/language-analyzers/danish.md new file mode 100644 index 0000000000..b5ee1b0e97 --- /dev/null +++ b/_analyzers/language-analyzers/danish.md @@ -0,0 +1,172 @@ +--- +layout: default +title: Danish +parent: Language analyzers +grand_parent: Analyzers +nav_order: 100 +--- + +# Danish analyzer + +The built-in `danish` analyzer can be applied to a text field using the following command: + +```json +PUT /danish-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "danish" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_danish_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_danish_analyzer": { + "type": "danish", + "stem_exclusion": ["autoritet", "godkendelse"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Danish analyzer internals + +The `danish` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Danish) + - keyword + - stemmer (Danish) + +## Custom Danish analyzer + +You can create a custom Danish analyzer using the following command: + +```json +PUT /danish-index +{ + "settings": { + "analysis": { + "filter": { + "danish_stop": { + "type": "stop", + "stopwords": "_danish_" + }, + "danish_stemmer": { + "type": "stemmer", + "language": "danish" + }, + "danish_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "danish_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "danish_stop", + "danish_keywords", + "danish_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "danish_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /danish-index/_analyze +{ + "field": "content", + "text": "Studerende studerer på de danske universiteter. Deres numre er 123456." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "stud", + "start_offset": 0, + "end_offset": 10, + "type": "", + "position": 0 + }, + { + "token": "stud", + "start_offset": 11, + "end_offset": 19, + "type": "", + "position": 1 + }, + { + "token": "dansk", + "start_offset": 26, + "end_offset": 32, + "type": "", + "position": 4 + }, + { + "token": "universitet", + "start_offset": 33, + "end_offset": 46, + "type": "", + "position": 5 + }, + { + "token": "numr", + "start_offset": 54, + "end_offset": 59, + "type": "", + "position": 7 + }, + { + "token": "123456", + "start_offset": 63, + "end_offset": 69, + "type": "", + "position": 9 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/dutch.md b/_analyzers/language-analyzers/dutch.md new file mode 100644 index 0000000000..0259707d78 --- /dev/null +++ b/_analyzers/language-analyzers/dutch.md @@ -0,0 +1,148 @@ +--- +layout: default +title: Dutch +parent: Language analyzers +grand_parent: Analyzers +nav_order: 110 +--- + +# Dutch analyzer + +The built-in `dutch` analyzer can be applied to a text field using the following command: + +```json +PUT /dutch-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "dutch" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_dutch_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_dutch_analyzer": { + "type": "dutch", + "stem_exclusion": ["autoriteit", "goedkeuring"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Dutch analyzer internals + +The `dutch` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Dutch) + - keyword + - stemmer_override + - stemmer (Dutch) + +## Custom Dutch analyzer + +You can create a custom Dutch analyzer using the following command: + +```json +PUT /dutch-index +{ + "settings": { + "analysis": { + "filter": { + "dutch_stop": { + "type": "stop", + "stopwords": "_dutch_" + }, + "dutch_stemmer": { + "type": "stemmer", + "language": "dutch" + }, + "dutch_keywords": { + "type": "keyword_marker", + "keywords": [] + }, + "dutch_override": { + "type": "stemmer_override", + "rules": [ + "fiets=>fiets", + "bromfiets=>bromfiets", + "ei=>eier", + "kind=>kinder" + ] + } + }, + "analyzer": { + "dutch_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "dutch_stop", + "dutch_keywords", + "dutch_override", + "dutch_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "dutch_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /dutch-index/_analyze +{ + "field": "content", + "text": "De studenten studeren in Nederland en bezoeken Amsterdam. Hun nummers zijn 123456." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "student","start_offset": 3,"end_offset": 12,"type": "","position": 1}, + {"token": "studer","start_offset": 13,"end_offset": 21,"type": "","position": 2}, + {"token": "nederland","start_offset": 25,"end_offset": 34,"type": "","position": 4}, + {"token": "bezoek","start_offset": 38,"end_offset": 46,"type": "","position": 6}, + {"token": "amsterdam","start_offset": 47,"end_offset": 56,"type": "","position": 7}, + {"token": "nummer","start_offset": 62,"end_offset": 69,"type": "","position": 9}, + {"token": "123456","start_offset": 75,"end_offset": 81,"type": "","position": 11} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/english.md b/_analyzers/language-analyzers/english.md new file mode 100644 index 0000000000..2d0b600312 --- /dev/null +++ b/_analyzers/language-analyzers/english.md @@ -0,0 +1,143 @@ +--- +layout: default +title: English +parent: Language analyzers +grand_parent: Analyzers +nav_order: 120 +--- + +# English analyzer + +The built-in `english` analyzer can be applied to a text field using the following command: + +```json +PUT /english-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "english" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_english_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_english_analyzer": { + "type": "english", + "stem_exclusion": ["authority", "authorization"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## English analyzer internals + +The `english` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - stemmer (possessive_english) + - lowercase + - stop (English) + - keyword + - stemmer (English) + +## Custom English analyzer + +You can create a custom English analyzer using the following command: + +```json +PUT /english-index +{ + "settings": { + "analysis": { + "filter": { + "english_stop": { + "type": "stop", + "stopwords": "_english_" + }, + "english_stemmer": { + "type": "stemmer", + "language": "english" + }, + "english_keywords": { + "type": "keyword_marker", + "keywords": [] + }, + "english_possessive_stemmer": { + "type": "stemmer", + "language": "possessive_english" + } + }, + "analyzer": { + "english_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "english_possessive_stemmer", + "lowercase", + "english_stop", + "english_keywords", + "english_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "english_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /english-index/_analyze +{ + "field": "content", + "text": "The students study in the USA and work at NASA. Their numbers are 123456." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "student","start_offset": 4,"end_offset": 12,"type": "","position": 1}, + {"token": "studi","start_offset": 13,"end_offset": 18,"type": "","position": 2}, + {"token": "usa","start_offset": 26,"end_offset": 29,"type": "","position": 5}, + {"token": "work","start_offset": 34,"end_offset": 38,"type": "","position": 7}, + {"token": "nasa","start_offset": 42,"end_offset": 46,"type": "","position": 9}, + {"token": "number","start_offset": 54,"end_offset": 61,"type": "","position": 11}, + {"token": "123456","start_offset": 66,"end_offset": 72,"type": "","position": 13} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/estonian.md b/_analyzers/language-analyzers/estonian.md new file mode 100644 index 0000000000..a4cb664f18 --- /dev/null +++ b/_analyzers/language-analyzers/estonian.md @@ -0,0 +1,139 @@ +--- +layout: default +title: Estonian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 130 +--- + +# Estonian analyzer + +The built-in `estonian` analyzer can be applied to a text field using the following command: + +```json +PUT /estonian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "estonian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_estonian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_estonian_analyzer": { + "type": "estonian", + "stem_exclusion": ["autoriteet", "kinnitus"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Estonian analyzer internals + +The `estonian` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Estonian) + - keyword + - stemmer (Estonian) + +## Custom Estonian analyzer + +You can create a custom Estonian analyzer using the following command: + +```json +PUT /estonian-index +{ + "settings": { + "analysis": { + "filter": { + "estonian_stop": { + "type": "stop", + "stopwords": "_estonian_" + }, + "estonian_stemmer": { + "type": "stemmer", + "language": "estonian" + }, + "estonian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "estonian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "estonian_stop", + "estonian_keywords", + "estonian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "estonian_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /estonian-index/_analyze +{ + "field": "content", + "text": "Õpilased õpivad Tallinnas ja Eesti ülikoolides. Nende numbrid on 123456." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "õpilase","start_offset": 0,"end_offset": 8,"type": "","position": 0}, + {"token": "õpi","start_offset": 9,"end_offset": 15,"type": "","position": 1}, + {"token": "tallinna","start_offset": 16,"end_offset": 25,"type": "","position": 2}, + {"token": "eesti","start_offset": 29,"end_offset": 34,"type": "","position": 4}, + {"token": "ülikooli","start_offset": 35,"end_offset": 46,"type": "","position": 5}, + {"token": "nende","start_offset": 48,"end_offset": 53,"type": "","position": 6}, + {"token": "numbri","start_offset": 54,"end_offset": 61,"type": "","position": 7}, + {"token": "on","start_offset": 62,"end_offset": 64,"type": "","position": 8}, + {"token": "123456","start_offset": 65,"end_offset": 71,"type": "","position": 9} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/finnish.md b/_analyzers/language-analyzers/finnish.md new file mode 100644 index 0000000000..6f559650d2 --- /dev/null +++ b/_analyzers/language-analyzers/finnish.md @@ -0,0 +1,137 @@ +--- +layout: default +title: Finnish +parent: Language analyzers +grand_parent: Analyzers +nav_order: 140 +--- + +# Finnish analyzer + +The built-in `finnish` analyzer can be applied to a text field using the following command: + +```json +PUT /finnish-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "finnish" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_finnish_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_finnish_analyzer": { + "type": "finnish", + "stem_exclusion": ["valta", "hyväksyntä"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Finnish analyzer internals + +The `finnish` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Finnish) + - keyword + - stemmer (Finnish) + +## Custom Finnish analyzer + +You can create a custom Finnish analyzer using the following command: + +```json +PUT /finnish-index +{ + "settings": { + "analysis": { + "filter": { + "finnish_stop": { + "type": "stop", + "stopwords": "_finnish_" + }, + "finnish_stemmer": { + "type": "stemmer", + "language": "finnish" + }, + "finnish_keywords": { + "type": "keyword_marker", + "keywords": ["Helsinki", "Suomi"] + } + }, + "analyzer": { + "finnish_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "finnish_stop", + "finnish_keywords", + "finnish_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "finnish_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /finnish-index/_analyze +{ + "field": "content", + "text": "Opiskelijat opiskelevat Helsingissä ja Suomen yliopistoissa. Heidän numeronsa ovat 123456." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "opiskelij","start_offset": 0,"end_offset": 11,"type": "","position": 0}, + {"token": "opiskelev","start_offset": 12,"end_offset": 23,"type": "","position": 1}, + {"token": "helsing","start_offset": 24,"end_offset": 35,"type": "","position": 2}, + {"token": "suome","start_offset": 39,"end_offset": 45,"type": "","position": 4}, + {"token": "yliopisto","start_offset": 46,"end_offset": 59,"type": "","position": 5}, + {"token": "numero","start_offset": 68,"end_offset": 77,"type": "","position": 7}, + {"token": "123456","start_offset": 83,"end_offset": 89,"type": "","position": 9} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/french.md b/_analyzers/language-analyzers/french.md new file mode 100644 index 0000000000..64e7ab5415 --- /dev/null +++ b/_analyzers/language-analyzers/french.md @@ -0,0 +1,148 @@ +--- +layout: default +title: French +parent: Language analyzers +grand_parent: Analyzers +nav_order: 150 +--- + +# French analyzer + +The built-in `french` analyzer can be applied to a text field using the following command: + +```json +PUT /french-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "french" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_french_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_french_analyzer": { + "type": "french", + "stem_exclusion": ["autorité", "acceptation"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## French analyzer internals + +The `french` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - elision (French) + - lowercase + - stop (French) + - keyword + - stemmer (French) + +## Custom French analyzer + +You can create a custom French analyzer using the following command: + +```json +PUT /french-index +{ + "settings": { + "analysis": { + "filter": { + "french_stop": { + "type": "stop", + "stopwords": "_french_" + }, + "french_elision": { + "type": "elision", + "articles_case": true, + "articles": [ + "l", "m", "t", "qu", "n", "s", + "j", "d", "c", "jusqu", "quoiqu", + "lorsqu", "puisqu" + ] + }, + "french_stemmer": { + "type": "stemmer", + "language": "light_french" + }, + "french_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "french_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "french_elision", + "lowercase", + "french_stop", + "french_keywords", + "french_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "french_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /french-index/_analyze +{ + "field": "content", + "text": "Les étudiants étudient à Paris et dans les universités françaises. Leurs numéros sont 123456." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "etudiant","start_offset": 4,"end_offset": 13,"type": "","position": 1}, + {"token": "etudient","start_offset": 14,"end_offset": 22,"type": "","position": 2}, + {"token": "pari","start_offset": 25,"end_offset": 30,"type": "","position": 4}, + {"token": "universit","start_offset": 43,"end_offset": 54,"type": "","position": 8}, + {"token": "francais","start_offset": 55,"end_offset": 65,"type": "","position": 9}, + {"token": "numero","start_offset": 73,"end_offset": 80,"type": "","position": 11}, + {"token": "123456","start_offset": 86,"end_offset": 92,"type": "","position": 13} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/galician.md b/_analyzers/language-analyzers/galician.md new file mode 100644 index 0000000000..00338b23a7 --- /dev/null +++ b/_analyzers/language-analyzers/galician.md @@ -0,0 +1,138 @@ +--- +layout: default +title: Galician +parent: Language analyzers +grand_parent: Analyzers +nav_order: 160 +--- + +# Galician analyzer + +The built-in `galician` analyzer can be applied to a text field using the following command: + +```json +PUT /galician-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "galician" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_galician_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_galician_analyzer": { + "type": "galician", + "stem_exclusion": ["autoridade", "aceptación"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Galician analyzer internals + +The `galician` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (French) + - keyword + - stemmer (French) + +## Custom Galician analyzer + +You can create a custom Galician analyzer using the following command: + +```json +PUT /galician-index +{ + "settings": { + "analysis": { + "filter": { + "galician_stop": { + "type": "stop", + "stopwords": "_galician_" + }, + "galician_stemmer": { + "type": "stemmer", + "language": "galician" + }, + "galician_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "galician_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "galician_stop", + "galician_keywords", + "galician_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "galician_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /galician-index/_analyze +{ + "field": "content", + "text": "Os estudantes estudan en Santiago e nas universidades galegas. Os seus números son 123456." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "estud","start_offset": 3,"end_offset": 13,"type": "","position": 1}, + {"token": "estud","start_offset": 14,"end_offset": 21,"type": "","position": 2}, + {"token": "santiag","start_offset": 25,"end_offset": 33,"type": "","position": 4}, + {"token": "univers","start_offset": 40,"end_offset": 53,"type": "","position": 7}, + {"token": "galeg","start_offset": 54,"end_offset": 61,"type": "","position": 8}, + {"token": "numer","start_offset": 71,"end_offset": 78,"type": "","position": 11}, + {"token": "son","start_offset": 79,"end_offset": 82,"type": "","position": 12}, + {"token": "123456","start_offset": 83,"end_offset": 89,"type": "","position": 13} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/german.md b/_analyzers/language-analyzers/german.md new file mode 100644 index 0000000000..4071ef5378 --- /dev/null +++ b/_analyzers/language-analyzers/german.md @@ -0,0 +1,174 @@ +--- +layout: default +title: German +parent: Language analyzers +grand_parent: Analyzers +nav_order: 170 +--- + +# German analyzer + +The built-in `german` analyzer can be applied to a text field using the following command: + +```json +PUT /german-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "german" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_german_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_german_analyzer": { + "type": "german", + "stem_exclusion": ["Autorität", "Genehmigung"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## German analyzer internals + +The `german` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (German) + - keyword + - normalization (German) + - stemmer (German) + +## Custom German analyzer + +You can create a custom German analyzer using the following command: + +```json +PUT /german-index +{ + "settings": { + "analysis": { + "filter": { + "german_stop": { + "type": "stop", + "stopwords": "_german_" + }, + "german_stemmer": { + "type": "stemmer", + "language": "light_german" + }, + "german_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "german_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "german_stop", + "german_keywords", + "german_normalization", + "german_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "german_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /german-index/_analyze +{ + "field": "content", + "text": "Die Studenten studieren an den deutschen Universitäten. Ihre Nummern sind 123456." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "student", + "start_offset": 4, + "end_offset": 13, + "type": "", + "position": 1 + }, + { + "token": "studi", + "start_offset": 14, + "end_offset": 23, + "type": "", + "position": 2 + }, + { + "token": "deutsch", + "start_offset": 31, + "end_offset": 40, + "type": "", + "position": 5 + }, + { + "token": "universitat", + "start_offset": 41, + "end_offset": 54, + "type": "", + "position": 6 + }, + { + "token": "numm", + "start_offset": 61, + "end_offset": 68, + "type": "", + "position": 8 + }, + { + "token": "123456", + "start_offset": 74, + "end_offset": 80, + "type": "", + "position": 10 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/greek.md b/_analyzers/language-analyzers/greek.md new file mode 100644 index 0000000000..2446b1e2d6 --- /dev/null +++ b/_analyzers/language-analyzers/greek.md @@ -0,0 +1,139 @@ +--- +layout: default +title: Greek +parent: Language analyzers +grand_parent: Analyzers +nav_order: 180 +--- + +# Greek analyzer + +The built-in `greek` analyzer can be applied to a text field using the following command: + +```json +PUT /greek-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "greek" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_greek_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_greek_analyzer": { + "type": "greek", + "stem_exclusion": ["αρχή", "έγκριση"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Greek analyzer internals + +The `greek` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Greek) + - keyword + - stemmer (Greek) + +## Custom Greek analyzer + +You can create a custom Greek analyzer using the following command: + +```json +PUT /greek-index +{ + "settings": { + "analysis": { + "filter": { + "greek_stop": { + "type": "stop", + "stopwords": "_greek_" + }, + "greek_stemmer": { + "type": "stemmer", + "language": "greek" + }, + "greek_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "greek_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "greek_stop", + "greek_keywords", + "greek_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "greek_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /greek-index/_analyze +{ + "field": "content", + "text": "Οι φοιτητές σπουδάζουν στα ελληνικά πανεπιστήμια. Οι αριθμοί τους είναι 123456." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "φοιτητές","start_offset": 3,"end_offset": 11,"type": "","position": 1}, + {"token": "σπουδάζ","start_offset": 12,"end_offset": 22,"type": "","position": 2}, + {"token": "στα","start_offset": 23,"end_offset": 26,"type": "","position": 3}, + {"token": "ελληνικά","start_offset": 27,"end_offset": 35,"type": "","position": 4}, + {"token": "πανεπιστήμ","start_offset": 36,"end_offset": 48,"type": "","position": 5}, + {"token": "αριθμοί","start_offset": 53,"end_offset": 60,"type": "","position": 7}, + {"token": "τους","start_offset": 61,"end_offset": 65,"type": "","position": 8}, + {"token": "είνα","start_offset": 66,"end_offset": 71,"type": "","position": 9}, + {"token": "123456","start_offset": 72,"end_offset": 78,"type": "","position": 10} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/hindi.md b/_analyzers/language-analyzers/hindi.md new file mode 100644 index 0000000000..93f2eea319 --- /dev/null +++ b/_analyzers/language-analyzers/hindi.md @@ -0,0 +1,178 @@ +--- +layout: default +title: Hindi +parent: Language analyzers +grand_parent: Analyzers +nav_order: 190 +--- + +# Hindi analyzer + +The built-in `hindi` analyzer can be applied to a text field using the following command: + +```json +PUT /hindi-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "hindi" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_hindi_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_hindi_analyzer": { + "type": "hindi", + "stem_exclusion": ["अधिकार", "अनुमोदन"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Hindi analyzer internals + +The `hindi` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - decimal_digit + - keyword + - normalization (indic) + - normalization (Hindi) + - stop (Hindi) + - stemmer (Hindi) + +## Custom Hindi analyzer + +You can create a custom Hindi analyzer using the following command: + +```json +PUT /hindi-index +{ + "settings": { + "analysis": { + "filter": { + "hindi_stop": { + "type": "stop", + "stopwords": "_hindi_" + }, + "hindi_stemmer": { + "type": "stemmer", + "language": "hindi" + }, + "hindi_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "hindi_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "decimal_digit", + "hindi_keywords", + "indic_normalization", + "hindi_normalization", + "hindi_stop", + "hindi_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "hindi_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /hindi-index/_analyze +{ + "field": "content", + "text": "छात्र भारतीय विश्वविद्यालयों में पढ़ते हैं। उनके नंबर १२३४५६ हैं।" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "छातर", + "start_offset": 0, + "end_offset": 5, + "type": "", + "position": 0 + }, + { + "token": "भारतिय", + "start_offset": 6, + "end_offset": 12, + "type": "", + "position": 1 + }, + { + "token": "विशवविदयालय", + "start_offset": 13, + "end_offset": 28, + "type": "", + "position": 2 + }, + { + "token": "पढ", + "start_offset": 33, + "end_offset": 38, + "type": "", + "position": 4 + }, + { + "token": "नंबर", + "start_offset": 49, + "end_offset": 53, + "type": "", + "position": 7 + }, + { + "token": "123456", + "start_offset": 54, + "end_offset": 60, + "type": "", + "position": 8 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/hungarian.md b/_analyzers/language-analyzers/hungarian.md new file mode 100644 index 0000000000..d115c5d29c --- /dev/null +++ b/_analyzers/language-analyzers/hungarian.md @@ -0,0 +1,172 @@ +--- +layout: default +title: Hungarian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 200 +--- + +# Hungarian analyzer + +The built-in `hungarian` analyzer can be applied to a text field using the following command: + +```json +PUT /hungarian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "hungarian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_hungarian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_hungarian_analyzer": { + "type": "hungarian", + "stem_exclusion": ["hatalom", "jóváhagyás"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Hungarian analyzer internals + +The `hungarian` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Hungarian) + - keyword + - stemmer (Hungarian) + +## Custom Hungarian analyzer + +You can create a custom Hungarian analyzer using the following command: + +```json +PUT /hungarian-index +{ + "settings": { + "analysis": { + "filter": { + "hungarian_stop": { + "type": "stop", + "stopwords": "_hungarian_" + }, + "hungarian_stemmer": { + "type": "stemmer", + "language": "hungarian" + }, + "hungarian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "hungarian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "hungarian_stop", + "hungarian_keywords", + "hungarian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "hungarian_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /hungarian-index/_analyze +{ + "field": "content", + "text": "A diákok a magyar egyetemeken tanulnak. A számaik 123456." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "diák", + "start_offset": 2, + "end_offset": 8, + "type": "", + "position": 1 + }, + { + "token": "magyar", + "start_offset": 11, + "end_offset": 17, + "type": "", + "position": 3 + }, + { + "token": "egyetem", + "start_offset": 18, + "end_offset": 29, + "type": "", + "position": 4 + }, + { + "token": "tanul", + "start_offset": 30, + "end_offset": 38, + "type": "", + "position": 5 + }, + { + "token": "szám", + "start_offset": 42, + "end_offset": 49, + "type": "", + "position": 7 + }, + { + "token": "123456", + "start_offset": 50, + "end_offset": 56, + "type": "", + "position": 8 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/index.md b/_analyzers/language-analyzers/index.md new file mode 100644 index 0000000000..89a4a42254 --- /dev/null +++ b/_analyzers/language-analyzers/index.md @@ -0,0 +1,135 @@ +--- +layout: default +title: Language analyzers +nav_order: 100 +parent: Analyzers +has_children: true +has_toc: true +redirect_from: + - /query-dsl/analyzers/language-analyzers/ + - /analyzers/language-analyzers/ +--- + +# Language analyzers + +OpenSearch supports the following language analyzers: +`arabic`, `armenian`, `basque`, `bengali`, `brazilian`, `bulgarian`, `catalan`, `czech`, `danish`, `dutch`, `english`, `estonian`, `finnish`, `french`, `galician`, `german`, `greek`, `hindi`, `hungarian`, `indonesian`, `irish`, `italian`, `latvian`, `lithuanian`, `norwegian`, `persian`, `portuguese`, `romanian`, `russian`, `sorani`, `spanish`, `swedish`, `thai`, and `turkish`. + +To use an analyzer when you map an index, specify the value in your query. For example, to map your index with the French language analyzer, specify the `french` value in the analyzer field: + +```json + "analyzer": "french" +``` + +#### Example request + +The following query specifies an index `my-index` with the `content` field configured as multi-field, and a sub-field named `french` is configured with the `french` language analyzer: + +```json +PUT my-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "fields": { + "french": { + "type": "text", + "analyzer": "french" + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +The default `french` analyzer can also be configured for the entire index using the following query: + +```json +PUT my-index +{ + "settings": { + "analysis": { + "analyzer": { + "default": { + "type": "french" + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text" + }, + "title": { + "type": "text" + }, + "description": { + "type": "text" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can apply stem exclusion to any language analyzer by providing a list of lowercase words that should be excluded from stemming. Internally, OpenSearch uses the `keyword_marker` token filter to mark these words as keywords, ensuring that they are not stemmed. + +## Stem exclusion example + +Use the following request to configure `stem_exclusion`: + +```json +PUT index_with_stem_exclusion_english_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_english_analyzer":{ + "type":"english", + "stem_exclusion": ["manager", "management"] + } + } + } + } +} +``` +{% include copy-curl.html %} + + +## Stem exclusion with custom analyzers + +All language analyzers consist of tokenizers and token filters specific to a particular language. If you want to implement a custom version of the language analyzer with stem exclusion, you need to configure the `keyword_marker` token filter and list the words excluded from stemming in the `keywords` parameter: + +```json +PUT index_with_keyword_marker_analyzer +{ + "settings": { + "analysis": { + "filter": { + "protected_keywords_filter": { + "type": "keyword_marker", + "keywords": ["Apple", "OpenSearch"] + } + }, + "analyzer": { + "custom_english_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "protected_keywords_filter", + "english_stemmer" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} diff --git a/_analyzers/language-analyzers/indonesian.md b/_analyzers/language-analyzers/indonesian.md new file mode 100644 index 0000000000..5c3d430b3a --- /dev/null +++ b/_analyzers/language-analyzers/indonesian.md @@ -0,0 +1,172 @@ +--- +layout: default +title: Indonesian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 210 +--- + +# Indonesian analyzer + +The built-in `indonesian` analyzer can be applied to a text field using the following command: + +```json +PUT /indonesian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "indonesian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_indonesian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_indonesian_analyzer": { + "type": "indonesian", + "stem_exclusion": ["otoritas", "persetujuan"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Indonesian analyzer internals + +The `indonesian` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Indonesian) + - keyword + - stemmer (Indonesian) + +## Custom Indonesian analyzer + +You can create a custom Indonesian analyzer using the following command: + +```json +PUT /hungarian-index +{ + "settings": { + "analysis": { + "filter": { + "hungarian_stop": { + "type": "stop", + "stopwords": "_hungarian_" + }, + "hungarian_stemmer": { + "type": "stemmer", + "language": "hungarian" + }, + "hungarian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "hungarian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "hungarian_stop", + "hungarian_keywords", + "hungarian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "hungarian_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /indonesian-index/_analyze +{ + "field": "content", + "text": "Mahasiswa belajar di universitas Indonesia. Nomor mereka adalah 123456." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "mahasiswa", + "start_offset": 0, + "end_offset": 9, + "type": "", + "position": 0 + }, + { + "token": "ajar", + "start_offset": 10, + "end_offset": 17, + "type": "", + "position": 1 + }, + { + "token": "universitas", + "start_offset": 21, + "end_offset": 32, + "type": "", + "position": 3 + }, + { + "token": "indonesia", + "start_offset": 33, + "end_offset": 42, + "type": "", + "position": 4 + }, + { + "token": "nomor", + "start_offset": 44, + "end_offset": 49, + "type": "", + "position": 5 + }, + { + "token": "123456", + "start_offset": 64, + "end_offset": 70, + "type": "", + "position": 8 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/irish.md b/_analyzers/language-analyzers/irish.md new file mode 100644 index 0000000000..3e1535d134 --- /dev/null +++ b/_analyzers/language-analyzers/irish.md @@ -0,0 +1,157 @@ +--- +layout: default +title: Irish +parent: Language analyzers +grand_parent: Analyzers +nav_order: 210 +--- + +# Irish analyzer + +The built-in `irish` analyzer can be applied to a text field using the following command: + +```json +PUT /irish-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "irish" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_irish_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_irish_analyzer": { + "type": "irish", + "stem_exclusion": ["údarás", "faomhadh"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Irish analyzer internals + +The `irish` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - hyphenation (Irish) + - elision (Irish) + - lowercase (Irish) + - stop (Irish) + - keyword + - stemmer (Irish) + +## Custom Irish analyzer + +You can create a custom Irish analyzer using the following command: + +```json +PUT /irish-index +{ + "settings": { + "analysis": { + "filter": { + "irish_stop": { + "type": "stop", + "stopwords": "_irish_" + }, + "irish_elision": { + "type": "elision", + "articles": [ "d", "m", "b" ], + "articles_case": true + }, + "irish_hyphenation": { + "type": "stop", + "stopwords": [ "h", "n", "t" ], + "ignore_case": true + }, + "irish_lowercase": { + "type": "lowercase", + "language": "irish" + }, + "irish_stemmer": { + "type": "stemmer", + "language": "irish" + }, + "irish_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "irish_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "irish_hyphenation", + "irish_elision", + "irish_lowercase", + "irish_stop", + "irish_keywords", + "irish_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "irish_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /irish-index/_analyze +{ + "field": "content", + "text": "Tá mic léinn ag staidéar in ollscoileanna na hÉireann. Is iad a gcuid uimhreacha ná 123456." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "tá","start_offset": 0,"end_offset": 2,"type": "","position": 0}, + {"token": "mic","start_offset": 3,"end_offset": 6,"type": "","position": 1}, + {"token": "léinn","start_offset": 7,"end_offset": 12,"type": "","position": 2}, + {"token": "staidéar","start_offset": 16,"end_offset": 24,"type": "","position": 4}, + {"token": "ollscoileanna","start_offset": 28,"end_offset": 41,"type": "","position": 6}, + {"token": "héireann","start_offset": 45,"end_offset": 53,"type": "","position": 8}, + {"token": "cuid","start_offset": 64,"end_offset": 69,"type": "","position": 12}, + {"token": "uimhreacha","start_offset": 70,"end_offset": 80,"type": "","position": 13}, + {"token": "123456","start_offset": 84,"end_offset": 90,"type": "","position": 15} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/italian.md b/_analyzers/language-analyzers/italian.md new file mode 100644 index 0000000000..190056d63c --- /dev/null +++ b/_analyzers/language-analyzers/italian.md @@ -0,0 +1,148 @@ +--- +layout: default +title: Italian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 220 +--- + +# Italian analyzer + +The built-in `italian` analyzer can be applied to a text field using the following command: + +```json +PUT /italian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "italian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_italian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_italian_analyzer": { + "type": "italian", + "stem_exclusion": ["autorità", "approvazione"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Italian analyzer internals + +The `italian` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - elision (Italian) + - lowercase + - stop (Italian) + - keyword + - stemmer (Italian) + +## Custom Italian analyzer + +You can create a custom Italian analyzer using the following command: + +```json +PUT /italian-index +{ + "settings": { + "analysis": { + "filter": { + "italian_stop": { + "type": "stop", + "stopwords": "_italian_" + }, + "italian_elision": { + "type": "elision", + "articles": [ + "c", "l", "all", "dall", "dell", + "nell", "sull", "coll", "pell", + "gl", "agl", "dagl", "degl", "negl", + "sugl", "un", "m", "t", "s", "v", "d" + ], + "articles_case": true + }, + "italian_stemmer": { + "type": "stemmer", + "language": "light_italian" + }, + "italian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "italian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "italian_elision", + "lowercase", + "italian_stop", + "italian_keywords", + "italian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "italian_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /italian-index/_analyze +{ + "field": "content", + "text": "Gli studenti studiano nelle università italiane. I loro numeri sono 123456." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "student","start_offset": 4,"end_offset": 12,"type": "","position": 1}, + {"token": "studian","start_offset": 13,"end_offset": 21,"type": "","position": 2}, + {"token": "universit","start_offset": 28,"end_offset": 38,"type": "","position": 4}, + {"token": "italian","start_offset": 39,"end_offset": 47,"type": "","position": 5}, + {"token": "numer","start_offset": 56,"end_offset": 62,"type": "","position": 8}, + {"token": "123456","start_offset": 68,"end_offset": 74,"type": "","position": 10} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/latvian.md b/_analyzers/language-analyzers/latvian.md new file mode 100644 index 0000000000..2301759763 --- /dev/null +++ b/_analyzers/language-analyzers/latvian.md @@ -0,0 +1,148 @@ +--- +layout: default +title: Latvian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 230 +--- + +# Latvian analyzer + +The built-in `latvian` analyzer can be applied to a text field using the following command: + +```json +PUT /latvian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "latvian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_latvian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_latvian_analyzer": { + "type": "latvian", + "stem_exclusion": ["autoritāte", "apstiprinājums"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Latvian analyzer internals + +The `latvian` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Latvian) + - keyword + - stemmer (Latvian) + +## Custom Latvian analyzer + +You can create a custom Latvian analyzer using the following command: + +```json +PUT /italian-index +{ + "settings": { + "analysis": { + "filter": { + "italian_stop": { + "type": "stop", + "stopwords": "_italian_" + }, + "italian_elision": { + "type": "elision", + "articles": [ + "c", "l", "all", "dall", "dell", + "nell", "sull", "coll", "pell", + "gl", "agl", "dagl", "degl", "negl", + "sugl", "un", "m", "t", "s", "v", "d" + ], + "articles_case": true + }, + "italian_stemmer": { + "type": "stemmer", + "language": "light_italian" + }, + "italian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "italian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "italian_elision", + "lowercase", + "italian_stop", + "italian_keywords", + "italian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "italian_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /latvian-index/_analyze +{ + "field": "content", + "text": "Studenti mācās Latvijas universitātēs. Viņu numuri ir 123456." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "student","start_offset": 0,"end_offset": 8,"type": "","position": 0}, + {"token": "māc","start_offset": 9,"end_offset": 14,"type": "","position": 1}, + {"token": "latvij","start_offset": 15,"end_offset": 23,"type": "","position": 2}, + {"token": "universitāt","start_offset": 24,"end_offset": 37,"type": "","position": 3}, + {"token": "vin","start_offset": 39,"end_offset": 43,"type": "","position": 4}, + {"token": "numur","start_offset": 44,"end_offset": 50,"type": "","position": 5}, + {"token": "123456","start_offset": 54,"end_offset": 60,"type": "","position": 7} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/lithuanian.md b/_analyzers/language-analyzers/lithuanian.md new file mode 100644 index 0000000000..ca5966c54e --- /dev/null +++ b/_analyzers/language-analyzers/lithuanian.md @@ -0,0 +1,136 @@ +--- +layout: default +title: Lithuanian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 230 +--- + +# Lithuanian analyzer + +The built-in `lithuanian` analyzer can be applied to a text field using the following command: + +```json +PUT /lithuanian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "lithuanian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_lithuanian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_lithuanian_analyzer": { + "type": "lithuanian", + "stem_exclusion": ["autoritetas", "patvirtinimas"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Lithuanian analyzer internals + +The `lithuanian` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Lithuanian) + - keyword + - stemmer (Lithuanian) + +## Custom Lithuanian analyzer + +You can create a custom Lithuanian analyzer using the following command: + +```json +PUT /lithuanian-index +{ + "settings": { + "analysis": { + "filter": { + "lithuanian_stop": { + "type": "stop", + "stopwords": "_lithuanian_" + }, + "lithuanian_stemmer": { + "type": "stemmer", + "language": "lithuanian" + }, + "lithuanian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "lithuanian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "lithuanian_stop", + "lithuanian_keywords", + "lithuanian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "lithuanian_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /lithuanian-index/_analyze +{ + "field": "content", + "text": "Studentai mokosi Lietuvos universitetuose. Jų numeriai yra 123456." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "student","start_offset": 0,"end_offset": 9,"type": "","position": 0}, + {"token": "mok","start_offset": 10,"end_offset": 16,"type": "","position": 1}, + {"token": "lietuv","start_offset": 17,"end_offset": 25,"type": "","position": 2}, + {"token": "universitet","start_offset": 26,"end_offset": 41,"type": "","position": 3}, + {"token": "num","start_offset": 46,"end_offset": 54,"type": "","position": 5}, + {"token": "123456","start_offset": 59,"end_offset": 65,"type": "","position": 7} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/norwegian.md b/_analyzers/language-analyzers/norwegian.md new file mode 100644 index 0000000000..cfb04eebf3 --- /dev/null +++ b/_analyzers/language-analyzers/norwegian.md @@ -0,0 +1,137 @@ +--- +layout: default +title: Norwegian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 240 +--- + +# Norwegian analyzer + +The built-in `norwegian` analyzer can be applied to a text field using the following command: + +```json +PUT /norwegian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "norwegian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_norwegian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_norwegian_analyzer": { + "type": "norwegian", + "stem_exclusion": ["autoritet", "godkjenning"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Norwegian analyzer internals + +The `norwegian` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Norwegian) + - keyword + - stemmer (Norwegian) + +## Custom Norwegian analyzer + +You can create a custom Norwegian analyzer using the following command: + +```json +PUT /norwegian-index +{ + "settings": { + "analysis": { + "filter": { + "norwegian_stop": { + "type": "stop", + "stopwords": "_norwegian_" + }, + "norwegian_stemmer": { + "type": "stemmer", + "language": "norwegian" + }, + "norwegian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "norwegian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "norwegian_stop", + "norwegian_keywords", + "norwegian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "norwegian_analyzer" + } + } + } +} + +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /norwegian-index/_analyze +{ + "field": "content", + "text": "Studentene studerer ved norske universiteter. Deres nummer er 123456." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "student","start_offset": 0,"end_offset": 10,"type": "","position": 0}, + {"token": "studer","start_offset": 11,"end_offset": 19,"type": "","position": 1}, + {"token": "norsk","start_offset": 24,"end_offset": 30,"type": "","position": 3}, + {"token": "universitet","start_offset": 31,"end_offset": 44,"type": "","position": 4}, + {"token": "numm","start_offset": 52,"end_offset": 58,"type": "","position": 6}, + {"token": "123456","start_offset": 62,"end_offset": 68,"type": "","position": 8} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/persian.md b/_analyzers/language-analyzers/persian.md new file mode 100644 index 0000000000..40b38656fd --- /dev/null +++ b/_analyzers/language-analyzers/persian.md @@ -0,0 +1,144 @@ +--- +layout: default +title: Persian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 250 +--- + +# Persian analyzer + +The built-in `persian` analyzer can be applied to a text field using the following command: + +```json +PUT /persian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "persian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_persian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_persian_analyzer": { + "type": "persian", + "stem_exclusion": ["حکومت", "تأیید"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Persian analyzer internals + +The `persian` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Char filter: `mapping` + +- Token filters: + - lowercase + - decimal_digit + - normalization (Arabic) + - normalization (Persian) + - keyword + - stemmer (Norwegian) + +## Custom Persian analyzer + +You can create a custom Persian analyzer using the following command: + +```json +PUT /persian-index +{ + "settings": { + "analysis": { + "filter": { + "persian_stop": { + "type": "stop", + "stopwords": "_persian_" + }, + "persian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "char_filter": { + "null_width_replace_with_space": { + "type": "mapping", + "mappings": [ "\\u200C=>\\u0020"] + } + }, + "analyzer": { + "persian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "char_filter": [ "null_width_replace_with_space" ], + "filter": [ + "lowercase", + "decimal_digit", + "arabic_normalization", + "persian_normalization", + "persian_stop" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "persian_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /persian-index/_analyze +{ + "field": "content", + "text": "دانشجویان در دانشگاه‌های ایرانی تحصیل می‌کنند. شماره‌های آن‌ها ۱۲۳۴۵۶ است." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "دانشجويان","start_offset": 0,"end_offset": 9,"type": "","position": 0}, + {"token": "دانشگاه","start_offset": 13,"end_offset": 20,"type": "","position": 2}, + {"token": "ايراني","start_offset": 25,"end_offset": 31,"type": "","position": 4}, + {"token": "تحصيل","start_offset": 32,"end_offset": 37,"type": "","position": 5}, + {"token": "شماره","start_offset": 47,"end_offset": 52,"type": "","position": 8}, + {"token": "123456","start_offset": 63,"end_offset": 69,"type": "","position": 12} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/portuguese.md b/_analyzers/language-analyzers/portuguese.md new file mode 100644 index 0000000000..166ffa0010 --- /dev/null +++ b/_analyzers/language-analyzers/portuguese.md @@ -0,0 +1,172 @@ +--- +layout: default +title: Portuguese +parent: Language analyzers +grand_parent: Analyzers +nav_order: 260 +--- + +# Portuguese analyzer + +The built-in `portuguese` analyzer can be applied to a text field using the following command: + +```json +PUT /portuguese-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "portuguese" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_portuguese_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_portuguese_analyzer": { + "type": "portuguese", + "stem_exclusion": ["autoridade", "aprovação"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Portuguese analyzer internals + +The `portuguese` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Portuguese) + - keyword + - stemmer (Portuguese) + +## Custom Portuguese analyzer + +You can create a custom Portuguese analyzer using the following command: + +```json +PUT /portuguese-index +{ + "settings": { + "analysis": { + "filter": { + "portuguese_stop": { + "type": "stop", + "stopwords": "_portuguese_" + }, + "portuguese_stemmer": { + "type": "stemmer", + "language": "light_portuguese" + }, + "portuguese_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "portuguese_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "portuguese_stop", + "portuguese_keywords", + "portuguese_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "portuguese_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /portuguese-index/_analyze +{ + "field": "content", + "text": "Os estudantes estudam nas universidades brasileiras. Seus números são 123456." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "estudant", + "start_offset": 3, + "end_offset": 13, + "type": "", + "position": 1 + }, + { + "token": "estudam", + "start_offset": 14, + "end_offset": 21, + "type": "", + "position": 2 + }, + { + "token": "universidad", + "start_offset": 26, + "end_offset": 39, + "type": "", + "position": 4 + }, + { + "token": "brasileir", + "start_offset": 40, + "end_offset": 51, + "type": "", + "position": 5 + }, + { + "token": "numer", + "start_offset": 58, + "end_offset": 65, + "type": "", + "position": 7 + }, + { + "token": "123456", + "start_offset": 70, + "end_offset": 76, + "type": "", + "position": 9 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/romanian.md b/_analyzers/language-analyzers/romanian.md new file mode 100644 index 0000000000..cad0953385 --- /dev/null +++ b/_analyzers/language-analyzers/romanian.md @@ -0,0 +1,172 @@ +--- +layout: default +title: Romanian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 270 +--- + +# Romanian analyzer + +The built-in `romanian` analyzer can be applied to a text field using the following command: + +```json +PUT /romanian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "romanian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_romanian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_romanian_analyzer": { + "type": "romanian", + "stem_exclusion": ["autoritate", "aprobat"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Romanian analyzer internals + +The `romanian` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Romanian) + - keyword + - stemmer (Romanian) + +## Custom Romanian analyzer + +You can create a custom Romanian analyzer using the following command: + +```json +PUT /romanian-index +{ + "settings": { + "analysis": { + "filter": { + "romanian_stop": { + "type": "stop", + "stopwords": "_romanian_" + }, + "romanian_stemmer": { + "type": "stemmer", + "language": "romanian" + }, + "romanian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "romanian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "romanian_stop", + "romanian_keywords", + "romanian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "romanian_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /romanian-index/_analyze +{ + "field": "content", + "text": "Studenții învață la universitățile din România. Numerele lor sunt 123456." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "studenț", + "start_offset": 0, + "end_offset": 9, + "type": "", + "position": 0 + }, + { + "token": "învaț", + "start_offset": 10, + "end_offset": 16, + "type": "", + "position": 1 + }, + { + "token": "universităț", + "start_offset": 20, + "end_offset": 34, + "type": "", + "position": 3 + }, + { + "token": "român", + "start_offset": 39, + "end_offset": 46, + "type": "", + "position": 5 + }, + { + "token": "numer", + "start_offset": 48, + "end_offset": 56, + "type": "", + "position": 6 + }, + { + "token": "123456", + "start_offset": 66, + "end_offset": 72, + "type": "", + "position": 9 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/russian.md b/_analyzers/language-analyzers/russian.md new file mode 100644 index 0000000000..bd57ba0b27 --- /dev/null +++ b/_analyzers/language-analyzers/russian.md @@ -0,0 +1,172 @@ +--- +layout: default +title: Russian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 280 +--- + +# Russian analyzer + +The built-in `russian` analyzer can be applied to a text field using the following command: + +```json +PUT /russian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "russian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_russian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_russian_analyzer": { + "type": "russian", + "stem_exclusion": ["авторитет", "одобрение"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Russian analyzer internals + +The `russian` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Russian) + - keyword + - stemmer (Russian) + +## Custom Russian analyzer + +You can create a custom Russian analyzer using the following command: + +```json +PUT /russian-index +{ + "settings": { + "analysis": { + "filter": { + "russian_stop": { + "type": "stop", + "stopwords": "_russian_" + }, + "russian_stemmer": { + "type": "stemmer", + "language": "russian" + }, + "russian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "russian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "russian_stop", + "russian_keywords", + "russian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "russian_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /russian-index/_analyze +{ + "field": "content", + "text": "Студенты учатся в университетах России. Их номера 123456." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "студент", + "start_offset": 0, + "end_offset": 8, + "type": "", + "position": 0 + }, + { + "token": "учат", + "start_offset": 9, + "end_offset": 15, + "type": "", + "position": 1 + }, + { + "token": "университет", + "start_offset": 18, + "end_offset": 31, + "type": "", + "position": 3 + }, + { + "token": "росс", + "start_offset": 32, + "end_offset": 38, + "type": "", + "position": 4 + }, + { + "token": "номер", + "start_offset": 43, + "end_offset": 49, + "type": "", + "position": 6 + }, + { + "token": "123456", + "start_offset": 50, + "end_offset": 56, + "type": "", + "position": 7 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/sorani.md b/_analyzers/language-analyzers/sorani.md new file mode 100644 index 0000000000..f71d43c481 --- /dev/null +++ b/_analyzers/language-analyzers/sorani.md @@ -0,0 +1,168 @@ +--- +layout: default +title: Sorani +parent: Language analyzers +grand_parent: Analyzers +nav_order: 290 +--- + +# Sorani analyzer + +The built-in `sorani` analyzer can be applied to a text field using the following command: + +```json +PUT /sorani-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "sorani" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_sorani_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_sorani_analyzer": { + "type": "sorani", + "stem_exclusion": ["مؤسسه", "اجازه"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Sorani analyzer internals + +The `sorani` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - normalization (Sorani) + - lowercase + - decimal_digit + - stop (Sorani) + - keyword + - stemmer (Sorani) + +## Custom Sorani analyzer + +You can create a custom Sorani analyzer using the following command: + +```json +PUT /sorani-index +{ + "settings": { + "analysis": { + "filter": { + "sorani_stop": { + "type": "stop", + "stopwords": "_sorani_" + }, + "sorani_stemmer": { + "type": "stemmer", + "language": "sorani" + }, + "sorani_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "sorani_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "decimal_digit", + "sorani_stop", + "sorani_keywords", + "sorani_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "sorani_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /sorani-index/_analyze +{ + "field": "content", + "text": "خوێندنی فەرمی لە هەولێرەوە. ژمارەکان ١٢٣٤٥٦." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "خوێندن", + "start_offset": 0, + "end_offset": 7, + "type": "", + "position": 0 + }, + { + "token": "فەرم", + "start_offset": 8, + "end_offset": 13, + "type": "", + "position": 1 + }, + { + "token": "هەولێر", + "start_offset": 17, + "end_offset": 26, + "type": "", + "position": 3 + }, + { + "token": "ژمار", + "start_offset": 28, + "end_offset": 36, + "type": "", + "position": 4 + }, + { + "token": "123456", + "start_offset": 37, + "end_offset": 43, + "type": "", + "position": 5 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/spanish.md b/_analyzers/language-analyzers/spanish.md new file mode 100644 index 0000000000..8a0d8fad3c --- /dev/null +++ b/_analyzers/language-analyzers/spanish.md @@ -0,0 +1,172 @@ +--- +layout: default +title: Spanish +parent: Language analyzers +grand_parent: Analyzers +nav_order: 300 +--- + +# Spanish analyzer + +The built-in `spanish` analyzer can be applied to a text field using the following command: + +```json +PUT /spanish-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "spanish" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_spanish_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_spanish_analyzer": { + "type": "spanish", + "stem_exclusion": ["autoridad", "aprobación"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Spanish analyzer internals + +The `spanish` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Spanish) + - keyword + - stemmer (Spanish) + +## Custom Spanish analyzer + +You can create a custom Spanish analyzer using the following command: + +```json +PUT /spanish-index +{ + "settings": { + "analysis": { + "filter": { + "spanish_stop": { + "type": "stop", + "stopwords": "_spanish_" + }, + "spanish_stemmer": { + "type": "stemmer", + "language": "light_spanish" + }, + "spanish_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "spanish_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "spanish_stop", + "spanish_keywords", + "spanish_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "spanish_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /spanish-index/_analyze +{ + "field": "content", + "text": "Los estudiantes estudian en universidades españolas. Sus números son 123456." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "estudiant", + "start_offset": 4, + "end_offset": 15, + "type": "", + "position": 1 + }, + { + "token": "estudian", + "start_offset": 16, + "end_offset": 24, + "type": "", + "position": 2 + }, + { + "token": "universidad", + "start_offset": 28, + "end_offset": 41, + "type": "", + "position": 4 + }, + { + "token": "español", + "start_offset": 42, + "end_offset": 51, + "type": "", + "position": 5 + }, + { + "token": "numer", + "start_offset": 57, + "end_offset": 64, + "type": "", + "position": 7 + }, + { + "token": "123456", + "start_offset": 69, + "end_offset": 75, + "type": "", + "position": 9 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/swedish.md b/_analyzers/language-analyzers/swedish.md new file mode 100644 index 0000000000..9da595f12e --- /dev/null +++ b/_analyzers/language-analyzers/swedish.md @@ -0,0 +1,172 @@ +--- +layout: default +title: Swedish +parent: Language analyzers +grand_parent: Analyzers +nav_order: 310 +--- + +# Swedish analyzer + +The built-in `swedish` analyzer can be applied to a text field using the following command: + +```json +PUT /swedish-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "swedish" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_swedish_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_swedish_analyzer": { + "type": "swedish", + "stem_exclusion": ["myndighet", "godkännande"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Swedish analyzer internals + +The `swedish` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Swedish) + - keyword + - stemmer (Swedish) + +## Custom Swedish analyzer + +You can create a custom Swedish analyzer using the following command: + +```json +PUT /swedish-index +{ + "settings": { + "analysis": { + "filter": { + "swedish_stop": { + "type": "stop", + "stopwords": "_swedish_" + }, + "swedish_stemmer": { + "type": "stemmer", + "language": "swedish" + }, + "swedish_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "swedish_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "swedish_stop", + "swedish_keywords", + "swedish_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "swedish_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /swedish-index/_analyze +{ + "field": "content", + "text": "Studenter studerar vid svenska universitet. Deras nummer är 123456." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "student", + "start_offset": 0, + "end_offset": 9, + "type": "", + "position": 0 + }, + { + "token": "studer", + "start_offset": 10, + "end_offset": 18, + "type": "", + "position": 1 + }, + { + "token": "svensk", + "start_offset": 23, + "end_offset": 30, + "type": "", + "position": 3 + }, + { + "token": "universitet", + "start_offset": 31, + "end_offset": 42, + "type": "", + "position": 4 + }, + { + "token": "numm", + "start_offset": 50, + "end_offset": 56, + "type": "", + "position": 6 + }, + { + "token": "123456", + "start_offset": 60, + "end_offset": 66, + "type": "", + "position": 8 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/thai.md b/_analyzers/language-analyzers/thai.md new file mode 100644 index 0000000000..e4daa1f0be --- /dev/null +++ b/_analyzers/language-analyzers/thai.md @@ -0,0 +1,132 @@ +--- +layout: default +title: Thai +parent: Language analyzers +grand_parent: Analyzers +nav_order: 320 +--- + +# Thai analyzer + +The built-in `thai` analyzer can be applied to a text field using the following command: + +```json +PUT /thai-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "thai" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_thai_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_thai_analyzer": { + "type": "thai", + "stem_exclusion": ["อำนาจ", "การอนุมัติ"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Thai analyzer internals + +The `thai` analyzer is built using the following components: + +- Tokenizer: `thai` + +- Token filters: + - lowercase + - decimal_digit + - stop (Thai) + - keyword + +## Custom Thai analyzer + +You can create a custom Thai analyzer using the following command: + +```json +PUT /thai-index +{ + "settings": { + "analysis": { + "filter": { + "thai_stop": { + "type": "stop", + "stopwords": "_thai_" + }, + "thai_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "thai_analyzer": { + "tokenizer": "thai", + "filter": [ + "lowercase", + "decimal_digit", + "thai_stop", + "thai_keywords" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "thai_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /thai-index/_analyze +{ + "field": "content", + "text": "นักเรียนกำลังศึกษาอยู่ที่มหาวิทยาลัยไทย หมายเลข 123456." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "นักเรียน","start_offset": 0,"end_offset": 8,"type": "word","position": 0}, + {"token": "กำลัง","start_offset": 8,"end_offset": 13,"type": "word","position": 1}, + {"token": "ศึกษา","start_offset": 13,"end_offset": 18,"type": "word","position": 2}, + {"token": "มหาวิทยาลัย","start_offset": 25,"end_offset": 36,"type": "word","position": 5}, + {"token": "ไทย","start_offset": 36,"end_offset": 39,"type": "word","position": 6}, + {"token": "หมายเลข","start_offset": 40,"end_offset": 47,"type": "word","position": 7}, + {"token": "123456","start_offset": 48,"end_offset": 54,"type": "word","position": 8} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/turkish.md b/_analyzers/language-analyzers/turkish.md new file mode 100644 index 0000000000..fb36c5413c --- /dev/null +++ b/_analyzers/language-analyzers/turkish.md @@ -0,0 +1,143 @@ +--- +layout: default +title: Turkish +parent: Language analyzers +grand_parent: Analyzers +nav_order: 330 +--- + +# Turkish analyzer + +The built-in `turkish` analyzer can be applied to a text field using the following command: + +```json +PUT /turkish-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "turkish" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_turkish_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_turkish_analyzer": { + "type": "turkish", + "stem_exclusion": ["otorite", "onay"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Turkish analyzer internals + +The `turkish` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - apostrophe + - lowercase (Turkish) + - stop (Turkish) + - keyword + - stemmer (Turkish) + +## Custom Turkish analyzer + +You can create a custom Turkish analyzer using the following command: + +```json +PUT /turkish-index +{ + "settings": { + "analysis": { + "filter": { + "turkish_stop": { + "type": "stop", + "stopwords": "_turkish_" + }, + "turkish_stemmer": { + "type": "stemmer", + "language": "turkish" + }, + "turkish_lowercase": { + "type": "lowercase", + "language": "turkish" + }, + "turkish_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "turkish_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "apostrophe", + "turkish_lowercase", + "turkish_stop", + "turkish_keywords", + "turkish_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "turkish_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /turkish-index/_analyze +{ + "field": "content", + "text": "Öğrenciler Türk üniversitelerinde öğrenim görüyor. Numara 123456." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "öğrenci","start_offset": 0,"end_offset": 10,"type": "","position": 0}, + {"token": "türk","start_offset": 11,"end_offset": 15,"type": "","position": 1}, + {"token": "üniversite","start_offset": 16,"end_offset": 33,"type": "","position": 2}, + {"token": "öğre","start_offset": 34,"end_offset": 41,"type": "","position": 3}, + {"token": "görüyor","start_offset": 42,"end_offset": 49,"type": "","position": 4}, + {"token": "numar","start_offset": 51,"end_offset": 57,"type": "","position": 5}, + {"token": "123456","start_offset": 58,"end_offset": 64,"type": "","position": 6} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/normalizers.md b/_analyzers/normalizers.md index b89659f814..52841d2571 100644 --- a/_analyzers/normalizers.md +++ b/_analyzers/normalizers.md @@ -1,7 +1,7 @@ --- layout: default title: Normalizers -nav_order: 100 +nav_order: 110 --- # Normalizers diff --git a/_analyzers/supported-analyzers/index.md b/_analyzers/supported-analyzers/index.md index af6ce6c3a6..43e41b8d6a 100644 --- a/_analyzers/supported-analyzers/index.md +++ b/_analyzers/supported-analyzers/index.md @@ -24,9 +24,18 @@ Analyzer | Analysis performed | Analyzer output **Stop** | - Parses strings into tokens on any non-letter character
- Removes non-letter characters
- Removes stop words
- Converts tokens to lowercase | [`s`, `fun`, `contribute`, `brand`, `new`, `pr`, `opensearch`] **Keyword** (no-op) | - Outputs the entire string unchanged | [`It’s fun to contribute a brand-new PR or 2 to OpenSearch!`] **Pattern** | - Parses strings into tokens using regular expressions
- Supports converting strings to lowercase
- Supports removing stop words | [`it`, `s`, `fun`, `to`, `contribute`, `a`,`brand`, `new`, `pr`, `or`, `2`, `to`, `opensearch`] -[**Language**]({{site.url}}{{site.baseurl}}/analyzers/language-analyzers/) | Performs analysis specific to a certain language (for example, `english`). | [`fun`, `contribut`, `brand`, `new`, `pr`, `2`, `opensearch`] +[**Language**]({{site.url}}{{site.baseurl}}/analyzers/language-analyzers/index/) | Performs analysis specific to a certain language (for example, `english`). | [`fun`, `contribut`, `brand`, `new`, `pr`, `2`, `opensearch`] **Fingerprint** | - Parses strings on any non-letter character
- Normalizes characters by converting them to ASCII
- Converts tokens to lowercase
- Sorts, deduplicates, and concatenates tokens into a single token
- Supports removing stop words | [`2 a brand contribute fun it's new opensearch or pr to`]
Note that the apostrophe was converted to its ASCII counterpart. ## Language analyzers -OpenSearch supports analyzers for various languages. For more information, see [Language analyzers]({{site.url}}{{site.baseurl}}/analyzers/language-analyzers/). \ No newline at end of file +OpenSearch supports multiple language analyzers. For more information, see [Language analyzers]({{site.url}}{{site.baseurl}}/analyzers/language-analyzers/index). + +## Additional analyzers + +The following table lists the additional analyzers that OpenSearch supports. + +| Analyzer | Analysis performed | +|:---------------|:---------------------------------------------------------------------------------------------------------| +| `phone` | An [index analyzer]({{site.url}}{{site.baseurl}}/analyzers/index-analyzers/) for parsing phone numbers. | +| `phone-search` | A [search analyzer]({{site.url}}{{site.baseurl}}/analyzers/search-analyzers/) for parsing phone numbers. | diff --git a/_analyzers/supported-analyzers/phone-analyzers.md b/_analyzers/supported-analyzers/phone-analyzers.md new file mode 100644 index 0000000000..f24b7cf328 --- /dev/null +++ b/_analyzers/supported-analyzers/phone-analyzers.md @@ -0,0 +1,128 @@ +--- +layout: default +title: Phone number +parent: Analyzers +nav_order: 140 +--- + +# Phone number analyzers + +The `analysis-phonenumber` plugin provides analyzers and tokenizers for parsing phone numbers. A dedicated analyzer is required because parsing phone numbers is a non-trivial task (even though it might seem trivial at first glance). For common misconceptions regarding phone number parsing, see [Falsehoods programmers believe about phone numbers](https://github.com/google/libphonenumber/blob/master/FALSEHOODS.md). + + +OpenSearch supports the following phone number analyzers: + +* [`phone`](#the-phone-analyzer): An [index analyzer]({{site.url}}{{site.baseurl}}/analyzers/index-analyzers/) to use at indexing time. +* [`phone-search`](#the-phone-search-analyzer): A [search analyzer]({{site.url}}{{site.baseurl}}/analyzers/search-analyzers/) to use at search time. + +Internally, the plugin uses the [`libphonenumber`](https://github.com/google/libphonenumber) library and follows its parsing rules. + +The phone number analyzers are not meant to find phone numbers in larger texts. Instead, you should use them on fields that only contain phone numbers. +{: .note} + +## Installing the plugin + +Before you can use the phone number analyzers, you must install the `analysis-phonenumber` plugin by running the following command: + +```sh +./bin/opensearch-plugin install analysis-phonenumber +``` + +## Specifying a default region + +You can optionally specify a default region for parsing phone numbers by providing the `phone-region` parameter within the analyzer. Valid phone regions are represented by ISO 3166 country codes. For more information, see [List of ISO 3166 country codes](https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes). + +When tokenizing phone numbers containing the international calling prefix `+`, the default region is irrelevant. However, for phone numbers that use a national prefix for international numbers (for example, `001` instead of `+1` to dial Northern America from most European countries), the region needs to be provided. You can also properly index local phone numbers with no international prefix by specifying the region. + +## Example + +The following request creates an index containing one field that ingests phone numbers for Switzerland (region code `CH`): + +```json +PUT /example-phone +{ + "settings": { + "analysis": { + "analyzer": { + "phone-ch": { + "type": "phone", + "phone-region": "CH" + }, + "phone-search-ch": { + "type": "phone-search", + "phone-region": "CH" + } + } + } + }, + "mappings": { + "properties": { + "phone_number": { + "type": "text", + "analyzer": "phone-ch", + "search_analyzer": "phone-search-ch" + } + } + } +} +``` +{% include copy-curl.html %} + +## The phone analyzer + +The `phone` analyzer generates n-grams based on the given phone number. A (fictional) Swiss phone number containing an international calling prefix can be parsed with or without the Swiss-specific phone region. Thus, the following two requests will produce the same result: + +```json +GET /example-phone/_analyze +{ + "analyzer" : "phone-ch", + "text" : "+41 60 555 12 34" +} +``` +{% include copy-curl.html %} + +```json +GET /example-phone/_analyze +{ + "analyzer" : "phone", + "text" : "+41 60 555 12 34" +} +``` +{% include copy-curl.html %} + +The response contains the generated n-grams: + +```json +["+41 60 555 12 34", "6055512", "41605551", "416055512", "6055", "41605551234", ...] +``` + +However, if you specify the phone number without the international calling prefix `+` (either by using `0041` or omitting +the international calling prefix altogether), then only the analyzer configured with the correct phone region can parse the number: + +```json +GET /example-phone/_analyze +{ + "analyzer" : "phone-ch", + "text" : "060 555 12 34" +} +``` +{% include copy-curl.html %} + +## The phone-search analyzer + +In contrast, the `phone-search` analyzer does not create n-grams and only issues some basic tokens. For example, send the following request and specify the `phone-search` analyzer: + +```json +GET /example-phone/_analyze +{ + "analyzer" : "phone-search", + "text" : "+41 60 555 12 34" +} +``` +{% include copy-curl.html %} + +The response contains the following tokens: + +```json +["+41 60 555 12 34", "41 60 555 12 34", "41605551234", "605551234", "41"] +``` diff --git a/_analyzers/token-filters/common_gram.md b/_analyzers/token-filters/common_gram.md new file mode 100644 index 0000000000..58f5bbe149 --- /dev/null +++ b/_analyzers/token-filters/common_gram.md @@ -0,0 +1,94 @@ +--- +layout: default +title: Common grams +parent: Token filters +nav_order: 60 +--- + +# Common grams token filter + +The `common_grams` token filter improves search relevance by keeping commonly occurring phrases (common grams) in the text. This is useful when dealing with languages or datasets in which certain word combinations frequently occur as a unit and can impact search relevance if treated as separate tokens. If any common words are present in the input string, this token filter generates both their unigrams and bigrams. + +Using this token filter improves search relevance by keeping common phrases intact. This can help in matching queries more accurately, particularly for frequent word combinations. It also improves search precision by reducing the number of irrelevant matches. + +When using this filter, you must carefully select and maintain the `common_words` list. +{: .warning} + +## Parameters + +The `common_grams` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`common_words` | Required | List of strings | A list of words that should be treated as words that commonly appear together. These words will be used to generate common grams. If the `common_words` parameter is an empty list, the `common_grams` token filter becomes a no-op filter, meaning that it doesn't modify the input tokens at all. +`ignore_case` | Optional | Boolean | Indicates whether the filter should ignore case differences when matching common words. Default is `false`. +`query_mode` | Optional | Boolean | When set to `true`, the following rules are applied:
- Unigrams that are generated from `common_words` are not included in the output.
- Bigrams in which a non-common word is followed by a common word are retained in the output.
- Unigrams of non-common words are excluded if they are immediately followed by a common word.
- If a non-common word appears at the end of the text and is preceded by a common word, its unigram is not included in the output. + + +## Example + +The following example request creates a new index named `my_common_grams_index` and configures an analyzer with the `common_grams` filter: + +```json +PUT /my_common_grams_index +{ + "settings": { + "analysis": { + "filter": { + "my_common_grams_filter": { + "type": "common_grams", + "common_words": ["a", "in", "for"], + "ignore_case": true, + "query_mode": true + } + }, + "analyzer": { + "my_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_common_grams_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my_common_grams_index/_analyze +{ + "analyzer": "my_analyzer", + "text": "A quick black cat jumps over the lazy dog in the park" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "a_quick","start_offset": 0,"end_offset": 7,"type": "gram","position": 0}, + {"token": "quick","start_offset": 2,"end_offset": 7,"type": "","position": 1}, + {"token": "black","start_offset": 8,"end_offset": 13,"type": "","position": 2}, + {"token": "cat","start_offset": 14,"end_offset": 17,"type": "","position": 3}, + {"token": "jumps","start_offset": 18,"end_offset": 23,"type": "","position": 4}, + {"token": "over","start_offset": 24,"end_offset": 28,"type": "","position": 5}, + {"token": "the","start_offset": 29,"end_offset": 32,"type": "","position": 6}, + {"token": "lazy","start_offset": 33,"end_offset": 37,"type": "","position": 7}, + {"token": "dog_in","start_offset": 38,"end_offset": 44,"type": "gram","position": 8}, + {"token": "in_the","start_offset": 42,"end_offset": 48,"type": "gram","position": 9}, + {"token": "the","start_offset": 45,"end_offset": 48,"type": "","position": 10}, + {"token": "park","start_offset": 49,"end_offset": 53,"type": "","position": 11} + ] +} +``` + diff --git a/_analyzers/token-filters/condition.md b/_analyzers/token-filters/condition.md new file mode 100644 index 0000000000..5e87c2cbbf --- /dev/null +++ b/_analyzers/token-filters/condition.md @@ -0,0 +1,135 @@ +--- +layout: default +title: Condition +parent: Token filters +nav_order: 70 +--- + +# Condition token filter + +The `condition` token filter is a special type of filter that allows you to apply other token filters conditionally based on certain criteria. This provides more control over when certain token filters should be applied during text analysis. +Multiple filters can be configured and only applied when they meet the conditions you define. +This token filter can be very useful for language-specific processing and handling of special characters. + + +## Parameters + +There are two parameters that must be configured in order to use the `condition` token filter. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`filter` | Required | Array | Specifies which token filters should be applied to the tokens when the specified condition (defined by the `script` parameter) is met. +`script` | Required | Object | Configures an [inline script]({{site.url}}{{site.baseurl}}/api-reference/script-apis/exec-script/) that defines the condition that needs to be met in order for the filters specified in the `filter` parameter to be applied (only inline scripts are accepted). + + +## Example + +The following example request creates a new index named `my_conditional_index` and configures an analyzer with a `condition` filter. This filter applies a `lowercase` filter to any tokens that contain the character sequence "um": + +```json +PUT /my_conditional_index +{ + "settings": { + "analysis": { + "filter": { + "my_conditional_filter": { + "type": "condition", + "filter": ["lowercase"], + "script": { + "source": "token.getTerm().toString().contains('um')" + } + } + }, + "analyzer": { + "my_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "my_conditional_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my_conditional_index/_analyze +{ + "analyzer": "my_analyzer", + "text": "THE BLACK CAT JUMPS OVER A LAZY DOG" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "THE", + "start_offset": 0, + "end_offset": 3, + "type": "", + "position": 0 + }, + { + "token": "BLACK", + "start_offset": 4, + "end_offset": 9, + "type": "", + "position": 1 + }, + { + "token": "CAT", + "start_offset": 10, + "end_offset": 13, + "type": "", + "position": 2 + }, + { + "token": "jumps", + "start_offset": 14, + "end_offset": 19, + "type": "", + "position": 3 + }, + { + "token": "OVER", + "start_offset": 20, + "end_offset": 24, + "type": "", + "position": 4 + }, + { + "token": "A", + "start_offset": 25, + "end_offset": 26, + "type": "", + "position": 5 + }, + { + "token": "LAZY", + "start_offset": 27, + "end_offset": 31, + "type": "", + "position": 6 + }, + { + "token": "DOG", + "start_offset": 32, + "end_offset": 35, + "type": "", + "position": 7 + } + ] +} +``` + diff --git a/_analyzers/token-filters/decimal-digit.md b/_analyzers/token-filters/decimal-digit.md new file mode 100644 index 0000000000..002375f7e5 --- /dev/null +++ b/_analyzers/token-filters/decimal-digit.md @@ -0,0 +1,88 @@ +--- +layout: default +title: Decimal digit +parent: Token filters +nav_order: 80 +--- + +# Decimal digit token filter + +The `decimal_digit` token filter is used to normalize decimal digit characters (0--9) into their ASCII equivalents in various scripts. This is useful when you want to ensure that all digits are treated uniformly in text analysis, regardless of the script in which they are written. + + +## Example + +The following example request creates a new index named `my_index` and configures an analyzer with a `decimal_digit` filter: + +```json +PUT /my_index +{ + "settings": { + "analysis": { + "filter": { + "my_decimal_digit_filter": { + "type": "decimal_digit" + } + }, + "analyzer": { + "my_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": ["my_decimal_digit_filter"] + } + } + } + } +} + +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_index/_analyze +{ + "analyzer": "my_analyzer", + "text": "123 ١٢٣ १२३" +} +``` +{% include copy-curl.html %} + +`text` breakdown: + + - "123" (ASCII digits) + - "١٢٣" (Arabic-Indic digits) + - "१२३" (Devanagari digits) + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "123", + "start_offset": 0, + "end_offset": 3, + "type": "", + "position": 0 + }, + { + "token": "123", + "start_offset": 4, + "end_offset": 7, + "type": "", + "position": 1 + }, + { + "token": "123", + "start_offset": 8, + "end_offset": 11, + "type": "", + "position": 2 + } + ] +} +``` diff --git a/_analyzers/token-filters/delimited-payload.md b/_analyzers/token-filters/delimited-payload.md new file mode 100644 index 0000000000..f17bb1b1ce --- /dev/null +++ b/_analyzers/token-filters/delimited-payload.md @@ -0,0 +1,211 @@ +--- +layout: default +title: Delimited payload +parent: Token filters +nav_order: 90 +--- + +# Delimited payload token filter + +The `delimited_payload` token filter is used to parse tokens containing payloads during the analysis process. For example, the string `red|1.5 fast|2.0 car|1.0` is parsed into the tokens `red` (with a payload of `1.5`), `fast` (with a payload of `2.0`), and `car` (with a payload of `1.0`). This is particularly useful when your tokens include additional associated data (like weights, scores, or other numeric values) that you can use for scoring or custom query logic. The filter can handle different types of payloads, including integers, floats, and strings, and attach payloads (extra metadata) to tokens. + +When analyzing text, the `delimited_payload` token filter parses each token, extracts the payload, and attaches it to the token. This payload can later be used in queries to influence scoring, boosting, or other custom behaviors. + +Payloads are stored as Base64-encoded strings. By default, payloads are not returned in the query response along with the tokens. To return the payloads, you must configure additional parameters. For more information, see [Example with a stored payload]({{site.url}}{{site.baseurl}}/analyzers/token-filters/delimited-payload/#example-without-a-stored-payload). + +## Parameters + +The `delimited_payload` token filter has two parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`encoding` | Optional | String | Specifies the data type of the payload attached to the tokens. This determines how the payload data is interpreted during analysis and querying.
Valid values are:

- `float`: The payload is interpreted as a 32-bit floating-point number using IEEE 754 format (for example, `2.5` in `car|2.5`).
- `identity`: The payload is interpreted as a sequence of characters (for example, in `user|admin`, `admin` is interpreted as a string).
- `int`: The payload is interpreted as a 32-bit integer (for example, `1` in `priority|1`).
Default is `float`. +`delimiter` | Optional | String | Specifies the character that separates the token from its payload in the input text. Default is the pipe character (`|`). + +## Example without a stored payload + +The following example request creates a new index named `my_index` and configures an analyzer with a `delimited_payload` filter: + +```json +PUT /my_index +{ + "settings": { + "analysis": { + "filter": { + "my_payload_filter": { + "type": "delimited_payload", + "delimiter": "|", + "encoding": "float" + } + }, + "analyzer": { + "my_analyzer": { + "type": "custom", + "tokenizer": "whitespace", + "filter": ["my_payload_filter"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_index/_analyze +{ + "analyzer": "my_analyzer", + "text": "red|1.5 fast|2.0 car|1.0" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "red", + "start_offset": 0, + "end_offset": 7, + "type": "word", + "position": 0 + }, + { + "token": "fast", + "start_offset": 8, + "end_offset": 16, + "type": "word", + "position": 1 + }, + { + "token": "car", + "start_offset": 17, + "end_offset": 24, + "type": "word", + "position": 2 + } + ] +} +``` + +## Example with a stored payload + +To configure the payload to be returned in the response, create an index that stores term vectors and set `term_vector` to `with_positions_payloads` or `with_positions_offsets_payloads` in the index mappings. For example, the following index is configured to store term vectors: + +```json +PUT /visible_payloads +{ + "mappings": { + "properties": { + "text": { + "type": "text", + "term_vector": "with_positions_payloads", + "analyzer": "custom_analyzer" + } + } + }, + "settings": { + "analysis": { + "filter": { + "my_payload_filter": { + "type": "delimited_payload", + "delimiter": "|", + "encoding": "float" + } + }, + "analyzer": { + "custom_analyzer": { + "tokenizer": "whitespace", + "filter": [ "my_payload_filter" ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +You can index a document into this index using the following request: + +```json +PUT /visible_payloads/_doc/1 +{ + "text": "red|1.5 fast|2.0 car|1.0" +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /visible_payloads/_termvectors/1 +{ + "fields": ["text"] +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens, which include payloads: + +```json +{ + "_index": "visible_payloads", + "_id": "1", + "_version": 1, + "found": true, + "took": 3, + "term_vectors": { + "text": { + "field_statistics": { + "sum_doc_freq": 3, + "doc_count": 1, + "sum_ttf": 3 + }, + "terms": { + "brown": { + "term_freq": 1, + "tokens": [ + { + "position": 1, + "start_offset": 10, + "end_offset": 19, + "payload": "QEAAAA==" + } + ] + }, + "fox": { + "term_freq": 1, + "tokens": [ + { + "position": 2, + "start_offset": 20, + "end_offset": 27, + "payload": "P8AAAA==" + } + ] + }, + "quick": { + "term_freq": 1, + "tokens": [ + { + "position": 0, + "start_offset": 0, + "end_offset": 9, + "payload": "QCAAAA==" + } + ] + } + } + } + } +} +``` +{% include copy-curl.html %} diff --git a/_analyzers/token-filters/dictionary-decompounder.md b/_analyzers/token-filters/dictionary-decompounder.md new file mode 100644 index 0000000000..ced6fd6fbc --- /dev/null +++ b/_analyzers/token-filters/dictionary-decompounder.md @@ -0,0 +1,101 @@ +--- +layout: default +title: Dictionary decompounder +parent: Token filters +nav_order: 110 +--- + +# Dictionary decompounder token filter + +The `dictionary_decompounder` token filter is used to split compound words into their constituent parts based on a predefined dictionary. This filter is particularly useful for languages like German, Dutch, or Finnish, in which compound words are common, so breaking them down can improve search relevance. The `dictionary_decompounder` token filter determines whether each token (word) can be split into smaller tokens based on a list of known words. If the token can be split into known words, the filter generates the subtokens for the token. + +## Parameters + +The `dictionary_decompounder` token filter has the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`word_list` | Required unless `word_list_path` is configured | Array of strings | The dictionary of words that the filter uses to split compound words. +`word_list_path` | Required unless `word_list` is configured | String | A file path to a text file containing the dictionary words. Accepts either an absolute path or a path relative to the `config` directory. The dictionary file must be UTF-8 encoded, and each word must be listed on a separate line. +`min_word_size` | Optional | Integer | The minimum length of the entire compound word that will be considered for splitting. If a compound word is shorter than this value, it is not split. Default is `5`. +`min_subword_size` | Optional | Integer | The minimum length for any subword. If a subword is shorter than this value, it is not included in the output. Default is `2`. +`max_subword_size` | Optional | Integer | The maximum length for any subword. If a subword is longer than this value, it is not included in the output. Default is `15`. +`only_longest_match` | Optional | Boolean | If set to `true`, only the longest matching subword will be returned. Default is `false`. + +## Example + +The following example request creates a new index named `decompound_example` and configures an analyzer with the `dictionary_decompounder` filter: + +```json +PUT /decompound_example +{ + "settings": { + "analysis": { + "filter": { + "my_dictionary_decompounder": { + "type": "dictionary_decompounder", + "word_list": ["slow", "green", "turtle"] + } + }, + "analyzer": { + "my_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": ["lowercase", "my_dictionary_decompounder"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /decompound_example/_analyze +{ + "analyzer": "my_analyzer", + "text": "slowgreenturtleswim" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "slowgreenturtleswim", + "start_offset": 0, + "end_offset": 19, + "type": "", + "position": 0 + }, + { + "token": "slow", + "start_offset": 0, + "end_offset": 19, + "type": "", + "position": 0 + }, + { + "token": "green", + "start_offset": 0, + "end_offset": 19, + "type": "", + "position": 0 + }, + { + "token": "turtle", + "start_offset": 0, + "end_offset": 19, + "type": "", + "position": 0 + } + ] +} +``` diff --git a/_analyzers/token-filters/edge-ngram.md b/_analyzers/token-filters/edge-ngram.md new file mode 100644 index 0000000000..be3eaf6fab --- /dev/null +++ b/_analyzers/token-filters/edge-ngram.md @@ -0,0 +1,111 @@ +--- +layout: default +title: Edge n-gram +parent: Token filters +nav_order: 120 +--- +# Edge n-gram token filter +The `edge_ngram` token filter is very similar to the `ngram` token filter, where a particular string is split into substrings of different lengths. The `edge_ngram` token filter, however, generates n-grams (substrings) only from the beginning (edge) of a token. It's particularly useful in scenarios like autocomplete or prefix matching, where you want to match the beginning of words or phrases as the user types them. + +## Parameters + +The `edge_ngram` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`min_gram` | Optional | Integer | The minimum length of the n-grams that will be generated. Default is `1`. +`max_gram` | Optional | Integer | The maximum length of the n-grams that will be generated. Default is `1` for the `edge_ngram` filter and `2` for custom token filters. Avoid setting this parameter to a low value. If the value is set too low, only very short n-grams will be generated and the search term will not be found. For example, if `max_gram` is set to `3` and you index the word "banana", the longest generated token will be "ban". If the user searches for "banana", no matches will be returned. You can use the `truncate` token filter as a search analyzer to mitigate this risk. +`preserve_original` | Optional | Boolean | Includes the original token in the output. Default is `false` . + +## Example + +The following example request creates a new index named `edge_ngram_example` and configures an analyzer with the `edge_ngram` filter: + +```json +PUT /edge_ngram_example +{ + "settings": { + "analysis": { + "filter": { + "my_edge_ngram": { + "type": "edge_ngram", + "min_gram": 3, + "max_gram": 4 + } + }, + "analyzer": { + "my_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": ["lowercase", "my_edge_ngram"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /edge_ngram_example/_analyze +{ + "analyzer": "my_analyzer", + "text": "slow green turtle" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "slo", + "start_offset": 0, + "end_offset": 4, + "type": "", + "position": 0 + }, + { + "token": "slow", + "start_offset": 0, + "end_offset": 4, + "type": "", + "position": 0 + }, + { + "token": "gre", + "start_offset": 5, + "end_offset": 10, + "type": "", + "position": 1 + }, + { + "token": "gree", + "start_offset": 5, + "end_offset": 10, + "type": "", + "position": 1 + }, + { + "token": "tur", + "start_offset": 11, + "end_offset": 17, + "type": "", + "position": 2 + }, + { + "token": "turt", + "start_offset": 11, + "end_offset": 17, + "type": "", + "position": 2 + } + ] +} +``` diff --git a/_analyzers/token-filters/elision.md b/_analyzers/token-filters/elision.md new file mode 100644 index 0000000000..abc6dba658 --- /dev/null +++ b/_analyzers/token-filters/elision.md @@ -0,0 +1,124 @@ +--- +layout: default +title: Elision +parent: Token filters +nav_order: 130 +--- + +# Elision token filter + +The `elision` token filter is used to remove elided characters from words in certain languages. Elision typically occurs in languages such as French, in which words are often contracted and combined with the following word, typically by omitting a vowel and replacing it with an apostrophe. + +The `elision` token filter is already preconfigured in the following [language analyzers]({{site.url}}{{site.baseurl}}/analyzers/language-analyzers/): `catalan`, `french`, `irish`, and `italian`. +{: .note} + +## Parameters + +The custom `elision` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`articles` | Required if `articles_path` is not configured | Array of strings | Defines which articles or short words should be removed when they appear as part of an elision. +`articles_path` | Required if `articles` is not configured | String | Specifies the path to a custom list of articles that should be removed during the analysis process. +`articles_case` | Optional | Boolean | Specifies whether the filter is case sensitive when matching elisions. Default is `false`. + +## Example + +The default set of French elisions is `l'`, `m'`, `t'`, `qu'`, `n'`, `s'`, `j'`, `d'`, `c'`, `jusqu'`, `quoiqu'`, `lorsqu'`, and `puisqu'`. You can update this by configuring the `french_elision` token filter. The following example request creates a new index named `french_texts` and configures an analyzer with a `french_elision` filter: + +```json +PUT /french_texts +{ + "settings": { + "analysis": { + "filter": { + "french_elision": { + "type": "elision", + "articles": [ "l", "t", "m", "d", "n", "s", "j" ] + } + }, + "analyzer": { + "french_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": ["lowercase", "french_elision"] + } + } + } + }, + "mappings": { + "properties": { + "text": { + "type": "text", + "analyzer": "french_analyzer" + } + } + } +} + +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /french_texts/_analyze +{ + "analyzer": "french_analyzer", + "text": "L'étudiant aime l'école et le travail." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "étudiant", + "start_offset": 0, + "end_offset": 10, + "type": "", + "position": 0 + }, + { + "token": "aime", + "start_offset": 11, + "end_offset": 15, + "type": "", + "position": 1 + }, + { + "token": "école", + "start_offset": 16, + "end_offset": 23, + "type": "", + "position": 2 + }, + { + "token": "et", + "start_offset": 24, + "end_offset": 26, + "type": "", + "position": 3 + }, + { + "token": "le", + "start_offset": 27, + "end_offset": 29, + "type": "", + "position": 4 + }, + { + "token": "travail", + "start_offset": 30, + "end_offset": 37, + "type": "", + "position": 5 + } + ] +} +``` diff --git a/_analyzers/token-filters/fingerprint.md b/_analyzers/token-filters/fingerprint.md new file mode 100644 index 0000000000..75c6615459 --- /dev/null +++ b/_analyzers/token-filters/fingerprint.md @@ -0,0 +1,86 @@ +--- +layout: default +title: Fingerprint +parent: Token filters +nav_order: 140 +--- + +# Fingerprint token filter + +The `fingerprint` token filter is used to standardize and deduplicate text. This is particularly useful when consistency in text processing is crucial. The `fingerprint` token filter achieves this by processing text using the following steps: + +1. **Lowercasing**: Converts all text to lowercase. +2. **Splitting**: Breaks the text into tokens. +3. **Sorting**: Arranges the tokens in alphabetical order. +4. **Removing duplicates**: Eliminates repeated tokens. +5. **Joining tokens**: Combines the tokens into a single string, typically joined by a space or another specified separator. + +## Parameters + +The `fingerprint` token filter can be configured with the following two parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`max_output_size` | Optional | Integer | Limits the length of the generated fingerprint string. If the concatenated string exceeds the `max_output_size`, the filter will not produce any output, resulting in an empty token. Default is `255`. +`separator` | Optional | String | Defines the character(s) used to join the tokens into a single string after they have been sorted and deduplicated. Default is space (`" "`). + +## Example + +The following example request creates a new index named `my_index` and configures an analyzer with a `fingerprint` token filter: + +```json +PUT /my_index +{ + "settings": { + "analysis": { + "filter": { + "my_fingerprint": { + "type": "fingerprint", + "max_output_size": 200, + "separator": "-" + } + }, + "analyzer": { + "my_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_fingerprint" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_index/_analyze +{ + "analyzer": "my_analyzer", + "text": "OpenSearch is a powerful search engine that scales easily" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "a-easily-engine-is-opensearch-powerful-scales-search-that", + "start_offset": 0, + "end_offset": 57, + "type": "fingerprint", + "position": 0 + } + ] +} +``` diff --git a/_analyzers/token-filters/flatten-graph.md b/_analyzers/token-filters/flatten-graph.md new file mode 100644 index 0000000000..8d51c57400 --- /dev/null +++ b/_analyzers/token-filters/flatten-graph.md @@ -0,0 +1,109 @@ +--- +layout: default +title: Flatten graph +parent: Token filters +nav_order: 150 +--- + +# Flatten graph token filter + +The `flatten_graph` token filter is used to handle complex token relationships that occur when multiple tokens are generated at the same position in a graph structure. Some token filters, like `synonym_graph` and `word_delimiter_graph`, generate multi-position tokens---tokens that overlap or span multiple positions. These token graphs are useful for search queries but are not directly supported during indexing. The `flatten_graph` token filter resolves multi-position tokens into a linear sequence of tokens. Flattening the graph ensures compatibility with the indexing process. + +Token graph flattening is a lossy process. Whenever possible, avoid using the `flatten_graph` filter. Instead, apply graph token filters exclusively in search analyzers, removing the need for the `flatten_graph` filter. +{: .important} + +## Example + +The following example request creates a new index named `test_index` and configures an analyzer with a `flatten_graph` filter: + +```json +PUT /test_index +{ + "settings": { + "analysis": { + "analyzer": { + "my_index_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "my_custom_filter", + "flatten_graph" + ] + } + }, + "filter": { + "my_custom_filter": { + "type": "word_delimiter_graph", + "catenate_all": true + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /test_index/_analyze +{ + "analyzer": "my_index_analyzer", + "text": "OpenSearch helped many employers" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "OpenSearch", + "start_offset": 0, + "end_offset": 10, + "type": "", + "position": 0, + "positionLength": 2 + }, + { + "token": "Open", + "start_offset": 0, + "end_offset": 4, + "type": "", + "position": 0 + }, + { + "token": "Search", + "start_offset": 4, + "end_offset": 10, + "type": "", + "position": 1 + }, + { + "token": "helped", + "start_offset": 11, + "end_offset": 17, + "type": "", + "position": 2 + }, + { + "token": "many", + "start_offset": 18, + "end_offset": 22, + "type": "", + "position": 3 + }, + { + "token": "employers", + "start_offset": 23, + "end_offset": 32, + "type": "", + "position": 4 + } + ] +} +``` diff --git a/_analyzers/token-filters/hunspell.md b/_analyzers/token-filters/hunspell.md new file mode 100644 index 0000000000..6720ba74de --- /dev/null +++ b/_analyzers/token-filters/hunspell.md @@ -0,0 +1,108 @@ +--- +layout: default +title: Hunspell +parent: Token filters +nav_order: 160 +--- + +# Hunspell token filter + +The `hunspell` token filter is used for stemming and morphological analysis of words in a specific language. This filter applies Hunspell dictionaries, which are widely used in spell checkers. It works by breaking down words into their root forms (stemming). + +The Hunspell dictionary files are automatically loaded at startup from the `/hunspell/` directory. For example, the `en_GB` locale must have at least one `.aff` file and one or more `.dic` files in the `/hunspell/en_GB/` directory. + +You can download these files from [LibreOffice dictionaries](https://github.com/LibreOffice/dictionaries). + +## Parameters + +The `hunspell` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`language/lang/locale` | At least one of the three is required | String | Specifies the language for the Hunspell dictionary. +`dedup` | Optional | Boolean | Determines whether to remove multiple duplicate stemming terms for the same token. Default is `true`. +`dictionary` | Optional | Array of strings | Configures the dictionary files to be used for the Hunspell dictionary. Default is all files in the `/hunspell/` directory. +`longest_only` | Optional | Boolean | Specifies whether only the longest stemmed version of the token should be returned. Default is `false`. + +## Example + +The following example request creates a new index named `my_index` and configures an analyzer with a `hunspell` filter: + +```json +PUT /my_index +{ + "settings": { + "analysis": { + "filter": { + "my_hunspell_filter": { + "type": "hunspell", + "lang": "en_GB", + "dedup": true, + "longest_only": true + } + }, + "analyzer": { + "my_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_hunspell_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_index/_analyze +{ + "analyzer": "my_analyzer", + "text": "the turtle moves slowly" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "the", + "start_offset": 0, + "end_offset": 3, + "type": "", + "position": 0 + }, + { + "token": "turtle", + "start_offset": 4, + "end_offset": 10, + "type": "", + "position": 1 + }, + { + "token": "move", + "start_offset": 11, + "end_offset": 16, + "type": "", + "position": 2 + }, + { + "token": "slow", + "start_offset": 17, + "end_offset": 23, + "type": "", + "position": 3 + } + ] +} +``` diff --git a/_analyzers/token-filters/hyphenation-decompounder.md b/_analyzers/token-filters/hyphenation-decompounder.md new file mode 100644 index 0000000000..6e53d4dfd5 --- /dev/null +++ b/_analyzers/token-filters/hyphenation-decompounder.md @@ -0,0 +1,102 @@ +--- +layout: default +title: Hyphenation decompounder +parent: Token filters +nav_order: 170 +--- + +# Hyphenation decompounder token filter + +The `hyphenation_decompounder` token filter is used to break down compound words into their constituent parts. This filter is particularly useful for languages like German, Dutch, and Swedish, in which compound words are common. The filter uses hyphenation patterns (typically defined in .xml files) to identify the possible locations within a compound word where it can be split into components. These components are then checked against a provided dictionary. If there is a match, those components are treated as valid tokens. For more information about hyphenation pattern files, see [FOP XML Hyphenation Patterns](https://offo.sourceforge.net/#FOP+XML+Hyphenation+Patterns). + +## Parameters + +The `hyphenation_decompounder` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`hyphenation_patterns_path` | Required | String | The path (relative to the `config` directory or absolute) to the hyphenation patterns file, which contains the language-specific rules for word splitting. The file is typically in XML format. Sample files can be downloaded from the [OFFO SourceForge project](https://sourceforge.net/projects/offo/). +`word_list` | Required if `word_list_path` is not set | Array of strings | A list of words used to validate the components generated by the hyphenation patterns. +`word_list_path` | Required if `word_list` is not set | String | The path (relative to the `config` directory or absolute) to a list of subwords. +`max_subword_size` | Optional | Integer | The maximum subword length. If the generated subword exceeds this length, it will not be added to the generated tokens. Default is `15`. +`min_subword_size` | Optional | Integer | The minimum subword length. If the generated subword is shorter than the specified length, it will not be added to the generated tokens. Default is `2`. +`min_word_size` | Optional | Integer | The minimum word character length. Word tokens shorter than this length are excluded from decomposition into subwords. Default is `5`. +`only_longest_match` | Optional | Boolean | Only includes the longest subword in the generated tokens. Default is `false`. + +## Example + +The following example request creates a new index named `test_index` and configures an analyzer with a `hyphenation_decompounder` filter: + +```json +PUT /test_index +{ + "settings": { + "analysis": { + "filter": { + "my_hyphenation_decompounder": { + "type": "hyphenation_decompounder", + "hyphenation_patterns_path": "analysis/hyphenation_patterns.xml", + "word_list": ["notebook", "note", "book"], + "min_subword_size": 3, + "min_word_size": 5, + "only_longest_match": false + } + }, + "analyzer": { + "my_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_hyphenation_decompounder" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /test_index/_analyze +{ + "analyzer": "my_analyzer", + "text": "notebook" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "notebook", + "start_offset": 0, + "end_offset": 8, + "type": "", + "position": 0 + }, + { + "token": "note", + "start_offset": 0, + "end_offset": 8, + "type": "", + "position": 0 + }, + { + "token": "book", + "start_offset": 0, + "end_offset": 8, + "type": "", + "position": 0 + } + ] +} +``` diff --git a/_analyzers/token-filters/index.md b/_analyzers/token-filters/index.md index 7d5814e06b..b06489c805 100644 --- a/_analyzers/token-filters/index.md +++ b/_analyzers/token-filters/index.md @@ -17,51 +17,51 @@ The following table lists all token filters that OpenSearch supports. Token filter | Underlying Lucene token filter| Description [`apostrophe`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/apostrophe/) | [ApostropheFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/tr/ApostropheFilter.html) | In each token containing an apostrophe, the `apostrophe` token filter removes the apostrophe itself and all characters following it. [`asciifolding`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/asciifolding/) | [ASCIIFoldingFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilter.html) | Converts alphabetic, numeric, and symbolic characters. -`cjk_bigram` | [CJKBigramFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/cjk/CJKBigramFilter.html) | Forms bigrams of Chinese, Japanese, and Korean (CJK) tokens. +[`cjk_bigram`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/cjk-bigram/) | [CJKBigramFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/cjk/CJKBigramFilter.html) | Forms bigrams of Chinese, Japanese, and Korean (CJK) tokens. [`cjk_width`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/cjk-width/) | [CJKWidthFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/cjk/CJKWidthFilter.html) | Normalizes Chinese, Japanese, and Korean (CJK) tokens according to the following rules:
- Folds full-width ASCII character variants into their equivalent basic Latin characters.
- Folds half-width katakana character variants into their equivalent kana characters. [`classic`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/classic) | [ClassicFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/classic/ClassicFilter.html) | Performs optional post-processing on the tokens generated by the classic tokenizer. Removes possessives (`'s`) and removes `.` from acronyms. -`common_grams` | [CommonGramsFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/commongrams/CommonGramsFilter.html) | Generates bigrams for a list of frequently occurring terms. The output contains both single terms and bigrams. -`conditional` | [ConditionalTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/ConditionalTokenFilter.html) | Applies an ordered list of token filters to tokens that match the conditions provided in a script. -`decimal_digit` | [DecimalDigitFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/DecimalDigitFilter.html) | Converts all digits in the Unicode decimal number general category to basic Latin digits (0--9). -`delimited_payload` | [DelimitedPayloadTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilter.html) | Separates a token stream into tokens with corresponding payloads, based on a provided delimiter. A token consists of all characters before the delimiter, and a payload consists of all characters after the delimiter. For example, if the delimiter is `|`, then for the string `foo|bar`, `foo` is the token and `bar` is the payload. +[`common_grams`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/common_gram/) | [CommonGramsFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/commongrams/CommonGramsFilter.html) | Generates bigrams for a list of frequently occurring terms. The output contains both single terms and bigrams. +[`conditional`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/condition/) | [ConditionalTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/ConditionalTokenFilter.html) | Applies an ordered list of token filters to tokens that match the conditions provided in a script. +[`decimal_digit`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/decimal-digit/) | [DecimalDigitFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/DecimalDigitFilter.html) | Converts all digits in the Unicode decimal number general category to basic Latin digits (0--9). +[`delimited_payload`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/delimited-payload/) | [DelimitedPayloadTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilter.html) | Separates a token stream into tokens with corresponding payloads, based on a provided delimiter. A token consists of all characters preceding the delimiter, and a payload consists of all characters following the delimiter. For example, if the delimiter is `|`, then for the string `foo|bar`, `foo` is the token and `bar` is the payload. [`delimited_term_freq`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/delimited-term-frequency/) | [DelimitedTermFrequencyTokenFilter](https://lucene.apache.org/core/9_7_0/analysis/common/org/apache/lucene/analysis/miscellaneous/DelimitedTermFrequencyTokenFilter.html) | Separates a token stream into tokens with corresponding term frequencies, based on a provided delimiter. A token consists of all characters before the delimiter, and a term frequency is the integer after the delimiter. For example, if the delimiter is `|`, then for the string `foo|5`, `foo` is the token and `5` is the term frequency. -`dictionary_decompounder` | [DictionaryCompoundWordTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.html) | Decomposes compound words found in many Germanic languages. -`edge_ngram` | [EdgeNGramTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.html) | Tokenizes the given token into edge n-grams (n-grams that start at the beginning of the token) of lengths between `min_gram` and `max_gram`. Optionally, keeps the original token. -`elision` | [ElisionFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/util/ElisionFilter.html) | Removes the specified [elisions](https://en.wikipedia.org/wiki/Elision) from the beginning of tokens. For example, changes `l'avion` (the plane) to `avion` (plane). -`fingerprint` | [FingerprintFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.html) | Sorts and deduplicates the token list and concatenates tokens into a single token. -`flatten_graph` | [FlattenGraphFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/FlattenGraphFilter.html) | Flattens a token graph produced by a graph token filter, such as `synonym_graph` or `word_delimiter_graph`, making the graph suitable for indexing. -`hunspell` | [HunspellStemFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/hunspell/HunspellStemFilter.html) | Uses [Hunspell](https://en.wikipedia.org/wiki/Hunspell) rules to stem tokens. Because Hunspell supports a word having multiple stems, this filter can emit multiple tokens for each consumed token. Requires you to configure one or more language-specific Hunspell dictionaries. -`hyphenation_decompounder` | [HyphenationCompoundWordTokenFilter](https://lucene.apache.org/core/9_8_0/analysis/common/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.html) | Uses XML-based hyphenation patterns to find potential subwords in compound words and checks the subwords against the specified word list. The token output contains only the subwords found in the word list. -`keep_types` | [TypeTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/TypeTokenFilter.html) | Keeps or removes tokens of a specific type. -`keep_word` | [KeepWordFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/KeepWordFilter.html) | Checks the tokens against the specified word list and keeps only those that are in the list. -`keyword_marker` | [KeywordMarkerFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/KeywordMarkerFilter.html) | Marks specified tokens as keywords, preventing them from being stemmed. -`keyword_repeat` | [KeywordRepeatFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/KeywordRepeatFilter.html) | Emits each incoming token twice: once as a keyword and once as a non-keyword. -`kstem` | [KStemFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/en/KStemFilter.html) | Provides kstem-based stemming for the English language. Combines algorithmic stemming with a built-in dictionary. -`kuromoji_completion` | [JapaneseCompletionFilter](https://lucene.apache.org/core/9_10_0/analysis/kuromoji/org/apache/lucene/analysis/ja/JapaneseCompletionFilter.html) | Adds Japanese romanized terms to the token stream (in addition to the original tokens). Usually used to support autocomplete on Japanese search terms. Note that the filter has a `mode` parameter, which should be set to `index` when used in an index analyzer and `query` when used in a search analyzer. Requires the `analysis-kuromoji` plugin. For information about installing the plugin, see [Additional plugins]({{site.url}}{{site.baseurl}}/install-and-configure/plugins/#additional-plugins). -`length` | [LengthFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/LengthFilter.html) | Removes tokens whose lengths are shorter or longer than the length range specified by `min` and `max`. -`limit` | [LimitTokenCountFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/LimitTokenCountFilter.html) | Limits the number of output tokens. A common use case is to limit the size of document field values based on token count. -`lowercase` | [LowerCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/LowerCaseFilter.html) | Converts tokens to lowercase. The default [LowerCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/LowerCaseFilter.html) is for the English language. You can set the `language` parameter to `greek` (uses [GreekLowerCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/el/GreekLowerCaseFilter.html)), `irish` (uses [IrishLowerCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/ga/IrishLowerCaseFilter.html)), or `turkish` (uses [TurkishLowerCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/tr/TurkishLowerCaseFilter.html)). -`min_hash` | [MinHashFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/minhash/MinHashFilter.html) | Uses the [MinHash technique](https://en.wikipedia.org/wiki/MinHash) to estimate document similarity. Performs the following operations on a token stream sequentially:
1. Hashes each token in the stream.
2. Assigns the hashes to buckets, keeping only the smallest hashes of each bucket.
3. Outputs the smallest hash from each bucket as a token stream. -`multiplexer` | N/A | Emits multiple tokens at the same position. Runs each token through each of the specified filter lists separately and outputs the results as separate tokens. -`ngram` | [NGramTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/ngram/NGramTokenFilter.html) | Tokenizes the given token into n-grams of lengths between `min_gram` and `max_gram`. -Normalization | `arabic_normalization`: [ArabicNormalizer](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/ar/ArabicNormalizer.html)
`german_normalization`: [GermanNormalizationFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/de/GermanNormalizationFilter.html)
`hindi_normalization`: [HindiNormalizer](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/hi/HindiNormalizer.html)
`indic_normalization`: [IndicNormalizer](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/in/IndicNormalizer.html)
`sorani_normalization`: [SoraniNormalizer](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/ckb/SoraniNormalizer.html)
`persian_normalization`: [PersianNormalizer](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/fa/PersianNormalizer.html)
`scandinavian_normalization` : [ScandinavianNormalizationFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/ScandinavianNormalizationFilter.html)
`scandinavian_folding`: [ScandinavianFoldingFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/ScandinavianFoldingFilter.html)
`serbian_normalization`: [SerbianNormalizationFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/sr/SerbianNormalizationFilter.html) | Normalizes the characters of one of the listed languages. -`pattern_capture` | N/A | Generates a token for every capture group in the provided regular expression. Uses [Java regular expression syntax](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html). -`pattern_replace` | N/A | Matches a pattern in the provided regular expression and replaces matching substrings. Uses [Java regular expression syntax](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html). -`phonetic` | N/A | Uses a phonetic encoder to emit a metaphone token for each token in the token stream. Requires installing the `analysis-phonetic` plugin. -`porter_stem` | [PorterStemFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/en/PorterStemFilter.html) | Uses the [Porter stemming algorithm](https://tartarus.org/martin/PorterStemmer/) to perform algorithmic stemming for the English language. -`predicate_token_filter` | N/A | Removes tokens that don’t match the specified predicate script. Supports inline Painless scripts only. -`remove_duplicates` | [RemoveDuplicatesTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/RemoveDuplicatesTokenFilter.html) | Removes duplicate tokens that are in the same position. -`reverse` | [ReverseStringFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/reverse/ReverseStringFilter.html) | Reverses the string corresponding to each token in the token stream. For example, the token `dog` becomes `god`. -`shingle` | [ShingleFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/shingle/ShingleFilter.html) | Generates shingles of lengths between `min_shingle_size` and `max_shingle_size` for tokens in the token stream. Shingles are similar to n-grams but apply to words instead of letters. For example, two-word shingles added to the list of unigrams [`contribute`, `to`, `opensearch`] are [`contribute to`, `to opensearch`]. -`snowball` | N/A | Stems words using a [Snowball-generated stemmer](https://snowballstem.org/). You can use the `snowball` token filter with the following languages in the `language` field: `Arabic`, `Armenian`, `Basque`, `Catalan`, `Danish`, `Dutch`, `English`, `Estonian`, `Finnish`, `French`, `German`, `German2`, `Hungarian`, `Irish`, `Italian`, `Kp`, `Lithuanian`, `Lovins`, `Norwegian`, `Porter`, `Portuguese`, `Romanian`, `Russian`, `Spanish`, `Swedish`, `Turkish`. -`stemmer` | N/A | Provides algorithmic stemming for the following languages in the `language` field: `arabic`, `armenian`, `basque`, `bengali`, `brazilian`, `bulgarian`, `catalan`, `czech`, `danish`, `dutch`, `dutch_kp`, `english`, `light_english`, `lovins`, `minimal_english`, `porter2`, `possessive_english`, `estonian`, `finnish`, `light_finnish`, `french`, `light_french`, `minimal_french`, `galician`, `minimal_galician`, `german`, `german2`, `light_german`, `minimal_german`, `greek`, `hindi`, `hungarian`, `light_hungarian`, `indonesian`, `irish`, `italian`, `light_italian`, `latvian`, `Lithuanian`, `norwegian`, `light_norwegian`, `minimal_norwegian`, `light_nynorsk`, `minimal_nynorsk`, `portuguese`, `light_portuguese`, `minimal_portuguese`, `portuguese_rslp`, `romanian`, `russian`, `light_russian`, `sorani`, `spanish`, `light_spanish`, `swedish`, `light_swedish`, `turkish`. -`stemmer_override` | N/A | Overrides stemming algorithms by applying a custom mapping so that the provided terms are not stemmed. -`stop` | [StopFilter](https://lucene.apache.org/core/8_7_0/core/org/apache/lucene/analysis/StopFilter.html) | Removes stop words from a token stream. -`synonym` | N/A | Supplies a synonym list for the analysis process. The synonym list is provided using a configuration file. -`synonym_graph` | N/A | Supplies a synonym list, including multiword synonyms, for the analysis process. -`trim` | [TrimFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/TrimFilter.html) | Trims leading and trailing white space from each token in a stream. -`truncate` | [TruncateTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/TruncateTokenFilter.html) | Truncates tokens whose length exceeds the specified character limit. -`unique` | N/A | Ensures each token is unique by removing duplicate tokens from a stream. -`uppercase` | [UpperCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/LowerCaseFilter.html) | Converts tokens to uppercase. -`word_delimiter` | [WordDelimiterFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.html) | Splits tokens at non-alphanumeric characters and performs normalization based on the specified rules. -`word_delimiter_graph` | [WordDelimiterGraphFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.html) | Splits tokens at non-alphanumeric characters and performs normalization based on the specified rules. Assigns multi-position tokens a `positionLength` attribute. +[`dictionary_decompounder`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/dictionary-decompounder/) | [DictionaryCompoundWordTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.html) | Decomposes compound words found in many Germanic languages. +[`edge_ngram`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/edge-ngram/) | [EdgeNGramTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.html) | Tokenizes the given token into edge n-grams (n-grams that start at the beginning of the token) of lengths between `min_gram` and `max_gram`. Optionally, keeps the original token. +[`elision`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/elision/) | [ElisionFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/util/ElisionFilter.html) | Removes the specified [elisions](https://en.wikipedia.org/wiki/Elision) from the beginning of tokens. For example, changes `l'avion` (the plane) to `avion` (plane). +[`fingerprint`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/fingerprint/) | [FingerprintFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.html) | Sorts and deduplicates the token list and concatenates tokens into a single token. +[`flatten_graph`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/flatten-graph/) | [FlattenGraphFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/FlattenGraphFilter.html) | Flattens a token graph produced by a graph token filter, such as `synonym_graph` or `word_delimiter_graph`, making the graph suitable for indexing. +[`hunspell`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/hunspell/) | [HunspellStemFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/hunspell/HunspellStemFilter.html) | Uses [Hunspell](https://en.wikipedia.org/wiki/Hunspell) rules to stem tokens. Because Hunspell allows a word to have multiple stems, this filter can emit multiple tokens for each consumed token. Requires the configuration of one or more language-specific Hunspell dictionaries. +[`hyphenation_decompounder`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/hyphenation-decompounder/) | [HyphenationCompoundWordTokenFilter](https://lucene.apache.org/core/9_8_0/analysis/common/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.html) | Uses XML-based hyphenation patterns to find potential subwords in compound words and checks the subwords against the specified word list. The token output contains only the subwords found in the word list. +[`keep_types`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/keep-types/) | [TypeTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/TypeTokenFilter.html) | Keeps or removes tokens of a specific type. +[`keep_words`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/keep-words/) | [KeepWordFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/KeepWordFilter.html) | Checks the tokens against the specified word list and keeps only those that are in the list. +[`keyword_marker`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/keyword-marker/) | [KeywordMarkerFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/KeywordMarkerFilter.html) | Marks specified tokens as keywords, preventing them from being stemmed. +[`keyword_repeat`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/keyword-repeat/) | [KeywordRepeatFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/KeywordRepeatFilter.html) | Emits each incoming token twice: once as a keyword and once as a non-keyword. +[`kstem`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/kstem/) | [KStemFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/en/KStemFilter.html) | Provides KStem-based stemming for the English language. Combines algorithmic stemming with a built-in dictionary. +[`kuromoji_completion`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/kuromoji-completion/) | [JapaneseCompletionFilter](https://lucene.apache.org/core/9_10_0/analysis/kuromoji/org/apache/lucene/analysis/ja/JapaneseCompletionFilter.html) | Adds Japanese romanized terms to a token stream (in addition to the original tokens). Usually used to support autocomplete of Japanese search terms. Note that the filter has a `mode` parameter that should be set to `index` when used in an index analyzer and `query` when used in a search analyzer. Requires the `analysis-kuromoji` plugin. For information about installing the plugin, see [Additional plugins]({{site.url}}{{site.baseurl}}/install-and-configure/plugins/#additional-plugins). +[`length`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/length/) | [LengthFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/LengthFilter.html) | Removes tokens that are shorter or longer than the length range specified by `min` and `max`. +[`limit`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/limit/) | [LimitTokenCountFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/LimitTokenCountFilter.html) | Limits the number of output tokens. For example, document field value sizes can be limited based on the token count. +[`lowercase`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/lowercase/) | [LowerCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/LowerCaseFilter.html) | Converts tokens to lowercase. The default [LowerCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/LowerCaseFilter.html) processes the English language. To process other languages, set the `language` parameter to `greek` (uses [GreekLowerCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/el/GreekLowerCaseFilter.html)), `irish` (uses [IrishLowerCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/ga/IrishLowerCaseFilter.html)), or `turkish` (uses [TurkishLowerCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/tr/TurkishLowerCaseFilter.html)). +[`min_hash`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/min-hash/) | [MinHashFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/minhash/MinHashFilter.html) | Uses the [MinHash technique](https://en.wikipedia.org/wiki/MinHash) to estimate document similarity. Performs the following operations on a token stream sequentially:
1. Hashes each token in the stream.
2. Assigns the hashes to buckets, keeping only the smallest hashes of each bucket.
3. Outputs the smallest hash from each bucket as a token stream. +[`multiplexer`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/multiplexer/) | N/A | Emits multiple tokens at the same position. Runs each token through each of the specified filter lists separately and outputs the results as separate tokens. +[`ngram`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/ngram/) | [NGramTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/ngram/NGramTokenFilter.html) | Tokenizes the given token into n-grams of lengths between `min_gram` and `max_gram`. +[Normalization]({{site.url}}{{site.baseurl}}/analyzers/token-filters/normalization/) | `arabic_normalization`: [ArabicNormalizer](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/ar/ArabicNormalizer.html)
`german_normalization`: [GermanNormalizationFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/de/GermanNormalizationFilter.html)
`hindi_normalization`: [HindiNormalizer](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/hi/HindiNormalizer.html)
`indic_normalization`: [IndicNormalizer](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/in/IndicNormalizer.html)
`sorani_normalization`: [SoraniNormalizer](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/ckb/SoraniNormalizer.html)
`persian_normalization`: [PersianNormalizer](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/fa/PersianNormalizer.html)
`scandinavian_normalization` : [ScandinavianNormalizationFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/ScandinavianNormalizationFilter.html)
`scandinavian_folding`: [ScandinavianFoldingFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/ScandinavianFoldingFilter.html)
`serbian_normalization`: [SerbianNormalizationFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/sr/SerbianNormalizationFilter.html) | Normalizes the characters of one of the listed languages. +[`pattern_capture`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/pattern-capture/) | N/A | Generates a token for every capture group in the provided regular expression. Uses [Java regular expression syntax](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html). +[`pattern_replace`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/pattern-replace/) | N/A | Matches a pattern in the provided regular expression and replaces matching substrings. Uses [Java regular expression syntax](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html). +[`phonetic`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/phonetic/) | N/A | Uses a phonetic encoder to emit a metaphone token for each token in the token stream. Requires installing the `analysis-phonetic` plugin. +[`porter_stem`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/porter-stem/) | [PorterStemFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/en/PorterStemFilter.html) | Uses the [Porter stemming algorithm](https://tartarus.org/martin/PorterStemmer/) to perform algorithmic stemming for the English language. +[`predicate_token_filter`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/predicate-token-filter/) | N/A | Removes tokens that do not match the specified predicate script. Supports only inline Painless scripts. +[`remove_duplicates`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/remove-duplicates/) | [RemoveDuplicatesTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/RemoveDuplicatesTokenFilter.html) | Removes duplicate tokens that are in the same position. +[`reverse`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/reverse/) | [ReverseStringFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/reverse/ReverseStringFilter.html) | Reverses the string corresponding to each token in the token stream. For example, the token `dog` becomes `god`. +[`shingle`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/shingle/) | [ShingleFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/shingle/ShingleFilter.html) | Generates shingles of lengths between `min_shingle_size` and `max_shingle_size` for tokens in the token stream. Shingles are similar to n-grams but are generated using words instead of letters. For example, two-word shingles added to the list of unigrams [`contribute`, `to`, `opensearch`] are [`contribute to`, `to opensearch`]. +[`snowball`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/snowball/) | N/A | Stems words using a [Snowball-generated stemmer](https://snowballstem.org/). The `snowball` token filter supports using the following languages in the `language` field: `Arabic`, `Armenian`, `Basque`, `Catalan`, `Danish`, `Dutch`, `English`, `Estonian`, `Finnish`, `French`, `German`, `German2`, `Hungarian`, `Irish`, `Italian`, `Kp`, `Lithuanian`, `Lovins`, `Norwegian`, `Porter`, `Portuguese`, `Romanian`, `Russian`, `Spanish`, `Swedish`, `Turkish`. +[`stemmer`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/stemmer/) | N/A | Provides algorithmic stemming for the following languages used in the `language` field: `arabic`, `armenian`, `basque`, `bengali`, `brazilian`, `bulgarian`, `catalan`, `czech`, `danish`, `dutch`, `dutch_kp`, `english`, `light_english`, `lovins`, `minimal_english`, `porter2`, `possessive_english`, `estonian`, `finnish`, `light_finnish`, `french`, `light_french`, `minimal_french`, `galician`, `minimal_galician`, `german`, `german2`, `light_german`, `minimal_german`, `greek`, `hindi`, `hungarian`, `light_hungarian`, `indonesian`, `irish`, `italian`, `light_italian`, `latvian`, `Lithuanian`, `norwegian`, `light_norwegian`, `minimal_norwegian`, `light_nynorsk`, `minimal_nynorsk`, `portuguese`, `light_portuguese`, `minimal_portuguese`, `portuguese_rslp`, `romanian`, `russian`, `light_russian`, `sorani`, `spanish`, `light_spanish`, `swedish`, `light_swedish`, `turkish`. +[`stemmer_override`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/stemmer-override/) | N/A | Overrides stemming algorithms by applying a custom mapping so that the provided terms are not stemmed. +[`stop`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/stop/) | [StopFilter](https://lucene.apache.org/core/8_7_0/core/org/apache/lucene/analysis/StopFilter.html) | Removes stop words from a token stream. +[`synonym`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/synonym/) | N/A | Supplies a synonym list for the analysis process. The synonym list is provided using a configuration file. +[`synonym_graph`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/synonym-graph/) | N/A | Supplies a synonym list, including multiword synonyms, for the analysis process. +[`trim`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/trim/) | [TrimFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/TrimFilter.html) | Trims leading and trailing white space characters from each token in a stream. +[`truncate`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/truncate/) | [TruncateTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/TruncateTokenFilter.html) | Truncates tokens with lengths exceeding the specified character limit. +[`unique`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/unique/) | N/A | Ensures that each token is unique by removing duplicate tokens from a stream. +[`uppercase`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/uppercase/) | [UpperCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/LowerCaseFilter.html) | Converts tokens to uppercase. +[`word_delimiter`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/word-delimiter/) | [WordDelimiterFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.html) | Splits tokens at non-alphanumeric characters and performs normalization based on the specified rules. +[`word_delimiter_graph`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/word-delimiter-graph/) | [WordDelimiterGraphFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.html) | Splits tokens at non-alphanumeric characters and performs normalization based on the specified rules. Assigns a `positionLength` attribute to multi-position tokens. diff --git a/_analyzers/token-filters/keep-types.md b/_analyzers/token-filters/keep-types.md new file mode 100644 index 0000000000..59e617f567 --- /dev/null +++ b/_analyzers/token-filters/keep-types.md @@ -0,0 +1,115 @@ +--- +layout: default +title: Keep types +parent: Token filters +nav_order: 180 +--- + +# Keep types token filter + +The `keep_types` token filter is a type of token filter used in text analysis to control which token types are kept or discarded. Different tokenizers produce different token types, for example, ``, ``, or ``. + +The `keyword`, `simple_pattern`, and `simple_pattern_split` tokenizers do not support the `keep_types` token filter because these tokenizers do not support token type attributes. +{: .note} + +## Parameters + +The `keep_types` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`types` | Required | List of strings | List of token types to be kept or discarded (determined by the `mode`). +`mode`| Optional | String | Whether to `include` or `exclude` the token types specified in `types`. Default is `include`. + + +## Example + +The following example request creates a new index named `test_index` and configures an analyzer with a `keep_types` filter: + +```json +PUT /test_index +{ + "settings": { + "analysis": { + "analyzer": { + "custom_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": ["lowercase", "keep_types_filter"] + } + }, + "filter": { + "keep_types_filter": { + "type": "keep_types", + "types": [""] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /test_index/_analyze +{ + "analyzer": "custom_analyzer", + "text": "Hello 2 world! This is an example." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "hello", + "start_offset": 0, + "end_offset": 5, + "type": "", + "position": 0 + }, + { + "token": "world", + "start_offset": 8, + "end_offset": 13, + "type": "", + "position": 2 + }, + { + "token": "this", + "start_offset": 15, + "end_offset": 19, + "type": "", + "position": 3 + }, + { + "token": "is", + "start_offset": 20, + "end_offset": 22, + "type": "", + "position": 4 + }, + { + "token": "an", + "start_offset": 23, + "end_offset": 25, + "type": "", + "position": 5 + }, + { + "token": "example", + "start_offset": 26, + "end_offset": 33, + "type": "", + "position": 6 + } + ] +} +``` diff --git a/_analyzers/token-filters/keep-words.md b/_analyzers/token-filters/keep-words.md new file mode 100644 index 0000000000..4a6b199e5c --- /dev/null +++ b/_analyzers/token-filters/keep-words.md @@ -0,0 +1,92 @@ +--- +layout: default +title: Keep words +parent: Token filters +nav_order: 190 +--- + +# Keep words token filter + +The `keep_words` token filter is designed to keep only certain words during the analysis process. This filter is useful if you have a large body of text but are only interested in certain keywords or terms. + +## Parameters + +The `keep_words` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`keep_words` | Required if `keep_words_path` is not configured | List of strings | The list of words to keep. +`keep_words_path` | Required if `keep_words` is not configured | String | The path to the file containing the list of words to keep. +`keep_words_case` | Optional | Boolean | Whether to lowercase all words during comparison. Default is `false`. + + +## Example + +The following example request creates a new index named `my_index` and configures an analyzer with a `keep_words` filter: + +```json +PUT my_index +{ + "settings": { + "analysis": { + "analyzer": { + "custom_keep_word": { + "tokenizer": "standard", + "filter": [ "keep_words_filter" ] + } + }, + "filter": { + "keep_words_filter": { + "type": "keep", + "keep_words": ["example", "world", "opensearch"], + "keep_words_case": true + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my_index/_analyze +{ + "analyzer": "custom_keep_word", + "text": "Hello, world! This is an OpenSearch example." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "world", + "start_offset": 7, + "end_offset": 12, + "type": "", + "position": 1 + }, + { + "token": "OpenSearch", + "start_offset": 25, + "end_offset": 35, + "type": "", + "position": 5 + }, + { + "token": "example", + "start_offset": 36, + "end_offset": 43, + "type": "", + "position": 6 + } + ] +} +``` diff --git a/_analyzers/token-filters/keyword-marker.md b/_analyzers/token-filters/keyword-marker.md new file mode 100644 index 0000000000..0ec2cb96f5 --- /dev/null +++ b/_analyzers/token-filters/keyword-marker.md @@ -0,0 +1,127 @@ +--- +layout: default +title: Keyword marker +parent: Token filters +nav_order: 200 +--- + +# Keyword marker token filter + +The `keyword_marker` token filter is used to prevent certain tokens from being altered by stemmers or other filters. The `keyword_marker` token filter does this by marking the specified tokens as `keywords`, which prevents any stemming or other processing. This ensures that specific words remain in their original form. + +## Parameters + +The `keyword_marker` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`ignore_case` | Optional | Boolean | Whether to ignore the letter case when matching keywords. Default is `false`. +`keywords` | Required if either `keywords_path` or `keywords_pattern` is not set | List of strings | The list of tokens to mark as keywords. +`keywords_path` | Required if either `keywords` or `keywords_pattern` is not set | String | The path (relative to the `config` directory or absolute) to the list of keywords. +`keywords_pattern` | Required if either `keywords` or `keywords_path` is not set | String | A [regular expression](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html) used for matching tokens to be marked as keywords. + + +## Example + +The following example request creates a new index named `my_index` and configures an analyzer with a `keyword_marker` filter. The filter marks the word `example` as a keyword: + +```json +PUT /my_index +{ + "settings": { + "analysis": { + "analyzer": { + "custom_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": ["lowercase", "keyword_marker_filter", "stemmer"] + } + }, + "filter": { + "keyword_marker_filter": { + "type": "keyword_marker", + "keywords": ["example"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my_index/_analyze +{ + "analyzer": "custom_analyzer", + "text": "Favorite example" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens. Note that while the word `favorite` was stemmed, the word `example` was not stemmed because it was marked as a keyword: + +```json +{ + "tokens": [ + { + "token": "favorit", + "start_offset": 0, + "end_offset": 8, + "type": "", + "position": 0 + }, + { + "token": "example", + "start_offset": 9, + "end_offset": 16, + "type": "", + "position": 1 + } + ] +} +``` + +You can further examine the impact of the `keyword_marker` token filter by adding the following parameters to the `_analyze` query: + +```json +GET /my_index/_analyze +{ + "analyzer": "custom_analyzer", + "text": "This is an OpenSearch example demonstrating keyword marker.", + "explain": true, + "attributes": "keyword" +} +``` +{% include copy-curl.html %} + +This will produce additional details in the response similar to the following: + +```json +{ + "name": "porter_stem", + "tokens": [ + ... + { + "token": "example", + "start_offset": 22, + "end_offset": 29, + "type": "", + "position": 4, + "keyword": true + }, + { + "token": "demonstr", + "start_offset": 30, + "end_offset": 43, + "type": "", + "position": 5, + "keyword": false + }, + ... + ] +} +``` diff --git a/_analyzers/token-filters/keyword-repeat.md b/_analyzers/token-filters/keyword-repeat.md new file mode 100644 index 0000000000..5ba15a037c --- /dev/null +++ b/_analyzers/token-filters/keyword-repeat.md @@ -0,0 +1,160 @@ +--- +layout: default +title: Keyword repeat +parent: Token filters +nav_order: 210 +--- + +# Keyword repeat token filter + +The `keyword_repeat` token filter emits the keyword version of a token into a token stream. This filter is typically used when you want to retain both the original token and its modified version after further token transformations, such as stemming or synonym expansion. The duplicated tokens allow the original, unchanged version of the token to remain in the final analysis alongside the modified versions. + +The `keyword_repeat` token filter should be placed before stemming filters. Stemming is not applied to every token, thus you may have duplicate tokens in the same position after stemming. To remove duplicate tokens, use the `remove_duplicates` token filter after the stemmer. +{: .note} + + +## Example + +The following example request creates a new index named `my_index` and configures an analyzer with a `keyword_repeat` filter: + +```json +PUT /my_index +{ + "settings": { + "analysis": { + "filter": { + "my_kstem": { + "type": "kstem" + }, + "my_lowercase": { + "type": "lowercase" + } + }, + "analyzer": { + "my_custom_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "my_lowercase", + "keyword_repeat", + "my_kstem" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_index/_analyze +{ + "analyzer": "my_custom_analyzer", + "text": "Stopped quickly" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "stopped", + "start_offset": 0, + "end_offset": 7, + "type": "", + "position": 0 + }, + { + "token": "stop", + "start_offset": 0, + "end_offset": 7, + "type": "", + "position": 0 + }, + { + "token": "quickly", + "start_offset": 8, + "end_offset": 15, + "type": "", + "position": 1 + }, + { + "token": "quick", + "start_offset": 8, + "end_offset": 15, + "type": "", + "position": 1 + } + ] +} +``` + +You can further examine the impact of the `keyword_repeat` token filter by adding the following parameters to the `_analyze` query: + +```json +POST /my_index/_analyze +{ + "analyzer": "my_custom_analyzer", + "text": "Stopped quickly", + "explain": true, + "attributes": "keyword" +} +``` +{% include copy-curl.html %} + +The response includes detailed information, such as tokenization, filtering, and the application of specific token filters: + +```json +{ + "detail": { + "custom_analyzer": true, + "charfilters": [], + "tokenizer": { + "name": "standard", + "tokens": [ + {"token": "OpenSearch","start_offset": 0,"end_offset": 10,"type": "","position": 0}, + {"token": "helped","start_offset": 11,"end_offset": 17,"type": "","position": 1}, + {"token": "many","start_offset": 18,"end_offset": 22,"type": "","position": 2}, + {"token": "employers","start_offset": 23,"end_offset": 32,"type": "","position": 3} + ] + }, + "tokenfilters": [ + { + "name": "lowercase", + "tokens": [ + {"token": "opensearch","start_offset": 0,"end_offset": 10,"type": "","position": 0}, + {"token": "helped","start_offset": 11,"end_offset": 17,"type": "","position": 1}, + {"token": "many","start_offset": 18,"end_offset": 22,"type": "","position": 2}, + {"token": "employers","start_offset": 23,"end_offset": 32,"type": "","position": 3} + ] + }, + { + "name": "keyword_marker_filter", + "tokens": [ + {"token": "opensearch","start_offset": 0,"end_offset": 10,"type": "","position": 0,"keyword": true}, + {"token": "helped","start_offset": 11,"end_offset": 17,"type": "","position": 1,"keyword": false}, + {"token": "many","start_offset": 18,"end_offset": 22,"type": "","position": 2,"keyword": false}, + {"token": "employers","start_offset": 23,"end_offset": 32,"type": "","position": 3,"keyword": false} + ] + }, + { + "name": "kstem_filter", + "tokens": [ + {"token": "opensearch","start_offset": 0,"end_offset": 10,"type": "","position": 0,"keyword": true}, + {"token": "help","start_offset": 11,"end_offset": 17,"type": "","position": 1,"keyword": false}, + {"token": "many","start_offset": 18,"end_offset": 22,"type": "","position": 2,"keyword": false}, + {"token": "employer","start_offset": 23,"end_offset": 32,"type": "","position": 3,"keyword": false} + ] + } + ] + } +} +``` \ No newline at end of file diff --git a/_analyzers/token-filters/kstem.md b/_analyzers/token-filters/kstem.md new file mode 100644 index 0000000000..d13fd2c675 --- /dev/null +++ b/_analyzers/token-filters/kstem.md @@ -0,0 +1,92 @@ +--- +layout: default +title: KStem +parent: Token filters +nav_order: 220 +--- + +# KStem token filter + +The `kstem` token filter is a stemming filter used to reduce words to their root forms. The filter is a lightweight algorithmic stemmer designed for the English language that performs the following stemming operations: + +- Reduces plurals to their singular form. +- Converts different verb tenses to their base form. +- Removes common derivational endings, such as "-ing" or "-ed". + +The `kstem` token filter is equivalent to the a `stemmer` filter configured with a `light_english` language. It provides a more conservative stemming compared to other stemming filters like `porter_stem`. + +The `kstem` token filter is based on the Lucene KStemFilter. For more information, see the [Lucene documentation](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/en/KStemFilter.html). + +## Example + +The following example request creates a new index named `my_kstem_index` and configures an analyzer with a `kstem` filter: + +```json +PUT /my_kstem_index +{ + "settings": { + "analysis": { + "filter": { + "kstem_filter": { + "type": "kstem" + } + }, + "analyzer": { + "my_kstem_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "kstem_filter" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "my_kstem_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_kstem_index/_analyze +{ + "analyzer": "my_kstem_analyzer", + "text": "stops stopped" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "stop", + "start_offset": 0, + "end_offset": 5, + "type": "", + "position": 0 + }, + { + "token": "stop", + "start_offset": 6, + "end_offset": 13, + "type": "", + "position": 1 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/token-filters/kuromoji-completion.md b/_analyzers/token-filters/kuromoji-completion.md new file mode 100644 index 0000000000..24833e92e1 --- /dev/null +++ b/_analyzers/token-filters/kuromoji-completion.md @@ -0,0 +1,127 @@ +--- +layout: default +title: Kuromoji completion +parent: Token filters +nav_order: 230 +--- + +# Kuromoji completion token filter + +The `kuromoji_completion` token filter is used to stem Katakana words in Japanese, which are often used to represent foreign words or loanwords. This filter is especially useful for autocompletion or suggest queries, in which partial matches on Katakana words can be expanded to include their full forms. + +To use this token filter, you must first install the `analysis-kuromoji` plugin on all nodes by running `bin/opensearch-plugin install analysis-kuromoji` and then restart the cluster. For more information about installing additional plugins, see [Additional plugins]({{site.url}}{{site.baseurl}}/install-and-configure/additional-plugins/index/). + +## Example + +The following example request creates a new index named `kuromoji_sample` and configures an analyzer with a `kuromoji_completion` filter: + +```json +PUT kuromoji_sample +{ + "settings": { + "index": { + "analysis": { + "analyzer": { + "my_analyzer": { + "tokenizer": "kuromoji_tokenizer", + "filter": [ + "my_katakana_stemmer" + ] + } + }, + "filter": { + "my_katakana_stemmer": { + "type": "kuromoji_completion" + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer with text that translates to "use a computer": + +```json +POST /kuromoji_sample/_analyze +{ + "analyzer": "my_analyzer", + "text": "コンピューターを使う" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "コンピューター", // The original Katakana word "computer". + "start_offset": 0, + "end_offset": 7, + "type": "word", + "position": 0 + }, + { + "token": "konpyuーtaー", // Romanized version (Romaji) of "コンピューター". + "start_offset": 0, + "end_offset": 7, + "type": "word", + "position": 0 + }, + { + "token": "konnpyuーtaー", // Another possible romanized version of "コンピューター" (with a slight variation in the spelling). + "start_offset": 0, + "end_offset": 7, + "type": "word", + "position": 0 + }, + { + "token": "を", // A Japanese particle, "wo" or "o" + "start_offset": 7, + "end_offset": 8, + "type": "word", + "position": 1 + }, + { + "token": "wo", // Romanized form of the particle "を" (often pronounced as "o"). + "start_offset": 7, + "end_offset": 8, + "type": "word", + "position": 1 + }, + { + "token": "o", // Another version of the romanization. + "start_offset": 7, + "end_offset": 8, + "type": "word", + "position": 1 + }, + { + "token": "使う", // The verb "use" in Kanji. + "start_offset": 8, + "end_offset": 10, + "type": "word", + "position": 2 + }, + { + "token": "tukau", // Romanized version of "使う" + "start_offset": 8, + "end_offset": 10, + "type": "word", + "position": 2 + }, + { + "token": "tsukau", // Another romanized version of "使う", where "tsu" is more phonetically correct + "start_offset": 8, + "end_offset": 10, + "type": "word", + "position": 2 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/token-filters/length.md b/_analyzers/token-filters/length.md new file mode 100644 index 0000000000..f6c5dcc706 --- /dev/null +++ b/_analyzers/token-filters/length.md @@ -0,0 +1,91 @@ +--- +layout: default +title: Length +parent: Token filters +nav_order: 240 +--- + +# Length token filter + +The `length` token filter is used to remove tokens that don't meet specified length criteria (minimum and maximum values) from the token stream. + +## Parameters + +The `length` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`min` | Optional | Integer | The minimum token length. Default is `0`. +`max` | Optional | Integer | The maximum token length. Default is `Integer.MAX_VALUE` (`2147483647`). + + +## Example + +The following example request creates a new index named `my_index` and configures an analyzer with a `length` filter: + +```json +PUT my_index +{ + "settings": { + "analysis": { + "analyzer": { + "only_keep_4_to_10_characters": { + "tokenizer": "whitespace", + "filter": [ "length_4_to_10" ] + } + }, + "filter": { + "length_4_to_10": { + "type": "length", + "min": 4, + "max": 10 + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my_index/_analyze +{ + "analyzer": "only_keep_4_to_10_characters", + "text": "OpenSearch is a great tool!" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "OpenSearch", + "start_offset": 0, + "end_offset": 10, + "type": "word", + "position": 0 + }, + { + "token": "great", + "start_offset": 16, + "end_offset": 21, + "type": "word", + "position": 3 + }, + { + "token": "tool!", + "start_offset": 22, + "end_offset": 27, + "type": "word", + "position": 4 + } + ] +} +``` diff --git a/_analyzers/token-filters/limit.md b/_analyzers/token-filters/limit.md new file mode 100644 index 0000000000..a849f5f06b --- /dev/null +++ b/_analyzers/token-filters/limit.md @@ -0,0 +1,89 @@ +--- +layout: default +title: Limit +parent: Token filters +nav_order: 250 +--- + +# Limit token filter + +The `limit` token filter is used to limit the number of tokens passed through the analysis chain. + +## Parameters + +The `limit` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`max_token_count` | Optional | Integer | The maximum number of tokens to be generated. Default is `1`. +`consume_all_tokens` | Optional | Boolean | (Expert-level setting) Uses all tokens from the tokenizer, even if the result exceeds `max_token_count`. When this parameter is set, the output still only contains the number of tokens specified by `max_token_count`. However, all tokens generated by the tokenizer are processed. Default is `false`. + +## Example + +The following example request creates a new index named `my_index` and configures an analyzer with a `limit` filter: + +```json +PUT my_index +{ + "settings": { + "analysis": { + "analyzer": { + "three_token_limit": { + "tokenizer": "standard", + "filter": [ "custom_token_limit" ] + } + }, + "filter": { + "custom_token_limit": { + "type": "limit", + "max_token_count": 3 + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my_index/_analyze +{ + "analyzer": "three_token_limit", + "text": "OpenSearch is a powerful and flexible search engine." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "OpenSearch", + "start_offset": 0, + "end_offset": 10, + "type": "", + "position": 0 + }, + { + "token": "is", + "start_offset": 11, + "end_offset": 13, + "type": "", + "position": 1 + }, + { + "token": "a", + "start_offset": 14, + "end_offset": 15, + "type": "", + "position": 2 + } + ] +} +``` diff --git a/_analyzers/token-filters/lowercase.md b/_analyzers/token-filters/lowercase.md new file mode 100644 index 0000000000..89f0f219fa --- /dev/null +++ b/_analyzers/token-filters/lowercase.md @@ -0,0 +1,82 @@ +--- +layout: default +title: Lowercase +parent: Token filters +nav_order: 260 +--- + +# Lowercase token filter + +The `lowercase` token filter is used to convert all characters in the token stream to lowercase, making searches case insensitive. + +## Parameters + +The `lowercase` token filter can be configured with the following parameter. + +Parameter | Required/Optional | Description +:--- | :--- | :--- + `language` | Optional | Specifies a language-specific token filter. Valid values are:
- [`greek`](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/el/GreekLowerCaseFilter.html)
- [`irish`](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/ga/IrishLowerCaseFilter.html)
- [`turkish`](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/tr/TurkishLowerCaseFilter.html).
Default is the [Lucene LowerCaseFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/core/LowerCaseFilter.html). + +## Example + +The following example request creates a new index named `custom_lowercase_example`. It configures an analyzer with a `lowercase` filter and specifies `greek` as the `language`: + +```json +PUT /custom_lowercase_example +{ + "settings": { + "analysis": { + "analyzer": { + "greek_lowercase_example": { + "type": "custom", + "tokenizer": "standard", + "filter": ["greek_lowercase"] + } + }, + "filter": { + "greek_lowercase": { + "type": "lowercase", + "language": "greek" + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /custom_lowercase_example/_analyze +{ + "analyzer": "greek_lowercase_example", + "text": "Αθήνα ΕΛΛΑΔΑ" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "αθηνα", + "start_offset": 0, + "end_offset": 5, + "type": "", + "position": 0 + }, + { + "token": "ελλαδα", + "start_offset": 6, + "end_offset": 12, + "type": "", + "position": 1 + } + ] +} +``` diff --git a/_analyzers/token-filters/min-hash.md b/_analyzers/token-filters/min-hash.md new file mode 100644 index 0000000000..e4f1a8da91 --- /dev/null +++ b/_analyzers/token-filters/min-hash.md @@ -0,0 +1,138 @@ +--- +layout: default +title: Min hash +parent: Token filters +nav_order: 270 +--- + +# Min hash token filter + +The `min_hash` token filter is used to generate hashes for tokens based on a [MinHash](https://en.wikipedia.org/wiki/MinHash) approximation algorithm, which is useful for detecting similarity between documents. The `min_hash` token filter generates hashes for a set of tokens (typically from an analyzed field). + +## Parameters + +The `min_hash` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`hash_count` | Optional | Integer | The number of hash values to generate for each token. Increasing this value generally improves the accuracy of similarity estimation but increases the computational cost. Default is `1`. +`bucket_count` | Optional | Integer | The number of hash buckets to use. This affects the granularity of the hashing. A larger number of buckets provides finer granularity and reduces hash collisions but requires more memory. Default is `512`. +`hash_set_size` | Optional | Integer | The number of hashes to retain in each bucket. This can influence the hashing quality. Larger set sizes may lead to better similarity detection but consume more memory. Default is `1`. +`with_rotation` | Optional | Boolean | When set to `true`, the filter populates empty buckets with the value from the first non-empty bucket found to its circular right, provided that the `hash_set_size` is `1`. If the `bucket_count` argument exceeds `1`, this setting automatically defaults to `true`; otherwise, it defaults to `false`. + +## Example + +The following example request creates a new index named `minhash_index` and configures an analyzer with a `min_hash` filter: + +```json +PUT /minhash_index +{ + "settings": { + "analysis": { + "filter": { + "minhash_filter": { + "type": "min_hash", + "hash_count": 3, + "bucket_count": 512, + "hash_set_size": 1, + "with_rotation": false + } + }, + "analyzer": { + "minhash_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "minhash_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /minhash_index/_analyze +{ + "analyzer": "minhash_analyzer", + "text": "OpenSearch is very powerful." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens (the tokens are not human readable because they represent hashes): + +```json +{ + "tokens" : [ + { + "token" : "\u0000\u0000㳠锯ੲ걌䐩䉵", + "start_offset" : 0, + "end_offset" : 27, + "type" : "MIN_HASH", + "position" : 0 + }, + { + "token" : "\u0000\u0000㳠锯ੲ걌䐩䉵", + "start_offset" : 0, + "end_offset" : 27, + "type" : "MIN_HASH", + "position" : 0 + }, + ... +``` + +In order to demonstrate the usefulness of the `min_hash` token filter, you can use the following Python script to compare the two strings using the previously created analyzer: + +```python +from opensearchpy import OpenSearch +from requests.auth import HTTPBasicAuth + +# Initialize the OpenSearch client with authentication +host = 'https://localhost:9200' # Update if using a different host/port +auth = ('admin', 'admin') # Username and password + +# Create the OpenSearch client with SSL verification turned off +client = OpenSearch( + hosts=[host], + http_auth=auth, + use_ssl=True, + verify_certs=False, # Disable SSL certificate validation + ssl_show_warn=False # Suppress SSL warnings in the output +) + +# Analyzes text and returns the minhash tokens +def analyze_text(index, text): + response = client.indices.analyze( + index=index, + body={ + "analyzer": "minhash_analyzer", + "text": text + } + ) + return [token['token'] for token in response['tokens']] + +# Analyze two similar texts +tokens_1 = analyze_text('minhash_index', 'OpenSearch is a powerful search engine.') +tokens_2 = analyze_text('minhash_index', 'OpenSearch is a very powerful search engine.') + +# Calculate Jaccard similarity +set_1 = set(tokens_1) +set_2 = set(tokens_2) +shared_tokens = set_1.intersection(set_2) +jaccard_similarity = len(shared_tokens) / len(set_1.union(set_2)) + +print(f"Jaccard Similarity: {jaccard_similarity}") +``` + +The response should contain the Jaccard similarity score: + +```yaml +Jaccard Similarity: 0.8571428571428571 +``` \ No newline at end of file diff --git a/_analyzers/token-filters/multiplexer.md b/_analyzers/token-filters/multiplexer.md new file mode 100644 index 0000000000..21597b7fc1 --- /dev/null +++ b/_analyzers/token-filters/multiplexer.md @@ -0,0 +1,165 @@ +--- +layout: default +title: Multiplexer +parent: Token filters +nav_order: 280 +--- + +# Multiplexer token filter + +The `multiplexer` token filter allows you to create multiple versions of the same token by applying different filters. This is useful when you want to analyze the same token in multiple ways. For example, you may want to analyze a token using different stemming, synonyms, or n-gram filters and use all of the generated tokens together. This token filter works by duplicating the token stream and applying different filters to each copy. + +The `multiplexer` token filter removes duplicate tokens from the token stream. +{: .important} + +The `multiplexer` token filter does not support multiword `synonym` or `synonym_graph` token filters or `shingle` token filters because they need to analyze not only the current token but also upcoming tokens in order to determine how to transform the input correctly. +{: .important} + +## Parameters + +The `multiplexer` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`filters` | Optional | List of strings | A comma-separated list of token filters to apply to each copy of the token stream. Default is an empty list. +`preserve_original` | Optional | Boolean | Whether to keep the original token as one of the outputs. Default is `true`. + +## Example + +The following example request creates a new index named `multiplexer_index` and configures an analyzer with a `multiplexer` filter: + +```json +PUT /multiplexer_index +{ + "settings": { + "analysis": { + "filter": { + "english_stemmer": { + "type": "stemmer", + "name": "english" + }, + "synonym_filter": { + "type": "synonym", + "synonyms": [ + "quick,fast" + ] + }, + "multiplexer_filter": { + "type": "multiplexer", + "filters": ["english_stemmer", "synonym_filter"], + "preserve_original": true + } + }, + "analyzer": { + "multiplexer_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "multiplexer_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /multiplexer_index/_analyze +{ + "analyzer": "multiplexer_analyzer", + "text": "The slow turtle hides from the quick dog" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "The", + "start_offset": 0, + "end_offset": 3, + "type": "", + "position": 0 + }, + { + "token": "slow", + "start_offset": 4, + "end_offset": 8, + "type": "", + "position": 1 + }, + { + "token": "turtle", + "start_offset": 9, + "end_offset": 15, + "type": "", + "position": 2 + }, + { + "token": "turtl", + "start_offset": 9, + "end_offset": 15, + "type": "", + "position": 2 + }, + { + "token": "hides", + "start_offset": 16, + "end_offset": 21, + "type": "", + "position": 3 + }, + { + "token": "hide", + "start_offset": 16, + "end_offset": 21, + "type": "", + "position": 3 + }, + { + "token": "from", + "start_offset": 22, + "end_offset": 26, + "type": "", + "position": 4 + }, + { + "token": "the", + "start_offset": 27, + "end_offset": 30, + "type": "", + "position": 5 + }, + { + "token": "quick", + "start_offset": 31, + "end_offset": 36, + "type": "", + "position": 6 + }, + { + "token": "fast", + "start_offset": 31, + "end_offset": 36, + "type": "SYNONYM", + "position": 6 + }, + { + "token": "dog", + "start_offset": 37, + "end_offset": 40, + "type": "", + "position": 7 + } + ] +} +``` diff --git a/_analyzers/token-filters/ngram.md b/_analyzers/token-filters/ngram.md new file mode 100644 index 0000000000..c029eac26e --- /dev/null +++ b/_analyzers/token-filters/ngram.md @@ -0,0 +1,137 @@ +--- +layout: default +title: N-gram +parent: Token filters +nav_order: 290 +--- + +# N-gram token filter + +The `ngram` token filter is a powerful tool used to break down text into smaller components, known as _n-grams_, which can improve partial matching and fuzzy search capabilities. It works by splitting a token into smaller substrings of defined lengths. These filters are commonly used in search applications to support autocomplete, partial matches, and typo-tolerant search. For more information, see [Autocomplete functionality]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/autocomplete/) and [Did-you-mean]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/did-you-mean/). + +## Parameters + +The `ngram` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`min_gram` | Optional | Integer | The minimum length of the n-grams. Default is `1`. +`max_gram` | Optional | Integer | The maximum length of the n-grams. Default is `2`. +`preserve_original` | Optional | Boolean | Whether to keep the original token as one of the outputs. Default is `false`. + +## Example + +The following example request creates a new index named `ngram_example_index` and configures an analyzer with an `ngram` filter: + +```json +PUT /ngram_example_index +{ + "settings": { + "analysis": { + "filter": { + "ngram_filter": { + "type": "ngram", + "min_gram": 2, + "max_gram": 3 + } + }, + "analyzer": { + "ngram_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "ngram_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /ngram_example_index/_analyze +{ + "analyzer": "ngram_analyzer", + "text": "Search" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "se", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "sea", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "ea", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "ear", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "ar", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "arc", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "rc", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "rch", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "ch", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + } + ] +} +``` diff --git a/_analyzers/token-filters/normalization.md b/_analyzers/token-filters/normalization.md new file mode 100644 index 0000000000..1be08e65c2 --- /dev/null +++ b/_analyzers/token-filters/normalization.md @@ -0,0 +1,88 @@ +--- +layout: default +title: Normalization +parent: Token filters +nav_order: 300 +--- + +# Normalization token filter + +The `normalization` token filter is designed to adjust and simplify text in a way that reduces variations, particularly variations in special characters. It is primarily used to handle variations in writing by standardizing characters in specific languages. + +The following `normalization` token filters are available: + +- [arabic_normalization](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/ar/ArabicNormalizer.html) +- [german_normalization](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/de/GermanNormalizationFilter.html) +- [hindi_normalization](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/hi/HindiNormalizer.html) +- [indic_normalization](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/in/IndicNormalizer.html) +- [sorani_normalization](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/ckb/SoraniNormalizer.html) +- [persian_normalization](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/fa/PersianNormalizer.html) +- [scandinavian_normalization](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/miscellaneous/ScandinavianNormalizationFilter.html) +- [scandinavian_folding](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/miscellaneous/ScandinavianFoldingFilter.html) +- [serbian_normalization](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/sr/SerbianNormalizationFilter.html) + + +## Example + +The following example request creates a new index named `german_normalizer_example` and configures an analyzer with a `german_normalization` filter: + +```json +PUT /german_normalizer_example +{ + "settings": { + "analysis": { + "filter": { + "german_normalizer": { + "type": "german_normalization" + } + }, + "analyzer": { + "german_normalizer_analyzer": { + "tokenizer": "standard", + "filter": [ + "lowercase", + "german_normalizer" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /german_normalizer_example/_analyze +{ + "text": "Straße München", + "analyzer": "german_normalizer_analyzer" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "strasse", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "munchen", + "start_offset": 7, + "end_offset": 14, + "type": "", + "position": 1 + } + ] +} +``` diff --git a/_analyzers/token-filters/pattern-capture.md b/_analyzers/token-filters/pattern-capture.md new file mode 100644 index 0000000000..cff36b583d --- /dev/null +++ b/_analyzers/token-filters/pattern-capture.md @@ -0,0 +1,97 @@ +--- +layout: default +title: Pattern capture +parent: Token filters +nav_order: 310 +--- + +# Pattern capture token filter + +The `pattern_capture` token filter is a powerful filter that uses regular expressions to capture and extract parts of text according to specific patterns. This filter can be useful when you want to extract particular parts of tokens, such as email domains, hashtags, or numbers, and reuse them for further analysis or indexing. + +## Parameters + +The `pattern_capture` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`patterns` | Required | Array of strings | An array of regular expressions used to capture parts of text. +`preserve_original` | Required | Boolean| Whether to keep the original token in the output. Default is `true`. + + +## Example + +The following example request creates a new index named `email_index` and configures an analyzer with a `pattern_capture` filter to extract the local part and domain name from an email address: + +```json +PUT /email_index +{ + "settings": { + "analysis": { + "filter": { + "email_pattern_capture": { + "type": "pattern_capture", + "preserve_original": true, + "patterns": [ + "^([^@]+)", + "@(.+)$" + ] + } + }, + "analyzer": { + "email_analyzer": { + "tokenizer": "uax_url_email", + "filter": [ + "email_pattern_capture", + "lowercase" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /email_index/_analyze +{ + "text": "john.doe@example.com", + "analyzer": "email_analyzer" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "john.doe@example.com", + "start_offset": 0, + "end_offset": 20, + "type": "", + "position": 0 + }, + { + "token": "john.doe", + "start_offset": 0, + "end_offset": 20, + "type": "", + "position": 0 + }, + { + "token": "example.com", + "start_offset": 0, + "end_offset": 20, + "type": "", + "position": 0 + } + ] +} +``` diff --git a/_analyzers/token-filters/pattern-replace.md b/_analyzers/token-filters/pattern-replace.md new file mode 100644 index 0000000000..73ef7fa7d8 --- /dev/null +++ b/_analyzers/token-filters/pattern-replace.md @@ -0,0 +1,116 @@ +--- +layout: default +title: Pattern replace +parent: Token filters +nav_order: 320 +--- + +# Pattern replace token filter + +The `pattern_replace` token filter allows you to modify tokens using regular expressions. This filter replaces patterns in tokens with the specified values, giving you flexibility in transforming or normalizing tokens before indexing them. It's particularly useful when you need to clean or standardize text during analysis. + +## Parameters + +The `pattern_replace` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`pattern` | Required | String | A regular expression pattern that matches the text that needs to be replaced. +`all` | Optional | Boolean | Whether to replace all pattern matches. If `false`, only the first match is replaced. Default is `true`. +`replacement` | Optional | String | A string with which to replace the matched pattern. Default is an empty string. + + +## Example + +The following example request creates a new index named `text_index` and configures an analyzer with a `pattern_replace` filter to replace tokens containing digits with the string `[NUM]`: + +```json +PUT /text_index +{ + "settings": { + "analysis": { + "filter": { + "number_replace_filter": { + "type": "pattern_replace", + "pattern": "\\d+", + "replacement": "[NUM]" + } + }, + "analyzer": { + "number_analyzer": { + "tokenizer": "standard", + "filter": [ + "lowercase", + "number_replace_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /text_index/_analyze +{ + "text": "Visit us at 98765 Example St.", + "analyzer": "number_analyzer" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "visit", + "start_offset": 0, + "end_offset": 5, + "type": "", + "position": 0 + }, + { + "token": "us", + "start_offset": 6, + "end_offset": 8, + "type": "", + "position": 1 + }, + { + "token": "at", + "start_offset": 9, + "end_offset": 11, + "type": "", + "position": 2 + }, + { + "token": "[NUM]", + "start_offset": 12, + "end_offset": 17, + "type": "", + "position": 3 + }, + { + "token": "example", + "start_offset": 18, + "end_offset": 25, + "type": "", + "position": 4 + }, + { + "token": "st", + "start_offset": 26, + "end_offset": 28, + "type": "", + "position": 5 + } + ] +} +``` diff --git a/_analyzers/token-filters/phonetic.md b/_analyzers/token-filters/phonetic.md new file mode 100644 index 0000000000..7fe380851f --- /dev/null +++ b/_analyzers/token-filters/phonetic.md @@ -0,0 +1,98 @@ +--- +layout: default +title: Phonetic +parent: Token filters +nav_order: 330 +--- + +# Phonetic token filter + +The `phonetic` token filter transforms tokens into their phonetic representations, enabling more flexible matching of words that sound similar but are spelled differently. This is particularly useful for searching names, brands, or other entities that users might spell differently but pronounce similarly. + +The `phonetic` token filter is not included in OpenSearch distributions by default. To use this token filter, you must first install the `analysis-phonetic` plugin as follows and then restart OpenSearch: + +```bash +./bin/opensearch-plugin install analysis-phonetic +``` +{% include copy.html %} + +For more information about installing plugins, see [Installing plugins]({{site.url}}{{site.baseurl}}/install-and-configure/plugins/). +{: .note} + +## Parameters + +The `phonetic` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`encoder` | Optional | String | Specifies the phonetic algorithm to use.

Valid values are:
- `metaphone` (default)
- `double_metaphone`
- `soundex`
- `refined_soundex`
- `caverphone1`
- `caverphone2`
- `cologne`
- `nysiis`
- `koelnerphonetik`
- `haasephonetik`
- `beider_morse`
- `daitch_mokotoff ` +`replace` | Optional | Boolean | Whether to replace the original token. If `false`, the original token is included in the output along with the phonetic encoding. Default is `true`. + + +## Example + +The following example request creates a new index named `names_index` and configures an analyzer with a `phonetic` filter: + +```json +PUT /names_index +{ + "settings": { + "analysis": { + "filter": { + "my_phonetic_filter": { + "type": "phonetic", + "encoder": "double_metaphone", + "replace": true + } + }, + "analyzer": { + "phonetic_analyzer": { + "tokenizer": "standard", + "filter": [ + "my_phonetic_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated for the names `Stephen` and `Steven` using the analyzer: + +```json +POST /names_index/_analyze +{ + "text": "Stephen", + "analyzer": "phonetic_analyzer" +} +``` +{% include copy-curl.html %} + +```json +POST /names_index/_analyze +{ + "text": "Steven", + "analyzer": "phonetic_analyzer" +} +``` +{% include copy-curl.html %} + +In both cases, the response contains the same generated token: + +```json +{ + "tokens": [ + { + "token": "STFN", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + } + ] +} +``` diff --git a/_analyzers/token-filters/porter-stem.md b/_analyzers/token-filters/porter-stem.md new file mode 100644 index 0000000000..fa2f4208a7 --- /dev/null +++ b/_analyzers/token-filters/porter-stem.md @@ -0,0 +1,83 @@ +--- +layout: default +title: Porter stem +parent: Token filters +nav_order: 340 +--- + +# Porter stem token filter + +The `porter_stem` token filter reduces words to their base (or _stem_) form and removes common suffixes from words, which helps in matching similar words by their root. For example, the word `running` is stemmed to `run`. This token filter is primarily used for the English language and provides stemming based on the [Porter stemming algorithm](https://snowballstem.org/algorithms/porter/stemmer.html). + + +## Example + +The following example request creates a new index named `my_stem_index` and configures an analyzer with a `porter_stem` filter: + +```json +PUT /my_stem_index +{ + "settings": { + "analysis": { + "filter": { + "my_porter_stem": { + "type": "porter_stem" + } + }, + "analyzer": { + "porter_analyzer": { + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_porter_stem" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_stem_index/_analyze +{ + "text": "running runners ran", + "analyzer": "porter_analyzer" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "run", + "start_offset": 0, + "end_offset": 7, + "type": "", + "position": 0 + }, + { + "token": "runner", + "start_offset": 8, + "end_offset": 15, + "type": "", + "position": 1 + }, + { + "token": "ran", + "start_offset": 16, + "end_offset": 19, + "type": "", + "position": 2 + } + ] +} +``` diff --git a/_analyzers/token-filters/predicate-token-filter.md b/_analyzers/token-filters/predicate-token-filter.md new file mode 100644 index 0000000000..24729f0224 --- /dev/null +++ b/_analyzers/token-filters/predicate-token-filter.md @@ -0,0 +1,82 @@ +--- +layout: default +title: Predicate token filter +parent: Token filters +nav_order: 340 +--- + +# Predicate token filter + +The `predicate_token_filter` evaluates whether tokens should be kept or discarded, depending on the conditions defined in a custom script. The tokens are evaluated in the analysis predicate context. This filter supports only inline Painless scripts. + +## Parameters + +The `predicate_token_filter` has one required parameter: `script`. This parameter provides a condition that is used to evaluate whether the token should be kept. + +## Example + +The following example request creates a new index named `predicate_index` and configures an analyzer with a `predicate_token_filter`. The filter specifies to only output tokens if they are longer than 7 characters: + +```json +PUT /predicate_index +{ + "settings": { + "analysis": { + "filter": { + "my_predicate_filter": { + "type": "predicate_token_filter", + "script": { + "source": "token.term.length() > 7" + } + } + }, + "analyzer": { + "predicate_analyzer": { + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_predicate_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /predicate_index/_analyze +{ + "text": "The OpenSearch community is growing rapidly", + "analyzer": "predicate_analyzer" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "opensearch", + "start_offset": 4, + "end_offset": 14, + "type": "", + "position": 1 + }, + { + "token": "community", + "start_offset": 15, + "end_offset": 24, + "type": "", + "position": 2 + } + ] +} +``` diff --git a/_analyzers/token-filters/remove-duplicates.md b/_analyzers/token-filters/remove-duplicates.md new file mode 100644 index 0000000000..b0a589884a --- /dev/null +++ b/_analyzers/token-filters/remove-duplicates.md @@ -0,0 +1,152 @@ +--- +layout: default +title: Remove duplicates +parent: Token filters +nav_order: 350 +--- + +# Remove duplicates token filter + +The `remove_duplicates` token filter is used to remove duplicate tokens that are generated in the same position during analysis. + +## Example + +The following example request creates an index with a `keyword_repeat` token filter. The filter adds a `keyword` version of each token in the same position as the token itself and then uses a `kstem` to create a stemmed version of the token: + +```json +PUT /example-index +{ + "settings": { + "analysis": { + "analyzer": { + "custom_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "keyword_repeat", + "kstem" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +Use the following request to analyze the string `Slower turtle`: + +```json +GET /example-index/_analyze +{ + "analyzer": "custom_analyzer", + "text": "Slower turtle" +} +``` +{% include copy-curl.html %} + +The response contains the token `turtle` twice in the same position: + +```json +{ + "tokens": [ + { + "token": "slower", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "slow", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "turtle", + "start_offset": 7, + "end_offset": 13, + "type": "", + "position": 1 + }, + { + "token": "turtle", + "start_offset": 7, + "end_offset": 13, + "type": "", + "position": 1 + } + ] +} +``` + +The duplicate token can be removed by adding a `remove_duplicates` token filter to the index settings: + +```json +PUT /index-remove-duplicate +{ + "settings": { + "analysis": { + "analyzer": { + "custom_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "keyword_repeat", + "kstem", + "remove_duplicates" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /index-remove-duplicate/_analyze +{ + "analyzer": "custom_analyzer", + "text": "Slower turtle" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "slower", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "slow", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "turtle", + "start_offset": 7, + "end_offset": 13, + "type": "", + "position": 1 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/token-filters/reverse.md b/_analyzers/token-filters/reverse.md new file mode 100644 index 0000000000..dc48f07e77 --- /dev/null +++ b/_analyzers/token-filters/reverse.md @@ -0,0 +1,86 @@ +--- +layout: default +title: Reverse +parent: Token filters +nav_order: 360 +--- + +# Reverse token filter + +The `reverse` token filter reverses the order of the characters in each token, making suffix information accessible at the beginning of the reversed tokens during analysis. + +This is useful for suffix-based searches: + +The `reverse` token filter is useful when you need to perform suffix-based searches, such as in the following scenarios: + +- **Suffix matching**: Searching for words based on their suffixes, such as identifying words with a specific ending (for example, `-tion` or `-ing`). +- **File extension searches**: Searching for files by their extensions, such as `.txt` or `.jpg`. +- **Custom sorting or ranking**: By reversing tokens, you can implement unique sorting or ranking logic based on suffixes. +- **Autocomplete for suffixes**: Implementing autocomplete suggestions that use suffixes rather than prefixes. + + +## Example + +The following example request creates a new index named `my-reverse-index` and configures an analyzer with a `reverse` filter: + +```json +PUT /my-reverse-index +{ + "settings": { + "analysis": { + "filter": { + "reverse_filter": { + "type": "reverse" + } + }, + "analyzer": { + "my_reverse_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "reverse_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my-reverse-index/_analyze +{ + "analyzer": "my_reverse_analyzer", + "text": "hello world" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "olleh", + "start_offset": 0, + "end_offset": 5, + "type": "", + "position": 0 + }, + { + "token": "dlrow", + "start_offset": 6, + "end_offset": 11, + "type": "", + "position": 1 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/token-filters/shingle.md b/_analyzers/token-filters/shingle.md new file mode 100644 index 0000000000..ea961bf3e0 --- /dev/null +++ b/_analyzers/token-filters/shingle.md @@ -0,0 +1,120 @@ +--- +layout: default +title: Shingle +parent: Token filters +nav_order: 370 +--- + +# Shingle token filter + +The `shingle` token filter is used to generate word n-grams, or _shingles_, from input text. For example, for the string `slow green turtle`, the `shingle` filter creates the following one- and two-word shingles: `slow`, `slow green`, `green`, `green turtle`, and `turtle`. + +This token filter is often used in conjunction with other filters to enhance search accuracy by indexing phrases rather than individual tokens. For more information, see [Phrase suggester]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/did-you-mean/#phrase-suggester). + +## Parameters + +The `shingle` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`min_shingle_size` | Optional | Integer | The minimum number of tokens to concatenate. Default is `2`. +`max_shingle_size` | Optional | Integer | The maximum number of tokens to concatenate. Default is `2`. +`output_unigrams` | Optional | Boolean | Whether to include unigrams (individual tokens) as output. Default is `true`. +`output_unigrams_if_no_shingles` | Optional | Boolean | Whether to output unigrams if no shingles are generated. Default is `false`. +`token_separator` | Optional | String | A separator used to concatenate tokens into a shingle. Default is a space (`" "`). +`filler_token` | Optional | String | A token inserted into empty positions or gaps between tokens. Default is an underscore (`_`). + +If `output_unigrams` and `output_unigrams_if_no_shingles` are both set to `true`, `output_unigrams_if_no_shingles` is ignored. +{: .note} + +## Example + +The following example request creates a new index named `my-shingle-index` and configures an analyzer with a `shingle` filter: + +```json +PUT /my-shingle-index +{ + "settings": { + "analysis": { + "filter": { + "my_shingle_filter": { + "type": "shingle", + "min_shingle_size": 2, + "max_shingle_size": 2, + "output_unigrams": true + } + }, + "analyzer": { + "my_shingle_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_shingle_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my-shingle-index/_analyze +{ + "analyzer": "my_shingle_analyzer", + "text": "slow green turtle" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "slow", + "start_offset": 0, + "end_offset": 4, + "type": "", + "position": 0 + }, + { + "token": "slow green", + "start_offset": 0, + "end_offset": 10, + "type": "shingle", + "position": 0, + "positionLength": 2 + }, + { + "token": "green", + "start_offset": 5, + "end_offset": 10, + "type": "", + "position": 1 + }, + { + "token": "green turtle", + "start_offset": 5, + "end_offset": 17, + "type": "shingle", + "position": 1, + "positionLength": 2 + }, + { + "token": "turtle", + "start_offset": 11, + "end_offset": 17, + "type": "", + "position": 2 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/token-filters/snowball.md b/_analyzers/token-filters/snowball.md new file mode 100644 index 0000000000..149486e727 --- /dev/null +++ b/_analyzers/token-filters/snowball.md @@ -0,0 +1,108 @@ +--- +layout: default +title: Snowball +parent: Token filters +nav_order: 380 +--- + +# Snowball token filter + +The `snowball` token filter is a stemming filter based on the [Snowball](https://snowballstem.org/) algorithm. It supports many languages and is more efficient and accurate than the Porter stemming algorithm. + +## Parameters + +The `snowball` token filter can be configured with a `language` parameter that accepts the following values: + +- `Arabic` +- `Armenian` +- `Basque` +- `Catalan` +- `Danish` +- `Dutch` +- `English` (default) +- `Estonian` +- `Finnish` +- `French` +- `German` +- `German2` +- `Hungarian` +- `Italian` +- `Irish` +- `Kp` +- `Lithuanian` +- `Lovins` +- `Norwegian` +- `Porter` +- `Portuguese` +- `Romanian` +- `Russian` +- `Spanish` +- `Swedish` +- `Turkish` + +## Example + +The following example request creates a new index named `my-snowball-index` and configures an analyzer with a `snowball` filter: + +```json +PUT /my-snowball-index +{ + "settings": { + "analysis": { + "filter": { + "my_snowball_filter": { + "type": "snowball", + "language": "English" + } + }, + "analyzer": { + "my_snowball_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_snowball_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my-snowball-index/_analyze +{ + "analyzer": "my_snowball_analyzer", + "text": "running runners" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "run", + "start_offset": 0, + "end_offset": 7, + "type": "", + "position": 0 + }, + { + "token": "runner", + "start_offset": 8, + "end_offset": 15, + "type": "", + "position": 1 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/token-filters/stemmer-override.md b/_analyzers/token-filters/stemmer-override.md new file mode 100644 index 0000000000..c06f673714 --- /dev/null +++ b/_analyzers/token-filters/stemmer-override.md @@ -0,0 +1,139 @@ +--- +layout: default +title: Stemmer override +parent: Token filters +nav_order: 400 +--- + +# Stemmer override token filter + +The `stemmer_override` token filter allows you to define custom stemming rules that override the behavior of default stemmers like Porter or Snowball. This can be useful when you want to apply specific stemming behavior to certain words that might not be modified correctly by the standard stemming algorithms. + +## Parameters + +The `stemmer_override` token filter must be configured with exactly one of the following parameters. + +Parameter | Data type | Description +:--- | :--- | :--- +`rules` | String | Defines the override rules directly in the settings. +`rules_path` | String | Specifies the path to the file containing custom rules (mappings). The path can be either an absolute path or a path relative to the config directory. + +## Example + +The following example request creates a new index named `my-index` and configures an analyzer with a `stemmer_override` filter: + +```json +PUT /my-index +{ + "settings": { + "analysis": { + "filter": { + "my_stemmer_override_filter": { + "type": "stemmer_override", + "rules": [ + "running, runner => run", + "bought => buy", + "best => good" + ] + } + }, + "analyzer": { + "my_custom_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_stemmer_override_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my-index/_analyze +{ + "analyzer": "my_custom_analyzer", + "text": "I am a runner and bought the best shoes" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "i", + "start_offset": 0, + "end_offset": 1, + "type": "", + "position": 0 + }, + { + "token": "am", + "start_offset": 2, + "end_offset": 4, + "type": "", + "position": 1 + }, + { + "token": "a", + "start_offset": 5, + "end_offset": 6, + "type": "", + "position": 2 + }, + { + "token": "run", + "start_offset": 7, + "end_offset": 13, + "type": "", + "position": 3 + }, + { + "token": "and", + "start_offset": 14, + "end_offset": 17, + "type": "", + "position": 4 + }, + { + "token": "buy", + "start_offset": 18, + "end_offset": 24, + "type": "", + "position": 5 + }, + { + "token": "the", + "start_offset": 25, + "end_offset": 28, + "type": "", + "position": 6 + }, + { + "token": "good", + "start_offset": 29, + "end_offset": 33, + "type": "", + "position": 7 + }, + { + "token": "shoes", + "start_offset": 34, + "end_offset": 39, + "type": "", + "position": 8 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/token-filters/stemmer.md b/_analyzers/token-filters/stemmer.md new file mode 100644 index 0000000000..dd1344fcbc --- /dev/null +++ b/_analyzers/token-filters/stemmer.md @@ -0,0 +1,118 @@ +--- +layout: default +title: Stemmer +parent: Token filters +nav_order: 390 +--- + +# Stemmer token filter + +The `stemmer` token filter reduces words to their root or base form (also known as their _stem_). + +## Parameters + +The `stemmer` token filter can be configured with a `language` parameter that accepts the following values: + +- Arabic: `arabic` +- Armenian: `armenian` +- Basque: `basque` +- Bengali: `bengali` +- Brazilian Portuguese: `brazilian` +- Bulgarian: `bulgarian` +- Catalan: `catalan` +- Czech: `czech` +- Danish: `danish` +- Dutch: `dutch, dutch_kp` +- English: `english` (default), `light_english`, `lovins`, `minimal_english`, `porter2`, `possessive_english` +- Estonian: `estonian` +- Finnish: `finnish`, `light_finnish` +- French: `light_french`, `french`, `minimal_french` +- Galician: `galician`, `minimal_galician` (plural step only) +- German: `light_german`, `german`, `german2`, `minimal_german` +- Greek: `greek` +- Hindi: `hindi` +- Hungarian: `hungarian, light_hungarian` +- Indonesian: `indonesian` +- Irish: `irish` +- Italian: `light_italian, italian` +- Kurdish (Sorani): `sorani` +- Latvian: `latvian` +- Lithuanian: `lithuanian` +- Norwegian (Bokmål): `norwegian`, `light_norwegian`, `minimal_norwegian` +- Norwegian (Nynorsk): `light_nynorsk`, `minimal_nynorsk` +- Portuguese: `light_portuguese`, `minimal_portuguese`, `portuguese`, `portuguese_rslp` +- Romanian: `romanian` +- Russian: `russian`, `light_russian` +- Spanish: `light_spanish`, `spanish` +- Swedish: `swedish`, `light_swedish` +- Turkish: `turkish` + +You can also use the `name` parameter as an alias for the `language` parameter. If both are set, the `name` parameter is ignored. +{: .note} + +## Example + +The following example request creates a new index named `my-stemmer-index` and configures an analyzer with a `stemmer` filter: + +```json +PUT /my-stemmer-index +{ + "settings": { + "analysis": { + "filter": { + "my_english_stemmer": { + "type": "stemmer", + "language": "english" + } + }, + "analyzer": { + "my_stemmer_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_english_stemmer" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my-stemmer-index/_analyze +{ + "analyzer": "my_stemmer_analyzer", + "text": "running runs" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "run", + "start_offset": 0, + "end_offset": 7, + "type": "", + "position": 0 + }, + { + "token": "run", + "start_offset": 8, + "end_offset": 12, + "type": "", + "position": 1 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/token-filters/stop.md b/_analyzers/token-filters/stop.md new file mode 100644 index 0000000000..8f3e01b72d --- /dev/null +++ b/_analyzers/token-filters/stop.md @@ -0,0 +1,111 @@ +--- +layout: default +title: Stop +parent: Token filters +nav_order: 410 +--- + +# Stop token filter + +The `stop` token filter is used to remove common words (also known as _stopwords_) from a token stream during analysis. Stopwords are typically articles and prepositions, such as `a` or `for`. These words are not significantly meaningful in search queries and are often excluded to improve search efficiency and relevance. + +The default list of English stopwords includes the following words: `a`, `an`, `and`, `are`, `as`, `at`, `be`, `but`, `by`, `for`, `if`, `in`, `into`, `is`, `it`, `no`, `not`, `of`, `on`, `or`, `such`, `that`, `the`, `their`, `then`, `there`, `these`, `they`, `this`, `to`, `was`, `will`, and `with`. + +## Parameters + +The `stop` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`stopwords` | Optional | String | Specifies either a custom array of stopwords or a language for which to fetch the predefined Lucene stopword list:

- [`_arabic_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt)
- [`_armenian_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/hy/stopwords.txt)
- [`_basque_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/eu/stopwords.txt)
- [`_bengali_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt)
- [`_brazilian_` (Brazilian Portuguese)](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/br/stopwords.txt)
- [`_bulgarian_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt)
- [`_catalan_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/ca/stopwords.txt)
- [`_cjk_` (Chinese, Japanese, and Korean)](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/cjk/stopwords.txt)
- [`_czech_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/cz/stopwords.txt)
- [`_danish_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball/danish_stop.txt)
- [`_dutch_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball/dutch_stop.txt)
- [`_english_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishAnalyzer.java#L48) (Default)
- [`_estonian_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/et/stopwords.txt)
- [`_finnish_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball/finnish_stop.txt)
- [`_french_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball/french_stop.txt)
- [`_galician_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/gl/stopwords.txt)
- [`_german_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball/german_stop.txt)
- [`_greek_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/el/stopwords.txt)
- [`_hindi_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt)
- [`_hungarian_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball/hungarian_stop.txt)
- [`_indonesian_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/id/stopwords.txt)
- [`_irish_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/ga/stopwords.txt)
- [`_italian_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball/italian_stop.txt)
- [`_latvian_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/lv/stopwords.txt)
- [`_lithuanian_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/lt/stopwords.txt)
- [`_norwegian_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball/norwegian_stop.txt)
- [`_persian_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt)
- [`_portuguese_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball/portuguese_stop.txt)
- [`_romanian_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt)
- [`_russian_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball/russian_stop.txt)
- [`_sorani_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/sr/stopwords.txt)
- [`_spanish_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/ckb/stopwords.txt)
- [`_swedish_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball/swedish_stop.txt)
- [`_thai_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/th/stopwords.txt)
- [`_turkish_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/tr/stopwords.txt) +`stopwords_path` | Optional | String | Specifies the file path (absolute or relative to the config directory) of the file containing custom stopwords. +`ignore_case` | Optional | Boolean | If `true`, stopwords will be matched regardless of their case. Default is `false`. +`remove_trailing` | Optional | Boolean | If `true`, trailing stopwords will be removed during analysis. Default is `true`. + +## Example + +The following example request creates a new index named `my-stopword-index` and configures an analyzer with a `stop` filter that uses the predefined stopword list for the English language: + +```json +PUT /my-stopword-index +{ + "settings": { + "analysis": { + "filter": { + "my_stop_filter": { + "type": "stop", + "stopwords": "_english_" + } + }, + "analyzer": { + "my_stop_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_stop_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my-stopword-index/_analyze +{ + "analyzer": "my_stop_analyzer", + "text": "A quick dog jumps over the turtle" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "quick", + "start_offset": 2, + "end_offset": 7, + "type": "", + "position": 1 + }, + { + "token": "dog", + "start_offset": 8, + "end_offset": 11, + "type": "", + "position": 2 + }, + { + "token": "jumps", + "start_offset": 12, + "end_offset": 17, + "type": "", + "position": 3 + }, + { + "token": "over", + "start_offset": 18, + "end_offset": 22, + "type": "", + "position": 4 + }, + { + "token": "turtle", + "start_offset": 27, + "end_offset": 33, + "type": "", + "position": 6 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/token-filters/synonym-graph.md b/_analyzers/token-filters/synonym-graph.md new file mode 100644 index 0000000000..d8e763d1fc --- /dev/null +++ b/_analyzers/token-filters/synonym-graph.md @@ -0,0 +1,180 @@ +--- +layout: default +title: Synonym graph +parent: Token filters +nav_order: 420 +--- + +# Synonym graph token filter + +The `synonym_graph` token filter is a more advanced version of the `synonym` token filter. It supports multiword synonyms and processes synonyms across multiple tokens, making it ideal for phrases or scenarios in which relationships between tokens are important. + +## Parameters + +The `synonym_graph` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`synonyms` | Either `synonyms` or `synonyms_path` must be specified | String | A list of synonym rules defined directly in the configuration. +`synonyms_path` | Either `synonyms` or `synonyms_path` must be specified | String | The file path to a file containing synonym rules (either an absolute path or a path relative to the config directory). +`lenient` | Optional | Boolean | Whether to ignore exceptions when loading the rule configurations. Default is `false`. +`format` | Optional | String | Specifies the format used to determine how OpenSearch defines and interprets synonyms. Valid values are:
- `solr`
- [`wordnet`](https://wordnet.princeton.edu/).
Default is `solr`. +`expand` | Optional | Boolean | Whether to expand equivalent synonym rules. Default is `true`.

For example:
If `synonyms` are defined as `"quick, fast"` and `expand` is set to `true`, then the synonym rules are configured as follows:
- `quick => quick`
- `quick => fast`
- `fast => quick`
- `fast => fast`

If `expand` is set to `false`, the synonym rules are configured as follows:
- `quick => quick`
- `fast => quick` + +## Example: Solr format + +The following example request creates a new index named `my-index` and configures an analyzer with a `synonym_graph` filter. The filter is configured with the default `solr` rule format: + +```json +PUT /my-index +{ + "settings": { + "analysis": { + "filter": { + "my_synonym_graph_filter": { + "type": "synonym_graph", + "synonyms": [ + "sports car, race car", + "fast car, speedy vehicle", + "luxury car, premium vehicle", + "electric car, EV" + ] + } + }, + "analyzer": { + "my_synonym_graph_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_synonym_graph_filter" + ] + } + } + } + } +} + +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my-car-index/_analyze +{ + "analyzer": "my_synonym_graph_analyzer", + "text": "I just bought a sports car and it is a fast car." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "i","start_offset": 0,"end_offset": 1,"type": "","position": 0}, + {"token": "just","start_offset": 2,"end_offset": 6,"type": "","position": 1}, + {"token": "bought","start_offset": 7,"end_offset": 13,"type": "","position": 2}, + {"token": "a","start_offset": 14,"end_offset": 15,"type": "","position": 3}, + {"token": "race","start_offset": 16,"end_offset": 26,"type": "SYNONYM","position": 4}, + {"token": "sports","start_offset": 16,"end_offset": 22,"type": "","position": 4,"positionLength": 2}, + {"token": "car","start_offset": 16,"end_offset": 26,"type": "SYNONYM","position": 5,"positionLength": 2}, + {"token": "car","start_offset": 23,"end_offset": 26,"type": "","position": 6}, + {"token": "and","start_offset": 27,"end_offset": 30,"type": "","position": 7}, + {"token": "it","start_offset": 31,"end_offset": 33,"type": "","position": 8}, + {"token": "is","start_offset": 34,"end_offset": 36,"type": "","position": 9}, + {"token": "a","start_offset": 37,"end_offset": 38,"type": "","position": 10}, + {"token": "speedy","start_offset": 39,"end_offset": 47,"type": "SYNONYM","position": 11}, + {"token": "fast","start_offset": 39,"end_offset": 43,"type": "","position": 11,"positionLength": 2}, + {"token": "vehicle","start_offset": 39,"end_offset": 47,"type": "SYNONYM","position": 12,"positionLength": 2}, + {"token": "car","start_offset": 44,"end_offset": 47,"type": "","position": 13} + ] +} +``` + +## Example: WordNet format + +The following example request creates a new index named `my-wordnet-index` and configures an analyzer with a `synonym_graph` filter. The filter is configured with the [`wordnet`](https://wordnet.princeton.edu/) rule format: + +```json +PUT /my-wordnet-index +{ + "settings": { + "analysis": { + "filter": { + "my_synonym_graph_filter": { + "type": "synonym_graph", + "format": "wordnet", + "synonyms": [ + "s(100000001, 1, 'sports car', n, 1, 0).", + "s(100000001, 2, 'race car', n, 1, 0).", + "s(100000001, 3, 'fast car', n, 1, 0).", + "s(100000001, 4, 'speedy vehicle', n, 1, 0)." + ] + } + }, + "analyzer": { + "my_synonym_graph_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_synonym_graph_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my-wordnet-index/_analyze +{ + "analyzer": "my_synonym_graph_analyzer", + "text": "I just bought a sports car and it is a fast car." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "i","start_offset": 0,"end_offset": 1,"type": "","position": 0}, + {"token": "just","start_offset": 2,"end_offset": 6,"type": "","position": 1}, + {"token": "bought","start_offset": 7,"end_offset": 13,"type": "","position": 2}, + {"token": "a","start_offset": 14,"end_offset": 15,"type": "","position": 3}, + {"token": "race","start_offset": 16,"end_offset": 26,"type": "SYNONYM","position": 4}, + {"token": "fast","start_offset": 16,"end_offset": 26,"type": "SYNONYM","position": 4,"positionLength": 2}, + {"token": "speedy","start_offset": 16,"end_offset": 26,"type": "SYNONYM","position": 4,"positionLength": 3}, + {"token": "sports","start_offset": 16,"end_offset": 22,"type": "","position": 4,"positionLength": 4}, + {"token": "car","start_offset": 16,"end_offset": 26,"type": "SYNONYM","position": 5,"positionLength": 4}, + {"token": "car","start_offset": 16,"end_offset": 26,"type": "SYNONYM","position": 6,"positionLength": 3}, + {"token": "vehicle","start_offset": 16,"end_offset": 26,"type": "SYNONYM","position": 7,"positionLength": 2}, + {"token": "car","start_offset": 23,"end_offset": 26,"type": "","position": 8}, + {"token": "and","start_offset": 27,"end_offset": 30,"type": "","position": 9}, + {"token": "it","start_offset": 31,"end_offset": 33,"type": "","position": 10}, + {"token": "is","start_offset": 34,"end_offset": 36,"type": "","position": 11}, + {"token": "a","start_offset": 37,"end_offset": 38,"type": "","position": 12}, + {"token": "sports","start_offset": 39,"end_offset": 47,"type": "SYNONYM","position": 13}, + {"token": "race","start_offset": 39,"end_offset": 47,"type": "SYNONYM","position": 13,"positionLength": 2}, + {"token": "speedy","start_offset": 39,"end_offset": 47,"type": "SYNONYM","position": 13,"positionLength": 3}, + {"token": "fast","start_offset": 39,"end_offset": 43,"type": "","position": 13,"positionLength": 4}, + {"token": "car","start_offset": 39,"end_offset": 47,"type": "SYNONYM","position": 14,"positionLength": 4}, + {"token": "car","start_offset": 39,"end_offset": 47,"type": "SYNONYM","position": 15,"positionLength": 3}, + {"token": "vehicle","start_offset": 39,"end_offset": 47,"type": "SYNONYM","position": 16,"positionLength": 2}, + {"token": "car","start_offset": 44,"end_offset": 47,"type": "","position": 17} + ] +} +``` diff --git a/_analyzers/token-filters/synonym.md b/_analyzers/token-filters/synonym.md new file mode 100644 index 0000000000..a1dfff845d --- /dev/null +++ b/_analyzers/token-filters/synonym.md @@ -0,0 +1,277 @@ +--- +layout: default +title: Synonym +parent: Token filters +nav_order: 415 +--- + +# Synonym token filter + +The `synonym` token filter allows you to map multiple terms to a single term or create equivalence groups between words, improving search flexibility. + +## Parameters + +The `synonym` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`synonyms` | Either `synonyms` or `synonyms_path` must be specified | String | A list of synonym rules defined directly in the configuration. +`synonyms_path` | Either `synonyms` or `synonyms_path` must be specified | String | The file path to a file containing synonym rules (either an absolute path or a path relative to the config directory). +`lenient` | Optional | Boolean | Whether to ignore exceptions when loading the rule configurations. Default is `false`. +`format` | Optional | String | Specifies the format used to determine how OpenSearch defines and interprets synonyms. Valid values are:
- `solr`
- [`wordnet`](https://wordnet.princeton.edu/).
Default is `solr`. +`expand` | Optional | Boolean | Whether to expand equivalent synonym rules. Default is `true`.

For example:
If `synonyms` are defined as `"quick, fast"` and `expand` is set to `true`, then the synonym rules are configured as follows:
- `quick => quick`
- `quick => fast`
- `fast => quick`
- `fast => fast`

If `expand` is set to `false`, the synonym rules are configured as follows:
- `quick => quick`
- `fast => quick` + +## Example: Solr format + +The following example request creates a new index named `my-synonym-index` and configures an analyzer with a `synonym` filter. The filter is configured with the default `solr` rule format: + +```json +PUT /my-synonym-index +{ + "settings": { + "analysis": { + "filter": { + "my_synonym_filter": { + "type": "synonym", + "synonyms": [ + "car, automobile", + "quick, fast, speedy", + "laptop => computer" + ] + } + }, + "analyzer": { + "my_synonym_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_synonym_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my-synonym-index/_analyze +{ + "analyzer": "my_synonym_analyzer", + "text": "The quick dog jumps into the car with a laptop" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "the", + "start_offset": 0, + "end_offset": 3, + "type": "", + "position": 0 + }, + { + "token": "quick", + "start_offset": 4, + "end_offset": 9, + "type": "", + "position": 1 + }, + { + "token": "fast", + "start_offset": 4, + "end_offset": 9, + "type": "SYNONYM", + "position": 1 + }, + { + "token": "speedy", + "start_offset": 4, + "end_offset": 9, + "type": "SYNONYM", + "position": 1 + }, + { + "token": "dog", + "start_offset": 10, + "end_offset": 13, + "type": "", + "position": 2 + }, + { + "token": "jumps", + "start_offset": 14, + "end_offset": 19, + "type": "", + "position": 3 + }, + { + "token": "into", + "start_offset": 20, + "end_offset": 24, + "type": "", + "position": 4 + }, + { + "token": "the", + "start_offset": 25, + "end_offset": 28, + "type": "", + "position": 5 + }, + { + "token": "car", + "start_offset": 29, + "end_offset": 32, + "type": "", + "position": 6 + }, + { + "token": "automobile", + "start_offset": 29, + "end_offset": 32, + "type": "SYNONYM", + "position": 6 + }, + { + "token": "with", + "start_offset": 33, + "end_offset": 37, + "type": "", + "position": 7 + }, + { + "token": "a", + "start_offset": 38, + "end_offset": 39, + "type": "", + "position": 8 + }, + { + "token": "computer", + "start_offset": 40, + "end_offset": 46, + "type": "SYNONYM", + "position": 9 + } + ] +} +``` + +## Example: WordNet format + +The following example request creates a new index named `my-wordnet-index` and configures an analyzer with a `synonym` filter. The filter is configured with the [`wordnet`](https://wordnet.princeton.edu/) rule format: + +```json +PUT /my-wordnet-index +{ + "settings": { + "analysis": { + "filter": { + "my_wordnet_synonym_filter": { + "type": "synonym", + "format": "wordnet", + "synonyms": [ + "s(100000001,1,'fast',v,1,0).", + "s(100000001,2,'quick',v,1,0).", + "s(100000001,3,'swift',v,1,0)." + ] + } + }, + "analyzer": { + "my_wordnet_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_wordnet_synonym_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my-wordnet-index/_analyze +{ + "analyzer": "my_wordnet_analyzer", + "text": "I have a fast car" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "i", + "start_offset": 0, + "end_offset": 1, + "type": "", + "position": 0 + }, + { + "token": "have", + "start_offset": 2, + "end_offset": 6, + "type": "", + "position": 1 + }, + { + "token": "a", + "start_offset": 7, + "end_offset": 8, + "type": "", + "position": 2 + }, + { + "token": "fast", + "start_offset": 9, + "end_offset": 13, + "type": "", + "position": 3 + }, + { + "token": "quick", + "start_offset": 9, + "end_offset": 13, + "type": "SYNONYM", + "position": 3 + }, + { + "token": "swift", + "start_offset": 9, + "end_offset": 13, + "type": "SYNONYM", + "position": 3 + }, + { + "token": "car", + "start_offset": 14, + "end_offset": 17, + "type": "", + "position": 4 + } + ] +} +``` diff --git a/_analyzers/token-filters/trim.md b/_analyzers/token-filters/trim.md new file mode 100644 index 0000000000..cdfebed52f --- /dev/null +++ b/_analyzers/token-filters/trim.md @@ -0,0 +1,93 @@ +--- +layout: default +title: Trim +parent: Token filters +nav_order: 430 +--- + +# Trim token filter + +The `trim` token filter removes leading and trailing white space characters from tokens. + +Many popular tokenizers, such as `standard`, `keyword`, and `whitespace` tokenizers, automatically strip leading and trailing white space characters during tokenization. When using these tokenizers, there is no need to configure an additional `trim` token filter. +{: .note} + + +## Example + +The following example request creates a new index named `my_pattern_trim_index` and configures an analyzer with a `trim` filter and a `pattern` tokenizer, which does not remove leading and trailing white space characters: + +```json +PUT /my_pattern_trim_index +{ + "settings": { + "analysis": { + "filter": { + "my_trim_filter": { + "type": "trim" + } + }, + "tokenizer": { + "my_pattern_tokenizer": { + "type": "pattern", + "pattern": "," + } + }, + "analyzer": { + "my_pattern_trim_analyzer": { + "type": "custom", + "tokenizer": "my_pattern_tokenizer", + "filter": [ + "lowercase", + "my_trim_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my_pattern_trim_index/_analyze +{ + "analyzer": "my_pattern_trim_analyzer", + "text": " OpenSearch , is , powerful " +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "opensearch", + "start_offset": 0, + "end_offset": 12, + "type": "word", + "position": 0 + }, + { + "token": "is", + "start_offset": 13, + "end_offset": 18, + "type": "word", + "position": 1 + }, + { + "token": "powerful", + "start_offset": 19, + "end_offset": 32, + "type": "word", + "position": 2 + } + ] +} +``` diff --git a/_analyzers/token-filters/truncate.md b/_analyzers/token-filters/truncate.md new file mode 100644 index 0000000000..16d1452901 --- /dev/null +++ b/_analyzers/token-filters/truncate.md @@ -0,0 +1,107 @@ +--- +layout: default +title: Truncate +parent: Token filters +nav_order: 440 +--- + +# Truncate token filter + +The `truncate` token filter is used to shorten tokens exceeding a specified length. It trims tokens to a maximum number of characters, ensuring that tokens exceeding this limit are truncated. + +## Parameters + +The `truncate` token filter can be configured with the following parameter. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`length` | Optional | Integer | Specifies the maximum length of the generated token. Default is `10`. + +## Example + +The following example request creates a new index named `truncate_example` and configures an analyzer with a `truncate` filter: + +```json +PUT /truncate_example +{ + "settings": { + "analysis": { + "filter": { + "truncate_filter": { + "type": "truncate", + "length": 5 + } + }, + "analyzer": { + "truncate_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "truncate_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /truncate_example/_analyze +{ + "analyzer": "truncate_analyzer", + "text": "OpenSearch is powerful and scalable" +} + +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "opens", + "start_offset": 0, + "end_offset": 10, + "type": "", + "position": 0 + }, + { + "token": "is", + "start_offset": 11, + "end_offset": 13, + "type": "", + "position": 1 + }, + { + "token": "power", + "start_offset": 14, + "end_offset": 22, + "type": "", + "position": 2 + }, + { + "token": "and", + "start_offset": 23, + "end_offset": 26, + "type": "", + "position": 3 + }, + { + "token": "scala", + "start_offset": 27, + "end_offset": 35, + "type": "", + "position": 4 + } + ] +} +``` diff --git a/_analyzers/token-filters/unique.md b/_analyzers/token-filters/unique.md new file mode 100644 index 0000000000..c4dfcbab16 --- /dev/null +++ b/_analyzers/token-filters/unique.md @@ -0,0 +1,106 @@ +--- +layout: default +title: Unique +parent: Token filters +nav_order: 450 +--- + +# Unique token filter + +The `unique` token filter ensures that only unique tokens are kept during the analysis process, removing duplicate tokens that appear within a single field or text block. + +## Parameters + +The `unique` token filter can be configured with the following parameter. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`only_on_same_position` | Optional | Boolean | If `true`, the token filter acts as a `remove_duplicates` token filter and only removes tokens that are in the same position. Default is `false`. + +## Example + +The following example request creates a new index named `unique_example` and configures an analyzer with a `unique` filter: + +```json +PUT /unique_example +{ + "settings": { + "analysis": { + "filter": { + "unique_filter": { + "type": "unique", + "only_on_same_position": false + } + }, + "analyzer": { + "unique_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "unique_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /unique_example/_analyze +{ + "analyzer": "unique_analyzer", + "text": "OpenSearch OpenSearch is powerful powerful and scalable" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "opensearch", + "start_offset": 0, + "end_offset": 10, + "type": "", + "position": 0 + }, + { + "token": "is", + "start_offset": 22, + "end_offset": 24, + "type": "", + "position": 1 + }, + { + "token": "powerful", + "start_offset": 25, + "end_offset": 33, + "type": "", + "position": 2 + }, + { + "token": "and", + "start_offset": 43, + "end_offset": 46, + "type": "", + "position": 3 + }, + { + "token": "scalable", + "start_offset": 47, + "end_offset": 55, + "type": "", + "position": 4 + } + ] +} +``` diff --git a/_analyzers/token-filters/uppercase.md b/_analyzers/token-filters/uppercase.md new file mode 100644 index 0000000000..5026892400 --- /dev/null +++ b/_analyzers/token-filters/uppercase.md @@ -0,0 +1,83 @@ +--- +layout: default +title: Uppercase +parent: Token filters +nav_order: 460 +--- + +# Uppercase token filter + +The `uppercase` token filter is used to convert all tokens (words) to uppercase during analysis. + +## Example + +The following example request creates a new index named `uppercase_example` and configures an analyzer with an `uppercase` filter: + +```json +PUT /uppercase_example +{ + "settings": { + "analysis": { + "filter": { + "uppercase_filter": { + "type": "uppercase" + } + }, + "analyzer": { + "uppercase_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "uppercase_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /uppercase_example/_analyze +{ + "analyzer": "uppercase_analyzer", + "text": "OpenSearch is powerful" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "OPENSEARCH", + "start_offset": 0, + "end_offset": 10, + "type": "", + "position": 0 + }, + { + "token": "IS", + "start_offset": 11, + "end_offset": 13, + "type": "", + "position": 1 + }, + { + "token": "POWERFUL", + "start_offset": 14, + "end_offset": 22, + "type": "", + "position": 2 + } + ] +} +``` diff --git a/_analyzers/token-filters/word-delimiter-graph.md b/_analyzers/token-filters/word-delimiter-graph.md new file mode 100644 index 0000000000..ac734bebeb --- /dev/null +++ b/_analyzers/token-filters/word-delimiter-graph.md @@ -0,0 +1,164 @@ +--- +layout: default +title: Word delimiter graph +parent: Token filters +nav_order: 480 +--- + +# Word delimiter graph token filter + +The `word_delimiter_graph` token filter is used to split tokens at predefined characters and also offers optional token normalization based on customizable rules. + +The `word_delimiter_graph` filter is used to remove punctuation from complex identifiers like part numbers or product IDs. In such cases, it is best used with the `keyword` tokenizer. For hyphenated words, use the `synonym_graph` token filter instead of the `word_delimiter_graph` filter because users frequently search for these terms both with and without hyphens. +{: .note} + +By default, the filter applies the following rules. + +| Description | Input | Output | +|:---|:---|:---| +| Treats non-alphanumeric characters as delimiters. | `ultra-fast` | `ultra`, `fast` | +| Removes delimiters at the beginning or end of tokens. | `Z99++'Decoder'`| `Z99`, `Decoder` | +| Splits tokens when there is a transition between uppercase and lowercase letters. | `OpenSearch` | `Open`, `Search` | +| Splits tokens when there is a transition between letters and numbers. | `T1000` | `T`, `1000` | +| Removes the possessive ('s) from the end of tokens. | `John's` | `John` | + +It's important **not** to use tokenizers that strip punctuation, like the `standard` tokenizer, with this filter. Doing so may prevent proper token splitting and interfere with options like `catenate_all` or `preserve_original`. We recommend using this filter with a `keyword` or `whitespace` tokenizer. +{: .important} + +## Parameters + +You can configure the `word_delimiter_graph` token filter using the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`adjust_offsets` | Optional | Boolean | Determines whether the token offsets should be recalculated for split or concatenated tokens. When `true`, the filter adjusts the token offsets to accurately represent the token's position within the token stream. This adjustment ensures that the token's location in the text aligns with its modified form after processing, which is particularly useful for applications like highlighting or phrase queries. When `false`, the offsets remain unchanged, which may result in misalignment when the processed tokens are mapped back to their positions in the original text. If your analyzer uses filters like `trim` that change the token lengths without changing their offsets, we recommend setting this parameter to `false`. Default is `true`. +`catenate_all` | Optional | Boolean | Produces concatenated tokens from a sequence of alphanumeric parts. For example, `"quick-fast-200"` becomes `[ quickfast200, quick, fast, 200 ]`. Default is `false`. +`catenate_numbers` | Optional | Boolean | Concatenates numerical sequences. For example, `"10-20-30"` becomes `[ 102030, 10, 20, 30 ]`. Default is `false`. +`catenate_words` | Optional | Boolean | Concatenates alphabetic words. For example, `"high-speed-level"` becomes `[ highspeedlevel, high, speed, level ]`. Default is `false`. +`generate_number_parts` | Optional | Boolean | If `true`, numeric tokens (tokens consisting of numbers only) are included in the output. Default is `true`. +`generate_word_parts` | Optional | Boolean | If `true`, alphabetical tokens (tokens consisting of alphabetic characters only) are included in the output. Default is `true`. +`ignore_keywords` | Optional | Boolean | Whether to process tokens marked as keywords. Default is `false`. +`preserve_original` | Optional | Boolean | Keeps the original token (which may include non-alphanumeric delimiters) alongside the generated tokens in the output. For example, `"auto-drive-300"` becomes `[ auto-drive-300, auto, drive, 300 ]`. If `true`, the filter generates multi-position tokens not supported by indexing, so do not use this filter in an index analyzer or use the `flatten_graph` filter after this filter. Default is `false`. +`protected_words` | Optional | Array of strings | Specifies tokens that should not be split. +`protected_words_path` | Optional | String | Specifies a path (absolute or relative to the config directory) to a file containing tokens that should not be separated by new lines. +`split_on_case_change` | Optional | Boolean | Splits tokens where consecutive letters have different cases (one is lowercase and the other is uppercase). For example, `"OpenSearch"` becomes `[ Open, Search ]`. Default is `true`. +`split_on_numerics` | Optional | Boolean | Splits tokens where there are consecutive letters and numbers. For example `"v8engine"` will become `[ v, 8, engine ]`. Default is `true`. +`stem_english_possessive` | Optional | Boolean | Removes English possessive endings, such as `'s`. Default is `true`. +`type_table` | Optional | Array of strings | A custom map that specifies how to treat characters and whether to treat them as delimiters, which avoids unwanted splitting. For example, to treat a hyphen (`-`) as an alphanumeric character, specify `["- => ALPHA"]` so that words are not split at hyphens. Valid types are:
- `ALPHA`: alphabetical
- `ALPHANUM`: alphanumeric
- `DIGIT`: numeric
- `LOWER`: lowercase alphabetical
- `SUBWORD_DELIM`: non-alphanumeric delimiter
- `UPPER`: uppercase alphabetical +`type_table_path` | Optional | String | Specifies a path (absolute or relative to the config directory) to a file containing a custom character map. The map specifies how to treat characters and whether to treat them as delimiters, which avoids unwanted splitting. For valid types, see `type_table`. + +## Example + +The following example request creates a new index named `my-custom-index` and configures an analyzer with a `word_delimiter_graph` filter: + +```json +PUT /my-custom-index +{ + "settings": { + "analysis": { + "analyzer": { + "custom_analyzer": { + "tokenizer": "keyword", + "filter": [ "custom_word_delimiter_filter" ] + } + }, + "filter": { + "custom_word_delimiter_filter": { + "type": "word_delimiter_graph", + "split_on_case_change": true, + "split_on_numerics": true, + "stem_english_possessive": true + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my-custom-index/_analyze +{ + "analyzer": "custom_analyzer", + "text": "FastCar's Model2023" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "Fast", + "start_offset": 0, + "end_offset": 4, + "type": "word", + "position": 0 + }, + { + "token": "Car", + "start_offset": 4, + "end_offset": 7, + "type": "word", + "position": 1 + }, + { + "token": "Model", + "start_offset": 10, + "end_offset": 15, + "type": "word", + "position": 2 + }, + { + "token": "2023", + "start_offset": 15, + "end_offset": 19, + "type": "word", + "position": 3 + } + ] +} +``` + + +## Differences between the word_delimiter_graph and word_delimiter filters + + +Both the `word_delimiter_graph` and `word_delimiter` token filters generate tokens spanning multiple positions when any of the following parameters are set to `true`: + +- `catenate_all` +- `catenate_numbers` +- `catenate_words` +- `preserve_original` + +To illustrate the differences between these filters, consider the input text `Pro-XT500`. + + +### word_delimiter_graph + + +The `word_delimiter_graph` filter assigns a `positionLength` attribute to multi-position tokens, indicating how many positions a token spans. This ensures that the filter always generates valid token graphs, making it suitable for use in advanced token graph scenarios. Although token graphs with multi-position tokens are not supported for indexing, they can still be useful in search scenarios. For example, queries like `match_phrase` can use these graphs to generate multiple subqueries from a single input string. For the example input text, the `word_delimiter_graph` filter generates the following tokens: + +- `Pro` (position 1) +- `XT500` (position 2) +- `ProXT500` (position 1, `positionLength`: 2) + +The `positionLength` attribute the production of a valid graph to be used in advanced queries. + + +### word_delimiter + + +In contrast, the `word_delimiter` filter does not assign a `positionLength` attribute to multi-position tokens, leading to invalid graphs when these tokens are present. For the example input text, the `word_delimiter` filter generates the following tokens: + +- `Pro` (position 1) +- `XT500` (position 2) +- `ProXT500` (position 1, no `positionLength`) + +The lack of a `positionLength` attribute results in a token graph that is invalid for token streams containing multi-position tokens. \ No newline at end of file diff --git a/_analyzers/token-filters/word-delimiter.md b/_analyzers/token-filters/word-delimiter.md new file mode 100644 index 0000000000..d820fae2a0 --- /dev/null +++ b/_analyzers/token-filters/word-delimiter.md @@ -0,0 +1,128 @@ +--- +layout: default +title: Word delimiter +parent: Token filters +nav_order: 470 +--- + +# Word delimiter token filter + +The `word_delimiter` token filter is used to split tokens at predefined characters and also offers optional token normalization based on customizable rules. + +We recommend using the `word_delimiter_graph` filter instead of the `word_delimiter` filter whenever possible because the `word_delimiter` filter sometimes produces invalid token graphs. For more information about the differences between the two filters, see [Differences between the `word_delimiter_graph` and `word_delimiter` filters]({{site.url}}{{site.baseurl}}/analyzers/token-filters/word-delimiter-graph/#differences-between-the-word_delimiter_graph-and-word_delimiter-filters). +{: .important} + +The `word_delimiter` filter is used to remove punctuation from complex identifiers like part numbers or product IDs. In such cases, it is best used with the `keyword` tokenizer. For hyphenated words, use the `synonym_graph` token filter instead of the `word_delimiter` filter because users frequently search for these terms both with and without hyphens. +{: .note} + +By default, the filter applies the following rules. + +| Description | Input | Output | +|:---|:---|:---| +| Treats non-alphanumeric characters as delimiters. | `ultra-fast` | `ultra`, `fast` | +| Removes delimiters at the beginning or end of tokens. | `Z99++'Decoder'`| `Z99`, `Decoder` | +| Splits tokens when there is a transition between uppercase and lowercase letters. | `OpenSearch` | `Open`, `Search` | +| Splits tokens when there is a transition between letters and numbers. | `T1000` | `T`, `1000` | +| Removes the possessive ('s) from the end of tokens. | `John's` | `John` | + +It's important **not** to use tokenizers that strip punctuation, like the `standard` tokenizer, with this filter. Doing so may prevent proper token splitting and interfere with options like `catenate_all` or `preserve_original`. We recommend using this filter with a `keyword` or `whitespace` tokenizer. +{: .important} + +## Parameters + +You can configure the `word_delimiter` token filter using the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`catenate_all` | Optional | Boolean | Produces concatenated tokens from a sequence of alphanumeric parts. For example, `"quick-fast-200"` becomes `[ quickfast200, quick, fast, 200 ]`. Default is `false`. +`catenate_numbers` | Optional | Boolean | Concatenates numerical sequences. For example, `"10-20-30"` becomes `[ 102030, 10, 20, 30 ]`. Default is `false`. +`catenate_words` | Optional | Boolean | Concatenates alphabetic words. For example, `"high-speed-level"` becomes `[ highspeedlevel, high, speed, level ]`. Default is `false`. +`generate_number_parts` | Optional | Boolean | If `true`, numeric tokens (tokens consisting of numbers only) are included in the output. Default is `true`. +`generate_word_parts` | Optional | Boolean | If `true`, alphabetical tokens (tokens consisting of alphabetic characters only) are included in the output. Default is `true`. +`preserve_original` | Optional | Boolean | Keeps the original token (which may include non-alphanumeric delimiters) alongside the generated tokens in the output. For example, `"auto-drive-300"` becomes `[ auto-drive-300, auto, drive, 300 ]`. If `true`, the filter generates multi-position tokens not supported by indexing, so do not use this filter in an index analyzer or use the `flatten_graph` filter after this filter. Default is `false`. +`protected_words` | Optional | Array of strings | Specifies tokens that should not be split. +`protected_words_path` | Optional | String | Specifies a path (absolute or relative to the config directory) to a file containing tokens that should not be separated by new lines. +`split_on_case_change` | Optional | Boolean | Splits tokens where consecutive letters have different cases (one is lowercase and the other is uppercase). For example, `"OpenSearch"` becomes `[ Open, Search ]`. Default is `true`. +`split_on_numerics` | Optional | Boolean | Splits tokens where there are consecutive letters and numbers. For example `"v8engine"` will become `[ v, 8, engine ]`. Default is `true`. +`stem_english_possessive` | Optional | Boolean | Removes English possessive endings, such as `'s`. Default is `true`. +`type_table` | Optional | Array of strings | A custom map that specifies how to treat characters and whether to treat them as delimiters, which avoids unwanted splitting. For example, to treat a hyphen (`-`) as an alphanumeric character, specify `["- => ALPHA"]` so that words are not split at hyphens. Valid types are:
- `ALPHA`: alphabetical
- `ALPHANUM`: alphanumeric
- `DIGIT`: numeric
- `LOWER`: lowercase alphabetical
- `SUBWORD_DELIM`: non-alphanumeric delimiter
- `UPPER`: uppercase alphabetical +`type_table_path` | Optional | String | Specifies a path (absolute or relative to the config directory) to a file containing a custom character map. The map specifies how to treat characters and whether to treat them as delimiters, which avoids unwanted splitting. For valid types, see `type_table`. + +## Example + +The following example request creates a new index named `my-custom-index` and configures an analyzer with a `word_delimiter` filter: + +```json +PUT /my-custom-index +{ + "settings": { + "analysis": { + "analyzer": { + "custom_analyzer": { + "tokenizer": "keyword", + "filter": [ "custom_word_delimiter_filter" ] + } + }, + "filter": { + "custom_word_delimiter_filter": { + "type": "word_delimiter", + "split_on_case_change": true, + "split_on_numerics": true, + "stem_english_possessive": true + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my-custom-index/_analyze +{ + "analyzer": "custom_analyzer", + "text": "FastCar's Model2023" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "Fast", + "start_offset": 0, + "end_offset": 4, + "type": "word", + "position": 0 + }, + { + "token": "Car", + "start_offset": 4, + "end_offset": 7, + "type": "word", + "position": 1 + }, + { + "token": "Model", + "start_offset": 10, + "end_offset": 15, + "type": "word", + "position": 2 + }, + { + "token": "2023", + "start_offset": 15, + "end_offset": 19, + "type": "word", + "position": 3 + } + ] +} +``` diff --git a/_analyzers/tokenizers/index.md b/_analyzers/tokenizers/index.md index e5ac796c12..1f9e49c855 100644 --- a/_analyzers/tokenizers/index.md +++ b/_analyzers/tokenizers/index.md @@ -2,7 +2,7 @@ layout: default title: Tokenizers nav_order: 60 -has_children: false +has_children: true has_toc: false redirect_from: - /analyzers/tokenizers/index/ diff --git a/_analyzers/tokenizers/lowercase.md b/_analyzers/tokenizers/lowercase.md new file mode 100644 index 0000000000..5542ecbf50 --- /dev/null +++ b/_analyzers/tokenizers/lowercase.md @@ -0,0 +1,93 @@ +--- +layout: default +title: Lowercase +parent: Tokenizers +nav_order: 70 +--- + +# Lowercase tokenizer + +The `lowercase` tokenizer breaks text into terms at white space and then lowercases all the terms. Functionally, this is identical to configuring a `letter` tokenizer with a `lowercase` token filter. However, using a `lowercase` tokenizer is more efficient because the tokenizer actions are performed in a single step. + +## Example usage + +The following example request creates a new index named `my-lowercase-index` and configures an analyzer with a `lowercase` tokenizer: + +```json +PUT /my-lowercase-index +{ + "settings": { + "analysis": { + "tokenizer": { + "my_lowercase_tokenizer": { + "type": "lowercase" + } + }, + "analyzer": { + "my_lowercase_analyzer": { + "type": "custom", + "tokenizer": "my_lowercase_tokenizer" + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my-lowercase-index/_analyze +{ + "analyzer": "my_lowercase_analyzer", + "text": "This is a Test. OpenSearch 123!" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "this", + "start_offset": 0, + "end_offset": 4, + "type": "word", + "position": 0 + }, + { + "token": "is", + "start_offset": 5, + "end_offset": 7, + "type": "word", + "position": 1 + }, + { + "token": "a", + "start_offset": 8, + "end_offset": 9, + "type": "word", + "position": 2 + }, + { + "token": "test", + "start_offset": 10, + "end_offset": 14, + "type": "word", + "position": 3 + }, + { + "token": "opensearch", + "start_offset": 16, + "end_offset": 26, + "type": "word", + "position": 4 + } + ] +} +``` diff --git a/_analyzers/tokenizers/ngram.md b/_analyzers/tokenizers/ngram.md new file mode 100644 index 0000000000..08ac456267 --- /dev/null +++ b/_analyzers/tokenizers/ngram.md @@ -0,0 +1,111 @@ +--- +layout: default +title: N-gram +parent: Tokenizers +nav_order: 80 +--- + +# N-gram tokenizer + +The `ngram` tokenizer splits text into overlapping n-grams (sequences of characters) of a specified length. This tokenizer is particularly useful when you want to perform partial word matching or autocomplete search functionality because it generates substrings (character n-grams) of the original input text. + +## Example usage + +The following example request creates a new index named `my_index` and configures an analyzer with an `ngram` tokenizer: + +```json +PUT /my_index +{ + "settings": { + "analysis": { + "tokenizer": { + "my_ngram_tokenizer": { + "type": "ngram", + "min_gram": 3, + "max_gram": 4, + "token_chars": ["letter", "digit"] + } + }, + "analyzer": { + "my_ngram_analyzer": { + "type": "custom", + "tokenizer": "my_ngram_tokenizer" + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_index/_analyze +{ + "analyzer": "my_ngram_analyzer", + "text": "OpenSearch" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "Sea","start_offset": 0,"end_offset": 3,"type": "word","position": 0}, + {"token": "Sear","start_offset": 0,"end_offset": 4,"type": "word","position": 1}, + {"token": "ear","start_offset": 1,"end_offset": 4,"type": "word","position": 2}, + {"token": "earc","start_offset": 1,"end_offset": 5,"type": "word","position": 3}, + {"token": "arc","start_offset": 2,"end_offset": 5,"type": "word","position": 4}, + {"token": "arch","start_offset": 2,"end_offset": 6,"type": "word","position": 5}, + {"token": "rch","start_offset": 3,"end_offset": 6,"type": "word","position": 6} + ] +} +``` + +## Parameters + +The `ngram` tokenizer can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`min_gram` | Optional | Integer | The minimum length of the n-grams. Default is `1`. +`max_gram` | Optional | Integer | The maximum length of the n-grams. Default is `2`. +`token_chars` | Optional | List of strings | The character classes to be included in tokenization. Valid values are:
- `letter`
- `digit`
- `whitespace`
- `punctuation`
- `symbol`
- `custom` (You must also specify the `custom_token_chars` parameter)
Default is an empty list (`[]`), which retains all the characters. +`custom_token_chars` | Optional | String | Custom characters to be included in the tokens. + +### Maximum difference between `min_gram` and `max_gram` + +The maximum difference between `min_gram` and `max_gram` is configured using the index-level `index.max_ngram_diff` setting and defaults to `1`. + +The following example request creates an index with a custom `index.max_ngram_diff` setting: + +```json +PUT /my-index +{ + "settings": { + "index.max_ngram_diff": 2, + "analysis": { + "tokenizer": { + "my_ngram_tokenizer": { + "type": "ngram", + "min_gram": 3, + "max_gram": 5, + "token_chars": ["letter", "digit"] + } + }, + "analyzer": { + "my_ngram_analyzer": { + "type": "custom", + "tokenizer": "my_ngram_tokenizer" + } + } + } + } +} +``` +{% include copy-curl.html %} diff --git a/_analyzers/tokenizers/path-hierarchy.md b/_analyzers/tokenizers/path-hierarchy.md new file mode 100644 index 0000000000..a6609f30cd --- /dev/null +++ b/_analyzers/tokenizers/path-hierarchy.md @@ -0,0 +1,182 @@ +--- +layout: default +title: Path hierarchy +parent: Tokenizers +nav_order: 90 +--- + +# Path hierarchy tokenizer + +The `path_hierarchy` tokenizer tokenizes file-system-like paths (or similar hierarchical structures) by breaking them down into tokens at each hierarchy level. This tokenizer is particularly useful when working with hierarchical data such as file paths, URLs, or any other delimited paths. + +## Example usage + +The following example request creates a new index named `my_index` and configures an analyzer with a `path_hierarchy` tokenizer: + +```json +PUT /my_index +{ + "settings": { + "analysis": { + "tokenizer": { + "my_path_tokenizer": { + "type": "path_hierarchy" + } + }, + "analyzer": { + "my_path_analyzer": { + "type": "custom", + "tokenizer": "my_path_tokenizer" + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_index/_analyze +{ + "analyzer": "my_path_analyzer", + "text": "/users/john/documents/report.txt" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "/users", + "start_offset": 0, + "end_offset": 6, + "type": "word", + "position": 0 + }, + { + "token": "/users/john", + "start_offset": 0, + "end_offset": 11, + "type": "word", + "position": 0 + }, + { + "token": "/users/john/documents", + "start_offset": 0, + "end_offset": 21, + "type": "word", + "position": 0 + }, + { + "token": "/users/john/documents/report.txt", + "start_offset": 0, + "end_offset": 32, + "type": "word", + "position": 0 + } + ] +} +``` + +## Parameters + +The `path_hierarchy` tokenizer can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`delimiter` | Optional | String | Specifies the character used to separate path components. Default is `/`. +`replacement` | Optional | String | Configures the character used to replace the delimiter in the tokens. Default is `/`. +`buffer_size` | Optional | Integer | Specifies the buffer size. Default is `1024`. +`reverse` | Optional | Boolean | If `true`, generates tokens in reverse order. Default is `false`. +`skip` | Optional | Integer | Specifies the number of initial tokens (levels) to skip when tokenizing. Default is `0`. + +## Example using delimiter and replacement parameters + +The following example request configures custom `delimiter` and `replacement` parameters: + +```json +PUT /my_index +{ + "settings": { + "analysis": { + "tokenizer": { + "my_path_tokenizer": { + "type": "path_hierarchy", + "delimiter": "\\", + "replacement": "\\" + } + }, + "analyzer": { + "my_path_analyzer": { + "type": "custom", + "tokenizer": "my_path_tokenizer" + } + } + } + } +} +``` +{% include copy-curl.html %} + + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_index/_analyze +{ + "analyzer": "my_path_analyzer", + "text": "C:\\users\\john\\documents\\report.txt" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "C:", + "start_offset": 0, + "end_offset": 2, + "type": "word", + "position": 0 + }, + { + "token": """C:\users""", + "start_offset": 0, + "end_offset": 8, + "type": "word", + "position": 0 + }, + { + "token": """C:\users\john""", + "start_offset": 0, + "end_offset": 13, + "type": "word", + "position": 0 + }, + { + "token": """C:\users\john\documents""", + "start_offset": 0, + "end_offset": 23, + "type": "word", + "position": 0 + }, + { + "token": """C:\users\john\documents\report.txt""", + "start_offset": 0, + "end_offset": 34, + "type": "word", + "position": 0 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/tokenizers/pattern.md b/_analyzers/tokenizers/pattern.md new file mode 100644 index 0000000000..f422d8c805 --- /dev/null +++ b/_analyzers/tokenizers/pattern.md @@ -0,0 +1,167 @@ +--- +layout: default +title: Pattern +parent: Tokenizers +nav_order: 100 +--- + +# Pattern tokenizer + +The `pattern` tokenizer is a highly flexible tokenizer that allows you to split text into tokens based on a custom Java regular expression. Unlike the `simple_pattern` and `simple_pattern_split` tokenizers, which use Lucene regular expressions, the `pattern` tokenizer can handle more complex and detailed regex patterns, offering greater control over how the text is tokenized. + +## Example usage + +The following example request creates a new index named `my_index` and configures an analyzer with a `pattern` tokenizer. The tokenizer splits text at `-`, `_`, or `.` characters: + +```json +PUT /my_index +{ + "settings": { + "analysis": { + "tokenizer": { + "my_pattern_tokenizer": { + "type": "pattern", + "pattern": "[-_.]" + } + }, + "analyzer": { + "my_pattern_analyzer": { + "type": "custom", + "tokenizer": "my_pattern_tokenizer" + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "my_pattern_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_index/_analyze +{ + "analyzer": "my_pattern_analyzer", + "text": "OpenSearch-2024_v1.2" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "OpenSearch", + "start_offset": 0, + "end_offset": 10, + "type": "word", + "position": 0 + }, + { + "token": "2024", + "start_offset": 11, + "end_offset": 15, + "type": "word", + "position": 1 + }, + { + "token": "v1", + "start_offset": 16, + "end_offset": 18, + "type": "word", + "position": 2 + }, + { + "token": "2", + "start_offset": 19, + "end_offset": 20, + "type": "word", + "position": 3 + } + ] +} +``` + +## Parameters + +The `pattern` tokenizer can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`pattern` | Optional | String | The pattern used to split text into tokens, specified using a [Java regular expression](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html). Default is `\W+`. +`flags` | Optional | String | Configures pipe-separated [flags](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html#field.summary) to apply to the regular expression, for example, `"CASE_INSENSITIVE|MULTILINE|DOTALL"`. +`group` | Optional | Integer | Specifies the capture group to be used as a token. Default is `-1` (split at a match). + +## Example using a group parameter + +The following example request configures a `group` parameter that captures only the second group: + +```json +PUT /my_index_group2 +{ + "settings": { + "analysis": { + "tokenizer": { + "my_pattern_tokenizer": { + "type": "pattern", + "pattern": "([a-zA-Z]+)(\\d+)", + "group": 2 + } + }, + "analyzer": { + "my_pattern_analyzer": { + "type": "custom", + "tokenizer": "my_pattern_tokenizer" + } + } + } + } +} +``` +{% include copy-curl.html %} + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_index_group2/_analyze +{ + "analyzer": "my_pattern_analyzer", + "text": "abc123def456ghi" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "123", + "start_offset": 3, + "end_offset": 6, + "type": "word", + "position": 0 + }, + { + "token": "456", + "start_offset": 9, + "end_offset": 12, + "type": "word", + "position": 1 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/tokenizers/simple-pattern-split.md b/_analyzers/tokenizers/simple-pattern-split.md new file mode 100644 index 0000000000..1fd130082e --- /dev/null +++ b/_analyzers/tokenizers/simple-pattern-split.md @@ -0,0 +1,105 @@ +--- +layout: default +title: Simple pattern split +parent: Tokenizers +nav_order: 120 +--- + +# Simple pattern split tokenizer + +The `simple_pattern_split` tokenizer uses a regular expression to split text into tokens. The regular expression defines the pattern used to determine where to split the text. Any matching pattern in the text is used as a delimiter, and the text between delimiters becomes a token. Use this tokenizer when you want to define delimiters and tokenize the rest of the text based on a pattern. + +The tokenizer uses the matched parts of the input text (based on the regular expression) only as delimiters or boundaries to split the text into terms. The matched portions are not included in the resulting terms. For example, if the tokenizer is configured to split text at dot characters (`.`) and the input text is `one.two.three`, then the generated terms are `one`, `two`, and `three`. The dot characters themselves are not included in the resulting terms. + +## Example usage + +The following example request creates a new index named `my_index` and configures an analyzer with a `simple_pattern_split` tokenizer. The tokenizer is configured to split text at hyphens: + +```json +PUT /my_index +{ + "settings": { + "analysis": { + "tokenizer": { + "my_pattern_split_tokenizer": { + "type": "simple_pattern_split", + "pattern": "-" + } + }, + "analyzer": { + "my_pattern_split_analyzer": { + "type": "custom", + "tokenizer": "my_pattern_split_tokenizer" + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "my_pattern_split_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_index/_analyze +{ + "analyzer": "my_pattern_split_analyzer", + "text": "OpenSearch-2024-10-09" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "OpenSearch", + "start_offset": 0, + "end_offset": 10, + "type": "word", + "position": 0 + }, + { + "token": "2024", + "start_offset": 11, + "end_offset": 15, + "type": "word", + "position": 1 + }, + { + "token": "10", + "start_offset": 16, + "end_offset": 18, + "type": "word", + "position": 2 + }, + { + "token": "09", + "start_offset": 19, + "end_offset": 21, + "type": "word", + "position": 3 + } + ] +} +``` + +## Parameters + +The `simple_pattern_split` tokenizer can be configured with the following parameter. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`pattern` | Optional | String | The pattern used to split text into tokens, specified using a [Lucene regular expression](https://lucene.apache.org/core/9_10_0/core/org/apache/lucene/util/automaton/RegExp.html). Default is an empty string, which returns the input text as one token. \ No newline at end of file diff --git a/_analyzers/tokenizers/simple-pattern.md b/_analyzers/tokenizers/simple-pattern.md new file mode 100644 index 0000000000..eacddd6992 --- /dev/null +++ b/_analyzers/tokenizers/simple-pattern.md @@ -0,0 +1,89 @@ +--- +layout: default +title: Simple pattern +parent: Tokenizers +nav_order: 110 +--- + +# Simple pattern tokenizer + +The `simple_pattern` tokenizer identifies matching sequences in text based on a regular expression and uses those sequences as tokens. It extracts terms that match the regular expression. Use this tokenizer when you want to directly extract specific patterns as terms. + +## Example usage + +The following example request creates a new index named `my_index` and configures an analyzer with a `simple_pattern` tokenizer. The tokenizer extracts numeric terms from text: + +```json +PUT /my_index +{ + "settings": { + "analysis": { + "tokenizer": { + "my_pattern_tokenizer": { + "type": "simple_pattern", + "pattern": "\\d+" + } + }, + "analyzer": { + "my_pattern_analyzer": { + "type": "custom", + "tokenizer": "my_pattern_tokenizer" + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_index/_analyze +{ + "analyzer": "my_pattern_analyzer", + "text": "OpenSearch-2024-10-09" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "2024", + "start_offset": 11, + "end_offset": 15, + "type": "word", + "position": 0 + }, + { + "token": "10", + "start_offset": 16, + "end_offset": 18, + "type": "word", + "position": 1 + }, + { + "token": "09", + "start_offset": 19, + "end_offset": 21, + "type": "word", + "position": 2 + } + ] +} +``` + +## Parameters + +The `simple_pattern` tokenizer can be configured with the following parameter. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`pattern` | Optional | String | The pattern used to split text into tokens, specified using a [Lucene regular expression](https://lucene.apache.org/core/9_10_0/core/org/apache/lucene/util/automaton/RegExp.html). Default is an empty string, which returns the input text as one token. + diff --git a/_analyzers/tokenizers/standard.md b/_analyzers/tokenizers/standard.md new file mode 100644 index 0000000000..c10f25802b --- /dev/null +++ b/_analyzers/tokenizers/standard.md @@ -0,0 +1,111 @@ +--- +layout: default +title: Standard +parent: Tokenizers +nav_order: 130 +--- + +# Standard tokenizer + +The `standard` tokenizer is the default tokenizer in OpenSearch. It tokenizes text based on word boundaries using a grammar-based approach that recognizes letters, digits, and other characters like punctuation. It is highly versatile and suitable for many languages because it uses Unicode text segmentation rules ([UAX#29](https://unicode.org/reports/tr29/)) to break text into tokens. + +## Example usage + +The following example request creates a new index named `my_index` and configures an analyzer with a `standard` tokenizer: + +```json +PUT /my_index +{ + "settings": { + "analysis": { + "analyzer": { + "my_standard_analyzer": { + "type": "standard" + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "my_standard_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_index/_analyze +{ + "analyzer": "my_standard_analyzer", + "text": "OpenSearch is powerful, fast, and scalable." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "opensearch", + "start_offset": 0, + "end_offset": 10, + "type": "", + "position": 0 + }, + { + "token": "is", + "start_offset": 11, + "end_offset": 13, + "type": "", + "position": 1 + }, + { + "token": "powerful", + "start_offset": 14, + "end_offset": 22, + "type": "", + "position": 2 + }, + { + "token": "fast", + "start_offset": 24, + "end_offset": 28, + "type": "", + "position": 3 + }, + { + "token": "and", + "start_offset": 30, + "end_offset": 33, + "type": "", + "position": 4 + }, + { + "token": "scalable", + "start_offset": 34, + "end_offset": 42, + "type": "", + "position": 5 + } + ] +} +``` + +## Parameters + +The `standard` tokenizer can be configured with the following parameter. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`max_token_length` | Optional | Integer | Sets the maximum length of the produced token. If this length is exceeded, the token is split into multiple tokens at the length configured in `max_token_length`. Default is `255`. + diff --git a/_analyzers/tokenizers/thai.md b/_analyzers/tokenizers/thai.md new file mode 100644 index 0000000000..4afb14a9eb --- /dev/null +++ b/_analyzers/tokenizers/thai.md @@ -0,0 +1,108 @@ +--- +layout: default +title: Thai +parent: Tokenizers +nav_order: 140 +--- + +# Thai tokenizer + +The `thai` tokenizer tokenizes Thai language text. Because words in Thai language are not separated by spaces, the tokenizer must identify word boundaries based on language-specific rules. + +## Example usage + +The following example request creates a new index named `thai_index` and configures an analyzer with a `thai` tokenizer: + +```json +PUT /thai_index +{ + "settings": { + "analysis": { + "tokenizer": { + "thai_tokenizer": { + "type": "thai" + } + }, + "analyzer": { + "thai_analyzer": { + "type": "custom", + "tokenizer": "thai_tokenizer" + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "thai_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /thai_index/_analyze +{ + "analyzer": "thai_analyzer", + "text": "ฉันชอบไปเที่ยวที่เชียงใหม่" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "ฉัน", + "start_offset": 0, + "end_offset": 3, + "type": "word", + "position": 0 + }, + { + "token": "ชอบ", + "start_offset": 3, + "end_offset": 6, + "type": "word", + "position": 1 + }, + { + "token": "ไป", + "start_offset": 6, + "end_offset": 8, + "type": "word", + "position": 2 + }, + { + "token": "เที่ยว", + "start_offset": 8, + "end_offset": 14, + "type": "word", + "position": 3 + }, + { + "token": "ที่", + "start_offset": 14, + "end_offset": 17, + "type": "word", + "position": 4 + }, + { + "token": "เชียงใหม่", + "start_offset": 17, + "end_offset": 26, + "type": "word", + "position": 5 + } + ] +} +``` diff --git a/_analyzers/tokenizers/uax-url-email.md b/_analyzers/tokenizers/uax-url-email.md new file mode 100644 index 0000000000..34336a4f55 --- /dev/null +++ b/_analyzers/tokenizers/uax-url-email.md @@ -0,0 +1,84 @@ +--- +layout: default +title: UAX URL email +parent: Tokenizers +nav_order: 150 +--- + +# UAX URL email tokenizer + +In addition to regular text, the `uax_url_email` tokenizer is designed to handle URLs, email addresses, and domain names. It is based on the Unicode Text Segmentation algorithm ([UAX #29](https://www.unicode.org/reports/tr29/)), which allows it to correctly tokenize complex text, including URLs and email addresses. + +## Example usage + +The following example request creates a new index named `my_index` and configures an analyzer with a `uax_url_email` tokenizer: + +```json +PUT /my_index +{ + "settings": { + "analysis": { + "tokenizer": { + "uax_url_email_tokenizer": { + "type": "uax_url_email" + } + }, + "analyzer": { + "my_uax_analyzer": { + "type": "custom", + "tokenizer": "uax_url_email_tokenizer" + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "my_uax_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_index/_analyze +{ + "analyzer": "my_uax_analyzer", + "text": "Contact us at support@example.com or visit https://example.com for details." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "Contact","start_offset": 0,"end_offset": 7,"type": "","position": 0}, + {"token": "us","start_offset": 8,"end_offset": 10,"type": "","position": 1}, + {"token": "at","start_offset": 11,"end_offset": 13,"type": "","position": 2}, + {"token": "support@example.com","start_offset": 14,"end_offset": 33,"type": "","position": 3}, + {"token": "or","start_offset": 34,"end_offset": 36,"type": "","position": 4}, + {"token": "visit","start_offset": 37,"end_offset": 42,"type": "","position": 5}, + {"token": "https://example.com","start_offset": 43,"end_offset": 62,"type": "","position": 6}, + {"token": "for","start_offset": 63,"end_offset": 66,"type": "","position": 7}, + {"token": "details","start_offset": 67,"end_offset": 74,"type": "","position": 8} + ] +} +``` + +## Parameters + +The `uax_url_email` tokenizer can be configured with the following parameter. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`max_token_length` | Optional | Integer | Sets the maximum length of the produced token. If this length is exceeded, the token is split into multiple tokens at the length configured in `max_token_length`. Default is `255`. + diff --git a/_analyzers/tokenizers/whitespace.md b/_analyzers/tokenizers/whitespace.md new file mode 100644 index 0000000000..604eeeb6a0 --- /dev/null +++ b/_analyzers/tokenizers/whitespace.md @@ -0,0 +1,110 @@ +--- +layout: default +title: Whitespace +parent: Tokenizers +nav_order: 160 +--- + +# Whitespace tokenizer + +The `whitespace` tokenizer splits text at white space characters, such as spaces, tabs, and new lines. It treats each word separated by white space as a token and does not perform any additional analysis or normalization like lowercasing or punctuation removal. + +## Example usage + +The following example request creates a new index named `my_index` and configures an analyzer with a `whitespace` tokenizer: + +```json +PUT /my_index +{ + "settings": { + "analysis": { + "tokenizer": { + "whitespace_tokenizer": { + "type": "whitespace" + } + }, + "analyzer": { + "my_whitespace_analyzer": { + "type": "custom", + "tokenizer": "whitespace_tokenizer" + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "my_whitespace_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_index/_analyze +{ + "analyzer": "my_whitespace_analyzer", + "text": "OpenSearch is fast! Really fast." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "OpenSearch", + "start_offset": 0, + "end_offset": 10, + "type": "word", + "position": 0 + }, + { + "token": "is", + "start_offset": 11, + "end_offset": 13, + "type": "word", + "position": 1 + }, + { + "token": "fast!", + "start_offset": 14, + "end_offset": 19, + "type": "word", + "position": 2 + }, + { + "token": "Really", + "start_offset": 20, + "end_offset": 26, + "type": "word", + "position": 3 + }, + { + "token": "fast.", + "start_offset": 27, + "end_offset": 32, + "type": "word", + "position": 4 + } + ] +} +``` + +## Parameters + +The `whitespace` tokenizer can be configured with the following parameter. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`max_token_length` | Optional | Integer | Sets the maximum length of the produced token. If this length is exceeded, the token is split into multiple tokens at the length configured in `max_token_length`. Default is `255`. + diff --git a/_api-reference/cat/cat-indices.md b/_api-reference/cat/cat-indices.md index 288920778b..4bbdde573c 100644 --- a/_api-reference/cat/cat-indices.md +++ b/_api-reference/cat/cat-indices.md @@ -1,6 +1,6 @@ --- layout: default -title: CAT indices operation +title: CAT indices parent: CAT API nav_order: 25 has_children: false @@ -62,3 +62,7 @@ GET _cat/indices/index1,index2,index3 health | status | index | uuid | pri | rep | docs.count | docs.deleted | store.size | pri.store.size green | open | movies | UZbpfERBQ1-3GSH2bnM3sg | 1 | 1 | 1 | 0 | 7.7kb | 3.8kb ``` + +## Limiting the response size + +To limit the number of indexes returned, configure the `cat.indices.response.limit.number_of_indices` setting. For more information, see [Cluster-level CAT response limit settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/cluster-settings/#cluster-level-cat-response-limit-settings). \ No newline at end of file diff --git a/_api-reference/cat/cat-segments.md b/_api-reference/cat/cat-segments.md index a6075aa4f5..76696d3886 100644 --- a/_api-reference/cat/cat-segments.md +++ b/_api-reference/cat/cat-segments.md @@ -57,3 +57,7 @@ index | shard | prirep | ip | segment | generation | docs.count | docs.deleted | movies | 0 | p | 172.18.0.4 | _0 | 0 | 1 | 0 | 3.5kb | 1364 | true | true | 8.7.0 | true movies | 0 | r | 172.18.0.3 | _0 | 0 | 1 | 0 | 3.5kb | 1364 | true | true | 8.7.0 | true ``` + +## Limiting the response size + +To limit the number of indexes returned, configure the `cat.segments.response.limit.number_of_indices` setting. For more information, see [Cluster-level CAT response limit settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/cluster-settings/#cluster-level-cat-response-limit-settings). \ No newline at end of file diff --git a/_api-reference/cat/cat-shards.md b/_api-reference/cat/cat-shards.md index 54b503f094..c9677cb0ed 100644 --- a/_api-reference/cat/cat-shards.md +++ b/_api-reference/cat/cat-shards.md @@ -66,3 +66,7 @@ index | shard | prirep | state | docs | store | ip | | node plugins | 0 | p | STARTED | 0 | 208b | 172.18.0.4 | odfe-node1 plugins | 0 | r | STARTED | 0 | 208b | 172.18.0.3 | odfe-node2 ``` + +## Limiting the response size + +To limit the number of shards returned, configure the `cat.shards.response.limit.number_of_shards` setting. For more information, see [Cluster-level CAT response limit settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/cluster-settings/#cluster-level-cat-response-limit-settings). \ No newline at end of file diff --git a/_api-reference/cluster-api/cluster-stats.md b/_api-reference/cluster-api/cluster-stats.md index 09b3e5087e..0a1b348a67 100644 --- a/_api-reference/cluster-api/cluster-stats.md +++ b/_api-reference/cluster-api/cluster-stats.md @@ -21,6 +21,8 @@ The cluster stats API operation returns statistics about your cluster. ```json GET _cluster/stats GET _cluster/stats/nodes/ +GET _cluster/stats//nodes/ +GET _cluster/stats///nodes/ ``` ## Path parameters @@ -30,10 +32,57 @@ All parameters are optional. Parameter | Type | Description :--- | :--- | :--- <node-filters> | List | A comma-separated list of [node filters]({{site.url}}{{site.baseurl}}/api-reference/nodes-apis/index/#node-filters) that OpenSearch uses to filter results. +metric | String | A comma-separated list of [metric groups](#metric-groups), for example, `jvm,fs`. Default is all metric groups. +index_metric | String | A comma-separated list of [index metric groups](#index-metric-groups), for example, `docs,store`. Default is all index metrics. - Although the `master` node is now called `cluster_manager` for version 2.0, we retained the `master` field for backwards compatibility. If you have a node that has either a `master` role or a `cluster_manager` role, the `count` increases for both fields by 1. To see an example node count increase, see the Response sample. - {: .note } +Although the term `master` was deprecated in favor of `cluster_manager` subsequent to OpenSearch 2.0, the `master` field was retained for backward compatibility. If you have a node that has either a `master` role or a `cluster_manager` role, the `count` increases for both fields by 1. For an example node count increase, see the [example response](#example-response). +{: .note } + +### Metric groups + +The following table lists all available metric groups. + +Metric | Description +:--- |:---- +`indices` | Statistics about indexes in the cluster. +`os` | Statistics about the host OS, including load and memory. +`process` | Statistics about processes, including open file descriptors and CPU usage. +`jvm` | Statistics about the JVM, including heap usage and threads. +`fs` | Statistics about file system usage. +`plugins` | Statistics about OpenSearch plugins integrated with the nodes. +`network_types` | A list of the transport and HTTP networks connected to the nodes. +`discovery_type` | The method used by the nodes to find other nodes in the cluster. +`packaging_types` | Information about each node's OpenSearch distribution. +`ingest` | Statistics about ingest pipelines. + +### Index metric groups + +To filter the information returned for the `indices` metric, you can use specific `index_metric` values. These values are only supported when using the following query types: + +```json +GET _cluster/stats/_all//_nodes/ +GET _cluster/stats/indices//_nodes/ +``` + +The following index metrics are supported: + +- `shards` +- `docs` +- `store` +- `fielddata` +- `query_cache` +- `completion` +- `segments` +- `mappings` +- `analysis` + +For example, the following query requests statistics for `docs` and `search`: + +```json +GET _cluster/stats/indices/docs,segments/_nodes/_all +``` +{% include copy-curl.html %} ## Example request @@ -491,32 +540,32 @@ GET _cluster/stats/nodes/_cluster_manager Field | Description :--- | :--- -nodes | How many nodes returned in the response. -cluster_name | The cluster's name. -cluster_uuid | The cluster's uuid. -timestamp | The Unix epoch time of when the cluster was last refreshed. -status | The cluster's health status. -indices | Statistics about the indexes in the cluster. -indices.count | How many indexes are in the cluster. -indices.shards | Information about the cluster's shards. -indices.docs | How many documents are still in the cluster and how many documents are deleted. -indices.store | Information about the cluster's storage. -indices.fielddata | Information about the cluster's field data -indices.query_cache | Data about the cluster's query cache. -indices.completion | How many bytes in memory are used to complete operations. -indices.segments | Information about the cluster's segments, which are small Lucene indexes. -indices.mappings | Mappings within the cluster. -indices.analysis | Information about analyzers used in the cluster. -nodes | Statistics about the nodes in the cluster. -nodes.count | How many nodes were returned from the request. -nodes.versions | OpenSearch's version number. -nodes.os | Information about the operating systems used in the nodes. -nodes.process | The processes the returned nodes use. -nodes.jvm | Statistics about the Java Virtual Machines in use. -nodes.fs | The nodes' file storage. -nodes.plugins | The OpenSearch plugins integrated within the nodes. -nodes.network_types | The transport and HTTP networks within the nodes. -nodes.discovery_type | The method the nodes use to find other nodes within the cluster. -nodes.packaging_types | Information about the nodes' OpenSearch distribution. -nodes.ingest | Information about the nodes' ingest pipelines/nodes, if there are any. -total_time_spent | The total amount of download and upload time spent across all shards in the cluster when downloading or uploading from the remote store. +`nodes` | The number of nodes returned in the response. +`cluster_name` | The cluster's name. +`cluster_uuid` | The cluster's UUID. +`timestamp` | The Unix epoch time indicating when the cluster was last refreshed. +`status` | The cluster's health status. +`indices` | Statistics about the indexes in the cluster. +`indices.count` | The number of indexes in the cluster. +`indices.shards` | Information about the cluster's shards. +`indices.docs` | The number of documents remaining in the cluster and the number of documents that were deleted. +`indices.store` | Information about the cluster's storage. +`indices.fielddata` | Information about the cluster's field data. +`indices.query_cache` | Data about the cluster's query cache. +`indices.completion` | The number of bytes in memory that were used to complete operations. +`indices.segments` | Information about the cluster's segments, which are small Lucene indexes. +`indices.mappings` | Information about mappings in the cluster. +`indices.analysis` | Information about analyzers used in the cluster. +`nodes` | Statistics about the nodes in the cluster. +`nodes.count` | The number of nodes returned by the request. +`nodes.versions` | The OpenSearch version number for each node. +`nodes.os` | Information about the operating systems used by the nodes. +`nodes.process` | A list of processes used by each node. +`nodes.jvm` | Statistics about the JVMs in use. +`nodes.fs` | Information about the node's file storage. +`nodes.plugins` | A list of the OpenSearch plugins integrated with the nodes. +`nodes.network_types` | A list of the transport and HTTP networks connected to the nodes. +`nodes.discovery_type` | A list of methods used by the nodes to find other nodes in the cluster. +`nodes.packaging_types` | Information about each node's OpenSearch distribution. +`nodes.ingest` | Information about the node's ingest pipelines/nodes, if there are any. +`total_time_spent` | The total amount of download and upload time spent across all shards in the cluster when downloading or uploading from the remote store. diff --git a/_api-reference/common-parameters.md b/_api-reference/common-parameters.md index 5b536ad992..ac3efbf4bf 100644 --- a/_api-reference/common-parameters.md +++ b/_api-reference/common-parameters.md @@ -123,4 +123,17 @@ Kilometers | `km` or `kilometers` Meters | `m` or `meters` Centimeters | `cm` or `centimeters` Millimeters | `mm` or `millimeters` -Nautical miles | `NM`, `nmi`, or `nauticalmiles` \ No newline at end of file +Nautical miles | `NM`, `nmi`, or `nauticalmiles` + +## `X-Opaque-Id` header + +You can specify an opaque identifier for any request using the `X-Opaque-Id` header. This identifier is used to track tasks and deduplicate deprecation warnings in server-side logs. This identifier is used to differentiate between callers sending requests to your OpenSearch cluster. Do not specify a unique value per request. + +#### Example request + +The following request adds an opaque ID to the request: + +```json +curl -H "X-Opaque-Id: my-curl-client-1" -XGET localhost:9200/_tasks +``` +{% include copy.html %} diff --git a/_api-reference/document-apis/update-document.md b/_api-reference/document-apis/update-document.md index ff17940cdb..8500fec101 100644 --- a/_api-reference/document-apis/update-document.md +++ b/_api-reference/document-apis/update-document.md @@ -14,6 +14,14 @@ redirect_from: If you need to update a document's fields in your index, you can use the update document API operation. You can do so by specifying the new data you want to be in your index or by including a script in your request body, which OpenSearch runs to update the document. By default, the update operation only updates a document that exists in the index. If a document does not exist, the API returns an error. To _upsert_ a document (update the document that exists or index a new one), use the [upsert](#using-the-upsert-operation) operation. +You cannot explicitly specify an ingest pipeline when calling the Update Document API. If a `default_pipeline` or `final_pipeline` is defined in your index, the following behavior applies: + +- **Upsert operations**: When indexing a new document, the `default_pipeline` and `final_pipeline` defined in the index are executed as specified. +- **Update operations**: When updating an existing document, ingest pipeline execution is not recommended because it may produce erroneous results. Support for running ingest pipelines during update operations is deprecated and will be removed in version 3.0.0. If your index has a defined ingest pipeline, the update document operation will return the following deprecation warning: +``` +the index [sample-index1] has a default ingest pipeline or a final ingest pipeline, the support of the ingest pipelines for update operation causes unexpected result and will be removed in 3.0.0 +``` + ## Path and HTTP methods ```json diff --git a/_api-reference/index-apis/recover.md b/_api-reference/index-apis/recover.md index 2f91e20fcb..41f071cf6c 100644 --- a/_api-reference/index-apis/recover.md +++ b/_api-reference/index-apis/recover.md @@ -28,7 +28,7 @@ The Recovery API reports solely on completed recoveries for shard copies present ```json GET /_recovery -GET //recovery/ +GET //_recovery/ ``` ## Path parameters diff --git a/_api-reference/index-apis/update-settings.md b/_api-reference/index-apis/update-settings.md index c0991bf852..3afaaa10d3 100644 --- a/_api-reference/index-apis/update-settings.md +++ b/_api-reference/index-apis/update-settings.md @@ -11,7 +11,7 @@ redirect_from: **Introduced 1.0** {: .label .label-purple } -You can use the update settings API operation to update index-level settings. You can change dynamic index settings at any time, but static settings cannot be changed after index creation. For more information about static and dynamic index settings, see [Create index]({{site.url}}{{site.baseurl}}/api-reference/index-apis/create-index/). +You can use the update settings API operation to update index-level settings. You can change dynamic index settings at any time, but static settings cannot be changed after index creation. For more information about static and dynamic index settings, see [Configuring OpenSearch]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index/). Aside from the static and dynamic index settings, you can also update individual plugins' settings. To get the full list of updatable settings, run `GET /_settings?include_defaults=true`. @@ -26,7 +26,7 @@ PUT //_settings Parameter | Type | Description :--- | :--- | :--- -<index> | String | The index to update. Can be a comma-separated list of multiple index names. Use `_all` or * to close all indexes. +<index> | String | The index to update. Can be a comma-separated list of multiple index names. Use `_all` or `*` to specify all indexes. ## Query parameters diff --git a/_api-reference/list/index.md b/_api-reference/list/index.md new file mode 100644 index 0000000000..b3f694157c --- /dev/null +++ b/_api-reference/list/index.md @@ -0,0 +1,175 @@ +--- +layout: default +title: List API +nav_order: 45 +has_children: true +--- + +# List APIs +**Introduced 2.18** +{: .label .label-purple } + +The List API retrieves statistics about indexes and shards in a paginated format. This streamlines the task of processing responses that include many indexes. + +The List API supports two operations: + +- [List indices]({{site.url}}{{site.baseurl}}/api-reference/list/list-indices/) +- [List shards]({{site.url}}{{site.baseurl}}/api-reference/list/list-shards/) + +## Shared query parameters + +All List API operations support the following optional query parameters. + +Parameter | Description +:--- | :--- | +`v` | Provides verbose output by adding headers to the columns. It also adds some formatting to help align each of the columns. All examples in this section include the `v` parameter. +`help` | Lists the default and other available headers for a given operation. +`h` | Limits the output to specific headers. +`format` | The format in which to return the result. Valid values are `json`, `yaml`, `cbor`, and `smile`. +`s` | Sorts the output by the specified columns. + +## Examples + +The following examples show how to use the optional query parameters to customize all List API responses. + + +### Get verbose output + +To query indexes and their statistics with a verbose output that includes all column headings in the response, use the `v` query parameter, as shown in the following example. + +#### Request + +```json +GET _list/indices?v +``` +{% include copy-curl.html %} + +#### Response + +```json +health status index uuid pri rep docs.count docs.deleted +green open .kibana_1 - - - - +yellow open sample-index-1 - - - - +next_token null +``` + + +### Get all available headers + +To see all the available headers, use the `help` parameter with the following syntax: + +```json +GET _list/?help +``` +{% include copy-curl.html %} + +#### Request + +The following example list indices operation returns all the available headers: + +```json +GET _list/indices?help +``` +{% include copy-curl.html %} + +#### Response + +The following example displays the indexes and their health status in a table: + +```json +health | h | current health status +status | s | open/close status +index | i,idx | index name +uuid | id,uuid | index uuid +pri | p,shards.primary,shardsPrimary | number of primary shards +rep | r,shards.replica,shardsReplica | number of replica shards +docs.count | dc,docsCount | available docs +``` + +### Get a subset of headers + +To limit the output to a subset of headers, use the `h` parameter with the following syntax: + +```json +GET _list/?h=,&v +``` +{% include copy-curl.html %} + +For any operation, you can determine which headers are available by using the `help` parameter and then using the `h` parameter to limit the output to only a subset of headers. + +#### Request + +The following example limits the indexes in the response to only the index name and health status headers: + +```json +GET _list/indices?h=health,index +``` +{% include copy-curl.html %} + +### Response + +```json +green .kibana_1 +yellow sample-index-1 +next_token null +``` + + +### Sort by a header + +To sort the output on a single page by a header, use the `s` parameter with the following syntax: + +```json +GET _list/?s=, +``` +{% include copy-curl.html %} + +#### Request + +The following example request sorts indexes by index name: + +```json +GET _list/indices?s=h,i +``` +{% include copy-curl.html %} + +#### Response + +```json +green sample-index-2 +yellow sample-index-1 +next_token null +``` + +### Retrieve data in JSON format + +By default, List APIs return data in a `text/plain` format. Other supported formats are [YAML](https://yaml.org/), [CBOR](https://cbor.io/), and [Smile](https://github.com/FasterXML/smile-format-specification). + + +To retrieve data in the JSON format, use the `format=json` parameter with the following syntax. + +If you use the Security plugin, ensure you have the appropriate permissions. +{: .note } + +#### Request + +```json +GET _list/?format=json +``` +{% include copy-curl.html %} + +#### Request + +```json +GET _list/indices?format=json +``` +{% include copy-curl.html %} + +### Response + +The response contains data in JSON format: + +```json +{"next_token":null,"indices":[{"health":"green","status":"-","index":".kibana_1","uuid":"-","pri":"-","rep":"-","docs.count":"-","docs.deleted":"-","store.size":"-","pri.store.size":"-"},{"health":"yellow","status":"-","index":"sample-index-1","uuid":"-","pri":"-","rep":"-","docs.count":"-","docs.deleted":"-","store.size":"-","pri.store.size":"-"}]} +``` + diff --git a/_api-reference/list/list-indices.md b/_api-reference/list/list-indices.md new file mode 100644 index 0000000000..618413d35c --- /dev/null +++ b/_api-reference/list/list-indices.md @@ -0,0 +1,83 @@ +--- +layout: default +title: List indices +parent: List API +nav_order: 25 +has_children: false +--- + +# List indices +**Introduced 2.18** +{: .label .label-purple } + +The list indices operation provides the following index information in a paginated format: + +- The amount of disk space used by the index. +- The number of shards contained in the index. +- The index's health status. + +## Path and HTTP methods + +```json +GET _list/indices +GET _list/indices/ +``` + +## Query parameters + +Parameter | Type | Description +:--- | :--- | :--- +`bytes` | Byte size | Specifies the units for the byte size, for example, `7kb` or `6gb`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/opensearch/units/). +`health` | String | Limits indexes based on their health status. Supported values are `green`, `yellow`, and `red`. +`include_unloaded_segments` | Boolean | Whether to include information from segments not loaded into memory. Default is `false`. +`cluster_manager_timeout` | Time | The amount of time to wait for a connection to the cluster manager node. Default is `30s`. +`pri` | Boolean | Whether to return information only from the primary shards. Default is `false`. +`time` | Time | Specifies the time units, for example, `5d` or `7h`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/opensearch/units/). +`expand_wildcards` | Enum | Expands wildcard expressions to concrete indexes. Combine multiple values with commas. Supported values are `all`, `open`, `closed`, `hidden`, and `none`. Default is `open`. +`next_token` | String | Fetches the next page of indexes. When `null`, only provides the first page of indexes. Default is `null`. +`size` | Integer | The maximum number of indexes to be displayed on a single page. The number of indexes on a single page of the response is not always equal to the specified `size`. Default is `500`. Minimum is `1` and maximum value is `5000`. +`sort` | String | The order in which the indexes are displayed. If `desc`, then the most recently created indexes are displayed first. If `asc`, then the oldest indexes are displayed first. Default is `asc`. + +When using the `next_token` path parameter, use the token produced by the response to see the next page of indexes. After the API returns `null`, all indexes contained in the API have been returned. +{: .tip } + + +## Example requests + +To get information for all the indexes, use the following query and keep specifying the `next_token` as received from response until its `null`: + +```json +GET _list/indices/?v&next_token=token +``` + + +To limit the information to a specific index, add the index name after your query, as shown in the following example: + +```json +GET _list/indices/?v +``` +{% include copy-curl.html %} + +To get information about more than one index, separate the indexes with commas, as shown in the following example: + +```json +GET _list/indices/index1,index2,index3?v&next_token=token +``` +{% include copy-curl.html %} + + +## Example response + +**Plain text format** + +```json +health | status | index | uuid | pri | rep | docs.count | docs.deleted | store.size | pri.store.size +green | open | movies | UZbpfERBQ1-3GSH2bnM3sg | 1 | 1 | 1 | 0 | 7.7kb | 3.8kb +next_token MTcyOTE5NTQ5NjM5N3wub3BlbnNlYXJjaC1zYXAtbG9nLXR5cGVzLWNvbmZpZw== +``` + +**JSON format** + +```json +{"next_token":"MTcyOTE5NTQ5NjM5N3wub3BlbnNlYXJjaC1zYXAtbG9nLXR5cGVzLWNvbmZpZw==","indices":[{"health":"green","status":"open","index":"movies","uuid":"UZbpfERBQ1-3GSH2bnM3sg","pri":"1","rep":"1","docs.count":"1","docs.deleted":"0","store.size":"7.7kb","pri.store.size":"3.8kb"}]} +``` diff --git a/_api-reference/list/list-shards.md b/_api-reference/list/list-shards.md new file mode 100644 index 0000000000..7111aeb0f2 --- /dev/null +++ b/_api-reference/list/list-shards.md @@ -0,0 +1,78 @@ +--- +layout: default +title: List shards +parent: List API +nav_order: 20 +--- + +# List shards +**Introduced 2.18** +{: .label .label-purple } + +The list shards operation outputs, in a paginated format, the state of all primary and replica shards and how they are distributed. + +## Path and HTTP methods + +```json +GET _list/shards +GET _list/shards/ +``` + +## Query parameters + +All parameters are optional. + +Parameter | Type | Description +:--- | :--- | :--- +`bytes` | Byte size | Specifies the byte size units, for example, `7kb` or `6gb`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/opensearch/units/). +`local` | Boolean | Whether to return information from the local node only instead of from the cluster manager node. Default is `false`. +`cluster_manager_timeout` | Time | The amount of time to wait for a connection to the cluster manager node. Default is `30s`. +`cancel_after_time_interval` | Time | The amount of time after which the shard request is canceled. Default is `-1` (no timeout). +`time` | Time | Specifies the time units, for example, `5d` or `7h`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/opensearch/units/). +`next_token` | String | Fetches the next page of indexes. When `null`, only provides the first page of indexes. Default is `null`. +`size` | Integer | The maximum number of indexes to be displayed on a single page. The number of indexes on a single page of the response is not always equal to the specified `size`. Default and minimum value is `2000`. Maximum value is `20000`. +`sort` | String | The order in which the indexes are displayed. If `desc`, then the most recently created indexes are displayed first. If `asc`, then the oldest indexes are displayed first. Default is `asc`. + +When using the `next_token` path parameter, use the token produced by the response to see the next page of indexes. After the API returns `null`, all indexes contained in the API have been returned. +{: .tip } + +## Example requests + +To get information for all the indexes and shards, use the following query and keep specifying the `next_token` as received from response until its `null`: + +```json +GET _list/shards/?v&next_token=token +``` + +To limit the information to a specific index, add the index name after your query, as shown in the following example and keep specifying the `next_token` as received from response until its `null`: + +```json +GET _list/shards/?v&next_token=token +``` +{% include copy-curl.html %} + +If you want to get information for more than one index, separate the indexes with commas, as shown in the following example: + +```json +GET _list/shards/index1,index2,index3?v&next_token=token +``` +{% include copy-curl.html %} + +## Example response + +**Plain text format** + +```json +index | shard | prirep | state | docs | store | ip | | node +plugins | 0 | p | STARTED | 0 | 208b | 172.18.0.4 | odfe-node1 +plugins | 0 | r | STARTED | 0 | 208b | 172.18.0.3 | odfe-node2 +.... +.... +next_token MTcyOTE5NTQ5NjM5N3wub3BlbnNlYXJjaC1zYXAtbG9nLXR5cGVzLWNvbmZpZw== +``` + +**JSON format** + +```json +{"next_token":"MTcyOTE5NTQ5NjM5N3wub3BlbnNlYXJjaC1zYXAtbG9nLXR5cGVzLWNvbmZpZw==","shards":[{"index":"plugins","shard":"0","prirep":"p","state":"STARTED","docs":"0","store":"208B","ip":"172.18.0.4","node":"odfe-node1"},{"index":"plugins","shard":"0","prirep":"r","state":"STARTED","docs":"0","store":"208B","ip":"172.18.0.3","node":"odfe-node2"}]} +``` diff --git a/_api-reference/msearch-template.md b/_api-reference/msearch-template.md index 614d31d460..fdebf5bed1 100644 --- a/_api-reference/msearch-template.md +++ b/_api-reference/msearch-template.md @@ -106,7 +106,8 @@ OpenSearch returns an array with the results of each search in the same order as }, "max_score": null, "hits": [] - } + }, + "status": 200 }, { "took": 3, @@ -124,7 +125,8 @@ OpenSearch returns an array with the results of each search in the same order as }, "max_score": null, "hits": [] - } + }, + "status": 200 } ] } diff --git a/_api-reference/snapshots/restore-snapshot.md b/_api-reference/snapshots/restore-snapshot.md index 766604ee1d..b22c371134 100644 --- a/_api-reference/snapshots/restore-snapshot.md +++ b/_api-reference/snapshots/restore-snapshot.md @@ -51,8 +51,10 @@ All request body parameters are optional. | index_settings | String | A comma-delimited list of settings to add or change in all restored indices. Use this parameter to override index settings during snapshot restoration. For data streams, these index settings are applied to the restored backing indices. | | indices | String | A comma-delimited list of data streams and indices to restore from the snapshot. Multi-index syntax is supported. By default, a restore operation includes all data streams and indices in the snapshot. If this argument is provided, the restore operation only includes the data streams and indices that you specify. | | partial | Boolean | How the restore operation will behave if indices in the snapshot do not have all primary shards available. If `false`, the entire restore operation fails if any indices in the snapshot do not have all primary shards available.

If `true`, allows the restoration of a partial snapshot of indices with unavailable shards. Only shards that were successfully included in the snapshot are restored. All missing shards are recreated as empty. By default, the entire restore operation fails if one or more indices included in the snapshot do not have all primary shards available. To change this behavior, set `partial` to `true`. Defaults to `false`. | -| rename_pattern | String | The pattern to apply to restored data streams and indices. Data streams and indices matching the rename pattern will be renamed according to `rename_replacement`.

The rename pattern is applied as defined by the regular expression that supports referencing the original text.

The request fails if two or more data streams or indices are renamed into the same name. If you rename a restored data stream, its backing indices are also renamed. For example, if you rename the logs data stream to `recovered-logs`, the backing index `.ds-logs-1` is renamed to `.ds-recovered-logs-1`.

If you rename a restored stream, ensure an index template matches the new stream name. If there are no matching index template names, the stream cannot roll over and new backing indices are not created.| -| rename_replacement | String | The rename replacement string. See `rename_pattern` for more information.| +| rename_pattern | String | The pattern to apply to the restored data streams and indexes. Data streams and indexes matching the rename pattern will be renamed according to the `rename_replacement` setting.

The rename pattern is applied as defined by the regular expression that supports referencing the original text.

The request fails if two or more data streams or indexes are renamed to the same name. If you rename a restored data stream, its backing indexes are also renamed. For example, if you rename the logs data stream to `recovered-logs`, the backing index `.ds-logs-1` is renamed to `.ds-recovered-logs-1`.

If you rename a restored stream, ensure an index template matches the new stream name. If there are no matching index template names, the stream cannot roll over, and new backing indexes are not created.| +| rename_replacement | String | The rename replacement string.| +| rename_alias_pattern | String | The pattern to apply to the restored aliases. Aliases matching the rename pattern will be renamed according to the `rename_alias_replacement` setting.

The rename pattern is applied as defined by the regular expression that supports referencing the original text.

If two or more aliases are renamed to the same name, these aliases will be merged into one.| +| rename_alias_replacement | String | The rename replacement string for aliases.| | source_remote_store_repository | String | The name of the remote store repository of the source index being restored. If not provided, the Snapshot Restore API will use the repository that was registered when the snapshot was created. | wait_for_completion | Boolean | Whether to return a response after the restore operation has completed. If `false`, the request returns a response when the restore operation initializes. If `true`, the request returns a response when the restore operation completes. Defaults to `false`. | @@ -123,4 +125,4 @@ If open indices in a snapshot already exist in a cluster, and you don't delete, }, "status" : 500 } -```` \ No newline at end of file +```` diff --git a/_automating-configurations/api/create-workflow.md b/_automating-configurations/api/create-workflow.md index 610bfe8fab..ad9552c3ef 100644 --- a/_automating-configurations/api/create-workflow.md +++ b/_automating-configurations/api/create-workflow.md @@ -16,7 +16,7 @@ Creating a workflow adds the content of a workflow template to the flow framewor To obtain the validation template for workflow steps, call the [Get Workflow Steps API]({{site.url}}{{site.baseurl}}/automating-configurations/api/get-workflow-steps/). -You can include placeholder expressions in the value of workflow step fields. For example, you can specify a credential field in a template as `openAI_key: '${{ openai_key }}'`. The expression will be substituted with the user-provided value during provisioning, using the format {% raw %}`${{ }}`{% endraw %}. You can pass the actual key as a parameter by using the [Provision Workflow API]({{site.url}}{{site.baseurl}}/automating-configurations/api/provision-workflow/) or by using this API with the `provision` parameter set to `true`. +You can include placeholder expressions in the value of workflow step fields. For example, you can specify a credential field in a template as {% raw %}`openAI_key: '${{ openai_key }}'`{% endraw %}. The expression will be substituted with the user-provided value during provisioning, using the format {% raw %}`${{ }}`{% endraw %}. You can pass the actual key as a parameter by using the [Provision Workflow API]({{site.url}}{{site.baseurl}}/automating-configurations/api/provision-workflow/) or by using this API with the `provision` parameter set to `true`. Once a workflow is created, provide its `workflow_id` to other APIs. diff --git a/_automating-configurations/api/provision-workflow.md b/_automating-configurations/api/provision-workflow.md index 62c4954ee9..cb1fe42789 100644 --- a/_automating-configurations/api/provision-workflow.md +++ b/_automating-configurations/api/provision-workflow.md @@ -30,7 +30,7 @@ The following table lists the available path parameters. ## Query parameters -If you have included a substitution expression in the template, you may pass it as a query parameter or as a string value of a request body field. For example, if you specified a credential field in a template as `openAI_key: '${{ openai_key }}'`, then you can include the `openai_key` parameter as a query parameter or body field so it can be substituted during provisioning. For example, the following request provides a query parameter: +If you have included a substitution expression in the template, you may pass it as a query parameter or as a string value of a request body field. For example, if you specified a credential field in a template as {% raw %}`openAI_key: '${{ openai_key }}'`{% endraw %}, then you can include the `openai_key` parameter as a query parameter or body field so it can be substituted during provisioning. For example, the following request provides a query parameter: ```json POST /_plugins/_flow_framework/workflow//_provision?= @@ -47,14 +47,14 @@ POST /_plugins/_flow_framework/workflow/8xL8bowB8y25Tqfenm50/_provision ``` {% include copy-curl.html %} -The following request substitutes the expression `${{ openai_key }}` with the value "12345" using a query parameter: +The following request substitutes the expression {% raw %}`${{ openai_key }}`{% endraw %} with the value "12345" using a query parameter: ```json POST /_plugins/_flow_framework/workflow/8xL8bowB8y25Tqfenm50/_provision?openai_key=12345 ``` {% include copy-curl.html %} -The following request substitutes the expression `${{ openai_key }}` with the value "12345" using the request body: +The following request substitutes the expression {% raw %}`${{ openai_key }}`{% endraw %} with the value "12345" using the request body: ```json POST /_plugins/_flow_framework/workflow/8xL8bowB8y25Tqfenm50/_provision diff --git a/_benchmark/reference/commands/aggregate.md b/_benchmark/reference/commands/aggregate.md new file mode 100644 index 0000000000..a891bf3edf --- /dev/null +++ b/_benchmark/reference/commands/aggregate.md @@ -0,0 +1,98 @@ +--- +layout: default +title: aggregate +nav_order: 85 +parent: Command reference +grand_parent: OpenSearch Benchmark Reference +redirect_from: + - /benchmark/commands/aggregate/ +--- + +# aggregate + +The `aggregate` command combines multiple test executions into a single aggregated result, providing a more streamlined way to conduct and analyze multiple test runs. There are two methods of aggregation: + +- [Auto-aggregation](#auto-aggregation) +- [Manual aggregation](#manual-aggregation) + +## Auto-aggregation + +The auto-aggregation method runs multiple iterations of benchmark tests and automatically aggregates the results, all within a single command. You can use the flags outlined in this with the `execute` command. + +### Usage + +The following example runs the `geonames` workload and aggregates the results twice: + +```bash +opensearch-benchmark execute --test-iterations=2 --aggregate=true --workload=geonames --target-hosts=127.0.0.1:9200 +``` +{% include copy-curl.html %} + +### Auto-aggregation flags + +The following new flags can be used to customize the auto-aggregation method: + +- `--test-iterations`: Specifies the number of times to run the workload (default is `1`). +- `--aggregate`: Determines whether to aggregate the results of multiple test executions (default is `true`). +- `--sleep-timer`: Specifies the number of seconds to sleep before starting the next test execution (default is `5`). +- `--cancel-on-error`: When set, stops executing tests if an error occurs in one of the test iterations (default is `false`). + +## Manual aggregation + +You can use the `aggregate` command to manually aggregate results from multiple test executions. + +### Usage + +To aggregate multiple test executions manually, specify the `test_execution_ids` you would like to aggregate, as shown in the following example: + +```bash +opensearch-benchmark aggregate --test-executions=,,... +``` +{% include copy-curl.html %} + +### Response + +OpenSearch Benchmark responds with the following: + +``` + ____ _____ __ ____ __ __ + / __ \____ ___ ____ / ___/___ ____ ___________/ /_ / __ )___ ____ _____/ /_ ____ ___ ____ ______/ /__ + / / / / __ \/ _ \/ __ \\__ \/ _ \/ __ `/ ___/ ___/ __ \ / __ / _ \/ __ \/ ___/ __ \/ __ `__ \/ __ `/ ___/ //_/ +/ /_/ / /_/ / __/ / / /__/ / __/ /_/ / / / /__/ / / / / /_/ / __/ / / / /__/ / / / / / / / / /_/ / / / ,< +\____/ .___/\___/_/ /_/____/\___/\__,_/_/ \___/_/ /_/ /_____/\___/_/ /_/\___/_/ /_/_/ /_/ /_/\__,_/_/ /_/|_| + /_/ + +Aggregate test execution ID: aggregate_results_geonames_9aafcfb8-d3b7-4583-864e-4598b5886c4f + +------------------------------- +[INFO] SUCCESS (took 1 seconds) +------------------------------- +``` + +The results will be aggregated into one test execution and stored under the ID shown in the output. + +### Additional options +- `--test-execution-id`: Define a unique ID for the aggregated test execution. +- `--results-file`: Write the aggregated results to the provided file. +- `--workload-repository`: Define the repository from which OpenSearch Benchmark will load workloads (default is `default`). + +## Aggregated results + +Aggregated results includes the following information: + +- **Relative Standard Deviation (RSD)**: For each metric an additional `mean_rsd` value shows the spread of results across test executions. +- **Overall min/max values**: Instead of averaging minimum and maximum values, the aggregated result include `overall_min` and `overall_max` which reflect the true minimum/maximum across all test runs. +- **Storage**: Aggregated test results are stored in a separate `aggregated_results` folder alongside the `test_executions` folder. + +The following example shows aggregated results: + +```json + "throughput": { + "overall_min": 29056.890292903263, + "mean": 50115.8603858536, + "median": 50099.54349684457, + "overall_max": 72255.15946248993, + "unit": "docs/s", + "mean_rsd": 59.426059705973664 + }, +``` diff --git a/_benchmark/reference/commands/command-flags.md b/_benchmark/reference/commands/command-flags.md index 6520f80803..96948e45c7 100644 --- a/_benchmark/reference/commands/command-flags.md +++ b/_benchmark/reference/commands/command-flags.md @@ -328,3 +328,33 @@ Sets what fraction of randomized query values can be repeated. Takes values betw Sets how many distinct repeatable pair values are generated for each operation when randomization is used. Default is `5000`. This setting does not work when `--randomization-enabled` is not used. + + +## test-iterations + + +Specifies the number of times to run the workload. Default is `1`. + + +## aggregate + + +Determines whether OpenSearch Benchmark should aggregate the results of multiple test executions. + +When set to `true`, OpenSearch Benchmark will combine the results from all iterations into a single aggregated report. When set to `false`, results from each iteration will be reported separately. + +Default is `true`. + + +## sleep-timer + + +Specifies the number of seconds to sleep before starting the next test execution. Default is `5`. + + + +## cancel-on-error + + +When set, this flag instructs OpenSearch Benchmark to stop executing tests if an error occurs in one of the test iterations. Default is `false` (not set). + diff --git a/_config.yml b/_config.yml index 4ead6344c2..3c6f737cc8 100644 --- a/_config.yml +++ b/_config.yml @@ -5,10 +5,10 @@ baseurl: "/docs/latest" # the subpath of your site, e.g. /blog url: "https://opensearch.org" # the base hostname & protocol for your site, e.g. http://example.com permalink: /:path/ -opensearch_version: '2.17.0' -opensearch_dashboards_version: '2.17.0' -opensearch_major_minor_version: '2.17' -lucene_version: '9_11_1' +opensearch_version: '2.18.0' +opensearch_dashboards_version: '2.18.0' +opensearch_major_minor_version: '2.18' +lucene_version: '9_12_0' # Build settings markdown: kramdown @@ -31,9 +31,6 @@ collections: install-and-configure: permalink: /:collection/:path/ output: true - upgrade-to: - permalink: /:collection/:path/ - output: true im-plugin: permalink: /:collection/:path/ output: true @@ -94,6 +91,9 @@ collections: data-prepper: permalink: /:collection/:path/ output: true + migration-assistant: + permalink: /:collection/:path/ + output: true tools: permalink: /:collection/:path/ output: true @@ -121,6 +121,9 @@ collections: getting-started: permalink: /:collection/:path/ output: true + workspace: + permalink: /:collection/:path/ + output: true opensearch_collection: # Define the collections used in the theme @@ -134,11 +137,6 @@ opensearch_collection: install-and-configure: name: Install and upgrade nav_fold: true - upgrade-to: - name: Migrate to OpenSearch - # nav_exclude: true - nav_fold: true - # search_exclude: true im-plugin: name: Managing Indexes nav_fold: true @@ -210,6 +208,12 @@ clients_collection: name: Clients nav_fold: true +migration_assistant_collection: + collections: + migration-assistant: + name: Migration Assistant + nav_fold: true + benchmark_collection: collections: benchmark: @@ -249,6 +253,12 @@ defaults: values: section: "benchmark" section-name: "Benchmark" + - + scope: + path: "_migration-assistant" + values: + section: "migration-assistant" + section-name: "Migration Assistant" # Enable or disable the site search # By default, just-the-docs enables its JSON file-based search. We also have an OpenSearch-driven search functionality. @@ -308,6 +318,7 @@ plugins: - jekyll-remote-theme - jekyll-redirect-from - jekyll-sitemap + - jekyll-spec-insert # This format has to conform to RFC822 last-modified-at: @@ -317,6 +328,8 @@ last-modified-at: # The following items will not be processed, by default. Create a custom list # to override the default setting. exclude: + - README.md + - DEVELOPER_GUIDE.md - Gemfile - Gemfile.lock - node_modules @@ -324,6 +337,12 @@ exclude: - vendor/cache/ - vendor/gems/ - vendor/ruby/ - - README.md - - .idea - - templates + - templates/ + - .sass-cache/ + - .jekyll-cache/ + - .idea/ + - .github/ + - .bundle/ + - _site/ + - spec-insert + - release-notes \ No newline at end of file diff --git a/_dashboards/csp/csp-dynamic-configuration.md b/_dashboards/csp/csp-dynamic-configuration.md index abe80a60c7..794323472b 100644 --- a/_dashboards/csp/csp-dynamic-configuration.md +++ b/_dashboards/csp/csp-dynamic-configuration.md @@ -1,11 +1,11 @@ --- layout: default -title: Configuring CSP rules for `frame-ancestors` +title: Configuring CSP rules for frame ancestors nav_order: 140 has_children: false --- -# Configuring CSP rules for `frame-ancestors` +# Configuring CSP rules for frame ancestors Introduced 2.13 {: .label .label-purple } diff --git a/_dashboards/dashboards-assistant/alert-insight.md b/_dashboards/dashboards-assistant/alert-insight.md new file mode 100644 index 0000000000..603e5aba44 --- /dev/null +++ b/_dashboards/dashboards-assistant/alert-insight.md @@ -0,0 +1,321 @@ +--- +layout: default +title: Alert insights +parent: OpenSearch Assistant for OpenSearch Dashboards +nav_order: 1 +has_children: false +--- + +# Alert insights + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress the feature or if you want to leave feedback, join the discussion in the [OpenSearch forum](https://forum.opensearch.org/). +{: .warning} + +The OpenSearch Dashboards Assistant alert insights help generate alert summaries and provide log patterns based on the logs that triggered the alert. + +## Configuring alert insights + +To configure alert insights, use the following steps. + +### Prerequisite + +Before using alert insights, you must have the `alerting` and `alerting-dashboards` plugins installed on your cluster. By default, these plugins are installed as part of standard OpenSearch distributions. For more information, see [Installing plugins]({{site.url}}{{site.baseurl}}/install-and-configure/plugins/). + +### Step 1: Enable alert insights + +To enable alert insights, configure the following `opensearch_dashboards.yml` setting: + +```yaml +assistant.alertInsight.enabled: true +``` +{% include copy.html %} + +### Step 2: Create the agents + +To orchestrate alert insights, you'll need to create the necessary [agents]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/index/#agents). Create a workflow template for creating all necessary agents by sending the following request: + +
+ + Request + + {: .text-delta} + +```json +POST /_plugins/_flow_framework/workflow?provision=true +{ + "name": "Alert Summary Agent", + "description": "Create Alert Summary Agent using Claude on BedRock", + "use_case": "REGISTER_AGENT", + "version": { + "template": "1.0.0", + "compatibility": ["2.17.0", "3.0.0"] + }, + "workflows": { + "provision": { + "user_params": {}, + "nodes": [ + { + "id": "create_claude_connector", + "type": "create_connector", + "previous_node_inputs": {}, + "user_inputs": { + "version": "1", + "name": "Claude instant runtime Connector", + "protocol": "aws_sigv4", + "description": "The connector to BedRock service for Claude model", + "actions": [ + { + "headers": { + "x-amz-content-sha256": "required", + "content-type": "application/json" + }, + "method": "POST", + "request_body": "{\"prompt\":\"\\n\\nHuman: ${parameters.prompt}\\n\\nAssistant:\", \"max_tokens_to_sample\":${parameters.max_tokens_to_sample}, \"temperature\":${parameters.temperature}, \"anthropic_version\":\"${parameters.anthropic_version}\" }", + "action_type": "predict", + "url": "https://bedrock-runtime.us-west-2.amazonaws.com/model/anthropic.claude-instant-v1/invoke" + } + ], + "credential": { + "access_key": "", + "secret_key": "", + "session_token": "" + }, + "parameters": { + "region": "us-west-2", + "endpoint": "bedrock-runtime.us-west-2.amazonaws.com", + "content_type": "application/json", + "auth": "Sig_V4", + "max_tokens_to_sample": "8000", + "service_name": "bedrock", + "temperature": "0.0001", + "response_filter": "$.completion", + "anthropic_version": "bedrock-2023-05-31" + } + } + }, + { + "id": "register_claude_model", + "type": "register_remote_model", + "previous_node_inputs": { + "create_claude_connector": "connector_id" + }, + "user_inputs": { + "description": "Claude model", + "deploy": true, + "name": "claude-instant" + } + }, + { + "id": "create_alert_summary_ml_model_tool", + "type": "create_tool", + "previous_node_inputs": { + "register_claude_model": "model_id" + }, + "user_inputs": { + "parameters": { + "prompt": "You are an OpenSearch Alert Assistant to help summarize the alerts.\n Here is the detail of alert: ${parameters.context};\n The question is: ${parameters.question}." + }, + "name": "MLModelTool", + "type": "MLModelTool" + } + }, + { + "id": "create_alert_summary_agent", + "type": "register_agent", + "previous_node_inputs": { + "create_alert_summary_ml_model_tool": "tools" + }, + "user_inputs": { + "parameters": {}, + "type": "flow", + "name": "Alert Summary Agent", + "description": "this is an alert summary agent" + } + } + ] + } + } +} +``` +{% include copy-curl.html %} + +
+ +For sample agent templates, see [Flow Framework sample templates](https://github.com/opensearch-project/flow-framework/tree/2.x/sample-templates). Note the agent ID; you'll use it in the following step. + +For this example, use the templates to create the following agents: +- An alert insights agent, see [flow template](https://github.com/opensearch-project/flow-framework/blob/2.x/sample-templates/create-knowledge-base-alert-agent.json) +- Two summary agents: + - A basic alert summary agent, see [flow template](https://github.com/opensearch-project/flow-framework/blob/2.x/sample-templates/alert-summary-agent-claude-tested.json) + - An agent for an alert summary that includes log patterns, see [flow template](https://github.com/opensearch-project/flow-framework/blob/2.x/sample-templates/alert-summary-log-pattern-agent.json) + + These agents require different prompts. The prompt for the log patterns summary must include a placeholder `${parameters.topNLogPatternData}` and additional instructions to guide the LLM on using this information effectively. Note that log patterns are available only for query monitors created using OpenSearch Dashboards. + +### Step 3: Create the root agents + +Next, create [root agents]({{site.url}}{{site.baseurl}}/automating-configurations/workflow-tutorial/#root_agent) for agents created in the previous step. + +Create a root agent for the alert summary agent: + +```json +POST /.plugins-ml-config/_doc/os_summary +{ + "type": "os_root_agent", + "configuration": { + "agent_id": "" + } +} +``` +{% include copy-curl.html %} + +Create a root agent for the alert summary with log patterns agent: + +```json +POST /.plugins-ml-config/_doc/os_summary_with_log_pattern +{ + "type": "os_root_agent", + "configuration": { + "agent_id": "" + } +} +``` +{% include copy-curl.html %} + +Create a root agent for the alert insights agent: + +```json +POST /.plugins-ml-config/_doc/os_insight +{ + "type": "os_root_agent", + "configuration": { + "agent_id": "" + } +} +``` +{% include copy-curl.html %} + +The created `os_insight` agent provides alert insights related to OpenSearch cluster metrics. For insights about alerts unrelated to OpenSearch cluster metrics, you need to register an agent with [this template](https://github.com/opensearch-project/flow-framework/blob/2.x/sample-templates/create-knowledge-base-alert-agent.json) and change the agent name to `KB_For_Alert_Insight`. +{: .note} + +This example demonstrates a system index. In security-enabled domains, only superadmins have permissions to execute this code. For information about making superadmin calls, see [System indexes]({{site.url}}{{site.baseurl}}/security/configuration/system-indices/). For access permissions, contact your system administrator. +{: .warning} + +### Step 4: Test the agents + +You can verify that the agents were created successfully by calling the agents with an example payload. + +To test the alert summary agent, send the following request: + +```json +POST /_plugins/_ml/agents//_execute +{ + "parameters": { + "question": "Please summarize this alert, do not use any tool.", + "context": "\n Here is the detail information about alert Error log over 100\n ### Monitor definition\n {\"type\":\"monitor\",\"schema_version\":8,\"name\":\"loghub-apache-error-log\",\"monitor_type\":\"query_level_monitor\",\"enabled\":false,\"enabled_time\":null,\"schedule\":{\"period\":{\"interval\":1,\"unit\":\"MINUTES\"}},\"inputs\":[{\"search\":{\"indices\":[\"loghub-apache-new\"],\"query\":{\"size\":0,\"query\":{\"bool\":{\"filter\":[{\"range\":{\"Time\":{\"from\":\"10/12/24 11:21 am CST||-1000000h\",\"to\":\"10/12/24 11:21 am CST\",\"include_lower\":true,\"include_upper\":true,\"boost\":1}}},{\"term\":{\"Level\":{\"value\":\"error\",\"boost\":1}}}],\"adjust_pure_negative\":true,\"boost\":1}}}}}],\"triggers\":[{\"query_level_trigger\":{\"id\":\"NAq7fpIBRJyww-JMjwP_\",\"name\":\"Error log over 100\",\"severity\":\"1\",\"condition\":{\"script\":{\"source\":\"ctx.results[0].hits.total.value > 100\",\"lang\":\"painless\"}},\"actions\":[]}}],\"last_update_time\":1728714554388,\"owner\":\"alerting\",\"associated_workflows\":[],\"associatedCompositeMonitorCnt\":0,\"item_type\":\"query_level_monitor\",\"id\":\"NQq7fpIBRJyww-JMkAMC\",\"version\":3}\n\n ### Active Alert\n {\"ACTIVE\":1,\"ACKNOWLEDGED\":0,\"ERROR\":0,\"total\":1,\"alerts\":[{\"id\":\"Wgq8fpIBRJyww-JMegNr\",\"monitor_id\":\"NQq7fpIBRJyww-JMkAMC\",\"workflow_id\":\"\",\"workflow_name\":\"\",\"associated_alert_ids\":[],\"schema_version\":5,\"monitor_version\":1,\"monitor_name\":\"loghub-apache-error-log\",\"execution_id\":\"NQq7fpIBRJyww-JMkAMC_2024-10-12T03:18:54.311214115_22d189ce-5e93-4927-b8bb-bcf61b7537e3\",\"trigger_id\":\"NAq7fpIBRJyww-JMjwP_\",\"trigger_name\":\"Error log over 100\",\"finding_ids\":[],\"related_doc_ids\":[],\"state\":\"ACTIVE\",\"error_message\":null,\"alert_history\":[],\"severity\":\"1\",\"action_execution_results\":[],\"start_time\":\"10/12/24 11:18 am CST\",\"last_notification_time\":\"10/12/24 11:21 am CST\",\"end_time\":null,\"acknowledged_time\":null,\"alert_source\":\"monitor\"}],\"trigger_name\":\"Error log over 100\",\"severity\":\"1\",\"start_time\":\"10/12/24 11:18 am CST\",\"last_notification_time\":\"10/12/24 11:21 am CST\",\"monitor_name\":\"loghub-apache-error-log\",\"monitor_id\":\"NQq7fpIBRJyww-JMkAMC\",\"alert_source\":\"monitor\",\"triggerID\":\"NAq7fpIBRJyww-JMjwP_\"}\n\n ### Value triggers this alert\n 595\n\n ### Alert query DSL {\"query\":{\"bool\":{\"filter\":[{\"range\":{\"Time\":{\"from\":\"2024-10-12T03:21:54+00:00||-1000000h\",\"to\":\"2024-10-12T03:21:54+00:00\",\"include_lower\":true,\"include_upper\":true,\"boost\":1}}},{\"term\":{\"Level\":{\"value\":\"error\",\"boost\":1}}}],\"adjust_pure_negative\":true,\"boost\":1}}} \n", + } +} +``` +{% include copy-curl.html %} + +To test the alert summary with log patterns agent, send the following request: + +```json +POST /_plugins/_ml/agents//_execute +{ + "parameters": { + "question": "Please summarize this alert, do not use any tool.", + "context": "\n Here is the detail information about alert Error log over 100\n ### Monitor definition\n {\"type\":\"monitor\",\"schema_version\":8,\"name\":\"loghub-apache-error-log\",\"monitor_type\":\"query_level_monitor\",\"enabled\":false,\"enabled_time\":null,\"schedule\":{\"period\":{\"interval\":1,\"unit\":\"MINUTES\"}},\"inputs\":[{\"search\":{\"indices\":[\"loghub-apache-new\"],\"query\":{\"size\":0,\"query\":{\"bool\":{\"filter\":[{\"range\":{\"Time\":{\"from\":\"10/12/24 11:21 am CST||-1000000h\",\"to\":\"10/12/24 11:21 am CST\",\"include_lower\":true,\"include_upper\":true,\"boost\":1}}},{\"term\":{\"Level\":{\"value\":\"error\",\"boost\":1}}}],\"adjust_pure_negative\":true,\"boost\":1}}}}}],\"triggers\":[{\"query_level_trigger\":{\"id\":\"NAq7fpIBRJyww-JMjwP_\",\"name\":\"Error log over 100\",\"severity\":\"1\",\"condition\":{\"script\":{\"source\":\"ctx.results[0].hits.total.value > 100\",\"lang\":\"painless\"}},\"actions\":[]}}],\"last_update_time\":1728714554388,\"owner\":\"alerting\",\"associated_workflows\":[],\"associatedCompositeMonitorCnt\":0,\"item_type\":\"query_level_monitor\",\"id\":\"NQq7fpIBRJyww-JMkAMC\",\"version\":3}\n\n ### Active Alert\n {\"ACTIVE\":1,\"ACKNOWLEDGED\":0,\"ERROR\":0,\"total\":1,\"alerts\":[{\"id\":\"Wgq8fpIBRJyww-JMegNr\",\"monitor_id\":\"NQq7fpIBRJyww-JMkAMC\",\"workflow_id\":\"\",\"workflow_name\":\"\",\"associated_alert_ids\":[],\"schema_version\":5,\"monitor_version\":1,\"monitor_name\":\"loghub-apache-error-log\",\"execution_id\":\"NQq7fpIBRJyww-JMkAMC_2024-10-12T03:18:54.311214115_22d189ce-5e93-4927-b8bb-bcf61b7537e3\",\"trigger_id\":\"NAq7fpIBRJyww-JMjwP_\",\"trigger_name\":\"Error log over 100\",\"finding_ids\":[],\"related_doc_ids\":[],\"state\":\"ACTIVE\",\"error_message\":null,\"alert_history\":[],\"severity\":\"1\",\"action_execution_results\":[],\"start_time\":\"10/12/24 11:18 am CST\",\"last_notification_time\":\"10/12/24 11:21 am CST\",\"end_time\":null,\"acknowledged_time\":null,\"alert_source\":\"monitor\"}],\"trigger_name\":\"Error log over 100\",\"severity\":\"1\",\"start_time\":\"10/12/24 11:18 am CST\",\"last_notification_time\":\"10/12/24 11:21 am CST\",\"monitor_name\":\"loghub-apache-error-log\",\"monitor_id\":\"NQq7fpIBRJyww-JMkAMC\",\"alert_source\":\"monitor\",\"triggerID\":\"NAq7fpIBRJyww-JMjwP_\"}\n\n ### Value triggers this alert\n 595\n\n ### Alert query DSL {\"query\":{\"bool\":{\"filter\":[{\"range\":{\"Time\":{\"from\":\"2024-10-12T03:21:54+00:00||-1000000h\",\"to\":\"2024-10-12T03:21:54+00:00\",\"include_lower\":true,\"include_upper\":true,\"boost\":1}}},{\"term\":{\"Level\":{\"value\":\"error\",\"boost\":1}}}],\"adjust_pure_negative\":true,\"boost\":1}}} \n", + "topNLogPatternData": "[[539,["[Sun Dec 04 07:12:44 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 06:19:18 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 07:18:00 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 16:52:49 2005] [error] mod_jk child workerEnv in error state 7","[Sun Dec 04 06:59:47 2005] [error] mod_jk child workerEnv in error state 8","[Sun Dec 04 07:11:22 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 07:18:00 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 17:01:47 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 17:31:12 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 05:04:04 2005] [error] mod_jk child workerEnv in error state 7","[Sun Dec 04 20:24:49 2005] [error] mod_jk child workerEnv in error state 8","[Sun Dec 04 06:16:23 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 20:47:17 2005] [error] mod_jk child workerEnv in error state 7","[Sun Dec 04 06:30:43 2005] [error] mod_jk child workerEnv in error state 6","[Mon Dec 05 06:35:27 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 07:07:30 2005] [error] mod_jk child workerEnv in error state 8","[Sun Dec 04 07:18:00 2005] [error] mod_jk child workerEnv in error state 7","[Sun Dec 04 16:32:56 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 17:01:47 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 16:52:49 2005] [error] mod_jk child workerEnv in error state 8"],"[ :: ] [] _ "],[32,["[Sun Dec 04 14:29:00 2005] [error] [client 4.245.93.87] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 08:54:17 2005] [error] [client 147.31.138.75] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 17:34:57 2005] [error] [client 61.138.216.82] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 07:45:45 2005] [error] [client 63.13.186.196] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 10:53:30 2005] [error] [client 218.76.139.20] Directory index forbidden by rule: /var/www/html/","[Mon Dec 05 10:48:48 2005] [error] [client 67.166.248.235] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 15:18:36 2005] [error] [client 67.154.58.130] Directory index forbidden by rule: /var/www/html/","[Mon Dec 05 01:30:32 2005] [error] [client 211.62.201.48] Directory index forbidden by rule: /var/www/html/","[Mon Dec 05 16:45:04 2005] [error] [client 216.216.185.130] Directory index forbidden by rule: /var/www/html/","[Mon Dec 05 17:31:39 2005] [error] [client 218.75.106.250] Directory index forbidden by rule: /var/www/html/","[Mon Dec 05 19:00:56 2005] [error] [client 68.228.3.15] Directory index forbidden by rule: /var/www/html/","[Mon Dec 05 19:14:09 2005] [error] [client 61.220.139.68] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 09:35:12 2005] [error] [client 207.203.80.15] Directory index forbidden by rule: /var/www/html/","[Mon Dec 05 10:28:44 2005] [error] [client 198.232.168.9] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 16:24:05 2005] [error] [client 58.225.62.140] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 17:53:43 2005] [error] [client 218.39.132.175] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 12:33:13 2005] [error] [client 208.51.151.210] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 15:59:01 2005] [error] [client 24.83.37.136] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 11:42:43 2005] [error] [client 216.127.124.16] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 05:15:09 2005] [error] [client 222.166.160.184] Directory index forbidden by rule: /var/www/html/"],"[ :: ] [] [ ...] : ////"],[12,["[Sun Dec 04 20:47:17 2005] [error] mod_jk child init 1 -2","[Sun Dec 04 20:47:17 2005] [error] mod_jk child init 1 -2","[Mon Dec 05 07:57:02 2005] [error] mod_jk child init 1 -2","[Sun Dec 04 17:43:12 2005] [error] mod_jk child init 1 -2","[Sun Dec 04 20:47:17 2005] [error] mod_jk child init 1 -2","[Sun Dec 04 20:47:16 2005] [error] mod_jk child init 1 -2","[Mon Dec 05 07:57:02 2005] [error] mod_jk child init 1 -2","[Sun Dec 04 17:43:12 2005] [error] mod_jk child init 1 -2","[Mon Dec 05 11:06:52 2005] [error] mod_jk child init 1 -2","[Mon Dec 05 11:06:52 2005] [error] mod_jk child init 1 -2","[Mon Dec 05 11:06:52 2005] [error] mod_jk child init 1 -2","[Mon Dec 05 11:06:52 2005] [error] mod_jk child init 1 -2"],"[ :: ] [] _ -"]]" + } +} +``` +{% include copy-curl.html %} + +To test the alert insights agent, send the following request: + +```json +POST /_plugins/_ml/agents//_execute +{ + "parameters": { + "question": "Please provide your insight on this alerts.", + "context": "\n Here is the detail information about alert Error log over 100\n ### Monitor definition\n {\"type\":\"monitor\",\"schema_version\":8,\"name\":\"loghub-apache-error-log\",\"monitor_type\":\"query_level_monitor\",\"enabled\":false,\"enabled_time\":null,\"schedule\":{\"period\":{\"interval\":1,\"unit\":\"MINUTES\"}},\"inputs\":[{\"search\":{\"indices\":[\"loghub-apache-new\"],\"query\":{\"size\":0,\"query\":{\"bool\":{\"filter\":[{\"range\":{\"Time\":{\"from\":\"10/12/24 11:21 am CST||-1000000h\",\"to\":\"10/12/24 11:21 am CST\",\"include_lower\":true,\"include_upper\":true,\"boost\":1}}},{\"term\":{\"Level\":{\"value\":\"error\",\"boost\":1}}}],\"adjust_pure_negative\":true,\"boost\":1}}}}}],\"triggers\":[{\"query_level_trigger\":{\"id\":\"NAq7fpIBRJyww-JMjwP_\",\"name\":\"Error log over 100\",\"severity\":\"1\",\"condition\":{\"script\":{\"source\":\"ctx.results[0].hits.total.value > 100\",\"lang\":\"painless\"}},\"actions\":[]}}],\"last_update_time\":1728714554388,\"owner\":\"alerting\",\"associated_workflows\":[],\"associatedCompositeMonitorCnt\":0,\"item_type\":\"query_level_monitor\",\"id\":\"NQq7fpIBRJyww-JMkAMC\",\"version\":3}\n\n ### Active Alert\n {\"ACTIVE\":1,\"ACKNOWLEDGED\":0,\"ERROR\":0,\"total\":1,\"alerts\":[{\"id\":\"Wgq8fpIBRJyww-JMegNr\",\"monitor_id\":\"NQq7fpIBRJyww-JMkAMC\",\"workflow_id\":\"\",\"workflow_name\":\"\",\"associated_alert_ids\":[],\"schema_version\":5,\"monitor_version\":1,\"monitor_name\":\"loghub-apache-error-log\",\"execution_id\":\"NQq7fpIBRJyww-JMkAMC_2024-10-12T03:18:54.311214115_22d189ce-5e93-4927-b8bb-bcf61b7537e3\",\"trigger_id\":\"NAq7fpIBRJyww-JMjwP_\",\"trigger_name\":\"Error log over 100\",\"finding_ids\":[],\"related_doc_ids\":[],\"state\":\"ACTIVE\",\"error_message\":null,\"alert_history\":[],\"severity\":\"1\",\"action_execution_results\":[],\"start_time\":\"10/12/24 11:18 am CST\",\"last_notification_time\":\"10/12/24 11:21 am CST\",\"end_time\":null,\"acknowledged_time\":null,\"alert_source\":\"monitor\"}],\"trigger_name\":\"Error log over 100\",\"severity\":\"1\",\"start_time\":\"10/12/24 11:18 am CST\",\"last_notification_time\":\"10/12/24 11:21 am CST\",\"monitor_name\":\"loghub-apache-error-log\",\"monitor_id\":\"NQq7fpIBRJyww-JMkAMC\",\"alert_source\":\"monitor\",\"triggerID\":\"NAq7fpIBRJyww-JMjwP_\"}\n\n ### Value triggers this alert\n 595\n\n ### Alert query DSL {\"query\":{\"bool\":{\"filter\":[{\"range\":{\"Time\":{\"from\":\"2024-10-12T03:21:54+00:00||-1000000h\",\"to\":\"2024-10-12T03:21:54+00:00\",\"include_lower\":true,\"include_upper\":true,\"boost\":1}}},{\"term\":{\"Level\":{\"value\":\"error\",\"boost\":1}}}],\"adjust_pure_negative\":true,\"boost\":1}}} \n", + "summary": + } +} +``` +{% include copy-curl.html %} + +## Generating an alert summary + +You can generate an alert summary by calling the `/api/assistant/summary` API endpoint. To generate an alert summary, the fields `index`, `dsl`, and `topNLogPatternData` are optional. If all three fields are provided, the agent will provide a summary with log pattern analysis; otherwise, it will provide a general summary: + +```json +POST /api/assistant/summary +{ + "summaryType": "alerts", + "question": "Please summarize this alert, do not use any tool.", + "context": "\n Here is the detail information about alert Error log over 100\n ### Monitor definition\n {\"type\":\"monitor\",\"schema_version\":8,\"name\":\"loghub-apache-error-log\",\"monitor_type\":\"query_level_monitor\",\"enabled\":false,\"enabled_time\":null,\"schedule\":{\"period\":{\"interval\":1,\"unit\":\"MINUTES\"}},\"inputs\":[{\"search\":{\"indices\":[\"loghub-apache-new\"],\"query\":{\"size\":0,\"query\":{\"bool\":{\"filter\":[{\"range\":{\"Time\":{\"from\":\"10/12/24 11:21 am CST||-1000000h\",\"to\":\"10/12/24 11:21 am CST\",\"include_lower\":true,\"include_upper\":true,\"boost\":1}}},{\"term\":{\"Level\":{\"value\":\"error\",\"boost\":1}}}],\"adjust_pure_negative\":true,\"boost\":1}}}}}],\"triggers\":[{\"query_level_trigger\":{\"id\":\"NAq7fpIBRJyww-JMjwP_\",\"name\":\"Error log over 100\",\"severity\":\"1\",\"condition\":{\"script\":{\"source\":\"ctx.results[0].hits.total.value > 100\",\"lang\":\"painless\"}},\"actions\":[]}}],\"last_update_time\":1728714554388,\"owner\":\"alerting\",\"associated_workflows\":[],\"associatedCompositeMonitorCnt\":0,\"item_type\":\"query_level_monitor\",\"id\":\"NQq7fpIBRJyww-JMkAMC\",\"version\":3}\n\n ### Active Alert\n {\"ACTIVE\":1,\"ACKNOWLEDGED\":0,\"ERROR\":0,\"total\":1,\"alerts\":[{\"id\":\"Wgq8fpIBRJyww-JMegNr\",\"monitor_id\":\"NQq7fpIBRJyww-JMkAMC\",\"workflow_id\":\"\",\"workflow_name\":\"\",\"associated_alert_ids\":[],\"schema_version\":5,\"monitor_version\":1,\"monitor_name\":\"loghub-apache-error-log\",\"execution_id\":\"NQq7fpIBRJyww-JMkAMC_2024-10-12T03:18:54.311214115_22d189ce-5e93-4927-b8bb-bcf61b7537e3\",\"trigger_id\":\"NAq7fpIBRJyww-JMjwP_\",\"trigger_name\":\"Error log over 100\",\"finding_ids\":[],\"related_doc_ids\":[],\"state\":\"ACTIVE\",\"error_message\":null,\"alert_history\":[],\"severity\":\"1\",\"action_execution_results\":[],\"start_time\":\"10/12/24 11:18 am CST\",\"last_notification_time\":\"10/12/24 11:21 am CST\",\"end_time\":null,\"acknowledged_time\":null,\"alert_source\":\"monitor\"}],\"trigger_name\":\"Error log over 100\",\"severity\":\"1\",\"start_time\":\"10/12/24 11:18 am CST\",\"last_notification_time\":\"10/12/24 11:21 am CST\",\"monitor_name\":\"loghub-apache-error-log\",\"monitor_id\":\"NQq7fpIBRJyww-JMkAMC\",\"alert_source\":\"monitor\",\"triggerID\":\"NAq7fpIBRJyww-JMjwP_\"}\n\n ### Value triggers this alert\n 595\n\n ### Alert query DSL {\"query\":{\"bool\":{\"filter\":[{\"range\":{\"Time\":{\"from\":\"2024-10-12T03:21:54+00:00||-1000000h\",\"to\":\"2024-10-12T03:21:54+00:00\",\"include_lower\":true,\"include_upper\":true,\"boost\":1}}},{\"term\":{\"Level\":{\"value\":\"error\",\"boost\":1}}}],\"adjust_pure_negative\":true,\"boost\":1}}} \n", + "index": "loghub-apache-new", + "dsl": "{\"query\":{\"bool\":{\"filter\":[{\"range\":{\"Time\":{\"from\":\"2024-10-12T03:21:54+00:00||-1000000h\",\"to\":\"2024-10-12T03:21:54+00:00\",\"include_lower\":true,\"include_upper\":true,\"boost\":1}}},{\"term\":{\"Level\":{\"value\":\"error\",\"boost\":1}}}],\"adjust_pure_negative\":true,\"boost\":1}}}", + "topNLogPatternData": "[[539,["[Sun Dec 04 07:12:44 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 06:19:18 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 07:18:00 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 16:52:49 2005] [error] mod_jk child workerEnv in error state 7","[Sun Dec 04 06:59:47 2005] [error] mod_jk child workerEnv in error state 8","[Sun Dec 04 07:11:22 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 07:18:00 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 17:01:47 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 17:31:12 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 05:04:04 2005] [error] mod_jk child workerEnv in error state 7","[Sun Dec 04 20:24:49 2005] [error] mod_jk child workerEnv in error state 8","[Sun Dec 04 06:16:23 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 20:47:17 2005] [error] mod_jk child workerEnv in error state 7","[Sun Dec 04 06:30:43 2005] [error] mod_jk child workerEnv in error state 6","[Mon Dec 05 06:35:27 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 07:07:30 2005] [error] mod_jk child workerEnv in error state 8","[Sun Dec 04 07:18:00 2005] [error] mod_jk child workerEnv in error state 7","[Sun Dec 04 16:32:56 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 17:01:47 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 16:52:49 2005] [error] mod_jk child workerEnv in error state 8"],"[ :: ] [] _ "],[32,["[Sun Dec 04 14:29:00 2005] [error] [client 4.245.93.87] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 08:54:17 2005] [error] [client 147.31.138.75] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 17:34:57 2005] [error] [client 61.138.216.82] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 07:45:45 2005] [error] [client 63.13.186.196] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 10:53:30 2005] [error] [client 218.76.139.20] Directory index forbidden by rule: /var/www/html/","[Mon Dec 05 10:48:48 2005] [error] [client 67.166.248.235] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 15:18:36 2005] [error] [client 67.154.58.130] Directory index forbidden by rule: /var/www/html/","[Mon Dec 05 01:30:32 2005] [error] [client 211.62.201.48] Directory index forbidden by rule: /var/www/html/","[Mon Dec 05 16:45:04 2005] [error] [client 216.216.185.130] Directory index forbidden by rule: /var/www/html/","[Mon Dec 05 17:31:39 2005] [error] [client 218.75.106.250] Directory index forbidden by rule: /var/www/html/","[Mon Dec 05 19:00:56 2005] [error] [client 68.228.3.15] Directory index forbidden by rule: /var/www/html/","[Mon Dec 05 19:14:09 2005] [error] [client 61.220.139.68] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 09:35:12 2005] [error] [client 207.203.80.15] Directory index forbidden by rule: /var/www/html/","[Mon Dec 05 10:28:44 2005] [error] [client 198.232.168.9] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 16:24:05 2005] [error] [client 58.225.62.140] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 17:53:43 2005] [error] [client 218.39.132.175] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 12:33:13 2005] [error] [client 208.51.151.210] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 15:59:01 2005] [error] [client 24.83.37.136] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 11:42:43 2005] [error] [client 216.127.124.16] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 05:15:09 2005] [error] [client 222.166.160.184] Directory index forbidden by rule: /var/www/html/"],"[ :: ] [] [ ...] : ////"],[12,["[Sun Dec 04 20:47:17 2005] [error] mod_jk child init 1 -2","[Sun Dec 04 20:47:17 2005] [error] mod_jk child init 1 -2","[Mon Dec 05 07:57:02 2005] [error] mod_jk child init 1 -2","[Sun Dec 04 17:43:12 2005] [error] mod_jk child init 1 -2","[Sun Dec 04 20:47:17 2005] [error] mod_jk child init 1 -2","[Sun Dec 04 20:47:16 2005] [error] mod_jk child init 1 -2","[Mon Dec 05 07:57:02 2005] [error] mod_jk child init 1 -2","[Sun Dec 04 17:43:12 2005] [error] mod_jk child init 1 -2","[Mon Dec 05 11:06:52 2005] [error] mod_jk child init 1 -2","[Mon Dec 05 11:06:52 2005] [error] mod_jk child init 1 -2","[Mon Dec 05 11:06:52 2005] [error] mod_jk child init 1 -2","[Mon Dec 05 11:06:52 2005] [error] mod_jk child init 1 -2"],"[ :: ] [] _ -"]]" +} +``` +{% include copy-curl.html %} + +The following table describes the Assistant Summary API parameters. + +Parameter | Required/Optional | Description +:--- | :--- | :--- +`summaryType` | Required | Specifies the type of application calling this API. Use `alerts` for alert insights. +`question` | Required | Specifies the user's question regarding alert insights. Default is `Please summarize this alert, do not use any tool.` +`context` | Required | Provides context for the alert, including the alert monitor definition, active alerts, and trigger values. +`index` | Optional | The index that the alert monitors. If this parameter is not provided, log pattern analysis is not returned. +`dsl` | Optional | The DSL query for alert monitoring. If this parameter is not provided, log pattern analysis is not returned. +`topNLogPatternData` | Optional | Log patterns for the alert trigger data. If this parameter is not provided, log pattern analysis is not returned. + +## Generating alert insights + +You can generate alert insights by calling the `/api/assistant/insight` API endpoint. To generate alert insights, all of the following parameters are required: + +```json +POST /api/assistant/insight +{ + "summaryType": "alerts", + "insightType": "user_insight" + "context": "\n Here is the detail information about alert Error log over 100\n ### Monitor definition\n {\"type\":\"monitor\",\"schema_version\":8,\"name\":\"loghub-apache-error-log\",\"monitor_type\":\"query_level_monitor\",\"enabled\":false,\"enabled_time\":null,\"schedule\":{\"period\":{\"interval\":1,\"unit\":\"MINUTES\"}},\"inputs\":[{\"search\":{\"indices\":[\"loghub-apache-new\"],\"query\":{\"size\":0,\"query\":{\"bool\":{\"filter\":[{\"range\":{\"Time\":{\"from\":\"10/12/24 11:21 am CST||-1000000h\",\"to\":\"10/12/24 11:21 am CST\",\"include_lower\":true,\"include_upper\":true,\"boost\":1}}},{\"term\":{\"Level\":{\"value\":\"error\",\"boost\":1}}}],\"adjust_pure_negative\":true,\"boost\":1}}}}}],\"triggers\":[{\"query_level_trigger\":{\"id\":\"NAq7fpIBRJyww-JMjwP_\",\"name\":\"Error log over 100\",\"severity\":\"1\",\"condition\":{\"script\":{\"source\":\"ctx.results[0].hits.total.value > 100\",\"lang\":\"painless\"}},\"actions\":[]}}],\"last_update_time\":1728714554388,\"owner\":\"alerting\",\"associated_workflows\":[],\"associatedCompositeMonitorCnt\":0,\"item_type\":\"query_level_monitor\",\"id\":\"NQq7fpIBRJyww-JMkAMC\",\"version\":3}\n\n ### Active Alert\n {\"ACTIVE\":1,\"ACKNOWLEDGED\":0,\"ERROR\":0,\"total\":1,\"alerts\":[{\"id\":\"Wgq8fpIBRJyww-JMegNr\",\"monitor_id\":\"NQq7fpIBRJyww-JMkAMC\",\"workflow_id\":\"\",\"workflow_name\":\"\",\"associated_alert_ids\":[],\"schema_version\":5,\"monitor_version\":1,\"monitor_name\":\"loghub-apache-error-log\",\"execution_id\":\"NQq7fpIBRJyww-JMkAMC_2024-10-12T03:18:54.311214115_22d189ce-5e93-4927-b8bb-bcf61b7537e3\",\"trigger_id\":\"NAq7fpIBRJyww-JMjwP_\",\"trigger_name\":\"Error log over 100\",\"finding_ids\":[],\"related_doc_ids\":[],\"state\":\"ACTIVE\",\"error_message\":null,\"alert_history\":[],\"severity\":\"1\",\"action_execution_results\":[],\"start_time\":\"10/12/24 11:18 am CST\",\"last_notification_time\":\"10/12/24 11:21 am CST\",\"end_time\":null,\"acknowledged_time\":null,\"alert_source\":\"monitor\"}],\"trigger_name\":\"Error log over 100\",\"severity\":\"1\",\"start_time\":\"10/12/24 11:18 am CST\",\"last_notification_time\":\"10/12/24 11:21 am CST\",\"monitor_name\":\"loghub-apache-error-log\",\"monitor_id\":\"NQq7fpIBRJyww-JMkAMC\",\"alert_source\":\"monitor\",\"triggerID\":\"NAq7fpIBRJyww-JMjwP_\"}\n\n ### Value triggers this alert\n 595\n\n ### Alert query DSL {\"query\":{\"bool\":{\"filter\":[{\"range\":{\"Time\":{\"from\":\"2024-10-12T03:21:54+00:00||-1000000h\",\"to\":\"2024-10-12T03:21:54+00:00\",\"include_lower\":true,\"include_upper\":true,\"boost\":1}}},{\"term\":{\"Level\":{\"value\":\"error\",\"boost\":1}}}],\"adjust_pure_negative\":true,\"boost\":1}}} \n", + "question": "Please provide your insight on this alerts.", + "summary": +} +``` +{% include copy-curl.html %} + +The following table describes the Assistant Insight API parameters. + +Parameter | Required/Optional | Description +:--- | :--- | :--- +`summaryType` | Required | Specifies the type of application calling this API. Use `alerts` for alert insights. +`insightType` | Required | Defines the alert type. Use `os_insight` for cluster metrics alerts and `user_insight` for other alert types. +`question` | Required | Specifies the user's question regarding alert insights. Default is `Please provide your insight on this alerts.` +`context` | Required | Provides context for the alert, including the alert monitor definition, active alerts, and trigger values. +`summary` | Required | The result returned by the alert summary agent. + + +## Viewing alert insights in OpenSearch Dashboards + +Before viewing alert insights, you must configure alerts in OpenSearch Dashboards. For more information, see [Alerting]({{site.url}}{{site.baseurl}}/observing-your-data/alerting/index/). + +To view alert insights in OpenSearch Dashboards, use the following steps: + +1. On the top menu bar, go to **OpenSearch Plugins > Alerting**. All alerts are displayed. + +1. Hover over the alerts for your desired monitor. If you configured alert insights, you will see a sparkle icon ({::nomarkdown}sparkle icon{:/}) next to the alerts in the **Alerts** column, as shown in the following image. + + Alerting page with sparkle icon + +1. Select the alerts label or the sparkle icon. You will see the generated summary, as shown in the following image. + + Alert summary + +1. Select the information icon ({::nomarkdown}info icon{:/}) to view alert insights. You will see the generated alert insights, as shown in the following image. + + Alert insights \ No newline at end of file diff --git a/_dashboards/dashboards-assistant/data-summary.md b/_dashboards/dashboards-assistant/data-summary.md new file mode 100644 index 0000000000..e90e184e07 --- /dev/null +++ b/_dashboards/dashboards-assistant/data-summary.md @@ -0,0 +1,294 @@ +--- +layout: default +title: Data summary +parent: OpenSearch Assistant for OpenSearch Dashboards +nav_order: 1 +has_children: false +--- + +# Data summary + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, join the discussion on the [OpenSearch forum](https://forum.opensearch.org/). +{: .warning} + +The OpenSearch Dashboards Assistant data summary feature uses large language models (LLMs) to help you generate summaries for data stored in OpenSearch indexes. This tool provides an efficient way to gain insights from large datasets, making it easier to understand and act on the information contained in your OpenSearch indexes. + +## Configuration + +To configure the data summary feature, use the following steps. + +### Prerequisite + +Before using the data summary feature, enable query enhancements in OpenSearch Dashboards as follows: + +1. On the top menu bar, go to **Management > Dashboards Management**. +1. In the left navigation pane, select **Advanced settings**. +1. On the settings page, toggle **Enable query enhancements** to **On**. + +### Step 1: Enable the data summary feature + +To enable the data summary feature, configure the following `opensearch_dashboards.yml` setting: + +```yaml +queryEnhancements.queryAssist.summary.enabled: true +``` +{% include copy.html %} + +### Step 2: Create a data summary agent + +To orchestrate data summarization, create a data summary [agent]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/index/#agents). To create an agent, send a `POST /_plugins/_flow_framework/workflow?provision=true` request and provide the agent template as a payload: + +
+ + Request + + {: .text-delta} + +```json +POST /_plugins/_flow_framework/workflow?provision=true +{ + "name": "Query Assist Agent", + "description": "Create a Query Assist Agent using Claude on BedRock", + "use_case": "REGISTER_AGENT", + "version": { + "template": "1.0.0", + "compatibility": ["2.13.0", "3.0.0"] + }, + "workflows": { + "provision": { + "user_params": {}, + "nodes": [ + { + "id": "create_claude_connector", + "type": "create_connector", + "previous_node_inputs": {}, + "user_inputs": { + "version": "1", + "name": "Claude instant runtime Connector", + "protocol": "aws_sigv4", + "description": "The connector to BedRock service for Claude model", + "actions": [ + { + "headers": { + "x-amz-content-sha256": "required", + "content-type": "application/json" + }, + "method": "POST", + "request_body": "{\"prompt\":\"${parameters.prompt}\", \"max_tokens_to_sample\":${parameters.max_tokens_to_sample}, \"temperature\":${parameters.temperature}, \"anthropic_version\":\"${parameters.anthropic_version}\" }", + "action_type": "predict", + "url": "https://bedrock-runtime.us-west-2.amazonaws.com/model/anthropic.claude-instant-v1/invoke" + } + ], + "credential": { + "access_key": "", + "secret_key": "", + "session_token": "" + }, + "parameters": { + "region": "us-west-2", + "endpoint": "bedrock-runtime.us-west-2.amazonaws.com", + "content_type": "application/json", + "auth": "Sig_V4", + "max_tokens_to_sample": "8000", + "service_name": "bedrock", + "temperature": "0.0001", + "response_filter": "$.completion", + "anthropic_version": "bedrock-2023-05-31" + } + } + }, + { + "id": "register_claude_model", + "type": "register_remote_model", + "previous_node_inputs": { + "create_claude_connector": "connector_id" + }, + "user_inputs": { + "description": "Claude model", + "deploy": true, + "name": "claude-instant", + "guardrails": { + "type": "local_regex", + "input_guardrail": { + "stop_words": [ + { + "index_name": "words0", + "source_fields": ["title"] + } + ], + "regex": ["regex1", "regex2"] + }, + "output_guardrail": { + "stop_words": [ + { + "index_name": "words0", + "source_fields": ["title"] + } + ], + "regex": ["regex1", "regex2"] + } + } + } + }, + { + "id": "TransferQuestionToPPLAndExecuteTool", + "type": "create_tool", + "previous_node_inputs": { + "register_claude_model": "model_id" + }, + "user_inputs": { + "type": "PPLTool", + "name": "TransferQuestionToPPLAndExecuteTool", + "description": "Use this tool to transfer natural language to generate PPL and execute PPL to query inside. Use this tool after you know the index name, otherwise, call IndexRoutingTool first. The input parameters are: {index:IndexName, question:UserQuestion}", + "parameters": { + "response_filter": "$.completion", + "execute": false + }, + "include_output_in_agent_response": true + } + }, + { + "id": "summarize_success_tool", + "type": "create_tool", + "previous_node_inputs": { + "register_claude_model": "model_id" + }, + "user_inputs": { + "type": "MLModelTool", + "Name": "SummarizeSuccessTool", + "description": "Use this tool to summarize a PPL success response in query assist", + "parameters": { + "prompt": "\n\nHuman: You will be given a search response, summarize it as a concise paragraph while considering the following:\nUser's question on index '${parameters.index}': ${parameters.question}\nPPL (Piped Processing Language) query used: ${parameters.query}\n\nGive some documents to support your point.\nNote that the output could be truncated, summarize what you see. Don't mention about total items returned and don't mention about the fact that output is truncated if you see 'Output is too long, truncated' in the response.\n\nSkip the introduction; go straight into the summarization.\n\nUse the following pieces of context to answer the users question.\nIf you don't know the answer, just say that you don't know, don't try to make up an answer.\n----------------\n${parameters.response}\n\nAssistant:", + "response_filter": "$.completion" + } + } + }, + { + "id": "summarize_error_tool", + "type": "create_tool", + "previous_node_inputs": { + "register_claude_model": "model_id" + }, + "user_inputs": { + "type": "MLModelTool", + "name": "SummarizeErrorTool", + "description": "Use this tool to summarize a PPL error response in query assist", + "include_output_in_agent_response": true, + "parameters": { + "prompt": "\n\nHuman: You will be given an API response with errors, summarize it as a concise paragraph. Do not try to answer the user's question.\nIf the error cannot be fixed, eg. no such field or function not supported, then give suggestions to rephrase the question.\nIt is imperative that you must not give suggestions on how to fix the error or alternative PPL query, or answers to the question.\n\nConsider the following:\nUser's question on index '${parameters.index}': ${parameters.question}\nPPL (Piped Processing Language) query used: ${parameters.query}\n\nSkip the introduction; go straight into the summarization.\n\nUse the following pieces of context to answer the users question.\nIf you don't know the answer, just say that you don't know, don't try to make up an answer.\n----------------\n${parameters.response}\n\nAssistant:", + "response_filter": "$.completion" + } + } + }, + { + "id": "suggestions_tool", + "type": "create_tool", + "previous_node_inputs": { + "register_claude_model": "model_id" + }, + "user_inputs": { + "type": "MLModelTool", + "name": "SuggestionsTool", + "description": "Use this tool to generate possible questions for an index in query assist", + "include_output_in_agent_response": true, + "parameters": { + "prompt": "\n\nHuman: OpenSearch index: ${parameters.index}\n\nRecommend 2 or 3 possible questions on this index given the fields below. Only give the questions, do not give descriptions of questions and do not give PPL queries.\n\nThe format for a field is\n```\n- field_name: field_type (sample field value)\n```\n\nFields:\n${parameters.fields}\n\nPut each question in a tag.\n\nAssistant:", + "response_filter": "$.completion" + } + } + }, + { + "id": "ppl_agent", + "type": "register_agent", + "previous_node_inputs": { + "TransferQuestionToPPLAndExecuteTool": "tools" + }, + "user_inputs": { + "parameters": {}, + "app_type": "query_assist", + "name": "PPL agent", + "description": "this is the PPL agent", + "type": "flow" + } + } + ] + } + } +} +``` +{% include copy-curl.html %} + +
+ +For sample agent templates, see [Flow Framework sample templates](https://github.com/opensearch-project/flow-framework/tree/2.x/sample-templates). Note the agent ID; you'll use it in the following step. + +### Step 3: Create a root agent + +Next, create a [root agent]({{site.url}}{{site.baseurl}}/automating-configurations/workflow-tutorial/#root_agent) for the data summary agent created in the previous step: + +```json +POST /.plugins-ml-config/_doc/os_data2summary +{ + "type": "os_root_agent", + "configuration": { + "agent_id": "" + } +} +``` +{% include copy-curl.html %} + +This example demonstrates a system index. In security-enabled domains, only superadmins have permissions to execute this code. For information about making superadmin calls, see [System indexes]({{site.url}}{{site.baseurl}}/security/configuration/system-indices/). For access permissions, contact your system administrator. +{: .warning} + +### Step 4: Test the agent + +You can verify that the data summary agent was created successfully by calling the agent with an example payload: + +```json +POST /_plugins/_ml/agents//_execute +{ + "parameters": { + "sample_data":"'[{\"_index\":\"90943e30-9a47-11e8-b64d-95841ca0b247\",\"_source\":{\"referer\":\"http://twitter.com/success/gemini-9a\",\"request\":\"/beats/metricbeat/metricbeat-6.3.2-amd64.deb\",\"agent\":\"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)\",\"extension\":\"deb\",\"memory\":null,\"ip\":\"239.67.210.53\",\"index\":\"opensearch_dashboards_sample_data_logs\",\"message\":\"239.67.210.53 - - [2018-08-30T15:29:01.686Z] \\\"GET /beats/metricbeat/metricbeat-6.3.2-amd64.deb HTTP/1.1\\\" 404 2633 \\\"-\\\" \\\"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)\\\"\",\"url\":\"https://artifacts.opensearch.org/downloads/beats/metricbeat/metricbeat-6.3.2-amd64.deb\",\"tags\":\"success\",\"geo\":{\"srcdest\":\"CN:PL\",\"src\":\"CN\",\"coordinates\":{\"lat\":44.91167028,\"lon\":-108.4455092},\"dest\":\"PL\"},\"utc_time\":\"2024-09-05 15:29:01.686\",\"bytes\":2633,\"machine\":{\"os\":\"win xp\",\"ram\":21474836480},\"response\":\"404\",\"clientip\":\"239.67.210.53\",\"host\":\"artifacts.opensearch.org\",\"event\":{\"dataset\":\"sample_web_logs\"},\"phpmemory\":null,\"timestamp\":\"2024-09-05 15:29:01.686\"}}]'", + "sample_count":1, + "total_count":383, + "question":"Are there any errors in my logs?", + "ppl":"source=opensearch_dashboards_sample_data_logs| where QUERY_STRING(['response'], '4* OR 5*')"} +} +``` +{% include copy-curl.html %} + +## Generating a data summary + +You can generate a data summary by calling the `/api/assistant/data2summary` API endpoint. The `sample_count`, `total_count`, `question`, and `ppl` parameters are optional: + +```json +POST /api/assistant/data2summary +{ + "sample_data":"'[{\"_index\":\"90943e30-9a47-11e8-b64d-95841ca0b247\",\"_source\":{\"referer\":\"http://twitter.com/success/gemini-9a\",\"request\":\"/beats/metricbeat/metricbeat-6.3.2-amd64.deb\",\"agent\":\"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)\",\"extension\":\"deb\",\"memory\":null,\"ip\":\"239.67.210.53\",\"index\":\"opensearch_dashboards_sample_data_logs\",\"message\":\"239.67.210.53 - - [2018-08-30T15:29:01.686Z] \\\"GET /beats/metricbeat/metricbeat-6.3.2-amd64.deb HTTP/1.1\\\" 404 2633 \\\"-\\\" \\\"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)\\\"\",\"url\":\"https://artifacts.opensearch.org/downloads/beats/metricbeat/metricbeat-6.3.2-amd64.deb\",\"tags\":\"success\",\"geo\":{\"srcdest\":\"CN:PL\",\"src\":\"CN\",\"coordinates\":{\"lat\":44.91167028,\"lon\":-108.4455092},\"dest\":\"PL\"},\"utc_time\":\"2024-09-05 15:29:01.686\",\"bytes\":2633,\"machine\":{\"os\":\"win xp\",\"ram\":21474836480},\"response\":\"404\",\"clientip\":\"239.67.210.53\",\"host\":\"artifacts.opensearch.org\",\"event\":{\"dataset\":\"sample_web_logs\"},\"phpmemory\":null,\"timestamp\":\"2024-09-05 15:29:01.686\"}}]'", + "sample_count":1, + "total_count":383, + "question":"Are there any errors in my logs?", + "ppl":"source=opensearch_dashboards_sample_data_logs| where QUERY_STRING(['response'], '4* OR 5*')" +} +``` +{% include copy-curl.html %} + +The following table describes the Assistant Data Summary API parameters. + +Parameter | Required/Optional | Description +:--- | :--- | :--- +`sample_data` | Required | A sample of data returned by the specified query and used as input for summarization. +`question` | Optional | The user's natural language question about the data, which guides the summary generation. +`ppl` | Optional | The Piped Processing Language (PPL) query used to retrieve data; in query assistance, this is generated by the LLM using the user's natural language question. +`sample_count` | Optional | The number of entries included in sample_data. +`total_count` | Optional | The total number of entries in the full query result set. + +## Viewing data summaries in OpenSearch Dashboards + +To view alert insights in OpenSearch Dashboards, use the following steps: + +1. On the top menu bar, go to **OpenSearch Dashboards > Discover**. + +1. From the query language dropdown list, select **PPL**. You will see the generated data summary after the query text, as shown in the following image. + + data summary diff --git a/_dashboards/dashboards-assistant/index.md b/_dashboards/dashboards-assistant/index.md index bf2d754be8..615a53b75f 100644 --- a/_dashboards/dashboards-assistant/index.md +++ b/_dashboards/dashboards-assistant/index.md @@ -2,7 +2,7 @@ layout: default title: OpenSearch Assistant for OpenSearch Dashboards nav_order: 3 -has_children: false +has_children: true has_toc: false --- @@ -22,7 +22,7 @@ To enable **OpenSearch Assistant** in OpenSearch Dashboards, locate your copy of ```yaml assistant.chat.enabled: true ``` -{% include copy-curl.html %} +{% include copy.html %} Then configure the root `agent_id` through the following API: @@ -131,8 +131,17 @@ assistant.next.enabled: true ``` {% include copy-curl.html %} +## Additional Dashboards Assistant capabilities + +For information about additional Dashboards Assistant capabilities, see the following pages: + +- [Generating alert insights]({{site.url}}{{site.baseurl}}/dashboards/dashboards-assistant/alert-insight/) +- [Generating data summaries]({{site.url}}{{site.baseurl}}/dashboards/dashboards-assistant/data-summary/) +- [Generating anomaly detector suggestions]({{site.url}}{{site.baseurl}}/dashboards/dashboards-assistant/suggest-anomaly-detector/) +- [Generating visualizations from text]({{site.url}}{{site.baseurl}}/dashboards/dashboards-assistant/text-to-visualization/) + ## Related articles - [Getting started guide for OpenSearch Assistant in OpenSearch Dashboards](https://github.com/opensearch-project/dashboards-assistant/blob/main/GETTING_STARTED_GUIDE.md) - [OpenSearch Assistant configuration through the REST API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/opensearch-assistant/) -- [Build your own chatbot]({{site.url}}{{site.baseurl}}/ml-commons-plugin/tutorials/build-chatbot/) \ No newline at end of file +- [Build your own chatbot]({{site.url}}{{site.baseurl}}/ml-commons-plugin/tutorials/build-chatbot/) diff --git a/_dashboards/dashboards-assistant/suggest-anomaly-detector.md b/_dashboards/dashboards-assistant/suggest-anomaly-detector.md new file mode 100644 index 0000000000..8f4aac80fd --- /dev/null +++ b/_dashboards/dashboards-assistant/suggest-anomaly-detector.md @@ -0,0 +1,89 @@ +--- +layout: default +title: Anomaly detector suggestions +parent: OpenSearch Assistant for OpenSearch Dashboards +nav_order: 1 +has_children: false +--- + +# Anomaly detector suggestions + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, join the discussion on the [OpenSearch forum](https://forum.opensearch.org/). +{: .warning} + +The OpenSearch Dashboards Assistant can use a large language model (LLM) to suggest the creation of an anomaly detector. The LLM analyzes data patterns in your OpenSearch indexes and recommends configuration settings for the anomaly detector, making it easier to identify unusual activity or trends in your data. + +## Configuration + +To configure anomaly detector suggestions, use the following steps. + +### Prerequisite + +Before using anomaly detector suggestions, enable query enhancements in OpenSearch Dashboards as follows: + +1. On the top menu bar, go to **Management > Dashboards Management**. +1. In the left navigation pane, select **Advanced settings**. +1. On the settings page, toggle **Enable query enhancements** to **On**. + +### Step 1: Enable anomaly detector suggestions + +To enable anomaly detector suggestions, configure the following `opensearch_dashboards.yml` setting: + +```yaml +assistant.smartAnomalyDetector.enabled: true +``` +{% include copy.html %} + +### Step 2: Create an anomaly detector suggestion agent + +To orchestrate anomaly detector suggestions, create an anomaly detector suggestion [agent]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/index/#agents). To create an agent, send a `POST /_plugins/_flow_framework/workflow?provision=true` request and provide the agent template as a payload. For more information, see [Configuring OpenSearch Assistant]({{site.url}}{{site.baseurl}}/dashboards/dashboards-assistant/index/#configuring-opensearch-assistant). + +For sample agent templates, see [Flow Framework sample templates](https://github.com/opensearch-project/flow-framework/tree/2.x/sample-templates). Note the agent ID; you'll use it in the following step. + +### Step 3: Configure the agent + +Next, configure the anomaly detector suggestion agent created in the previous step: + +```json +POST /.plugins-ml-config/_doc/os_suggest_ad +{ + "type": "suggest_anomaly_detector_agent", + "configuration": { + "agent_id": "" + } +} +``` +{% include copy-curl.html %} + +This example demonstrates a system index. In security-enabled domains, only superadmins have permissions to execute this code. For information about making superadmin calls, see [System indexes]({{site.url}}{{site.baseurl}}/security/configuration/system-indices/). For access permissions, contact your system administrator. +{: .warning} + +### Step 4: Test the agent + +You can verify that the agent was created successfully by calling the agent with an example payload: + +```json +POST /_plugins/_ml/agents//_execute +{ + "parameters": { + "index":"sample_weblogs_test" + } +} +``` +{% include copy-curl.html %} + +## Viewing anomaly detector suggestions in OpenSearch Dashboards + +To view anomaly detector suggestions in OpenSearch Dashboards, use the following steps: + +1. On the top menu bar, go to **OpenSearch Dashboards > Discover**. + +1. From the index pattern dropdown list, select an index pattern. + +1. Select the **AI assistant** dropdown list and then select **Suggest anomaly detector**, as shown in the following image. + + Click the Suggest anomaly detector action + +1. Wait for the LLM to populate the **Suggest anomaly detector** fields that will be used to create an anomaly detector for the index pattern. Then select the **Create detector** button to create an anomaly detector, as shown in the following image. + + Suggested anomaly detector diff --git a/_dashboards/dashboards-assistant/text-to-visualization.md b/_dashboards/dashboards-assistant/text-to-visualization.md new file mode 100644 index 0000000000..a30ca6d1f8 --- /dev/null +++ b/_dashboards/dashboards-assistant/text-to-visualization.md @@ -0,0 +1,286 @@ +--- +layout: default +title: Text to visualization +parent: OpenSearch Assistant for OpenSearch Dashboards +nav_order: 1 +has_children: false +--- + +# Text to visualization + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, join the discussion on the [OpenSearch forum](https://forum.opensearch.org/). +{: .warning} + +The OpenSearch Dashboards Assistant can create visualizations using natural language instructions. + +## Configuration + +To configure text to visualization, use the following steps. + +### Step 1: Enable text to visualization + +To enable text to visualization, configure the following `opensearch_dashboards.yml` setting: + +```yaml +assistant.text2viz.enabled: true +``` +{% include copy.html %} + +### Step 2: Create the agents + +To orchestrate text to visualization, you'll need to create the necessary [agents]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/index/#agents). Create a workflow template for creating all necessary text-to-visualization agents by sending the following request: + +
+ + Request + + {: .text-delta} + +```json +POST /_plugins/_flow_framework/workflow +{ + "name": "Text to visualization agents", + "description": "This template is to create all Agents required for text to visualization", + "use_case": "REGISTER_AGENTS", + "version": { + "template": "1.0.0", + "compatibility": [ + "2.18.0", + "3.0.0" + ] + }, + "workflows": { + "provision": { + "user_params": {}, + "nodes": [ + { + "id": "create_claude_connector", + "type": "create_connector", + "previous_node_inputs": {}, + "user_inputs": { + "credential": { + "access_key": "", + "secret_key": "", + "session_token": "" + }, + "parameters": { + "endpoint": "bedrock-runtime.us-east-1.amazonaws.com", + "content_type": "application/json", + "auth": "Sig_V4", + "max_tokens_to_sample": "8000", + "service_name": "bedrock", + "temperature": "0.0000", + "response_filter": "$.content[0].text", + "region": "us-east-1", + "anthropic_version": "bedrock-2023-05-31" + }, + "version": "1", + "name": "Claude haiku runtime Connector", + "protocol": "aws_sigv4", + "description": "The connector to BedRock service for claude model", + "actions": [ + { + "action_type": "predict", + "method": "POST", + "url": "https://bedrock-runtime.us-east-1.amazonaws.com/model/anthropic.claude-3-haiku-20240307-v1:0/invoke", + "headers": { + "content-type": "application/json", + "x-amz-content-sha256": "required" + }, + "request_body": "{\"messages\":[{\"role\":\"user\",\"content\":[{\"type\":\"text\",\"text\":\"${parameters.prompt}\"}]}],\"anthropic_version\":\"${parameters.anthropic_version}\",\"max_tokens\":${parameters.max_tokens_to_sample}}" + } + ] + } + }, + { + "id": "register_claude_model", + "type": "register_remote_model", + "previous_node_inputs": { + "create_claude_connector": "connector_id" + }, + "user_inputs": { + "name": "claude-haiku", + "description": "Claude model", + "deploy": true + } + }, + { + "id": "create_t2vega_tool", + "type": "create_tool", + "previous_node_inputs": { + "register_claude_model": "model_id" + }, + "user_inputs": { + "parameters": { + "prompt": "You're an expert at creating vega-lite visualization. No matter what the user asks, you should reply with a valid vega-lite specification in json.\nYour task is to generate Vega-Lite specification in json based on the given sample data, the schema of the data, the PPL query to get the data and the user's input.\nLet's start from dimension and metric/date. Now I have a question, I already transfer it to PPL and query my Opensearch cluster. \nThen I get data. For the PPL, it will do aggregation like \"stats AVG(field_1) as avg, COUNT(field_2) by field_3, field_4, field_5\". \nIn this aggregation, the metric is [avg, COUNT(field_2)] , and then we judge the type of field_3,4,5. If only field_5 is type related to date, the dimension is [field_3, field_4], and date is [field_5]\nFor example, stats SUM(bytes) by span(timestamp, 1w), machine.os, response, then SUM(bytes) is metric and span(timestamp, 1w) is date, while machine.os, response are dimensions.\nNotice: Some fields like 'span()....' will be the date, but not metric and dimension. \nAnd one field will only count once in dimension count. You should always pick field name from schema\nTo summarize,\nA dimension is a categorical variable that is used to group, segment, or categorize data. It is typically a qualitative attribute that provides context for metrics and is used to slice and dice data to see how different categories perform in relation to each other.\nThe dimension is not date related fields. The dimension and date are very closed. The only difference is date is related to datetime, while dimension is not.\nA metric is a quantitative measure used to quantify or calculate some aspect of the data. Metrics are numerical and typically represent aggregated values like sums, averages, counts, or other statistical calculations.\n\nIf a ppl doesn't have aggregation using 'stats', then each field in output is dimension.\nOtherwise, if a ppl use aggregation using 'stats' but doesn't group by using 'by', then each field in output is metric.\n\nThen for each given PPL, you could give the metric and dimension and date. One field will in only one of the metric, dimension or date.\n\nThen according to the metric number and dimension number of PPL result, you should first format the entrance code by metric_number, dimension_number, and date_number. For example, if metric_number = 1, dimension_number = 2, date_number=1, then the entrance code is 121.\nI define several use case categories here according to the entrance code.\nFor each category, I will define the entrance condition (number of metric and dimension)\nI will also give some defined attribute of generated vega-lite. Please refer to it to generate vega-lite.\n\nType 1:\nEntrance code: <1, 1, 0>\nDefined Attributes:\n {\n \"title\": \"\",\n \"description\": \"<description>\",\n \"mark\": \"bar\",\n \"encoding\": {\n \"x\": {\n \"field\": \"<metric name>\",\n \"type\": \"quantitative\"\n },\n \"y\": {\n \"field\": \"<dimension name>\",\n \"type\": \"nominal\"\n }\n },\n }\n\nType 2:\nEntrance code: <1, 2, 0>\nDefined Attributes:\n{\n \"mark\": \"bar\",\n \"encoding\": {\n \"x\": {\n \"field\": \"<metric 1>\",\n \"type\": \"quantitative\"\n },\n \"y\": {\n \"field\": \"<dimension 1>\",\n \"type\": \"nominal\"\n },\n \"color\": {\n \"field\": \"<dimension 2>\",\n \"type\": \"nominal\"\n }\n }\n }\n\n\nType 3\nEntrance code: <3, 1, 0>\nDefined Attributes:\n{\n \"mark\": \"point\",\n \"encoding\": {\n \"x\": {\n \"field\": \"<metric 1>\",\n \"type\": \"quantitative\"\n },\n \"y\": {\n \"field\": \"<metric 2>\",\n \"type\": \"quantitative\"\n },\n \"size\": {\n \"field\": \"<metric 3>\",\n \"type\": \"quantitative\"\n },\n \"color\": {\n \"field\": \"<dimension 1>\",\n \"type\": \"nominal\"\n }\n }\n}\n\nType 4\nEntrance code: <2, 1, 0>\nDefined Attributes:\n{\n \"mark\": \"point\",\n \"encoding\": {\n \"x\": {\n \"field\": \"<mtric 1>\",\n \"type\": \"quantitative\"\n },\n \"y\": {\n \"field\": \"<metric 2>\",\n \"type\": \"quantitative\"\n },\n \"color\": {\n \"field\": \"<dimension 1>\",\n \"type\": \"nominal\"\n }\n }\n}\n\nType 5:\nEntrance code: <2, 1, 1>\nDefined Attributes:\n{\n \"layer\": [\n {\n \"mark\": \"bar\",\n \"encoding\": {\n \"x\": {\n \"field\": \"<date 1>\",\n \"type\": \"temporal\"\n },\n \"y\": {\n \"field\": \"<metric 1>\",\n \"type\": \"quantitative\",\n \"axis\": {\n \"title\": \"<metric 1 name>\"\n }\n },\n \"color\": {\n \"field\": \"<dimension 1>\",\n \"type\": \"nominal\"\n }\n }\n },\n {\n \"mark\": {\n \"type\": \"line\",\n \"color\": \"red\"\n },\n \"encoding\": {\n \"x\": {\n \"field\": \"<date 1>\",\n \"type\": \"temporal\"\n },\n \"y\": {\n \"field\": \"<metric 2>\",\n \"type\": \"quantitative\",\n \"axis\": {\n \"title\": \"<metric 2 name>\",\n \"orient\": \"right\"\n }\n },\n \"color\": {\n \"field\": \"<dimension 1>\",\n \"type\": \"nominal\"\n }\n }\n }\n ],\n \"resolve\": {\n \"scale\": {\n \"y\": \"independent\"\n }\n }\n }\n\nType 6:\nEntrance code: <2, 0, 1>\nDefined Attributes:\n{\n \"title\": \"<title>\",\n \"description\": \"<description>\",\n \"layer\": [\n {\n \"mark\": \"area\",\n \"encoding\": {\n \"x\": {\n \"field\": \"<date 1>\",\n \"type\": \"temporal\"\n },\n \"y\": {\n \"field\": \"<metric 1>\",\n \"type\": \"quantitative\",\n \"axis\": {\n \"title\": \"<metric 1 name>\"\n }\n }\n }\n },\n {\n \"mark\": {\n \"type\": \"line\",\n \"color\": \"black\"\n },\n \"encoding\": {\n \"x\": {\n \"field\": \"date\",\n \"type\": \"temporal\"\n },\n \"y\": {\n \"field\": \"metric 2\",\n \"type\": \"quantitative\",\n \"axis\": {\n \"title\": \"<metric 2 name>\",\n \"orient\": \"right\"\n }\n }\n }\n }\n ],\n \"resolve\": {\n \"scale\": {\n \"y\": \"independent\"\n }\n }\n }\n \nType 7:\nEntrance code: <1, 0, 1>\nDefined Attributes:\n{\n \"title\": \"<title>\",\n \"description\": \"<description>\",\n \"mark\": \"line\",\n \"encoding\": {\n \"x\": {\n \"field\": \"<date 1>\",\n \"type\": \"temporal\",\n \"axis\": {\n \"title\": \"<date name>\"\n }\n },\n \"y\": {\n \"field\": \"<metric 1>\",\n \"type\": \"quantitative\",\n \"axis\": {\n \"title\": \"<metric name>\"\n }\n }\n }\n }\n\nType 8:\nEntrance code: <1, 1, 1>\nDefined Attributes:\n{\n \"title\": \"<title>\",\n \"description\": \"<description>\",\n \"mark\": \"line\",\n \"encoding\": {\n \"x\": {\n \"field\": \"<date 1>\",\n \"type\": \"temporal\",\n \"axis\": {\n \"title\": \"<date name>\"\n }\n },\n \"y\": {\n \"field\": \"<metric 1>\",\n \"type\": \"quantitative\",\n \"axis\": {\n \"title\": \"<metric name>\"\n }\n },\n \"color\": {\n \"field\": \"<dimension 1>\",\n \"type\": \"nominal\",\n \"legend\": {\n \"title\": \"<dimension name>\"\n }\n }\n }\n }\n\nType 9:\nEntrance code: <1, 2, 1>\nDefined Attributes:\n{\n \"title\": \"<title>\",\n \"description\": \"<description>\",\n \"mark\": \"line\",\n \"encoding\": {\n \"x\": {\n \"field\": \"<date 1>\",\n \"type\": \"temporal\",\n \"axis\": {\n \"title\": \"<date name>\"\n }\n },\n \"y\": {\n \"field\": \"<metric 1>\",\n \"type\": \"quantitative\",\n \"axis\": {\n \"title\": \"<metric 1>\"\n }\n },\n \"color\": {\n \"field\": \"<dimension 1>\",\n \"type\": \"nominal\",\n \"legend\": {\n \"title\": \"<dimension 1>\"\n }\n },\n \"facet\": {\n \"field\": \"<dimension 2>\",\n \"type\": \"nominal\",\n \"columns\": 2\n }\n }\n }\n\nType 10:\nEntrance code: all other code\nAll others type.\nUse a table to show the result\n\n\nBesides, here are some requirements:\n1. Do not contain the key called 'data' in vega-lite specification.\n2. If mark.type = point and shape.field is a field of the data, the definition of the shape should be inside the root \"encoding\" object, NOT in the \"mark\" object, for example, {\"encoding\": {\"shape\": {\"field\": \"field_name\"}}}\n3. Please also generate title and description\n\nThe sample data in json format:\n${parameters.sampleData}\n\nThis is the schema of the data:\n${parameters.dataSchema}\n\nThe user used this PPL query to get the data: ${parameters.ppl}\n\nThe user's question is: ${parameters.input_question}\n\nNotice: Some fields like 'span()....' will be the date, but not metric and dimension. \nAnd one field will only count once in dimension count. You should always pick field name from schema.\n And when you code is <2, 1, 0>, it belongs type 4.\n And when you code is <1, 2, 0>, it belongs type 9.\n\n\nNow please reply a valid vega-lite specification in json based on above instructions.\nPlease return the number of dimension, metric and date. Then choose the type. \nPlease also return the type.\nFinally return the vega-lite specification according to the type.\nPlease make sure all the key in the schema matches the word I given. \nYour answer format should be:\nNumber of metrics:[list the metric name here, Don't use duplicate name] <number of metrics {a}> \nNumber of dimensions:[list the dimension name here] <number of dimension {b}> \nNumber of dates:[list the date name here] <number of dates {c}> \nThen format the entrance code by: <Number of metrics, Number of dimensions, Number of dates>\nType and its entrance code: <type number>: <its entrance code>\nThen apply the vega-lite requirements of the type.\n<vega-lite> {here is the vega-lite json} </vega-lite>\n\nAnd don't use 'transformer' in your vega-lite and wrap your vega-lite json in <vega-lite> </vega-lite> tags\n" + }, + "name": "Text2Vega", + "type": "MLModelTool" + } + }, + { + "id": "create_instruction_based_t2vega_tool", + "type": "create_tool", + "previous_node_inputs": { + "register_claude_model": "model_id" + }, + "user_inputs": { + "parameters": { + "prompt": "You're an expert at creating vega-lite visualization. No matter what the user asks, you should reply with a valid vega-lite specification in json.\nYour task is to generate Vega-Lite specification in json based on the given sample data, the schema of the data, the PPL query to get the data and the user's input.\n\nBesides, here are some requirements:\n1. Do not contain the key called 'data' in vega-lite specification.\n2. If mark.type = point and shape.field is a field of the data, the definition of the shape should be inside the root \"encoding\" object, NOT in the \"mark\" object, for example, {\"encoding\": {\"shape\": {\"field\": \"field_name\"}}}\n3. Please also generate title and description\n\nThe sample data in json format:\n${parameters.sampleData}\n\nThis is the schema of the data:\n${parameters.dataSchema}\n\nThe user used this PPL query to get the data: ${parameters.ppl}\n\nThe user's input question is: ${parameters.input_question}\nThe user's instruction on the visualization is: ${parameters.input_instruction}\n\nNow please reply a valid vega-lite specification in json based on above instructions.\nPlease only contain vega-lite in your response.\n" + }, + "name": "Text2Vega", + "type": "MLModelTool" + } + }, + { + "id": "t2vega_agent", + "type": "register_agent", + "previous_node_inputs": { + "create_t2vega_tool": "tools" + }, + "user_inputs": { + "parameters": {}, + "type": "flow", + "name": "t2vega agent", + "description": "this is the t2vega agent that has a set of rules to generate the visualizations" + } + }, + { + "id": "t2vega_instruction_based_agent", + "type": "register_agent", + "previous_node_inputs": { + "create_instruction_based_t2vega_tool": "tools" + }, + "user_inputs": { + "parameters": {}, + "type": "flow", + "name": "t2vega instruction based agent", + "description": "this is the t2vega agent that supports instructions" + } + } + ] + } + } +} +``` +{% include copy-curl.html %} + +</details> + +Use the workflow ID returned in the response to provision the resources: + +```json +POST /_plugins/_flow_framework/workflow/<workflow_id>/_provision +``` +{% include copy-curl.html %} + +To view the status of the workflow and all created resources, send the following request: + +```json +GET /_plugins/_flow_framework/workflow/<workflow_id>/_status +``` +{% include copy-curl.html %} + +### Step 3: Configure the root agent + +Next, configure a root agent for text to visualization: + +```json +POST /.plugins-ml-config/_doc/os_text2vega +{ + "type": "os_chat_root_agent", + "configuration": { + "agent_id": "<ROOT_AGENT_ID>" + } +} +``` +{% include copy-curl.html %} + +Configure the agent to receive user instructions for creating visualizations: + +```json +POST /.plugins-ml-config/_doc/os_text2vega_with_instructions +{ + "type": "os_chat_root_agent", + "configuration": { + "agent_id": "<ROOT_AGENT_ID>" + } +} +``` +{% include copy-curl.html %} + +This example demonstrates a system index. In security-enabled domains, only superadmins have permissions to execute this code. For information about making superadmin calls, see [System indexes]({{site.url}}{{site.baseurl}}/security/configuration/system-indices/). For access permissions, contact your system administrator. +{: .warning} + +### Step 4: Test the agent + +You can verify that the agent was created successfully by calling the agent with an example payload: + +```json +POST /_plugins/_ml/agents/<ROOT_AGENT_ID>/_execute +{ + "parameters": { + "input_question": "find unique visitors and average bytes every 3 hours", + "input_instruction": "display with different layers, use independent scale for different layers, display unique visitors with light blue bar chart", + "ppl": "source=opensearch_dashboards_sample_data_ecommerce| stats DISTINCT_COUNT(user) as unique_visitors, AVG(taxful_total_price) as avg_bytes by span(order_date, 3h)", + "sampleData": """[{\"unique_visitors\":15,\"avg_bytes\":90.98684210526316,\"span(order_date,3h)\":\"2024-04-25 00:00:00\"},{\"unique_visitors\":14,\"avg_bytes\":72.72083333333333,\"span(order_date,3h)\":\"2024-04-25 03:00:00\"}]""", + "dataSchema": """[{\"name\":\"unique_visitors\",\"type\":\"integer\"},{\"name\":\"avg_bytes\",\"type\":\"double\"},{\"name\":\"span(order_date,3h)\",\"type\":\"timestamp\"}]""" + } +} +``` +{% include copy-curl.html %} + +## Generating a visualization from text + +You can generate a visualization from text by calling the `/api/assistant/text2vega` API endpoint. The `input_instruction` parameter is optional: + +```json +POST /api/assistant/text2vega +{ + "input_instruction": "<input_instruction>", + "input_question": "<input_question>", + "ppl": "<ppl_query>", + "dataSchema": "<data_schema_of_ppl_response>", + "sampleData": "<sample_data_of_ppl_response>" +} +``` +{% include copy-curl.html %} + +The following table describes the Text to Visualization API parameters. + +Parameter | Required/Optional | Description +:--- | :--- | :--- +`input_question` | Required | The user's original question used to generate the corresponding Piped Processing Language (PPL) query. +`ppl` | Required | The generated PPL query that retrieves the data required for the visualization. +`dataSchema` | Required | Describes the structure and types of the data fields in the visualization output, based on the PPL response. +`sampleData` | Required | Provides sample entries from the data that will populate the visualization. +`input_instruction` | Optional | Specifies the styling instructions, such as colors, for the visualization. + +## Generating visualizations from text in OpenSearch Dashboards + +To generate visualizations from text in OpenSearch Dashboards, use the following steps: + +1. On the top menu bar, go to **OpenSearch Dashboards > Visualize** and then select **Create visualization**. + +1. In the **New Visualization** dialog, select **Natural language**, as shown in the following image. + + <img width="800px" src="{{site.url}}{{site.baseurl}}/images/dashboards-assistant/t2viz-start.png" alt="Create a visualization by selecting natural language"> + +1. From the data sources dropdown list, select a data source, as shown in the following image. + + <img src="{{site.url}}{{site.baseurl}}/images/dashboards-assistant/t2viz-select-data-source.png" alt="Create a visualization by selecting natural language"> + +1. In the text box on the upper right, enter a question using natural language. A new visualization is generated, as shown in the following image. + + <img src="{{site.url}}{{site.baseurl}}/images/dashboards-assistant/t2viz-ask-question.png" alt="Create a visualization by selecting natural language"> + +1. To modify the generated visualization, select **Edit visual**. In the **Edit visual** dialog, enter the desired modifications and then select **Apply**, as shown in the following image. + + <img src="{{site.url}}{{site.baseurl}}/images/dashboards-assistant/t2viz-edit-visual.png" alt="Create a visualization by selecting natural language"> + + The visualization is updated, as shown in the following image. + + <img src="{{site.url}}{{site.baseurl}}/images/dashboards-assistant/t2viz-edit-visual-response.png" alt="Create a visualization by selecting natural language"> + + + diff --git a/_dashboards/management/accelerate-external-data.md b/_dashboards/management/accelerate-external-data.md index 6d1fa030e4..00eb8671ec 100644 --- a/_dashboards/management/accelerate-external-data.md +++ b/_dashboards/management/accelerate-external-data.md @@ -1,10 +1,8 @@ --- layout: default title: Optimize query performance using OpenSearch indexing -parent: Connecting Amazon S3 to OpenSearch -grand_parent: Data sources -nav_order: 15 -has_children: false +parent: Data sources +nav_order: 17 --- # Optimize query performance using OpenSearch indexing @@ -14,35 +12,171 @@ Introduced 2.11 Query performance can be slow when using external data sources for reasons such as network latency, data transformation, and data volume. You can optimize your query performance by using OpenSearch indexes, such as a skipping index or a covering index. -A _skipping index_ uses skip acceleration methods, such as partition, minimum and maximum values, and value sets, to ingest and create compact aggregate data structures. This makes them an economical option for direct querying scenarios. +- A _skipping index_ uses skip acceleration methods, such as partition, minimum and maximum values, and value sets, to ingest and create compact aggregate data structures. This makes them an economical option for direct querying scenarios. For more information, see [Skipping indexes](https://opensearch.org/docs/latest/dashboards/management/accelerate-external-data/#skipping-indexes). +- A _covering index_ ingests all or some of the data from the source into OpenSearch and makes it possible to use all OpenSearch Dashboards and plugin functionality. For more information, see [Covering indexes](https://opensearch.org/docs/latest/dashboards/management/accelerate-external-data/#covering-indexes). +- A _materialized view_ enhances query performance by storing precomputed and aggregated data from the source data. For more information, see [Materialized views](https://opensearch.org/docs/latest/dashboards/management/accelerate-external-data/#materialized-views). -A _covering index_ ingests all or some of the data from the source into OpenSearch and makes it possible to use all OpenSearch Dashboards and plugin functionality. See the [Flint Index Reference Manual](https://github.com/opensearch-project/opensearch-spark/blob/main/docs/index.md) for comprehensive guidance on this feature's indexing process. +For comprehensive guidance on each indexing process, see the [Flint Index Reference Manual](https://github.com/opensearch-project/opensearch-spark/blob/main/docs/index.md). ## Data sources use case: Accelerate performance -To get started with the **Accelerate performance** use case available in **Data sources**, follow these steps: +To get started with accelerating query performance, perform the following steps: -1. Go to **OpenSearch Dashboards** > **Query Workbench** and select your Amazon S3 data source from the **Data sources** dropdown menu in the upper-left corner. -2. From the left-side navigation menu, select a database. -3. View the results in the table and confirm that you have the desired data. +1. Go to **OpenSearch Dashboards** > **Query Workbench** and select your data source from the **Data sources** dropdown menu. +2. From the navigation menu, select a database. +3. View the results in the table and confirm that you have the correct data. 4. Create an OpenSearch index by following these steps: - 1. Select the **Accelerate data** button. A pop-up window appears. - 2. Enter your details in **Select data fields**. In the **Database** field, select the desired acceleration index: **Skipping index** or **Covering index**. A _skipping index_ uses skip acceleration methods, such as partition, min/max, and value sets, to ingest data using compact aggregate data structures. This makes them an economical option for direct querying scenarios. A _covering index_ ingests all or some of the data from the source into OpenSearch and makes it possible to use all OpenSearch Dashboards and plugin functionality. -5. Under **Index settings**, enter the information for your acceleration index. For information about naming, select **Help**. Note that an Amazon S3 table can only have one skipping index at a time. + 1. Select **Accelerate data**. A pop-up window appears. + 2. Enter your database and table details under **Select data fields**. +5. For **Acceleration type**, select the type of acceleration according to your use case. Then, enter the information for your acceleration type. For more information, see the following sections: + - [Skipping indexes](https://opensearch.org/docs/latest/dashboards/management/accelerate-external-data/#skipping-indexes) + - [Covering indexes](https://opensearch.org/docs/latest/dashboards/management/accelerate-external-data/#covering-indexes) + - [Materialized views](https://opensearch.org/docs/latest/dashboards/management/accelerate-external-data/#materialized-views) + +## Skipping indexes + +A _skipping index_ uses skip acceleration methods, such as partition, min/max, and value sets, to ingest data using compact aggregate data structures. This makes them an economical option for direct querying scenarios. + +With a skipping index, you can index only the metadata of the data stored in Amazon S3. When you query a table with a skipping index, the query planner references the index and rewrites the query to efficiently locate the data, instead of scanning all partitions and files. This allows the skipping index to quickly narrow down the specific location of the stored data. ### Define skipping index settings -1. Under **Skipping index definition**, select the **Add fields** button to define the skipping index acceleration method and choose the fields you want to add. -2. Select the **Copy Query to Editor** button to apply your skipping index settings. -3. View the skipping index query details in the table pane and then select the **Run** button. Your index is added to the left-side navigation menu containing the list of your databases. +1. Under **Skipping index definition**, select **Generate** to automatically generate a skipping index. Alternately, to manually choose the fields you want to add, select **Add fields**. Choose from the following types: + - `Partition`: Uses data partition details to locate data. This type is best for partitioning-based columns such as year, month, day, hour. + - `MinMax`: Uses lower and upper bound of the indexed column to locate data. This type is best for numeric columns. + - `ValueSet`: Uses a unique value set to locate data. This type is best for columns with low to moderate cardinality that require exact matching. + - `BloomFilter`: Uses the bloom filter algorithm to locate data. This type is best for columns with high cardinality that do not require exact matching. +2. Select **Create acceleration** to apply your skipping index settings. +3. View the skipping index query details and then click **Run**. OpenSearch adds your index to the left navigation pane. + +Alternately, you can manually create a skipping index using Query Workbench. Select your data source from the dropdown and run a query like the following: + +```sql +CREATE SKIPPING INDEX +ON datasourcename.gluedatabasename.vpclogstable( + `srcaddr` BLOOM_FILTER, + `dstaddr` BLOOM_FILTER, + `day` PARTITION, + `account_id`BLOOM_FILTER + ) WITH ( +index_settings = '{"number_of_shards":5,"number_of_replicas":1}', +auto_refresh = true, +checkpoint_location = 's3://accountnum-vpcflow/AWSLogs/checkpoint' +) +``` + +## Covering indexes + +A _covering index_ ingests all or some of the data from the source into OpenSearch and makes it possible to use all OpenSearch Dashboards and plugin functionality. + +With a covering index, you can ingest data from a specified column in a table. This is the most performant of the three indexing types. Because OpenSearch ingests all data from your desired column, you get better performance and can perform advanced analytics. + +OpenSearch creates a new index from the covering index data. You can use this new index to create visualizations, or for anomaly detection and geospatial capabilities. You can manage the covering view index with Index State Management. For more information, see [Index State Management](https://opensearch.org/docs/latest/im-plugin/ism/index/). ### Define covering index settings -1. Under **Index settings**, enter a valid index name. Note that each Amazon S3 table can have multiple covering indexes. -2. Once you have added the index name, define the covering index fields by selecting `(add fields here)` under **Covering index definition**. -3. Select the **Copy Query to Editor** button to apply your covering index settings. -4. View the covering index query details in the table pane and then select the **Run** button. Your index is added to the left-side navigation menu containing the list of your databases. +1. For **Index name**, enter a valid index name. Note that each table can have multiple covering indexes. +2. Choose a **Refresh type**. By default, OpenSearch automatically refreshes the index. Otherwise, you must manually trigger a refresh using a REFRESH statement. +3. Enter a **Checkpoint location**, which is a path for refresh job checkpoints. The location must be a path in an HDFS compatible file system. +4. Define the covering index fields by selecting **(add fields here)** under **Covering index definition**. +5. Select **Create acceleration** to apply your covering index settings. +6. View the covering index query details and then click **Run**. OpenSearch adds your index to the left navigation pane. + +Alternately, you can manually create a covering index on your table using Query Workbench. Select your data source from the dropdown and run a query like the following: + +```sql +CREATE INDEX vpc_covering_index +ON datasourcename.gluedatabasename.vpclogstable (version, account_id, interface_id, +srcaddr, dstaddr, srcport, dstport, protocol, packets, +bytes, start, action, log_status STRING, +`aws-account-id`, `aws-service`, `aws-region`, year, +month, day, hour ) +WITH ( + auto_refresh = true, + refresh_interval = '15 minute', + checkpoint_location = 's3://accountnum-vpcflow/AWSLogs/checkpoint' +) +``` + +## Materialized views + +With _materialized views_, you can use complex queries, such as aggregations, to power Dashboards visualizations. Materialized views ingest a small amount of your data, depending on the query, into OpenSearch. OpenSearch then forms an index from the ingested data that you can use for visualizations. You can manage the materialized view index with Index State Management. For more information, see [Index State Management](https://opensearch.org/docs/latest/im-plugin/ism/index/). + +### Define materialized view settings + +1. For **Index name**, enter a valid index name. Note that each table can have multiple covering indexes. +2. Choose a **Refresh type**. By default, OpenSearch automatically refreshes the index. Otherwise, you must manually trigger a refresh using a `REFRESH` statement. +3. Enter a **Checkpoint location**, which is a path for refresh job checkpoints. The location must be a path in an HDFS compatible file system. +4. Enter a **Watermark delay**, which defines how late data can come and still be processed, such as 1 minute or 10 seconds. +5. Define the covering index fields under **Materialized view definition**. +6. Select **Create acceleration** to apply your materialized view index settings. +7. View the materialized view query details and then click **Run**. OpenSearch adds your index to the left navigation pane. + +Alternately, you can manually create a materialized view index on your table using Query Workbench. Select your data source from the dropdown and run a query like the following: + +```sql +CREATE MATERIALIZED VIEW {table_name}__week_live_mview AS + SELECT + cloud.account_uid AS `aws.vpc.cloud_account_uid`, + cloud.region AS `aws.vpc.cloud_region`, + cloud.zone AS `aws.vpc.cloud_zone`, + cloud.provider AS `aws.vpc.cloud_provider`, + + CAST(IFNULL(src_endpoint.port, 0) AS LONG) AS `aws.vpc.srcport`, + CAST(IFNULL(src_endpoint.svc_name, 'Unknown') AS STRING) AS `aws.vpc.pkt-src-aws-service`, + CAST(IFNULL(src_endpoint.ip, '0.0.0.0') AS STRING) AS `aws.vpc.srcaddr`, + CAST(IFNULL(src_endpoint.interface_uid, 'Unknown') AS STRING) AS `aws.vpc.src-interface_uid`, + CAST(IFNULL(src_endpoint.vpc_uid, 'Unknown') AS STRING) AS `aws.vpc.src-vpc_uid`, + CAST(IFNULL(src_endpoint.instance_uid, 'Unknown') AS STRING) AS `aws.vpc.src-instance_uid`, + CAST(IFNULL(src_endpoint.subnet_uid, 'Unknown') AS STRING) AS `aws.vpc.src-subnet_uid`, + + CAST(IFNULL(dst_endpoint.port, 0) AS LONG) AS `aws.vpc.dstport`, + CAST(IFNULL(dst_endpoint.svc_name, 'Unknown') AS STRING) AS `aws.vpc.pkt-dst-aws-service`, + CAST(IFNULL(dst_endpoint.ip, '0.0.0.0') AS STRING) AS `aws.vpc.dstaddr`, + CAST(IFNULL(dst_endpoint.interface_uid, 'Unknown') AS STRING) AS `aws.vpc.dst-interface_uid`, + CAST(IFNULL(dst_endpoint.vpc_uid, 'Unknown') AS STRING) AS `aws.vpc.dst-vpc_uid`, + CAST(IFNULL(dst_endpoint.instance_uid, 'Unknown') AS STRING) AS `aws.vpc.dst-instance_uid`, + CAST(IFNULL(dst_endpoint.subnet_uid, 'Unknown') AS STRING) AS `aws.vpc.dst-subnet_uid`, + CASE + WHEN regexp(dst_endpoint.ip, '(10\\..*)|(192\\.168\\..*)|(172\\.1[6-9]\\..*)|(172\\.2[0-9]\\..*)|(172\\.3[0-1]\\.*)') + THEN 'ingress' + ELSE 'egress' + END AS `aws.vpc.flow-direction`, + + CAST(IFNULL(connection_info['protocol_num'], 0) AS INT) AS `aws.vpc.connection.protocol_num`, + CAST(IFNULL(connection_info['tcp_flags'], '0') AS STRING) AS `aws.vpc.connection.tcp_flags`, + CAST(IFNULL(connection_info['protocol_ver'], '0') AS STRING) AS `aws.vpc.connection.protocol_ver`, + CAST(IFNULL(connection_info['boundary'], 'Unknown') AS STRING) AS `aws.vpc.connection.boundary`, + CAST(IFNULL(connection_info['direction'], 'Unknown') AS STRING) AS `aws.vpc.connection.direction`, + + CAST(IFNULL(traffic.packets, 0) AS LONG) AS `aws.vpc.packets`, + CAST(IFNULL(traffic.bytes, 0) AS LONG) AS `aws.vpc.bytes`, + + CAST(FROM_UNIXTIME(time / 1000) AS TIMESTAMP) AS `@timestamp`, + CAST(FROM_UNIXTIME(start_time / 1000) AS TIMESTAMP) AS `start_time`, + CAST(FROM_UNIXTIME(start_time / 1000) AS TIMESTAMP) AS `interval_start_time`, + CAST(FROM_UNIXTIME(end_time / 1000) AS TIMESTAMP) AS `end_time`, + status_code AS `aws.vpc.status_code`, + + severity AS `aws.vpc.severity`, + class_name AS `aws.vpc.class_name`, + category_name AS `aws.vpc.category_name`, + activity_name AS `aws.vpc.activity_name`, + disposition AS `aws.vpc.disposition`, + type_name AS `aws.vpc.type_name`, + + region AS `aws.vpc.region`, + accountid AS `aws.vpc.account-id` + FROM + datasourcename.gluedatabasename.vpclogstable +WITH ( + auto_refresh = true, + refresh_interval = '15 Minute', + checkpoint_location = 's3://accountnum-vpcflow/AWSLogs/checkpoint', + watermark_delay = '1 Minute', +) +``` ## Limitations -This feature is still under development, so there are some limitations. For real-time updates, refer to the [developer documentation on GitHub](https://github.com/opensearch-project/opensearch-spark/blob/main/docs/index.md#limitations). +This feature is still under development, so there are some limitations. For real-time updates, see the [developer documentation on GitHub](https://github.com/opensearch-project/opensearch-spark/blob/main/docs/index.md#limitations). diff --git a/_dashboards/management/acl.md b/_dashboards/management/acl.md new file mode 100644 index 0000000000..bd57b72419 --- /dev/null +++ b/_dashboards/management/acl.md @@ -0,0 +1,78 @@ +--- +layout: default +title: Access control lists for saved objects +parent: Dashboards Management +nav_order: 50 +--- + +# Access control lists for saved objects +Introduced 2.18 +{: .label .label-purple } + +You can use access control lists (ACLs) to manage permissions for your saved objects, providing authorization (AuthZ) capabilities without requiring backend plugin integration. + +## Understanding ACL types + +ACLs are applied at two levels: + +1. **Workspace ACL:** Workspace objects inherit permissions from their parent workspace. See [Workspace ACL]({{site.url}}{{site.baseurl}}/dashboards/workspace/workspace-acl) for more information. +2. **Objects ACL:** Each individual object can have its own ACL policy. All operations on these objects must pass ACL policy validation. + +## Enabling the ACL feature + +The ACL feature must be enabled before you can define any access controls. Enable it by: + +1. Opening your `opensearch_dashboards.yml` file. +2. Enabling permissions with `savedObjects.permission.enabled: true`. + +## Defining ACL permissions + +ACL permissions are defined using the following schema: + +```json +{ + "permissions": { + "<permission_type_1>": { + "users": ["<principal_1>", "<principal_2>"], + "groups": ["<principal_3>", "<principal_4>"] + } + } +} +``` +{% include copy-curl.html %} + +### Granting permissions to authenticated users + +The wildcard character (`*`) grants permissions to all authenticated users. In the following example, the ACL grants workspace management permissions to the `finance_manager` group and dashboard creation permissions to the `finance_analyst` group: + +```json +{ + "permissions": { + "write": { + "groups": ["finance_manager"] + }, + "library_write": { + "groups": ["finance_analyst"] + } + } +} +``` +{% include copy-curl.html %} + +### Configuring mixed-level permissions + +To allow one user, `user-1` for example, to modify an object while giving read-only access to others, you can configure the ACL policy as follows: + +```json +{ + "permissions": { + "read": { + "users": ["*"] + }, + "write": { + "users": ["user-1"] + }, + } +} +``` +{% include copy-curl.html %} diff --git a/_dashboards/management/management-index.md b/_dashboards/management/management-index.md index 7edc4d06c2..01796180e5 100644 --- a/_dashboards/management/management-index.md +++ b/_dashboards/management/management-index.md @@ -9,16 +9,14 @@ has_children: true Introduced 2.10 {: .label .label-purple } -**Dashboards Management** serves as the command center for customizing OpenSearch Dashboards to your needs. A view of the interface is shown in the following image. +**Dashboards Management** is the central hub for managing and customizing OpenSearch data directly within OpenSearch Dashboards. -<img src="{{site.url}}{{site.baseurl}}/images/dashboards/dashboards-management-ui.png" alt="Dashboards Management interface" width="700"/> - -{::nomarkdown}<img src="{{site.url}}{{site.baseurl}}/images/icons/alert-icon.png" class="inline-icon" alt="alert icon"/>{:/} **Note**<br>OpenSearch and OpenSearch Dashboards privileges govern access to individual features. If you do not have the appropriate access, consult your administrator. -{: .note} +OpenSearch and OpenSearch Dashboards permissions govern access to individual features. If you do not have the appropriate access permissions, consult your administrator. +{: .warning} ## Applications -The following applications are available in **Dashboards Management**: +You can access the following applications in **Dashboards Management**: - **[Index Patterns]({{site.url}}{{site.baseurl}}/dashboards/management/index-patterns/):** To access OpenSearch data, you need to create an index pattern so that you can select the data you want to use and define the properties of the fields. The Index Pattern tool gives you the ability to create an index pattern from within the UI. Index patterns point to one or more indexes, data streams, or index aliases. - **[Data Sources]({{site.url}}{{site.baseurl}}/dashboards/management/multi-data-sources/):** The Data Sources tool is used to configure and manage the data sources that OpenSearch uses to collect and analyze data. You can use the tool to specify the source configuration in your copy of the [OpenSearch Dashboards configuration file]({{site.url}}{{site.baseurl}}https://github.com/opensearch-project/OpenSearch-Dashboards/blob/main/config/opensearch_dashboards.yml). diff --git a/_dashboards/workspace/apis.md b/_dashboards/workspace/apis.md new file mode 100644 index 0000000000..683488e423 --- /dev/null +++ b/_dashboards/workspace/apis.md @@ -0,0 +1,386 @@ +--- +layout: default +title: Workspaces APIs +parent: Workspace for OpenSearch Dashboards +nav_order: 10 +--- + +# Workspaces APIs +Introduced 2.18 +{: .label .label-purple } + +The Workspaces API provides a set of endpoints for managing workspaces in OpenSearch Dashboards. + +## List Workspaces API + +You can use the following endpoint to retrieve a list of workspaces: + +```json +POST <osd host>:<port>/api/workspaces/_list +``` +{% include copy-curl.html %} + +The following table lists the available path parameters. + +| Parameter | Data type | Required | Description | +| :--- | :--- | :--- | :--- | +| `search` | String | Optional | A query string used to filter workspaces with simple query syntax, for example, `simple_query_string`. | +| `searchFields` | Array | Optional | Specifies which fields to perform the search query against. | +| `sortField` | String | Optional | The field name to use for sorting results. | +| `sortOrder` | String | Optional | Specifies ascending or descending sort order. | +| `perPage` | Number | Optional | The number of workspace results per page. | +| `page` | Number | Optional | The number of pages of results to retrieve. | +| `permissionModes` | Array | Optional | A list of permissions to filter by. | + +#### Example request + +```json +POST /api/workspaces/_list +``` +{% include copy-curl.html %} + +The following example response shows a successful API call: + +```json +{ + "success": true, + "result": { + "page": 1, + "per_page": 20, + "total": 3, + "workspaces": [ + { + "name": "test1", + "features": [ + "use-case-all" + ], + "id": "hWNZls" + }, + { + "name": "test2", + "features": [ + "use-case-observability" + ], + "id": "SnkOPt" + } + ] + } +} +``` +{% include copy-curl.html %} + +## Get Workspaces API + +You can use the following endpoint to retrieve a single workspace: + +```json +GET <osd host>:<port>/api/workspaces/<id> +``` +{% include copy-curl.html %} + +The following table lists the available path parameters. All path parameters are required. + +| Parameter | Data type | Required | Description | +| :--- | :--- | :--- | :--- | +| `<id>` | String | Required | Identifies the unique workspace to be retrieved. | + +#### Example request + +```json +GET /api/workspaces/SnkOPt +``` +{% include copy-curl.html %} + +The following example response shows a successful API call: + +```json +{ + "success": true, + "result": { + "name": "test2", + "features": ["use-case-all"], + "id": "SnkOPt" + } +} +``` +{% include copy-curl.html %} + +## Create Workspaces API + +You can use the following endpoint to create a workspace: + +```json +POST <osd host>:<port>/api/workspaces +``` +{% include copy-curl.html %} + +The following table lists the available path parameters. + +| Parameter | Data type | Required | Description | +| :--- | :--- | :--- | :--- | +| `attributes` | Object | Required | Defines the workspace attributes. | +| `permissions` | Object | Optional | Specifies the permissions for the workspace. | + +#### Example request + +```json +POST api/workspaces +{ + "attributes": { + "name": "test4", + "description": "test4" + } +} +``` +{% include copy-curl.html %} + +The following example response shows a successful API call: + +```json +{ + "success": true, + "result": { + "id": "eHVoCJ" + } +} +``` +{% include copy-curl.html %} + +## Update Workspaces API + +You can use the following endpoint to update the attributes and permissions for a workspace: + +```json +PUT <osd host>:<port>/api/workspaces/<id> +``` +{% include copy-curl.html %} + +The following table lists the available path parameters. + +| Parameter | Data type | Required | Description | +| :--- | :--- | :--- | :--- | +| `<id>` | String | Required | Identifies the unique workspace to be retrieved. | +| `attributes` | Object | Required | Defines the workspace attributes. | +| `permissions` | Object | Optional | Specifies the permissions for the workspace. | + +#### Example request + +```json +PUT api/workspaces/eHVoCJ +{ + "attributes": { + "name": "test4", + "description": "test update" + } +} +``` +{% include copy-curl.html %} + +The following example response shows a successful API call: + +```json +{ + "success": true, + "result": true +} +``` +{% include copy-curl.html %} + +## Delete Workspaces API + +You can use the following endpoint to delete a workspace: + +```json +DELETE <osd host>:<port>/api/workspaces/<id> +``` +{% include copy-curl.html %} + +The following table lists the available path parameters. All path parameters are required. + +| Parameter | Data type | Required | Description | +| :--- | :--- | :--- | :--- | +| `<id>` | String | Required | Identifies the unique workspace to be retrieved. | + +#### Example request + +```json +DELETE api/workspaces/eHVoCJ +``` +{% include copy-curl.html %} + +The following example response shows a successful API call: + +```json +{ + "success": true, + "result": true +} +``` +{% include copy-curl.html %} + +## Duplicate Saved Objects Workspaces API + +You can use the following endpoint to copy saved objects between workspaces: + +```json +POST <osd host>:<port>/api/workspaces/_duplicate_saved_objects +``` +{% include copy-curl.html %} + +The following table lists the available path parameters. + +| Parameter | Data type | Required | Description | +| :--- | :--- | :--- | :--- | +| `objects` | Array | Required | Specifies the saved objects to be duplicated. | +| `targetWorkspace` | String | Required | Identifies the destination workspace for copying. | +| `includeReferencesDeep` | Boolean | Optional | Determines whether to copy all referenced objects to the target workspace. Default is `true`. | + +The following table lists the attributes of the object in the `objects` parameter. + +| Parameter | Data type | Required | Description | +| :--- | :--- | :--- | :--- | +| `type` | String | Required | Defines the saved object classification, such as `index-pattern`, `config`, or `dashboard`. | +| `id` | String | Required | The ID of the saved object. | + +#### Example request + +```json +POST api/workspaces/_duplicate_saved_objects +{ + "objects": [ + { + "type": "index-pattern", + "id": "619cc200-ecd0-11ee-95b1-e7363f9e289d" + } + ], + "targetWorkspace": "9gt4lB" +} +``` +{% include copy-curl.html %} + +The following example response shows a successful API call: + +```json +{ + "successCount": 1, + "success": true, + "successResults": [ + { + "type": "index-pattern", + "id": "619cc200-ecd0-11ee-95b1-e7363f9e289d", + "meta": { + "title": "test*", + "icon": "indexPatternApp" + }, + "destinationId": "f4b724fd-9647-4bbf-bf59-610b43a62c75" + } + ] +} +``` +{% include copy-curl.html %} + +## Associate Saved Objects Workspaces API + +You can use the following endpoint to associate saved objects with a workspace: + +```json +POST <osd host>:<port>/api/workspaces/_associate +``` +{% include copy-curl.html %} + +The following table lists the available path parameters. + +| Parameter | Data type | Required | Description | +| :--- | :--- | :--- | :--- | +| `workspaceId` | String | Required | Identifies the target workspace for object association. | +| `savedObjects` | Array | Required | Specifies the list of saved objects to be copied. | + +The following table lists the attributes of the object in the `objects` parameter. + +| Parameter | Data type | Required | Description | +| :--- | :--- | :--- | :--- | +| `type` | String | Required | Defines the saved object classification, such as `index-pattern`, `config`, or `dashboard`. | +| `id` | String | Required | The ID of the saved object. | + +#### Example request + +```json +POST api/workspaces/_associate +{ + "objects": [ + { + "type": "index-pattern", + "id": "619cc200-ecd0-11ee-95b1-e7363f9e289d" + } + ], + "targetWorkspace": "9gt4lB" +} +``` +{% include copy-curl.html %} + +The following example response shows a successful API call: + +```json +{ + "success": true, + "result": [ + { + "id": "619cc200-ecd0-11ee-95b1-e7363f9e289d", + } + ] +} +``` +{% include copy-curl.html %} + +## Dissociate Saved Objects Workspaces API + +You can use the following endpoint to dissociate saved objects from a workspace: + +```json +POST <osd host>:<port>/api/workspaces/_dissociate +``` +{% include copy-curl.html %} + +The following table lists the available path parameters. + +| Parameter | Data type | Required | Description | +| :--- | :--- | :--- | :--- | +| `workspaceId` | String | Required | The target workspace with which to associate the objects. | +| `savedObjects` | Array | Required | A list of saved objects to copy. | + +The following table lists the attributes of the `savedObjects` parameter. + +| Parameter | Data type | Required | Description | +| :--- | :--- | :--- | :--- | +| `type` | String | Required | The type of the saved object, such as `index-pattern`, `config`, or `dashboard`. | +| `id` | String | Required | The ID of the saved object. | + +#### Example request + +```json +POST api/workspaces/_dissociate +{ + "objects": [ + { + "type": "index-pattern", + "id": "619cc200-ecd0-11ee-95b1-e7363f9e289d" + } + ], + "targetWorkspace": "9gt4lB" +} +``` +{% include copy-curl.html %} + +The following example response shows a successful API call: + +```json +{ + "success": true, + "result": [ + { + "id": "619cc200-ecd0-11ee-95b1-e7363f9e289d", + } + ] +} +``` +{% include copy-curl.html %} diff --git a/_dashboards/workspace/create-workspace.md b/_dashboards/workspace/create-workspace.md new file mode 100644 index 0000000000..34ba65bb54 --- /dev/null +++ b/_dashboards/workspace/create-workspace.md @@ -0,0 +1,52 @@ +--- +layout: default +title: Create a workspace +parent: Workspace for OpenSearch Dashboards +nav_order: 1 +--- + +# Create a workspace +Introduced 2.18 +{: .label .label-purple } + +Before getting started with this tutorial, you must enable the workspace feature flag. See [Enabling the ACL feature]({{site.url}}{{site.baseurl}}/dashboards/workspace/workspace/#enabling-the-workspace-feature) for more information. + +When the saved objects permission is enabled, only users with admin status can create workspaces. See [Configuring the dashboard admin]({{site.url}}{{site.baseurl}}/dashboards/workspace/workspace-acl/#configuring-dashboard-administrators) for more information. + +To create a workspace, follow these steps: + +1. Open OpenSearch Dashboards. +2. From the main page, choose the appropriate card for your use case, for example, **Observability**, **Security Analytics**, **Search**, **Essentials**, or **Analytics**. Alternatively, you can select the **Create workspace** button and choose the appropriate use case from the dropdown menu. +3. Enter the required information in the **Workspace details** window. + - **Workspace name** is required. Valid characters are `a-z`, `A-Z`, `0-9`, parentheses (`()`), brackets (`[]`), underscore (`_`), hyphen (`-`), and spaces. Choose a unique workspace name within the character limit (40 characters). The **Create workspace** button is disabled when the workspace name already exists or exceeds the character limit, and an error message appears. + - **Use case and features** is required. Choose the use case that best fits your needs. If you are using Amazon OpenSearch Serverless and have enabled the [multiple data sources]({{site.url}}{{site.baseurl}}/dashboards/management/data-sources/) feature, **Essentials** is automatically assigned. +4. (Optional) Select the color picker to customize the color of your workspace icon. +5. (Optional) Add a workspace description of up to 200 characters. This option is disabled when the description exceeds the character limit. +6. Save your workspace. + - The **Create workspace** button becomes active once you enter the information for all required fields. You become the workspace owner automatically. The system redirects you to either the collaborators page if the saved objects permission is enabled or the overview page if the saved objects permission is disabled. See [Configuring dashboard admin]({{site.url}}{{site.baseurl}}/dashboards/workspace/workspace-acl/#configuring-dashboard-administrators) for more information about permissions. + +To set up permissions, see [Workspace access control lists]({{site.url}}{{site.baseurl}}/dashboards/workspace/workspace-acl/) for more information. + +## Associating data sources with a workspace + +The **Associate data source** option is only visible when the multiple data sources feature is enabled. Before creating your workspace, you must connect it with at least one data source. If you have not set up your data sources, see [Data sources]({{site.url}}{{site.baseurl}}/dashboards/management/data-sources/). Once your sources are connected, you can link them to your new workspace. +{: .warning} + +### Associating OpenSearch data sources + +To associate OpenSearch data sources, follow these steps: + +1. Select the **Associate OpenSearch Data Sources** button to open the selection modal. +2. View the available data sources in the modal: + - Standard OpenSearch sources appear as single entries. + - Sources with direct query connections show a +N indicator. +3. Select the appropriate data source name(s). +4. Select the **Associate data sources** button to complete the association. + +### Associating direct query sources + +To associate direct query sources, follow these steps: + +1. Select the **Associate direct query data sources** button to open the selection modal. The modal displays only sources with direct query connections. +2. Select a data source to automatically expand its direct query connections. +3. Select the **Associate data sources** button to complete the association. diff --git a/_dashboards/workspace/index.md b/_dashboards/workspace/index.md new file mode 100644 index 0000000000..f0f572a4a5 --- /dev/null +++ b/_dashboards/workspace/index.md @@ -0,0 +1,27 @@ +--- +layout: default +title: Getting started with workspaces +parent: Workspace for OpenSearch Dashboards +nav_order: 0 +--- + +# Getting started with workspaces +Introduced 2.18 +{: .label .label-purple } + +OpenSearch Dashboards 2.18 introduces an enhanced home page that provides a comprehensive view of all your workspaces. + +The new home page includes the following features: + +1. A **Create workspace** button for [OpenSearch Dashboard admins]({{site.url}}{{site.baseurl}}/dashboards/workspace/workspace-acl/#configuring-dashboard-administrators) to navigate to the [create workspace]({{site.url}}{{site.baseurl}}/dashboards/workspace/create-workspace) page. +2. Workspace access time information and a link to the workspace overview page. +3. A use case information icon that displays information about the workspace's purpose. +4. A **View all workspaces** button that navigates to the [workspace management]({{site.url}}{{site.baseurl}}/dashboards/workspace/manage-workspace/#navigating-the-workspaces-list) page. +5. Links to the latest OpenSearch documentation through the **Learn more from documentation** button and to [OpenSearch Playground](https://playground.opensearch.org/app/home#/) through the **Explore live demo environment at playground.opensearch.org** button. + +The navigation logic ensures a seamless user experience by directing you to the appropriate page based on your workspace access level: + +- If a you have a default workspace configured, you are directed to the workspace overview page. +- If a you have only one workspace, you are directed to the overview page of that workspace. +- If a you have multiple workspaces, you are directed to the new home page. +- If a you have no workspaces, you are directed to the new home page. diff --git a/_dashboards/workspace/manage-workspace.md b/_dashboards/workspace/manage-workspace.md new file mode 100644 index 0000000000..45733d75be --- /dev/null +++ b/_dashboards/workspace/manage-workspace.md @@ -0,0 +1,118 @@ +--- +layout: default +title: Manage workspaces +parent: Workspace for OpenSearch Dashboards +nav_order: 2 +--- + +# Manage workspaces +Introduced 2.18 +{: .label .label-purple } + +You can access and modify the workspace details, including name, description, use case, and icon color, on the **Workspace details** page. + +To access and modify your workspace details, follow these steps: + +1. Open OpenSearch Dashboards and navigate to **My Workspaces**. +2. Choose the desired workspace and then select the **Edit** button to make changes +3. Select the **Save** button to confirm changes or the **Discard changes** button to cancel modifications. + +## Workspace update permissions + +The following permissions apply when changing workspaces: + +1. **Without the Security plugin:** All users can edit and update the workspace. +2. **With the Security plugin installed and `savedObjects.permission.enabled: false` in the `config/opensearch_dashboards.yml` file:** All users can edit and update workspaces. +3. **With the Security plugin and `savedObjects.permission.enabled: true` in the `config/opensearch_dashboards.yml`:** Only the [workspace owner]({{site.url}}{{site.baseurl}}/dashboards/workspace/workspace-acl/#defining-workspace-collaborators) and the [workspace admins]({{site.url}}{{site.baseurl}}/dashboards/workspace/workspace-acl/#configuring-dashboard-administrators) can edit and update workspaces. + +## Workspace update restrictions + +When updating workspace use cases, the following rules apply. + +Original use case | Target use case | +:---: | :---: +Analytics | Cannot be changed to any other use case +Search | Analytics +Security analytics | Analytics +Observability | Analytics +Essentials | Analytics Search<br> Security Analytics<br> Observability + +## Workspace control panel + +The **Workspace details** page features the following buttons in the upper-right corner: + +1. **Delete** ({::nomarkdown}<img src="{{site.url}}{{site.baseurl}}/images/dashboards/trash-can-icon.png" class="inline-icon" alt="trash can icon"/>{:/} icon) + - **Without the Security plugin installed:** All users can delete the workspace. + - **With the Security plugins installed and `savedObjects.permission.enabled: false` in the `config/opensearch_dashboards.yml` file:** All users can delete the workspace. + - **With the Security plugin installed and `savedObjects.permission.enabled: true` in the `config/opensearch_dashboards.yml` file:** Only the admin can delete the workspace. +2. **Set as default workspace:** Sets the current workspace as the default login destination. +3. **Workspace overview:** Opens the **Overview** page in a new tab. + +## Adding assets to the workspace + +Access the **Sample data** in the navigation menu on the left. Select the appropriate dataset to install it in your cluster and OpenSearch Dashboards. + +## Copying assets between workspaces + +Data sources and configuration copying are not supported. +{: .warning} + +The assets page provides the following methods for copying assets across workspaces: + +1. **Copy all assets to...:** Copies all assets in the table. +2. **Copy to...:** Moves selected assets from the table. +3. **Copy to...:** Copies a single asset from the table. + +After selecting a copy option, choose the target workspace from the dropdown menu. The **Copy related assets** checkbox allows you to transfer associated assets. + +Upon selecting the **Copy** button, a side panel appears showing successful and failed asset transfers. Asset copy destinations depend on the following security configurations: + +1. **Without the Security plugin:** All workspaces are accessible. +2. **With the Security plugin and `savedObjects.permission.enabled: false` in the `config/opensearch_dashboards.yml` file:** All workspaces are accessible. +3. **With the Security plugin and `savedObjects.permission.enabled: true` in the `config/opensearch_dashboards.yml` file:** Only workspaces for which the user has read and write or admin permissions are accessible. + +## Associating data sources + +On the data source management page, you can access a comprehensive list of associated OpenSearch connections, monitor direct query connections relevant to your current workspace, and establish new data source associations as needed. + +### Managing OpenSearch connections + +The OpenSearch connections tab displays all associated connections for the current workspace. Follow these steps to manage your connections: + +1. Access a comprehensive list of associated OpenSearch connections on the connections tab. +2. Use the **Remove association** button to unlink connections as needed. +3. Add new data sources by selecting the **OpenSearch data sources** button and subsequent modal. +4. Select from unassociated OpenSearch connections to expand your workspace's capabilities. + +### Adding direct query connections + +The **Direct query connections** tab displays a list of all direct query connections associated with your current workspace. To add more direct query connections to your workspace, select the **Direct query data sources** button. A modal window opens. + +The association modal displays a list of OpenSearch connections that contain direct query connections and have not yet been associated with your current workspace. When you associate an OpenSearch connection with your current workspace, all direct query connections within that OpenSearch connection are automatically associated as well. + +## Deleting your workspace + +Workspace deletion is restricted to admins. If you do not see a {::nomarkdown}<img src="{{site.url}}{{site.baseurl}}/images/dashboards/trash-can-icon.png" class="inline-icon" alt="trash can icon"/>{:/} icon, check your permissions. See [Configuring dashboard administrators]({{site.url}}{{site.baseurl}}/dashboards/workspace/workspace-acl/#configuring-dashboard-administrators) for more information. +{: .warning} + +Deleting a workspace permanently erases all its assets (except data sources) and the workspace itself. This action cannot be reversed. + +To delete a workspace, follow these steps: + +1. From the **Workspace details** page, select the {::nomarkdown}<img src="{{site.url}}{{site.baseurl}}/images/dashboards/trash-can-icon.png" class="inline-icon" alt="trash can icon"/>{:/} icon in the upper-right corner to delete the current workspace. +2. Alternatively, from the workspace list page, select the {::nomarkdown}<img src="{{site.url}}{{site.baseurl}}/images/ellipsis-icon.png" class="inline-icon" alt="ellipsis icon"/>{:/} icon and select **Delete**. Optionally, select multiple workspaces for bulk deletion. + +## Navigating the workspaces list + +The workspaces list page serves as your central hub for workspace management, displaying all workspaces for which you have access permissions. Key features include the following: + +- Search: Quickly find a workspace by name. +- Filter: Sort workspaces by use case. +- At a glance: View each workspace's name, use case, description, last update time, and associated data sources. + +Each workspace entry includes an **Actions** column with the following functional buttons. These tools streamline your workspace management, allowing for efficient organization and customization of your OpenSearch Dashboards environment: + +1. Copy ID: One-click copying of the workspace ID. +2. Edit: Direct access to the workspace's detailed configuration page. +3. Set as default: Easily set any workspace as your default workspace. +4. Delete: Remove workspaces as needed (may require admin privileges). diff --git a/_dashboards/workspace/workspace-acl.md b/_dashboards/workspace/workspace-acl.md new file mode 100644 index 0000000000..16b2cc8628 --- /dev/null +++ b/_dashboards/workspace/workspace-acl.md @@ -0,0 +1,153 @@ +--- +layout: default +title: Workspace access control lists +parent: Workspace for OpenSearch Dashboards +nav_order: 3 +--- + +# Workspace access control lists +Introduced 2.18 +{: .label .label-purple } + +Workspace access control lists (ACLs) manage authorization for saved objects `AuthZ(Authorization)` while enabling [Security in OpenSearch]({{site.url}}{{site.baseurl}}/security/) for `AuthN(Authentication)`. + +## Personas + +**Workspace** use cases involve the following key personas: + +* **Dashboard admin:** Has full access to all OpenSearch Dashboards functions and data. +* **Workspace administrator (also called _owner_):** Has full control over a specific workspace, including its configuration and saved objects. When a workspace is created, its creator is automatically assigned the role of workspace owner. +* **Workspace content producer:** Can view, create, and update saved objects within the workspace. +* **Workspace viewer:** Has read-only access to saved objects in the workspace. + + Roles are workspace specific, allowing users to assume different roles across workspaces. + {: .note} + +## Enabling permission control + +See [Enabling the ACL feature]({{site.url}}{{site.baseurl}}/dashboards/management/acl#enabling-the-acl-feature) for instructions. + +## Configuring dashboard administrators + +To grant full access to all workspaces and objects in OpenSearch Dashboards, configure the admin permissions. Edit the `opensearch_dashboards.yml` file to define the admin by user ID and backend role, as shown in the following configuration: + +```yaml +opensearchDashboards.dashboardAdmin.users: ["UserID"] +opensearchDashboards.dashboardAdmin.groups: ["BackendRole"] +savedObjects.permission.enabled: true +``` +{% include copy.html %} + +By default, the configuration is set to `[]`, meaning that no users are designated as admins. If the Security plugin is not installed and `savedObjects.permission.enabled: false`, all users are granted admin permissions. + +### Configuring global admin access + +Set all users as admins with this wildcard setting: + +```yaml +opensearchDashboards.dashboardAdmin.users: ["*"] +``` +{% include copy-curl.html %} + +### Configuring admin access for a single user + +Configure a user with the `admin-user-id` setting: + +```yaml +opensearchDashboards.dashboardAdmin.users: ["admin-user-id"] +``` +{% include copy-curl.html %} + +### Configuring admin access by backend role + +Configure a user with the `admin-role` setting: + +```yaml +opensearchDashboards.dashboardAdmin.groups: ["admin-role"] +``` +{% include copy-curl.html %} + +### Admin-restricted operations + +Admin-restricted operations include the following: + +- Workspace creation +- Workspace deletion +- Data source connections +- Disconnecting data sources from workspaces + +## Defining workspace collaborators + +Access to collaborator management is limited to admins. The **Collaborators** feature is only available when permission control is enabled. For instructions on activating permission control, see [Enabling permission control](#enabling-permission-control). The access levels include the following: + +- **Read only:** Grants permission to view the workspace and its assets. +- **Read and write:** Allows viewing and editing of assets within the workspace. +- **Admin:** Provides full access, including viewing and editing of assets within the workspace and updating workspace metadata, such as name, description, data sources, and collaborators. + +From the **Collaborators** page, you can by collaborator ID and filter results by collaborator type and access level. + +### Adding collaborators + +Workspace creators are granted the **Admin** access level as a collaborator. To add more collaborators, select the **Add collaborators** button, which displays a dropdown menu. Choose **Add Users** or **Add Groups** to access the corresponding modal for adding new collaborators. + +#### Adding users + +To add users, follow these steps: + +1. Select the **Add Users** button to open the modal. The modal displays one empty `User ID` field by default. +2. Choose an access level: **Read only**, **Read and write**, or **Admin**. +3. Choose **Add another User** to add multiple users. Do not use duplicate or existing `User ID` fields to avoid errors. +4. Resolve any errors before finalizing. Successfully added users appear in the collaborators table. + +#### Adding groups + +To add groups, follow these steps: + +1. Select the **Add Groups** button to open the modal. The modal displays one empty `Group ID` field by default. +2. Choose an access level: **Read only**, **Read and write**, or **Admin**. +3. Use **Add another group** to add multiple groups. Do not use duplicate or existing `Group ID` fields to avoid errors. +4. Resolve any errors before finalizing. Successfully added users appear in the collaborators table. + +### Modifying access levels + +You can modify collaborators access levels after adding them to the collaborators table if you have the required permissions. Collaborators can be assigned any access level. However, if all **Admin** collaborators are changed to lower access levels, then only admins can manage workspace collaboration. + +#### Modifying individual access levels + +To modify a single collaborator's access level, follow these steps: + +1. Select the action icon on the right of the table row. +2. Select **Change access level** from the dropdown menu. +3. Choose the desired access level from the list. +4. Confirm the change in the modal that appears and select **Confirm**. The collaborator's access level is updated in the table upon confirmation. + +#### Modifying access levels in batch + +To change access levels for several collaborators simultaneously, follow these steps: + +1. Select the desired collaborator rows in the table. +2. Select the **Actions** button that appears. +3. Select **Change access level** from the dropdown menu. +4. Select the new access level from the list provided. +5. Review and confirm the changes in the modal that appears. The access levels for all selected collaborators are updated in the table upon confirmation. + +### Deleting collaborators + +After adding collaborators to the table, you have the option to delete them. Be cautious when removing admin collaborators because deleting all of them restricts workspace collaborator management to admins only. A confirmation modal is displayed before finalizing this action. + +#### Deleting individual collaborators + +To delete an individual collaborator, follow these steps: + +1. Select the {::nomarkdown}<img src="{{site.url}}{{site.baseurl}}/images/ellipsis-icon.png" class="inline-icon" alt="ellipsis icon"/>{:/} icon on the right of the table row to display a dropdown menu. +2. Select **Delete collaborator** from the dropdown menu. A confirmation modal appears to verify your action. +3. Select **Confirm** in the modal to remove the collaborator from the table. + +#### Deleting collaborators in batch + +To remove several collaborators simultaneously, follow these steps: + +1. Select the rows containing the collaborators you want to remove from the table. A "Delete x collaborators" button appears. +2. Select the **Delete x collaborators** button. +3. Review the confirmation modal that appears. +4. Select **Confirm** to remove all selected collaborators from the table. diff --git a/_dashboards/workspace/workspace.md b/_dashboards/workspace/workspace.md new file mode 100644 index 0000000000..0938c48891 --- /dev/null +++ b/_dashboards/workspace/workspace.md @@ -0,0 +1,118 @@ +--- +layout: default +title: Workspace for OpenSearch Dashboards +nav_order: 110 +has_children: true +--- + +# Workspace for OpenSearch Dashboards +Introduced 2.18 +{: .label .label-purple } + +The Workspace feature in OpenSearch Dashboards enables you to tailor your environment with use-case-specific configurations. For example, you can create dedicated workspaces for observability scenarios, allowing you to focus on relevant functionalities. Additionally, the Workspace feature enables organization of visual assets, such as dashboards and visualizations, within a workspace with isolated storage. + +## Workspace data model + +The Workspace data model is defined by the following structure: + +```typescript +interface Workspace { + id: string; + name: string; + description?: string; + features?: string[]; + color: string; + uiSettings: Record<string, unknown>; +} +``` +{% include copy-curl.html %} + +The Workspace data model is composed of the following key attributes: + +- `id`: String type; unique ID for each each workspace. +- `name`: String type; designates the name of the workspace. +- `description`: Optional string type; provides contextual information for the workspace. +- `features`: Optional array of strings; contains use case IDs linked to the workspace. + +--- + +#### Example Workspace object + +The following object shows a typical Workspace configuration: + +```typescript +{ + id: "M5NqCu", + name: "Analytics team", + description: "Analytics team workspace", + features: ["use-case-analytics"], +} +``` +{% include copy-curl.html %} + +The configuration creates the `Analytics team` using the `use-case-observability` feature set. Use cases map to specific feature groups, limiting functionality to the defined set within each workspace. + +The following are predefined use case options: + +- `use-case-observability` +- `use-case-security-analytics` +- `use-case-search` +- `use-case-essentials` +- `use-case-all` + +--- + +## Associating saved objects with workspaces + +Saved objects in OpenSearch Dashboards, such as dashboards, visualizations, and index patterns, can be associated with specific workspaces, improving organization and accessibility as the volume of objects grows. + +The `workspaces` attribute, an array of strings, is added to saved objects to be linked with one or more workspaces. As a result, saved objects such as dashboards and visualizations are only accessible within their designated workspaces. + +The following saved object shows a dashboard object associated with the workspace `M5NqCu`: + +```typescript +{ + type: "dashboard", + id: "da123f20-6680-11ee-93fa-df944ec23359", + workspaces: ["M5NqCu"] +} +``` +{% include copy-curl.html %} + +Saved objects support association with multiple workspaces, facilitating cross-team collaboration and resource sharing. This feature is useful when an object is relevant to multiple teams, projects, or use cases. + +The following example shows a data source object linked to multiple workspaces: + +```typescript +{ + type: "data-source", + id: "da123f20-6680-11ee-93fa-df944ec23359", + workspaces: ["M5NqCu", "<TeamA-workspace-id>", "<Analytics-workspace-id>"] +} +``` +{% include copy-curl.html %} + +## Non-workspace saved objects + +Not all saved objects in OpenSearch Dashboards are associated with a workspace. Some objects operate independently of the workspace framework. These objects lack `workspace` attributes and serve system-wide functions. For example, the global user interface settings object manages configurations affecting the entire OpenSearch Dashboards interface in order to maintain consistent functionality across all workspaces. + +This dual approach allows OpenSearch Dashboards to balance granular, context-specific customization with overall system consistency. + +## Enabling the Workspace feature + +In your `opensearch_dashboards.yml` file, set the following option: + +```yaml +workspace.enabled: true +uiSettings: + overrides: + "home:useNewHomePage": true +``` +{% include copy-curl.html %} + +If your cluster has the Security plugin installed, then multi-tenancy must be disabled to avoid conflicts with similar workspaces: + +```yaml +opensearch_security.multitenancy.enabled: false +``` +{% include copy-curl.html %} diff --git a/_data-prepper/pipelines/configuration/processors/anomaly-detector.md b/_data-prepper/pipelines/configuration/processors/anomaly-detector.md index 9628bb6caf..ba574bdf7d 100644 --- a/_data-prepper/pipelines/configuration/processors/anomaly-detector.md +++ b/_data-prepper/pipelines/configuration/processors/anomaly-detector.md @@ -53,6 +53,7 @@ You can configure `random_cut_forest` mode with the following options. | `sample_size` | `256` | 100--2500 | The sample size used in the ML algorithm. | | `time_decay` | `0.1` | 0--1.0 | The time decay value used in the ML algorithm. Used as the mathematical expression `timeDecay` divided by `SampleSize` in the ML algorithm. | | `type` | `metrics` | N/A | The type of data sent to the algorithm. | +| `output_after` | 32 | N/A | Specifies the number of events to process before outputting any detected anomalies. | | `version` | `1.0` | N/A | The algorithm version number. | ## Usage diff --git a/_data-prepper/pipelines/configuration/processors/aws-lambda.md b/_data-prepper/pipelines/configuration/processors/aws-lambda.md new file mode 100644 index 0000000000..bd167996a1 --- /dev/null +++ b/_data-prepper/pipelines/configuration/processors/aws-lambda.md @@ -0,0 +1,94 @@ +--- +layout: default +title: aws_lambda +parent: Processors +grand_parent: Pipelines +nav_order: 10 +--- + +# aws_lambda integration for Data Prepper + +The [AWS Lambda](https://aws.amazon.com/lambda/) integration allows developers to use serverless computing capabilities within their Data Prepper pipelines for flexible event processing and data routing. + +## AWS Lambda processor configuration + +The `aws_lambda` processor enables invocation of an AWS Lambda function within your Data Prepper pipeline in order to process events. It supports both synchronous and asynchronous invocations based on your use case. + +## Configuration fields + +You can configure the processor using the following configuration options. + +Field | Type | Required | Description +-------------------- | ------- | -------- | ---------------------------------------------------------------------------- +`function_name` | String | Required | The name of the AWS Lambda function to invoke. +`invocation_type` | String | Required | Specifies the invocation type, either `request-response` or `event`. Default is `request-response`. +`aws.region` | String | Required | The AWS Region in which the Lambda function is located. +`aws.sts_role_arn` | String | Optional | The Amazon Resource Name (ARN) of the role to assume before invoking the Lambda function. +`max_retries` | Integer | Optional | The maximum number of retries for failed invocations. Default is `3`. +`batch` | Object | Optional | The batch settings for the Lambda invocations. Default is `key_name = "events"`. Default threshold is `event_count=100`, `maximum_size="5mb"`, and `event_collect_timeout = 10s`. +`lambda_when` | String | Optional | A conditional expression that determines when to invoke the Lambda processor. +`response_codec` | Object | Optional | A codec configuration for parsing Lambda responses. Default is `json`. +`tags_on_match_failure` | List | Optional | A list of tags to add to events when Lambda matching fails or encounters an unexpected error. +`sdk_timeout` | Duration| Optional | Configures the SDK's client connection timeout period. Default is `60s`. +`response_events_match` | Boolean | Optional | Specifies how Data Prepper interprets and processes Lambda function responses. Default is `false`. + +#### Example configuration + +``` +processors: + - aws_lambda: + function_name: "my-lambda-function" + invocation_type: "request-response" + response_events_match: false + aws: + region: "us-east-1" + sts_role_arn: "arn:aws:iam::123456789012:role/my-lambda-role" + max_retries: 3 + batch: + key_name: "events" + threshold: + event_count: 100 + maximum_size: "5mb" + event_collect_timeout: PT10S + lambda_when: "event['status'] == 'process'" + +``` +{% include copy-curl.html %} + +## Usage + +The processor supports the following invocation types: + +- `request-response`: The processor waits for Lambda function completion before proceeding. +- `event`: The function is triggered asynchronously without waiting for a response. +- `batch`: When enabled, events are aggregated and sent in bulk to optimize Lambda invocations. Batch thresholds control the event count, size limit, and timeout. +- `codec`: JSON is used for both request and response codecs. Lambda must return JSON array outputs. +- `tags_on_match_failure`: Custom tags can be applied to events when Lambda processing fails or encounters unexpected issues. + +## Behavior + +When configured for batching, the AWS Lambda processor groups multiple events into a single request. This grouping is governed by batch thresholds, which can be based on the event count, size limit, or timeout. The processor then sends the entire batch to the Lambda function as a single payload. + +## Lambda response handling + +The `response_events_match` setting defines how Data Prepper handles the relationship between batch events sent to Lambda and the response received: + +- `true`: Lambda returns a JSON array with results for each batched event. Data Prepper maps this array back to its corresponding original event, ensuring that each event in the batch gets the corresponding part of the response from the array. +- `false`: Lambda returns one or more events for the entire batch. Response events are not correlated with the original events. Original event metadata is not preserved in the response events. For example, when `response_events_match` is set to `true`, the Lambda function is expected to return the same number of response events as the number of original requests, maintaining the original order. + +## Limitations + +Note the following limitations: + +- Payload limitation: 6 MB payload limit +- Response codec: JSON-only codec support + +## Integration testing + +Integration tests for this plugin are executed separately from the main Data Prepper build process. Use the following Gradle command to run these tests: + +``` +./gradlew :data-prepper-plugins:aws-lambda:integrationTest -Dtests.processor.lambda.region="us-east-1" -Dtests.processor.lambda.functionName="lambda_test_function" -Dtests.processor.lambda.sts_role_arn="arn:aws:iam::123456789012:role/dataprepper-role +``` + +{% include copy-curl.html %} diff --git a/_data-prepper/pipelines/configuration/processors/copy-values.md b/_data-prepper/pipelines/configuration/processors/copy-values.md index f654e6f027..20d6a4a2c7 100644 --- a/_data-prepper/pipelines/configuration/processors/copy-values.md +++ b/_data-prepper/pipelines/configuration/processors/copy-values.md @@ -14,44 +14,126 @@ The `copy_values` processor copies values within an event and is a [mutate event You can configure the `copy_values` processor with the following options. -| Option | Required | Description | -:--- | :--- | :--- -| `entries` | Yes | A list of entries to be copied in an event. | -| `from_key` | Yes | The key of the entry to be copied. | -| `to_key` | Yes | The key of the new entry to be added. | -| `overwrite_if_to_key_exists` | No | When set to `true`, the existing value is overwritten if `key` already exists in the event. The default value is `false`. | +| Option | Required | Type | Description | +:--- | :--- | :--- | :--- +| `entries` | Yes | [entry](#entry) | A list of entries to be copied in an event. See [entry](#entry) for more information. | +| `from_list` | No | String | The key for the list of objects to be copied. | +| `to_list` | No | String | The key for the new list to be added. | +| `overwrite_if_to_list_exists` | No | Boolean | When set to `true`, the existing value is overwritten if the `key` specified by `to_list` already exists in the event. Default is `false`. | + +## entry + +For each entry, you can configure the following options. + +| Option | Required | Type | Description | +:--- | :--- | :--- | :--- +| `from_key` | Yes | String | The key for the entry to be copied. | +| `to_key` | Yes | String | The key for the new entry to be added. | +| `overwrite_if_to_key_exists` | No | Boolean | When set to `true`, the existing value is overwritten if the `key` already exists in the event. Default is `false`. | + ## Usage -To get started, create the following `pipeline.yaml` file: +The following examples show you how to use the `copy_values` processor. + +### Example: Copy values and skip existing fields + +The following example shows you how to configure the processor to copy values and skip existing fields: + +```yaml +... + processor: + - copy_values: + entries: + - from_key: "message1" + to_key: "message2" + - from_key: "message1" + to_key: "message3" +... +``` +{% include copy.html %} + +When the input event contains the following data: + +```json +{"message1": "hello", "message2": "bye"} +``` + +The processor copies "message1" to "message3" but not to "message2" because "message2" already exists. The processed event contains the following data: + +```json +{"message1": "hello", "message2": "bye", "message3": "hello"} +``` + +### Example: Copy values with overwrites + +The following example shows you how to configure the processor to copy values: ```yaml -pipeline: - source: - ... - .... +... processor: - copy_values: entries: - - from_key: "message" - to_key: "newMessage" - overwrite_if_to_key_exists: true - sink: + - from_key: "message1" + to_key: "message2" + overwrite_if_to_key_exists: true + - from_key: "message1" + to_key: "message3" +... ``` {% include copy.html %} -Next, create a log file named `logs_json.log` and replace the `path` in the file source of your `pipeline.yaml` file with that filepath. For more information, see [Configuring Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/getting-started/#2-configuring-data-prepper). +When the input event contains the following data: + +```json +{"message1": "hello", "message2": "bye"} +``` -For example, before you run the `copy_values` processor, if the `logs_json.log` file contains the following event record: +The processor copies "message1" to both "message2" and "message3", overwriting the existing value in "message2". The processed event contains the following data: ```json -{"message": "hello"} +{"message1": "hello", "message2": "hello", "message3": "hello"} ``` -When you run this processor, it parses the message into the following output: +### Example: Selectively copy values between two lists of objects + +The following example shows you how to configure the processor to copy values between lists: + +```yaml +... + processor: + - copy_values: + from_list: mylist + to_list: newlist + entries: + - from_key: name + to_key: fruit_name +... +``` +{% include copy.html %} + +When the input event contains the following data: ```json -{"message": "hello", "newMessage": "hello"} +{ + "mylist": [ + {"name": "apple", "color": "red"}, + {"name": "orange", "color": "orange"} + ] +} ``` -If `newMessage` already exists, its existing value is overwritten with `value`. +The processed event contains a `newlist` with selectively copied fields: + +```json +{ + "newlist": [ + {"fruit_name": "apple"}, + {"fruit_name": "orange"} + ], + "mylist": [ + {"name": "apple", "color": "red"}, + {"name": "orange", "color": "orange"} + ] +} +``` diff --git a/_data-prepper/pipelines/configuration/sinks/aws-lambda.md b/_data-prepper/pipelines/configuration/sinks/aws-lambda.md new file mode 100644 index 0000000000..d8c00bdb16 --- /dev/null +++ b/_data-prepper/pipelines/configuration/sinks/aws-lambda.md @@ -0,0 +1,73 @@ +--- +layout: default +title: aws_lambda +parent: Sinks +grand_parent: Pipelines +nav_order: 10 +--- + +---------------------------------------------------------------------------------------- +# `aws_lambda` sink for Data Prepper + +This page explains how to configure and use [AWS Lambda](https://aws.amazon.com/lambda/) with Data Prepper, enabling Lambda functions to serve as both processors and sinks. + +## `aws_lambda` sink + +Configure the Lambda sink using the following parameters. + +Field | Type | Required | Description +--------------------| ------- | -------- | ---------------------------------------------------------------------------- +`function_name` | String | Yes | The name of the AWS Lambda function to invoke. +`invocation_type` | String | No | Specifies the invocation type. Default is `event`. +`aws.region` | String | Yes | The AWS Region in which the Lambda function is located. +`aws.sts_role_arn` | String | No | The Amazon Resource Name (ARN) of the role to assume before invoking the Lambda function. +`max_retries` | Integer | No | The maximum number of retries if the invocation fails. Default is `3`. +`batch` | Object | No | Optional batch settings for Lambda invocations. Default is `key_name = events`. Default threshold is `event_count=100`, `maximum_size="5mb"`, and `event_collect_timeout = 10s`. +`lambda_when` | String | No | A conditional expression that determines when to invoke the Lambda sink. +`dlq` | Object | No | The dead-letter queue (DLQ) configuration for failed invocations. + +#### Example configuration + +``` +sink: + - aws_lambda: + function_name: "my-lambda-sink" + invocation_type: "event" + aws: + region: "us-west-2" + sts_role_arn: "arn:aws:iam::123456789012:role/my-lambda-sink-role" + max_retries: 5 + batch: + key_name: "events" + threshold: + event_count: 50 + maximum_size: "3mb" + event_collect_timeout: PT5S + lambda_when: "event['type'] == 'log'" + dlq: + region: "us-east-1" + sts_role_arn: "arn:aws:iam::123456789012:role/my-sqs-role" + bucket: "<<your-dlq-bucket-name>>" +``` +{% include copy-curl.html %} + +## Usage + +The invocation types are as follows: + +- `event` (Default): Executes functions asynchronously without waiting for responses. +- `request-response` (Sink only): Executes functions synchronously, though responses are not processed. +- `batch`: Automatically groups events based on configured thresholds. +- `dlq`: Supports the DLQ configuration for failed invocations after retry attempts. + +Data Prepper components use an AWS Identity and Access Management (IAM) role assumption, `aws.sts_role_arn`, for secure Lambda function invocation and respect Lambda's concurrency limits during event processing. For more information, see the [AWS Lambda documentation](https://docs.aws.amazon.com/lambda). +{: .note} + +## Developer guide + +Integration tests must be executed separately from the main Data Prepper build. Execute them with the following command: + +``` +./gradlew :data-prepper-plugins:aws-lambda:integrationTest -Dtests.sink.lambda.region="us-east-1" -Dtests.sink.lambda.functionName="lambda_test_function" -Dtests.sink.lambda.sts_role_arn="arn:aws:iam::123456789012:role/dataprepper-role +``` +{% include copy-curl.html %} diff --git a/_data-prepper/pipelines/configuration/sources/kinesis.md b/_data-prepper/pipelines/configuration/sources/kinesis.md new file mode 100644 index 0000000000..659ed2afe1 --- /dev/null +++ b/_data-prepper/pipelines/configuration/sources/kinesis.md @@ -0,0 +1,168 @@ +--- +layout: default +title: kinesis +parent: Sources +grand_parent: Pipelines +nav_order: 45 +--- + +# kinesis + +You can use the Data Prepper `kinesis` source to ingest records from one or more [Amazon Kinesis Data Streams](https://aws.amazon.com/kinesis/data-streams/). + +## Usage + +The following example pipeline specifies Kinesis as a source. The pipeline ingests data from multiple Kinesis data streams named `stream1` and `stream2` and sets the `initial_position` to indicate the starting point for reading the stream records: + +```yaml +version: "2" +kinesis-pipeline: + source: + kinesis: + streams: + - stream_name: "stream1" + initial_position: "LATEST" + - stream_name: "stream2" + initial_position: "LATEST" + aws: + region: "us-west-2" + sts_role_arn: "arn:aws:iam::123456789012:role/my-iam-role" +``` + +## Configuration options + +The `kinesis` source supports the following configuration options. + +Option | Required | Type | Description +:--- |:---------|:---------| :--- +`aws` | Yes | AWS | Specifies the AWS configuration. See [`aws`](#aws). +`acknowledgments` | No | Boolean | When set to `true`, enables the `kinesis` source to receive [end-to-end acknowledgments]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/pipelines#end-to-end-acknowledgments) when events are received by OpenSearch sinks. +`streams` | Yes | List | Configures a list of multiple Kinesis data streams that the `kinesis` source uses to read records. You can configure up to four streams. See [Streams](#streams). +`codec` | Yes | Codec | Specifies the [codec](#codec) to apply. +`buffer_timeout` | No | Duration | Sets the amount of time allowed for writing events to the Data Prepper buffer before timeout occurs. Any events that the source cannot write to the buffer during the specified amount of time are discarded. Default is `1s`. +`records_to_accumulate` | No | Integer | Determines the number of messages that accumulate before being written to the buffer. Default is `100`. +`consumer_strategy` | No | String | Selects the consumer strategy to use for ingesting Kinesis data streams. The default is `fan-out`, but `polling` can also be used. If `polling` is enabled, the additional configuration is required. +`polling` | No | polling | See [polling](#polling). + +### Streams + +You can use the following options in the `streams` array. + +Option | Required | Type | Description +:--- |:---------| :--- | :--- +`stream_name` | Yes | String | Defines the name of each Kinesis data stream. +`initial_position` | No | String | Sets the `initial_position` to determine at what point the `kinesis` source starts reading stream records. Use `LATEST` to start from the most recent record or `EARLIEST` to start from the beginning of the stream. Default is `LATEST`. +`checkpoint_interval` | No | Duration | Configure the `checkpoint_interval` to periodically checkpoint Kinesis data streams and avoid duplication of record processing. Default is `PT2M`. +`compression` | No | String | Specifies the compression format. To decompress records added by a [CloudWatch Logs Subscription Filter](https://docs.aws.amazon.com/AmazonCloudWatch/latest/logs/SubscriptionFilters.html) to Kinesis, use the `gzip` compression format. + +## codec + +The `codec` determines how the `kinesis` source parses each Kinesis stream record. For increased and more efficient performance, you can use [codec combinations]({{site.url}}{{site.baseurl}}/data-prepper/common-use-cases/codec-processor-combinations/) with certain processors. + +### json codec + +The `json` codec parses each single line as a single JSON object from a JSON array and then creates a Data Prepper event for each object in the array. It can be used for parsing nested CloudWatch events into individual log entries. +It also supports the below configuration to use with this codec. + +Option | Required | Type | Description +:--- | :--- |:--------| :--- +`key_name` | No | String | The name of the input field from which to extract the JSON array and create Data Prepper events. +`include_keys` | No | List | The list of input fields to be extracted and added as additional fields in the Data Prepper event. +`include_keys_metadata` | No | List | The list of input fields to be extracted and added to the Data Prepper event metadata object. + +### `newline` codec + +The `newline` codec parses each Kinesis stream record as a single log event, making it ideal for processing single-line records. It also works well with the [`parse_json` processor]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/parse-json/) to parse each line. + +You can use the following options to configure the `newline` codec. + +Option | Required | Type | Description +:--- | :--- |:--------| :--- +`skip_lines` | No | Integer | Sets the number of lines to skip before creating events. You can use this configuration to skip common header rows. Default is `0`. +`header_destination` | No | String | Defines a key value to assign to the header line of the stream event. If this option is specified, then each event will contain a `header_destination` field. + +### polling + +When the `consumer_strategy` is set to `polling`, the `kinesis` source uses a polling-based approach to read records from the Kinesis data streams, instead of the default `fan-out` approach. + +Option | Required | Type | Description +:--- | :--- |:--------| :--- +`max_polling_records` | No | Integer | Sets the number of records to fetch from Kinesis during a single call. +`idle_time_between_reads` | No | Duration | Defines the amount of idle time between calls. + +### aws + +You can use the following options in the `aws` configuration. + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +`region` | No | String | Sets the AWS Region to use for credentials. Defaults to the [standard SDK behavior for determining the Region](https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/region-selection.html). +`sts_role_arn` | No | String | Defines the AWS Security Token Service (AWS STS) role to assume for requests to Amazon Kinesis Data Streams and Amazon DynamoDB. Defaults to `null`, which uses the [standard SDK behavior for credentials](https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/credentials.html). +`aws_sts_header_overrides` | No | Map | Defines a map of header overrides that the AWS Identity and Access Management (IAM) role assumes for the sink plugin. + +## Exposed metadata attributes + +The `kinesis` source adds the following metadata to each processed event. You can access the metadata attributes using the [expression syntax `getMetadata` function]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/get-metadata/). + +- `stream_name`: Contains the name of the Kinesis data stream from which the event was obtained. + +## Permissions + +The following minimum permissions are required in order to run `kinesis` as a source: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "kinesis:DescribeStream", + "kinesis:DescribeStreamConsumer", + "kinesis:DescribeStreamSummary", + "kinesis:GetRecords", + "kinesis:GetShardIterator", + "kinesis:ListShards", + "kinesis:ListStreams", + "kinesis:ListStreamConsumers", + "kinesis:RegisterStreamConsumer", + "kinesis:SubscribeToShard" + ], + "Resource": [ + "arn:aws:kinesis:us-east-1:{account-id}:stream/stream1", + "arn:aws:kinesis:us-east-1:{account-id}:stream/stream2" + ] + }, + { + "Sid": "allowCreateTable", + "Effect": "Allow", + "Action": [ + "dynamodb:CreateTable", + "dynamodb:PutItem", + "dynamodb:DescribeTable", + "dynamodb:DeleteItem", + "dynamodb:GetItem", + "dynamodb:Scan", + "dynamodb:UpdateItem", + "dynamodb:Query" + ], + "Resource": [ + "arn:aws:dynamodb:us-east-1:{account-id}:table/kinesis-pipeline" + ] + } + ] +} +``` + +The `kinesis` source uses a DynamoDB table for ingestion coordination among multiple workers, so you need DynamoDB permissions. + +## Metrics + +The `kinesis` source includes the following metrics. + +### Counters + +* `recordsProcessed`: Counts the number of processed stream records. +* `recordProcessingErrors`: Counts the number of stream record processing errors. +* `acknowledgementSetSuccesses`: Counts the number of processed stream records that were successfully added to the sink. +* `acknowledgementSetFailures`: Counts the number of processed stream records that failed to be added to the sink. diff --git a/_data-prepper/pipelines/configuration/sources/s3.md b/_data-prepper/pipelines/configuration/sources/s3.md index db92718a36..7ca27ee500 100644 --- a/_data-prepper/pipelines/configuration/sources/s3.md +++ b/_data-prepper/pipelines/configuration/sources/s3.md @@ -104,7 +104,7 @@ Option | Required | Type | Description `s3_select` | No | [s3_select](#s3_select) | The Amazon S3 Select configuration. `scan` | No | [scan](#scan) | The S3 scan configuration. `delete_s3_objects_on_read` | No | Boolean | When `true`, the S3 scan attempts to delete S3 objects after all events from the S3 object are successfully acknowledged by all sinks. `acknowledgments` should be enabled when deleting S3 objects. Default is `false`. -`workers` | No | Integer | Configures the number of worker threads that the source uses to read data from S3. Leave this value as the default unless your S3 objects are less than 1 MB in size. Performance may decrease for larger S3 objects. This setting affects SQS-based sources and S3-Scan sources. Default is `1`. +`workers` | No | Integer | Configures the number of worker threads (1--10) that the source uses to read data from S3. Leave this value as the default unless your S3 objects are less than 1 MB in size. Performance may decrease for larger S3 objects. This setting affects SQS-based sources and S3-Scan sources. Default is `1`. diff --git a/_data-prepper/pipelines/expression-syntax.md b/_data-prepper/pipelines/expression-syntax.md index 383b54c19b..07f68ee58e 100644 --- a/_data-prepper/pipelines/expression-syntax.md +++ b/_data-prepper/pipelines/expression-syntax.md @@ -30,6 +30,9 @@ The following table lists the supported operators. Operators are listed in order |----------------------|-------------------------------------------------------|---------------| | `()` | Priority expression | Left to right | | `not`<br> `+`<br> `-`| Unary logical NOT<br>Unary positive<br>Unary negative | Right to left | +| `*`, `/` | Multiplication and division operators | Left to right | +| `+`, `-` | Addition and subtraction operators | Left to right | +| `+` | String concatenation operator | Left to right | | `<`, `<=`, `>`, `>=` | Relational operators | Left to right | | `==`, `!=` | Equality operators | Left to right | | `and`, `or` | Conditional expression | Left to right | @@ -78,7 +81,6 @@ Conditional expressions allow you to combine multiple expressions or values usin <Any> or <Any> not <Any> ``` -{% include copy-curl.html %} The following are some example conditional expressions: @@ -91,9 +93,64 @@ not /status_code in {200, 202} ``` {% include copy-curl.html %} +### Arithmetic expressions + +Arithmetic expressions enable basic mathematical operations like addition, subtraction, multiplication, and division. These expressions can be combined with conditional expressions to create more complex conditional statements. The available arithmetic operators are +, -, *, and /. The syntax for using the arithmetic operators is as follows: + +``` +<Any> + <Any> +<Any> - <Any> +<Any> * <Any> +<Any> / <Any> +``` + +The following are example arithmetic expressions: + +``` +/value + length(/message) +/bytes / 1024 +/value1 - /value2 +/TimeInSeconds * 1000 +``` +{% include copy-curl.html %} + +The following are some example arithmetic expressions used in conditional expressions : + +``` +/value + length(/message) > 200 +/bytes / 1024 < 10 +/value1 - /value2 != /value3 + /value4 +``` +{% include copy-curl.html %} + +### String concatenation expressions + +String concatenation expressions enable you to combine strings to create new strings. These concatenated strings can also be used within conditional expressions. The syntax for using string concatenation is as follows: + +``` +<String Variable or String Literal> + <String Variable or String Literal> +``` + +The following are example string concatenation expressions: + +``` +/name + "suffix" +"prefix" + /name +"time of " + /timeInMs + " ms" +``` +{% include copy-curl.html %} + +The following are example string concatenation expressions that can be used in conditional expressions: + +``` +/service + ".com" == /url +"www." + /service != /url +``` +{% include copy-curl.html %} + ### Reserved symbols -Reserved symbols are symbols that are not currently used in the expression syntax but are reserved for possible future functionality or extensions. Reserved symbols include `^`, `*`, `/`, `%`, `+`, `-`, `xor`, `=`, `+=`, `-=`, `*=`, `/=`, `%=`, `++`, `--`, and `${<text>}`. +Certain symbols, such as ^, %, xor, =, +=, -=, *=, /=, %=, ++, --, and ${<text>}, are reserved for future functionality or extensions. Reserved symbols include `^`, `%`, `xor`, `=`, `+=`, `-=`, `*=`, `/=`, `%=`, `++`, `--`, and `${<text>}`. ## Syntax components @@ -170,6 +227,9 @@ White space is optional around relational operators, regex equality operators, e | `()` | Priority expression | Yes | `/a==(/b==200)`<br>`/a in ({200})` | `/status in({200})` | | `in`, `not in` | Set operators | Yes | `/a in {200}`<br>`/a not in {400}` | `/a in{200, 202}`<br>`/a not in{400}` | | `<`, `<=`, `>`, `>=` | Relational operators | No | `/status < 300`<br>`/status>=300` | | +| `+` | String concatenation operator | No | `/status_code + /message + "suffix"` +| `+`, `-` | Arithmetic addition and subtraction operators | No | `/status_code + length(/message) - 2` +| `*`, `/` | Multiplication and division operators | No | `/status_code * length(/message) / 3` | `=~`, `!~` | Regex equality operators | No | `/msg =~ "^\w*$"`<br>`/msg=~"^\w*$"` | | | `==`, `!=` | Equality operators | No | `/status == 200`<br>`/status_code==200` | | | `and`, `or`, `not` | Conditional operators | Yes | `/a<300 and /b>200` | `/b<300and/b>200` | diff --git a/_data/top_nav.yml b/_data/top_nav.yml index 51d8138680..6552d90359 100644 --- a/_data/top_nav.yml +++ b/_data/top_nav.yml @@ -63,6 +63,8 @@ items: url: /docs/latest/clients/ - label: Benchmark url: /docs/latest/benchmark/ + - label: Migration Assistant + url: /docs/latest/migration-assistant/ - label: Platform url: /platform/index.html children: diff --git a/_data/versions.json b/_data/versions.json index c14e91fa0c..6fe1c59e6b 100644 --- a/_data/versions.json +++ b/_data/versions.json @@ -1,10 +1,11 @@ { - "current": "2.17", + "current": "2.18", "all": [ - "2.17", + "2.18", "1.3" ], "archived": [ + "2.17", "2.16", "2.15", "2.14", @@ -26,7 +27,7 @@ "1.1", "1.0" ], - "latest": "2.17" + "latest": "2.18" } diff --git a/_field-types/mapping-parameters/analyzer.md b/_field-types/mapping-parameters/analyzer.md new file mode 100644 index 0000000000..32b26da1e0 --- /dev/null +++ b/_field-types/mapping-parameters/analyzer.md @@ -0,0 +1,90 @@ +--- +layout: default +title: Analyzer +parent: Mapping parameters +grand_parent: Mapping and field types +nav_order: 5 +has_children: false +has_toc: false +--- + +# Analyzer + +The `analyzer` mapping parameter is used to define the text analysis process that applies to a text field during both index and search operations. + +The key functions of the `analyzer` mapping parameter are: + +1. **Tokenization:** The analyzer determines how the text is broken down into individual tokens (words, numbers) that can be indexed and searched. Each generated token must not exceed 32,766 bytes in order to avoid indexing failures. + +2. **Normalization:** The analyzer can apply various normalization techniques, such as converting text to lowercase, removing stop words, and stemming/lemmatizing words. + +3. **Consistency:** By defining the same analyzer for both index and search operations, you ensure that the text analysis process is consistent, which helps improve the relevance of search results. + +4. **Customization:** OpenSearch allows you to define custom analyzers by specifying the tokenizer, character filters, and token filters to be used. This gives you fine-grained control over the text analysis process. + +For information about specific analyzer parameters, such as `analyzer`, `search_analyzer`, or `search_quote_analyzer`, see [Search analyzers]({{site.url}}{{site.baseurl}}/analyzers/search-analyzers/). +{: .note} + +------------ + +## Example + +The following example configuration defines a custom analyzer called `my_custom_analyzer`: + +```json +PUT my_index +{ + "settings": { + "analysis": { + "analyzer": { + "my_custom_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_stop_filter", + "my_stemmer" + ] + } + }, + "filter": { + "my_stop_filter": { + "type": "stop", + "stopwords": ["the", "a", "and", "or"] + }, + "my_stemmer": { + "type": "stemmer", + "language": "english" + } + } + } + }, + "mappings": { + "properties": { + "my_text_field": { + "type": "text", + "analyzer": "my_custom_analyzer", + "search_analyzer": "standard", + "search_quote_analyzer": "my_custom_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +In this example, the `my_custom_analyzer` uses the standard tokenizer, converts all tokens to lowercase, applies a custom stop word filter, and applies an English stemmer. + +You can then map a text field so that it uses this custom analyzer for both index and search operations: + +```json +"mappings": { + "properties": { + "my_text_field": { + "type": "text", + "analyzer": "my_custom_analyzer" + } + } +} +``` +{% include copy-curl.html %} diff --git a/_field-types/mapping-parameters/boost.md b/_field-types/mapping-parameters/boost.md new file mode 100644 index 0000000000..f1648a861d --- /dev/null +++ b/_field-types/mapping-parameters/boost.md @@ -0,0 +1,50 @@ +--- +layout: default +title: Boost +parent: Mapping parameters +grand_parent: Mapping and field types +nav_order: 10 +has_children: false +has_toc: false +--- + +# Boost + +The `boost` mapping parameter is used to increase or decrease the relevance score of a field during search queries. It allows you to apply more or less weight to specific fields when calculating the overall relevance score of a document. + +The `boost` parameter is applied as a multiplier to the score of a field. For example, if a field has a `boost` value of `2`, then the score contribution of that field is doubled. Conversely, a `boost` value of `0.5` would halve the score contribution of that field. + +----------- + +## Example + +The following is an example of how you can use the `boost` parameter in an OpenSearch mapping: + +```json +PUT my-index1 +{ + "mappings": { + "properties": { + "title": { + "type": "text", + "boost": 2 + }, + "description": { + "type": "text", + "boost": 1 + }, + "tags": { + "type": "keyword", + "boost": 1.5 + } + } + } +} +``` +{% include copy-curl.html %} + +In this example, the `title` field has a boost of `2`, which means that it contributes twice as much to the overall relevance score than the description field (which has a boost of `1`). The `tags` field has a boost of `1.5`, so it contributes one and a half times more than the description field. + +The `boost` parameter is particularly useful when you want to apply more weight to certain fields. For example, you might want to boost the `title` field more than the `description` field because the title may be a better indicator of the document's relevance. + +The `boost` parameter is a multiplicative factor---not an additive one. This means that a field with a higher boost value will have a disproportionately large effect on the overall relevance score as compared to fields with lower boost values. When using the `boost` parameter, it is recommended that you start with small values (1.5 or 2) and test the effect on your search results. Overly high boost values can skew the relevance scores and lead to unexpected or undesirable search results. diff --git a/_field-types/mapping-parameters/coerce.md b/_field-types/mapping-parameters/coerce.md new file mode 100644 index 0000000000..3cf844897a --- /dev/null +++ b/_field-types/mapping-parameters/coerce.md @@ -0,0 +1,100 @@ +--- +layout: default +title: Coerce +parent: Mapping parameters +grand_parent: Mapping and field types +nav_order: 15 +has_children: false +has_toc: false +--- + +# Coerce + +The `coerce` mapping parameter controls how values are converted to the expected field data type during indexing. This parameter lets you verify that your data is formatted and indexed properly, following the expected field types. This improves the accuracy of your search results. + +--- + +## Examples + +The following examples demonstrate how to use the `coerce` mapping parameter. + +#### Indexing a document with `coerce` enabled + +```json +PUT products +{ + "mappings": { + "properties": { + "price": { + "type": "integer", + "coerce": true + } + } + } +} + +PUT products/_doc/1 +{ + "name": "Product A", + "price": "19.99" +} +``` +{% include copy-curl.html %} + +In this example, the `price` field is defined as an `integer` type with `coerce` set to `true`. When indexing the document, the string value `19.99` is coerced to the integer `19`. + +#### Indexing a document with `coerce` disabled + +```json +PUT orders +{ + "mappings": { + "properties": { + "quantity": { + "type": "integer", + "coerce": false + } + } + } +} + +PUT orders/_doc/1 +{ + "item": "Widget", + "quantity": "10" +} +``` +{% include copy-curl.html %} + +In this example, the `quantity` field is defined as an `integer` type with `coerce` set to `false`. When indexing the document, the string value `10` is not coerced, and the document is rejected because of the type mismatch. + +#### Setting the index-level coercion setting + +```json +PUT inventory +{ + "settings": { + "index.mapping.coerce": false + }, + "mappings": { + "properties": { + "stock_count": { + "type": "integer", + "coerce": true + }, + "sku": { + "type": "keyword" + } + } + } +} + +PUT inventory/_doc/1 +{ + "sku": "ABC123", + "stock_count": "50" +} +``` +{% include copy-curl.html %} + +In this example, the index-level `index.mapping.coerce` setting is set to `false`, which disables coercion for the index. However, the `stock_count` field overrides this setting and enables coercion for this specific field. diff --git a/_field-types/mapping-parameters/copy-to.md b/_field-types/mapping-parameters/copy-to.md new file mode 100644 index 0000000000..b029f814b5 --- /dev/null +++ b/_field-types/mapping-parameters/copy-to.md @@ -0,0 +1,109 @@ +--- +layout: default +title: Copy_to +parent: Mapping parameters +grand_parent: Mapping and field types +nav_order: 20 +has_children: false +has_toc: false +--- + +# Copy_to + +The `copy_to` parameter allows you to copy the values of multiple fields into a single field. This parameter can be useful if you often search across multiple fields because it allows you to search the group field instead. + +Only the field value is copied and not the terms resulting from the analysis process. The original `_source` field remains unmodified, and the same value can be copied to multiple fields using the `copy_to` parameter. However, recursive copying through intermediary fields is not supported; instead, use `copy_to` directly from the originating field to multiple target fields. + +--- + +## Examples + +The following example uses the `copy_to` parameter to search for products by their name and description and copy those values into a single field: + +```json +PUT my-products-index +{ + "mappings": { + "properties": { + "name": { + "type": "text", + "copy_to": "product_info" + }, + "description": { + "type": "text", + "copy_to": "product_info" + }, + "product_info": { + "type": "text" + }, + "price": { + "type": "float" + } + } + } +} + +PUT my-products-index/_doc/1 +{ + "name": "Wireless Headphones", + "description": "High-quality wireless headphones with noise cancellation", + "price": 99.99 +} + +PUT my-products-index/_doc/2 +{ + "name": "Bluetooth Speaker", + "description": "Portable Bluetooth speaker with long battery life", + "price": 49.99 +} +``` +{% include copy-curl.html %} + +In this example, the values from the `name` and `description` fields are copied into the `product_info` field. You can now search for products by querying the `product_info` field, as follows: + +```json +GET my-products-index/_search +{ + "query": { + "match": { + "product_info": "wireless headphones" + } + } +} +``` +{% include copy-curl.html %} + +## Response + +```json +{ + "took": 20, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 1.9061546, + "hits": [ + { + "_index": "my-products-index", + "_id": "1", + "_score": 1.9061546, + "_source": { + "name": "Wireless Headphones", + "description": "High-quality wireless headphones with noise cancellation", + "price": 99.99 + } + } + ] + } +} +``` + diff --git a/_field-types/mapping-parameters/doc-values.md b/_field-types/mapping-parameters/doc-values.md new file mode 100644 index 0000000000..9ea9364c0b --- /dev/null +++ b/_field-types/mapping-parameters/doc-values.md @@ -0,0 +1,46 @@ +--- +layout: default +title: doc_values +parent: Mapping parameters +grand_parent: Mapping and field types +nav_order: 25 +has_children: false +has_toc: false +--- + +# doc_values + +By default, OpenSearch indexes most fields for search purposes. The `doc_values ` parameter enables document-to-term lookups for operations such as sorting, aggregations, and scripting. + +The `doc_values` parameter accepts the following options. + +Option | Description +:--- | :--- +`true` | Enables `doc_values` for the field. Default is `true`. +`false` | Disables `doc_values` for the field. + +The `doc_values` parameter is not supported for use in text fields. + +--- + +## Example: Creating an index with `doc_values` enabled and disabled + +The following example request creates an index with `doc_values` enabled for one field and disabled for another: + +```json +PUT my-index-001 +{ + "mappings": { + "properties": { + "status_code": { + "type": "keyword" + }, + "session_id": { + "type": "keyword", + "doc_values": false + } + } + } +} +``` +{% include copy-curl.html %} diff --git a/_field-types/dynamic.md b/_field-types/mapping-parameters/dynamic.md similarity index 96% rename from _field-types/dynamic.md rename to _field-types/mapping-parameters/dynamic.md index 59f59bfe3d..2d48e98082 100644 --- a/_field-types/dynamic.md +++ b/_field-types/mapping-parameters/dynamic.md @@ -1,18 +1,22 @@ --- layout: default -title: Dynamic parameter -nav_order: 10 +title: Dynamic +parent: Mapping parameters +grand_parent: Mapping and field types +nav_order: 30 +has_children: false +has_toc: false redirect_from: - /opensearch/dynamic/ --- -# Dynamic parameter +# Dynamic The `dynamic` parameter specifies whether newly detected fields can be added dynamically to a mapping. It accepts the parameters listed in the following table. Parameter | Description :--- | :--- -`true` | Specfies that new fields can be added dynamically to the mapping. Default is `true`. +`true` | Specifies that new fields can be added dynamically to the mapping. Default is `true`. `false` | Specifies that new fields cannot be added dynamically to the mapping. If a new field is detected, then it is not indexed or searchable but can be retrieved from the `_source` field. `strict` | Throws an exception. The indexing operation fails when new fields are detected. `strict_allow_templates` | Adds new fields if they match predefined dynamic templates in the mapping. diff --git a/_field-types/mapping-parameters/enabled.md b/_field-types/mapping-parameters/enabled.md new file mode 100644 index 0000000000..2695ff7529 --- /dev/null +++ b/_field-types/mapping-parameters/enabled.md @@ -0,0 +1,47 @@ +--- +layout: default +title: enabled +parent: Mapping parameters +grand_parent: Mapping and field types +nav_order: 40 +has_children: false +has_toc: false +--- + +# Enabled + +The `enabled` parameter allows you to control whether OpenSearch parses the contents of a field. This parameter can be applied to the top-level mapping definition and to object fields. + +The `enabled` parameter accepts the following values. + +Parameter | Description +:--- | :--- +`true` | The field is parsed and indexed. Default is `true`. +`false` | The field is not parsed or indexed but is still retrievable from the `_source` field. When `enabled` is set to `false`, OpenSearch stores the field's value in the `_source` field but does not index or parse its contents. This can be useful for fields that you want to store but do not need to search, sort, or aggregate on. + +--- + +## Example: Using the `enabled` parameter + +In the following example request, the `session_data` field is disabled. OpenSearch stores its contents in the `_source` field but does not index or parse them: + +```json +PUT my-index-002 +{ + "mappings": { + "properties": { + "user_id": { + "type": "keyword" + }, + "last_updated": { + "type": "date" + }, + "session_data": { + "type": "object", + "enabled": false + } + } + } +} +``` +{% include copy-curl.html %} diff --git a/_field-types/mapping-parameters/index.md b/_field-types/mapping-parameters/index.md new file mode 100644 index 0000000000..ca5586bb8f --- /dev/null +++ b/_field-types/mapping-parameters/index.md @@ -0,0 +1,28 @@ +--- +layout: default +title: Mapping parameters +nav_order: 75 +has_children: true +has_toc: false +--- + +# Mapping parameters + +Mapping parameters are used to configure the behavior of index fields. For parameter use cases, see a mapping parameter's respective page. + +The following table lists OpenSearch mapping parameters. + +Parameter | Description +:--- | :--- +`analyzer` | Specifies the analyzer used to analyze string fields. Default is the `standard` analyzer, which is a general-purpose analyzer that splits text on white space and punctuation, converts to lowercase, and removes stop words. Allowed values are `standard`, `simple`, and `whitespace`. +`boost` | Specifies a field-level boost factor applied at query time. Allows you to increase or decrease the relevance score of a specific field during search queries. Default boost value is `1.0`, which means no boost is applied. Allowed values are any positive floating-point number. +`coerce` | Controls how values are converted to the expected field data type during indexing. Default value is `true`, which means that OpenSearch tries to coerce the value to the expected value type. Allowed values are `true` or `false`. +`copy_to` | Copies the value of a field to another field. There is no default value for this parameter. Optional. +`doc_values` | Specifies whether a field should be stored on disk to make sorting and aggregation faster. Default value is `true`, which means that the doc values are enabled. Allowed values are a single field name or a list of field names. +`dynamic` | Determines whether new fields should be added dynamically. Default value is `true`, which means that new fields can be added dynamically. Allowed values are `true`, `false`, or `strict`. +`enabled` | Specifies whether the field is enabled or disabled. Default value is `true`, which means that the field is enabled. Allowed values are `true` or `false`. +`format` | Specifies the date format for date fields. There is no default value for this parameter. Allowed values are any valid date format string, such as `yyyy-MM-dd` or `epoch_millis`. +`ignore_above` | Skips indexing values that exceed the specified length. Default value is `2147483647`, which means that there is no limit on the field value length. Allowed values are any positive integer. +`ignore_malformed` | Specifies whether malformed values should be ignored. Default value is `false`, which means that malformed values are not ignored. Allowed values are `true` or `false`. +`index` | Specifies whether a field should be indexed. Default value is `true`, which means that the field is indexed. Allowed values are `true`, `false`, or `not_analyzed`. +`index_options` | Specifies what information should be stored in an index for scoring purposes. Default value is `docs`, which means that only the document numbers are stored in the index. Allowed values are `docs`, `freqs`, `positions`, or `offsets`. \ No newline at end of file diff --git a/_field-types/supported-field-types/flat-object.md b/_field-types/supported-field-types/flat-object.md index c9e59710e1..65d7c6dc8e 100644 --- a/_field-types/supported-field-types/flat-object.md +++ b/_field-types/supported-field-types/flat-object.md @@ -56,7 +56,8 @@ The flat object field type supports the following queries: - [Multi-match]({{site.url}}{{site.baseurl}}/query-dsl/full-text/multi-match/) - [Query string]({{site.url}}{{site.baseurl}}/query-dsl/full-text/query-string/) - [Simple query string]({{site.url}}{{site.baseurl}}/query-dsl/full-text/simple-query-string/) -- [Exists]({{site.url}}{{site.baseurl}}/query-dsl/term/exists/) +- [Exists]({{site.url}}{{site.baseurl}}/query-dsl/term/exists/) +- [Wildcard]({{site.url}}{{site.baseurl}}/query-dsl/term/wildcard/) ## Limitations @@ -243,4 +244,4 @@ PUT /test-index/ ``` {% include copy-curl.html %} -Because `issue.number` is not part of the flat object, you can use it to aggregate and sort documents. \ No newline at end of file +Because `issue.number` is not part of the flat object, you can use it to aggregate and sort documents. diff --git a/_field-types/supported-field-types/index.md b/_field-types/supported-field-types/index.md index a43da396d5..5540e73f4e 100644 --- a/_field-types/supported-field-types/index.md +++ b/_field-types/supported-field-types/index.md @@ -30,6 +30,7 @@ IP | [`ip`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/ip/): k-NN vector | [`knn_vector`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/): Allows indexing a k-NN vector into OpenSearch and performing different kinds of k-NN search. Percolator | [`percolator`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/percolator/): Specifies to treat this field as a query. Derived | [`derived`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/derived/): Creates new fields dynamically by executing scripts on existing fields. +Star-tree | [`star_tree`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/star-tree/): Precomputes aggregations and stores them in a [star-tree index](https://docs.pinot.apache.org/basics/indexing/star-tree-index), accelerating the performance of aggregation queries. ## Arrays diff --git a/_field-types/supported-field-types/star-tree.md b/_field-types/supported-field-types/star-tree.md new file mode 100644 index 0000000000..2bfccb6632 --- /dev/null +++ b/_field-types/supported-field-types/star-tree.md @@ -0,0 +1,184 @@ +--- +layout: default +title: Star-tree +nav_order: 61 +parent: Supported field types +--- + +# Star-tree field type + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, join the discussion on the [OpenSearch forum](https://forum.opensearch.org/). +{: .warning} + +A star-tree index precomputes aggregations, accelerating the performance of aggregation queries. +If a star-tree index is configured as part of an index mapping, the star-tree index is created and maintained as data is ingested in real time. + +OpenSearch will automatically use the star-tree index to optimize aggregations if the queried fields are part of star-tree index dimension fields and the aggregations are on star-tree index metric fields. No changes are required in the query syntax or the request parameters. + +For more information, see [Star-tree index]({{site.url}}{{site.baseurl}}/search-plugins/star-tree-index/). + +## Prerequisites + +To use a star-tree index, follow the instructions in [Enabling a star-tree index]({{site.url}}{{site.baseurl}}/search-plugins/star-tree-index#enabling-a-star-tree-index). + +## Examples + +The following examples show how to use a star-tree index. + +### Star-tree index mappings + +Define star-tree index mappings in the `composite` section in `mappings`. + +The following example API request creates a corresponding star-tree index named`request_aggs`. To compute metric aggregations for `request_size` and `latency` fields with queries on `port` and `status` fields, configure the following mappings: + +```json +PUT logs +{ + "settings": { + "index.number_of_shards": 1, + "index.number_of_replicas": 0, + "index.composite_index": true + }, + "mappings": { + "composite": { + "request_aggs": { + "type": "star_tree", + "config": { + "max_leaf_docs": 10000, + "skip_star_node_creation_for_dimensions": [ + "port" + ], + "ordered_dimensions": [ + { + "name": "status" + }, + { + "name": "port" + } + ], + "metrics": [ + { + "name": "request_size", + "stats": [ + "sum", + "value_count", + "min", + "max" + ] + }, + { + "name": "latency", + "stats": [ + "sum", + "value_count", + "min", + "max" + ] + } + ] + } + } + }, + "properties": { + "status": { + "type": "integer" + }, + "port": { + "type": "integer" + }, + "request_size": { + "type": "integer" + }, + "latency": { + "type": "scaled_float", + "scaling_factor": 10 + } + } + } +} +``` + +## Star-tree index configuration options + +You can customize your star-tree implementation using the following `config` options in the `mappings` section. These options cannot be modified without reindexing. + +| Parameter | Description | +| :--- | :--- | +| `ordered_dimensions` | A [list of fields](#ordered-dimensions) based on which metrics will be aggregated in a star-tree index. Required. | +| `metrics` | A [list of metric](#metrics) fields required in order to perform aggregations. Required. | +| `max_leaf_docs` | The maximum number of star-tree documents that a leaf node can point to. After the maximum number of documents is reached, child nodes will be created based on the unique value of the next field in the `ordered_dimension` (if any). Default is `10000`. A lower value will use more storage but result in faster query performance. Inversely, a higher value will use less storage but result in slower query performance. For more information, see [Star-tree indexing structure]({{site.url}}{{site.baseurl}}/search-plugins/star-tree-index/#star-tree-index-structure). | +| `skip_star_node_creation_for_dimensions` | A list of dimensions for which a star-tree index will skip star node creation. When `true`, this reduces storage size at the expense of query performance. Default is `false`. For more information about star nodes, see [Star-tree indexing structure]({{site.url}}{{site.baseurl}}/search-plugins/star-tree-index/#star-tree-index-structure). | + + +### Ordered dimensions + +The `ordered_dimensions` parameter contains fields based on which metrics will be aggregated in a star-tree index. The star-tree index will be selected for querying only if all the fields in the query are part of the `ordered_dimensions`. + +When using the `ordered_dimesions` parameter, follow these best practices: + +- The order of dimensions matters. You can define the dimensions ordered from the highest cardinality to the lowest cardinality for efficient storage and query pruning. +- Avoid using high-cardinality fields as dimensions. High-cardinality fields adversely affect storage space, indexing throughput, and query performance. +- Currently, fields supported by the `ordered_dimensions` parameter are all [numeric field types](https://opensearch.org/docs/latest/field-types/supported-field-types/numeric/), with the exception of `unsigned_long`. For more information, see [GitHub issue #15231](https://github.com/opensearch-project/OpenSearch/issues/15231). +- Support for other field types, such as `keyword` and `ip`, will be added in future versions. For more information, see [GitHub issue #16232](https://github.com/opensearch-project/OpenSearch/issues/16232). +- A minimum of `2` and a maximum of `10` dimensions are supported per star-tree index. + +The `ordered_dimensions` parameter supports the following property. + +| Parameter | Required/Optional | Description | +| :--- | :--- | :--- | +| `name` | Required | The name of the field. The field name should be present in the `properties` section as part of the index `mapping`. Ensure that the `doc_values` setting is `enabled` for any associated fields. | + + +### Metrics + +Configure any metric fields on which you need to perform aggregations. `Metrics` are required as part of a star-tree index configuration. + +When using `metrics`, follow these best practices: + +- Currently, fields supported by `metrics` are all [numeric field types](https://opensearch.org/docs/latest/field-types/supported-field-types/numeric/), with the exception of `unsigned_long`. For more information, see [GitHub issue #15231](https://github.com/opensearch-project/OpenSearch/issues/15231). +- Supported metric aggregations include `Min`, `Max`, `Sum`, `Avg`, and `Value_count`. + - `Avg` is a derived metric based on `Sum` and `Value_count` and is not indexed when a query is run. The remaining base metrics are indexed. +- A maximum of `100` base metrics are supported per star-tree index. + +If `Min`, `Max`, `Sum`, and `Value_count` are defined as `metrics` for each field, then up to 25 such fields can be configured, as shown in the following example: + +```json +{ + "metrics": [ + { + "name": "field1", + "stats": [ + "sum", + "value_count", + "min", + "max" + ], + ..., + ..., + "name": "field25", + "stats": [ + "sum", + "value_count", + "min", + "max" + ] + } + ] +} +``` + + +#### Properties + +The `metrics` parameter supports the following properties. + +| Parameter | Required/Optional | Description | +| :--- | :--- | :--- | +| `name` | Required | The name of the field. The field name should be present in the `properties` section as part of the index `mapping`. Ensure that the `doc_values` setting is `enabled` for any associated fields. | +| `stats` | Optional | A list of metric aggregations computed for each field. You can choose between `Min`, `Max`, `Sum`, `Avg`, and `Value Count`.<br/>Default is `Sum` and `Value_count`.<br/>`Avg` is a derived metric statistic that will automatically be supported in queries if `Sum` and `Value_Count` are present as part of metric `stats`. + + +## Supported queries and aggregations + +For more information about supported queries and aggregations, see [Supported queries and aggregations for a star-tree index]({{site.url}}{{site.baseurl}}/search-plugins/star-tree-index/#supported-queries-and-aggregations). + diff --git a/_getting-started/communicate.md b/_getting-started/communicate.md index 3472270c30..773558fb21 100644 --- a/_getting-started/communicate.md +++ b/_getting-started/communicate.md @@ -28,7 +28,7 @@ curl -X GET "http://localhost:9200/_cluster/health" If you're using the Security plugin, provide the username and password in the request: ```bash -curl -X GET "http://localhost:9200/_cluster/health" -ku admin:<custom-admin-password> +curl -X GET "https://localhost:9200/_cluster/health" -ku admin:<custom-admin-password> ``` {% include copy.html %} @@ -317,4 +317,4 @@ Once a field is created, you cannot change its type. Changing a field type requi ## Next steps -- See [Ingest data into OpenSearch]({{site.url}}{{site.baseurl}}/getting-started/ingest-data/) to learn about ingestion options. \ No newline at end of file +- See [Ingest data into OpenSearch]({{site.url}}{{site.baseurl}}/getting-started/ingest-data/) to learn about ingestion options. diff --git a/_includes/cards.html b/_includes/cards.html index 6d958e61a5..5ab37b8c27 100644 --- a/_includes/cards.html +++ b/_includes/cards.html @@ -30,8 +30,14 @@ <p class="description">Measure performance metrics for your OpenSearch cluster</p> <p class="last-link">Documentation →</p> </div> + + <div class="card"> + <a href="{{site.url}}/docs/latest/migration-assistant/" class='card-link'></a> + <p class="heading">Migration Assistant</p> + <p class="description">Migrate to OpenSearch from other platforms</p> + <p class="last-link">Documentation →</p> + </div> </div> </div> - diff --git a/_includes/header.html b/_includes/header.html index 20d82c451e..32d5b14774 100644 --- a/_includes/header.html +++ b/_includes/header.html @@ -82,7 +82,7 @@ {% endif %} <div role="banner" id="top"> <div class="navigation-container"> - <a class="navigation-container--logo" href="{{ '/' | relative_url }}"> + <a class="navigation-container--logo" href="https://opensearch.org/"> OpenSearch <svg width="200" height="39" viewBox="0 0 200 39" fill="none" xmlns="http://www.w3.org/2000/svg"> <g clip-path="url(#clip0_723_1352)"> diff --git a/_ingest-pipelines/processors/append.md b/_ingest-pipelines/processors/append.md index 8101cf97c9..fe222e647f 100644 --- a/_ingest-pipelines/processors/append.md +++ b/_ingest-pipelines/processors/append.md @@ -43,6 +43,7 @@ Parameter | Required/Optional | Description | `description` | Optional | A brief description of the processor. | `if` | Optional | A condition for running the processor. | `ignore_failure` | Optional | Specifies whether the processor continues execution even if it encounters errors. If set to `true`, failures are ignored. Default is `false`. | +`allow_duplicates` | Optional | Specifies whether to append the values already contained in the field. If `true`, duplicate values are appended. Otherwise, they are skipped. | `on_failure` | Optional | A list of processors to run if the processor fails. | `tag` | Optional | An identifier tag for the processor. Useful for debugging in order to distinguish between processors of the same type. | diff --git a/_install-and-configure/additional-plugins/index.md b/_install-and-configure/additional-plugins/index.md index 87d0662442..afc17cd8b2 100644 --- a/_install-and-configure/additional-plugins/index.md +++ b/_install-and-configure/additional-plugins/index.md @@ -9,29 +9,30 @@ nav_order: 10 There are many more plugins available in addition to those provided by the standard distribution of OpenSearch. These additional plugins have been built by OpenSearch developers or members of the OpenSearch community. While it isn't possible to provide an exhaustive list (because many plugins are not maintained in an OpenSearch GitHub repository), the following plugins, available in the [OpenSearch/plugins](https://github.com/opensearch-project/OpenSearch/tree/main/plugins) directory on GitHub, are some of the plugins that can be installed using one of the installation options, for example, using the command `bin/opensearch-plugin install <plugin-name>`. -| Plugin name | Earliest available version | -| :--- | :--- | -| analysis-icu | 1.0.0 | -| analysis-kuromoji | 1.0.0 | -| analysis-nori | 1.0.0 | -| analysis-phonetic | 1.0.0 | -| analysis-smartcn | 1.0.0 | -| analysis-stempel | 1.0.0 | -| analysis-ukrainian | 1.0.0 | -| discovery-azure-classic | 1.0.0 | -| discovery-ec2 | 1.0.0 | -| discovery-gce | 1.0.0 | -| [`ingest-attachment`]({{site.url}}{{site.baseurl}}/install-and-configure/additional-plugins/ingest-attachment-plugin/) | 1.0.0 | -| mapper-annotated-text | 1.0.0 | -| mapper-murmur3 | 1.0.0 | -| [`mapper-size`]({{site.url}}{{site.baseurl}}/install-and-configure/additional-plugins/mapper-size-plugin/) | 1.0.0 | -| query-insights | 2.12.0 | -| repository-azure | 1.0.0 | -| repository-gcs | 1.0.0 | -| repository-hdfs | 1.0.0 | -| repository-s3 | 1.0.0 | -| store-smb | 1.0.0 | -| transport-nio | 1.0.0 | +| Plugin name | Earliest available version | +|:-----------------------------------------------------------------------------------------------------------------------|:---------------------------| +| analysis-icu | 1.0.0 | +| analysis-kuromoji | 1.0.0 | +| analysis-nori | 1.0.0 | +| [`analysis-phonenumber`]({{site.url}}{{site.baseurl}}/analyzers/supported-analyzers/phone-analyzers/) | 2.18.0 | +| analysis-phonetic | 1.0.0 | +| analysis-smartcn | 1.0.0 | +| analysis-stempel | 1.0.0 | +| analysis-ukrainian | 1.0.0 | +| discovery-azure-classic | 1.0.0 | +| discovery-ec2 | 1.0.0 | +| discovery-gce | 1.0.0 | +| [`ingest-attachment`]({{site.url}}{{site.baseurl}}/install-and-configure/additional-plugins/ingest-attachment-plugin/) | 1.0.0 | +| mapper-annotated-text | 1.0.0 | +| mapper-murmur3 | 1.0.0 | +| [`mapper-size`]({{site.url}}{{site.baseurl}}/install-and-configure/additional-plugins/mapper-size-plugin/) | 1.0.0 | +| query-insights | 2.12.0 | +| repository-azure | 1.0.0 | +| repository-gcs | 1.0.0 | +| repository-hdfs | 1.0.0 | +| repository-s3 | 1.0.0 | +| store-smb | 1.0.0 | +| transport-nio | 1.0.0 | ## Related articles diff --git a/_install-and-configure/configuring-opensearch/cluster-settings.md b/_install-and-configure/configuring-opensearch/cluster-settings.md index 9af0f5c5b1..65804a5de9 100644 --- a/_install-and-configure/configuring-opensearch/cluster-settings.md +++ b/_install-and-configure/configuring-opensearch/cluster-settings.md @@ -13,9 +13,9 @@ To learn more about static and dynamic settings, see [Configuring OpenSearch]({{ ## Cluster-level routing and allocation settings -OpenSearch supports the following cluster-level routing and shard allocation settings. All settings in this list are dynamic: +OpenSearch supports the following cluster-level routing and shard allocation settings: -- `cluster.routing.allocation.enable` (String): Enables or disables allocation for specific kinds of shards. +- `cluster.routing.allocation.enable` (Dynamic, string): Enables or disables allocation for specific kinds of shards. Valid values are: - `all` – Allows shard allocation for all types of shards. @@ -25,17 +25,17 @@ OpenSearch supports the following cluster-level routing and shard allocation set Default is `all`. -- `cluster.routing.allocation.node_concurrent_incoming_recoveries` (Integer): Configures how many concurrent incoming shard recoveries are allowed to happen on a node. Default is `2`. +- `cluster.routing.allocation.node_concurrent_incoming_recoveries` (Dynamic, integer): Configures how many concurrent incoming shard recoveries are allowed to occur on a node. Default is `2`. -- `cluster.routing.allocation.node_concurrent_outgoing_recoveries` (Integer): Configures how many concurrent outgoing shard recoveries are allowed to happen on a node. Default is `2`. +- `cluster.routing.allocation.node_concurrent_outgoing_recoveries` (Dynamic, integer): Configures how many concurrent outgoing shard recoveries are allowed to occur on a node. Default is `2`. -- `cluster.routing.allocation.node_concurrent_recoveries` (String): Used to set `cluster.routing.allocation.node_concurrent_incoming_recoveries` and `cluster.routing.allocation.node_concurrent_outgoing_recoveries` to the same value. +- `cluster.routing.allocation.node_concurrent_recoveries` (Dynamic, string): Used to set `cluster.routing.allocation.node_concurrent_incoming_recoveries` and `cluster.routing.allocation.node_concurrent_outgoing_recoveries` to the same value. -- `cluster.routing.allocation.node_initial_primaries_recoveries` (Integer): Sets the number of recoveries for unassigned primaries after a node restart. Default is `4`. +- `cluster.routing.allocation.node_initial_primaries_recoveries` (Dynamic, integer): Sets the number of recoveries for unassigned primaries after a node restart. Default is `4`. -- `cluster.routing.allocation.same_shard.host` (Boolean): When set to `true`, multiple copies of a shard are prevented from being allocated to distinct nodes on the same host. Default is `false`. +- `cluster.routing.allocation.same_shard.host` (Dynamic, Boolean): When set to `true`, multiple copies of a shard are prevented from being allocated to distinct nodes on the same host. Default is `false`. -- `cluster.routing.rebalance.enable` (String): Enables or disables rebalancing for specific kinds of shards. +- `cluster.routing.rebalance.enable` (Dynamic, string): Enables or disables rebalancing for specific kinds of shards. Valid values are: - `all` – Allows shard balancing for all types of shards. @@ -45,7 +45,7 @@ OpenSearch supports the following cluster-level routing and shard allocation set Default is `all`. -- `cluster.routing.allocation.allow_rebalance` (String): Specifies when shard rebalancing is allowed. +- `cluster.routing.allocation.allow_rebalance` (Dynamic, string): Specifies when shard rebalancing is allowed. Valid values are: - `always` – Always allow rebalancing. @@ -54,35 +54,35 @@ OpenSearch supports the following cluster-level routing and shard allocation set Default is `indices_all_active`. -- `cluster.routing.allocation.cluster_concurrent_rebalance` (Integer): Allows you to control how many concurrent shard rebalances are allowed across a cluster. Default is `2`. +- `cluster.routing.allocation.cluster_concurrent_rebalance` (Dynamic, integer): Allows you to control how many concurrent shard rebalances are allowed across a cluster. Default is `2`. -- `cluster.routing.allocation.balance.shard` (Floating point): Defines the weight factor for the total number of shards allocated per node. Default is `0.45`. +- `cluster.routing.allocation.balance.shard` (Dynamic, floating point): Defines the weight factor for the total number of shards allocated per node. Default is `0.45`. -- `cluster.routing.allocation.balance.index` (Floating point): Defines the weight factor for the number of shards per index allocated on a node. Default is `0.55`. +- `cluster.routing.allocation.balance.index` (Dynamic, floating point): Defines the weight factor for the number of shards per index allocated on a node. Default is `0.55`. -- `cluster.routing.allocation.balance.threshold` (Floating point): The minimum optimization value of operations that should be performed. Default is `1.0`. +- `cluster.routing.allocation.balance.threshold` (Dynamic, floating point): The minimum optimization value of operations that should be performed. Default is `1.0`. -- `cluster.routing.allocation.balance.prefer_primary` (Boolean): When set to `true`, OpenSearch attempts to evenly distribute the primary shards between the cluster nodes. Enabling this setting does not always guarantee an equal number of primary shards on each node, especially in the event of failover. Changing this setting to `false` after it was set to `true` does not invoke redistribution of primary shards. Default is `false`. +- `cluster.routing.allocation.balance.prefer_primary` (Dynamic, Boolean): When set to `true`, OpenSearch attempts to evenly distribute the primary shards between the cluster nodes. Enabling this setting does not always guarantee an equal number of primary shards on each node, especially in the event of a failover. Changing this setting to `false` after it was set to `true` does not invoke redistribution of primary shards. Default is `false`. -- `cluster.routing.allocation.rebalance.primary.enable` (Boolean): When set to `true`, OpenSearch attempts to rebalance the primary shards between the cluster nodes. When enabled, the cluster tries to maintain the number of primary shards on each node, with the maximum buffer defined by the `cluster.routing.allocation.rebalance.primary.buffer` setting. Changing this setting to `false` after it was set to `true` does not invoke the redistribution of primary shards. Default is `false`. +- `cluster.routing.allocation.rebalance.primary.enable` (Dynamic, Boolean): When set to `true`, OpenSearch attempts to rebalance the primary shards between the cluster nodes. When enabled, the cluster tries to maintain the number of primary shards on each node, with the maximum buffer defined by the `cluster.routing.allocation.rebalance.primary.buffer` setting. Changing this setting to `false` after it was set to `true` does not invoke the redistribution of primary shards. Default is `false`. -- `cluster.routing.allocation.rebalance.primary.buffer` (Floating point): Defines the maximum allowed buffer of primary shards between nodes when `cluster.routing.allocation.rebalance.primary.enable` is enabled. Default is `0.1`. +- `cluster.routing.allocation.rebalance.primary.buffer` (Dynamic, floating point): Defines the maximum allowed buffer of primary shards between nodes when `cluster.routing.allocation.rebalance.primary.enable` is enabled. Default is `0.1`. -- `cluster.routing.allocation.disk.threshold_enabled` (Boolean): When set to `false`, disables the disk allocation decider. This will also remove any existing `index.blocks.read_only_allow_delete index blocks` when disabled. Default is `true`. +- `cluster.routing.allocation.disk.threshold_enabled` (Dynamic, Boolean): When set to `false`, disables the disk allocation decider. This will also remove any existing `index.blocks.read_only_allow_delete index blocks` when disabled. Default is `true`. -- `cluster.routing.allocation.disk.watermark.low` (String): Controls the low watermark for disk usage. When set to a percentage, OpenSearch will not allocate shards to nodes with that percentage of disk used. This can also be entered as ratio value, like `0.85`. Finally, this can also be set to a byte value, like `400mb`. This setting does not affect the primary shards of newly created indexes, but will prevent their replicas from being allocated. Default is `85%`. +- `cluster.routing.allocation.disk.watermark.low` (Dynamic, string): Controls the low watermark for disk usage. When set to a percentage, OpenSearch will not allocate shards to nodes with that percentage of disk usage. This can also be entered as a ratio value, like `0.85`. Finally, this can also be set to a byte value, like `400mb`. This setting does not affect the primary shards of newly created indexes but will prevent their replicas from being allocated. Default is `85%`. -- `cluster.routing.allocation.disk.watermark.high` (String): Controls the high watermark. OpenSearch will attempt to relocate shards away from a node whose disk usage is above the percentage defined. This can also be entered as a ratio value, like `0.85`. Finally, this can also be set to a byte value, like `400mb`. This setting affects the allocation of all shards. Default is `90%`. +- `cluster.routing.allocation.disk.watermark.high` (Dynamic, string): Controls the high watermark. OpenSearch will attempt to relocate shards away from a node whose disk usage is above the defined percentage. This can also be entered as a ratio value, like `0.85`. Finally, this can also be set to a byte value, like `400mb`. This setting affects the allocation of all shards. Default is `90%`. -- `cluster.routing.allocation.disk.watermark.flood_stage` (String): Controls the flood stage watermark. This is a last resort to prevent nodes from running out of disk space. OpenSearch enforces a read-only index block (`index.blocks.read_only_allow_delete`) on every index that has one or more shards allocated on the node and that has at least one disk exceeding the flood stage. The index block is released once the disk utilization falls below the high watermark. This can also be entered as a ratio value, like `0.85`. Finally, this can also be set to a byte value, like `400mb`. Default is `95%`. +- `cluster.routing.allocation.disk.watermark.flood_stage` (Dynamic, string): Controls the flood stage watermark. This is a last resort to prevent nodes from running out of disk space. OpenSearch enforces a read-only index block (`index.blocks.read_only_allow_delete`) on every index that has one or more shards allocated on the node and at least one disk exceeding the flood stage. The index block is released once the disk utilization falls below the high watermark. This can also be entered as a ratio value, like `0.85`. Finally, this can also be set to a byte value, like `400mb`. Default is `95%`. -- `cluster.info.update.interval` (Time unit): Sets how often OpenSearch should check disk usage for each node in the cluster. Default is `30s`. +- `cluster.info.update.interval` (Dynamic, time unit): Sets how often OpenSearch should check disk usage for each node in the cluster. Default is `30s`. -- `cluster.routing.allocation.include.<attribute>` (Enum): Allocates shards to a node whose `attribute` has at least one of the included comma-separated values. +- `cluster.routing.allocation.include.<attribute>` (Dynamic, enum): Allocates shards to a node whose `attribute` contains at least one of the included comma-separated values. -- `cluster.routing.allocation.require.<attribute>` (Enum): Only allocates shards to a node whose `attribute` has all of the included comma-separated values. +- `cluster.routing.allocation.require.<attribute>` (Dynamic, enum): Only allocates shards to a node whose `attribute` contains all of the included comma-separated values. -- `cluster.routing.allocation.exclude.<attribute>` (Enum): Does not allocate shards to a node whose `attribute` has any of the included comma-separated values. The cluster allocation settings support the following built-in attributes. +- `cluster.routing.allocation.exclude.<attribute>` (Dynamic, enum): Does not allocate shards to a node whose `attribute` contains any of the included comma-separated values. The cluster allocation settings support the following built-in attributes. Valid values are: - `_name` – Match nodes by node name. @@ -93,15 +93,17 @@ OpenSearch supports the following cluster-level routing and shard allocation set - `_id` – Match nodes by node ID. - `_tier` – Match nodes by data tier role. -- `cluster.routing.allocation.shard_movement_strategy` (Enum): Determines the order in which shards are relocated from outgoing to incoming nodes. +- `cluster.routing.allocation.shard_movement_strategy` (Dynamic, enum): Determines the order in which shards are relocated from outgoing to incoming nodes. This setting supports the following strategies: - `PRIMARY_FIRST` – Primary shards are relocated first, before replica shards. This prioritization may help prevent a cluster's health status from going red if the relocating nodes fail during the process. - `REPLICA_FIRST` – Replica shards are relocated first, before primary shards. This prioritization may help prevent a cluster's health status from going red when carrying out shard relocation in a mixed-version, segment-replication-enabled OpenSearch cluster. In this situation, primary shards relocated to OpenSearch nodes of a newer version could try to copy segment files to replica shards on an older version of OpenSearch, which would result in shard failure. Relocating replica shards first may help to avoid this in multi-version clusters. - `NO_PREFERENCE` – The default behavior in which the order of shard relocation has no importance. -- `cluster.allocator.gateway.batch_size` (Integer): Limits the number of shards sent to data nodes in one batch for fetching any unassigned shard metadata. Default is `2000`. -- `cluster.allocator.existing_shards_allocator.batch_enabled` (Boolean): Enables batch allocation of unassigned shards that already exist on the disk as opposed to allocating one shard at a time. This reduces memory and transport overhead by fetching any unassigned shard metadata in a batch call. Default is `false`. +- `cluster.allocator.gateway.batch_size` (Dynamic, integer): Limits the number of shards sent to data nodes in a single batch to fetch any unassigned shard metadata. Default is `2000`. + +- `cluster.allocator.existing_shards_allocator.batch_enabled` (Static, Boolean): Enables batch allocation of unassigned shards that already exist on the disk, as opposed to allocating one shard at a time. This reduces memory and transport overhead by fetching any unassigned shard metadata in a batch call. Default is `false`. + ## Cluster-level shard, block, and task settings OpenSearch supports the following cluster-level shard, block, and task settings: @@ -149,3 +151,19 @@ OpenSearch supports the following cluster-level coordination settings. All setti - `cluster.fault_detection.leader_check.timeout` (Time unit): The amount of time a node waits for a response from the elected cluster manager during a leader check before deeming the check a failure. Valid values are from `1ms` to `60s`, inclusive. Default is `10s`. Changing this setting to a value other than the default can result in an unstable cluster. - `cluster.fault_detection.follower_check.timeout` (Time unit): The amount of time the elected cluster manager waits for a response during a follower check before deeming the check a failure. Valid values are from `1ms` to `60s`, inclusive. Default is `10s`. Changing this setting to a value other than the default can result in an unstable cluster. + +- `cluster.fault_detection.follower_check.interval` (Time unit): The amount of time that the elected cluster manager waits between sending follower checks to other nodes in the cluster. Valid values are `100ms` and higher. Default is `1000ms`. Changing this setting to a value other than the default can result in an unstable cluster. + +- `cluster.follower_lag.timeout` (Time unit): The amount of time that the elected cluster manager waits to receive acknowledgements for cluster state updates from lagging nodes. Default is `90s`. If a node does not successfully apply the cluster state update within this period of time, it is considered to have failed and is removed from the cluster. + +- `cluster.publish.timeout` (Time unit): The amount of time that the cluster manager waits for each cluster state update to be completely published to all nodes, unless `discovery.type` is set to `single-node`. Default is `30s`. + +## Cluster-level CAT response limit settings + +OpenSearch supports the following cluster-level CAT API response limit settings, all of which are dynamic: + +- `cat.indices.response.limit.number_of_indices` (Integer): Sets a limit on the number of indexes returned by the [CAT Indices API]({{site.url}}{{site.baseurl}}/api-reference/cat/cat-indices/). The default value is `-1` (no limit). If the number of indexes in the response exceeds this limit, the API returns a `429` error. To avoid this, you can specify an index pattern filter in your query (for example, `_cat/indices/<index-pattern>`). + +- `cat.shards.response.limit.number_of_shards` (Integer): Sets a limit on the number of shards returned by the [CAT Shards API]({{site.url}}{{site.baseurl}}/api-reference/cat/cat-shards/). The default value is `-1` (no limit). If the number of shards in the response exceeds this limit, the API returns a `429` error. To avoid this, you can specify an index pattern filter in your query (for example, `_cat/shards/<index-pattern>`). + +- `cat.segments.response.limit.number_of_indices` (Integer): Sets a limit on the number of indexes returned by the [CAT Segments API]({{site.url}}{{site.baseurl}}/api-reference/cat/cat-segments/). The default value is `-1` (no limit). If the number of indexes in the response exceeds this limit, the API returns a `429` error. To avoid this, you can specify an index pattern filter in your query (for example, `_cat/segments/<index-pattern>`). diff --git a/_install-and-configure/configuring-opensearch/index-settings.md b/_install-and-configure/configuring-opensearch/index-settings.md index bd9b9651aa..378fb8fbff 100644 --- a/_install-and-configure/configuring-opensearch/index-settings.md +++ b/_install-and-configure/configuring-opensearch/index-settings.md @@ -79,6 +79,18 @@ OpenSearch supports the following dynamic cluster-level index settings: - `cluster.snapshot.shard.path.prefix` (String): Controls the fixed path prefix for snapshot shard-level blobs. This setting only applies when the repository `shard_path_type` setting is either `HASHED_PREFIX` or `HASHED_INFIX`. Default is an empty string, `""`. +- `cluster.default_number_of_replicas` (Integer): Controls the default number of replicas for indexes in the cluster. The index-level `index.number_of_replicas` setting defaults to this value if not configured. Default is `1`. + +- `cluster.thread_pool.<fixed-threadpool>.size` (Integer): Controls the sizes of both the fixed and resizable queue thread pools. Overrides the defaults provided in `opensearch.yml`. + +- `cluster.thread_pool.<scaling-threadpool>.max` (Integer): Sets the maximum size of the scaling thread pool. Overrides the default provided in `opensearch.yml`. + +- `cluster.thread_pool.<scaling-threadpool>.core` (Integer): Specifies the core size of the scaling thread pool. Overrides the default provided in `opensearch.yml`. + + +Before tuning thread pool settings dynamically, note that these are expert-level settings that can potentially destabilize your cluster. Modifying thread pool settings applies the same thread pool size to all nodes, so it's not recommended for clusters with different hardware for the same roles. Similarly, avoid tuning thread pools shared by both data nodes and cluster manager nodes. After making these changes, we recommend monitoring your cluster to ensure that it remains stable and performs as expected. +{: .warning} + ## Index-level index settings You can specify index settings at index creation. There are two types of index settings: @@ -185,7 +197,7 @@ For more information about updating settings, including supported query paramete OpenSearch supports the following dynamic index-level index settings: -- `index.number_of_replicas` (Integer): The number of replica shards each primary shard should have. For example, if you have 4 primary shards and set `index.number_of_replicas` to 3, the index has 12 replica shards. Default is 1. +- `index.number_of_replicas` (Integer): The number of replica shards each primary shard should have. For example, if you have 4 primary shards and set `index.number_of_replicas` to 3, the index has 12 replica shards. If not set, defaults to `cluster.default_number_of_replicas` (which is `1` by default). - `index.auto_expand_replicas` (String): Whether the cluster should automatically add replica shards based on the number of data nodes. Specify a lower bound and upper limit (for example, 0--9) or `all` for the upper limit. For example, if you have 5 data nodes and set `index.auto_expand_replicas` to 0--3, then the cluster does not automatically add another replica shard. However, if you set this value to `0-all` and add 2 more nodes for a total of 7, the cluster will expand to now have 6 replica shards. Default is disabled. diff --git a/_install-and-configure/install-opensearch/index.md b/_install-and-configure/install-opensearch/index.md index d0c6e242cd..94c259667a 100644 --- a/_install-and-configure/install-opensearch/index.md +++ b/_install-and-configure/install-opensearch/index.md @@ -31,7 +31,7 @@ OpenSearch Version | Compatible Java Versions | Bundled Java Version 1.0--1.2.x | 11, 15 | 15.0.1+9 1.3.x | 8, 11, 14 | 11.0.24+8 2.0.0--2.11.x | 11, 17 | 17.0.2+8 -2.12.0+ | 11, 17, 21 | 21.0.4+7 +2.12.0+ | 11, 17, 21 | 21.0.5+11 To use a different Java installation, set the `OPENSEARCH_JAVA_HOME` or `JAVA_HOME` environment variable to the Java install location. For example: ```bash diff --git a/_layouts/default.html b/_layouts/default.html index d4d40d8cc4..7f2bf0a2a8 100755 --- a/_layouts/default.html +++ b/_layouts/default.html @@ -87,6 +87,8 @@ {% assign section = site.clients_collection.collections %} {% elsif page.section == "benchmark" %} {% assign section = site.benchmark_collection.collections %} + {% elsif page.section == "migration-assistant" %} + {% assign section = site.migration_assistant_collection.collections %} {% endif %} {% if section %} diff --git a/_migration-assistant/deploying-migration-assistant/configuration-options.md b/_migration-assistant/deploying-migration-assistant/configuration-options.md new file mode 100644 index 0000000000..7097d7e90e --- /dev/null +++ b/_migration-assistant/deploying-migration-assistant/configuration-options.md @@ -0,0 +1,175 @@ +--- +layout: default +title: Configuration options +nav_order: 15 +parent: Deploying Migration Assistant +--- + +# Configuration options + +This page outlines the configuration options for three key migrations scenarios: + +1. **Metadata migration** +2. **Backfill migration with `Reindex-from-Snapshot` (RFS)** +3. **Live capture migration with Capture and Replay (C&R)** + +Each of these migrations depends on either a snapshot or a capture proxy. The following example `cdk.context.json` configurations are used by AWS Cloud Development Kit (AWS CDK) to deploy and configure Migration Assistant for OpenSearch, shown as separate blocks for each migration type. If you are performing a migration applicable to multiple scenarios, these options can be combined. + + +For a complete list of configuration options, see [opensearch-migrations-options.md](https://github.com/opensearch-project/opensearch-migrations/blob/main/deployment/cdk/opensearch-service-migration/options.md). If you need a configuration option that is not found on this page, create an issue in the [OpenSearch Migrations repository](https://github.com/opensearch-project/opensearch-migrations/issues). +{: .tip } + +Options for the source cluster endpoint, target cluster endpoint, and existing virtual private cloud (VPC) should be configured in order for the migration tools to function effectively. + +## Shared configuration options + +Each migration configuration shares the following options. + + +| Name | Example | Description | +| :--- | :--- | :--- | +| `sourceClusterEndpoint` | `"https://source-cluster.elb.us-east-1.endpoint.com"` | The endpoint for the source cluster. | +| `targetClusterEndpoint` | `"https://vpc-demo-opensearch-cluster-cv6hggdb66ybpk4kxssqt6zdhu.us-west-2.es.amazonaws.com:443"` | The endpoint for the target cluster. Required if using an existing target cluster for the migration instead of creating a new one. | +| `vpcId` | `"vpc-123456789abcdefgh"` | The ID of the existing VPC in which the migration resources will be stored. The VPC must have at least two private subnets that span two Availability Zones. | + + +## Backfill migration using RFS + +The following CDK performs a backfill migrations using RFS: + +```json +{ + "backfill-migration": { + "stage": "dev", + "vpcId": <VPC_ID>, + "sourceCluster": { + "endpoint": <SOURCE_CLUSTER_ENDPOINT>, + "version": "ES 7.10", + "auth": {"type": "none"} + }, + "targetCluster": { + "endpoint": <TARGET_CLUSTER_ENDPOINT>, + "auth": { + "type": "basic", + "username": <TARGET_CLUSTER_USERNAME>, + "passwordFromSecretArn": <TARGET_CLUSTER_PASSWORD_SECRET> + } + }, + "reindexFromSnapshotServiceEnabled": true, + "reindexFromSnapshotExtraArgs": "", + "artifactBucketRemovalPolicy": "DESTROY" + } +} +``` +{% include copy.html %} + +Performing an RFS backfill migration requires an existing snapshot. + + +The RFS configuration uses the following options. All options are optional. + +| Name | Example | Description | +| :--- | :--- | :--- | +| `reindexFromSnapshotServiceEnabled` | `true` | Enables deployment and configuration of the RFS ECS service. | +| `reindexFromSnapshotExtraArgs` | `"--target-aws-region us-east-1 --target-aws-service-signing-name es"` | Extra arguments for the Document Migration command, with space separation. See [RFS Extra Arguments](https://github.com/opensearch-project/opensearch-migrations/blob/main/DocumentsFromSnapshotMigration/README.md#arguments) for more information. You can pass `--no-insecure` to remove the `--insecure` flag. | + +To view all available arguments for `reindexFromSnapshotExtraArgs`, see [Snapshot migrations README](https://github.com/opensearch-project/opensearch-migrations/blob/main/DocumentsFromSnapshotMigration/README.md#arguments). At a minimum, no extra arguments may be needed. + +## Live capture migration with C&R + +The following sample CDK performs a live capture migration with C&R: + +```json +{ + "live-capture-migration": { + "stage": "dev", + "vpcId": <VPC_ID>, + "sourceCluster": { + "endpoint": <SOURCE_CLUSTER_ENDPOINT>, + "version": "ES 7.10", + "auth": {"type": "none"} + }, + "targetCluster": { + "endpoint": <TARGET_CLUSTER_ENDPOINT>, + "auth": { + "type": "basic", + "username": <TARGET_CLUSTER_USERNAME>, + "passwordFromSecretArn": <TARGET_CLUSTER_PASSWORD_SECRET> + } + }, + "captureProxyServiceEnabled": true, + "captureProxyExtraArgs": "", + "trafficReplayerServiceEnabled": true, + "trafficReplayerExtraArgs": "", + "artifactBucketRemovalPolicy": "DESTROY" + } +} +``` +{% include copy.html %} + +Performing a live capture migration requires that a Capture Proxy be configured to capture incoming traffic and send it to the target cluster using the Traffic Replayer service. For arguments available in `captureProxyExtraArgs`, refer to the `@Parameter` fields [here](https://github.com/opensearch-project/opensearch-migrations/blob/main/TrafficCapture/trafficCaptureProxyServer/src/main/java/org/opensearch/migrations/trafficcapture/proxyserver/CaptureProxy.java). For `trafficReplayerExtraArgs`, refer to the `@Parameter` fields [here](https://github.com/opensearch-project/opensearch-migrations/blob/main/TrafficCapture/trafficReplayer/src/main/java/org/opensearch/migrations/replay/TrafficReplayer.java). At a minimum, no extra arguments may be needed. + + +| Name | Example | Description | +| :--- | :--- | :--- | +| `captureProxyServiceEnabled` | `true` | Enables the Capture Proxy service deployment using an AWS CloudFormation stack. | +| `captureProxyExtraArgs` | `"--suppressCaptureForHeaderMatch user-agent .*elastic-java/7.17.0.*"` | Extra arguments for the Capture Proxy command, including options specified by the [Capture Proxy](https://github.com/opensearch-project/opensearch-migrations/blob/main/TrafficCapture/trafficCaptureProxyServer/src/main/java/org/opensearch/migrations/trafficcapture/proxyserver/CaptureProxy.java). | +| `trafficReplayerServiceEnabled` | `true` | Enables the Traffic Replayer service deployment using a CloudFormation stack. | +| `trafficReplayerExtraArgs` | `"--sigv4-auth-header-service-region es,us-east-1 --speedup-factor 5"` | Extra arguments for the Traffic Replayer command, including options for auth headers and other parameters specified by the [Traffic Replayer](https://github.com/opensearch-project/opensearch-migrations/blob/main/TrafficCapture/trafficReplayer/src/main/java/org/opensearch/migrations/replay/TrafficReplayer.java). | + + +For arguments available in `captureProxyExtraArgs`, see the `@Parameter` fields in [`CaptureProxy.java`](https://github.com/opensearch-project/opensearch-migrations/blob/main/TrafficCapture/trafficCaptureProxyServer/src/main/java/org/opensearch/migrations/trafficcapture/proxyserver/CaptureProxy.java). For `trafficReplayerExtraArgs`, see the `@Parameter` fields in [TrafficReplayer.java](https://github.com/opensearch-project/opensearch-migrations/blob/main/TrafficCapture/trafficReplayer/src/main/java/org/opensearch/migrations/replay/TrafficReplayer.java). + + +## Cluster authentication options + +Both the source and target cluster can use no authentication, authentication limited to VPC, basic authentication with a username and password, or AWS Signature Version 4 scoped to a user or role. + +### No authentication + +```json + "sourceCluster": { + "endpoint": <SOURCE_CLUSTER_ENDPOINT>, + "version": "ES 7.10", + "auth": {"type": "none"} + } +``` +{% include copy.html %} + +### Basic authentication + +```json + "sourceCluster": { + "endpoint": <SOURCE_CLUSTER_ENDPOINT>, + "version": "ES 7.10", + "auth": { + "type": "basic", + "username": <TARGET_CLUSTER_USERNAME>, + "passwordFromSecretArn": <TARGET_CLUSTER_PASSWORD_SECRET> + } + } +``` +{% include copy.html %} + +### Signature Version 4 authentication + +```json + "sourceCluster": { + "endpoint": <SOURCE_CLUSTER_ENDPOINT>, + "version": "ES 7.10", + "auth": { + "type": "sigv4", + "region": "us-east-1", + "serviceSigningName": "es" + } + } +``` +{% include copy.html %} + +The `serviceSigningName` can be `es` for an Elasticsearch or OpenSearch domain, or `aoss` for an OpenSearch Serverless collection. + +All of these authentication options apply to both source and target clusters. + +## Network configuration + +The migration tooling expects the source cluster, target cluster, and migration resources to exist in the same VPC. If this is not the case, manual networking setup outside of this documentation is likely required. diff --git a/_migration-assistant/deploying-migration-assistant/iam-and-security-groups-for-existing-clusters.md b/_migration-assistant/deploying-migration-assistant/iam-and-security-groups-for-existing-clusters.md new file mode 100644 index 0000000000..331b99e1fa --- /dev/null +++ b/_migration-assistant/deploying-migration-assistant/iam-and-security-groups-for-existing-clusters.md @@ -0,0 +1,70 @@ +--- +layout: default +title: IAM and security groups for existing clusters +nav_order: 20 +parent: Deploying Migration Assistant +--- + +# IAM and security groups for existing clusters + +This page outlines security scenarios for using the migration tools with existing clusters, including any necessary configuration changes to ensure proper communication between them. + +## Importing an Amazon OpenSearch Service or Amazon OpenSearch Serverless target cluster + +Use the following scenarios for Amazon OpenSearch Service or Amazon OpenSearch Serverless target clusters. + +### OpenSearch Service + +For an OpenSearch Domain, two main configurations are typically required to ensure proper functioning of the migration solution: + +1. **Security Group Configuration** + + The domain should have a security group that allows communication from the applicable migration services (Traffic Replayer, Migration Console, `Reindex-from-Snapshot`). The CDK automatically creates an `osClusterAccessSG` security group, which is applied to the migration services. The user should then add this security group to their existing domain to allow access. + +2. **Access Policy Configuration** should be one of the following: + - An open access policy that allows all access. + - Configured to allow at least the AWS Identity and Access Management (IAM) task roles for the applicable migration services (Traffic Replayer, Migration Console, `Reindex-from-Snapshot`) to access the domain. + +### OpenSearch Serverless + +For an OpenSearch Serverless Collection, you will need to configure both network and data access policies: + +1. **Network Policy Configuration**: + The Collection should have a network policy that uses the `VPC` access type. This requires creating a VPC endpoint on the VPC used for the solution. The VPC endpoint should be configured for the private subnets of the VPC and should attach the `osClusterAccessSG` security group. + +2. **Data Access Policy Configuration**: + The data access policy should grant permission to perform all [index operations](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/serverless-data-access.html#serverless-data-supported-permissions) (`aoss:*`) for all indexes in the Collection. The IAM task roles of the applicable Migration services (Traffic Replayer, migration console, `Reindex-from-Snapshot`) should be used as the principals for this data access policy. + +## Capture Proxy on Coordinator Nodes of Source Cluster + +Although the CDK does not automatically set up the Capture Proxy on source cluster nodes (except in the demo solution), the Capture Proxy instances must communicate with the resources deployed by the CDK, such as Kafka. This section outlines the necessary steps to set up communication. + +Before [setting up Capture Proxy instances](https://github.com/opensearch-project/opensearch-migrations/tree/main/TrafficCapture/trafficCaptureProxyServer#installing-capture-proxy-on-coordinator-nodes) on the source cluster, ensure the following configurations are in place: + +1. **Security Group Configuration**: + The coordinator nodes should add the `trafficStreamSourceSG` security group to allow sending captured traffic to Kafka. + +2. **IAM Policy Configuration**: + The IAM role used by the coordinator nodes should have permissions to publish captured traffic to Kafka. You can add the following template policy through the AWS Console (IAM Role → Add permissions → Create inline policy → JSON view): + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Action": "kafka-cluster:Connect", + "Resource": "arn:aws:kafka:<REGION>:<ACCOUNT-ID>:cluster/migration-msk-cluster-<STAGE>/*", + "Effect": "Allow" + }, + { + "Action": [ + "kafka-cluster:CreateTopic", + "kafka-cluster:DescribeTopic", + "kafka-cluster:WriteData" + ], + "Resource": "arn:aws:kafka:<REGION>:<ACCOUNT-ID>:topic/migration-msk-cluster-<STAGE>/*", + "Effect": "Allow" + } + ] +} +``` diff --git a/_migration-assistant/deploying-migration-assistant/index.md b/_migration-assistant/deploying-migration-assistant/index.md new file mode 100644 index 0000000000..1c559a81b1 --- /dev/null +++ b/_migration-assistant/deploying-migration-assistant/index.md @@ -0,0 +1,13 @@ +--- +layout: default +title: Deploying Migration Assistant +nav_order: 15 +has_children: true +permalink: /deploying-migration-assistant/ +redirect-from: + - /deploying-migration-assistant/index/ +--- + +# Deploying Migration Assistant + +This section provides information about the available options for deploying Migration Assistant. diff --git a/_migration-assistant/getting-started-data-migration.md b/_migration-assistant/getting-started-data-migration.md new file mode 100644 index 0000000000..4110f29edf --- /dev/null +++ b/_migration-assistant/getting-started-data-migration.md @@ -0,0 +1,353 @@ +--- +layout: default +title: Getting started with data migration +nav_order: 10 +redirect_from: + - /upgrade-to/upgrade-to/ + - /upgrade-to/snapshot-migrate/ +--- + +# Getting started with data migration + +This quickstart outlines how to deploy Migration Assistant for OpenSearch and execute an existing data migration using `Reindex-from-Snapshot` (RFS). It uses AWS for illustrative purposes. However, the steps can be modified for use with other cloud providers. + +## Prerequisites and assumptions + +Before using this quickstart, make sure you fulfill the following prerequisites: + +* Verify that your migration path [is supported]({{site.url}}{{site.baseurl}}/migration-assistant/is-migration-assistant-right-for-you/#migration-paths). Note that we test with the exact versions specified, but you should be able to migrate data on alternative minor versions as long as the major version is supported. +* The source cluster must be deployed Amazon Simple Storage Service (Amazon S3) plugin. +* The target cluster must be deployed. + +The steps in this guide assume the following: + +* In this guide, a snapshot will be taken and stored in Amazon S3; the following assumptions are made about this snapshot: + * The `_source` flag is enabled on all indexes to be migrated. + * The snapshot includes the global cluster state (`include_global_state` is `true`). + * Shard sizes of up to approximately 80 GB are supported. Larger shards cannot be migrated. If this presents challenges for your migration, contact the [migration team](https://opensearch.slack.com/archives/C054JQ6UJFK). +* Migration Assistant will be installed in the same AWS Region and have access to both the source snapshot and target cluster. + +--- + +## Step 1: Install Bootstrap on an Amazon EC2 instance (~10 minutes) + +To begin your migration, use the following steps to install a `bootstrap` box on an Amazon Elastic Compute Cloud (Amazon EC2) instance. The instance uses AWS CloudFormation to create and manage the stack. + +1. Log in to the target AWS account in which you want to deploy Migration Assistant. +2. From the browser where you are logged in to your target AWS account, right-click [here](https://console.aws.amazon.com/cloudformation/home?region=us-east-1#/stacks/new?templateURL=https://solutions-reference.s3.amazonaws.com/migration-assistant-for-amazon-opensearch-service/latest/migration-assistant-for-amazon-opensearch-service.template&redirectId=SolutionWeb) to load the CloudFormation template from a new browser tab. +3. Follow the CloudFormation stack wizard: + * **Stack Name:** `MigrationBootstrap` + * **Stage Name:** `dev` + * Choose **Next** after each step > **Acknowledge** > **Submit**. +4. Verify that the Bootstrap stack exists and is set to `CREATE_COMPLETE`. This process takes around 10 minutes to complete. + +--- + +## Step 2: Set up Bootstrap instance access (~5 minutes) + +Use the following steps to set up Bootstrap instance access: + +1. After deployment, find the EC2 instance ID for the `bootstrap-dev-instance`. +2. Create an AWS Identity and Access Management (IAM) policy using the following snippet, replacing `<aws-region>`, `<aws-account>`, `<stage>`, and `<ec2-instance-id>` with your information: + + ```json + { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": "ssm:StartSession", + "Resource": [ + "arn:aws:ec2:<aws-region>:<aws-account>:instance/<ec2-instance-id>", + "arn:aws:ssm:<aws-region>:<aws-account>:document/BootstrapShellDoc-<stage>-<aws-region>" + ] + } + ] + } + ``` + {% include copy.html %} + +3. Name the policy, for example, `SSM-OSMigrationBootstrapAccess`, and then create the policy by selecting **Create policy**. + +--- + +## Step 3: Log in to Bootstrap and building Migration Assistant (~15 minutes) + +Next, log in to Bootstrap and build Migration Assistant using the following steps. + +### Prerequisites + +To use these steps, make sure you fulfill the following prerequisites: + +* The AWS Command Line Interface (AWS CLI) and AWS Session Manager plugin are installed on your instance. +* The AWS credentials are configured (`aws configure`) for your instance. + +### Steps + +1. Load AWS credentials into your terminal. +2. Log in to the instance using the following command, replacing `<instance-id>` and `<aws-region>` with your instance ID and Region: + + ```bash + aws ssm start-session --document-name BootstrapShellDoc-<stage>-<aws-region> --target <instance-id> --region <aws-region> [--profile <profile-name>] + ``` + {% include copy.html %} + +3. Once logged in, run the following command from the shell of the Bootstrap instance in the `/opensearch-migrations` directory: + + ```bash + ./initBootstrap.sh && cd deployment/cdk/opensearch-service-migration + ``` + {% include copy.html %} + +4. After a successful build, note the path for infrastructure deployment, which will be used in the next step. + +--- + +## Step 4: Configure and deploy RFS (~20 minutes) + +Use the following steps to configure and deploy RFS: + +1. Add the target cluster password to AWS Secrets Manager as an unstructured string. Be sure to copy the secret Amazon Resource Name (ARN) for use during deployment. +2. From the same shell as the Bootstrap instance, modify the `cdk.context.json` file located in the `/opensearch-migrations/deployment/cdk/opensearch-service-migration` directory: + + ```json + { + "migration-assistant": { + "vpcId": "<TARGET CLUSTER VPC ID>", + "targetCluster": { + "endpoint": "<TARGET CLUSTER ENDPOINT>", + "auth": { + "type": "basic", + "username": "<TARGET CLUSTER USERNAME>", + "passwordFromSecretArn": "<TARGET CLUSTER PASSWORD SECRET>" + } + }, + "sourceCluster": { + "endpoint": "<SOURCE CLUSTER ENDPOINT>", + "auth": { + "type": "basic", + "username": "<TARGET CLUSTER USERNAME>", + "passwordFromSecretArn": "<TARGET CLUSTER PASSWORD SECRET>" + } + }, + "reindexFromSnapshotExtraArgs": "<RFS PARAMETERS (see below)>", + "stage": "dev", + "otelCollectorEnabled": true, + "migrationConsoleServiceEnabled": true, + "reindexFromSnapshotServiceEnabled": true, + "migrationAssistanceEnabled": true + } + } + ``` + {% include copy.html %} + + The source and target cluster authorization can be configured to have no authorization, `basic` with a username and password, or `sigv4`. + +3. Bootstrap the account with the following command: + + ```bash + cdk bootstrap --c contextId=migration-assistant --require-approval never + ``` + {% include copy.html %} + +4. Deploy the stacks: + + ```bash + cdk deploy "*" --c contextId=migration-assistant --require-approval never --concurrency 5 + ``` + {% include copy.html %} + +5. Verify that all CloudFormation stacks were installed successfully. + +### RFS parameters + +If you're creating a snapshot using migration tooling, these parameters are automatically configured. If you're using an existing snapshot, modify the `reindexFromSnapshotExtraArgs` setting with the following values: + + ```bash + --s3-repo-uri s3://<bucket-name>/<repo> --s3-region <region> --snapshot-name <name> + ``` + +You will also need to give the `migrationconsole` and `reindexFromSnapshot` TaskRoles permissions to the S3 bucket. + +--- + +## Step 5: Deploy Migration Assistant + +To deploy Migration Assistant, use the following steps: + +1. Bootstrap the account: + + ```bash + cdk bootstrap --c contextId=migration-assistant --require-approval never --concurrency 5 + ``` + {% include copy.html %} + +2. Deploy the stacks when `cdk.context.json` is fully configured: + + ```bash + cdk deploy "*" --c contextId=migration-assistant --require-approval never --concurrency 3 + ``` + {% include copy.html %} + +These commands deploy the following stacks: + +* Migration Assistant network stack +* `Reindex-from-snapshot` stack +* Migration console stack + +--- + +## Step 6: Access the migration console + +Run the following command to access the migration console: + +```bash +./accessContainer.sh migration-console dev <region> +``` +{% include copy.html %} + + +`accessContainer.sh` is located in `/opensearch-migrations/deployment/cdk/opensearch-service-migration/` on the Bootstrap instance. To learn more, see [Accessing the migration console]. +`{: .note} + +--- + +## Step 7: Verify the connection to the source and target clusters + +To verify the connection to the clusters, run the following command: + +```bash +console clusters connection-check +``` +{% include copy.html %} + +You should receive the following output: + +```bash +* **Source Cluster:** Successfully connected! +* **Target Cluster:** Successfully connected! +``` + +To learn more about migration console commands, see [Migration commands]. + +--- + +## Step 8: Create a snapshot + +Run the following command to initiate snapshot creation from the source cluster: + +```bash +console snapshot create [...] +``` +{% include copy.html %} + +To check the snapshot creation status, run the following command: + +```bash +console snapshot status [...] +``` +{% include copy.html %} + +To learn more information about the snapshot, run the following command: + +```bash +console snapshot status --deep-check [...] +``` +{% include copy.html %} + +Wait for snapshot creation to complete before moving to step 9. + +To learn more about snapshot creation, see [Snapshot Creation]. + +--- + +## Step 9: Migrate metadata + +Run the following command to migrate metadata: + +```bash +console metadata migrate [...] +``` +{% include copy.html %} + +For more information, see [Migrating metadata]({{site.url}}{{site.baseurl}}/migration-assistant/migration-phases/migrating-metadata/). + +--- + +## Step 10: Migrate documents with RFS + +You can now use RFS to migrate documents from your original cluster: + +1. To start the migration from RFS, start a `backfill` using the following command: + + ```bash + console backfill start + ``` + {% include copy.html %} + +2. _(Optional)_ To speed up the migration, increase the number of documents processed at a simultaneously by using the following command: + + ```bash + console backfill scale <NUM_WORKERS> + ``` + {% include copy.html %} + +3. To check the status of the documentation backfill, use the following command: + + ```bash + console backfill status + ``` + {% include copy.html %} + +4. If you need to stop the backfill process, use the following command: + + ```bash + console backfill stop + ``` + {% include copy.html %} + +For more information, see [Backfill]({{site.url}}{{site.baseurl}}/migration-assistant/migration-phases/backfill/). + +--- + +## Step 11: Backfill monitoring + +Use the following command for detailed monitoring of the backfill process: + +```bash +console backfill status --deep-check +``` +{% include copy.html %} + +You should receive the following output: + +```json +BackfillStatus.RUNNING +Running=9 +Pending=1 +Desired=10 +Shards total: 62 +Shards completed: 46 +Shards incomplete: 16 +Shards in progress: 11 +Shards unclaimed: 5 +``` + +Logs and metrics are available in Amazon CloudWatch in the `OpenSearchMigrations` log group. + +--- + +## Step 12: Verify that all documents were migrated + +Use the following query in CloudWatch Logs Insights to identify failed documents: + +```bash +fields @message +| filter @message like "Bulk request succeeded, but some operations failed." +| sort @timestamp desc +| limit 10000 +``` +{% include copy.html %} + +If any failed documents are identified, you can index the failed documents directly as opposed to using RFS. + diff --git a/_migration-assistant/index.md b/_migration-assistant/index.md new file mode 100644 index 0000000000..f024fdb69c --- /dev/null +++ b/_migration-assistant/index.md @@ -0,0 +1,75 @@ +--- +layout: default +title: Migration Assistant for OpenSearch +nav_order: 1 +has_children: false +nav_exclude: true +has_toc: false +permalink: /migration-assistant/ +redirect_from: + - /migration-assistant/index/ + - /upgrade-to/index/ + - /upgrade-to/ +--- + +# Migration Assistant for OpenSearch + +Migration Assistant for OpenSearch aids you in successfully performing an end-to-end, zero-downtime migration to OpenSearch from other search providers. It helps with the following scenarios: + +- **Metadata migration**: Migrating cluster metadata, such as index settings, aliases, and templates. +- **Backfill migration**: Migrating existing or historical data from a source to a target cluster. +- **Live traffic migration**: Replicating live ongoing traffic from a source to a target cluster. +- **Comparative tooling**: Comparing the performance and behaviors of an existing cluster with a prospective new one. + +This user guide focuses on conducting a comprehensive migration involving both existing and live data with zero downtime and the option to back out of a migration. + +It's crucial to note that migration strategies are not universally applicable. This guide provides a detailed methodology, based on certain assumptions detailed throughout, emphasizing the importance of robust engineering practices to ensure a successful migration. +{: .tip } + +## Key components + +The following are the key components of Migration Assistant. + +### Elasticsearch/OpenSearch source + +Your source cluster in this solution operates on Elasticsearch or OpenSearch, hosted on EC2 instances or similar computing environments. A proxy is set up to interact with this source cluster, either positioned in front of or directly on the coordinating nodes of the cluster. + +### Migration management console + +A console that provides a migration-specific CLI and offers a variety of tools to streamline the migration process. Everything necessary for completing a migration, other than cleaning up the migration resources, can be done via this Console. + +### Traffic capture proxy + +This component is designed for HTTP RESTful traffic. It forwards traffic to the source cluster and also splits and channels this traffic to a stream processing service for later playback. + +### Traffic Replayer + +Acting as a traffic simulation tool, the Traffic Replayer replays recorded request traffic to a target cluster, mirroring source traffic patterns. It links original requests and their responses to those directed at the target cluster, facilitating comparative analysis. + +### Metadata migration tool + +The Metadata migration tool integrated into the Migration CLI can be used independently to migrate cluster metadata, including index mappings, index configuration settings, templates, component templates, and aliases. + +### reindex-from-snapshot + +`Reindex-from-Snapshot` (RFS) reindexes data from an existing snapshot. Workers on Amazon Elastic Container Service (Amazon ECS) coordinate the migration of documents from an existing snapshot, reindexing the documents in parallel to a target cluster. + +### Target cluster + +The destination cluster for migration or comparison in an A/B test. + +## Architecture overview + +The Migration Assistant architecture is based on the use of an AWS Cloud infrastructure, but most tools are designed to be cloud independent. A local containerized version of this solution is also available. + +The design deployed in AWS is as follows: + +![Migration architecture overview]({{site.url}}{{site.baseurl}}/images/migrations/migrations-architecture-overview.png) + +1. Client traffic is directed to the existing cluster. +2. An Application Load Balancer with capture proxies relays traffic to a source while replicating data to Amazon Managed Streaming for Apache Kafka (Amazon MSK). +3. Using the migration console, you can initiate metadata migration to establish indexes, templates, component templates, and aliases on the target cluster. +4. With continuous traffic capture in place, you can use a `reindex-from-snapshot` process to capture data from your current index. +4. Once `reindex-from-snapshot` is complete, captured traffic is replayed from Amazon MSK to the target cluster by the traffic replayer. +5. Performance and behavior of traffic sent to the source and target clusters are compared by reviewing logs and metrics. +6. After confirming that the target cluster's functionality meets expectations, clients are redirected to the new target. diff --git a/_migration-assistant/is-migration-assistant-right-for-you.md b/_migration-assistant/is-migration-assistant-right-for-you.md new file mode 100644 index 0000000000..073c2b6cd7 --- /dev/null +++ b/_migration-assistant/is-migration-assistant-right-for-you.md @@ -0,0 +1,58 @@ +--- +layout: default +title: Is Migration Assistant right for you? +nav_order: 5 +--- + +# Is Migration Assistant right for you? + +Before deciding if Migration Assistant is right for you, it's important to assess your specific needs and understand the available tools for performing an upgrade or migration. + +Migration Assistant addresses gaps found in other migration solutions, but in some cases, alternative tools may be a better fit. + +For instance, if you need to upgrade more than one major version, such as moving from Elasticsearch 6.8 to OpenSearch 2.15, Migration Assistant allows you to do this in a single hop. In contrast, other options like rolling upgrades or snapshot restore would require multiple steps because they cannot handle major version jumps without reindexing your data. Additionally, if you need to capture live traffic and perform a zero-downtime migration, Migration Assistant would be the right choice. + +There are also tools available for migrating cluster configuration, templates, and aliases, which can be helpful depending on the complexity of your setup. These tools streamline the migration process by preserving critical settings and custom configurations. + +## Migration paths + +| **Source Version** | **Target Version** | +|-----------------------------|----------------------------------| +| Elasticsearch 6.8 | OpenSearch 1.3 | +| Elasticsearch 6.8 | OpenSearch 2.15 | +| Elasticsearch 7.10.2 | OpenSearch 1.3 | +| Elasticsearch 7.10.2 | OpenSearch 2.15 | +| Elasticsearch 7.17 | OpenSearch 1.3 | +| Elasticsearch 7.17 | OpenSearch 2.15 | +| OpenSearch 1.3 | OpenSearch 2.15 | + + +{: .note} + +### Supported source and target platforms + +* Self-managed (hosted by cloud provider or on-premises) +* AWS OpenSearch + +The tooling is designed to work with other cloud provider platforms, but it is not officially tested with these other platforms. If you would like to add support, please contact one of the maintainers on [GitHub](https://github.com/opensearch-project/opensearch-migrations/blob/main/MAINTAINERS.md). + +### Future migration paths + +To see the OpenSearch migrations roadmap, go to [OpenSearch Migrations - Roadmap](https://github.com/orgs/opensearch-project/projects/229/views/1). + +## Supported components + +Before starting a migration, consider the scope of the components involved. The table below outlines the components that should be considered for migration, indicates their support by the Migration Assistant, and provides comments and recommendations. + +| Component | Supported | Recommendations | +| :--- |:--- | :--- | +| **Documents** | Yes | Migrate existing data with `reindex-from-snapshot` (RFS) and live traffic with Capture and Replay. | +| **Index settings** | Yes | Migrate with the metadata migration tool. | +| **Index mappings** | Yes | Migrate with the metadata migration tool. | +| **Index templates** | Yes | Migrate with the metadata migration tool. | +| **Component templates** | Yes | Migrate with the metadata migration tool. | +| **Aliases** | Yes | Migrate with the metadata migration tool. | +| **Index State Management (ISM) policies** | Expected in 2025 | Manually migrate using an API. | +| **Elasticsearch Kibana dashboards** | Expected in 2025 | This tool is only needed when used to migrate Elasticsearch Kibana Dashboards to OpenSearch Dashboards. To start, export JSON files from Kibana and import them into OpenSearch Dashboards; before importing, use the [`dashboardsSanitizer`](https://github.com/opensearch-project/opensearch-migrations/tree/main/dashboardsSanitizer) tool on X-Pack visualizations like Canvas and Lens in Kibana Dashboards, as they may require recreation for compatibility with OpenSearch. | +| **Security constructs** | No | Configure roles and permissions based on cloud provider recommendations. For example, if using AWS, leverage AWS Identity and Access Management (IAM) for enhanced security management. | +| **Plugins** | No | Check plugin compatibility; some Elasticsearch plugins may not have direct equivalents in OpenSearch. | diff --git a/_migration-assistant/migration-console/accessing-the-migration-console.md b/_migration-assistant/migration-console/accessing-the-migration-console.md new file mode 100644 index 0000000000..ea66f5c04c --- /dev/null +++ b/_migration-assistant/migration-console/accessing-the-migration-console.md @@ -0,0 +1,35 @@ +--- +layout: default +title: Accessing the migration console +nav_order: 35 +parent: Migration console +--- + +# Accessing the migration console + +The Bootstrap box deployed through Migration Assistant contains a script that simplifies access to the migration console through that instance. + +To access the migration console, use the following commands: + +```shell +export STAGE=dev +export AWS_REGION=us-west-2 +/opensearch-migrations/deployment/cdk/opensearch-service-migration/accessContainer.sh migration-console ${STAGE} ${AWS_REGION} +``` +{% include copy.html %} + +When opening the console a message will appear above the command prompt, `Welcome to the Migration Assistant Console`. + +On a machine with the [AWS Command Line Interface (AWS CLI)](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html) and the [AWS Session Manager plugin](https://docs.aws.amazon.com/systems-manager/latest/userguide/session-manager-working-with-install-plugin.html), you can directly connect to the migration console. Ensure that you've run `aws configure` with credentials that have access to the environment. + +Use the following commands: + +```shell +export STAGE=dev +export SERVICE_NAME=migration-console +export TASK_ARN=$(aws ecs list-tasks --cluster migration-${STAGE}-ecs-cluster --family "migration-${STAGE}-${SERVICE_NAME}" | jq --raw-output '.taskArns[0]') +aws ecs execute-command --cluster "migration-${STAGE}-ecs-cluster" --task "${TASK_ARN}" --container "${SERVICE_NAME}" --interactive --command "/bin/bash" +``` +{% include copy.html %} + +Typically, `STAGE` is equivalent to a standard `dev` environment, but this may vary based on what the user specified during deployment. \ No newline at end of file diff --git a/_migration-assistant/migration-console/index.md b/_migration-assistant/migration-console/index.md new file mode 100644 index 0000000000..3e08e72c5c --- /dev/null +++ b/_migration-assistant/migration-console/index.md @@ -0,0 +1,16 @@ +--- +layout: default +title: Migration console +nav_order: 30 +has_children: true +permalink: /migration-console/ +redirect_from: + - /migration-console/index/ +--- + +# Migration console + +The Migrations Assistant deployment includes an Amazon Elastic Container Service (Amazon ECS) task that hosts tools that run different phases of the migration and check the progress or results of the migration. This ECS task is called the **migration console**. The migration console is a command line interface used to interact with the deployed components of the solution. + +This section provides information about how to access the migration console and what commands are supported. + diff --git a/_migration-assistant/migration-console/migration-console-commands-references.md b/_migration-assistant/migration-console/migration-console-commands-references.md new file mode 100644 index 0000000000..21d793b3f3 --- /dev/null +++ b/_migration-assistant/migration-console/migration-console-commands-references.md @@ -0,0 +1,131 @@ +--- +layout: default +title: Command reference +nav_order: 40 +parent: Migration console +--- + +# Migration console command reference + +Migration console commands follow this syntax: `console [component] [action]`. The components include `clusters`, `backfill`, `snapshot`, `metadata`, and `replay`. The console is configured with a registry of the deployed services and the source and target cluster, generated from the `cdk.context.json` values. + +## Commonly used commands + +The exact commands used will depend heavily on use-case and goals, but the following are a series of common commands with a quick description of what they do. + +### Check connection + +Reports whether both the source and target clusters can be reached and provides their versions. + +```sh +console clusters connection-check +``` +{% include copy.html %} + +### Run `cat-indices` + +Runs the `cat-indices` API on the cluster. + +```sh +console clusters cat-indices +``` +{% include copy.html %} + +### Create a snapshot + +Creates a snapshot of the source cluster and stores it in a preconfigured Amazon Simple Storage Service (Amazon S3) bucket. + +```sh +console snapshot create +``` +{% include copy.html %} + +## Check snapshot status + +Runs a detailed check on the snapshot creation status, including estimated completion time: + +```sh +console snapshot status --deep-check +``` +{% include copy.html %} + +## Evaluate metadata + +Performs a dry run of metadata migration, showing which indexes, templates, and other objects will be migrated to the target cluster. + +```sh +console metadata evaluate +``` +{% include copy.html %} + +## Migrate metadata + +Migrates the metadata from the source cluster to the target cluster. + +```sh +console metadata migrate +``` +{% include copy.html %} + +## Start a backfill + +If `Reindex-From-Snapshot` (RFS) is enabled, this command starts an instance of the service to begin moving documents to the target cluster: + +There are similar `scale UNITS` and `stop` commands to change the number of active instances for RFS. + + +```sh +console backfill start +``` +{% include copy.html %} + +## Check backfill status + +Gets the current status of the backfill migration, including the number of operating instances and the progress of the shards. + + +## Start Traffic Replayer + +If Traffic Replayer is enabled, this command starts an instance of Traffic Replayer to begin replaying traffic against the target cluster. +The `stop` command stops all active instances. + +```sh +console replay start +``` +{% include copy.html %} + +## Read logs + +Reads any logs that exist when running Traffic Replayer. Use tab completion on the path to fill in the available `NODE_IDs` and, if applicable, log file names. The tuple logs roll over at a certain size threshold, so there may be many files named with timestamps. The `jq` command pretty-prints each line of the tuple output before writing it to file. + +```sh +console tuples show --in /shared-logs-output/traffic-replayer-default/[NODE_ID]/tuples/console.log | jq > readable_tuples.json +``` +{% include copy.html %} + +## Help option + +All commands and options can be explored within the tool itself by using the `--help` option, either for the entire `console` application or for individual components (for example, `console backfill --help`). For example: + +```bash +$ console --help +Usage: console [OPTIONS] COMMAND [ARGS]... + +Options: + --config-file TEXT Path to config file + --json + -v, --verbose Verbosity level. Default is warn, -v is info, -vv is + debug. + --help Show this message and exit. + +Commands: + backfill Commands related to controlling the configured backfill... + clusters Commands to interact with source and target clusters + completion Generate shell completion script and instructions for setup. + kafka All actions related to Kafka operations + metadata Commands related to migrating metadata to the target cluster. + metrics Commands related to checking metrics emitted by the capture... + replay Commands related to controlling the replayer. + snapshot Commands to create and check status of snapshots of the... + tuples All commands related to tuples. +``` diff --git a/_migration-assistant/migration-phases/assessing-your-cluster-for-migration.md b/_migration-assistant/migration-phases/assessing-your-cluster-for-migration.md new file mode 100644 index 0000000000..5ded49eb59 --- /dev/null +++ b/_migration-assistant/migration-phases/assessing-your-cluster-for-migration.md @@ -0,0 +1,48 @@ +--- +layout: default +title: Assessing your cluster for migration +nav_order: 60 +parent: Migration phases +--- + +# Assessing your cluster for migration + + +The goal of the Migration Assistant is to streamline the process of migrating from one location or version of Elasticsearch/OpenSearch to another. However, completing a migration sometimes requires resolving client compatibility issues before they can communicate directly with the target cluster. + +## Understanding breaking changes + +Before performing any upgrade or migration, you should review any documentation of breaking changes. Even if the cluster is migrated there might be changes required for clients to connect to the new cluster + +## Upgrade and breaking changes guides + +For migrations paths between Elasticsearch 6.8 and OpenSearch 2.x users should be familiar with documentation in the links below that apply to their specific case: + +* [Upgrading Amazon Service Domains](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/version-migration.html). + +* [Changes from Elasticsearch to OpenSearch fork](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/rename.html). + +* [OpenSearch Breaking Changes](https://opensearch.org/docs/latest/breaking-changes/). + +The next step is to set up a proper test bed to verify that your applications will work as expected on the target version. + +## Impact of data transformations + +Any time you apply a transformation to your data, such as: + +- Changing index names +- Modifying field names or field mappings +- Splitting indices with type mappings + +These changes might need to be reflected in your client configurations. For example, if your clients are reliant on specific index or field names, you must ensure that their queries are updated accordingly. + +We recommend running production-like queries against the target cluster before switching over actual production traffic. This helps verify that the client can: + +- Communicate with the target cluster +- Locate the necessary indices and fields +- Retrieve the expected results + +For complex migrations involving multiple transformations or breaking changes, we highly recommend performing a trial migration with representative, non-production data (e.g., in a staging environment) to fully test client compatibility with the target cluster. + + + diff --git a/_migration-assistant/migration-phases/backfill.md b/_migration-assistant/migration-phases/backfill.md new file mode 100644 index 0000000000..d2ff7cd873 --- /dev/null +++ b/_migration-assistant/migration-phases/backfill.md @@ -0,0 +1,181 @@ +--- +layout: default +title: Backfill +nav_order: 90 +parent: Migration phases +--- + +# Backfill + +After the [metadata]({{site.url}}{{site.baseurl}}/migration-assistant/migration-phases/migrating-metadata/) for your cluster has been migrated, you can use capture proxy data replication and snapshots to backfill your data into the next cluster. + +## Capture proxy data replication + +If you're interested in capturing live traffic during your migration, Migration Assistant includes an Application Load Balancer for routing traffic to the capture proxy and the target cluster. Upstream client traffic must be routed through the capture proxy in order to replay the requests later. Before using the capture proxy, remember the following: + +* The layer upstream from the Application Load Balancer is compatible with the certificate on the Application Load Balancer listener, whether it's for clients or a Network Load Balancer. The `albAcmCertArn` in the `cdk.context.json` may need to be provided to ensure that clients trust the Application Load Balancer certificate. +* If a Network Load Balancer is used directly upstream of the Application Load Balancer, it must use a TLS listener. +* Upstream resources and security groups must allow network access to the Migration Assistant Application Load Balancer. + +To set up the capture proxy, go to the AWS Management Console and navigate to **EC2 > Load Balancers > Migration Assistant Application Load Balancer**. Copy the Application Load Balancer URL. With the URL copied, you can use one of the following options. + + +### If you are using **Network Load Balancer → Application Load Balancer → Cluster** + +1. Ensure that ingress is provided directly to the Application Load Balancer for the capture proxy. +2. Create a target group for the Migration Assistant Application Load Balancer on port `9200`, and set the health check to `HTTPS`. +3. Associate this target group with your existing Network Load Balancer on a new listener for testing. +4. Verify that the health check is successful, and perform smoke testing with some clients through the new listener port. +5. Once you are ready to migrate all clients, detach the Migration Assistant Application Load Balancer target group from the testing Network Load Balancer listener and modify the existing Network Load Balancer listener to direct traffic to this target group. +6. Now client requests will be routed through the proxy (once they establish a new connection). Verify the application metrics. + +### If you are using **Network Load Balancer → Cluster** + +If you do not want to modify application logic, add an Application Load Balancer in front of your cluster and follow the **Network Load Balancer → Application Load Balancer → Cluster** steps. Otherwise: + +1. Create a target group for the Application Load Balancer on port `9200` and set the health check to `HTTPS`. +2. Associate this target group with your existing Network Load Balancer on a new listener. +3. Verify that the health check is successful, and perform smoke testing with some clients through the new listener port. +4. Once you are ready to migrate all clients, deploy a change so that clients hit the new listener. + + +### If you are **not using an Network Load Balancer** + +If you're only using backfill as your migration technique, make a client/DNS change to route clients to the Migration Assistant Application Load Balancer on port `9200`. + + +### Kafka connection + +After you have routed the client based on your use case, test adding records against HTTP requests using the following steps: + +In the migration console, run the following command: + + ```bash + console kafka describe-topic-records + ``` + {% include copy.html %} + + Note the records in the logging topic. + +After a short period, execute the same command again and compare the increased number of records against the expected HTTP requests. + + +## Creating a snapshot + +Create a snapshot for your backfill using the following command: + +```bash +console snapshot create +``` +{% include copy.html %} + +To check the progress of your snapshot, use the following command: + +```bash +console snapshot status --deep-check +``` +{% include copy.html %} + +Depending on the size of the data in the source cluster and the bandwidth allocated for snapshots, the process can take some time. Adjust the maximum rate at which the source cluster's nodes create the snapshot using the `--max-snapshot-rate-mb-per-node` option. Increasing the snapshot rate will consume more node resources, which may affect the cluster's ability to handle normal traffic. + +## Backfilling documents to the source cluster + +From the snapshot you created of your source cluster, you can begin backfilling documents into the target cluster. Once you have started this process, a fleet of workers will spin up to read the snapshot and reindex documents into the target cluster. This fleet of workers can be scaled to increased the speed at which documents are reindexed into the target cluster. + +### Checking the starting state of the clusters + +You can check the indexes and document counts of the source and target clusters by running the `cat-indices` command. This can be used to monitor the difference between the source and target for any migration scenario. Check the indexes of both clusters using the following command: + +```shell +console clusters cat-indices +``` +{% include copy.html %} + +You should receive the following response: + +```shell +SOURCE CLUSTER +health status index uuid pri rep docs.count docs.deleted store.size pri.store.size +green open my-index WJPVdHNyQ1KMKol84Cy72Q 1 0 8 0 44.7kb 44.7kb + +TARGET CLUSTER +health status index uuid pri rep docs.count docs.deleted store.size pri.store.size +green open .opendistro_security N3uy88FGT9eAO7FTbLqqqA 1 0 10 0 78.3kb 78.3kb +``` + +### Starting the backfill + +Use the following command to start the backfill and deploy the workers: + +```shell +console backfill start +``` +{% include copy.html %} + +You should receive a response similar to the following: + +```shell +BackfillStatus.RUNNING +Running=1 +Pending=0 +Desired=1 +Shards total: 48 +Shards completed: 48 +Shards incomplete: 0 +Shards in progress: 0 +Shards unclaimed: 0 +``` + +The status will be `Running` even if all the shards have been migrated. + +### Scaling up the fleet + +To speed up the transfer, you can scale the number of workers. It may take a few minutes for these additional workers to come online. The following command will update the worker fleet to a size of 10: + +```shell +console backfill scale 5 +``` +{% include copy.html %} + +We recommend slowly scaling up the fleet while monitoring the health metrics of the target cluster to avoid over-saturating it. [Amazon OpenSearch Service domains](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/monitoring.html) provide a number of metrics and logs that can provide this insight. + +### Stopping the migration + +Backfill requires manually stopping the fleet. Once all the data has been migrated, you can shut down the fleet and all its workers using the following command: +Backfill requires manually stopping the fleet. Once all the data has been migrated, you can shut down the fleet and all its workers using the following command: +```shell +console backfill stop +``` + +### Amazon CloudWatch metrics and dashboard + +Migration Assistant creates an Amazon CloudWatch dashboard that you can use to visualize the health and performance of the backfill process. It combines the metrics for the backfill workers and, for those migrating to Amazon OpenSearch Service, the target cluster. + +You can find the backfill dashboard in the CloudWatch console based on the AWS Region in which you have deployed Migration Assistant. The metric graphs for your target cluster will be blank until you select the OpenSearch domain you're migrating to from the dropdown menu at the top of the dashboard. + +## Validating the backfill + +After the backfill is complete and the workers have stopped, examine the contents of your cluster using the [Refresh API](https://opensearch.org/docs/latest/api-reference/index-apis/refresh/) and the [Flush API](https://opensearch.org/docs/latest/api-reference/index-apis/flush/). The following example uses the console CLI with the Refresh API to check the backfill status: + +```shell +console clusters cat-indices --refresh +``` +{% include copy.html %} + +This will display the number of documents in each of the indexes in the target cluster, as shown in the following example response: + +```shell +SOURCE CLUSTER +health status index uuid pri rep docs.count docs.deleted store.size pri.store.size +green open my-index -DqPQDrATw25hhe5Ss34bQ 1 0 3 0 12.7kb 12.7kb + +TARGET CLUSTER +health status index uuid pri rep docs.count docs.deleted store.size pri.store.size +green open .opensearch-observability 8HOComzdSlSWCwqWIOGRbQ 1 1 0 0 416b 208b +green open .plugins-ml-config 9tld-PCJToSUsMiyDhlyhQ 5 1 1 0 9.5kb 4.7kb +green open my-index bGfGtYoeSU6U6p8leR5NAQ 1 0 3 0 5.5kb 5.5kb +green open .migrations_working_state lopd47ReQ9OEhw4ZuJGZOg 1 1 2 0 18.6kb 6.4kb +green open .kibana_1 +``` + +You can run additional queries against the target cluster to mimic your production workflow and closely examine the results. diff --git a/_migration-assistant/migration-phases/index.md b/_migration-assistant/migration-phases/index.md new file mode 100644 index 0000000000..c3c6c14b07 --- /dev/null +++ b/_migration-assistant/migration-phases/index.md @@ -0,0 +1,16 @@ +--- +layout: default +title: Migration phases +nav_order: 50 +has_children: true +permalink: /migration-phases/ +redirect_from: + - /migration-phases/index/ +--- + +This page details how to conduct a migration with Migration Assistant. It encompasses a variety of scenarios including: + +- [**Metadata migration**]({{site.url}}{{site.baseurl}}/migration-assistant/migration-phases/migrating-metadata/): Migrating cluster metadata, such as index settings, aliases, and templates. +- [**Backfill migration**]({{site.url}}{{site.baseurl}}/migration-assistant/migration-phases/backfill/): Migrating existing or historical data from a source to a target cluster. +- **Live traffic migration**: Replicating live ongoing traffic from a source to a target cluster. + diff --git a/_migration-assistant/migration-phases/migrating-metadata.md b/_migration-assistant/migration-phases/migrating-metadata.md new file mode 100644 index 0000000000..249a2ca4d0 --- /dev/null +++ b/_migration-assistant/migration-phases/migrating-metadata.md @@ -0,0 +1,247 @@ +--- +layout: default +title: Migrating metadata +nav_order: 85 +parent: Migration phases +--- + +# Migrating metadata + +Metadata migration involves creating a snapshot of your cluster and then migrating the metadata from the snapshot using the migration console. + +This tool gathers information from a source cluster through a snapshot or through HTTP requests against the source cluster. These snapshots are fully compatible with the backfill process for `Reindex-From-Snapshot` (RFS) scenarios. + +After collecting information on the source cluster, comparisons are made against the target cluster. If running a migration, any metadata items that do not already exist will be created on the target cluster. + +## Creating the snapshot + +Creating a snapshot of the source cluster captures all the metadata and documents to be migrated to a new target cluster. + +Create the initial snapshot of the source cluster using the following command: + +```shell +console snapshot create +``` +{% include copy.html %} + +To check the progress of the snapshot in real time, use the following command: + +```shell +console snapshot status --deep-check +``` +{% include copy.html %} + +You should receive the following response when the snapshot is created: + +```shell +SUCCESS +Snapshot is SUCCESS. +Percent completed: 100.00% +Data GiB done: 29.211/29.211 +Total shards: 40 +Successful shards: 40 +Failed shards: 0 +Start time: 2024-07-22 18:21:42 +Duration: 0h 13m 4s +Anticipated duration remaining: 0h 0m 0s +Throughput: 38.13 MiB/sec +``` + +### Managing slow snapshot speeds + +Depending on the size of the data in the source cluster and the bandwidth allocated for snapshots, the process can take some time. Adjust the maximum rate at which the source cluster's nodes create the snapshot using the `--max-snapshot-rate-mb-per-node` option. Increasing the snapshot rate will consume more node resources, which may affect the cluster's ability to handle normal traffic. + +## Command arguments + +For the following commands, to identify all valid arguments, please run with `--help`. + +```shell +console metadata evaluate --help +``` +{% include copy.html %} + +```shell +console metadata migrate --help +``` +{% include copy.html %} + +Based on the migration console deployment options, a number of commands will be pre-populated. To view them, run console with verbosity: + +```shell +console -v metadata migrate --help +``` +{% include copy.html %} + +You should receive a response similar to the following: + +```shell +(.venv) bash-5.2# console -v metadata migrate --help +INFO:console_link.cli:Logging set to INFO +. +. +. +INFO:console_link.models.metadata:Migrating metadata with command: /root/metadataMigration/bin/MetadataMigration --otel-collector-endpoint http://otel-collector:4317 migrate --snapshot-name snapshot_2023_01_01 --target-host https://opensearchtarget:9200 --min-replicas 0 --file-system-repo-path /snapshot/test-console --target-username admin --target-password ******** --target-insecure --help +. +. +. +``` + + +## Using the `evaluate` command + +By scanning the contents of the source cluster, applying filtering, and applying modifications a list of all items that will be migrated will be created. Any items not seen in this output will not be migrated onto the target cluster if the migrate command was to be run. This is a safety check before making modifications on the target cluster. + +```shell +console metadata evaluate [...] +``` +{% include copy.html %} + +You should receive a response similar to the following: + +```bash +Starting Metadata Evaluation +Clusters: + Source: + Remote Cluster: OpenSearch 1.3.16 ConnectionContext(uri=http://localhost:33039, protocol=HTTP, insecure=false, compressionSupported=false) + + Target: + Remote Cluster: OpenSearch 2.14.0 ConnectionContext(uri=http://localhost:33037, protocol=HTTP, insecure=false, compressionSupported=false) + + +Migration Candidates: + Index Templates: + simple_index_template + + Component Templates: + simple_component_template + + Indexes: + blog_2023, movies_2023 + + Aliases: + alias1, movies-alias + + +Results: + 0 issue(s) detected +``` + + +## Using the migrate command + +Running through the same data as the evaluate command all of the migrated items will be applied onto the target cluster. If re-run multiple times items that were previously migrated will not be recreated. If any items do need to be re-migrated, please delete them from the target cluster and then rerun the evaluate then migrate commands to ensure the desired changes are made. + +```shell +console metadata migrate [...] +``` +{% include copy.html %} + +You should receive a response similar to the following: + +```shell +Starting Metadata Migration + +Clusters: + Source: + Snapshot: OpenSearch 1.3.16 FileSystemRepo(repoRootDir=/tmp/junit10626813752669559861) + + Target: + Remote Cluster: OpenSearch 2.14.0 ConnectionContext(uri=http://localhost:33042, protocol=HTTP, insecure=false, compressionSupported=false) + + +Migrated Items: + Index Templates: + simple_index_template + + Component Templates: + simple_component_template + + Indexes: + blog_2023, movies_2023 + + Aliases: + alias1, movies-alias + + +Results: + 0 issue(s) detected +``` + + +## Metadata verification process + +Before moving on to additional migration steps, it is recommended to confirm details of your cluster. Depending on your configuration, this could be checking the sharding strategy or making sure index mappings are correctly defined by ingesting a test document. + +## Troubleshooting + +Use these instructions to help troubleshoot the following issues. + +### Accessing detailed logs + +Metadata migration creates a detailed log file that includes low level tracing information for troubleshooting. For each execution of the program a log file is created inside a shared volume on the migration console named `shared-logs-output` the following command will list all log files, one for each run of the command. + +```shell +ls -al /shared-logs-output/migration-console-default/*/metadata/ +``` +{% include copy.html %} + +To inspect the file within the console `cat`, `tail` and `grep` commands line tools. By looking for warnings, errors and exceptions in this log file can help understand the source of failures, or at the very least be useful for creating issues in this project. + +```shell +tail /shared-logs-output/migration-console-default/*/metadata/*.log +``` +{% include copy.html %} + +### Warnings and errors + +When encountering `WARN` or `ERROR` elements in the response, they will be accompanied by a short message, such as `WARN - my_index already exists`. More information can be found in the detailed logs associated with the warning or error. + +### OpenSearch running in compatibility mode + +There might be an error about being unable to update an ES 7.10.2 cluster, this can occur when compatibility mode has been enabled on an OpenSearch cluster disable it to continue, see [Enable compatibility mode](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/rename.html#rename-upgrade). + + +### Breaking change compatibility + +Metadata migration requires modifying data from the source to the target versions to recreate items. Sometimes these features are no longer supported and have been removed from the target version. Sometimes these features are not available in the target version, which is especially true when downgrading. While this tool is meant to make this process easier, it is not exhaustive in its support. When encountering a compatibility issue or an important feature gap for your migration, [search the issues and comment on the existing issue](https://github.com/opensearch-project/opensearch-migrations/issues) or [create a new](https://github.com/opensearch-project/opensearch-migrations/issues/new/choose) issue if one cannot be found. + +#### Deprecation of Mapping Types + +In Elasticsearch 6.8 the mapping types feature was discontinued in Elasticsearch 7.0+ which has created complexity in migrating to newer versions of Elasticsearch and OpenSearch, [learn more](https://www.elastic.co/guide/en/elasticsearch/reference/7.17/removal-of-types.html) ↗. + +As Metadata migration supports migrating from ES 6.8 on to the latest versions of OpenSearch this scenario is handled by removing the type mapping types and restructuring the template or index properties. Note that, at the time of this writing multiple type mappings are not supported, [tracking task](https://opensearch.atlassian.net/browse/MIGRATIONS-1778) ↗. + + +**Example starting state with mapping type foo (ES 6):** + +```json +{ + "mappings": [ + { + "foo": { + "properties": { + "field1": { "type": "text" }, + "field2": { "type": "keyword" } + } + } + } + ] +} +``` +{% include copy.html %} + +**Example ending state with foo removed (ES 7):** + +```json +{ + "mappings": { + "properties": { + "field1": { "type": "text" }, + "field2": { "type": "keyword" }, + } + } +} +``` +{% include copy.html %} + +For additional technical details, [view the mapping type removal source code](https://github.com/opensearch-project/opensearch-migrations/blob/main/transformation/src/main/java/org/opensearch/migrations/transformation/rules/IndexMappingTypeRemoval.java). diff --git a/_migration-assistant/migration-phases/removing-migration-infrastructure.md b/_migration-assistant/migration-phases/removing-migration-infrastructure.md new file mode 100644 index 0000000000..656a8e1998 --- /dev/null +++ b/_migration-assistant/migration-phases/removing-migration-infrastructure.md @@ -0,0 +1,21 @@ +--- +layout: default +title: Removing migration infrastructure +nav_order: 120 +parent: Migration phases +--- + +# Removing migration infrastructure + +After a migration is complete all resources should be removed except for the target cluster, and optionally your Cloudwatch Logs, and Traffic Replayer logs. + +To remove all the CDK stack(s) which get created during a deployment you can execute a command similar to below within the CDK directory + +```bash +cdk destroy "*" --c contextId=<CONTEXT_ID> +``` +{% include copy.html %} + +Follow the instructions on the command-line to remove the deployed resources from the AWS account. + +The AWS Management Console can also be used to remove Migration Assistant resources and confirm that they are no longer in the account. \ No newline at end of file diff --git a/_migration-assistant/migration-phases/switching-traffic-from-the-source-cluster.md b/_migration-assistant/migration-phases/switching-traffic-from-the-source-cluster.md new file mode 100644 index 0000000000..c43580eef9 --- /dev/null +++ b/_migration-assistant/migration-phases/switching-traffic-from-the-source-cluster.md @@ -0,0 +1,52 @@ +--- +layout: default +title: Switching traffic from the source cluster +nav_order: 110 +parent: Migration phases +--- + +# Switching traffic from the source cluster + +After the source and target clusters are synchronized, traffic needs to be switched to the target cluster so that the source cluster can be taken offline. + +## Assumptions + +This page assumes that the following has occurred before making the switch: + +- All client traffic is being routed through a switchover listener in the [MigrationAssistant Application Load Balancer]({{site.url}}{{site.baseurl}}/migration-assistant/migration-phases/backfill/). +- Client traffic has been verified as compatible with the target cluster. +- The target cluster is in a good state to accept client traffic. +- The target proxy service is deployed. + +## Switching traffic + +Use the following steps to switch traffic from the source cluster to the target cluster: + +1. In the AWS Management Console, navigate to **ECS** > **Migration Assistant Cluster**. Note the desired count of the capture proxy, which should be greater than 1. + +2. Update the **ECS Service** of the target proxy to be at least as large as the traffic capture proxy. Wait for tasks to start up, and verify that all targets are healthy in the target proxy service's **Load balancer health** section. + +3. Navigate to **EC2** > **Load Balancers** > **Migration Assistant ALB**. + +4. Navigate to **ALB Metrics** and examine any useful information, specifically looking at **Active Connection Count** and **New Connection Count**. Note any large discrepancies, which can indicate reused connections affecting traffic switchover. + +5. Navigate to **Capture Proxy Target Group** (`ALBSourceProxy-<STAGE>-TG`) > **Monitoring**. + +6. Examine the **Metrics Requests**, **Target (2XX, 3XX, 4XX, 5XX)**, and **Target Response Time** metrics. Verify that this appears as expected and includes all traffic expected to be included in the switchover. Note details that could help identify anomalies during the switchover, including the expected response time and response code rate. + +7. Navigate back to **ALB Metrics** and choose **Target Proxy Target Group** (`ALBTargetProxy-<STAGE>-TG`). Verify that all expected targets are healthy and that none are in a draining state. + +8. Navigate back to **ALB Metrics** and to the **Listener** on port `9200`. + +9. Choose the **Default rule** and **Edit**. + +10. Modify the weights of the targets to switch the desired traffic to the target proxy. To perform a full switchover, modify the **Target Proxy** weight to `1` and the **Source Proxy** weight to `0`. + +11. Choose **Save Changes**. + +12. Navigate to both **SourceProxy** and **TargetProxy TG Monitoring** metrics and verify that traffic is switching over as expected. If connections are being reused by clients, perform any necessary actions to terminate them. Monitor these metrics until **SourceProxy TG** shows 0 requests when all clients have switched over. + + +## Fallback + +If you need to fall back to the source cluster at any point during the switchover, revert the **Default rule** so that the Application Load Balancer routes to the **SourceProxy Target Group**. \ No newline at end of file diff --git a/_migration-assistant/migration-phases/using-traffic-replayer.md b/_migration-assistant/migration-phases/using-traffic-replayer.md new file mode 100644 index 0000000000..5b7af3c3f7 --- /dev/null +++ b/_migration-assistant/migration-phases/using-traffic-replayer.md @@ -0,0 +1,309 @@ +--- +layout: default +title: Using Traffic Replayer +nav_order: 100 +parent: Migration phases +--- + +# Using Traffic Replayer + +This guide covers how to use Traffic Replayer to replay captured traffic from a source cluster to a target cluster during the migration process. Traffic Replayer allows you to verify that the target cluster can handle requests in the same way as the source cluster and catch up to real-time traffic for a smooth migration. + +## When to run Traffic Replayer + +After deploying Migration Assistant, Traffic Replayer does not run by default. It should be started only after all metadata and documents have been migrated to ensure that recent changes to the source cluster are properly reflected in the target cluster. + +For example, if a document was deleted after a snapshot was taken, starting Traffic Replayer before the document migration is complete may cause the deletion request to execute before the document is added to the target. Running Traffic Replayer after all other migration processes ensures that the target cluster will be consistent with the source cluster. + +## Configuration options + +[Traffic Replayer settings]({{site.url}}{{site.baseurl}}/migration-assistant/deploying-migration-assistant/configuration-options/) are configured during the deployment of Migration Assistant. Make sure to set the authentication mode for Traffic Replayer so that it can properly communicate with the target cluster. + +## Using Traffic Replayer + +To manage Traffic Replayer, use the `console replay` command. The following examples show the available commands. + +### Start Traffic Replayer + +The following command starts Traffic Replayer with the options specified at deployment: + +```bash +console replay start +``` + +When starting Traffic Replayer, you should receive an output similar to the following: + +```bash +root@ip-10-0-2-66:~# console replay start +Replayer started successfully. +Service migration-dev-traffic-replayer-default set to 1 desired count. Currently 0 running and 0 pending. +``` + +## Check the status of Traffic Replayer + +Use the following command to show the status of Traffic Replayer: + +```bash +console replay status +``` + +Replay will return one of the following statuses: + +- `Running` shows how many container instances are actively running. +- `Pending` indicates how many instances are being provisione.d +- `Desired` shows the total number of instances that should be running. + +You should receive an output similar to the following: + +```bash +root@ip-10-0-2-66:~# console replay status +(<ReplayStatus.STOPPED: 4>, 'Running=0\nPending=0\nDesired=0') +``` + +## Stop Traffic Replayer + +The following command stops Traffic Replayer: + +```bash +console replay stop +``` + +You should receive an output similar to the following: + +```bash +root@ip-10-0-2-66:~# console replay stop +Replayer stopped successfully. +Service migration-dev-traffic-replayer-default set to 0 desired count. Currently 0 running and 0 pending. +``` + + + +### Delivery guarantees + +Traffic Replayer retrieves traffic from Kafka and updates its commit cursor after sending requests to the target cluster. This provides an "at least once" delivery guarantee; however, success isn't always guaranteed. Therefore, you should monitor metrics and tuple outputs or perform external validation to ensure that the target cluster is functioning as expected. + +## Time scaling + +Traffic Replayer sends requests in the same order that they were received from each connection to the source. However, relative timing between different connections is not guaranteed. For example: + +- **Scenario**: Two connections exist:one sends a PUT request every minute, and the other sends a GET request every second. +- **Behavior**: Traffic Replayer will maintain the sequence within each connection, but the relative timing between the connections (PUTs and GETs) is not preserved. + +Assume that a source cluster responds to requests (GETs and PUTs) within 100 ms: + +- With a **speedup factor of 1**, the target will experience the same request rates and idle periods as the source. +- With a **speedup factor of 2**, requests will be sent twice as fast, with GETs sent every 500 ms and PUTs every 30 seconds. +- With a **speedup factor of 10**, requests will be sent 10x faster, and as long as the target responds quickly, Traffic Replayer can maintain the pace. + +If the target cannot respond fast enough, Traffic Replayer will wait for the previous request to complete before sending the next one. This may cause delays and affect global relative ordering. + +## Transformations + +During migrations, some requests may need to be transformed between versions. For example, Elasticsearch previously supported multiple type mappings in indexes, but this is no longer the case in OpenSearch. Clients may need to be adjusted accordingly by splitting documents into multiple indexes or transforming request data. + +Traffic Replayer automatically rewrites host and authentication headers, but for more complex transformations, custom transformation rules can be specified using the `--transformer-config` option. For more information, see the [Traffic Replayer README](https://github.com/opensearch-project/opensearch-migrations/blob/c3d25958a44ec2e7505892b4ea30e5fbfad4c71b/TrafficCapture/trafficReplayer/README.md#transformations). + +### Example transformation + +Suppose that a source request contains a `tagToExcise` element that needs to be removed and its children promoted and that the URI path includes `extraThingToRemove`, which should also be removed. The following Jolt script handles this transformation: + +```json +[{ "JsonJoltTransformerProvider": +[ + { + "script": { + "operation": "shift", + "spec": { + "payload": { + "inlinedJsonBody": { + "top": { + "tagToExcise": { + "*": "payload.inlinedJsonBody.top.&" + }, + "*": "payload.inlinedJsonBody.top.&" + }, + "*": "payload.inlinedJsonBody.&" + }, + "*": "payload.&" + }, + "*": "&" + } + } + }, + { + "script": { + "operation": "modify-overwrite-beta", + "spec": { + "URI": "=split('/extraThingToRemove',@(1,&))" + } + } + }, + { + "script": { + "operation": "modify-overwrite-beta", + "spec": { + "URI": "=join('',@(1,&))" + } + } + } +] +}] +``` + +The resulting request sent to the target will appear similar to the following: + +```bash +PUT /oldStyleIndex/moreStuff HTTP/1.0 +host: testhostname + +{"top":{"properties":{"field1":{"type":"text"},"field2":{"type":"keyword"}}}} +``` +{% include copy.html %} + +You can pass Base64-encoded transformation scripts using `--transformer-config-base64`. + +## Result logs + +HTTP transactions from the source capture and those resent to the target cluster are logged in files located at `/shared-logs-output/traffic-replayer-default/*/tuples/tuples.log`. The `/shared-logs-output` directory is shared across containers, including the migration console. You can access these files from the migration console using the same path. Previous runs are also available in a `gzipped` format. + +Each log entry is a newline-delimited JSON object, containing information about the source and target requests/responses along with other transaction details, such as response times. + +These logs contain the contents of all requests, including authorization headers and the contents of all HTTP messages. Ensure that access to the migration environment is restricted, as these logs serve as a source of truth for determining what happened in both the source and target clusters. Response times for the source refer to the amount of time between the proxy sending the end of a request and receiving the response. While response times for the target are recorded in the same manner, keep in mind that the locations of the capture proxy, Traffic Replayer, and target may differ and that these logs do not account for the client's location. +{: .note} + + +### Example log entry + +The following example log entry shows a `/_cat/indices?v` request sent to both the source and target clusters: + +```json +{ + "sourceRequest": { + "Request-URI": "/_cat/indices?v", + "Method": "GET", + "HTTP-Version": "HTTP/1.1", + "Host": "capture-proxy:9200", + "Authorization": "Basic YWRtaW46YWRtaW4=", + "User-Agent": "curl/8.5.0", + "Accept": "*/*", + "body": "" + }, + "sourceResponse": { + "HTTP-Version": {"keepAliveDefault": true}, + "Status-Code": 200, + "Reason-Phrase": "OK", + "response_time_ms": 59, + "content-type": "text/plain; charset=UTF-8", + "content-length": "214", + "body": "aGVhbHRoIHN0YXR1cyBpbmRleCAgICAgICB..." + }, + "targetRequest": { + "Request-URI": "/_cat/indices?v", + "Method": "GET", + "HTTP-Version": "HTTP/1.1", + "Host": "opensearchtarget", + "Authorization": "Basic YWRtaW46bXlTdHJvbmdQYXNzd29yZDEyMyE=", + "User-Agent": "curl/8.5.0", + "Accept": "*/*", + "body": "" + }, + "targetResponses": [{ + "HTTP-Version": {"keepAliveDefault": true}, + "Status-Code": 200, + "Reason-Phrase": "OK", + "response_time_ms": 721, + "content-type": "text/plain; charset=UTF-8", + "content-length": "484", + "body": "aGVhbHRoIHN0YXR1cyBpbmRleCAgICAgICB..." + }], + "connectionId": "0242acfffe13000a-0000000a-00000005-1eb087a9beb83f3e-a32794b4.0", + "numRequests": 1, + "numErrors": 0 +} +``` +{% include copy.html %} + + +### Decoding log content + +The contents of HTTP message bodies are Base64 encoded in order to handle various types of traffic, including compressed data. To view the logs in a more human-readable format, use the console library `tuples show`. Running the script as follows will produce a `readable-tuples.log` in the home directory: + +```shell +console tuples show --in /shared-logs-output/traffic-replayer-default/d3a4b31e1af4/tuples/tuples.log > readable-tuples.log +``` + +The `readable-tuples.log` should appear similar to the following: + +```json +{ + "sourceRequest": { + "Request-URI": "/_cat/indices?v", + "Method": "GET", + "HTTP-Version": "HTTP/1.1", + "Host": "capture-proxy:9200", + "Authorization": "Basic YWRtaW46YWRtaW4=", + "User-Agent": "curl/8.5.0", + "Accept": "*/*", + "body": "" + }, + "sourceResponse": { + "HTTP-Version": {"keepAliveDefault": true}, + "Status-Code": 200, + "Reason-Phrase": "OK", + "response_time_ms": 59, + "content-type": "text/plain; charset=UTF-8", + "content-length": "214", + "body": "health status index uuid ..." + }, + "targetRequest": { + "Request-URI": "/_cat/indices?v", + "Method": "GET", + "HTTP-Version": "HTTP/1.1", + "Host": "opensearchtarget", + "Authorization": "Basic YWRtaW46bXlTdHJvbmdQYXNzd29yZDEyMyE=", + "User-Agent": "curl/8.5.0", + "Accept": "*/*", + "body": "" + }, + "targetResponses": [{ + "HTTP-Version": {"keepAliveDefault": true}, + "Status-Code": 200, + "Reason-Phrase": "OK", + "response_time_ms": 721, + "content-type": "text/plain; charset=UTF-8", + "content-length": "484", + "body": "health status index uuid ..." + }], + "connectionId": "0242acfffe13000a-0000000a-00000005-1eb087a9beb83f3e-a32794b4.0", + "numRequests": 1, + "numErrors": 0 +} +``` + + +## Metrics + +Traffic Replayer emits various OpenTelemetry metrics to Amazon CloudWatch, and traces are sent through AWS X-Ray. The following are some useful metrics that can help evaluate cluster performance. + +### `sourceStatusCode` + +This metric tracks the HTTP status codes for both the source and target clusters, with dimensions for the HTTP verb, such as `GET` or `POST`, and the status code families (200--299). These dimensions can help quickly identify discrepancies between the source and target, such as when `DELETE 200s` becomes `4xx` or `GET 4xx` errors turn into `5xx` errors. + +### `lagBetweenSourceAndTargetRequests` + +This metric shows the delay between requests hitting the source and target clusters. With a speedup factor greater than 1 and a target cluster that can handle requests efficiently, this value should decrease as the replay progresses, indicating a reduction in replay lag. + +### Additional metrics + +The following metrics are also reported: + +- **Throughput**: `bytesWrittenToTarget` and `bytesReadFromTarget` indicate the throughput to and from the cluster. +- **Retries**: `numRetriedRequests` tracks the number of requests retried due to status code mismatches between the source and target. +- **Event counts**: Various `(*)Count` metrics track the number of completed events. +- **Durations**: `(*)Duration` metrics measure the duration of each step in the process. +- **Exceptions**: `(*)ExceptionCount` shows the number of exceptions encountered during each processing phase. + + +## CloudWatch considerations + +Metrics pushed to CloudWatch may experience a visibility lag of around 5 minutes. CloudWatch also retains higher-resolution data for a shorter period than lower-resolution data. For more information, see [Amazon CloudWatch concepts](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/cloudwatch_concepts.html). \ No newline at end of file diff --git a/_migration-assistant/migration-phases/verifying-migration-tools.md b/_migration-assistant/migration-phases/verifying-migration-tools.md new file mode 100644 index 0000000000..77df2b4280 --- /dev/null +++ b/_migration-assistant/migration-phases/verifying-migration-tools.md @@ -0,0 +1,205 @@ +--- +layout: default +title: Verifying migration tools +nav_order: 70 +parent: Migration phases +--- + +# Verifying migration tools + +Before using the Migration Assistant, take the following steps to verify that your cluster is ready for migration. + +## Verifying snapshot creation + +Verify that a snapshot can be created of your source cluster and used for metadata and backfill scenarios. + +### Installing the Elasticsearch S3 Repository plugin + +The snapshot needs to be stored in a location that Migration Assistant can access. This guide uses Amazon Simple Storage Service (Amazon S3). By default, Migration Assistant creates an S3 bucket for storage. Therefore, it is necessary to install the [Elasticsearch S3 repository plugin](https://www.elastic.co/guide/en/elasticsearch/plugins/7.10/repository-s3.html) on your source nodes (https://www.elastic.co/guide/en/elasticsearch/plugins/7.10/repository-s3.html). + +Additionally, make sure that the plugin has been configured with AWS credentials that allow it to read and write to Amazon S3. If your Elasticsearch cluster is running on Amazon Elastic Compute Cloud (Amazon EC2) or Amazon Elastic Container Service (Amazon ECS) instances with an AWS Identity and Access Management (IAM) execution role, include the necessary S3 permissions. Alternatively, you can store the credentials in the [Elasticsearch keystore](https://www.elastic.co/guide/en/elasticsearch/plugins/7.10/repository-s3-client.html). + +### Verifying the S3 repository plugin configuration + +You can verify that the S3 repository plugin is configured correctly by creating a test snapshot. + +Create an S3 bucket for the snapshot using the following AWS Command Line Interface (AWS CLI) command: + +```shell +aws s3api create-bucket --bucket <your-bucket-name> --region <your-aws-region> +``` +{% include copy.html %} + +Register a new S3 snapshot repository on your source cluster using the following cURL command: + +```shell +curl -X PUT "http://<your-source-cluster>:9200/_snapshot/test_s3_repository" -H "Content-Type: application/json" -d '{ + "type": "s3", + "settings": { + "bucket": "<your-bucket-name>", + "region": "<your-aws-region>" + } +}' +``` +{% include copy.html %} + +Next, create a test snapshot that captures only the cluster's metadata: + +```shell +curl -X PUT "http://<your-source-cluster>:9200/_snapshot/test_s3_repository/test_snapshot_1" -H "Content-Type: application/json" -d '{ + "indices": "", + "ignore_unavailable": true, + "include_global_state": true +}' +``` +{% include copy.html %} + +Check the AWS Management Console to confirm that your bucket contains the snapshot. + +### Removing test snapshots after verification + +To remove the resources created during verification, you can use the following deletion commands: + +**Test snapshot** + +```shell +curl -X DELETE "http://<your-source-cluster>:9200/_snapshot/test_s3_repository/test_snapshot_1?pretty" +``` +{% include copy.html %} + +**Test snapshot repository** + +```shell +curl -X DELETE "http://<your-source-cluster>:9200/_snapshot/test_s3_repository?pretty" +``` +{% include copy.html %} + +**S3 bucket** + +```shell +aws s3 rm s3://<your-bucket-name> --recursive +aws s3api delete-bucket --bucket <your-bucket-name> --region <your-aws-region> +``` +{% include copy.html %} + +### Troubleshooting + +Use this guidance to troubleshoot any of the following snapshot verification issues. + +#### Access denied error (403) + +If you encounter an error like `AccessDenied (Service: Amazon S3; Status Code: 403)`, verify the following: + +- The IAM role assigned to your Elasticsearch cluster has the necessary S3 permissions. +- The bucket name and AWS Region provided in the snapshot configuration match the actual S3 bucket you created. + +#### Older versions of Elasticsearch + +Older versions of the Elasticsearch S3 repository plugin may have trouble reading IAM role credentials embedded in Amazon EC2 and Amazon ECS instances. This is because the copy of the AWS SDK shipped with them is too old to read the new standard way of retrieving those credentials, as shown in [the Instance Metadata Service v2 (IMDSv2) specification](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-instance-metadata.html). This can result in snapshot creation failures, with an error message similar to the following: + +```json +{"error":{"root_cause":[{"type":"repository_verification_exception","reason":"[migration_assistant_repo] path [rfs-snapshot-repo] is not accessible on master node"}],"type":"repository_verification_exception","reason":"[migration_assistant_repo] path [rfs-snapshot-repo] is not accessible on master node","caused_by":{"type":"i_o_exception","reason":"Unable to upload object [rfs-snapshot-repo/tests-s8TvZ3CcRoO8bvyXcyV2Yg/master.dat] using a single upload","caused_by":{"type":"amazon_service_exception","reason":"Unauthorized (Service: null; Status Code: 401; Error Code: null; Request ID: null)"}}},"status":500} +``` + +If you encounter this issue, you can resolve it by temporarily enabling IMDSv1 on the instances in your source cluster for the duration of the snapshot. There is a toggle for this available in the AWS Management Console as well as in the AWS CLI. Switching this toggle will turn on the older access model and enable the Elasticsearch S3 repository plugin to work as normal. For more information about IMDSv1, see [Modify instance metadata options for existing instances](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/configuring-IMDS-existing-instances.html). + +## Switching over client traffic + +The Migration Assistant Application Load Balancer is deployed with a listener that shifts traffic between the source and target clusters through proxy services. The Application Load Balancer should start in **Source Passthrough** mode. + +### Verifying that the traffic switchover is complete + +Use the following steps to verify that the traffic switchover is complete: + +1. In the AWS Management Console, navigate to **EC2 > Load Balancers**. +2. Select the **MigrationAssistant ALB**. +3. Examine the listener on port `9200` and verify that 100% of the traffic is directed to the **Source Proxy**. +4. Navigate to the **Migration ECS Cluster** in the AWS Management Console. +5. Select the **Target Proxy Service**. +6. Verify that the desired count for the service is running: + * If the desired count is not met, update the service to increase it to at least 1 and wait for the service to start. +7. On the **Health and Metrics** tab under **Load balancer health**, verify that all targets are reporting as healthy: + * This confirms that the Application Load Balancer can connect to the target cluster through the target proxy. +8. (Reset) Update the desired count for the **Target Proxy Service** back to its original value in Amazon ECS. + +### Fixing unidentified traffic patterns + +When switching over traffic to the target cluster, you might encounter unidentified traffic patterns. To help identify the cause of these patterns, use the following steps: +* Verify that the target cluster allows traffic ingress from the **Target Proxy Security Group**. +* Navigate to **Target Proxy ECS Tasks** to investigate any failing tasks. +Set the **Filter desired status** to **Any desired status** to view all tasks, then navigate to the logs for any stopped tasks. + + +## Verifying replication + +Use the following steps to verify that replication is working once the traffic capture proxy is deployed: + + +1. Navigate to the **Migration ECS Cluster** in the AWS Management Console. +2. Navigate to **Capture Proxy Service**. +3. Verify that the capture proxy is running with the desired proxy count. If it is not, update the service to increase it to at least 1 and wait for startup. +4. Under **Health and Metrics** > **Load balancer health**, verify that all targets are healthy. This means that the Application Load Balancer is able to connect to the source cluster through the capture proxy. +5. Navigate to the **Migration Console Terminal**. +6. Run `console kafka describe-topic-records`. Wait 30 seconds for another Application Load Balancer health check. +7. Run `console kafka describe-topic-records` again and verify that the number of RECORDS increased between runs. +8. Run `console replay start` to start Traffic Replayer. +9. Run `tail -f /shared-logs-output/traffic-replayer-default/*/tuples/tuples.log | jq '.targetResponses[]."Status-Code"'` to confirm that the Kafka requests were sent to the target and that it responded as expected. If the responses don't appear: + * Check that the migration console can access the target cluster by running `./catIndices.sh`, which should show the indexes in the source and target. + * Confirm that messages are still being recorded to Kafka. + * Check for errors in the Traffic Replayer logs (`/migration/STAGE/default/traffic-replayer-default`) using CloudWatch. +10. (Reset) Update the desired count for the **Capture Proxy Service** back to its original value in Amazon ECS. + +### Troubleshooting + +Use this guidance to troubleshoot any of the following replication verification issues. + +### Health check responses with 401/403 status code + +If the source cluster is configured to require authentication, the capture proxy will not be able to verify replication beyond receiving a 401/403 status code for Application Load Balancer health checks. For more information, see [Failure Modes](https://github.com/opensearch-project/opensearch-migrations/blob/main/TrafficCapture/trafficCaptureProxyServer/README.md#failure-modes). + +### Traffic does not reach the source cluster + +Verify that the source cluster allows traffic ingress from the Capture Proxy Security Group. + +Look for failing tasks by navigating to **Traffic Capture Proxy ECS**. Change **Filter desired status** to **Any desired status** in order to see all tasks and navigate to the logs for stopped tasks. + + +## Resetting before migration + +After all verifications are complete, reset all resources before using Migration Assistant for an actual migration. + +The following steps outline how to reset resources with Migration Assistant before executing the actual migration. At this point all verifications are expected to have been completed. These steps can be performed after [Accessing the Migration Console]({{site.url}}{{site.baseurl}}/migration-assistant/migration-console/accessing-the-migration-console/). + +### Traffic Replayer + +To stop running Traffic Replayer, use the following command: + +```bash +console replay stop +``` +{% include copy.html %} + +### Kafka + +To clear all captured traffic from the Kafka topic, you can run the following command. + +This command will result in the loss of any traffic data captured by the capture proxy up to this point and thus should be used with caution. +{: .warning} + +```bash +console kafka delete-topic +``` +{% include copy.html %} + +### Target cluster + +To clear non-system indexes from the target cluster that may have been created as a result of testing, you can run the following command: + +This command will result in the loss of all data in the target cluster and should be used with caution. +{: .warning} + +```bash +console clusters clear-indices --cluster target +``` +{% include copy.html %} + diff --git a/_ml-commons-plugin/custom-local-models.md b/_ml-commons-plugin/custom-local-models.md index c2866938f6..09c3105f8d 100644 --- a/_ml-commons-plugin/custom-local-models.md +++ b/_ml-commons-plugin/custom-local-models.md @@ -65,14 +65,10 @@ To ensure that this basic local setup works, specify the following cluster setti PUT _cluster/settings { "persistent": { - "plugins": { - "ml_commons": { - "allow_registering_model_via_url": "true", - "only_run_on_ml_node": "false", - "model_access_control_enabled": "true", - "native_memory_threshold": "99" - } - } + "plugins.ml_commons.allow_registering_model_via_url": "true", + "plugins.ml_commons.only_run_on_ml_node": "false", + "plugins.ml_commons.model_access_control_enabled": "true", + "plugins.ml_commons.native_memory_threshold": "99" } } ``` diff --git a/_ml-commons-plugin/pretrained-models.md b/_ml-commons-plugin/pretrained-models.md index 1b0c726c33..552e3e607e 100644 --- a/_ml-commons-plugin/pretrained-models.md +++ b/_ml-commons-plugin/pretrained-models.md @@ -88,13 +88,9 @@ This example uses a simple setup with no dedicated ML nodes and allows running a PUT _cluster/settings { "persistent": { - "plugins": { - "ml_commons": { - "only_run_on_ml_node": "false", - "model_access_control_enabled": "true", - "native_memory_threshold": "99" - } - } + "plugins.ml_commons.only_run_on_ml_node": "false", + "plugins.ml_commons.model_access_control_enabled": "true", + "plugins.ml_commons.native_memory_threshold": "99" } } ``` diff --git a/_ml-commons-plugin/remote-models/connectors.md b/_ml-commons-plugin/remote-models/connectors.md index 3ec6c73b07..788f1b003d 100644 --- a/_ml-commons-plugin/remote-models/connectors.md +++ b/_ml-commons-plugin/remote-models/connectors.md @@ -294,7 +294,7 @@ In some cases, you may need to update credentials, like `access_key`, that you u ```json PUT /_plugins/_ml/models/<model_id> { - "connector": { + "connectors": { "credential": { "openAI_key": "YOUR NEW OPENAI KEY" } diff --git a/_ml-commons-plugin/tutorials/generate-embeddings.md b/_ml-commons-plugin/tutorials/generate-embeddings.md index 92b62b9fe8..c236424eb8 100644 --- a/_ml-commons-plugin/tutorials/generate-embeddings.md +++ b/_ml-commons-plugin/tutorials/generate-embeddings.md @@ -107,7 +107,7 @@ PUT _ingest/pipeline/bedrock_embedding_pipeline { "set": { "field": "title_tmp", - "value": "{{_ingest._value.title}}" + "value": {% raw %}"{{_ingest._value.title}}"{% endraw %} } }, { @@ -333,4 +333,4 @@ POST _bulk { "index" : { "_index" : "my_books" } } { "books" : [{"title": "third book", "description": "This is third book"}, {"description": "This is fourth book"}] } ``` -{% include copy-curl.html %} \ No newline at end of file +{% include copy-curl.html %} diff --git a/_observing-your-data/query-insights/grouping-top-n-queries.md b/_observing-your-data/query-insights/grouping-top-n-queries.md index 495766c0f3..d4c900d7e7 100644 --- a/_observing-your-data/query-insights/grouping-top-n-queries.md +++ b/_observing-your-data/query-insights/grouping-top-n-queries.md @@ -45,6 +45,96 @@ bool When queries share the same query structure, they are grouped together, ensuring that all similar queries belong to the same group. +## Configuring the query structure + +The preceding example query shows a simplified query structure. By default, the query structure also includes field names and field data types. + +For example, consider an index `index1` with the following field mapping: + +```json +"mappings": { + "properties": { + "field1": { + "type": "keyword" + }, + "field2": { + "type": "text" + }, + "field3": { + "type": "text" + }, + "field4": { + "type": "long" + } + } +} +``` + +If you run the following query on this index: + +```json +{ + "query": { + "bool": { + "must": [ + { + "term": { + "field1": "example_value" + } + } + ], + "filter": [ + { + "match": { + "field2": "search_text" + } + }, + { + "range": { + "field4": { + "gte": 1, + "lte": 100 + } + } + } + ], + "should": [ + { + "regexp": { + "field3": ".*" + } + } + ] + } + } +} +``` + +Then the query has the following corresponding query structure: + +```c +bool [] + must: + term [field1, keyword] + filter: + match [field2, text] + range [field4, long] + should: + regexp [field3, text] +``` + +To exclude field names and field data types from the query structure, configure the following settings: + +```json +PUT _cluster/settings +{ + "persistent" : { + "search.insights.top_queries.grouping.attributes.field_name" : false, + "search.insights.top_queries.grouping.attributes.field_type" : false + } +} +``` +{% include copy-curl.html %} ## Aggregate metrics per group diff --git a/_observing-your-data/query-insights/health.md b/_observing-your-data/query-insights/health.md new file mode 100644 index 0000000000..02324d8237 --- /dev/null +++ b/_observing-your-data/query-insights/health.md @@ -0,0 +1,119 @@ +--- +layout: default +title: Query Insights plugin health +parent: Query insights +nav_order: 50 +--- + +# Query Insights plugin health + +The Query Insights plugin provides an [API](#health-stats-api) and [metrics](#opentelemetry-error-metrics-counters) for monitoring its health and performance, enabling proactive identification of issues that may affect query processing or system resources. + +## Health Stats API +**Introduced 2.18** +{: .label .label-purple } + +The Health Stats API provides health metrics for each node running the Query Insights plugin. These metrics allow for an in-depth view of resource usage and the health of the query processing components. + +### Path and HTTP methods + +```json +GET _insights/health_stats +``` + +### Example request + +```json +GET _insights/health_stats +``` +{% include copy-curl.html %} + +### Example response + +The response includes a set of health-related fields for each node: + +```json +PUT _cluster/settings +{ + "AqegbPL0Tv2XWvZV4PTS8Q": { + "ThreadPoolInfo": { + "query_insights_executor": { + "type": "scaling", + "core": 1, + "max": 5, + "keep_alive": "5m", + "queue_size": 2 + } + }, + "QueryRecordsQueueSize": 2, + "TopQueriesHealthStats": { + "latency": { + "TopQueriesHeapSize": 5, + "QueryGroupCount_Total": 0, + "QueryGroupCount_MaxHeap": 0 + }, + "memory": { + "TopQueriesHeapSize": 5, + "QueryGroupCount_Total": 0, + "QueryGroupCount_MaxHeap": 0 + }, + "cpu": { + "TopQueriesHeapSize": 5, + "QueryGroupCount_Total": 0, + "QueryGroupCount_MaxHeap": 0 + } + } + } +} +``` + +### Response fields + +The following table lists all response body fields. + +Field | Data type | Description +:--- |:---| :--- +`ThreadPoolInfo` | Object | Information about the Query Insights thread pool, including type, core count, max threads, and queue size. See [The ThreadPoolInfo object](#the-threadpoolinfo-object). +`QueryRecordsQueueSize` | Integer | The size of the queue that buffers incoming search queries before processing. A high value may suggest increased load or slower processing. +`TopQueriesHealthStats` | Object | Performance metrics for each top query service that provide information about memory allocation (heap size) and query grouping. See [The TopQueriesHealthStats object](#the-topquerieshealthstats-object). + +### The ThreadPoolInfo object + +The `ThreadPoolInfo` object contains the following detailed configuration and performance data for the thread pool dedicated to the Query Insights plugin. + +Field | Data type | Description +:--- |:---| :--- +`type`| String | The thread pool type (for example, `scaling`). +`core`| Integer | The minimum number of threads in the thread pool. +`max`| Integer | The maximum number of threads in the thread pool. +`keep_alive`| Time unit | The amount of time that idle threads are retained. +`queue_size`| Integer | The maximum number of tasks in the queue. + +### The TopQueriesHealthStats object + +The `TopQueriesHealthStats` object provides breakdowns for latency, memory, and CPU usage and contains the following information. + +Field | Data type | Description +:--- |:---| :--- +`TopQueriesHeapSize`| Integer | The heap memory allocation for the query group. +`QueryGroupCount_Total`| Integer | The total number of processed query groups. +`QueryGroupCount_MaxHeap`| Integer | The size of the max heap that stores all query groups in memory. + +## OpenTelemetry error metrics counters + +The Query Insights plugin integrates with OpenTelemetry to provide real-time error metrics counters. These counters help to identify specific operational failures in the plugin and improve reliability. Each metric provides targeted insights into potential error sources in the plugin workflow, allowing for more focused debugging and maintenance. + +To collect these metrics, you must configure and collect query metrics. For more information, see [Query metrics]({{site.url}}{{site.baseurl}}/observing-your-data/query-insights/query-metrics/). + +The following table lists all available metrics. + +Field | Description +:--- | :--- +`LOCAL_INDEX_READER_PARSING_EXCEPTIONS` | The number of errors that occur when parsing data using the LocalIndexReader. +`LOCAL_INDEX_EXPORTER_BULK_FAILURES` | The number of failures that occur when ingesting Query Insights plugin data into local indexes. +`LOCAL_INDEX_EXPORTER_EXCEPTIONS` | The number of exceptions that occur in the Query Insights plugin LocalIndexExporter. +`INVALID_EXPORTER_TYPE_FAILURES` | The number of invalid exporter type failures. +`INVALID_INDEX_PATTERN_EXCEPTIONS` | The number of invalid index pattern exceptions. +`DATA_INGEST_EXCEPTIONS` | The number of exceptions that occur when ingesting data into the Query Insights plugin. +`QUERY_CATEGORIZE_EXCEPTIONS` | The number of exceptions that occur when categorizing the queries. +`EXPORTER_FAIL_TO_CLOSE_EXCEPTION` | The number of failures that occur when closing the exporter. \ No newline at end of file diff --git a/_observing-your-data/query-insights/index.md b/_observing-your-data/query-insights/index.md index ef3a65bfcd..4bba866c05 100644 --- a/_observing-your-data/query-insights/index.md +++ b/_observing-your-data/query-insights/index.md @@ -4,6 +4,8 @@ title: Query insights nav_order: 40 has_children: true has_toc: false +redirect_from: + - /query-insights/ --- # Query insights @@ -40,3 +42,7 @@ You can obtain the following information using Query Insights: - [Top n queries]({{site.url}}{{site.baseurl}}/observing-your-data/query-insights/top-n-queries/) - [Grouping top N queries]({{site.url}}{{site.baseurl}}/observing-your-data/query-insights/grouping-top-n-queries/) - [Query metrics]({{site.url}}{{site.baseurl}}/observing-your-data/query-insights/query-metrics/) + +## Query Insights plugin health + +For information about monitoring the health of the Query Insights plugin, see [Query Insights plugin health]({{site.url}}{{site.baseurl}}/observing-your-data/query-insights/health/). \ No newline at end of file diff --git a/_observing-your-data/query-insights/top-n-queries.md b/_observing-your-data/query-insights/top-n-queries.md index b63d670926..a5bf42e694 100644 --- a/_observing-your-data/query-insights/top-n-queries.md +++ b/_observing-your-data/query-insights/top-n-queries.md @@ -96,6 +96,17 @@ GET /_insights/top_queries?type=memory ``` {% include copy-curl.html %} + +To specify a time range for querying top N results, use the `from` and `to` parameters in ISO8601 format: `YYYY-MM-DD'T'HH:mm:ss.SSSZ`. +For example, to retrieve the top N queries from August 25, 2024, at 15:00 UTC to August 30, 2024, at 17:00 UTC, send the following request: + +```json +GET /_insights/top_queries?from=2024-08-25T15:00:00.000Z&to=2024-08-30T17:00:00.000Z +``` +{% include copy-curl.html %} + +If you have a [local index exporter enabled](#configuring-a-local-index-exporter), historical queries stored in local OpenSearch indexes will also be included in the specified time range. + If your query returns no results, ensure that top N query monitoring is enabled for the target metric type and that search requests were made within the current [time window](#configuring-the-window-size). {: .important} diff --git a/_observing-your-data/trace/distributed-tracing.md b/_observing-your-data/trace/distributed-tracing.md index 4fb464f67c..773b4dd34a 100644 --- a/_observing-your-data/trace/distributed-tracing.md +++ b/_observing-your-data/trace/distributed-tracing.md @@ -1,6 +1,6 @@ --- layout: default -title: Distrbuted tracing +title: Distributed tracing parent: Trace Analytics nav_order: 65 --- diff --git a/_query-dsl/full-text/match-bool-prefix.md b/_query-dsl/full-text/match-bool-prefix.md index 3964dc5ee8..6905d49989 100644 --- a/_query-dsl/full-text/match-bool-prefix.md +++ b/_query-dsl/full-text/match-bool-prefix.md @@ -216,7 +216,7 @@ The `<field>` accepts the following parameters. All parameters except `query` ar Parameter | Data type | Description :--- | :--- | :--- `query` | String | The text, number, Boolean value, or date to use for search. Required. -`analyzer` | String | The [analyzer]({{site.url}}{{site.baseurl}}/analyzers/index/) used to tokenize the query string text. Default is the index-time analyzer specified for the `default_field`. If no analyzer is specified for the `default_field`, the `analyzer` is the default analyzer for the index. +`analyzer` | String | The [analyzer]({{site.url}}{{site.baseurl}}/analyzers/index/) used to tokenize the query string text. Default is the index-time analyzer specified for the `default_field`. If no analyzer is specified for the `default_field`, the `analyzer` is the default analyzer for the index. For more information about `index.query.default_field`, see [Dynamic index-level index settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index-settings/#dynamic-index-level-index-settings). `fuzziness` | `AUTO`, `0`, or a positive integer | The number of character edits (insert, delete, substitute) that it takes to change one word to another when determining whether a term matched a value. For example, the distance between `wined` and `wind` is 1. The default, `AUTO`, chooses a value based on the length of each term and is a good choice for most use cases. `fuzzy_rewrite` | String | Determines how OpenSearch rewrites the query. Valid values are `constant_score`, `scoring_boolean`, `constant_score_boolean`, `top_terms_N`, `top_terms_boost_N`, and `top_terms_blended_freqs_N`. If the `fuzziness` parameter is not `0`, the query uses a `fuzzy_rewrite` method of `top_terms_blended_freqs_${max_expansions}` by default. Default is `constant_score`. `fuzzy_transpositions` | Boolean | Setting `fuzzy_transpositions` to `true` (default) adds swaps of adjacent characters to the insert, delete, and substitute operations of the `fuzziness` option. For example, the distance between `wind` and `wnid` is 1 if `fuzzy_transpositions` is true (swap "n" and "i") and 2 if it is false (delete "n", insert "n"). If `fuzzy_transpositions` is false, `rewind` and `wnid` have the same distance (2) from `wind`, despite the more human-centric opinion that `wnid` is an obvious typo. The default is a good choice for most use cases. diff --git a/_query-dsl/full-text/match-phrase.md b/_query-dsl/full-text/match-phrase.md index 747c4814d9..3f36465790 100644 --- a/_query-dsl/full-text/match-phrase.md +++ b/_query-dsl/full-text/match-phrase.md @@ -268,6 +268,6 @@ The `<field>` accepts the following parameters. All parameters except `query` ar Parameter | Data type | Description :--- | :--- | :--- `query` | String | The query string to use for search. Required. -`analyzer` | String | The [analyzer]({{site.url}}{{site.baseurl}}/analyzers/index/) used to tokenize the query string text. Default is the index-time analyzer specified for the `default_field`. If no analyzer is specified for the `default_field`, the `analyzer` is the default analyzer for the index. +`analyzer` | String | The [analyzer]({{site.url}}{{site.baseurl}}/analyzers/index/) used to tokenize the query string text. Default is the index-time analyzer specified for the `default_field`. If no analyzer is specified for the `default_field`, the `analyzer` is the default analyzer for the index. For more information about `index.query.default_field`, see [Dynamic index-level index settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index-settings/#dynamic-index-level-index-settings). `slop` | `0` (default) or a positive integer | Controls the degree to which words in a query can be misordered and still be considered a match. From the [Lucene documentation](https://lucene.apache.org/core/8_9_0/core/org/apache/lucene/search/PhraseQuery.html#getSlop--): "The number of other words permitted between words in query phrase. For example, to switch the order of two words requires two moves (the first move places the words atop one another), so to permit reorderings of phrases, the slop must be at least two. A value of zero requires an exact match." `zero_terms_query` | String | In some cases, the analyzer removes all terms from a query string. For example, the `stop` analyzer removes all terms from the string `an but this`. In those cases, `zero_terms_query` specifies whether to match no documents (`none`) or all documents (`all`). Valid values are `none` and `all`. Default is `none`. \ No newline at end of file diff --git a/_query-dsl/full-text/match.md b/_query-dsl/full-text/match.md index 056ef76890..5ece14e127 100644 --- a/_query-dsl/full-text/match.md +++ b/_query-dsl/full-text/match.md @@ -451,7 +451,7 @@ Parameter | Data type | Description :--- | :--- | :--- `query` | String | The query string to use for search. Required. `auto_generate_synonyms_phrase_query` | Boolean | Specifies whether to create a [match phrase query]({{site.url}}{{site.baseurl}}/query-dsl/full-text/match-phrase/) automatically for multi-term synonyms. For example, if you specify `ba,batting average` as synonyms and search for `ba`, OpenSearch searches for `ba OR "batting average"` (if this option is `true`) or `ba OR (batting AND average)` (if this option is `false`). Default is `true`. -`analyzer` | String | The [analyzer]({{site.url}}{{site.baseurl}}/analyzers/index/) used to tokenize the query string text. Default is the index-time analyzer specified for the `default_field`. If no analyzer is specified for the `default_field`, the `analyzer` is the default analyzer for the index. +`analyzer` | String | The [analyzer]({{site.url}}{{site.baseurl}}/analyzers/index/) used to tokenize the query string text. Default is the index-time analyzer specified for the `default_field`. If no analyzer is specified for the `default_field`, the `analyzer` is the default analyzer for the index. For more information about `index.query.default_field`, see [Dynamic index-level index settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index-settings/#dynamic-index-level-index-settings). `boost` | Floating-point | Boosts the clause by the given multiplier. Useful for weighing clauses in compound queries. Values in the [0, 1) range decrease relevance, and values greater than 1 increase relevance. Default is `1`. `enable_position_increments` | Boolean | When `true`, resulting queries are aware of position increments. This setting is useful when the removal of stop words leaves an unwanted "gap" between terms. Default is `true`. `fuzziness` | String | The number of character edits (insertions, deletions, substitutions, or transpositions) that it takes to change one word to another when determining whether a term matched a value. For example, the distance between `wined` and `wind` is 1. Valid values are non-negative integers or `AUTO`. The default, `AUTO`, chooses a value based on the length of each term and is a good choice for most use cases. diff --git a/_query-dsl/full-text/multi-match.md b/_query-dsl/full-text/multi-match.md index ab1496fdd3..a3995df714 100644 --- a/_query-dsl/full-text/multi-match.md +++ b/_query-dsl/full-text/multi-match.md @@ -900,9 +900,9 @@ Parameter | Data type | Description :--- | :--- | :--- `query` | String | The query string to use for search. Required. `auto_generate_synonyms_phrase_query` | Boolean | Specifies whether to create a [match phrase query]({{site.url}}{{site.baseurl}}/query-dsl/full-text/match-phrase/) automatically for multi-term synonyms. For example, if you specify `ba,batting average` as synonyms and search for `ba`, OpenSearch searches for `ba OR "batting average"` (if this option is `true`) or `ba OR (batting AND average)` (if this option is `false`). Default is `true`. -`analyzer` | String | The [analyzer]({{site.url}}{{site.baseurl}}/analyzers/index/) used to tokenize the query string text. Default is the index-time analyzer specified for the `default_field`. If no analyzer is specified for the `default_field`, the `analyzer` is the default analyzer for the index. +`analyzer` | String | The [analyzer]({{site.url}}{{site.baseurl}}/analyzers/index/) used to tokenize the query string text. Default is the index-time analyzer specified for the `default_field`. If no analyzer is specified for the `default_field`, the `analyzer` is the default analyzer for the index. For more information about `index.query.default_field`, see [Dynamic index-level index settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index-settings/#dynamic-index-level-index-settings). `boost` | Floating-point | Boosts the clause by the given multiplier. Useful for weighing clauses in compound queries. Values in the [0, 1) range decrease relevance, and values greater than 1 increase relevance. Default is `1`. -`fields` | Array of strings | The list of fields in which to search. If you don't provide the `fields` parameter, `multi_match` query searches the fields specified in the `index.query. Default_field` setting, which defaults to `*`. +`fields` | Array of strings | The list of fields in which to search. If you don't provide the `fields` parameter, `multi_match` query searches the fields specified in the `index.query.default_field` setting, which defaults to `*`. `fuzziness` | String | The number of character edits (insert, delete, substitute) that it takes to change one word to another when determining whether a term matched a value. For example, the distance between `wined` and `wind` is 1. Valid values are non-negative integers or `AUTO`. The default, `AUTO`, chooses a value based on the length of each term and is a good choice for most use cases. Not supported for `phrase`, `phrase_prefix`, and `cross_fields` queries. `fuzzy_rewrite` | String | Determines how OpenSearch rewrites the query. Valid values are `constant_score`, `scoring_boolean`, `constant_score_boolean`, `top_terms_N`, `top_terms_boost_N`, and `top_terms_blended_freqs_N`. If the `fuzziness` parameter is not `0`, the query uses a `fuzzy_rewrite` method of `top_terms_blended_freqs_${max_expansions}` by default. Default is `constant_score`. `fuzzy_transpositions` | Boolean | Setting `fuzzy_transpositions` to `true` (default) adds swaps of adjacent characters to the insert, delete, and substitute operations of the `fuzziness` option. For example, the distance between `wind` and `wnid` is 1 if `fuzzy_transpositions` is true (swap "n" and "i") and 2 if it is false (delete "n", insert "n"). If `fuzzy_transpositions` is false, `rewind` and `wnid` have the same distance (2) from `wind`, despite the more human-centric opinion that `wnid` is an obvious typo. The default is a good choice for most use cases. diff --git a/_query-dsl/full-text/query-string.md b/_query-dsl/full-text/query-string.md index 47180e3f6d..7b3343155d 100644 --- a/_query-dsl/full-text/query-string.md +++ b/_query-dsl/full-text/query-string.md @@ -623,7 +623,7 @@ Parameter | Data type | Description `query` | String | The text that may contain expressions in the [query string syntax](#query-string-syntax) to use for search. Required. `allow_leading_wildcard` | Boolean | Specifies whether `*` and `?` are allowed as first characters of a search term. Default is `true`. `analyze_wildcard` | Boolean | Specifies whether OpenSearch should attempt to analyze wildcard terms. Default is `false`. -`analyzer` | String | The [analyzer]({{site.url}}{{site.baseurl}}/analyzers/index/) used to tokenize the query string text. Default is the index-time analyzer specified for the `default_field`. If no analyzer is specified for the `default_field`, the `analyzer` is the default analyzer for the index. +`analyzer` | String | The [analyzer]({{site.url}}{{site.baseurl}}/analyzers/index/) used to tokenize the query string text. Default is the index-time analyzer specified for the `default_field`. If no analyzer is specified for the `default_field`, the `analyzer` is the default analyzer for the index. For more information about `index.query.default_field`, see [Dynamic index-level index settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index-settings/#dynamic-index-level-index-settings). `auto_generate_synonyms_phrase_query` | Boolean | Specifies whether to create a [match phrase query]({{site.url}}{{site.baseurl}}/query-dsl/full-text/match-phrase/) automatically for multi-term synonyms. For example, if you specify `ba, batting average` as synonyms and search for `ba`, OpenSearch searches for `ba OR "batting average"` (if this option is `true`) or `ba OR (batting AND average)` (if this option is `false`). Default is `true`. `boost` | Floating-point | Boosts the clause by the given multiplier. Useful for weighing clauses in compound queries. Values in the [0, 1) range decrease relevance, and values greater than 1 increase relevance. Default is `1`. `default_field` | String | The field in which to search if the field is not specified in the query string. Supports wildcards. Defaults to the value specified in the `index.query. Default_field` index setting. By default, the `index.query. Default_field` is `*`, which means extract all fields eligible for term query and filter the metadata fields. The extracted fields are combined into a query if the `prefix` is not specified. Eligible fields do not include nested documents. Searching all eligible fields could be a resource-intensive operation. The `indices.query.bool.max_clause_count` search setting defines the maximum value for the product of the number of fields and the number of terms that can be queried at one time. The default value for `indices.query.bool.max_clause_count` is 1,024. diff --git a/_query-dsl/full-text/simple-query-string.md b/_query-dsl/full-text/simple-query-string.md index 58780cfdb4..1624efdaa7 100644 --- a/_query-dsl/full-text/simple-query-string.md +++ b/_query-dsl/full-text/simple-query-string.md @@ -355,14 +355,14 @@ Parameter | Data type | Description :--- | :--- | :--- `query`| String | The text that may contain expressions in the [simple query string syntax](#simple-query-string-syntax) to use for search. Required. `analyze_wildcard` | Boolean | Specifies whether OpenSearch should attempt to analyze wildcard terms. Default is `false`. -`analyzer` | String | The analyzer used to tokenize the query string text. Default is the index-time analyzer specified for the `default_field`. If no analyzer is specified for the `default_field`, the `analyzer` is the default analyzer for the index. +`analyzer` | String | The analyzer used to tokenize the query string text. Default is the index-time analyzer specified for the `default_field`. If no analyzer is specified for the `default_field`, the `analyzer` is the default analyzer for the index. For more information about `index.query.default_field`, see [Dynamic index-level index settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index-settings/#dynamic-index-level-index-settings). `auto_generate_synonyms_phrase_query` | Boolean | Specifies whether to create [match_phrase queries]({{site.url}}{{site.baseurl}}/query-dsl/full-text/match/) automatically for multi-term synonyms. Default is `true`. `default_operator`| String | If the query string contains multiple search terms, whether all terms need to match (`AND`) or only one term needs to match (`OR`) for a document to be considered a match. Valid values are:<br>- `OR`: The string `to be` is interpreted as `to OR be`<br>- `AND`: The string `to be` is interpreted as `to AND be`<br> Default is `OR`. -`fields` | String array | The list of fields to search (for example, `"fields": ["title^4", "description"]`). Supports wildcards. If unspecified, defaults to the `index.query. Default_field` setting, which defaults to `["*"]`. The maximum number of fields that can be searched at the same time is defined by `indices.query.bool.max_clause_count`, which is 1,024 by default. +`fields` | String array | The list of fields to search (for example, `"fields": ["title^4", "description"]`). Supports wildcards. If unspecified, defaults to the `index.query.default_field` setting, which defaults to `["*"]`. The maximum number of fields that can be searched at the same time is defined by `indices.query.bool.max_clause_count`, which is 1,024 by default. `flags` | String | A `|`-delimited string of [flags]({{site.baseurl}}/query-dsl/full-text/simple-query-string/) to enable (for example, `AND|OR|NOT`). Default is `ALL`. You can explicitly set the value for `default_field`. For example, to return all titles, set it to `"default_field": "title"`. `fuzzy_max_expansions` | Positive integer | The maximum number of terms to which the query can expand. Fuzzy queries “expand to” a number of matching terms that are within the distance specified in `fuzziness`. Then OpenSearch tries to match those terms. Default is `50`. `fuzzy_transpositions` | Boolean | Setting `fuzzy_transpositions` to `true` (default) adds swaps of adjacent characters to the insert, delete, and substitute operations of the `fuzziness` option. For example, the distance between `wind` and `wnid` is 1 if `fuzzy_transpositions` is true (swap "n" and "i") and 2 if it is false (delete "n", insert "n"). If `fuzzy_transpositions` is false, `rewind` and `wnid` have the same distance (2) from `wind`, despite the more human-centric opinion that `wnid` is an obvious typo. The default is a good choice for most use cases. `fuzzy_prefix_length`| Integer | The number of beginning characters left unchanged for fuzzy matching. Default is 0. `lenient` | Boolean | Setting `lenient` to `true` ignores data type mismatches between the query and the document field. For example, a query string of `"8.2"` could match a field of type `float`. Default is `false`. `minimum_should_match` | Positive or negative integer, positive or negative percentage, combination | If the query string contains multiple search terms and you use the `or` operator, the number of terms that need to match for the document to be considered a match. For example, if `minimum_should_match` is 2, `wind often rising` does not match `The Wind Rises.` If `minimum_should_match` is `1`, it matches. For details, see [Minimum should match]({{site.url}}{{site.baseurl}}/query-dsl/minimum-should-match/). -`quote_field_suffix` | String | This option supports searching for exact matches (surrounded with quotation marks) using a different analysis method than non-exact matches use. For example, if `quote_field_suffix` is `.exact` and you search for `\"lightly\"` in the `title` field, OpenSearch searches for the word `lightly` in the `title.exact` field. This second field might use a different type (for example, `keyword` rather than `text`) or a different analyzer. \ No newline at end of file +`quote_field_suffix` | String | This option supports searching for exact matches (surrounded with quotation marks) using a different analysis method than non-exact matches use. For example, if `quote_field_suffix` is `.exact` and you search for `\"lightly\"` in the `title` field, OpenSearch searches for the word `lightly` in the `title.exact` field. This second field might use a different type (for example, `keyword` rather than `text`) or a different analyzer. diff --git a/_query-dsl/geo-and-xy/index.md b/_query-dsl/geo-and-xy/index.md index ee51e1e523..9bcf6a9462 100644 --- a/_query-dsl/geo-and-xy/index.md +++ b/_query-dsl/geo-and-xy/index.md @@ -30,7 +30,7 @@ OpenSearch provides the following geographic query types: - [**Geo-bounding box queries**]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/geo-and-xy/geo-bounding-box/): Return documents with geopoint field values that are within a bounding box. - [**Geodistance queries**]({{site.url}}{{site.baseurl}}/query-dsl/geo-and-xy/geodistance/): Return documents with geopoints that are within a specified distance from the provided geopoint. -- [**Geopolygon queries**]({{site.url}}{{site.baseurl}}/query-dsl/geo-and-xy/geodistance/): Return documents containing geopoints that are within a polygon. +- [**Geopolygon queries**]({{site.url}}{{site.baseurl}}/query-dsl/geo-and-xy/geopolygon/): Return documents containing geopoints that are within a polygon. - [**Geoshape queries**]({{site.url}}{{site.baseurl}}/query-dsl/geo-and-xy/geoshape/): Return documents that contain: - Geoshapes and geopoints that have one of four spatial relations to the provided shape: `INTERSECTS`, `DISJOINT`, `WITHIN`, or `CONTAINS`. - - Geopoints that intersect the provided shape. \ No newline at end of file + - Geopoints that intersect the provided shape. diff --git a/_query-dsl/term/terms.md b/_query-dsl/term/terms.md index 7dac6a9619..2de0b71bd6 100644 --- a/_query-dsl/term/terms.md +++ b/_query-dsl/term/terms.md @@ -183,7 +183,7 @@ PUT classes/_doc/102 To search for students enrolled in `CS102`, use the dot path notation to specify the full path to the field in the `path` parameter: ```json -ET students/_search +GET students/_search { "query": { "terms": { @@ -383,4 +383,4 @@ POST /products/_search } } ``` -{% include copy-curl.html %} \ No newline at end of file +{% include copy-curl.html %} diff --git a/_reporting/report-dashboard-index.md b/_reporting/report-dashboard-index.md index 0df87a965c..5e6d07b802 100644 --- a/_reporting/report-dashboard-index.md +++ b/_reporting/report-dashboard-index.md @@ -11,7 +11,7 @@ redirect_from: You can use OpenSearch Dashboards to create PNG, PDF, and CSV reports. To create reports, you must have the correct permissions. For a summary of the predefined roles and the permissions they grant, see the [Security plugin]({{site.url}}{{site.baseurl}}/security/access-control/users-roles#predefined-roles). -CSV reports have a non-configurable 10,000 row limit. They have no explicit size limit (for example, MB), but extremely large documents could cause report generation to fail with an out of memory error from the V8 JavaScript engine. +CSV reports have a non-configurable 10,000-row limit in OpenSearch version 2.16 and earlier. As of version 2.17, this limit can be configured when setting up a report. While reports have no explicit size limit (for example, MB), extremely large documents could cause report generation to fail with an out-of-memory error from the V8 JavaScript engine. {: .tip } ## Generating reports diff --git a/_search-plugins/caching/request-cache.md b/_search-plugins/caching/request-cache.md index 124152300b..49d5e8cd82 100644 --- a/_search-plugins/caching/request-cache.md +++ b/_search-plugins/caching/request-cache.md @@ -28,6 +28,7 @@ Setting | Data type | Default | Level | Static/Dynamic | Description `indices.cache.cleanup_interval` | Time unit | `1m` (1 minute) | Cluster | Static | Schedules a recurring background task that cleans up expired entries from the cache at the specified interval. `indices.requests.cache.size` | Percentage | `1%` | Cluster | Static | The cache size as a percentage of the heap size (for example, to use 1% of the heap, specify `1%`). `index.requests.cache.enable` | Boolean | `true` | Index | Dynamic | Enables or disables the request cache. +`indices.requests.cache.maximum_cacheable_size` | Integer | `0` | Cluster | Dynamic | Sets the maximum `size` of queries to be added to the request cache. ### Example diff --git a/_search-plugins/caching/tiered-cache.md b/_search-plugins/caching/tiered-cache.md index 22b1138be8..1b793c8465 100644 --- a/_search-plugins/caching/tiered-cache.md +++ b/_search-plugins/caching/tiered-cache.md @@ -44,7 +44,7 @@ In OpenSearch 2.14, a request cache can be used in a tiered cache. To begin, con To use the OpenSearch-provided tiered spillover cache implementation, set the cache store name to `tiered_spillover`, as shown in the following example: ```yaml -indices.request.cache.store.name: tiered_spillover +indices.requests.cache.store.name: tiered_spillover ``` {% include copy.html %} @@ -53,8 +53,8 @@ indices.request.cache.store.name: tiered_spillover Set the on-heap and disk store tiers to `opensearch_onheap` and `ehcache_disk`, as shown in the following example: ```yaml -indices.request.cache.tiered_spillover.onheap.store.name: opensearch_onheap -indices.request.cache.tiered_spillover.disk.store.name: ehcache_disk +indices.requests.cache.tiered_spillover.onheap.store.name: opensearch_onheap +indices.requests.cache.tiered_spillover.disk.store.name: ehcache_disk ``` The `opensearch_onheap` setting uses the built-in on-heap cache available in OpenSearch. @@ -68,19 +68,19 @@ The following table lists the cache store settings for the `opensearch_onheap` s Setting | Data type | Default | Description :--- | :--- | :--- | :--- -`indices.request.cache.opensearch_onheap.size` | Percentage | 1% of the heap size | The size of the on-heap cache. Optional. -`indices.request.cache.opensearch_onheap.expire` | Time unit | `MAX_VALUE` (disabled) | Specifies a time-to-live (TTL) for the cached results. Optional. +`indices.requests.cache.opensearch_onheap.size` | Percentage | 1% of the heap size | The size of the on-heap cache. Optional. +`indices.requests.cache.opensearch_onheap.expire` | Time unit | `MAX_VALUE` (disabled) | Specifies a time-to-live (TTL) for the cached results. Optional. The following table lists the disk cache store settings for the `ehcache_disk` store. Setting | Data type | Default | Description :--- | :--- | :--- | :--- -`indices.request.cache.ehcache_disk.max_size_in_bytes` | Long | `1073741824` (1 GB) | Defines the size of the disk cache. Optional. -`indices.request.cache.ehcache_disk.storage.path` | String | `""` | Defines the storage path for the disk cache. Required. -`indices.request.cache.ehcache_disk.expire_after_access` | Time unit | `MAX_VALUE` (disabled) | Specifies a TTL for the cached results. Optional. -`indices.request.cache.ehcache_disk.alias` | String | `ehcacheDiskCache#INDICES_REQUEST_CACHE` | Specifies an alias for the disk cache. Optional. -`indices.request.cache.ehcache_disk.segments` | Integer | `16` | Defines the number of segments into which the disk cache is separated. Used for concurrency. Optional. -`indices.request.cache.ehcache_disk.concurrency` | Integer | `1` | Defines the number of distinct write queues created for the disk store, where a group of segments shares a write queue. Optional. +`indices.requests.cache.ehcache_disk.max_size_in_bytes` | Long | `1073741824` (1 GB) | Defines the size of the disk cache. Optional. +`indices.requests.cache.ehcache_disk.storage.path` | String | `{data.paths}/nodes/{node.id}/request_cache` | Defines the storage path for the disk cache. Optional. +`indices.requests.cache.ehcache_disk.expire_after_access` | Time unit | `MAX_VALUE` (disabled) | Specifies a TTL for the cached results. Optional. +`indices.requests.cache.ehcache_disk.alias` | String | `ehcacheDiskCache#INDICES_REQUEST_CACHE` | Specifies an alias for the disk cache. Optional. +`indices.requests.cache.ehcache_disk.segments` | Integer | `16` | Defines the number of segments into which the disk cache is separated. Used for concurrency. Optional. +`indices.requests.cache.ehcache_disk.concurrency` | Integer | `1` | Defines the number of distinct write queues created for the disk store, where a group of segments shares a write queue. Optional. ### Additional settings for the `tiered_spillover` store @@ -88,8 +88,11 @@ The following table lists additional settings for the `tiered_spillover` store s Setting | Data type | Default | Description :--- | :--- | :--- | :--- -`indices.request.cache.tiered_spillover.disk.store.policies.took_time.threshold` | Time unit | `10ms` | A policy used to determine whether to cache a query into a disk cache based on its took time. This is a dynamic setting. Optional. -`indices.request.cache.tiered_spillover.disk.store.enabled` | Boolean | `True` | Enables or disables the disk cache dynamically within a tiered spillover cache. Note: After disabling a disk cache, entries are not removed automatically and requires the cache to be manually cleared. Optional. +`indices.requests.cache.tiered_spillover.disk.store.policies.took_time.threshold` | Time unit | `10ms` | A policy used to determine whether to cache a query into a disk cache based on its took time. This is a dynamic setting. Optional. +`indices.requests.cache.tiered_spillover.disk.store.enabled` | Boolean | `True` | Enables or disables the disk cache dynamically within a tiered spillover cache. Note: After disabling a disk cache, entries are not removed automatically and requires the cache to be manually cleared. Optional. +`indices.requests.cache.tiered_spillover.onheap.store.size` | Percentage | 1% of the heap size | Defines the size of the on-heap cache within tiered cache. Optional. +`indices.requests.cache.tiered_spillover.disk.store.size` | Long | `1073741824` (1 GB) | Defines the size of the disk cache within tiered cache. Optional. +`indices.requests.cache.tiered_spillover.segments` | Integer | `2 ^ (ceil(log2(CPU_CORES * 1.5)))` | This determines the number of segments in the tiered cache, with each segment secured by a re-entrant read/write lock. These locks enable multiple concurrent readers without contention, while the segmentation allows multiple writers to operate simultaneously, resulting in higher write throughput. Optional. ### Delete stale entries settings diff --git a/_search-plugins/filter-search.md b/_search-plugins/filter-search.md new file mode 100644 index 0000000000..f8625e0ac0 --- /dev/null +++ b/_search-plugins/filter-search.md @@ -0,0 +1,151 @@ +--- +layout: default +title: Filter search results +nav_order: 36 +--- + +# Filter search results + +You can filter searches using different methods, each suited to specific scenarios. You can apply filters at the query level, using `boolean` query clauses and `post_filter` and `aggregation` level filters, as follows: + +- **Query-level filtering:** Apply `boolean` query filter clauses to filter search hits and aggregations, such as to narrow results to specific categories or brands. +- **Post-filter filtering:** Use `post_filter` to refine search hits based on user selections while preserving all aggregation options. +- **Aggregation-level filtering:** Adjust specific aggregations based on selected filters without impacting other aggregations. + +## Query-level filtering with Boolean queries + +Use a `boolean` query with a filter clause to apply filters to both search hits and aggregations. For example, if a shopper searches for `smartphones` from `BrandA`, a Boolean query can restrict results to only those smartphones from `BrandA`. The following steps guide you through query-level filtering. + +1. Create an index `electronics` and provide the mapping using the following request: + +```json +PUT /electronics +{ + "mappings": { + "properties": { + "brand": { "type": "keyword" }, + "category": { "type": "keyword" }, + "price": { "type": "float" }, + "features": { "type": "keyword" } + } + } +} +``` +{% include copy-curl.html %} + +2. Add documents to the `electronics` index using the following request: + +```json +PUT /electronics/_doc/1?refresh +{ + "brand": "BrandA", + "category": "Smartphone", + "price": 699.99, + "features": ["5G", "Dual Camera"] +} +PUT /electronics/_doc/2?refresh +{ + "brand": "BrandA", + "category": "Laptop", + "price": 1199.99, + "features": ["Touchscreen", "16GB RAM"] +} +PUT /electronics/_doc/3?refresh +{ + "brand": "BrandB", + "category": "Smartphone", + "price": 799.99, + "features": ["5G", "Triple Camera"] +} +``` +{% include copy-curl.html %} + +3. Apply a `boolean` filter query to display only `smartphones` from `BrandA` using the following request: + +```json +GET /electronics/_search +{ + "query": { + "bool": { + "filter": [ + { "term": { "brand": "BrandA" }}, + { "term": { "category": "Smartphone" }} + ] + } + } +} +``` +{% include copy-curl.html %} + +## Narrowing results using `post-filter` while preserving aggregation visibility + +Use `post_filter` to limit search hits while preserving all aggregation options. For example, if a shopper selects `BrandA`, results are filtered to show only `BrandA` products while maintaining the visibility of all brand options in the aggregations, as shown in the following example request: + +```json +GET /electronics/_search +{ + "query": { + "bool": { + "filter": { "term": { "category": "Smartphone" }} + } + }, + "aggs": { + "brands": { + "terms": { "field": "brand" } + } + }, + "post_filter": { + "term": { "brand": "BrandA" } + } +} +``` +{% include copy-curl.html %} + +The result should show `BrandA` smartphones in the search hits and all brands in the aggregations. + +## Refining aggregations with aggregation-level filtering + +You can use aggregation-level filtering to apply filters to specific aggregations without affecting the main aggregation to which they belong. + +For example, you can use aggregation-level filtering to filter the `price_ranges` aggregation based on selected brands, `BrandA` and `BrandB`, without affecting the main `price_ranges` aggregation, as shown in the following example request. This displays price ranges relevant to the selected brands while also displaying overall price ranges for all products. + +```json +GET /electronics/_search +{ + "query": { + "bool": { + "filter": { "term": { "category": "Smartphone" }} + } + }, + "aggs": { + "price_ranges": { + "range": { + "field": "price", + "ranges": [ + { "to": 500 }, + { "from": 500, "to": 1000 }, + { "from": 1000 } + ] + } + }, + "filtered_brands": { + "filter": { + "terms": { "brand": ["BrandA", "BrandB"] } + }, + "aggs": { + "price_ranges": { + "range": { + "field": "price", + "ranges": [ + { "to": 500 }, + { "from": 500, "to": 1000 }, + { "from": 1000 } + ] + } + } + } + } + } +} +``` +{% include copy-curl.html %} diff --git a/_search-plugins/improving-search-performance.md b/_search-plugins/improving-search-performance.md index 4a0ffafe11..4cc0a60dc0 100644 --- a/_search-plugins/improving-search-performance.md +++ b/_search-plugins/improving-search-performance.md @@ -11,4 +11,6 @@ OpenSearch offers several ways to improve search performance: - Run resource-intensive queries asynchronously with [asynchronous search]({{site.url}}{{site.baseurl}}/search-plugins/async/). -- Search segments concurrently using [concurrent segment search]({{site.url}}{{site.baseurl}}/search-plugins/concurrent-segment-search/). \ No newline at end of file +- Search segments concurrently using [concurrent segment search]({{site.url}}{{site.baseurl}}/search-plugins/concurrent-segment-search/). + +- Improve aggregation performance using a [star-tree index]({{site.url}}{{site.baseurl}}/search-plugins/star-tree-index/). diff --git a/_search-plugins/knn/disk-based-vector-search.md b/_search-plugins/knn/disk-based-vector-search.md index dfb9262db5..8fe794f44c 100644 --- a/_search-plugins/knn/disk-based-vector-search.md +++ b/_search-plugins/knn/disk-based-vector-search.md @@ -21,6 +21,11 @@ To create an index for disk-based vector search, send the following request: ```json PUT my-vector-index { + "settings" : { + "index": { + "knn": true + } + }, "mappings": { "properties": { "my_vector_field": { @@ -43,6 +48,11 @@ To reduce the compression level, provide the `compression_level` parameter when ```json PUT my-vector-index { + "settings" : { + "index": { + "knn": true + } + }, "mappings": { "properties": { "my_vector_field": { @@ -67,6 +77,11 @@ If you need more granular fine-tuning, you can override additional k-NN paramete ```json PUT my-vector-index { + "settings" : { + "index": { + "knn": true + } + }, "mappings": { "properties": { "my_vector_field": { diff --git a/_search-plugins/knn/knn-index.md b/_search-plugins/knn/knn-index.md index 620b262cf9..b53fa997d8 100644 --- a/_search-plugins/knn/knn-index.md +++ b/_search-plugins/knn/knn-index.md @@ -92,7 +92,7 @@ Mapping parameter | Required | Default | Updatable | Description :--- | :--- | :--- | :--- | :--- `name` | true | n/a | false | The identifier for the nearest neighbor method. `space_type` | false | l2 | false | The vector space used to calculate the distance between vectors. Note: This value can also be specified at the top level of the mapping. -`engine` | false | nmslib | false | The approximate k-NN library to use for indexing and search. The available libraries are faiss, nmslib, and Lucene. +`engine` | false | faiss | false | The approximate k-NN library to use for indexing and search. The available libraries are `faiss`, `nmslib`, and `lucene`. `parameters` | false | null | false | The parameters used for the nearest neighbor method. ### Supported nmslib methods @@ -368,6 +368,7 @@ Setting | Default | Updatable | Description :--- | :--- | :--- | :--- `index.knn` | false | false | Whether the index should build native library indexes for the `knn_vector` fields. If set to false, the `knn_vector` fields will be stored in doc values, but approximate k-NN search functionality will be disabled. `index.knn.algo_param.ef_search` | 100 | true | The size of the dynamic list used during k-NN searches. Higher values result in more accurate but slower searches. Only available for NMSLIB. +`index.knn.advanced.approximate_threshold` | 15,000 | true | The number of vectors a segment must have before creating specialized data structures for approximate search. Set to `-1` to disable building vector data structures and `0` to always build them. `index.knn.algo_param.ef_construction` | 100 | false | Deprecated in 1.0.0. Instead, use the [mapping parameters](https://opensearch.org/docs/latest/search-plugins/knn/knn-index/#method-definitions) to set this value. `index.knn.algo_param.m` | 16 | false | Deprecated in 1.0.0. Use the [mapping parameters](https://opensearch.org/docs/latest/search-plugins/knn/knn-index/#method-definitions) to set this value instead. `index.knn.space_type` | l2 | false | Deprecated in 1.0.0. Use the [mapping parameters](https://opensearch.org/docs/latest/search-plugins/knn/knn-index/#method-definitions) to set this value instead. diff --git a/_search-plugins/knn/painless-functions.md b/_search-plugins/knn/painless-functions.md index 7a8d9fec7b..4b2311ad65 100644 --- a/_search-plugins/knn/painless-functions.md +++ b/_search-plugins/knn/painless-functions.md @@ -51,7 +51,7 @@ The following table describes the available painless functions the k-NN plugin p Function name | Function signature | Description :--- | :--- l2Squared | `float l2Squared (float[] queryVector, doc['vector field'])` | This function calculates the square of the L2 distance (Euclidean distance) between a given query vector and document vectors. The shorter the distance, the more relevant the document is, so this example inverts the return value of the l2Squared function. If the document vector matches the query vector, the result is 0, so this example also adds 1 to the distance to avoid divide by zero errors. -l1Norm | `float l1Norm (float[] queryVector, doc['vector field'])` | This function calculates the square of the L2 distance (Euclidean distance) between a given query vector and document vectors. The shorter the distance, the more relevant the document is, so this example inverts the return value of the l2Squared function. If the document vector matches the query vector, the result is 0, so this example also adds 1 to the distance to avoid divide by zero errors. +l1Norm | `float l1Norm (float[] queryVector, doc['vector field'])` | This function calculates the L1 Norm distance (Manhattan distance) between a given query vector and document vectors. cosineSimilarity | `float cosineSimilarity (float[] queryVector, doc['vector field'])` | Cosine similarity is an inner product of the query vector and document vector normalized to both have a length of 1. If the magnitude of the query vector doesn't change throughout the query, you can pass the magnitude of the query vector to improve performance, instead of calculating the magnitude every time for every filtered document:<br /> `float cosineSimilarity (float[] queryVector, doc['vector field'], float normQueryVector)` <br />In general, the range of cosine similarity is [-1, 1]. However, in the case of information retrieval, the cosine similarity of two documents ranges from 0 to 1 because the tf-idf statistic can't be negative. Therefore, the k-NN plugin adds 1.0 in order to always yield a positive cosine similarity score. hamming | `float hamming (float[] queryVector, doc['vector field'])` | This function calculates the Hamming distance between a given query vector and document vectors. The Hamming distance is the number of positions at which the corresponding elements are different. The shorter the distance, the more relevant the document is, so this example inverts the return value of the Hamming distance. @@ -73,4 +73,4 @@ The `hamming` space type is supported for binary vectors in OpenSearch version 2 Because scores can only be positive, this script ranks documents with vector fields higher than those without. With cosine similarity, it is not valid to pass a zero vector (`[0, 0, ...]`) as input. This is because the magnitude of such a vector is 0, which raises a `divide by 0` exception in the corresponding formula. Requests containing the zero vector will be rejected, and a corresponding exception will be thrown. -{: .note } \ No newline at end of file +{: .note } diff --git a/_search-plugins/knn/performance-tuning.md b/_search-plugins/knn/performance-tuning.md index 77f44dee93..ae2368b597 100644 --- a/_search-plugins/knn/performance-tuning.md +++ b/_search-plugins/knn/performance-tuning.md @@ -45,7 +45,7 @@ If your hardware has multiple cores, you can allow multiple threads in native li Monitor CPU utilization and choose the correct number of threads. Because native library index construction is costly, choosing more threads then you need can cause additional CPU load. -### (Expert-level) Disable vector field storage in the source field +### (Expert level) Disable vector field storage in the source field The `_source` field contains the original JSON document body that was passed at index time. This field is not indexed and is not searchable but is stored so that it can be returned when executing fetch requests such as `get` and `search`. When using vector fields within the source, you can remove the vector field to save disk space, as shown in the following example where the `location` vector is excluded: @@ -95,9 +95,74 @@ In OpenSearch 2.15 or later, you can further improve indexing speed and reduce d } ``` -This is an expert-level setting. Disabling the `_recovery_source` may lead to failures during peer-to-peer recovery. Before disabling the `_recovery_source`, check with your OpenSearch cluster admin to determine whether your cluster performs regular flushes before starting the peer-to-peer recovery of shards before disabling the `_recovery_source`. +This is an expert-level setting. Disabling the `_recovery_source` may lead to failures during peer-to-peer recovery. Before disabling the `_recovery_source`, check with your OpenSearch cluster admin to determine whether your cluster performs regular flushes before starting the peer-to-peer recovery of shards prior to disabling the `_recovery_source`. {: .warning} +### (Expert level) Build vector data structures on demand + +This approach is recommended only for workloads that involve a single initial bulk upload and will be used exclusively for search after force merging to a single segment. + +During indexing, vector search builds a specialized data structure for a `knn_vector` field to enable efficient approximate k-NN search. However, these structures are rebuilt during [force merge]({{site.url}}{{site.baseurl}}/api-reference/index-apis/force-merge/) on k-NN indexes. To optimize indexing speed, follow these steps: + +1. **Disable vector data structure creation**: Disable vector data structure creation for new segments by setting [`index.knn.advanced.approximate_threshold`]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index/#index-settings) to `-1`. + + To specify the setting at index creation, send the following request: + + ```json + PUT /test-index/ + { + "settings": { + "index.knn.advanced.approximate_threshold": "-1" + } + } + ``` + {% include copy-curl.html %} + + To specify the setting after index creation, send the following request: + + ```json + PUT /test-index/_settings + { + "index.knn.advanced.approximate_threshold": "-1" + } + ``` + {% include copy-curl.html %} + +1. **Perform bulk indexing**: Index data in [bulk]({{site.url}}{{site.baseurl}}/api-reference/document-apis/bulk/) without performing any searches during ingestion: + + ```json + POST _bulk + { "index": { "_index": "test-index", "_id": "1" } } + { "my_vector1": [1.5, 2.5], "price": 12.2 } + { "index": { "_index": "test-index", "_id": "2" } } + { "my_vector1": [2.5, 3.5], "price": 7.1 } + ``` + {% include copy-curl.html %} + + If searches are performed while vector data structures are disabled, they will run using exact k-NN search. + +1. **Reenable vector data structure creation**: Once indexing is complete, enable vector data structure creation by setting `index.knn.advanced.approximate_threshold` to `0`: + + ```json + PUT /test-index/_settings + { + "index.knn.advanced.approximate_threshold": "0" + } + ``` + {% include copy-curl.html %} + + If you do not reset the setting to `0` before the force merge, you will need to reindex your data. + {: .note} + +1. **Force merge segments into one segment**: Perform a force merge and specify `max_num_segments=1` to create the vector data structures only once: + + ```json + POST test-index/_forcemerge?max_num_segments=1 + ``` + {% include copy-curl.html %} + + After the force merge, new search requests will execute approximate k-NN search using the newly created data structures. + ## Search performance tuning Take the following steps to improve search performance: diff --git a/_search-plugins/ltr/advanced-functionality.md b/_search-plugins/ltr/advanced-functionality.md new file mode 100644 index 0000000000..50a7e6de19 --- /dev/null +++ b/_search-plugins/ltr/advanced-functionality.md @@ -0,0 +1,541 @@ +--- +layout: default +title: Advanced functionality +nav_order: 80 +parent: Learning to Rank +has_children: false +--- + +# Advanced functionality + +OpenSearch Learning to Rank (LTR) offers additional functionality. It is recommended that you have a foundational understanding of OpenSearch LTR before working with these features. + +## Reusable features + +[Building features]({{site.url}}{{site.baseurl}}/search-plugins/ltr/working-with-features/) involves uploading a list of features. To avoid repeating common features across multiple sets, you can maintain a library of reusable features. + +For example, if a title field query is frequently used in your feature sets, then you can create a reusable title query using the feature API: + +```json + POST _ltr/_feature/titleSearch + { + "feature": + { + "params": [ + "keywords" + ], + "template": { + "match": { + "title": "{{keywords}}" + } + } + } + } +``` +{% include copy-curl.html %} + +Normal CRUD operations apply, so you can delete a feature by using the following operation: + +```json +DELETE _ltr/_feature/titleSearch +``` +{% include copy-curl.html %} + + +To fetch an individual feature, you can use the following request: + +```json +GET _ltr/_feature/titleSearch +``` +{% include copy-curl.html %} + +To view a list of all features filtered by name prefix, you can use the following request: + +```json +GET /_ltr/_feature?prefix=t +``` +{% include copy-curl.html %} + +To create or update a feature set, you can refer to the `titleSearch` feature by using the following request: + +```json +POST /_ltr/_featureset/my_featureset/_addfeatures/titleSearch +``` +{% include copy-curl.html %} + +This adds the `titleSearch` feature to the next ordinal position within the `my_featureset` feature set. + +## Derived features + +Derived features are those that build upon other features. These can be expressed as [Lucene expressions](http://lucene.apache.org/core/7_1_0/expressions/index.html?org/apache/lucene/expressions/js/package-summary.html) and are identified by the `"template_language": "derived_expression"`. + +Additionally, derived features can accept query-time variables of type [`Number`](https://docs.oracle.com/javase/8/docs/api/java/lang/Number.html), as described in [Creating feature sets]({{site.url}}{{site.baseurl}}/search-plugins/ltr/working-with-features#creating-feature-sets). + +### Script features + +Script features are a type of [derived feature](#derived-features). These features have access to the `feature_vector`, but they are implemented as native or Painless OpenSearch scripts rather than as [Lucene +expressions](http://lucene.apache.org/core/7_1_0/expressions/index.html?org/apache/lucene/expressions/js/package-summary.html). + +To identify these features, set the `"template_language": "script_feature""`. The custom script can access the `feature_vector` through the [Java Map](https://docs.oracle.com/javase/8/docs/api/java/util/Map.html), as described in [Create a feature set]({{site.url}}{{site.baseurl}}/search-plugins/ltr/working-with-features#creating-feature-sets). + +Script-based features may impact the performance of your OpenSearch cluster, so it is best to avoid them if you require highly performant queries. +{: .warning} + +### Script feature parameters + +Script features are native or Painless scripts within the context of LTR. These script features can accept parameters as described in the [OpenSearch script documentation]({{site.url}}{{site.baseurl}}/api-reference/script-apis/index/). When working with LTR scripts, you can override parameter values and names. The priority for parameterization, in increasing order, is as follows: + +- The parameter name and value are passed directly to the source script, but not in the LTR script parameters. These cannot be configured at query time. +- The parameter name is passed to both the `sltr` query and the source script, allowing the script parameter values to be overridden at query time. +- The LTR script parameter name to native script parameter name indirection allows you to use different parameter names in your LTR feature definition than those in the underlying native script. This gives you flexibility in how you define and use scripts within the LTR context. + +For example, to set up a customizable way to rank movies in search results, considering both the title match and other adjustable factors, you can use the following request: + +```json +POST _ltr/_featureset/more_movie_features +{ + "featureset": { + "features": [ + { + "name": "title_query", + "params": [ + "keywords" + ], + "template_language": "mustache", + "template": { + "match": { + "title": "{{keywords}}" + } + } + }, + { + "name": "custom_title_query_boost", + "params": [ + "some_multiplier", + "ltr_param_foo" + ], + "template_language": "script_feature", + "template": { + "lang": "painless", + "source": "(long)params.default_param * params.feature_vector.get('title_query') * (long)params.some_multiplier * (long) params.param_foo", + "params": { + "default_param": 10, + "some_multiplier": "some_multiplier", + "extra_script_params": { + "ltr_param_foo": "param_foo" + } + } + } + } + ] + } +} +``` +{% include copy-curl.html %} + +## Multiple feature stores + +A feature store corresponds to an independent LTR system, including features, feature sets, and models backed by a single index and cache. A feature store typically represents a single search problem or application, like Wikipedia or Wiktionary. To use multiple feature stores in your OpenSearch cluster, you can create and manage them using the provided API. For example, you can create a feature set for the `wikipedia` feature store as follows: + +```json +PUT _ltr/wikipedia + +POST _ltr/wikipedia/_featureset/attempt_1 +{ + "featureset": { + "features": [ + { + "name": "title_query", + "params": [ + "keywords" + ], + "template_language": "mustache", + "template": { + "match": { + "title": "{{keywords}}" + } + } + } + ] + } +} +``` +{% include copy-curl.html %} + +When logging features, you can specify the feature store using the `store` parameter in the `sltr` section of your query, as shown in the following example structure. If you do not provide a `store` parameter, the default store is used to look up the feature set. + +```json +{ + "sltr": { + "_name": "logged_featureset", + "featureset": "attempt_1", + "store": "wikipedia", + "params": { + "keywords": "star" + } + } +} +``` +{% include copy-curl.html %} + +To delete the feature set, you can use the following operation: + +```json +DELETE _ltr/wikipedia/_featureset/attempt_1 +``` +{% include copy-curl.html %} + +## Model caching + +The Model Caching plugin uses an internal cache for compiled models. To force the models to be recompiled, you can clear the cache for a feature store: + +```json +POST /_ltr/_clearcache +``` +{% include copy-curl.html %} + +To get cluster-wide cache statistics for a specific store, use the following request: + +```json +GET /_ltr/_cachestats +``` +{% include copy-curl.html %} + +You can control the characteristics of the internal cache by using the following node settings: + +``` +# limit cache usage to 12 megabytes (defaults to 10mb or max_heap/10 if lower) ltr.caches.max_mem: 12mb +# Evict cache entries 10 minutes after insertion (defaults to 1hour, set to 0 to disable) ltr.caches.expire_after_write: 10m +# Evict cache entries 10 minutes after access (defaults to 1hour, set to 0 to disable) ltr.caches.expire_after_read: 10m +``` +{% include copy.html %} + +## Extra logging + +As described in [Logging features]({{site.url}}{{site.baseurl}}/search-plugins/ltr/logging-features/), you can use the logging extension to return feature values with each document. For native scripts, you can also return additional arbitrary information along with the logged features. + +For native scripts, the `extra_logging` parameter is injected into the script parameters. This parameter is a [`Supplier<Map<String,Object>>`](https://docs.oracle.com/javase/8/docs/api/java/util/function/Supplier.html), which provides a non-null `Map<String,Object>` only during the logging fetch phase. Any values you add to this map are returned alongside the logged features: + +```java +{ + @Override + public double runAsDouble() { + ... + Map<String,Object> extraLoggingMap = ((Supplier<Map<String,Object>>) getParams().get("extra_logging")).get(); + if (extraLoggingMap != null) { + extraLoggingMap.put("extra_float", 10.0f); + extraLoggingMap.put("extra_string", "additional_info"); + } + ... + } +} +``` +{% include copy-curl.html %} + +If the extra logging map is accessed, it is returned as an additional entry with the logged features. The format of the logged features, including the extra logging information, will appear similar to the following example: + +```json + { + "log_entry1": [ + { + "name": "title_query", + "value": 9.510193 + }, + { + "name": "body_query", + "value": 10.7808075 + }, + { + "name": "user_rating", + "value": 7.8 + }, + { + "name": "extra_logging", + "value": { + "extra_float": 10.0, + "extra_string": "additional_info" + } + } + ] +} +``` +{% include copy-curl.html %} + +## Feature score caching + +By default, the Feature Score Caching plugin calculates feature scores for both model inference and feature score logging. For example, if you write a query to rescore the top 100 documents and return the top 10 with feature scores, then the plugin calculates the feature scores of the top 100 documents for model inference and then calculates and logs the scores for the top 10 documents. + +The following query shows this behavior: + +```json +POST tmdb/_search +{ + "size": 10, + "query": { + "match": { + "_all": "rambo" + } + }, + "rescore": { + "window_size" : 100, + "query": { + "rescore_query": { + "sltr": { + "params": { + "keywords": "rambo" + }, + "model": "my_model" + } + } + } + }, + "ext": { + "ltr_log": { + "log_specs": { + "name": "log_entry1", + "rescore_index": 0 + } + } + } +} +``` +{% include copy-curl.html %} + +In some environments, it may be faster to cache the feature scores for model inference and reuse them for logging. To enable feature score caching, add the `cache: "true"` +flag to the `sltr` query that is the target of feature score logging, as shown in the following example: + +```json +{ + "sltr":{ + "cache":true, + "params":{ + "keywords":"rambo" + }, + "model":"my_model" + } +} +``` +{% include copy-curl.html %} + +## Stats + +You can use the Stats API to retrieve the plugin's overall status and statistics. To do this, send the following request: + +```json +GET /_ltr/_stats +``` +{% include copy-curl.html %} + +The response includes information about the cluster, configured stores, and cache statistics for various plugin components: + +```json +{ + "_nodes":{ + "total":1, + "successful":1, + "failed":0 + }, + "cluster_name":"es-cluster", + "stores":{ + "_default_":{ + "model_count":10, + "featureset_count":1, + "feature_count":0, + "status":"green" + } + }, + "status":"green", + "nodes":{ + "2QtMvxMvRoOTymAsoQbxhw":{ + "cache":{ + "feature":{ + "eviction_count":0, + "miss_count":0, + "hit_count":0, + "entry_count":0, + "memory_usage_in_bytes":0 + }, + "featureset":{ + "eviction_count":0, + "miss_count":0, + "hit_count":0, + "entry_count":0, + "memory_usage_in_bytes":0 + }, + "model":{ + "eviction_count":0, + "miss_count":0, + "hit_count":0, + "entry_count":0, + "memory_usage_in_bytes":0 + } + } + } + } +} +``` +{% include copy-curl.html %} + +You can use filters to retrieve a single statistic by sending the following request: + +```json +GET /_ltr/_stats/{stat} +``` +{% include copy-curl.html %} + +You can limit the information to a single node in the cluster by sending the following requests: + +```json +GET /_ltr/_stats/nodes/{nodeId} +GET /_ltr/_stats/{stat}/nodes/{nodeId} +``` +{% include copy-curl.html %} + +## TermStat query +Experimental +{: .label .label-red } + +The `TermStatQuery` is in an experimental stage, and the Domain-Specific Language (DSL) may change as the code advances. For stable term-statistic access, see [ExplorerQuery]{.title-ref}. + +The `TermStatQuery` is a reimagined version of the legacy `ExplorerQuery`. It provides a clearer way to specify terms and offers more flexibility for experimentation. This query surfaces the same data as the [ExplorerQuery]{.title-ref}, but it allows you to specify a custom Lucene expression to retrieve the desired data, such as in the following example: + +```json +POST tmdb/_search +{ + "query": { + "term_stat": { + "expr": "df", + "aggr": "max", + "terms": ["rambo", "rocky"], + "fields": ["title"] + } + } +} +``` +{% include copy-curl.html %} + +The `expr` parameter is used to specify a Lucene expression. This expression is run on a per-term basis. The expression can be a simple stat type or a custom formula with multiple stat types, such as `(tf * idf) / 2`. Available stat types in the Lucene expression context are listed in the following table. + +Type | Description +:---| :--- +`df` | The direct document frequency for a term. For example, if `rambo` occurs in three movie titles across multiple documents, then the value would be `3`. +`idf` | The inverse document frequency (IDF) calculation using the formula `log((NUM_DOCS+1)/(raw_df+1)) + 1`. +`tf` | The term frequency for a document. For example, if `rambo` occurs three times in a movie synopsis in the same document, then the value would be `3`. +`tp` | The term positions for a document. Multiple positions can be returned for a single term, so you should review the behavior of the `pos_aggr` parameter. +`ttf` | The total term frequency for a term across an index. For example, if `rambo` is mentioned a total of 100 times in the `overview` field across all documents, then the value would be `100`. + +The `aggr` parameter specifies the type of aggregation to be applied to the collected statistics from the `expr`. For example, if you specify the terms `rambo` and `rocky`, then the query gathers statistics for both terms. Because you can only return a single value, you need to decide which statistical calculation to use. The available aggregation types are `min`, `max`, `avg`, `sum`, and `stddev`. The query also provides the following counts: `matches` (the number of terms that matched in the current document) and `unique` (the unique number of terms that were passed in the query). + +The `terms` parameter specifies an array of terms for which you want to gather statistics. Only single terms are supported, with no support for phrases or span queries. If your field is tokenized, you can pass multiple terms in one string in the array. + +The `fields` parameter specifies the fields to check for the specified `terms`. If no `analyzer` is specified, then the configured `search_analyzer` for each field is used. + +The optional parameters are listed in the following table. + +Type | Description +:---| :--- +`analyzer` | If specified, this analyzer is used instead of the configured `search_analyzer` for each field. +`pos_aggr` | Because each term can have multiple positions, you can use this parameter to specify the aggregation to apply to the term positions. This supports the same values as the `aggr` parameter and defaults to `avg`. + +### Script injection + +Script injection provides the ability to inject term statistics into a scripting context. When working with `ScriptFeatures`, you can pass a `term_stat` object with the `terms`, `fields`, and `analyzer` parameters. An injected variable named `termStats` then provides access to the raw values in your custom script. This enables advanced feature engineering by giving you access to all the underlying data. + +To access the count of matched tokens, use [`params.matchCount.get`]{.title-ref}. To access the unique token count, use [`params.uniqueTerms`]{.title-ref}. + +You can either hardcode the `term_stat` parameter in your script definition or pass the parameter to be set at query time. For example, the following example query defines a feature set with a script feature that uses hardcoded `term_stat` parameters: + +```json +POST _ltr/_featureset/test +{ + "featureset": { + "features": [ + { + "name": "injection", + "template_language": "script_feature", + "template": { + "lang": "painless", + "source": "params.termStats['df'].size()", + "params": { + "term_stat": { + "analyzer": "!standard", + "terms": ["rambo rocky"], + "fields": ["overview"] + } + } + } + } + ] + } +} +``` +{% include copy-curl.html %} + +Analyzer names must be prefixed with a bang(!) when specifying them locally. Otherwise, they are treated as the parameter lookup value. +{: .note} + +To set parameter lookups, you can pass the name of the parameter from which you want to pull the value, as shown in the following example request: + +```json +POST _ltr/_featureset/test +{ + "featureset": { + "features": [ + { + "name": "injection", + "template_language": "script_feature", + "template": { + "lang": "painless", + "source": "params.termStats['df'].size()", + "params": { + "term_stat": { + "analyzer": "analyzerParam", + "terms": "termsParam", + "fields": "fieldsParam" + } + } + } + } + ] + } +} +``` +{% include copy-curl.html %} + +Alternatively, you can pass the `term_stat` parameters as query-time parameters, as shown in the following request: + +```json +POST tmdb/_search +{ + "query": { + "bool": { + "filter": [ + { + "terms": { + "_id": ["7555", "1370", "1369"] + } + }, + { + "sltr": { + "_name": "logged_featureset", + "featureset": "test", + "params": { + "analyzerParam": "standard", + "termsParam": ["troutman"], + "fieldsParam": ["overview"] + } + }} + ] + } + }, + "ext": { + "ltr_log": { + "log_specs": { + "name": "log_entry1", + "named_query": "logged_featureset" + } + } + } +} +``` +{% include copy-curl.html %} diff --git a/_search-plugins/ltr/core-concepts.md b/_search-plugins/ltr/core-concepts.md new file mode 100644 index 0000000000..4a7f73e4ce --- /dev/null +++ b/_search-plugins/ltr/core-concepts.md @@ -0,0 +1,141 @@ +--- +layout: default +title: ML ranking core concepts +nav_order: 10 +parent: Learning to Rank +has_children: false +--- + +# ML ranking core concepts + +This guide is intended for OpenSearch developers and data scientist who are interested in adding machine learning (ML) ranking capabilities to their OpenSearch system. + +## What is LTR? + +Learning to Rank (LTR) applies ML to search relevance ranking. This differs from other classic ML problems, such as the following: + +- **Regression:** The goal is to predict a variable, such as a stock price, as a function of known information, such as number of employees or revenue. The output is a direct prediction. +- **Classification:** The goal is to categorize an entity into predefined classes, for example, profitable or not profitable. The output is a category. + +The objective of LTR is not to make a direct prediction but rather to learn a function (`f`) that can rank documents in an order that best matches your perception of relevance for a given query. The output `f` does not represent a literal value but rather a prediction of the document's relative usefulness. + +For comprehensive information about LTR, see [How is Search Different From Other Machine Learning Problems?](http://opensourceconnections.com/blog/2017/08/03/search-as-machine-learning-prob/) and [What is Learning to Rank?](http://opensourceconnections.com/blog/2017/02/24/what-is-learning-to-rank/). + +## Defining the ideal ordering with judgment lists + +Judgment lists, also known as golden sets, provide a way to grade individual search results for a keyword search. These lists express the ideal ordering of search results based on your expectations. + +For example, using the [demo on GitHub](http://github.com/opensearch-project/opensearch-learning-to-rank-base/tree/main/demo/), in a search for `Rambo`, the judgment list may appear similar to the following: + +``` +grade,keywords,movie +4,Rambo,First Blood # Exactly Relevant +4,Rambo,Rambo +3,Rambo,Rambo III # Fairly Relevant +3,Rambo,Rambo First Blood Part II +2,Rambo,Rocky # Tangentially Relevant +2,Rambo,Cobra +0,Rambo,Bambi # Not even close... +0,Rambo,First Daughter +``` + +This judgment list establishes the ideal ordering of search results for the query `Rambo`. Metrics like [Normalized Discounted Cumulative Gain (NDCG)](https://en.wikipedia.org/wiki/Discounted_cumulative_gain) and [Expected Reciprocal Rank (ERR)](https://dl.acm.org/doi/abs/10.1145/1645953.1646033) can then be used to evaluate how closely the actual search results match this ideal ordering. + +The ranking function `f` aims to generate results closely aligned with the judgment list, maximizing quality metrics across various training queries. This ensures maximally useful search results. + +## Understanding features as building blocks of relevance + +The ranking function `f` uses input variables to arrive at a predicted output. For example, in stock price forecasting, input variables may encompass company-specific data like employee count and revenue. Likewise, in search relevance, the predictive model must leverage features that characterize the document, the query, and their associations, such as the [term frequency–inverse document frequency (TF–IDF)](https://en.wikipedia.org/wiki/Tf%E2%80%93idf) score of the query keywords in a field. + +Similarly, in the context of searching for movies, the ranking function must use relevant features to determine the most relevant results. These features may include: + +- Whether and to what degree the search keywords match the title field, such as `titleScore`. +- Whether and to what degree the search keywords match the description field, such as `descScore`. +- The movie's popularity, such as `popularity`. +- The movie's rating, such as `rating`. +- The number of keywords used during the search, such as `numKeywords*)`. + +The ranking function would become `f(titleScore, descScore, popularity, rating, numKeywords)`. The goal is to use the features in a way that maximizes the likelihood of the search results being useful. + +For example, in the `Rambo` use case, it seems intuitive that `titleScore` would be important. However, for the top movie _First Blood_, the keyword `Rambo` is likely only mentioned in the description. In this case, the `descScore` would become relevant. Additionally, the `popularity` and `rating` features could help differentiate between sequels and originals. If the existing features do not work for this purpose, then a new feature `isSequel` could be introduced. This new feature could then be used to make better ranking decisions. + +Selecting and experimenting with features is fundamental to LTR. Using features that fail to help predict patterns in the target variable can result in an unsatisfactory search experience, following the principle of "garbage in, garbage out" that applies to any ML problem. + +## Completing the training set by logging features + +When you have a set of defined features, the next step is to annotate the judgment list with each feature's values. These values are used when the training process begins. For example, consider the following judgment list: + +``` +grade,keywords,movie +4,Rambo,First Blood +4,Rambo,Rambo +3,Rambo,Rambo III +... +``` + +To complete the training set, add the following features: + +``` +grade,keywords,movie,titleScore,descScore,popularity,... +4,Rambo,First Blood,0.0,21.5,100,... +4,Rambo,Rambo,42.5,21.5,95,... +3,Rambo,Rambo III,53.1,40.1,50,... +``` + +The `titleScore` represents the relevance score of the `Rambo` keyword in the title field of the document, and so on. + +Many LTR models are familiar with a file format introduced by Support Vector Machine for Ranking (SVMRank), an early LTR method. In this format, queries are given IDs, and the actual document identifier can be removed from the training process. Features are labeled with ordinals starting at `1`. For the preceding example, the file format would be: + +``` +4 qid:1 1:0.0 2:21.5 3:100,... +4 qid:1 1:42.5 2:21.5 3:95,... +3 qid:1 1:53.1 2:40.1 3:50,... +... +``` + +In actual systems, you might log these values and then use them later to annotate a judgment list. In other cases, the judgment list might come from user analytics, so the feature values are logged as you interact with the search application. See [Logging features]({{site.url}}{{site.baseurl}}/search-plugins/ltr/logging-features/) for more information. + +## Training a ranking function + +The following are key considerations for training a ranking function: + +- **Ranking models:** Several models, such as the following, are available for training, each with pros and cons: + + - **Tree-based models** (for example, LambdaMART, MART, Random Forests) + - Generally the most accurate. + - Large and complex, making them expensive to train. + - Tools such as [RankLib](https://sourceforge.net/p/lemur/wiki/RankLib/) and [XGBoost](https://github.com/dmlc/xgboost) focus on tree-based models. + + - **SVM-based models (SVMRank)** + - Less accurate but less expensive to train. + - See [Support Vector Machine for Ranking](https://www.cs.cornell.edu/people/tj/svm_light/svm_rank.html) for more information. + + - **Linear models** + - Perform basic linear regression on the judgment list. + - Tend to not be useful outside of the examples. + - See [Learning to Rank 101 — Linear Models](http://opensourceconnections.com/blog/2017/04/01/learning-to-rank-linear-models/) for more information. + +- **Model selection:** The choice of model can depend not only on performance but also on your level of experience and familiarity with the different approaches. + +## Testing: Is the model any good? + +When testing the quality of the ranking model, consider the following: + +- **Judgment list limitations:** A judgment list cannot include every possible query that a model may encounter in the real world. It is important to test the model on a variety of queries in order to assess its ability to generalize beyond the training data. +- **Overfitting:** A model that is overfit to the training data does not perform well on new, unseen data. To avoid this, consider doing the following: + - Preserving some judgment lists as a _test set_ that is not used during the training process. + - Evaluating the model's performance on the test set, which reflects how it may perform in unfamiliar scenarios. + - Monitoring the _test NDCG_ metric, which should remain high as the model is trained. +- **Temporal generalization:** Even after deploying the model, you should continue testing the model's performance using more recent judgment lists to ensure that it does not become overfit to seasonal or temporal situations. + +## Real-world concerns + +The following are practical considerations for using the Learning to Rank plugin: + +- **Accurate judgment lists:** How can you create judgment lists that reflect your users' perception of search quality? +- **Measuring search quality:** What metrics should you use to determine whether the search results are useful to your users? +- **Data collection infrastructure:** What kind of infrastructure do you need in order to collect and log user behavior and feature data? +- **Model retraining:** How will you know when your model needs to be retrained? +- **A/B testing:** How will you compare your new model to your current search solution? What key performance indicators (KPIs) will you use to determine the success of your search system? + +See [How does the plugin fit in?]({{site.url}}{{site.baseurl}}/search-plugins/ltr/fits-in/) to learn more about how the Learning to Rank plugin's functionality fits into a complete LTR system. diff --git a/_search-plugins/ltr/faq.md b/_search-plugins/ltr/faq.md new file mode 100644 index 0000000000..14db276b3a --- /dev/null +++ b/_search-plugins/ltr/faq.md @@ -0,0 +1,23 @@ +--- +layout: default +title: Common issues +nav_order: 1000 +parent: Learning to Rank +has_children: false +--- + +# Common issues + +To make the most of Learning to Rank (LTR), consider these helpful insights. + +## Negative scores + +Lucene does not allow for negative query scores. This can be problematic if your raw features include negative values. To address this, confirm that your features are non-negative _before_ training your model. You can achieve this by creating normalized fields with values shifted by the minimum value or by passing the scores through a function that produces a value greater than or equal to `0`. + +## Bugs + +If you encounter a bug while working with the plugin, you can open an issue in the [opensearch-learning-to-rank-base repository](https://github.com/opensearch-project/opensearch-learning-to-rank-base/issues). The project team regularly investigates and resolves issues. If you are seeking general support, the issue may be closed and you may be directed to the relevant support channel(s). + +## Further assistance + +If you need further assistance, join the [Relevance Slack Community](https://opensourceconnections.com/slack) and participate in the #opensearch-learn-to-rank channel to receive guidance and support from the community. diff --git a/_search-plugins/ltr/feature-engineering.md b/_search-plugins/ltr/feature-engineering.md new file mode 100644 index 0000000000..a059dcf709 --- /dev/null +++ b/_search-plugins/ltr/feature-engineering.md @@ -0,0 +1,88 @@ +--- +layout: default +title: Feature engineering +nav_order: 40 +parent: Learning to Rank +has_children: false +--- + +# Feature engineering + +Common feature engineering tasks that you may encounter while developing a learning to rank (LTR) solution are described in the following sections. + +## Getting raw term statistics + +Many LTR solutions use raw term statistics in their training, such as the following: +- **Total term frequency (`raw_ttf`):** The total number of times that a term appears across an entire index. +- **Document frequency (`raw_df`):** The number of documents in which a term appears. +- **Term frequency (`raw_tf`):** The number of times that a term appears in a specific document. +- **Classic IDF (`classic_idf`):** The inverse document frequency (IDF) calculation `log((NUM_DOCS+1)/(raw_df+1)) + 1`. + +The Learning to Rank plugin provides a `match_explorer` query primitive that can extract these statistics for you, as shown in the following example: + +```json +POST tmdb/_search +{ + "query": { + "match_explorer": { + "type": "max_raw_df", + "query": { + "match": { + "title": "rambo rocky" + } + } + } + } +} +``` +{% include copy-curl.html %} + +The query returns the highest document frequency between the terms `rambo ` and `rocky`. + +You can use operations such as `max`, `min`, `sum`, and `stddev` with the statistics to get the information you need. + +### Term position statistics + +You can prepend the `type` with the desired operation (`min`, `max`, `avg`) to calculate the corresponding statistic across the term positions. If the terms are not present in the document, then the result will be `0`. + +The available statistics include the following: + +- `min_raw_tp` (minimum raw term position): This statistic finds the earliest position of any search term in the document. For example, with the query `dance monkey`, if `dance` occurs at positions [2, 5, 9] and `monkey` occurs at [1, 4], then the minimum is 1. +- `max_raw_tp` (maximum raw term position): This statistic finds the latest position of any search term in the document. Using the preceding example, the maximum is 9. +- `avg_raw_tp` (average raw term position): This statistic calculates the average term position for any of the query terms. Using the preceding example, the average for `dance` is 5.33 [(2+5+9)/3)] and the average for `monkey` is 2.5 [(1+4)/2], with an overall average of 3.91. +- `unique_terms_count`: Provides a count of the unique search terms in the query. + +## Document-specific features + +When working on an LTR solution, you may need to incorporate features that are specific to the document rather than to the relationship between the query and the document. These document-specific features can include metrics related to popularity or recency. + +The `function_score` query provides the functionality to extract these document-specific features. The following example query shows how you can use it to incorporate the `vote_average` field as a feature: + +```json +{ + "query": { + "function_score": { + "functions": [{ + "field_value_factor": { + "field": "vote_average", + "missing": 0 + } + }], + "query": { + "match_all": {} + } + } + } +} +``` +{% include copy-curl.html %} + +In the example, the score of the query is determined by the value of the `vote_average` field, which could be a measure of document popularity or quality. + +## Index drift + +When working with an index that is regularly updated, it is important to consider that the trends and patterns you observe may not remain constant over time. Your index can drift as user behavior, content, and other factors change. For example, on an e-commerce store, you may find that sandals are popular during summer months but become almost impossible to find in the winter. Similarly, the features that drive purchases or engagement during one time period may not be as important during another. + +## Next steps + +Learn about [logging feature scores]({{site.url}}{{site.baseurl}}/search-plugins/ltr/logging-features/). diff --git a/_search-plugins/ltr/fits-in.md b/_search-plugins/ltr/fits-in.md new file mode 100644 index 0000000000..30ca291b82 --- /dev/null +++ b/_search-plugins/ltr/fits-in.md @@ -0,0 +1,29 @@ +--- +layout: default +title: Scope of the plugin +nav_order: 20 +parent: Learning to Rank +has_children: false +--- + +# Scope of the plugin + +The Learning to Rank plugin for OpenSearch helps you develop and use machine learning (ML)-based ranking models for your application search operations. The following sections describe how the plugin fits into the overall LTR process. + +## What the plugin does + +The plugin provides the building blocks to develop and use LTR models, giving you the following capabilities: + +1. **Developing query-dependent features:** Create custom features that capture the relationship between a search query and a document. These features can be stored in OpenSearch. +2. **Logging feature values:** Record the feature values for documents returned in search results. Once you have logged the feature sets for your documents, you can combine this data with the judgment lists you have developed. This will give you a complete training set that you can use to test and train your ranking models. Tools such as RankLib or XGBoost can then be used to develop a satisfactory model. +3. **Deploying and using models:** Upload trained ranking models to the plugin and use them to rerank search results. The plugin offers a custom OpenSearch query domain-specific language (DSL) primitive that allows you to execute the model during the search process. + +## What the plugin does not do + +The plugin does not support the creation of judgment lists. This is a task you must handle yourself because it is domain specific. See the [Wikimedia Foundation blog](https://blog.wikimedia.org/2017/09/19/search-relevance-survey/) for an example approach to developing judgment lists for searching articles. Some domains, such as e-commerce, may focus more on conversion-related signals, while others may involve human relevance assessors (either internal experts or crowdsourced workers). + +The plugin does not handle model training or testing. This is an offline process that should be handled using the appropriate tools, such as [XGBoost](https://xgboost.ai/) and [RankLib](https://lemurproject.org/ranklib.php). The plugin integrates with these external model-building workflows. Training and testing ranking models can be a CPU-intensive task that requires data science expertise and offline testing. Most organizations prefer to have data scientists oversee the model development process rather than running it directly in their production environment. + +## Next steps + +Learn about [working with features]({{site.url}}{{site.baseurl}}/search-plugins/ltr/working-with-features/). diff --git a/_search-plugins/ltr/index.md b/_search-plugins/ltr/index.md new file mode 100644 index 0000000000..1c1f0eb0d2 --- /dev/null +++ b/_search-plugins/ltr/index.md @@ -0,0 +1,37 @@ +--- +layout: default +title: Learning to Rank +nav_order: 20 +has_children: true +has_toc: false +redirect_from: + - /search-plugins/ltr/ +--- + +# Learning to Rank + +The Learning to Rank plugin for OpenSearch enables you to use machine learning (ML) and behavioral data to fine-tune the relevance of documents. It uses models from the [XGBoost](https://xgboost.ai/) and [RankLib](https://lemurproject.org/ranklib.php) libraries. These models rescore the search results, considering query-dependent features such as click-through data or field matches, which can further improve relevance. + +The term _learning to rank_ is abbreviated as LTR throughout the OpenSearch documentation when the term is used in a general sense. For the plugin developer documentation, see [opensearch-learning-to-rank-base](https://github.com/opensearch-project/opensearch-learning-to-rank-base). +{: .note} + +## Getting started + +The following resources can help you get started: + +- If you are new to LTR, start with the [ML ranking core concepts]({{site.url}}{{site.baseurl}}/search-plugins/ltr/core-concepts/) documentation. +- For a quick introduction, see the demo in [hello-ltr](https://github.com/o19s/hello-ltr). +- If you are familiar with LTR, start with the [Integrating the plugin]({{site.url}}{{site.baseurl}}/search-plugins/ltr/fits-in/) documentation. + +## Installing the plugin + +Prebuilt versions of the plugin are available at [https://github.com/opensearch-project/opensearch-learning-to-rank-base/releases](https://github.com/opensearch-project/opensearch-learning-to-rank-base/releases). + +If you need a version that is compatible with your OpenSearch installation, follow the instructions in the [README](https://github.com/opensearch-project/opensearch-learning-to-rank-base#development) file or [create an issue](https://github.com/opensearch-project/opensearch-learning-to-rank-base/issues). + +Once you have an appropriate version, you can install the plugin using the command line shown in the following example: + +``` +./bin/opensearch-plugin install https://github.com/opensearch-project/opensearch-learning-to-rank-base/releases/download/ltr-plugin-v2.11.1-RC1/ltr-plugin-v2.11.1-RC1.zip +``` +{% include copy-curl.html %} diff --git a/_search-plugins/ltr/logging-features.md b/_search-plugins/ltr/logging-features.md new file mode 100644 index 0000000000..7922b8683d --- /dev/null +++ b/_search-plugins/ltr/logging-features.md @@ -0,0 +1,418 @@ +--- +layout: default +title: Logging feature scores +nav_order: 50 +parent: Learning to Rank +has_children: false +--- + +# Logging feature scores + +Feature values need to be logged in order to train a model. This is a crucial component of the Learning to Rank plugin---as you search, feature values from the feature sets are logged so that they can be used for training. This allows models that effectively predict relevance using that set of features to be discovered. + +## `sltr` query + +The `sltr` query is the primary method for running features and evaluating models. When logging, an `sltr` query is used to execute each feature query and retrieve the feature scores. A feature set structure that works with the [`hello-ltr`](https://github.com/o19s/hello-ltr) demo schema is shown in the following example request: + +```json +PUT _ltr/_featureset/more_movie_features +{ + "name": "more_movie_features", + "features": [ + { + "name": "body_query", + "params": [ + "keywords" + ], + "template": { + "match": { + "overview": "{% raw %}{{keywords}}{% endraw %}" + } + } + }, + { + "name": "title_query", + "params": [ + "keywords" + ], + "template": { + "match": { + "title": "{% raw %}{{keywords}}{% endraw %}" + } + } + } + ] +} +``` +{% include copy-curl.html %} + +## Common use cases + +Common use cases for logging feature sets are described in the following sections. + +### Joining feature values with a judgment list + +If the judgment list is already available, you can join feature values for each keyword/document pair to create a complete training set. For example, consider the following judgment list: + +``` +grade,keywords,docId +4,rambo,7555 +3,rambo,1370 +3,rambo,1369 +4,rocky,4241 +``` +{% include copy-curl.html %} + +The feature values need to be retrieved for all documents that have a judgment for each search term, one search term at a time. For example, starting with a `rambo` search, a filter can be created for the associated document as follows: + +```json +{ + "filter": [ + {"terms": { + "_id": ["7555", "1370", "1369"] + }} + ] +} +``` +{% include copy-curl.html %} + +The Learning to Rank plugin must point to the features to be logged. The `sltr` query, which is part of the plugin, can be used for this purpose. The `sltr` query has a `_name` (the named queries feature) used to reference it, refers to the previously created feature set `more_movie_features`, and passes the search keyword `rambo` and any other required parameters, as shown in the following example query: + +```json +{ + "sltr": { + "_name": "logged_featureset", + "featureset": "more_movie_features", + "params": { + "keywords": "rambo" + } + } +} +``` +{% include copy-curl.html %} + +[Searching with LTR]({{site.url}}{{site.baseurl}}/search-plugins/ltr/searching-with-your-model/) provides an `sltr` query to use for executing a model. This `sltr` query is used as a mechanism to direct the Learning to Rank plugin to the feature set requiring logging. +{: .note} + +To avoid influencing the score, the `sltr` query is injected as a filter, as shown in the following example: + +```json +{ + "query": { + "bool": { + "filter": [ + { + "terms": { + "_id": [ + "7555", + "1370", + "1369" + ] + } + }, + { + "sltr": { + "_name": "logged_featureset", + "featureset": "more_movie_features", + "params": { + "keywords": "rambo" + } + } + } + ] + } + } +} +``` +{% include copy-curl.html %} + +Executing this query returns the three expected hits. The next step is to enable feature logging to refer to the `sltr` query to be logged. + +The logging identifies the `sltr` query, runs the feature set's queries, scores each document, and returns those scores as computed fields for each document, as shown in the following example logging structure: + +```json +"ext": { + "ltr_log": { + "log_specs": { + "name": "log_entry1", + "named_query": "logged_featureset" + } + } +} +``` +{% include copy-curl.html %} + +The log extension supports the following arguments: + +- `name`: The name of the log entry to fetch from each document. +- `named_query`: The named query that corresponds to an `sltr` query. +- `rescore_index`: If the `sltr` query is in a rescore phase, then this is the index of the query in the rescore list. +- `missing_as_zero`: Produces a `0` for missing features (when the feature does not match). Default is `false`. + +To enable the log to locate an `sltr` query, either during the normal query phase or during rescoring, either `named_query` or `rescore_index` must be set. +{: .note} + +The full example request is as follows: + +```json +POST tmdb/_search +{ + "query": { + "bool": { + "filter": [ + { + "terms": { + "_id": ["7555", "1370", "1369"] + } + }, + { + "sltr": { + "_name": "logged_featureset", + "featureset": "more_movie_features", + "params": { + "keywords": "rambo" + } + }} + ] + } + }, + "ext": { + "ltr_log": { + "log_specs": { + "name": "log_entry1", + "named_query": "logged_featureset" + } + } + } +} +``` +{% include copy-curl.html %} + +Each document now contains a log entry, as shown in the following example: + +```json +{ + "_index": "tmdb", + "_type": "movie", + "_id": "1370", + "_score": 20.291, + "_source": { + ... + }, + "fields": { + "_ltrlog": [ + { + "log_entry1": [ + {"name": "title_query" + "value": 9.510193}, + {"name": "body_query + "value": 10.7808075} + ] + } + ] + }, + "matched_queries": [ + "logged_featureset" + ] +} +``` +{% include copy-curl.html %} + +The judgment list can be joined with the feature values to produce a training set. For the line corresponding to document `1370` with keyword `rambo`, the following can be added: + +``` +> 4 qid:1 1:9.510193 2:10.7808075 +``` +{% include copy-curl.html %} + +Repeat this process for all of your queries. + +For large judgment lists, it is recommended to batch the logs for multiple queries. You can use [multi-search]({{site.url}}{{site.baseurl}}/api-reference/multi-search/) capabilities for this purpose. +{: .note} + +### Logging values for a live feature set + +If you are running in production with a model being executed within an `sltr` query, a live model may appear similar to the following example request: + +```json +POST tmdb/_search +{ + "query": { + "match": { + "_all": "rambo" + } + }, + "rescore": { + "query": { + "rescore_query": { + "sltr": { + "params": { + "keywords": "rambo" + }, + "model": "my_model" + } + } + } + } +} +``` +{% include copy-curl.html %} + +See [Searching with LTR]({{site.url}}{{site.baseurl}}/search-plugins/ltr/searching-with-your-model/) for information about model execution. +{: .note} + +To log the feature values for the query, apply the appropriate logging spec to reference the `sltr` query, as shown in the following example: + +```json +"ext": { + "ltr_log": { + "log_specs": { + "name": "log_entry1", + "rescore_index": 0 + } + } +} +``` +{% include copy-curl.html %} + +The example logs the features in the response, enabling future model retraining using the same feature set. + +### Modifying and logging an existing feature set + +Feature sets can be expanded. For example, as shown in the following example request, if a new feature, such as `user_rating`, needs to be incorporated, it can be added to the existing feature set `more_movie_features`: + +``` json +PUT _ltr/_feature/user_rating/_addfeatures +{ + "features": [ + "name": "user_rating", + "params": [], + "template_language": "mustache", + "template" : { + "function_score": { + "functions": { + "field": "vote_average" + }, + "query": { + "match_all": {} + } + } + } + ] +} +``` +{% include copy-curl.html %} + +See [Working with features]({{site.url}}{{site.baseurl}}/search-plugins/ltr/working-with-features/) for more information. +{: .note} + +When logging is performed, the new feature is included in the output, as shown in the following example: + +``` json +{ + "log_entry1": [ + { + "name": "title_query", + "value": 9.510193 + }, + { + "name": "body_query", + "value": 10.7808075 + }, + { + "name": "user_rating", + "value": 7.8 + } + ] +} +``` +{% include copy-curl.html %} + +### Logging values for a proposed feature set + +You can create a completely new feature set for experimental purposes, for example, `other_movie_features`, as shown in the following example request: + +```json +PUT _ltr/_featureset/other_movie_features +{ + "name": "other_movie_features", + "features": [ + { + "name": "cast_query", + "params": [ + "keywords" + ], + "template": { + "match": { + "cast.name": "{% raw %}{{keywords}}{% endraw %}" + } + } + }, + { + "name": "genre_query", + "params": [ + "keywords" + ], + "template": { + "match": { + "genres.name": "{% raw %}{{keywords}}{% endraw %}" + } + } + } + ] +} +``` +{% include copy-curl.html %} + +The feature set, `other_movie_features`, can be logged alongside the live production set, `more_movie_features`, by appending it as another filter, as shown in the following example request: + +```json +POST tmdb/_search +{ +"query": { + "bool": { + "filter": [ + { "sltr": { + "_name": "logged_featureset", + "featureset": "other_movie_features", + "params": { + "keywords": "rambo" + } + }}, + {"match": { + "_all": "rambo" + }} + ] + } +}, +"rescore": { + "query": { + "rescore_query": { + "sltr": { + "params": { + "keywords": "rambo" + }, + "model": "my_model" + } + } + } +} +} +``` +{% include copy-curl.html %} + +You can continue adding as many feature sets as needed for logging. + +## Logging scenarios + +Once you have covered the basics, you can consider some real-life feature logging scenarios. + +First, logging is used to develop judgment lists from user analytics to capture the exact value of a feature at the precise time of interaction. For instance, you may want to know the recency, title score, and other values at the precise time of a user's interaction. This would help you analyze which features or factors had relevance while training. To achieve this, you can build a comprehensive feature set for future experimentation. + +Second, logging can be used to retrain a model in which you already have confidence. You may want to keep your models up to date with a shifting index because models can lose their effectiveness over time. You may have A/B testing in place or be monitoring business metrics and notice gradual degradation in model performance. + +Third, logging is used during model development. You may have a judgment list but want to iterate heavily with a local copy of OpenSearch. This allows for extensive experimentation with new features, adding and removing them from the feature sets as needed. While this process may result in being slightly out of sync with the live index, the goal is to arrive at a set of satisfactory model parameters. Once this is achieved, the model can be trained with production data to confirm that the level of performance remains acceptable. + +## Next steps + +Learn more about training models in the [Uploading a trained model]({{site.url}}{{site.baseurl}}/search-plugins/ltr/training-models/) documentation. diff --git a/_search-plugins/ltr/searching-with-your-model.md b/_search-plugins/ltr/searching-with-your-model.md new file mode 100644 index 0000000000..ca1ff87307 --- /dev/null +++ b/_search-plugins/ltr/searching-with-your-model.md @@ -0,0 +1,102 @@ +--- +layout: default +title: Optimizing search with LTR +nav_order: 70 +parent: Learning to Rank +has_children: false +--- + +# Optimizing search with LTR + +After you have trained a model, you can use the `sltr` query to execute it. However, directly running the query on the entire index is not recommended because it can be CPU intensive and impact the performance of your OpenSearch cluster. The query allows you to apply your trained model to search results, as shown in the following example: + +```json + POST tmdb/_search + { + "query": { + "sltr": { + "params": { + "keywords": "rambo" + }, + "model": "my_model" + } + } + } +``` +{% include copy-curl.html %} + +## Rescoring top N + +To execute your model more efficiently, you can use the built-in rescore functionality to apply your model to the top N results of a baseline relevance query, as shown in the following example query: + +```json + POST tmdb/_search + { + "query": { + "match": { + "_all": "rambo" + } + }, + "rescore": { + "window_size": 1000, + "query": { + "rescore_query": { + "sltr": { + "params": { + "keywords": "rambo" + }, + "model": "my_model" + } + } + } + } + } +``` +{% include copy-curl.html %} + +A `match` is first executed for the term `rambo` and then `my_model` is applied to the top 1,000 results. This baseline query is used to generate an initial set of results that are then scored using the default similarity BM25 probabilistic ranking framework to calculate relevance scores. + +## Rescoring a subset of features + +You can selectively score a subset of features by specifying the `active_features` in the `sltr` query, as shown in the following example. This allows you to focus the model's scoring on the selected features, while any unspecified features are marked as missing. You only need to specify the `params` relevant to the `active_features`. If you request a feature name that is not part of the assigned feature set, then the query throws an error. + +```json + POST tmdb/_search + { + "query": { + "match": { + "_all": "rambo" + } + }, + "rescore": { + "window_size": 1000, + "query": { + "rescore_query": { + "sltr": { + "params": { + "keywords": "rambo" + }, + "model": "my_model", + "active_features": ["title_query"] + } + } + } + } + } +``` +{% include copy-curl.html %} + +The `my_model` model is applied but only scores the `title_query` feature. + +## Combining `sltr` with other OpenSearch features + +The `sltr` query can be integrated with the following OpenSearch features and functionalities to create more sophisticated and tailored search solutions that go beyond applying a model to your results: + +- Filtering out results based on business rules using OpenSearch filters before applying the model +- Chaining multiple rescores to refine the relevance of your results +- Rescoring once to address relevance with `sltr` and a second time for business concerns +- Downboosting relevant but low-quality content in the baseline query to prevent it from being rescored + +## Next steps + +Learn about [advanced functionality]({{site.url}}{{site.baseurl}}/search-plugins/ltr/advanced-functionality/). diff --git a/_search-plugins/ltr/training-models.md b/_search-plugins/ltr/training-models.md new file mode 100644 index 0000000000..fb068cedd7 --- /dev/null +++ b/_search-plugins/ltr/training-models.md @@ -0,0 +1,335 @@ +--- +layout: default +title: Uploading trained models +nav_order: 60 +parent: Learning to Rank +has_children: false +--- + +# Uploading trained models + +While model training occurs outside of the Learning to Rank plugin, you can use the plugin for [logging feature scores]({{site.url}}{{site.baseurl}}/search-plugins/ltr/logging-features/). After you have trained a model, you can upload it to the plugin in the available serialization formats, such as RankLib and XGBoost. + +## RankLib model training + +The feature logging process generates a RankLib-comsumable judgment file. In the following judgment file, the query with ID 1 `rambo` includes the logged features 1 (a title `TF*IDF` +score) and 2 (a description `TF*IDF` score) for a set of documents: + +``` +4 qid:1 1:9.8376875 2:12.318446 # 7555 rambo +3 qid:1 1:10.7808075 2:9.510193 # 1370 rambo +3 qid:1 1:10.7808075 2:6.8449354 # 1369 rambo +3 qid:1 1:10.7808075 2:0.0 # 1368 rambo +``` + +The RankLib library can be called using the following command: + + ``` + cmd = "java -jar RankLib-2.8.jar -ranker %s -train%rs -save %s -frate 1.0" % (whichModel, judgmentsWithFeaturesFile, modelOutput) +``` + +The `judgmentsWithFeatureFile` is the input provided to RankLib for training. Additional parameters can be passed. See the [RankLib documentation](https://sourceforge.net/p/lemur/wiki/RankLib/) for more information. + +RankLib outputs the model in its own serialization format. As shown in the following example, a LambdaMART model is an ensemble of regression trees: + +``` +## LambdaMART +## No. of trees = 1000 +## No. of leaves = 10 +## No. of threshold candidates = 256 +## Learning rate = 0.1 +## Stop early = 100 + + <ensemble> + <tree id="1" weight="0.1"> + <split> + <feature> 2 </feature> + ... +``` + +Within the RankLib model, each tree in the ensemble examines feature values, makes decisions based on these feature values, and outputs the relevance scores. The features are referred to by their ordinal position, starting from 1, which corresponds to the 0th feature in the original feature set. RankLib does not use feature names during model training. + +### Other RankLib models + +RankLib is a library that implements several other model types in addition to LambdaMART, such as MART, +RankNet, RankBoost, AdaRank, Coordinate Ascent, ListNet, and Random Forests. Each of these models has its own set of parameters and training process. + +For example, the RankNet model is a neural network that learns to predict the probability of a document being more relevant than another document. The model is trained using a pairwise loss function that compares the predicted relevance of two documents with the actual relevance. The model is serialized in a format similar to the following example: + +``` +## RankNet +## Epochs = 100 +## No. of features = 5 +## No. of hidden layers = 1 +... +## Layer 1: 10 neurons +1 2 +1 +10 +0 0 -0.013491530393429608 0.031183180961270988 0.06558792020112071 -0.006024092627087733 0.05729619574181734 -0.0017010373987742411 0.07684848696852313 -0.06570387602230028 0.04390491141617467 0.013371636736099578 +... +``` + +All these models can be used with the Learning to Rank plugin, provided that the model is serialized in the RankLib format. + +## XGBoost model training + +Unlike the RankLib model, the XGBoost model is serialized in a format specific to gradient-boosted decision trees, as shown in the following example: + +```json + [ { "nodeid": 0, "depth": 0, "split": "tmdb_multi", "split_condition": 11.2009, "yes": 1, "no": 2, "missing": 1, "children": [ + { "nodeid": 1, "depth": 1, "split": "tmdb_title", "split_condition": 2.20631, "yes": 3, "no": 4, "missing": 3, "children": [ + { "nodeid": 3, "leaf": -0.03125 }, + ... +``` + +## XGBoost parameters + +Optional parameters can be specified for an XGBoost model. These parameters are specified as an object, with the decision trees specified in the `splits` field. The supported parameters include `objective`, which defines the model learning objective as described in the [XGBoost documentation](https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters). This parameter can transform the final model prediction. The supported values include `binary:logistic`, `binary:logitraw`, `rank:ndcg`, `rank:map`, `rank:pairwise`, `reg:linear`, and `reg:logistic`. + +## Simple linear models + +Machine learning (ML) models, such as Support Vector Machines (SVMs), output linear weights for each feature. The LTR model supports representing these linear weights in a simple format, such as those learned from an SVM or linear regression model. In the following example output, the weights indicate the relative importance of the features in the model's prediction: + +```json +{ + "title_query" : 0.3, + "body_query" : 0.5, + "recency" : 0.1 +} +``` + +## Feature normalization + +Feature normalization is used to convert feature values to a consistent range, typically between 0 and 1 or -1 and 1. This is done during the training phase to better understand the relative impact of each feature. Some models, especially linear ones such as SVMRank, rely on normalization to function correctly. + +## Model upload process + +After training your model, the next step is to make it available for search operations. This involves uploading the model to the Learning to Rank plugin. When uploading a model, you must provide the following information: + +- Feature set used during training +- Model type, for example, RankLib or XGBoost +- Model content + +The following example request shows how to upload a RankLib model that was trained using the `more_movie_features` feature set: + +```json + POST _ltr/_featureset/more_movie_features/_createmodel + { + "model": { + "name": "my_ranklib_model", + "model": { + "type": "model/ranklib", + "definition": "## LambdaMART\n + ## No. of trees = 1000 + ## No. of leaves = 10 + ## No. of threshold candidates = 256 + ## Learning rate = 0.1 + ## Stop early = 100 + + <ensemble> + <tree id="1" weight="0.1"> + <split> + <feature> 2 </feature> + ... + " + } + } + } +``` + +The following example request shows how to upload an XGBoost model that was trained using the `more_movie_features` feature set: + +```json + POST _ltr/_featureset/more_movie_features/_createmodel + { + "model": { + "name": "my_xgboost_model", + "model": { + "type": "model/xgboost+json", + "definition": "[ { \"nodeid\": 0, \"depth\": 0, \"split\": \"tmdb_multi\", \"split_condition\": 11.2009, \"yes\": 1, \"no\": 2, \"missing\": 1, \"children\": [ + { \"nodeid\": 1, \"depth\": 1, \"split\": \"tmdb_title\", \"split_condition\": 2.20631, \"yes\": 3, \"no\": 4, \"missing\": 3, \"children\": [ + { \"nodeid\": 3, \"leaf\": -0.03125 }, + ..." + } + } + } +``` + +The following example request shows how to upload an XGBoost model that was trained using the `more_movie_features` feature set with parameters: + +```json + POST _ltr/_featureset/more_movie_features/_createmodel + { + "model": { + "name": "my_xgboost_model", + "model": { + "type": "model/xgboost+json", + "definition": "{ + \"objective\": \"reg:logistic\", + \"splits\": [ { \"nodeid\": 0, \"depth\": 0, \"split\": \"tmdb_multi\", \"split_condition\": 11.2009, \"yes\": 1, \"no\": 2, \"missing\": 1, \"children\": [ + { \"nodeid\": 1, \"depth\": 1, \"split\": \"tmdb_title\", \"split_condition\": 2.20631, \"yes\": 3, \"no\": 4, \"missing\": 3, \"children\": [ + { \"nodeid\": 3, \"leaf\": -0.03125 }, + ... + ] + }" + } + } + } +```` + +The following example request shows how to upload a simple linear model that was trained using the `more_movie_features` feature set: + +```json + POST _ltr/_featureset/more_movie_features/_createmodel + { + "model": { + "name": "my_linear_model", + "model": { + "type": "model/linear", + "definition": """ + { + "title_query" : 0.3, + "body_query" : 0.5, + "recency" : 0.1 + } + """ + } + } + } +``` + +## Creating a model with feature normalization + +Feature normalization is a crucial preprocessing step that can be applied before model evaluation. LTR supports two types of feature normalization: min-max and standard normalization. + +### Standard normalization + +Standard normalization transforms features as follows: + +- Maps the mean value to 0 +- Maps one standard deviation above the mean to 1 +- Maps one standard deviation below the mean to -1 + +The following example request shows how to create a model with standard feature normalization: + +```json + POST _ltr/_featureset/more_movie_features/_createmodel + { + "model": { + "name": "my_linear_model", + "model": { + "type": "model/linear", + "feature_normalizers": { + "release_year": { + "standard": { + "mean": 1970, + "standard_deviation": 30 + } + } + }, + "definition": """ + { + "release_year" : 0.3, + "body_query" : 0.5, + "recency" : 0.1 + } + """ + } + } + } +``` + +### Min-max normalization + +Min-max normalization scales features to a fixed range, typically between 0 and 1. Min-max normalization transforms features as follows: + +- Maps the specified minimum value to 0 +- Maps the specified maximum value to 1 +- Scales the values between 0 and 1 linearly + +The following example request shows how to implement min-max normalization: + +```json + "feature_normalizers": { + "vote_average": { + "min_max": { + "minimum": 0, + "maximum": 10 + } + } + } +``` + +## Model independence from feature sets + +Models are initially created with reference to a feature set. After their creation, they exist as independent top-level entities. + +### Accessing models + +To retrieve a model, use a GET request: + +``` +GET _ltr/_model/my_linear_model +``` + +To delete a model, use a DELETE request: + +``` +DELETE _ltr/_model/my_linear_model +``` + +Model names must be globally unique across all feature sets. +{: .note} + +### Model persistence + +When a model is created, its features are copied. This prevents changes to the original features from affecting existing models or model production. For example, if the feature set used to create the model is deleted, you can still access and use the model. + +### Model response + +When retrieving a model, you receive a response that includes the features used to create it, as shown in the following example: + +```json + { + "_index": ".ltrstore", + "_type": "store", + "_id": "model-my_linear_model", + "_version": 1, + "found": true, + "_source": { + "name": "my_linear_model", + "type": "model", + "model": { + "name": "my_linear_model", + "feature_set": { + "name": "more_movie_features", + "features": [ + { + "name": "body_query", + "params": [ + "keywords" + ], + "template": { + "match": { + "overview": "{{keywords}}" + } + } + }, + { + "name": "title_query", + "params": [ + "keywords" + ], + "template": { + "match": { + "title": "{{keywords}}" + } + } + } + ]}}} +``` + +## Next steps + +Learn about [searching with LTR]({{site.url}}{{site.baseurl}}/search-plugins/ltr/searching-with-your-model/). diff --git a/_search-plugins/ltr/working-with-features.md b/_search-plugins/ltr/working-with-features.md new file mode 100644 index 0000000000..00ebd908d7 --- /dev/null +++ b/_search-plugins/ltr/working-with-features.md @@ -0,0 +1,270 @@ +--- +layout: default +title: Working with features +nav_order: 30 +parent: Learning to Rank +has_children: false +--- + +# Working with features + +The following sections describe the specific functionality provided by the Learning to Rank plugin. This information will help you build and upload features for your learning to rank (LTR) system. See [ML ranking core concepts]({{site.url}}{{site.baseurl}}/search-plugins/ltr/core-concepts/) and [Scope of the plugin]({{site.url}}{{site.baseurl}}/search-plugins/ltr/fits-in/) for more information about the Learning to Rank plugin's roles and functionality. + +## Understanding the role of features in the Learning to Rank plugin + +The Learning to Rank plugin defines a _feature_ as an _OpenSearch query_. When you execute an OpenSearch query using your search terms and other relevant parameters, the resulting score is the value that can be used in your training data. For example, a feature may include basic `match` queries on fields such as `title`: + +```json +{ + "query": { + "match": { + "title": "{% raw %}{{keywords}}{% endraw %}" + } + } +} +``` +{% include copy-curl.html %} + +In addition to simple query-based features, you can also use document properties, such as `popularity`, as features. For example, you can use a function score query to get the average movie rating: + +```json +{ + "query": { + "function_score": { + "functions": { + "field": "vote_average" + }, + "query": { + "match_all": {} + } + } + } +} +``` +{% include copy-curl.html %} + +Another example is a query based on location, such as a geodistance filter: + +```json +{ + "query": { + "bool" : { + "must" : { + "match_all" : {} + }, + "filter" : { + "geo_distance" : { + "distance" : "200km", + "pin.location" : { + "lat" : "{% raw %}{{users_lat}}{% endraw %}", + "lon" : "{% raw %}{{users_lon}}{% endraw %}" + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +These types of queries are the building blocks that the ranking `f` function you are training combines mathematically to determine a relevance score. + +## Using Mustache templates in LTR queries + +The features in LTR queries use Mustache templates. This allows you to insert variables into your search queries. For example, you could have a query that uses `{% raw %}{{keywords}}{% endraw %}` to insert your search terms. Or you could use `{% raw %}{{users_lat}}{% endraw %}` and `{% raw %}{{users_lon}}{% endraw %}` to include the location. This gives you the flexibility to personalize your search. + +## Uploading and naming features + +The Learning to Rank plugin enables you to create and modify features. After you define your features, you can log them for use in model training. By combining the logged feature data with your judgment list, you can train a model. Once the model is ready, you can upload it and then apply it to your search queries. + +## Initializing the default feature store + +The Learning to Rank plugin uses a feature store to store metadata about your features and models. Typically, there is one feature store per major search implementation, for example, [Wikipedia](http://wikipedia.org) as compared to [Wikitravel](http://wikitravel.org). + +For most uses cases, you can use the default feature store and avoid managing multiple feature stores. To initialize the default feature store, run the following request: + +``` +PUT _ltr +``` +{% include copy-curl.html %} + +If you need to start again from the beginning, you can delete the default feature store by using the following operation: + +``` +DELETE _ltr +``` +{% include copy-curl.html %} + +Deleting the feature store removes all existing feature and model data. +{: .warning} + +The default feature store is used throughout the rest of this guide. + +## Working with features and feature sets + +A _feature set_ is a collection of features that have been grouped together. You can use feature sets to log multiple feature values for offline training. When creating a new model, you copy the relevant feature set into the model definition. + +## Creating feature sets + +To create a feature set, you can send a POST request. When creating the feature set, you provide a name and an optional list of features, as shown in the following example request: + +```json +POST _ltr/_featureset/more_movie_features +{ + "featureset": { + "features": [ + { + "name": "title_query", + "params": [ + "keywords" + ], + "template_language": "mustache", + "template": { + "match": { + "title": "{% raw %}{{keywords}}{% endraw %}" + } + } + }, + { + "name": "title_query_boost", + "params": [ + "some_multiplier" + ], + "template_language": "derived_expression", + "template": "title_query * some_multiplier" + }, + { + "name": "custom_title_query_boost", + "params": [ + "some_multiplier" + ], + "template_language": "script_feature", + "template": { + "lang": "painless", + "source": "params.feature_vector.get('title_query') * (long)params.some_multiplier", + "params": { + "some_multiplier": "some_multiplier" + } + } + } + ] + } +} +``` +{% include copy-curl.html %} + +## Managing feature sets + +To fetch a specific feature set, you can use the following request: + +``` +GET _ltr/_featureset/more_movie_features +``` +{% include copy-curl.html %} + +To see a list of all defined feature sets, you can use the following request: + +``` +GET _ltr/_featureset +``` +{% include copy-curl.html %} + +If you have many feature sets, you can filter the list by using a prefix, as shown in the following example request: + +``` +GET _ltr/_featureset?prefix=mor +``` +{% include copy-curl.html %} + +This returns only the feature sets with names starting with `mor`. + +If you need to start over, you can delete a feature set using the following request: + +``` +DELETE _ltr/_featureset/more_movie_features +``` +{% include copy-curl.html %} + +## Validating features + +When adding new features, you should validate that the features work as expected. You can do this by adding a `validation` block in your feature creation request. This allows the Learning to Rank plugin to run the query before adding the feature, catching any issues early. If you do not run this validation, you may not discover until later that the query, while valid JSON, contains a malformed OpenSearch query. + +To run validation, you can specify the test parameters and the index to use, as shown in the following example validation block: + +```json +"validation": { + "params": { + "keywords": "rambo" + }, + "index": "tmdb" +}, +``` +{% include copy-curl.html %} + +Place the validation block alongside your feature set definition. In the following example, the `match` query is malformed (curly brackets are missing in the Mustache template). The validation fails, returning an error: + +```json +{ + "validation": { + "params": { + "keywords": "rambo" + }, + "index": "tmdb" + }, + "featureset": { + "features": [ + { + "name": "title_query", + "params": [ + "keywords" + ], + "template_language": "mustache", + "template": { + "match": { + "title": "{% raw %}{{keywords{% endraw %}" + } + } + } + ] + } +} +``` +{% include copy-curl.html %} + +## Expanding feature sets + +You may not initially know which features are the most useful. In these cases, you can later add new features to an existing feature set for logging and model evaluation. For example, if you want to create a `user_rating` feature, you can use the Feature Set Append API, as shown in the following example request: + +```json +POST /_ltr/_featureset/my_featureset/_addfeatures +{ + "features": [{ + "name": "user_rating", + "params": [], + "template_language": "mustache", + "template" : { + "function_score": { + "functions": { + "field": "vote_average" + }, + "query": { + "match_all": {} + } + } + } + }] +} +``` +{% include copy-curl.html %} + +## Enforcing unique feature names + +The Learning to Rank plugin enforces unique names for each feature. This is because some model training libraries refer to features by name. In the preceding example, you could not add a new `user_rating` feature without causing an error because that feature name is already in use. + +## Treating feature sets as lists + +Feature sets are more like ordered lists than simple sets. Each feature has both a name and an ordinal position. Some LTR training applications, such as RankLib, refer to features by their ordinal position (for example, 1st feature, 2nd feature). Others may use the feature name. When working with logged features, you may need to handle both the ordinal and the name because the ordinal is preserved to maintain the list order. + +## Next steps + +Learn about [feature engineering]({{site.url}}{{site.baseurl}}/search-plugins/ltr/feature-engineering/) and [advanced functionality]({{site.url}}{{site.baseurl}}/search-plugins/ltr/advanced-functionality/). diff --git a/_search-plugins/neural-search-tutorial.md b/_search-plugins/neural-search-tutorial.md index 5f5a5fe79e..9c1b224cb8 100644 --- a/_search-plugins/neural-search-tutorial.md +++ b/_search-plugins/neural-search-tutorial.md @@ -55,13 +55,9 @@ For this simple setup, you'll use an OpenSearch-provided machine learning (ML) m PUT _cluster/settings { "persistent": { - "plugins": { - "ml_commons": { - "only_run_on_ml_node": "false", - "model_access_control_enabled": "true", - "native_memory_threshold": "99" - } - } + "plugins.ml_commons.only_run_on_ml_node": "false", + "plugins.ml_commons.model_access_control_enabled": "true", + "plugins.ml_commons.native_memory_threshold": "99" } } ``` diff --git a/_search-plugins/search-pipelines/ml-inference-search-response.md b/_search-plugins/search-pipelines/ml-inference-search-response.md index efd24e13c8..b0573d17be 100644 --- a/_search-plugins/search-pipelines/ml-inference-search-response.md +++ b/_search-plugins/search-pipelines/ml-inference-search-response.md @@ -96,7 +96,188 @@ For local models, you must provide a `model_input` field that specifies the mode For remote models, the `model_input` field is optional, and its default value is `"{ \"parameters\": ${ml_inference.parameters} }`. -### Example: Externally hosted model +### Example: Local model + +The following example shows you how to configure an `ml_inference` search response processor with a local model. + +**Step 1: Create a pipeline** + +The following example shows you how to create a search pipeline for the `huggingface/sentence-transformers/all-distilroberta-v1` local model. The model is a [pretrained sentence transformer model]({{site.url}}{{site.baseurl}}/ml-commons-plugin/pretrained-models/#sentence-transformers) hosted in your OpenSearch cluster. + +If you invoke the model using the Predict API, then the request appears as follows: + +```json +POST /_plugins/_ml/_predict/text_embedding/cleMb4kBJ1eYAeTMFFg4 +{ + "text_docs":[ "today is sunny"], + "return_number": true, + "target_response": ["sentence_embedding"] +} +``` + +Using this schema, specify the `model_input` as follows: + +```json + "model_input": "{ \"text_docs\": ${input_map.text_docs}, \"return_number\": ${model_config.return_number}, \"target_response\": ${model_config.target_response} }" +``` + +In the `input_map`, map the `passage_text` document field to the `text_docs` field expected by the model: + +```json +"input_map": [ + { + "text_docs": "passage_text" + } +] +``` + +Because you specified the field to be converted into embeddings as a JSON path, you need to set the `full_response_path` to `true`. Then the full JSON document is parsed in order to obtain the input field: + +```json +"full_response_path": true +``` + +The text in the `passage_text` field will be used to generate embeddings: + +```json +{ + "passage_text": "hello world" +} +``` + +The Predict API request returns the following response: + +```json +{ + "inference_results" : [ + { + "output" : [ + { + "name" : "sentence_embedding", + "data_type" : "FLOAT32", + "shape" : [ + 768 + ], + "data" : [ + 0.25517133, + -0.28009856, + 0.48519906, + ... + ] + } + ] + } + ] +} +``` + +The model generates embeddings in the `$.inference_results.*.output.*.data` field. The `output_map` maps this field to the newly created `passage_embedding` field in the search response document: + +```json +"output_map": [ + { + "passage_embedding": "$.inference_results.*.output.*.data" + } +] +``` + +To configure an `ml_inference` search response processor with a local model, specify the `function_name` explicitly. In this example, the `function_name` is `text_embedding`. For information about valid `function_name` values, see [Request fields](#request-body-fields). + +The following is the final configuration of the `ml_inference` search response processor with the local model: + +```json +PUT /_search/pipeline/ml_inference_pipeline_local +{ + "description": "search passage and generates embeddings", + "processors": [ + { + "ml_inference": { + "function_name": "text_embedding", + "full_response_path": true, + "model_id": "<your model id>", + "model_config": { + "return_number": true, + "target_response": ["sentence_embedding"] + }, + "model_input": "{ \"text_docs\": ${input_map.text_docs}, \"return_number\": ${model_config.return_number}, \"target_response\": ${model_config.target_response} }", + "input_map": [ + { + "text_docs": "passage_text" + } + ], + "output_map": [ + { + "passage_embedding": "$.inference_results.*.output.*.data" + } + ], + "ignore_missing": true, + "ignore_failure": true + } + } + ] +} +``` +{% include copy-curl.html %} + +**Step 2: Run the pipeline** + +Run the following query, providing the pipeline name in the request: + +```json +GET /my_index/_search?search_pipeline=ml_inference_pipeline_local +{ +"query": { + "term": { + "passage_text": { + "value": "hello" + } + } + } +} +``` +{% include copy-curl.html %} + +#### Response + +The response confirms that the processor has generated text embeddings in the `passage_embedding` field: + +```json +{ + "took": 288, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 0.00009405752, + "hits": [ + { + "_index": "my_index", + "_id": "1", + "_score": 0.00009405752, + "_source": { + "passage_text": "hello world", + "passage_embedding": [ + 0.017304314, + -0.021530833, + 0.050184276, + 0.08962978, + ...] + } + } + ] + } +} +``` + +### Example: Externally hosted text embedding model The following example shows you how to configure an `ml_inference` search response processor with an externally hosted model. @@ -172,7 +353,6 @@ GET /my_index/_search?search_pipeline=ml_inference_pipeline_local The response confirms that the processor has generated text embeddings in the `passage_embedding` field. The document within `_source` now contains both the `passage_text` and `passage_embedding` fields: ```json - { "took": 288, "timed_out": false, @@ -209,140 +389,312 @@ The response confirms that the processor has generated text embeddings in the `p } ``` -### Example: Local model +### Example: Externally hosted large language model -The following example shows you how to configure an `ml_inference` search response processor with a local model. +This example demonstrates how to configure an `ml_inference` search response processor to work with an externally hosted large language model (LLM) and map the model's response to the search extension object. Using the `ml_inference` processor, you can enable an LLM to summarize search results directly within the response. The summary is included in the `ext` field of the search response, providing seamless access to AI-generated insights alongside the original search results. -**Step 1: Create a pipeline** +**Prerequisite** -The following example shows you how to create a search pipeline for the `huggingface/sentence-transformers/all-distilroberta-v1` local model. The model is a [pretrained sentence transformer model]({{site.url}}{{site.baseurl}}/ml-commons-plugin/pretrained-models/#sentence-transformers) hosted in your OpenSearch cluster. +You must configure an externally hosted LLM for this use case. For more information about externally hosted models, see [Connecting to externally hosted models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/index/). Once you register the LLM, you can use the following request to test it. This request requires providing the `prompt` and `context` fields: -If you invoke the model using the Predict API, then the request appears as follows: +```json +POST /_plugins/_ml/models/KKne6JIBAs32TwoK-FFR/_predict +{ + "parameters": { + "prompt":"\n\nHuman: You are a professional data analysist. You will always answer question: Which month had the lowest customer acquisition cost per new customer? based on the given context first. If the answer is not directly shown in the context, you will analyze the data and find the answer. If you don't know the answer, just say I don't know. Context: ${parameters.context.toString()}. \n\n Assistant:", + "context":"Customer acquisition cost: January: $50, February: $45, March: $40. New customers: January: 500, February: 600, March: 750" + } +} +``` +{% include copy-curl.html %} + +The response contains the model output in the `inference_results` field: ```json -POST /_plugins/_ml/_predict/text_embedding/cleMb4kBJ1eYAeTMFFg4 { - "text_docs":[ "today is sunny"], - "return_number": true, - "target_response": ["sentence_embedding"] + "inference_results": [ + { + "output": [ + { + "name": "response", + "dataAsMap": { + "response": """ Based on the data provided: + + - Customer acquisition cost in January was $50 and new customers were 500. So cost per new customer was $50/500 = $0.10 + - Customer acquisition cost in February was $45 and new customers were 600. So cost per new customer was $45/600 = $0.075 + - Customer acquisition cost in March was $40 and new customers were 750. So cost per new customer was $40/750 = $0.053 + + Therefore, the month with the lowest customer acquisition cost per new customer was March, at $0.053.""" + } + } + ], + "status_code": 200 + } + ] } ``` -Using this schema, specify the `model_input` as follows: +**Step 1: Create a pipeline** + +Create a search pipeline for the registered model. The model requires a `context` field as input. The model response summarizes the text in the `review` field and stores the summary in the `ext.ml_inference.llm_response` field of the search response: ```json - "model_input": "{ \"text_docs\": ${input_map.text_docs}, \"return_number\": ${model_config.return_number}, \"target_response\": ${model_config.target_response} }" +PUT /_search/pipeline/my_pipeline_request_review_llm +{ + "response_processors": [ + { + "ml_inference": { + "tag": "ml_inference", + "description": "This processor is going to run llm", + "model_id": "EOF6wJIBtDGAJRTD4kNg", + "function_name": "REMOTE", + "input_map": [ + { + "context": "review" + } + ], + "output_map": [ + { + "ext.ml_inference.llm_response": "response" + } + ], + "model_config": { + "prompt": "\n\nHuman: You are a professional data analysist. You will always answer question: Which month had the lowest customer acquisition cost per new customer? based on the given context first. If the answer is not directly shown in the context, you will analyze the data and find the answer. If you don't know the answer, just say I don't know. Context: ${parameters.context.toString()}. \n\n Assistant:" + }, + "ignore_missing": false, + "ignore_failure": false + } + } + ] +} ``` +{% include copy-curl.html %} -In the `input_map`, map the `passage_text` document field to the `text_docs` field expected by the model: +In this configuration, you've provided the following parameters: + +- The `model_id` parameter specifies the ID of the generative AI model. +- The `function_name` parameter is set to `REMOTE`, indicating that the model is hosted externally. +- The `input_map` parameter maps the review field from the document to the context field expected by the model. +- The `output_map` parameter specifies that the model's response should be stored in `ext.ml_inference.llm_response` in the search response. +- The `model_config` parameter includes a prompt that tells the model how to process the input and generate a summary. + +**Step 2: Index sample documents** + +Index some sample documents to test the pipeline: ```json -"input_map": [ - { - "text_docs": "passage_text" +POST /_bulk +{"index":{"_index":"review_string_index","_id":"1"}} +{"review":"Customer acquisition cost: January: $50, New customers: January: 500."} +{"index":{"_index":"review_string_index","_id":"2"}} +{"review":"Customer acquisition cost: February: $45, New customers: February: 600."} +{"index":{"_index":"review_string_index","_id":"3"}} +{"review":"Customer acquisition cost: March: $40, New customers: March: 750."} +``` +{% include copy-curl.html %} + +**Step 3: Run the pipeline** + +Run a search query using the pipeline: + +```json +GET /review_string_index/_search?search_pipeline=my_pipeline_request_review_llm +{ + "query": { + "match_all": {} } -] +} ``` +{% include copy-curl.html %} -Because you specified the field to be converted into embeddings as a JSON path, you need to set the `full_response_path` to `true`. Then the full JSON document is parsed in order to obtain the input field: +The response includes the original documents and the generated summary in the `ext.ml_inference.llm_response` field: ```json -"full_response_path": true +{ + "took": 1, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 3, + "relation": "eq" + }, + "max_score": 1, + "hits": [ + { + "_index": "review_string_index", + "_id": "1", + "_score": 1, + "_source": { + "review": "Customer acquisition cost: January: $50, New customers: January: 500." + } + }, + { + "_index": "review_string_index", + "_id": "2", + "_score": 1, + "_source": { + "review": "Customer acquisition cost: February: $45, New customers: February: 600." + } + }, + { + "_index": "review_string_index", + "_id": "3", + "_score": 1, + "_source": { + "review": "Customer acquisition cost: March: $40, New customers: March: 750." + } + } + ] + }, + "ext": { + "ml_inference": { + "llm_response": """ Based on the context provided: + + - Customer acquisition cost in January was $50 and new customers were 500. So the cost per new customer was $50/500 = $0.10 + + - Customer acquisition cost in February was $45 and new customers were 600. So the cost per new customer was $45/600 = $0.075 + + - Customer acquisition cost in March was $40 and new customers were 750. So the cost per new customer was $40/750 = $0.053 + + Therefore, the month with the lowest customer acquisition cost per new customer was March, as it had the lowest cost per customer of $0.053.""" + } + } +} ``` -The text in the `passage_text` field will be used to generate embeddings: +### Example: Reranking search results using a text similarity model + +The following example shows you how to configure an `ml_inference` search response processor with a text similarity model. + +**Prerequisite** + +You must configure an externally hosted text similarity model for this use case. For more information about externally hosted models, see [Connecting to externally hosted models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/index/). Once you register the text similarity model, you can use the following request to test it. This request requires that you provide the `text` and `text_pair` fields within the `inputs` field: ```json +POST /_plugins/_ml/models/Ialx65IBAs32TwoK1lXf/_predict { - "passage_text": "hello world" + "parameters": { + "inputs": + { + "text": "I like you", + "text_pair": "I hate you" + } + } } ``` +{% include copy-curl.html %} -The Predict API request returns the following response: +The model returns similarity scores for each input document: ```json { - "inference_results" : [ + "inference_results": [ { - "output" : [ + "output": [ { - "name" : "sentence_embedding", - "data_type" : "FLOAT32", - "shape" : [ - 768 - ], - "data" : [ - 0.25517133, - -0.28009856, - 0.48519906, - ... - ] + "name": "response", + "dataAsMap": { + "label": "LABEL_0", + "score": 0.022704314440488815 + } } - ] + ], + "status_code": 200 } ] } ``` +{% include copy-curl.html %} -The model generates embeddings in the `$.inference_results.*.output.*.data` field. The `output_map` maps this field to the newly created `passage_embedding` field in the search response document: +**Step 1: Index sample documents** + +Create an index and add some sample documents: ```json -"output_map": [ - { - "passage_embedding": "$.inference_results.*.output.*.data" - } -] +POST _bulk +{"index":{"_index":"demo-index-0","_id":"1"}} +{"diary":"I hate you"} +{"index":{"_index":"demo-index-0","_id":"2"}} +{"diary":"I love you"} +{"index":{"_index":"demo-index-0","_id":"3"}} +{"diary":"I dislike you"} ``` +{% include copy-curl.html %} -To configure an `ml_inference` search response processor with a local model, specify the `function_name` explicitly. In this example, the `function_name` is `text_embedding`. For information about valid `function_name` values, see [Request fields](#request-body-fields). +**Step 2: Create a search pipeline** -The following is the final configuration of the `ml_inference` search response processor with the local model: +For this example, you'll create a search pipeline that uses a text similarity model in a `one-to-one` inference mode, processing each document in the search results individually. This setup allows the model to make one prediction request per document, providing specific relevance insights for each search hit. When using `input_map` to map the search request to query text, the JSON path must start with `$._request` or `_request`: ```json -PUT /_search/pipeline/ml_inference_pipeline_local +PUT /_search/pipeline/my_rerank_pipeline { - "description": "search passage and generates embeddings", - "processors": [ + "response_processors": [ { "ml_inference": { - "function_name": "text_embedding", - "full_response_path": true, - "model_id": "<your model id>", - "model_config": { - "return_number": true, - "target_response": ["sentence_embedding"] - }, - "model_input": "{ \"text_docs\": ${input_map.text_docs}, \"return_number\": ${model_config.return_number}, \"target_response\": ${model_config.target_response} }", + "tag": "ml_inference", + "description": "This processor runs ml inference during search response", + "model_id": "Ialx65IBAs32TwoK1lXf", + "model_input":"""{"parameters":{"inputs":{"text":"${input_map.text}","text_pair":"${input_map.text_pair}"}}}""", + "function_name": "REMOTE", "input_map": [ { - "text_docs": "passage_text" + "text": "diary", + "text_pair":"$._request.query.term.diary.value" } ], "output_map": [ { - "passage_embedding": "$.inference_results.*.output.*.data" + "rank_score": "$.score" } ], - "ignore_missing": true, - "ignore_failure": true - } + "full_response_path": false, + "model_config": {}, + "ignore_missing": false, + "ignore_failure": false, + "one_to_one": true + }, + "rerank": { + "by_field": { + "target_field": "rank_score", + "remove_target_field": true + } + } } ] } ``` {% include copy-curl.html %} -**Step 2: Run the pipeline** +In this configuration, you've provided the following parameters: -Run the following query, providing the pipeline name in the request: +- The `model_id` parameter specifies the unique identifier of the text similarity model. +- The `function_name` parameter is set to `REMOTE`, indicating that the model is hosted externally. +- The `input_map` parameter maps the `diary` field from each document to the `text` input of the model as well as the search query term to the `text_pair` input. +- The `output_map` parameter maps the model's score to a field named `rank_score` in each document. +- The `model_input` parameter formats the input for the model, ensuring that it matches the structure expected by the Predict API. +- The `one_to_one` parameter is set to `true`, ensuring that the model processes each document individually rather than batching multiple documents together. +- The `ignore_missing` parameter is set to `false`, causing the processor to fail if the mapped fields are missing from a document. +- The `ignore_failure` parameter is set to `false`, causing the entire pipeline to fail if the ML inference processor encounters an error. + +The `rerank` processor is applied after ML inference. It reorders the documents based on the `rank_score` field generated by the ML model and then removes this field from the final results. + +**Step 3: Run the pipeline** + +Now perform a search using the created pipeline: ```json -GET /my_index/_search?search_pipeline=ml_inference_pipeline_local +GET /demo-index-0/_search?search_pipeline=my_rerank_pipeline { -"query": { - "term": { - "passage_text": { - "value": "hello" + "query": { + "term": { + "dairy": { + "value": "today" } } } @@ -350,13 +702,11 @@ GET /my_index/_search?search_pipeline=ml_inference_pipeline_local ``` {% include copy-curl.html %} -#### Response - -The response confirms that the processor has generated text embeddings in the `passage_embedding` field: +The response includes the original documents and their reranked scores: ```json { - "took": 288, + "took": 2, "timed_out": false, "_shards": { "total": 1, @@ -366,26 +716,43 @@ The response confirms that the processor has generated text embeddings in the `p }, "hits": { "total": { - "value": 1, + "value": 3, "relation": "eq" }, - "max_score": 0.00009405752, + "max_score": 0.040183373, "hits": [ { - "_index": "my_index", + "_index": "demo-index-0", "_id": "1", - "_score": 0.00009405752, + "_score": 0.040183373, "_source": { - "passage_text": "hello world", - "passage_embedding": [ - 0.017304314, - -0.021530833, - 0.050184276, - 0.08962978, - ...] + "diary": "I hate you" + } + }, + { + "_index": "demo-index-0", + "_id": "2", + "_score": 0.022628736, + "_source": { + "diary": "I love you" + } + }, + { + "_index": "demo-index-0", + "_id": "3", + "_score": 0.0073115323, + "_source": { + "diary": "I dislike you" } } ] + }, + "profile": { + "shards": [] } } -``` \ No newline at end of file +``` + +## Next steps + +- See a comprehensive example of [reranking by a field using an externally hosted cross-encoder model]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/rerank-by-field-cross-encoder/). \ No newline at end of file diff --git a/_search-plugins/search-pipelines/rerank-processor.md b/_search-plugins/search-pipelines/rerank-processor.md index e543c8a28a..11691eff95 100644 --- a/_search-plugins/search-pipelines/rerank-processor.md +++ b/_search-plugins/search-pipelines/rerank-processor.md @@ -11,33 +11,49 @@ grand_parent: Search pipelines Introduced 2.12 {: .label .label-purple } -The `rerank` search request processor intercepts search results and passes them to a cross-encoder model to be reranked. The model reranks the results, taking into account the scoring context. Then the processor orders documents in the search results based on their new scores. +The `rerank` search response processor intercepts and reranks search results. The processor orders documents in the search results based on their new scores. + +OpenSearch supports the following rerank types. + +Type | Description | Earliest available version +:--- | :--- | :--- +[`ml_opensearch`](#the-ml_opensearch-rerank-type) | Applies an OpenSearch-provided cross-encoder model. | 2.12 +[`by_field`](#the-by_field-rerank-type) | Applies reranking based on a user-provided field. | 2.18 ## Request body fields The following table lists all available request fields. -Field | Data type | Description -:--- | :--- | :--- -`<reranker_type>` | Object | The reranker type provides the rerank processor with static information needed across all reranking calls. Required. -`context` | Object | Provides the rerank processor with information necessary for generating reranking context at query time. -`tag` | String | The processor's identifier. Optional. -`description` | String | A description of the processor. Optional. -`ignore_failure` | Boolean | If `true`, OpenSearch [ignores any failure]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/creating-search-pipeline/#ignoring-processor-failures) of this processor and continues to run the remaining processors in the search pipeline. Optional. Default is `false`. +Field | Data type | Required/Optional | Description +:--- | :--- | :--- | :--- +`<rerank_type>` | Object | Required | The rerank type for document reranking. Valid values are `ml-opensearch` and `by_field`. +`context` | Object | Required for the `ml_opensearch` rerank type. Optional and does not affect the results for the `by_field` rerank type. | Provides the `rerank` processor with information necessary for reranking at query time. +`tag` | String | Optional | The processor's identifier. +`description` | String | Optional | A description of the processor. +`ignore_failure` | Boolean | Optional | If `true`, OpenSearch [ignores any failure]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/creating-search-pipeline/#ignoring-processor-failures) of this processor and continues to run the remaining processors in the search pipeline. Default is `false`. + +<!-- vale off --> +## The ml_opensearch rerank type +<!-- vale on --> +Introduced 2.12 +{: .label .label-purple } -### The `ml_opensearch` reranker type +To rerank results using a cross-encoder model, specify the `ml_opensearch` rerank type. -The `ml_opensearch` reranker type is designed to work with the cross-encoder model provided by OpenSearch. For this reranker type, specify the following fields. +### Prerequisite + +Before using the `ml_opensearch` rerank type, you must configure a cross-encoder model. For information about using an OpenSearch-provided model, see [Cross-encoder models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/pretrained-models/#cross-encoder-models). For information about using a custom model, see [Custom local models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/custom-local-models/). + +The `ml_opensearch` rerank type supports the following fields. All fields are required. Field | Data type | Description :--- | :--- | :--- -`ml_opensearch` | Object | Provides the rerank processor with model information. Required. -`ml_opensearch.model_id` | String | The model ID for the cross-encoder model. Required. For more information, see [Using ML models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/using-ml-models/). -`context.document_fields` | Array | An array of document fields that specifies the fields from which to retrieve context for the cross-encoder model. Required. +`ml_opensearch.model_id` | String | The model ID of the cross-encoder model for reranking. For more information, see [Using ML models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/using-ml-models/). +`context.document_fields` | Array | An array of document fields that specifies the fields from which to retrieve context for the cross-encoder model. -## Example +### Example -The following example demonstrates using a search pipeline with a `rerank` processor. +The following example demonstrates using a search pipeline with a `rerank` processor implemented using the `ml_opensearch` rerank type. For a complete example, see [Reranking using a cross-encoder model]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/rerank-cross-encoder/). ### Creating a search pipeline @@ -108,11 +124,72 @@ POST /_search?search_pipeline=rerank_pipeline ``` {% include copy-curl.html %} -The `query_context` object contains the following fields. +The `query_context` object contains the following fields. You must provide either `query_text` or `query_text_path` but cannot provide both simultaneously. + +Field name | Required/Optional | Description +:--- | :--- | :--- +`query_text` | Exactly one of `query_text` or `query_text_path` is required. | The natural language text of the question that you want to use to rerank the search results. +`query_text_path` | Exactly one of `query_text` or `query_text_path` is required. | The full JSON path to the text of the question that you want to use to rerank the search results. The maximum number of characters allowed in the path is `1000`. + + +<!-- vale off --> +## The by_field rerank type +<!-- vale on --> +Introduced 2.18 +{: .label .label-purple } + +To rerank results by a document field, specify the `by_field` rerank type. + +The `by_field` object supports the following fields. + +Field | Data type | Required/Optional | Description +:--- | :--- | :--- | :--- +`target_field` | String | Required | Specifies the field name or a dot path to the field containing the score to use for reranking. +`remove_target_field` | Boolean | Optional | If `true`, the response does not include the `target_field` used to perform reranking. Default is `false`. +`keep_previous_score` | Boolean | Optional | If `true`, the response includes a `previous_score` field, which contains the score calculated before reranking and can be useful when debugging. Default is `false`. + +### Example + +The following example demonstrates using a search pipeline with a `rerank` processor implemented using the `by_field` rerank type. For a complete example, see [Reranking by a document field]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/rerank-by-field/). + +### Creating a search pipeline + +The following request creates a search pipeline with a `by_field` rerank type response processor that ranks the documents by the `reviews.stars` field and specifies to return the original document score: + +```json +PUT /_search/pipeline/rerank_byfield_pipeline +{ + "response_processors": [ + { + "rerank": { + "by_field": { + "target_field": "reviews.stars", + "keep_previous_score" : true + } + } + } + ] +} +``` +{% include copy-curl.html %} + +### Using the search pipeline + +To apply the search pipeline to a query, provide the search pipeline name in the query parameter: + +```json +POST /book-index/_search?search_pipeline=rerank_byfield_pipeline +{ + "query": { + "match_all": {} + } +} +``` +{% include copy-curl.html %} -Field name | Description -:--- | :--- -`query_text` | The natural language text of the question that you want to use to rerank the search results. Either `query_text` or `query_text_path` (not both) is required. -`query_text_path` | The full JSON path to the text of the question that you want to use to rerank the search results. Either `query_text` or `query_text_path` (not both) is required. The maximum number of characters in the path is `1000`. +## Next steps -For more information about setting up reranking, see [Reranking search results]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/reranking-search-results/). \ No newline at end of file +- Learn more about [reranking search results]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/reranking-search-results/). +- See a complete example of [reranking using a cross-encoder model]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/rerank-cross-encoder/). +- See a complete example of [reranking by a document field]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/rerank-by-field/). +- See a comprehensive example of [reranking by a field using an externally hosted cross-encoder model]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/rerank-by-field-cross-encoder/). \ No newline at end of file diff --git a/_search-plugins/search-relevance/rerank-by-field-cross-encoder.md b/_search-plugins/search-relevance/rerank-by-field-cross-encoder.md new file mode 100644 index 0000000000..7f30689491 --- /dev/null +++ b/_search-plugins/search-relevance/rerank-by-field-cross-encoder.md @@ -0,0 +1,276 @@ +--- +layout: default +title: Reranking by a field using a cross-encoder +parent: Reranking search results +grand_parent: Search relevance +has_children: false +nav_order: 30 +--- + +# Reranking by a field using an externally hosted cross-encoder model +Introduced 2.18 +{: .label .label-purple } + +In this tutorial, you'll learn how to use a cross-encoder model hosted on Amazon SageMaker to rerank search results and improve search relevance. + +To rerank documents, you'll configure a search pipeline that processes search results at query time. The pipeline intercepts search results and passes them to the [`ml_inference` search response processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/ml-inference-search-response/), which invokes the cross-encoder model. The model generates scores used to rerank the matching documents [`by_field`]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/rerank-by-field/). + +## Prerequisite: Deploy a model on Amazon SageMaker + +Run the following code to deploy a model on Amazon SageMaker. For this example, you'll use the [`ms-marco-MiniLM-L-6-v2`](https://huggingface.co/cross-encoder/ms-marco-MiniLM-L-6-v2) Hugging Face cross-encoder model hosted on Amazon SageMaker. We recommend using a GPU for better performance: + +```python +import sagemaker +import boto3 +from sagemaker.huggingface import HuggingFaceModel + +sess = sagemaker.Session() +role = sagemaker.get_execution_role() + +hub = { + 'HF_MODEL_ID':'cross-encoder/ms-marco-MiniLM-L-6-v2', + 'HF_TASK':'text-classification' +} +huggingface_model = HuggingFaceModel( + transformers_version='4.37.0', + pytorch_version='2.1.0', + py_version='py310', + env=hub, + role=role, +) +predictor = huggingface_model.deploy( + initial_instance_count=1, # number of instances + instance_type='ml.m5.xlarge' # ec2 instance type +) +``` +{% include copy.html %} + +After deploying the model, you can find the model endpoint by going to the Amazon SageMaker console in the AWS Management Console and selecting **Inference > Endpoints** on the left tab. Note the URL for the created model; you'll use it to create a connector. + +## Running a search with reranking + +To run a search with reranking, follow these steps: + +1. [Create a connector](#step-1-create-a-connector). +1. [Register the model](#step-2-register-the-model). +1. [Ingest documents into an index](#step-3-ingest-documents-into-an-index). +1. [Create a search pipeline](#step-4-create-a-search-pipeline). +1. [Search using reranking](#step-5-search-using-reranking). + +## Step 1: Create a connector + +Create a connector to the cross-encoder model by providing the model URL in the `actions.url` parameter: + +```json +POST /_plugins/_ml/connectors/_create +{ + "name": "SageMaker cross-encoder model", + "description": "Test connector for SageMaker cross-encoder hosted model", + "version": 1, + "protocol": "aws_sigv4", + "credential": { + "access_key": "<YOUR_ACCESS_KEY>", + "secret_key": "<YOUR_SECRET_KEY>", + "session_token": "<YOUR_SESSION_TOKEN>" + }, + "parameters": { + "region": "<REGION>", + "service_name": "sagemaker" + }, + "actions": [ + { + "action_type": "predict", + "method": "POST", + "url": "<YOUR_SAGEMAKER_ENDPOINT_URL>", + "headers": { + "content-type": "application/json" + }, + "request_body": "{ \"inputs\": { \"text\": \"${parameters.text}\", \"text_pair\": \"${parameters.text_pair}\" }}" + } + ] +} +``` +{% include copy-curl.html %} + +Note the connector ID contained in the response; you'll use it in the following step. + +## Step 2: Register the model + +To register the model, provide the connector ID in the `connector_id` parameter: + +```json +POST /_plugins/_ml/models/_register +{ + "name": "Cross encoder model", + "version": "1.0.1", + "function_name": "remote", + "description": "Using a SageMaker endpoint to apply a cross encoder model", + "connector_id": "<YOUR_CONNECTOR_ID>" +} +``` +{% include copy-curl.html %} + + +## Step 3: Ingest documents into an index + +Create an index and ingest sample documents containing facts about the New York City boroughs: + +```json +POST /nyc_areas/_bulk +{ "index": { "_id": 1 } } +{ "borough": "Queens", "area_name": "Astoria", "description": "Astoria is a neighborhood in the western part of Queens, New York City, known for its diverse community and vibrant cultural scene.", "population": 93000, "facts": "Astoria is home to many artists and has a large Greek-American community. The area also boasts some of the best Mediterranean food in NYC." } +{ "index": { "_id": 2 } } +{ "borough": "Queens", "area_name": "Flushing", "description": "Flushing is a neighborhood in the northern part of Queens, famous for its Asian-American population and bustling business district.", "population": 227000, "facts": "Flushing is one of the most ethnically diverse neighborhoods in NYC, with a large Chinese and Korean population. It is also home to the USTA Billie Jean King National Tennis Center." } +{ "index": { "_id": 3 } } +{ "borough": "Brooklyn", "area_name": "Williamsburg", "description": "Williamsburg is a trendy neighborhood in Brooklyn known for its hipster culture, vibrant art scene, and excellent restaurants.", "population": 150000, "facts": "Williamsburg is a hotspot for young professionals and artists. The neighborhood has seen rapid gentrification over the past two decades." } +{ "index": { "_id": 4 } } +{ "borough": "Manhattan", "area_name": "Harlem", "description": "Harlem is a historic neighborhood in Upper Manhattan, known for its significant African-American cultural heritage.", "population": 116000, "facts": "Harlem was the birthplace of the Harlem Renaissance, a cultural movement that celebrated Black culture through art, music, and literature." } +{ "index": { "_id": 5 } } +{ "borough": "The Bronx", "area_name": "Riverdale", "description": "Riverdale is a suburban-like neighborhood in the Bronx, known for its leafy streets and affluent residential areas.", "population": 48000, "facts": "Riverdale is one of the most affluent areas in the Bronx, with beautiful parks, historic homes, and excellent schools." } +{ "index": { "_id": 6 } } +{ "borough": "Staten Island", "area_name": "St. George", "description": "St. George is the main commercial and cultural center of Staten Island, offering stunning views of Lower Manhattan.", "population": 15000, "facts": "St. George is home to the Staten Island Ferry terminal and is a gateway to Staten Island, offering stunning views of the Statue of Liberty and Ellis Island." } +``` +{% include copy-curl.html %} + +## Step 4: Create a search pipeline + +Next, create a search pipeline for reranking. In the search pipeline configuration, the `input_map` and `output_map` define how the input data is prepared for the cross-encoder model and how the model's output is interpreted for reranking: + +- The `input_map` specifies which fields in the search documents and the query should be used as model inputs: + - The `text` field maps to the `facts` field in the indexed documents. It provides the document-specific content that the model will analyze. + - The `text_pair` field dynamically retrieves the search query text (`multi_match.query`) from the search request. + + The combination of `text` (document `facts`) and `text_pair` (search `query`) allows the cross-encoder model to compare the relevance of the document to the query, considering their semantic relationship. + +- The `output_map` field specifies how the output of the model is mapped to the fields in the response: + - The `rank_score` field in the response will store the model's relevance score, which will be used to perform reranking. + +When using the `by_field` rerank type, the `rank_score` field will contain the same score as the `_score` field. To remove the `rank_score` field from the search results, set `remove_target_field` to `true`. The original BM25 score, before reranking, is included for debugging purposes by setting `keep_previous_score` to `true`. This allows you to compare the original score with the reranked score to evaluate improvements in search relevance. + +To create the search pipeline, send the following request: + +```json +PUT /_search/pipeline/my_pipeline +{ + "response_processors": [ + { + "ml_inference": { + "tag": "ml_inference", + "description": "This processor runs ml inference during search response", + "model_id": "<model_id_from_step_3>", + "function_name": "REMOTE", + "input_map": [ + { + "text": "facts", + "text_pair":"$._request.query.multi_match.query" + } + ], + "output_map": [ + { + "rank_score": "$.score" + } + ], + "full_response_path": false, + "model_config": {}, + "ignore_missing": false, + "ignore_failure": false, + "one_to_one": true + }, + + "rerank": { + "by_field": { + "target_field": "rank_score", + "remove_target_field": true, + "keep_previous_score" : true + } + } + + } + ] +} +``` +{% include copy-curl.html %} + +## Step 5: Search using reranking + +Use the following request to search indexed documents and rerank them using the cross-encoder model. The request retrieves documents containing any of the specified terms in the `description` or `facts` fields. These terms are then used to compare and rerank the matched documents: + +```json +POST /nyc_areas/_search?search_pipeline=my_pipeline +{ + "query": { + "multi_match": { + "query": "artists art creative community", + "fields": ["description", "facts"] + } + } +} +``` +{% include copy-curl.html %} + +In the response, the `previous_score` field contains the document's BM25 score, which it would have received if you hadn't applied the pipeline. Note that while BM25 ranked "Astoria" the highest, the cross-encoder model prioritized "Harlem" because it matched more search terms: + +```json +{ + "took": 4, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 3, + "relation": "eq" + }, + "max_score": 0.03418137, + "hits": [ + { + "_index": "nyc_areas", + "_id": "4", + "_score": 0.03418137, + "_source": { + "area_name": "Harlem", + "description": "Harlem is a historic neighborhood in Upper Manhattan, known for its significant African-American cultural heritage.", + "previous_score": 1.6489418, + "borough": "Manhattan", + "facts": "Harlem was the birthplace of the Harlem Renaissance, a cultural movement that celebrated Black culture through art, music, and literature.", + "population": 116000 + } + }, + { + "_index": "nyc_areas", + "_id": "1", + "_score": 0.0090838, + "_source": { + "area_name": "Astoria", + "description": "Astoria is a neighborhood in the western part of Queens, New York City, known for its diverse community and vibrant cultural scene.", + "previous_score": 2.519608, + "borough": "Queens", + "facts": "Astoria is home to many artists and has a large Greek-American community. The area also boasts some of the best Mediterranean food in NYC.", + "population": 93000 + } + }, + { + "_index": "nyc_areas", + "_id": "3", + "_score": 0.0032599436, + "_source": { + "area_name": "Williamsburg", + "description": "Williamsburg is a trendy neighborhood in Brooklyn known for its hipster culture, vibrant art scene, and excellent restaurants.", + "previous_score": 1.5632852, + "borough": "Brooklyn", + "facts": "Williamsburg is a hotspot for young professionals and artists. The neighborhood has seen rapid gentrification over the past two decades.", + "population": 150000 + } + } + ] + }, + "profile": { + "shards": [] + } +} +``` + \ No newline at end of file diff --git a/_search-plugins/search-relevance/rerank-by-field.md b/_search-plugins/search-relevance/rerank-by-field.md new file mode 100644 index 0000000000..e6f65a4d25 --- /dev/null +++ b/_search-plugins/search-relevance/rerank-by-field.md @@ -0,0 +1,209 @@ +--- +layout: default +title: Reranking by a field +parent: Reranking search results +grand_parent: Search relevance +has_children: false +nav_order: 20 +--- + +# Reranking search results by a field +Introduced 2.18 +{: .label .label-purple } + +You can use a `by_field` rerank type to rerank search results by a document field. Reranking search results by a field is useful if a model has already run and produced a numerical score for your documents or if a previous search response processor was applied and you want to rerank documents differently based on an aggregated field. + +To implement reranking, you need to configure a [search pipeline]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/index/) that runs at search time. The search pipeline intercepts search results and applies the [`rerank` processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/) to them. The `rerank` processor evaluates the search results and sorts them based on the new scores obtained from a document field. + +## Running a search with reranking + +To run a search with reranking, follow these steps: + +1. [Configure a search pipeline](#step-1-configure-a-search-pipeline). +1. [Create an index for ingestion](#step-2-create-an-index-for-ingestion). +1. [Ingest documents into the index](#step-3-ingest-documents-into-the-index). +1. [Search using reranking](#step-4-search-using-reranking). + +## Step 1: Configure a search pipeline + +Configure a search pipeline with a [`rerank` processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/) and specify the `by_field` rerank type. The pipeline sorts by the `reviews.stars` field (specified by a complete dot path to the field) and returns the original query scores for all documents along with their new scores: + +```json +PUT /_search/pipeline/rerank_byfield_pipeline +{ + "response_processors": [ + { + "rerank": { + "by_field": { + "target_field": "reviews.stars", + "keep_previous_score" : true + } + } + } + ] +} +``` +{% include copy-curl.html %} + +For more information about the request fields, see [Request fields]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/#request-body-fields). + +## Step 2: Create an index for ingestion + +In order to use the `rerank` processor defined in your pipeline, create an OpenSearch index and add the pipeline created in the previous step as the default pipeline: + +```json +PUT /book-index +{ + "settings": { + "index.search.default_pipeline" : "rerank_byfield_pipeline" + }, + "mappings": { + "properties": { + "title": { + "type": "text" + }, + "author": { + "type": "text" + }, + "genre": { + "type": "keyword" + }, + "reviews": { + "properties": { + "stars": { + "type": "float" + } + } + }, + "description": { + "type": "text" + } + } + } +} +``` +{% include copy-curl.html %} + +## Step 3: Ingest documents into the index + +To ingest documents into the index created in the previous step, send the following bulk request: + +```json +POST /_bulk +{ "index": { "_index": "book-index", "_id": "1" } } +{ "title": "The Lost City", "author": "Jane Doe", "genre": "Adventure Fiction", "reviews": { "stars": 4.2 }, "description": "An exhilarating journey through a hidden civilization in the Amazon rainforest." } +{ "index": { "_index": "book-index", "_id": "2" } } +{ "title": "Whispers of the Past", "author": "John Smith", "genre": "Historical Mystery", "reviews": { "stars": 4.7 }, "description": "A gripping tale set in Victorian England, unraveling a century-old mystery." } +{ "index": { "_index": "book-index", "_id": "3" } } +{ "title": "Starlit Dreams", "author": "Emily Clark", "genre": "Science Fiction", "reviews": { "stars": 4.5 }, "description": "In a future where dreams can be shared, one girl discovers her imaginations power." } +{ "index": { "_index": "book-index", "_id": "4" } } +{ "title": "The Enchanted Garden", "author": "Alice Green", "genre": "Fantasy", "reviews": { "stars": 4.8 }, "description": "A magical garden holds the key to a young girls destiny and friendship." } + +``` +{% include copy-curl.html %} + +## Step 4: Search using reranking + +As an example, run a `match_all` query on your index: + +```json +POST /book-index/_search +{ + "query": { + "match_all": {} + } +} +``` +{% include copy-curl.html %} + +The response contains documents sorted in descending order based on the `reviews.stars` field. Each document contains the original query score in the `previous_score` field: + +```json +{ + "took": 33, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 4, + "relation": "eq" + }, + "max_score": 4.8, + "hits": [ + { + "_index": "book-index", + "_id": "4", + "_score": 4.8, + "_source": { + "reviews": { + "stars": 4.8 + }, + "author": "Alice Green", + "genre": "Fantasy", + "description": "A magical garden holds the key to a young girls destiny and friendship.", + "previous_score": 1, + "title": "The Enchanted Garden" + } + }, + { + "_index": "book-index", + "_id": "2", + "_score": 4.7, + "_source": { + "reviews": { + "stars": 4.7 + }, + "author": "John Smith", + "genre": "Historical Mystery", + "description": "A gripping tale set in Victorian England, unraveling a century-old mystery.", + "previous_score": 1, + "title": "Whispers of the Past" + } + }, + { + "_index": "book-index", + "_id": "3", + "_score": 4.5, + "_source": { + "reviews": { + "stars": 4.5 + }, + "author": "Emily Clark", + "genre": "Science Fiction", + "description": "In a future where dreams can be shared, one girl discovers her imaginations power.", + "previous_score": 1, + "title": "Starlit Dreams" + } + }, + { + "_index": "book-index", + "_id": "1", + "_score": 4.2, + "_source": { + "reviews": { + "stars": 4.2 + }, + "author": "Jane Doe", + "genre": "Adventure Fiction", + "description": "An exhilarating journey through a hidden civilization in the Amazon rainforest.", + "previous_score": 1, + "title": "The Lost City" + } + } + ] + }, + "profile": { + "shards": [] + } +} +``` + +## Next steps + +- Learn more about the [`rerank` processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/). +- See a comprehensive example of [reranking by a field using an externally hosted cross-encoder model]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/rerank-by-field-cross-encoder/). \ No newline at end of file diff --git a/_search-plugins/search-relevance/rerank-cross-encoder.md b/_search-plugins/search-relevance/rerank-cross-encoder.md new file mode 100644 index 0000000000..64f93c886c --- /dev/null +++ b/_search-plugins/search-relevance/rerank-cross-encoder.md @@ -0,0 +1,122 @@ +--- +layout: default +title: Reranking using a cross-encoder model +parent: Reranking search results +grand_parent: Search relevance +has_children: false +nav_order: 10 +--- + +# Reranking search results using a cross-encoder model +Introduced 2.12 +{: .label .label-purple } + +You can rerank search results using a cross-encoder model in order to improve search relevance. To implement reranking, you need to configure a [search pipeline]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/index/) that runs at search time. The search pipeline intercepts search results and applies the [`rerank` processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/) to them. The `rerank` processor evaluates the search results and sorts them based on the new scores provided by the cross-encoder model. + +**PREREQUISITE**<br> +Before configuring a reranking pipeline, you must set up a cross-encoder model. For information about using an OpenSearch-provided model, see [Cross-encoder models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/pretrained-models/#cross-encoder-models). For information about using a custom model, see [Custom local models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/custom-local-models/). +{: .note} + +## Running a search with reranking + +To run a search with reranking, follow these steps: + +1. [Configure a search pipeline](#step-1-configure-a-search-pipeline). +1. [Create an index for ingestion](#step-2-create-an-index-for-ingestion). +1. [Ingest documents into the index](#step-3-ingest-documents-into-the-index). +1. [Search using reranking](#step-4-search-using-reranking). + +## Step 1: Configure a search pipeline + +Next, configure a search pipeline with a [`rerank` processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/) and specify the `ml_opensearch` rerank type. In the request, provide a model ID for the cross-encoder model and the document fields to use as context: + +```json +PUT /_search/pipeline/my_pipeline +{ + "description": "Pipeline for reranking with a cross-encoder", + "response_processors": [ + { + "rerank": { + "ml_opensearch": { + "model_id": "gnDIbI0BfUsSoeNT_jAw" + }, + "context": { + "document_fields": [ + "passage_text" + ] + } + } + } + ] +} +``` +{% include copy-curl.html %} + +For more information about the request fields, see [Request fields]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/#request-body-fields). + +## Step 2: Create an index for ingestion + +In order to use the `rerank` processor defined in your pipeline, create an OpenSearch index and add the pipeline created in the previous step as the default pipeline: + +```json +PUT /my-index +{ + "settings": { + "index.search.default_pipeline" : "my_pipeline" + }, + "mappings": { + "properties": { + "passage_text": { + "type": "text" + } + } + } +} +``` +{% include copy-curl.html %} + +## Step 3: Ingest documents into the index + +To ingest documents into the index created in the previous step, send the following bulk request: + +```json +POST /_bulk +{ "index": { "_index": "my-index" } } +{ "passage_text" : "I said welcome to them and we entered the house" } +{ "index": { "_index": "my-index" } } +{ "passage_text" : "I feel welcomed in their family" } +{ "index": { "_index": "my-index" } } +{ "passage_text" : "Welcoming gifts are great" } + +``` +{% include copy-curl.html %} + +## Step 4: Search using reranking + +To perform a reranking search on your index, use any OpenSearch query and provide an additional `ext.rerank` field: + +```json +POST /my-index/_search +{ + "query": { + "match": { + "passage_text": "how to welcome in family" + } + }, + "ext": { + "rerank": { + "query_context": { + "query_text": "how to welcome in family" + } + } + } +} +``` +{% include copy-curl.html %} + +Alternatively, you can provide the full path to the field containing the context. For more information, see [Rerank processor example]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/#example). + +## Next steps + +- Learn more about the [`rerank` processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/). +- See a comprehensive example of [reranking by a field using an externally hosted cross-encoder model]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/rerank-by-field-cross-encoder/). \ No newline at end of file diff --git a/_search-plugins/search-relevance/reranking-search-results.md b/_search-plugins/search-relevance/reranking-search-results.md index b1e480a84a..065e069b5a 100644 --- a/_search-plugins/search-relevance/reranking-search-results.md +++ b/_search-plugins/search-relevance/reranking-search-results.md @@ -2,7 +2,7 @@ layout: default title: Reranking search results parent: Search relevance -has_children: false +has_children: true nav_order: 60 --- @@ -10,112 +10,12 @@ nav_order: 60 Introduced 2.12 {: .label .label-purple } -You can rerank search results using a cross-encoder reranker in order to improve search relevance. To implement reranking, you need to configure a [search pipeline]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/index/) that runs at search time. The search pipeline intercepts search results and applies the [`rerank` processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/) to them. The `rerank` processor evaluates the search results and sorts them based on the new scores provided by the cross-encoder model. +You can rerank search results using a [`rerank` processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/) in order to improve search relevance. To implement reranking, you need to configure a [search pipeline]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/index/) that runs at search time. The search pipeline intercepts search results and applies the `rerank` processor to them. The `rerank` processor evaluates the search results and sorts them based on the new scores. -**PREREQUISITE**<br> -Before configuring a reranking pipeline, you must set up a cross-encoder model. For information about using an OpenSearch-provided model, see [Cross-encoder models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/pretrained-models/#cross-encoder-models). For information about using a custom model, see [Custom local models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/custom-local-models/). -{: .note} - -## Running a search with reranking - -To run a search with reranking, follow these steps: - -1. [Configure a search pipeline](#step-1-configure-a-search-pipeline). -1. [Create an index for ingestion](#step-2-create-an-index-for-ingestion). -1. [Ingest documents into the index](#step-3-ingest-documents-into-the-index). -1. [Search using reranking](#step-4-search-using-reranking). - -## Step 1: Configure a search pipeline - -Next, configure a search pipeline with a [`rerank` processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/). - -The following example request creates a search pipeline with an `ml_opensearch` rerank processor. In the request, provide a model ID for the cross-encoder model and the document fields to use as context: - -```json -PUT /_search/pipeline/my_pipeline -{ - "description": "Pipeline for reranking with a cross-encoder", - "response_processors": [ - { - "rerank": { - "ml_opensearch": { - "model_id": "gnDIbI0BfUsSoeNT_jAw" - }, - "context": { - "document_fields": [ - "passage_text" - ] - } - } - } - ] -} -``` -{% include copy-curl.html %} - -For more information about the request fields, see [Request fields]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/#request-body-fields). - -## Step 2: Create an index for ingestion - -In order to use the rerank processor defined in your pipeline, create an OpenSearch index and add the pipeline created in the previous step as the default pipeline: +You can rerank results in the following ways: -```json -PUT /my-index -{ - "settings": { - "index.search.default_pipeline" : "my_pipeline" - }, - "mappings": { - "properties": { - "passage_text": { - "type": "text" - } - } - } -} -``` -{% include copy-curl.html %} - -## Step 3: Ingest documents into the index - -To ingest documents into the index created in the previous step, send the following bulk request: - -```json -POST /_bulk -{ "index": { "_index": "my-index" } } -{ "passage_text" : "I said welcome to them and we entered the house" } -{ "index": { "_index": "my-index" } } -{ "passage_text" : "I feel welcomed in their family" } -{ "index": { "_index": "my-index" } } -{ "passage_text" : "Welcoming gifts are great" } - -``` -{% include copy-curl.html %} - -## Step 4: Search using reranking - -To perform reranking search on your index, use any OpenSearch query and provide an additional `ext.rerank` field: - -```json -POST /my-index/_search -{ - "query": { - "match": { - "passage_text": "how to welcome in family" - } - }, - "ext": { - "rerank": { - "query_context": { - "query_text": "how to welcome in family" - } - } - } -} -``` -{% include copy-curl.html %} - -Alternatively, you can provide the full path to the field containing the context. For more information, see [Rerank processor example]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/#example). +- [Using a cross-encoder model]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/rerank-cross-encoder/) +- [By a document field]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/rerank-by-field/) ## Using rerank and normalization processors together @@ -130,4 +30,10 @@ The processing order is as follows: This processing order has the following implications: - Score modification: The rerank processor modifies the scores that were initially adjusted by the normalization processor, potentially leading to different ranking results than initially expected. -- Hybrid queries: In the context of hybrid queries, where multiple types of queries and scoring mechanisms are combined, this behavior is particularly noteworthy. The combined scores from the initial query are normalized first and then reranked, resulting in a two-stage scoring modification. \ No newline at end of file +- Hybrid queries: In the context of hybrid queries, where multiple types of queries and scoring mechanisms are combined, this behavior is particularly noteworthy. The combined scores from the initial query are normalized first and then reranked, resulting in a two-phase scoring modification. + +## Next steps + +- See a complete example of [reranking using a cross-encoder model]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/rerank-cross-encoder/). +- See a complete example of [reranking by a document field]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/rerank-by-field/). +- Learn more about the [`rerank` processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/). \ No newline at end of file diff --git a/_search-plugins/searching-data/index.md b/_search-plugins/searching-data/index.md index 279958d97c..42ce7654a0 100644 --- a/_search-plugins/searching-data/index.md +++ b/_search-plugins/searching-data/index.md @@ -19,4 +19,4 @@ Feature | Description [Sort results]({{site.url}}{{site.baseurl}}/opensearch/search/sort/) | Allow sorting of results by different criteria. [Highlight query matches]({{site.url}}{{site.baseurl}}/opensearch/search/highlight/) | Highlight the search term in the results. [Retrieve inner hits]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/inner-hits/) | Retrieve underlying hits in nested and parent-join objects. -[Retrieve specific fields]({{site.url}}{{site.baseurl}}search-plugins/searching-data/retrieve-specific-fields/) | Retrieve only the specific fields +[Retrieve specific fields]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/retrieve-specific-fields/) | Retrieve only the specific fields diff --git a/_search-plugins/searching-data/point-in-time.md b/_search-plugins/searching-data/point-in-time.md index ee09354c0f..e1ecd350b1 100644 --- a/_search-plugins/searching-data/point-in-time.md +++ b/_search-plugins/searching-data/point-in-time.md @@ -37,6 +37,10 @@ The create PIT operation returns a PIT ID, which you can use to run multiple que In case of a cluster or node failure, all PIT data is lost. {: .note} +### PIT in SQL + +The [SQL plugin]({{site.url}}{{site.baseurl}}/search-plugins/sql/index/) also supports pagination using PIT. When the `plugin.sql.pagination.api` setting is enabled (the default), SQL search queries in OpenSearch automatically use PIT internally. For more information, see [Pagination in SQL]({{site.url}}{{site.baseurl}}/search-plugins/sql/sql-ppl-api/#paginating-results). + ## Pagination with PIT and search_after When you run a query with a PIT ID, you can use the `search_after` parameter to retrieve the next page of results. This gives you control over the order of documents in the pages of results. diff --git a/_search-plugins/sql/limitation.md b/_search-plugins/sql/limitation.md index ac4a6ed619..3382a41912 100644 --- a/_search-plugins/sql/limitation.md +++ b/_search-plugins/sql/limitation.md @@ -94,7 +94,7 @@ Such queries are successfully executed by the `V2` engine unless they have `V1`- * `json` formatted output is supported in `V1` engine only. * The `V2` engine does not track query execution time, so slow queries are not reported. * The `V2` query engine not only runs queries in the OpenSearch engine but also supports post-processing for complex queries. Accordingly, the `explain` output is no longer OpenSearch domain-specific language (DSL) but also includes query plan information from the `V2` query engine. -Suggested change * The `V2` query engine does not support aggregation queries such as `histogram`, `date_histogram`, `percentiles`, `topHits`, `stats`, `extended_stats`, `terms`, or `range`. * JOINs and sub-queries are not supported. To stay up to date on the development for JOINs and sub-queries, track [GitHub issue #1441](https://github.com/opensearch-project/sql/issues/1441) and [GitHub issue #892](https://github.com/opensearch-project/sql/issues/892). -* PartiQL syntax for `nested` queries are not supported. Additionally, arrays of objects and primitive types return the first index of the array, while in `V1` they return the entire array as a JSON object. +* OpenSearch does not natively support the array data type but does allow multi-value fields implicitly. The SQL/PPL plugin adheres strictly to the data type semantics defined in index mappings. When parsing OpenSearch responses, it expects data to match the declared type and does not interpret all data in an array. If the [`plugins.query.field_type_tolerance`](https://github.com/opensearch-project/sql/blob/main/docs/user/admin/settings.rst#plugins-query-field-type-tolerance) setting is enabled, the SQL/PPL plugin handles array datasets by returning scalar data types, allowing basic queries (for example, `SELECT * FROM tbl WHERE condition`). However, using multi-value fields in expressions or functions will result in exceptions. If this setting is disabled or not set, only the first element of an array is returned, preserving the default behavior. +* PartiQL syntax for `nested` queries is not supported. diff --git a/_search-plugins/sql/settings.md b/_search-plugins/sql/settings.md index 4842f98449..28d8c05da3 100644 --- a/_search-plugins/sql/settings.md +++ b/_search-plugins/sql/settings.md @@ -79,6 +79,7 @@ Setting | Default | Description `plugins.query.memory_limit` | 85% | Configures the heap memory usage limit for the circuit breaker of the query engine. `plugins.query.size_limit` | 200 | Sets the default size of index that the query engine fetches from OpenSearch. `plugins.query.datasources.enabled` | true | Change to `false` to disable support for data sources in the plugin. +`plugins.query.field_type_tolerance` | true | If `false`, then an array is reduced to the first non-array value at any nesting level. For example, `[[1, 2], [3, 4]]` will be reduced to `1`. If `true`, then the array is preserved. Default is `true`. ## Spark connector settings diff --git a/_search-plugins/star-tree-index.md b/_search-plugins/star-tree-index.md new file mode 100644 index 0000000000..23d4b11c15 --- /dev/null +++ b/_search-plugins/star-tree-index.md @@ -0,0 +1,190 @@ +--- +layout: default +title: Star-tree index +parent: Improving search performance +nav_order: 54 +--- + +# Star-tree index + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, join the discussion on the [OpenSearch forum](https://forum.opensearch.org/). +{: .warning} + +A star-tree index is a multi-field index that improves the performance of aggregations. + +OpenSearch will automatically use a star-tree index to optimize aggregations if the queried fields are part of dimension fields and the aggregations are on star-tree metric fields. No changes are required in the query syntax or the request parameters. + +## When to use a star-tree index + +A star-tree index can be used to perform faster aggregations. Consider the following criteria and features when deciding to use a star-tree index: + +- Star-tree indexes natively support multi-field aggregations. +- Star-tree indexes are created in real time as part of the indexing process, so the data in a star-tree will always be up to date. +- A star-tree index consolidates data, increasing index paging efficiency and using less IO for search queries. + +## Limitations + +Star-tree indexes have the following limitations: + +- A star-tree index should only be enabled on indexes whose data is not updated or deleted because updates and deletions are not accounted for in a star-tree index. +- A star-tree index can be used for aggregation queries only if the queried fields are a subset of the star-tree's dimensions and the aggregated fields are a subset of the star-tree's metrics. +- After a star-tree index is enabled, it cannot be disabled. In order to disable a star-tree index, the data in the index must be reindexed without the star-tree mapping. Furthermore, changing a star-tree configuration will also require a reindex operation. +- [Multi-values/array values]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/index/#arrays) are not supported. +- Only [limited queries and aggregations](#supported-queries-and-aggregations) are supported. Support for more features will be added in future versions. +- The cardinality of the dimensions should not be very high (as with `_id` fields). Higher cardinality leads to increased storage usage and query latency. + +## Star-tree index structure + +The following image illustrates a standard star-tree index structure. + +<img src="{{site.url}}{{site.baseurl}}/images/star-tree-index.png" alt="A star-tree index containing two dimensions and two metrics" width="700"> + +Sorted and aggregated star-tree documents are backed by `doc_values` in an index. The columnar data found in `doc_values` is stored using the following properties: + +- The values are sorted based on the fields set in the `ordered_dimension` setting. In the preceding image, the dimensions are determined by the `status` setting and then by the `port` for each status. +- For each unique dimension/value combination, the aggregated values for all the metrics, such as `avg(size)` and `count(requests)`, are precomputed during ingestion. + +### Leaf nodes + +Each node in a star-tree index points to a range of star-tree documents. Nodes can be further split into child nodes based on the [max_leaf_docs configuration]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/star-tree/#star-tree-index-configuration-options). The number of documents that a leaf node points to is less than or equal to the value set in `max_leaf_docs`. This ensures that the maximum number of documents that need to traverse nodes to derive an aggregated value is at most the number of `max_leaf_docs`, which provides predictable latency. + +### Star nodes + +A star node contains the aggregated data of all the other nodes for a particular dimension, acting as a "catch-all" node. When a star node is found in a dimension, that dimension is skipped during aggregation. This groups together all values of that dimension and allows a query to skip non-competitive nodes when fetching the aggregated value of a particular field. + +The star-tree index structure diagram contains the following three examples demonstrating how a query behaves when retrieving aggregations from nodes in the star-tree: + +- **Blue**: In a `terms` query that searches for the average request size aggregation, the `port` equals `8443` and the status equals `200`. Because the query contains values in both the `status` and `port` dimensions, the query traverses status node `200` and returns the aggregations from child node `8443`. +- **Green**: In a `term` query that searches for the number of aggregation requests, the `status` equals `200`. Because the query only contains a value from the `status` dimension, the query traverses the `200` node's child star node, which contains the aggregated value of all the `port` child nodes. +- **Red**: In a `term` query that searches for the average request size aggregation, the port equals `5600`. Because the query does not contain a value from the `status` dimension, the query traverses a star node and returns the aggregated result from the `5600` child node. + +Support for the `Terms` query will be added in a future version. For more information, see [GitHub issue #15257](https://github.com/opensearch-project/OpenSearch/issues/15257). +{: .note} + +## Enabling a star-tree index + +To use a star-tree index, modify the following settings: + +- Set the feature flag `opensearch.experimental.feature.composite_index.star_tree.enabled` to `true`. For more information about enabling and disabling feature flags, see [Enabling experimental features]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/experimental/). +- Set the `indices.composite_index.star_tree.enabled` setting to `true`. For instructions on how to configure OpenSearch, see [Configuring settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index/#static-settings). +- Set the `index.composite_index` index setting to `true` during index creation. +- Ensure that the `doc_values` parameter is enabled for the `dimensions` and `metrics` fields used in your star-tree mapping. + + +## Example mapping + +In the following example, index mappings define the star-tree configuration. The star-tree index precomputes aggregations in the `logs` index. The aggregations are calculated on the `size` and `latency` fields for all the combinations of values indexed in the `port` and `status` fields: + +```json +PUT logs +{ + "settings": { + "index.number_of_shards": 1, + "index.number_of_replicas": 0, + "index.composite_index": true + }, + "mappings": { + "composite": { + "request_aggs": { + "type": "star_tree", + "config": { + "ordered_dimensions": [ + { + "name": "status" + }, + { + "name": "port" + } + ], + "metrics": [ + { + "name": "size", + "stats": [ + "sum" + ] + }, + { + "name": "latency", + "stats": [ + "avg" + ] + } + ] + } + } + }, + "properties": { + "status": { + "type": "integer" + }, + "port": { + "type": "integer" + }, + "size": { + "type": "integer" + }, + "latency": { + "type": "scaled_float", + "scaling_factor": 10 + } + } + } +} +``` + +For detailed information about star-tree index mappings and parameters, see [Star-tree field type]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/star-tree/). + +## Supported queries and aggregations + +Star-tree indexes can be used to optimize queries and aggregations. + +### Supported queries + +The following queries are supported as of OpenSearch 2.18: + +- [Term query](https://opensearch.org/docs/latest/query-dsl/term/term/) +- [Match all docs query](https://opensearch.org/docs/latest/query-dsl/match-all/) + +To use a query with a star-tree index, the query's fields must be present in the `ordered_dimensions` section of the star-tree configuration. Queries must also be paired with a supported aggregation. + +### Supported aggregations + +The following metric aggregations are supported as of OpenSearch 2.18: +- [Sum](https://opensearch.org/docs/latest/aggregations/metric/sum/) +- [Minimum](https://opensearch.org/docs/latest/aggregations/metric/minimum/) +- [Maximum](https://opensearch.org/docs/latest/aggregations/metric/maximum/) +- [Value count](https://opensearch.org/docs/latest/aggregations/metric/value-count/) +- [Average](https://opensearch.org/docs/latest/aggregations/metric/average/) + +To use aggregations: + +- The fields must be present in the `metrics` section of the star-tree configuration. +- The metric aggregation type must be part of the `stats` parameter. + +### Aggregation example + +The following example gets the sum of all the values in the `size` field for all error logs with `status=500`, using the [example mapping](#example-mapping): + +```json +POST /logs/_search +{ + "query": { + "term": { + "status": "500" + } + }, + "aggs": { + "sum_size": { + "sum": { + "field": "size" + } + } + } +} +``` + +Using a star-tree index, the result will be retrieved from a single aggregated document as it traverses the `status=500` node, as opposed to scanning through all of the matching documents. This results in lower query latency. + +## Using queries without a star-tree index + +Set the `indices.composite_index.star_tree.enabled` setting to `false` to run queries without using a star-tree index. diff --git a/_security/audit-logs/storage-types.md b/_security/audit-logs/storage-types.md index 719287ad7f..a07d98db59 100644 --- a/_security/audit-logs/storage-types.md +++ b/_security/audit-logs/storage-types.md @@ -16,6 +16,7 @@ Setting | Description :--- | :--- debug | Outputs to stdout. Useful for testing and debugging. internal_opensearch | Writes to an audit index on the current OpenSearch cluster. +internal_opensearch_data_stream | Writes to an audit log data stream on the current OpenSearch cluster. external_opensearch | Writes to an audit index on a remote OpenSearch cluster. webhook | Sends events to an arbitrary HTTP endpoint. log4j | Writes the events to a Log4j logger. You can use any Log4j [appender](https://logging.apache.org/log4j/2.x/manual/appenders.html), such as SNMP, JDBC, Cassandra, and Kafka. @@ -23,10 +24,29 @@ log4j | Writes the events to a Log4j logger. You can use any Log4j [appender](ht You configure the output location in `opensearch.yml`: ``` -plugins.security.audit.type: <debug|internal_opensearch|external_opensearch|webhook|log4j> +plugins.security.audit.type: <debug|internal_opensearch|internal_opensearch_data_stream|external_opensearch|webhook|log4j> ``` -`external_opensearch`, `webhook`, and `log4j` all have additional configuration options. Details follow. +`internal_opensearch_data_stream`, `external_opensearch`, `webhook`, and `log4j` can be customized with additional configuration options. For more information, see [Internal OpenSearch data streams](#internal-opensearch-data-streams). + + +## Internal OpenSearch data streams + +You can configure the `internal_opensearch_data_stream` type with the following parameters. + + +Name | Data type | Description +:--- | :--- | :--- +`plugins.security.audit.config.data_stream.name` | String | The name of the audit log data stream. Default is `opensearch-security-auditlog`. + +### Template settings + +Name | Data type | Description +:--- | :--- | :--- +`plugins.security.audit.config.data_stream.template.manage` | Boolean | When `true`, the template for the data stream is managed by OpenSearch. Default is `true`. +`plugins.security.audit.config.data_stream.template.name` | String | The name of the data stream template. Default is `opensearch-security-auditlog`. +`plugins.security.audit.config.data_stream.template.number_of_replicas` | Integer | The number of replicas for the data stream. Default is `0`. +`plugins.security.audit.config.data_stream.template.number_of_shards` | Integer | The number of shards for the data stream. Default is `1`. ## External OpenSearch diff --git a/_tools/k8s-operator.md b/_tools/k8s-operator.md index 7ee1c1adee..5027dcf304 100644 --- a/_tools/k8s-operator.md +++ b/_tools/k8s-operator.md @@ -63,40 +63,40 @@ Then install the OpenSearch Kubernetes Operator using the following steps: 3. Enter `make build manifests`. 4. Start a Kubernetes cluster. When using minikube, open a new terminal window and enter `minikube start`. Kubernetes will now use a containerized minikube cluster with a namespace called `default`. Make sure that `~/.kube/config` points to the cluster. -```yml -apiVersion: v1 -clusters: -- cluster: - certificate-authority: /Users/naarcha/.minikube/ca.crt - extensions: - - extension: - last-update: Mon, 29 Aug 2022 10:11:47 CDT - provider: minikube.sigs.k8s.io - version: v1.26.1 - name: cluster_info - server: https://127.0.0.1:61661 - name: minikube -contexts: -- context: - cluster: minikube - extensions: - - extension: - last-update: Mon, 29 Aug 2022 10:11:47 CDT - provider: minikube.sigs.k8s.io - version: v1.26.1 - name: context_info - namespace: default - user: minikube - name: minikube -current-context: minikube -kind: Config -preferences: {} -users: -- name: minikube - user: - client-certificate: /Users/naarcha/.minikube/profiles/minikube/client.crt - client-key: /Users/naarcha/.minikube/profiles/minikube/client.key -``` + ```yml + apiVersion: v1 + clusters: + - cluster: + certificate-authority: /Users/naarcha/.minikube/ca.crt + extensions: + - extension: + last-update: Mon, 29 Aug 2022 10:11:47 CDT + provider: minikube.sigs.k8s.io + version: v1.26.1 + name: cluster_info + server: https://127.0.0.1:61661 + name: minikube + contexts: + - context: + cluster: minikube + extensions: + - extension: + last-update: Mon, 29 Aug 2022 10:11:47 CDT + provider: minikube.sigs.k8s.io + version: v1.26.1 + name: context_info + namespace: default + user: minikube + name: minikube + current-context: minikube + kind: Config + preferences: {} + users: + - name: minikube + user: + client-certificate: /Users/naarcha/.minikube/profiles/minikube/client.crt + client-key: /Users/naarcha/.minikube/profiles/minikube/client.key + ``` 5. Enter `make install` to create the CustomResourceDefinition that runs in your Kubernetes cluster. 6. Start the OpenSearch Kubernetes Operator. Enter `make run`. @@ -146,4 +146,4 @@ kubectl delete -f opensearch-cluster.yaml To learn more about how to customize your Kubernetes OpenSearch cluster, including data persistence, authentication methods, and scaling, see the [OpenSearch Kubernetes Operator User Guide](https://github.com/Opster/opensearch-k8s-operator/blob/main/docs/userguide/main.md). -If you want to contribute to the development of the OpenSearch Kubernetes Operator, see the repo [design documents](https://github.com/Opster/opensearch-k8s-operator/blob/main/docs/designs/high-level.md). \ No newline at end of file +If you want to contribute to the development of the OpenSearch Kubernetes Operator, see the repo [design documents](https://github.com/Opster/opensearch-k8s-operator/blob/main/docs/designs/high-level.md). diff --git a/_tuning-your-cluster/availability-and-recovery/search-backpressure.md b/_tuning-your-cluster/availability-and-recovery/search-backpressure.md index 42cef0a530..29982247a7 100644 --- a/_tuning-your-cluster/availability-and-recovery/search-backpressure.md +++ b/_tuning-your-cluster/availability-and-recovery/search-backpressure.md @@ -102,23 +102,23 @@ search_backpressure.node_duress.num_successive_breaches | 3 | The number of succ search_backpressure.node_duress.cpu_threshold | 90% | The CPU usage threshold (as a percentage) required for a node to be considered to be under duress. search_backpressure.node_duress.heap_threshold | 70% | The heap usage threshold (as a percentage) required for a node to be considered to be under duress. search_backpressure.search_task.elapsed_time_millis_threshold | 45,000 | The elapsed time threshold (in milliseconds) required for an individual parent task before it is considered for cancellation. -search_backpressure.search_task.cancellation_ratio | 0.1 | The maximum number of search tasks to cancel, as a percentage of successful search task completions. -search_backpressure.search_task.cancellation_rate| 0.003 | The maximum number of search tasks to cancel per millisecond of elapsed time. -search_backpressure.search_task.cancellation_burst | 5 | The maximum number of search tasks to cancel in a single iteration of the observer thread. -search_backpressure.search_task.heap_percent_threshold | 2% | The heap usage threshold (as a percentage) required for an individual parent task before it is considered for cancellation. -search_backpressure.search_task.total_heap_percent_threshold | 5% | The heap usage threshold (as a percentage) required for the sum of heap usages of all search tasks before cancellation is applied. -search_backpressure.search_task.heap_variance | 2.0 | The heap usage variance required for an individual parent task before it is considered for cancellation. A task is considered for cancellation when `taskHeapUsage` is greater than or equal to `heapUsageMovingAverage` * `variance`. -search_backpressure.search_task.heap_moving_average_window_size | 10 | The window size used to calculate the rolling average of the heap usage for the completed parent tasks. -search_backpressure.search_task.cpu_time_millis_threshold | 30,000 | The CPU usage threshold (in milliseconds) required for an individual parent task before it is considered for cancellation. -search_backpressure.search_shard_task.elapsed_time_millis_threshold | 30,000 | The elapsed time threshold (in milliseconds) required for a single search shard task before it is considered for cancellation. -search_backpressure.search_shard_task.cancellation_ratio | 0.1 | The maximum number of search shard tasks to cancel, as a percentage of successful search shard task completions. -search_backpressure.search_shard_task.cancellation_rate | 0.003 | The maximum number of search shard tasks to cancel per millisecond of elapsed time. -search_backpressure.search_shard_task.cancellation_burst | 10 | The maximum number of search shard tasks to cancel in a single iteration of the observer thread. -search_backpressure.search_shard_task.heap_percent_threshold | 0.5% | The heap usage threshold (as a percentage) required for a single search shard task before it is considered for cancellation. -search_backpressure.search_shard_task.total_heap_percent_threshold | 5% | The heap usage threshold (as a percentage) required for the sum of heap usages of all search shard tasks before cancellation is applied. -search_backpressure.search_shard_task.heap_variance | 2.0 | The minimum variance required for a single search shard task's heap usage compared to the rolling average of previously completed tasks before it is considered for cancellation. -search_backpressure.search_shard_task.heap_moving_average_window_size | 100 | The number of previously completed search shard tasks to consider when calculating the rolling average of heap usage. -search_backpressure.search_shard_task.cpu_time_millis_threshold | 15,000 | The CPU usage threshold (in milliseconds) required for a single search shard task before it is considered for cancellation. +search_backpressure.search_task.cancellation_ratio | 0.1 | The maximum number of search tasks to cancel, as a percentage of successful search task completions. The value range is (0, 1]. +search_backpressure.search_task.cancellation_rate| 0.003 | The maximum number of search tasks to cancel per millisecond of elapsed time. The value must be greater than 0. +search_backpressure.search_task.cancellation_burst | 5 | The maximum number of search tasks to cancel in a single iteration of the observer thread. The value must be greater than or equal to 1. +search_backpressure.search_task.heap_percent_threshold | 2% | The heap usage threshold (as a percentage) required for an individual parent task before it is considered for cancellation. The value range is [0%, 100%]. +search_backpressure.search_task.total_heap_percent_threshold | 5% | The heap usage threshold (as a percentage) required for the sum of heap usages of all search tasks before cancellation is applied. The value range is [0%, 100%]. +search_backpressure.search_task.heap_variance | 2.0 | The heap usage variance required for an individual parent task before it is considered for cancellation. A task is considered for cancellation when `taskHeapUsage` is greater than or equal to `heapUsageMovingAverage` * `variance`. The value must be greater than or equal to 0. +search_backpressure.search_task.heap_moving_average_window_size | 10 | The window size used to calculate the rolling average of the heap usage for the completed parent tasks. The value must be greater than or equal to 0. +search_backpressure.search_task.cpu_time_millis_threshold | 30,000 | The CPU usage threshold (in milliseconds) required for an individual parent task before it is considered for cancellation. The value must be greater than or equal to 0. +search_backpressure.search_shard_task.elapsed_time_millis_threshold | 30,000 | The elapsed time threshold (in milliseconds) required for a single search shard task before it is considered for cancellation. The value must be greater than or equal to 0. +search_backpressure.search_shard_task.cancellation_ratio | 0.1 | The maximum number of search shard tasks to cancel, as a percentage of successful search shard task completions. The value range is (0, 1]. +search_backpressure.search_shard_task.cancellation_rate | 0.003 | The maximum number of search shard tasks to cancel per millisecond of elapsed time. The value must be greater than 0. +search_backpressure.search_shard_task.cancellation_burst | 10 | The maximum number of search shard tasks to cancel in a single iteration of the observer thread. The value must be greater than or equal to 1. +search_backpressure.search_shard_task.heap_percent_threshold | 0.5% | The heap usage threshold (as a percentage) required for a single search shard task before it is considered for cancellation. The value range is [0%, 100%]. +search_backpressure.search_shard_task.total_heap_percent_threshold | 5% | The heap usage threshold (as a percentage) required for the sum of heap usages of all search shard tasks before cancellation is applied. The value range is [0%, 100%]. +search_backpressure.search_shard_task.heap_variance | 2.0 | The minimum variance required for a single search shard task's heap usage compared to the rolling average of previously completed tasks before it is considered for cancellation. The value must be greater than or equal to 0. +search_backpressure.search_shard_task.heap_moving_average_window_size | 100 | The number of previously completed search shard tasks to consider when calculating the rolling average of heap usage. The value must be greater than or equal to 0. +search_backpressure.search_shard_task.cpu_time_millis_threshold | 15,000 | The CPU usage threshold (in milliseconds) required for a single search shard task before it is considered for cancellation. The value must be greater than or equal to 0. ## Search Backpressure Stats API Introduced 2.4 diff --git a/_tuning-your-cluster/availability-and-recovery/snapshots/searchable_snapshot.md b/_tuning-your-cluster/availability-and-recovery/snapshots/searchable_snapshot.md index d13955f3f0..7076c792e2 100644 --- a/_tuning-your-cluster/availability-and-recovery/snapshots/searchable_snapshot.md +++ b/_tuning-your-cluster/availability-and-recovery/snapshots/searchable_snapshot.md @@ -46,7 +46,7 @@ services: - node.search.cache.size=50gb ``` - +- Starting with version 2.18, k-NN indexes support searchable snapshots for the NMSLIB and Faiss engines. ## Create a searchable snapshot index @@ -109,4 +109,3 @@ The following are known limitations of the searchable snapshots feature: - Searching remote data can impact the performance of other queries running on the same node. We recommend that users provision dedicated nodes with the `search` role for performance-critical applications. - For better search performance, consider [force merging]({{site.url}}{{site.baseurl}}/api-reference/index-apis/force-merge/) indexes into a smaller number of segments before taking a snapshot. For the best performance, at the cost of using compute resources prior to snapshotting, force merge your index into one segment. - We recommend configuring a maximum ratio of remote data to local disk cache size using the `cluster.filecache.remote_data_ratio` setting. A ratio of 5 is a good starting point for most workloads to ensure good query performance. If the ratio is too large, then there may not be sufficient disk space to handle the search workload. For more details on the maximum ratio of remote data, see issue [#11676](https://github.com/opensearch-project/OpenSearch/issues/11676). -- k-NN native-engine-based indexes using `faiss` and `nmslib` engines are incompatible with searchable snapshots. diff --git a/_tuning-your-cluster/availability-and-recovery/snapshots/snapshot-restore.md b/_tuning-your-cluster/availability-and-recovery/snapshots/snapshot-restore.md index 812d5104c7..ac717633f6 100644 --- a/_tuning-your-cluster/availability-and-recovery/snapshots/snapshot-restore.md +++ b/_tuning-your-cluster/availability-and-recovery/snapshots/snapshot-restore.md @@ -110,6 +110,20 @@ You will most likely not need to specify any parameters except for `location`. F sudo ./bin/opensearch-keystore add s3.client.default.secret_key ``` +1. (Optional) If you're using a custom S3 endpoint (for example, MinIO), disable the Amazon EC2 metadata connection: + + ```bash + export AWS_EC2_METADATA_DISABLED=true + ``` + + If you're installing OpenSearch using Helm, update the following settings in your values file: + + ```yml + extraEnvs: + - name: AWS_EC2_METADATA_DISABLED + value: "true" + ``` + 1. (Optional) If you're using temporary credentials, add your session token: ```bash @@ -479,15 +493,19 @@ Request parameters | Description `include_global_state` | Whether to restore the cluster state. Default is `false`. `include_aliases` | Whether to restore aliases alongside their associated indexes. Default is `true`. `partial` | Whether to allow the restoration of partial snapshots. Default is `false`. -`rename_pattern` | If you want to rename indexes as you restore them, use this option to specify a regular expression that matches all indexes you want to restore. Use capture groups (`()`) to reuse portions of the index name. -`rename_replacement` | If you want to rename indexes as you restore them, use this option to specify the replacement pattern. Use `$0` to include the entire matching index name, `$1` to include the content of the first capture group, and so on. +`rename_pattern` | If you want to rename indexes, use this option to specify a regular expression that matches all the indexes that you want to restore and rename. Use capture groups (`()`) to reuse portions of the index name. +`rename_replacement` | If you want to rename indexes, use this option to specify the name replacement pattern. Use `$0` to include the entire matching index name or the number of the capture group. For example, `$1` would include the content of the first capture group. +`rename_alias_pattern` | If you want to rename aliases, use this option to specify a regular expression that matches all the aliases you want to restore and rename. Use capture groups (`()`) to reuse portions of the alias name. +`rename_alias_replacement` | If you want to rename aliases, use this option to specify the name replacement pattern. Use `$0` to include the entire matching alias name or the number of the capture group. For example, `$1` would include the content of the first capture group. `index_settings` | If you want to change [index settings]({{site.url}}{{site.baseurl}}/im-plugin/index-settings/) applied during the restore operation, specify them here. You cannot change `index.number_of_shards`. `ignore_index_settings` | Rather than explicitly specifying new settings with `index_settings`, you can ignore certain index settings in the snapshot and use the cluster defaults applied during restore. You cannot ignore `index.number_of_shards`, `index.number_of_replicas`, or `index.auto_expand_replicas`. `storage_type` | `local` indicates that all snapshot metadata and index data will be downloaded to local storage. <br /><br > `remote_snapshot` indicates that snapshot metadata will be downloaded to the cluster, but the remote repository will remain the authoritative store of the index data. Data will be downloaded and cached as necessary to service queries. At least one node in the cluster must be configured with the [search role]({{site.url}}{{site.baseurl}}/security/access-control/users-roles/) in order to restore a snapshot using the type `remote_snapshot`. <br /><br > Defaults to `local`. ### Conflicts and compatibility -One way to avoid naming conflicts when restoring indexes is to use the `rename_pattern` and `rename_replacement` options. You can then, if necessary, use the `_reindex` API to combine the two. The simpler way is to delete existing indexes prior to restoring from a snapshot. +One way to avoid index naming conflicts when restoring indexes is to use the `rename_pattern` and `rename_replacement` options. You can then, if necessary, use the `_reindex` API to combine the two. However, it may be simpler to delete the indexes that caused the conflict prior to restoring them from a snapshot. + +Similarly, to avoid alias naming conflicts when restoring indexes with aliases, you can use the `rename_alias_pattern` and `rename_alias_replacement` options. You can use the `_close` API to close existing indexes prior to restoring from a snapshot, but the index in the snapshot has to have the same number of shards as the existing index. diff --git a/_tuning-your-cluster/availability-and-recovery/workload-management/query-group-lifecycle-api.md b/_tuning-your-cluster/availability-and-recovery/workload-management/query-group-lifecycle-api.md new file mode 100644 index 0000000000..0fb0b0b65c --- /dev/null +++ b/_tuning-your-cluster/availability-and-recovery/workload-management/query-group-lifecycle-api.md @@ -0,0 +1,125 @@ +--- +layout: default +title: Query Group Lifecycle API +nav_order: 20 +parent: Workload management +grand_parent: Availability and recovery +--- + +# Query Group Lifecycle API + +The Query Group Lifecycle API creates, updates, retrieves, and deletes query groups. The API categorizes queries into specific groups, called _query groups_, based on desired resource limits. + +## Path and HTTP methods + +```json +PUT _wlm/query_group +PUT _wlm/query_group/<name> +GET _wlm/query_group +GET _wlm/query_group/<name> +DELETE _wlm/query_group/<name> +``` + +## Request body fields + +| Field | Description | +| :--- | :--- | +| `_id` | The ID of the query group, which can be used to associate query requests with the group and enforce the group's resource limits. | +| `name` | The name of the query group. | +| `resiliency_mode` | The resiliency mode of the query group. Valid modes are `enforced`, `soft`, and `monitor`. For more information about resiliency modes, see [Operating modes](https://opensearch.org/docs/latest/tuning-your-cluster/availability-and-recovery/workload-management/wlm-feature-overview/#operating-modes). | +| `resource_limits` | The resource limits for query requests in the query group. Valid resources are `cpu` and `memory`. | + +When creating a query group, make sure that the sum of the resource limits for a single resource, either `cpu` or `memory`, does not exceed 1. + +## Example requests + +The following example requests show how to use the Query Group Lifecycle API. + +### Creating a query group + +```json +PUT _wlm/query_group +{ + "name": "analytics", + "resiliency_mode": "enforced", + "resource_limits": { + "cpu": 0.4, + "memory": 0.2 + } +} +``` +{% include copy-curl.html %} + +### Updating a query group + +```json +PUT _wlm/query_group/analytics +{ + "resiliency_mode": "monitor", + "resource_limits": { + "cpu": 0.41, + "memory": 0.21 + } +} +``` +{% include copy-curl.html %} + +### Getting a query group + +```json +GET _wlm/query_group/analytics +``` +{% include copy-curl.html %} + +### Deleting a query group + +```json +DELETE _wlm/query_group/analytics +``` +{% include copy-curl.html %} + +## Example responses + +OpenSearch returns responses similar to the following. + +### Creating a query group + +```json +{ + "_id":"preXpc67RbKKeCyka72_Gw", + "name":"analytics", + "resiliency_mode":"enforced", + "resource_limits":{ + "cpu":0.4, + "memory":0.2 + }, + "updated_at":1726270184642 +} +``` + +### Updating a query group + +```json +{ + "_id":"preXpc67RbKKeCyka72_Gw", + "name":"analytics", + "resiliency_mode":"monitor", + "resource_limits":{ + "cpu":0.41, + "memory":0.21 + }, + "updated_at":1726270333804 +} +``` + +## Response body fields + +| Field | Description | +| :--- | :--- | +| `_id` | The ID of the query group. | +| `name` | The name of the query group. Required when creating a new query group. | +| `resiliency_mode` | The resiliency mode of the query group. | +| `resource_limits` | The resource limits of the query group. | +| `updated_at` | The time at which the query group was last updated. | + + diff --git a/_tuning-your-cluster/availability-and-recovery/workload-management/wlm-feature-overview.md b/_tuning-your-cluster/availability-and-recovery/workload-management/wlm-feature-overview.md new file mode 100644 index 0000000000..956a01a774 --- /dev/null +++ b/_tuning-your-cluster/availability-and-recovery/workload-management/wlm-feature-overview.md @@ -0,0 +1,194 @@ +--- +layout: default +title: Workload management +nav_order: 70 +has_children: true +parent: Availability and recovery +--- + +Introduced 2.18 +{: .label .label-purple } + +# Workload management + +Workload management allows you to group search traffic and isolate network resources, preventing the overuse of network resources by specific requests. It offers the following benefits: + +- Tenant-level admission control and reactive query management. When resource usage exceeds configured limits, it automatically identifies and cancels demanding queries, ensuring fair resource distribution. + +- Tenant-level isolation within the cluster for search workloads, operating at the node level. + +## Installing workload management + +To install workload management, use the following command: + +```json +./bin/opensearch-plugin install workload-management +``` +{% include copy-curl.html %} + +## Query groups + +A _query group_ is a logical grouping of tasks with defined resource limits. System administrators can dynamically manage query groups using the Workload Management APIs. These query groups can be used to create search requests with resource limits. + +### Permissions + +Only users with administrator-level permissions can create and update query groups using the Workload Management APIs. + +### Operating modes + +The following operating modes determine the operating level for a query group: + +- **Disabled mode**: Workload management is disabled. + +- **Enabled mode**: Workload management is enabled and will cancel and reject queries once the query group's configured thresholds are reached. + +- **Monitor_only mode** (Default): Workload management will monitor tasks but will not cancel or reject any queries. + +### Example request + +The following example request adds a query group named `analytics`: + +```json +PUT _wlm/query_group +{ + “name”: “analytics”, + “resiliency_mode”: “enforced”, + “resource_limits”: { + “cpu”: 0.4, + “memory”: 0.2 + } +} +``` +{% include copy-curl.html %} + +When creating a query group, make sure that the sum of the resource limits for a single resource, such as `cpu` or `memory`, does not exceed `1`. + +### Example response + +OpenSearch responds with the set resource limits and the `_id` for the query group: + +```json +{ + "_id":"preXpc67RbKKeCyka72_Gw", + "name":"analytics", + "resiliency_mode":"enforced", + "resource_limits":{ + "cpu":0.4, + "memory":0.2 + }, + "updated_at":1726270184642 +} +``` + +## Using `queryGroupID` + +You can associate a query request with a `queryGroupID` to manage and allocate resources within the limits defined by the query group. By using this ID, request routing and tracking are associated with the query group, ensuring resource quotas and task limits are maintained. + +The following example query uses the `queryGroupId` to ensure that the query does not exceed that query group's resource limits: + +```json +GET testindex/_search +Host: localhost:9200 +Content-Type: application/json +queryGroupId: preXpc67RbKKeCyka72_Gw +{ + "query": { + "match": { + "field_name": "value" + } + } +} +``` +{% include copy-curl.html %} + +## Workload management settings + +The following settings can be used to customize workload management using the `_cluster/settings` API. + +| **Setting name** | **Description** | +| :--- | :--- | +| `wlm.query_group.duress_streak` | Determines the node duress threshold. Once the threshold is reached, the node is marked as `in duress`. | +| `wlm.query_group.enforcement_interval` | Defines the monitoring interval. | +| `wlm.query_group.mode` | Defines the [operating mode](#operating-modes). | +| `wlm.query_group.node.memory_rejection_threshold` | Defines the query group level `memory` threshold. When the threshold is reached, the request is rejected. | +| `wlm.query_group.node.cpu_rejection_threshold` | Defines the query group level `cpu` threshold. When the threshold is reached, the request is rejected. | +| `wlm.query_group.node.memory_cancellation_threshold` | Controls whether the node is considered to be in duress when the `memory` threshold is reached. Requests routed to nodes in duress are canceled. | +| `wlm.query_group.node.cpu_cancellation_threshold` | Controls whether the node is considered to be in duress when the `cpu` threshold is reached. Requests routed to nodes in duress are canceled. | + +When setting rejection and cancellation thresholds, remember that the rejection threshold for a resource should always be lower than the cancellation threshold. + +## Workload Management Stats API + +The Workload Management Stats API returns workload management metrics for a query group, using the following method: + +```json +GET _wlm/stats +``` +{% include copy-curl.html %} + +### Example response + +```json +{ + “_nodes”: { + “total”: 1, + “successful”: 1, + “failed”: 0 + }, + “cluster_name”: “XXXXXXYYYYYYYY”, + “A3L9EfBIQf2anrrUhh_goA”: { + “query_groups”: { + “16YGxFlPRdqIO7K4EACJlw”: { + “total_completions”: 33570, + “total_rejections”: 0, + “total_cancellations”: 0, + “cpu”: { + “current_usage”: 0.03319935314357281, + “cancellations”: 0, + “rejections”: 0 + }, + “memory”: { + “current_usage”: 0.002306486276211217, + “cancellations”: 0, + “rejections”: 0 + } + }, + “DEFAULT_QUERY_GROUP”: { + “total_completions”: 42572, + “total_rejections”: 0, + “total_cancellations”: 0, + “cpu”: { + “current_usage”: 0, + “cancellations”: 0, + “rejections”: 0 + }, + “memory”: { + “current_usage”: 0, + “cancellations”: 0, + “rejections”: 0 + } + } + } + } +} +``` +{% include copy-curl.html %} + +### Response body fields + +| Field name | Description | +| :--- | :--- | +| `total_completions` | The total number of request completions in the `query_group` at the given node. This includes all shard-level and coordinator-level requests. | +| `total_rejections` | The total number request rejections in the `query_group` at the given node. This includes all shard-level and coordinator-level requests. | +| `total_cancellations` | The total number of cancellations in the `query_group` at the given node. This includes all shard-level and coordinator-level requests. | +| `cpu` | The `cpu` resource type statistics for the `query_group`. | +| `memory` | The `memory` resource type statistics for the `query_group`. | + +### Resource type statistics + +| Field name | Description | +| :--- | :---- | +| `current_usage` |The resource usage for the `query_group` at the given node based on the last run of the monitoring thread. This value is updated based on the `wlm.query_group.enforcement_interval`. | +| `cancellations` | The number of cancellations resulting from the cancellation threshold being reached. | +| `rejections` | The number of rejections resulting from the cancellation threshold being reached. | + diff --git a/_tuning-your-cluster/index.md b/_tuning-your-cluster/index.md index fa0973395f..f434c2b5ec 100644 --- a/_tuning-your-cluster/index.md +++ b/_tuning-your-cluster/index.md @@ -20,7 +20,7 @@ To create and deploy an OpenSearch cluster according to your requirements, it’ There are many ways to design a cluster. The following illustration shows a basic architecture that includes a four-node cluster that has one dedicated cluster manager node, one dedicated coordinating node, and two data nodes that are cluster manager eligible and also used for ingesting data. - The nomenclature for the cluster manager node is now referred to as the cluster manager node. + The master node is now referred to as the cluster manager node. {: .note } ![multi-node cluster architecture diagram]({{site.url}}{{site.baseurl}}/images/cluster.png) diff --git a/_tuning-your-cluster/performance.md b/_tuning-your-cluster/performance.md index 28f47aeacb..b5066a890c 100644 --- a/_tuning-your-cluster/performance.md +++ b/_tuning-your-cluster/performance.md @@ -32,12 +32,9 @@ An increased `index.translog.flush_threshold_size` can also increase the time th Before increasing `index.translog.flush_threshold_size`, call the following API operation to get current flush operation statistics: ```json -curl -XPOST "os-endpoint/index-name/_stats/flush?pretty" +GET /<index>/_stats/flush?pretty ``` -{% include copy.html %} - - -Replace the `os-endpoint` and `index-name` with your endpoint and index name. +{% include copy-curl.html %} In the output, note the number of flushes and the total time. The following example output shows that there are 124 flushes, which took 17,690 milliseconds: @@ -53,9 +50,15 @@ In the output, note the number of flushes and the total time. The following exam To increase the flush threshold size, call the following API operation: ```json -curl -XPUT "os-endpoint/index-name/_settings?pretty" -d "{"index":{"translog.flush_threshold_size" : "1024MB"}}" +PUT /<index>/_settings +{ + "index": + { + "translog.flush_threshold_size" : "1024MB" + } +} ``` -{% include copy.html %} +{% include copy-curl.html %} In this example, the flush threshold size is set to 1024 MB, which is ideal for instances that have more than 32 GB of memory. @@ -65,9 +68,9 @@ Choose the appropriate threshold size for your cluster. Run the stats API operation again to see whether the flush activity changed: ```json -curl -XGET "os-endpoint/index-name/_stats/flush?pretty" +GET /<index>/_stats/flush ``` -{% include copy.html %} +{% include copy-curl.html %} It's a best practice to increase the `index.translog.flush_threshold_size` only for the current index. After you confirm the outcome, apply the changes to the index template. {: .note} @@ -127,14 +130,14 @@ To reduce the size of the OpenSearch response, use the `filter_path` parameter t In the following example, the `index-name`, `type-name`, and `took` fields are excluded from the response: ```json -curl -XPOST "es-endpoint/index-name/type-name/_bulk?pretty&filter_path=-took,-items.index._index,-items.index._type" -H 'Content-Type: application/json' -d' +POST /_bulk?pretty&filter_path=-took,-items.index._index,-items.index._type { "index" : { "_index" : "test2", "_id" : "1" } } { "user" : "testuser" } { "update" : {"_id" : "1", "_index" : "test2"} } { "doc" : {"user" : "example"} } ``` -{% include copy.html %} +{% include copy-curl.html %} ## Compression codecs -In OpenSearch 2.9 and later, there are two new codecs for compression: `zstd` and `zstd_no_dict`. You can optionally specify a compression level for these in the `index.codec.compression_level` setting with values in the [1, 6] range. [Benchmark]({{site.url}}{{site.baseurl}}/im-plugin/index-codecs/#benchmarking) data shows that `zstd` provides a 7% better write throughput and `zstd_no_dict` provides a 14% better throughput, along with a 30% improvement in storage compared with the `default` codec. For more information about compression, see [Index codecs]({{site.url}}{{site.baseurl}}/im-plugin/index-codecs/). \ No newline at end of file +In OpenSearch 2.9 and later, there are two new codecs for compression: `zstd` and `zstd_no_dict`. You can optionally specify a compression level for these in the `index.codec.compression_level` setting with values in the [1, 6] range. [Benchmark]({{site.url}}{{site.baseurl}}/im-plugin/index-codecs/#benchmarking) data shows that `zstd` provides a 7% better write throughput and `zstd_no_dict` provides a 14% better throughput, along with a 30% improvement in storage compared with the `default` codec. For more information about compression, see [Index codecs]({{site.url}}{{site.baseurl}}/im-plugin/index-codecs/). diff --git a/_upgrade-to/index.md b/_upgrade-to/index.md index 0eea3d6209..696be88c21 100644 --- a/_upgrade-to/index.md +++ b/_upgrade-to/index.md @@ -1,6 +1,6 @@ --- layout: default -title: About the migration process +title: Upgrading OpenSearch nav_order: 1 nav_exclude: true permalink: /upgrade-to/ @@ -8,15 +8,14 @@ redirect_from: - /upgrade-to/index/ --- -# About the migration process +# Upgrading OpenSearch -The process of migrating from Elasticsearch OSS to OpenSearch varies depending on your current version of Elasticsearch OSS, installation type, tolerance for downtime, and cost-sensitivity. Rather than concrete steps to cover every situation, we have general guidance for the process. +The process of upgrading your OpenSearch version varies depending on your current version of OpenSearch, installation type, tolerance for downtime, and cost-sensitivity. For migrating to OpenSearch, we provide a [Migration Assistant]({{site.url}}{{site.baseurl}}/migration-assistant/). -Three approaches exist: +Two upgrade approaches exists: -- Use a snapshot to [migrate your Elasticsearch OSS data]({{site.url}}{{site.baseurl}}/upgrade-to/snapshot-migrate/) to a new OpenSearch cluster. This method may incur downtime. -- Perform a [restart upgrade or a rolling upgrade]({{site.url}}{{site.baseurl}}/upgrade-to/upgrade-to/) on your existing nodes. A restart upgrade involves upgrading the entire cluster and restarting it, whereas a rolling upgrade requires upgrading and restarting nodes in the cluster one by one. -- Replace existing Elasticsearch OSS nodes with new OpenSearch nodes. Node replacement is most popular when upgrading [Docker clusters]({{site.url}}{{site.baseurl}}/upgrade-to/docker-upgrade-to/). +- Perform a [restart upgrade or a rolling upgrade]({{site.url}}{{site.baseurl}}/upgrade-to/snapshot-migrate/) on your existing nodes. A restart upgrade involves upgrading the entire cluster and restarting it, whereas a rolling upgrade requires upgrading and restarting nodes in the cluster one by one. +- Replace existing OpenSearch nodes with new OpenSearch nodes. Node replacement is most popular when upgrading [Docker clusters]({{site.url}}{{site.baseurl}}/upgrade-to/docker-upgrade-to/). Regardless of your approach, to safeguard against data loss, we recommend that you take a [snapshot]({{site.url}}{{site.baseurl}}/opensearch/snapshots/snapshot-restore) of all indexes prior to any migration. diff --git a/_upgrade-to/upgrade-to.md b/_upgrade-to/upgrade-to.md index 340055b214..00950687a5 100644 --- a/_upgrade-to/upgrade-to.md +++ b/_upgrade-to/upgrade-to.md @@ -6,6 +6,10 @@ nav_order: 15 # Migrating from Elasticsearch OSS to OpenSearch + +OpenSearch provides a [Migration Assistant]({{site.url}}{{site.baseurl}}/migration-assistant/) to assist you in migrating from other search solutions. +{: .warning} + If you want to migrate from an existing Elasticsearch OSS cluster to OpenSearch and find the [snapshot approach]({{site.url}}{{site.baseurl}}/upgrade-to/snapshot-migrate/) unappealing, you can migrate your existing nodes from Elasticsearch OSS to OpenSearch. If your existing cluster runs an older version of Elasticsearch OSS, the first step is to upgrade to version 6.x or 7.x. diff --git a/assets/js/copy-button.js b/assets/js/copy-button.js index c26cedfd1c..cb784f07d7 100644 --- a/assets/js/copy-button.js +++ b/assets/js/copy-button.js @@ -62,7 +62,7 @@ function addCurl(textToCopy) { result += path + "\""; if (body.length > 0) { - result += " -H 'Content-Type: application/json' -d'\n" + body + "'"; + result += " -H 'Content-Type: application/json' -d'\n" + body + "\n'"; } return result; diff --git a/images/dashboards-assistant/alert-insight-insight.png b/images/dashboards-assistant/alert-insight-insight.png new file mode 100644 index 0000000000..3d65276d42 Binary files /dev/null and b/images/dashboards-assistant/alert-insight-insight.png differ diff --git a/images/dashboards-assistant/alert-insight-start.png b/images/dashboards-assistant/alert-insight-start.png new file mode 100644 index 0000000000..b55b6296bc Binary files /dev/null and b/images/dashboards-assistant/alert-insight-start.png differ diff --git a/images/dashboards-assistant/alert-insight-summary.png b/images/dashboards-assistant/alert-insight-summary.png new file mode 100644 index 0000000000..1e98170cda Binary files /dev/null and b/images/dashboards-assistant/alert-insight-summary.png differ diff --git a/images/dashboards-assistant/data-summary.png b/images/dashboards-assistant/data-summary.png new file mode 100644 index 0000000000..dc2e4e22f0 Binary files /dev/null and b/images/dashboards-assistant/data-summary.png differ diff --git a/images/dashboards-assistant/info-icon.png b/images/dashboards-assistant/info-icon.png new file mode 100644 index 0000000000..29e6b7b97b Binary files /dev/null and b/images/dashboards-assistant/info-icon.png differ diff --git a/images/dashboards-assistant/sparkle-icon.png b/images/dashboards-assistant/sparkle-icon.png new file mode 100644 index 0000000000..04b6d2b876 Binary files /dev/null and b/images/dashboards-assistant/sparkle-icon.png differ diff --git a/images/dashboards-assistant/suggestAD-UI.png b/images/dashboards-assistant/suggestAD-UI.png new file mode 100644 index 0000000000..dd7e32d6e2 Binary files /dev/null and b/images/dashboards-assistant/suggestAD-UI.png differ diff --git a/images/dashboards-assistant/suggestAD-button.png b/images/dashboards-assistant/suggestAD-button.png new file mode 100644 index 0000000000..a87fe862fe Binary files /dev/null and b/images/dashboards-assistant/suggestAD-button.png differ diff --git a/images/dashboards-assistant/t2viz-ask-question.png b/images/dashboards-assistant/t2viz-ask-question.png new file mode 100644 index 0000000000..e5b7e86f64 Binary files /dev/null and b/images/dashboards-assistant/t2viz-ask-question.png differ diff --git a/images/dashboards-assistant/t2viz-edit-visual-response.png b/images/dashboards-assistant/t2viz-edit-visual-response.png new file mode 100644 index 0000000000..1fd35425a7 Binary files /dev/null and b/images/dashboards-assistant/t2viz-edit-visual-response.png differ diff --git a/images/dashboards-assistant/t2viz-edit-visual.png b/images/dashboards-assistant/t2viz-edit-visual.png new file mode 100644 index 0000000000..0c57dd58aa Binary files /dev/null and b/images/dashboards-assistant/t2viz-edit-visual.png differ diff --git a/images/dashboards-assistant/t2viz-select-data-source.png b/images/dashboards-assistant/t2viz-select-data-source.png new file mode 100644 index 0000000000..172e136a5b Binary files /dev/null and b/images/dashboards-assistant/t2viz-select-data-source.png differ diff --git a/images/dashboards-assistant/t2viz-start.png b/images/dashboards-assistant/t2viz-start.png new file mode 100644 index 0000000000..f6d46a21e5 Binary files /dev/null and b/images/dashboards-assistant/t2viz-start.png differ diff --git a/images/migrations/migration-architecture-overview.svg b/images/migrations/migration-architecture-overview.svg new file mode 100644 index 0000000000..cf758653aa --- /dev/null +++ b/images/migrations/migration-architecture-overview.svg @@ -0,0 +1,2 @@ +<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"> +<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1" width="5002px" height="4897px" viewBox="-0.5 -0.5 5002 4897"><defs><linearGradient x1="0%" y1="100%" x2="0%" y2="0%" id="mx-gradient-f34482-1-bc1356-1-s-0"><stop offset="0%" style="stop-color:#BC1356"/><stop offset="100%" style="stop-color:#F34482"/></linearGradient><linearGradient x1="0%" y1="100%" x2="0%" y2="0%" id="mx-gradient-f78e04-1-d05c17-1-s-0"><stop offset="0%" style="stop-color:#D05C17"/><stop offset="100%" style="stop-color:#F78E04"/></linearGradient><linearGradient x1="0%" y1="100%" x2="0%" y2="0%" id="mx-gradient-60a337-1-277116-1-s-0"><stop offset="0%" style="stop-color:#277116"/><stop offset="100%" style="stop-color:#60A337"/></linearGradient></defs><g><rect x="118" y="1" width="1104" height="934" fill="none" stroke="#000000" stroke-width="2" pointer-events="none"/><rect x="129" y="252.5" width="1069" height="462" fill="none" stroke="#d79b00" stroke-opacity="0.4" stroke-width="2" pointer-events="none"/><rect x="129" y="252.5" width="1069" height="462" fill="none" stroke="#d79b00" stroke-opacity="0.4" stroke-width="2" pointer-events="none"/><g transform="translate(662.5,469.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="1" height="27" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; width: 1px; white-space: nowrap; overflow-wrap: normal; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;white-space:normal;"><b><br /><br /></b></div></div></foreignObject><text x="0" y="20" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial"><b><br><br></b></text></switch></g><path d="M 220 445 L 220 475 L 402 475 L 402 501.53" fill="none" stroke="#000000" stroke-width="2" stroke-miterlimit="10" pointer-events="none"/><path d="M 398 495.76 L 402 503.76 L 406 495.76" fill="none" stroke="#000000" stroke-width="2" stroke-miterlimit="10" pointer-events="none"/><rect x="150" y="304.5" width="140" height="140" fill="none" stroke="#82b366" stroke-opacity="0.4" stroke-width="2" stroke-dasharray="6 6" pointer-events="none"/><rect x="150" y="304.5" width="140" height="140" fill="none" stroke="#82b366" stroke-opacity="0.4" stroke-width="2" stroke-dasharray="6 6" pointer-events="none"/><g transform="translate(154.5,309.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="129" height="128" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; width: 129px; white-space: nowrap; overflow-wrap: normal; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;white-space:normal;"><b>Source Cluster Snapshot<br /><br /><br /><br /><br /><br /><br /><br /></b></div></div></foreignObject><text x="65" y="70" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><rect x="496" y="73" width="341" height="140" fill="none" stroke="#3333ff" stroke-width="2" stroke-dasharray="6 6" pointer-events="none"/><g transform="translate(666.5,86.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="1" height="113" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; width: 1px; white-space: nowrap; overflow-wrap: normal; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;white-space:normal;"><br /><br /><br /><br /><br /><br /><br /><br /></div></div></foreignObject><text x="0" y="63" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial"><br><br><br><br><br><br><br><br></text></switch></g><image x="641.5" y="106.5" width="64" height="64" xlink:href="" preserveAspectRatio="none" pointer-events="none"/><g transform="translate(564.5,178.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="218" height="41" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><div><font>Amazon OpenSearch or</font></div><div><font>Elasticsearch/OpenSearch self-managed</font></div><div><font><br /></font></div></div></div></foreignObject><text x="109" y="27" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><g transform="translate(613.5,81.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="114" height="17" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 14px; font-family: Arial; color: rgb(255, 255, 255); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: left;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><font color="#000000" style="font-size: 16px;"><b>Source Cluster</b><br /></font></div></div></foreignObject><text x="57" y="16" fill="#ffffff" text-anchor="middle" font-size="14px" font-family="Arial">[Not supported by viewer]</text></switch></g><g transform="translate(532.5,59.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="1" height="17" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 14px; font-family: Arial; color: rgb(255, 255, 255); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: left;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><div style="font-size: 16px;"></div><b style="font-size: 16px;"><font color="#1a1a1a" style="font-size: 16px;"><br /></font></b></div></div></foreignObject><text x="0" y="16" fill="#ffffff" text-anchor="middle" font-size="14px" font-family="Arial"><div style="font-size: 16px;"></div><b style="font-size: 16px;"><font color="#1a1a1a" style="font-size: 16px;"><br></font></b></text></switch></g><g transform="translate(4736.5,4878.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="26" height="15" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 14px; font-family: Arial; color: rgb(255, 255, 255); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: left;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;">Text</div></div></foreignObject><text x="13" y="15" fill="#ffffff" text-anchor="middle" font-size="14px" font-family="Arial">Text</text></switch></g><g transform="translate(4962.5,4879.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="26" height="15" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 14px; font-family: Arial; color: rgb(255, 255, 255); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: left;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;">Text</div></div></foreignObject><text x="13" y="15" fill="#ffffff" text-anchor="middle" font-size="14px" font-family="Arial">Text</text></switch></g><g transform="translate(712.5,140.5)rotate(90,0,7.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="1" height="15" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(255, 255, 255); line-height: 1.2; vertical-align: top; white-space: nowrap; font-weight: bold; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><font style="font-size: 14px;"><br /></font></div></div></foreignObject><text x="0" y="14" fill="#ffffff" text-anchor="middle" font-size="12px" font-family="Arial" font-weight="bold"><font style="font-size: 14px;"><br></font></text></switch></g><g transform="translate(847.5,195.5)rotate(90,0,7.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="1" height="15" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(255, 255, 255); line-height: 1.2; vertical-align: top; white-space: nowrap; font-weight: bold; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><font style="font-size: 14px;"><br /></font></div></div></foreignObject><text x="0" y="14" fill="#ffffff" text-anchor="middle" font-size="12px" font-family="Arial" font-weight="bold"><font style="font-size: 14px;"><br></font></text></switch></g><g transform="translate(4736.5,4878.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="26" height="15" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 14px; font-family: Arial; color: rgb(255, 255, 255); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: left;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;">Text</div></div></foreignObject><text x="13" y="15" fill="#ffffff" text-anchor="middle" font-size="14px" font-family="Arial">Text</text></switch></g><g transform="translate(846.5,162.5)rotate(90,0,7.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="1" height="15" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(255, 255, 255); line-height: 1.2; vertical-align: top; white-space: nowrap; font-weight: bold; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><font style="font-size: 14px;"><br /></font></div></div></foreignObject><text x="0" y="14" fill="#ffffff" text-anchor="middle" font-size="12px" font-family="Arial" font-weight="bold"><font style="font-size: 14px;"><br></font></text></switch></g><g transform="translate(4962.5,4879.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="26" height="15" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 14px; font-family: Arial; color: rgb(255, 255, 255); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: left;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;">Text</div></div></foreignObject><text x="13" y="15" fill="#ffffff" text-anchor="middle" font-size="14px" font-family="Arial">Text</text></switch></g><g transform="translate(144.5,258.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="180" height="17" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 14px; font-family: Arial; color: rgb(255, 255, 255); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: left;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><font color="#000000" style="font-size: 16px;"><b>Migration Infrastructure</b><br /></font></div></div></foreignObject><text x="90" y="16" fill="#ffffff" text-anchor="middle" font-size="14px" font-family="Arial">[Not supported by viewer]</text></switch></g><path d="M 108 116 L 403 116 L 403 336.53" fill="none" stroke="#000000" stroke-width="2" stroke-miterlimit="10" pointer-events="none"/><path d="M 399 330.76 L 403 338.76 L 407 330.76" fill="none" stroke="#000000" stroke-width="2" stroke-miterlimit="10" pointer-events="none"/><path d="M 108 88 L 492.53 88" fill="none" stroke="#000000" stroke-width="2" stroke-miterlimit="10" pointer-events="none"/><path d="M 486.76 92 L 494.76 88 L 486.76 84" fill="none" stroke="#000000" stroke-width="2" stroke-miterlimit="10" pointer-events="none"/><path d="M 44 104 L 44 102 L 45 102 L 45 101 L 8 101 L 8 922 L 670 922 L 670 904.47" fill="none" stroke="#000000" stroke-width="2" stroke-miterlimit="10" pointer-events="none"/><path d="M 674 910.24 L 670 902.24 L 666 910.24" fill="none" stroke="#000000" stroke-width="2" stroke-miterlimit="10" pointer-events="none"/><image x="43.5" y="71" width="64" height="64" xlink:href="" preserveAspectRatio="none" pointer-events="none"/><g transform="translate(42.5,143.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="66" height="12" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><i>Client Traffic</i></div></div></foreignObject><text x="33" y="12" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial"><i>Client Traffic</i></text></switch></g><ellipse cx="152.5" cy="85.5" rx="15" ry="15" fill="#000000" stroke="#000000" pointer-events="none"/><g transform="translate(147.5,75.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="10" height="20" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; width: 11px; white-space: nowrap; overflow-wrap: normal; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;white-space:normal;"><div style="font-size: 18px;"><font style="font-size: 18px;"><b><font color="#ffffff" style="font-size: 18px;">1</font></b></font></div></div></div></foreignObject><text x="5" y="16" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><ellipse cx="152.5" cy="120" rx="15" ry="15" fill="#000000" stroke="#000000" pointer-events="none"/><g transform="translate(147.5,109.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="10" height="20" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; width: 11px; white-space: nowrap; overflow-wrap: normal; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;white-space:normal;"><div style="font-size: 18px;"><b><font color="#ffffff" style="font-size: 18px;">2</font></b></div></div></div></foreignObject><text x="5" y="16" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><image x="117.5" y="0.5" width="48" height="48" xlink:href="" preserveAspectRatio="none" pointer-events="none"/><g transform="translate(168.5,18.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="62" height="12" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: left;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><div>AWS Cloud</div></div></div></foreignObject><text x="31" y="12" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial"><div>AWS Cloud</div></text></switch></g><path d="M 419 381 L 463 381 L 463 383 L 502.03 383" fill="none" stroke="#000000" stroke-width="2" stroke-miterlimit="10" pointer-events="none"/><path d="M 496.26 387 L 504.26 383 L 496.26 379" fill="none" stroke="#000000" stroke-width="2" stroke-miterlimit="10" pointer-events="none"/><image x="370.5" y="356" width="48" height="48" xlink:href="" preserveAspectRatio="none" pointer-events="none"/><g transform="translate(321.5,412.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="146" height="12" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><div>Application Load Balanacer</div></div></div></foreignObject><text x="73" y="12" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial"><div>Application Load Balanacer</div></text></switch></g><rect x="931" y="759.5" width="257" height="134" fill="none" stroke="#000000" stroke-opacity="0.4" stroke-dasharray="3 3" pointer-events="none"/><rect x="931" y="759.5" width="257" height="134" fill="none" stroke="#000000" stroke-opacity="0.4" stroke-dasharray="3 3" pointer-events="none"/><g transform="translate(1059.5,791.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="1" height="70" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; width: 1px; white-space: nowrap; overflow-wrap: normal; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;white-space:normal;"><b><br /><br /><br /></b><br /><br /></div></div></foreignObject><text x="0" y="41" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial"><b><br><br><br></b><br><br></text></switch></g><ellipse cx="949.78" cy="815.34" rx="15" ry="15" fill="#000000" stroke="#000000" pointer-events="none"/><g transform="translate(944.5,804.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="10" height="20" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; width: 11px; white-space: nowrap; overflow-wrap: normal; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;white-space:normal;"><div style="font-size: 18px;"><b><font color="#ffffff" style="font-size: 18px;">5</font></b></div></div></div></foreignObject><text x="5" y="16" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><image x="1104.44" y="799.74" width="64" height="64" xlink:href="" preserveAspectRatio="none" pointer-events="none"/><g transform="translate(1100.5,871.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="71" height="12" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><div><font>Amazon EFS</font></div></div></div></foreignObject><text x="36" y="12" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><g transform="translate(967.5,761.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="185" height="36" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 14px; font-family: Arial; color: rgb(255, 255, 255); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: left;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><font color="#000000" style="font-size: 16px;"><b>Monitoring and Analysis<br /></b><br /></font></div></div></foreignObject><text x="93" y="25" fill="#ffffff" text-anchor="middle" font-size="14px" font-family="Arial">[Not supported by viewer]</text></switch></g><path d="M 972.5 802 L 1036.5 802 L 1036.5 866 L 972.5 866 Z" fill="url(#mx-gradient-f34482-1-bc1356-1-s-0)" stroke="#ffffff" stroke-miterlimit="10" pointer-events="none"/><path d="M 1018 840.69 C 1018 837.14 1015.09 834.25 1011.52 834.25 C 1007.94 834.25 1005.04 837.14 1005.04 840.69 C 1005.04 844.24 1007.94 847.13 1011.52 847.13 C 1015.09 847.13 1018 844.24 1018 840.69 M 1019.82 840.69 C 1019.82 845.24 1016.1 848.94 1011.52 848.94 C 1006.94 848.94 1003.21 845.24 1003.21 840.69 C 1003.21 836.14 1006.94 832.44 1011.52 832.44 C 1016.1 832.44 1019.82 836.14 1019.82 840.69 M 1027.49 853.13 L 1021.12 847.43 C 1020.61 848.15 1020.01 848.81 1019.35 849.39 L 1025.71 855.09 C 1026.25 855.57 1027.09 855.53 1027.58 854.99 C 1028.07 854.45 1028.03 853.62 1027.49 853.13 M 1011.52 850.57 C 1017 850.57 1021.47 846.14 1021.47 840.69 C 1021.47 835.24 1017 830.81 1011.52 830.81 C 1006.03 830.81 1001.57 835.24 1001.57 840.69 C 1001.57 846.14 1006.03 850.57 1011.52 850.57 M 1028.94 856.2 C 1028.32 856.89 1027.46 857.24 1026.59 857.24 C 1025.84 857.24 1025.09 856.97 1024.49 856.44 L 1017.88 850.52 C 1016.04 851.7 1013.86 852.39 1011.52 852.39 C 1005.02 852.39 999.74 847.14 999.74 840.69 C 999.74 834.24 1005.02 829 1011.52 829 C 1018.01 829 1023.29 834.24 1023.29 840.69 C 1023.29 842.54 1022.84 844.29 1022.07 845.84 L 1028.71 851.78 C 1030 852.94 1030.1 854.92 1028.94 856.2 M 987.33 824.91 C 987.33 825.38 987.35 825.85 987.41 826.3 C 987.44 826.56 987.36 826.82 987.19 827.01 C 987.05 827.17 986.87 827.27 986.66 827.31 C 984.42 827.88 980.73 829.62 980.73 834.85 C 980.73 838.8 982.92 840.98 984.76 842.11 C 985.38 842.5 986.13 842.71 986.91 842.72 L 997.92 842.73 L 997.92 844.55 L 986.9 844.54 C 985.77 844.52 984.7 844.22 983.79 843.65 C 981.97 842.53 978.9 839.89 978.9 834.85 C 978.9 828.78 983.08 826.54 985.53 825.75 C 985.51 825.47 985.5 825.19 985.5 824.91 C 985.5 819.95 988.89 814.81 993.37 812.95 C 998.62 810.76 1004.19 811.85 1008.25 815.84 C 1009.51 817.08 1010.55 818.59 1011.34 820.33 C 1012.41 819.45 1013.73 818.96 1015.11 818.96 C 1017.84 818.96 1020.91 821.02 1021.45 825.51 C 1024.01 826.1 1029.4 828.14 1029.4 834.92 C 1029.4 837.63 1028.55 839.86 1026.86 841.57 L 1025.56 840.3 C 1026.9 838.94 1027.58 837.14 1027.58 834.92 C 1027.58 828.99 1022.58 827.52 1020.43 827.16 C 1020.18 827.12 1019.97 826.98 1019.83 826.78 C 1019.69 826.58 1019.64 826.34 1019.68 826.11 C 1019.38 822.43 1017.18 820.78 1015.11 820.78 C 1013.81 820.78 1012.59 821.41 1011.76 822.51 C 1011.55 822.77 1011.23 822.91 1010.89 822.86 C 1010.56 822.81 1010.28 822.59 1010.17 822.27 C 1009.42 820.22 1008.34 818.49 1006.97 817.13 C 1003.44 813.67 998.63 812.73 994.08 814.62 C 990.29 816.19 987.33 820.71 987.33 824.91" fill="#ffffff" stroke="none" pointer-events="none"/><g transform="translate(948.5,873.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="112" height="12" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(35, 47, 62); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><div>Amazon CloudWatch</div></div></div></foreignObject><text x="56" y="12" fill="#232F3E" text-anchor="middle" font-size="12px" font-family="Arial"><div>Amazon CloudWatch</div></text></switch></g><rect x="135" y="489.47" width="1054" height="216" fill="none" stroke="#d79b00" stroke-opacity="0.4" stroke-width="2" stroke-dasharray="6 6" pointer-events="none"/><rect x="135" y="489.47" width="1054" height="216" fill="none" stroke="#d79b00" stroke-opacity="0.4" stroke-width="2" stroke-dasharray="6 6" pointer-events="none"/><g transform="translate(661.5,582.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="1" height="27" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; width: 1px; white-space: nowrap; overflow-wrap: normal; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;white-space:normal;"><b><br /><br /></b></div></div></foreignObject><text x="0" y="20" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial"><b><br><br></b></text></switch></g><g transform="translate(534.5,467.5)rotate(90,0,7.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="1" height="15" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(255, 255, 255); line-height: 1.2; vertical-align: top; white-space: nowrap; font-weight: bold; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><font style="font-size: 14px;"><br /></font></div></div></foreignObject><text x="0" y="14" fill="#ffffff" text-anchor="middle" font-size="12px" font-family="Arial" font-weight="bold"><font style="font-size: 14px;"><br></font></text></switch></g><g transform="translate(1129.5,470.5)rotate(90,0,7.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="1" height="15" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(255, 255, 255); line-height: 1.2; vertical-align: top; white-space: nowrap; font-weight: bold; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><font style="font-size: 14px;"><br /></font></div></div></foreignObject><text x="0" y="14" fill="#ffffff" text-anchor="middle" font-size="12px" font-family="Arial" font-weight="bold"><font style="font-size: 14px;"><br></font></text></switch></g><path d="M 151 495.5 L 199 495.5 L 199 543.5 L 151 543.5 Z" fill="url(#mx-gradient-f78e04-1-d05c17-1-s-0)" stroke="#ffffff" stroke-miterlimit="10" pointer-events="none"/><path d="M 173.63 500.31 C 173.51 500.31 173.38 500.35 173.28 500.41 L 158.37 509.01 C 158.14 509.14 157.99 509.39 157.99 509.66 L 157.99 528.42 C 157.99 528.68 158.14 528.93 158.37 529.07 L 174.59 538.56 C 174.82 538.7 175.11 538.7 175.35 538.56 L 190.23 529.9 C 190.47 529.77 190.61 529.52 190.61 529.25 C 190.61 528.98 190.46 528.73 190.22 528.59 L 183.08 524.57 C 182.85 524.43 182.56 524.44 182.33 524.57 L 175.05 528.84 L 166.56 523.95 L 166.56 514.17 L 174.03 509.89 C 174.26 509.76 174.41 509.51 174.41 509.24 L 174.41 501.06 C 174.41 500.86 174.32 500.66 174.18 500.52 C 174.03 500.38 173.83 500.3 173.63 500.31 Z M 176.48 500.34 C 176.27 500.33 176.08 500.41 175.93 500.55 C 175.78 500.69 175.7 500.89 175.7 501.09 L 175.7 509.28 C 175.7 509.55 175.85 509.79 176.08 509.93 L 183.42 514.2 L 183.42 522.76 C 183.42 523.03 183.56 523.28 183.8 523.41 L 190.87 527.52 C 191.1 527.66 191.39 527.66 191.63 527.52 C 191.86 527.39 192.01 527.14 192 526.87 L 192 509.68 C 192.01 509.41 191.86 509.16 191.63 509.03 L 176.84 500.44 C 176.73 500.37 176.6 500.34 176.48 500.34 Z M 172.9 502.37 L 172.9 508.8 L 165.43 513.08 C 165.19 513.21 165.05 513.46 165.05 513.73 L 165.05 524.38 C 165.05 524.65 165.19 524.9 165.42 525.04 L 174.67 530.37 C 174.91 530.5 175.2 530.5 175.43 530.36 L 182.72 526.09 L 188.34 529.26 L 174.97 537.04 L 159.5 527.98 L 159.5 510.09 Z M 177.21 502.4 L 190.5 510.11 L 190.5 525.56 L 184.93 522.33 L 184.93 513.77 C 184.93 513.5 184.78 513.25 184.55 513.11 L 177.21 508.84 Z" fill="#ffffff" stroke="none" pointer-events="none"/><g transform="translate(138.5,550.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="72" height="12" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(35, 47, 62); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><div>Amazon ECS</div></div></div></foreignObject><text x="36" y="12" fill="#232F3E" text-anchor="middle" font-size="12px" font-family="Arial"><div>Amazon ECS</div></text></switch></g><rect x="592.5" y="506.5" width="179" height="180" fill="none" stroke="#d79b00" stroke-opacity="0.4" stroke-width="2" stroke-dasharray="6 6" pointer-events="none"/><rect x="592.5" y="506.5" width="179" height="180" fill="none" stroke="#d79b00" stroke-opacity="0.4" stroke-width="2" stroke-dasharray="6 6" pointer-events="none"/><g transform="translate(628.5,509.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="105" height="171" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; width: 106px; white-space: nowrap; overflow-wrap: normal; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;white-space:normal;"><b style="">Migration Console<br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /></b></div></div></foreignObject><text x="53" y="92" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><rect x="1012" y="612" width="131" height="49" rx="1" ry="1" fill="#000000" stroke="#000000" transform="translate(2,3)" opacity="0.25"/><rect x="1012" y="612" width="131" height="49" rx="1" ry="1" fill="#ffffff" stroke="#dddddd" pointer-events="none"/><g transform="translate(1030.5,629.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="93" height="12" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><div style="text-align: right;">          Replayer N</div></div></div></foreignObject><text x="47" y="12" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><path d="M 1019.5 626.76 L 1019.5 627.38 L 1019.5 647.24 L 1051.5 647.24 L 1051.5 626.76 Z M 1020.73 627.99 L 1050.27 627.99 L 1050.27 646.01 L 1020.73 646.01 Z M 1022.79 630.01 L 1022.79 643.99 L 1024.02 643.99 L 1024.02 630.01 Z M 1026.56 630.01 L 1026.56 643.99 L 1027.79 643.99 L 1027.79 630.01 Z M 1043.24 630.01 L 1043.24 643.99 L 1044.47 643.99 L 1044.47 630.01 Z M 1047.03 630.01 L 1047.03 643.99 L 1048.26 643.99 L 1048.26 630.01 Z M 1036.23 630.7 L 1036.23 631.92 L 1034.84 631.92 L 1034.84 630.71 L 1033.62 630.71 L 1033.62 631.92 L 1032.3 631.92 L 1032.3 630.71 L 1031.07 630.71 L 1031.07 631.92 L 1031.05 631.92 C 1030.88 631.92 1030.73 631.99 1030.61 632.1 C 1030.5 632.22 1030.43 632.38 1030.43 632.54 L 1030.43 632.56 L 1029.17 632.56 L 1029.17 633.79 L 1030.43 633.79 L 1030.43 635.13 L 1029.18 635.13 L 1029.18 636.36 L 1030.43 636.36 L 1030.43 637.72 L 1029.18 637.72 L 1029.18 638.95 L 1030.43 638.95 L 1030.43 640.25 L 1029.17 640.25 L 1029.17 641.48 L 1030.43 641.48 C 1030.44 641.82 1030.71 642.08 1031.05 642.08 L 1031.07 642.08 L 1031.07 643.38 L 1032.3 643.38 L 1032.3 642.08 L 1033.62 642.08 L 1033.62 643.36 L 1034.84 643.36 L 1034.84 642.08 L 1036.23 642.08 L 1036.23 643.36 L 1037.46 643.36 L 1037.46 642.08 L 1038.76 642.08 L 1038.76 643.38 L 1039.99 643.38 L 1039.99 642.08 L 1040 642.08 C 1040.34 642.08 1040.61 641.82 1040.62 641.48 L 1041.85 641.48 L 1041.85 640.25 L 1040.62 640.25 L 1040.62 638.95 L 1041.87 638.95 L 1041.87 637.72 L 1040.62 637.72 L 1040.62 636.36 L 1041.87 636.36 L 1041.87 635.13 L 1040.62 635.13 L 1040.62 633.79 L 1041.86 633.79 L 1041.86 632.56 L 1040.62 632.56 L 1040.62 632.54 C 1040.62 632.38 1040.55 632.22 1040.44 632.1 C 1040.32 631.99 1040.17 631.92 1040 631.92 L 1039.99 631.92 L 1039.99 630.72 L 1038.76 630.72 L 1038.76 631.92 L 1037.46 631.92 L 1037.46 630.7 Z M 1031.66 633.15 L 1039.39 633.15 L 1039.39 640.85 L 1031.66 640.85 Z" fill="#d05c17" stroke="none" pointer-events="none"/><rect x="948" y="572" width="131" height="49" rx="1" ry="1" fill="#000000" stroke="#000000" transform="translate(2,3)" opacity="0.25"/><rect x="948" y="572" width="131" height="49" rx="1" ry="1" fill="#ffffff" stroke="#dddddd" pointer-events="none"/><g transform="translate(969.5,589.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="88" height="12" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><div style="text-align: right;">         Replayer 2</div></div></div></foreignObject><text x="44" y="12" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><path d="M 955 585.76 L 955 586.38 L 955 606.24 L 987 606.24 L 987 585.76 Z M 956.23 586.99 L 985.77 586.99 L 985.77 605.01 L 956.23 605.01 Z M 958.29 589.01 L 958.29 602.99 L 959.52 602.99 L 959.52 589.01 Z M 962.06 589.01 L 962.06 602.99 L 963.29 602.99 L 963.29 589.01 Z M 978.74 589.01 L 978.74 602.99 L 979.97 602.99 L 979.97 589.01 Z M 982.53 589.01 L 982.53 602.99 L 983.76 602.99 L 983.76 589.01 Z M 971.73 589.7 L 971.73 590.92 L 970.34 590.92 L 970.34 589.71 L 969.12 589.71 L 969.12 590.92 L 967.8 590.92 L 967.8 589.71 L 966.57 589.71 L 966.57 590.92 L 966.55 590.92 C 966.38 590.92 966.23 590.99 966.11 591.1 C 966 591.22 965.93 591.38 965.93 591.54 L 965.93 591.56 L 964.67 591.56 L 964.67 592.79 L 965.93 592.79 L 965.93 594.13 L 964.68 594.13 L 964.68 595.36 L 965.93 595.36 L 965.93 596.72 L 964.68 596.72 L 964.68 597.95 L 965.93 597.95 L 965.93 599.25 L 964.67 599.25 L 964.67 600.48 L 965.93 600.48 C 965.94 600.82 966.21 601.08 966.55 601.08 L 966.57 601.08 L 966.57 602.38 L 967.8 602.38 L 967.8 601.08 L 969.12 601.08 L 969.12 602.36 L 970.34 602.36 L 970.34 601.08 L 971.73 601.08 L 971.73 602.36 L 972.96 602.36 L 972.96 601.08 L 974.26 601.08 L 974.26 602.38 L 975.49 602.38 L 975.49 601.08 L 975.5 601.08 C 975.84 601.08 976.11 600.82 976.12 600.48 L 977.35 600.48 L 977.35 599.25 L 976.12 599.25 L 976.12 597.95 L 977.37 597.95 L 977.37 596.72 L 976.12 596.72 L 976.12 595.36 L 977.37 595.36 L 977.37 594.13 L 976.12 594.13 L 976.12 592.79 L 977.36 592.79 L 977.36 591.56 L 976.12 591.56 L 976.12 591.54 C 976.12 591.38 976.05 591.22 975.94 591.1 C 975.82 590.99 975.67 590.92 975.5 590.92 L 975.49 590.92 L 975.49 589.72 L 974.26 589.72 L 974.26 590.92 L 972.96 590.92 L 972.96 589.7 Z M 967.16 592.15 L 974.89 592.15 L 974.89 599.85 L 967.16 599.85 Z" fill="#d05c17" stroke="none" pointer-events="none"/><rect x="417.5" y="609" width="131" height="49" rx="1" ry="1" fill="#000000" stroke="#000000" transform="translate(2,3)" opacity="0.25"/><rect x="417.5" y="609" width="131" height="49" rx="1" ry="1" fill="#ffffff" stroke="#dddddd" pointer-events="none"/><g transform="translate(434.5,626.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="97" height="12" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><div style="text-align: right;">          RFS Task N</div></div></div></foreignObject><text x="49" y="12" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><path d="M 425 623.76 L 425 624.38 L 425 644.24 L 457 644.24 L 457 623.76 Z M 426.23 624.99 L 455.77 624.99 L 455.77 643.01 L 426.23 643.01 Z M 428.29 627.01 L 428.29 640.99 L 429.52 640.99 L 429.52 627.01 Z M 432.06 627.01 L 432.06 640.99 L 433.29 640.99 L 433.29 627.01 Z M 448.74 627.01 L 448.74 640.99 L 449.97 640.99 L 449.97 627.01 Z M 452.53 627.01 L 452.53 640.99 L 453.76 640.99 L 453.76 627.01 Z M 441.73 627.7 L 441.73 628.92 L 440.34 628.92 L 440.34 627.71 L 439.12 627.71 L 439.12 628.92 L 437.8 628.92 L 437.8 627.71 L 436.57 627.71 L 436.57 628.92 L 436.55 628.92 C 436.38 628.92 436.23 628.99 436.11 629.1 C 436 629.22 435.93 629.38 435.93 629.54 L 435.93 629.56 L 434.67 629.56 L 434.67 630.79 L 435.93 630.79 L 435.93 632.13 L 434.68 632.13 L 434.68 633.36 L 435.93 633.36 L 435.93 634.72 L 434.68 634.72 L 434.68 635.95 L 435.93 635.95 L 435.93 637.25 L 434.67 637.25 L 434.67 638.48 L 435.93 638.48 C 435.94 638.82 436.21 639.08 436.55 639.08 L 436.57 639.08 L 436.57 640.38 L 437.8 640.38 L 437.8 639.08 L 439.12 639.08 L 439.12 640.36 L 440.34 640.36 L 440.34 639.08 L 441.73 639.08 L 441.73 640.36 L 442.96 640.36 L 442.96 639.08 L 444.26 639.08 L 444.26 640.38 L 445.49 640.38 L 445.49 639.08 L 445.5 639.08 C 445.84 639.08 446.11 638.82 446.12 638.48 L 447.35 638.48 L 447.35 637.25 L 446.12 637.25 L 446.12 635.95 L 447.37 635.95 L 447.37 634.72 L 446.12 634.72 L 446.12 633.36 L 447.37 633.36 L 447.37 632.13 L 446.12 632.13 L 446.12 630.79 L 447.36 630.79 L 447.36 629.56 L 446.12 629.56 L 446.12 629.54 C 446.12 629.38 446.05 629.22 445.94 629.1 C 445.82 628.99 445.67 628.92 445.5 628.92 L 445.49 628.92 L 445.49 627.72 L 444.26 627.72 L 444.26 628.92 L 442.96 628.92 L 442.96 627.7 Z M 437.16 630.15 L 444.89 630.15 L 444.89 637.85 L 437.16 637.85 Z" fill="#d05c17" stroke="none" pointer-events="none"/><rect x="353.5" y="569" width="131" height="49" rx="1" ry="1" fill="#000000" stroke="#000000" transform="translate(2,3)" opacity="0.25"/><rect x="353.5" y="569" width="131" height="49" rx="1" ry="1" fill="#ffffff" stroke="#dddddd" pointer-events="none"/><g transform="translate(372.5,586.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="92" height="12" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><div style="text-align: right;">         RFS Task 2</div></div></div></foreignObject><text x="46" y="12" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><rect x="216" y="506" width="371" height="180" fill="none" stroke="#d79b00" stroke-opacity="0.4" stroke-width="2" stroke-dasharray="6 6" pointer-events="none"/><rect x="216" y="506" width="371" height="180" fill="none" stroke="#d79b00" stroke-opacity="0.4" stroke-width="2" stroke-dasharray="6 6" pointer-events="none"/><g transform="translate(315.5,509.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="171" height="171" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; width: 172px; white-space: nowrap; overflow-wrap: normal; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;white-space:normal;"><font style="font-size: 12px;"><b><span>Reindex-from-Snapshot (RFS)</span><span><br /></span></b></font><b style=""><font style="font-size: 12px;"><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /></font><br /></b></div></div></foreignObject><text x="86" y="92" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><path d="M 360.5 582.76 L 360.5 583.38 L 360.5 603.24 L 392.5 603.24 L 392.5 582.76 Z M 361.73 583.99 L 391.27 583.99 L 391.27 602.01 L 361.73 602.01 Z M 363.79 586.01 L 363.79 599.99 L 365.02 599.99 L 365.02 586.01 Z M 367.56 586.01 L 367.56 599.99 L 368.79 599.99 L 368.79 586.01 Z M 384.24 586.01 L 384.24 599.99 L 385.47 599.99 L 385.47 586.01 Z M 388.03 586.01 L 388.03 599.99 L 389.26 599.99 L 389.26 586.01 Z M 377.23 586.7 L 377.23 587.92 L 375.84 587.92 L 375.84 586.71 L 374.62 586.71 L 374.62 587.92 L 373.3 587.92 L 373.3 586.71 L 372.07 586.71 L 372.07 587.92 L 372.05 587.92 C 371.88 587.92 371.73 587.99 371.61 588.1 C 371.5 588.22 371.43 588.38 371.43 588.54 L 371.43 588.56 L 370.17 588.56 L 370.17 589.79 L 371.43 589.79 L 371.43 591.13 L 370.18 591.13 L 370.18 592.36 L 371.43 592.36 L 371.43 593.72 L 370.18 593.72 L 370.18 594.95 L 371.43 594.95 L 371.43 596.25 L 370.17 596.25 L 370.17 597.48 L 371.43 597.48 C 371.44 597.82 371.71 598.08 372.05 598.08 L 372.07 598.08 L 372.07 599.38 L 373.3 599.38 L 373.3 598.08 L 374.62 598.08 L 374.62 599.36 L 375.84 599.36 L 375.84 598.08 L 377.23 598.08 L 377.23 599.36 L 378.46 599.36 L 378.46 598.08 L 379.76 598.08 L 379.76 599.38 L 380.99 599.38 L 380.99 598.08 L 381 598.08 C 381.34 598.08 381.61 597.82 381.62 597.48 L 382.85 597.48 L 382.85 596.25 L 381.62 596.25 L 381.62 594.95 L 382.87 594.95 L 382.87 593.72 L 381.62 593.72 L 381.62 592.36 L 382.87 592.36 L 382.87 591.13 L 381.62 591.13 L 381.62 589.79 L 382.86 589.79 L 382.86 588.56 L 381.62 588.56 L 381.62 588.54 C 381.62 588.38 381.55 588.22 381.44 588.1 C 381.32 587.99 381.17 587.92 381 587.92 L 380.99 587.92 L 380.99 586.72 L 379.76 586.72 L 379.76 587.92 L 378.46 587.92 L 378.46 586.7 Z M 372.66 589.15 L 380.39 589.15 L 380.39 596.85 L 372.66 596.85 Z" fill="#d05c17" stroke="none" pointer-events="none"/><rect x="780" y="506.5" width="371" height="180" fill="none" stroke="#d79b00" stroke-opacity="0.4" stroke-width="2" stroke-dasharray="6 6" pointer-events="none"/><rect x="780" y="506.5" width="371" height="180" fill="none" stroke="#d79b00" stroke-opacity="0.4" stroke-width="2" stroke-dasharray="6 6" pointer-events="none"/><g transform="translate(939.5,509.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="51" height="171" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; width: 52px; white-space: nowrap; overflow-wrap: normal; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;white-space:normal;"><b style="">Replayer<br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /></b></div></div></foreignObject><text x="26" y="92" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><path d="M 654.73 557.03 C 654.05 557.03 653.5 557.58 653.5 558.26 L 653.5 596.74 C 653.5 597.42 654.05 597.97 654.73 597.97 L 716.27 597.97 C 716.95 597.97 717.5 597.42 717.5 596.74 L 717.5 558.26 C 717.5 557.58 716.95 557.03 716.27 557.03 Z M 655.96 559.49 L 715.04 559.49 L 715.04 595.51 L 655.96 595.51 Z M 673.97 563.36 C 673.65 563.36 673.33 563.49 673.1 563.72 C 672.87 563.95 672.74 564.26 672.74 564.59 L 672.74 578.82 C 672.74 579.15 672.87 579.46 673.1 579.69 C 673.33 579.92 673.65 580.05 673.97 580.05 L 684.29 580.05 L 684.29 582.63 L 675.26 582.63 C 674.58 582.63 674.03 583.18 674.03 583.86 L 674.03 590.36 C 674.03 591.04 674.58 591.59 675.26 591.59 L 695.79 591.59 C 696.12 591.59 696.43 591.46 696.66 591.23 C 696.89 591 697.02 590.69 697.02 590.36 L 697.02 583.86 C 697.02 583.53 696.89 583.22 696.66 582.99 C 696.43 582.76 696.12 582.63 695.79 582.63 L 686.75 582.63 L 686.75 580.05 L 697.1 580.05 C 697.43 580.05 697.74 579.92 697.97 579.69 C 698.2 579.46 698.33 579.15 698.33 578.82 L 698.33 564.59 C 698.33 563.91 697.78 563.36 697.1 563.36 Z M 660.01 563.48 L 660.01 591.48 L 662.47 591.48 L 662.47 563.48 Z M 667.56 563.48 L 667.56 591.48 L 670.02 591.48 L 670.02 563.48 Z M 700.97 563.48 L 700.97 591.48 L 703.43 591.48 L 703.43 563.48 Z M 708.57 563.48 L 708.57 591.48 L 711.03 591.48 L 711.03 563.48 Z M 675.2 565.82 L 695.87 565.82 L 695.87 577.59 L 675.2 577.59 Z M 676.5 585.09 L 694.56 585.09 L 694.56 589.13 L 676.5 589.13 Z M 689.45 586.44 L 689.45 587.67 L 691.9 587.67 L 691.9 586.44 Z" fill="#d05c17" stroke="none" pointer-events="none"/><g transform="translate(636.5,600.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="97" height="12" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(35, 47, 62); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><div style="text-align: center;">Migration Console</div></div></div></foreignObject><text x="49" y="12" fill="#232F3E" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><rect x="276" y="527" width="131" height="49" rx="1" ry="1" fill="#000000" stroke="#000000" transform="translate(2,3)" opacity="0.25"/><rect x="276" y="527" width="131" height="49" rx="1" ry="1" fill="#ffffff" stroke="#dddddd" pointer-events="none"/><g transform="translate(290.5,530.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="102" height="41" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><div style="text-align: right;">          </div><div style="text-align: right;">            RFS Task 1</div><div style="text-align: right;"><br /></div></div></div></foreignObject><text x="51" y="27" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><path d="M 284.5 541.76 L 284.5 542.38 L 284.5 562.24 L 316.5 562.24 L 316.5 541.76 Z M 285.73 542.99 L 315.27 542.99 L 315.27 561.01 L 285.73 561.01 Z M 287.79 545.01 L 287.79 558.99 L 289.02 558.99 L 289.02 545.01 Z M 291.56 545.01 L 291.56 558.99 L 292.79 558.99 L 292.79 545.01 Z M 308.24 545.01 L 308.24 558.99 L 309.47 558.99 L 309.47 545.01 Z M 312.03 545.01 L 312.03 558.99 L 313.26 558.99 L 313.26 545.01 Z M 301.23 545.7 L 301.23 546.92 L 299.84 546.92 L 299.84 545.71 L 298.62 545.71 L 298.62 546.92 L 297.3 546.92 L 297.3 545.71 L 296.07 545.71 L 296.07 546.92 L 296.05 546.92 C 295.88 546.92 295.73 546.99 295.61 547.1 C 295.5 547.22 295.43 547.38 295.43 547.54 L 295.43 547.56 L 294.17 547.56 L 294.17 548.79 L 295.43 548.79 L 295.43 550.13 L 294.18 550.13 L 294.18 551.36 L 295.43 551.36 L 295.43 552.72 L 294.18 552.72 L 294.18 553.95 L 295.43 553.95 L 295.43 555.25 L 294.17 555.25 L 294.17 556.48 L 295.43 556.48 C 295.44 556.82 295.71 557.08 296.05 557.08 L 296.07 557.08 L 296.07 558.38 L 297.3 558.38 L 297.3 557.08 L 298.62 557.08 L 298.62 558.36 L 299.84 558.36 L 299.84 557.08 L 301.23 557.08 L 301.23 558.36 L 302.46 558.36 L 302.46 557.08 L 303.76 557.08 L 303.76 558.38 L 304.99 558.38 L 304.99 557.08 L 305 557.08 C 305.34 557.08 305.61 556.82 305.62 556.48 L 306.85 556.48 L 306.85 555.25 L 305.62 555.25 L 305.62 553.95 L 306.87 553.95 L 306.87 552.72 L 305.62 552.72 L 305.62 551.36 L 306.87 551.36 L 306.87 550.13 L 305.62 550.13 L 305.62 548.79 L 306.86 548.79 L 306.86 547.56 L 305.62 547.56 L 305.62 547.54 C 305.62 547.38 305.55 547.22 305.44 547.1 C 305.32 546.99 305.17 546.92 305 546.92 L 304.99 546.92 L 304.99 545.72 L 303.76 545.72 L 303.76 546.92 L 302.46 546.92 L 302.46 545.7 Z M 296.66 548.15 L 304.39 548.15 L 304.39 555.85 L 296.66 555.85 Z" fill="#d05c17" stroke="none" pointer-events="none"/><path d="M 219.64 510 C 219.27 510 218.96 510.3 218.96 510.68 L 218.96 535.2 C 218.96 535.57 219.27 535.88 219.64 535.88 L 222.23 535.88 L 222.23 538.26 C 222.23 538.44 222.3 538.61 222.43 538.74 C 222.56 538.86 222.73 538.94 222.91 538.94 L 225.29 538.94 L 225.29 541.32 C 225.29 541.7 225.6 542 225.97 542 L 244.36 542 C 244.73 542 245.04 541.7 245.04 541.32 L 245.04 516.81 C 245.04 516.43 244.73 516.13 244.36 516.13 L 241.97 516.13 L 241.97 513.74 C 241.97 513.37 241.67 513.06 241.29 513.06 L 238.71 513.06 L 238.71 510.68 C 238.71 510.3 238.4 510 238.03 510 Z M 220.32 511.36 L 237.34 511.36 L 237.34 513.06 L 222.91 513.06 C 222.73 513.06 222.56 513.14 222.43 513.26 C 222.3 513.39 222.23 513.56 222.23 513.74 L 222.23 534.51 L 220.32 534.51 Z M 223.59 514.43 L 240.61 514.43 L 240.61 516.13 L 225.97 516.13 C 225.6 516.13 225.29 516.43 225.29 516.81 L 225.29 537.57 L 223.59 537.57 Z M 226.66 517.49 L 243.68 517.49 L 243.68 540.64 L 226.66 540.64 Z M 228.52 519.19 C 228.15 519.19 227.84 519.5 227.84 519.87 L 227.84 522.94 C 227.84 523.31 228.15 523.62 228.52 523.62 L 231.59 523.62 C 231.96 523.62 232.27 523.31 232.27 522.94 L 232.27 519.87 C 232.27 519.5 231.96 519.19 231.59 519.19 Z M 229.2 520.55 L 230.9 520.55 L 230.9 522.26 L 229.2 522.26 Z M 233.12 520.72 L 233.12 522.09 L 242.32 522.09 L 242.32 520.72 Z M 228.52 526.34 C 228.15 526.34 227.84 526.65 227.84 527.02 L 227.84 530.09 C 227.84 530.46 228.15 530.77 228.52 530.77 L 231.59 530.77 C 231.96 530.77 232.27 530.46 232.27 530.09 L 232.27 527.02 C 232.27 526.65 231.96 526.34 231.59 526.34 Z M 229.2 527.7 L 230.9 527.7 L 230.9 529.4 L 229.2 529.4 Z M 233.12 527.87 L 233.12 529.23 L 242.32 529.23 L 242.32 527.87 Z M 228.52 533.49 C 228.34 533.49 228.17 533.56 228.04 533.69 C 227.91 533.82 227.84 533.99 227.84 534.17 L 227.84 537.24 C 227.84 537.62 228.15 537.92 228.52 537.92 L 231.59 537.92 C 231.96 537.92 232.27 537.62 232.27 537.24 L 232.27 534.17 C 232.27 533.99 232.19 533.82 232.07 533.69 C 231.94 533.56 231.77 533.49 231.59 533.49 Z M 229.2 534.85 L 230.9 534.85 L 230.9 536.56 L 229.2 536.56 Z M 233.12 535.02 L 233.12 536.38 L 242.32 536.38 L 242.32 535.02 Z" fill="#d05c17" stroke="none" pointer-events="none"/><rect x="867.5" y="530.5" width="131" height="49" rx="1" ry="1" fill="#000000" stroke="#000000" transform="translate(2,3)" opacity="0.25"/><rect x="867.5" y="530.5" width="131" height="49" rx="1" ry="1" fill="#ffffff" stroke="#dddddd" pointer-events="none"/><g transform="translate(883.5,533.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="98" height="41" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><div style="text-align: right;">            </div><div style="text-align: right;">            Replayer 1</div><div style="text-align: right;"><br /></div></div></div></foreignObject><text x="49" y="27" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><path d="M 883.5 544.26 L 883.5 544.88 L 883.5 564.74 L 915.5 564.74 L 915.5 544.26 Z M 884.73 545.49 L 914.27 545.49 L 914.27 563.51 L 884.73 563.51 Z M 886.79 547.51 L 886.79 561.49 L 888.02 561.49 L 888.02 547.51 Z M 890.56 547.51 L 890.56 561.49 L 891.79 561.49 L 891.79 547.51 Z M 907.24 547.51 L 907.24 561.49 L 908.47 561.49 L 908.47 547.51 Z M 911.03 547.51 L 911.03 561.49 L 912.26 561.49 L 912.26 547.51 Z M 900.23 548.2 L 900.23 549.42 L 898.84 549.42 L 898.84 548.21 L 897.62 548.21 L 897.62 549.42 L 896.3 549.42 L 896.3 548.21 L 895.07 548.21 L 895.07 549.42 L 895.05 549.42 C 894.88 549.42 894.73 549.49 894.61 549.6 C 894.5 549.72 894.43 549.88 894.43 550.04 L 894.43 550.06 L 893.17 550.06 L 893.17 551.29 L 894.43 551.29 L 894.43 552.63 L 893.18 552.63 L 893.18 553.86 L 894.43 553.86 L 894.43 555.22 L 893.18 555.22 L 893.18 556.45 L 894.43 556.45 L 894.43 557.75 L 893.17 557.75 L 893.17 558.98 L 894.43 558.98 C 894.44 559.32 894.71 559.58 895.05 559.58 L 895.07 559.58 L 895.07 560.88 L 896.3 560.88 L 896.3 559.58 L 897.62 559.58 L 897.62 560.86 L 898.84 560.86 L 898.84 559.58 L 900.23 559.58 L 900.23 560.86 L 901.46 560.86 L 901.46 559.58 L 902.76 559.58 L 902.76 560.88 L 903.99 560.88 L 903.99 559.58 L 904 559.58 C 904.34 559.58 904.61 559.32 904.62 558.98 L 905.85 558.98 L 905.85 557.75 L 904.62 557.75 L 904.62 556.45 L 905.87 556.45 L 905.87 555.22 L 904.62 555.22 L 904.62 553.86 L 905.87 553.86 L 905.87 552.63 L 904.62 552.63 L 904.62 551.29 L 905.86 551.29 L 905.86 550.06 L 904.62 550.06 L 904.62 550.04 C 904.62 549.88 904.55 549.72 904.44 549.6 C 904.32 549.49 904.17 549.42 904 549.42 L 903.99 549.42 L 903.99 548.22 L 902.76 548.22 L 902.76 549.42 L 901.46 549.42 L 901.46 548.2 Z M 895.66 550.65 L 903.39 550.65 L 903.39 558.35 L 895.66 558.35 Z" fill="#d05c17" stroke="none" pointer-events="none"/><path d="M 791.64 511.5 C 791.27 511.5 790.96 511.8 790.96 512.18 L 790.96 536.7 C 790.96 537.07 791.27 537.38 791.64 537.38 L 794.23 537.38 L 794.23 539.76 C 794.23 540.13 794.53 540.44 794.91 540.44 L 797.29 540.44 L 797.29 542.82 C 797.29 543.2 797.6 543.5 797.97 543.5 L 816.36 543.5 C 816.73 543.5 817.04 543.2 817.04 542.82 L 817.04 518.31 C 817.04 517.93 816.73 517.63 816.36 517.63 L 813.97 517.63 L 813.97 515.24 C 813.97 514.87 813.67 514.56 813.29 514.56 L 810.71 514.56 L 810.71 512.18 C 810.71 511.8 810.4 511.5 810.03 511.5 Z M 792.32 512.86 L 809.34 512.86 L 809.34 514.56 L 794.91 514.56 C 794.53 514.56 794.23 514.87 794.23 515.24 L 794.23 536.01 L 792.32 536.01 Z M 795.59 515.93 L 812.61 515.93 L 812.61 517.63 L 797.97 517.63 C 797.6 517.63 797.29 517.93 797.29 518.31 L 797.29 539.07 L 795.59 539.07 Z M 798.66 518.99 L 815.68 518.99 L 815.68 542.14 L 798.66 542.14 Z M 800.52 520.69 C 800.15 520.69 799.84 521 799.84 521.37 L 799.84 524.44 C 799.84 524.81 800.15 525.12 800.52 525.12 L 803.59 525.12 C 803.96 525.12 804.27 524.81 804.27 524.44 L 804.27 521.37 C 804.27 521 803.96 520.69 803.59 520.69 Z M 801.2 522.05 L 802.9 522.05 L 802.9 523.76 L 801.2 523.76 Z M 805.12 522.22 L 805.12 523.59 L 814.32 523.59 L 814.32 522.22 Z M 800.52 527.84 C 800.15 527.84 799.84 528.15 799.84 528.52 L 799.84 531.59 C 799.84 531.96 800.15 532.27 800.52 532.27 L 803.59 532.27 C 803.96 532.27 804.27 531.96 804.27 531.59 L 804.27 528.52 C 804.27 528.15 803.96 527.84 803.59 527.84 Z M 801.2 529.2 L 802.9 529.2 L 802.9 530.9 L 801.2 530.9 Z M 805.12 529.37 L 805.12 530.73 L 814.32 530.73 L 814.32 529.37 Z M 800.52 534.99 C 800.34 534.99 800.17 535.06 800.04 535.19 C 799.91 535.32 799.84 535.49 799.84 535.67 L 799.84 538.74 C 799.84 539.12 800.15 539.42 800.52 539.42 L 803.59 539.42 C 803.96 539.42 804.27 539.12 804.27 538.74 L 804.27 535.67 C 804.27 535.49 804.19 535.32 804.07 535.19 C 803.94 535.06 803.77 534.99 803.59 534.99 Z M 801.2 536.35 L 802.9 536.35 L 802.9 538.06 L 801.2 538.06 Z M 805.12 536.52 L 805.12 537.88 L 814.32 537.88 L 814.32 536.52 Z" fill="#d05c17" stroke="none" pointer-events="none"/><g transform="translate(4779.5,1919.5)rotate(90,0,7.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="1" height="15" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(255, 255, 255); line-height: 1.2; vertical-align: top; white-space: nowrap; font-weight: bold; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><font style="font-size: 14px;"><br /></font></div></div></foreignObject><text x="0" y="14" fill="#ffffff" text-anchor="middle" font-size="12px" font-family="Arial" font-weight="bold"><font style="font-size: 14px;"><br></font></text></switch></g><rect x="738.5" y="398" width="131" height="49" rx="1" ry="1" fill="#000000" stroke="#000000" transform="translate(2,3)" opacity="0.25"/><rect x="738.5" y="398" width="131" height="49" rx="1" ry="1" fill="#ffffff" stroke="#dddddd" pointer-events="none"/><g transform="translate(739.5,416.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="129" height="12" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><div style="text-align: right;">            Capture Proxy N</div></div></div></foreignObject><text x="65" y="12" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><path d="M 747 408 L 775 408 L 775 436 L 747 436 Z" fill="url(#mx-gradient-f78e04-1-d05c17-1-s-0)" stroke="#ffffff" stroke-miterlimit="10" pointer-events="none"/><path d="M 760.42 410.8 C 760.01 410.8 759.66 411.15 759.66 411.56 L 759.66 413.92 L 760.54 413.92 L 760.54 411.67 L 771.32 411.67 L 771.32 422.45 L 769.07 422.45 L 769.07 423.33 L 771.44 423.33 C 771.85 423.33 772.2 422.98 772.2 422.57 L 772.2 411.56 C 772.2 411.15 771.85 410.8 771.44 410.8 Z M 756.08 414.82 L 756.08 416.19 L 755.97 416.19 C 755.55 416.19 755.21 416.54 755.21 416.95 L 755.21 417.08 L 753.82 417.08 L 753.82 417.95 L 755.21 417.95 L 755.21 419.32 L 753.82 419.32 L 753.82 420.2 L 755.21 420.2 L 755.21 421.56 L 753.82 421.56 L 753.82 422.44 L 755.21 422.44 L 755.21 423.81 L 753.82 423.81 L 753.82 424.69 L 755.21 424.69 L 755.21 426.05 L 753.82 426.05 L 753.82 426.92 L 755.21 426.92 L 755.21 427.05 C 755.21 427.47 755.55 427.81 755.97 427.81 L 756.08 427.81 L 756.08 429.19 L 756.96 429.19 L 756.96 427.81 L 758.33 427.81 L 758.33 429.19 L 759.2 429.19 L 759.2 427.81 L 760.56 427.81 L 760.56 429.19 L 761.44 429.19 L 761.44 427.81 L 762.79 427.81 L 762.79 429.19 L 763.67 429.19 L 763.67 427.81 L 765.05 427.81 L 765.05 429.19 L 765.93 429.19 L 765.93 427.81 L 766.05 427.81 C 766.47 427.81 766.81 427.47 766.81 427.05 L 766.81 426.92 L 768.16 426.92 L 768.16 426.05 L 766.81 426.05 L 766.81 424.69 L 768.16 424.69 L 768.16 423.81 L 766.81 423.81 L 766.81 422.44 L 768.16 422.44 L 768.16 421.56 L 766.81 421.56 L 766.81 420.2 L 768.16 420.2 L 768.16 419.32 L 766.81 419.32 L 766.81 417.95 L 768.16 417.95 L 768.16 417.08 L 766.81 417.08 L 766.81 416.95 C 766.81 416.54 766.47 416.19 766.05 416.19 L 765.93 416.19 L 765.93 414.82 L 765.05 414.82 L 765.05 416.19 L 763.67 416.19 L 763.67 414.82 L 762.79 414.82 L 762.79 416.19 L 761.44 416.19 L 761.44 414.82 L 760.56 414.82 L 760.56 416.19 L 759.2 416.19 L 759.2 414.82 L 758.33 414.82 L 758.33 416.19 L 756.96 416.19 L 756.96 414.82 Z M 756.08 417.07 L 765.94 417.07 L 765.94 426.94 L 756.08 426.94 Z M 750.56 420.67 C 750.15 420.67 749.8 421.02 749.8 421.43 L 749.8 432.44 C 749.8 432.85 750.15 433.2 750.56 433.2 L 761.58 433.2 C 761.99 433.2 762.34 432.85 762.34 432.44 L 762.34 430.07 L 761.46 430.07 L 761.46 432.33 L 750.68 432.33 L 750.68 421.55 L 752.91 421.55 L 752.91 420.67 Z" fill="#ffffff" stroke="none" pointer-events="none"/><rect x="506.5" y="292.5" width="371" height="180" fill="none" stroke="#d79b00" stroke-opacity="0.4" stroke-dasharray="3 3" pointer-events="none"/><rect x="506.5" y="292.5" width="371" height="180" fill="none" stroke="#d79b00" stroke-opacity="0.4" stroke-dasharray="3 3" pointer-events="none"/><g transform="translate(597.5,296.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="188" height="171" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; width: 189px; white-space: nowrap; overflow-wrap: normal; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;white-space:normal;"><b style="">Capture Proxy ALB Target Group<br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /></b></div></div></foreignObject><text x="94" y="92" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><rect x="669.5" y="363" width="131" height="49" rx="1" ry="1" fill="#000000" stroke="#000000" transform="translate(2,3)" opacity="0.25"/><rect x="669.5" y="363" width="131" height="49" rx="1" ry="1" fill="#ffffff" stroke="#dddddd" pointer-events="none"/><g transform="translate(671.5,381.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="127" height="12" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><div style="text-align: right;">            Capture Proxy 2</div></div></div></foreignObject><text x="64" y="12" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><path d="M 678 373 L 706 373 L 706 401 L 678 401 Z" fill="url(#mx-gradient-f78e04-1-d05c17-1-s-0)" stroke="#ffffff" stroke-miterlimit="10" pointer-events="none"/><path d="M 691.42 375.8 C 691.01 375.8 690.66 376.15 690.66 376.56 L 690.66 378.92 L 691.54 378.92 L 691.54 376.67 L 702.32 376.67 L 702.32 387.45 L 700.07 387.45 L 700.07 388.33 L 702.44 388.33 C 702.85 388.33 703.2 387.98 703.2 387.57 L 703.2 376.56 C 703.2 376.15 702.85 375.8 702.44 375.8 Z M 687.08 379.82 L 687.08 381.19 L 686.97 381.19 C 686.55 381.19 686.21 381.54 686.21 381.95 L 686.21 382.08 L 684.82 382.08 L 684.82 382.95 L 686.21 382.95 L 686.21 384.32 L 684.82 384.32 L 684.82 385.2 L 686.21 385.2 L 686.21 386.56 L 684.82 386.56 L 684.82 387.44 L 686.21 387.44 L 686.21 388.81 L 684.82 388.81 L 684.82 389.69 L 686.21 389.69 L 686.21 391.05 L 684.82 391.05 L 684.82 391.92 L 686.21 391.92 L 686.21 392.05 C 686.21 392.47 686.55 392.81 686.97 392.81 L 687.08 392.81 L 687.08 394.19 L 687.96 394.19 L 687.96 392.81 L 689.33 392.81 L 689.33 394.19 L 690.2 394.19 L 690.2 392.81 L 691.56 392.81 L 691.56 394.19 L 692.44 394.19 L 692.44 392.81 L 693.79 392.81 L 693.79 394.19 L 694.67 394.19 L 694.67 392.81 L 696.05 392.81 L 696.05 394.19 L 696.93 394.19 L 696.93 392.81 L 697.05 392.81 C 697.47 392.81 697.81 392.47 697.81 392.05 L 697.81 391.92 L 699.16 391.92 L 699.16 391.05 L 697.81 391.05 L 697.81 389.69 L 699.16 389.69 L 699.16 388.81 L 697.81 388.81 L 697.81 387.44 L 699.16 387.44 L 699.16 386.56 L 697.81 386.56 L 697.81 385.2 L 699.16 385.2 L 699.16 384.32 L 697.81 384.32 L 697.81 382.95 L 699.16 382.95 L 699.16 382.08 L 697.81 382.08 L 697.81 381.95 C 697.81 381.54 697.47 381.19 697.05 381.19 L 696.93 381.19 L 696.93 379.82 L 696.05 379.82 L 696.05 381.19 L 694.67 381.19 L 694.67 379.82 L 693.79 379.82 L 693.79 381.19 L 692.44 381.19 L 692.44 379.82 L 691.56 379.82 L 691.56 381.19 L 690.2 381.19 L 690.2 379.82 L 689.33 379.82 L 689.33 381.19 L 687.96 381.19 L 687.96 379.82 Z M 687.08 382.07 L 696.94 382.07 L 696.94 391.94 L 687.08 391.94 Z M 681.56 385.67 C 681.15 385.67 680.8 386.02 680.8 386.43 L 680.8 397.44 C 680.8 397.85 681.15 398.2 681.56 398.2 L 692.58 398.2 C 692.99 398.2 693.34 397.85 693.34 397.44 L 693.34 395.07 L 692.46 395.07 L 692.46 397.33 L 681.68 397.33 L 681.68 386.55 L 683.91 386.55 L 683.91 385.67 Z" fill="#ffffff" stroke="none" pointer-events="none"/><rect x="604" y="324.5" width="131" height="49" rx="1" ry="1" fill="#000000" stroke="#000000" transform="translate(2,3)" opacity="0.25"/><rect x="604" y="324.5" width="131" height="49" rx="1" ry="1" fill="#ffffff" stroke="#dddddd" pointer-events="none"/><g transform="translate(604.5,328.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="130" height="41" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><div style="text-align: right;">            </div><div style="text-align: right;">             Capture Proxy 1</div><div style="text-align: right;"><br /></div></div></div></foreignObject><text x="65" y="27" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><path d="M 611.5 335 L 639.5 335 L 639.5 363 L 611.5 363 Z" fill="url(#mx-gradient-f78e04-1-d05c17-1-s-0)" stroke="#ffffff" stroke-miterlimit="10" pointer-events="none"/><path d="M 624.92 337.8 C 624.51 337.8 624.16 338.15 624.16 338.56 L 624.16 340.92 L 625.04 340.92 L 625.04 338.67 L 635.82 338.67 L 635.82 349.45 L 633.57 349.45 L 633.57 350.33 L 635.94 350.33 C 636.35 350.33 636.7 349.98 636.7 349.57 L 636.7 338.56 C 636.7 338.15 636.35 337.8 635.94 337.8 Z M 620.58 341.82 L 620.58 343.19 L 620.47 343.19 C 620.05 343.19 619.71 343.54 619.71 343.95 L 619.71 344.08 L 618.32 344.08 L 618.32 344.95 L 619.71 344.95 L 619.71 346.32 L 618.32 346.32 L 618.32 347.2 L 619.71 347.2 L 619.71 348.56 L 618.32 348.56 L 618.32 349.44 L 619.71 349.44 L 619.71 350.81 L 618.32 350.81 L 618.32 351.69 L 619.71 351.69 L 619.71 353.05 L 618.32 353.05 L 618.32 353.92 L 619.71 353.92 L 619.71 354.05 C 619.71 354.47 620.05 354.81 620.47 354.81 L 620.58 354.81 L 620.58 356.19 L 621.46 356.19 L 621.46 354.81 L 622.83 354.81 L 622.83 356.19 L 623.7 356.19 L 623.7 354.81 L 625.06 354.81 L 625.06 356.19 L 625.94 356.19 L 625.94 354.81 L 627.29 354.81 L 627.29 356.19 L 628.17 356.19 L 628.17 354.81 L 629.55 354.81 L 629.55 356.19 L 630.43 356.19 L 630.43 354.81 L 630.55 354.81 C 630.97 354.81 631.31 354.47 631.31 354.05 L 631.31 353.92 L 632.66 353.92 L 632.66 353.05 L 631.31 353.05 L 631.31 351.69 L 632.66 351.69 L 632.66 350.81 L 631.31 350.81 L 631.31 349.44 L 632.66 349.44 L 632.66 348.56 L 631.31 348.56 L 631.31 347.2 L 632.66 347.2 L 632.66 346.32 L 631.31 346.32 L 631.31 344.95 L 632.66 344.95 L 632.66 344.08 L 631.31 344.08 L 631.31 343.95 C 631.31 343.54 630.97 343.19 630.55 343.19 L 630.43 343.19 L 630.43 341.82 L 629.55 341.82 L 629.55 343.19 L 628.17 343.19 L 628.17 341.82 L 627.29 341.82 L 627.29 343.19 L 625.94 343.19 L 625.94 341.82 L 625.06 341.82 L 625.06 343.19 L 623.7 343.19 L 623.7 341.82 L 622.83 341.82 L 622.83 343.19 L 621.46 343.19 L 621.46 341.82 Z M 620.58 344.07 L 630.44 344.07 L 630.44 353.94 L 620.58 353.94 Z M 615.06 347.67 C 614.65 347.67 614.3 348.02 614.3 348.43 L 614.3 359.44 C 614.3 359.85 614.65 360.2 615.06 360.2 L 626.08 360.2 C 626.49 360.2 626.84 359.85 626.84 359.44 L 626.84 357.07 L 625.96 357.07 L 625.96 359.33 L 615.18 359.33 L 615.18 348.55 L 617.41 348.55 L 617.41 347.67 Z" fill="#ffffff" stroke="none" pointer-events="none"/><path d="M 526 300.5 L 574 300.5 L 574 348.5 L 526 348.5 Z" fill="url(#mx-gradient-f78e04-1-d05c17-1-s-0)" stroke="#ffffff" stroke-miterlimit="10" pointer-events="none"/><path d="M 548.63 305.31 C 548.51 305.31 548.38 305.35 548.28 305.41 L 533.37 314.01 C 533.14 314.14 532.99 314.39 532.99 314.66 L 532.99 333.42 C 532.99 333.68 533.14 333.93 533.37 334.07 L 549.59 343.56 C 549.82 343.7 550.11 343.7 550.35 343.56 L 565.23 334.9 C 565.47 334.77 565.61 334.52 565.61 334.25 C 565.61 333.98 565.46 333.73 565.22 333.59 L 558.08 329.57 C 557.85 329.43 557.56 329.44 557.33 329.57 L 550.05 333.84 L 541.56 328.95 L 541.56 319.17 L 549.03 314.89 C 549.26 314.76 549.41 314.51 549.41 314.24 L 549.41 306.06 C 549.41 305.86 549.32 305.66 549.18 305.52 C 549.03 305.38 548.83 305.3 548.63 305.31 Z M 551.48 305.34 C 551.27 305.33 551.08 305.41 550.93 305.55 C 550.78 305.69 550.7 305.89 550.7 306.09 L 550.7 314.28 C 550.7 314.55 550.85 314.79 551.08 314.93 L 558.42 319.2 L 558.42 327.76 C 558.42 328.03 558.56 328.28 558.8 328.41 L 565.87 332.52 C 566.1 332.66 566.39 332.66 566.63 332.52 C 566.86 332.39 567.01 332.14 567 331.87 L 567 314.68 C 567.01 314.41 566.86 314.16 566.63 314.03 L 551.84 305.44 C 551.73 305.37 551.6 305.34 551.48 305.34 Z M 547.9 307.37 L 547.9 313.8 L 540.43 318.08 C 540.19 318.21 540.05 318.46 540.05 318.73 L 540.05 329.38 C 540.05 329.65 540.19 329.9 540.42 330.04 L 549.67 335.37 C 549.91 335.5 550.2 335.5 550.43 335.36 L 557.72 331.09 L 563.34 334.26 L 549.97 342.04 L 534.5 332.98 L 534.5 315.09 Z M 552.21 307.4 L 565.5 315.11 L 565.5 330.56 L 559.93 327.33 L 559.93 318.77 C 559.93 318.5 559.78 318.25 559.55 318.11 L 552.21 313.84 Z" fill="#ffffff" stroke="none" pointer-events="none"/><g transform="translate(513.5,356.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="72" height="12" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(35, 47, 62); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><div>Amazon ECS</div></div></div></foreignObject><text x="36" y="12" fill="#232F3E" text-anchor="middle" font-size="12px" font-family="Arial"><div>Amazon ECS</div></text></switch></g><path d="M 1107 445 L 1107 476 L 966 476 L 966 502.03" fill="none" stroke="#000000" stroke-width="2" stroke-miterlimit="10" pointer-events="none"/><path d="M 962 496.26 L 966 504.26 L 970 496.26" fill="none" stroke="#000000" stroke-width="2" stroke-miterlimit="10" pointer-events="none"/><path d="M 193 351 L 241 351 L 241 399 L 193 399 Z" fill="url(#mx-gradient-60a337-1-277116-1-s-0)" stroke="#ffffff" stroke-miterlimit="10" pointer-events="none"/><path d="M 215.88 355.82 C 211.9 355.8 207.99 356.4 204.9 357.35 C 203.25 357.86 201.83 358.46 200.74 359.15 C 199.65 359.83 198.84 360.6 198.6 361.61 C 198.58 361.68 198.57 361.74 198.58 361.81 L 198.58 361.81 C 198.58 361.81 198.58 361.81 198.58 361.81 C 198.58 361.83 198.58 361.86 198.58 361.89 C 198.59 362.02 198.61 362.14 198.63 362.27 L 202.45 390.14 C 202.45 390.16 202.46 390.18 202.46 390.2 C 202.65 391.05 203.33 391.62 204.16 392.08 C 204.98 392.54 206.03 392.92 207.22 393.23 C 209.59 393.84 212.51 394.18 215.18 394.17 C 218.57 394.2 221.63 393.91 224.04 393.37 C 226.45 392.83 228.25 392.11 229.14 390.85 C 229.21 390.75 229.26 390.64 229.27 390.52 L 230.96 378.49 C 231.35 378.59 231.74 378.67 232.11 378.74 C 232.79 378.86 233.42 378.92 234.02 378.81 C 234.31 378.75 234.61 378.65 234.88 378.43 C 235.14 378.21 235.34 377.88 235.4 377.53 C 235.4 377.49 235.4 377.45 235.41 377.42 C 235.43 376.38 234.68 375.69 233.88 375.07 C 233.2 374.53 232.39 374.06 231.64 373.68 L 233.29 361.85 C 233.31 361.76 233.3 361.66 233.28 361.57 C 233.02 360.53 232.17 359.77 231.08 359.1 C 229.99 358.44 228.59 357.87 227.02 357.39 C 223.87 356.43 220.04 355.85 216.68 355.83 L 216.68 355.83 C 216.41 355.82 216.15 355.82 215.88 355.82 Z M 215.88 357.32 C 216.13 357.32 216.39 357.32 216.64 357.33 L 216.65 357.33 C 216.65 357.33 216.66 357.33 216.66 357.33 C 219.85 357.35 223.6 357.92 226.58 358.83 C 228.07 359.28 229.37 359.82 230.3 360.38 C 231.16 360.91 231.63 361.45 231.77 361.83 C 231.64 362.26 231.25 362.73 230.53 363.2 C 229.69 363.73 228.46 364.23 226.99 364.63 C 224.05 365.44 220.13 365.88 216.29 365.88 C 216.29 365.88 216.28 365.88 216.28 365.88 C 212.73 365.95 208.55 365.55 205.31 364.75 C 203.69 364.35 202.31 363.85 201.38 363.29 C 200.6 362.82 200.21 362.38 200.1 361.98 L 200.09 361.89 C 200.21 361.52 200.66 360.97 201.54 360.41 C 202.46 359.83 203.78 359.26 205.34 358.78 C 208.26 357.89 212.06 357.3 215.88 357.32 Z M 231.42 364.4 L 229.71 376.58 C 229.31 376.46 228.92 376.34 228.56 376.24 C 225.37 375.11 220.68 373.11 217.62 371.6 C 217.62 371.59 217.62 371.58 217.62 371.57 C 217.62 370.63 216.84 369.85 215.9 369.85 C 214.96 369.85 214.18 370.63 214.18 371.57 C 214.18 372.52 214.96 373.3 215.9 373.3 C 216.29 373.3 216.66 373.16 216.95 372.94 C 220.08 374.48 224.79 376.49 228.07 377.65 C 228.08 377.66 228.1 377.66 228.11 377.66 C 228.52 377.79 229 377.94 229.5 378.08 L 227.83 390.06 C 227.34 390.66 225.89 391.42 223.71 391.91 C 221.45 392.42 218.49 392.71 215.18 392.68 C 215.17 392.68 215.17 392.68 215.17 392.68 C 212.64 392.69 209.8 392.35 207.59 391.78 C 206.48 391.49 205.54 391.14 204.89 390.78 C 204.24 390.41 203.96 390.05 203.92 389.88 L 200.45 364.48 C 200.5 364.51 200.56 364.55 200.61 364.58 C 201.75 365.26 203.24 365.79 204.95 366.21 C 208.37 367.05 212.62 367.45 216.3 367.38 C 220.25 367.38 224.25 366.94 227.39 366.08 C 228.95 365.65 230.3 365.12 231.34 364.45 C 231.37 364.44 231.39 364.41 231.42 364.4 Z M 215.9 371.35 C 216.04 371.35 216.13 371.44 216.13 371.57 C 216.13 371.71 216.04 371.8 215.9 371.8 C 215.77 371.8 215.68 371.71 215.68 371.57 C 215.68 371.44 215.77 371.35 215.9 371.35 Z M 231.41 375.26 C 231.96 375.55 232.51 375.9 232.96 376.25 C 233.57 376.73 233.8 377.17 233.84 377.31 C 233.81 377.32 233.81 377.33 233.74 377.34 C 233.47 377.39 232.96 377.37 232.37 377.27 C 232 377.2 231.59 377.1 231.17 377 Z" fill="#ffffff" stroke="none" pointer-events="none"/><g transform="translate(185.5,406.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="62" height="12" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(35, 47, 62); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;">Amazon S3</div></div></foreignObject><text x="31" y="12" fill="#232F3E" text-anchor="middle" font-size="12px" font-family="Arial">Amazon S3</text></switch></g><g transform="translate(343.5,460.5)rotate(90,0,6)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="1" height="12" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(255, 255, 255); line-height: 1.2; vertical-align: top; white-space: nowrap; font-weight: bold; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><font><br /></font></div></div></foreignObject><text x="0" y="12" fill="#ffffff" text-anchor="middle" font-size="12px" font-family="Arial" font-weight="bold"><font><br></font></text></switch></g><path d="M 692 293 L 692 253 L 683 253 L 683 221.47" fill="none" stroke="#000000" stroke-width="2" stroke-miterlimit="10" pointer-events="none"/><path d="M 687 227.24 L 683 219.24 L 679 227.24" fill="none" stroke="#000000" stroke-width="2" stroke-miterlimit="10" pointer-events="none"/><ellipse cx="690.5" cy="261" rx="15" ry="15" fill="#000000" stroke="#000000" pointer-events="none"/><g transform="translate(685.5,250.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="10" height="20" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; width: 11px; white-space: nowrap; overflow-wrap: normal; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;white-space:normal;"><div style="font-size: 18px;"><font style="font-size: 18px;"><b><font color="#ffffff" style="font-size: 18px;">2</font></b></font></div></div></div></foreignObject><text x="5" y="16" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><path d="M 878 383 L 987 383 L 1032.03 383" fill="none" stroke="#000000" stroke-width="2" stroke-miterlimit="10" pointer-events="none"/><path d="M 1026.26 387 L 1034.26 383 L 1026.26 379" fill="none" stroke="#000000" stroke-width="2" stroke-miterlimit="10" pointer-events="none"/><ellipse cx="957" cy="382.5" rx="15" ry="15" fill="#000000" stroke="#000000" pointer-events="none"/><g transform="translate(951.5,370.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="10" height="23" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; width: 11px; white-space: nowrap; overflow-wrap: normal; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;white-space:normal;"><div style="font-size: 21px;"><font size="1"><b><font color="#ffffff" style="font-size: 18px;">2</font></b></font></div></div></div></foreignObject><text x="5" y="18" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><ellipse cx="457" cy="378.5" rx="15" ry="15" fill="#000000" stroke="#000000" pointer-events="none"/><g transform="translate(451.5,368.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="10" height="20" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; width: 11px; white-space: nowrap; overflow-wrap: normal; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;white-space:normal;"><div style="font-size: 18px;"><font style="font-size: 18px;"><b><font color="#ffffff" style="font-size: 18px;">2</font></b></font></div></div></div></foreignObject><text x="5" y="16" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><rect x="499.5" y="759.5" width="341" height="140" fill="none" stroke="#3333ff" stroke-width="2" stroke-dasharray="6 6" pointer-events="none"/><g transform="translate(669.5,772.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="1" height="113" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; width: 1px; white-space: nowrap; overflow-wrap: normal; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;white-space:normal;"><br /><br /><br /><br /><br /><br /><br /><br /></div></div></foreignObject><text x="0" y="63" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial"><br><br><br><br><br><br><br><br></text></switch></g><image x="645" y="794" width="64" height="64" xlink:href="" preserveAspectRatio="none" pointer-events="none"/><g transform="translate(568.5,866.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="218" height="27" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><div><font>Amazon OpenSearch or</font></div><div><font>Elasticsearch/OpenSearch self-managed</font></div></div></div></foreignObject><text x="109" y="20" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><g transform="translate(620.5,767.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="107" height="17" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 14px; font-family: Arial; color: rgb(255, 255, 255); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: left;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><font color="#000000" style="font-size: 16px;"><b>Target Cluster</b><br /></font></div></div></foreignObject><text x="54" y="16" fill="#ffffff" text-anchor="middle" font-size="14px" font-family="Arial">[Not supported by viewer]</text></switch></g><g transform="translate(527.5,747.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="1" height="17" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 14px; font-family: Arial; color: rgb(255, 255, 255); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: left;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><div style="font-size: 16px;"></div><b style="font-size: 16px;"><font color="#1a1a1a" style="font-size: 16px;"><br /></font></b></div></div></foreignObject><text x="0" y="16" fill="#ffffff" text-anchor="middle" font-size="14px" font-family="Arial"><div style="font-size: 16px;"></div><b style="font-size: 16px;"><font color="#1a1a1a" style="font-size: 16px;"><br></font></b></text></switch></g><g transform="translate(707.5,828.5)rotate(90,0,7.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="1" height="15" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(255, 255, 255); line-height: 1.2; vertical-align: top; white-space: nowrap; font-weight: bold; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><font style="font-size: 14px;"><br /></font></div></div></foreignObject><text x="0" y="14" fill="#ffffff" text-anchor="middle" font-size="12px" font-family="Arial" font-weight="bold"><font style="font-size: 14px;"><br></font></text></switch></g><g transform="translate(841.5,883.5)rotate(90,0,7.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="1" height="15" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(255, 255, 255); line-height: 1.2; vertical-align: top; white-space: nowrap; font-weight: bold; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><font style="font-size: 14px;"><br /></font></div></div></foreignObject><text x="0" y="14" fill="#ffffff" text-anchor="middle" font-size="12px" font-family="Arial" font-weight="bold"><font style="font-size: 14px;"><br></font></text></switch></g><g transform="translate(841.5,850.5)rotate(90,0,7.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="1" height="15" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(255, 255, 255); line-height: 1.2; vertical-align: top; white-space: nowrap; font-weight: bold; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><font style="font-size: 14px;"><br /></font></div></div></foreignObject><text x="0" y="14" fill="#ffffff" text-anchor="middle" font-size="12px" font-family="Arial" font-weight="bold"><font style="font-size: 14px;"><br></font></text></switch></g><ellipse cx="306" cy="472.5" rx="15" ry="15" fill="#000000" stroke="#000000" pointer-events="none"/><g transform="translate(300.5,462.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="10" height="20" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; width: 11px; white-space: nowrap; overflow-wrap: normal; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;white-space:normal;"><div style="font-size: 18px;"><b><font color="#ffffff" style="font-size: 18px;">3</font></b></div></div></div></foreignObject><text x="5" y="16" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><path d="M 966 686 L 966 723 L 755 723 L 755 755.53" fill="none" stroke="#000000" stroke-width="2" stroke-miterlimit="10" pointer-events="none"/><path d="M 751 749.76 L 755 757.76 L 759 749.76" fill="none" stroke="#000000" stroke-width="2" stroke-miterlimit="10" pointer-events="none"/><rect x="1036.5" y="304.5" width="140" height="140" fill="none" stroke="#3333ff" stroke-opacity="0.4" stroke-dasharray="3 3" pointer-events="none"/><rect x="1036.5" y="304.5" width="140" height="140" fill="none" stroke="#3333ff" stroke-opacity="0.4" stroke-dasharray="3 3" pointer-events="none"/><g transform="translate(1062.5,317.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="88" height="113" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; width: 89px; white-space: nowrap; overflow-wrap: normal; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;white-space:normal;"><b>Event Streamer<br /></b><br /><br /><br /><br /><br /><br /><br /></div></div></foreignObject><text x="44" y="63" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><image x="1082.22" y="349" width="48" height="48" xlink:href="" preserveAspectRatio="none" pointer-events="none"/><g transform="translate(1069.5,405.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="73" height="12" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><div><font>Amazon MSK</font></div></div></div></foreignObject><text x="37" y="12" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><ellipse cx="882" cy="726" rx="15" ry="15" fill="#000000" stroke="#000000" pointer-events="none"/><g transform="translate(876.5,713.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="10" height="24" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; width: 11px; white-space: nowrap; overflow-wrap: normal; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;white-space:normal;"><div style="font-size: 22px;"><font size="1"><b><font color="#ffffff" style="font-size: 18px;">4</font></b></font></div></div></div></foreignObject><text x="5" y="18" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><path d="M 402 686 L 402 723 L 585 723 L 585 755.53" fill="none" stroke="#000000" stroke-width="2" stroke-miterlimit="10" pointer-events="none"/><path d="M 581 749.76 L 585 757.76 L 589 749.76" fill="none" stroke="#000000" stroke-width="2" stroke-miterlimit="10" pointer-events="none"/><ellipse cx="480" cy="722.5" rx="15" ry="15" fill="#000000" stroke="#000000" pointer-events="none"/><g transform="translate(474.5,710.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="10" height="23" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; width: 11px; white-space: nowrap; overflow-wrap: normal; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;white-space:normal;"><div style="font-size: 21px;"><font size="1"><b><font color="#ffffff" style="font-size: 18px;">3</font></b></font></div></div></div></foreignObject><text x="5" y="18" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><ellipse cx="29" cy="241" rx="15" ry="15" fill="#000000" stroke="#000000" pointer-events="none"/><g transform="translate(23.5,230.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="10" height="20" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; width: 11px; white-space: nowrap; overflow-wrap: normal; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;white-space:normal;"><div style="font-size: 18px;"><b><font color="#ffffff" style="font-size: 18px;">6</font></b></div></div></div></foreignObject><text x="5" y="16" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><ellipse cx="1036.5" cy="472.5" rx="15" ry="15" fill="#000000" stroke="#000000" pointer-events="none"/><g transform="translate(1031.5,460.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="10" height="24" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; width: 11px; white-space: nowrap; overflow-wrap: normal; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;white-space:normal;"><div style="font-size: 22px;"><font size="1"><b><font color="#ffffff" style="font-size: 18px;">4</font></b></font></div></div></div></foreignObject><text x="5" y="18" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g></g></svg> \ No newline at end of file diff --git a/images/migrations/migrations-architecture-overview.png b/images/migrations/migrations-architecture-overview.png new file mode 100644 index 0000000000..3002da3a87 Binary files /dev/null and b/images/migrations/migrations-architecture-overview.png differ diff --git a/images/star-tree-index.png b/images/star-tree-index.png new file mode 100644 index 0000000000..81309e1195 Binary files /dev/null and b/images/star-tree-index.png differ diff --git a/release-notes/opensearch-documentation-release-notes-2.18.0.md b/release-notes/opensearch-documentation-release-notes-2.18.0.md new file mode 100644 index 0000000000..30147a37f0 --- /dev/null +++ b/release-notes/opensearch-documentation-release-notes-2.18.0.md @@ -0,0 +1,39 @@ +# OpenSearch Documentation Website 2.18.0 Release Notes + +The OpenSearch 2.18.0 documentation includes the following additions and updates. + +## New documentation for 2.18.0 + +- Update SQL/PPL multiple value field limitation [#8646](https://github.com/opensearch-project/documentation-website/pull/8646) +- Add new use cases to ML Inference Search Response Processor [#8639](https://github.com/opensearch-project/documentation-website/pull/8639) +- Documentation for query field name and datatype in query shape [#8631](https://github.com/opensearch-project/documentation-website/pull/8631) +- add document for Query Insights health_stats API [#8627](https://github.com/opensearch-project/documentation-website/pull/8627) +- Add new indexing parameter and update performance tuning instruction [#8623](https://github.com/opensearch-project/documentation-website/pull/8623) +- Update default engine from nmslib to faiss [#8620](https://github.com/opensearch-project/documentation-website/pull/8620) +- Update documentation for coordination settings and batch size [#8604](https://github.com/opensearch-project/documentation-website/pull/8604) +- Update JDK version for 2.x distributions [#8603](https://github.com/opensearch-project/documentation-website/pull/8603) +- Add documentation for star tree index feature [#8598](https://github.com/opensearch-project/documentation-website/pull/8598) +- Add URI paths for cluster stats filtering. [#8595](https://github.com/opensearch-project/documentation-website/pull/8595) +- Adding documentation for _list APIs [#8594](https://github.com/opensearch-project/documentation-website/pull/8594) +- Adds documentation about byField rerank processor [#8593](https://github.com/opensearch-project/documentation-website/pull/8593) +- Updating tiered caching settings [#8592](https://github.com/opensearch-project/documentation-website/pull/8592) +- Add documentation changes for cluster level dynamic limit settings to block cat/indices, _cat/shards and _cat/segments [#8590](https://github.com/opensearch-project/documentation-website/pull/8590) +- Add doc for dynamic threadpool settings [#8588](https://github.com/opensearch-project/documentation-website/pull/8588) +- Add value range for the search backpressure settings [#8555](https://github.com/opensearch-project/documentation-website/pull/8555) +- Add new rename_alias parameters for restore-snapshot [#8544](https://github.com/opensearch-project/documentation-website/pull/8544) +- Add SQL PIT reference [#8541](https://github.com/opensearch-project/documentation-website/pull/8541) +- Document cluster.default_number_of_replicas and update index.number_of_replicas [#8526](https://github.com/opensearch-project/documentation-website/pull/8526) +- Msearch template API returns status code in each search response [#8522](https://github.com/opensearch-project/documentation-website/pull/8522) +- document the new `analysis-phonenumber` plugin [#8469](https://github.com/opensearch-project/documentation-website/pull/8469) +- Adds documentation for providing search pipeline id in the search/msearch request [#8372](https://github.com/opensearch-project/documentation-website/pull/8372) +- Data Stream support for Audit- Log [#8356](https://github.com/opensearch-project/documentation-website/pull/8356) +- Update documentation to reflect k-NN FAISS AVX512 support [#8307](https://github.com/opensearch-project/documentation-website/pull/8307) +- [Feature]: add ignore missing to text chunking processor [#8266](https://github.com/opensearch-project/documentation-website/pull/8266) +- Add documentation for workload management [#8228](https://github.com/opensearch-project/documentation-website/pull/8228) + +## In progress documentation for 2.18.0 + +- [Workspace] Add documentation for workspace and ACL [#8643](https://github.com/opensearch-project/documentation-website/pull/8643) +- add wlm feature overview [#8632](https://github.com/opensearch-project/documentation-website/pull/8632) +- Add querygroup lifecycle api documentation [#8628](https://github.com/opensearch-project/documentation-website/pull/8628) +- [Workload Management] Querygroup Lifecyle API docs [#8249](https://github.com/opensearch-project/documentation-website/pull/8249) diff --git a/spec-insert/.gitignore b/spec-insert/.gitignore new file mode 100644 index 0000000000..c9958b86d2 --- /dev/null +++ b/spec-insert/.gitignore @@ -0,0 +1,2 @@ +opensearch-openapi.yaml +rspec_examples.txt diff --git a/spec-insert/.rspec b/spec-insert/.rspec new file mode 100644 index 0000000000..c99d2e7396 --- /dev/null +++ b/spec-insert/.rspec @@ -0,0 +1 @@ +--require spec_helper diff --git a/spec-insert/.rubocop.yml b/spec-insert/.rubocop.yml new file mode 100644 index 0000000000..5b88e922f4 --- /dev/null +++ b/spec-insert/.rubocop.yml @@ -0,0 +1,29 @@ +require: rubocop-rake +AllCops: + Include: + - 'lib/**/*.rb' + - 'Rakefile' + NewCops: enable + +Metrics/CyclomaticComplexity: + Enabled: false +Metrics/MethodLength: + Enabled: false +Metrics/ParameterLists: + Enabled: false +Metrics/AbcSize: + Enabled: false +Metrics/PerceivedComplexity: + Enabled: false + +Layout/EmptyLineAfterGuardClause: + Enabled: false + +Style/MultilineBlockChain: + Enabled: false +Style/SingleLineMethods: + Enabled: false + +Naming/FileName: + Exclude: + - 'lib/jekyll-spec-insert.rb' # For Jekyll to recognize the plugin diff --git a/spec-insert/jekyll-spec-insert.gemspec b/spec-insert/jekyll-spec-insert.gemspec new file mode 100644 index 0000000000..d397f40af2 --- /dev/null +++ b/spec-insert/jekyll-spec-insert.gemspec @@ -0,0 +1,16 @@ +# frozen_string_literal: true + +Gem::Specification.new do |spec| + spec.name = 'jekyll-spec-insert' + spec.version = '0.1.0' + spec.authors = ['Theo Truong'] + spec.email = ['theo.nam.truong@gmail.com'] + + spec.summary = 'A Jekyll plugin for inserting OpenSearch OpenAPI specifications into Jekyll sites.' + + spec.files = Dir['lib/**/*.rb'] + spec.require_paths = ['lib'] + + spec.metadata['rubygems_mfa_required'] = 'true' + spec.required_ruby_version = '>= 3.1.0' +end diff --git a/spec-insert/lib/api/action.rb b/spec-insert/lib/api/action.rb new file mode 100644 index 0000000000..5ad3dded77 --- /dev/null +++ b/spec-insert/lib/api/action.rb @@ -0,0 +1,68 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# The OpenSearch Contributors require contributions made to +# this file be licensed under the Apache-2.0 license or a +# compatible open source license. + +# frozen_string_literal: true + +require_relative 'parameter' +require_relative 'operation' + +# A collection of operations that comprise a single API Action +# AKA operation-group +class Action + # @param [SpecHash] spec Parsed OpenAPI spec + def self.actions=(spec) + operations = spec.paths.flat_map do |url, ops| + ops.filter_map { |verb, op| Operation.new(op, url, verb) unless op['x-ignorable'] } + end + @actions = operations.group_by(&:group).values.map { |ops| Action.new(ops) }.index_by(&:full_name) + end + + # @return [Hash<String, Action>] API Actions indexed by operation-group + def self.actions + raise 'Actions not set' unless @actions + @actions + end + + # @return [Array<Operation>] Operations in the action + attr_reader :operations + + # @param [Array<Operation>] operations + def initialize(operations) + @operations = operations + @operation = operations.first + @spec = @operation&.spec + end + + # @return [Array<Parameter>] Input arguments. + def arguments; @arguments ||= Parameter.from_operations(@operations.map(&:spec)); end + + # @return [String] Full name of the action (i.e. namespace.action) + def full_name; @operation&.group; end + + # return [String] Name of the action + def name; @operation&.action; end + + # @return [String] Namespace of the action + def namespace; @operation&.namespace; end + + # @return [Array<String>] Sorted unique HTTP verbs + def http_verbs; @operations.map(&:http_verb).uniq.sort; end + + # @return [Array<String>] Unique URLs + def urls; @operations.map(&:url).uniq; end + + # @return [String] Description of the action + def description; @spec&.description; end + + # @return [Boolean] Whether the action is deprecated + def deprecated; @spec&.deprecated; end + + # @return [String] Deprecation message + def deprecation_message; @spec['x-deprecation-message']; end + + # @return [String] API reference + def api_reference; @operation&.external_docs&.url; end +end diff --git a/spec-insert/lib/api/operation.rb b/spec-insert/lib/api/operation.rb new file mode 100644 index 0000000000..6f9fb44cc4 --- /dev/null +++ b/spec-insert/lib/api/operation.rb @@ -0,0 +1,34 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# The OpenSearch Contributors require contributions made to +# this file be licensed under the Apache-2.0 license or a +# compatible open source license. + +# frozen_string_literal: true + +# An API Operation +class Operation + # @return [Openapi3Parser::Node::Operation] Operation Spec + attr_reader :spec + # @return [String] URL + attr_reader :url + # @return [String] HTTP Verb + attr_reader :http_verb + # @return [String] Operation Group + attr_reader :group + # @return [String] API Action + attr_reader :action + # @return [String] API Namespace + attr_reader :namespace + + # @param [Openapi3Parser::Node::Operation] spec Operation Spec + # @param [String] url + # @param [String] http_verb + def initialize(spec, url, http_verb) + @spec = spec + @url = url + @http_verb = http_verb.upcase + @group = spec['x-operation-group'] + @action, @namespace = @group.split('.').reverse + end +end diff --git a/spec-insert/lib/api/parameter.rb b/spec-insert/lib/api/parameter.rb new file mode 100644 index 0000000000..fbd87fd50e --- /dev/null +++ b/spec-insert/lib/api/parameter.rb @@ -0,0 +1,94 @@ +# frozen_string_literal: true + +module ArgLocation + PATH = :path + QUERY = :query +end + +# Represents a parameter of an API action +class Parameter + # @return [String] The name of the parameter + attr_reader :name + # @return [String] The description of the parameter + attr_reader :description + # @return [Boolean] Whether the parameter is required + attr_reader :required + # @return [SpecHash] The JSON schema of the parameter + attr_reader :schema + # @return [String] Argument type in documentation + attr_reader :doc_type + # @return [String] The default value of the parameter + attr_reader :default + # @return [Boolean] Whether the parameter is deprecated + attr_reader :deprecated + # @return [String] The deprecation message + attr_reader :deprecation_message + # @return [String] The OpenSearch version when the parameter was deprecated + attr_reader :version_deprecated + # @return [ArgLocation] The location of the parameter + attr_reader :location + + def initialize(name:, description:, required:, schema:, default:, deprecated:, deprecation_message:, + version_deprecated:, location:) + @name = name + @description = description + @required = required + @schema = schema + @doc_type = get_doc_type(schema).gsub('String / List', 'List').gsub('List / String', 'List') + @default = default + @deprecated = deprecated + @deprecation_message = deprecation_message + @version_deprecated = version_deprecated + @location = location + end + + # @param [SpecHash | nil] schema + # @return [String | nil] Documentation type + def get_doc_type(schema) + return nil if schema.nil? + union = schema.anyOf || schema.oneOf + return union.map { |sch| get_doc_type(sch) }.join(' / ') unless union.nil? + return 'Integer' if schema.type == 'integer' + return 'Float' if schema.type == 'number' + return 'Boolean' if schema.type == 'boolean' + return 'String' if schema.type == 'string' + return 'NULL' if schema.type == 'null' + return 'List' if schema.type == 'array' + 'Object' + end + + # @param [SpecHash] Full OpenAPI spec + def self.global=(spec) + @global = spec.components.parameters.filter { |_, p| p['x-global'] }.map { |_, p| from_parameters([p], 1) } + end + + # @return [Array<Parameter>] Global parameters + def self.global + raise 'Global parameters not set' unless @global + @global + end + + # @param [Array<SpecHash>] operations List of operations of the same group + # @return [Array<Parameter>] List of parameters of the operation group + def self.from_operations(operations) + operations.flat_map(&:parameters).filter { |param| !param['x-global'] } + .group_by(&:name).values.map { |params| from_parameters(params, operations.size) } + end + + # @param [Array<SpecHash>] params List of parameters of the same name + # @param [Integer] opts_count Number of operations involved + # @return [Parameter] Single parameter distilled from the list + def self.from_parameters(params, opts_count) + param = params.first || SpecHash.new + schema = param&.schema || SpecHash.new + Parameter.new(name: param.name, + description: param.description || schema.description, + required: params.filter(&:required).size >= opts_count, + schema:, + default: param.default || schema.default, + deprecated: param.deprecated || schema.deprecated, + deprecation_message: param['x-deprecation-message'] || schema['x-deprecation-message'], + version_deprecated: param['x-version-deprecated'] || schema['x-version-deprecated'], + location: params.any? { |p| p.in == 'path' } ? ArgLocation::PATH : ArgLocation::QUERY) + end +end diff --git a/spec-insert/lib/doc_processor.rb b/spec-insert/lib/doc_processor.rb new file mode 100644 index 0000000000..0aaa01061a --- /dev/null +++ b/spec-insert/lib/doc_processor.rb @@ -0,0 +1,62 @@ +# frozen_string_literal: true + +require 'pathname' +require_relative 'renderers/spec_insert' +require_relative 'spec_insert_error' + +# Processes a file, replacing spec-insert blocks with rendered content +class DocProcessor + START_MARKER = /<!-- spec_insert_start/ + END_MARKER = /<!-- spec_insert_end -->/ + + def initialize(file_path, logger:) + @file_path = Pathname(file_path) + @logger = logger + end + + # Processes the file, replacing spec-insert blocks with rendered content + # @param [Boolean] write_to_file Whether to write the changes back to the file + def process(write_to_file: true) + relative_path = @file_path.relative_path_from(Pathname.new(Dir.pwd)) + lines = File.readlines(@file_path) + original_content = lines.join + insertions = find_insertions(lines) + return if insertions.empty? + + insertions.reverse_each { |start, finish, insert| lines[start..finish] = insert.render } + rendered_content = lines.join + if write_to_file && rendered_content != original_content + File.write(@file_path, rendered_content) + @logger.info "Spec components inserted into #{relative_path} successfully." + end + rendered_content + rescue SpecInsertError => e + @logger.error "Error processing #{relative_path}. #{e.message}" + end + + private + + # @return Array<[Integer, Integer, SpecInsert]> + def find_insertions(lines) + start_indices = lines.each_with_index + .filter { |line, _index| line.match?(START_MARKER) } + .map { |_line, index| index } + end_indices = start_indices.map do |index| + (index..lines.length - 1).find { |i| lines[i].match?(END_MARKER) } + end.compact + + validate_markers!(start_indices, end_indices) + + start_indices.zip(end_indices).map do |start, finish| + [start, finish, SpecInsert.new(lines[start..finish])] + end + end + + # @param [Array<Integer>] start_indices + # @param [Array<Integer>] end_indices + def validate_markers!(start_indices, end_indices) + return if start_indices.length == end_indices.length && + start_indices.zip(end_indices).flatten.each_cons(2).all? { |a, b| a < b } + raise SpecInsertError, 'Mismatched "spec_insert_start" and "spec_insert_end" markers.' + end +end diff --git a/spec-insert/lib/insert_arguments.rb b/spec-insert/lib/insert_arguments.rb new file mode 100644 index 0000000000..6216b8a3e0 --- /dev/null +++ b/spec-insert/lib/insert_arguments.rb @@ -0,0 +1,72 @@ +# frozen_string_literal: true + +# Doc Insert Arguments +class InsertArguments + COLUMNS = %w[Parameter Description Required Type Default].freeze + DEFAULT_COLUMNS = %w[Parameter Type Description].freeze + attr_reader :raw + + # @param [Array<String>] lines the lines between <!-- doc_insert_start and --> + def initialize(lines) + end_index = lines.each_with_index.find { |line, _index| line.match?(/^\s*-->/) }&.last&.- 1 + @raw = lines[1..end_index].filter { |line| line.include?(':') }.to_h do |line| + key, value = line.split(':') + [key.strip, value.strip] + end + end + + # @return [String] + def api + @raw['api'] + end + + # @return [String] + def component + @raw['component'] + end + + # @return [Array<String>] + def columns + cols = parse_array(@raw['columns']) || DEFAULT_COLUMNS + invalid = cols - COLUMNS + raise ArgumentError, "Invalid column(s): #{invalid.join(', ')}" unless invalid.empty? + cols + end + + # @return [Boolean] + def pretty + parse_boolean(@raw['pretty'], default: false) + end + + # @return [Boolean] + def include_global + parse_boolean(@raw['include_global'], default: false) + end + + # @return [Boolean] + def include_deprecated + parse_boolean(@raw['include_deprecated'], default: true) + end + + # @return [Boolean] + def omit_header + parse_boolean(@raw['omit_header'], default: false) + end + + private + + # @param [String] value comma-separated array + def parse_array(value) + return nil if value.nil? + value.split(',').map(&:strip) + end + + # @param [String] value + # @param [Boolean] default value to return when nil + def parse_boolean(value, default:) + return default if value.nil? + return true if value.in?(%w[true True TRUE yes Yes YES 1]) + return false if value.in?(%w[false False FALSE no No NO 0]) + raise ArgumentError, "Invalid boolean value: #{value}" + end +end diff --git a/spec-insert/lib/jekyll-spec-insert.rb b/spec-insert/lib/jekyll-spec-insert.rb new file mode 100644 index 0000000000..14a8997cc8 --- /dev/null +++ b/spec-insert/lib/jekyll-spec-insert.rb @@ -0,0 +1,56 @@ +# frozen_string_literal: true + +require 'active_support/all' +require 'listen' +require 'yaml' +require_relative 'spec_hash' +require_relative 'doc_processor' + +# Jekyll plugin to insert document components generated from the spec into the Jekyll site +class JekyllSpecInsert < Jekyll::Command + # @param [Mercenary::Program] prog + def self.init_with_program(prog) + prog.command(:'spec-insert') do |c| + c.syntax 'spec-insert [options]' + c.option 'watch', '--watch', '-W', 'Watch for changes and rebuild' + c.option 'refresh-spec', '--refresh-spec', '-R', 'Redownload the OpenSearch API specification' + c.action do |_args, options| + spec_file = File.join(Dir.pwd, 'spec-insert/opensearch-openapi.yaml') + excluded_paths = YAML.load_file('_config.yml')['exclude'] + download_spec(spec_file, forced: options['refresh-spec']) + SpecHash.load_file(spec_file) + run_once(excluded_paths) + watch(excluded_paths) if options['watch'] + end + end + end + + def self.download_spec(spec_file, forced: false) + return if !forced && File.exist?(spec_file) && (File.mtime(spec_file) > 1.day.ago) + Jekyll.logger.info 'Downloading OpenSearch API specification...' + system 'curl -L -X GET ' \ + 'https://github.com/opensearch-project/opensearch-api-specification' \ + '/releases/download/main-latest/opensearch-openapi.yaml ' \ + "-o #{spec_file}" + end + + def self.run_once(excluded_paths) + excluded_paths = excluded_paths.map { |path| File.join(Dir.pwd, path) } + Dir.glob(File.join(Dir.pwd, '**/*.md')) + .filter { |file| excluded_paths.none? { |excluded| file.start_with?(excluded) } } + .each { |file| DocProcessor.new(file, logger: Jekyll.logger).process } + end + + def self.watch(excluded_paths) + Jekyll.logger.info "\nWatching for changes...\n" + excluded_paths = excluded_paths.map { |path| /\.#{path}$/ } + + Listen.to(Dir.pwd, only: /\.md$/, ignore: excluded_paths) do |modified, added, _removed| + (modified + added).each { |file| DocProcessor.new(file, logger: Jekyll.logger).process } + end.start + + trap('INT') { exit } + trap('TERM') { exit } + sleep + end +end diff --git a/spec-insert/lib/renderers/base_mustache_renderer.rb b/spec-insert/lib/renderers/base_mustache_renderer.rb new file mode 100644 index 0000000000..b3d756304c --- /dev/null +++ b/spec-insert/lib/renderers/base_mustache_renderer.rb @@ -0,0 +1,20 @@ +# frozen_string_literal: true + +require 'mustache' + +# Base Mustache Renderer +class BaseMustacheRenderer < Mustache + self.template_path = "#{__dir__}/templates" + + # @param [Action] action API Action + # @param [InsertArguments] args + def initialize(action, args) + super() + @action = action + @args = args + end + + def omit_header + @args.omit_header + end +end diff --git a/spec-insert/lib/renderers/parameter_table_renderer.rb b/spec-insert/lib/renderers/parameter_table_renderer.rb new file mode 100644 index 0000000000..23312962d8 --- /dev/null +++ b/spec-insert/lib/renderers/parameter_table_renderer.rb @@ -0,0 +1,50 @@ +# frozen_string_literal: true + +require_relative 'table_renderer' + +# Renders a table of parameters of an API action +class ParameterTableRenderer + # @param [Array<Parameter>] parameters + # @param [InsertArguments] args + def initialize(parameters, args) + @columns = args.columns + @pretty = args.pretty + @parameters = parameters + @parameters = @parameters.reject(&:deprecated) unless args.include_deprecated + @parameters = @parameters.sort_by { |arg| [arg.required ? 0 : 1, arg.deprecated ? 1 : 0, arg.name] } + end + + # @return [String] + def render + columns = @columns.map { |col| TableRenderer::Column.new(col, col) } + rows = @parameters.map { |arg| row(arg) } + TableRenderer.new(columns, rows, pretty: @pretty).render_lines.join("\n") + end + + private + + def row(param) + { + 'Parameter' => "`#{param.name}`#{' <br> _DEPRECATED_' if param.deprecated}", + 'Description' => description(param), + 'Required' => param.required ? 'Required' : nil, + 'Type' => param.doc_type, + 'Default' => param.default + } + end + + def description(param) + deprecation = deprecation(param) + required = param.required && @columns.exclude?('Required') ? '**(Required)** ' : '' + description = param.description.gsub("\n", ' ') + default = param.default.nil? || @columns.includes('Default') ? '' : " _(Default: #{param.default})_" + + "#{deprecation}#{required}#{description}#{default}" + end + + def deprecation(param) + message = ": #{param.deprecation_message}" if param.deprecation_message.present? + since = " since #{param.version_deprecated}" if param.version_deprecated.present? + "_(Deprecated#{since}#{message})_ " if param.deprecated + end +end diff --git a/spec-insert/lib/renderers/path_parameters.rb b/spec-insert/lib/renderers/path_parameters.rb new file mode 100644 index 0000000000..b1265bcf53 --- /dev/null +++ b/spec-insert/lib/renderers/path_parameters.rb @@ -0,0 +1,14 @@ +# frozen_string_literal: true + +require_relative 'base_mustache_renderer' +require_relative 'parameter_table_renderer' + +# Renders path parameters +class PathParameters < BaseMustacheRenderer + self.template_file = "#{__dir__}/templates/path_parameters.mustache" + + def table + params = @action.arguments.select { |arg| arg.location == ArgLocation::PATH } + ParameterTableRenderer.new(params, @args).render + end +end diff --git a/spec-insert/lib/renderers/paths_and_methods.rb b/spec-insert/lib/renderers/paths_and_methods.rb new file mode 100644 index 0000000000..0685c03b36 --- /dev/null +++ b/spec-insert/lib/renderers/paths_and_methods.rb @@ -0,0 +1,15 @@ +# frozen_string_literal: true + +require_relative 'base_mustache_renderer' + +# Renders paths and http methods +class PathsAndMethods < BaseMustacheRenderer + self.template_file = "#{__dir__}/templates/paths_and_methods.mustache" + + def operations + ljust = @action.operations.map { |op| op.http_verb.length }.max + @action.operations + .sort_by { |op| [op.url.length, op.http_verb] } + .map { |op| { verb: op.http_verb.ljust(ljust), path: op.url } } + end +end diff --git a/spec-insert/lib/renderers/query_parameters.rb b/spec-insert/lib/renderers/query_parameters.rb new file mode 100644 index 0000000000..37058ba5f1 --- /dev/null +++ b/spec-insert/lib/renderers/query_parameters.rb @@ -0,0 +1,15 @@ +# frozen_string_literal: true + +require_relative 'base_mustache_renderer' +require_relative 'parameter_table_renderer' + +# Renders query parameters +class QueryParameters < BaseMustacheRenderer + self.template_file = "#{__dir__}/templates/query_parameters.mustache" + + def table + params = @action.arguments.select { |arg| arg.location == ArgLocation::QUERY } + params += Parameter.global if @args.include_global + ParameterTableRenderer.new(params, @args).render + end +end diff --git a/spec-insert/lib/renderers/spec_insert.rb b/spec-insert/lib/renderers/spec_insert.rb new file mode 100644 index 0000000000..4d5ddb3803 --- /dev/null +++ b/spec-insert/lib/renderers/spec_insert.rb @@ -0,0 +1,42 @@ +# frozen_string_literal: true + +require_relative 'base_mustache_renderer' +require_relative '../insert_arguments' +require_relative '../api/action' +require_relative '../spec_insert_error' +require_relative 'paths_and_methods' +require_relative 'path_parameters' +require_relative 'query_parameters' + +# Class to render spec insertions +class SpecInsert < BaseMustacheRenderer + COMPONENTS = Set.new(%w[query_params path_params paths_and_http_methods]).freeze + self.template_file = "#{__dir__}/templates/spec_insert.mustache" + + # @param [Array<String>] arg_lines the lines between "<!-- doc_insert_start" and "-->" + def initialize(arg_lines) + args = InsertArguments.new(arg_lines) + action = Action.actions[args.api] + super(action, args) + raise SpecInsertError, '`api` argument not specified.' unless @args.api + raise SpecInsertError, "API Action '#{@args.api}' does not exist in the spec." unless @action + end + + def arguments + @args.raw.map { |key, value| { key:, value: } } + end + + def content + raise SpecInsertError, '`component` argument not specified.' unless @args.component + case @args.component.to_sym + when :query_parameters + QueryParameters.new(@action, @args).render + when :path_parameters + PathParameters.new(@action, @args).render + when :paths_and_http_methods + PathsAndMethods.new(@action, @args).render + else + raise SpecInsertError, "Invalid component: #{@args.component}" + end + end +end diff --git a/spec-insert/lib/renderers/table_renderer.rb b/spec-insert/lib/renderers/table_renderer.rb new file mode 100644 index 0000000000..1cabc435bd --- /dev/null +++ b/spec-insert/lib/renderers/table_renderer.rb @@ -0,0 +1,58 @@ +# frozen_string_literal: true + +# TableRenderer renders a markdown table with the given columns and rows +class TableRenderer + # Column object for rendering markdown tables + class Column + attr_reader :title, :key + attr_accessor :width + + # @param [String] title display title + # @param [String | Symbol] key key to access in row hash + def initialize(title, key) + @title = title + @key = key + @width = 0 + end + end + + # @param [Array<Column>] columns + # @param [Array<Hash>] rows + # @param [Boolean] pretty whether to render a pretty table or a compact one + def initialize(columns, rows, pretty:) + @column = columns + @rows = rows + @pretty = pretty + end + + # @return [Array<String>] + def render_lines + calculate_column_widths if @pretty + [render_column, render_divider] + render_rows + end + + private + + def calculate_column_widths + @column.each do |column| + column.width = [@rows.map { |row| row[column.key].to_s.length }.max || 0, column.title.length].max + end + end + + def render_column + columns = @column.map { |column| column.title.ljust(column.width) }.join(' | ') + @pretty ? "| #{columns} |" : columns + end + + def render_divider + dividers = @column.map { |column| ":#{'-' * [column.width + 1, 3].max}" } + @pretty ? "|#{dividers.join('|')}|" : dividers.join(' | ') + end + + def render_rows + @rows.map do |row| + cells = @column.map { |column| row[column.key].to_s.ljust(column.width).gsub('|', '\|') }.join(' | ') + @pretty ? "| #{cells} |" : cells + end + end +end diff --git a/spec-insert/lib/renderers/templates/path_parameters.mustache b/spec-insert/lib/renderers/templates/path_parameters.mustache new file mode 100644 index 0000000000..9d9a2df9d4 --- /dev/null +++ b/spec-insert/lib/renderers/templates/path_parameters.mustache @@ -0,0 +1,4 @@ +{{^omit_header}} +## Path parameters +{{/omit_header}} +{{{table}}} \ No newline at end of file diff --git a/spec-insert/lib/renderers/templates/paths_and_methods.mustache b/spec-insert/lib/renderers/templates/paths_and_methods.mustache new file mode 100644 index 0000000000..3c2df68011 --- /dev/null +++ b/spec-insert/lib/renderers/templates/paths_and_methods.mustache @@ -0,0 +1,8 @@ +{{^omit_header}} +## Paths and HTTP methods +{{/omit_header}} +```json +{{#operations}} +{{{verb}}} {{{path}}} +{{/operations}} +``` \ No newline at end of file diff --git a/spec-insert/lib/renderers/templates/query_parameters.mustache b/spec-insert/lib/renderers/templates/query_parameters.mustache new file mode 100644 index 0000000000..d7331d8f5a --- /dev/null +++ b/spec-insert/lib/renderers/templates/query_parameters.mustache @@ -0,0 +1,7 @@ +{{^omit_header}} +## Query parameters +{{#optional}} +All query parameters are optional. +{{/optional}} +{{/omit_header}} +{{{table}}} \ No newline at end of file diff --git a/spec-insert/lib/renderers/templates/spec_insert.mustache b/spec-insert/lib/renderers/templates/spec_insert.mustache new file mode 100644 index 0000000000..63b6323d48 --- /dev/null +++ b/spec-insert/lib/renderers/templates/spec_insert.mustache @@ -0,0 +1,7 @@ +<!-- spec_insert_start +{{#arguments}} +{{{key}}}: {{{value}}} +{{/arguments}} +--> +{{{content}}} +<!-- spec_insert_end --> diff --git a/spec-insert/lib/spec_hash.rb b/spec-insert/lib/spec_hash.rb new file mode 100644 index 0000000000..06a872c9b9 --- /dev/null +++ b/spec-insert/lib/spec_hash.rb @@ -0,0 +1,60 @@ +# frozen_string_literal: true + +require 'yaml' +require_relative 'api/action' +require_relative 'api/parameter' + +# Spec class for parsing OpenAPI spec +# It's basically a wrapper around a Hash that allows for accessing hash values as object attributes +# and resolving of $refs +class SpecHash + def self.load_file(file_path) + @raw = YAML.load_file(file_path) + @parsed = SpecHash.new(@raw, parsed: false) + Action.actions = @parsed + Parameter.global = @parsed + end + + # @return [Hash] Raw OpenAPI Spec + class << self; attr_reader :raw; end + + # @return [Spec] Parsed OpenAPI Spec + class << self; attr_reader :parsed; end + + attr_reader :hash + + # @param [Hash] hash + def initialize(hash = {}, parsed: true) + @hash = parsed ? hash : parse(hash) + end + + def [](key) + parse(@hash[key]) + end + + def respond_to_missing?(name, include_private = false) + @hash.key?(name.to_s) || @hash.respond_to?(name) || super + end + + def method_missing(name, ...) + return @hash.send(name, ...) if @hash.respond_to?(name) + parse(@hash[name.to_s]) + end + + private + + def parse(value) + return value.map { |v| parse(v) } if value.is_a?(Array) + return value unless value.is_a?(Hash) + ref = value.delete('$ref') + value.transform_values! { |v| parse(v) } + return SpecHash.new(value) unless ref + SpecHash.new(parse(resolve(ref)).merge(value)) + end + + def resolve(ref) + parts = ref.split('/') + parts.shift + self.class.raw.dig(*parts) + end +end diff --git a/spec-insert/lib/spec_insert_error.rb b/spec-insert/lib/spec_insert_error.rb new file mode 100644 index 0000000000..0ee5ccf159 --- /dev/null +++ b/spec-insert/lib/spec_insert_error.rb @@ -0,0 +1,4 @@ +# frozen_string_literal: true + +# Error unique to the SpecInsert process +class SpecInsertError < StandardError; end diff --git a/spec-insert/spec/_fixtures/actual_output/.gitignore b/spec-insert/spec/_fixtures/actual_output/.gitignore new file mode 100644 index 0000000000..de056073af --- /dev/null +++ b/spec-insert/spec/_fixtures/actual_output/.gitignore @@ -0,0 +1 @@ +**/*.md diff --git a/spec-insert/spec/_fixtures/expected_output/param_tables.md b/spec-insert/spec/_fixtures/expected_output/param_tables.md new file mode 100644 index 0000000000..596f185458 --- /dev/null +++ b/spec-insert/spec/_fixtures/expected_output/param_tables.md @@ -0,0 +1,43 @@ +Typical Path Parameters Example + +<!-- spec_insert_start +api: search +component: path_parameters +--> +## Path parameters +Parameter | Type | Description +:--- | :--- | :--- +`index` | List | Comma-separated list of data streams, indexes, and aliases to search. Supports wildcards (`*`). To search all data streams and indexes, omit this parameter or use `*` or `_all`. +<!-- spec_insert_end --> + +Query Parameters Example with Global Parameters, Pretty Print, and Custom Columns + +<!-- spec_insert_start +api: search +component: query_parameters +include_global: true +pretty: true +columns: Type, Parameter, Description, Required, Default +--> +## Query parameters +| Type | Parameter | Description | Required | Default | +|:--------|:--------------------------|:-----------------------------------------------------------------------------------------------------------------------------------|:---------|:--------| +| Boolean | `analyze_wildcard` | If true, wildcard and prefix queries are analyzed. This parameter can only be used when the q query string parameter is specified. | Required | | +| String | `analyzer` | Analyzer to use for the query string. This parameter can only be used when the q query string parameter is specified. | | | +| Boolean | `pretty` | Whether to pretty format the returned JSON response. | | | +| Boolean | `human` <br> _DEPRECATED_ | _(Deprecated since 3.0: Use the `format` parameter instead.)_ Whether to return human readable values for statistics. | | | +<!-- spec_insert_end --> + +Query Parameters Example with only Parameter and Description Columns + +<!-- spec_insert_start +api: search +component: query_parameters +columns: Parameter, Description +omit_header: true +--> +Parameter | Description +:--- | :--- +`analyze_wildcard` | **(Required)** If true, wildcard and prefix queries are analyzed. This parameter can only be used when the q query string parameter is specified. +`analyzer` | Analyzer to use for the query string. This parameter can only be used when the q query string parameter is specified. +<!-- spec_insert_end --> diff --git a/spec-insert/spec/_fixtures/expected_output/paths_and_http_methods.md b/spec-insert/spec/_fixtures/expected_output/paths_and_http_methods.md new file mode 100644 index 0000000000..8ca1569b52 --- /dev/null +++ b/spec-insert/spec/_fixtures/expected_output/paths_and_http_methods.md @@ -0,0 +1,13 @@ + +<!-- spec_insert_start +api: search +component: paths_and_http_methods +--> +## Paths and HTTP methods +```json +GET /_search +POST /_search +GET /{index}/_search +POST /{index}/_search +``` +<!-- spec_insert_end --> diff --git a/spec-insert/spec/_fixtures/input/param_tables.md b/spec-insert/spec/_fixtures/input/param_tables.md new file mode 100644 index 0000000000..d9f24e23f9 --- /dev/null +++ b/spec-insert/spec/_fixtures/input/param_tables.md @@ -0,0 +1,39 @@ +Typical Path Parameters Example + +<!-- spec_insert_start +api: search +component: path_parameters +--> +THIS + TEXT + SHOULD + BE + REPLACED +<!-- spec_insert_end --> + +Query Parameters Example with Global Parameters, Pretty Print, and Custom Columns + +<!-- spec_insert_start +api: search +component: query_parameters +include_global: true +pretty: true +columns: Type, Parameter, Description, Required, Default +--> + THIS TEXT SHOULD BE REPLACED +<!-- spec_insert_end --> + +Query Parameters Example with only Parameter and Description Columns + +<!-- spec_insert_start +api: search +component: query_parameters +columns: Parameter, Description +omit_header: true +--> +THIS +TEXT +SHOULD +BE +REPLACED +<!-- spec_insert_end --> diff --git a/spec-insert/spec/_fixtures/input/paths_and_http_methods.md b/spec-insert/spec/_fixtures/input/paths_and_http_methods.md new file mode 100644 index 0000000000..0e92b8af8e --- /dev/null +++ b/spec-insert/spec/_fixtures/input/paths_and_http_methods.md @@ -0,0 +1,6 @@ + +<!-- spec_insert_start +api: search +component: paths_and_http_methods +--> +<!-- spec_insert_end --> diff --git a/spec-insert/spec/_fixtures/opensearch_spec.yaml b/spec-insert/spec/_fixtures/opensearch_spec.yaml new file mode 100644 index 0000000000..7c67f27e69 --- /dev/null +++ b/spec-insert/spec/_fixtures/opensearch_spec.yaml @@ -0,0 +1,120 @@ +openapi: 3.1.0 +info: + title: OpenSearch API Specification + version: 1.0.0 + x-api-version: 2.16.0 +paths: + /_search: + get: + operationId: search.0 + x-operation-group: search + x-version-added: '1.0' + description: Returns results matching a query. + externalDocs: + url: https://opensearch.org/docs/latest/api-reference/search/ + parameters: + - $ref: '#/components/parameters/search___query.analyze_wildcard' + - $ref: '#/components/parameters/search___query.analyzer' + post: + operationId: search.1 + x-operation-group: search + x-version-added: '1.0' + description: Returns results matching a query. + externalDocs: + url: https://opensearch.org/docs/latest/api-reference/search/ + parameters: + - $ref: '#/components/parameters/search___query.analyze_wildcard' + - $ref: '#/components/parameters/search___query.analyzer' + /{index}/_search: + get: + operationId: search.2 + x-operation-group: search + x-version-added: '1.0' + description: Returns results matching a query. + externalDocs: + url: https://opensearch.org/docs/latest/api-reference/search/ + parameters: + - $ref: '#/components/parameters/search___path.index' + - $ref: '#/components/parameters/search___query.analyze_wildcard' + - $ref: '#/components/parameters/search___query.analyzer' + post: + operationId: search.3 + x-operation-group: search + x-version-added: '1.0' + description: Returns results matching a query. + externalDocs: + url: https://opensearch.org/docs/latest/api-reference/search/ + parameters: + - $ref: '#/components/parameters/search___path.index' + - $ref: '#/components/parameters/search___query.analyze_wildcard' + - $ref: '#/components/parameters/search___query.analyzer' +components: + + parameters: + + _global___query.pretty: + name: pretty + in: query + description: Whether to pretty format the returned JSON response. + schema: + type: boolean + default: false + x-global: true + + _global___query.human: + name: human + in: query + description: Whether to return human readable values for statistics. + schema: + type: boolean + default: true + x-global: true + deprecated: true + x-version-deprecated: '3.0' + x-deprecation-message: Use the `format` parameter instead. + + search___path.index: + in: path + name: index + description: |- + Comma-separated list of data streams, indexes, and aliases to search. + Supports wildcards (`*`). + To search all data streams and indexes, omit this parameter or use `*` or `_all`. + required: true + schema: + $ref: '#/components/schemas/_common___Indices' + style: simple + + search___query.analyze_wildcard: + in: query + name: analyze_wildcard + required: true + description: |- + If true, wildcard and prefix queries are analyzed. + This parameter can only be used when the q query string parameter is specified. + schema: + type: boolean + default: false + style: form + + search___query.analyzer: + in: query + name: analyzer + description: |- + Analyzer to use for the query string. + This parameter can only be used when the q query string parameter is specified. + schema: + type: string + style: form + + schemas: + + _common___Indices: + oneOf: + - $ref: '#/components/schemas/_common___IndexName' + - type: array + items: + $ref: '#/components/schemas/_common___IndexName' + + _common___IndexName: + type: string diff --git a/spec-insert/spec/doc_processor_spec.rb b/spec-insert/spec/doc_processor_spec.rb new file mode 100644 index 0000000000..073613a2a9 --- /dev/null +++ b/spec-insert/spec/doc_processor_spec.rb @@ -0,0 +1,24 @@ +# frozen_string_literal: true + +require_relative 'spec_helper' +require_relative '../lib/doc_processor' +require_relative '../lib/spec_hash' + +describe DocProcessor do + SpecHash.load_file('spec/_fixtures/opensearch_spec.yaml') + + def test_file(file_name) + expected_output = File.read("#{__dir__}/_fixtures/expected_output/#{file_name}.md") + actual_output = described_class.new("#{__dir__}/_fixtures/input/#{file_name}.md", logger: Logger.new($stdout)).process(write_to_file: false) + File.write("./spec/_fixtures/actual_output/#{file_name}.md", actual_output) + expect(actual_output).to eq(expected_output) + end + + it 'inserts the param tables correctly' do + test_file('param_tables') + end + + it 'inserts the paths and http methods correctly' do + test_file('paths_and_http_methods') + end +end diff --git a/spec-insert/spec/spec_helper.rb b/spec-insert/spec/spec_helper.rb new file mode 100644 index 0000000000..74d9dc9bb9 --- /dev/null +++ b/spec-insert/spec/spec_helper.rb @@ -0,0 +1,102 @@ +# This file was generated by the `rspec --init` command. Conventionally, all +# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`. +# The generated `.rspec` file contains `--require spec_helper` which will cause +# this file to always be loaded, without a need to explicitly require it in any +# files. +# +# Given that it is always loaded, you are encouraged to keep this file as +# light-weight as possible. Requiring heavyweight dependencies from this file +# will add to the boot time of your test suite on EVERY test run, even for an +# individual file that may not need all of that loaded. Instead, consider making +# a separate helper file that requires the additional dependencies and performs +# the additional setup, and require it from the spec files that actually need +# it. +# +# See https://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration +RSpec.configure do |config| + # rspec-expectations config goes here. You can use an alternate + # assertion/expectation library such as wrong or the stdlib/minitest + # assertions if you prefer. + config.expect_with :rspec do |expectations| + # This option will default to `true` in RSpec 4. It makes the `description` + # and `failure_message` of custom matchers include text for helper methods + # defined using `chain`, e.g.: + # be_bigger_than(2).and_smaller_than(4).description + # # => "be bigger than 2 and smaller than 4" + # ...rather than: + # # => "be bigger than 2" + expectations.include_chain_clauses_in_custom_matcher_descriptions = true + end + + # rspec-mocks config goes here. You can use an alternate test double + # library (such as bogus or mocha) by changing the `mock_with` option here. + config.mock_with :rspec do |mocks| + # Prevents you from mocking or stubbing a method that does not exist on + # a real object. This is generally recommended, and will default to + # `true` in RSpec 4. + mocks.verify_partial_doubles = true + end + + # This option will default to `:apply_to_host_groups` in RSpec 4 (and will + # have no way to turn it off -- the option exists only for backwards + # compatibility in RSpec 3). It causes shared context metadata to be + # inherited by the metadata hash of host groups and examples, rather than + # triggering implicit auto-inclusion in groups with matching metadata. + config.shared_context_metadata_behavior = :apply_to_host_groups + + # The settings below are suggested to provide a good initial experience + # with RSpec, but feel free to customize to your heart's content. + + # This allows you to limit a spec run to individual examples or groups + # you care about by tagging them with `:focus` metadata. When nothing + # is tagged with `:focus`, all examples get run. RSpec also provides + # aliases for `it`, `describe`, and `context` that include `:focus` + # metadata: `fit`, `fdescribe` and `fcontext`, respectively. + config.filter_run_when_matching :focus + + # Allows RSpec to persist some state between runs in order to support + # the `--only-failures` and `--next-failure` CLI options. We recommend + # you configure your source control system to ignore this file. + config.example_status_persistence_file_path = 'rspec_examples.txt' + + # Limits the available syntax to the non-monkey patched syntax that is + # recommended. For more details, see: + # https://rspec.info/features/3-12/rspec-core/configuration/zero-monkey-patching-mode/ + config.disable_monkey_patching! + + # This setting enables warnings. It's recommended, but in some cases may + # be too noisy due to issues in dependencies. + config.warnings = true + + # Many RSpec users commonly either run the entire suite or an individual + # file, and it's useful to allow more verbose expected_output when running an + # individual spec file. + if config.files_to_run.one? + # Use the documentation formatter for detailed expected_output, + # unless a formatter has already been configured + # (e.g. via a command-line flag). + config.default_formatter = 'doc' + end + + # Print the 10 slowest examples and example groups at the + # end of the spec run, to help surface which specs are running + # particularly slow. + config.profile_examples = 10 + + # Run specs in random order to surface order dependencies. If you find an + # order dependency and want to debug it, you can fix the order by providing + # the seed, which is printed after each run. + # --seed 1234 + config.order = :random + + # Seed global randomization in this process using the `--seed` CLI option. + # Setting this allows you to use `--seed` to deterministically reproduce + # test failures related to randomization by passing the same `--seed` value + # as the one that triggered the failure. + Kernel.srand config.seed + + config.expose_dsl_globally = true +end + +require 'active_support/all' +require 'rspec' diff --git a/templates/EXPERIMENTAL_TEMPLATE.md b/templates/EXPERIMENTAL_TEMPLATE.md index 6aa06c5824..b954a98a5d 100644 --- a/templates/EXPERIMENTAL_TEMPLATE.md +++ b/templates/EXPERIMENTAL_TEMPLATE.md @@ -10,5 +10,5 @@ parent: This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, see the associated [GitHub issue](https://example.issue.link). {: .warning} -This is an experimental feature and is not recommended for use in a production environment. For updates on the progress the feature or if you want to leave feedback, join the discussion in the [OpenSearch forum](https://forum.opensearch.org/). -{: .warning} \ No newline at end of file +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress the feature or if you want to leave feedback, join the discussion on the [OpenSearch forum](https://forum.opensearch.org/). +{: .warning}