diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 8aebdf8699..0ec6c5e009 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1 +1 @@ -* @hdhalter @kolchfa-aws @Naarcha-AWS @vagimeli @AMoo-Miki @natebower @dlvenable +* @hdhalter @kolchfa-aws @Naarcha-AWS @vagimeli @AMoo-Miki @natebower @dlvenable @scrawfor99 @epugh diff --git a/.github/dco.yml b/.github/dco.yml deleted file mode 100644 index 37e411e1be..0000000000 --- a/.github/dco.yml +++ /dev/null @@ -1,2 +0,0 @@ -require: - members: false \ No newline at end of file diff --git a/.github/vale/styles/OpenSearch/AdverbsOfTime.yml b/.github/vale/styles/OpenSearch/AdverbsOfTime.yml deleted file mode 100644 index c1b04c74dc..0000000000 --- a/.github/vale/styles/OpenSearch/AdverbsOfTime.yml +++ /dev/null @@ -1,8 +0,0 @@ -extends: existence -message: "Don't use a comma after single-word adverbs of time at the beginning of a sentence in '%s'." -link: https://github.com/opensearch-project/documentation-website/blob/main/STYLE_GUIDE.md#punctuation-and-capitalization -level: error -ignorecase: false -nonword: true -tokens: - - '(?:(^|\s)(Then|Afterward|Later|Subsequently)),' diff --git a/.github/vale/styles/OpenSearch/LinksDoubleParentheses.yml b/.github/vale/styles/OpenSearch/LinksDoubleParentheses.yml index 9a59479fab..bc4f3aed09 100644 --- a/.github/vale/styles/OpenSearch/LinksDoubleParentheses.yml +++ b/.github/vale/styles/OpenSearch/LinksDoubleParentheses.yml @@ -1,8 +1,8 @@ extends: existence -message: "Remove double slashes from the link '%s'." +message: "Remove double parentheses from the link '%s'." level: error nonword: true scope: raw tokens: - - '\[.*\](\(){2,}.*(\)){1,}' - - '\[.*\](\(){1,}.*(\)){2,}' \ No newline at end of file + - '\]\({2,}[^)]*?\){1,}' + \ No newline at end of file diff --git a/.github/vale/styles/OpenSearch/LinksDoubleSlash.yml b/.github/vale/styles/OpenSearch/LinksDoubleSlash.yml index 369db6a164..528fb73ab6 100644 --- a/.github/vale/styles/OpenSearch/LinksDoubleSlash.yml +++ b/.github/vale/styles/OpenSearch/LinksDoubleSlash.yml @@ -4,4 +4,4 @@ level: error nonword: true scope: raw tokens: - - '\(\{\{site.url\}\}\{\{site.baseurl\}\}.*\/{2,}.*\)' + - '\(\{\{site.url\}\}\{\{site.baseurl\}\}[^)]*?\/{2,}[^)]*?\)' diff --git a/.github/vale/styles/OpenSearch/SpacingWords.yml b/.github/vale/styles/OpenSearch/SpacingWords.yml index b51abacc67..7e4ed59670 100644 --- a/.github/vale/styles/OpenSearch/SpacingWords.yml +++ b/.github/vale/styles/OpenSearch/SpacingWords.yml @@ -1,5 +1,5 @@ extends: existence -message: "There should be once space between words in '%s'." +message: "There should be one space between words in '%s'." 
level: error nonword: true tokens: diff --git a/.github/vale/styles/OpenSearch/SubstitutionsError.yml b/.github/vale/styles/OpenSearch/SubstitutionsError.yml index 69939eb3a3..fdedce44d8 100644 --- a/.github/vale/styles/OpenSearch/SubstitutionsError.yml +++ b/.github/vale/styles/OpenSearch/SubstitutionsError.yml @@ -23,7 +23,8 @@ swap: 'Huggingface': Hugging Face 'indices': indexes 'ingestion pipeline': ingest pipeline - 'keystore': key store + 'key store': keystore + 'key/value': key-value 'kmeans': k-means 'kNN': k-NN 'machine-learning': machine learning @@ -46,7 +47,7 @@ swap: 'time stamp': timestamp 'timezone': time zone 'tradeoff': trade-off - 'truststore': trust store + 'trust store': truststore 'U.S.': US 'web page': webpage 'web site': website diff --git a/.github/vale/styles/Vocab/OpenSearch/Plugins/accept.txt b/.github/vale/styles/Vocab/OpenSearch/Plugins/accept.txt index ed339f48bb..9dc315ec68 100644 --- a/.github/vale/styles/Vocab/OpenSearch/Plugins/accept.txt +++ b/.github/vale/styles/Vocab/OpenSearch/Plugins/accept.txt @@ -4,6 +4,7 @@ Asynchronous Search plugin Crypto plugin Cross-Cluster Replication plugin Custom Codecs plugin +Flow Framework plugin Maps plugin Notebooks plugin Notifications plugin @@ -19,6 +20,7 @@ ML Commons plugin Neural Search plugin Observability plugin Performance Analyzer plugin +Query Insights plugin Query Workbench plugin Search Relevance plugin Security plugin diff --git a/.github/vale/styles/Vocab/OpenSearch/Products/accept.txt b/.github/vale/styles/Vocab/OpenSearch/Products/accept.txt index 85a7a846d1..16e7562dcc 100644 --- a/.github/vale/styles/Vocab/OpenSearch/Products/accept.txt +++ b/.github/vale/styles/Vocab/OpenSearch/Products/accept.txt @@ -59,6 +59,8 @@ Open Distro OpenAI OpenID Connect OpenSearch +OpenSearch Assistant +OpenSearch Assistant Toolkit OpenSearch Benchmark OpenSearch Dashboards OpenSearch Playground diff --git a/.github/vale/styles/Vocab/OpenSearch/Words/accept.txt b/.github/vale/styles/Vocab/OpenSearch/Words/accept.txt index caca926434..b588586138 100644 --- a/.github/vale/styles/Vocab/OpenSearch/Words/accept.txt +++ b/.github/vale/styles/Vocab/OpenSearch/Words/accept.txt @@ -20,11 +20,13 @@ Boolean [Dd]eallocate [Dd]eduplicates? [Dd]eduplication +[Dd]eprovision(s|ed|ing)? [Dd]eserialize [Dd]eserialization Dev [Dd]iscoverability Distro +[Dd]ownvote(s|d)? [Dd]uplicative [Ee]gress [Ee]num @@ -75,7 +77,11 @@ Levenshtein [Mm]ultivalued [Mm]ultiword [Nn]amespace +[Oo]versamples? +[Oo]nboarding pebibyte +[Pp]erformant +[Pp]laintext [Pp]luggable [Pp]reconfigure [Pp]refetch @@ -87,6 +93,7 @@ pebibyte [Pp]reprocess [Pp]retrain [Pp]seudocode +[Quantiz](e|ation|ing|er) [Rr]ebalance [Rr]ebalancing [Rr]edownload @@ -103,6 +110,7 @@ pebibyte [Ss]erverless [Ss]harding [Ss]ignificand +[Ss]napshott(ed|ing) stdout [Ss]temmers? [Ss]ubaggregation @@ -117,12 +125,14 @@ stdout [Ss]ubvector [Ss]ubwords? [Ss]uperset +[Ss]yslog tebibyte [Tt]emplated [Tt]okenization [Tt]okenizer? [Tt]ooltip [Tt]ranslog +[Tt]ruststore [Uu]nary [Uu]ncheck [Uu]ncomment @@ -130,8 +140,10 @@ tebibyte [Uu]nigram [Uu]nnesting [Uu]nrecovered +[Uu]nregister(s|ed|ing)? [Uu]pdatable [Uu]psert +[Uu]pvote(s|d)? [Ww]alkthrough [Ww]ebpage xy \ No newline at end of file diff --git a/.github/vale/tests/test-style-neg.md b/.github/vale/tests/test-style-neg.md index 4b474cb05b..4049eec3cb 100644 --- a/.github/vale/tests/test-style-neg.md +++ b/.github/vale/tests/test-style-neg.md @@ -2,8 +2,6 @@ This sentence tests Advanced Placement (AP). We should define AP before using. 
-Then this sentence tests adverbs of time. - This sentence tests cybersecurity. This sentence tests dash---spacing. diff --git a/.github/vale/tests/test-style-pos.md b/.github/vale/tests/test-style-pos.md index 7ab71a368b..1cf640f463 100644 --- a/.github/vale/tests/test-style-pos.md +++ b/.github/vale/tests/test-style-pos.md @@ -2,8 +2,6 @@ This sentence tests AP. AP should be defined before using. -Then, this sentence tests adverbs of time. - This sentence tests cyber security. This sentence tests dash --- spacing. diff --git a/.github/workflows/.delete_backport_branch.yml.swp b/.github/workflows/.delete_backport_branch.yml.swp new file mode 100644 index 0000000000..248b66532a Binary files /dev/null and b/.github/workflows/.delete_backport_branch.yml.swp differ diff --git a/.github/workflows/automerge-backport.yml b/.github/workflows/automerge-backport.yml new file mode 100644 index 0000000000..0d33634862 --- /dev/null +++ b/.github/workflows/automerge-backport.yml @@ -0,0 +1,33 @@ +name: Automerge Backport +on: + pull_request: + pull_request_review: + types: + - submitted + check_suite: + types: + - completed + status: {} +jobs: + automerge-backport: + if: | + github.repository == 'opensearch-project/documentation-website' && + startsWith(github.event.pull_request.head.ref, 'backport/') + runs-on: ubuntu-latest + steps: + - name: Wait some time so that label and approval is up + run: sleep 30 + - id: automerge + name: automerge + uses: "pascalgn/automerge-action@v0.16.2" + env: + GITHUB_TOKEN: "${{ secrets.GITHUB_TOKEN }}" + MERGE_LABELS: "backport-automerge,!On hold" + MERGE_FILTER_AUTHOR: "opensearch-trigger-bot[bot]" + MERGE_REQUIRED_APPROVALS: "1" + MERGE_RETRIES: "20" + MERGE_RETRY_SLEEP: "10000" + MERGE_ERROR_FAIL: "true" + MERGE_FORKS: "false" + MERGE_METHOD: "squash" + MERGE_DELETE_BRANCH: "true" diff --git a/.github/workflows/backport.yml b/.github/workflows/backport.yml index 8f933b2ada..0e70f48eea 100644 --- a/.github/workflows/backport.yml +++ b/.github/workflows/backport.yml @@ -16,6 +16,7 @@ jobs: # See https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#pull_request_target. 
if: > github.event.pull_request.merged + && github.repository == 'opensearch-project/documentation-website' && ( github.event.action == 'closed' || ( @@ -38,3 +39,25 @@ jobs: with: github_token: ${{ steps.github_app_token.outputs.token }} head_template: backport/backport-<%= number %>-to-<%= base %> + + - name: Label new backport PR with backport-automerge label + run: | + PR_LABELS=`echo "${{ toJson(github.event.pull_request.labels.*.name) }}" | sed -e 's/\[//g;s/\]//g;s/^\s*//g;s/\s*$//g' | tr -d '\n'` + echo $PR_LABELS + OLDIFS=$IFS + export IFS=',' + for label in $PR_LABELS + do + if [[ "$label" == "backport"* ]]; then + echo "Found label \"$label\"" + PR_REPO="opensearch-project/documentation-website" + PR_BRANCH=backport/backport-${{ github.event.pull_request.number }}-to-`echo $label | cut -d ' ' -f2` + PR_NUMBER=`gh pr list -R $PR_REPO --json "number,headRefName" --state open | jq -r ".[] | select(.headRefName == \"$PR_BRANCH\") | .number"` + echo "Update Backport PR '#$PR_NUMBER' on branch '$PR_BRANCH' with 'backport-automerge' label" + gh issue edit -R $PR_REPO $PR_NUMBER --add-label backport-automerge + echo "Auto approve $PR_REPO PR #$PR_NUMBER with opensearch-trigger-bot" + gh pr review -R $PR_REPO $PR_NUMBER --approve + fi + done + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/delete_backport_branch.yml b/.github/workflows/delete_backport_branch.yml deleted file mode 100644 index 387a124b8c..0000000000 --- a/.github/workflows/delete_backport_branch.yml +++ /dev/null @@ -1,15 +0,0 @@ -name: Delete merged branch of the backport PRs -on: - pull_request: - types: - - closed - -jobs: - delete-branch: - runs-on: ubuntu-latest - if: startsWith(github.event.pull_request.head.ref,'backport/') - steps: - - name: Delete merged branch - uses: SvanBoxel/delete-merged-branch@main - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/delete_merged_branch.yml b/.github/workflows/delete_merged_branch.yml new file mode 100644 index 0000000000..f6fc3f2828 --- /dev/null +++ b/.github/workflows/delete_merged_branch.yml @@ -0,0 +1,22 @@ +name: Delete merged branch of the PRs +on: + pull_request: + types: + - closed + +jobs: + delete-branch: + runs-on: ubuntu-latest + if: | + github.repository == 'opensearch-project/documentation-website' && + ${{ !startsWith(github.event.pull_request.head.ref, 'main') }} && + ${{ !startsWith(github.event.pull_request.head.ref, '1.') }} && + ${{ !startsWith(github.event.pull_request.head.ref, '2.') }} && + ${{ !startsWith(github.event.pull_request.head.ref, 'version/') }} + steps: + - name: Echo remove branch + run: echo Removing ${{github.event.pull_request.head.ref}} + - name: Delete merged branch + uses: SvanBoxel/delete-merged-branch@main + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/encoding-check.yml b/.github/workflows/encoding-check.yml new file mode 100644 index 0000000000..ade95e5f37 --- /dev/null +++ b/.github/workflows/encoding-check.yml @@ -0,0 +1,27 @@ +name: Encoding Checker + +on: [pull_request] + +jobs: + encoding-checker: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v3 + - name: Check for possible file that does not follow utf-8 encoding + run: | + set +e + IFS=$(echo -en "\n\b") + COUNTER=0 + for i in `find . -type f \( -name "*.txt" -o -name "*.md" -o -name "*.markdown" -o -name "*.html" \) | grep -vE "^./.git"`; + do + grep -axv '.*' "$i" + if [ "$?" 
-eq 0 ]; then + echo -e "######################\n$i\n######################" + COUNTER=$(( COUNTER + 1 )) + fi + done + if [ "$COUNTER" != 0 ]; then + echo "Found files that is not following utf-8 encoding, exit 1" + exit 1 + fi diff --git a/.github/workflows/jekyll-build.yml b/.github/workflows/jekyll-build.yml index e70385484e..a1b09c606d 100644 --- a/.github/workflows/jekyll-build.yml +++ b/.github/workflows/jekyll-build.yml @@ -3,14 +3,14 @@ name: Jekyll Build Verification on: [pull_request] jobs: - check: + jekyll-build: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - uses: ruby/setup-ruby@v1 with: - ruby-version: '3.0' + ruby-version: '3.2' bundler-cache: true - run: | JEKYLL_FATAL_LINK_CHECKER=internal bundle exec jekyll build --future diff --git a/.github/workflows/link-checker.yml b/.github/workflows/link-checker.yml index 2e8c63a888..6966d8cdac 100644 --- a/.github/workflows/link-checker.yml +++ b/.github/workflows/link-checker.yml @@ -4,7 +4,7 @@ on: schedule: - cron: "30 11 * * 0" jobs: - check: + link-checker: if: github.repository == 'opensearch-project/documentation-website' runs-on: ubuntu-latest steps: diff --git a/.github/workflows/vale.yml b/.github/workflows/vale.yml index 2eee5d82fb..732bb5568c 100644 --- a/.github/workflows/vale.yml +++ b/.github/workflows/vale.yml @@ -1,4 +1,4 @@ -name: Style check +name: Style Check on: pull_request: @@ -20,4 +20,5 @@ jobs: reporter: github-pr-check filter_mode: added vale_flags: "--no-exit" - version: 2.28.0 \ No newline at end of file + version: 2.28.0 + continue-on-error: true diff --git a/.vale.ini b/.vale.ini index a87a8a0698..2fb470b9dc 100644 --- a/.vale.ini +++ b/.vale.ini @@ -19,7 +19,6 @@ Vale.Spelling = NO Vale.Repetition = NO Vale.Terms = YES OpenSearch.AcronymParentheses = YES -OpenSearch.AdverbsOfTime = YES OpenSearch.Ampersand = YES OpenSearch.Cyber = YES OpenSearch.DashSpacing = YES diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 3ae32dd834..f9f1a23f51 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -82,22 +82,31 @@ Follow these steps to set up your local copy of the repository: ``` curl -sSL https://get.rvm.io | bash -s stable - rvm install 2.7 + rvm install 3.2 ruby -v ``` -1. Install [Jekyll](https://jekyllrb.com/) if you don't already have it: +1. Install [Bundler](https://bundler.io/) if you don't already have it: ``` - gem install bundler jekyll + gem install bundler ``` -1. Install the Jekyll dependencies: +1. Install Jekyll and all the dependencies: ``` bundle install ``` +#### Troubleshooting + +If you encounter an error while trying to build the documentation website, find the error in the following troubleshooting list: + +- When running `rvm install 3.2` if you receive a `Error running '__rvm_make -j10'`, resolve this by running `rvm install 3.2.0 -C --with-openssl-dir=/opt/homebrew/opt/openssl@3.2` instead of `rvm install 3.2`. +- If receive a `bundle install`: `An error occurred while installing posix-spawn (0.3.15), and Bundler cannot continue.` error when trying to run `bundle install`, resolve this by running `gem install posix-spawn -v 0.3.15 -- --with-cflags=\"-Wno-incompatible-function-pointer-types\"`. Then, run `bundle install`. 
+ + + #### Making, viewing, and submitting changes Here's how to build the website, make changes, and view them locally: @@ -134,11 +143,11 @@ If we accept the PR, we will merge it and will backport it to the appropriate br To ensure that our documentation adheres to the [OpenSearch Project Style Guidelines](STYLE_GUIDE.md), we use the [Vale](https://github.com/errata-ai/vale) linter. Addressing Vale comments on the PR expedites the review process. You can also install Vale locally as follows so you can address the comments before creating a PR: -1. Run `brew install vale`. +1. Download and install [Vale version 2.28.0](https://github.com/errata-ai/vale/releases/tag/v2.28.0). 2. Run `vale *` from the documentation site root directory to lint all Markdown files. To lint a specific file, run `vale /path/to/file`. Optionally, you can install the [Vale VSCode](https://github.com/chrischinchilla/vale-vscode) extension, which integrates Vale with Visual Studio Code. By default, only _errors_ and _warnings_ are underlined. To change the minimum alert level to include _suggestions_, go to **Vale VSCode** > **Extension Settings** and select **suggestion** in the **Vale > Vale CLI: Min Alert Level** dropdown list. ## Getting help -For help with the contribution process, reach out to one of the [points of contact](README.md#points-of-contact). \ No newline at end of file +For help with the contribution process, reach out to one of the [points of contact](README.md#points-of-contact). diff --git a/FORMATTING_GUIDE.md b/FORMATTING_GUIDE.md index 40536444a5..ea5f711798 100644 --- a/FORMATTING_GUIDE.md +++ b/FORMATTING_GUIDE.md @@ -111,7 +111,7 @@ For a callout with multiple paragraphs or lists, use `>`: ## Collapsible blocks -To insert a collapsible block, use the `
<details>` element as follows: +To insert an open collapsible block, use the `<details>
` element as follows: ````html <details open markdown="block">
@@ -132,6 +132,27 @@ To insert a collapsible block, use the `<details>
` element as follows: </details>
```` +To insert a closed collapsible block, omit the `open` state: + +````html +<details markdown="block">
+  <summary> +    Response +  </summary> +  {: .text-delta} + +```json +{ + "_nodes" : { + "total" : 1, + "successful" : 1, + "failed" : 0 + } +} +``` +</details>
+```` + Collapsible blocks are useful for long responses and for the Table of Contents at the beginning of a page. ## Dashes diff --git a/MAINTAINERS.md b/MAINTAINERS.md index 1a9c3b5fe7..1bf2a1d219 100644 --- a/MAINTAINERS.md +++ b/MAINTAINERS.md @@ -1,6 +1,6 @@ ## Overview -This document contains a list of maintainers in this repo. See [opensearch-project/.github/RESPONSIBILITIES.md](https://github.com/opensearch-project/.github/blob/main/RESPONSIBILITIES.md#maintainer-responsibilities) that explains what the role of maintainer means, what maintainers do in this and other repos, and how they should be doing it. If you're interested in contributing, and becoming a maintainer, see [CONTRIBUTING](CONTRIBUTING.md). +This document lists the maintainers in this repo. See [opensearch-project/.github/RESPONSIBILITIES.md](https://github.com/opensearch-project/.github/blob/main/RESPONSIBILITIES.md#maintainer-responsibilities) for information about the role of a maintainer, what maintainers do in this and other repos, and how they should be doing it. If you're interested in contributing or becoming a maintainer, see [CONTRIBUTING](CONTRIBUTING.md). ## Current Maintainers @@ -9,7 +9,9 @@ This document contains a list of maintainers in this repo. See [opensearch-proje | Heather Halter | [hdhalter](https://github.com/hdhalter) | Amazon | | Fanit Kolchina | [kolchfa-aws](https://github.com/kolchfa-aws) | Amazon | | Nate Archer | [Naarcha-AWS](https://github.com/Naarcha-AWS) | Amazon | -| Nate Bower | [natebower](https://github.com/natebower) | Amazon | +| Nathan Bower | [natebower](https://github.com/natebower) | Amazon | | Melissa Vagi | [vagimeli](https://github.com/vagimeli) | Amazon | | Miki Barahmand | [AMoo-Miki](https://github.com/AMoo-Miki) | Amazon | | David Venable | [dlvenable](https://github.com/dlvenable) | Amazon | +| Stephen Crawford | [scraw99](https://github.com/scrawfor99) | Amazon | +| Eric Pugh | [epugh](https://github.com/epugh) | OpenSource Connections | diff --git a/STYLE_GUIDE.md b/STYLE_GUIDE.md index 94c4a8d240..d0602c92e1 100644 --- a/STYLE_GUIDE.md +++ b/STYLE_GUIDE.md @@ -45,8 +45,7 @@ Use lowercase when referring to features, unless you are referring to a formally * “The Notifications plugin provides a central location for all of your *notifications* from OpenSearch plugins.” * “*Remote-backed storage* is an experimental feature. Therefore, we do not recommend the use of *remote-backed storage* in a production environment.” * “You can take and restore *snapshots* using the snapshot API.” -* “You can use the *VisBuilder* visualization type in OpenSearch Dashboards to create data visualizations by using a drag-and-drop gesture.” (You can refer to VisBuilder alone or qualify the term with “visualization type”.) -* “As of OpenSearch 2.4, the *ML framework* only supports text-embedding models without GPU acceleration.” +* “You can use the *VisBuilder* visualization type in OpenSearch Dashboards to create data visualizations by using a drag-and-drop gesture” (You can refer to VisBuilder alone or qualify the term with “visualization type”). #### Plugin names @@ -67,7 +66,7 @@ Whenever possible, use the active voice instead of the passive voice. The passiv Refer to the reader as _you_ (second person), and refer to the OpenSearch Project as _we_ (first person). If there are multiple authors for a blog post, you can use _we_ to refer to the authors as individuals. 
Do not refer to the OpenSearch Project or to the AWS personnel working on the project as a *team*, as this implies differentiation within the community. -Describe the actions that the user takes, rather than contextualizing from the feature perspective. For example, use phrases such as “With this feature, you can...” or “Use this feature to...” instead of saying a feature *allows*, *enables*, or *lets* the user do something. +In most cases, try to describe the actions that the user takes rather than contextualizing from the feature perspective. For example, use phrases such as “With this feature, you can...” or “Use this feature to...” instead of saying a feature *allows*, *enables*, or *lets* the user do something. For procedures or instructions, ensure that action is taken by the user (“Then you can stop the container...”) rather than the writer (“We also have to stop the container...”). Reserve the first-person plural for speaking as the OpenSearch Project, with recommendations, warnings, or explanations. @@ -89,7 +88,7 @@ Avoid excessive words, such as please. Be courteous but not wordy. Extra detail | **Transparent and open** | As an open-source project, we exchange information with the community in an accessible and transparent manner. We publish our product plans in the open on GitHub, share relevant and timely information related to the project through our forum and/or our blog, and engage in open dialogues related to product and feature development in the public sphere. Anyone can view our roadmap, raise a question or an issue, or participate in our community meetings. | - Tell a complete story. If you’re walking the reader through a solution or sharing news, don’t skip important information.
- Be forthcoming. Communicate time-sensitive news and information in a thorough and timely manner.
- If there’s something the reader needs to know, say it up front. Don’t “bury the lede.” | | **Collaborative and supportive** | We’re part of a community that is here to help. We aim to be resourceful on behalf of the community and encourage others to do the same. To facilitate an open exchange of ideas, we provide forums through which the community can ask and answer one another’s questions. | - Use conversational language that welcomes and engages the audience. Have a dialogue.
- Invite discussion and feedback. We have several mechanisms for open discussion, including requests for comment (RFCs), a [community forum](https://forum.opensearch.org/), and [community meetings](https://www.meetup.com/OpenSearch/). | **Trustworthy and personable** | We stay grounded in the facts and the data. We do not overstate what our products are capable of. We demonstrate our knowledge in a humble but authoritative way and reliably deliver what we promise. We provide mechanisms and support that allow the audience to explore our products for themselves, demonstrating that our actions consistently match our words.

We speak to the community in a friendly, welcoming, judgment-free way so that our audience perceives us as being approachable. Our content is people oriented and focused on empowering the user directly. | - Claims and assertions should be grounded in facts and data and supported accordingly.
- Do not exaggerate or overstate. Let the facts and results speak for themselves.
- Encourage the audience to explore our products for themselves. Offer guidance to help them do so.
- Write directly and conversationally. Have a dialogue with your audience. Imagine writing as if you’re speaking directly to the person for whom you’re creating content.
- Write from the community, for the community. Anyone creating or consuming content about OpenSearch is a member of the same group, with shared interest in learning about and building better search and analytics solutions. | -| **Inclusive and accessible** | As an open-source project, The OpenSearch Project is for everyone, and we are inclusive. We value the diversity of backgrounds and perspectives in the OpenSearch community and welcome feedback from any contributor, regardless of their experience level.

We design and create content so that people with disabilities can perceive, navigate, and interact with it. This ensures that our documentation is available and useful for everyone and helps improve the general usability of content.

We understand our community is international and our writing takes that into account. We use plain language that avoids idioms and metaphors that may not be clear to the broader community. | - Use inclusive language to connect with the diverse and global OpenSearch Project audience.- Be careful with our word choices.
- Avoid [sensitive terms](https://github.com/opensearch-project/documentation-website/blob/main/STYLE_GUIDE.md#sensitive-terms).
- Don't use [offensive terms](https://github.com/opensearch-project/documentation-website/blob/main/STYLE_GUIDE.md#offensive-terms).
- Don't use ableist or sexist language or language that perpetuates racist structures or stereotypes.
- Links: Use link text that adequately describes the target page. For example, use the title of the target page instead of “here” or “this link.” In most cases, a formal cross-reference (the title of the page you’re linking to) is the preferred style because it provides context and helps readers understand where they’re going when they choose the link.
- Images:
  - Add introductory text that provides sufficient context for each image.
  - Add ALT text that describes the image for screen readers.
- Procedures: Not everyone uses a mouse, so use device-independent verbs; for example, use “choose” instead of “click.”
- Location: When you’re describing the location of something else in your content, such as an image or another section, use words such as “preceding,” “previous,” or “following” instead of “above” and “below.” +| **Inclusive and accessible** | As an open-source project, the OpenSearch Project is for everyone, and we are inclusive. We value the diversity of backgrounds and perspectives in the OpenSearch community and welcome feedback from any contributor, regardless of their experience level.

We design and create content so that people with disabilities can perceive, navigate, and interact with it. This ensures that our documentation is available and useful for everyone and helps improve the general usability of content.

We understand our community is international and our writing takes that into account. We use plain language that avoids idioms and metaphors that may not be clear to the broader community. | - Use inclusive language to connect with the diverse and global OpenSearch Project audience.
- Be careful with our word choices.
- Avoid [sensitive terms](https://github.com/opensearch-project/documentation-website/blob/main/STYLE_GUIDE.md#sensitive-terms).
- Don't use [offensive terms](https://github.com/opensearch-project/documentation-website/blob/main/STYLE_GUIDE.md#offensive-terms).
- Don't use ableist or sexist language or language that perpetuates racist structures or stereotypes.
- Links: Use link text that adequately describes the target page. For example, use the title of the target page instead of “here” or “this link.” In most cases, a formal cross-reference (the title of the page you’re linking to) is the preferred style because it provides context and helps readers understand where they’re going when they choose the link.
- Images:
  - Add introductory text that provides sufficient context for each image.
  - Add ALT text that describes the image for screen readers.
- Procedures: Not everyone uses a mouse, so use device-independent verbs; for example, use “choose” instead of “click.”
- Location: When you’re describing the location of something else in your content, such as an image or another section, use words such as “preceding,” “previous,” or “following” instead of “above” and “below.” ## Style guidelines @@ -118,10 +117,12 @@ The following table lists acronyms that you don't need to spell out. | Acronym | Spelled-out term | | :--------- | :------- | | 3D | three-dimensional | +| AI | artificial intelligence | | API | application programming interface | | ASCII | American Standard Code for Information Interchange | | BASIC | Beginner's All-Purpose Symbolic Instruction Code | | BM25 | Best Match 25 | +| CLI | command-line interface | | CPU | central processing unit | | CRUD | create, read, update, and delete | | CSV | comma-separated values | @@ -139,6 +140,7 @@ The following table lists acronyms that you don't need to spell out. | IP | Internet protocol | | JPEG | Joint Photographic Experts Group | | JSON | JavaScript Object Notation | +| k-NN | k-nearest neighbors | | NAT | network address translation | | NGINX | engine x | | PDF | Portable Document Format | @@ -344,7 +346,6 @@ We follow a slightly modified version of the _Microsoft Writing Style Guide_ gui - Independent clauses separated by coordinating conjunctions (but, or, yet, for, and, nor, so). - Introductory clauses, phrases, words that precede the main clause. - Words, clauses, and phrases listed in a series. Also known as the Oxford comma. - - Skip the comma after single-word adverbs of time at the beginning of a sentence, such as *afterward*, *then*, *later*, or *subsequently*. - An em dash (—) is the width of an uppercase M. Do not include spacing on either side. Use an em dash to set off parenthetical phrases within a sentence or set off phrases or clauses at the end of a sentence for restatement or emphasis. @@ -502,7 +503,6 @@ The following terms may be problematic *in some contexts*. This doesn’t mean t |--------------------------|-------------------------------------| | blackout | service outage, blocked | | demilitarized zone (DMZ) | perimeter network, perimeter zone | -| primitive | Avoid using *primitive* (especially plural *primitives*) as a colloquial way of referring to the basic concepts or elements that are associated with a feature or to the simplest elements in a programming language. For greatest clarity and to avoid sounding unpleasant, replace with *primitive data type* or *primitive type*. | ## Trademark policy diff --git a/TERMS.md b/TERMS.md index adf005b877..e12cc171ed 100644 --- a/TERMS.md +++ b/TERMS.md @@ -22,6 +22,10 @@ Avoid. Use *one-time* instead. Affect as a noun refers to emotion as expressed in face or body language. Affect as a verb means to influence. Do not confuse with effect. +**AI** + +No need to define as _artificial intelligence (AI)_. + **AI/ML** On first mention, use artificial intelligence and machine learning (AI/ML). @@ -75,10 +79,6 @@ Messages and pop-up boxes appear. Windows, pages, and applications open. The ver Do not abbreviate as app server. -**artificial intelligence** - -On first mention, use *artificial intelligence (AI)*. Use *AI* thereafter. There is no need to redefine *AI* when either *AI/ML* or *GenAI* has already been defined. - **as well as** Avoid. Replace with in addition to or and as appropriate. @@ -160,6 +160,10 @@ Use _certificates_ on first mention. It’s OK to use _certs_ thereafter. Use _continuous integration_ and _continuous delivery (CI/CD)_ or _continuous integration and delivery (CI/CD)_ on first mention. 
+**CLI** + +No need to define as _command-line interface (CLI)_. + **cluster** A collection of one or more nodes. @@ -232,6 +236,8 @@ Do not use *disable* to refer to users. Always hyphenated. Don’t use _double click_. +**downvote** + **dropdown list** **due to** @@ -291,6 +297,8 @@ Exception: *Execution* is unavoidable for third-party terms for which no alterna **fail over (v.), failover (n.)** +**Faiss** + **file name** **frontend (n., adj.)** @@ -301,7 +309,7 @@ Use frontend as an adjective and a noun. Do not use front end or front-end. Do n **generative AI** -On first mention, use *generative artificial intelligence (generative AI)*. Use *generative AI* thereafter. To avoid the overuse of *generative AI*, *AI/ML-powered applications* may also be used. +Do not use _GenAI_, _Gen AI_, _gen AI_, or _genAI_. To avoid the overuse of *generative AI*, *AI/ML-powered applications* may also be used. **geodistance** @@ -399,7 +407,11 @@ Use *just* in the sense of *just now* (as in "the resources that you just create ## K -**key store** +**keystore** + +**key-value** + +Not _key/value_. **kill** @@ -413,7 +425,7 @@ A simple and popular unsupervised clustering ML algorithm built on top of Tribuo **k-NN** -Short for _k-nearest neighbors_, the k-NN plugin enables users to search for the k-nearest neighbors to a query point across an index of vectors. +Short for _k-nearest neighbors_, the k-NN plugin enables users to search for the k-nearest neighbors to a query point across an index of vectors. No need to define. ## L @@ -439,6 +451,10 @@ OK to use to call out something for comparison. As a general rule, if you can replace like with similar to, it’s OK to use like. But, if you can replace _like_ with _such as_, use _such as_. +**LLM** + +Define on first appearance as _large language model (LLM)_. + **locate in, on** Located _in_ (a folder, directory, path), located on a disk drive or instance. @@ -531,7 +547,7 @@ Use _open source_ as a noun (for example, “The code used throughout this tutor **OpenSearch Playground** -OpenSearch Playground provides a central location for existing and evaluating users to explore features in OpenSearch and OpenSearch Dashboards without downloading or installing any OpenSearch components locally. +Do not precede with _the_. OpenSearch Playground provides a central location for existing and evaluating users to explore features in OpenSearch and OpenSearch Dashboards without downloading or installing any OpenSearch components locally. **operating system** @@ -564,7 +580,7 @@ The default scripting language for OpenSearch, either used inline or stored for **percent** -Spell out in blog posts (for example, 30 percent). +Spell out in blog posts (for example, _30 percent_). Use % in headlines, quotations, and tables or in technical copy. @@ -572,6 +588,10 @@ Use % in headlines, quotations, and tables or in technical copy. An agent and REST API that allows you to query numerous performance metrics for your cluster, including aggregations of those metrics, independent of the Java Virtual Machine (JVM). +**plaintext, plain text** + +Use *plaintext* only to refer to nonencrypted or decrypted text in content about encryption. Use *plain text* to refer to ASCII files. + **please** Avoid using except in quoted text. @@ -596,10 +616,6 @@ Incorrect: an on-premise solution, an on-prem solution A Lucene instance that contains data for some or all of an index. 
-**primitive** - -Avoid using *primitive* (especially plural *primitives*) as a colloquial way of referring to the basic concepts or elements that are associated with a feature or to the simplest elements in a programming language. For greatest clarity and to avoid sounding unpleasant, replace with *primitive data type* or *primitive type*. - **purge** Use only in reference to specific programming methods. Otherwise, use *delete*, *clear*, or *remove* instead. @@ -690,6 +706,8 @@ Never hyphenated. Use _startup_ as a noun (for example, “The following startup **Stochastic Gradient Descent (SGD)** +**syslog** + ## T **term frequency–inverse document frequency (TF–IDF)** @@ -716,7 +734,7 @@ Data that's provided as part of a metric. The time value is assumed to be when t Avoid using as a verb to refer to an action that precipitates a subsequent action. It is OK to use when referring to a feature name, such as a *trigger function* or *time-triggered architecture*. As a verb, use an alternative, such as *initiate*, *invoke*, *launch*, or *start*. -**trust store** +**truststore** **turn on, turn off** @@ -736,6 +754,8 @@ A storage tier that you can use to store and analyze your data with Elasticsearc Hyphenate as adjectives. Use instead of *top left* and *top right*, unless the field name uses *top*. For example, "The upper-right corner." +**upvote** + **US** No periods, as specified in the Chicago Manual of Style. diff --git a/_about/intro.md b/_about/intro.md deleted file mode 100644 index ef1dc4977f..0000000000 --- a/_about/intro.md +++ /dev/null @@ -1,112 +0,0 @@ ---- -layout: default -title: Intro to OpenSearch -nav_order: 2 -permalink: /intro/ ---- - -# Introduction to OpenSearch - -OpenSearch is a distributed search and analytics engine based on [Apache Lucene](https://lucene.apache.org/). After adding your data to OpenSearch, you can perform full-text searches on it with all of the features you might expect: search by field, search multiple indexes, boost fields, rank results by score, sort results by field, and aggregate results. - -Unsurprisingly, people often use search engines like OpenSearch as the backend for a search application---think [Wikipedia](https://en.wikipedia.org/wiki/Wikipedia:FAQ/Technical#What_software_is_used_to_run_Wikipedia?) or an online store. It offers excellent performance and can scale up and down as the needs of the application grow or shrink. - -An equally popular, but less obvious use case is log analytics, in which you take the logs from an application, feed them into OpenSearch, and use the rich search and visualization functionality to identify issues. For example, a malfunctioning web server might throw a 500 error 0.5% of the time, which can be hard to notice unless you have a real-time graph of all HTTP status codes that the server has thrown in the past four hours. You can use [OpenSearch Dashboards]({{site.url}}{{site.baseurl}}/dashboards/index/) to build these sorts of visualizations from data in OpenSearch. - - -## Clusters and nodes - -Its distributed design means that you interact with OpenSearch *clusters*. Each cluster is a collection of one or more *nodes*, servers that store your data and process search requests. - -You can run OpenSearch locally on a laptop---its system requirements are minimal---but you can also scale a single cluster to hundreds of powerful machines in a data center. 
- -In a single node cluster, such as a laptop, one machine has to do everything: manage the state of the cluster, index and search data, and perform any preprocessing of data prior to indexing it. As a cluster grows, however, you can subdivide responsibilities. Nodes with fast disks and plenty of RAM might be great at indexing and searching data, whereas a node with plenty of CPU power and a tiny disk could manage cluster state. For more information on setting node types, see [Cluster formation]({{site.url}}{{site.baseurl}}/opensearch/cluster/). - - -## Indexes and documents - -OpenSearch organizes data into *indexes*. Each index is a collection of JSON *documents*. If you have a set of raw encyclopedia articles or log lines that you want to add to OpenSearch, you must first convert them to [JSON](https://www.json.org/). A simple JSON document for a movie might look like this: - -```json -{ - "title": "The Wind Rises", - "release_date": "2013-07-20" -} -``` - -When you add the document to an index, OpenSearch adds some metadata, such as the unique document *ID*: - -```json -{ - "_index": "", - "_type": "_doc", - "_id": "", - "_version": 1, - "_source": { - "title": "The Wind Rises", - "release_date": "2013-07-20" - } -} -``` - -Indexes also contain mappings and settings: - -- A *mapping* is the collection of *fields* that documents in the index have. In this case, those fields are `title` and `release_date`. -- Settings include data like the index name, creation date, and number of shards. - -## Primary and replica shards - -OpenSearch splits indexes into *shards* for even distribution across nodes in a cluster. For example, a 400 GB index might be too large for any single node in your cluster to handle, but split into ten shards, each one 40 GB, OpenSearch can distribute the shards across ten nodes and work with each shard individually. - -By default, OpenSearch creates a *replica* shard for each *primary* shard. If you split your index into ten shards, for example, OpenSearch also creates ten replica shards. These replica shards act as backups in the event of a node failure---OpenSearch distributes replica shards to different nodes than their corresponding primary shards---but they also improve the speed and rate at which the cluster can process search requests. You might specify more than one replica per index for a search-heavy workload. - -Despite being a piece of an OpenSearch index, each shard is actually a full Lucene index---confusing, we know. This detail is important, though, because each instance of Lucene is a running process that consumes CPU and memory. More shards is not necessarily better. Splitting a 400 GB index into 1,000 shards, for example, would place needless strain on your cluster. A good rule of thumb is to keep shard size between 10--50 GB. - - -## REST API - -You interact with OpenSearch clusters using the REST API, which offers a lot of flexibility. You can use clients like [curl](https://curl.se/) or any programming language that can send HTTP requests. To add a JSON document to an OpenSearch index (i.e. index a document), you send an HTTP request: - -```json -PUT https://://_doc/ -{ - "title": "The Wind Rises", - "release_date": "2013-07-20" -} -``` - -To run a search for the document: - -```json -GET https://://_search?q=wind -``` - -To delete the document: - -```json -DELETE https://://_doc/ -``` - -You can change most OpenSearch settings using the REST API, modify indexes, check the health of the cluster, get statistics---almost everything. 
- -## Advanced concepts - -The following section describes more advanced OpenSearch concepts. - -### Translog - -Any index changes, such as document indexing or deletion, are written to disk during a Lucene commit. However, Lucene commits are expensive operations, so they cannot be performed after every change to the index. Instead, each shard records every indexing operation in a transaction log called _translog_. When a document is indexed, it is added to the memory buffer and recorded in the translog. After a process or host restart, any data in the in-memory buffer is lost. Recording the document in the translog ensures durability because the translog is written to disk. - -Frequent refresh operations write the documents in the memory buffer to a segment and then clear the memory buffer. Periodically, a [flush](#flush) performs a Lucene commit, which includes writing the segments to disk using `fsync`, purging the old translog, and starting a new translog. Thus, a translog contains all operations that have not yet been flushed. - -### Refresh - -Periodically, OpenSearch performs a _refresh_ operation, which writes the documents from the in-memory Lucene index to files. These files are not guaranteed to be durable because an `fsync` is not performed. A refresh makes documents available for search. - -### Flush - -A _flush_ operation persists the files to disk using `fsync`, ensuring durability. Flushing ensures that the data stored only in the translog is recorded in the Lucene index. OpenSearch performs a flush as needed to ensure that the translog does not grow too large. - -### Merge - -In OpenSearch, a shard is a Lucene index, which consists of _segments_ (or segment files). Segments store the indexed data and are immutable. Periodically, smaller segments are merged into larger ones. Merging reduces the overall number of segments on each shard, frees up disk space, and improves search performance. Eventually, segments reach a maximum size specified in the merge policy and are no longer merged into larger segments. The merge policy also specifies how often merges are performed. \ No newline at end of file diff --git a/_about/version-history.md b/_about/version-history.md index 0789a4ff25..9a57c1ef35 100644 --- a/_about/version-history.md +++ b/_about/version-history.md @@ -9,6 +9,8 @@ permalink: /version-history/ OpenSearch version | Release highlights | Release date :--- | :--- | :--- +[2.13.0](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.13.0.md) | Makes agents and tools and the OpenSearch Assistant Toolkit generally available. Introduces vector quantization within OpenSearch. Adds LLM guardrails and hybrid search with aggregations. Adds the Bloom filter skipping index for Apache Spark data sources, I/O-based admission control, and the ability to add an alerting cluster that manages all alerting tasks. For a full list of release highlights, see the Release Notes. | 2 April 2024 +[2.12.0](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.12.0.md) | Makes concurrent segment search and conversational search generally available. Provides an experimental OpenSearch Assistant Toolkit, including agents and tools, workflow automation, and OpenSearch Assistant for OpenSearch Dashboards UI. Adds a new match-only text field, query insights to monitor top N queries, and k-NN search on nested fields. For a full list of release highlights, see the Release Notes. 
| 20 February 2024 [2.11.1](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.11.1.md) | Includes maintenance changes and bug fixes for cross-cluster replication, alerting, observability, OpenSearch Dashboards, index management, machine learning, security, and security analytics. For a full list of release highlights, see the Release Notes. | 30 November 2023 [2.11.0](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.11.0.md) | Adds multimodal and sparse neural search capability and the ability to take shallow snapshots that refer to data stored in remote-backed storage. Makes the search comparison tool generally available. Includes a simplified workflow to create threat detectors in Security Analytics and improved security in OpenSearch Dashboards. Experimental features include a new framework and toolset for distributed tracing and updates to conversational search. For a full list of release highlights, see the Release Notes. | 16 October 2023 [2.10.0](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.10.0.md) | Makes remote-backed storage generally available. Adds hybrid search capability, custom log types for Security Analytics, IP2Geo ingest processor, and delimited term frequency token filter. Includes a new look and feel for OpenSearch Dashboards and updates the Discover tool. Adds Microsoft Teams webhook support for notifications. Experimental features include concurrent segment search and conversational search. For a full list of release highlights, see the Release Notes. | 25 September 2023 @@ -26,6 +28,7 @@ OpenSearch version | Release highlights | Release date [2.0.1](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.0.1.md) | Includes bug fixes and maintenance updates for Alerting and Anomaly Detection. | 16 June 2022 [2.0.0](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.0.0.md) | Includes document-level monitors for alerting, OpenSearch Notifications plugins, and Geo Map Tiles in OpenSearch Dashboards. Also adds support for Lucene 9 and bug fixes for all OpenSearch plugins. For a full list of release highlights, see the Release Notes. | 26 May 2022 [2.0.0-rc1](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.0.0-rc1.md) | The Release Candidate for 2.0.0. This version allows you to preview the upcoming 2.0.0 release before the GA release. The preview release adds document-level alerting, support for Lucene 9, and the ability to use term lookup queries in document level security. | 03 May 2022 +[1.3.15](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-1.3.15.md) | Includes bug fixes and maintenance updates for cross-cluster replication, SQL, OpenSearch Dashboards reporting, and alerting. | 05 March 2024 [1.3.14](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-1.3.14.md) | Includes bug fixes and maintenance updates for OpenSearch security and OpenSearch Dashboards security. | 12 December 2023 [1.3.13](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-1.3.13.md) | Includes bug fixes for Anomaly Detection, adds maintenance updates and infrastructure enhancements. 
| 21 September 2023 [1.3.12](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-1.3.12.md) | Adds maintenance updates for OpenSearch security and OpenSearch Dashboards observability. Includes bug fixes for observability, OpenSearch Dashboards visualizations, and OpenSearch security. | 10 August 2023 diff --git a/_aggregations/bucket/diversified-sampler.md b/_aggregations/bucket/diversified-sampler.md index 303f29f9a3..7249ac3555 100644 --- a/_aggregations/bucket/diversified-sampler.md +++ b/_aggregations/bucket/diversified-sampler.md @@ -8,9 +8,11 @@ redirect_from: - /query-dsl/aggregations/bucket/diversified-sampler/ --- -# Diversified sampler aggregations +# Diversified sampler -The `diversified_sampler` aggregation lets you reduce the bias in the distribution of the sample pool. You can use the `field` setting to control the maximum number of documents collected on any one shard which shares a common value: +The `diversified_sampler` aggregation lets you reduce the bias in the distribution of the sample pool by deduplicating documents containing the same `field`. It does so by using the `max_docs_per_value` and `field` settings, which limit the maximum number of documents collected on a shard for the provided `field`. The `max_docs_per_value` setting is an optional parameter used to determine the maximum number of documents that will be returned per `field`. The default value of this setting is `1`. + +Similarly to the [`sampler` aggregation]({{site.url}}{{site.baseurl}}/aggregations/bucket/sampler/), you can use the `shard_size` setting to control the maximum number of documents collected on any one shard, as shown in the following example: ```json GET opensearch_dashboards_sample_data_logs/_search @@ -18,7 +20,7 @@ GET opensearch_dashboards_sample_data_logs/_search "size": 0, "aggs": { "sample": { - "diversified_sampler": { + "diversified_sampler": { "shard_size": 1000, "field": "response.keyword" }, @@ -57,6 +59,8 @@ GET opensearch_dashboards_sample_data_logs/_search ] } } + } } ``` + diff --git a/_aggregations/bucket/geohash-grid.md b/_aggregations/bucket/geohash-grid.md index 778bfb86fe..13f89799ba 100644 --- a/_aggregations/bucket/geohash-grid.md +++ b/_aggregations/bucket/geohash-grid.md @@ -255,7 +255,26 @@ Geohash grid aggregation requests support the following parameters. Parameter | Data type | Description :--- | :--- | :--- field | String | The field on which aggregation is performed. This field must be mapped as a `geo_point` or `geo_shape` field. If the field contains an array, all array values are aggregated. Required. -precision | Integer | The zoom level used to determine grid cells for bucketing results. Valid values are in the [0, 15] range. Optional. Default is 5. +precision | Integer | The granularity level used to determine grid cells for bucketing results. Cells cannot exceed the specified size (diagonal) of the required precision. Valid values are in the [0, 12] range. Optional. Default is 5. bounds | Object | The bounding box for filtering geopoints and geoshapes. The bounding box is defined by the upper-left and lower-right vertices. Only shapes that intersect with this bounding box or are completely enclosed by this bounding box are included in the aggregation output. The vertices are specified as geopoints in one of the following formats:
- An object with a latitude and longitude
- An array in the [`longitude`, `latitude`] format
- A string in the "`latitude`,`longitude`" format
- A geohash
- WKT
See the [geopoint formats]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/geo-point#formats) for formatting examples. Optional. size | Integer | The maximum number of buckets to return. When there are more buckets than `size`, OpenSearch returns buckets with more documents. Optional. Default is 10,000. -shard_size | Integer | The maximum number of buckets to return from each shard. Optional. Default is max (10, `size` · number of shards), which provides a more accurate count of more highly prioritized buckets. \ No newline at end of file +shard_size | Integer | The maximum number of buckets to return from each shard. Optional. Default is max (10, `size` · number of shards), which provides a more accurate count of more highly prioritized buckets. + +## Geohash precision + +The relationship between geohash precision and the approximate grid cell dimensions is described in the following table. + +Precision /
geohash length | Latitude bits | Longitude bits | Latitude error | Longitude error | Cell height | Cell width +:---:|:-------------:|:--------------:|:--------------:|:---------------:|:-----------:|:----------: + 1 | 2 | 3 | ±23 | ±23 | 4992.6 km | 5009.4 km + 2 | 5 | 5 | ±2.8 | ±5.6 | 624.1 km | 1252.3 km + 3 | 7 | 8 | ±0.70 | ±0.70 | 156 km | 156.5 km + 4 | 10 | 10 | ±0.087 | ±0.18 | 19.5 km | 39.1 km + 5 | 12 | 13 | ±0.022 | ±0.022 | 4.9 km | 4.9 km + 6 | 15 | 15 | ±0.0027 | ±0.0055 | 609.4 m | 1.2 km + 7 | 17 | 18 | ±0.00068 | ±0.00068 | 152.5 m | 152.9 m + 8 | 20 | 20 | ±0.00086 | ±0.000172 | 19 m | 38.2 m + 9 | 22 | 23 | ±0.000021 | ±0.000021 | 4.8 m | 4.8 m + 10 | 25 | 25 | ±0.00000268 | ±0.00000536 | 59.5 cm | 1.2 m + 11 | 27 | 28 | ±0.00000067 | ±0.00000067 | 14.9 cm | 14.9 cm + 12 | 30 | 30 | ±0.00000008 | ±0.00000017 | 1.9 cm | 3.7 cm \ No newline at end of file diff --git a/_aggregations/bucket/geohex-grid.md b/_aggregations/bucket/geohex-grid.md index a60be16ef5..de11061055 100644 --- a/_aggregations/bucket/geohex-grid.md +++ b/_aggregations/bucket/geohex-grid.md @@ -387,7 +387,7 @@ Geohex grid aggregation requests support the following parameters. Parameter | Data type | Description :--- | :--- | :--- field | String | The field that contains the geopoints. This field must be mapped as a `geo_point` field. If the field contains an array, all array values are aggregated. Required. -precision | Integer | The zoom level used to determine grid cells for bucketing results. Valid values are in the [0, 15] range. Optional. Default is 5. +precision | Integer | The granularity level used to determine grid cells for bucketing results. Cells cannot exceed the specified size (diagonal) of the required precision. Valid values are in the [0, 15] range. Optional. Default is 5. bounds | Object | The bounding box for filtering geopoints. The bounding box is defined by the upper-left and lower-right vertices. The vertices are specified as geopoints in one of the following formats:
- An object with a latitude and longitude
- An array in the [`longitude`, `latitude`] format
- A string in the "`latitude`,`longitude`" format
- A geohash
- WKT
See the [geopoint formats]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/geo-point#formats) for formatting examples. Optional. size | Integer | The maximum number of buckets to return. When there are more buckets than `size`, OpenSearch returns buckets with more documents. Optional. Default is 10,000. shard_size | Integer | The maximum number of buckets to return from each shard. Optional. Default is max (10, `size` · number of shards), which provides a more accurate count of more highly prioritized buckets. \ No newline at end of file diff --git a/_aggregations/bucket/geotile-grid.md b/_aggregations/bucket/geotile-grid.md index cb4347288c..dd0c4f8a1f 100644 --- a/_aggregations/bucket/geotile-grid.md +++ b/_aggregations/bucket/geotile-grid.md @@ -544,7 +544,7 @@ Geotile grid aggregation requests support the following parameters. Parameter | Data type | Description :--- | :--- | :--- field | String | The field that contains the geopoints. This field must be mapped as a `geo_point` field. If the field contains an array, all array values are aggregated. Required. -precision | Integer | The zoom level used to determine grid cells for bucketing results. Valid values are in the [0, 15] range. Optional. Default is 5. +precision | Integer | The granularity level used to determine grid cells for bucketing results. Cells cannot exceed the specified size (diagonal) of the required precision. Valid values are in the [0, 29] range. Optional. Default is 7. bounds | Object | The bounding box for filtering geopoints. The bounding box is defined by the upper-left and lower-right vertices. The vertices are specified as geopoints in one of the following formats:
- An object with a latitude and longitude
- An array in the [`longitude`, `latitude`] format
- A string in the "`latitude`,`longitude`" format
- A geohash
- WKT
See the [geopoint formats]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/geo-point#formats) for formatting examples. Optional. size | Integer | The maximum number of buckets to return. When there are more buckets than `size`, OpenSearch returns buckets with more documents. Optional. Default is 10,000. shard_size | Integer | The maximum number of buckets to return from each shard. Optional. Default is max (10, `size` · number of shards), which provides a more accurate count of more highly prioritized buckets. \ No newline at end of file diff --git a/_aggregations/bucket/sampler.md b/_aggregations/bucket/sampler.md index 3668f3c755..28bae47b6d 100644 --- a/_aggregations/bucket/sampler.md +++ b/_aggregations/bucket/sampler.md @@ -8,7 +8,7 @@ nav_order: 170 # Sampler aggregations -If you're aggregating over millions of documents, you can use a `sampler` aggregation to reduce its scope to a small sample of documents for a faster response. The `sampler` aggregation selects the samples by top-scoring documents. +If you're aggregating a very large number of documents, you can use a `sampler` aggregation to reduce the scope to a small sample of documents, resulting in a faster response. The `sampler` aggregation selects the samples by top-scoring documents. The results are approximate but closely represent the distribution of the real data. The `sampler` aggregation significantly improves query performance, but the estimated responses are not entirely reliable. @@ -25,6 +25,8 @@ The basic syntax is: } ``` +## Shard size property + The `shard_size` property tells OpenSearch how many documents (at most) to collect from each shard. The following example limits the number of documents collected on each shard to 1,000 and then buckets the documents by a `terms` aggregation: @@ -79,4 +81,4 @@ GET opensearch_dashboards_sample_data_logs/_search } } } -``` \ No newline at end of file +``` diff --git a/_aggregations/bucket/terms.md b/_aggregations/bucket/terms.md index 229ded6133..072ad42cc2 100644 --- a/_aggregations/bucket/terms.md +++ b/_aggregations/bucket/terms.md @@ -58,16 +58,51 @@ GET opensearch_dashboards_sample_data_logs/_search The values are returned with the key `key`. `doc_count` specifies the number of documents in each bucket. By default, the buckets are sorted in descending order of `doc-count`. + +## Size and shard size parameters + +The number of buckets returned by the `terms` aggregation is controlled by the `size` parameter, which is 10 by default. + +Additionally, the coordinating node responsible for the aggregation will prompt each shard for its top unique terms. The number of buckets returned by each shard is controlled by the `shard_size` parameter. This parameter is distinct from the `size` parameter and exists as a mechanism to increase the accuracy of the bucket document counts. + +For example, imagine a scenario in which the `size` and `shard_size` parameters both have a value of 3. The `terms` aggregation prompts each shard for its top three unique terms. The coordinating node aggregates the results to compute the final result. If a shard contains an object that is not included in the top three, then it won't show up in the response. However, increasing the `shard_size` value for this request will allow each shard to return a larger number of unique terms, increasing the likelihood that the coordinating node will receive all relevant results. + +By default, the `shard_size` parameter is set to `size * 1.5 + 10`. 
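+The following is a minimal sketch of these parameters. The `response.keyword` field and the aggregation name are illustrative assumptions rather than values taken from this page. The request returns the top 10 buckets while collecting the top 100 terms from each shard:
+
+```json
+GET opensearch_dashboards_sample_data_logs/_search
+{
+  "size": 0,
+  "aggs": {
+    "top_responses": {
+      "terms": {
+        "field": "response.keyword",
+        "size": 10,
+        "shard_size": 100
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+Because `size` is 10, the default `shard_size` would be `10 * 1.5 + 10 = 25`; raising it to 100 improves the accuracy of the global document counts at the cost of additional work on each shard.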
+ +When using concurrent segment search, the `shard_size` parameter is also applied to each segment slice. + +The `shard_size` parameter serves as a way to balance the performance and document count accuracy of the `terms` aggregation. Higher `shard_size` values will ensure higher document count accuracy but will result in higher memory and compute usage. Lower `shard_size` values will be more performant but will result in lower document count accuracy. + +## Document count error + The response also includes two keys named `doc_count_error_upper_bound` and `sum_other_doc_count`. -The `terms` aggregation returns the top unique terms. So, if the data has many unique terms, then some of them might not appear in the results. The `sum_other_doc_count` field is the sum of the documents that are left out of the response. In this case, the number is 0 because all the unique values appear in the response. +The `terms` aggregation returns the top unique terms. Therefore, if the data contains many unique terms, then some of them might not appear in the results. The `sum_other_doc_count` field represents the sum of the documents that are excluded from the response. In this case, the number is 0 because all of the unique values appear in the response. + +The `doc_count_error_upper_bound` field represents the maximum possible count for a unique value that is excluded from the final results. Use this field to estimate the margin of error for the count. + +The `doc_count_error_upper_bound` value and the concept of accuracy are only applicable to aggregations using the default sort order---by document count, descending. This is because when you sort by descending document count, any terms that were not returned are guaranteed to include equal or fewer documents than those terms that were returned. Based on this, you can compute the `doc_count_error_upper_bound`. + +If the `show_term_doc_count_error` parameter is set to `true`, then the `terms` aggregation will show the `doc_count_error_upper_bound` computed for each unique bucket in addition to the overall value. + +## The `min_doc_count` and `shard_min_doc_count` parameters + +You can use the `min_doc_count` parameter to filter out any unique terms with fewer than `min_doc_count` results. The `min_doc_count` threshold is applied only after merging the results retrieved from all of the shards. Each shard is unaware of the global document count for a given term. If there is a significant difference between the top `shard_size` globally frequent terms and the top terms local to a shard, you may receive unexpected results when using the `min_doc_count` parameter. + +Separately, the `shard_min_doc_count` parameter is used to filter out the unique terms that a shard returns back to the coordinator with fewer than `shard_min_doc_count` results. + +When using concurrent segment search, the `shard_min_doc_count` parameter is not applied to each segment slice. For more information, see the [related GitHub issue](https://github.com/opensearch-project/OpenSearch/issues/11847). + +## Collect mode + +There are two collect modes available: `depth_first` and `breadth_first`. The `depth_first` collection mode expands all branches of the aggregation tree in a depth-first manner and only performs pruning after the expansion is complete. 
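+As a minimal sketch, the collection mode can be set per aggregation. The `collect_mode` parameter name, field, and aggregation name shown here are assumptions for illustration rather than values taken from the preceding text:
+
+```json
+GET opensearch_dashboards_sample_data_logs/_search
+{
+  "size": 0,
+  "aggs": {
+    "top_responses": {
+      "terms": {
+        "field": "response.keyword",
+        "collect_mode": "breadth_first"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+Omitting the parameter keeps the default `depth_first` behavior; the `breadth_first` value shown here is discussed in the following paragraphs.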
+ +However, when using nested `terms` aggregations, the cardinality of the number of buckets returned is multiplied by the cardinality of the field at each level of nesting, making it easy to see combinatorial explosion in the bucket count as you nest aggregations. -The `doc_count_error_upper_bound` field represents the maximum possible count for a unique value that's left out of the final results. Use this field to estimate the error margin for the count. +You can use the `breadth_first` collection mode to address this issue. In this case, pruning will be applied to the first level of the aggregation tree before it is expanded to the next level, potentially greatly reducing the number of buckets computed. -The count might not be accurate. A coordinating node that’s responsible for the aggregation prompts each shard for its top unique terms. Imagine a scenario where the `size` parameter is 3. -The `terms` aggregation requests each shard for its top 3 unique terms. The coordinating node takes each of the results and aggregates them to compute the final result. If a shard has an object that’s not part of the top 3, then it won't show up in the response. +Additionally, there is memory overhead associated with performing `breadth_first` collection, which is linearly related to the number of matching documents. This is because `breadth_first` collection works by caching and replaying the pruned set of buckets from the parent level. -This is especially true if `size` is set to a low number. Because the default size is 10, an error is unlikely to happen. If you don’t need high accuracy and want to increase the performance, you can reduce the size. ## Account for pre-aggregated data diff --git a/_aggregations/metric/average.md b/_aggregations/metric/average.md index 247d497aef..428f1e76b6 100644 --- a/_aggregations/metric/average.md +++ b/_aggregations/metric/average.md @@ -50,9 +50,9 @@ GET opensearch_dashboards_sample_data_ecommerce/_search "hits": [] }, "aggregations": { - "sum_taxful_total_price": { + "avg_taxful_total_price": { "value": 75.05542864304813 } } } -``` \ No newline at end of file +``` diff --git a/_analyzers/index.md b/_analyzers/index.md index 6dc0ef0a8c..95f97ec8ce 100644 --- a/_analyzers/index.md +++ b/_analyzers/index.md @@ -15,16 +15,24 @@ redirect_from: # Text analysis -When you are searching documents using a full-text search, you want to receive all relevant results and not only exact matches. If you're looking for "walk", you're interested in results that contain any form of the word, like "Walk", "walked", or "walking." To facilitate full-text search, OpenSearch uses text analysis. +When you are searching documents using a full-text search, you want to receive all relevant results. If you're looking for "walk", you're interested in results that contain any form of the word, like "Walk", "walked", or "walking". To facilitate full-text search, OpenSearch uses text analysis. -Text analysis consists of the following steps: +The objective of text analysis is to split the unstructured free text content of the source document into a sequence of terms, which are then stored in an inverted index. Subsequently, when a similar text analysis is applied to a user's query, the resulting sequence of terms facilitates the matching of relevant source documents. -1. _Tokenize_ text into terms: For example, after tokenization, the phrase `Actions speak louder than words` is split into tokens `Actions`, `speak`, `louder`, `than`, and `words`. -1. 
_Normalize_ the terms by converting them into a standard format, for example, converting them to lowercase or performing stemming (reducing the word to its root): For example, after normalization, `Actions` becomes `action`, `louder` becomes `loud`, and `words` becomes `word`. +From a technical point of view, the text analysis process consists of several steps, some of which are optional: + +1. Before the free text content can be split into individual words, it may be beneficial to refine the text at the character level. The primary aim of this optional step is to help the tokenizer (the subsequent stage in the analysis process) generate better tokens. This can include removal of markup tags (such as HTML) or handling specific character patterns (like replacing the 🙂 emoji with the text `:slightly_smiling_face:`). + +2. The next step is to split the free text into individual words---_tokens_. This is performed by a _tokenizer_. For example, after tokenization, the sentence `Actions speak louder than words` is split into tokens `Actions`, `speak`, `louder`, `than`, and `words`. + +3. The last step is to process individual tokens by applying a series of token filters. The aim is to convert each token into a predictable form that is directly stored in the index, for example, by converting them to lowercase or performing stemming (reducing the word to its root). For example, the token `Actions` becomes `action`, `louder` becomes `loud`, and `words` becomes `word`. + +Although the terms ***token*** and ***term*** may sound similar and are occasionally used interchangeably, it is helpful to understand the difference between the two. In the context of Apache Lucene, each holds a distinct role. A ***token*** is created by a tokenizer during text analysis and often undergoes a number of additional modifications as it passes through the chain of token filters. Each token is associated with metadata that can be further used during the text analysis process. A ***term*** is a data value that is directly stored in the inverted index and is associated with much less metadata. During search, matching operates at the term level. +{: .note} ## Analyzers -In OpenSearch, text analysis is performed by an _analyzer_. Each analyzer contains the following sequentially applied components: +In OpenSearch, the abstraction that encompasses text analysis is referred to as an _analyzer_. Each analyzer contains the following sequentially applied components: 1. **Character filters**: First, a character filter receives the original text as a stream of characters and adds, removes, or modifies characters in the text. For example, a character filter can strip HTML characters from a string so that the text `

<p>Actions speak louder than words</p>
` becomes `\nActions speak louder than words\n`. The output of a character filter is a stream of characters. @@ -35,6 +43,8 @@ In OpenSearch, text analysis is performed by an _analyzer_. Each analyzer contai An analyzer must contain exactly one tokenizer and may contain zero or more character filters and zero or more token filters. {: .note} +There is also a special type of analyzer called a ***normalizer***. A normalizer is similar to an analyzer except that it does not contain a tokenizer and can only include specific types of character filters and token filters. These filters can perform only character-level operations, such as character or pattern replacement, and cannot perform operations on the token as a whole. This means that replacing a token with a synonym or stemming is not supported. See [Normalizers]({{site.url}}{{site.baseurl}}/analyzers/normalizers/) for further details. + ## Built-in analyzers The following table lists the built-in analyzers that OpenSearch provides. The last column of the table contains the result of applying the analyzer to the string `It’s fun to contribute a brand-new PR or 2 to OpenSearch!`. @@ -45,7 +55,7 @@ Analyzer | Analysis performed | Analyzer output **Simple** | - Parses strings into tokens on any non-letter character
- Removes non-letter characters
- Converts tokens to lowercase | [`it`, `s`, `fun`, `to`, `contribute`, `a`,`brand`, `new`, `pr`, `or`, `to`, `opensearch`] **Whitespace** | - Parses strings into tokens on white space | [`It’s`, `fun`, `to`, `contribute`, `a`,`brand-new`, `PR`, `or`, `2`, `to`, `OpenSearch!`] **Stop** | - Parses strings into tokens on any non-letter character
- Removes non-letter characters
- Removes stop words
- Converts tokens to lowercase | [`s`, `fun`, `contribute`, `brand`, `new`, `pr`, `opensearch`] -**Keyword** (noop) | - Outputs the entire string unchanged | [`It’s fun to contribute a brand-new PR or 2 to OpenSearch!`] +**Keyword** (no-op) | - Outputs the entire string unchanged | [`It’s fun to contribute a brand-new PR or 2 to OpenSearch!`] **Pattern** | - Parses strings into tokens using regular expressions
- Supports converting strings to lowercase
- Supports removing stop words | [`it`, `s`, `fun`, `to`, `contribute`, `a`,`brand`, `new`, `pr`, `or`, `2`, `to`, `opensearch`] [**Language**]({{site.url}}{{site.baseurl}}/analyzers/language-analyzers/) | Performs analysis specific to a certain language (for example, `english`). | [`fun`, `contribut`, `brand`, `new`, `pr`, `2`, `opensearch`] **Fingerprint** | - Parses strings on any non-letter character
- Normalizes characters by converting them to ASCII
- Converts tokens to lowercase
- Sorts, deduplicates, and concatenates tokens into a single token
- Supports removing stop words | [`2 a brand contribute fun it's new opensearch or pr to`]
Note that the apostrophe was converted to its ASCII counterpart. diff --git a/_analyzers/normalizers.md b/_analyzers/normalizers.md new file mode 100644 index 0000000000..b89659f814 --- /dev/null +++ b/_analyzers/normalizers.md @@ -0,0 +1,111 @@ +--- +layout: default +title: Normalizers +nav_order: 100 +--- + +# Normalizers + +A _normalizer_ functions similarly to an analyzer but outputs only a single token. It does not contain a tokenizer and can only include specific types of character and token filters. These filters can perform only character-level operations, such as character or pattern replacement, and cannot operate on the token as a whole. This means that replacing a token with a synonym or stemming is not supported. + +A normalizer is useful in keyword search (that is, in term-based queries) because it allows you to run token and character filters on any given input. For instance, it makes it possible to match an incoming query `Naïve` with the index term `naive`. + +Consider the following example. + +Create a new index with a custom normalizer: +```json +PUT /sample-index +{ + "settings": { + "analysis": { + "normalizer": { + "normalized_keyword": { + "type": "custom", + "char_filter": [], + "filter": [ "asciifolding", "lowercase" ] + } + } + } + }, + "mappings": { + "properties": { + "approach": { + "type": "keyword", + "normalizer": "normalized_keyword" + } + } + } +} +``` +{% include copy-curl.html %} + +Index a document: +```json +POST /sample-index/_doc/ +{ + "approach": "naive" +} +``` +{% include copy-curl.html %} + +The following query matches the document. This is expected: +```json +GET /sample-index/_search +{ + "query": { + "term": { + "approach": "naive" + } + } +} +``` +{% include copy-curl.html %} + +But this query matches the document as well: +```json +GET /sample-index/_search +{ + "query": { + "term": { + "approach": "Naïve" + } + } +} +``` +{% include copy-curl.html %} + +To understand why, consider the effect of the normalizer: +```json +GET /sample-index/_analyze +{ + "normalizer" : "normalized_keyword", + "text" : "Naïve" +} +``` + +Internally, a normalizer accepts only filters that are instances of either `NormalizingTokenFilterFactory` or `NormalizingCharFilterFactory`. The following is a list of compatible filters found in modules and plugins that are part of the core OpenSearch repository. + +### The `common-analysis` module + +This module does not require installation; it is available by default. + +Character filters: `pattern_replace`, `mapping` + +Token filters: `arabic_normalization`, `asciifolding`, `bengali_normalization`, `cjk_width`, `decimal_digit`, `elision`, `german_normalization`, `hindi_normalization`, `indic_normalization`, `lowercase`, `persian_normalization`, `scandinavian_folding`, `scandinavian_normalization`, `serbian_normalization`, `sorani_normalization`, `trim`, `uppercase` + +### The `analysis-icu` plugin + +Character filters: `icu_normalizer` + +Token filters: `icu_normalizer`, `icu_folding`, `icu_transform` + +### The `analysis-kuromoji` plugin + +Character filters: `normalize_kanji`, `normalize_kana` + +### The `analysis-nori` plugin + +Character filters: `normalize_kanji`, `normalize_kana` + +These lists of filters include only analysis components found in the [additional plugins]({{site.url}}{{site.baseurl}}/install-and-configure/plugins/#additional-plugins) that are part of the core OpenSearch repository. 
+{: .note} \ No newline at end of file diff --git a/_analyzers/token-filters/index.md b/_analyzers/token-filters/index.md index ba09a7fa30..e6d9875736 100644 --- a/_analyzers/token-filters/index.md +++ b/_analyzers/token-filters/index.md @@ -13,52 +13,53 @@ Token filters receive the stream of tokens from the tokenizer and add, remove, o The following table lists all token filters that OpenSearch supports. Token filter | Underlying Lucene token filter| Description -`apostrophe` | [ApostropheFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/tr/ApostropheFilter.html) | In each token that contains an apostrophe, the `apostrophe` token filter removes the apostrophe itself and all characters following the apostrophe. -`asciifolding` | [ASCIIFoldingFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilter.html) | Converts alphabetic, numeric, and symbolic characters. -`cjk_bigram` | [CJKBigramFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/cjk/CJKBigramFilter.html) | Forms bigrams of Chinese, Japanese, and Korean (CJK) tokens. -`cjk_width` | [CJKWidthFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/cjk/CJKWidthFilter.html) | Normalizes Chinese, Japanese, and Korean (CJK) tokens according to the following rules:
- Folds full-width ASCII character variants into the equivalent basic Latin characters.
- Folds half-width Katakana character variants into the equivalent Kana characters. -`classic` | [ClassicFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/standard/ClassicFilter.html) | Performs optional post-processing on the tokens generated by the classic tokenizer. Removes possessives (`'s`) and removes `.` from acronyms. -`common_grams` | [CommonGramsFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/commongrams/CommonGramsFilter.html) | Generates bigrams for a list of frequently occurring terms. The output contains both single terms and bigrams. -`conditional` | [ConditionalTokenFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/miscellaneous/ConditionalTokenFilter.html) | Applies an ordered list of token filters to tokens that match the conditions provided in a script. -`decimal_digit` | [DecimalDigitFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/core/DecimalDigitFilter.html) | Converts all digits in the Unicode decimal number general category to basic Latin digits (0--9). -`delimited_payload` | [DelimitedPayloadTokenFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilter.html) | Separates a token stream into tokens with corresponding payloads, based on a provided delimiter. A token consists of all characters before the delimiter, and a payload consists of all characters after the delimiter. For example, if the delimiter is `|`, then for the string `foo|bar`, `foo` is the token and `bar` is the payload. +`apostrophe` | [ApostropheFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/tr/ApostropheFilter.html) | In each token that contains an apostrophe, the `apostrophe` token filter removes the apostrophe itself and all characters following the apostrophe. +`asciifolding` | [ASCIIFoldingFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilter.html) | Converts alphabetic, numeric, and symbolic characters. +`cjk_bigram` | [CJKBigramFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/cjk/CJKBigramFilter.html) | Forms bigrams of Chinese, Japanese, and Korean (CJK) tokens. +`cjk_width` | [CJKWidthFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/cjk/CJKWidthFilter.html) | Normalizes Chinese, Japanese, and Korean (CJK) tokens according to the following rules:
- Folds full-width ASCII character variants into the equivalent basic Latin characters.
- Folds half-width Katakana character variants into the equivalent Kana characters. +`classic` | [ClassicFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/classic/ClassicFilter.html) | Performs optional post-processing on the tokens generated by the classic tokenizer. Removes possessives (`'s`) and removes `.` from acronyms. +`common_grams` | [CommonGramsFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/commongrams/CommonGramsFilter.html) | Generates bigrams for a list of frequently occurring terms. The output contains both single terms and bigrams. +`conditional` | [ConditionalTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/ConditionalTokenFilter.html) | Applies an ordered list of token filters to tokens that match the conditions provided in a script. +`decimal_digit` | [DecimalDigitFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/DecimalDigitFilter.html) | Converts all digits in the Unicode decimal number general category to basic Latin digits (0--9). +`delimited_payload` | [DelimitedPayloadTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilter.html) | Separates a token stream into tokens with corresponding payloads, based on a provided delimiter. A token consists of all characters before the delimiter, and a payload consists of all characters after the delimiter. For example, if the delimiter is `|`, then for the string `foo|bar`, `foo` is the token and `bar` is the payload. [`delimited_term_freq`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/delimited-term-frequency/) | [DelimitedTermFrequencyTokenFilter](https://lucene.apache.org/core/9_7_0/analysis/common/org/apache/lucene/analysis/miscellaneous/DelimitedTermFrequencyTokenFilter.html) | Separates a token stream into tokens with corresponding term frequencies, based on a provided delimiter. A token consists of all characters before the delimiter, and a term frequency is the integer after the delimiter. For example, if the delimiter is `|`, then for the string `foo|5`, `foo` is the token and `5` is the term frequency. -`dictionary_decompounder` | [DictionaryCompoundWordTokenFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.html) | Decomposes compound words found in many Germanic languages. -`edge_ngram` | [EdgeNGramTokenFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.html) | Tokenizes the given token into edge n-grams (n-grams that start at the beginning of the token) of lengths between `min_gram` and `max_gram`. Optionally, keeps the original token. -`elision` | [ElisionFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/util/ElisionFilter.html) | Removes the specified [elisions](https://en.wikipedia.org/wiki/Elision) from the beginning of tokens. For example, changes `l'avion` (the plane) to `avion` (plane). -`fingerprint` | [FingerprintFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.html) | Sorts and deduplicates the token list and concatenates tokens into a single token. 
-`flatten_graph` | [FlattenGraphFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/core/FlattenGraphFilter.html) | Flattens a token graph produced by a graph token filter, such as `synonym_graph` or `word_delimiter_graph`, making the graph suitable for indexing. -`hunspell` | [HunspellStemFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/hunspell/HunspellStemFilter.html) | Uses [Hunspell](https://en.wikipedia.org/wiki/Hunspell) rules to stem tokens. Because Hunspell supports a word having multiple stems, this filter can emit multiple tokens for each consumed token. Requires you to configure one or more language-specific Hunspell dictionaries. +`dictionary_decompounder` | [DictionaryCompoundWordTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.html) | Decomposes compound words found in many Germanic languages. +`edge_ngram` | [EdgeNGramTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.html) | Tokenizes the given token into edge n-grams (n-grams that start at the beginning of the token) of lengths between `min_gram` and `max_gram`. Optionally, keeps the original token. +`elision` | [ElisionFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/util/ElisionFilter.html) | Removes the specified [elisions](https://en.wikipedia.org/wiki/Elision) from the beginning of tokens. For example, changes `l'avion` (the plane) to `avion` (plane). +`fingerprint` | [FingerprintFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.html) | Sorts and deduplicates the token list and concatenates tokens into a single token. +`flatten_graph` | [FlattenGraphFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/FlattenGraphFilter.html) | Flattens a token graph produced by a graph token filter, such as `synonym_graph` or `word_delimiter_graph`, making the graph suitable for indexing. +`hunspell` | [HunspellStemFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/hunspell/HunspellStemFilter.html) | Uses [Hunspell](https://en.wikipedia.org/wiki/Hunspell) rules to stem tokens. Because Hunspell supports a word having multiple stems, this filter can emit multiple tokens for each consumed token. Requires you to configure one or more language-specific Hunspell dictionaries. `hyphenation_decompounder` | [HyphenationCompoundWordTokenFilter](https://lucene.apache.org/core/9_8_0/analysis/common/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.html) | Uses XML-based hyphenation patterns to find potential subwords in compound words and checks the subwords against the specified word list. The token output contains only the subwords found in the word list. -`keep_types` | [TypeTokenFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/core/TypeTokenFilter.html) | Keeps or removes tokens of a specific type. -`keep_word` | [KeepWordFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/miscellaneous/KeepWordFilter.html) | Checks the tokens against the specified word list and keeps only those that are in the list. 
-`keyword_marker` | [KeywordMarkerFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/miscellaneous/KeywordMarkerFilter.html) | Marks specified tokens as keywords, preventing them from being stemmed. -`keyword_repeat` | [KeywordRepeatFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/miscellaneous/KeywordRepeatFilter.html) | Emits each incoming token twice: once as a keyword and once as a non-keyword. -`kstem` | [KStemFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/en/KStemFilter.html) | Provides kstem-based stemming for the English language. Combines algorithmic stemming with a built-in dictionary. -`length` | [LengthFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/miscellaneous/LengthFilter.html) | Removes tokens whose lengths are shorter or longer than the length range specified by `min` and `max`. -`limit` | [LimitTokenCountFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/miscellaneous/LimitTokenCountFilter.html) | Limits the number of output tokens. A common use case is to limit the size of document field values based on token count. -`lowercase` | [LowerCaseFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/core/LowerCaseFilter.html) | Converts tokens to lowercase. The default [LowerCaseFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/core/LowerCaseFilter.html) is for the English language. You can set the `language` parameter to `greek` (uses [GreekLowerCaseFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/el/GreekLowerCaseFilter.html)), `irish` (uses [IrishLowerCaseFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/ga/IrishLowerCaseFilter.html)), or `turkish` (uses [TurkishLowerCaseFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/tr/TurkishLowerCaseFilter.html)). -`min_hash` | [MinHashFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/minhash/MinHashFilter.html) | Uses the [MinHash technique](https://en.wikipedia.org/wiki/MinHash) to estimate document similarity. Performs the following operations on a token stream sequentially:
1. Hashes each token in the stream.
2. Assigns the hashes to buckets, keeping only the smallest hashes of each bucket.
3. Outputs the smallest hash from each bucket as a token stream. +`keep_types` | [TypeTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/TypeTokenFilter.html) | Keeps or removes tokens of a specific type. +`keep_word` | [KeepWordFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/KeepWordFilter.html) | Checks the tokens against the specified word list and keeps only those that are in the list. +`keyword_marker` | [KeywordMarkerFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/KeywordMarkerFilter.html) | Marks specified tokens as keywords, preventing them from being stemmed. +`keyword_repeat` | [KeywordRepeatFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/KeywordRepeatFilter.html) | Emits each incoming token twice: once as a keyword and once as a non-keyword. +`kstem` | [KStemFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/en/KStemFilter.html) | Provides kstem-based stemming for the English language. Combines algorithmic stemming with a built-in dictionary. +`kuromoji_completion` | [JapaneseCompletionFilter](https://lucene.apache.org/core/9_10_0/analysis/kuromoji/org/apache/lucene/analysis/ja/JapaneseCompletionFilter.html) | Adds Japanese romanized terms to the token stream (in addition to the original tokens). Usually used to support autocomplete on Japanese search terms. Note that the filter has a `mode` parameter, which should be set to `index` when used in an index analyzer and `query` when used in a search analyzer. Requires the `analysis-kuromoji` plugin. For information about installing the plugin, see [Additional plugins]({{site.url}}{{site.baseurl}}/install-and-configure/plugins/#additional-plugins). +`length` | [LengthFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/LengthFilter.html) | Removes tokens whose lengths are shorter or longer than the length range specified by `min` and `max`. +`limit` | [LimitTokenCountFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/LimitTokenCountFilter.html) | Limits the number of output tokens. A common use case is to limit the size of document field values based on token count. +`lowercase` | [LowerCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/LowerCaseFilter.html) | Converts tokens to lowercase. The default [LowerCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/LowerCaseFilter.html) is for the English language. You can set the `language` parameter to `greek` (uses [GreekLowerCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/el/GreekLowerCaseFilter.html)), `irish` (uses [IrishLowerCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/ga/IrishLowerCaseFilter.html)), or `turkish` (uses [TurkishLowerCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/tr/TurkishLowerCaseFilter.html)). +`min_hash` | [MinHashFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/minhash/MinHashFilter.html) | Uses the [MinHash technique](https://en.wikipedia.org/wiki/MinHash) to estimate document similarity. 
Performs the following operations on a token stream sequentially:
1. Hashes each token in the stream.
2. Assigns the hashes to buckets, keeping only the smallest hashes of each bucket.
3. Outputs the smallest hash from each bucket as a token stream. `multiplexer` | N/A | Emits multiple tokens at the same position. Runs each token through each of the specified filter lists separately and outputs the results as separate tokens. -`ngram` | [NGramTokenFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/ngram/NGramTokenFilter.html) | Tokenizes the given token into n-grams of lengths between `min_gram` and `max_gram`. -Normalization | `arabic_normalization`: [ArabicNormalizer](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/ar/ArabicNormalizer.html)
`german_normalization`: [GermanNormalizationFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/de/GermanNormalizationFilter.html)
`hindi_normalization`: [HindiNormalizer](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/hi/HindiNormalizer.html)
`indic_normalization`: [IndicNormalizer](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/in/IndicNormalizer.html)
`sorani_normalization`: [SoraniNormalizer](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/ckb/SoraniNormalizer.html)
`persian_normalization`: [PersianNormalizer](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/fa/PersianNormalizer.html)
`scandinavian_normalization` : [ScandinavianNormalizationFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/miscellaneous/ScandinavianNormalizationFilter.html)
`scandinavian_folding`: [ScandinavianFoldingFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/miscellaneous/ScandinavianFoldingFilter.html)
`serbian_normalization`: [SerbianNormalizationFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/sr/SerbianNormalizationFilter.html) | Normalizes the characters of one of the listed languages. +`ngram` | [NGramTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/ngram/NGramTokenFilter.html) | Tokenizes the given token into n-grams of lengths between `min_gram` and `max_gram`. +Normalization | `arabic_normalization`: [ArabicNormalizer](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/ar/ArabicNormalizer.html)
`german_normalization`: [GermanNormalizationFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/de/GermanNormalizationFilter.html)
`hindi_normalization`: [HindiNormalizer](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/hi/HindiNormalizer.html)
`indic_normalization`: [IndicNormalizer](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/in/IndicNormalizer.html)
`sorani_normalization`: [SoraniNormalizer](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/ckb/SoraniNormalizer.html)
`persian_normalization`: [PersianNormalizer](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/fa/PersianNormalizer.html)
`scandinavian_normalization` : [ScandinavianNormalizationFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/ScandinavianNormalizationFilter.html)
`scandinavian_folding`: [ScandinavianFoldingFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/ScandinavianFoldingFilter.html)
`serbian_normalization`: [SerbianNormalizationFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/sr/SerbianNormalizationFilter.html) | Normalizes the characters of one of the listed languages. `pattern_capture` | N/A | Generates a token for every capture group in the provided regular expression. Uses [Java regular expression syntax](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html). `pattern_replace` | N/A | Matches a pattern in the provided regular expression and replaces matching substrings. Uses [Java regular expression syntax](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html). `phonetic` | N/A | Uses a phonetic encoder to emit a metaphone token for each token in the token stream. Requires installing the `analysis-phonetic` plugin. -`porter_stem` | [PorterStemFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/en/PorterStemFilter.html) | Uses the [Porter stemming algorithm](https://tartarus.org/martin/PorterStemmer/) to perform algorithmic stemming for the English language. +`porter_stem` | [PorterStemFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/en/PorterStemFilter.html) | Uses the [Porter stemming algorithm](https://tartarus.org/martin/PorterStemmer/) to perform algorithmic stemming for the English language. `predicate_token_filter` | N/A | Removes tokens that don’t match the specified predicate script. Supports inline Painless scripts only. -`remove_duplicates` | [RemoveDuplicatesTokenFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/miscellaneous/RemoveDuplicatesTokenFilter.html) | Removes duplicate tokens that are in the same position. -`reverse` | [ReverseStringFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/reverse/ReverseStringFilter.html) | Reverses the string corresponding to each token in the token stream. For example, the token `dog` becomes `god`. -`shingle` | [ShingleFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/shingle/ShingleFilter.html) | Generates shingles of lengths between `min_shingle_size` and `max_shingle_size` for tokens in the token stream. Shingles are similar to n-grams but apply to words instead of letters. For example, two-word shingles added to the list of unigrams [`contribute`, `to`, `opensearch`] are [`contribute to`, `to opensearch`]. +`remove_duplicates` | [RemoveDuplicatesTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/RemoveDuplicatesTokenFilter.html) | Removes duplicate tokens that are in the same position. +`reverse` | [ReverseStringFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/reverse/ReverseStringFilter.html) | Reverses the string corresponding to each token in the token stream. For example, the token `dog` becomes `god`. +`shingle` | [ShingleFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/shingle/ShingleFilter.html) | Generates shingles of lengths between `min_shingle_size` and `max_shingle_size` for tokens in the token stream. Shingles are similar to n-grams but apply to words instead of letters. For example, two-word shingles added to the list of unigrams [`contribute`, `to`, `opensearch`] are [`contribute to`, `to opensearch`]. `snowball` | N/A | Stems words using a [Snowball-generated stemmer](https://snowballstem.org/). 
You can use the `snowball` token filter with the following languages in the `language` field: `Arabic`, `Armenian`, `Basque`, `Catalan`, `Danish`, `Dutch`, `English`, `Estonian`, `Finnish`, `French`, `German`, `German2`, `Hungarian`, `Irish`, `Italian`, `Kp`, `Lithuanian`, `Lovins`, `Norwegian`, `Porter`, `Portuguese`, `Romanian`, `Russian`, `Spanish`, `Swedish`, `Turkish`. `stemmer` | N/A | Provides algorithmic stemming for the following languages in the `language` field: `arabic`, `armenian`, `basque`, `bengali`, `brazilian`, `bulgarian`, `catalan`, `czech`, `danish`, `dutch`, `dutch_kp`, `english`, `light_english`, `lovins`, `minimal_english`, `porter2`, `possessive_english`, `estonian`, `finnish`, `light_finnish`, `french`, `light_french`, `minimal_french`, `galician`, `minimal_galician`, `german`, `german2`, `light_german`, `minimal_german`, `greek`, `hindi`, `hungarian`, `light_hungarian`, `indonesian`, `irish`, `italian`, `light_italian`, `latvian`, `Lithuanian`, `norwegian`, `light_norwegian`, `minimal_norwegian`, `light_nynorsk`, `minimal_nynorsk`, `portuguese`, `light_portuguese`, `minimal_portuguese`, `portuguese_rslp`, `romanian`, `russian`, `light_russian`, `sorani`, `spanish`, `light_spanish`, `swedish`, `light_swedish`, `turkish`. `stemmer_override` | N/A | Overrides stemming algorithms by applying a custom mapping so that the provided terms are not stemmed. `stop` | [StopFilter](https://lucene.apache.org/core/8_7_0/core/org/apache/lucene/analysis/StopFilter.html) | Removes stop words from a token stream. `synonym` | N/A | Supplies a synonym list for the analysis process. The synonym list is provided using a configuration file. `synonym_graph` | N/A | Supplies a synonym list, including multiword synonyms, for the analysis process. -`trim` | [TrimFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/miscellaneous/TrimFilter.html) | Trims leading and trailing white space from each token in a stream. -`truncate` | [TruncateTokenFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/miscellaneous/TruncateTokenFilter.html) | Truncates tokens whose length exceeds the specified character limit. +`trim` | [TrimFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/TrimFilter.html) | Trims leading and trailing white space from each token in a stream. +`truncate` | [TruncateTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/TruncateTokenFilter.html) | Truncates tokens whose length exceeds the specified character limit. `unique` | N/A | Ensures each token is unique by removing duplicate tokens from a stream. -`uppercase` | [UpperCaseFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/core/LowerCaseFilter.html) | Converts tokens to uppercase. -`word_delimiter` | [WordDelimiterFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.html) | Splits tokens at non-alphanumeric characters and performs normalization based on the specified rules. -`word_delimiter_graph` | [WordDelimiterGraphFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.html) | Splits tokens at non-alphanumeric characters and performs normalization based on the specified rules. Assigns multi-position tokens a `positionLength` attribute. 
+`uppercase` | [UpperCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/UpperCaseFilter.html) | Converts tokens to uppercase. +`word_delimiter` | [WordDelimiterFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.html) | Splits tokens at non-alphanumeric characters and performs normalization based on the specified rules. +`word_delimiter_graph` | [WordDelimiterGraphFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.html) | Splits tokens at non-alphanumeric characters and performs normalization based on the specified rules. Assigns multi-position tokens a `positionLength` attribute. diff --git a/_analyzers/tokenizers/index.md b/_analyzers/tokenizers/index.md new file mode 100644 index 0000000000..d401851f60 --- /dev/null +++ b/_analyzers/tokenizers/index.md @@ -0,0 +1,61 @@ +--- +layout: default +title: Tokenizers +nav_order: 60 +has_children: false +has_toc: false +--- + +# Tokenizers + +A tokenizer receives a stream of characters and splits the text into individual _tokens_. A token consists of a term (usually, a word) and metadata about this term. For example, a tokenizer can split text on white space so that the text `Actions speak louder than words.` becomes [`Actions`, `speak`, `louder`, `than`, `words.`]. + +The output of a tokenizer is a stream of tokens. Tokenizers also maintain the following metadata about tokens: + +- The **order** or **position** of each token: This information is used for word and phrase proximity queries. +- The starting and ending positions (**offsets**) of the tokens in the text: This information is used for highlighting search terms. +- The token **type**: Some tokenizers (for example, `standard`) classify tokens by type, for example, `<ALPHANUM>` or `<NUM>`. Simpler tokenizers (for example, `letter`) only classify tokens as type `word`. + +You can use tokenizers to define custom analyzers. + +## Built-in tokenizers + +The following tables list the built-in tokenizers that OpenSearch provides. + +### Word tokenizers + +Word tokenizers parse full text into words. + +Tokenizer | Description | Example +:--- | :--- | :--- +`standard` | - Parses strings into tokens at word boundaries
- Removes most punctuation | `It’s fun to contribute a brand-new PR or 2 to OpenSearch!`
becomes
[`It’s`, `fun`, `to`, `contribute`, `a`,`brand`, `new`, `PR`, `or`, `2`, `to`, `OpenSearch`] +`letter` | - Parses strings into tokens on any non-letter character
- Removes non-letter characters | `It’s fun to contribute a brand-new PR or 2 to OpenSearch!`
becomes
[`It`, `s`, `fun`, `to`, `contribute`, `a`,`brand`, `new`, `PR`, `or`, `to`, `OpenSearch`] +`lowercase` | - Parses strings into tokens on any non-letter character
- Removes non-letter characters
- Converts terms to lowercase | `It’s fun to contribute a brand-new PR or 2 to OpenSearch!`
becomes
[`it`, `s`, `fun`, `to`, `contribute`, `a`,`brand`, `new`, `pr`, `or`, `to`, `opensearch`] +`whitespace` | - Parses strings into tokens at white space characters | `It’s fun to contribute a brand-new PR or 2 to OpenSearch!`
becomes
[`It’s`, `fun`, `to`, `contribute`, `a`,`brand-new`, `PR`, `or`, `2`, `to`, `OpenSearch!`] +`uax_url_email` | - Similar to the standard tokenizer
- Unlike the standard tokenizer, leaves URLs and email addresses as single terms | `It’s fun to contribute a brand-new PR or 2 to OpenSearch opensearch-project@github.com!`
becomes
[`It’s`, `fun`, `to`, `contribute`, `a`,`brand`, `new`, `PR`, `or`, `2`, `to`, `OpenSearch`, `opensearch-project@github.com`] +`classic` | - Parses strings into tokens on:
  - Punctuation characters that are followed by a white space character
  - Hyphens if the term does not contain numbers
- Removes punctuation
- Leaves URLs and email addresses as single terms | `Part number PA-35234, single-use product (128.32)`
becomes
[`Part`, `number`, `PA-35234`, `single`, `use`, `product`, `128.32`] +`thai` | - Parses Thai text into terms | `สวัสดีและยินดีต้อนรับ`
becomes
[`สวัสดี`, `และ`, `ยินดี`, `ต้อนรับ`] + +### Partial word tokenizers + +Partial word tokenizers parse text into words and generate fragments of those words for partial word matching. + +Tokenizer | Description | Example +:--- | :--- | :--- +`ngram` | - Parses strings into words on specified characters (for example, punctuation or white space characters) and generates n-grams of each word | `My repo`
becomes
[`M`, `My`, `y`, `y `, ` `, ` r`, `r`, `re`, `e`, `ep`, `p`, `po`, `o`]
because the default n-gram length is 1--2 characters +`edge_ngram` | - Parses strings into words on specified characters (for example, punctuation or white space characters) and generates edge n-grams of each word (n-grams that start at the beginning of the word) | `My repo`
becomes
[`M`, `My`]
because the default n-gram length is 1--2 characters + +### Structured text tokenizers + +Structured text tokenizers parse structured text, such as identifiers, email addresses, paths, or ZIP Codes. + +Tokenizer | Description | Example +:--- | :--- | :--- +`keyword` | - No-op tokenizer
- Outputs the entire string unchanged
- Can be combined with token filters, like lowercase, to normalize terms | `My repo`
becomes
`My repo` +`pattern` | - Uses a regular expression pattern to parse text into terms on a word separator or to capture matching text as terms
- Uses [Java regular expressions](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html) | `https://opensearch.org/forum`
becomes
[`https`, `opensearch`, `org`, `forum`] because by default the tokenizer splits terms at word boundaries (`\W+`)
Can be configured with a regex pattern +`simple_pattern` | - Uses a regular expression pattern to return matching text as terms
- Uses [Lucene regular expressions](https://lucene.apache.org/core/8_7_0/core/org/apache/lucene/util/automaton/RegExp.html)
- Faster than the `pattern` tokenizer because it uses a subset of the `pattern` tokenizer regular expressions | Returns an empty array by default
Must be configured with a pattern because the pattern defaults to an empty string +`simple_pattern_split` | - Uses a regular expression pattern to split the text at matches rather than returning the matches as terms
- Uses [Lucene regular expressions](https://lucene.apache.org/core/8_7_0/core/org/apache/lucene/util/automaton/RegExp.html)
- Faster than the `pattern` tokenizer because it uses a subset of the `pattern` tokenizer regular expressions | No-op by default
Must be configured with a pattern +`char_group` | - Parses on a set of configurable characters
- Faster than tokenizers that run regular expressions | No-op by default
Must be configured with a list of characters +`path_hierarchy` | - Parses text on the path separator (by default, `/`) and returns a full path to each component in the tree hierarchy | `one/two/three`
becomes
[`one`, `one/two`, `one/two/three`] + + diff --git a/_api-reference/cluster-api/cluster-stats.md b/_api-reference/cluster-api/cluster-stats.md index bbfaacc284..8f8b585a6a 100644 --- a/_api-reference/cluster-api/cluster-stats.md +++ b/_api-reference/cluster-api/cluster-stats.md @@ -127,7 +127,10 @@ Parameter | Type | Description "max_bytes" : 0 }, "max_refresh_time_lag_in_millis" : 0, - "total_time_spent_in_millis" : 516 + "total_time_spent_in_millis" : 516, + "pressure" : { + "total_rejections" : 0 + } }, "download" : { "total_download_size" : { diff --git a/_api-reference/document-apis/index-document.md b/_api-reference/document-apis/index-document.md index 44f0e5e0b7..3460fc1d50 100644 --- a/_api-reference/document-apis/index-document.md +++ b/_api-reference/document-apis/index-document.md @@ -11,7 +11,7 @@ redirect_from: **Introduced 1.0** {: .label .label-purple} -Before you can search for data, you must first add documents. This operation adds a single document to your index. +You can use the `Index document` operation to add a single document to your index. ## Example @@ -33,6 +33,53 @@ PUT /_create/<_id> POST /_create/<_id> ``` +- PUT adds or updates documents in the index with a specified ID. Used for controlled document creation or updates. +- POST adds documents with auto-generated IDs to the index. Useful for adding new documents without specifying IDs. +- `_create` is a type identifier indicating that document creation should only occur if the document with the specified ID doesn't already exist. +- `` represents the name of the index to which the document will be added. +- `<_id>` represents the unique identifier of the document. + +## Adding a sample index + +Sample data can be added to the index with curl commands in the terminal or through the API. + +To test the Document APIs, add a document by following these steps: +1. Open OpenSearch Dashboards. +2. Navigate to the actions menu. +3. In the **Management** section, choose **Dev Tools**. +4. Enter a command, and then select the green triangle play button to send the request. The following are some example commands. + +### Create a sample-index +```json +PUT /sample-index +``` +{% include copy-curl.html %} + +### Example PUT request + +```json +PUT /sample_index/_doc/1 +{ + "name": "Example", + "price": 29.99, + "description": "To be or not to be, that is the question" +} +``` +{% include copy-curl.html %} + +### Example POST request + +```json +POST /sample_index/_doc +{ + "name": "Another Example", + "price": 19.99, + "description": "We are such stuff as dreams are made on" +} + +``` +{% include copy-curl.html %} + ## URL parameters In your request, you must specify the index you want to add your document to. If the index doesn't already exist, OpenSearch automatically creates the index and adds in your document. All other URL parameters are optional. @@ -43,7 +90,7 @@ Parameter | Type | Description | Required <_id> | String | A unique identifier to attach to the document. To automatically generate an ID, use `POST /doc` in your request instead of PUT. | No if_seq_no | Integer | Only perform the index operation if the document has the specified sequence number. | No if_primary_term | Integer | Only perform the index operation if the document has the specified primary term.| No -op_type | Enum | Specifies the type of operation to complete with the document. Valid values are `create` (create the index if it doesn't exist) and `index`. If a document ID is included in the request, then the default is `index`. 
Otherwise, the default is `create`. | No +op_type | Enum | Specifies the type of operation to complete with the document. Valid values are `create` (index a document only if it doesn't exist) and `index`. If a document ID is included in the request, then the default is `index`. Otherwise, the default is `create`. | No pipeline | String | Route the index operation to a certain pipeline. | No routing | String | value used to assign the index operation to a specific shard. | No refresh | Enum | If true, OpenSearch refreshes shards to make the operation visible to searching. Valid options are `true`, `false`, and `wait_for`, which tells OpenSearch to wait for a refresh before executing the operation. Default is false. | No diff --git a/_api-reference/document-apis/reindex.md b/_api-reference/document-apis/reindex.md index 766f5b2872..4a0346ede3 100644 --- a/_api-reference/document-apis/reindex.md +++ b/_api-reference/document-apis/reindex.md @@ -73,10 +73,11 @@ slice | Whether to manually or automatically slice the reindex operation so it e _source | Whether to reindex source fields. Specify a list of fields to reindex or true to reindex all fields. Default is true. id | The ID to associate with manual slicing. max | Maximum number of slices. -dest | Information about the destination index. Valid values are `index`, `version_type`, and `op_type`. +dest | Information about the destination index. Valid values are `index`, `version_type`, `op_type`, and `pipeline`. index | Name of the destination index. version_type | The indexing operation's version type. Valid values are `internal`, `external`, `external_gt` (retrieve the document if the specified version number is greater than the document’s current version), and `external_gte` (retrieve the document if the specified version number is greater or equal to than the document’s current version). op_type | Whether to copy over documents that are missing in the destination index. Valid values are `create` (ignore documents with the same ID from the source index) and `index` (copy everything from the source index). +pipeline | Which ingest pipeline to utilize during the reindex. script | A script that OpenSearch uses to apply transformations to the data during the reindex operation. source | The actual script that OpenSearch runs. lang | The scripting language. Valid options are `painless`, `expression`, `mustache`, and `java`. diff --git a/_api-reference/index-apis/create-index.md b/_api-reference/index-apis/create-index.md index 5e0d504fcc..53d2dc28f9 100644 --- a/_api-reference/index-apis/create-index.md +++ b/_api-reference/index-apis/create-index.md @@ -75,3 +75,4 @@ PUT /sample-index1 } } ``` +{% include copy-curl.html %} \ No newline at end of file diff --git a/_api-reference/index-apis/delete-index.md b/_api-reference/index-apis/delete-index.md index 5973a8cfdf..7b2be5e83b 100644 --- a/_api-reference/index-apis/delete-index.md +++ b/_api-reference/index-apis/delete-index.md @@ -21,7 +21,6 @@ DELETE /sample-index {% include copy-curl.html %} ## Path and HTTP methods - ``` DELETE / ``` diff --git a/_api-reference/index-apis/force-merge.md b/_api-reference/index-apis/force-merge.md index fd6e5e6da4..6c2a61bef3 100644 --- a/_api-reference/index-apis/force-merge.md +++ b/_api-reference/index-apis/force-merge.md @@ -15,6 +15,8 @@ The force merge API operation forces a merge on the shards of one or more indexe In OpenSearch, a shard is a Lucene index, which consists of _segments_ (or segment files). Segments store the indexed data. 
Periodically, smaller segments are merged into larger ones and the larger segments become immutable. Merging reduces the overall number of segments on each shard and frees up disk space. +OpenSearch performs background segment merges that produce segments no larger than `index.merge.policy.max_merged_segment` (the default is 5 GB). + ## Deleted documents When a document is deleted from an OpenSearch index, it is not deleted from the Lucene segment but is rather only marked to be deleted. When the segment files are merged, deleted documents are removed (or _expunged_). Thus, merging also frees up space occupied by documents marked as deleted. @@ -69,7 +71,8 @@ The following table lists the available query parameters. All query parameters a | `flush` | Boolean | Performs a flush on the indexes after the force merge. A flush ensures that the files are persisted to disk. Default is `true`. | | `ignore_unavailable` | Boolean | If `true`, OpenSearch ignores missing or closed indexes. If `false`, OpenSearch returns an error if the force merge operation encounters missing or closed indexes. Default is `false`. | | `max_num_segments` | Integer | The number of larger segments into which smaller segments are merged. Set this parameter to `1` to merge all segments into one segment. The default behavior is to perform the merge as necessary. | -| `only_expunge_deletes` | Boolean | If `true`, the merge operation only expunges segments containing a certain percentage of deleted documents. The percentage is 10% by default and is configurable in the `index.merge.policy.expunge_deletes_allowed` setting. Using `only_expunge_deletes` may produce segments larger than `index.merge.policy.max_merged_segment`, and those large segments may not participate in future merges. For more information, see [Deleted documents](#deleted-documents). Default is `false`. | +| `only_expunge_deletes` | Boolean | If `true`, the merge operation only expunges segments containing a certain percentage of deleted documents. The percentage is 10% by default and is configurable in the `index.merge.policy.expunge_deletes_allowed` setting. Prior to OpenSearch 2.12, `only_expunge_deletes` ignored the `index.merge.policy.max_merged_segment` setting. Starting with OpenSearch 2.12, using `only_expunge_deletes` does not produce segments larger than `index.merge.policy.max_merged_segment` (by default, 5 GB). For more information, see [Deleted documents](#deleted-documents). Default is `false`. | +| `primary_only` | Boolean | If set to `true`, then the merge operation is performed only on the primary shards of an index. This can be useful when you want to take a snapshot of the index after the merge is complete. Snapshots only copy segments from the primary shards. Merging the primary shards can reduce resource consumption. Default is `false`. 
| #### Example request: Force merge a specific index @@ -99,6 +102,13 @@ POST /.testindex-logs/_forcemerge?max_num_segments=1 ``` {% include copy-curl.html %} +#### Example request: Force merge primary shards + +```json +POST /.testindex-logs/_forcemerge?primary_only=true +``` +{% include copy-curl.html %} + #### Example response ```json diff --git a/_api-reference/index-apis/stats.md b/_api-reference/index-apis/stats.md index eb7349d3c5..d133827630 100644 --- a/_api-reference/index-apis/stats.md +++ b/_api-reference/index-apis/stats.md @@ -77,11 +77,46 @@ GET /testindex/_stats ``` {% include copy-curl.html %} +#### Example request: Comma-separated list of indexes + +```json +GET /testindex1,testindex2/_stats +``` +{% include copy-curl.html %} + +#### Example request: Wildcard expression + +```json +GET /testindex*/_stats +``` +{% include copy-curl.html %} + +#### Example request: Specific stats + +```json +GET /testindex/_stats/refresh,flush +``` +{% include copy-curl.html %} + +#### Example request: Expand wildcards + +```json +GET /testindex*/_stats?expand_wildcards=open,hidden +``` +{% include copy-curl.html %} + +#### Example request: Shard-level statistics + +```json +GET /testindex/_stats?level=shards +``` +{% include copy-curl.html %} + #### Example response By default, the returned statistics are aggregated in the `primaries` and `total` aggregations. The `primaries` aggregation contains statistics for the primary shards. The `total` aggregation contains statistics for both primary and replica shards. The following is an example Index Stats API response: -
+
Response @@ -213,7 +248,10 @@ By default, the returned statistics are aggregated in the `primaries` and `total "max_bytes" : 0 }, "max_refresh_time_lag_in_millis" : 0, - "total_time_spent_in_millis" : 516 + "total_time_spent_in_millis" : 516, + "pressure" : { + "total_rejections" : 0 + } }, "download" : { "total_download_size" : { @@ -377,7 +415,10 @@ By default, the returned statistics are aggregated in the `primaries` and `total "max_bytes" : 0 }, "max_refresh_time_lag_in_millis" : 0, - "total_time_spent_in_millis" : 516 + "total_time_spent_in_millis" : 516, + "pressure" : { + "total_rejections" : 0 + } }, "download" : { "total_download_size" : { @@ -545,7 +586,10 @@ By default, the returned statistics are aggregated in the `primaries` and `total "max_bytes" : 0 }, "max_refresh_time_lag_in_millis" : 0, - "total_time_spent_in_millis" : 516 + "total_time_spent_in_millis" : 516, + "pressure" : { + "total_rejections" : 0 + } }, "download" : { "total_download_size" : { @@ -709,7 +753,10 @@ By default, the returned statistics are aggregated in the `primaries` and `total "max_bytes" : 0 }, "max_refresh_time_lag_in_millis" : 0, - "total_time_spent_in_millis" : 516 + "total_time_spent_in_millis" : 516, + "pressure" : { + "total_rejections" : 0 + } }, "download" : { "total_download_size" : { @@ -761,50 +808,6 @@ By default, the returned statistics are aggregated in the `primaries` and `total ```
-#### Example request: Comma-separated list of indexes - -```json -GET /testindex1,testindex2/_stats -``` -{% include copy-curl.html %} - -#### Example request: Wildcard expression - -```json -GET /testindex*/_stats -``` -{% include copy-curl.html %} - -#### Example request: Specific stats - -```json -GET /testindex/_stats/refresh,flush -``` -{% include copy-curl.html %} - -#### Example request: Expand wildcards - -```json -GET /testindex*/_stats?expand_wildcards=open,hidden -``` -{% include copy-curl.html %} - -#### Example request: Shard-level statistics - -```json -GET /testindex/_stats?level=shards -``` -{% include copy-curl.html %} - -## Concurrent segment search - -Starting in OpenSearch 2.10, [concurrent segment search]({{site.url}}{{site.baseurl}}/search-plugins/concurrent-segment-search/) allows each shard-level request to search segments in parallel during the query phase. If you [enable the experimental concurrent segment search feature flag]({{site.url}}{{site.baseurl}}/search-plugins/concurrent-segment-search#enabling-the-feature-flag), the Index Stats API response will contain several additional fields with statistics about slices (units of work executed by a thread). These fields will be provided whether or not the cluster and index settings for concurrent segment search are enabled. For more information about slices, see [Concurrent segment search]({{site.url}}{{site.baseurl}}/search-plugins/concurrent-segment-search#searching-segments-concurrently). - -The following table provides information about the added response fields. +## Response fields -|Response field | Description | -|:--- |:--- | -|`search.concurrent_avg_slice_count` |The average slice count of all search requests. This is computed as the total slice count divided by the total number of concurrent search requests. | -|`search.concurrent_query_total` |The total number of query operations that use concurrent segment search. | -|`search.concurrent_query_time_in_millis` |The total amount of time taken by all query operations that use concurrent segment search, in milliseconds. | -|`search.concurrent_query_current` |The number of currently running query operations that use concurrent segment search. | +For information about response fields, see [Nodes Stats API response fields]({{site.url}}{{site.baseurl}}/api-reference/nodes-apis/nodes-stats/#indices). diff --git a/_api-reference/nodes-apis/nodes-stats.md b/_api-reference/nodes-apis/nodes-stats.md index 79e692f31b..87365fa900 100644 --- a/_api-reference/nodes-apis/nodes-stats.md +++ b/_api-reference/nodes-apis/nodes-stats.md @@ -51,6 +51,8 @@ adaptive_selection | Statistics about adaptive replica selection, which selects script_cache | Statistics about script cache. indexing_pressure | Statistics about the node's indexing pressure. shard_indexing_pressure | Statistics about shard indexing pressure. +resource_usage_stats | Node-level resource usage statistics, such as CPU and JVM memory. +admission_control | Statistics about admission control. To filter the information returned for the `indices` metric, you can use specific `index_metric` values. You can use these only when you use the following query types: @@ -110,7 +112,7 @@ GET _nodes/stats/ Select the arrow to view the example response. -
+
Response @@ -291,7 +293,10 @@ Select the arrow to view the example response. "max_bytes" : 0 }, "max_refresh_time_lag_in_millis" : 0, - "total_time_spent_in_millis" : 516 + "total_time_spent_in_millis" : 516, + "pressure" : { + "total_rejections" : 0 + } }, "download" : { "total_download_size" : { @@ -577,6 +582,19 @@ Select the arrow to view the example response. "full_states" : 2, "incompatible_diffs" : 0, "compatible_diffs" : 10 + }, + "cluster_state_stats" : { + "overall" : { + "update_count" : 9, + "total_time_in_millis" : 807, + "failed_count" : 0 + }, + "remote_upload" : { + "success_count" : 9, + "failed_count" : 0, + "total_time_in_millis" : 116, + "cleanup_attempt_failed_count" : 0 + } } }, "ingest" : { @@ -708,6 +726,34 @@ Select the arrow to view the example response. }, "enabled" : false, "enforced" : false + }, + "resource_usage_stats": { + "nxLWtMdXQmWA-ZBVWU8nwA": { + "timestamp": 1698401391000, + "cpu_utilization_percent": "0.1", + "memory_utilization_percent": "3.9", + "io_usage_stats": { + "max_io_utilization_percent": "99.6" + } + } + }, + "admission_control": { + "global_cpu_usage": { + "transport": { + "rejection_count": { + "search": 3, + "indexing": 1 + } + } + }, + "global_io_usage": { + "transport": { + "rejection_count": { + "search": 3, + "indexing": 1 + } + } + } } } } @@ -761,6 +807,8 @@ http.total_opened | Integer | The total number of HTTP connections the node has [indexing_pressure](#indexing_pressure) | Object | Statistics related to the node's indexing pressure. [shard_indexing_pressure](#shard_indexing_pressure) | Object | Statistics related to indexing pressure at the shard level. [search_backpressure]({{site.url}}{{site.baseurl}}/opensearch/search-backpressure#search-backpressure-stats-api) | Object | Statistics related to search backpressure. +[resource_usage_stats](#resource_usage_stats) | Object | Statistics related to resource usage for the node. +[admission_control](#admission_control) | Object | Statistics related to admission control for the node. ### `indices` @@ -794,6 +842,10 @@ get.missing_total | Integer | The number of failed get operations. get.missing_time_in_millis | Integer | The total time for all failed get operations, in milliseconds. get.current | Integer | The number of get operations that are currently running. search | Object | Statistics about the search operations for the node. +search.concurrent_avg_slice_count | Integer | The average slice count of all search requests. This is computed as the total slice count divided by the total number of concurrent search requests. +search.concurrent_query_total |Integer | The total number of query operations that use concurrent segment search. +search.concurrent_query_time_in_millis | Integer | The total amount of time taken by all query operations that use concurrent segment search, in milliseconds. +search.concurrent_query_current |Integer | The number of currently running query operations that use concurrent segment search. search.open_contexts | Integer | The number of open search contexts. search.query_total | Integer | The total number of shard query operations. search.query_time_in_millis | Integer | The total amount of time for all shard query operations, in milliseconds. @@ -883,9 +935,9 @@ segments.version_map_memory_in_bytes | Integer | The total amount of memory used segments.fixed_bit_set_memory_in_bytes | Integer | The total amount of memory used by fixed bit sets, in bytes. Fixed bit sets are used for nested objects and join fields. 
segments.max_unsafe_auto_id_timestamp | Integer | The timestamp for the most recently retired indexing request, in milliseconds since the epoch. segments.segment_replication | Object | Segment replication statistics for all primary shards when segment replication is enabled on the node. -segments.segment_replication.maxBytesBehind | long | The maximum number of bytes behind the primary replica. -segments.segment_replication.totalBytesBehind | long | The total number of bytes behind the primary replicas. -segments.segment_replication.maxReplicationLag | long | The maximum amount of time, in milliseconds, taken by a replica to catch up to its primary. +segments.segment_replication.max_bytes_behind | long | The maximum number of bytes behind the primary replica. +segments.segment_replication.total_bytes_behind | long | The total number of bytes behind the primary replicas. +segments.segment_replication.max_replication_lag | long | The maximum amount of time, in milliseconds, taken by a replica to catch up to its primary. segments.remote_store | Object | Statistics about remote segment store operations. segments.remote_store.upload | Object | Statistics related to uploads to the remote segment store. segments.remote_store.upload.total_upload_size | Object | The amount of data, in bytes, uploaded to the remote segment store. @@ -897,6 +949,8 @@ segments.remote_store.upload.refresh_size_lag.total_bytes | Integer | The total segments.remote_store.upload.refresh_size_lag.max_bytes | Integer | The maximum amount of lag, in bytes, during the upload refresh between the remote segment store and the local store. segments.remote_store.upload.max_refresh_time_lag_in_millis | Integer | The maximum duration, in milliseconds, that the remote refresh is behind the local refresh. segments.remote_store.upload.total_time_spent_in_millis | Integer | The total amount of time, in milliseconds, spent on uploads to the remote segment store. +segments.remote_store.upload.pressure | Object | Statistics related to segment store upload backpressure. +segments.remote_store.upload.pressure.total_rejections | Integer | The total number of requests rejected due to segment store upload backpressure. segments.remote_store.download | Object | Statistics related to downloads to the remote segment store. segments.remote_store.download.total_download_size | Object | The total amount of data download from the remote segment store. segments.remote_store.download.total_download_size.started_bytes | Integer | The number of bytes downloaded from the remote segment store after the download starts. @@ -1106,6 +1160,16 @@ published_cluster_states | Object | Statistics for the published cluster states published_cluster_states.full_states | Integer | The number of published cluster states. published_cluster_states.incompatible_diffs | Integer | The number of incompatible differences between published cluster states. published_cluster_states.compatible_diffs | Integer | The number of compatible differences between published cluster states. +cluster_state_stats | Object | Cluster state update statistics published by the active leader. +cluster_state_stats.overall | Object | Overall cluster state update statistics. +cluster_state_stats.overall.update_count | Integer | The total number of successful cluster state updates. +cluster_state_stats.overall.total_time_in_millis | Integer | The total amount of time taken for all cluster state updates, in milliseconds. 
+cluster_state_stats.overall.failed_count | Integer | The total number of failed cluster state updates. +cluster_state_stats.remote_upload | Object | Cluster state update statistics related to remote uploads. +cluster_state_stats.remote_upload.success_count | Integer | The total number of successful cluster state updates uploaded to the remote store. +cluster_state_stats.remote_upload.failed_count | Integer | The total number of cluster state updates that failed to upload to the remote store. +cluster_state_stats.remote_upload.total_time_in_millis | Integer | The total amount of time taken for all cluster state updates uploaded to the remote store, in milliseconds. +cluster_state_stats.remote_upload.cleanup_attempt_failed_count | Integer | The total number of failures encountered while trying to clean up older cluster states from the remote store. ### `ingest` @@ -1192,9 +1256,27 @@ total_rejections_breakup_shadow_mode.throughput_degradation_limits | Integer | T enabled | Boolean | Specifies whether the shard indexing pressure feature is turned on for the node. enforced | Boolean | If true, the shard indexing pressure runs in enforced mode (there are rejections). If false, the shard indexing pressure runs in shadow mode (there are no rejections, but statistics are recorded and can be retrieved in the `total_rejections_breakup_shadow_mode` object). Only applicable if shard indexing pressure is enabled. -## Concurrent segment search +### `resource_usage_stats` + +The `resource_usage_stats` object contains the resource usage statistics. Each entry is specified by the node ID and has the following properties. + +Field | Field type | Description +:--- |:-----------| :--- +timestamp | Integer | The last refresh time for the resource usage statistics, in milliseconds since the epoch. +cpu_utilization_percent | Float | Statistics for the average CPU usage of any OpenSearch processes within the time period configured in the `node.resource.tracker.global_cpu_usage.window_duration` setting. +memory_utilization_percent | Float | The node JVM memory usage statistics within the time period configured in the `node.resource.tracker.global_jvmmp.window_duration` setting. +max_io_utilization_percent | Float | (Linux only) Statistics for the average IO usage of any OpenSearch processes within the time period configured in the `node.resource.tracker.global_io_usage.window_duration` setting. + +### `admission_control` + +The `admission_control` object contains the rejection count of search and indexing requests based on resource consumption and has the following properties. -Starting in OpenSearch 2.10, [concurrent segment search]({{site.url}}{{site.baseurl}}/search-plugins/concurrent-segment-search/) allows each shard-level request to search segments in parallel during the query phase. If you [enable the experimental concurrent segment search feature flag]({{site.url}}{{site.baseurl}}/search-plugins/concurrent-segment-search#enabling-the-feature-flag), the Nodes Stats API response will contain several additional fields with statistics about slices (units of work executed by a thread). For the descriptions of those fields, see [Index Stats API]({{site.url}}{{site.baseurl}}/api-reference/index-apis/stats#concurrent-segment-search). +Field | Field type | Description +:--- | :--- | :--- +admission_control.global_cpu_usage.transport.rejection_count.search | Integer | The total number of search rejections in the transport layer when the node CPU usage limit was met. 
In this case, additional search requests are rejected until the system recovers. The CPU usage limit is configured in the `admission_control.search.cpu_usage.limit` setting. +admission_control.global_cpu_usage.transport.rejection_count.indexing | Integer | The total number of indexing rejections in the transport layer when the node CPU usage limit was met. Any additional indexing requests are rejected until the system recovers. The CPU usage limit is configured in the `admission_control.indexing.cpu_usage.limit` setting. +admission_control.global_io_usage.transport.rejection_count.search | Integer | The total number of search rejections in the transport layer when the node IO usage limit was met. Any additional search requests are rejected until the system recovers. The IO usage limit is configured in the `admission_control.search.io_usage.limit` setting (Linux only). +admission_control.global_io_usage.transport.rejection_count.indexing | Integer | The total number of indexing rejections in the transport layer when the node IO usage limit was met. Any additional indexing requests are rejected until the system recovers. The IO usage limit is configured in the `admission_control.indexing.io_usage.limit` setting (Linux only). ## Required permissions diff --git a/_api-reference/profile.md index f8c3104a43..94c7857b80 100644 --- a/_api-reference/profile.md +++ b/_api-reference/profile.md @@ -18,7 +18,15 @@ The Profile API provides timing information about the execution of individual co The Profile API is a resource-consuming operation that adds overhead to search operations. {: .warning} -#### Example request +## Concurrent segment search + +Starting in OpenSearch 2.12, [concurrent segment search]({{site.url}}{{site.baseurl}}/search-plugins/concurrent-segment-search/) allows each shard-level request to search segments in parallel during the query phase. The Profile API response contains several additional fields with statistics about _slices_. + +A slice is the unit of work that can be executed by a thread. Each query can be partitioned into multiple slices, with each slice containing one or more segments. All the slices can be executed either in parallel or in some order depending on the available threads in the pool. + +In general, the max/min/avg slice time captures statistics across all slices for a timing type. For example, when profiling aggregations, the `max_slice_time_in_nanos` field in the `aggregations` section shows the maximum time consumed by the aggregation operation and its children across all slices. + +#### Example request: Non-concurrent search To use the Profile API, include the `profile` parameter set to `true` in the search request sent to the `_search` endpoint: @@ -66,7 +74,7 @@ The Profile API response is verbose, so if you're running the request through th The response contains profiling information: -
+
Response @@ -213,6 +221,220 @@ The response contains profiling information: ```
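#### Example request: Concurrent segment search

The concurrent segment search response shown in the next section was produced by a profiled `match_all` query containing a histogram aggregation named `histo`, run while concurrent segment search was enabled. The following request is an illustrative sketch only: the index name (`idx`) is taken from the shard ID in the example response, and the aggregated field (`price`) and interval are assumed.

```json
GET /idx/_search
{
  "profile": true,
  "query": {
    "match_all": {}
  },
  "aggs": {
    "histo": {
      "histogram": {
        "field": "price",
        "interval": 10
      }
    }
  }
}
```
{% include copy-curl.html %}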
+#### Example response: Concurrent segment search + +The following is an example response for a concurrent segment search with three segment slices: + +
+ + Response + + {: .text-delta} + +```json +{ + "took": 10, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 5, + "relation": "eq" + }, + "max_score": 1.0, + "hits": [ + ... + ] + }, + "aggregations": { + ... + }, + "profile": { + "shards": [ + { + "id": "[9Y7lbpaWRhyr5Y-41Zl48g][idx][0]", + "inbound_network_time_in_millis": 0, + "outbound_network_time_in_millis": 0, + "searches": [ + { + "query": [ + { + "type": "MatchAllDocsQuery", + "description": "*:*", + "time_in_nanos": 868000, + "max_slice_time_in_nanos": 19376, + "min_slice_time_in_nanos": 12250, + "avg_slice_time_in_nanos": 16847, + "breakdown": { + "max_match": 0, + "set_min_competitive_score_count": 0, + "match_count": 0, + "avg_score_count": 1, + "shallow_advance_count": 0, + "next_doc": 29708, + "min_build_scorer": 3125, + "score_count": 5, + "compute_max_score_count": 0, + "advance": 0, + "min_set_min_competitive_score": 0, + "min_advance": 0, + "score": 29250, + "avg_set_min_competitive_score_count": 0, + "min_match_count": 0, + "avg_score": 333, + "max_next_doc_count": 3, + "max_compute_max_score_count": 0, + "avg_shallow_advance": 0, + "max_shallow_advance_count": 0, + "set_min_competitive_score": 0, + "min_build_scorer_count": 2, + "next_doc_count": 8, + "min_match": 0, + "avg_next_doc": 888, + "compute_max_score": 0, + "min_set_min_competitive_score_count": 0, + "max_build_scorer": 5791, + "avg_match_count": 0, + "avg_advance": 0, + "build_scorer_count": 6, + "avg_build_scorer_count": 2, + "min_next_doc_count": 2, + "min_shallow_advance_count": 0, + "max_score_count": 2, + "avg_match": 0, + "avg_compute_max_score": 0, + "max_advance": 0, + "avg_shallow_advance_count": 0, + "avg_set_min_competitive_score": 0, + "avg_compute_max_score_count": 0, + "avg_build_scorer": 4027, + "max_set_min_competitive_score_count": 0, + "advance_count": 0, + "max_build_scorer_count": 2, + "shallow_advance": 0, + "min_compute_max_score": 0, + "max_match_count": 0, + "create_weight_count": 1, + "build_scorer": 32459, + "max_set_min_competitive_score": 0, + "max_compute_max_score": 0, + "min_shallow_advance": 0, + "match": 0, + "max_shallow_advance": 0, + "avg_advance_count": 0, + "min_next_doc": 708, + "max_advance_count": 0, + "min_score": 291, + "max_next_doc": 999, + "create_weight": 1834, + "avg_next_doc_count": 2, + "max_score": 376, + "min_compute_max_score_count": 0, + "min_score_count": 1, + "min_advance_count": 0 + } + } + ], + "rewrite_time": 8126, + "collector": [ + { + "name": "QueryCollectorManager", + "reason": "search_multi", + "time_in_nanos": 564708, + "reduce_time_in_nanos": 1251042, + "max_slice_time_in_nanos": 121959, + "min_slice_time_in_nanos": 28958, + "avg_slice_time_in_nanos": 83208, + "slice_count": 3, + "children": [ + { + "name": "SimpleTopDocsCollectorManager", + "reason": "search_top_hits", + "time_in_nanos": 500459, + "reduce_time_in_nanos": 840125, + "max_slice_time_in_nanos": 22168, + "min_slice_time_in_nanos": 5792, + "avg_slice_time_in_nanos": 12084, + "slice_count": 3 + }, + { + "name": "NonGlobalAggCollectorManager: [histo]", + "reason": "aggregation", + "time_in_nanos": 552167, + "reduce_time_in_nanos": 311292, + "max_slice_time_in_nanos": 95333, + "min_slice_time_in_nanos": 18416, + "avg_slice_time_in_nanos": 66249, + "slice_count": 3 + } + ] + } + ] + } + ], + "aggregations": [ + { + "type": "NumericHistogramAggregator", + "description": "histo", + "time_in_nanos": 2847834, + "max_slice_time_in_nanos": 117374, + 
"min_slice_time_in_nanos": 20624, + "avg_slice_time_in_nanos": 75597, + "breakdown": { + "min_build_leaf_collector": 9500, + "build_aggregation_count": 3, + "post_collection": 3209, + "max_collect_count": 2, + "initialize_count": 3, + "reduce_count": 0, + "avg_collect": 17055, + "max_build_aggregation": 26000, + "avg_collect_count": 1, + "max_build_leaf_collector": 64833, + "min_build_leaf_collector_count": 1, + "build_aggregation": 41125, + "min_initialize": 583, + "max_reduce": 0, + "build_leaf_collector_count": 3, + "avg_reduce": 0, + "min_collect_count": 1, + "avg_build_leaf_collector_count": 1, + "avg_build_leaf_collector": 45000, + "max_collect": 24625, + "reduce": 0, + "avg_build_aggregation": 12013, + "min_post_collection": 292, + "max_initialize": 1333, + "max_post_collection": 750, + "collect_count": 5, + "avg_post_collection": 541, + "avg_initialize": 986, + "post_collection_count": 3, + "build_leaf_collector": 86833, + "min_collect": 6250, + "min_build_aggregation": 3541, + "initialize": 2786791, + "max_build_leaf_collector_count": 1, + "min_reduce": 0, + "collect": 29834 + }, + "debug": { + "total_buckets": 1 + } + } + ] + } + ] + } +} +``` +
+ ## Response fields The response includes the following fields. @@ -236,7 +458,10 @@ Field | Data type | Description :--- | :--- | :--- `type` | String | The Lucene query type into which the search query was rewritten. Corresponds to the Lucene class name (which often has the same name in OpenSearch). `description` | String | Contains a Lucene explanation of the query. Helps differentiate queries with the same type. -`time_in_nanos` | Long | The amount of time the query took to execute, in nanoseconds. In a parent query, the time is inclusive of the execution times of all the child queries. +`time_in_nanos` | Long | The total elapsed time for this query, in nanoseconds. For concurrent segment search, `time_in_nanos` is the total time spent across all the slices (the difference between the last completed slice execution end time and the first slice execution start time). +`max_slice_time_in_nanos` | Long | The maximum amount of time taken by any slice to run a query, in nanoseconds. This field is included only if you enable concurrent segment search. +`min_slice_time_in_nanos` | Long | The minimum amount of time taken by any slice to run a query, in nanoseconds. This field is included only if you enable concurrent segment search. +`avg_slice_time_in_nanos` | Long | The average amount of time taken by any slice to run a query, in nanoseconds. This field is included only if you enable concurrent segment search. [`breakdown`](#the-breakdown-object) | Object | Contains timing statistics about low-level Lucene execution. `children` | Array of objects | If a query has subqueries (children), this field contains information about the subqueries. @@ -256,6 +481,12 @@ Field | Description `compute_max_score` | Contains the amount of time required to execute the `getMaxScore` Lucene method. `set_min_competitive_score` | Contains the amount of time required to execute the `setMinCompetitiveScore` Lucene method. `_count` | Contains the number of invocations of a ``. For example, `advance_count` contains the number of invocations of the `advance` method. Different invocations of the same method occur because the method is called on different documents. You can determine the selectivity of a query by comparing counts in different query components. +`max_` | The maximum amount of time taken by any slice to run a query method. Breakdown stats for the `create_weight` method do not include profiled `max` time because the method runs at the query level rather than the slice level. This field is included only if you enable concurrent segment search. +`min_` | The minimum amount of time taken by any slice to run a query method. Breakdown stats for the `create_weight` method do not include profiled `min` time because the method runs at the query level rather than the slice level. This field is included only if you enable concurrent segment search. +`avg_` | The average amount of time taken by any slice to run a query method. Breakdown stats for the `create_weight` method do not include profiled `avg` time because the method runs at the query level rather than the slice level. This field is included only if you enable concurrent segment search. +`max__count` | The maximum number of invocations of a `` on any slice. Breakdown stats for the `create_weight` method do not include profiled `max` count because the method runs at the query level rather than the slice level. This field is included only if you enable concurrent segment search. +`min__count` | The minimum number of invocations of a `` on any slice. 
Breakdown stats for the `create_weight` method do not include profiled `min` count because the method runs at the query level rather than the slice level. This field is included only if you enable concurrent segment search. +`avg__count` | The average number of invocations of a `` on any slice. Breakdown stats for the `create_weight` method do not include profiled `avg` count because the method runs at the query level rather than the slice level. This field is included only if you enable concurrent segment search. ### The `collector` array @@ -265,8 +496,13 @@ Field | Description :--- | :--- `name` | The collector name. In the [example response](#example-response), the `collector` is a single `SimpleTopScoreDocCollector`---the default scoring and sorting collector. `reason` | Contains a description of the collector. For possible field values, see [Collector reasons](#collector-reasons). -`time_in_nanos` | A wall-clock time, including timing for all children. +`time_in_nanos` | The total elapsed time for this collector, in nanoseconds. For concurrent segment search, `time_in_nanos` is the total amount of time across all slices (the difference between the last completed slice execution end time and the first slice execution start time). `children` | If a collector has subcollectors (children), this field contains information about the subcollectors. +`max_slice_time_in_nanos` |The maximum amount of time taken by any slice, in nanoseconds. This field is included only if you enable concurrent segment search. +`min_slice_time_in_nanos` |The minimum amount of time taken by any slice, in nanoseconds. This field is included only if you enable concurrent segment search. +`avg_slice_time_in_nanos` |The average amount of time taken by any slice, in nanoseconds. This field is included only if you enable concurrent segment search. +`slice_count` |The total slice count for this query. This field is included only if you enable concurrent segment search. +`reduce_time_in_nanos` |The amount of time taken to reduce results for all slice collectors, in nanoseconds. This field is included only if you enable concurrent segment search. Collector times are calculated, combined, and normalized independently, so they are independent of query times. {: .note} @@ -317,7 +553,7 @@ GET /opensearch_dashboards_sample_data_ecommerce/_search The response contains profiling information: -
+
Response @@ -589,7 +825,7 @@ GET /opensearch_dashboards_sample_data_ecommerce/_search The response contains profiling information: -
+
Response @@ -738,10 +974,13 @@ Field | Data type | Description :--- | :--- | :--- `type` | String | The aggregator type. In the [non-global aggregation example response](#example-response-non-global-aggregation), the aggregator type is `AvgAggregator`. [Global aggregation example response](#example-request-global-aggregation) contains a `GlobalAggregator` with an `AvgAggregator` child. `description` | String | Contains a Lucene explanation of the aggregation. Helps differentiate aggregations with the same type. -`time_in_nanos` | Long | The amount of time taken to execute the aggregation, in nanoseconds. In a parent aggregation, the time is inclusive of the execution times of all the child aggregations. +`time_in_nanos` | Long | The total elapsed time for this aggregation, in nanoseconds. For concurrent segment search, `time_in_nanos` is the total amount of time across all slices (the difference between the last completed slice execution end time and the first slice execution start time). [`breakdown`](#the-breakdown-object-1) | Object | Contains timing statistics about low-level Lucene execution. `children` | Array of objects | If an aggregation has subaggregations (children), this field contains information about the subaggregations. `debug` | Object | Some aggregations return a `debug` object that describes the details of the underlying execution. +`max_slice_time_in_nanos` |Long | The maximum amount of time taken by any slice to run an aggregation, in nanoseconds. This field is included only if you enable concurrent segment search. +`min_slice_time_in_nanos` |Long |The minimum amount of time taken by any slice to run an aggregation, in nanoseconds. This field is included only if you enable concurrent segment search. +`avg_slice_time_in_nanos` |Long |The average amount of time taken by any slice to run an aggregation, in nanoseconds. This field is included only if you enable concurrent segment search. ### The `breakdown` object @@ -749,221 +988,17 @@ The `breakdown` object represents the timing statistics about low-level Lucene e Field | Description :--- | :--- -`initialize` | Contains the amount of time taken to execute the `preCollection()` callback method during `AggregationCollectorManager` creation. -`build_leaf_collector`| Contains the time spent running the `getLeafCollector()` method of the aggregation, which creates a new collector to collect the given context. -`collect`| Contains the time spent collecting the documents into buckets. -`post_collection`| Contains the time spent running the aggregation’s `postCollection()` callback method. -`build_aggregation`| Contains the time spent running the aggregation’s `buildAggregations()` method, which builds the results of this aggregation. -`reduce`| Contains the time spent in the `reduce` phase. +`initialize` | Contains the amount of time taken to execute the `preCollection()` callback method during `AggregationCollectorManager` creation. For concurrent segment search, the `initialize` method contains the total elapsed time across all slices (the difference between the last completed slice execution end time and the first slice execution start time). +`build_leaf_collector`| Contains the time spent running the aggregation's `getLeafCollector()` method, which creates a new collector to collect the given context. For concurrent segment search, the `build_leaf_collector` method contains the total elapsed time across all slices (the difference between the last completed slice execution end time and the first slice execution start time). 
+`collect`| Contains the time spent collecting the documents into buckets. For concurrent segment search, the `collect` method contains the total elapsed time across all slices (the difference between the last completed slice execution end time and the first slice execution start time). +`post_collection`| Contains the time spent running the aggregation’s `postCollection()` callback method. For concurrent segment search, the `post_collection` method contains the total elapsed time across all slices (the difference between the last completed slice execution end time and the first slice execution start time). +`build_aggregation`| Contains the time spent running the aggregation’s `buildAggregations()` method, which builds the results of this aggregation. For concurrent segment search, the `build_aggregation` method contains the total elapsed time across all slices (the difference between the last completed slice execution end time and the first slice execution start time). +`reduce`| Contains the time spent in the `reduce` phase. For concurrent segment search, the `reduce` method contains the total elapsed time across all slices (the difference between the last completed slice execution end time and the first slice execution start time). `_count` | Contains the number of invocations of a ``. For example, `build_leaf_collector_count` contains the number of invocations of the `build_leaf_collector` method. - -## Concurrent segment search - -Starting in OpenSearch 2.10, [concurrent segment search]({{site.url}}{{site.baseurl}}/search-plugins/concurrent-segment-search/) allows each shard-level request to search segments in parallel during the query phase. If you enable the experimental concurrent segment search feature flag, the Profile API response will contain several additional fields with statistics about _slices_. - -A slice is the unit of work that can be executed by a thread. Each query can be partitioned into multiple slices, with each slice containing one or more segments. All the slices can be executed either in parallel or in some order depending on the available threads in the pool. - -In general, the max/min/avg slice time captures statistics across all slices for a timing type. For example, when profiling aggregations, the `max_slice_time_in_nanos` field in the `aggregations` section shows the maximum time consumed by the aggregation operation and its children across all slices. - -#### Example response - -The following is an example response for a concurrent search with three segment slices: - -
- - Response - - {: .text-delta} - -```json -{ - "took": 76, - "timed_out": false, - "_shards": { - "total": 1, - "successful": 1, - "skipped": 0, - "failed": 0 - }, - "hits": { - "total": { - "value": 5, - "relation": "eq" - }, - "max_score": 1, - "hits": [ - ... - ] - }, - "aggregations": { - ... - }, - "profile": { - "shards": [ - { - "id": "[Sn2zHhcMTRetEjXvppU8bA][idx][0]", - "inbound_network_time_in_millis": 0, - "outbound_network_time_in_millis": 0, - "searches": [ - { - "query": [ - { - "type": "MatchAllDocsQuery", - "description": "*:*", - "time_in_nanos": 429246, - "breakdown": { - "set_min_competitive_score_count": 0, - "match_count": 0, - "shallow_advance_count": 0, - "set_min_competitive_score": 0, - "next_doc": 5485, - "match": 0, - "next_doc_count": 5, - "score_count": 5, - "compute_max_score_count": 0, - "compute_max_score": 0, - "advance": 3350, - "advance_count": 3, - "score": 5920, - "build_scorer_count": 6, - "create_weight": 429246, - "shallow_advance": 0, - "create_weight_count": 1, - "build_scorer": 2221054 - } - } - ], - "rewrite_time": 12442, - "collector": [ - { - "name": "QueryCollectorManager", - "reason": "search_multi", - "time_in_nanos": 6786930, - "reduce_time_in_nanos": 5892759, - "max_slice_time_in_nanos": 5951808, - "min_slice_time_in_nanos": 5798174, - "avg_slice_time_in_nanos": 5876588, - "slice_count": 3, - "children": [ - { - "name": "SimpleTopDocsCollectorManager", - "reason": "search_top_hits", - "time_in_nanos": 1340186, - "reduce_time_in_nanos": 1084060, - "max_slice_time_in_nanos": 457165, - "min_slice_time_in_nanos": 433706, - "avg_slice_time_in_nanos": 443332, - "slice_count": 3 - }, - { - "name": "NonGlobalAggCollectorManager: [histo]", - "reason": "aggregation", - "time_in_nanos": 5366791, - "reduce_time_in_nanos": 4637260, - "max_slice_time_in_nanos": 4526680, - "min_slice_time_in_nanos": 4414049, - "avg_slice_time_in_nanos": 4487122, - "slice_count": 3 - } - ] - } - ] - } - ], - "aggregations": [ - { - "type": "NumericHistogramAggregator", - "description": "histo", - "time_in_nanos": 16454372, - "max_slice_time_in_nanos": 7342096, - "min_slice_time_in_nanos": 4413728, - "avg_slice_time_in_nanos": 5430066, - "breakdown": { - "min_build_leaf_collector": 4320259, - "build_aggregation_count": 3, - "post_collection": 9942, - "max_collect_count": 2, - "initialize_count": 3, - "reduce_count": 0, - "avg_collect": 146319, - "max_build_aggregation": 2826399, - "avg_collect_count": 1, - "max_build_leaf_collector": 4322299, - "min_build_leaf_collector_count": 1, - "build_aggregation": 3038635, - "min_initialize": 1057, - "max_reduce": 0, - "build_leaf_collector_count": 3, - "avg_reduce": 0, - "min_collect_count": 1, - "avg_build_leaf_collector_count": 1, - "avg_build_leaf_collector": 4321197, - "max_collect": 181266, - "reduce": 0, - "avg_build_aggregation": 954896, - "min_post_collection": 1236, - "max_initialize": 11603, - "max_post_collection": 5350, - "collect_count": 5, - "avg_post_collection": 2793, - "avg_initialize": 4860, - "post_collection_count": 3, - "build_leaf_collector": 4322299, - "min_collect": 78519, - "min_build_aggregation": 8543, - "initialize": 11971068, - "max_build_leaf_collector_count": 1, - "min_reduce": 0, - "collect": 181838 - }, - "debug": { - "total_buckets": 1 - } - } - ] - } - ] - } -} -``` -
- -### Modified or added response fields - -The following sections contain definitions of all modified or added response fields for concurrent segment search. - -#### The `query` array - -|Field |Description | -|:--- |:--- | -|`time_in_nanos` |For concurrent segment search, `time_in_nanos` is the cumulative amount of time taken to run all methods across all slices, in nanoseconds. This is not equivalent to the actual amount of time the query took to run because it does not take into account that multiple slices can run the methods in parallel. | -|`breakdown.` |For concurrent segment search, this field contains the total amount of time taken by all segments to run a method. | -|`breakdown._count` |For concurrent segment search, this field contains the total number of invocations of a `` obtained by adding the number of method invocations for all segments. | - -#### The `collector` array - -|Field |Description | -|:--- |:--- | -|`time_in_nanos` |The total elapsed time for this collector, in nanoseconds. For concurrent segment search, `time_in_nanos` is the total amount of time across all slices (`max(slice_end_time) - min(slice_start_time)`). | -|`max_slice_time_in_nanos` |The maximum amount of time taken by any slice, in nanoseconds. | -|`min_slice_time_in_nanos` |The minimum amount of time taken by any slice, in nanoseconds. | -|`avg_slice_time_in_nanos` |The average amount of time taken by any slice, in nanoseconds. | -|`slice_count` |The total slice count for this query. | -|`reduce_time_in_nanos` |The amount of time taken to reduce results for all slice collectors, in nanoseconds. | - -#### The `aggregations` array - -|Field |Description | -|:--- |:--- | -|`time_in_nanos` |The total elapsed time for this aggregation, in nanoseconds. For concurrent segment search, `time_in_nanos` is the total amount of time across all slices (`max(slice_end_time) - min(slice_start_time)`). | -|`max_slice_time_in_nanos` |The maximum amount of time taken by any slice to run an aggregation, in nanoseconds. | -|`min_slice_time_in_nanos` |The minimum amount of time taken by any slice to run an aggregation, in nanoseconds. | -|`avg_slice_time_in_nanos` |The average amount of time taken by any slice to run an aggregation, in nanoseconds. | -|`` |The total elapsed time across all slices (`max(slice_end_time) - min(slice_start_time)`). For example, for the `collect` method, it is the total time spent collecting documents into buckets across all slices. | -|`max_` |The maximum amount of time taken by any slice to run an aggregation method. | -|`min_`|The minimum amount of time taken by any slice to run an aggregation method. | -|`avg_` |The average amount of time taken by any slice to run an aggregation method. | -|`_count` |The total method count across all slices. For example, for the `collect` method, it is the total number of invocations of this method needed to collect documents into buckets across all slices. | -|`max__count` |The maximum number of invocations of a `` on any slice. | -|`min__count` |The minimum number of invocations of a `` on any slice. | -|`avg__count` |The average number of invocations of a `` on any slice. | +`max_` |The maximum amount of time taken by any slice to run an aggregation method. This field is included only if you enable concurrent segment search. +`min_`|The minimum amount of time taken by any slice to run an aggregation method. This field is included only if you enable concurrent segment search. 
+`avg_` |The average amount of time taken by any slice to run an aggregation method. This field is included only if you enable concurrent segment search. +`_count` |The total method count across all slices. For example, for the `collect` method, it is the total number of invocations of this method needed to collect documents into buckets across all slices. +`max__count` |The maximum number of invocations of a `` on any slice. This field is included only if you enable concurrent segment search. +`min__count` |The minimum number of invocations of a `` on any slice. This field is included only if you enable concurrent segment search. +`avg__count` |The average number of invocations of a `` on any slice. This field is included only if you enable concurrent segment search. diff --git a/_api-reference/search.md b/_api-reference/search.md index b23929afc5..71d96bcaef 100644 --- a/_api-reference/search.md +++ b/_api-reference/search.md @@ -86,6 +86,7 @@ track_scores | Boolean | Whether to return document scores. Default is false. track_total_hits | Boolean or Integer | Whether to return how many documents matched the query. typed_keys | Boolean | Whether returned aggregations and suggested terms should include their types in the response. Default is true. version | Boolean | Whether to include the document version as a match. +include_named_queries_score | Boolean | Whether to return scores with named queries. Default is false. ### The `preference` query parameter @@ -107,6 +108,7 @@ All fields are optional. Field | Type | Description :--- | :--- | :--- +aggs | Object | In the optional `aggs` parameter, you can define any number of aggregations. Each aggregation is defined by its name and one of the types of aggregations that OpenSearch supports. For more information, see [Aggregations]({{site.url}}{{site.baseurl}}/aggregations/). docvalue_fields | Array of objects | The fields that OpenSearch should return using their docvalue forms. Specify a format to return results in a certain format, such as date and time. fields | Array | The fields to search for in the request. Specify a format to return results in a certain format, such as date and time. explain | String | Whether to return details about how OpenSearch computed the document's score. Default is false. diff --git a/_api-reference/snapshots/get-snapshot-status.md b/_api-reference/snapshots/get-snapshot-status.md index 02aa419042..6f8320d0b0 100644 --- a/_api-reference/snapshots/get-snapshot-status.md +++ b/_api-reference/snapshots/get-snapshot-status.md @@ -29,9 +29,9 @@ Three request variants provide flexibility: * `GET _snapshot/_status` returns the status of all currently running snapshots in all repositories. -* `GET _snapshot//_status` returns the status of only currently running snapshots in the specified repository. This is the preferred variant. +* `GET _snapshot//_status` returns all currently running snapshots in the specified repository. This is the preferred variant. -* `GET _snapshot///_status` returns the status of all snapshots in the specified repository whether they are running or not. +* `GET _snapshot///_status` returns detailed status information for a specific snapshot in the specified repository, regardless of whether it's currently running or not. Using the API to return state for other than currently running snapshots can be very costly for (1) machine machine resources and (2) processing time if running in the cloud. For each snapshot, each request causes file reads from all a snapshot's shards. 
{: .warning} @@ -420,4 +420,4 @@ All property values are Integers. :--- | :--- | :--- | | shards_stats | Object | See [Shard stats](#shard-stats). | | stats | Object | See [Snapshot file stats](#snapshot-file-stats). | -| shards | list of Objects | List of objects containing information about the shards that include the snapshot. Properies of the shards are listed below in bold text.

**stage**: Current state of shards in the snapshot. Shard states are:

* DONE: Number of shards in the snapshot that were successfully stored in the repository.

* FAILURE: Number of shards in the snapshot that were not successfully stored in the repository.

* FINALIZE: Number of shards in the snapshot that are in the finalizing stage of being stored in the repository.

* INIT: Number of shards in the snapshot that are in the initializing stage of being stored in the repository.

* STARTED: Number of shards in the snapshot that are in the started stage of being stored in the repository.

**stats**: See [Snapshot file stats](#snapshot-file-stats).

**total**: Total number and size of files referenced by the snapshot.

**start_time_in_millis**: Time (in milliseconds) when snapshot creation began.

**time_in_millis**: Total time (in milliseconds) that the snapshot took to complete. | \ No newline at end of file +| shards | List of objects | List of objects containing information about the shards included in the snapshot. OpenSearch returns the following properties about the shards.

**stage**: Current state of shards in the snapshot. Shard states are:

* DONE: Number of shards in the snapshot that were successfully stored in the repository.

* FAILURE: Number of shards in the snapshot that were not successfully stored in the repository.

* FINALIZE: Number of shards in the snapshot that are in the finalizing stage of being stored in the repository.

* INIT: Number of shards in the snapshot that are in the initializing stage of being stored in the repository.

* STARTED: Number of shards in the snapshot that are in the started stage of being stored in the repository.

**stats**: See [Snapshot file stats](#snapshot-file-stats).

**total**: Total number and size of files referenced by the snapshot.

**start_time_in_millis**: Time (in milliseconds) when snapshot creation began.

**time_in_millis**: Total time (in milliseconds) that the snapshot took to complete. | diff --git a/_api-reference/tasks.md b/_api-reference/tasks.md index 19ef373806..5c3a41fd34 100644 --- a/_api-reference/tasks.md +++ b/_api-reference/tasks.md @@ -267,7 +267,7 @@ To associate requests with tasks for better tracking, you can provide a `X-Opaqu Usage: ```bash -curl -i -H "X-Opaque-Id: 111111" "https://localhost:9200/_tasks" -u 'admin:admin' --insecure +curl -i -H "X-Opaque-Id: 111111" "https://localhost:9200/_tasks" -u 'admin:' --insecure ``` {% include copy.html %} @@ -326,6 +326,6 @@ content-length: 768 This operation supports the same parameters as the `tasks` operation. The following example shows how you can associate `X-Opaque-Id` with specific tasks: ```bash -curl -i -H "X-Opaque-Id: 123456" "https://localhost:9200/_tasks?nodes=opensearch-node1" -u 'admin:admin' --insecure +curl -i -H "X-Opaque-Id: 123456" "https://localhost:9200/_tasks?nodes=opensearch-node1" -u 'admin:' --insecure ``` {% include copy.html %} diff --git a/_automating-configurations/api/create-workflow.md b/_automating-configurations/api/create-workflow.md new file mode 100644 index 0000000000..5c501ce4e8 --- /dev/null +++ b/_automating-configurations/api/create-workflow.md @@ -0,0 +1,255 @@ +--- +layout: default +title: Create or update a workflow +parent: Workflow APIs +nav_order: 10 +--- + +# Create or update a workflow + +Creating a workflow adds the content of a workflow template to the flow framework system index. You can provide workflows in JSON format (by specifying `Content-Type: application/json`) or YAML format (by specifying `Content-Type: application/yaml`). By default, the workflow is validated to help identify invalid configurations, including: + +* Workflow steps requiring an OpenSearch plugin that is not installed. +* Workflow steps relying on previous node input that is provided by those steps. +* Workflow step fields with invalid values. +* Workflow graph (node/edge) configurations containing cycles or with duplicate IDs. + +To obtain the validation template for workflow steps, call the [Get Workflow Steps API]({{site.url}}{{site.baseurl}}/automating-configurations/api/get-workflow-steps/). + +You can include placeholder expressions in the value of workflow step fields. For example, you can specify a credential field in a template as `openAI_key: '${{ openai_key }}'`. The expression will be substituted with the user-provided value during provisioning, using the format {% raw %}`${{ }}`{% endraw %}. You can pass the actual key as a parameter by using the [Provision Workflow API]({{site.url}}{{site.baseurl}}/automating-configurations/api/provision-workflow/) or by using this API with the `provision` parameter set to `true`. + +Once a workflow is created, provide its `workflow_id` to other APIs. + +The `POST` method creates a new workflow. The `PUT` method updates an existing workflow. + +You can only update a workflow if it has not yet been provisioned. +{: .note} + +## Path and HTTP methods + +```json +POST /_plugins/_flow_framework/workflow +PUT /_plugins/_flow_framework/workflow/ +``` + +## Path parameters + +The following table lists the available path parameters. + +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `workflow_id` | String | The ID of the workflow to be updated. Required for the `PUT` method. | + +## Query parameters + +Workflows are normally created and provisioned in separate steps. 
However, once you have thoroughly tested the workflow, you can combine the create and provision steps by including the `provision` query parameter: + +```json +POST /_plugins/_flow_framework/workflow?provision=true +``` +{% include copy-curl.html %} + +When set to `true`, the [Provision Workflow API]({{site.url}}{{site.baseurl}}/automating-configurations/api/provision-workflow/) is executed immediately following creation. + +By default, workflows are validated when they are created to ensure that the syntax is valid and that the graph does not contain cycles. This behavior can be controlled with the `validation` query parameter. If `validation` is set to `all`, OpenSearch performs a complete template validation. Any other value of the `validation` parameter suppresses validation, allowing an incomplete/work-in-progress template to be saved. To disable template validation, set `validation` to `none`: + +```json +POST /_plugins/_flow_framework/workflow?validation=none +``` +{% include copy-curl.html %} + +The following table lists the available query parameters. All query parameters are optional. User-provided parameters are only allowed if the `provision` parameter is set to `true`. + +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `provision` | Boolean | Whether to provision the workflow as part of the request. Default is `false`. | +| `validation` | String | Whether to validate the workflow. Valid values are `all` (validate the template) and `none` (do not validate the template). Default is `all`. | +| User-provided substitution expressions | String | Parameters matching substitution expressions in the template. Only allowed if `provision` is set to `true`. Optional. If `provision` is set to `false`, you can pass these parameters in the [Provision Workflow API query parameters]({{site.url}}{{site.baseurl}}/automating-configurations/api/provision-workflow/#query-parameters). | + +## Request fields + +The following table lists the available request fields. + +|Field |Data type |Required/Optional |Description | +|:--- |:--- |:--- |:--- | +|`name` |String |Required |The name of the workflow. | +|`description` |String |Optional |A description of the workflow. | +|`use_case` |String |Optional | A use case, which can be used with the Search Workflow API to find related workflows. In the future, OpenSearch may provide some standard use cases to ease categorization, but currently you can use this field to specify custom values. | +|`version` |Object |Optional | A key-value map with two fields: `template`, which identifies the template version, and `compatibility`, which identifies a list of minimum required OpenSearch versions. | +|`workflows` |Object |Optional |A map of workflows. Presently, only the `provision` key is supported. The value for the workflow key is a key-value map that includes fields for `user_params` and lists of `nodes` and `edges`. | + +#### Example request: Register and deploy an externally hosted model (YAML) + +To provide a template in YAML format, specify `Content-Type: application/yaml` in the request header: + +```bash +curl -XPOST "http://localhost:9200/_plugins/_flow_framework/workflow" -H 'Content-Type: application/yaml' +``` + +YAML templates permit comments. 
+{: .tip} + +The following is an example YAML template for registering and deploying an externally hosted model: + +```yaml +# This name is required +name: createconnector-registerremotemodel-deploymodel +# Other fields are optional but useful +description: This template creates a connector to a remote model, registers it, and + deploys that model +# Other templates with a similar use case can be searched +use_case: REMOTE_MODEL_DEPLOYMENT +version: + # Templates may be versioned by their authors + template: 1.0.0 + # Compatibility with OpenSearch 2.12.0 and higher and 3.0.0 and higher + compatibility: + - 2.12.0 + - 3.0.0 +# One or more workflows can be included, presently only provision is supported +workflows: + provision: + # These nodes are the workflow steps corresponding to ML Commons APIs + nodes: + # This ID must be unique to this workflow + - id: create_connector_1 + # There may be multiple steps with the same type + type: create_connector + # These inputs match the Create Connector API body + user_inputs: + name: OpenAI Chat Connector + description: The connector to public OpenAI model service for GPT 3.5 + version: '1' + protocol: http + parameters: + endpoint: api.openai.com + model: gpt-3.5-turbo + credential: + openAI_key: '12345' + actions: + - action_type: predict + method: POST + url: https://${parameters.endpoint}/v1/chat/completions + # This ID must be unique to this workflow + - id: register_model_2 + type: register_remote_model + # This step needs the connector_id produced as an output of the previous step + previous_node_inputs: + create_connector_1: connector_id + # These inputs match the Register Model API body + user_inputs: + name: openAI-gpt-3.5-turbo + function_name: remote + description: test model + # This ID must be unique to this workflow + - id: deploy_model_3 + type: deploy_model + # This step needs the model_id produced as an output of the previous step + previous_node_inputs: + register_model_2: model_id + # Since the nodes include previous_node_inputs these are optional to define + # They will be added automatically and included in the stored template + # Additional edges may also be added here if required for sequencing + edges: + - source: create_connector_1 + dest: register_model_2 + - source: register_model_2 + dest: deploy_model_3 +``` +{% include copy-curl.html %} + +#### Example request: Register and deploy a remote model (JSON) + +To provide a template in JSON format, specify `Content-Type: application/json` in the request header: + +```bash +curl -XPOST "http://localhost:9200/_plugins/_flow_framework/workflow" -H 'Content-Type: application/json' +``` +The following JSON template is equivalent to the YAML template provided in the previous section: + +```json +{ + "name": "createconnector-registerremotemodel-deploymodel", + "description": "This template creates a connector to a remote model, registers it, and deploys that model", + "use_case": "REMOTE_MODEL_DEPLOYMENT", + "version": { + "template": "1.0.0", + "compatibility": [ + "2.12.0", + "3.0.0" + ] + }, + "workflows": { + "provision": { + "nodes": [ + { + "id": "create_connector_1", + "type": "create_connector", + "user_inputs": { + "name": "OpenAI Chat Connector", + "description": "The connector to public OpenAI model service for GPT 3.5", + "version": "1", + "protocol": "http", + "parameters": { + "endpoint": "api.openai.com", + "model": "gpt-3.5-turbo" + }, + "credential": { + "openAI_key": "12345" + }, + "actions": [ + { + "action_type": "predict", + "method": "POST", + "url": 
"https://${parameters.endpoint}/v1/chat/completions" + } + ] + } + }, + { + "id": "register_model_2", + "type": "register_remote_model", + "previous_node_inputs": { + "create_connector_1": "connector_id" + }, + "user_inputs": { + "name": "openAI-gpt-3.5-turbo", + "function_name": "remote", + "description": "test model" + } + }, + { + "id": "deploy_model_3", + "type": "deploy_model", + "previous_node_inputs": { + "register_model_2": "model_id" + } + } + ], + "edges": [ + { + "source": "create_connector_1", + "dest": "register_model_2" + }, + { + "source": "register_model_2", + "dest": "deploy_model_3" + } + ] + } + } +} +``` +{% include copy-curl.html %} + +#### Example response + +OpenSearch responds with the `workflow_id`: + +```json +{ + "workflow_id" : "8xL8bowB8y25Tqfenm50" +} +``` + +Once you have created a workflow, you can use other workflow APIs with the `workflow_id`. \ No newline at end of file diff --git a/_automating-configurations/api/delete-workflow.md b/_automating-configurations/api/delete-workflow.md new file mode 100644 index 0000000000..db3a340cee --- /dev/null +++ b/_automating-configurations/api/delete-workflow.md @@ -0,0 +1,53 @@ +--- +layout: default +title: Delete a workflow +parent: Workflow APIs +nav_order: 80 +--- + +# Delete a workflow + +When you no longer need a workflow template, you can delete it by calling the Delete Workflow API. + +Note that deleting a workflow only deletes the stored template but does not deprovision its resources. + +## Path and HTTP methods + +```json +DELETE /_plugins/_flow_framework/workflow/ +``` + +## Path parameters + +The following table lists the available path parameters. + +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `workflow_id` | String | The ID of the workflow to be retrieved. Required. | + +#### Example request + +``` +DELETE /_plugins/_flow_framework/workflow/8xL8bowB8y25Tqfenm50 +``` +{% include copy-curl.html %} + +#### Example response + +If the workflow exists, a delete response contains the status of the deletion, where the `result` field is set to `deleted` on success or `not_found` if the workflow does not exist (it may have already been deleted): + +```json +{ + "_index": ".plugins-flow_framework-templates", + "_id": "8xL8bowB8y25Tqfenm50", + "_version": 2, + "result": "deleted", + "_shards": { + "total": 1, + "successful": 1, + "failed": 0 + }, + "_seq_no": 2, + "_primary_term": 1 +} +``` \ No newline at end of file diff --git a/_automating-configurations/api/deprovision-workflow.md b/_automating-configurations/api/deprovision-workflow.md new file mode 100644 index 0000000000..e9219536ce --- /dev/null +++ b/_automating-configurations/api/deprovision-workflow.md @@ -0,0 +1,58 @@ +--- +layout: default +title: Deprovision a workflow +parent: Workflow APIs +nav_order: 70 +--- + +# Deprovision a workflow + +When you no longer need a workflow, you can deprovision its resources. Most workflow steps that create a resource have corresponding workflow steps to reverse that action. To retrieve all resources currently created for a workflow, call the [Get Workflow Status API]({{site.url}}{{site.baseurl}}/automating-configurations/api/get-workflow-status/). When you call the Deprovision Workflow API, resources included in the `resources_created` field of the Get Workflow Status API response will be removed using a workflow step corresponding to the one that provisioned them. + +The workflow executes the provisioning workflow steps in reverse order. 
If failures occur because of resource dependencies, such as preventing deletion of a registered model if it is still deployed, the workflow attempts retries. + +## Path and HTTP methods + +```json +POST /_plugins/_flow_framework/workflow//_deprovision +``` + +## Path parameters + +The following table lists the available path parameters. + +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `workflow_id` | String | The ID of the workflow to be deprovisioned. Required. | + +### Example request + +```json +POST /_plugins/_flow_framework/workflow/8xL8bowB8y25Tqfenm50/_deprovision +``` +{% include copy-curl.html %} + +### Example response + +If deprovisioning is successful, OpenSearch responds with the same `workflow_id` that was used in the request: + +```json +{ + "workflow_id" : "8xL8bowB8y25Tqfenm50" +} +``` + +If deprovisioning did not completely remove all resources, OpenSearch responds with a `202 (ACCEPTED)` status and identifies the resources that were not deprovisioned: + +```json +{ + "error": "Failed to deprovision some resources: [connector_id Lw7PX4wBfVtHp98y06wV]." +} +``` + +In some cases, the failure happens because of another dependent resource that took some time to be removed. In this case, you can attempt to send the same request again. +{: .tip} + +To obtain a more detailed deprovisioning status than is provided by the summary in the error response, query the [Get Workflow Status API]({{site.url}}{{site.baseurl}}/automating-configurations/api/get-workflow-status/). + +On success, the workflow returns to a `NOT_STARTED` state. If some resources have not yet been removed, they are provided in the response. \ No newline at end of file diff --git a/_automating-configurations/api/get-workflow-status.md b/_automating-configurations/api/get-workflow-status.md new file mode 100644 index 0000000000..280fb52195 --- /dev/null +++ b/_automating-configurations/api/get-workflow-status.md @@ -0,0 +1,111 @@ +--- +layout: default +title: Get a workflow status +parent: Workflow APIs +nav_order: 40 +--- + +# Get a workflow status + +[Provisioning a workflow]({{site.url}}{{site.baseurl}}/automating-configurations/api/provision-workflow/) may take a significant amount of time, particularly when the action is associated with OpenSearch indexing operations. The Get Workflow State API permits monitoring of the provisioning deployment status until it is complete. + +## Path and HTTP methods + +```json +GET /_plugins/_flow_framework/workflow//_status +``` + +## Path parameters + +The following table lists the available path parameters. + +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `workflow_id` | String | The ID of the workflow from which to obtain the status. Required for the `PUT` method. | + +## Query parameters + +The `all` parameter specifies whether the response should return all fields. 
+ +When set to `false` (the default), the response contains the following fields: + +- `workflow_id` +- any `error` state +- `state` +- a list of `resources_created` + +When set to `true`, the response contains the following additional fields: + +- `provisioning_progress` +- `provision_start_time` +- `provision_end_time` +- `user` +- `user_outputs` + +To receive all available fields in the response, set `all` to `true`: + +```json +GET /_plugins/_flow_framework/workflow/8xL8bowB8y25Tqfenm50/_status?all=true +``` +{% include copy-curl.html %} + +#### Example request + +```json +GET /_plugins/_flow_framework/workflow/8xL8bowB8y25Tqfenm50/_status +``` +{% include copy-curl.html %} + + +#### Example response + +OpenSearch responds with a summary of the provisioning status and a list of created resources. + +Before provisioning has begun, OpenSearch does not return any resources: + +```json +{ + "workflow_id" : "8xL8bowB8y25Tqfenm50", + "state": "NOT_STARTED" +} +``` + +While provisioning is in progress, OpenSearch returns a partial resource list: + +```json +{ + "workflow_id" : "8xL8bowB8y25Tqfenm50", + "state": "PROVISIONING", + "resources_created": [ + { + "workflow_step_name": "create_connector", + "workflow_step_id": "create_connector_1", + "resource_type": "connector_id", + "resource_id": "NdjCQYwBLmvn802B0IwE" + } + ] +} +``` + +Upon provisioning completion, OpenSearch returns the full resource list: + +```json +{ + "workflow_id" : "8xL8bowB8y25Tqfenm50", + "state": "COMPLETED", + "resources_created": [ + { + "workflow_step_name": "create_connector", + "workflow_step_id": "create_connector_1", + "resource_type": "connector_id", + "resource_id": "NdjCQYwBLmvn802B0IwE" + }, + { + "workflow_step_name": "register_remote_model", + "workflow_step_id": "register_model_2", + "resource_type": "model_id", + "resource_id": "N9jCQYwBLmvn802B0oyh" + } + ] +} +``` \ No newline at end of file diff --git a/_automating-configurations/api/get-workflow-steps.md b/_automating-configurations/api/get-workflow-steps.md new file mode 100644 index 0000000000..38059ec80c --- /dev/null +++ b/_automating-configurations/api/get-workflow-steps.md @@ -0,0 +1,76 @@ +--- +layout: default +title: Get workflow steps +parent: Workflow APIs +nav_order: 50 +--- + +# Get workflow steps + +This API returns a list of workflow steps, including their required inputs, outputs, default timeout values, and required plugins. For example, for the `register_remote_model` step, the Get Workflow Steps API returns the following information: + +```json +{ + "register_remote_model": { + "inputs": [ + "name", + "connector_id" + ], + "outputs": [ + "model_id", + "register_model_status" + ], + "required_plugins": [ + "opensearch-ml" + ] + } +} +``` + +## Path and HTTP methods + +```json +GET /_plugins/_flow_framework/workflow/_steps +GET /_plugins/_flow_framework/workflow/_step?workflow_step= +``` + +## Query parameters + +The following table lists the available query parameters. All query parameters are optional. + +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `workflow_step` | String | The name of the step to retrieve. Specify multiple step names as a comma-separated list. For example, `create_connector,delete_model,deploy_model`. 
| + +#### Example request + +To fetch all workflow steps, use the following request: + +```json +GET /_plugins/_flow_framework/workflow/_steps +``` +{% include copy-curl.html %} + +To fetch specific workflow steps, pass the step names to the request as a query parameter: + +```json +GET /_plugins/_flow_framework/workflow/_step?workflow_step=create_connector,delete_model,deploy_model +``` +{% include copy-curl.html %} + + +#### Example response + +OpenSearch responds with the workflow steps. The order of fields in the returned steps may not exactly match the original JSON but will function identically. + +To retrieve the template in YAML format, specify `Content-Type: application/yaml` in the request header: + +```bash +curl -XGET "http://localhost:9200/_plugins/_flow_framework/workflow/_steps" -H 'Content-Type: application/yaml' +``` + +To retrieve the template in JSON format, specify `Content-Type: application/json` in the request header: + +```bash +curl -XGET "http://localhost:9200/_plugins/_flow_framework/workflow/_steps" -H 'Content-Type: application/json' +``` \ No newline at end of file diff --git a/_automating-configurations/api/get-workflow.md b/_automating-configurations/api/get-workflow.md new file mode 100644 index 0000000000..7b1d5987c4 --- /dev/null +++ b/_automating-configurations/api/get-workflow.md @@ -0,0 +1,47 @@ +--- +layout: default +title: Get a workflow +parent: Workflow APIs +nav_order: 20 +--- + +# Get a workflow + +The Get Workflow API retrieves the workflow template. + +## Path and HTTP methods + +```json +GET /_plugins/_flow_framework/workflow/ +``` + +## Path parameters + +The following table lists the available path parameters. + +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `workflow_id` | String | The ID of the workflow to be retrieved. Required. | + +#### Example request + +```json +GET /_plugins/_flow_framework/workflow/8xL8bowB8y25Tqfenm50 +``` +{% include copy-curl.html %} + +#### Example response + +To retrieve a template in YAML format, specify `Content-Type: application/yaml` in the request header: + +```bash +curl -XGET "http://localhost:9200/_plugins/_flow_framework/workflow/8xL8bowB8y25Tqfenm50" -H 'Content-Type: application/yaml' +``` + +To retrieve a template in JSON format, specify `Content-Type: application/json` in the request header: + +```bash +curl -XGET "http://localhost:9200/_plugins/_flow_framework/workflow/8xL8bowB8y25Tqfenm50" -H 'Content-Type: application/json' +``` + +OpenSearch responds with the stored template containing the same content as the body of the [create workflow]({{site.url}}{{site.baseurl}}/automating-configurations/api/create-workflow/) request. The order of fields in the returned template may not exactly match the original template but will function identically. 
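For reference, the following truncated response illustrates what the stored template might look like, assuming you created the externally hosted model workflow from the [create workflow]({{site.url}}{{site.baseurl}}/automating-configurations/api/create-workflow/) example. Fields shown as `"..."` are omitted for brevity; your response contains whichever template you stored:

```json
{
  "name": "createconnector-registerremotemodel-deploymodel",
  "description": "This template creates a connector to a remote model, registers it, and deploys that model",
  "use_case": "REMOTE_MODEL_DEPLOYMENT",
  "version": {
    "template": "1.0.0",
    "compatibility": ["2.12.0", "3.0.0"]
  },
  "workflows": {
    "provision": {
      "nodes": [
        {
          "id": "create_connector_1",
          "type": "create_connector",
          "user_inputs": { "name": "OpenAI Chat Connector", "...": "..." }
        },
        {
          "id": "register_model_2",
          "type": "register_remote_model",
          "previous_node_inputs": { "create_connector_1": "connector_id" },
          "user_inputs": { "name": "openAI-gpt-3.5-turbo", "...": "..." }
        },
        {
          "id": "deploy_model_3",
          "type": "deploy_model",
          "previous_node_inputs": { "register_model_2": "model_id" }
        }
      ],
      "edges": [
        { "source": "create_connector_1", "dest": "register_model_2" },
        { "source": "register_model_2", "dest": "deploy_model_3" }
      ]
    }
  }
}
```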
\ No newline at end of file diff --git a/_automating-configurations/api/index.md b/_automating-configurations/api/index.md new file mode 100644 index 0000000000..716e19c41f --- /dev/null +++ b/_automating-configurations/api/index.md @@ -0,0 +1,21 @@ +--- +layout: default +title: Workflow APIs +nav_order: 40 +has_children: true +has_toc: false +--- + +# Workflow APIs + +OpenSearch supports the following workflow APIs: + +* [Create or update workflow]({{site.url}}{{site.baseurl}}/automating-configurations/api/create-workflow/) +* [Get workflow]({{site.url}}{{site.baseurl}}/automating-configurations/api/get-workflow/) +* [Provision workflow]({{site.url}}{{site.baseurl}}/automating-configurations/api/provision-workflow/) +* [Get workflow status]({{site.url}}{{site.baseurl}}/automating-configurations/api/get-workflow-status/) +* [Get workflow steps]({{site.url}}{{site.baseurl}}/automating-configurations/api/get-workflow-steps/) +* [Search workflow]({{site.url}}{{site.baseurl}}/automating-configurations/api/search-workflow/) +* [Search workflow state]({{site.url}}{{site.baseurl}}/automating-configurations/api/search-workflow-state/) +* [Deprovision workflow]({{site.url}}{{site.baseurl}}/automating-configurations/api/deprovision-workflow/) +* [Delete workflow]({{site.url}}{{site.baseurl}}/automating-configurations/api/delete-workflow/) \ No newline at end of file diff --git a/_automating-configurations/api/provision-workflow.md b/_automating-configurations/api/provision-workflow.md new file mode 100644 index 0000000000..62c4954ee9 --- /dev/null +++ b/_automating-configurations/api/provision-workflow.md @@ -0,0 +1,77 @@ +--- +layout: default +title: Provision a workflow +parent: Workflow APIs +nav_order: 30 +--- + +# Provision a workflow + +Provisioning a workflow is a one-time setup process usually performed by a cluster administrator to create resources that will be used by end users. + +The `workflows` template field may contain multiple workflows. The workflow with the `provision` key can be executed with this API. This API is also executed when the [Create or Update Workflow API]({{site.url}}{{site.baseurl}}/automating-configurations/api/create-workflow/) is called with the `provision` parameter set to `true`. + +You can only provision a workflow if it has not yet been provisioned. Deprovision the workflow if you need to repeat provisioning. +{: .note} + +## Path and HTTP methods + +```json +POST /_plugins/_flow_framework/workflow//_provision +``` + +## Path parameters + +The following table lists the available path parameters. + +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `workflow_id` | String | The ID of the workflow to be provisioned. Required. | + +## Query parameters + +If you have included a substitution expression in the template, you may pass it as a query parameter or as a string value of a request body field. For example, if you specified a credential field in a template as `openAI_key: '${{ openai_key }}'`, then you can include the `openai_key` parameter as a query parameter or body field so it can be substituted during provisioning. For example, the following request provides a query parameter: + +```json +POST /_plugins/_flow_framework/workflow//_provision?= +``` + +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| User-provided substitution expressions | String | Parameters matching substitution expressions in the template. Optional. 
| + +#### Example requests + +```json +POST /_plugins/_flow_framework/workflow/8xL8bowB8y25Tqfenm50/_provision +``` +{% include copy-curl.html %} + +The following request substitutes the expression `${{ openai_key }}` with the value "12345" using a query parameter: + +```json +POST /_plugins/_flow_framework/workflow/8xL8bowB8y25Tqfenm50/_provision?openai_key=12345 +``` +{% include copy-curl.html %} + +The following request substitutes the expression `${{ openai_key }}` with the value "12345" using the request body: + +```json +POST /_plugins/_flow_framework/workflow/8xL8bowB8y25Tqfenm50/_provision +{ + "openai_key" : "12345" +} +``` +{% include copy-curl.html %} + +#### Example response + +OpenSearch responds with the same `workflow_id` that was used in the request: + +```json +{ + "workflow_id" : "8xL8bowB8y25Tqfenm50" +} +``` + +To obtain the provisioning status, query the [Get Workflow State API]({{site.url}}{{site.baseurl}}/automating-configurations/api/get-workflow-status/). \ No newline at end of file diff --git a/_automating-configurations/api/search-workflow-state.md b/_automating-configurations/api/search-workflow-state.md new file mode 100644 index 0000000000..1cacb3a32b --- /dev/null +++ b/_automating-configurations/api/search-workflow-state.md @@ -0,0 +1,60 @@ +--- +layout: default +title: Search for a workflow state +parent: Workflow APIs +nav_order: 65 +--- + +# Search for a workflow + +You can search for resources created by workflows by matching a query to a field. The fields you can search correspond to those returned by the [Get Workflow Status API]({{site.url}}{{site.baseurl}}/automating-configurations/api/get-workflow-status/). + +## Path and HTTP methods + +```json +GET /_plugins/_flow_framework/workflow/state/_search +POST /_plugins/_flow_framework/workflow/state/_search +``` + +#### Example request: All workflows with a state of `NOT_STARTED` + +```json +GET /_plugins/_flow_framework/workflow/state/_search +{ + "query": { + "match": { + "state": "NOT_STARTED" + } + } +} +``` +{% include copy-curl.html %} + +#### Example request: All workflows that have a `resources_created` field with a `workflow_step_id` of `register_model_2` + +```json +GET /_plugins/_flow_framework/workflow/state/_search +{ + "query": { + "nested": { + "path": "resources_created", + "query": { + "bool": { + "must": [ + { + "match": { + "resources_created.workflow_step_id": "register_model_2" + } + } + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +#### Example response + +The response contains documents matching the search parameters. \ No newline at end of file diff --git a/_automating-configurations/api/search-workflow.md b/_automating-configurations/api/search-workflow.md new file mode 100644 index 0000000000..b78de9e9d2 --- /dev/null +++ b/_automating-configurations/api/search-workflow.md @@ -0,0 +1,47 @@ +--- +layout: default +title: Search for a workflow +parent: Workflow APIs +nav_order: 60 +--- + +# Search for a workflow + +You can retrieve created workflows with their `workflow_id` or search for workflows by using a query matching a field. You can use the `use_case` field to search for similar workflows. 
+ +## Path and HTTP methods + +```json +GET /_plugins/_flow_framework/workflow/_search +POST /_plugins/_flow_framework/workflow/_search +``` + +#### Example request: All created workflows + +```json +GET /_plugins/_flow_framework/workflow/_search +{ + "query": { + "match_all": {} + } +} +``` +{% include copy-curl.html %} + +#### Example request: All workflows with a `use_case` of `REMOTE_MODEL_DEPLOYMENT` + +```json +GET /_plugins/_flow_framework/workflow/_search +{ + "query": { + "match": { + "use_case": "REMOTE_MODEL_DEPLOYMENT" + } + } +} +``` +{% include copy-curl.html %} + +#### Example response + +OpenSearch responds with a list of workflow templates matching the search parameters. \ No newline at end of file diff --git a/_automating-configurations/index.md b/_automating-configurations/index.md new file mode 100644 index 0000000000..144ad445c8 --- /dev/null +++ b/_automating-configurations/index.md @@ -0,0 +1,46 @@ +--- +layout: default +title: Automating configurations +nav_order: 1 +has_children: false +nav_exclude: true +redirect_from: /automating-configurations/ +--- + +# Automating configurations +**Introduced 2.13** +{: .label .label-purple } + +You can automate complex OpenSearch setup and preprocessing tasks by providing templates for common use cases. For example, automating machine learning (ML) setup tasks streamlines the use of OpenSearch ML offerings. + +In OpenSearch 2.12, configuration automation is limited to ML tasks. +{: .info} + +OpenSearch use case templates provide a compact description of the setup process in a JSON or YAML document. These templates describe automated workflow configurations for conversational chat or query generation, AI connectors, tools, agents, and other components that prepare OpenSearch as a backend for generative models. For custom template examples, see [Sample templates](https://github.com/opensearch-project/flow-framework/tree/main/sample-templates). For OpenSearch-provided templates, see [Workflow templates]({{site.url}}{{site.baseurl}}/automating-configurations/workflow-templates/). + +## Key features + +Workflow automation provides the following benefits: + +* **Use case templates**: Get started with predefined templates that outline the setup process for your general use cases. +* **Customizable workflows**: Customize the workflow templates to your specific use case. +* **Setup automation**: Easily configure AI connectors, tools, agents, and other components in a single API call. + +## Overview + +**Templates** implement workflow automation in OpenSearch. You can provide these templates in JSON or YAML format. You can describe one or more templates with a sequence of steps required for a particular use case. Each template consists of the following elements: + +* **Metadata**: A name, description, use case category, template version, and OpenSearch version compatibility range. +* **User input**: Parameters expected from the user that are common to all automation steps across all workflows, such as an index name. +* **Workflows**: One or more workflows containing the following elements: + * **User input**: Parameters expected from the user that are specific to the steps in this workflow. + * **Workflow Steps**: The workflow steps described as a directed acyclic graph (DAG): + * ***Nodes*** describe steps of the process, which may be executed in parallel. For the syntax of workflow steps, see [Workflow steps]({{site.url}}{{site.baseurl}}/automating-configurations/workflow-steps/). 
+ * ***Edges*** sequence nodes to be executed after the previous step is complete and may use the output fields of previous steps. When a node includes a key in the `previous_node_input` map referring to a previous node’s workflow step, a corresponding edge is automatically added to the template during parsing and may be omitted for the sake of simplicity. + +## Next steps + +- For supported APIs, see [Workflow APIs]({{site.url}}{{site.baseurl}}/automating-configurations/api/index/). +- For the workflow step syntax, see [Workflow steps]({{site.url}}{{site.baseurl}}/automating-configurations/workflow-steps/). +- For a complete example, see [Workflow tutorial]({{site.url}}{{site.baseurl}}/automating-configurations/workflow-tutorial/). +- For configurable settings, see [Workflow settings]({{site.url}}{{site.baseurl}}/automating-configurations/workflow-settings/). diff --git a/_automating-configurations/workflow-settings.md b/_automating-configurations/workflow-settings.md new file mode 100644 index 0000000000..78762fdfbb --- /dev/null +++ b/_automating-configurations/workflow-settings.md @@ -0,0 +1,17 @@ +--- +layout: default +title: Workflow settings +nav_order: 30 +--- + +# Workflow settings + +The following keys represent configurable workflow settings. + +|Setting |Data type |Default value |Description | +|:--- |:--- |:--- |:--- | +|`plugins.flow_framework.enabled` |Boolean |`false` |Whether the Flow Framework API is enabled. | +|`plugins.flow_framework.max_workflows` |Integer |`1000` | The maximum number of workflows that you can create. When the limit is above 1,000, the number of existing workflows is defined as a lower bound for performance reasons, so the actual maximum may slightly exceed this value. | +|`plugins.flow_framework.max_workflow_steps` |Integer |`50` |The maximum number of steps a workflow can have. | +|`plugins.flow_framework.request_timeout` |Time units |`10s` |The default timeout for REST requests, which applies to internal search queries. | +|`plugins.flow_framework.task_request_retry_duration` |Time units |`5s` | When steps correspond to an API that produces a `task_id`, OpenSearch will retry them at this interval until completion. | diff --git a/_automating-configurations/workflow-steps.md b/_automating-configurations/workflow-steps.md new file mode 100644 index 0000000000..99c1f57993 --- /dev/null +++ b/_automating-configurations/workflow-steps.md @@ -0,0 +1,64 @@ +--- +layout: default +title: Workflow steps +nav_order: 10 +--- + +# Workflow steps + +_Workflow steps_ form basic "building blocks" for process automation. Most steps directly correspond to OpenSearch or plugin API operations, such as CRUD operations on machine learning (ML) connectors, models, and agents. Some steps simplify the configuration by reusing the body expected by these APIs across multiple steps. For example, once you configure a _tool_, you can use it with multiple _agents_. + +## Workflow step fields + +Workflow steps are actively being developed to expand automation capabilities. Workflow step (graph node) configuration includes the following fields. + +|Field |Data type |Required/Optional |Description | +|:--- |:--- |:--- |:--- | +|`id` |String |Required | A user-provided ID for the step. The ID must be unique within a given workflow and is useful for identifying resources created by the step. For example, a `register_agent` step may return an `agent_id` that has been registered. Using this ID, you can determine which step produced which resource. 
| +|`type` |String |Required |The type of action to take, such as `deploy_model`, which corresponds to the API for which the step is used. Multiple steps may share the same type but must each have their own unique ID. For a list of supported types, see [Workflow step types](#workflow-step-types). | +|`previous_node_inputs` |Object |Optional | A key-value map specifying user inputs that are produced by a previous step in the workflow. For each key-value pair, the key is the previous step's `id` and the value is an API body field name (such as `model_id`) that will be produced as an output of a previous step in the workflow. For example, `register_remote_model` (key) may produce a `model_id` (value) that is required for a subsequent `deploy_model` step.
A graph edge is automatically added to the workflow, with the previous step (the key) as the source and the current step as the destination.
In some cases, you can include [additional inputs](#additional-fields) in this field. | +|`user_inputs` |Object |Optional | A key-value map of inputs supported by the corresponding API for this specific step. Some inputs are required for an API, while others are optional. Required inputs may be specified here, if known, or in the `previous_node_inputs` field. The [Get Workflow Steps API]({{site.url}}{{site.baseurl}}/automating-configurations/api/get-workflow-steps/) identifies required inputs and step outputs.
Substitutions are supported in string values, lists of strings, and maps with string values. The pattern `{% raw %}${{previous_step_id.output_key}}{% endraw %}` is replaced with the value of the given key in the previous step's output. For example, if a parameter map in the user inputs includes a key `embedding_model_id` with a value `{% raw %}${{deploy_embedding_model.model_id}}{% endraw %}`, then the `model_id` output of the `deploy_embedding_model` step is substituted here. This performs a similar function to the `previous_node_inputs` map but is not validated and does not automatically infer edges.
In some cases, you can include [additional inputs](#additional-fields) in this field. | + +## Workflow step types + +The following table lists the workflow step types. The `user_inputs` fields for these steps correspond directly to the linked APIs. + +|Step type |Corresponding API |Description | +|--- |--- |--- | +|`noop` |No API | A no-operation (no-op) step that does nothing. It may be useful in some cases for synchronizing parallel steps. | +|`create_connector` |[Create Connector]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/connector-apis/create-connector/) |Creates a connector to a model hosted on a third-party platform. | +|`delete_connector` |[Delete Connector]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/connector-apis/delete-connector/) |Deletes a connector to a model hosted on a third-party platform. | +|`register_model_group` |[Register Model Group]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-group-apis/register-model-group/) |Registers a model group. The model group will be deleted automatically once no model is present in the group. | +|`register_remote_model` |[Register Model (remote)]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/register-model/#register-a-model-hosted-on-a-third-party-platform) |Registers a model hosted on a third-party platform. If the `user_inputs` field contains a `deploy` key that is set to `true`, also deploys the model. | +|`register_local_pretrained_model` |[Register Model (pretrained)]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/register-model/#register-a-pretrained-text-embedding-model) | Registers an OpenSearch-provided pretrained text embedding model that is hosted on your OpenSearch cluster. If the `user_inputs` field contains a `deploy` key that is set to `true`, also deploys the model. | +|`register_local_sparse_encoding_model` |[Register Model (sparse)]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/register-model/#register-a-pretrained-sparse-encoding-model) | Registers an OpenSearch-provided pretrained sparse encoding model that is hosted on your OpenSearch cluster. If the `user_inputs` field contains a `deploy` key that is set to `true`, also deploys the model. | +|`register_local_custom_model` |[Register Model (custom)]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/register-model/#register-a-custom-model) | Registers a custom model that is hosted on your OpenSearch cluster. If the `user_inputs` field contains a `deploy` key that is set to `true`, also deploys the model. | +|`delete_model` |[Delete Model]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/delete-model/) |Unregisters and deletes a model. | +|`deploy_model` |[Deploy Model]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/deploy-model/) |Deploys a registered model into memory. | +|`undeploy_model` |[Undeploy Model]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/undeploy-model/) |Undeploys a deployed model from memory. | +|`register_agent` |[Register Agent API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/) |Registers an agent as part of the ML Commons Agent Framework. | +|`delete_agent` |[Delete Agent API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/) |Deletes an agent. | +|`create_tool` |No API | A special-case non-API step encapsulating the specification of a tool for an agent in the ML Commons Agent Framework. These will be listed as `previous_node_inputs` for the appropriate register agent step, with the value set to `tools`. 
| +|`create_index`|[Create Index]({{site.url}}{{site.baseurl}}/api-reference/index-apis/create-index/) | Creates a new OpenSearch index. The inputs include `index_name`, which should be the name of the index to be created, and `configurations`, which contains the payload body of a regular REST request for creating an index. +|`create_ingest_pipeline`|[Create Ingest Pipeline]({{site.url}}{{site.baseurl}}/ingest-pipelines/create-ingest/) | Creates or updates an ingest pipeline. The inputs include `pipeline_id`, which should be the ID of the pipeline, and `configurations`, which contains the payload body of a regular REST request for creating an ingest pipeline. +|`create_search_pipeline`|[Create Search Pipeline]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/creating-search-pipeline/) | Creates or updates a search pipeline. The inputs include `pipeline_id`, which should be the ID of the pipeline, and `configurations`, which contains the payload body of a regular REST request for creating a search pipeline. + +## Additional fields + +You can include the following additional fields in the `user_inputs` field when indicated. + +|Field |Data type |Description | +|--- |--- |--- | +|`node_timeout` |Time units |A user-provided timeout for this step. For example, `20s` for a 20-second timeout. | +|`deploy` |Boolean |Applicable to the Register Model step type. If set to `true`, also executes the Deploy Model step. | +|`tools_order` |List |Applicable only to the Register Agent step type. Specifies the ordering of `tools`. For example, specify `["foo_tool", "bar_tool"]` to sequence those tools in that order. | + +You can include the following additional fields in the `previous_node_inputs` field when indicated. + +|Field |Data type |Description | +|--- |--- |--- | +|`model_id` |String |The `model_id` is used as an input for several steps. As a special case for the Register Agent step type, if an `llm.model_id` field is not present in the `user_inputs` and not present in `previous_node_inputs`, the `model_id` field from the previous node may be used as a backup for the model ID. | + +## Example workflow steps + +For example workflow step implementations, see the [Workflow tutorial]({{site.url}}{{site.baseurl}}/automating-configurations/workflow-tutorial/). \ No newline at end of file diff --git a/_automating-configurations/workflow-templates.md b/_automating-configurations/workflow-templates.md new file mode 100644 index 0000000000..1133148c8f --- /dev/null +++ b/_automating-configurations/workflow-templates.md @@ -0,0 +1,142 @@ +--- +layout: default +title: Workflow templates +nav_order: 25 +--- + +# Workflow templates + +OpenSearch provides several workflow templates for some common machine learning (ML) use cases. Using a template simplifies complex setups and provides many default values for use cases like semantic or conversational search. + +You can specify a workflow template when you call the [Create Workflow API]({{site.url}}{{site.baseurl}}/automating-configurations/api/create-workflow/): + +- To use an OpenSearch-provided workflow template, specify the template use case as the `use_case` query parameter (see the [Example](#example)). For a list of OpenSearch-provided templates, see [Supported workflow templates](#supported-workflow-templates). + +- To use a custom workflow template, provide the complete template in the request body. 
For an example of a custom template, see [an example JSON template]({{site.url}}{{site.baseurl}}/automating-configurations/api/create-workflow/#example-request-register-and-deploy-a-remote-model-json) or [an example YAML template]({{site.url}}{{site.baseurl}}/automating-configurations/api/create-workflow/#example-request-register-and-deploy-an-externally-hosted-model-yaml). + +To provision the workflow, specify `provision=true` as a query parameter. + +## Example + +In this example, you'll configure the `semantic_search_with_cohere_embedding_query_enricher` workflow template. The workflow created using this template performs the following configuration steps: + +- Deploys an externally hosted Cohere model +- Creates an ingest pipeline using the model +- Creates a sample k-NN index and configures a search pipeline to define the default model ID for that index + +### Step 1: Create and provision the workflow + +Send the following request to create and provision a workflow using the `semantic_search_with_cohere_embedding_query_enricher` workflow template. The only required request body field for this template is the API key for the Cohere Embed model: + +```json +POST /_plugins/_flow_framework/workflow?use_case=semantic_search_with_cohere_embedding_query_enricher&provision=true +{ + "create_connector.credential.key" : "" +} +``` +{% include copy-curl.html %} + +OpenSearch responds with a workflow ID for the created workflow: + +```json +{ + "workflow_id" : "8xL8bowB8y25Tqfenm50" +} +``` + +The workflow in the previous step creates a default k-NN index. The default index name is `my-nlp-index`: + +```json +{ + "create_index.name": "my-nlp-index" +} +``` + +For all default parameter values for this workflow template, see [Cohere Embed semantic search defaults](https://github.com/opensearch-project/flow-framework/blob/2.13/src/main/resources/defaults/cohere-embedding-semantic-search-defaults.json). + +### Step 2: Ingest documents into the index + +To ingest documents into the index created in the previous step, send the following request: + +```json +PUT /my-nlp-index/_doc/1 +{ + "passage_text": "Hello world", + "id": "s1" +} +``` +{% include copy-curl.html %} + +### Step 3: Perform vector search + +To perform a vector search on your index, use a [`neural` query]({{site.url}}{{site.baseurl}}/query-dsl/specialized/neural/) clause: + +```json +GET /my-nlp-index/_search +{ + "_source": { + "excludes": [ + "passage_embedding" + ] + }, + "query": { + "neural": { + "passage_embedding": { + "query_text": "Hi world", + "k": 100 + } + } + } +} +``` +{% include copy-curl.html %} + +## Parameters + +Each workflow template has a defined schema and a set of APIs with predefined default values for each step. For more information about template parameter defaults, see [Supported workflow templates](#supported-workflow-templates). + +### Overriding default values + +To override a template's default values, provide the new values in the request body when sending a create workflow request. 
For example, the following request changes the Cohere model, the name of the `text_embedding` processor output field, and the name of the sparse index of the `semantic_search_with_cohere_embedding` template: + +```json +POST /_plugins/_flow_framework/workflow?use_case=semantic_search_with_cohere_embedding +{ + "create_connector.model" : "embed-multilingual-v3.0", + "text_embedding.field_map.output": "book_embedding", + "create_index.name": "sparse-book-index" +} +``` +{% include copy-curl.html %} + +## Viewing workflow resources + +The workflow you created provisioned all the necessary resources for semantic search. To view the provisioned resources, call the [Get Workflow Status API]({{site.url}}{{site.baseurl}}/automating-configurations/api/get-workflow-status/) and provide the `workflowID` for your workflow: + +```json +GET /_plugins/_flow_framework/workflow/8xL8bowB8y25Tqfenm50/_status +``` +{% include copy-curl.html %} + +## Supported workflow templates + +The following table lists the supported workflow templates. To use a workflow template, specify it in the `use_case` query parameter when creating a workflow. + +| Template use case | Description | Required parameters | Defaults | +| `bedrock_titan_embedding_model_deploy` | Creates and deploys an Amazon Bedrock embedding model (by default, `titan-embed-text-v1`).| `create_connector.credential.access_key`, `create_connector.credential.secret_key`, `create_connector.credential.session_token` |[Defaults](https://github.com/opensearch-project/flow-framework/blob/2.13/src/main/resources/defaults/bedrock-titan-embedding-defaults.json)| +| `bedrock_titan_multimodal_model_deploy` | Creates and deploys an Amazon Bedrock multimodal embedding model (by default, `titan-embed-image-v1`). | `create_connector.credential.access_key`, `create_connector.credential.secret_key`, `create_connector.credential.session_token` |[Defaults](https://github.com/opensearch-project/flow-framework/blob/2.13/src/main/resources/defaults/bedrock-titan-multimodal-defaults.json). | +| `cohere_embedding_model_deploy`| Creates and deploys a Cohere embedding model (by default, `embed-english-v3.0`). | `create_connector.credential.key` |[Defaults](https://github.com/opensearch-project/flow-framework/blob/2.13/src/main/resources/defaults/cohere-embedding-defaults.json) | +| `cohere_chat_model_deploy` | Creates and deploys a Cohere chat model (by default, Cohere Command). | `create_connector.credential.key` |[Defaults](https://github.com/opensearch-project/flow-framework/blob/2.13/src/main/resources/defaults/cohere-chat-defaults.json) | +| `open_ai_embedding_model_deploy` | Creates and deploys an OpenAI embedding model (by default, `text-embedding-ada-002`). | `create_connector.credential.key` |[Defaults](https://github.com/opensearch-project/flow-framework/blob/2.13/src/main/resources/defaults/openai-embedding-defaults.json) | +| `openai_chat_model_deploy` | Creates and deploys an OpenAI chat model (by default, `gpt-3.5-turbo`). | `create_connector.credential.key` |[Defaults](https://github.com/opensearch-project/flow-framework/blob/2.13/src/main/resources/defaults/openai-chat-defaults.json) | +| `local_neural_sparse_search_bi_encoder` | Configures [neural sparse search]({{site.url}}{{site.baseurl}}/search-plugins/neural-sparse-search/):
- Deploys a pretrained sparse encoding model.
- Creates an ingest pipeline with a sparse encoding processor.
- Creates a sample index to use for sparse search, specifying the newly created pipeline as the default pipeline. | None |[Defaults](https://github.com/opensearch-project/flow-framework/blob/2.13/src/main/resources/defaults/local-sparse-search-biencoder-defaults.json) | +| `semantic_search` | Configures [semantic search]({{site.url}}{{site.baseurl}}/search-plugins/semantic-search/):
- Creates an ingest pipeline with a `text_embedding` processor and a k-NN index.
You must provide the model ID of the text embedding model to be used. | `create_ingest_pipeline.model_id` |[Defaults](https://github.com/opensearch-project/flow-framework/blob/2.13/src/main/resources/defaults/semantic-search-defaults.json) | +| `semantic_search_with_query_enricher` | Configures [semantic search]({{site.url}}{{site.baseurl}}/search-plugins/semantic-search/) similarly to the `semantic_search` template. Adds a [`query_enricher`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/neural-query-enricher/) search processor that sets a default model ID for neural queries. You must provide the model ID of the text embedding model to be used. | `create_ingest_pipeline.model_id` |[Defaults](https://github.com/opensearch-project/flow-framework/blob/2.13/src/main/resources/defaults/semantic-search-query-enricher-defaults.json) | +| `semantic_search_with_cohere_embedding` | Configures [semantic search]({{site.url}}{{site.baseurl}}/search-plugins/semantic-search/) and deploys a Cohere embedding model. You must provide the API key for the Cohere model. | `create_connector.credential.key` |[Defaults](https://github.com/opensearch-project/flow-framework/blob/2.13/src/main/resources/defaults/cohere-embedding-semantic-search-defaults.json) | +| `semantic_search_with_cohere_embedding_query_enricher` | Configures [semantic search]({{site.url}}{{site.baseurl}}/search-plugins/semantic-search/) and deploys a Cohere embedding model. Adds a [`query_enricher`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/neural-query-enricher/) search processor that sets a default model ID for neural queries. You must provide the API key for the Cohere model. | `create_connector.credential.key` |[Defaults](https://github.com/opensearch-project/flow-framework/blob/2.13/src/main/resources/defaults/cohere-embedding-semantic-search-with-query-enricher-defaults.json) | +| `multimodal_search` | Configures an ingest pipeline with a `text_image_embedding` processor and a k-NN index for [multimodal search]({{site.url}}{{site.baseurl}}/search-plugins/multimodal-search/). You must provide the model ID of the multimodal embedding model to be used. | `create_ingest_pipeline.model_id` |[Defaults](https://github.com/opensearch-project/flow-framework/blob/2.13/src/main/resources/defaults/multi-modal-search-defaults.json) | +| `multimodal_search_with_bedrock_titan` | Deploys an Amazon Bedrock multimodal model and configures an ingest pipeline with a `text_image_embedding` processor and a k-NN index for [multimodal search]({{site.url}}{{site.baseurl}}/search-plugins/multimodal-search/). You must provide your AWS credentials. | `create_connector.credential.access_key`, `create_connector.credential.secret_key`, `create_connector.credential.session_token` |[Defaults](https://github.com/opensearch-project/flow-framework/blob/2.13/src/main/resources/defaults/multimodal-search-bedrock-titan-defaults.json) | +| `hybrid_search` | Configures [hybrid search]({{site.url}}{{site.baseurl}}/search-plugins/hybrid-search/):
- Creates an ingest pipeline, a k-NN index, and a search pipeline with a `normalization_processor`. You must provide the model ID of the text embedding model to be used. | `create_ingest_pipeline.model_id` |[Defaults](https://github.com/opensearch-project/flow-framework/blob/2.13/src/main/resources/defaults/hybrid-search-defaults.json) | +| `conversational_search_with_llm_deploy` | Deploys a large language model (LLM) (by default, Cohere Chat) and configures a search pipeline with a `retrieval_augmented_generation` processor for [conversational search]({{site.url}}{{site.baseurl}}/search-plugins/conversational-search/). | `create_connector.credential.key` |[Defaults](https://github.com/opensearch-project/flow-framework/blob/2.13/src/main/resources/defaults/conversational-search-defaults.json) | + + diff --git a/_automating-configurations/workflow-tutorial.md b/_automating-configurations/workflow-tutorial.md new file mode 100644 index 0000000000..0074ad4691 --- /dev/null +++ b/_automating-configurations/workflow-tutorial.md @@ -0,0 +1,620 @@ +--- +layout: default +title: Workflow tutorial +nav_order: 20 +--- + +# Workflow tutorial + +You can automate the setup of common use cases, such as conversational chat, using a Chain-of-Thought (CoT) agent. An _agent_ orchestrates and runs ML models and tools. A _tool_ performs a set of specific tasks. This page presents a complete example of setting up a CoT agent. For more information about agents and tools, see [Agents and tools]({{site.url}}{{site.baseurl}}/ml-commons-plugin/) + +The setup requires the following sequence of API requests, with provisioned resources used in subsequent requests. The following list provides an overview of the steps required for this workflow. The step names correspond to the names in the template: + +1. **Deploy a model on the cluster** + * [`create_connector_1`](#create_connector_1): Create a connector to an externally hosted model. + * [`register_model_2`](#register_model_2): Register a model using the connector that you created. + * [`deploy_model_3`](#deploy_model_3): Deploy the model. +1. **Use the deployed model for inference** + * Set up several tools that perform specific tasks: + * [`cat_index_tool`](#cat_index_tool): Set up a tool to obtain index information. + * [`ml_model_tool`](#ml_model_tool): Set up a machine learning (ML) model tool. + * Set up one or more agents that use some combination of the tools: + * [`sub_agent`](#sub_agent): Create an agent that uses the `cat_index_tool`. + * Set up tools representing these agents: + * [`agent_tool`](#agent_tool): Wrap the `sub_agent` so that you can use it as a tool. + * [`root_agent`](#root_agent): Set up a root agent that may delegate the task to either a tool or another agent. + +The following sections describe the steps in detail. For the complete workflow template, see [Complete YAML workflow template](#complete-yaml-workflow-template). + +## Workflow graph + +The workflow described in the previous section is organized into a [template](#complete-yaml-workflow-template). Note that you can order the steps in several ways. In the example template, the `ml_model_tool` step is specified right before the `root_agent` step, but you can specify it at any point after the `deploy_model_3` step and before the `root_agent` step. The following diagram shows the directed acyclic graph (DAG) that OpenSearch creates for all of the steps in the order specified in the template. 
+ +![Example workflow steps graph]({{site.url}}{{site.baseurl}}/images/automatic-workflow-dag.png){:style="width: 100%; max-width: 600px;" class="img-centered"} + +## 1. Deploy a model on the cluster + +To deploy a model on the cluster, you need to create a connector to the model, register the model, and deploy the model. + + +### create_connector_1 + + +The first step in the workflow is to create a connector to an externally hosted model (in the following example, this step is called `create_connector_1`). The content of the `user_inputs` field exactly matches the ML Commons [Create Connector API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/connector-apis/create-connector/): + +```yaml +nodes: +- id: create_connector_1 + type: create_connector + user_inputs: + name: OpenAI Chat Connector + description: The connector to public OpenAI model service for GPT 3.5 + version: '1' + protocol: http + parameters: + endpoint: api.openai.com + model: gpt-3.5-turbo + credential: + openAI_key: '12345' + actions: + - action_type: predict + method: POST + url: https://${parameters.endpoint}/v1/chat/completions +``` + +When you create a connector, OpenSearch returns a `connector_id`, which you need in order to register the model. + + +### register_model_2 + + +When registering a model, the `previous_node_inputs` field tells OpenSearch to obtain the required `connector_id` from the output of the `create_connector_1` step. Other inputs required by the [Register Model API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/register-model/) are included in the `user_inputs` field: + +```yaml +- id: register_model_2 + type: register_remote_model + previous_node_inputs: + create_connector_1: connector_id + user_inputs: + name: openAI-gpt-3.5-turbo + function_name: remote + description: test model +``` + +The output of this step is a `model_id`. You must then deploy the registered model to the cluster. + + +### deploy_model_3 + + +The [Deploy Model API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/deploy-model/) requires the `model_id` from the previous step, as specified in the `previous_node_inputs` field: + +```yaml +- id: deploy_model_3 + type: deploy_model + # This step needs the model_id produced as an output of the previous step + previous_node_inputs: + register_model_2: model_id +``` + +When using the Deploy Model API directly, a task ID is returned, requiring use of the [Tasks API](https://opensearch.org/docs/latest/ml-commons-plugin/api/tasks-apis/get-task/) to determine when the deployment is complete. The automated workflow eliminates the manual status check and returns the final `model_id` directly. + +### Ordering steps + +To order these steps in a sequence, you must connect them by an edge in the graph. When a `previous_node_input` field is present in a step, OpenSearch automatically creates a node with `source` and `dest` fields for this step. The output of the `source` is required as input for the `dest`. For example, the `register_model_2` step requires the `connector_id` from the `create_connector_1` step. Similarly, the `deploy_model_3` step requires the `model_id` from the `register_model_2` step. 
Thus, OpenSearch creates the first two edges in the graph as follows in order to match the output with the required input and raise errors if the required input is missing: + +```yaml +edges: +- source: create_connector_1 + dest: register_model_2 +- source: register_model_2 + dest: deploy_model_3 +``` + +If you define `previous_node_inputs`, then defining edges is optional. +{: .note} + +## 2. Use the deployed model for inference + +A CoT agent can use the deployed model in a tool. This step doesn’t strictly correspond to an API but represents a component of the body required by the [Register Agent API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/). This simplifies the register request and allows reuse of the same tool in multiple agents. For more information about agents and tools, see [Agents and tools]({{site.url}}{{site.baseurl}}/ml-commons-plugin/). + + +### cat_index_tool + + +You can configure other tools to be used by the CoT agent. For example, you can configure a `cat_index_tool` as follows. This tool does not depend on any previous steps: + +```yaml +- id: cat_index_tool + type: create_tool + user_inputs: + name: CatIndexTool + type: CatIndexTool + parameters: + max_iteration: 5 +``` + + +### sub_agent + + +To use the `cat_index_tool` in the agent configuration, specify it as one of the tools in the `previous_node_inputs` field of the agent. You can add other tools to `previous_node_inputs` as necessary. The agent also needs a large language model (LLM) in order to reason with the tools. The LLM is defined by the `llm.model_id` field. This example assumes that the `model_id` from the `deploy_model_3` step will be used. However, if another model is already deployed, the `model_id` of that previously deployed model could be included in the `user_inputs` field instead: + +```yaml +- id: sub_agent + type: register_agent + previous_node_inputs: + # When llm.model_id is not present this can be used as a fallback value + deploy-model-3: model_id + cat_index_tool: tools + user_inputs: + name: Sub Agent + type: conversational + description: this is a test agent + parameters: + hello: world + llm.parameters: + max_iteration: '5' + stop_when_no_tool_found: 'true' + memory: + type: conversation_index + app_type: chatbot +``` + +OpenSearch will automatically create the following edges so that the agent can retrieve the fields from the previous node: + +```yaml +- source: cat_index_tool + dest: sub_agent +- source: deploy_model_3 + dest: sub_agent +``` + + +### agent_tool + + +You can use an agent as a tool for another agent. Registering an agent produces an `agent_id` in the output. The following step defines a tool that uses the `agent_id` from the previous step: + +```yaml +- id: agent_tool + type: create_tool + previous_node_inputs: + sub_agent: agent_id + user_inputs: + name: AgentTool + type: AgentTool + description: Agent Tool + parameters: + max_iteration: 5 +``` + +OpenSearch automatically creates an edge connection because this step specifies the `previous_node_input`: + +```yaml +- source: sub_agent + dest: agent_tool +``` + + +### ml_model_tool + + +A tool may reference an ML model. This example gets the required `model_id` from the model deployed in a previous step: + +```yaml +- id: ml_model_tool + type: create_tool + previous_node_inputs: + deploy-model-3: model_id + user_inputs: + name: MLModelTool + type: MLModelTool + alias: language_model_tool + description: A general tool to answer any question. + parameters: + prompt: Answer the question as best you can. 
+ response_filter: choices[0].message.content +``` + +OpenSearch automatically creates an edge in order to use the `previous_node_input`: + +```yaml +- source: deploy-model-3 + dest: ml_model_tool +``` + + +### root_agent + + +A conversational chat application will communicate with a single root agent that includes the ML model tool and the agent tool in its `tools` field. It will also obtain the `llm.model_id` from the deployed model. Some agents require tools to be in a specific order, which can be enforced by including the `tools_order` field in the user inputs: + +```yaml +- id: root_agent + type: register_agent + previous_node_inputs: + deploy-model-3: model_id + ml_model_tool: tools + agent_tool: tools + user_inputs: + name: DEMO-Test_Agent_For_CoT + type: conversational + description: this is a test agent + parameters: + prompt: Answer the question as best you can. + llm.parameters: + max_iteration: '5' + stop_when_no_tool_found: 'true' + tools_order: ['agent_tool', 'ml_model_tool'] + memory: + type: conversation_index + app_type: chatbot +``` + +OpenSearch automatically creates edges for the `previous_node_input` sources: + +```yaml +- source: deploy-model-3 + dest: root_agent +- source: ml_model_tool + dest: root_agent +- source: agent_tool + dest: root_agent +``` + +For the complete DAG that OpenSearch creates for this workflow, see the [workflow graph](#workflow-graph). + +## Complete YAML workflow template + +The following is the final template including all of the `provision` workflow steps in YAML format: + +
+ + YAML template + + {: .text-delta} + +```yaml +# This template demonstrates provisioning the resources for a +# Chain-of-Thought chat bot +name: tool-register-agent +description: test case +use_case: REGISTER_AGENT +version: + template: 1.0.0 + compatibility: + - 2.12.0 + - 3.0.0 +workflows: + # This workflow defines the actions to be taken when the Provision Workflow API is used + provision: + nodes: + # The first three nodes create a connector to a remote model, registers and deploy that model + - id: create_connector_1 + type: create_connector + user_inputs: + name: OpenAI Chat Connector + description: The connector to public OpenAI model service for GPT 3.5 + version: '1' + protocol: http + parameters: + endpoint: api.openai.com + model: gpt-3.5-turbo + credential: + openAI_key: '12345' + actions: + - action_type: predict + method: POST + url: https://${parameters.endpoint}/v1/chat/completions + - id: register_model_2 + type: register_remote_model + previous_node_inputs: + create_connector_1: connector_id + user_inputs: + # deploy: true could be added here instead of the deploy step below + name: openAI-gpt-3.5-turbo + description: test model + - id: deploy_model_3 + type: deploy_model + previous_node_inputs: + register_model_2: model_id + # For example purposes, the model_id obtained as the output of the deploy_model_3 step will be used + # for several below steps. However, any other deployed model_id can be used for those steps. + # This is one example tool from the Agent Framework. + - id: cat_index_tool + type: create_tool + user_inputs: + name: CatIndexTool + type: CatIndexTool + parameters: + max_iteration: 5 + # This simple agent only has one tool, but could be configured with many tools + - id: sub_agent + type: register_agent + previous_node_inputs: + deploy-model-3: model_id + cat_index_tool: tools + user_inputs: + name: Sub Agent + type: conversational + parameters: + hello: world + llm.parameters: + max_iteration: '5' + stop_when_no_tool_found: 'true' + memory: + type: conversation_index + app_type: chatbot + # An agent can be used itself as a tool in a nested relationship + - id: agent_tool + type: create_tool + previous_node_inputs: + sub_agent: agent_id + user_inputs: + name: AgentTool + type: AgentTool + parameters: + max_iteration: 5 + # An ML Model can be used as a tool + - id: ml_model_tool + type: create_tool + previous_node_inputs: + deploy-model-3: model_id + user_inputs: + name: MLModelTool + type: MLModelTool + alias: language_model_tool + parameters: + prompt: Answer the question as best you can. + response_filter: choices[0].message.content + # This final agent will be the interface for the CoT chat user + # Using a flow agent type tools_order matters + - id: root_agent + type: register_agent + previous_node_inputs: + deploy-model-3: model_id + ml_model_tool: tools + agent_tool: tools + user_inputs: + name: DEMO-Test_Agent + type: flow + parameters: + prompt: Answer the question as best you can. 
+ llm.parameters: + max_iteration: '5' + stop_when_no_tool_found: 'true' + tools_order: ['agent_tool', 'ml_model_tool'] + memory: + type: conversation_index + app_type: chatbot + # These edges are all automatically created with previous_node_input + edges: + - source: create_connector_1 + dest: register_model_2 + - source: register_model_2 + dest: deploy_model_3 + - source: cat_index_tool + dest: sub_agent + - source: deploy_model_3 + dest: sub_agent + - source: sub_agent + dest: agent_tool + - source: deploy-model-3 + dest: ml_model_tool + - source: deploy-model-3 + dest: root_agent + - source: ml_model_tool + dest: root_agent + - source: agent_tool + dest: root_agent +``` +
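If you want to try the template, you can submit it to the Flow Framework Create Workflow API and then provision it. The following is a minimal sketch, assuming the YAML template above is saved locally as `cot-agent-template.yaml` (a hypothetical file name), that the Create Workflow API accepts a YAML body when `Content-Type: application/yaml` is set, and that the cluster runs locally with default security settings. `<admin-password>` and `<workflow_id>` are placeholders:

```bash
# Create the workflow from the YAML template; the response contains a workflow_id.
curl -k -u admin:<admin-password> \
  -H "Content-Type: application/yaml" \
  -XPOST "https://localhost:9200/_plugins/_flow_framework/workflow" \
  --data-binary "@cot-agent-template.yaml"

# Provision the resources defined in the template, replacing <workflow_id>
# with the ID returned by the previous request.
curl -k -u admin:<admin-password> \
  -XPOST "https://localhost:9200/_plugins/_flow_framework/workflow/<workflow_id>/_provision"
```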
+ +## Complete JSON workflow template + +The following is the same template in JSON format: + +
+ + JSON template + + {: .text-delta} + +```json +{ + "name": "tool-register-agent", + "description": "test case", + "use_case": "REGISTER_AGENT", + "version": { + "template": "1.0.0", + "compatibility": [ + "2.12.0", + "3.0.0" + ] + }, + "workflows": { + "provision": { + "nodes": [ + { + "id": "create_connector_1", + "type": "create_connector", + "user_inputs": { + "name": "OpenAI Chat Connector", + "description": "The connector to public OpenAI model service for GPT 3.5", + "version": "1", + "protocol": "http", + "parameters": { + "endpoint": "api.openai.com", + "model": "gpt-3.5-turbo" + }, + "credential": { + "openAI_key": "12345" + }, + "actions": [ + { + "action_type": "predict", + "method": "POST", + "url": "https://${parameters.endpoint}/v1/chat/completions" + } + ] + } + }, + { + "id": "register_model_2", + "type": "register_remote_model", + "previous_node_inputs": { + "create_connector_1": "connector_id" + }, + "user_inputs": { + "name": "openAI-gpt-3.5-turbo", + "description": "test model" + } + }, + { + "id": "deploy_model_3", + "type": "deploy_model", + "previous_node_inputs": { + "register_model_2": "model_id" + } + }, + { + "id": "cat_index_tool", + "type": "create_tool", + "user_inputs": { + "name": "CatIndexTool", + "type": "CatIndexTool", + "parameters": { + "max_iteration": 5 + } + } + }, + { + "id": "sub_agent", + "type": "register_agent", + "previous_node_inputs": { + "deploy-model-3": "llm.model_id", + "cat_index_tool": "tools" + }, + "user_inputs": { + "name": "Sub Agent", + "type": "conversational", + "parameters": { + "hello": "world" + }, + "llm.parameters": { + "max_iteration": "5", + "stop_when_no_tool_found": "true" + }, + "memory": { + "type": "conversation_index" + }, + "app_type": "chatbot" + } + }, + { + "id": "agent_tool", + "type": "create_tool", + "previous_node_inputs": { + "sub_agent": "agent_id" + }, + "user_inputs": { + "name": "AgentTool", + "type": "AgentTool", + "parameters": { + "max_iteration": 5 + } + } + }, + { + "id": "ml_model_tool", + "type": "create_tool", + "previous_node_inputs": { + "deploy-model-3": "model_id" + }, + "user_inputs": { + "name": "MLModelTool", + "type": "MLModelTool", + "alias": "language_model_tool", + "parameters": { + "prompt": "Answer the question as best you can.", + "response_filter": "choices[0].message.content" + } + } + }, + { + "id": "root_agent", + "type": "register_agent", + "previous_node_inputs": { + "deploy-model-3": "llm.model_id", + "ml_model_tool": "tools", + "agent_tool": "tools" + }, + "user_inputs": { + "name": "DEMO-Test_Agent", + "type": "flow", + "parameters": { + "prompt": "Answer the question as best you can." + }, + "llm.parameters": { + "max_iteration": "5", + "stop_when_no_tool_found": "true" + }, + "tools_order": [ + "agent_tool", + "ml_model_tool" + ], + "memory": { + "type": "conversation_index" + }, + "app_type": "chatbot" + } + } + ], + "edges": [ + { + "source": "create_connector_1", + "dest": "register_model_2" + }, + { + "source": "register_model_2", + "dest": "deploy_model_3" + }, + { + "source": "cat_index_tool", + "dest": "sub_agent" + }, + { + "source": "deploy_model_3", + "dest": "sub_agent" + }, + { + "source": "sub_agent", + "dest": "agent_tool" + }, + { + "source": "deploy-model-3", + "dest": "ml_model_tool" + }, + { + "source": "deploy-model-3", + "dest": "root_agent" + }, + { + "source": "ml_model_tool", + "dest": "root_agent" + }, + { + "source": "agent_tool", + "dest": "root_agent" + } + ] + } + } +} +``` +
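After provisioning, you can check the workflow status to confirm that the resources were created and to retrieve their IDs, including the `agent_id` produced by the `root_agent` step. The following is a rough sketch, assuming default local security settings; `<admin-password>`, `<workflow_id>`, and `<root_agent_id>` are placeholders:

```bash
# Check provisioning progress. When the state is COMPLETED, the resources_created
# array in the response lists the IDs of the connector, model, tools, and agents.
curl -k -u admin:<admin-password> \
  -XGET "https://localhost:9200/_plugins/_flow_framework/workflow/<workflow_id>/_status"

# Send a question to the provisioned root agent using the ML Commons Execute Agent API,
# substituting the agent_id reported for the root_agent step.
curl -k -u admin:<admin-password> \
  -H "Content-Type: application/json" \
  -XPOST "https://localhost:9200/_plugins/_ml/agents/<root_agent_id>/_execute" \
  -d '{"parameters": {"question": "Which index in my cluster has the most documents?"}}'
```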
+ +## Next steps + +To learn more about agents and tools, see [Agents and tools]({{site.url}}{{site.baseurl}}/ml-commons-plugin/). \ No newline at end of file diff --git a/_benchmark/index.md b/_benchmark/index.md index 5b0d3fb463..25b3738e7d 100644 --- a/_benchmark/index.md +++ b/_benchmark/index.md @@ -18,7 +18,7 @@ OpenSearch Benchmark is a macrobenchmark utility provided by the [OpenSearch Pro - Informing decisions about when to upgrade your cluster to a new version. - Determining how changes to your workflow---such as modifying mappings or queries---might impact your cluster. -OpenSearch Benchmark can be installed directly on a compatible host running Linux and macOS. You can also run OpenSearch Benchmark in a Docker container. See [Installing OpenSearch Benchmark]({{site.url}}{{site.baseurl}}/benchmark/installing-benchmark/) for more information. +OpenSearch Benchmark can be installed directly on a compatible host running Linux or macOS. You can also run OpenSearch Benchmark in a Docker container. See [Installing OpenSearch Benchmark]({{site.url}}{{site.baseurl}}/benchmark/installing-benchmark/) for more information. The following diagram visualizes how OpenSearch Benchmark works when run against a local host: diff --git a/_benchmark/quickstart.md b/_benchmark/quickstart.md index 52415cb608..0c23f74953 100644 --- a/_benchmark/quickstart.md +++ b/_benchmark/quickstart.md @@ -31,7 +31,7 @@ After installation, you can verify OpenSearch is running by going to `localhost: Use the following command to verify OpenSearch is running with SSL certificate checks disabled: ```bash -curl -k -u admin:admin https://localhost:9200 # the "-k" option skips SSL certificate checks +curl -k -u admin: https://localhost:9200 # the "-k" option skips SSL certificate checks { "name" : "147ddae31bf8.opensearch.org", diff --git a/_benchmark/reference/commands/command-flags.md b/_benchmark/reference/commands/command-flags.md index e4de26b483..ca0606f07f 100644 --- a/_benchmark/reference/commands/command-flags.md +++ b/_benchmark/reference/commands/command-flags.md @@ -290,4 +290,40 @@ The name and path used for the chart's output. Default is `stdout`. ## limit -Limits the number of search results for recent test runs. Default is `10`. \ No newline at end of file +Limits the number of search results for recent test runs. Default is `10`. + + +## latency-percentiles + + +Specifies a comma-separated list of latency percentiles to report after the workload runs. Accepts `ints` or `floats` with values between `0` and `100` inclusive. Does not accept `min`, `median`, `mean`, or `max`. Default is `50,90,99,99.9,99.99,100`. + + +## throughput-percentiles + + +Specifies a list of throughput percentiles to report after the workload runs, in addition to min/median/mean/max which is always displayed. Like `--latency-percentiles`, the setting accepts `ints` or `floats` with values between `0` and `100` inclusive. Does not accept `min`, `median`, `mean`, or `max`. Default is `None`. + + +## randomization-enabled + + +Enables randomization of values in range queries, where the values are drawn from standard value functions registered with `register_standard_value_source` in the workload's `workload.py` file. + +A standard value function is a no-argument function that generates a random pair of values for a certain field, in a dict with keys `"gte"`, `"lte"`, and optionally `"format"`. 
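For example, randomization can be combined with custom latency percentiles as follows. This is a sketch only: the workload, host, and credentials are placeholders, and the workload is assumed to register standard value sources in its `workload.py` file:

```bash
opensearch-benchmark execute-test \
  --pipeline=benchmark-only \
  --workload=nyc_taxis \
  --target-host=https://localhost:9200 \
  --client-options=basic_auth_user:admin,basic_auth_password:admin,verify_certs:false \
  --latency-percentiles=50,90,99,99.9 \
  --randomization-enabled
```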
+ +If this argument is `True` but a search operation does not have a registered standard value function, OpenSearch Benchmark raises a `SystemSetupError`. + +Default is `False`. + + +## randomization-repeat-frequency + + +Sets what fraction of randomized query values can be repeated. Takes values between `0.0` and `1.0`. Default is `0.3`. This setting does not work when `--randomization-enabled` is not used. + + +## randomization-n + + +Sets how many distinct repeatable pair values are generated for each operation when randomization is used. Default is `5000`. This setting does not work when `--randomization-enabled` is not used. diff --git a/_benchmark/reference/workloads/index.md b/_benchmark/reference/workloads/index.md index 655ada92d9..1dd609cacb 100644 --- a/_benchmark/reference/workloads/index.md +++ b/_benchmark/reference/workloads/index.md @@ -16,7 +16,7 @@ A workload is a specification of one or more benchmarking scenarios. A workload This section provides a list of options and examples you can use when customizing or using a workload. -For more information about what comprises a workload, see [Anatomy of a workload]({{site.url}}{{site.baseurl}}/benchmark/user-guide/concepts#anatomy-of-a-workload). +For more information about what comprises a workload, see [Anatomy of a workload](({{site.url}}{{site.baseurl}}/benchmark/understanding-workloads/anatomy-of-a-workload/). ## Workload examples diff --git a/_benchmark/user-guide/concepts.md b/_benchmark/user-guide/concepts.md index ade9fe53b6..b353538a4a 100644 --- a/_benchmark/user-guide/concepts.md +++ b/_benchmark/user-guide/concepts.md @@ -11,7 +11,7 @@ Before using OpenSearch Benchmark, familiarize yourself with the following conce ## Core concepts and definitions -- **Workload**: The description of one or more benchmarking scenarios that use a specific document corpus to perform a benchmark against your cluster. The document corpus contains any indexes, data files, and operations invoked when the workflow runs. You can list the available workloads by using `opensearch-benchmark list workloads` or view any included workloads in the [OpenSearch Benchmark Workloads repository](https://github.com/opensearch-project/opensearch-benchmark-workloads/). For more information about the elements of a workload, see [Anatomy of a workload](#anatomy-of-a-workload). For information about building a custom workload, see [Creating custom workloads]({{site.url}}{{site.baseurl}}/benchmark/creating-custom-workloads/). +- **Workload**: The description of one or more benchmarking scenarios that use a specific document corpus to perform a benchmark against your cluster. The document corpus contains any indexes, data files, and operations invoked when the workflow runs. You can list the available workloads by using `opensearch-benchmark list workloads` or view any included workloads in the [OpenSearch Benchmark Workloads repository](https://github.com/opensearch-project/opensearch-benchmark-workloads/). For more information about the elements of a workload, see [Anatomy of a workload]({{site.url}}{{site.baseurl}}/benchmark/user-guide/understanding-workloads/anatomy-of-a-workload/). For information about building a custom workload, see [Creating custom workloads]({{site.url}}{{site.baseurl}}/benchmark/creating-custom-workloads/). - **Pipeline**: A series of steps occurring before and after a workload is run that determines benchmark results. 
OpenSearch Benchmark supports three pipelines: - `from-sources`: Builds and provisions OpenSearch, runs a benchmark, and then publishes the results. @@ -110,149 +110,5 @@ This latency cascade continues, increasing latency by 100ms for each subsequent ### Recommendation -As shown by the preceding examples, you should be aware of the average service time of each task and provide a `target-throughput` that accounts for the service time. The OpenSearch Benchmark latency is calculated based on the `target-throughput` set by the user. In other words, the OpenSearch Benchmark latency could be redefined as "throughput-based latency". - -## Anatomy of a workload - -The following example workload shows all of the essential elements needed to create a `workload.json` file. You can run this workload in your own benchmark configuration to understand how all of the elements work together: - -```json -{ - "description": "Tutorial benchmark for OpenSearch Benchmark", - "indices": [ - { - "name": "movies", - "body": "index.json" - } - ], - "corpora": [ - { - "name": "movies", - "documents": [ - { - "source-file": "movies-documents.json", - "document-count": 11658903, # Fetch document count from command line - "uncompressed-bytes": 1544799789 # Fetch uncompressed bytes from command line - } - ] - } - ], - "schedule": [ - { - "operation": { - "operation-type": "create-index" - } - }, - { - "operation": { - "operation-type": "cluster-health", - "request-params": { - "wait_for_status": "green" - }, - "retry-until-success": true - } - }, - { - "operation": { - "operation-type": "bulk", - "bulk-size": 5000 - }, - "warmup-time-period": 120, - "clients": 8 - }, - { - "operation": { - "name": "query-match-all", - "operation-type": "search", - "body": { - "query": { - "match_all": {} - } - } - }, - "iterations": 1000, - "target-throughput": 100 - } - ] -} -``` - -A workload usually includes the following elements: - -- [indices]({{site.url}}{{site.baseurl}}/benchmark/workloads/indices/): Defines the relevant indexes and index templates used for the workload. -- [corpora]({{site.url}}{{site.baseurl}}/benchmark/workloads/corpora/): Defines all document corpora used for the workload. -- `schedule`: Defines operations and the order in which the operations run inline. Alternatively, you can use `operations` to group operations and the `test_procedures` parameter to specify the order of operations. -- `operations`: **Optional**. Describes which operations are available for the workload and how they are parameterized. - -### Indices - -To create an index, specify its `name`. To add definitions to your index, use the `body` option and point it to the JSON file containing the index definitions. For more information, see [indices]({{site.url}}{{site.baseurl}}/benchmark/workloads/indices/). - -### Corpora - -The `corpora` element requires the name of the index containing the document corpus, for example, `movies`, and a list of parameters that define the document corpora. This list includes the following parameters: - -- `source-file`: The file name that contains the workload's corresponding documents. When using OpenSearch Benchmark locally, documents are contained in a JSON file. When providing a `base_url`, use a compressed file format: `.zip`, `.bz2`, `.gz`, `.tar`, `.tar.gz`, `.tgz`, or `.tar.bz2`. The compressed file must have one JSON file containing the name. -- `document-count`: The number of documents in the `source-file`, which determines which client indexes correlate to which parts of the document corpus. 
Each N client receives an Nth of the document corpus. When using a source that contains a document with a parent-child relationship, specify the number of parent documents. -- `uncompressed-bytes`: The size, in bytes, of the source file after decompression, indicating how much disk space the decompressed source file needs. -- `compressed-bytes`: The size, in bytes, of the source file before decompression. This can help you assess the amount of time needed for the cluster to ingest documents. - -### Operations - -The `operations` element lists the OpenSearch API operations performed by the workload. For example, you can set an operation to `create-index`, an index in the test cluster to which OpenSearch Benchmark can write documents. Operations are usually listed inside of `schedule`. - -### Schedule - -The `schedule` element contains a list of actions and operations that are run by the workload. Operations run according to the order in which they appear in the `schedule`. The following example illustrates a `schedule` with multiple operations, each defined by its `operation-type`: - -```json - "schedule": [ - { - "operation": { - "operation-type": "create-index" - } - }, - { - "operation": { - "operation-type": "cluster-health", - "request-params": { - "wait_for_status": "green" - }, - "retry-until-success": true - } - }, - { - "operation": { - "operation-type": "bulk", - "bulk-size": 5000 - }, - "warmup-time-period": 120, - "clients": 8 - }, - { - "operation": { - "name": "query-match-all", - "operation-type": "search", - "body": { - "query": { - "match_all": {} - } - } - }, - "iterations": 1000, - "target-throughput": 100 - } - ] -} -``` - -According to this schedule, the actions will run in the following order: - -1. The `create-index` operation creates an index. The index remains empty until the `bulk` operation adds documents with benchmarked data. -2. The `cluster-health` operation assesses the health of the cluster before running the workload. In this example, the workload waits until the status of the cluster's health is `green`. - - The `bulk` operation runs the `bulk` API to index `5000` documents simultaneously. - - Before benchmarking, the workload waits until the specified `warmup-time-period` passes. In this example, the warmup period is `120` seconds. -5. The `clients` field defines the number of clients that will run the remaining actions in the schedule concurrently. -6. The `search` runs a `match_all` query to match all documents after they have been indexed by the `bulk` API using the 8 clients specified. - - The `iterations` field indicates the number of times each client runs the `search` operation. The report generated by the benchmark automatically adjusts the percentile numbers based on this number. To generate a precise percentile, the benchmark needs to run at least 1,000 iterations. - - Lastly, the `target-throughput` field defines the number of requests per second each client performs, which, when set, can help reduce the latency of the benchmark. For example, a `target-throughput` of 100 requests divided by 8 clients means that each client will issue 12 requests per second. +As shown by the preceding examples, you should be aware of the average service time of each task and provide a `target-throughput` that accounts for the service time. The OpenSearch Benchmark latency is calculated based on the `target-throughput` set by the user, that is, the latency could be redefined as "throughput-based latency." 
diff --git a/_benchmark/user-guide/contributing-workloads.md b/_benchmark/user-guide/contributing-workloads.md new file mode 100644 index 0000000000..e60f60eaed --- /dev/null +++ b/_benchmark/user-guide/contributing-workloads.md @@ -0,0 +1,57 @@
---
layout: default
title: Sharing custom workloads
nav_order: 11
parent: User guide
---

# Sharing custom workloads

You can share a custom workload with other OpenSearch users by uploading it to the [workloads repository](https://github.com/opensearch-project/opensearch-benchmark-workloads/) on GitHub.

Make sure that the workload's dataset does not contain proprietary data or personally identifiable information (PII).

To share a custom workload, follow these steps.

## Create a README.md

Provide a detailed `README.md` file that includes the following:

- The purpose of the workload. When creating a description for the workload, consider its specific use case and how it differs from others in the [workloads repository](https://github.com/opensearch-project/opensearch-benchmark-workloads/).
- An example document from the dataset that helps users understand the data's structure.
- The workload parameters that can be used to customize the workload.
- A list of default test procedures included in the workload as well as other test procedures that the workload can run.
- An output sample produced by the workload after a test is run.
- A copy of the open-source license that gives the user and OpenSearch Benchmark permission to use the dataset.

For an example workload README file, go to the `http_logs` [README](https://github.com/opensearch-project/opensearch-benchmark-workloads/blob/main/http_logs/README.md).

## Verify the workload's structure

The workload must include the following files:

- `workload.json`
- `index.json`
- `files.txt`
- `test_procedures/default.json`
- `operations/default.json`

Both `default.json` files can be given more descriptive names. The workload can include an optional `workload.py` file to add more dynamic functionality. For more information about a file's contents, go to [Anatomy of a workload]({{site.url}}{{site.baseurl}}/benchmark/user-guide/understanding-workloads/anatomy-of-a-workload/).

## Testing the workload

All workloads contributed to OpenSearch Benchmark must fulfill the following testing requirements:

- All tests run to explore and produce an example from the workload must target an OpenSearch cluster.
- The workload must pass all integration tests. Follow these steps to ensure that the workload passes the integration tests:
  1. Add the workload to your forked copy of the [workloads repository](https://github.com/opensearch-project/opensearch-benchmark-workloads/). Make sure that you've forked both the `opensearch-benchmark-workloads` repository and the [OpenSearch Benchmark](https://github.com/opensearch-project/opensearch-benchmark) repository.
  2. In your forked OpenSearch Benchmark repository, update the `benchmark-os-it.ini` and `benchmark-in-memory.ini` files in the `/osbenchmark/it/resources` directory to point to the forked workloads repository containing your workload.
  3. After you've modified the `.ini` files, commit your changes to a branch for testing.
  4. Run your integration tests using GitHub actions by selecting the branch for which you committed your changes. Verify that the tests have run as expected.
  5.
If your integration tests run as expected, go to your forked workloads repository and merge your workload changes into branches `1` and `2`. This allows for your workload to appear in both major versions of OpenSearch Benchmark. + +## Create a PR + +After testing the workload, create a pull request (PR) from your fork to the `opensearch-project` [workloads repository](https://github.com/opensearch-project/opensearch-benchmark-workloads/). Add a sample output and summary result to the PR description. The OpenSearch Benchmark maintainers will review the PR. + +Once the PR is approved, you must share the data corpora of your dataset. The OpenSearch Benchmark team can then add the dataset to a shared S3 bucket. If your data corpora is stored in an S3 bucket, you can use [AWS DataSync](https://docs.aws.amazon.com/datasync/latest/userguide/create-s3-location.html) to share the data corpora. Otherwise, you must inform the maintainers of where the data corpora resides. diff --git a/_benchmark/user-guide/creating-osb-workloads.md b/_benchmark/user-guide/creating-custom-workloads.md similarity index 99% rename from _benchmark/user-guide/creating-osb-workloads.md rename to _benchmark/user-guide/creating-custom-workloads.md index 76c573907f..d06610467f 100644 --- a/_benchmark/user-guide/creating-osb-workloads.md +++ b/_benchmark/user-guide/creating-custom-workloads.md @@ -1,11 +1,11 @@ --- layout: default -title: Creating OpenSearch Benchmark workloads +title: Creating custom workloads nav_order: 10 parent: User guide redirect_from: - /benchmark/creating-custom-workloads/ - - /benchmark/user-guide/creating-custom-workloads + - /benchmark/user-guide/creating-osb-workloads/ --- # Creating custom workloads diff --git a/_benchmark/user-guide/distributed-load.md b/_benchmark/user-guide/distributed-load.md index ec46091974..60fc98500f 100644 --- a/_benchmark/user-guide/distributed-load.md +++ b/_benchmark/user-guide/distributed-load.md @@ -1,7 +1,7 @@ --- layout: default title: Running distributed loads -nav_order: 10 +nav_order: 15 parent: User guide --- diff --git a/_benchmark/user-guide/running-workloads.md b/_benchmark/user-guide/running-workloads.md new file mode 100644 index 0000000000..36108eb9c8 --- /dev/null +++ b/_benchmark/user-guide/running-workloads.md @@ -0,0 +1,168 @@ +--- +layout: default +title: Running a workload +nav_order: 9 +parent: User guide +--- + +# Running a workload + +Once you have a complete understanding of the various components of an OpenSearch Benchmark [workload]({{site.url}}{{site.baseurl}}/benchmark/user-guide/understanding-workloads/anatomy-of-a-workload/), you can run your first workload. + +## Step 1: Find the workload name + +To learn more about the standard workloads included with OpenSearch Benchmark, use the following command: + +``` +opensearch-benchmark list workloads +``` +{% include copy.html %} + +A list of all workloads supported by OpenSearch Benchmark appears. Review the list and select the workload that's most similar to your cluster's use case. + +## Step 2: Running the test + +After you've selected the workload, you can invoke the workload using the `opensearch-benchmark execute-test` command. Replace `--target-host` with the `host:port` pairs for your cluster and `--client-options` with any authorization options required to access the cluster. The following example runs the `nyc_taxis` workload on a localhost for testing purposes. 
+ +If you want to run a test on an external cluster, see [Running the workload on your own cluster](#running-a-workload-on-an-external-cluster). + +```bash +opensearch-benchmark execute-test --pipeline=benchmark-only --workload=nyc_taxis --target-host=https://localhost:9200 --client-options=basic_auth_user:admin,basic_auth_password:admin,verify_certs:false +``` +{% include copy.html %} + + +Results from the test appear in the directory set by the `--output-path` option in the `execute-test` command. + +### Test mode + +If you want to run the test in test mode to make sure that your workload operates as intended, add the `--test-mode` option to the `execute-test` command. Test mode ingests only the first 1,000 documents from each index provided and runs query operations against them. + +## Step 3: Validate the test + +After running an OpenSearch Benchmark test, take the following steps to verify that it has run properly: + +1. Note the number of documents in the OpenSearch or OpenSearch Dashboards index that you plan to run the benchmark against. +2. In the results returned by OpenSearch Benchmark, compare the `workload.json` file for your specific workload and verify that the document count matches the number of documents. For example, based on the [nyc_taxis](https://github.com/opensearch-project/opensearch-benchmark-workloads/blob/main/nyc_taxis/workload.json#L20) `workload.json` file, you should expect to see `165346692` documents in your cluster. + +## Expected results + +OSB returns the following response once the benchmark completes: + +```bash +------------------------------------------------------ + _______ __ _____ + / ____(_)___ ____ _/ / / ___/_________ ________ + / /_ / / __ \/ __ `/ / \__ \/ ___/ __ \/ ___/ _ \ + / __/ / / / / / /_/ / / ___/ / /__/ /_/ / / / __/ +/_/ /_/_/ /_/\__,_/_/ /____/\___/\____/_/ \___/ +------------------------------------------------------ + +| Metric | Task | Value | Unit | +|---------------------------------------------------------------:|-------------------------------------------:|------------:|-------:| +| Cumulative indexing time of primary shards | | 0.02655 | min | +| Min cumulative indexing time across primary shards | | 0 | min | +| Median cumulative indexing time across primary shards | | 0.00176667 | min | +| Max cumulative indexing time across primary shards | | 0.0140333 | min | +| Cumulative indexing throttle time of primary shards | | 0 | min | +| Min cumulative indexing throttle time across primary shards | | 0 | min | +| Median cumulative indexing throttle time across primary shards | | 0 | min | +| Max cumulative indexing throttle time across primary shards | | 0 | min | +| Cumulative merge time of primary shards | | 0.0102333 | min | +| Cumulative merge count of primary shards | | 3 | | +| Min cumulative merge time across primary shards | | 0 | min | +| Median cumulative merge time across primary shards | | 0 | min | +| Max cumulative merge time across primary shards | | 0.0102333 | min | +| Cumulative merge throttle time of primary shards | | 0 | min | +| Min cumulative merge throttle time across primary shards | | 0 | min | +| Median cumulative merge throttle time across primary shards | | 0 | min | +| Max cumulative merge throttle time across primary shards | | 0 | min | +| Cumulative refresh time of primary shards | | 0.0709333 | min | +| Cumulative refresh count of primary shards | | 118 | | +| Min cumulative refresh time across primary shards | | 0 | min | +| Median cumulative refresh time across primary shards | | 0.00186667 
| min | +| Max cumulative refresh time across primary shards | | 0.0511667 | min | +| Cumulative flush time of primary shards | | 0.00963333 | min | +| Cumulative flush count of primary shards | | 4 | | +| Min cumulative flush time across primary shards | | 0 | min | +| Median cumulative flush time across primary shards | | 0 | min | +| Max cumulative flush time across primary shards | | 0.00398333 | min | +| Total Young Gen GC time | | 0 | s | +| Total Young Gen GC count | | 0 | | +| Total Old Gen GC time | | 0 | s | +| Total Old Gen GC count | | 0 | | +| Store size | | 0.000485923 | GB | +| Translog size | | 2.01873e-05 | GB | +| Heap used for segments | | 0 | MB | +| Heap used for doc values | | 0 | MB | +| Heap used for terms | | 0 | MB | +| Heap used for norms | | 0 | MB | +| Heap used for points | | 0 | MB | +| Heap used for stored fields | | 0 | MB | +| Segment count | | 32 | | +| Min Throughput | index | 3008.97 | docs/s | +| Mean Throughput | index | 3008.97 | docs/s | +| Median Throughput | index | 3008.97 | docs/s | +| Max Throughput | index | 3008.97 | docs/s | +| 50th percentile latency | index | 351.059 | ms | +| 100th percentile latency | index | 365.058 | ms | +| 50th percentile service time | index | 351.059 | ms | +| 100th percentile service time | index | 365.058 | ms | +| error rate | index | 0 | % | +| Min Throughput | wait-until-merges-finish | 28.41 | ops/s | +| Mean Throughput | wait-until-merges-finish | 28.41 | ops/s | +| Median Throughput | wait-until-merges-finish | 28.41 | ops/s | +| Max Throughput | wait-until-merges-finish | 28.41 | ops/s | +| 100th percentile latency | wait-until-merges-finish | 34.7088 | ms | +| 100th percentile service time | wait-until-merges-finish | 34.7088 | ms | +| error rate | wait-until-merges-finish | 0 | % | +| Min Throughput | percolator_with_content_president_bush | 36.09 | ops/s | +| Mean Throughput | percolator_with_content_president_bush | 36.09 | ops/s | +| Median Throughput | percolator_with_content_president_bush | 36.09 | ops/s | +| Max Throughput | percolator_with_content_president_bush | 36.09 | ops/s | +| 100th percentile latency | percolator_with_content_president_bush | 35.9822 | ms | +| 100th percentile service time | percolator_with_content_president_bush | 7.93048 | ms | +| error rate | percolator_with_content_president_bush | 0 | % | + +[...] 
+ +| Min Throughput | percolator_with_content_ignore_me | 16.1 | ops/s | +| Mean Throughput | percolator_with_content_ignore_me | 16.1 | ops/s | +| Median Throughput | percolator_with_content_ignore_me | 16.1 | ops/s | +| Max Throughput | percolator_with_content_ignore_me | 16.1 | ops/s | +| 100th percentile latency | percolator_with_content_ignore_me | 131.798 | ms | +| 100th percentile service time | percolator_with_content_ignore_me | 69.5237 | ms | +| error rate | percolator_with_content_ignore_me | 0 | % | +| Min Throughput | percolator_no_score_with_content_ignore_me | 29.37 | ops/s | +| Mean Throughput | percolator_no_score_with_content_ignore_me | 29.37 | ops/s | +| Median Throughput | percolator_no_score_with_content_ignore_me | 29.37 | ops/s | +| Max Throughput | percolator_no_score_with_content_ignore_me | 29.37 | ops/s | +| 100th percentile latency | percolator_no_score_with_content_ignore_me | 45.5703 | ms | +| 100th percentile service time | percolator_no_score_with_content_ignore_me | 11.316 | ms | +| error rate | percolator_no_score_with_content_ignore_me | 0 | % | + + + +-------------------------------- +[INFO] SUCCESS (took 18 seconds) +-------------------------------- +``` + + + +## Running a workload on an external cluster + +Now that you're familiar with running OpenSearch Benchmark on a local cluster, you can run it on your external cluster, as described in the following steps: + +1. Replace `https://localhost:9200` with your target cluster endpoint. This could be a Uniform Resource Identifier (URI), such as `https://search.mydomain.com`, or a `HOST:PORT` specification. +2. If the cluster is configured with basic authentication, replace the username and password in the command line with the appropriate credentials. +3. Remove the `verify_certs:false` directive if you are not specifying `localhost` as your target cluster. This directive is necessary solely for clusters without SSL certificates. +4. If you are using a `HOST:PORT`specification and plan to use SSL or TLS, either specify `https://` or add the `use_ssl:true` directive to the `--client-options` string option. +5. Remove the `--test-mode` flag to run the full workload rather than an abbreviated test. + +You can copy the following command template to use it in your own terminal: + +```bash +opensearch-benchmark execute-test --pipeline=benchmark-only --workload=nyc_taxis --target-host= --client-options=basic_auth_user:admin,basic_auth_password:admin +``` +{% include copy.html %} diff --git a/_benchmark/user-guide/telemetry.md b/_benchmark/user-guide/telemetry.md index 7cc7f6b730..d4c40c790a 100644 --- a/_benchmark/user-guide/telemetry.md +++ b/_benchmark/user-guide/telemetry.md @@ -1,7 +1,7 @@ --- layout: default title: Enabling telemetry devices -nav_order: 15 +nav_order: 30 parent: User guide --- diff --git a/_benchmark/user-guide/understanding-workloads/anatomy-of-a-workload.md b/_benchmark/user-guide/understanding-workloads/anatomy-of-a-workload.md new file mode 100644 index 0000000000..b54932470d --- /dev/null +++ b/_benchmark/user-guide/understanding-workloads/anatomy-of-a-workload.md @@ -0,0 +1,790 @@ +--- +layout: default +title: Anatomy of a workload +nav_order: 15 +grand_parent: User guide +parent: Understanding workloads +--- + +# Anatomy of a workload + +All workloads contain the following files and directories: + +- [workload.json](#workloadjson): Contains all of the workload settings. +- [index.json](#indexjson): Contains the document mappings and parameters as well as index settings. 
+- [files.txt](#filestxt): Contains the data corpora file names. +- [_test-procedures](#_operations-and-_test-procedures): Most workloads contain only one default test procedure, which is configured in `default.json`. +- [_operations](#_operations-and-_test-procedures): Contains all of the operations used in test procedures. +- workload.py: Adds more dynamic functionality to the test. + +## workload.json + +The following example workload shows all of the essential elements needed to create a `workload.json` file. You can run this workload in your own benchmark configuration to understand how all of the elements work together: + +```json +{ + "description": "Tutorial benchmark for OpenSearch Benchmark", + "indices": [ + { + "name": "movies", + "body": "index.json" + } + ], + "corpora": [ + { + "name": "movies", + "documents": [ + { + "source-file": "movies-documents.json", + "document-count": 11658903, # Fetch document count from command line + "uncompressed-bytes": 1544799789 # Fetch uncompressed bytes from command line + } + ] + } + ], + "schedule": [ + { + "operation": { + "operation-type": "create-index" + } + }, + { + "operation": { + "operation-type": "cluster-health", + "request-params": { + "wait_for_status": "green" + }, + "retry-until-success": true + } + }, + { + "operation": { + "operation-type": "bulk", + "bulk-size": 5000 + }, + "warmup-time-period": 120, + "clients": 8 + }, + { + "operation": { + "name": "query-match-all", + "operation-type": "search", + "body": { + "query": { + "match_all": {} + } + } + }, + "iterations": 1000, + "target-throughput": 100 + } + ] +} +``` + +A workload usually includes the following elements: + +- [indices]({{site.url}}{{site.baseurl}}/benchmark/workloads/indices/): Defines the relevant indexes and index templates used for the workload. +- [corpora]({{site.url}}{{site.baseurl}}/benchmark/workloads/corpora/): Defines all document corpora used for the workload. +- `schedule`: Defines operations and the order in which the operations run inline. Alternatively, you can use `operations` to group operations and the `test_procedures` parameter to specify the order of operations. +- `operations`: **Optional**. Describes which operations are available for the workload and how they are parameterized. + +### Indices + +To create an index, specify its `name`. To add definitions to your index, use the `body` option and point it to the JSON file containing the index definitions. For more information, see [Indices]({{site.url}}{{site.baseurl}}/benchmark/workloads/indices/). + +### Corpora + +The `corpora` element requires the name of the index containing the document corpus, for example, `movies`, and a list of parameters that define the document corpora. This list includes the following parameters: + +- `source-file`: The file name that contains the workload's corresponding documents. When using OpenSearch Benchmark locally, documents are contained in a JSON file. When providing a `base_url`, use a compressed file format: `.zip`, `.bz2`, `.zst`, `.gz`, `.tar`, `.tar.gz`, `.tgz`, or `.tar.bz2`. The compressed file must include one JSON file containing the name. +- `document-count`: The number of documents in the `source-file`, which determines which client indexes correlate to which parts of the document corpus. Each N client is assigned an Nth of the document corpus to ingest into the test cluster. When using a source that contains a document with a parent-child relationship, specify the number of parent documents. 
+- `uncompressed-bytes`: The size, in bytes, of the source file after decompression, indicating how much disk space the decompressed source file needs. +- `compressed-bytes`: The size, in bytes, of the source file before decompression. This can help you assess the amount of time needed for the cluster to ingest documents. + +### Operations + +The `operations` element lists the OpenSearch API operations performed by the workload. For example, you can list an operation named `create-index` that creates an index in the benchmark cluster to which OpenSearch Benchmark can write documents. Operations are usually listed inside of the `schedule` element. + +### Schedule + +The `schedule` element contains a list of operations that are run in a specified order, as shown in the following JSON example: + +```json + "schedule": [ + { + "operation": { + "operation-type": "create-index" + } + }, + { + "operation": { + "operation-type": "cluster-health", + "request-params": { + "wait_for_status": "green" + }, + "retry-until-success": true + } + }, + { + "operation": { + "operation-type": "bulk", + "bulk-size": 5000 + }, + "warmup-time-period": 120, + "clients": 8 + }, + { + "operation": { + "name": "query-match-all", + "operation-type": "search", + "body": { + "query": { + "match_all": {} + } + } + }, + "iterations": 1000, + "target-throughput": 100 + } + ] +} +``` + +According to this `schedule`, the actions will run in the following order: + +1. The `create-index` operation creates an index. The index remains empty until the `bulk` operation adds documents with benchmarked data. +2. The `cluster-health` operation assesses the cluster's health before running the workload. In the JSON example, the workload waits until the cluster's health status is `green`. + - The `bulk` operation runs the `bulk` API to index `5000` documents simultaneously. + - Before benchmarking, the workload waits until the specified `warmup-time-period` passes. In the JSON example, the warmup period is `120` seconds. +3. The `clients` field defines the number of clients, in this example, eight, that will run the bulk indexing operation concurrently. +4. The `search` operation runs a `match_all` query to match all documents after they have been indexed by the `bulk` API using the specified clients. + - The `iterations` field defines the number of times each client runs the `search` operation. The benchmark report automatically adjusts the percentile numbers based on this number. To generate a precise percentile, the benchmark needs to run at least 1,000 iterations. + - The `target-throughput` field defines the number of requests per second that each client performs. When set, the setting can help reduce benchmark latency. For example, a `target-throughput` of 100 requests divided by 8 clients means that each client will issue 12 requests per second. For more information about how target throughput is defined in OpenSearch Benchmark, see [Throughput and latency](https://opensearch.org/docs/latest/benchmark/user-guide/concepts/#throughput-and-latency). + +## index.json + +The `index.json` file defines the data mappings, indexing parameters, and index settings for workload documents during `create-index` operations. + +When OpenSearch Benchmark creates an index for the workload, it uses the index settings and mappings template in the `index.json` file. Mappings in the `index.json` file are based on the mappings of a single document from the workload's corpus, which is stored in the `files.txt` file. 
The following is an example of the `index.json` file for the `nyc_taxis` workload. You can customize the fields, such as `number_of_shards`, `number_of_replicas`, `query_cache_enabled`, and `requests_cache_enabled`. + +```json +{ + "settings": { + "index.number_of_shards": {% raw %}{{number_of_shards | default(1)}}{% endraw %}, + "index.number_of_replicas": {% raw %}{{number_of_replicas | default(0)}}{% endraw %}, + "index.queries.cache.enabled": {% raw %}{{query_cache_enabled | default(false) | tojson}}{% endraw %}, + "index.requests.cache.enable": {% raw %}{{requests_cache_enabled | default(false) | tojson}}{% endraw %} + }, + "mappings": { + "_source": { + "enabled": {% raw %}{{ source_enabled | default(true) | tojson }}{% endraw %} + }, + "properties": { + "surcharge": { + "scaling_factor": 100, + "type": "scaled_float" + }, + "dropoff_datetime": { + "type": "date", + "format": "yyyy-MM-dd HH:mm:ss" + }, + "trip_type": { + "type": "keyword" + }, + "mta_tax": { + "scaling_factor": 100, + "type": "scaled_float" + }, + "rate_code_id": { + "type": "keyword" + }, + "passenger_count": { + "type": "integer" + }, + "pickup_datetime": { + "type": "date", + "format": "yyyy-MM-dd HH:mm:ss" + }, + "tolls_amount": { + "scaling_factor": 100, + "type": "scaled_float" + }, + "tip_amount": { + "type": "half_float" + }, + "payment_type": { + "type": "keyword" + }, + "extra": { + "scaling_factor": 100, + "type": "scaled_float" + }, + "vendor_id": { + "type": "keyword" + }, + "store_and_fwd_flag": { + "type": "keyword" + }, + "improvement_surcharge": { + "scaling_factor": 100, + "type": "scaled_float" + }, + "fare_amount": { + "scaling_factor": 100, + "type": "scaled_float" + }, + "ehail_fee": { + "scaling_factor": 100, + "type": "scaled_float" + }, + "cab_color": { + "type": "keyword" + }, + "dropoff_location": { + "type": "geo_point" + }, + "vendor_name": { + "type": "text" + }, + "total_amount": { + "scaling_factor": 100, + "type": "scaled_float" + }, + "trip_distance": {% raw %}{%- if trip_distance_mapping is defined %} {{ trip_distance_mapping | tojson }} {%- else %}{% endraw %} { + "scaling_factor": 100, + "type": "scaled_float" + }{% raw %}{%- endif %}{% endraw %}, + "pickup_location": { + "type": "geo_point" + } + }, + "dynamic": "strict" + } +} +``` + +## files.txt + +The `files.txt` file lists the files that store the workload data, which are typically stored in a zipped JSON file. + +## _operations and _test-procedures + +To make the workload more human-readable, `_operations` and `_test-procedures` are separated into two directories. + +The `_operations` directory contains a `default.json` file that lists all of the supported operations that the test procedure can use. Some workloads, such as `nyc_taxis`, contain an additional `.json` file that lists feature-specific operations, such as `snapshot` operations. 
The following JSON example shows a list of operations from the `nyc_taxis` workload: + +```json + { + "name": "index", + "operation-type": "bulk", + "bulk-size": {% raw %}{{bulk_size | default(10000)}}{% endraw %}, + "ingest-percentage": {% raw %}{{ingest_percentage | default(100)}}{% endraw %} + }, + { + "name": "update", + "operation-type": "bulk", + "bulk-size": {% raw %}{{bulk_size | default(10000)}}, + "ingest-percentage": {{ingest_percentage | default(100)}}, + "conflicts": "{{conflicts | default('random')}}", + "on-conflict": "{{on_conflict | default('update')}}", + "conflict-probability": {{conflict_probability | default(25)}}, + "recency": {{recency | default(0)}}{% endraw %} + }, + { + "name": "wait-until-merges-finish", + "operation-type": "index-stats", + "index": "_all", + "condition": { + "path": "_all.total.merges.current", + "expected-value": 0 + }, + "retry-until-success": true, + "include-in-reporting": false + }, + { + "name": "default", + "operation-type": "search", + "body": { + "query": { + "match_all": {} + } + } + }, + { + "name": "range", + "operation-type": "search", + "body": { + "query": { + "range": { + "total_amount": { + "gte": 5, + "lt": 15 + } + } + } + } + }, + { + "name": "distance_amount_agg", + "operation-type": "search", + "body": { + "size": 0, + "query": { + "bool": { + "filter": { + "range": { + "trip_distance": { + "lt": 50, + "gte": 0 + } + } + } + } + }, + "aggs": { + "distance_histo": { + "histogram": { + "field": "trip_distance", + "interval": 1 + }, + "aggs": { + "total_amount_stats": { + "stats": { + "field": "total_amount" + } + } + } + } + } + } + }, + { + "name": "autohisto_agg", + "operation-type": "search", + "body": { + "size": 0, + "query": { + "range": { + "dropoff_datetime": { + "gte": "01/01/2015", + "lte": "21/01/2015", + "format": "dd/MM/yyyy" + } + } + }, + "aggs": { + "dropoffs_over_time": { + "auto_date_histogram": { + "field": "dropoff_datetime", + "buckets": 20 + } + } + } + } + }, + { + "name": "date_histogram_agg", + "operation-type": "search", + "body": { + "size": 0, + "query": { + "range": { + "dropoff_datetime": { + "gte": "01/01/2015", + "lte": "21/01/2015", + "format": "dd/MM/yyyy" + } + } + }, + "aggs": { + "dropoffs_over_time": { + "date_histogram": { + "field": "dropoff_datetime", + "calendar_interval": "day" + } + } + } + } + }, + { + "name": "date_histogram_calendar_interval", + "operation-type": "search", + "body": { + "size": 0, + "query": { + "range": { + "dropoff_datetime": { + "gte": "2015-01-01 00:00:00", + "lt": "2016-01-01 00:00:00" + } + } + }, + "aggs": { + "dropoffs_over_time": { + "date_histogram": { + "field": "dropoff_datetime", + "calendar_interval": "month" + } + } + } + } + }, + { + "name": "date_histogram_calendar_interval_with_tz", + "operation-type": "search", + "body": { + "size": 0, + "query": { + "range": { + "dropoff_datetime": { + "gte": "2015-01-01 00:00:00", + "lt": "2016-01-01 00:00:00" + } + } + }, + "aggs": { + "dropoffs_over_time": { + "date_histogram": { + "field": "dropoff_datetime", + "calendar_interval": "month", + "time_zone": "America/New_York" + } + } + } + } + }, + { + "name": "date_histogram_fixed_interval", + "operation-type": "search", + "body": { + "size": 0, + "query": { + "range": { + "dropoff_datetime": { + "gte": "2015-01-01 00:00:00", + "lt": "2016-01-01 00:00:00" + } + } + }, + "aggs": { + "dropoffs_over_time": { + "date_histogram": { + "field": "dropoff_datetime", + "fixed_interval": "60d" + } + } + } + } + }, + { + "name": "date_histogram_fixed_interval_with_tz", 
+ "operation-type": "search", + "body": { + "size": 0, + "query": { + "range": { + "dropoff_datetime": { + "gte": "2015-01-01 00:00:00", + "lt": "2016-01-01 00:00:00" + } + } + }, + "aggs": { + "dropoffs_over_time": { + "date_histogram": { + "field": "dropoff_datetime", + "fixed_interval": "60d", + "time_zone": "America/New_York" + } + } + } + } + }, + { + "name": "date_histogram_fixed_interval_with_metrics", + "operation-type": "search", + "body": { + "size": 0, + "query": { + "range": { + "dropoff_datetime": { + "gte": "2015-01-01 00:00:00", + "lt": "2016-01-01 00:00:00" + } + } + }, + "aggs": { + "dropoffs_over_time": { + "date_histogram": { + "field": "dropoff_datetime", + "fixed_interval": "60d" + }, + "aggs": { + "total_amount": { "stats": { "field": "total_amount" } }, + "tip_amount": { "stats": { "field": "tip_amount" } }, + "trip_distance": { "stats": { "field": "trip_distance" } } + } + } + } + } + }, + { + "name": "auto_date_histogram", + "operation-type": "search", + "body": { + "size": 0, + "query": { + "range": { + "dropoff_datetime": { + "gte": "2015-01-01 00:00:00", + "lt": "2016-01-01 00:00:00" + } + } + }, + "aggs": { + "dropoffs_over_time": { + "auto_date_histogram": { + "field": "dropoff_datetime", + "buckets": "12" + } + } + } + } + }, + { + "name": "auto_date_histogram_with_tz", + "operation-type": "search", + "body": { + "size": 0, + "query": { + "range": { + "dropoff_datetime": { + "gte": "2015-01-01 00:00:00", + "lt": "2016-01-01 00:00:00" + } + } + }, + "aggs": { + "dropoffs_over_time": { + "auto_date_histogram": { + "field": "dropoff_datetime", + "buckets": "13", + "time_zone": "America/New_York" + } + } + } + } + }, + { + "name": "auto_date_histogram_with_metrics", + "operation-type": "search", + "body": { + "size": 0, + "query": { + "range": { + "dropoff_datetime": { + "gte": "2015-01-01 00:00:00", + "lt": "2016-01-01 00:00:00" + } + } + }, + "aggs": { + "dropoffs_over_time": { + "auto_date_histogram": { + "field": "dropoff_datetime", + "buckets": "12" + }, + "aggs": { + "total_amount": { "stats": { "field": "total_amount" } }, + "tip_amount": { "stats": { "field": "tip_amount" } }, + "trip_distance": { "stats": { "field": "trip_distance" } } + } + } + } + } + }, + { + "name": "desc_sort_tip_amount", + "operation-type": "search", + "index": "nyc_taxis", + "body": { + "query": { + "match_all": {} + }, + "sort" : [ + {"tip_amount" : "desc"} + ] + } + }, + { + "name": "asc_sort_tip_amount", + "operation-type": "search", + "index": "nyc_taxis", + "body": { + "query": { + "match_all": {} + }, + "sort" : [ + {"tip_amount" : "asc"} + ] + } + } +``` + +The `_test-procedures` directory contains a `default.json` file that sets the order of operations performed by the workload. Similar to the `_operations` directory, the `_test-procedures` directory can also contain feature-specific test procedures, such as `searchable_snapshots.json` for `nyc_taxis`. The following examples show the searchable snapshots test procedures for `nyc_taxis`: + +```json + { + "name": "searchable-snapshot", + "description": "Measuring performance for Searchable Snapshot feature. 
Based on the default test procedure 'append-no-conflicts'.", + "schedule": [ + { + "operation": "delete-index" + }, + { + "operation": { + "operation-type": "create-index", + "settings": {% raw %}{%- if index_settings is defined %} {{ index_settings | tojson }} {%- else %}{ + "index.codec": "best_compression", + "index.refresh_interval": "30s", + "index.translog.flush_threshold_size": "4g" + }{%- endif %}{% endraw %} + } + }, + { + "name": "check-cluster-health", + "operation": { + "operation-type": "cluster-health", + "index": "nyc_taxis", + "request-params": { + "wait_for_status": {% raw %}"{{ cluster_health | default('green') }}"{% endraw %}, + "wait_for_no_relocating_shards": "true" + }, + "retry-until-success": true + } + }, + { + "operation": "index", + "warmup-time-period": 240, + "clients": {% raw %}{{ bulk_indexing_clients | default(8) }}, + "ignore-response-error-level": "{{ error_level | default('non-fatal') }}"{% endraw %} + }, + { + "name": "refresh-after-index", + "operation": "refresh" + }, + { + "operation": { + "operation-type": "force-merge", + "request-timeout": 7200 + {% raw %}{%- if force_merge_max_num_segments is defined %}{% endraw %}, + "max-num-segments": {% raw %}{{ force_merge_max_num_segments | tojson }}{% endraw %} + {% raw %}{%- endif %}{% endraw %} + } + }, + { + "name": "refresh-after-force-merge", + "operation": "refresh" + }, + { + "operation": "wait-until-merges-finish" + }, + { + "operation": "create-snapshot-repository" + }, + { + "operation": "delete-snapshot" + }, + { + "operation": "create-snapshot" + }, + { + "operation": "wait-for-snapshot-creation" + }, + { + "operation": { + "name": "delete-local-index", + "operation-type": "delete-index" + } + }, + { + "operation": "restore-snapshot" + }, + { + "operation": "default", + "warmup-iterations": 50, + "iterations": 100 + {% raw %}{%- if not target_throughput %}{% endraw %} + ,"target-throughput": 3 + {% raw %}{%- elif target_throughput is string and target_throughput.lower() == 'none' %}{% endraw %} + {% raw %}{%- else %}{% endraw %} + ,"target-throughput": {% raw %}{{ target_throughput | tojson }}{% endraw %} + {% raw %}{%- endif %}{% endraw %} + {% raw %}{%-if search_clients is defined and search_clients %}{% endraw %} + ,"clients": {% raw %}{{ search_clients | tojson}}{% endraw %} + {% raw %}{%- endif %}{% endraw %} + }, + { + "operation": "range", + "warmup-iterations": 50, + "iterations": 100 + {% raw %}{%- if not target_throughput %}{% endraw %} + ,"target-throughput": 0.7 + {% raw %}{%- elif target_throughput is string and target_throughput.lower() == 'none' %}{% endraw %} + {% raw %}{%- else %}{% endraw %} + ,"target-throughput": {% raw %}{{ target_throughput | tojson }}{% endraw %} + {% raw %}{%- endif %}{% endraw %} + {% raw %}{%-if search_clients is defined and search_clients %}{% endraw %} + ,"clients": {% raw %}{{ search_clients | tojson}}{% endraw %} + {% raw %}{%- endif %}{% endraw %} + }, + { + "operation": "distance_amount_agg", + "warmup-iterations": 50, + "iterations": 50 + {% raw %}{%- if not target_throughput %}{% endraw %} + ,"target-throughput": 2 + {% raw %}{%- elif target_throughput is string and target_throughput.lower() == 'none' %}{% endraw %} + {% raw %}{%- else %}{% endraw %} + ,"target-throughput": {% raw %}{{ target_throughput | tojson }}{% endraw %} + {% raw %}{%- endif %}{% endraw %} + {% raw %}{%-if search_clients is defined and search_clients %}{% endraw %} + ,"clients": {% raw %}{{ search_clients | tojson}}{% endraw %} + {% raw %}{%- endif %}{% endraw %} + }, + { 
+ "operation": "autohisto_agg", + "warmup-iterations": 50, + "iterations": 100 + {% raw %}{%- if not target_throughput %}{% endraw %} + ,"target-throughput": 1.5 + {% raw %}{%- elif target_throughput is string and target_throughput.lower() == 'none' %}{% endraw %} + {% raw %}{%- else %}{% endraw %} + ,"target-throughput": {% raw %}{{ target_throughput | tojson }}{% endraw %} + {% raw %}{%- endif %}{% endraw %} + {% raw %}{%-if search_clients is defined and search_clients %}{% endraw %} + ,"clients": {% raw %}{{ search_clients | tojson}}{% endraw %} + {% raw %}{%- endif %}{% endraw %} + }, + { + "operation": "date_histogram_agg", + "warmup-iterations": 50, + "iterations": 100 + {% raw %}{%- if not target_throughput %}{% endraw %} + ,"target-throughput": 1.5 + {% raw %}{%- elif target_throughput is string and target_throughput.lower() == 'none' %}{% endraw %} + {% raw %}{%- else %}{% endraw %} + ,"target-throughput": {% raw %}{{ target_throughput | tojson }}{% endraw %} + {% raw %}{%- endif %}{% endraw %} + {% raw %}{%-if search_clients is defined and search_clients %}{% endraw %} + ,"clients": {% raw %}{{ search_clients | tojson}}{% endraw %} + {% raw %}{%- endif %}{% endraw %} + } + ] + } +``` + +## Next steps + +Now that you have familiarized yourself with the anatomy of a workload, see the criteria for [Choosing a workload]({{site.url}}{{site.baseurl}}/benchmark/user-guide/understanding-workloads/choosing-a-workload/). diff --git a/_benchmark/user-guide/understanding-workloads/choosing-a-workload.md b/_benchmark/user-guide/understanding-workloads/choosing-a-workload.md new file mode 100644 index 0000000000..d7ae48ad0a --- /dev/null +++ b/_benchmark/user-guide/understanding-workloads/choosing-a-workload.md @@ -0,0 +1,29 @@ +--- +layout: default +title: Choosing a workload +nav_order: 20 +grand_parent: User guide +parent: Understanding workloads +--- + +# Choosing a workload + +The [opensearch-benchmark-workloads](https://github.com/opensearch-project/opensearch-benchmark-workloads) repository contains a list of workloads that you can use to run your benchmarks. Using a workload similar to your cluster's use cases can save you time and effort when assessing your cluster's performance. + +For example, say you're a system architect at a rideshare company. As a rideshare company, you collect and store data based on trip times, locations, and other data related to each rideshare. Instead of building a custom workload and using your own data, which requires additional time, effort, and cost, you can use the [nyc_taxis](https://github.com/opensearch-project/opensearch-benchmark-workloads/tree/main/nyc_taxis) workload to benchmark your cluster because the data inside the workload is similar to the data that you collect. + +## Criteria for choosing a workload + +Consider the following criteria when deciding which workload would work best for benchmarking your cluster: + +- The cluster's use case. +- The data types that your cluster uses compared to the data structure of the documents contained in the workload. Each workload contains an example document so that you can compare data types, or you can view the index mappings and data types in the `index.json` file. +- The query types most commonly used inside your cluster. The `operations/default.json` file contains information about the query types and workload operations. 
+ +## General search clusters + +For benchmarking clusters built for general search use cases, start with the [`nyc_taxis`](https://github.com/opensearch-project/opensearch-benchmark-workloads/tree/main/nyc_taxis) workload. This workload contains data about the rides taken in yellow taxis in New York City in 2015. + +## Log data + +For benchmarking clusters built for indexing and search with log data, use the [`http_logs`](https://github.com/opensearch-project/opensearch-benchmark-workloads/tree/main/http_logs) workload. This workload contains data about the 1998 World Cup. \ No newline at end of file diff --git a/_benchmark/user-guide/understanding-workloads/index.md b/_benchmark/user-guide/understanding-workloads/index.md new file mode 100644 index 0000000000..844b565185 --- /dev/null +++ b/_benchmark/user-guide/understanding-workloads/index.md @@ -0,0 +1,14 @@ +--- +layout: default +title: Understanding workloads +nav_order: 7 +parent: User guide +has_children: true +--- + +# Understanding workloads + +OpenSearch Benchmark includes a set of [workloads](https://github.com/opensearch-project/opensearch-benchmark-workloads) that you can use to benchmark data from your cluster. Workloads contain descriptions of one or more benchmarking scenarios that use a specific document corpus to perform a benchmark against your cluster. The document corpus contains any indexes, data files, and operations invoked when the workload runs. + + + diff --git a/_clients/OpenSearch-dot-net.md b/_clients/OpenSearch-dot-net.md index 9e41fffe18..86488928e5 100644 --- a/_clients/OpenSearch-dot-net.md +++ b/_clients/OpenSearch-dot-net.md @@ -400,7 +400,7 @@ internal class Program FirstName = "Paulo", LastName = "Santos", Gpa = 3.93, - GradYear = 2021 };v + GradYear = 2021 }; var response = client.Index("students", "100", PostData.Serializable(student)); Console.WriteLine(response.Body); diff --git a/_clients/index.md b/_clients/index.md index 527879a94f..fc8c23d912 100644 --- a/_clients/index.md +++ b/_clients/index.md @@ -11,7 +11,10 @@ redirect_from: # OpenSearch language clients -OpenSearch provides clients in JavaScript, Python, Ruby, Java, PHP, .NET, Go and Rust. +OpenSearch provides clients in JavaScript, Python, Ruby, Java, PHP, .NET, Go, Hadoop, and Rust. + +The OpenSearch Java high-level REST client will be deprecated starting with OpenSearch 3.0.0 and will be removed in a future release. Switching to the [Java client]({{site.url}}{{site.baseurl}}/clients/java/) is recommended. +{: .warning} ## OpenSearch clients @@ -35,19 +38,18 @@ OpenSearch provides clients for the following programming languages and platform * [OpenSearch .NET clients]({{site.url}}{{site.baseurl}}/clients/dot-net/) * **Rust** * [OpenSearch Rust client]({{site.url}}{{site.baseurl}}/clients/rust/) +* **Hadoop** + * [OpenSearch Hadoop client](https://github.com/opensearch-project/opensearch-hadoop) -For a client compatibility matrix, see the COMPATIBILITY.md file in the client's repository. -{: .note} - -The OpenSearch Java high-level REST client will be deprecated starting with OpenSearch version 3.0.0 and will be removed in a future release. We recommend switching to the [Java client]({{site.url}}{{site.baseurl}}/clients/java/) instead. -{: .warning} ## Legacy clients -Most clients that work with Elasticsearch OSS 7.10.2 *should* work with OpenSearch, but the latest versions of those clients might include license or version checks that artificially break compatibility.
This page includes recommendations around which versions of those clients to use for best compatibility with OpenSearch. +Clients that work with Elasticsearch OSS 7.10.2 should work with OpenSearch 1.x. The latest versions of those clients, however, might include license or version checks that artificially break compatibility. The following table provides recommendations for which client versions to use for best compatibility with OpenSearch 1.x. For OpenSearch 2.0 and later, no Elasticsearch clients are fully compatible with OpenSearch. + +While OpenSearch and Elasticsearch share several core features, mixing and matching the client and server has a high risk of errors and unexpected results. As OpenSearch and Elasticsearch continue to diverge, such risks may increase. Although your Elasticsearch client may continue working with your OpenSearch cluster, using OpenSearch clients for OpenSearch clusters is recommended. +{: .warning} -For a client compatibility matrix, see the COMPATIBILITY.md file in the client's repository. -{: .note} +To view the compatibility matrix for a specific client, see the `COMPATIBILITY.md` file in the client's repository. Client | Recommended version :--- | :--- diff --git a/_clients/java.md b/_clients/java.md index 296e4d9a05..4c1e06a44b 100644 --- a/_clients/java.md +++ b/_clients/java.md @@ -222,10 +222,10 @@ public class OpenSearchClientExample { System.setProperty("javax.net.ssl.trustStore", "/full/path/to/keystore"); System.setProperty("javax.net.ssl.trustStorePassword", "password-to-keystore"); - final HttpHost host = new HttpHost("https", 9200, "localhost"); + final HttpHost host = new HttpHost("https", "localhost", 9200); final BasicCredentialsProvider credentialsProvider = new BasicCredentialsProvider(); //Only for demo purposes. Don't specify your credentials in code. - credentialsProvider.setCredentials(new AuthScope(host), new UsernamePasswordCredentials("admin", "admin")); + credentialsProvider.setCredentials(new AuthScope(host), new UsernamePasswordCredentials("admin", "admin".toCharArray())); //Initialize the client with SSL and TLS enabled final RestClient restClient = RestClient.builder(host). @@ -344,7 +344,7 @@ client.delete(b -> b.index(index).id("1")); The following sample code deletes an index: ```java -DeleteIndexRequest deleteIndexRequest = new DeleteRequest.Builder().index(index).build(); +DeleteIndexRequest deleteIndexRequest = new DeleteIndexRequest.Builder().index(index).build(); DeleteIndexResponse deleteIndexResponse = client.indices().delete(deleteIndexRequest); ``` {% include copy.html %} diff --git a/_clients/javascript/helpers.md b/_clients/javascript/helpers.md index b03af21f94..f88efd8e00 100644 --- a/_clients/javascript/helpers.md +++ b/_clients/javascript/helpers.md @@ -11,7 +11,7 @@ Helper methods simplify the use of complicated API tasks. For the client's compl ## Bulk helper -The bulk helper simplifies making complex bulk API requests. +The bulk helper simplifies making complex bulk API requests. The bulk helper supports operations of the same kind. Alternatively, you can use the `client.bulk` method to perform multiple types of bulk operations. For example, you can send `delete` and `index` operations in one bulk request. For more information, see the [Bulk guide](https://github.com/opensearch-project/opensearch-js/blob/main/guides/bulk.md). 
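As a rough sketch of that alternative, the following example combines an `index` and a `delete` operation in a single `client.bulk` request. It assumes a local cluster without the Security plugin and a hypothetical `books` index; adjust the connection details, index name, and document IDs to match your environment.

```javascript
"use strict";

const { Client } = require("@opensearch-project/opensearch");

// Assumption: a local cluster reachable over HTTP without the Security plugin.
const client = new Client({ node: "http://localhost:9200" });

async function mixedBulk() {
  const response = await client.bulk({
    body: [
      // Index (or overwrite) one document...
      { index: { _index: "books", _id: "1" } },
      { title: "The Outsider", author: "Stephen King" },
      // ...and delete a different document in the same request.
      { delete: { _index: "books", _id: "2" } },
    ],
  });

  console.log(response.body.errors); // true if any individual operation failed
  console.log(response.body.items); // per-operation results
}

mixedBulk().catch(console.log);
```
{% include copy.html %}

For many documents of the same operation type, the bulk helper described in the next section is usually the simpler choice.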
### Usage @@ -199,4 +199,7 @@ client.helpers.bulk({ } }) ``` -{% include copy.html %} \ No newline at end of file +{% include copy.html %} + +## Related articles +https://github.com/opensearch-project/opensearch-js/tree/main/guides diff --git a/_clients/javascript/index.md b/_clients/javascript/index.md index 2baed89722..58e9f19051 100644 --- a/_clients/javascript/index.md +++ b/_clients/javascript/index.md @@ -17,7 +17,7 @@ You can use helper methods to simplify the use of complicated API tasks. For mor ## Setup -To add the client to your project, install it from [npm](https://www.npmjs.com): +To add the client to your project, install it from [`npm`](https://www.npmjs.com): ```bash npm install @opensearch-project/opensearch @@ -31,7 +31,7 @@ npm install @opensearch-project/opensearch@ ``` {% include copy.html %} -If you prefer to add the client manually or just want to examine the source code, see [opensearch-js](https://github.com/opensearch-project/opensearch-js) on GitHub. +If you prefer to add the client manually or only want to examine the source code, see [`opensearch-js`](https://github.com/opensearch-project/opensearch-js) on GitHub. Then require the client: @@ -48,7 +48,7 @@ To connect to the default OpenSearch host, create a client object with the addre var host = "localhost"; var protocol = "https"; var port = 9200; -var auth = "admin:admin"; // For testing only. Don't store credentials in code. +var auth = "admin:"; // For testing only. Don't store credentials in code. var ca_certs_path = "/full/path/to/root-ca.pem"; // Optional client certificates if you don't want to use HTTP basic authentication. @@ -71,7 +71,22 @@ var client = new Client({ ``` {% include copy.html %} -## Authenticating with Amazon OpenSearch Service – AWS Sigv4 +If you are not using the Security plugin, create a client object with the address `http://localhost:9200`: + +```javascript +var host = "localhost"; +var protocol = "http"; +var port = 9200; + +// Create a client +var { Client } = require("@opensearch-project/opensearch"); +var client = new Client({ + node: protocol + "://" + host + ":" + port +}); +``` +{% include copy.html %} + +## Authenticating with Amazon OpenSearch Service: AWS Signature Version 4 Use the following code to authenticate with AWS V2 SDK: @@ -327,6 +342,43 @@ var response = await client.search({ ``` {% include copy.html %} +## Updating a document + +You can update a document using the client's `update` method: + +```javascript +var response = await client.update({ + index: index_name, + id: id, + body: { + doc: { + // Specify the fields and their updated values here + field1: "new_value1", + field2: "new_value2", + // Add more fields as needed + } + } +}); +``` +{% include copy.html %} + +For example, the following code updates the `genre` field and adds a `tv_adapted` field to the document specified by `id`: + +```javascript +var response = await client.update({ + index: index_name, + id: id, + body: { + doc: { + genre: "Detective fiction", + tv_adapted: true + } + }, + refresh: true + }); +``` +{% include copy.html %} + ## Deleting a document You can delete a document using the client's `delete` method: @@ -360,14 +412,14 @@ The following sample program creates a client, adds an index with non-default se var host = "localhost"; var protocol = "https"; var port = 9200; -var auth = "admin:admin"; // For testing only. Don't store credentials in code. +var auth = "admin:"; // For testing only. Don't store credentials in code. 
var ca_certs_path = "/full/path/to/root-ca.pem"; -// Optional client certificates if you don't want to use HTTP basic authentication. +// Optional client certificates if you don't want to use HTTP basic authentication // var client_cert_path = '/full/path/to/client.pem' // var client_key_path = '/full/path/to/client-key.pem' -// Create a client with SSL/TLS enabled. +// Create a client with SSL/TLS enabled var { Client } = require("@opensearch-project/opensearch"); var fs = require("fs"); var client = new Client({ @@ -382,7 +434,7 @@ var client = new Client({ }); async function search() { - // Create an index with non-default settings. + // Create an index with non-default settings var index_name = "books"; var settings = { @@ -402,7 +454,7 @@ async function search() { console.log("Creating index:"); console.log(response.body); - // Add a document to the index. + // Add a document to the index var document = { title: "The Outsider", author: "Stephen King", @@ -422,7 +474,7 @@ async function search() { console.log("Adding document:"); console.log(response.body); - // Search for the document. + // Search for the document var query = { query: { match: { @@ -439,9 +491,41 @@ async function search() { }); console.log("Search results:"); - console.log(response.body.hits); + console.log(JSON.stringify(response.body.hits, null, " ")); + + // Update a document + var response = await client.update({ + index: index_name, + id: id, + body: { + doc: { + genre: "Detective fiction", + tv_adapted: true + } + }, + refresh: true + }); - // Delete the document. + // Search for the updated document + var query = { + query: { + match: { + title: { + query: "The Outsider", + }, + }, + }, + }; + + var response = await client.search({ + index: index_name, + body: query, + }); + + console.log("Search results:"); + console.log(JSON.stringify(response.body.hits, null, " ")); + + // Delete the document var response = await client.delete({ index: index_name, id: id, @@ -450,7 +534,7 @@ async function search() { console.log("Deleting document:"); console.log(response.body); - // Delete the index. + // Delete the index var response = await client.indices.delete({ index: index_name, }); @@ -462,6 +546,7 @@ async function search() { search().catch(console.log); ``` {% include copy.html %} + ## Circuit breaker The `memoryCircuitBreaker` option can be used to prevent errors caused by a response payload being too large to fit into the heap memory available to the client. @@ -481,4 +566,4 @@ var client = new Client({ }, }); ``` -{% include copy.html %} \ No newline at end of file +{% include copy.html %} diff --git a/_clients/python-low-level.md b/_clients/python-low-level.md index 74920f0bb2..894bef0e38 100644 --- a/_clients/python-low-level.md +++ b/_clients/python-low-level.md @@ -10,7 +10,7 @@ redirect_from: The OpenSearch low-level Python client (`opensearch-py`) provides wrapper methods for the OpenSearch REST API so that you can interact with your cluster more naturally in Python. Rather than sending raw HTTP requests to a given URL, you can create an OpenSearch client for your cluster and call the client's built-in functions. For the client's complete API documentation and additional examples, see the [`opensearch-py` API documentation](https://opensearch-project.github.io/opensearch-py/). -This getting started guide illustrates how to connect to OpenSearch, index documents, and run queries. For the client source code, see the [opensearch-py repo](https://github.com/opensearch-project/opensearch-py). 
+This getting started guide illustrates how to connect to OpenSearch, index documents, and run queries. For the client source code, see the [`opensearch-py` repo](https://github.com/opensearch-project/opensearch-py). ## Setup @@ -348,4 +348,9 @@ response = client.indices.delete( print('\nDeleting index:') print(response) ``` -{% include copy.html %} \ No newline at end of file +{% include copy.html %} + +## Next steps + +- For Python client API, see the [`opensearch-py` API documentation](https://opensearch-project.github.io/opensearch-py/). +- For Python code samples, see [Samples](https://github.com/opensearch-project/opensearch-py/tree/main/samples). \ No newline at end of file diff --git a/_config.yml b/_config.yml index 6e0e67e1bd..6d6be4cd89 100644 --- a/_config.yml +++ b/_config.yml @@ -5,10 +5,10 @@ baseurl: "/docs/latest" # the subpath of your site, e.g. /blog url: "https://opensearch.org" # the base hostname & protocol for your site, e.g. http://example.com permalink: /:path/ -opensearch_version: '2.11.1' -opensearch_dashboards_version: '2.11.1' -opensearch_major_minor_version: '2.11' -lucene_version: '9_7_0' +opensearch_version: '2.13.0' +opensearch_dashboards_version: '2.13.0' +opensearch_major_minor_version: '2.13' +lucene_version: '9_10_0' # Build settings markdown: kramdown @@ -112,6 +112,15 @@ collections: about: permalink: /:collection/:path/ output: true + automating-configurations: + permalink: /:collection/:path/ + output: true + dashboards-assistant: + permalink: /:collection/:path/ + output: true + getting-started: + permalink: /:collection/:path/ + output: true opensearch_collection: # Define the collections used in the theme @@ -119,6 +128,9 @@ opensearch_collection: about: name: About OpenSearch nav_fold: true + getting-started: + name: Getting started + nav_fold: true install-and-configure: name: Install and upgrade nav_fold: true @@ -166,6 +178,9 @@ opensearch_collection: ml-commons-plugin: name: Machine learning nav_fold: true + automating-configurations: + name: Automating configurations + nav_fold: true monitoring-your-cluster: name: Monitoring your cluster nav_fold: true @@ -187,6 +202,7 @@ opensearch_collection: developer-documentation: name: Developer documentation nav_fold: true + clients_collection: collections: diff --git a/_dashboards/csp/csp-dynamic-configuration.md b/_dashboards/csp/csp-dynamic-configuration.md new file mode 100644 index 0000000000..2101a83734 --- /dev/null +++ b/_dashboards/csp/csp-dynamic-configuration.md @@ -0,0 +1,50 @@ +--- +layout: default +title: Configuring Content Security Policy rules dynamically +nav_order: 110 +has_children: false +--- + +# Configuring Content Security Policy rules dynamically +Introduced 2.13 +{: .label .label-purple } + +Content Security Policy (CSP) is a security standard intended to prevent cross-site scripting (XSS), `clickjacking`, and other code injection attacks resulting from the execution of malicious content in the trusted webpage context. OpenSearch Dashboards supports configuring CSP rules in the `opensearch_dashboards.yml` file by using the `csp.rules` key. A change in the YAML file requires a server restart, which may interrupt service availability. You can, however, configure the CSP rules dynamically through the `applicationConfig` plugin without restarting the server. + +## Configuration + +The `applicationConfig` plugin provides read and write APIs that allow OpenSearch Dashboards users to manage dynamic configurations as key-value pairs in an index. 
The `cspHandler` plugin registers a pre-response handler to `HttpServiceSetup`, which gets CSP rules from the dependent `applicationConfig` plugin and then rewrites to the CSP header. Enable both plugins within your `opensearch_dashboards.yml` file to use this feature. The configuration is shown in the following example. Refer to the `cspHandler` plugin [README](https://github.com/opensearch-project/OpenSearch-Dashboards/blob/main/src/plugins/csp_handler/README.md) for configuration details. + +``` +application_config.enabled: true +csp_handler.enabled: true +``` + +## Enable site embedding for OpenSearch Dashboards + +To enable site embedding for OpenSearch Dashboards, update the CSP rules using CURL. When using CURL commands with single quotation marks inside the `data-raw` parameter, escape them with a backslash (`\`). For example, use `'\''` to represent `'`. The configuration is shown in the following example. Refer to the `applicationConfig` plugin [README](https://github.com/opensearch-project/OpenSearch-Dashboards/blob/main/src/plugins/application_config/README.md) for configuration details. + +``` +curl '{osd endpoint}/api/appconfig/csp.rules' -X POST -H 'Accept: application/json' -H 'Content-Type: application/json' -H 'osd-xsrf: osd-fetch' -H 'Sec-Fetch-Dest: empty' --data-raw '{"newValue":"script-src '\''unsafe-eval'\'' '\''self'\''; worker-src blob: '\''self'\''; style-src '\''unsafe-inline'\'' '\''self'\''; frame-ancestors '\''self'\'' {new site}"}' +``` + +## Delete CSP rules + +Use the following CURL command to delete CSP rules: + +``` +curl '{osd endpoint}/api/appconfig/csp.rules' -X DELETE -H 'osd-xsrf: osd-fetch' -H 'Sec-Fetch-Dest: empty' +``` + +## Get CSP rules + +Use the following CURL command to get CSP rules: + +``` +curl '{osd endpoint}/api/appconfig/csp.rules' + +``` + +## Precedence + +Dynamic configurations override YAML configurations, except for empty CSP rules. To prevent `clickjacking`, a `frame-ancestors: self` directive is automatically added to YAML-defined rules when necessary. diff --git a/_dashboards/dashboards-assistant/index.md b/_dashboards/dashboards-assistant/index.md new file mode 100644 index 0000000000..d44e6b58e8 --- /dev/null +++ b/_dashboards/dashboards-assistant/index.md @@ -0,0 +1,124 @@ +--- +layout: default +title: OpenSearch Assistant for OpenSearch Dashboards +nav_order: 3 +has_children: false +has_toc: false +--- + +Note that machine learning models are probabilistic and that some may perform better than others, so the OpenSearch Assistant may occasionally produce inaccurate information. We recommend evaluating outputs for accuracy as appropriate to your use case, including reviewing the output or combining it with other verification factors. +{: .important} + +# OpenSearch Assistant for OpenSearch Dashboards +**Introduced 2.13** +{: .label .label-purple } + +The OpenSearch Assistant toolkit helps you create AI-powered assistants for OpenSearch Dashboards without requiring you to have specialized query tools or skills. 
+ +## Enabling OpenSearch Assistant + +To enable **OpenSearch Assistant** in OpenSearch Dashboards, locate your copy of the `opensearch_dashboards.yml` file and set the following option: + +``` +assistant.chat.enabled: true +``` +{% include copy-curl.html %} + +Then configure the root `agent_id` through the following API: + +``` +PUT .plugins-ml-config/_doc/os_chat +{ + "type":"os_chat_root_agent", + "configuration":{ + "agent_id": "your root agent id" + } +} +``` +{% include copy-curl.html %} + +This example shows a system index. In security-enabled domains, only super admins have permission to execute this code. For information about making super admin calls, see the [System indexes]({{site.url}}{{site.baseurl}}/security/configuration/system-indices/) guide. For access permission, contact your IT administrator. +{: .warning} + +Next, restart the OpenSearch Dashboards server. Following a successful restart, **OpenSearch Assistant** appears in the OpenSearch Dashboards interface. + +A screenshot of the interface is shown in the following image. + +OpenSearch Assistant interface + +## Configuring OpenSearch Assistant + +You can use the OpenSearch Dashboards interface to configure OpenSearch Assistant. Go to the [Getting started guide](https://github.com/opensearch-project/dashboards-assistant/blob/main/GETTING_STARTED_GUIDE.md) for step-by-step instructions. For the chatbot template, go to the [Flow Framework plugin](https://github.com/opensearch-project/flow-framework) documentation. You can modify this template to use your own model and customize the chatbot tools. + +For information about configuring OpenSearch Assistant through the REST API, see [OpenSearch Assistant Toolkit]({{site.url}}{{site.baseurl}}/ml-commons-plugin/opensearch-assistant/). + +## Using OpenSearch Assistant in OpenSearch Dashboards + +The following tutorials guide you through using OpenSearch Assistant in OpenSearch Dashboards. OpenSearch Assistant can be viewed in full frame or in the sidebar. The default view is in the right sidebar. To view the assistant in the left sidebar or in full frame, select the {::nomarkdown}frame icon{:/} icon in the toolbar and choose the preferred option. + +### Start a conversation + +Start a conversation by entering a prompt in the **Ask a question** search box or by using the shortcut `ctrl + /`. Select **Go** to initiate the conversation. A response is generated. + +The following screenshot shows an example prompt and response. + +Prompt and response using OpenSearch Assistant in OpenSearch Dashboards + +### Regenerate a response + +Beneath the response, select the regenerate icon to generate an alternative answer to your original question. The new answer will replace the previous one, appearing in both the interface and the chat history. A regenerated example is shown in the following image. + +Regenerated response + +### Suggested prompts + +OpenSearch Assistant suggests prompts to help you get started, build upon your existing prompts, or explore other queries you may not have considered, among other reasons. Select a suggested prompt listed beneath the response field. A screenshot is shown in the following image. + +Suggested prompts + +### Rate a response + +To rate a response, select the thumbs up or thumbs down icon. A screenshot of the interface is shown in the following image. The feedback is stored in the `additional_info` field of the message index. + +### Response generation + +Learn how a response is generated by selecting the **How was this generated?** option. 
This option is included within the available suggestions to help you understand which tools were involved in creating the response. If multiple tools were involved, each step will display the tool name and its input and output. This feature can be useful for troubleshooting. A screenshot is shown in the following image. + +Response generation details + +### Resume previous conversations + +To view a previous conversation, select the clock icon to open the conversation history panel and display the chat history. The conversation history can also be searched by conversation name. A screenshot is shown in the following image. + +Conversation history + +#### Edit and delete previous conversations + +Select the pencil icon to rename a conversation. Select the **Confirm name** button to save the new name. A screenshot is shown in the following image. + +Editing a conversation name + +Select the trash can icon to delete a conversation. Once the confirmation dialog appears, select **Delete conversation**. The conversation is now deleted from your chat history. A screenshot is shown in the following image. + +Deleting a conversation + +### Share a conversation through Notebooks + +You can use [Notebooks]({{site.url}}{{site.baseurl}}/observing-your-data/notebooks/) to save your conversations. To use this option, select **Save to notebook** from the dropdown menu to the right of **OpenSearch Assistant**. Enter a name for the notebook, then select **Save**. A pop-up message in the lower-right corner confirms the conversation has been saved. + +All conversations (prompts and responses/questions and answers) between you and the large language model (LLM) will be saved to this notebook. + +To open the saved notebook or view a list of other notebooks, select **Observability** > **Notebooks** from the OpenSearch Dashboards navigation menu. + +A screenshot of the Notebooks interface with a list of saved conversations is shown in the following image. + +Notebooks interface with saved OpenSearch Assistant conversations + +The following screenshot shows a saved conversation, along with actions you can take for the saved conversation. + +Notebooks interface with saved OpenSearch Assistant conversations + +## Related articles + +- [Getting started guide for OpenSearch Assistant in OpenSearch Dashboards](https://github.com/opensearch-project/dashboards-assistant/blob/main/GETTING_STARTED_GUIDE.md) +- [OpenSearch Assistant configuration through the REST API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/opensearch-assistant/) diff --git a/_dashboards/discover/index-discover.md b/_dashboards/discover/index-discover.md index bf1e4a739f..4e32c4f185 100644 --- a/_dashboards/discover/index-discover.md +++ b/_dashboards/discover/index-discover.md @@ -1,17 +1,17 @@ --- layout: default -title: Discover +title: Analyzing data nav_order: 20 has_children: true +redirect_from: + - /dashboards/discover/index-discover/ --- -# Discover +# Analyzing data -**Discover** is a tool for exploring your data in OpenSearch Dashboards. You can use **Discover** to visually represent your data on a dashboard and provide a high-level view of key metrics. +To analyze your data in OpenSearch and visualize key metrics, you can use the **Discover** application in OpenSearch Dashboards. An example of data analysis in **Discover** is shown in the following image. -The following image represents a typical **Discover** page using sample data.
- -Discover start screen +A Discover default page ## Getting started @@ -19,70 +19,57 @@ In this tutorial, you'll learn about using **Discover** to: - Add data. - Interpret and visualize data. -- Share the data findings. +- Share data findings. - Set alerts. -## Prerequisites - -The following are prerequisites for using **Discover**: - -- Install [OpenSearch Dashboards 2.10 or later](https://opensearch.org/downloads.html). -- Add OpenSearch [sample data]({{site.url}}{{site.baseurl}}/dashboards/quickstart/) or import your own data into OpenSearch. -- Have a foundational understanding of OpenSearch [documents and indexes]({{site.url}}{{site.baseurl}}/im-plugin/index/). - -## Adding data - -Data must be added to OpenSearch before it can be analyzed. In this tutorial, you'll use the sample data. To learn about importing your own data, see [Managing indexes]({{site.url}}{{site.baseurl}}/im-plugin/index/). - -To add the sample data, follow these steps: - -1. On the OpenSearch Dashboards home page, choose **Add sample data**. -2. Choose the desired sample data and select the **Add data** button. A screenshot of the **Add sample data** interface is shown in the following image. - -Add sample data interface +Before getting started, make sure you: +- Install [OpenSearch Dashboards](https://opensearch.org/downloads.html). +- Add sample data or import your own data into OpenSearch. Go to the [OpenSearch Dashboards quickstart guide]({{site.url}}{{site.baseurl}}/dashboards/quickstart/) to learn about adding sample datasets. Go to [Managing indexes]({{site.url}}{{site.baseurl}}/im-plugin/index/) to learn about importing your own data. +- Have a foundational understanding of [OpenSearch documents and indexes]({{site.url}}{{site.baseurl}}/im-plugin/index/). + ## Defining the search To define a search, follow these steps: 1. On the OpenSearch Dashboards navigation menu, select **Discover**. 2. Choose the data you want to work with. In this case, choose `opensearch_dashboards_sample_data_flights` from the upper-left dropdown menu. -3. Select the calendar icon ({::nomarkdown}calendar icon{:/}) to change the time range of your search and then select **Refresh**. +3. Select the {::nomarkdown}calendar icon{:/} icon to change the time range of your search and then select **Refresh**. -You'll see a view similar to the one in the following image. +The resulting view is shown in the following image. Discover interface showing search of flight sample data for Last 7 days -## Adding data fields and viewing data details +## Analyzing document tables -The document table contains document data. Each row represents a single document, and each column contains a different document field representing metrics such as flight destination, average ticket price, and flight delay. You can add, delete, or modify data fields in a document table as needed to meet your data analysis requirements. +In OpenSearch, a document table stores unstructured data. In a document table, each row represents a single document, and each column contains document attributes. -To add or delete fields in a document table, follow these steps: +To examine document attributes, follow these steps: -1. View the data fields listed under **Available fields** and select the plus icon ({::nomarkdown}plus icon{:/}) to add the desired fields to the document table. The field will be automatically added to both **Selected fields** and the document table. For this example, choose the fields `Carrier`, `AvgTicketPrice`, and `Dest`. -2. 
To arrange or sort the columns, select **Sort fields** > **Pick fields to sort by** and then drag and drop the fields in the order you want them to be ordered. +1. From the data table's left column, choose the {::nomarkdown}inspect icon{:/} icon to open the **Document Details** window. Select the {::nomarkdown}minimize icon{:/} icon to close the **Document Details** window. +2. Examine the metadata. You can switch between the **Table** and **JSON** tabs to view the data in your preferred format. +3. Select **View surrounding documents** to view data for other log entries either preceding or following your current document or select **View single document** to view a particular log entry. -You'll see a view similar to the one in the following image. +The resulting view is shown in the following image. -Discover interface showing adding and sorting data fields +Document attributes -You can view individual or multiple fields in the document table. To gather information about the data in the document table, follow these steps: +To add or delete fields in a document table, follow these steps: -1. From the data table's left-side column, choose the inspect icon ({::nomarkdown}inspect icon{:/}) to open the **Document Details** window. Select the minimize icon ({::nomarkdown}minimize icon{:/}) to close the **Document Details** window. -2. Review the data details. You can switch between the **Table** and **JSON** tabs to view the data in your preferred format. -3. Select **View surrounding documents** to view data for other log entries either preceding or following your current document or select **View single document** to view a particular log entry. +1. View the data fields listed under **Available fields** and select the {::nomarkdown}plus icon{:/} icon to add the desired fields to the document table. The field will be automatically added to both **Selected fields** and the document table. For this example, choose the fields `Carrier`, `AvgTicketPrice`, and `Dest`. +2. Select **Sort fields** > **Pick fields to sort by**. Drag and drop the chosen fields in the desired sort order. -You'll see a view similar to the one in the following image. +The resulting view is shown in the following image. -Document details interface +Adding and deleting data fields -## Searching the data +## Searching data You can use the search toolbar to enter a [DQL]({{site.url}}{{site.baseurl}}/dashboards/discover/dql/) or [query string]({{site.url}}{{site.baseurl}}/query-dsl/full-text/query-string/) query. The search toolbar is best for basic queries; for full query and filter capability, use [query domain-specific language (DSL)]({{site.url}}{{site.baseurl}}/query-dsl/index/) in the [Dev Tools console]({{site.url}}{{site.baseurl}}/dashboards/dev-tools/index-dev/). For more information, see [Discover and Dashboard search toolbar]({{site.url}}{{site.baseurl}}/dashboards/index/#discover-and-dashboard-search-bar). -## Filtering the data +## Filtering data Filters allow you to narrow the results of a query by specifying certain criteria. You can filter by field, value, or range. The **Add filter** pop-up suggests the available fields and operators. @@ -91,33 +78,36 @@ To filter your data, follow these steps: 1. Under the DQL search bar, choose **Add filter**. 2. Select the desired options from the **Field**, **Operator**, and **Value** dropdown lists. For example, select `Cancelled`, `is`, and `true`. 3. Choose **Save**. -4. To remove the filter, choose the cross icon ({::nomarkdown}cross icon{:/}) next to the filter name. 
-5. Add more filters to further explore the data. +4. To remove a filter, choose the {::nomarkdown}cross icon{:/} icon to the right of the filter name. + +The resulting view is shown in the following image. + +Visualize data findings interface ## Saving a search To save your search, including the query text, filters, and current data view, follow these steps: -1. Select **Save** in the upper-right corner. -2. Give the search a title, and then choose **Save**. -3. Select **Open** to access the saved search. +1. Select **Save** on the upper-right toolbar. +2. Add a title, and then choose **Save**. +3. Select **Open** on the upper-right toolbar to access your saved searches. -## Creating data visualizations through Discover +## Visualizing data findings -To create visualizations of the data findings using the **Discover** app, follow these steps: +To visualize your data findings, follow these steps: -1. Select the inspect icon ({::nomarkdown}inspect icon{:/}) next to the field you want to visualize. +1. Select the {::nomarkdown}inspect icon{:/} icon to the right of the field you want to visualize. - You'll see a view similar to the following image. + The resulting view is shown in the following image. Visualize data findings interface -2. Select the **Visualize** button. The **Visualize** app opens and a visualization is displayed. Learn more about the **Visualize** app and data visualizations in [Building data visualizations]({{site.url}}{{site.baseurl}}/dashboards/visualize/viz-index/). +2. Select the **Visualize** button. When the **Visualize** application is launched, a visualization appears. - You'll see a view similar to the following image. + The resulting view is shown in the following image. Data visualization of flight sample data field destination ## Setting alerts -You can set alerts to notify you when your data changes beyond the thresholds you define. To learn more about using **Discover** to create and manage alerts, see [Alerting dashboards and visualizations]({{site.url}}{{site.baseurl}}/observing-your-data/alerting/dashboards-alerting/). +Set alerts to notify you when your data exceeds your specified thresholds. Go to [Alerting dashboards and visualizations]({{site.url}}{{site.baseurl}}/observing-your-data/alerting/dashboards-alerting/) to learn about creating and managing alerts. diff --git a/_dashboards/discover/time-filter.md b/_dashboards/discover/time-filter.md index 0241c623f5..288138d079 100644 --- a/_dashboards/discover/time-filter.md +++ b/_dashboards/discover/time-filter.md @@ -1,10 +1,11 @@ --- layout: default title: Time filter -parent: Discover +parent: Analyzing data nav_order: 20 redirect_from: - /dashboards/get-started/time-filter/ + - /dashboards/discover/time-filter/ --- # Time filter @@ -16,7 +17,7 @@ The default time range is **Last 15 minutes**. You can change the time range at To change the time range at the dashboard level, follow these steps: -1. From an OpenSearch Dashboards application (Discover, Dashboards, or Visualize), select the calendar icon ({::nomarkdown}calendar icon{:/}) on the right of the search bar. +1. From an OpenSearch Dashboards application (Discover, Dashboards, or Visualize), select the {::nomarkdown}calendar icon{:/} icon on the right of the search bar. 2. Select one of the time filter options, as shown in the following image: - **Quick select:** Choose a time based on the last or next number of seconds, minutes, hours, days, or another time unit. 
- **Commonly used:** Choose a common time range like **Today**, **Last 7 days**, or **Last 30 days**. diff --git a/_dashboards/dql.md b/_dashboards/dql.md index aaf2d1a15d..7ddcbc6d1b 100644 --- a/_dashboards/dql.md +++ b/_dashboards/dql.md @@ -20,7 +20,7 @@ DQL and query string query (Lucene) language are the two search bar language opt To follow this tutorial in OpenSearch Dashboards, expand the following setup steps. -
+
Setup diff --git a/_dashboards/management/index-patterns.md b/_dashboards/management/index-patterns.md index 590a9675a2..37baa210e9 100644 --- a/_dashboards/management/index-patterns.md +++ b/_dashboards/management/index-patterns.md @@ -56,7 +56,7 @@ An example of step 1 is shown in the following image. Note that the index patter Once the index pattern has been created, you can view the mapping of the matching indexes. Within the table, you can see the list of fields, along with their data type and properties. An example is shown in the following image. -Index pattern table UI +Index pattern table UI ## Next steps diff --git a/_dashboards/management/management-index.md b/_dashboards/management/management-index.md index c1757893ea..7edc4d06c2 100644 --- a/_dashboards/management/management-index.md +++ b/_dashboards/management/management-index.md @@ -9,7 +9,7 @@ has_children: true Introduced 2.10 {: .label .label-purple } -Dashboards Management serves as the command center for customizing OpenSearch Dashboards to your needs. A view of the interface is shown in the following image. +**Dashboards Management** serves as the command center for customizing OpenSearch Dashboards to your needs. A view of the interface is shown in the following image. Dashboards Management interface @@ -18,9 +18,9 @@ Dashboards Management serves as the command center for customizing OpenSearch Da ## Applications -The following applications are available in Dashboards Management: +The following applications are available in **Dashboards Management**: - **[Index Patterns]({{site.url}}{{site.baseurl}}/dashboards/management/index-patterns/):** To access OpenSearch data, you need to create an index pattern so that you can select the data you want to use and define the properties of the fields. The Index Pattern tool gives you the ability to create an index pattern from within the UI. Index patterns point to one or more indexes, data streams, or index aliases. - **[Data Sources]({{site.url}}{{site.baseurl}}/dashboards/management/multi-data-sources/):** The Data Sources tool is used to configure and manage the data sources that OpenSearch uses to collect and analyze data. You can use the tool to specify the source configuration in your copy of the [OpenSearch Dashboards configuration file]({{site.url}}{{site.baseurl}}https://github.com/opensearch-project/OpenSearch-Dashboards/blob/main/config/opensearch_dashboards.yml). -- **Saved Objects:** The Saved Objects tool helps you organize and manage your saved objects. Saved objects are files that store data, such as dashboards, visualizations, and maps, for later use. +- **[Saved Objects](https://opensearch.org/blog/enhancement-multiple-data-source-import-saved-object/):** The Saved Objects tool helps you organize and manage your saved objects. Saved objects are files that store data, such as dashboards, visualizations, and maps, for later use. - **[Advanced Settings]({{site.url}}{{site.baseurl}}/dashboards/management/advanced-settings/):** The Advanced Settings tool gives you the flexibility to personalize the behavior of OpenSearch Dashboards. The tool is divided into settings sections, such as General, Accessibility, and Notifications, and you can use it to customize and optimize many of your Dashboards settings. 
diff --git a/_dashboards/management/multi-data-sources.md b/_dashboards/management/multi-data-sources.md index 2b5aadc81a..dd66101f80 100644 --- a/_dashboards/management/multi-data-sources.md +++ b/_dashboards/management/multi-data-sources.md @@ -3,7 +3,7 @@ layout: default title: Configuring and using multiple data sources parent: Data sources nav_order: 10 -redirect_from: +redirect_from: - /dashboards/discover/multi-data-sources/ --- @@ -11,23 +11,22 @@ redirect_from: You can ingest, process, and analyze data from multiple data sources in OpenSearch Dashboards. You configure the data sources in the **Dashboards Management** > **Data sources** app, as shown in the following image. - Dashboards Management Data sources main screen ## Getting started -The following tutorial guides you through configuring and using multiple data sources. +The following tutorial guides you through configuring and using multiple data sources. ### Step 1: Modify the YAML file settings To use multiple data sources, you must enable the `data_source.enabled` setting. It is disabled by default. To enable multiple data sources: 1. Open your local copy of the OpenSearch Dashboards configuration file, `opensearch_dashboards.yml`. If you don't have a copy, [`opensearch_dashboards.yml`](https://github.com/opensearch-project/OpenSearch-Dashboards/blob/main/config/opensearch_dashboards.yml) is available on GitHub. -2. Set `data_source.enabled:` to `true` and save the YAML file. +2. Set `data_source.enabled:` to `true` and save the YAML file. 3. Restart the OpenSearch Dashboards container. 4. Verify that the configuration settings were configured properly by connecting to OpenSearch Dashboards and viewing the **Dashboards Management** navigation menu. **Data sources** appears in the sidebar. You'll see a view similar to the following image. - Data sources in sidebar within Dashboards Management +Data sources in sidebar within Dashboards Management ### Step 2: Create a new data source connection @@ -36,16 +35,17 @@ A data source connection specifies the parameters needed to connect to a data so To create a new data source connection: 1. From the OpenSearch Dashboards main menu, select **Dashboards Management** > **Data sources** > **Create data source connection**. -2. Add the required information to each field to configure **Connection Details** and **Authentication Method**. - + +2. Add the required information to each field to configure the **Connection Details** and **Authentication Method**. + - Under **Connection Details**, enter a title and endpoint URL. For this tutorial, use the URL `http://localhost:5601/app/management/opensearch-dashboards/dataSources`. Entering a description is optional. - Under **Authentication Method**, select an authentication method from the dropdown list. Once an authentication method is selected, the applicable fields for that method appear. You can then enter the required details. The authentication method options are: - - **No authentication**: No authentication is used to connect to the data source. - - **Username & Password**: A basic username and password are used to connect to the data source. - - **AWS SigV4**: An AWS Signature Version 4 authenticating request is used to connect to the data source. AWS Signature Version 4 requires an access key and a secret key. - - For AWS Signature Version 4 authentication, first specify the **Region**. Next, select the OpenSearch service in the **Service Name** list. 
The options are **Amazon OpenSearch Service** and **Amazon OpenSearch Serverless**. Last, enter the **Access Key** and **Secret Key** for authorization. - + - **No authentication**: No authentication is used to connect to the data source. + - **Username & Password**: A basic username and password are used to connect to the data source. + - **AWS SigV4**: An AWS Signature Version 4 authenticating request is used to connect to the data source. AWS Signature Version 4 requires an access key and a secret key. + - For AWS Signature Version 4 authentication, first specify the **Region**. Next, select the OpenSearch service from the **Service Name** list. The options are **Amazon OpenSearch Service** and **Amazon OpenSearch Serverless**. Last, enter the **Access Key** and **Secret Key** for authorization. + For information about available AWS Regions for AWS accounts, see [Available Regions](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-regions-availability-zones.html#concepts-available-regions). For more information about AWS Signature Version 4 authentication requests, see [Authenticating Requests (AWS Signature Version 4)](https://docs.aws.amazon.com/AmazonS3/latest/API/sig-v4-authenticating-requests.html). {: .note} @@ -58,12 +58,11 @@ To create a new data source connection: - To make changes to the data source connection, select a connection in the list on the **Data Sources** main page. The **Connection Details** window opens. - To make changes to **Connection Details**, edit one or both of the **Title** and **Description** fields and select **Save changes** in the lower-right corner of the screen. You can also cancel changes here. To change the **Authentication Method**, choose a different authentication method, enter your credentials (if applicable), and then select **Save changes** in the lower-right corner of the screen. The changes are saved. - + - When **Username & Password** is the selected authentication method, you can update the password by choosing **Update stored password** next to the **Password** field. In the pop-up window, enter a new password in the first field and then enter it again in the second field to confirm. Select **Update stored password** in the pop-up window. The new password is saved. Select **Test connection** to confirm that the connection is valid. - - When **AWS SigV4** is the selected authentication method, you can update the credentials by selecting **Update stored AWS credential**. In the pop-up window, enter a new access key in the first field and a new secret key in the second field. Select **Update stored AWS credential** in the pop-up window. The new credentials are saved. Select **Test connection** in the upper-right corner of the screen to confirm that the connection is valid. -5. Delete the data source connection by selecting the check box to the left of the title and then choosing **Delete 1 connection**. Selecting multiple check boxes for multiple connections is supported. Alternatively, select the trash can icon ({::nomarkdown}trash can icon{:/}). +5. Delete the data source connection by selecting the check box to the left of the title and then choosing **Delete 1 connection**. Selecting multiple check boxes for multiple connections is supported. Alternatively, select the {::nomarkdown}trash can icon{:/} icon. An example data source connection screen is shown in the following image. @@ -71,7 +70,7 @@ An example data source connection screen is shown in the following image. 
### Selecting multiple data sources through the Dev Tools console -Alternatively, you can select multiple data sources through the [Dev Tools]({{site.url}}{{site.baseurl}}/dashboards/dev-tools/index-dev/) console. This option provides for working with a broader range of data and gaining deeper insight into your code and applications. +Alternatively, you can select multiple data sources through the [Dev Tools]({{site.url}}{{site.baseurl}}/dashboards/dev-tools/index-dev/) console. This option allows you to work with a broader range of data and gain a deeper understanding of your code and applications. Watch the following 10-second video to see it in action. @@ -79,7 +78,7 @@ Watch the following 10-second video to see it in action. To select a data source through the Dev Tools console, follow these steps: -1. Locate your copy of `opensearch_dashboards.yml` and open it in the editor of your choice. +1. Locate your copy of `opensearch_dashboards.yml` and open it in the editor of your choice. 2. Set `data_source.enabled` to `true`. 3. Connect to OpenSearch Dashboards and select **Dev Tools** in the menu. 4. Enter the following query in the editor pane of the **Console** and then select the play button: ``` {% include copy-curl.html %} -5. From the **DataSource** dropdown menu, select a data source and then query the source. +5. From the **Data source** dropdown menu, select a data source and then query the source. 6. Repeat the preceding steps for each data source you want to select. +### Upload saved objects to a dashboard from connected data sources + +To upload saved objects from connected data sources to a dashboard with multiple data sources, export them as an NDJSON file from the data source's **Saved object management** page. Then upload the file to the dashboard's **Saved object management** page. This method can simplify the transfer of saved objects between dashboards. The following 20-second video shows this feature in action. + +Multiple data sources in Saved object management{: .img-fluid} + +#### Import saved objects from a connected data source + +Follow these steps to import saved objects from a connected data source: + +1. Locate your `opensearch_dashboards.yml` file and open it in your preferred text editor. +2. Set `data_source.enabled` to `true`. +3. Connect to OpenSearch Dashboards and go to **Dashboards Management** > **Saved objects**. +4. Select **Import** > **Select file** and upload the file acquired from the connected data source. +5. Choose the appropriate **Data source** from the dropdown menu, set your **Conflict management** option, and then select the **Import** button. + +### Show or hide authentication methods for multiple data sources +Introduced 2.13 +{: .label .label-purple } + +A feature flag in your `opensearch_dashboards.yml` file allows you to show or hide authentication methods within the `data_source` plugin. The following example setting, shown in a 10-second demo, hides the authentication method for `AWSSigV4`. + +```` +# Set enabled to false to hide the authentication method from multiple data sources in OpenSearch Dashboards. +# If this setting is commented out, then all three options will be available in OpenSearch Dashboards. +# The default value will be considered as true.
+data_source.authTypes: + NoAuthentication: + enabled: true + UsernamePassword: + enabled: true + AWSSigV4: + enabled: false +```` + +Multiple data sources hide and show authentication{: .img-fluid} + +### Hide the local cluster option for multiple data sources +Introduced 2.13 +{: .label .label-purple } + +A feature flag in your `opensearch_dashboards.yml` file allows you to hide the local cluster option within the `data_source` plugin. This option hides the local cluster from the data source dropdown menu and index creation page, which is ideal for environments with or without a local OpenSearch cluster. The following example setting, shown in a 20-second demo, hides the local cluster. + +```` +# hide local cluster in the data source dropdown and index pattern creation page. +data_source.hideLocalCluster: true +```` + +Multiple data sources hide local cluster{: .img-fluid} + ## Next steps Once you've configured your multiple data sources, you can start exploring that data. See the following resources to learn more: @@ -106,5 +155,5 @@ Once you've configured your multiple data sources, you can start exploring that This feature has some limitations: * The multiple data sources feature is supported for index-pattern-based visualizations only. -* The visualization types Time Series Visual Builder (TSVB), Vega and Vega-Lite, and timeline are not supported. -* External plugins, such as Gantt chart, and non-visualization plugins, such as the developer console, are not supported. +* The Time Series Visual Builder (TSVB) and timeline visualization types are not supported. +* External plugins, such as `gantt-chart`, and non-visualization plugins are not supported. diff --git a/_dashboards/quickstart.md b/_dashboards/quickstart.md index 0c887e0d60..eccdeb7d6c 100644 --- a/_dashboards/quickstart.md +++ b/_dashboards/quickstart.md @@ -1,119 +1,118 @@ --- layout: default -title: Quickstart guide +title: OpenSearch Dashboards quickstart guide nav_order: 2 has_children: false redirect_from: - /dashboards/quickstart-dashboards/ --- -# Quickstart guide +# OpenSearch Dashboards quickstart guide -This quickstart guide covers the core concepts that you need to understand to get started with OpenSearch Dashboards. You'll learn how to: +This quickstart guide provides tutorials on using OpenSearch Dashboards applications and tools. You can use these tutorials, either in your own environment or on [OpenSearch Playground](https://playground.opensearch.org/app/home#/), to learn the following fundamental concepts: -- Add sample data. -- Explore and inspect data. -- Visualize data. +- **Adding sample data:** Use preloaded visualizations, dashboards, and other tools to explore OpenSearch Dashboards before adding your own data. +- **Using the Discover application:** Analyze your data to gain insights. +- **Using the Dashboards application:** Create and store data visualizations. +- **Turning dark mode on or off:** Change the Dashboards theme. -Here's a glance at the view you see when you open the **Dashboard** or **Discover** tool. +To dock or undock the navigation pane, select the {::nomarkdown}menu icon{:/} icon and then **Dock navigation** or **Undock navigation**. The OpenSearch Dashboards home page is shown in the following image. -Light and dark mode UI on Discover and Dashboard tools' home page +OpenSearch Dashboards home page {::nomarkdown}alert icon{:/} **Note**
Before you get started, make sure you've installed OpenSearch and OpenSearch Dashboards. For information about installation and configuration, see [Install and configure OpenSearch]({{site.url}}{{site.baseurl}}/install-and-configure/install-opensearch/index/) and [Install and configure OpenSearch Dashboards]({{site.url}}{{site.baseurl}}/install-and-configure/install-dashboards/index/). {: .note} -# Adding sample data -Sample datasets come with visualizations, dashboards, and other tools to help you explore Dashboards before you add your own data. To add sample data, perform the following steps: +## Adding sample data -1. Verify access to OpenSearch Dashboards by connecting to [http://localhost:5601](http://localhost:5601) from a browser. The default username and password are `admin`. -1. On the OpenSearch Dashboards **Home** page, choose **Add sample data**. -2. Choose **Add data** to add the datasets, as shown in the following image. +The following tutorials use the [**Sample flight data**](https://playground.opensearch.org/app/home#/tutorial_directory) dataset. +{: .note} - Sample datasets +To add sample data, follow these steps: -# Exploring and inspecting data +1. On the OpenSearch Dashboards **Home** page, choose **Add sample data**. Alternatively, choose **Add data** on the upper-right toolbar. +2. On the **Add sample data** page, choose the dataset(s) you want to add to Dashboards. The following image shows the available sample datasets. -In [**Discover**]({{site.url}}{{site.baseurl}}/dashboards/discover/index-discover/), you can: +Adding sample data window -- Choose data to explore, set a time range for that data, search it using [Dashboards Query Language (DQL)]({{site.url}}{{site.baseurl}}/dashboards/dql/), and filter the results. -- Explore the data, view individual documents, and create tables summarizing the data. -- Visualize your findings. +## Using the Discover application -## Try it: Getting familiar with Discover +With [**Discover**]({{site.url}}{{site.baseurl}}/dashboards/discover/index-discover/), you can: -1. On the OpenSearch Dashboards **Home** page, choose **Discover**. -1. Change the [time filter]({{site.url}}{{site.baseurl}}/dashboards/discover/time-filter/) to **Last 7 days**, as shown in the following image. +- Choose data for analysis, set a time range for that data, search it using [Dashboards Query Language (DQL)]({{site.url}}{{site.baseurl}}/dashboards/dql/), and filter the results. +- Analyze your data by querying and filtering, viewing results in a table, and examining documents. +- Create histograms to display the distribution of your data. - Time filter interface +Follow these steps to use the Discover tool: -2. Search using the DQL query `FlightDelay:true AND DestCountry: US AND FlightDelayMin >= 60` and then choose **Update**. You should see results for US-bound flights delayed by 60 minutes or more, as shown in the following image. - - DQL search field example +1. From the OpenSearch Dashboards navigation menu, choose **Discover**. +2. On the **Discover** page, choose the index pattern `opensearch_dashboards_sample_data_flights` from the dropdown menu on the upper left. +3. Select the {::nomarkdown}calendar icon{:/} icon to change the [time filter]({{site.url}}{{site.baseurl}}/dashboards/discover/time-filter/) from the default of **Last 15 minutes** to **Last 7 days**. +4. In the DQL search bar, enter `FlightDelay:true AND DestCountry: US AND FlightDelayMin >= 60` and select **Update**. 
Results are shown for US-bound flights delayed by 60 minutes or more. +5. Filter data by selecting **Add filter** from the DQL search bar and then selecting a **Field**, **Operator**, and **Value** from the dropdown lists in the **Edit Filter** pop-up window. For example, select `FlightDelayType`, **is**, and **Weather Delay**. -3. To filter data, choose **Add filter** and then select an **Available field**. For example, select `FlightDelayType`, **is**, and **Weather delay** from the **Field**, **Operator**, and **Value** dropdown lists, as shown in the following image. +The resulting view is shown in the following image. - Filter data by FlightDelayType field +Discover output of steps 1 through 6 -# Visualizing data +## Using the Dashboards application -Raw data can be difficult to comprehend and use. Data visualizations help you prepare and present data in a visual form. In **Dashboard** you can: +With **Dashboards**, you can: - Display data in a single view. - Build dynamic dashboards. - Create and share reports. - Embed analytics to differentiate your applications. -## Try it: Getting familiar with Dashboard - -1. On the OpenSearch Dashboards **Home** page, choose **Dashboard**. -1. Choose **[Flights] Global Flight Data** in the **Dashboards** window, as shown in the following image. +The **Dashboards** application creates and stores visualizations generated from your data. Follow these steps to use the application: - Data visualization dashboard +1. On the OpenSearch Dashboards **Home** page, choose **Dashboards**. A list of dashboards generated from the sample data appears. +2. In the search toolbar, search for and select **[Flights] Global Flight Dashboard**. You'll see a dashboard preloaded with visualizations, including charts, maps, and data tables. +3. To add other panels to the dashboard, select the **Edit** button and then choose **Add** from the toolbar. The **Add panels** window opens. +4. In the search toolbar in the **Add panels** window, search for and select the existing panel **[Flights] Delay Buckets**. A pop-up message confirms that you've added the panel. +5. Select close `x` to exit the **Add panels** window. +6. The newly added panel is now displayed on the dashboard. -1. To add panels to the dashboard, choose **Edit** and then **Add** from the toolbar. -1. In the **Add panels** window, choose the existing panel **[Flights] Delay Buckets**. You'll see a pop-up window on the lower right confirming that you've added the panel. -1. Select `x` to close the **Add panels** window. -1. View the added panel **[Flights] Delay Buckets**, which is added as the last panel on the dashboard, as shown in the following image. +The resulting view is shown in the following image. - Add panel to dashboard +Add panel tutorial screen view -## Try it: Creating a visualization panel +For information about using a specific data visualization type, such as VisBuilder, go to [Building data visualizations]({{site.url}}{{site.baseurl}}/dashboards/visualize/viz-index/). For information about using dashboards and visualizations in **Observability**, go to [Observability]({{site.url}}{{site.baseurl}}/observing-your-data/). +{: .note} -Continuing with the preceding dashboard, you'll create a bar chart comparing the number of canceled flights and delayed flights to delay type and then add the panel to the dashboard: +### Interacting with data using dashboards -1. Change the default [time range]({{site.url}}{{site.baseurl}}/dashboards/discover/time-filter/) from **24 hours** to **Last 7 days**. 
-1. In the toolbar, choose **Edit**, then **Create new**. -1. Select **VisBuilder** in the **New Visualizations** window. -1. In the **Data Source** dropdown list, choose `opensearch_dashboards_sample_data_flights`. -1. Drag the fields **Cancelled** and **FlightDelay** to the y-axis column. -1. Drag the field **FlightDelayType** to the x-axis column. -1. Choose **Save** and name the visualization in the **Title** field. -2. Choose **Save and return**. The following bar chart is added as the last panel on the dashboard, as shown in the following image. +Interactive dashboards allow you to analyze data in more depth and filter it in several ways. With **Dashboards**, you can use dashboard-level filters to directly interact with data. -Creating a visualization panel +Using the **[Flights] Global Flight Dashboard** dashboard, follow these steps to further analyze and filter the sample flight data: -# Interacting with data +1. On the **[Flights] Airline Carrier** panel, choose **OpenSearch-Air**. The dashboard updates automatically. +2. Choose **Save** to save the dashboard. -Interactive dashboards allow you analyze data in more depth and filter it in several ways. In Dashboards, you can interact directly with data on a dashboard by using dashboard-level filters. For example, continuing with the preceding dashboard, you can filter to show delays and cancellations for a specific airline. +Alternatively, you can use the dashboard toolbar to apply filters by following these steps: -## Try it: Interacting with the sample flight data +1. In the dashboard toolbar, choose **Add filter**. +2. From the **Field**, **Operator**, and **Value** dropdown lists, choose **Carrier**, **is**, and **OpenSearch-Air**, respectively, as shown in the following image. +3. Choose **Save**. The dashboard updates automatically. -1. On the **[Flights] Airline Carrier** panel, choose **OpenSearch-Air**. The dashboard updates automatically. -1. Choose **Save** to save the customized dashboard. +The resulting view is shown in the following image. -Alternatively, you can apply filters using the dashboard toolbar: +Screenshot of Dashboard tutorial panel view -1. In the dashboard toolbar, choose **Add filter**. -1. From the **Field**, **Operator**, and **Value** dropdown lists, choose **Carrier**, **is**, and **OpenSearch-Air**, respectively, as shown in the following image. +## Turning dark mode on or off - Edit field interface +Changing the Dashboards theme requires admin permissions. If you are an admin, follow these steps: -1. Choose **Save**. The dashboard updates automatically, and the result is the dashboard shown in the following image. +1. Navigate to **Management** > **Dashboards Management** > **Advanced Settings**. +2. Scroll down to the **Appearance** section and locate the **Dark mode** option. +3. Use the toggle switch to turn dark mode on or off for all users of your Dashboards instance, as shown in the image following these steps. +4. Select the **Save changes** button and then the **Reload** button. The updated theme is applied immediately. - Dashboard view after applying Carrier filter +Dark mode view -# Next steps +## Next steps -- **Visualize data**. To learn more about data visualizations in OpenSearch Dashboards, see [**Building data visualizations**]({{site.url}}{{site.baseurl}}/dashboards/visualize/viz-index/). -- **Create dashboards**. To learn more about creating dashboards in OpenSearch Dashboards, see [**Creating dashboards**]({{site.url}}{{site.baseurl}}/dashboards/quickstart-dashboards/). 
-- **Explore data**. To learn more about exploring data in OpenSearch Dashboards, see [**Exploring data**]({{site.url}}{{site.baseurl}}/dashboards/discover/index-discover/). \ No newline at end of file +- Go to [Building data visualizations]({{site.url}}{{site.baseurl}}/dashboards/visualize/viz-index/) to learn more about Dashboards data visualizations. +- Go to [Creating dashboards]({{site.url}}{{site.baseurl}}/dashboards/quickstart-dashboards/) to learn more about creating dashboards. +- Go to [Analyzing data]({{site.url}}{{site.baseurl}}/dashboards/discover/index-discover/) to learn more about using Dashboards to analyze data. +- Go to [Ingest APIs]({{site.url}}{{site.baseurl}}/api-reference/ingest-apis/index/) and [Ingest pipelines]({{site.url}}{{site.baseurl}}/ingest-pipelines/) to learn more about using OpenSearch for data ingestion. diff --git a/_dashboards/visualize/vega.md b/_dashboards/visualize/vega.md new file mode 100644 index 0000000000..7764d583a6 --- /dev/null +++ b/_dashboards/visualize/vega.md @@ -0,0 +1,192 @@ +--- +layout: default +title: Using Vega +parent: Building data visualizations +nav_order: 45 +--- + +# Using Vega + +[Vega](https://vega.github.io/vega/) and [Vega-Lite](https://vega.github.io/vega-lite/) are open-source, declarative language visualization tools that you can use to create custom data visualizations with your OpenSearch data and [Vega Data](https://vega.github.io/vega/docs/data/). These tools are ideal for advanced users comfortable with writing OpenSearch queries directly. Enable the `vis_type_vega` plugin in your `opensearch_dashboards.yml` file to write your [Vega specifications](https://vega.github.io/vega/docs/specification/) in either JSON or [HJSON](https://hjson.github.io/) format or to specify one or more OpenSearch queries within your Vega specification. By default, the plugin is set to `true`. The configuration is shown in the following example. For configuration details, refer to the `vis_type_vega` [README](https://github.com/opensearch-project/OpenSearch-Dashboards/blob/main/src/plugins/vis_type_vega/README.md). + +``` +vis_type_vega.enabled: true +``` + +The following image shows a custom Vega map created in OpenSearch. + +Map created using Vega visualization in OpenSearch Dashboards + +## Querying from multiple data sources + +If you have configured [multiple data sources]({{site.url}}{{site.baseurl}}/dashboards/management/multi-data-sources/) in OpenSearch Dashboards, you can use Vega to query those data sources. Within your Vega specification, add the `data_source_name` field under the `url` property to target a specific data source by name. By default, queries use data from the local cluster. You can assign individual `data_source_name` values to each OpenSearch query within your Vega specification. This allows you to query multiple indexes across different data sources in a single visualization. 
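+For example, a minimal sketch of the `url` block might look like the following, where `My Remote Cluster` and `my-sample-index` are placeholder names for a configured data source and an index on that data source:
+
+```
+url: {
+  // Queries the named data source; omit data_source_name to query the local cluster
+  data_source_name: My Remote Cluster
+  index: my-sample-index
+  %context%: true
+  body: {size: 0}
+}
+```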
+ +The following is an example Vega specification with `Demo US Cluster` as the specified `data_source_name`: + +``` +{ + $schema: https://vega.github.io/schema/vega/v5.json + config: { + kibana: {type: "map", latitude: 25, longitude: -70, zoom: 3} + } + data: [ + { + name: table + url: { + index: opensearch_dashboards_sample_data_flights + // This OpenSearchQuery will query from the Demo US Cluster datasource + data_source_name: Demo US Cluster + %context%: true + // Uncomment to enable time filtering + // %timefield%: timestamp + body: { + size: 0 + aggs: { + origins: { + terms: {field: "OriginAirportID", size: 10000} + aggs: { + originLocation: { + top_hits: { + size: 1 + _source: { + includes: ["OriginLocation", "Origin"] + } + } + } + distinations: { + terms: {field: "DestAirportID", size: 10000} + aggs: { + destLocation: { + top_hits: { + size: 1 + _source: { + includes: ["DestLocation"] + } + } + } + } + } + } + } + } + } + } + format: {property: "aggregations.origins.buckets"} + transform: [ + { + type: geopoint + projection: projection + fields: [ + originLocation.hits.hits[0]._source.OriginLocation.lon + originLocation.hits.hits[0]._source.OriginLocation.lat + ] + } + ] + } + { + name: selectedDatum + on: [ + {trigger: "!selected", remove: true} + {trigger: "selected", insert: "selected"} + ] + } + ] + signals: [ + { + name: selected + value: null + on: [ + {events: "@airport:mouseover", update: "datum"} + {events: "@airport:mouseout", update: "null"} + ] + } + ] + scales: [ + { + name: airportSize + type: linear + domain: {data: "table", field: "doc_count"} + range: [ + {signal: "zoom*zoom*0.2+1"} + {signal: "zoom*zoom*10+1"} + ] + } + ] + marks: [ + { + type: group + from: { + facet: { + name: facetedDatum + data: selectedDatum + field: distinations.buckets + } + } + data: [ + { + name: facetDatumElems + source: facetedDatum + transform: [ + { + type: geopoint + projection: projection + fields: [ + destLocation.hits.hits[0]._source.DestLocation.lon + destLocation.hits.hits[0]._source.DestLocation.lat + ] + } + {type: "formula", expr: "{x:parent.x, y:parent.y}", as: "source"} + {type: "formula", expr: "{x:datum.x, y:datum.y}", as: "target"} + {type: "linkpath", shape: "diagonal"} + ] + } + ] + scales: [ + { + name: lineThickness + type: log + clamp: true + range: [1, 8] + } + { + name: lineOpacity + type: log + clamp: true + range: [0.2, 0.8] + } + ] + marks: [ + { + from: {data: "facetDatumElems"} + type: path + interactive: false + encode: { + update: { + path: {field: "path"} + stroke: {value: "black"} + strokeWidth: {scale: "lineThickness", field: "doc_count"} + strokeOpacity: {scale: "lineOpacity", field: "doc_count"} + } + } + } + ] + } + { + name: airport + type: symbol + from: {data: "table"} + encode: { + update: { + size: {scale: "airportSize", field: "doc_count"} + xc: {signal: "datum.x"} + yc: {signal: "datum.y"} + tooltip: { + signal: "{title: datum.originLocation.hits.hits[0]._source.Origin + ' (' + datum.key + ')', connnections: length(datum.distinations.buckets), flights: datum.doc_count}" + } + } + } + } + ] +} +``` +{% include copy-curl.html %} diff --git a/_data-prepper/common-use-cases/anomaly-detection.md b/_data-prepper/common-use-cases/anomaly-detection.md new file mode 100644 index 0000000000..e7003558f1 --- /dev/null +++ b/_data-prepper/common-use-cases/anomaly-detection.md @@ -0,0 +1,210 @@ +--- +layout: default +title: Anomaly detection +parent: Common use cases +nav_order: 5 +--- + +# Anomaly detection + +You can use Data Prepper to train models and 
generate anomalies in near real time on time-series aggregated events. You can generate anomalies either on events generated within the pipeline or on events coming directly into the pipeline, like OpenTelemetry metrics. You can feed these tumbling window aggregated time-series events to the [`anomaly_detector` processor]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/anomaly-detector/), which trains a model and generates anomalies with a grade score. Then you can configure your pipeline to write the anomalies to a separate index to create document monitors and trigger fast alerting. + +## Metrics from logs + +The following pipeline receives logs from an HTTP source like FluentBit, extracts important values from the logs by matching the value in the `log` key against the [Grok Apache Common Log Format](https://httpd.apache.org/docs/2.4/logs.html#accesslog), and then forwards the grokked logs to both the `log-to-metrics-pipeline` pipeline and an OpenSearch index named `logs`. + +The `log-to-metrics-pipeline` pipeline receives the grokked logs from the `apache-log-pipeline-with-metrics` pipeline, aggregates them, and derives histogram metrics based on the values in the `clientip` and `request` keys. It then sends the histogram metrics to an OpenSearch index named `histogram_metrics` as well as to the `log-to-metrics-anomaly-detector-pipeline` pipeline. + +The `log-to-metrics-anomaly-detector-pipeline` pipeline receives the aggregated histogram metrics from the `log-to-metrics-pipeline` pipeline and sends them to the `anomaly_detector` processor to detect anomalies by using the Random Cut Forest algorithm. If the algorithm detects anomalies, it sends them to an OpenSearch index named `log-metric-anomalies`. + +```json +apache-log-pipeline-with-metrics: + source: + http: + # Provide the path for ingestion. ${pipelineName} will be replaced with pipeline name configured for this pipeline. + # In this case it would be "/apache-log-pipeline-with-metrics/logs". This will be the FluentBit output URI value. + path: "/${pipelineName}/logs" + processor: + - grok: + match: + log: [ "%{COMMONAPACHELOG_DATATYPED}" ] + sink: + - opensearch: + ... + index: "logs" + - pipeline: + name: "log-to-metrics-pipeline" + +log-to-metrics-pipeline: + source: + pipeline: + name: "apache-log-pipeline-with-metrics" + processor: + - aggregate: + # Specify the required identification keys + identification_keys: ["clientip", "request"] + action: + histogram: + # Specify the appropriate values for each the following fields + key: "bytes" + record_minmax: true + units: "bytes" + buckets: [0, 25000000, 50000000, 75000000, 100000000] + # Pick the required aggregation period + group_duration: "30s" + sink: + - opensearch: + ... + index: "histogram_metrics" + - pipeline: + name: "log-to-metrics-anomaly-detector-pipeline" + +log-to-metrics-anomaly-detector-pipeline: + source: + pipeline: + name: "log-to-metrics-pipeline" + processor: + - anomaly_detector: + # Specify the key on which to run anomaly detection + keys: [ "bytes" ] + mode: + random_cut_forest: + sink: + - opensearch: + ... + index: "log-metric-anomalies" +``` +{% include copy-curl.html %} + +## Metrics from traces + +You can derive metrics from traces and find anomalies in those metrics. In this example, the `entry-pipeline` pipeline receives trace data from the OpenTelemetry Collector and forwards it to the following pipelines: + +- `span-pipeline` –- Extracts the raw spans from the traces. 
The pipeline sends the raw spans to any indexes OpenSearch prefixed with `otel-v1-apm-span`. + +- `service-map-pipeline` –- Aggregates and analyzes the traces to create documents that represent connections between services. The pipeline sends these documents to an OpenSearch index named `otel-v1-apm-service-map`. You can then see a visualization of the service map through the [Trace Analytics]({{site.url}}{{site.baseurl}}/observing-your-data/trace/index/) plugin for OpenSearch Dashboards. + +- `trace-to-metrics-pipeline` -- Aggregates and derives histogram metrics from the traces based on the value of the `serviceName`. The pipeline then sends the derived metrics to an OpenSearch index named `metrics_for_traces` and to the `trace-to-metrics-anomaly-detector-pipeline` pipeline. + +The `trace-to-metrics-anomaly-detector-pipeline` pipeline receives the aggregated histogram metrics from the `trace-to-metrics-pipeline` and sends them to the `anomaly_detector` processor to detect anomalies by using the Random Cut Forest algorithm. If the algorithm detects any anomalies, it sends them to an OpenSearch index named `trace-metric-anomalies`. + +```json +entry-pipeline: + source: + otel_trace_source: + # Provide the path for ingestion. ${pipelineName} will be replaced with pipeline name configured for this pipeline. + # In this case it would be "/entry-pipeline/v1/traces". This will be endpoint URI path in OpenTelemetry Exporter + # configuration. + # path: "/${pipelineName}/v1/traces" + processor: + - trace_peer_forwarder: + sink: + - pipeline: + name: "span-pipeline" + - pipeline: + name: "service-map-pipeline" + - pipeline: + name: "trace-to-metrics-pipeline" + +span-pipeline: + source: + pipeline: + name: "entry-pipeline" + processor: + - otel_trace_raw: + sink: + - opensearch: + ... + index_type: "trace-analytics-raw" + +service-map-pipeline: + source: + pipeline: + name: "entry-pipeline" + processor: + - service_map: + sink: + - opensearch: + ... + index_type: "trace-analytics-service-map" + +trace-to-metrics-pipeline: + source: + pipeline: + name: "entry-pipeline" + processor: + - aggregate: + # Pick the required identification keys + identification_keys: ["serviceName"] + action: + histogram: + # Pick the appropriate values for each the following fields + key: "durationInNanos" + record_minmax: true + units: "seconds" + buckets: [0, 10000000, 50000000, 100000000] + # Pick the required aggregation period + group_duration: "30s" + sink: + - opensearch: + ... + index: "metrics_for_traces" + - pipeline: + name: "trace-to-metrics-anomaly-detector-pipeline" + +trace-to-metrics-anomaly-detector-pipeline: + source: + pipeline: + name: "trace-to-metrics-pipeline" + processor: + - anomaly_detector: + # Below Key will find anomalies in the max value of histogram generated for durationInNanos. + keys: [ "max" ] + mode: + random_cut_forest: + sink: + - opensearch: + ... + index: "trace-metric-anomalies" +``` +{% include copy-curl.html %} + +## OpenTelemetry metrics + +You can create a pipeline that receives OpenTelemetry metrics and detects anomalies in those metrics. In this example, `entry-pipeline` receives metrics from the OpenTelemetry Collector. If a metric is of type `GAUGE` and the name of the metric is `totalApiBytesSent`, the processor sends it to the `ad-pipeline` pipeline. 
+ +The `ad-pipeline` pipeline receives the metrics from the entry pipeline and performs anomaly detection on the metric values by using the [`anomaly_detector` processor]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/anomaly-detector/). + +```json +entry-pipeline: + source: + otel_metrics_source: + processor: + - otel_metrics: + route: + - gauge_route: '/kind = "GAUGE" and /name = "totalApiBytesSent"' + sink: + - pipeline: + name: "ad-pipeline" + routes: + - gauge_route + - opensearch: + ... + index: "otel-metrics" + +ad-pipeline: + source: + pipeline: + name: "entry-pipeline" + processor: + - anomaly_detector: + # Use "value" as the key on which anomaly detector needs to be run + keys: [ "value" ] + mode: + random_cut_forest: + sink: + - opensearch: + ... + index: otel-metrics-anomalies +``` +{% include copy-curl.html %} diff --git a/_data-prepper/common-use-cases/codec-processor-combinations.md b/_data-prepper/common-use-cases/codec-processor-combinations.md index ae1209e973..57185f2ce9 100644 --- a/_data-prepper/common-use-cases/codec-processor-combinations.md +++ b/_data-prepper/common-use-cases/codec-processor-combinations.md @@ -2,7 +2,7 @@ layout: default title: Codec processor combinations parent: Common use cases -nav_order: 25 +nav_order: 10 --- # Codec processor combinations diff --git a/_data-prepper/common-use-cases/event-aggregation.md b/_data-prepper/common-use-cases/event-aggregation.md new file mode 100644 index 0000000000..f6e2757d9a --- /dev/null +++ b/_data-prepper/common-use-cases/event-aggregation.md @@ -0,0 +1,135 @@ +--- +layout: default +title: Event aggregation +parent: Common use cases +nav_order: 25 +--- + +# Event aggregation + +You can use Data Prepper to aggregate data from different events over a period of time. Aggregating events can help to reduce unnecessary log volume and manage use cases like multiline logs that are received as separate events. The [`aggregate` processor]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/aggregate/) is a stateful processor that groups events based on the values for a set of specified identification keys and performs a configurable action on each group. + +The `aggregate` processor state is stored in memory. For example, in order to combine four events into one, the processor needs to retain pieces of the first three events. The state of an aggregate group of events is kept for a configurable amount of time. Depending on your logs, the aggregate action being used, and the number of memory options in the processor configuration, the aggregation could take place over a long period of time. + +## Basic usage + +The following example pipeline extracts the fields `sourceIp`, `destinationIp`, and `port` using the [`grok` processor]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/grok/) and then aggregates on those fields over a period of 30 seconds using the [`aggregate` processor]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/aggregate/) and the `put_all` action. At the end of the 30-second period, the aggregated log is sent to the OpenSearch sink. + +```json +aggregate_pipeline: + source: + http: + path: "/${pipelineName}/logs" + processor: + - grok: + match: + log: ["%{IPORHOST:sourceIp} %{IPORHOST:destinationIp} %{NUMBER:port:int}"] + - aggregate: + group_duration: "30s" + identification_keys: ["sourceIp", "destinationIp", "port"] + action: + put_all: + sink: + - opensearch: + ... 
+ index: aggregated_logs +``` +{% include copy-curl.html %} + +For example, consider the following batch of logs: + +```json +{ "log": "127.0.0.1 192.168.0.1 80", "status": 200 } +{ "log": "127.0.0.1 192.168.0.1 80", "bytes": 1000 } +{ "log": "127.0.0.1 192.168.0.1 80" "http_verb": "GET" } +``` +{% include copy-curl.html %} + +The `grok` processor will extract keys such that the log events will look like the following example. These events now have the data that the `aggregate` processor will need for the `identification_keys`. + +```json +{ "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "port": 80, "status": 200 } +{ "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "port": 80, "bytes": 1000 } +{ "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "port": 80, "http_verb": "GET" } +``` +{% include copy-curl.html %} + +After 30 seconds, the `aggregate` processor writes the following aggregated log to the sink: + +```json +{ "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "port": 80, "status": 200, "bytes": 1000, "http_verb": "GET" } +``` +{% include copy-curl.html %} + +## Removing duplicates + +You can remove duplicate entries by deriving keys from incoming events and specifying the `remove_duplicates` option for the `aggregate` processor. This action immediately processes the first event for a group and drops all following events in that group. + +In the following example, the first event is processed with the identification keys `sourceIp` and `destinationIp`: + +```json +{ "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "status": 200 } +``` +{% include copy-curl.html %} + +The pipeline will then drop the following event because it has the same keys: + +```json +{ "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "bytes": 1000 } +``` +{% include copy-curl.html %} + +The pipeline processes this event and creates a new group because the `sourceIp` is different: + +```json +{ "sourceIp": "127.0.0.2", "destinationIp": "192.168.0.1", "bytes": 1000 } +``` +{% include copy-curl.html %} + +## Log aggregation and conditional routing + +You can use multiple plugins to combine log aggregation with conditional routing. In this example, the pipeline `log-aggregate-pipeline` receives logs by using an HTTP client, like FluentBit, and extracts important values from the logs by matching the value in the `log` key against the [Apache Common Log Format](https://httpd.apache.org/docs/2.4/logs.html). + +Two of the values that the pipeline extracts from the logs with a Grok pattern include `response` and `clientip`. The `aggregate` processor then uses the `clientip` value, along with the `remove_duplicates` option, to drop any logs that contain a `clientip` that has already been processed within the given `group_duration`. + +Three routes, or conditional statements, exist in the pipeline. These routes separate the value of the response into `2xx`, `3xx`, `4xx`, and `5xx` responses. Logs with a `2xx` or `3xx` status are sent to the `aggregated_2xx_3xx` index, logs with a `4xx` status are sent to the `aggregated_4xx index`, and logs with a `5xx` status are sent to the `aggregated_5xx` index. + +```json +log-aggregate-pipeline: + source: + http: + # Provide the path for ingestion. ${pipelineName} will be replaced with pipeline name configured for this pipeline. + # In this case it would be "/log-aggregate-pipeline/logs". This will be the FluentBit output URI value. 
+ path: "/${pipelineName}/logs" + processor: + - grok: + match: + log: [ "%{COMMONAPACHELOG_DATATYPED}" ] + - aggregate: + identification_keys: ["clientip"] + action: + remove_duplicates: + group_duration: "180s" + route: + - 2xx_status: "/response >= 200 and /response < 300" + - 3xx_status: "/response >= 300 and /response < 400" + - 4xx_status: "/response >= 400 and /response < 500" + - 5xx_status: "/response >= 500 and /response < 600" + sink: + - opensearch: + ... + index: "aggregated_2xx_3xx" + routes: + - 2xx_status + - 3xx_status + - opensearch: + ... + index: "aggregated_4xx" + routes: + - 4xx_status + - opensearch: + ... + index: "aggregated_5xx" + routes: + - 5xx_status +``` diff --git a/_data-prepper/common-use-cases/log-analytics.md b/_data-prepper/common-use-cases/log-analytics.md index e8db781714..30a021b101 100644 --- a/_data-prepper/common-use-cases/log-analytics.md +++ b/_data-prepper/common-use-cases/log-analytics.md @@ -2,7 +2,7 @@ layout: default title: Log analytics parent: Common use cases -nav_order: 10 +nav_order: 30 --- # Log analytics diff --git a/_data-prepper/common-use-cases/log-enrichment.md b/_data-prepper/common-use-cases/log-enrichment.md new file mode 100644 index 0000000000..0d8ce4ab7d --- /dev/null +++ b/_data-prepper/common-use-cases/log-enrichment.md @@ -0,0 +1,400 @@ +--- +layout: default +title: Log enrichment +parent: Common use cases +nav_order: 35 +--- + +# Log enrichment + +You can perform different types of log enrichment with Data Prepper, including: + +- Filtering. +- Extracting key-value pairs from strings. +- Mutating events. +- Mutating strings. +- Converting lists to maps. +- Processing incoming timestamps. + +## Filtering + +Use the [`drop_events`]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/drop-events/) processor to filter out specific log events before sending them to a sink. For example, if you're collecting web request logs and only want to store unsuccessful requests, you can create the following pipeline, which drops any requests for which the response is less than 400 so that only log events with HTTP status codes of 400 and higher remain. + +```yaml +log-pipeline: + source: + ... + processor: + - grok: + match: + log: [ "%{COMMONAPACHELOG_DATATYPED}" ] + - drop_events: + drop_when: "/response < 400" + sink: + - opensearch: + ... + index: failure_logs +``` +{% include copy-curl.html %} + +The `drop_when` option specifies which events to drop from the pipeline. + +## Extracting key-value pairs from strings + +Log data often includes strings of key-value pairs. For example, if a user queries a URL that can be paginated, the HTTP logs might contain the following HTTP query string: + +```json +page=3&q=my-search-term +``` +{% include copy-curl.html %} + +To perform analysis using the search terms, you can extract the value of `q` from a query string. The [`key_value`]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/key-value/) processor provides robust support for extracting keys and values from strings. + +The following example combines the `split_string` and `key_value` processors to extract query parameters from an Apache log line: + +```yaml +pipeline: + ... + processor: + - grok: + match: + message: [ "%{COMMONAPACHELOG_DATATYPED}" ] + - split_string: + entries: + - source: request + delimiter: "?" 
+ - key_value: + source: "/request/1" + field_split_characters: "&" + value_split_characters: "=" + destination: query_params +``` +{% include copy-curl.html %} + +## Mutating events + +The different [mutate event]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/mutate-event/) processors let you rename, copy, add, and delete event entries. + +In this example, the first processor sets the value of the `debug` key to `true` if the key already exists in the event. The second processor only sets the `debug` key to `true` if the key doesn't exist in the event because `overwrite_if_key_exists` is set to `true`. + +```yaml +... +processor: + - add_entries: + entries: + - key: "debug" + value: true +... +processor: + - add_entries: + entries: + - key: "debug" + value: true + overwrite_if_key_exists: true +... +``` +{% include copy-curl.html %} + +You can also use a format string to construct new entries from existing events. For example, `${date}-${time}` will create a new entry based on the values of the existing entries `date` and `time`. + +For example, the following pipeline adds new event entries dynamically from existing events: + +```yaml +processor: + - add_entries: + entries: + - key: "key_three" + format: "${key_one}-${key_two} +``` +{% include copy-curl.html %} + +Consider the following incoming event: + +```json +{ + "key_one": "value_one", + "key_two": "value_two" +} +``` +{% include copy-curl.html %} + +The processor transforms it into an event with a new key named `key_three`, which combines values of other keys in the original event, as shown in the following example: + +```json +{ + "key_one": "value_one", + "key_two": "value_two", + "key_three": "value_one-value_two" +} +``` +{% include copy-curl.html %} + +## Mutating strings + +The various [mutate string]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/mutate-string/) processors offer tools that you can use to manipulate strings in incoming data. For example, if you need to split a string into an array, you can use the `split_string` processor: + +```yaml +... +processor: + - split_string: + entries: + - source: "message" + delimiter: "&" +... +``` +{% include copy-curl.html %} + +The processor will transform a string such as `a&b&c` into `["a", "b", "c"]`. + +## Converting lists to maps + +The [`list_to_map`]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/list-to-map/) processor, which is one of the mutate event processors, converts a list of objects in an event to a map. + +For example, consider the following processor configuration: + +```yaml +... +processor: + - list_to_map: + key: "name" + source: "A-car-as-list" + target: "A-car-as-map" + value_key: "value" + flatten: true +... 
+``` +{% include copy-curl.html %} + +The following processor will convert an event that contains a list of objects to a map like this: + +```json +{ + "A-car-as-list": [ + { + "name": "make", + "value": "tesla" + }, + { + "name": "model", + "value": "model 3" + }, + { + "name": "color", + "value": "white" + } + ] +} +``` +{% include copy-curl.html %} + +```json +{ + "A-car-as-map": { + "make": "tesla", + "model": "model 3", + "color": "white" + } +} +``` +{% include copy-curl.html %} + +As another example, consider an incoming event with the following structure: + +```json +{ + "mylist" : [ + { + "somekey" : "a", + "somevalue" : "val-a1", + "anothervalue" : "val-a2" + }, + { + "somekey" : "b", + "somevalue" : "val-b1", + "anothervalue" : "val-b2" + }, + { + "somekey" : "b", + "somevalue" : "val-b3", + "anothervalue" : "val-b4" + }, + { + "somekey" : "c", + "somevalue" : "val-c1", + "anothervalue" : "val-c2" + } + ] +} +``` +{% include copy-curl.html %} + +You can define the following options in the processor configuration: + +```yaml +... +processor: + - list_to_map: + key: "somekey" + source: "mylist" + target: "myobject" + flatten: true +... +``` +{% include copy-curl.html %} + +The processor modifies the event by adding the new `myobject` object: + +```json +{ + "myobject" : { + "a" : [ + { + "somekey" : "a", + "somevalue" : "val-a1", + "anothervalue" : "val-a2" + } + ], + "b" : [ + { + "somekey" : "b", + "somevalue" : "val-b1", + "anothervalue" : "val-b2" + }, + { + "somekey" : "b", + "somevalue" : "val-b3", + "anothervalue" : "val-b4" + } + ] + "c" : [ + { + "somekey" : "c", + "somevalue" : "val-c1", + "anothervalue" : "val-c2" + } + ] + } +} +``` +{% include copy-curl.html %} + +In many cases, you may want to flatten the array for each key. In these situations, you can choose which object to retain. The processor offers a choice of either first or last. For example, consider the following: + +```yaml +... +processor: + - list_to_map: + key: "somekey" + source: "mylist" + target: "myobject" + flatten: true + flattened_element: first +... +``` +{% include copy-curl.html %} + +The fields in the newly created `myobject` are then flattened accordingly: + +```json +{ + "myobject" : { + "a" : { + "somekey" : "a", + "somevalue" : "val-a1", + "anothervalue" : "val-a2" + }, + "b" : { + "somekey" : "b", + "somevalue" : "val-b1", + "anothervalue" : "val-b2" + } + "c" : { + "somekey" : "c", + "somevalue" : "val-c1", + "anothervalue" : "val-c2" + } + } +} +``` +{% include copy-curl.html %} + +## Processing incoming timestamps + +The [`date`]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/date/) processor parses the `timestamp` key from incoming events by converting it to International Organization for Standardization (ISO) 8601 format: + +```yaml +... + processor: + - date: + match: + - key: timestamp + patterns: ["dd/MMM/yyyy:HH:mm:ss"] + destination: "@timestamp" + source_timezone: "America/Los_Angeles" + destination_timezone: "America/Chicago" + locale: "en_US" +... 
+``` +{% include copy-curl.html %} + +If the preceding pipeline processes the following event: + +```json +{"timestamp": "10/Feb/2000:13:55:36"} +``` +{% include copy-curl.html %} + +It converts the event to the following format: + +```json +{ + "timestamp":"10/Feb/2000:13:55:36", + "@timestamp":"2000-02-10T15:55:36.000-06:00" +} +``` +{% include copy-curl.html %} + +### Generating timestamps + +The `date` processor can generate timestamps for incoming events if you specify `@timestamp` for the `destination` option: + +```yaml +... + processor: + - date: + from_time_received: true + destination: "@timestamp" +... +``` +{% include copy-curl.html %} + +### Deriving punctuation patterns + +The [`substitute_string`]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/substitute-string/) processor (which is one of the mutate string processors) lets you derive a punctuation pattern from incoming events. In the following example pipeline, the processor will scan incoming Apache log events and derive punctuation patterns from them: + +```yaml +processor: + - substitute_string: + entries: + - source: "message" + from: "[a-zA-Z0-9_]+" + to:"" + - source: "message" + from: "[ ]+" + to: "_" +``` +{% include copy-curl.html %} + +The following incoming Apache HTTP log: + +```json +[{"message":"10.10.10.11 - admin [19/Feb/2015:15:50:36 -0500] \"GET /big2.pdf HTTP/1.1\" 200 33973115 0.202 \"-\" \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36\""}] +``` + +Generates the following punctuation pattern: +```json +{"message":"..._-_[//:::_-]_\"_/._/.\"_._\"-\"_\"/._(;_)_/._(,_)_/..._/.\""} +``` +{% include copy-curl.html %} + +You can count these generated patterns by passing them through the [`aggregate`]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/aggregate/) processor with the `count` action. diff --git a/_data-prepper/common-use-cases/metrics-traces.md b/_data-prepper/common-use-cases/metrics-traces.md new file mode 100644 index 0000000000..c15eaa099b --- /dev/null +++ b/_data-prepper/common-use-cases/metrics-traces.md @@ -0,0 +1,51 @@ +--- +layout: default +title: Deriving metrics from traces +parent: Common use cases +nav_order: 20 +--- + +# Deriving metrics from traces + +You can use Data Prepper to derive metrics from OpenTelemetry traces. The following example pipeline receives incoming traces and extracts a metric called `durationInNanos`, aggregated over a tumbling window of 30 seconds. It then derives a histogram from the incoming traces. + +The pipeline contains the following pipelines: + +- `entry-pipeline` – Receives trace data from the OpenTelemetry collector and forwards it to the `trace_to_metrics_pipeline` pipeline. + +- `trace-to-metrics-pipeline` - Receives the trace data from the `entry-pipeline` pipeline, aggregates it, and derives a histogram of `durationInNanos` from the traces based on the value of the `serviceName` field. It then sends the derived metrics to the OpenSearch index called `metrics_for_traces`. + +```json +entry-pipeline: + source: + otel_trace_source: + # Provide the path for ingestion. ${pipelineName} will be replaced with pipeline name. + # In this case it would be "/entry-pipeline/v1/traces". This will be endpoint URI path in OpenTelemetry Exporter configuration. 
+ path: "/${pipelineName}/v1/traces" + sink: + - pipeline: + name: "trace-to-metrics-pipeline" + +trace-to-metrics-pipeline: + source: + pipeline: + name: "entry-pipeline" + processor: + - aggregate: + # Pick the required identification keys + identification_keys: ["serviceName"] + action: + histogram: + # Pick the appropriate values for each of the following fields + key: "durationInNanos" + record_minmax: true + units: "seconds" + buckets: [0, 10000000, 50000000, 100000000] + # Specify an aggregation period + group_duration: "30s" + sink: + - opensearch: + ... + index: "metrics_for_traces" +``` +{% include copy-curl.html %} diff --git a/_data-prepper/common-use-cases/s3-logs.md b/_data-prepper/common-use-cases/s3-logs.md index 2987c9a677..7986a7eef8 100644 --- a/_data-prepper/common-use-cases/s3-logs.md +++ b/_data-prepper/common-use-cases/s3-logs.md @@ -2,7 +2,7 @@ layout: default title: S3 logs parent: Common use cases -nav_order: 20 +nav_order: 40 --- # S3 logs diff --git a/_data-prepper/common-use-cases/sampling.md b/_data-prepper/common-use-cases/sampling.md new file mode 100644 index 0000000000..7c77e8c3f2 --- /dev/null +++ b/_data-prepper/common-use-cases/sampling.md @@ -0,0 +1,78 @@ +--- +layout: default +title: Sampling +parent: Common use cases +nav_order: 45 +--- + +# Sampling + +Data Prepper provides the following sampling capabilities: + +- Time sampling +- Percentage sampling +- Tail sampling + +## Time sampling + +You can use the `rate_limiter` action within the [`aggregate` processor]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/aggregate/) to limit the number of events that can be processed per second. You can choose to either drop excess events or carry them forward to the next time period. + +In the following example, only 100 events with a status code of `200` are sent to the sink per second from a given IP address. The `when_exceeds` option is set to `drop`, which means that all excess events from the configured time window will be dropped. + +```json +... + processor: + - aggregate: + identification_keys: ["clientip"] + action: + rate_limiter: + events_per_second: 100 + when_exceeds: drop + when: "/status == 200" +... +``` + +If you instead set the `when_exceeds` option to `block`, the processor will block the pipeline until the time window has elapsed. Then it will process the blocked events. + +## Percentage sampling + +Use the `percent_sampler` action within the `aggregate` processor to limit the number of events that are sent to a sink. All excess events will be dropped. + +In the following example, only 20% of events with a status code of `200` are sent to the sink from a given IP address: + +```json +... + processor: + - aggregate: + identification_keys: ["clientip"] + duration : + action: + percent_sampler: + percent: 20 + when: "/status == 200" +... +``` + +## Tail sampling + +Use the `tail_sampler` action within the `aggregate` processor to sample events based on a set of defined policies. This action waits for an aggregation to complete across different aggregation periods based on the configured wait period. When an aggregation is complete, and if it matches the specific error condition, it is sent to the sink. Otherwise, only a configured percentage of events is sent to the sink. + +The following pipeline sends all OpenTelemetry traces with an error condition status of `2` to the sink. It only sends 20% of the traces that don't match this error condition to the sink. + +```json +... 
+ processor: + - aggregate: + identification_keys: ["traceId"] + action: + tail_sampler: + percent: 20 + wait_period: "10s" + condition: "/status == 2" + +... +``` + +If you set the error condition to `false` or don't include it, only the configured percentage of events is allowed to pass through, as determined by a probabilistic outcome. + +Because it can be difficult to determine exactly when tail sampling should occur, you can use the `wait_period` option to measure the idle time since the last event was received. diff --git a/_data-prepper/common-use-cases/text-processing.md b/_data-prepper/common-use-cases/text-processing.md new file mode 100644 index 0000000000..041ca63ab2 --- /dev/null +++ b/_data-prepper/common-use-cases/text-processing.md @@ -0,0 +1,215 @@ +--- +layout: default +title: Text processing +parent: Common use cases +nav_order: 55 +--- + +# Text processing + +Data Prepper provides text processing capabilities with the [`grok processor`]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/grok/). The `grok` processor is based on the [`java-grok`](https://mvnrepository.com/artifact/io.krakens/java-grok) library and supports all compatible patterns. The `java-grok` library is built using the [`java.util.regex`](https://docs.oracle.com/javase/8/docs/api/java/util/regex/package-summary.html) regular expression library. + +You can add custom patterns to your pipelines by using the `patterns_definitions` option. When debugging custom patterns, the [Grok Debugger](https://grokdebugger.com/) can be helpful. + +## Basic usage + +To get started with text processing, create the following pipeline: + +```json +patten-matching-pipeline: + source + ... + processor: + - grok: + match: + message: ['%{IPORHOST:clientip} \[%{HTTPDATE:timestamp}\] %{NUMBER:response_status:int}'] + sink: + - opensearch: + # Provide an OpenSearch cluster endpoint +``` +{% include copy-curl.html %} + +An incoming message might contain the following contents: + +```json +{"message": "127.0.0.1 198.126.12 [10/Oct/2000:13:55:36 -0700] 200"} +``` +{% include copy-curl.html %} + +In each incoming event, the pipeline will locate the value in the `message` key and attempt to match the pattern. The keywords `IPORHOST`, `HTTPDATE`, and `NUMBER` are built into the plugin. + +When an incoming record matches the pattern, it generates an internal event such as the following with identification keys extracted from the original message: + +```json +{ + "message":"127.0.0.1 198.126.12 [10/Oct/2000:13:55:36 -0700] 200", + "response_status":200, + "clientip":"198.126.12", + "timestamp":"10/Oct/2000:13:55:36 -0700" +} +``` +{% include copy-curl.html %} + +The `match` configuration for the `grok` processor specifies which record keys to match against which patterns. + +In the following example, the `match` configuration checks incoming logs for a `message` key. If the key exists, it matches the key value against the `SYSLOGBASE` pattern and then against the `COMMONAPACHELOG` pattern. It then checks the logs for a `timestamp` key. If that key exists, it attempts to match the key value against the `TIMESTAMP_ISO8601` pattern. + +```json +processor: + - grok: + match: + message: ['%{SYSLOGBASE}', "%{COMMONAPACHELOG}"] + timestamp: ["%{TIMESTAMP_ISO8601}"] +``` +{% include copy-curl.html %} + +By default, the plugin continues until it finds a successful match. 
For example, if there is a successful match against the value in the `message` key for a `SYSLOGBASE` pattern, the plugin doesn't attempt to match the other patterns. If you want to match logs against every pattern, include the `break_on_match` option. + +## Including named and empty captures + +Include the `keep_empty_captures` option in your pipeline configuration to include null captures or the `named_captures_only` option to include only named captures. Named captures follow the pattern `%{SYNTAX:SEMANTIC}` while unnamed captures follow the pattern `%{SYNTAX}`. + +For example, you can modify the preceding Grok configuration to remove `clientip` from the `%{IPORHOST}` pattern: + +```json +processor: + - grok: + match: + message: ['%{IPORHOST} \[%{HTTPDATE:timestamp}\] %{NUMBER:response_status:int}'] +``` +{% include copy-curl.html %} + +The resulting grokked log will look like this: + +```json +{ + "message":"127.0.0.1 198.126.12 [10/Oct/2000:13:55:36 -0700] 200", + "response_status":200, + "timestamp":"10/Oct/2000:13:55:36 -0700" +} +``` +{% include copy-curl.html %} + +Notice that the `clientip` key no longer exists because the `%{IPORHOST}` pattern is now an unnamed capture. + +However, if you set `named_captures_only` to `false`: + +```json +processor: + - grok: + match: + named_captures_only: false + message: ['%{IPORHOST} \[%{HTTPDATE:timestamp}\] %{NUMBER:message:int}'] +``` +{% include copy-curl.html %} + +Then the resulting grokked log will look like this: + +```json +{ + "message":"127.0.0.1 198.126.12 [10/Oct/2000:13:55:36 -0700] 200", + "MONTH":"Oct", + "YEAR":"2000", + "response_status":200, + "HOUR":"13", + "TIME":"13:55:36", + "MINUTE":"55", + "SECOND":"36", + "IPORHOST":"198.126.12", + "MONTHDAY":"10", + "INT":"-0700", + "timestamp":"10/Oct/2000:13:55:36 -0700" +} +``` +{% include copy-curl.html %} + +Note that the `IPORHOST` capture now shows up as a new key, along with some internal unnamed captures like `MONTH` and `YEAR`. The `HTTPDATE` keyword is currently using these patterns, which you can see in the default patterns file. + +## Overwriting keys + +Include the `keys_to_overwrite` option to specify which existing record keys to overwrite if there is a capture with the same key value. + +For example, you can modify the preceding Grok configuration to replace `%{NUMBER:response_status:int}` with `%{NUMBER:message:int}` and add `message` to the list of keys to overwrite: + +```json +processor: + - grok: + match: + keys_to_overwrite: ["message"] + message: ['%{IPORHOST:clientip} \[%{HTTPDATE:timestamp}\] %{NUMBER:message:int}'] +``` +{% include copy-curl.html %} + +In the resulting grokked log, the original message is overwritten with the number `200`: + +```json +{ + "message":200, + "clientip":"198.126.12", + "timestamp":"10/Oct/2000:13:55:36 -0700" +} +``` +{% include copy-curl.html %} + +## Using custom patterns + +Include the `pattern_definitions` option in your Grok configuration to specify custom patterns. + +The following configuration creates custom regex patterns named `CUSTOM_PATTERN-1` and `CUSTOM_PATTERN-2`. By default, the plugin continues until it finds a successful match. 
+
+```json
+processor:
+  - grok:
+      pattern_definitions:
+        CUSTOM_PATTERN_1: 'this-is-regex-1'
+        CUSTOM_PATTERN_2: '%{CUSTOM_PATTERN_1} REGEX'
+      match:
+        message: ["%{CUSTOM_PATTERN_2:my_pattern_key}"]
+```
+{% include copy-curl.html %}
+
+If you specify `break_on_match` as `false`, the pipeline attempts to match all patterns and extract keys from the incoming events:
+
+```json
+processor:
+  - grok:
+      pattern_definitions:
+        CUSTOM_PATTERN_1: 'this-is-regex-1'
+        CUSTOM_PATTERN_2: 'this-is-regex-2'
+        CUSTOM_PATTERN_3: 'this-is-regex-3'
+        CUSTOM_PATTERN_4: 'this-is-regex-4'
+      match:
+        message: [ "%{CUSTOM_PATTERN_1}", "%{CUSTOM_PATTERN_2}" ]
+        log: [ "%{CUSTOM_PATTERN_3}", "%{CUSTOM_PATTERN_4}" ]
+      break_on_match: false
+```
+{% include copy-curl.html %}
+
+You can define your own custom patterns to use for pipeline pattern matching. In the first example, the value matching `CUSTOM_PATTERN_2` is extracted into the `my_pattern_key` field.
+
+## Storing captures with a parent key
+
+Include the `target_key` option in your Grok configuration to wrap all record captures in an additional outer key value.
+
+For example, you can modify the preceding Grok configuration to add a target key named `grokked`:
+
+```json
+processor:
+  - grok:
+      target_key: "grokked"
+      match:
+        message: ['%{IPORHOST} \[%{HTTPDATE:timestamp}\] %{NUMBER:response_status:int}']
+```
+
+The resulting grokked log will look like this:
+
+```json
+{
+  "message":"127.0.0.1 198.126.12 [10/Oct/2000:13:55:36 -0700] 200",
+  "grokked": {
+     "response_status":200,
+     "clientip":"198.126.12",
+     "timestamp":"10/Oct/2000:13:55:36 -0700"
+  }
+}
+```
diff --git a/_data-prepper/common-use-cases/trace-analytics.md b/_data-prepper/common-use-cases/trace-analytics.md
index 9067ce49b7..033830351a 100644
--- a/_data-prepper/common-use-cases/trace-analytics.md
+++ b/_data-prepper/common-use-cases/trace-analytics.md
@@ -2,7 +2,7 @@
 layout: default
 title: Trace analytics
 parent: Common use cases
-nav_order: 5
+nav_order: 60
 ---
 
 # Trace analytics
@@ -15,7 +15,7 @@ When using Data Prepper as a server-side component to collect trace data, you ca
 
 The following flowchart illustrates the trace analytics workflow, from running OpenTelemetry Collector to using OpenSearch Dashboards for visualization.
 
-Trace analyticis component overview{: .img-fluid}
+Trace analytics component overview{: .img-fluid}
 
 To monitor trace analytics, you need to set up the following components in your service environment:
 - Add **instrumentation** to your application so it can generate telemetry data and send it to an OpenTelemetry collector.
@@ -38,9 +38,9 @@ The [OpenTelemetry source]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/c
 
 There are three processors for the trace analytics feature:
 
-* *otel_traces_raw* - The *otel_traces_raw* processor receives a collection of [span](https://github.com/opensearch-project/data-prepper/blob/fa65e9efb3f8d6a404a1ab1875f21ce85e5c5a6d/data-prepper-api/src/main/java/org/opensearch/dataprepper/model/trace/Span.java) records from [*otel-trace-source*]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/sources/otel-trace/), and performs stateful processing, extraction, and completion of trace-group-related fields.
-* *otel_traces_group* - The *otel_traces_group* processor fills in the missing trace-group-related fields in the collection of [span](https://github.com/opensearch-project/data-prepper/blob/298e7931aa3b26130048ac3bde260e066857df54/data-prepper-api/src/main/java/org/opensearch/dataprepper/model/trace/Span.java) records by looking up the OpenSearch backend.
-* *service_map_stateful* – The *service_map_stateful* processor performs the required preprocessing for trace data and builds metadata to display the `service-map` dashboards. +* otel_traces_raw -- The *otel_traces_raw* processor receives a collection of [span](https://github.com/opensearch-project/data-prepper/blob/fa65e9efb3f8d6a404a1ab1875f21ce85e5c5a6d/data-prepper-api/src/main/java/org/opensearch/dataprepper/model/trace/Span.java) records from [*otel-trace-source*]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/sources/otel-trace-source/), and performs stateful processing, extraction, and completion of trace-group-related fields. +* otel_traces_group -- The *otel_traces_group* processor fills in the missing trace-group-related fields in the collection of [span](https://github.com/opensearch-project/data-prepper/blob/298e7931aa3b26130048ac3bde260e066857df54/data-prepper-api/src/main/java/org/opensearch/dataprepper/model/trace/Span.java) records by looking up the OpenSearch backend. +* service_map_stateful -- The *service_map_stateful* processor performs the required preprocessing for trace data and builds metadata to display the `service-map` dashboards. ### OpenSearch sink @@ -49,8 +49,8 @@ OpenSearch provides a generic sink that writes data to OpenSearch as the destina The sink provides specific configurations for the trace analytics feature. These configurations allow the sink to use indexes and index templates specific to trace analytics. The following OpenSearch indexes are specific to trace analytics: -* *otel-v1-apm-span* – The *otel-v1-apm-span* index stores the output from the [otel_traces_raw]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/otel-trace-raw/) processor. -* *otel-v1-apm-service-map* – The *otel-v1-apm-service-map* index stores the output from the [service_map_stateful]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/service-map-stateful/) processor. +* otel-v1-apm-span –- The *otel-v1-apm-span* index stores the output from the [otel_traces_raw]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/otel-trace-raw/) processor. +* otel-v1-apm-service-map –- The *otel-v1-apm-service-map* index stores the output from the [service_map_stateful]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/service-map-stateful/) processor. ## Trace tuning @@ -322,7 +322,7 @@ For other configurations available for OpenSearch sinks, see [Data Prepper OpenS ## OpenTelemetry Collector -You need to run OpenTelemetry Collector in your service environment. Follow [Getting Started](https://opentelemetry.io/docs/collector/getting-started/#getting-started) to install an OpenTelemetry collector. Ensure that you configure the collector with an exporter configured for your Data Prepper instance. The following example `otel-collector-config.yaml` file receives data from various instrumentations and exports it to Data Prepper. +You need to run OpenTelemetry Collector in your service environment. Follow [Getting Started](https://opentelemetry.io/docs/collector/getting-started/#getting-started) to install an OpenTelemetry collector. Ensure that you configure the collector with an exporter configured for your Data Prepper instance. The following example `otel-collector-config.yaml` file receives data from various instrumentations and exports it to Data Prepper. 
### Example otel-collector-config.yaml file @@ -374,4 +374,4 @@ Starting with Data Prepper version 1.4, trace processing uses Data Prepper's eve * `otel_traces_group` replaces `otel_traces_group_prepper` for event-based spans. In Data Prepper version 2.0, `otel_traces_source` will only output events. Data Prepper version 2.0 also removes `otel_traces_raw_prepper` and `otel_traces_group_prepper` entirely. To migrate to Data Prepper version 2.0, you can configure your trace pipeline using the event model. - \ No newline at end of file + diff --git a/_data-prepper/index.md b/_data-prepper/index.md index 0c7a228b27..a7f51a8514 100644 --- a/_data-prepper/index.md +++ b/_data-prepper/index.md @@ -14,9 +14,9 @@ redirect_from: # Data Prepper -Data Prepper is a server-side data collector capable of filtering, enriching, transforming, normalizing, and aggregating data for downstream analytics and visualization. +Data Prepper is a server-side data collector capable of filtering, enriching, transforming, normalizing, and aggregating data for downstream analysis and visualization. Data Prepper is the preferred data ingestion tool for OpenSearch. It is recommended for most data ingestion use cases in OpenSearch and for processing large, complex datasets. -Data Prepper lets users build custom pipelines to improve the operational view of applications. Two common uses for Data Prepper are trace and log analytics. [Trace analytics]({{site.url}}{{site.baseurl}}/observability-plugin/trace/index/) can help you visualize the flow of events and identify performance problems, and [log analytics]({{site.url}}{{site.baseurl}}/observability-plugin/log-analytics/) can improve searching, analyzing and provide insights into your application. +With Data Prepper you can build custom pipelines to improve the operational view of applications. Two common use cases for Data Prepper are trace analytics and log analytics. [Trace analytics]({{site.url}}{{site.baseurl}}/observability-plugin/trace/index/) can help you visualize event flows and identify performance problems. [Log analytics]({{site.url}}{{site.baseurl}}/observability-plugin/log-analytics/) equips you with tools to enhance your search capabilities, conduct comprehensive analysis, and gain insights into your applications' performance and behavior. ## Concepts diff --git a/_data-prepper/managing-data-prepper/configuring-data-prepper.md b/_data-prepper/managing-data-prepper/configuring-data-prepper.md index bcff65ed4c..d6750daba4 100644 --- a/_data-prepper/managing-data-prepper/configuring-data-prepper.md +++ b/_data-prepper/managing-data-prepper/configuring-data-prepper.md @@ -128,6 +128,7 @@ extensions: region: sts_role_arn: refresh_interval: + disable_refresh: false : ... ``` @@ -148,7 +149,8 @@ Option | Required | Type | Description secret_id | Yes | String | The AWS secret name or ARN. | region | No | String | The AWS region of the secret. Defaults to `us-east-1`. sts_role_arn | No | String | The AWS Security Token Service (AWS STS) role to assume for requests to the AWS Secrets Manager. Defaults to `null`, which will use the [standard SDK behavior for credentials](https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/credentials.html). -refresh_interval | No | Duration | The refreshment interval for AWS secrets extension plugin to poll new secret values. Defaults to `PT1H`. See [Automatically refreshing secrets](#automatically-refreshing-secrets) for details. 
+refresh_interval | No | Duration | The refreshment interval for the AWS Secrets extension plugin to poll new secret values. Defaults to `PT1H`. For more information, see [Automatically refreshing secrets](#automatically-refreshing-secrets). +disable_refresh | No | Boolean | Disables regular polling on the latest secret values inside the AWS secrets extension plugin. Defaults to `false`. When set to `true`, `refresh_interval` will not be used. #### Reference secrets ß diff --git a/_data-prepper/managing-data-prepper/core-apis.md b/_data-prepper/managing-data-prepper/core-apis.md index 33fd493d78..b810c7b15e 100644 --- a/_data-prepper/managing-data-prepper/core-apis.md +++ b/_data-prepper/managing-data-prepper/core-apis.md @@ -83,4 +83,4 @@ processorShutdownTimeout: "PT15M" sinkShutdownTimeout: 30s ``` -The values for these parameters are parsed into a `Duration` object through the [Data Prepper Duration Deserializer](https://github.com/opensearch-project/data-prepper/tree/main/data-prepper-core/src/main/java/org/opensearch/dataprepper/parser/DataPrepperDurationDeserializer.java). \ No newline at end of file +The values for these parameters are parsed into a `Duration` object through the [Data Prepper Duration Deserializer](https://github.com/opensearch-project/data-prepper/blob/main/data-prepper-pipeline-parser/src/main/java/org/opensearch/dataprepper/pipeline/parser/DataPrepperDurationDeserializer.java). diff --git a/_data-prepper/managing-data-prepper/extensions/extensions.md b/_data-prepper/managing-data-prepper/extensions/extensions.md new file mode 100644 index 0000000000..8cbfc602c7 --- /dev/null +++ b/_data-prepper/managing-data-prepper/extensions/extensions.md @@ -0,0 +1,15 @@ +--- +layout: default +title: Extensions +parent: Managing Data Prepper +has_children: true +nav_order: 18 +--- + +# Extensions + +Data Prepper extensions provide Data Prepper functionality outside of core Data Prepper pipeline components. +Many extensions provide configuration options that give Data Prepper administrators greater flexibility over Data Prepper's functionality. + +Extension configurations can be configured in the `data-prepper-config.yaml` file under the `extensions:` YAML block. + diff --git a/_data-prepper/managing-data-prepper/extensions/geoip_service.md b/_data-prepper/managing-data-prepper/extensions/geoip_service.md new file mode 100644 index 0000000000..53c21a08ff --- /dev/null +++ b/_data-prepper/managing-data-prepper/extensions/geoip_service.md @@ -0,0 +1,67 @@ +--- +layout: default +title: geoip_service +nav_order: 5 +parent: Extensions +grand_parent: Managing Data Prepper +--- + +# geoip_service + +The `geoip_service` extension configures all [`geoip`]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/geoip) processors in Data Prepper. + +## Usage + +You can configure the GeoIP service that Data Prepper uses for the `geoip` processor. +By default, the GeoIP service comes with the [`maxmind`](#maxmind) option configured. + +The following example shows how to configure the `geoip_service` in the `data-prepper-config.yaml` file: + +``` +extensions: + geoip_service: + maxmind: + database_refresh_interval: PT1H + cache_count: 16_384 +``` + +## maxmind + +The GeoIP service supports the MaxMind [GeoIP and GeoLite](https://dev.maxmind.com/geoip) databases. 
+By default, Data Prepper will use all three of the following [MaxMind GeoLite2](https://dev.maxmind.com/geoip/geolite2-free-geolocation-data) databases: + +* City +* Country +* ASN + +The service also downloads databases automatically to keep Data Prepper up to date with changes from MaxMind. + +You can use the following options to configure the `maxmind` extension. + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +`databases` | No | [database](#database) | The database configuration. +`database_refresh_interval` | No | Duration | How frequently to check for updates from MaxMind. This can be any duration in the range of 15 minutes to 30 days. Default is `PT7D`. +`cache_count` | No | Integer | The maximum cache count by number of items in the cache, with a range of 100--100,000. Default is `4096`. +`database_destination` | No | String | The name of the directory in which to store downloaded databases. Default is `{data-prepper.dir}/data/geoip`. +`aws` | No | [aws](#aws) | Configures the AWS credentials for downloading the database from Amazon Simple Storage Service (Amazon S3). +`insecure` | No | Boolean | When `true`, this options allows you to download database files over HTTP. Default is `false`. + +## database + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +`city` | No | String | The URL of the city in which the database resides. Can be an HTTP URL for a manifest file, an MMDB file, or an S3 URL. +`country` | No | String | The URL of the country in which the database resides. Can be an HTTP URL for a manifest file, an MMDB file, or an S3 URL. +`asn` | No | String | The URL of the Autonomous System Number (ASN) of where the database resides. Can be an HTTP URL for a manifest file, an MMDB file, or an S3 URL. +`enterprise` | No | String | The URL of the enterprise in which the database resides. Can be an HTTP URL for a manifest file, an MMDB file, or an S3 URL. + + +## aws + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +`region` | No | String | The AWS Region to use for the credentials. Default is the [standard SDK behavior for determining the Region](https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/region-selection.html). +`sts_role_arn` | No | String | The AWS Security Token Service (AWS STS) role to assume for requests to Amazon S3. Default is `null`, which will use the [standard SDK behavior for credentials](https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/credentials.html). +`aws_sts_header_overrides` | No | Map | A map of header overrides that the AWS Identity and Access Management (IAM) role assumes when downloading from Amazon S3. +`sts_external_id` | No | String | An STS external ID used when Data Prepper assumes the STS role. For more information, see the `ExternalID` documentation in the [STS AssumeRole](https://docs.aws.amazon.com/STS/latest/APIReference/API_AssumeRole.html) API reference. diff --git a/_data-prepper/pipelines/configuration/buffers/kafka.md b/_data-prepper/pipelines/configuration/buffers/kafka.md index 675a0c9775..87600601b4 100644 --- a/_data-prepper/pipelines/configuration/buffers/kafka.md +++ b/_data-prepper/pipelines/configuration/buffers/kafka.md @@ -41,11 +41,12 @@ Use the following configuration options with the `kafka` buffer. Option | Required | Type | Description --- | --- | --- | --- -`bootstrap_servers` | Yes | String list | The host and port for the initial connection to the Kafka cluster. 
You can configure multiple Kafka brokers by using the IP address or the port number for each broker. When using [Amazon Managed Streaming for Apache Kafka (Amazon MSK)](https://aws.amazon.com/msk/) as your Kafka cluster, the bootstrap server information is obtained from Amazon MSK using the Amazon Resource Name (ARN) provided in the configuration. -`topics` | Yes | List | A list of [topics](#topic) to use. You must supply one topic per buffer. `authentication` | No | [Authentication](#authentication) | Sets the authentication options for both the pipeline and Kafka. For more information, see [Authentication](#authentication). -`encryption` | No | [Encryption](#encryption) | The encryption configuration for encryption in transit. For more information, see [Encryption](#encryption). `aws` | No | [AWS](#aws) | The AWS configuration. For more information, see [aws](#aws). +`bootstrap_servers` | Yes | String list | The host and port for the initial connection to the Kafka cluster. You can configure multiple Kafka brokers by using the IP address or the port number for each broker. When using [Amazon Managed Streaming for Apache Kafka (Amazon MSK)](https://aws.amazon.com/msk/) as your Kafka cluster, the bootstrap server information is obtained from Amazon MSK using the Amazon Resource Name (ARN) provided in the configuration. +`encryption` | No | [Encryption](#encryption) | The encryption configuration for encryption in transit. For more information, see [Encryption](#encryption). +`producer_properties` | No | [Producer Properties](#producer_properties) | A list of configurable Kafka producer properties. +`topics` | Yes | List | A list of [topics](#topic) for the buffer to use. You must supply one topic per buffer. ### topic @@ -73,6 +74,7 @@ Option | Required | Type | Description `retry_backoff` | No | Integer | The amount of time to wait before attempting to retry a failed request to a given topic partition. Default is `10s`. `max_poll_interval` | No | Integer | The maximum delay between invocations of a `poll()` when using group management through Kafka's `max.poll.interval.ms` option. Default is `300s`. `consumer_max_poll_records` | No | Integer | The maximum number of records returned in a single `poll()` call through Kafka's `max.poll.records` setting. Default is `500`. +`max_message_bytes` | No | Integer | The maximum size of the message, in bytes. Default is 1 MB. ### kms @@ -123,6 +125,14 @@ Option | Required | Type | Description `type` | No | String | The encryption type. Use `none` to disable encryption. Default is `ssl`. `insecure` | No | Boolean | A Boolean flag used to turn off SSL certificate verification. If set to `true`, certificate authority (CA) certificate verification is turned off and insecure HTTP requests are sent. Default is `false`. +#### producer_properties + +Use the following configuration options to configure a Kafka producer. + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +`max_request_size` | No | Integer | The maximum size of the request that the producer sends to Kafka. Default is 1 MB. + #### aws diff --git a/_data-prepper/pipelines/configuration/processors/add-entries.md b/_data-prepper/pipelines/configuration/processors/add-entries.md index 589f463a74..d28f2d8f6f 100644 --- a/_data-prepper/pipelines/configuration/processors/add-entries.md +++ b/_data-prepper/pipelines/configuration/processors/add-entries.md @@ -18,6 +18,10 @@ You can configure the `add_entries` processor with the following options. 
| :--- | :--- | :--- | | `entries` | Yes | A list of entries to add to an event. | | `key` | Yes | The key of the new entry to be added. Some examples of keys include `my_key`, `myKey`, and `object/sub_Key`. | +| `metadata_key` | Yes | The key for the new metadata attribute. The argument must be a literal string key and not a JSON Pointer. Either one string key or `metadata_key` is required. | +| `format` | No | A format string to use as the value of the new entry, for example, `${key1}-${key2}`, where `key1` and `key2` are existing keys in the event. Required if neither `value` nor `value_expression` is specified. | +| `value_expression` | No | An expression string to use as the value of the new entry. For example, `/key` is an existing key in the event with a type of either a number, a string, or a Boolean. Expressions can also contain functions returning number/string/integer. For example, `length(/key)` will return the length of the key in the event when the key is a string. For more information about keys, see [Expression syntax](https://opensearch.org/docs/latest/data-prepper/pipelines/expression-syntax/). | +| `add_when` | No | A [conditional expression](https://opensearch.org/docs/latest/data-prepper/pipelines/expression-syntax/), such as `/some-key == "test"'`, that will be evaluated to determine whether the processor will be run on the event. | | `value` | Yes | The value of the new entry to be added. You can use the following data types: strings, Booleans, numbers, null, nested objects, and arrays. | | `overwrite_if_key_exists` | No | When set to `true`, the existing value is overwritten if `key` already exists in the event. The default value is `false`. | @@ -36,6 +40,9 @@ pipeline: - key: "newMessage" value: 3 overwrite_if_key_exists: true + - metadata_key: myMetadataKey + value_expression: 'length("newMessage")' + add_when: '/some_key == "test"' sink: ``` {% include copy.html %} @@ -53,5 +60,5 @@ And then you run the `add_entries` processor using the example pipeline, it adds {"message": "hello", "newMessage": 3} ``` -> If `newMessage` already exists, its existing value is overwritten with a value of `3`. +If `newMessage` already exists, its existing value is overwritten with a value of `3`. diff --git a/_data-prepper/pipelines/configuration/processors/copy-values.md b/_data-prepper/pipelines/configuration/processors/copy-values.md index 4d48a47e02..f654e6f027 100644 --- a/_data-prepper/pipelines/configuration/processors/copy-values.md +++ b/_data-prepper/pipelines/configuration/processors/copy-values.md @@ -19,7 +19,7 @@ You can configure the `copy_values` processor with the following options. | `entries` | Yes | A list of entries to be copied in an event. | | `from_key` | Yes | The key of the entry to be copied. | | `to_key` | Yes | The key of the new entry to be added. | -| `overwrite_if_key_exists` | No | When set to `true`, the existing value is overwritten if `key` already exists in the event. The default value is `false`. | +| `overwrite_if_to_key_exists` | No | When set to `true`, the existing value is overwritten if `key` already exists in the event. The default value is `false`. 
| ## Usage diff --git a/_data-prepper/pipelines/configuration/processors/csv.md b/_data-prepper/pipelines/configuration/processors/csv.md index e7ec8a35a1..e386db4bf4 100644 --- a/_data-prepper/pipelines/configuration/processors/csv.md +++ b/_data-prepper/pipelines/configuration/processors/csv.md @@ -23,9 +23,79 @@ delete_header | No | Boolean | If specified, the event header (`column_names_sou column_names_source_key | No | String | The field in the event that specifies the CSV column names, which will be automatically detected. If there need to be extra column names, the column names are automatically generated according to their index. If `column_names` is also defined, the header in `column_names_source_key` can also be used to generate the event fields. If too few columns are specified in this field, the remaining column names are automatically generated. If too many column names are specified in this field, the CSV processor omits the extra column names. column_names | No | List | User-specified names for the CSV columns. Default value is `[column1, column2, ..., columnN]` if there are no columns of data in the CSV record and `column_names_source_key` is not defined. If `column_names_source_key` is defined, the header in `column_names_source_key` generates the event fields. If too few columns are specified in this field, the remaining column names are automatically generated. If too many column names are specified in this field, the CSV processor omits the extra column names. - +Add the following examples to your `pipelines.yaml` file, depending on how you your CSV columns are formatted. + +### User-specified column names + +The following example `pipelines.yaml` configuration points to a file named `ingest.csv` as the source. Then, the `csv` processor parses the data from the `.csv` file using the column names specified in the `column_names` setting, as shown in the following example: + +```yaml +csv-pipeline: + source: + file: + path: "/full/path/to/ingest.csv" + record_type: "event" + processor: + - csv: + column_names: ["col1", "col2"] + sink: + - stdout: +``` +{% include copy.html %} + + +When run, the processor will parse the message. Although only two column names are specified in processor settings, a third column name is automatically generated because the data contained in `ingest.csv` includes three columns, `1,2,3`: + +``` +{"message": "1,2,3", "col1": "1", "col2": "2", "column3": "3"} +``` +### Automatically detect column names + +The following configuration automatically detects the header of a CSV file ingested through an [`s3 source`]({{site.url}}{{site.baseurl}}//data-prepper/pipelines/configuration/sources/s3/): + +```yaml +csv-s3-pipeline: + source: + s3: + notification_type: "sqs" + codec: + newline: + skip_lines: 1 + header_destination: "header" + compression: none + sqs: + queue_url: "https://sqs..amazonaws.com//" + aws: + region: "" + processor: + - csv: + column_names_source_key: "header" + sink: + - stdout: +``` +{% include copy.html %} + + +For example, if the `ingest.csv` file in the Amazon Simple Storage Service (Amazon S3) bucket that the Amazon Simple Queue Service (SQS) queue is attached to contains the following data: + +``` +Should,skip,this,line +a,b,c +1,2,3 +``` + +Then the `csv` processor will take the following event: + +```json +{"header": "a,b,c", "message": "1,2,3"} +``` + +Then, the processor parses the event into the following output. 
Because `delete_header` is `true` by default, the header `a,b,c` is deleted from the output: +```json +{"message": "1,2,3", "a": "1", "b": "2", "c": "3"} +``` ## Metrics @@ -41,4 +111,6 @@ The `csv` processor includes the following custom metrics. **Counter** -* `csvInvalidEvents`: The number of invalid events. An exception is thrown when an invalid event is parsed. An unclosed quote usually causes this exception. \ No newline at end of file +The `csv` processor includes the following counter metrics: + +* `csvInvalidEvents`: The number of invalid events, usually caused by an unclosed quotation mark in the event itself. Data Prepper throws an exception when an invalid event is parsed. diff --git a/_data-prepper/pipelines/configuration/processors/date.md b/_data-prepper/pipelines/configuration/processors/date.md index 27b571df04..7ac1040c26 100644 --- a/_data-prepper/pipelines/configuration/processors/date.md +++ b/_data-prepper/pipelines/configuration/processors/date.md @@ -9,24 +9,32 @@ nav_order: 50 # date -The `date` processor adds a default timestamp to an event, parses timestamp fields, and converts timestamp information to the International Organization for Standardization (ISO) 8601 format. This timestamp information can be used as an event timestamp. +The `date` processor adds a default timestamp to an event, parses timestamp fields, and converts timestamp information to the International Organization for Standardization (ISO) 8601 format. This timestamp information can be used as an event timestamp. ## Configuration The following table describes the options you can use to configure the `date` processor. + Option | Required | Type | Description :--- | :--- | :--- | :--- -match | Conditionally | List | List of `key` and `patterns` where patterns is a list. The list of match can have exactly one `key` and `patterns`. There is no default value. This option cannot be defined at the same time as `from_time_received`. Include multiple date processors in your pipeline if both options should be used. -from_time_received | Conditionally | Boolean | A boolean that is used for adding default timestamp to event data from event metadata which is the time when source receives the event. Default value is `false`. This option cannot be defined at the same time as `match`. Include multiple date processors in your pipeline if both options should be used. -destination | No | String | Field to store the timestamp parsed by date processor. It can be used with both `match` and `from_time_received`. Default value is `@timestamp`. -source_timezone | No | String | Time zone used to parse dates. It is used in case the zone or offset cannot be extracted from the value. If the zone or offset are part of the value, then timezone is ignored. Find all the available timezones [the list of database time zones](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones#List) in the **TZ database name** column. -destination_timezone | No | String | Timezone used for storing timestamp in `destination` field. The available timezone values are the same as `source_timestamp`. -locale | No | String | Locale is used for parsing dates. It's commonly used for parsing month names(`MMM`). It can have language, country and variant fields using IETF BCP 47 or String representation of [Locale](https://docs.oracle.com/javase/8/docs/api/java/util/Locale.html) object. For example `en-US` for IETF BCP 47 and `en_US` for string representation of Locale. 
Full list of locale fields which includes language, country and variant can be found [the language subtag registry](https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry). Default value is `Locale.ROOT`. +`match` | Conditionally | [Match](#Match) | The date match configuration. This option cannot be defined at the same time as `from_time_received`. There is no default value. +`from_time_received` | Conditionally | Boolean | When `true`, the timestamp from the event metadata, which is the time at which the source receives the event, is added to the event data. This option cannot be defined at the same time as `match`. Default is `false`. +`date_when` | No | String | Specifies under what condition the `date` processor should perform matching. Default is no condition. +`to_origination_metadata` | No | Boolean | When `true`, the matched time is also added to the event's metadata as an instance of `Instant`. Default is `false`. +`destination` | No | String | The field used to store the timestamp parsed by the date processor. Can be used with both `match` and `from_time_received`. Default is `@timestamp`. +`output_format` | No | String | Determines the format of the timestamp added to an event. Default is `yyyy-MM-dd'T'HH:mm:ss.SSSXXX`. +`source_timezone` | No | String | The time zone used to parse dates, including when the zone or offset cannot be extracted from the value. If the zone or offset are part of the value, then the time zone is ignored. A list of all the available time zones is contained in the **TZ database name** column of [the list of database time zones](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones#List). +`destination_timezone` | No | String | The time zone used for storing the timestamp in the `destination` field. A list of all the available time zones is contained in the **TZ database name** column of [the list of database time zones](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones#List). +`locale` | No | String | The location used for parsing dates. Commonly used for parsing month names (`MMM`). The value can contain language, country, or variant fields in IETF BCP 47, such as `en-US`, or a string representation of the [locale](https://docs.oracle.com/javase/8/docs/api/java/util/Locale.html) object, such as `en_US`. A full list of locale fields, including language, country, and variant, can be found in [the language subtag registry](https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry). Default is `Locale.ROOT`. + - +Option | Required | Type | Description +:--- | :--- | :--- | :--- +`key` | Yes | String | Represents the event key against which to match patterns. Required if `match` is configured. +`patterns` | Yes | List | A list of possible patterns that the timestamp value of the key can have. The patterns are based on a sequence of letters and symbols. The `patterns` support all the patterns listed in the Java [DatetimeFormatter](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html) reference. The timestamp value also supports `epoch_second`, `epoch_milli`, and `epoch_nano` values, which represent the timestamp as the number of seconds, milliseconds, and nanoseconds since the epoch. Epoch values always use the UTC time zone. ## Metrics @@ -40,5 +48,29 @@ The following table describes common [Abstract processor](https://github.com/ope The `date` processor includes the following custom metrics. 
-* `dateProcessingMatchSuccessCounter`: Returns the number of records that match with at least one pattern specified by the `match configuration` option.
-* `dateProcessingMatchFailureCounter`: Returns the number of records that did not match any of the patterns specified by the `patterns match` configuration option.
\ No newline at end of file
+* `dateProcessingMatchSuccessCounter`: Returns the number of records that match at least one pattern specified by the `match configuration` option.
+* `dateProcessingMatchFailureCounter`: Returns the number of records that did not match any of the patterns specified by the `patterns match` configuration option.
+
+## Example: Add the default timestamp to an event
+
+The following `date` processor configuration can be used to add a default timestamp to all events in the `@timestamp` field:
+
+```yaml
+- date:
+    from_time_received: true
+    destination: "@timestamp"
+```
+
+## Example: Parse a timestamp to convert its format and time zone
+
+The following `date` processor configuration can be used to parse a timestamp in the `dd/MMM/yyyy:HH:mm:ss` format and write it in the `yyyy-MM-dd'T'HH:mm:ss.SSSXXX` format:
+
+```yaml
+- date:
+    match:
+      - key: timestamp
+        patterns: ["dd/MMM/yyyy:HH:mm:ss"]
+    destination: "@timestamp"
+    output_format: "yyyy-MM-dd'T'HH:mm:ss.SSSXXX"
+    source_timezone: "America/Los_Angeles"
+    destination_timezone: "America/Chicago"
+    locale: "en_US"
+```
diff --git a/_data-prepper/pipelines/configuration/processors/decompress.md b/_data-prepper/pipelines/configuration/processors/decompress.md
new file mode 100644
index 0000000000..d03c236ac5
--- /dev/null
+++ b/_data-prepper/pipelines/configuration/processors/decompress.md
@@ -0,0 +1,49 @@
+---
+layout: default
+title: decompress
+parent: Processors
+grand_parent: Pipelines
+nav_order: 40
+---
+
+# decompress
+
+The `decompress` processor decompresses any Base64-encoded compressed fields inside an event.
+
+## Configuration
+
+Option | Required | Type | Description
+:--- | :--- | :--- | :---
+`keys` | Yes | List | The fields in the event that will be decompressed.
+`type` | Yes | Enum | The type of decompression to use for the `keys` in the event. Only `gzip` is supported.
+`decompress_when` | No | String | A [Data Prepper conditional expression](https://opensearch.org/docs/latest/data-prepper/pipelines/expression-syntax/) that determines when the `decompress` processor will run on certain events.
+`tags_on_failure` | No | List | A list of strings with which to tag events when the processor fails to decompress the `keys` inside an event. Defaults to `_decompression_failure`.
+
+## Usage
+
+The following example shows the `decompress` processor used in `pipelines.yaml`:
+
+```yaml
+processor:
+  - decompress:
+      decompress_when: '/some_key == null'
+      keys: [ "base_64_gzip_key" ]
+      type: gzip
+```
+
+## Metrics
+
+The following table describes common [abstract processor](https://github.com/opensearch-project/data-prepper/blob/main/data-prepper-api/src/main/java/org/opensearch/dataprepper/model/processor/AbstractProcessor.java) metrics.
+
+| Metric name | Type | Description |
+| ------------- | ---- | -----------|
+| `recordsIn` | Counter | The ingress of records to a pipeline component. |
+| `recordsOut` | Counter | The egress of records from a pipeline component. |
+| `timeElapsed` | Timer | The time elapsed during execution of a pipeline component.
| + +### Counter + +The `decompress` processor accounts for the following metrics: + +* `processingErrors`: The number of processing errors that have occurred in the `decompress` processor. + diff --git a/_data-prepper/pipelines/configuration/processors/delete-entries.md b/_data-prepper/pipelines/configuration/processors/delete-entries.md index 0546ed67c4..33c54a0b29 100644 --- a/_data-prepper/pipelines/configuration/processors/delete-entries.md +++ b/_data-prepper/pipelines/configuration/processors/delete-entries.md @@ -3,7 +3,7 @@ layout: default title: delete_entries parent: Processors grand_parent: Pipelines -nav_order: 51 +nav_order: 41 --- # delete_entries diff --git a/_data-prepper/pipelines/configuration/processors/dissect.md b/_data-prepper/pipelines/configuration/processors/dissect.md index 2d32ba47ae..a8258bee4e 100644 --- a/_data-prepper/pipelines/configuration/processors/dissect.md +++ b/_data-prepper/pipelines/configuration/processors/dissect.md @@ -3,7 +3,7 @@ layout: default title: dissect parent: Processors grand_parent: Pipelines -nav_order: 52 +nav_order: 45 --- # dissect diff --git a/_data-prepper/pipelines/configuration/processors/drop-events.md b/_data-prepper/pipelines/configuration/processors/drop-events.md index d030f14a27..1f601c9743 100644 --- a/_data-prepper/pipelines/configuration/processors/drop-events.md +++ b/_data-prepper/pipelines/configuration/processors/drop-events.md @@ -3,7 +3,7 @@ layout: default title: drop_events parent: Processors grand_parent: Pipelines -nav_order: 53 +nav_order: 46 --- # drop_events diff --git a/_data-prepper/pipelines/configuration/processors/flatten.md b/_data-prepper/pipelines/configuration/processors/flatten.md new file mode 100644 index 0000000000..43793c2b83 --- /dev/null +++ b/_data-prepper/pipelines/configuration/processors/flatten.md @@ -0,0 +1,239 @@ +--- +layout: default +title: flatten +parent: Processors +grand_parent: Pipelines +nav_order: 48 +--- + +# flatten + +The `flatten` processor transforms nested objects inside of events into flattened structures. + +## Configuration + +The following table describes configuration options for the `flatten` processor. + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +`source` | Yes | String | The source key on which to perform the operation. If set to an empty string (`""`), then the processor uses the root of the event as the source. +`target` | Yes | String | The target key to put into the flattened fields. If set to an empty string (`""`), then the processor uses the root of the event as the target. +`exclude_keys` | No | List | The keys from the source field that should be excluded from processing. Default is an empty list (`[]`). +`remove_processed_fields` | No | Boolean | When `true`, the processor removes all processed fields from the source. Default is `false`. +`remove_list_indices` | No | Boolean | When `true`, the processor converts the fields from the source map into lists and puts the lists into the target field. Default is `false`. +`flatten_when` | No | String | A [conditional expression](https://opensearch.org/docs/latest/data-prepper/pipelines/expression-syntax/), such as `/some-key == "test"'`, that determines whether the `flatten` processor will be run on the event. Default is `null`, which means that all events will be processed unless otherwise stated. +`tags_on_failure` | No | List | A list of tags to add to the event metadata when the event fails to process. 
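+
+As a quick illustration, the following sketch shows how `flatten_when` and `tags_on_failure` might be combined to flatten only certain events. The `payload` source field and the `log_type` key used in the condition are hypothetical, so substitute keys that exist in your own events:
+
+```yaml
+...
+  processor:
+    - flatten:
+        source: "payload"
+        target: "payload_flat"
+        # Only flatten events whose log_type key equals "nested".
+        flatten_when: '/log_type == "nested"'
+        # Tag events that fail to flatten so they can be routed or inspected later.
+        tags_on_failure: ["flatten_failure"]
+...
+```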
+ +## Usage + +The following examples show how the `flatten` processor can be used in Data Prepper pipelines. + +### Minimum configuration + +The following example shows only the parameters that are required for using the `flatten` processor, `source` and `target`: + +```yaml +... + processor: + - flatten: + source: "key2" + target: "flattened-key2" +... +``` +{% include copy.html %} + +For example, when the input event contains the following nested objects: + +```json +{ + "key1": "val1", + "key2": { + "key3": { + "key4": "val2" + } + } +} +``` + +The `flatten` processor creates a flattened structure under the `flattened-key2` object, as shown in the following output: + +```json +{ + "key1": "val1", + "key2": { + "key3": { + "key4": "val2" + } + }, + "flattened-key2": { + "key3.key4": "val2" + } +} +``` + +### Remove processed fields + +Use the `remove_processed_fields` option when flattening all of an event's nested objects. This removes all the event's processed fields, as shown in the following example: + +```yaml +... + processor: + - flatten: + source: "" # empty string represents root of event + target: "" # empty string represents root of event + remove_processed_fields: true +... +``` + +For example, when the input event contains the following nested objects: + +```json +{ + "key1": "val1", + "key2": { + "key3": { + "key4": "val2" + } + }, + "list1": [ + { + "list2": [ + { + "name": "name1", + "value": "value1" + }, + { + "name": "name2", + "value": "value2" + } + ] + } + ] +} +``` + + +The `flatten` processor creates a flattened structure in which all processed fields are absent, as shown in the following output: + +```json +{ + "key1": "val1", + "key2.key3.key4": "val2", + "list1[0].list2[0].name": "name1", + "list1[0].list2[0].value": "value1", + "list1[0].list2[1].name": "name2", + "list1[0].list2[1].value": "value2", +} +``` + +### Exclude specific keys from flattening + +Use the `exclude_keys` option to prevent specific keys from being flattened in the output, as shown in the following example, where the `key2` value is excluded: + +```yaml +... + processor: + - flatten: + source: "" # empty string represents root of event + target: "" # empty string represents root of event + remove_processed_fields: true + exclude_keys: ["key2"] +... +``` + +For example, when the input event contains the following nested objects: + +```json +{ + "key1": "val1", + "key2": { + "key3": { + "key4": "val2" + } + }, + "list1": [ + { + "list2": [ + { + "name": "name1", + "value": "value1" + }, + { + "name": "name2", + "value": "value2" + } + ] + } + ] +} +``` + +All other nested objects in the input event, excluding the `key2` key, will be flattened, as shown in the following example: + +```json +{ + "key1": "val1", + "key2": { + "key3": { + "key4": "val2" + } + }, + "list1[0].list2[0].name": "name1", + "list1[0].list2[0].value": "value1", + "list1[0].list2[1].name": "name2", + "list1[0].list2[1].value": "value2", +} +``` + +### Remove list indexes + +Use the `remove_list_indices` option to convert the fields from the source map into lists and put the lists into the target field, as shown in the following example: + +```yaml +... + processor: + - flatten: + source: "" # empty string represents root of event + target: "" # empty string represents root of event + remove_processed_fields: true + remove_list_indices: true +... 
+```
+
+For example, when the input event contains the following nested objects:
+
+```json
+{
+  "key1": "val1",
+  "key2": {
+    "key3": {
+      "key4": "val2"
+    }
+  },
+  "list1": [
+    {
+      "list2": [
+        {
+          "name": "name1",
+          "value": "value1"
+        },
+        {
+          "name": "name2",
+          "value": "value2"
+        }
+      ]
+    }
+  ]
+}
+```
+
+The processor removes all indexes from the output and places them into the source map as a flattened, structured list, as shown in the following example:
+
+```json
+{
+  "key1": "val1",
+  "key2.key3.key4": "val2",
+  "list1[].list2[].name": ["name1","name2"],
+  "list1[].list2[].value": ["value1","value2"]
+}
+```
diff --git a/_data-prepper/pipelines/configuration/processors/geoip.md b/_data-prepper/pipelines/configuration/processors/geoip.md
new file mode 100644
index 0000000000..b7418c66c6
--- /dev/null
+++ b/_data-prepper/pipelines/configuration/processors/geoip.md
@@ -0,0 +1,67 @@
+---
+layout: default
+title: geoip
+parent: Processors
+grand_parent: Pipelines
+nav_order: 49
+---
+
+# geoip
+
+The `geoip` processor enriches events with geographic information extracted from IP addresses contained in the events.
+By default, Data Prepper uses the [MaxMind GeoLite2](https://dev.maxmind.com/geoip/geolite2-free-geolocation-data) geolocation database.
+Data Prepper administrators can configure the databases using the [`geoip_service`]({{site.url}}{{site.baseurl}}/data-prepper/managing-data-prepper/extensions/geoip_service) extension configuration.
+
+## Usage
+
+You can configure the `geoip` processor to work on a list of entries.
+
+The minimal configuration requires at least one entry, and each entry requires at least one source field.
+
+The following configuration extracts all available geolocation data from the IP address provided in the field named `clientip`.
+It will write the geolocation data to a new field named `geo`, the default target when none is configured:
+
+```
+my-pipeline:
+  processor:
+    - geoip:
+        entries:
+          - source: clientip
+```
+
+The following example includes only the Autonomous System Number (ASN) fields and puts the geolocation data into a field named `clientlocation`:
+
+```
+my-pipeline:
+  processor:
+    - geoip:
+        entries:
+          - source: clientip
+            target: clientlocation
+            include_fields: [asn, asn_organization, network]
+```
+
+
+## Configuration
+
+You can use the following options to configure the `geoip` processor.
+
+Option | Required | Type | Description
+:--- | :--- | :--- | :---
+`entries` | Yes | [entry](#entry) list | The list of entries marked for enrichment.
+`geoip_when` | No | String | Specifies under what condition the `geoip` processor should perform matching. Default is no condition.
+`tags_on_no_valid_ip` | No | String | The tags to add to the event metadata if the source field is not a valid IP address. This includes the localhost IP address.
+`tags_on_ip_not_found` | No | String | The tags to add to the event metadata if the `geoip` processor is unable to find a location for the IP address.
+`tags_on_engine_failure` | No | String | The tags to add to the event metadata if the `geoip` processor is unable to enrich an event due to an engine failure.
+
+## entry
+
+The following parameters allow you to configure a single geolocation entry. Each entry corresponds to a single IP address.
+
+Option | Required | Type | Description
+:--- | :--- | :--- | :---
+`source` | Yes | String | The key of the source field containing the IP address to geolocate.
+`target` | No | String | The key of the target field in which to save the geolocation data. Default is `geo`.
+`include_fields` | No | String list | The list of geolocation fields to include in the `target` object. By default, this is all the fields provided by the configured databases. +`exclude_fields` | No | String list | The list of geolocation fields to exclude from the `target` object. + diff --git a/_data-prepper/pipelines/configuration/processors/grok.md b/_data-prepper/pipelines/configuration/processors/grok.md index d1eea278d2..16f72c4968 100644 --- a/_data-prepper/pipelines/configuration/processors/grok.md +++ b/_data-prepper/pipelines/configuration/processors/grok.md @@ -3,7 +3,7 @@ layout: default title: Grok parent: Processors grand_parent: Pipelines -nav_order: 54 +nav_order: 50 --- # Grok @@ -15,26 +15,25 @@ The Grok processor uses pattern matching to structure and extract important keys The following table describes options you can use with the Grok processor to structure your data and make your data easier to query. Option | Required | Type | Description -:--- | :--- | :--- | :--- -break_on_match | No | Boolean | Specifies whether to match all patterns or stop once the first successful match is found. Default value is `true`. -grok_when | No | String | Specifies under what condition the `Grok` processor should perform matching. Default is no condition. -keep_empty_captures | No | Boolean | Enables the preservation of `null` captures. Default value is `false`. -keys_to_overwrite | No | List | Specifies which existing keys will be overwritten if there is a capture with the same key value. Default value is `[]`. -match | No | Map | Specifies which keys to match specific patterns against. Default value is an empty body. -named_captures_only | No | Boolean | Specifies whether to keep only named captures. Default value is `true`. -pattern_definitions | No | Map | Allows for custom pattern use inline. Default value is an empty body. -patterns_directories | No | List | Specifies the path of directories that contain customer pattern files. Default value is an empty list. -pattern_files_glob | No | String | Specifies which pattern files to use from the directories specified for `pattern_directories`. Default value is `*`. -target_key | No | String | Specifies a parent-level key used to store all captures. Default value is `null`. -timeout_millis | No | Integer | The maximum amount of time during which matching occurs. Setting to `0` disables the timeout. Default value is `30,000`. - - +:--- | :--- |:--- | :--- +`break_on_match` | No | Boolean | Specifies whether to match all patterns (`true`) or stop once the first successful match is found (`false`). Default is `true`. +`grok_when` | No | String | Specifies under what condition the `grok` processor should perform matching. Default is no condition. +`keep_empty_captures` | No | Boolean | Enables the preservation of `null` captures from the processed output. Default is `false`. +`keys_to_overwrite` | No | List | Specifies which existing keys will be overwritten if there is a capture with the same key value. Default is `[]`. +`match` | No | Map | Specifies which keys should match specific patterns. Default is an empty response body. +`named_captures_only` | No | Boolean | Specifies whether to keep only named captures. Default is `true`. +`pattern_definitions` | No | Map | Allows for a custom pattern that can be used inline inside the response body. Default is an empty response body. +`patterns_directories` | No | List | Specifies which directory paths contain the custom pattern files. Default is an empty list. 
+`pattern_files_glob` | No | String | Specifies which pattern files to use from the directories specified for `pattern_directories`. Default is `*`.
+`target_key` | No | String | Specifies a parent-level key used to store all captures. Default is `null`.
+`timeout_millis` | No | Integer | The maximum amount of time during which matching occurs. Setting to `0` prevents any matching from occurring. Default is `30,000`.
+`performance_metadata` | No | Boolean | Whether to add performance metadata to events. Default is `false`. For more information, see [Grok performance metadata](#grok-performance-metadata).
+
 
 ## Conditional grok
 
-The Grok processor can be configured to run conditionally by using the `grok_when` option. The following is an example Grok processor configuration that uses `grok_when`:
+The `grok` processor can be configured to run conditionally by using the `grok_when` option. The following is an example Grok processor configuration that uses `grok_when`:
+
 ```
 processor:
   - grok:
@@ -46,8 +45,36 @@ processor:
       match:
        message: ['%{IPV6:clientip} %{WORD:request} %{POSINT:bytes}']
 ```
+{% include copy.html %}
+
 The `grok_when` option can take a conditional expression. This expression is detailed in the [Expression syntax](https://opensearch.org/docs/latest/data-prepper/pipelines/expression-syntax/) documentation.
 
+## Grok performance metadata
+
+When the `performance_metadata` option is set to `true`, the `grok` processor adds the following metadata keys to each event:
+
+* `_total_grok_processing_time`: The total amount of time, in milliseconds, that the `grok` processor takes to match the event. This is the sum of the processing time based on all of the `grok` processors that ran on the event and have the `performance_metadata` option enabled.
+* `_total_grok_patterns_attempted`: The total number of `grok` pattern match attempts across all `grok` processors that ran on the event.
+
+To include Grok performance metadata when the event is sent to the sink inside the pipeline, use the `add_entries` processor to describe the metadata you want to include, as shown in the following example:
+
+
+```yaml
+processor:
+  - grok:
+      performance_metadata: true
+      match:
+        log: "%{COMMONAPACHELOG}"
+  - add_entries:
+      entries:
+        - add_when: 'getMetadata("_total_grok_patterns_attempted") != null'
+          key: "grok_patterns_attempted"
+          value_expression: 'getMetadata("_total_grok_patterns_attempted")'
+        - add_when: 'getMetadata("_total_grok_processing_time") != null'
+          key: "grok_time_spent"
+          value_expression: 'getMetadata("_total_grok_processing_time")'
+```
+
 ## Metrics
 
 The following table describes common [Abstract processor](https://github.com/opensearch-project/data-prepper/blob/main/data-prepper-api/src/main/java/org/opensearch/dataprepper/model/processor/AbstractProcessor.java) metrics.
diff --git a/_data-prepper/pipelines/configuration/processors/list-to-map.md b/_data-prepper/pipelines/configuration/processors/list-to-map.md
index 4b137f5ce8..15a90ffc24 100644
--- a/_data-prepper/pipelines/configuration/processors/list-to-map.md
+++ b/_data-prepper/pipelines/configuration/processors/list-to-map.md
@@ -16,10 +16,12 @@ The following table describes the configuration options used to generate target
 
 Option | Required | Type | Description
 :--- | :--- | :--- | :---
-`key` | Yes | String | The key of the fields to be extracted as keys in the generated mappings.
 `source` | Yes | String | The list of objects with `key` fields to be converted into keys for the generated map.
 `target` | No | String | The target for the generated map. When not specified, the generated map will be placed in the root node.
+`key` | Conditionally | String | The key of the fields to be extracted as keys in the generated mappings. Must be specified if `use_source_key` is `false`.
+`use_source_key` | No | Boolean | When `true`, keys in the generated map will use original keys from the source. Default is `false`.
 `value_key` | No | String | When specified, values given a `value_key` in objects contained in the source list will be extracted and converted into the value specified by this option based on the generated map. When not specified, objects contained in the source list retain their original value when mapped.
+`extract_value` | No | Boolean | When `true`, object values from the source list will be extracted and added to the generated map. When `false`, object values from the source list are added to the generated map as they appear in the source list. Default is `false`.
 `flatten` | No | Boolean | When `true`, values in the generated map output flatten into single items based on the `flattened_element`. Otherwise, objects mapped to values from the generated map appear as lists.
 `flattened_element` | Conditionally | String | The element to keep, either `first` or `last`, when `flatten` is set to `true`.
 
@@ -302,4 +304,52 @@ Some objects in the response may have more than one element in their values, as
     "val-c"
   ]
 }
+```
+
+### Example: `use_source_key` and `extract_value` set to `true`
+
+The following example `pipeline.yaml` file sets `use_source_key` and `extract_value` to `true`, causing the processor to use the original source keys and to extract object values into the generated fields:
+
+```yaml
+pipeline:
+  source:
+    file:
+      path: "/full/path/to/logs_json.log"
+      record_type: "event"
+      format: "json"
+  processor:
+    - list_to_map:
+        source: "mylist"
+        use_source_key: true
+        extract_value: true
+  sink:
+    - stdout:
+```
+{% include copy.html %}
+
+Object values from `mylist` are extracted and added to fields with the source keys `name` and `value`, as shown in the following response:
+
+```json
+{
+  "mylist": [
+    {
+      "name": "a",
+      "value": "val-a"
+    },
+    {
+      "name": "b",
+      "value": "val-b1"
+    },
+    {
+      "name": "b",
+      "value": "val-b2"
+    },
+    {
+      "name": "c",
+      "value": "val-c"
+    }
+  ],
+  "name": ["a", "b", "b", "c"],
+  "value": ["val-a", "val-b1", "val-b2", "val-c"]
+}
 ```
\ No newline at end of file
diff --git a/_data-prepper/pipelines/configuration/processors/map-to-list.md b/_data-prepper/pipelines/configuration/processors/map-to-list.md
new file mode 100644
index 0000000000..f3393e6c46
--- /dev/null
+++ b/_data-prepper/pipelines/configuration/processors/map-to-list.md
@@ -0,0 +1,277 @@
+---
+layout: default
+title: map_to_list
+parent: Processors
+grand_parent: Pipelines
+nav_order: 63
+---
+
+# map_to_list
+
+The `map_to_list` processor converts a map of key-value pairs to a list of objects. Each object contains the key and value in separate fields.
+
+## Configuration
+
+The following table describes the configuration options for the `map_to_list` processor.
+
+Option | Required | Type | Description
+:--- | :--- | :--- | :---
+`source` | Yes | String | The source map used to perform the mapping operation. When set to an empty string (`""`), it will use the root of the event as the `source`.
+`target` | Yes | String | The target for the generated list.
+`key_name` | No | String | The name of the field in which to store the original key. Default is `key`.
+`value_name` | No | String | The name of the field in which to store the original value. Default is `value`. +`exclude_keys` | No | List | The keys in the source map that will be excluded from processing. Default is an empty list (`[]`). +`remove_processed_fields` | No | Boolean | When `true`, the processor will remove the processed fields from the source map. Default is `false`. +`convert_field_to_list` | No | Boolean | If `true`, the processor will convert the fields from the source map into lists and place them in fields in the target list. Default is `false`. +`map_to_list_when` | No | String | A [conditional expression](https://opensearch.org/docs/latest/data-prepper/pipelines/expression-syntax/), such as `/some-key == "test"'`, that will be evaluated to determine whether the processor will be run on the event. Default is `null`. All events will be processed unless otherwise stated. +`tags_on_failure` | No | List | A list of tags to add to the event metadata when the event fails to process. + +## Usage + +The following examples show how the `map_to_list` processor can be used in your pipeline. + +### Example: Minimum configuration + +The following example shows the `map_to_list` processor with only the required parameters, `source` and `target`, configured: + +```yaml +... + processor: + - map_to_list: + source: "my-map" + target: "my-list" +... +``` +{% include copy.html %} + +When the input event contains the following data: + +```json +{ + "my-map": { + "key1": "value1", + "key2": "value2", + "key3": "value3" + } +} +``` + + +The processed event will contain the following output: + +```json +{ + "my-list": [ + { + "key": "key1", + "value": "value1" + }, + { + "key": "key2", + "value": "value2" + }, + { + "key": "key3", + "value": "value3" + } + ], + "my-map": { + "key1": "value1", + "key2": "value2", + "key3": "value3" + } +} +``` + +### Example: Custom key name and value name + +The following example shows how to configure a custom key name and value name: + +```yaml +... + processor: + - map_to_list: + source: "my-map" + target: "my-list" + key_name: "name" + value_name: "data" +... +``` +{% include copy.html %} + +When the input event contains the following data: + +```json +{ + "my-map": { + "key1": "value1", + "key2": "value2", + "key3": "value3" + } +} +``` + +The processed event will contain the following output: + +```json +{ + "my-list": [ + { + "name": "key1", + "data": "value1" + }, + { + "name": "key2", + "data": "value2" + }, + { + "name": "key3", + "data": "value3" + } + ], + "my-map": { + "key1": "value1", + "key2": "value2", + "key3": "value3" + } +} +``` + +### Example: Exclude specific keys from processing and remove any processed fields + +The following example shows how to exclude specific keys and remove any processed fields from the output: + +```yaml +... + processor: + - map_to_list: + source: "my-map" + target: "my-list" + exclude_keys: ["key1"] + remove_processed_fields: true +... 
+``` +{% include copy.html %} + +When the input event contains the following data: +```json +{ + "my-map": { + "key1": "value1", + "key2": "value2", + "key3": "value3" + } +} +``` + +The processed event will remove the "key2" and "key3" fields, but the "my-map" object, "key1", will remain, as shown in the following output: + +```json +{ + "my-list": [ + { + "key": "key2", + "value": "value2" + }, + { + "key": "key3", + "value": "value3" + } + ], + "my-map": { + "key1": "value1" + } +} +``` + +### Example: Use convert_field_to_list + +The following example shows how to use the `convert_field_to_list` option in the processor: + +```yaml +... + processor: + - map_to_list: + source: "my-map" + target: "my-list" + convert_field_to_list: true +... +``` +{% include copy.html %} + +When the input event contains the following data: + +```json +{ + "my-map": { + "key1": "value1", + "key2": "value2", + "key3": "value3" + } +} +``` + +The processed event will convert all fields into lists, as shown in the following output: + +```json +{ + "my-list": [ + ["key1", "value1"], + ["key2", "value2"], + ["key3", "value3"] + ], + "my-map": { + "key1": "value1", + "key2": "value2", + "key3": "value3" + } +} +``` + +### Example: Use the event root as the source + +The following example shows how you can use an event's root as the source by setting the `source` setting to an empty string (`""`): + +```yaml +... + processor: + - map_to_list: + source: "" + target: "my-list" +... +``` +{% include copy.html %} + +When the input event contains the following data: + +```json +{ + "key1": "value1", + "key2": "value2", + "key3": "value3" +} +``` + +The processed event will contain the following output: + +```json +{ + "my-list": [ + { + "key": "key1", + "value": "value1" + }, + { + "key": "key2", + "value": "value2" + }, + { + "key": "key3", + "value": "value3" + } + ], + "key1": "value1", + "key2": "value2", + "key3": "value3" +} +``` diff --git a/_data-prepper/pipelines/configuration/processors/mutate-event.md b/_data-prepper/pipelines/configuration/processors/mutate-event.md index 032bc89fcd..9b3b2afb33 100644 --- a/_data-prepper/pipelines/configuration/processors/mutate-event.md +++ b/_data-prepper/pipelines/configuration/processors/mutate-event.md @@ -11,11 +11,14 @@ nav_order: 65 Mutate event processors allow you to modify events in Data Prepper. The following processors are available: * [add_entries]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/add-entries/) allows you to add entries to an event. +* [convert_entry_type]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/convert_entry_type/) allows you to convert value types in an event. * [copy_values]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/copy-values/) allows you to copy values within an event. * [delete_entries]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/delete-entries/) allows you to delete entries from an event. -* [rename_keys]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/rename-keys/) allows you to rename keys in an event. -* [convert_entry_type]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/convert_entry_type/) allows you to convert value types in an event. 
* [list_to_map]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/list-to-map) allows you to convert list of objects from an event where each object contains a `key` field into a map of target keys. +* `map_to_list` allows you to convert a map of objects from an event, where each object contains a `key` field, into a list of target keys. +* [rename_keys]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/rename-keys/) allows you to rename keys in an event. +* [select_entries]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/select-entries/) allows you to select entries from an event. + diff --git a/_data-prepper/pipelines/configuration/processors/obfuscate.md b/_data-prepper/pipelines/configuration/processors/obfuscate.md index 4c33d8baab..13d906acb3 100644 --- a/_data-prepper/pipelines/configuration/processors/obfuscate.md +++ b/_data-prepper/pipelines/configuration/processors/obfuscate.md @@ -67,6 +67,8 @@ Use the following configuration options with the `obfuscate` processor. | `source` | Yes | The source field to obfuscate. | | `target` | No | The new field in which to store the obfuscated value. This leaves the original source field unchanged. When no `target` is provided, the source field updates with the obfuscated value. | | `patterns` | No | A list of regex patterns that allow you to obfuscate specific parts of a field. Only parts that match the regex pattern will obfuscate. When not provided, the processor obfuscates the whole field. | +| `obfuscate_when` | No | Specifies under what condition the Obfuscate processor should perform matching. Default is no condition. | +| `tags_on_match_failure` | No | The tag to add to an event if the obfuscate processor fails to match the pattern. | | `action` | No | The obfuscation action. As of Data Prepper 2.3, only the `mask` action is supported. | You can customize the `mask` action with the following optional configuration options. diff --git a/_data-prepper/pipelines/configuration/processors/otel-trace-raw.md b/_data-prepper/pipelines/configuration/processors/otel-trace-raw.md index ed2716c4dc..395956a668 100644 --- a/_data-prepper/pipelines/configuration/processors/otel-trace-raw.md +++ b/_data-prepper/pipelines/configuration/processors/otel-trace-raw.md @@ -8,7 +8,7 @@ nav_order: 75 # otel_trace -The `otel_trace` processor completes trace-group-related fields in all incoming Data Prepper span records by state caching the root span information for each `tradeId`. +The `otel_trace` processor completes trace-group-related fields in all incoming Data Prepper span records by state caching the root span information for each `traceId`. ## Parameters @@ -41,4 +41,4 @@ The following table describes common [Abstract processor](https://github.com/ope The `otel_trace` processor includes the following custom metrics: * `traceGroupCacheCount`: The number of trace groups in the trace group cache. -* `spanSetCount`: The number of span sets in the span set collection. \ No newline at end of file +* `spanSetCount`: The number of span sets in the span set collection. 
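+
+## Example pipeline
+
+The `otel_trace` processor typically sits between the `otel_trace_source` source and an `opensearch` sink that uses the `trace-analytics-raw` index type. The following is a minimal sketch only; the port, SSL, host, and credential settings are illustrative assumptions and will differ in a real deployment:
+
+```yaml
+otel-trace-pipeline:
+  source:
+    otel_trace_source:
+      ssl: false                            # illustrative only; enable SSL in production
+  processor:
+    - otel_trace:                           # completes trace-group fields using cached root span data
+  sink:
+    - opensearch:
+        hosts: ["https://localhost:9200"]   # assumed local cluster; add credentials as needed
+        index_type: trace-analytics-raw
+```
+{% include copy.html %}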
diff --git a/_data-prepper/pipelines/configuration/processors/parse-ion.md b/_data-prepper/pipelines/configuration/processors/parse-ion.md new file mode 100644 index 0000000000..0edd446c42 --- /dev/null +++ b/_data-prepper/pipelines/configuration/processors/parse-ion.md @@ -0,0 +1,56 @@ +--- +layout: default +title: parse_ion +parent: Processors +grand_parent: Pipelines +nav_order: 79 +--- + +# parse_ion + +The `parse_ion` processor parses [Amazon Ion](https://amazon-ion.github.io/ion-docs/) data. + +## Configuration + +You can configure the `parse_ion` processor with the following options. + +| Option | Required | Type | Description | +| :--- | :--- | :--- | :--- | +| `source` | No | String | The field in the `event` that is parsed. Default value is `message`. | +| `destination` | No | String | The destination field of the parsed JSON. Defaults to the root of the `event`. Cannot be `""`, `/`, or any white-space-only `string` because these are not valid `event` fields. | +| `pointer` | No | String | A JSON pointer to the field to be parsed. There is no `pointer` by default, meaning that the entire `source` is parsed. The `pointer` can access JSON array indexes as well. If the JSON pointer is invalid, then the entire `source` data is parsed into the outgoing `event`. If the key that is pointed to already exists in the `event` and the `destination` is the root, then the pointer uses the entire path of the key. | +| `tags_on_failure` | No | String | A list of strings that specify the tags to be set in the event that the processors fails or an unknown exception occurs while parsing. + +## Usage + +The following examples show how to use the `parse_ion` processor in your pipeline. + +### Example: Minimum configuration + +The following example shows the minimum configuration for the `parse_ion` processor: + +```yaml +parse-json-pipeline: + source: + stdin: + processor: + - parse_json: + source: "my_ion" + sink: + - stdout: +``` +{% include copy.html %} + +When the input event contains the following data: + +``` +{"my_ion": "{ion_value1: \"hello\", ion_value2: \"world\"}"} +``` + +The processor parses the event into the following output: + +``` +{"ion_value1": "hello", "ion_value2" : "world"} +``` + + diff --git a/_data-prepper/pipelines/configuration/processors/parse-xml.md b/_data-prepper/pipelines/configuration/processors/parse-xml.md new file mode 100644 index 0000000000..861705da2b --- /dev/null +++ b/_data-prepper/pipelines/configuration/processors/parse-xml.md @@ -0,0 +1,55 @@ +--- +layout: default +title: parse_xml +parent: Processors +grand_parent: Pipelines +nav_order: 83 +--- + +# parse_xml + +The `parse_xml` processor parses XML data for an event. + +## Configuration + +You can configure the `parse_xml` processor with the following options. + +| Option | Required | Type | Description | +| :--- | :--- | :--- | :--- | +| `source` | No | String | Specifies which `event` field to parse. | +| `destination` | No | String | The destination field of the parsed XML. Defaults to the root of the `event`. Cannot be `""`, `/`, or any white-space-only string because these are not valid `event` fields. | +| `pointer` | No | String | A JSON pointer to the field to be parsed. The value is null by default, meaning that the entire `source` is parsed. The `pointer` can access JSON array indexes as well. If the JSON pointer is invalid, then the entire `source` data is parsed into the outgoing `event` object. 
If the key that is pointed to already exists in the `event` object and the `destination` is the root, then the pointer uses the entire path of the key. | +| `parse_when` | No | String | Specifies under what conditions the processor should perform parsing. Default is no condition. Accepts a Data Prepper expression string following the [Data Prepper Expression Syntax]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/expression-syntax/). | +| `tags_on_failure` | No | String | A list of strings that specify the tags to be set if the processor fails or an unknown exception occurs while parsing. + +## Usage + +The following examples show how to use the `parse_xml` processor in your pipeline. + +### Example: Minimum configuration + +The following example shows the minimum configuration for the `parse_xml` processor: + +```yaml +parse-xml-pipeline: + source: + stdin: + processor: + - parse_xml: + source: "my_xml" + sink: + - stdout: +``` +{% include copy.html %} + +When the input event contains the following data: + +``` +{ "my_xml": "John Doe30" } +``` + +The processor parses the event into the following output: + +``` +{ "name": "John Doe", "age": "30" } +``` \ No newline at end of file diff --git a/_data-prepper/pipelines/configuration/processors/select-entries.md b/_data-prepper/pipelines/configuration/processors/select-entries.md new file mode 100644 index 0000000000..8f1abec055 --- /dev/null +++ b/_data-prepper/pipelines/configuration/processors/select-entries.md @@ -0,0 +1,50 @@ +--- +layout: default +title: select_entries +parent: Processors +grand_parent: Pipelines +nav_order: 59 +--- + +# select_entries + +The `select_entries` processor selects entries from a Data Prepper event. Only the selected entries will remain in the event, and all other entries will be removed from the event. + +## Configuration + +You can configure the `select_entries` processor using the following options. + +| Option | Required | Description | +| :--- | :--- | :--- | +| `include_keys` | Yes | A list of keys to be selected from an event. | +| `select_when` | No | A [conditional expression](https://opensearch.org/docs/latest/data-prepper/pipelines/expression-syntax/), such as `/some-key == "test"'`, that will be evaluated to determine whether the processor will be run on the event. | + +## Usage + +The following example shows how to configure the `select_entries` processor in the `pipeline.yaml` file: + +```yaml +pipeline: + source: + ... + .... + processor: + - select_entries: + include_keys: [ "key1", "key2" ] + select_when: '/some_key == "test"' + sink: +``` +{% include copy.html %} + + +For example, when your source contains the following event record: + +```json +{"message": "hello", "key1" : "value1", "key2" : "value2", "some_key" : "test"} +``` + +The `select_entries` processor includes only `key1` and `key2` in the processed output: + +```json +{"key1": "value1", "key2": "value2"} +``` diff --git a/_data-prepper/pipelines/configuration/processors/split-event.md b/_data-prepper/pipelines/configuration/processors/split-event.md new file mode 100644 index 0000000000..f059fe5b95 --- /dev/null +++ b/_data-prepper/pipelines/configuration/processors/split-event.md @@ -0,0 +1,52 @@ +--- +layout: default +title: split-event +parent: Processors +grand_parent: Pipelines +nav_order: 96 +--- + +# split-event + +The `split-event` processor is used to split events based on a delimiter and generates multiple events from a user-specified field. 
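+
+For example, a field that holds several space- or comma-separated terms can be split into one event per term. The following minimal sketch uses the `delimiter_regex` option described in the configuration table below; the pipeline name and the `tags` field are illustrative assumptions:
+
+```yaml
+split-event-regex-pipeline:
+  source:
+    http:
+  processor:
+    - split_event:
+        field: tags
+        delimiter_regex: '[ ,]+'   # split on one or more spaces or commas
+  sink:
+    - stdout:
+```
+{% include copy.html %}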
+ +## Configuration + +The following table describes the configuration options for the `split-event` processor. + +| Option | Type | Description | +|------------------|---------|-----------------------------------------------------------------------------------------------| +| `field` | String | The event field to be split. | +| `delimiter_regex`| String | The regular expression used as the delimiter for splitting the field. | +| `delimiter` | String | The delimiter used for splitting the field. If not specified, the default delimiter is used. | + +# Usage + +To use the `split-event` processor, add the following to your `pipelines.yaml` file: + +``` +split-event-pipeline: + source: + http: + processor: + - split_event: + field: query + delimiter: ' ' + sink: + - stdout: +``` +{% include copy.html %} + +When an event contains the following example input: + +``` +{"query" : "open source", "some_other_field" : "abc" } +``` + +The input will be split into multiple events based on the `query` field, with the delimiter set as white space, as shown in the following example: + +``` +{"query" : "open", "some_other_field" : "abc" } +{"query" : "source", "some_other_field" : "abc" } +``` + diff --git a/_data-prepper/pipelines/configuration/processors/truncate.md b/_data-prepper/pipelines/configuration/processors/truncate.md new file mode 100644 index 0000000000..3714d80847 --- /dev/null +++ b/_data-prepper/pipelines/configuration/processors/truncate.md @@ -0,0 +1,107 @@ +--- +layout: default +title: truncate +parent: Processors +grand_parent: Pipelines +nav_order: 121 +--- + +# truncate + +The `truncate` processor truncates a key's value at the beginning, the end, or on both sides of the value string, based on the processor's configuration. If the key's value is a list, then each member in the string list is truncated. Non-string members of the list are not truncated. When the `truncate_when` option is provided, input is truncated only when the condition specified is `true` for the event being processed. + +## Configuration + +You can configure the `truncate` processor using the following options. + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +`entries` | Yes | String list | A list of entries to add to an event. +`source_keys` | No | String list | The list of source keys that will be modified by the processor. The default value is an empty list, which indicates that all values will be truncated. +`truncate_when` | No | Conditional expression | A condition that, when met, determines when the truncate operation is performed. +`start_at` | No | Integer | Where in the string value to start truncation. Default is `0`, which specifies to start truncation at the beginning of each key's value. +`length` | No | Integer| The length of the string after truncation. When not specified, the processor will measure the length based on where the string ends. + +Either the `start_at` or `length` options must be present in the configuration in order for the `truncate` processor to run. You can define both values in the configuration in order to further customize where truncation occurs in the string. 
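+
+For example, because `source_keys` defaults to an empty list, an entry that sets only `length` truncates every string value in the event. The following minimal sketch assumes the same file source used in the examples below:
+
+```yaml
+pipeline:
+  source:
+    file:
+      path: "/full/path/to/logs_json.log"
+      record_type: "event"
+      format: "json"
+  processor:
+    - truncate:
+        entries:
+          - length: 10    # no source_keys specified, so every string value is truncated to 10 characters
+  sink:
+    - stdout:
+```
+{% include copy.html %}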
+ +## Usage + +The following examples show how to configure the `truncate` processor in the `pipeline.yaml` file: + +## Example: Minimum configuration + +The following example shows the minimum configuration for the `truncate` processor: + +```yaml +pipeline: + source: + file: + path: "/full/path/to/logs_json.log" + record_type: "event" + format: "json" + processor: + - truncate: + entries: + - source_keys: ["message1", "message2"] + length: 5 + - source_keys: ["info"] + length: 6 + start_at: 4 + - source_keys: ["log"] + start_at: 5 + sink: + - stdout: +``` + +For example, the following event contains several keys with string values: + +```json +{"message1": "hello,world", "message2": "test message", "info", "new information", "log": "test log message"} +``` + +The `truncate` processor produces the following output, where: + +- The `start_at` setting is `0` for the `message1` and `message 2` keys, indicating that truncation will begin at the start of the string, with the string itself truncated to a length of `5`. +- The `start_at` setting is `4` for the `info` key, indicating that truncation will begin at letter `i` of the string, with the string truncated to a length of `6`. +- The `start_at` setting is `5` for the `log` key, with no length specified, indicating that truncation will begin at letter `l` of the string. + +```json +{"message1":"hello", "message2":"test ", "info":"inform", "log": "log message"} +``` + + +## Example: Using `truncate_when` + +The following example configuration shows the `truncate` processor with the `truncate_when` option configured: + +```yaml +pipeline: + source: + file: + path: "/full/path/to/logs_json.log" + record_type: "event" + format: "json" + processor: + - truncate: + entries: + - source_keys: ["message"] + length: 5 + start_at: 8 + truncate_when: '/id == 1' + sink: + - stdout: +``` + +The following example contains two events: + +```json +{"message": "hello, world", "id": 1} +{"message": "hello, world,not-truncated", "id": 2} +``` + +When the `truncate` processor runs on the events, only the first event is truncated because the `id` key contains a value of `1`: + +```json +{"message": "world", "id": 1} +{"message": "hello, world,not-truncated", "id": 2} +``` diff --git a/_data-prepper/pipelines/configuration/sinks/file.md b/_data-prepper/pipelines/configuration/sinks/file.md index 74af5a1803..bd4fec1865 100644 --- a/_data-prepper/pipelines/configuration/sinks/file.md +++ b/_data-prepper/pipelines/configuration/sinks/file.md @@ -17,6 +17,7 @@ The following table describes options you can configure for the `file` sink. Option | Required | Type | Description :--- | :--- | :--- | :--- path | Yes | String | Path for the output file (e.g. `logs/my-transformed-log.log`). +append | No | Boolean | When `true`, the sink file is opened in append mode. ## Usage diff --git a/_data-prepper/pipelines/configuration/sinks/opensearch.md b/_data-prepper/pipelines/configuration/sinks/opensearch.md index b4861f68fd..7b8e99339f 100644 --- a/_data-prepper/pipelines/configuration/sinks/opensearch.md +++ b/_data-prepper/pipelines/configuration/sinks/opensearch.md @@ -50,45 +50,80 @@ pipeline: The following table describes options you can configure for the `opensearch` sink. +Option | Required | Type | Description +:--- | :--- |:---| :--- +`hosts` | Yes | List | A list of OpenSearch hosts to write to, such as `["https://localhost:9200", "https://remote-cluster:9200"]`. +`cert` | No | String | The path to the security certificate. 
For example, `"config/root-ca.pem"` if the cluster uses the OpenSearch Security plugin. +`username` | No | String | The username for HTTP basic authentication. +`password` | No | String | The password for HTTP basic authentication. +`aws` | No | AWS | The [AWS](#aws) configuration. +[max_retries](#configure-max_retries) | No | Integer | The maximum number of times that the `opensearch` sink should try to push data to the OpenSearch server before considering it to be a failure. Defaults to `Integer.MAX_VALUE`. When not provided, the sink will try to push data to the OpenSearch server indefinitely and exponential backoff will increase the waiting time before a retry. +`aws_sigv4` | No | Boolean | **Deprecated in Data Prepper 2.7.** Default is `false`. Whether to use AWS Identity and Access Management (IAM) signing to connect to an Amazon OpenSearch Service domain. For your access key, secret key, and optional session token, Data Prepper uses the default credential chain (environment variables, Java system properties, `~/.aws/credential`). +`aws_region` | No | String | **Deprecated in Data Prepper 2.7.** The AWS Region (for example, `"us-east-1"`) for the domain when you are connecting to Amazon OpenSearch Service. +`aws_sts_role_arn` | No | String | **Deprecated in Data Prepper 2.7.** The IAM role that the plugin uses to sign requests sent to Amazon OpenSearch Service. If this information is not provided, then the plugin uses the default credentials. +`socket_timeout` | No | Integer | The timeout value, in milliseconds, when waiting for data to be returned (the maximum period of inactivity between two consecutive data packets). A timeout value of `0` is interpreted as an infinite timeout. If this timeout value is negative or not set, then the underlying Apache HttpClient will rely on operating system settings to manage socket timeouts. +`connect_timeout` | No | Integer| The timeout value, in milliseconds, when requesting a connection from the connection manager. A timeout value of `0` is interpreted as an infinite timeout. If this timeout value is negative or not set, the underlying Apache HttpClient will rely on operating system settings to manage connection timeouts. +`insecure` | No | Boolean | Whether or not to verify SSL certificates. If set to `true`, then certificate authority (CA) certificate verification is disabled and insecure HTTP requests are sent instead. Default is `false`. +`proxy` | No | String | The address of the [forward HTTP proxy server](https://en.wikipedia.org/wiki/Proxy_server). The format is `"<hostname or IP>:<port>"` (for example, `"example.com:8100"`, `"http://example.com:8100"`, `"112.112.112.112:8100"`). The port number cannot be omitted. +`index` | Conditionally | String | The name of the export index. Only required when the `index_type` is `custom`. The index can be a plain string, such as `my-index-name`, contain [Java date-time patterns](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html), such as `my-index-${yyyy.MM.dd}` or `my-${yyyy-MM-dd-HH}-index`, be formatted using field values, such as `my-index-${/my_field}`, or use [Data Prepper expressions](https://opensearch.org/docs/latest/data-prepper/pipelines/expression-syntax/), such as `my-index-${getMetadata(\"my_metadata_field\"}`. All formatting options can be combined to provide flexibility when creating static, dynamic, and rolling indexes. +`index_type` | No | String | Tells the sink plugin what type of data it is handling. 
Valid values are `custom`, `trace-analytics-raw`, `trace-analytics-service-map`, or `management-disabled`. Default is `custom`. +`template_type` | No | String | Defines what type of OpenSearch template to use. Available options are `v1` and `index-template`. The default value is `v1`, which uses the original OpenSearch templates available at the `_template` API endpoints. The `index-template` option uses composable [index templates]({{site.url}}{{site.baseurl}}/opensearch/index-templates/), which are available through the OpenSearch `_index_template` API. Composable index types offer more flexibility than the default and are necessary when an OpenSearch cluster contains existing index templates. Composable templates are available for all versions of OpenSearch and some later versions of Elasticsearch. When `distribution_version` is set to `es6`, Data Prepper enforces the `template_type` as `v1`. +`template_file` | No | String | The path to a JSON [index template]({{site.url}}{{site.baseurl}}/opensearch/index-templates/) file, such as `/your/local/template-file.json`, when `index_type` is set to `custom`. For an example template file, see [otel-v1-apm-span-index-template.json](https://github.com/opensearch-project/data-prepper/blob/main/data-prepper-plugins/opensearch/src/main/resources/otel-v1-apm-span-index-template.json). If you supply a template file, then it must match the template format specified by the `template_type` parameter. +`template_content` | No | JSON | Contains all the inline JSON found inside of the index [index template]({{site.url}}{{site.baseurl}}/opensearch/index-templates/). For an example of template content, see [the example template content](#example_template_content). +`document_id_field` | No | String | **Deprecated in Data Prepper 2.7 in favor of `document_id`.** The field from the source data to use for the OpenSearch document ID (for example, `"my-field"`) if `index_type` is `custom`. +`document_id` | No | String | A format string to use as the `_id` in OpenSearch documents. To specify a single field in an event, use `${/my_field}`. You can also use Data Prepper expressions to construct the `document_id`, for example, `${getMetadata(\"some_metadata_key\")}`. These options can be combined into more complex formats, such as `${/my_field}-test-${getMetadata(\"some_metadata_key\")}`. +`document_version` | No | String | A format string to use as the `_version` in OpenSearch documents. To specify a single field in an event, use `${/my_field}`. You can also use Data Prepper expressions to construct the `document_version`, for example, `${getMetadata(\"some_metadata_key\")}`. These options can be combined into more complex versions, such as `${/my_field}${getMetadata(\"some_metadata_key\")}`. The `document_version` format must evaluate to a long type and can only be used when `document_version_type` is set to either `external` or `external_gte`. +`document_version_type` | No | String | The document version type for index operations. Must be one of `external`, `external_gte`, or `internal`. If set to `external` or `external_gte`, then `document_version` is required. +`dlq_file` | No | String | The path to your preferred dead letter queue file (such as `/your/local/dlq-file`). Data Prepper writes to this file when it fails to index a document on the OpenSearch cluster. +`dlq` | No | N/A | [DLQ configurations]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/dlq/). 
+`bulk_size` | No | Integer (long) | The maximum size (in MiB) of bulk requests sent to the OpenSearch cluster. Values below `0` indicate an unlimited size. If a single document exceeds the maximum bulk request size, then Data Prepper sends each request individually. Default value is `5`. +`ism_policy_file` | No | String | The absolute file path for an Index State Management (ISM) policy JSON file. This policy file is effective only when there is no built-in policy file for the index type. For example, the `custom` index type is currently the only type without a built-in policy file, so it will use this policy file if it is provided through this parameter. For more information about the policy JSON file, see [ISM policies]({{site.url}}{{site.baseurl}}/im-plugin/ism/policies/). +`number_of_shards` | No | Integer | The number of primary shards that an index should have on the destination OpenSearch server. This parameter is effective only when `template_file` is either explicitly provided in the sink configuration or built in. If this parameter is set, then it will override the value in the index template file. For more information, see [Create index]({{site.url}}{{site.baseurl}}/api-reference/index-apis/create-index/). +`number_of_replicas` | No | Integer | The number of replica shards that each primary shard should have on the destination OpenSearch server. For example, if you have 4 primary shards and set `number_of_replicas` to `3`, then the index has 12 replica shards. This parameter is effective only when `template_file` is either explicitly provided in the sink configuration or built in. If this parameter is set, then it will override the value in the index template file. For more information, see [Create index]({{site.url}}{{site.baseurl}}/api-reference/index-apis/create-index/). +`distribution_version` | No | String | Indicates whether the backend version of the sink is Elasticsearch 6 or later. `es6` represents Elasticsearch 6. `default` represents the latest compatible backend version, such as Elasticsearch 7.x, OpenSearch 1.x, or OpenSearch 2.x. Default is `default`. +`enable_request_compression` | No | Boolean | Whether to enable compression when sending requests to OpenSearch. When `distribution_version` is set to `es6`, default is `false`. For all other distribution versions, default is `true`. +`action` | No | String | The OpenSearch bulk action to use for documents. Must be one of `create`, `index`, `update`, `upsert`, or `delete`. Default is `index`. +`actions` | No | List | A [list of actions](#actions) that can be used as an alternative to `action`, which reads as a switch case statement that conditionally determines the bulk action to take for an event. +`flush_timeout` | No | Long | A long class that contains the amount of time, in milliseconds, to try packing a bulk request up to the `bulk_size` before flushing the request. If this timeout expires before a bulk request has reached the `bulk_size`, the request will be flushed. Set to `-1` to disable the flush timeout and instead flush whatever is present at the end of each batch. Default is `60,000`, or 1 minute. +`normalize_index` | No | Boolean | If true, then the OpenSearch sink will try to create dynamic index names. Index names with format options specified in `${})` are valid according to the [index naming restrictions]({{site.url}}{{site.baseurl}}/api-reference/index-apis/create-index/#index-naming-restrictions). Any invalid characters will be removed. Default value is `false`. 
+`routing` | No | String | A string used as a hash for generating the `shard_id` for a document when it is stored in OpenSearch. Each incoming record is searched. When present, the string is used as the routing field for the document. When not present, the default routing mechanism (`document_id`) is used by OpenSearch when storing the document. Supports formatting with fields in events and [Data Prepper expressions]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/expression-syntax/), such as `${/my_field}-test-${getMetadata(\"some_metadata_key\")}`. +`document_root_key` | No | String | The key in the event that will be used as the root in the document. The default is the root of the event. If the key does not exist, then the entire event is written as the document. If `document_root_key` is of a basic value type, such as a string or integer, then the document will have a structure of `{"data": }`. +`serverless` | No | Boolean | **Deprecated in Data Prepper 2.7. Use this option with the `aws` configuration instead.** Determines whether the OpenSearch backend is Amazon OpenSearch Serverless. Set this value to `true` when the destination for the `opensearch` sink is an Amazon OpenSearch Serverless collection. Default is `false`. +`serverless_options` | No | Object | **Deprecated in Data Prepper 2.7. Use this option with the `aws` configuration instead.** The network configuration options available when the backend of the `opensearch` sink is set to Amazon OpenSearch Serverless. For more information, see [Serverless options](#serverless-options). + + +## aws + Option | Required | Type | Description :--- | :--- | :--- | :--- -hosts | Yes | List | List of OpenSearch hosts to write to (for example, `["https://localhost:9200", "https://remote-cluster:9200"]`). -cert | No | String | Path to the security certificate (for example, `"config/root-ca.pem"`) if the cluster uses the OpenSearch Security plugin. -username | No | String | Username for HTTP basic authentication. -password | No | String | Password for HTTP basic authentication. -aws_sigv4 | No | Boolean | Default value is false. Whether to use AWS Identity and Access Management (IAM) signing to connect to an Amazon OpenSearch Service domain. For your access key, secret key, and optional session token, Data Prepper uses the default credential chain (environment variables, Java system properties, `~/.aws/credential`, etc.). -aws_region | No | String | The AWS region (for example, `"us-east-1"`) for the domain if you are connecting to Amazon OpenSearch Service. -aws_sts_role_arn | No | String | IAM role that the plugin uses to sign requests sent to Amazon OpenSearch Service. If this information is not provided, the plugin uses the default credentials. -[max_retries](#configure-max_retries) | No | Integer | The maximum number of times the OpenSearch sink should try to push data to the OpenSearch server before considering it to be a failure. Defaults to `Integer.MAX_VALUE`. If not provided, the sink will try to push data to the OpenSearch server indefinitely because the default value is high and exponential backoff would increase the waiting time before retry. -socket_timeout | No | Integer | The timeout, in milliseconds, waiting for data to return (or the maximum period of inactivity between two consecutive data packets). A timeout value of zero is interpreted as an infinite timeout. If this timeout value is negative or not set, the underlying Apache HttpClient would rely on operating system settings for managing socket timeouts. 
-connect_timeout | No | Integer | The timeout in milliseconds used when requesting a connection from the connection manager. A timeout value of zero is interpreted as an infinite timeout. If this timeout value is negative or not set, the underlying Apache HttpClient would rely on operating system settings for managing connection timeouts. -insecure | No | Boolean | Whether or not to verify SSL certificates. If set to true, certificate authority (CA) certificate verification is disabled and insecure HTTP requests are sent instead. Default value is `false`. -proxy | No | String | The address of a [forward HTTP proxy server](https://en.wikipedia.org/wiki/Proxy_server). The format is "<host name or IP>:<port>". Examples: "example.com:8100", "http://example.com:8100", "112.112.112.112:8100". Port number cannot be omitted. -index | Conditionally | String | Name of the export index. Applicable and required only when the `index_type` is `custom`. -index_type | No | String | This index type tells the Sink plugin what type of data it is handling. Valid values: `custom`, `trace-analytics-raw`, `trace-analytics-service-map`, `management-disabled`. Default value is `custom`. -template_type | No | String | Defines what type of OpenSearch template to use. The available options are `v1` and `index-template`. The default value is `v1`, which uses the original OpenSearch templates available at the `_template` API endpoints. The `index-template` option uses composable [index templates]({{site.url}}{{site.baseurl}}/opensearch/index-templates/) which are available through OpenSearch's `_index_template` API. Composable index types offer more flexibility than the default and are necessary when an OpenSearch cluster has already existing index templates. Composable templates are available for all versions of OpenSearch and some later versions of Elasticsearch. When `distribution_version` is set to `es6`, Data Prepper enforces the `template_type` as `v1`. -template_file | No | String | The path to a JSON [index template]({{site.url}}{{site.baseurl}}/opensearch/index-templates/) file such as `/your/local/template-file.json` when `index_type` is set to `custom`. For an example template file, see [otel-v1-apm-span-index-template.json](https://github.com/opensearch-project/data-prepper/blob/main/data-prepper-plugins/opensearch/src/main/resources/otel-v1-apm-span-index-template.json). If you supply a template file it must match the template format specified by the `template_type` parameter. -document_id_field | No | String | The field from the source data to use for the OpenSearch document ID (for example, `"my-field"`) if `index_type` is `custom`. -dlq_file | No | String | The path to your preferred dead letter queue file (for example, `/your/local/dlq-file`). Data Prepper writes to this file when it fails to index a document on the OpenSearch cluster. -dlq | No | N/A | DLQ configurations. See [Dead Letter Queues]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/dlq/) for details. If the `dlq_file` option is also available, the sink will fail. -bulk_size | No | Integer (long) | The maximum size (in MiB) of bulk requests sent to the OpenSearch cluster. Values below 0 indicate an unlimited size. If a single document exceeds the maximum bulk request size, Data Prepper sends it individually. Default value is 5. -ism_policy_file | No | String | The absolute file path for an ISM (Index State Management) policy JSON file. This policy file is effective only when there is no built-in policy file for the index type. 
For example, `custom` index type is currently the only one without a built-in policy file, thus it would use the policy file here if it's provided through this parameter. For more information, see [ISM policies]({{site.url}}{{site.baseurl}}/im-plugin/ism/policies/). -number_of_shards | No | Integer | The number of primary shards that an index should have on the destination OpenSearch server. This parameter is effective only when `template_file` is either explicitly provided in Sink configuration or built-in. If this parameter is set, it would override the value in index template file. For more information, see [Create index]({{site.url}}{{site.baseurl}}/api-reference/index-apis/create-index/). -number_of_replicas | No | Integer | The number of replica shards each primary shard should have on the destination OpenSearch server. For example, if you have 4 primary shards and set number_of_replicas to 3, the index has 12 replica shards. This parameter is effective only when `template_file` is either explicitly provided in Sink configuration or built-in. If this parameter is set, it would override the value in index template file. For more information, see [Create index]({{site.url}}{{site.baseurl}}/api-reference/index-apis/create-index/). -distribution_version | No | String | Indicates whether the sink backend version is Elasticsearch 6 or later. `es6` represents Elasticsearch 6. `default` represents the latest compatible backend version, such as Elasticsearch 7.x, OpenSearch 1.x, or OpenSearch 2.x. Default is `default`. -enable_request_compression | No | Boolean | Whether to enable compression when sending requests to OpenSearch. When `distribution_version` is set to `es6`, default is `false`. For all other distribution versions, default is `true`. -serverless | No | Boolean | Determines whether the OpenSearch backend is Amazon OpenSearch Serverless. Set this value to `true` when the destination for the `opensearch` sink is an Amazon OpenSearch Serverless collection. Default is `false`. -serverless_options | No | Object | The network configuration options available when the backend of the `opensearch` sink is set to Amazon OpenSearch Serverless. For more information, see [Serverless options](#serverless-options). - -### Serverless options +`region` | No | String | The AWS Region to use for credentials. Defaults to [standard SDK behavior to determine the Region](https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/region-selection.html). +`sts_role_arn` | No | String | The AWS Security Token Service (AWS STS) role to assume for requests to Amazon SQS and Amazon S3. Defaults to `null`, which will use [standard SDK behavior for credentials](https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/credentials.html). +`sts_header_overrides` | No | Map | A map of header overrides that the IAM role assumes for the sink plugin. +`sts_external_id` | No | String | The external ID to attach to AssumeRole requests from AWS STS. +`serverless` | No | Boolean | Determines whether the OpenSearch backend is Amazon OpenSearch Serverless. Set this value to `true` when the destination for the `opensearch` sink is an Amazon OpenSearch Serverless collection. Default is `false`. +`serverless_options` | No | Object | The network configuration options available when the backend of the `opensearch` sink is set to Amazon OpenSearch Serverless. For more information, see [Serverless options](#serverless-options). + + +## actions + + +The following options can be used inside the `actions` option. 
+ +Option | Required | Type | Description +:--- |:---| :--- | :--- +`type` | Yes | String | The type of bulk action to use if the `when` condition evaluates to true. Must be either `create`, `index`, `update`, `upsert`, or `delete`. +`when` | No | String | A [Data Prepper expression]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/expression-syntax/) that conditionally evaluates whether an event will be sent to OpenSearch using the bulk action configured in `type`. When empty, the bulk action will be chosen automatically when the event is sent to OpenSearch. + + +## Serverless options The following options can be used in the `serverless_options` object. Option | Required | Type | Description :--- | :--- | :---| :--- -network_policy_name | Yes | String | The name of the network policy to create. -collection_name | Yes | String | The name of the Amazon OpenSearch Serverless collection to configure. -vpce_id | Yes | String | The virtual private cloud (VPC) endpoint to which the source connects. +`network_policy_name` | Yes | String | The name of the network policy to create. +`collection_name` | Yes | String | The name of the Amazon OpenSearch Serverless collection to configure. +`vpce_id` | Yes | String | The virtual private cloud (VPC) endpoint to which the source connects. ### Configure max_retries @@ -191,7 +226,6 @@ If your domain uses a master user in the internal user database, specify the mas sink: opensearch: hosts: ["https://your-fgac-amazon-opensearch-service-endpoint"] - aws_sigv4: false username: "master-username" password: "master-password" ``` @@ -302,3 +336,53 @@ log-pipeline: sts_role_arn: "arn:aws:iam:::role/PipelineRole" region: "us-east-1" ``` + +### Example with template_content and actions + +The following example pipeline contains both `template_content` and a list of conditional `actions`: + +```yaml +log-pipeline: + source: + http: + processor: + - date: + from_time_received: true + destination: "@timestamp" + sink: + - opensearch: + hosts: [ "https://" ] + index: "my-serverless-index" + template_type: index-template + template_content: > + { + "template" : { + "mappings" : { + "properties" : { + "Data" : { + "type" : "binary" + }, + "EncodedColors" : { + "type" : "binary" + }, + "Type" : { + "type" : "keyword" + }, + "LargeDouble" : { + "type" : "double" + } + } + } + } + } + # index is the default case + actions: + - type: "delete" + when: '/operation == "delete"' + - type: "update" + when: '/operation == "update"' + - type: "index" + aws: + sts_role_arn: "arn:aws:iam:::role/PipelineRole" + region: "us-east-1" +``` diff --git a/_data-prepper/pipelines/configuration/sinks/s3.md b/_data-prepper/pipelines/configuration/sinks/s3.md index cb881e814a..c752bf6b3d 100644 --- a/_data-prepper/pipelines/configuration/sinks/s3.md +++ b/_data-prepper/pipelines/configuration/sinks/s3.md @@ -8,7 +8,22 @@ nav_order: 55 # s3 -The `s3` sink saves batches of events to [Amazon Simple Storage Service (Amazon S3)](https://aws.amazon.com/s3/) objects. +The `s3` sink saves and writes batches of Data Prepper events to Amazon Simple Storage Service (Amazon S3) objects. The configured `codec` determines how the `s3` sink serializes the data into Amazon S3. 
+ +The `s3` sink uses the following format when batching events: + +``` +${pathPrefix}events-%{yyyy-MM-dd'T'HH-mm-ss'Z'}-${currentTimeInNanos}-${uniquenessId}.${codecSuppliedExtension} +``` + +When a batch of objects is written to S3, the objects are formatted similarly to the following: + +``` +my-logs/2023/06/09/06/events-2023-06-09T06-00-01-1686290401871214927-ae15b8fa-512a-59c2-b917-295a0eff97c8.json +``` + + +For more information about how to configure an object, see the [Object key](#object-key-configuration) section. ## Usage @@ -22,14 +37,12 @@ pipeline: aws: region: us-east-1 sts_role_arn: arn:aws:iam::123456789012:role/Data-Prepper - sts_header_overrides: max_retries: 5 - bucket: - name: bucket_name - object_key: - path_prefix: my-elb/%{yyyy}/%{MM}/%{dd}/ + bucket: mys3bucket + object_key: + path_prefix: my-logs/%{yyyy}/%{MM}/%{dd}/ threshold: - event_count: 2000 + event_count: 10000 maximum_size: 50mb event_collect_timeout: 15s codec: @@ -37,17 +50,37 @@ pipeline: buffer_type: in_memory ``` +## IAM permissions + +In order to use the `s3` sink, configure AWS Identity and Access Management (IAM) to grant Data Prepper permissions to write to Amazon S3. You can use a configuration similar to the following JSON configuration: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "s3-access", + "Effect": "Allow", + "Action": [ + "s3:PutObject" + ], + "Resource": "arn:aws:s3:::/*" + } + ] +} +``` + ## Configuration Use the following options when customizing the `s3` sink. Option | Required | Type | Description :--- | :--- | :--- | :--- -`bucket` | Yes | String | The object from which the data is retrieved and then stored. The `name` must match the name of your object store. -`codec` | Yes | [Buffer type](#buffer-type) | Determines the buffer type. +`bucket` | Yes | String | The name of the S3 bucket to which the sink writes. +`codec` | Yes | [Codec](#codec) | The codec that determines how the data is serialized in the S3 object. `aws` | Yes | AWS | The AWS configuration. See [aws](#aws) for more information. `threshold` | Yes | [Threshold](#threshold-configuration) | Configures when to write an object to S3. -`object_key` | No | Sets the `path_prefix` and the `file_pattern` of the object store. Defaults to the S3 object `events-%{yyyy-MM-dd'T'hh-mm-ss}` found inside the root directory of the bucket. +`object_key` | No | [Object key](#object-key-configuration) | Sets the `path_prefix` of the object in S3. Defaults to the S3 object `events-%{yyyy-MM-dd'T'hh-mm-ss}` found in the root directory of the bucket. `compression` | No | String | The compression algorithm to apply: `none`, `gzip`, or `snappy`. Default is `none`. `buffer_type` | No | [Buffer type](#buffer-type) | Determines the buffer type. `max_retries` | No | Integer | The maximum number of times a single request should retry when ingesting data to S3. Defaults to `5`. @@ -59,33 +92,34 @@ Option | Required | Type | Description `region` | No | String | The AWS Region to use for credentials. Defaults to [standard SDK behavior to determine the Region](https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/region-selection.html). `sts_role_arn` | No | String | The AWS Security Token Service (AWS STS) role to assume for requests to Amazon SQS and Amazon S3. Defaults to `null`, which will use the [standard SDK behavior for credentials](https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/credentials.html). 
`sts_header_overrides` | No | Map | A map of header overrides that the IAM role assumes for the sink plugin. -`sts_external_id` | No | String | The external ID to attach to AssumeRole requests from AWS STS. +`sts_external_id` | No | String | An STS external ID used when Data Prepper assumes the role. For more information, see the `ExternalId` documentation in the [STS AssumeRole](https://docs.aws.amazon.com/STS/latest/APIReference/API_AssumeRole.html) API reference. + ## Threshold configuration -Use the following options to set ingestion thresholds for the `s3` sink. +Use the following options to set ingestion thresholds for the `s3` sink. When any of these conditions are met, Data Prepper will write events to an S3 object. Option | Required | Type | Description :--- | :--- | :--- | :--- -`event_count` | Yes | Integer | The maximum number of events the S3 bucket can ingest. -`maximum_size` | Yes | String | The maximum number of bytes that the S3 bucket can ingest after compression. Defaults to `50mb`. -`event_collect_timeout` | Yes | String | Sets the time period during which events are collected before ingestion. All values are strings that represent duration, either an ISO_8601 notation string, such as `PT20.345S`, or a simple notation, such as `60s` or `1500ms`. +`event_count` | Yes | Integer | The number of Data Prepper events to accumulate before writing an object to S3. +`maximum_size` | No | String | The maximum number of bytes to accumulate before writing an object to S3. Default is `50mb`. +`event_collect_timeout` | Yes | String | The maximum amount of time before Data Prepper writes an event to S3. The value should be either an ISO-8601 duration, such as `PT2M30S`, or a simple notation, such as `60s` or `1500ms`. ## Buffer type -`buffer_type` is an optional configuration that records stored events temporarily before flushing them into an S3 bucket. The default value is `in_memory`. Use one of the following options: +`buffer_type` is an optional configuration that determines how Data Prepper temporarily stores data before writing an object to S3. The default value is `in_memory`. Use one of the following options: - `in_memory`: Stores the record in memory. -- `local_file`: Flushes the record into a file on your machine. +- `local_file`: Flushes the record into a file on your local machine. This uses your machine's temporary directory. - `multipart`: Writes using the [S3 multipart upload](https://docs.aws.amazon.com/AmazonS3/latest/userguide/mpuoverview.html). Every 10 MB is written as a part. ## Object key configuration Option | Required | Type | Description :--- | :--- | :--- | :--- -`path_prefix` | Yes | String | The S3 key prefix path to use. Accepts date-time formatting. For example, you can use `%{yyyy}/%{MM}/%{dd}/%{HH}/` to create hourly folders in S3. By default, events write to the root of the bucket. +`path_prefix` | No | String | The S3 key prefix path to use for objects written to S3. Accepts date-time formatting. For example, you can use `%{yyyy}/%{MM}/%{dd}/%{HH}/` to create hourly folders in S3. The prefix path should end with `/`. By default, Data Prepper writes objects to the root of the S3 bucket. ## codec @@ -156,3 +190,49 @@ Option | Required | Type | Description `schema` | Yes | String | The Avro [schema declaration](https://avro.apache.org/docs/current/specification/#schema-declaration). Not required if `auto_schema` is set to true. 
`auto_schema` | No | Boolean | When set to `true`, automatically generates the Avro [schema declaration](https://avro.apache.org/docs/current/specification/#schema-declaration) from the first event. +### Setting a schema with Parquet + +The following example shows you how to configure the `s3` sink to write Parquet data into a Parquet file using a schema for [VPC Flow Logs](https://docs.aws.amazon.com/vpc/latest/userguide/flow-logs.html#flow-log-records): + +``` +pipeline: + ... + sink: + - s3: + aws: + region: us-east-1 + sts_role_arn: arn:aws:iam::123456789012:role/Data-Prepper + bucket: mys3bucket + object_key: + path_prefix: vpc-flow-logs/%{yyyy}/%{MM}/%{dd}/%{HH}/ + codec: + parquet: + schema: > + { + "type" : "record", + "namespace" : "org.opensearch.dataprepper.examples", + "name" : "VpcFlowLog", + "fields" : [ + { "name" : "version", "type" : ["null", "string"]}, + { "name" : "srcport", "type": ["null", "int"]}, + { "name" : "dstport", "type": ["null", "int"]}, + { "name" : "accountId", "type" : ["null", "string"]}, + { "name" : "interfaceId", "type" : ["null", "string"]}, + { "name" : "srcaddr", "type" : ["null", "string"]}, + { "name" : "dstaddr", "type" : ["null", "string"]}, + { "name" : "start", "type": ["null", "int"]}, + { "name" : "end", "type": ["null", "int"]}, + { "name" : "protocol", "type": ["null", "int"]}, + { "name" : "packets", "type": ["null", "int"]}, + { "name" : "bytes", "type": ["null", "int"]}, + { "name" : "action", "type": ["null", "string"]}, + { "name" : "logStatus", "type" : ["null", "string"]} + ] + } + threshold: + event_count: 500000000 + maximum_size: 20mb + event_collect_timeout: PT15M + buffer_type: in_memory +``` + diff --git a/_data-prepper/pipelines/configuration/sources/dynamo-db.md b/_data-prepper/pipelines/configuration/sources/dynamo-db.md index 597e835151..f75489f103 100644 --- a/_data-prepper/pipelines/configuration/sources/dynamo-db.md +++ b/_data-prepper/pipelines/configuration/sources/dynamo-db.md @@ -31,6 +31,7 @@ cdc-pipeline: s3_prefix: "myprefix" stream: start_position: "LATEST" # Read latest data from streams (Default) + view_on_remove: NEW_IMAGE aws: region: "us-west-2" sts_role_arn: "arn:aws:iam::123456789012:role/my-iam-role" @@ -84,12 +85,112 @@ Option | Required | Type | Description The following option lets you customize how the pipeline reads events from the DynamoDB table. -Option | Required | Type | Description +Option | Required | Type | Description :--- | :--- | :--- | :--- `start_position` | No | String | The position from where the source starts reading stream events when the DynamoDB stream option is enabled. `LATEST` starts reading events from the most recent stream record. +`view_on_remove` | No | Enum | The [stream record view](https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/Streams.html) to use for REMOVE events from DynamoDB streams. Must be either `NEW_IMAGE` or `OLD_IMAGE` . Defaults to `NEW_IMAGE`. If the `OLD_IMAGE` option is used and the old image can not be found, the source will find the `NEW_IMAGE`. + +## Exposed metadata attributes + +The following metadata will be added to each event that is processed by the `dynamodb` source. These metadata attributes can be accessed using the [expression syntax `getMetadata` function](https://opensearch.org/docs/latest/data-prepper/pipelines/expression-syntax/#getmetadata). + +* `primary_key`: The primary key of the DynamoDB item. For tables that only contain a partition key, this value provides the partition key. 
For tables that contain both a partition and sort key, the `primary_key` attribute will be equal to the partition and sort key, separated by a `|`, for example, `partition_key|sort_key`. +* `partition_key`: The partition key of the DynamoDB item. +* `sort_key`: The sort key of the DynamoDB item. This will be null if the table does not contain a sort key. +* `dynamodb_timestamp`: The timestamp of the DynamoDB item. This will be the export time for export items and the DynamoDB stream event time for stream items. This timestamp is used by sinks to emit an `EndtoEndLatency` metric for DynamoDB stream events that tracks the latency between a change occurring in the DynamoDB table and that change being applied to the sink. +* `document_version`: Uses the `dynamodb_timestamp` to modify break ties between stream items that are received in the same second. Recommend for use with the `opensearch` sink's `document_version` setting. +* `opensearch_action`: A default value for mapping DynamoDB event actions to OpenSearch actions. This action will be `index` for export items, and `INSERT` or `MODIFY` for stream events, and `REMOVE` stream events when the OpenSearch action is `delete`. +* `dynamodb_event_name`: The exact event type for the item. Will be `null` for export items and either `INSERT`, `MODIFY`, or `REMOVE` for stream events. +* `table_name`: The name of the DynamoDB table that an event came from. + + +## Permissions + +The following are the minimum required permissions for running DynamoDB as a source: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "allowDescribeTable", + "Effect": "Allow", + "Action": [ + "dynamodb:DescribeTable" + ], + "Resource": [ + "arn:aws:dynamodb:us-east-1:{account-id}:table/my-table" + ] + }, + { + "Sid": "allowRunExportJob", + "Effect": "Allow", + "Action": [ + "dynamodb:DescribeContinuousBackups", + "dynamodb:ExportTableToPointInTime" + ], + "Resource": [ + "arn:aws:dynamodb:us-east-1:{account-id}:table/my-table" + ] + }, + { + "Sid": "allowCheckExportjob", + "Effect": "Allow", + "Action": [ + "dynamodb:DescribeExport" + ], + "Resource": [ + "arn:aws:dynamodb:us-east-1:{account-id}:table/my-table/export/*" + ] + }, + { + "Sid": "allowReadFromStream", + "Effect": "Allow", + "Action": [ + "dynamodb:DescribeStream", + "dynamodb:GetRecords", + "dynamodb:GetShardIterator" + ], + "Resource": [ + "arn:aws:dynamodb:us-east-1:{account-id}:table/my-table/stream/*" + ] + }, + { + "Sid": "allowReadAndWriteToS3ForExport", + "Effect": "Allow", + "Action": [ + "s3:GetObject", + "s3:AbortMultipartUpload", + "s3:PutObject", + "s3:PutObjectAcl" + ], + "Resource": [ + "arn:aws:s3:::my-bucket/*" + ] + } + ] +} +``` + +When performing an export, the `"Sid": "allowReadFromStream"` section is not required. If only reading from DynamoDB streams, the +`"Sid": "allowReadAndWriteToS3ForExport"`, `"Sid": "allowCheckExportjob"`, and ` "Sid": "allowRunExportJob"` sections are not required. + +## Metrics +The `dynamodb` source includes the following metrics. +### Counters +* `exportJobSuccess`: The number of export jobs that have been submitted successfully. +* `exportJobFailure`: The number of export job submission attempts that have failed. +* `exportS3ObjectsTotal`: The total number of export data files found in S3. +* `exportS3ObjectsProcessed`: The total number of export data files that have been processed successfully from S3. +* `exportRecordsTotal`: The total number of records found in the export. 
+* `exportRecordsProcessed`: The total number of export records that have been processed successfully. +* `exportRecordsProcessingErrors`: The number of export record processing errors. +* `changeEventsProcessed`: The number of change events processed from DynamoDB streams. +* `changeEventsProcessingErrors`: The number of processing errors for change events from DynamoDB streams. +* `shardProgress`: The incremented shard progress when DynamoDB streams are being read correctly. This being`0` for any significant amount of time means there is a problem with the pipeline that has streams enabled. diff --git a/_data-prepper/pipelines/configuration/sources/opensearch.md b/_data-prepper/pipelines/configuration/sources/opensearch.md index b4b3ddc663..7cc0b9a36a 100644 --- a/_data-prepper/pipelines/configuration/sources/opensearch.md +++ b/_data-prepper/pipelines/configuration/sources/opensearch.md @@ -200,6 +200,26 @@ Option | Required | Type | Description `sts_role_arn` | No | String | The AWS Security Token Service (AWS STS) role to assume for requests to Amazon OpenSearch Service and Amazon OpenSearch Serverless. Default is `null`, which will use the [standard SDK behavior for credentials](https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/credentials.html). `serverless` | No | Boolean | Should be set to `true` when processing from an Amazon OpenSearch Serverless collection. Defaults to `false`. +## Metrics + +The `opensearch` source includes the following metrics. + +### Counters + +- `documentsProcessed`: Measures the total number of documents processed by the `opensearch` source plugin. +- `indicesProcessed`: Measures the total number of indexes processed by the `opensearch` source plugin. +- `processingErrors`: Measures the total number of index processing errors made by the `opensearch` source plugin. +- `credentialsChanged`: Measures the number of times that the `opensearch` source refreshes basic credentials (username/password). +- `clientRefreshErrors`: Measures the number of errors encountered when generating a new client due to the `opensearch` source refreshing basic credentials. + +### Timers + +- `indexProcessingTime`: Measures the `opensearch` source plugin index processing latency, in seconds. + +### Distribution summaries + +- `bytesReceived`: Measures the size distribution of incoming documents, in bytes, received by the `opensearch` source plugin. +- `bytesProcessed`: Measures the size distribution of incoming document, in bytes, successfully processed by the `opensearch` source plugin. ## OpenSearch cluster security diff --git a/_data-prepper/pipelines/configuration/sources/otel-trace.md b/_data-prepper/pipelines/configuration/sources/otel-trace-source.md similarity index 89% rename from _data-prepper/pipelines/configuration/sources/otel-trace.md rename to _data-prepper/pipelines/configuration/sources/otel-trace-source.md index 4b17647768..137592bbe8 100644 --- a/_data-prepper/pipelines/configuration/sources/otel-trace.md +++ b/_data-prepper/pipelines/configuration/sources/otel-trace-source.md @@ -1,22 +1,22 @@ --- layout: default -title: otel_trace_source source +title: otel_trace_source parent: Sources grand_parent: Pipelines nav_order: 15 +redirect_from: + - /data-prepper/pipelines/configuration/sources/otel-trace/ --- -# otel_trace source +# otel_trace_source -## Overview - -The `otel_trace` source is a source for the OpenTelemetry Collector. The following table describes options you can use to configure the `otel_trace` source. 
+`otel_trace_source` is a source for the OpenTelemetry Collector. The following table describes options you can use to configure the `otel_trace_source` source. Option | Required | Type | Description :--- | :--- | :--- | :--- -port | No | Integer | The port that the `otel_trace` source runs on. Default value is `21890`. +port | No | Integer | The port that the `otel_trace_source` source runs on. Default value is `21890`. request_timeout | No | Integer | The request timeout, in milliseconds. Default value is `10000`. health_check_service | No | Boolean | Enables a gRPC health check service under `grpc.health.v1/Health/Check`. Default value is `false`. unauthenticated_health_check | No | Boolean | Determines whether or not authentication is required on the health check endpoint. Data Prepper ignores this option if no authentication is defined. Default value is `false`. @@ -35,6 +35,8 @@ authentication | No | Object | An authentication configuration. By default, an u ## Metrics +The 'otel_trace_source' source includes the following metrics. + ### Counters - `requestTimeouts`: Measures the total number of requests that time out. @@ -50,4 +52,4 @@ authentication | No | Object | An authentication configuration. By default, an u ### Distribution summaries -- `payloadSize`: Measures the incoming request payload size distribution in bytes. \ No newline at end of file +- `payloadSize`: Measures the incoming request payload size distribution in bytes. diff --git a/_data-prepper/pipelines/configuration/sources/s3.md b/_data-prepper/pipelines/configuration/sources/s3.md index 7dc31caade..7a3746bab6 100644 --- a/_data-prepper/pipelines/configuration/sources/s3.md +++ b/_data-prepper/pipelines/configuration/sources/s3.md @@ -8,7 +8,10 @@ nav_order: 20 # s3 source -`s3` is a source plugin that reads events from [Amazon Simple Storage Service (Amazon S3)](https://aws.amazon.com/s3/) objects. It requires an [Amazon Simple Queue Service (Amazon SQS)](https://aws.amazon.com/sqs/) queue that receives [S3 Event Notifications](https://docs.aws.amazon.com/AmazonS3/latest/userguide/NotificationHowTo.html). After Amazon SQS is configured, the `s3` source receives messages from Amazon SQS. When the SQS message indicates that an S3 object was created, the `s3` source loads the S3 objects and then parses them using the configured [codec](#codec). You can also configure the `s3` source to use [Amazon S3 Select](https://docs.aws.amazon.com/AmazonS3/latest/userguide/selecting-content-from-objects.html) instead of Data Prepper to parse S3 objects. +`s3` is a source plugin that reads events from [Amazon Simple Storage Service (Amazon S3)](https://aws.amazon.com/s3/) objects. You can configure the source to either use an [Amazon Simple Queue Service (Amazon SQS)](https://aws.amazon.com/sqs/) queue or scan an S3 bucket: + +- To use Amazon SQS notifications, configure S3 event notifications on your S3 bucket. After Amazon SQS is configured, the `s3` source receives messages from Amazon SQS. When the SQS message indicates that an S3 object has been created, the `s3` source loads the S3 objects and then parses them using the configured [codec](#codec). +- To use an S3 bucket, configure the `s3` source to use Amazon S3 Select instead of Data Prepper to parse S3 objects. ## IAM permissions @@ -86,19 +89,23 @@ Option | Required | Type | Description :--- | :--- | :--- | :--- `notification_type` | Yes | String | Must be `sqs`. `notification_source` | No | String | Determines how notifications are received by SQS. 
Must be `s3` or `eventbridge`. `s3` represents notifications that are directly sent from Amazon S3 to Amazon SQS or fanout notifications from Amazon S3 to Amazon Simple Notification Service (Amazon SNS) to Amazon SQS. `eventbridge` represents notifications from [Amazon EventBridge](https://aws.amazon.com/eventbridge/) and [Amazon Security Lake](https://aws.amazon.com/security-lake/). Default is `s3`. -`compression` | No | String | The compression algorithm to apply: `none`, `gzip`, or `automatic`. Default is `none`. +`compression` | No | String | The compression algorithm to apply: `none`, `gzip`, `snappy`, or `automatic`. Default is `none`. `codec` | Yes | Codec | The [codec](#codec) to apply. `sqs` | Yes | SQS | The SQS configuration. See [sqs](#sqs) for more information. `aws` | Yes | AWS | The AWS configuration. See [aws](#aws) for more information. `on_error` | No | String | Determines how to handle errors in Amazon SQS. Can be either `retain_messages` or `delete_messages`. `retain_messages` leaves the message in the Amazon SQS queue and tries to send the message again. This is recommended for dead-letter queues. `delete_messages` deletes failed messages. Default is `retain_messages`. -buffer_timeout | No | Duration | The amount of time allowed for writing events to the Data Prepper buffer before timeout occurs. Any events that the Amazon S3 source cannot write to the buffer during the set amount of time are discarded. Default is `10s`. +`buffer_timeout` | No | Duration | The amount of time allowed for writing events to the Data Prepper buffer before timeout occurs. Any events that the Amazon S3 source cannot write to the buffer during the specified amount of time are discarded. Default is `10s`. `records_to_accumulate` | No | Integer | The number of messages that accumulate before being written to the buffer. Default is `100`. `metadata_root_key` | No | String | The base key for adding S3 metadata to each event. The metadata includes the key and bucket for each S3 object. Default is `s3/`. +`default_bucket_owner` | No | String | The AWS account ID for the owner of an S3 bucket. For more information, see [Cross-account S3 access](#s3_bucket_ownership). +`bucket_owners` | No | Map | A map of bucket names that includes the IDs of the accounts that own the buckets. For more information, see [Cross-account S3 access](#s3_bucket_ownership). `disable_bucket_ownership_validation` | No | Boolean | When `true`, the S3 source does not attempt to validate that the bucket is owned by the expected account. The expected account is the same account that owns the Amazon SQS queue. Default is `false`. `acknowledgments` | No | Boolean | When `true`, enables `s3` sources to receive [end-to-end acknowledgments]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/pipelines#end-to-end-acknowledgments) when events are received by OpenSearch sinks. `s3_select` | No | [s3_select](#s3_select) | The Amazon S3 Select configuration. `scan` | No | [scan](#scan) | The S3 scan configuration. `delete_s3_objects_on_read` | No | Boolean | When `true`, the S3 scan attempts to delete S3 objects after all events from the S3 object are successfully acknowledged by all sinks. `acknowledgments` should be enabled when deleting S3 objects. Default is `false`. +`workers` | No | Integer | Configures the number of worker threads that the source uses to read data from S3. Leaving this value at the default unless your S3 objects are less than 1MB. Performance may decrease for larger S3 objects. 
This setting only affects SQS-based sources. Default is `1`. + ## sqs @@ -112,7 +119,7 @@ Option | Required | Type | Description `visibility_timeout` | No | Duration | The visibility timeout to apply to messages read from the Amazon SQS queue. This should be set to the amount of time that Data Prepper may take to read all the S3 objects in a batch. Default is `30s`. `wait_time` | No | Duration | The amount of time to wait for long polling on the Amazon SQS API. Default is `20s`. `poll_delay` | No | Duration | A delay placed between the reading and processing of a batch of Amazon SQS messages and making a subsequent request. Default is `0s`. -`visibility_duplication_protection` | No | Boolean | If set to `true`, Data Prepper attempts to avoid duplicate processing by extending the visibility timeout of SQS messages. Until the data reaches the sink, Data Prepper will regularly call `ChangeMessageVisibility` to avoid reading the S3 object again. To use this feature, you need to grant permissions to `ChangeMessageVisibility` on the IAM role. Default is `false`. +`visibility_duplication_protection` | No | Boolean | If set to `true`, Data Prepper attempts to avoid duplicate processing by extending the visibility timeout of SQS messages. Until the data reaches the sink, Data Prepper will regularly call `ChangeMessageVisibility` to avoid rereading of the S3 object. To use this feature, you need to grant permissions to `sqs:ChangeMessageVisibility` on the IAM role. Default is `false`. `visibility_duplicate_protection_timeout` | No | Duration | Sets the maximum total length of time that a message will not be processed when using `visibility_duplication_protection`. Defaults to two hours. @@ -123,6 +130,7 @@ Option | Required | Type | Description `region` | No | String | The AWS Region to use for credentials. Defaults to [standard SDK behavior to determine the Region](https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/region-selection.html). `sts_role_arn` | No | String | The AWS Security Token Service (AWS STS) role to assume for requests to Amazon SQS and Amazon S3. Defaults to `null`, which will use the [standard SDK behavior for credentials](https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/credentials.html). `aws_sts_header_overrides` | No | Map | A map of header overrides that the IAM role assumes for the sink plugin. +`sts_external_id` | No | String | An STS external ID used when Data Prepper assumes the STS role. For more information, see the `ExternalID` documentation in the [STS AssumeRole](https://docs.aws.amazon.com/STS/latest/APIReference/API_AssumeRole.html) API reference. ## codec @@ -154,9 +162,6 @@ Option | Required | Type | Description `header` | No | String list | The header containing the column names used to parse CSV data. `detect_header` | No | Boolean | Whether the first line of the Amazon S3 object should be interpreted as a header. Default is `true`. - - - ## Using `s3_select` with the `s3` source When configuring `s3_select` to parse Amazon S3 objects, use the following options: @@ -198,16 +203,18 @@ Option | Required | Type | Description `start_time` | No | String | The time from which to start scanning objects modified after the given `start_time`. This should follow [ISO LocalDateTime](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html#ISO_LOCAL_DATE_TIME) format, for example, `023-01-23T10:00:00`. 
If `end_time` is configured along with `start_time`, all objects after `start_time` and before `end_time` will be processed. `start_time` and `range` cannot be used together. `end_time` | No | String | The time after which no objects will be scanned after the given `end_time`. This should follow [ISO LocalDateTime](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html#ISO_LOCAL_DATE_TIME) format, for example, `023-01-23T10:00:00`. If `start_time` is configured along with `end_time`, all objects after `start_time` and before `end_time` will be processed. `end_time` and `range` cannot be used together. `range` | No | String | The time range from which objects are scanned from all buckets. Supports ISO_8601 notation strings, such as `PT20.345S` or `PT15M`, and notation strings for seconds (`60s`) and milliseconds (`1600ms`). `start_time` and `end_time` cannot be used with `range`. Range `P12H` scans all the objects modified in the last 12 hours from the time pipeline started. -`buckets` | Yes | List | A list of [buckets](#bucket) to scan. +`buckets` | Yes | List | A list of [scan buckets](#scan-bucket) to scan. `scheduling` | No | List | The configuration for scheduling periodic scans on all buckets. `start_time`, `end_time` and `range` can not be used if scheduling is configured. -### bucket + +### scan bucket + Option | Required | Type | Description :--- | :--- |:-----| :--- `bucket` | Yes | Map | Provides options for each bucket. -You can configure the following options inside the [bucket](#bucket) setting. +You can configure the following options in the `bucket` setting map. Option | Required | Type | Description :--- | :--- | :--- | :--- @@ -244,13 +251,17 @@ The `s3` source includes the following metrics: * `s3ObjectsNotFound`: The number of S3 objects that the `s3` source failed to read due to an S3 "Not Found" error. These are also counted toward `s3ObjectsFailed`. * `s3ObjectsAccessDenied`: The number of S3 objects that the `s3` source failed to read due to an "Access Denied" or "Forbidden" error. These are also counted toward `s3ObjectsFailed`. * `s3ObjectsSucceeded`: The number of S3 objects that the `s3` source successfully read. +* `s3ObjectNoRecordsFound`: The number of S3 objects that resulted in 0 records being added to the buffer by the `s3` source. +* `s3ObjectsDeleted`: The number of S3 objects deleted by the `s3` source. +* `s3ObjectsDeleteFailed`: The number of S3 objects that the `s3` source failed to delete. +* `s3ObjectsEmpty`: The number of S3 objects that are considered empty because they have a size of `0`. These objects will be skipped by the `s3` source. * `sqsMessagesReceived`: The number of Amazon SQS messages received from the queue by the `s3` source. * `sqsMessagesDeleted`: The number of Amazon SQS messages deleted from the queue by the `s3` source. * `sqsMessagesFailed`: The number of Amazon SQS messages that the `s3` source failed to parse. -* `s3ObjectNoRecordsFound` -- The number of S3 objects that resulted in 0 records added to the buffer by the `s3` source. * `sqsMessagesDeleteFailed` -- The number of SQS messages that the `s3` source failed to delete from the SQS queue. -* `s3ObjectsDeleted` -- The number of S3 objects deleted by the `s3` source. -* `s3ObjectsDeleteFailed` -- The number of S3 objects that the `s3` source failed to delete. +* `sqsVisibilityTimeoutChangedCount`: The number of times that the `s3` source changed the visibility timeout for an SQS message. 
This includes multiple visibility timeout changes on the same message. +* `sqsVisibilityTimeoutChangeFailedCount`: The number of times that the `s3` source failed to change the visibility timeout for an SQS message. This includes multiple visibility timeout change failures on the same message. +* `acknowledgementSetCallbackCounter`: The number of times that the `s3` source received an acknowledgment from Data Prepper. ### Timers diff --git a/_data-prepper/pipelines/expression-syntax.md b/_data-prepper/pipelines/expression-syntax.md index 8257ab8978..be0be6f792 100644 --- a/_data-prepper/pipelines/expression-syntax.md +++ b/_data-prepper/pipelines/expression-syntax.md @@ -230,7 +230,7 @@ The `length()` function takes one argument of the JSON pointer type and returns ### `hasTags()` -The `hastags()` function takes one or more string type arguments and returns `true` if all the arguments passed are present in an event's tags. When an argument does not exist in the event's tags, the function returns `false`. For example, if you use the expression `hasTags("tag1")` and the event contains `tag1`, Data Prepper returns `true`. If you use the expression `hasTags("tag2")` but the event only contains a `tag1` tag, Data Prepper returns `false`. +The `hasTags()` function takes one or more string type arguments and returns `true` if all of the arguments passed are present in an event's tags. When an argument does not exist in the event's tags, the function returns `false`. For example, if you use the expression `hasTags("tag1")` and the event contains `tag1`, Data Prepper returns `true`. If you use the expression `hasTags("tag2")` but the event only contains `tag1`, Data Prepper returns `false`. ### `getMetadata()` @@ -245,3 +245,21 @@ The `contains()` function takes two string arguments and determines whether eith The `cidrContains()` function takes two or more arguments. The first argument is a JSON pointer, which represents the key to the IP address that is checked. It supports both IPv4 and IPv6 addresses. Every argument that comes after the key is a string type that represents CIDR blocks that are checked against. If the IP address in the first argument is in the range of any of the given CIDR blocks, the function returns `true`. If the IP address is not in the range of the CIDR blocks, the function returns `false`. For example, `cidrContains(/sourceIp,"192.0.2.0/24","10.0.1.0/16")` will return `true` if the `sourceIp` field indicated in the JSON pointer has a value of `192.0.2.5`. + +### `join()` + +The `join()` function joins elements of a list to form a string. The function takes a JSON pointer, which represents the key to a list or a map where values are of the list type, and joins the lists as strings using commas (`,`), the default delimiter between strings. + +If `{"source": [1, 2, 3]}` is the input data, as shown in the following example: + + +```json +{"source": {"key1": [1, 2, 3], "key2": ["a", "b", "c"]}} +``` + +Then `join(/source)` will return `"1,2,3"` in the following format: + +```json +{"key1": "1,2,3", "key2": "a,b,c"} +``` +You can also specify a delimiter other than the default inside the expression. For example, `join("-", /source)` joins each `source` field using a hyphen (`-`) as the delimiter. 
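+
+For example, assuming the same input as in the preceding example, `join("-", /source)` would be expected to produce output similar to the following, with each list joined using the hyphen delimiter:
+
+```json
+{"key1": "1-2-3", "key2": "a-b-c"}
+```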
diff --git a/_data/versions.json b/_data/versions.json index a7c197040c..03e43e6d4a 100644 --- a/_data/versions.json +++ b/_data/versions.json @@ -1,10 +1,12 @@ { - "current": "2.11", + "current": "2.13", "all": [ - "2.11", + "2.13", "1.3" ], "archived": [ + "2.12", + "2.11", "2.10", "2.9", "2.8", @@ -20,7 +22,7 @@ "1.1", "1.0" ], - "latest": "2.11" + "latest": "2.13" } diff --git a/_field-types/index.md b/_field-types/index.md index 9d8bd425f0..7a7e816ada 100644 --- a/_field-types/index.md +++ b/_field-types/index.md @@ -12,7 +12,7 @@ redirect_from: # Mappings and field types -You can define how documents and their fields are stored and indexed by creating a _mapping_. The mapping specifies the list of fields for a document. Every field in the document has a _field type_, which corresponds to the type of data the field contains. For example, you may want to specify that the `year` field should be of type `date`. To learn more, see [Supported field types]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/index/). +You can define how documents and their fields are stored and indexed by creating a _mapping_. The mapping specifies the list of fields for a document. Every field in the document has a _field type_, which defines the type of data the field contains. For example, you may want to specify that the `year` field should be of type `date`. To learn more, see [Supported field types]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/index/). If you're just starting to build out your cluster and data, you may not know exactly how your data should be stored. In those cases, you can use dynamic mappings, which tell OpenSearch to dynamically add data and its fields. However, if you know exactly what types your data falls under and want to enforce that standard, then you can use explicit mappings. @@ -219,4 +219,4 @@ The response contains the specified fields: } } } -``` \ No newline at end of file +``` diff --git a/_field-types/supported-field-types/date.md b/_field-types/supported-field-types/date.md index 5bd52a8358..e0b99f3a90 100644 --- a/_field-types/supported-field-types/date.md +++ b/_field-types/supported-field-types/date.md @@ -66,7 +66,7 @@ As of OpenSearch 2.12, the default date format is `strict_date_time_no_millis||s ## Built-in formats -Most of the date formats have a `strict_` counterpart. When the format starts with `strict_`, the date must have the correct number of digits specified in the format. For example, if the format is set to `strict_year_month_day` ("yyyy-MM-dd"), both month and day have to be two-digit numbers. So, "2020-06-09" is valid, while "2020-6-9" is invalid. +Most of the date formats have a `strict_` counterpart. When the format starts with `strict_`, the date must have the correct number of digits specified in the format. For example, if the format is set to `strict_year_month_day` (`"yyyy-MM-dd"`), both month and day have to be two-digit numbers. So, `"2020-06-09"` is valid, while `"2020-6-9"` is invalid. Epoch is defined as 00:00:00 UTC on January 1, 1970. {: .note } @@ -99,70 +99,71 @@ Components of basic date formats are not separated by a delimiter. For example, Format name and description | Pattern and examples :--- | :--- **Dates**| -`basic_date_time`
A basic date and time separated by `T`. | "yyyyMMdd`T`HHmmss.SSSZ"
"20190323T213446.123-04:00" -`basic_date_time_no_millis`
A basic date and time without milliseconds, separated by `T`. | "yyyyMMdd`T`HHmmssZ"
"20190323T213446-04:00" -`basic_date`
A date with a four-digit year, two-digit month, and two-digit day. | "yyyyMMdd"
"20190323" +`basic_date_time`
A basic date and time separated by `T`. | `"yyyyMMdd`T`HHmmss.SSSZ"`
`"20190323T213446.123-04:00"` +`basic_date_time_no_millis`
A basic date and time without milliseconds, separated by `T`. | `"yyyyMMdd`T`HHmmssZ"`
`"20190323T213446-04:00"` +`basic_date`
A date with a four-digit year, two-digit month, and two-digit day. | `"yyyyMMdd"`
`"20190323"` **Times** | -`basic_time`
A time with a two-digit hour, two-digit minute, two-digit second, three-digit millisecond, and time zone offset. |"HHmmss.SSSZ"
"213446.123-04:00" -`basic_time_no_millis`
A basic time without milliseconds. | "HHmmssZ"
"213446-04:00" +`basic_time`
A time with a two-digit hour, two-digit minute, two-digit second, three-digit millisecond, and time zone offset. |`"HHmmss.SSSZ"`
`"213446.123-04:00"` +`basic_time_no_millis`
A basic time without milliseconds. | `"HHmmssZ"`
`"213446-04:00"` **T times** | -`basic_t_time`
A basic time preceded by `T`. | "`T`HHmmss.SSSZ"
"T213446.123-04:00" -`basic_t_time_no_millis`
A basic time without milliseconds, preceded by `T`. | "`T`HHmmssZ"
"T213446-04:00" +`basic_t_time`
A basic time preceded by `T`. | `"`T`HHmmss.SSSZ"`
`"T213446.123-04:00"` +`basic_t_time_no_millis`
A basic time without milliseconds, preceded by `T`. | `"`T`HHmmssZ"`
`"T213446-04:00"` **Ordinal dates** | -`basic_ordinal_date_time`
A full ordinal date and time. | "yyyyDDD`T`HHmmss.SSSZ"
"2019082T213446.123-04:00" -`basic_ordinal_date_time_no_millis`
A full ordinal date and time without milliseconds. | "yyyyDDD`T`HHmmssZ"
"2019082T213446-04:00" -`basic_ordinal_date`
A date with a four-digit year and three-digit ordinal day of the year. | "yyyyDDD"
"2019082" +`basic_ordinal_date_time`
A full ordinal date and time. | `"yyyyDDD`T`HHmmss.SSSZ"`
`"2019082T213446.123-04:00"` +`basic_ordinal_date_time_no_millis`
A full ordinal date and time without milliseconds. | `"yyyyDDD`T`HHmmssZ"`
`"2019082T213446-04:00"` +`basic_ordinal_date`
A date with a four-digit year and three-digit ordinal day of the year. | `"yyyyDDD"`
`"2019082"` **Week-based dates** | -`basic_week_date_time`
`strict_basic_week_date_time`
A full week-based date and time separated by `T`. | "YYYY`W`wwe`T`HHmmss.SSSZ"
"2019W126213446.123-04:00" -`basic_week_date_time_no_millis`
`strict_basic_week_date_time_no_millis`
A basic week-based year date and time without milliseconds, separated by `T`. | "YYYY`W`wwe`T`HHmmssZ"
"2019W126213446-04:00" -`basic_week_date`
`strict_basic_week_date`
A full week-based date with a four-digit week-based year, two-digit ordinal week of the year, and one-digit ordinal day of the week separated by `W`. | "YYYY`W`wwe"
"2019W126" +`basic_week_date_time`
`strict_basic_week_date_time`
A full week-based date and time separated by `T`. | `"YYYY`W`wwe`T`HHmmss.SSSZ"`
`"2019W126213446.123-04:00"` +`basic_week_date_time_no_millis`
`strict_basic_week_date_time_no_millis`
A basic week-based year date and time without milliseconds, separated by `T`. | `"YYYY`W`wwe`T`HHmmssZ"`
"2019W126213446-04:00" +`basic_week_date`
`strict_basic_week_date`
A full week-based date with a four-digit week-based year, two-digit ordinal week of the year, and one-digit ordinal day of the week separated by `W`. | `"YYYY`W`wwe"`
`"2019W126"` ### Full date formats -Components of full date formats are separated by a `-` delimiter for date and `:` delimiter for time. For example, "2019-03-23T21:34". +Components of full date formats are separated by a `-` delimiter for date and `:` delimiter for time. For example, `"2019-03-23T21:34"`. Format name and description | Pattern and examples :--- | :--- **Dates** | -`date_optional_time`
`strict_date_optional_time`
A generic full date and time. Year is required. Month, day, and time are optional. Time is separated from date by `T`. | Multiple patterns.
"2019-03-23T21:34:46.123456789-04:00"
"2019-03-23T21:34:46"
"2019-03-23T21:34"
"2019" -`strict_date_optional_time_nanos`
A generic full date and time. Year is required. Month, day, and time are optional. If time is specified, it must contain hours, minutes, and seconds, but fraction of a second is optional. Fraction of a second is one to nine digits long and has nanosecond resolution. Time is separated from date by `T`. | Multiple patterns.
"2019-03-23T21:34:46.123456789-04:00"
"2019-03-23T21:34:46"
"2019" -`date_time`
`strict_date_time`
A full date and time separated by `T`. | "yyyy-MM-dd`T`HH:mm:ss.SSSZ"
"2019-03-23T21:34:46.123-04:00" -`date_time_no_millis`
`strict_date_time_no_millis`
A full date and time without milliseconds, separated by `T`. | "yyyy-MM-dd'T'HH:mm:ssZ"
"2019-03-23T21:34:46-04:00" -`date_hour_minute_second_fraction`
`strict_date_hour_minute_second_fraction`
A full date, two-digit hour, two-digit minute, two-digit second, and one- to nine-digit fraction of a second separated by `T`. | "yyyy-MM-dd`T`HH:mm:ss.SSSSSSSSS"
"2019-03-23T21:34:46.123456789"
"2019-03-23T21:34:46.1" -`date_hour_minute_second_millis`
`strict_date_hour_minute_second_millis`
A full date, two-digit hour, two-digit minute, two-digit second, and three-digit millisecond separated by `T`. | "yyyy-MM-dd`T`HH:mm:ss.SSS"
"2019-03-23T21:34:46.123" -`date_hour_minute_second`
`strict_date_hour_minute_second`
A full date, two-digit hour, two-digit minute, and two-digit second separated by `T`.| "yyyy-MM-dd`T`HH:mm:ss"
"2019-03-23T21:34:46" -`date_hour_minute`
`strict_date_hour_minute`
A full date, two-digit hour, and two-digit minute. | "yyyy-MM-dd`T`HH:mm"
"2019-03-23T21:34" -`date_hour`
`strict_date_hour`
A full date and two-digit hour, separated by `T`. | "yyyy-MM-dd`T`HH"
"2019-03-23T21" -`date`
`strict_date`
A four-digit year, two-digit month, and two-digit day. | "yyyy-MM-dd"
"2019-03-23" -`year_month_day`
`strict_year_month_day`
A four-digit year, two-digit month, and two-digit day. | "yyyy-MM-dd"
"2019-03-23" -`year_month`
`strict_year_month`
A four-digit year and two-digit month. | "yyyy-MM"
"2019-03" -`year`
`strict_year`
A four-digit year. | "yyyy"
"2019" +`date_optional_time`
`strict_date_optional_time`
A generic full date and time. Year is required. Month, day, and time are optional. Time is separated from date by `T`. | Multiple patterns.
`"2019--03--23T21:34:46.123456789--04:00"`
`"2019-03-23T21:34:46"`
`"2019-03-23T21:34"`
`"2019"` +`strict_date_optional_time_nanos`
A generic full date and time. Year is required. Month, day, and time are optional. If time is specified, it must contain hours, minutes, and seconds, but fraction of a second is optional. Fraction of a second is one to nine digits long and has nanosecond resolution. Time is separated from date by `T`. | Multiple patterns.
`"2019-03-23T21:34:46.123456789-04:00"`
`"2019-03-23T21:34:46"`
`"2019"` +`date_time`
`strict_date_time`
A full date and time separated by `T`. | `"yyyy-MM-dd`T`HH:mm:ss.SSSZ"`
`"2019-03-23T21:34:46.123-04:00"` +`date_time_no_millis`
`strict_date_time_no_millis`
A full date and time without milliseconds, separated by `T`. | `"yyyy-MM-dd'T'HH:mm:ssZ"`
`"2019-03-23T21:34:46-04:00"` +`date_hour_minute_second_fraction`
`strict_date_hour_minute_second_fraction`
A full date, two-digit hour, two-digit minute, two-digit second, and one- to nine-digit fraction of a second separated by `T`. | `"yyyy-MM-dd`T`HH:mm:ss.SSSSSSSSS"`
`"2019-03-23T21:34:46.123456789"`
`"2019-03-23T21:34:46.1"` +`date_hour_minute_second_millis`
`strict_date_hour_minute_second_millis`
A full date, two-digit hour, two-digit minute, two-digit second, and three-digit millisecond separated by `T`. | `"yyyy-MM-dd`T`HH:mm:ss.SSS"`
`"2019-03-23T21:34:46.123"` +`date_hour_minute_second`
`strict_date_hour_minute_second`
A full date, two-digit hour, two-digit minute, and two-digit second separated by `T`.| `"yyyy-MM-dd`T`HH:mm:ss"`
`"2019-03-23T21:34:46"` +`date_hour_minute`
`strict_date_hour_minute`
A full date, two-digit hour, and two-digit minute. | `"yyyy-MM-dd`T`HH:mm"`
`"2019-03-23T21:34"` +`date_hour`
`strict_date_hour`
A full date and two-digit hour, separated by `T`. | `"yyyy-MM-dd`T`HH"`
`"2019-03-23T21"` +`date`
`strict_date`
A four-digit year, two-digit month, and two-digit day. | `"yyyy-MM-dd"`
`"2019-03-23"` +`year_month_day`
`strict_year_month_day`
A four-digit year, two-digit month, and two-digit day. | `"yyyy-MM-dd"`
`"2019-03-23"` +`year_month`
`strict_year_month`
A four-digit year and two-digit month. | `"yyyy-MM"`
`"2019-03"` +`year`
`strict_year`
A four-digit year. | `"yyyy"`
`"2019"` +`rfc3339_lenient`
An RFC 3339-compatible `DateTimeFormatter` that is much faster than other lenient full-date formats, such as `strict_date_optional_time`. | `"YYYY"`
`"2019"`
`"YYYY-MM"`
`"2019-03"`
`"YYYY-MM-DD"`
`"2019-03-23"`
`"YYYY-MM-DDThh:mmTZD"`
`"2019-03-23T21:34Z"`
`"YYYY-MM-DDThh:mm:ssTZD"`
`"2019-03-23T21:34:46Z"`
`"YYYY-MM-DDThh:mm:ss.sTZD"`
`"2019-03-23T21:34:46.123456789-04:00"`
`"YYYY-MM-DDThh:mm:ss,sTZD"`
`"2019-03-23T21:34:46,123456789-04:00"` **Times** | -`time`
`strict_time`
A two-digit hour, two-digit minute, two-digit second, one- to nine-digit fraction of a second, and time zone offset. | "HH:mm:ss.SSSSSSSSSZ"
"21:34:46.123456789-04:00"
"21:34:46.1-04:00" -`time_no_millis`
`strict_time_no_millis`
A two-digit hour, two-digit minute, two-digit second, and time zone offset. | "HH:mm:ssZ"
"21:34:46-04:00" -`hour_minute_second_fraction`
`strict_hour_minute_second_fraction`
A two-digit hour, two-digit minute, two-digit second, and one- to nine-digit fraction of a second. | "HH:mm:ss.SSSSSSSSS"
"21:34:46.1"
"21:34:46.123456789" -`hour_minute_second_millis`
`strict_hour_minute_second_millis`
A two-digit hour, two-digit minute, two-digit second, and three-digit millisecond. | "HH:mm:ss.SSS"
"21:34:46.123" -`hour_minute_second`
`strict_hour_minute_second`
A two-digit hour, two-digit minute, and two-digit second. | "HH:mm:ss"
"21:34:46" -`hour_minute`
`strict_hour_minute`
A two-digit hour and two-digit minute. | "HH:mm"
"21:34" -`hour`
`strict_hour`
A two-digit hour. | "HH"
"21" +`time`
`strict_time`
A two-digit hour, two-digit minute, two-digit second, one- to nine-digit fraction of a second, and time zone offset. | `"HH:mm:ss.SSSSSSSSSZ"`
`"21:34:46.123456789-04:00"`
`"21:34:46.1-04:00"` +`time_no_millis`
`strict_time_no_millis`
A two-digit hour, two-digit minute, two-digit second, and time zone offset. | `"HH:mm:ssZ"`
`"21:34:46-04:00"` +`hour_minute_second_fraction`
`strict_hour_minute_second_fraction`
A two-digit hour, two-digit minute, two-digit second, and one- to nine-digit fraction of a second. | `"HH:mm:ss.SSSSSSSSS"`
`"21:34:46.1"`
`"21:34:46.123456789"` +`hour_minute_second_millis`
`strict_hour_minute_second_millis`
A two-digit hour, two-digit minute, two-digit second, and three-digit millisecond. | `"HH:mm:ss.SSS"`
`"21:34:46.123"` +`hour_minute_second`
`strict_hour_minute_second`
A two-digit hour, two-digit minute, and two-digit second. | `"HH:mm:ss"`
`"21:34:46"` +`hour_minute`
`strict_hour_minute`
A two-digit hour and two-digit minute. | `"HH:mm"`
`"21:34"` +`hour`
`strict_hour`
A two-digit hour. | `"HH"`
`"21"` **T times** | -`t_time`
`strict_t_time`
A two-digit hour, two-digit minute, two-digit second, one- to nine-digit fraction of a second, and time zone offset, preceded by `T`. | "`T`HH:mm:ss.SSSSSSSSSZ"
"T21:34:46.123456789-04:00"
"T21:34:46.1-04:00" -`t_time_no_millis`
`strict_t_time_no_millis`
A two-digit hour, two-digit minute, two-digit second, and time zone offset, preceded by `T`. | "`T`HH:mm:ssZ"
"T21:34:46-04:00" +`t_time`
`strict_t_time`
A two-digit hour, two-digit minute, two-digit second, one- to nine-digit fraction of a second, and time zone offset, preceded by `T`. | `"`T`HH:mm:ss.SSSSSSSSSZ"`
`"T21:34:46.123456789-04:00"`
`"T21:34:46.1-04:00"` +`t_time_no_millis`
`strict_t_time_no_millis`
A two-digit hour, two-digit minute, two-digit second, and time zone offset, preceded by `T`. | `"`T`HH:mm:ssZ"`
`"T21:34:46-04:00"` **Ordinal dates** | -`ordinal_date_time`
`strict_ordinal_date_time`
A full ordinal date and time separated by `T`. | "yyyy-DDD`T`HH:mm:ss.SSSZ"
"2019-082T21:34:46.123-04:00" -`ordinal_date_time_no_millis`
`strict_ordinal_date_time_no_millis`
A full ordinal date and time without milliseconds, separated by `T`. | "yyyy-DDD`T`HH:mm:ssZ"
"2019-082T21:34:46-04:00" -`ordinal_date`
`strict_ordinal_date`
A full ordinal date with a four-digit year and three-digit ordinal day of the year. | "yyyy-DDD"
"2019-082" +`ordinal_date_time`
`strict_ordinal_date_time`
A full ordinal date and time separated by `T`. | `"yyyy-DDD`T`HH:mm:ss.SSSZ"`
`"2019-082T21:34:46.123-04:00"` +`ordinal_date_time_no_millis`
`strict_ordinal_date_time_no_millis`
A full ordinal date and time without milliseconds, separated by `T`. | `"yyyy-DDD`T`HH:mm:ssZ"`
`"2019-082T21:34:46-04:00"` +`ordinal_date`
`strict_ordinal_date`
A full ordinal date with a four-digit year and three-digit ordinal day of the year. | `"yyyy-DDD"`
`"2019-082"` **Week-based dates** | -`week_date_time`
`strict_week_date_time`
A full week-based date and time separated by `T`. Week date is a four-digit week-based year, two-digit ordinal week of the year, and one-digit ordinal day of the week. Time is a two-digit hour, two-digit minute, two-digit second, one- to nine-digit fraction of a second, and a time zone offset. | "YYYY-`W`ww-e`T`HH:mm:ss.SSSSSSSSSZ"
"2019-W12-6T21:34:46.1-04:00"
"2019-W12-6T21:34:46.123456789-04:00" -`week_date_time_no_millis`
`strict_week_date_time_no_millis`
A full week-based date and time without milliseconds, separated by `T`. Week date is a four-digit week-based year, two-digit ordinal week of the year, and one-digit ordinal day of the week. Time is a two-digit hour, two-digit minute, two-digit second, and time zone offset. | "YYYY-`W`ww-e`T`HH:mm:ssZ"
"2019-W12-6T21:34:46-04:00" -`week_date`
`strict_week_date`
A full week-based date with a four-digit week-based year, two-digit ordinal week of the year, and one-digit ordinal day of the week. | "YYYY-`W`ww-e"
"2019-W12-6" -`weekyear_week_day`
`strict_weekyear_week_day`
A four-digit week-based year, two-digit ordinal week of the year, and one digit day of the week. | "YYYY-'W'ww-e"
"2019-W12-6" -`weekyear_week`
`strict_weekyear_week`
A four-digit week-based year and two-digit ordinal week of the year. | "YYYY-`W`ww"
"2019-W12" -`weekyear`
`strict_weekyear`
A four-digit week-based year. | "YYYY"
"2019" +`week_date_time`
`strict_week_date_time`
A full week-based date and time separated by `T`. Week date is a four-digit week-based year, two-digit ordinal week of the year, and one-digit ordinal day of the week. Time is a two-digit hour, two-digit minute, two-digit second, one- to nine-digit fraction of a second, and a time zone offset. | `"YYYY-`W`ww-e`T`HH:mm:ss.SSSSSSSSSZ"`
`"2019-W12-6T21:34:46.1-04:00"`
`"2019-W12-6T21:34:46.123456789-04:00"` +`week_date_time_no_millis`
`strict_week_date_time_no_millis`
A full week-based date and time without milliseconds, separated by `T`. Week date is a four-digit week-based year, two-digit ordinal week of the year, and one-digit ordinal day of the week. Time is a two-digit hour, two-digit minute, two-digit second, and time zone offset. | `"YYYY-`W`ww-e`T`HH:mm:ssZ"`
`"2019-W12-6T21:34:46-04:00"` +`week_date`
`strict_week_date`
A full week-based date with a four-digit week-based year, two-digit ordinal week of the year, and one-digit ordinal day of the week. | `"YYYY-`W`ww-e"`
`"2019-W12-6"` +`weekyear_week_day`
`strict_weekyear_week_day`
A four-digit week-based year, two-digit ordinal week of the year, and one-digit day of the week. | `"YYYY-'W'ww-e"`
`"2019-W12-6"` +`weekyear_week`
`strict_weekyear_week`
A four-digit week-based year and two-digit ordinal week of the year. | `"YYYY-`W`ww"`
`"2019-W12"` +`weekyear`
`strict_weekyear`
A four-digit week-based year. | `"YYYY"`
`"2019"` ## Custom formats -You can create custom formats for date fields. For example, the following request specifies a date in the common "MM/dd/yyyy" format: +You can create custom formats for date fields. For example, the following request specifies a date in the common `"MM/dd/yyyy"` format: ```json PUT testindex @@ -257,9 +258,9 @@ Date math supports the following time units: The following example expressions illustrate using date math: - `now+1M`: The current date and time in milliseconds since the epoch, plus 1 month. -- `2022-05-18||/M`: 05/18/2022, rounded to the beginning of the month. Resolves to `2022-05-01`. -- `2022-05-18T15:23||/h`: 15:23 on 05/18/2022, rounded to the beginning of the hour. Resolves to `2022-05-18T15`. -- `2022-05-18T15:23:17.789||+2M-1d/d`: 15:23:17.789 on 05/18/2022 plus 2 months minus 1 day, rounded to the beginning of the day. Resolves to `2022-07-17`. +- `2022-05-18||/M`: `05/18/2022`, rounded to the beginning of the month. Resolves to `2022-05-01`. +- `2022-05-18T15:23||/h`: `15:23` on `05/18/2022`, rounded to the beginning of the hour. Resolves to `2022-05-18T15`. +- `2022-05-18T15:23:17.789||+2M-1d/d`: `15:23:17.789` on `05/18/2022` plus 2 months minus 1 day, rounded to the beginning of the day. Resolves to `2022-07-17`. ### Using date math in a range query @@ -300,7 +301,7 @@ PUT testindex/_doc/2 ``` {% include copy-curl.html %} -The following query searches for documents with `release_date` within 2 months and 1 day of 09/14/2022. The lower boundary of the range is rounded to the beginning of the day on 09/14/2022: +The following query searches for documents with `release_date` within 2 months and 1 day of `09/14/2022`. The lower boundary of the range is rounded to the beginning of the day on `09/14/2022`: ```json GET testindex/_search @@ -355,4 +356,4 @@ The response contains both documents: ] } } -``` \ No newline at end of file +``` diff --git a/_field-types/supported-field-types/index.md b/_field-types/supported-field-types/index.md index d1362bead5..69ca0032be 100644 --- a/_field-types/supported-field-types/index.md +++ b/_field-types/supported-field-types/index.md @@ -23,7 +23,7 @@ Boolean | [`boolean`]({{site.url}}{{site.baseurl}}/field-types/supported-field-t IP | [`ip`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/ip/): An IP address in IPv4 or IPv6 format. [Range]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/range/) | A range of values (`integer_range`, `long_range`, `double_range`, `float_range`, `date_range`, `ip_range`). [Object]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/object-fields/)| [`object`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/object/): A JSON object.
[`nested`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/nested/): Used when objects in an array need to be indexed independently as separate documents.
[`flat_object`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/flat-object/): A JSON object treated as a string.
[`join`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/join/): Establishes a parent-child relationship between documents in the same index. -[String]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/string/)|[`keyword`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/keyword/): Contains a string that is not analyzed.
[`text`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/text/): Contains a string that is analyzed.
[`token_count`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/token-count/): Stores the number of analyzed tokens in a string. +[String]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/string/)|[`keyword`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/keyword/): Contains a string that is not analyzed.
[`text`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/text/): Contains a string that is analyzed.
[`match_only_text`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/match-only-text/): A space-optimized version of a `text` field.
[`token_count`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/token-count/): Stores the number of analyzed tokens in a string. [Autocomplete]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/autocomplete/) |[`completion`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/completion/): Provides autocomplete functionality through a completion suggester.
[`search_as_you_type`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/search-as-you-type/): Provides search-as-you-type functionality using both prefix and infix completion. [Geographic]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/geographic/)| [`geo_point`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/geo-point/): A geographic point.
[`geo_shape`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/geo-shape/): A geographic shape. [Rank]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/rank/) | Boosts or decreases the relevance score of documents (`rank_feature`, `rank_features`). diff --git a/_field-types/supported-field-types/keyword.md b/_field-types/supported-field-types/keyword.md index 628d720b02..eea6cc664b 100644 --- a/_field-types/supported-field-types/keyword.md +++ b/_field-types/supported-field-types/keyword.md @@ -14,12 +14,14 @@ redirect_from: A keyword field type contains a string that is not analyzed. It allows only exact, case-sensitive matches. +By default, keyword fields are both indexed (because `index` is enabled) and stored on disk (because `doc_values` is enabled). To reduce disk space, you can specify not to index keyword fields by setting `index` to `false`. + If you need to use a field for full-text search, map it as [`text`]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/text/) instead. {: .note } ## Example -Create a mapping with a keyword field: +The following query creates a mapping with a keyword field. Setting `index` to `false` specifies to store the `genre` field on disk and to retrieve it using `doc_values`: ```json PUT movies @@ -27,7 +29,8 @@ PUT movies "mappings" : { "properties" : { "genre" : { - "type" : "keyword" + "type" : "keyword", + "index" : false } } } @@ -46,12 +49,13 @@ Parameter | Description `eager_global_ordinals` | Specifies whether global ordinals should be loaded eagerly on refresh. If the field is often used for aggregations, this parameter should be set to `true`. Default is `false`. `fields` | To index the same string in several ways (for example, as a keyword and text), provide the fields parameter. You can specify one version of the field to be used for search and another to be used for sorting and aggregations. `ignore_above` | Any string longer than this integer value should not be indexed. Default is 2147483647. Default dynamic mapping creates a keyword subfield for which `ignore_above` is set to 256. -`index` | A Boolean value that specifies whether the field should be searchable. Default is `true`. +`index` | A Boolean value that specifies whether the field should be searchable. Default is `true`. To reduce disk space, set `index` to `false`. `index_options` | Information to be stored in the index that will be considered when calculating relevance scores. Can be set to `freqs` for term frequency. Default is `docs`. `meta` | Accepts metadata for this field. -`normalizer` | Specifies how to preprocess this field before indexing (for example, make it lowercase). Default is `null` (no preprocessing). +[`normalizer`]({{site.url}}{{site.baseurl}}/analyzers/normalizers/) | Specifies how to preprocess this field before indexing (for example, make it lowercase). Default is `null` (no preprocessing). `norms` | A Boolean value that specifies whether the field length should be used when calculating relevance scores. Default is `false`. -[`null_value`]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/index#null-value) | A value to be used in place of `null`. Must be of the same type as the field. If this parameter is not specified, the field is treated as missing when its value is `null`. Default is `null`. +[`null_value`]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/index#null-value) | A value to be used in place of `null`. Must be of the same type as the field. 
If this parameter is not specified, the field is treated as missing when its value is `null`. Default is `null`. `similarity` | The ranking algorithm for calculating relevance scores. Default is `BM25`. `split_queries_on_whitespace` | A Boolean value that specifies whether full-text queries should be split on white space. Default is `false`. -`store` | A Boolean value that specifies whether the field value should be stored and can be retrieved separately from the _source field. Default is `false`. \ No newline at end of file +`store` | A Boolean value that specifies whether the field value should be stored and can be retrieved separately from the `_source` field. Default is `false`. + diff --git a/_field-types/supported-field-types/knn-vector.md b/_field-types/supported-field-types/knn-vector.md index 163f967a2c..741a86da64 100644 --- a/_field-types/supported-field-types/knn-vector.md +++ b/_field-types/supported-field-types/knn-vector.md @@ -7,7 +7,7 @@ parent: Supported field types has_math: true --- -# k-NN vector +# k-NN vector field type The [k-NN plugin]({{site.url}}{{site.baseurl}}/search-plugins/knn/index/) introduces a custom data type, the `knn_vector`, that allows users to ingest their k-NN vectors into an OpenSearch index and perform different kinds of k-NN search. The `knn_vector` field is highly configurable and can serve many different k-NN workloads. In general, a `knn_vector` field can be built either by providing a method definition or specifying a model id. @@ -266,4 +266,4 @@ else: return Byte(bval) ``` -{% include copy.html %} \ No newline at end of file +{% include copy.html %} diff --git a/_field-types/supported-field-types/match-only-text.md b/_field-types/supported-field-types/match-only-text.md new file mode 100644 index 0000000000..fd2c6b5850 --- /dev/null +++ b/_field-types/supported-field-types/match-only-text.md @@ -0,0 +1,101 @@ +--- +layout: default +title: Match-only text +nav_order: 61 +has_children: false +parent: String field types +grand_parent: Supported field types +--- + +# Match-only text field type + +A `match_only_text` field is a variant of a `text` field designed for full-text search when scoring and positional information of terms within a document are not critical. + +A `match_only_text` field is different from a `text` field in the following ways: + + - Omits storing positions, frequencies, and norms, reducing storage requirements. + - Disables scoring so that all matching documents receive a constant score of 1.0. + - Supports all query types except interval and span queries. + +Choose the `match_only_text` field type to prioritize efficient full-text search over complex ranking and positional queries while optimizing storage costs. Using `match_only_text` creates significantly smaller indexes, which results in lower storage costs, especially for large datasets. + +Use a `match_only_text` field when you need to quickly find documents containing specific terms without the overhead of storing frequencies and positions. The `match_only_text` field type is not the best choice for ranking results based on relevance or for queries that rely on term proximity or order, like interval or span queries. While this field type does support phrase queries, their performance isn't as efficient as when using the `text` field type. If identifying exact phrases or their locations within documents is essential, use the `text` field type instead. 
+ +## Example + +Create a mapping with a `match_only_text` field: + +```json +PUT movies +{ + "mappings" : { + "properties" : { + "title" : { + "type" : "match_only_text" + } + } + } +} +``` +{% include copy-curl.html %} + +## Parameters + +While `match_only_text` supports most parameters available for `text` fields, modifying most of them can be counterproductive. This field type is intended to be simple and efficient, minimizing data stored in the index to optimize storage costs. Therefore, keeping the default settings is generally the best approach. Any modifications beyond analyzer settings can reintroduce overhead and negate the efficiency benefits of `match_only_text`. + +The following table lists all parameters available for `match_text_only` fields. + +Parameter | Description +:--- | :--- +`analyzer` | The analyzer to be used for the field. By default, it will be used at index time and at search time. To override it at search time, set the `search_analyzer` parameter. Default is the `standard` analyzer, which uses grammar-based tokenization and is based on the [Unicode Text Segmentation](https://unicode.org/reports/tr29/) algorithm. +`boost` | All hits are assigned a score of 1 and are multiplied by `boost` to produce the final score for the query clause. +`eager_global_ordinals` | Specifies whether global ordinals should be loaded eagerly on refresh. If the field is often used for aggregations, this parameter should be set to `true`. Default is `false`. +`fielddata` | A Boolean value that specifies whether to access analyzed tokens for sorting, aggregation, and scripting. Default is `false`. +`fielddata_frequency_filter` | A JSON object specifying that only those analyzed tokens whose document frequency is between the `min` and `max` values (provided as either an absolute number or a percentage) should be loaded into memory. Frequency is computed per segment. Parameters: `min`, `max`, `min_segment_size`. Default is to load all analyzed tokens. +`fields` | To index the same string in several ways (for example, as a keyword and text), provide the `fields` parameter. You can specify one version of the field to be used for search and another to be used for sorting and aggregation. +`index` | A Boolean value that specifies whether the field should be searchable. Default is `true`. +`index_options` | You cannot modify this parameter. +`index_phrases` | Not supported. +`index_prefixes` | Not supported. +`meta` | Accepts metadata for this field. +`norms` | Norms are disabled and cannot be enabled. +`position_increment_gap` | Although positions are disabled, `position_increment_gap` behaves similarly to the `text` field when used in phrase queries. Such queries may be slower but are still functional. +`similarity` | Setting similarity has no impact. The `match_only_text` field type doesn't support queries like `more_like_this`, which rely on similarity. Use a `keyword` or `text` field for queries that rely on similarity. +`term_vector` | Term vectors are supported, but using them is discouraged because it contradicts the primary purpose of this field---storage optimization. + +## Migrating a field from `text` to `match_only_text` + +You can use the [Reindex API]({{site.url}}{{site.baseurl}}/api-reference/document-apis/reindex/) to migrate from a `text` field to `match_only_text` by updating the correct mapping in the destination index. + +In the following example, the `source` index contains a `title` field of type `text`. 
+ +Create a destination index with the `title` field mapped as `text`: + +```json +PUT destination +{ + "mappings" : { + "properties" : { + "title" : { + "type" : "match_only_text" + } + } + } +} +``` +{% include copy-curl.html %} + +Reindex the data: + +```json +POST _reindex +{ + "source": { + "index":"source" + }, + "dest": { + "index":"destination" + } +} +``` +{% include copy-curl.html %} diff --git a/_field-types/supported-field-types/string.md b/_field-types/supported-field-types/string.md index f24dea2325..c891f86cf6 100644 --- a/_field-types/supported-field-types/string.md +++ b/_field-types/supported-field-types/string.md @@ -18,4 +18,5 @@ Field data type | Description :--- | :--- [`keyword`]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/keyword/) | A string that is not analyzed. Useful for exact-value search. [`text`]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/text/) | A string that is analyzed. Useful for full-text search. +[`match_only_text`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/match-only-text/) | A space-optimized version of a `text` field. [`token_count`]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/token-count/) | Counts the number of tokens in a string. diff --git a/_field-types/supported-field-types/text.md b/_field-types/supported-field-types/text.md index 0a16f3aa95..16350c0cb3 100644 --- a/_field-types/supported-field-types/text.md +++ b/_field-types/supported-field-types/text.md @@ -12,12 +12,15 @@ redirect_from: # Text field type -A text field type contains a string that is analyzed. It is used for full-text search because it allows partial matches. Searches with multiple terms can match some but not all of them. Depending on the analyzer, results can be case insensitive, stemmed, stopwords removed, synonyms applied, etc. +A `text` field type contains a string that is analyzed. It is used for full-text search because it allows partial matches. Searches for multiple terms can match some but not all of them. Depending on the analyzer, results can be case insensitive, stemmed, have stopwords removed, have synonyms applied, and so on. If you need to use a field for exact-value search, map it as a [`keyword`]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/keyword/) instead. {: .note } +The [`match_only_text`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/match-only-text/) field is a space-optimized version of the `text` field. If you don't need to query phrases or use positional queries, map the field as `match_only_text` instead of `text`. Positional queries are queries in which the position of the term in the phrase is important, such as interval or span queries. 
+{: .note} + ## Example Create a mapping with a text field: diff --git a/_field-types/supported-field-types/token-count.md b/_field-types/supported-field-types/token-count.md index 07982431e7..6c3445e6a7 100644 --- a/_field-types/supported-field-types/token-count.md +++ b/_field-types/supported-field-types/token-count.md @@ -1,7 +1,7 @@ --- layout: default title: Token count -nav_order: 48 +nav_order: 70 has_children: false parent: String field types grand_parent: Supported field types diff --git a/_getting-started/communicate.md b/_getting-started/communicate.md new file mode 100644 index 0000000000..391bc9bef0 --- /dev/null +++ b/_getting-started/communicate.md @@ -0,0 +1,320 @@ +--- +layout: default +title: Communicate with OpenSearch +nav_order: 30 +--- + +# Communicate with OpenSearch + +You can communicate with OpenSearch using the REST API or one of the OpenSearch language clients. This page introduces the OpenSearch REST API. If you need to communicate with OpenSearch in your programming language, see the [Clients]({{site.url}}{{site.baseurl}}/clients/) section for a list of available clients. + +## OpenSearch REST API + +You interact with OpenSearch clusters using the REST API, which offers a lot of flexibility. Through the REST API, you can change most OpenSearch settings, modify indexes, check cluster health, get statistics---almost everything. You can use clients like [cURL](https://curl.se/) or any programming language that can send HTTP requests. + +You can send HTTP requests in your terminal or in the [Dev Tools console]({{site.url}}{{site.baseurl}}/dashboards/dev-tools/index-dev/) in OpenSearch Dashboards. + +### Sending requests in a terminal + +When sending cURL requests in a terminal, the request format varies depending on whether you're using the Security plugin. As an example, consider a request to the Cluster Health API. + +If you're not using the Security plugin, send the following request: + +```bash +curl -XGET "http://localhost:9200/_cluster/health" +``` +{% include copy.html %} + +If you're using the Security plugin, provide the username and password in the request: + +```bash +curl -X GET "http://localhost:9200/_cluster/health" -ku admin: +``` +{% include copy.html %} + +The default username is `admin`, and the password is set in your `docker-compose.yml` file in the `OPENSEARCH_INITIAL_ADMIN_PASSWORD=` setting. + +OpenSearch generally returns responses in a flat JSON format by default. For a human-readable response body, provide the `pretty` query parameter: + +```bash +curl -XGET "http://localhost:9200/_cluster/health?pretty" +``` +{% include copy.html %} + +For more information about `pretty` and other useful query parameters, see [Common REST parameters]({{site.url}}{{site.baseurl}}/opensearch/common-parameters/). + +For requests that contain a body, specify the `Content-Type` header and provide the request payload in the `-d` (data) option: + +```json +curl -XGET "http://localhost:9200/students/_search?pretty" -H 'Content-Type: application/json' -d' +{ + "query": { + "match_all": {} + } +}' +``` +{% include copy.html %} + +### Sending requests in Dev Tools + +The Dev Tools console in OpenSearch Dashboards uses a simpler syntax to format REST requests as compared to the cURL command. To send requests in Dev Tools, use the following steps: + +1. Access OpenSearch Dashboards by opening `http://localhost:5601/` in a web browser on the same host that is running your OpenSearch cluster. 
The default username is `admin`, and the password is set in your `docker-compose.yml` file in the `OPENSEARCH_INITIAL_ADMIN_PASSWORD=` setting. +1. On the top menu bar, go to **Management > Dev Tools**. +1. In the left pane of the console, enter the following request: + ```json + GET _cluster/health + ``` + {% include copy-curl.html %} +1. Choose the triangle icon on the upper right of the request to submit the query. You can also submit the request by pressing `Ctrl+Enter` (or `Cmd+Enter` for Mac users). To learn more about using the OpenSearch Dashboards console for submitting queries, see [Running queries in the console]({{site.url}}{{site.baseurl}}/dashboards/run-queries/). + +In the following sections, and in most of the OpenSearch documentation, requests are presented in the Dev Tools console format. + +## Indexing documents + +To add a JSON document to an OpenSearch index (that is, to _index_ a document), you send an HTTP request with the following header: + +```json +PUT https://://_doc/ +``` + +For example, to index a document representing a student, you can send the following request: + +```json +PUT /students/_doc/1 +{ + "name": "John Doe", + "gpa": 3.89, + "grad_year": 2022 +} +``` +{% include copy-curl.html %} + +Once you send the preceding request, OpenSearch creates an index called `students` and stores the ingested document in the index. If you don't provide an ID for your document, OpenSearch generates a document ID. In the preceding request, the document ID is specified as the student ID (`1`). + +To learn more about indexing, see [Managing indexes]({{site.url}}{{site.baseurl}}/im-plugin/). + +## Dynamic mapping + +When you index a document, OpenSearch infers the field types from the JSON types submitted in the document. This process is called _dynamic mapping_. For more information, see [Dynamic mapping]({{site.url}}{{site.baseurl}}/field-types/#dynamic-mapping). + +To view the inferred field types, send a request to the `_mapping` endpoint: + +```json +GET /students/_mapping +``` +{% include copy-curl.html %} + +OpenSearch responds with the field `type` for each field: + +```json +{ + "students": { + "mappings": { + "properties": { + "gpa": { + "type": "float" + }, + "grad_year": { + "type": "long" + }, + "name": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256 + } + } + } + } + } + } +} +``` + +OpenSearch mapped the numeric fields to the `float` and `long` types. Notice that OpenSearch mapped the `name` text field to `text` and added a `name.keyword` subfield mapped to `keyword`. Fields mapped to `text` are analyzed (lowercased and split into terms) and can be used for full-text search. Fields mapped to `keyword` are used for exact term search. + +OpenSearch mapped the `grad_year` field to `long`. If you want to map it to the `date` type instead, you need to [delete the index](#deleting-an-index) and then recreate it, explicitly specifying the mappings. For instructions on how to explicitly specify mappings, see [Index settings and mappings](#index-mappings-and-settings). + +## Searching for documents + +To run a search for the document, specify the index that you're searching and a query that will be used to match documents. 
The simplest query is the `match_all` query, which matches all documents in an index: + +```json +GET /students/_search +{ + "query": { + "match_all": {} + } +} +``` +{% include copy-curl.html %} + +OpenSearch returns the indexed document: + +```json +{ + "took": 12, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 1, + "hits": [ + { + "_index": "students", + "_id": "1", + "_score": 1, + "_source": { + "name": "John Doe", + "gpa": 3.89, + "grad_year": 2022 + } + } + ] + } +} +``` + +For more information about search, see [Search your data]({{site.url}}{{site.baseurl}}/getting-started/search-data/). + +## Updating documents + +In OpenSearch, documents are immutable. However, you can update a document by retrieving it, updating its information, and reindexing it. You can update an entire document using the Index Document API, providing values for all existing and added fields in the document. For example, to update the `gpa` field and add an `address` field to the previously indexed document, send the following request: + +```json +PUT /students/_doc/1 +{ + "name": "John Doe", + "gpa": 3.91, + "grad_year": 2022, + "address": "123 Main St." +} +``` +{% include copy.html %} + +Alternatively, you can update parts of a document by calling the Update Document API: + +```json +POST /students/_update/1/ +{ + "doc": { + "gpa": 3.91, + "address": "123 Main St." + } +} +``` +{% include copy-curl.html %} + +For more information about partial document updates, see [Update Document API]({{site.url}}{{site.baseurl}}/api-reference/document-apis/update-document/). + +## Deleting a document + +To delete a document, send a delete request and provide the document ID: + +```json +DELETE /students/_doc/1 +``` +{% include copy-curl.html %} + +## Deleting an index + +To delete an index, send the following request: + +```json +DELETE /students +``` +{% include copy-curl.html %} + +## Index mappings and settings + +OpenSearch indexes are configured with mappings and settings: + +- A _mapping_ is a collection of fields and the types of those fields. For more information, see [Mappings and field types]({{site.url}}{{site.baseurl}}/field-types/). +- _Settings_ include index data like the index name, creation date, and number of shards. For more information, see [Configuring OpenSearch]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index/). + +You can specify the mappings and settings in one request. 
For example, the following request specifies the number of index shards and maps the `name` field to `text` and the `grad_year` field to `date`: + +```json +PUT /students +{ + "settings": { + "index.number_of_shards": 1 + }, + "mappings": { + "properties": { + "name": { + "type": "text" + }, + "grad_year": { + "type": "date" + } + } + } +} +``` +{% include copy-curl.html %} + +Now you can index the same document that you indexed in the previous section: + +```json +PUT /students/_doc/1 +{ + "name": "John Doe", + "gpa": 3.89, + "grad_year": 2022 +} +``` +{% include copy-curl.html %} + +To view the mappings for the index fields, send the following request: + +```json +GET /students/_mapping +``` +{% include copy-curl.html %} + +OpenSearch mapped the `name` and `grad_year` fields according to the specified types and inferred the field type for the `gpa` field: + +```json +{ + "students": { + "mappings": { + "properties": { + "gpa": { + "type": "float" + }, + "grad_year": { + "type": "date" + }, + "name": { + "type": "text" + } + } + } + } +} +``` + +Once a field is created, you cannot change its type. Changing a field type requires deleting the index and recreating it with the new mappings. +{: .note} + +## Further reading + +- For information about the OpenSearch REST API, see the [REST API reference]({{site.url}}{{site.baseurl}}/api-reference/). +- For information about OpenSearch language clients, see [Clients]({{site.url}}{{site.baseurl}}/clients/). +- For information about mappings, see [Mappings and field types]({{site.url}}{{site.baseurl}}/field-types/). +- For information about settings, see [Configuring OpenSearch]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index/). + +## Next steps + +- See [Ingest data into OpenSearch]({{site.url}}{{site.baseurl}}/getting-started/ingest-data/) to learn about ingestion options. \ No newline at end of file diff --git a/_getting-started/index.md b/_getting-started/index.md new file mode 100644 index 0000000000..b25587c522 --- /dev/null +++ b/_getting-started/index.md @@ -0,0 +1,38 @@ +--- +layout: default +title: Getting started +nav_order: 1 +has_children: true +has_toc: false +nav_exclude: true +permalink: /getting-started/ +--- + +# Getting started + +OpenSearch is a distributed search and analytics engine based on [Apache Lucene](https://lucene.apache.org/). After adding your data to OpenSearch, you can perform full-text searches on it with all of the features you might expect: search by field, search multiple indexes, boost fields, rank results by score, sort results by field, and aggregate results. + +Unsurprisingly, builders often use a search engine like OpenSearch as the backend for a search application---think [Wikipedia](https://en.wikipedia.org/wiki/Wikipedia:FAQ/Technical#What_software_is_used_to_run_Wikipedia?) or an online store. It offers excellent performance and can scale up or down as the needs of the application grow or shrink. + +An equally popular, but less obvious use case is log analytics, in which you take the logs from an application, feed them into OpenSearch, and use the rich search and visualization functionality to identify issues. For example, a malfunctioning web server might throw a 500 error 0.5% of the time, which can be hard to notice unless you have a real-time graph of all HTTP status codes that the server has thrown in the past four hours. 
You can use [OpenSearch Dashboards]({{site.url}}{{site.baseurl}}/dashboards/index/) to build these sorts of visualizations from data in OpenSearch. + +## Components + +OpenSearch is more than just the core engine. It also includes the following components: + +- [OpenSearch Dashboards]({{site.url}}{{site.baseurl}}/dashboards/index/): The OpenSearch data visualization UI. +- [Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/): A server-side data collector capable of filtering, enriching, transforming, normalizing, and aggregating data for downstream analysis and visualization. +- [Clients]({{site.url}}{{site.baseurl}}/clients/): Language APIs that let you communicate with OpenSearch in several popular programming languages. + +## Use cases + +OpenSearch supports a variety of use cases, for example: + +- [Observability]({{site.url}}{{site.baseurl}}/observing-your-data/): Visualize data-driven events by using Piped Processing Language (PPL) to explore, discover, and query data stored in OpenSearch. +- [Search]({{site.url}}{{site.baseurl}}/search-plugins/): Choose the best search method for your application, from regular lexical search to conversational search powered by machine learning (ML). +- [Machine learning]({{site.url}}{{site.baseurl}}/ml-commons-plugin/): Integrate ML models into your OpenSearch application. +- [Security analytics]({{site.url}}{{site.baseurl}}/security-analytics/): Investigate, detect, analyze, and respond to security threats that can jeopardize organizational success and online operations. + +## Next steps + +- See [Introduction to OpenSearch]({{site.url}}{{site.baseurl}}/getting-started/intro/) to learn about essential OpenSearch concepts. \ No newline at end of file diff --git a/_getting-started/ingest-data.md b/_getting-started/ingest-data.md new file mode 100644 index 0000000000..73cf1502f7 --- /dev/null +++ b/_getting-started/ingest-data.md @@ -0,0 +1,111 @@ +--- +layout: default +title: Ingest data +nav_order: 40 +--- + +# Ingest your data into OpenSearch + +There are several ways to ingest data into OpenSearch: + +- Ingest individual documents. For more information, see [Indexing documents]({{site.url}}{{site.baseurl}}/getting-started/communicate/#indexing-documents). +- Index multiple documents in bulk. For more information, see [Bulk indexing](#bulk-indexing). +- Use Data Prepper---an OpenSearch server-side data collector that can enrich data for downstream analysis and visualization. For more information, see [Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/). +- Use other ingestion tools. For more information, see [OpenSearch tools]({{site.url}}{{site.baseurl}}/tools/). + +## Bulk indexing + +To index documents in bulk, you can use the [Bulk API]({{site.url}}{{site.baseurl}}/api-reference/document-apis/bulk/). For example, if you want to index several documents into the `students` index, send the following request: + +```json +POST _bulk +{ "create": { "_index": "students", "_id": "2" } } +{ "name": "Jonathan Powers", "gpa": 3.85, "grad_year": 2025 } +{ "create": { "_index": "students", "_id": "3" } } +{ "name": "Jane Doe", "gpa": 3.52, "grad_year": 2024 } +``` +{% include copy-curl.html %} + +## Experiment with sample data + +OpenSearch provides a fictitious e-commerce dataset that you can use to experiment with REST API requests and OpenSearch Dashboards visualizations. You can create an index and define field mappings by downloading the corresponding dataset and mapping files. 
+ +### Create a sample index + +Use the following steps to create a sample index and define field mappings for the document fields: + +1. Download [ecommerce-field_mappings.json](https://github.com/opensearch-project/documentation-website/blob/{{site.opensearch_major_minor_version}}/assets/examples/ecommerce-field_mappings.json). This file defines a [mapping]({{site.url}}{{site.baseurl}}/opensearch/mappings/) for the sample data you will use. + + To use cURL, send the following request: + + ```bash + curl -O https://raw.githubusercontent.com/opensearch-project/documentation-website/{{site.opensearch_major_minor_version}}/assets/examples/ecommerce-field_mappings.json + ``` + {% include copy.html %} + + To use wget, send the following request: + + ``` + wget https://raw.githubusercontent.com/opensearch-project/documentation-website/{{site.opensearch_major_minor_version}}/assets/examples/ecommerce-field_mappings.json + ``` + {% include copy.html %} + +1. Download [ecommerce.json](https://github.com/opensearch-project/documentation-website/blob/{{site.opensearch_major_minor_version}}/assets/examples/ecommerce.json). This file contains the index data formatted so that it can be ingested by the Bulk API: + + To use cURL, send the following request: + + ```bash + curl -O https://raw.githubusercontent.com/opensearch-project/documentation-website/{{site.opensearch_major_minor_version}}/assets/examples/ecommerce.json + ``` + {% include copy.html %} + + To use wget, send the following request: + + ``` + wget https://raw.githubusercontent.com/opensearch-project/documentation-website/{{site.opensearch_major_minor_version}}/assets/examples/ecommerce.json + ``` + {% include copy.html %} + +1. Define the field mappings provided in the mapping file: + ```bash + curl -H "Content-Type: application/x-ndjson" -X PUT "https://localhost:9200/ecommerce" -ku admin: --data-binary "@ecommerce-field_mappings.json" + ``` + {% include copy.html %} + +1. Upload the documents using the Bulk API: + + ```bash + curl -H "Content-Type: application/x-ndjson" -X PUT "https://localhost:9200/ecommerce/_bulk" -ku admin: --data-binary "@ecommerce.json" + ``` + {% include copy.html %} + +### Query the data + +Query the data using the Search API. The following query searches for documents in which `customer_first_name` is `Sonya`: + +```json +GET ecommerce/_search +{ + "query": { + "match": { + "customer_first_name": "Sonya" + } + } +} +``` +{% include copy-curl.html %} + +### Visualize the data + +To learn how to use OpenSearch Dashboards to visualize the data, see the [OpenSearch Dashboards quickstart guide]({{site.url}}{{site.baseurl}}/dashboards/quickstart/). + +## Further reading + +- For information about Data Prepper, see [Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/). +- For information about ingestion tools, see [OpenSearch tools]({{site.url}}{{site.baseurl}}/tools/). +- For information about OpenSearch Dashboards, see [OpenSearch Dashboards quickstart guide]({{site.url}}{{site.baseurl}}/dashboards/quickstart/). +- For information about bulk indexing, see [Bulk API]({{site.url}}{{site.baseurl}}/api-reference/document-apis/bulk/). + +## Next steps + +- See [Search your data]({{site.url}}{{site.baseurl}}/getting-started/search-data/) to learn about search options. 
\ No newline at end of file diff --git a/_getting-started/intro.md b/_getting-started/intro.md new file mode 100644 index 0000000000..272d8d6981 --- /dev/null +++ b/_getting-started/intro.md @@ -0,0 +1,161 @@ +--- +layout: default +title: Intro to OpenSearch +nav_order: 2 +has_math: true +redirect_from: + - /intro/ +--- + +# Introduction to OpenSearch + +OpenSearch is a distributed search and analytics engine that supports various use cases, from implementing a search box on a website to analyzing security data for threat detection. The term _distributed_ means that you can run OpenSearch on multiple computers. _Search and analytics_ means that you can search and analyze your data once you ingest it into OpenSearch. No matter your type of data, you can store and analyze it using OpenSearch. + +## Document + +A _document_ is a unit that stores information (text or structured data). In OpenSearch, documents are stored in [JSON](https://www.json.org/) format. + +You can think of a document in several ways: + +- In a database of students, a document might represent one student. +- When you search for information, OpenSearch returns documents related to your search. +- A document represents a row in a traditional database. + +For example, in a school database, a document might represent one student and contain the following data. + +ID | Name | GPA | Graduation year | +:--- | :--- | :--- | :--- | +1 | John Doe | 3.89 | 2022 | + +Here is what this document looks like in JSON format: + +```json +{ + "name": "John Doe", + "gpa": 3.89, + "grad_year": 2022 +} +``` + +You'll learn about how document IDs are assigned in [Indexing documents]({{site.url}}{{site.baseurl}}/getting-started/communicate/#indexing-documents). + +## Index + +An _index_ is a collection of documents. + +You can think of an index in several ways: + +- In a database of students, an index represents all students in the database. +- When you search for information, you query data contained in an index. +- An index represents a database table in a traditional database. + +For example, in a school database, an index might contain all students in the school. + +ID | Name | GPA | Graduation year +:--- | :--- | :--- | :--- +1 | John Doe | 3.89 | 2022 +2 | Jonathan Powers | 3.85 | 2025 +3 | Jane Doe | 3.52 | 2024 + +## Clusters and nodes + +OpenSearch is designed to be a distributed search engine, meaning that it can run on one or more _nodes_---servers that store your data and process search requests. An OpenSearch *cluster* is a collection of nodes. + +You can run OpenSearch locally on a laptop---its system requirements are minimal---but you can also scale a single cluster to hundreds of powerful machines in a data center. + +In a single-node cluster, such as one deployed on a laptop, one machine has to perform every task: manage the state of the cluster, index and search data, and perform any preprocessing of data prior to indexing it. As a cluster grows, however, you can subdivide responsibilities. Nodes with fast disks and plenty of RAM might perform well when indexing and searching data, whereas a node with plenty of CPU power and a tiny disk could manage cluster state. + +In each cluster, there is an elected _cluster manager_ node, which orchestrates cluster-level operations, such as creating an index. Nodes communicate with each other, so if your request is routed to a node, that node sends requests to other nodes, gathers the nodes' responses, and returns the final response. 
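+If you already have a cluster running, you can see its nodes and the roles they perform by sending a request to the CAT Nodes API (shown here in the Dev Tools console format used throughout this documentation):
+
+```json
+GET _cat/nodes?v
+```
+{% include copy-curl.html %}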
+ +For more information about other node types, see [Cluster formation]({{site.url}}{{site.baseurl}}/opensearch/cluster/). + +## Shards + +OpenSearch splits indexes into _shards_. Each shard stores a subset of all documents in an index, as shown in the following image. + +An index is split into shards + +Shards are used for even distribution across nodes in a cluster. For example, a 400 GB index might be too large for any single node in your cluster to handle, but split into 10 shards of 40 GB each, OpenSearch can distribute the shards across 10 nodes and manage each shard individually. Consider a cluster with 2 indexes: index 1 and index 2. Index 1 is split into 2 shards, and index 2 is split into 4 shards. The shards are distributed across nodes 1 and 2, as shown in the following image. + +A cluster containing two indexes and two nodes + +Despite being one piece of an OpenSearch index, each shard is actually a full Lucene index. This detail is important because each instance of Lucene is a running process that consumes CPU and memory. More shards is not necessarily better. Splitting a 400 GB index into 1,000 shards, for example, would unnecessarily strain your cluster. A good rule of thumb is to limit shard size to 10--50 GB. + +## Primary and replica shards + +In OpenSearch, a shard may be either a _primary_ (original) shard or a _replica_ (copy) shard. By default, OpenSearch creates a replica shard for each primary shard. Thus, if you split your index into 10 shards, OpenSearch creates 10 replica shards. For example, consider the cluster described in the previous section. If you add 1 replica for each shard of each index in the cluster, your cluster will contain a total of 2 shards and 2 replicas for index 1 and 4 shards and 4 replicas for index 2, as shown in the following image. + +A cluster containing two indexes with one replica shard for each shard in the index + +These replica shards act as backups in the event of a node failure---OpenSearch distributes replica shards to different nodes than their corresponding primary shards---but they also improve the speed at which the cluster processes search requests. You might specify more than one replica per index for a search-heavy workload. + +## Inverted index + +An OpenSearch index uses a data structure called an _inverted index_. An inverted index maps words to the documents in which they occur. For example, consider an index containing the following two documents: + +- Document 1: "Beauty is in the eye of the beholder" +- Document 2: "Beauty and the beast" + +An inverted index for such an index maps the words to the documents in which they occur: + +Word | Document +:--- | :--- +beauty | 1, 2 +is | 1 +in | 1 +the | 1, 2 +eye | 1 +of | 1 +the | 1 +beholder | 1 +and | 2 +beast | 2 + +In addition to the document ID, OpenSearch stores the position of the word within the document for running phrase queries, where words must appear next to each other. + +## Relevance + +When you search for a document, OpenSearch matches the words in the query to the words in the documents. For example, if you search the index described in the previous section for the word `beauty`, OpenSearch will return documents 1 and 2. Each document is assigned a _relevance score_ that tells you how well the document matched the query. + +Individual words in a search query are called search _terms_. Each search term is scored according to the following rules: + +1. A search term that occurs more frequently in a document will tend to be scored higher. 
A document about dogs that uses the word `dog` many times is likely more relevant than a document that contains the word `dog` fewer times. This is the _term frequency_ component of the score. + +1. A search term that occurs in more documents will tend to be scored lower. A query for the terms `blue` and `axolotl` should prefer documents that contain `axolotl` over the likely more common word `blue`. This is the _inverse document frequency_ component of the score. + +1. A match on a longer document should tend to be scored lower than a match on a shorter document. A document that contains a full dictionary would match on any word but is not very relevant to any particular word. This corresponds to the _length normalization_ component of the score. + +OpenSearch uses the BM25 ranking algorithm to calculate document relevance scores and then returns the results sorted by relevance. To learn more, see [Okapi BM25](https://en.wikipedia.org/wiki/Okapi_BM25). + +## Advanced concepts + +The following section describes more advanced OpenSearch concepts. + +### Update lifecycle + +The lifecycle of an update operation consists of the following steps: + +1. An update is received by a primary shard and is written to the shard's transaction log ([translog](#translog)). The translog is flushed to disk (followed by an fsync) before the update is acknowledged. This guarantees durability. +1. The update is also passed to the Lucene index writer, which adds it to an in-memory buffer. +1. On a [refresh operation](#refresh), the Lucene index writer flushes the in-memory buffers to disk (with each buffer becoming a new Lucene segment), and a new index reader is opened over the resulting segment files. The updates are now visible for search. +1. On a [flush operation](#flush), the shard fsyncs the Lucene segments. Because the segment files are a durable representation of the updates, the translog is no longer needed to provide durability, so the updates can be purged from the translog. + +### Translog + +An indexing or bulk call responds when the documents have been written to the translog and the translog is flushed to disk, so the updates are durable. The updates will not be visible to search requests until after a [refresh operation](#refresh). + +### Refresh + +Periodically, OpenSearch performs a _refresh_ operation, which writes the documents from the in-memory Lucene index to files. These files are not guaranteed to be durable because an `fsync` is not performed. A refresh makes documents available for search. + +### Flush + +A _flush_ operation persists the files to disk using `fsync`, ensuring durability. Flushing ensures that the data stored only in the translog is recorded in the Lucene index. OpenSearch performs a flush as needed to ensure that the translog does not grow too large. + +### Merge + +In OpenSearch, a shard is a Lucene index, which consists of _segments_ (or segment files). Segments store the indexed data and are immutable. Periodically, smaller segments are merged into larger ones. Merging reduces the overall number of segments on each shard, frees up disk space, and improves search performance. Eventually, segments reach a maximum size specified in the merge policy and are no longer merged into larger segments. The merge policy also specifies how often merges are performed. + +## Next steps + +- Learn how to install OpenSearch within minutes in [Installation quickstart]({{site.url}}{{site.baseurl}}/getting-started/quickstart/). 
\ No newline at end of file diff --git a/_about/quickstart.md b/_getting-started/quickstart.md similarity index 56% rename from _about/quickstart.md rename to _getting-started/quickstart.md index 851293f575..5ef783959a 100644 --- a/_about/quickstart.md +++ b/_getting-started/quickstart.md @@ -1,13 +1,13 @@ --- layout: default -title: Quickstart +title: Installation quickstart nav_order: 3 -permalink: /quickstart/ redirect_from: - /opensearch/install/quickstart/ + - /quickstart/ --- -# Quickstart +# Installation quickstart Get started using OpenSearch and OpenSearch Dashboards by deploying your containers with [Docker](https://www.docker.com/). Before proceeding, you need to [get Docker](https://docs.docker.com/get-docker/) and [Docker Compose](https://github.com/docker/compose) installed on your local machine. @@ -18,33 +18,63 @@ The Docker Compose commands used in this guide are written with a hyphen (for ex You'll need a special file, called a Compose file, that Docker Compose uses to define and create the containers in your cluster. The OpenSearch Project provides a sample Compose file that you can use to get started. Learn more about working with Compose files by reviewing the official [Compose specification](https://docs.docker.com/compose/compose-file/). -1. Before running OpenSearch on your machine, you should disable memory paging and swapping performance on the host to improve performance and increase the number of memory maps available to OpenSearch. See [important system settings]({{site.url}}{{site.baseurl}}/opensearch/install/important-settings/) for more information. +1. Before running OpenSearch on your machine, you should disable memory paging and swapping performance on the host to improve performance and increase the number of memory maps available to OpenSearch. + + Disable memory paging and swapping: + ```bash - # Disable memory paging and swapping. sudo swapoff -a + ``` + {% include copy.html %} + + Edit the sysctl config file that defines the host's max map count: - # Edit the sysctl config file that defines the host's max map count. + ```bash sudo vi /etc/sysctl.conf + ``` + {% include copy.html %} - # Set max map count to the recommended value of 262144. + Set max map count to the recommended value of `262144`: + + ```bash vm.max_map_count=262144 + ``` + {% include copy.html %} - # Reload the kernel parameters. + Reload the kernel parameters: + + ``` sudo sysctl -p ``` + {% include copy.html %} + + For more information, see [important system settings]({{site.url}}{{site.baseurl}}/opensearch/install/important-settings/). + 1. Download the sample Compose file to your host. You can download the file with command line utilities like `curl` and `wget`, or you can manually copy [docker-compose.yml](https://github.com/opensearch-project/documentation-website/blob/{{site.opensearch_major_minor_version}}/assets/examples/docker-compose.yml) from the OpenSearch Project documentation-website repository using a web browser. + + To use cURL, send the following request: + ```bash - # Using cURL: curl -O https://raw.githubusercontent.com/opensearch-project/documentation-website/{{site.opensearch_major_minor_version}}/assets/examples/docker-compose.yml + ``` + {% include copy.html %} + + To use wget, send the following request: - # Using wget: + ``` wget https://raw.githubusercontent.com/opensearch-project/documentation-website/{{site.opensearch_major_minor_version}}/assets/examples/docker-compose.yml ``` -1. 
In your terminal application, navigate to the directory containing the `docker-compose.yml` file you just downloaded, and run the following command to create and start the cluster as a background process. + {% include copy.html %} + +1. In your terminal application, navigate to the directory containing the `docker-compose.yml` file you downloaded and run the following command to create and start the cluster as a background process: + ```bash docker-compose up -d ``` + {% include copy.html %} + 1. Confirm that the containers are running with the command `docker-compose ps`. You should see an output like the following: + ```bash $ docker-compose ps NAME COMMAND SERVICE STATUS PORTS @@ -52,11 +82,16 @@ You'll need a special file, called a Compose file, that Docker Compose uses to d opensearch-node1 "./opensearch-docker…" opensearch-node1 running 0.0.0.0:9200->9200/tcp, 9300/tcp, 0.0.0.0:9600->9600/tcp, 9650/tcp opensearch-node2 "./opensearch-docker…" opensearch-node2 running 9200/tcp, 9300/tcp, 9600/tcp, 9650/tcp ``` -1. Query the OpenSearch REST API to verify that the service is running. You should use `-k` (also written as `--insecure`) to disable host name checking because the default security configuration uses demo certificates. Use `-u` to pass the default username and password (`admin:admin`). + +1. Query the OpenSearch REST API to verify that the service is running. You should use `-k` (also written as `--insecure`) to disable hostname checking because the default security configuration uses demo certificates. Use `-u` to pass the default username and password (`admin:`): + ```bash - curl https://localhost:9200 -ku admin:admin + curl https://localhost:9200 -ku admin: ``` - Sample response: + {% include copy.html %} + + The response confirms that the installation was successful: + ```json { "name" : "opensearch-node1", @@ -76,65 +111,7 @@ You'll need a special file, called a Compose file, that Docker Compose uses to d "tagline" : "The OpenSearch Project: https://opensearch.org/" } ``` -1. Explore OpenSearch Dashboards by opening `http://localhost:5601/` in a web browser on the same host that is running your OpenSearch cluster. The default username is `admin` and the default password is `admin`. - -## Create an index and field mappings using sample data - -Create an index and define field mappings using a dataset provided by the OpenSearch Project. The same fictitious e-commerce data is also used for sample visualizations in OpenSearch Dashboards. To learn more, see [Getting started with OpenSearch Dashboards]({{site.url}}{{site.baseurl}}/dashboards/index/). - -1. Download [ecommerce-field_mappings.json](https://github.com/opensearch-project/documentation-website/blob/{{site.opensearch_major_minor_version}}/assets/examples/ecommerce-field_mappings.json). This file defines a [mapping]({{site.url}}{{site.baseurl}}/opensearch/mappings/) for the sample data you will use. - ```bash - # Using cURL: - curl -O https://raw.githubusercontent.com/opensearch-project/documentation-website/{{site.opensearch_major_minor_version}}/assets/examples/ecommerce-field_mappings.json - - # Using wget: - wget https://raw.githubusercontent.com/opensearch-project/documentation-website/{{site.opensearch_major_minor_version}}/assets/examples/ecommerce-field_mappings.json - ``` -1. Download [ecommerce.json](https://github.com/opensearch-project/documentation-website/blob/{{site.opensearch_major_minor_version}}/assets/examples/ecommerce.json). 
This file contains the index data formatted so that it can be ingested by the bulk API. To learn more, see [index data]({{site.url}}{{site.baseurl}}/opensearch/index-data/) and [Bulk]({{site.url}}{{site.baseurl}}/api-reference/document-apis/bulk/). - ```bash - # Using cURL: - curl -O https://raw.githubusercontent.com/opensearch-project/documentation-website/{{site.opensearch_major_minor_version}}/assets/examples/ecommerce.json - - # Using wget: - wget https://raw.githubusercontent.com/opensearch-project/documentation-website/{{site.opensearch_major_minor_version}}/assets/examples/ecommerce.json - ``` -1. Define the field mappings with the mapping file. - ```bash - curl -H "Content-Type: application/x-ndjson" -X PUT "https://localhost:9200/ecommerce" -ku admin:admin --data-binary "@ecommerce-field_mappings.json" - ``` -1. Upload the index to the bulk API. - ```bash - curl -H "Content-Type: application/x-ndjson" -X PUT "https://localhost:9200/ecommerce/_bulk" -ku admin:admin --data-binary "@ecommerce.json" - ``` -1. Query the data using the search API. The following command submits a query that will return documents where `customer_first_name` is `Sonya`. - ```bash - curl -H 'Content-Type: application/json' -X GET "https://localhost:9200/ecommerce/_search?pretty=true" -ku admin:admin -d' {"query":{"match":{"customer_first_name":"Sonya"}}}' - ``` - Queries submitted to the OpenSearch REST API will generally return a flat JSON by default. For a human readable response body, use the query parameter `pretty=true`. For more information about `pretty` and other useful query parameters, see [Common REST parameters]({{site.url}}{{site.baseurl}}/opensearch/common-parameters/). -1. Access OpenSearch Dashboards by opening `http://localhost:5601/` in a web browser on the same host that is running your OpenSearch cluster. The default username is `admin` and the default password is `admin`. -1. On the top menu bar, go to **Management > Dev Tools**. -1. In the left pane of the console, enter the following: - ```json - GET ecommerce/_search - { - "query": { - "match": { - "customer_first_name": "Sonya" - } - } - } - ``` -1. Choose the triangle icon at the top right of the request to submit the query. You can also submit the request by pressing `Ctrl+Enter` (or `Cmd+Enter` for Mac users). To learn more about using the OpenSearch Dashboards console for submitting queries, see [Running queries in the console]({{site.url}}{{site.baseurl}}/dashboards/run-queries/). - -## Next steps - -You successfully deployed your own OpenSearch cluster with OpenSearch Dashboards and added some sample data. Now you're ready to learn about configuration and functionality in more detail. Here are a few recommendations on where to begin: -- [About the Security plugin]({{site.url}}{{site.baseurl}}/security/index/) -- [OpenSearch configuration]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/) -- [OpenSearch plugin installation]({{site.url}}{{site.baseurl}}/opensearch/install/plugins/) -- [Getting started with OpenSearch Dashboards]({{site.url}}{{site.baseurl}}/dashboards/index/) -- [OpenSearch tools]({{site.url}}{{site.baseurl}}/tools/index/) -- [Index APIs]({{site.url}}{{site.baseurl}}/api-reference/index-apis/index/) +1. Explore OpenSearch Dashboards by opening `http://localhost:5601/` in a web browser on the same host that is running your OpenSearch cluster. 
The default username is `admin` and the default password is set in your `docker-compose.yml` file in the `OPENSEARCH_INITIAL_ADMIN_PASSWORD=` setting. ## Common issues @@ -162,4 +139,19 @@ OpenSearch will fail to start if your host's `vm.max_map_count` is too low. Revi opensearch-node1 | ERROR: [1] bootstrap checks failed opensearch-node1 | [1]: max virtual memory areas vm.max_map_count [65530] is too low, increase to at least [262144] opensearch-node1 | ERROR: OpenSearch did not exit normally - check the logs at /usr/share/opensearch/logs/opensearch-cluster.log -``` \ No newline at end of file +``` + +## Other installation types + +In addition to Docker, you can install OpenSearch on various Linux distributions and on Windows. For all available installation guides, see [Install and upgrade OpenSearch]({{site.url}}{{site.baseurl}}/install-and-configure/). + +## Further reading + +You successfully deployed your own OpenSearch cluster with OpenSearch Dashboards and added some sample data. Now you're ready to learn about configuration and functionality in more detail. Here are a few recommendations on where to begin: +- [About the Security plugin]({{site.url}}{{site.baseurl}}/security/index/) +- [OpenSearch configuration]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/) +- [OpenSearch plugin installation]({{site.url}}{{site.baseurl}}/opensearch/install/plugins/) + +## Next steps + +- See [Communicate with OpenSearch]({{site.url}}{{site.baseurl}}/getting-started/communicate/) to learn about how to send requests to OpenSearch. diff --git a/_getting-started/search-data.md b/_getting-started/search-data.md new file mode 100644 index 0000000000..c6970e7e7b --- /dev/null +++ b/_getting-started/search-data.md @@ -0,0 +1,446 @@ +--- +layout: default +title: Search your data +nav_order: 50 +--- + +# Search your data + +In OpenSearch, there are several ways to search data: + +- [Query domain-specific language (DSL)]({{site.url}}{{site.baseurl}}/query-dsl/index/): The primary OpenSearch query language, which you can use to create complex, fully customizable queries. +- [Query string query language]({{site.url}}{{site.baseurl}}/query-dsl/full-text/query-string/): A scaled-down query language that you can use in a query parameter of a search request or in OpenSearch Dashboards. +- [SQL]({{site.url}}{{site.baseurl}}/search-plugins/sql/sql/index/): A traditional query language that bridges the gap between traditional relational database concepts and the flexibility of OpenSearch’s document-oriented data storage. +- [Piped Processing Language (PPL)]({{site.url}}{{site.baseurl}}/search-plugins/sql/ppl/index/): The primary language used for observability in OpenSearch. PPL uses a pipe syntax that chains commands into a query. +- [Dashboards Query Language (DQL)]({{site.url}}{{site.baseurl}}/dashboards/dql/): A simple text-based query language for filtering data in OpenSearch Dashboards. + +## Prepare the data + +For this tutorial, you'll need to index student data if you haven't done so already. 
You can start by deleting the `students` index (`DELETE /students`) and then sending the following bulk request: + +```json +POST _bulk +{ "create": { "_index": "students", "_id": "1" } } +{ "name": "John Doe", "gpa": 3.89, "grad_year": 2022} +{ "create": { "_index": "students", "_id": "2" } } +{ "name": "Jonathan Powers", "gpa": 3.85, "grad_year": 2025 } +{ "create": { "_index": "students", "_id": "3" } } +{ "name": "Jane Doe", "gpa": 3.52, "grad_year": 2024 } +``` +{% include copy-curl.html %} + +## Retrieve all documents in an index + +To retrieve all documents in an index, send the following request: + +```json +GET /students/_search +``` +{% include copy-curl.html %} + +The preceding request is equivalent to the `match_all` query, which matches all documents in an index: + +```json +GET /students/_search +{ + "query": { + "match_all": {} + } +} +``` +{% include copy-curl.html %} + +OpenSearch returns the matching documents: + +```json +{ + "took": 12, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 3, + "relation": "eq" + }, + "max_score": 1, + "hits": [ + { + "_index": "students", + "_id": "1", + "_score": 1, + "_source": { + "name": "John Doe", + "gpa": 3.89, + "grad_year": 2022 + } + }, + { + "_index": "students", + "_id": "2", + "_score": 1, + "_source": { + "name": "Jonathan Powers", + "gpa": 3.85, + "grad_year": 2025 + } + }, + { + "_index": "students", + "_id": "3", + "_score": 1, + "_source": { + "name": "Jane Doe", + "gpa": 3.52, + "grad_year": 2024 + } + } + ] + } +} +``` + +## Response fields + +The preceding response contains the following fields. + + +### took + + +The `took` field contains the amount of time the query took to run, in milliseconds. + + +### timed_out + + +This field indicates whether the request timed out. If a request timed out, then OpenSearch returns the results that were gathered before the timeout. You can set the desired timeout value by providing the `timeout` query parameter: + +```json +GET /students/_search?timeout=20ms +``` +{% include copy-curl.html %} + + +### _shards + + +The `_shards` object specifies the total number of shards on which the query ran as well as the number of shards that succeeded or failed. A shard may fail if the shard itself and all its replicas are unavailable. If any of the involved shards fail, OpenSearch continues to run the query on the remaining shards. + + +### hits + + +The `hits` object contains the total number of matching documents and the documents themselves (listed in the `hits` array). Each matching document contains the `_index` and `_id` fields as well as the `_source` field, which contains the complete originally indexed document. + +Each document is given a relevance score in the `_score` field. Because you ran a `match_all` search, all document scores are set to `1` (there is no difference in their relevance). The `max_score` field contains the highest score of any matching document. + +## Query string queries + +Query string queries are lightweight but powerful. You can send a query string query as a `q` query parameter. 
For example, the following query searches for students with the name `john`: + +```json +GET /students/_search?q=name:john +``` +{% include copy-curl.html %} + +OpenSearch returns the matching document: + +```json +{ + "took": 18, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 0.9808291, + "hits": [ + { + "_index": "students", + "_id": "1", + "_score": 0.9808291, + "_source": { + "name": "John Doe", + "grade": 12, + "gpa": 3.89, + "grad_year": 2022, + "future_plans": "John plans to be a computer science major" + } + } + ] + } +} +``` + +For more information about query string syntax, see [Query string query language]({{site.url}}{{site.baseurl}}/query-dsl/full-text/query-string/). + +## Query DSL + +Using Query DSL, you can create more complex and customized queries. + +### Full-text search + +You can run a full-text search on fields mapped as `text`. By default, text fields are analyzed by the `default` analyzer. The analyzer splits text into terms and changes it to lowercase. For more information about OpenSearch analyzers, see [Analyzers]({{site.url}}{{site.baseurl}}/analyzers/). + +For example, the following query searches for students with the name `john`: + +```json +GET /students/_search +{ + "query": { + "match": { + "name": "john" + } + } +} +``` +{% include copy-curl.html %} + +The response contains the matching document: + +```json +{ + "took": 13, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 0.9808291, + "hits": [ + { + "_index": "students", + "_id": "1", + "_score": 0.9808291, + "_source": { + "name": "John Doe", + "gpa": 3.89, + "grad_year": 2022 + } + } + ] + } +} +``` + +Notice that the query text is lowercase while the text in the field is not, but the query still returns the matching document. + +You can reorder the terms in the search string. For example, the following query searches for `doe john`: + +```json +GET /students/_search +{ + "query": { + "match": { + "name": "doe john" + } + } +} +``` +{% include copy-curl.html %} + +The response contains two matching documents: + +```json +{ + "took": 14, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 2, + "relation": "eq" + }, + "max_score": 1.4508327, + "hits": [ + { + "_index": "students", + "_id": "1", + "_score": 1.4508327, + "_source": { + "name": "John Doe", + "gpa": 3.89, + "grad_year": 2022 + } + }, + { + "_index": "students", + "_id": "3", + "_score": 0.4700036, + "_source": { + "name": "Jane Doe", + "gpa": 3.52, + "grad_year": 2024 + } + } + ] + } +} +``` + +The match query type uses `OR` as an operator by default, so the query is functionally `doe OR john`. Both `John Doe` and `Jane Doe` matched the word `doe`, but `John Doe` is scored higher because it also matched `john`. + +### Keyword search + +The `name` field contains the `name.keyword` subfield, which is added by OpenSearch automatically. If you search the `name.keyword` field in a manner similar to the previous request: + +```json +GET /students/_search +{ + "query": { + "match": { + "name.keyword": "john" + } + } +} +``` +{% include copy-curl.html %} + +Then the request returns no hits because the `keyword` fields must exactly match. 
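+In this case, the `hits` array in the response is empty, similar to the following (exact values, such as `took`, will vary):
+
+```json
+{
+  "took": 4,
+  "timed_out": false,
+  "_shards": {
+    "total": 1,
+    "successful": 1,
+    "skipped": 0,
+    "failed": 0
+  },
+  "hits": {
+    "total": {
+      "value": 0,
+      "relation": "eq"
+    },
+    "max_score": null,
+    "hits": []
+  }
+}
+```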
+ +However, if you search for the exact text `John Doe`: + +```json +GET /students/_search +{ + "query": { + "match": { + "name.keyword": "John Doe" + } + } +} +``` +{% include copy-curl.html %} + +OpenSearch returns the matching document: + +```json +{ + "took": 19, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 0.9808291, + "hits": [ + { + "_index": "students", + "_id": "1", + "_score": 0.9808291, + "_source": { + "name": "John Doe", + "gpa": 3.89, + "grad_year": 2022 + } + } + ] + } +} +``` + +### Filters + +Using a Boolean query, you can add a filter clause to your query for fields with exact values + +Term filters match specific terms. For example, the following Boolean query searches for students whose graduation year is 2022: + +```json +GET students/_search +{ + "query": { + "bool": { + "filter": [ + { "term": { "grad_year": 2022 }} + ] + } + } +} +``` +{% include copy-curl.html %} + +With range filters, you can specify a range of values. For example, the following Boolean query searches for students whose GPA is greater than 3.6: + +```json +GET students/_search +{ + "query": { + "bool": { + "filter": [ + { "range": { "gpa": { "gt": 3.6 }}} + ] + } + } +} +``` +{% include copy-curl.html %} + +For more information about filters, see [Query and filter context]({{site.url}}{{site.baseurl}}/query-dsl/query-filter-context/). + +### Compound queries + +A compound query lets you combine multiple query or filter clauses. A Boolean query is an example of a compound query. + +For example, to search for students whose name matches `doe` and filter by graduation year and GPA, use the following request: + +```json +GET students/_search +{ + "query": { + "bool": { + "must": [ + { + "match": { + "name": "doe" + } + }, + { "range": { "gpa": { "gte": 3.6, "lte": 3.9 } } }, + { "term": { "grad_year": 2022 }} + ] + } + } +} +``` +{% include copy-curl.html %} + +For more information about Boolean and other compound queries, see [Compound queries]({{site.url}}{{site.baseurl}}/query-dsl/compound/index/). + +## Search methods + +Along with the traditional full-text search described in this tutorial, OpenSearch supports a range of machine learning (ML)-powered search methods, including k-NN, semantic, multimodal, sparse, hybrid, and conversational search. For information about all OpenSearch-supported search methods, see [Search]({{site.url}}{{site.baseurl}}/search-plugins/). + +## Next steps + +- For information about available query types, see [Query DSL]({{site.url}}{{site.baseurl}}/query-dsl/index/). +- For information about available search methods, see [Search]({{site.url}}{{site.baseurl}}/search-plugins/). \ No newline at end of file diff --git a/_im-plugin/index-alias.md b/_im-plugin/index-alias.md index 97e10f6569..1816b0e0d2 100644 --- a/_im-plugin/index-alias.md +++ b/_im-plugin/index-alias.md @@ -16,15 +16,6 @@ For example, if you’re storing logs into indexes based on the month and you fr Because you can change the indexes an alias points to at any time, referring to indexes using aliases in your applications allows you to reindex your data without any downtime. ---- - -#### Table of contents -1. 
TOC -{:toc} - - ---- - ## Create aliases To create an alias, use a POST request: @@ -32,6 +23,7 @@ To create an alias, use a POST request: ```json POST _aliases ``` +{% include copy-curl.html %} Use the `actions` method to specify the list of actions that you want to perform. This command creates an alias named `alias1` and adds `index-1` to this alias: @@ -48,16 +40,18 @@ POST _aliases ] } ``` +{% include copy-curl.html %} -You should see the following response: +The following response is returned: ```json { "acknowledged": true } ``` +{% include copy-curl.html %} -If this request fails, make sure the index that you're adding to the alias already exists. +If the request fails, make sure the index that you're adding to the alias already exists. You can also create an alias using one of the following requests: @@ -67,6 +61,7 @@ POST /_aliases/ PUT /_alias/ POST /_alias/ ``` +{% include copy-curl.html %} The `` in the above requests can be an index name, a comma-separated list of index names, or a wildcard expression. Use `_all` to refer to all indexes. @@ -76,17 +71,18 @@ To check if `alias1` refers to `index-1`, run one of the following commands: GET /_alias/alias1 GET /index-1/_alias/alias1 ``` +{% include copy-curl.html %} -To get the mappings and settings information of the indexes that the alias references, run the following command: +To get the indexes' mappings and settings information referenced by the alias, run the following command: ```json GET alias1 ``` +{% include copy-curl.html %} ## Add or remove indexes -You can perform multiple actions in the same `_aliases` operation. -For example, the following command removes `index-1` and adds `index-2` to `alias1`: +You can perform multiple actions using the same `_aliases` operation. For example, the following command removes `index-1` and adds `index-2` to `alias1`: ```json POST _aliases @@ -107,10 +103,9 @@ POST _aliases ] } ``` +{% include copy-curl.html %} -The `add` and `remove` actions occur atomically, which means that at no point will `alias1` point to both `index-1` and `index-2`. - -You can also add indexes based on an index pattern: +The `add` and `remove` actions occur atomically, which means that at no point will `alias1` point to both `index-1` and `index-2`. You can also add indexes based on an index pattern, as shown in the following POST request: ```json POST _aliases @@ -125,6 +120,27 @@ POST _aliases ] } ``` +{% include copy-curl.html %} + +The `remove` action also supports the `must_exist` parameter. If the parameter is set to `true` and the specified alias does not exist, an exception is thrown. If the parameter is set to `false`, then no action is taken if the specified alias does not exist. The default value for `must_exist` is `null`. An exception will be thrown only if none of the specified aliases exist. 
+ +The following POST request uses the `remove` action with the `must_exist` parameter set to `true`: + +```json +POST _aliases +{ + "actions": [ + { + "remove": { + "index": "index-1", + "alias": "alias1", + "must_exist": true + } + } + ] +} +``` +{% include copy-curl.html %} ## Manage aliases @@ -133,6 +149,7 @@ To list the mapping of aliases to indexes, run the following command: ```json GET _cat/aliases?v ``` +{% include copy-curl.html %} #### Example response @@ -140,12 +157,14 @@ GET _cat/aliases?v alias index filter routing.index routing.search alias1 index-1 * - - ``` +{% include copy-curl.html %} To check which indexes an alias points to, run the following command: ```json GET _alias/alias1 ``` +{% include copy-curl.html %} #### Example response @@ -158,18 +177,21 @@ GET _alias/alias1 } } ``` +{% include copy-curl.html %} Conversely, to find which alias points to a specific index, run the following command: ```json GET /index-2/_alias/* ``` +{% include copy-curl.html %} To get all index names and their aliases, run the following command: ```json GET /_alias ``` +{% include copy-curl.html %} To check if an alias exists, run one of the following commands: @@ -178,10 +200,11 @@ HEAD /alias1/_alias/ HEAD /_alias/alias1/ HEAD index-1/_alias/alias1/ ``` +{% include copy-curl.html %} ## Add aliases at index creation -You can add an index to an alias as you create the index: +You can add an index to an alias as you create the index, as shown in the following PUT request: ```json PUT index-1 @@ -191,12 +214,11 @@ PUT index-1 } } ``` +{% include copy-curl.html %} ## Create filtered aliases -You can create a filtered alias to access a subset of documents or fields from the underlying indexes. - -This command adds only a specific timestamp field to `alias1`: +You can create a filtered alias to access a subset of documents or fields in the underlying indexes. This command adds only a specific timestamp field to `alias1`. The following shows an example POST request: ```json POST _aliases @@ -216,6 +238,7 @@ POST _aliases ] } ``` +{% include copy-curl.html %} ## Index alias options @@ -229,7 +252,6 @@ Option | Valid values | Description | Required `routing` | String | Limit search to an associated shard value. You can specify `search_routing` and `index_routing` independently. | No `is_write_index` | String | Specify the index that accepts any write operations to the alias. If this value is not specified, then no write operations are allowed. | No - ## Delete aliases To delete one or more aliases from an index, use the following request: @@ -238,6 +260,7 @@ To delete one or more aliases from an index, use the following request: DELETE /_alias/ DELETE /_aliases/ ``` +{% include copy-curl.html %} Both `` and `` in the above request support comma-separated lists and wildcard expressions. Use `_all` in place of `` to delete all aliases for the indexes listed in ``. @@ -246,5 +269,6 @@ For example, if `alias1` refers to `index-1` and `index-2`, you can run the foll ```json DELETE index-1/_alias/alias1 ``` +{% include copy-curl.html %} -After you run the request above, `alias1` no longer refers to `index-1`, but still refers to `index-2`. \ No newline at end of file +After running the request, `alias1` no longer refers to `index-1` but still refers to `index-2`. 
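For example, to confirm which indexes the alias still references after the deletion, you can rerun the alias lookup shown earlier; the response should now list only `index-2`:

```json
GET _alias/alias1
```
{% include copy-curl.html %}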
diff --git a/_im-plugin/ism/policies.md b/_im-plugin/ism/policies.md index 2b9ec15adb..e6262e883b 100644 --- a/_im-plugin/ism/policies.md +++ b/_im-plugin/ism/policies.md @@ -119,8 +119,8 @@ Reduces the number of Lucene segments by merging the segments of individual shar Parameter | Description | Type | Required :--- | :--- |:--- |:--- | `max_num_segments` | The number of segments to reduce the shard to. | `number` | Yes -wait_for_completion | Boolean | When set to `false`, the request returns immediately instead of after the operation is finished. To monitor the operation status, use the [Tasks API]({{site.url}}{{site.baseurl}}/api-reference/tasks/) with the task ID returned by the request. Default is `true`. -task_execution_timeout | Time | The explicit task execution timeout. Only useful when wait_for_completion is set to `false`. Default is `1h`. | No +`wait_for_completion` | Boolean | When set to `false`, the request returns immediately instead of after the operation is finished. To monitor the operation status, use the [Tasks API]({{site.url}}{{site.baseurl}}/api-reference/tasks/) with the task ID returned by the request. Default is `true`. +`task_execution_timeout` | Time | The explicit task execution timeout. Only useful when `wait_for_completion` is set to `false`. Default is `1h`. | No ```json { @@ -189,18 +189,20 @@ Allows you to reduce the number of primary shards in your indexes. With this act "my-alias": {} } ], + "switch_aliases": true, "force_unsafe": false } ``` Parameter | Description | Type | Example | Required :--- | :--- |:--- |:--- | -`num_new_shards` | The maximum number of primary shards in the shrunken index. | integer | `5` | Yes, however it cannot be used with `max_shard_size` or `percentage_of_source_shards` -`max_shard_size` | The maximum size in bytes of a shard for the target index. | keyword | `5gb` | Yes, however it cannot be used with `num_new_shards` or `percentage_of_source_shards` +`num_new_shards` | The maximum number of primary shards in the shrunken index. | Integer | `5` | Yes. It, however, cannot be used with `max_shard_size` or `percentage_of_source_shards`. +`max_shard_size` | The maximum size in bytes of a shard for the target index. | Keyword | `5gb` | Yes, however, it cannot be used with `num_new_shards` or `percentage_of_source_shards`. `percentage_of_source_shards` | Percentage of the number of original primary shards to shrink. This parameter indicates the minimum percentage to use when shrinking the number of primary shards. Must be between 0.0 and 1.0, exclusive. | Percentage | `0.5` | Yes, however it cannot be used with `max_shard_size` or `num_new_shards` -`target_index_name_template` | The name of the shrunken index. Accepts strings and the Mustache variables `{{ctx.index}}` and `{{ctx.indexUuid}}`. | `string` or Mustache template | `{"source": "{{ctx.index}}_shrunken"}` | No -`aliases` | Aliases to add to the new index. | object | `myalias` | No, but must be an array of alias objects -`force_unsafe` | If true, executes the shrink action even if there are no replicas. | boolean | `false` | No +`target_index_name_template` | The name of the shrunken index. Accepts strings and the Mustache variables `{{ctx.index}}` and `{{ctx.indexUuid}}`. | String or Mustache template | `{"source": "{{ctx.index}}_shrunken"}` | No +`aliases` | Aliases to add to the new index. | Object | `myalias` | No. It must be an array of alias objects. +`switch_aliases` | If `true`, copies the aliases from the source index to the target index. 
If there is a name conflict with an alias from the `aliases` field, the alias in the `aliases` field is used instead of the name. | Boolean | `true` | No. The default implicit value is `false`, which means no aliases are copied by default. +`force_unsafe` | If `true`, shrinks the index even if it has no replicas. | Boolean | `false` | No If you want to add `aliases` to the action, the parameter must include an array of [alias objects]({{site.url}}{{site.baseurl}}/api-reference/alias/). For example, diff --git a/_im-plugin/reindex-data.md b/_im-plugin/reindex-data.md index 2e3288087a..a766589b84 100644 --- a/_im-plugin/reindex-data.md +++ b/_im-plugin/reindex-data.md @@ -91,6 +91,12 @@ Options | Valid values | Description | Required `socket_timeout` | Time Unit | The wait time for socket reads (default 30s). | No `connect_timeout` | Time Unit | The wait time for remote connection timeouts (default 30s). | No +The following table lists the retry policy cluster settings. + +Setting | Description | Default value +:--- | :--- +`reindex.remote.retry.initial_backoff` | The initial backoff time for retries. Subsequent retries will follow exponential backoff based on the initial backoff time. | 500 ms +`reindex.remote.retry.max_count` | The maximum number of retry attempts. | 15 ## Reindex a subset of documents diff --git a/_includes/banner.html b/_includes/banner.html index dadf93f578..95869b6cbe 100644 --- a/_includes/banner.html +++ b/_includes/banner.html @@ -5,7 +5,7 @@
- +
\ No newline at end of file diff --git a/_includes/cards.html b/_includes/cards.html index f5f2c960e8..6d958e61a5 100644 --- a/_includes/cards.html +++ b/_includes/cards.html @@ -4,31 +4,31 @@

OpenSearch and OpenSearch Dashboards
- Learn how to power up your search
+ Build your OpenSearch solution using core tooling and visualizations
Data Prepper
- Prepare your data for OpenSearch
+ Filter, mutate, and sample your data for ingestion into OpenSearch
Clients
- OpenSearch in your programming language
+ Interact with OpenSearch from your application using language APIs
- Benchmark
- Track OpenSearch performance
+ OpenSearch Benchmark
+ Measure performance metrics for your OpenSearch cluster
diff --git a/_ingest-pipelines/index.md b/_ingest-pipelines/index.md index 8d1ed8d7bb..f0b52ea152 100644 --- a/_ingest-pipelines/index.md +++ b/_ingest-pipelines/index.md @@ -16,7 +16,15 @@ An _ingest pipeline_ is a sequence of _processors_ that are applied to documents Processors are customizable tasks that run in a sequential order as they appear in the request body. This order is important, as each processor depends on the output of the previous processor. The modified documents appear in your index after the processors are applied. -Ingest pipelines can only be managed using [ingest API operations]({{site.url}}{{site.baseurl}}/api-reference/ingest-apis/index/). +## OpenSearch ingest pipelines compared to Data Prepper + +OpenSeach ingest pipelines run within the OpenSearch cluster, whereas [Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/) is an external component that runs on the OpenSearch cluster. + +OpenSearch ingest pipelines perform actions on indexes and are preferred for use cases involving pre-processing simple datasets, [machine learning (ML) processors]({{site.url}}{{site.baseurl}}/ingest-pipelines/processors/sparse-encoding/), and [vector embedding processors]({{site.url}}{{site.baseurl}}/ingest-pipelines/processors/text-image-embedding/). OpenSearch ingest pipelines are recommended for simple data pre-processing and small datasets. + +Data Prepper is recommended for any data processing tasks it supports, particularly when dealing with large datasets and complex data pre-processing requirements. It streamlines the process of transferring and fetching large datasets while providing robust capabilities for intricate data preparation and transformation operations. Refer to the [Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/) documentation for more information. + +OpenSearch ingest pipelines can only be managed using [Ingest API operations]({{site.url}}{{site.baseurl}}/api-reference/ingest-apis/index/). {: .note} ## Prerequisites diff --git a/_ingest-pipelines/processors/append.md b/_ingest-pipelines/processors/append.md index f52e23db5c..8101cf97c9 100644 --- a/_ingest-pipelines/processors/append.md +++ b/_ingest-pipelines/processors/append.md @@ -6,10 +6,14 @@ nav_order: 10 redirect_from: - /api-reference/ingest-apis/processors/append/ --- - + +This documentation describes using the `append` processor in OpenSearch ingest pipelines. Consider using the [Data Prepper `add_entries` processor]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/add-entries/), which runs on the OpenSearch cluster, if your use case involves large or complex datasets. +{: .note} + # Append processor The `append` processor is used to add values to a field: + - If the field is an array, the `append` processor appends the specified values to that array. - If the field is a scalar field, the `append` processor converts it to an array and appends the specified values to that array. - If the field does not exist, the `append` processor creates an array with the specified values. diff --git a/_ingest-pipelines/processors/community_id.md b/_ingest-pipelines/processors/community_id.md new file mode 100644 index 0000000000..c6f74d8af7 --- /dev/null +++ b/_ingest-pipelines/processors/community_id.md @@ -0,0 +1,178 @@ +--- +layout: default +title: Community ID +parent: Ingest processors +nav_order: 55 +--- + +# Community ID processor + +The `community_id` processor is used to generate the community ID flow hash for network flow tuples. 
The community ID flow hash algorithm is defined in the [community ID specification](https://github.com/corelight/community-id-spec). The processor-generated hash value can be used to correlate all related network events so that you can filter the network flow data by the hash value or generate statistics by aggregating on the hash field. The processor supports the TCP, UDP, SCTP, ICMP, and IPv6-ICMP network protocols. The SHA-1 hash algorithm is used to generate the hash value. + +The following is the `community_id` processor syntax: + +```json +{ + "community_id": { + "source_ip_field": "source_ip", + "source_port_field": "source_port", + "destination_ip_field": "destination_ip", + "destination_port_field": "destination_port", + "iana_protocol_number_field": "iana_protocol_number", + "source_port_field": "source_port", + "target_field": "community_id" + } +} +``` +{% include copy-curl.html %} + +## Configuration parameters + +The following table lists the required and optional parameters for the `community_id` processor. + +Parameter | Required/Optional | Description | +|-----------|-----------|-----------| +`source_ip_field` | Required | The name of the field containing the source IP address. | +`source_port_field` | Optional | The name of the field containing the source port address. If the network protocol is TCP, UDP, or SCTP, then the field is required. Otherwise, it is not required.| +`destination_ip_field` | Required | The name of the field containing the destination IP address. | +`destination_port_field` | Optional | The name of the field containing the destination port address. If the network protocol is TCP, UDP, or SCTP, then the field is required. Otherwise, it is not required. | +`iana_protocol_number` | Optional | The name of the field containing the protocol number defined by the Internet Assigned Numbers Authority (IANA). The supported values are 1 (ICMP), 6 (TCP), 17 (UDP), 58 (IPv6-ICMP), and 132 (SCTP). | +`protocol_field` | Optional | The name of the field containing the protocol name. If `iana_protocol_number` is not set, then the field is required. Otherwise, it is not required. | +`icmp_type_field` | Optional | The name of the field containing the ICMP message type. Required when the protocol is ICMP or IPv6-ICMP. | +`icmp_code_field` | Optional | The name of the field containing the ICMP message code. For certain ICMP message types that do not have a code, the field is optional. Otherwise, it is required. | +`seed` | Optional | The seed for generating the community ID hash. The value must be between 0 and 65535. | +`target_field` | Optional | The name of the field in which to store the community ID hash value. Default target field is `community_id`. | +`ignore_missing` | Optional | Specifies whether the processor should exit quietly if one of the required fields is missing. Default is `false`. | +`description` | Optional | A brief description of the processor. | +`if` | Optional | A condition for running the processor. | +`ignore_failure` | Optional | If set to `true`, then failures are ignored. Default is `false`. | +`on_failure` | Optional | A list of processors to run if the processor fails. | +`tag` | Optional | An identifier tag for the processor. Useful for debugging in order to distinguish between processors of the same type. | + +## Using the processor + +Follow these steps to use the processor in a pipeline. 
+ +**Step 1: Create a pipeline** + +The following query creates a pipeline named `community_id_pipeline` that uses the `community_id` processor to generate a hash value for the network flow tuple: + +```json +PUT /_ingest/pipeline/commnity_id_pipeline +{ + "description": "generate hash value for the network flow tuple", + "processors": [ + { + "community_id": { + "source_ip_field": "source_ip", + "source_port_field": "source_port", + "destination_ip_field": "destination_ip", + "destination_port_field": "destination_port", + "iana_protocol_number_field": "iana_protocol_number", + "target_field": "community_id" + } + } + ] +} +``` +{% include copy-curl.html %} + +**Step 2 (Optional): Test the pipeline** + +It is recommended that you test your pipeline before ingesting documents. +{: .tip} + +To test the pipeline, run the following query: + +```json +POST _ingest/pipeline/commnity_id_pipeline/_simulate +{ + "docs": [ + { + "_index": "testindex1", + "_id": "1", + "_source": { + "source_ip": "66.35.250.204", + "source_port": 80, + "destination_ip": "128.232.110.120", + "destination_port": 34855, + "iana_protocol_number": 6 + } + } + ] +} +``` +{% include copy-curl.html %} + +#### Response + +The following example response confirms that the pipeline is working as expected: + +```json +{ + "docs": [ + { + "doc": { + "_index": "testindex1", + "_id": "1", + "_source": { + "community_id": "1:LQU9qZlK+B5F3KDmev6m5PMibrg=", + "destination_ip": "128.232.110.120", + "destination_port": 34855, + "source_port": 80, + "iana_protocol_number": 6, + "source_ip": "66.35.250.204" + }, + "_ingest": { + "timestamp": "2024-03-11T02:17:22.329823Z" + } + } + } + ] +} +``` + +**Step 3: Ingest a document** + +The following query ingests a document into an index named `testindex1`: + +```json +PUT testindex1/_doc/1?pipeline=commnity_id_pipeline +{ + "source_ip": "66.35.250.204", + "source_port": 80, + "destination_ip": "128.232.110.120", + "destination_port": 34855, + "iana_protocol_number": 6 +} +``` +{% include copy-curl.html %} + +#### Response + +The request indexes the document into the `testindex1` index: + +```json +{ + "_index": "testindex1", + "_id": "1", + "_version": 1, + "result": "created", + "_shards": { + "total": 2, + "successful": 1, + "failed": 0 + }, + "_seq_no": 0, + "_primary_term": 1 +} +``` + +**Step 4 (Optional): Retrieve the document** + +To retrieve the document, run the following query: + +```json +GET testindex1/_doc/1 +``` +{% include copy-curl.html %} diff --git a/_ingest-pipelines/processors/convert.md b/_ingest-pipelines/processors/convert.md index a28cb3137f..f26cb145e6 100644 --- a/_ingest-pipelines/processors/convert.md +++ b/_ingest-pipelines/processors/convert.md @@ -7,6 +7,9 @@ redirect_from: - /api-reference/ingest-apis/processors/convert/ --- +This documentation describes using the `convert` processor in OpenSearch ingest pipelines. Consider using the [Data Prepper `convert_entry_type` processor]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/convert_entry_type/), which runs on the OpenSearch cluster, if your use case involves large or complex datasets. +{: .note} + # Convert processor The `convert` processor converts a field in a document to a different type, for example, a string to an integer or an integer to a string. For an array field, all values in the array are converted. 
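As a minimal illustration (the `response_code` field name is an assumption, not part of the original example), a `convert` processor that casts a string field to an integer can be defined as follows:

```json
{
  "convert": {
    "field": "response_code",
    "type": "integer"
  }
}
```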
@@ -32,7 +35,7 @@ The following table lists the required and optional parameters for the `convert` Parameter | Required/Optional | Description | |-----------|-----------|-----------| `field` | Required | The name of the field containing the data to be converted. Supports [template snippets]({{site.url}}{{site.baseurl}}/ingest-pipelines/create-ingest/#template-snippets). | -`type` | Required | The type to convert the field value to. The supported types are `integer`, `long`, `float`, `double`, `string`, `boolean`, `ip`, and `auto`. If the `type` is `boolean`, the value is set to `true` if the field value is a string `true` (ignoring case) and to `false` if the field value is a string `false` (ignoring case). If the value is not one of the allowed values, an error will occur. | +`type` | Required | The type to convert the field value to. The supported types are `integer`, `long`, `float`, `double`, `string`, `boolean`, and `auto`. If the `type` is `boolean`, the value is set to `true` if the field value is a string `true` (ignoring case) and to `false` if the field value is a string `false` (ignoring case). If the value is not one of the allowed values, an error will occur. | `description` | Optional | A brief description of the processor. | `if` | Optional | A condition for running the processor. | `ignore_failure` | Optional | Specifies whether the processor continues execution even if it encounters errors. If set to `true`, failures are ignored. Default is `false`. | diff --git a/_ingest-pipelines/processors/copy.md b/_ingest-pipelines/processors/copy.md new file mode 100644 index 0000000000..03ee2279a5 --- /dev/null +++ b/_ingest-pipelines/processors/copy.md @@ -0,0 +1,154 @@ +--- +layout: default +title: Copy +parent: Ingest processors +nav_order: 35 +redirect_from: + - /api-reference/ingest-apis/processors/copy/ +--- + +This documentation describes using the `copy` processor in OpenSearch ingest pipelines. Consider using the [Data Prepper `copy_values` processor]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/copy-values/), which runs on the OpenSearch cluster, if your use case involves large or complex datasets. +{: .note} + +# Copy processor + +The `copy` processor copies an entire object in an existing field to another field. + +## Syntax + +The following is the syntax for the `copy` processor: + +```json +{ + "copy": { + "source_field": "source_field", + "target_field": "target_field", + "ignore_missing": true, + "override_target": true, + "remove_source": true + } +} +``` +{% include copy-curl.html %} + +## Configuration parameters + +The following table lists the required and optional parameters for the `copy` processor. + +| Parameter | Required/Optional | Description | +|---|---|---| +`source_field` | Required | The name of the field to be copied. Supports [template snippets]({{site.url}}{{site.baseurl}}/ingest-pipelines/create-ingest/#template-snippets). | +`target_field` | Required | The name of the field to be copied to. Supports [template snippets]({{site.url}}{{site.baseurl}}/ingest-pipelines/create-ingest/#template-snippets). | +`ignore_missing` | Optional | Specifies whether the processor should ignore documents that do not contain the specified `source_field`. If set to `true`, the processor does not modify the document if the `source_field` does not exist or is `null`. Default is `false`. | +`override_target` | Optional | Specifies whether the processor should override the `target_field` if it already exists in the document. 
If set to `true`, the processor overrides the value of `target_field` if it already exists. Default is `false`. | +`remove_source` | Optional | Specifies whether the processor should remove the `source_field` after it has been copied. If set to `true`, the processor removes the `source_field` from the document. Default is `false`. | +`description` | Optional | A brief description of the processor. | +`if` | Optional | A condition for running the processor. | +`ignore_failure` | Optional | Specifies whether the processor continues execution even if it encounters an error. If set to `true`, the failure is ignored. Default is `false`. | +`on_failure` | Optional | A list of processors to run if the processor fails. | +`tag` | Optional | An identifier tag for the processor. Useful for debugging in order to distinguish between processors of the same type. | + +## Using the processor + +Follow these steps to use the processor in a pipeline. + +**Step 1: Create a pipeline** + +The following query creates a pipeline named `copy_object` that copies a nested object from one field to the root level: + +```json +PUT /_ingest/pipeline/copy_object +{ + "description": "Pipeline that copies object.", + "processors": [ + { + "copy": { + "source_field": "message.content", + "target_field":"content", + "ignore_missing": true, + "override_target": true, + "remove_source": true + } + } + ] +} +``` +{% include copy-curl.html %} + +**Step 2 (Optional): Test the pipeline** + +It is recommended that you test your pipeline before you ingest documents. +{: .tip} + +To test the pipeline, run the following query: + +```json +POST _ingest/pipeline/copy_object/_simulate +{ + "docs": [ + { + "_index": "testindex1", + "_id": "1", + "_source":{ + "message": { + "content": { + "foo": "bar", + "zoo": [1, 2, 3] + } + } + } + } + ] +} +``` +{% include copy-curl.html %} + +**Response** + +The following example response confirms that the pipeline is working as expected: + +```json +{ + "docs": [ + { + "doc": { + "_index": "testindex1", + "_id": "1", + "_source": { + "content": { + "foo": "bar", + "zoo": [1, 2, 3] + } + }, + "_ingest": { + "timestamp": "2023-08-24T18:02:13.218986756Z" + } + } + } + ] +} +``` + +**Step 3: Ingest a document** + +The following query ingests a document into an index named `testindex1`: + +```json +PUT testindex1/_doc/1?pipeline=copy_object +{ + "content": { + "foo": "bar", + "zoo": [1, 2, 3] + } +} +``` +{% include copy-curl.html %} + +**Step 4 (Optional): Retrieve the document** + +To retrieve the document, run the following query: + +```json +GET testindex1/_doc/1 +``` +{% include copy-curl.html %} diff --git a/_ingest-pipelines/processors/csv.md b/_ingest-pipelines/processors/csv.md index baf28a9c8e..1d64fb0159 100644 --- a/_ingest-pipelines/processors/csv.md +++ b/_ingest-pipelines/processors/csv.md @@ -7,6 +7,9 @@ redirect_from: - /api-reference/ingest-apis/processors/csv/ --- +This documentation describes using the `csv` processor in OpenSearch ingest pipelines. Consider using the [Data Prepper `csv` processor]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/csv/), which runs on the OpenSearch cluster, if your use case involves large or complex datasets. +{: .note} + # CSV processor The `csv` processor is used to parse CSVs and store them as individual fields in a document. The processor ignores empty fields. 
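As an illustrative sketch (the source field and column names are assumptions), a `csv` processor that splits a comma-separated field into two target fields can be configured as follows:

```json
{
  "csv": {
    "field": "resource_usage",
    "target_fields": ["cpu_usage", "memory_usage"],
    "separator": ","
  }
}
```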
diff --git a/_ingest-pipelines/processors/date.md b/_ingest-pipelines/processors/date.md index 1ebb8a1a59..a601cacbed 100644 --- a/_ingest-pipelines/processors/date.md +++ b/_ingest-pipelines/processors/date.md @@ -7,11 +7,14 @@ redirect_from: - /api-reference/ingest-apis/processors/date/ --- +This documentation describes using the `date` processor in OpenSearch ingest pipelines. Consider using the [Data Prepper `date` processor]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/date/), which runs on the OpenSearch cluster, if your use case involves large or complex datasets. +{: .note} + # Date processor The `date` processor is used to parse dates from document fields and to add the parsed data to a new field. By default, the parsed data is stored in the `@timestamp` field. -## Syntax +## Syntax example The following is the syntax for the `date` processor: diff --git a/_ingest-pipelines/processors/dissect.md b/_ingest-pipelines/processors/dissect.md new file mode 100644 index 0000000000..4a42b92423 --- /dev/null +++ b/_ingest-pipelines/processors/dissect.md @@ -0,0 +1,529 @@ +--- +layout: default +title: Dissect +parent: Ingest processors +nav_order: 60 +--- + +This documentation describes using the `dissect` processor in OpenSearch ingest pipelines. Consider using the [Data Prepper `dissect` processor]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/dissect/), which runs on the OpenSearch cluster, if your use case involves large or complex datasets. +{: .note} + +# Dissect + +The `dissect` processor extracts values from a document text field and maps them to individual fields based on dissect patterns. The processor is well suited for field extractions from log messages with a known structure. Unlike the `grok` processor, `dissect` does not use regular expressions and has a simpler syntax. + +## Syntax + +The following is the syntax for the `dissect` processor: + +```json +{ + "dissect": { + "field": "source_field", + "pattern": "%{dissect_pattern}" + } +} +``` +{% include copy-curl.html %} + + +## Configuration parameters + +The following table lists the required and optional parameters for the `dissect` processor. + +Parameter | Required/Optional | Description | +|-----------|-----------|-----------| +`field` | Required | The name of the field containing the data to be dissected. | +`pattern` | Required | The dissect pattern used to extract data from the specified field. | +`append_separator` | Optional | The separator character or string that separates appended fields. Default is `""` (empty string). +`description` | Optional | A brief description of the processor. | +`if` | Optional | A condition for running the processor. | +`ignore_failure` | Optional | Specifies whether the processor continues execution even if it encounters an error. If set to `true`, the processor failure is ignored. Default is `false`. | +`ignore_missing` | Optional | Specifies whether the processor should ignore documents that do not contain the specified field. If set to `true`, the processor does not modify the document if the field does not exist or is `null`. Default is `false`. | +`on_failure` | Optional | A list of processors to run if the processor fails. | +`tag` | Optional | An identifier tag for the processor. Useful for debugging to distinguish between processors of the same type. | + +## Using the processor + +Follow these steps to use the processor in a pipeline. 
+ +**Step 1: Create a pipeline** + +The following query creates a pipeline, named `dissect-test`, that uses the `dissect` processor to parse the log line: + +```json +PUT /_ingest/pipeline/dissect-test +{ + "description": "Pipeline that dissects web server logs", + "processors": [ + { + "dissect": { + "field": "message", + "pattern": "%{client_ip} - - [%{timestamp}] \"%{http_method} %{url} %{http_version}\" %{response_code} %{response_size}" + } + } + ] +} +``` +{% include copy-curl.html %} + +**Step 2 (Optional): Test the pipeline** + +It is recommended that you test your pipeline before you ingest documents. +{: .tip} + +To test the pipeline, run the following query: + +```json +POST _ingest/pipeline/dissect-test/_simulate +{ + "docs": [ + { + "_index": "testindex1", + "_id": "1", + "_source": { + "message": "192.168.1.10 - - [03/Nov/2023:15:20:45 +0000] \"POST /login HTTP/1.1\" 200 3456" + } + } + ] +} +``` +{% include copy-curl.html %} + +**Response** + +The following example response confirms that the pipeline is working as expected: + +```json +{ + "docs": [ + { + "doc": { + "_index": "testindex1", + "_id": "1", + "_source": { + "response_code": "200", + "http_method": "POST", + "http_version": "HTTP/1.1", + "client_ip": "192.168.1.10", + "message": """192.168.1.10 - - [03/Nov/2023:15:20:45 +0000] "POST /login HTTP/1.1" 200 3456""", + "url": "/login", + "response_size": "3456", + "timestamp": "03/Nov/2023:15:20:45 +0000" + }, + "_ingest": { + "timestamp": "2023-11-03T22:28:32.830244044Z" + } + } + } + ] +} +``` + +**Step 3: Ingest a document** + +The following query ingests a document into an index named `testindex1`: + +```json +PUT testindex1/_doc/1?pipeline=dissect-test +{ + "message": "192.168.1.10 - - [03/Nov/2023:15:20:45 +0000] \"POST /login HTTP/1.1\" 200 3456" +} +``` +{% include copy-curl.html %} + +**Step 4 (Optional): Retrieve the document** + +To retrieve the document, run the following query: + +```json +GET testindex1/_doc/1 +``` +{% include copy-curl.html %} + +## Dissect patterns + +A dissect pattern is a method of telling the `dissect` processor how to parse a string into a structured format. The pattern is defined by the parts of the string that you want to discard. For example, the `%{client_ip} - - [%{timestamp}]` dissect pattern parses the string `"192.168.1.10 - - [03/Nov/2023:15:20:45 +0000] \"POST /login HTTP/1.1\" 200 3456"` into the following fields: + +```json +client_ip: "192.168.1.1" +@timestamp: "03/Nov/2023:15:20:45 +0000" +``` + +A dissect pattern works by matching a string against a set of rules. For example, the first rule discards a single space. The `dissect` processor will find this space and then assign the value of `client_ip` to all the characters before that space. The next rule matches the `[` and `]` characters and then assigns the value of `@timestamp` to everything in between. + +### Building successful dissect patterns + +When building a dissect pattern, it is important to pay attention to the parts of the string that you want to discard. If you discard too much of the string, then the `dissect` processor may not be able to successfully parse the remaining data. Conversely, if you do not discard enough of the string, then the processor may create unnecessary fields. + +If any `%{keyname}` defined in the pattern does not have a value, then an exception is thrown. You can handle this exception by providing error handling steps in the `on_failure` parameter. 
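For example, the following sketch (the `error_message` field name and value are assumptions) uses `on_failure` with a `set` processor to record the failure instead of rejecting the document:

```json
{
  "dissect": {
    "field": "message",
    "pattern": "%{client_ip} - - [%{timestamp}]",
    "on_failure": [
      {
        "set": {
          "field": "error_message",
          "value": "dissect pattern did not match"
        }
      }
    ]
  }
}
```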
+ +### Empty and named skip keys + +An empty key `%{}` or a [named skip key](#named-skip-key-modifier) can be used to match values but exclude the value from the final document. This can be useful if you want to parse a string but do not need to store all of its parts. + +### Converting matched values to a non-string data type + +By default, all matched values are represented as string data types. If you need to convert a value to a different data type, you can use the [`convert` processor]({{site.url}}{{site.baseurl}}/ingest-pipelines/processors/convert/). + +### Key modifiers + +The `dissect` processor supports key modifiers that can change the default processor behavior. These modifiers are always placed to the left or right of the `%{keyname}` and are always enclosed within `%{}`. For example, the `%{+keyname->}` modifier includes the append and right padding modifiers. Key modifiers are useful for combining multiple fields into a single line of output, creating formatted lists of data items, or aggregating values from multiple sources. + +The following table lists the primary modifiers for the `dissect` processor. + +Modifier | Name | Position | Example | Description | +|-----------|-----------|-----------| +`->` | Skip right padding | (far) right | `%{keyname->}` | Tells the `dissect` processor to skip over any repeated characters to the right. For example, `%{timestamp->}` could be used to tell the processor to skip any padding characters, such as two consecutive spaces or any varying character padding, that follow `timestamp`. | +`+` | Append | left | `%{keyname} %{+keyname}` | Appends two or more fields. | +`+` with `/n` | Append with order | left and right | `%{+keyname}/2 %{+keyname/1}` | Appends two or more fields in the specified order. | +`?` | Named skip key | left | `%{?skipme}` | Skips the matched value in the output. Same behavior as `%{}`. | +`*` and `&` | Reference keys | left | `%{*r1} %{&r1}` | Sets the output key as the value of `*` and the output value of `&`. | + +Detailed descriptions of each key modifier, along with usage examples, are provided in the following sections. + +### Right padding modifier (`->`) + +The dissection algorithm is precise and requires that every character in the pattern exactly match the source string. For example, the pattern `%{hellokey} %{worldkey}` (one space) will match the string "Hello world" (one space) but not the string "Hello world" (two spaces) because the pattern only has one space while the source string has two. + +The _right padding modifier_ can be used to address this issue. When added to the pattern `%{helloworldkey->} %{worldkey}`, the right padding modifier will match Hello world (1 space), Hello  world (2 spaces), and even Hello          world (10 spaces). + +The right padding modifier is used to allow for the repetition of characters following a `%{keyname->}`. The right padding modifier can be applied to any key along with any other modifiers. It should always be the rightmost modifier, for example, `%{+keyname/1->}` or `%{}`. + +#### Example usage + +The following is an example of how to use a right padding modifier: + +`%{city->}, %{state} %{zip}` + +In this pattern, the right padding modifier `->` is applied to the `%{city}` key. Both addresses contain the same information, but the second entry has an extra word, `City`, in the `city` field. 
The right padding modifier allows the pattern to match both of these address entries even though they have slightly different formats: + +```bash +New York, NY 10017 +New York City, NY 10017 +``` + +The following example pipeline uses the right padding modifier with an empty key `%{->}`: + +```json +PUT /_ingest/pipeline/dissect-test +{ + "description": "Pipeline that dissects web server logs", + "processors": [ + { + "dissect": { + "field": "message", + "pattern": "[%{client_ip}]%{->}[%{timestamp}]" + } + } + ] +} +``` +{% include copy-curl.html %} + +You can test the pipeline by using the following example pipeline: + +```json +POST _ingest/pipeline/dissect-test/_simulate +{ + "docs": [ + { + "_index": "testindex1", + "_id": "1", + "_source": { + "message": "[192.168.1.10] [03/Nov/2023:15:20:45 +0000]" + } + } + ] +} +``` +{% include copy-curl.html %} + +Your response should appear similar to the following: + +```json +{ + "docs": [ + { + "doc": { + "_index": "testindex1", + "_id": "1", + "_source": { + "client_ip": "192.168.1.10", + "message": "[192.168.1.10] [03/Nov/2023:15:20:45 +0000]", + "timestamp": "03/Nov/2023:15:20:45 +0000" + }, + "_ingest": { + "timestamp": "2024-01-22T22:55:42.090569297Z" + } + } + } + ] +} +``` +{% include copy-curl.html %} + +### Append modifier (`+`) + +The _append modifier_ combines the values of two or more values into a single output value. The values are appended from left to right. You can also specify an optional separator to be inserted between the values. + +#### Example usage + +The following is an example pipeline with an append modifier: + +```json +PUT /_ingest/pipeline/dissect-test +{ + "description": "Pipeline that dissects web server logs", + "processors": [ + { + "dissect": { + "field": "message", + "pattern": "%{+address}, %{+address} %{+address}", + "append_separator": "|" + } + } + ] +} +``` +{% include copy-curl.html %} + +You can test the pipeline by using the following example pipeline: + +```json +POST _ingest/pipeline/dissect-test/_simulate +{ + "docs": [ + { + "_index": "testindex1", + "_id": "1", + "_source": { + "message": "New York, NY 10017" + } + } + ] +} +``` +{% include copy-curl.html %} + +The substrings are appended to the `address` field, as shown in the following response: + +```json +{ + "docs": [ + { + "doc": { + "_index": "testindex1", + "_id": "1", + "_source": { + "address": "New York|NY|10017", + "message": "New York, NY 10017" + }, + "_ingest": { + "timestamp": "2024-01-22T22:30:54.516284637Z" + } + } + } + ] +} +``` +{% include copy-curl.html %} + +### Append with order modifier (`+` and `/n`) + +The _append with order modifier_ combines the values of two or more keys into a single output value based on the order specified after `/`. You have the flexibility to customize the separator that separates the appended values. The append modifier is useful for compiling multiple fields into a single formatted output line, constructing structured lists of data items, and consolidating values from various sources. + +#### Example usage + +The following example pipeline uses the append with order modifier to reverse the pattern order defined in the preceding pipeline. This pipeline specifies a separator to be inserted between the appended fields. If you don't specify a separator, all values will be appended without a separator. 
+ +```json +PUT /_ingest/pipeline/dissect-test +{ + "description": "Pipeline that dissects web server logs", + "processors": [ + { + "dissect": { + "field": "message", + "pattern": "%{+address/3}, %{+address/2} %{+address/1}", + "append_separator": "|" + } + } + ] +} +``` +{% include copy-curl.html %} + +You can test the pipeline using the following example pipeline: + +```json +POST _ingest/pipeline/dissect-test/_simulate +{ + "docs": [ + { + "_index": "testindex1", + "_id": "1", + "_source": { + "message": "New York, NY 10017" + } + } + ] +} +``` +{% include copy-curl.html %} + +The substrings are appended to the `address` field in reverse order, as shown in the following response: + +```json +{ + "docs": [ + { + "doc": { + "_index": "testindex1", + "_id": "1", + "_source": { + "address": "10017|NY|New York", + "message": "New York, NY 10017" + }, + "_ingest": { + "timestamp": "2024-01-22T22:38:24.305974178Z" + } + } + } + ] +} +``` +{% include copy-curl.html %} + +### Named skip key modifier + +The _named skip key modifier_ excludes specific matches from the final output by using an empty key `{}` or `?` modifier within the pattern. For example, the following patterns are equivalent: `%{firstName} %{lastName} %{?ignore}` and `%{firstName} %{lastName} %{}`. The named skip key modifier is useful for excluding irrelevant or unnecessary fields from the output. + +#### Example usage + +The following pattern uses a named skip key to exclude a field (in this case, `ignore`) from the output. You can assign a descriptive name to the empty key, for example, `%{?ignore}`, to clarify that the corresponding value should be excluded from the final output: + +```json +PUT /_ingest/pipeline/dissect-test +{ + "description": "Pipeline that dissects web server logs", + "processors": [ + { + "dissect": { + "field": "message", + "pattern": "%{firstName} %{lastName} %{?ignore}" + } + } + ] +} +``` +{% include copy-curl.html %} + +You can test the pipeline using the following example pipeline: + +```json +POST _ingest/pipeline/dissect-test/_simulate +{ + "docs": [ + { + "_index": "testindex1", + "_id": "1", + "_source": { + "message": "John Doe M.D." + } + } + ] +} +``` +{% include copy-curl.html %} + +Your response should appear similar to the following: + +```json +{ + "docs": [ + { + "doc": { + "_index": "testindex1", + "_id": "1", + "_source": { + "firstName": "John", + "lastName": "Doe", + "message": "John Doe M.D." + }, + "_ingest": { + "timestamp": "2024-01-22T22:41:58.161475555Z" + } + } + } + ] +} +``` +{% include copy-curl.html %} + +### Reference keys (`*` and `&`) + +Reference keys use parsed values as key-value pairings for structured content. This can use useful when handling systems that partially log data in key-value pairs. By using reference keys, you can preserve the key-value relationship and maintain the integrity of the extracted information. + +#### Example usage + +The following pattern uses a reference key to extract data into a structured format. 
In this example, `client_ip` and two key-value pairs are extracted for the next values: + +```json +PUT /_ingest/pipeline/dissect-test +{ + "description": "Pipeline that dissects web server logs", + "processors": [ + { + "dissect": { + "field": "message", + "pattern": "%{client_ip} %{*a}:%{&a} %{*b}:%{&b}" + } + } + ] +} +``` +{% include copy-curl.html %} + +You can test the pipeline using the following example pipeline: + +```json +POST _ingest/pipeline/dissect-test/_simulate +{ + "docs": [ + { + "_index": "testindex1", + "_id": "1", + "_source": { + "message": "192.168.1.10 response_code:200 response_size:3456" + } + } + ] +} +``` +{% include copy-curl.html %} + +The two key-value pairs were extracted into fields, as shown in the following response: + +```json +{ + "docs": [ + { + "doc": { + "_index": "testindex1", + "_id": "1", + "_source": { + "client_ip": "192.168.1.10", + "response_code": "200", + "message": "192.168.1.10 response_code:200 response_size:3456", + "response_size": "3456" + }, + "_ingest": { + "timestamp": "2024-01-22T22:48:51.475535635Z" + } + } + } + ] +} +``` +{% include copy-curl.html %} diff --git a/_ingest-pipelines/processors/dot-expander.md b/_ingest-pipelines/processors/dot-expander.md new file mode 100644 index 0000000000..5cfebba758 --- /dev/null +++ b/_ingest-pipelines/processors/dot-expander.md @@ -0,0 +1,373 @@ +--- +layout: default +title: Dot expander +parent: Ingest processors +nav_order: 65 +--- + +# Dot expander + +The `dot_expander` processor is a tool that helps you work with hierarchical data. It transforms fields containing dots into object fields, making them accessible to other processors in the pipeline. Without this transformation, fields with dots cannot be processed. + +The following is the syntax for the `dot_expander` processor: + +```json +{ + "dot_expander": { + "field": "field.to.expand" + } +} +``` +{% include copy-curl.html %} + +## Configuration parameters + +The following table lists the required and optional parameters for the `dot_expander` processor. + +Parameter | Required/Optional | Description | +|-----------|-----------|-----------| +`field` | Required | The field to be expanded into an object field. | +`path` | Optional | This field is only required if the field to be expanded is nested within another object field. This is because the `field` parameter only recognizes leaf fields. | +`description` | Optional | A brief description of the processor. | +`if` | Optional | A condition for running the processor. | +`ignore_failure` | Optional | If set to `true`, failures are ignored. Default is `false`. | +`on_failure` | Optional | A list of processors to run if the processor fails. | +`tag` | Optional | An identifier tag for the processor. Useful for debugging in order to distinguish between processors of the same type. | + +## Using the processor + +Follow these steps to use the processor in a pipeline. + +### Step 1: Create a pipeline + +The following query creates a `dot_expander` processor that will expand two fields named `user.address.city` and `user.address.state` into nested objects: + +```json +PUT /_ingest/pipeline/dot-expander-pipeline +{ + "description": "Dot expander processor", + "processors": [ + { + "dot_expander": { + "field": "user.address.city" + } + }, + { + "dot_expander":{ + "field": "user.address.state" + } + } + ] +} +``` +{% include copy-curl.html %} + +### Step 2 (Optional): Test the pipeline + +It is recommended that you test your pipeline before you ingest documents. 
+{: .tip} + +To test the pipeline, run the following query: + +```json +POST _ingest/pipeline/dot-expander-pipeline/_simulate +{ + "docs": [ + { + "_index": "testindex1", + "_id": "1", + "_source": { + "user.address.city": "New York", + "user.address.state": "NY" + } + } + ] +} +``` +{% include copy-curl.html %} + +#### Response + +The following example response confirms that the pipeline is working as expected: + +```json +{ + "docs": [ + { + "doc": { + "_index": "testindex1", + "_id": "1", + "_source": { + "user": { + "address": { + "city": "New York", + "state": "NY" + } + } + }, + "_ingest": { + "timestamp": "2024-01-17T01:32:56.501346717Z" + } + } + } + ] +} +``` + +### Step 3: Ingest a document + +The following query ingests a document into an index named `testindex1`: + +```json +PUT testindex1/_doc/1?pipeline=dot-expander-pipeline +{ + "user.address.city": "Denver", + "user.address.state": "CO" +} +``` +{% include copy-curl.html %} + +### Step 4 (Optional): Retrieve the document + +To retrieve the document, run the following query: + +```json +GET testindex1/_doc/1 +``` +{% include copy-curl.html %} + +#### Response + +The following response confirms that the specified fields were expanded into nested fields: + +```json +{ + "_index": "testindex1", + "_id": "1", + "_version": 1, + "_seq_no": 3, + "_primary_term": 1, + "found": true, + "_source": { + "user": { + "address": { + "city": "Denver", + "state": "CO" + } + } + } +} +``` + +## The `path` parameter + +You can use the `path` parameter to specify the path to a dotted field within an object. For example, the following pipeline specifies the `address.city` field that is located within the `user` object: + +```json +PUT /_ingest/pipeline/dot-expander-pipeline +{ + "description": "Dot expander processor", + "processors": [ + { + "dot_expander": { + "field": "address.city", + "path": "user" + } + }, + { + "dot_expander":{ + "field": "address.state", + "path": "user" + } + } + ] +} +``` +{% include copy-curl.html %} + +You can simulate the pipeline as follows: + +```json +POST _ingest/pipeline/dot-expander-pipeline/_simulate +{ + "docs": [ + { + "_index": "testindex1", + "_id": "1", + "_source": { + "user": { + "address.city": "New York", + "address.state": "NY" + } + } + } + ] +} +``` +{% include copy-curl.html %} + +The `dot_expander` processor transforms the document into the following structure: + +```json +{ + "user": { + "address": { + "city": "New York", + "state": "NY" + } + } +} +``` + +## Field name conflicts + +If a field already exists with the same path as the path to which the `dot_expander` processor should expand the value, the processor merges the two values into an array. 
+ +Consider the following pipeline that expands the field `user.name`: + +```json +PUT /_ingest/pipeline/dot-expander-pipeline +{ + "description": "Dot expander processor", + "processors": [ + { + "dot_expander": { + "field": "user.name" + } + } + ] +} +``` +{% include copy-curl.html %} + +You can simulate the pipeline with a document containing two values with the exact same path `user.name`: + +```json +POST _ingest/pipeline/dot-expander-pipeline/_simulate +{ + "docs": [ + { + "_index": "testindex1", + "_id": "1", + "_source": { + "user.name": "John", + "user": { + "name": "Steve" + } + } + } + ] +} +``` +{% include copy-curl.html %} + +The response confirms that the values were merged into an array: + +```json +{ + "docs": [ + { + "doc": { + "_index": "testindex1", + "_id": "1", + "_source": { + "user": { + "name": [ + "Steve", + "John" + ] + } + }, + "_ingest": { + "timestamp": "2024-01-17T01:44:57.420220551Z" + } + } + } + ] +} +``` + +If a field contains the same name but a different path, then the field needs to be renamed. For example, the following `_simulate` call returns a parse exception: + +```json +POST _ingest/pipeline/dot-expander-pipeline/_simulate +{ + "docs": [ + { + "_index": "testindex1", + "_id": "1", + "_source": { + "user": "John", + "user.name": "Steve" + } + } + ] +} +``` + +To avoid the parse exception, first rename the field by using the `rename` processor: + +```json +PUT /_ingest/pipeline/dot-expander-pipeline +{ + "processors" : [ + { + "rename" : { + "field" : "user", + "target_field" : "user.name" + } + }, + { + "dot_expander": { + "field": "user.name" + } + } + ] +} +``` +{% include copy-curl.html %} + +Now you can simulate the pipeline: + +```json +POST _ingest/pipeline/dot-expander-pipeline/_simulate +{ + "docs": [ + { + "_index": "testindex1", + "_id": "1", + "_source": { + "user": "John", + "user.name": "Steve" + } + } + ] +} +``` +{% include copy-curl.html %} + +The response confirms that the fields were merged: + +```json +{ + "docs": [ + { + "doc": { + "_index": "testindex1", + "_id": "1", + "_source": { + "user": { + "name": [ + "John", + "Steve" + ] + } + }, + "_ingest": { + "timestamp": "2024-01-17T01:52:12.864432419Z" + } + } + } + ] +} +``` diff --git a/_ingest-pipelines/processors/drop.md b/_ingest-pipelines/processors/drop.md new file mode 100644 index 0000000000..c7bfc3cd75 --- /dev/null +++ b/_ingest-pipelines/processors/drop.md @@ -0,0 +1,126 @@ +--- +layout: default +title: Drop +parent: Ingest processors +nav_order: 70 +--- + +This documentation describes using the `drop` processor in OpenSearch ingest pipelines. Consider using the [Data Prepper `drop_events` processor]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/drop-events/), which runs on the OpenSearch cluster, if your use case involves large or complex datasets. +{: .note} + +# Drop processor + +The `drop` processor is used to discard documents without indexing them. This can be useful for preventing documents from being indexed based on certain conditions. For example, you might use a `drop` processor to prevent documents that are missing important fields or contain sensitive information from being indexed. + +The `drop` processor does not raise any errors when it discards documents, making it useful for preventing indexing problems without cluttering your OpenSearch logs with error messages. 
+ +## Syntax example + +The following is the syntax for the `drop` processor: + +```json +{ + "drop": { + "if": "ctx.foo == 'bar'" + } +} +``` +{% include copy-curl.html %} + +## Configuration parameters + +The following table lists the required and optional parameters for the `drop` processor. + +Parameter | Required | Description | +|-----------|-----------|-----------| +`description` | Optional | A brief description of the processor. | +`if` | Optional | A condition for running the processor. | +`ignore_failure` | Optional | If set to `true`, failures are ignored. Default is `false`. See [Handling pipeline failures]({{site.url}}{{site.baseurl}}/ingest-pipelines/pipeline-failures/) for more information. | +`on_failure` | Optional | A list of processors to run if the processor fails. See [Handling pipeline failures]({{site.url}}{{site.baseurl}}/ingest-pipelines/pipeline-failures/) for more information. | +`tag` | Optional | An identifier tag for the processor. Useful for distinguishing between processors of the same type when debugging. | + +## Using the processor + +Follow these steps to use the processor in a pipeline. + +**Step 1: Create a pipeline** + +The following query creates a pipeline, named `drop-pii`, that uses the `drop` processor to prevent a document containing personally identifiable information (PII) from being indexed: + +```json +PUT /_ingest/pipeline/drop-pii +{ + "description": "Pipeline that prevents PII from being indexed", + "processors": [ + { + "drop": { + "if" : "ctx.user_info.contains('password') || ctx.user_info.contains('credit card')" + } + } + ] +} +``` +{% include copy-curl.html %} + +**Step 2 (Optional): Test the pipeline** + +It is recommended that you test your pipeline before ingesting documents. +{: .tip} + +To test the pipeline, run the following query: + +```json +POST _ingest/pipeline/drop-pii/_simulate +{ + "docs": [ + { + "_index": "testindex1", + "_id": "1", + "_source": { + "user_info": "Sensitive information including credit card" + } + } + ] +} +``` +{% include copy-curl.html %} + +#### Response + +The following example response confirms that the pipeline is working as expected (the document has been dropped): + +```json +{ + "docs": [ + null + ] +} +``` +{% include copy-curl.html %} + +**Step 3: Ingest a document** + +The following query ingests a document into an index named `testindex1`: + +```json +PUT testindex1/_doc/1?pipeline=drop-pii +{ + "user_info": "Sensitive information including credit card" +} +``` +{% include copy-curl.html %} + +The following response confirms that the document with the ID of `1` was not indexed: + +{ + "_index": "testindex1", + "_id": "1", + "_version": -3, + "result": "noop", + "_shards": { + "total": 0, + "successful": 0, + "failed": 0 + } +} +{% include copy-curl.html %} diff --git a/_ingest-pipelines/processors/grok.md b/_ingest-pipelines/processors/grok.md index 981ec34bbd..5579dbda13 100644 --- a/_ingest-pipelines/processors/grok.md +++ b/_ingest-pipelines/processors/grok.md @@ -6,6 +6,9 @@ grand_parent: Ingest pipelines nav_order: 140 --- +This documentation describes using the `grok` processor in OpenSearch ingest pipelines. Consider using the [Data Prepper `grok` processor]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/grok/), which runs on the OpenSearch cluster, if your use case involves large or complex datasets. +{: .note} + # Grok processor The `grok` processor is used to parse and structure unstructured data using pattern matching. 
You can use the `grok` processor to extract fields from log messages, web server access logs, application logs, and other log data that follows a consistent format. diff --git a/_ingest-pipelines/processors/index-processors.md b/_ingest-pipelines/processors/index-processors.md index 781780e47e..60fcac82e2 100644 --- a/_ingest-pipelines/processors/index-processors.md +++ b/_ingest-pipelines/processors/index-processors.md @@ -30,7 +30,9 @@ Processor type | Description :--- | :--- `append` | Adds one or more values to a field in a document. `bytes` | Converts a human-readable byte value to its value in bytes. -`convert` | Changes the data type of a field in a document. +`community_id` | Generates a community ID flow hash algorithm for the network flow tuples. +`convert` | Changes the data type of a field in a document. +`copy` | Copies an entire object in an existing field to another field. `csv` | Extracts CSVs and stores them as individual fields in a document. `date` | Parses dates from fields and then uses the date or timestamp as the timestamp for a document. `date_index_name` | Indexes documents into time-based indexes based on a date or timestamp field in a document. @@ -51,11 +53,13 @@ Processor type | Description `lowercase` | Converts text in a specific field to lowercase letters. `pipeline` | Runs an inner pipeline. `remove` | Removes fields from a document. +`remove_by_pattern` | Removes fields from a document by field pattern. `script` | Runs an inline or stored script on incoming documents. `set` | Sets the value of a field to a specified value. `sort` | Sorts the elements of an array in ascending or descending order. `sparse_encoding` | Generates a sparse vector/token and weights from text fields for neural sparse search using sparse retrieval. `split` | Splits a field into an array using a separator character. +`text_chunking` | Splits long documents into smaller chunks. `text_embedding` | Generates vector embeddings from text fields for semantic search. `text_image_embedding` | Generates combined vector embeddings from text and image fields for multimodal neural search. `trim` | Removes leading and trailing white space from a string field. diff --git a/_ingest-pipelines/processors/ip2geo.md b/_ingest-pipelines/processors/ip2geo.md index 1563208b67..8e53c778a1 100644 --- a/_ingest-pipelines/processors/ip2geo.md +++ b/_ingest-pipelines/processors/ip2geo.md @@ -208,7 +208,7 @@ POST _ingest/pipeline/my-pipeline/_simulate "_index": "testindex1", "_id": "1", "_source": { - "ip": "172.0.0.1", + "ip": "172.0.0.1" } } ] @@ -249,7 +249,7 @@ The following response confirms that the pipeline is working as expected: The following query ingests a document into an index named `my-index`: ```json -PUT /my-index/_doc/my-id?pipeline=ip2geo +PUT /my-index/_doc/my-id?pipeline=my-pipeline { "ip": "172.0.0.1" } diff --git a/_ingest-pipelines/processors/kv.md b/_ingest-pipelines/processors/kv.md index c81025360f..cc23507056 100644 --- a/_ingest-pipelines/processors/kv.md +++ b/_ingest-pipelines/processors/kv.md @@ -7,6 +7,9 @@ redirect_from: - /api-reference/ingest-apis/processors/lowercase/ --- +This documentation describes using the `kv` processor in OpenSearch ingest pipelines. Consider using the [Data Prepper `key_value` processor]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/key-value/), which runs on the OpenSearch cluster, if your use case involves large or complex datasets. 
+{: .note} + # KV processor The `kv` processor automatically extracts specific event fields or messages that are in a `key=value` format. This structured format organizes your data by grouping it together based on keys and values. It's helpful for analyzing, visualizing, and using data, such as user behavior analytics, performance optimizations, or security investigations. @@ -31,7 +34,7 @@ The following is the syntax for the `kv` processor: The following table lists the required and optional parameters for the `kv` processor. | Parameter | Required/Optional | Description | -`field` | Required | The name of the field containing the data to be parsed. Supports [template snippets]({{site.url}}{{site.baseurl}}/ingest-pipelines/create-ingest/#template-snippets). | +`field` | Required | The name of the field containing the data to be parsed. | `field_split` | Required | The regex pattern for key-value pair splitting. | `value_split` | Required | The regex pattern for splitting the key from the value within a key-value pair, for example, equal sign `=` or colon `:`. `exclude_keys` | Optional | The keys to exclude from the document. Default is `null`. | @@ -46,7 +49,7 @@ The following table lists the required and optional parameters for the `kv` proc `on_failure` | Optional | A list of processors to run if the processor fails. | `ignore_missing` | Optional | Specifies whether the processor should ignore documents that do not contain the specified field. Default is `false`. | `tag` | Optional | An identifier tag for the processor. Useful for debugging in order to distinguish between processors of the same type. | -`target_field` | Optional | The name of the field in which to insert the extracted keys. Default is `null`. Supports [template snippets]({{site.url}}{{site.baseurl}}/ingest-pipelines/create-ingest/#template-snippets). | +`target_field` | Optional | The name of the field in which to insert the extracted keys. Default is `null`. | ## Using the processor diff --git a/_ingest-pipelines/processors/lowercase.md b/_ingest-pipelines/processors/lowercase.md index f957e96381..5bfa370491 100644 --- a/_ingest-pipelines/processors/lowercase.md +++ b/_ingest-pipelines/processors/lowercase.md @@ -7,6 +7,9 @@ redirect_from: - /api-reference/ingest-apis/processors/lowercase/ --- +This documentation describes using the `lowercase` processor in OpenSearch ingest pipelines. Consider using the [Data Prepper `lowercase_string` processor]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/lowercase-string/), which runs on the OpenSearch cluster, if your use case involves large or complex datasets. +{: .note} + # Lowercase processor The `lowercase` processor converts all the text in a specific field to lowercase letters. diff --git a/_ingest-pipelines/processors/remove.md b/_ingest-pipelines/processors/remove.md index ac87fd3f9c..9656f437b3 100644 --- a/_ingest-pipelines/processors/remove.md +++ b/_ingest-pipelines/processors/remove.md @@ -30,7 +30,8 @@ The following table lists the required and optional parameters for the `remove` | Parameter | Required/Optional | Description | |---|---|---| -`field` | Required | The name of the field containing the data to be removed. Supports [template snippets]({{site.url}}{{site.baseurl}}/ingest-pipelines/create-ingest/#template-snippets). The following metadata fields are not allowed to be removed: `_index`, `_version`, `_version_type`, and `_id`. 
Note that `_id` is not allowed to be removed when there's a specified external version for the ingesting document. | +`field` | Optional | The field name containing the data to be removed. Supports [template snippets]({{site.url}}{{site.baseurl}}/ingest-pipelines/create-ingest/#template-snippets). The metadata fields `_index`, `_version`, `_version_type`, and `_id` cannot be removed. If `version` is specified, `_id` cannot be removed from the ingested document. | +`exclude_field` | Optional | The field name to be retained. All other fields, except metadata fields, will be removed. The `exclude_field` and `field` options are mutually exclusive. Supports [template snippets]({{site.url}}{{site.baseurl}}/ingest-pipelines/create-ingest/#template-snippets). | `description` | Optional | A brief description of the processor. | `if` | Optional | A condition for running the processor. | `ignore_failure` | Optional | Specifies whether the processor continues execution even if it encounters errors. If set to `true`, failures are ignored. Default is `false`. | @@ -112,7 +113,7 @@ The following example response confirms that the pipeline is working as expected The following query ingests a document into an index named `testindex1`: ```json -PPUT testindex1/_doc/1?pipeline=remove_ip +PUT testindex1/_doc/1?pipeline=remove_ip { "ip_address": "203.0.113.1", "name": "John Doe" diff --git a/_ingest-pipelines/processors/remove_by_pattern.md b/_ingest-pipelines/processors/remove_by_pattern.md new file mode 100644 index 0000000000..5fd1516fb7 --- /dev/null +++ b/_ingest-pipelines/processors/remove_by_pattern.md @@ -0,0 +1,133 @@ +--- +layout: default +title: Remove_by_pattern +parent: Ingest processors +nav_order: 225 +redirect_from: + - /api-reference/ingest-apis/processors/remove_by_pattern/ +--- + +# Remove_by_pattern processor + +The `remove_by_pattern` processor removes the root-level fields from a document by using specified wildcard patterns. + +## Syntax + +The following is the syntax for the `remove_by_pattern` processor: + +```json +{ + "remove_by_pattern": { + "field_pattern": "field_name_prefix*" + } +} +``` +{% include copy-curl.html %} + +## Configuration parameters + +The following table lists the required and optional parameters for the `remove_by_pattern` processor. + +| Parameter | Required/Optional | Description | +|---|---|---| +`field_pattern` | Optional | Removes fields that match the specified pattern. All of the metadata fields, such as `_index`, `_version`, `_version_type`, and `_id`, are ignored if they match the pattern. This option only supports the root-level fields in the document. | +`exclude_field_pattern` | Optional | Removes fields that do not match the specified pattern. All of the metadata fields, such as `_index`, `_version`, `_version_type`, and `_id`, are ignored if they do not match the pattern. This option only supports the root-level fields in the document. The `field_pattern` and `exclude_field_pattern` options are mutually exclusive. | +`description` | Optional | A brief description of the processor. | +`if` | Optional | A condition for running the processor. | +`ignore_failure` | Optional | Specifies whether the processor continues execution even if it encounters errors. If set to `true`, the failure is ignored. Default is `false`. | +`on_failure` | Optional | A list of processors to run if the processor fails. | +`tag` | Optional | An identifier tag for the processor. Useful for debugging in order to distinguish between processors of the same type. 
|
+
+## Using the processor
+
+Follow these steps to use the processor in a pipeline.
+
+**Step 1: Create a pipeline**
+
+The following query creates a pipeline named `remove_fields_by_pattern` that removes the fields that match the pattern `foo*`:
+
+```json
+PUT /_ingest/pipeline/remove_fields_by_pattern
+{
+  "description": "Pipeline that removes the fields by patterns.",
+  "processors": [
+    {
+      "remove_by_pattern": {
+        "field_pattern": "foo*"
+      }
+    }
+  ]
+}
+```
+{% include copy-curl.html %}
+
+**Step 2 (Optional): Test the pipeline**
+
+It is recommended that you test your pipeline before you ingest documents.
+{: .tip}
+
+To test the pipeline, run the following query:
+
+```json
+POST _ingest/pipeline/remove_fields_by_pattern/_simulate
+{
+  "docs": [
+    {
+      "_index": "testindex1",
+      "_id": "1",
+      "_source":{
+        "foo1": "foo1",
+        "foo2": "foo2",
+        "bar": "bar"
+      }
+    }
+  ]
+}
+```
+{% include copy-curl.html %}
+
+**Response**
+
+The following example response confirms that the pipeline is working as expected:
+
+```json
+{
+  "docs": [
+    {
+      "doc": {
+        "_index": "testindex1",
+        "_id": "1",
+        "_source": {
+          "bar": "bar"
+        },
+        "_ingest": {
+          "timestamp": "2023-08-24T18:02:13.218986756Z"
+        }
+      }
+    }
+  ]
+}
+```
+
+**Step 3: Ingest a document**
+
+The following query ingests a document into an index named `testindex1`:
+
+```json
+PUT testindex1/_doc/1?pipeline=remove_fields_by_pattern
+{
+  "foo1": "foo1",
+  "foo2": "foo2",
+  "bar": "bar"
+}
+```
+{% include copy-curl.html %}
+
+**Step 4 (Optional): Retrieve the document**
+
+To retrieve the document, run the following query:
+
+```json
+GET testindex1/_doc/1
+```
+{% include copy-curl.html %}
diff --git a/_ingest-pipelines/processors/text-chunking.md b/_ingest-pipelines/processors/text-chunking.md
new file mode 100644
index 0000000000..d11c380bde
--- /dev/null
+++ b/_ingest-pipelines/processors/text-chunking.md
@@ -0,0 +1,207 @@
+---
+layout: default
+title: Text chunking
+parent: Ingest processors
+nav_order: 250
+---
+
+# Text chunking processor
+
+The `text_chunking` processor splits a long document into shorter passages. The processor supports the following algorithms for text splitting:
+
+- [`fixed_token_length`](#fixed-token-length-algorithm): Splits text into passages of the specified size.
+- [`delimiter`](#delimiter-algorithm): Splits text into passages on a delimiter.
+
+The following is the syntax for the `text_chunking` processor:
+
+```json
+{
+  "text_chunking": {
+    "field_map": {
+      "<input_field>": "<output_field>"
+    },
+    "algorithm": {
+      "<name>": "<parameters>"
+    }
+  }
+}
+```
+
+## Configuration parameters
+
+The following table lists the required and optional parameters for the `text_chunking` processor.
+
+| Parameter | Data type | Required/Optional | Description |
+|:---|:---|:---|:---|
+| `field_map` | Object | Required | Contains key-value pairs that specify the mapping of a text field to the output field. |
+| `field_map.<input_field>` | String | Required | The name of the field from which to obtain text for generating chunked passages. |
+| `field_map.<output_field>` | String | Required | The name of the field in which to store the chunked results. |
+| `algorithm` | Object | Required | Contains at most one key-value pair that specifies the chunking algorithm and parameters. |
+| `algorithm.<name>` | String | Optional | The name of the chunking algorithm. Valid values are [`fixed_token_length`](#fixed-token-length-algorithm) or [`delimiter`](#delimiter-algorithm). Default is `fixed_token_length`. |
+| `algorithm.<parameters>` | Object | Optional | The parameters for the chunking algorithm.
By default, contains the default parameters of the `fixed_token_length` algorithm. | +| `description` | String | Optional | A brief description of the processor. | +| `tag` | String | Optional | An identifier tag for the processor. Useful when debugging in order to distinguish between processors of the same type. | + +### Fixed token length algorithm + +The following table lists the optional parameters for the `fixed_token_length` algorithm. + +| Parameter | Data type | Required/Optional | Description | +|:---|:---|:---|:---| +| `token_limit` | Integer | Optional | The token limit for chunking algorithms. Valid values are integers of at least `1`. Default is `384`. | +| `tokenizer` | String | Optional | The [word tokenizer]({{site.url}}{{site.baseurl}}/analyzers/tokenizers/index/#word-tokenizers) name. Default is `standard`. | +| `overlap_rate` | String | Optional | The degree of overlap in the token algorithm. Valid values are floats between `0` and `0.5`, inclusive. Default is `0`. | +| `max_chunk_limit` | Integer | Optional | The chunk limit for chunking algorithms. Default is 100. To disable this parameter, set it to `-1`. | + +The default value of `token_limit` is `384` so that output passages don't exceed the token limit constraint of the downstream text embedding models. For [OpenSearch-supported pretrained models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/pretrained-models/#supported-pretrained-models), like `msmarco-distilbert-base-tas-b` and `opensearch-neural-sparse-encoding-v1`, the input token limit is `512`. The `standard` tokenizer tokenizes text into words. According to [OpenAI](https://platform.openai.com/docs/introduction), 1 token equals approximately 0.75 words of English text. The default token limit is calculated as 512 * 0.75 = 384. +{: .note} + +You can set the `overlap_rate` to a decimal percentage value in the 0--0.5 range, inclusive. Per [Amazon Bedrock](https://aws.amazon.com/blogs/aws/knowledge-bases-now-delivers-fully-managed-rag-experience-in-amazon-bedrock/), we recommend setting this parameter to a value of 0–0.2 to improve accuracy. +{: .note} + +The `max_chunk_limit` parameter limits the number of chunked passages. If the number of passages generated by the processor exceeds the limit, the algorithm will return an exception, prompting you to either increase or disable the limit. +{: .note} + +### Delimiter algorithm + +The following table lists the optional parameters for the `delimiter` algorithm. + +| Parameter | Data type | Required/Optional | Description | +|:---|:---|:---|:---| +| `delimiter` | String | Optional | A string delimiter used to split text. You can set the `delimiter` to any string, for example, `\n` (split text into paragraphs on a new line) or `.` (split text into sentences). Default is `\n\n` (split text into paragraphs on two new line characters). | +| `max_chunk_limit` | Integer | Optional | The chunk limit for chunking algorithms. Default is `100`. To disable this parameter, set it to `-1`. | + +The `max_chunk_limit` parameter limits the number of chunked passages. If the number of passages generated by the processor exceeds the limit, the algorithm will return an exception, prompting you to either increase or disable the limit. +{: .note} + +## Using the processor + +Follow these steps to use the processor in a pipeline. You can specify the chunking algorithm when creating the processor. 
If you don't provide an algorithm name, the chunking processor will use the default `fixed_token_length` algorithm along with all its default parameters. + +**Step 1: Create a pipeline** + +The following example request creates an ingest pipeline that converts the text in the `passage_text` field into chunked passages, which will be stored in the `passage_chunk` field: + +```json +PUT _ingest/pipeline/text-chunking-ingest-pipeline +{ + "description": "A text chunking ingest pipeline", + "processors": [ + { + "text_chunking": { + "algorithm": { + "fixed_token_length": { + "token_limit": 10, + "overlap_rate": 0.2, + "tokenizer": "standard" + } + }, + "field_map": { + "passage_text": "passage_chunk" + } + } + } + ] +} +``` +{% include copy-curl.html %} + +**Step 2 (Optional): Test the pipeline** + +It is recommended that you test your pipeline before ingesting documents. +{: .tip} + +To test the pipeline, run the following query: + +```json +POST _ingest/pipeline/text-chunking-ingest-pipeline/_simulate +{ + "docs": [ + { + "_index": "testindex", + "_id": "1", + "_source":{ + "passage_text": "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." + } + } + ] +} +``` +{% include copy-curl.html %} + +#### Response + +The response confirms that, in addition to the `passage_text` field, the processor has generated chunking results in the `passage_chunk` field. The processor split the paragraph into 10-word chunks. Because of the `overlap` setting of 0.2, the last 2 words of a chunk are duplicated in the following chunk: + +```json +{ + "docs": [ + { + "doc": { + "_index": "testindex", + "_id": "1", + "_source": { + "passage_text": "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch.", + "passage_chunk": [ + "This is an example document to be chunked. The document ", + "The document contains a single paragraph, two sentences and 24 ", + "and 24 tokens by standard tokenizer in OpenSearch." + ] + }, + "_ingest": { + "timestamp": "2024-03-20T02:55:25.642366Z" + } + } + } + ] +} +``` + +Once you have created an ingest pipeline, you need to create an index for document ingestion. To learn more, see [Text chunking]({{site.url}}{{site.baseurl}}/search-plugins/text-chunking/). + +## Cascaded text chunking processors + +You can chain multiple text chunking processors together. For example, to split documents into paragraphs, apply the `delimiter` algorithm and specify the parameter as `\n\n`. To prevent a paragraph from exceeding the token limit, append another text chunking processor that uses the `fixed_token_length` algorithm. You can configure the ingest pipeline for this example as follows: + +```json +PUT _ingest/pipeline/text-chunking-cascade-ingest-pipeline +{ + "description": "A text chunking pipeline with cascaded algorithms", + "processors": [ + { + "text_chunking": { + "algorithm": { + "delimiter": { + "delimiter": "\n\n" + } + }, + "field_map": { + "passage_text": "passage_chunk1" + } + } + }, + { + "text_chunking": { + "algorithm": { + "fixed_token_length": { + "token_limit": 500, + "overlap_rate": 0.2, + "tokenizer": "standard" + } + }, + "field_map": { + "passage_chunk1": "passage_chunk2" + } + } + } + ] +} +``` +{% include copy-curl.html %} + +## Next steps + +- For a complete example, see [Text chunking]({{site.url}}{{site.baseurl}}/search-plugins/text-chunking/). 
+- To learn more about semantic search, see [Semantic search]({{site.url}}{{site.baseurl}}/search-plugins/semantic-search/). +- To learn more about sparse search, see [Neural sparse search]({{site.url}}{{site.baseurl}}/search-plugins/neural-sparse-search/). +- To learn more about using models in OpenSearch, see [Choosing a model]({{site.url}}{{site.baseurl}}/ml-commons-plugin/integrating-ml-models/#choosing-a-model). diff --git a/_ingest-pipelines/processors/uppercase.md b/_ingest-pipelines/processors/uppercase.md index e3cea317b6..7fa5192f42 100644 --- a/_ingest-pipelines/processors/uppercase.md +++ b/_ingest-pipelines/processors/uppercase.md @@ -7,6 +7,9 @@ redirect_from: - /api-reference/ingest-apis/processors/uppercase/ --- +This documentation describes using the `uppercase` processor in OpenSearch ingest pipelines. Consider using the [Data Prepper `uppercase_string` processor]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/uppercase-string/), which runs on the OpenSearch cluster, if your use case involves large or complex datasets. +{: .note} + # Uppercase processor The `uppercase` processor converts all the text in a specific field to uppercase letters. diff --git a/_install-and-configure/configuring-opensearch/availability-recovery.md b/_install-and-configure/configuring-opensearch/availability-recovery.md index 133bcda706..d25396a63f 100644 --- a/_install-and-configure/configuring-opensearch/availability-recovery.md +++ b/_install-and-configure/configuring-opensearch/availability-recovery.md @@ -31,7 +31,7 @@ For security-related snapshot settings, see [Security settings]({{site.url}}{{si ### File system settings -For information about Amazon S3 repository settings, see [Amazon S3]({{site.url}}{{site.baseurl}}/tuning-your-cluster/availability-and-recovery/snapshots/snapshot-restore/#shared-file-system). +For information about file system settings, see [Shared file system]({{site.url}}{{site.baseurl}}/tuning-your-cluster/availability-and-recovery/snapshots/snapshot-restore/#shared-file-system). ### Amazon S3 settings diff --git a/_install-and-configure/configuring-opensearch/circuit-breaker.md b/_install-and-configure/configuring-opensearch/circuit-breaker.md index 8fdea52776..a32d5d924f 100644 --- a/_install-and-configure/configuring-opensearch/circuit-breaker.md +++ b/_install-and-configure/configuring-opensearch/circuit-breaker.md @@ -15,7 +15,7 @@ To learn more about static and dynamic settings, see [Configuring OpenSearch]({{ OpenSearch supports the following parent circuit breaker settings: -- `indices.breaker.total.use_real_memory` (Static, Boolean): If `true`, the parent circuit breaker considers the actual memory usage. Otherwise, the parent circuit breaker considers the amount of memory reserved by the child circuit breakers. Default is `false`. +- `indices.breaker.total.use_real_memory` (Static, Boolean): If `true`, the parent circuit breaker considers the actual memory usage. Otherwise, the parent circuit breaker considers the amount of memory reserved by the child circuit breakers. Default is `true`. - `indices.breaker.total.limit` (Dynamic, percentage): Specifies the initial memory limit for the parent circuit breaker. If `indices.breaker.total.use_real_memory` is `true`, defaults to 95% of the JVM heap. If `indices.breaker.total.use_real_memory` is `false`, defaults to 70% of the JVM heap. 
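+
+Because `indices.breaker.total.limit` is dynamic, you can adjust it at runtime. The following is a minimal, illustrative sketch (the `80%` value is only an example, not a recommendation) of updating the parent circuit breaker limit through the Cluster Settings API:
+
+```json
+PUT _cluster/settings
+{
+  "persistent": {
+    "indices.breaker.total.limit": "80%"
+  }
+}
+```
+{% include copy-curl.html %}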
diff --git a/_install-and-configure/configuring-opensearch/cluster-settings.md b/_install-and-configure/configuring-opensearch/cluster-settings.md index 3564ea9e4d..c10f877300 100644 --- a/_install-and-configure/configuring-opensearch/cluster-settings.md +++ b/_install-and-configure/configuring-opensearch/cluster-settings.md @@ -114,8 +114,30 @@ OpenSearch supports the following cluster-level shard, block, and task settings: Default is `all`. -- `cluster.persistent_tasks.allocation.recheck_interval` (Time unit): The cluster manager automatically checks whether persistent tasks need to be assigned when the cluster state changes in a significant way. There are other factors, such as memory usage, that will affect whether persistent tasks are assigned to nodes but do not otherwise cause the cluster state to change. This setting defines how often assignment checks are performed in response to these factors. Default is `30 seconds`, with a minimum of `10 seconds` being required. +- `cluster.persistent_tasks.allocation.recheck_interval` (Time unit): The cluster manager automatically checks whether persistent tasks need to be assigned when the cluster state changes in a significant way. There are other factors, such as memory usage, that will affect whether persistent tasks are assigned to nodes but do not otherwise cause the cluster state to change. This setting defines how often assignment checks are performed in response to these factors. Default is `30 seconds`, with a minimum of `10 seconds` being required. + +## Cluster-level slow log settings + +For more information, see [Search request slow logs]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/logs/#search-request-slow-logs). + +- `cluster.search.request.slowlog.threshold.warn` (Time unit): Sets the request-level slow log `WARN` threshold. Default is `-1`. + +- `cluster.search.request.slowlog.threshold.info` (Time unit): Sets the request-level slow log `INFO` threshold. Default is `-1`. + +- `cluster.search.request.slowlog.threshold.debug` (Time unit): Sets the request-level slow log `DEBUG` threshold. Default is `-1`. + +- `cluster.search.request.slowlog.threshold.trace` (Time unit): Sets the request-level slow log `TRACE` threshold. Default is `-1`. + +- `cluster.search.request.slowlog.level` (String): Sets the minimum slow log level to log: `WARN`, `INFO`, `DEBUG`, and `TRACE`. Default is `TRACE`. ## Cluster-level index settings -For information about index-level index settings, see [Cluster-level index settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index-settings/#cluster-level-index-settings). \ No newline at end of file +For information about index-level index settings, see [Cluster-level index settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index-settings/#cluster-level-index-settings). + +## Cluster-level coordination settings + +OpenSearch supports the following cluster-level coordination settings. All settings in this list are dynamic: + +- `cluster.fault_detection.leader_check.timeout` (Time unit): The amount of time a node waits for a response from the elected cluster manager during a leader check before deeming the check a failure. Valid values are from `1ms` to `60s`, inclusive. Default is `10s`. Changing this setting to a value other than the default can result in an unstable cluster. 
+ +- `cluster.fault_detection.follower_check.timeout` (Time unit): The amount of time the elected cluster manager waits for a response during a follower check before deeming the check a failure. Valid values are from `1ms` to `60s`, inclusive. Default is `10s`. Changing this setting to a value other than the default can result in an unstable cluster. diff --git a/_install-and-configure/configuring-opensearch/index-settings.md b/_install-and-configure/configuring-opensearch/index-settings.md index f88d060228..25cd4b8810 100644 --- a/_install-and-configure/configuring-opensearch/index-settings.md +++ b/_install-and-configure/configuring-opensearch/index-settings.md @@ -100,6 +100,7 @@ OpenSearch supports the following static index-level index settings: - `index.merge_on_flush.policy` (default | merge-on-flush): This setting controls which merge policy should be used when `index.merge_on_flush.enabled` is enabled. Default is `default`. +- `index.check_pending_flush.enabled` (Boolean): This setting controls the Apache Lucene `checkPendingFlushOnUpdate` index writer setting, which specifies whether an indexing thread should check for pending flushes on an update in order to flush indexing buffers to disk. Default is `true`. ### Updating a static index setting @@ -172,6 +173,8 @@ OpenSearch supports the following dynamic index-level index settings: - `index.query.default_field` (List): A field or list of fields that OpenSearch uses in queries in case a field isn't specified in the parameters. +- `index.query.max_nested_depth` (Integer): The maximum number of nesting levels for `nested` queries. Default is `Integer.MAX_VALUE`. Minimum is 1 (single `nested` query). + - `index.routing.allocation.enable` (String): Specifies options for the index’s shard allocation. Available options are `all` (allow allocation for all shards), `primaries` (allow allocation only for primary shards), `new_primaries` (allow allocation only for new primary shards), and `none` (do not allow allocation). Default is `all`. - `index.routing.rebalance.enable` (String): Enables shard rebalancing for the index. Available options are `all` (allow rebalancing for all shards), `primaries` (allow rebalancing only for primary shards), `replicas` (allow rebalancing only for replicas), and `none` (do not allow rebalancing). Default is `all`. @@ -182,6 +185,10 @@ OpenSearch supports the following dynamic index-level index settings: - `index.final_pipeline` (String): The final ingest node pipeline for the index. If the final pipeline is set and the pipeline does not exist, then index requests fail. The pipeline name `_none` specifies that the index does not have an ingest pipeline. +- `index.optimize_doc_id_lookup.fuzzy_set.enabled` (Boolean): This setting controls whether `fuzzy_set` should be enabled in order to optimize document ID lookups in index or search calls by using an additional data structure, in this case, the Bloom filter data structure. Enabling this setting improves performance for upsert and search operations that rely on document IDs by creating a new data structure (Bloom filter). The Bloom filter allows for the handling of negative cases (that is, IDs being absent in the existing index) through faster off-heap lookups. Note that creating a Bloom filter requires additional heap usage during indexing time. Default is `false`. + +- `index.optimize_doc_id_lookup.fuzzy_set.false_positive_probability` (Double): Sets the false-positive probability for the underlying `fuzzy_set` (that is, the Bloom filter). 
A lower false-positive probability ensures higher throughput for upsert and get operations but results in increased storage and memory use. Allowed values range between `0.01` and `0.50`. Default is `0.20`. + ### Updating a dynamic index setting You can update a dynamic index setting at any time through the API. For example, to update the refresh interval, use the following request: diff --git a/_install-and-configure/configuring-opensearch/logs.md b/_install-and-configure/configuring-opensearch/logs.md index c741c9e62e..e601a1eeaa 100644 --- a/_install-and-configure/configuring-opensearch/logs.md +++ b/_install-and-configure/configuring-opensearch/logs.md @@ -43,7 +43,7 @@ The easiest way to identify modules is not from the logs, which abbreviate the p After this sample change, OpenSearch emits much more detailed logs during reindex operations: -``` +```plaintext [2019-10-18T16:52:51,184][DEBUG][o.o.i.r.TransportReindexAction] [node1] [1626]: starting [2019-10-18T16:52:51,186][DEBUG][o.o.i.r.TransportReindexAction] [node1] executing initial scroll against [some-index] [2019-10-18T16:52:51,291][DEBUG][o.o.i.r.TransportReindexAction] [node1] scroll returned [3] documents with a scroll id of [DXF1Z==] @@ -95,12 +95,42 @@ There are other ways to change log levels: - `${sys:opensearch.logs.cluster_name}` is the name of the cluster. - `[%node_name]` is the name of the node. +## Search request slow logs + +New in version 2.12, OpenSearch offers request-level slow logs for search. These logs rely on thresholds to define what qualifies as "slow." All requests which exceed the threshold are logged. -## Slow logs +Search request slow logs are enabled dynamically through the [Cluster Settings API]({{site.url}}{{site.baseurl}}/api-reference/cluster-api/cluster-settings/). Unlike shard slow logs, search request slow log thresholds are configured for total request took time. By default, logs are disabled (all thresholds are set to `-1`). -OpenSearch has two *slow logs*, logs that help you identify performance issues: the search slow log and the indexing slow log. +```json +PUT /_cluster/settings +{ +"persistent" : { + "cluster.search.request.slowlog.level" : "TRACE", + "cluster.search.request.slowlog.threshold.warn": "10s", + "cluster.search.request.slowlog.threshold.info": "5s", + "cluster.search.request.slowlog.threshold.debug": "2s", + "cluster.search.request.slowlog.threshold.trace": "10ms" +} +} +``` +{% include copy-curl.html %} -These logs rely on thresholds to define what qualifies as a "slow" search or "slow" indexing operation. For example, you might decide that a query is slow if it takes more than 15 seconds to complete. Unlike application logs, which you configure for modules, you configure slow logs for indexes. By default, both logs are disabled (all thresholds are set to `-1`): +A line from `opensearch_index_search_slowlog.log` might look like this: + +```plaintext +[2023-10-30T15:47:42,630][TRACE][c.s.r.slowlog] [runTask-0] took[80.8ms], took_millis[80], phase_took_millis[{expand=0, query=39, fetch=22}], total_hits[4 hits], search_type[QUERY_THEN_FETCH], shards[{total: 10, successful: 10, skipped: 0, failed: 0}], source[{"query":{"match_all":{"boost":1.0}}}], id[] +``` + +Search request slow logs can consume considerable disk space and affect performance if you set low threshold values. Consider enabling them temporarily for troubleshooting or performance tuning. To disable search request slow logs, return all thresholds to `-1`. 
+{: .important} + +## Shard slow logs + +OpenSearch has two *shard slow logs*, logs that help you identify performance issues: the search slow log and the indexing slow log. + +These logs rely on thresholds to define what qualifies as a "slow" search or "slow" indexing operation. For example, you might decide that a query is slow if it takes more than 15 seconds to complete. Unlike application logs, which you configure for modules, you configure slow logs for indexes. By default, both logs are disabled (all thresholds are set to `-1`). + +Unlike search request slow logs, shard slow log thresholds are configured for individual shard took time. ```json GET /_settings?include_defaults=true @@ -174,17 +204,17 @@ In this example, OpenSearch logs indexing operations that take 15 seconds or lon A line from `opensearch_index_indexing_slowlog.log` might look like this: -``` +```plaintext node1 | [2019-10-24T19:48:51,012][WARN][i.i.s.index] [node1] [some-index/i86iF5kyTyy-PS8zrdDeAA] took[3.4ms], took_millis[3], type[_doc], id[1], routing[], source[{"title":"Your Name", "Director":"Makoto Shinkai"}] ``` -Slow logs can consume considerable disk space if you set thresholds or levels too low. Consider enabling them temporarily for troubleshooting or performance tuning. To disable slow logs, return all thresholds to `-1`. +Shard slow logs can consume considerable disk space and affect performance if you set low threshold values. Consider enabling them temporarily for troubleshooting or performance tuning. To disable shard slow logs, return all thresholds to `-1`. ## Task logs OpenSearch can log CPU time and memory utilization for the top N memory-expensive search tasks when task resource consumers are enabled. By default, task resource consumers will log the top 10 search tasks at 60 second intervals. These values can be configured in `opensearch.yml`. -Task logging is enabled dynamically through the cluster settings API: +Task logging is enabled dynamically through the [Cluster Settings API]({{site.url}}{{site.baseurl}}/api-reference/cluster-api/cluster-settings/): ```json PUT _cluster/settings diff --git a/_install-and-configure/configuring-opensearch/network-settings.md b/_install-and-configure/configuring-opensearch/network-settings.md index 4084bd6660..3728c55bc2 100644 --- a/_install-and-configure/configuring-opensearch/network-settings.md +++ b/_install-and-configure/configuring-opensearch/network-settings.md @@ -48,3 +48,12 @@ OpenSearch supports the following advanced network settings for transport commun - `transport.bind_host` (Static, list): Specifies an address or addresses to which an OpenSearch node binds to listen for incoming transport connections. - `transport.publish_host` (Static, list): Specifies an address or addresses that an OpenSearch node publishes to other nodes for transport communication. + +## Selecting the transport + +The default OpenSearch transport is provided by the `transport-netty4` module and uses the [Netty 4](https://netty.io/) engine for both internal TCP-based communication between nodes in the cluster and external HTTP-based communication with clients. This communication is fully asynchronous and non-blocking. However, there are other transport plugins available that can be used interchangeably: + +Plugin | Description +:---------- | :-------- +`transport-nio` | The OpenSearch transport based on Java NIO.
Installation: `./bin/opensearch-plugin install transport-nio` <br> Configuration (using `opensearch.yml`): <br> `transport.type: nio-transport` <br> `http.type: nio-http-transport`
+`transport-reactor-netty4` | The OpenSearch HTTP transport based on [Project Reactor](https://github.com/reactor/reactor-netty) and Netty 4 (**experimental**) <br> Installation: `./bin/opensearch-plugin install transport-reactor-netty4` <br> Configuration (using `opensearch.yml`): <br>
`http.type: reactor-netty4` diff --git a/_install-and-configure/configuring-opensearch/plugin-settings.md b/_install-and-configure/configuring-opensearch/plugin-settings.md index 64643cad77..cc2212fb1e 100644 --- a/_install-and-configure/configuring-opensearch/plugin-settings.md +++ b/_install-and-configure/configuring-opensearch/plugin-settings.md @@ -25,6 +25,10 @@ For information about asynchronous search settings, see [Asynchronous Search set For information about cross-cluster replication settings, see [Replication settings]({{site.url}}{{site.baseurl}}/tuning-your-cluster/replication-plugin/settings/). +## Flow Framework plugin settings + +For information about automatic workflow settings, see [Workflow settings]({{site.url}}{{site.baseurl}}/automating-configurations/workflow-settings/). + ## Geospatial plugin settings For information about the Geospatial plugin's IP2Geo processor settings, see [Cluster settings]({{site.url}}{{site.baseurl}}/ingest-pipelines/processors/ip2geo/#cluster-settings). @@ -79,6 +83,10 @@ The Notifications plugin supports the following settings. All settings in this l - `opensearch.notifications.general.filter_by_backend_roles` (Boolean): Enables filtering by backend roles (role-based access control for the notification channels). Default is `false`. +## Query Insights plugin settings + +For information about Query Insights plugin settings, see [Query insights settings]({{site.url}}{{site.baseurl}}/observing-your-data/query-insights/index#query-insights-settings). + ## Security plugin settings For information about the Security plugin settings, see [Security settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/security-settings/). diff --git a/_install-and-configure/configuring-opensearch/search-settings.md b/_install-and-configure/configuring-opensearch/search-settings.md index 3890097e6c..ae1fd1330e 100644 --- a/_install-and-configure/configuring-opensearch/search-settings.md +++ b/_install-and-configure/configuring-opensearch/search-settings.md @@ -17,9 +17,9 @@ OpenSearch supports the following search settings: - `search.default_allow_partial_results` (Dynamic, Boolean): A cluster-level setting that allows returning partial search results if a request times out or a shard fails. If a search request contains an `allow_partial_search_results` parameter, the parameter takes precedence over this setting. Default is `true`. -- `search.cancel_after_time_interval` (Dynamic, time unit): A cluster-level setting that specifies the maximum amount of time that a search request can run before it is canceled at the shard level. After this time has been reached, a request is stopped and all associated tasks are canceled. Default is `-1`. +- `search.cancel_after_time_interval` (Dynamic, time unit): A cluster-level setting that sets the default timeout for all search requests at the coordinating node level. After the specified time has been reached, the request is stopped and all associated tasks are canceled. Default is `-1` (no timeout). -- `search.default_search_timeout` (Dynamic, time unit): A cluster-level setting that sets the default timeout for all search requests at the coordinating node level. If the `timeout` is specified in the search request, it takes precedence over this setting. Default is `-1` (no timeout). +- `search.default_search_timeout` (Dynamic, time unit): A cluster-level setting that specifies the maximum amount of time that a search request can run before the request is canceled at the shard-level. 
If the `timeout` interval is specified in the search request, that interval takes precedence over the configured setting. Default is `-1`. - `search.default_keep_alive` (Dynamic, time unit): Specifies the default keep alive value for scroll and Point in Time (PIT) searches. Because a request may land on a shard multiple times (for example, during the query and fetch phases), OpenSearch opens a _request context_ that exists for the full duration of the request to ensure consistency of the shard state for each individual shard request. In a standard search, once the fetch phase completes, the request context is closed. For a scroll or a PIT search, OpenSearch keeps the request context open until explicitly closed (or until the keep alive time is reached). A background thread periodically checks all open scroll and PIT contexts and deletes the ones that have exceeded their keep alive timeout. The `search.keep_alive_interval` setting specifies how frequently the contexts are checked for expiration. The `search.default_keep_alive` setting is the default deadline for expiration. A scroll or PIT request can explicitly specify the keep alive, which takes precedence over this setting. Default is `5m`. diff --git a/_install-and-configure/install-dashboards/debian.md b/_install-and-configure/install-dashboards/debian.md index 79a8166f39..73aba46cd4 100644 --- a/_install-and-configure/install-dashboards/debian.md +++ b/_install-and-configure/install-dashboards/debian.md @@ -129,5 +129,46 @@ By default, OpenSearch Dashboards, like OpenSearch, binds to `localhost` when yo sudo systemctl restart opensearch-dashboards ``` 1. From a web browser, navigate to OpenSearch Dashboards. The default port is 5601. -1. Log in with the default username `admin` and the default password `admin`. +1. Log in with the default username `admin` and the default password `admin`. (For OpenSearch 2.12 and later, the password should be the custom admin password) 1. Visit [Getting started with OpenSearch Dashboards]({{site.url}}{{site.baseurl}}/dashboards/index/) to learn more. + + +## Upgrade to a newer version + +OpenSearch Dashboards instances installed using `dpkg` or `apt-get` can be easily upgraded to a newer version. + +### Manual upgrade with DPKG + +Download the Debian package for the desired upgrade version directly from the [OpenSearch Project downloads page](https://opensearch.org/downloads.html){:target='\_blank'}. 
+ +Navigate to the directory containing the distribution and run the following command: + +```bash +sudo dpkg -i opensearch-dashboards-{{site.opensearch_version}}-linux-x64.deb +``` +{% include copy.html %} + +### APT-GET + +To upgrade to the latest version of OpenSearch Dashboards using `apt-get`, run the following command: + +```bash +sudo apt-get upgrade opensearch-dashboards +``` +{% include copy.html %} + +You can also upgrade to a specific OpenSearch Dashboards version by providing the version number: + +```bash +sudo apt-get upgrade opensearch-dashboards= +``` +{% include copy.html %} + +### Automatically restart the service after a package upgrade (2.13.0+) + +To automatically restart OpenSearch Dashboards after a package upgrade, enable the `opensearch-dashboards.service` through `systemd`: + +```bash +sudo systemctl enable opensearch-dashboards.service +``` +{% include copy.html %} diff --git a/_install-and-configure/install-dashboards/helm.md b/_install-and-configure/install-dashboards/helm.md index f9df5b137c..58ca995c3e 100644 --- a/_install-and-configure/install-dashboards/helm.md +++ b/_install-and-configure/install-dashboards/helm.md @@ -34,7 +34,7 @@ Before you get started, you must first use [Helm to install OpenSearch]({{site.u Make sure that you can send requests to your OpenSearch pod: ```json -$ curl -XGET https://localhost:9200 -u 'admin:admin' --insecure +$ curl -XGET https://localhost:9200 -u 'admin:' --insecure { "name" : "opensearch-cluster-master-1", "cluster_name" : "opensearch-cluster", diff --git a/_install-and-configure/install-dashboards/index.md b/_install-and-configure/install-dashboards/index.md index 5dc9cb00b0..58c2051f0b 100644 --- a/_install-and-configure/install-dashboards/index.md +++ b/_install-and-configure/install-dashboards/index.md @@ -31,6 +31,41 @@ OpenSearch Dashboards supports the following web browsers: Other Chromium-based browsers might work, as well. Internet Explorer and Microsoft Edge Legacy are **not** supported. +## Node.js compatibility + +OpenSearch Dashboards requires the Node.js runtime binary to run. One is included in the distribution packages available from the [OpenSearch downloads page](https://opensearch.org/downloads.html){:target='\_blank'}. + +OpenSearch Dashboards 2.8.0 and newer can use Node.js versions 14, 16, and 18. The distribution packages for OpenSearch Dashboards 2.10.0 and newer include Node.js 18 and 14 (for backward compatibility). + +To use a Node.js runtime binary other than the ones included in the distribution packages, follow these steps: + +1. Download and install [Node.js](https://nodejs.org/en/download){:target='\_blank'}; the compatible versions are `>=14.20.1 <19`. +2. Set the installation path to the `NODE_HOME` or `NODE_OSD_HOME` environment variables. 
+ + - On UNIX, if Node.js is installed to `/usr/local/nodejs` and the runtime binary is `/usr/local/nodejs/bin/node`: + ```bash + export NODE_HOME=/usr/local/nodejs + ``` + + - If Node.js is installed using NVM and the runtime binary is `/Users/user/.nvm/versions/node/v18.19.0/bin/node`: + ```bash + export NODE_HOME=/Users/user/.nvm/versions/node/v18.19.0 + # or, if NODE_HOME is used for something else: + export NODE_OSD_HOME=/Users/user/.nvm/versions/node/v18.19.0 + ``` + + - On Windows, if Node.js is installed to `C:\Program Files\nodejs` and the runtime binary is `C:\Program Files\nodejs\node.exe`: + ```powershell + set "NODE_HOME=C:\Program Files\nodejs" + # or using PowerShell: + $Env:NODE_HOME = 'C:\Program Files\nodejs' + ``` + + Consult your operating system's documentation to make a persistent change to the environment variables. + +The OpenSearch Dashboards start script,`bin/opensearch-dashboards`, searches for the Node.js runtime binary using `NODE_OSD_HOME`, +and then `NODE_HOME`, before using the binaries included with the distribution packages. If a usable Node.js runtime binary is not found, the start script will attempt to find one in the system-wide `PATH` before failing. + ## Configuration To learn how to configure TLS for OpenSearch Dashboards, see [Configure TLS]({{site.url}}{{site.baseurl}}/install-and-configure/install-dashboards/tls/). diff --git a/_install-and-configure/install-dashboards/plugins.md b/_install-and-configure/install-dashboards/plugins.md index 5db4916a59..6a15e65f1f 100644 --- a/_install-and-configure/install-dashboards/plugins.md +++ b/_install-and-configure/install-dashboards/plugins.md @@ -9,7 +9,7 @@ redirect_from: # Managing OpenSearch Dashboards plugins -OpenSearch Dashboards provides a command line tool called `opensearch-plugin` for managing plugins. This tool allows you to: +OpenSearch Dashboards provides a command line tool called `opensearch-dashboards-plugin` for managing plugins. This tool allows you to: - List installed plugins. - Install plugins. @@ -36,7 +36,7 @@ The following table lists available OpenSearch Dashboards plugins. | Anomaly Detection Dashboards | [anomaly-detection-dashboards-plugin](https://github.com/opensearch-project/anomaly-detection-dashboards-plugin) | 1.0.0 | | Custom Import Maps Dashboards | [dashboards-maps](https://github.com/opensearch-project/dashboards-maps) | 2.2.0 | | Search Relevance Dashboards | [dashboards-search-relevance](https://github.com/opensearch-project/dashboards-search-relevance) | 2.4.0 | -| Gantt Chart Dashboards | [gantt-chart](https://github.com/opensearch-project/dashboards-visualizations/tree/main/gantt-chart) | 1.0.0 | +| Gantt Chart Dashboards | [gantt-chart](https://github.com/opensearch-project/dashboards-visualizations) | 1.0.0 | | Index Management Dashboards | [index-management-dashboards-plugin](https://github.com/opensearch-project/index-management-dashboards-plugin) | 1.0.0 | | Notebooks Dashboards | [dashboards-notebooks](https://github.com/opensearch-project/dashboards-notebooks) | 1.0.0 | | Notifications Dashboards | [dashboards-notifications](https://github.com/opensearch-project/dashboards-notifications) | 2.0.0 | diff --git a/_install-and-configure/install-dashboards/rpm.md b/_install-and-configure/install-dashboards/rpm.md index d250c4c1f3..cc5974c91e 100644 --- a/_install-and-configure/install-dashboards/rpm.md +++ b/_install-and-configure/install-dashboards/rpm.md @@ -89,4 +89,41 @@ YUM, the primary package management tool for Red Hat-based operating systems, al 1. 
Once complete, you can run OpenSearch Dashboards. ```bash sudo systemctl start opensearch-dashboards - ``` \ No newline at end of file + ``` + +## Upgrade to a newer version + +OpenSearch Dashboards instances installed using RPM or YUM can be easily upgraded to a newer version. We recommend using YUM, but you can also choose RPM. + + +### Manual upgrade with RPM + +Download the RPM package for the desired upgrade version directly from the [OpenSearch Project downloads page](https://opensearch.org/downloads.html){:target='\_blank'}. + +Navigate to the directory containing the distribution and run the following command: + +```bash +rpm -Uvh opensearch-dashboards-{{site.opensearch_version}}-linux-x64.rpm +``` +{% include copy.html %} + +### YUM + +To upgrade to the latest version of OpenSearch Dashboards using YUM, run the following command: + +```bash +sudo yum update opensearch-dashboards +``` +{% include copy.html %} + +You can also upgrade to a specific OpenSearch Dashboards version by providing the version number: + + ```bash + sudo yum update opensearch-dashboards- + ``` + {% include copy.html %} + +### Automatically restart the service after a package upgrade + +The OpenSearch Dashboards RPM package does not currently support automatically restarting the service after a package upgrade. + diff --git a/_install-and-configure/install-opensearch/debian.md b/_install-and-configure/install-opensearch/debian.md index 77b0473c71..72ae05d87c 100644 --- a/_install-and-configure/install-opensearch/debian.md +++ b/_install-and-configure/install-opensearch/debian.md @@ -45,7 +45,15 @@ This guide assumes that you are comfortable working from the Linux command line # arm64 sudo dpkg -i opensearch-{{site.opensearch_version}}-linux-arm64.deb ``` + For OpenSearch 2.12 and greater, a custom admin password is required in order to set up a security demo configuration. To set a custom admin password, use one the following commands: + ```bash + # x64 + sudo env OPENSEARCH_INITIAL_ADMIN_PASSWORD= dpkg -i opensearch-{{site.opensearch_version}}-linux-x64.deb + # arm64 + sudo env OPENSEARCH_INITIAL_ADMIN_PASSWORD= dpkg -i opensearch-{{site.opensearch_version}}-linux-arm64.deb + ``` + 1. After the installation succeeds, enable OpenSearch as a service. ```bash sudo systemctl enable opensearch @@ -175,7 +183,7 @@ An OpenSearch node in its default configuration (with demo certificates and user 1. Send requests to the server to verify that OpenSearch is running. Note the use of the `--insecure` flag, which is required because the TLS certificates are self-signed. - Send a request to port 9200: ```bash - curl -X GET https://localhost:9200 -u 'admin:admin' --insecure + curl -X GET https://localhost:9200 -u 'admin:' --insecure ``` {% include copy.html %} @@ -201,7 +209,7 @@ An OpenSearch node in its default configuration (with demo certificates and user ``` - Query the plugins endpoint: ```bash - curl -X GET https://localhost:9200/_cat/plugins?v -u 'admin:admin' --insecure + curl -X GET https://localhost:9200/_cat/plugins?v -u 'admin:' --insecure ``` {% include copy.html %} @@ -396,7 +404,7 @@ TLS certificates provide additional security for your cluster by allowing client ### Configure a user -Users are defined and authenticated by OpenSearch in a variety of ways. One method that does not require additional backend infrastructure is to manually configure users in `internal_users.yml`. See [YAML files]({{site.url}}{{site.baseurl}}/security-plugin/configuration/yaml/) for more information about configuring users. 
The following steps explain how to remove all demo users except for the `admin` user and how to replace the `admin` default password using a script. +Users are defined and authenticated by OpenSearch in a variety of ways. One method that does not require additional backend infrastructure is to manually configure users in `internal_users.yml`. See [YAML files]({{site.url}}{{site.baseurl}}/security-plugin/configuration/yaml/) for more information about configuring users. The following steps explain how to add a new internal user and how to replace the `admin` default password using a script. 1. Navigate to the Security plugins tools directory. ```bash @@ -432,7 +440,7 @@ Users are defined and authenticated by OpenSearch in a variety of ways. One meth ``` {% include copy.html %} -1. Remove all demo users except for `admin` and replace the hash with the output provided by `hash.sh` in a previous step. The file should look similar to the following example: +1. Add a new internal user and replace the hash inside `internal_users.yml` with the output provided by `hash.sh` in step 2. The file should look similar to the following example: ```bash --- # This is the internal user database @@ -450,6 +458,15 @@ Users are defined and authenticated by OpenSearch in a variety of ways. One meth backend_roles: - "admin" description: "Admin user" + + user1: + hash: "$2y$12$zoHpvTCRjjQr6h0PEaabueCaGam3/LDvT6rZZGDGMusD7oehQjw/O" + reserved: false + backend_roles: [] + description: "New internal user" + + # Other users + ... ``` {% include copy.html %} @@ -511,7 +528,7 @@ OpenSearch instances installed using `dpkg` or `apt-get` can be easily upgraded ### Manual upgrade with DPKG -Download the Debian package for the desired upgrade version directly from the [OpenSearch downloads page](https://opensearch.org/downloads.html){:target='\_blank'}. +Download the Debian package for the desired upgrade version directly from the [OpenSearch Project downloads page](https://opensearch.org/downloads.html){:target='\_blank'}. Navigate to the directory containing the distribution and run the following command: ```bash @@ -533,6 +550,15 @@ sudo apt-get upgrade opensearch= ``` {% include copy.html %} +### Automatically restart the service after a package upgrade (2.13.0+) + +To automatically restart OpenSearch after a package upgrade, enable the `opensearch.service` through `systemd`: + +```bash +sudo systemctl enable opensearch.service +``` +{% include copy.html %} + ## Related links - [OpenSearch configuration]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/) diff --git a/_install-and-configure/install-opensearch/docker.md b/_install-and-configure/install-opensearch/docker.md index 48c9a5f215..189d647ce5 100644 --- a/_install-and-configure/install-opensearch/docker.md +++ b/_install-and-configure/install-opensearch/docker.md @@ -29,9 +29,11 @@ Docker Compose is a utility that allows users to launch multiple containers with If you need to install Docker Compose manually and your host supports Python, you can use [pip](https://pypi.org/project/pip/) to install the [Docker Compose package](https://pypi.org/project/docker-compose/) automatically. {: .tip} -## Important host settings +## Configure important host settings +Before installing OpenSearch using Docker, configure the following settings. 
These are the most important settings that can affect the performance of your services, but for additional information, see [important system settings]({{site.url}}{{site.baseurl}}/install-and-configure/install-opensearch/index/#important-settings){:target='\_blank'}. -Before launching OpenSearch you should review some [important system settings]({{site.url}}{{site.baseurl}}/install-and-configure/install-opensearch/index/#important-settings){:target='\_blank'} that can impact the performance of your services. +### Linux settings +For a Linux environment, run the following commands: 1. Disable memory paging and swapping performance on the host to improve performance. ```bash @@ -54,6 +56,14 @@ Before launching OpenSearch you should review some [important system settings]({ cat /proc/sys/vm/max_map_count ``` +### Windows settings +For Windows workloads using WSL through Docker Desktop, run the following commands in a terminal to set the `vm.max_map_count`: + +```bash +wsl -d docker-desktop +sysctl -w vm.max_map_count=262144 +``` + ## Run OpenSearch in a Docker container Official OpenSearch images are hosted on [Docker Hub](https://hub.docker.com/u/opensearchproject/) and [Amazon ECR](https://gallery.ecr.aws/opensearchproject/). If you want to inspect the images you can pull them individually using `docker pull`, such as in the following examples. @@ -90,9 +100,13 @@ Before continuing, you should verify that Docker is working correctly by deployi # This command maps ports 9200 and 9600, sets the discovery type to "single-node" and requests the newest image of OpenSearch docker run -d -p 9200:9200 -p 9600:9600 -e "discovery.type=single-node" opensearchproject/opensearch:latest ``` + For OpenSearch 2.12 or greater, set a new custom admin password before installation using the following command: + ```bash + docker run -d -p 9200:9200 -p 9600:9600 -e "discovery.type=single-node" -e "OPENSEARCH_INITIAL_ADMIN_PASSWORD=" opensearchproject/opensearch:latest + ``` 1. Send a request to port 9200. The default username and password are `admin`. ```bash - curl https://localhost:9200 -ku 'admin:admin' + curl https://localhost:9200 -ku 'admin:' ``` {% include copy.html %} @@ -149,7 +163,19 @@ You can specify a custom file location and name when invoking `docker-compose` w docker-compose -f /path/to/your-file.yml up ``` -If this is your first time launching an OpenSearch cluster using Docker Compose, use the following example `docker-compose.yml` file. Save it in the home directory of your host and name it `docker-compose.yml`. This file will create a cluster that contains three containers: two containers running the OpenSearch service and a single container running OpenSearch Dashboards. These containers will communicate over a bridge network called `opensearch-net` and use two volumes, one for each OpenSearch node. Because this file does not explicitly disable the demo security configuration, self-signed TLS certificates are installed and internal users with default names and passwords are created. +If this is your first time launching an OpenSearch cluster using Docker Compose, use the following example `docker-compose.yml` file. Save it in the home directory of your host and name it `docker-compose.yml`. This file creates a cluster that contains three containers: two containers running the OpenSearch service and a single container running OpenSearch Dashboards. These containers communicate over a bridge network called `opensearch-net` and use two volumes, one for each OpenSearch node. 
Because this file does not explicitly disable the demo security configuration, self-signed TLS certificates are installed and internal users with default names and passwords are created. + +### Setting a custom admin password + +Starting with OpenSearch 2.12, a custom admin password is required to set up a demo security configuration. Do one of the following: + +- Before running `docker-compose.yml`, set a new custom admin password using the following command: + ``` + export OPENSEARCH_INITIAL_ADMIN_PASSWORD= + ``` + {% include copy.html %} + +- Create an `.env` file in the same folder as your `docker-compose.yml` file with the `OPENSEARCH_INITIAL_ADMIN_PASSWORD` and a strong password value. ### Sample docker-compose.yml @@ -166,6 +192,7 @@ services: - cluster.initial_cluster_manager_nodes=opensearch-node1,opensearch-node2 # Nodes eligible to serve as cluster manager - bootstrap.memory_lock=true # Disable JVM heap memory swapping - "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m" # Set min and max JVM heap sizes to at least 50% of system RAM + - OPENSEARCH_INITIAL_ADMIN_PASSWORD=${OPENSEARCH_INITIAL_ADMIN_PASSWORD} # Sets the demo admin user password when using demo configuration, required for OpenSearch 2.12 and later ulimits: memlock: soft: -1 # Set memlock to unlimited (no soft or hard limit) @@ -190,6 +217,7 @@ services: - cluster.initial_cluster_manager_nodes=opensearch-node1,opensearch-node2 - bootstrap.memory_lock=true - "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m" + - OPENSEARCH_INITIAL_ADMIN_PASSWORD=${OPENSEARCH_INITIAL_ADMIN_PASSWORD} ulimits: memlock: soft: -1 diff --git a/_install-and-configure/install-opensearch/helm.md b/_install-and-configure/install-opensearch/helm.md index b1df2da091..ed2aa628e3 100644 --- a/_install-and-configure/install-opensearch/helm.md +++ b/_install-and-configure/install-opensearch/helm.md @@ -98,6 +98,14 @@ You can also build the `opensearch-1.0.0.tgz` file manually: ``` {% include copy.html %} +For OpenSearch 2.12 or greater, customize the admin password in `values.yaml` under `extraEnvs`, as shown in the following example: + +```yaml +extraEnvs: + - name: OPENSEARCH_INITIAL_ADMIN_PASSWORD + value: +``` + #### Sample output ```yaml diff --git a/_install-and-configure/install-opensearch/index.md b/_install-and-configure/install-opensearch/index.md index fe94fc2347..e5d66ae560 100644 --- a/_install-and-configure/install-opensearch/index.md +++ b/_install-and-configure/install-opensearch/index.md @@ -17,15 +17,17 @@ This section details how to install OpenSearch on your host, including which ope ## Operating system compatibility -OpenSearch and OpenSearch Dashboards are compatible with Red Hat Enterprise Linux (RHEL) and Debian-based Linux distributions that use [`systemd`](https://en.wikipedia.org/wiki/Systemd), such as CentOS, Amazon Linux 2, and Ubuntu Long-Term Support (LTS). While OpenSearch and OpenSearch Dashboards should work on most Linux distributions, we only test a subset. +OpenSearch and OpenSearch Dashboards are compatible with Red Hat Enterprise Linux (RHEL) and Debian-based Linux distributions that use [`systemd`](https://en.wikipedia.org/wiki/Systemd), such as Amazon Linux, and Ubuntu Long-Term Support (LTS). While OpenSearch and OpenSearch Dashboards should work on most Linux distributions, we only test a subset. -The following table lists the operating system versions that we currently support. 
+The following table lists the operating system versions that we are currently testing on: OS | Version :---------- | :-------- -RHEL/CentOS | 7/8 -Rocky Linux | 8 -Ubuntu | 16.04/18.04/20.04 +CentOS | 7 +Rocky Linux | 8 +Alma Linux | 8 +Amazon Linux | 2/2023 +Ubuntu | 20.04 Windows Server | 2019 @@ -39,9 +41,10 @@ The OpenSearch distribution for Linux ships with a compatible [Adoptium JDK](htt OpenSearch Version | Compatible Java Versions | Bundled Java Version :---------- | :-------- | :----------- -1.0 - 1.2.x | 11, 15 | 15.0.1+9 -1.3.x | 8, 11, 14 | 11.0.14.1+1 -2.0.0 | 11, 17 | 17.0.2+8 +1.0--1.2.x | 11, 15 | 15.0.1+9 +1.3.x | 8, 11, 14 | 11.0.22+7 +2.0.0--2.11.x | 11, 17 | 17.0.2+8 +2.12.0 | 11, 17, 21 | 21.0.2+13 To use a different Java installation, set the `OPENSEARCH_JAVA_HOME` or `JAVA_HOME` environment variable to the Java install location. For example: ```bash @@ -113,5 +116,8 @@ OpenSearch has a number of system properties, listed in the following table, tha Property | Description :---------- | :-------- -`opensearch.xcontent.string.length.max=` | By default, OpenSearch does not impose any limits on the maximum length of the JSON string fields. To protect your cluster from potential distributed denial-of-service (DDoS) or memory issues, you can set the `opensearch.xcontent.string.length.max` system property to a reasonable limit (the maximum is 2,147,483,647), for example, `-Dopensearch.xcontent.string.length.max=5000000`. | +`opensearch.xcontent.string.length.max=` | By default, OpenSearch does not impose any limits on the maximum length of the JSON/YAML/CBOR/Smile string fields. To protect your cluster against potential distributed denial-of-service (DDoS) or memory issues, you can set the `opensearch.xcontent.string.length.max` system property to a reasonable limit (the maximum is 2,147,483,647), for example, `-Dopensearch.xcontent.string.length.max=5000000`. | `opensearch.xcontent.fast_double_writer=[true|false]` | By default, OpenSearch serializes floating-point numbers using the default implementation provided by the Java Runtime Environment. Set this value to `true` to use the Schubfach algorithm, which is faster but may lead to small differences in precision. Default is `false`. | +`opensearch.xcontent.name.length.max=` | By default, OpenSearch does not impose any limits on the maximum length of the JSON/YAML/CBOR/Smile field names. To protect your cluster against potential DDoS or memory issues, you can set the `opensearch.xcontent.name.length.max` system property to a reasonable limit (the maximum is 2,147,483,647), for example, `-Dopensearch.xcontent.name.length.max=50000`. | +`opensearch.xcontent.depth.max=` | By default, OpenSearch does not impose any limits on the maximum nesting depth for JSON/YAML/CBOR/Smile documents. To protect your cluster against potential DDoS or memory issues, you can set the `opensearch.xcontent.depth.max` system property to a reasonable limit (the maximum is 2,147,483,647), for example, `-Dopensearch.xcontent.depth.max=1000`. | +`opensearch.xcontent.codepoint.max=` | By default, OpenSearch imposes a limit of `52428800` on the maximum size of the YAML documents (in code points). To protect your cluster against potential DDoS or memory issues, you can change the `opensearch.xcontent.codepoint.max` system property to a reasonable limit (the maximum is 2,147,483,647). For example, `-Dopensearch.xcontent.codepoint.max=5000000`.
| diff --git a/_install-and-configure/install-opensearch/rpm.md b/_install-and-configure/install-opensearch/rpm.md index 7880e44d32..a22ea96d61 100644 --- a/_install-and-configure/install-opensearch/rpm.md +++ b/_install-and-configure/install-opensearch/rpm.md @@ -38,12 +38,30 @@ This guide assumes that you are comfortable working from the Linux command line 1. Download the RPM package for the desired version directly from the [OpenSearch downloads page](https://opensearch.org/downloads.html){:target='\_blank'}. The RPM package can be downloaded for both **x64** and **arm64** architectures. 1. Import the public GNU Privacy Guard (GPG) key. This key verifies that your OpenSearch instance is signed. + ```bash sudo rpm --import https://artifacts.opensearch.org/publickeys/opensearch.pgp ``` {% include copy.html %} + +1. For OpenSearch 2.12 and greater, a custom admin password is required in order to set up a security demo configuration. To set a custom admin password, use one of the following commands: + ```bash + # Install the x64 package using yum. + sudo env OPENSEARCH_INITIAL_ADMIN_PASSWORD= yum install opensearch-{{site.opensearch_version}}-linux-x64.rpm + + # Install the x64 package using rpm. + sudo env OPENSEARCH_INITIAL_ADMIN_PASSWORD= rpm -ivh opensearch-{{site.opensearch_version}}-linux-x64.rpm + + # Install the arm64 package using yum. + sudo env OPENSEARCH_INITIAL_ADMIN_PASSWORD= yum install opensearch-{{site.opensearch_version}}-linux-arm64.rpm + + # Install the arm64 package using rpm. + sudo env OPENSEARCH_INITIAL_ADMIN_PASSWORD= rpm -ivh opensearch-{{site.opensearch_version}}-linux-arm64.rpm + ``` + 1. From the CLI, you can install the package with `rpm` or `yum`. + ```bash # Install the x64 package using yum. sudo yum install opensearch-{{site.opensearch_version}}-linux-x64.rpm @@ -57,23 +75,27 @@ This guide assumes that you are comfortable working from the Linux command line # Install the arm64 package using rpm. sudo rpm -ivh opensearch-{{site.opensearch_version}}-linux-x64.rpm ``` + 1. After the installation succeeds, enable OpenSearch as a service. - ```bash - sudo systemctl enable opensearch - ``` - {% include copy.html %} + + ```bash + sudo systemctl enable opensearch + ``` + {% include copy.html %} 1. Start OpenSearch. - ```bash - sudo systemctl start opensearch - ``` - {% include copy.html %} -1. Verify that OpenSearch launched correctly. - ```bash - sudo systemctl status opensearch - ``` - {% include copy.html %} + ```bash + sudo systemctl start opensearch + ``` + {% include copy.html %} + +1. Verify that OpenSearch launched correctly: + + ```bash + sudo systemctl status opensearch + ``` + {% include copy.html %} ### Install OpenSearch from a YUM repository @@ -147,7 +169,7 @@ An OpenSearch node in its default configuration (with demo certificates and user 1. Send requests to the server to verify that OpenSearch is running. Note the use of the `--insecure` flag, which is required because the TLS certificates are self-signed.
- Send a request to port 9200: ```bash - curl -X GET https://localhost:9200 -u 'admin:admin' --insecure + curl -X GET https://localhost:9200 -u 'admin:' --insecure ``` {% include copy.html %} @@ -173,7 +195,7 @@ An OpenSearch node in its default configuration (with demo certificates and user ``` - Query the plugins endpoint: ```bash - curl -X GET https://localhost:9200/_cat/plugins?v -u 'admin:admin' --insecure + curl -X GET https://localhost:9200/_cat/plugins?v -u 'admin:' --insecure ``` {% include copy.html %} @@ -478,7 +500,7 @@ OpenSearch instances installed using RPM or YUM can be easily upgraded to a newe ### Manual upgrade with RPM -Download the RPM package for the desired upgrade version directly from the [OpenSearch downloads page](https://opensearch.org/downloads.html){:target='\_blank'}. +Download the RPM package for the desired upgrade version directly from the [OpenSearch Project downloads page](https://opensearch.org/downloads.html){:target='\_blank'}. Navigate to the directory containing the distribution and run the following command: ```bash @@ -490,7 +512,7 @@ rpm -Uvh opensearch-{{site.opensearch_version}}-linux-x64.rpm To upgrade to the latest version of OpenSearch using YUM: ```bash -sudo yum update +sudo yum update opensearch ``` {% include copy.html %} @@ -500,9 +522,13 @@ sudo yum update ``` {% include copy.html %} +### Automatically restart the service after a package upgrade + +The OpenSearch RPM package does not currently support automatically restarting the service after a package upgrade. + ## Related links - [OpenSearch configuration]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/) - [Install and configure OpenSearch Dashboards]({{site.url}}{{site.baseurl}}/install-and-configure/install-dashboards/index/) - [OpenSearch plugin installation]({{site.url}}{{site.baseurl}}/opensearch/install/plugins/) -- [About the Security plugin]({{site.url}}{{site.baseurl}}/security/index/) \ No newline at end of file +- [About the Security plugin]({{site.url}}{{site.baseurl}}/security/index/) diff --git a/_install-and-configure/install-opensearch/tar.md b/_install-and-configure/install-opensearch/tar.md index c6edb51491..1e5fe47a12 100644 --- a/_install-and-configure/install-opensearch/tar.md +++ b/_install-and-configure/install-opensearch/tar.md @@ -100,10 +100,16 @@ An OpenSearch node configured by the demo security script is not suitable for a ``` {% include copy.html %} + For OpenSearch 2.12 or greater, set a new custom admin password before installation using the following command: + ```bash + $ export OPENSEARCH_INITIAL_ADMIN_PASSWORD= + ``` + {% include copy.html %} + 1. Open another terminal session and send requests to the server to verify that OpenSearch is running. Note the use of the `--insecure` flag, which is required because the TLS certificates are self-signed. 
- Send a request to port 9200: ```bash - curl -X GET https://localhost:9200 -u 'admin:admin' --insecure + curl -X GET https://localhost:9200 -u 'admin:' --insecure ``` {% include copy.html %} @@ -129,7 +135,7 @@ An OpenSearch node configured by the demo security script is not suitable for a ``` - Query the plugins endpoint: ```bash - curl -X GET https://localhost:9200/_cat/plugins?v -u 'admin:admin' --insecure + curl -X GET https://localhost:9200/_cat/plugins?v -u 'admin:' --insecure ``` {% include copy.html %} diff --git a/_install-and-configure/install-opensearch/windows.md b/_install-and-configure/install-opensearch/windows.md index b945c0e049..14595a0eed 100644 --- a/_install-and-configure/install-opensearch/windows.md +++ b/_install-and-configure/install-opensearch/windows.md @@ -47,18 +47,9 @@ An OpenSearch node in its default configuration (with demo certificates and user ### Option 1: Test your OpenSearch settings with security enabled -1. Run the demo batch script. +1. Run the demo batch script from Command prompt or Powershell. - There are two ways of running the batch script: - - 1. Run the batch script using the Windows UI: - - 1. Navigate to the top directory of your OpenSearch installation and open the `opensearch-{{site.opensearch_version}}` folder. - 1. Run the batch script by double-clicking the `opensearch-windows-install.bat` file. This opens a command prompt with an OpenSearch instance running. - - 1. Run the batch script from Command prompt or Powershell: - - 1. Open Command Prompt by entering `cmd`, or Powershell by entering `powershell`, in the search box next to **Start** on the taskbar. + 1. Open Command Prompt by entering `cmd` or Powershell by entering `powershell` in the search box next to **Start** on the taskbar. 1. Change to the top directory of your OpenSearch installation. ```bat cd \path\to\opensearch-{{site.opensearch_version}} @@ -66,6 +57,11 @@ An OpenSearch node in its default configuration (with demo certificates and user {% include copy.html %} 1. Run the batch script. + For OpenSearch 2.12 or later, use the following command to specify a custom admin password: + ```bat + > set OPENSEARCH_INITIAL_ADMIN_PASSWORD= + ``` + {% include copy.html %} ```bat .\opensearch-windows-install.bat ``` @@ -74,7 +70,7 @@ An OpenSearch node in its default configuration (with demo certificates and user 1. Open a new command prompt and send requests to the server to verify that OpenSearch is running. Note the use of the `--insecure` flag, which is required because the TLS certificates are self-signed. - Send a request to port 9200: ```bat - curl.exe -X GET https://localhost:9200 -u "admin:admin" --insecure + curl.exe -X GET https://localhost:9200 -u "admin:" --insecure ``` {% include copy.html %} @@ -100,7 +96,7 @@ An OpenSearch node in its default configuration (with demo certificates and user ``` - Query the plugins endpoint: ```bat - curl.exe -X GET https://localhost:9200/_cat/plugins?v -u "admin:admin" --insecure + curl.exe -X GET https://localhost:9200/_cat/plugins?v -u "admin:" --insecure ``` {% include copy.html %} diff --git a/_install-and-configure/plugins.md b/_install-and-configure/plugins.md index dcc8140f1e..6b0b28769e 100644 --- a/_install-and-configure/plugins.md +++ b/_install-and-configure/plugins.md @@ -11,7 +11,6 @@ redirect_from: You can install individual plugins for OpenSearch based on your needs. For information about available plugins, see [Available plugins](#available-plugins). 
- For plugins to work properly with OpenSearch, all plugins must have the ability to access the data in the cluster, including metadata about cluster operations. Therefore, to protect your cluster's data and preserve its integrity, first be sure you understand the function of a plugin before installing it on your OpenSearch cluster. Second, when selecting a custom plugin, make sure the plugin's source is a reliable one. {: .warning} @@ -32,12 +31,12 @@ If you are running OpenSearch in a Docker container, plugins must be installed, Use `list` to see a list of plugins that have already been installed. -#### Usage: +#### Usage ```bash bin/opensearch-plugin list ``` -#### Example: +#### Example ```bash $ ./opensearch-plugin list opensearch-alerting @@ -66,7 +65,7 @@ You can also list installed plugins by using the [CAT API]({{site.url}}{{site.ba GET _cat/plugins ``` -#### Sample response +#### Example response ```bash opensearch-node1 opensearch-alerting 2.0.1.0 @@ -85,20 +84,20 @@ opensearch-node1 opensearch-notifications-core 2.0.1.0 There are three ways to install plugins using the `opensearch-plugin`: -- [Install a plugin by name]({{site.url}}{{site.baseurl}}/opensearch/install/plugins#install-a-plugin-by-name) -- [Install a plugin by from a zip file]({{site.url}}{{site.baseurl}}/opensearch/install/plugins#install-a-plugin-from-a-zip-file) -- [Install a plugin using Maven coordinates]({{site.url}}{{site.baseurl}}/opensearch/install/plugins#install-a-plugin-using-maven-coordinates) +- [Install a plugin by name](#install-a-plugin-by-name). +- [Install a plugin from a ZIP file](#install-a-plugin-from-a-zip-file). +- [Install a plugin using Maven coordinates](#install-a-plugin-using-maven-coordinates). ### Install a plugin by name: -For a list of plugins that can be installed by name, see [Additional plugins]({{site.url}}{{site.baseurl}}/opensearch/install/plugins#additional-plugins). +For a list of plugins that can be installed by name, see [Additional plugins](#additional-plugins). -#### Usage: +#### Usage ```bash bin/opensearch-plugin install ``` -#### Example: +#### Example ```bash $ sudo ./opensearch-plugin install analysis-icu -> Installing analysis-icu @@ -107,16 +106,16 @@ $ sudo ./opensearch-plugin install analysis-icu -> Installed analysis-icu with folder name analysis-icu ``` -### Install a plugin from a zip file: +### Install a plugin from a zip file Remote zip files can be installed by replacing `` with the URL of the hosted file. The tool only supports downloading over HTTP/HTTPS protocols. For local zip files, replace `` with `file:` followed by the absolute or relative path to the plugin zip file as in the second example below. -#### Usage: +#### Usage ```bash bin/opensearch-plugin install ``` -#### Example: +#### Example ```bash # Zip file is hosted on a remote server - in this case, Maven central repository. $ sudo ./opensearch-plugin install https://repo1.maven.org/maven2/org/opensearch/plugin/opensearch-anomaly-detection/2.2.0.0/opensearch-anomaly-detection-2.2.0.0.zip @@ -167,16 +166,16 @@ Continue with installation? [y/N]y -> Installed opensearch-anomaly-detection with folder name opensearch-anomaly-detection ``` -### Install a plugin using Maven coordinates: +### Install a plugin using Maven coordinates The `opensearch-plugin install` tool also accepts Maven coordinates for available artifacts and versions hosted on [Maven Central](https://search.maven.org/search?q=org.opensearch.plugin). 
`opensearch-plugin` will parse the Maven coordinates you provide and construct a URL. As a result, the host must be able to connect directly to [Maven Central](https://search.maven.org/search?q=org.opensearch.plugin). The plugin installation will fail if you pass coordinates to a proxy or local repository. -#### Usage: +#### Usage ```bash bin/opensearch-plugin install :: ``` -#### Example: +#### Example ```bash $ sudo ./opensearch-plugin install org.opensearch.plugin:opensearch-anomaly-detection:2.2.0.0 -> Installing org.opensearch.plugin:opensearch-anomaly-detection:2.2.0.0 @@ -205,16 +204,30 @@ Continue with installation? [y/N]y Restart your OpenSearch node after installing a plugin. {: .note} +### Installing multiple plugins + +Multiple plugins can be installed in a single invocation. + +#### Usage +```bash +bin/opensearch-plugin install ... +``` + +#### Example +```bash +$ sudo $ ./opensearch-plugin install analysis-nori repository-s3 +``` + ## Remove You can remove a plugin that has already been installed with the `remove` option. -#### Usage: +#### Usage ```bash bin/opensearch-plugin remove ``` -#### Example: +#### Example ```bash $ sudo $ ./opensearch-plugin remove opensearch-anomaly-detection -> removing [opensearch-anomaly-detection]... @@ -234,20 +247,36 @@ bin/opensearch-plugin install --batch ## Available plugins -Major, minor, and patch plugin versions must match OpenSearch major, minor, and patch versions in order to be compatible. For example, plugins versions 2.3.0.x work only with OpenSearch 2.3.0. -{: .warning} +OpenSearch provides several bundled and additional plugins. + +### Plugin compatibility + +A plugin can explicitly specify compatibility with a specific OpenSearch version by listing that version in its `plugin-descriptor.properties` file. For example, a plugin with the following property is compatible only with OpenSearch 2.3.0: + +```properties +opensearch.version=2.3.0 +``` +Alternatively, a plugin can specify a range of compatible OpenSearch versions by setting the `dependencies` property in its `plugin-descriptor.properties` file using one of the following notations: +- `dependencies={ opensearch: "2.3.0" }`: The plugin is compatible only with OpenSearch version 2.3.0. +- `dependencies={ opensearch: "=2.3.0" }`: The plugin is compatible only with OpenSearch version 2.3.0. +- `dependencies={ opensearch: "~2.3.0" }`: The plugin is compatible with all versions starting from 2.3.0 up to the next minor version, in this example, 2.4.0 (exclusive). +- `dependencies={ opensearch: "^2.3.0" }`: The plugin is compatible with all versions starting from 2.3.0 up to the next major version, in this example, 3.0.0 (exclusive). + +You can specify only one of the `opensearch.version` or `dependencies` properties. +{: .note} -### Bundled Plugins +### Bundled plugins The following plugins are bundled with all OpenSearch distributions except for minimum distribution packages. 
-| Plugin Name | Repository | Earliest Available Version | +| Plugin name | Repository | Earliest available version | | :--- | :--- | :--- | | Alerting | [opensearch-alerting](https://github.com/opensearch-project/alerting) | 1.0.0 | | Anomaly Detection | [opensearch-anomaly-detection](https://github.com/opensearch-project/anomaly-detection) | 1.0.0 | | Asynchronous Search | [opensearch-asynchronous-search](https://github.com/opensearch-project/asynchronous-search) | 1.0.0 | | Cross Cluster Replication | [opensearch-cross-cluster-replication](https://github.com/opensearch-project/cross-cluster-replication) | 1.1.0 | | Custom Codecs | [opensearch-custom-codecs](https://github.com/opensearch-project/custom-codecs) | 2.10.0 | +| Flow Framework | [flow-framework](https://github.com/opensearch-project/flow-framework) | 2.12.0 | | Notebooks1 | [opensearch-notebooks](https://github.com/opensearch-project/dashboards-notebooks) | 1.0.0 to 1.1.0 | | Notifications | [notifications](https://github.com/opensearch-project/notifications) | 2.0.0 | Reports Scheduler | [opensearch-reports-scheduler](https://github.com/opensearch-project/dashboards-reports) | 1.0.0 | @@ -271,7 +300,7 @@ _2Performance Analyzer is not available on Windows._ Members of the OpenSearch community have built countless plugins for the service. Although it isn't possible to build an exhaustive list of every plugin, since many plugins are not maintained within the OpenSearch GitHub repository, the following list of plugins are available to be installed by name using `bin/opensearch-plugin install `. -| Plugin Name | Earliest Available Version | +| Plugin name | Earliest available version | | :--- | :--- | | analysis-icu | 1.0.0 | | analysis-kuromoji | 1.0.0 | @@ -287,6 +316,7 @@ Members of the OpenSearch community have built countless plugins for the service | mapper-annotated-text | 1.0.0 | | mapper-murmur3 | 1.0.0 | | mapper-size | 1.0.0 | +| query-insights | 2.12.0 | | repository-azure | 1.0.0 | | repository-gcs | 1.0.0 | | repository-hdfs | 1.0.0 | diff --git a/_install-and-configure/upgrade-opensearch/appendix/rolling-upgrade-lab.md b/_install-and-configure/upgrade-opensearch/appendix/rolling-upgrade-lab.md index a32b4d2692..924900dbc8 100644 --- a/_install-and-configure/upgrade-opensearch/appendix/rolling-upgrade-lab.md +++ b/_install-and-configure/upgrade-opensearch/appendix/rolling-upgrade-lab.md @@ -125,7 +125,7 @@ After selecting a host, you can begin the lab: 1. Press `Ctrl+C` to stop following container logs and return to the command prompt. 1. Use cURL to query the OpenSearch REST API. In the following command, `os-node-01` is queried by sending the request to host port `9201`, which is mapped to port `9200` on the container: ```bash - curl -s "https://localhost:9201" -ku admin:admin + curl -s "https://localhost:9201" -ku admin: ``` {% include copy.html %}

Example response

@@ -177,7 +177,7 @@ This section can be broken down into two parts: curl -H "Content-Type: application/x-ndjson" \ -X PUT "https://localhost:9201/ecommerce?pretty" \ --data-binary "@ecommerce-field_mappings.json" \ - -ku admin:admin + -ku admin: ``` {% include copy.html %}

Example response

@@ -193,7 +193,7 @@ This section can be broken down into two parts: curl -H "Content-Type: application/x-ndjson" \ -X PUT "https://localhost:9201/ecommerce/_bulk?pretty" \ --data-binary "@ecommerce.json" \ - -ku admin:admin + -ku admin: ``` {% include copy.html %}

Example response (truncated)

@@ -226,7 +226,7 @@ This section can be broken down into two parts: curl -H 'Content-Type: application/json' \ -X GET "https://localhost:9201/ecommerce/_search?pretty=true&filter_path=hits.total" \ -d'{"query":{"match":{"customer_first_name":"Sonya"}}}' \ - -ku admin:admin + -ku admin: ``` {% include copy.html %}

Example response

@@ -271,7 +271,7 @@ In this section you will be: curl -H 'Content-Type: application/json' \ -X PUT "https://localhost:9201/_snapshot/snapshot-repo?pretty" \ -d '{"type":"fs","settings":{"location":"/usr/share/opensearch/snapshots"}}' \ - -ku admin:admin + -ku admin: ``` {% include copy.html %}

Example response

@@ -284,7 +284,7 @@ In this section you will be: ```bash curl -H 'Content-Type: application/json' \ -X POST "https://localhost:9201/_snapshot/snapshot-repo/_verify?timeout=0s&master_timeout=50s&pretty" \ - -ku admin:admin + -ku admin: ``` {% include copy.html %}

Example response

@@ -315,7 +315,7 @@ Snapshots are backups of a cluster’s indexes and state. See [Snapshots]({{site ```bash curl -H 'Content-Type: application/json' \ -X PUT "https://localhost:9201/_snapshot/snapshot-repo/cluster-snapshot-v137?wait_for_completion=true&pretty" \ - -ku admin:admin + -ku admin: ``` {% include copy.html %}

Example response

@@ -448,7 +448,7 @@ Some steps included in this section, like disabling shard replication and flushi curl -H 'Content-type: application/json' \ -X PUT "https://localhost:9201/_cluster/settings?pretty" \ -d'{"persistent":{"cluster.routing.allocation.enable":"primaries"}}' \ - -ku admin:admin + -ku admin: ``` {% include copy.html %}

Example response

@@ -469,7 +469,7 @@ Some steps included in this section, like disabling shard replication and flushi ``` 1. Perform a flush operation on the cluster to commit transaction log entries to the Lucene index: ```bash - curl -X POST "https://localhost:9201/_flush?pretty" -ku admin:admin + curl -X POST "https://localhost:9201/_flush?pretty" -ku admin: ``` {% include copy.html %}

Example response

@@ -514,7 +514,7 @@ Some steps included in this section, like disabling shard replication and flushi 1. **Optional**: Query the cluster to determine which node is acting as the cluster manager. You can run this command at any time during the process to see when a new cluster manager is elected: ```bash curl -s "https://localhost:9201/_cat/nodes?v&h=name,version,node.role,master" \ - -ku admin:admin | column -t + -ku admin: | column -t ``` {% include copy.html %}

Example response

@@ -528,7 +528,7 @@ Some steps included in this section, like disabling shard replication and flushi 1. **Optional**: Query the cluster to see how shard allocation changes as nodes are removed and replaced. You can run this command at any time during the process to see how shard statuses change: ```bash curl -s "https://localhost:9201/_cat/shards" \ - -ku admin:admin + -ku admin: ``` {% include copy.html %}

Example response

@@ -644,7 +644,7 @@ Some steps included in this section, like disabling shard replication and flushi 1. Confirm that your cluster is running the new version: ```bash curl -s "https://localhost:9201/_cat/nodes?v&h=name,version,node.role,master" \ - -ku admin:admin | column -t + -ku admin: | column -t ``` {% include copy.html %}

Example response

@@ -700,7 +700,7 @@ Some steps included in this section, like disabling shard replication and flushi curl -H 'Content-type: application/json' \ -X PUT "https://localhost:9201/_cluster/settings?pretty" \ -d'{"persistent":{"cluster.routing.allocation.enable":"all"}}' \ - -ku admin:admin + -ku admin: ``` {% include copy.html %}

Example response

@@ -735,7 +735,7 @@ For this cluster, post-upgrade validation steps can include verifying the follow 1. Verify the current running version of your OpenSearch nodes: ```bash curl -s "https://localhost:9201/_cat/nodes?v&h=name,version,node.role,master" \ - -ku admin:admin | column -t + -ku admin: | column -t ``` {% include copy.html %}

Example response

@@ -781,7 +781,7 @@ For this cluster, post-upgrade validation steps can include verifying the follow 1. Query the [Cluster health]({{site.url}}{{site.baseurl}}/api-reference/cluster-api/cluster-health/) API endpoint to see information about the health of your cluster. You should see a status of `green`, which indicates that all primary and replica shards are allocated: ```bash - curl -s "https://localhost:9201/_cluster/health?pretty" -ku admin:admin + curl -s "https://localhost:9201/_cluster/health?pretty" -ku admin: ``` {% include copy.html %}

Example response

@@ -808,7 +808,7 @@ For this cluster, post-upgrade validation steps can include verifying the follow ``` 1. Query the [CAT shards]({{site.url}}{{site.baseurl}}/api-reference/cat/cat-shards/) API endpoint to see how shards are allocated after the cluster is upgrade: ```bash - curl -s "https://localhost:9201/_cat/shards" -ku admin:admin + curl -s "https://localhost:9201/_cat/shards" -ku admin: ``` {% include copy.html %}

Example response

@@ -860,7 +860,7 @@ You need to query the ecommerce index again in order to confirm that the sample curl -H 'Content-Type: application/json' \ -X GET "https://localhost:9201/ecommerce/_search?pretty=true&filter_path=hits.total" \ -d'{"query":{"match":{"customer_first_name":"Sonya"}}}' \ - -ku admin:admin + -ku admin: ``` {% include copy.html %}

Example response
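Taken together, the hunks in this file only swap the hard-coded `admin:admin` credentials for a custom admin password, but they touch every post-upgrade validation command in the lab. Those checks can also be run in a single pass. The following is a minimal sketch, not part of the lab itself; it assumes the same mapped host port `9201` used throughout and an `OPENSEARCH_ADMIN_PW` environment variable, a name introduced here purely for illustration:

```bash
#!/usr/bin/env bash
# Post-upgrade validation sketch based on the checks shown in the rolling upgrade lab.
# Assumes the cluster is reachable on the host port 9201 mapped in the lab and that
# OPENSEARCH_ADMIN_PW holds the custom admin password (an illustrative variable name).
set -euo pipefail

HOST="https://localhost:9201"
AUTH="admin:${OPENSEARCH_ADMIN_PW}"

# Confirm that every node reports the upgraded version.
curl -s "${HOST}/_cat/nodes?v&h=name,version,node.role,master" -ku "${AUTH}" | column -t

# Check cluster health; a green status means all primary and replica shards are allocated.
curl -s "${HOST}/_cluster/health?pretty" -ku "${AUTH}"

# Review shard allocation across the upgraded nodes.
curl -s "${HOST}/_cat/shards" -ku "${AUTH}"

# Re-run the sample query to confirm that the ingested ecommerce data is still searchable.
curl -s -H 'Content-Type: application/json' \
  -X GET "${HOST}/ecommerce/_search?pretty=true&filter_path=hits.total" \
  -d'{"query":{"match":{"customer_first_name":"Sonya"}}}' \
  -ku "${AUTH}"
```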

diff --git a/_layouts/home.html b/_layouts/home.html index 983cde7b1e..0b13e44e23 100644 --- a/_layouts/home.html +++ b/_layouts/home.html @@ -31,6 +31,6 @@ {% include footer.html %} - + diff --git a/_ml-commons-plugin/agents-tools/agents-tools-tutorial.md b/_ml-commons-plugin/agents-tools/agents-tools-tutorial.md new file mode 100644 index 0000000000..39051f399c --- /dev/null +++ b/_ml-commons-plugin/agents-tools/agents-tools-tutorial.md @@ -0,0 +1,401 @@ +--- +layout: default +title: Agents and tools tutorial +parent: Agents and tools +grand_parent: ML Commons APIs +nav_order: 10 +--- + +# Agents and tools tutorial +**Introduced 2.13** +{: .label .label-purple } + +The following tutorial illustrates creating a flow agent for retrieval-augmented generation (RAG). A flow agent runs its configured tools sequentially, in the order specified. In this example, you'll create an agent with two tools: + +1. `VectorDBTool`: The agent will use this tool to retrieve OpenSearch documents relevant to the user question. You'll ingest supplementary information into an OpenSearch index. To facilitate vector search, you'll deploy a text embedding model that translates text into vector embeddings. OpenSearch will translate the ingested documents into embeddings and store them in the index. When you provide a user question to the agent, the agent will construct a query from the question, run vector search on the OpenSearch index, and pass the relevant retrieved documents to the `MLModelTool`. +1. `MLModelTool`: The agent will run this tool to connect to a large language model (LLM) and send the user query augmented with OpenSearch documents to the model. In this example, you'll use the [Anthropic Claude model hosted on Amazon Bedrock](https://aws.amazon.com/bedrock/claude/). The LLM will then answer the question based on its knowledge and the provided documents. + +## Prerequisites + +To use the memory feature, first configure the following cluster settings. This tutorial assumes that you have no dedicated machine learning (ML) nodes: + +```json +PUT _cluster/settings +{ + "persistent": { + "plugins.ml_commons.only_run_on_ml_node": "false", + "plugins.ml_commons.memory_feature_enabled": "true" + } +} +``` +{% include copy-curl.html %} + +For more information, see [ML Commons cluster settings]({{site.url}}{{site.baseurl}}/ml-commons-plugin/cluster-settings/). + +## Step 1: Register and deploy a text embedding model + +You need a text embedding model to facilitate vector search. For this tutorial, you'll use one of the OpenSearch-provided pretrained models. When selecting a model, note its dimensionality because you'll need to provide it when creating an index. + +In this tutorial, you'll use the `huggingface/sentence-transformers/all-MiniLM-L12-v2` model, which generates 384-dimensional dense vector embeddings. To register and deploy the model, send the following request: + +```json +POST /_plugins/_ml/models/_register?deploy=true +{ + "name": "huggingface/sentence-transformers/all-MiniLM-L12-v2", + "version": "1.0.1", + "model_format": "TORCH_SCRIPT" +} +``` +{% include copy-curl.html %} + +Registering a model is an asynchronous task. 
OpenSearch returns a task ID for this task: + +```json +{ + "task_id": "aFeif4oB5Vm0Tdw8yoN7", + "status": "CREATED" +} +``` + +You can check the status of the task by calling the Tasks API: + +```json +GET /_plugins/_ml/tasks/aFeif4oB5Vm0Tdw8yoN7 +``` +{% include copy-curl.html %} + +Once the task is complete, the task state changes to `COMPLETED` and the Tasks API response includes a model ID for the deployed model: + +```json +{ + "model_id": "aVeif4oB5Vm0Tdw8zYO2", + "task_type": "REGISTER_MODEL", + "function_name": "TEXT_EMBEDDING", + "state": "COMPLETED", + "worker_node": [ + "4p6FVOmJRtu3wehDD74hzQ" + ], + "create_time": 1694358489722, + "last_update_time": 1694358499139, + "is_async": true +} +``` + +## Step 2: Create an ingest pipeline + +To translate text into vector embeddings, you'll set up an ingest pipeline. The pipeline translates the `text` field and writes the resulting vector embeddings into the `embedding` field. Create the pipeline by specifying the `model_id` from the previous step in the following request: + +```json +PUT /_ingest/pipeline/test-pipeline-local-model +{ + "description": "text embedding pipeline", + "processors": [ + { + "text_embedding": { + "model_id": "aVeif4oB5Vm0Tdw8zYO2", + "field_map": { + "text": "embedding" + } + } + } + ] +} +``` + +## Step 3: Create a k-NN index and ingest data + +Now you'll ingest supplementary data into an OpenSearch index. In OpenSearch, vectors are stored in a k-NN index. You can create a [k-NN index]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index/) by sending the following request: + +```json +PUT my_test_data +{ + "mappings": { + "properties": { + "text": { + "type": "text" + }, + "embedding": { + "type": "knn_vector", + "dimension": 384 + } + } + }, + "settings": { + "index": { + "knn.space_type": "cosinesimil", + "default_pipeline": "test-pipeline-local-model", + "knn": "true" + } + } +} +``` +{% include copy-curl.html %} + +Then, ingest data into the index by using a bulk request: + +```json +POST _bulk +{"index": {"_index": "my_test_data", "_id": "1"}} +{"text": "Chart and table of population level and growth rate for the Ogden-Layton metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\nThe current metro area population of Ogden-Layton in 2023 is 750,000, a 1.63% increase from 2022.\nThe metro area population of Ogden-Layton in 2022 was 738,000, a 1.79% increase from 2021.\nThe metro area population of Ogden-Layton in 2021 was 725,000, a 1.97% increase from 2020.\nThe metro area population of Ogden-Layton in 2020 was 711,000, a 2.16% increase from 2019."} +{"index": {"_index": "my_test_data", "_id": "2"}} +{"text": "Chart and table of population level and growth rate for the New York City metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\\nThe current metro area population of New York City in 2023 is 18,937,000, a 0.37% increase from 2022.\\nThe metro area population of New York City in 2022 was 18,867,000, a 0.23% increase from 2021.\\nThe metro area population of New York City in 2021 was 18,823,000, a 0.1% increase from 2020.\\nThe metro area population of New York City in 2020 was 18,804,000, a 0.01% decline from 2019."} +{"index": {"_index": "my_test_data", "_id": "3"}} +{"text": "Chart and table of population level and growth rate for the Chicago metro area from 1950 to 2023. 
United Nations population projections are also included through the year 2035.\\nThe current metro area population of Chicago in 2023 is 8,937,000, a 0.4% increase from 2022.\\nThe metro area population of Chicago in 2022 was 8,901,000, a 0.27% increase from 2021.\\nThe metro area population of Chicago in 2021 was 8,877,000, a 0.14% increase from 2020.\\nThe metro area population of Chicago in 2020 was 8,865,000, a 0.03% increase from 2019."} +{"index": {"_index": "my_test_data", "_id": "4"}} +{"text": "Chart and table of population level and growth rate for the Miami metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\\nThe current metro area population of Miami in 2023 is 6,265,000, a 0.8% increase from 2022.\\nThe metro area population of Miami in 2022 was 6,215,000, a 0.78% increase from 2021.\\nThe metro area population of Miami in 2021 was 6,167,000, a 0.74% increase from 2020.\\nThe metro area population of Miami in 2020 was 6,122,000, a 0.71% increase from 2019."} +{"index": {"_index": "my_test_data", "_id": "5"}} +{"text": "Chart and table of population level and growth rate for the Austin metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\\nThe current metro area population of Austin in 2023 is 2,228,000, a 2.39% increase from 2022.\\nThe metro area population of Austin in 2022 was 2,176,000, a 2.79% increase from 2021.\\nThe metro area population of Austin in 2021 was 2,117,000, a 3.12% increase from 2020.\\nThe metro area population of Austin in 2020 was 2,053,000, a 3.43% increase from 2019."} +{"index": {"_index": "my_test_data", "_id": "6"}} +{"text": "Chart and table of population level and growth rate for the Seattle metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\\nThe current metro area population of Seattle in 2023 is 3,519,000, a 0.86% increase from 2022.\\nThe metro area population of Seattle in 2022 was 3,489,000, a 0.81% increase from 2021.\\nThe metro area population of Seattle in 2021 was 3,461,000, a 0.82% increase from 2020.\\nThe metro area population of Seattle in 2020 was 3,433,000, a 0.79% increase from 2019."} +``` +{% include copy-curl.html %} + +## Step 4: Create a connector to an externally hosted model + +You'll need an LLM to generate responses to user questions. An LLM is too large for an OpenSearch cluster, so you'll create a connection to an externally hosted LLM. 
For this example, you'll create a connector to the Anthropic Claude model hosted on Amazon Bedrock: + +```json +POST /_plugins/_ml/connectors/_create +{ + "name": "BedRock test claude Connector", + "description": "The connector to BedRock service for claude model", + "version": 1, + "protocol": "aws_sigv4", + "parameters": { + "region": "us-east-1", + "service_name": "bedrock", + "anthropic_version": "bedrock-2023-05-31", + "endpoint": "bedrock.us-east-1.amazonaws.com", + "auth": "Sig_V4", + "content_type": "application/json", + "max_tokens_to_sample": 8000, + "temperature": 0.0001, + "response_filter": "$.completion" + }, + "credential": { + "access_key": "", + "secret_key": "", + "session_token": "" + }, + "actions": [ + { + "action_type": "predict", + "method": "POST", + "url": "https://bedrock-runtime.us-east-1.amazonaws.com/model/anthropic.claude-v2/invoke", + "headers": { + "content-type": "application/json", + "x-amz-content-sha256": "required" + }, + "request_body": "{\"prompt\":\"${parameters.prompt}\", \"max_tokens_to_sample\":${parameters.max_tokens_to_sample}, \"temperature\":${parameters.temperature}, \"anthropic_version\":\"${parameters.anthropic_version}\" }" + } + ] +} +``` +{% include copy-curl.html %} + +The response contains the connector ID for the newly created connector: + +```json +{ + "connector_id": "a1eMb4kBJ1eYAeTMAljY" +} +``` + +## Step 5: Register and deploy the externally hosted model + +Like the text embedding model, an LLM needs to be registered and deployed to OpenSearch. To set up the externally hosted model, first create a model group for this model: + +```json +POST /_plugins/_ml/model_groups/_register +{ + "name": "test_model_group_bedrock", + "description": "This is a public model group" +} +``` +{% include copy-curl.html %} + +The response contains the model group ID that you’ll use to register a model to this model group: + +```json +{ + "model_group_id": "wlcnb4kBJ1eYAeTMHlV6", + "status": "CREATED" +} + +``` + +Next, register and deploy the externally hosted Claude model: + +```json +POST /_plugins/_ml/models/_register?deploy=true +{ + "name": "Bedrock Claude V2 model", + "function_name": "remote", + "model_group_id": "wlcnb4kBJ1eYAeTMHlV6", + "description": "test model", + "connector_id": "a1eMb4kBJ1eYAeTMAljY" +} +``` +{% include copy-curl.html %} + +Similarly to [Step 1](#step-1-register-and-deploy-a-text-embedding-model), the response contains a task ID that you can use to check the status of the deployment. Once the model is deployed, the status changes to `COMPLETED` and the response includes the model ID for the Claude model: + +```json +{ + "model_id": "NWR9YIsBUysqmzBdifVJ", + "task_type": "REGISTER_MODEL", + "function_name": "remote", + "state": "COMPLETED", + "worker_node": [ + "4p6FVOmJRtu3wehDD74hzQ" + ], + "create_time": 1694358489722, + "last_update_time": 1694358499139, + "is_async": true +} +``` + +To test the LLM, send the following predict request: + +```json +POST /_plugins/_ml/models/NWR9YIsBUysqmzBdifVJ/_predict +{ + "parameters": { + "prompt": "\n\nHuman:hello\n\nAssistant:" + } +} +``` +{% include copy-curl.html %} + +## Step 6: Register and execute an agent + +Finally, you'll use the text embedding model created in Step 1 and the Claude model created in Step 5 to create a flow agent. This flow agent will run a `VectorDBTool` and then an `MLModelTool`. The `VectorDBTool` is configured with the model ID for the text embedding model created in Step 1 for vector search. 
The `MLModelTool` is configured with the Claude model created in step 5: + +```json +POST /_plugins/_ml/agents/_register +{ + "name": "Test_Agent_For_RAG", + "type": "flow", + "description": "this is a test agent", + "tools": [ + { + "type": "VectorDBTool", + "parameters": { + "model_id": "aVeif4oB5Vm0Tdw8zYO2", + "index": "my_test_data", + "embedding_field": "embedding", + "source_field": ["text"], + "input": "${parameters.question}" + } + }, + { + "type": "MLModelTool", + "description": "A general tool to answer any question", + "parameters": { + "model_id": "NWR9YIsBUysqmzBdifVJ", + "prompt": "\n\nHuman:You are a professional data analyst. You will always answer a question based on the given context first. If the answer is not directly shown in the context, you will analyze the data and find the answer. If you don't know the answer, just say you don't know. \n\n Context:\n${parameters.VectorDBTool.output}\n\nHuman:${parameters.question}\n\nAssistant:" + } + } + ] +} +``` +{% include copy-curl.html %} + +OpenSearch returns an agent ID for the newly created agent: + +```json +{ + "agent_id": "879v9YwBjWKCe6Kg12Tx" +} +``` + +You can inspect the agent by sending a request to the `agents` endpoint and providing the agent ID: + +```json +GET /_plugins/_ml/agents/879v9YwBjWKCe6Kg12Tx +``` +{% include copy-curl.html %} + +To execute the agent, send the following request. When registering the agent, you configured it to take in `parameters.question`, so you need to provide this parameter in the request. This parameter represents a human-generated user question: + +```json +POST /_plugins/_ml/agents/879v9YwBjWKCe6Kg12Tx/_execute +{ + "parameters": { + "question": "what's the population increase of Seattle from 2021 to 2023" + } +} +``` +{% include copy-curl.html %} + +The LLM does not have the recent information in its knowledge base, so it infers the response to the question based on the ingested data, demonstrating RAG: + +```json +{ + "inference_results": [ + { + "output": [ + { + "result": """ Based on the given context, the key information is: + +The metro area population of Seattle in 2021 was 3,461,000. +The metro area population of Seattle in 2023 is 3,519,000. + +To calculate the population increase from 2021 to 2023: + +Population in 2023 (3,519,000) - Population in 2021 (3,461,000) = 58,000 + +Therefore, the population increase of Seattle from 2021 to 2023 is 58,000.""" + } + ] + } + ] +} +``` + +## Hidden agents +**Introduced 2.13** +{: .label .label-purple } + +To hide agent details from end users, including the cluster admin, you can register a _hidden_ agent. If an agent is hidden, non-superadmin users don't have permission to call any [Agent APIs]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/agent-apis/index/) except for the [Execute API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/agent-apis/execute-agent/), on the agent. + +Only superadmin users can register a hidden agent. To register a hidden agent, you first need to authenticate with an [admin certificate]({{site.url}}{{site.baseurl}}/security/configuration/tls/#configuring-admin-certificates): + +```bash +curl -k --cert ./kirk.pem --key ./kirk-key.pem -XGET 'https://localhost:9200/.opendistro_security/_search' +``` + +All agents created by a superadmin user are automatically registered as hidden. Only the superadmin user can view hidden agent details and delete hidden agents. 
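Non-superadmin users can still run a hidden agent because the Execute API remains available to them. As a rough sketch, such a call looks like any other Execute API request; the agent ID and credentials below are placeholders introduced here for illustration, and the registration request itself is shown next:

```bash
# Execute a hidden agent. Only the Execute API is open to non-superadmin users;
# HIDDEN_AGENT_ID and the credentials are illustrative placeholders.
curl -k -u 'admin:<admin-password>' -X POST \
  'https://localhost:9200/_plugins/_ml/agents/HIDDEN_AGENT_ID/_execute' \
  -H 'Content-Type: application/json' \
  -d '{"parameters": {"question": "what is the population increase of Seattle from 2021 to 2023"}}'
```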
+To register a hidden agent, send a request to the `_register` endpoint: + +```bash +curl -k --cert ./kirk.pem --key ./kirk-key.pem -X POST 'https://localhost:9200/_plugins/_ml/agents/_register' -H 'Content-Type: application/json' -d ' +{ + "name": "Test_Agent_For_RAG", + "type": "flow", + "description": "this is a test agent", + "tools": [ + { + "name": "vector_tool", + "type": "VectorDBTool", + "parameters": { + "model_id": "zBRyYIsBls05QaITo5ex", + "index": "my_test_data", + "embedding_field": "embedding", + "source_field": [ + "text" + ], + "input": "${parameters.question}" + } + }, + { + "type": "MLModelTool", + "description": "A general tool to answer any question", + "parameters": { + "model_id": "NWR9YIsBUysqmzBdifVJ", + "prompt": "\n\nHuman:You are a professional data analyst. You will always answer question based on the given context first. If the answer is not directly shown in the context, you will analyze the data and find the answer. If you don't know the answer, just say don't know. \n\n Context:\n${parameters.vector_tool.output}\n\nHuman:${parameters.question}\n\nAssistant:" + } + } + ] +}' +``` diff --git a/_ml-commons-plugin/agents-tools/index.md b/_ml-commons-plugin/agents-tools/index.md new file mode 100644 index 0000000000..ba88edef2f --- /dev/null +++ b/_ml-commons-plugin/agents-tools/index.md @@ -0,0 +1,160 @@ +--- +layout: default +title: Agents and tools +has_children: true +has_toc: false +nav_order: 27 +--- + +# Agents and tools +**Introduced 2.13** +{: .label .label-purple } + +You can automate machine learning (ML) tasks using agents and tools. An _agent_ orchestrates and runs ML models and tools. A _tool_ performs a set of specific tasks. Some examples of tools are the `VectorDBTool`, which supports vector search, and the `CATIndexTool`, which executes the `cat indices` operation. For a list of supported tools, see [Tools]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/index/). + +## Agents + +An _agent_ is a coordinator that uses a large language model (LLM) to solve a problem. After the LLM reasons and decides what action to take, the agent coordinates the action execution. OpenSearch supports the following agent types: + +- [_Flow agent_](#flow-agents): Runs tools sequentially, in the order specified in its configuration. The workflow of a flow agent is fixed. Useful for retrieval-augmented generation (RAG). +- [_Conversational flow agent_](#conversational-flow-agents): Runs tools sequentially, in the order specified in its configuration. The workflow of a conversational flow agent is fixed. Stores conversation history so that users can ask follow-up questions. Useful for creating a chatbot. +- [_Conversational agent_](#conversational-agents): Reasons in order to provide a response based on the available knowledge, including the LLM knowledge base and a set of tools provided to the LLM. Stores conversation history so that users can ask follow-up questions. The workflow of a conversational agent is variable, based on follow-up questions. For specific questions, uses the Chain-of-Thought (CoT) process to select the best tool from the configured tools for providing a response to the question. Useful for creating a chatbot that employs RAG. + +### Flow agents + +A flow agent is configured with a set of tools that it runs in order. For example, the following agent runs the `VectorDBTool` and then the `MLModelTool`. The agent coordinates the tools so that one tool's output can become another tool's input. 
In this example, the `VectorDBTool` queries the k-NN index and the agent passes its output `${parameters.VectorDBTool.output}` to the `MLModelTool` as context, along with the `${parameters.question}` (see the `prompt` parameter): + +```json +POST /_plugins/_ml/agents/_register +{ + "name": "Test_Agent_For_RAG", + "type": "flow", + "description": "this is a test agent", + "tools": [ + { + "type": "VectorDBTool", + "parameters": { + "model_id": "YOUR_TEXT_EMBEDDING_MODEL_ID", + "index": "my_test_data", + "embedding_field": "embedding", + "source_field": ["text"], + "input": "${parameters.question}" + } + }, + { + "type": "MLModelTool", + "description": "A general tool to answer any question", + "parameters": { + "model_id": "YOUR_LLM_MODEL_ID", + "prompt": "\n\nHuman:You are a professional data analyst. You will always answer a question based on the given context first. If the answer is not directly shown in the context, you will analyze the data and find the answer. If you don't know the answer, just say you don't know. \n\n Context:\n${parameters.VectorDBTool.output}\n\nHuman:${parameters.question}\n\nAssistant:" + } + } + ] +} +``` + +### Conversational flow agents + +Similarly to a flow agent, a conversational flow agent is configured with a set of tools that it runs in order. The difference between them is that a conversational flow agent stores the conversation in an index, in the following example, the `conversation_index`. The following agent runs the `VectorDBTool` and then the `MLModelTool`: + +```json +POST /_plugins/_ml/agents/_register +{ + "name": "population data analysis agent", + "type": "conversational_flow", + "description": "This is a demo agent for population data analysis", + "app_type": "rag", + "memory": { + "type": "conversation_index" + }, + "tools": [ + { + "type": "VectorDBTool", + "name": "population_knowledge_base", + "parameters": { + "model_id": "YOUR_TEXT_EMBEDDING_MODEL_ID", + "index": "test_population_data", + "embedding_field": "population_description_embedding", + "source_field": [ + "population_description" + ], + "input": "${parameters.question}" + } + }, + { + "type": "MLModelTool", + "name": "bedrock_claude_model", + "description": "A general tool to answer any question", + "parameters": { + "model_id": "YOUR_LLM_MODEL_ID", + "prompt": """ + +Human:You are a professional data analyst. You will always answer question based on the given context first. If the answer is not directly shown in the context, you will analyze the data and find the answer. If you don't know the answer, just say don't know. + +Context: +${parameters.population_knowledge_base.output:-} + +${parameters.chat_history:-} + +Human:${parameters.question} + +Assistant:""" + } + } + ] +} +``` + +### Conversational agents + +Similarly to a conversational flow agent, a conversational agent stores the conversation in an index, in the following example, the `conversation_index`. A conversational agent can be configured with an LLM and a set of supplementary tools that perform specific jobs. For example, you can set up an LLM and a `CATIndexTool` when configuring an agent. When you send a question to the model, the agent also includes the `CATIndexTool` as context. The LLM then decides whether it needs to use the `CATIndexTool` to answer questions like "How many indexes are in my cluster?" The context allows an LLM to answer specific questions that are outside of its knowledge base. 
For example, the following agent is configured with an LLM and a `CATIndexTool` that retrieves information about your OpenSearch indexes: + +```json +POST /_plugins/_ml/agents/_register +{ + "name": "Test_Agent_For_ReAct_ClaudeV2", + "type": "conversational", + "description": "this is a test agent", + "llm": { + "model_id": "YOUR_LLM_MODEL_ID", + "parameters": { + "max_iteration": 5, + "stop_when_no_tool_found": true, + "response_filter": "$.completion" + } + }, + "memory": { + "type": "conversation_index" + }, + "tools": [ + { + "type": "VectorDBTool", + "name": "VectorDBTool", + "description": "A tool to search opensearch index with natural language quesiotn. If you don't know answer for some question, you should always try to search data with this tool. Action Input: ", + "parameters": { + "model_id": "YOUR_TEXT_EMBEDDING_MODEL_ID", + "index": "my_test_data", + "embedding_field": "embedding", + "source_field": [ "text" ], + "input": "${parameters.question}" + } + }, + { + "type": "CatIndexTool", + "name": "RetrieveIndexMetaTool", + "description": "Use this tool to get OpenSearch index information: (health, status, index, uuid, primary count, replica count, docs.count, docs.deleted, store.size, primary.store.size)." + } + ], + "app_type": "my app" +} +``` + +It is important to provide thorough descriptions of the tools so that the LLM can decide in which situations to use those tools. +{: .tip} + +## Next steps + +- For a list of supported tools, see [Tools]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/index/). +- For a step-by-step tutorial, see [Agents and tools tutorial]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/agents-tools-tutorial/). +- For supported APIs, see [Agent APIs]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/agent-apis/). +- To use agents and tools in configuration automation, see [Automating configurations]({{site.url}}{{site.baseurl}}/automating-configurations/index/). \ No newline at end of file diff --git a/_ml-commons-plugin/agents-tools/tools/agent-tool.md b/_ml-commons-plugin/agents-tools/tools/agent-tool.md new file mode 100644 index 0000000000..272af51e4d --- /dev/null +++ b/_ml-commons-plugin/agents-tools/tools/agent-tool.md @@ -0,0 +1,107 @@ +--- +layout: default +title: Agent tool +has_children: false +has_toc: false +nav_order: 10 +parent: Tools +grand_parent: Agents and tools +--- + + +# Agent tool +**Introduced 2.13** +{: .label .label-purple } + + +The `AgentTool` runs any agent. + +## Step 1: Set up an agent for AgentTool to run + +Set up any agent. For example, set up a flow agent that runs an `MLModelTool` by following the steps in the [ML Model Tool documentation]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/ml-model-tool/) and obtain its agent ID from [Step 3]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/ml-model-tool/#step-3-register-a-flow-agent-that-will-run-the-mlmodeltool): + +```json +{ + "agent_id": "9X7xWI0Bpc3sThaJdY9i" +} +``` + +## Step 2: Register a flow agent that will run the AgentTool + +A flow agent runs a sequence of tools in order and returns the last tool's output. 
To create a flow agent, send the following register agent request, providing the agent ID from the previous step: + +```json +POST /_plugins/_ml/agents/_register +{ + "name": "Test agent tool", + "type": "flow", + "description": "this is a test agent", + "tools": [ + { + "type": "AgentTool", + "description": "A general agent to answer any question", + "parameters": { + "agent_id": "9X7xWI0Bpc3sThaJdY9i" + } + } + ] +} +``` +{% include copy-curl.html %} + +For parameter descriptions, see [Register parameters](#register-parameters). + +OpenSearch responds with an agent ID: + +```json +{ + "agent_id": "EQyyZ40BT2tRrkdmhT7_" +} +``` + +## Step 3: Run the agent + +Run the agent by sending the following request: + +```json +POST /_plugins/_ml/agents/EQyyZ40BT2tRrkdmhT7_/_execute +{ + "parameters": { + "question": "what's the population increase of Seattle from 2021 to 2023" + } +} +``` +{% include copy-curl.html %} + +OpenSearch returns the inference results: + +```json +{ + "inference_results": [ + { + "output": [ + { + "name": "response", + "result": " I do not have direct data on the population increase of Seattle from 2021 to 2023 in the context provided. As a data analyst, I would need to research population statistics from credible sources like the US Census Bureau to analyze population trends and make an informed estimate. Without looking up actual data, I don't have enough information to provide a specific answer to the question." + } + ] + } + ] +} +``` + +## Register parameters + +The following table lists all tool parameters that are available when registering an agent. + +Parameter | Type | Required/Optional | Description +:--- | :--- | :--- | :--- +`agent_id` | String | Required | The agent ID of the agent to run. + +## Execute parameters + +The following table lists all tool parameters that are available when running the agent. + +Parameter | Type | Required/Optional | Description +:--- | :--- | :--- | :--- +`question` | String | Required | The natural language question to send to the LLM. \ No newline at end of file diff --git a/_ml-commons-plugin/agents-tools/tools/cat-index-tool.md b/_ml-commons-plugin/agents-tools/tools/cat-index-tool.md new file mode 100644 index 0000000000..50ccf28b9b --- /dev/null +++ b/_ml-commons-plugin/agents-tools/tools/cat-index-tool.md @@ -0,0 +1,130 @@ +--- +layout: default +title: CAT Index tool +has_children: false +has_toc: false +nav_order: 20 +parent: Tools +grand_parent: Agents and tools +--- + + +# CAT Index tool +**Introduced 2.13** +{: .label .label-purple } + + +The `CatIndexTool` retrieves index information for the OpenSearch cluster, similarly to the [CAT Indices API]({{site.url}}{{site.baseurl}}/api-reference/cat/cat-indices/). + +## Step 1: Register a flow agent that will run the CatIndexTool + +A flow agent runs a sequence of tools in order and returns the last tool's output. To create a flow agent, send the following register agent request: + +```json +POST /_plugins/_ml/agents/_register +{ + "name": "Test_Agent_For_CatIndex_tool", + "type": "flow", + "description": "this is a test agent for the CatIndexTool", + "tools": [ + { + "type": "CatIndexTool", + "name": "DemoCatIndexTool", + "parameters": { + "input": "${parameters.question}" + } + } + ] +} +``` +{% include copy-curl.html %} + +For parameter descriptions, see [Register parameters](#register-parameters). 
+ +OpenSearch responds with an agent ID: + +```json +{ + "agent_id": "9X7xWI0Bpc3sThaJdY9i" +} +``` + +## Step 2: Run the agent + +Before you run the agent, make sure that you add the sample OpenSearch Dashboards `Sample eCommerce orders` dataset. To learn more, see [Adding sample data]({{site.url}}{{site.baseurl}}/dashboards/quickstart#adding-sample-data). + +Then, run the agent by sending the following request: + +```json +POST /_plugins/_ml/agents/9X7xWI0Bpc3sThaJdY9i/_execute +{ + "parameters": { + "question": "How many indices do I have?" + } +} +``` +{% include copy-curl.html %} + +OpenSearch returns the index information: + +```json +{ + "inference_results": [ + { + "output": [ + { + "name": "response", + "result": """health status index uuid pri rep docs.count docs.deleted store.size pri.store.size +green open .plugins-ml-model-group lHgGEgJhT_mpADyOZoXl2g 1 1 9 2 33.4kb 16.7kb +green open .plugins-ml-memory-meta b2LEpv0QS8K60QBjXtRm6g 1 1 13 0 95.1kb 47.5kb +green open .ql-datasources 9NXm_tMXQc6s_4uRToSNkQ 1 1 0 0 416b 208b +green open sample-ecommerce UPYOQcAfRGqFAlSxcZlRjw 1 1 40320 0 4.1mb 2mb +green open .plugins-ml-task xYTlprYCQnaaYici69SOjA 1 1 117 0 115.5kb 57.6kb +green open .opendistro_security 7DAqhm9QQmeEsQYhA40cJg 1 1 10 0 117kb 58.5kb +green open sample-host-health Na5tq6UiTt6r_qYME1vV-w 1 1 40320 0 2.6mb 1.3mb +green open .opensearch-observability 6PthtLluSKyYCdZR3Mw0iw 1 1 0 0 416b 208b +green open .plugins-ml-model WYcjBHcnRuSDHeVWPVupoA 1 1 191 45 4.2gb 2.1gb +green open index_for_neural_sparse GQswGabQRIazM_trnqaDrw 1 1 5 0 28.4kb 14.2kb +green open security-auditlog-2024.01.30 BhXR7Nd3QVOVGxJNpR0-jw 1 1 27768 0 13.8mb 7mb +green open sample-http-responses 0gmYYYdOTiCbVUvl_uDL0w 1 1 40320 0 2.5mb 1.2mb +green open security-auditlog-2024.02.01 2VD1ieDGS5m-TfjIdfT8Eg 1 1 39305 0 39mb 18.6mb +green open opensearch_dashboards_sample_data_ecommerce wnE6r7OvSPqc5YHj8wHSLA 1 1 4675 0 8.8mb 4.4mb +green open security-auditlog-2024.01.31 cNRK5-2eTwes0SRlXTl0RQ 1 1 34520 0 20.5mb 9.8mb +green open .plugins-ml-memory-message wTNBU4BBQVSFcFhNlUdfBQ 1 1 93 0 358.2kb 181.9kb +green open .plugins-flow-framework-state dJUNDv9MSJ2jjwKbzXPlrw 1 1 39 0 114.1kb 57kb +green open .plugins-ml-agent 7X1IzoLuSGmIujOh9i5mmg 1 1 30 0 170.7kb 85.3kb +green open .plugins-flow-framework-templates _ecC0KahTlmG_3tFUst7Uw 1 1 18 0 175.8kb 87.9kb +green open .plugins-ml-connector q45iJfVjQ5KgxeNC65DLSw 1 1 11 0 313.1kb 156.5kb +green open .kibana_1 vRjXK4bHSUueB_4iXiQ8yw 1 1 257 0 264kb 132kb +green open .plugins-ml-config G7gxGQB7TZeQzBasHd5PUg 1 1 1 0 7.8kb 3.9kb +green open .plugins-ml-controller NQTZPREZRhWoDdjCglRLFg 1 1 0 0 50.1kb 49.9kb +green open opensearch_dashboards_sample_data_logs 9gpOTB3rRgqBLvqis_k5LQ 1 1 14074 0 18mb 9mb +green open .plugins-flow-framework-config JlKPsCh6SEq-Jh6rPL_x9Q 1 1 1 0 7.8kb 3.9kb +green open opensearch_dashboards_sample_data_flights pJde0irnTce4-uobHwYmMQ 1 1 13059 0 11.9mb 5.9mb +green open my_test_data T4hwNs7CTJGIfw2QpCqQ_Q 1 1 6 0 91.7kb 45.8kb +green open .opendistro-job-scheduler-lock XjgmXAVKQ4e8Y-ac54VBzg 1 1 3 3 36.2kb 21.3kb +""" + } + ] + } + ] +} +``` + +## Register parameters + +The following table lists all tool parameters that are available when registering an agent. + +Parameter | Type | Required/Optional | Description +:--- | :--- | :--- | :--- +`input` | String | Required | The user input used to return index information. +`index` | String | Optional | A comma-delimited list of one or more indexes on which to run the CAT operation. 
Default is an empty list, which means all indexes. +`local` | Boolean | Optional | When `true`, retrieves information from the local node only instead of the cluster manager node (default is `false`). + +## Execute parameters + +The following table lists all tool parameters that are available when running the agent. + +Parameter | Type | Required/Optional | Description +:--- | :--- | :--- | :--- +`question` | String | Required | The natural language question to send to the LLM. \ No newline at end of file diff --git a/_ml-commons-plugin/agents-tools/tools/index-mapping-tool.md b/_ml-commons-plugin/agents-tools/tools/index-mapping-tool.md new file mode 100644 index 0000000000..8649d2d74d --- /dev/null +++ b/_ml-commons-plugin/agents-tools/tools/index-mapping-tool.md @@ -0,0 +1,120 @@ +--- +layout: default +title: Index Mapping tool +has_children: false +has_toc: false +nav_order: 30 +parent: Tools +grand_parent: Agents and tools +--- + + +# Index Mapping tool +**Introduced 2.13** +{: .label .label-purple } + + +The `IndexMappingTool` retrieves mapping and setting information for indexes in your cluster. + +## Step 1: Register a flow agent that will run the IndexMappingTool + +A flow agent runs a sequence of tools in order and returns the last tool's output. To create a flow agent, send the following register agent request: + +```json +POST /_plugins/_ml/agents/_register +{ + "name": "Test_Agent_For_IndexMapping_tool", + "type": "flow", + "description": "this is a test agent for the IndexMappingTool", + "tools": [ + { + "type": "IndexMappingTool", + "name": "DemoIndexMappingTool", + "parameters": { + "index": "${parameters.index}", + "input": "${parameters.question}" + } + } + ] +} +``` +{% include copy-curl.html %} + +For parameter descriptions, see [Register parameters](#register-parameters). + +OpenSearch responds with an agent ID: + +```json +{ + "agent_id": "9X7xWI0Bpc3sThaJdY9i" +} +``` + +## Step 2: Run the agent + +Before you run the agent, make sure that you add the sample OpenSearch Dashboards `Sample eCommerce orders` dataset. To learn more, see [Adding sample data]({{site.url}}{{site.baseurl}}/dashboards/quickstart#adding-sample-data). + +Then, run the agent by sending the following request and providing the index name and the question: + +```json +POST /_plugins/_ml/agents/9X7xWI0Bpc3sThaJdY9i/_execute +{ + "parameters": { + "index": [ "sample-ecommerce" ], + "question": "What fields are in the sample-ecommerce index?" + } +} +``` +{% include copy-curl.html %} + +OpenSearch returns the mappings and settings for the specified index: + +```json +{ + "inference_results": [ + { + "output": [ + { + "name": "response", + "result": """index: sample-ecommerce + +mappings: +properties={items_purchased_failure={type=integer}, items_purchased_success={type=integer}, order_id={type=integer}, timestamp={type=date}, total_revenue_usd={type=integer}} + + +settings: +index.creation_date=1706752839713 +index.number_of_replicas=1 +index.number_of_shards=1 +index.provided_name=sample-ecommerce +index.replication.type=DOCUMENT +index.uuid=UPYOQcAfRGqFAlSxcZlRjw +index.version.created=137217827 + + +""" + } + ] + } + ] +} +``` + +## Register parameters + +The following table lists all tool parameters that are available when registering an agent. + +Parameter | Type | Required/Optional | Description +:--- | :--- | :--- | :--- +`input` | String | Required | The user input used to return index information. 
+`index` | Array | Required | A comma-delimited list of one or more indexes for which to obtain mapping and setting information. Default is an empty list, which means all indexes. +`local` | Boolean | Optional | Whether to return information from the local node only instead of the cluster manager node (default is `false`). + +## Execute parameters + +The following table lists all tool parameters that are available when running the agent. + +Parameter | Type | Required/Optional | Description +:--- | :--- | :--- | :--- +`question` | String | Required | The natural language question to send to the LLM. +`index` | Array | Optional | A comma-delimited list of one or more indexes for which to obtain mapping and setting information. Default is an empty list, which means all indexes. \ No newline at end of file diff --git a/_ml-commons-plugin/agents-tools/tools/index.md b/_ml-commons-plugin/agents-tools/tools/index.md new file mode 100644 index 0000000000..8db522006e --- /dev/null +++ b/_ml-commons-plugin/agents-tools/tools/index.md @@ -0,0 +1,51 @@ +--- +layout: default +title: Tools +parent: Agents and tools +has_children: true +has_toc: false +nav_order: 20 +redirect_from: + - /ml-commons-plugin/extensibility/index/ +--- + +# Tools +**Introduced 2.13** +{: .label .label-purple } + +A _tool_ performs a set of specific tasks. The following table lists all tools that OpenSearch supports. + +Specify a tool by providing its `type`, `parameters`, and, optionally, a `description`. For example, you can specify an `AgentTool` as follows: + +```json +{ + "type": "AgentTool", + "description": "A general agent to answer any question", + "parameters": { + "agent_id": "9X7xWI0Bpc3sThaJdY9i" + } +} +``` + +Each tool takes a list of parameters specific to that tool. In the preceding example, the `AgentTool` takes an `agent_id` of the agent it will run. For a list of parameters, see each tool's documentation. + +|Tool | Description | +|:--- |:--- | +|[`AgentTool`]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/agent-tool/) |Runs any agent. | +|[`CatIndexTool`]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/cat-index-tool/) |Retrieves index information for the OpenSearch cluster. | +|[`IndexMappingTool`]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/index-mapping-tool/) |Retrieves index mapping and setting information for an index. | +|[`MLModelTool`]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/ml-model-tool/) |Runs machine learning models. | +|[`NeuralSparseSearchTool`]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/neural-sparse-tool/) | Performs sparse vector retrieval. | +|[`PPLTool`]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/ppl-tool/) |Translates natural language into a Piped Processing Language (PPL) query. | +|[`RAGTool`]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/rag-tool/) |Uses neural search or neural sparse search to retrieve documents and integrates a large language model to summarize the answers. | +|[`SearchAlertsTool`]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/search-alerts-tool/) |Searches for alerts. | +|[`SearchAnomalyDetectorsTool`]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/search-anomaly-detectors/) | Searches for anomaly detectors. 
| +|[`SearchAnomalyResultsTool`]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/search-anomaly-results/) | Searches anomaly detection results generated by anomaly detectors. | +|[`SearchIndexTool`]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/search-index-tool/) |Searches an index using a query written in query domain-specific language (DSL). | +|[`SearchMonitorsTool`]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/search-monitors-tool/) | Searches for alerting monitors. | +|[`VectorDBTool`]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/vector-db-tool/) |Performs dense vector retrieval. | +|[`VisualizationTool`]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/visualization-tool/) |Finds visualizations in OpenSearch Dashboards. | + +## Developer information + +The agents and tools framework is flexible and extensible. You can find the list of tools provided by OpenSearch in the [Tools library](https://github.com/opensearch-project/skills/tree/main/src/main/java/org/opensearch/agent/tools). For a different use case, you can build your own tool by implementing the [_Tool_ interface](https://github.com/opensearch-project/ml-commons/blob/2.x/spi/src/main/java/org/opensearch/ml/common/spi/tools/Tool.java). \ No newline at end of file diff --git a/_ml-commons-plugin/agents-tools/tools/ml-model-tool.md b/_ml-commons-plugin/agents-tools/tools/ml-model-tool.md new file mode 100644 index 0000000000..ceeda40528 --- /dev/null +++ b/_ml-commons-plugin/agents-tools/tools/ml-model-tool.md @@ -0,0 +1,167 @@ +--- +layout: default +title: ML Model tool +has_children: false +has_toc: false +nav_order: 40 +parent: Tools +grand_parent: Agents and tools +--- + + +# ML Model tool +**Introduced 2.13** +{: .label .label-purple } + + +The `MLModelTool` runs a machine learning (ML) model and returns inference results. + +## Step 1: Create a connector for a model + +The following example request creates a connector for a model hosted on [Amazon SageMaker](https://aws.amazon.com/pm/sagemaker/): + +```json +POST /_plugins/_ml/connectors/_create +{ + "name": "sagemaker model", + "description": "Test connector for Sagemaker model", + "version": 1, + "protocol": "aws_sigv4", + "credential": { + "access_key": "", + "secret_key": "" + }, + "parameters": { + "region": "us-east-1", + "service_name": "sagemaker" + }, + "actions": [ + { + "action_type": "predict", + "method": "POST", + "headers": { + "content-type": "application/json" + }, + "url": "", + "request_body": """{"prompt":"${parameters.prompt}"}""" + } + ] +} +``` +{% include copy-curl.html %} + +OpenSearch responds with a connector ID: + +```json +{ + "connector_id": "eJATWo0BkIylWTeYToTn" +} +``` + +## Step 2: Register and deploy the model + +To register and deploy the model to OpenSearch, send the following request, providing the connector ID from the previous step: + +```json +POST /_plugins/_ml/models/_register?deploy=true +{ + "name": "remote-inference", + "function_name": "remote", + "description": "test model", + "connector_id": "eJATWo0BkIylWTeYToTn" +} +``` +{% include copy-curl.html %} + +OpenSearch responds with a model ID: + +```json +{ + "task_id": "7X7pWI0Bpc3sThaJ4I8R", + "status": "CREATED", + "model_id": "h5AUWo0BkIylWTeYT4SU" +} +``` + +## Step 3: Register a flow agent that will run the MLModelTool + +A flow agent runs a sequence of tools in order and returns the last tool's output.
To create a flow agent, send the following register agent request, providing the model ID in the `model_id` parameter: + +```json +POST /_plugins/_ml/agents/_register +{ + "name": "Test agent for embedding model", + "type": "flow", + "description": "this is a test agent", + "tools": [ + { + "type": "MLModelTool", + "description": "A general tool to answer any question", + "parameters": { + "model_id": "h5AUWo0BkIylWTeYT4SU", + "prompt": "\n\nHuman:You are a professional data analyst. You will always answer question based on the given context first. If the answer is not directly shown in the context, you will analyze the data and find the answer. If you don't know the answer, just say don't know. \n\nHuman:${parameters.question}\n\nAssistant:" + } + } + ] +} +``` +{% include copy-curl.html %} + +For parameter descriptions, see [Register parameters](#register-parameters). + +OpenSearch responds with an agent ID: + +```json +{ + "agent_id": "9X7xWI0Bpc3sThaJdY9i" +} +``` + +## Step 4: Run the agent + +Run the agent by sending the following request: + +```json +POST /_plugins/_ml/agents/9X7xWI0Bpc3sThaJdY9i/_execute +{ + "parameters": { + "question": "what's the population increase of Seattle from 2021 to 2023" + } +} +``` +{% include copy-curl.html %} + +OpenSearch returns the inference results: + +```json +{ + "inference_results": [ + { + "output": [ + { + "name": "response", + "result": " I do not have direct data on the population increase of Seattle from 2021 to 2023 in the context provided. As a data analyst, I would need to research population statistics from credible sources like the US Census Bureau to analyze population trends and make an informed estimate. Without looking up actual data, I don't have enough information to provide a specific answer to the question." + } + ] + } + ] +} +``` + +## Register parameters + +The following table lists all tool parameters that are available when registering an agent. + +Parameter | Type | Required/Optional | Description +:--- | :--- | :--- | :--- +`model_id` | String | Required | The model ID of the large language model (LLM) to use for generating the response. +`prompt` | String | Optional | The prompt to provide to the LLM. +`response_field` | String | Optional | The name of the response field. Default is `response`. + +## Execute parameters + +The following table lists all tool parameters that are available when running the agent. + +Parameter | Type | Required/Optional | Description +:--- | :--- | :--- | :--- +`question` | String | Required | The natural language question to send to the LLM. \ No newline at end of file diff --git a/_ml-commons-plugin/agents-tools/tools/neural-sparse-tool.md b/_ml-commons-plugin/agents-tools/tools/neural-sparse-tool.md new file mode 100644 index 0000000000..9fee4dcbd2 --- /dev/null +++ b/_ml-commons-plugin/agents-tools/tools/neural-sparse-tool.md @@ -0,0 +1,222 @@ +--- +layout: default +title: Neural Sparse Search tool +has_children: false +has_toc: false +nav_order: 50 +parent: Tools +grand_parent: Agents and tools +--- + + +# Neural Sparse Search tool +**Introduced 2.13** +{: .label .label-purple } + + +The `NeuralSparseSearchTool` performs sparse vector retrieval. For more information about neural sparse search, see [Neural sparse search]({{site.url}}{{site.baseurl}}/search-plugins/neural-sparse-search/). + +## Step 1: Register and deploy a sparse encoding model + +OpenSearch supports several pretrained sparse encoding models. You can either use one of those models or your own custom model. 
For a list of supported pretrained models, see [Sparse encoding models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/pretrained-models/#sparse-encoding-models). For more information, see [OpenSearch-provided pretrained models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/pretrained-models/) and [Custom local models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/custom-local-models/). + +In this example, you'll use the `amazon/neural-sparse/opensearch-neural-sparse-encoding-v1` pretrained model for both ingestion and search. To register and deploy the model to OpenSearch, send the following request: + +```json +POST /_plugins/_ml/models/_register?deploy=true +{ + "name": "amazon/neural-sparse/opensearch-neural-sparse-encoding-v1", + "version": "1.0.1", + "model_format": "TORCH_SCRIPT" +} +``` +{% include copy-curl.html %} + +OpenSearch responds with a task ID for the model registration and deployment task: + +```json +{ + "task_id": "M_9KY40Bk4MTqirc5lP8", + "status": "CREATED" +} +``` + +You can monitor the status of the task by calling the Tasks API: + +```json +GET _plugins/_ml/tasks/M_9KY40Bk4MTqirc5lP8 +``` +{% include copy-curl.html %} + +Once the model is registered and deployed, the task `state` changes to `COMPLETED` and OpenSearch returns a model ID for the model: + +```json +{ + "model_id": "Nf9KY40Bk4MTqirc6FO7", + "task_type": "REGISTER_MODEL", + "function_name": "SPARSE_ENCODING", + "state": "COMPLETED", + "worker_node": [ + "UyQSTQ3nTFa3IP6IdFKoug" + ], + "create_time": 1706767869692, + "last_update_time": 1706767935556, + "is_async": true +} +``` + +## Step 2: Ingest data into an index + +First, you'll set up an ingest pipeline to encode documents using the sparse encoding model set up in the previous step: + +```json +PUT /_ingest/pipeline/pipeline-sparse +{ + "description": "An sparse encoding ingest pipeline", + "processors": [ + { + "sparse_encoding": { + "model_id": "Nf9KY40Bk4MTqirc6FO7", + "field_map": { + "passage_text": "passage_embedding" + } + } + } + ] +} +``` +{% include copy-curl.html %} + +Next, create an index specifying the pipeline as the default pipeline: + +```json +PUT index_for_neural_sparse +{ + "settings": { + "default_pipeline": "pipeline-sparse" + }, + "mappings": { + "properties": { + "passage_embedding": { + "type": "rank_features" + }, + "passage_text": { + "type": "text" + } + } + } +} +``` +{% include copy-curl.html %} + +Last, ingest data into the index by sending a bulk request: + +```json +POST _bulk +{ "index" : { "_index" : "index_for_neural_sparse", "_id" : "1" } } +{ "passage_text" : "company AAA has a history of 123 years" } +{ "index" : { "_index" : "index_for_neural_sparse", "_id" : "2" } } +{ "passage_text" : "company AAA has over 7000 employees" } +{ "index" : { "_index" : "index_for_neural_sparse", "_id" : "3" } } +{ "passage_text" : "Jack and Mark established company AAA" } +{ "index" : { "_index" : "index_for_neural_sparse", "_id" : "4" } } +{ "passage_text" : "company AAA has a net profit of 13 millions in 2022" } +{ "index" : { "_index" : "index_for_neural_sparse", "_id" : "5" } } +{ "passage_text" : "company AAA focus on the large language models domain" } +``` +{% include copy-curl.html %} + +## Step 3: Register a flow agent that will run the NeuralSparseSearchTool + +A flow agent runs a sequence of tools in order and returns the last tool's output. To create a flow agent, send the following request, providing the model ID for the model set up in Step 1. 
This model will encode your queries into sparse vector embeddings: + +```json +POST /_plugins/_ml/agents/_register +{ + "name": "Test_Neural_Sparse_Agent_For_RAG", + "type": "flow", + "tools": [ + { + "type": "NeuralSparseSearchTool", + "parameters": { + "description":"use this tool to search data from the knowledge base of company AAA", + "model_id": "Nf9KY40Bk4MTqirc6FO7", + "index": "index_for_neural_sparse", + "embedding_field": "passage_embedding", + "source_field": ["passage_text"], + "input": "${parameters.question}", + "doc_size":2 + } + } + ] +} +``` +{% include copy-curl.html %} + +For parameter descriptions, see [Register parameters](#register-parameters). + +OpenSearch responds with an agent ID: + +```json +{ + "agent_id": "9X7xWI0Bpc3sThaJdY9i" +} +``` + +## Step 4: Run the agent + +Before you run the agent, make sure that you add the sample OpenSearch Dashboards `Sample web logs` dataset. To learn more, see [Adding sample data]({{site.url}}{{site.baseurl}}/dashboards/quickstart#adding-sample-data). + +Then, run the agent by sending the following request: + +```json +POST /_plugins/_ml/agents/9X7xWI0Bpc3sThaJdY9i/_execute +{ + "parameters": { + "question":"how many employees does AAA have?" + } +} +``` +{% include copy-curl.html %} + +OpenSearch returns the inference results: + +```json +{ + "inference_results": [ + { + "output": [ + { + "name": "response", + "result": """{"_index":"index_for_neural_sparse","_source":{"passage_text":"company AAA has over 7000 employees"},"_id":"2","_score":30.586042} +{"_index":"index_for_neural_sparse","_source":{"passage_text":"company AAA has a history of 123 years"},"_id":"1","_score":16.088133} +""" + } + ] + } + ] +} +``` + +## Register parameters + +The following table lists all tool parameters that are available when registering an agent. + +Parameter | Type | Required/Optional | Description +:--- | :--- | :--- | :--- +`model_id` | String | Required | The model ID of the sparse encoding model to use at search time. +`index` | String | Required | The index to search. +`embedding_field` | String | Required | When the neural sparse model encodes raw text documents, the encoding result is saved in a field. Specify this field as the `embedding_field`. Neural sparse search matches documents to the query by calculating the similarity score between the query text and the text in the document's `embedding_field`. +`source_field` | String | Required | The document field or fields to return. You can provide a list of multiple fields as an array of strings, for example, `["field1", "field2"]`. +`input` | String | Required for flow agent | Runtime input sourced from flow agent parameters. If using a large language model (LLM), this field is populated with the LLM response. +`name` | String | Optional | The tool name. Useful when an LLM needs to select an appropriate tool for a task. +`description` | String | Optional | A description of the tool. Useful when an LLM needs to select an appropriate tool for a task. +`doc_size` | Integer | Optional | The number of documents to fetch. Default is `2`. + +## Execute parameters + +The following table lists all tool parameters that are available when running the agent. + +Parameter | Type | Required/Optional | Description +:--- | :--- | :--- | :--- +`question` | String | Required | The natural language question to send to the LLM. 
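
If you want to verify the sparse encoding model and the ingested index outside of the agent, you can query the index directly. The following is a minimal sketch of a `neural_sparse` query that reuses the model ID, index, and question from this example:

```json
GET index_for_neural_sparse/_search
{
  "query": {
    "neural_sparse": {
      "passage_embedding": {
        "query_text": "how many employees does AAA have?",
        "model_id": "Nf9KY40Bk4MTqirc6FO7"
      }
    }
  }
}
```
{% include copy-curl.html %}

The top hits should match the documents returned by the agent in the preceding response.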
diff --git a/_ml-commons-plugin/agents-tools/tools/ppl-tool.md b/_ml-commons-plugin/agents-tools/tools/ppl-tool.md new file mode 100644 index 0000000000..72d8ba30b5 --- /dev/null +++ b/_ml-commons-plugin/agents-tools/tools/ppl-tool.md @@ -0,0 +1,204 @@ +--- +layout: default +title: PPL tool +has_children: false +has_toc: false +nav_order: 60 +parent: Tools +grand_parent: Agents and tools +--- + +# PPL tool +**Introduced 2.13** +{: .label .label-purple } + +The `PPLTool` translates natural language into a PPL query. The tool provides an `execute` flag to specify whether to run the query. If you set the flag to `true`, the `PPLTool` runs the query and returns the query and the results. + +## Prerequisite + +To create a PPL tool, you need a fine-tuned model that translates natural language into PPL queries. Alternatively, you can use large language models for prompt-based translation. The PPL tool supports the Anthropic Claude and OpenAI models. + +## Step 1: Create a connector for a model + +The following example request creates a connector for a model hosted on Amazon SageMaker: + +```json +POST /_plugins/_ml/connectors/_create +{ + "name": "sagemaker: t2ppl", + "description": "Test connector for Sagemaker t2ppl model", + "version": 1, + "protocol": "aws_sigv4", + "credential": { + "access_key": "", + "secret_key": "" + }, + "parameters": { + "region": "us-east-1", + "service_name": "sagemaker" + }, + "actions": [ + { + "action_type": "predict", + "method": "POST", + "headers": { + "content-type": "application/json" + }, + "url": "", + "request_body": """{"prompt":"${parameters.prompt}"}""" + } + ] +} +``` +{% include copy-curl.html %} + +OpenSearch responds with a connector ID: + +```json +{ + "connector_id": "eJATWo0BkIylWTeYToTn" +} +``` + +For information about connecting to an Anthropic Claude model or OpenAI models, see [Connectors]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/connectors/). + +## Step 2: Register and deploy the model + +To register and deploy the model to OpenSearch, send the following request, providing the connector ID from the previous step: + +```json +POST /_plugins/_ml/models/_register?deploy=true +{ + "name": "remote-inference", + "function_name": "remote", + "description": "test model", + "connector_id": "eJATWo0BkIylWTeYToTn" +} +``` +{% include copy-curl.html %} + +OpenSearch responds with a model ID: + +```json +{ + "task_id": "7X7pWI0Bpc3sThaJ4I8R", + "status": "CREATED", + "model_id": "h5AUWo0BkIylWTeYT4SU" +} +``` + + +## Step 3: Register a flow agent that will run the PPLTool + + +A flow agent runs a sequence of tools in order and returns the last tool's output. To create a flow agent, send the following register agent request, providing the model ID in the `model_id` parameter. To run the generated query, set `execute` to `true`: + +```json +POST /_plugins/_ml/agents/_register +{ + "name": "Test_Agent_For_PPL", + "type": "flow", + "description": "this is a test agent", + "memory": { + "type": "demo" + }, + "tools": [ + { + "type": "PPLTool", + "name": "TransferQuestionToPPLAndExecuteTool", + "description": "Use this tool to transfer natural language to generate PPL and execute PPL to query inside. Use this tool after you know the index name, otherwise, call IndexRoutingTool first. 
The input parameters are: {index:IndexName, question:UserQuestion}", + "parameters": { + "model_id": "h5AUWo0BkIylWTeYT4SU", + "model_type": "FINETUNE", + "execute": true + } + } + ] +} +``` +{% include copy-curl.html %} + +For parameter descriptions, see [Register parameters](#register-parameters). + +OpenSearch responds with an agent ID: + +```json +{ + "agent_id": "9X7xWI0Bpc3sThaJdY9i" +} +``` + +## Step 4: Run the agent + +Before you run the agent, make sure that you add the sample OpenSearch Dashboards `Sample web logs` dataset. To learn more, see [Adding sample data]({{site.url}}{{site.baseurl}}/dashboards/quickstart#adding-sample-data). + +Then, run the agent by sending the following request: + +```json +POST /_plugins/_ml/agents/9X7xWI0Bpc3sThaJdY9i/_execute +{ + "parameters": { + "verbose": true, + "question": "what is the error rate yesterday", + "index": "opensearch_dashboards_sample_data_logs" + } +} +``` +{% include copy-curl.html %} + +OpenSearch returns the PPL query and the query results: + +```json +{ + "inference_results": [ + { + "output": [ + { + "name": "response", + "result":"{\"ppl\":\"source\=opensearch_dashboards_sample_data_logs| where timestamp \> DATE_SUB(NOW(), INTERVAL 1 DAY) AND timestamp \< NOW() | eval is_error\=IF(response\=\'200\', 0, 1.0) | stats AVG(is_error) as error_rate\",\"executionResult\":\"{\\n \\\"schema\\\": [\\n {\\n \\\"name\\\": \\\"error_rate\\\",\\n \\\"type\\\": \\\"double\\\"\\n }\\n ],\\n \\\"datarows\\\": [\\n [\\n null\\n ]\\n ],\\n \\\"total\\\": 1,\\n \\\"size\\\": 1\\n}\"}" + } + ] + } + ] +} +``` + +If you set `execute` to `false`, OpenSearch only returns the query but does not run it: + +```json +{ + "inference_results": [ + { + "output": [ + { + "name": "response", + "result": "source=opensearch_dashboards_sample_data_logs| where timestamp > DATE_SUB(NOW(), INTERVAL 1 DAY) AND timestamp < NOW() | eval is_error=IF(response='200', 0, 1.0) | stats AVG(is_error) as error_rate" + } + ] + } + ] +} +``` + +## Register parameters + +The following table lists all tool parameters that are available when registering an agent. + +Parameter | Type | Required/Optional | Description +:--- | :--- | :--- | :--- +`model_id` | String | Required | The model ID of the large language model (LLM) to use for translating text into a PPL query. +`model_type` | String | Optional | The model type. Valid values are `CLAUDE` (Anthropic Claude model), `OPENAI` (OpenAI models), and `FINETUNE` (custom fine-tuned model). +`prompt` | String | Optional | The prompt to provide to the LLM. +`execute` | Boolean | Optional | Specifies whether to run the PPL query. Default is `true`. +`input` | Object | Optional | Contains two parameters that specify the index to search and the question for the LLM. For example, `"input": "{\"index\": \"${parameters.index}\", \"question\": ${parameters.question} }"`. +`head` | Integer | Optional | Limits the number of returned execution results if `execute` is set to `true`. Default is `-1` (no limit). + +## Execute parameters + +The following table lists all tool parameters that are available when running the agent. + +Parameter | Type | Required/Optional | Description +:--- | :--- | :--- | :--- +`index` | String | Required | The index on which to run the PPL query. +`question` | String | Required | The natural language question to send to the LLM. +`verbose` | Boolean | Optional | Whether to provide verbose output. Default is `false`. 
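
To inspect or debug a query that the tool generates, you can also run it yourself against the PPL endpoint. The following is a minimal sketch that reuses the query returned in the preceding example:

```json
POST /_plugins/_ppl
{
  "query": "source=opensearch_dashboards_sample_data_logs| where timestamp > DATE_SUB(NOW(), INTERVAL 1 DAY) AND timestamp < NOW() | eval is_error=IF(response='200', 0, 1.0) | stats AVG(is_error) as error_rate"
}
```
{% include copy-curl.html %}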
\ No newline at end of file diff --git a/_ml-commons-plugin/agents-tools/tools/rag-tool.md b/_ml-commons-plugin/agents-tools/tools/rag-tool.md new file mode 100644 index 0000000000..1f6fafe49a --- /dev/null +++ b/_ml-commons-plugin/agents-tools/tools/rag-tool.md @@ -0,0 +1,146 @@ +--- +layout: default +title: RAG tool +has_children: false +has_toc: false +nav_order: 65 +parent: Tools +grand_parent: Agents and tools +--- + + +# RAG tool +**Introduced 2.13** +{: .label .label-purple } + + +The `RAGTool` performs retrieval-augmented generation (RAG). For more information about RAG, see [Conversational search]({{site.url}}{{site.baseurl}}/search-plugins/conversational-search/). + +RAG calls a large language model (LLM) and supplements its knowledge by providing relevant OpenSearch documents along with the user question. To retrieve relevant documents from an OpenSearch index, you'll need a text embedding model that facilitates vector search. + +The RAG tool supports the following search methods: + +- [Neural search]({{site.url}}{{site.baseurl}}/search-plugins/neural-search/): Dense vector retrieval, which uses a text embedding model. +- [Neural sparse search]({{site.url}}{{site.baseurl}}/search-plugins/neural-sparse-search/): Sparse vector retrieval, which uses a sparse encoding model. + +## Before you start + +To register and deploy a text embedding model and an LLM and ingest data into an index, perform Steps 1--5 of the [Agents and tools tutorial]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/agents-tools-tutorial/). + +The following example uses neural search. To configure neural sparse search and deploy a sparse encoding model, see [Neural sparse search]({{site.url}}{{site.baseurl}}/search-plugins/neural-sparse-search/). + + +## Step 1: Register a flow agent that will run the RAGTool + + +A flow agent runs a sequence of tools in order and returns the last tool's output. To create a flow agent, send the following request, providing the text embedding model ID in the `embedding_model_id` parameter and the LLM model ID in the `inference_model_id` parameter: + +```json +POST /_plugins/_ml/agents/_register +{ + "name": "Test_Agent_For_RagTool", + "type": "flow", + "description": "this is a test flow agent", + "tools": [ + { + "type": "RAGTool", + "description": "A description of the tool", + "parameters": { + "embedding_model_id": "Hv_PY40Bk4MTqircAVmm", + "inference_model_id": "SNzSY40B_1JGmyB0WbfI", + "index": "my_test_data", + "embedding_field": "embedding", + "query_type": "neural", + "source_field": [ + "text" + ], + "input": "${parameters.question}", + "prompt": "\n\nHuman:You are a professional data analyst. You will always answer question based on the given context first. If the answer is not directly shown in the context, you will analyze the data and find the answer. If you don't know the answer, just say don't know. \n\n Context:\n${parameters.output_field}\n\nHuman:${parameters.question}\n\nAssistant:" + } + } +] +} +``` +{% include copy-curl.html %} + +For parameter descriptions, see [Register parameters](#register-parameters). + +OpenSearch responds with an agent ID: + +```json +{ + "agent_id": "9X7xWI0Bpc3sThaJdY9i" +} +``` + +To create a conversational agent containing a `RAGTool`, see [Conversational agents]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/index/#conversational-agents). + +## Step 2: Run the agent + +Before you run the agent, make sure that you add the sample OpenSearch Dashboards `Sample web logs` dataset. 
To learn more, see [Adding sample data]({{site.url}}{{site.baseurl}}/dashboards/quickstart#adding-sample-data). + +Then, run the agent by sending the following request: + +```json +POST /_plugins/_ml/agents/9X7xWI0Bpc3sThaJdY9i/_execute +{ + "parameters": { + "question": "what's the population increase of Seattle from 2021 to 2023" + } +} +``` +{% include copy-curl.html %} + +OpenSearch performs vector search and returns the relevant documents: + +```json +{ + "inference_results": [ + { + "output": [ + { + "name": "response", + "result": """{"_index":"my_test_data","_source":{"text":"Chart and table of population level and growth rate for the Seattle metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\\n + The current metro area population of Seattle in 2023 is 3,519,000, a 0.86% increase from 2022.\\n + The metro area population of Seattle in 2022 was 3,489,000, a 0.81% increase from 2021.\\n + The metro area population of Seattle in 2021 was 3,461,000, a 0.82% increase from 2020.\\n + The metro area population of Seattle in 2020 was 3,433,000, a 0.79% increase from 2019."},"_id":"6","_score":0.8173238} + {"_index":"my_test_data","_source":{"text":"Chart and table of population level and growth rate for the New York City metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\\n + The current metro area population of New York City in 2023 is 18,937,000, a 0.37% increase from 2022.\\n + The metro area population of New York City in 2022 was 18,867,000, a 0.23% increase from 2021.\\n + The metro area population of New York City in 2021 was 18,823,000, a 0.1% increase from 2020.\\n + The metro area population of New York City in 2020 was 18,804,000, a 0.01% decline from 2019."},"_id":"2","_score":0.6641471} + """ + } + ] + } + ] +} +``` + +## Register parameters + +The following table lists all tool parameters that are available when registering an agent. + +Parameter | Type | Required/Optional | Description +:--- | :--- | :--- | :--- +`embedding_model_id` | String | Required | The model ID of the model to use for generating vector embeddings. +`inference_model_id` | String | Required | The model ID of the LLM to use for inference. +`index` | String | Required | The index from which to retrieve relevant documents to pass to the LLM. +`embedding_field` | String | Required | When the model encodes raw text documents, the encoding result is saved in a field. Specify this field as the `embedding_field`. Neural search matches documents to the query by calculating the similarity score between the query text and the text in the document's `embedding_field`. +`source_field` | String | Required | The document field or fields to return. You can provide a list of multiple fields as an array of strings, for example, `["field1", "field2"]`. +`input` | String | Required for flow agent | Runtime input sourced from flow agent parameters. If using an LLM, this field is populated with the LLM response. +`output_field` | String | Optional | The name of the output field. Default is `response`. +`query_type` | String | Optional | Specifies the type of query to run to perform neural search. Valid values are `neural` (for dense retrieval) and `neural_sparse` (for sparse retrieval). Default is `neural`. +`doc_size` | Integer | Optional | The number of documents to fetch. Default is `2`. +`prompt` | String | Optional | The prompt to provide to the LLM. 
+`k` | Integer | Optional | The number of nearest neighbors to search for when performing neural search. Default is 10. +`enable_Content_Generation` | Boolean | Optional | If `true`, returns results generated by an LLM. If `false`, returns results directly without LLM-assisted content generation. Default is `true`. + +## Execute parameters + +The following table lists all tool parameters that are available when running the agent. + +Parameter | Type | Required/Optional | Description +:--- | :--- | :--- | :--- +`question` | String | Required | The natural language question to send to the LLM. \ No newline at end of file diff --git a/_ml-commons-plugin/agents-tools/tools/search-alerts-tool.md b/_ml-commons-plugin/agents-tools/tools/search-alerts-tool.md new file mode 100644 index 0000000000..76f9e4b4dc --- /dev/null +++ b/_ml-commons-plugin/agents-tools/tools/search-alerts-tool.md @@ -0,0 +1,124 @@ +--- +layout: default +title: Search Alerts tool +has_children: false +has_toc: false +nav_order: 67 +parent: Tools +grand_parent: Agents and tools +--- + + +# Search Alerts tool +**Introduced 2.13** +{: .label .label-purple } + + +The `SearchAlertsTool` retrieves information about generated alerts. For more information about alerts, see [Alerting]({{site.url}}{{site.baseurl}}/observing-your-data/alerting/index/). + +## Step 1: Register a flow agent that will run the SearchAlertsTool + +A flow agent runs a sequence of tools in order and returns the last tool's output. To create a flow agent, send the following register agent request: + +```json +POST /_plugins/_ml/agents/_register +{ + "name": "Test_Agent_For_Search_Alerts_Tool", + "type": "flow", + "description": "this is a test agent for the SearchAlertsTool", + "memory": { + "type": "demo" + }, + "tools": [ + { + "type": "SearchAlertsTool", + "name": "DemoSearchAlertsTool", + "parameters": {} + } + ] +} +``` +{% include copy-curl.html %} + +For parameter descriptions, see [Register parameters](#register-parameters). + +OpenSearch responds with an agent ID: + +```json +{ + "agent_id": "EuJYYo0B9RaBCvhuy1q8" +} +``` + +## Step 2: Run the agent + +Run the agent by sending the following request: + +```json +POST /_plugins/_ml/agents/EuJYYo0B9RaBCvhuy1q8/_execute +{ + "parameters": { + "question": "Do I have any alerts?" 
+ } +} +``` +{% include copy-curl.html %} + +OpenSearch responds with a list of generated alerts and the total number of alerts: + +```json +{ + "inference_results": [ + { + "output": [ + { + "name": "response", + "result": "Alerts=[Alert(id=rv9nYo0Bk4MTqirc_DkW, version=394, schemaVersion=5, monitorId=ZuJnYo0B9RaBCvhuEVux, workflowId=, workflowName=, monitorName=test-monitor-2, monitorVersion=1, monitorUser=User[name=admin, backend_roles=[admin], roles=[own_index, all_access], custom_attribute_names=[], user_requested_tenant=null], triggerId=ZeJnYo0B9RaBCvhuEVul, triggerName=t-1, findingIds=[], relatedDocIds=[], state=ACTIVE, startTime=2024-02-01T02:03:18.420Z, endTime=null, lastNotificationTime=2024-02-01T08:36:18.409Z, acknowledgedTime=null, errorMessage=null, errorHistory=[], severity=1, actionExecutionResults=[], aggregationResultBucket=null, executionId=ZuJnYo0B9RaBCvhuEVux_2024-02-01T02:03:18.404853331_51c18f2c-5923-47c3-b476-0f5a66c6319b, associatedAlertIds=[])]TotalAlerts=1" + } + ] + } + ] +} +``` + +If no alerts are found, OpenSearch responds with an empty array in the results: + +```json +{ + "inference_results": [ + { + "output": [ + { + "name": "response", + "result": "Alerts=[]TotalAlerts=0" + } + ] + } + ] +} +``` + +## Register parameters + +The following table lists all tool parameters that are available when registering an agent. All parameters are optional. + +Parameter | Type | Description +:--- | :--- | :--- +`alertIds` | Array | The ID of the alert to search for. +`monitorId` | String | The name of the monitor by which to filter the alerts. +`workflowIds` | Array | A list of workflow IDs by which to filter the alerts. +`alertState` | String | The alert state by which to filter the alerts. Valid values are `ALL`, `ACTIVE`, `ERROR`, `COMPLETED`, and `ACKNOWLEDGED`. Default is `ALL`. +`severityLevel` | String| The severity level by which to filter the alerts. Valid values are `ALL`, `1`, `2`, and `3`. Default is `ALL`. +`searchString` | String | The search string to use for searching for a specific alert. +`sortOrder`| String | The sort order of the results. Valid values are `asc` (ascending) and `desc` (descending). Default is `asc`. +`sortString`| String | Specifies the monitor field by which to sort the results. Default is `monitor_name.keyword`. +`size` | Integer | The number of results to return. Default is `20`. +`startIndex`| Integer | The paginated index of the alert to start from. Default is `0`. + +## Execute parameters + +The following table lists all tool parameters that are available when running the agent. + +Parameter | Type | Required/Optional | Description +:--- | :--- | :--- | :--- +`question` | String | Required | The natural language question to send to the LLM. diff --git a/_ml-commons-plugin/agents-tools/tools/search-anomaly-detectors.md b/_ml-commons-plugin/agents-tools/tools/search-anomaly-detectors.md new file mode 100644 index 0000000000..9f31dea057 --- /dev/null +++ b/_ml-commons-plugin/agents-tools/tools/search-anomaly-detectors.md @@ -0,0 +1,109 @@ +--- +layout: default +title: Search Anomaly Detectors tool +has_children: false +has_toc: false +nav_order: 70 +parent: Tools +grand_parent: Agents and tools +--- + + +# Search Anomaly Detectors tool +**Introduced 2.13** +{: .label .label-purple } + + +The `SearchAnomalyDetectorsTool` retrieves information about anomaly detectors set up on your cluster. For more information about anomaly detectors, see [Anomaly detection]({{site.url}}{{site.baseurl}}/observing-your-data/ad/index/). 
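
The tool accepts optional filters at registration time (see [Register parameters](#register-parameters)). For example, the following is a rough sketch of a tool definition that returns only currently running, non-high-cardinality detectors; the tool name and parameter values here are illustrative assumptions:

```json
{
  "type": "SearchAnomalyDetectorsTool",
  "name": "RunningDetectorsTool",
  "parameters": {
    "running": true,
    "highCardinality": false
  }
}
```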
+ +## Step 1: Register a flow agent that will run the SearchAnomalyDetectorsTool + +A flow agent runs a sequence of tools in order and returns the last tool's output. To create a flow agent, send the following register agent request: + +```json +POST /_plugins/_ml/agents/_register +{ + "name": "Test_Agent_For_Search_Anomaly_Detectors_Tool", + "type": "flow", + "description": "this is a test agent for the SearchAnomalyDetectorsTool", + "memory": { + "type": "demo" + }, + "tools": [ + { + "type": "SearchAnomalyDetectorsTool", + "name": "DemoSearchAnomalyDetectorsTool", + "parameters": {} + } + ] +} +``` +{% include copy-curl.html %} + +For parameter descriptions, see [Register parameters](#register-parameters). + +OpenSearch responds with an agent ID: + +```json +{ + "agent_id": "EuJYYo0B9RaBCvhuy1q8" +} +``` + +## Step 2: Run the agent + +Run the agent by sending the following request: + +```json +POST /_plugins/_ml/agents/EuJYYo0B9RaBCvhuy1q8/_execute +{ + "parameters": { + "question": "Do I have any anomaly detectors?" + } +} +``` +{% include copy-curl.html %} + +OpenSearch responds with a list of anomaly detectors set up on your cluster and the total number of anomaly detectors: + +```json +{ + "inference_results": [ + { + "output": [ + { + "name": "response", + "result": "AnomalyDetectors=[{id=y2M-Yo0B-yCFzT-N_XXU,name=sample-http-responses-detector,type=SINGLE_ENTITY,description=A sample detector to detect anomalies with HTTP response code logs.,index=[sample-http-responses],lastUpdateTime=1706750311891}]TotalAnomalyDetectors=1" + } + ] + } + ] +} +``` + +## Register parameters + +The following table lists all tool parameters that are available when registering an agent. All parameters are optional. + +Parameter | Type | Description +:--- | :--- | :--- +`detectorName` | String | The name of the detector to search for. +`detectorNamePattern` | String | A wildcard query used to match the detector name to search for. +`indices` | String | The index name or index pattern of the indexes that the returned detectors are using as data sources. +`highCardinality` | Boolean | Whether to return information about high-cardinality detectors. Leave this parameter unset (or set it to `null`) to return information about both high-cardinality (multi-entity) and non-high-cardinality (single-entity) detectors. Set this parameter to `true` to only return information about high-cardinality detectors. Set this parameter to `false` to only return information about non-high-cardinality detectors. +`lastUpdateTime` | Long | Specifies the earliest last updated time of the detectors to return, in epoch milliseconds. Default is `null`. +`sortOrder` |String | The sort order for the results. Valid values are `asc` (ascending) and `desc` (descending). Default is `desc`. +`sortString`| String | Specifies the detector field by which to sort the results. Default is `name.keyword`. +`size` | Integer | The number of results to return. Default is `20`. +`startIndex`| Integer | The paginated index of the detector to start from. Default is `0`. +`running`| Boolean | Whether to return information about detectors that are currently running. Leave this parameter unset (or set it to `null`) to return both running and non-running detector information. Set this parameter to `true` to only return information about running detectors. Set this parameter to `false` to return only information about detectors that are not currently running. Default is `null`. 
+`disabled` | Boolean | Whether to return information about detectors that are currently disabled. Leave this parameter unset (or set it to `null`) to return information about both enabled and disabled detectors. Set this parameter to `true` to return only information about disabled detectors. Set this parameter to `false` to return only information about enabled detectors. Default is `null`. +`failed` | Boolean | Whether to return information about detectors that are currently failing. Leave this parameter unset (or set it to `null`) to return information about both failed and non-failed detectors. Set this parameter to `true` to return only information about failed detectors. Set this parameter to `false` to return only information about non-failed detectors. Default is `null`. + +## Execute parameters + +The following table lists all tool parameters that are available when running the agent. + +Parameter | Type | Required/Optional | Description +:--- | :--- | :--- | :--- +`question` | String | Required | The natural language question to send to the LLM. diff --git a/_ml-commons-plugin/agents-tools/tools/search-anomaly-results.md b/_ml-commons-plugin/agents-tools/tools/search-anomaly-results.md new file mode 100644 index 0000000000..2f2728e32d --- /dev/null +++ b/_ml-commons-plugin/agents-tools/tools/search-anomaly-results.md @@ -0,0 +1,123 @@ +--- +layout: default +title: Search Anomaly Results tool +has_children: false +has_toc: false +nav_order: 80 +parent: Tools +grand_parent: Agents and tools +--- + + +# Search Anomaly Results tool +**Introduced 2.13** +{: .label .label-purple } + + +The `SearchAnomalyResultsTool` retrieves information about anomaly detector results. For more information about anomaly detectors, see [Anomaly detection]({{site.url}}{{site.baseurl}}/observing-your-data/ad/index/). + +## Step 1: Register a flow agent that will run the SearchAnomalyResultsTool + +A flow agent runs a sequence of tools in order and returns the last tool's output. To create a flow agent, send the following register agent request: + +```json +POST /_plugins/_ml/agents/_register +{ + "name": "Test_Agent_For_Search_Anomaly_Results_Tool", + "type": "flow", + "description": "this is a test agent for the SearchAnomalyResultsTool", + "memory": { + "type": "demo" + }, + "tools": [ + { + "type": "SearchAnomalyResultsTool", + "name": "DemoSearchAnomalyResultsTool", + "parameters": {} + } + ] +} +``` +{% include copy-curl.html %} + +For parameter descriptions, see [Register parameters](#register-parameters). + +OpenSearch responds with an agent ID: + +```json +{ + "agent_id": "HuJZYo0B9RaBCvhuUlpy" +} +``` + +## Step 2: Run the agent + +Run the agent by sending the following request: + +```json +POST /_plugins/_ml/agents/HuJZYo0B9RaBCvhuUlpy/_execute +{ + "parameters": { + "question": "Do I have any anomalies?" 
+ } +} +``` +{% include copy-curl.html %} + +OpenSearch responds with a list of individual anomaly detectors set up on your cluster (where each result contains the detector ID, the anomaly grade, and the confidence level) and the total number of anomaly results found: + +```json +{ + "inference_results": [ + { + "output": [ + { + "name": "response", + "result": "AnomalyResults=[{detectorId=ef9lYo0Bk4MTqircmjnm,grade=1.0,confidence=0.9403051246569198}{detectorId=E-JlYo0B9RaBCvhunFtw,grade=1.0,confidence=0.9163498216870274}]TotalAnomalyResults=2" + } + ] + } + ] +} +``` + +If no anomalies are found, OpenSearch responds with an empty array in the results: + +```json +{ + "inference_results": [ + { + "output": [ + { + "name": "response", + "result": "AnomalyResults=[]TotalAnomalyResults=0" + } + ] + } + ] +} +``` + +## Register parameters + +The following table lists all tool parameters that are available when registering an agent. All parameters are optional. + +Parameter | Type | Description +:--- | :--- | :--- +`detectorId` | String | The ID of the detector from which to return results. +`realTime` | Boolean | Whether to return real-time anomaly detector results. Set this parameter to `false` to return only historical analysis results. +`anomalyGradeThreshold` | Float | The minimum anomaly grade for the returned anomaly detector results. Anomaly grade is a number between 0 and 1 that indicates how anomalous a data point is. +`dataStartTime` | Long | The earliest time for which to return anomaly detector results, in epoch milliseconds. +`dataEndTime` | Long | The latest time for which to return anomaly detector results, in epoch milliseconds. +`sortOrder` |String | The sort order for the results. Valid values are `asc` (ascending) and `desc` (descending). Default is `desc`. +`sortString`| String | Specifies the detector field by which to sort the results. Default is `data_start_time`. +`size` | Integer | The number of results to return. Default is `20`. +`startIndex`| Integer | The paginated index of the result to start from. Default is `0`. + +## Execute parameters + +The following table lists all tool parameters that are available when running the agent. + +Parameter | Type | Required/Optional | Description +:--- | :--- | :--- | :--- +`question` | String | Required | The natural language question to send to the LLM. diff --git a/_ml-commons-plugin/agents-tools/tools/search-index-tool.md b/_ml-commons-plugin/agents-tools/tools/search-index-tool.md new file mode 100644 index 0000000000..b023522893 --- /dev/null +++ b/_ml-commons-plugin/agents-tools/tools/search-index-tool.md @@ -0,0 +1,120 @@ +--- +layout: default +title: Search Index tool +has_children: false +has_toc: false +nav_order: 90 +parent: Tools +grand_parent: Agents and tools +--- + + +# Search Index tool +**Introduced 2.13** +{: .label .label-purple } + + +The `SearchIndexTool` searches an index using a query written in query domain-specific language (DSL) and returns the query results. + +## Step 1: Register a flow agent that will run the SearchIndexTool + +A flow agent runs a sequence of tools in order and returns the last tool's output. 
To create a flow agent, send the following register agent request: + +```json +POST /_plugins/_ml/agents/_register +{ + "name": "Test_Agent_For_Search_Index_Tool", + "type": "flow", + "description": "this is a test for search index tool", + "memory": { + "type": "demo" + }, + "tools": [ + { + "type": "SearchIndexTool" + } + ] +} +``` +{% include copy-curl.html %} + +OpenSearch responds with an agent ID: + +```json +{ + "agent_id": "9X7xWI0Bpc3sThaJdY9i" +} +``` + +## Step 2: Run the agent + +Before you run the agent, make sure that you add the sample OpenSearch Dashboards `Sample eCommerce orders` dataset. To learn more, see [Adding sample data]({{site.url}}{{site.baseurl}}/dashboards/quickstart#adding-sample-data). + +Then, run the agent by sending the following request. The `SearchIndexTool` takes one parameter named `input`. This parameter includes the index name and the query: + +```json +POST /_plugins/_ml/agents/9X7xWI0Bpc3sThaJdY9i/_execute +{ + "parameters": { + "input": "{\"index\": \"opensearch_dashboards_sample_data_ecommerce\", \"query\": {\"size\": 20, \"_source\": \"email\"}}" + } +} +``` +{% include copy-curl.html %} + +For parameter descriptions, see [Execute parameters](#execute-parameters). + +The query passed in the previous request is equivalent to the following query: + +```json +GET opensearch_dashboards_sample_data_ecommerce/_search +{ + "size": 20, + "_source": "email" +} +``` + +OpenSearch returns the query results: + +```json +{ + "inference_results": [ + { + "output": [ + { + "name": "response", + "result": """{"_index":"opensearch_dashboards_sample_data_ecommerce","_source":{"email":"eddie@underwood-family.zzz"},"_id":"_bJVWY0BAehlDanXJnAJ","_score":1.0} +{"_index":"opensearch_dashboards_sample_data_ecommerce","_source":{"email":"mary@bailey-family.zzz"},"_id":"_rJVWY0BAehlDanXJnAJ","_score":1.0} +{"_index":"opensearch_dashboards_sample_data_ecommerce","_source":{"email":"gwen@butler-family.zzz"},"_id":"_7JVWY0BAehlDanXJnAJ","_score":1.0} +{"_index":"opensearch_dashboards_sample_data_ecommerce","_source":{"email":"diane@chandler-family.zzz"},"_id":"ALJVWY0BAehlDanXJnEJ","_score":1.0} +{"_index":"opensearch_dashboards_sample_data_ecommerce","_source":{"email":"eddie@weber-family.zzz"},"_id":"AbJVWY0BAehlDanXJnEJ","_score":1.0} +{"_index":"opensearch_dashboards_sample_data_ecommerce","_source":{"email":"diane@goodwin-family.zzz"},"_id":"ArJVWY0BAehlDanXJnEJ","_score":1.0} +{"_index":"opensearch_dashboards_sample_data_ecommerce","_source":{"email":"oliver@rios-family.zzz"},"_id":"A7JVWY0BAehlDanXJnEJ","_score":1.0} +{"_index":"opensearch_dashboards_sample_data_ecommerce","_source":{"email":"abd@sutton-family.zzz"},"_id":"BLJVWY0BAehlDanXJnEJ","_score":1.0} +{"_index":"opensearch_dashboards_sample_data_ecommerce","_source":{"email":"wilhemina st.@tran-family.zzz"},"_id":"BbJVWY0BAehlDanXJnEJ","_score":1.0} +{"_index":"opensearch_dashboards_sample_data_ecommerce","_source":{"email":"rabbia al@baker-family.zzz"},"_id":"BrJVWY0BAehlDanXJnEJ","_score":1.0} +{"_index":"opensearch_dashboards_sample_data_ecommerce","_source":{"email":"rabbia al@romero-family.zzz"},"_id":"B7JVWY0BAehlDanXJnEJ","_score":1.0} +{"_index":"opensearch_dashboards_sample_data_ecommerce","_source":{"email":"eddie@gregory-family.zzz"},"_id":"CLJVWY0BAehlDanXJnEJ","_score":1.0} +{"_index":"opensearch_dashboards_sample_data_ecommerce","_source":{"email":"sultan al@pratt-family.zzz"},"_id":"CbJVWY0BAehlDanXJnEJ","_score":1.0} 
+{"_index":"opensearch_dashboards_sample_data_ecommerce","_source":{"email":"eddie@wolfe-family.zzz"},"_id":"CrJVWY0BAehlDanXJnEJ","_score":1.0} +{"_index":"opensearch_dashboards_sample_data_ecommerce","_source":{"email":"sultan al@thompson-family.zzz"},"_id":"C7JVWY0BAehlDanXJnEJ","_score":1.0} +{"_index":"opensearch_dashboards_sample_data_ecommerce","_source":{"email":"sultan al@boone-family.zzz"},"_id":"DLJVWY0BAehlDanXJnEJ","_score":1.0} +{"_index":"opensearch_dashboards_sample_data_ecommerce","_source":{"email":"george@hubbard-family.zzz"},"_id":"DbJVWY0BAehlDanXJnEJ","_score":1.0} +{"_index":"opensearch_dashboards_sample_data_ecommerce","_source":{"email":"boris@maldonado-family.zzz"},"_id":"DrJVWY0BAehlDanXJnEJ","_score":1.0} +{"_index":"opensearch_dashboards_sample_data_ecommerce","_source":{"email":"yahya@rivera-family.zzz"},"_id":"D7JVWY0BAehlDanXJnEJ","_score":1.0} +{"_index":"opensearch_dashboards_sample_data_ecommerce","_source":{"email":"brigitte@morris-family.zzz"},"_id":"ELJVWY0BAehlDanXJnEJ","_score":1.0} +""" + } + ] + } + ] +} +``` + +## Execute parameters + +The following table lists all tool parameters that are available when registering an agent. + +Parameter | Type | Description +:--- | :--- | :--- +`input`| String | The index name and the query to use for search, in JSON format. The `index` parameter contains the name of the index and the `query` parameter contains the query formatted in Query DSL. For example, `"{\"index\": \"opensearch_dashboards_sample_data_ecommerce\", \"query\": {\"size\": 22, \"_source\": \"category\"}}"`. The `input` parameter and the `index` and `query` parameters it contains are required. \ No newline at end of file diff --git a/_ml-commons-plugin/agents-tools/tools/search-monitors-tool.md b/_ml-commons-plugin/agents-tools/tools/search-monitors-tool.md new file mode 100644 index 0000000000..77b51d4964 --- /dev/null +++ b/_ml-commons-plugin/agents-tools/tools/search-monitors-tool.md @@ -0,0 +1,124 @@ +--- +layout: default +title: Search Monitors tool +has_children: false +has_toc: false +nav_order: 100 +parent: Tools +grand_parent: Agents and tools +--- + + +# Search Monitors tool +**Introduced 2.13** +{: .label .label-purple } + + +The `SearchMonitorsTool` retrieves information about alerting monitors set up on your cluster. For more information about alerting monitors, see [Monitors]({{site.url}}{{site.baseurl}}/observing-your-data/alerting/monitors/). + +## Step 1: Register a flow agent that will run the SearchMonitorsTool + +A flow agent runs a sequence of tools in order and returns the last tool's output. To create a flow agent, send the following register agent request: + +```json +POST /_plugins/_ml/agents/_register +{ + "name": "Test_Agent_For_Search_Monitors_Tool", + "type": "flow", + "description": "this is a test agent for the SearchMonitorsTool", + "memory": { + "type": "demo" + }, + "tools": [ + { + "type": "SearchMonitorsTool", + "name": "DemoSearchMonitorsTool", + "parameters": {} + } + ] +} +``` +{% include copy-curl.html %} + +For parameter descriptions, see [Register parameters](#register-parameters). + +OpenSearch responds with an agent ID: + +```json +{ + "agent_id": "EuJYYo0B9RaBCvhuy1q8" +} +``` + +## Step 2: Run the agent + +Run the agent by sending the following request: + +```json +POST /_plugins/_ml/agents/EuJYYo0B9RaBCvhuy1q8/_execute +{ + "parameters": { + "question": "Do I have any alerting monitors?" 
+ } +} +``` +{% include copy-curl.html %} + +OpenSearch responds with a list of alerting monitors set up on your cluster and the total number of alerting monitors: + +```json +{ + "inference_results": [ + { + "output": [ + { + "name": "response", + "result": "Monitors=[{id=j_9mYo0Bk4MTqircEzk_,name=test-monitor,type=query_level_monitor,enabled=true,enabledTime=1706752873144,lastUpdateTime=1706752873145}{id=ZuJnYo0B9RaBCvhuEVux,name=test-monitor-2,type=query_level_monitor,enabled=true,enabledTime=1706752938405,lastUpdateTime=1706752938405}]TotalMonitors=2" + } + ] + } + ] +} +``` + +If no monitors are found, OpenSearch responds with an empty array in the results: + +```json +{ + "inference_results": [ + { + "output": [ + { + "name": "response", + "result": "Monitors=[]TotalMonitors=0" + } + ] + } + ] +} +``` + +## Register parameters + +The following table lists all tool parameters that are available when registering an agent. All parameters are optional. + +Parameter | Type | Description +:--- | :--- | :--- +`monitorId` | String | The ID of the monitor to search for. +`monitorName` | String | The name of the monitor to search for. +`monitorNamePattern` | String | A wildcard query used to match the monitor name to search for. +`enabled` | Boolean | Whether to return information about monitors that are currently enabled. Leave this parameter unset (or set it to `null`) to return information about both enabled and disabled monitors. Set this parameter to `true` to return only information about enabled monitors. Set this parameter to `false` to return only information about disabled monitors. Default is `null`. +`hasTriggers` | Boolean | Whether to return information about monitors that have triggers enabled. Leave this parameter unset (or set it to `null`) to return information about monitors that have triggers enabled and disabled. Set this parameter to `true` to return only information about monitors with triggers enabled. Set this parameter to `false` to return only information about monitors with triggers disabled. Default is `null`. +`indices` | String | The index name or index pattern of the indexes tracked by the returned monitors. +`sortOrder`| String | The sort order of the results. Valid values are `asc` (ascending) and `desc` (descending). Default is `asc`. +`sortString`| String | Specifies the monitor field by which to sort the results. Default is `name.keyword`. +`size` | Integer | The number of results to return. Default is `20`. +`startIndex`| Integer | The paginated index of the monitor to start from. Default is `0`. + +## Execute parameters + +The following table lists all tool parameters that are available when running the agent. + +Parameter | Type | Required/Optional | Description +:--- | :--- | :--- | :--- +`question` | String | Required | The natural language question to send to the LLM. diff --git a/_ml-commons-plugin/agents-tools/tools/vector-db-tool.md b/_ml-commons-plugin/agents-tools/tools/vector-db-tool.md new file mode 100644 index 0000000000..9093541cbb --- /dev/null +++ b/_ml-commons-plugin/agents-tools/tools/vector-db-tool.md @@ -0,0 +1,235 @@ +--- +layout: default +title: Vector DB tool +has_children: false +has_toc: false +nav_order: 110 +parent: Tools +grand_parent: Agents and tools +--- + + +# Vector DB tool +**Introduced 2.13** +{: .label .label-purple } + + +The `VectorDBTool` performs dense vector retrieval. For more information about OpenSearch vector database capabilities, see [neural search]({{site.url}}{{site.baseurl}}/search-plugins/neural-search/). 
+
+## Step 1: Register and deploy a text embedding model
+
+OpenSearch supports several pretrained models. You can use one of those models, use your own custom model, or create a connector for an externally hosted model. For a list of supported pretrained models, see [OpenSearch-provided pretrained models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/pretrained-models/). For more information about custom models, see [Custom local models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/custom-local-models/). For information about integrating an externally hosted model, see [Connecting to externally hosted models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/index/).
+
+In this example, you'll use the `huggingface/sentence-transformers/all-MiniLM-L12-v2` pretrained model for both ingestion and search. To register and deploy the model to OpenSearch, send the following request:
+
+```json
+POST /_plugins/_ml/models/_register?deploy=true
+{
+  "name": "huggingface/sentence-transformers/all-MiniLM-L12-v2",
+  "version": "1.0.1",
+  "model_format": "TORCH_SCRIPT"
+}
+```
+{% include copy-curl.html %}
+
+OpenSearch responds with a task ID for the model registration and deployment task:
+
+```json
+{
+  "task_id": "M_9KY40Bk4MTqirc5lP8",
+  "status": "CREATED"
+}
+```
+
+You can monitor the status of the task by calling the Tasks API:
+
+```json
+GET _plugins/_ml/tasks/M_9KY40Bk4MTqirc5lP8
+```
+{% include copy-curl.html %}
+
+Once the model is registered and deployed, the task `state` changes to `COMPLETED` and OpenSearch returns a model ID for the model:
+
+```json
+{
+  "model_id": "Hv_PY40Bk4MTqircAVmm",
+  "task_type": "REGISTER_MODEL",
+  "function_name": "TEXT_EMBEDDING",
+  "state": "COMPLETED",
+  "worker_node": [
+    "UyQSTQ3nTFa3IP6IdFKoug"
+  ],
+  "create_time": 1706767869692,
+  "last_update_time": 1706767935556,
+  "is_async": true
+}
+```
+
+## Step 2: Ingest data into an index
+
+First, you'll set up an ingest pipeline to encode documents using the text embedding model set up in the previous step:
+
+```json
+PUT /_ingest/pipeline/test-pipeline-local-model
+{
+  "description": "text embedding pipeline",
+  "processors": [
+    {
+      "text_embedding": {
+        "model_id": "Hv_PY40Bk4MTqircAVmm",
+        "field_map": {
+          "text": "embedding"
+        }
+      }
+    }
+  ]
+}
+```
+{% include copy-curl.html %}
+
+Next, create a k-NN index specifying the pipeline as the default pipeline:
+
+```json
+PUT my_test_data
+{
+  "mappings": {
+    "properties": {
+      "text": {
+        "type": "text"
+      },
+      "embedding": {
+        "type": "knn_vector",
+        "dimension": 384
+      }
+    }
+  },
+  "settings": {
+    "index": {
+      "knn.space_type": "cosinesimil",
+      "default_pipeline": "test-pipeline-local-model",
+      "knn": "true"
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+Last, ingest data into the index by sending a bulk request:
+
+```json
+POST _bulk
+{"index": {"_index": "my_test_data", "_id": "1"}}
+{"text": "Chart and table of population level and growth rate for the Ogden-Layton metro area from 1950 to 2023.
United Nations population projections are also included through the year 2035.\nThe current metro area population of Ogden-Layton in 2023 is 750,000, a 1.63% increase from 2022.\nThe metro area population of Ogden-Layton in 2022 was 738,000, a 1.79% increase from 2021.\nThe metro area population of Ogden-Layton in 2021 was 725,000, a 1.97% increase from 2020.\nThe metro area population of Ogden-Layton in 2020 was 711,000, a 2.16% increase from 2019."} +{"index": {"_index": "my_test_data", "_id": "2"}} +{"text": "Chart and table of population level and growth rate for the New York City metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\\nThe current metro area population of New York City in 2023 is 18,937,000, a 0.37% increase from 2022.\\nThe metro area population of New York City in 2022 was 18,867,000, a 0.23% increase from 2021.\\nThe metro area population of New York City in 2021 was 18,823,000, a 0.1% increase from 2020.\\nThe metro area population of New York City in 2020 was 18,804,000, a 0.01% decline from 2019."} +{"index": {"_index": "my_test_data", "_id": "3"}} +{"text": "Chart and table of population level and growth rate for the Chicago metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\\nThe current metro area population of Chicago in 2023 is 8,937,000, a 0.4% increase from 2022.\\nThe metro area population of Chicago in 2022 was 8,901,000, a 0.27% increase from 2021.\\nThe metro area population of Chicago in 2021 was 8,877,000, a 0.14% increase from 2020.\\nThe metro area population of Chicago in 2020 was 8,865,000, a 0.03% increase from 2019."} +{"index": {"_index": "my_test_data", "_id": "4"}} +{"text": "Chart and table of population level and growth rate for the Miami metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\\nThe current metro area population of Miami in 2023 is 6,265,000, a 0.8% increase from 2022.\\nThe metro area population of Miami in 2022 was 6,215,000, a 0.78% increase from 2021.\\nThe metro area population of Miami in 2021 was 6,167,000, a 0.74% increase from 2020.\\nThe metro area population of Miami in 2020 was 6,122,000, a 0.71% increase from 2019."} +{"index": {"_index": "my_test_data", "_id": "5"}} +{"text": "Chart and table of population level and growth rate for the Austin metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\\nThe current metro area population of Austin in 2023 is 2,228,000, a 2.39% increase from 2022.\\nThe metro area population of Austin in 2022 was 2,176,000, a 2.79% increase from 2021.\\nThe metro area population of Austin in 2021 was 2,117,000, a 3.12% increase from 2020.\\nThe metro area population of Austin in 2020 was 2,053,000, a 3.43% increase from 2019."} +{"index": {"_index": "my_test_data", "_id": "6"}} +{"text": "Chart and table of population level and growth rate for the Seattle metro area from 1950 to 2023. 
United Nations population projections are also included through the year 2035.\\nThe current metro area population of Seattle in 2023 is 3,519,000, a 0.86% increase from 2022.\\nThe metro area population of Seattle in 2022 was 3,489,000, a 0.81% increase from 2021.\\nThe metro area population of Seattle in 2021 was 3,461,000, a 0.82% increase from 2020.\\nThe metro area population of Seattle in 2020 was 3,433,000, a 0.79% increase from 2019."} +``` +{% include copy-curl.html %} + +## Step 3: Register a flow agent that will run the VectorDBTool + +A flow agent runs a sequence of tools in order and returns the last tool's output. To create a flow agent, send the following request, providing the model ID for the model set up in Step 1. This model will encode your queries into vector embeddings: + +```json +POST /_plugins/_ml/agents/_register +{ + "name": "Test_Agent_For_VectorDB", + "type": "flow", + "description": "this is a test agent", + "tools": [ + { + "type": "VectorDBTool", + "parameters": { + "model_id": "Hv_PY40Bk4MTqircAVmm", + "index": "my_test_data", + "embedding_field": "embedding", + "source_field": ["text"], + "input": "${parameters.question}" + } + } + ] +} +``` +{% include copy-curl.html %} + +For parameter descriptions, see [Register parameters](#register-parameters). + +OpenSearch responds with an agent ID: + +```json +{ + "agent_id": "9X7xWI0Bpc3sThaJdY9i" +} +``` + +## Step 4: Run the agent + +Before you run the agent, make sure that you add the sample OpenSearch Dashboards `Sample web logs` dataset. To learn more, see [Adding sample data]({{site.url}}{{site.baseurl}}/dashboards/quickstart#adding-sample-data). + +Then, run the agent by sending the following request: + +```json +POST /_plugins/_ml/agents/9X7xWI0Bpc3sThaJdY9i/_execute +{ + "parameters": { + "question": "what's the population increase of Seattle from 2021 to 2023" + } +} +``` +{% include copy-curl.html %} + +OpenSearch performs vector search and returns the relevant documents: + +```json +{ + "inference_results": [ + { + "output": [ + { + "name": "response", + "result": """{"_index":"my_test_data","_source":{"text":"Chart and table of population level and growth rate for the Seattle metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\\n + The current metro area population of Seattle in 2023 is 3,519,000, a 0.86% increase from 2022.\\n + The metro area population of Seattle in 2022 was 3,489,000, a 0.81% increase from 2021.\\n + The metro area population of Seattle in 2021 was 3,461,000, a 0.82% increase from 2020.\\n + The metro area population of Seattle in 2020 was 3,433,000, a 0.79% increase from 2019."},"_id":"6","_score":0.8173238} + {"_index":"my_test_data","_source":{"text":"Chart and table of population level and growth rate for the New York City metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\\n + The current metro area population of New York City in 2023 is 18,937,000, a 0.37% increase from 2022.\\n + The metro area population of New York City in 2022 was 18,867,000, a 0.23% increase from 2021.\\n + The metro area population of New York City in 2021 was 18,823,000, a 0.1% increase from 2020.\\n + The metro area population of New York City in 2020 was 18,804,000, a 0.01% decline from 2019."},"_id":"2","_score":0.6641471} + """ + } + ] + } + ] +} +``` + +## Register parameters + +The following table lists all tool parameters that are available when registering an agent. 
+ +Parameter | Type | Required/Optional | Description +:--- | :--- | :--- | :--- +`model_id` | String | Required | The model ID of the model to use at search time. +`index` | String | Required | The index to search. +`embedding_field` | String | Required | When the model encodes raw text documents, the encoding result is saved in a field. Specify this field as the `embedding_field`. Neural search matches documents to the query by calculating the similarity score between the query text and the text in the document's `embedding_field`. +`source_field` | String | Required | The document field or fields to return. You can provide a list of multiple fields as an array of strings, for example, `["field1", "field2"]`. +`input` | String | Required for flow agent | Runtime input sourced from flow agent parameters. If using a large language model (LLM), this field is populated with the LLM response. +`doc_size` | Integer | Optional | The number of documents to fetch. Default is `2`. +`k` | Integer | Optional | The number of nearest neighbors to search for when performing neural search. Default is `10`. + +## Execute parameters + +The following table lists all tool parameters that are available when running the agent. + +Parameter | Type | Required/Optional | Description +:--- | :--- | :--- | :--- +`question` | String | Required | The natural language question to send to the LLM. diff --git a/_ml-commons-plugin/agents-tools/tools/visualization-tool.md b/_ml-commons-plugin/agents-tools/tools/visualization-tool.md new file mode 100644 index 0000000000..98457932c2 --- /dev/null +++ b/_ml-commons-plugin/agents-tools/tools/visualization-tool.md @@ -0,0 +1,103 @@ +--- +layout: default +title: Visualization tool +has_children: false +has_toc: false +nav_order: 120 +parent: Tools +grand_parent: Agents and tools +--- + +# Visualization tool +**Introduced 2.13** +{: .label .label-purple } + +Use the `VisualizationTool` to find visualizations relevant to a question. + +## Step 1: Register a flow agent that will run the VisualizationTool + +A flow agent runs a sequence of tools in order and returns the last tool's output. To create a flow agent, send the following register agent request: + +```json +POST /_plugins/_ml/agents/_register +{ + "name": "Test_Agent_For_Visualization_tool", + "type": "flow", + "description": "this is a test agent for the VisuailizationTool", + "tools": [ + { + "type": "VisualizationTool", + "name": "DemoVisualizationTool", + "parameters": { + "index": ".kibana", + "input": "${parameters.question}", + "size": 3 + } + } + ] +} +``` +{% include copy-curl.html %} + +For parameter descriptions, see [Register parameters](#register-parameters). + +OpenSearch responds with an agent ID: + +```json +{ + "agent_id": "9X7xWI0Bpc3sThaJdY9i" +} +``` + +## Step 2: Run the agent + +Before you run the agent, make sure that you add the sample OpenSearch Dashboards `Sample eCommerce orders` dataset. To learn more, see [Adding sample data]({{site.url}}{{site.baseurl}}/dashboards/quickstart#adding-sample-data). + +Then, run the agent by sending the following request: + +```json +POST /_plugins/_ml/agents/9X7xWI0Bpc3sThaJdY9i/_execute +{ + "parameters": { + "question": "what's the revenue for today?" + } +} +``` +{% include copy-curl.html %} + +By default, OpenSearch returns the top three matching visualizations. You can use the `size` parameter to specify the number of results returned. The output is returned in CSV format. 
The output includes two columns: `Title` (the visualization title displayed in OpenSearch Dashboards) and `Id` (a unique ID for this visualization): + +```json +{ + "inference_results": [ + { + "output": [ + { + "name": "response", + "result": """Title,Id +[eCommerce] Total Revenue,10f1a240-b891-11e8-a6d9-e546fe2bba5f +""" + } + ] + } + ] +} +``` + +## Register parameters + +The following table lists all tool parameters that are available when registering an agent. + +Parameter | Type | Required/Optional | Description +:--- | :--- | :--- | :--- +`input` | String | Required | The user input used to match visualizations. +`index` | String | Optional | The index to search. Default is `.kibana` (the system index for OpenSearch Dashboards data). +`size` | Integer | Optional | The number of visualizations to return. Default is `3`. + +## Execute parameters + +The following table lists all tool parameters that are available when running the agent. + +Parameter | Type | Required/Optional | Description +:--- | :--- | :--- | :--- +`question` | String | Required | The natural language question to send to the LLM. diff --git a/_ml-commons-plugin/api/agent-apis/delete-agent.md b/_ml-commons-plugin/api/agent-apis/delete-agent.md new file mode 100644 index 0000000000..ddde8fb19b --- /dev/null +++ b/_ml-commons-plugin/api/agent-apis/delete-agent.md @@ -0,0 +1,44 @@ +--- +layout: default +title: Delete agent +parent: Agent APIs +grand_parent: ML Commons APIs +nav_order: 50 +--- + +# Delete an agent +**Introduced 2.13** +{: .label .label-purple } + +You can use this API to delete an agent based on the `agent_id`. + +## Path and HTTP methods + +```json +DELETE /_plugins/_ml/agents/ +``` + +#### Example request + +```json +DELETE /_plugins/_ml/agents/MzcIJX8BA7mbufL6DOwl +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ + "_index" : ".plugins-ml-agent", + "_id" : "MzcIJX8BA7mbufL6DOwl", + "_version" : 2, + "result" : "deleted", + "_shards" : { + "total" : 2, + "successful" : 2, + "failed" : 0 + }, + "_seq_no" : 27, + "_primary_term" : 18 +} +``` \ No newline at end of file diff --git a/_ml-commons-plugin/api/agent-apis/execute-agent.md b/_ml-commons-plugin/api/agent-apis/execute-agent.md new file mode 100644 index 0000000000..27d50bced0 --- /dev/null +++ b/_ml-commons-plugin/api/agent-apis/execute-agent.md @@ -0,0 +1,65 @@ +--- +layout: default +title: Execute agent +parent: Agent APIs +grand_parent: ML Commons APIs +nav_order: 20 +--- + +# Execute an agent +**Introduced 2.13** +{: .label .label-purple } + +When an agent is executed, it runs the tools with which it is configured. + +### Path and HTTP methods + +```json +POST /_plugins/_ml/agents//_execute +``` + +## Request fields + +The following table lists the available request fields. + +Field | Data type | Required/Optional | Description +:--- | :--- | :--- +`parameters`| Object | Required | The parameters required by the agent. +`parameters.verbose`| Boolean | Optional | Provides verbose output. + +#### Example request + +```json +POST /_plugins/_ml/agents/879v9YwBjWKCe6Kg12Tx/_execute +{ + "parameters": { + "question": "what's the population increase of Seattle from 2021 to 2023" + } +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ + "inference_results": [ + { + "output": [ + { + "result": """ Based on the given context, the key information is: + +The metro area population of Seattle in 2021 was 3,461,000. +The metro area population of Seattle in 2023 is 3,519,000. 
+ +To calculate the population increase from 2021 to 2023: + +Population in 2023 (3,519,000) - Population in 2021 (3,461,000) = 58,000 + +Therefore, the population increase of Seattle from 2021 to 2023 is 58,000.""" + } + ] + } + ] +} +``` \ No newline at end of file diff --git a/_ml-commons-plugin/api/agent-apis/get-agent.md b/_ml-commons-plugin/api/agent-apis/get-agent.md new file mode 100644 index 0000000000..6190406649 --- /dev/null +++ b/_ml-commons-plugin/api/agent-apis/get-agent.md @@ -0,0 +1,82 @@ +--- +layout: default +title: Get agent +parent: Agent APIs +grand_parent: ML Commons APIs +nav_order: 20 +--- + +# Get an agent +**Introduced 2.13** +{: .label .label-purple } + +You can retrieve agent information using the `agent_id`. + +## Path and HTTP methods + +```json +GET /_plugins/_ml/agents/ +``` + +## Path parameters + +The following table lists the available path parameters. + +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `agent_id` | String | The agent ID of the agent to retrieve. | + + +#### Example request + +```json +GET /_plugins/_ml/agents/N8AE1osB0jLkkocYjz7D +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ + "name": "Test_Agent_For_RAG", + "type": "flow", + "description": "this is a test agent", + "tools": [ + { + "type": "VectorDBTool", + "parameters": { + "input": "${parameters.question}", + "source_field": """["text"]""", + "embedding_field": "embedding", + "index": "my_test_data", + "model_id": "zBRyYIsBls05QaITo5ex" + }, + "include_output_in_agent_response": false + }, + { + "type": "MLModelTool", + "description": "A general tool to answer any question", + "parameters": { + "model_id": "ygAzT40Bdo8gePIqxk0H", + "prompt": """ + +Human:You are a professional data analyst. You will always answer question based on the given context first. If the answer is not directly shown in the context, you will analyze the data and find the answer. If you don't know the answer, just say don't know. + + Context: +${parameters.VectorDBTool.output} + +Human:${parameters.question} + +Assistant:""" + }, + "include_output_in_agent_response": false + } + ], + "created_time": 1706821658743, + "last_updated_time": 1706821658743 +} +``` + +## Response fields + +For response field descriptions, see [Register Agent API request fields]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/agent-apis/register-agent#request-fields). \ No newline at end of file diff --git a/_ml-commons-plugin/api/agent-apis/index.md b/_ml-commons-plugin/api/agent-apis/index.md new file mode 100644 index 0000000000..72bf6082ce --- /dev/null +++ b/_ml-commons-plugin/api/agent-apis/index.md @@ -0,0 +1,23 @@ +--- +layout: default +title: Agent APIs +parent: ML Commons APIs +has_children: true +has_toc: false +nav_order: 27 +redirect_from: /ml-commons-plugin/api/agent-apis/ +--- + +# Agent APIs +**Introduced 2.13** +{: .label .label-purple } + +You can automate machine learning (ML) tasks using agents and tools. An _agent_ orchestrates and runs ML models and tools. For more information, see [Agents and tools]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/index/). 
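+
+A typical workflow is to register an agent, run it, and delete it when it is no longer needed. The following minimal sketch illustrates that sequence using the `CatIndexTool` (the agent name and question are illustrative, and `<agent_id>` is a placeholder for the agent ID returned by the register call):
+
+```json
+POST /_plugins/_ml/agents/_register
+{
+  "name": "Sample_Flow_Agent",
+  "type": "flow",
+  "description": "A minimal example agent",
+  "tools": [
+    {
+      "type": "CatIndexTool"
+    }
+  ]
+}
+
+POST /_plugins/_ml/agents/<agent_id>/_execute
+{
+  "parameters": {
+    "question": "How many indexes are in my cluster?"
+  }
+}
+
+DELETE /_plugins/_ml/agents/<agent_id>
+```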
+
+ML Commons supports the following agent-level APIs:
+
+- [Register agent]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/agent-apis/register-agent/)
+- [Execute agent]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/agent-apis/execute-agent/)
+- [Get agent]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/agent-apis/get-agent/)
+- [Search agent]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/agent-apis/search-agent/)
+- [Delete agent]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/agent-apis/delete-agent/)
\ No newline at end of file
diff --git a/_ml-commons-plugin/api/agent-apis/register-agent.md b/_ml-commons-plugin/api/agent-apis/register-agent.md
new file mode 100644
index 0000000000..820bb923f7
--- /dev/null
+++ b/_ml-commons-plugin/api/agent-apis/register-agent.md
@@ -0,0 +1,193 @@
+---
+layout: default
+title: Register agent
+parent: Agent APIs
+grand_parent: ML Commons APIs
+nav_order: 10
+---
+
+# Register an agent
+**Introduced 2.13**
+{: .label .label-purple }
+
+Use this API to register an agent.
+
+Agents may be of the following types:
+
+- Flow agent
+- Conversational flow agent
+- Conversational agent
+
+For more information about agents, see [Agents and tools]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/index/).
+
+## Path and HTTP methods
+
+```json
+POST /_plugins/_ml/agents/_register
+```
+{% include copy-curl.html %}
+
+## Request fields
+
+The following table lists the available request fields.
+
+Field | Data type | Required/Optional | Agent type | Description
+:--- | :--- | :--- | :--- | :---
+`name`| String | Required | All | The agent name. |
+`type` | String | Required | All | The agent type. Valid values are `flow`, `conversational_flow`, and `conversational`. For more information, see [Agents]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/index/). |
+`description` | String | Optional| All | A description of the agent. |
+`tools` | Array | Optional | All | A list of tools for the agent to execute.
+`app_type` | String | Optional | All | Specifies an optional agent category. You can then perform operations on all agents in the category. For example, you can delete all messages for RAG agents.
+`memory.type` | String | Optional | `conversational_flow`, `conversational` | Specifies where to store the conversational memory. Currently, the only supported type is `conversation_index` (store the memory in a conversational system index).
+`llm.model_id` | String | Required | `conversational` | The model ID of the LLM to which to send questions.
+`llm.parameters.response_filter` | String | Required | `conversational` | The pattern for parsing the LLM response. For each LLM, you need to provide the field where the response is located. For example, for the Anthropic Claude model, the response is located in the `completion` field, so the pattern is `$.completion`. For OpenAI models, the pattern is `$.choices[0].message.content`.
+`llm.parameters.max_iteration` | Integer | Optional | `conversational` | The maximum number of messages to send to the LLM. Default is `3`.
+
+The `tools` array contains a list of tools for the agent. Each tool contains the following fields.
+
+Field | Data type | Required/Optional | Description
+:--- | :--- | :--- | :---
+`name`| String | Optional | The tool name. The tool name defaults to the `type` parameter value. If you need to include multiple tools of the same type in an agent, specify different names for the tools. |
+`type` | String | Required | The tool type.
For a list of supported tools, see [Tools]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/index/). +`parameters` | Object | Optional | The parameters for this tool. The parameters are highly dependent on the tool type. You can find information about specific tool types in [Tools]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/index/). + +#### Example request: Flow agent + +```json +POST /_plugins/_ml/agents/_register +{ + "name": "Test_Agent_For_RAG", + "type": "flow", + "description": "this is a test agent", + "tools": [ + { + "name": "vector_tool", + "type": "VectorDBTool", + "parameters": { + "model_id": "zBRyYIsBls05QaITo5ex", + "index": "my_test_data", + "embedding_field": "embedding", + "source_field": [ + "text" + ], + "input": "${parameters.question}" + } + }, + { + "type": "MLModelTool", + "description": "A general tool to answer any question", + "parameters": { + "model_id": "NWR9YIsBUysqmzBdifVJ", + "prompt": "\n\nHuman:You are a professional data analyst. You will always answer question based on the given context first. If the answer is not directly shown in the context, you will analyze the data and find the answer. If you don't know the answer, just say don't know. \n\n Context:\n${parameters.vector_tool.output}\n\nHuman:${parameters.question}\n\nAssistant:" + } + } + ] +} +``` +{% include copy-curl.html %} + +#### Example request: Conversational flow agent + +```json +POST /_plugins/_ml/agents/_register +{ + "name": "population data analysis agent", + "type": "conversational_flow", + "description": "This is a demo agent for population data analysis", + "app_type": "rag", + "memory": { + "type": "conversation_index" + }, + "tools": [ + { + "type": "VectorDBTool", + "name": "population_knowledge_base", + "parameters": { + "model_id": "your_text_embedding_model_id", + "index": "test_population_data", + "embedding_field": "population_description_embedding", + "source_field": [ + "population_description" + ], + "input": "${parameters.question}" + } + }, + { + "type": "MLModelTool", + "name": "bedrock_claude_model", + "description": "A general tool to answer any question", + "parameters": { + "model_id": "your_LLM_model_id", + "prompt": """ + +Human:You are a professional data analysist. You will always answer question based on the given context first. If the answer is not directly shown in the context, you will analyze the data and find the answer. If you don't know the answer, just say don't know. + +Context: +${parameters.population_knowledge_base.output:-} + +${parameters.chat_history:-} + +Human:${parameters.question} + +Assistant:""" + } + } + ] +} +``` +{% include copy-curl.html %} + +#### Example request: Conversational agent + +```json +POST /_plugins/_ml/agents/_register +{ + "name": "Test_Agent_For_ReAct_ClaudeV2", + "type": "conversational", + "description": "this is a test agent", + "app_type": "my chatbot", + "llm": { + "model_id": "", + "parameters": { + "max_iteration": 5, + "stop_when_no_tool_found": true, + "response_filter": "$.completion" + } + }, + "memory": { + "type": "conversation_index" + }, + "tools": [ + { + "type": "VectorDBTool", + "name": "VectorDBTool", + "description": "A tool to search opensearch index with natural language quesiotn. If you don't know answer for some question, you should always try to search data with this tool. 
Action Input: ", + "parameters": { + "model_id": "", + "index": "", + "embedding_field": "", + "source_field": [ + "" + ], + "input": "${parameters.question}" + } + }, + { + "type": "CatIndexTool", + "name": "RetrieveIndexMetaTool", + "description": "Use this tool to get OpenSearch index information: (health, status, index, uuid, primary count, replica count, docs.count, docs.deleted, store.size, primary.store.size)." + } + ] +} +``` +{% include copy-curl.html %} + +#### Example response + +OpenSearch responds with an agent ID that you can use to refer to the agent: + +```json +{ + "agent_id": "bpV_Zo0BRhAwb9PZqGja" +} +``` \ No newline at end of file diff --git a/_ml-commons-plugin/api/agent-apis/search-agent.md b/_ml-commons-plugin/api/agent-apis/search-agent.md new file mode 100644 index 0000000000..3d950cde8f --- /dev/null +++ b/_ml-commons-plugin/api/agent-apis/search-agent.md @@ -0,0 +1,139 @@ +--- +layout: default +title: Search agent +parent: Agent APIs +grand_parent: ML Commons APIs +nav_order: 30 +--- + +# Search for an agent +**Introduced 2.13** +{: .label .label-purple } + +Use this command to search for agents you've already created. You can provide any OpenSearch search query in the request body. + +## Path and HTTP methods + +```json +GET /_plugins/_ml/agents/_search +POST /_plugins/_ml/agents/_search +``` + +#### Example request: Searching for all agents + +```json +POST /_plugins/_ml/agents/_search +{ + "query": { + "match_all": {} + }, + "size": 1000 +} +``` +{% include copy-curl.html %} + +#### Example request: Searching for agents of a certain type + +```json +POST /_plugins/_ml/agents/_search +{ + "query": { + "term": { + "type": { + "value": "flow" + } + } + } +} +``` +{% include copy-curl.html %} + +#### Example: Searching for an agent by description + +```json +GET _plugins/_ml/agents/_search +{ + "query": { + "bool": { + "should": [ + { + "match": { + "description": "test agent" + } + } + ] + } + }, + "size": 1000 +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ + "took": 2, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 6, + "relation": "eq" + }, + "max_score": 0.15019803, + "hits": [ + { + "_index": ".plugins-ml-agent", + "_id": "8HXlkI0BfUsSoeNTP_0P", + "_version": 1, + "_seq_no": 17, + "_primary_term": 2, + "_score": 0.13904166, + "_source": { + "created_time": 1707532959502, + "last_updated_time": 1707532959502, + "name": "Test_Agent_For_RagTool", + "description": "this is a test flow agent", + "type": "flow", + "tools": [ + { + "description": "A description of the tool", + "include_output_in_agent_response": false, + "type": "RAGTool", + "parameters": { + "inference_model_id": "gnDIbI0BfUsSoeNT_jAw", + "embedding_model_id": "Yg7HZo0B9ggZeh2gYjtu_2", + "input": "${parameters.question}", + "source_field": """["text"]""", + "embedding_field": "embedding", + "index": "my_test_data", + "query_type": "neural", + "prompt": """ + +Human:You are a professional data analyst. You will always answer question based on the given context first. If the answer is not directly shown in the context, you will analyze the data and find the answer. If you don't know the answer, just say don't know. 
+ + Context: +${parameters.output_field} + +Human:${parameters.question} + +Assistant:""" + } + } + ] + } + } + ] + } +} +``` + +## Response fields + +For response field descriptions, see [Register Agent API request fields]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/agent-apis/register-agent#request-fields). diff --git a/_ml-commons-plugin/api/connector-apis/create-connector.md b/_ml-commons-plugin/api/connector-apis/create-connector.md index 86af3d7a51..4225a24053 100644 --- a/_ml-commons-plugin/api/connector-apis/create-connector.md +++ b/_ml-commons-plugin/api/connector-apis/create-connector.md @@ -2,7 +2,7 @@ layout: default title: Create connector parent: Connector APIs -grand_parent: ML Commons API +grand_parent: ML Commons APIs nav_order: 10 --- @@ -16,6 +16,10 @@ Creates a standalone connector. For more information, see [Connectors]({{site.ur POST /_plugins/_ml/connectors/_create ``` +## Request fields + +For a list of request fields, see [Blueprint configuration parameters]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/blueprints#configuration-parameters). + #### Example request To create a standalone connector, send a request to the `connectors/_create` endpoint and provide all of the parameters described in [Connector blueprints]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/blueprints/): diff --git a/_ml-commons-plugin/api/connector-apis/delete-connector.md b/_ml-commons-plugin/api/connector-apis/delete-connector.md index 9f09e7ec4e..75dff32016 100644 --- a/_ml-commons-plugin/api/connector-apis/delete-connector.md +++ b/_ml-commons-plugin/api/connector-apis/delete-connector.md @@ -2,7 +2,7 @@ layout: default title: Delete connector parent: Connector APIs -grand_parent: ML Commons API +grand_parent: ML Commons APIs nav_order: 30 --- diff --git a/_ml-commons-plugin/api/connector-apis/get-connector.md b/_ml-commons-plugin/api/connector-apis/get-connector.md index cf82b5b357..6a8507cd32 100644 --- a/_ml-commons-plugin/api/connector-apis/get-connector.md +++ b/_ml-commons-plugin/api/connector-apis/get-connector.md @@ -2,21 +2,12 @@ layout: default title: Get connector parent: Connector APIs -grand_parent: ML Commons API +grand_parent: ML Commons APIs nav_order: 20 --- # Get a connector -Use the `_search` endpoint to search for a connector. - -To retrieve information about a connector, you can: - -- [Get a connector by ID](#get-a-connector-by-id) -- [Search for a connector](#search-for-a-connector) - -## Get a connector by ID - This API retrieves a connector by its ID. ### Path and HTTP methods @@ -62,160 +53,3 @@ GET /_plugins/_ml/connectors/N8AE1osB0jLkkocYjz7D ] } ``` - -## Search for a connector - -This API searches for matching connectors using a query. 
- -### Path and HTTP methods - -```json -POST /_plugins/_ml/connectors/_search -GET /_plugins/_ml/connectors/_search -``` - -#### Example request - -```json -POST /_plugins/_ml/connectors/_search -{ - "query": { - "match_all": {} - }, - "size": 1000 -} -``` -{% include copy-curl.html %} - -#### Example response - -```json -{ - "took" : 1, - "timed_out" : false, - "_shards" : { - "total" : 1, - "successful" : 1, - "skipped" : 0, - "failed" : 0 - }, - "hits" : { - "total" : { - "value" : 3, - "relation" : "eq" - }, - "max_score" : 1.0, - "hits" : [ - { - "_index" : ".plugins-ml-connector", - "_id" : "7W-d74sBPD67W0wkEZdE", - "_version" : 1, - "_seq_no" : 2, - "_primary_term" : 1, - "_score" : 1.0, - "_source" : { - "protocol" : "aws_sigv4", - "name" : "BedRock claude Connector", - "description" : "The connector to BedRock service for claude model", - "version" : "1", - "parameters" : { - "endpoint" : "bedrock.us-east-1.amazonaws.com", - "content_type" : "application/json", - "auth" : "Sig_V4", - "max_tokens_to_sample" : "8000", - "service_name" : "bedrock", - "temperature" : "1.0E-4", - "response_filter" : "$.completion", - "region" : "us-east-1", - "anthropic_version" : "bedrock-2023-05-31" - }, - "actions" : [ - { - "headers" : { - "x-amz-content-sha256" : "required", - "content-type" : "application/json" - }, - "method" : "POST", - "request_body" : "{\"prompt\":\"${parameters.prompt}\", \"max_tokens_to_sample\":${parameters.max_tokens_to_sample}, \"temperature\":${parameters.temperature}, \"anthropic_version\":\"${parameters.anthropic_version}\" }", - "action_type" : "PREDICT", - "url" : "https://bedrock.us-east-1.amazonaws.com/model/anthropic.claude-v2/invoke" - } - ] - } - }, - { - "_index" : ".plugins-ml-connector", - "_id" : "9W-d74sBPD67W0wk4pf_", - "_version" : 1, - "_seq_no" : 3, - "_primary_term" : 1, - "_score" : 1.0, - "_source" : { - "protocol" : "aws_sigv4", - "name" : "BedRock claude Connector", - "description" : "The connector to BedRock service for claude model", - "version" : "1", - "parameters" : { - "endpoint" : "bedrock.us-east-1.amazonaws.com", - "content_type" : "application/json", - "auth" : "Sig_V4", - "max_tokens_to_sample" : "8000", - "service_name" : "bedrock", - "temperature" : "1.0E-4", - "response_filter" : "$.completion", - "region" : "us-east-1", - "anthropic_version" : "bedrock-2023-05-31" - }, - "actions" : [ - { - "headers" : { - "x-amz-content-sha256" : "required", - "content-type" : "application/json" - }, - "method" : "POST", - "request_body" : "{\"prompt\":\"${parameters.prompt}\", \"max_tokens_to_sample\":${parameters.max_tokens_to_sample}, \"temperature\":${parameters.temperature}, \"anthropic_version\":\"${parameters.anthropic_version}\" }", - "action_type" : "PREDICT", - "url" : "https://bedrock.us-east-1.amazonaws.com/model/anthropic.claude-v2/invoke" - } - ] - } - }, - { - "_index" : ".plugins-ml-connector", - "_id" : "rm_u8osBPD67W0wkCpsG", - "_version" : 1, - "_seq_no" : 4, - "_primary_term" : 1, - "_score" : 1.0, - "_source" : { - "protocol" : "aws_sigv4", - "name" : "BedRock Claude-Instant v1", - "description" : "Bedrock connector for Claude Instant testing", - "version" : "1", - "parameters" : { - "endpoint" : "bedrock.us-east-1.amazonaws.com", - "content_type" : "application/json", - "auth" : "Sig_V4", - "service_name" : "bedrock", - "region" : "us-east-1", - "anthropic_version" : "bedrock-2023-05-31" - }, - "actions" : [ - { - "headers" : { - "x-amz-content-sha256" : "required", - "content-type" : "application/json" - }, - "method" : 
"POST", - "request_body" : "{\"prompt\":\"${parameters.prompt}\", \"max_tokens_to_sample\":${parameters.max_tokens_to_sample}, \"temperature\":${parameters.temperature}, \"anthropic_version\":\"${parameters.anthropic_version}\" }", - "action_type" : "PREDICT", - "url" : "https://bedrock-runtime.us-east-1.amazonaws.com/model/anthropic.claude-instant-v1/invoke" - } - ] - } - } - ] - } -} -``` - diff --git a/_ml-commons-plugin/api/connector-apis/index.md b/_ml-commons-plugin/api/connector-apis/index.md index fc1c316037..a8945bb822 100644 --- a/_ml-commons-plugin/api/connector-apis/index.md +++ b/_ml-commons-plugin/api/connector-apis/index.md @@ -1,8 +1,9 @@ --- layout: default title: Connector APIs -parent: ML Commons API +parent: ML Commons APIs has_children: true +has_toc: false nav_order: 25 --- @@ -11,7 +12,9 @@ nav_order: 25 ML Commons supports the following connector APIs: - [Create connector]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/connector-apis/create-connector/) -- [Search for a connector]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/connector-apis/get-connector/) +- [Get connector]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/connector-apis/get-connector/) +- [Search connector]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/connector-apis/search-connector/) +- [Update connector]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/connector-apis/update-connector/) - [Delete connector]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/connector-apis/delete-connector/) For more information, see [Connectors]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/connectors/). diff --git a/_ml-commons-plugin/api/connector-apis/search-connector.md b/_ml-commons-plugin/api/connector-apis/search-connector.md new file mode 100644 index 0000000000..3b59d51a2e --- /dev/null +++ b/_ml-commons-plugin/api/connector-apis/search-connector.md @@ -0,0 +1,164 @@ +--- +layout: default +title: Search connector +parent: Connector APIs +grand_parent: ML Commons APIs +nav_order: 25 +--- + +# Search for a connector + +Use the `_search` endpoint to search for a connector. This API uses a query to search for matching connectors. 
+ +## Path and HTTP methods + +```json +POST /_plugins/_ml/connectors/_search +GET /_plugins/_ml/connectors/_search +``` + +#### Example request + +```json +POST /_plugins/_ml/connectors/_search +{ + "query": { + "match_all": {} + }, + "size": 1000 +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ + "took" : 1, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 3, + "relation" : "eq" + }, + "max_score" : 1.0, + "hits" : [ + { + "_index" : ".plugins-ml-connector", + "_id" : "7W-d74sBPD67W0wkEZdE", + "_version" : 1, + "_seq_no" : 2, + "_primary_term" : 1, + "_score" : 1.0, + "_source" : { + "protocol" : "aws_sigv4", + "name" : "BedRock claude Connector", + "description" : "The connector to BedRock service for claude model", + "version" : "1", + "parameters" : { + "endpoint" : "bedrock.us-east-1.amazonaws.com", + "content_type" : "application/json", + "auth" : "Sig_V4", + "max_tokens_to_sample" : "8000", + "service_name" : "bedrock", + "temperature" : "1.0E-4", + "response_filter" : "$.completion", + "region" : "us-east-1", + "anthropic_version" : "bedrock-2023-05-31" + }, + "actions" : [ + { + "headers" : { + "x-amz-content-sha256" : "required", + "content-type" : "application/json" + }, + "method" : "POST", + "request_body" : "{\"prompt\":\"${parameters.prompt}\", \"max_tokens_to_sample\":${parameters.max_tokens_to_sample}, \"temperature\":${parameters.temperature}, \"anthropic_version\":\"${parameters.anthropic_version}\" }", + "action_type" : "PREDICT", + "url" : "https://bedrock.us-east-1.amazonaws.com/model/anthropic.claude-v2/invoke" + } + ] + } + }, + { + "_index" : ".plugins-ml-connector", + "_id" : "9W-d74sBPD67W0wk4pf_", + "_version" : 1, + "_seq_no" : 3, + "_primary_term" : 1, + "_score" : 1.0, + "_source" : { + "protocol" : "aws_sigv4", + "name" : "BedRock claude Connector", + "description" : "The connector to BedRock service for claude model", + "version" : "1", + "parameters" : { + "endpoint" : "bedrock.us-east-1.amazonaws.com", + "content_type" : "application/json", + "auth" : "Sig_V4", + "max_tokens_to_sample" : "8000", + "service_name" : "bedrock", + "temperature" : "1.0E-4", + "response_filter" : "$.completion", + "region" : "us-east-1", + "anthropic_version" : "bedrock-2023-05-31" + }, + "actions" : [ + { + "headers" : { + "x-amz-content-sha256" : "required", + "content-type" : "application/json" + }, + "method" : "POST", + "request_body" : "{\"prompt\":\"${parameters.prompt}\", \"max_tokens_to_sample\":${parameters.max_tokens_to_sample}, \"temperature\":${parameters.temperature}, \"anthropic_version\":\"${parameters.anthropic_version}\" }", + "action_type" : "PREDICT", + "url" : "https://bedrock.us-east-1.amazonaws.com/model/anthropic.claude-v2/invoke" + } + ] + } + }, + { + "_index" : ".plugins-ml-connector", + "_id" : "rm_u8osBPD67W0wkCpsG", + "_version" : 1, + "_seq_no" : 4, + "_primary_term" : 1, + "_score" : 1.0, + "_source" : { + "protocol" : "aws_sigv4", + "name" : "BedRock Claude-Instant v1", + "description" : "Bedrock connector for Claude Instant testing", + "version" : "1", + "parameters" : { + "endpoint" : "bedrock.us-east-1.amazonaws.com", + "content_type" : "application/json", + "auth" : "Sig_V4", + "service_name" : "bedrock", + "region" : "us-east-1", + "anthropic_version" : "bedrock-2023-05-31" + }, + "actions" : [ + { + "headers" : { + "x-amz-content-sha256" : "required", + "content-type" : "application/json" + }, + "method" : 
"POST", + "request_body" : "{\"prompt\":\"${parameters.prompt}\", \"max_tokens_to_sample\":${parameters.max_tokens_to_sample}, \"temperature\":${parameters.temperature}, \"anthropic_version\":\"${parameters.anthropic_version}\" }", + "action_type" : "PREDICT", + "url" : "https://bedrock-runtime.us-east-1.amazonaws.com/model/anthropic.claude-instant-v1/invoke" + } + ] + } + } + ] + } +} +``` + diff --git a/_ml-commons-plugin/api/connector-apis/update-connector.md b/_ml-commons-plugin/api/connector-apis/update-connector.md new file mode 100644 index 0000000000..64790bb57f --- /dev/null +++ b/_ml-commons-plugin/api/connector-apis/update-connector.md @@ -0,0 +1,70 @@ +--- +layout: default +title: Update connector +parent: Connector APIs +grand_parent: ML Commons APIs +nav_order: 27 +--- + +# Update a connector +**Introduced 2.12** +{: .label .label-purple } + +Use this API to update a standalone connector based on the `model_ID`. To update a connector created within a specific model, use the [Update Model API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/update-model/). + +Before updating a standalone connector, you must undeploy all models that use the connector. For information about undeploying a model, see [Undeploy Model API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/undeploy-model/). +{: .note} + +Using this API, you can update the connector fields listed in the [Request fields](#request-fields) section and add optional fields to your connector. You cannot delete fields from a connector using this API. + +For information about user access for this API, see [Model access control considerations]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/index/#model-access-control-considerations). + +## Path and HTTP methods + +```json +PUT /_plugins/_ml/connectors/ +``` + +## Request fields + +The following table lists the updatable fields. For more information about all connector fields, see [Blueprint configuration parameters]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/blueprints#configuration-parameters). + +| Field | Data type | Description | +| :--- | :--- | :--- | +| `name` | String | The name of the connector. | +| `description` | String | A description of the connector. | +| `version` | Integer | The version of the connector. | +| `protocol` | String | The protocol for the connection. For AWS services, such as Amazon SageMaker and Amazon Bedrock, use `aws_sigv4`. For all other services, use `http`. | +| `parameters` | JSON object | The default connector parameters, including `endpoint` and `model`. Any parameters included in this field can be overridden by parameters specified in a predict request. | +| `credential` | JSON object | Defines any credential variables required in order to connect to your chosen endpoint. ML Commons uses **AES/GCM/NoPadding** symmetric encryption to encrypt your credentials. When the connection to the cluster first starts, OpenSearch creates a random 32-byte encryption key that persists in OpenSearch's system index. Therefore, you do not need to manually set the encryption key. | +| `actions` | JSON array | Defines which actions can run within the connector. If you're an administrator creating a connection, add the [blueprint]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/blueprints/) for your desired connection. | +| `backend_roles` | JSON array | A list of OpenSearch backend roles. 
For more information about setting up backend roles, see [Assigning backend roles to users]({{site.url}}{{site.baseurl}}/ml-commons-plugin/model-access-control#assigning-backend-roles-to-users). | +| `access_mode` | String | Sets the access mode for the model, either `public`, `restricted`, or `private`. Default is `private`. For more information about `access_mode`, see [Model groups]({{site.url}}{{site.baseurl}}/ml-commons-plugin/model-access-control#model-groups). | + +#### Example request + +```json +PUT /_plugins/_ml/connectors/u3DEbI0BfUsSoeNTti-1 +{ + "description": "The connector to public OpenAI model service for GPT 3.5" +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ + "_index": ".plugins-ml-connector", + "_id": "u3DEbI0BfUsSoeNTti-1", + "_version": 2, + "result": "updated", + "_shards": { + "total": 1, + "successful": 1, + "failed": 0 + }, + "_seq_no": 2, + "_primary_term": 1 +} +``` \ No newline at end of file diff --git a/_ml-commons-plugin/api/controller-apis/create-controller.md b/_ml-commons-plugin/api/controller-apis/create-controller.md new file mode 100644 index 0000000000..91a6be4387 --- /dev/null +++ b/_ml-commons-plugin/api/controller-apis/create-controller.md @@ -0,0 +1,188 @@ +--- +layout: default +title: Create controller +parent: Controller APIs +grand_parent: ML Commons APIs +nav_order: 10 +--- + +# Create or update a controller +**Introduced 2.12** +{: .label .label-purple } + +Use this API to create or update a controller for a model. A model may be shared by multiple users. A controller sets rate limits for the number of [Predict API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/train-predict/predict/) calls users can make on the model. A controller consists of a set of rate limiters for different users. + +You can only create a controller for a model once you have registered the model and received a model ID. +{: .tip} + +The POST method creates a new controller. The PUT method updates an existing controller. + +To learn how to set rate limits at the model level for all users, see [Update Model API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/update-model/). The rate limit is set to either the model-level limit or the user-level limit, whichever is more restrictive. For example, if the model-level limit is 2 requests per minute and the user-level limit is 4 requests per minute, the overall limit will be set to 2 requests per minute. + +## Path and HTTP methods + +```json +POST /_plugins/_ml/controllers/ +PUT /_plugins/_ml/controllers/ +``` +{% include copy-curl.html %} + +## Path parameters + +The following table lists the available path parameters. + +Parameter | Data type | Description +:--- | :--- | :--- +`model_id` | String | The model ID of the model for which you want to set rate limits. Required. + +## Request fields + +The following table lists the available request fields. + +Field | Data type | Required/Optional | Description +:--- | :--- | :--- | :--- +`user_rate_limiter`| Object | Required | Limits the number of times users can call the Predict API on the model. For more information, see [Rate limiting inference calls]({{site.url}}{{site.baseurl}}/ml-commons-plugin/integrating-ml-models/#rate-limiting-inference-calls). + +The `user_rate_limiter` object contains an object for each user, specified by username. The user object contains the following fields. 
+ +Field | Data type | Description +:--- | :--- | :--- +`limit` | Integer | The maximum number of times the user can call the Predict API on the model per `unit` of time. By default, there is no limit on the number of Predict API calls. Once you set a limit, you cannot reset it to no limit. As an alternative, you can specify a high limit value and a small time unit, for example, 1 request per nanosecond. +`unit` | String | The unit of time for the rate limiter. Valid values are `DAYS`, `HOURS`, `MICROSECONDS`, `MILLISECONDS`, `MINUTES`, `NANOSECONDS`, and `SECONDS`. + + +#### Example request: Create a controller + +```json +POST _plugins/_ml/controllers/mtw-ZI0B_1JGmyB068C0 +{ + "user_rate_limiter": { + "user1": { + "limit": 4, + "unit": "MINUTES" + }, + "user2": { + "limit": 4, + "unit": "MINUTES" + } + } +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ + "model_id": "mtw-ZI0B_1JGmyB068C0", + "status": "CREATED" +} +``` + +#### Example request: Update the rate limit for one user + +To update the limit for `user1`, send a PUT request and specify the updated information: + +```json +PUT _plugins/_ml/controllers/mtw-ZI0B_1JGmyB068C0 +{ + "user_rate_limiter": { + "user1": { + "limit": 6, + "unit": "MINUTES" + } + } +} +``` +{% include copy-curl.html %} + +This will update only the `user1` object, leaving all other user limits intact: + +```json +{ + "model_id": "mtw-ZI0B_1JGmyB068C0", + "user_rate_limiter": { + "user1": { + "limit": "6", + "unit": "MINUTES" + }, + "user2": { + "limit": "4", + "unit": "MINUTES" + } + } +} +``` + +#### Example response + +```json +{ + "_index": ".plugins-ml-controller", + "_id": "mtw-ZI0B_1JGmyB068C0", + "_version": 2, + "result": "updated", + "forced_refresh": true, + "_shards": { + "total": 2, + "successful": 2, + "failed": 0 + }, + "_seq_no": 1, + "_primary_term": 1 +} +``` + +#### Example request: Delete the rate limit for one user + +To delete the limit for `user2`, send a POST request containing all other users' limits: + +```json +POST _plugins/_ml/controllers/mtw-ZI0B_1JGmyB068C0 +{ + "user_rate_limiter": { + "user1": { + "limit": 6, + "unit": "MINUTES" + } + } +} +``` +{% include copy-curl.html %} + +This will overwrite the controller with the new information: + +```json +{ + "model_id": "mtw-ZI0B_1JGmyB068C0", + "user_rate_limiter": { + "user1": { + "limit": "6", + "unit": "MINUTES" + } + } +} +``` + +#### Example response + +```json +{ + "_index": ".plugins-ml-controller", + "_id": "mtw-ZI0B_1JGmyB068C0", + "_version": 2, + "result": "updated", + "forced_refresh": true, + "_shards": { + "total": 2, + "successful": 2, + "failed": 0 + }, + "_seq_no": 1, + "_primary_term": 1 +} +``` + +## Required permissions + +If you use the Security plugin, make sure you have the appropriate permissions: `cluster:admin/opensearch/ml/controllers/create` and `cluster:admin/opensearch/ml/controllers/update`. \ No newline at end of file diff --git a/_ml-commons-plugin/api/controller-apis/delete-controller.md b/_ml-commons-plugin/api/controller-apis/delete-controller.md new file mode 100644 index 0000000000..44120198fa --- /dev/null +++ b/_ml-commons-plugin/api/controller-apis/delete-controller.md @@ -0,0 +1,56 @@ +--- +layout: default +title: Delete controller +parent: Controller APIs +grand_parent: ML Commons APIs +nav_order: 50 +--- + +# Delete a controller +**Introduced 2.12** +{: .label .label-purple } + +Use this API to delete a controller for a model based on the `model_id`. 
+ +## Path and HTTP methods + +```json +DELETE /_plugins/_ml/controllers/ +``` + +## Path parameters + +The following table lists the available path parameters. + +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `model_id` | String | The model ID of the model for which to delete the controller. | + +#### Example request + +```json +DELETE /_plugins/_ml/controllers/MzcIJX8BA7mbufL6DOwl +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ + "_index" : ".plugins-ml-controller", + "_id" : "MzcIJX8BA7mbufL6DOwl", + "_version" : 2, + "result" : "deleted", + "_shards" : { + "total" : 2, + "successful" : 2, + "failed" : 0 + }, + "_seq_no" : 27, + "_primary_term" : 18 +} +``` + +## Required permissions + +If you use the Security plugin, make sure you have the appropriate permissions: `cluster:admin/opensearch/ml/controllers/delete`. \ No newline at end of file diff --git a/_ml-commons-plugin/api/controller-apis/get-controller.md b/_ml-commons-plugin/api/controller-apis/get-controller.md new file mode 100644 index 0000000000..b30fe15679 --- /dev/null +++ b/_ml-commons-plugin/api/controller-apis/get-controller.md @@ -0,0 +1,78 @@ +--- +layout: default +title: Get controller +parent: Controller APIs +grand_parent: ML Commons APIs +nav_order: 20 +--- + +# Get a controller +**Introduced 2.12** +{: .label .label-purple } + +Use this API to retrieve information about a controller for a model by model ID. + +### Path and HTTP methods + +```json +GET /_plugins/_ml/controllers/ +``` + +## Path parameters + +The following table lists the available path parameters. + +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `model_id` | String | The model ID of the model for which to retrieve the controller. | + +#### Example request + +```json +GET /_plugins/_ml/controllers/T_S-cY0BKCJ3ot9qr0aP +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ + "model_id": "T_S-cY0BKCJ3ot9qr0aP", + "user_rate_limiter": { + "user1": { + "limit": "4", + "unit": "MINUTES" + }, + "user2": { + "limit": "4", + "unit": "MINUTES" + } + } +} +``` + +If there is no controller defined for the model, OpenSearch returns an error: + +```json +{ + "error": { + "root_cause": [ + { + "type": "status_exception", + "reason": "Failed to find model controller with the provided model ID: T_S-cY0BKCJ3ot9qr0aP" + } + ], + "type": "status_exception", + "reason": "Failed to find model controller with the provided model ID: T_S-cY0BKCJ3ot9qr0aP" + }, + "status": 404 +} +``` + +## Response fields + +For response field descriptions, see [Create Controller API request fields]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/controller-apis/create-controller#request-fields). + +## Required permissions + +If you use the Security plugin, make sure you have the appropriate permissions: `cluster:admin/opensearch/ml/controllers/get`. \ No newline at end of file diff --git a/_ml-commons-plugin/api/controller-apis/index.md b/_ml-commons-plugin/api/controller-apis/index.md new file mode 100644 index 0000000000..2f9afc1491 --- /dev/null +++ b/_ml-commons-plugin/api/controller-apis/index.md @@ -0,0 +1,25 @@ +--- +layout: default +title: Controller APIs +parent: ML Commons APIs +has_children: true +has_toc: false +nav_order: 29 +redirect_from: /ml-commons-plugin/api/controller-apis/ +--- + +# Controller APIs +**Introduced 2.12** +{: .label .label-purple } + +You can configure a rate limit for a specific user or users of a model by calling the Controller APIs. 
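+
+For example, the following request is a minimal sketch of a controller that limits a single user to 4 Predict API calls per minute. The model ID and username are illustrative; see the Create or update controller API in the following list for the full field reference.
+
+```json
+POST /_plugins/_ml/controllers/<model_id>
+{
+  "user_rate_limiter": {
+    "user1": {
+      "limit": 4,
+      "unit": "MINUTES"
+    }
+  }
+}
+```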
+ +ML Commons supports the following controller-level APIs: + +- [Create or update controller]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/controller-apis/create-controller/) +- [Get controller]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/controller-apis/get-controller/) +- [Delete controller]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/controller-apis/delete-controller/) + +## Required permissions + +To call the Controller APIs, you must have `cluster:admin/opensearch/ml/controllers/` permissions. Links to more information about each Controller API are provided in the preceding section. \ No newline at end of file diff --git a/_ml-commons-plugin/api/execute-algorithm.md b/_ml-commons-plugin/api/execute-algorithm.md index 3005ca85c1..7b06cfefe8 100644 --- a/_ml-commons-plugin/api/execute-algorithm.md +++ b/_ml-commons-plugin/api/execute-algorithm.md @@ -1,7 +1,7 @@ --- layout: default title: Execute algorithm -parent: ML Commons API +parent: ML Commons APIs nav_order: 30 --- diff --git a/_ml-commons-plugin/api/index.md b/_ml-commons-plugin/api/index.md index 00ddd01942..ec4cf12492 100644 --- a/_ml-commons-plugin/api/index.md +++ b/_ml-commons-plugin/api/index.md @@ -1,6 +1,6 @@ --- layout: default -title: ML Commons API +title: ML Commons APIs has_children: false nav_order: 130 has_children: true @@ -9,15 +9,18 @@ redirect_from: - /ml-commons-plugin/api/ --- -# ML Commons API +# ML Commons APIs -ML Commons supports the following API types: +ML Commons supports the following APIs: - [Model APIs]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/index/) - [Model group APIs]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-group-apis/index/) - [Connector APIs]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/connector-apis/index/) +- [Agent APIs]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/agent-apis/index/) +- [Memory APIs]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/memory-apis/index/) +- [Controller APIs]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/controller-apis/index/) +- [Execute Algorithm API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/execute-algorithm/) - [Tasks APIs]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/tasks-apis/index/) - [Train and Predict APIs]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/train-predict/index/) -- [Execute Algorithm API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/execute-algorithm/) - [Profile API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/profile/) - [Stats API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/stats/) diff --git a/_ml-commons-plugin/api/memory-apis/create-memory.md b/_ml-commons-plugin/api/memory-apis/create-memory.md new file mode 100644 index 0000000000..c7dac1783d --- /dev/null +++ b/_ml-commons-plugin/api/memory-apis/create-memory.md @@ -0,0 +1,61 @@ +--- +layout: default +title: Create or update memory +parent: Memory APIs +grand_parent: ML Commons APIs +nav_order: 10 +--- + +# Create or update a memory +**Introduced 2.12** +{: .label .label-purple } + +Use this API to create or update a conversational memory for [conversational search]({{site.url}}{{site.baseurl}}/search-plugins/conversational-search/). A memory stores conversation history for the current conversation. + +Once a memory is created, you'll provide its `memory_id` to other APIs. + +The POST method creates a new memory. The PUT method updates an existing memory. + +When the Security plugin is enabled, all memories exist in a `private` security mode. 
Only the user who created a memory can interact with that memory and its messages. +{: .important} + +## Path and HTTP methods + +```json +POST /_plugins/_ml/memory/ +PUT /_plugins/_ml/memory/ +``` + +## Path parameters + +The following table lists the available path parameters. + +Parameter | Data type | Description +:--- | :--- | :--- +`memory_id` | String | The ID of the memory to be updated. Required for the PUT method. + +## Request fields + +The following table lists the available request fields. + +Field | Data type | Required/Optional | Description +:--- | :--- | :--- | :--- +`name` | String | Optional | The name of the memory. + +#### Example request + +```json +POST /_plugins/_ml/memory/ +{ + "name": "Conversation for a RAG pipeline" +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ + "memory_id": "gW8Aa40BfUsSoeNTvOKI" +} +``` \ No newline at end of file diff --git a/_ml-commons-plugin/api/memory-apis/create-message.md b/_ml-commons-plugin/api/memory-apis/create-message.md new file mode 100644 index 0000000000..345f411ccd --- /dev/null +++ b/_ml-commons-plugin/api/memory-apis/create-message.md @@ -0,0 +1,173 @@ +--- +layout: default +title: Create or update message +parent: Memory APIs +grand_parent: ML Commons APIs +nav_order: 40 +--- + +# Create or update a message +**Introduced 2.12** +{: .label .label-purple } + +Use this API to create or update a message within a conversational memory for [conversational search]({{site.url}}{{site.baseurl}}/search-plugins/conversational-search/). A memory stores conversation history for the current conversation. A message represents one question/answer pair within a conversation. + +Once a message is created, you'll provide its `message_id` to other APIs. + +The POST method creates a new message. The PUT method updates an existing message. + +You can only update the `additional_info` field of a message. +{: .note} + +When the Security plugin is enabled, all memories exist in a `private` security mode. Only the user who created a memory can interact with that memory and its messages. +{: .important} + +## Path and HTTP methods + +```json +POST /_plugins/_ml/memory//messages +PUT /_plugins/_ml/memory/message/ +``` + +## Path parameters + +The following table lists the available path parameters. + +Parameter | Data type | Description +:--- | :--- | :--- +`memory_id` | String | The ID of the memory to which to add the message. Required for the POST method. +`message_id` | String | The ID of the message to be updated. Required for the PUT method. + +## Request fields + +The following table lists the available request fields. + +Field | Data type | Required/Optional | Updatable | Description +:--- | :--- | :--- | :--- | :--- +| `input` | String | Optional | No | The question (human input) in the message. | +| `prompt_template` | String | Optional | No | The prompt template that was used for the message. The template may contain instructions or examples that were sent to the large language model. | +| `response` | String | Optional | No | The answer (generative AI output) to the question. | +| `origin` | String | Optional | No | The name of the AI or other system that generated the response. | +| `additional_info` | Object | Optional | Yes | Any other information that was sent to the `origin`. 
| + +#### Example request: Create a message + +```json +POST /_plugins/_ml/memory/SXA2cY0BfUsSoeNTz-8m/messages +{ + "input": "How do I make an interaction?", + "prompt_template": "Hello OpenAI, can you answer this question?", + "response": "Hello, this is OpenAI. Here is the answer to your question.", + "origin": "MyFirstOpenAIWrapper", + "additional_info": { + "suggestion": "api.openai.com" + } +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ + "memory_id": "WnA3cY0BfUsSoeNTI-_J" +} +``` + +#### Example request: Add a field to `additional_info` + +```json +PUT /_plugins/_ml/memory/message/WnA3cY0BfUsSoeNTI-_J +{ + "additional_info": { + "feedback": "positive" + } +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ + "_index": ".plugins-ml-memory-message", + "_id": "WnA3cY0BfUsSoeNTI-_J", + "_version": 2, + "result": "updated", + "forced_refresh": true, + "_shards": { + "total": 1, + "successful": 1, + "failed": 0 + }, + "_seq_no": 45, + "_primary_term": 1 +} +``` + +The updated message contains an additional `feedback` field: + +```json +{ + "memory_id": "SXA2cY0BfUsSoeNTz-8m", + "message_id": "WnA3cY0BfUsSoeNTI-_J", + "create_time": "2024-02-03T23:04:15.554370024Z", + "input": "How do I make an interaction?", + "prompt_template": "Hello OpenAI, can you answer this question?", + "response": "Hello, this is OpenAI. Here is the answer to your question.", + "origin": "MyFirstOpenAIWrapper", + "additional_info": { + "feedback": "positive", + "suggestion": "api.openai.com" + } +} +``` + +#### Example request: Change a field in `additional_info` + +```json +PUT /_plugins/_ml/memory/message/WnA3cY0BfUsSoeNTI-_J +{ + "additional_info": { + "feedback": "negative" + } +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ + "_index": ".plugins-ml-memory-message", + "_id": "WnA3cY0BfUsSoeNTI-_J", + "_version": 3, + "result": "updated", + "forced_refresh": true, + "_shards": { + "total": 1, + "successful": 1, + "failed": 0 + }, + "_seq_no": 46, + "_primary_term": 1 +} +``` + +The updated message contains the updated `feedback` field: + +```json +{ + "memory_id": "SXA2cY0BfUsSoeNTz-8m", + "message_id": "WnA3cY0BfUsSoeNTI-_J", + "create_time": "2024-02-03T23:04:15.554370024Z", + "input": "How do I make an interaction?", + "prompt_template": "Hello OpenAI, can you answer this question?", + "response": "Hello, this is OpenAI. Here is the answer to your question.", + "origin": "MyFirstOpenAIWrapper", + "additional_info": { + "feedback": "negative", + "suggestion": "api.openai.com" + } +} +``` \ No newline at end of file diff --git a/_ml-commons-plugin/api/memory-apis/delete-memory.md b/_ml-commons-plugin/api/memory-apis/delete-memory.md new file mode 100644 index 0000000000..99e4cdb574 --- /dev/null +++ b/_ml-commons-plugin/api/memory-apis/delete-memory.md @@ -0,0 +1,45 @@ +--- +layout: default +title: Delete memory +parent: Memory APIs +grand_parent: ML Commons APIs +nav_order: 30 +--- + +# Delete a memory +**Introduced 2.12** +{: .label .label-purple } + +Use this API to delete a memory based on the `memory_id`. + +When the Security plugin is enabled, all memories exist in a `private` security mode. Only the user who created a memory can interact with that memory and its messages. +{: .important} + +## Path and HTTP methods + +```json +DELETE /_plugins/_ml/memory/ +``` + +## Path parameters + +The following table lists the available path parameters. 
+ +Parameter | Data type | Description +:--- | :--- | :--- +`memory_id` | String | The ID of the memory to be deleted. + +#### Example request + +```json +DELETE /_plugins/_ml/memory/MzcIJX8BA7mbufL6DOwl +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ + "success": true +} +``` \ No newline at end of file diff --git a/_ml-commons-plugin/api/memory-apis/get-memory.md b/_ml-commons-plugin/api/memory-apis/get-memory.md new file mode 100644 index 0000000000..63ab548c00 --- /dev/null +++ b/_ml-commons-plugin/api/memory-apis/get-memory.md @@ -0,0 +1,133 @@ +--- +layout: default +title: Get memory +parent: Memory APIs +grand_parent: ML Commons APIs +nav_order: 20 +--- + +# Get a memory +**Introduced 2.12** +{: .label .label-purple } + +Use this API to retrieve a conversational memory for [conversational search]({{site.url}}{{site.baseurl}}/search-plugins/conversational-search/). + +To retrieve memory information, you can: + +- [Get a memory by ID](#get-a-memory-by-id). +- [Get all memories](#get-all-memories). + +To retrieve message information for a memory, you can: + +- [Get all messages within a memory]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/memory-apis/get-message#get-all-messages-within-a-memory). +- [Search for messages within a memory]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/memory-apis/search-message/). + +When the Security plugin is enabled, all memories exist in a `private` security mode. Only the user who created a memory can interact with that memory and its messages. +{: .important} + +## Get a memory by ID + +You can retrieve memory information by using the `memory_id`. The response includes all messages within the memory. + +### Path and HTTP methods + +```json +GET /_plugins/_ml/memory/ +``` +### Path parameters + +The following table lists the available path parameters. + +Parameter | Data type | Description +:--- | :--- | :--- +`memory_id` | String | The ID of the memory to retrieve. + +#### Example request + +```json +GET /_plugins/_ml/memory/N8AE1osB0jLkkocYjz7D +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ + "memory_id": "gW8Aa40BfUsSoeNTvOKI", + "create_time": "2024-02-02T18:07:06.887061463Z", + "updated_time": "2024-02-02T19:01:32.121444968Z", + "name": "Conversation for a RAG pipeline", + "user": "admin" +} +``` + +## Get all memories + +Use this command to get all memories. + +### Path and HTTP methods + +```json +GET /_plugins/_ml/memory +``` + +### Query parameters + +Use the following query parameters to customize your results. All query parameters are optional. + +Parameter | Data type | Description +:--- | :--- | :--- +`max_results` | Integer | The maximum number of results to return. If there are fewer memories than the number set in `max_results`, the response returns only the number of memories that exist. Default is `10`. +`next_token` | Integer | The index of the first memory in the sorted list of memories to return. Memories are ordered by `create_time`. For example, if memories A, B, and C exist, `next_token=1` returns memories B and C. Default is `0` (return all memories). + +### Paginating results + +The `next_token` parameter provides the ordered position of the first memory within the sorted list of memories to return in the results. When a memory is added between subsequent GET Memory calls, one of the listed memories will be duplicated in the results. For example, suppose the current ordered list of memories is `BCDEF`, where `B` is the memory created most recently. 
When you call the Get Memory API with `next_token=0` and `max_results=3`, the API returns `BCD`. Suppose you then create another memory A. The memory list now appears as `ABCDEF`. The next time you call the Get Memory API with `next_token=3` and `max_results=3`, you'll receive `DEF` in the results. Notice that `D` will be returned in the first and second batches of results. The following diagram illustrates the duplication. + +Request | List of memories (returned memories are enclosed in brackets) | Results returned in the response +:--- | :--- | :--- +Get Memory (next_token = 0, max_results = 3) | [BCD]EF | BCD +Create Memory | ABCDEF | - +Get Memory (next_token = 3, max_results = 3) -> ABC[DEF] | DEF + + +#### Example request: Get all memories + +```json +GET /_plugins/_ml/memory/ +``` +{% include copy-curl.html %} + +#### Example request: Paginating results + +```json +GET /_plugins/_ml/memory?max_results=2&next_token=1 +``` + +#### Example response + +```json +{ + "memories": [ + { + "memory_id": "gW8Aa40BfUsSoeNTvOKI", + "create_time": "2024-02-02T18:07:06.887061463Z", + "updated_time": "2024-02-02T19:01:32.121444968Z", + "name": "Conversation for a RAG pipeline", + "user": "admin" + } + ] +} +``` + +## Response fields + +The following table lists the available response fields. + +| Field | Data type | Description | +| :--- | :--- | :--- | +| `memory_id` | String | The memory ID. | +| `create_time` | String | The time at which the memory was created. | +| `updated_time` | String | The time at which the memory was last updated. | +| `name` | String | The memory name. | +| `user` | String | The username of the user who created the memory. | \ No newline at end of file diff --git a/_ml-commons-plugin/api/memory-apis/get-message-traces.md b/_ml-commons-plugin/api/memory-apis/get-message-traces.md new file mode 100644 index 0000000000..300adfc11d --- /dev/null +++ b/_ml-commons-plugin/api/memory-apis/get-message-traces.md @@ -0,0 +1,142 @@ +--- +layout: default +title: Get message traces +parent: Memory APIs +grand_parent: ML Commons APIs +nav_order: 70 +--- + +# Get message traces +**Introduced 2.12** +{: .label .label-purple } + +Use this API to retrieve message trace information for [conversational search]({{site.url}}{{site.baseurl}}/search-plugins/conversational-search/). This can be useful for debugging. + +For each message, an agent may need to run different tools. You can use the Get Traces API to get all trace data for a message. The trace data includes detailed steps of a message execution. + +When the Security plugin is enabled, all memories exist in a `private` security mode. Only the user who created a memory can interact with that memory and its messages. +{: .important} + + +## Path and HTTP methods + +```json +GET /_plugins/_ml/memory/message//traces +``` + +## Path parameters + +The following table lists the available path parameters. + +Parameter | Data type | Description +:--- | :--- | :--- +`message_id` | String | The ID of the message to trace. 
+ +#### Example request + +```json +GET /_plugins/_ml/memory/message/TAuCZY0BT2tRrkdmCPqZ/traces +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ + "traces": [ + { + "memory_id": "7Qt4ZY0BT2tRrkdmSPlo", + "message_id": "TQuCZY0BT2tRrkdmEvpp", + "create_time": "2024-02-01T16:30:39.719968032Z", + "input": "Which index has most documents", + "prompt_template": null, + "response": "Let me check the document counts of each index", + "origin": null, + "additional_info": {}, + "parent_message_id": "TAuCZY0BT2tRrkdmCPqZ", + "trace_number": 1 + }, + { + "memory_id": "7Qt4ZY0BT2tRrkdmSPlo", + "message_id": "TguCZY0BT2tRrkdmEvp7", + "create_time": "2024-02-01T16:30:39.732979687Z", + "input": "", + "prompt_template": null, + "response": """health status index uuid pri rep docs.count docs.deleted store.size pri.store.size +green open .plugins-ml-model-group lHgGEgJhT_mpADyOZoXl2g 1 1 9 2 33.4kb 16.7kb +green open .plugins-ml-memory-meta b2LEpv0QS8K60QBjXtRm6g 1 1 13 0 117.5kb 58.7kb +green open .ql-datasources 9NXm_tMXQc6s_4uRToSNkQ 1 1 0 0 416b 208b +green open sample-ecommerce UPYOQcAfRGqFAlSxcZlRjw 1 1 40320 0 4.1mb 2mb +green open .plugins-ml-task xYTlprYCQnaaYici69SOjA 1 1 117 0 115.5kb 57.6kb +green open .opendistro_security 7DAqhm9QQmeEsQYhA40cJg 1 1 10 0 117kb 58.5kb +green open sample-host-health Na5tq6UiTt6r_qYME1vV-w 1 1 40320 0 2.6mb 1.3mb +green open .opensearch-observability 6PthtLluSKyYCdZR3Mw0iw 1 1 0 0 416b 208b +green open .plugins-ml-model WYcjBHcnRuSDHeVWPVupoA 1 1 191 45 4.2gb 2.1gb +green open index_for_neural_sparse GQswGabQRIazM_trnqaDrw 1 1 5 0 28.4kb 14.2kb +green open security-auditlog-2024.01.30 BhXR7Nd3QVOVGxJNpR0-jw 1 1 27768 0 13.8mb 7mb +green open sample-http-responses 0gmYYYdOTiCbVUvl_uDL0w 1 1 40320 0 2.5mb 1.2mb +green open security-auditlog-2024.02.01 2VD1ieDGS5m-TfjIdfT8Eg 1 1 36386 0 37mb 18.2mb +green open opensearch_dashboards_sample_data_ecommerce wnE6r7OvSPqc5YHj8wHSLA 1 1 4675 0 8.8mb 4.4mb +green open security-auditlog-2024.01.31 cNRK5-2eTwes0SRlXTl0RQ 1 1 34520 0 20.5mb 9.8mb +green open .plugins-ml-memory-message wTNBU4BBQVSFcFhNlUdfBQ 1 1 88 1 399.7kb 205kb +green open .plugins-flow-framework-state dJUNDv9MSJ2jjwKbzXPlrw 1 1 39 0 114.1kb 57kb +green open .plugins-ml-agent 7X1IzoLuSGmIujOh9i5mmg 1 1 27 0 146.6kb 73.3kb +green open .plugins-flow-framework-templates _ecC0KahTlmG_3tFUst7Uw 1 1 18 0 175.8kb 87.9kb +green open .plugins-ml-connector q45iJfVjQ5KgxeNC65DLSw 1 1 11 0 313.1kb 156.5kb +green open .kibana_1 vRjXK4bHSUueB_4iXiQ8yw 1 1 257 0 264kb 132kb +green open .plugins-ml-config G7gxGQB7TZeQzBasHd5PUg 1 1 1 0 7.8kb 3.9kb +green open .plugins-ml-controller NQTZPREZRhWoDdjCglRLFg 1 1 0 0 50.1kb 49.9kb +green open opensearch_dashboards_sample_data_logs 9gpOTB3rRgqBLvqis_k5LQ 1 1 14074 0 18mb 9mb +green open .plugins-flow-framework-config JlKPsCh6SEq-Jh6rPL_x9Q 1 1 1 0 7.8kb 3.9kb +green open opensearch_dashboards_sample_data_flights pJde0irnTce4-uobHwYmMQ 1 1 13059 0 11.9mb 5.9mb +green open my_test_data T4hwNs7CTJGIfw2QpCqQ_Q 1 1 6 0 91.7kb 45.8kb +green open .opendistro-job-scheduler-lock XjgmXAVKQ4e8Y-ac54VBzg 1 1 3 0 38.7kb 19.4kb +""", + "origin": "CatIndexTool", + "additional_info": {}, + "parent_message_id": "TAuCZY0BT2tRrkdmCPqZ", + "trace_number": 2 + }, + { + "memory_id": "7Qt4ZY0BT2tRrkdmSPlo", + "message_id": "UwuCZY0BT2tRrkdmHPos", + "create_time": "2024-02-01T16:30:42.217897656Z", + "input": "Which index has most documents", + "prompt_template": null, + "response": "Based on the cluster health information 
provided, the index with the most documents is .plugins-ml-model with 191 documents", + "origin": null, + "additional_info": {}, + "parent_message_id": "TAuCZY0BT2tRrkdmCPqZ", + "trace_number": 3 + }, + { + "memory_id": "7Qt4ZY0BT2tRrkdmSPlo", + "message_id": "UQuCZY0BT2tRrkdmHPos", + "create_time": "2024-02-01T16:30:42.218120716Z", + "input": "Which index has most documents", + "prompt_template": null, + "response": "The index with the most documents is the .plugins-ml-model index, which contains 191 documents based on the cluster health information provided.", + "origin": null, + "additional_info": {}, + "parent_message_id": "TAuCZY0BT2tRrkdmCPqZ", + "trace_number": 4 + }, + { + "memory_id": "7Qt4ZY0BT2tRrkdmSPlo", + "message_id": "UguCZY0BT2tRrkdmHPos", + "create_time": "2024-02-01T16:30:42.218240713Z", + "input": "Which index has most documents", + "prompt_template": null, + "response": "The index with the most documents is the .plugins-ml-model index, which contains 191 documents based on the cluster health information provided.", + "origin": null, + "additional_info": {}, + "parent_message_id": "TAuCZY0BT2tRrkdmCPqZ", + "trace_number": 5 + } + ] +} +``` + +## Response fields + +For information about response fields, see [Create Message request fields]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/memory-apis/create-message#request-fields). \ No newline at end of file diff --git a/_ml-commons-plugin/api/memory-apis/get-message.md b/_ml-commons-plugin/api/memory-apis/get-message.md new file mode 100644 index 0000000000..2f4cfc949f --- /dev/null +++ b/_ml-commons-plugin/api/memory-apis/get-message.md @@ -0,0 +1,139 @@ +--- +layout: default +title: Get message +parent: Memory APIs +grand_parent: ML Commons APIs +nav_order: 50 +--- + +# Get message +**Introduced 2.12** +{: .label .label-purple } + +Use this API to retrieve message information for [conversational search]({{site.url}}{{site.baseurl}}/search-plugins/conversational-search/). + +To retrieve message information, you can: + +- [Get a message by ID](#get-a-message-by-id). +- [Get all messages within a memory](#get-all-messages-within-a-memory). + +When the Security plugin is enabled, all memories exist in a `private` security mode. Only the user who created a memory can interact with that memory and its messages. +{: .important} + +## Get a message by ID + +You can retrieve message information by using the `message_id`. + +### Path and HTTP methods + +```json +GET /_plugins/_ml/memory/message/ +``` + +### Path parameters + +The following table lists the available path parameters. + +Parameter | Data type | Description +:--- | :--- | :--- +`message_id` | String | The ID of the message to retrieve. + +#### Example request + +```json +GET /_plugins/_ml/memory/message/0m8ya40BfUsSoeNTj-pU +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ + "memory_id": "gW8Aa40BfUsSoeNTvOKI", + "message_id": "0m8ya40BfUsSoeNTj-pU", + "create_time": "2024-02-02T19:01:32.113621539Z", + "input": null, + "prompt_template": null, + "response": "Hello, this is OpenAI. Here is the answer to your question.", + "origin": null, + "additional_info": { + "suggestion": "api.openai.com" + } +} +``` + +For information about response fields, see [Create Message request fields]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/memory-apis/create-message#request-fields). + +## Get all messages within a memory + +Use this command to get a list of messages for a certain memory. 
+ +### Path and HTTP methods + +```json +GET /_plugins/_ml/memory//messages +``` + +### Path parameters + +The following table lists the available path parameters. + +Parameter | Data type | Description +:--- | :--- | :--- +`memory_id` | String | The ID of the memory for which to retrieve messages. + +#### Example request + +```json +GET /_plugins/_ml/memory/gW8Aa40BfUsSoeNTvOKI/messages +``` +{% include copy-curl.html %} + +```json +POST /_plugins/_ml/message/_search +{ + "query": { + "match_all": {} + }, + "size": 1000 +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ + "messages": [ + { + "memory_id": "gW8Aa40BfUsSoeNTvOKI", + "message_id": "BW8ha40BfUsSoeNT8-i3", + "create_time": "2024-02-02T18:43:23.566994302Z", + "input": "How do I make an interaction?", + "prompt_template": "Hello OpenAI, can you answer this question?", + "response": "Hello, this is OpenAI. Here is the answer to your question.", + "origin": "MyFirstOpenAIWrapper", + "additional_info": { + "suggestion": "api.openai.com" + } + }, + { + "memory_id": "gW8Aa40BfUsSoeNTvOKI", + "message_id": "0m8ya40BfUsSoeNTj-pU", + "create_time": "2024-02-02T19:01:32.113621539Z", + "input": null, + "prompt_template": null, + "response": "Hello, this is OpenAI. Here is the answer to your question.", + "origin": null, + "additional_info": { + "suggestion": "api.openai.com" + } + } + ] +} +``` + +## Response fields + +For information about response fields, see [Create Message request fields]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/memory-apis/create-message#request-fields). + diff --git a/_ml-commons-plugin/api/memory-apis/index.md b/_ml-commons-plugin/api/memory-apis/index.md new file mode 100644 index 0000000000..a279eafac9 --- /dev/null +++ b/_ml-commons-plugin/api/memory-apis/index.md @@ -0,0 +1,32 @@ +--- +layout: default +title: Memory APIs +parent: ML Commons APIs +has_children: true +has_toc: false +nav_order: 28 +redirect_from: /ml-commons-plugin/api/memory-apis/ +--- + +# Memory APIs +**Introduced 2.12** +{: .label .label-purple } + +Memory APIs provide operations needed to implement [conversational search]({{site.url}}{{site.baseurl}}/search-plugins/conversational-search/). A memory stores conversation history for the current conversation. A message represents one question/answer interaction between the user and a large language model. Messages are organized into memories. + +ML Commons supports the following memory-level APIs: + +- [Create or update memory]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/memory-apis/create-memory/) +- [Get memory]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/memory-apis/get-memory/) +- [Search memory]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/memory-apis/search-memory/) +- [Delete memory]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/memory-apis/delete-memory/) + +ML Commons supports the following message-level APIs: + +- [Create or update message]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/memory-apis/create-message/) +- [Get message]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/memory-apis/get-message/) +- [Search message]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/memory-apis/search-message/) +- [Get message traces]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/memory-apis/get-message-traces/) + +When the Security plugin is enabled, all memories exist in a `private` security mode. Only the user who created a memory can interact with that memory and its messages. 
+{: .important} \ No newline at end of file diff --git a/_ml-commons-plugin/api/memory-apis/search-memory.md b/_ml-commons-plugin/api/memory-apis/search-memory.md new file mode 100644 index 0000000000..fc8dd3e1d9 --- /dev/null +++ b/_ml-commons-plugin/api/memory-apis/search-memory.md @@ -0,0 +1,133 @@ +--- +layout: default +title: Search memory +parent: Memory APIs +grand_parent: ML Commons APIs +nav_order: 25 +--- + +# Search for a memory +**Introduced 2.12** +{: .label .label-purple } + +This API retrieves a conversational memory for [conversational search]({{site.url}}{{site.baseurl}}/search-plugins/conversational-search/). Use this command to search for memories. + +When the Security plugin is enabled, all memories exist in a `private` security mode. Only the user who created a memory can interact with that memory and its messages. +{: .important} + +## Path and HTTP methods + +```json +GET /_plugins/_ml/memory/_search +POST /_plugins/_ml/memory/_search +``` + +#### Example request: Searching for all memories + +```json +POST /_plugins/_ml/memory/_search +{ + "query": { + "match_all": {} + }, + "size": 1000 +} +``` +{% include copy-curl.html %} + +#### Example request: Searching for a memory by name + +```json +POST /_plugins/_ml/memory/_search +{ + "query": { + "term": { + "name": { + "value": "conversation" + } + } + } +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ + "took": 1, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 3, + "relation": "eq" + }, + "max_score": 0.2195382, + "hits": [ + { + "_index": ".plugins-ml-memory-meta", + "_id": "znCqcI0BfUsSoeNTntd7", + "_version": 3, + "_seq_no": 39, + "_primary_term": 1, + "_score": 0.2195382, + "_source": { + "updated_time": "2024-02-03T20:36:10.252213029Z", + "create_time": "2024-02-03T20:30:46.395829411Z", + "application_type": null, + "name": "Conversation about NYC population", + "user": "admin" + } + }, + { + "_index": ".plugins-ml-memory-meta", + "_id": "iXC4bI0BfUsSoeNTjS30", + "_version": 4, + "_seq_no": 11, + "_primary_term": 1, + "_score": 0.20763937, + "_source": { + "updated_time": "2024-02-03T02:59:39.862347093Z", + "create_time": "2024-02-03T02:07:30.804554275Z", + "application_type": null, + "name": "Test conversation for RAG pipeline", + "user": "admin" + } + }, + { + "_index": ".plugins-ml-memory-meta", + "_id": "gW8Aa40BfUsSoeNTvOKI", + "_version": 4, + "_seq_no": 6, + "_primary_term": 1, + "_score": 0.19754036, + "_source": { + "updated_time": "2024-02-02T19:01:32.121444968Z", + "create_time": "2024-02-02T18:07:06.887061463Z", + "application_type": null, + "name": "Conversation for a RAG pipeline", + "user": "admin" + } + } + ] + } +} +``` + +## Response fields + +The following table lists all response fields. + +| Field | Data type | Description | +| :--- | :--- | :--- | +| `memory_id` | String | The memory ID. | +| `create_time` | String | The time at which the memory was created. | +| `updated_time` | String | The time at which the memory was last updated. | +| `name` | String | The memory name. | +| `user` | String | The username of the user who created the memory. 
| \ No newline at end of file diff --git a/_ml-commons-plugin/api/memory-apis/search-message.md b/_ml-commons-plugin/api/memory-apis/search-message.md new file mode 100644 index 0000000000..a88ccfbb41 --- /dev/null +++ b/_ml-commons-plugin/api/memory-apis/search-message.md @@ -0,0 +1,94 @@ +--- +layout: default +title: Search message +parent: Memory APIs +grand_parent: ML Commons APIs +nav_order: 60 +--- + +# Search for a message +**Introduced 2.12** +{: .label .label-purple } + +Retrieves message information for [conversational search]({{site.url}}{{site.baseurl}}/search-plugins/conversational-search/). You can send queries to the `_search` endpoint to search for matching messages within a memory. + +When the Security plugin is enabled, all memories exist in a `private` security mode. Only the user who created a memory can interact with that memory and its messages. +{: .important} + +## Path and HTTP methods + +```json +POST /_plugins/_ml/memory//_search +GET /_plugins/_ml/memory//_search +``` + +### Path parameters + +The following table lists the available path parameters. + +Parameter | Data type | Description +:--- | :--- | :--- +`memory_id` | String | The ID of the memory used to search for messages matching the query. + +#### Example request + +```json +GET /_plugins/_ml/memory/gW8Aa40BfUsSoeNTvOKI/_search +{ + "query": { + "match": { + "input": "interaction" + } + } +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ + "took": 5, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 0.47000366, + "hits": [ + { + "_index": ".plugins-ml-memory-message", + "_id": "BW8ha40BfUsSoeNT8-i3", + "_version": 1, + "_seq_no": 0, + "_primary_term": 1, + "_score": 0.47000366, + "_source": { + "input": "How do I make an interaction?", + "memory_id": "gW8Aa40BfUsSoeNTvOKI", + "trace_number": null, + "create_time": "2024-02-02T18:43:23.566994302Z", + "additional_info": { + "suggestion": "api.openai.com" + }, + "response": "Hello, this is OpenAI. Here is the answer to your question.", + "origin": "MyFirstOpenAIWrapper", + "parent_message_id": null, + "prompt_template": "Hello OpenAI, can you answer this question?" + } + } + ] + } +} +``` + +## Response fields + +For information about response fields, see [Create Message request fields]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/memory-apis/create-message#request-fields). \ No newline at end of file diff --git a/_ml-commons-plugin/api/model-apis/delete-model.md b/_ml-commons-plugin/api/model-apis/delete-model.md index b75abd0b81..b35e7c808b 100644 --- a/_ml-commons-plugin/api/model-apis/delete-model.md +++ b/_ml-commons-plugin/api/model-apis/delete-model.md @@ -2,7 +2,7 @@ layout: default title: Delete model parent: Model APIs -grand_parent: ML Commons API +grand_parent: ML Commons APIs nav_order: 50 --- diff --git a/_ml-commons-plugin/api/model-apis/deploy-model.md b/_ml-commons-plugin/api/model-apis/deploy-model.md index 7c6de7ab1f..2c6991ba22 100644 --- a/_ml-commons-plugin/api/model-apis/deploy-model.md +++ b/_ml-commons-plugin/api/model-apis/deploy-model.md @@ -2,13 +2,25 @@ layout: default title: Deploy model parent: Model APIs -grand_parent: ML Commons API -nav_order: 30 +grand_parent: ML Commons APIs +nav_order: 20 --- # Deploy a model -The deploy model operation reads the model's chunks from the model index and then creates an instance of the model to cache into memory. 
This operation requires the `model_id`. +The deploy model operation reads the model's chunks from the model index and then creates an instance of the model to cache in memory. This operation requires the `model_id`. + +Starting with OpenSearch version 2.13, [externally hosted models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/index) are deployed automatically by default when you send a Predict API request for the first time. To disable automatic deployment for an externally hosted model, set `plugins.ml_commons.model_auto_deploy.enable` to `false`: + +```json +PUT _cluster/settings +{ + "persistent": { + "plugins.ml_commons.model_auto_deploy.enable": "false" + } +} +``` +{% include copy-curl.html %} For information about user access for this API, see [Model access control considerations]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/index/#model-access-control-considerations). diff --git a/_ml-commons-plugin/api/model-apis/get-model.md b/_ml-commons-plugin/api/model-apis/get-model.md index 7351372154..0286497d31 100644 --- a/_ml-commons-plugin/api/model-apis/get-model.md +++ b/_ml-commons-plugin/api/model-apis/get-model.md @@ -2,236 +2,57 @@ layout: default title: Get model parent: Model APIs -grand_parent: ML Commons API -nav_order: 20 +grand_parent: ML Commons APIs +nav_order: 30 --- # Get a model -To retrieve information about a model, you can: - -- [Get a model by ID](#get-a-model-by-id) -- [Search for a model](#search-for-a-model) - -## Get a model by ID - You can retrieve model information using the `model_id`. For information about user access for this API, see [Model access control considerations]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/index/#model-access-control-considerations). -### Path and HTTP methods - -```json -GET /_plugins/_ml/models/ -``` - -#### Example request - -```json -GET /_plugins/_ml/models/N8AE1osB0jLkkocYjz7D -``` -{% include copy-curl.html %} - -#### Example response - -```json -{ -"name" : "all-MiniLM-L6-v2_onnx", -"algorithm" : "TEXT_EMBEDDING", -"version" : "1", -"model_format" : "TORCH_SCRIPT", -"model_state" : "LOADED", -"model_content_size_in_bytes" : 83408741, -"model_content_hash_value" : "9376c2ebd7c83f99ec2526323786c348d2382e6d86576f750c89ea544d6bbb14", -"model_config" : { - "model_type" : "bert", - "embedding_dimension" : 384, - "framework_type" : "SENTENCE_TRANSFORMERS", - "all_config" : """{"_name_or_path":"nreimers/MiniLM-L6-H384-uncased","architectures":["BertModel"],"attention_probs_dropout_prob":0.1,"gradient_checkpointing":false,"hidden_act":"gelu","hidden_dropout_prob":0.1,"hidden_size":384,"initializer_range":0.02,"intermediate_size":1536,"layer_norm_eps":1e-12,"max_position_embeddings":512,"model_type":"bert","num_attention_heads":12,"num_hidden_layers":6,"pad_token_id":0,"position_embedding_type":"absolute","transformers_version":"4.8.2","type_vocab_size":2,"use_cache":true,"vocab_size":30522}""" -}, -"created_time" : 1665961344044, -"last_uploaded_time" : 1665961373000, -"last_loaded_time" : 1665961815959, -"total_chunks" : 9 -} -``` - -## Search for a model - -Use this command to search for models you've already created. - -The response will contain only those model versions to which you have access. For example, if you send a match all query, model versions for the following model group types will be returned: - -- All public model groups in the index. -- Private model groups for which you are the model owner. 
-- Model groups that share at least one backend role with your backend roles. - -For more information, see [Model access control]({{site.url}}{{site.baseurl}}/ml-commons-plugin/model-access-control/). - -### Path and HTTP methods - -```json -GET /_plugins/_ml/models/_search -POST /_plugins/_ml/models/_search -``` - -#### Example request: Searching for all models +## Path and HTTP methods ```json -POST /_plugins/_ml/models/_search -{ - "query": { - "match_all": {} - }, - "size": 1000 -} +GET /_plugins/_ml/models/ ``` -{% include copy-curl.html %} -#### Example request: Searching for models with algorithm "FIT_RCF" +## Path parameters -```json -POST /_plugins/_ml/models/_search -{ - "query": { - "term": { - "algorithm": { - "value": "FIT_RCF" - } - } - } -} -``` -{% include copy-curl.html %} - -#### Example: Excluding model chunks - -```json -GET /_plugins/_ml/models/_search -{ - "query": { - "bool": { - "must_not": { - "exists": { - "field": "chunk_number" - } - } - } - }, - "sort": [ - { - "created_time": { - "order": "desc" - } - } - ] -} -``` -{% include copy-curl.html %} +The following table lists the available path parameters. -#### Example: Searching for all model chunks +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `model_id` | String | The model ID of the model to retrieve. | -The following query searches for all chunks of the model with the ID `979y9YwBjWKCe6KgNGTm` and sorts the chunks in ascending order: +#### Example request ```json -GET /_plugins/_ml/models/_search -{ - "query": { - "bool": { - "filter": [ - { - "term": { - "model_id": "9r9w9YwBjWKCe6KgyGST" - } - } - ] - } - }, - "sort": [ - { - "chunk_number": { - "order": "asc" - } - } - ] -} +GET /_plugins/_ml/models/N8AE1osB0jLkkocYjz7D ``` {% include copy-curl.html %} -#### Example: Searching for a model by description +#### Example response ```json -GET _plugins/_ml/models/_search { - "query": { - "bool": { - "should": [ - { - "match": { - "description": "sentence transformer" - } - } - ], - "must_not": { - "exists": { - "field": "chunk_number" - } - } - } + "name" : "all-MiniLM-L6-v2_onnx", + "algorithm" : "TEXT_EMBEDDING", + "version" : "1", + "model_format" : "TORCH_SCRIPT", + "model_state" : "LOADED", + "model_content_size_in_bytes" : 83408741, + "model_content_hash_value" : "9376c2ebd7c83f99ec2526323786c348d2382e6d86576f750c89ea544d6bbb14", + "model_config" : { + "model_type" : "bert", + "embedding_dimension" : 384, + "framework_type" : "SENTENCE_TRANSFORMERS", + "all_config" : """{"_name_or_path":"nreimers/MiniLM-L6-H384-uncased","architectures":["BertModel"],"attention_probs_dropout_prob":0.1,"gradient_checkpointing":false,"hidden_act":"gelu","hidden_dropout_prob":0.1,"hidden_size":384,"initializer_range":0.02,"intermediate_size":1536,"layer_norm_eps":1e-12,"max_position_embeddings":512,"model_type":"bert","num_attention_heads":12,"num_hidden_layers":6,"pad_token_id":0,"position_embedding_type":"absolute","transformers_version":"4.8.2","type_vocab_size":2,"use_cache":true,"vocab_size":30522}""" }, - "size": 1000 + "created_time" : 1665961344044, + "last_uploaded_time" : 1665961373000, + "last_loaded_time" : 1665961815959, + "total_chunks" : 9 } -``` -{% include copy-curl.html %} - -#### Example response - -```json -{ - "took" : 8, - "timed_out" : false, - "_shards" : { - "total" : 1, - "successful" : 1, - "skipped" : 0, - "failed" : 0 - }, - "hits" : { - "total" : { - "value" : 2, - "relation" : "eq" - }, - "max_score" : 2.4159138, - "hits" : [ - { - "_index" : ".plugins-ml-model", - "_id" : 
"-QkKJX8BvytMh9aUeuLD", - "_version" : 1, - "_seq_no" : 12, - "_primary_term" : 15, - "_score" : 2.4159138, - "_source" : { - "name" : "FIT_RCF", - "version" : 1, - "content" : "xxx", - "algorithm" : "FIT_RCF" - } - }, - { - "_index" : ".plugins-ml-model", - "_id" : "OxkvHn8BNJ65KnIpck8x", - "_version" : 1, - "_seq_no" : 2, - "_primary_term" : 8, - "_score" : 2.4159138, - "_source" : { - "name" : "FIT_RCF", - "version" : 1, - "content" : "xxx", - "algorithm" : "FIT_RCF" - } - } - ] - } - } ``` \ No newline at end of file diff --git a/_ml-commons-plugin/api/model-apis/index.md b/_ml-commons-plugin/api/model-apis/index.md index 5cbdbe27f1..444da1fe70 100644 --- a/_ml-commons-plugin/api/model-apis/index.md +++ b/_ml-commons-plugin/api/model-apis/index.md @@ -1,9 +1,10 @@ --- layout: default title: Model APIs -parent: ML Commons API +parent: ML Commons APIs has_children: true nav_order: 10 +has_toc: false --- # Model APIs @@ -11,10 +12,13 @@ nav_order: 10 ML Commons supports the following model-level APIs: - [Register model]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/register-model/) -- [Get model]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/get-model/) - [Deploy model]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/deploy-model/) +- [Get model]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/get-model/) +- [Search model]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/search-model/) +- [Update model]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/update-model/) - [Undeploy model]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/undeploy-model/) - [Delete model]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/delete-model/) +- [Predict]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/train-predict/predict/) (invokes a model) ## Model access control considerations diff --git a/_ml-commons-plugin/api/model-apis/register-model.md b/_ml-commons-plugin/api/model-apis/register-model.md index fbfe3c0f2e..dd157ed264 100644 --- a/_ml-commons-plugin/api/model-apis/register-model.md +++ b/_ml-commons-plugin/api/model-apis/register-model.md @@ -2,7 +2,7 @@ layout: default title: Register model parent: Model APIs -grand_parent: ML Commons API +grand_parent: ML Commons APIs nav_order: 10 --- @@ -29,7 +29,14 @@ If the model is more than 10 MB in size, ML Commons splits it into smaller chunk ```json POST /_plugins/_ml/models/_register ``` -{% include copy-curl.html %} + +## Query parameters + +The following table lists the available query parameters. All query parameters are optional. + +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `deploy` | Boolean | Whether to deploy the model after registering it. The deploy operation is performed by calling the [Deploy Model API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/deploy-model/). Default is `false`. | ## Register an OpenSearch-provided pretrained model @@ -50,6 +57,7 @@ Field | Data type | Required/Optional | Description `model_format` | String | Required | The portable format of the model file. Valid values are `TORCH_SCRIPT` and `ONNX`. | `description` | String | Optional| The model description. | `model_group_id` | String | Optional | The model group ID of the model group to register this model to. +`is_enabled`| Boolean | Specifies whether the model is enabled. Disabling the model makes it unavailable for Predict API requests, regardless of the model's deployment status. Default is `true`. 
#### Example request: OpenSearch-provided text embedding model @@ -77,11 +85,12 @@ Field | Data type | Required/Optional | Description `name`| String | Required | The model name. | `version` | String | Required | The model version. | `model_format` | String | Required | The portable format of the model file. Valid values are `TORCH_SCRIPT` and `ONNX`. | -`function_name` | String | Required | Set this parameter to `SPARSE_ENCODING` or `SPARSE_TOKENIZE`. +`function_name` | String | Required | For text embedding models, set this parameter to `TEXT_EMBEDDING`. For sparse encoding models, set this parameter to `SPARSE_ENCODING` or `SPARSE_TOKENIZE`. For cross-encoder models, set this parameter to `TEXT_SIMILARITY`. `model_content_hash_value` | String | Required | The model content hash generated using the SHA-256 hashing algorithm. `url` | String | Required | The URL that contains the model. | `description` | String | Optional| The model description. | `model_group_id` | String | Optional | The model group ID of the model group to register this model to. +`is_enabled`| Boolean | Specifies whether the model is enabled. Disabling the model makes it unavailable for Predict API requests, regardless of the model's deployment status. Default is `true`. #### Example request: OpenSearch-provided sparse encoding model @@ -89,13 +98,9 @@ Field | Data type | Required/Optional | Description POST /_plugins/_ml/models/_register { "name": "amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v1", - "version": "1.0.0", + "version": "1.0.1", "model_group_id": "Z1eQf4oB5Vm0Tdw8EIP2", - "description": "This is a neural sparse encoding model: It transfers text into sparse vector, and then extract nonzero index and value to entry and weights. It serves only in ingestion and customer should use tokenizer model in query.", - "model_format": "TORCH_SCRIPT", - "function_name": "SPARSE_ENCODING", - "model_content_hash_value": "9a41adb6c13cf49a7e3eff91aef62ed5035487a6eca99c996156d25be2800a9a", - "url": "https://artifacts.opensearch.org/models/ml-models/amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v1/1.0.0/torch_script/opensearch-neural-sparse-encoding-doc-v1-1.0.0-torch_script.zip" + "model_format": "TORCH_SCRIPT" } ``` {% include copy-curl.html %} @@ -119,6 +124,7 @@ Field | Data type | Required/Optional | Description `url` | String | Required | The URL that contains the model. | `description` | String | Optional| The model description. | `model_group_id` | String | Optional | The model group ID of the model group to register this model to. +`is_enabled`| Boolean | Specifies whether the model is enabled. Disabling the model makes it unavailable for Predict API requests, regardless of the model's deployment status. Default is `true`. #### The `model_config` object @@ -176,8 +182,10 @@ Field | Data type | Required/Optional | Description `connector` | Object | Required | Contains specifications for a connector for a model hosted on a third-party platform. For more information, see [Creating a connector for a specific model]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/connectors/#creating-a-connector-for-a-specific-model). You must provide either `connector_id` or `connector`. `description` | String | Optional| The model description. | `model_group_id` | String | Optional | The model group ID of the model group to register this model to. +`is_enabled`| Boolean | Specifies whether the model is enabled. 
Disabling the model makes it unavailable for Predict API requests, regardless of the model's deployment status. Default is `true`. +`guardrails`| Object | Optional | The guardrails for the model input. For more information, see [Guardrails](#the-guardrails-parameter).| -#### Example request: Remote model with a standalone connector +#### Example request: Externally hosted with a standalone connector ```json POST /_plugins/_ml/models/_register @@ -191,7 +199,7 @@ POST /_plugins/_ml/models/_register ``` {% include copy-curl.html %} -#### Example request: Remote model with a connector specified as part of the model +#### Example request: Externally hosted with a connector specified as part of the model ```json POST /_plugins/_ml/models/_register @@ -241,6 +249,70 @@ OpenSearch responds with the `task_id` and task `status`. } ``` +### The `guardrails` parameter + +Guardrails are safety measures for large language models (LLMs). They provide a set of rules and boundaries that control how an LLM behaves and what kind of output it generates. + +To register an externally hosted model with guardrails, provide the `guardrails` parameter, which supports the following fields. All fields are optional. + +Field | Data type | Description +:--- | :--- | :--- +`type` | String | The guardrail type. Currently, only `local_regex` is supported. +`input_guardrail`| Object | The guardrail for the model input. | +`output_guardrail`| Object | The guardrail for the model output. | +`stop_words`| Object | The list of indexes containing stopwords used for the model input/output validation. If the model prompt/response contains a stopword contained in any of the indexes, the predict request on this model is rejected. | +`index_name`| Object | The name of the index storing the stopwords. | +`source_fields`| Object | The name of the field storing the stopwords. | +`regex`| Object | A regular expression used for input/output validation. If the model prompt/response matches the regular expression, the predict request on this model is rejected. | + +#### Example request: Externally hosted model with guardrails + +```json +POST /_plugins/_ml/models/_register +{ + "name": "openAI-gpt-3.5-turbo", + "function_name": "remote", + "model_group_id": "1jriBYsBq7EKuKzZX131", + "description": "test model", + "connector_id": "a1eMb4kBJ1eYAeTMAljY", + "guardrails": { + "type": "local_regex", + "input_guardrail": { + "stop_words": [ + { + "index_name": "stop_words_input", + "source_fields": ["title"] + } + ], + "regex": ["regex1", "regex2"] + }, + "output_guardrail": { + "stop_words": [ + { + "index_name": "stop_words_output", + "source_fields": ["title"] + } + ], + "regex": ["regex1", "regex2"] + } + } +} +``` +{% include copy-curl.html %} + +For a complete example, see [Guardrails]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/guardrails/). 
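+
+Note that the stopword indexes referenced by the guardrails in the preceding request (for example, `stop_words_input` and `stop_words_output`) are regular OpenSearch indexes that you create and populate yourself. The following is a minimal sketch, assuming the stopwords are stored in the `title` field:
+
+```json
+POST /stop_words_input/_doc
+{
+  "title": "blocked phrase"
+}
+```
+{% include copy-curl.html %}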
+ +#### Example response + +OpenSearch responds with the `task_id` and task `status`: + +```json +{ + "task_id" : "ew8I44MBhyWuIwnfvDIH", + "status" : "CREATED" +} +``` + ## Check the status of model registration To see the status of your model registration and retrieve the model ID created for the new model version, pass the `task_id` as a path parameter to the Tasks API: diff --git a/_ml-commons-plugin/api/model-apis/search-model.md b/_ml-commons-plugin/api/model-apis/search-model.md new file mode 100644 index 0000000000..729237eb74 --- /dev/null +++ b/_ml-commons-plugin/api/model-apis/search-model.md @@ -0,0 +1,187 @@ +--- +layout: default +title: Search model +parent: Model APIs +grand_parent: ML Commons APIs +nav_order: 35 +--- + +# Search for a model + +You can use this command to search for models you've already created. + +The response will contain only those model versions to which you have access. For example, if you send a `match_all` query, model versions for the following model group types will be returned: + +- All public model groups in the index +- Private model groups for which you are the model owner +- Model groups that share at least one backend role with your backend roles + +For information about user access for this API, see [Model access control considerations]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/index/#model-access-control-considerations). + +## Path and HTTP methods + +```json +GET /_plugins/_ml/models/_search +POST /_plugins/_ml/models/_search +``` + +#### Example request: Searching for all models + +```json +POST /_plugins/_ml/models/_search +{ + "query": { + "match_all": {} + }, + "size": 1000 +} +``` +{% include copy-curl.html %} + +#### Example request: Searching for models with the algorithm "FIT_RCF" + +```json +POST /_plugins/_ml/models/_search +{ + "query": { + "term": { + "algorithm": { + "value": "FIT_RCF" + } + } + } +} +``` +{% include copy-curl.html %} + +#### Example: Excluding model chunks + +```json +GET /_plugins/_ml/models/_search +{ + "query": { + "bool": { + "must_not": { + "exists": { + "field": "chunk_number" + } + } + } + }, + "sort": [ + { + "created_time": { + "order": "desc" + } + } + ] +} +``` +{% include copy-curl.html %} + +#### Example: Searching for all model chunks + +The following query searches for all chunks of the model with the ID `979y9YwBjWKCe6KgNGTm` and sorts the chunks in ascending order: + +```json +GET /_plugins/_ml/models/_search +{ + "query": { + "bool": { + "filter": [ + { + "term": { + "model_id": "9r9w9YwBjWKCe6KgyGST" + } + } + ] + } + }, + "sort": [ + { + "chunk_number": { + "order": "asc" + } + } + ] +} +``` +{% include copy-curl.html %} + +#### Example: Searching for a model by description + +```json +GET _plugins/_ml/models/_search +{ + "query": { + "bool": { + "should": [ + { + "match": { + "description": "sentence transformer" + } + } + ], + "must_not": { + "exists": { + "field": "chunk_number" + } + } + } + }, + "size": 1000 +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ + "took" : 8, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 2, + "relation" : "eq" + }, + "max_score" : 2.4159138, + "hits" : [ + { + "_index" : ".plugins-ml-model", + "_id" : "-QkKJX8BvytMh9aUeuLD", + "_version" : 1, + "_seq_no" : 12, + "_primary_term" : 15, + "_score" : 2.4159138, + "_source" : { + "name" : "FIT_RCF", + "version" : 1, + "content" : "xxx", + "algorithm" : "FIT_RCF" 
+ } + }, + { + "_index" : ".plugins-ml-model", + "_id" : "OxkvHn8BNJ65KnIpck8x", + "_version" : 1, + "_seq_no" : 2, + "_primary_term" : 8, + "_score" : 2.4159138, + "_source" : { + "name" : "FIT_RCF", + "version" : 1, + "content" : "xxx", + "algorithm" : "FIT_RCF" + } + } + ] + } + } +``` \ No newline at end of file diff --git a/_ml-commons-plugin/api/model-apis/undeploy-model.md b/_ml-commons-plugin/api/model-apis/undeploy-model.md index 193da04391..0346d04566 100644 --- a/_ml-commons-plugin/api/model-apis/undeploy-model.md +++ b/_ml-commons-plugin/api/model-apis/undeploy-model.md @@ -2,8 +2,8 @@ layout: default title: Undeploy model parent: Model APIs -grand_parent: ML Commons API -nav_order: 40 +grand_parent: ML Commons APIs +nav_order: 45 --- # Undeploy a model diff --git a/_ml-commons-plugin/api/model-apis/update-model.md b/_ml-commons-plugin/api/model-apis/update-model.md new file mode 100644 index 0000000000..877d0b5c51 --- /dev/null +++ b/_ml-commons-plugin/api/model-apis/update-model.md @@ -0,0 +1,112 @@ +--- +layout: default +title: Update model +parent: Model APIs +grand_parent: ML Commons APIs +nav_order: 40 +--- + +# Update a model +**Introduced 2.12** +{: .label .label-purple } + +Updates a model based on the `model_id`. + +For information about user access for this API, see [Model access control considerations]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/index/#model-access-control-considerations). + +## Path and HTTP methods + +```json +PUT /_plugins/_ml/models/ +``` + +## Request fields + +The following table lists the updatable fields. Not all request fields are applicable to all models. To determine whether the field is applicable to your model type, see [Register Model API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/register-model/). + +Field | Data type | Description +:--- | :--- | :--- +`connector` | Object | Contains specifications for a connector for a model hosted on a third-party platform. For more information, see [Creating a connector for a specific model]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/connectors/#creating-a-connector-for-a-specific-model). For information about the updatable fields within a connector, see [Update Connector API request fields]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/connector-apis/update-connector/#request-fields). +`connector_id` | String | The connector ID of a standalone connector for a model hosted on a third-party platform. For more information, see [Standalone connector]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/connectors/#creating-a-standalone-connector). To update a standalone connector, you must undeploy the model, update the connector, and then redeploy the model. +`description` | String | The model description. +`is_enabled` | Boolean | Specifies whether the model is enabled. Disabling the model makes it unavailable for Predict API requests, regardless of the model's deployment status. Default is `true`. +`model_config` | Object | The model's configuration, including the `model_type`, `embedding_dimension`, and `framework_type`. `all_config` is an optional JSON string that contains all model configurations. For more information, see [The `model_config` object]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/register-model#the-model_config-object). +`model_group_id` | String | The model group ID of the model group to which to register this model. +`name` | String | The model name.
+`rate_limiter` | Object | Limits the number of times any user can call the Predict API on the model. For more information, see [Rate limiting inference calls]({{site.url}}{{site.baseurl}}/ml-commons-plugin/integrating-ml-models/#rate-limiting-inference-calls). +`rate_limiter.limit` | Integer | The maximum number of times any user can call the Predict API on the model per `unit` of time. By default, there is no limit on the number of Predict API calls. Once you set a limit, you cannot reset it to no limit. As an alternative, you can specify a high limit value and a small time unit, for example, 1 request per nanosecond. +`rate_limiter.unit` | String | The unit of time for the rate limiter. Valid values are `DAYS`, `HOURS`, `MICROSECONDS`, `MILLISECONDS`, `MINUTES`, `NANOSECONDS`, and `SECONDS`. +`guardrails`| Object | The guardrails for the model. + +#### Example request: Disabling a model + +```json +PUT /_plugins/_ml/models/MzcIJX8BA7mbufL6DOwl +{ + "is_enabled": false +} +``` +{% include copy-curl.html %} + +#### Example request: Rate limiting inference calls for a model + +The following request limits the number of times you can call the Predict API on the model to 4 Predict API calls per minute: + +```json +PUT /_plugins/_ml/models/T_S-cY0BKCJ3ot9qr0aP +{ + "rate_limiter": { + "limit": "4", + "unit": "MINUTES" + } +} +``` +{% include copy-curl.html %} + +#### Example request: Updating the guardrails + +```json +PUT /_plugins/_ml/models/MzcIJX8BA7mbufL6DOwl +{ + "guardrails": { + "input_guardrail": { + "stop_words": [ + { + "index_name": "updated_stop_words_input", + "source_fields": ["updated_title"] + } + ], + "regex": ["updated_regex1", "updated_regex2"] + }, + "output_guardrail": { + "stop_words": [ + { + "index_name": "updated_stop_words_output", + "source_fields": ["updated_title"] + } + ], + "regex": ["updated_regex1", "updated_regex2"] + } + } +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ + "_index": ".plugins-ml-model", + "_id": "MzcIJX8BA7mbufL6DOwl", + "_version": 10, + "result": "updated", + "_shards": { + "total": 1, + "successful": 1, + "failed": 0 + }, + "_seq_no": 48, + "_primary_term": 4 +} +``` + diff --git a/_ml-commons-plugin/api/model-group-apis/delete-model-group.md b/_ml-commons-plugin/api/model-group-apis/delete-model-group.md index 5070d97dcf..8ea19224c8 100644 --- a/_ml-commons-plugin/api/model-group-apis/delete-model-group.md +++ b/_ml-commons-plugin/api/model-group-apis/delete-model-group.md @@ -2,7 +2,7 @@ layout: default title: Delete model group parent: Model group APIs -grand_parent: ML Commons API +grand_parent: ML Commons APIs nav_order: 40 --- diff --git a/_ml-commons-plugin/api/model-group-apis/index.md b/_ml-commons-plugin/api/model-group-apis/index.md index 30d69e9570..6df8b3e8fe 100644 --- a/_ml-commons-plugin/api/model-group-apis/index.md +++ b/_ml-commons-plugin/api/model-group-apis/index.md @@ -1,8 +1,9 @@ --- layout: default title: Model group APIs -parent: ML Commons API +parent: ML Commons APIs has_children: true +has_toc: false nav_order: 20 --- @@ -12,5 +13,5 @@ ML Commons supports the following model-group-level APIs: - [Register model group]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-group-apis/register-model-group/) - [Update model group]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-group-apis/update-model-group/) -- [Search for a model group]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-group-apis/search-model-group/) +- [Search model 
group]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-group-apis/search-model-group/) - [Delete model group]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-group-apis/delete-model-group/) \ No newline at end of file diff --git a/_ml-commons-plugin/api/model-group-apis/register-model-group.md b/_ml-commons-plugin/api/model-group-apis/register-model-group.md index 25f95b84a3..312513ff3f 100644 --- a/_ml-commons-plugin/api/model-group-apis/register-model-group.md +++ b/_ml-commons-plugin/api/model-group-apis/register-model-group.md @@ -2,7 +2,7 @@ layout: default title: Register model group parent: Model group APIs -grand_parent: ML Commons API +grand_parent: ML Commons APIs nav_order: 10 --- diff --git a/_ml-commons-plugin/api/model-group-apis/search-model-group.md b/_ml-commons-plugin/api/model-group-apis/search-model-group.md index 9ee8eb1b9b..1d47d550f0 100644 --- a/_ml-commons-plugin/api/model-group-apis/search-model-group.md +++ b/_ml-commons-plugin/api/model-group-apis/search-model-group.md @@ -1,8 +1,8 @@ --- layout: default -title: Search for a model group +title: Search model group parent: Model group APIs -grand_parent: ML Commons API +grand_parent: ML Commons APIs nav_order: 30 --- diff --git a/_ml-commons-plugin/api/model-group-apis/update-model-group.md b/_ml-commons-plugin/api/model-group-apis/update-model-group.md index b813cde8c4..5aa5239794 100644 --- a/_ml-commons-plugin/api/model-group-apis/update-model-group.md +++ b/_ml-commons-plugin/api/model-group-apis/update-model-group.md @@ -2,7 +2,7 @@ layout: default title: Update model group parent: Model group APIs -grand_parent: ML Commons API +grand_parent: ML Commons APIs nav_order: 20 --- diff --git a/_ml-commons-plugin/api/profile.md b/_ml-commons-plugin/api/profile.md index 1991bc8dd2..e8f65bb16c 100644 --- a/_ml-commons-plugin/api/profile.md +++ b/_ml-commons-plugin/api/profile.md @@ -1,7 +1,7 @@ --- layout: default title: Profile -parent: ML Commons API +parent: ML Commons APIs nav_order: 40 --- diff --git a/_ml-commons-plugin/api/stats.md b/_ml-commons-plugin/api/stats.md index c6b3eeb26e..8d93a96d98 100644 --- a/_ml-commons-plugin/api/stats.md +++ b/_ml-commons-plugin/api/stats.md @@ -1,7 +1,7 @@ --- layout: default title: Stats -parent: ML Commons API +parent: ML Commons APIs nav_order: 50 --- diff --git a/_ml-commons-plugin/api/tasks-apis/delete-task.md b/_ml-commons-plugin/api/tasks-apis/delete-task.md index a4c8a57dc7..f3e0b0896f 100644 --- a/_ml-commons-plugin/api/tasks-apis/delete-task.md +++ b/_ml-commons-plugin/api/tasks-apis/delete-task.md @@ -2,7 +2,7 @@ layout: default title: Delete task parent: Tasks APIs -grand_parent: ML Commons API +grand_parent: ML Commons APIs nav_order: 20 --- diff --git a/_ml-commons-plugin/api/tasks-apis/get-task.md b/_ml-commons-plugin/api/tasks-apis/get-task.md index f45074d1eb..14b28e8457 100644 --- a/_ml-commons-plugin/api/tasks-apis/get-task.md +++ b/_ml-commons-plugin/api/tasks-apis/get-task.md @@ -2,22 +2,15 @@ layout: default title: Get task parent: Tasks APIs -grand_parent: ML Commons API +grand_parent: ML Commons APIs nav_order: 10 --- # Get task -To retrieve information about a model, you can: - -- [Get a task by ID](#get-a-task-by-id) -- [Search for a task](#search-for-a-task) - -## Get a task by ID - You can retrieve information about a task using the `task_id`. 
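+ +A `task_id` is returned by asynchronous operations such as model registration. For example, a Register Model API response similar to the following (the ID and status values shown are illustrative) contains the `task_id` that you pass to this API: + +```json +{ + "task_id" : "ew8I44MBhyWuIwnfvDIH", + "status" : "CREATED" +} +```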
-### Path and HTTP methods +## Path and HTTP methods ```json GET /_plugins/_ml/tasks/ @@ -30,6 +23,8 @@ GET /_plugins/_ml/tasks/MsBi1YsB0jLkkocYjD5f ``` {% include copy-curl.html %} +#### Example response + The response includes information about the task. ```json @@ -45,95 +40,3 @@ The response includes information about the task. "is_async" : true } ``` - -## Search for a task - -Searches tasks based on parameters indicated in the request body. - -### Path and HTTP methods - -```json -GET /_plugins/_ml/tasks/_search -``` - -#### Example request: Search for a task in which `function_name` is `KMEANS` - -```json -GET /_plugins/_ml/tasks/_search -{ - "query": { - "bool": { - "filter": [ - { - "term": { - "function_name": "KMEANS" - } - } - ] - } - } -} -``` -{% include copy-curl.html %} - -#### Example response - -```json -{ - "took" : 12, - "timed_out" : false, - "_shards" : { - "total" : 1, - "successful" : 1, - "skipped" : 0, - "failed" : 0 - }, - "hits" : { - "total" : { - "value" : 2, - "relation" : "eq" - }, - "max_score" : 0.0, - "hits" : [ - { - "_index" : ".plugins-ml-task", - "_id" : "_wnLJ38BvytMh9aUi-Ia", - "_version" : 4, - "_seq_no" : 29, - "_primary_term" : 4, - "_score" : 0.0, - "_source" : { - "last_update_time" : 1645640125267, - "create_time" : 1645640125209, - "is_async" : true, - "function_name" : "KMEANS", - "input_type" : "SEARCH_QUERY", - "worker_node" : "jjqFrlW7QWmni1tRnb_7Dg", - "state" : "COMPLETED", - "model_id" : "AAnLJ38BvytMh9aUi-M2", - "task_type" : "TRAINING" - } - }, - { - "_index" : ".plugins-ml-task", - "_id" : "wwRRLX8BydmmU1x6I-AI", - "_version" : 3, - "_seq_no" : 38, - "_primary_term" : 7, - "_score" : 0.0, - "_source" : { - "last_update_time" : 1645732766656, - "create_time" : 1645732766472, - "is_async" : true, - "function_name" : "KMEANS", - "input_type" : "SEARCH_QUERY", - "worker_node" : "A_IiqoloTDK01uZvCjREaA", - "state" : "COMPLETED", - "model_id" : "xARRLX8BydmmU1x6I-CG", - "task_type" : "TRAINING" - } - } - ] - } -} -``` \ No newline at end of file diff --git a/_ml-commons-plugin/api/tasks-apis/index.md b/_ml-commons-plugin/api/tasks-apis/index.md index a5f07f94c4..e6f17aca08 100644 --- a/_ml-commons-plugin/api/tasks-apis/index.md +++ b/_ml-commons-plugin/api/tasks-apis/index.md @@ -1,8 +1,9 @@ --- layout: default title: Tasks APIs -parent: ML Commons API +parent: ML Commons APIs has_children: true +has_toc: false nav_order: 30 --- diff --git a/_ml-commons-plugin/api/tasks-apis/search-task.md b/_ml-commons-plugin/api/tasks-apis/search-task.md new file mode 100644 index 0000000000..526684a9ef --- /dev/null +++ b/_ml-commons-plugin/api/tasks-apis/search-task.md @@ -0,0 +1,99 @@ +--- +layout: default +title: Search task +parent: Tasks APIs +grand_parent: ML Commons APIs +nav_order: 15 +--- + +# Search for a task + +Searches tasks based on parameters indicated in the request body. 
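+ +Task documents include fields such as `function_name`, `state`, `model_id`, and `task_type` (all of which appear in the example response later in this section), and you can filter on any of them. For example, the following request body is a minimal sketch that returns only completed tasks, reusing the `COMPLETED` state value from that example response: + +```json +{ + "query": { + "term": { + "state": "COMPLETED" + } + } +} +```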
+ +## Path and HTTP methods + +```json +GET /_plugins/_ml/tasks/_search +``` + +#### Example request: Search for a task in which `function_name` is `KMEANS` + +```json +GET /_plugins/_ml/tasks/_search +{ + "query": { + "bool": { + "filter": [ + { + "term": { + "function_name": "KMEANS" + } + } + ] + } + } +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ + "took" : 12, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 2, + "relation" : "eq" + }, + "max_score" : 0.0, + "hits" : [ + { + "_index" : ".plugins-ml-task", + "_id" : "_wnLJ38BvytMh9aUi-Ia", + "_version" : 4, + "_seq_no" : 29, + "_primary_term" : 4, + "_score" : 0.0, + "_source" : { + "last_update_time" : 1645640125267, + "create_time" : 1645640125209, + "is_async" : true, + "function_name" : "KMEANS", + "input_type" : "SEARCH_QUERY", + "worker_node" : "jjqFrlW7QWmni1tRnb_7Dg", + "state" : "COMPLETED", + "model_id" : "AAnLJ38BvytMh9aUi-M2", + "task_type" : "TRAINING" + } + }, + { + "_index" : ".plugins-ml-task", + "_id" : "wwRRLX8BydmmU1x6I-AI", + "_version" : 3, + "_seq_no" : 38, + "_primary_term" : 7, + "_score" : 0.0, + "_source" : { + "last_update_time" : 1645732766656, + "create_time" : 1645732766472, + "is_async" : true, + "function_name" : "KMEANS", + "input_type" : "SEARCH_QUERY", + "worker_node" : "A_IiqoloTDK01uZvCjREaA", + "state" : "COMPLETED", + "model_id" : "xARRLX8BydmmU1x6I-CG", + "task_type" : "TRAINING" + } + } + ] + } +} +``` \ No newline at end of file diff --git a/_ml-commons-plugin/api/train-predict/index.md b/_ml-commons-plugin/api/train-predict/index.md index 9c937c5e41..8486b4beb9 100644 --- a/_ml-commons-plugin/api/train-predict/index.md +++ b/_ml-commons-plugin/api/train-predict/index.md @@ -1,8 +1,9 @@ --- layout: default title: Train and Predict APIs -parent: ML Commons API +parent: ML Commons APIs has_children: true +has_toc: false nav_order: 30 --- diff --git a/_ml-commons-plugin/api/train-predict/predict.md b/_ml-commons-plugin/api/train-predict/predict.md index 06c4a87167..299c957122 100644 --- a/_ml-commons-plugin/api/train-predict/predict.md +++ b/_ml-commons-plugin/api/train-predict/predict.md @@ -2,8 +2,7 @@ layout: default title: Predict parent: Train and Predict APIs -grand_parent: ML Commons API -has_children: true +grand_parent: ML Commons APIs nav_order: 20 --- diff --git a/_ml-commons-plugin/api/train-predict/train-and-predict.md b/_ml-commons-plugin/api/train-predict/train-and-predict.md index 73b51ad73a..1df0e5e3be 100644 --- a/_ml-commons-plugin/api/train-predict/train-and-predict.md +++ b/_ml-commons-plugin/api/train-predict/train-and-predict.md @@ -2,8 +2,7 @@ layout: default title: Train and predict parent: Train and Predict APIs -grand_parent: ML Commons API -has_children: true +grand_parent: ML Commons APIs nav_order: 10 --- diff --git a/_ml-commons-plugin/api/train-predict/train.md b/_ml-commons-plugin/api/train-predict/train.md index 09a632cf26..8de486198d 100644 --- a/_ml-commons-plugin/api/train-predict/train.md +++ b/_ml-commons-plugin/api/train-predict/train.md @@ -2,8 +2,7 @@ layout: default title: Train parent: Train and Predict APIs -grand_parent: ML Commons API -has_children: true +grand_parent: ML Commons APIs nav_order: 10 --- diff --git a/_ml-commons-plugin/cluster-settings.md b/_ml-commons-plugin/cluster-settings.md index 5bf1c13599..c473af81a1 100644 --- a/_ml-commons-plugin/cluster-settings.md +++ b/_ml-commons-plugin/cluster-settings.md @@ 
-239,6 +239,33 @@ plugins.ml_commons.native_memory_threshold: 90 - Default value: 90 - Value range: [0, 100] +## Set JVM heap memory threshold + +Sets a circuit breaker that checks JVM heap memory usage before running an ML task. If the heap usage exceeds the threshold, OpenSearch triggers a circuit breaker and throws an exception to maintain optimal performance. + +Values are based on the percentage of JVM heap memory available. When set to `0`, no ML tasks will run. When set to `100`, the circuit breaker closes and no threshold exists. + +### Setting + +``` +plugins.ml_commons.jvm_heap_memory_threshold: 85 +``` + +### Values + +- Default value: 85 +- Value range: [0, 100] + +## Exclude node names + +Use this setting to specify the names of nodes on which you don't want to run ML tasks. The value should be a valid node name or a comma-separated node name list. + +### Setting + +``` +plugins.ml_commons.exclude_nodes._name: node1, node2 +``` + ## Allow custom deployment plans When enabled, this setting grants users the ability to deploy models to specific ML nodes according to that user's permissions. @@ -254,6 +281,21 @@ plugins.ml_commons.allow_custom_deployment_plan: false - Default value: false - Valid values: `false`, `true` +## Enable auto deploy + +This setting is applicable when you send a prediction request for an externally hosted model that has not been deployed. When set to `true`, this setting automatically deploys the model to the cluster if the model has not been deployed already. + +### Setting + +``` +plugins.ml_commons.model_auto_deploy.enable: false +``` + +### Values + +- Default value: `true` +- Valid values: `false`, `true` + ## Enable auto redeploy This setting automatically redeploys deployed or partially deployed models upon cluster failure. If all ML nodes inside a cluster crash, the model switches to the `DEPLOYED_FAILED` state, and the model must be deployed manually. @@ -326,10 +368,110 @@ plugins.ml_commons.connector_access_control_enabled: true ### Values -- Default value: false +- Default value: `false` - Valid values: `false`, `true` +## Enable a local model + +This setting allows a cluster admin to enable running local models on the cluster. When this setting is `false`, users will not be able to run register, deploy, or predict operations on any local model. + +### Setting + +``` +plugins.ml_commons.local_model.enabled: true +``` +### Values + +- Default value: `true` +- Valid values: `false`, `true` +## Node roles that can run externally hosted models +This setting allows a cluster admin to control the types of nodes on which externally hosted models can run. + +### Setting + +``` +plugins.ml_commons.task_dispatcher.eligible_node_role.remote_model: ["ml"] +``` + +### Values + +- Default value: `["data", "ml"]`, which allows externally hosted models to run on data nodes and ML nodes. + + +## Node roles that can run local models + +This setting allows a cluster admin to control the types of nodes on which local models can run. The `plugins.ml_commons.only_run_on_ml_node` setting only allows the model to run on ML nodes. For a local model, if `plugins.ml_commons.only_run_on_ml_node` is set to `true`, then the model will always run on ML nodes. If `plugins.ml_commons.only_run_on_ml_node` is set to `false`, then the model will run on nodes defined in the `plugins.ml_commons.task_dispatcher.eligible_node_role.local_model` setting. 
+ +### Setting +``` +plugins.ml_commons.task_dispatcher.eligible_node_role.local_model: ["ml"] +``` + +### Values + +- Default value: `["data", "ml"]` + +## Enable remote inference + +This setting allows a cluster admin to enable remote inference on the cluster. If this setting is `false`, users will not be able to run register, deploy, or predict operations on any externally hosted model or create a connector for remote inference. + +### Setting + +``` +plugins.ml_commons.remote_inference.enabled: true +``` + +### Values + +- Default value: `true` +- Valid values: `false`, `true` + +## Enable agent framework + +When set to `true`, this setting enables the agent framework (including agents and tools) on the cluster and allows users to run register, execute, delete, get, and search operations on an agent. + +### Setting + +``` +plugins.ml_commons.agent_framework_enabled: true +``` + +### Values + +- Default value: `true` +- Valid values: `false`, `true` + +## Enable memory + +When set to `true`, this setting enables conversational memory, which stores all messages from a conversation for conversational search. + +### Setting + +``` +plugins.ml_commons.memory_feature_enabled: true +``` + +### Values + +- Default value: `true` +- Valid values: `false`, `true` + + +## Enable RAG pipeline + +When set to `true`, this setting enables the search processors for retrieval-augmented generation (RAG). RAG enhances query results by generating responses using relevant information from memory and previous conversations. + +### Setting + +``` +plugins.ml_commons.rag_pipeline_feature_enabled: true +``` + +### Values + +- Default value: `true` +- Valid values: `false`, `true` diff --git a/_ml-commons-plugin/custom-local-models.md b/_ml-commons-plugin/custom-local-models.md index a7356a18ac..a265d8804a 100644 --- a/_ml-commons-plugin/custom-local-models.md +++ b/_ml-commons-plugin/custom-local-models.md @@ -7,7 +7,7 @@ nav_order: 120 --- # Custom local models -**Generally available 2.9** +**Introduced 2.9** {: .label .label-purple } To use a custom model locally, you can upload it to the OpenSearch cluster. @@ -18,9 +18,16 @@ As of OpenSearch 2.6, OpenSearch supports local text embedding models. As of OpenSearch 2.11, OpenSearch supports local sparse encoding models. +As of OpenSearch 2.12, OpenSearch supports local cross-encoder models. + +As of OpenSearch 2.13, OpenSearch supports local question answering models. + +Running local models on the CentOS 7 operating system is not supported. Moreover, not all local models can run on all hardware and operating systems. +{: .important} + ## Preparing a model -For both text embedding and sparse encoding models, you must provide a tokenizer JSON file within the model zip file. +For all models, you must provide a tokenizer JSON file within the model zip file. For sparse encoding models, make sure your output format is `{"output":}` so that ML Commons can post-process the sparse vector. @@ -152,7 +159,7 @@ POST /_plugins/_ml/models/_register ``` {% include copy.html %} -For a description of Register API parameters, see [Register a model]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/register-model/). +For descriptions of Register API parameters, see [Register a model]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/register-model/). The `model_task_type` corresponds to the model type. For text embedding models, set this parameter to `TEXT_EMBEDDING`.
For sparse encoding models, set this parameter to `SPARSE_ENCODING` or `SPARSE_TOKENIZE`. For cross-encoder models, set this parameter to `TEXT_SIMILARITY`. For question answering models, set this parameter to `QUESTION_ANSWERING`. OpenSearch returns the task ID of the register operation: @@ -315,4 +322,61 @@ The response contains the tokens and weights: ## Step 5: Use the model for search -To learn how to use the model for vector search, see [Set up neural search]({{site.url}}{{site.baseurl}}http://localhost:4000/docs/latest/search-plugins/neural-search/#set-up-neural-search). +To learn how to use the model for vector search, see [Using an ML model for neural search]({{site.url}}{{site.baseurl}}/search-plugins/neural-search/#using-an-ml-model-for-neural-search). + +## Question answering models + +A question answering model extracts the answer to a question from a given context. ML Commons supports context in `text` format. + +To register a question answering model, send a request in the following format. Specify the `function_name` as `QUESTION_ANSWERING`: + +```json +POST /_plugins/_ml/models/_register +{ + "name": "question_answering", + "version": "1.0.0", + "function_name": "QUESTION_ANSWERING", + "description": "test model", + "model_format": "TORCH_SCRIPT", + "model_group_id": "lN4AP40BKolAMNtR4KJ5", + "model_content_hash_value": "e837c8fc05fd58a6e2e8383b319257f9c3859dfb3edc89b26badfaf8a4405ff6", + "model_config": { + "model_type": "bert", + "framework_type": "huggingface_transformers" + }, + "url": "https://github.com/opensearch-project/ml-commons/blob/main/ml-algorithms/src/test/resources/org/opensearch/ml/engine/algorithms/question_answering/question_answering_pt.zip?raw=true" +} +``` +{% include copy-curl.html %} + +Then send a request to deploy the model: + +```json +POST _plugins/_ml/models//_deploy +``` +{% include copy-curl.html %} + +To test a question answering model, send the following request. It requires a `question` and the relevant `context` from which the answer will be generated: + +```json +POST /_plugins/_ml/_predict/question_answering/ +{ + "question": "Where do I live?", + "context": "My name is John. I live in New York" +} +``` +{% include copy-curl.html %} + +The response provides the answer based on the context: + +```json +{ + "inference_results": [ + { + "output": [ + { + "result": "New York" + } + ] + } + ] +} +``` \ No newline at end of file diff --git a/_ml-commons-plugin/integrating-ml-models.md b/_ml-commons-plugin/integrating-ml-models.md index b5c423a976..4dbf169e54 100644 --- a/_ml-commons-plugin/integrating-ml-models.md +++ b/_ml-commons-plugin/integrating-ml-models.md @@ -45,15 +45,30 @@ For a step-by-step tutorial, see [Neural search tutorial]({{site.url}}{{site.bas You can use an ML model in one of the following ways: -- [Make predictions](#making-predictions). +- [Invoke a model for inference](#invoking-a-model-for-inference). - [Use a model for search](#using-a-model-for-search). -### Making predictions +### Invoking a model for inference -[Models trained]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/train-predict/train/) through the ML Commons plugin support model-based algorithms, such as k-means. After you've trained a model to your precision requirements, use the model to [make predictions]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/train-predict/predict/). +You can invoke your model by calling the [Predict API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/train-predict/predict/).
For example, testing text embedding models lets you see the vector embeddings they generate. -If you don't want to use a model, you can use the [Train and Predict API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/train-predict/train-and-predict/) to test your model without having to evaluate the model's performance. +[Models trained]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/train-predict/train/) through the ML Commons plugin support model-based algorithms, such as k-means. After you've trained a model to your precision requirements, you can use such a model for inference. Alternatively, you can use the [Train and Predict API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/train-predict/train-and-predict/) to test your model without having to evaluate the model's performance. ### Using a model for search -OpenSearch supports multiple search methods that integrate with ML models. For more information, see [Search methods]({{site.url}}{{site.baseurl}}/search-plugins/index/#search-methods). \ No newline at end of file +OpenSearch supports multiple search methods that integrate with ML models. For more information, see [Search methods]({{site.url}}{{site.baseurl}}/search-plugins/index/#search-methods). + +## Disabling a model + +You can temporarily disable a model when you don't want to undeploy or delete it. Disable a model by calling the [Update Model API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/update-model/) and setting `is_enabled` to `false`. When you disable a model, it becomes unavailable for [Predict API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/train-predict/predict/) requests. If you disable a model that is undeployed, the model remains disabled after deployment. You'll need to enable it in order to use it for inference. + +## Rate limiting inference calls + +Setting a rate limit for Predict API calls on your ML models allows you to reduce your model inference costs. You can set a rate limit for the number of Predict API calls at the following levels: + +- **Model level**: Configure a rate limit for all users of the model by calling the Update Model API and specifying a `rate_limiter`. For more information, see [Update Model API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/update-model/). +- **User level**: Configure a rate limit for a specific user or users of the model by creating a controller. A model may be shared by multiple users; you can configure the controller to set different rate limits for different users. For more information, see [Create Controller API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/controller-apis/create-controller/). + +Model-level rate limiting applies to all users of the model. If you specify both a model-level rate limit and a user-level rate limit, the overall rate limit is set to the more restrictive of the two. For example, if the model-level limit is 2 requests per minute and the user-level limit is 4 requests per minute, the overall limit will be set to 2 requests per minute. + +To set the rate limit, you must provide two inputs: the maximum number of requests and the time frame. OpenSearch uses these inputs to calculate the rate limit as the maximum number of requests divided by the time frame. For example, if you set the limit to be 4 requests per minute, the rate limit is `4 requests / 1 minute`, which is `1 request / 0.25 minutes`, or `1 request / 15 seconds`. 
OpenSearch processes predict requests sequentially, in a first-come-first-served manner, and will limit those requests to 1 request per 15 seconds. Imagine two users, Alice and Bob, calling the Predict API for the same model, which has a rate limit of 1 request per 15 seconds. If Alice calls the Predict API and immediately after that Bob calls the Predict API, OpenSearch processes Alice's predict request and rejects Bob's request. Once 15 seconds has passed since Alice's request, Bob can send a request again, and this request will be processed. \ No newline at end of file diff --git a/_ml-commons-plugin/ml-dashboard.md b/_ml-commons-plugin/ml-dashboard.md index 3195aff8de..20c4e636bb 100644 --- a/_ml-commons-plugin/ml-dashboard.md +++ b/_ml-commons-plugin/ml-dashboard.md @@ -7,7 +7,7 @@ redirect_from: --- # Managing ML models in OpenSearch Dashboards -**Generally available 2.9** +**Introduced 2.9** {: .label .label-purple } Administrators of machine learning (ML) clusters can use OpenSearch Dashboards to manage and check the status of ML models running inside a cluster. This can help ML developers provision nodes to ensure their models run efficiently. diff --git a/_ml-commons-plugin/model-access-control.md b/_ml-commons-plugin/model-access-control.md index eb8a667d66..91d134cbb9 100644 --- a/_ml-commons-plugin/model-access-control.md +++ b/_ml-commons-plugin/model-access-control.md @@ -114,4 +114,65 @@ PUT _cluster/settings Model access control is achieved through the Model Group APIs. These APIs include the register, search, update, and delete model group operations. -For information about model access control API, see [Model group APIs]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-group-apis/index/). \ No newline at end of file +For information about APIs related to model access control, see [Model Group APIs]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-group-apis/index/). + +## Hidden models +**Introduced 2.12** +{: .label .label-purple } + +To hide model details from end users, including the cluster admin, you can register a _hidden_ model. If a model is hidden, the non-superadmin users don't have permission to call any [Model APIs]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/index/) except for the [Predict API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/train-predict/predict/) on the model. + +Only superadmin users can register a hidden model. A hidden model can be one of the OpenSearch-provided pretrained models, your own custom model, or an externally hosted model. To register a hidden model, you first need to authenticate with an [admin certificate]({{site.url}}{{site.baseurl}}/security/configuration/tls/#configuring-admin-certificates): + +```bash +curl -k --cert ./kirk.pem --key ./kirk-key.pem -XGET 'https://localhost:9200/.opendistro_security/_search' +``` + +All models created by a superadmin user are automatically registered as hidden. 
To register a hidden model, send a request to the `_register` endpoint: + +```bash +curl -k --cert ./kirk.pem --key ./kirk-key.pem -X POST 'https://localhost:9200/_plugins/_ml/models/_register' -H 'Content-Type: application/json' -d ' +{ + "name": "OPENSEARCH_ASSISTANT_MODEL", + "function_name": "remote", + "description": "OpenSearch Assistant Model", + "connector": { + "name": "Bedrock Claude Connector", + "description": "The connector to Bedrock Claude", + "version": 1, + "protocol": "aws_sigv4", + "parameters": { + "region": "us-east-1", + "service_name": "bedrock" + }, + "credential": { + "access_key": "", + "secret_key": "", + "session_token": "" + }, + "actions": [ + { + "action_type": "predict", + "method": "POST", + "headers": { + "content-type": "application/json" + }, + "url": "https://bedrock-runtime.us-east-1.amazonaws.com/model/anthropic.claude-v2/invoke", + "request_body": "{\"prompt\":\"\\n\\nHuman: ${parameters.inputs}\\n\\nAssistant:\",\"max_tokens_to_sample\":300,\"temperature\":0.5,\"top_k\":250,\"top_p\":1,\"stop_sequences\":[\"\\\\n\\\\nHuman:\"]}" + } + ] + } +}' +``` +{% include copy.html %} + +Once a hidden model is registered, only a superadmin can invoke operations on the model, including the deploy, undeploy, delete, and get API operations. For example, to deploy a hidden model, send the following request. In this request, `q7wLt4sBaDRBsUkl9BJV` is the model ID: + +```json +curl -k --cert ./kirk.pem --key ./kirk-key.pem -X POST 'https://localhost:9200/_plugins/_ml/models/q7wLt4sBaDRBsUkl9BJV/_deploy' +``` +{% include copy.html %} + +The `model_id` of a hidden model is the model `name`. A hidden model includes an `is_hidden` parameter that is set to `true`. You cannot change a hidden model's `is_hidden` parameter. + +Admin users can change access to a model by updating its backend roles. \ No newline at end of file diff --git a/_ml-commons-plugin/opensearch-assistant.md b/_ml-commons-plugin/opensearch-assistant.md new file mode 100644 index 0000000000..0a058d73a0 --- /dev/null +++ b/_ml-commons-plugin/opensearch-assistant.md @@ -0,0 +1,38 @@ +--- +layout: default +title: OpenSearch Assistant Toolkit +has_children: false +has_toc: false +nav_order: 28 +--- + +# OpenSearch Assistant Toolkit +**Introduced 2.13** +{: .label .label-purple } + +The OpenSearch Assistant Toolkit helps you create AI-powered assistants for OpenSearch Dashboards. The toolkit includes the following elements: + +- [**Agents and tools**]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/index/): _Agents_ interface with a large language model (LLM) and execute high-level tasks, such as summarization or generating Piped Processing Language (PPL) queries from natural language. The agent's high-level tasks consist of low-level tasks called _tools_, which can be reused by multiple agents. +- [**Configuration automation**]({{site.url}}{{site.baseurl}}/automating-configurations/index/): Uses templates to set up infrastructure for artificial intelligence and machine learning (AI/ML) applications. For example, you can automate configuring agents to be used for chat or generating PPL queries from natural language. +- [**OpenSearch Assistant for OpenSearch Dashboards**]({{site.url}}{{site.baseurl}}/dashboards/dashboards-assistant/index/): This is the OpenSearch Dashboards UI for the AI-powered assistant. The assistant's workflow is configured with various agents and tools. 
+ +## Enabling OpenSearch Assistant + +To enable OpenSearch Assistant, perform the following steps: + +- Enable the agent framework and retrieval-augmented generation (RAG) by configuring the following settings: + ```yaml + plugins.ml_commons.agent_framework_enabled: true + plugins.ml_commons.rag_pipeline_feature_enabled: true + ``` + {% include copy.html %} +- Enable the assistant by configuring the following settings: + ```yaml + assistant.chat.enabled: true + observability.query_assist.enabled: true + ``` + {% include copy.html %} + +## Next steps + +- For more information about the OpenSearch Assistant UI, see [OpenSearch Assistant for OpenSearch Dashboards]({{site.url}}{{site.baseurl}}/dashboards/dashboards-assistant/index/) \ No newline at end of file diff --git a/_ml-commons-plugin/pretrained-models.md b/_ml-commons-plugin/pretrained-models.md index 5f8a2c8832..8847d36291 100644 --- a/_ml-commons-plugin/pretrained-models.md +++ b/_ml-commons-plugin/pretrained-models.md @@ -7,22 +7,77 @@ nav_order: 120 --- # OpenSearch-provided pretrained models -**Generally available 2.9** +**Introduced 2.9** {: .label .label-purple } OpenSearch provides a variety of open-source pretrained models that can assist with a range of machine learning (ML) search and analytics use cases. You can upload any supported model to the OpenSearch cluster and use it locally. -## Prerequisites +## Supported pretrained models + +OpenSearch supports the following models, categorized by type. Text embedding models are sourced from [Hugging Face](https://huggingface.co/). Sparse encoding models are trained by OpenSearch. Although models with the same type will have similar use cases, each model has a different model size and will perform differently depending on your cluster setup. For a performance comparison of some pretrained models, see the [SBERT documentation](https://www.sbert.net/docs/pretrained_models.html#model-overview). + +Running local models on the CentOS 7 operating system is not supported. Moreover, not all local models can run on all hardware and operating systems. +{: .important} + +### Sentence transformers + +Sentence transformer models map sentences and paragraphs across a dimensional dense vector space. The number of vectors depends on the type of model. You can use these models for use cases such as clustering or semantic search. + +The following table provides a list of sentence transformer models and artifact links you can use to download them. Note that you must prefix the model name with `huggingface/`, as shown in the **Model name** column. + +| Model name | Version | Vector dimensions | Auto-truncation | TorchScript artifact | ONNX artifact | +|:---|:---|:---|:---|:---|:---| +| `huggingface/sentence-transformers/all-distilroberta-v1` | 1.0.1 | 768-dimensional dense vector space. | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-distilroberta-v1/1.0.1/torch_script/sentence-transformers_all-distilroberta-v1-1.0.1-torch_script.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-distilroberta-v1/1.0.1/torch_script/config.json) | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-distilroberta-v1/1.0.1/onnx/sentence-transformers_all-distilroberta-v1-1.0.1-onnx.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-distilroberta-v1/1.0.1/onnx/config.json) | +| `huggingface/sentence-transformers/all-MiniLM-L6-v2` | 1.0.1 | 384-dimensional dense vector space. | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-MiniLM-L6-v2/1.0.1/torch_script/sentence-transformers_all-MiniLM-L6-v2-1.0.1-torch_script.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-MiniLM-L6-v2/1.0.1/torch_script/config.json) | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-MiniLM-L6-v2/1.0.1/onnx/sentence-transformers_all-MiniLM-L6-v2-1.0.1-onnx.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-MiniLM-L6-v2/1.0.1/onnx/config.json) | +| `huggingface/sentence-transformers/all-MiniLM-L12-v2` | 1.0.1 | 384-dimensional dense vector space. | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-MiniLM-L12-v2/1.0.1/torch_script/sentence-transformers_all-MiniLM-L12-v2-1.0.1-torch_script.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-MiniLM-L12-v2/1.0.1/torch_script/config.json) | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-MiniLM-L12-v2/1.0.1/onnx/sentence-transformers_all-MiniLM-L12-v2-1.0.1-onnx.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-MiniLM-L12-v2/1.0.1/onnx/config.json) | +| `huggingface/sentence-transformers/all-mpnet-base-v2` | 1.0.1 | 768-dimensional dense vector space. | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-mpnet-base-v2/1.0.1/torch_script/sentence-transformers_all-mpnet-base-v2-1.0.1-torch_script.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-mpnet-base-v2/1.0.1/torch_script/config.json) | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-mpnet-base-v2/1.0.1/onnx/sentence-transformers_all-mpnet-base-v2-1.0.1-onnx.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-mpnet-base-v2/1.0.1/onnx/config.json) | +| `huggingface/sentence-transformers/msmarco-distilbert-base-tas-b` | 1.0.2 | 768-dimensional dense vector space. Optimized for semantic search. | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/msmarco-distilbert-base-tas-b/1.0.2/torch_script/sentence-transformers_msmarco-distilbert-base-tas-b-1.0.2-torch_script.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/msmarco-distilbert-base-tas-b/1.0.2/torch_script/config.json) | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/msmarco-distilbert-base-tas-b/1.0.2/onnx/sentence-transformers_msmarco-distilbert-base-tas-b-1.0.2-onnx.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/msmarco-distilbert-base-tas-b/1.0.2/onnx/config.json) | +| `huggingface/sentence-transformers/multi-qa-MiniLM-L6-cos-v1` | 1.0.1 | 384-dimensional dense vector space. Designed for semantic search and trained on 215 million question/answer pairs. | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/multi-qa-MiniLM-L6-cos-v1/1.0.1/torch_script/sentence-transformers_multi-qa-MiniLM-L6-cos-v1-1.0.1-torch_script.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/multi-qa-MiniLM-L6-cos-v1/1.0.1/torch_script/config.json) | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/multi-qa-MiniLM-L6-cos-v1/1.0.1/onnx/sentence-transformers_multi-qa-MiniLM-L6-cos-v1-1.0.1-onnx.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/multi-qa-MiniLM-L6-cos-v1/1.0.1/onnx/config.json) | +| `huggingface/sentence-transformers/multi-qa-mpnet-base-dot-v1` | 1.0.1 | 384-dimensional dense vector space. | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/multi-qa-mpnet-base-dot-v1/1.0.1/torch_script/sentence-transformers_multi-qa-mpnet-base-dot-v1-1.0.1-torch_script.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/multi-qa-mpnet-base-dot-v1/1.0.1/torch_script/config.json) | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/multi-qa-mpnet-base-dot-v1/1.0.1/onnx/sentence-transformers_multi-qa-mpnet-base-dot-v1-1.0.1-onnx.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/multi-qa-mpnet-base-dot-v1/1.0.1/onnx/config.json) | +| `huggingface/sentence-transformers/paraphrase-MiniLM-L3-v2` | 1.0.1 | 384-dimensional dense vector space. | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/paraphrase-MiniLM-L3-v2/1.0.1/torch_script/sentence-transformers_paraphrase-MiniLM-L3-v2-1.0.1-torch_script.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/paraphrase-MiniLM-L3-v2/1.0.1/torch_script/config.json) | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/paraphrase-MiniLM-L3-v2/1.0.1/onnx/sentence-transformers_paraphrase-MiniLM-L3-v2-1.0.1-onnx.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/paraphrase-MiniLM-L3-v2/1.0.1/onnx/config.json) | +| `huggingface/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2` | 1.0.1 | 384-dimensional dense vector space. | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/1.0.1/torch_script/sentence-transformers_paraphrase-multilingual-MiniLM-L12-v2-1.0.1-torch_script.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/1.0.1/torch_script/config.json) | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/1.0.1/onnx/sentence-transformers_paraphrase-multilingual-MiniLM-L12-v2-1.0.1-onnx.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/1.0.1/onnx/config.json) | +| `huggingface/sentence-transformers/paraphrase-mpnet-base-v2` | 1.0.0 | 768-dimensional dense vector space. | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/paraphrase-mpnet-base-v2/1.0.0/torch_script/sentence-transformers_paraphrase-mpnet-base-v2-1.0.0-torch_script.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/paraphrase-mpnet-base-v2/1.0.0/torch_script/config.json) | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/paraphrase-mpnet-base-v2/1.0.0/onnx/sentence-transformers_paraphrase-mpnet-base-v2-1.0.0-onnx.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/paraphrase-mpnet-base-v2/1.0.0/onnx/config.json) | +| `huggingface/sentence-transformers/distiluse-base-multilingual-cased-v1` | 1.0.1 | 512-dimensional dense vector space. | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/distiluse-base-multilingual-cased-v1/1.0.1/torch_script/sentence-transformers_distiluse-base-multilingual-cased-v1-1.0.1-torch_script.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/distiluse-base-multilingual-cased-v1/1.0.1/torch_script/config.json) | Not available | + + +### Sparse encoding models +**Introduced 2.11** +{: .label .label-purple } + +Sparse encoding models transfer text into a sparse vector and convert the vector to a list of `` pairs representing the text entry and its corresponding weight in the sparse vector. You can use these models for use cases such as clustering or sparse neural search. + +We recommend the following models for optimal performance: + +- Use the `amazon/neural-sparse/opensearch-neural-sparse-encoding-v1` model during both ingestion and search. +- Use the `amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v1` model during ingestion and the +`amazon/neural-sparse/opensearch-neural-sparse-tokenizer-v1` model during search. + +The following table provides a list of sparse encoding models and artifact links you can use to download them. + +| Model name | Version | Auto-truncation | TorchScript artifact | Description | +|:---|:---|:---|:---|:---| +| `amazon/neural-sparse/opensearch-neural-sparse-encoding-v1` | 1.0.1 | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/amazon/neural-sparse/opensearch-neural-sparse-encoding-v1/1.0.1/torch_script/neural-sparse_opensearch-neural-sparse-encoding-v1-1.0.1-torch_script.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/amazon/neural-sparse/opensearch-neural-sparse-encoding-v1/1.0.1/torch_script/config.json) | A neural sparse encoding model. The model transforms text into a sparse vector, identifies the indexes of non-zero elements in the vector, and then converts the vector into `` pairs, where each entry corresponds to a non-zero element index. To experiment with this model using transformers and the PyTorch API, see the [HuggingFace documentation](https://huggingface.co/opensearch-project/opensearch-neural-sparse-encoding-v1). | +| `amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v1` | 1.0.1 | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v1/1.0.1/torch_script/neural-sparse_opensearch-neural-sparse-encoding-doc-v1-1.0.1-torch_script.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v1/1.0.1/torch_script/config.json) | A neural sparse encoding model. The model transforms text into a sparse vector, identifies the indexes of non-zero elements in the vector, and then converts the vector into `` pairs, where each entry corresponds to a non-zero element index. To experiment with this model using transformers and the PyTorch API, see the [HuggingFace documentation](https://huggingface.co/opensearch-project/opensearch-neural-sparse-encoding-doc-v1). | +| `amazon/neural-sparse/opensearch-neural-sparse-tokenizer-v1` | 1.0.1 | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/amazon/neural-sparse/opensearch-neural-sparse-tokenizer-v1/1.0.1/torch_script/neural-sparse_opensearch-neural-sparse-tokenizer-v1-1.0.1-torch_script.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/amazon/neural-sparse/opensearch-neural-sparse-tokenizer-v1/1.0.1/torch_script/config.json) | A neural sparse tokenizer model. The model tokenizes text into tokens and assigns each token a predefined weight, which is the token's inverse document frequency (IDF). If the IDF file is not provided, the weight defaults to 1. For more information, see [Preparing a model]({{site.url}}{{site.baseurl}}/ml-commons-plugin/custom-local-models/#preparing-a-model). | -To get started, select one of the [supported pretrained models](#supported-pretrained-models). +### Cross-encoder models +**Introduced 2.12** +{: .label .label-purple } + +Cross-encoder models support query reranking. -### Cluster settings +The following table provides a list of cross-encoder models and artifact links you can use to download them. Note that you must prefix the model name with `huggingface/cross-encoders`, as shown in the **Model name** column. -This example uses a simple setup with no dedicated ML nodes and allows running a model on a non-ML node. +| Model name | Version | TorchScript artifact | ONNX artifact | +|:---|:---|:---|:---| +| `huggingface/cross-encoders/ms-marco-MiniLM-L-6-v2` | 1.0.2 | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/cross-encoders/ms-marco-MiniLM-L-6-v2/1.0.2/torch_script/cross-encoders_ms-marco-MiniLM-L-6-v2-1.0.2-torch_script.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/cross-encoders/ms-marco-MiniLM-L-6-v2/1.0.2/torch_script/config.json) | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/cross-encoders/ms-marco-MiniLM-L-6-v2/1.0.2/onnx/cross-encoders_ms-marco-MiniLM-L-6-v2-1.0.2-onnx.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/cross-encoders/ms-marco-MiniLM-L-6-v2/1.0.2/onnx/config.json) | +| `huggingface/cross-encoders/ms-marco-MiniLM-L-12-v2` | 1.0.2 | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/cross-encoders/ms-marco-MiniLM-L-12-v2/1.0.2/torch_script/cross-encoders_ms-marco-MiniLM-L-12-v2-1.0.2-torch_script.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/cross-encoders/ms-marco-MiniLM-L-12-v2/1.0.2/torch_script/config.json) | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/cross-encoders/ms-marco-MiniLM-L-12-v2/1.0.2/onnx/cross-encoders_ms-marco-MiniLM-L-12-v2-1.0.2-onnx.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/cross-encoders/ms-marco-MiniLM-L-12-v2/1.0.2/onnx/config.json) -On clusters with dedicated ML nodes, specify `"only_run_on_ml_node": "true"` for improved performance. For more information, see [ML Commons cluster settings]({{site.url}}{{site.baseurl}}/ml-commons-plugin/cluster-settings/). +## Prerequisites -To ensure that this basic local setup works, specify the following cluster settings: +On clusters with dedicated ML nodes, specify `"only_run_on_ml_node": "true"` for improved performance. For more information, see [ML Commons cluster settings]({{site.url}}{{site.baseurl}}/ml-commons-plugin/cluster-settings/). + +This example uses a simple setup with no dedicated ML nodes and allows running a model on a non-ML node. To ensure that this basic local setup works, specify the following cluster settings: ```json PUT _cluster/settings @@ -95,7 +150,7 @@ OpenSearch returns the task ID of the register operation: } ``` -To check the status of the operation, provide the task ID to the [Tasks API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/tasks-apis/get-task/#get-a-task-by-id): +To check the status of the operation, provide the task ID to the [Tasks API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/tasks-apis/get-task/): ```bash GET /_plugins/_ml/tasks/cVeMb4kBJ1eYAeTMFFgj @@ -172,6 +227,8 @@ If a cluster or node is restarted, then you need to redeploy the model. To learn Use the [Predict API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/train-predict/predict/) to test the model. +### Text embedding model + For a text embedding model, send the following request: ```json @@ -210,6 +267,8 @@ The response contains text embeddings for the provided sentence: } ``` +### Sparse encoding model + For a sparse encoding model, send the following request: ```json @@ -245,55 +304,121 @@ The response contains the tokens and weights: } ``` -## Step 5: Use the model for search - -To learn how to set up a vector index and use text embedding models for search, see [Semantic search]({{site.url}}{{site.baseurl}}/search-plugins/semantic-search/). - -To learn how to set up a vector index and use sparse encoding models for search, see [Neural sparse search]({{site.url}}{{site.baseurl}}/search-plugins/neural-sparse-search/). - - -## Supported pretrained models - -OpenSearch supports the following models, categorized by type. Text embedding models are sourced from [Hugging Face](https://huggingface.co/). Sparse encoding models are trained by OpenSearch. Although models with the same type will have similar use cases, each model has a different model size and will perform differently depending on your cluster setup. For a performance comparison of some pretrained models, see the [SBERT documentation](https://www.sbert.net/docs/pretrained_models.html#model-overview). +### Cross-encoder model +For a cross-encoder model, send the following request: -### Sentence transformers +```json +POST _plugins/_ml/models//_predict +{ + "query_text": "today is sunny", + "text_docs": [ + "how are you", + "today is sunny", + "today is july fifth", + "it is winter" + ] +} +``` +{% include copy-curl.html %} -Sentence transformer models map sentences and paragraphs across a dimensional dense vector space. The number of vectors depends on the type of model. You can use these models for use cases such as clustering or semantic search. 
+The model calculates the similarity score of `query_text` and each document in `text_docs` and returns a list of scores for each document in the order they were provided in `text_docs`: -The following table provides a list of sentence transformer models and artifact links you can use to download them. Note that you must prefix the model name with `huggingface/`, as shown in the **Model name** column. +```json +{ + "inference_results": [ + { + "output": [ + { + "name": "similarity", + "data_type": "FLOAT32", + "shape": [ + 1 + ], + "data": [ + -6.077798 + ], + "byte_buffer": { + "array": "Un3CwA==", + "order": "LITTLE_ENDIAN" + } + } + ] + }, + { + "output": [ + { + "name": "similarity", + "data_type": "FLOAT32", + "shape": [ + 1 + ], + "data": [ + 10.223609 + ], + "byte_buffer": { + "array": "55MjQQ==", + "order": "LITTLE_ENDIAN" + } + } + ] + }, + { + "output": [ + { + "name": "similarity", + "data_type": "FLOAT32", + "shape": [ + 1 + ], + "data": [ + -1.3987057 + ], + "byte_buffer": { + "array": "ygizvw==", + "order": "LITTLE_ENDIAN" + } + } + ] + }, + { + "output": [ + { + "name": "similarity", + "data_type": "FLOAT32", + "shape": [ + 1 + ], + "data": [ + -4.5923924 + ], + "byte_buffer": { + "array": "4fSSwA==", + "order": "LITTLE_ENDIAN" + } + } + ] + } + ] +} +``` -| Model name | Version | Vector dimensions | Auto-truncation | TorchScript artifact | ONNX artifact | -|:---|:---|:---|:---|:---| -| `huggingface/sentence-transformers/all-distilroberta-v1` | 1.0.1 | 768-dimensional dense vector space. | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-distilroberta-v1/1.0.1/torch_script/sentence-transformers_all-distilroberta-v1-1.0.1-torch_script.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-distilroberta-v1/1.0.1/torch_script/config.json) | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-distilroberta-v1/1.0.1/onnx/sentence-transformers_all-distilroberta-v1-1.0.1-onnx.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-distilroberta-v1/1.0.1/onnx/config.json) | -| `huggingface/sentence-transformers/all-MiniLM-L6-v2` | 1.0.1 | 384-dimensional dense vector space. | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-MiniLM-L6-v2/1.0.1/torch_script/sentence-transformers_all-MiniLM-L6-v2-1.0.1-torch_script.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-MiniLM-L6-v2/1.0.1/torch_script/config.json) | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-MiniLM-L6-v2/1.0.1/onnx/sentence-transformers_all-MiniLM-L6-v2-1.0.1-onnx.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-MiniLM-L6-v2/1.0.1/onnx/config.json) | -| `huggingface/sentence-transformers/all-MiniLM-L12-v2` | 1.0.1 | 384-dimensional dense vector space. | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-MiniLM-L12-v2/1.0.1/torch_script/sentence-transformers_all-MiniLM-L12-v2-1.0.1-torch_script.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-MiniLM-L12-v2/1.0.1/onnx/config.json) | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-MiniLM-L12-v2/1.0.1/onnx/sentence-transformers_all-MiniLM-L12-v2-1.0.1-onnx.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-MiniLM-L12-v2/1.0.1/onnx/config.json) | -| `huggingface/sentence-transformers/all-mpnet-base-v2` | 1.0.1 | 768-dimensional dense vector space. | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-mpnet-base-v2/1.0.1/torch_script/sentence-transformers_all-mpnet-base-v2-1.0.1-torch_script.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-mpnet-base-v2/1.0.1/torch_script/config.json) | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-mpnet-base-v2/1.0.1/onnx/sentence-transformers_all-mpnet-base-v2-1.0.1-onnx.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-mpnet-base-v2/1.0.1/onnx/config.json) | -| `huggingface/sentence-transformers/msmarco-distilbert-base-tas-b` | 1.0.2 | 768-dimensional dense vector space. Optimized for semantic search. | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/msmarco-distilbert-base-tas-b/1.0.2/torch_script/sentence-transformers_msmarco-distilbert-base-tas-b-1.0.2-torch_script.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/msmarco-distilbert-base-tas-b/1.0.2/torch_script/config.json) | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/msmarco-distilbert-base-tas-b/1.0.2/onnx/sentence-transformers_msmarco-distilbert-base-tas-b-1.0.2-onnx.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/msmarco-distilbert-base-tas-b/1.0.2/onnx/config.json) | -| `huggingface/sentence-transformers/multi-qa-MiniLM-L6-cos-v1` | 1.0.1 | 384-dimensional dense vector space. Designed for semantic search and trained on 215 million question/answer pairs. | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/multi-qa-MiniLM-L6-cos-v1/1.0.1/torch_script/sentence-transformers_multi-qa-MiniLM-L6-cos-v1-1.0.1-torch_script.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/multi-qa-MiniLM-L6-cos-v1/1.0.1/torch_script/config.json) | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/multi-qa-MiniLM-L6-cos-v1/1.0.1/onnx/sentence-transformers_multi-qa-MiniLM-L6-cos-v1-1.0.1-onnx.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/multi-qa-MiniLM-L6-cos-v1/1.0.1/onnx/config.json) | -| `huggingface/sentence-transformers/multi-qa-mpnet-base-dot-v1` | 1.0.1 | 384-dimensional dense vector space. | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/multi-qa-mpnet-base-dot-v1/1.0.1/torch_script/sentence-transformers_multi-qa-mpnet-base-dot-v1-1.0.1-torch_script.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/multi-qa-mpnet-base-dot-v1/1.0.1/torch_script/config.json) | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/multi-qa-mpnet-base-dot-v1/1.0.1/onnx/sentence-transformers_multi-qa-mpnet-base-dot-v1-1.0.1-onnx.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/multi-qa-mpnet-base-dot-v1/1.0.1/onnx/config.json) | -| `huggingface/sentence-transformers/paraphrase-MiniLM-L3-v2` | 1.0.1 | 384-dimensional dense vector space. | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/paraphrase-MiniLM-L3-v2/1.0.1/torch_script/sentence-transformers_paraphrase-MiniLM-L3-v2-1.0.1-torch_script.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/paraphrase-MiniLM-L3-v2/1.0.1/torch_script/config.json) | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/paraphrase-MiniLM-L3-v2/1.0.1/onnx/sentence-transformers_paraphrase-MiniLM-L3-v2-1.0.1-onnx.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/paraphrase-MiniLM-L3-v2/1.0.1/onnx/config.json) | -| `huggingface/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2` | 1.0.1 | 384-dimensional dense vector space. | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/1.0.1/torch_script/sentence-transformers_paraphrase-multilingual-MiniLM-L12-v2-1.0.1-torch_script.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/1.0.1/torch_script/config.json) | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/1.0.1/onnx/sentence-transformers_paraphrase-multilingual-MiniLM-L12-v2-1.0.1-onnx.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/1.0.1/onnx/config.json) | -| `huggingface/sentence-transformers/paraphrase-mpnet-base-v2` | 1.0.0 | 768-dimensional dense vector space. | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/paraphrase-mpnet-base-v2/1.0.0/torch_script/sentence-transformers_paraphrase-mpnet-base-v2-1.0.0-torch_script.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/paraphrase-mpnet-base-v2/1.0.0/torch_script/config.json) | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/paraphrase-mpnet-base-v2/1.0.0/onnx/sentence-transformers_paraphrase-mpnet-base-v2-1.0.0-onnx.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/paraphrase-mpnet-base-v2/1.0.0/onnx/config.json) | -| `huggingface/sentence-transformers/distiluse-base-multilingual-cased-v1` | 1.0.1 | 512-dimensional dense vector space. | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/distiluse-base-multilingual-cased-v1/1.0.1/torch_script/sentence-transformers_distiluse-base-multilingual-cased-v1-1.0.1-torch_script.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/distiluse-base-multilingual-cased-v1/1.0.1/torch_script/config.json) | Not available | +A higher document score means higher similarity. In the preceding response, documents are scored as follows against the query text `today is sunny`: +Document text | Score +:--- | :--- +`how are you` | -6.077798 +`today is sunny` | 10.223609 +`today is july fifth` | -1.3987057 +`it is winter` | -4.5923924 -### Sparse encoding models -**Introduced 2.11** -{: .label .label-purple } +The document that contains the same text as the query is scored the highest, and the remaining documents are scored based on the text similarity. -Sparse encoding models transfer text into a sparse vector and convert the vector to a list of `` pairs representing the text entry and its corresponding weight in the sparse vector. You can use these models for use cases such as clustering or sparse neural search. +## Step 5: Use the model for search -We recommend the following models for optimal performance: +To learn how to set up a vector index and use text embedding models for search, see [Semantic search]({{site.url}}{{site.baseurl}}/search-plugins/semantic-search/). -- Use the `amazon/neural-sparse/opensearch-neural-sparse-encoding-v1` model during both ingestion and search. -- Use the `amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v1` model during ingestion and the -`amazon/neural-sparse/opensearch-neural-sparse-tokenizer-v1` model during search. +To learn how to set up a vector index and use sparse encoding models for search, see [Neural sparse search]({{site.url}}{{site.baseurl}}/search-plugins/neural-sparse-search/). -The following table provides a list of sparse encoding models and artifact links you can use to download them. +To learn how to use cross-encoder models for reranking, see [Reranking search results]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/reranking-search-results/). -| Model name | Version | Auto-truncation | TorchScript artifact | Description | -|---|---|---|---| -| `amazon/neural-sparse/opensearch-neural-sparse-encoding-v1` | 1.0.1 | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/amazon/neural-sparse/opensearch-neural-sparse-encoding-v1/1.0.1/torch_script/neural-sparse_opensearch-neural-sparse-encoding-v1-1.0.1-torch_script.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/amazon/neural-sparse/opensearch-neural-sparse-encoding-v1/1.0.1/torch_script/config.json) | A neural sparse encoding model. The model transforms text into a sparse vector, identifies the indexes of non-zero elements in the vector, and then converts the vector into `` pairs, where each entry corresponds to a non-zero element index. | -| `amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v1` | 1.0.1 | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v1/1.0.1/torch_script/neural-sparse_opensearch-neural-sparse-encoding-doc-v1-1.0.1-torch_script.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v1/1.0.1/torch_script/config.json) | A neural sparse encoding model. The model transforms text into a sparse vector, identifies the indexes of non-zero elements in the vector, and then converts the vector into `` pairs, where each entry corresponds to a non-zero element index. | -| `amazon/neural-sparse/opensearch-neural-sparse-tokenizer-v1` | 1.0.1 | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/amazon/neural-sparse/opensearch-neural-sparse-tokenizer-v1/1.0.1/torch_script/neural-sparse_opensearch-neural-sparse-tokenizer-v1-1.0.1-torch_script.zip)
- [config_url](https://artifacts.opensearch.org/models/ml-models/amazon/neural-sparse/opensearch-neural-sparse-tokenizer-v1/1.0.1/torch_script/config.json) | A neural sparse tokenizer model. The model tokenizes text into tokens and assigns each token a predefined weight, which is the token's IDF (if the IDF file is not provided, the weight defaults to 1). For more information, see [Preparing a model]({{site.url}}{{site.baseurl}}/ml-commons-plugin/custom-local-models/#preparing-a-model). | \ No newline at end of file diff --git a/_ml-commons-plugin/remote-models/blueprints.md b/_ml-commons-plugin/remote-models/blueprints.md index 431cdd007f..5cac2f3d3b 100644 --- a/_ml-commons-plugin/remote-models/blueprints.md +++ b/_ml-commons-plugin/remote-models/blueprints.md @@ -6,7 +6,7 @@ nav_order: 65 parent: Connecting to externally hosted models grand_parent: Integrating ML models redirect_from: - - ml-commons-plugin/extensibility/blueprints/ + - /ml-commons-plugin/extensibility/blueprints/ --- # Connector blueprints @@ -55,32 +55,41 @@ As an ML developer, you can build connector blueprints for other platforms. Usin ## Configuration parameters -The following configuration parameters are **required** in order to build a connector blueprint. - -| Field | Data type | Description | -| :--- | :--- | :--- | -| `name` | String | The name of the connector. | -| `description` | String | A description of the connector. | -| `version` | Integer | The version of the connector. | -| `protocol` | String | The protocol for the connection. For AWS services such as Amazon SageMaker and Amazon Bedrock, use `aws_sigv4`. For all other services, use `http`. | -| `parameters` | JSON object | The default connector parameters, including `endpoint` and `model`. Any parameters indicated in this field can be overridden by parameters specified in a predict request. | -| `credential` | JSON object | Defines any credential variables required in order to connect to your chosen endpoint. ML Commons uses **AES/GCM/NoPadding** symmetric encryption to encrypt your credentials. When the connection to the cluster first starts, OpenSearch creates a random 32-byte encryption key that persists in OpenSearch's system index. Therefore, you do not need to manually set the encryption key. | -| `actions` | JSON array | Defines what actions can run within the connector. If you're an administrator creating a connection, add the [blueprint]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/blueprints/) for your desired connection. | -| `backend_roles` | JSON array | A list of OpenSearch backend roles. For more information about setting up backend roles, see [Assigning backend roles to users]({{site.url}}{{site.baseurl}}/ml-commons-plugin/model-access-control#assigning-backend-roles-to-users). | -| `access_mode` | String | Sets the access mode for the model, either `public`, `restricted`, or `private`. Default is `private`. For more information about `access_mode`, see [Model groups]({{site.url}}{{site.baseurl}}/ml-commons-plugin/model-access-control#model-groups). | -| `add_all_backend_roles` | Boolean | When set to `true`, adds all `backend_roles` to the access list, which only a user with admin permissions can adjust. When set to `false`, non-admins can add `backend_roles`. | - -The `action` parameter supports the following options. - -| Field | Data type | Description | -| :--- | :--- | :--- | -| `action_type` | String | Required. Sets the ML Commons API operation to use upon connection. 
As of OpenSearch 2.9, only `predict` is supported. | -| `method` | String | Required. Defines the HTTP method for the API call. Supports `POST` and `GET`. | -| `url` | String | Required. Sets the connection endpoint at which the action occurs. This must match the regex expression for the connection used when [adding trusted endpoints]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/index#adding-trusted-endpoints). | -| `headers` | JSON object | Sets the headers used inside the request or response body. Default is `ContentType: application/json`. If your third-party ML tool requires access control, define the required `credential` parameters in the `headers` parameter. | -| `request_body` | String | Required. Sets the parameters contained inside the request body of the action. The parameters must include `\"inputText\`, which specifies how users of the connector should construct the request payload for the `action_type`. | -| `pre_process_function` | String | Optional. A built-in or custom Painless script used to preprocess the input data. OpenSearch provides the following built-in preprocess functions that you can call directly:
- `connector.pre_process.cohere.embedding` for [Cohere](https://cohere.com/) embedding models
- `connector.pre_process.openai.embedding` for [OpenAI](https://openai.com/) embedding models
- `connector.pre_process.default.embedding`, which you can use to preprocess documents in neural search requests so that they are in the format that ML Commons can process with the default preprocessor (OpenSearch 2.11 or later). For more information, see [built-in functions](#built-in-pre--and-post-processing-functions). | -| `post_process_function` | String | Optional. A built-in or custom Painless script used to post-process the model output data. OpenSearch provides the following built-in post-process functions that you can call directly:
- `connector.pre_process.cohere.embedding` for [Cohere text embedding models](https://docs.cohere.com/reference/embed)
- `connector.pre_process.openai.embedding` for [OpenAI text embedding models](https://platform.openai.com/docs/api-reference/embeddings)
- `connector.post_process.default.embedding`, which you can use to post-process documents in the model response so that they are in the format that neural search expects (OpenSearch 2.11 or later). For more information, see [built-in functions](#built-in-pre--and-post-processing-functions). | +| Field | Data type | Is required | Description | +|:------------------------|:------------|:------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `name` | String | Yes | The name of the connector. | +| `description` | String | Yes | A description of the connector. | +| `version` | Integer | Yes | The version of the connector. | +| `protocol` | String | Yes | The protocol for the connection. For AWS services such as Amazon SageMaker and Amazon Bedrock, use `aws_sigv4`. For all other services, use `http`. | +| `parameters` | JSON object | Yes | The default connector parameters, including `endpoint` and `model`. Any parameters indicated in this field can be overridden by parameters specified in a predict request. | +| `credential` | JSON object | Yes | Defines any credential variables required to connect to your chosen endpoint. ML Commons uses **AES/GCM/NoPadding** symmetric encryption to encrypt your credentials. When the connection to the cluster first starts, OpenSearch creates a random 32-byte encryption key that persists in OpenSearch's system index. Therefore, you do not need to manually set the encryption key. | +| `actions` | JSON array | Yes | Defines what actions can run within the connector. If you're an administrator creating a connection, add the [blueprint]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/blueprints/) for your desired connection. | +| `backend_roles` | JSON array | Yes | A list of OpenSearch backend roles. For more information about setting up backend roles, see [Assigning backend roles to users]({{site.url}}{{site.baseurl}}/ml-commons-plugin/model-access-control#assigning-backend-roles-to-users). | +| `access_mode` | String | Yes | Sets the access mode for the model, either `public`, `restricted`, or `private`. Default is `private`. For more information about `access_mode`, see [Model groups]({{site.url}}{{site.baseurl}}/ml-commons-plugin/model-access-control#model-groups). | +| `add_all_backend_roles` | Boolean | Yes | When set to `true`, adds all `backend_roles` to the access list, which only a user with admin permissions can adjust. When set to `false`, non-admins can add `backend_roles`. | +| `client_config` | JSON object | No | The client configuration object, which provides settings that control the behavior of the client connections used by the connector. These settings allow you to manage connection limits and timeouts, ensuring efficient and reliable communication. | + + +The `actions` parameter supports the following options. 
+ +| Field | Data type | Description | +|:------------------------|:------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `action_type` | String | Required. Sets the ML Commons API operation to use upon connection. As of OpenSearch 2.9, only `predict` is supported. | +| `method` | String | Required. Defines the HTTP method for the API call. Supports `POST` and `GET`. | +| `url` | String | Required. Sets the connection endpoint at which the action occurs. This must match the regex expression for the connection used when [adding trusted endpoints]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/index#adding-trusted-endpoints). | +| `headers` | JSON object | Sets the headers used inside the request or response body. Default is `ContentType: application/json`. If your third-party ML tool requires access control, define the required `credential` parameters in the `headers` parameter. | +| `request_body` | String | Required. Sets the parameters contained in the request body of the action. The parameters must include `\"inputText\`, which specifies how users of the connector should construct the request payload for the `action_type`. | +| `pre_process_function` | String | Optional. A built-in or custom Painless script used to preprocess the input data. OpenSearch provides the following built-in preprocess functions that you can call directly:
- `connector.pre_process.cohere.embedding` for [Cohere](https://cohere.com/) embedding models
- `connector.pre_process.openai.embedding` for [OpenAI](https://openai.com/) embedding models
- `connector.pre_process.default.embedding`, which you can use to preprocess documents in neural search requests so that they are in the format that ML Commons can process with the default preprocessor (OpenSearch 2.11 or later). For more information, see [Built-in functions](#built-in-pre--and-post-processing-functions). | +| `post_process_function` | String | Optional. A built-in or custom Painless script used to post-process the model output data. OpenSearch provides the following built-in post-process functions that you can call directly:
- `connector.post_process.cohere.embedding` for [Cohere text embedding models](https://docs.cohere.com/reference/embed)<br> - `connector.post_process.openai.embedding` for [OpenAI text embedding models](https://platform.openai.com/docs/api-reference/embeddings)<br>
- `connector.post_process.default.embedding`, which you can use to post-process documents in the model response so that they are in the format that neural search expects (OpenSearch 2.11 or later). For more information, see [Built-in functions](#built-in-pre--and-post-processing-functions). | + + +The `client_config` parameter supports the following options. + +| Field | Data type | Description | +|:---------------------|:----------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `max_connection` | Integer | The maximum number of concurrent connections that the client can establish with the server. | +| `connection_timeout` | Integer | The maximum amount of time (in seconds) that the client will wait while trying to establish a connection to the server. A timeout prevents the client from waiting indefinitely and allows it to recover from unreachable network endpoints. | +| `read_timeout` | Integer | The maximum amount of time (in seconds) that the client will wait for a response from the server after sending a request. Useful when the server is slow to respond or encounters issues while processing a request. | ## Built-in pre- and post-processing functions diff --git a/_ml-commons-plugin/remote-models/connectors.md b/_ml-commons-plugin/remote-models/connectors.md index 6d77ad5210..64b55142a7 100644 --- a/_ml-commons-plugin/remote-models/connectors.md +++ b/_ml-commons-plugin/remote-models/connectors.md @@ -7,7 +7,7 @@ nav_order: 61 parent: Connecting to externally hosted models grand_parent: Integrating ML models redirect_from: - - ml-commons-plugin/extensibility/connectors/ + - /ml-commons-plugin/extensibility/connectors/ --- # Creating connectors for third-party ML platforms @@ -37,12 +37,12 @@ The following table lists all connector blueprints provided by OpenSearch. 
Follo Platform | Model | Connector blueprint :--- | :--- | :--- -[Amazon Bedrock](https://aws.amazon.com/bedrock/) | [A21 Labs Jurassic-2 Mid](https://aws.amazon.com/bedrock/jurassic/) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/remote_inference_blueprints/bedrock_connector_ai21labs_jurassic_blueprint.md) +[Amazon Bedrock](https://aws.amazon.com/bedrock/) | [AI21 Labs Jurassic-2 Mid](https://aws.amazon.com/bedrock/jurassic/) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/remote_inference_blueprints/bedrock_connector_ai21labs_jurassic_blueprint.md) [Amazon Bedrock](https://aws.amazon.com/bedrock/) | [Anthropic Claude v2](https://aws.amazon.com/bedrock/claude/) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/remote_inference_blueprints/bedrock_connector_anthropic_claude_blueprint.md) [Amazon Bedrock](https://aws.amazon.com/bedrock/) | [Titan Text Embeddings](https://aws.amazon.com/bedrock/titan/) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/remote_inference_blueprints/bedrock_connector_titan_embedding_blueprint.md) [Amazon SageMaker](https://aws.amazon.com/sagemaker/) | Text embedding models | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/remote_inference_blueprints/sagemaker_connector_blueprint.md) -[Cohere](https://cohere.com/) | The `embed-english-v2.0` [text embedding model](https://docs.cohere.com/reference/embed) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/remote_inference_blueprints/cohere_v2_connector_embedding_blueprint.md) -[Cohere](https://cohere.com/) | The `embed-english-v3.0` [text embedding model](https://docs.cohere.com/reference/embed) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/remote_inference_blueprints/cohere_v3_connector_embedding_blueprint.md) +[Cohere](https://cohere.com/) | [Text Embedding models](https://docs.cohere.com/reference/embed) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/remote_inference_blueprints/cohere_connector_embedding_blueprint.md) +[Cohere](https://cohere.com/) | [Chat models](https://docs.cohere.com/reference/chat) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/main/docs/remote_inference_blueprints/cohere_connector_chat_blueprint.md) [OpenAI](https://openai.com/) | Chat models (for example, `gpt-3.5-turbo`) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/remote_inference_blueprints/open_ai_connector_chat_blueprint.md) [OpenAI](https://openai.com/) | Completion models (for example, `text-davinci-003`) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/remote_inference_blueprints/open_ai_connector_completion_blueprint.md) [OpenAI](https://openai.com/) | Text embedding models (for example, `text-embedding-ada-002`) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/remote_inference_blueprints/openai_connector_embedding_blueprint.md) @@ -214,20 +214,21 @@ The `parameters` section requires the following options when using `aws_sigv4` a ### Cohere connector -You can use the following example request to create a standalone Cohere connector: +You can use the following example request to create a standalone Cohere connector using the Embed V3 model. 
For more information, see [Cohere connector blueprint](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/remote_inference_blueprints/cohere_connector_embedding_blueprint.md). ```json POST /_plugins/_ml/connectors/_create { - "name": "", - "description": "", - "version": "", + "name": "Cohere Embed Model", + "description": "The connector to Cohere's public embed API", + "version": "1", "protocol": "http", "credential": { - "cohere_key": "" + "cohere_key": "" }, "parameters": { - "model": "embed-english-v2.0", + "model": "embed-english-v3.0", + "input_type":"search_document", "truncate": "END" }, "actions": [ @@ -236,9 +237,10 @@ POST /_plugins/_ml/connectors/_create "method": "POST", "url": "https://api.cohere.ai/v1/embed", "headers": { - "Authorization": "Bearer ${credential.cohere_key}" + "Authorization": "Bearer ${credential.cohere_key}", + "Request-Source": "unspecified:opensearch" }, - "request_body": "{ \"texts\": ${parameters.texts}, \"truncate\": \"${parameters.truncate}\", \"model\": \"${parameters.model}\" }", + "request_body": "{ \"texts\": ${parameters.texts}, \"truncate\": \"${parameters.truncate}\", \"model\": \"${parameters.model}\", \"input_type\": \"${parameters.input_type}\" }", "pre_process_function": "connector.pre_process.cohere.embedding", "post_process_function": "connector.post_process.cohere.embedding" } @@ -285,6 +287,21 @@ POST /_plugins/_ml/connectors/_create ``` {% include copy-curl.html %} +## Updating connector credentials + +In some cases, you may need to update credentials, like `access_key`, that you use to connect to externally hosted models. You can update credentials without undeploying the model by providing the new credentials in the following request: + +```json +PUT /_plugins/_ml/models/ +{ + "connector": { + "credential": { + "openAI_key": "YOUR NEW OPENAI KEY" + } + } +} +``` + ## Next steps - To learn more about connecting to external models, see [Connecting to externally hosted models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/index/). diff --git a/_ml-commons-plugin/remote-models/guardrails.md b/_ml-commons-plugin/remote-models/guardrails.md new file mode 100644 index 0000000000..ca34eb335c --- /dev/null +++ b/_ml-commons-plugin/remote-models/guardrails.md @@ -0,0 +1,298 @@ +--- +layout: default +title: Guardrails +has_children: false +has_toc: false +nav_order: 70 +parent: Connecting to externally hosted models +grand_parent: Integrating ML models +--- + +# Configuring model guardrails +**Introduced 2.13** +{: .label .label-purple } + +Guardrails can guide a large language model (LLM) toward desired behavior. They act as a filter, preventing the LLM from generating output that is harmful or violates ethical principles and facilitating safer use of AI. Guardrails also cause the LLM to produce more focused and relevant output. + +To configure guardrails for your LLM, you can provide a list of words to be prohibited in the input or output of the model. Alternatively, you can provide a regular expression against which the model input or output will be matched. + +## Prerequisites + +Before you start, make sure you have fulfilled the [prerequisites]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/index/#prerequisites) for connecting to an externally hosted model. + +## Step 1: Create a guardrail index + +To start, create an index that will store the excluded words (_stopwords_). 
In the index settings, specify a `title` field, which will contain excluded words, and a `query` field of the [percolator]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/percolator/) type. The percolator query will be used to match the LLM input or output: + +```json +PUT /words0 +{ + "mappings": { + "properties": { + "title": { + "type": "text" + }, + "query": { + "type": "percolator" + } + } + } +} +``` +{% include copy-curl.html %} + +## Step 2: Index excluded words or phrases + +Next, index a query string query that will be used to match excluded words in the model input or output: + +```json +PUT /words0/_doc/1?refresh +{ + "query": { + "query_string": { + "query": "title: blacklist" + } + } +} +``` +{% include copy-curl.html %} + +```json +PUT /words0/_doc/2?refresh +{ + "query": { + "query_string": { + "query": "title: \"Master slave architecture\"" + } + } +} +``` +{% include copy-curl.html %} + +For more query string options, see [Query string query]({{site.url}}{{site.baseurl}}/query-dsl/full-text/query-string/). + +## Step 3: Register a model group + +To register a model group, send the following request: + +```json +POST /_plugins/_ml/model_groups/_register +{ + "name": "bedrock", + "description": "This is a public model group." +} +``` +{% include copy-curl.html %} + +The response contains the model group ID that you'll use to register a model to this model group: + +```json +{ + "model_group_id": "wlcnb4kBJ1eYAeTMHlV6", + "status": "CREATED" +} +``` + +To learn more about model groups, see [Model access control]({{site.url}}{{site.baseurl}}/ml-commons-plugin/model-access-control/). + +## Step 4: Create a connector + +Now you can create a connector for the model. In this example, you'll create a connector to the Anthropic Claude model hosted on Amazon Bedrock: + +```json +POST /_plugins/_ml/connectors/_create +{ + "name": "BedRock test claude Connector", + "description": "The connector to BedRock service for claude model", + "version": 1, + "protocol": "aws_sigv4", + "parameters": { + "region": "us-east-1", + "service_name": "bedrock", + "anthropic_version": "bedrock-2023-05-31", + "endpoint": "bedrock.us-east-1.amazonaws.com", + "auth": "Sig_V4", + "content_type": "application/json", + "max_tokens_to_sample": 8000, + "temperature": 0.0001, + "response_filter": "$.completion" + }, + "credential": { + "access_key": "", + "secret_key": "" + }, + "actions": [ + { + "action_type": "predict", + "method": "POST", + "url": "https://bedrock-runtime.us-east-1.amazonaws.com/model/anthropic.claude-v2/invoke", + "headers": { + "content-type": "application/json", + "x-amz-content-sha256": "required" + }, + "request_body": "{\"prompt\":\"${parameters.prompt}\", \"max_tokens_to_sample\":${parameters.max_tokens_to_sample}, \"temperature\":${parameters.temperature}, \"anthropic_version\":\"${parameters.anthropic_version}\" }" + } + ] +} +``` +{% include copy-curl.html %} + +The response contains the connector ID for the newly created connector: + +```json +{ + "connector_id": "a1eMb4kBJ1eYAeTMAljY" +} +``` + +## Step 5: Register and deploy the model with guardrails + +To register an externally hosted model, provide the model group ID from step 3 and the connector ID from step 4 in the following request. 
To configure guardrails, include the `guardrails` object: + +```json +POST /_plugins/_ml/models/_register?deploy=true +{ + "name": "Bedrock Claude V2 model", + "function_name": "remote", + "model_group_id": "wlcnb4kBJ1eYAeTMHlV6", + "description": "test model", + "connector_id": "a1eMb4kBJ1eYAeTMAljY", + "guardrails": { + "type": "local_regex", + "input_guardrail": { + "stop_words": [ + { + "index_name": "words0", + "source_fields": [ + "title" + ] + } + ], + "regex": [ + ".*abort.*", + ".*kill.*" + ] + }, + "output_guardrail": { + "stop_words": [ + { + "index_name": "words0", + "source_fields": [ + "title" + ] + } + ], + "regex": [ + ".*abort.*", + ".*kill.*" + ] + } + } +} +``` +{% include copy-curl.html %} + +For more information, see [The `guardrails` parameter]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/register-model/#the-guardrails-parameter). + +OpenSearch returns the task ID of the register operation: + +```json +{ + "task_id": "cVeMb4kBJ1eYAeTMFFgj", + "status": "CREATED" +} +``` + +To check the status of the operation, provide the task ID to the [Tasks API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/tasks-apis/get-task/): + +```bash +GET /_plugins/_ml/tasks/cVeMb4kBJ1eYAeTMFFgj +``` +{% include copy-curl.html %} + +When the operation is complete, the state changes to `COMPLETED`: + +```json +{ + "model_id": "cleMb4kBJ1eYAeTMFFg4", + "task_type": "DEPLOY_MODEL", + "function_name": "REMOTE", + "state": "COMPLETED", + "worker_node": [ + "n-72khvBTBi3bnIIR8FTTw" + ], + "create_time": 1689793851077, + "last_update_time": 1689793851101, + "is_async": true +} +``` + +## Step 6 (Optional): Test the model + +To demonstrate how guardrails are applied, first run the predict operation that does not contain any excluded words: + +```json +POST /_plugins/_ml/models/p94dYo4BrXGpZpgPp98E/_predict +{ + "parameters": { + "prompt": "\n\nHuman:this is a test\n\nnAssistant:" + } +} +``` +{% include copy-curl.html %} + +The response contains inference results: + +```json +{ + "inference_results": [ + { + "output": [ + { + "name": "response", + "dataAsMap": { + "response": " Thank you for the test, I appreciate you taking the time to interact with me. I'm an AI assistant created by Anthropic to be helpful, harmless, and honest." + } + } + ], + "status_code": 200 + } + ] +} +``` + +Then run the predict operation that contains excluded words: + +```json +POST /_plugins/_ml/models/p94dYo4BrXGpZpgPp98E/_predict +{ + "parameters": { + "prompt": "\n\nHuman:this is a test of Master slave architecture\n\nnAssistant:" + } +} +``` +{% include copy-curl.html %} + +The response contains an error message because guardrails were triggered: + +```json +{ + "error": { + "root_cause": [ + { + "type": "illegal_argument_exception", + "reason": "guardrails triggered for user input" + } + ], + "type": "illegal_argument_exception", + "reason": "guardrails triggered for user input" + }, + "status": 400 +} +``` + +Guardrails are also triggered when a prompt matches the supplied regular expression. + +## Next steps + +- For more information about configuring guardrails, see [The `guardrails` parameter]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/register-model/#the-guardrails-parameter). 
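As an additional check that is not part of the original walkthrough, you can run a standard percolate query directly against the guardrail index created in step 1 to see which stored stopword queries a given prompt would match. This is only a verification aid, not part of the guardrail configuration; the index name `words0` and the sample prompt are taken from the preceding steps, so adjust them for your own setup:

```json
GET /words0/_search
{
  "query": {
    "percolate": {
      "field": "query",
      "document": {
        "title": "this is a test of Master slave architecture"
      }
    }
  }
}
```
{% include copy-curl.html %}

If the response returns the document indexed in step 2 for the phrase `Master slave architecture`, the same prompt should also trigger the input guardrail during a predict call.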
\ No newline at end of file diff --git a/_ml-commons-plugin/remote-models/index.md b/_ml-commons-plugin/remote-models/index.md index 99eca95fcd..0b92adaab6 100644 --- a/_ml-commons-plugin/remote-models/index.md +++ b/_ml-commons-plugin/remote-models/index.md @@ -6,7 +6,7 @@ has_children: true has_toc: false nav_order: 60 redirect_from: - - ml-commons-plugin/extensibility/index/ + - /ml-commons-plugin/extensibility/index/ --- # Connecting to externally hosted models @@ -177,7 +177,7 @@ OpenSearch returns the task ID of the register operation: } ``` -To check the status of the operation, provide the task ID to the [Tasks API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/tasks-apis/get-task/#get-a-task-by-id): +To check the status of the operation, provide the task ID to the [Tasks API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/tasks-apis/get-task/): ```bash GET /_plugins/_ml/tasks/cVeMb4kBJ1eYAeTMFFgj @@ -205,7 +205,18 @@ Take note of the returned `model_id` because you’ll need it to deploy the mode ## Step 4: Deploy the model -To deploy the registered model, provide its model ID from step 3 in the following request: +Starting with OpenSearch version 2.13, externally hosted models are deployed automatically by default when you send a Predict API request for the first time. To disable automatic deployment for an externally hosted model, set `plugins.ml_commons.model_auto_deploy.enable` to `false`: +```json +PUT _cluster/settings +{ + "persistent": { + "plugins.ml_commons.model_auto_deploy.enable" : "false" + } +} +``` +{% include copy-curl.html %} + +To undeploy the model, use the [Undeploy API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/undeploy-model/). ```bash POST /_plugins/_ml/models/cleMb4kBJ1eYAeTMFFg4/_deploy @@ -309,7 +320,7 @@ The response contains the inference results provided by the OpenAI model: ## Step 6: Use the model for search -To learn how to use the model for vector search, see [Set up neural search]({{site.url}}{{site.baseurl}}http://localhost:4000/docs/latest/search-plugins/neural-search/#set-up-neural-search). +To learn how to use the model for vector search, see [Using an ML model for neural search]({{site.url}}{{site.baseurl}}/search-plugins/neural-search/#using-an-ml-model-for-neural-search). ## Next steps @@ -317,3 +328,4 @@ To learn how to use the model for vector search, see [Set up neural search]({{si - For more information about connector parameters, see [Connector blueprints]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/blueprints/). - For more information about managing ML models in OpenSearch, see [Using ML models within OpenSearch]({{site.url}}{{site.baseurl}}/ml-commons-plugin/model-serving-framework/). - For more information about interacting with ML models in OpenSearch, see [Managing ML models in OpenSearch Dashboards]({{site.url}}{{site.baseurl}}/ml-commons-plugin/ml-dashboard/) +For instructions on how to configure model guardrails, see [Guardrails]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/guardrails/). diff --git a/_ml-commons-plugin/using-ml-models.md b/_ml-commons-plugin/using-ml-models.md index 2e158310b6..db50626721 100644 --- a/_ml-commons-plugin/using-ml-models.md +++ b/_ml-commons-plugin/using-ml-models.md @@ -10,7 +10,7 @@ redirect_from: --- # Using ML models within OpenSearch -**Generally available 2.9** +**Introduced 2.9** {: .label .label-purple } To integrate machine learning (ML) models into your OpenSearch cluster, you can upload and serve them locally. 
Choose one of the following options: @@ -19,7 +19,9 @@ To integrate machine learning (ML) models into your OpenSearch cluster, you can - **Custom models** such as PyTorch deep learning models: To learn more, see [Custom models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/custom-local-models/). +Running local models on the CentOS 7 operating system is not supported. Moreover, not all local models can run on all hardware and operating systems. +{: .important} + ## GPU acceleration For better performance, you can take advantage of GPU acceleration on your ML node. For more information, see [GPU acceleration]({{site.url}}{{site.baseurl}}/ml-commons-plugin/gpu-acceleration/). - diff --git a/_monitoring-your-cluster/index.md b/_monitoring-your-cluster/index.md index 1d0f5bfe5f..105c3231d2 100644 --- a/_monitoring-your-cluster/index.md +++ b/_monitoring-your-cluster/index.md @@ -18,4 +18,5 @@ OpenSearch provides several ways for you to monitor your cluster health and perf - [Performance analyzer]({{site.url}}{{site.baseurl}}/monitoring-your-cluster/pa/index/) is an agent and REST API that allows you to query numerous performance metrics for your cluster, including aggregations of those metrics. -- OpenSearch [Job Scheduler]({{site.url}}{{site.baseurl}}/monitoring-your-cluster/job-scheduler/index/) plugin provides a framework that you can use to build schedules for common cluster management tasks. \ No newline at end of file +- OpenSearch [Job Scheduler]({{site.url}}{{site.baseurl}}/monitoring-your-cluster/job-scheduler/index/) plugin provides a framework that you can use to build schedules for common cluster management tasks. +- The OpenSearch [Metrics Framework]({{site.url}}{{site.baseurl}}/monitoring-your-cluster/metrics/) plugin provides a framework that you can use to export the telemetry metrics to the store of your choice. diff --git a/_monitoring-your-cluster/metrics/getting-started.md b/_monitoring-your-cluster/metrics/getting-started.md new file mode 100644 index 0000000000..659614a07c --- /dev/null +++ b/_monitoring-your-cluster/metrics/getting-started.md @@ -0,0 +1,107 @@ +--- +layout: default +title: Metrics framework +nav_order: 1 +has_children: false +has_toc: false +redirect_from: + - /monitoring-your-cluster/metrics/ +--- + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, see the associated [GitHub issue](https://github.com/opensearch-project/OpenSearch/issues/10141). +{: .warning} + +While the OpenSearch Stats APIs offer insight into the inner workings of each node and an OpenSearch cluster as a whole, the statistics lack certain details, such as percentiles, and do not provide the semantics of richer metric types, like histograms. Consequently, identifying outliers within cluster statistics becomes challenging when using only the Stats API. + +The metrics framework feature adds comprehensive metrics support to effectively monitor an OpenSearch cluster. Using the Metrics Framework APIs, plugin, and extension, developers can add new monitoring metrics. In addition, the OpenSearch distribution bundles the `telemetry-otel` plugin, which provides the implementation for metrics instrumentation based on the [OpenTelemetry](https://opentelemetry.io) Java SDK. + + +## Getting started + +The metrics framework feature is experimental as of OpenSearch 2.11. 
To begin using the metrics framework feature, you need to first enable the `telemetry feature` by using the `opensearch.experimental.feature.telemetry.enabled` feature flag and subsequently by using the metrics framework feature flag. + +Enabling this feature can consume system resources. Before enabling the metrics framework feature, determine whether you have sufficient cluster resources to allocate. +{: .warning} + +### Enabling the feature flag on a node using tarball + +The `enable` flag is toggled using a Java Virtual Machine (JVM) parameter that is set either in `OPENSEARCH_JAVA_OPTS` or in `config/jvm.options`. + +#### Option 1: Enable the experimental feature flag in the `opensearch.yml` file + +1. Navigate to your OpenSearch directory using the following command: + + ```bash + cd \path\to\opensearch + ``` + +2. Open your `opensearch.yaml` file. +3. Add the following setting to `opensearch.yaml`: + + ```bash + opensearch.experimental.feature.telemetry.enabled=true + ``` + {% include copy.html %} + +4. Save your changes and close the file. + +#### Option 2: Modify jvm.options + +To enable the metrics framework feature using `jvm`, add the following line to `config/jvm.options` before starting OpenSearch: + +```bash +-Dopensearch.experimental.feature.telemetry.enabled=true +``` +{% include copy.html %} + +#### Option 3: Enable from an environment variable + +You can enable the metrics framework feature with a single command by adding the metrics framework environment variable to the `OPENSEARCH_JAVA_OPTS` command, as shown in the following example: + +```bash +OPENSEARCH_JAVA_OPTS="-Dopensearch.experimental.feature.telemetry.enabled=true" ./opensearch-2.9.0/bin/opensearch +``` +{% include copy.html %} + +You can also define the environment variable separately before running OpenSearch by running the following command: + +```bash +export OPENSEARCH_JAVA_OPTS="-Dopensearch.experimental.feature.telemetry.enabled=true" + ./bin/opensearch +``` +{% include copy.html %} + +### Enable with Docker + +If you’re running OpenSearch using Docker, add the following line to `docker-compose.yml` under `environment`: + +```bash +OPENSEARCH_JAVA_OPTS="-Dopensearch.experimental.feature.telemetry.enabled=true" +``` +{% include copy.html %} + + +### Enable the metrics framework feature + +Once you've enabled the feature flag, you can enable the metrics framework feature by using the following setting, which enables metrics in the `opensearch.yaml` file: + +```bash +telemetry.feature.metrics.enabled=true +``` + +The metrics framework feature supports various telemetry solutions through plugins. Use the following instructions to enable the `telemetry-otel` plugin: + + +1. **Publish interval:** The metrics framework feature can locally aggregate metrics with unique information about the configured publish interval and then export those metrics. By default, the interval is 1 minute. However, you can change the interval using the `telemetry.otel.metrics.publish.interval` cluster setting. +2. **Exporters:** Exporters are responsible for persisting the data. OpenTelemetry provides several out-of-the-box exporters. OpenSearch supports the following exporters: + - `LoggingMetricExporter`: Exports metrics to a log file, generating a separate file in the logs directory `_otel_metrics.log`. Default is `telemetry.otel.metrics.exporter.class=io.opentelemetry.exporter.logging.LoggingMetricExporter`. + - `OtlpGrpcMetricExporter`: Exports spans through gRPC. 
To use this exporter, you need to install the `otel-collector` on the node. By default, it writes to the http://localhost:4317/ endpoint. To use this exporter, set the following static setting: `telemetry.otel.metrics.exporter.class=io.opentelemetry.exporter.otlp.metrics.OtlpGrpcMetricExporter`. + +### Supported metric types + +The metrics framework feature supports the following metric types: + +1. **Counters:** Counters are continuous and synchronous meters used to track the frequency of events over time. Counters can only be incremented with positive values, making them ideal for measuring the number of monitoring occurrences such as errors, processed or received bytes, and total requests. +2. **UpDown counters:** UpDown counters can be incremented with positive values or decremented with negative values. UpDown counters are well suited for tracking metrics like open connections, active requests, and other fluctuating quantities. +3. **Histograms:** Histograms are valuable tools for visualizing the distribution of continuous data. Histograms offer insight into the central tendency, spread, skewness, and potential outliers that might exist in your metrics. Patterns such as normal distribution, skewed distribution, or bimodal distribution can be readily identified, making histograms ideal for analyzing latency metrics and assessing percentiles. +4. **Asynchronous Gauges:** Asynchronous gauges capture the current value at the moment a metric is read. These metrics are non-additive and are commonly used to measure CPU utilization on a per-minute basis, memory utilization, and other real-time values. diff --git a/_monitoring-your-cluster/pa/dashboards.md b/_monitoring-your-cluster/pa/dashboards.md index d77a3c90ba..ef4e45ac0c 100644 --- a/_monitoring-your-cluster/pa/dashboards.md +++ b/_monitoring-your-cluster/pa/dashboards.md @@ -9,11 +9,14 @@ redirect_from: # PerfTop dashboards -Dashboards are defined in JSON and composed of three main elements: tables, line graphs, and bar graphs. You define a grid of rows and columns and then place elements within that grid, with each element spanning as many rows and columns as you specify. +You can view metrics derived from Performance Analyzer in a PerfTop dashboard. A PerfTop dashboard is a command line interface (CLI) for displaying the metrics. A PerfTop dashboard consists of three main elements: tables, line graphs, and bar graphs. Using JSON, you define a grid of rows and columns and then place elements within that grid, with each element spanning as many rows and columns as you specify. The best way to get started with building custom dashboards is to duplicate and modify one of the existing JSON files in the `dashboards` directory. {: .tip } +An example dashboard is shown in the following image. 
+![PerfTop dashboard]({{site.url}}{{site.baseurl}}/images/perftop.jpg) + --- #### Table of contents diff --git a/_monitoring-your-cluster/pa/index.md b/_monitoring-your-cluster/pa/index.md index e88831ba4e..bb4f9c6c30 100644 --- a/_monitoring-your-cluster/pa/index.md +++ b/_monitoring-your-cluster/pa/index.md @@ -245,7 +245,7 @@ curl -XPOST http://localhost:9200/_plugins/_performanceanalyzer/rca/cluster/conf If you encounter the `curl: (52) Empty reply from server` response, run the following command to enable RCA: ```bash -curl -XPOST https://localhost:9200/_plugins/_performanceanalyzer/rca/cluster/config -H 'Content-Type: application/json' -d '{"enabled": true}' -u 'admin:admin' -k +curl -XPOST https://localhost:9200/_plugins/_performanceanalyzer/rca/cluster/config -H 'Content-Type: application/json' -d '{"enabled": true}' -u 'admin:' -k ``` ### Example API query and response diff --git a/_monitoring-your-cluster/pa/reference.md b/_monitoring-your-cluster/pa/reference.md index 4d9e85328b..8b076b1ba5 100644 --- a/_monitoring-your-cluster/pa/reference.md +++ b/_monitoring-your-cluster/pa/reference.md @@ -219,84 +219,6 @@ This list is extensive. We recommend using Ctrl/Cmd + F to find what you're look Shard request cache memory size in bytes. - - Refresh_Event - - The total number of refreshes executed in the past five seconds. - - - - Refresh_Time - - The total time (milliseconds) spent executing refreshes in the past five seconds - - - - Flush_Event - - The total number of flushes executed in the past five seconds. - - - - Flush_Time - - The total time (milliseconds) spent executing flushes in the past five seconds. - - - - Merge_Event - - The total number of merges executed in the past five seconds. - - - - Merge_Time - - The total time (milliseconds) spent executing merges in the past five seconds. - - - - Merge_CurrentEvent - - The current number of merges executing. - - - - Indexing_Buffer - - Index buffer memory size in bytes. - - - - Segments_Total - - The number of segments. - - - - IndexWriter_Memory - - Estimated memory usage by the index writer in bytes. - - - - Bitset_Memory - - Estimated memory usage for the cached bit sets in bytes. - - - - VersionMap_Memory - - Estimated memory usage of the version map in bytes. - - - - Shard_Size_In_Bytes - - Estimated disk usage of the shard in bytes. - - @@ -821,27 +743,173 @@ The following metrics are relevant to the cluster as a whole and do not require +## Relevant dimensions: `NodeID`, `searchbp_mode` + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
MetricDescription
SearchBP_Shard_Stats_CancellationCount + The number of tasks marked for cancellation at the shard task level. +
SearchBP_Shard_Stats_LimitReachedCount + The number of times that the cancellable task total exceeded the set cancellation threshold at the shard task level. +
SearchBP_Shard_Stats_Resource_Heap_Usage_CancellationCount + The number of tasks marked for cancellation because of excessive heap usage since the node last restarted at the shard task level. +
SearchBP_Shard_Stats_Resource_Heap_Usage_CurrentMax + The maximum heap usage for tasks currently running at the shard task level. +
SearchBP_Shard_Stats_Resource_Heap_Usage_RollingAvg + The rolling average heap usage for the _n_ most recent tasks at the shard task level. The default value for _n_ is `100`. +
SearchBP_Shard_Stats_Resource_CPU_Usage_CancellationCount + The number of tasks marked for cancellation because of excessive CPU usage since the node last restarted at the shard task level. +
SearchBP_Shard_Stats_Resource_CPU_Usage_CurrentMax + The maximum CPU time for all tasks currently running on the node at the shard task level. +
SearchBP_Shard_Stats_Resource_CPU_Usage_CurrentAvg + The average CPU time for all tasks currently running on the node at the shard task level. +
SearchBP_Shard_Stats_Resource_ElaspedTime_Usage_CancellationCount + The number of tasks marked for cancellation because of excessive time elapsed since the node last restarted at the shard task level. +
SearchBP_Shard_Stats_Resource_ElaspedTime_Usage_CurrentMax + The maximum time elapsed for all tasks currently running on the node at the shard task level. +
SearchBP_Shard_Stats_Resource_ElaspedTime_Usage_CurrentAvg + The average time elapsed for all tasks currently running on the node at the shard task level. +
Searchbp_Task_Stats_CancellationCount + The number of tasks marked for cancellation at the search task level. +
SearchBP_Task_Stats_LimitReachedCount + The number of times that the cancellable task total exceeded the set cancellation threshold at the search task level. +
SearchBP_Task_Stats_Resource_Heap_Usage_CancellationCount + The number of tasks marked for cancellation because of excessive heap usage since the node last restarted at the search task level. +
SearchBP_Task_Stats_Resource_Heap_Usage_CurrentMax + The maximum heap usage for tasks currently running at the search task level. +
SearchBP_Task_Stats_Resource_Heap_Usage_RollingAvg + The rolling average heap usage for the _n_ most recent tasks at the search task level. The default value for _n_ is `10`. +
SearchBP_Task_Stats_Resource_CPU_Usage_CancellationCount + The number of tasks marked for cancellation because of excessive CPU usage since the node last restarted at the search task level. +
SearchBP_Task_Stats_Resource_CPU_Usage_CurrentMax + The maximum CPU time for all tasks currently running on the node at the search task level. +
SearchBP_Task_Stats_Resource_CPU_Usage_CurrentAvg + The average CPU time for all tasks currently running on the node at the search task level. +
SearchBP_Task_Stats_Resource_ElaspedTime_Usage_CancellationCount + The number of tasks marked for cancellation because of excessive time elapsed since the node last restarted at the search task level. +
SearchBP_Task_Stats_Resource_ElaspedTime_Usage_CurrentMax + The maximum time elapsed for all tasks currently running on the node at the search task level. +
SearchBP_Task_Stats_Resource_ElaspedTime_Usage_CurrentAvg + The average time elapsed for all tasks currently running on the node at the search task level. +
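The search backpressure metrics in the preceding table can be retrieved in the same way as the other Performance Analyzer metrics in this reference. The following command is a minimal sketch; it assumes the default Performance Analyzer endpoint on port 9600 and that the metric and dimension names shown in the table are accepted by the `metrics` and `dim` parameters:

```bash
# Sum the shard-level cancellation count for all nodes, grouped by the NodeID dimension.
curl -XGET "localhost:9600/_plugins/_performanceanalyzer/metrics?metrics=SearchBP_Shard_Stats_CancellationCount&agg=sum&dim=NodeID&nodes=all"
```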
+ ## Dimensions reference | Dimension | Return values | |----------------------|-------------------------------------------------| -| ShardID | The ID of the shard, for example, `1`. | -| IndexName | The name of the index, for example, `my-index`. | -| Operation | The type of operation, for example, `shardbulk`. | -| ShardRole | The shard role, for example, `primary` or `replica`. | -| Exception | OpenSearch exceptions, for example, `org.opensearch.index_not_found_exception`. | -| Indices | The list of indexes in the request URL. | -| HTTPRespCode | The response code from OpenSearch, for example, `200`. | -| MemType | The memory type, for example, `totYoungGC`, `totFullGC`, `Survivor`, `PermGen`, `OldGen`, `Eden`, `NonHeap`, or `Heap`. | -| DiskName | The name of the disk, for example, `sda1`. | -| DestAddr | The destination address, for example, `010015AC`. | -| Direction | The direction, for example, `in` or `out`. | -| ThreadPoolType | The OpenSearch thread pools, for example, `index`, `search`, or `snapshot`. | -| CBType | The circuit breaker type, for example, `accounting`, `fielddata`, `in_flight_requests`, `parent`, or `request`. | -| ClusterManagerTaskInsertOrder| The order in which the task was inserted, for example, `3691`. | -| ClusterManagerTaskPriority | The priority of the task, for example, `URGENT`. OpenSearch executes higher-priority tasks before lower-priority ones, regardless of `insert_order`. | -| ClusterManagerTaskType | The task type, for example, `shard-started`, `create-index`, `delete-index`, `refresh-mapping`, `put-mapping`, `CleanupSnapshotRestoreState`, or `Update snapshot state`. | -| ClusterManagerTaskMetadata | The metadata for the task (if any). | -| CacheType | The cache type, for example, `Field_Data_Cache`, `Shard_Request_Cache`, or `Node_Query_Cache`. | - +| `ShardID` | The ID of the shard, for example, `1`. | +| `IndexName` | The name of the index, for example, `my-index`. | +| `Operation` | The type of operation, for example, `shardbulk`. | +| `ShardRole` | The shard role, for example, `primary` or `replica`. | +| `Exception` | OpenSearch exceptions, for example, `org.opensearch.index_not_found_exception`. | +| `Indices` | The list of indexes in the request URL. | +| `HTTPRespCode` | The OpenSearch response code, for example, `200`. | +| `MemType` | The memory type, for example, `totYoungGC`, `totFullGC`, `Survivor`, `PermGen`, `OldGen`, `Eden`, `NonHeap`, or `Heap`. | +| `DiskName` | The name of the disk, for example, `sda1`. | +| `DestAddr` | The destination address, for example, `010015AC`. | +| `Direction` | The direction, for example, `in` or `out`. | +| `ThreadPoolType` | The OpenSearch thread pools, for example, `index`, `search`, or `snapshot`. | +| `CBType` | The circuit breaker type, for example, `accounting`, `fielddata`, `in_flight_requests`, `parent`, or `request`. | +| `ClusterManagerTaskInsertOrder`| The order in which the task was inserted, for example, `3691`. | +| `ClusterManagerTaskPriority` | The priority of the task, for example, `URGENT`. OpenSearch executes higher-priority tasks before lower-priority ones, regardless of `insert_order`. | +| `ClusterManagerTaskType` | The task type, for example, `shard-started`, `create-index`, `delete-index`, `refresh-mapping`, `put-mapping`, `CleanupSnapshotRestoreState`, or `Update snapshot state`. | +| `ClusterManagerTaskMetadata` | The metadata for the task (if any). | +| `CacheType` | The cache type, for example, `Field_Data_Cache`, `Shard_Request_Cache`, or `Node_Query_Cache`. 
| +| `NodeID` | The ID of the node. | +| `Searchbp_mode` | The search backpressure mode, for example, `monitor_only` (default), `enforced`, or `disabled`. | diff --git a/_observing-your-data/alerting/api.md b/_observing-your-data/alerting/api.md index a66573d450..6b95196b11 100644 --- a/_observing-your-data/alerting/api.md +++ b/_observing-your-data/alerting/api.md @@ -13,7 +13,7 @@ Use the Alerting API to programmatically create, update, and manage monitors and --- -
+
Table of contents diff --git a/_observing-your-data/alerting/composite-monitors.md b/_observing-your-data/alerting/composite-monitors.md index 80c95e677e..eb26dcc15e 100644 --- a/_observing-your-data/alerting/composite-monitors.md +++ b/_observing-your-data/alerting/composite-monitors.md @@ -13,7 +13,7 @@ redirect_from: --- -
+
Table of contents diff --git a/_observing-your-data/alerting/per-cluster-metrics-monitors.md b/_observing-your-data/alerting/per-cluster-metrics-monitors.md index 6c4da96eac..99984a16a5 100644 --- a/_observing-your-data/alerting/per-cluster-metrics-monitors.md +++ b/_observing-your-data/alerting/per-cluster-metrics-monitors.md @@ -9,7 +9,7 @@ has_children: false # Per cluster metrics monitors -Per cluster metrics monitors are a type of alert monitor that collects and analyzes metrics from a single cluster, providing insights into the cluster's performance and health. You can set alerts to monitor certain conditions, such as when: +_Per cluster metrics monitors_ are a type of alert monitor that collects and analyzes metrics from a single cluster, providing insights into the cluster's performance and health. You can set alerts to monitor certain conditions, such as when: - Cluster health reaches yellow or red status. - Cluster-level metrics---for example, CPU usage and JVM memory usage---reach specified thresholds. @@ -51,7 +51,7 @@ Trigger conditions use responses from the following API endpoints. Most APIs tha If you want to hide fields from the API response and not expose them for alerting, reconfigure the [supported_json_payloads.json](https://github.com/opensearch-project/alerting/blob/main/alerting/src/main/resources/org/opensearch/alerting/settings/supported_json_payloads.json) file inside the Alerting plugin. The file functions as an allow list for the API fields you want to use in an alert. By default, all APIs and their parameters can be used for monitors and trigger conditions. -However, you can modify the file so that cluster metric monitors can only be created for APIs referenced. Furthermore, only fields referenced in the supported files can create trigger conditions. This `supported_json_payloads.json` allows for a cluster metrics monitor to be created for the `_cluster/stats` API, and triggers conditions for the `indices.shards.total` and `indices.shards.index.shards.min` fields. +However, you can modify the file so that cluster metrics monitors can only be created for APIs referenced. Furthermore, only fields referenced in the supported files can create trigger conditions. This `supported_json_payloads.json` allows for a cluster metrics monitor to be created for the `_cluster/stats` API, and triggers conditions for the `indices.shards.total` and `indices.shards.index.shards.min` fields. ```json "/_cluster/stats": { @@ -68,7 +68,9 @@ Painless scripts define triggers for cluster metrics monitors, similar to per qu The cluster metrics monitor supports up to **ten** triggers. -In the following example, a JSON object creates a trigger that sends an alert when the cluster health is yellow. `script` points the `source` to the Painless script `ctx.results[0].status == \"yellow\`. +In the following example, the monitor is configured to call the Cluster Health API for two clusters, `cluster-1` and `cluster-2`. The trigger condition will create an alert when either of the clusters' `status` is not `green`. + +The `script` parameter points the `source` to the Painless script `for (cluster in ctx.results[0].keySet()) if (ctx.results[0][cluster].status != \"green\") return true`. See [Trigger variables]({{site.url}}{{site.baseurl}}/observing-your-data/alerting/triggers/#trigger-variables) for more `painless ctx` variable options. 
```json { @@ -88,7 +90,8 @@ In the following example, a JSON object creates a trigger that sends an alert wh "api_type": "CLUSTER_HEALTH", "path": "_cluster/health/", "path_params": "", - "url": "http://localhost:9200/_cluster/health/" + "url": "http://localhost:9200/_cluster/health/", + "cluster": ["cluster-1", "cluster-2"] } } ], @@ -100,7 +103,7 @@ In the following example, a JSON object creates a trigger that sends an alert wh "severity": "1", "condition": { "script": { - "source": "ctx.results[0].status == \"yellow\"", + "source": "for (cluster in ctx.results[0].keySet()) if (ctx.results[0][cluster].status != \"green\") return true", "lang": "painless" } }, @@ -110,14 +113,14 @@ In the following example, a JSON object creates a trigger that sends an alert wh ] } ``` +The dashboards interface supports the selection of clusters to be monitored and the desired API. A view of the interface is shown in the following image. -See [Trigger variables]({{site.url}}{{site.baseurl}}/observing-your-data/alerting/triggers/#trigger-variables) for more `painless ctx` variable options. +Cluster metrics monitor ### Limitations Per cluster metrics monitors have the following limitations: -- You cannot create monitors for remote clusters. - The OpenSearch cluster must be in a state where an index's conditions can be monitored and actions can be executed against the index. - Removing resource permissions from a user will not prevent that user’s preexisting monitors for that resource from executing. - Users with permissions to create monitors are not blocked from creating monitors for resources for which they do not have permissions; however, those monitors will not run. diff --git a/_observing-your-data/alerting/per-query-bucket-monitors.md b/_observing-your-data/alerting/per-query-bucket-monitors.md index 244cf92a60..d4fe0ff9d7 100644 --- a/_observing-your-data/alerting/per-query-bucket-monitors.md +++ b/_observing-your-data/alerting/per-query-bucket-monitors.md @@ -13,6 +13,10 @@ Per query monitors are a type of alert monitor that can be used to identify and Per bucket monitors are a type of alert monitor that can be used to identify and alert on specific buckets of data that are created by a query against an OpenSearch index. +Both monitor types support querying remote indexes using the same `cluster-name:index-name` pattern used by [cross-cluster search](https://opensearch.org/docs/latest/security/access-control/cross-cluster-search/) or by using OpenSearch Dashboards 2.12 or later. + +Cluster metrics monitor + ## Creating a per query or per bucket monitor To create a per query monitor, follow these steps: diff --git a/_observing-your-data/alerting/settings.md b/_observing-your-data/alerting/settings.md index 8ca2e8f917..c50636b3ae 100644 --- a/_observing-your-data/alerting/settings.md +++ b/_observing-your-data/alerting/settings.md @@ -54,6 +54,7 @@ Setting | Default | Description `plugins.alerting.alert_history_retention_period` | 60d | The amount of time to keep history indexes before automatically deleting them. `plugins.alerting.destination.allow_list` | ["chime", "slack", "custom_webhook", "email", "test_action"] | The list of allowed destinations. If you don't want to allow users to a certain type of destination, you can remove it from this list, but we recommend leaving this setting as-is. `plugins.alerting.filter_by_backend_roles` | "false" | Restricts access to monitors by backend role. See [Alerting security]({{site.url}}{{site.baseurl}}/monitoring-plugins/alerting/security/). 
+`plugins.alerting.remote_monitoring_enabled` | "false" | Toggles whether cluster metrics monitors support executing against remote clusters. `plugins.scheduled_jobs.sweeper.period` | 5m | The alerting feature uses its "job sweeper" component to periodically check for new or updated jobs. This setting is the rate at which the sweeper checks to see if any jobs (monitors) have changed and need to be rescheduled. `plugins.scheduled_jobs.sweeper.page_size` | 100 | The page size for the sweeper. You shouldn't need to change this value. `plugins.scheduled_jobs.sweeper.backoff_millis` | 50ms | The amount of time the sweeper waits between retries---increases exponentially after each failed retry. diff --git a/_observing-your-data/event-analytics.md b/_observing-your-data/event-analytics.md index a32445fdd4..b8fe72964c 100644 --- a/_observing-your-data/event-analytics.md +++ b/_observing-your-data/event-analytics.md @@ -8,15 +8,15 @@ redirect_from: # Event analytics -Event analytics in Observability is where you can use [Piped Processing Language]({{site.url}}{{site.baseurl}}/search-plugins/sql/ppl/index) (PPL) queries to build and view different visualizations of your data. +Event analytics in OpenSearch Observability allow you to create data visualizations using [Piped Processing Language]({{site.url}}{{site.baseurl}}/search-plugins/sql/ppl/index/) (PPL) queries. ## Getting started with event analytics -To get started, choose **Observability** in OpenSearch Dashboards and then choose **Event analytics**. If you want to start exploring without adding any of your own data, choose **Add samples**, and Dashboards adds sample visualizations you can interact with. +To get started, choose **Observability** in OpenSearch Dashboards and then choose **Logs**. If you want to start exploring without adding your own data, choose **Add samples**. Dashboards adds sample visualizations you can interact with. You can also try out preconfigured analytics in [OpenSearch Playground](https://playground.opensearch.org/app/observability-logs#/). ## Building a query -To generate custom visualizations, you must first specify a PPL query. OpenSearch Dashboards then automatically creates a visualization based on the results of your query. +To generate custom visualizations, you must first specify a PPL query. OpenSearch Dashboards then automatically creates a visualization based on your query results. For example, the following PPL query returns a count of how many host addresses are currently in your data. @@ -24,41 +24,64 @@ For example, the following PPL query returns a count of how many host addresses source = opensearch_dashboards_sample_data_logs | fields host | stats count() ``` -By default, Dashboards shows results from the last 15 minutes of your data. To see data from a different time frame, use the date and time selector. +By default, Dashboards shows results from the last 15 minutes of your data. To see data from a different time frame, use the date and time selector to choose the desired settings. -For more information about building PPL queries, see [Piped Processing Language]({{site.url}}{{site.baseurl}}/search-plugins/sql/ppl/index). +For more information about building PPL queries, see [Piped Processing Language]({{site.url}}{{site.baseurl}}/search-plugins/sql/ppl/index/). 
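If you prefer to run the same PPL outside of OpenSearch Dashboards, you can send it to the PPL REST endpoint provided by the SQL plugin. The following command is a minimal sketch of running the example query shown earlier in this section; it assumes that the sample web logs index is loaded, that the SQL plugin is installed, and that the Security plugin is enabled (replace the placeholder credentials with your own):

```bash
# Run the example PPL query against the PPL REST endpoint.
curl -k -u 'admin:<password>' -X POST "https://localhost:9200/_plugins/_ppl" \
  -H 'Content-Type: application/json' \
  -d '{"query": "source = opensearch_dashboards_sample_data_logs | fields host | stats count()"}'
```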
-## Saving a visualization +### OpenSearch Dashboards Query Assistant + +Note that machine learning models are probabilistic and that some may perform better than others, so the OpenSearch Assistant may occasionally produce inaccurate information. We recommend evaluating outputs for accuracy as appropriate to your use case, including reviewing the output or combining it with other verification factors. +{: .important} + +To simplify query building, the **OpenSearch Assistant** toolkit offers an assistant that converts natural language queries into PPL. A screenshot is shown in the following image. + +![Sample OpenSearch Query Assist screen view]({{site.url}}{{site.baseurl}}/images/log-explorer-query-assist.png) + +#### Enabling Query Assistant + +By default, **Query Assistant** is enabled in OpenSearch Dashboards. To enable summarization of responses, locate your copy of the `opensearch_dashboards.yml` file and set the following option: + +```yaml +observability.summarize.enabled: true +observability.summarize.response_summary_agent_name: "Response summary agent" +observability.summarize.error_summary_agent_name: "Error summary agent" +``` -After Dashboards generates a visualization, you must save it if you want to return to it at a later time or if you want to add it to an [operational panel]({{site.url}}{{site.baseurl}}/observing-your-data/operational-panels). +To disable Query Assistant, add `observability.query_assist.enabled: false` to your `opensearch_dashboards.yml`. -To save a visualization, expand the save dropdown menu next to **Refresh**, enter a name for your visualization, then choose **Save**. You can reopen any saved visualizations on the event analytics page. +#### Setting up Query Assistant + +To set up **Query Assistant**, follow the steps in the [Getting started guide](https://github.com/opensearch-project/dashboards-assistant/blob/main/GETTING_STARTED_GUIDE.md) on GitHub. This guide provides step-by-step setup instructions for **OpenSearch Assistant** and **Query Assistant**. To set up **Query Assistant** only, use the `query-assist-agent` template included in the guide. + +## Saving a visualization + +After Dashboards generates a visualization, save it if you want to revisit it or include it in an [operational panel]({{site.url}}{{site.baseurl}}/observing-your-data/operational-panels/). To save a visualization, expand the **Save** dropdown menu in the upper-right corner, enter a name for the visualization, and then select the **Save** button. You can reopen saved visualizations on the event analytics page. ## Creating event analytics visualizations and adding them to dashboards -This feature is available in OpenSearch Dashboards version 2.7 and later. It works with new visualizations created in version 2.7 or later that use PPL to query data from OpenSearch or federated data sources such as Prometheus. +This feature is available in OpenSearch Dashboards 2.7 and later. It works with new visualizations that use PPL to query data from OpenSearch or federated data sources such as Prometheus. {: .note} -Presenting your visualizations on a dashboard, instead of the event analytics page, makes it easier for users to understand and interpret the data at a glance. - To create a PPL visualization, follow these steps: 1. On the main menu, choose **Visualize** > **PPL**. -2. In the **Observability** > **Logs** > **Explorer** window, enter the index source in the **PPL query** field, for example, `source = opensearch_dashboards_sample_data_flights | stats count() by DestCountry`. 
You must enter the query using PPL syntax. +2. From the **Observability** > **Logs** > **Explorer** window, enter the index source in the **PPL query** field, for example, `source = opensearch_dashboards_sample_data_flights | stats count() by DestCountry`. You must enter the query using PPL syntax. 3. Set the time filter, for example, **This week**, and then select **Refresh**. -4. Choose the visualization type, for example, **Pie**, from the right sidebar dropdown menu. +4. Choose the visualization type, for example, **Pie**, from the sidebar dropdown menu to the right. 5. Select **Save** and enter a name for the visualization. -You've created a new visualization that can be added to a new or existing dashboard. To add a PPL query to a dashboard, follow these steps: +You've now created a new visualization that can be added to a new or existing dashboard. To add a PPL query to a dashboard, follow these steps: -1. Select **Dashboard** from the main menu. -2. In the **Dashboards** window, select **Create > Dashboard**. +1. Select **Dashboards** from the main menu. +2. In the **Dashboards** window, select **Create** > **Dashboard**. 3. In the **Editing New Dashboard** window, choose **Add an existing**. -4. In the **Add panels** window, choose **PPL** and select the visualization. It is now displayed on your dashboard. +4. In the **Add panels** window, choose **PPL** from the **Types** dropdown menu, and then select the visualization. It is now displayed on your dashboard. 5. Select **Save** and enter a name for the dashboard. -6. To add more visualizations to the dashboard, choose **Select existing visualization** and follow the steps above. Alternatively, choose **Create new** and then select **PPL** in the **New Visualization** window. You'll return to the event analytics page and follow steps 1--6 in the preceding instructions. +6. To add more visualizations to the dashboard, choose **Select existing visualization** and follow steps 1--5. Alternatively, choose **Create new** and then select **PPL** in the **New Visualization** window. You'll return to the event analytics page and follow steps 1--5 in the preceding instructions. + +The following demo provides an overview of creating event analytics visualizations and adding them to a dashboard. -![Demo of how to create event analytics visualizations and add them to a dashboard]({{site.url}}{{site.baseurl}}/images/dashboards/event-analytics-dashboard.gif) +![Demo of creating event analytics visualizations and adding them to a dashboard]({{site.url}}{{site.baseurl}}/images/dashboards/event-analytics-dashboard.gif) ### Limitations of event analytics visualizations @@ -73,20 +96,26 @@ The following are methods you can use to view logs. ### Correlating logs and traces -If you regularly track events across applications, you can correlate logs and traces. To view the correlation, you have to index the traces according to Open Telemetry standards (similar to trace analytics). Once you add a `TraceId` field to your logs, you can view the correlated trace information in the event explorer log details. This method lets you correlate logs and traces that correspond to the same execution context. +If you regularly track events across applications, you can correlate logs and traces. To view correlations, you must index the traces according to OpenTelemetry standards, similarly to [trace analytics]({{site.url}}{{site.baseurl}}/observing-your-data/trace/index/). 
Once you add a `TraceId` field to your logs, you can view the correlated trace information in the event explorer log details. This method correlates logs and traces that correspond to the same execution context. The following demo shows this feature in action. ![Trace Log Correlation]({{site.url}}{{site.baseurl}}/images/trace_log_correlation.gif) ### Viewing surrounding events -If you want to know more about a log event you're looking at, you can select **View surrounding events** to get a bigger picture of what was happening around the time of interest. +If you need more information about a log event, you can select **View surrounding events** to gain a more comprehensive understanding of the context around the time of interest. The following demo shows this feature in action. ![Surrounding Events]({{site.url}}{{site.baseurl}}/images/surrounding_events.gif) ### Livestreaming logs -If you prefer watching events happen live, you can configure an interval so event analytics automatically refreshes the content. Live tail lets you stream logs live to OpenSearch observability event analytics based on the provided PPL query, as well as provide rich functionality such as filters. Doing so improves your debugging experience and lets you monitor your logs in real-time without having to manually refresh. +If you prefer real-time monitoring, you can set up an interval at which event analytics content will be automatically refreshed. With Live Tail, you can stream logs directly to OpenSearch Observability event analytics using the specified PPL query while leveraging robust features like filters. This can enhance your debugging process and enables seamless real-time monitoring of logs without the need to manually refresh content. -You can also choose intervals and switch between them to dictate how often live tail should stream live logs. This feature is similar to the CLI's `tail -f` command in that it only retrieves the most recent live logs by possibly eliminating a large portion of live logs. Live tail also provides you with the total count of live logs received by OpenSearch during the live stream, which you can use to better understand the incoming traffic. +With Live Tail, you can select intervals and seamlessly switch between them to control the frequency of live log streaming. This functionality is similar to the `tail -f` CLI command, as it retrieves only the most recent live logs, potentially eliminating a significant portion of live logs. Live Tail displays the total number of live logs received by OpenSearch during the live stream, offering insight into incoming traffic patterns. The following demo shows this feature in action. 
![Live Tail]({{site.url}}{{site.baseurl}}/images/live_tail.gif) + +## Related articles + +- [Demonstrating the OpenSearch Assistant toolkit](https://www.youtube.com/watch?v=VTiJtGI2Sr4&t=152s) +- [Getting started guide for OpenSearch Assistant in OpenSearch Dashboards](https://github.com/opensearch-project/dashboards-assistant/blob/main/GETTING_STARTED_GUIDE.md) +- OpenSearch Assistant configuration through the REST API diff --git a/_observing-your-data/log-ingestion.md index 751f538c3c..61f427d30e 100644 --- a/_observing-your-data/log-ingestion.md +++ b/_observing-your-data/log-ingestion.md @@ -63,7 +63,7 @@ This should result in a single document being written to the OpenSearch cluster. Run the following command to see one of the raw documents in the OpenSearch cluster: ```bash -curl -X GET -u 'admin:admin' -k 'https://localhost:9200/apache_logs/_search?pretty&size=1' +curl -X GET -u 'admin:<custom-admin-password>' -k 'https://localhost:9200/apache_logs/_search?pretty&size=1' ``` The response should show the parsed log data: diff --git a/_observing-your-data/notifications/index.md index 6a8446699d..b352988c59 100644 --- a/_observing-your-data/notifications/index.md +++ b/_observing-your-data/notifications/index.md @@ -100,6 +100,9 @@ Then add this policy into the IAM user's trust relationship to actually assume } ``` +### Host deny list + +Define IP ranges or hostnames to which OpenSearch nodes should not initiate requests. ## Email as a channel type @@ -123,8 +126,8 @@ To send or receive notifications with email, choose **Email** as the channel typ If your email provider requires SSL or TLS, you must authenticate each sender account before you can send an email. Enter the sender account credentials in the OpenSearch keystore using the command line interface (CLI). Run the following commands (in your OpenSearch directory) to enter your user name and password. The <sender_name> is the name you entered for **Sender** earlier. ```json -opensearch.notifications.core.email.<sender_name>.username -opensearch.notifications.core.email.<sender_name>.password +/usr/share/opensearch/bin/opensearch-keystore add opensearch.notifications.core.email.<sender_name>.username +/usr/share/opensearch/bin/opensearch-keystore add opensearch.notifications.core.email.<sender_name>.password ``` To change or update your credentials (after you’ve added them to the keystore on every node), call the reload API to automatically update those credentials without restarting OpenSearch. diff --git a/_observing-your-data/query-insights/index.md new file mode 100644 index 0000000000..7bad169d1d --- /dev/null +++ b/_observing-your-data/query-insights/index.md @@ -0,0 +1,38 @@ +--- +layout: default +title: Query insights +nav_order: 40 +has_children: true +has_toc: false +--- + +# Query insights + +To monitor and analyze the search queries within your OpenSearch cluster, you can obtain query insights. With minimal performance impact, query insights features aim to provide comprehensive insights into search query execution, enabling you to better understand search query characteristics, patterns, and system behavior during query execution stages. Query insights facilitate enhanced detection, diagnosis, and prevention of query performance issues, ultimately improving query processing performance, user experience, and overall system resilience.
+ +Typical use cases for query insights features include the following: + +- Identifying top queries by latency within specific time frames +- Debugging slow search queries and latency spikes + +Query insights features are supported by the Query Insights plugin. At a high level, query insights features comprise the following components: + +* _Collectors_: Gather performance-related data points at various stages of search query execution. +* _Processors_: Perform lightweight aggregation and processing on data collected by the collectors. +* _Exporters_: Export the data into different sinks. + + +## Installing the Query Insights plugin + +You need to install the `query-insights` plugin to enable query insights features. To install the plugin, run the following command: + +```bash +bin/opensearch-plugin install query-insights +``` +For information about installing plugins, see [Installing plugins]({{site.url}}{{site.baseurl}}/install-and-configure/plugins/). + +## Query insights settings + +Query insights features support the following settings: + +- [Top n queries]({{site.url}}{{site.baseurl}}/observing-your-data/query-insights/top-n-queries/) diff --git a/_observing-your-data/query-insights/top-n-queries.md b/_observing-your-data/query-insights/top-n-queries.md new file mode 100644 index 0000000000..44469fa64b --- /dev/null +++ b/_observing-your-data/query-insights/top-n-queries.md @@ -0,0 +1,82 @@ +--- +layout: default +title: Top N queries +parent: Query insights +nav_order: 65 +--- + +# Top N queries + +Monitoring the top N queries in query insights features can help you gain real-time insights into the top queries with high latency within a certain time frame (for example, the last hour). + +## Getting started + +To enable monitoring of the top N queries, configure the following [dynamic settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index/#dynamic-settings): + +- `search.insights.top_queries.latency.enabled`: Set to `true` to [enable monitoring of the top N queries](#enabling-the-top-n-queries-feature). +- `search.insights.top_queries.latency.window_size`: [Configure the window size](#configuring-window-size). +- `search.insights.top_queries.latency.top_n_size`: [Specify the value of n](#configuring-the-value-of-n). + +It's important to exercise caution when enabling this feature because it can consume system resources. +{: .important} + + +For detailed information about enabling and configuring this feature, see the following sections. + +## Enabling the top N queries feature + +After installing the `query-insights` plugin, you can enable the top N queries feature (which is disabled by default) by using the following dynamic setting. This setting enables the corresponding collectors and aggregators in the running cluster: + +```json +PUT _cluster/settings +{ + "persistent" : { + "search.insights.top_queries.latency.enabled" : true + } +} +``` +{% include copy-curl.html %} + +## Configuring window size + +You can configure the window size for the top N queries by latency with `search.insights.top_queries.latency.window_size`. For example, a cluster with the following configuration will collect top N queries in a 60-minute window: + +```json +PUT _cluster/settings +{ + "persistent" : { + "search.insights.top_queries.latency.window_size" : "60m" + } +} +``` +{% include copy-curl.html %} + +## Configuring the value of N + +You can configure the value of N in the `search.insights.top_queries.latency.top_n_size` parameter. 
For example, a cluster with the following configuration will collect the top 10 queries in the specified window size: + +``` +PUT _cluster/settings +{ + "persistent" : { + "search.insights.top_queries.latency.top_n_size" : 10 + } +} +``` +{% include copy-curl.html %} + +## Monitoring the top N queries + +You can use the Insights API endpoint to obtain top N queries by latency: + +```json +GET /_insights/top_queries +``` +{% include copy-curl.html %} + +Specify a metric type to filter the response by metric type (latency is the only supported type as of 2.12): + +```json +GET /_insights/top_queries?type=latency +``` +{% include copy-curl.html %} \ No newline at end of file diff --git a/_observing-your-data/trace/getting-started.md b/_observing-your-data/trace/getting-started.md index b146b42a0e..d1bffb7050 100644 --- a/_observing-your-data/trace/getting-started.md +++ b/_observing-your-data/trace/getting-started.md @@ -76,7 +76,7 @@ node-0.example.com | [2020-11-19T16:29:55,267][INFO ][o.e.c.m.MetadataMappingSe In a new terminal window, run the following command to see one of the raw documents in the OpenSearch cluster: ```bash -curl -X GET -u 'admin:admin' -k 'https://localhost:9200/otel-v1-apm-span-000001/_search?pretty&size=1' +curl -X GET -u 'admin:' -k 'https://localhost:9200/otel-v1-apm-span-000001/_search?pretty&size=1' ``` Navigate to `http://localhost:5601` in a web browser and choose **Trace Analytics**. You can see the results of your single click in the Jaeger HotROD web interface: the number of traces per API and HTTP method, latency trends, a color-coded map of the service architecture, and a list of trace IDs that you can use to drill down on individual operations. diff --git a/_plugins/link-checker.rb b/_plugins/link-checker.rb index ec072596b7..9eb9c48bc9 100644 --- a/_plugins/link-checker.rb +++ b/_plugins/link-checker.rb @@ -64,6 +64,7 @@ class CheckTypes 'crates.io', # 404s on bots 'www.cloudflare.com', # 403s on bots 'platform.openai.com', # 403s on bots + 'mvnrepository.com', # 403s on bots 'example.issue.link' # a fake example link from the template ] diff --git a/_query-dsl/full-text/intervals.md b/_query-dsl/full-text/intervals.md index dd401933b5..082f8fbe46 100644 --- a/_query-dsl/full-text/intervals.md +++ b/_query-dsl/full-text/intervals.md @@ -73,7 +73,7 @@ GET /testindex/_search The query returns both documents: -
+
Response @@ -262,7 +262,7 @@ POST /testindex/_search The response contains only document 2: -
+
Response @@ -332,7 +332,7 @@ POST /testindex/_search The response contains document 2: -
+
Response diff --git a/_query-dsl/full-text/match-bool-prefix.md b/_query-dsl/full-text/match-bool-prefix.md index 8ab0bf5002..3a0d304ce4 100644 --- a/_query-dsl/full-text/match-bool-prefix.md +++ b/_query-dsl/full-text/match-bool-prefix.md @@ -94,7 +94,7 @@ GET testindex/_search The response contains both documents: -
+
Response diff --git a/_query-dsl/full-text/match-phrase-prefix.md b/_query-dsl/full-text/match-phrase-prefix.md index f9316d8990..354dd35c61 100644 --- a/_query-dsl/full-text/match-phrase-prefix.md +++ b/_query-dsl/full-text/match-phrase-prefix.md @@ -82,7 +82,7 @@ GET testindex/_search The response contains the matching document: -
+
Response diff --git a/_query-dsl/full-text/match-phrase.md b/_query-dsl/full-text/match-phrase.md index 61b9cf1825..18dd6a858c 100644 --- a/_query-dsl/full-text/match-phrase.md +++ b/_query-dsl/full-text/match-phrase.md @@ -80,7 +80,7 @@ GET testindex/_search The response contains the matching document: -
+
Response @@ -139,7 +139,7 @@ GET testindex/_search The `english` analyzer removes the stopword `the` and performs stemming, producing the token `wind`. Both documents match this token and are returned in the results: -
+
Response @@ -204,7 +204,7 @@ GET _search The query still returns the matching document: -
+
Response diff --git a/_query-dsl/full-text/match.md b/_query-dsl/full-text/match.md index f0ae5d6c7b..746a4cf5b6 100644 --- a/_query-dsl/full-text/match.md +++ b/_query-dsl/full-text/match.md @@ -91,7 +91,7 @@ GET testindex/_search The query is constructed as `wind AND rise` and returns document 1 as the matching document: -
+
Response @@ -151,7 +151,7 @@ GET testindex/_search Now documents are required to match both terms, so only document 1 is returned (this is equivalent to the `and` operator): -
+
Response @@ -210,7 +210,7 @@ GET testindex/_search The `english` analyzer removes the stopword `the` and performs stemming, producing the tokens `wind` and `rise`. The latter token matches document 1, which is returned in the results: -
+
Response @@ -315,7 +315,7 @@ GET testindex/_search The token `wnid` matches `wind` and the query returns documents 1 and 2: -
+
Response diff --git a/_query-dsl/full-text/multi-match.md b/_query-dsl/full-text/multi-match.md index 6dda1a797b..7450b74721 100644 --- a/_query-dsl/full-text/multi-match.md +++ b/_query-dsl/full-text/multi-match.md @@ -480,7 +480,7 @@ The `cross_fields` query only works as a term-centric query on fields with the s For example, consider an index where the `first_name` and `last_name` fields are analyzed with the default `standard` analyzer and their `.edge` subfields are analyzed with an edge n-gram analyzer: -
+
Response @@ -713,7 +713,7 @@ GET articles/_search Because by default a `phrase` query matches text only when the terms appear in the same order, only document 1 is returned in the results: -
+
Response @@ -770,7 +770,7 @@ GET articles/_search The response contains document 2: -
+
Response diff --git a/_query-dsl/full-text/query-string.md b/_query-dsl/full-text/query-string.md index f98655bacf..12609e29c0 100644 --- a/_query-dsl/full-text/query-string.md +++ b/_query-dsl/full-text/query-string.md @@ -50,7 +50,7 @@ You can use query string syntax in the following cases: GET _search?q=wind ``` -A query string consists of _terms_ and _operators_. A term is a single word (for example, in the query `wind rises`, the terms are `wind` and `rises`). If several terms are surrounded by quotation marks, they are treated as one phrase where words are marched in the order they appear (for example, `"wind rises"`). Operators (such as `OR`, `AND`, and `NOT`) specify the Boolean logic used to interpret text in the query string. +A query string consists of _terms_ and _operators_. A term is a single word (for example, in the query `wind rises`, the terms are `wind` and `rises`). If several terms are surrounded by quotation marks, they are treated as one phrase where words are matched in the order they appear (for example, `"wind rises"`). Operators (such as `OR`, `AND`, and `NOT`) specify the Boolean logic used to interpret text in the query string. The examples in this section use an index containing the following mapping and documents: @@ -645,4 +645,4 @@ Parameter | Data type | Description `time_zone` | String | Specifies the number of hours to offset the desired time zone from `UTC`. You need to indicate the time zone offset number if the query string contains a date range. For example, set `time_zone": "-08:00"` for a query with a date range such as `"query": "wind rises release_date[2012-01-01 TO 2014-01-01]"`). The default time zone format used to specify number of offset hours is `UTC`. Query string queries may be internally converted into [prefix queries]({{site.url}}{{site.baseurl}}/query-dsl/term/prefix/). If [`search.allow_expensive_queries`]({{site.url}}{{site.baseurl}}/query-dsl/index/#expensive-queries) is set to `false`, prefix queries are not executed. If `index_prefixes` is enabled, the `search.allow_expensive_queries` setting is ignored and an optimized query is built and executed. -{: .important} \ No newline at end of file +{: .important} diff --git a/_query-dsl/full-text/simple-query-string.md b/_query-dsl/full-text/simple-query-string.md index f84735419a..fbf37f588d 100644 --- a/_query-dsl/full-text/simple-query-string.md +++ b/_query-dsl/full-text/simple-query-string.md @@ -109,7 +109,7 @@ GET /customers/_search However, the results include not only the expected document, but all four documents: -
+
Response @@ -213,7 +213,7 @@ GET /customers/_search The preceding query returns document 2: -
+
Response diff --git a/_query-dsl/index.md b/_query-dsl/index.md index 01bd126774..bf75d89936 100644 --- a/_query-dsl/index.md +++ b/_query-dsl/index.md @@ -109,5 +109,5 @@ PUT _cluster/settings ``` {% include copy-curl.html %} -To track expensive queries, enable [slow logs]({{site.url}}{{site.baseurl}}/monitoring-your-cluster/logs/#slow-logs). +To track expensive queries, enable [shard slow logs]({{site.url}}{{site.baseurl}}/monitoring-your-cluster/logs/#shard-slow-logs). {: .tip} \ No newline at end of file diff --git a/_query-dsl/minimum-should-match.md b/_query-dsl/minimum-should-match.md index 9ec65431b1..e2032b8911 100644 --- a/_query-dsl/minimum-should-match.md +++ b/_query-dsl/minimum-should-match.md @@ -26,7 +26,7 @@ GET /shakespeare/_search } ``` -In this example, the query has three optional clauses that are combined with an `OR`, so the document must match either `prince`, `king`, or `star`. +In this example, the query has three optional clauses that are combined with an `OR`, so the document must match either `prince` and `king`, or `prince` and `star`, or `king` and `star`. ## Valid values @@ -448,4 +448,4 @@ The results contain only four documents that match at least one of the optional ] } } -``` \ No newline at end of file +``` diff --git a/_query-dsl/specialized/neural-sparse.md b/_query-dsl/specialized/neural-sparse.md index c91c491dcf..70fcfd892c 100644 --- a/_query-dsl/specialized/neural-sparse.md +++ b/_query-dsl/specialized/neural-sparse.md @@ -20,8 +20,7 @@ Include the following request fields in the `neural_sparse` query: "neural_sparse": { "": { "query_text": "", - "model_id": "", - "max_token_score": "" + "model_id": "" } } ``` @@ -32,7 +31,7 @@ Field | Data type | Required/Optional | Description :--- | :--- | :--- `query_text` | String | Required | The query text from which to generate vector embeddings. `model_id` | String | Required | The ID of the sparse encoding model or tokenizer model that will be used to generate vector embeddings from the query text. The model must be deployed in OpenSearch before it can be used in sparse neural search. For more information, see [Using custom models within OpenSearch]({{site.url}}{{site.baseurl}}/ml-commons-plugin/using-ml-models/) and [Neural sparse search]({{site.url}}{{site.baseurl}}/search-plugins/neural-sparse-search/). -`max_token_score` | Float | Optional | The theoretical upper bound of the score for all tokens in the vocabulary (required for performance optimization). For OpenSearch-provided [pretrained sparse embedding models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/pretrained-models/#sparse-encoding-models), we recommend setting `max_token_score` to 2 for `amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v1` and to 3.5 for `amazon/neural-sparse/opensearch-neural-sparse-encoding-v1`. +`max_token_score` | Float | Optional | (Deprecated) The theoretical upper bound of the score for all tokens in the vocabulary (required for performance optimization). For OpenSearch-provided [pretrained sparse embedding models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/pretrained-models/#sparse-encoding-models), we recommend setting `max_token_score` to 2 for `amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v1` and to 3.5 for `amazon/neural-sparse/opensearch-neural-sparse-encoding-v1`. This field has been deprecated as of OpenSearch 2.12. 
#### Example request @@ -43,8 +42,7 @@ GET my-nlp-index/_search "neural_sparse": { "passage_embedding": { "query_text": "Hi world", - "model_id": "aP2Q8ooBpBj3wT4HVS8a", - "max_token_score": 2 + "model_id": "aP2Q8ooBpBj3wT4HVS8a" } } } diff --git a/_query-dsl/specialized/neural.md b/_query-dsl/specialized/neural.md index b985fb3b3e..a4b2ad7f36 100644 --- a/_query-dsl/specialized/neural.md +++ b/_query-dsl/specialized/neural.md @@ -33,6 +33,7 @@ Field | Data type | Required/Optional | Description `query_image` | String | Optional | A base-64 encoded string that corresponds to the query image from which to generate vector embeddings. You must specify at least one `query_text` or `query_image`. `model_id` | String | Required if the default model ID is not set. For more information, see [Setting a default model on an index or field]({{site.url}}{{site.baseurl}}/search-plugins/neural-text-search/#setting-a-default-model-on-an-index-or-field). | The ID of the model that will be used to generate vector embeddings from the query text. The model must be deployed in OpenSearch before it can be used in neural search. For more information, see [Using custom models within OpenSearch]({{site.url}}{{site.baseurl}}/ml-commons-plugin/using-ml-models/) and [Neural search]({{site.url}}{{site.baseurl}}/search-plugins/neural-search/). `k` | Integer | Optional | The number of results returned by the k-NN search. Default is 10. +`filter` | Object | Optional | A query that can be used to reduce the number of documents considered. For more information about filter usage, see [k-NN search with filters]({{site.url}}{{site.baseurl}}/search-plugins/knn/filter-search-knn/). **Important**: Filter can only be used with the `faiss` or `lucene` engines. #### Example request @@ -44,7 +45,26 @@ GET /my-nlp-index/_search "passage_embedding": { "query_text": "Hi world", "query_image": "iVBORw0KGgoAAAAN...", - "k": 100 + "k": 100, + "filter": { + "bool": { + "must": [ + { + "range": { + "rating": { + "gte": 8, + "lte": 10 + } + } + }, + { + "term": { + "parking": "true" + } + } + ] + } + } } } } diff --git a/_query-dsl/term-vs-full-text.md b/_query-dsl/term-vs-full-text.md index 0bae2fb4a4..e5019c4eb2 100644 --- a/_query-dsl/term-vs-full-text.md +++ b/_query-dsl/term-vs-full-text.md @@ -8,7 +8,7 @@ redirect_from: # Term-level and full-text queries compared -You can use both term-level and full-text queries to search text, but while term-level queries are usually used to search structured data, full-text queries are used for full-text search. The main difference between term-level and full-text queries is that term-level queries search documents for an exact specified term, while full-text queries analyze the query string. The following table summarizes the differences between term-level and full-text queries. +You can use both term-level and full-text queries to search text, but while term-level queries are usually used to search structured data, full-text queries are used for full-text search. The main difference between term-level and full-text queries is that term-level queries search documents for an exact specified term, while full-text queries [analyze]({{site.url}}{{site.baseurl}}/analyzers/) the query string. The following table summarizes the differences between term-level and full-text queries. 
| | Term-level queries | Full-text queries :--- | :--- | :--- diff --git a/_query-dsl/term/exists.md b/_query-dsl/term/exists.md index a62dda981b..1d52744c91 100644 --- a/_query-dsl/term/exists.md +++ b/_query-dsl/term/exists.md @@ -146,4 +146,8 @@ The response contains the matching document: ## Parameters -The query accepts the name of the field (``) as a top-level parameter. \ No newline at end of file +The query accepts the name of the field (``) as a top-level parameter. + +Parameter | Data type | Description +:--- | :--- | :--- +`boost` | Floating-point | A floating-point value that specifies the weight of this field toward the relevance score. Values above 1.0 increase the field’s relevance. Values between 0.0 and 1.0 decrease the field’s relevance. Default is 1.0. diff --git a/_query-dsl/term/fuzzy.md b/_query-dsl/term/fuzzy.md index f5a9773aeb..9afa85ea93 100644 --- a/_query-dsl/term/fuzzy.md +++ b/_query-dsl/term/fuzzy.md @@ -67,7 +67,7 @@ GET _search "fuzzy": { "": { "value": "sample", - ... + ... } } } @@ -80,11 +80,12 @@ The `` accepts the following parameters. All parameters except `value` ar Parameter | Data type | Description :--- | :--- | :--- `value` | String | The term to search for in the field specified in ``. +`boost` | Floating-point | A floating-point value that specifies the weight of this field toward the relevance score. Values above 1.0 increase the field’s relevance. Values between 0.0 and 1.0 decrease the field’s relevance. Default is 1.0. `fuzziness` | `AUTO`, `0`, or a positive integer | The number of character edits (insert, delete, substitute) needed to change one word to another when determining whether a term matched a value. For example, the distance between `wined` and `wind` is 1. The default, `AUTO`, chooses a value based on the length of each term and is a good choice for most use cases. `max_expansions` | Positive integer | The maximum number of terms to which the query can expand. Fuzzy queries “expand to” a number of matching terms that are within the distance specified in `fuzziness`. Then OpenSearch tries to match those terms. Default is `50`. `prefix_length` | Non-negative integer | The number of leading characters that are not considered in fuzziness. Default is `0`. `rewrite` | String | Determines how OpenSearch rewrites and scores multi-term queries. Valid values are `constant_score`, `scoring_boolean`, `constant_score_boolean`, `top_terms_N`, `top_terms_boost_N`, and `top_terms_blended_freqs_N`. Default is `constant_score`. -`transpositions` | Boolean | Specifies whether to allow transpositions of two adjacent characters (`ab` to `ba`) as edits. Default is `true`. +`transpositions` | Boolean | Specifies whether to allow transpositions of two adjacent characters (`ab` to `ba`) as edits. Default is `true`. Specifying a large value in `max_expansions` can lead to poor performance, especially if `prefix_length` is set to `0`, because of the large number of variations of the word that OpenSearch tries to match. {: .warning} diff --git a/_query-dsl/term/ids.md b/_query-dsl/term/ids.md index a1a098f586..0c3b5393fb 100644 --- a/_query-dsl/term/ids.md +++ b/_query-dsl/term/ids.md @@ -32,3 +32,4 @@ The query accepts the following parameter. Parameter | Data type | Description :--- | :--- | :--- `values` | Array of strings | The document IDs to search for. Required. +`boost` | Floating-point | A floating-point value that specifies the weight of this field toward the relevance score. Values above 1.0 increase the field’s relevance. 
Values between 0.0 and 1.0 decrease the field’s relevance. Default is 1.0. diff --git a/_query-dsl/term/prefix.md b/_query-dsl/term/prefix.md index 14f208f3c5..eda5307d14 100644 --- a/_query-dsl/term/prefix.md +++ b/_query-dsl/term/prefix.md @@ -50,7 +50,7 @@ GET _search "prefix": { "": { "value": "sample", - ... + ... } } } @@ -63,8 +63,9 @@ The `` accepts the following parameters. All parameters except `value` ar Parameter | Data type | Description :--- | :--- | :--- `value` | String | The term to search for in the field specified in ``. +`boost` | Floating-point | A floating-point value that specifies the weight of this field toward the relevance score. Values above 1.0 increase the field’s relevance. Values between 0.0 and 1.0 decrease the field’s relevance. Default is 1.0. `case_insensitive` | Boolean | If `true`, allows case-insensitive matching of the value with the indexed field values. Default is `false` (case sensitivity is determined by the field's mapping). `rewrite` | String | Determines how OpenSearch rewrites and scores multi-term queries. Valid values are `constant_score`, `scoring_boolean`, `constant_score_boolean`, `top_terms_N`, `top_terms_boost_N`, and `top_terms_blended_freqs_N`. Default is `constant_score`. If [`search.allow_expensive_queries`]({{site.url}}{{site.baseurl}}/query-dsl/index/#expensive-queries) is set to `false`, prefix queries are not run. If `index_prefixes` is enabled, the `search.allow_expensive_queries` setting is ignored and an optimized query is built and run. -{: .important} \ No newline at end of file +{: .important} diff --git a/_query-dsl/term/range.md b/_query-dsl/term/range.md index 076ab5ad15..8a8f53c480 100644 --- a/_query-dsl/term/range.md +++ b/_query-dsl/term/range.md @@ -90,7 +90,7 @@ OpenSearch populates missing date components with the following values: - `SECOND_OF_MINUTE`: `59` - `NANO_OF_SECOND`: `999_999_999` -If the year is missing, it is not populated. +If the year is missing, it is not populated. For example, consider the following request that specifies only the year in the start date: @@ -131,7 +131,7 @@ GET products/_search ``` {% include copy-curl.html %} -In the preceding example, `2019/01/01` is the anchor date (the starting point) for the date math. After the two pipe characters (`||`), you are specifying a mathematical expression relative to the anchor date. In this example, you are subtracting 1 year (`-1y`) and 1 day (`-1d`). +In the preceding example, `2019/01/01` is the anchor date (the starting point) for the date math. After the two pipe characters (`||`), you are specifying a mathematical expression relative to the anchor date. In this example, you are subtracting 1 year (`-1y`) and 1 day (`-1d`). You can also round off dates by adding a forward slash to the date or time unit. @@ -175,8 +175,8 @@ GET /products/_search "query": { "range": { "created": { - "time_zone": "-04:00", - "gte": "2022-04-17T06:00:00" + "time_zone": "-04:00", + "gte": "2022-04-17T06:00:00" } } } @@ -184,7 +184,7 @@ GET /products/_search ``` {% include copy-curl.html %} -The `gte` parameter in the preceding query is converted to `2022-04-17T10:00:00 UTC`, which is the UTC equivalent of `2022-04-17T06:00:00-04:00`. +The `gte` parameter in the preceding query is converted to `2022-04-17T10:00:00 UTC`, which is the UTC equivalent of `2022-04-17T06:00:00-04:00`. The `time_zone` parameter does not affect the `now` value because `now` always corresponds to the current system time in UTC. 
{: .note} @@ -200,7 +200,7 @@ GET _search "range": { "": { "gt": 10, - ... + ... } } } @@ -215,7 +215,7 @@ Parameter | Data type | Description :--- | :--- | :--- `format` | String | A [format]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/date/#formats) for dates in this query. Default is the field's mapped format. `relation` | String | Indicates how the range query matches values for [`range`]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/range/) fields. Valid values are:
- `INTERSECTS` (default): Matches documents whose `range` field value intersects the range provided in the query.
- `CONTAINS`: Matches documents whose `range` field value contains the entire range provided in the query.
- `WITHIN`: Matches documents whose `range` field value is entirely within the range provided in the query. -`boost` | Floating-point | Boosts the query by the given multiplier. Useful for searches that contain more than one query. Values in the [0, 1) range decrease relevance, and values greater than 1 increase relevance. Default is `1`. +`boost` | Floating-point | A floating-point value that specifies the weight of this field toward the relevance score. Values above 1.0 increase the field’s relevance. Values between 0.0 and 1.0 decrease the field’s relevance. Default is 1.0. `time_zone` | String | The time zone used to convert [`date`]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/date/) values to UTC in the query. Valid values are ISO 8601 [UTC offsets](https://en.wikipedia.org/wiki/List_of_UTC_offsets) and [IANA time zone IDs](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones). For more information, see [Time zone](#time-zone). If [`search.allow_expensive_queries`]({{site.url}}{{site.baseurl}}/query-dsl/index/#expensive-queries) is set to `false`, range queries on [`text`]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/text/) and [`keyword`]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/keyword/) fields are not run. diff --git a/_query-dsl/term/regexp.md b/_query-dsl/term/regexp.md index 31bc6460aa..65d6953516 100644 --- a/_query-dsl/term/regexp.md +++ b/_query-dsl/term/regexp.md @@ -43,7 +43,7 @@ GET _search "regexp": { "": { "value": "[Ss]ample", - ... + ... } } } @@ -56,6 +56,7 @@ The `` accepts the following parameters. All parameters except `value` ar Parameter | Data type | Description :--- | :--- | :--- `value` | String | The regular expression used for matching terms in the field specified in ``. +`boost` | Floating-point | A floating-point value that specifies the weight of this field toward the relevance score. Values above 1.0 increase the field’s relevance. Values between 0.0 and 1.0 decrease the field’s relevance. Default is 1.0. `case_insensitive` | Boolean | If `true`, allows case-insensitive matching of the regular expression value with the indexed field values. Default is `false` (case sensitivity is determined by the field's mapping). `flags` | String | Enables optional operators for Lucene’s regular expression engine. `max_determinized_states` | Integer | Lucene converts a regular expression to an automaton with a number of determinized states. This parameter specifies the maximum number of automaton states the query requires. Use this parameter to prevent high resource consumption. To run complex regular expressions, you may need to increase the value of this parameter. Default is 10,000. @@ -63,4 +64,3 @@ Parameter | Data type | Description If [`search.allow_expensive_queries`]({{site.url}}{{site.baseurl}}/query-dsl/index/#expensive-queries) is set to `false`, `regexp` queries are not run. {: .important} - diff --git a/_query-dsl/term/term.md b/_query-dsl/term/term.md index 20694fb455..c1c296b9a0 100644 --- a/_query-dsl/term/term.md +++ b/_query-dsl/term/term.md @@ -82,7 +82,7 @@ GET _search "term": { "": { "value": "sample", - ... + ... } } } @@ -95,5 +95,5 @@ The `` accepts the following parameters. All parameters except `value` ar Parameter | Data type | Description :--- | :--- | :--- `value` | String | The term to search for in the field specified in ``. A document is returned in the results only if its field value exactly matches the term, with the correct spacing and capitalization. 
-`boost` | Floating-point | Boosts the query by the given multiplier. Useful for searches that contain more than one query. Values in the [0, 1) range decrease relevance, and values greater than 1 increase relevance. Default is `1`. -`case_insensitive` | Boolean | If `true`, allows case-insensitive matching of the value with the indexed field values. Default is `false` (case sensitivity is determined by the field's mapping). \ No newline at end of file +`boost` | Floating-point | A floating-point value that specifies the weight of this field toward the relevance score. Values above 1.0 increase the field’s relevance. Values between 0.0 and 1.0 decrease the field’s relevance. Default is 1.0. +`case_insensitive` | Boolean | If `true`, allows case-insensitive matching of the value with the indexed field values. Default is `false` (case sensitivity is determined by the field's mapping). diff --git a/_query-dsl/term/terms-set.md b/_query-dsl/term/terms-set.md index 452cae66b2..ea0251ddff 100644 --- a/_query-dsl/term/terms-set.md +++ b/_query-dsl/term/terms-set.md @@ -153,7 +153,7 @@ GET _search "terms_set": { "": { "terms": [ "term1", "term2" ], - ... + ... } } } @@ -167,4 +167,5 @@ Parameter | Data type | Description :--- | :--- | :--- `terms` | Array of strings | The array of terms to search for in the field specified in ``. A document is returned in the results only if the required number of terms matches the document's field values exactly, with the correct spacing and capitalization. `minimum_should_match_field` | String | The name of the [numeric]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/numeric/) field that specifies the number of matching terms required in order to return a document in the results. -`minimum_should_match_script` | String | A script that returns the number of matching terms required in order to return a document in the results. \ No newline at end of file +`minimum_should_match_script` | String | A script that returns the number of matching terms required in order to return a document in the results. +`boost` | Floating-point | A floating-point value that specifies the weight of this field toward the relevance score. Values above 1.0 increase the field’s relevance. Values between 0.0 and 1.0 decrease the field’s relevance. Default is 1.0. diff --git a/_query-dsl/term/terms.md b/_query-dsl/term/terms.md index 5e5524e6a3..fd15126255 100644 --- a/_query-dsl/term/terms.md +++ b/_query-dsl/term/terms.md @@ -39,7 +39,7 @@ The query accepts the following parameters. All parameters are optional. Parameter | Data type | Description :--- | :--- | :--- `` | String | The field in which to search. A document is returned in the results only if its field value exactly matches at least one term, with the correct spacing and capitalization. -`boost` | Floating-point | Boosts the query by the given multiplier. Useful for searches that contain more than one query. Values in the [0, 1) range decrease relevance, and values greater than 1 increase relevance. Default is `1`. +`boost` | Floating-point | A floating-point value that specifies the weight of this field toward the relevance score. Values above 1.0 increase the field’s relevance. Values between 0.0 and 1.0 decrease the field’s relevance. Default is 1.0. ## Terms lookup @@ -249,4 +249,5 @@ Parameter | Data type | Description `index` | String | The name of the index in which to fetch field values. Required. `id` | String | The document ID of the document from which to fetch field values. Required. 
`path` | String | The name of the field from which to fetch field values. Specify nested fields using dot path notation. Required. -`routing` | String | Custom routing value of the document from which to fetch field values. Optional. Required if a custom routing value was provided when the document was indexed. \ No newline at end of file +`routing` | String | Custom routing value of the document from which to fetch field values. Optional. Required if a custom routing value was provided when the document was indexed. +`boost` | Floating-point | A floating-point value that specifies the weight of this field toward the relevance score. Values above 1.0 increase the field’s relevance. Values between 0.0 and 1.0 decrease the field’s relevance. Default is 1.0. diff --git a/_query-dsl/term/wildcard.md b/_query-dsl/term/wildcard.md index 897ab6ed0f..0652581941 100644 --- a/_query-dsl/term/wildcard.md +++ b/_query-dsl/term/wildcard.md @@ -61,7 +61,7 @@ The `` accepts the following parameters. All parameters except `value` ar Parameter | Data type | Description :--- | :--- | :--- `value` | String | The wildcard pattern used for matching terms in the field specified in ``. -`boost` | Floating-point | Boosts the query by the given multiplier. Useful for searches that contain more than one query. Values in the [0, 1) range decrease relevance, and values greater than 1 increase relevance. Default is `1`. +`boost` | Floating-point | A floating-point value that specifies the weight of this field toward the relevance score. Values above 1.0 increase the field’s relevance. Values between 0.0 and 1.0 decrease the field’s relevance. Default is 1.0. `case_insensitive` | Boolean | If `true`, allows case-insensitive matching of the value with the indexed field values. Default is `false` (case sensitivity is determined by the field's mapping). `rewrite` | String | Determines how OpenSearch rewrites and scores multi-term queries. Valid values are `constant_score`, `scoring_boolean`, `constant_score_boolean`, `top_terms_N`, `top_terms_boost_N`, and `top_terms_blended_freqs_N`. Default is `constant_score`. diff --git a/_reporting/rep-cli-env-var.md b/_reporting/rep-cli-env-var.md index a4e079501d..0c80c81ca5 100644 --- a/_reporting/rep-cli-env-var.md +++ b/_reporting/rep-cli-env-var.md @@ -30,7 +30,7 @@ Values from the command line argument have higher priority than the environment The following command requests a report with basic authentication in PNG format: ``` -opensearch-reporting-cli --url https://localhost:5601/app/dashboards#/view/7adfa750-4c81-11e8-b3d7-01146121b73d --format png --auth basic --credentials admin:admin +opensearch-reporting-cli --url https://localhost:5601/app/dashboards#/view/7adfa750-4c81-11e8-b3d7-01146121b73d --format png --auth basic --credentials admin: ``` Upon success, the report will download to the current directory. 
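The term-level query parameter tables above describe `value`, `boost`, `case_insensitive`, and `rewrite` individually but do not show them combined in a single clause. The following is a minimal sketch of a `wildcard` query that uses all four parameters together; the index name `products` and the field name `title` are placeholders rather than values taken from the documentation:

```json
GET /products/_search
{
  "query": {
    "wildcard": {
      "title": {
        "value": "sampl*",
        "boost": 1.2,
        "case_insensitive": true,
        "rewrite": "constant_score"
      }
    }
  }
}
```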
diff --git a/_sass/_home.scss b/_sass/_home.scss index a529aa910c..9b5dd864a9 100644 --- a/_sass/_home.scss +++ b/_sass/_home.scss @@ -61,7 +61,6 @@ color: $blue-vibrant-300; } > .last-link { - transform: translateY(-1px); text-decoration: underline; text-underline-offset: 3px; } @@ -145,7 +144,7 @@ } .banner-text-header { - font-size: 2.3rem; + font-size: 2.1rem; font-family: "Open Sans"; font-weight: 300; @include mq(md) { @@ -154,12 +153,12 @@ } .banner-text-description { - font-size: 1.3rem; + font-size: 1rem; font-weight: 700; font-style: normal; line-height: 2.25rem; @include mq(md) { - font-size: 1rem; + font-size: 1.3rem; } } diff --git a/_sass/custom/custom.scss b/_sass/custom/custom.scss index 2f7d6dc5ed..0f1c549504 100755 --- a/_sass/custom/custom.scss +++ b/_sass/custom/custom.scss @@ -153,6 +153,12 @@ img { @extend .panel; } +.img-centered { + max-width: 100%; + margin: 0 auto; + display: block; +} + .no-border { border: none; box-shadow: none; diff --git a/_search-plugins/async/index.md b/_search-plugins/async/index.md index a663e7faf9..099279ba91 100644 --- a/_search-plugins/async/index.md +++ b/_search-plugins/async/index.md @@ -31,6 +31,7 @@ Options | Description | Default value | Required `wait_for_completion_timeout` | The amount of time that you plan to wait for the results. You can see whatever results you get within this time just like in a normal search. You can poll the remaining results based on an ID. The maximum value is 300 seconds. | 1 second | No `keep_on_completion` | Whether you want to save the results in the cluster after the search is complete. You can examine the stored results at a later time. | `false` | No `keep_alive` | The amount of time that the result is saved in the cluster. For example, `2d` means that the results are stored in the cluster for 48 hours. The saved search results are deleted after this period or if the search is canceled. Note that this includes the query execution time. If the query overruns this time, the process cancels this query automatically. | 12 hours | No +`index` | The name of the index to be searched. Can be an individual name, a comma-separated list of indexes, or a wildcard expression of index names. | All indexes in the cluster | No #### Example request diff --git a/_search-plugins/concurrent-segment-search.md b/_search-plugins/concurrent-segment-search.md index 8ece3493f1..0bb7657937 100644 --- a/_search-plugins/concurrent-segment-search.md +++ b/_search-plugins/concurrent-segment-search.md @@ -7,9 +7,6 @@ nav_order: 53 # Concurrent segment search -This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, see the associated [GitHub issue](https://github.com/opensearch-project/OpenSearch/issues/2587) or the [project board](https://github.com/orgs/opensearch-project/projects/117/views/1). -{: .warning} - Use concurrent segment search to search segments in parallel during the query phase. Cases in which concurrent segment search improves search latency include the following: - When sending long-running requests, for example, requests that contain aggregations or large ranges @@ -23,91 +20,34 @@ In OpenSearch, each search request follows the scatter-gather protocol. The coor Without concurrent segment search, Lucene executes a request sequentially across all segments on each shard during the query phase. The query phase then collects the top hits for the search request. 
With concurrent segment search, each shard-level request will search the segments in parallel during the query phase. For each shard, the segments are divided into multiple _slices_. Each slice is the unit of work that can be executed in parallel on a separate thread, so the slice count determines the maximum degree of parallelism for a shard-level request. Once all the slices complete their work, Lucene performs a reduce operation on the slices, merging them and creating the final result for this shard-level request. Slices are executed using a new `index_searcher` thread pool, which is different from the `search` thread pool that handles shard-level requests. -## Enabling the feature flag - -There are several methods for enabling concurrent segment search, depending on the installation type. - -### Enable in opensearch.yml - -If you are running an OpenSearch cluster and want to enable concurrent segment search in the config file, add the following line to `opensearch.yml`: - -```yaml -opensearch.experimental.feature.concurrent_segment_search.enabled: true -``` -{% include copy.html %} - -### Enable with Docker containers - -If you’re running Docker, add the following line to `docker-compose.yml` under the `opensearch-node` > `environment` section: - -```bash -OPENSEARCH_JAVA_OPTS="-Dopensearch.experimental.feature.concurrent_segment_search.enabled=true" -``` -{% include copy.html %} - -### Enable on a node using a tarball installation - -To enable concurrent segment search on a tarball installation, provide the new JVM parameter either in `config/jvm.options` or `OPENSEARCH_JAVA_OPTS`. - -#### OPTION 1: Modify jvm.options - -Add the following lines to `config/jvm.options` before starting the `opensearch` process to enable the feature and its dependency: - -```bash --Dopensearch.experimental.feature.concurrent_segment_search.enabled=true -``` -{% include copy.html %} - -Then run OpenSearch: +## Enabling concurrent segment search at the index or cluster level -```bash -./bin/opensearch -``` -{% include copy.html %} - -#### OPTION 2: Enable with an environment variable +By default, concurrent segment search is disabled on the cluster. You can enable concurrent segment search at two levels: -As an alternative to directly modifying `config/jvm.options`, you can define the properties by using an environment variable. This can be done using a single command when you start OpenSearch or by defining the variable with `export`. +- Cluster level +- Index level -To add these flags inline when starting OpenSearch, run the following command: - -```bash -OPENSEARCH_JAVA_OPTS="-Dopensearch.experimental.feature.concurrent_segment_search.enabled=true" ./opensearch-{{site.opensearch_version}}/bin/opensearch -``` -{% include copy.html %} +The index-level setting takes priority over the cluster-level setting. Thus, if the cluster setting is enabled but the index setting is disabled, then concurrent segment search will be disabled for that index. Because of this, the index-level setting is not evaluated unless it is explicitly set, regardless of the default value configured for the setting. You can retrieve the current value of the index-level setting by calling the [Index Settings API]({{site.url}}{{site.baseurl}}/api-reference/index-apis/get-settings/) and omitting the `?include_defaults` query parameter. 
+{: .note} -If you want to define the environment variable separately prior to running OpenSearch, run the following commands: - -```bash -export OPENSEARCH_JAVA_OPTS="-Dopensearch.experimental.feature.concurrent_segment_search.enabled=true" -``` -{% include copy.html %} - -```bash -./bin/opensearch -``` -{% include copy.html %} - -## Disabling concurrent search at the index or cluster level - -After you enable the experimental feature flag, all search requests will use concurrent segment search during the query phase. To disable concurrent segment search for all indexes, set the following dynamic cluster setting: +To enable concurrent segment search for all indexes in the cluster, set the following dynamic cluster setting: ```json PUT _cluster/settings { "persistent":{ - "search.concurrent_segment_search.enabled": false + "search.concurrent_segment_search.enabled": true } } ``` {% include copy-curl.html %} -To disable concurrent segment search for a particular index, specify the index name in the endpoint: +To enable concurrent segment search for a particular index, specify the index name in the endpoint: ```json PUT /_settings { - "index.search.concurrent_segment_search.enabled": false + "index.search.concurrent_segment_search.enabled": true } ``` {% include copy-curl.html %} @@ -137,26 +77,33 @@ The `search.concurrent.max_slice_count` setting can take the following valid val - `0`: Use the default Lucene mechanism. - Positive integer: Use the max target slice count mechanism. Usually, a value between 2 and 8 should be sufficient. -## The `terminate_after` search parameter +## Limitations + +The following aggregations do not support the concurrent search model. If a search request contains one of these aggregations, the request will be executed using the non-concurrent path even if concurrent segment search is enabled at the cluster level or index level. +- Parent aggregations on [join]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/join/) fields. See [this GitHub issue](https://github.com/opensearch-project/OpenSearch/issues/9316) for more information. +- `sampler` and `diversified_sampler` aggregations. See [this GitHub issue](https://github.com/opensearch-project/OpenSearch/issues/11075) for more information. +- Composite aggregations. See [this GitHub issue](https://github.com/opensearch-project/OpenSearch/issues/12331) for more information. + +## Other considerations + +The following sections provide additional considerations for concurrent segment search. + +### The `terminate_after` search parameter The [`terminate_after` search parameter]({{site.url}}{{site.baseurl}}/api-reference/search/#url-parameters) is used to terminate a search request once a specified number of documents has been collected. If you include the `terminate_after` parameter in a request, concurrent segment search is disabled and the request is run in a non-concurrent manner. Typically, queries are used with smaller `terminate_after` values and thus complete quickly because the search is performed on a reduced dataset. Therefore, concurrent search may not further improve performance in this case. Moreover, when `terminate_after` is used with other search request parameters, such as `track_total_hits` or `size`, it adds complexity and changes the expected query behavior. Falling back to a non-concurrent path for search requests that include `terminate_after` ensures consistent results between concurrent and non-concurrent requests. 
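For reference, the `search.concurrent.max_slice_count` setting mentioned above can be applied in the same way as the enable/disable settings shown earlier. The following sketch assumes it is set as a dynamic cluster setting; the value `4` is only an illustration, not a recommendation from the original text:

```json
PUT _cluster/settings
{
  "persistent": {
    "search.concurrent.max_slice_count": 4
  }
}
```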
-## API changes +### Sorting -If you enable the concurrent segment search feature flag, the following Stats API responses will contain several additional fields with statistics about slices: +Depending on the data layout of the segments, the sort optimization feature can prune entire segments based on the min and max values as well as previously collected values. If the top values are present in the first few segments and all other segments are pruned, query latency may increase when sorting with concurrent segment search. Conversely, if the last few segments contain the top values, then latency may improve with concurrent segment search. -- [Index Stats]({{site.url}}{{site.baseurl}}/api-reference/index-apis/stats/) -- [Nodes Stats]({{site.url}}{{site.baseurl}}/api-reference/nodes-apis/nodes-stats/) +### Terms aggregations -For descriptions of the added fields, see [Index Stats API]({{site.url}}{{site.baseurl}}/api-reference/index-apis/stats#concurrent-segment-search). +Non-concurrent search calculates the document count error and returns it in the `doc_count_error_upper_bound` response parameter. During concurrent segment search, the `shard_size` parameter is applied at the segment slice level. Because of this, concurrent search may introduce an additional document count error. -Additionally, some [Profile API]({{site.url}}{{site.baseurl}}/api-reference/profile/) response fields will be modified and others added. For more information, see the [concurrent segment search section of the Profile API]({{site.url}}{{site.baseurl}}/api-reference/profile#concurrent-segment-search). - -## Limitations +For more information about how `shard_size` can affect both `doc_count_error_upper_bound` and collected buckets, see [this GitHub issue](https://github.com/opensearch-project/OpenSearch/issues/11680#issuecomment-1885882985). -Parent aggregations on [join]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/join/) fields do not support the concurrent search model. Thus, if a search request contains a parent aggregation, the aggregation will be executed using the non-concurrent path even if concurrent segment search is enabled at the cluster level. ## Developer information: AggregatorFactory changes diff --git a/_search-plugins/conversational-search.md b/_search-plugins/conversational-search.md index f77a4ea1ee..be4c97b425 100644 --- a/_search-plugins/conversational-search.md +++ b/_search-plugins/conversational-search.md @@ -7,407 +7,441 @@ redirect_from: - /ml-commons-plugin/conversational-search/ --- -This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, see the associated [GitHub issue](https://forum.opensearch.org/t/feedback-conversational-search-and-retrieval-augmented-generation-using-search-pipeline-experimental-release/16073). -{: .warning} - # Conversational search -Conversational search is an experimental machine learning (ML) feature that enables a new search interface. Whereas traditional document search allows you to ask a question and receive a list of documents that might contain the answer to that question, conversational search uses large language models (LLMs) to read the top N documents and synthesizes those documents into a plaintext "answer" to your question. 
- -Currently, conversational search uses two systems to synthesize documents: - -- [Conversation memory](#conversation-memory) -- [Retrieval Augmented Generation (RAG) pipeline](#rag-pipeline) - -## Conversation memory - -Conversation memory consists of a simple CRUD-like API comprising two resources: **Conversations** and **Interactions**. Conversations are made up of interactions. An interaction represents a pair of messages: a human input and an artificial intelligence (AI) response. You cannot create any interactions until you've created a conversation. - -To make it easier to build and debug applications that use conversation memory, `conversation-meta` and `conversation-interactions` are stored in two system indexes. - -### `conversation-meta` index - -In the `conversation-meta` index, you can customize the `name` field to make it easier for end users to know how to continue a conversation with the AI, as shown in the following schema: +Conversational search allows you to ask questions in natural language and refine the answers by asking follow-up questions. Thus, the conversation becomes a dialog between you and a large language model (LLM). For this to happen, instead of answering each question individually, the model needs to remember the context of the entire conversation. -```jsx -.plugins-ml-conversation-meta -{ - "_meta": { - "schema_version": 1 - }, - "properties": { - "name": {"type": "keyword"}, - "create_time": {"type": "date", "format": "strict_date_time||epoch_millis"}, - "user": {"type": "keyword"} - } -} -``` +Conversational search is implemented with the following components: -### `conversation-interactions` index +- [Conversation history](#conversation-history): Allows an LLM to remember the context of the current conversation and understand follow-up questions. +- [Retrieval-Augmented Generation (RAG)](#rag): Allows an LLM to supplement its static knowledge base with proprietary or current information. -In the `conversation-interactions` index, all of the following fields are set by the user or AI application. Each field is entered as a string. +## Conversation history -| Field | Description | -| :--- | :--- | -| `input` | The question that forms the basis for an interaction. | -| `prompt_template` | The prompt template that was used as the framework for this interaction. | -| `response` | The AI response to the prompt. | -| `origin` | The name of the AI or other system that generated the response. | -| `additional_info` | Any other information that was sent to the "origin" in the prompt. | +Conversation history consists of a simple CRUD-like API comprising two resources: _memories_ and _messages_. All messages for the current conversation are stored within one conversation _memory_. A _message_ represents a question/answer pair: a human-input question and an AI answer. Messages do not exist by themselves; they must be added to a memory. 
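Because memories and messages are exposed through a CRUD-like API, you can list the memories that already exist on the cluster. The following sketch assumes the collection endpoint mirrors the `POST /_plugins/_ml/memory` create call used later on this page and that the `max_results` and `next_token` parameters carry over from the earlier conversation API; see the Memory APIs reference linked under Next steps for the authoritative syntax:

```json
GET /_plugins/_ml/memory?max_results=3&next_token=0
```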
-The `conversation-interactions` index creates a clean interaction abstraction and make it easy for the index to reconstruct the exact prompts sent to the LLM, enabling robust debugging and explainability, as shown in the following schema: +## RAG -```jsx -.plugins-ml-conversation-interactions -{ - "_meta": { - "schema_version": 1 - }, - "properties": { - "conversation_id": {"type": "keyword"}, - "create_time": {"type": "date", "format": "strict_date_time||epoch_millis"}, - "input": {"type": "text"}, - "prompt_template": {"type": "text"}, - "response": {"type": "text"}, - "origin": {"type": "keyword"}, - "additional_info": {"type": "text"} - } -} -``` +RAG retrieves data from the index and history and sends all the information as context to the LLM. The LLM then supplements its static knowledge base with the dynamically retrieved data. In OpenSearch, RAG is implemented through a search pipeline containing a [retrieval-augmented generation processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rag-processor/). The processor intercepts OpenSearch query results, retrieves previous messages in the conversation from the conversation memory, and sends a prompt to the LLM. After the processor receives a response from the LLM, it saves the response in conversation memory and returns both the original OpenSearch query results and the LLM response. -## Working with conversations and interactions +As of OpenSearch 2.11, the RAG technique has only been tested with OpenAI models and the Anthropic Claude model on Amazon Bedrock. +{: .warning} -When the Security plugin is enabled, all conversations in ML Commons exist in a "private" security mode. Only the user who created a conversation can interact with that conversation. No users on the cluster can see another user's conversation. +When the Security plugin is enabled, all memories exist in a `private` security mode. Only the user who created a memory can interact with that memory. No user can see another user's memory. {: .note} -To begin using conversation memory, enable the following cluster setting: +## Prerequisites + +To begin using conversational search, enable conversation memory and RAG pipeline features: ```json PUT /_cluster/settings { "persistent": { - "plugins.ml_commons.memory_feature_enabled": true + "plugins.ml_commons.memory_feature_enabled": true, + "plugins.ml_commons.rag_pipeline_feature_enabled": true } } ``` {% include copy-curl.html %} -After conversation memory is enabled, you can use the Memory API to create a conversation. +## Using conversational search + +To use conversational search, follow these steps: -To make the conversation easily identifiable, use the optional `name` field in the Memory API, as shown in the following example. This will be your only opportunity to name your conversation. +1. [Create a connector to a model](#step-1-create-a-connector-to-a-model). +1. [Register and deploy the model](#step-2-register-and-deploy-the-model) +1. [Create a search pipeline](#step-3-create-a-search-pipeline). +1. [Ingest RAG data into an index](#step-4-ingest-rag-data-into-an-index). +1. [Create a conversation memory](#step-5-create-a-conversation-memory). +1. [Use the pipeline for RAG](#step-6-use-the-pipeline-for-rag). +### Step 1: Create a connector to a model +RAG requires an LLM in order to function. To connect to an LLM, create a [connector]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/connectors/). 
The following request creates a connector for the OpenAI GPT 3.5 model: ```json -POST /_plugins/_ml/memory/conversation +POST /_plugins/_ml/connectors/_create { - "name": Example conversation + "name": "OpenAI Chat Connector", + "description": "The connector to public OpenAI model service for GPT 3.5", + "version": 2, + "protocol": "http", + "parameters": { + "endpoint": "api.openai.com", + "model": "gpt-3.5-turbo", + "temperature": 0 + }, + "credential": { + "openAI_key": "" + }, + "actions": [ + { + "action_type": "predict", + "method": "POST", + "url": "https://${parameters.endpoint}/v1/chat/completions", + "headers": { + "Authorization": "Bearer ${credential.openAI_key}" + }, + "request_body": """{ "model": "${parameters.model}", "messages": ${parameters.messages}, "temperature": ${parameters.temperature} }""" + } + ] } ``` {% include copy-curl.html %} -The Memory API responds with the conversation ID, as shown in the following example response: +OpenSearch responds with a connector ID for the connector: ```json -{ "conversation_id": "4of2c9nhoIuhcr" } +{ + "connector_id": "u3DEbI0BfUsSoeNTti-1" +} ``` -You'll use the `conversation_id` to create interactions inside the conversation. To create interactions, enter the `conversation_id` into the Memory API path. Then customize the [fields](#conversation-interactions-index) in the request body, as shown in the following example: +For example requests that connect to other services and models, see [Connector blueprints]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/blueprints/). +{: .tip} + +### Step 2: Register and deploy the model + +Register the LLM for which you created a connector in the previous step. To register the model with OpenSearch, provide the `connector_id` returned in the previous step: ```json -POST /_plugins/_ml/memory/conversation/4of2c9nhoIuhcr +POST /_plugins/_ml/models/_register { - "input": "How do I make an interaction?", - "prompt_template": "Hello OpenAI, can you answer this question? \ - Here's some extra info that may help. \ - [INFO] \n [QUESTION]", - "response": "Hello, this is OpenAI. Here is the answer to your question.", - "origin": "MyFirstOpenAIWrapper", - "additional_info": "Additional text related to the answer \ - A JSON or other semi-structured response" + "name": "openAI-gpt-3.5-turbo", + "function_name": "remote", + "description": "test model", + "connector_id": "u3DEbI0BfUsSoeNTti-1" } -``` +``` {% include copy-curl.html %} -The Memory API then responds with an interaction ID, as shown in the following example response: +OpenSearch returns a task ID for the register task and a model ID for the registered model: ```json -{ "interaction_id": "948vh_PoiY2hrnpo" } +{ + "task_id": "gXDIbI0BfUsSoeNT_jAb", + "status": "CREATED", + "model_id": "gnDIbI0BfUsSoeNT_jAw" +} ``` -### Getting conversations - -You can get a list of conversations using the following Memory API operation: +To verify that the registration is complete, call the Tasks API: ```json -GET /_plugins/_ml/memory/conversation?max_results=3&next_token=0 +GET /_plugins/_ml/tasks/gXDIbI0BfUsSoeNT_jAb ``` {% include copy-curl.html %} -Use the following path parameters to customize your results. - -Parameter | Data type | Description -:--- | :--- | :--- -`max_results` | Integer | The maximum number of results returned by the response. Default is `10`. -`next_token` | Integer | Represents the conversation order position that will be retrieved. 
For example, if conversations A, B, and C exist, `next_token=1` would return conversations B and C. Default is `0`. - -The Memory API responds with the most recent conversation, as indicated in the `create_time` field of the following example response: +The `state` changes to `COMPLETED` in the response: ```json { - "conversations": [ - { - "conversation_id": "0y4hto_in1", - "name": "", - "create_time": "2023-4-23 10:25.324662" - }, ... (2 more since we specified max_results=3) - ], - "next_token": 3 + "model_id": "gnDIbI0BfUsSoeNT_jAw", + "task_type": "REGISTER_MODEL", + "function_name": "REMOTE", + "state": "COMPLETED", + "worker_node": [ + "kYv-Z5-mQ4uCUy_cRC6LXA" + ], + "create_time": 1706927128091, + "last_update_time": 1706927128125, + "is_async": false } ``` +To deploy the model, provide the `model_id` to the Deploy API: -If there are fewer conversations than the number set in `max_results`, the response only returns the number of conversations that exist. Lastly, `next_token` provides an ordered position of the sorted list of conversations. When a conversation is added between subsequent GET conversation calls, one of the listed conversations will be duplicated in the results, for example: - -```plaintext -GetConversations -> [BCD]EFGH -CreateConversation -> ABCDEFGH -GetConversations(next_token=3) -> ABC[DEF]GH +```json +POST /_plugins/_ml/models/gnDIbI0BfUsSoeNT_jAw/_deploy ``` +{% include copy-curl.html %} -### Getting interactions - -To see a list of interactions in a conversation, enter the `conversation_id` at the end of the API request, as shown in the following example. You can use `max_results` and `next_token` to sort the response: +OpenSearch acknowledges that the model is deployed: ```json -GET /_plugins/_ml/memory/conversation/4of2c9nhoIuhcr +{ + "task_id": "cnDObI0BfUsSoeNTDzGd", + "task_type": "DEPLOY_MODEL", + "status": "COMPLETED" +} ``` -{% include copy-curl.html %} -The Memory API returns the following interaction information: +### Step 3: Create a search pipeline + +Next, create a search pipeline with a `retrieval_augmented_generation` processor: ```json +PUT /_search/pipeline/rag_pipeline { - "interactions": [ + "response_processors": [ { - "interaction_id": "342968y2u4-0", - "conversation_id": "0y4hto_in1", - "create_time": "2023-4-23 10:25.324662", - "input": "How do I make an interaction?", - "prompt_template": "Hello OpenAI, can you answer this question? \ - Here's some extra info that may help. \ - [INFO] \n [QUESTION]", - "response": "Hello, this is OpenAI. Here is the answer to your question.", - "origin": "MyFirstOpenAIWrapper", - "additional_info": "Additional text related to the answer \ - A JSON or other semi-structured response" - }, ... (9 more since max_results defaults to 10) - ], - "next_token": 10 + "retrieval_augmented_generation": { + "tag": "openai_pipeline_demo", + "description": "Demo pipeline Using OpenAI Connector", + "model_id": "gnDIbI0BfUsSoeNT_jAw", + "context_field_list": ["text"], + "system_prompt": "You are a helpful assistant", + "user_instructions": "Generate a concise and informative answer in less than 100 words for the given question" + } + } + ] } ``` +{% include copy-curl.html %} -### Deleting conversations +For information about the processor fields, see [Retrieval-augmented generation processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rag-processor/). 
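Before moving on, you can confirm that the pipeline was created by retrieving its definition with the standard search pipeline GET endpoint. This is a quick sketch that uses the `rag_pipeline` name from the request above:

```json
GET /_search/pipeline/rag_pipeline
```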
-To delete a conversation, use the `DELETE` operation, as shown in the following example: +### Step 4: Ingest RAG data into an index + +RAG augments the LLM's knowledge with some supplementary data. + +First, create an index in which to store this data and set the default search pipeline to the pipeline created in the previous step: ```json -DELETE /_plugins/_ml/memory/conversation/4of2c9nhoIuhcr +PUT /my_rag_test_data +{ + "settings": { + "index.search.default_pipeline" : "rag_pipeline" + }, + "mappings": { + "properties": { + "text": { + "type": "text" + } + } + } +} ``` {% include copy-curl.html %} -The Memory API responds with the following: +Next, ingest the supplementary data into the index: ```json -{ "success": true } +POST _bulk +{"index": {"_index": "my_rag_test_data", "_id": "1"}} +{"text": "Abraham Lincoln was born on February 12, 1809, the second child of Thomas Lincoln and Nancy Hanks Lincoln, in a log cabin on Sinking Spring Farm near Hodgenville, Kentucky.[2] He was a descendant of Samuel Lincoln, an Englishman who migrated from Hingham, Norfolk, to its namesake, Hingham, Massachusetts, in 1638. The family then migrated west, passing through New Jersey, Pennsylvania, and Virginia.[3] Lincoln was also a descendant of the Harrison family of Virginia; his paternal grandfather and namesake, Captain Abraham Lincoln and wife Bathsheba (née Herring) moved the family from Virginia to Jefferson County, Kentucky.[b] The captain was killed in an Indian raid in 1786.[5] His children, including eight-year-old Thomas, Abraham's father, witnessed the attack.[6][c] Thomas then worked at odd jobs in Kentucky and Tennessee before the family settled in Hardin County, Kentucky, in the early 1800s."} +{"index": {"_index": "my_rag_test_data", "_id": "2"}} +{"text": "Chart and table of population level and growth rate for the New York City metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\\nThe current metro area population of New York City in 2023 is 18,937,000, a 0.37% increase from 2022.\\nThe metro area population of New York City in 2022 was 18,867,000, a 0.23% increase from 2021.\\nThe metro area population of New York City in 2021 was 18,823,000, a 0.1% increase from 2020.\\nThe metro area population of New York City in 2020 was 18,804,000, a 0.01% decline from 2019."} ``` +{% include copy-curl.html %} ## RAG pipeline RAG is a technique that retrieves documents from an index, passes them through a seq2seq model, such as an LLM, and then supplements the static LLM information with the dynamically retrieved data in context. -As of OpenSearch 2.11, the RAG technique has only been tested with OpenAI models and the Anthropic Claude model on Amazon Bedrock. +As of OpenSearch 2.12, the RAG technique has only been tested with OpenAI models, the Anthropic Claude model on Amazon Bedrock, and Cohere Command models. {: .warning} +Configuring the Cohere Command model to enable RAG requires using a post-processing function to transform the model output. For more information, see the [Cohere RAG Tutorial](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/tutorials/conversational_search/conversational_search_with_Cohere_Command.md). + ### Enabling RAG -Use the following cluster setting to enable the RAG pipeline feature: +### Step 5: Create a conversation memory + +You'll need to create a conversation memory that will store all messages from a conversation. 
To make the memory easily identifiable, provide a name for the memory in the optional `name` field, as shown in the following example. Because the `name` parameter is not updatable, this is your only opportunity to name your conversation. ```json -PUT /_cluster/settings +POST /_plugins/_ml/memory/ { - "persistent": {"plugins.ml_commons.rag_pipeline_feature_enabled": "true"} + "name": "Conversation about NYC population" } ``` {% include copy-curl.html %} -### Connecting the model - -RAG requires an LLM in order to function. We recommend using a [connector]({{site.url}}{{site.baseurl}}ml-commons-plugin/remote-models/connectors/). - -Use the following steps to set up an HTTP connector using the OpenAI GPT 3.5 model: +OpenSearch responds with a memory ID for the newly created memory: -1. Use the Connector API to create the HTTP connector: +```json +{ + "memory_id": "znCqcI0BfUsSoeNTntd7" +} +``` - ```json - POST /_plugins/_ml/connectors/_create - { - "name": "OpenAI Chat Connector", - "description": "The connector to public OpenAI model service for GPT 3.5", - "version": 2, - "protocol": "http", - "parameters": { - "endpoint": "api.openai.com", - "model": "gpt-3.5-turbo", - "temperature": 0 - }, - "credential": { - "openAI_key": "" - }, - "actions": [ - { - "action_type": "predict", - "method": "POST", - "url": "https://${parameters.endpoint}/v1/chat/completions", - "headers": { - "Authorization": "Bearer ${credential.openAI_key}" - }, - "request_body": "{ \"model\": \"${parameters.model}\", \"messages\": ${parameters.messages}, \"temperature\": ${parameters.temperature} }" - } - ] - } - ``` - {% include copy-curl.html %} +You'll use the `memory_id` to add messages to the memory. -1. Create a new model group for the connected model. You'll use the `model_group_id` returned by the Register API to register the model: - ```json - POST /_plugins/_ml/model_groups/_register - { - "name": "public_model_group", - "description": "This is a public model group" - } - ``` - {% include copy-curl.html %} +### Step 6: Use the pipeline for RAG -1. Register and deploy the model using the `connector_id` from the Connector API response in Step 1 and the `model_group_id` returned in Step 2: +To use the RAG pipeline, send a query to OpenSearch and provide additional parameters in the `ext.generative_qa_parameters` object. - ```json - POST /_plugins/_ml/models/_register - { - "name": "openAI-gpt-3.5-turbo", - "function_name": "remote", - "model_group_id": "fp-hSYoBu0R6vVqGMnM1", - "description": "test model", - "connector_id": "f5-iSYoBu0R6vVqGI3PA" - } - ``` - {% include copy-curl.html %} +The `generative_qa_parameters` object supports the following parameters. -1. With the model registered, use the `task_id` returned in the registration response to get the `model_id`. You'll use the `model_id` to deploy the model to OpenSearch: +Parameter | Required | Description +:--- | :--- | :--- +`llm_question` | Yes | The question that the LLM must answer. +`llm_model` | No | Overrides the original model set in the connection in cases where you want to use a different model (for example, GPT 4 instead of GPT 3.5). This option is required if a default model is not set during pipeline creation. +`memory_id` | No | If you provide a `memory_id`, the pipeline retrieves the 10 most recent messages in the specified memory and adds them to the LLM prompt. If you don't specify a `memory_id`, the prior context is not added to the LLM prompt. +`context_size` | No | The number of search results sent to the LLM. 
This is typically needed in order to meet the token size limit, which can vary by model. Alternatively, you can use the `size` parameter in the Search API to control the number of search results sent to the LLM. +`message_size` | No | The number of messages sent to the LLM. Similarly to the number of search results, this affects the total number of tokens received by the LLM. When not set, the pipeline uses the default message size of `10`. +`timeout` | No | The number of seconds that the pipeline waits for the remote model using a connector to respond. Default is `30`. - ```json - GET /_plugins/_ml/tasks/ - ``` - {% include copy-curl.html %} +If your LLM includes a set token limit, set the `size` field in your OpenSearch query to limit the number of documents used in the search response. Otherwise, the RAG pipeline will send every document in the search results to the LLM. +{: .note} -1. Using the `model_id` from step 4, deploy the model: +If you ask an LLM a question about the present, it cannot provide an answer because it was trained on data from a few years ago. However, if you add current information as context, the LLM is able to generate a response. For example, you can ask the LLM about the population of the New York City metro area in 2023. You'll construct a query that includes an OpenSearch match query and an LLM query. Provide the `memory_id` so that the message is stored in the appropriate memory object: - ```json - POST /_plugins/_ml/models//_deploy - ``` - {% include copy-curl.html %} +```json +GET /my_rag_test_data/_search +{ + "query": { + "match": { + "text": "What's the population of NYC metro area in 2023" + } + }, + "ext": { + "generative_qa_parameters": { + "llm_model": "gpt-3.5-turbo", + "llm_question": "What's the population of NYC metro area in 2023", + "memory_id": "znCqcI0BfUsSoeNTntd7", + "context_size": 5, + "message_size": 5, + "timeout": 15 + } + } +} +``` +{% include copy-curl.html %} -### Setting up the pipeline +Because the context included a document containing information about the population of New York City, the LLM was able to correctly answer the question (though it included the word "projected" because it was trained on data from previous years). The response contains the matching documents from the supplementary RAG data and the LLM response: -Next, you'll create a search pipeline for the connector model. Use the following Search API request to create a pipeline: +
+ + Response + + {: .text-delta} ```json -PUT /_search/pipeline/ { - "response_processors": [ - { - "retrieval_augmented_generation": { - "tag": "openai_pipeline_demo", - "description": "Demo pipeline Using OpenAI Connector", - "model_id": "", - "context_field_list": ["text"], - "system_prompt": "You are a helpful assistant", - "user_instructions": "Generate a concise and informative answer in less than 100 words for the given question" + "took": 1, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 2, + "relation": "eq" + }, + "max_score": 5.781642, + "hits": [ + { + "_index": "my_rag_test_data", + "_id": "2", + "_score": 5.781642, + "_source": { + "text": """Chart and table of population level and growth rate for the New York City metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\nThe current metro area population of New York City in 2023 is 18,937,000, a 0.37% increase from 2022.\nThe metro area population of New York City in 2022 was 18,867,000, a 0.23% increase from 2021.\nThe metro area population of New York City in 2021 was 18,823,000, a 0.1% increase from 2020.\nThe metro area population of New York City in 2020 was 18,804,000, a 0.01% decline from 2019.""" + } + }, + { + "_index": "my_rag_test_data", + "_id": "1", + "_score": 0.9782871, + "_source": { + "text": "Abraham Lincoln was born on February 12, 1809, the second child of Thomas Lincoln and Nancy Hanks Lincoln, in a log cabin on Sinking Spring Farm near Hodgenville, Kentucky.[2] He was a descendant of Samuel Lincoln, an Englishman who migrated from Hingham, Norfolk, to its namesake, Hingham, Massachusetts, in 1638. The family then migrated west, passing through New Jersey, Pennsylvania, and Virginia.[3] Lincoln was also a descendant of the Harrison family of Virginia; his paternal grandfather and namesake, Captain Abraham Lincoln and wife Bathsheba (née Herring) moved the family from Virginia to Jefferson County, Kentucky.[b] The captain was killed in an Indian raid in 1786.[5] His children, including eight-year-old Thomas, Abraham's father, witnessed the attack.[6][c] Thomas then worked at odd jobs in Kentucky and Tennessee before the family settled in Hardin County, Kentucky, in the early 1800s." + } } + ] + }, + "ext": { + "retrieval_augmented_generation": { + "answer": "The population of the New York City metro area in 2023 is projected to be 18,937,000.", + "message_id": "x3CecI0BfUsSoeNT9tV9" } - ] + } } ``` -{% include copy-curl.html %} - -### Context field list +
-`context_field_list` is the list of fields in document sources that the pipeline uses as context for the RAG. For example, when `context_field_list` parses through the following document, the pipeline sends the `text` field from the response to OpenAI model: +Now you'll ask an LLM a follow-up question as part of the same conversation. Again, provide the `memory_id` in the request: ```json +GET /my_rag_test_data/_search { - "_index": "qa_demo", - "_id": "SimKcIoBOVKVCYpk1IL-", - "_source": { - "title": "Abraham Lincoln 2", - "text": "Abraham Lincoln was born on February 12, 1809, the second child of Thomas Lincoln and Nancy Hanks Lincoln, in a log cabin on Sinking Spring Farm near Hodgenville, Kentucky.[2] He was a descendant of Samuel Lincoln, an Englishman who migrated from Hingham, Norfolk, to its namesake, Hingham, Massachusetts, in 1638. The family then migrated west, passing through New Jersey, Pennsylvania, and Virginia.[3] Lincoln was also a descendant of the Harrison family of Virginia; his paternal grandfather and namesake, Captain Abraham Lincoln and wife Bathsheba (née Herring) moved the family from Virginia to Jefferson County, Kentucky.[b] The captain was killed in an Indian raid in 1786.[5] His children, including eight-year-old Thomas, Abraham's father, witnessed the attack.[6][c] Thomas then worked at odd jobs in Kentucky and Tennessee before the family settled in Hardin County, Kentucky, in the early 1800s.[6]\n" + "query": { + "match": { + "text": "What was it in 2022" + } + }, + "ext": { + "generative_qa_parameters": { + "llm_model": "gpt-3.5-turbo", + "llm_question": "What was it in 2022", + "memory_id": "znCqcI0BfUsSoeNTntd7", + "context_size": 5, + "message_size": 5, + "timeout": 15 + } } } ``` +{% include copy-curl.html %} -You can customize `context_field_list` in your RAG pipeline to send any fields that exist in your documents to the LLM. +The LLM correctly identifies the subject of the conversation and returns a relevant response: -### RAG parameter options +```json +{ + ... + "ext": { + "retrieval_augmented_generation": { + "answer": "The population of the New York City metro area in 2022 was 18,867,000.", + "message_id": "p3CvcI0BfUsSoeNTj9iH" + } + } +} +``` -Use the following options when setting up a RAG pipeline under the `retrieval_augmented_generation` argument. +To verify that both messages were added to the memory, provide the `memory_ID` to the Get Messages API: -Parameter | Required | Description -:--- | :--- | :--- -`tag` | No | A tag to help identify the pipeline. -`description` | Yes | A description of the pipeline. -`model_id` | Yes | The ID of the model used in the pipeline. -`context_field_list` | Yes | The list of fields in document sources that the pipeline uses as context for the RAG. For more information, see [Context Field List](#context-field-list). -`system_prompt` | No | The message sent to the LLM with a `system` role. This is the message the user sees when the LLM receives an interaction. -`user_instructions` | No | An additional message sent by the LLM with a `user` role. This parameter allows for further customization of what the user receives when interacting with the LLM. +```json +GET /_plugins/_ml/memory/znCqcI0BfUsSoeNTntd7/messages +``` -### Using the pipeline +The response contains both messages: -Using the pipeline is similar to submitting [search queries]({{site.url}}{{site.baseurl}}/api-reference/search/#example) to OpenSearch, as shown in the following example: +
+ + Response + + {: .text-delta} ```json -GET //_search?search_pipeline= { - "query" : {...}, - "ext": { - "generative_qa_parameters": { - "llm_model": "gpt-3.5-turbo", - "llm_question": "Was Abraham Lincoln a good politician", - "conversation_id": "_ikaSooBHvd8_FqDUOjZ", - "context_size": 5, - "interaction_size": 5, - "timeout": 15 - } - } + "messages": [ + { + "memory_id": "znCqcI0BfUsSoeNTntd7", + "message_id": "x3CecI0BfUsSoeNT9tV9", + "create_time": "2024-02-03T20:33:50.754708446Z", + "input": "What's the population of NYC metro area in 2023", + "prompt_template": """[{"role":"system","content":"You are a helpful assistant"},{"role":"user","content":"Generate a concise and informative answer in less than 100 words for the given question"}]""", + "response": "The population of the New York City metro area in 2023 is projected to be 18,937,000.", + "origin": "retrieval_augmented_generation", + "additional_info": { + "metadata": """["Chart and table of population level and growth rate for the New York City metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\\nThe current metro area population of New York City in 2023 is 18,937,000, a 0.37% increase from 2022.\\nThe metro area population of New York City in 2022 was 18,867,000, a 0.23% increase from 2021.\\nThe metro area population of New York City in 2021 was 18,823,000, a 0.1% increase from 2020.\\nThe metro area population of New York City in 2020 was 18,804,000, a 0.01% decline from 2019.","Abraham Lincoln was born on February 12, 1809, the second child of Thomas Lincoln and Nancy Hanks Lincoln, in a log cabin on Sinking Spring Farm near Hodgenville, Kentucky.[2] He was a descendant of Samuel Lincoln, an Englishman who migrated from Hingham, Norfolk, to its namesake, Hingham, Massachusetts, in 1638. The family then migrated west, passing through New Jersey, Pennsylvania, and Virginia.[3] Lincoln was also a descendant of the Harrison family of Virginia; his paternal grandfather and namesake, Captain Abraham Lincoln and wife Bathsheba (née Herring) moved the family from Virginia to Jefferson County, Kentucky.[b] The captain was killed in an Indian raid in 1786.[5] His children, including eight-year-old Thomas, Abraham's father, witnessed the attack.[6][c] Thomas then worked at odd jobs in Kentucky and Tennessee before the family settled in Hardin County, Kentucky, in the early 1800s."]""" + } + }, + { + "memory_id": "znCqcI0BfUsSoeNTntd7", + "message_id": "p3CvcI0BfUsSoeNTj9iH", + "create_time": "2024-02-03T20:36:10.24453505Z", + "input": "What was it in 2022", + "prompt_template": """[{"role":"system","content":"You are a helpful assistant"},{"role":"user","content":"Generate a concise and informative answer in less than 100 words for the given question"}]""", + "response": "The population of the New York City metro area in 2022 was 18,867,000.", + "origin": "retrieval_augmented_generation", + "additional_info": { + "metadata": """["Chart and table of population level and growth rate for the New York City metro area from 1950 to 2023. 
United Nations population projections are also included through the year 2035.\\nThe current metro area population of New York City in 2023 is 18,937,000, a 0.37% increase from 2022.\\nThe metro area population of New York City in 2022 was 18,867,000, a 0.23% increase from 2021.\\nThe metro area population of New York City in 2021 was 18,823,000, a 0.1% increase from 2020.\\nThe metro area population of New York City in 2020 was 18,804,000, a 0.01% decline from 2019.","Abraham Lincoln was born on February 12, 1809, the second child of Thomas Lincoln and Nancy Hanks Lincoln, in a log cabin on Sinking Spring Farm near Hodgenville, Kentucky.[2] He was a descendant of Samuel Lincoln, an Englishman who migrated from Hingham, Norfolk, to its namesake, Hingham, Massachusetts, in 1638. The family then migrated west, passing through New Jersey, Pennsylvania, and Virginia.[3] Lincoln was also a descendant of the Harrison family of Virginia; his paternal grandfather and namesake, Captain Abraham Lincoln and wife Bathsheba (née Herring) moved the family from Virginia to Jefferson County, Kentucky.[b] The captain was killed in an Indian raid in 1786.[5] His children, including eight-year-old Thomas, Abraham's father, witnessed the attack.[6][c] Thomas then worked at odd jobs in Kentucky and Tennessee before the family settled in Hardin County, Kentucky, in the early 1800s."]""" + } + } + ] } ``` -{% include copy-curl.html %} - -The RAG search query uses the following request objects under the `generative_qa_parameters` option. - -Parameter | Required | Description -:--- | :--- | :--- -`llm_question` | Yes | The question the LLM must answer. -`llm_model` | No | Overrides the original model set in the connection in cases where you want to use a different model (for example, GPT 4 instead of GPT 3.5). This option is required if a default model is not set during pipeline creation. -`conversation_id` | No | Integrates conversation memory into your RAG pipeline by adding the 10 most recent conversations into the context of the search query to the LLM. -`context_size` | No | The number of search results sent to the LLM. This is typically needed in order to meet the token size limit, which can vary by model. Alternatively, you can use the `size` parameter in the Search API to control the amount of information sent to the LLM. -`interaction_size` | No | The number of interactions sent to the LLM. Similarly to the number of search results, this affects the total number of tokens seen by the LLM. When not set, the pipeline uses the default interaction size of `10`. -`timeout` | No | The number of seconds that the pipeline waits for the remote model using a connector to respond. Default is `30`. - -If your LLM includes a set token limit, set the `size` field in your OpenSearch query to limit the number of documents used in the search response. Otherwise, the RAG pipeline will send every document in the search results to the LLM. +
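When the conversation is no longer needed, you can remove it by deleting its memory, which also removes the stored messages. The following sketch assumes that the Delete Memory API takes the memory ID in the path, mirroring the Get Messages call above; confirm the endpoint in the Memory APIs reference linked below:

```json
DELETE /_plugins/_ml/memory/znCqcI0BfUsSoeNTntd7
```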
## Next steps -- To learn more about connecting to models on external platforms, see [Connectors]({{site.url}}{{site.baseurl}}ml-commons-plugin/remote-models/connectors/). -- To learn more about using custom models within your OpenSearch cluster, see [Using ML models within OpenSearch]({{site.url}}{{site.baseurl}}/ml-commons-plugin/using-ml-models/). - +- To learn more about connecting to models on external platforms, see [Connectors]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/connectors/). +- For supported APIs, see [Memory APIs]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/memory-apis/index/). +- To learn more about search pipelines and processors, see [Search pipelines]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/index/). +- For available OpenSearch queries, see [Query DSL]({{site.url}}{{site.baseurl}}/query-dsl/). \ No newline at end of file diff --git a/_search-plugins/cross-cluster-search.md b/_search-plugins/cross-cluster-search.md new file mode 100644 index 0000000000..c25f3d70d9 --- /dev/null +++ b/_search-plugins/cross-cluster-search.md @@ -0,0 +1,299 @@ +--- +layout: default +title: Cross-cluster search +nav_order: 65 +redirect_from: + - /security/access-control/cross-cluster-search/ + - /security-plugin/access-control/cross-cluster-search/ +--- + +# Cross-cluster search + +You can use the cross-cluster search feature in OpenSearch to search and analyze data across multiple clusters, enabling you to gain insights from distributed data sources. Cross-cluster search is available by default with the Security plugin, but you need to configure each cluster to allow remote connections from other clusters. This involves setting up remote cluster connections and configuring access permissions. + +--- + +#### Table of contents +1. TOC +{:toc} + + +--- + +## Authentication flow + +The following sequence describes the authentication flow when using cross-cluster search to access a *remote cluster* from a *coordinating cluster*. You can have different authentication and authorization configurations on the remote and coordinating clusters, but we recommend using the same settings on both. + +1. The Security plugin authenticates the user on the coordinating cluster. +1. The Security plugin fetches the user's backend roles on the coordinating cluster. +1. The call, including the authenticated user, is forwarded to the remote cluster. +1. The user's permissions are evaluated on the remote cluster. + + +## Setting permissions + +To query indexes on remote clusters, users must have `READ` or `SEARCH` permissions. Furthermore, when the search request includes the query parameter `ccs_minimize_roundtrips=false`---which tells OpenSearch not to minimize outgoing and incoming requests to remote clusters---users need to have the following additional index permission: + +``` +indices:admin/shards/search_shards +``` + +For more information about the `ccs_minimize_roundtrips` parameter, see the list of [URL Parameters]({{site.url}}{{site.baseurl}}/api-reference/search/#url-parameters) for the Search API. + +#### Example roles.yml configuration + +```yml +humanresources: + cluster: + - CLUSTER_COMPOSITE_OPS_RO + indices: + 'humanresources': + '*': + - READ + - indices:admin/shards/search_shards # needed when the search request includes parameter setting 'ccs_minimize_roundtrips=false'. 
+``` + + +#### Example role in OpenSearch Dashboards + +![OpenSearch Dashboards UI for creating a cross-cluster search role]({{site.url}}{{site.baseurl}}/images/security-ccs.png) + + +## Sample Docker setup + +To define Docker permissions, save the following sample file as `docker-compose.yml` and run `docker-compose up` to start two single-node clusters on the same network: + +```yml +version: '3' +services: + opensearch-ccs-node1: + image: opensearchproject/opensearch:{{site.opensearch_version}} + container_name: opensearch-ccs-node1 + environment: + - cluster.name=opensearch-ccs-cluster1 + - discovery.type=single-node + - bootstrap.memory_lock=true # along with the memlock settings below, disables swapping + - "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m" # minimum and maximum Java heap size, recommend setting both to 50% of system RAM + - "OPENSEARCH_INITIAL_ADMIN_PASSWORD=" # The initial admin password used by the demo configuration + ulimits: + memlock: + soft: -1 + hard: -1 + volumes: + - opensearch-data1:/usr/share/opensearch/data + ports: + - 9200:9200 + - 9600:9600 # required for Performance Analyzer + networks: + - opensearch-net + + opensearch-ccs-node2: + image: opensearchproject/opensearch:{{site.opensearch_version}} + container_name: opensearch-ccs-node2 + environment: + - cluster.name=opensearch-ccs-cluster2 + - discovery.type=single-node + - bootstrap.memory_lock=true # along with the memlock settings below, disables swapping + - "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m" # minimum and maximum Java heap size, recommend setting both to 50% of system RAM + - "OPENSEARCH_INITIAL_ADMIN_PASSWORD=" # The initial admin password used by the demo configuration + ulimits: + memlock: + soft: -1 + hard: -1 + volumes: + - opensearch-data2:/usr/share/opensearch/data + ports: + - 9250:9200 + - 9700:9600 # required for Performance Analyzer + networks: + - opensearch-net + +volumes: + opensearch-data1: + opensearch-data2: + +networks: + opensearch-net: +``` + +After the clusters start, verify the names of each cluster using the following commands: + +```json +curl -XGET -u 'admin:' -k 'https://localhost:9200' +{ + "cluster_name" : "opensearch-ccs-cluster1", + ... +} + +curl -XGET -u 'admin:' -k 'https://localhost:9250' +{ + "cluster_name" : "opensearch-ccs-cluster2", + ... +} +``` + +Both clusters run on `localhost`, so the important identifier is the port number. In this case, use port 9200 (`opensearch-ccs-node1`) as the remote cluster, and port 9250 (`opensearch-ccs-node2`) as the coordinating cluster. + +To get the IP address for the remote cluster, first identify its container ID: + +```bash +docker ps +CONTAINER ID IMAGE PORTS NAMES +6fe89ebc5a8e opensearchproject/opensearch:{{site.opensearch_version}} 0.0.0.0:9200->9200/tcp, 0.0.0.0:9600->9600/tcp, 9300/tcp opensearch-ccs-node1 +2da08b6c54d8 opensearchproject/opensearch:{{site.opensearch_version}} 9300/tcp, 0.0.0.0:9250->9200/tcp, 0.0.0.0:9700->9600/tcp opensearch-ccs-node2 +``` + +Then get that container's IP address: + +```bash +docker inspect --format='{% raw %}{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}{% endraw %}' 6fe89ebc5a8e +172.31.0.3 +``` + +On the coordinating cluster, add the remote cluster name and the IP address (with port 9300) for each "seed node." 
In this case, you only have one seed node: + +```json +curl -k -XPUT -H 'Content-Type: application/json' -u 'admin:' 'https://localhost:9250/_cluster/settings' -d ' +{ + "persistent": { + "cluster.remote": { + "opensearch-ccs-cluster1": { + "seeds": ["172.31.0.3:9300"] + } + } + } +}' +``` +All of the cURL requests can also be sent using OpenSearch Dashboards Dev Tools. +{: .tip } +The following image shows an example of a cURL request using Dev Tools. +![OpenSearch Dashboards UI for configuring remote cluster for Cross-cluster search]({{site.url}}{{site.baseurl}}/images/ccs-devtools.png) + +On the remote cluster, index a document: + +```bash +curl -XPUT -k -H 'Content-Type: application/json' -u 'admin:' 'https://localhost:9200/books/_doc/1' -d '{"Dracula": "Bram Stoker"}' +``` + +At this point, cross-cluster search works. You can test it using the `admin` user: + +```bash +curl -XGET -k -u 'admin:' 'https://localhost:9250/opensearch-ccs-cluster1:books/_search?pretty' +{ + ... + "hits": [{ + "_index": "opensearch-ccs-cluster1:books", + "_id": "1", + "_score": 1.0, + "_source": { + "Dracula": "Bram Stoker" + } + }] +} +``` + +To continue testing, create a new user on both clusters: + +```bash +curl -XPUT -k -u 'admin:' 'https://localhost:9200/_plugins/_security/api/internalusers/booksuser' -H 'Content-Type: application/json' -d '{"password":"password"}' +curl -XPUT -k -u 'admin:' 'https://localhost:9250/_plugins/_security/api/internalusers/booksuser' -H 'Content-Type: application/json' -d '{"password":"password"}' +``` + +Then run the same search as before with `booksuser`: + +```json +curl -XGET -k -u booksuser:password 'https://localhost:9250/opensearch-ccs-cluster1:books/_search?pretty' +{ + "error" : { + "root_cause" : [ + { + "type" : "security_exception", + "reason" : "no permissions for [indices:admin/shards/search_shards, indices:data/read/search] and User [name=booksuser, roles=[], requestedTenant=null]" + } + ], + "type" : "security_exception", + "reason" : "no permissions for [indices:admin/shards/search_shards, indices:data/read/search] and User [name=booksuser, roles=[], requestedTenant=null]" + }, + "status" : 403 +} +``` + +Note the permissions error. On the remote cluster, create a role with the appropriate permissions, and map `booksuser` to that role: + +```bash +curl -XPUT -k -u 'admin:' -H 'Content-Type: application/json' 'https://localhost:9200/_plugins/_security/api/roles/booksrole' -d '{"index_permissions":[{"index_patterns":["books"],"allowed_actions":["indices:admin/shards/search_shards","indices:data/read/search"]}]}' +curl -XPUT -k -u 'admin:' -H 'Content-Type: application/json' 'https://localhost:9200/_plugins/_security/api/rolesmapping/booksrole' -d '{"users" : ["booksuser"]}' +``` + +Both clusters must have the user role, but only the remote cluster needs both the role and mapping. In this case, the coordinating cluster handles authentication (that is, "Does this request include valid user credentials?"), and the remote cluster handles authorization (that is, "Can this user access this data?"). +{: .tip } + +Finally, repeat the search: + +```bash +curl -XGET -k -u booksuser:password 'https://localhost:9250/opensearch-ccs-cluster1:books/_search?pretty' +{ + ... 
+ "hits": [{ + "_index": "opensearch-ccs-cluster1:books", + "_id": "1", + "_score": 1.0, + "_source": { + "Dracula": "Bram Stoker" + } + }] +} +``` + +## Sample bare metal/virtual machine setup + +If you are running OpenSearch on a bare metal server or using a virtual machine, you can run the same commands, specifying the IP (or domain) of the OpenSearch cluster. +For example, in order to configure a remote cluster for cross-cluster search, find the IP of the remote node or domain of the remote cluster and run the following command: + +```json +curl -k -XPUT -H 'Content-Type: application/json' -u 'admin:' 'https://opensearch-domain-1:9200/_cluster/settings' -d ' +{ + "persistent": { + "cluster.remote": { + "opensearch-ccs-cluster2": { + "seeds": ["opensearch-domain-2:9300"] + } + } + } +}' +``` +It is sufficient to point to only one of the node IPs on the remote cluster because all nodes in the cluster will be queried as part of the node discovery process. +{: .tip } + +You can now run queries across both clusters: + +```bash +curl -XGET -k -u 'admin:' 'https://opensearch-domain-1:9200/opensearch-ccs-cluster2:books/_search?pretty' +{ + ... + "hits": [{ + "_index": "opensearch-ccs-cluster2:books", + "_id": "1", + "_score": 1.0, + "_source": { + "Dracula": "Bram Stoker" + } + }] +} +``` + +## Sample Kubernetes/Helm setup +If you are using Kubernetes clusters to deploy OpenSearch, you need to configure the remote cluster using either the `LoadBalancer` or `Ingress`. The Kubernetes services created using the following [Helm]({{site.url}}{{site.baseurl}}/install-and-configure/install-opensearch/helm/) example are of the `ClusterIP` type and are only accessible from within the cluster; therefore, you must use an externally accessible endpoint: + +```json +curl -k -XPUT -H 'Content-Type: application/json' -u 'admin:' 'https://opensearch-domain-1:9200/_cluster/settings' -d ' +{ + "persistent": { + "cluster.remote": { + "opensearch-ccs-cluster2": { + "seeds": ["ingress:9300"] + } + } + } +}' +``` diff --git a/_search-plugins/hybrid-search.md b/_search-plugins/hybrid-search.md index ebd014b0de..b0fb4d5bef 100644 --- a/_search-plugins/hybrid-search.md +++ b/_search-plugins/hybrid-search.md @@ -146,7 +146,9 @@ PUT /_search/pipeline/nlp-search-pipeline To perform hybrid search on your index, use the [`hybrid` query]({{site.url}}{{site.baseurl}}/query-dsl/compound/hybrid/), which combines the results of keyword and semantic search. -The following example request combines two query clauses---a neural query and a `match` query. It specifies the search pipeline created in the previous step as a query parameter: +#### Example: Combining a neural query and a match query + +The following example request combines two query clauses---a `neural` query and a `match` query. It specifies the search pipeline created in the previous step as a query parameter: ```json GET /my-nlp-index/_search?search_pipeline=nlp-search-pipeline @@ -161,7 +163,7 @@ GET /my-nlp-index/_search?search_pipeline=nlp-search-pipeline "queries": [ { "match": { - "text": { + "passage_text": { "query": "Hi world" } } @@ -216,3 +218,355 @@ The response contains the matching document: } } ``` +{% include copy-curl.html %} + +#### Example: Combining a match query and a term query + +The following example request combines two query clauses---a `match` query and a `term` query. 
It specifies the search pipeline created in the previous step as a query parameter: + +```json +GET /my-nlp-index/_search?search_pipeline=nlp-search-pipeline +{ + "_source": { + "exclude": [ + "passage_embedding" + ] + }, + "query": { + "hybrid": { + "queries": [ + { + "match":{ + "passage_text": "hello" + } + }, + { + "term":{ + "passage_text":{ + "value":"planet" + } + } + } + ] + } + } +} +``` +{% include copy-curl.html %} + +The response contains the matching documents: + +```json +{ + "took": 11, + "timed_out": false, + "_shards": { + "total": 2, + "successful": 2, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 2, + "relation": "eq" + }, + "max_score": 0.7, + "hits": [ + { + "_index": "my-nlp-index", + "_id": "2", + "_score": 0.7, + "_source": { + "id": "s2", + "passage_text": "Hi planet" + } + }, + { + "_index": "my-nlp-index", + "_id": "1", + "_score": 0.3, + "_source": { + "id": "s1", + "passage_text": "Hello world" + } + } + ] + } +} +``` +{% include copy-curl.html %} + +## Hybrid search with post-filtering +**Introduced 2.13** +{: .label .label-purple } + +You can perform post-filtering on hybrid search results by providing the `post_filter` parameter in your query. + +The `post_filter` clause is applied after the search results have been retrieved. Post-filtering is useful for applying additional filters to the search results without impacting the scoring or the order of the results. + +Post-filtering does not impact document relevance scores or aggregation results. +{: .note} + +#### Example: Post-filtering + +The following example request combines two query clauses---a `term` query and a `match` query. This is the same query as in the [preceding example](#example-combining-a-match-query-and-a-term-query), but it contains a `post_filter`: + +```json +GET /my-nlp-index/_search?search_pipeline=nlp-search-pipeline +{ + "query": { + "hybrid":{ + "queries":[ + { + "match":{ + "passage_text": "hello" + } + }, + { + "term":{ + "passage_text":{ + "value":"planet" + } + } + } + ] + } + + }, + "post_filter":{ + "match": { "passage_text": "world" } + } +} + +``` +{% include copy-curl.html %} + +Compare the results to the results without post-filtering in the [preceding example](#example-combining-a-match-query-and-a-term-query). Unlike the preceding example response, which contains two documents, the response in this example contains one document because the second document is filtered using post-filtering: + +```json +{ + "took": 18, + "timed_out": false, + "_shards": { + "total": 2, + "successful": 2, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 0.3, + "hits": [ + { + "_index": "my-nlp-index", + "_id": "1", + "_score": 0.3, + "_source": { + "id": "s1", + "passage_text": "Hello world" + } + } + ] + } +} +``` + + +## Combining hybrid search and aggregations +**Introduced 2.13** +{: .label .label-purple } + +You can enhance search results by combining a hybrid query clause with any aggregation that OpenSearch supports. Aggregations allow you to use OpenSearch as an analytics engine. For more information about aggregations, see [Aggregations]({{site.url}}{{site.baseurl}}/aggregations/). + +Most aggregations are performed on the subset of documents that is returned by a hybrid query. The only aggregation that operates on all documents is the [`global`]({{site.url}}{{site.baseurl}}/aggregations/bucket/global/) aggregation. + +To use aggregations with a hybrid query, first create an index. 
Aggregations are typically used on fields of special types, like `keyword` or `integer`. The following example creates an index with several such fields: + +```json +PUT /my-nlp-index +{ + "settings": { + "number_of_shards": 2 + }, + "mappings": { + "properties": { + "doc_index": { + "type": "integer" + }, + "doc_keyword": { + "type": "keyword" + }, + "category": { + "type": "keyword" + } + } + } +} +``` +{% include copy-curl.html %} + +The following request ingests six documents into your new index: + +```json +POST /_bulk +{ "index": { "_index": "my-nlp-index" } } +{ "category": "permission", "doc_keyword": "workable", "doc_index": 4976, "doc_price": 100} +{ "index": { "_index": "my-nlp-index" } } +{ "category": "sister", "doc_keyword": "angry", "doc_index": 2231, "doc_price": 200 } +{ "index": { "_index": "my-nlp-index" } } +{ "category": "hair", "doc_keyword": "likeable", "doc_price": 25 } +{ "index": { "_index": "my-nlp-index" } } +{ "category": "editor", "doc_index": 9871, "doc_price": 30 } +{ "index": { "_index": "my-nlp-index" } } +{ "category": "statement", "doc_keyword": "entire", "doc_index": 8242, "doc_price": 350 } +{ "index": { "_index": "my-nlp-index" } } +{ "category": "statement", "doc_keyword": "idea", "doc_index": 5212, "doc_price": 200 } +{ "index": { "_index": "index-test" } } +{ "category": "editor", "doc_keyword": "bubble", "doc_index": 1298, "doc_price": 130 } +{ "index": { "_index": "index-test" } } +{ "category": "editor", "doc_keyword": "bubble", "doc_index": 521, "doc_price": 75 } +``` +{% include copy-curl.html %} + +Now you can combine a hybrid query clause with a `min` aggregation: + +```json +GET /my-nlp-index/_search?search_pipeline=nlp-search-pipeline +{ + "query": { + "hybrid": { + "queries": [ + { + "term": { + "category": "permission" + } + }, + { + "bool": { + "should": [ + { + "term": { + "category": "editor" + } + }, + { + "term": { + "category": "statement" + } + } + ] + } + } + ] + } + }, + "aggs": { + "total_price": { + "sum": { + "field": "doc_price" + } + }, + "keywords": { + "terms": { + "field": "doc_keyword", + "size": 10 + } + } + } +} +``` +{% include copy-curl.html %} + +The response contains the matching documents and the aggregation results: + +```json +{ + "took": 9, + "timed_out": false, + "_shards": { + "total": 2, + "successful": 2, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 4, + "relation": "eq" + }, + "max_score": 0.5, + "hits": [ + { + "_index": "my-nlp-index", + "_id": "mHRPNY4BlN82W_Ar9UMY", + "_score": 0.5, + "_source": { + "doc_price": 100, + "doc_index": 4976, + "doc_keyword": "workable", + "category": "permission" + } + }, + { + "_index": "my-nlp-index", + "_id": "m3RPNY4BlN82W_Ar9UMY", + "_score": 0.5, + "_source": { + "doc_price": 30, + "doc_index": 9871, + "category": "editor" + } + }, + { + "_index": "my-nlp-index", + "_id": "nXRPNY4BlN82W_Ar9UMY", + "_score": 0.5, + "_source": { + "doc_price": 200, + "doc_index": 5212, + "doc_keyword": "idea", + "category": "statement" + } + }, + { + "_index": "my-nlp-index", + "_id": "nHRPNY4BlN82W_Ar9UMY", + "_score": 0.5, + "_source": { + "doc_price": 350, + "doc_index": 8242, + "doc_keyword": "entire", + "category": "statement" + } + } + ] + }, + "aggregations": { + "total_price": { + "value": 680 + }, + "doc_keywords": { + "doc_count_error_upper_bound": 0, + "sum_other_doc_count": 0, + "buckets": [ + { + "key": "entire", + "doc_count": 1 + }, + { + "key": "idea", + "doc_count": 1 + }, + { + "key": "workable", + "doc_count": 1 + } + ] + } + } +} +``` \ No 
newline at end of file diff --git a/_search-plugins/knn/approximate-knn.md b/_search-plugins/knn/approximate-knn.md index 869058c2aa..16d1a7e686 100644 --- a/_search-plugins/knn/approximate-knn.md +++ b/_search-plugins/knn/approximate-knn.md @@ -21,14 +21,12 @@ The k-NN plugin builds a native library index of the vectors for each knn-vector Because the native library indexes are constructed during indexing, it is not possible to apply a filter on an index and then use this search method. All filters are applied on the results produced by the approximate nearest neighbor search. -### Recommendations for engines and cluster node sizing +## Recommendations for engines and cluster node sizing Each of the three engines used for approximate k-NN search has its own attributes that make one more sensible to use than the others in a given situation. You can follow the general information below to help determine which engine will best meet your requirements. In general, nmslib outperforms both faiss and Lucene on search. However, to optimize for indexing throughput, faiss is a good option. For relatively smaller datasets (up to a few million vectors), the Lucene engine demonstrates better latencies and recall. At the same time, the size of the index is smallest compared to the other engines, which allows it to use smaller AWS instances for data nodes. -Also, the Lucene engine uses a pure Java implementation and does not share any of the limitations that engines using platform-native code experience. However, one exception to this is that the maximum dimension count for the Lucene engine is 1,024, compared with 16,000 for the other engines. Refer to the sample mapping parameters in the following section to see where this is configured. - When considering cluster node sizing, a general approach is to first establish an even distribution of the index across the cluster. However, there are other considerations. To help make these choices, you can refer to the OpenSearch managed service guidance in the section [Sizing domains](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/sizing-domains.html). ## Get started with approximate k-NN @@ -82,7 +80,7 @@ PUT my-knn-index-1 In the example above, both `knn_vector` fields are configured from method definitions. Additionally, `knn_vector` fields can also be configured from models. You can learn more about this in the [knn_vector data type]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/) section. -The `knn_vector` data type supports a vector of floats that can have a dimension count of up to 16,000 for the nmslib and faiss engines, as set by the dimension mapping parameter. The maximum dimension count for the Lucene library is 1,024. +The `knn_vector` data type supports a vector of floats that can have a dimension count of up to 16,000 for the NMSLIB, Faiss, and Lucene engines, as set by the dimension mapping parameter. In OpenSearch, codecs handle the storage and retrieval of indexes. The k-NN plugin uses a custom codec to write vector data to native library indexes so that the underlying k-NN search library can read it. {: .tip } @@ -246,6 +244,10 @@ After data is ingested, it can be search just like any other `knn_vector` field! To learn about using filters with k-NN search, see [k-NN search with filters]({{site.url}}{{site.baseurl}}/search-plugins/knn/filter-search-knn/). 
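+
+For a quick illustration of what such a filtered query looks like (a sketch only---the `my_vector2` field and the `parking` flag are hypothetical, and the linked page describes filtering behavior in detail), a `filter` clause can be passed inside the `knn` query:
+
+```json
+GET my-knn-index-1/_search
+{
+  "query": {
+    "knn": {
+      "my_vector2": {
+        "vector": [2, 3, 5, 6],
+        "k": 2,
+        "filter": {
+          "term": {
+            "parking": true
+          }
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}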
+### Using approximate k-NN with nested fields + +To learn about using k-NN search with nested fields, see [k-NN search with nested fields]({{site.url}}{{site.baseurl}}/search-plugins/knn/nested-search-knn/). + ## Spaces A space corresponds to the function used to measure the distance between two points in order to determine the k-nearest neighbors. From the k-NN perspective, a lower score equates to a closer and better result. This is the opposite of how OpenSearch scores results, where a greater score equates to a better result. To convert distances to OpenSearch scores, we take 1 / (1 + distance). The k-NN plugin supports the following spaces. @@ -285,9 +287,15 @@ Not every method supports each of these spaces. Be sure to check out [the method nmslib and faiss:\[ score = {1 \over 1 + d } \]
Lucene:\[ score = {2 - d \over 2}\] - innerproduct (not supported for Lucene) - \[ d(\mathbf{x}, \mathbf{y}) = - {\mathbf{x} · \mathbf{y}} = - \sum_{i=1}^n x_i y_i \] - \[ \text{If} d \ge 0, \] \[score = {1 \over 1 + d }\] \[\text{If} d < 0, score = −d + 1\] + innerproduct (supported for Lucene in OpenSearch version 2.13 and later) + \[ d(\mathbf{x}, \mathbf{y}) = - {\mathbf{x} · \mathbf{y}} = - \sum_{i=1}^n x_i y_i \] +
Lucene: + \[ d(\mathbf{x}, \mathbf{y}) = {\mathbf{x} · \mathbf{y}} = \sum_{i=1}^n x_i y_i \] + + \[ \text{If} d \ge 0, \] \[score = {1 \over 1 + d }\] \[\text{If} d < 0, score = −d + 1\] +
Lucene: + \[ \text{If} d > 0, score = d + 1 \] \[\text{If} d \le 0\] \[score = {1 \over 1 + (-1 · d) }\] + @@ -295,3 +303,8 @@ The cosine similarity formula does not include the `1 -` prefix. However, becaus smaller scores with closer results, they return `1 - cosineSimilarity` for cosine similarity space---that's why `1 -` is included in the distance function. {: .note } + +With cosine similarity, it is not valid to pass a zero vector (`[0, 0, ...]`) as input. This is because the magnitude of +such a vector is 0, which raises a `divide by 0` exception in the corresponding formula. Requests +containing the zero vector will be rejected and a corresponding exception will be thrown. +{: .note } \ No newline at end of file diff --git a/_search-plugins/knn/knn-index.md b/_search-plugins/knn/knn-index.md index dd81338eec..01b82b425b 100644 --- a/_search-plugins/knn/knn-index.md +++ b/_search-plugins/knn/knn-index.md @@ -11,13 +11,68 @@ has_children: false The k-NN plugin introduces a custom data type, the `knn_vector`, that allows users to ingest their k-NN vectors into an OpenSearch index and perform different kinds of k-NN search. The `knn_vector` field is highly configurable and can serve many different k-NN workloads. For more information, see [k-NN vector]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/). +To create a k-NN index, set the `settings.index.knn` parameter to `true`: + +```json +PUT /test-index +{ + "settings": { + "index": { + "knn": true + } + }, + "mappings": { + "properties": { + "my_vector1": { + "type": "knn_vector", + "dimension": 3, + "method": { + "name": "hnsw", + "space_type": "l2", + "engine": "lucene", + "parameters": { + "ef_construction": 128, + "m": 24 + } + } + } + } + } +} +``` +{% include copy-curl.html %} + ## Lucene byte vector Starting with k-NN plugin version 2.9, you can use `byte` vectors with the `lucene` engine in order to reduce the amount of storage space needed. For more information, see [Lucene byte vector]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#lucene-byte-vector). +## SIMD optimization for the Faiss engine + +Starting with version 2.13, the k-NN plugin supports [Single Instruction Multiple Data (SIMD)](https://en.wikipedia.org/wiki/Single_instruction,_multiple_data) processing if the underlying hardware supports SIMD instructions (AVX2 on x64 architecture and Neon on ARM64 architecture). SIMD is supported by default on Linux machines only for the Faiss engine. SIMD architecture helps boost overall performance by improving indexing throughput and reducing search latency. + +SIMD optimization is applicable only if the vector dimension is a multiple of 8. +{: .note} + + +### x64 architecture + + +For the x64 architecture, two different versions of the Faiss library are built and shipped with the artifact: + +- `libopensearchknn_faiss.so`: The non-optimized Faiss library without SIMD instructions. +- `libopensearchknn_faiss_avx2.so`: The Faiss library that contains AVX2 SIMD instructions. + +If your hardware supports AVX2, the k-NN plugin loads the `libopensearchknn_faiss_avx2.so` library at runtime. + +To disable AVX2 and load the non-optimized Faiss library (`libopensearchknn_faiss.so`), specify the `knn.faiss.avx2.disabled` static setting as `true` in `opensearch.yml` (default is `false`). Note that to update a static setting, you must stop the cluster, change the setting, and restart the cluster. 
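+
+For reference, the following is a minimal `opensearch.yml` sketch of this setting (illustrative only; add the line to the configuration file on each node before restarting the cluster):
+
+```yml
+# Load the non-optimized libopensearchknn_faiss.so library instead of the AVX2-optimized library (static setting)
+knn.faiss.avx2.disabled: true
+```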
For more information, see [Static settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index/#static-settings). + +### ARM64 architecture + +For the ARM64 architecture, only one performance-boosting Faiss library (`libopensearchknn_faiss.so`) is built and shipped. The library contains Neon SIMD instructions and cannot be disabled. + ## Method definitions -A method definition refers to the underlying configuration of the Approximate k-NN algorithm you want to use. Method definitions are used to either create a `knn_vector` field (when the method does not require training) or [create a model during training]({{site.url}}{{site.baseurl}}/search-plugins/knn/api#train-model) that can then be used to [create a `knn_vector` field]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/#building-a-k-nn-index-from-a-model). +A method definition refers to the underlying configuration of the approximate k-NN algorithm you want to use. Method definitions are used to either create a `knn_vector` field (when the method does not require training) or [create a model during training]({{site.url}}{{site.baseurl}}/search-plugins/knn/api#train-model) that can then be used to [create a `knn_vector` field]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/#building-a-k-nn-index-from-a-model). A method definition will always contain the name of the method, the space_type the method is built for, the engine (the library) to use, and a map of parameters. @@ -33,24 +88,27 @@ Mapping parameter | Required | Default | Updatable | Description Method name | Requires training | Supported spaces | Description :--- | :--- | :--- | :--- -`hnsw` | false | l2, innerproduct, cosinesimil, l1, linf | Hierarchical proximity graph approach to Approximate k-NN search. For more details on the algorithm, see this [abstract](https://arxiv.org/abs/1603.09320). +`hnsw` | false | l2, innerproduct, cosinesimil, l1, linf | Hierarchical proximity graph approach to approximate k-NN search. For more details on the algorithm, see this [abstract](https://arxiv.org/abs/1603.09320). #### HNSW parameters Parameter name | Required | Default | Updatable | Description :--- | :--- | :--- | :--- | :--- -`ef_construction` | false | 512 | false | The size of the dynamic list used during k-NN graph creation. Higher values lead to a more accurate graph but slower indexing speed. +`ef_construction` | false | 100 | false | The size of the dynamic list used during k-NN graph creation. Higher values result in a more accurate graph but slower indexing speed. `m` | false | 16 | false | The number of bidirectional links that the plugin creates for each new element. Increasing and decreasing this value can have a large impact on memory consumption. Keep this value between 2 and 100. For nmslib, *ef_search* is set in the [index settings](#index-settings). {: .note} -### Supported faiss methods +An index created in OpenSearch version 2.11 or earlier will still use the old `ef_construction` value (`512`). +{: .note} + +### Supported Faiss methods Method name | Requires training | Supported spaces | Description :--- | :--- | :--- | :--- -`hnsw` | false | l2, innerproduct | Hierarchical proximity graph approach to Approximate k-NN search. -`ivf` | true | l2, innerproduct | Bucketing approach where vectors are assigned different buckets based on clustering and, during search, only a subset of the buckets is searched. 
+`hnsw` | false | l2, innerproduct | Hierarchical proximity graph approach to approximate k-NN search. +`ivf` | true | l2, innerproduct | Stands for _inverted file index_. Bucketing approach where vectors are assigned different buckets based on clustering and, during search, only a subset of the buckets is searched. For hnsw, "innerproduct" is not available when PQ is used. {: .note} @@ -59,11 +117,14 @@ For hnsw, "innerproduct" is not available when PQ is used. Parameter name | Required | Default | Updatable | Description :--- | :--- | :--- | :--- | :--- -`ef_search` | false | 512 | false | The size of the dynamic list used during k-NN searches. Higher values lead to more accurate but slower searches. -`ef_construction` | false | 512 | false | The size of the dynamic list used during k-NN graph creation. Higher values lead to a more accurate graph but slower indexing speed. +`ef_search` | false | 100 | false | The size of the dynamic list used during k-NN searches. Higher values result in more accurate but slower searches. +`ef_construction` | false | 100 | false | The size of the dynamic list used during k-NN graph creation. Higher values result in a more accurate graph but slower indexing speed. `m` | false | 16 | false | The number of bidirectional links that the plugin creates for each new element. Increasing and decreasing this value can have a large impact on memory consumption. Keep this value between 2 and 100. `encoder` | false | flat | false | Encoder definition for encoding vectors. Encoders can reduce the memory footprint of your index, at the expense of search accuracy. +An index created in OpenSearch version 2.11 or earlier will still use the old `ef_construction` and `ef_search` values (`512`). +{: .note} + #### IVF parameters Parameter name | Required | Default | Updatable | Description @@ -84,39 +145,38 @@ Training data can be composed of either the same data that is going to be ingest ### Supported Lucene methods Method name | Requires training | Supported spaces | Description -:--- | :--- | :--- | :--- -`hnsw` | false | l2, cosinesimil | Hierarchical proximity graph approach to Approximate k-NN search. +:--- | :--- |:--------------------------------------------------------------------------------| :--- +`hnsw` | false | l2, cosinesimil, innerproduct (supported in OpenSearch 2.13 and later) | Hierarchical proximity graph approach to approximate k-NN search. #### HNSW parameters Parameter name | Required | Default | Updatable | Description :--- | :--- | :--- | :--- | :--- -`ef_construction` | false | 512 | false | The size of the dynamic list used during k-NN graph creation. Higher values lead to a more accurate graph but slower indexing speed.
The Lucene engine uses the proprietary term "beam_width" to describe this function, which corresponds directly to "ef_construction". To be consistent throughout OpenSearch documentation, we retain the term "ef_construction" to label this parameter. +`ef_construction` | false | 100 | false | The size of the dynamic list used during k-NN graph creation. Higher values result in a more accurate graph but slower indexing speed.
The Lucene engine uses the proprietary term "beam_width" to describe this function, which corresponds directly to "ef_construction". To be consistent throughout the OpenSearch documentation, we retain the term "ef_construction" for this parameter. `m` | false | 16 | false | The number of bidirectional links that the plugin creates for each new element. Increasing and decreasing this value can have a large impact on memory consumption. Keep this value between 2 and 100.
The Lucene engine uses the proprietary term "max_connections" to describe this function, which corresponds directly to "m". To be consistent throughout OpenSearch documentation, we retain the term "m" to label this parameter. Lucene HNSW implementation ignores `ef_search` and dynamically sets it to the value of "k" in the search request. Therefore, there is no need to make settings for `ef_search` when using the Lucene engine. {: .note} +An index created in OpenSearch version 2.11 or earlier will still use the old `ef_construction` value (`512`). +{: .note} + ```json -{ - "type": "knn_vector", - "dimension": 100, - "method": { - "name":"hnsw", - "engine":"lucene", - "space_type": "l2", - "parameters":{ - "m":2048, - "ef_construction": 245 - } +"method": { + "name":"hnsw", + "engine":"lucene", + "space_type": "l2", + "parameters":{ + "m":2048, + "ef_construction": 245 } } ``` -### Supported faiss encoders +### Supported Faiss encoders -You can use encoders to reduce the memory footprint of a k-NN index at the expense of search accuracy. faiss has -several encoder types, but the plugin currently only supports *flat* and *pq* encoding. +You can use encoders to reduce the memory footprint of a k-NN index at the expense of search accuracy. The k-NN plugin currently supports the +`flat`, `pq`, and `sq` encoders in the Faiss library. The following example method definition specifies the `hnsw` method and a `pq` encoder: @@ -142,11 +202,27 @@ The `hnsw` method supports the `pq` encoder for OpenSearch versions 2.10 and lat Encoder name | Requires training | Description :--- | :--- | :--- -`flat` | false | Encode vectors as floating point arrays. This encoding does not reduce memory footprint. +`flat` (Default) | false | Encode vectors as floating-point arrays. This encoding does not reduce memory footprint. `pq` | true | An abbreviation for _product quantization_, it is a lossy compression technique that uses clustering to encode a vector into a fixed size of bytes, with the goal of minimizing the drop in k-NN search accuracy. At a high level, vectors are broken up into `m` subvectors, and then each subvector is represented by a `code_size` code obtained from a code book produced during training. For more information about product quantization, see [this blog post](https://medium.com/dotstar/understanding-faiss-part-2-79d90b1e5388). +`sq` | false | An abbreviation for _scalar quantization_. Starting with k-NN plugin version 2.13, you can use the `sq` encoder to quantize 32-bit floating-point vectors into 16-bit floats. In version 2.13, the built-in `sq` encoder is the SQFP16 Faiss encoder. The encoder reduces memory footprint with a minimal loss of precision and improves performance by using SIMD optimization (using AVX2 on x86 architecture or Neon on ARM64 architecture). For more information, see [Faiss scalar quantization]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-vector-quantization#faiss-scalar-quantization). -#### Examples +#### PQ parameters + +Parameter name | Required | Default | Updatable | Description +:--- | :--- | :--- | :--- | :--- +`m` | false | 1 | false | Determines the number of subvectors into which to break the vector. Subvectors are encoded independently of each other. This vector dimension must be divisible by `m`. Maximum value is 1,024. +`code_size` | false | 8 | false | Determines the number of bits into which to encode a subvector. Maximum value is 8. For IVF, this value must be less than or equal to 8. For HNSW, this value can only be 8. 
+#### SQ parameters + +Parameter name | Required | Default | Updatable | Description +:--- | :--- | :-- | :--- | :--- +`type` | false | `fp16` | false | The type of scalar quantization to be used to encode 32-bit float vectors into the corresponding type. As of OpenSearch 2.13, only the `fp16` encoder type is supported. For the `fp16` encoder, vector values must be in the [-65504.0, 65504.0] range. +`clip` | false | `false` | false | If `true`, then any vector values outside of the supported range for the specified vector type are rounded so that they are in the range. If `false`, then the request is rejected if any vector values are outside of the supported range. Setting `clip` to `true` may decrease recall. + +For more information and examples, see [Using Faiss scalar quantization]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-vector-quantization/#using-faiss-scalar-quantization). + +#### Examples The following example uses the `ivf` method without specifying an encoder (by default, OpenSearch uses the `flat` encoder): @@ -195,12 +271,46 @@ The following example uses the `hnsw` method without specifying an encoder (by d } ``` -#### PQ parameters +The following example uses the `hnsw` method with an `sq` encoder of type `fp16` with `clip` enabled: -Paramater Name | Required | Default | Updatable | Description -:--- | :--- | :--- | :--- | :--- -`m` | false | 1 | false | Determine how many many sub-vectors to break the vector into. sub-vectors are encoded independently of each other. This dimension of the vector must be divisible by `m`. Max value is 1024. -`code_size` | false | 8 | false | Determines the number of bits to encode a sub-vector into. Max value is 8. **Note** --- for IVF, this value must be less than or equal to 8. For HNSW, this value can only be 8. +```json +"method": { + "name":"hnsw", + "engine":"faiss", + "space_type": "l2", + "parameters":{ + "encoder": { + "name": "sq", + "parameters": { + "type": "fp16", + "clip": true + } + }, + "ef_construction": 256, + "m": 8 + } +} +``` + +The following example uses the `ivf` method with an `sq` encoder of type `fp16`: + +```json +"method": { + "name":"ivf", + "engine":"faiss", + "space_type": "l2", + "parameters":{ + "encoder": { + "name": "sq", + "parameters": { + "type": "fp16", + "clip": false + } + }, + "nprobes": 2 + } +} +``` ### Choosing the right method @@ -212,6 +322,8 @@ If you want to use less memory and index faster than HNSW, while maintaining sim If memory is a concern, consider adding a PQ encoder to your HNSW or IVF index. Because PQ is a lossy encoding, query quality will drop. +You can reduce the memory footprint by a factor of 2, with a minimal loss in search quality, by using the [`fp_16` encoder]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-vector-quantization/#faiss-scalar-quantization). If your vector dimensions are within the [-128, 127] byte range, we recommend using the [byte quantizer]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/#lucene-byte-vector) in order to reduce the memory footprint by a factor of 4. To learn more about vector quantization options, see [k-NN vector quantization]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-vector-quantization/). + ### Memory estimation In a typical OpenSearch cluster, a certain portion of RAM is set aside for the JVM heap. The k-NN plugin allocates @@ -221,6 +333,9 @@ the `circuit_breaker_limit` cluster setting. By default, the limit is set at 50% Having a replica doubles the total number of vectors. 
{: .note } +For information about using memory estimation with vector quantization, see the [vector quantization documentation]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-vector-quantization/#memory-estimation). +{: .note } + #### HNSW memory estimation The memory required for HNSW is estimated to be `1.1 * (4 * dimension + 8 * M)` bytes/vector. @@ -250,8 +365,11 @@ At the moment, several parameters defined in the settings are in the deprecation Setting | Default | Updatable | Description :--- | :--- | :--- | :--- -`index.knn` | false | false | Whether the index should build native library indexes for the `knn_vector` fields. If set to false, the `knn_vector` fields will be stored in doc values, but Approximate k-NN search functionality will be disabled. -`index.knn.algo_param.ef_search` | 512 | true | The size of the dynamic list used during k-NN searches. Higher values lead to more accurate but slower searches. Only available for nmslib. -`index.knn.algo_param.ef_construction` | 512 | false | Deprecated in 1.0.0. Use the [mapping parameters](https://opensearch.org/docs/latest/search-plugins/knn/knn-index/#method-definitions) to set this value instead. +`index.knn` | false | false | Whether the index should build native library indexes for the `knn_vector` fields. If set to false, the `knn_vector` fields will be stored in doc values, but approximate k-NN search functionality will be disabled. +`index.knn.algo_param.ef_search` | 100 | true | The size of the dynamic list used during k-NN searches. Higher values result in more accurate but slower searches. Only available for NMSLIB. +`index.knn.algo_param.ef_construction` | 100 | false | Deprecated in 1.0.0. Instead, use the [mapping parameters](https://opensearch.org/docs/latest/search-plugins/knn/knn-index/#method-definitions) to set this value. `index.knn.algo_param.m` | 16 | false | Deprecated in 1.0.0. Use the [mapping parameters](https://opensearch.org/docs/latest/search-plugins/knn/knn-index/#method-definitions) to set this value instead. `index.knn.space_type` | l2 | false | Deprecated in 1.0.0. Use the [mapping parameters](https://opensearch.org/docs/latest/search-plugins/knn/knn-index/#method-definitions) to set this value instead. + +An index created in OpenSearch version 2.11 or earlier will still use the old `ef_construction` and `ef_search` values (`512`). +{: .note} diff --git a/_search-plugins/knn/knn-score-script.md b/_search-plugins/knn/knn-score-script.md index 602346803d..cc79e90850 100644 --- a/_search-plugins/knn/knn-score-script.md +++ b/_search-plugins/knn/knn-score-script.md @@ -313,9 +313,11 @@ A space corresponds to the function used to measure the distance between two poi \[ score = 2 - d \] - innerproduct (not supported for Lucene) - \[ d(\mathbf{x}, \mathbf{y}) = - {\mathbf{x} · \mathbf{y}} = - \sum_{i=1}^n x_i y_i \] - \[ \text{If} d \ge 0, \] \[score = {1 \over 1 + d }\] \[\text{If} d < 0, score = −d + 1\] + innerproduct (supported for Lucene in OpenSearch version 2.13 and later) + \[ d(\mathbf{x}, \mathbf{y}) = - {\mathbf{x} · \mathbf{y}} = - \sum_{i=1}^n x_i y_i \] + + \[ \text{If} d \ge 0, \] \[score = {1 \over 1 + d }\] \[\text{If} d < 0, score = −d + 1\] + hammingbit @@ -326,3 +328,8 @@ A space corresponds to the function used to measure the distance between two poi Cosine similarity returns a number between -1 and 1, and because OpenSearch relevance scores can't be below 0, the k-NN plugin adds 1 to get the final score. 
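+
+For example, if the cosine similarity between a query vector and a document vector is 0.84 (an illustrative value), the returned score is
+
+\[ score = 1 + 0.84 = 1.84 \]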
+ +With cosine similarity, it is not valid to pass a zero vector (`[0, 0, ...`]) as input. This is because the magnitude of +such a vector is 0, which raises a `divide by 0` exception in the corresponding formula. Requests +containing the zero vector will be rejected and a corresponding exception will be thrown. +{: .note } \ No newline at end of file diff --git a/_search-plugins/knn/knn-vector-quantization.md b/_search-plugins/knn/knn-vector-quantization.md new file mode 100644 index 0000000000..3373f104c2 --- /dev/null +++ b/_search-plugins/knn/knn-vector-quantization.md @@ -0,0 +1,159 @@ +--- +layout: default +title: k-NN vector quantization +nav_order: 27 +parent: k-NN search +grand_parent: Search methods +has_children: false +has_math: true +--- + +# k-NN vector quantization + +By default, the k-NN plugin supports the indexing and querying of vectors of type `float`, where each dimension of the vector occupies 4 bytes of memory. For use cases that require ingestion on a large scale, keeping `float` vectors can be expensive because OpenSearch needs to construct, load, save, and search graphs (for native `nmslib` and `faiss` engines). To reduce the memory footprint, you can use vector quantization. + +## Lucene byte vector + +Starting with k-NN plugin version 2.9, you can use `byte` vectors with the `lucene` engine in order to reduce the amount of required memory. This requires quantizing the vectors outside of OpenSearch before ingesting them into an OpenSearch index. For more information, see [Lucene byte vector]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#lucene-byte-vector). + +## Faiss scalar quantization + +Starting with version 2.13, the k-NN plugin supports performing scalar quantization for the Faiss engine within OpenSearch. Within the Faiss engine, a scalar quantizer (SQfp16) performs the conversion between 32-bit and 16-bit vectors. At ingestion time, when you upload 32-bit floating-point vectors to OpenSearch, SQfp16 quantizes them into 16-bit floating-point vectors and stores the quantized vectors in a k-NN index. At search time, SQfp16 decodes the vector values back into 32-bit floating-point values for distance computation. The SQfp16 quantization can decrease the memory footprint by a factor of 2. Additionally, it leads to a minimal loss in recall when differences between vector values are large compared to the error introduced by eliminating their two least significant bits. When used with [SIMD optimization]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index#simd-optimization-for-the-faiss-engine), SQfp16 quantization can also significantly reduce search latencies and improve indexing throughput. + +SIMD optimization is not supported on Windows. Using Faiss scalar quantization on Windows can lead to a significant drop in performance, including decreased indexing throughput and increased search latencies. 
+{: .warning}
+
+### Using Faiss scalar quantization
+
+To use Faiss scalar quantization, set the k-NN vector field's `method.parameters.encoder.name` to `sq` when creating a k-NN index:
+
+```json
+PUT /test-index
+{
+  "settings": {
+    "index": {
+      "knn": true,
+      "knn.algo_param.ef_search": 100
+    }
+  },
+  "mappings": {
+    "properties": {
+      "my_vector1": {
+        "type": "knn_vector",
+        "dimension": 3,
+        "method": {
+          "name": "hnsw",
+          "engine": "faiss",
+          "space_type": "l2",
+          "parameters": {
+            "encoder": {
+              "name": "sq",
+              "parameters": {
+                "type": "fp16",
+                "clip": true
+              }
+            },
+            "ef_construction": 256,
+            "m": 8
+          }
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+Optionally, you can specify the parameters in `method.parameters.encoder`. For more information about `encoder` object parameters, see [SQ parameters]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index/#sq-parameters).
+
+The `fp16` encoder converts 32-bit vectors into their 16-bit counterparts. For this encoder type, the vector values must be in the [-65504.0, 65504.0] range. To define how to handle out-of-range values, the preceding request specifies the `clip` parameter. By default, this parameter is `false`, and any vectors containing out-of-range values are rejected. When `clip` is set to `true` (as in the preceding request), out-of-range vector values are rounded up or down so that they are in the supported range. For example, if the original 32-bit vector is `[65510.82, -65504.1]`, the vector will be indexed as a 16-bit vector `[65504.0, -65504.0]`.
+
+We recommend setting `clip` to `true` only if very few elements lie outside of the supported range. Rounding the values may cause a drop in recall.
+{: .note}
+
+The following example method definition specifies the Faiss SQfp16 encoder, which rejects any indexing request that contains out-of-range vector values (because the `clip` parameter is `false` by default):
+
+```json
+PUT /test-index
+{
+  "settings": {
+    "index": {
+      "knn": true,
+      "knn.algo_param.ef_search": 100
+    }
+  },
+  "mappings": {
+    "properties": {
+      "my_vector1": {
+        "type": "knn_vector",
+        "dimension": 3,
+        "method": {
+          "name": "hnsw",
+          "engine": "faiss",
+          "space_type": "l2",
+          "parameters": {
+            "encoder": {
+              "name": "sq",
+              "parameters": {
+                "type": "fp16"
+              }
+            },
+            "ef_construction": 256,
+            "m": 8
+          }
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+During ingestion, make sure each dimension of the vector is in the supported range ([-65504.0, 65504.0]):
+
+```json
+PUT test-index/_doc/1
+{
+  "my_vector1": [-65504.0, 65503.845, 55.82]
+}
+```
+{% include copy-curl.html %}
+
+During querying, there is no range limitation for the query vector:
+
+```json
+GET test-index/_search
+{
+  "size": 2,
+  "query": {
+    "knn": {
+      "my_vector1": {
+        "vector": [265436.876, -120906.256, 99.84],
+        "k": 2
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Memory estimation
+
+In the best-case scenario, 16-bit vectors produced by the Faiss SQfp16 quantizer require 50% of the memory that 32-bit vectors require.
+
+#### HNSW memory estimation
+
+The memory required for HNSW is estimated to be `1.1 * (2 * dimension + 8 * M)` bytes/vector.
+
+As an example, assume that you have 1 million vectors with a dimension of 256 and M of 16. The memory requirement can be estimated as follows:
+
+```bash
+1.1 * (2 * 256 + 8 * 16) * 1,000,000 ~= 0.656 GB
+```
+
+#### IVF memory estimation
+
+The memory required for IVF is estimated to be `1.1 * (((2 * dimension) * num_vectors) + (4 * nlist * dimension))` bytes.
+ +As an example, assume that you have 1 million vectors with a dimension of 256 and `nlist` of 128. The memory requirement can be estimated as follows: + +```bash +1.1 * (((2 * 256) * 1,000,000) + (4 * 128 * 256)) ~= 0.525 GB +``` + diff --git a/_search-plugins/knn/nested-search-knn.md b/_search-plugins/knn/nested-search-knn.md new file mode 100644 index 0000000000..49465edea5 --- /dev/null +++ b/_search-plugins/knn/nested-search-knn.md @@ -0,0 +1,347 @@ +--- +layout: default +title: k-NN search with nested fields +nav_order: 21 +parent: k-NN search +grand_parent: Search methods +has_children: false +has_math: true +--- + +# k-NN search with nested fields + +Using [nested fields]({{site.url}}{{site.baseurl}}/field-types/nested/) in a k-nearest neighbors (k-NN) index, you can store multiple vectors in a single document. For example, if your document consists of various components, you can generate a vector value for each component and store each vector in a nested field. + +A k-NN document search operates at the field level. For a document with nested fields, OpenSearch examines only the vector nearest to the query vector to decide whether to include the document in the results. For example, consider an index containing documents `A` and `B`. Document `A` is represented by vectors `A1` and `A2`, and document `B` is represented by vector `B1`. Further, the similarity order for a query Q is `A1`, `A2`, `B1`. If you search using query Q with a k value of 2, the search will return both documents `A` and `B` instead of only document `A`. + +Note that in the case of an approximate search, the results are approximations and not exact matches. + +k-NN search with nested fields is supported by the HNSW algorithm for the Lucene and Faiss engines. + + +## Indexing and searching nested fields + +To use k-NN search with nested fields, you must create a k-NN index by setting `index.knn` to `true`. Create a nested field by setting its `type` to `nested` and specify one or more fields of the `knn_vector` data type within the nested field. 
In this example, the `knn_vector` field `my_vector` is nested inside the `nested_field` field: + +```json +PUT my-knn-index-1 +{ + "settings": { + "index": { + "knn": true + } + }, + "mappings": { + "properties": { + "nested_field": { + "type": "nested", + "properties": { + "my_vector": { + "type": "knn_vector", + "dimension": 3, + "method": { + "name": "hnsw", + "space_type": "l2", + "engine": "lucene", + "parameters": { + "ef_construction": 100, + "m": 16 + } + } + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +After you create the index, add some data to it: + +```json +PUT _bulk?refresh=true +{ "index": { "_index": "my-knn-index-1", "_id": "1" } } +{"nested_field":[{"my_vector":[1,1,1]},{"my_vector":[2,2,2]},{"my_vector":[3,3,3]}]} +{ "index": { "_index": "my-knn-index-1", "_id": "2" } } +{"nested_field":[{"my_vector":[10,10,10]},{"my_vector":[20,20,20]},{"my_vector":[30,30,30]}]} +``` +{% include copy-curl.html %} + +Then run a k-NN search on the data by using the `knn` query type: + +```json +GET my-knn-index-1/_search +{ + "query": { + "nested": { + "path": "nested_field", + "query": { + "knn": { + "nested_field.my_vector": { + "vector": [1,1,1], + "k": 2 + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +Even though all three vectors nearest to the query vector are in document 1, the query returns both documents 1 and 2 because k is set to 2: + +```json +{ + "took": 23, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 2, + "relation": "eq" + }, + "max_score": 1, + "hits": [ + { + "_index": "my-knn-index-1", + "_id": "1", + "_score": 1, + "_source": { + "nested_field": [ + { + "my_vector": [ + 1, + 1, + 1 + ] + }, + { + "my_vector": [ + 2, + 2, + 2 + ] + }, + { + "my_vector": [ + 3, + 3, + 3 + ] + } + ] + } + }, + { + "_index": "my-knn-index-1", + "_id": "2", + "_score": 0.0040983604, + "_source": { + "nested_field": [ + { + "my_vector": [ + 10, + 10, + 10 + ] + }, + { + "my_vector": [ + 20, + 20, + 20 + ] + }, + { + "my_vector": [ + 30, + 30, + 30 + ] + } + ] + } + } + ] + } +} +``` + +## k-NN search with filtering on nested fields + +You can apply a filter to a k-NN search with nested fields. A filter can be applied to either a top-level field or a field inside a nested field. + +The following example applies a filter to a top-level field. 
+ +First, create a k-NN index with a nested field: + +```json +PUT my-knn-index-1 +{ + "settings": { + "index": { + "knn": true + } + }, + "mappings": { + "properties": { + "nested_field": { + "type": "nested", + "properties": { + "my_vector": { + "type": "knn_vector", + "dimension": 3, + "method": { + "name": "hnsw", + "space_type": "l2", + "engine": "lucene", + "parameters": { + "ef_construction": 100, + "m": 16 + } + } + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +After you create the index, add some data to it: + +```json +PUT _bulk?refresh=true +{ "index": { "_index": "my-knn-index-1", "_id": "1" } } +{"parking": false, "nested_field":[{"my_vector":[1,1,1]},{"my_vector":[2,2,2]},{"my_vector":[3,3,3]}]} +{ "index": { "_index": "my-knn-index-1", "_id": "2" } } +{"parking": true, "nested_field":[{"my_vector":[10,10,10]},{"my_vector":[20,20,20]},{"my_vector":[30,30,30]}]} +{ "index": { "_index": "my-knn-index-1", "_id": "3" } } +{"parking": true, "nested_field":[{"my_vector":[100,100,100]},{"my_vector":[200,200,200]},{"my_vector":[300,300,300]}]} +``` +{% include copy-curl.html %} + +Then run a k-NN search on the data using the `knn` query type with a filter. The following query returns documents whose `parking` field is set to `true`: + +```json +GET my-knn-index-1/_search +{ + "query": { + "nested": { + "path": "nested_field", + "query": { + "knn": { + "nested_field.my_vector": { + "vector": [ + 1, + 1, + 1 + ], + "k": 3, + "filter": { + "term": { + "parking": true + } + } + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +Even though all three vectors nearest to the query vector are in document 1, the query returns documents 2 and 3 because document 1 is filtered out: + +```json +{ + "took": 10, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 2, + "relation": "eq" + }, + "max_score": 0.0040983604, + "hits": [ + { + "_index": "my-knn-index-1", + "_id": "2", + "_score": 0.0040983604, + "_source": { + "parking": true, + "nested_field": [ + { + "my_vector": [ + 10, + 10, + 10 + ] + }, + { + "my_vector": [ + 20, + 20, + 20 + ] + }, + { + "my_vector": [ + 30, + 30, + 30 + ] + } + ] + } + }, + { + "_index": "my-knn-index-1", + "_id": "3", + "_score": 3.400898E-5, + "_source": { + "parking": true, + "nested_field": [ + { + "my_vector": [ + 100, + 100, + 100 + ] + }, + { + "my_vector": [ + 200, + 200, + 200 + ] + }, + { + "my_vector": [ + 300, + 300, + 300 + ] + } + ] + } + } + ] + } +} +``` diff --git a/_search-plugins/knn/painless-functions.md b/_search-plugins/knn/painless-functions.md index 2b28f753ef..1f27cc29a6 100644 --- a/_search-plugins/knn/painless-functions.md +++ b/_search-plugins/knn/painless-functions.md @@ -67,3 +67,8 @@ cosineSimilarity | `float cosineSimilarity (float[] queryVector, doc['vector fie ``` Because scores can only be positive, this script ranks documents with vector fields higher than those without. + +With cosine similarity, it is not valid to pass a zero vector (`[0, 0, ...`]) as input. This is because the magnitude of +such a vector is 0, which raises a `divide by 0` exception when computing the value. Requests +containing the zero vector will be rejected and a corresponding exception will be thrown. 
+{: .note } \ No newline at end of file diff --git a/_search-plugins/knn/settings.md b/_search-plugins/knn/settings.md index 1f43654fbe..f4ef057cfb 100644 --- a/_search-plugins/knn/settings.md +++ b/_search-plugins/knn/settings.md @@ -25,3 +25,4 @@ Setting | Default | Description `knn.model.index.number_of_shards`| 1 | The number of shards to use for the model system index, the OpenSearch index that stores the models used for Approximate Nearest Neighbor (ANN) search. `knn.model.index.number_of_replicas`| 1 | The number of replica shards to use for the model system index. Generally, in a multi-node cluster, this should be at least 1 to increase stability. `knn.advanced.filtered_exact_search_threshold`| null | The threshold value for the filtered IDs that is used to switch to exact search during filtered ANN search. If the number of filtered IDs in a segment is less than this setting's value, exact search will be performed on the filtered IDs. +`knn.faiss.avx2.disabled` | False | A static setting that specifies whether to disable the SIMD-based `libopensearchknn_faiss_avx2.so` library and load the non-optimized `libopensearchknn_faiss.so` library for the Faiss engine on machines with x64 architecture. For more information, see [SIMD optimization for the Faiss engine]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index/#simd-optimization-for-the-faiss-engine). diff --git a/_search-plugins/neural-search-tutorial.md b/_search-plugins/neural-search-tutorial.md index 3803a417c5..5f5a5fe79e 100644 --- a/_search-plugins/neural-search-tutorial.md +++ b/_search-plugins/neural-search-tutorial.md @@ -159,7 +159,7 @@ OpenSearch sends back the model group ID: You'll use this ID to register the chosen model to the model group. -
+
Test it @@ -271,7 +271,7 @@ Once the task is complete, the task state will be `COMPLETED` and the Tasks API You'll need the model ID in order to use this model for several of the following steps. -
+
Test it @@ -391,7 +391,7 @@ Once the task is complete, the task state will be `COMPLETED`: } ``` -
+
Test it @@ -469,7 +469,7 @@ PUT /_ingest/pipeline/nlp-ingest-pipeline ``` {% include copy-curl.html %} -
+
Test it @@ -541,7 +541,7 @@ PUT /my-nlp-index Setting up a k-NN index allows you to later perform a vector search on the `passage_embedding` field. -
+
Test it @@ -670,7 +670,7 @@ GET /my-nlp-index/_search Document 3 is not returned because it does not contain the specified keywords. Documents containing the words `rodeo` and `cowboy` are scored lower because semantic meaning is not considered: -
+
Results @@ -762,7 +762,7 @@ GET /my-nlp-index/_search This time, the response not only contains all five documents, but the document order is also improved because neural search considers semantic meaning: -
+
Results @@ -910,7 +910,7 @@ GET /my-nlp-index/_search?search_pipeline=nlp-search-pipeline Not only does OpenSearch return documents that match the semantic meaning of `wild west`, but now the documents containing words related to the wild west theme are also scored higher relative to the others: -
+
Results diff --git a/_search-plugins/neural-sparse-search.md b/_search-plugins/neural-sparse-search.md index c46da172a7..58918565c4 100644 --- a/_search-plugins/neural-sparse-search.md +++ b/_search-plugins/neural-sparse-search.md @@ -55,6 +55,9 @@ PUT /_ingest/pipeline/nlp-ingest-pipeline-sparse ``` {% include copy-curl.html %} +To split long text into passages, use the `text_chunking` ingest processor before the `sparse_encoding` processor. For more information, see [Text chunking]({{site.url}}{{site.baseurl}}/search-plugins/text-chunking/). + + ## Step 2: Create an index for ingestion In order to use the text embedding processor defined in your pipeline, create a rank features index, adding the pipeline created in the previous step as the default pipeline. Ensure that the fields defined in the `field_map` are mapped as correct types. Continuing with the example, the `passage_embedding` field must be mapped as [`rank_features`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/rank/#rank-features). Similarly, the `passage_text` field should be mapped as `text`. @@ -154,8 +157,7 @@ GET my-nlp-index/_search "neural_sparse": { "passage_embedding": { "query_text": "Hi world", - "model_id": "aP2Q8ooBpBj3wT4HVS8a", - "max_token_score": 2 + "model_id": "aP2Q8ooBpBj3wT4HVS8a" } } } @@ -238,3 +240,133 @@ The response contains the matching documents: } } ``` + +## Setting a default model on an index or field + +A [`neural_sparse`]({{site.url}}{{site.baseurl}}/query-dsl/specialized/neural-sparse/) query requires a model ID for generating sparse embeddings. To eliminate passing the model ID with each neural_sparse query request, you can set a default model on index-level or field-level. + +First, create a [search pipeline]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/index/) with a [`neural_query_enricher`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/neural-query-enricher/) request processor. To set a default model for an index, provide the model ID in the `default_model_id` parameter. To set a default model for a specific field, provide the field name and the corresponding model ID in the `neural_field_default_id` map. If you provide both `default_model_id` and `neural_field_default_id`, `neural_field_default_id` takes precedence: + +```json +PUT /_search/pipeline/default_model_pipeline +{ + "request_processors": [ + { + "neural_query_enricher" : { + "default_model_id": "bQ1J8ooBpBj3wT4HVUsb", + "neural_field_default_id": { + "my_field_1": "uZj0qYoBMtvQlfhaYeud", + "my_field_2": "upj0qYoBMtvQlfhaZOuM" + } + } + } + ] +} +``` +{% include copy-curl.html %} + +Then set the default model for your index: + +```json +PUT /my-nlp-index/_settings +{ + "index.search.default_pipeline" : "default_model_pipeline" +} +``` +{% include copy-curl.html %} + +You can now omit the model ID when searching: + +```json +GET /my-nlp-index/_search +{ + "query": { + "neural_sparse": { + "passage_embedding": { + "query_text": "Hi world" + } + } + } +} +``` +{% include copy-curl.html %} + +The response contains both documents: + +```json +{ + "took" : 688, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 2, + "relation" : "eq" + }, + "max_score" : 30.0029, + "hits" : [ + { + "_index" : "my-nlp-index", + "_id" : "1", + "_score" : 30.0029, + "_source" : { + "passage_text" : "Hello world", + "passage_embedding" : { + "!" 
: 0.8708904, + "door" : 0.8587369, + "hi" : 2.3929274, + "worlds" : 2.7839446, + "yes" : 0.75845814, + "##world" : 2.5432441, + "born" : 0.2682308, + "nothing" : 0.8625516, + "goodbye" : 0.17146169, + "greeting" : 0.96817183, + "birth" : 1.2788506, + "come" : 0.1623208, + "global" : 0.4371151, + "it" : 0.42951578, + "life" : 1.5750692, + "thanks" : 0.26481047, + "world" : 4.7300377, + "tiny" : 0.5462298, + "earth" : 2.6555297, + "universe" : 2.0308156, + "worldwide" : 1.3903781, + "hello" : 6.696973, + "so" : 0.20279501, + "?" : 0.67785245 + }, + "id" : "s1" + } + }, + { + "_index" : "my-nlp-index", + "_id" : "2", + "_score" : 16.480486, + "_source" : { + "passage_text" : "Hi planet", + "passage_embedding" : { + "hi" : 4.338913, + "planets" : 2.7755864, + "planet" : 5.0969057, + "mars" : 1.7405145, + "earth" : 2.6087382, + "hello" : 3.3210192 + }, + "id" : "s2" + } + } + ] + } +} +``` + +## Next steps + +- To learn more about splitting long text into passages for neural search, see [Text chunking]({{site.url}}{{site.baseurl}}/search-plugins/text-chunking/). \ No newline at end of file diff --git a/_search-plugins/search-pipelines/collapse-processor.md b/_search-plugins/search-pipelines/collapse-processor.md new file mode 100644 index 0000000000..cea0a15396 --- /dev/null +++ b/_search-plugins/search-pipelines/collapse-processor.md @@ -0,0 +1,144 @@ +--- +layout: default +title: Collapse +nav_order: 7 +has_children: false +parent: Search processors +grand_parent: Search pipelines +--- + +# Collapse processor + +The `collapse` response processor discards hits that have the same value for a particular field as a previous document in the result set. +This is similar to passing the `collapse` parameter in a search request, but the response processor is applied to the +response after fetching from all shards. The `collapse` response processor may be used in conjunction with the `rescore` search +request parameter or may be applied after a reranking response processor. + +Using the `collapse` response processor will likely result in fewer than `size` results being returned because hits are discarded +from a set whose size is already less than or equal to `size`. To increase the likelihood of returning `size` hits, use the +`oversample` request processor and `truncate_hits` response processor, as shown in [this example]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/truncate-hits-processor/#oversample-collapse-and-truncate-hits). + +## Request fields + +The following table lists all request fields. + +Field | Data type | Description +:--- | :--- | :--- +`field` | String | The field whose value will be read from each returned search hit. Only the first hit for each given field value will be returned in the search response. Required. +`context_prefix` | String | May be used to read the `original_size` variable from a specific scope in order to avoid collisions. Optional. +`tag` | String | The processor's identifier. Optional. +`description` | String | A description of the processor. Optional. +`ignore_failure` | Boolean | If `true`, OpenSearch [ignores any failure]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/creating-search-pipeline/#ignoring-processor-failures) of this processor and continues to run the remaining processors in the search pipeline. Optional. Default is `false`. + +## Example + +The following example demonstrates using a search pipeline with a `collapse` processor. 
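+
+For comparison, the request-level `collapse` parameter that this processor mirrors is applied at query time rather than to the final response. The following request is an illustrative sketch only: it assumes that dynamic mapping has created a `color.keyword` subfield for the `color` field used in the setup below:
+
+```json
+POST /my_index/_search
+{
+  "size": 3,
+  "collapse": {
+    "field": "color.keyword"
+  }
+}
+```
+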
+ +### Setup + +Create many documents containing a field to use for collapsing: + +```json +POST /_bulk +{ "create":{"_index":"my_index","_id":1}} +{ "title" : "document 1", "color":"blue" } +{ "create":{"_index":"my_index","_id":2}} +{ "title" : "document 2", "color":"blue" } +{ "create":{"_index":"my_index","_id":3}} +{ "title" : "document 3", "color":"red" } +{ "create":{"_index":"my_index","_id":4}} +{ "title" : "document 4", "color":"red" } +{ "create":{"_index":"my_index","_id":5}} +{ "title" : "document 5", "color":"yellow" } +{ "create":{"_index":"my_index","_id":6}} +{ "title" : "document 6", "color":"yellow" } +{ "create":{"_index":"my_index","_id":7}} +{ "title" : "document 7", "color":"orange" } +{ "create":{"_index":"my_index","_id":8}} +{ "title" : "document 8", "color":"orange" } +{ "create":{"_index":"my_index","_id":9}} +{ "title" : "document 9", "color":"green" } +{ "create":{"_index":"my_index","_id":10}} +{ "title" : "document 10", "color":"green" } +``` +{% include copy-curl.html %} + +Create a pipeline that only collapses on the `color` field: + +```json +PUT /_search/pipeline/collapse_pipeline +{ + "response_processors": [ + { + "collapse" : { + "field": "color" + } + } + ] +} +``` +{% include copy-curl.html %} + +### Using a search pipeline + +In this example, you request the top three documents before collapsing on the `color` field. Because the first two documents have the same `color`, the second one is discarded, +and the request returns the first and third documents: + +```json +POST /my_index/_search?search_pipeline=collapse_pipeline +{ + "size": 3 +} +``` +{% include copy-curl.html %} + + +
+ + Response + + {: .text-delta} + +```json +{ + "took" : 2, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 10, + "relation" : "eq" + }, + "max_score" : 1.0, + "hits" : [ + { + "_index" : "my_index", + "_id" : "1", + "_score" : 1.0, + "_source" : { + "title" : "document 1", + "color" : "blue" + } + }, + { + "_index" : "my_index", + "_id" : "3", + "_score" : 1.0, + "_source" : { + "title" : "document 3", + "color" : "red" + } + } + ] + }, + "profile" : { + "shards" : [ ] + } +} +``` +
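+
+If you don't want to store a named pipeline, you can typically define the same processor as a temporary search pipeline directly in the request body. The following sketch is equivalent to the preceding example and assumes the same `my_index` data:
+
+```json
+POST /my_index/_search
+{
+  "size": 3,
+  "search_pipeline": {
+    "response_processors": [
+      {
+        "collapse": {
+          "field": "color"
+        }
+      }
+    ]
+  }
+}
+```
+{% include copy-curl.html %}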
diff --git a/_search-plugins/search-pipelines/neural-query-enricher.md b/_search-plugins/search-pipelines/neural-query-enricher.md index 117eca5051..e187ea17a9 100644 --- a/_search-plugins/search-pipelines/neural-query-enricher.md +++ b/_search-plugins/search-pipelines/neural-query-enricher.md @@ -9,7 +9,7 @@ grand_parent: Search pipelines # Neural query enricher processor -The `neural_query_enricher` search request processor is designed to set a default machine learning (ML) model ID at the index or field level for [neural search]({{site.url}}{{site.baseurl}}/search-plugins/neural-search/) queries. To learn more about ML models, see [Using ML models within OpenSearch]({{site.url}}{{site.baseurl}}/ml-commons-plugin/using-ml-models/) and [Connecting to remote models]({{site.url}}{{site.baseurl}}ml-commons-plugin/remote-models/index/). +The `neural_query_enricher` search request processor is designed to set a default machine learning (ML) model ID at the index or field level for [neural search]({{site.url}}{{site.baseurl}}/search-plugins/neural-search/) queries. To learn more about ML models, see [Using ML models within OpenSearch]({{site.url}}{{site.baseurl}}/ml-commons-plugin/using-ml-models/) and [Connecting to remote models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/index/). ## Request fields diff --git a/_search-plugins/search-pipelines/oversample-processor.md b/_search-plugins/search-pipelines/oversample-processor.md new file mode 100644 index 0000000000..698d9572cf --- /dev/null +++ b/_search-plugins/search-pipelines/oversample-processor.md @@ -0,0 +1,292 @@ +--- +layout: default +title: Oversample +nav_order: 17 +has_children: false +parent: Search processors +grand_parent: Search pipelines +--- + +# Oversample processor + +The `oversample` request processor multiplies the `size` parameter of the search request by a specified `sample_factor` (>= 1.0), saving the original value in the `original_size` pipeline variable. The `oversample` processor is designed to work with the [`truncate_hits` response processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/truncate-hits-processor/) but may be used on its own. + +## Request fields + +The following table lists all request fields. + +Field | Data type | Description +:--- | :--- | :--- +`sample_factor` | Float | The multiplicative factor (>= 1.0) that will be applied to the `size` parameter before processing the search request. Required. +`context_prefix` | String | May be used to scope the `original_size` variable in order to avoid collisions. Optional. +`tag` | String | The processor's identifier. Optional. +`description` | String | A description of the processor. Optional. +`ignore_failure` | Boolean | If `true`, OpenSearch [ignores any failure]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/creating-search-pipeline/#ignoring-processor-failures) of this processor and continues to run the remaining processors in the search pipeline. Optional. Default is `false`. + + +## Example + +The following example demonstrates using a search pipeline with an `oversample` processor. 
+ +### Setup + +Create an index named `my_index` containing many documents: + +```json +POST /_bulk +{ "create":{"_index":"my_index","_id":1}} +{ "doc": { "title" : "document 1" }} +{ "create":{"_index":"my_index","_id":2}} +{ "doc": { "title" : "document 2" }} +{ "create":{"_index":"my_index","_id":3}} +{ "doc": { "title" : "document 3" }} +{ "create":{"_index":"my_index","_id":4}} +{ "doc": { "title" : "document 4" }} +{ "create":{"_index":"my_index","_id":5}} +{ "doc": { "title" : "document 5" }} +{ "create":{"_index":"my_index","_id":6}} +{ "doc": { "title" : "document 6" }} +{ "create":{"_index":"my_index","_id":7}} +{ "doc": { "title" : "document 7" }} +{ "create":{"_index":"my_index","_id":8}} +{ "doc": { "title" : "document 8" }} +{ "create":{"_index":"my_index","_id":9}} +{ "doc": { "title" : "document 9" }} +{ "create":{"_index":"my_index","_id":10}} +{ "doc": { "title" : "document 10" }} +``` +{% include copy-curl.html %} + +### Creating a search pipeline + +The following request creates a search pipeline named `my_pipeline` with an `oversample` request processor that requests 50% more hits than specified in `size`: + +```json +PUT /_search/pipeline/my_pipeline +{ + "request_processors": [ + { + "oversample" : { + "tag" : "oversample_1", + "description" : "This processor will multiply `size` by 1.5.", + "sample_factor" : 1.5 + } + } + ] +} +``` +{% include copy-curl.html %} + +### Using a search pipeline + +Search for documents in `my_index` without a search pipeline: + +```json +POST /my_index/_search +{ + "size": 5 +} +``` +{% include copy-curl.html %} + +The response contains five hits: + +
+ + Response + + {: .text-delta} + +```json +{ + "took" : 3, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 10, + "relation" : "eq" + }, + "max_score" : 1.0, + "hits" : [ + { + "_index" : "my_index", + "_id" : "1", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 1" + } + } + }, + { + "_index" : "my_index", + "_id" : "2", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 2" + } + } + }, + { + "_index" : "my_index", + "_id" : "3", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 3" + } + } + }, + { + "_index" : "my_index", + "_id" : "4", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 4" + } + } + }, + { + "_index" : "my_index", + "_id" : "5", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 5" + } + } + } + ] + } +} +``` +
+ +To search with a pipeline, specify the pipeline name in the `search_pipeline` query parameter: + +```json +POST /my_index/_search?search_pipeline=my_pipeline +{ + "size": 5 +} +``` +{% include copy-curl.html %} + +The response contains 8 documents (5 * 1.5 = 7.5, rounded up to 8): + +
+ + Response + + {: .text-delta} + +```json +{ + "took" : 13, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 10, + "relation" : "eq" + }, + "max_score" : 1.0, + "hits" : [ + { + "_index" : "my_index", + "_id" : "1", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 1" + } + } + }, + { + "_index" : "my_index", + "_id" : "2", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 2" + } + } + }, + { + "_index" : "my_index", + "_id" : "3", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 3" + } + } + }, + { + "_index" : "my_index", + "_id" : "4", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 4" + } + } + }, + { + "_index" : "my_index", + "_id" : "5", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 5" + } + } + }, + { + "_index" : "my_index", + "_id" : "6", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 6" + } + } + }, + { + "_index" : "my_index", + "_id" : "7", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 7" + } + } + }, + { + "_index" : "my_index", + "_id" : "8", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 8" + } + } + } + ] + } +} +``` +
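+
+Because the processor saves the original `size` value in the `original_size` pipeline variable, it is most often paired with the `truncate_hits` response processor, which trims the response back to the originally requested number of hits. A minimal sketch of such a combined pipeline follows; the `truncate_hits` processor omits `target_size` so that it falls back to the saved `original_size` value:
+
+```json
+PUT /_search/pipeline/oversample_then_truncate
+{
+  "request_processors": [
+    {
+      "oversample": {
+        "description": "Request 50% more hits than the original size.",
+        "sample_factor": 1.5
+      }
+    }
+  ],
+  "response_processors": [
+    {
+      "truncate_hits": {
+        "description": "Sketch: truncates back to the original size saved by the oversample processor."
+      }
+    }
+  ]
+}
+```
+{% include copy-curl.html %}
+
+For an end-to-end example that also deduplicates results between these two steps, see [Oversample, collapse, and truncate hits]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/truncate-hits-processor/#oversample-collapse-and-truncate-hits).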
diff --git a/_search-plugins/search-pipelines/personalize-search-ranking.md b/_search-plugins/search-pipelines/personalize-search-ranking.md index b73ebb7476..c7a7dd8dde 100644 --- a/_search-plugins/search-pipelines/personalize-search-ranking.md +++ b/_search-plugins/search-pipelines/personalize-search-ranking.md @@ -1,7 +1,7 @@ --- layout: default title: Personalize search ranking -nav_order: 40 +nav_order: 18 has_children: false parent: Search processors grand_parent: Search pipelines diff --git a/_search-plugins/search-pipelines/rag-processor.md b/_search-plugins/search-pipelines/rag-processor.md new file mode 100644 index 0000000000..7137134aff --- /dev/null +++ b/_search-plugins/search-pipelines/rag-processor.md @@ -0,0 +1,100 @@ +--- +layout: default +title: Retrieval-augmented generation +nav_order: 18 +has_children: false +parent: Search processors +grand_parent: Search pipelines +--- + +# Retrieval-augmented generation processor + +The `retrieval_augmented_generation` processor is a search results processor that you can use in [conversational search]({{site.url}}{{site.baseurl}}/search-plugins/conversational-search/) for retrieval-augmented generation (RAG). The processor intercepts query results, retrieves previous messages from the conversation from the conversational memory, and sends a prompt to a large language model (LLM). After the processor receives a response from the LLM, it saves the response in conversational memory and returns both the original OpenSearch query results and the LLM response. + +As of OpenSearch 2.12, the `retrieval_augmented_generation` processor supports only OpenAI and Amazon Bedrock models. +{: .note} + +## Request fields + +The following table lists all available request fields. + +Field | Data type | Description +:--- | :--- | :--- +`model_id` | String | The ID of the model used in the pipeline. Required. +`context_field_list` | Array | A list of fields contained in document sources that the pipeline uses as context for RAG. Required. For more information, see [Context field list](#context-field-list). +`system_prompt` | String | The system prompt that is sent to the LLM to adjust its behavior, such as its response tone. Can be a persona description or a set of instructions. Optional. +`user_instructions` | String | Human-generated instructions sent to the LLM to guide it in producing results. +`tag` | String | The processor's identifier. Optional. +`description` | String | A description of the processor. Optional. + +### Context field list + +The `context_field_list` is a list of fields contained in document sources that the pipeline uses as context for RAG. For example, suppose your OpenSearch index contains a collection of documents, each including a `title` and `text`: + +```json +{ + "_index": "qa_demo", + "_id": "SimKcIoBOVKVCYpk1IL-", + "_source": { + "title": "Abraham Lincoln 2", + "text": "Abraham Lincoln was born on February 12, 1809, the second child of Thomas Lincoln and Nancy Hanks Lincoln, in a log cabin on Sinking Spring Farm near Hodgenville, Kentucky.[2] He was a descendant of Samuel Lincoln, an Englishman who migrated from Hingham, Norfolk, to its namesake, Hingham, Massachusetts, in 1638. 
The family then migrated west, passing through New Jersey, Pennsylvania, and Virginia.[3] Lincoln was also a descendant of the Harrison family of Virginia; his paternal grandfather and namesake, Captain Abraham Lincoln and wife Bathsheba (née Herring) moved the family from Virginia to Jefferson County, Kentucky.[b] The captain was killed in an Indian raid in 1786.[5] His children, including eight-year-old Thomas, Abraham's father, witnessed the attack.[6][c] Thomas then worked at odd jobs in Kentucky and Tennessee before the family settled in Hardin County, Kentucky, in the early 1800s.[6]\n" + } +} +``` + +You can specify that only the `text` contents should be sent to the LLM by setting `"context_field_list": ["text"]` in the processor. + +## Example + +The following example demonstrates using a search pipeline with a `retrieval_augmented_generation` processor. + +### Creating a search pipeline + +The following request creates a search pipeline containing a `retrieval_augmented_generation` processor for an OpenAI model: + +```json +PUT /_search/pipeline/rag_pipeline +{ + "response_processors": [ + { + "retrieval_augmented_generation": { + "tag": "openai_pipeline_demo", + "description": "Demo pipeline Using OpenAI Connector", + "model_id": "gnDIbI0BfUsSoeNT_jAw", + "context_field_list": ["text"], + "system_prompt": "You are a helpful assistant", + "user_instructions": "Generate a concise and informative answer in less than 100 words for the given question" + } + } + ] +} +``` +{% include copy-curl.html %} + +### Using a search pipeline + +Combine an OpenSearch query with an `ext` object that stores generative question answering parameters for the LLM: + +```json +GET /my_rag_test_data/_search?search_pipeline=rag_pipeline +{ + "query": { + "match": { + "text": "Abraham Lincoln" + } + }, + "ext": { + "generative_qa_parameters": { + "llm_model": "gpt-3.5-turbo", + "llm_question": "Was Abraham Lincoln a good politician", + "memory_id": "iXC4bI0BfUsSoeNTjS30", + "context_size": 5, + "message_size": 5, + "timeout": 15 + } + } +} +``` +{% include copy-curl.html %} + +For more information about setting up conversational search, see [Using conversational search]({{site.url}}{{site.baseurl}}/search-plugins/conversational-search/#using-conversational-search). diff --git a/_search-plugins/search-pipelines/rerank-processor.md b/_search-plugins/search-pipelines/rerank-processor.md new file mode 100644 index 0000000000..73bacd35c9 --- /dev/null +++ b/_search-plugins/search-pipelines/rerank-processor.md @@ -0,0 +1,116 @@ +--- +layout: default +title: Rerank +nav_order: 25 +has_children: false +parent: Search processors +grand_parent: Search pipelines +--- + +# Rerank processor + +The `rerank` search request processor intercepts search results and passes them to a cross-encoder model to be reranked. The model reranks the results, taking into account the scoring context. Then the processor orders documents in the search results based on their new scores. + +## Request fields + +The following table lists all available request fields. + +Field | Data type | Description +:--- | :--- | :--- +`` | Object | The reranker type provides the rerank processor with static information needed across all reranking calls. Required. +`context` | Object | Provides the rerank processor with information necessary for generating reranking context at query time. +`tag` | String | The processor's identifier. Optional. +`description` | String | A description of the processor. Optional. 
+`ignore_failure` | Boolean | If `true`, OpenSearch [ignores any failure]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/creating-search-pipeline/#ignoring-processor-failures) of this processor and continues to run the remaining processors in the search pipeline. Optional. Default is `false`. + +### The `ml_opensearch` reranker type + +The `ml_opensearch` reranker type is designed to work with the cross-encoder model provided by OpenSearch. For this reranker type, specify the following fields. + +Field | Data type | Description +:--- | :--- | :--- +`ml_opensearch` | Object | Provides the rerank processor with model information. Required. +`ml_opensearch.model_id` | String | The model ID for the cross-encoder model. Required. For more information, see [Using ML models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/using-ml-models/). +`context.document_fields` | Array | An array of document fields that specifies the fields from which to retrieve context for the cross-encoder model. Required. + +## Example + +The following example demonstrates using a search pipeline with a `rerank` processor. + +### Creating a search pipeline + +The following request creates a search pipeline with a `rerank` response processor: + +```json +PUT /_search/pipeline/rerank_pipeline +{ + "response_processors": [ + { + "rerank": { + "ml_opensearch": { + "model_id": "gnDIbI0BfUsSoeNT_jAw" + }, + "context": { + "document_fields": [ "title", "text_representation"] + } + } + } + ] +} +``` +{% include copy-curl.html %} + +### Using a search pipeline + +Combine an OpenSearch query with an `ext` object that contains the query context for the large language model (LLM). Provide the `query_text` that will be used to rerank the results: + +```json +POST /_search?search_pipeline=rerank_pipeline +{ + "query": { + "match": { + "text_representation": "Where is Albuquerque?" + } + }, + "ext": { + "rerank": { + "query_context": { + "query_text": "Where is Albuquerque?" + } + } + } +} +``` +{% include copy-curl.html %} + +Instead of specifying `query_text`, you can provide a full path to the field containing text to use for reranking. For example, if you specify a subfield `query` in the `text_representation` object, specify its path in the `query_text_path` parameter: + +```json +POST /_search?search_pipeline=rerank_pipeline +{ + "query": { + "match": { + "text_representation": { + "query": "Where is Albuquerque?" + } + } + }, + "ext": { + "rerank": { + "query_context": { + "query_text_path": "query.match.text_representation.query" + } + } + } +} +``` +{% include copy-curl.html %} + +The `query_context` object contains the following fields. + +Field name | Description +:--- | :--- +`query_text` | The natural language text of the question that you want to use to rerank the search results. Either `query_text` or `query_text_path` (not both) is required. +`query_text_path` | The full JSON path to the text of the question that you want to use to rerank the search results. Either `query_text` or `query_text_path` (not both) is required. The maximum number of characters in the path is `1000`. + +For more information about setting up reranking, see [Reranking search results]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/reranking-search-results/). 
\ No newline at end of file diff --git a/_search-plugins/search-pipelines/search-processors.md b/_search-plugins/search-pipelines/search-processors.md index a8f55b7321..5e53cf5615 100644 --- a/_search-plugins/search-pipelines/search-processors.md +++ b/_search-plugins/search-pipelines/search-processors.md @@ -24,8 +24,10 @@ The following table lists all supported search request processors. Processor | Description | Earliest available version :--- | :--- | :--- [`filter_query`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/filter-query-processor/) | Adds a filtering query that is used to filter requests. | 2.8 -[`neural_query_enricher`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/neural-query-enricher/) | Sets a default model for neural search at the index or field level. | 2.11 +[`neural_query_enricher`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/neural-query-enricher/) | Sets a default model for neural search and neural sparse search at the index or field level. | 2.11(neural), 2.13(neural sparse) [`script`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/script-processor/) | Adds a script that is run on newly indexed documents. | 2.8 +[`oversample`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/oversample-processor/) | Increases the search request `size` parameter, storing the original value in the pipeline state. | 2.12 + ## Search response processors @@ -36,7 +38,11 @@ The following table lists all supported search response processors. Processor | Description | Earliest available version :--- | :--- | :--- [`personalize_search_ranking`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/personalize-search-ranking/) | Uses [Amazon Personalize](https://aws.amazon.com/personalize/) to rerank search results (requires setting up the Amazon Personalize service). | 2.9 +[`retrieval_augmented_generation`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rag-processor/) | Used for retrieval-augmented generation (RAG) in [conversational search]({{site.url}}{{site.baseurl}}/search-plugins/conversational-search/). | 2.10 (generally available in 2.12) [`rename_field`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rename-field-processor/)| Renames an existing field. | 2.8 +[`rerank`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/)| Reranks search results using a cross-encoder model. | 2.12 +[`collapse`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/collapse-processor/)| Deduplicates search hits based on a field value, similarly to `collapse` in a search request. | 2.12 +[`truncate_hits`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/truncate-hits-processor/)| Discards search hits after a specified target count is reached. Can undo the effect of the `oversample` request processor. | 2.12 ## Search phase results processors diff --git a/_search-plugins/search-pipelines/truncate-hits-processor.md b/_search-plugins/search-pipelines/truncate-hits-processor.md new file mode 100644 index 0000000000..871879efe3 --- /dev/null +++ b/_search-plugins/search-pipelines/truncate-hits-processor.md @@ -0,0 +1,516 @@ +--- +layout: default +title: Truncate hits +nav_order: 35 +has_children: false +parent: Search processors +grand_parent: Search pipelines +--- + +# Truncate hits processor + +The `truncate_hits` response processor discards returned search hits after a given hit count is reached. 
The `truncate_hits` processor is designed to work with the [`oversample` request processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/oversample-processor/) but may be used on its own. + +The `target_size` parameter (which specifies where to truncate) is optional. If it is not specified, then OpenSearch uses the `original_size` variable set by the +`oversample` processor (if available). + +The following is a common usage pattern: + +1. Add the `oversample` processor to a request pipeline to fetch a larger set of results. +1. In the response pipeline, apply a reranking processor (which may promote results from beyond the originally requested top N) or the `collapse` processor (which may discard results after deduplication). +1. Apply the `truncate` processor to return (at most) the originally requested number of hits. + +## Request fields + +The following table lists all request fields. + +Field | Data type | Description +:--- | :--- | :--- +`target_size` | Integer | The maximum number of search hits to return (>=0). If not specified, the processor will try to read the `original_size` variable and will fail if it is not available. Optional. +`context_prefix` | String | May be used to read the `original_size` variable from a specific scope in order to avoid collisions. Optional. +`tag` | String | The processor's identifier. Optional. +`description` | String | A description of the processor. Optional. +`ignore_failure` | Boolean | If `true`, OpenSearch [ignores any failure]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/creating-search-pipeline/#ignoring-processor-failures) of this processor and continues to run the remaining processors in the search pipeline. Optional. Default is `false`. + +## Example + +The following example demonstrates using a search pipeline with a `truncate` processor. + +### Setup + +Create an index named `my_index` containing many documents: + +```json +POST /_bulk +{ "create":{"_index":"my_index","_id":1}} +{ "doc": { "title" : "document 1" }} +{ "create":{"_index":"my_index","_id":2}} +{ "doc": { "title" : "document 2" }} +{ "create":{"_index":"my_index","_id":3}} +{ "doc": { "title" : "document 3" }} +{ "create":{"_index":"my_index","_id":4}} +{ "doc": { "title" : "document 4" }} +{ "create":{"_index":"my_index","_id":5}} +{ "doc": { "title" : "document 5" }} +{ "create":{"_index":"my_index","_id":6}} +{ "doc": { "title" : "document 6" }} +{ "create":{"_index":"my_index","_id":7}} +{ "doc": { "title" : "document 7" }} +{ "create":{"_index":"my_index","_id":8}} +{ "doc": { "title" : "document 8" }} +{ "create":{"_index":"my_index","_id":9}} +{ "doc": { "title" : "document 9" }} +{ "create":{"_index":"my_index","_id":10}} +{ "doc": { "title" : "document 10" }} +``` +{% include copy-curl.html %} + +### Creating a search pipeline + +The following request creates a search pipeline named `my_pipeline` with a `truncate_hits` response processor that discards hits after the first five: + +```json +PUT /_search/pipeline/my_pipeline +{ + "response_processors": [ + { + "truncate_hits" : { + "tag" : "truncate_1", + "description" : "This processor will discard results after the first 5.", + "target_size" : 5 + } + } + ] +} +``` +{% include copy-curl.html %} + +### Using a search pipeline + +Search for documents in `my_index` without a search pipeline: + +```json +POST /my_index/_search +{ + "size": 8 +} +``` +{% include copy-curl.html %} + +The response contains eight hits: + +
+ + Response + + {: .text-delta} + +```json +{ + "took" : 13, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 10, + "relation" : "eq" + }, + "max_score" : 1.0, + "hits" : [ + { + "_index" : "my_index", + "_id" : "1", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 1" + } + } + }, + { + "_index" : "my_index", + "_id" : "2", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 2" + } + } + }, + { + "_index" : "my_index", + "_id" : "3", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 3" + } + } + }, + { + "_index" : "my_index", + "_id" : "4", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 4" + } + } + }, + { + "_index" : "my_index", + "_id" : "5", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 5" + } + } + }, + { + "_index" : "my_index", + "_id" : "6", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 6" + } + } + }, + { + "_index" : "my_index", + "_id" : "7", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 7" + } + } + }, + { + "_index" : "my_index", + "_id" : "8", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 8" + } + } + } + ] + } +} +``` +
+ +To search with a pipeline, specify the pipeline name in the `search_pipeline` query parameter: + +```json +POST /my_index/_search?search_pipeline=my_pipeline +{ + "size": 8 +} +``` +{% include copy-curl.html %} + +The response contains only 5 hits, even though 8 were requested and 10 were available: + +
+ + Response + + {: .text-delta} + +```json +{ + "took" : 3, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 10, + "relation" : "eq" + }, + "max_score" : 1.0, + "hits" : [ + { + "_index" : "my_index", + "_id" : "1", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 1" + } + } + }, + { + "_index" : "my_index", + "_id" : "2", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 2" + } + } + }, + { + "_index" : "my_index", + "_id" : "3", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 3" + } + } + }, + { + "_index" : "my_index", + "_id" : "4", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 4" + } + } + }, + { + "_index" : "my_index", + "_id" : "5", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 5" + } + } + } + ] + } +} +``` +
+ +## Oversample, collapse, and truncate hits + +The following is a more realistic example in which you use `oversample` to request many candidate documents, use `collapse` to remove documents that duplicate a particular field (to get more diverse results), and then use `truncate` to return the originally requested document count (to avoid returning a large result payload from the cluster). + + +### Setup + +Create many documents containing a field that you'll use for collapsing: + +```json +POST /_bulk +{ "create":{"_index":"my_index","_id":1}} +{ "title" : "document 1", "color":"blue" } +{ "create":{"_index":"my_index","_id":2}} +{ "title" : "document 2", "color":"blue" } +{ "create":{"_index":"my_index","_id":3}} +{ "title" : "document 3", "color":"red" } +{ "create":{"_index":"my_index","_id":4}} +{ "title" : "document 4", "color":"red" } +{ "create":{"_index":"my_index","_id":5}} +{ "title" : "document 5", "color":"yellow" } +{ "create":{"_index":"my_index","_id":6}} +{ "title" : "document 6", "color":"yellow" } +{ "create":{"_index":"my_index","_id":7}} +{ "title" : "document 7", "color":"orange" } +{ "create":{"_index":"my_index","_id":8}} +{ "title" : "document 8", "color":"orange" } +{ "create":{"_index":"my_index","_id":9}} +{ "title" : "document 9", "color":"green" } +{ "create":{"_index":"my_index","_id":10}} +{ "title" : "document 10", "color":"green" } +``` +{% include copy-curl.html %} + +Create a pipeline that collapses only on the `color` field: + +```json +PUT /_search/pipeline/collapse_pipeline +{ + "response_processors": [ + { + "collapse" : { + "field": "color" + } + } + ] +} +``` +{% include copy-curl.html %} + +Create another pipeline that oversamples, collapses, and then truncates results: + +```json +PUT /_search/pipeline/oversampling_collapse_pipeline +{ + "request_processors": [ + { + "oversample": { + "sample_factor": 3 + } + } + ], + "response_processors": [ + { + "collapse" : { + "field": "color" + } + }, + { + "truncate_hits": { + "description": "Truncates back to the original size before oversample increased it." + } + } + ] +} +``` +{% include copy-curl.html %} + +### Collapse without oversample + +In this example, you request the top three documents before collapsing on the `color` field. Because the first two documents have the same `color`, the second one is discarded, and the request returns the first and third documents: + +```json +POST /my_index/_search?search_pipeline=collapse_pipeline +{ + "size": 3 +} +``` +{% include copy-curl.html %} + + +
+ + Response + + {: .text-delta} + +```json +{ + "took" : 2, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 10, + "relation" : "eq" + }, + "max_score" : 1.0, + "hits" : [ + { + "_index" : "my_index", + "_id" : "1", + "_score" : 1.0, + "_source" : { + "title" : "document 1", + "color" : "blue" + } + }, + { + "_index" : "my_index", + "_id" : "3", + "_score" : 1.0, + "_source" : { + "title" : "document 3", + "color" : "red" + } + } + ] + }, + "profile" : { + "shards" : [ ] + } +} +``` +
+ + +### Oversample, collapse, and truncate + +Now you will use the `oversampling_collapse_pipeline`, which requests the top 9 documents (multiplying the size by 3), deduplicates by `color`, and then returns the top 3 hits: + +```json +POST /my_index/_search?search_pipeline=oversampling_collapse_pipeline +{ + "size": 3 +} +``` +{% include copy-curl.html %} + + +
+ + Response + + {: .text-delta} + +```json +{ + "took" : 2, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 10, + "relation" : "eq" + }, + "max_score" : 1.0, + "hits" : [ + { + "_index" : "my_index", + "_id" : "1", + "_score" : 1.0, + "_source" : { + "title" : "document 1", + "color" : "blue" + } + }, + { + "_index" : "my_index", + "_id" : "3", + "_score" : 1.0, + "_source" : { + "title" : "document 3", + "color" : "red" + } + }, + { + "_index" : "my_index", + "_id" : "5", + "_score" : 1.0, + "_source" : { + "title" : "document 5", + "color" : "yellow" + } + } + ] + }, + "profile" : { + "shards" : [ ] + } +} +``` +
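+
+If multiple oversample/truncate pairs in the same pipeline need to save and read `original_size` independently, you can scope the variable with `context_prefix`. The following pipeline is a minimal, hypothetical sketch: the prefix name `oversample_step` is arbitrary and only needs to match between the two processors:
+
+```json
+PUT /_search/pipeline/scoped_oversample_pipeline
+{
+  "request_processors": [
+    {
+      "oversample": {
+        "description": "Sketch: saves original_size under the oversample_step scope.",
+        "sample_factor": 2.0,
+        "context_prefix": "oversample_step"
+      }
+    }
+  ],
+  "response_processors": [
+    {
+      "truncate_hits": {
+        "description": "Sketch: reads original_size from the oversample_step scope.",
+        "context_prefix": "oversample_step"
+      }
+    }
+  ]
+}
+```
+{% include copy-curl.html %}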
+ + diff --git a/_search-plugins/search-relevance/compare-search-results.md b/_search-plugins/search-relevance/compare-search-results.md index 6d3d07d378..9e34b7cfd7 100644 --- a/_search-plugins/search-relevance/compare-search-results.md +++ b/_search-plugins/search-relevance/compare-search-results.md @@ -1,6 +1,6 @@ --- layout: default -title: Compare Search Results +title: Comparing search results nav_order: 55 parent: Search relevance has_children: true @@ -9,7 +9,7 @@ redirect_from: - /search-plugins/search-relevance/ --- -# Compare Search Results +# Comparing search results With Compare Search Results in OpenSearch Dashboards, you can compare results from two queries side by side to determine whether one query produces better results than the other. Using this tool, you can evaluate search quality by experimenting with queries. diff --git a/_search-plugins/search-relevance/index.md b/_search-plugins/search-relevance/index.md index 9ca39d4fe0..f0c5a2e4c5 100644 --- a/_search-plugins/search-relevance/index.md +++ b/_search-plugins/search-relevance/index.md @@ -14,6 +14,8 @@ Search relevance evaluates the accuracy of the search results returned by a quer OpenSearch provides the following search relevance features: -- [Compare Search Results]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/compare-search-results/) in OpenSearch Dashboards lets you compare results from two queries side by side. +- [Comparing search results]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/compare-search-results/) from two queries side by side in OpenSearch Dashboards. -- [Querqy]({{site.url}}{{site.baseurl}}/search-plugins/querqy/) offers query rewriting capability. \ No newline at end of file +- [Reranking search results]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/reranking-search-results/) using a cross-encoder reranker. + +- Rewriting queries using [Querqy]({{site.url}}{{site.baseurl}}/search-plugins/querqy/). \ No newline at end of file diff --git a/_search-plugins/search-relevance/reranking-search-results.md b/_search-plugins/search-relevance/reranking-search-results.md new file mode 100644 index 0000000000..14c418020d --- /dev/null +++ b/_search-plugins/search-relevance/reranking-search-results.md @@ -0,0 +1,118 @@ +--- +layout: default +title: Reranking search results +parent: Search relevance +has_children: false +nav_order: 60 +--- + +# Reranking search results +Introduced 2.12 +{: .label .label-purple } + +You can rerank search results using a cross-encoder reranker in order to improve search relevance. To implement reranking, you need to configure a [search pipeline]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/index/) that runs at search time. The search pipeline intercepts search results and applies the [`rerank` processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/) to them. The `rerank` processor evaluates the search results and sorts them based on the new scores provided by the cross-encoder model. + +**PREREQUISITE**
+Before configuring a reranking pipeline, you must set up a cross-encoder model. For information about using an OpenSearch-provided model, see [Cross-encoder models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/pretrained-models/#cross-encoder-models). For information about using a custom model, see [Custom local models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/custom-local-models/). +{: .note} + +## Running a search with reranking + +To run a search with reranking, follow these steps: + +1. [Configure a search pipeline](#step-1-configure-a-search-pipeline). +1. [Create an index for ingestion](#step-2-create-an-index-for-ingestion). +1. [Ingest documents into the index](#step-3-ingest-documents-into-the-index). +1. [Search using reranking](#step-4-search-using-reranking). + +## Step 1: Configure a search pipeline + +Next, configure a search pipeline with a [`rerank` processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/). + +The following example request creates a search pipeline with an `ml_opensearch` rerank processor. In the request, provide a model ID for the cross-encoder model and the document fields to use as context: + +```json +PUT /_search/pipeline/my_pipeline +{ + "description": "Pipeline for reranking with a cross-encoder", + "response_processors": [ + { + "rerank": { + "ml_opensearch": { + "model_id": "gnDIbI0BfUsSoeNT_jAw" + }, + "context": { + "document_fields": [ + "passage_text" + ] + } + } + } + ] +} +``` +{% include copy-curl.html %} + +For more information about the request fields, see [Request fields]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/#request-fields). + +## Step 2: Create an index for ingestion + +In order to use the rerank processor defined in your pipeline, create an OpenSearch index and add the pipeline created in the previous step as the default pipeline: + +```json +PUT /my-index +{ + "settings": { + "index.search.default_pipeline" : "my_pipeline" + }, + "mappings": { + "properties": { + "passage_text": { + "type": "text" + } + } + } +} +``` +{% include copy-curl.html %} + +## Step 3: Ingest documents into the index + +To ingest documents into the index created in the previous step, send the following bulk request: + +```json +POST /_bulk +{ "index": { "_index": "my-index" } } +{ "passage_text" : "I said welcome to them and we entered the house" } +{ "index": { "_index": "my-index" } } +{ "passage_text" : "I feel welcomed in their family" } +{ "index": { "_index": "my-index" } } +{ "passage_text" : "Welcoming gifts are great" } + +``` +{% include copy-curl.html %} + +## Step 4: Search using reranking + +To perform reranking search on your index, use any OpenSearch query and provide an additional `ext.rerank` field: + +```json +POST /my-index/_search +{ + "query": { + "match": { + "passage_text": "how to welcome in family" + } + }, + "ext": { + "rerank": { + "query_context": { + "query_text": "how to welcome in family" + } + } + } +} +``` +{% include copy-curl.html %} + +Alternatively, you can provide the full path to the field containing the context. For more information, see [Rerank processor example]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/#example). 
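+
+Reranking is often combined with oversampling so that the cross-encoder scores a larger candidate set before the result list is trimmed back to the requested size. The following pipeline is a sketch of that pattern using the [`oversample`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/oversample-processor/) and [`truncate_hits`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/truncate-hits-processor/) processors. It reuses the placeholder model ID and `passage_text` field from the preceding steps, which you must replace with your own values:
+
+```json
+PUT /_search/pipeline/my_oversampling_rerank_pipeline
+{
+  "description": "Sketch: oversample, rerank with a cross-encoder, and truncate back to the original size",
+  "request_processors": [
+    {
+      "oversample": {
+        "sample_factor": 2.0
+      }
+    }
+  ],
+  "response_processors": [
+    {
+      "rerank": {
+        "ml_opensearch": {
+          "model_id": "gnDIbI0BfUsSoeNT_jAw"
+        },
+        "context": {
+          "document_fields": [
+            "passage_text"
+          ]
+        }
+      }
+    },
+    {
+      "truncate_hits": {
+        "description": "Sketch: trims the reranked results back to the originally requested size."
+      }
+    }
+  ]
+}
+```
+{% include copy-curl.html %}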
\ No newline at end of file diff --git a/_search-plugins/semantic-search.md b/_search-plugins/semantic-search.md index f4753bee1c..7c3fbb738f 100644 --- a/_search-plugins/semantic-search.md +++ b/_search-plugins/semantic-search.md @@ -48,6 +48,8 @@ PUT /_ingest/pipeline/nlp-ingest-pipeline ``` {% include copy-curl.html %} +To split long text into passages, use the `text_chunking` ingest processor before the `text_embedding` processor. For more information, see [Text chunking]({{site.url}}{{site.baseurl}}/search-plugins/text-chunking/). + ## Step 2: Create an index for ingestion In order to use the text embedding processor defined in your pipeline, create a k-NN index, adding the pipeline created in the previous step as the default pipeline. Ensure that the fields defined in the `field_map` are mapped as correct types. Continuing with the example, the `passage_embedding` field must be mapped as a k-NN vector with a dimension that matches the model dimension. Similarly, the `passage_text` field should be mapped as `text`. diff --git a/_search-plugins/sql/ppl/index.md b/_search-plugins/sql/ppl/index.md index c39e3429e1..850a540bc4 100644 --- a/_search-plugins/sql/ppl/index.md +++ b/_search-plugins/sql/ppl/index.md @@ -12,6 +12,8 @@ redirect_from: - /search-plugins/ppl/index/ - /search-plugins/ppl/endpoint/ - /search-plugins/ppl/protocol/ + - /search-plugins/sql/ppl/index/ + - /observability-plugin/ppl/index/ --- # PPL @@ -45,3 +47,4 @@ Developers can find information in the following resources: - [OpenSearch PPL Reference Manual](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/index.rst) - [Observability](https://github.com/opensearch-project/dashboards-observability/) using [PPL-based visualizations](https://github.com/opensearch-project/dashboards-observability#event-analytics) - PPL [Data Types](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/general/datatypes.rst) +- [Cross-cluster search](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/admin/cross_cluster_search.rst#using-cross-cluster-search-in-ppl) in PPL diff --git a/_search-plugins/sql/sql/index.md b/_search-plugins/sql/sql/index.md index 9a466902ff..7035b6d664 100644 --- a/_search-plugins/sql/sql/index.md +++ b/_search-plugins/sql/sql/index.md @@ -61,7 +61,7 @@ POST _plugins/_sql To run the preceding query in the command line, use the [curl](https://curl.haxx.se/) command: ```bash -curl -XPOST https://localhost:9200/_plugins/_sql -u 'admin:admin' -k -H 'Content-Type: application/json' -d '{"query": "SELECT * FROM my-index* LIMIT 50"}' +curl -XPOST https://localhost:9200/_plugins/_sql -u 'admin:' -k -H 'Content-Type: application/json' -d '{"query": "SELECT * FROM my-index* LIMIT 50"}' ``` {% include copy.html %} diff --git a/_search-plugins/text-chunking.md b/_search-plugins/text-chunking.md new file mode 100644 index 0000000000..b66cfeda61 --- /dev/null +++ b/_search-plugins/text-chunking.md @@ -0,0 +1,116 @@ +--- +layout: default +title: Text chunking +nav_order: 65 +--- + +# Text chunking +Introduced 2.13 +{: .label .label-purple } + +To split long text into passages, you can use a `text_chunking` processor as a preprocessing step for a `text_embedding` or `sparse_encoding` processor in order to obtain embeddings for each chunked passage. For more information about the processor parameters, see [Text chunking processor]({{site.url}}{{site.baseurl}}/ingest-pipelines/processors/text-chunking/). 
Before you start, follow the steps outlined in the [pretrained model documentation]({{site.url}}{{site.baseurl}}/ml-commons-plugin/pretrained-models/) to register an embedding model. The following example preprocesses text by splitting it into passages and then produces embeddings using the `text_embedding` processor. + +## Step 1: Create a pipeline + +The following example request creates an ingest pipeline that converts the text in the `passage_text` field into chunked passages, which will be stored in the `passage_chunk` field. The text in the `passage_chunk` field is then converted into text embeddings, and the embeddings are stored in the `passage_embedding` field: + +```json +PUT _ingest/pipeline/text-chunking-embedding-ingest-pipeline +{ + "description": "A text chunking and embedding ingest pipeline", + "processors": [ + { + "text_chunking": { + "algorithm": { + "fixed_token_length": { + "token_limit": 10, + "overlap_rate": 0.2, + "tokenizer": "standard" + } + }, + "field_map": { + "passage_text": "passage_chunk" + } + } + }, + { + "text_embedding": { + "model_id": "LMLPWY4BROvhdbtgETaI", + "field_map": { + "passage_chunk": "passage_chunk_embedding" + } + } + } + ] +} +``` +{% include copy-curl.html %} + +## Step 2: Create an index for ingestion + +In order to use the ingest pipeline, you need to create a k-NN index. The `passage_chunk_embedding` field must be of the `nested` type. The `knn.dimension` field must contain the number of dimensions for your model: + +```json +PUT testindex +{ + "settings": { + "index": { + "knn": true + } + }, + "mappings": { + "properties": { + "text": { + "type": "text" + }, + "passage_chunk_embedding": { + "type": "nested", + "properties": { + "knn": { + "type": "knn_vector", + "dimension": 768 + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Step 3: Ingest documents into the index + +To ingest a document into the index created in the previous step, send the following request: + +```json +POST testindex/_doc?pipeline=text-chunking-embedding-ingest-pipeline +{ + "passage_text": "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." +} +``` +{% include copy-curl.html %} + +## Step 4: Search the index using neural search + +You can use a `nested` query to perform vector search on your index. We recommend setting `score_mode` to `max`, where the document score is set to the highest score out of all passage embeddings: + +```json +GET testindex/_search +{ + "query": { + "nested": { + "score_mode": "max", + "path": "passage_chunk_embedding", + "query": { + "neural": { + "passage_chunk_embedding.knn": { + "query_text": "document", + "model_id": "-tHZeI4BdQKclr136Wl7" + } + } + } + } + } +} +``` +{% include copy-curl.html %} diff --git a/_security-analytics/api-tools/alert-finding-api.md b/_security-analytics/api-tools/alert-finding-api.md index a22b601b08..f2631f2a50 100644 --- a/_security-analytics/api-tools/alert-finding-api.md +++ b/_security-analytics/api-tools/alert-finding-api.md @@ -149,13 +149,230 @@ You can specify the following parameters when getting findings. Parameter | Description :--- | :--- -`detector_id` | The ID of the detector used to fetch alerts. Optional when the `detectorType` is specified. Otherwise required. -`detectorType` | The type of detector used to fetch alerts. Optional when the `detector_id` is specified. Otherwise required. +`detector_id` | The ID of the detector used to fetch alerts. Optional. 
+`detectorType` | The type of detector used to fetch alerts. Optional. `sortOrder` | The order used to sort the list of findings. Possible values are `asc` or `desc`. Optional. `size` | An optional limit for the maximum number of results returned in the response. Optional. +`startIndex` | The pagination indicator. Optional. +`detectionType` | The detection rule type that dictates the retrieval type for the findings. When the detection type is `threat`, it fetches threat intelligence feeds. When the detection type is `rule`, findings are fetched based on the detector's rule. Optional. +`severity` | The severity of the detector rule used to fetch alerts. Severity can be `critical`, `high`, `medium`, or `low`. Optional. ### Example request +```json +GET /_plugins/_security_analytics/findings/_search +{ + "total_findings": 2, + "findings": [ + { + "detectorId": "b9ZN040Bjlggkcgx1d1W", + "id": "35efb736-c5d9-499d-b9b5-31f0a7d61251", + "related_doc_ids": [ + "1" + ], + "index": "smallidx", + "queries": [ + { + "id": "QdZN040Bjlggkcgxdd3X", + "name": "QdZN040Bjlggkcgxdd3X", + "fields": [], + "query": "field1: *value1*", + "tags": [ + "high", + "ad_ldap" + ] + } + ], + "timestamp": 1708647166500, + "document_list": [ + { + "index": "smallidx", + "id": "1", + "found": true, + "document": "{\n \"field1\": \"value1\"\n}\n" + } + ] + }, + { + "detectorId": "O9ZM040Bjlggkcgx6N1S", + "id": "a5022930-4503-4ca8-bf0a-320a2b1fb433", + "related_doc_ids": [ + "1" + ], + "index": "smallidx", + "queries": [ + { + "id": "KtZM040Bjlggkcgxkd04", + "name": "KtZM040Bjlggkcgxkd04", + "fields": [], + "query": "field1: *value1*", + "tags": [ + "critical", + "ad_ldap" + ] + } + ], + "timestamp": 1708647166500, + "document_list": [ + { + "index": "smallidx", + "id": "1", + "found": true, + "document": "{\n \"field1\": \"value1\"\n}\n" + } + ] + } + ] +} + +``` + +```json +GET /_plugins/_security_analytics/findings/_search?severity=high +{ + "total_findings": 1, + "findings": [ + { + "detectorId": "b9ZN040Bjlggkcgx1d1W", + "id": "35efb736-c5d9-499d-b9b5-31f0a7d61251", + "related_doc_ids": [ + "1" + ], + "index": "smallidx", + "queries": [ + { + "id": "QdZN040Bjlggkcgxdd3X", + "name": "QdZN040Bjlggkcgxdd3X", + "fields": [], + "query": "field1: *value1*", + "tags": [ + "high", + "ad_ldap" + ] + } + ], + "timestamp": 1708647166500, + "document_list": [ + { + "index": "smallidx", + "id": "1", + "found": true, + "document": "{\n \"field1\": \"value1\"\n}\n" + } + ] + } + ] +} + +``` + +```json +GET /_plugins/_security_analytics/findings/_search?detectionType=rule +{ + "total_findings": 2, + "findings": [ + { + "detectorId": "b9ZN040Bjlggkcgx1d1W", + "id": "35efb736-c5d9-499d-b9b5-31f0a7d61251", + "related_doc_ids": [ + "1" + ], + "index": "smallidx", + "queries": [ + { + "id": "QdZN040Bjlggkcgxdd3X", + "name": "QdZN040Bjlggkcgxdd3X", + "fields": [], + "query": "field1: *value1*", + "tags": [ + "high", + "ad_ldap" + ] + } + ], + "timestamp": 1708647166500, + "document_list": [ + { + "index": "smallidx", + "id": "1", + "found": true, + "document": "{\n \"field1\": \"value1\"\n}\n" + } + ] + }, + { + "detectorId": "O9ZM040Bjlggkcgx6N1S", + "id": "a5022930-4503-4ca8-bf0a-320a2b1fb433", + "related_doc_ids": [ + "1" + ], + "index": "smallidx", + "queries": [ + { + "id": "KtZM040Bjlggkcgxkd04", + "name": "KtZM040Bjlggkcgxkd04", + "fields": [], + "query": "field1: *value1*", + "tags": [ + "critical", + "ad_ldap" + ] + } + ], + "timestamp": 1708647166500, + "document_list": [ + { + "index": "smallidx", + "id": "1", + "found": true, + 
"document": "{\n \"field1\": \"value1\"\n}\n" + } + ] + } + ] +} + + +``` +```json +GET /_plugins/_security_analytics/findings/_search?detectionType=rule&severity=high +{ + "total_findings": 1, + "findings": [ + { + "detectorId": "b9ZN040Bjlggkcgx1d1W", + "id": "35efb736-c5d9-499d-b9b5-31f0a7d61251", + "related_doc_ids": [ + "1" + ], + "index": "smallidx", + "queries": [ + { + "id": "QdZN040Bjlggkcgxdd3X", + "name": "QdZN040Bjlggkcgxdd3X", + "fields": [], + "query": "field1: *value1*", + "tags": [ + "high", + "ad_ldap" + ] + } + ], + "timestamp": 1708647166500, + "document_list": [ + { + "index": "smallidx", + "id": "1", + "found": true, + "document": "{\n \"field1\": \"value1\"\n}\n" + } + ] + } + ] +} + +``` + ```json GET /_plugins/_security_analytics/findings/_search?*detectorType*= { diff --git a/_security-analytics/index.md b/_security-analytics/index.md index c4bdae9748..c9cdcbee1b 100644 --- a/_security-analytics/index.md +++ b/_security-analytics/index.md @@ -39,7 +39,7 @@ For information about configuring detectors, see [Creating detectors]({{site.url ### Log types -Log types provide the data used to evaluate events occurring in a system. OpenSearch supports several types of logs and provides out-of-the-box mappings for the most common log sources. See [Supported log types]({{site.url}}{{site.baseurl}}/security-analytics/sec-analytics-config/log-types/) for a list of log types currently supported by Security Analytics. +[Log types]({{site.url}}{{site.baseurl}}/security-analytics/sec-analytics-config/log-types/) provide the data used to evaluate events occurring in a system. OpenSearch supports several types of logs and provides out-of-the-box mappings for the most common log sources. Log types are specified during the creation of detectors, including steps for mapping log fields to the detector. Security Analytics also automatically selects an appropriate set of rules based on a specific log type and populates them for the detector. diff --git a/_security-analytics/log-types-reference/ad-ldap.md b/_security-analytics/log-types-reference/ad-ldap.md new file mode 100644 index 0000000000..823bcf4c62 --- /dev/null +++ b/_security-analytics/log-types-reference/ad-ldap.md @@ -0,0 +1,114 @@ +--- +layout: default +title: AD LDAP +parent: Supported log types +nav_order: 20 +--- + +# AD LDAP + +The `ad_ldap` log type tracks Active Directory logs, such as: + +- Lightweight Directory Access Protocol (LDAP) queries. +- Errors from the LDAP server. +- Timeout events. +- Unsecured LDAP binds. 
+ +The following code snippet contains all `raw_field` and `ecs` mappings for this log type: + +```json + "mappings": [ + { + "raw_field":"TargetUserName", + "ecs":"azure.signinlogs.properties.user_id" + }, + { + "raw_field":"creationTime", + "ecs":"timestamp" + }, + { + "raw_field":"Category", + "ecs":"azure.activitylogs.category" + }, + { + "raw_field":"OperationName", + "ecs":"azure.platformlogs.operation_name" + }, + { + "raw_field":"ModifiedProperties_NewValue", + "ecs":"modified_properties.new_value" + }, + { + "raw_field":"ResourceProviderValue", + "ecs":"azure.resource.provider" + }, + { + "raw_field":"conditionalAccessStatus", + "ecs":"azure.signinlogs.properties.conditional_access_status" + }, + { + "raw_field":"SearchFilter", + "ecs":"SearchFilter" + }, + { + "raw_field":"Operation", + "ecs":"azure.platformlogs.operation_name" + }, + { + "raw_field":"ResultType", + "ecs":"azure.platformlogs.result_type" + }, + { + "raw_field":"DeviceDetail_isCompliant", + "ecs":"azure.signinlogs.properties.device_detail.is_compliant" + }, + { + "raw_field":"ResourceDisplayName", + "ecs":"resource_display_name" + }, + { + "raw_field":"AuthenticationRequirement", + "ecs":"azure.signinlogs.properties.authentication_requirement" + }, + { + "raw_field":"TargetResources", + "ecs":"target_resources" + }, + { + "raw_field":"Workload", + "ecs":"workload" + }, + { + "raw_field":"DeviceDetail.deviceId", + "ecs":"azure.signinlogs.properties.device_detail.device_id" + }, + { + "raw_field":"OperationNameValue", + "ecs":"azure.platformlogs.operation_name" + }, + { + "raw_field":"ResourceId", + "ecs":"azure.signinlogs.properties.resource_id" + }, + { + "raw_field":"ResultDescription", + "ecs":"azure.signinlogs.result_description" + }, + { + "raw_field":"EventID", + "ecs":"EventID" + }, + { + "raw_field":"NetworkLocationDetails", + "ecs":"azure.signinlogs.properties.network_location_details" + }, + { + "raw_field":"CategoryValue", + "ecs":"azure.activitylogs.category" + }, + { + "raw_field":"ActivityDisplayName", + "ecs":"azure.auditlogs.properties.activity_display_name" + } + ] +``` \ No newline at end of file diff --git a/_security-analytics/log-types-reference/apache-access.md b/_security-analytics/log-types-reference/apache-access.md new file mode 100644 index 0000000000..381c13f0bf --- /dev/null +++ b/_security-analytics/log-types-reference/apache-access.md @@ -0,0 +1,10 @@ +--- +layout: default +title: Apache Access +parent: Supported log types +nav_order: 25 +--- + +# Apache Access + +The `apache_access` log type records data for all requests processed by Apache HTTP servers. It contains no `raw_field` or `ecs` mappings. \ No newline at end of file diff --git a/_security-analytics/log-types-reference/azure.md b/_security-analytics/log-types-reference/azure.md new file mode 100644 index 0000000000..f8c9fb18fb --- /dev/null +++ b/_security-analytics/log-types-reference/azure.md @@ -0,0 +1,225 @@ +--- +layout: default +title: Azure +parent: Supported log types +nav_order: 29 +--- + +# Azure + +The `azure` log type monitors log data for cloud applications managed by Azure Cloud Services. 
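To check which of the mappings listed below can be applied automatically to your own log data, you can call the Security Analytics mappings view API. The following request is a sketch that assumes `azure` as the log type and `azure-logs` as a hypothetical index name; the response should indicate which detector fields can be mapped automatically and which index fields remain unmapped:

```json
GET /_plugins/_security_analytics/mappings/view?index_name=azure-logs&rule_topic=azure
```
{% include copy-curl.html %}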
+ +The following code snippet contains all `raw_field` and `ecs` mappings for this log type: + +```json +"mappings": [ + { + "raw_field":"Resultdescription", + "ecs":"azure.signinlogs.result_description" + }, + { + "raw_field":"eventSource", + "ecs":"eventSource" + }, + { + "raw_field":"eventName", + "ecs":"eventName" + }, + { + "raw_field":"Status", + "ecs":"azure.platformlogs.status" + }, + { + "raw_field":"LoggedByService", + "ecs":"azure.auditlogs.properties.logged_by_service" + }, + { + "raw_field":"properties_message", + "ecs":"properties_message" + }, + { + "raw_field":"status", + "ecs":"azure.platformlogs.status" + }, + { + "raw_field":"TargetUserName", + "ecs":"azure.signinlogs.properties.user_id" + }, + { + "raw_field":"creationTime", + "ecs":"timestamp" + }, + { + "raw_field":"Category", + "ecs":"azure.activitylogs.category" + }, + { + "raw_field":"OperationName", + "ecs":"azure.platformlogs.operation_name" + }, + { + "raw_field":"ModifiedProperties_NewValue", + "ecs":"modified_properties.new_value" + }, + { + "raw_field":"ResourceProviderValue", + "ecs":"azure.resource.provider" + }, + { + "raw_field":"conditionalAccessStatus", + "ecs":"azure.signinlogs.properties.conditional_access_status" + }, + { + "raw_field":"SearchFilter", + "ecs":"search_filter" + }, + { + "raw_field":"Operation", + "ecs":"azure.platformlogs.operation_name" + }, + { + "raw_field":"ResultType", + "ecs":"azure.platformlogs.result_type" + }, + { + "raw_field":"DeviceDetail_isCompliant", + "ecs":"azure.signinlogs.properties.device_detail.is_compliant" + }, + { + "raw_field":"ResourceDisplayName", + "ecs":"resource_display_name" + }, + { + "raw_field":"AuthenticationRequirement", + "ecs":"azure.signinlogs.properties.authentication_requirement" + }, + { + "raw_field":"TargetResources", + "ecs":"target_resources" + }, + { + "raw_field":"Workload", + "ecs":"Workload" + }, + { + "raw_field":"DeviceDetail_deviceId", + "ecs":"azure.signinlogs.properties.device_detail.device_id" + }, + { + "raw_field":"OperationNameValue", + "ecs":"azure.platformlogs.operation_name" + }, + { + "raw_field":"ResourceId", + "ecs":"azure.signinlogs.properties.resource_id" + }, + { + "raw_field":"ResultDescription", + "ecs":"azure.signinlogs.result_description" + }, + { + "raw_field":"EventID", + "ecs":"EventID" + }, + { + "raw_field":"NetworkLocationDetails", + "ecs":"azure.signinlogs.properties.network_location_details" + }, + { + "raw_field":"CategoryValue", + "ecs":"azure.activitylogs.category" + }, + { + "raw_field":"ActivityDisplayName", + "ecs":"azure.auditlogs.properties.activity_display_name" + }, + { + "raw_field":"Initiatedby", + "ecs":"azure.activitylogs.identity.claims_initiated_by_user.name" + }, + { + "raw_field":"Count", + "ecs":"Count" + }, + { + "raw_field":"ResourceTenantId", + "ecs":"azure.signinlogs.properties.resource_tenant_id" + }, + { + "raw_field":"failure_status_reason", + "ecs":"failure_status_reason" + }, + { + "raw_field":"AppId", + "ecs":"azure.signinlogs.properties.app_id" + }, + { + "raw_field":"properties.message", + "ecs":"properties.message" + }, + { + "raw_field":"ClientApp", + "ecs":"azure.signinlogs.properties.client_app_used" + }, + { + "raw_field":"ActivityDetails", + "ecs":"ActivityDetails" + }, + { + "raw_field":"Target", + "ecs":"Target" + }, + { + "raw_field":"DeviceDetail.trusttype", + "ecs":"azure.signinlogs.properties.device_detail.trust_type" + }, + { + "raw_field":"HomeTenantId", + "ecs":"azure.signinlogs.properties.home_tenant_id" + }, + { + "raw_field":"ConsentContext.IsAdminConsent", + 
"ecs":"ConsentContext.IsAdminConsent" + }, + { + "raw_field":"InitiatedBy", + "ecs":"InitiatedBy" + }, + { + "raw_field":"ActivityType", + "ecs":"azure.auditlogs.properties.activity_display_name" + }, + { + "raw_field":"operationName", + "ecs":"azure.activitylogs.operation_name" + }, + { + "raw_field":"ModifiedProperties{}.NewValue", + "ecs":"modified_properties.new_value" + }, + { + "raw_field":"userAgent", + "ecs":"user_agent.name" + }, + { + "raw_field":"RiskState", + "ecs":"azure.signinlogs.properties.risk_state" + }, + { + "raw_field":"Username", + "ecs":"azure.activitylogs.identity.claims_initiated_by_user.name" + }, + { + "raw_field":"DeviceDetail.deviceId", + "ecs":"azure.signinlogs.properties.device_detail.device_id" + }, + { + "raw_field":"DeviceDetail.isCompliant", + "ecs":"azure.signinlogs.properties.device_detail.is_compliant" + }, + { + "raw_field":"Location", + "ecs":"azure.signinlogs.properties.network_location_details" + } + ] +``` \ No newline at end of file diff --git a/_security-analytics/log-types-reference/cloudtrail.md b/_security-analytics/log-types-reference/cloudtrail.md new file mode 100644 index 0000000000..24edfe10ab --- /dev/null +++ b/_security-analytics/log-types-reference/cloudtrail.md @@ -0,0 +1,232 @@ +--- +layout: default +title: AWS CloudTrail +parent: Supported log types +nav_order: 28 +--- + +# AWS CloudTrail + +The `cloudtrail` log type monitors events from the [AWS CloudTrail](https://docs.aws.amazon.com/awscloudtrail/latest/userguide/cloudtrail-user-guide.html) accounts. OpenSearch can ingest AWS CloudTrail log data from both [Amazon Simple Storage Service](https://docs.aws.amazon.com/AmazonS3/latest/userguide/Welcome.html) (Amazon S3) accounts and [Amazon Security Lake](https://docs.aws.amazon.com/security-lake/latest/userguide/what-is-security-lake.html) accounts. 
+ +The following code snippet contains all the `raw_field`, `ecs`, and `ocsf` mappings for this log type: + +```json + "mappings": [ + { + "raw_field":"eventName", + "ecs":"aws.cloudtrail.event_name", + "ocsf": "api.operation" + }, + { + "raw_field":"eventSource", + "ecs":"aws.cloudtrail.event_source", + "ocsf": "api.service.name" + }, + { + "raw_field":"eventVersion", + "ecs":"aws.cloudtrail.event_version", + "ocsf": "metadata.product.version" + }, + { + "raw_field":"eventID", + "ecs":"aws.cloudtrail.event_id", + "ocsf": "metadata.uid" + }, + { + "raw_field":"eventType", + "ecs":"aws.cloudtrail.event_type", + "ocsf": "unmapped.eventType" + }, + { + "raw_field":"eventCategory", + "ecs":"aws.cloudtrail.event_category", + "ocsf": "metadata.product.feature.name" + }, + { + "raw_field":"errorMessage", + "ecs":"aws.cloudtrail.error_message", + "ocsf": "api.response.message" + }, + { + "raw_field":"errorCode", + "ecs":"aws.cloudtrail.error_code", + "ocsf": "api.response.error" + }, + { + "raw_field":"apiVersion", + "ecs":"aws.cloudtrail.api_version", + "ocsf": "api.version" + }, + { + "raw_field":"awsRegion", + "ecs":"aws.cloudtrail.aws_region", + "ocsf": "cloud.region" + }, + { + "raw_field":"additionalEventData.LoginTo", + "ecs":"aws.cloudtrail.additional_event_data.loginTo", + "ocsf": "dst_endpoint.svc_name" + }, + { + "raw_field":"additionalEventData.MFAUsed", + "ecs":"aws.cloudtrail.additional_event_data.mfaUsed", + "ocsf": "mfa" + }, + { + "raw_field":"responseElements", + "ecs":"aws.cloudtrail.response_elements.text", + "ocsf": "unmapped.responseElements" + }, + { + "raw_field":"requestID", + "ecs":"aws.cloudtrail.request_id", + "ocsf": "api.request.uid" + }, + { + "raw_field":"sourceIPAddress", + "ecs":"aws.cloudtrail.source_ip_address", + "ocsf": "src_endpoint.ip" + }, + { + "raw_field":"userAgent", + "ecs":"aws.cloudtrail.user_agent", + "ocsf": "http_request.user_agent" + }, + { + "raw_field":"vpcEndpointId", + "ecs":"aws.cloudtrail.vpc_endpoint_id", + "ocsf": "src_endpoint.uid" + }, + { + "raw_field":"responseElements.pendingModifiedValues.masterUserPassword", + "ecs":"aws.cloudtrail.response_elements.pending_modified_values.master_user_password", + "ocsf": "unmapped.responseElements.pendingModifiedValues.masterUserPassword" + }, + { + "raw_field":"responseElements.publiclyAccessible", + "ecs":"aws.cloudtrail.response_elements.publicly_accessible", + "ocsf": "unmapped.responseElements.publiclyAccessible" + }, + { + "raw_field":"responseElements.ConsoleLogin", + "ecs":"aws.cloudtrail.response_elements.publicly_accessible", + "ocsf": "status_id" + }, + { + "raw_field":"requestParameters.arn", + "ecs":"aws.cloudtrail.request_parameters.arn", + "ocsf": "unmapped.requestParameters.arn" + }, + { + "raw_field":"requestParameters.attribute", + "ecs":"aws.cloudtrail.request_parameters.attribute", + "ocsf": "unmapped.requestParameters.attribute" + }, + { + "raw_field":"requestParameters.userName", + "ecs":"aws.cloudtrail.request_parameters.username", + "ocsf": "unmapped.requestParameters.userName" + }, + { + "raw_field":"requestParameters.roleArn", + "ecs":"aws.cloudtrail.request_parameters.roleArn", + "ocsf": "user.uuid" + }, + { + "raw_field":"requestParameters.roleSessionName", + "ecs":"aws.cloudtrail.request_parameters.roleSessionName", + "ocsf": "user.name" + }, + { + "raw_field":"requestParameters.containerDefinitions.command", + "ecs":"aws.cloudtrail.request_parameters.container_definitions.command", + "ocsf": "unmapped.requestParameters.containerDefinitions.command" + }, + { + 
"raw_field":"userIdentity.type", + "ecs":"aws.cloudtrail.user_identity.type", + "ocsf": "actor.user.type" + }, + { + "raw_field":"userIdentity.principalId", + "ecs":"aws.cloudtrail.user_identity.principalId", + "ocsf": "actor.user.uid" + }, + { + "raw_field":"userIdentity.arn", + "ecs":"aws.cloudtrail.user_identity.arn", + "ocsf": "actor.user.uuid" + }, + { + "raw_field":"userIdentity.accountId", + "ecs":"aws.cloudtrail.user_identity.accountId", + "ocsf": "actor.user.account_uid" + }, + { + "raw_field":"userIdentity.accessKeyId", + "ecs":"aws.cloudtrail.user_identity.accessKeyId", + "ocsf": "actor.user.credential_uid" + }, + { + "raw_field":"userIdentity.identityProvider", + "ecs":"aws.cloudtrail.user_identity.identityProvider", + "ocsf": "actor.idp.name" + }, + { + "raw_field":"userIdentity.userName", + "ecs":"aws.cloudtrail.user_identity.userName", + "ocsf": "actor.user.name" + }, + { + "raw_field":"userIdentity.invokedBy", + "ecs":"aws.cloudtrail.user_identity.invokedBy", + "ocsf": "actor.invoked_by" + }, + { + "raw_field":"userIdentity.sessionContext.sessionIssuer.type", + "ecs":"aws.cloudtrail.user_identity.session_context.session_issuer.type", + "ocsf": "unmapped.userIdentity.sessionContext.sessionIssuer.type" + }, + { + "raw_field":"userIdentity.sessionContext.sessionIssuer.arn", + "ecs":"aws.cloudtrail.user_identity.session_context.session_issuer.arn", + "ocsf": "actor.session.issuer" + }, + { + "raw_field":"userIdentity.sessionContext.attributes.creationDate", + "ecs":"aws.cloudtrail.user_identity.session_context.attributes.creationDate", + "ocsf": "actor.session.created_time" + }, + { + "raw_field":"userIdentity.sessionContext.attributes.mfaAuthenticated", + "ecs":"aws.cloudtrail.user_identity.session_context.attributes.mfaAuthenticated", + "ocsf": "actor.session.mfa" + }, + { + "raw_field":"userIdentity.webIdFederationData.federatedProvider", + "ecs":"aws.cloudtrail.user_identity.web_id_federation_data.federatedProvider", + "ocsf": "actor.idp.name" + }, + { + "raw_field":"resources[].ARN", + "ecs":"aws.cloudtrail.resources.ARN", + "ocsf": "resources[].uid" + }, + { + "raw_field":"resources[].accountId", + "ecs":"aws.cloudtrail.resources.account_uid", + "ocsf": "resources[].account_uid" + }, + { + "raw_field":"resources[].type", + "ecs":"aws.cloudtrail.resources.type", + "ocsf": "resources[].type" + }, + { + "raw_field":"eventTime", + "ecs":"timestamp", + "ocsf": "time" + } + ] +``` \ No newline at end of file diff --git a/_security-analytics/log-types-reference/dns.md b/_security-analytics/log-types-reference/dns.md new file mode 100644 index 0000000000..2e10868d17 --- /dev/null +++ b/_security-analytics/log-types-reference/dns.md @@ -0,0 +1,127 @@ +--- +layout: default +title: DNS +parent: Supported log types +nav_order: 35 +--- + +# DNS + +The `dns` log type stores DNS activity. 
+ +The following code snippet contains all the `raw_field`, `ecs`, and `ocsf` mappings for this log type: + +```json + "mappings": [ + { + "raw_field":"record_type", + "ecs":"dns.answers.type", + "ocsf": "unmapped.record_type" + }, + { + "raw_field":"answers[].Type", + "ecs":"aws.route53.answers.Type", + "ocsf": "answers[].type" + }, + { + "raw_field":"answers[].Rdata", + "ecs":"aws.route53.answers.Rdata", + "ocsf": "answers[].rdata" + }, + { + "raw_field":"answers[].Class", + "ecs":"aws.route53.answers.Class", + "ocsf": "answers[].class" + }, + { + "raw_field":"query", + "ecs":"dns.question.name", + "ocsf": "unmapped.query" + }, + { + "raw_field":"query_name", + "ecs":"aws.route53.query_name", + "ocsf": "query.hostname" + }, + { + "raw_field":"parent_domain", + "ecs":"dns.question.registered_domain", + "ocsf": "unmapped.parent_domain" + }, + { + "raw_field":"version", + "ecs":"aws.route53.version", + "ocsf": "metadata.product.version" + }, + { + "raw_field":"account_id", + "ecs":"aws.route53.account_id", + "ocsf": "cloud.account_uid" + }, + { + "raw_field":"region", + "ecs":"aws.route53.region", + "ocsf": "cloud.region" + }, + { + "raw_field":"vpc_id", + "ecs":"aws.route53.vpc_id", + "ocsf": "src_endpoint.vpc_uid" + }, + { + "raw_field":"query_timestamp", + "ecs":"aws.route53.query_timestamp", + "ocsf": "time" + }, + { + "raw_field":"query_class", + "ecs":"aws.route53.query_class", + "ocsf": "query.class" + }, + { + "raw_field":"query_type", + "ecs":"aws.route53.query_type", + "ocsf": "query.type" + }, + { + "raw_field":"srcaddr", + "ecs":"aws.route53.srcaddr", + "ocsf": "src_endpoint.ip" + }, + { + "raw_field":"srcport", + "ecs":"aws.route53.srcport", + "ocsf": "src_endpoint.port" + }, + { + "raw_field":"transport", + "ecs":"aws.route53.transport", + "ocsf": "connection_info.protocol_name" + }, + { + "raw_field":"srcids.instance", + "ecs":"aws.route53.srcids.instance", + "ocsf": "src_endpoint.instance_uid" + }, + { + "raw_field":"srcids.resolver_endpoint", + "ecs":"aws.route53.srcids.resolver_endpoint", + "ocsf": "dst_endpoint.instance_uid" + }, + { + "raw_field":"srcids.resolver_network_interface", + "ecs":"aws.route53.srcids.resolver_network_interface", + "ocsf": "dst_endpoint.interface_uid" + }, + { + "raw_field":"firewall_rule_action", + "ecs":"aws.route53.srcids.firewall_rule_action", + "ocsf": "disposition_id" + }, + { + "raw_field":"creationTime", + "ecs":"timestamp", + "ocsf": "unmapped.creationTime" + } + ] +``` \ No newline at end of file diff --git a/_security-analytics/log-types-reference/github.md b/_security-analytics/log-types-reference/github.md new file mode 100644 index 0000000000..fd0b4067e0 --- /dev/null +++ b/_security-analytics/log-types-reference/github.md @@ -0,0 +1,21 @@ +--- +layout: default +title: GitHub +parent: Supported log types +nav_order: 40 +--- + +# GitHub + +The `github` log type monitors workflows created by [GitHub Actions](https://docs.github.com/en/actions/learn-github-actions/understanding-github-actions). 
+ +The following code snippet contains all the `raw_field` and `ecs` mappings for this log type: + +```json + "mappings": [ + { + "raw_field":"action", + "ecs":"github.action" + } + ] +``` \ No newline at end of file diff --git a/_security-analytics/log-types-reference/gworkspace.md b/_security-analytics/log-types-reference/gworkspace.md new file mode 100644 index 0000000000..43172f684a --- /dev/null +++ b/_security-analytics/log-types-reference/gworkspace.md @@ -0,0 +1,33 @@ +--- +layout: default +title: Google Workspace +parent: Supported log types +nav_order: 45 +--- + +# Google Workspace + +The `gworkspace` log type monitors Google Workspace log entries, such as the following: + +- Admin actions +- Group and group membership actions +- Events related to logins + +The following code snippet contains all the `raw_field` and `ecs` mappings for this log type: + +```json + "mappings": [ + { + "raw_field":"eventSource", + "ecs":"google_workspace.admin.service.name" + }, + { + "raw_field":"eventName", + "ecs":"google_workspace.event.name" + }, + { + "raw_field":"new_value", + "ecs":"google_workspace.admin.new_value" + } + ] +``` \ No newline at end of file diff --git a/_security-analytics/log-types-reference/index.md b/_security-analytics/log-types-reference/index.md new file mode 100644 index 0000000000..bd98bb3982 --- /dev/null +++ b/_security-analytics/log-types-reference/index.md @@ -0,0 +1,17 @@ +--- +layout: default +title: Supported log types +has_children: yes +nav_order: 16 +redirect_from: + - /security-analytics/sec-analytics-config/log-types/ +--- + +# Supported log types + +Logs contain raw data about events that happen throughout a system and within its separate parts. This reference describes the standard log types supported by Security Analytics and the automatic mappings they contain. + +For more information about field mappings, refer to the [About field mappings]({{site.url}}{{site.baseurl}}/security-analytics/sec-analytics-config/log-types#about-field-mappings) section in the [Working with log types]({{site.url}}{{site.baseurl}}/security-analytics/sec-analytics-config/log-types/) documentation. + +For more information about log types and detectors, refer to the [Creating detectors]({{site.url}}{{site.baseurl}}/security-analytics/sec-analytics-config/detectors-config/) documentation. + diff --git a/_security-analytics/log-types-reference/linux.md b/_security-analytics/log-types-reference/linux.md new file mode 100644 index 0000000000..ed18e8f8bc --- /dev/null +++ b/_security-analytics/log-types-reference/linux.md @@ -0,0 +1,73 @@ +--- +layout: default +title: Linux +parent: Supported log types +nav_order: 50 +--- + +# Linux + +The `linux` log type records Linux syslog events. 
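For illustration only, the following hypothetical document shows several of the `raw_field` names listed below as they might appear in an ingested Linux audit event (all values are invented):

```json
{
  "comm": "sudo",
  "exe": "/usr/bin/sudo",
  "a0": "passwd",
  "uid": "1000",
  "USER": "root",
  "CommandLine": "sudo passwd root",
  "CurrentDirectory": "/home/example-user",
  "creationTime": "2024-02-22T21:32:46Z"
}
```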
+ +The following code snippet contains all the `raw_field` and `ecs` mappings for this log type: + +```json + "mappings": [ + { + "raw_field":"name", + "ecs":"user.filesystem.name" + }, + { + "raw_field":"a0", + "ecs":"auditd.log.a0" + }, + { + "raw_field":"comm", + "ecs":"auditd.log.comm" + }, + { + "raw_field":"exe", + "ecs":"auditd.log.exe" + }, + { + "raw_field":"uid", + "ecs":"auditd.log.uid" + }, + { + "raw_field":"USER", + "ecs":"system.auth.user" + }, + { + "raw_field":"User", + "ecs":"system.auth.user" + }, + { + "raw_field":"Image", + "ecs":"process.exe" + }, + { + "raw_field":"DestinationHostname", + "ecs":"rsa.web.remote_domain" + }, + { + "raw_field":"CommandLine", + "ecs":"process.command_line" + }, + { + "raw_field":"ParentImage", + "ecs":"process.parent.executable" + }, + { + "raw_field":"CurrentDirectory", + "ecs":"process.working_directory" + }, + { + "raw_field":"LogonId", + "ecs":"process.real_user.id" + }, + { + "raw_field":"creationTime", + "ecs":"timestamp" + } + ] +``` \ No newline at end of file diff --git a/_security-analytics/log-types-reference/m365.md b/_security-analytics/log-types-reference/m365.md new file mode 100644 index 0000000000..679ffc3137 --- /dev/null +++ b/_security-analytics/log-types-reference/m365.md @@ -0,0 +1,39 @@ +--- +layout: default +title: Microsoft 365 +parent: Supported log types +nav_order: 55 +--- + +# Microsoft 365 + +The `m365` log type collects a range of data for Microsoft 365, such as the following: + +- Records from call details +- Performance data +- SQL Server events +- Security events +- Access control activity + +The following code snippet contains all the `raw_field` and `ecs` mappings for this log type: + +```json +"mappings": [ + { + "raw_field":"eventSource", + "ecs":"rsa.misc.event_source" + }, + { + "raw_field":"eventName", + "ecs":"rsa.misc.event_desc" + }, + { + "raw_field":"status", + "ecs":"rsa.misc.status" + }, + { + "raw_field":"Payload", + "ecs":"rsa.misc.payload_dst" + } + ] +``` \ No newline at end of file diff --git a/_security-analytics/log-types-reference/netflow.md b/_security-analytics/log-types-reference/netflow.md new file mode 100644 index 0000000000..8e920ab494 --- /dev/null +++ b/_security-analytics/log-types-reference/netflow.md @@ -0,0 +1,45 @@ +--- +layout: default +title: NetFlow +parent: Supported log types +nav_order: 60 +--- + +# NetFlow + +The `netflow` log type records NetFlow events used during integration testing. 
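For illustration only, a hypothetical NetFlow test document containing the `raw_field` names listed below might look like the following (all values are invented):

```json
{
  "netflow.source_ipv4_address": "10.0.0.15",
  "netflow.source_transport_port": 49152,
  "netflow.destination_ipv4_address": "203.0.113.10",
  "netflow.destination_transport_port": 443,
  "http.request.method": "GET",
  "http.response.status_code": 200,
  "timestamp": 1708647166500
}
```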
+ +The following code snippet contains all the `raw_field` and `ecs` mappings for this log type: + +```json +"mappings": [ + { + "raw_field":"netflow.source_ipv4_address", + "ecs":"source.ip" + }, + { + "raw_field":"netflow.source_transport_port", + "ecs":"source.port" + }, + { + "raw_field":"netflow.destination_ipv4_address", + "ecs":"destination.ip" + }, + { + "raw_field":"netflow.destination_transport_port", + "ecs":"destination.port" + }, + { + "raw_field":"http.request.method", + "ecs":"http.request.method" + }, + { + "raw_field":"http.response.status_code", + "ecs":"http.response.status_code" + }, + { + "raw_field":"timestamp", + "ecs":"timestamp" + } + ] +``` \ No newline at end of file diff --git a/_security-analytics/log-types-reference/network.md b/_security-analytics/log-types-reference/network.md new file mode 100644 index 0000000000..11a9e7c9c8 --- /dev/null +++ b/_security-analytics/log-types-reference/network.md @@ -0,0 +1,145 @@ +--- +layout: default +title: Network +parent: Supported log types +nav_order: 70 +--- + +# Network + +The `network` log type records events that happen in a system's network, such as login attempts and application events. + +The following code snippet contains all the `raw_field` and `ecs` mappings for this log type: + +```json + "mappings": [ + { + "raw_field":"action", + "ecs":"netflow.firewall_event" + }, + { + "raw_field":"certificate.serial", + "ecs":"zeek.x509.certificate.serial" + }, + { + "raw_field":"name", + "ecs":"zeek.smb_files.name" + }, + { + "raw_field":"path", + "ecs":"zeek.smb_files.path" + }, + { + "raw_field":"dst_port", + "ecs":"destination.port" + }, + { + "raw_field":"qtype_name", + "ecs":"zeek.dns.qtype_name" + }, + { + "raw_field":"operation", + "ecs":"zeek.dce_rpc.operation" + }, + { + "raw_field":"endpoint", + "ecs":"zeek.dce_rpc.endpoint" + }, + { + "raw_field":"zeek.dce_rpc.endpoint", + "ecs":"zeek.dce_rpc.endpoint" + }, + { + "raw_field":"answers", + "ecs":"zeek.dns.answers" + }, + { + "raw_field":"query", + "ecs":"zeek.dns.query" + }, + { + "raw_field":"client_header_names", + "ecs":"zeek.http.client_header_names" + }, + { + "raw_field":"resp_mime_types", + "ecs":"zeek.http.resp_mime_types" + }, + { + "raw_field":"cipher", + "ecs":"zeek.kerberos.cipher" + }, + { + "raw_field":"request_type", + "ecs":"zeek.kerberos.request_type" + }, + { + "raw_field":"creationTime", + "ecs":"timestamp" + }, + { + "raw_field":"method", + "ecs":"http.request.method" + }, + { + "raw_field":"id.resp_p", + "ecs":"id.resp_p" + }, + { + "raw_field":"blocked", + "ecs":"blocked-flag" + }, + { + "raw_field":"id.orig_h", + "ecs":"id.orig_h" + }, + { + "raw_field":"Z", + "ecs":"Z-flag" + }, + { + "raw_field":"id.resp_h", + "ecs":"id.resp_h" + }, + { + "raw_field":"uri", + "ecs":"url.path" + }, + { + "raw_field":"c-uri", + "ecs":"url.path" + }, + { + "raw_field":"c-useragent", + "ecs":"user_agent.name" + }, + { + "raw_field":"status_code", + "ecs":"http.response.status_code" + }, + { + "raw_field":"rejected", + "ecs":"rejected" + }, + { + "raw_field":"dst_ip", + "ecs":"destination.ip" + }, + { + "raw_field":"src_ip", + "ecs":"source.ip" + }, + { + "raw_field":"user_agent", + "ecs":"user_agent.name" + }, + { + "raw_field":"request_body_len", + "ecs":"http.request.body.bytes" + }, + { + "raw_field":"service", + "ecs":"service" + } + ] +``` \ No newline at end of file diff --git a/_security-analytics/log-types-reference/okta.md b/_security-analytics/log-types-reference/okta.md new file mode 100644 index 0000000000..72097f0741 --- /dev/null +++ 
b/_security-analytics/log-types-reference/okta.md @@ -0,0 +1,25 @@ +--- +layout: default +title: Okta +parent: Supported log types +nav_order: 80 +--- + +# Okta + +The `okta` log type records Okta events generated from a range of actions, such as downloading an export file, requesting application access, or revoking privileges. + +The following code snippet contains all the `raw_field` and `ecs` mappings for this log type: + +```json + "mappings": [ + { + "raw_field":"eventtype", + "ecs":"okta.event_type" + }, + { + "raw_field":"displaymessage", + "ecs":"okta.display_message" + } + ] +``` \ No newline at end of file diff --git a/_security-analytics/log-types-reference/other.md b/_security-analytics/log-types-reference/other.md new file mode 100644 index 0000000000..acf23c10c0 --- /dev/null +++ b/_security-analytics/log-types-reference/other.md @@ -0,0 +1,40 @@ +--- +layout: default +title: Other log type mappings +parent: Supported log types +nav_order: 110 +--- + +# Other log type mappings + +Security Analytics supports field mappings that are not specific to a single service or system. These mapping types are separated into the following categories: + +- Application: Records application logs. +- Advanced Persistent Threat (APT): Records logs commonly associated with APT attacks. +- Compliance: Records logs related to compliance. +- macOS: Records event logs when using a Mac device to access a network. +- Proxy: Records logs related to proxy events. +- Web: Records logs related to network access from the web. + +Each log type contains the same field mappings, as shown in the following code snippet: + +```json + "mappings": [ + { + "raw_field":"record_type", + "ecs":"dns.answers.type" + }, + { + "raw_field":"query", + "ecs":"dns.question.name" + }, + { + "raw_field":"parent_domain", + "ecs":"dns.question.registered_domain" + }, + { + "raw_field":"creationTime", + "ecs":"timestamp" + } + ] +``` \ No newline at end of file diff --git a/_security-analytics/log-types-reference/s3.md b/_security-analytics/log-types-reference/s3.md new file mode 100644 index 0000000000..945370031e --- /dev/null +++ b/_security-analytics/log-types-reference/s3.md @@ -0,0 +1,29 @@ +--- +layout: default +title: Amazon S3 +parent: Supported log types +nav_order: 24 +--- + +# Amazon S3 + +The `s3` log type tracks network requests for access to Amazon S3 buckets. + +The following code snippet contains all the `raw_field` and `ecs` mappings for this log type: + +```json + "mappings": [ + { + "raw_field":"eventName", + "ecs":"aws.cloudtrail.event_name" + }, + { + "raw_field":"eventSource", + "ecs":"aws.cloudtrail.event_source" + }, + { + "raw_field":"eventTime", + "ecs":"timestamp" + } + ] +``` \ No newline at end of file diff --git a/_security-analytics/log-types-reference/vpc.md b/_security-analytics/log-types-reference/vpc.md new file mode 100644 index 0000000000..781bef16a9 --- /dev/null +++ b/_security-analytics/log-types-reference/vpc.md @@ -0,0 +1,143 @@ +--- +layout: default +title: VPC Flow +parent: Supported log types +nav_order: 90 +--- + +# VPC Flow + +The `vpcflow` log type records data about the IP traffic flowing to and from the network interfaces within a virtual private cloud (VPC). This data is stored using the [VPC Flow Logs](https://docs.aws.amazon.com/vpc/latest/userguide/flow-logs.html) feature. 
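For illustration only, a hypothetical VPC flow log record containing several of the `raw_field` names listed below might look like the following (all values are invented):

```json
{
  "version": 2,
  "account_id": "123456789012",
  "region": "us-east-1",
  "az_id": "use1-az1",
  "interface_id": "eni-0123456789abcdef0",
  "vpc_id": "vpc-0123456789abcdef0",
  "subnet_id": "subnet-0123456789abcdef0",
  "srcaddr": "10.0.0.15",
  "dstaddr": "203.0.113.10",
  "srcport": 49152,
  "dstport": 443,
  "protocol": 6,
  "packets": 10,
  "bytes": 1520,
  "start": 1708647106,
  "end": 1708647166,
  "action": "ACCEPT",
  "log_status": "OK"
}
```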
+ +The following code snippet contains all the `raw_field`, `ecs`, and `ocsf` mappings for this log type: + +```json + "mappings": [ + { + "raw_field":"version", + "ecs":"netflow.version", + "ocsf": "metadata.product.version" + }, + { + "raw_field":"account_id", + "ecs":"netflow.account_id", + "ocsf": "cloud.account_uid" + }, + { + "raw_field":"region", + "ecs":"netflow.region", + "ocsf": "cloud.region" + }, + { + "raw_field":"az_id", + "ecs":"netflow.az_id", + "ocsf": "cloud.zone" + }, + { + "raw_field":"srcport", + "ecs":"netflow.srcport", + "ocsf": "src_endpoint.port" + }, + { + "raw_field":"dstport", + "ecs":"netflow.dstport", + "ocsf": "dst_endpoint.port" + }, + { + "raw_field":"protocol", + "ecs":"netflow.protocol", + "ocsf": "connection_info.protocol_num" + }, + { + "raw_field":"packets", + "ecs":"netflow.packets", + "ocsf": "traffic.packets" + }, + { + "raw_field":"bytes", + "ecs":"netflow.bytes", + "ocsf": "traffic.bytes" + }, + { + "raw_field":"end", + "ecs":"netflow.end", + "ocsf": "end_time" + }, + { + "raw_field":"tcp_flags", + "ecs":"netflow.tcp_flags", + "ocsf": "connection_info.tcp_flags" + }, + { + "raw_field":"protocol_ver", + "ecs":"netflow.protocol_ver", + "ocsf": "connection_info.protocol_ver" + }, + { + "raw_field":"pkt_src_aws_service", + "ecs":"netflow.pkt_src_aws_service", + "ocsf": "src_endpoint.svc_name" + }, + { + "raw_field":"pkt_dst_aws_service", + "ecs":"netflow.pkt_dst_aws_service", + "ocsf": "dst_endpoint.svc_name" + }, + { + "raw_field":"log_status", + "ecs":"netflow.log_status", + "ocsf": "status_code" + }, + { + "raw_field":"action", + "ecs":"netflow.action", + "ocsf": "disposition_id" + }, + { + "raw_field":"traffic_path", + "ecs":"netflow.traffic_path", + "ocsf": "boundary_id" + }, + { + "raw_field":"flow_direction", + "ecs":"netflow.flow_direction", + "ocsf": "connection_info.direction_id" + }, + { + "raw_field":"dstaddr", + "ecs":"netflow.dstaddr", + "ocsf": "dst_endpoint.ip" + }, + { + "raw_field":"srcaddr", + "ecs":"netflow.srcaddr", + "ocsf": "src_endpoint.ip" + }, + { + "raw_field":"interface_id", + "ecs":"netflow.interface_id", + "ocsf": "dst_endpoint.interface_uid" + }, + { + "raw_field":"vpc_id", + "ecs":"netflow.vpc_id", + "ocsf": "dst_endpoint.vpc_uid" + }, + { + "raw_field":"instance_id", + "ecs":"netflow.instance_id", + "ocsf": "dst_endpoint.instance_uid" + }, + { + "raw_field":"subnet_id", + "ecs":"netflow.subnet_id", + "ocsf": "dst_endpoint.subnet_uid" + }, + { + "raw_field":"start", + "ecs":"timestamp", + "ocsf": "time" + } + ] +``` + diff --git a/_security-analytics/log-types-reference/waf.md b/_security-analytics/log-types-reference/waf.md new file mode 100644 index 0000000000..ffa8c45e95 --- /dev/null +++ b/_security-analytics/log-types-reference/waf.md @@ -0,0 +1,63 @@ +--- +layout: default +title: WAF +parent: Supported log types +nav_order: 95 +--- + +The `waf` log type monitors web application firewall (WAF) logs. The role of a WAF is to monitor and filter HTTP traffic flowing between a web application and the internet. A WAF prevents common security attacks, such as cross-site scripting (XSS) and SQL injection (SQLi). 
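For illustration only, a hypothetical AWS WAF log fragment containing several of the `raw_field` names listed below might look like the following (all values are invented):

```json
{
  "timestamp": 1708647166500,
  "httpRequest": {
    "httpMethod": "POST",
    "uri": "/login",
    "args": "redirect=%2Fadmin",
    "headers": [
      {
        "name": "User-Agent",
        "value": "Mozilla/5.0"
      }
    ]
  },
  "responseCodeSent": 403
}
```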
+ +The following code snippet contains all the `raw_field` and `ecs` mappings for this log type: + +```json + "mappings": [ + { + "raw_field":"cs-method", + "ecs":"waf.request.method" + }, + { + "raw_field":"httpRequest.httpMethod", + "ecs":"waf.request.method" + }, + { + "raw_field":"cs-uri-query", + "ecs":"waf.request.uri_query" + }, + { + "raw_field":"httpRequest.uri", + "ecs":"waf.request.uri_query" + }, + { + "raw_field":"httpRequest.args", + "ecs":"waf.request.uri_query" + }, + { + "raw_field":"cs-user-agent", + "ecs":"waf.request.headers.user_agent" + }, + { + "raw_field":"httpRequest.headers", + "ecs":"waf.request.headers" + }, + { + "raw_field":"sc-status", + "ecs":"waf.response.code" + }, + { + "raw_field":"responseCodeSent", + "ecs":"waf.response.code" + }, + { + "raw_field":"timestamp", + "ecs":"timestamp" + }, + { + "raw_field":"httpRequest.headers.value", + "ecs":"waf.request.headers.value" + }, + { + "raw_field":"httpRequest.headers.name", + "ecs":"waf.request.headers.name" + } + ] +``` \ No newline at end of file diff --git a/_security-analytics/log-types-reference/windows.md b/_security-analytics/log-types-reference/windows.md new file mode 100644 index 0000000000..e5eef0b0ee --- /dev/null +++ b/_security-analytics/log-types-reference/windows.md @@ -0,0 +1,845 @@ +--- +layout: default +title: Windows +parent: Supported log types +nav_order: 100 +--- + +# Windows + +The `windows` log type records events that happen in Windows applications, system services, and the Windows operating system. + +The following code snippet contains all the `raw_field` and `ecs` mappings for this log type: + +```json + "mappings":[ + { + "raw_field":"AccountName", + "ecs":"winlog.computerObject.name" + }, + { + "raw_field":"AuthenticationPackageName", + "ecs":"winlog.event_data.AuthenticationPackageName" + }, + { + "raw_field":"Channel", + "ecs":"winlog.channel" + }, + { + "raw_field":"Company", + "ecs":"winlog.event_data.Company" + }, + { + "raw_field":"ComputerName", + "ecs":"winlog.computer_name" + }, + { + "raw_field":"Description", + "ecs":"winlog.event_data.Description" + }, + { + "raw_field":"Details", + "ecs":"winlog.event_data.Detail" + }, + { + "raw_field":"Device", + "ecs":"winlog.event_data.Device" + }, + { + "raw_field":"FileName", + "ecs":"winlog.event_data.FileName" + }, + { + "raw_field":"FileVersion", + "ecs":"winlog.event_data.FileVersion" + }, + { + "raw_field":"IntegrityLevel", + "ecs":"winlog.event_data.IntegrityLevel" + }, + { + "raw_field":"IpAddress", + "ecs":"winlog.event_data.IpAddress" + }, + { + "raw_field":"KeyLength", + "ecs":"winlog.event_data.KeyLength" + }, + { + "raw_field":"Keywords", + "ecs":"winlog.keywords" + }, + { + "raw_field":"LogonId", + "ecs":"winlog.event_data.LogonId" + }, + { + "raw_field":"LogonProcessName", + "ecs":"winlog.event_data.LogonProcessName" + }, + { + "raw_field":"LogonType", + "ecs":"winlog.event_data.LogonType" + }, + { + "raw_field":"OriginalFilename", + "ecs":"winlog.event_data.OriginalFileName" + }, + { + "raw_field":"Path", + "ecs":"winlog.event_data.Path" + }, + { + "raw_field":"PrivilegeList", + "ecs":"winlog.event_data.PrivilegeList" + }, + { + "raw_field":"ProcessId", + "ecs":"winlog.event_data.ProcessId" + }, + { + "raw_field":"Product", + "ecs":"winlog.event_data.Product" + }, + { + "raw_field":"Provider", + "ecs":"winlog.provider_name" + }, + { + "raw_field":"ProviderName", + "ecs":"winlog.provider_name" + }, + { + "raw_field":"ScriptBlockText", + "ecs":"winlog.event_data.ScriptBlockText" + }, + { + 
"raw_field":"ServerName", + "ecs":"winlog.event_data.TargetServerName" + }, + { + "raw_field":"Service", + "ecs":"winlog.event_data.ServiceName" + }, + { + "raw_field":"Signed", + "ecs":"winlog.event_data.Signed" + }, + { + "raw_field":"State", + "ecs":"winlog.event_data.State" + }, + { + "raw_field":"Status", + "ecs":"winlog.event_data.Status" + }, + { + "raw_field":"SubjectDomainName", + "ecs":"winlog.event_data.SubjectDomainName" + }, + { + "raw_field":"SubjectLogonId", + "ecs":"winlog.event_data.SubjectLogonId" + }, + { + "raw_field":"SubjectUserName", + "ecs":"winlog.event_data.SubjectUserName" + }, + { + "raw_field":"SubjectUserSid", + "ecs":"winlog.event_data.SubjectUserSid" + }, + { + "raw_field":"TargetLogonId", + "ecs":"winlog.event_data.TargetLogonId" + }, + { + "raw_field":"TargetName", + "ecs":"winlog.event_data.TargetUserName" + }, + { + "raw_field":"TargetServerName", + "ecs":"winlog.event_data.TargetServerName" + }, + { + "raw_field":"TargetUserName", + "ecs":"winlog.event_data.TargetUserName" + }, + { + "raw_field":"TargetUserSid", + "ecs":"winlog.event_data.TargetUserSid" + }, + { + "raw_field":"TaskName", + "ecs":"winlog.task" + }, + { + "raw_field":"Type", + "ecs":"winlog.user.type" + }, + { + "raw_field":"User", + "ecs":"winlog.user.name" + }, + { + "raw_field":"UserName", + "ecs":"winlog.user.name" + }, + { + "raw_field":"Workstation", + "ecs":"winlog.event_data.Workstation" + }, + { + "raw_field":"WorkstationName", + "ecs":"winlog.event_data.Workstation" + }, + { + "raw_field":"event_uid", + "ecs":"winlog.event_id" + }, + { + "raw_field":"CommandLine", + "ecs":"process.command_line" + }, + { + "raw_field":"hostname", + "ecs":"host.hostname" + }, + { + "raw_field":"message", + "ecs":"windows.message" + }, + { + "raw_field":"Provider_Name", + "ecs":"winlog.provider_name" + }, + { + "raw_field":"EventId", + "ecs":"winlog.event_id" + }, + { + "raw_field":"processPath", + "ecs":"winlog.event_data.ProcessPath" + }, + { + "raw_field":"ProcessName", + "ecs":"winlog.event_data.ProcessName" + }, + { + "raw_field":"ObjectName", + "ecs":"winlog.computerObject.name" + }, + { + "raw_field":"param1", + "ecs":"winlog.event_data.param1" + }, + { + "raw_field":"param2", + "ecs":"winlog.event_data.param2" + }, + { + "raw_field":"creationTime", + "ecs":"timestamp" + }, + { + "raw_field":"Origin", + "ecs":"winlog.event_data.Origin" + }, + { + "raw_field":"ParentImage", + "ecs":"winlog.event_data.ParentImage" + }, + { + "raw_field":"TargetPort", + "ecs":"winlog.event_data.TargetPort" + }, + { + "raw_field":"Query", + "ecs":"winlog.event_data.Query" + }, + { + "raw_field":"DestinationPort", + "ecs":"destination.port" + }, + { + "raw_field":"StartAddress", + "ecs":"winlog.event_data.StartAddress" + }, + { + "raw_field":"TicketOptions", + "ecs":"winlog.event_data.TicketOptions" + }, + { + "raw_field":"ParentCommandLine", + "ecs":"winlog.event_data.ParentCommandLine" + }, + { + "raw_field":"AllowedToDelegateTo", + "ecs":"winlog.event_data.AllowedToDelegateTo" + }, + { + "raw_field":"HostApplication", + "ecs":"winlog.event_data.HostApplication" + }, + { + "raw_field":"AccessMask", + "ecs":"winlog.event_data.AccessMask" + }, + { + "raw_field":"Hashes", + "ecs":"winlog.event_data.Hashes" + }, + { + "raw_field":"SidHistory", + "ecs":"winlog.event_data.SidHistory" + }, + { + "raw_field":"Initiated", + "ecs":"winlog.event_data.Initiated" + }, + { + "raw_field":"DestinationIp", + "ecs":"destination.ip" + }, + { + "raw_field":"RelativeTargetName", + "ecs":"winlog.event_data.RelativeTargetName" + }, + 
{ + "raw_field":"Source_Name", + "ecs":"winlog.event_data.Source_Name" + }, + { + "raw_field":"AttributeLDAPDisplayName", + "ecs":"winlog.event_data.AttributeLDAPDisplayName" + }, + { + "raw_field":"DeviceDescription", + "ecs":"winlog.event_data.DeviceDescription" + }, + { + "raw_field":"AttributeValue", + "ecs":"winlog.event_data.AttributeValue" + }, + { + "raw_field":"ObjectValueName", + "ecs":"winlog.event_data.ObjectValueName" + }, + { + "raw_field":"QueryStatus", + "ecs":"winlog.event_data.QueryStatus" + }, + { + "raw_field":"TargetParentProcessId", + "ecs":"winlog.event_data.TargetParentProcessId" + }, + { + "raw_field":"OldUacValue", + "ecs":"winlog.event_data.OldUacValue" + }, + { + "raw_field":"FailureCode", + "ecs":"winlog.event_data.FailureCode" + }, + { + "raw_field":"OldTargetUserName", + "ecs":"winlog.event_data.OldTargetUserName" + }, + { + "raw_field":"NewUacValue", + "ecs":"winlog.event_data.NewUacValue" + }, + { + "raw_field":"ServiceName", + "ecs":"winlog.event_data.ServiceName" + }, + { + "raw_field":"Imphash", + "ecs":"winlog.event_data.Imphash" + }, + { + "raw_field":"NewValue", + "ecs":"winlog.event_data.NewValue" + }, + { + "raw_field":"Action", + "ecs":"winlog.event_data.Action" + }, + { + "raw_field":"SourceImage", + "ecs":"winlog.event_data.SourceImage" + }, + { + "raw_field":"QNAME", + "ecs":"winlog.event_data.QNAME" + }, + { + "raw_field":"Properties", + "ecs":"winlog.event_data.Properties" + }, + { + "raw_field":"AuditPolicyChanges", + "ecs":"winlog.event_data.AuditPolicyChanges" + }, + { + "raw_field":"Accesses", + "ecs":"winlog.event_data.Accesses" + }, + { + "raw_field":"ClassName", + "ecs":"winlog.event_data.ClassName" + }, + { + "raw_field":"ObjectClass", + "ecs":"winlog.event_data.ObjectClass" + }, + { + "raw_field":"PipeName", + "ecs":"winlog.event_data.PipeName" + }, + { + "raw_field":"HiveName", + "ecs":"winlog.event_data.HiveName" + }, + { + "raw_field":"StartModule", + "ecs":"winlog.event_data.StartModule" + }, + { + "raw_field":"HostVersion", + "ecs":"winlog.event_data.HostVersion" + }, + { + "raw_field":"DestinationHostname", + "ecs":"winlog.event_data.DestinationHostname" + }, + { + "raw_field":"QueryName", + "ecs":"winlog.event_data.QueryName" + }, + { + "raw_field":"RemoteName", + "ecs":"winlog.event_data.RemoteName" + }, + { + "raw_field":"PasswordLastSet", + "ecs":"winlog.event_data.PasswordLastSet" + }, + { + "raw_field":"ErrorCode", + "ecs":"winlog.event_data.ErrorCode" + }, + { + "raw_field":"AccessList", + "ecs":"winlog.event_data.AccessList" + }, + { + "raw_field":"Address", + "ecs":"winlog.event_data.Address" + }, + { + "raw_field":"PossibleCause", + "ecs":"winlog.event_data.PossibleCause" + }, + { + "raw_field":"DestPort", + "ecs":"destination.port" + }, + { + "raw_field":"Image", + "ecs":"winlog.event_data.Image" + }, + { + "raw_field":"CertThumbprint", + "ecs":"winlog.event_data.CertThumbprint" + }, + { + "raw_field":"TicketEncryptionType", + "ecs":"winlog.event_data.TicketEncryptionType" + }, + { + "raw_field":"ServiceType", + "ecs":"winlog.event_data.ServiceType" + }, + { + "raw_field":"ObjectServer", + "ecs":"winlog.event_data.ObjectServer" + }, + { + "raw_field":"ImagePath", + "ecs":"winlog.event_data.ImagePath" + }, + { + "raw_field":"NewName", + "ecs":"winlog.event_data.NewName" + }, + { + "raw_field":"CallTrace", + "ecs":"winlog.event_data.CallTrace" + }, + { + "raw_field":"SamAccountName", + "ecs":"winlog.event_data.SamAccountName" + }, + { + "raw_field":"GrantedAccess", + "ecs":"winlog.event_data.GrantedAccess" + }, + { + 
"raw_field":"EngineVersion", + "ecs":"winlog.event_data.EngineVersion" + }, + { + "raw_field":"OriginalName", + "ecs":"winlog.event_data.OriginalName" + }, + { + "raw_field":"AuditSourceName", + "ecs":"winlog.event_data.AuditSourceName" + }, + { + "raw_field":"sha1", + "ecs":"hash.sha1" + }, + { + "raw_field":"SourceIp", + "ecs":"source.ip" + }, + { + "raw_field":"Payload", + "ecs":"winlog.event_data.Payload" + }, + { + "raw_field":"Level", + "ecs":"winlog.event_data.Level" + }, + { + "raw_field":"Application", + "ecs":"winlog.event_data.Application" + }, + { + "raw_field":"RemoteAddress", + "ecs":"winlog.event_data.RemoteAddress" + }, + { + "raw_field":"SearchFilter", + "ecs":"winlog.event_data.SearchFilter" + }, + { + "raw_field":"ApplicationPath", + "ecs":"winlog.event_data.ApplicationPath" + }, + { + "raw_field":"TargetFilename", + "ecs":"winlog.event_data.TargetFilename" + }, + { + "raw_field":"CurrentDirectory", + "ecs":"winlog.event_data.CurrentDirectory" + }, + { + "raw_field":"ObjectType", + "ecs":"winlog.event_data.ObjectType" + }, + { + "raw_field":"ServicePrincipalNames", + "ecs":"winlog.event_data.ServicePrincipalNames" + }, + { + "raw_field":"TemplateContent", + "ecs":"winlog.event_data.TemplateContent" + }, + { + "raw_field":"QueryResults", + "ecs":"winlog.event_data.QueryResults" + }, + { + "raw_field":"ServiceStartType", + "ecs":"winlog.event_data.ServiceStartType" + }, + { + "raw_field":"EventType", + "ecs":"winlog.event_data.EventType" + }, + { + "raw_field":"TargetSid", + "ecs":"winlog.event_data.TargetSid" + }, + { + "raw_field":"ParentUser", + "ecs":"winlog.event_data.ParentUser" + }, + { + "raw_field":"NewTargetUserName", + "ecs":"winlog.event_data.NewTargetUserName" + }, + { + "raw_field":"DestAddress", + "ecs":"winlog.event_data.DestAddress" + }, + { + "raw_field":"ContextInfo", + "ecs":"winlog.event_data.ContextInfo" + }, + { + "raw_field":"HostName", + "ecs":"host.name" + }, + { + "raw_field":"NewTemplateContent", + "ecs":"winlog.event_data.NewTemplateContent" + }, + { + "raw_field":"LayerRTID", + "ecs":"winlog.event_data.LayerRTID" + }, + { + "raw_field":"ImageFileName", + "ecs":"winlog.event_data.ImageFileName" + }, + { + "raw_field":"StartFunction", + "ecs":"winlog.event_data.StartFunction" + }, + { + "raw_field":"Value", + "ecs":"winlog.event_data.Value" + }, + { + "raw_field":"ModifyingApplication", + "ecs":"winlog.event_data.ModifyingApplication" + }, + { + "raw_field":"Destination", + "ecs":"winlog.event_data.Destination" + }, + { + "raw_field":"Commandline", + "ecs":"winlog.event_data.Commandline" + }, + { + "raw_field":"Message", + "ecs":"winlog.event_data.Message" + }, + { + "raw_field":"ShareName", + "ecs":"winlog.event_data.ShareName" + }, + { + "raw_field":"SourcePort", + "ecs":"source.port" + }, + { + "raw_field":"CallerProcessName", + "ecs":"winlog.event_data.CallerProcessName" + }, + { + "raw_field":"ServiceFileName", + "ecs":"winlog.event_data.ServiceFileName" + }, + { + "raw_field":"DestinationIsIpv6", + "ecs":"winlog.event_data.DestinationIsIpv6" + }, + { + "raw_field":"TargetImage", + "ecs":"winlog.event_data.TargetImage" + }, + { + "raw_field":"SourceAddress", + "ecs":"source.ip" + }, + { + "raw_field":"TargetObject", + "ecs":"winlog.event_data.TargetObject" + }, + { + "raw_field":"Caption", + "ecs":"winlog.event_data.Caption" + }, + { + "raw_field":"LocalName", + "ecs":"winlog.event_data.LocalName" + }, + { + "raw_field":"ImageLoaded", + "ecs":"winlog.event_data.ImageLoaded" + }, + { + "raw_field":"EventID", + "ecs":"winlog.event_id" + }, + 
{ + "raw_field":"sha256", + "ecs":"hash.sha256" + }, + { + "raw_field":"ScriptBlockLogging", + "ecs":"winlog.event_data.ScriptBlockLogging" + }, + { + "raw_field":"SourceParentImage", + "ecs":"winlog.event_data.SourceParentImage" + }, + { + "raw_field":"SourceFilename", + "ecs":"winlog.event_data.SourceFilename" + }, + { + "raw_field":"Protocol", + "ecs":"winlog.event_data.Protocol" + }, + { + "raw_field":"ValidatedPolicy", + "ecs":"winlog.event_data.ValidatedPolicy" + }, + { + "raw_field":"ProcessPath", + "ecs":"winlog.event_data.ProcessPath" + }, + { + "raw_field":"OldValue", + "ecs":"winlog.event_data.OldValue" + }, + { + "raw_field":"ParentProcessId", + "ecs":"winlog.event_data.ParentProcessId" + }, + { + "raw_field":"TaskContentNew", + "ecs":"winlog.event_data.TaskContentNew" + }, + { + "raw_field":"Name", + "ecs":"winlog.event_data.Name" + }, + { + "raw_field":"payload", + "ecs":"winlog.event_data.payload" + }, + { + "raw_field":"SourceHostname", + "ecs":"winlog.event_data.SourceHostname" + }, + { + "raw_field":"ClientProcessId", + "ecs":"winlog.event_data.ClientProcessId" + }, + { + "raw_field":"TargetParentImage", + "ecs":"winlog.event_data.TargetParentImage" + }, + { + "raw_field":"ImpersonationLevel", + "ecs":"winlog.event_data.ImpersonationLevel" + }, + { + "raw_field":"ExceptionCode", + "ecs":"winlog.event_data.ExceptionCode" + }, + { + "raw_field":"FilterOrigin", + "ecs":"winlog.event_data.FilterOrigin" + }, + { + "raw_field":"PackagePath", + "ecs":"winlog.event_data.PackagePath" + }, + { + "raw_field":"SignatureStatus", + "ecs":"winlog.event_data.SignatureStatus" + }, + { + "raw_field":"Hash", + "ecs":"winlog.event_data.Hash" + }, + { + "raw_field":"AppID", + "ecs":"winlog.event_data.AppID" + }, + { + "raw_field":"SidList", + "ecs":"winlog.event_data.SidList" + }, + { + "raw_field":"ProcessNameBuffer", + "ecs":"winlog.event_data.ProcessNameBuffer" + }, + { + "raw_field":"PreviousCreationUtcTime", + "ecs":"winlog.event_data.PreviousCreationUtcTime" + }, + { + "raw_field":"Contents", + "ecs":"winlog.event_data.Contents" + }, + { + "raw_field":"TargetOutboundUserName", + "ecs":"winlog.event_data.TargetOutboundUserName" + }, + { + "raw_field":"ImageName", + "ecs":"winlog.event_data.ImageName" + }, + { + "raw_field":"md5", + "ecs":"hash.md5" + }, + { + "raw_field":"DeviceName", + "ecs":"winlog.event_data.DeviceName" + }, + { + "raw_field":"RequestedPolicy", + "ecs":"winlog.event_data.RequestedPolicy" + }, + { + "raw_field":"FileNameBuffer", + "ecs":"winlog.event_data.FileNameBuffer" + }, + { + "raw_field":"TaskContent", + "ecs":"winlog.event_data.TaskContent" + }, + { + "raw_field":"SourceCommandLine", + "ecs":"winlog.event_data.SourceCommandLine" + }, + { + "raw_field":"CreationUtcTime", + "ecs":"winlog.event_data.CreationUtcTime" + }, + { + "raw_field":"AppName", + "ecs":"winlog.event_data.AppName" + }, + { + "raw_field":"subjectName", + "ecs":"winlog.event_data.subjectName" + }, + { + "raw_field":"process", + "ecs":"winlog.event_data.process" + }, + { + "raw_field":"PackageFullName", + "ecs":"winlog.event_data.PackageFullName" + }, + { + "raw_field":"SourceName", + "ecs":"winlog.event_data.SourceName" + }, + { + "raw_field":"Data", + "ecs":"winlog.event_data.Data" + }, + { + "raw_field":"param3", + "ecs":"winlog.event_data.param3" + }, + { + "raw_field":"Signature", + "ecs":"winlog.event_data.Signature" + } + ] +``` \ No newline at end of file diff --git a/_security-analytics/sec-analytics-config/custom-log-type.md b/_security-analytics/sec-analytics-config/custom-log-type.md 
deleted file mode 100644 index 0bf628d85c..0000000000 --- a/_security-analytics/sec-analytics-config/custom-log-type.md +++ /dev/null @@ -1,29 +0,0 @@ ---- -layout: default -title: Creating custom log types -parent: Setting up Security Analytics -nav_order: 18 ---- - - -# Creating custom log types - -Log types represent the different sources of data used for threat detection in Security Analytics. In addition to the standard [log types]({{site.url}}{{site.baseurl}}/security-analytics/sec-analytics-config/log-types/) supported by Security Analytics, you can create custom log types for your threat detectors. - -## Creating a custom log type - -To create a custom log type: -1. From the dashboard, select **OpenSearch Plugins** > **Security Analytics**, and then select **Detectors** > **Log types**. -1. Select **Create log type**. -1. Enter a name and, optionally, a description for the log type. - - The log type name supports characters a--z (lowercase), 0--9, hyphens, and underscores. - {: .note } - -1. Select a category. The categories are listed in [Supported log types]({{site.url}}{{site.baseurl}}/security-analytics/sec-analytics-config/log-types/). -1. Select **Create log type** in the lower-right corner of the screen. The screen returns to the **Log types** page, and the new log type appears in the list. Note that the source for the new log type indicates **Custom**. - -## Log type API - -To perform operations for custom log types using the REST API, see [Log type APIs]({{site.url}}{{site.baseurl}}/security-analytics/api-tools/log-type-api/). - diff --git a/_security-analytics/sec-analytics-config/detectors-config.md b/_security-analytics/sec-analytics-config/detectors-config.md index 88ec95f6f7..74fa981279 100644 --- a/_security-analytics/sec-analytics-config/detectors-config.md +++ b/_security-analytics/sec-analytics-config/detectors-config.md @@ -20,7 +20,10 @@ To define a detector: 1. On the **Security Analytics** home page or the **Detectors** page, choose **Create detector**. 1. Give the detector a name and, optionally, a description. -1. In the **Data source** section, select one or more sources for the log data. Use an asterisk (*) to indicate a wildcard pattern. When selecting multiple data sources, their logs must be of the same type. We recommend creating separate detectors for different log types. +1. In the **Data source** section, select one or more sources for the log data. Use an asterisk (*) to indicate a wildcard pattern. When selecting multiple data sources, their logs must be of the same type. It is good practice to create separate detectors for different log types. Security Analytics also supports the following: + + - [Aliases]({{site.url}}{{site.baseurl}}/im-plugin/index-alias/): When configuring an alias as the data source, it must be attached to a **Write** index alias. When using an alias, ensure that your documents are ingested through the alias and **not** through the index for which the alias was created. + - [Data streams]({{site.url}}{{site.baseurl}}/im-plugin/data-streams/): A set of time-series data stored across multiple indexes but called using a single named resource. 1. In the **Detection** section, select a log type for the data source. For a list of supported log types, see [Supported log types]({{site.url}}{{site.baseurl}}/security-analytics/sec-analytics-config/log-types/). To create your own log types, see [Creating custom log types]({{site.url}}{{site.baseurl}}/security-analytics/sec-analytics-config/custom-log-type/). 
@@ -40,7 +43,9 @@ To define a detector: To quickly select one or more known rules and dismiss others, first deselect all rules by turning off the **Rule name** toggle, then search for your target rule names and select each individually by turning its toggle on. {: .tip } -1. Review the field mappings. Field mappings allow the system to accurately pass event data from the log to the detector and then use the data to trigger alerts. For more information about field mappings, see the **About field mappings** section later in this topic. +1. Review the field mappings. Field mappings allow the system to accurately pass event data from the log to the detector and then use the data to trigger alerts. For more information about field mappings, refer to the [A note on field mappings](#a-note-on-field-names) section. + +1. Choose whether to enable [threat intelligence]({{site.url}}{{site.baseurl}}/security-analytics/usage/detectors#threat-intelligence-feeds) feeds. Threat intelligence feeds only work with **standard** log types. 1. In the **Detector schedule** section, create a schedule for how often to run the detector. Specify a unit of time and a corresponding number to set the interval. The following image shows that the detector runs every 3 minutes. @@ -94,30 +99,8 @@ For more information about composite monitors and their workflows, see [Composit --- -## About field mappings -The data source (log index), log type, and detection rules specified in the first step determine which fields are available for mapping. For example, when "Windows logs" is selected as the log type, this parameter, along with the specific detection rules, determines the list of detection field names available for the mapping. Similarly, the selected data source determines the list of log source field names that are available for the mapping. - -The system uses prepackaged Sigma rules for detector creation. It can automatically map important fields for a specific log type to the corresponding fields in the Sigma rules. The field mapping step presents a view of automatically mapped fields while also providing the option to customize, change, or add new field mappings. When a detector includes customized rules, you can follow this step to manually map detector rule field names to log source field names. - -Because the system has the ability to automatically map field names, this step is optional. However, the more fields that can be mapped between detector fields and log source fields, the greater the accuracy of generated findings. - -### A note on field names -If you choose to perform manual field mapping, you should be familiar with the field names in the log index and have an understanding of the data contained in those fields. If you have an understanding of the log source fields in the index, the mapping is typically a straightforward process. - -Security Analytics takes advantage of prepackaged Sigma rules for security event detection. Therefore, the field names are derived from a Sigma rule field standard. To make them easier to identify, however, we have created aliases for the Sigma rule fields based on the open-source Elastic Common Schema (ECS) specification. These alias rule field names are the field names used in these steps. They appear in the **Detector field name** column of the mapping tables. 
- -Although the ECS rule field names are largely self-explanatory, you can find predefined mappings of the Sigma rule field names to ECS rule field names, for all supported log types, in the GitHub Security Analytics repository. Navigate to the [OSMappings](https://github.com/opensearch-project/security-analytics/tree/main/src/main/resources/OSMapping) folder and select the file for the specific log type. For example, to see the Sigma rule fields that correspond to ECS rule fields for the Windows log type, select the [`windows_logtype.json` file](https://github.com/opensearch-project/security-analytics/blob/main/src/main/resources/OSMapping/windows_logtype.json). The `raw_field` value in the file represents the Sigma rule field name in the mapping. - -### Amazon Security Lake logs -[Amazon Security Lake](https://docs.aws.amazon.com/security-lake/latest/userguide/what-is-security-lake.html) converts security log and event data to the [Open Cybersecurity Schema Framework](https://docs.aws.amazon.com/security-lake/latest/userguide/open-cybersecurity-schema-framework.html) (OCSF) to normalize combined data and facilitate its management. OpenSearch supports ingestion of log data from Security Lake in the OCSF format, and Security Analytics can automatically map fields from OCSF to ECS (the default field-mapping schema). - -The Security Lake log types that can be used as log sources for detector creation include CloudTrail, Route 53, and VPC Flow. Given that Route 53 is a log that captures DNS activity, its log type should be specified as **dns** when [defining a detector](#step-1-define-a-detector). Furthermore, because logs such as CloudTrail logs can conceivably be captured in both raw format and OCSF, it's good practice to name indexes in a way that keeps these logs separate and easily identifiable. This becomes helpful when specifying an index name in any of the APIs associated with Security Analytics. - -To reveal fields for a log index in either raw format or OCSF, use the [Get Mappings View]({{site.url}}{{site.baseurl}}/security-analytics/api-tools/mappings-api/#get-mappings-view) API and specify the index in the `index_name` field of the request. -{: .tip } - -### Automatically mapped fields +## Automatically mapped fields Once you select a data source and log type, the system attempts to automatically map fields between the log and rule fields. Switch to the **Mapped fields** tab to show the list of these mappings. When the field names are similar to one another, the system can successfully match the two, as shown in the following image. @@ -125,7 +108,9 @@ Once you select a data source and log type, the system attempts to automatically Although these automatic matches are normally dependable, it's still a good idea to review the mappings in the **Mapped fields** table and verify that they are correct and matched as expected. If you find a mapping that doesn't appear to be accurate, you can use the dropdown list to search for and select the correct field name. For more information about matching field names, see the following section. -### Available fields +For more information about field mappings, refer to the [About field mappings]({{site.url}}{{site.baseurl}}/security-analytics/sec-analytics-config/log-types#about-field-mappings) section within the [Working with log types]({{site.url}}{{site.baseurl}}/security-analytics/sec-analytics-config/log-types) documentation. + +## Available fields The field names that are not automatically mapped appear in the **Available fields** table. 
In this table you can manually map detection rule fields to data source fields, as shown in the following image. @@ -139,6 +124,47 @@ While mapping fields, consider the following: * Once the log source field name is selected and mapped to the detector field name, the icon in the **Status** column to the right changes from the alert icon to a check mark. * Make as many matches between field names as possible to complete an accurate mapping for the detector and log source fields. +### A note on field names + +If you choose to perform manual field mapping, you should be familiar with the field names in the log index and have an understanding of the data contained in those fields. If you have an understanding of the log source fields in the index, the mapping is typically a straightforward process. + +Security Analytics takes advantage of prepackaged Sigma rules for security event detection. Therefore, the field names are derived from a Sigma rule field standard. To make them more identifiable, aliases for the Sigma rule fields have been created based on the following specifications: + +- For all log types, the open-source Elastic Common Schema (ECS) +- For AWS CloudTrail and DNS log types, the [Open Cybersecurity Schema Framework](https://github.com/ocsf/ocsf-schema) (OCSF) + +The field names for alias rules are used in the following steps and are listed in the **Detector field name** column within the mapping tables. + +Predefined mappings that correlate the field names from Sigma rules to those of ECS rules for all supported log types are available in the following resources: + +- The [Supported log types]({{site.url}}{{site.baseurl}}/security-analytics/sec-analytics-config/log-types/) reference documentation +- The GitHub Security Analytics repository. To find the field mappings: + 1. Navigate to the [OSMappings](https://github.com/opensearch-project/security-analytics/tree/main/src/main/resources/OSMapping) folder. + 2. Select the file for the specific log type. For example, for the `windows` log type, to view field names that correlate from Sigma rules to those of ECS rules, select the [`windows_logtype.json`](https://github.com/opensearch-project/security-analytics/blob/main/src/main/resources/OSMapping/windows_logtype.json) file. The `raw_field` value represents the field name for the Sigma rule in the mapping. + +## Amazon Security Lake logs + +[Amazon Security Lake](https://docs.aws.amazon.com/security-lake/latest/userguide/what-is-security-lake.html) converts security log and event data to the [OCSF](https://docs.aws.amazon.com/security-lake/latest/userguide/open-cybersecurity-schema-framework.html) format to normalize combined data and facilitate its management. OpenSearch supports ingestion of log data from Amazon Security Lake in the OCSF format. Security Analytics can automatically map fields from OCSF to ECS (the default field mapping schema). + +The Amazon Security Lake log types that can be used as log sources for detector creation include AWS CloudTrail, Amazon Route 53, and VPC Flow Logs. Because Amazon Route 53 logs capture DNS activity, the log type must be specified as **dns** when [defining a detector](#step-1-define-a-detector). Because AWS CloudTrail logs can be captured in both raw format and OCSF, you should name indexes distinctly and identifiably. This can be helpful when specifying an index name in an API associated with Security Analytics. 
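+To check which fields are available in a raw or OCSF-formatted index before creating a detector, you can call the [Get Mappings View]({{site.url}}{{site.baseurl}}/security-analytics/api-tools/mappings-api/#get-mappings-view) API and specify the index in the `index_name` field of the request. The following is a minimal sketch; the index name `cloudtrail-ocsf-001` and the `rule_topic` value are hypothetical, so substitute the names used in your environment: + +```json +GET /_plugins/_security_analytics/mappings/view?index_name=cloudtrail-ocsf-001&rule_topic=cloudtrail +``` +{% include copy-curl.html %} + +The response lists the fields that can be mapped automatically as well as any unmapped index fields, which you can use to verify that the index contains the data expected for the selected log type.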
+ +Supported log types are available in the following resources: + +- For all log types, see the open-source ECS specification. +- For AWS CloudTrail, DNS log types, and VPC Flow Logs, see the [OCSF](https://github.com/ocsf/ocsf-schema). + + +The field names for alias rules are used in the following steps and are listed in the **Detector field name** column in the mapping tables. + +Predefined mappings that correlate the field names from Sigma rules to those of ECS rules for all supported log types are available in the following resources: + +- The [Supported log types]({{site.url}}{{site.baseurl}}/security-analytics/sec-analytics-config/log-types/) reference documentation. + +- The [GitHub Security Analytics](https://github.com/opensearch-project/security-analytics) repository. To find the field mappings: + 1. Navigate to the [OSMappings](https://github.com/opensearch-project/security-analytics/tree/main/src/main/resources/OSMapping) folder. + 2. Select the file for the specific log type. For example, for the `windows` log type, to view field names that correlate from Sigma rules to those of ECS rules, select the [`windows_logtype.json`](https://github.com/opensearch-project/security-analytics/blob/main/src/main/resources/OSMapping/windows_logtype.json) file. The `raw_field` value represents the field name for the Sigma rule in the mapping. + + ## What's next If you are ready to view findings generated by the new detector, see the [Working with findings]({{site.url}}{{site.baseurl}}/security-analytics/usage/findings/) section. If you would like to import rules or set up custom rules before working with findings, see the [Working with detection rules]({{site.url}}{{site.baseurl}}/security-analytics/usage/rules/) section. diff --git a/_security-analytics/sec-analytics-config/log-types.md b/_security-analytics/sec-analytics-config/log-types.md index fc016556be..7861393e23 100644 --- a/_security-analytics/sec-analytics-config/log-types.md +++ b/_security-analytics/sec-analytics-config/log-types.md @@ -1,51 +1,79 @@ --- layout: default -title: Supported log types +title: Working with log types parent: Setting up Security Analytics -nav_order: 16 +nav_order: 14 +redirect_from: + - /security-analytics/sec-analytics-config/custom-log-type/ --- +# Working with log types -# Supported log types +Log types represent the different data sources used for threat detection in Security Analytics. Log types are useful for categorizing or prepopulating [detection rules]({{site.url}}{{site.baseurl}}/security-analytics/sec-analytics-config/detectors-config/) when creating detectors from your source. -Logs contain raw data about events that happen throughout a system and in its separate parts. As of OpenSearch 2.11, log types are grouped by category to help select, filter, and search the log types. +Security Analytics supports the following log types: -To navigate to the **Log types** page, select **Log types** under **Detectors** in the **Security Analytics** navigation menu. The page shows the name of the log type, its description, its category, and identifies whether it's a standard OpenSearch-defined log type or a custom log type. The following image shows the **Log types** landing page with the Category column selected and the **Category** filter you can use to filter the list by the category. +- [Standard log types](#standard-log-types): Security Analytics automatically generates a list of data sources, along with their field mappings and rules, based on the data indexed from each source. 
+- [Custom log types](#creating-custom-log-types): When your data cannot be categorized as one of the [standard log types]({{site.url}}{{site.baseurl}}/security-analytics/sec-analytics-config/log-types/), you can create a user-defined log type. For enhanced threat detection, Security Analytics supports the integration of custom log types. + +To navigate to the **Log types** page, select **Log types** under **Detectors** in the **Security Analytics** navigation menu. + +## Page actions + +The main **Log types** UI features and actions are shown in the following image. These features are described in the list following the image. The Log types landing page. -The following table shows the log types that are currently supported by Security Analytics for ingestion, mapping, and monitoring. - -| Category | Log type | Description | -| :--- |:--- |:--- | -| Access Management | `Ad_ldap` | Active Directory logs that track LDAP queries, errors from the LDAP server, timeout events, and unsecure LDAP binds. | -| Access Management | `Apache_access` | Apache access logs that record data for all requests processed by an Apache HTTP server. | -| Access Management | `Okta` | Okta logs that record Okta events from a range of actions, such as downloading an export file, requesting application access, or revoking privileges. | -| Applications | `GitHub` | GitHub logs that monitor workflows created by [GitHub Actions](https://docs.github.com/en/actions/learn-github-actions/understanding-github-actions). | -| Applications| `Gworkspace` | Google Workspace logs that monitor log entries such as admin actions, group and group membership actions, and events related to logging in. | -| Applications| `M365` | Microsoft 365 audit logs that collect a range of data for Microsoft 365, including records from call details, performance data, SQL Server, security events, and access control activity. | -| Cloud Services | `Azure` | Microsoft Azure logs that monitor log data for cloud applications managed by Azure Cloud Services. | -| Cloud Services | `CloudTrail` | AWS CloudTrail logs that monitor events for an AWS CloudTrail account. OpenSearch can ingest CloudTrail log data from both [Amazon Simple Storage Service](https://docs.aws.amazon.com/AmazonS3/latest/userguide/Welcome.html) (Amazon S3) accounts and [Amazon Security Lake](https://docs.aws.amazon.com/security-lake/latest/userguide/what-is-security-lake.html) service accounts. | -| Cloud Services | `S3` | Amazon S3 logs that track requests for access to an S3 bucket. | -| Network Activity| `Dns` | DNS logs that store DNS activity. | -| Network Activity | `Network` | Network logs that record events that happen in a system's network, such as login attempts and application events. | -| Network Activity | `vpcflow` | [VPC Flow Logs](https://docs.aws.amazon.com/prescriptive-guidance/latest/logging-monitoring-for-application-owners/vpc-flow-logs.html) that capture information about the IP traffic going to and from network interfaces in your virtual private cloud (VPC). | -| Security | `Waf` | Web Application Firewall (WAF) logs (introduced in OpenSearch 2.11) for users that require monitoring of the WAF use case that's provided out of the box with Security Analytics. The role of WAF is to monitor and filter HTTP traffic between a web application and the internet. WAF prevents common security attacks, such as cross-site scripting (XSS) and SQL Injection (SQi). | -| System Activity | `Linux` | Linux system logs that record Linux syslog events. 
| -| System Activity | `Windows` | Windows logs that record events that have happened in the operating system, applications, and other Windows system services. | -| Other | `Email` | Logs that record email activity. | +1. Search **Standard** and **Custom** log types. + - For a list of **Standard** log types, see [Supported log types]({{site.url}}{{site.baseurl}}/security-analytics/sec-analytics-config/log-types/). +2. Create a [custom log type](#creating-custom-log-types). +3. Select the log type **Name** to open the details page. The **Details** tab is shown by default. This tab includes the log type's ID. You can also select the **Detection rules** tab to show all detection rules associated with the log type. +4. Select the **Category** or **Source** dropdown menu to sort by log type category or source. +5. From the **Actions** column, select the {::nomarkdown}trash can icon{:/} icon to delete a custom log type (you cannot delete a standard OpenSearch-defined log type). Then follow the prompts to confirm and delete it. -## Page actions +## Standard log types + +As of OpenSearch 2.11, all standard log types are grouped by the following categories: + +- **Access Management** includes [AD/LDAP]({{site.url}}{{site.baseurl}}/security-analytics/log-types-reference/ad-ldap/), [Apache Access]({{site.url}}{{site.baseurl}}/security-analytics/log-types-reference/apache-access/), and [Okta]({{site.url}}{{site.baseurl}}/security-analytics/log-types-reference/okta/). +- **Applications** includes [GitHub]({{site.url}}{{site.baseurl}}/security-analytics/log-types-reference/github/), [Google Workspace]({{site.url}}{{site.baseurl}}/security-analytics/log-types-reference/gworkspace/), and [Microsoft 365]({{site.url}}{{site.baseurl}}/security-analytics/log-types-reference/m365/). +- **Cloud Services** includes [Azure]({{site.url}}{{site.baseurl}}/security-analytics/log-types-reference/azure/), [AWS CloudTrail]({{site.url}}{{site.baseurl}}/security-analytics/log-types-reference/cloudtrail/), and [Amazon S3]({{site.url}}{{site.baseurl}}/security-analytics/log-types-reference/s3/). +- **Network Activity** includes [DNS]({{site.url}}{{site.baseurl}}/security-analytics/log-types-reference/dns/), [Network]({{site.url}}{{site.baseurl}}/security-analytics/log-types-reference/network/), [NetFlow]({{site.url}}{{site.baseurl}}/security-analytics/log-types-reference/netflow/), and [VPC Flow]({{site.url}}{{site.baseurl}}/security-analytics/log-types-reference/vpc/). +- **Security** includes [WAF]({{site.url}}{{site.baseurl}}/security-analytics/log-types-reference/waf/). +- **System Activity** includes [Linux]({{site.url}}{{site.baseurl}}/security-analytics/log-types-reference/linux/) and [Windows]({{site.url}}{{site.baseurl}}/security-analytics/log-types-reference/windows/). +- **Other** includes accounts for log types that are not contained in a specific category. For more information, refer to [Other log types]({{site.url}}{{site.baseurl}}/security-analytics/log-types-reference/other/). + + +## Creating custom log types + +When connecting to a data source not supported by a standard log type, create a custom log type by following these steps: + +1. In OpenSearch Dashboards, select **OpenSearch Plugins** > **Security Analytics**, and then select **Detectors** > **Log types**. +1. Select **Create log type**. +1. Enter a name and, optionally, a description for the log type. + + The log type name supports characters a--z (lowercase), 0--9, hyphens, and underscores. + {: .note } + +1. Select a category. 
The categories are listed in the [Supported log types]({{site.url}}{{site.baseurl}}/security-analytics/sec-analytics-config/log-types/) documentation. +1. Select **Create log type** in the lower-right corner of the screen. The screen returns to the **Log types** page. The new log type appears in the list. Note that the source for the new log type indicates **Custom**. + +## About field mappings + +The log type specified when creating a detector determines which fields are available for mapping. For example, when **Windows logs** is selected, this parameter and the specific detection rules determine the list of detection field names available for the mapping. Similarly, the selected data source determines the list of log source field names that are available for the mapping. + +Security Analytics uses prepackaged Sigma rules for detector creation. It can automatically map important fields of a particular log type to the relevant fields in Sigma rules. The **Field Mappings** section shows fields that have been mapped automatically. In this section, you can customize, change, or add new field mappings. When a detector includes customized rules, you can manually map detector rule field names to log source field names. + +Because the system can automatically map field names, manually mapping fields is optional when `ecs` fields exist in a document. Detector rules, however, require certain mappings in order to operate. These mappings depend on the detector rule. The more fields that can be mapped between detector fields and log source fields, the greater the accuracy of generated findings. + +## Log type APIs + +Use the log type APIs to perform custom log type operations using the REST API. For more information, refer to the [log type APIs]({{site.url}}{{site.baseurl}}/security-analytics/api-tools/log-type-api/) documentation. -The following list describes the main features found on the **Log types** page and the actions you can take: -* Select the log type **Name** to open the log type's details page. The **Details** tab is shown by default. This tab includes the log type's ID. You can also select the **Detection rules** tab to show all detection rules associated with the log type. -* In the **Actions** column, you can select the trash can icon ({::nomarkdown}trash can icon{:/}) to delete a custom log type (you cannot delete a standard OpenSearch-defined log type). Follow the prompts to confirm and safely remove the custom log type. -* Select **Create log type** in the upper-right corner of the screen to begin creating a custom log type. The **Create log type** page opens. Continue with the steps in the section that follows to create a custom log type. -* Using the **Category** and **Source** dropdowns, you can sort by the log type category or source, respectively. -## Related articles -[Creating custom log types]({{site.url}}{{site.baseurl}}/security-analytics/sec-analytics-config/custom-log-type/) diff --git a/_security-analytics/usage/detectors.md b/_security-analytics/usage/detectors.md index a439b21f00..bd7868bc37 100644 --- a/_security-analytics/usage/detectors.md +++ b/_security-analytics/usage/detectors.md @@ -35,6 +35,16 @@ To edit a detector, begin by selecting the link to the detector in the Detector After you select the **Alert triggers** tab, you also have the option to add additional alerts for the detector by selecting **Add another alert condition** at the bottom of the page.
{: .tip } +### Threat intelligence feeds + +A threat intelligence feed is a real-time, continuous data stream that gathers information related to risks or threats. A piece of information in the tactical threat intelligence feed suggesting that your cluster may have been compromised, such as a login from an unknown user or location or anomalous activity like an increase in read volume, is called an *indicator of compromise* (IoC). These IoCs can be used by investigators to help isolate security incidents. + +As of OpenSearch 2.12, you can enable threat intelligence for Sigma rules related to malicious IP addresses. + +To enable threat intelligence feeds, select the **Enable threat intelligence-based detection** option. + +Threat intelligence feeds only work with **standard** log types. + --- ## Detector actions diff --git a/_security/access-control/anonymous-authentication.md b/_security/access-control/anonymous-authentication.md index 429daafb9b..cb2f951546 100644 --- a/_security/access-control/anonymous-authentication.md +++ b/_security/access-control/anonymous-authentication.md @@ -30,6 +30,19 @@ The following table describes the `anonymous_auth_enabled` setting. For more inf If you disable anonymous authentication, you must provide at least one `authc` in order for the Security plugin to initialize successfully. {: .important } +## OpenSearch Dashboards configuration + +To enable anonymous authentication for OpenSearch Dashboards, you need to modify the `opensearch_dashboards.yml` file in the configuration directory of your OpenSearch Dashboards installation. + +Add the following setting to `opensearch_dashboards.yml`: + +```yml +opensearch_security.auth.anonymous_auth_enabled: true +``` + +Anonymous login for OpenSearch Dashboards requires anonymous authentication to be enabled on the OpenSearch cluster. +{: .important} + ## Defining anonymous authentication privileges When anonymous authentication is enabled, your defined HTTP authenticators still try to find user credentials inside your HTTP request. If credentials are found, the user is authenticated. If none are found, the user is authenticated as an `anonymous` user. diff --git a/_security/access-control/api.md b/_security/access-control/api.md index ee6aee7d71..8a464bdeb1 100644 --- a/_security/access-control/api.md +++ b/_security/access-control/api.md @@ -29,6 +29,13 @@ plugins.security.restapi.roles_enabled: ["", ...] ``` {% include copy.html %} +If you're working with APIs that manage `Distinguished names` or `Certificates` that require super admin access, enable the REST API admin configuration in your `opensearch.yml` file as shown in the following setting example: + +```yml +plugins.security.restapi.admin.enabled: true +``` +{% include copy.html %} + These roles can now access all APIs. To prevent access to certain APIs: ```yml @@ -1290,6 +1297,91 @@ PATCH _plugins/_security/api/securityconfig } ``` +### Configuration upgrade check + +Introduced 2.13 +{: .label .label-purple } + +Checks the current configuration bundled with the host's Security plugin and compares it to the version of the OpenSearch Security plugin the user downloaded. Then, the API responds indicating whether or not an upgrade can be performed and what resources can be updated. + +With each new OpenSearch version, there are changes to the default security configuration. This endpoint helps cluster operators determine whether the cluster is missing defaults or has stale definitions of defaults. 
+{: .note} + +#### Request + +```json +GET _plugins/_security/api/_upgrade_check +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ + "status" : "OK", + "upgradeAvailable" : true, + "upgradeActions" : { + "roles" : { + "add" : [ "flow_framework_full_access" ] + } + } +} +``` + +#### Response fields + +| Field | Data type | Description | +|:---------|:-----------|:------------------------------| +| `upgradeAvailable` | Boolean | Responds with `true` when an upgrade to the security configuration is available. | +| `upgradeActions` | Object list | A list of security objects that would be modified when upgrading the host's Security plugin. | + +### Configuration upgrade + +Introduced 2.13 +{: .label .label-purple } + +Adds and updates resources in a host's existing security configuration using the configuration bundled with the latest version of the Security plugin. + +These bundled configuration files can be found in the `/security/config` directory. Default configuration files are updated when OpenSearch is upgraded, whereas the cluster configuration is only updated by the cluster operators. This endpoint helps cluster operators upgrade missing defaults and stale default definitions. + +#### Request + +```json +POST _plugins/_security/api/_upgrade_perform +{ + "configs" : [ "roles" ] +} +``` +{% include copy-curl.html %} + +#### Request fields + +| Field | Data type | Description | Required | +|:----------------|:-----------|:------------------------------|:---------| +| `configs` | Array | Specifies the configurations to be upgraded. This field can include any combination of the following configurations: `actiongroups`, `allowlist`, `audit`, `internalusers`, `nodesdn`, `roles`, `rolesmappings`, `tenants`.
Default is all supported configurations. | No | + +#### Example response + +```json +{ + "status" : "OK", + "upgrades" : { + "roles" : { + "add" : [ "flow_framework_full_access" ] + } + } +} +``` + +#### Response fields + +| Field | Data type | Description | +|:---------|:-----------|:------------------------------| +| `upgrades` | Object | A container for the upgrade results, organized by configuration type, such as `roles`. Each changed configuration type will be represented as a key in this object. | +| `roles` | Object | Contains a list of role-based action keys of objects modified by the upgrade. | + --- ## Distinguished names diff --git a/_security/access-control/authentication-tokens.md b/_security/access-control/authentication-tokens.md new file mode 100644 index 0000000000..4b91da20c1 --- /dev/null +++ b/_security/access-control/authentication-tokens.md @@ -0,0 +1,150 @@ +--- +layout: default +title: Authorization tokens +parent: Access control +nav_order: 125 +redirect_from: + - /security/access-control/authorization-tokens/ + - /security-plugin/access-control/authorization-tokens/ +--- + +# Authorization tokens + +The Security plugin allows you to configure two types of authentication tokens: On-Behalf-Of (OBO) tokens and Service Account tokens. + +## On-Behalf-Of authentication + +The following sections describe the use, configuration, structure, and endpoint for OBO tokens. + +### Usage + +On-Behalf-Of tokens are a special form of JSON Web Token (JWT) used for managing authentication requests between a user's client and an extension. These tokens operate "just-in-time," meaning that a token is issued immediately before it is required for authentication. A token has a configurable window of validity (a default of 5 minutes and a maximum of 10 minutes), after which it expires and cannot be used. + +An extension can use an OBO token to interact with an OpenSearch cluster, using the same privileges as the user it represents. This is why these tokens are called "on-behalf-of." Since these tokens are not restricted, they enable services to function as though they are the original user until the token expires. This implies that the feature's applicability extends beyond extension-related use cases, allowing for a wider range of uses. + +### Configuration + +In the [security `config.yml` file]({{site.url}}{{site.baseurl}}/security/configuration/configuration/), the OBO configuration is located under the dynamic configuration section. It contains the signing key for the token signature and the encryption key used to decrypt the role information in the token payload: + +``` +config: + dynamic: + on_behalf_of: + enabled: #'true'/unspecified is considered 'enabled' + signing_key: #encoded signing key here + encryption_key: #encoded encryption key here +... +``` + +The default encoding algorithm for signing the JWT is HMAC SHA512. Both the signing key and the encryption key are base64 encoded and stored on the OpenSearch node's file system. The keys should be the same on all hosts; otherwise, encryption and decryption operations may fail. The keys' deployment is managed by the cluster operator. + +### Token structure + +The payload of an OBO token must include all of the standard JWT claims, along with encrypted and decrypted roles. Depending on the Plugin Backward Compatibility Mode setting, backend roles should also be incorporated into role claims. The absence of any of these claims results in a malformed token that fails to meet the required standard for authentication.
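+For illustration only, the decoded payload of an OBO token might resemble the following sketch. All values shown are hypothetical (the timestamps are arbitrary epoch seconds, and the encrypted roles claim is abbreviated); each claim is described in the list that follows: + +```json +{ + "iss": "opensearch-cluster-identifier", + "iat": 1700000000, + "nbf": 1700000000, + "exp": 1700000300, + "sub": "admin", + "aud": "self-issued", + "er": "<encrypted mapped roles>" +} +```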
+ +The OBO token contains the following claims: +* Issuer (`iss`): OpenSearch cluster identifier + * It is essential that the issuer is validated as a part of security control measures. This strategy is forward-thinking, particularly in the context of potential multi-tenant scenarios, such as OpenSearch Serverless, where differing cryptographic keys could be associated with each issuer. By checking the value of issuer, each OBO token is restricted to its associated issuer. +* Issue-at (`iat`): Current time for issuing this token + * Used as the reference of the expiration. +* Not-before (`nbf`): The earliest point at which the token can be used + * Given that the OBO token is designed for just-in-time usage, its `nbf` should align with the issued-at time (`iat`), indicating the moment when the token was created. +* Expiry (`exp`): Expiration time + * Each OBO token incorporates an expiration mechanism, which is verified upon its receipt. Once a token is issued, it cannot be revoked. Instead, the token is only invalidated upon its expiration. Further, the generation of OBO tokens by extensions is subject to dynamic settings. This functionality safeguards the system by preventing the issuance of future tokens under certain conditions. + * The default configuration establishes an expiration time of 300 seconds for OBO tokens. Recognizing that different scenarios may necessitate different token durations, OpenSearch has the capability for users to personalize this expiration time. The maximum duration that can be assigned to a token is 600 seconds. + * In reference to the OBO token's current design, token revocation isn't a current concern, given its intended just-in-time use and brief lifespan. If, however, future adjustments necessitate an extended lifespan for this token, token revocation will be added. This strategy will be adopted to improve and solidify the security measures associated with OBO token use. +* Subject (`sub`): User identifier + * Name of the user with which this OBO token is associated. +* Audience (`aud`): The extension’s unique identifier + * For the extension use case, the `aud` field is a reference to the specific extension that represents the target service. + * For the REST API use case, the API parameter service enables the specifying of the target service(s) using this token. The default value is set to `self-issued`. +* Roles: Security privilege evaluation + * The Role Security Mode [[source code](https://github.com/opensearch-project/security/blob/main/src/main/java/org/opensearch/security/authtoken/jwt/JwtVendor.java#L151)]: The configuration determines the roles claim encryption. + * Role Security Mode On (default value): Roles claim will be encrypted. + * Encrypted mapped roles (`er`) + * Role Security Mode Off: Roles claims is in plain-text. Both the mapped roles and backend roles are included in the claim [[related discussion](https://github.com/opensearch-project/security/issues/2865)]. + * Decrypted mapped roles in plain text (`dr`) + * Decrypted backend roles (`br`) + +The OpenSearch Security plugin handles the encryption and decryption processes. This approach ensures the protection of user information, even when traversing the trust boundary between OpenSearch and any third-party services. + +### API endpoint + +You can access the `POST /_plugins/_security/api/generateonbehalfoftoken` API endpoint on the Security plugin in order to create a short-lived, self-issued OBO token to perform certain actions on the user's behalf. 
+ +To access this API endpoint, the request body can contain the following three API parameters: + +* `description`: This parameter allows the user to articulate the purpose for requesting this token, providing clarity and transparency. +* `service` (optional): This parameter is directed to the audience claim of the OBO token. It offers users the opportunity to designate the target service for which they intend to use the token. If not specified, the default value is set to `self-issued`. +* `durationSeconds` (optional): This parameter allows users to customize the token's expiration time according to its anticipated usage. The maximum duration is capped at 600 seconds to maintain security. If not specified, the default duration is set to 300 seconds. +The following is an example of requesting an OBO token with a lifespan of 3 minutes (180 seconds) for the user `admin` for testing purposes: + +```json +POST /_plugins/_security/api/generateonbehalfoftoken +{ + "description":"Testing", + "service":"Testing Service", + "durationSeconds":"180" +} +``` +{% include copy-curl.html %} + +### Additional authorization restriction + +While the conversation about the usage of OBO tokens continues, it is critical to manage certain edge cases. Even though an OBO token can act as a valid Bearer authorization header for any API access, certain limitations are needed. For instance, using an OBO token to access the API endpoint that issues another OBO token should be forbidden. Similarly, using an OBO token to access the reset password API in order to modify a user's authentication information should be disallowed. These preventive measures are necessary to uphold the system's integrity and security. + +For more information, see the [related discussion](https://github.com/opensearch-project/security/issues/2891). + +## Service Accounts + +Service Account tokens are the second form of authentication token supported by the Security plugin. + +### Introduction + +Service Accounts are a new authC/authZ path where extensions can run requests without assuming the role(s) of the active user. Service Accounts are a special type of principal associated with each extension and have a set of permissions. The permissions assigned to Service Accounts grant the associated extension the authorization to run any of the mapped operations without needing to assume the roles of the active user or stash the user's role(s) in the ephemeral user context. + +Currently, Service Accounts only permit operations on system indexes associated with the mapped extension. +{: .important} + +### Background + +Before the introduction of Service Accounts, it was not possible for an extension to service a request without assuming the roles of the active user. Instead, when a request was processed, an ephemeral "Plugin User" was created. The Plugin User then assumed all the permissions of the currently authenticated operator (human user). The result was a Plugin User that acted on the extension's behalf but had all of the privileges of the operator. In this way, the previous model can be said to have had extensions "impersonate" the operator. This impersonation approach led to two main issues: +* Impersonation compromises referential integrity, meaning it is difficult for auditors to identify which requests were run by an extension or by an operator. A system with referential integrity maintains a transactional record in its audit log. The record provides a clear history of actions taken by various subjects at specific times.
When extensions impersonate users for both requests they make on behalf of the operator and requests they send on their own, the audit log lacks referential integrity. +* Impersonation also makes it impossible to restrict an extension's permissions beyond those of the user it impersonates. When an extension assumes the roles of the active subject, it copies all of the roles, including permissions that are unnecessary for completing its intended actions. This practice not only deviates from the principle of least privilege but also increases the threat surface area. With each additional permission granted to the Plugin User, the potential impact of a misconfigured or malicious extension grows. + +### Benefits + +Service Accounts address the issues described in the Background section by defining a separate state in which autonomously operating extensions run. Service Accounts maintain referential integrity by introducing a distinct state in which extensions run when sending requests on their own behalf. +Audit logging can then record when an extension runs on its own (making authC/authZ calls against its Service Account) or when it runs an action on behalf of the operator and therefore makes use of an OBO token. + +Similarly, Service Accounts address threat exposure concerns by separating the roles an extension assumes from those of the operator or a generic hard-coded user (such as those in the `internal_users.yml` file). +Service Accounts do not assume the roles of the operator but instead have their own privileges listed in the Service Account. The roles associated with Service Accounts can therefore be as restrictive as possible, in alignment with the principle of least privilege. To avoid providing extensions with overly permissive Service Accounts, extension authors should have a strong understanding of what types of operations their extensions need to run. + +### API Endpoint + +As suggested by the name, the Boolean flag `service` denotes whether a given internal user account is a Service Account. If an account is not a Service Account, then any attempts to generate an associated authorization token for the account will fail. Similarly, the `enabled` field determines when a Service Account can be used by an extension to perform operations. If a Service Account is not `enabled`, attempts to fetch its authorization token will be blocked, and the Service Account will be unable to run requests on its own behalf using a previously issued authorization token. +The following is an example of creating a Service Account with all permissions (the `all_access` role) for your service or extension: +```json +PUT /_plugins/_security/api/internalusers/admin_service +{ + "opendistro_security_roles": ["all_access"], + "backend_roles": [], + "attributes": { + "enabled": "true", + "service": "true" + } +} +``` + +{% include copy-curl.html %} + +## Handling OBO and Service Account requests +While both OBO token handling and Service Accounts can be viewed as independent features, the most significant benefits are realized when they are used together. Specifically, OpenSearch exposes a client that is used to connect to the OpenSearch cluster and provides the plugins with the capability to run requests. +With OBO tokens and Service Accounts, the client can now be used to handle requests that use both of these features. When the client makes a request that requires an extension to use an OBO token, the first step for handling the request is forwarding the request to the Security plugin.
In the Security plugin, the request is authenticated and authorized against the active user. If the active user is permitted, the request returns to OpenSearch’s core code base, where a request to create an OBO token for the target extension using the active user’s identity is created. This request to generate the OBO token is then handled by the _`IdentityPlugin`_ implementation. In the standard scenario this is the Security plugin, so the request is returned to the Security plugin’s implementation of the `TokenManager` interface, which generates a new OBO token for the request. +After generating the token, the Security plugin forwards the request with the OBO token to the extension. At that point, the extension can call OpenSearch’s REST methods with the token. The permissions associated with the token will then be evaluated for the authorization of the request. If the token conveys the permissions required for the operation, the action will be performed, and the response will be sent back to the extension. After processing OpenSearch’s response, the extension will forward its own handling of the response to the client. If the OBO token does not entail the permissions required for initiating the target action, a forbidden response is returned to the extension. + +Extensions acting on their own behalf also use the client that is exposed by OpenSearch. When an extension is first initialized in OpenSearch, the `IdentityPlugin` is triggered to create a new Service Accounts for it and to provide the associated Service Accounts token. In the default configuration, the Security plugin is the `IdentityPlugin` and handles these processes. +After OpenSearch receives the Service Accounts token, it forwards that token to the associated extension. After the extension has received its token, requests by the client to make use of the Service Accounts associated with the extension are operable. In these scenarios, the extension receives the requests from the client and then forwards the request along with the Service Accounts token to OpenSearch. OpenSearch further transfers the packages to the Security plugin, where the token is parsed and the request is treated as a traditional request using "Basic Authentication" in the `InternalAuthenticationBackend`. + +In both OBO and Service Accounts token request flows, the `TokenManager` interface for the `IdentityPlugin` is used by the `IdentityPlugin` to handle the tokens' distribution and processing. This interface is implemented by the Security plugin as an `IdentityPlugin` and contains logic for issuing a token that is either an OBO or Service Accounts token. + + diff --git a/_security/access-control/cross-cluster-search.md b/_security/access-control/cross-cluster-search.md deleted file mode 100644 index 182803da5d..0000000000 --- a/_security/access-control/cross-cluster-search.md +++ /dev/null @@ -1,243 +0,0 @@ ---- -layout: default -title: Cross-cluster search -parent: Access control -nav_order: 105 -redirect_from: - - /security/access-control/cross-cluster-search/ - - /security-plugin/access-control/cross-cluster-search/ ---- - -# Cross-cluster search - -Cross-cluster search is exactly what it sounds like: it lets any node in a cluster execute search requests against other clusters. The Security plugin supports cross-cluster search out of the box. - ---- - -#### Table of contents -1. TOC -{:toc} - - ---- - -## Authentication flow - -When accessing a *remote cluster* from a *coordinating cluster* using cross-cluster search: - -1. 
The Security plugin authenticates the user on the coordinating cluster. -1. The Security plugin fetches the user's backend roles on the coordinating cluster. -1. The call, including the authenticated user, is forwarded to the remote cluster. -1. The user's permissions are evaluated on the remote cluster. - -You can have different authentication and authorization configurations on the remote and coordinating cluster, but we recommend using the same settings on both. - - -## Permissions - -To query indexes on remote clusters, users need to have `READ` or `SEARCH` permissions. Furthermore, when the search request includes the query parameter `ccs_minimize_roundtrips=false` – which tells OpenSearch not to minimize outgoing and ingoing requests to remote clusters – users need to have the following additional permission for the index: - -``` -indices:admin/shards/search_shards -``` - -For more information about the `ccs_minimize_roundtrips` parameter, see the list of [URL Parameters]({{site.url}}{{site.baseurl}}/api-reference/search/#url-parameters) for the Search API. - -#### Sample roles.yml configuration - -```yml -humanresources: - cluster: - - CLUSTER_COMPOSITE_OPS_RO - indices: - 'humanresources': - '*': - - READ - - indices:admin/shards/search_shards # needed when the search request includes parameter setting 'ccs_minimize_roundtrips=false'. -``` - - -#### Sample role in OpenSearch Dashboards - -![OpenSearch Dashboards UI for creating a cross-cluster search role]({{site.url}}{{site.baseurl}}/images/security-ccs.png) - - -## Walkthrough - -Save this file as `docker-compose.yml` and run `docker-compose up` to start two single-node clusters on the same network: - -```yml -version: '3' -services: - opensearch-ccs-node1: - image: opensearchproject/opensearch:{{site.opensearch_version}} - container_name: opensearch-ccs-node1 - environment: - - cluster.name=opensearch-ccs-cluster1 - - discovery.type=single-node - - bootstrap.memory_lock=true # along with the memlock settings below, disables swapping - - "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m" # minimum and maximum Java heap size, recommend setting both to 50% of system RAM - ulimits: - memlock: - soft: -1 - hard: -1 - volumes: - - opensearch-data1:/usr/share/opensearch/data - ports: - - 9200:9200 - - 9600:9600 # required for Performance Analyzer - networks: - - opensearch-net - - opensearch-ccs-node2: - image: opensearchproject/opensearch:{{site.opensearch_version}} - container_name: opensearch-ccs-node2 - environment: - - cluster.name=opensearch-ccs-cluster2 - - discovery.type=single-node - - bootstrap.memory_lock=true # along with the memlock settings below, disables swapping - - "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m" # minimum and maximum Java heap size, recommend setting both to 50% of system RAM - ulimits: - memlock: - soft: -1 - hard: -1 - volumes: - - opensearch-data2:/usr/share/opensearch/data - ports: - - 9250:9200 - - 9700:9600 # required for Performance Analyzer - networks: - - opensearch-net - -volumes: - opensearch-data1: - opensearch-data2: - -networks: - opensearch-net: -``` - -After the clusters start, verify the names of each: - -```json -curl -XGET -u 'admin:admin' -k 'https://localhost:9200' -{ - "cluster_name" : "opensearch-ccs-cluster1", - ... -} - -curl -XGET -u 'admin:admin' -k 'https://localhost:9250' -{ - "cluster_name" : "opensearch-ccs-cluster2", - ... -} -``` - -Both clusters run on `localhost`, so the important identifier is the port number. 
In this case, use port 9200 (`opensearch-ccs-node1`) as the remote cluster, and port 9250 (`opensearch-ccs-node2`) as the coordinating cluster. - -To get the IP address for the remote cluster, first identify its container ID: - -```bash -docker ps -CONTAINER ID IMAGE PORTS NAMES -6fe89ebc5a8e opensearchproject/opensearch:{{site.opensearch_version}} 0.0.0.0:9200->9200/tcp, 0.0.0.0:9600->9600/tcp, 9300/tcp opensearch-ccs-node1 -2da08b6c54d8 opensearchproject/opensearch:{{site.opensearch_version}} 9300/tcp, 0.0.0.0:9250->9200/tcp, 0.0.0.0:9700->9600/tcp opensearch-ccs-node2 -``` - -Then get that container's IP address: - -```bash -docker inspect --format='{% raw %}{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}{% endraw %}' 6fe89ebc5a8e -172.31.0.3 -``` - -On the coordinating cluster, add the remote cluster name and the IP address (with port 9300) for each "seed node." In this case, you only have one seed node: - -```json -curl -k -XPUT -H 'Content-Type: application/json' -u 'admin:admin' 'https://localhost:9250/_cluster/settings' -d ' -{ - "persistent": { - "cluster.remote": { - "opensearch-ccs-cluster1": { - "seeds": ["172.31.0.3:9300"] - } - } - } -}' -``` - -On the remote cluster, index a document: - -```bash -curl -XPUT -k -H 'Content-Type: application/json' -u 'admin:admin' 'https://localhost:9200/books/_doc/1' -d '{"Dracula": "Bram Stoker"}' -``` - -At this point, cross-cluster search works. You can test it using the `admin` user: - -```bash -curl -XGET -k -u 'admin:admin' 'https://localhost:9250/opensearch-ccs-cluster1:books/_search?pretty' -{ - ... - "hits": [{ - "_index": "opensearch-ccs-cluster1:books", - "_id": "1", - "_score": 1.0, - "_source": { - "Dracula": "Bram Stoker" - } - }] -} -``` - -To continue testing, create a new user on both clusters: - -```bash -curl -XPUT -k -u 'admin:admin' 'https://localhost:9200/_plugins/_security/api/internalusers/booksuser' -H 'Content-Type: application/json' -d '{"password":"password"}' -curl -XPUT -k -u 'admin:admin' 'https://localhost:9250/_plugins/_security/api/internalusers/booksuser' -H 'Content-Type: application/json' -d '{"password":"password"}' -``` - -Then run the same search as before with `booksuser`: - -```json -curl -XGET -k -u booksuser:password 'https://localhost:9250/opensearch-ccs-cluster1:books/_search?pretty' -{ - "error" : { - "root_cause" : [ - { - "type" : "security_exception", - "reason" : "no permissions for [indices:admin/shards/search_shards, indices:data/read/search] and User [name=booksuser, roles=[], requestedTenant=null]" - } - ], - "type" : "security_exception", - "reason" : "no permissions for [indices:admin/shards/search_shards, indices:data/read/search] and User [name=booksuser, roles=[], requestedTenant=null]" - }, - "status" : 403 -} -``` - -Note the permissions error. On the remote cluster, create a role with the appropriate permissions, and map `booksuser` to that role: - -```bash -curl -XPUT -k -u 'admin:admin' -H 'Content-Type: application/json' 'https://localhost:9200/_plugins/_security/api/roles/booksrole' -d '{"index_permissions":[{"index_patterns":["books"],"allowed_actions":["indices:admin/shards/search_shards","indices:data/read/search"]}]}' -curl -XPUT -k -u 'admin:admin' -H 'Content-Type: application/json' 'https://localhost:9200/_plugins/_security/api/rolesmapping/booksrole' -d '{"users" : ["booksuser"]}' -``` - -Both clusters must have the user, but only the remote cluster needs the role and mapping; in this case, the coordinating cluster handles authentication (i.e. 
"Does this request include valid user credentials?"), and the remote cluster handles authorization (i.e. "Can this user access this data?"). -{: .tip } - -Finally, repeat the search: - -```bash -curl -XGET -k -u booksuser:password 'https://localhost:9250/opensearch-ccs-cluster1:books/_search?pretty' -{ - ... - "hits": [{ - "_index": "opensearch-ccs-cluster1:books", - "_id": "1", - "_score": 1.0, - "_source": { - "Dracula": "Bram Stoker" - } - }] -} -``` diff --git a/_security/access-control/default-action-groups.md b/_security/access-control/default-action-groups.md index aeac294721..c50ed40a1b 100644 --- a/_security/access-control/default-action-groups.md +++ b/_security/access-control/default-action-groups.md @@ -15,42 +15,40 @@ This page catalogs all default action groups. Often, the most coherent way to cr ## General -Name | Description -:--- | :--- -unlimited | Grants complete access. Can be used on an cluster- or index-level. Equates to `"*"`. -{% comment %}kibana_all_read | asdf -kibana_all_write | asdf{% endcomment %} +| Action group | Description | Permissions | +| :--- | :--- | :--- | +| unlimited | Grants complete access to action groups. Can be used on an `cluster-` or `index-` level. Equates to "*". | `*` | ## Cluster-level -Name | Description -:---| :--- -cluster_all | Grants all cluster permissions. Equates to `cluster:*`. -cluster_monitor | Grants all cluster monitoring permissions. Equates to `cluster:monitor/*`. -cluster_composite_ops_ro | Grants read-only permissions to execute requests like `mget`, `msearch`, or `mtv`, plus permissions to query for aliases. -cluster_composite_ops | Same as `CLUSTER_COMPOSITE_OPS_RO`, but also grants `bulk` permissions and all aliases permissions. -manage_snapshots | Grants permissions to manage snapshots and repositories. -cluster_manage_pipelines | Grants permissions to manage ingest pipelines. -cluster_manage_index_templates | Grants permissions to manage index templates. +| Action group | Description | Permissions | +| :--- | :--- | :--- | +| cluster_all | Grants all cluster permissions. Equates to `cluster:*`. | `cluster:*` | +| cluster_monitor | Grants all cluster monitoring permissions. Equates to `cluster:monitor/*`. | `cluster:monitor/*` | +| cluster_composite_ops_ro | Grants read-only permissions to execute requests like `mget`, `msearch`, or `mtv`, as well as permissions to query for aliases. | `indices:data/read/mget` `indices:data/read/msearch` `indices:data/read/mtv` `indices:admin/aliases/exists*` `indices:admin/aliases/get*` `indices:data/read/scroll` `indices:admin/resolve/index` | +| cluster_composite_ops | Same as `CLUSTER_COMPOSITE_OPS_RO`, but also grants bulk permissions and all aliases permissions. | `indices:data/write/bulk` `indices:admin/aliases*` `indices:data/write/reindex` `indices:data/read/mget` `indices:data/read/msearch` `indices:data/read/mtv` `indices:admin/aliases/exists*` `indices:admin/aliases/get*` `indices:data/read/scroll` `indices:admin/resolve/index` | +| manage_snapshots | Grants permissions to manage snapshots and repositories. | `cluster:admin/snapshot/*` `cluster:admin/repository/*` | +| cluster_manage_pipelines | Grants permissions to manage ingest pipelines. | `cluster:admin/ingest/pipeline/*` | +| cluster_manage_index_templates | Grants permissions to manage index templates. | `indices:admin/template/*` `indices:admin/index_template/*` `cluster:admin/component_template/*` | ## Index-level -Name | Description -:--- | :--- -indices_all | Grants all permissions on the index. 
Equates to `indices:*`. -get | Grants permissions to use `get` and `mget` actions only. -read | Grants read permissions such as search, get field mappings, `get`, and `mget`. -write | Grants permissions to create and update documents within *existing indices*. To create new indexes, see `create_index`. -delete | Grants permissions to delete documents. -crud | Combines the `read`, `write`, and `delete` action groups. Included in the `data_access` action group. -search | Grants permissions to search documents. Includes `suggest`. -suggest | Grants permissions to use the suggest API. Included in the `read` action group. -create_index | Grants permissions to create indexes and mappings. -indices_monitor | Grants permissions to execute all index monitoring actions (e.g. recovery, segments info, index stats, and status). -index | A more limited version of the `write` action group. -data_access | Combines the `crud` action group with `indices:data/*`. -manage_aliases | Grants permissions to manage aliases. -manage | Grants all monitoring and administration permissions for indexes. +| Action group | Description | Permissions | +| :--- | :--- | :--- | +| indices_all | Grants all permissions on the index. Equates to `indices:*`. | `indices:*` | +| get | Grants permissions to use `get` and `mget` actions. | `indices:data/read/get*` `indices:data/read/mget*` | +| read | Grants read permissions on the index such as `search`, `get` field mappings, `get`, and `mget`. | `indices:data/read*` `indices:admin/mappings/fields/get*` `indices:admin/resolve/index` | +| write | Grants permissions to create and update documents within existing indexes. | `indices:data/write*` `indices:admin/mapping/put` | +| delete | Grants permissions to delete documents. | `indices:data/write/delete*` | +| crud | Combines the read, write, and delete action groups. Included in the `data_access` action group. | `indices:data/read*` `indices:admin/mappings/fields/get*` `indices:admin/resolve/index` `indices:data/write*` `indices:admin/mapping/put` | +| search | Grants permissions to search documents, including the Suggest API. | `indices:data/read/search*` `indices:data/read/msearch*` `indices:admin/resolve/index` `indices:data/read/suggest*` | +| suggest | Grants permissions to use the Suggest API. Included in the `read` action group. | `indices:data/read/suggest*` | +| create_index | Grants permissions to create indexes and mappings. | `indices:admin/create` `indices:admin/mapping/put` | +| indices_monitor | Grants permissions to run all index monitoring actions, such as `recovery`, `segments_info`, `index_stats`, and `status`). | `indices:monitor/*` | +| index | A more limited version of the write action group. | `indices:data/write/index*` `indices:data/write/update*` `indices:admin/mapping/put` `indices:data/write/bulk*` | +| data_access | Combines the CRUD action group with `indices:data/*`. | `indices:data/*` `indices:data/read*` `indices:admin/mappings/fields/get*` `indices:admin/resolve/index` `indices:data/write*` `indices:admin/mapping/put` | +| manage_aliases | Grants permissions to manage aliases. | `indices:admin/aliases*` | +| manage | Grants all monitoring and administration permissions for indexes. 
| `indices:monitor/*` `indices:admin/*` | diff --git a/_security/access-control/document-level-security.md b/_security/access-control/document-level-security.md index d1c275119b..08de85bbf7 100644 --- a/_security/access-control/document-level-security.md +++ b/_security/access-control/document-level-security.md @@ -4,20 +4,19 @@ title: Document-level security parent: Access control nav_order: 85 redirect_from: - - /security/access-control/document-level-security/ - - /security-plugin/access-control/document-level-security/ +- /security/access-control/document-level-security/ +- /security-plugin/access-control/document-level-security/ --- -# Document-level security (DLS) - -Document-level security lets you restrict a role to a subset of documents in an index. The easiest way to get started with document- and field-level security is to open OpenSearch Dashboards and choose **Security**. Then choose **Roles**, create a new role, and review the **Index permissions** section. +# Document-level security +Document-level security lets you restrict a role to a subset of documents in an index. The easiest way to get started with document- and field-level security is to open OpenSearch Dashboards and choose **Security**. Then choose **Roles**, create a new role, and review the **Index Permissions** section, shown in the following image. ![Document- and field-level security screen in OpenSearch Dashboards]({{site.url}}{{site.baseurl}}/images/security-dls.png) ## Simple roles -Document-level security uses the OpenSearch query DSL to define which documents a role grants access to. In OpenSearch Dashboards, choose an index pattern and provide a query in the **Document level security** section: +Document-level security uses OpenSearch query domain-specific language (DSL) to define which documents a role grants access to. In OpenSearch Dashboards, choose an index pattern and provide a query in the **Document-level security** section: ```json { @@ -33,7 +32,9 @@ Document-level security uses the OpenSearch query DSL to define which documents This query specifies that for the role to have access to a document, its `genres` field must include `Comedy`. -A typical request to the `_search` API includes `{ "query": { ... } }` around the query, but in this case, you only need to specify the query itself. +A typical request sent to the `_search` API includes `{ "query": { ... } }` around the query, but in this case, you only need to specify the query itself. + +## Updating roles by accessing the REST API In the REST API, you provide the query as a string, so you must escape your quotes. This role allows a user to read any document in any index with the field `public` set to `true`: @@ -185,3 +186,97 @@ plugins.security.dls.mode: filter-level Lucene-level DLS | `lucene-level` | This setting makes all DLS queries apply to the Lucene level. | Lucene-level DLS modifies Lucene queries and data structures directly. This is the most efficient mode but does not allow certain advanced constructs in DLS queries, including TLQs. Filter-level DLS | `filter-level` | This setting makes all DLS queries apply to the filter level. | In this mode, OpenSearch applies DLS by modifying queries that OpenSearch receives. This allows for term-level lookup queries in DLS queries, but you can only use the `get`, `search`, `mget`, and `msearch` operations to retrieve data from the protected index. Additionally, cross-cluster searches are limited with this mode. 
Adaptive | `adaptive-level` | The default setting that allows OpenSearch to automatically choose the mode. | DLS queries without TLQs are executed in Lucene-level mode, while DLS queries that contain TLQs are executed in filter-level mode. + +## DLS and multiple roles + +OpenSearch combines all DLS queries with the logical `OR` operator. However, when a role that uses DLS is combined with another security role that doesn't use DLS, the query results are filtered to display only documents matching the DLS from the first role. This filter rule also applies to roles that do not grant read access to documents. + +### When to enable `plugins.security.dfm_empty_overrides_all` + +Whether to enable the `plugins.security.dfm_empty_overrides_all` setting depends on whether you want a role without DLS to grant unrestricted access to documents, overriding the DLS restrictions of a user's other roles. + + +To ensure access is not restricted, you can set the following configuration in `opensearch.yml`: + +``` +plugins.security.dfm_empty_overrides_all: true +``` +{% include copy.html %} + + +The following examples show the level of access granted when roles with and without DLS interact. These examples can help you decide when to enable the `plugins.security.dfm_empty_overrides_all` setting. + +#### Example: Document access + +This example demonstrates that enabling `plugins.security.dfm_empty_overrides_all` is beneficial in scenarios where you need specific users to have unrestricted access to documents despite being part of a broader group with restricted access. + +**Role A with DLS**: This role is granted to a broad group of users and includes DLS to restrict access to specific documents, as shown in the following permission set: + +``` +{ + "index_permissions": [ + { + "index_patterns": ["example-index"], + "dls": "[.. some DLS here ..]", + "allowed_actions": ["indices:data/read/search"] + } + ] +} +``` + +**Role B without DLS:** This role is specifically granted to certain users, such as administrators, and does not include DLS, as shown in the following permission set: + +``` +{ + "index_permissions" : [ + { + "index_patterns" : ["*"], + "allowed_actions" : ["indices:data/read/search"] + } + ] +} +``` +{% include copy.html %} + +Setting `plugins.security.dfm_empty_overrides_all` to `true` ensures that administrators assigned Role B can override any DLS restrictions imposed by Role A. This allows specific Role B users to access all documents, regardless of the restrictions applied by Role A's DLS. + +#### Example: Search template access + +In this example, two roles are defined: one with DLS and another, without DLS, that grants access only to search templates: + +**Role A with DLS:** + +``` +{ + "index_permissions": [ + { + "index_patterns": [ + "example-index" + ], + "dls": "[.. some DLS here ..]", + "allowed_actions": [ + "indices:data/read/search" + ] + } + ] +} +``` +{% include copy.html %} + +**Role B, without DLS**, which only grants access to search templates: + +``` +{ + "index_permissions" : [ + { + "index_patterns" : [ "*" ], + "allowed_actions" : [ "indices:data/read/search/template" ] + } + ] +} +``` +{% include copy.html %} + +When a user has both Role A and Role B permissions, the query results are filtered based on Role A's DLS, even though Role B doesn't use DLS. The DLS settings are retained, and the returned access is appropriately restricted.
+ +When a user is assigned both Role A and Role B and the `plugins.security.dfm_empty_overrides_all` setting is enabled, Role B's permissions will override Role A's restrictions, allowing that user to access all documents. This ensures that the role without DLS takes precedence in the search query response. diff --git a/_security/access-control/field-masking.md index 0fc4213806..c672e75d04 100644 --- a/_security/access-control/field-masking.md +++ b/_security/access-control/field-masking.md @@ -58,14 +58,14 @@ You configure field masking using OpenSearch Dashboards, `roles.yml`, or the RES ```yml someonerole: - cluster: [] - indices: - movies: - _masked_fields_: - - "title" - - "genres" - '*': - - "READ" + index_permissions: + - index_patterns: + - 'movies' + allowed_actions: + - read + masked_fields: + - "title" + - "genres" ``` @@ -82,14 +82,14 @@ To specify a different algorithm, add it after the masked field: ```yml someonerole: - cluster: [] - indices: - movies: - _masked_fields_: - - "title::SHA-512" - - "genres" - '*': - - "READ" + index_permissions: + - index_patterns: + - 'movies' + allowed_actions: + - read + masked_fields: + - "title::SHA-512" + - "genres" ``` @@ -103,19 +103,19 @@ hr_employee: - index_patterns: - 'humanresources' allowed_actions: - - ... + - read masked_fields: - 'lastname::/.*/::*' - '*ip_source::/[0-9]{1,3}$/::XXX::/^[0-9]{1,3}/::***' someonerole: - cluster: [] - indices: - movies: - _masked_fields_: - - "title::/./::*" - - "genres::/^[a-zA-Z]{1,3}/::XXX::/[a-zA-Z]{1,3}$/::YYY" - '*': - - "READ" + index_permissions: + - index_patterns: + - 'movies' + allowed_actions: + - read + masked_fields: + - "title::/./::*" + - "genres::/^[a-zA-Z]{1,3}/::XXX::/[a-zA-Z]{1,3}$/::YYY" ``` diff --git a/_security/access-control/impersonation.md index 0033e46507..4bf7ab689d 100644 --- a/_security/access-control/impersonation.md +++ b/_security/access-control/impersonation.md @@ -47,5 +47,5 @@ plugins.security.authcz.impersonation_dn: To impersonate another user, submit a request to the system with the HTTP header `opendistro_security_impersonate_as` set to the name of the user to be impersonated. A good test is to make a GET request to the `_plugins/_security/authinfo` URI: ```bash -curl -XGET -u 'admin:admin' -k -H "opendistro_security_impersonate_as: user_1" https://localhost:9200/_plugins/_security/authinfo?pretty +curl -XGET -u 'admin:' -k -H "opendistro_security_impersonate_as: user_1" https://localhost:9200/_plugins/_security/authinfo?pretty ``` diff --git a/_security/access-control/permissions.md index 60939612fd..0b2d609c35 100644 --- a/_security/access-control/permissions.md +++ b/_security/access-control/permissions.md @@ -124,7 +124,7 @@ green open .kibana_3 XmTePICFRoSNf5O5uLgwRw 1 1 220 0 468.3kb 232.1kb ### Enabling system index permissions -Users that have the permission [`restapi:admin/roles`]({{site.url}}{{site.baseurl}}/security/access-control/api/#access-control-for-the-api) are able to map system index permissions to all users in the same way they would for a cluster or index permission in the `roles.yml` file. However, to preserve some control over this permission, the `plugins.security.system_indices.permissions.enabled` setting allows you to enable or disable the system index permissions feature. This setting is disabled by default.
To enable the system index permissions feature, set `plugins.security.system_indices.permissions.enabled` to `true`. For more information about this setting, see [Enabling user access to system indexes]({{site.url}}{{site.baseurl}}/security/configuration/yaml/#enabling-user-access-to-system-indexes). +Users that have the permission [`restapi:admin/roles`]({{site.url}}{{site.baseurl}}/security/access-control/api/#access-control-for-the-api) are able to map system index permissions to all users in the same way they would for a cluster or index permission in the `roles.yml` file. However, to preserve some control over this permission, the `plugins.security.system_indices.permission.enabled` setting allows you to enable or disable the system index permissions feature. This setting is disabled by default. To enable the system index permissions feature, set `plugins.security.system_indices.permission.enabled` to `true`. For more information about this setting, see [Enabling user access to system indexes]({{site.url}}{{site.baseurl}}/security/configuration/yaml/#enabling-user-access-to-system-indexes). Keep in mind that enabling this feature and mapping system index permissions to normal users gives those users access to indexes that may contain sensitive information and configurations essential to a cluster's health. We also recommend caution when mapping users to `restapi:admin/roles` because this permission gives a user not only the ability to assign the system index permission to another user but also the ability to self-assign access to any system index. {: .warning } @@ -380,80 +380,86 @@ See [Index templates]({{site.url}}{{site.baseurl}}/im-plugin/index-templates/). These permissions apply to an index or index pattern. You might want a user to have read access to all indexes (that is, `*`), but write access to only a few (for example, `web-logs` and `product-catalog`).
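To make this pattern concrete, a role that grants read access everywhere but write access to only those two indexes might look similar to the following `roles.yml` sketch (the role name and index names are illustrative, not taken from this documentation):

```yml
# Illustrative sketch only: read access to all indexes, write access to two specific indexes.
logs_catalog_writer:
  index_permissions:
  - index_patterns:
    - '*'
    allowed_actions:
    - read
  - index_patterns:
    - 'web-logs'
    - 'product-catalog'
    allowed_actions:
    - write
```
{% include copy.html %}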
-- indices:admin/aliases -- indices:admin/aliases/get -- indices:admin/analyze -- indices:admin/cache/clear -- indices:admin/close -- indices:admin/close* -- indices:admin/create (create indexes) -- indices:admin/data_stream/create -- indices:admin/data_stream/delete -- indices:admin/data_stream/get -- indices:admin/delete (delete indexes) -- indices:admin/exists -- indices:admin/flush -- indices:admin/flush* -- indices:admin/forcemerge -- indices:admin/get (retrieve index and mapping) -- indices:admin/mapping/put -- indices:admin/mappings/fields/get -- indices:admin/mappings/fields/get* -- indices:admin/mappings/get -- indices:admin/open -- indices:admin/plugins/replication/index/setup/validate -- indices:admin/plugins/replication/index/start -- indices:admin/plugins/replication/index/pause -- indices:admin/plugins/replication/index/resume -- indices:admin/plugins/replication/index/stop -- indices:admin/plugins/replication/index/update -- indices:admin/plugins/replication/index/status_check -- indices:admin/refresh -- indices:admin/refresh* -- indices:admin/resolve/index -- indices:admin/rollover -- indices:admin/seq_no/global_checkpoint_sync -- indices:admin/settings/update -- indices:admin/shards/search_shards -- indices:admin/template/delete -- indices:admin/template/get -- indices:admin/template/put -- indices:admin/upgrade -- indices:admin/validate/query -- indices:data/read/explain -- indices:data/read/field_caps -- indices:data/read/field_caps* -- indices:data/read/get -- indices:data/read/mget -- indices:data/read/mget* -- indices:data/read/msearch -- indices:data/read/msearch/template -- indices:data/read/mtv (multi-term vectors) -- indices:data/read/mtv* -- indices:data/read/plugins/replication/file_chunk -- indices:data/read/plugins/replication/changes -- indices:data/read/scroll -- indices:data/read/scroll/clear -- indices:data/read/search -- indices:data/read/search* -- indices:data/read/search/template -- indices:data/read/tv (term vectors) -- indices:data/write/bulk -- indices:data/write/bulk* -- indices:data/write/delete (delete documents) -- indices:data/write/delete/byquery -- indices:data/write/plugins/replication/changes -- indices:data/write/index (add documents to existing indexes) -- indices:data/write/reindex -- indices:data/write/update -- indices:data/write/update/byquery -- indices:monitor/data_stream/stats -- indices:monitor/recovery -- indices:monitor/segments -- indices:monitor/settings/get -- indices:monitor/shard_stores -- indices:monitor/stats -- indices:monitor/upgrade + + +| **Permission** | **Description** | +| :--- | :--- | +| `indices:admin/aliases` | Permissions for [index aliases]({{site.url}}{{site.baseurl}}/im-plugin/index-alias/). | +| `indices:admin/aliases/get` | Permission to get [index aliases]({{site.url}}{{site.baseurl}}/im-plugin/index-alias/). | +| `indices:admin/analyze` | Permission to use the [Analyze API]({{site.url}}{{site.baseurl}}/api-reference/analyze-apis/). | +| `indices:admin/cache/clear` | Permission to [clear cache]({{site.url}}{{site.baseurl}}/api-reference/index-apis/clear-index-cache/). | +| `indices:admin/close` | Permission to [close an index]({{site.url}}{{site.baseurl}}/api-reference/index-apis/close-index/). | +| `indices:admin/close*` | Permission to [close an index]({{site.url}}{{site.baseurl}}/api-reference/index-apis/close-index/). | +| `indices:admin/create` | Permission to [create indexes]({{site.url}}{{site.baseurl}}/api-reference/index-apis/create-index/). 
| +| `indices:admin/data_stream/create` | Permission to create [data streams]({{site.url}}{{site.baseurl}}/dashboards/im-dashboards/datastream/#creating-a-data-stream). | +| `indices:admin/data_stream/delete` | Permission to [delete data streams]({{site.url}}{{site.baseurl}}/dashboards/im-dashboards/datastream/#deleting-a-data-stream). | +| `indices:admin/data_stream/get` | Permission to [get data streams]({{site.url}}{{site.baseurl}}/dashboards/im-dashboards/datastream/#viewing-a-data-stream). | +| `indices:admin/delete` | Permission to [delete indexes]({{site.url}}{{site.baseurl}}/api-reference/index-apis/delete-index/). | +| `indices:admin/exists` | Permission to use [exists query]({{site.url}}{{site.baseurl}}/query-dsl/term/exists/). | +| `indices:admin/flush` | Permission to [flush an index]({{site.url}}{{site.baseurl}}/dashboards/im-dashboards/index-management/#flushing-an-index). | +| `indices:admin/flush*` | Permission to [flush an index]({{site.url}}{{site.baseurl}}/dashboards/im-dashboards/index-management/#flushing-an-index). | +| `indices:admin/forcemerge` | Permission to force merge indexes and data streams. | +| `indices:admin/get` | Permission to get index and mapping. | +| `indices:admin/mapping/put` | Permission to add new mappings and fields to an index. | +| `indices:admin/mappings/fields/get` | Permission to get mappings fields. | +| `indices:admin/mappings/fields/get*` | Permission to get mappings fields. | +| `indices:admin/mappings/get` | Permission to [get mappings]({{site.url}}{{site.baseurl}}/security-analytics/api-tools/mappings-api/#get-mappings). | +| `indices:admin/open` | Permission to [open an index]({{site.url}}{{site.baseurl}}/api-reference/index-apis/open-index/). | +| `indices:admin/plugins/replication/index/setup/validate` | Permission to validate a connection to a [remote cluster]({{site.url}}{{site.baseurl}}/tuning-your-cluster/replication-plugin/getting-started/#set-up-a-cross-cluster-connection). | +| `indices:admin/plugins/replication/index/start` | Permission to [start cross-cluster replication]({{site.url}}{{site.baseurl}}/tuning-your-cluster/replication-plugin/getting-started/#start-replication). | +| `indices:admin/plugins/replication/index/pause` | Permission to pause cross-cluster replication. | +| `indices:admin/plugins/replication/index/resume` | Permission to resume cross-cluster replication. | +| `indices:admin/plugins/replication/index/stop` | Permission to stop cross-cluster replication. | +| `indices:admin/plugins/replication/index/update` | Permission to update cross-cluster replication settings. | +| `indices:admin/plugins/replication/index/status_check` | Permission to check the status of cross-cluster replication. | +| `indices:admin/refresh` | Permission to use the [index refresh API]({{site.url}}{{site.baseurl}}/dashboards/im-dashboards/index-management/#refreshing-an-index). | +| `indices:admin/refresh*` | Permission to use the index refresh API. | +| `indices:admin/resolve/index` | Permission to resolve index names, index aliases and data streams. | +| `indices:admin/rollover` | Permission to perform [index rollover]({{site.url}}{{site.baseurl}}/dashboards/im-dashboards/rollover/). | +| `indices:admin/seq_no/global_checkpoint_sync` | Permission to perform a global checkpoint sync. | +| `indices:admin/settings/update` | Permission to [update index settings]({{site.url}}{{site.baseurl}}/api-reference/index-apis/update-settings/). 
| +| `indices:admin/shards/search_shards` | Permission to perform [cross cluster search]({{site.url}}{{site.baseurl}}/security/access-control/cross-cluster-search/). | +| `indices:admin/template/delete` | Permission to [delete index templates]({{site.url}}{{site.baseurl}}/im-plugin/index-templates/#delete-a-template). | +| `indices:admin/template/get` | Permission to [get index templates]({{site.url}}{{site.baseurl}}/im-plugin/index-templates/#retrieve-a-template). | +| `indices:admin/template/put` | Permission to [create index templates]({{site.url}}{{site.baseurl}}/im-plugin/index-templates/#create-a-template). | +| `indices:admin/upgrade` | Permission for administrators to perform upgrades. | +| `indices:admin/validate/query` | Permission to validate a specific query. | +| `indices:data/read/explain` | Permission to run the [Explain API]({{site.url}}{{site.baseurl}}/api-reference/explain/). | +| `indices:data/read/field_caps` | Permission to run the [Field Capabilities API]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/alias/#using-aliases-in-field-capabilities-api-operations). | +| `indices:data/read/field_caps*` | Permission to run the Field Capabilities API. | +| `indices:data/read/get` | Permission to read index data. | +| `indices:data/read/mget` | Permission to run [multiple GET operations]({{site.url}}{{site.baseurl}}/api-reference/document-apis/multi-get/) in one request. | +| `indices:data/read/mget*` | Permission to run multiple GET operations in one request. | +| `indices:data/read/msearch` | Permission to run [multiple search]({{site.url}}{{site.baseurl}}/api-reference/multi-search/) requests into a single request. | +| `indices:data/read/msearch/template` | Permission to bundle [multiple search templates]({{site.url}}{{site.baseurl}}/api-reference/search-template/#multiple-search-templates) and send them to your OpenSearch cluster in a single request. | +| `indices:data/read/mtv` | Permission to retrieve multiple term vectors with a single request. | +| `indices:data/read/mtv*` | Permission to retrieve multiple term vectors with a single request. | +| `indices:data/read/plugins/replication/file_chunk` | Permission to check files during segment replication. | +| `indices:data/read/plugins/replication/changes` | Permission to make changes to segment replication settings. | +| `indices:data/read/scroll` | Permission to scroll data. | +| `indices:data/read/scroll/clear` | Permission to clear read scroll data. | +| `indices:data/read/search` | Permission to [search]({{site.url}}{{site.baseurl}}/api-reference/search/) data. | +| `indices:data/read/search*` | Permission to search data. | +| `indices:data/read/search/template` | Permission to read a search template. | +| `indices:data/read/tv` | Permission to retrieve information and statistics for terms in the fields of a particular document. | +| `indices:data/write/bulk` | Permission to run a [bulk]({{site.url}}{{site.baseurl}}/api-reference/document-apis/bulk/) request. | +| `indices:data/write/bulk*` | Permission to run a bulk request. | +| `indices:data/write/delete` | Permission to [delete documents]({{site.url}}{{site.baseurl}}/api-reference/document-apis/delete-document/). | +| `indices:data/write/delete/byquery` | Permission to delete all documents that [match a query]({{site.url}}{{site.baseurl}}/api-reference/document-apis/delete-by-query/). | +| `indices:data/write/plugins/replication/changes` | Permission to make changes to data replication configurations and settings within indices. 
| +| `indices:data/write/index` | Permission to add documents to existing indexes. See also [Index document]({{site.url}}{{site.baseurl}}/api-reference/document-apis/index-document/). | +| `indices:data/write/reindex` | Permission to run a [reindex]({{site.url}}{{site.baseurl}}/im-plugin/reindex-data/) operation. | +| `indices:data/write/update` | Permission to update an index. | +| `indices:data/write/update/byquery` | Permission to update all documents that [match the query]({{site.url}}{{site.baseurl}}/api-reference/document-apis/update-by-query/). | +| `indices:monitor/data_stream/stats` | Permission to get data stream statistics. | +| `indices:monitor/recovery` | Permission to access recovery stats. | +| `indices:monitor/segments` | Permission to access segment stats. | +| `indices:monitor/settings/get` | Permission to get monitor settings. | +| `indices:monitor/shard_stores` | Permission to access shard store stats. | +| `indices:monitor/stats` | Permission to access monitoring stats. | +| `indices:monitor/upgrade` | Permission to access upgrade stats. | + + ## Security REST permissions diff --git a/_security/access-control/users-roles.md index 3b728029f8..0f84fc36fe 100644 --- a/_security/access-control/users-roles.md +++ b/_security/access-control/users-roles.md @@ -14,18 +14,39 @@ The Security plugin includes an internal user database. Use this database in pla Roles are the core way of controlling access to your cluster. Roles contain any combination of cluster-wide permissions, index-specific permissions, document- and field-level security, and tenants. Then you map users to these roles so that users gain those permissions. -Unless you need to create new [reserved or hidden users]({{site.url}}{{site.baseurl}}/security/access-control/api/#reserved-and-hidden-resources), we **highly** recommend using OpenSearch Dashboards or the REST API to create new users, roles, and role mappings. The `.yml` files are for initial setup, not ongoing use. -{: .warning } - --- -#### Table of contents -1. TOC +
+ + Table of contents + + {: .text-delta } +- TOC {:toc} - +
--- +## Creating and editing OpenSearch roles + +You can update OpenSearch by using one of the following methods. + +### Using the API + +You can send HTTP requests to OpenSearch-provided endpoints to update security roles, permissions, and associated settings. This method offers granular control and automation capabilities for managing roles. + +### Using the UI (OpenSearch Dashboards) + +OpenSearch Dashboards provides a user-friendly interface for managing roles. Roles, permissions, and document-level security settings are configured in the Security section within OpenSearch Dashboards. When updating roles through the UI, OpenSearch Dashboards calls the API in the background to implement the changes. + +### Editing the `roles.yml` file + +If you want more granular control of your security configuration, you can edit roles and their associated permissions in the `roles.yml` file. This method provides direct access to the underlying configuration and can be version controlled for use in collaborative development environments. +For more information about creating roles, see the [Create roles](https://opensearch.org/docs/latest/security/access-control/users-roles/#create-roles) documentation. + +Unless you need to create new [reserved or hidden users]({{site.url}}{{site.baseurl}}/security/access-control/api/#reserved-and-hidden-resources), we **highly** recommend using OpenSearch Dashboards or the REST API to create new users, roles, and role mappings. The `.yml` files are for initial setup, not ongoing use. +{: .warning } + ## Create users You can create users using OpenSearch Dashboards, `internal_users.yml`, or the REST API. When creating a user, you can map users to roles using `internal_users.yml` or the REST API, but that feature is not currently available in OpenSearch Dashboards. @@ -75,6 +96,24 @@ See [YAML files]({{site.url}}{{site.baseurl}}/security/configuration/yaml/#roles See [Create role]({{site.url}}{{site.baseurl}}/security/access-control/api/#create-role). +## Edit roles + +You can edit roles using one of the following methods. + +### OpenSearch Dashboards + +1. Choose **Security** > **Roles**. In the **Create role** section, select **Explore existing roles**. +1. Select the role you want to edit. +1. Choose **edit role**. Make any necessary updates to the role. +1. To save your changes, select **Update**. + +### roles.yml + +See [YAML files]({{site.url}}{{site.baseurl}}/security/configuration/yaml/#rolesyml). + +### REST API + +See [Create role]({{site.url}}{{site.baseurl}}/security/access-control/api/#create-role). ## Map users to roles @@ -107,34 +146,33 @@ The Security plugin includes several predefined roles that serve as useful defau | **Role** | **Description** | | :--- | :--- | | `alerting_ack_alerts` | Grants permissions to view and acknowledge alerts, but not to modify destinations or monitors. | -| `alerting_full_access` | Grants full permissions to all alerting actions. | +| `alerting_full_access` | Grants full permissions to perform all alerting actions. | | `alerting_read_access` | Grants permissions to view alerts, destinations, and monitors, but not to acknowledge alerts or modify destinations or monitors. | -| `anomaly_full_access` | Grants full permissions to all anomaly detection actions. | +| `all_access` | Grants full access to the cluster, including all cluster-wide operations, permissions to write to all cluster indexes, and permissions to write to all tenants. 
For more information about access using the REST API, see [Access control for the API]({{site.url}}{{site.baseurl}}/security/access-control/api/#access-control-for-the-api). | +| `anomaly_full_access` | Grants full permissions to perform all anomaly detection actions. | | `anomaly_read_access` | Grants permissions to view detectors, but not to create, modify, or delete detectors. | -| `all_access` | Grants full access to the cluster, including all cluster-wide operations, permission to write to all cluster indexes, and permission to write to all tenants. For more information on access using the REST API, see [Access control for the API]({{site.url}}{{site.baseurl}}/security/access-control/api/#access-control-for-the-api). | +| `asynchronous_search_full_access` | Grants full permissions to perform all asynchronous search actions. | +| `asynchronous_search_read_access` | Grants permissions to view asynchronous searches but not to submit, modify, or delete them. | | `cross_cluster_replication_follower_full_access` | Grants full access to perform cross-cluster replication actions on the follower cluster. | -| `cross_cluster_replication_leader_full_access` | Grants full access to perform cross-cluster replication actions on the leader cluster. | -| `observability_full_access` | Grants full access to perform actions on Observability objects such as visualizations, notebooks, and operational panels. | -| `observability_read_access` | Grants permission to view Observability objects such as visualizations, notebooks, and operational panels, but not to create, modify, or delete them. | -| `kibana_read_only` | A special role that prevents users from making changes to visualizations, dashboards, and other OpenSearch Dashboards objects. To enable read-only mode in Dashboards, add the `opensearch_security.readonly_mode.roles` setting to the `opensearch_dashboards.yml` file and include the role as a setting value. See the [example configuration]({{site.url}}{{site.baseurl}}/dashboards/branding/#sample-configuration) in Dashboards documentation. | -| `kibana_user` | Grants permissions to use OpenSearch Dashboards: cluster-wide searches, index monitoring, and write to various OpenSearch Dashboards indexes. | -| `logstash` | Grants permissions for Logstash to interact with the cluster: cluster-wide searches, cluster monitoring, and write to the various Logstash indexes. | -| `manage_snapshots` | Grants permissions to manage snapshot repositories, take snapshots, and restore snapshots. | +| `cross_cluster_replication_leader_full_access` | Grants full access to perform cross-cluster replication actions on the leader cluster. | +| `index_management_full_access` | Grants full permissions to perform all index management actions, including Index State Management (ISM), transforms, and rollups. | +| `index_management_read_access` | Grants read-only permissions to view index management resources, such as ISM policies, transforms, and rollups, but not to create, modify, or delete them. | +| `ml_full_access` | Grants full permissions to use all machine learning (ML) features, including starting new ML tasks and reading or deleting models. | +| `ml_read_access` | Grants permissions to view ML features and results but not to modify them. | +| `notifications_full_access` | Grants full permissions to perform all notification actions. | +| `notifications_read_access` | Grants permissions to view notifications and their configurations but not to modify them. | +| `opensearch_dashboards_read_only` | Grants read-only access to OpenSearch Dashboards.
| +| `opensearch_dashboards_user` | Grants basic user access to OpenSearch Dashboards. | +| `point_in_time_full_access` | Grants full permissions to perform all Point in Time operations. | | `readall` | Grants permissions for cluster-wide searches like `msearch` and search permissions for all indexes. | -| `readall_and_monitor` | Same as `readall` but with added cluster permissions for monitoring. | -| `security_rest_api_access` | A special role that allows access to the REST API. See `plugins.security.restapi.roles_enabled` in `opensearch.yml` and [Access control for the API]({{site.url}}{{site.baseurl}}/security/access-control/api/#access-control-for-the-api). | -| `reports_read_access` | Grants permissions to generate on-demand reports, download existing reports, and view report definitions but not to create report definitions. | | `reports_instances_read_access` | Grants permissions to generate on-demand reports and download existing reports but not to view or create report definitions. | -| `reports_full_access` | Grants full permissions to reports. | -| `asynchronous_search_full_access` | Grants full permissions to all asynchronous search actions. | -| `asynchronous_search_read_access` | Grants permissions to view asynchronous searches but not to submit, modify, or delete them. | -| `index_management_full_access` | Grants full permissions to all index management actions, including Index State Management (ISM), transforms, and rollups. | -| `snapshot_management_full_access` | Grants full permissions to all snapshot management actions. | -| `snapshot_management_read_access` | Grants permissions to view policies but not to create, modify, start, stop, or delete them. | -| `point_in_time_full_access` | Grants full permissions to all Point in Time operations. | -| `security_analytics_full_access` | Grants full permissions to all Security Analytics functionality. | -| `security_analytics_read_access` | Grants permissions to view the various components in Security Analytics, such as detectors, alerts, and findings. It also includes permissions that allow users to search for detectors and rules. This role does not allow a user to perform actions such as modifying or deleting a detector. | | `security_analytics_ack_alerts` | Grants permissions to view and acknowledge alerts. | +| `security_analytics_full_access` | Grants full permissions to use all Security Analytics functionality. | +| `security_analytics_read_access` | Grants permissions to view Security Analytics components, such as detectors, alerts, and findings. Also includes permissions that allow users to search for detectors and rules. This role does not allow a user to perform actions such as modifying or deleting a detector. | +| `security_manager` | Grants permissions to manage security-related features and configurations. | +| `snapshot_management_full_access` | Grants full permissions to perform all snapshot management actions. | +| `snapshot_management_read_access` | Grants permissions to view snapshot management actions and configurations but not to modify them. | + For more detailed summaries of the permissions for each role, reference their action groups against the descriptions in [Default action groups]({{site.url}}{{site.baseurl}}/security/access-control/default-action-groups/). diff --git a/_security/audit-logs/index.md b/_security/audit-logs/index.md index 1b3879be65..becb001ec0 100644 --- a/_security/audit-logs/index.md +++ b/_security/audit-logs/index.md @@ -13,7 +13,7 @@ redirect_from: --- -
+
Table of contents @@ -26,7 +26,7 @@ redirect_from: Audit logs let you track access to your OpenSearch cluster and are useful for compliance purposes or in the aftermath of a security breach. You can configure the categories to be logged, the detail level of the logged messages, and where to store the logs. -To enable audit logging: +Audit logging is disabled by default. To enable audit logging: 1. Add the following line to `opensearch.yml` on each node: @@ -220,3 +220,7 @@ The default setting is `10`. Setting this value to `0` disables the thread pool, plugins.security.audit.config.threadpool.max_queue_len: 100000 ``` +## Disabling audit logs + +To disable audit logs after they've been enabled, remove the `plugins.security.audit.type: internal_opensearch` setting from `opensearch.yml`, or switch off the **Enable audit logging** check box in OpenSearch Dashboards. + diff --git a/_security/authentication-backends/basic-authc.md b/_security/authentication-backends/basic-authc.md index 5e5d12597c..46a498d0ab 100644 --- a/_security/authentication-backends/basic-authc.md +++ b/_security/authentication-backends/basic-authc.md @@ -12,17 +12,23 @@ redirect_from: HTTP basic authentication provides a simple challenge-and-response process for gaining access to OpenSearch and its resources that prompts you to sign in with a username and password. You enable HTTP basic authentication in the `http_authenticator` section of the configuration by specifying `type` as `basic`, as shown in the following example: ```yml -authc: - basic_internal_auth_domain: - description: "Authenticate using HTTP basic against the internal users database" - http_enabled: true - transport_enabled: true - order: 1 - http_authenticator: - type: basic - challenge: true - authentication_backend: - type: internal +_meta: + type: "config" + config_version: 2 + +config: + dynamic: + authc: + basic_internal_auth_domain: + description: "Authenticate using HTTP basic against the internal users database" + http_enabled: true + transport_enabled: true + order: 1 + http_authenticator: + type: basic + challenge: true + authentication_backend: + type: internal ``` Additionally, you can specify the internal user database as the authentication backend by specifying `internal` as the type for `authentication_backend`. See [The internal user database](#the-internal-user-database) for information about this backend. diff --git a/_security/authentication-backends/jwt.md b/_security/authentication-backends/jwt.md index 846004d45c..afcd4c78ee 100644 --- a/_security/authentication-backends/jwt.md +++ b/_security/authentication-backends/jwt.md @@ -106,6 +106,8 @@ jwt_auth_domain: jwt_url_parameter: null subject_key: null roles_key: null + required_audience: null + required_issuer: null jwt_clock_skew_tolerance_seconds: 20 authentication_backend: type: noop @@ -120,6 +122,8 @@ Name | Description `jwt_url_parameter` | If the token is not transmitted in the HTTP header but rather as an URL parameter, define the name of the parameter here. `subject_key` | The key in the JSON payload that stores the username. If not set, the [subject](https://tools.ietf.org/html/rfc7519#section-4.1.2) registered claim is used. `roles_key` | The key in the JSON payload that stores the user's roles. The value of this key must be a comma-separated list of roles. +`required_audience` | The name of the audience which the JWT must specify. This corresponds [`aud` claim of the JWT](https://datatracker.ietf.org/doc/html/rfc7519#section-4.1.3). 
+`required_issuer` | The target issuer of JWT stored in the JSON payload. This corresponds to the [`iss` claim of the JWT](https://datatracker.ietf.org/doc/html/rfc7519#section-4.1.1). `jwt_clock_skew_tolerance_seconds` | Sets a window of time, in seconds, to compensate for any disparity between the JWT authentication server and OpenSearch node clock times, thereby preventing authentication failures due to the misalignment. Security sets 30 seconds as the default. Use this setting to apply a custom value. Because JWTs are self-contained and the user is authenticated at the HTTP level, no additional `authentication_backend` is needed. Set this value to `noop`. diff --git a/_security/authentication-backends/ldap.md b/_security/authentication-backends/ldap.md index 2465288fd0..c6caec4524 100755 --- a/_security/authentication-backends/ldap.md +++ b/_security/authentication-backends/ldap.md @@ -155,7 +155,7 @@ By default, the Security plugin validates the TLS certificate of the LDAP server ``` plugins.security.ssl.transport.pemtrustedcas_filepath: ... -plugins.security.ssl.http.truststore_filepath: ... +plugins.security.ssl.transport.truststore_filepath: ... ``` If your server uses a certificate signed by a different CA, import this CA into your truststore or add it to your trusted CA file on each node. @@ -509,6 +509,7 @@ Name | Description `resolve_nested_roles` | Boolean. Whether or not to resolve nested roles. Default is `false`. `max_nested_depth` | Integer. When `resolve_nested_roles` is `true`, this defines the maximum number of nested roles to traverse. Setting smaller values can reduce the amount of data retrieved from LDAP and improve authentication times at the cost of failing to discover deeply nested roles. Default is `30`. `skip_users` | Array of users that should be skipped when retrieving roles. Wildcards and regular expressions are supported. +`exclude_roles` | Array of roles that should be excluded when retrieving roles. Wildcards are supported. `nested_role_filter` | Array of role DNs that should be filtered before resolving nested roles. Wildcards and regular expressions are supported. `rolesearch_enabled` | Boolean. Enable or disable the role search. Default is `true`. `custom_attr_allowlist` | String array. Specifies the LDAP attributes that should be made available for variable substitution. 
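As a rough sketch of where these role-retrieval options sit in `config.yml`, the following fragment shows `skip_users` and the new `exclude_roles` setting under an LDAP authorization backend (the domain name, DNs, and patterns are placeholders, and connection settings such as hosts and bind credentials are omitted):

```yml
authz:
  roles_from_ldap:
    http_enabled: true
    transport_enabled: true
    authorization_backend:
      type: ldap
      config:
        # Connection settings (hosts, bind_dn, and so on) omitted for brevity.
        rolebase: 'ou=groups,dc=example,dc=com'
        rolesearch: '(member={0})'
        rolename: cn
        resolve_nested_roles: true
        skip_users:
          - kibanaserver
          - 'cn=service*'
        exclude_roles:
          - 'cn=internal-*'
```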
diff --git a/_security/authentication-backends/openid-connect.md b/_security/authentication-backends/openid-connect.md index 4d24f2eff7..22d62bee60 100755 --- a/_security/authentication-backends/openid-connect.md +++ b/_security/authentication-backends/openid-connect.md @@ -33,19 +33,26 @@ To integrate with an OpenID IdP, set up an authentication domain and choose `ope This is the minimal configuration: ```yml -openid_auth_domain: - http_enabled: true - transport_enabled: true - order: 0 - http_authenticator: - type: openid - challenge: false - config: - subject_key: preferred_username - roles_key: roles - openid_connect_url: https://keycloak.example.com:8080/auth/realms/master/.well-known/openid-configuration - authentication_backend: - type: noop +_meta: + type: "config" + config_version: 2 + +config: + dynamic: + authc: + openid_auth_domain: + http_enabled: true + transport_enabled: true + order: 0 + http_authenticator: + type: openid + challenge: false + config: + subject_key: preferred_username + roles_key: roles + openid_connect_url: https://keycloak.example.com:8080/auth/realms/master/.well-known/openid-configuration + authentication_backend: + type: noop ``` The following table shows the configuration parameters. @@ -341,7 +348,7 @@ opensearch.password: "kibanaserver" opensearch.ssl.verificationMode: none # allowlist basic headers and multi-tenancy header -opensearch.requestHeadersAllowlist: ["Authorization", "security_tenant"] +opensearch.requestHeadersAllowlist: ["Authorization", "securitytenant"] ``` To include OpenID Connect with other authentication types in the Dashboards sign-in window, see [Configuring sign-in options]({{site.url}}{{site.baseurl}}/security/configuration/multi-auth/). @@ -370,26 +377,33 @@ Because OpenSearch Dashboards requires that the internal OpenSearch Dashboards s Modify and apply the following example settings in `config.yml`: ```yml -basic_internal_auth_domain: - http_enabled: true - transport_enabled: true - order: 0 - http_authenticator: - type: basic - challenge: false - authentication_backend: - type: internal -openid_auth_domain: - http_enabled: true - transport_enabled: true - order: 1 - http_authenticator: - type: openid - challenge: false - config: - subject_key: preferred_username - roles_key: roles - openid_connect_url: https://keycloak.example.com:8080/auth/realms/master/.well-known/openid-configuration - authentication_backend: - type: noop +_meta: + type: "config" + config_version: 2 + +config: + dynamic: + authc: + basic_internal_auth_domain: + http_enabled: true + transport_enabled: true + order: 0 + http_authenticator: + type: basic + challenge: false + authentication_backend: + type: internal + openid_auth_domain: + http_enabled: true + transport_enabled: true + order: 1 + http_authenticator: + type: openid + challenge: false + config: + subject_key: preferred_username + roles_key: roles + openid_connect_url: https://keycloak.example.com:8080/auth/realms/master/.well-known/openid-configuration + authentication_backend: + type: noop ``` diff --git a/_security/authentication-backends/saml.md b/_security/authentication-backends/saml.md index b19b13761b..5313ffe950 100755 --- a/_security/authentication-backends/saml.md +++ b/_security/authentication-backends/saml.md @@ -19,37 +19,35 @@ This profile is meant for use with web browsers. It is not a general-purpose way We provide a fully functional example that can help you understand how to use SAML with OpenSearch Dashboards. -1. 
Download [the example zip file]({{site.url}}{{site.baseurl}}/assets/examples/saml-example-custom.zip) to a preferred location in your directory and unzip it. -1. At the command line, specify the location of the files in your directory and run `docker-compose up`. -1. Review the files: - - * `customize-docker-compose.yml`: Defines two OpenSearch nodes, an OpenSearch Dashboards server, and a SAML server. - * `customize-opensearch_dashboards.yml`: Includes SAML settings for the default `opensearch_dashboards.yml` file. - * `customize-config.yml`: Configures SAML for authentication. - - You can remove "customize" from the file names if you plan to modify and keep these files for production. - {: .tip } - -1. In the `docker-compose.yml` file, specify your OpenSearch version number in the `image` field for nodes 1 and 2, and OpenSearch Dashboards server. For example, if you are running OpenSearch version 2.6, the `image` fields will resemble the following examples: - - ```yml - opensearch-saml-node1: - image: opensearchproject/opensearch:2.8.0 - ``` - ```yml - opensearch-saml-node2: - image: opensearchproject/opensearch:2.8.0 - ``` - ```yml - opensearch-saml-dashboards: - image: opensearchproject/opensearch-dashboards:2.8.0 - ``` - -1. Access OpenSearch Dashboards at [http://localhost:5601](http://localhost:5601){:target='\_blank'}. Note that OpenSearch Dashboards immediately redirects you to the SAML login page. - -1. Log in as `admin` with a password of `admin`. - -1. After logging in, note that your user in the upper-right is `SAMLAdmin`, as defined in `/var/www/simplesamlphp/config/authsources.php` of the SAML server. +1. Visit the [saml-demo branch](https://github.com/opensearch-project/demos/tree/saml-demo) of the demos repository and download it to a folder of your choice. If you're not familiar with how to use GitHub, see the [OpenSearch onboarding guide](https://github.com/opensearch-project/demos/blob/main/ONBOARDING.md) for instructions. + +1. Navigate to the `demo` folder: + ```zsh + $ cd /demo + ``` + +1. Review the following files, as needed: + + * `.env`: + * Defines the OpenSearch and OpenSearch Dashboards version to use. The default is the latest version ({{site.opensearch_major_minor_version}}). + * Defines the `OPENSEARCH_INITIAL_ADMIN_PASSWORD` variable required by versions 2.12 and later. + * `./custom-config/opensearch_dashboards.yml`: Includes the SAML settings for the default `opensearch_dashboards.yml` file. + * `./custom-config/config.yml`: Configures SAML for authentication. + * `docker-compose.yml`: Defines an OpenSearch server node, an OpenSearch Dashboards server node, and a SAML server node. + * `./saml/config/authsources.php`: Contains the list of users that can be authenticated by this SAML domain. + +1. From the command line, run: + ```zsh + $ docker-compose up. + ``` + +1. Access OpenSearch Dashboards at [http://localhost:5601](http://localhost:5601){:target='\_blank'}. + +1. Select `Log in with single sign-on`. This redirects you to the SAML login page. + +1. Log in to OpenSearch Dashboards with a user defined in `./saml/config/authsources.php` (such as `user1` with password `user1pass`). + +1. After logging in, note that the user ID shown in the upper-right corner of the screen is the same as the `NameID` attribute for the user defined in `./saml/config/authsources.php` of the SAML server (that is, `saml-test` for `user1`). 1. If you want to examine the SAML server, run `docker ps` to find its container ID and then `docker exec -it /bin/bash`. 
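For example, the two commands in that last step might look like the following, where `<container-id>` is a placeholder for the ID that `docker ps` reports for the SAML server container:

```bash
# List the running containers and note the ID of the SAML server container.
docker ps

# Open an interactive shell inside that container.
docker exec -it <container-id> /bin/bash
```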
@@ -61,20 +59,26 @@ We provide a fully functional example that can help you understand how to use SA To use SAML for authentication, you need to configure a respective authentication domain in the `authc` section of `config/opensearch-security/config.yml`. Because SAML works solely on the HTTP layer, you do not need any `authentication_backend` and can set it to `noop`. Place all SAML-specific configuration options in this chapter in the `config` section of the SAML HTTP authenticator: ```yml -authc: - saml_auth_domain: - http_enabled: true - transport_enabled: false - order: 1 - http_authenticator: - type: saml - challenge: true - config: - idp: - metadata_file: okta.xml - ... - authentication_backend: - type: noop +_meta: + type: "config" + config_version: 2 + +config: + dynamic: + authc: + saml_auth_domain: + http_enabled: true + transport_enabled: false + order: 1 + http_authenticator: + type: saml + challenge: true + config: + idp: + metadata_file: okta.xml + ... + authentication_backend: + type: noop ``` After you have configured SAML in `config.yml`, you must also [activate it in OpenSearch Dashboards](#opensearch-dashboards-configuration). @@ -85,27 +89,33 @@ After you have configured SAML in `config.yml`, you must also [activate it in Op We recommend adding at least one other authentication domain, such as LDAP or the internal user database, to support API access to OpenSearch without SAML. For OpenSearch Dashboards and the internal OpenSearch Dashboards server user, you also must add another authentication domain that supports basic authentication. This authentication domain should be placed first in the chain, and the `challenge` flag must be set to `false`: ```yml -authc: - basic_internal_auth_domain: - http_enabled: true - transport_enabled: true - order: 0 - http_authenticator: - type: basic - challenge: false - authentication_backend: - type: internal - saml_auth_domain: - http_enabled: true - transport_enabled: false - order: 1 - http_authenticator: - type: saml - challenge: true - config: - ... - authentication_backend: - type: noop +_meta: + type: "config" + config_version: 2 + +config: + dynamic: + authc: + basic_internal_auth_domain: + http_enabled: true + transport_enabled: true + order: 0 + http_authenticator: + type: basic + challenge: false + authentication_backend: + type: internal + saml_auth_domain: + http_enabled: true + transport_enabled: false + order: 1 + http_authenticator: + type: saml + challenge: true + config: + ... + authentication_backend: + type: noop ``` @@ -224,7 +234,8 @@ SAML, unlike other protocols, is not meant to be used for exchanging user creden Name | Description :--- | :--- -`exchange_key` | The key to sign the token. The algorithm is HMAC-SHA512, so it should have at least 64 characters, and base64 URL encoding. +`exchange_key` | The key to sign the token. The algorithm is HMACSHA512, therefore we recommend to use 64 characters, for example `9a2h8ajasdfhsdiydfn7dtd6d5ashsd89a2h8ajasdHhsdiyLfn7dtd6d5ashsdI`. Ensure that you enter a value for `exchange_key`, otherwise an error is returned. 
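If you need to generate a suitable value, one option (an illustrative command, not part of the original documentation) is to create a random 64-character hexadecimal string:

```bash
# 32 random bytes, hex encoded, produce a 64-character string suitable for exchange_key.
openssl rand -hex 32
```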
+ ## TLS settings @@ -311,25 +322,31 @@ Name | Description The following example shows the minimal configuration: ```yml -authc: - saml_auth_domain: - http_enabled: true - transport_enabled: false - order: 1 - http_authenticator: - type: saml - challenge: true - config: - idp: - metadata_file: metadata.xml - entity_id: http://idp.example.com/ - sp: - entity_id: https://opensearch-dashboards.example.com - kibana_url: https://opensearch-dashboards.example.com:5601/ - roles_key: Role - exchange_key: 'peuvgOLrjzuhXf ...' - authentication_backend: - type: noop +_meta: + type: "config" + config_version: 2 + +config: + dynamic: + authc: + saml_auth_domain: + http_enabled: true + transport_enabled: false + order: 1 + http_authenticator: + type: saml + challenge: true + config: + idp: + metadata_file: metadata.xml + entity_id: http://idp.example.com/ + sp: + entity_id: https://opensearch-dashboards.example.com + kibana_url: https://opensearch-dashboards.example.com:5601/ + roles_key: Role + exchange_key: 'peuvgOLrjzuhXf ...' + authentication_backend: + type: noop ``` ## OpenSearch Dashboards configuration diff --git a/_security/configuration/demo-configuration.md b/_security/configuration/demo-configuration.md new file mode 100644 index 0000000000..0f8cd4138e --- /dev/null +++ b/_security/configuration/demo-configuration.md @@ -0,0 +1,124 @@ +--- +layout: default +title: Setting up a demo configuration +parent: Configuration +nav_order: 4 +--- + +# Setting up a demo configuration + +Welcome to the OpenSearch Security plugin demo configuration setup guide. This tool provides a quick and easy way to replicate a production environment for testing purposes. The demo configuration includes the setup of security-related components, such as internal users, roles, role mappings, audit configuration, basic authentication, tenants, and allow lists. + + +The demo configuration tool performs the following tasks: + +1. Configures security settings, which are then loaded into the security index. +2. Generates demo certificates. +3. Adds security-related settings to the `opensearch.yml` file. + +## Installing the demo configuration + +The demo configuration is automatically called as part of the setup for each supported distribution of OpenSearch. The following are instructions for each distribution. + +**Note**: Starting with OpenSearch 2.12, a custom admin password is required in order to install the demo configuration. If none is provided, the cluster will fail to start. Note that this change only affects new clusters. Existing clusters are not affected because they already have `opensearch.yml` configured, so the installation tool will not run. + +### Docker + +Use the following steps to set up the Security plugin using Docker: + +1. Download [docker-compose.yml](https://opensearch.org/downloads.html). +2. In the `docker-compose.yml` file, set `DISABLE_SECURITY_PLUGIN` to `false`. +3. Run the following command: + +```bash +docker-compose up +``` +{% include copy.html %} + +### Setting up a custom admin password +**Note**: For OpenSearch versions 2.12 and later, you must set the initial admin password before installation. To customize the admin password, you can take the following steps: + +1. Download the following sample [docker-compose.yml](https://github.com/opensearch-project/documentation-website/blob/{{site.opensearch_major_minor_version}}/assets/examples/docker-compose.yml) file. +2. Create a `.env` file. +3. 
Add the variable `OPENSEARCH_INITIAL_ADMIN_PASSWORD` and set the variable with a strong string password. +4. Run `docker-compose up`. + +### TAR (Linux) + +For TAR distributions on Linux, download the Linux setup files from the OpenSearch [Download & Get Started](https://opensearch.org/downloads.html) page. Then use the following command to run the demo configuration: + +```bash +./opensearch-tar-install.sh +``` +{% include copy.html %} + +For OpenSearch 2.12 or later, set a new custom admin password before installation by using the following command: + +```bash +export OPENSEARCH_INITIAL_ADMIN_PASSWORD= +``` +{% include copy.html %} + +### Windows + +For ZIP distributions on Windows, after downloading and extracting the setup files, run the following command: + +```powershell +> .\opensearch-windows-install.bat +``` +{% include copy.html %} + +For OpenSearch 2.12 or later, set a new custom admin password before installation by running the following command: + +```powershell +> set OPENSEARCH_INITIAL_ADMIN_PASSWORD= +``` +{% include copy.html %} + +### Helm + +For Helm charts, the demo configuration is automatically installed during the OpenSearch installation. For OpenSearch 2.12 or later, customize the admin password in `values.yaml` under `extraEnvs`: + +```yaml +extraEnvs: + - name: OPENSEARCH_INITIAL_ADMIN_PASSWORD + value: +``` + +### RPM + +For RPM packages, install OpenSearch and set up the demo configuration by running the following command: + +```bash +sudo yum install opensearch-{{site.opensearch_version}}-linux-x64.rpm +``` +{% include copy.html %} + +For OpenSearch 2.12 or later, set a new custom admin password before installation by using the following command: + +```bash +sudo env OPENSEARCH_INITIAL_ADMIN_PASSWORD= yum install opensearch-{{site.opensearch_version}}-linux-x64.rpm +``` +{% include copy.html %} + +### DEB + +For DEB packages, install OpenSearch and set up the demo configuration by running the following command: + +```bash +sudo dpkg -i opensearch-{{site.opensearch_version}}-linux-arm64.deb +``` +{% include copy.html %} + +For OpenSearch 2.12 or later, set a new custom admin password before installation by using the following command: + +```bash +sudo env OPENSEARCH_INITIAL_ADMIN_PASSWORD= dpkg -i opensearch-{{site.opensearch_version}}-linux-arm64.deb +``` +{% include copy.html %} + +## Local distribution + +If you are building a local distribution, refer to [DEVELOPER_GUIDE.md](https://github.com/opensearch-project/security/blob/main/DEVELOPER_GUIDE.md) for instructions on building a local binary for the Security plugin. + +For OpenSearch 2.12 or later, make sure that you set a strong password before installation. diff --git a/_security/configuration/disable-enable-security.md b/_security/configuration/disable-enable-security.md new file mode 100755 index 0000000000..811fd2a69f --- /dev/null +++ b/_security/configuration/disable-enable-security.md @@ -0,0 +1,202 @@ +--- +layout: default +title: Disabling and enabling the Security plugin +parent: Configuration +nav_order: 40 +has_toc: true +redirect_from: + - /security-plugin/configuration/disable/ +--- + +# Disabling and enabling the Security plugin + +The Security plugin is installed by default with OpenSearch, but you can temporarily disable it or remove it altogether. Disabling the plugin involves a change to the `opensearch.yml` file; you may want to do this to streamline testing. A more substantive change is required to remove the Security plugin completely. 
You might want to remove it if, for example, you are using your own security solution or need to remove it for development purposes. + +Disabling or removing the plugin exposes the configuration index for the Security plugin. If the index contains sensitive information, make sure to protect it through some other means. If you no longer need the index, delete it. +{: .warning } + +Disabling, removing, or installing the Security plugin requires a full cluster restart because during this process, the individual nodes are not able to communicate with each other. +{: .warning} + +## Disabling/enabling the Security plugin + +You can disable the Security plugin by editing the `opensearch.yml` file: + +```yml +plugins.security.disabled: true +``` +You can then enable the plugin by removing the `plugins.security.disabled` setting. + +## Removing and adding the Security plugin + +You can completely remove the Security plugin from your OpenSearch instance. Note that OpenSearch Dashboards can only run against a secure cluster, so if you uninstall the Security plugin, you'll also need to uninstall the OpenSearch Dashboards plugin. + +### Removing the Security plugin from OpenSearch + +Do the following to remove the plugin from OpenSearch. + +1. Disable shard allocation and stop all nodes so that shards don't move when the cluster is restarted: + + ```json + curl -XPUT "https://localhost:9200/_cluster/settings" -u "admin:" -H 'Content-Type: application/json' -d '{ + "transient": { + "cluster.routing.allocation.enable": "none" + } + }' + ``` + {% include copy.html %} +2. Delete all `plugins.security.*` configuration entries from `opensearch.yml`. +3. Uninstall the Security plugin by using the following command: + + ```bash + ./bin/opensearch-plugin remove opensearch-security + ``` +4. Restart the nodes and enable shard allocation: + ```json + curl -XPUT "http://localhost:9200/_cluster/settings" -H 'Content-Type: application/json' -d '{ + "transient": { + "cluster.routing.allocation.enable": "all" + } + }' + ``` + +To perform these steps on the Docker image, see [Working with plugins]({{site.url}}{{site.baseurl}}/opensearch/install/docker#working-with-plugins). +{: .note } + +### Removing the Security plugin from OpenSearch Dashboards + +If you disable the Security plugin in `opensearch.yml` and still want to use OpenSearch Dashboards, you must remove the corresponding OpenSearch Dashboards Security plugin. For more information, see [Remove plugins]({{site.url}}{{site.baseurl}}/install-and-configure/install-dashboards/plugins/#remove-plugins). + +Refer to the following installation types to remove the OpenSearch Dashboards plugin. + +#### Docker + +1. Remove all Security plugin configuration settings from `opensearch_dashboards.yml` or move the example file to the same folder as the `Dockerfile`: + + ```yml + --- + server.name: opensearch-dashboards + server.host: "0.0.0.0" + opensearch.hosts: http://localhost:9200 + ``` + +1. Create a new `Dockerfile`: + + ``` + FROM opensearchproject/opensearch-dashboards:{{site.opensearch_dashboards_version}} + RUN /usr/share/opensearch-dashboards/bin/opensearch-dashboards-plugin remove securityDashboards + COPY --chown=opensearch-dashboards:opensearch-dashboards opensearch_dashboards.yml /usr/share/opensearch-dashboards/config/ + ``` + +1. To build the new Docker image, run the following command: + + ```bash + docker build --tag=opensearch-dashboards-no-security . + ``` + +1. 
In `docker-compose.yml`, change `opensearchproject/opensearch-dashboards:{{site.opensearch_dashboards_version}}` to `opensearch-dashboards-no-security`. +1. Change `OPENSEARCH_HOSTS` or `opensearch.hosts` to `http://` rather than `https://`. +1. Enter `docker-compose up`. + +#### Tarball + +1. Navigate to the `/bin` directory in your OpenSearch Dashboards installation folder and stop the running OpenSearch Dashboards instance by pressing `Ctrl + C`. + +1. Run the following command to uninstall the Security plugin: + + ```bash + ./bin/opensearch-dashboards-plugin remove securityDashboards + ``` + +1. Remove all Security plugin configuration settings from the `opensearch_dashboards.yml` file or use the following example file: + + ```yml + --- + server.name: opensearch-dashboards + server.host: "0.0.0.0" + opensearch.hosts: http://localhost:9200 + ``` + +1. Start OpenSearch Dashboards: + ```bash + ./bin/opensearch-dashboards + ``` + +#### RPM and Debian + +1. Stop the running instance of OpenSearch Dashboards by using the following command: + + ```bash + sudo systemctl stop opensearch-dashboards + ``` + +1. Navigate to the OpenSearch Dashboards folder `/usr/share/opensearch-dashboards` and run the following command to uninstall the Security plugin: + + ```bash + ./bin/opensearch-dashboards-plugin remove securityDashboards + ``` + +1. Remove all Security plugin configuration settings from the `opensearch_dashboards.yml` file or place the example file in the `/etc/opensearch_dashboards` folder: + + ```yml + --- + server.name: opensearch-dashboards + server.host: "0.0.0.0" + opensearch.hosts: http://localhost:9200 + ``` +1. Start OpenSearch Dashboards: + ```bash + sudo systemctl start opensearch-dashboards + ``` + +### Installing the Security plugin + +Use the following steps to reinstall the plugin: + +1. Disable shard allocation and stop all nodes so that shards don't move when the cluster is restarted: + + ```json + curl -XPUT "http://localhost:9200/_cluster/settings" -H 'Content-Type: application/json' -d '{ + "transient": { + "cluster.routing.allocation.enable": "none" + } + }' + ``` + {% include copy.html %} + +2. Install the Security plugin on all nodes in your cluster using one of the [installation methods]({{site.url}}{{site.baseurl}}/install-and-configure/plugins/#install): + + ```bash + bin/opensearch-plugin install opensearch-security + ``` + {% include copy.html %} + +3. Add the necessary configuration to `opensearch.yml` for TLS encryption. See +[Configuration]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/security-settings/) for information about the settings that need to be configured. + +4. Create the `OPENSEARCH_INITIAL_ADMIN_PASSWORD` variable. For more information, see [Setting up a custom admin password](https://opensearch.org/docs/latest/security/configuration/demo-configuration/#setting-up-a-custom-admin-password). + +5. Restart the nodes and reenable shard allocation: + + ```json + curl -XPUT "https://localhost:9200/_cluster/settings" -u "admin:" -H 'Content-Type: application/json' -d '{ + "transient": { + "cluster.routing.allocation.enable": "all" + } + }' + ``` + {% include copy.html %} + +### Installing the Security plugin on OpenSearch Dashboards + +Use the following steps to reinstall the plugin on OpenSearch Dashboards: + +1. Stop running your OpenSearch Dashboards cluster. +2. Install the Security plugin: + + ```bash + ./bin/opensearch-dashboards-plugin install securityDashboards + ``` + +4. 
Add the necessary [configuration]({{site.url}}{{site.baseurl}}/install-and-configure/install-dashboards/tls/) settings in the `opensearch_dashboards.yml` file. +5. Start OpenSearch Dashboards. If the plugin was successfully installed, you'll be prompted to enter your login credentials. diff --git a/_security/configuration/disable.md b/_security/configuration/disable.md deleted file mode 100755 index 568a79d094..0000000000 --- a/_security/configuration/disable.md +++ /dev/null @@ -1,121 +0,0 @@ ---- -layout: default -title: Disabling security -parent: Configuration -nav_order: 40 -redirect_from: - - /security-plugin/configuration/disable/ ---- - -# Disabling security - -You might want to temporarily disable the Security plugin to make testing or internal usage more straightforward. The Security plugin is actually two plugins: one for OpenSearch and one for OpenSearch Dashboards. You can use the OpenSearch plugin independently, but the OpenSearch Dashboards plugin requires a secured OpenSearch cluster. - -To disable the OpenSearch Security plugin, add the following line in `opensearch.yml`: - -```yml -plugins.security.disabled: true -``` - - -## Removing the OpenSearch plugin - -A more permanent option is to remove the Security plugin entirely: - -1. Delete the `plugins/opensearch-security` folder on all nodes. -1. Delete all `plugins.security.*` configuration entries from `opensearch.yml`. -1. Uninstall the Security plugin by using the following command: -```bash -/usr/share/opensearch/opensearch-plugin remove opensearch-security -``` - -To perform these steps on the Docker image, see [Working with plugins]({{site.url}}{{site.baseurl}}/opensearch/install/docker#working-with-plugins). - -Disabling or removing the plugin exposes the configuration index for the Security plugin. If the index contains sensitive information, be sure to protect it through some other means. If you no longer need the index, delete it. -{: .warning } - - -## Removing the OpenSearch Dashboards plugin - -If you disable the Security plugin in `opensearch.yml` (or delete the plugin entirely) and still want to use OpenSearch Dashboards, you must remove the corresponding OpenSearch Dashboards plugin. For more information, see [OpenSearch Dashboards remove plugins]({{site.url}}{{site.baseurl}}/install-and-configure/install-dashboards/plugins/#remove-plugins). - -Refer to the following installation types to remove the OpenSearch Dashboards plugin. - -### Docker - -1. Remove all Security plugin configuration settings from `opensearch_dashboards.yml` or place the example file in the same folder as the `Dockerfile`: - - ```yml - --- - server.name: opensearch-dashboards - server.host: "0.0.0.0" - opensearch.hosts: http://localhost:9200 - ``` - -1. Create a new `Dockerfile`: - - ``` - FROM opensearchproject/opensearch-dashboards:{{site.opensearch_dashboards_version}} - RUN /usr/share/opensearch-dashboards/bin/opensearch-dashboards-plugin remove securityDashboards - COPY --chown=opensearch-dashboards:opensearch-dashboards opensearch_dashboards.yml /usr/share/opensearch-dashboards/config/ - ``` - -1. To build the new Docker image, run the following command: - - ```bash - docker build --tag=opensearch-dashboards-no-security . - ``` - -1. In `docker-compose.yml`, change `opensearchproject/opensearch-dashboards:{{site.opensearch_dashboards_version}}` to `opensearch-dashboards-no-security`. -1. Change `OPENSEARCH_HOSTS` or `opensearch.hosts` to `http://` rather than `https://`. -1. Enter `docker-compose up`. - -### Tarball - -1. 
Navigate to the `/bin` directory in your OpenSearch Dashboards installation folder and stop the running OpenSearch Dashboards instance by pressing `Ctrl + C`. - -1. Run the following command to uninstall the Security plugin: - - ```bash - ./bin/opensearch-dashboards-plugin remove securityDashboards - ``` - -1. Remove all Security plugin configuration settings from the `opensearch_dashboards.yml` file or use the following example file: - - ```yml - --- - server.name: opensearch-dashboards - server.host: "0.0.0.0" - opensearch.hosts: http://localhost:9200 - ``` -1. Start OpenSearch Dashboards. - ```bash - ./bin/opensearch-dashboards - ``` - -### RPM and Debian - -1. Stop the running instance of OpenSearch Dashboards by using the following command: - - ```bash - sudo systemctl stop opensearch-dashboards - ``` - -1. Navigate to the OpenSearch Dashboards folder `/usr/share/opensearch-dashboards` and run the following command to uninstall the Security plugin: - - ```bash - ./bin/opensearch-dashboards-plugin remove securityDashboards - ``` - -1. Remove all Security plugin configuration settings from the `opensearch_dashboards.yml` file or place the example file in the `/etc/opensearch_dashboards` folder: - - ```yml - --- - server.name: opensearch-dashboards - server.host: "0.0.0.0" - opensearch.hosts: http://localhost:9200 - ``` -1. Start OpenSearch Dashboards: - ```bash - sudo systemctl start opensearch-dashboards - ``` diff --git a/_security/configuration/index.md b/_security/configuration/index.md index 05dc3696cb..31292c320a 100644 --- a/_security/configuration/index.md +++ b/_security/configuration/index.md @@ -22,7 +22,7 @@ The plugin includes demo certificates so that you can get up and running quickly 1. Start OpenSearch. 1. [Add users, roles, role mappings, and tenants]({{site.url}}{{site.baseurl}}/security/access-control/index/). -If you don't want to use the plugin, see [Disable security]({{site.url}}{{site.baseurl}}/security/configuration/disable). +If you don't want to use the plugin, see [Disable security]({{site.url}}{{site.baseurl}}/security/configuration/disable-enable-security/). The Security plugin has several default users, roles, action groups, permissions, and settings for OpenSearch Dashboards that use kibana in their names. We will change these names in a future release. {: .note } diff --git a/_security/configuration/opensearch-keystore.md b/_security/configuration/opensearch-keystore.md new file mode 100644 index 0000000000..3c78c9a8e3 --- /dev/null +++ b/_security/configuration/opensearch-keystore.md @@ -0,0 +1,128 @@ +--- +layout: default +title: OpenSearch keystore +parent: Configuration +nav_order: 50 +--- + +# OpenSearch keystore + +`opensearch-keystore` is a utility script used to manage an OpenSearch keystore. An OpenSearch keystore provides a secure method of storing sensitive information, such as passwords and keys, used in an OpenSearch cluster. The script allows you to securely create, list, add, and remove settings. It is included in the OpenSearch distribution. + +## Usage + +In order to use the `opensearch-keystore` script, you must have access to the file system containing the OpenSearch installation and the ability to execute OpenSearch scripts. + +To use `opensearch-keystore`, open a terminal and use the following command syntax: + +``` +opensearch-keystore [command] [options] +``` +{% include copy.html %} + +## Commands + +The `opensearch-keystore` script supports the following commands: + +- `create`: Initializes a new keystore.
If a keystore already exists, this command will overwrite the existing keystore. +- `list`: Lists all settings in the keystore. +- `add `: Adds a new setting to the current keystore. When a new setting is added, the script prompts you for the value of that setting. After adding the setting and value, both are securely stored in the keystore. +- `add-file `: Adds a new file to the keystore. +- `remove `: Removes an existing setting from the keystore. +- `upgrade `: Upgrades an existing setting in the keystore. +- `passwd`: Sets a password for the keystore. +- `has-passwd`: Prints whether the keystore is password protected. +- `help`: Displays help information about all `opensearch-keystore` commands. + +## Options + +You can append the following options to each command: + +- `-h, --help`: Displays help information about the script and its options. +- `-s, --silent`: Provides minimal output when the script responds to a command. +- `-v, --verbose`: Provides a verbose output for debugging purposes. + +## Examples + +The following examples provide the basic syntax for common `opensearch-keystore` commands: + +### Creating a new keystore + +The following command creates a new keystore: + +```bash +./bin/opensearch-keystore create +``` +{% include copy.html %} + +If a keystore already exists, the script will ask whether you would like to overwrite the existing keystore. + +The script responds with a confirmation that the keystore was created: + +```bash +Created opensearch keystore in $OPENSEARCH_HOME/config/opensearch.keystore +``` + +### Setting a keystore password + +The following command sets a new keystore password: + +```bash +./bin/opensearch-keystore passwd +``` +{% include copy.html %} + +If a keystore password already exists, the script will ask for the current keystore password before you can reset the password. + +**Response** + +The script responds with a confirmation that the keystore password was set successfully: + +```bash +OpenSearch keystore password changed successfully. +``` + +When starting OpenSearch, you will be prompted to enter the keystore password. Alternatively, you can set the `KEYSTORE_PASSWORD` environment variable to avoid being prompted for the password on startup. +{: .note} + +### Listing settings in the keystore + +The following command lists all settings currently in the keystore: + +```bash +./bin/opensearch-keystore list +``` +{% include copy.html %} + +The script responds with a list of settings in the keystore: + +```bash +keystore.seed +plugins.security.ssl.http.pemkey_password_secure +``` + +### Adding a new setting + +The following command adds a new keystore setting: + +```bash +./bin/opensearch-keystore add plugins.security.ssl.http.pemkey_password_secure +``` +{% include copy.html %} + +After you run this command, you will be prompted to enter the secret value securely. + +### Removing a setting + +The following command removes a keystore setting: + +```bash +./bin/opensearch-keystore remove plugins.security.ssl.http.pemkey_password_secure +``` +{% include copy.html %} + +This command does not return a response. To confirm that the setting was deleted, use `opensearch-keystore list`. + +## Keystore entries as OpenSearch settings + +After a setting has been added to a keystore, it is implicitly added to the OpenSearch configuration as if it were another entry in `opensearch.yml`. To modify a keystore entry, use `./bin/opensearch-keystore upgrade `. To remove an entry, use `./bin/opensearch-keystore remove `.
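Putting the preceding commands together, a typical first-time workflow might look like the following sketch. The setting name mirrors the earlier examples, the `KEYSTORE_PASSWORD` environment variable is the startup option mentioned in the note above, and paths should be adjusted to your installation:

```bash
# Create the keystore and protect it with a password.
./bin/opensearch-keystore create
./bin/opensearch-keystore passwd

# Store a sensitive value; the script prompts for the secret interactively.
./bin/opensearch-keystore add plugins.security.ssl.http.pemkey_password_secure

# Confirm that the entry was stored.
./bin/opensearch-keystore list

# Start OpenSearch without an interactive keystore prompt by exporting the keystore password.
export KEYSTORE_PASSWORD=<keystore-password>
./bin/opensearch
```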
diff --git a/_security/configuration/yaml.md b/_security/configuration/yaml.md index 258866a7f8..af60238b42 100644 --- a/_security/configuration/yaml.md +++ b/_security/configuration/yaml.md @@ -139,12 +139,12 @@ plugins.security.cache.ttl_minutes: 60 ### Enabling user access to system indexes -Mapping a system index permission to a user allows that user to modify the system index specified in the permission's name (the one exception is the Security plugin's [system index]({{site.url}}{{site.baseurl}}/security/configuration/system-indices/)). The `plugins.security.system_indices.permissions.enabled` setting provides a way for administrators to make this permission available for or hidden from role mapping. +Mapping a system index permission to a user allows that user to modify the system index specified in the permission's name (the one exception is the Security plugin's [system index]({{site.url}}{{site.baseurl}}/security/configuration/system-indices/)). The `plugins.security.system_indices.permission.enabled` setting provides a way for administrators to make this permission available for or hidden from role mapping. When set to `true`, the feature is enabled and users with permission to modify roles can create roles that include permissions that grant access to system indexes: ```yml -plugins.security.system_indices.permissions.enabled: true +plugins.security.system_indices.permission.enabled: true ``` When set to `false`, the permission is disabled and only admins with an admin certificate can make changes to system indexes. By default, the permission is set to `false` in a new cluster. diff --git a/_security/multi-tenancy/multi-tenancy-config.md b/_security/multi-tenancy/multi-tenancy-config.md index e6b1e16eb3..a4da35d6e9 100644 --- a/_security/multi-tenancy/multi-tenancy-config.md +++ b/_security/multi-tenancy/multi-tenancy-config.md @@ -8,7 +8,7 @@ nav_order: 145 # Multi-tenancy configuration -Multi-tenancy is enabled by default, but you can disable it or change its settings using `config/opensearch-security/config.yml`: +Multi-tenancy is enabled in OpenSearch Dashboards by default. If you need to disable or change settings related to multi-tenancy, see the `kibana` settings in `config/opensearch-security/config.yml`, as shown in the following example: ```yml config: diff --git a/_security/multi-tenancy/tenant-index.md b/_security/multi-tenancy/tenant-index.md index 40a7cd7016..d4e13ad193 100644 --- a/_security/multi-tenancy/tenant-index.md +++ b/_security/multi-tenancy/tenant-index.md @@ -16,7 +16,7 @@ redirect_from: - **Private** -- This tenant is exclusive to each user and can't be shared. It does not allow you to access routes or index patterns created by the user's global tenant. - **Custom** -- Administrators can create custom tenants and assign them to specific roles. Once created, these tenants can then provide spaces for specific groups of users. -The global tenant is not a *primary* tenant in the sense that it replicates its content in a private tenant. To the contrary, if you make a change to your global tenant, you won't see that change reflected in your private tenant. Some example changes include the following: +The global tenant in OpenSearch Dashboards doesn't synchronize its content with private tenants. When you make modifications inside your global tenant, these changes are exclusive to the global tenant. They aren't automatically mirrored or replicated in the private tenant. 
Some example changes to both private and global tenants include the following: - Change advanced settings - Create visualizations diff --git a/_tools/index.md b/_tools/index.md index 1b8817337f..108f10da97 100644 --- a/_tools/index.md +++ b/_tools/index.md @@ -107,6 +107,8 @@ Some users report compatibility issues with ingest pipelines on these versions o Beats versions newer than 7.12.x are not supported by OpenSearch. If you must update the Beats agent(s) in your environment to a newer version, you can work around the incompatibility by directing traffic from Beats to Logstash and using the Logstash Output plugin to ingest the data to OpenSearch. {: .warning } +For recommendations about log and metrics collection tools, see the [Frequently Asked Questions](https://opensearch.org/faq/#q1.20). + ## OpenSearch CLI The OpenSearch CLI command line interface (opensearch-cli) lets you manage your OpenSearch cluster from the command line and automate tasks. For more information about OpenSearch CLI, see [OpenSearch CLI]({{site.url}}{{site.baseurl}}/tools/cli/). @@ -119,4 +121,4 @@ The OpenSearch Kubernetes Operator is an open-source Kubernetes operator that he OpenSearch migration tools facilitate migrations to OpenSearch and upgrades to newer versions of OpenSearch. These can help you can set up a proof-of-concept environment locally using Docker containers or deploy to AWS using a one-click deployment script. This empowers you to fine-tune cluster configurations and manage workloads more effectively before migration. -For more information about OpenSearch migration tools, see the documentation in the [OpenSearch Migration GitHub repository](https://github.com/opensearch-project/opensearch-migrations/tree/capture-and-replay-v0.1.0). \ No newline at end of file +For more information about OpenSearch migration tools, see the documentation in the [OpenSearch Migration GitHub repository](https://github.com/opensearch-project/opensearch-migrations/tree/capture-and-replay-v0.1.0). diff --git a/_troubleshoot/index.md b/_troubleshoot/index.md index c10a502073..22c7a1018f 100644 --- a/_troubleshoot/index.md +++ b/_troubleshoot/index.md @@ -30,7 +30,7 @@ If you run legacy Kibana OSS scripts against OpenSearch Dashboards---for example In this case, your scripts likely include the `"kbn-xsrf: true"` header. 
Switch it to the `osd-xsrf: true` header: ``` -curl -XPOST -u 'admin:admin' 'https://DASHBOARDS_ENDPOINT/api/saved_objects/_import' -H 'osd-xsrf:true' --form file=@export.ndjson +curl -XPOST -u 'admin:' 'https://DASHBOARDS_ENDPOINT/api/saved_objects/_import' -H 'osd-xsrf:true' --form file=@export.ndjson ``` diff --git a/_tuning-your-cluster/availability-and-recovery/remote-store/index.md b/_tuning-your-cluster/availability-and-recovery/remote-store/index.md index fc6643955a..5fd19f5cc2 100644 --- a/_tuning-your-cluster/availability-and-recovery/remote-store/index.md +++ b/_tuning-your-cluster/availability-and-recovery/remote-store/index.md @@ -86,7 +86,7 @@ curl -X POST "https://localhost:9200/_remotestore/_restore" -H 'Content-Type: ap **Restore all shards of a given index** ```bash -curl -X POST "https://localhost:9200/_remotestore/_restore?restore_all_shards=true" -ku admin:admin -H 'Content-Type: application/json' -d' +curl -X POST "https://localhost:9200/_remotestore/_restore?restore_all_shards=true" -ku admin: -H 'Content-Type: application/json' -d' { "indices": ["my-index"] } diff --git a/_tuning-your-cluster/availability-and-recovery/remote-store/remote-cluster-state.md b/_tuning-your-cluster/availability-and-recovery/remote-store/remote-cluster-state.md index c5581d864d..7cc533fe76 100644 --- a/_tuning-your-cluster/availability-and-recovery/remote-store/remote-cluster-state.md +++ b/_tuning-your-cluster/availability-and-recovery/remote-store/remote-cluster-state.md @@ -23,8 +23,13 @@ _Cluster state_ is an internal data structure that contains the metadata of the The cluster state metadata is managed by the elected cluster manager node and is essential for the cluster to properly function. When the cluster loses the majority of the cluster manager nodes permanently, then the cluster may experience data loss because the latest cluster state metadata might not be present in the surviving cluster manager nodes. Persisting the state of all the cluster manager nodes in the cluster to remote-backed storage provides better durability. -When the remote cluster state feature is enabled, the cluster metadata will be published to a remote repository configured in the cluster. As of OpenSearch 2.10, only index metadata will persist to remote-backed storage. -Any time new cluster manager nodes are launched after disaster recovery, the nodes will automatically bootstrap using the latest index metadata stored in the remote repository. Consequently, the index data will also be restored when the remote store is enabled. +When the remote cluster state feature is enabled, the cluster metadata will be published to a remote repository configured in the cluster. +Any time new cluster manager nodes are launched after disaster recovery, the nodes will automatically bootstrap using the latest metadata stored in the remote repository. This provides metadata durability. + +You can enable remote cluster state independently of remote-backed data storage. +{: .note} + +If you require data durability, you must enable remote-backed data storage as described in the [remote store documentation]({{site.url}}{{site.baseurl}}/tuning-your-cluster/availability-and-recovery/remote-store/index/). ## Configuring the remote cluster state @@ -45,9 +50,16 @@ node.attr.remote_store.repository.my-remote-state-repo.settings.region: +
Table of contents @@ -101,6 +101,8 @@ You will most likely not need to specify any parameters except for `location`. F After the Docker cluster starts, skip to step 7. + If you're using [AWS IAM instance profile](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_use_switch-role-ec2_instance-profiles.html) to allow OpenSearch nodes on AWS EC2 instances to inherit roles for policies when granting access to AWS S3 buckets, skip to step 8. + 1. Add your AWS access and secret keys to the OpenSearch keystore: ```bash diff --git a/_tuning-your-cluster/index.md b/_tuning-your-cluster/index.md index 2f9da5f6be..4a2a027d2b 100644 --- a/_tuning-your-cluster/index.md +++ b/_tuning-your-cluster/index.md @@ -177,7 +177,7 @@ less /var/log/opensearch/opensearch-cluster.log Perform the following `_cat` query on any node to see all the nodes formed as a cluster: ```bash -curl -XGET https://:9200/_cat/nodes?v -u 'admin:admin' --insecure +curl -XGET https://:9200/_cat/nodes?v -u 'admin:' --insecure ``` ``` diff --git a/_tuning-your-cluster/replication-plugin/auto-follow.md b/_tuning-your-cluster/replication-plugin/auto-follow.md index fb94622727..828b835387 100644 --- a/_tuning-your-cluster/replication-plugin/auto-follow.md +++ b/_tuning-your-cluster/replication-plugin/auto-follow.md @@ -28,7 +28,7 @@ Replication rules are a collection of patterns that you create against a single Create a replication rule on the follower cluster: ```bash -curl -XPOST -k -H 'Content-Type: application/json' -u 'admin:admin' 'https://localhost:9200/_plugins/_replication/_autofollow?pretty' -d ' +curl -XPOST -k -H 'Content-Type: application/json' -u 'admin:' 'https://localhost:9200/_plugins/_replication/_autofollow?pretty' -d ' { "leader_alias" : "my-connection-alias", "name": "my-replication-rule", @@ -46,13 +46,13 @@ If the Security plugin is disabled, you can leave out the `use_roles` parameter. To test the rule, create a matching index on the leader cluster: ```bash -curl -XPUT -k -H 'Content-Type: application/json' -u 'admin:admin' 'https://localhost:9201/movies-0001?pretty' +curl -XPUT -k -H 'Content-Type: application/json' -u 'admin:' 'https://localhost:9201/movies-0001?pretty' ``` And confirm its replica shows up on the follower cluster: ```bash -curl -XGET -u 'admin:admin' -k 'https://localhost:9200/_cat/indices?v' +curl -XGET -u 'admin:' -k 'https://localhost:9200/_cat/indices?v' ``` It might take several seconds for the index to appear. 
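If you want to go a step further than listing indexes, you can also query the replication status API on the follower cluster for the index created by the rule. This sketch assumes that the auto-created follower index keeps the leader index name (`movies-0001`) and uses an admin password placeholder of your choosing:

```bash
# Check that the follower index created by the replication rule is syncing.
curl -XGET -k -u 'admin:<password>' 'https://localhost:9200/_plugins/_replication/movies-0001/_status?pretty'
```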
@@ -67,7 +67,7 @@ yellow open movies-0001 kHOxYYHxRMeszLjTD9rvSQ 1 1 0 To retrieve a list of existing replication rules that are configured on a cluster, send the following request: ```bash -curl -XGET -u 'admin:admin' -k 'https://localhost:9200/_plugins/_replication/autofollow_stats' +curl -XGET -u 'admin:' -k 'https://localhost:9200/_plugins/_replication/autofollow_stats' { "num_success_start_replication": 1, @@ -96,7 +96,7 @@ curl -XGET -u 'admin:admin' -k 'https://localhost:9200/_plugins/_replication/aut To delete a replication rule, send the following request to the follower cluster: ```bash -curl -XDELETE -k -H 'Content-Type: application/json' -u 'admin:admin' 'https://localhost:9200/_plugins/_replication/_autofollow?pretty' -d ' +curl -XDELETE -k -H 'Content-Type: application/json' -u 'admin:' 'https://localhost:9200/_plugins/_replication/_autofollow?pretty' -d ' { "leader_alias" : "my-conection-alias", "name": "my-replication-rule" diff --git a/_tuning-your-cluster/replication-plugin/getting-started.md b/_tuning-your-cluster/replication-plugin/getting-started.md index c8184d0646..2387c23b68 100644 --- a/_tuning-your-cluster/replication-plugin/getting-started.md +++ b/_tuning-your-cluster/replication-plugin/getting-started.md @@ -32,7 +32,7 @@ In addition, verify and add the distinguished names (DNs) of each follower clust First, get the node's DN from each follower cluster: ```bash -curl -XGET -k -u 'admin:admin' 'https://localhost:9200/_opendistro/_security/api/ssl/certs?pretty' +curl -XGET -k -u 'admin:' 'https://localhost:9200/_opendistro/_security/api/ssl/certs?pretty' { "transport_certificates_list": [ @@ -110,13 +110,13 @@ networks: After the clusters start, verify the names of each: ```bash -curl -XGET -u 'admin:admin' -k 'https://localhost:9201' +curl -XGET -u 'admin:' -k 'https://localhost:9201' { "cluster_name" : "leader-cluster", ... } -curl -XGET -u 'admin:admin' -k 'https://localhost:9200' +curl -XGET -u 'admin:' -k 'https://localhost:9200' { "cluster_name" : "follower-cluster", ... @@ -148,7 +148,7 @@ Cross-cluster replication follows a "pull" model, so most changes occur on the f On the follower cluster, add the IP address (with port 9300) for each seed node. Because this is a single-node cluster, you only have one seed node. Provide a descriptive name for the connection, which you'll use in the request to start replication: ```bash -curl -XPUT -k -H 'Content-Type: application/json' -u 'admin:admin' 'https://localhost:9200/_cluster/settings?pretty' -d ' +curl -XPUT -k -H 'Content-Type: application/json' -u 'admin:' 'https://localhost:9200/_cluster/settings?pretty' -d ' { "persistent": { "cluster": { @@ -167,13 +167,13 @@ curl -XPUT -k -H 'Content-Type: application/json' -u 'admin:admin' 'https://loca To get started, create an index called `leader-01` on the leader cluster: ```bash -curl -XPUT -k -H 'Content-Type: application/json' -u 'admin:admin' 'https://localhost:9201/leader-01?pretty' +curl -XPUT -k -H 'Content-Type: application/json' -u 'admin:' 'https://localhost:9201/leader-01?pretty' ``` Then start replication from the follower cluster. 
In the request body, provide the connection name and leader index that you want to replicate, along with the security roles you want to use: ```bash -curl -XPUT -k -H 'Content-Type: application/json' -u 'admin:admin' 'https://localhost:9200/_plugins/_replication/follower-01/_start?pretty' -d ' +curl -XPUT -k -H 'Content-Type: application/json' -u 'admin:' 'https://localhost:9200/_plugins/_replication/follower-01/_start?pretty' -d ' { "leader_alias": "my-connection-alias", "leader_index": "leader-01", @@ -194,7 +194,7 @@ This command creates an identical read-only index named `follower-01` on the fol After replication starts, get the status: ```bash -curl -XGET -k -u 'admin:admin' 'https://localhost:9200/_plugins/_replication/follower-01/_status?pretty' +curl -XGET -k -u 'admin:' 'https://localhost:9200/_plugins/_replication/follower-01/_status?pretty' { "status" : "SYNCING", @@ -217,13 +217,13 @@ The leader and follower checkpoint values begin as negative numbers and reflect To confirm that replication is actually happening, add a document to the leader index: ```bash -curl -XPUT -k -H 'Content-Type: application/json' -u 'admin:admin' 'https://localhost:9201/leader-01/_doc/1?pretty' -d '{"The Shining": "Stephen King"}' +curl -XPUT -k -H 'Content-Type: application/json' -u 'admin:' 'https://localhost:9201/leader-01/_doc/1?pretty' -d '{"The Shining": "Stephen King"}' ``` Then validate the replicated content on the follower index: ```bash -curl -XGET -k -u 'admin:admin' 'https://localhost:9200/follower-01/_search?pretty' +curl -XGET -k -u 'admin:' 'https://localhost:9200/follower-01/_search?pretty' { ... @@ -251,13 +251,13 @@ After the first replication API trigger, the `.replication-metadata-store` index You can temporarily pause replication of an index if you need to remediate issues or reduce load on the leader cluster: ```bash -curl -XPOST -k -H 'Content-Type: application/json' -u 'admin:admin' 'https://localhost:9200/_plugins/_replication/follower-01/_pause?pretty' -d '{}' +curl -XPOST -k -H 'Content-Type: application/json' -u 'admin:' 'https://localhost:9200/_plugins/_replication/follower-01/_pause?pretty' -d '{}' ``` To confirm that replication is paused, get the status: ```bash -curl -XGET -k -u 'admin:admin' 'https://localhost:9200/_plugins/_replication/follower-01/_status?pretty' +curl -XGET -k -u 'admin:' 'https://localhost:9200/_plugins/_replication/follower-01/_status?pretty' { "status" : "PAUSED", @@ -271,7 +271,7 @@ curl -XGET -k -u 'admin:admin' 'https://localhost:9200/_plugins/_replication/fol When you're done making changes, resume replication: ```bash -curl -XPOST -k -H 'Content-Type: application/json' -u 'admin:admin' 'https://localhost:9200/_plugins/_replication/follower-01/_resume?pretty' -d '{}' +curl -XPOST -k -H 'Content-Type: application/json' -u 'admin:' 'https://localhost:9200/_plugins/_replication/follower-01/_resume?pretty' -d '{}' ``` When replication resumes, the follower index picks up any changes that were made to the leader index while replication was paused. 
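To verify that the follower has actually resumed, you can request the status once more; an active follower reports `SYNCING` rather than `PAUSED`. The following sketch uses the same admin password placeholder as the preceding commands:

```bash
# Confirm that replication is active again after resuming.
curl -XGET -k -u 'admin:<password>' 'https://localhost:9200/_plugins/_replication/follower-01/_status?pretty'
```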
@@ -283,7 +283,7 @@ Note that you can't resume replication after it's been paused for more than 12 h When you no longer need to replicate an index, terminate replication from the follower cluster: ```bash -curl -XPOST -k -H 'Content-Type: application/json' -u 'admin:admin' 'https://localhost:9200/_plugins/_replication/follower-01/_stop?pretty' -d '{}' +curl -XPOST -k -H 'Content-Type: application/json' -u 'admin:' 'https://localhost:9200/_plugins/_replication/follower-01/_stop?pretty' -d '{}' ``` When you stop replication, the follower index un-follows the leader and becomes a standard index that you can write to. You can't restart replication after stopping it. @@ -291,7 +291,7 @@ When you stop replication, the follower index un-follows the leader and becomes Get the status to confirm that the index is no longer being replicated: ```bash -curl -XGET -k -u 'admin:admin' 'https://localhost:9200/_plugins/_replication/follower-01/_status?pretty' +curl -XGET -k -u 'admin:' 'https://localhost:9200/_plugins/_replication/follower-01/_status?pretty' { "status" : "REPLICATION NOT IN PROGRESS" diff --git a/_upgrade-to/upgrade-to.md b/_upgrade-to/upgrade-to.md index 3316072a07..340055b214 100644 --- a/_upgrade-to/upgrade-to.md +++ b/_upgrade-to/upgrade-to.md @@ -87,7 +87,7 @@ If you are migrating an Open Distro for Elasticsearch cluster, we recommend firs # Elasticsearch OSS curl -XGET 'localhost:9200/_nodes/_all?pretty=true' # Open Distro for Elasticsearch with Security plugin enabled - curl -XGET 'https://localhost:9200/_nodes/_all?pretty=true' -u 'admin:admin' -k + curl -XGET 'https://localhost:9200/_nodes/_all?pretty=true' -u 'admin:' -k ``` Specifically, check the `nodes..version` portion of the response. Also check `_cat/indices?v` for a green status on all indexes. @@ -169,7 +169,7 @@ If you are migrating an Open Distro for Elasticsearch cluster, we recommend firs # Security plugin disabled curl -XGET 'localhost:9200/_nodes/_all?pretty=true' # Security plugin enabled - curl -XGET -k -u 'admin:admin' 'https://localhost:9200/_nodes/_all?pretty=true' + curl -XGET -k -u 'admin:' 'https://localhost:9200/_nodes/_all?pretty=true' ``` Specifically, check the `nodes..version` portion of the response. Also check `_cat/indices?v` for a green status on all indexes. 
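For example, the index health check mentioned in these steps might look like the following on a cluster with the Security plugin enabled (a sketch; supply your own admin password):

```bash
# Verify that all indexes report green status before proceeding with the upgrade.
curl -XGET -k -u 'admin:<password>' 'https://localhost:9200/_cat/indices?v'
```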
diff --git a/assets/examples/docker-compose.yml b/assets/examples/docker-compose.yml index 53a8aaf4db..bab29f90ca 100644 --- a/assets/examples/docker-compose.yml +++ b/assets/examples/docker-compose.yml @@ -10,6 +10,7 @@ services: - cluster.initial_cluster_manager_nodes=opensearch-node1,opensearch-node2 # Nodes eligibile to serve as cluster manager - bootstrap.memory_lock=true # Disable JVM heap memory swapping - "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m" # Set min and max JVM heap sizes to at least 50% of system RAM + - OPENSEARCH_INITIAL_ADMIN_PASSWORD=${OPENSEARCH_INITIAL_ADMIN_PASSWORD} # Sets the demo admin user password when using demo configuration (for OpenSearch 2.12 and later) ulimits: memlock: soft: -1 # Set memlock to unlimited (no soft or hard limit) @@ -34,6 +35,7 @@ services: - cluster.initial_cluster_manager_nodes=opensearch-node1,opensearch-node2 - bootstrap.memory_lock=true - "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m" + - OPENSEARCH_INITIAL_ADMIN_PASSWORD=${OPENSEARCH_INITIAL_ADMIN_PASSWORD} ulimits: memlock: soft: -1 diff --git a/assets/examples/saml-example-custom.zip b/assets/examples/saml-example-custom.zip deleted file mode 100644 index acb733ffd5..0000000000 Binary files a/assets/examples/saml-example-custom.zip and /dev/null differ diff --git a/assets/js/home-listener.js b/assets/js/home-listener.js new file mode 100644 index 0000000000..160414f132 --- /dev/null +++ b/assets/js/home-listener.js @@ -0,0 +1,5 @@ +const contributeButton = document.getElementById('contribute'); + +contributeButton.addEventListener('click', function(event) { + window.open('https://github.com/opensearch-project', '_blank'); +}); \ No newline at end of file diff --git a/assets/js/listener.js b/assets/js/listener.js index 2d7221c73d..029e042419 100644 --- a/assets/js/listener.js +++ b/assets/js/listener.js @@ -28,9 +28,6 @@ document.addEventListener('click', function(event) { else if (target.matches('.copy-button')) { window.navigator.clipboard.writeText(target.getAttribute('data-text')); } - else if (target.matches('.btn-contribute')) { - window.open('https://github.com/opensearch-project', '_blank'); - } }); nav.addEventListener('scroll',(e)=>{ diff --git a/images/Security/c-log-type.png b/images/Security/c-log-type.png index 2ec9967d1c..5008c6655a 100644 Binary files a/images/Security/c-log-type.png and b/images/Security/c-log-type.png differ diff --git a/images/alerting/cross-cluster-cluster-metrics-monitors.png b/images/alerting/cross-cluster-cluster-metrics-monitors.png new file mode 100644 index 0000000000..16d63f5980 Binary files /dev/null and b/images/alerting/cross-cluster-cluster-metrics-monitors.png differ diff --git a/images/alerting/cross-cluster-per-query-per-bucket-monitors.png b/images/alerting/cross-cluster-per-query-per-bucket-monitors.png new file mode 100644 index 0000000000..325535885c Binary files /dev/null and b/images/alerting/cross-cluster-per-query-per-bucket-monitors.png differ diff --git a/images/automatic-workflow-dag.png b/images/automatic-workflow-dag.png new file mode 100644 index 0000000000..361bd7bd93 Binary files /dev/null and b/images/automatic-workflow-dag.png differ diff --git a/images/ccs-devtools.png b/images/ccs-devtools.png new file mode 100644 index 0000000000..8939518786 Binary files /dev/null and b/images/ccs-devtools.png differ diff --git a/images/dashboards-assistant/conversation-history-entry.png b/images/dashboards-assistant/conversation-history-entry.png new file mode 100644 index 0000000000..2ac35ce00b Binary files /dev/null and 
b/images/dashboards-assistant/conversation-history-entry.png differ diff --git a/images/dashboards-assistant/conversation-history-list.png b/images/dashboards-assistant/conversation-history-list.png new file mode 100644 index 0000000000..e120897535 Binary files /dev/null and b/images/dashboards-assistant/conversation-history-list.png differ diff --git a/images/dashboards-assistant/conversation-in-notebook.png b/images/dashboards-assistant/conversation-in-notebook.png new file mode 100644 index 0000000000..2ee9a70106 Binary files /dev/null and b/images/dashboards-assistant/conversation-in-notebook.png differ diff --git a/images/dashboards-assistant/delete-conversation.png b/images/dashboards-assistant/delete-conversation.png new file mode 100644 index 0000000000..b5ac7cfd1a Binary files /dev/null and b/images/dashboards-assistant/delete-conversation.png differ diff --git a/images/dashboards-assistant/edit-conversation-title.png b/images/dashboards-assistant/edit-conversation-title.png new file mode 100644 index 0000000000..0f35babc60 Binary files /dev/null and b/images/dashboards-assistant/edit-conversation-title.png differ diff --git a/images/dashboards-assistant/entry.png b/images/dashboards-assistant/entry.png new file mode 100644 index 0000000000..b645389f56 Binary files /dev/null and b/images/dashboards-assistant/entry.png differ diff --git a/images/dashboards-assistant/rate.png b/images/dashboards-assistant/rate.png new file mode 100644 index 0000000000..efc741c78a Binary files /dev/null and b/images/dashboards-assistant/rate.png differ diff --git a/images/dashboards-assistant/response.png b/images/dashboards-assistant/response.png new file mode 100644 index 0000000000..3bb920e617 Binary files /dev/null and b/images/dashboards-assistant/response.png differ diff --git a/images/dashboards-assistant/save-conversation-to-notebook.png b/images/dashboards-assistant/save-conversation-to-notebook.png new file mode 100644 index 0000000000..a02f007044 Binary files /dev/null and b/images/dashboards-assistant/save-conversation-to-notebook.png differ diff --git a/images/dashboards-assistant/start-conversation.png b/images/dashboards-assistant/start-conversation.png new file mode 100644 index 0000000000..f61ac66333 Binary files /dev/null and b/images/dashboards-assistant/start-conversation.png differ diff --git a/images/dashboards-assistant/suggestions.png b/images/dashboards-assistant/suggestions.png new file mode 100644 index 0000000000..a7cb0ea621 Binary files /dev/null and b/images/dashboards-assistant/suggestions.png differ diff --git a/images/dashboards-assistant/traces.png b/images/dashboards-assistant/traces.png new file mode 100644 index 0000000000..ee5ad8c6e0 Binary files /dev/null and b/images/dashboards-assistant/traces.png differ diff --git a/images/dashboards/add-panel2.png b/images/dashboards/add-panel2.png new file mode 100644 index 0000000000..f76d00d4a1 Binary files /dev/null and b/images/dashboards/add-panel2.png differ diff --git a/images/dashboards/add-sample-data-2.png b/images/dashboards/add-sample-data-2.png new file mode 100644 index 0000000000..2f1b5ec901 Binary files /dev/null and b/images/dashboards/add-sample-data-2.png differ diff --git a/images/dashboards/dark-mode.png b/images/dashboards/dark-mode.png new file mode 100644 index 0000000000..f25716d134 Binary files /dev/null and b/images/dashboards/dark-mode.png differ diff --git a/images/dashboards/dashboards-home.png b/images/dashboards/dashboards-home.png new file mode 100644 index 0000000000..b8223ec7d0 Binary 
files /dev/null and b/images/dashboards/dashboards-home.png differ diff --git a/images/dashboards/dashboards-overview-page.png b/images/dashboards/dashboards-overview-page.png new file mode 100644 index 0000000000..ba5382f891 Binary files /dev/null and b/images/dashboards/dashboards-overview-page.png differ diff --git a/images/dashboards/discover-basics.png b/images/dashboards/discover-basics.png new file mode 100644 index 0000000000..0f3418882d Binary files /dev/null and b/images/dashboards/discover-basics.png differ diff --git a/images/dashboards/discover-filter.png b/images/dashboards/discover-filter.png new file mode 100644 index 0000000000..632e0b6f9e Binary files /dev/null and b/images/dashboards/discover-filter.png differ diff --git a/images/dashboards/discover-view.png b/images/dashboards/discover-view.png new file mode 100644 index 0000000000..63699494f9 Binary files /dev/null and b/images/dashboards/discover-view.png differ diff --git a/images/dashboards/discover.png b/images/dashboards/discover.png new file mode 100644 index 0000000000..a00e49cfa2 Binary files /dev/null and b/images/dashboards/discover.png differ diff --git a/images/dashboards/import_saved_objects_with_file_upload.gif b/images/dashboards/import_saved_objects_with_file_upload.gif new file mode 100644 index 0000000000..f35f7cf15b Binary files /dev/null and b/images/dashboards/import_saved_objects_with_file_upload.gif differ diff --git a/images/dashboards/multidata-hide-localcluster.gif b/images/dashboards/multidata-hide-localcluster.gif new file mode 100644 index 0000000000..b778063943 Binary files /dev/null and b/images/dashboards/multidata-hide-localcluster.gif differ diff --git a/images/dashboards/multidata-hide-show-auth.gif b/images/dashboards/multidata-hide-show-auth.gif new file mode 100644 index 0000000000..9f1f945c44 Binary files /dev/null and b/images/dashboards/multidata-hide-show-auth.gif differ diff --git a/images/dashboards/opensearch-assistant-QandA.png b/images/dashboards/opensearch-assistant-QandA.png new file mode 100644 index 0000000000..e35e097a25 Binary files /dev/null and b/images/dashboards/opensearch-assistant-QandA.png differ diff --git a/images/dashboards/opensearch-assistant-conversation-history.png b/images/dashboards/opensearch-assistant-conversation-history.png new file mode 100644 index 0000000000..8b4eb96b2b Binary files /dev/null and b/images/dashboards/opensearch-assistant-conversation-history.png differ diff --git a/images/dashboards/opensearch-assistant-delete-convo.png b/images/dashboards/opensearch-assistant-delete-convo.png new file mode 100644 index 0000000000..c0d908334f Binary files /dev/null and b/images/dashboards/opensearch-assistant-delete-convo.png differ diff --git a/images/dashboards/opensearch-assistant-edit-convo.png b/images/dashboards/opensearch-assistant-edit-convo.png new file mode 100644 index 0000000000..bd57487bc4 Binary files /dev/null and b/images/dashboards/opensearch-assistant-edit-convo.png differ diff --git a/images/dashboards/opensearch-assistant-full-frame.png b/images/dashboards/opensearch-assistant-full-frame.png new file mode 100644 index 0000000000..b98a233192 Binary files /dev/null and b/images/dashboards/opensearch-assistant-full-frame.png differ diff --git a/images/dashboards/opensearch-assistant-how-generated.png b/images/dashboards/opensearch-assistant-how-generated.png new file mode 100644 index 0000000000..33a9b673b6 Binary files /dev/null and b/images/dashboards/opensearch-assistant-how-generated.png differ diff --git 
a/images/dashboards/opensearch-assistant-notebooks.png b/images/dashboards/opensearch-assistant-notebooks.png new file mode 100644 index 0000000000..74283c13c2 Binary files /dev/null and b/images/dashboards/opensearch-assistant-notebooks.png differ diff --git a/images/dashboards/opensearch-assistant-regenerate.png b/images/dashboards/opensearch-assistant-regenerate.png new file mode 100644 index 0000000000..d547ac1852 Binary files /dev/null and b/images/dashboards/opensearch-assistant-regenerate.png differ diff --git a/images/dashboards/opensearch-assistant-save-notebook.png b/images/dashboards/opensearch-assistant-save-notebook.png new file mode 100644 index 0000000000..ed0cbff293 Binary files /dev/null and b/images/dashboards/opensearch-assistant-save-notebook.png differ diff --git a/images/dashboards/opensearch-assistant-suggestions.png b/images/dashboards/opensearch-assistant-suggestions.png new file mode 100644 index 0000000000..3a819242eb Binary files /dev/null and b/images/dashboards/opensearch-assistant-suggestions.png differ diff --git a/images/dashboards/opensearch-dashboards-discover.png b/images/dashboards/opensearch-dashboards-discover.png new file mode 100644 index 0000000000..b84d8e7b0d Binary files /dev/null and b/images/dashboards/opensearch-dashboards-discover.png differ diff --git a/images/dashboards/opensearch-dashboards-home.png b/images/dashboards/opensearch-dashboards-home.png new file mode 100644 index 0000000000..a4e8dbeea2 Binary files /dev/null and b/images/dashboards/opensearch-dashboards-home.png differ diff --git a/images/dashboards/vega-2.png b/images/dashboards/vega-2.png new file mode 100644 index 0000000000..1faa3a6e67 Binary files /dev/null and b/images/dashboards/vega-2.png differ diff --git a/images/icons/frame-icon.png b/images/icons/frame-icon.png new file mode 100644 index 0000000000..994c67d39e Binary files /dev/null and b/images/icons/frame-icon.png differ diff --git a/images/icons/menu-icon.png b/images/icons/menu-icon.png new file mode 100644 index 0000000000..d44367f298 Binary files /dev/null and b/images/icons/menu-icon.png differ diff --git a/images/intro/cluster-replicas.png b/images/intro/cluster-replicas.png new file mode 100644 index 0000000000..3462406b98 Binary files /dev/null and b/images/intro/cluster-replicas.png differ diff --git a/images/intro/cluster.png b/images/intro/cluster.png new file mode 100644 index 0000000000..300cf41ecc Binary files /dev/null and b/images/intro/cluster.png differ diff --git a/images/intro/index-shard.png b/images/intro/index-shard.png new file mode 100644 index 0000000000..f2663d2e95 Binary files /dev/null and b/images/intro/index-shard.png differ diff --git a/images/log-explorer-query-assist.png b/images/log-explorer-query-assist.png new file mode 100644 index 0000000000..1348ca0ef0 Binary files /dev/null and b/images/log-explorer-query-assist.png differ diff --git a/images/perftop.jpg b/images/perftop.jpg new file mode 100644 index 0000000000..6ff6013280 Binary files /dev/null and b/images/perftop.jpg differ diff --git a/release-notes/opensearch-documentation-release-notes-2.12.0.md b/release-notes/opensearch-documentation-release-notes-2.12.0.md new file mode 100644 index 0000000000..24decdcced --- /dev/null +++ b/release-notes/opensearch-documentation-release-notes-2.12.0.md @@ -0,0 +1,42 @@ +# OpenSearch Documentation Website 2.12.0 Release Notes + +The OpenSearch 2.12.0 documentation includes the following additions and updates. 
+
+## New documentation for 2.12.0
+
+- Disable concurrent search for composite aggregations [#6444](https://github.com/opensearch-project/documentation-website/pull/6444)
+- Add warning related to anomaly detection and concurrent segment search [#6411](https://github.com/opensearch-project/documentation-website/pull/6411)
+- Add metrics framework documentation [#6393](https://github.com/opensearch-project/documentation-website/pull/6393)
+- Add documentation for the new reranking feature in 2.12 [#6368](https://github.com/opensearch-project/documentation-website/pull/6368)
+- Add documentation for saved object management to support multiple data sources [#6364](https://github.com/opensearch-project/documentation-website/pull/6364)
+- Add key considerations for concurrent segment search [#6362](https://github.com/opensearch-project/documentation-website/pull/6362)
+- Add cross-encoder model documentation [#6357](https://github.com/opensearch-project/documentation-website/pull/6357)
+- Concurrent segment search GA and API changes [#6356](https://github.com/opensearch-project/documentation-website/pull/6356)
+- Improve terms aggregation documentation and add concurrent segment search caveats [#6355](https://github.com/opensearch-project/documentation-website/pull/6355)
+- Update documentation for the new datetime format [#6337](https://github.com/opensearch-project/documentation-website/pull/6337)
+- Add documentation for cluster-level coordination setting changes [#6336](https://github.com/opensearch-project/documentation-website/pull/6336)
+- Update remote cluster documentation for global metadata and new dynamic settings [#6309](https://github.com/opensearch-project/documentation-website/pull/6309)
+- Add include_named_queries_score documentation [#6306](https://github.com/opensearch-project/documentation-website/pull/6306)
+- Add remove_by_pattern ingest processor documentation [#6295](https://github.com/opensearch-project/documentation-website/pull/6295)
+- Add copy ingest processor documentation [#6294](https://github.com/opensearch-project/documentation-website/pull/6294)
+- Add alias support to Security Analytics [#6284](https://github.com/opensearch-project/documentation-website/pull/6284)
+- Update SAML tutorial with new default admin settings [#6279](https://github.com/opensearch-project/documentation-website/pull/6279)
+- Add threat intelligence documentation for 2.12 [#6273](https://github.com/opensearch-project/documentation-website/pull/6273)
+- Add documentation for changing the default values of HNSW hyperparameters in the k-NN plugin [#6272](https://github.com/opensearch-project/documentation-website/pull/6272)
+- Document the new experimental HTTP transport based on Netty 4 and Project Reactor [#6265](https://github.com/opensearch-project/documentation-website/pull/6265)
+- Add Query Insights documentation [#6261](https://github.com/opensearch-project/documentation-website/pull/6261)
+- Add Flow Framework documentation [#6257](https://github.com/opensearch-project/documentation-website/pull/6257)
+- Add documentation for admission control stats [#6253](https://github.com/opensearch-project/documentation-website/pull/6253)
+- Update bundled JDK version (January 2024 patch releases) [#6250](https://github.com/opensearch-project/documentation-website/pull/6250)
+- Document the must_exist parameter for the Create or Update Alias API [#6245](https://github.com/opensearch-project/documentation-website/pull/6245)
+
+## In-progress documentation for 2.12.0
+
+- Add documentation for new bloom filter settings [#6449](https://github.com/opensearch-project/documentation-website/pull/6449)
+
+## Documentation for 2.12.0 experimental features
+
+- Add query assist documentation to Observability [#6419](https://github.com/opensearch-project/documentation-website/pull/6419)
+- Add documentation for the dashboards assistant plugin [#6370](https://github.com/opensearch-project/documentation-website/pull/6370)
+- Add agent framework, throttling, hidden model, and OpenSearch Assistant documentation and update conversational search documentation [#6354](https://github.com/opensearch-project/documentation-website/pull/6354)
+- Alerting plugin: add experimental cross-cluster monitor support documentation [#6350](https://github.com/opensearch-project/documentation-website/pull/6350)
diff --git a/release-notes/opensearch-documentation-release-notes-2.13.0.md b/release-notes/opensearch-documentation-release-notes-2.13.0.md
new file mode 100644
index 0000000000..38f7678cde
--- /dev/null
+++ b/release-notes/opensearch-documentation-release-notes-2.13.0.md
@@ -0,0 +1,35 @@
+# OpenSearch Documentation Website 2.13.0 Release Notes
+
+The OpenSearch 2.13.0 documentation includes the following additions and updates.
+
+## New documentation for 2.13.0
+
+- Add example to text chunking processor documentation [#6794](https://github.com/opensearch-project/documentation-website/pull/6794)
+- Add documentation for default use cases [#6767](https://github.com/opensearch-project/documentation-website/pull/6767)
+- Add documentation for IO-based admission controller stats [#6755](https://github.com/opensearch-project/documentation-website/pull/6755)
+- Add the supported metric types [#6754](https://github.com/opensearch-project/documentation-website/pull/6754)
+- Add guardrails for remote models [#6750](https://github.com/opensearch-project/documentation-website/pull/6750)
+- Add QA model and new settings in ML Commons [#6749](https://github.com/opensearch-project/documentation-website/pull/6749)
+- Update documentation for automatic remote model deployment [#6748](https://github.com/opensearch-project/documentation-website/pull/6748)
+- Add client_config parameter documentation [#6746](https://github.com/opensearch-project/documentation-website/pull/6746)
+- Remove experimental feature labels and flags for OpenSearch Assistant [#6745](https://github.com/opensearch-project/documentation-website/pull/6745)
+- Remove experimental feature warning from Flow Framework plugin documentation [#6741](https://github.com/opensearch-project/documentation-website/pull/6741)
+- Add documentation for new workflow steps [#6740](https://github.com/opensearch-project/documentation-website/pull/6740)
+- Add documentation for the optional parameter for the get workflow step API [#6736](https://github.com/opensearch-project/documentation-website/pull/6736)
+- Update plugins.md with semver range support specification [#6733](https://github.com/opensearch-project/documentation-website/pull/6733)
+- Remove feature flag requirement for fuzzy filter settings [#6731](https://github.com/opensearch-project/documentation-website/pull/6731)
+- Update documentation for decoupling remote cluster state from remote-backed data storage [#6730](https://github.com/opensearch-project/documentation-website/pull/6730)
+- Add static setting for the checkPendingFlushUpdate functionality of the Lucene index writer [#6728](https://github.com/opensearch-project/documentation-website/pull/6728)
+- Add documentation for remote reindex retry settings [#6726](https://github.com/opensearch-project/documentation-website/pull/6726)
+- Add default model ID for neural sparse search queries in neural_query_enricher [#6725](https://github.com/opensearch-project/documentation-website/pull/6725)
+- Document post_filter support in hybrid search [#6724](https://github.com/opensearch-project/documentation-website/pull/6724)
+- Update deb/rpm documentation for service autorestart after upgrade [#6720](https://github.com/opensearch-project/documentation-website/pull/6720)
+- Add documentation page for Vega visualizations [#6711](https://github.com/opensearch-project/documentation-website/pull/6711)
+- Add documentation for the text chunking processor [#6707](https://github.com/opensearch-project/documentation-website/pull/6707)
+- Update documentation for InnerProduct support with the k-NN Lucene engine [#6703](https://github.com/opensearch-project/documentation-website/pull/6703)
+- Add documentation for the kuromoji_completion filter [#6699](https://github.com/opensearch-project/documentation-website/pull/6699)
+- Add note about not passing a zero vector for cosine similarity in k-NN [#6698](https://github.com/opensearch-project/documentation-website/pull/6698)
+- Update the multiple data source documentation [#6689](https://github.com/opensearch-project/documentation-website/pull/6689)
+- Add documentation on how to configure the XContent codepoint limit (YAML) [#6666](https://github.com/opensearch-project/documentation-website/pull/6666)
+- Document the primary_only parameter for the force-merge API [#6664](https://github.com/opensearch-project/documentation-website/pull/6664)
+- Add aggregations to the hybrid search documentation [#6661](https://github.com/opensearch-project/documentation-website/pull/6661)