Sync search Elasticsearch #569
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Sync search Elasticsearch | |
# **What it does**: It scrapes the whole site and dumps the records in a | |
# temp directory. Then it indexes that into Elasticsearch. | |
# **Why we have it**: We want our search indexes kept up to date. | |
# **Who does it impact**: Anyone using search on docs. | |
on: | |
workflow_dispatch: | |
inputs: | |
version: | |
description: "Version to exclusively generate the search index for. E.g. 'dotcom', 'ghes-3.7', 'ghae'" | |
required: false | |
default: '' | |
languages: | |
description: "Comma separated languages. E.g. 'en,ja, es' (defaults to all)" | |
required: false | |
default: '' | |
schedule: | |
- cron: '20 */24 * * *' # Run every 24 hours at 20 minutes past the hour | |
workflow_run: | |
workflows: ['Azure Production - Build and Deploy'] | |
types: | |
- completed | |
permissions: | |
contents: read | |
# This allows a subsequently queued workflow run to cancel previous runs | |
concurrency: | |
group: '${{ github.workflow }} @ ${{ github.head_ref }} ${{ github.event_name }}' | |
cancel-in-progress: true | |
env: | |
FREEZE: ${{ secrets.FREEZE }} | |
ELASTICSEARCH_URL: ${{ secrets.ELASTICSEARCH_URL }} | |
jobs: | |
figureOutMatrix: | |
if: ${{ github.repository == 'github/docs-internal' }} | |
runs-on: ubuntu-latest | |
outputs: | |
matrix: ${{ steps.set-matrix.outputs.result }} | |
steps: | |
- uses: actions/github-script@98814c53be79b1d30f795b907e553d8679345975 | |
id: set-matrix | |
with: | |
script: | | |
// Edit this list for the definitive list of languages | |
// (other than English) we want to index in Elasticsearch. | |
const allNonEnglish = ["ja", "es", "pt", "zh", "ru", "fr", "ko", "de"] | |
const allPossible = ["en", ...allNonEnglish] | |
if (context.eventName === "workflow_run") { | |
if (context.payload.workflow_run.conclusion === "success") { | |
return ["en"] | |
} | |
throw new Error(`It was a workflow_run but not success ('${context.payload.workflow_run.conclusion}')`) | |
} | |
if (context.eventName === "workflow_dispatch") { | |
if (context.payload.inputs.languages) { | |
const clean = context.payload.inputs.languages.split(',').map(x => x.trim()).filter(Boolean) | |
const notRecognized = clean.find(x => !allPossible.includes(x)) | |
if (notRecognized) { | |
throw new Error(`'${notRecognized}' is not a recognized language code`) | |
} | |
return clean | |
} | |
return allPossible | |
} | |
if (context.eventName === "schedule") { | |
return allNonEnglish | |
} | |
console.log(context) | |
throw new Error(`Unable figure out what languages to run (${context.eventName})`) | |
- name: Debug output | |
run: echo "${{ steps.set-matrix.outputs.result }}" | |
updateElasticsearchIndexes: | |
needs: figureOutMatrix | |
name: Update indexes | |
if: ${{ github.repository == 'github/docs-internal' }} | |
runs-on: ubuntu-20.04-xl | |
strategy: | |
fail-fast: false | |
max-parallel: 3 | |
matrix: | |
language: ${{ fromJSON(needs.figureOutMatrix.outputs.matrix) }} | |
steps: | |
- if: ${{ env.FREEZE == 'true' }} | |
run: | | |
echo 'The repo is currently frozen! Exiting this workflow.' | |
exit 1 # prevents further steps from running | |
- name: Check out repo | |
uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8 | |
- name: Clone all translations | |
if: ${{ matrix.language != 'en' }} | |
uses: ./.github/actions/clone-translations | |
with: | |
token: ${{ secrets.DOCUBOT_REPO_PAT }} | |
- uses: ./.github/actions/node-npm-setup | |
- name: Cache nextjs build | |
uses: actions/cache@9b0c1fce7a93df8e3bb8926b0d6e9d89e92f20a7 | |
with: | |
path: .next/cache | |
key: ${{ runner.os }}-nextjs-${{ hashFiles('package*.json') }} | |
- name: Run build scripts | |
run: npm run build | |
- name: Start the server in the background | |
env: | |
ENABLE_DEV_LOGGING: false | |
run: | | |
npm run sync-search-server > /tmp/stdout.log 2> /tmp/stderr.log & | |
# first sleep to give it a chance to start | |
sleep 6 | |
curl --retry-connrefused --retry 4 -I http://localhost:4002/ | |
- if: ${{ failure() }} | |
name: Debug server outputs on errors | |
run: | | |
echo "____STDOUT____" | |
cat /tmp/stdout.log | |
echo "____STDERR____" | |
cat /tmp/stderr.log | |
- name: Scrape records into a temp directory | |
env: | |
# If a reusable, or anything in the `data/*` directory is deleted | |
# you might get a | |
# | |
# RenderError: Can't find the key 'site.data.reusables...' in the scope | |
# | |
# But that'll get fixed in the next translation pipeline. For now, | |
# let's just accept an empty string instead. | |
THROW_ON_EMPTY: false | |
# Note that by default, this is '' (empty string) and that means | |
# the same as not set within the script. | |
VERSION: ${{ github.event.inputs.version }} | |
run: | | |
mkdir /tmp/records | |
npm run sync-search-indices -- /tmp/records \ | |
--language ${{ matrix.language }} | |
ls -lh /tmp/records | |
- name: Check that Elasticsearch is accessible | |
run: | | |
curl --fail --retry-connrefused --retry 5 -I ${{ env.ELASTICSEARCH_URL }} | |
- name: Index into Elasticsearch | |
env: | |
# Must match what we used when scraping (npm run sync-search-indices) | |
# otherwise the script will seek other versions from disk that might | |
# not exist. | |
VERSION: ${{ github.event.inputs.version }} | |
run: | | |
./script/search/index-elasticsearch.js /tmp/records \ | |
--language ${{ matrix.language }} \ | |
- name: Check created indexes and aliases | |
run: | | |
# Not using `--fail` here because I've observed that it can fail | |
# with a rather cryptic 404 error when it should, if anything, be | |
# a 200 OK with a list of no indices. | |
curl --retry-connrefused --retry 5 ${{ env.ELASTICSEARCH_URL }}/_cat/indices?v | |
curl --retry-connrefused --retry 5 ${{ env.ELASTICSEARCH_URL }}/_cat/indices?v | |
- name: Purge Fastly edge cache | |
env: | |
FASTLY_TOKEN: ${{ secrets.FASTLY_TOKEN }} | |
FASTLY_SERVICE_ID: ${{ secrets.FASTLY_SERVICE_ID }} | |
FASTLY_SURROGATE_KEY: api-search:${{ matrix.language }} | |
run: .github/actions-scripts/purge-fastly-edge-cache.js | |
- name: Send Slack notification if workflow fails | |
uses: someimportantcompany/github-actions-slack-message@1d367080235edfa53df415bd8e0bbab480f29bad | |
if: failure() | |
with: | |
channel: ${{ secrets.DOCS_ALERTS_SLACK_CHANNEL_ID }} | |
bot-token: ${{ secrets.SLACK_DOCS_BOT_TOKEN }} | |
color: failure | |
text: The last 'Sync search Elasticsearch' run failed. See https://github.com/${{github.repository}}/actions/workflows/sync-search-elasticsearch.yml?query=workflow%3A%22sync+search%22 |