name: Sync search Elasticsearch

# **What it does**: It scrapes the whole site and dumps the records in a
#                   temp directory. Then it indexes that into Elasticsearch.
# **Why we have it**: We want our search indexes kept up to date.
# **Who does it impact**: Anyone using search on docs.

on:
  workflow_dispatch:
    inputs:
      version:
        description: "Version to exclusively generate the search index for. E.g. 'dotcom', 'ghes-3.7', 'ghae'"
        required: false
        default: ''
      languages:
        description: "Comma separated languages. E.g. 'en,ja, es' (defaults to all)"
        required: false
        default: ''
  schedule:
    # Run once every 24 hours, at 00:20. (The previous hour field `*/24`
    # only ever matches hour 0, so this is the same schedule spelled plainly.)
    - cron: '20 0 * * *'
  workflow_run:
    workflows: ['Azure Production - Build and Deploy']
    types:
      - completed

permissions:
  contents: read

# This allows a subsequently queued workflow run to cancel previous runs
concurrency:
  group: '${{ github.workflow }} @ ${{ github.head_ref }} ${{ github.event_name }}'
  cancel-in-progress: true

env:
  FREEZE: ${{ secrets.FREEZE }}
  ELASTICSEARCH_URL: ${{ secrets.ELASTICSEARCH_URL }}
  # Since we'll run in NODE_ENV=production, we need to be explicit that
  # we don't want Hydro configured.
  HYDRO_ENDPOINT: ''
  HYDRO_SECRET: ''

jobs:
  # Decides which languages to index, based on what triggered the run:
  #   workflow_run (successful deploy) -> English only
  #   workflow_dispatch                -> the requested languages, or all
  #   schedule                         -> every language except English
  figureOutMatrix:
    if: ${{ github.repository == 'github/docs-internal' }}
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix.outputs.result }}
    steps:
      - uses: actions/github-script@98814c53be79b1d30f795b907e553d8679345975
        id: set-matrix
        with:
          script: |
            // Edit this list for the definitive list of languages
            // (other than English) we want to index in Elasticsearch.
            const allNonEnglish = ["zh", "es", "pt", "ru", "ja", "fr", "de", "ko"]
            const allPossible = ["en", ...allNonEnglish]

            if (context.eventName === "workflow_run") {
              if (context.payload.workflow_run.conclusion === "success") {
                return ["en"]
              }
              throw new Error(`It was a workflow_run but not success ('${context.payload.workflow_run.conclusion}')`)
            }
            if (context.eventName === "workflow_dispatch") {
              if (context.payload.inputs.languages) {
                const clean = context.payload.inputs.languages.split(',').map(x => x.trim()).filter(Boolean)
                const notRecognized = clean.find(x => !allPossible.includes(x))
                if (notRecognized) {
                  throw new Error(`'${notRecognized}' is not a recognized language code`)
                }
                return clean
              }
              return allPossible
            }
            if (context.eventName === "schedule") {
              return allNonEnglish
            }
            console.log(context)
            throw new Error(`Unable figure out what languages to run (${context.eventName})`)

      - name: Debug output
        env:
          # Passed through the environment rather than spliced into the
          # script so that the double quotes inside the JSON array survive
          # and the value is printed exactly as produced.
          MATRIX: ${{ steps.set-matrix.outputs.result }}
        run: echo "$MATRIX"

  # Runs once per language from the matrix computed above: scrape the
  # locally running site into /tmp/records, then index those records
  # into Elasticsearch.
  updateElasticsearchIndexes:
    needs: figureOutMatrix
    name: Update indexes
    if: ${{ github.repository == 'github/docs-internal' }}
    runs-on: ubuntu-20.04-xl
    strategy:
      fail-fast: false
      max-parallel: 3
      matrix:
        language: ${{ fromJSON(needs.figureOutMatrix.outputs.matrix) }}
    steps:
      - if: ${{ env.FREEZE == 'true' }}
        run: |
          echo 'The repo is currently frozen! Exiting this workflow.'
          exit 1 # prevents further steps from running

      - name: Check out repo
        uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8

      - name: Clone docs-internal.popular-pages
        uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8
        with:
          repository: github/docs-internal.popular-pages
          # This works because user `docubot` has read access to that private repo.
          token: ${{ secrets.DOCUBOT_REPO_PAT }}
          path: popular-pages

      - name: Clone all translations
        if: ${{ matrix.language != 'en' }}
        uses: ./.github/actions/clone-translations
        with:
          token: ${{ secrets.DOCUBOT_REPO_PAT }}

      - uses: ./.github/actions/node-npm-setup

      - name: Cache nextjs build
        uses: actions/cache@9b0c1fce7a93df8e3bb8926b0d6e9d89e92f20a7
        with:
          path: .next/cache
          key: ${{ runner.os }}-nextjs-${{ hashFiles('package*.json') }}

      - name: Run build scripts
        run: npm run build

      - name: Start the server in the background
        env:
          ENABLE_DEV_LOGGING: 'false'
        run: |
          npm run sync-search-server > /tmp/stdout.log 2> /tmp/stderr.log &

          # first sleep to give it a chance to start
          sleep 6
          curl --retry-connrefused --retry 4 -I http://localhost:4002/

      - if: ${{ failure() }}
        name: Debug server outputs on errors
        run: |
          echo "____STDOUT____"
          cat /tmp/stdout.log
          echo "____STDERR____"
          cat /tmp/stderr.log

      - name: Scrape records into a temp directory
        env:
          # If a reusable, or anything in the `data/*` directory is deleted
          # you might get a
          #
          #   RenderError: Can't find the key 'site.data.reusables...' in the scope
          #
          # But that'll get fixed in the next translation pipeline. For now,
          # let's just accept an empty string instead.
          THROW_ON_EMPTY: 'false'

          # Note that by default, this is '' (empty string) and that means
          # the same as not set within the script.
          VERSION: ${{ github.event.inputs.version }}

          # The sync-search-index recognizes this env var if you don't
          # use the `--popular-pages` option.
          POPULAR_PAGES_JSON: popular-pages/records/popular-pages.json
        run: |
          mkdir /tmp/records
          npm run sync-search-indices -- /tmp/records \
            --language ${{ matrix.language }}

          ls -lh /tmp/records

      - name: Check that Elasticsearch is accessible
        # ELASTICSEARCH_URL comes from the workflow-level `env` block; it is
        # read as a quoted shell variable so the URL reaches curl as a
        # single, unmodified argument.
        run: |
          curl --fail --retry-connrefused --retry 5 -I "$ELASTICSEARCH_URL"

      - name: Index into Elasticsearch
        env:
          # Must match what we used when scraping (npm run sync-search-indices)
          # otherwise the script will seek other versions from disk that might
          # not exist.
          VERSION: ${{ github.event.inputs.version }}
        run: |
          ./src/search/scripts/index-elasticsearch.js /tmp/records \
            --language ${{ matrix.language }}

      - name: Check created indexes and aliases
        run: |
          # Not using `--fail` here because I've observed that it can fail
          # with a rather cryptic 404 error when it should, if anything, be
          # a 200 OK with a list of no indices.
          # The URLs are quoted so the shell never treats `?` as a glob.
          curl --retry-connrefused --retry 5 "$ELASTICSEARCH_URL/_cat/indices?v"
          curl --retry-connrefused --retry 5 "$ELASTICSEARCH_URL/_cat/aliases?v"

      - name: Purge Fastly edge cache
        env:
          FASTLY_TOKEN: ${{ secrets.FASTLY_TOKEN }}
          FASTLY_SERVICE_ID: ${{ secrets.FASTLY_SERVICE_ID }}
          FASTLY_SURROGATE_KEY: api-search:${{ matrix.language }}
        run: .github/actions-scripts/purge-fastly-edge-cache.js

      - name: Send Slack notification if workflow fails
        uses: someimportantcompany/github-actions-slack-message@1d367080235edfa53df415bd8e0bbab480f29bad
        if: failure()
        with:
          channel: ${{ secrets.DOCS_ALERTS_SLACK_CHANNEL_ID }}
          bot-token: ${{ secrets.SLACK_DOCS_BOT_TOKEN }}
          color: failure
          text: The last 'Sync search Elasticsearch' run failed. See https://github.com/${{github.repository}}/actions/workflows/sync-search-elasticsearch.yml?query=workflow%3A%22sync+search%22