1
0
mirror of synced 2025-12-23 11:54:18 -05:00

dry-run with scraping and elasticsearch (#31201)

This commit is contained in:
Peter Bengtsson
2022-09-27 22:59:33 +02:00
committed by GitHub
parent ff533e4ea3
commit db52a7e8bd
4 changed files with 62 additions and 16 deletions

View File

@@ -46,9 +46,57 @@ jobs:
node-version: 16.15.x
cache: npm
- name: Install
- name: Install dependencies
run: npm ci
- name: Cache nextjs build
uses: actions/cache@48af2dc4a9e8278b89d7fa154b955c30c6aaab09
with:
path: .next/cache
key: ${{ runner.os }}-nextjs-${{ hashFiles('package*.json') }}
- name: Run build scripts
run: npm run build
- name: Start the server in the background
env:
ENABLE_DEV_LOGGING: false
run: |
npm run sync-search-server > /tmp/stdout.log 2> /tmp/stderr.log &
# first sleep to give it a chance to start
sleep 6
curl --retry-connrefused --retry 4 -I http://localhost:4002/
- if: ${{ failure() }}
name: Debug server outputs on errors
run: |
echo "____STDOUT____"
cat /tmp/stdout.log
echo "____STDERR____"
cat /tmp/stderr.log
- name: Scrape records into a temp directory
env:
# If a reusable, or anything else in the `data/*` directory, is deleted
# you might get a
#
# RenderError: Can't find the key 'site.data.reusables...' in the scope
#
# But that'll get fixed in the next translation pipeline. For now,
# let's just accept an empty string instead.
THROW_ON_EMPTY: false
run: |
mkdir /tmp/records
npm run sync-search-indices -- \
--language en \
--version dotcom \
--out-directory /tmp/records \
--no-compression --no-lunr-index
ls -lh /tmp/records
# Serves two purposes:
# 1. Be confident that the Elasticsearch server start-up worked at all
# 2. Sometimes Elasticsearch will bind to the port but still not
@@ -62,8 +110,8 @@ jobs:
ELASTICSEARCH_URL: 'http://localhost:9200'
run: |
./script/search/index-elasticsearch.js --verbose \
-l en -l ja \
-V dotcom -V ghes-3.5
-l en \
-V dotcom -- /tmp/records
- name: Show created indexes and aliases
run: |

View File

@@ -104,8 +104,7 @@ jobs:
- name: Index into Elasticsearch
run: |
./script/search/index-elasticsearch.js \
--language ${{ matrix.language }} \
--source-directory /tmp/records
--language ${{ matrix.language }} -- /tmp/records
- name: Check created indexes and aliases
run: |

View File

@@ -182,7 +182,7 @@
"build": "next build",
"debug": "cross-env NODE_ENV=development ENABLED_LANGUAGES='en,ja' nodemon --inspect server.js",
"dev": "cross-env npm start",
"index-test-fixtures": "node script/search/index-elasticsearch.js -s tests/content/fixtures/search-indexes -l en -V ghae -V dotcom --index-prefix tests",
"index-test-fixtures": "node script/search/index-elasticsearch.js -l en -V ghae -V dotcom --index-prefix tests -- tests/content/fixtures/search-indexes",
"lint": "eslint '**/*.{js,mjs,ts,tsx}'",
"lint-translation": "cross-env NODE_OPTIONS=--experimental-vm-modules TEST_TRANSLATION=true jest tests/linting/lint-files.js",
"prepare": "husky install",

View File

@@ -49,12 +49,10 @@ const shortNames = Object.fromEntries(
const allVersionKeys = Object.keys(shortNames)
const DEFAULT_SOURCE_DIRECTORY = path.join('lib', 'search', 'indexes')
program
.description('Creates Elasticsearch index from records')
.option('-v, --verbose', 'Verbose outputs')
.addOption(new Option('-V, --version <VERSION...>', 'Specific versions').choices(allVersionKeys))
.addOption(new Option('-V, --version [VERSION...]', 'Specific versions').choices(allVersionKeys))
.addOption(
new Option('-l, --language <LANGUAGE...>', 'Which languages to focus on').choices(languageKeys)
)
@@ -62,16 +60,17 @@ program
new Option('--not-language <LANGUAGE...>', 'Specific language to omit').choices(languageKeys)
)
.option('-u, --elasticsearch-url <url>', 'If different from $ELASTICSEARCH_URL')
.option(
'-s, --source-directory <DIRECTORY>',
`Directory where records files are (default ${DEFAULT_SOURCE_DIRECTORY})`
)
.option('-p, --index-prefix <prefix>', 'Index string to put before index name')
.argument('<source-directory>', 'where the indexable files are')
.parse(process.argv)
main(program.opts())
main(program.opts(), program.args)
async function main(opts, args) {
if (!args.length) {
throw new Error('Must pass the source as the first argument')
}
async function main(opts) {
if (!opts.elasticsearchUrl && !process.env.ELASTICSEARCH_URL) {
throw new Error(
'Must passed the elasticsearch URL option or ' +
@@ -103,7 +102,7 @@ async function main(opts) {
if (verbose) {
console.log(`Connecting to ${chalk.bold(safeUrlDisplay(node))}`)
}
const sourceDirectory = opts.sourceDirectory || DEFAULT_SOURCE_DIRECTORY
const sourceDirectory = args[0]
try {
await fs.stat(sourceDirectory)
} catch (error) {