merge in domwaiter (#31197)
This commit is contained in:
@@ -8,7 +8,7 @@ on:
|
|||||||
merge_group:
|
merge_group:
|
||||||
pull_request:
|
pull_request:
|
||||||
paths:
|
paths:
|
||||||
- script/search/index-elasticsearch.js
|
- 'script/search/**'
|
||||||
- 'package*.json'
|
- 'package*.json'
|
||||||
- .github/workflows/dry-run-elasticsearch-indexing.yml
|
- .github/workflows/dry-run-elasticsearch-indexing.yml
|
||||||
|
|
||||||
|
|||||||
152
package-lock.json
generated
152
package-lock.json
generated
@@ -16,6 +16,7 @@
|
|||||||
"accept-language-parser": "^1.5.0",
|
"accept-language-parser": "^1.5.0",
|
||||||
"ajv": "^8.11.0",
|
"ajv": "^8.11.0",
|
||||||
"ajv-formats": "^2.1.1",
|
"ajv-formats": "^2.1.1",
|
||||||
|
"bottleneck": "2.19.5",
|
||||||
"cheerio": "^1.0.0-rc.11",
|
"cheerio": "^1.0.0-rc.11",
|
||||||
"classnames": "^2.3.1",
|
"classnames": "^2.3.1",
|
||||||
"connect-datadog": "0.0.9",
|
"connect-datadog": "0.0.9",
|
||||||
@@ -119,7 +120,6 @@
|
|||||||
"cross-env": "^7.0.3",
|
"cross-env": "^7.0.3",
|
||||||
"csp-parse": "0.0.2",
|
"csp-parse": "0.0.2",
|
||||||
"dedent": "^0.7.0",
|
"dedent": "^0.7.0",
|
||||||
"domwaiter": "^1.4.0",
|
|
||||||
"eslint": "8.24.0",
|
"eslint": "8.24.0",
|
||||||
"eslint-config-prettier": "^8.5.0",
|
"eslint-config-prettier": "^8.5.0",
|
||||||
"eslint-config-standard": "^17.0.0",
|
"eslint-config-standard": "^17.0.0",
|
||||||
@@ -164,7 +164,6 @@
|
|||||||
"node": ">=16.x"
|
"node": ">=16.x"
|
||||||
},
|
},
|
||||||
"optionalDependencies": {
|
"optionalDependencies": {
|
||||||
"bottleneck": "^2.19.5",
|
|
||||||
"esm": "^3.2.25",
|
"esm": "^3.2.25",
|
||||||
"image-size": "^1.0.1",
|
"image-size": "^1.0.1",
|
||||||
"jest-puppeteer": "^5.0.4",
|
"jest-puppeteer": "^5.0.4",
|
||||||
@@ -4108,18 +4107,6 @@
|
|||||||
"tslib": "^2.4.0"
|
"tslib": "^2.4.0"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"node_modules/@szmarczak/http-timer": {
|
|
||||||
"version": "4.0.6",
|
|
||||||
"resolved": "https://registry.npmjs.org/@szmarczak/http-timer/-/http-timer-4.0.6.tgz",
|
|
||||||
"integrity": "sha512-4BAffykYOgO+5nzBWYwE3W90sBgLJoUPRWWcL8wlyiM8IB8ipJz3UMJ9KXQd1RKQXpKp8Tutn80HZtWsu2u76w==",
|
|
||||||
"dev": true,
|
|
||||||
"dependencies": {
|
|
||||||
"defer-to-connect": "^2.0.0"
|
|
||||||
},
|
|
||||||
"engines": {
|
|
||||||
"node": ">=10"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"node_modules/@types/babel__core": {
|
"node_modules/@types/babel__core": {
|
||||||
"version": "7.1.19",
|
"version": "7.1.19",
|
||||||
"resolved": "https://registry.npmjs.org/@types/babel__core/-/babel__core-7.1.19.tgz",
|
"resolved": "https://registry.npmjs.org/@types/babel__core/-/babel__core-7.1.19.tgz",
|
||||||
@@ -6240,8 +6227,8 @@
|
|||||||
},
|
},
|
||||||
"node_modules/bottleneck": {
|
"node_modules/bottleneck": {
|
||||||
"version": "2.19.5",
|
"version": "2.19.5",
|
||||||
"devOptional": true,
|
"resolved": "https://registry.npmjs.org/bottleneck/-/bottleneck-2.19.5.tgz",
|
||||||
"license": "MIT"
|
"integrity": "sha512-VHiNCbI1lKdl44tGrhNfU3lup0Tj/ZBMJB5/2ZbNXRCPuRCO7ed2mgcK4r17y+KB2EfuYuRaVlwNbAeaWGSpbw=="
|
||||||
},
|
},
|
||||||
"node_modules/boxen": {
|
"node_modules/boxen": {
|
||||||
"version": "7.0.0",
|
"version": "7.0.0",
|
||||||
@@ -7573,64 +7560,6 @@
|
|||||||
"url": "https://github.com/fb55/domutils?sponsor=1"
|
"url": "https://github.com/fb55/domutils?sponsor=1"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"node_modules/domwaiter": {
|
|
||||||
"version": "1.4.0",
|
|
||||||
"resolved": "https://registry.npmjs.org/domwaiter/-/domwaiter-1.4.0.tgz",
|
|
||||||
"integrity": "sha512-k7dIRmg5/wMsET8FFZvrlZ2A81WOjc9D5DcVVoZxkwvo2hMPklYXPiS23h3Ez7zqyp25pmEn3Hzjq8agPiRxiw==",
|
|
||||||
"dev": true,
|
|
||||||
"dependencies": {
|
|
||||||
"bottleneck": "^2.19.5",
|
|
||||||
"cheerio": "^1.0.0-rc.3",
|
|
||||||
"got": "^11.8.5"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"node_modules/domwaiter/node_modules/cacheable-lookup": {
|
|
||||||
"version": "5.0.4",
|
|
||||||
"resolved": "https://registry.npmjs.org/cacheable-lookup/-/cacheable-lookup-5.0.4.tgz",
|
|
||||||
"integrity": "sha512-2/kNscPhpcxrOigMZzbiWF7dz8ilhb/nIHU3EyZiXWXpeq/au8qJ8VhdftMkty3n7Gj6HIGalQG8oiBNB3AJgA==",
|
|
||||||
"dev": true,
|
|
||||||
"engines": {
|
|
||||||
"node": ">=10.6.0"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"node_modules/domwaiter/node_modules/got": {
|
|
||||||
"version": "11.8.5",
|
|
||||||
"resolved": "https://registry.npmjs.org/got/-/got-11.8.5.tgz",
|
|
||||||
"integrity": "sha512-o0Je4NvQObAuZPHLFoRSkdG2lTgtcynqymzg2Vupdx6PorhaT5MCbIyXG6d4D94kk8ZG57QeosgdiqfJWhEhlQ==",
|
|
||||||
"dev": true,
|
|
||||||
"dependencies": {
|
|
||||||
"@sindresorhus/is": "^4.0.0",
|
|
||||||
"@szmarczak/http-timer": "^4.0.5",
|
|
||||||
"@types/cacheable-request": "^6.0.1",
|
|
||||||
"@types/responselike": "^1.0.0",
|
|
||||||
"cacheable-lookup": "^5.0.3",
|
|
||||||
"cacheable-request": "^7.0.2",
|
|
||||||
"decompress-response": "^6.0.0",
|
|
||||||
"http2-wrapper": "^1.0.0-beta.5.2",
|
|
||||||
"lowercase-keys": "^2.0.0",
|
|
||||||
"p-cancelable": "^2.0.0",
|
|
||||||
"responselike": "^2.0.0"
|
|
||||||
},
|
|
||||||
"engines": {
|
|
||||||
"node": ">=10.19.0"
|
|
||||||
},
|
|
||||||
"funding": {
|
|
||||||
"url": "https://github.com/sindresorhus/got?sponsor=1"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"node_modules/domwaiter/node_modules/http2-wrapper": {
|
|
||||||
"version": "1.0.3",
|
|
||||||
"resolved": "https://registry.npmjs.org/http2-wrapper/-/http2-wrapper-1.0.3.tgz",
|
|
||||||
"integrity": "sha512-V+23sDMr12Wnz7iTcDeJr3O6AIxlnvT/bmaAAAP/Xda35C90p9599p0F1eHR/N1KILWSoWVAiOMFjBBXaXSMxg==",
|
|
||||||
"dev": true,
|
|
||||||
"dependencies": {
|
|
||||||
"quick-lru": "^5.1.1",
|
|
||||||
"resolve-alpn": "^1.0.0"
|
|
||||||
},
|
|
||||||
"engines": {
|
|
||||||
"node": ">=10.19.0"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"node_modules/dot-case": {
|
"node_modules/dot-case": {
|
||||||
"version": "3.0.4",
|
"version": "3.0.4",
|
||||||
"dev": true,
|
"dev": true,
|
||||||
@@ -15723,15 +15652,6 @@
|
|||||||
"node": ">=0.10.0"
|
"node": ">=0.10.0"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"node_modules/p-cancelable": {
|
|
||||||
"version": "2.1.1",
|
|
||||||
"resolved": "https://registry.npmjs.org/p-cancelable/-/p-cancelable-2.1.1.tgz",
|
|
||||||
"integrity": "sha512-BZOr3nRQHOntUjTrH8+Lh54smKHoHyur8We1V8DSMVrl5A2malOOwuJRnKRDjSnkoeBh4at6BwEnb5I7Jl31wg==",
|
|
||||||
"dev": true,
|
|
||||||
"engines": {
|
|
||||||
"node": ">=8"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"node_modules/p-limit": {
|
"node_modules/p-limit": {
|
||||||
"version": "3.1.0",
|
"version": "3.1.0",
|
||||||
"dev": true,
|
"dev": true,
|
||||||
@@ -23418,15 +23338,6 @@
|
|||||||
"tslib": "^2.4.0"
|
"tslib": "^2.4.0"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"@szmarczak/http-timer": {
|
|
||||||
"version": "4.0.6",
|
|
||||||
"resolved": "https://registry.npmjs.org/@szmarczak/http-timer/-/http-timer-4.0.6.tgz",
|
|
||||||
"integrity": "sha512-4BAffykYOgO+5nzBWYwE3W90sBgLJoUPRWWcL8wlyiM8IB8ipJz3UMJ9KXQd1RKQXpKp8Tutn80HZtWsu2u76w==",
|
|
||||||
"dev": true,
|
|
||||||
"requires": {
|
|
||||||
"defer-to-connect": "^2.0.0"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"@types/babel__core": {
|
"@types/babel__core": {
|
||||||
"version": "7.1.19",
|
"version": "7.1.19",
|
||||||
"resolved": "https://registry.npmjs.org/@types/babel__core/-/babel__core-7.1.19.tgz",
|
"resolved": "https://registry.npmjs.org/@types/babel__core/-/babel__core-7.1.19.tgz",
|
||||||
@@ -25093,7 +25004,8 @@
|
|||||||
},
|
},
|
||||||
"bottleneck": {
|
"bottleneck": {
|
||||||
"version": "2.19.5",
|
"version": "2.19.5",
|
||||||
"devOptional": true
|
"resolved": "https://registry.npmjs.org/bottleneck/-/bottleneck-2.19.5.tgz",
|
||||||
|
"integrity": "sha512-VHiNCbI1lKdl44tGrhNfU3lup0Tj/ZBMJB5/2ZbNXRCPuRCO7ed2mgcK4r17y+KB2EfuYuRaVlwNbAeaWGSpbw=="
|
||||||
},
|
},
|
||||||
"boxen": {
|
"boxen": {
|
||||||
"version": "7.0.0",
|
"version": "7.0.0",
|
||||||
@@ -25950,54 +25862,6 @@
|
|||||||
"domhandler": "^5.0.1"
|
"domhandler": "^5.0.1"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"domwaiter": {
|
|
||||||
"version": "1.4.0",
|
|
||||||
"resolved": "https://registry.npmjs.org/domwaiter/-/domwaiter-1.4.0.tgz",
|
|
||||||
"integrity": "sha512-k7dIRmg5/wMsET8FFZvrlZ2A81WOjc9D5DcVVoZxkwvo2hMPklYXPiS23h3Ez7zqyp25pmEn3Hzjq8agPiRxiw==",
|
|
||||||
"dev": true,
|
|
||||||
"requires": {
|
|
||||||
"bottleneck": "^2.19.5",
|
|
||||||
"cheerio": "^1.0.0-rc.3",
|
|
||||||
"got": "^11.8.5"
|
|
||||||
},
|
|
||||||
"dependencies": {
|
|
||||||
"cacheable-lookup": {
|
|
||||||
"version": "5.0.4",
|
|
||||||
"resolved": "https://registry.npmjs.org/cacheable-lookup/-/cacheable-lookup-5.0.4.tgz",
|
|
||||||
"integrity": "sha512-2/kNscPhpcxrOigMZzbiWF7dz8ilhb/nIHU3EyZiXWXpeq/au8qJ8VhdftMkty3n7Gj6HIGalQG8oiBNB3AJgA==",
|
|
||||||
"dev": true
|
|
||||||
},
|
|
||||||
"got": {
|
|
||||||
"version": "11.8.5",
|
|
||||||
"resolved": "https://registry.npmjs.org/got/-/got-11.8.5.tgz",
|
|
||||||
"integrity": "sha512-o0Je4NvQObAuZPHLFoRSkdG2lTgtcynqymzg2Vupdx6PorhaT5MCbIyXG6d4D94kk8ZG57QeosgdiqfJWhEhlQ==",
|
|
||||||
"dev": true,
|
|
||||||
"requires": {
|
|
||||||
"@sindresorhus/is": "^4.0.0",
|
|
||||||
"@szmarczak/http-timer": "^4.0.5",
|
|
||||||
"@types/cacheable-request": "^6.0.1",
|
|
||||||
"@types/responselike": "^1.0.0",
|
|
||||||
"cacheable-lookup": "^5.0.3",
|
|
||||||
"cacheable-request": "^7.0.2",
|
|
||||||
"decompress-response": "^6.0.0",
|
|
||||||
"http2-wrapper": "^1.0.0-beta.5.2",
|
|
||||||
"lowercase-keys": "^2.0.0",
|
|
||||||
"p-cancelable": "^2.0.0",
|
|
||||||
"responselike": "^2.0.0"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"http2-wrapper": {
|
|
||||||
"version": "1.0.3",
|
|
||||||
"resolved": "https://registry.npmjs.org/http2-wrapper/-/http2-wrapper-1.0.3.tgz",
|
|
||||||
"integrity": "sha512-V+23sDMr12Wnz7iTcDeJr3O6AIxlnvT/bmaAAAP/Xda35C90p9599p0F1eHR/N1KILWSoWVAiOMFjBBXaXSMxg==",
|
|
||||||
"dev": true,
|
|
||||||
"requires": {
|
|
||||||
"quick-lru": "^5.1.1",
|
|
||||||
"resolve-alpn": "^1.0.0"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"dot-case": {
|
"dot-case": {
|
||||||
"version": "3.0.4",
|
"version": "3.0.4",
|
||||||
"dev": true,
|
"dev": true,
|
||||||
@@ -31563,12 +31427,6 @@
|
|||||||
"version": "1.0.2",
|
"version": "1.0.2",
|
||||||
"devOptional": true
|
"devOptional": true
|
||||||
},
|
},
|
||||||
"p-cancelable": {
|
|
||||||
"version": "2.1.1",
|
|
||||||
"resolved": "https://registry.npmjs.org/p-cancelable/-/p-cancelable-2.1.1.tgz",
|
|
||||||
"integrity": "sha512-BZOr3nRQHOntUjTrH8+Lh54smKHoHyur8We1V8DSMVrl5A2malOOwuJRnKRDjSnkoeBh4at6BwEnb5I7Jl31wg==",
|
|
||||||
"dev": true
|
|
||||||
},
|
|
||||||
"p-limit": {
|
"p-limit": {
|
||||||
"version": "3.1.0",
|
"version": "3.1.0",
|
||||||
"dev": true,
|
"dev": true,
|
||||||
|
|||||||
@@ -18,6 +18,7 @@
|
|||||||
"accept-language-parser": "^1.5.0",
|
"accept-language-parser": "^1.5.0",
|
||||||
"ajv": "^8.11.0",
|
"ajv": "^8.11.0",
|
||||||
"ajv-formats": "^2.1.1",
|
"ajv-formats": "^2.1.1",
|
||||||
|
"bottleneck": "2.19.5",
|
||||||
"cheerio": "^1.0.0-rc.11",
|
"cheerio": "^1.0.0-rc.11",
|
||||||
"classnames": "^2.3.1",
|
"classnames": "^2.3.1",
|
||||||
"connect-datadog": "0.0.9",
|
"connect-datadog": "0.0.9",
|
||||||
@@ -121,7 +122,6 @@
|
|||||||
"cross-env": "^7.0.3",
|
"cross-env": "^7.0.3",
|
||||||
"csp-parse": "0.0.2",
|
"csp-parse": "0.0.2",
|
||||||
"dedent": "^0.7.0",
|
"dedent": "^0.7.0",
|
||||||
"domwaiter": "^1.4.0",
|
|
||||||
"eslint": "8.24.0",
|
"eslint": "8.24.0",
|
||||||
"eslint-config-prettier": "^8.5.0",
|
"eslint-config-prettier": "^8.5.0",
|
||||||
"eslint-config-standard": "^17.0.0",
|
"eslint-config-standard": "^17.0.0",
|
||||||
@@ -169,7 +169,6 @@
|
|||||||
"license": "(MIT AND CC-BY-4.0)",
|
"license": "(MIT AND CC-BY-4.0)",
|
||||||
"name": "docs.github.com",
|
"name": "docs.github.com",
|
||||||
"optionalDependencies": {
|
"optionalDependencies": {
|
||||||
"bottleneck": "^2.19.5",
|
|
||||||
"esm": "^3.2.25",
|
"esm": "^3.2.25",
|
||||||
"image-size": "^1.0.1",
|
"image-size": "^1.0.1",
|
||||||
"jest-puppeteer": "^5.0.4",
|
"jest-puppeteer": "^5.0.4",
|
||||||
|
|||||||
55
script/domwaiter.js
Normal file
55
script/domwaiter.js
Normal file
@@ -0,0 +1,55 @@
|
|||||||
|
import { EventEmitter } from 'node:events'
|
||||||
|
import Bottleneck from 'bottleneck'
|
||||||
|
import got from 'got'
|
||||||
|
import cheerio from 'cheerio'
|
||||||
|
|
||||||
|
/**
 * Fetch a list of pages through a rate-limited queue and report progress
 * through an event emitter.
 *
 * Events emitted on the returned emitter:
 *   - 'beforePageLoad' (page): just before a page is requested
 *   - 'page' (pageCopy): a copy of the page with `json`, or `body` (and `$`
 *     when `parseDOM` is enabled) attached
 *   - 'error' (err): a request or the limiter failed
 *   - 'done': every scheduled page has been processed
 *
 * @param {Array<{url: string}>} pages - Pages to fetch; each needs a `url`.
 * @param {object} [opts] - Options. The merged object is also handed to the
 *   Bottleneck constructor, so Bottleneck settings may be included.
 * @param {boolean} [opts.parseDOM=true] - Attach a cheerio instance as `$`.
 * @param {boolean} [opts.json=false] - Parse responses as JSON instead of text.
 * @param {number} [opts.maxConcurrent=5] - Passed to Bottleneck.
 * @param {number} [opts.minTime=500] - Passed to Bottleneck.
 * @returns {EventEmitter} Emitter on which the events above are fired.
 */
export default function domwaiter(pages, opts = {}) {
  const emitter = new EventEmitter()

  // Build a fresh options object rather than reassigning the parameter.
  const options = {
    parseDOM: true,
    json: false,
    maxConcurrent: 5,
    minTime: 500,
    ...opts,
  }

  // With nothing to schedule the limiter never becomes busy, so it would
  // never report 'idle' and 'done' would never fire. Emit it on a later tick
  // so callers have a chance to attach their listeners first.
  if (Array.isArray(pages) && pages.length === 0) {
    setImmediate(() => emitter.emit('done'))
    return emitter
  }

  const limiter = new Bottleneck(options)

  pages.forEach((page) => {
    // getPage reports its own failures through the emitter.
    limiter.schedule(getPage, page, emitter, options)
  })

  limiter
    .on('idle', () => {
      emitter.emit('done')
    })
    .on('error', (err) => {
      emitter.emit('error', err)
    })

  return emitter
}
|
||||||
|
|
||||||
|
/**
 * Fetch one page and report the outcome on the emitter.
 *
 * Emits 'beforePageLoad' with the page first. On success emits 'page' with a
 * shallow copy of the page carrying either `json` (when `opts.json` is set)
 * or `body`, plus `$` when `opts.parseDOM` is set. Any failure is emitted as
 * 'error' instead of being thrown.
 *
 * @param {{url: string}} page - Page descriptor; `url` is what gets requested.
 * @param {{emit: Function}} emitter - Receives the events described above.
 * @param {{json?: boolean, parseDOM?: boolean}} opts - Response handling flags.
 * @returns {Promise<void>}
 */
async function getPage(page, emitter, opts) {
  emitter.emit('beforePageLoad', page)

  try {
    let loaded
    if (opts.json) {
      const json = await got(page.url).json()
      loaded = Object.assign({}, page, { json })
    } else {
      const response = await got(page.url)
      const body = response.body
      loaded = Object.assign({}, page, { body })
      if (opts.parseDOM) {
        loaded.$ = cheerio.load(body)
      }
    }
    // Emitting inside the try keeps listener exceptions routed to 'error'.
    emitter.emit('page', loaded)
  } catch (err) {
    emitter.emit('error', err)
  }
}
|
||||||
@@ -1,11 +1,12 @@
|
|||||||
#!/usr/bin/env node
|
#!/usr/bin/env node
|
||||||
import domwaiter from 'domwaiter'
|
|
||||||
import eventToPromise from 'event-to-promise'
|
import eventToPromise from 'event-to-promise'
|
||||||
import chalk from 'chalk'
|
import chalk from 'chalk'
|
||||||
import dotenv from 'dotenv'
|
import dotenv from 'dotenv'
|
||||||
import parsePageSectionsIntoRecords from './parse-page-sections-into-records.js'
|
import parsePageSectionsIntoRecords from './parse-page-sections-into-records.js'
|
||||||
import getPopularPages from './popular-pages.js'
|
import getPopularPages from './popular-pages.js'
|
||||||
import languages from '../../lib/languages.js'
|
import languages from '../../lib/languages.js'
|
||||||
|
import domwaiter from '../domwaiter.js'
|
||||||
|
|
||||||
const pageMarker = chalk.green('|')
|
const pageMarker = chalk.green('|')
|
||||||
const recordMarker = chalk.grey('.')
|
const recordMarker = chalk.grey('.')
|
||||||
const port = 4002
|
const port = 4002
|
||||||
|
|||||||
Reference in New Issue
Block a user