1
0
mirror of synced 2025-12-19 18:10:59 -05:00

merge in domwaiter (#31197)

This commit is contained in:
Peter Bengtsson
2022-09-27 20:38:12 +02:00
committed by GitHub
parent b622b6829e
commit 8112c47fee
5 changed files with 64 additions and 151 deletions

View File

@@ -8,7 +8,7 @@ on:
merge_group:
pull_request:
paths:
- script/search/index-elasticsearch.js
- 'script/search/**'
- 'package*.json'
- .github/workflows/dry-run-elasticsearch-indexing.yml

152
package-lock.json generated
View File

@@ -16,6 +16,7 @@
"accept-language-parser": "^1.5.0",
"ajv": "^8.11.0",
"ajv-formats": "^2.1.1",
"bottleneck": "2.19.5",
"cheerio": "^1.0.0-rc.11",
"classnames": "^2.3.1",
"connect-datadog": "0.0.9",
@@ -119,7 +120,6 @@
"cross-env": "^7.0.3",
"csp-parse": "0.0.2",
"dedent": "^0.7.0",
"domwaiter": "^1.4.0",
"eslint": "8.24.0",
"eslint-config-prettier": "^8.5.0",
"eslint-config-standard": "^17.0.0",
@@ -164,7 +164,6 @@
"node": ">=16.x"
},
"optionalDependencies": {
"bottleneck": "^2.19.5",
"esm": "^3.2.25",
"image-size": "^1.0.1",
"jest-puppeteer": "^5.0.4",
@@ -4108,18 +4107,6 @@
"tslib": "^2.4.0"
}
},
"node_modules/@szmarczak/http-timer": {
"version": "4.0.6",
"resolved": "https://registry.npmjs.org/@szmarczak/http-timer/-/http-timer-4.0.6.tgz",
"integrity": "sha512-4BAffykYOgO+5nzBWYwE3W90sBgLJoUPRWWcL8wlyiM8IB8ipJz3UMJ9KXQd1RKQXpKp8Tutn80HZtWsu2u76w==",
"dev": true,
"dependencies": {
"defer-to-connect": "^2.0.0"
},
"engines": {
"node": ">=10"
}
},
"node_modules/@types/babel__core": {
"version": "7.1.19",
"resolved": "https://registry.npmjs.org/@types/babel__core/-/babel__core-7.1.19.tgz",
@@ -6240,8 +6227,8 @@
},
"node_modules/bottleneck": {
"version": "2.19.5",
"devOptional": true,
"license": "MIT"
"resolved": "https://registry.npmjs.org/bottleneck/-/bottleneck-2.19.5.tgz",
"integrity": "sha512-VHiNCbI1lKdl44tGrhNfU3lup0Tj/ZBMJB5/2ZbNXRCPuRCO7ed2mgcK4r17y+KB2EfuYuRaVlwNbAeaWGSpbw=="
},
"node_modules/boxen": {
"version": "7.0.0",
@@ -7573,64 +7560,6 @@
"url": "https://github.com/fb55/domutils?sponsor=1"
}
},
"node_modules/domwaiter": {
"version": "1.4.0",
"resolved": "https://registry.npmjs.org/domwaiter/-/domwaiter-1.4.0.tgz",
"integrity": "sha512-k7dIRmg5/wMsET8FFZvrlZ2A81WOjc9D5DcVVoZxkwvo2hMPklYXPiS23h3Ez7zqyp25pmEn3Hzjq8agPiRxiw==",
"dev": true,
"dependencies": {
"bottleneck": "^2.19.5",
"cheerio": "^1.0.0-rc.3",
"got": "^11.8.5"
}
},
"node_modules/domwaiter/node_modules/cacheable-lookup": {
"version": "5.0.4",
"resolved": "https://registry.npmjs.org/cacheable-lookup/-/cacheable-lookup-5.0.4.tgz",
"integrity": "sha512-2/kNscPhpcxrOigMZzbiWF7dz8ilhb/nIHU3EyZiXWXpeq/au8qJ8VhdftMkty3n7Gj6HIGalQG8oiBNB3AJgA==",
"dev": true,
"engines": {
"node": ">=10.6.0"
}
},
"node_modules/domwaiter/node_modules/got": {
"version": "11.8.5",
"resolved": "https://registry.npmjs.org/got/-/got-11.8.5.tgz",
"integrity": "sha512-o0Je4NvQObAuZPHLFoRSkdG2lTgtcynqymzg2Vupdx6PorhaT5MCbIyXG6d4D94kk8ZG57QeosgdiqfJWhEhlQ==",
"dev": true,
"dependencies": {
"@sindresorhus/is": "^4.0.0",
"@szmarczak/http-timer": "^4.0.5",
"@types/cacheable-request": "^6.0.1",
"@types/responselike": "^1.0.0",
"cacheable-lookup": "^5.0.3",
"cacheable-request": "^7.0.2",
"decompress-response": "^6.0.0",
"http2-wrapper": "^1.0.0-beta.5.2",
"lowercase-keys": "^2.0.0",
"p-cancelable": "^2.0.0",
"responselike": "^2.0.0"
},
"engines": {
"node": ">=10.19.0"
},
"funding": {
"url": "https://github.com/sindresorhus/got?sponsor=1"
}
},
"node_modules/domwaiter/node_modules/http2-wrapper": {
"version": "1.0.3",
"resolved": "https://registry.npmjs.org/http2-wrapper/-/http2-wrapper-1.0.3.tgz",
"integrity": "sha512-V+23sDMr12Wnz7iTcDeJr3O6AIxlnvT/bmaAAAP/Xda35C90p9599p0F1eHR/N1KILWSoWVAiOMFjBBXaXSMxg==",
"dev": true,
"dependencies": {
"quick-lru": "^5.1.1",
"resolve-alpn": "^1.0.0"
},
"engines": {
"node": ">=10.19.0"
}
},
"node_modules/dot-case": {
"version": "3.0.4",
"dev": true,
@@ -15723,15 +15652,6 @@
"node": ">=0.10.0"
}
},
"node_modules/p-cancelable": {
"version": "2.1.1",
"resolved": "https://registry.npmjs.org/p-cancelable/-/p-cancelable-2.1.1.tgz",
"integrity": "sha512-BZOr3nRQHOntUjTrH8+Lh54smKHoHyur8We1V8DSMVrl5A2malOOwuJRnKRDjSnkoeBh4at6BwEnb5I7Jl31wg==",
"dev": true,
"engines": {
"node": ">=8"
}
},
"node_modules/p-limit": {
"version": "3.1.0",
"dev": true,
@@ -23418,15 +23338,6 @@
"tslib": "^2.4.0"
}
},
"@szmarczak/http-timer": {
"version": "4.0.6",
"resolved": "https://registry.npmjs.org/@szmarczak/http-timer/-/http-timer-4.0.6.tgz",
"integrity": "sha512-4BAffykYOgO+5nzBWYwE3W90sBgLJoUPRWWcL8wlyiM8IB8ipJz3UMJ9KXQd1RKQXpKp8Tutn80HZtWsu2u76w==",
"dev": true,
"requires": {
"defer-to-connect": "^2.0.0"
}
},
"@types/babel__core": {
"version": "7.1.19",
"resolved": "https://registry.npmjs.org/@types/babel__core/-/babel__core-7.1.19.tgz",
@@ -25093,7 +25004,8 @@
},
"bottleneck": {
"version": "2.19.5",
"devOptional": true
"resolved": "https://registry.npmjs.org/bottleneck/-/bottleneck-2.19.5.tgz",
"integrity": "sha512-VHiNCbI1lKdl44tGrhNfU3lup0Tj/ZBMJB5/2ZbNXRCPuRCO7ed2mgcK4r17y+KB2EfuYuRaVlwNbAeaWGSpbw=="
},
"boxen": {
"version": "7.0.0",
@@ -25950,54 +25862,6 @@
"domhandler": "^5.0.1"
}
},
"domwaiter": {
"version": "1.4.0",
"resolved": "https://registry.npmjs.org/domwaiter/-/domwaiter-1.4.0.tgz",
"integrity": "sha512-k7dIRmg5/wMsET8FFZvrlZ2A81WOjc9D5DcVVoZxkwvo2hMPklYXPiS23h3Ez7zqyp25pmEn3Hzjq8agPiRxiw==",
"dev": true,
"requires": {
"bottleneck": "^2.19.5",
"cheerio": "^1.0.0-rc.3",
"got": "^11.8.5"
},
"dependencies": {
"cacheable-lookup": {
"version": "5.0.4",
"resolved": "https://registry.npmjs.org/cacheable-lookup/-/cacheable-lookup-5.0.4.tgz",
"integrity": "sha512-2/kNscPhpcxrOigMZzbiWF7dz8ilhb/nIHU3EyZiXWXpeq/au8qJ8VhdftMkty3n7Gj6HIGalQG8oiBNB3AJgA==",
"dev": true
},
"got": {
"version": "11.8.5",
"resolved": "https://registry.npmjs.org/got/-/got-11.8.5.tgz",
"integrity": "sha512-o0Je4NvQObAuZPHLFoRSkdG2lTgtcynqymzg2Vupdx6PorhaT5MCbIyXG6d4D94kk8ZG57QeosgdiqfJWhEhlQ==",
"dev": true,
"requires": {
"@sindresorhus/is": "^4.0.0",
"@szmarczak/http-timer": "^4.0.5",
"@types/cacheable-request": "^6.0.1",
"@types/responselike": "^1.0.0",
"cacheable-lookup": "^5.0.3",
"cacheable-request": "^7.0.2",
"decompress-response": "^6.0.0",
"http2-wrapper": "^1.0.0-beta.5.2",
"lowercase-keys": "^2.0.0",
"p-cancelable": "^2.0.0",
"responselike": "^2.0.0"
}
},
"http2-wrapper": {
"version": "1.0.3",
"resolved": "https://registry.npmjs.org/http2-wrapper/-/http2-wrapper-1.0.3.tgz",
"integrity": "sha512-V+23sDMr12Wnz7iTcDeJr3O6AIxlnvT/bmaAAAP/Xda35C90p9599p0F1eHR/N1KILWSoWVAiOMFjBBXaXSMxg==",
"dev": true,
"requires": {
"quick-lru": "^5.1.1",
"resolve-alpn": "^1.0.0"
}
}
}
},
"dot-case": {
"version": "3.0.4",
"dev": true,
@@ -31563,12 +31427,6 @@
"version": "1.0.2",
"devOptional": true
},
"p-cancelable": {
"version": "2.1.1",
"resolved": "https://registry.npmjs.org/p-cancelable/-/p-cancelable-2.1.1.tgz",
"integrity": "sha512-BZOr3nRQHOntUjTrH8+Lh54smKHoHyur8We1V8DSMVrl5A2malOOwuJRnKRDjSnkoeBh4at6BwEnb5I7Jl31wg==",
"dev": true
},
"p-limit": {
"version": "3.1.0",
"dev": true,

View File

@@ -18,6 +18,7 @@
"accept-language-parser": "^1.5.0",
"ajv": "^8.11.0",
"ajv-formats": "^2.1.1",
"bottleneck": "2.19.5",
"cheerio": "^1.0.0-rc.11",
"classnames": "^2.3.1",
"connect-datadog": "0.0.9",
@@ -121,7 +122,6 @@
"cross-env": "^7.0.3",
"csp-parse": "0.0.2",
"dedent": "^0.7.0",
"domwaiter": "^1.4.0",
"eslint": "8.24.0",
"eslint-config-prettier": "^8.5.0",
"eslint-config-standard": "^17.0.0",
@@ -169,7 +169,6 @@
"license": "(MIT AND CC-BY-4.0)",
"name": "docs.github.com",
"optionalDependencies": {
"bottleneck": "^2.19.5",
"esm": "^3.2.25",
"image-size": "^1.0.1",
"jest-puppeteer": "^5.0.4",

55
script/domwaiter.js Normal file
View File

@@ -0,0 +1,55 @@
import { EventEmitter } from 'node:events'
import Bottleneck from 'bottleneck'
import got from 'got'
import cheerio from 'cheerio'
/**
 * Fetch a list of pages through a rate limiter and report progress via events.
 *
 * Every entry of `pages` is scheduled on a Bottleneck limiter, which runs
 * `getPage(page, emitter, settings)` for it. The returned emitter relays:
 *   - 'done'  when the limiter reports 'idle', or on the next turn of the
 *             event loop when `pages` is empty
 *   - 'error' when the limiter reports 'error'
 * `getPage` emits its own events on the same emitter.
 *
 * @param {Array<object>} pages - Pages to fetch; each one is handed to `getPage`.
 * @param {object} [opts] - Overrides for the defaults (`parseDOM`, `json`,
 *   `maxConcurrent`, `minTime`). The merged object is passed both to the
 *   Bottleneck constructor and to `getPage`.
 * @returns {EventEmitter} Emitter the caller subscribes to.
 */
export default function domwaiter(pages, opts = {}) {
  const emitter = new EventEmitter()

  // Build a fresh settings object instead of reassigning the parameter.
  const settings = {
    parseDOM: true,
    json: false,
    maxConcurrent: 5,
    minTime: 500,
    ...opts,
  }

  const limiter = new Bottleneck(settings)

  // Wire the limiter events up before any job is queued.
  limiter
    .on('idle', () => {
      emitter.emit('done')
    })
    .on('error', (err) => {
      emitter.emit('error', err)
    })

  pages.forEach((page) => {
    limiter.schedule(getPage, page, emitter, settings)
  })

  if (pages.length === 0) {
    // Nothing was scheduled, so (as far as Bottleneck's docs describe) the
    // limiter never becomes 'idle' and 'done' would never fire. Emit it on
    // the next event-loop turn so the caller has time to attach listeners.
    setImmediate(() => {
      emitter.emit('done')
    })
  }

  return emitter
}
/**
 * Download a single page and publish the outcome on `emitter`.
 *
 * Emits 'beforePageLoad' with the original page first. On success it emits
 * 'page' with a shallow copy of `page` extended with either:
 *   - `json` (parsed response) when `opts.json` is truthy, or
 *   - `body` (response body), plus `$` (the result of `cheerio.load(body)`)
 *     when `opts.parseDOM` is truthy.
 * Anything thrown while fetching, parsing or emitting 'page' is reported
 * through an 'error' event instead of being rethrown.
 *
 * @param {object} page - Page descriptor; `page.url` is the address requested.
 * @param {object} emitter - Receiver of the events (only `emit` is called).
 * @param {object} opts - Options; `json` and `parseDOM` are read here.
 * @returns {Promise<void>}
 */
async function getPage(page, emitter, opts) {
  emitter.emit('beforePageLoad', page)

  try {
    let loaded
    if (opts.json) {
      const json = await got(page.url).json()
      loaded = Object.assign({}, page, { json })
    } else {
      const response = await got(page.url)
      const body = response.body
      loaded = Object.assign({}, page, { body })
      if (opts.parseDOM) {
        loaded.$ = cheerio.load(body)
      }
    }
    emitter.emit('page', loaded)
  } catch (err) {
    emitter.emit('error', err)
  }
}

View File

@@ -1,11 +1,12 @@
#!/usr/bin/env node
import domwaiter from 'domwaiter'
import eventToPromise from 'event-to-promise'
import chalk from 'chalk'
import dotenv from 'dotenv'
import parsePageSectionsIntoRecords from './parse-page-sections-into-records.js'
import getPopularPages from './popular-pages.js'
import languages from '../../lib/languages.js'
import domwaiter from '../domwaiter.js'
const pageMarker = chalk.green('|')
const recordMarker = chalk.grey('.')
const port = 4002