Add concurrency limit to link checker to prevent overwhelming external servers (#57514)
This commit is contained in:
@@ -76,8 +76,12 @@ type Options = {
|
|||||||
bail?: boolean
|
bail?: boolean
|
||||||
commentLimitToExternalLinks?: boolean
|
commentLimitToExternalLinks?: boolean
|
||||||
actionContext?: any
|
actionContext?: any
|
||||||
|
concurrency?: number
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Default concurrency limit for URL requests
|
||||||
|
const DEFAULT_CONCURRENCY_LIMIT = 3
|
||||||
|
|
||||||
const STATIC_PREFIXES: Record<string, string> = {
|
const STATIC_PREFIXES: Record<string, string> = {
|
||||||
assets: path.resolve('assets'),
|
assets: path.resolve('assets'),
|
||||||
public: path.resolve(path.join('src', 'graphql', 'data')),
|
public: path.resolve(path.join('src', 'graphql', 'data')),
|
||||||
@@ -114,6 +118,32 @@ const externalLinkCheckerDB = await JSONFilePreset<Data>(EXTERNAL_LINK_CHECKER_D
|
|||||||
|
|
||||||
type DBType = typeof externalLinkCheckerDB
|
type DBType = typeof externalLinkCheckerDB
|
||||||
|
|
||||||
|
// Simple concurrency limiter
|
||||||
|
async function limitConcurrency<T, R>(
|
||||||
|
items: T[],
|
||||||
|
asyncFn: (item: T) => Promise<R>,
|
||||||
|
limit: number = 3,
|
||||||
|
): Promise<R[]> {
|
||||||
|
const results: Promise<R>[] = []
|
||||||
|
const executing = new Set<Promise<R>>()
|
||||||
|
|
||||||
|
for (const item of items) {
|
||||||
|
const promise = asyncFn(item).then((result) => {
|
||||||
|
executing.delete(promise)
|
||||||
|
return result
|
||||||
|
})
|
||||||
|
|
||||||
|
results.push(promise)
|
||||||
|
executing.add(promise)
|
||||||
|
|
||||||
|
if (executing.size >= limit) {
|
||||||
|
await Promise.race(executing)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return Promise.all(results)
|
||||||
|
}
|
||||||
|
|
||||||
// Given a number and a percentage, return the same number with a *percentage*
|
// Given a number and a percentage, return the same number with a *percentage*
|
||||||
// max change of making a bit larger or smaller.
|
// max change of making a bit larger or smaller.
|
||||||
// E.g. `jitter(55, 10)` will return a value between `[55 - 55/10: 55 + 55/10]`
|
// E.g. `jitter(55, 10)` will return a value between `[55 - 55/10: 55 + 55/10]`
|
||||||
@@ -156,6 +186,7 @@ if (import.meta.url.endsWith(process.argv[1])) {
|
|||||||
REPORT_LABEL,
|
REPORT_LABEL,
|
||||||
EXTERNAL_SERVER_ERRORS_AS_WARNINGS,
|
EXTERNAL_SERVER_ERRORS_AS_WARNINGS,
|
||||||
CHECK_ANCHORS,
|
CHECK_ANCHORS,
|
||||||
|
CONCURRENCY,
|
||||||
} = process.env
|
} = process.env
|
||||||
|
|
||||||
const octokit = github()
|
const octokit = github()
|
||||||
@@ -193,6 +224,7 @@ if (import.meta.url.endsWith(process.argv[1])) {
|
|||||||
reportAuthor: REPORT_AUTHOR,
|
reportAuthor: REPORT_AUTHOR,
|
||||||
actionContext: getActionContext(),
|
actionContext: getActionContext(),
|
||||||
externalServerErrorsAsWarning: EXTERNAL_SERVER_ERRORS_AS_WARNINGS,
|
externalServerErrorsAsWarning: EXTERNAL_SERVER_ERRORS_AS_WARNINGS,
|
||||||
|
concurrency: CONCURRENCY ? parseInt(CONCURRENCY, 10) : DEFAULT_CONCURRENCY_LIMIT,
|
||||||
}
|
}
|
||||||
|
|
||||||
if (opts.shouldComment || opts.createReport) {
|
if (opts.shouldComment || opts.createReport) {
|
||||||
@@ -238,6 +270,7 @@ if (import.meta.url.endsWith(process.argv[1])) {
|
|||||||
* externalServerErrorsAsWarning {boolean} - Treat >=500 errors or temporary request errors as warning
|
* externalServerErrorsAsWarning {boolean} - Treat >=500 errors or temporary request errors as warning
|
||||||
* filter {Array<string>} - strings to match the pages' relativePath
|
* filter {Array<string>} - strings to match the pages' relativePath
|
||||||
* versions {Array<string>} - only certain pages' versions (e.g. )
|
* versions {Array<string>} - only certain pages' versions (e.g. )
|
||||||
|
* concurrency {number} - Maximum number of concurrent URL requests (default: 3, env: CONCURRENCY)
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
|
|
||||||
@@ -263,6 +296,7 @@ async function main(
|
|||||||
reportRepository = 'github/docs-content',
|
reportRepository = 'github/docs-content',
|
||||||
reportAuthor = 'docs-bot',
|
reportAuthor = 'docs-bot',
|
||||||
reportLabel = 'broken link report',
|
reportLabel = 'broken link report',
|
||||||
|
concurrency = DEFAULT_CONCURRENCY_LIMIT,
|
||||||
} = opts
|
} = opts
|
||||||
|
|
||||||
// Note! The reason we're using `warmServer()` in this script,
|
// Note! The reason we're using `warmServer()` in this script,
|
||||||
@@ -337,8 +371,9 @@ async function main(
|
|||||||
|
|
||||||
debugTimeStart(core, 'processPages')
|
debugTimeStart(core, 'processPages')
|
||||||
const t0 = new Date().getTime()
|
const t0 = new Date().getTime()
|
||||||
const flawsGroups = await Promise.all(
|
const flawsGroups = await limitConcurrency(
|
||||||
pages.map((page: Page) =>
|
pages,
|
||||||
|
(page: Page) =>
|
||||||
processPage(
|
processPage(
|
||||||
core,
|
core,
|
||||||
page,
|
page,
|
||||||
@@ -348,7 +383,7 @@ async function main(
|
|||||||
externalLinkCheckerDB,
|
externalLinkCheckerDB,
|
||||||
versions as string[],
|
versions as string[],
|
||||||
),
|
),
|
||||||
),
|
concurrency, // Limit concurrent page checks
|
||||||
)
|
)
|
||||||
const t1 = new Date().getTime()
|
const t1 = new Date().getTime()
|
||||||
debugTimeEnd(core, 'processPages')
|
debugTimeEnd(core, 'processPages')
|
||||||
@@ -653,14 +688,13 @@ async function processPage(
|
|||||||
versions: string[],
|
versions: string[],
|
||||||
) {
|
) {
|
||||||
const { verbose, verboseUrl, bail } = opts
|
const { verbose, verboseUrl, bail } = opts
|
||||||
const allFlawsEach = await Promise.all(
|
const filteredPermalinks = page.permalinks.filter((permalink) => {
|
||||||
page.permalinks
|
return !versions.length || versions.includes(permalink.pageVersion)
|
||||||
.filter((permalink) => {
|
})
|
||||||
return !versions.length || versions.includes(permalink.pageVersion)
|
const allFlawsEach = await limitConcurrency(
|
||||||
})
|
filteredPermalinks,
|
||||||
.map((permalink) => {
|
(permalink) => processPermalink(core, permalink, page, pageMap, redirects, opts, db),
|
||||||
return processPermalink(core, permalink, page, pageMap, redirects, opts, db)
|
opts.concurrency || DEFAULT_CONCURRENCY_LIMIT, // Limit concurrent permalink checks per page
|
||||||
}),
|
|
||||||
)
|
)
|
||||||
|
|
||||||
const allFlaws = allFlawsEach.flat()
|
const allFlaws = allFlawsEach.flat()
|
||||||
@@ -714,8 +748,9 @@ async function processPermalink(
|
|||||||
$('a[href]').each((i, link) => {
|
$('a[href]').each((i, link) => {
|
||||||
links.push(link)
|
links.push(link)
|
||||||
})
|
})
|
||||||
const newFlaws: LinkFlaw[] = await Promise.all(
|
const newFlaws: LinkFlaw[] = await limitConcurrency(
|
||||||
links.map(async (link) => {
|
links,
|
||||||
|
async (link) => {
|
||||||
const { href } = (link as cheerio.TagElement).attribs
|
const { href } = (link as cheerio.TagElement).attribs
|
||||||
|
|
||||||
// The global cache can't be used for anchor links because they
|
// The global cache can't be used for anchor links because they
|
||||||
@@ -756,7 +791,8 @@ async function processPermalink(
|
|||||||
globalHrefCheckCache.set(href, flaw)
|
globalHrefCheckCache.set(href, flaw)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}),
|
},
|
||||||
|
opts.concurrency || DEFAULT_CONCURRENCY_LIMIT, // Limit concurrent link checks per permalink
|
||||||
)
|
)
|
||||||
|
|
||||||
for (const flaw of newFlaws) {
|
for (const flaw of newFlaws) {
|
||||||
|
|||||||
Reference in New Issue
Block a user