
Add concurrency limit to link checker to prevent overwhelming external servers (#57514)

Author: Kevin Heis
Date: 2025-09-11 15:24:38 -07:00
Committed by: GitHub
Parent: 4589da076c
Commit: 86bcd366d3


@@ -76,8 +76,12 @@ type Options = {
   bail?: boolean
   commentLimitToExternalLinks?: boolean
   actionContext?: any
+  concurrency?: number
 }
 
+// Default concurrency limit for URL requests
+const DEFAULT_CONCURRENCY_LIMIT = 3
+
 const STATIC_PREFIXES: Record<string, string> = {
   assets: path.resolve('assets'),
   public: path.resolve(path.join('src', 'graphql', 'data')),
@@ -114,6 +118,32 @@ const externalLinkCheckerDB = await JSONFilePreset<Data>(EXTERNAL_LINK_CHECKER_D
 type DBType = typeof externalLinkCheckerDB
 
+// Simple concurrency limiter
+async function limitConcurrency<T, R>(
+  items: T[],
+  asyncFn: (item: T) => Promise<R>,
+  limit: number = 3,
+): Promise<R[]> {
+  const results: Promise<R>[] = []
+  const executing = new Set<Promise<R>>()
+
+  for (const item of items) {
+    const promise = asyncFn(item).then((result) => {
+      executing.delete(promise)
+      return result
+    })
+
+    results.push(promise)
+    executing.add(promise)
+
+    if (executing.size >= limit) {
+      await Promise.race(executing)
+    }
+  }
+
+  return Promise.all(results)
+}
+
 // Given a number and a percentage, return the same number with a *percentage*
 // max change of making a bit larger or smaller.
 // E.g. `jitter(55, 10)` will return a value between `[55 - 55/10: 55 + 55/10]`
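
For readers skimming the diff, here is a minimal usage sketch of the limitConcurrency helper added above; the URL list and the fetchStatus helper are hypothetical, not part of this commit. With a limit of 2, at most two requests are in flight at once, each completion frees a slot for the next item, and the returned array preserves input order.

// Hypothetical usage of the limiter above (not from this commit)
const urls = ['https://example.com/a', 'https://example.com/b', 'https://example.com/c']

async function fetchStatus(url: string): Promise<number> {
  const response = await fetch(url) // global fetch, available in Node 18+
  return response.status
}

// At most 2 fetchStatus calls run concurrently; results keep input order
const statuses = await limitConcurrency(urls, fetchStatus, 2)

One design note: a rejected promise skips the fulfillment handler that would remove it from the executing set, so Promise.race (and ultimately Promise.all) rejects fail-fast, matching the behavior of the Promise.all calls this helper replaces.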
@@ -156,6 +186,7 @@ if (import.meta.url.endsWith(process.argv[1])) {
     REPORT_LABEL,
     EXTERNAL_SERVER_ERRORS_AS_WARNINGS,
     CHECK_ANCHORS,
+    CONCURRENCY,
   } = process.env
 
   const octokit = github()
@@ -193,6 +224,7 @@ if (import.meta.url.endsWith(process.argv[1])) {
     reportAuthor: REPORT_AUTHOR,
     actionContext: getActionContext(),
     externalServerErrorsAsWarning: EXTERNAL_SERVER_ERRORS_AS_WARNINGS,
+    concurrency: CONCURRENCY ? parseInt(CONCURRENCY, 10) : DEFAULT_CONCURRENCY_LIMIT,
   }
 
   if (opts.shouldComment || opts.createReport) {
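
Worth noting about this parse: a non-numeric CONCURRENCY value yields NaN, which the opts.concurrency || DEFAULT_CONCURRENCY_LIMIT checks later in the diff catch (NaN is falsy), while the destructuring default in main() only fires on undefined. A stricter parse, if one were wanted, might look like this sketch (hypothetical, not part of this commit):

// Hypothetical helper: fall back to the default on NaN or non-positive input
function parseConcurrency(raw: string | undefined): number {
  const parsed = raw ? parseInt(raw, 10) : NaN
  return Number.isInteger(parsed) && parsed > 0 ? parsed : DEFAULT_CONCURRENCY_LIMIT
}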
@@ -238,6 +270,7 @@ if (import.meta.url.endsWith(process.argv[1])) {
 * externalServerErrorsAsWarning {boolean} - Treat >=500 errors or temporary request errors as warning
 * filter {Array<string>} - strings to match the pages' relativePath
 * versions {Array<string>} - only certain pages' versions (e.g. )
+ * concurrency {number} - Maximum number of concurrent URL requests (default: 3, env: CONCURRENCY)
 *
 */
@@ -263,6 +296,7 @@ async function main(
     reportRepository = 'github/docs-content',
     reportAuthor = 'docs-bot',
     reportLabel = 'broken link report',
+    concurrency = DEFAULT_CONCURRENCY_LIMIT,
   } = opts
 
   // Note! The reason we're using `warmServer()` in this script,
@@ -337,8 +371,9 @@ async function main(
   debugTimeStart(core, 'processPages')
   const t0 = new Date().getTime()
-  const flawsGroups = await Promise.all(
-    pages.map((page: Page) =>
+  const flawsGroups = await limitConcurrency(
+    pages,
+    (page: Page) =>
       processPage(
         core,
         page,
@@ -348,7 +383,7 @@ async function main(
         versions as string[],
       ),
-    ),
+    concurrency, // Limit concurrent page checks
   )
   const t1 = new Date().getTime()
   debugTimeEnd(core, 'processPages')
@@ -653,14 +688,13 @@ async function processPage(
   versions: string[],
 ) {
   const { verbose, verboseUrl, bail } = opts
-  const allFlawsEach = await Promise.all(
-    page.permalinks
-      .filter((permalink) => {
-        return !versions.length || versions.includes(permalink.pageVersion)
-      })
-      .map((permalink) => {
-        return processPermalink(core, permalink, page, pageMap, redirects, opts, db)
-      }),
+  const filteredPermalinks = page.permalinks.filter((permalink) => {
+    return !versions.length || versions.includes(permalink.pageVersion)
+  })
+  const allFlawsEach = await limitConcurrency(
+    filteredPermalinks,
+    (permalink) => processPermalink(core, permalink, page, pageMap, redirects, opts, db),
+    opts.concurrency || DEFAULT_CONCURRENCY_LIMIT, // Limit concurrent permalink checks per page
   )
 
   const allFlaws = allFlawsEach.flat()
@@ -714,8 +748,9 @@ async function processPermalink(
   $('a[href]').each((i, link) => {
     links.push(link)
   })
-  const newFlaws: LinkFlaw[] = await Promise.all(
-    links.map(async (link) => {
+  const newFlaws: LinkFlaw[] = await limitConcurrency(
+    links,
+    async (link) => {
       const { href } = (link as cheerio.TagElement).attribs
 
       // The global cache can't be used for anchor links because they
@@ -756,7 +791,8 @@ async function processPermalink(
           globalHrefCheckCache.set(href, flaw)
         }
       }
-    }),
+    },
+    opts.concurrency || DEFAULT_CONCURRENCY_LIMIT, // Limit concurrent link checks per permalink
   )
 
   for (const flaw of newFlaws) {
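
Taken together, the limiter is applied at three nested levels: pages in main(), permalinks per page in processPage(), and links per permalink in processPermalink(). The effective worst-case fan-out is therefore the product of the three limits, not the limit itself: with the default of 3 at every level, up to 3 × 3 × 3 = 27 link checks can be in flight at once, which is still far below the unbounded Promise.all fan-out this commit replaces.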