From 6609dfc9c0d7e451ebb9f272b051e57b3b1d91f1 Mon Sep 17 00:00:00 2001 From: Hector Alfaro Date: Thu, 18 Apr 2024 09:30:09 -0400 Subject: [PATCH] Check links within the same page (#49403) Co-authored-by: Peter Bengtsson --- .github/workflows/link-check-on-pr.yml | 1 + .../scripts/rendered-content-link-checker.js | 195 +++++++++++------- 2 files changed, 127 insertions(+), 69 deletions(-) diff --git a/.github/workflows/link-check-on-pr.yml b/.github/workflows/link-check-on-pr.yml index 2e0853a42e..02b6fe5d92 100644 --- a/.github/workflows/link-check-on-pr.yml +++ b/.github/workflows/link-check-on-pr.yml @@ -43,6 +43,7 @@ jobs: SHOULD_COMMENT: ${{ secrets.DOCS_BOT_PAT_WRITEORG_PROJECT != '' }} CHECK_EXTERNAL_LINKS: false CREATE_REPORT: false + CHECK_ANCHORS: true # Not strictly necessary bit it makes warmServer() a bit faster # because it only bothers with English to begin with, which # we're filtering on anyway once the list of all pages has diff --git a/src/links/scripts/rendered-content-link-checker.js b/src/links/scripts/rendered-content-link-checker.js index 7ad315db35..4945908340 100755 --- a/src/links/scripts/rendered-content-link-checker.js +++ b/src/links/scripts/rendered-content-link-checker.js @@ -88,6 +88,7 @@ if (import.meta.url.endsWith(process.argv[1])) { REPORT_AUTHOR, REPORT_LABEL, EXTERNAL_SERVER_ERRORS_AS_WARNINGS, + CHECK_ANCHORS, } = process.env const octokit = github() @@ -110,6 +111,7 @@ if (import.meta.url.endsWith(process.argv[1])) { verbose: true, linkReports: true, checkImages: true, + checkAnchors: Boolean(CHECK_ANCHORS), patient: boolEnvVar('PATIENT'), random: false, language: 'en', @@ -613,7 +615,7 @@ async function processPermalink(core, permalink, page, pageMap, redirects, opts, checkAnchors, checkExternalLinks, externalServerErrorsAsWarning, - { verbose, patient }, + { verbose, patient, permalink }, db, ) @@ -759,94 +761,149 @@ async function checkHrefLink( checkAnchors = false, checkExternalLinks = false, externalServerErrorsAsWarning = false, - { verbose = false, patient = false } = {}, + { verbose = false, patient = false, permalink } = {}, db = null, ) { - if (href === '#') { - if (checkAnchors) { + // this function handles hrefs in all the following forms: + + // same article links: + // 1. '#' + // 2. '#anchor' + // 3. '/to/this/article#anchor' + + // different article links: + // 4. '/some/path/article#anchor' (currently not supported) + // 5. '/some/path/article' + + // external links: + // 6. 'https://example.com' (external link) + + const [pathFragment, hashFragment] = href.split('#') + const hash = '#' + hashFragment // the hash is the part that starts with `#` + + // this conditional handles cases in which the link is to the current article (cases 1-3 above) + if (checkAnchors && (!pathFragment || pathFragment === permalink.href)) { + // cases covered by this part of the conditional: + // 1. '#' + if (hash === '#') { return { WARNING: 'Link is just an empty `#`' } } - } else if (href.startsWith('#')) { - if (checkAnchors) { + // cases covered by this part of the conditional: + // 2. '#anchor' + // 3. '/to/this/article#anchor' + else { + // Some pages are a mix of Markdown and React components. On its own, + // the Markdown might appear broken but when combined with automated + // React rendering it might work. Best to stay out of it. + const avoid = + permalink && + ((permalink.href.includes('/rest/') && !permalink.href.includes('/rest/guides/')) || + permalink.href.includes('/webhooks-and-events/webhooks/webhook-events-and-payloads') || + permalink.href.includes('/graphql/reference') || + permalink.href.includes('/code-security/codeql-cli/codeql-cli-manual/') || + permalink.href.includes( + '/apps/maintaining-github-apps/modifying-a-github-app-registration', + ) || + permalink.href.includes( + '/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning', + ) || + permalink.href.includes( + '/site-policy/github-company-policies/github-statement-against-modern-slavery-and-child-labor', + ) || + permalink.href.includes('/site-policy/content-removal-policies/dmca-takedown-policy') || + permalink.href.includes('/early-access/')) + // You don't need a DOM ID (or ) for `` // to work in all modern browsers. - if (href !== '#top') { + if (hash !== '#top' && !avoid) { // If the link is `#foo` it could either match `` // or it could match ``. - const countDOMItems = $(href).length + $(`a[name="${href.slice(1)}"]`).length + const countDOMItems = $(hash).length + $(`a[name="${hash.slice(1)}"]`).length if (countDOMItems === 0) { - return { WARNING: `Anchor on the same page can't be found by ID` } + return { CRITICAL: `Anchor on the same page can't be found by ID` } } else if (countDOMItems > 1) { - return { WARNING: `Matches multiple points in the page` } + return { CRITICAL: `Matches multiple points in the page` } } } } - } else if (href.startsWith('/')) { - const pathname = new URL(href, 'http://example.com').pathname + } + // this conditional handles cases in which the link is to a different article or externally (cases 4-6 above) + else { + // cases covered by this part of the conditional: + // 4. '/some/path/article#anchor' (currently not supported) + // 5. '/some/path/article' + if (href.startsWith('/')) { + const pathname = new URL(href, 'http://example.com').pathname + // we drop any hashes due to `.pathname` + // we don't currently support hashes for other articles we link to: /some/path/article#anchor - // Remember, if the Markdown has something like - // - // See [my link][/some/page/] - // - // In the post-processing, that will actually become - // - // See my link - // - // But, if that link was a redirect, that would have been left - // untouched. - if (pathname.endsWith('/')) { - const whatifPathname = pathname.slice(0, -1) - if (getRedirect(whatifPathname, { redirects, pages: pageMap })) { - return { - WARNING: `Redirect to ${getRedirect(whatifPathname, { redirects, pages: pageMap })}`, + // Remember, if the Markdown has something like + // + // See [my link][/some/page/] + // + // In the post-processing, that will actually become + // + // See my link + // + // But, if that link was a redirect, that would have been left + // untouched. + if (pathname.endsWith('/')) { + const whatifPathname = pathname.slice(0, -1) + if (getRedirect(whatifPathname, { redirects, pages: pageMap })) { + return { + WARNING: `Redirect to ${getRedirect(whatifPathname, { redirects, pages: pageMap })}`, + } + } else if (!pageMap[whatifPathname]) { + if (!deprecatedVersionPrefixesRegex.test(whatifPathname)) { + return { CRITICAL: 'Broken link' } + } } - } else if (!pageMap[whatifPathname]) { - if (!deprecatedVersionPrefixesRegex.test(whatifPathname)) { + return { WARNING: 'Links with a trailing / will always redirect' } + } else { + if (pathname.split('/')[1] in STATIC_PREFIXES) { + const staticFilePath = path.join( + STATIC_PREFIXES[pathname.split('/')[1]], + pathname.split(path.sep).slice(2).join(path.sep), + ) + if (!fs.existsSync(staticFilePath)) { + return { CRITICAL: `Static file not found ${staticFilePath} (${pathname})` } + } + } else if (getRedirect(pathname, { redirects, pages: pageMap })) { + return { WARNING: `Redirect to ${getRedirect(pathname, { redirects, pages: pageMap })}` } + } else if (!pageMap[pathname]) { + if (deprecatedVersionPrefixesRegex.test(pathname)) { + return + } + return { CRITICAL: 'Broken link' } } } - return { WARNING: 'Links with a trailing / will always redirect' } - } else { - if (pathname.split('/')[1] in STATIC_PREFIXES) { - const staticFilePath = path.join( - STATIC_PREFIXES[pathname.split('/')[1]], - pathname.split(path.sep).slice(2).join(path.sep), - ) - if (!fs.existsSync(staticFilePath)) { - return { CRITICAL: `Static file not found ${staticFilePath} (${pathname})` } - } - } else if (getRedirect(pathname, { redirects, pages: pageMap })) { - return { WARNING: `Redirect to ${getRedirect(pathname, { redirects, pages: pageMap })}` } - } else if (!pageMap[pathname]) { - if (deprecatedVersionPrefixesRegex.test(pathname)) { - return - } - - return { CRITICAL: 'Broken link' } + } + // cases covered by this part of the conditional: + // 6. 'https://example.com' (external link) + else if (checkExternalLinks) { + if (!href.startsWith('https://')) { + return { WARNING: `Will not check external URLs that are not HTTPS (${href})` } } - } - } else if (checkExternalLinks) { - if (!href.startsWith('https://')) { - return { WARNING: `Will not check external URLs that are not HTTPS (${href})` } - } - if (linksToSkip(href)) { - return - } - const { ok, ...info } = await checkExternalURLCached(core, href, { verbose, patient }, db) - if (!ok) { - // By default, an not-OK problem with an external link is CRITICAL - // but if it was a `responseError` or the statusCode was >= 500 - // then downgrade it to WARNING. - let problem = 'CRITICAL' - if (externalServerErrorsAsWarning) { - if ( - (info.statusCode && info.statusCode >= 500) || - (info.requestError && isTemporaryRequestError(info.requestError)) - ) { - problem = 'WARNING' - } + if (linksToSkip(href)) { + return + } + const { ok, ...info } = await checkExternalURLCached(core, href, { verbose, patient }, db) + if (!ok) { + // By default, an not-OK problem with an external link is CRITICAL + // but if it was a `responseError` or the statusCode was >= 500 + // then downgrade it to WARNING. + let problem = 'CRITICAL' + if (externalServerErrorsAsWarning) { + if ( + (info.statusCode && info.statusCode >= 500) || + (info.requestError && isTemporaryRequestError(info.requestError)) + ) { + problem = 'WARNING' + } + } + return { [problem]: `Broken external link (${JSON.stringify(info)})`, isExternal: true } } - return { [problem]: `Broken external link (${JSON.stringify(info)})`, isExternal: true } } } }