/* See function main in this file for documentation */

import fs from 'fs'
import path from 'path'

import cheerio from 'cheerio'
import coreLib from '@actions/core'
import { fetchWithRetry } from '@/frame/lib/fetch-utils'
import chalk from 'chalk'
import { JSONFilePreset } from 'lowdb/node'
import { type Octokit } from '@octokit/rest'
import type { Response } from 'express'

import type { ExtendedRequest, Page, Permalink, Context } from '@/types'
import shortVersions from '@/versions/middleware/short-versions'
import contextualize from '@/frame/middleware/context/context'
import features from '@/versions/middleware/features'
import getRedirect from '@/redirects/lib/get-redirect'
import warmServer from '@/frame/lib/warm-server'
import { liquid } from '@/content-render/index'
import { deprecated } from '@/versions/lib/enterprise-server-releases'
import excludedLinks from '@/links/lib/excluded-links'
import { getEnvInputs, boolEnvVar } from '@/workflows/get-env-inputs'
import { debugTimeEnd, debugTimeStart } from './debug-time-taken'
import { uploadArtifact as uploadArtifactLib } from './upload-artifact'
import github from '@/workflows/github'
import { getActionContext } from '@/workflows/action-context'
import { createMinimalProcessor } from '@/content-render/unified/processor'
import { createReportIssue, linkReports } from '@/workflows/issue-report'
import { type CoreInject } from '@/links/scripts/action-injections'

type Flaw = {
  WARNING?: string
  CRITICAL?: string
  isExternal?: boolean
}

type LinkFlaw = {
  page: Page
  permalink: Permalink
  href?: string
  url?: string
  text?: string
  src?: string
  flaw: Flaw
}

type Redirects = Record<string, string>
type PageMap = Record<string, Page>
type UploadArtifact = (name: string, message: string) => void

type Options = {
  level?: string
  files?: string[]
  random?: boolean
  language?: string | string[]
  filter?: string[]
  version?: string | string[]
  max?: number
  linkReports?: boolean
  actionUrl?: string
  verbose?: boolean
  checkExternalLinks?: boolean
  createReport?: boolean
  failOnFlaw?: boolean
  shouldComment?: boolean
  reportRepository?: string
  reportAuthor?: string
  reportLabel?: string
  checkAnchors?: boolean
  checkImages?: boolean
  patient?: boolean
  externalServerErrorsAsWarning?: string
  verboseUrl?: string
  bail?: boolean
  commentLimitToExternalLinks?: boolean
  actionContext?: any
}

const STATIC_PREFIXES: Record<string, string> = {
  assets: path.resolve('assets'),
  public: path.resolve(path.join('src', 'graphql', 'data')),
}
// Sanity check that these are valid paths
Object.entries(STATIC_PREFIXES).forEach(([key, value]) => {
  if (!fs.existsSync(value)) {
    throw new Error(`Can't find static prefix (${key}): ${value}`)
  }
})

// External link checks are cached to disk. The default max age is 7 days;
// set EXTERNAL_LINK_CHECKER_MAX_AGE_DAYS=0 to disable the disk-based
// caching of external links.
const EXTERNAL_LINK_CHECKER_MAX_AGE_MS =
  parseInt(process.env.EXTERNAL_LINK_CHECKER_MAX_AGE_DAYS || '7') * 24 * 60 * 60 * 1000

const EXTERNAL_LINK_CHECKER_DB =
  process.env.EXTERNAL_LINK_CHECKER_DB || 'external-link-checker-db.json'

// const adapter = new JSONFile(EXTERNAL_LINK_CHECKER_DB)
type Data = {
  urls: {
    [url: string]: {
      timestamp: number
      result: {
        ok: boolean
        statusCode: number
      }
    }
  }
}
const defaultData: Data = { urls: {} }
const externalLinkCheckerDB = await JSONFilePreset(EXTERNAL_LINK_CHECKER_DB, defaultData)
type DBType = typeof externalLinkCheckerDB
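// Illustrative only: the lowdb cache file on disk is plain JSON matching the
// `Data` type above. A hypothetical `external-link-checker-db.json` entry:
//
//   {
//     "urls": {
//       "https://example.com/page": {
//         "timestamp": 1700000000000,
//         "result": { "ok": true, "statusCode": 200 }
//       }
//     }
//   }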
// Given a number and a percentage, return the same number with a *percentage*
// max change, making it a bit larger or smaller.
// E.g. `jitter(55, 10)` will return a value between `[55 - 55/10, 55 + 55/10]`.
// This is useful to avoid all the caching timestamps getting the same
// numbers from the day it started, which would mean they ALL expire
// on the same day. With jitter they start to expire in a bit of a
// "random pattern" so you don't get all or nothing.
function jitter(base: number, percentage: number) {
  const r = percentage / 100
  const negative = Math.random() > 0.5 ? -1 : 1
  return base + base * Math.random() * r * negative
}

// Return a function that can check, as quickly as possible, whether a
// certain href input should be skipped.
// We do this so we can use a `Set` lookup and an `arr.some()` for a
// speedier check.
function linksToSkipFactory() {
  const set = new Set(excludedLinks.map(({ is }) => is).filter(Boolean))
  const arr = excludedLinks.map(({ startsWith }) => startsWith).filter(Boolean)
  return (href: string) => set.has(href) || arr.some((v) => v && href.startsWith(v))
}

const linksToSkip = linksToSkipFactory()

const CONTENT_ROOT = path.resolve('content')

const deprecatedVersionPrefixesRegex = new RegExp(
  `enterprise(-server@|/)(${deprecated.join('|')})(/|$)`,
)

// When this file is invoked directly from an action, as opposed to being imported
if (import.meta.url.endsWith(process.argv[1])) {
  // Optional env vars
  const {
    ACTION_RUN_URL,
    LEVEL,
    FILES_CHANGED,
    REPORT_REPOSITORY,
    REPORT_AUTHOR,
    REPORT_LABEL,
    EXTERNAL_SERVER_ERRORS_AS_WARNINGS,
    CHECK_ANCHORS,
  } = process.env

  const octokit = github()

  // Parse changed files JSON string
  let files
  if (FILES_CHANGED) {
    const fileList = JSON.parse(FILES_CHANGED)
    if (Array.isArray(fileList) && fileList.length > 0) {
      files = fileList
    } else {
      console.warn(`No changed files found in PR: ${FILES_CHANGED}. Exiting...`)
      process.exit(0)
    }
  }

  const opts: Options = {
    level: LEVEL,
    files,
    verbose: true,
    linkReports: true,
    checkImages: true,
    checkAnchors: Boolean(CHECK_ANCHORS),
    patient: boolEnvVar('PATIENT'),
    random: false,
    language: 'en',
    actionUrl: ACTION_RUN_URL,
    checkExternalLinks: boolEnvVar('CHECK_EXTERNAL_LINKS'),
    shouldComment: boolEnvVar('SHOULD_COMMENT'),
    commentLimitToExternalLinks: boolEnvVar('COMMENT_LIMIT_TO_EXTERNAL_LINKS'),
    failOnFlaw: boolEnvVar('FAIL_ON_FLAW'),
    createReport: boolEnvVar('CREATE_REPORT'),
    reportRepository: REPORT_REPOSITORY,
    reportLabel: REPORT_LABEL,
    reportAuthor: REPORT_AUTHOR,
    actionContext: getActionContext(),
    externalServerErrorsAsWarning: EXTERNAL_SERVER_ERRORS_AS_WARNINGS,
  }

  if (opts.shouldComment || opts.createReport) {
    // `GITHUB_TOKEN` is optional elsewhere, but without it, posting a comment
    // or opening an issue report would fail with cryptic error messages
    // from Octokit. So require it upfront here.
    getEnvInputs(['GITHUB_TOKEN'])
  }

  main(coreLib, octokit, uploadArtifactLib, opts)
}
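// Illustrative only: a hypothetical direct invocation of this script, using
// the env vars read above (the runner command and script path are assumptions):
//
//   LEVEL=critical CHECK_EXTERNAL_LINKS=true SHOULD_COMMENT=true \
//   FILES_CHANGED='["content/get-started/quickstart.md"]' \
//   tsx src/links/scripts/rendered-content-link-checker.ts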
/*
 * Renders all or specified pages to gather all links on them and verify them.
 * Checks internal links deterministically using the filesystem, and external
 * links via external requests.
 * Links are considered broken for reporting and commenting if they are broken
 * at the specified "level",
 * e.g. redirects are considered a "warning" while 404s are considered "critical".
 *
 * When there are broken links (flaws) this action can:
 *   1. Create a report issue in a specified reportRepository and link it to
 *      previous report issues
 *   2. Create a comment similar to a report on the PR that triggered this action
 *   3. Fail using core.setFailed when there are broken links
 *
 * opts:
 *   level {"warning" | "critical"} - Counts links as "flaws" based on this value and status criteria
 *   files {Array<string>} - Limit link checking to specific files (usually changed in a PR)
 *   language {string | Array<string>} - Render pages to check from the included language (or languages)
 *   checkExternalLinks {boolean} - Check non-docs.github.com URLs (takes significantly longer)
 *   checkImages {boolean} - Check image src URLs
 *   failOnFlaw {boolean} - When true, fail using core.setFailed when links are broken according to level (flaw)
 *   shouldComment {boolean} - When true, attempt to comment flaws on the PR that triggered the action
 *   commentLimitToExternalLinks {boolean} - When true, the PR comment only includes external links
 *   createReport {boolean} - Create an issue in reportRepository with links considered broken (flaws)
 *   linkReports {boolean} - When createReport is true, link the issue report to previous report(s) via comments
 *   reportRepository {string} - Repository in the form "owner/repo-name" that the report issue will be created in
 *   reportLabel {string} - Label assigned to the report issue
 *   reportAuthor {string} - Expected author of the previous report issue for linking reports (a bot user like docs-bot)
 *   actionUrl {string} - Used to link the report or comment to the action instance for debugging
 *   actionContext {object} - Event payload context when run from an action, or injected. Should include { repo, owner }
 *   verbose {boolean} - Set to true for more verbose logging
 *   random {boolean} - Randomize page order for debugging when true
 *   patient {boolean} - Wait longer and retry more times for rate-limited external URLs
 *   bail {boolean} - Throw an error on the first page (not permalink) that has >0 flaws
 *   externalServerErrorsAsWarning {boolean} - Treat >=500 errors or temporary request errors as warnings
 *   filter {Array<string>} - Strings to match against the pages' relativePath
 *   version {string | Array<string>} - Only check certain pages' versions (e.g. …)
 *
 */
async function main(
  core: any,
  octokit: Octokit,
  uploadArtifact: UploadArtifact,
  opts: Options = {},
) {
  const {
    level = 'warning',
    files = [],
    random,
    language = 'en',
    filter,
    version,
    max,
    verbose,
    checkExternalLinks = false,
    createReport = false,
    failOnFlaw = false,
    shouldComment = false,
    reportRepository = 'github/docs-content',
    reportAuthor = 'docs-bot',
    reportLabel = 'broken link report',
  } = opts

  // Note! The reason we're using `warmServer()` in this script,
  // even though there's no server involved, is because
  // the `contextualize()` function calls it.
  // And because warmServer() is actually idempotent, it's cheap
  // to call it more than once here.
  // If we manually did the same operations that `warmServer()` does
  // (e.g. `loadPageMap()`), we'd end up having to do it all over
  // again the next time `contextualize()` is called.
  const { redirects, pages: pageMap, pageList } = await warmServer([])

  if (files.length) {
    core.debug(`Limiting to files list: ${files.join(', ')}`)
  }

  let languages = language
  if (!Array.isArray(languages)) {
    languages = [languages]
  }

  const filters = filter || []
  if (filters && !Array.isArray(filters)) {
    throw new Error(`filters, ${filters} is not an array`)
  }

  let versions = version || []
  if (versions && typeof versions === 'string') {
    versions = [versions]
  } else if (!Array.isArray(versions)) {
    throw new Error(`versions, '${version}' is not an array`)
  }

  if (random) {
    shuffle(pageList)
  }

  debugTimeStart(core, 'getPages')
  const pages = getPages(pageList, languages, filters, files, max)
  debugTimeEnd(core, 'getPages')

  if (checkExternalLinks && pages.length >= 100) {
    core.warning(
      `Warning! Checking external URLs can be time costly. You're testing ${pages.length} pages.`,
    )
  }

  await externalLinkCheckerDB.read()

  if (verbose && checkExternalLinks) {
    core.info(`Checking of external links is cached to ${EXTERNAL_LINK_CHECKER_DB}`)
    core.info(
      `External link cache max age is ${
        EXTERNAL_LINK_CHECKER_MAX_AGE_MS / 1000 / 60 / 60 / 24
      } days`,
    )
    let countNotTooOld = 0
    let countTooOld = 0
    for (const { timestamp } of Object.values(externalLinkCheckerDB.data.urls || {})) {
      const age = Date.now() - timestamp
      if (age > EXTERNAL_LINK_CHECKER_MAX_AGE_MS) {
        countTooOld++
      } else {
        countNotTooOld++
      }
    }
    core.info(
      `External link cache: ${countNotTooOld.toLocaleString()} links are still fresh, ${countTooOld.toLocaleString()} links are too old`,
    )
  }

  debugTimeStart(core, 'processPages')
  const t0 = new Date().getTime()
  const flawsGroups = await Promise.all(
    pages.map((page: Page) =>
      processPage(
        core,
        page,
        pageMap,
        redirects,
        opts,
        externalLinkCheckerDB,
        versions as string[],
      ),
    ),
  )
  const t1 = new Date().getTime()
  debugTimeEnd(core, 'processPages')

  await externalLinkCheckerDB.write()

  const flaws = flawsGroups.flat()

  printGlobalCacheHitRatio(core)

  if (verbose) {
    summarizeCounts(core, pages, (t1 - t0) / 1000)
    core.info(`Checked ${(globalCacheHitCount + globalCacheMissCount).toLocaleString()} links`)
  }

  summarizeFlaws(core, flaws)

  const uniqueHrefs = new Set(flaws.map((flaw) => flaw.href))
  if (flaws.length > 0) {
    await uploadJsonFlawsArtifact(uploadArtifact, flaws, {
      verboseUrl: opts.verboseUrl,
    })
    core.info(`All flaws written to artifact log.`)
    if (createReport) {
      core.info(`Creating issue for flaws...`)
      const reportProps = {
        core,
        octokit,
        reportTitle: `${uniqueHrefs.size} broken links found`,
        reportBody: flawIssueDisplay(flaws, opts),
        reportRepository,
        reportLabel,
      }
      const newReport = await createReportIssue(reportProps)
      if (opts.linkReports) {
        const linkProps = {
          core,
          octokit,
          newReport,
          reportRepository,
          reportAuthor,
          reportLabel,
        }
        await linkReports(linkProps)
      }
    }
    if (shouldComment) {
      await commentOnPR(core, octokit, flaws, opts)
    }

    const flawsInLevel = flaws.filter((flaw) => {
      if (level === 'critical') {
        return flaw?.flaw?.CRITICAL
      }
      // WARNING level and above
      return true
    })

    if (flawsInLevel.length > 0) {
      core.setOutput('has_flaws_at_level', flawsInLevel.length > 0)
      if (failOnFlaw) {
        core.setFailed(
          `${flaws.length} broken links found. See action artifact uploads for details`,
        )
        process.exit(1)
      }
    }
  } else {
    // It might be that the PR got a comment about >0 flaws before,
    // and now it can update that comment to say all is well again.
    if (shouldComment) {
      await commentOnPR(core, octokit, flaws, opts)
    }
  }
}
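// Illustrative only: since `main` is this module's default export, another
// script could hypothetically drive it directly instead of via the action
// entry point above:
//
//   import coreLib from '@actions/core'
//   import github from '@/workflows/github'
//   import { uploadArtifact } from './upload-artifact'
//   import main from './rendered-content-link-checker'
//
//   await main(coreLib, github(), uploadArtifact, {
//     language: 'en',
//     filter: ['get-started'],
//     max: 10,
//     verbose: true,
//   })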
async function commentOnPR(core: CoreInject, octokit: Octokit, flaws: LinkFlaw[], opts: Options) {
  const { actionContext = {} } = opts
  const { owner, repo } = actionContext
  const pullNumber = actionContext?.pull_request?.number
  if (!owner || !repo || !pullNumber) {
    core.warning(`commentOnPR called outside of PR action runner context. Not creating comment.`)
    return
  }

  // A deliberate HTML comment, invisible in the rendered PR comment, used to
  // find our own previous comment again so it can be updated in place.
  const findAgainSymbol = '<!-- Find this deliberate HTML comment to update PR comment -->'

  const body = flawIssueDisplay(flaws, opts, false)

  const { data } = await octokit.rest.issues.listComments({
    owner,
    repo,
    issue_number: pullNumber,
  })
  let previousCommentId
  for (const { body, id } of data) {
    if (body && body.includes(findAgainSymbol)) {
      previousCommentId = id
    }
  }

  // Since failed external urls aren't included in the PR comment, the body may be empty
  if (!body) {
    core.info('No flaws qualify for comment')
    if (previousCommentId) {
      const nothingComment = 'Previous broken links comment now moot. 👌😙'
      await octokit.rest.issues.updateComment({
        owner,
        repo,
        comment_id: previousCommentId,
        body: `${nothingComment}\n\n${findAgainSymbol}`,
      })
      core.info(`Updated comment on PR: ${pullNumber} (${previousCommentId})`)
    }
    return
  }

  if (previousCommentId) {
    const noteComment = '(*The original automated comment was updated*)'
    await octokit.rest.issues.updateComment({
      owner,
      repo,
      comment_id: previousCommentId,
      body: `${body}\n\n${noteComment}\n\n${findAgainSymbol}`,
    })
    core.info(`Updated comment on PR: ${pullNumber} (${previousCommentId})`)
    return
  }

  try {
    await octokit.rest.issues.createComment({
      owner,
      repo,
      issue_number: pullNumber,
      body: `${body}\n\n${findAgainSymbol}`,
    })
    core.info(`Created comment on PR: ${pullNumber}`)
  } catch (error) {
    core.setFailed(`Error commenting on PR when there are flaws`)
    throw error
  }
}
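// Illustrative only: the Markdown that flawIssueDisplay() below builds looks
// roughly like this, per its format strings (hypothetical paths and hrefs):
//
//   2 broken links found in [this](https://github.com/...) workflow.
//
//   ### In `content/get-started/quickstart.md`
//
//    - Href: [/en/some/broken/path](/en/some/broken/path)
//    - Text: some link text
//    - Flaw: `Broken link`
//    - On permalinks
//    - `/en/get-started/quickstart`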
function flawIssueDisplay(flaws: LinkFlaw[], opts: Options, mentionExternalExclusionList = true) {
  let output = ''
  let flawsToDisplay = 0

  type LinkFlawWithPermalink = {
    // page?: Page
    // permalink?: Permalink
    href?: string
    url?: string
    text?: string
    src?: string
    flaw: Flaw
    permalinkHrefs: string[]
  }

  // Group the broken links by page
  const hrefsOnPageGroup: Record<string, Record<string, LinkFlawWithPermalink>> = {}
  for (const { page, permalink, href, text, src, flaw } of flaws) {
    // When we don't want to include external links in PR comments
    if (opts.commentLimitToExternalLinks && !flaw.isExternal) {
      continue
    }
    flawsToDisplay++
    const pageKey = page.fullPath
    if (!hrefsOnPageGroup[pageKey]) {
      hrefsOnPageGroup[pageKey] = {}
    }
    const linkKey = href || src
    if (!hrefsOnPageGroup[pageKey][linkKey]) {
      hrefsOnPageGroup[pageKey][linkKey] = { href, text, src, flaw, permalinkHrefs: [] }
    }
    if (!hrefsOnPageGroup[pageKey][linkKey].permalinkHrefs.includes(permalink.href)) {
      hrefsOnPageGroup[pageKey][linkKey].permalinkHrefs.push(permalink.href)
    }
  }

  // Don't comment if there are no qualifying flaws
  if (!flawsToDisplay) {
    return ''
  }

  // Build the flaw display text
  for (const [pagePath, pageHrefs] of Object.entries(hrefsOnPageGroup)) {
    const fullPath = prettyFullPath(pagePath)
    output += `\n\n### In \`${fullPath}\`\n`
    for (const [, hrefObj] of Object.entries(pageHrefs)) {
      if (hrefObj.href) {
        output += `\n\n - Href: [${hrefObj.href}](${hrefObj.href})`
        output += `\n - Text: ${hrefObj.text}`
      } else if (hrefObj.src) {
        output += `\n\n - Image src: [${hrefObj.src}](${hrefObj.src})`
      } else {
        output += `\n\n - WORKFLOW ERROR: Flaw has neither 'href' nor 'src'`
      }
      output += `\n - Flaw: \`${
        hrefObj.flaw.CRITICAL ? hrefObj.flaw.CRITICAL : hrefObj.flaw.WARNING
      }\``
      output += `\n - On permalinks`
      for (const permalinkHref of hrefObj.permalinkHrefs) {
        output += `\n - \`${permalinkHref}\``
      }
    }
  }

  if (mentionExternalExclusionList) {
    output +=
      '\n\n---\n\nIf any link reported in this issue is not actually broken ' +
      'and repeatedly shows up on reports, consider making a PR that adds it as an exception to `src/links/lib/excluded-links.ts`. ' +
      'For more information, see [Fixing broken links in GitHub user docs](https://github.com/github/docs/blob/main/src/links/lib/README.md).'
  }

  output = `${flawsToDisplay} broken${
    opts.commentLimitToExternalLinks ? ' **external** ' : ' '
  }links found in [this](${opts.actionUrl}) workflow.\n${output}`

  // GitHub issue/comment bodies are limited to 65536 characters,
  // so truncate well before that.
  if (output.length > 60000) {
    output = output.slice(0, 60000) + '\n\n---\n\nOUTPUT TRUNCATED'
  }

  return output
}

function printGlobalCacheHitRatio(core: CoreInject) {
  const hits = globalCacheHitCount
  const misses = globalCacheMissCount
  // It could be that the files that were tested didn't have a single
  // link in them. In that case, there are no cache misses or hits at all.
  // So avoid division by zero.
  if (misses + hits) {
    core.debug(
      `Cache hit ratio: ${hits.toLocaleString()} of ${(misses + hits).toLocaleString()} (${(
        (100 * hits) /
        (misses + hits)
      ).toFixed(1)}%)`,
    )
  }
}

function getPages(
  pageList: Page[],
  languages: string[],
  filters: string[],
  files: string[],
  max: number | undefined,
) {
  return pageList
    .filter((page: Page) => {
      if (languages.length && !languages.includes(page.languageCode)) {
        return false
      }

      if (filters.length && !filters.find((filter) => page.relativePath.includes(filter))) {
        return false
      }

      if (
        files.length &&
        // The reason for checking each file against the `relativePath`
        // or the `fullPath` is to make it flexible for the user.
        !files.find((file) => {
          if (page.relativePath === file) return true
          if (page.fullPath === file) return true
          // The `page.relativePath` will always be relative to the
          // containing directory it came from and might not be relative
          // to the repo root. I.e.
          // `content/education/quickstart.md` is the path relative to
          // the repo root. But the `page.relativePath` will
          // in this case be `education/quickstart.md`.
          // So give it one last chance to relate to the repo root.
          // This is important because you might use `git diff --name-only`
          // to get the list of files to focus specifically on.
          if (path.join(CONTENT_ROOT, page.relativePath) === path.resolve(file)) return true

          return false
        })
      ) {
        return false
      }

      return true
    })
    .slice(0, max ? Math.min(max, pageList.length) : pageList.length)
}
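// Illustrative only: the three spellings of the same file that getPages()
// above accepts (hypothetical paths):
//
//   files: ['education/quickstart.md']          // page.relativePath
//   files: ['/checkout/content/education/quickstart.md'] // page.fullPath
//   files: ['content/education/quickstart.md']  // repo-root relative, e.g. from `git diff --name-only`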
async function processPage(
  core: CoreInject,
  page: Page,
  pageMap: PageMap,
  redirects: Redirects,
  opts: Options,
  db: DBType,
  versions: string[],
) {
  const { verbose, verboseUrl, bail } = opts

  const allFlawsEach = await Promise.all(
    page.permalinks
      .filter((permalink) => {
        return !versions.length || versions.includes(permalink.pageVersion)
      })
      .map((permalink) => {
        return processPermalink(core, permalink, page, pageMap, redirects, opts, db)
      }),
  )

  const allFlaws = allFlawsEach.flat()

  if (allFlaws.length > 0) {
    if (verbose) {
      printFlaws(core, allFlaws, { verboseUrl })
    }
    if (bail) {
      if (!verbose) {
        console.warn('Use --verbose to see the flaws before it exits')
      }
      throw new Error(`Flaw(s) found in ${page.relativePath}`)
    }
  }

  return allFlaws
}
async function processPermalink(
  core: any,
  permalink: Permalink,
  page: Page,
  pageMap: PageMap,
  redirects: Redirects,
  opts: Options,
  db: DBType,
) {
  const {
    level = 'critical',
    checkAnchors,
    checkImages,
    checkExternalLinks,
    verbose,
    patient,
    externalServerErrorsAsWarning,
  } = opts
  let html = ''
  try {
    html = await renderInnerHTML(page, permalink)
  } catch (error) {
    console.warn(
      `The error happened trying to render ${page.relativePath} (permalink: ${permalink.href})`,
    )
    throw error
  }
  const $ = cheerio.load(html, { xmlMode: true })
  const flaws: LinkFlaw[] = []
  const links: cheerio.Element[] = []
  $('a[href]').each((i, link) => {
    links.push(link)
  })
  const newFlaws: LinkFlaw[] = await Promise.all(
    links.map(async (link) => {
      const { href } = (link as cheerio.TagElement).attribs

      // The global cache can't be used for anchor links because they
      // depend on the page each one is rendered on
      if (!href.startsWith('#')) {
        if (globalHrefCheckCache.has(href)) {
          globalCacheHitCount++
          return globalHrefCheckCache.get(href)
        }
        globalCacheMissCount++
      }

      const flaw = await checkHrefLink(
        core,
        href,
        $,
        redirects,
        pageMap,
        checkAnchors,
        checkExternalLinks,
        externalServerErrorsAsWarning,
        permalink,
        { verbose, patient },
        db,
      )

      if (flaw) {
        if (level === 'critical' && !flaw.CRITICAL) {
          return
        }
        const text = $(link).text()
        if (!href.startsWith('#')) {
          globalHrefCheckCache.set(href, { href, flaw, text })
        }
        return { href, flaw, text }
      } else {
        if (!href.startsWith('#')) {
          globalHrefCheckCache.set(href, flaw)
        }
      }
    }),
  )
  for (const flaw of newFlaws) {
    if (flaw) {
      flaws.push(Object.assign(flaw, { page, permalink }))
    }
  }

  if (checkImages) {
    $('img[src]').each((i, img) => {
      let { src } = (img as cheerio.TagElement).attribs

      // Images get a cache-busting prefix injected in the src,
      // e.g. <img src="/assets/cb-123456/images/foo.png">
      // We need to remove that, otherwise we can't look up the image
      // on disk.
      src = src.replace(/\/cb-\d+\//, '/')

      if (globalImageSrcCheckCache.has(src)) {
        globalCacheHitCount++
        return globalImageSrcCheckCache.get(src)
      }

      const flaw = checkImageSrc(src)

      globalImageSrcCheckCache.set(src, flaw)

      if (flaw) {
        if (level === 'critical' && !flaw.CRITICAL) {
          return
        }
        flaws.push({ permalink, page, src, flaw })
      }
    })
  }

  return flaws
}

async function uploadJsonFlawsArtifact(
  uploadArtifact: UploadArtifact,
  flaws: LinkFlaw[],
  { verboseUrl = null }: { verboseUrl?: string | null } = {},
  artifactName = 'all-rendered-link-flaws.json',
) {
  type PrintableLinkFlaw = {
    href?: string
    url?: string
    text?: string
    src?: string
    flaw?: Flaw
  }
  const printableFlaws: Record<string, PrintableLinkFlaw[]> = {}
  for (const { page, permalink, href, text, src, flaw } of flaws) {
    const fullPath = prettyFullPath(page.fullPath)

    if (!(fullPath in printableFlaws)) {
      printableFlaws[fullPath] = []
    }
    if (href) {
      printableFlaws[fullPath].push({
        href,
        url: verboseUrl ? new URL(permalink.href, verboseUrl).toString() : permalink.href,
        text,
        flaw,
      })
    } else if (src) {
      printableFlaws[fullPath].push({
        src,
      })
    }
  }
  const message = JSON.stringify(printableFlaws, undefined, 2)
  return uploadArtifact(artifactName, message)
}

function printFlaws(
  core: CoreInject,
  flaws: LinkFlaw[],
  { verboseUrl }: { verboseUrl?: string | undefined } = {},
) {
  let previousPage = null
  let previousPermalink = null

  for (const { page, permalink, href, text, src, flaw } of flaws) {
    const fullPath = prettyFullPath(page.fullPath)

    if (page !== previousPage) {
      core.info(`PAGE: ${chalk.bold(fullPath)}`)
    }
    previousPage = page

    if (href) {
      if (previousPermalink !== permalink.href) {
        if (verboseUrl) {
          core.info(` URL: ${new URL(permalink.href, verboseUrl).toString()}`)
        } else {
          core.info(` PERMALINK: ${permalink.href}`)
        }
      }
      previousPermalink = permalink.href
      core.info(` HREF: ${chalk.bold(href)}`)
      core.info(` TEXT: ${text}`)
    } else if (src) {
      core.info(` IMG SRC: ${chalk.bold(src)}`)
    } else {
      throw new Error("Flaw has neither 'href' nor 'src'")
    }
    core.info(` FLAW: ${flaw.CRITICAL ? chalk.red(flaw.CRITICAL) : chalk.yellow(flaw.WARNING)}`)
  }
}

// Given a full path, change it so it's relative to `cwd()`, so that you
// can take it from the output and paste it after something like `code ...here...`.
// The problem with displaying the full path is that it's quite noisy and
// takes up a lot of space. Sure, you can copy and paste it in front of
// `vi` or `ls` or `code`, but if we display it relative to `cwd()` you
// can still paste it to the next command without it taking up so much
// space.
function prettyFullPath(fullPath: string) {
  return path.relative(process.cwd(), fullPath)
}

// Per-run, in-memory caches keyed by href/src. These are separate from the
// disk-based lowdb cache, which is only used for external links.
const globalHrefCheckCache = new Map()
const globalImageSrcCheckCache = new Map()
let globalCacheHitCount = 0
let globalCacheMissCount = 0
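// Illustrative only (hypothetical values): what ends up in the in-memory
// caches above.
//
//   globalHrefCheckCache.set('/en/broken-path', { href: '/en/broken-path', flaw: { CRITICAL: 'Broken link' }, text: 'link text' })
//   globalHrefCheckCache.set('/en/good-path', undefined) // a cached "no flaw" verdict
//   globalImageSrcCheckCache.set('/assets/images/help/foo.png', undefined)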
async function checkHrefLink(
  core: any,
  href: string,
  $: cheerio.Root,
  redirects: Redirects,
  pageMap: PageMap,
  checkAnchors = false,
  checkExternalLinks = false,
  externalServerErrorsAsWarning: string | undefined | null = null,
  permalink: Permalink,
  { verbose = false, patient = false }: { verbose?: boolean; patient?: boolean } = {},
  db: DBType | null = null,
): Promise<Flaw | undefined> {
  // This function handles hrefs in all the following forms:
  //
  //   same-article links:
  //     1. '#'
  //     2. '#anchor'
  //     3. '/to/this/article#anchor'
  //   different-article links:
  //     4. '/some/path/article#anchor' (currently not supported)
  //     5. '/some/path/article'
  //   external links:
  //     6. 'https://example.com' (external link)

  const [pathFragment, hashFragment] = href.split('#')
  const hash = '#' + hashFragment // the hash is the part that starts with `#`

  // This conditional handles cases in which the link is to the current article (cases 1-3 above)
  if (checkAnchors && (!pathFragment || pathFragment === permalink.href)) {
    // cases covered by this part of the conditional:
    // 1. '#'
    if (hash === '#') {
      return { WARNING: 'Link is just an empty `#`' }
    }
    // cases covered by this part of the conditional:
    // 2. '#anchor'
    // 3. '/to/this/article#anchor'
    else {
      // Some pages are a mix of Markdown and React components. On its own,
      // the Markdown might appear broken but when combined with automated
      // React rendering it might work. Best to stay out of it.
      const avoid =
        permalink &&
        ((permalink.href.includes('/rest/') && !permalink.href.includes('/rest/guides/')) ||
          permalink.href.includes('/webhooks-and-events/webhooks/webhook-events-and-payloads') ||
          permalink.href.includes('/graphql/reference') ||
          permalink.href.includes('/code-security/codeql-cli/codeql-cli-manual/') ||
          permalink.href.includes(
            '/apps/maintaining-github-apps/modifying-a-github-app-registration',
          ) ||
          permalink.href.includes(
            '/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning',
          ) ||
          permalink.href.includes(
            '/site-policy/github-company-policies/github-statement-against-modern-slavery-and-child-labor',
          ) ||
          permalink.href.includes('/site-policy/content-removal-policies/dmca-takedown-policy') ||
          permalink.href.includes('/early-access/'))

      // You don't need a DOM ID (or `<a name="top">`) for `<a href="#top">`
      // to work in all modern browsers.
      if (hash !== '#top' && !avoid) {
        // If the link is `#foo` it could either match `<element id="foo">`
        // or it could match `<a name="foo">`.
        const countDOMItems = $(hash).length + $(`a[name="${hash.slice(1)}"]`).length
        if (countDOMItems === 0) {
          return { CRITICAL: `Anchor on the same page can't be found by ID` }
        } else if (countDOMItems > 1) {
          return { CRITICAL: `Matches multiple points in the page` }
        }
      }
    }
  }

  // This conditional handles cases in which the link is to a different article or external (cases 4-6 above)
  else {
    // cases covered by this part of the conditional:
    // 4. '/some/path/article#anchor' (currently not supported)
    // 5. '/some/path/article'
    if (href.startsWith('/')) {
      const pathname = new URL(href, 'http://example.com').pathname // we drop any hashes due to `.pathname`
      // We don't currently support hashes for other articles we link to: /some/path/article#anchor

      // Remember, if the Markdown has something like
      //
      //   See [my link](/some/page/)
      //
      // in the post-processing, that will actually become
      //
      //   See <a href="/some/page">my link</a>
      //
      // But, if that link was a redirect, it would have been left
      // untouched.
      if (pathname.endsWith('/')) {
        const whatifPathname = pathname.slice(0, -1)
        if (getRedirect(whatifPathname, { redirects, pages: pageMap })) {
          return {
            WARNING: `Redirect to ${getRedirect(whatifPathname, { redirects, pages: pageMap })}`,
          }
        } else if (!pageMap[whatifPathname]) {
          if (!deprecatedVersionPrefixesRegex.test(whatifPathname)) {
            return { CRITICAL: 'Broken link' }
          }
        }
        return { WARNING: 'Links with a trailing / will always redirect' }
      } else {
        const firstPart = pathname.split('/')[1]
        if (STATIC_PREFIXES[firstPart]) {
          const staticFilePath = path.join(
            STATIC_PREFIXES[firstPart],
            pathname.split(path.sep).slice(2).join(path.sep),
          )
          if (!fs.existsSync(staticFilePath)) {
            return { CRITICAL: `Static file not found ${staticFilePath} (${pathname})` }
          }
        } else if (getRedirect(pathname, { redirects, pages: pageMap })) {
          return { WARNING: `Redirect to ${getRedirect(pathname, { redirects, pages: pageMap })}` }
        } else if (!pageMap[pathname]) {
          if (deprecatedVersionPrefixesRegex.test(pathname)) {
            return
          }
          return { CRITICAL: 'Broken link' }
        }
      }
    }
    // cases covered by this part of the conditional:
    // 6. 'https://example.com' (external link)
    else if (checkExternalLinks) {
      if (!href.startsWith('https://')) {
        return { WARNING: `Will not check external URLs that are not HTTPS (${href})` }
      }
      if (linksToSkip(href)) {
        return
      }
      const { ok, ...info } = await checkExternalURLCached(core, href, { verbose, patient }, db)
      if (!ok) {
        // By default, a not-OK problem with an external link is CRITICAL,
        // but if it was a `requestError` or the statusCode was >= 500,
        // then downgrade it to WARNING.
        let problem = 'CRITICAL'
        if (externalServerErrorsAsWarning) {
          if (
            (info.statusCode && info.statusCode >= 500) ||
            (info.requestError && isTemporaryRequestError(info.requestError))
          ) {
            problem = 'WARNING'
          }
        }
        return { [problem]: `Broken external link (${JSON.stringify(info)})`, isExternal: true }
      }
    }
  }
}
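// Illustrative only: the kinds of values checkHrefLink() above resolves to
// (hypothetical messages):
//
//   undefined                                 // link is fine, skipped, or a deprecated version
//   { WARNING: 'Redirect to /en/new/path' }
//   { CRITICAL: 'Broken link' }
//   { CRITICAL: 'Broken external link ({"statusCode":404})', isExternal: true }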
// Return true if the request error is sufficiently temporary. For example,
// a request to `https://exammmmple.org` will fail with `ENOTFOUND` because
// the DNS entry doesn't exist, which means there isn't much hope if you
// simply try again later.
// However, an `ETIMEDOUT` means it could work, it just didn't this time, but
// it might if we try again a different hour or day.
function isTemporaryRequestError(requestError: string | undefined) {
  if (typeof requestError === 'string') {
    // See https://betterstack.com/community/guides/scaling-nodejs/nodejs-errors/
    // for a definition of each one.
    const errorEnums = ['ECONNRESET', 'ECONNREFUSED', 'ETIMEDOUT', 'ECONNABORTED']
    return errorEnums.some((enum_) => requestError.includes(enum_))
  }
  return false
}
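// Illustrative only (hypothetical error strings, matched by substring above):
//
//   isTemporaryRequestError('connect ETIMEDOUT 93.184.216.34:443')  // true
//   isTemporaryRequestError('getaddrinfo ENOTFOUND exammmmple.org') // false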
// This disk-based memoization can't live inside checkExternalURL because
// that function caches (and returns) Promises: it already collates multiple
// lookups of the same URL under one in-flight cache key.
async function checkExternalURLCached(
  core: CoreInject,
  href: string,
  { verbose, patient }: { verbose?: boolean; patient?: boolean },
  db: DBType | null,
) {
  const cacheMaxAge = EXTERNAL_LINK_CHECKER_MAX_AGE_MS
  const now = new Date().getTime()
  const url = href.split('#')[0]

  if (cacheMaxAge) {
    const tooOld = now - Math.floor(jitter(cacheMaxAge, 10))
    if (db && db.data.urls[url]) {
      if (db.data.urls[url].timestamp > tooOld) {
        if (verbose) {
          core.info(`External URL ${url} in cache`)
        }
        return db.data.urls[url].result
      } else {
        if (verbose) {
          core.info(`External URL ${url} in cache but too old`)
        }
        // Delete it so the cache file doesn't bloat infinitely
        delete db.data.urls[url]
      }
    }
  }

  const result = await checkExternalURL(core, href, {
    verbose,
    patient,
  })

  if (cacheMaxAge) {
    // By only storing successful results in the cache, we give the system a
    // chance to try 4xx and 5xx errors another go next time.
    if (db && result.ok) {
      db.data.urls[url] = {
        timestamp: now,
        result,
      }
    }
  }

  return result
}

const _fetchCache = new Map()
async function checkExternalURL(
  core: CoreInject,
  url: string,
  { verbose = false, patient = false } = {},
) {
  if (!url.startsWith('https://')) throw new Error('Invalid URL')
  const cleanURL = url.split('#')[0]
  if (!_fetchCache.has(cleanURL)) {
    _fetchCache.set(cleanURL, innerFetch(core, cleanURL, { verbose, patient }))
  }
  return _fetchCache.get(cleanURL)
}

const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms))

// Global for recording which domains we get rate-limited on.
// For example, if you got rate limited on `something.github.com/foo`
// and now we're asked to fetch `something.github.com/bar`,
// it's good to know not to bother yet.
const _rateLimitedDomains = new Map()

async function innerFetch(
  core: CoreInject,
  url: string,
  config: { verbose?: boolean; useGET?: boolean; patient?: boolean; retries?: number } = {},
) {
  const { verbose, useGET, patient } = config

  const { hostname } = new URL(url)
  if (_rateLimitedDomains.has(hostname)) {
    await sleep(_rateLimitedDomains.get(hostname))
  }

  // The way `got` does retries:
  //
  //   sleep = 1000 * Math.pow(2, retry - 1) + Math.random() * 100
  //
  // So, it means:
  //
  //   1. ~1000ms
  //   2. ~2000ms
  //   3. ~4000ms
  //
  // ...if the limit we set is 3.
  // Our own timeout, in @/frame/middleware/timeout.js, defaults to 10 seconds.
  // So there's no point in trying more attempts than 3 because it would
  // just time out on the 10s. (i.e. 1000 + 2000 + 4000 + 8000 > 10,000)
  const retry = {
    limit: patient ? 6 : 2,
  }
  const timeout = { request: patient ? 10000 : 2000 }

  const headers = {
    'User-Agent':
      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36',
  }

  const retries = config.retries || 0
  const method = useGET ? 'GET' : 'HEAD'

  if (verbose) core.info(`External URL ${method}: ${url} (retries: ${retries})`)
  try {
    const r = await fetchWithRetry(
      url,
      {
        method,
        headers,
      },
      {
        retries: retry.limit,
        timeout: timeout.request,
        throwHttpErrors: false,
      },
    )

    if (verbose) {
      core.info(`External URL ${method} ${url}: ${r.status} (retries: ${retries})`)
    }

    // If we get rate limited, remember that this hostname is now
    // rate limited. And sleep for the number of seconds that the
    // `retry-after` header indicated.
    if (r.status === 429) {
      let sleepTime = Math.min(
        60_000,
        Math.max(
          10_000,
          r.headers.get('retry-after')
            ? getRetryAfterSleep(r.headers.get('retry-after'))
            : 1_000,
        ),
      )
      // Sprinkle a little jitter so it doesn't all start again
      // at the same time
      sleepTime += Math.random() * 10 * 1000
      // Give it a bit extra when we can be really patient
      if (patient) sleepTime += 30 * 1000

      _rateLimitedDomains.set(hostname, sleepTime + Math.random() * 10 * 1000)
      if (verbose)
        core.info(
          chalk.yellow(
            `Rate limited on ${hostname} (${url}). Sleeping for ${(sleepTime / 1000).toFixed(1)}s`,
          ),
        )
      await sleep(sleepTime)
      return innerFetch(core, url, Object.assign({}, config, { retries: retries + 1 }))
    } else {
      _rateLimitedDomains.delete(hostname)
    }

    // Perhaps the server doesn't support HEAD requests.
    // If so, try again with a regular GET.
    if ((r.status === 405 || r.status === 404 || r.status === 403) && !useGET) {
      return innerFetch(core, url, Object.assign({}, config, { useGET: true }))
    }

    if (verbose) {
      core.info((r.ok ? chalk.green : chalk.red)(`${r.status} on ${url}`))
    }
    return { ok: r.ok, statusCode: r.status }
  } catch (err) {
    if (err instanceof Error) {
      if (verbose) {
        core.info(chalk.yellow(`Request Error (${err.message}) on ${url}`))
      }
      return { ok: false, requestError: err.message }
    }
    throw err
  }
}
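// Illustrative only: the two `Retry-After` forms getRetryAfterSleep() below
// understands (hypothetical values):
//
//   getRetryAfterSleep('120')                           // -> 120000 ms (delta-seconds form)
//   getRetryAfterSleep('Wed, 01 Jan 2025 00:00:10 GMT') // -> ms until that HTTP-date (0 if in the past)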
// Return the number of milliseconds from a `Retry-After` header value
function getRetryAfterSleep(headerValue: string | null) {
  if (!headerValue) return 0
  let ms = Math.round(parseFloat(headerValue) * 1000)
  if (isNaN(ms)) {
    // Not a number of seconds; try parsing it as an HTTP-date
    const nextDate = new Date(headerValue)
    ms = Math.max(0, nextDate.getTime() - new Date().getTime())
  }
  return ms
}

function checkImageSrc(src: string) {
  if (!src.startsWith('/') && !src.startsWith('http')) {
    return { CRITICAL: 'Image path is not absolute. Should start with a /' }
  }
  if (src.startsWith('http://')) {
    return { CRITICAL: "Don't use insecure HTTP:// for external images" }
  }
  if (!src.startsWith('/')) {
    return { WARNING: "External images can't be checked" }
  }
  const pathname = new URL(src, 'http://example.com').pathname
  const prefix = pathname.split('/')[1]
  if (prefix in STATIC_PREFIXES) {
    const staticFilePath = path.join(
      STATIC_PREFIXES[prefix],
      pathname.split(path.sep).slice(2).join(path.sep),
    )
    if (!fs.existsSync(staticFilePath)) {
      return { CRITICAL: `Static file not found (${pathname})` }
    }
  } else {
    return { WARNING: `Unrecognized image src prefix (${prefix})` }
  }
}

function summarizeFlaws(core: CoreInject, flaws: LinkFlaw[]) {
  if (flaws.length) {
    core.info(
      chalk.bold(
        `Found ${flaws.length.toLocaleString()} flaw${flaws.length === 1 ? '' : 's'} in total.`,
      ),
    )
  } else {
    core.info(chalk.green('No flaws found! 💖'))
  }
}

function summarizeCounts(core: CoreInject, pages: Page[], tookSeconds: number) {
  const count = pages.map((page) => page.permalinks.length).reduce((a, b) => a + b, 0)
  core.info(
    `Tested ${count.toLocaleString()} permalinks across ${pages.length.toLocaleString()} pages`,
  )
  core.info(`Took ${Math.floor(tookSeconds)} seconds. (~${(tookSeconds / 60).toFixed(1)} minutes)`)

  const permalinksPerSecond = count / tookSeconds
  core.info(`~${permalinksPerSecond.toFixed(1)} permalinks per second.`)

  const pagesPerSecond = pages.length / tookSeconds
  core.info(`~${pagesPerSecond.toFixed(1)} pages per second.`)
}

// Fisher-Yates (in-place) shuffle
function shuffle(array: any[]) {
  let currentIndex = array.length
  let randomIndex

  // While there remain elements to shuffle...
  while (currentIndex !== 0) {
    // Pick a remaining element...
    randomIndex = Math.floor(Math.random() * currentIndex)
    currentIndex--

    // And swap it with the current element.
    ;[array[currentIndex], array[randomIndex]] = [array[randomIndex], array[currentIndex]]
  }

  return array
}
async function renderInnerHTML(page: Page, permalink: Permalink) {
  const next = () => {}
  const res = {}

  const pagePath = permalink.href
  const context: Context = {}
  const req = {
    path: pagePath,
    language: permalink.languageCode,
    pagePath,
    cookies: {},
    context,
  }

  // This will create and set `req.context = {...}`
  await contextualize(req as ExtendedRequest, res as Response, next)
  await shortVersions(req as ExtendedRequest, res as Response, next)
  req.context.page = page
  features(req as ExtendedRequest, res as Response, next)
  req.context.relativePath = page.relativePath

  const guts = [page.rawIntro, page.rawPermissions, page.markdown].filter(Boolean).join('\n').trim()

  // These lines do what the ubiquitous `renderContent` function does,
  // but at an absolute minimum to get a string of HTML.
  const markdown = await liquid.parseAndRender(guts, req.context)
  const processor = createMinimalProcessor(req.context)
  const vFile = await processor.process(markdown)
  return vFile.toString()
}

export default main