/* See function main in this file for documentation */
import fs from 'fs'
import path from 'path'
import cheerio from 'cheerio'
import coreLib from '@actions/core'
import { fetchWithRetry } from '@/frame/lib/fetch-utils'
import chalk from 'chalk'
import { JSONFilePreset } from 'lowdb/node'
import { type Octokit } from '@octokit/rest'
import type { Response } from 'express'
import type { ExtendedRequest, Page, Permalink, Context } from '@/types'
import shortVersions from '@/versions/middleware/short-versions'
import contextualize from '@/frame/middleware/context/context'
import features from '@/versions/middleware/features'
import getRedirect from '@/redirects/lib/get-redirect'
import warmServer from '@/frame/lib/warm-server'
import { liquid } from '@/content-render/index'
import { deprecated } from '@/versions/lib/enterprise-server-releases'
import excludedLinks from '@/links/lib/excluded-links'
import { getEnvInputs, boolEnvVar } from '@/workflows/get-env-inputs'
import { debugTimeEnd, debugTimeStart } from './debug-time-taken'
import { uploadArtifact as uploadArtifactLib } from './upload-artifact'
import github from '@/workflows/github'
import { getActionContext } from '@/workflows/action-context'
import { createMinimalProcessor } from '@/content-render/unified/processor'
import { createReportIssue, linkReports } from '@/workflows/issue-report'
import { type CoreInject } from '@/links/scripts/action-injections'
type Flaw = {
WARNING?: string
CRITICAL?: string
isExternal?: boolean
}
type LinkFlaw = {
page: Page
permalink: Permalink
href?: string
url?: string
text?: string
src: string
flaw: Flaw
}
type Redirects = Record<string, string>
type PageMap = Record<string, Page>
type UploadArtifact = (name: string, message: string) => void
type Options = {
level?: string
files?: string[]
random?: boolean
language?: string | string[]
filter?: string[]
version?: string | string[]
max?: number
linkReports?: boolean
actionUrl?: string
verbose?: boolean
checkExternalLinks?: boolean
createReport?: boolean
failOnFlaw?: boolean
shouldComment?: boolean
reportRepository?: string
reportAuthor?: string
reportLabel?: string
checkAnchors?: boolean
checkImages?: boolean
patient?: boolean
externalServerErrorsAsWarning?: string
verboseUrl?: string
bail?: boolean
commentLimitToExternalLinks?: boolean
actionContext?: any
}
const STATIC_PREFIXES: Record<string, string> = {
assets: path.resolve('assets'),
public: path.resolve(path.join('src', 'graphql', 'data')),
}
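// For example, a link to a (hypothetical) '/assets/images/foo.png' is
// looked up at '<repo>/assets/images/foo.png' on disk, and a
// '/public/schema.json' at '<repo>/src/graphql/data/schema.json'.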
// Sanity check that these are valid paths
Object.entries(STATIC_PREFIXES).forEach(([key, value]) => {
if (!fs.existsSync(value)) {
throw new Error(`Can't find static prefix (${key}): ${value}`)
}
})
// External link checks are cached to disk, with a default max age of
// 7 days. Setting this env var to 0 disables the disk-based caching
// of external links.
const EXTERNAL_LINK_CHECKER_MAX_AGE_MS =
parseInt(process.env.EXTERNAL_LINK_CHECKER_MAX_AGE_DAYS || '7') * 24 * 60 * 60 * 1000
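// With the default of 7 days, that's 7 * 24 * 60 * 60 * 1000 = 604,800,000 ms.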
const EXTERNAL_LINK_CHECKER_DB =
process.env.EXTERNAL_LINK_CHECKER_DB || 'external-link-checker-db.json'
type Data = {
urls: {
[url: string]: {
timestamp: number
result: {
ok: boolean
statusCode: number
}
}
}
}
const defaultData: Data = { urls: {} }
const externalLinkCheckerDB = await JSONFilePreset<Data>(EXTERNAL_LINK_CHECKER_DB, defaultData)
type DBType = typeof externalLinkCheckerDB
// Given a number and a percentage, return the number randomly changed
// by at most that percentage, up or down.
// E.g. `jitter(55, 10)` will return a value in the range `[55 - 55/10, 55 + 55/10]`.
// This is useful so the cache timestamps, which all start from the same
// day, don't ALL expire on the same day. Instead they expire in a bit of
// a "random pattern", so you don't get an all-or-nothing expiry.
function jitter(base: number, percentage: number) {
const r = percentage / 100
const negative = Math.random() > 0.5 ? -1 : 1
return base + base * Math.random() * r * negative
}
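// For example, with the default 7-day max age and 10% jitter,
//
//   jitter(604_800_000, 10) // => somewhere in [544_320_000, 665_280_000] ms
//
// i.e. each cache entry effectively expires after ~6.3 to ~7.7 days.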
// Return a function that checks, as quickly as possible, whether a
// given href should be skipped.
// Built this way so we can use a `Set` for exact matches and an
// `Array.prototype.some()` scan for prefix matches, for a speedier check.
function linksToSkipFactory() {
const set = new Set(excludedLinks.map(({ is }) => is).filter(Boolean))
const arr = excludedLinks.map(({ startsWith }) => startsWith).filter(Boolean)
return (href: string) => set.has(href) || arr.some((v) => v && href.startsWith(v))
}
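// For example, given (hypothetical) excluded-links entries like
//
//   { is: 'https://example.com/exact' }
//   { startsWith: 'https://example.com/docs/' }
//
// the returned function is true for 'https://example.com/exact' and for
// any href that starts with 'https://example.com/docs/'.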
const linksToSkip = linksToSkipFactory()
const CONTENT_ROOT = path.resolve('content')
const deprecatedVersionPrefixesRegex = new RegExp(
`enterprise(-server@|/)(${deprecated.join('|')})(/|$)`,
)
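// For example, if '2.22' is one of the deprecated releases, this matches
// hrefs containing 'enterprise-server@2.22' or 'enterprise/2.22'
// (followed by a '/' or the end of the href), but not hrefs for
// currently supported versions.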
// When this file is invoked directly from the action, as opposed to being imported
if (import.meta.url.endsWith(process.argv[1])) {
// Optional env vars
const {
ACTION_RUN_URL,
LEVEL,
FILES_CHANGED,
REPORT_REPOSITORY,
REPORT_AUTHOR,
REPORT_LABEL,
EXTERNAL_SERVER_ERRORS_AS_WARNINGS,
CHECK_ANCHORS,
} = process.env
const octokit = github()
// Parse changed files JSON string
let files
if (FILES_CHANGED) {
const fileList = JSON.parse(FILES_CHANGED)
if (Array.isArray(fileList) && fileList.length > 0) {
files = fileList
} else {
console.warn(`No changed files found in PR: ${FILES_CHANGED}. Exiting...`)
process.exit(0)
}
}
const opts: Options = {
level: LEVEL,
files,
verbose: true,
linkReports: true,
checkImages: true,
checkAnchors: Boolean(CHECK_ANCHORS),
patient: boolEnvVar('PATIENT'),
random: false,
language: 'en',
actionUrl: ACTION_RUN_URL,
checkExternalLinks: boolEnvVar('CHECK_EXTERNAL_LINKS'),
shouldComment: boolEnvVar('SHOULD_COMMENT'),
commentLimitToExternalLinks: boolEnvVar('COMMENT_LIMIT_TO_EXTERNAL_LINKS'),
failOnFlaw: boolEnvVar('FAIL_ON_FLAW'),
createReport: boolEnvVar('CREATE_REPORT'),
reportRepository: REPORT_REPOSITORY,
reportLabel: REPORT_LABEL,
reportAuthor: REPORT_AUTHOR,
actionContext: getActionContext(),
externalServerErrorsAsWarning: EXTERNAL_SERVER_ERRORS_AS_WARNINGS,
}
if (opts.shouldComment || opts.createReport) {
// `GITHUB_TOKEN` is optional in general, but if you need the token to
// post a comment or open an issue report and it's missing, you get
// cryptic error messages from Octokit. So check for it up front.
getEnvInputs(['GITHUB_TOKEN'])
}
main(coreLib, octokit, uploadArtifactLib, opts)
}
/*
* Renders all or specified pages to gather all links on them and verify them.
* Checks internal links deterministically using filesystem and external links via external requests.
* Links are considered broken for reporting and commenting if they are broken at the specified "level".
* e.g. redirects are considered a "warning" while 404s are considered "critical"
*
* When there are broken links (flaws) this action can:
* 1. Create a report issue in a specified reportRepository and link it to previous reportIssues
* 2. Create a comment similar to a report on a PR that triggered this action
* 3. Fail using core.setFailed when there are broken links
*
* opts:
* level {"warning" | "critical"} Counts links as "flaws" based on this value and status criteria
* files {Array<string>} - Limit link checking to specific files (usually changed in PR)
* language {string | Array<string>} - Only render pages in the given language (or array of languages)
* checkExternalLinks {boolean} - Check non-docs.github.com URLs (takes significantly longer)
* checkImages {boolean} - Check image src URLs
* failOnFlaw {boolean} - When true, fail via core.setFailed when links are broken according to level
* shouldComment {boolean} - When true, attempt to comment the flaws on the PR that triggered the action
* commentLimitToExternalLinks {boolean} - When true, the PR comment only includes external links
* createReport {boolean} - Create an issue in reportRepository listing the links considered broken (flaws)
* linkReports {boolean} - When createReport is true, link the issue report to previous report(s) via comments
* reportRepository {string} - Repository in form of "owner/repo-name" that report issue will be created in
* reportLabel {string} - Label assigned to the report issue
* reportAuthor {string} - Expected author of previous report issue for linking reports (a bot user like docs-bot)
* actionUrl {string} - Used to link report or comment to the action instance for debugging
* actionContext {object} - Event payload context when run from action or injected. Should include { repo, owner }
* verbose {boolean} - Set to true for more verbose logging
* random {boolean} - Randomize page order for debugging when true
* patient {boolean} - Wait longer and retry more times for rate-limited external URLs
* bail {boolean} - Throw an error on the first page (not permalink) that has >0 flaws
* externalServerErrorsAsWarning {boolean} - Treat >=500 errors or temporary request errors as warnings
* filter {Array<string>} - Strings to match against the pages' relativePath
* version {Array<string>} - Only check pages in certain versions
*
*/
async function main(
core: any,
octokit: Octokit,
uploadArtifact: UploadArtifact,
opts: Options = {},
) {
const {
level = 'warning',
files = [],
random,
language = 'en',
filter,
version,
max,
verbose,
checkExternalLinks = false,
createReport = false,
failOnFlaw = false,
shouldComment = false,
reportRepository = 'github/docs-content',
reportAuthor = 'docs-bot',
reportLabel = 'broken link report',
} = opts
// Note! The reason we're using `warmServer()` in this script,
// even though there's no server involved, is that
// the `contextualize()` function calls it.
// And because `warmServer()` is idempotent, it's cheap to call it
// more than once, so calling it here first costs us nothing.
// If we'd manually do the same operations that `warmServer()` does
// here (e.g. `loadPageMap()`), we'd end up having to do it all over
// again the next time `contextualize()` is called.
const { redirects, pages: pageMap, pageList } = await warmServer([])
if (files.length) {
core.debug(`Limiting to files list: ${files.join(', ')}`)
}
let languages = language
if (!Array.isArray(languages)) {
languages = [languages]
}
const filters = filter || []
if (filters && !Array.isArray(filters)) {
throw new Error(`filters, ${filters} is not an array`)
}
let versions = version || []
if (versions && typeof versions === 'string') {
versions = [versions]
} else if (!Array.isArray(versions)) {
throw new Error(`versions, '${version}' is not an array`)
}
if (random) {
shuffle(pageList)
}
debugTimeStart(core, 'getPages')
const pages = getPages(pageList, languages, filters, files, max)
debugTimeEnd(core, 'getPages')
if (checkExternalLinks && pages.length >= 100) {
core.warning(
`Warning! Checking external URLs can be time-consuming. You're testing ${pages.length} pages.`,
)
}
await externalLinkCheckerDB.read()
if (verbose && checkExternalLinks) {
core.info(`Checking of external links is cached to ${EXTERNAL_LINK_CHECKER_DB}`)
core.info(
`External link cache max age is ${
EXTERNAL_LINK_CHECKER_MAX_AGE_MS / 1000 / 60 / 60 / 24
} days`,
)
let countNotTooOld = 0
let countTooOld = 0
for (const { timestamp } of Object.values(externalLinkCheckerDB.data.urls || {})) {
const age = Date.now() - timestamp
if (age > EXTERNAL_LINK_CHECKER_MAX_AGE_MS) {
countTooOld++
} else {
countNotTooOld++
}
}
core.info(
`External link cache: ${countNotTooOld.toLocaleString()} links are still fresh, ${countTooOld.toLocaleString()} are too old`,
)
}
debugTimeStart(core, 'processPages')
const t0 = new Date().getTime()
const flawsGroups = await Promise.all(
pages.map((page: Page) =>
processPage(
core,
page,
pageMap,
redirects,
opts,
externalLinkCheckerDB,
versions as string[],
),
),
)
const t1 = new Date().getTime()
debugTimeEnd(core, 'processPages')
await externalLinkCheckerDB.write()
const flaws = flawsGroups.flat()
printGlobalCacheHitRatio(core)
if (verbose) {
summarizeCounts(core, pages, (t1 - t0) / 1000)
core.info(`Checked ${(globalCacheHitCount + globalCacheMissCount).toLocaleString()} links`)
}
summarizeFlaws(core, flaws)
const uniqueHrefs = new Set(flaws.map((flaw) => flaw.href))
if (flaws.length > 0) {
await uploadJsonFlawsArtifact(uploadArtifact, flaws, {
verboseUrl: opts.verboseUrl,
})
core.info(`All flaws written to artifact log.`)
if (createReport) {
core.info(`Creating issue for flaws...`)
const reportProps = {
core,
octokit,
reportTitle: `${uniqueHrefs.size} broken links found`,
reportBody: flawIssueDisplay(flaws, opts),
reportRepository,
reportLabel,
}
const newReport = await createReportIssue(reportProps)
if (opts.linkReports) {
const linkProps = {
core,
octokit,
newReport,
reportRepository,
reportAuthor,
reportLabel,
}
await linkReports(linkProps)
}
}
if (shouldComment) {
await commentOnPR(core, octokit, flaws, opts)
}
const flawsInLevel = flaws.filter((flaw) => {
if (level === 'critical') {
return flaw?.flaw?.CRITICAL
}
// WARNING level and above
return true
})
if (flawsInLevel.length > 0) {
core.setOutput('has_flaws_at_level', flawsInLevel.length > 0)
if (failOnFlaw) {
core.setFailed(
`${flaws.length} broken links found. See action artifact uploads for details`,
)
process.exit(1)
}
}
} else {
// It might be that the PR got a comment about >0 flaws before,
// and now it can update that comment to say all is well again.
if (shouldComment) {
await commentOnPR(core, octokit, flaws, opts)
}
}
}
async function commentOnPR(core: CoreInject, octokit: Octokit, flaws: LinkFlaw[], opts: Options) {
const { actionContext = {} } = opts
const { owner, repo } = actionContext
const pullNumber = actionContext?.pull_request?.number
if (!owner || !repo || !pullNumber) {
core.warning(`commentOnPR called outside of PR action runner context. Not creating comment.`)
return
}
const findAgainSymbol = '<!-- rendered-content-link-checker-comment-finder -->'
const body = flawIssueDisplay(flaws, opts, false)
const { data } = await octokit.rest.issues.listComments({
owner,
repo,
issue_number: pullNumber,
})
let previousCommentId
for (const { body, id } of data) {
if (body && body.includes(findAgainSymbol)) {
previousCommentId = id
}
}
// The body may be empty, e.g. when the comment is limited to external links and none of the flaws are external
if (!body) {
core.info('No flaws qualify for comment')
if (previousCommentId) {
const nothingComment = 'Previous broken links comment now moot. 👌😙'
await octokit.rest.issues.updateComment({
owner,
repo,
comment_id: previousCommentId,
body: `${nothingComment}\n\n${findAgainSymbol}`,
})
core.info(`Updated comment on PR: ${pullNumber} (${previousCommentId})`)
}
return
}
if (previousCommentId) {
const noteComment = '(*The original automated comment was updated*)'
await octokit.rest.issues.updateComment({
owner,
repo,
comment_id: previousCommentId,
body: `${body}\n\n${noteComment}\n\n${findAgainSymbol}`,
})
core.info(`Updated comment on PR: ${pullNumber} (${previousCommentId})`)
return
}
try {
await octokit.rest.issues.createComment({
owner,
repo,
issue_number: pullNumber,
body: `${body}\n\n${findAgainSymbol}`,
})
core.info(`Created comment on PR: ${pullNumber}`)
} catch (error) {
core.setFailed(`Error commenting on PR when there are flaws`)
throw error
}
}
function flawIssueDisplay(flaws: LinkFlaw[], opts: Options, mentionExternalExclusionList = true) {
let output = ''
let flawsToDisplay = 0
type LinkFlawWithPermalink = {
// page?: Page
// permalink?: Permalink
href?: string
url?: string
text?: string
src: string
flaw: Flaw
permalinkHrefs: string[]
}
// Group broken links for each page
const hrefsOnPageGroup: Record<string, Record<string, LinkFlawWithPermalink>> = {}
for (const { page, permalink, href, text, src, flaw } of flaws) {
// When we don't want to include external links in PR comments
if (opts.commentLimitToExternalLinks && !flaw.isExternal) {
continue
}
flawsToDisplay++
const pageKey = page.fullPath
if (!hrefsOnPageGroup[pageKey]) {
hrefsOnPageGroup[pageKey] = {}
}
const linkKey = href || src
if (!hrefsOnPageGroup[pageKey][linkKey]) {
hrefsOnPageGroup[pageKey][linkKey] = { href, text, src, flaw, permalinkHrefs: [] }
}
if (!hrefsOnPageGroup[pageKey][linkKey].permalinkHrefs.includes(permalink.href)) {
hrefsOnPageGroup[pageKey][linkKey].permalinkHrefs.push(permalink.href)
}
}
// Don't comment if there are no qualifying flaws
if (!flawsToDisplay) {
return ''
}
// Build flaw display text
for (const [pagePath, pageHrefs] of Object.entries(hrefsOnPageGroup)) {
const fullPath = prettyFullPath(pagePath)
output += `\n\n### In \`${fullPath}\`\n`
for (const [, hrefObj] of Object.entries(pageHrefs)) {
if (hrefObj.href) {
output += `\n\n - Href: [${hrefObj.href}](${hrefObj.href})`
output += `\n - Text: ${hrefObj.text}`
} else if (hrefObj.src) {
output += `\n\n - Image src: [${hrefObj.src}](${hrefObj.src})`
} else {
output += `\n\n - WORKFLOW ERROR: Flaw has neither 'href' nor 'src'`
}
output += `\n - Flaw: \`${
hrefObj.flaw.CRITICAL ? hrefObj.flaw.CRITICAL : hrefObj.flaw.WARNING
}\``
output += `\n - On permalinks`
for (const permalinkHref of hrefObj.permalinkHrefs) {
output += `\n - \`${permalinkHref}\``
}
}
}
if (mentionExternalExclusionList) {
output +=
'\n\n---\n\nIf any link reported in this issue is not actually broken ' +
'and repeatedly shows up on reports, consider making a PR that adds it as an exception to `src/links/lib/excluded-links.ts`. ' +
'For more information, see [Fixing broken links in GitHub user docs](https://github.com/github/docs/blob/main/src/links/lib/README.md).'
}
output = `${flawsToDisplay} broken${
opts.commentLimitToExternalLinks ? ' **external** ' : ' '
}links found in [this](${opts.actionUrl}) workflow.\n${output}`
// GitHub comment bodies are limited to 65536 characters
if (output.length > 60000) {
output = output.slice(0, 60000) + '\n\n---\n\nOUTPUT TRUNCATED'
}
return output
}
function printGlobalCacheHitRatio(core: CoreInject) {
const hits = globalCacheHitCount
const misses = globalCacheMissCount
// It could be that the files that were tested didn't have a single
// link in them. In that case, there are no cache misses or hits at
// all, so avoid the division by zero.
if (misses + hits) {
core.debug(
`Cache hit ratio: ${hits.toLocaleString()} of ${(misses + hits).toLocaleString()} (${(
(100 * hits) /
(misses + hits)
).toFixed(1)}%)`,
)
}
}
function getPages(
pageList: Page[],
languages: string[],
filters: string[],
files: string[],
max: number | undefined,
) {
return pageList
.filter((page: Page) => {
if (languages.length && !languages.includes(page.languageCode)) {
return false
}
if (filters.length && !filters.find((filter) => page.relativePath.includes(filter))) {
return false
}
if (
files.length &&
// The reason for checking each file against the `relativePath`
// or the `fullPath` is to make it flexible for the user.
!files.find((file) => {
if (page.relativePath === file) return true
if (page.fullPath === file) return true
// The `page.relativePath` will always be relative to the containing
// directory it came from and might not be relative to the repo
// root. I.e.
// `content/education/quickstart.md` is the path relative to
// the repo root. But the `page.relativePath` will
// in this case be `education/quickstart.md`.
// So give it one last chance to relate to the repo root.
// This is important because you might use `git diff --name-only`
// to get the list of files to focus specifically on.
if (path.join(CONTENT_ROOT, page.relativePath) === path.resolve(file)) return true
return false
})
) {
return false
}
return true
})
.slice(0, max ? Math.min(max, pageList.length) : pageList.length)
}
async function processPage(
core: CoreInject,
page: Page,
pageMap: PageMap,
redirects: Redirects,
opts: Options,
db: DBType,
versions: string[],
) {
const { verbose, verboseUrl, bail } = opts
const allFlawsEach = await Promise.all(
page.permalinks
.filter((permalink) => {
return !versions.length || versions.includes(permalink.pageVersion)
})
.map((permalink) => {
return processPermalink(core, permalink, page, pageMap, redirects, opts, db)
}),
)
const allFlaws = allFlawsEach.flat()
if (allFlaws.length > 0) {
if (verbose) {
printFlaws(core, allFlaws, { verboseUrl })
}
if (bail) {
if (!verbose) {
console.warn('Use --verbose to see the flaws before it exits')
}
throw new Error(`Found ${allFlaws.length} flaw(s) in ${page.relativePath}`)
}
}
return allFlaws
}
async function processPermalink(
core: any,
permalink: Permalink,
page: Page,
pageMap: PageMap,
redirects: Redirects,
opts: Options,
db: DBType,
) {
const {
level = 'critical',
checkAnchors,
checkImages,
checkExternalLinks,
verbose,
patient,
externalServerErrorsAsWarning,
} = opts
let html = ''
try {
html = await renderInnerHTML(page, permalink)
} catch (error) {
console.warn(
`The error happened trying to render ${page.relativePath} (permalink: ${permalink.href})`,
)
throw error
}
const $ = cheerio.load(html, { xmlMode: true })
const flaws: LinkFlaw[] = []
const links: cheerio.Element[] = []
$('a[href]').each((i, link) => {
links.push(link)
})
const newFlaws: LinkFlaw[] = await Promise.all(
links.map(async (link) => {
const { href } = (link as cheerio.TagElement).attribs
// The global cache can't be used for anchor links because they
// depend on the specific page they're rendered on
if (!href.startsWith('#')) {
if (globalHrefCheckCache.has(href)) {
globalCacheHitCount++
return globalHrefCheckCache.get(href)
}
globalCacheMissCount++
}
const flaw = await checkHrefLink(
core,
href,
$,
redirects,
pageMap,
checkAnchors,
checkExternalLinks,
externalServerErrorsAsWarning,
permalink,
{ verbose, patient },
db,
)
if (flaw) {
if (level === 'critical' && !flaw.CRITICAL) {
return
}
const text = $(link).text()
if (!href.startsWith('#')) {
globalHrefCheckCache.set(href, { href, flaw, text })
}
return { href, flaw, text }
} else {
if (!href.startsWith('#')) {
globalHrefCheckCache.set(href, flaw)
}
}
}),
)
for (const flaw of newFlaws) {
if (flaw) {
flaws.push(Object.assign(flaw, { page, permalink }))
}
}
if (checkImages) {
$('img[src]').each((i, img) => {
let { src } = (img as cheerio.TagElement).attribs
// Images get a cache-busting path segment injected into their src.
// E.g. <img src="/assets/cb-123456/foo/bar.png">
// We need to remove that, otherwise we can't look up the image
// on disk.
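// The replace below turns '/assets/cb-123456/foo/bar.png' into '/assets/foo/bar.png'.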
src = src.replace(/\/cb-\d+\//, '/')
if (globalImageSrcCheckCache.has(src)) {
globalCacheHitCount++
return globalImageSrcCheckCache.get(src)
}
const flaw = checkImageSrc(src)
globalImageSrcCheckCache.set(src, flaw)
if (flaw) {
if (level === 'critical' && !flaw.CRITICAL) {
return
}
flaws.push({ permalink, page, src, flaw })
}
})
}
return flaws
}
async function uploadJsonFlawsArtifact(
uploadArtifact: UploadArtifact,
flaws: LinkFlaw[],
{ verboseUrl = null }: { verboseUrl?: string | null } = {},
artifactName = 'all-rendered-link-flaws.json',
) {
type PrintableLinkFlaw = {
href?: string
url?: string
text?: string
src?: string
flaw?: Flaw
}
const printableFlaws: Record<string, PrintableLinkFlaw[]> = {}
for (const { page, permalink, href, text, src, flaw } of flaws) {
const fullPath = prettyFullPath(page.fullPath)
if (!(fullPath in printableFlaws)) {
printableFlaws[fullPath] = []
}
if (href) {
printableFlaws[fullPath].push({
href,
url: verboseUrl ? new URL(permalink.href, verboseUrl).toString() : permalink.href,
text,
flaw,
})
} else if (src) {
printableFlaws[fullPath].push({
src,
})
}
}
const message = JSON.stringify(printableFlaws, undefined, 2)
return uploadArtifact(artifactName, message)
}
function printFlaws(
core: CoreInject,
flaws: LinkFlaw[],
{ verboseUrl }: { verboseUrl?: string | undefined } = {},
) {
let previousPage = null
let previousPermalink = null
for (const { page, permalink, href, text, src, flaw } of flaws) {
const fullPath = prettyFullPath(page.fullPath)
if (page !== previousPage) {
core.info(`PAGE: ${chalk.bold(fullPath)}`)
}
previousPage = page
if (href) {
if (previousPermalink !== permalink.href) {
if (verboseUrl) {
core.info(` URL: ${new URL(permalink.href, verboseUrl).toString()}`)
} else {
core.info(` PERMALINK: ${permalink.href}`)
}
}
previousPermalink = permalink.href
core.info(` HREF: ${chalk.bold(href)}`)
core.info(` TEXT: ${text}`)
} else if (src) {
core.info(` IMG SRC: ${chalk.bold(src)}`)
} else {
throw new Error("Flaw has neither 'href' nor 'src'")
}
core.info(` FLAW: ${flaw.CRITICAL ? chalk.red(flaw.CRITICAL) : chalk.yellow(flaw.WARNING)}`)
}
}
// Given a full path, make it relative to `cwd()` so that you can copy it
// from the output and paste it after something like `code ...here...`.
// The problem with displaying the full path is that it's quite noisy and
// takes up a lot of space. Sure, you can copy and paste it in front of
// `vi` or `ls` or `code`, but displayed relative to `cwd()` you can
// still paste it to the next command without it taking up so much space.
function prettyFullPath(fullPath: string) {
return path.relative(process.cwd(), fullPath)
}
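// E.g. with a (hypothetical) cwd of '/home/user/docs',
// prettyFullPath('/home/user/docs/content/education/quickstart.md')
// returns 'content/education/quickstart.md'.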
const globalHrefCheckCache = new Map()
const globalImageSrcCheckCache = new Map()
let globalCacheHitCount = 0
let globalCacheMissCount = 0
async function checkHrefLink(
core: any,
href: string,
$: cheerio.Root,
redirects: Redirects,
pageMap: PageMap,
checkAnchors = false,
checkExternalLinks = false,
externalServerErrorsAsWarning: string | undefined | null = null,
permalink: Permalink,
{ verbose = false, patient = false }: { verbose?: boolean; patient?: boolean } = {},
db: DBType | null = null,
): Promise<Flaw | undefined> {
// this function handles hrefs in all the following forms:
// same article links:
// 1. '#'
// 2. '#anchor'
// 3. '/to/this/article#anchor'
// different article links:
// 4. '/some/path/article#anchor' (currently not supported)
// 5. '/some/path/article'
// external links:
// 6. 'https://example.com' (external link)
const [pathFragment, hashFragment] = href.split('#')
const hash = '#' + hashFragment // the hash is the part that starts with `#`
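// For example, '/to/this/article#anchor' splits into a pathFragment of
// '/to/this/article' and a hash of '#anchor', while a bare '#anchor'
// leaves the pathFragment empty.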
// this conditional handles cases in which the link is to the current article (cases 1-3 above)
if (checkAnchors && (!pathFragment || pathFragment === permalink.href)) {
// cases covered by this part of the conditional:
// 1. '#'
if (hash === '#') {
return { WARNING: 'Link is just an empty `#`' }
}
// cases covered by this part of the conditional:
// 2. '#anchor'
// 3. '/to/this/article#anchor'
else {
// Some pages are a mix of Markdown and React components. On its own,
// the Markdown might appear broken but when combined with automated
// React rendering it might work. Best to stay out of it.
const avoid =
permalink &&
((permalink.href.includes('/rest/') && !permalink.href.includes('/rest/guides/')) ||
permalink.href.includes('/webhooks-and-events/webhooks/webhook-events-and-payloads') ||
permalink.href.includes('/graphql/reference') ||
permalink.href.includes('/code-security/codeql-cli/codeql-cli-manual/') ||
permalink.href.includes(
'/apps/maintaining-github-apps/modifying-a-github-app-registration',
) ||
permalink.href.includes(
'/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning',
) ||
permalink.href.includes(
'/site-policy/github-company-policies/github-statement-against-modern-slavery-and-child-labor',
) ||
permalink.href.includes('/site-policy/content-removal-policies/dmca-takedown-policy') ||
permalink.href.includes('/early-access/'))
// You don't need a DOM ID (or <a name="top">) for `<a href="#top">`
// to work in all modern browsers.
if (hash !== '#top' && !avoid) {
// If the link is `#foo` it could either match `<element id="foo">`
// or it could match `<a name="foo">`.
const countDOMItems = $(hash).length + $(`a[name="${hash.slice(1)}"]`).length
if (countDOMItems === 0) {
return { CRITICAL: `Anchor on the same page can't be found by ID` }
} else if (countDOMItems > 1) {
return { CRITICAL: `Matches multiple points in the page` }
}
}
}
}
// this conditional handles cases in which the link is to a different article or externally (cases 4-6 above)
else {
// cases covered by this part of the conditional:
// 4. '/some/path/article#anchor' (currently not supported)
// 5. '/some/path/article'
if (href.startsWith('/')) {
const pathname = new URL(href, 'http://example.com').pathname
// we drop any hashes due to `.pathname`
// we don't currently support hashes for other articles we link to: /some/path/article#anchor
// Remember, if the Markdown has something like
//
// See [my link](/some/page/)
//
// In the post-processing, that will actually become
//
// See <a href="/en/some/page">my link</a>
//
// But, if that link was a redirect, that would have been left
// untouched.
if (pathname.endsWith('/')) {
const whatifPathname = pathname.slice(0, -1)
if (getRedirect(whatifPathname, { redirects, pages: pageMap })) {
return {
WARNING: `Redirect to ${getRedirect(whatifPathname, { redirects, pages: pageMap })}`,
}
} else if (!pageMap[whatifPathname]) {
if (!deprecatedVersionPrefixesRegex.test(whatifPathname)) {
return { CRITICAL: 'Broken link' }
}
}
return { WARNING: 'Links with a trailing / will always redirect' }
} else {
const firstPart = pathname.split('/')[1]
if (STATIC_PREFIXES[firstPart]) {
const staticFilePath = path.join(
STATIC_PREFIXES[firstPart],
pathname.split(path.sep).slice(2).join(path.sep),
)
if (!fs.existsSync(staticFilePath)) {
return { CRITICAL: `Static file not found ${staticFilePath} (${pathname})` }
}
} else if (getRedirect(pathname, { redirects, pages: pageMap })) {
return { WARNING: `Redirect to ${getRedirect(pathname, { redirects, pages: pageMap })}` }
} else if (!pageMap[pathname]) {
if (deprecatedVersionPrefixesRegex.test(pathname)) {
return
}
return { CRITICAL: 'Broken link' }
}
}
}
// cases covered by this part of the conditional:
// 6. 'https://example.com' (external link)
else if (checkExternalLinks) {
if (!href.startsWith('https://')) {
return { WARNING: `Will not check external URLs that are not HTTPS (${href})` }
}
if (linksToSkip(href)) {
return
}
const { ok, ...info } = await checkExternalURLCached(core, href, { verbose, patient }, db)
if (!ok) {
// By default, a not-OK result for an external link is CRITICAL,
// but if it was a `requestError` or the statusCode was >= 500,
// then downgrade it to WARNING.
let problem = 'CRITICAL'
if (externalServerErrorsAsWarning) {
if (
(info.statusCode && info.statusCode >= 500) ||
(info.requestError && isTemporaryRequestError(info.requestError))
) {
problem = 'WARNING'
}
}
return { [problem]: `Broken external link (${JSON.stringify(info)})`, isExternal: true }
}
}
}
}
// Return true if the request error is plausibly temporary. For example,
// a request to `https://exammmmple.org` will fail with `ENOTFOUND` because
// the DNS entry doesn't exist, so simply trying again later won't help.
// However, an `ETIMEDOUT` means it could have worked but didn't this
// time, and might if we try again at a different hour or day.
function isTemporaryRequestError(requestError: string | undefined) {
if (typeof requestError === 'string') {
// See https://betterstack.com/community/guides/scaling-nodejs/nodejs-errors/
// for a definition of each one.
const errorEnums = ['ECONNRESET', 'ECONNREFUSED', 'ETIMEDOUT', 'ECONNABORTED']
return errorEnums.some((enum_) => requestError.includes(enum_))
}
return false
}
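// E.g. with (typical Node-style) error messages,
// isTemporaryRequestError('connect ETIMEDOUT 93.184.216.34:443') returns
// true, while isTemporaryRequestError('getaddrinfo ENOTFOUND exammmmple.org')
// returns false.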
// This disk-based memoization can't happen inside checkExternalURL
// itself, because that function may return a shared in-flight Promise:
// it already collates multiple URLs under the same cache key.
async function checkExternalURLCached(
core: CoreInject,
href: string,
{ verbose, patient }: { verbose?: boolean; patient?: boolean },
db: DBType | null,
) {
const cacheMaxAge = EXTERNAL_LINK_CHECKER_MAX_AGE_MS
const now = new Date().getTime()
const url = href.split('#')[0]
if (cacheMaxAge) {
const tooOld = now - Math.floor(jitter(cacheMaxAge, 10))
if (db && db.data.urls[url]) {
if (db.data.urls[url].timestamp > tooOld) {
if (verbose) {
core.info(`External URL ${url} in cache`)
}
return db.data.urls[url].result
} else if (verbose) {
core.info(`External URL ${url} in cache but too old`)
// Delete it so the cache file doesn't bloat infinitely
delete db.data.urls[url]
}
}
}
const result = await checkExternalURL(core, href, {
verbose,
patient,
})
if (cacheMaxAge) {
// By only caching successful results, we give 4xx and 5xx errors
// another go next time.
if (db && result.ok) {
db.data.urls[url] = {
timestamp: now,
result,
}
}
}
return result
}
const _fetchCache = new Map()
async function checkExternalURL(
core: CoreInject,
url: string,
{ verbose = false, patient = false } = {},
) {
if (!url.startsWith('https://')) throw new Error('Invalid URL')
const cleanURL = url.split('#')[0]
if (!_fetchCache.has(cleanURL)) {
_fetchCache.set(cleanURL, innerFetch(core, cleanURL, { verbose, patient }))
}
return _fetchCache.get(cleanURL)
}
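// Note: in checkExternalURL above, two hrefs that differ only by their
// hash share one in-flight request. E.g. 'https://example.com/page#a' and
// 'https://example.com/page#b' both resolve to the same _fetchCache entry
// for 'https://example.com/page'.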
const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms))
// Global for recording which domains we get rate-limited on.
// For example, if you got rate limited on `something.github.com/foo`
// and now we're asked to fetch `something.github.com/bar`,
// it's good to know not to bother yet.
const _rateLimitedDomains = new Map()
async function innerFetch(
core: CoreInject,
url: string,
config: { verbose?: boolean; useGET?: boolean; patient?: boolean; retries?: number } = {},
) {
const { verbose, useGET, patient } = config
const { hostname } = new URL(url)
if (_rateLimitedDomains.has(hostname)) {
await sleep(_rateLimitedDomains.get(hostname))
}
// The way `got` does retries:
//
// sleep = 1000 * Math.pow(2, retry - 1) + Math.random() * 100
//
// So, it means:
//
// 1. ~1000ms
// 2. ~2000ms
// 3. ~4000ms
//
// ...if the limit we set is 3.
// Our own timeout, in @/frame/middleware/timeout.js, defaults to 10 seconds.
// So there's no point in trying more than 3 attempts, because the
// accumulated sleeps would exceed the 10s timeout (1000 + 2000 + 4000 + 8000 > 10,000).
const retry = {
limit: patient ? 6 : 2,
}
const timeout = { request: patient ? 10000 : 2000 }
const headers = {
'User-Agent':
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36',
}
const retries = config.retries || 0
const method = useGET ? 'GET' : 'HEAD'
if (verbose) core.info(`External URL ${method}: ${url} (retries: ${retries})`)
try {
const r = await fetchWithRetry(
url,
{
method,
headers,
},
{
retries: retry.limit,
timeout: timeout.request,
throwHttpErrors: false,
},
)
if (verbose) {
core.info(`External URL ${method} ${url}: ${r.status} (retries: ${retries})`)
}
// If we get rate limited, remember that this hostname is now all
// rate limited. And sleep for the number of seconds that the
// `retry-after` header indicated.
if (r.status === 429) {
let sleepTime = Math.min(
60_000,
Math.max(
10_000,
r.headers.get('retry-after') ? getRetryAfterSleep(r.headers.get('retry-after')) : 1_000,
),
)
// Sprinkle in a little jitter so the requests don't all start again
// at the same time
sleepTime += Math.random() * 10 * 1000
// Give it a bit extra when we can be really patient
if (patient) sleepTime += 30 * 1000
_rateLimitedDomains.set(hostname, sleepTime + Math.random() * 10 * 1000)
if (verbose)
core.info(
chalk.yellow(
`Rate limited on ${hostname} (${url}). Sleeping for ${(sleepTime / 1000).toFixed(1)}s`,
),
)
await sleep(sleepTime)
return innerFetch(core, url, Object.assign({}, config, { retries: retries + 1 }))
} else {
_rateLimitedDomains.delete(hostname)
}
// Perhaps the server doesn't support HEAD requests.
// If so, try again with a regular GET.
if ((r.status === 405 || r.status === 404 || r.status === 403) && !useGET) {
return innerFetch(core, url, Object.assign({}, config, { useGET: true }))
}
if (verbose) {
core.info((r.ok ? chalk.green : chalk.red)(`${r.status} on ${url}`))
}
return { ok: r.ok, statusCode: r.status }
} catch (err) {
if (err instanceof Error) {
if (verbose) {
core.info(chalk.yellow(`Request Error (${err.message}) on ${url}`))
}
return { ok: false, requestError: err.message }
}
throw err
}
}
// Return number of milliseconds from a `Retry-After` header value
function getRetryAfterSleep(headerValue: string | null) {
if (!headerValue) return 0
let ms = Math.round(parseFloat(headerValue) * 1000)
if (isNaN(ms)) {
const nextDate = new Date(headerValue)
ms = Math.max(0, nextDate.getTime() - new Date().getTime())
}
return ms
}
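// E.g. getRetryAfterSleep('120') returns 120000 (120 seconds), while an
// HTTP-date value like getRetryAfterSleep('Wed, 21 Oct 2030 07:28:00 GMT')
// returns the number of milliseconds from now until that moment
// (clamped to 0 if the date is in the past).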
function checkImageSrc(src: string) {
if (!src.startsWith('/') && !src.startsWith('http')) {
return { CRITICAL: 'Image path is not absolute. Should start with a /' }
}
const pathname = new URL(src, 'http://example.com').pathname
if (pathname.startsWith('http://')) {
return { CRITICAL: "Don't use insecure HTTP:// for external images" }
}
if (!pathname.startsWith('/')) {
return { WARNING: "External images can't not be checked" }
}
const prefix = pathname.split('/')[1]
if (prefix in STATIC_PREFIXES) {
const staticFilePath = path.join(
STATIC_PREFIXES[prefix],
pathname.split(path.sep).slice(2).join(path.sep),
)
if (!fs.existsSync(staticFilePath)) {
return { CRITICAL: `Static file not found (${pathname})` }
}
} else {
return { WARNING: `Unrecognized image src prefix (${prefix})` }
}
}
function summarizeFlaws(core: CoreInject, flaws: LinkFlaw[]) {
if (flaws.length) {
core.info(
chalk.bold(
`Found ${flaws.length.toLocaleString()} flaw${flaws.length === 1 ? '' : 's'} in total.`,
),
)
} else {
core.info(chalk.green('No flaws found! 💖'))
}
}
function summarizeCounts(core: CoreInject, pages: Page[], tookSeconds: number) {
const count = pages.map((page) => page.permalinks.length).reduce((a, b) => a + b, 0)
core.info(
`Tested ${count.toLocaleString()} permalinks across ${pages.length.toLocaleString()} pages`,
)
core.info(`Took ${Math.floor(tookSeconds)} seconds. (~${(tookSeconds / 60).toFixed(1)} minutes)`)
const permalinksPerSecond = count / tookSeconds
core.info(`~${permalinksPerSecond.toFixed(1)} permalinks per second.`)
const pagesPerSecond = pages.length / tookSeconds
core.info(`~${pagesPerSecond.toFixed(1)} pages per second.`)
}
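// In-place Fisher-Yates shuffle; used when `opts.random` is set to
// randomize the page order for debugging.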
function shuffle(array: any[]) {
let currentIndex = array.length
let randomIndex
// While there remain elements to shuffle...
while (currentIndex !== 0) {
// Pick a remaining element...
randomIndex = Math.floor(Math.random() * currentIndex)
currentIndex--
// And swap it with the current element.
;[array[currentIndex], array[randomIndex]] = [array[randomIndex], array[currentIndex]]
}
return array
}
async function renderInnerHTML(page: Page, permalink: Permalink) {
const next = () => {}
const res = {}
const pagePath = permalink.href
const context: Context = {}
const req = {
path: pagePath,
language: permalink.languageCode,
pagePath,
cookies: {},
context,
}
// This will create and set `req.context = {...}`
await contextualize(req as ExtendedRequest, res as Response, next)
await shortVersions(req as ExtendedRequest, res as Response, next)
req.context.page = page
features(req as ExtendedRequest, res as Response, next)
req.context.relativePath = page.relativePath
const guts = [page.rawIntro, page.rawPermissions, page.markdown].filter(Boolean).join('\n').trim()
// These lines do what the ubiquitous `renderContent` function does,
// but at an absolute minimum to get a string of HTML.
const markdown = await liquid.parseAndRender(guts, req.context)
const processor = createMinimalProcessor(req.context)
const vFile = await processor.process(markdown)
return vFile.toString()
}
export default main