/* See function main in this file for documentation */
import fs from 'fs'
import path from 'path'
import cheerio from 'cheerio'
import coreLib from '@actions/core'
import { fetchWithRetry } from '@/frame/lib/fetch-utils'
import chalk from 'chalk'
import { JSONFilePreset } from 'lowdb/node'
import { type Octokit } from '@octokit/rest'
import type { Response } from 'express'
import type { ExtendedRequest, Page, Permalink, Context } from '@/types'
import shortVersions from '@/versions/middleware/short-versions'
import contextualize from '@/frame/middleware/context/context'
import features from '@/versions/middleware/features'
import getRedirect from '@/redirects/lib/get-redirect'
import warmServer from '@/frame/lib/warm-server'
import { liquid } from '@/content-render/index'
import { deprecated } from '@/versions/lib/enterprise-server-releases'
import excludedLinks from '@/links/lib/excluded-links'
import { getEnvInputs, boolEnvVar } from '@/workflows/get-env-inputs'
import { debugTimeEnd, debugTimeStart } from './debug-time-taken'
import { uploadArtifact as uploadArtifactLib } from './upload-artifact'
import github from '@/workflows/github'
import { getActionContext } from '@/workflows/action-context'
import { createMinimalProcessor } from '@/content-render/unified/processor'
import { createReportIssue, linkReports } from '@/workflows/issue-report'
import { type CoreInject } from '@/links/scripts/action-injections'
type Flaw = {
WARNING?: string
CRITICAL?: string
isExternal?: boolean
}
type LinkFlaw = {
page: Page
permalink: Permalink
href?: string
url?: string
text?: string
src: string
flaw: Flaw
}
type Redirects = Record<string, string>
type PageMap = Record<string, Page>
type UploadArtifact = (name: string, message: string) => void
type Options = {
level?: string
files?: string[]
random?: boolean
language?: string | string[]
filter?: string[]
version?: string | string[]
max?: number
linkReports?: boolean
actionUrl?: string
verbose?: boolean
checkExternalLinks?: boolean
createReport?: boolean
failOnFlaw?: boolean
shouldComment?: boolean
reportRepository?: string
reportAuthor?: string
reportLabel?: string
checkAnchors?: boolean
checkImages?: boolean
patient?: boolean
externalServerErrorsAsWarning?: string
verboseUrl?: string
bail?: boolean
commentLimitToExternalLinks?: boolean
actionContext?: any
}
const STATIC_PREFIXES: Record<string, string> = {
assets: path.resolve('assets'),
public: path.resolve(path.join('src', 'graphql', 'data')),
}
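// For example, a link to a (hypothetical) '/assets/images/foo.png' is
// looked up at '<repo>/assets/images/foo.png' on disk, and a
// '/public/schema.json' at '<repo>/src/graphql/data/schema.json'.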
// Sanity check that these are valid paths
Object.entries(STATIC_PREFIXES).forEach(([key, value]) => {
if (!fs.existsSync(value)) {
throw new Error(`Can't find static prefix (${key}): ${value}`)
}
})
// External link checks are cached to disk, with a default max age of
// 7 days. Setting this env var to 0 disables the disk-based caching
// of external links.
const EXTERNAL_LINK_CHECKER_MAX_AGE_MS =
parseInt(process.env.EXTERNAL_LINK_CHECKER_MAX_AGE_DAYS || '7') * 24 * 60 * 60 * 1000
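// With the default of 7 days, that's 7 * 24 * 60 * 60 * 1000 = 604,800,000 ms.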
const EXTERNAL_LINK_CHECKER_DB =
process.env.EXTERNAL_LINK_CHECKER_DB || 'external-link-checker-db.json'
type Data = {
urls: {
[url: string]: {
timestamp: number
result: {
ok: boolean
statusCode: number
}
}
}
}
const defaultData: Data = { urls: {} }
const externalLinkCheckerDB = await JSONFilePreset<Data>(EXTERNAL_LINK_CHECKER_DB, defaultData)
type DBType = typeof externalLinkCheckerDB
// Given a number and a percentage, return the number randomly changed
// by at most that percentage, up or down.
// E.g. `jitter(55, 10)` will return a value in the range `[55 - 55/10, 55 + 55/10]`.
// This is useful so the cache timestamps, which all start from the same
// day, don't ALL expire on the same day. Instead they expire in a bit of
// a "random pattern", so you don't get an all-or-nothing expiry.
function jitter(base: number, percentage: number) {
const r = percentage / 100
const negative = Math.random() > 0.5 ? -1 : 1
return base + base * Math.random() * r * negative
}
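// For example, with the default 7-day max age and 10% jitter,
//
//   jitter(604_800_000, 10) // => somewhere in [544_320_000, 665_280_000] ms
//
// i.e. each cache entry effectively expires after ~6.3 to ~7.7 days.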
// Return a function that checks, as quickly as possible, whether a
// given href should be skipped.
// Built this way so we can use a `Set` for exact matches and an
// `Array.prototype.some()` scan for prefix matches, for a speedier check.
function linksToSkipFactory() {
const set = new Set(excludedLinks.map(({ is }) => is).filter(Boolean))
const arr = excludedLinks.map(({ startsWith }) => startsWith).filter(Boolean)
return (href: string) => set.has(href) || arr.some((v) => v && href.startsWith(v))
}
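// For example, given (hypothetical) excluded-links entries like
//
//   { is: 'https://example.com/exact' }
//   { startsWith: 'https://example.com/docs/' }
//
// the returned function is true for 'https://example.com/exact' and for
// any href that starts with 'https://example.com/docs/'.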
const linksToSkip = linksToSkipFactory()
const CONTENT_ROOT = path.resolve('content')
const deprecatedVersionPrefixesRegex = new RegExp(
`enterprise(-server@|/)(${deprecated.join('|')})(/|$)`,
)
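// For example, if '2.22' is one of the deprecated releases, this matches
// hrefs containing 'enterprise-server@2.22' or 'enterprise/2.22'
// (followed by a '/' or the end of the href), but not hrefs for
// currently supported versions.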
// When this file is invoked directly from the action, as opposed to being imported
if (import.meta.url.endsWith(process.argv[1])) {
// Optional env vars
const {
ACTION_RUN_URL,
LEVEL,
FILES_CHANGED,
REPORT_REPOSITORY,
REPORT_AUTHOR,
REPORT_LABEL,
EXTERNAL_SERVER_ERRORS_AS_WARNINGS,
CHECK_ANCHORS,
} = process.env
const octokit = github()
// Parse changed files JSON string
let files
if (FILES_CHANGED) {
const fileList = JSON.parse(FILES_CHANGED)
if (Array.isArray(fileList) && fileList.length > 0) {
files = fileList
} else {
console.warn(`No changed files found in PR: ${FILES_CHANGED}. Exiting...`)
process.exit(0)
}
}
const opts: Options = {
level: LEVEL,
files,
verbose: true,
linkReports: true,
checkImages: true,
checkAnchors: Boolean(CHECK_ANCHORS),
patient: boolEnvVar('PATIENT'),
random: false,
language: 'en',
actionUrl: ACTION_RUN_URL,
checkExternalLinks: boolEnvVar('CHECK_EXTERNAL_LINKS'),
shouldComment: boolEnvVar('SHOULD_COMMENT'),
commentLimitToExternalLinks: boolEnvVar('COMMENT_LIMIT_TO_EXTERNAL_LINKS'),
failOnFlaw: boolEnvVar('FAIL_ON_FLAW'),
createReport: boolEnvVar('CREATE_REPORT'),
reportRepository: REPORT_REPOSITORY,
reportLabel: REPORT_LABEL,
reportAuthor: REPORT_AUTHOR,
actionContext: getActionContext(),
externalServerErrorsAsWarning: EXTERNAL_SERVER_ERRORS_AS_WARNINGS,
}
if (opts.shouldComment || opts.createReport) {
// `GITHUB_TOKEN` is optional in general, but if you need the token to
// post a comment or open an issue report and it's missing, you get
// cryptic error messages from Octokit. So check for it up front.
getEnvInputs(['GITHUB_TOKEN'])
}
main(coreLib, octokit, uploadArtifactLib, opts)
}
/*
* Renders all or specified pages to gather all links on them and verify them.
* Checks internal links deterministically using filesystem and external links via external requests.
* Links are considered broken for reporting and commenting if they are broken at the specified "level".
* e.g. redirects are considered a "warning" while 404s are considered "critical"
*
* When there are broken links (flaws) this action can:
* 1. Create a report issue in a specified reportRepository and link it to previous reportIssues
* 2. Create a comment similar to a report on a PR that triggered this action
* 3. Fail using core.setFailed when there are broken links
*
* opts:
* level {"warning" | "critical"} Counts links as "flaws" based on this value and status criteria
* files {Array<string>} - Limit link checking to specific files (usually changed in PR)
* language {string | Array<string>} - Only render pages in the given language (or array of languages)
* checkExternalLinks {boolean} - Check non-docs.github.com URLs (takes significantly longer)
* checkImages {boolean} - Check image src URLs
* failOnFlaw {boolean} - When true, fail via core.setFailed when links are broken according to level
* shouldComment {boolean} - When true, attempt to comment the flaws on the PR that triggered the action
* commentLimitToExternalLinks {boolean} - When true, the PR comment only includes external links
* createReport {boolean} - Create an issue in reportRepository listing the links considered broken (flaws)
* linkReports {boolean} - When createReport is true, link the issue report to previous report(s) via comments
* reportRepository {string} - Repository in form of "owner/repo-name" that report issue will be created in
* reportLabel {string} - Label assigned to the report issue
* reportAuthor {string} - Expected author of previous report issue for linking reports (a bot user like docs-bot)
* actionUrl {string} - Used to link report or comment to the action instance for debugging
* actionContext {object} - Event payload context when run from action or injected. Should include { repo, owner }
* verbose {boolean} - Set to true for more verbose logging
* random {boolean} - Randomize page order for debugging when true
* patient {boolean} - Wait longer and retry more times for rate-limited external URLs
* bail {boolean} - Throw an error on the first page (not permalink) that has >0 flaws
* externalServerErrorsAsWarning {boolean} - Treat >=500 errors or temporary request errors as warnings
* filter {Array<string>} - Strings to match against the pages' relativePath
* version {Array<string>} - Only check pages in certain versions
*
*/
async function main(
core: any,
octokit: Octokit,
uploadArtifact: UploadArtifact,
opts: Options = {},
) {
const {
level = 'warning',
files = [],
random,
language = 'en',
filter,
version,
max,
verbose,
checkExternalLinks = false,
createReport = false,
failOnFlaw = false,
shouldComment = false,
reportRepository = 'github/docs-content',
reportAuthor = 'docs-bot',
reportLabel = 'broken link report',
} = opts
// Note! The reason we're using `warmServer()` in this script,
// even though there's no server involved, is that
// the `contextualize()` function calls it.
// And because `warmServer()` is idempotent, it's cheap to call it
// more than once, so calling it here first costs us nothing.
// If we'd manually do the same operations that `warmServer()` does
// here (e.g. `loadPageMap()`), we'd end up having to do it all over
// again the next time `contextualize()` is called.
const { redirects, pages: pageMap, pageList } = await warmServer([])
if (files.length) {
core.debug(`Limiting to files list: ${files.join(', ')}`)
}
let languages = language
if (!Array.isArray(languages)) {
languages = [languages]
}
const filters = filter || []
if (filters && !Array.isArray(filters)) {
throw new Error(`filters, ${filters} is not an array`)
}
let versions = version || []
if (versions && typeof versions === 'string') {
versions = [versions]
} else if (!Array.isArray(versions)) {
throw new Error(`versions, '${version}' is not an array`)
}
if (random) {
shuffle(pageList)
}
debugTimeStart(core, 'getPages')
const pages = getPages(pageList, languages, filters, files, max)
debugTimeEnd(core, 'getPages')
if (checkExternalLinks && pages.length >= 100) {
core.warning(
`Warning! Checking external URLs can be time-consuming. You're testing ${pages.length} pages.`,
)
}
await externalLinkCheckerDB.read()
if (verbose && checkExternalLinks) {
core.info(`Checking of external links is cached to ${EXTERNAL_LINK_CHECKER_DB}`)
core.info(
`External link cache max age is ${
EXTERNAL_LINK_CHECKER_MAX_AGE_MS / 1000 / 60 / 60 / 24
} days`,
)
let countNotTooOld = 0
let countTooOld = 0
for (const { timestamp } of Object.values(externalLinkCheckerDB.data.urls || {})) {
const age = Date.now() - timestamp
if (age > EXTERNAL_LINK_CHECKER_MAX_AGE_MS) {
countTooOld++
} else {
countNotTooOld++
}
}
core.info(
`External link cache: ${countNotTooOld.toLocaleString()} links are still fresh, ${countTooOld.toLocaleString()} are too old`,
)
}
debugTimeStart(core, 'processPages')
const t0 = new Date().getTime()
const flawsGroups = await Promise.all(
pages.map((page: Page) =>
processPage(
core,
page,
pageMap,
redirects,
opts,
externalLinkCheckerDB,
versions as string[],
),
),
)
const t1 = new Date().getTime()
debugTimeEnd(core, 'processPages')
await externalLinkCheckerDB.write()
const flaws = flawsGroups.flat()
printGlobalCacheHitRatio(core)
if (verbose) {
summarizeCounts(core, pages, (t1 - t0) / 1000)
core.info(`Checked ${(globalCacheHitCount + globalCacheMissCount).toLocaleString()} links`)
}
summarizeFlaws(core, flaws)
const uniqueHrefs = new Set(flaws.map((flaw) => flaw.href))
if (flaws.length > 0) {
await uploadJsonFlawsArtifact(uploadArtifact, flaws, {
verboseUrl: opts.verboseUrl,
})
core.info(`All flaws written to artifact log.`)
if (createReport) {
core.info(`Creating issue for flaws...`)
const reportProps = {
core,
octokit,
reportTitle: `${uniqueHrefs.size} broken links found`,
reportBody: flawIssueDisplay(flaws, opts),
reportRepository,
reportLabel,
}
const newReport = await createReportIssue(reportProps)
if (opts.linkReports) {
const linkProps = {
core,
octokit,
newReport,
reportRepository,
reportAuthor,
reportLabel,
}
await linkReports(linkProps)
}
}
if (shouldComment) {
await commentOnPR(core, octokit, flaws, opts)
}
const flawsInLevel = flaws.filter((flaw) => {
if (level === 'critical') {
return flaw?.flaw?.CRITICAL
}
// WARNING level and above
return true
})
if (flawsInLevel.length > 0) {
core.setOutput('has_flaws_at_level', flawsInLevel.length > 0)
if (failOnFlaw) {
core.setFailed(
`${flaws.length} broken links found. See action artifact uploads for details`,
)
process.exit(1)
}
}
} else {
// It might be that the PR got a comment about >0 flaws before,
// and now it can update that comment to say all is well again.
if (shouldComment) {
await commentOnPR(core, octokit, flaws, opts)
}
}
}
async function commentOnPR(core: CoreInject, octokit: Octokit, flaws: LinkFlaw[], opts: Options) {
const { actionContext = {} } = opts
const { owner, repo } = actionContext
const pullNumber = actionContext?.pull_request?.number
if (!owner || !repo || !pullNumber) {
core.warning(`commentOnPR called outside of PR action runner context. Not creating comment.`)
return
}
const findAgainSymbol = '<!-- rendered-content-link-checker-comment-finder -->'
const body = flawIssueDisplay(flaws, opts, false)
const { data } = await octokit.rest.issues.listComments({
owner,
repo,
issue_number: pullNumber,
})
let previousCommentId
for (const { body, id } of data) {
if (body && body.includes(findAgainSymbol)) {
previousCommentId = id
}
}
// The body may be empty, e.g. when the comment is limited to external links and none of the flaws are external
if (!body) {
core.info('No flaws qualify for comment')
if (previousCommentId) {
const nothingComment = 'Previous broken links comment now moot. 👌😙'
await octokit.rest.issues.updateComment({
owner,
repo,
comment_id: previousCommentId,
body: `${nothingComment}\n\n${findAgainSymbol}`,
})
core.info(`Updated comment on PR: ${pullNumber} (${previousCommentId})`)
}
return
}
if (previousCommentId) {
const noteComment = '(*The original automated comment was updated*)'
await octokit.rest.issues.updateComment({
owner,
repo,
comment_id: previousCommentId,
body: `${body}\n\n${noteComment}\n\n${findAgainSymbol}`,
})
core.info(`Updated comment on PR: ${pullNumber} (${previousCommentId})`)
return
}
try {
await octokit.rest.issues.createComment({
owner,
repo,
issue_number: pullNumber,
body: `${body}\n\n${findAgainSymbol}`,
})
core.info(`Created comment on PR: ${pullNumber}`)
} catch (error) {
core.setFailed(`Error commenting on PR when there are flaws`)
throw error
}
}
function flawIssueDisplay(flaws: LinkFlaw[], opts: Options, mentionExternalExclusionList = true) {
let output = ''
let flawsToDisplay = 0
type LinkFlawWithPermalink = {
// page?: Page
// permalink?: Permalink
href?: string
url?: string
text?: string
src: string
flaw: Flaw
permalinkHrefs: string[]
}
// Group broken links for each page
const hrefsOnPageGroup: Record<string, Record<string, LinkFlawWithPermalink>> = {}
for (const { page, permalink, href, text, src, flaw } of flaws) {
// When we don't want to include external links in PR comments
if (opts.commentLimitToExternalLinks && !flaw.isExternal) {
continue
}
flawsToDisplay++
const pageKey = page.fullPath
if (!hrefsOnPageGroup[pageKey]) {
hrefsOnPageGroup[pageKey] = {}
}
const linkKey = href || src
if (!hrefsOnPageGroup[pageKey][linkKey]) {
hrefsOnPageGroup[pageKey][linkKey] = { href, text, src, flaw, permalinkHrefs: [] }
}
if (!hrefsOnPageGroup[pageKey][linkKey].permalinkHrefs.includes(permalink.href)) {
hrefsOnPageGroup[pageKey][linkKey].permalinkHrefs.push(permalink.href)
}
}
// Don't comment if there are no qualifying flaws
if (!flawsToDisplay) {
return ''
}
// Build flaw display text
for (const [pagePath, pageHrefs] of Object.entries(hrefsOnPageGroup)) {
const fullPath = prettyFullPath(pagePath)
output += `\n\n### In \`${fullPath}\`\n`
for (const [, hrefObj] of Object.entries(pageHrefs)) {
if (hrefObj.href) {
output += `\n\n - Href: [${hrefObj.href}](${hrefObj.href})`
output += `\n - Text: ${hrefObj.text}`
} else if (hrefObj.src) {
output += `\n\n - Image src: [${hrefObj.src}](${hrefObj.src})`
} else {
output += `\n\n - WORKFLOW ERROR: Flaw has neither 'href' nor 'src'`
}
output += `\n - Flaw: \`${
hrefObj.flaw.CRITICAL ? hrefObj.flaw.CRITICAL : hrefObj.flaw.WARNING
}\``
output += `\n - On permalinks`
for (const permalinkHref of hrefObj.permalinkHrefs) {
output += `\n - \`${permalinkHref}\``
}
}
}
if (mentionExternalExclusionList) {
output +=
'\n\n---\n\nIf any link reported in this issue is not actually broken ' +
'and repeatedly shows up on reports, consider making a PR that adds it as an exception to `src/links/lib/excluded-links.ts`. ' +
'For more information, see [Fixing broken links in GitHub user docs](https://github.com/github/docs/blob/main/src/links/lib/README.md).'
}
output = `${flawsToDisplay} broken${
opts.commentLimitToExternalLinks ? ' **external** ' : ' '
}links found in [this](${opts.actionUrl}) workflow.\n${output}`
// GitHub comment bodies are limited to 65536 characters
if (output.length > 60000) {
output = output.slice(0, 60000) + '\n\n---\n\nOUTPUT TRUNCATED'
}
return output
}
function printGlobalCacheHitRatio(core: CoreInject) {
const hits = globalCacheHitCount
const misses = globalCacheMissCount
// It could be that the files that were tested didn't have a single
// link in them. In that case, there are no cache misses or hits at
// all, so avoid the division by zero.
if (misses + hits) {
core.debug(
`Cache hit ratio: ${hits.toLocaleString()} of ${(misses + hits).toLocaleString()} (${(
(100 * hits) /
(misses + hits)
).toFixed(1)}%)`,
)
}
}
function getPages(
pageList: Page[],
languages: string[],
filters: string[],
files: string[],
max: number | undefined,
) {
return pageList
.filter((page: Page) => {
if (languages.length && !languages.includes(page.languageCode)) {
return false
}
if (filters.length && !filters.find((filter) => page.relativePath.includes(filter))) {
return false
}
if (
files.length &&
// The reason for checking each file against the `relativePath`
// or the `fullPath` is to make it flexible for the user.
!files.find((file) => {
if (page.relativePath === file) return true
if (page.fullPath === file) return true
// The `page.relativePath` will always be relative to the containing
// directory it came from and might not be relative to the repo
// root. I.e.
// `content/education/quickstart.md` is the path relative to
// the repo root. But the `page.relativePath` will
// in this case be `education/quickstart.md`.
// So give it one last chance to relate to the repo root.
// This is important because you might use `git diff --name-only`
// to get the list of files to focus specifically on.
if (path.join(CONTENT_ROOT, page.relativePath) === path.resolve(file)) return true
return false
})
) {
return false
}
return true
})
.slice(0, max ? Math.min(max, pageList.length) : pageList.length)
}
async function processPage(
core: CoreInject,
page: Page,
pageMap: PageMap,
redirects: Redirects,
opts: Options,
db: DBType,
versions: string[],
) {
const { verbose, verboseUrl, bail } = opts
const allFlawsEach = await Promise.all(
page.permalinks
.filter((permalink) => {
return !versions.length || versions.includes(permalink.pageVersion)
})
.map((permalink) => {
return processPermalink(core, permalink, page, pageMap, redirects, opts, db)
}),
)
const allFlaws = allFlawsEach.flat()
if (allFlaws.length > 0) {
if (verbose) {
printFlaws(core, allFlaws, { verboseUrl })
}
if (bail) {
if (!verbose) {
console.warn('Use --verbose to see the flaws before it exits')
}
throw new Error(`Found ${allFlaws.length} flaw(s) in ${page.relativePath}`)
}
}
return allFlaws
}
async function processPermalink(
core: any,
permalink: Permalink,
page: Page,
pageMap: PageMap,
redirects: Redirects,
opts: Options,
db: DBType,
) {
const {
level = 'critical',
checkAnchors,
checkImages,
checkExternalLinks,
verbose,
patient,
externalServerErrorsAsWarning,
} = opts
let html = ''
try {
html = await renderInnerHTML(page, permalink)
} catch (error) {
console.warn(
`The error happened trying to render ${page.relativePath} (permalink: ${permalink.href})`,
)
throw error
}
const $ = cheerio.load(html, { xmlMode: true })
const flaws: LinkFlaw[] = []
const links: cheerio.Element[] = []
$('a[href]').each((i, link) => {
links.push(link)
})
const newFlaws: LinkFlaw[] = await Promise.all(
links.map(async (link) => {
const { href } = (link as cheerio.TagElement).attribs
// The global cache can't be used for anchor links because they
// depend on the specific page they're rendered on
if (!href.startsWith('#')) {
if (globalHrefCheckCache.has(href)) {
globalCacheHitCount++
return globalHrefCheckCache.get(href)
}
globalCacheMissCount++
}
const flaw = await checkHrefLink(
core,
href,
$,
redirects,
pageMap,
checkAnchors,
checkExternalLinks,
externalServerErrorsAsWarning,
permalink,
{ verbose, patient },
db,
)
if (flaw) {
if (level === 'critical' && !flaw.CRITICAL) {
return
}
const text = $(link).text()
if (!href.startsWith('#')) {
globalHrefCheckCache.set(href, { href, flaw, text })
}
return { href, flaw, text }
} else {
if (!href.startsWith('#')) {
globalHrefCheckCache.set(href, flaw)
}
}
}),
)
for (const flaw of newFlaws) {
if (flaw) {
flaws.push(Object.assign(flaw, { page, permalink }))
}
}
if (checkImages) {
$('img[src]').each((i, img) => {
let { src } = (img as cheerio.TagElement).attribs
// Images get a cache-busting path segment injected into their src.
// E.g. <img src="/assets/cb-123456/foo/bar.png">
// We need to remove that, otherwise we can't look up the image
// on disk.
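// The replace below turns '/assets/cb-123456/foo/bar.png' into '/assets/foo/bar.png'.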
src = src.replace(/\/cb-\d+\//, '/')
if (globalImageSrcCheckCache.has(src)) {
globalCacheHitCount++
return globalImageSrcCheckCache.get(src)
}
const flaw = checkImageSrc(src)
globalImageSrcCheckCache.set(src, flaw)
if (flaw) {
if (level === 'critical' && !flaw.CRITICAL) {
return
}
flaws.push({ permalink, page, src, flaw })
}
})
}
return flaws
}
async function uploadJsonFlawsArtifact(
uploadArtifact: UploadArtifact,
flaws: LinkFlaw[],
{ verboseUrl = null }: { verboseUrl?: string | null } = {},
artifactName = 'all-rendered-link-flaws.json',
) {
type PrintableLinkFlaw = {
href?: string
url?: string
text?: string
src?: string
flaw?: Flaw
}
const printableFlaws: Record<string, PrintableLinkFlaw[]> = {}
for (const { page, permalink, href, text, src, flaw } of flaws) {
const fullPath = prettyFullPath(page.fullPath)
if (!(fullPath in printableFlaws)) {
printableFlaws[fullPath] = []
}
if (href) {
printableFlaws[fullPath].push({
href,
url: verboseUrl ? new URL(permalink.href, verboseUrl).toString() : permalink.href,
text,
flaw,
})
} else if (src) {
printableFlaws[fullPath].push({
src,
})
}
}
const message = JSON.stringify(printableFlaws, undefined, 2)
return uploadArtifact(artifactName, message)
}
function printFlaws(
core: CoreInject,
flaws: LinkFlaw[],
{ verboseUrl }: { verboseUrl?: string | undefined } = {},
) {
let previousPage = null
let previousPermalink = null
for (const { page, permalink, href, text, src, flaw } of flaws) {
const fullPath = prettyFullPath(page.fullPath)
if (page !== previousPage) {
core.info(`PAGE: ${chalk.bold(fullPath)}`)
}
previousPage = page
if (href) {
if (previousPermalink !== permalink.href) {
if (verboseUrl) {
core.info(` URL: ${new URL(permalink.href, verboseUrl).toString()}`)
} else {
core.info(` PERMALINK: ${permalink.href}`)
}
}
previousPermalink = permalink.href
core.info(` HREF: ${chalk.bold(href)}`)
core.info(` TEXT: ${text}`)
} else if (src) {
core.info(` IMG SRC: ${chalk.bold(src)}`)
} else {
throw new Error("Flaw has neither 'href' nor 'src'")
}
core.info(` FLAW: ${flaw.CRITICAL ? chalk.red(flaw.CRITICAL) : chalk.yellow(flaw.WARNING)}`)
}
}
// Given a full path, make it relative to `cwd()` so that you can copy it
// from the output and paste it after something like `code ...here...`.
// The problem with displaying the full path is that it's quite noisy and
// takes up a lot of space. Sure, you can copy and paste it in front of
// `vi` or `ls` or `code`, but displayed relative to `cwd()` you can
// still paste it to the next command without it taking up so much space.
function prettyFullPath(fullPath: string) {
return path.relative(process.cwd(), fullPath)
}
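// E.g. with a (hypothetical) cwd of '/home/user/docs',
// prettyFullPath('/home/user/docs/content/education/quickstart.md')
// returns 'content/education/quickstart.md'.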
const globalHrefCheckCache = new Map()
const globalImageSrcCheckCache = new Map()
let globalCacheHitCount = 0
let globalCacheMissCount = 0
async function checkHrefLink(
core: any,
href: string,
$: cheerio.Root,
redirects: Redirects,
pageMap: PageMap,
checkAnchors = false,
checkExternalLinks = false,
externalServerErrorsAsWarning: string | undefined | null = null,
permalink: Permalink,
{ verbose = false, patient = false }: { verbose?: boolean; patient?: boolean } = {},
db: DBType | null = null,
): Promise<Flaw | undefined> {
// this function handles hrefs in all the following forms:
// same article links:
// 1. '#'
// 2. '#anchor'
// 3. '/to/this/article#anchor'
// different article links:
// 4. '/some/path/article#anchor' (currently not supported)
// 5. '/some/path/article'
// external links:
// 6. 'https://example.com' (external link)
const [pathFragment, hashFragment] = href.split('#')
const hash = '#' + hashFragment // the hash is the part that starts with `#`
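// For example, '/to/this/article#anchor' splits into a pathFragment of
// '/to/this/article' and a hash of '#anchor', while a bare '#anchor'
// leaves the pathFragment empty.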
// this conditional handles cases in which the link is to the current article (cases 1-3 above)
if (checkAnchors && (!pathFragment || pathFragment === permalink.href)) {
// cases covered by this part of the conditional:
// 1. '#'
if (hash === '#') {
return { WARNING: 'Link is just an empty `#`' }
}
// cases covered by this part of the conditional:
// 2. '#anchor'
// 3. '/to/this/article#anchor'
else {
// Some pages are a mix of Markdown and React components. On its own,
// the Markdown might appear broken but when combined with automated
// React rendering it might work. Best to stay out of it.
const avoid =
permalink &&
((permalink.href.includes('/rest/') && !permalink.href.includes('/rest/guides/')) ||
permalink.href.includes('/webhooks-and-events/webhooks/webhook-events-and-payloads') ||
permalink.href.includes('/graphql/reference') ||
permalink.href.includes('/code-security/codeql-cli/codeql-cli-manual/') ||
permalink.href.includes(
'/apps/maintaining-github-apps/modifying-a-github-app-registration',
) ||
permalink.href.includes(
'/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning',
) ||
permalink.href.includes(
'/site-policy/github-company-policies/github-statement-against-modern-slavery-and-child-labor',
) ||
permalink.href.includes('/site-policy/content-removal-policies/dmca-takedown-policy') ||
permalink.href.includes('/early-access/'))
// You don't need a DOM ID (or <a name="top">) for `<a href="#top">`
// to work in all modern browsers.
if (hash !== '#top' && !avoid) {
// If the link is `#foo` it could either match `<element id="foo">`
// or it could match `<a name="foo">`.
const countDOMItems = $(hash).length + $(`a[name="${hash.slice(1)}"]`).length
if (countDOMItems === 0) {
return { CRITICAL: `Anchor on the same page can't be found by ID` }
} else if (countDOMItems > 1) {
return { CRITICAL: `Matches multiple points in the page` }
}
}
}
}
// this conditional handles cases in which the link is to a different article or externally (cases 4-6 above)
else {
// cases covered by this part of the conditional:
// 4. '/some/path/article#anchor' (currently not supported)
// 5. '/some/path/article'
if (href.startsWith('/')) {
const pathname = new URL(href, 'http://example.com').pathname
// we drop any hashes due to `.pathname`
// we don't currently support hashes for other articles we link to: /some/path/article#anchor
// Remember, if the Markdown has something like
//
// See [my link](/some/page/)
//
// In the post-processing, that will actually become
//
// See <a href="/en/some/page">my link</a>
//
// But, if that link was a redirect, that would have been left
// untouched.
if (pathname.endsWith('/')) {
const whatifPathname = pathname.slice(0, -1)
if (getRedirect(whatifPathname, { redirects, pages: pageMap })) {
return {
WARNING: `Redirect to ${getRedirect(whatifPathname, { redirects, pages: pageMap })}`,
}
} else if (!pageMap[whatifPathname]) {
if (!deprecatedVersionPrefixesRegex.test(whatifPathname)) {
return { CRITICAL: 'Broken link' }
}
}
return { WARNING: 'Links with a trailing / will always redirect' }
} else {
const firstPart = pathname.split('/')[1]
if (STATIC_PREFIXES[firstPart]) {
const staticFilePath = path.join(
STATIC_PREFIXES[firstPart],
pathname.split(path.sep).slice(2).join(path.sep),
)
if (!fs.existsSync(staticFilePath)) {
return { CRITICAL: `Static file not found ${staticFilePath} (${pathname})` }
}
} else if (getRedirect(pathname, { redirects, pages: pageMap })) {
return { WARNING: `Redirect to ${getRedirect(pathname, { redirects, pages: pageMap })}` }
} else if (!pageMap[pathname]) {
if (deprecatedVersionPrefixesRegex.test(pathname)) {
return
}
return { CRITICAL: 'Broken link' }
}
}
}
// cases covered by this part of the conditional:
// 6. 'https://example.com' (external link)
else if (checkExternalLinks) {
if (!href.startsWith('https://')) {
return { WARNING: `Will not check external URLs that are not HTTPS (${href})` }
}
if (linksToSkip(href)) {
return
}
const { ok, ...info } = await checkExternalURLCached(core, href, { verbose, patient }, db)
if (!ok) {
// By default, a not-OK result for an external link is CRITICAL,
// but if it was a `requestError` or the statusCode was >= 500,
// then downgrade it to WARNING.
let problem = 'CRITICAL'
if (externalServerErrorsAsWarning) {
if (
(info.statusCode && info.statusCode >= 500) ||
(info.requestError && isTemporaryRequestError(info.requestError))
) {
problem = 'WARNING'
}
}
return { [problem]: `Broken external link (${JSON.stringify(info)})`, isExternal: true }
}
}
}
}
// Return true if the request error is plausibly temporary. For example,
// a request to `https://exammmmple.org` will fail with `ENOTFOUND` because
// the DNS entry doesn't exist, so simply trying again later won't help.
// However, an `ETIMEDOUT` means it could have worked but didn't this
// time, and might if we try again at a different hour or day.
function isTemporaryRequestError(requestError: string | undefined) {
if (typeof requestError === 'string') {
// See https://betterstack.com/community/guides/scaling-nodejs/nodejs-errors/
// for a definition of each one.
const errorEnums = ['ECONNRESET', 'ECONNREFUSED', 'ETIMEDOUT', 'ECONNABORTED']
return errorEnums.some((enum_) => requestError.includes(enum_))
}
return false
}
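// E.g. with (typical Node-style) error messages,
// isTemporaryRequestError('connect ETIMEDOUT 93.184.216.34:443') returns
// true, while isTemporaryRequestError('getaddrinfo ENOTFOUND exammmmple.org')
// returns false.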
// This disk-based memoization can't happen inside checkExternalURL
// itself, because that function may return a shared in-flight Promise:
// it already collates multiple URLs under the same cache key.
async function checkExternalURLCached(
core: CoreInject,
href: string,
{ verbose, patient }: { verbose?: boolean; patient?: boolean },
db: DBType | null,
) {
const cacheMaxAge = EXTERNAL_LINK_CHECKER_MAX_AGE_MS
const now = new Date().getTime()
const url = href.split('#')[0]
if (cacheMaxAge) {
const tooOld = now - Math.floor(jitter(cacheMaxAge, 10))
if (db && db.data.urls[url]) {
if (db.data.urls[url].timestamp > tooOld) {
if (verbose) {
core.info(`External URL ${url} in cache`)
}
return db.data.urls[url].result
} else if (verbose) {
core.info(`External URL ${url} in cache but too old`)
// Delete it so the cache file doesn't bloat infinitely
delete db.data.urls[url]
}
}
}
const result = await checkExternalURL(core, href, {
verbose,
patient,
})
if (cacheMaxAge) {
// By only caching successful results, we give 4xx and 5xx errors
// another go next time.
if (db && result.ok) {
db.data.urls[url] = {
timestamp: now,
result,
}
}
}
return result
}
const _fetchCache = new Map()
async function checkExternalURL(
core: CoreInject,
url: string,
{ verbose = false, patient = false } = {},
) {
if (!url.startsWith('https://')) throw new Error('Invalid URL')
const cleanURL = url.split('#')[0]
if (!_fetchCache.has(cleanURL)) {
_fetchCache.set(cleanURL, innerFetch(core, cleanURL, { verbose, patient }))
}
return _fetchCache.get(cleanURL)
}
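// Note: in checkExternalURL above, two hrefs that differ only by their
// hash share one in-flight request. E.g. 'https://example.com/page#a' and
// 'https://example.com/page#b' both resolve to the same _fetchCache entry
// for 'https://example.com/page'.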
const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms))
// Global for recording which domains we get rate-limited on.
// For example, if you got rate limited on `something.github.com/foo`
// and now we're asked to fetch `something.github.com/bar`,
// it's good to know not to bother yet.
const _rateLimitedDomains = new Map()
async function innerFetch(
core: CoreInject,
url: string,
config: { verbose?: boolean; useGET?: boolean; patient?: boolean; retries?: number } = {},
) {
const { verbose, useGET, patient } = config
const { hostname } = new URL(url)
if (_rateLimitedDomains.has(hostname)) {
await sleep(_rateLimitedDomains.get(hostname))
}
// The way `got` does retries:
//
// sleep = 1000 * Math.pow(2, retry - 1) + Math.random() * 100
//
// So, it means:
//
// 1. ~1000ms
// 2. ~2000ms
// 3. ~4000ms
//
// ...if the limit we set is 3.
// Our own timeout, in @/frame/middleware/timeout.js, defaults to 10 seconds.
// So there's no point in trying more than 3 attempts, because the
// accumulated sleeps would exceed the 10s timeout (1000 + 2000 + 4000 + 8000 > 10,000).
const retry = {
limit: patient ? 6 : 2,
}
const timeout = { request: patient ? 10000 : 2000 }
const headers = {
'User-Agent':
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36',
}
const retries = config.retries || 0
const method = useGET ? 'GET' : 'HEAD'
if (verbose) core.info(`External URL ${method}: ${url} (retries: ${retries})`)
try {
const r = await fetchWithRetry(
url,
{
method,
headers,
},
{
retries: retry.limit,
timeout: timeout.request,
throwHttpErrors: false,
},
)
if (verbose) {
core.info(`External URL ${method} ${url}: ${r.status} (retries: ${retries})`)
}
// If we get rate limited, remember that this hostname is now all
// rate limited. And sleep for the number of seconds that the
// `retry-after` header indicated.
if (r.status === 429) {
let sleepTime = Math.min(
60_000,
Math.max(
10_000,
r.headers.get('retry-after') ? getRetryAfterSleep(r.headers.get('retry-after')) : 1_000,
),
)
// Sprinkle in a little jitter so the requests don't all start again
// at the same time
sleepTime += Math.random() * 10 * 1000
// Give it a bit extra when we can be really patient
if (patient) sleepTime += 30 * 1000
_rateLimitedDomains.set(hostname, sleepTime + Math.random() * 10 * 1000)
if (verbose)
core.info(
chalk.yellow(
`Rate limited on ${hostname} (${url}). Sleeping for ${(sleepTime / 1000).toFixed(1)}s`,
),
)
await sleep(sleepTime)
return innerFetch(core, url, Object.assign({}, config, { retries: retries + 1 }))
} else {
_rateLimitedDomains.delete(hostname)
}
// Perhaps the server doesn't support HEAD requests.
// If so, try again with a regular GET.
if ((r.status === 405 || r.status === 404 || r.status === 403) && !useGET) {
return innerFetch(core, url, Object.assign({}, config, { useGET: true }))
}
if (verbose) {
core.info((r.ok ? chalk.green : chalk.red)(`${r.status} on ${url}`))
}
return { ok: r.ok, statusCode: r.status }
} catch (err) {
if (err instanceof Error) {
if (verbose) {
core.info(chalk.yellow(`Request Error (${err.message}) on ${url}`))
}
return { ok: false, requestError: err.message }
}
throw err
}
}
// Return number of milliseconds from a `Retry-After` header value
function getRetryAfterSleep(headerValue: string | null) {
if (!headerValue) return 0
let ms = Math.round(parseFloat(headerValue) * 1000)
if (isNaN(ms)) {
const nextDate = new Date(headerValue)
ms = Math.max(0, nextDate.getTime() - new Date().getTime())
}
return ms
}
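// E.g. getRetryAfterSleep('120') returns 120000 (120 seconds), while an
// HTTP-date value like getRetryAfterSleep('Wed, 21 Oct 2030 07:28:00 GMT')
// returns the number of milliseconds from now until that moment
// (clamped to 0 if the date is in the past).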
function checkImageSrc(src: string) {
if (!src.startsWith('/') && !src.startsWith('http')) {
return { CRITICAL: 'Image path is not absolute. Should start with a /' }
}
const pathname = new URL(src, 'http://example.com').pathname
if (pathname.startsWith('http://')) {
return { CRITICAL: "Don't use insecure HTTP:// for external images" }
}
if (!pathname.startsWith('/')) {
return { WARNING: "External images can't not be checked" }
}
const prefix = pathname.split('/')[1]
if (prefix in STATIC_PREFIXES) {
const staticFilePath = path.join(
STATIC_PREFIXES[prefix],
pathname.split(path.sep).slice(2).join(path.sep),
)
if (!fs.existsSync(staticFilePath)) {
return { CRITICAL: `Static file not found (${pathname})` }
}
} else {
return { WARNING: `Unrecognized image src prefix (${prefix})` }
}
}
function summarizeFlaws(core: CoreInject, flaws: LinkFlaw[]) {
if (flaws.length) {
core.info(
chalk.bold(
`Found ${flaws.length.toLocaleString()} flaw${flaws.length === 1 ? '' : 's'} in total.`,
),
)
} else {
core.info(chalk.green('No flaws found! 💖'))
}
}
function summarizeCounts(core: CoreInject, pages: Page[], tookSeconds: number) {
const count = pages.map((page) => page.permalinks.length).reduce((a, b) => a + b, 0)
core.info(
`Tested ${count.toLocaleString()} permalinks across ${pages.length.toLocaleString()} pages`,
)
core.info(`Took ${Math.floor(tookSeconds)} seconds. (~${(tookSeconds / 60).toFixed(1)} minutes)`)
const permalinksPerSecond = count / tookSeconds
core.info(`~${permalinksPerSecond.toFixed(1)} permalinks per second.`)
const pagesPerSecond = pages.length / tookSeconds
core.info(`~${pagesPerSecond.toFixed(1)} pages per second.`)
}
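// In-place Fisher-Yates shuffle; used when `opts.random` is set to
// randomize the page order for debugging.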
function shuffle(array: any[]) {
let currentIndex = array.length
let randomIndex
// While there remain elements to shuffle...
while (currentIndex !== 0) {
// Pick a remaining element...
randomIndex = Math.floor(Math.random() * currentIndex)
currentIndex--
// And swap it with the current element.
;[array[currentIndex], array[randomIndex]] = [array[randomIndex], array[currentIndex]]
}
return array
}
async function renderInnerHTML(page: Page, permalink: Permalink) {
const next = () => {}
const res = {}
const pagePath = permalink.href
const context: Context = {}
const req = {
path: pagePath,
language: permalink.languageCode,
pagePath,
cookies: {},
context,
}
// This will create and set `req.context = {...}`
await contextualize(req as ExtendedRequest, res as Response, next)
await shortVersions(req as ExtendedRequest, res as Response, next)
req.context.page = page
features(req as ExtendedRequest, res as Response, next)
req.context.relativePath = page.relativePath
const guts = [page.rawIntro, page.rawPermissions, page.markdown].filter(Boolean).join('\n').trim()
// These lines do what the ubiquitous `renderContent` function does,
// but at an absolute minimum to get a string of HTML.
const markdown = await liquid.parseAndRender(guts, req.context)
const processor = createMinimalProcessor(req.context)
const vFile = await processor.process(markdown)
return vFile.toString()
}
export default main