/* See function main in this file for documentation */

import fs from 'fs'
import path from 'path'
import cheerio from 'cheerio'
import coreLib from '@actions/core'
import got, { RequestError } from 'got'
import chalk from 'chalk'
import { Low } from 'lowdb'
import { JSONFile } from 'lowdb/node'

import shortVersions from '../../middleware/contextualizers/short-versions.js'
import contextualize from '../../middleware/context.js'
import features from '../../middleware/contextualizers/features.js'
import getRedirect from '../../lib/get-redirect.js'
import warmServer from '../../lib/warm-server.js'
import liquid from '../../lib/render-content/liquid.js'
import { deprecated } from '../../lib/enterprise-server-releases.js'
import excludedLinks from '../../lib/excluded-links.js'
import { getEnvInputs, boolEnvVar } from './lib/get-env-inputs.js'
import { debugTimeEnd, debugTimeStart } from './lib/debug-time-taken.js'
import { uploadArtifact as uploadArtifactLib } from './lib/upload-artifact.js'
import github from '../../script/helpers/github.js'
import { getActionContext } from './lib/action-context.js'
import { createMinimalProcessor } from '../../lib/render-content/create-processor.js'

const STATIC_PREFIXES = {
  assets: path.resolve('assets'),
  public: path.resolve(path.join('data', 'graphql')),
}
// Sanity check that these are valid paths
Object.entries(STATIC_PREFIXES).forEach(([key, value]) => {
  if (!fs.existsSync(value)) {
    throw new Error(`Can't find static prefix (${key}): ${value}`)
  }
})

// By default, we don't cache external link checks to disk.
// Setting this env var to something >0 enables the disk-based
// caching of external links.
const EXTERNAL_LINK_CHECKER_MAX_AGE_MS =
  parseInt(process.env.EXTERNAL_LINK_CHECKER_MAX_AGE_DAYS || '0', 10) * 24 * 60 * 60 * 1000
const EXTERNAL_LINK_CHECKER_DB =
  process.env.EXTERNAL_LINK_CHECKER_DB || 'external-link-checker-db.json'

const adapter = new JSONFile(EXTERNAL_LINK_CHECKER_DB)
const externalLinkCheckerDB = new Low(adapter)

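// Example (illustrative, not part of the action's workflow): enabling the
// disk cache for a local run. Any max-age >0 turns caching on.
//
//   EXTERNAL_LINK_CHECKER_MAX_AGE_DAYS=7 \
//   EXTERNAL_LINK_CHECKER_DB=/tmp/link-checker-cache.json \
//   node rendered-content-link-checker.js
//
// (The invocation above is assumed; run this file however your checkout
// names it.)
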
// Given a number and a percentage, return the number randomly nudged up or
// down by at most that percentage.
// E.g. `jitter(55, 10)` will return a value between `[55 - 55/10, 55 + 55/10]`.
// This is useful so that cache timestamps, all written around the day the
// caching started, don't ALL expire on the same day. Instead they start to
// expire in a bit of a "random pattern", so you don't get all-or-nothing
// cache invalidation.
function jitter(base, percentage) {
  const r = percentage / 100
  const sign = Math.random() > 0.5 ? -1 : 1
  return base + base * Math.random() * r * sign
}
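
// Example (illustrative): jittering a 7-day cache max-age by ±10%, so each
// entry's effective expiry lands somewhere between ~6.3 and ~7.7 days:
//
//   const maxAgeMs = 7 * 24 * 60 * 60 * 1000
//   const effectiveMaxAgeMs = Math.floor(jitter(maxAgeMs, 10))
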
// Return a function that can check, as quickly as possible, whether a
// certain href input should be skipped.
// We do it this way so we can use a `Set` for the exact-string exclusions
// and only fall back to the slower `Array.prototype.some()` over regexes.
function linksToSkipFactory() {
  const set = new Set(excludedLinks.filter((regexOrURL) => typeof regexOrURL === 'string'))
  const regexes = excludedLinks.filter((regexOrURL) => regexOrURL instanceof RegExp)
  return (href) => set.has(href) || regexes.some((regex) => regex.test(href))
}

const linksToSkip = linksToSkipFactory()
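
// Example (illustrative; the real entries live in lib/excluded-links.js,
// which mixes plain strings and RegExps):
//
//   linksToSkip('https://example.com/exact-match') // true if listed verbatim (Set hit)
//   linksToSkip('https://example.com/some/path')   // true if a RegExp entry matches
//   linksToSkip('https://docs.github.com/en')      // false if nothing matches
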
const CONTENT_ROOT = path.resolve('content')

const deprecatedVersionPrefixesRegex = new RegExp(
  `enterprise(-server@|/)(${deprecated.join('|')})(/|$)`
)
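
// Example (illustrative): if `deprecated` contains '2.13', this regex matches
// pathnames into that deprecated version, which checkHrefLink() then skips:
//
//   /en/enterprise-server@2.13/admin/installation
//   /en/enterprise/2.13/user/articles/about-pull-requests
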
// When this file is invoked directly from the action, as opposed to being imported
if (import.meta.url.endsWith(process.argv[1])) {
  // Optional env vars
  const {
    ACTION_RUN_URL,
    LEVEL,
    FILES_CHANGED,
    REPORT_REPOSITORY,
    REPORT_AUTHOR,
    REPORT_LABEL,
  } = process.env

  const octokit = github()

  // Parse changed files JSON string
  let files
  if (FILES_CHANGED) {
    const fileList = JSON.parse(FILES_CHANGED)
    if (Array.isArray(fileList) && fileList.length > 0) {
      files = fileList
    } else {
      console.warn(`No changed files found in PR: ${FILES_CHANGED}. Exiting...`)
      process.exit(0)
    }
  }

  const opts = {
    level: LEVEL,
    files,
    verbose: true,
    linkReports: true,
    checkImages: true,
    patient: boolEnvVar('PATIENT'),
    random: false,
    language: 'en',
    actionUrl: ACTION_RUN_URL,
    checkExternalLinks: boolEnvVar('CHECK_EXTERNAL_LINKS'),
    shouldComment: boolEnvVar('SHOULD_COMMENT'),
    commentLimitToExternalLinks: boolEnvVar('COMMENT_LIMIT_TO_EXTERNAL_LINKS'),
    failOnFlaw: boolEnvVar('FAIL_ON_FLAW'),
    createReport: boolEnvVar('CREATE_REPORT'),
    reportRepository: REPORT_REPOSITORY,
    reportLabel: REPORT_LABEL,
    reportAuthor: REPORT_AUTHOR,
    actionContext: getActionContext(),
    externalServerErrorsAsWarning: boolEnvVar('EXTERNAL_SERVER_ERRORS_AS_WARNINGS'),
  }

  if (opts.shouldComment || opts.createReport) {
    // `GITHUB_TOKEN` is required to post a comment or open an issue report;
    // without it you'd only get cryptic error messages from Octokit, so
    // validate it up front.
    getEnvInputs(['GITHUB_TOKEN'])
  }

  main(coreLib, octokit, uploadArtifactLib, opts)
}

/*
 * Renders all or specified pages to gather all links on them and verify them.
 * Checks internal links deterministically using the filesystem and external links via external requests.
 * Links are considered broken for reporting and commenting if they are broken at the specified "level",
 * e.g. redirects are considered a "warning" while 404s are considered "critical".
 *
 * When there are broken links (flaws) this action can:
 * 1. Create a report issue in a specified reportRepository and link it to previous reportIssues
 * 2. Create a comment similar to a report on a PR that triggered this action
 * 3. Fail using core.setFailed when there are broken links
 *
 * opts:
 * level {"warning" | "critical"} Counts links as "flaws" based on this value and status criteria
 * files {Array<string>} - Limit link checking to specific files (usually changed in PR)
 * language {string | Array<string>} - Render pages to check from included language (or languages array)
 * checkExternalLinks {boolean} - Checks non docs.github.com urls (takes significantly longer)
 * checkImages {boolean} - Check image src urls
 * failOnFlaw {boolean} - When true will fail using core.setFailed when links are broken according to level (flaw)
 * shouldComment {boolean} - When true attempts to comment flaws on PR that triggered action
 * commentLimitToExternalLinks {boolean} - When true PR comment only includes external links
 * createReport {boolean} - Creates an issue comment in reportRepository with links considered broken (flaws)
 * linkReports {boolean} - When createReport is true, link the issue report to previous report(s) via comments
 * reportRepository {string} - Repository in form of "owner/repo-name" that report issue will be created in
 * reportLabel {string} - Label assigned to report issue
 * reportAuthor {string} - Expected author of previous report issue for linking reports (a bot user like Docubot)
 * actionUrl {string} - Used to link report or comment to the action instance for debugging
 * actionContext {object} - Event payload context when run from action or injected. Should include { repo, owner }
 * verbose {boolean} - Set to true for more verbose logging
 * random {boolean} - Randomize page order for debugging when true
 * patient {boolean} - Wait longer and retry more times for rate-limited external URLs
 * bail {boolean} - Throw an error on the first page (not permalink) that has >0 flaws
 * externalServerErrorsAsWarning {boolean} - Treat >=500 errors or temporary request errors as warning
 * filter {Array<string>} - strings to match the pages' relativePath
 * versions {Array<string>} - only certain pages' versions (e.g. )
 *
 */
async function main(core, octokit, uploadArtifact, opts = {}) {
  const {
    level = 'warning',
    files = [],
    random,
    language = 'en',
    filter,
    version,
    max,
    verbose,
    checkExternalLinks = false,
    createReport = false,
    failOnFlaw = false,
    shouldComment = false,
  } = opts

  // Note! The reason we're using `warmServer()` in this script,
  // even though there's no server involved, is because
  // the `contextualize()` function calls it.
  // And because `warmServer()` is actually idempotent, it's cheap to call
  // it more than once, so calling it both here and from within
  // `contextualize()` costs almost nothing.
  // If we instead manually did the same operations that `warmServer()` does
  // here (e.g. `loadPageMap()`), we'd end up having to do it all over
  // again the next time `contextualize()` is called.
  const { redirects, pages: pageMap, pageList } = await warmServer()

  if (files.length) {
    core.debug(`Limiting to files list: ${files.join(', ')}`)
  }

  let languages = language
  if (!Array.isArray(languages)) {
    languages = [languages]
  }

  const filters = filter || []
  if (filters && !Array.isArray(filters)) {
    throw new Error(`filters, ${filters} is not an array`)
  }

  let versions = version || []
  if (versions && typeof versions === 'string') {
    versions = [versions]
  } else if (!Array.isArray(versions)) {
    throw new Error(`versions, '${version}' is not an array`)
  }

  if (random) {
    shuffle(pageList)
  }

  debugTimeStart(core, 'getPages')
  const pages = getPages(pageList, languages, filters, files, max)
  debugTimeEnd(core, 'getPages')

  if (checkExternalLinks && pages.length >= 100) {
    core.warning(
      `Warning! Checking external URLs can be time costly. You're testing ${pages.length} pages.`
    )
  }

  await externalLinkCheckerDB.read()
  externalLinkCheckerDB.data ||= { urls: {} }

  debugTimeStart(core, 'processPages')
  const t0 = new Date().getTime()
  const flawsGroups = await Promise.all(
    pages.map((page) =>
      processPage(core, page, pageMap, redirects, opts, externalLinkCheckerDB, versions)
    )
  )
  const t1 = new Date().getTime()
  debugTimeEnd(core, 'processPages')

  await externalLinkCheckerDB.write()

  const flaws = flawsGroups.flat()

  printGlobalCacheHitRatio(core)

  if (verbose) {
    summarizeCounts(core, pages, (t1 - t0) / 1000)
    core.info(`Checked ${(globalCacheHitCount + globalCacheMissCount).toLocaleString()} links`)
  }

  summarizeFlaws(core, flaws)

  if (flaws.length > 0) {
    await uploadJsonFlawsArtifact(uploadArtifact, flaws, opts)
    core.info(`All flaws written to artifact log.`)
    if (createReport) {
      core.info(`Creating issue for flaws...`)
      const newReport = await createReportIssue(core, octokit, flaws, opts)
      if (opts.linkReports) {
        await linkReports(core, octokit, newReport, opts)
      }
    }
    if (shouldComment) {
      await commentOnPR(core, octokit, flaws, opts)
    }

    const flawsInLevel = flaws.filter((flaw) => {
      if (level === 'critical') {
        return flaw?.flaw?.CRITICAL
      }
      // WARNING level and above
      return true
    })

    if (flawsInLevel.length > 0) {
      core.setOutput('has_flaws_at_level', flawsInLevel.length > 0)
      if (failOnFlaw) {
        core.setFailed(
          `${flawsInLevel.length} broken links found. See action artifact uploads for details`
        )
      }
    }
  }
}

async function createReportIssue(core, octokit, flaws, opts) {
  const { reportRepository = 'github/docs-content', reportLabel = 'broken link report' } = opts
  const [owner, repo] = reportRepository.split('/')

  const brokenLinksDisplay = flawIssueDisplay(flaws, opts)

  // Create issue with broken links
  let newReport
  try {
    const { data } = await octokit.request('POST /repos/{owner}/{repo}/issues', {
      owner,
      repo,
      title: `${flaws.length} broken links found`,
      body: brokenLinksDisplay,
      labels: [reportLabel],
    })
    newReport = data
    core.info(`Created broken links report at ${newReport.html_url}\n`)
  } catch (error) {
    core.error(error)
    core.setFailed('Error creating new issue')
    throw error
  }

  return newReport
}

async function linkReports(core, octokit, newReport, opts) {
  const {
    reportRepository = 'github/docs-content',
    reportAuthor = 'docs-bot',
    reportLabel = 'broken link report',
  } = opts

  const [owner, repo] = reportRepository.split('/')

  core.debug('Attempting to link reports...')
  // Find previous broken link report issue
  let previousReports
  try {
    previousReports = await octokit.rest.issues.listForRepo({
      owner,
      repo,
      creator: reportAuthor,
      labels: reportLabel,
      state: 'all', // We want to get the previous report, even if it is closed
      sort: 'created',
      direction: 'desc',
      per_page: 25,
    })
    previousReports = previousReports.data
  } catch (error) {
    core.setFailed('Error listing issues for repo')
    throw error
  }
  core.debug(`Found ${previousReports.length} previous reports`)

  if (previousReports.length <= 1) {
    core.info('No previous reports to link to')
    return
  }

  // 2nd report should be most recent previous report
  const previousReport = previousReports[1]

  // Comment the old report link on the new report
  try {
    await octokit.rest.issues.createComment({
      owner,
      repo,
      issue_number: newReport.number,
      body: `⬅️ [Previous report](${previousReport.html_url})`,
    })
    core.info(`Linked old report to new report via comment on new report, #${newReport.number}`)
  } catch (error) {
    core.setFailed(`Error commenting on newReport, #${newReport.number}`)
    throw error
  }

  // Comment on all previous reports that are still open
  for (const previousReport of previousReports) {
    if (previousReport.state === 'closed' || previousReport.html_url === newReport.html_url) {
      continue
    }

    // If an old report is not assigned to someone we close it
    const shouldClose = !previousReport.assignees.length
    let body = `➡️ [Newer report](${newReport.html_url})`
    if (shouldClose) {
      body += '\n\nClosing in favor of newer report since there are no assignees on this issue'
    }
    try {
      await octokit.rest.issues.createComment({
        owner,
        repo,
        issue_number: previousReport.number,
        body,
      })
      core.info(
        `Linked old report to new report via comment on old report: #${previousReport.number}.`
      )
    } catch (error) {
      core.setFailed(`Error commenting on previousReport, #${previousReport.number}`)
      throw error
    }
    if (shouldClose) {
      try {
        await octokit.rest.issues.update({
          owner,
          repo,
          issue_number: previousReport.number,
          state: 'closed',
        })
        core.info(`Closing old report: #${previousReport.number} because it doesn't have assignees`)
      } catch (error) {
        core.setFailed(`Error closing previousReport, #${previousReport.number}`)
        throw error
      }
    }
  }
}

async function commentOnPR(core, octokit, flaws, opts) {
  const { actionContext = {} } = opts
  const { owner, repo } = actionContext
  const pullNumber = actionContext?.pull_request?.number
  if (!owner || !repo || !pullNumber) {
    core.warning(`commentOnPR called outside of PR action runner context. Not creating comment.`)
    return
  }

  const body = flawIssueDisplay(flaws, opts, false)
  // Since failed external urls aren't included in PR comment, body may be empty
  if (!body) {
    core.info('No flaws qualify for comment')
    return
  }

  try {
    await octokit.rest.issues.createComment({
      owner,
      repo,
      issue_number: pullNumber,
      body,
    })
    core.info(`Created comment on PR: ${pullNumber}`)
  } catch (error) {
    core.setFailed(`Error commenting on PR when there are flaws`)
    throw error
  }
}

function flawIssueDisplay(flaws, opts, mentionExternalExclusionList = true) {
  let output = ''
  let flawsToDisplay = 0

  // Group broken links for each page
  const hrefsOnPageGroup = {}
  for (const { page, permalink, href, text, src, flaw } of flaws) {
    // When we don't want to include external links in PR comments
    if (opts.commentLimitToExternalLinks && !flaw.isExternal) {
      continue
    }

    flawsToDisplay++

    const pageKey = page.fullPath
    if (!hrefsOnPageGroup[pageKey]) {
      hrefsOnPageGroup[pageKey] = {}
    }

    const linkKey = href || src
    if (!hrefsOnPageGroup[pageKey][linkKey]) {
      hrefsOnPageGroup[pageKey][linkKey] = { href, text, src, flaw, permalinkHrefs: [] }
    }

    if (!hrefsOnPageGroup[pageKey][linkKey].permalinkHrefs.includes(permalink.href)) {
      hrefsOnPageGroup[pageKey][linkKey].permalinkHrefs.push(permalink.href)
    }
  }

  // Don't comment if there are no qualifying flaws
  if (!flawsToDisplay) {
    return ''
  }

  // Build flaw display text
  for (const [pagePath, pageHrefs] of Object.entries(hrefsOnPageGroup)) {
    const fullPath = prettyFullPath(pagePath)
    output += `\n\n### In \`${fullPath}\`\n`

    for (const [, hrefObj] of Object.entries(pageHrefs)) {
      if (hrefObj.href) {
        output += `\n\n - Href: [${hrefObj.href}](${hrefObj.href})`
        output += `\n   - Text: ${hrefObj.text}`
      } else if (hrefObj.src) {
        output += `\n\n - Image src: [${hrefObj.src}](${hrefObj.src})`
      } else {
        output += `\n\n - WORKFLOW ERROR: Flaw has neither 'href' nor 'src'`
      }
      output += `\n   - Flaw: \`${
        hrefObj.flaw.CRITICAL ? hrefObj.flaw.CRITICAL : hrefObj.flaw.WARNING
      }\``
      output += `\n   - On permalinks`
      for (const permalinkHref of hrefObj.permalinkHrefs) {
        output += `\n     - \`${permalinkHref}\``
      }
    }
  }

  if (mentionExternalExclusionList) {
    output +=
      '\n\n---\n\nIf any link reported in this issue is not actually broken ' +
      'and repeatedly shows up on reports, consider making a PR that adds it as an exception to `lib/excluded-links.js`.'
  }

  return `${flawsToDisplay} broken${
    opts.commentLimitToExternalLinks ? ' **external** ' : ' '
  }links found in [this](${opts.actionUrl}) workflow.\n${output}`
}
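
// Example (illustrative) of the Markdown this function produces:
//
//   2 broken **external** links found in [this](https://github.com/...) workflow.
//
//   ### In `content/education/quickstart.md`
//
//    - Href: [https://example.com/dead](https://example.com/dead)
//      - Text: a dead link
//      - Flaw: `Broken external link ({"ok":false,"statusCode":404})`
//      - On permalinks
//        - `/en/education/quickstart`
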
function printGlobalCacheHitRatio(core) {
  const hits = globalCacheHitCount
  const misses = globalCacheMissCount
  // It could be that the files that were tested didn't have a single
  // link in them. In that case, there are no cache misses or hits at all,
  // so avoid the division by zero.
  if (misses + hits) {
    core.debug(
      `Cache hit ratio: ${hits.toLocaleString()} of ${(misses + hits).toLocaleString()} (${(
        (100 * hits) /
        (misses + hits)
      ).toFixed(1)}%)`
    )
  }
}

function getPages(pageList, languages, filters, files, max) {
  return pageList
    .filter((page) => {
      if (languages.length && !languages.includes(page.languageCode)) {
        return false
      }

      if (filters.length && !filters.find((filter) => page.relativePath.includes(filter))) {
        return false
      }

      if (
        files.length &&
        // The reason for checking each file against the `relativePath`
        // or the `fullPath` is to make it flexible for the user.
        !files.find((file) => {
          if (page.relativePath === file) return true
          if (page.fullPath === file) return true
          // The `page.relativePath` will always be *from* the containing
          // directory it came from and might not be relative to the repo
          // root. I.e.
          // `content/education/quickstart.md` is the path relative to
          // the repo root. But the `page.relativePath` will
          // in this case be `education/quickstart.md`.
          // So give it one last chance to relate to the repo root.
          // This is important because you might use `git diff --name-only`
          // to get the list of files to focus specifically on.
          if (path.join(CONTENT_ROOT, page.relativePath) === path.resolve(file)) return true
          return false
        })
      ) {
        return false
      }

      return true
    })
    .slice(0, max ? Math.min(max, pageList.length) : pageList.length)
}
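
// Example (illustrative): all three of these `files` entries would match
// the page whose relativePath is 'education/quickstart.md':
//
//   'education/quickstart.md'            // equals page.relativePath
//   '/checkout/content/education/quickstart.md' // equals page.fullPath (path assumed)
//   'content/education/quickstart.md'    // repo-root relative, e.g. from `git diff --name-only`
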
async function processPage(core, page, pageMap, redirects, opts, db, versions) {
  const { verbose, verboseUrl, bail } = opts
  const allFlawsEach = await Promise.all(
    page.permalinks
      .filter((permalink) => {
        return !versions.length || versions.includes(permalink.pageVersion)
      })
      .map((permalink) => {
        return processPermalink(core, permalink, page, pageMap, redirects, opts, db)
      })
  )

  const allFlaws = allFlawsEach.flat()

  if (allFlaws.length > 0) {
    if (verbose) {
      printFlaws(core, allFlaws, { verboseUrl })
    }

    if (bail) {
      if (!verbose) {
        console.warn('Use --verbose to see the flaws before it exits')
      }
      throw new Error(`Flaw(s) found in ${page.relativePath}`)
    }
  }

  return allFlaws
}

async function processPermalink(core, permalink, page, pageMap, redirects, opts, db) {
  const {
    level = 'critical',
    checkAnchors,
    checkImages,
    checkExternalLinks,
    verbose,
    patient,
    externalServerErrorsAsWarning,
  } = opts
  const html = await renderInnerHTML(page, permalink)
  const $ = cheerio.load(html, { xmlMode: true })
  const flaws = []
  const links = []
  $('a[href]').each((i, link) => {
    links.push(link)
  })
  const newFlaws = await Promise.all(
    links.map(async (link) => {
      const { href } = link.attribs

      // The global cache can't be used for anchor links because they
      // depend on the page they're rendered on
      if (!href.startsWith('#')) {
        if (globalHrefCheckCache.has(href)) {
          globalCacheHitCount++
          return globalHrefCheckCache.get(href)
        }
        globalCacheMissCount++
      }

      const flaw = await checkHrefLink(
        core,
        href,
        $,
        redirects,
        pageMap,
        checkAnchors,
        checkExternalLinks,
        externalServerErrorsAsWarning,
        { verbose, patient },
        db
      )

      if (flaw) {
        if (level === 'critical' && !flaw.CRITICAL) {
          return
        }
        const text = $(link).text()
        if (!href.startsWith('#')) {
          globalHrefCheckCache.set(href, { href, flaw, text })
        }
        return { href, flaw, text }
      } else {
        if (!href.startsWith('#')) {
          globalHrefCheckCache.set(href, flaw)
        }
      }
    })
  )

  for (const flaw of newFlaws) {
    if (flaw) {
      flaws.push(Object.assign(flaw, { page, permalink }))
    }
  }

  if (checkImages) {
    $('img[src]').each((i, img) => {
      let { src } = img.attribs

      // Images get a cache-busting prefix injected in the image
      // E.g. <img src="/assets/cb-123456/foo/bar.png">
      // We need to remove that otherwise we can't look up the image
      // on disk.
      src = src.replace(/\/cb-\d+\//, '/')

      if (globalImageSrcCheckCache.has(src)) {
        globalCacheHitCount++
        return globalImageSrcCheckCache.get(src)
      }

      const flaw = checkImageSrc(src, $)

      globalImageSrcCheckCache.set(src, flaw)

      if (flaw) {
        if (level === 'critical' && !flaw.CRITICAL) {
          return
        }
        flaws.push({ permalink, page, src, flaw })
      }
    })
  }

  return flaws
}

async function uploadJsonFlawsArtifact(
  uploadArtifact,
  flaws,
  { verboseUrl = null } = {},
  artifactName = 'all-rendered-link-flaws.json'
) {
  const printableFlaws = {}
  for (const { page, permalink, href, text, src, flaw } of flaws) {
    const fullPath = prettyFullPath(page.fullPath)

    if (!(fullPath in printableFlaws)) {
      printableFlaws[fullPath] = []
    }
    if (href) {
      printableFlaws[fullPath].push({
        href,
        url: verboseUrl ? new URL(permalink.href, verboseUrl).toString() : permalink.href,
        text,
        flaw,
      })
    } else if (src) {
      printableFlaws[fullPath].push({
        src,
      })
    }
  }
  const message = JSON.stringify(printableFlaws, undefined, 2)
  return uploadArtifact(artifactName, message)
}
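
// Example (illustrative) of the JSON artifact this uploads:
//
//   {
//     "content/education/quickstart.md": [
//       {
//         "href": "/en/some/old/page",
//         "url": "/en/education/quickstart",
//         "text": "some old page",
//         "flaw": { "WARNING": "Redirect to /en/some/new/page" }
//       }
//     ]
//   }
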
function printFlaws(core, flaws, { verboseUrl = null } = {}) {
  let previousPage = null
  let previousPermalink = null

  for (const { page, permalink, href, text, src, flaw } of flaws) {
    const fullPath = prettyFullPath(page.fullPath)
    if (page !== previousPage) {
      core.info(`PAGE: ${chalk.bold(fullPath)}`)
    }
    previousPage = page

    if (href) {
      if (previousPermalink !== permalink.href) {
        if (verboseUrl) {
          core.info(`  URL: ${new URL(permalink.href, verboseUrl).toString()}`)
        } else {
          core.info(`  PERMALINK: ${permalink.href}`)
        }
      }
      previousPermalink = permalink.href

      core.info(`    HREF: ${chalk.bold(href)}`)
      core.info(`    TEXT: ${text}`)
    } else if (src) {
      core.info(`    IMG SRC: ${chalk.bold(src)}`)
    } else {
      throw new Error("Flaw has neither 'href' nor 'src'")
    }

    core.info(`    FLAW: ${flaw.CRITICAL ? chalk.red(flaw.CRITICAL) : chalk.yellow(flaw.WARNING)}`)
  }
}

// Given a full path, change it so it's relative to `cwd()`, so that you
// can take it from the output and paste it after something like `code ...here...`
// The problem with displaying the full path is that it's quite noisy and
// takes up a lot of space. Sure, you can copy and paste it in front of
// `vi` or `ls` or `code`, but if we display it relative to `cwd()` you
// can still paste it to the next command while it takes up much less
// space.
function prettyFullPath(fullPath) {
  return path.relative(process.cwd(), fullPath)
}

const globalHrefCheckCache = new Map()
const globalImageSrcCheckCache = new Map()
let globalCacheHitCount = 0
let globalCacheMissCount = 0

async function checkHrefLink(
  core,
  href,
  $,
  redirects,
  pageMap,
  checkAnchors = false,
  checkExternalLinks = false,
  externalServerErrorsAsWarning = false,
  { verbose = false, patient = false } = {},
  db = null
) {
  if (href === '#') {
    if (checkAnchors) {
      return { WARNING: 'Link is just an empty `#`' }
    }
  } else if (href.startsWith('#')) {
    if (checkAnchors) {
      // You don't need a DOM ID (or <a name="top">) for `<a href="#top">`
      // to work in all modern browsers.
      if (href !== '#top') {
        // If the link is `#foo` it could either match `<element id="foo">`
        // or it could match `<a name="foo">`.
        const countDOMItems = $(href).length + $(`a[name="${href.slice(1)}"]`).length
        if (countDOMItems === 0) {
          return { WARNING: `Anchor on the same page can't be found by ID` }
        } else if (countDOMItems > 1) {
          return { WARNING: `Matches multiple points in the page` }
        }
      }
    }
  } else if (href.startsWith('/')) {
    const pathname = new URL(href, 'http://example.com').pathname

    // Remember, if the Markdown has something like
    //
    //   See [my link](/some/page/)
    //
    // in the post-processing, that will actually become
    //
    //   See <a href="/en/some/page">my link</a>
    //
    // But, if that link was a redirect, it would have been left
    // untouched.
    if (pathname.endsWith('/')) {
      return { WARNING: 'Links with a trailing / will always redirect' }
    } else {
      if (pathname.split('/')[1] in STATIC_PREFIXES) {
        const staticFilePath = path.join(
          STATIC_PREFIXES[pathname.split('/')[1]],
          pathname.split(path.sep).slice(2).join(path.sep)
        )
        if (!fs.existsSync(staticFilePath)) {
          return { CRITICAL: `Static file not found ${staticFilePath} (${pathname})` }
        }
      } else if (getRedirect(pathname, { redirects, pages: pageMap })) {
        return { WARNING: `Redirect to ${getRedirect(pathname, { redirects, pages: pageMap })}` }
      } else if (!pageMap[pathname]) {
        if (deprecatedVersionPrefixesRegex.test(pathname)) {
          return
        }

        return { CRITICAL: 'Broken link' }
      }
    }
  } else if (checkExternalLinks) {
    if (!href.startsWith('https://')) {
      return { WARNING: `Will not check external URLs that are not HTTPS (${href})` }
    }
    if (linksToSkip(href)) {
      return
    }
    const { ok, ...info } = await checkExternalURLCached(core, href, { verbose, patient }, db)
    if (!ok) {
      // By default, a not-OK problem with an external link is CRITICAL,
      // but if it was a `requestError` or the statusCode was >= 500
      // then downgrade it to WARNING.
      let problem = 'CRITICAL'
      if (externalServerErrorsAsWarning) {
        if (
          (info.statusCode && info.statusCode >= 500) ||
          (info.requestError && isTemporaryRequestError(info.requestError))
        ) {
          problem = 'WARNING'
        }
      }
      return { [problem]: `Broken external link (${JSON.stringify(info)})`, isExternal: true }
    }
  }
}
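
// Examples (illustrative; remaining arguments elided) of what checkHrefLink
// resolves to for different hrefs:
//
//   await checkHrefLink(core, '/en/some/page/', ...)   // { WARNING: 'Links with a trailing / will always redirect' }
//   await checkHrefLink(core, '/en/no/such/page', ...) // { CRITICAL: 'Broken link' }
//   await checkHrefLink(core, 'https://example.com/404', ...) // { CRITICAL: 'Broken external link (...)', isExternal: true }
//   await checkHrefLink(core, '/en/valid/page', ...)   // undefined (no flaw)
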
// Return true if the request error is sufficiently temporary. For example,
// a request to `https://exammmmple.org` will fail with `ENOTFOUND` because
// the DNS entry doesn't exist, meaning there's not much hope for it if you
// simply try again later.
// However, an `ETIMEDOUT` means it could work, it just didn't this time;
// it might work if we try again a different hour or day.
function isTemporaryRequestError(requestError) {
  if (typeof requestError === 'string') {
    // See https://betterstack.com/community/guides/scaling-nodejs/nodejs-errors/
    // for a definition of each one.
    const errorEnums = ['ECONNRESET', 'ECONNREFUSED', 'ETIMEDOUT', 'ECONNABORTED']
    return errorEnums.some((enum_) => requestError.includes(enum_))
  }
  return false
}
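
// Examples (illustrative): the input is the error *message* string that
// innerFetch() records from a `got` RequestError:
//
//   isTemporaryRequestError('connect ETIMEDOUT 93.184.216.34:443')  // true
//   isTemporaryRequestError('getaddrinfo ENOTFOUND exammmmple.org') // false
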
// The disk-based caching can't be done inside `checkExternalURL` because
// that function memoizes in-flight Promises (collating concurrent requests
// for the same URL under the same cache key), and a Promise can't be
// serialized to disk.
async function checkExternalURLCached(core, href, { verbose, patient }, db) {
  const cacheMaxAge = EXTERNAL_LINK_CHECKER_MAX_AGE_MS
  const timestamp = new Date().getTime()
  const url = href.split('#')[0]

  if (cacheMaxAge) {
    const tooOld = timestamp - Math.floor(jitter(cacheMaxAge, 10))
    if (db && db.data.urls[url]) {
      if (db.data.urls[url].timestamp > tooOld) {
        if (verbose) {
          core.debug(`External URL ${url} in cache`)
        }
        return db.data.urls[url].result
      } else {
        if (verbose) {
          core.info(`External URL ${url} in cache but too old`)
        }
        // Delete it so the cache file doesn't bloat infinitely
        delete db.data.urls[url]
      }
    }
  }

  const result = await checkExternalURL(core, href, {
    verbose,
    patient,
  })

  if (cacheMaxAge) {
    // By only storing successful results in the cache, we give the system
    // a chance to try 4xx and 5xx errors another go next time.
    if (db && result.ok) {
      db.data.urls[url] = {
        timestamp,
        result,
      }
    }
  }

  return result
}

const _fetchCache = new Map()
async function checkExternalURL(core, url, { verbose = false, patient = false } = {}) {
  if (!url.startsWith('https://')) throw new Error('Invalid URL')
  const cleanURL = url.split('#')[0]
  if (!_fetchCache.has(cleanURL)) {
    _fetchCache.set(cleanURL, innerFetch(core, cleanURL, { verbose, patient }))
  }
  return _fetchCache.get(cleanURL)
}
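
// Example (illustrative): concurrent checks of the same URL, even with
// different fragments, resolve from the same in-flight `innerFetch` Promise,
// so only one HTTP request is made:
//
//   const a = checkExternalURL(core, 'https://example.com/page#one')
//   const b = checkExternalURL(core, 'https://example.com/page#two')
//   await Promise.all([a, b]) // one HEAD request to https://example.com/page
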
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms))

// Global for recording which domains we get rate-limited on.
// For example, if you got rate limited on `something.github.com/foo`
// and we're now asked to fetch `something.github.com/bar`,
// it's good to know not to bother yet.
const _rateLimitedDomains = new Map()

async function innerFetch(core, url, config = {}) {
  const { verbose, useGET, patient } = config

  const { hostname } = new URL(url)
  if (_rateLimitedDomains.has(hostname)) {
    await sleep(_rateLimitedDomains.get(hostname))
  }
  // The way `got` does retries:
  //
  //   sleep = 1000 * Math.pow(2, retry - 1) + Math.random() * 100
  //
  // So, it means:
  //
  //   1. ~1000ms
  //   2. ~2000ms
  //   3. ~4000ms
  //
  // ...if the limit we set is 3.
  // Our own timeout, in ./middleware/timeout.js, defaults to 10 seconds.
  // So there's no point in trying more attempts than 3 because it would
  // just time out on the 10s. (i.e. 1000 + 2000 + 4000 + 8000 > 10,000)
  const retry = {
    limit: patient ? 6 : 2,
  }
  const timeout = { request: patient ? 10000 : 2000 }

  const headers = {
    'User-Agent':
      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36',
  }

  const retries = config.retries || 0
  const httpFunction = useGET ? got.get : got.head

  if (verbose) core.info(`External URL ${useGET ? 'GET' : 'HEAD'}: ${url} (retries: ${retries})`)
  try {
    const r = await httpFunction(url, {
      headers,
      throwHttpErrors: false,
      retry,
      timeout,
    })
    if (verbose) {
      core.info(
        `External URL ${useGET ? 'GET' : 'HEAD'} ${url}: ${r.statusCode} (retries: ${retries})`
      )
    }

    // If we get rate limited, remember that this hostname is now all
    // rate limited. And sleep for the number of seconds that the
    // `retry-after` header indicated.
    if (r.statusCode === 429) {
      let sleepTime = Math.min(
        60_000,
        Math.max(10_000, getRetryAfterSleep(r.headers['retry-after']))
      )
      // Sprinkle a little jitter so it doesn't all start again all
      // at the same time
      sleepTime += Math.random() * 10 * 1000
      // Give it a bit extra when we can be really patient
      if (patient) sleepTime += 30 * 1000

      _rateLimitedDomains.set(hostname, sleepTime + Math.random() * 10 * 1000)
      if (verbose)
        core.info(
          chalk.yellow(
            `Rate limited on ${hostname} (${url}). Sleeping for ${(sleepTime / 1000).toFixed(1)}s`
          )
        )
      await sleep(sleepTime)
      return innerFetch(core, url, Object.assign({}, config, { retries: retries + 1 }))
    } else {
      _rateLimitedDomains.delete(hostname)
    }

    // Perhaps the server doesn't support HEAD requests.
    // If so, try again with a regular GET.
    if ((r.statusCode === 405 || r.statusCode === 404 || r.statusCode === 403) && !useGET) {
      return innerFetch(core, url, Object.assign({}, config, { useGET: true }))
    }
    if (verbose) {
      core.info((r.ok ? chalk.green : chalk.red)(`${r.statusCode} on ${url}`))
    }
    return { ok: r.ok, statusCode: r.statusCode }
  } catch (err) {
    if (err instanceof RequestError) {
      if (verbose) {
        core.info(chalk.yellow(`RequestError (${err.message}) on ${url}`))
      }
      return { ok: false, requestError: err.message }
    }
    throw err
  }
}

// Return number of milliseconds from a `Retry-After` header value
function getRetryAfterSleep(headerValue) {
  if (!headerValue) return 0
  let ms = Math.round(parseFloat(headerValue) * 1000)
  if (isNaN(ms)) {
    ms = Math.max(0, new Date(headerValue) - new Date())
  }
  return ms
}
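
// Examples (illustrative): both forms of `Retry-After` are supported:
//
//   getRetryAfterSleep('120') // 120000 (delay-seconds form)
//   getRetryAfterSleep('Wed, 01 Jan 2025 00:00:10 GMT') // ms until that date, floored at 0
//   getRetryAfterSleep(undefined) // 0
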
function checkImageSrc(src, $) {
  const pathname = new URL(src, 'http://example.com').pathname
  if (!pathname.startsWith('/')) {
    return { WARNING: "External images can't be checked" }
  }
  const prefix = pathname.split('/')[1]
  if (prefix in STATIC_PREFIXES) {
    const staticFilePath = path.join(
      STATIC_PREFIXES[prefix],
      pathname.split(path.sep).slice(2).join(path.sep)
    )
    if (!fs.existsSync(staticFilePath)) {
      return { CRITICAL: `Static file not found (${pathname})` }
    }
  } else {
    return { WARNING: `Unrecognized image src prefix (${prefix})` }
  }
}

function summarizeFlaws(core, flaws) {
  if (flaws.length) {
    core.info(
      chalk.bold(
        `Found ${flaws.length.toLocaleString()} flaw${flaws.length === 1 ? '' : 's'} in total.`
      )
    )
  } else {
    core.info(chalk.green('No flaws found! 💖'))
  }
}

function summarizeCounts(core, pages, tookSeconds) {
  const count = pages.map((page) => page.permalinks.length).reduce((a, b) => a + b, 0)
  core.info(
    `Tested ${count.toLocaleString()} permalinks across ${pages.length.toLocaleString()} pages`
  )
  core.info(`Took ${Math.floor(tookSeconds)} seconds. (~${(tookSeconds / 60).toFixed(1)} minutes)`)
  const permalinksPerSecond = count / tookSeconds
  core.info(`~${permalinksPerSecond.toFixed(1)} permalinks per second.`)
  const pagesPerSecond = pages.length / tookSeconds
  core.info(`~${pagesPerSecond.toFixed(1)} pages per second.`)
}

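// In-place Fisher-Yates shuffle: mutates `array` and also returns it.
// Used from main() (via `opts.random`) to randomize page order for debugging.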
function shuffle(array) {
  let currentIndex = array.length
  let randomIndex

  // While there remain elements to shuffle...
  while (currentIndex !== 0) {
    // Pick a remaining element...
    randomIndex = Math.floor(Math.random() * currentIndex)
    currentIndex--

    // And swap it with the current element.
    ;[array[currentIndex], array[randomIndex]] = [array[randomIndex], array[currentIndex]]
  }

  return array
}

async function renderInnerHTML(page, permalink) {
  const next = () => {}
  const res = {}

  const pagePath = permalink.href
  const req = {
    path: pagePath,
    language: permalink.languageCode,
    pagePath,
    cookies: {},
  }
  // This will create and set `req.context = {...}`
  await contextualize(req, res, next)
  await shortVersions(req, res, next)
  req.context.page = page
  await features(req, res, next)

  req.context.relativePath = page.relativePath

  // These lines do what the ubiquitous `renderContent` function does,
  // but at an absolute minimum to get a string of HTML.
  const markdown = await liquid.parseAndRender(page.markdown, req.context)
  const processor = createMinimalProcessor(req.context)
  const vFile = await processor.process(markdown)
  return vFile.toString()
}

export default main