1
0
mirror of synced 2025-12-22 11:26:57 -05:00
Files
docs/script/rendered-content-link-checker.js
2022-08-29 20:43:12 +00:00

614 lines
18 KiB
JavaScript
Executable File

#!/usr/bin/env node
// [start-readme]
//
// This script goes through all content, renders each page's HTML, and
// from there can analyze it for various flaws (e.g. broken links)
//
// [end-readme]
import fs from 'fs'
import path from 'path'
import cheerio from 'cheerio'
import { program, Option, InvalidArgumentError } from 'commander'
import chalk from 'chalk'
import got from 'got'
import shortVersions from '../middleware/contextualizers/short-versions.js'
import contextualize from '../middleware/context.js'
import { languageKeys } from '../lib/languages.js'
import getRedirect from '../lib/get-redirect.js'
import warmServer from '../lib/warm-server.js'
import renderContent from '../lib/render-content/index.js'
import { deprecated } from '../lib/enterprise-server-releases.js'
import excludedLinks from '../lib/excluded-links.js'
// Map of URL path prefixes (the first path segment of a pathname, e.g.
// the `assets` in `/assets/images/foo.png`) to the directory on disk
// where those static files live. Used by checkHrefLink/checkImageSrc to
// verify that linked static files actually exist.
const STATIC_PREFIXES = {
assets: path.resolve('assets'),
public: path.resolve(path.join('data', 'graphql')),
}
// Sanity check that these are valid paths
// (fails fast at import time, e.g. when run from the wrong working directory)
Object.entries(STATIC_PREFIXES).forEach(([key, value]) => {
if (!fs.existsSync(value)) {
throw new Error(`Can't find static prefix (${key}): ${value}`)
}
})
// Return a function that can as quickly as possible check if a certain
// href input should be skipped.
// Exact-match strings go into a `Set` (O(1) lookup) and only the RegExp
// entries are tested one by one, which keeps the common case fast.
//
// `links` is an array mixing exact-match strings and RegExp objects.
// Bug fix: the call site passes `excludedLinks` as an argument, but the
// function used to ignore it and close over the module global instead.
// Accept it as a parameter (defaulting to the global for compatibility).
function linksToSkipFactory(links = excludedLinks) {
  const set = new Set(links.filter((regexOrURL) => typeof regexOrURL === 'string'))
  const regexes = links.filter((regexOrURL) => regexOrURL instanceof RegExp)
  return (href) => set.has(href) || regexes.some((regex) => regex.test(href))
}
// Predicate: should this external href be skipped? (see ../lib/excluded-links.js)
const linksToSkip = linksToSkipFactory(excludedLinks)
// Absolute path to the repo's `content/` directory; used by getPages()
// to match repo-root-relative file arguments (e.g. `git diff --name-only` output).
const CONTENT_ROOT = path.resolve('content')
// Matches URL paths that point at deprecated Enterprise Server releases,
// e.g. `enterprise-server@2.19/...` or `enterprise/2.19/...`. Broken links
// into these versions are deliberately not flagged (see checkHrefLink).
const deprecatedVersionPrefixesRegex = new RegExp(
`enterprise(-server@|/)(${deprecated.join('|')})(/|$)`
)
// Command-line interface definition (commander).
program
  .description('Analyze all checked content files, render them, and check for flaws.')
  .addOption(
    new Option('-L, --level <LEVEL>', 'Filter of flaw level').choices([
      'all',
      'warning',
      'critical',
    ])
  )
  .addOption(
    new Option('-l, --language <LANGUAGE...>', 'Which languages to focus on').choices(languageKeys)
  )
  .option('--verbose-url <BASE_URL>', 'Print the absolute URL if set')
  .option('-f, --filter <FILTER...>', 'Search filter(s) on the paths')
  .option('-e, --exit', 'Exit script by count of flaws (useful for CI)')
  .option('-b, --bail', 'Exit on the first flaw')
  .option('--check-anchors', "Validate links that start with a '#' too")
  .option('--check-images', 'Validate local images too')
  .option('--check-external-links', 'Check external URLs too')
  .option('-v, --verbose', 'Verbose outputs')
  .option('--debug', "Loud about everything it's doing")
  .option('--random', 'Load pages in a random order (useful for debugging)')
  .option('--max <number>', 'integer argument (default: none)', (value) => {
    // Custom parser: reject anything that isn't an integer.
    const parsed = parseInt(value, 10)
    if (isNaN(parsed)) {
      throw new InvalidArgumentError('Not a number.')
    }
    return parsed
  })
  .option(
    '--list <file>.json',
    'JSON file containing an array of specific files to check (default: none)',
    (filePath) => {
      // Custom parser: resolve the path and verify it is an existing file.
      const resolvedPath = path.resolve(filePath)
      let stats
      try {
        stats = fs.statSync(resolvedPath)
      } catch (error) {
        // Ignore — handled by the `!stats` check below.
      }
      if (!stats || !stats.isFile()) {
        throw new InvalidArgumentError('Not an existing file.')
      }
      return resolvedPath
    }
  )
  .arguments('[files...]', 'Specific files to check')
  .parse(process.argv)

// Kick off the run. `main` is async: without an explicit .catch() a crash
// inside it would only surface as an unhandled promise rejection, so log
// the error and exit non-zero (so CI notices crashes, not just flaws).
main(program.opts(), program.args).catch((error) => {
  console.error(error)
  process.exit(1)
})
/**
 * Main entry point.
 *
 * Resolves the set of pages to check (honoring --language, --filter,
 * --list, positional file arguments, --random and --max), processes them
 * all in parallel, and optionally prints summaries. With --exit, the
 * process exit code is the total number of flaws found (useful for CI).
 *
 * @param {object} opts - parsed commander options
 * @param {string[]} files - positional file arguments (specific files to check)
 */
async function main(opts, files) {
const { random, language, filter, exit, debug, max, verbose, list, checkExternalLinks } = opts
// Note! The reason we're using `warmServer()` in this script,
// even though there's no server involved, is because
// the `contextualize()` function calls it.
// And because warmServer() is actually idempotent, meaning it's
// cheap to call it more than once, it would be expensive to call it
// twice unnecessarily.
// If we'd manually do the same operations that `warmServer()` does
// here (e.g. `loadPageMap()`), we'd end up having to do it all over
// again, the next time `contextualize()` is called.
const { redirects, pages: pageMap, pageList } = await warmServer()
const languages = language || []
console.assert(Array.isArray(languages), `${languages} is not an array`)
const filters = filter || []
console.assert(Array.isArray(filters), `${filters} is not an array`)
// --list and positional file arguments are mutually exclusive.
if (list && Array.isArray(files) && files.length > 0) {
throw new InvalidArgumentError('Cannot specify both --list and a file list.')
}
if (list) {
const fileList = JSON.parse(await fs.promises.readFile(list))
if (Array.isArray(fileList) && fileList.length > 0) {
files = fileList
} else {
// This must be allowed for empty PRs that accompany docs-early-access repo PRs
console.warn('No files found in --list. Exiting...')
process.exit(0)
}
}
if (random) {
shuffle(pageList)
}
debug && console.time('getPages')
const pages = getPages(pageList, languages, filters, files, max)
debug && console.timeEnd('getPages')
// Checking external links makes real HTTP requests, which is slow —
// warn when the page count is large.
if (checkExternalLinks && pages.length >= 100) {
console.warn(
chalk.yellow(
`Warning! Checking external URLs can be time costly. You're testing ${pages.length} pages.`
)
)
}
const processPagesStart = new Date()
// Process every page concurrently; each entry is that page's flaw array.
const flawsGroups = await Promise.all(
pages.map((page) => processPage(page, pageMap, redirects, opts))
)
const processPagesEnd = new Date()
const flaws = flawsGroups.flat()
debug && printGlobalCacheHitRatio()
if (verbose) {
summarizeCounts(pages)
console.log(`Checked ${(globalCacheHitCount + globalCacheMissCount).toLocaleString()} links`)
console.log(`Took ${getDurationString(processPagesStart, processPagesEnd)}`)
summarizeFlaws(flaws)
}
// Exit code equals the flaw count, so any flaw fails CI when --exit is set.
if (exit) {
process.exit(flaws.length)
}
}
// Print how often the global href/image caches were hit vs. missed.
// Skipped entirely when no lookups happened at all (e.g. pages with no
// links), which also avoids dividing by zero.
function printGlobalCacheHitRatio() {
  const hits = globalCacheHitCount
  const misses = globalCacheMissCount
  const total = misses + hits
  if (total) {
    const percentage = ((100 * hits) / total).toFixed(1)
    console.log(
      `Cache hit ratio: ${hits.toLocaleString()} of ${total.toLocaleString()} (${percentage}%)`
    )
  }
}
// Render the elapsed time between two Date objects as a human-readable
// string: minutes (one decimal) when more than a minute, else seconds.
function getDurationString(date1, date2) {
  const elapsedSeconds = (date2.getTime() - date1.getTime()) / 1000
  const elapsedMinutes = elapsedSeconds / 60
  return elapsedMinutes > 1
    ? `${elapsedMinutes.toFixed(1)} minutes`
    : `${elapsedSeconds.toFixed(1)} seconds`
}
// Narrow the full page list down to the pages that should be processed,
// then cap the result at `max` entries (when `max` is set).
// - `languages`: keep only pages whose languageCode is listed (empty = all)
// - `filters`: keep pages whose relativePath contains any filter substring
// - `files`: keep pages matching any given file path (see matchesFile)
function getPages(pageList, languages, filters, files, max) {
  // A file argument may be the page's relativePath, its fullPath, or a
  // path relative to the repo root. `page.relativePath` is relative to
  // its *containing* directory: the repo-root path
  // `content/education/quickstart.md` has a relativePath of just
  // `education/quickstart.md`. Accepting the repo-root form matters
  // because you might feed in `git diff --name-only` output.
  const matchesFile = (page, file) => {
    if (page.relativePath === file) return true
    if (page.fullPath === file) return true
    if (path.join(CONTENT_ROOT, page.relativePath) === path.resolve(file)) return true
    return false
  }
  const keep = (page) => {
    if (languages.length && !languages.includes(page.languageCode)) return false
    if (filters.length && !filters.find((filter) => page.relativePath.includes(filter))) {
      return false
    }
    if (files.length && !files.find((file) => matchesFile(page, file))) return false
    return true
  }
  const limit = max ? Math.min(max, pageList.length) : pageList.length
  return pageList.filter(keep).slice(0, limit)
}
// Render every permalink of one page (concurrently) and collect all the
// flaws found. With --bail, print and exit(1) as soon as any page yields
// a flaw; otherwise print this page's flaws and return them to main().
async function processPage(page, pageMap, redirects, opts) {
  const { bail, verboseUrl } = opts
  const perPermalink = await Promise.all(
    page.permalinks.map((permalink) =>
      processPermalink(permalink, page, pageMap, redirects, opts)
    )
  )
  const pageFlaws = perPermalink.flat()
  if (bail && pageFlaws.length > 0) {
    printFlaws(pageFlaws, verboseUrl)
    process.exit(1)
  }
  printFlaws(pageFlaws, verboseUrl)
  return pageFlaws
}
/**
 * Render one permalink of a page and check every link in the rendered
 * HTML (and every image when --check-images is set).
 *
 * Returns an array of flaw objects, each tagged with the `page` and
 * `permalink` it was found on.
 */
async function processPermalink(permalink, page, pageMap, redirects, opts) {
const { level, checkAnchors, checkImages, checkExternalLinks } = opts
const html = await renderInnerHTML(page, permalink)
const $ = cheerio.load(html)
const flaws = []
const links = []
$('a[href]').each((i, link) => {
links.push(link)
})
// Check all links concurrently; each entry is a flaw object or undefined.
const newFlaws = await Promise.all(
links.map(async (link) => {
const { href } = link.attribs
// The global cache can't be used for anchor links because they
// depend on each page it renders
if (!href.startsWith('#')) {
if (globalHrefCheckCache.has(href)) {
globalCacheHitCount++
return globalHrefCheckCache.get(href)
}
globalCacheMissCount++
}
const flaw = await checkHrefLink(
href,
$,
redirects,
pageMap,
checkAnchors,
checkExternalLinks
)
if (flaw) {
// Respect --level critical: drop non-critical flaws entirely.
// NOTE(review): flaws dropped here are not cached either, so the
// same href gets re-checked on every page — confirm that's intended.
if (level === 'critical' && !flaw.CRITICAL) {
return
}
const text = $(link).text()
if (!href.startsWith('#')) {
// NOTE(review): the cached entry includes this page's link `text`,
// so a later cache hit on another page reports the first page's
// text — confirm that's acceptable.
globalHrefCheckCache.set(href, { href, flaw, text })
}
return { href, flaw, text }
} else {
if (!href.startsWith('#')) {
// Cache the "no flaw" outcome too (`flaw` is undefined here), so
// the cache-hit path above short-circuits to undefined.
globalHrefCheckCache.set(href, flaw)
}
}
})
)
// Tag each found flaw with the page/permalink it came from.
for (const flaw of newFlaws) {
if (flaw) {
flaws.push(Object.assign(flaw, { page, permalink }))
}
}
if (checkImages) {
$('img[src]').each((i, img) => {
let { src } = img.attribs
// Images get a cache-busting prefix injected in the image
// E.g. <img src="/assets/cb-123456/foo/bar.png">
// We need to remove that otherwise we can't look up the image
// on disk.
src = src.replace(/\/cb-\d+\//, '/')
if (globalImageSrcCheckCache.has(src)) {
globalCacheHitCount++
// NOTE(review): inside .each() this return value is discarded, so a
// cached image flaw is never re-added to `flaws` on later pages —
// confirm that's intended.
return globalImageSrcCheckCache.get(src)
}
const flaw = checkImageSrc(src, $)
globalImageSrcCheckCache.set(src, flaw)
if (flaw) {
if (level === 'critical' && !flaw.CRITICAL) {
return
}
flaws.push({ permalink, page, src, flaw })
}
})
}
return flaws
}
// Pretty-print a list of flaw objects, grouped by page and permalink.
// Each flaw carries a `page`, `permalink`, `flaw`, and either an
// `href` + `text` pair (link flaws) or a `src` (image flaws).
// When `verboseUrl` is set, permalinks are printed as absolute URLs.
function printFlaws(flaws, verboseUrl = null) {
  let lastPage = null
  let lastPermalinkHref = null
  for (const { page, permalink, href, text, src, flaw } of flaws) {
    // Only print the page header when it changes.
    if (page !== lastPage) {
      console.log(`PAGE: ${chalk.bold(prettyFullPath(page.fullPath))}`)
    }
    lastPage = page
    if (href) {
      // Only print the permalink line when it changes.
      if (lastPermalinkHref !== permalink.href) {
        const location = verboseUrl
          ? ` URL: ${new URL(permalink.href, verboseUrl).toString()}`
          : ` PERMALINK: ${permalink.href}`
        console.log(location)
      }
      lastPermalinkHref = permalink.href
      console.log(` HREF: ${chalk.bold(href)}`)
      console.log(` TEXT: ${text}`)
    } else if (src) {
      console.log(` IMG SRC: ${chalk.bold(src)}`)
    } else {
      throw new Error("Flaw has neither 'href' nor 'src'")
    }
    const severity = flaw.CRITICAL ? chalk.red(flaw.CRITICAL) : chalk.yellow(flaw.WARNING)
    console.log(` FLAW: ${severity}`)
    console.log('')
  }
}
// Turn an absolute path into one relative to `cwd()`.
// The absolute form is noisy and wide; the relative form can still be
// pasted straight after commands like `code`, `vi` or `ls`, but takes up
// far less space in the output.
function prettyFullPath(fullPath) {
  const cwd = process.cwd()
  return path.relative(cwd, fullPath)
}
// Global caches shared across every page processed in this run.
// `globalHrefCheckCache` maps a non-anchor href to its check result
// (a flaw record, or undefined when the link was fine); see processPermalink.
const globalHrefCheckCache = new Map()
// Same idea for image `src` values (after cache-busting-prefix removal).
const globalImageSrcCheckCache = new Map()
// Hit/miss counters for both caches, reported by printGlobalCacheHitRatio().
let globalCacheHitCount = 0
let globalCacheMissCount = 0
// Check a single link `href` and return a flaw object
// ({ CRITICAL: ... } or { WARNING: ... }) when something is wrong, or
// undefined when the link is fine (or deliberately skipped).
//
// - `$`: cheerio instance of the rendered page (only used for anchors)
// - `redirects` / `pageMap`: from warmServer(), resolve internal links
// - `checkAnchors`: also validate `#fragment` links against the page DOM
// - `checkExternalLinks`: also issue HTTP requests for https:// links
async function checkHrefLink(
  href,
  $,
  redirects,
  pageMap,
  checkAnchors = false,
  checkExternalLinks = false
) {
  if (href === '#') {
    if (checkAnchors) {
      return { WARNING: 'Link is just an empty `#`' }
    }
  } else if (href.startsWith('#')) {
    if (checkAnchors) {
      const countDOMItems = $(href).length
      if (countDOMItems !== 1) {
        // Bug fix: the message used to claim "Anchor is an empty string",
        // which is not what this condition checks.
        return { WARNING: `Anchor matches ${countDOMItems} elements, expected exactly 1` }
      }
    }
  } else if (href.startsWith('/')) {
    const pathname = new URL(href, 'http://example.com').pathname
    // Remember, if the Markdown has something like
    //
    //   See [my link][/some/page/]
    //
    // In the post-processing, that will actually become
    //
    //   See <a href="/en/some/page">my link</a>
    //
    // But, if that link was a redirect, that would have been left
    // untouched.
    if (pathname.endsWith('/')) {
      return { WARNING: 'Links with a trailing / will always redirect' }
    }
    const prefix = pathname.split('/')[1]
    if (prefix in STATIC_PREFIXES) {
      // Bug fix: URL pathnames always use '/', never the OS-specific
      // `path.sep`, so splitting on `path.sep` broke this on Windows.
      const staticFilePath = path.join(
        STATIC_PREFIXES[prefix],
        pathname.split('/').slice(2).join(path.sep)
      )
      if (!fs.existsSync(staticFilePath)) {
        return { CRITICAL: `Static file not found ${staticFilePath} (${pathname})` }
      }
    } else {
      // Compute the redirect once (it used to be called twice: once for
      // the test and again for the message).
      const redirect = getRedirect(pathname, { redirects, pages: pageMap })
      if (redirect) {
        return { WARNING: `Redirect to ${redirect}` }
      }
      if (!pageMap[pathname]) {
        // Links into deprecated Enterprise Server versions are expected
        // to be dead; don't flag them.
        if (deprecatedVersionPrefixesRegex.test(pathname)) {
          return
        }
        return { CRITICAL: 'Broken link' }
      }
    }
  } else if (checkExternalLinks) {
    if (!href.startsWith('https://')) {
      return { WARNING: `Will not check external URLs that are not HTTPS (${href})` }
    }
    if (linksToSkip(href)) {
      return
    }
    let failed = false
    try {
      failed = await checkExternalURL(href)
    } catch (err) {
      return { WARNING: `Got error when testing ${href}: ${err.toString()}` }
    }
    if (failed) {
      return { CRITICAL: 'Broken external link ' }
    }
  }
}
// Cache of external-URL check results: url -> `failed` boolean
// (true means the check failed, matching checkExternalURL's return value).
const externalResponseCache = new Map()
// URLs with a request currently in flight, so that concurrent checks of
// the same URL wait for the result instead of requesting it again.
const externalResponseWaiting = new Set()
// Promise-based sleep helper.
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms))
// Make a real HTTP request for `url` (https only) and return true when
// the check FAILED (non-200 response), false when it succeeded.
// Results are cached in `externalResponseCache` so each distinct URL is
// requested at most once per run. Because the whole script runs pages
// through `Promise.all()`, the same URL can be checked concurrently;
// later callers sleep-and-retry until the first request populates the
// cache.
async function checkExternalURL(url) {
  if (!url.startsWith('https://')) throw new Error('Invalid URL')
  if (externalResponseCache.has(url)) {
    return externalResponseCache.get(url)
  }
  if (externalResponseWaiting.has(url)) {
    // Another concurrent check of this URL is in flight. Sleep a little
    // (with jitter) and start over; by then the cache is hopefully
    // populated.
    await sleep(500 + Math.random() * 100)
    return await checkExternalURL(url)
  }
  externalResponseWaiting.add(url)
  try {
    // The way `got` does retries:
    //
    //   sleep = 1000 * Math.pow(2, retry - 1) + Math.random() * 100
    //
    // So, it means:
    //
    //   1. ~1000ms
    //   2. ~2000ms
    //   3. ~4000ms
    //
    // ...if the limit we set is 3.
    // Our own timeout, in ./middleware/timeout.js defaults to 10 seconds.
    // So there's no point in trying more attempts than 3 because it would
    // just timeout on the 10s. (i.e. 1000 + 2000 + 4000 + 8000 > 10,000)
    const retry = {
      limit: 3,
    }
    const timeout = 2000
    const r = await got(url, {
      throwHttpErrors: false,
      retry,
      timeout,
    })
    const failed = r.statusCode !== 200
    externalResponseCache.set(url, failed)
    return failed
  } finally {
    // Bug fix: if got() threw (e.g. a network timeout), the URL was
    // previously never removed from `externalResponseWaiting`, so every
    // other concurrent check of the same URL would sleep-and-retry
    // forever. Always clear the in-flight marker.
    externalResponseWaiting.delete(url)
  }
}
// Check an image `src` value and return a flaw object when something is
// wrong, or undefined when the image exists on disk.
// Only local images (pathname starting with '/') under a known static
// prefix can be verified; anything else yields a WARNING.
function checkImageSrc(src, $) {
  const pathname = new URL(src, 'http://example.com').pathname
  if (!pathname.startsWith('/')) {
    // Bug fix: the message used to read "can't not be checked".
    return { WARNING: "External images can't be checked" }
  }
  const prefix = pathname.split('/')[1]
  if (!(prefix in STATIC_PREFIXES)) {
    return { WARNING: `Unrecognized image src prefix (${prefix})` }
  }
  // Bug fix: URL pathnames always use '/', never the OS-specific
  // `path.sep`, so splitting on `path.sep` broke this on Windows.
  const staticFilePath = path.join(
    STATIC_PREFIXES[prefix],
    pathname.split('/').slice(2).join(path.sep)
  )
  if (!fs.existsSync(staticFilePath)) {
    return { CRITICAL: `Static file not found (${pathname})` }
  }
}
// Print a one-line summary of the total number of flaws found.
function summarizeFlaws(flaws) {
  const count = flaws.length
  if (count === 0) {
    console.log(chalk.green('No flaws found! 💖'))
    return
  }
  const noun = count === 1 ? 'flaw' : 'flaws'
  console.log(chalk.bold(`Found ${count.toLocaleString()} ${noun} in total.`))
}
// Print how many permalinks were tested across how many pages.
function summarizeCounts(pages) {
  let count = 0
  for (const page of pages) {
    count += page.permalinks.length
  }
  console.log(
    `Tested ${count.toLocaleString()} permalinks across ${pages.length.toLocaleString()} pages`
  )
}
// In-place Fisher-Yates shuffle. Mutates `array` and returns the same
// array reference for convenience.
function shuffle(array) {
  let remaining = array.length
  // Walk backwards, swapping each position with a random earlier one.
  while (remaining !== 0) {
    const pick = Math.floor(Math.random() * remaining)
    remaining -= 1
    const tmp = array[remaining]
    array[remaining] = array[pick]
    array[pick] = tmp
  }
  return array
}
// Render a page's Markdown to HTML for one specific permalink.
// Builds a minimal fake Express req/res pair so that the regular
// middleware (contextualize, shortVersions) can populate the rendering
// context, then renders the page's markdown with that context.
async function renderInnerHTML(page, permalink) {
  const noop = () => {}
  const fakeResponse = {}
  const pagePath = permalink.href
  const fakeRequest = {
    path: pagePath,
    language: permalink.languageCode,
    pagePath,
    cookies: {},
  }
  await contextualize(fakeRequest, fakeResponse, noop)
  await shortVersions(fakeRequest, fakeResponse, noop)
  const context = { ...fakeRequest.context, page, relativePath: page.relativePath }
  return await renderContent(page.markdown, context)
}
// Deliberately commented out. Kept temporarily in case it's better.
// async function renderPage(page, permalink) {
// const next = () => {}
// const res = {}
// const pagePath = permalink.href
// const req = {
// path: pagePath,
// language: permalink.languageCode,
// pagePath,
// cookies: {},
// }
// await contextualize(req, res, next)
// const context = Object.assign({}, req.context, { page })
// return await page._render(context)
// }