#!/usr/bin/env node

// [start-readme]
//
// This script goes through all content, renders its HTML, and from there
// can analyze for various flaws (e.g. broken links)
//
// [end-readme]

import fs from 'fs'
import path from 'path'

import cheerio from 'cheerio'
import { program, Option, InvalidArgumentError } from 'commander'
import chalk from 'chalk'
import got, { RequestError } from 'got'

import shortVersions from '../middleware/contextualizers/short-versions.js'
import contextualize from '../middleware/context.js'
import { languageKeys } from '../lib/languages.js'
import getRedirect from '../lib/get-redirect.js'
import warmServer from '../lib/warm-server.js'
import renderContent from '../lib/render-content/index.js'
import { deprecated } from '../lib/enterprise-server-releases.js'
import excludedLinks from '../lib/excluded-links.js'

const STATIC_PREFIXES = {
  assets: path.resolve('assets'),
  public: path.resolve(path.join('data', 'graphql')),
}
// Sanity check that these are valid paths
Object.entries(STATIC_PREFIXES).forEach(([key, value]) => {
  if (!fs.existsSync(value)) {
    throw new Error(`Can't find static prefix (${key}): ${value}`)
  }
})

// Return a function that can, as quickly as possible, check if a certain
// href input should be skipped.
// Do this so we can use a `Set` for the exact-match strings and only
// fall back to `Array.prototype.some()` over the regexes when necessary.
function linksToSkipFactory(excludedLinks) {
  const set = new Set(excludedLinks.filter((regexOrURL) => typeof regexOrURL === 'string'))
  const regexes = excludedLinks.filter((regexOrURL) => regexOrURL instanceof RegExp)
  return (href) => set.has(href) || regexes.some((regex) => regex.test(href))
}

const linksToSkip = linksToSkipFactory(excludedLinks)

const CONTENT_ROOT = path.resolve('content')

const deprecatedVersionPrefixesRegex = new RegExp(
  `enterprise(-server@|/)(${deprecated.join('|')})(/|$)`
)

program
  .description('Analyze all checked content files, render them, and check for flaws.')
  .addOption(
    new Option('-L, --level <level>', 'Filter of flaw level').choices([
      'all',
      'warning',
      'critical',
    ])
  )
  .addOption(
    new Option('-l, --language <language...>', 'Which languages to focus on').choices(languageKeys)
  )
  .option('--verbose-url <url>', 'Print the absolute URL if set')
  .option('-f, --filter <filter...>', 'Search filter(s) on the paths')
  .option('-e, --exit', 'Exit script by count of flaws (useful for CI)')
  .option('-b, --bail', 'Exit on the first flaw')
  .option('--check-anchors', "Validate links that start with a '#' too")
  .option('--check-images', 'Validate local images too')
  .option('--check-external-links', 'Check external URLs too')
  .option('-v, --verbose', 'Verbose outputs')
  .option('--debug', "Loud about everything it's doing")
  .option('--random', 'Load pages in a random order (useful for debugging)')
  .option('--patient', 'Give external link checking longer timeouts and more retries')
  .option('-o, --out <file>', 'Put warnings and errors into a file instead of stdout')
  .option('--json-output', 'Print JSON to stdout or file instead')
  .option('--max <number>', 'integer argument (default: none)', (value) => {
    const parsed = parseInt(value, 10)
    if (isNaN(parsed)) {
      throw new InvalidArgumentError('Not a number.')
    }
    return parsed
  })
  .option(
    '--list <file>.json',
    'JSON file containing an array of specific files to check (default: none)',
    (filePath) => {
      const resolvedPath = path.resolve(filePath)
      let stats
      try {
        stats = fs.statSync(resolvedPath)
      } catch (error) {
        // Ignore
      }
      if (!stats || !stats.isFile()) {
        throw new InvalidArgumentError('Not an existing file.')
      }
      return resolvedPath
    }
  )
  .arguments('[files...]', 'Specific files to check')
  .parse(process.argv)
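// Example invocations (illustrative; `<this-script>` stands for whatever
// path this file lives at in your checkout, and the content path is
// hypothetical):
//
//   node <this-script> --language en --max 100
//   node <this-script> --bail --check-external-links --patient
//   node <this-script> content/education/quickstart.md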
main(program.opts(), program.args)

async function main(opts, files) {
  const {
    random,
    language,
    filter,
    exit,
    debug,
    max,
    verbose,
    list,
    checkExternalLinks,
    jsonOutput,
    out,
  } = opts

  // Note! The reason we're using `warmServer()` in this script,
  // even though there's no server involved, is because
  // the `contextualize()` function calls it.
  // And because `warmServer()` is idempotent, it's cheap to call it
  // more than once.
  // If we'd manually do the same operations that `warmServer()` does
  // here (e.g. `loadPageMap()`), we'd end up having to do it all over
  // again the next time `contextualize()` is called.
  const { redirects, pages: pageMap, pageList } = await warmServer()

  const languages = language || []
  console.assert(Array.isArray(languages), `${languages} is not an array`)
  const filters = filter || []
  console.assert(Array.isArray(filters), `${filters} is not an array`)

  if (list && Array.isArray(files) && files.length > 0) {
    throw new InvalidArgumentError('Cannot specify both --list and a file list.')
  }

  if (list) {
    const fileList = JSON.parse(await fs.promises.readFile(list))
    if (Array.isArray(fileList) && fileList.length > 0) {
      files = fileList
    } else {
      // This must be allowed for empty PRs that accompany docs-early-access repo PRs
      console.warn('No files found in --list. Exiting...')
      process.exit(0)
    }
  }

  if (random) {
    shuffle(pageList)
  }

  debug && console.time('getPages')
  const pages = getPages(pageList, languages, filters, files, max)
  debug && console.timeEnd('getPages')

  if (checkExternalLinks && pages.length >= 100) {
    console.warn(
      chalk.yellow(
        `Warning! Checking external URLs can be time costly. You're testing ${pages.length} pages.`
      )
    )
  }

  const processPagesStart = new Date()
  const flawsGroups = await Promise.all(
    pages.map((page) => processPage(page, pageMap, redirects, opts))
  )
  const processPagesEnd = new Date()
  const flaws = flawsGroups.flat()

  if (jsonOutput) {
    jsonPrintFlaws(flaws, opts)
  }

  debug && printGlobalCacheHitRatio()

  if (verbose) {
    summarizeCounts(pages)

    console.log(`Checked ${(globalCacheHitCount + globalCacheMissCount).toLocaleString()} links`)
    console.log(`Took ${getDurationString(processPagesStart, processPagesEnd)}`)

    summarizeFlaws(flaws)
    if (out && flaws.length > 0) {
      console.log(`All flaws written to ${chalk.bold(out)}`)
    }
  }

  if (exit) {
    process.exit(flaws.length)
  }
}

function printGlobalCacheHitRatio() {
  const hits = globalCacheHitCount
  const misses = globalCacheMissCount
  // It could be that the files that were tested didn't have a single
  // link in them. In that case, there are no cache misses or hits at all.
  // So avoid division by zero.
  if (misses + hits) {
    console.log(
      `Cache hit ratio: ${hits.toLocaleString()} of ${(misses + hits).toLocaleString()} (${(
        (100 * hits) /
        (misses + hits)
      ).toFixed(1)}%)`
    )
  }
}

function getDurationString(date1, date2) {
  const seconds = (date2.getTime() - date1.getTime()) / 1000
  const minutes = seconds / 60
  if (minutes > 1) {
    return `${minutes.toFixed(1)} minutes`
  }
  return `${seconds.toFixed(1)} seconds`
}
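// Illustrative example (hypothetical values): with
//
//   languages = ['en'], filters = ['quickstart'], files = [], max = 10
//
// getPages() keeps only English pages whose `relativePath` contains
// "quickstart", then truncates the result to at most 10 pages.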
function getPages(pageList, languages, filters, files, max) {
  return pageList
    .filter((page) => {
      if (languages.length && !languages.includes(page.languageCode)) {
        return false
      }

      if (filters.length && !filters.find((filter) => page.relativePath.includes(filter))) {
        return false
      }

      if (
        files.length &&
        // The reason for checking each file against the `relativePath`
        // or the `fullPath` is to make it flexible for the user.
        !files.find((file) => {
          if (page.relativePath === file) return true
          if (page.fullPath === file) return true
          // The `page.relativePath` will always be relative to the
          // containing directory it came from and might not be relative
          // to the repo root. I.e.
          // `content/education/quickstart.md` is the path relative to
          // the repo root. But the `page.relativePath` will
          // in this case be `education/quickstart.md`.
          // So give it one last chance to relate to the repo root.
          // This is important because you might use `git diff --name-only`
          // to get the list of files to focus specifically on.
          if (path.join(CONTENT_ROOT, page.relativePath) === path.resolve(file)) return true

          return false
        })
      ) {
        return false
      }

      return true
    })
    .slice(0, max ? Math.min(max, pageList.length) : pageList.length)
}

async function processPage(page, pageMap, redirects, opts) {
  const { bail, verboseUrl, jsonOutput, out } = opts

  const allFlawsEach = await Promise.all(
    page.permalinks.map((permalink) => processPermalink(permalink, page, pageMap, redirects, opts))
  )

  const allFlaws = allFlawsEach.flat()

  if (bail && allFlaws.length > 0) {
    if (jsonOutput) {
      jsonPrintFlaws(allFlaws, opts)
    } else {
      printFlaws(allFlaws, { verboseUrl, out })
    }
    process.exit(1)
  }

  if (!jsonOutput) {
    printFlaws(allFlaws, { verboseUrl, out })
  }

  return allFlaws
}

async function processPermalink(permalink, page, pageMap, redirects, opts) {
  const { level, checkAnchors, checkImages, checkExternalLinks, verbose, patient } = opts
  const html = await renderInnerHTML(page, permalink)
  const $ = cheerio.load(html)
  const flaws = []
  const links = []
  $('a[href]').each((i, link) => {
    links.push(link)
  })
  const newFlaws = await Promise.all(
    links.map(async (link) => {
      const { href } = link.attribs

      // The global cache can't be used for anchor links because they
      // depend on the page they're rendered in.
      if (!href.startsWith('#')) {
        if (globalHrefCheckCache.has(href)) {
          globalCacheHitCount++
          return globalHrefCheckCache.get(href)
        }
        globalCacheMissCount++
      }

      const flaw = await checkHrefLink(
        href,
        $,
        redirects,
        pageMap,
        checkAnchors,
        checkExternalLinks,
        { verbose, patient }
      )

      if (flaw) {
        if (level === 'critical' && !flaw.CRITICAL) {
          return
        }
        const text = $(link).text()
        if (!href.startsWith('#')) {
          globalHrefCheckCache.set(href, { href, flaw, text })
        }
        return { href, flaw, text }
      } else {
        if (!href.startsWith('#')) {
          globalHrefCheckCache.set(href, flaw)
        }
      }
    })
  )
  for (const flaw of newFlaws) {
    if (flaw) {
      flaws.push(Object.assign(flaw, { page, permalink }))
    }
  }

  if (checkImages) {
    $('img[src]').each((i, img) => {
      let { src } = img.attribs

      // Images get a cache-busting prefix injected into the src,
      // e.g. <img src="/assets/cb-123456/images/foo.png">.
      // We need to remove that otherwise we can't look up the image
      // on disk.
      src = src.replace(/\/cb-\d+\//, '/')

      if (globalImageSrcCheckCache.has(src)) {
        globalCacheHitCount++
        // A cached flaw still needs to be reported on this page too.
        const cachedFlaw = globalImageSrcCheckCache.get(src)
        if (cachedFlaw && !(level === 'critical' && !cachedFlaw.CRITICAL)) {
          flaws.push({ permalink, page, src, flaw: cachedFlaw })
        }
        return
      }

      const flaw = checkImageSrc(src, $)

      globalImageSrcCheckCache.set(src, flaw)

      if (flaw) {
        if (level === 'critical' && !flaw.CRITICAL) {
          return
        }
        flaws.push({ permalink, page, src, flaw })
      }
    })
  }

  return flaws
}
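// Each flaw collected above is a plain object. Illustrative shape
// (the values are hypothetical):
//
//   {
//     page,                          // the Page object it was found on
//     permalink,                     // the Permalink that was rendered
//     href: '/en/some/broken/page',  // or `src` for image flaws
//     text: 'the link text',
//     flaw: { CRITICAL: 'Broken link' },  // or { WARNING: '...' }
//   }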
function jsonPrintFlaws(flaws, { verboseUrl = null, out = null } = {}) {
  const printableFlaws = {}
  for (const { page, permalink, href, text, src, flaw } of flaws) {
    const fullPath = prettyFullPath(page.fullPath)

    if (!(fullPath in printableFlaws)) {
      printableFlaws[fullPath] = []
    }

    if (href) {
      printableFlaws[fullPath].push({
        href,
        url: verboseUrl ? new URL(permalink.href, verboseUrl).toString() : permalink.href,
        text,
        flaw,
      })
    } else if (src) {
      // Include the flaw so the JSON output is actionable for images too
      printableFlaws[fullPath].push({
        src,
        flaw,
      })
    }
  }
  const message = JSON.stringify(printableFlaws, undefined, 2)
  if (out) {
    fs.writeFileSync(out, message + '\n', 'utf-8')
  } else {
    console.log(message)
  }
}

function printFlaws(flaws, { verboseUrl = null, out = null } = {}) {
  let previousPage = null
  let previousPermalink = null

  function fout(msg) {
    if (out) {
      fs.appendFileSync(out, `${msg}\n`, 'utf-8')
    } else {
      console.log(msg)
    }
  }

  for (const { page, permalink, href, text, src, flaw } of flaws) {
    const fullPath = prettyFullPath(page.fullPath)

    if (page !== previousPage) {
      if (out) {
        fout(`PAGE: ${fullPath}`)
      } else {
        console.log(`PAGE: ${chalk.bold(fullPath)}`)
      }
    }
    previousPage = page

    if (href) {
      if (previousPermalink !== permalink.href) {
        if (verboseUrl) {
          fout(`  URL: ${new URL(permalink.href, verboseUrl).toString()}`)
        } else {
          fout(`  PERMALINK: ${permalink.href}`)
        }
      }
      previousPermalink = permalink.href

      if (out) {
        fout(`    HREF: ${href}`)
      } else {
        console.log(`    HREF: ${chalk.bold(href)}`)
      }
      fout(`    TEXT: ${text}`)
    } else if (src) {
      if (out) {
        fout(`  IMG SRC: ${src}`)
      } else {
        console.log(`  IMG SRC: ${chalk.bold(src)}`)
      }
    } else {
      throw new Error("Flaw has neither 'href' nor 'src'")
    }

    if (out) {
      fout(`    FLAW: ${flaw.CRITICAL ? flaw.CRITICAL : flaw.WARNING}`)
    } else {
      console.log(
        `    FLAW: ${flaw.CRITICAL ? chalk.red(flaw.CRITICAL) : chalk.yellow(flaw.WARNING)}`
      )
    }
    fout('')
  }
}

// Given a full path, change it so it's relative to `cwd()`, so that you
// can take it from the output and paste it after something like `code ...here...`.
// The problem with displaying the full path is that it's quite noisy and
// takes up a lot of space. Sure, you can copy and paste it in front of
// `vi` or `ls` or `code`, but if we display it relative to `cwd()` you
// can still paste it to the next command without it taking up so much
// space.
function prettyFullPath(fullPath) {
  return path.relative(process.cwd(), fullPath)
}

const globalHrefCheckCache = new Map()
const globalImageSrcCheckCache = new Map()
let globalCacheHitCount = 0
let globalCacheMissCount = 0
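// checkHrefLink() returns `undefined` when a link is considered fine,
// or an object keyed by severity, e.g. (illustrative values):
//
//   { WARNING: 'Redirect to /en/some/other/page' }
//   { CRITICAL: 'Broken link' }
//
// Callers filter these by severity via the --level option.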
async function checkHrefLink(
  href,
  $,
  redirects,
  pageMap,
  checkAnchors = false,
  checkExternalLinks = false,
  { verbose = false, patient = false } = {}
) {
  if (href === '#') {
    if (checkAnchors) {
      return { WARNING: 'Link is just an empty `#`' }
    }
  } else if (href.startsWith('#')) {
    if (checkAnchors) {
      const countDOMItems = $(href).length
      if (countDOMItems !== 1) {
        return { WARNING: `Anchor not found in the page (${href})` }
      }
    }
  } else if (href.startsWith('/')) {
    const pathname = new URL(href, 'http://example.com').pathname

    // Remember, if the Markdown has something like
    //
    //   See [my link](/some/page/)
    //
    // the post-processing will actually turn that into
    //
    //   See <a href="/en/some/page/">my link</a>
    //
    // But, if that link was a redirect, it would have been left
    // untouched.
    if (pathname.endsWith('/')) {
      return { WARNING: 'Links with a trailing / will always redirect' }
    } else {
      if (pathname.split('/')[1] in STATIC_PREFIXES) {
        const staticFilePath = path.join(
          STATIC_PREFIXES[pathname.split('/')[1]],
          pathname.split(path.sep).slice(2).join(path.sep)
        )
        if (!fs.existsSync(staticFilePath)) {
          return { CRITICAL: `Static file not found ${staticFilePath} (${pathname})` }
        }
      } else if (getRedirect(pathname, { redirects, pages: pageMap })) {
        return { WARNING: `Redirect to ${getRedirect(pathname, { redirects, pages: pageMap })}` }
      } else if (!pageMap[pathname]) {
        if (deprecatedVersionPrefixesRegex.test(pathname)) {
          return
        }
        return { CRITICAL: 'Broken link' }
      }
    }
  } else if (checkExternalLinks) {
    if (!href.startsWith('https://')) {
      return { WARNING: `Will not check external URLs that are not HTTPS (${href})` }
    }
    if (linksToSkip(href)) {
      return
    }
    const { ok, ...info } = await checkExternalURL(href, { verbose, patient })
    if (!ok) {
      return { CRITICAL: `Broken external link (${JSON.stringify(info)})` }
    }
  }
}

// Cache the promise (not the resolved value) so that concurrent checks
// of the same URL share a single fetch.
const _fetchCache = new Map()
async function checkExternalURL(url, { verbose = false, patient = false } = {}) {
  if (!url.startsWith('https://')) throw new Error('Invalid URL')
  const cleanURL = url.split('#')[0]
  if (!_fetchCache.has(cleanURL)) {
    _fetchCache.set(cleanURL, innerFetch(cleanURL, { verbose, patient }))
  }
  return _fetchCache.get(cleanURL)
}

const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms))

// Global for recording which domains we get rate-limited on.
// For example, if you got rate limited on `something.github.com/foo`
// and now we're asked to fetch `something.github.com/bar`,
// it's good to know not to bother yet.
const _rateLimitedDomains = new Map()
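// innerFetch() resolves to an object like { ok: true, statusCode: 200 },
// or { ok: false, requestError: '...' } when `got` raises a RequestError
// (DNS failure, timeout, etc.). It recurses after a 429 response (once
// it has slept) and falls back from HEAD to GET on 405/404.
// Illustrative call (not executed):
//
//   const { ok, statusCode } = await innerFetch('https://example.com/', { verbose: true })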
async function innerFetch(url, config = {}) {
  const { verbose, useGET, patient } = config

  const { hostname } = new URL(url)
  if (_rateLimitedDomains.has(hostname)) {
    await sleep(_rateLimitedDomains.get(hostname))
  }

  // The way `got` does retries:
  //
  //   sleep = 1000 * Math.pow(2, retry - 1) + Math.random() * 100
  //
  // So, it means:
  //
  //   1. ~1000ms
  //   2. ~2000ms
  //   3. ~4000ms
  //
  // ...if the limit we set is 3.
  // Our own timeout, in ./middleware/timeout.js, defaults to 10 seconds.
  // So there's no point in trying more attempts than 3 because it would
  // just time out on the 10s. (i.e. 1000 + 2000 + 4000 + 8000 > 10,000)
  const retry = {
    limit: patient ? 5 : 2,
  }
  const timeout = { request: patient ? 10000 : 2000 }
  const headers = {
    'User-Agent':
      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36',
  }
  const retries = config.retries || 0
  const httpFunction = useGET ? got.get : got.head

  if (verbose) console.log(`External URL ${useGET ? 'GET' : 'HEAD'}: ${url} (retries: ${retries})`)
  try {
    const r = await httpFunction(url, {
      headers,
      throwHttpErrors: false,
      retry,
      timeout,
    })
    if (verbose) {
      console.log(
        `External URL ${useGET ? 'GET' : 'HEAD'} ${url}: ${r.statusCode} (retries: ${retries})`
      )
    }

    // If we get rate limited, remember that this hostname is now all
    // rate limited. And sleep for the number of seconds that the
    // `retry-after` header indicated.
    if (r.statusCode === 429) {
      let sleepTime = Math.min(
        60_000,
        Math.max(10_000, getRetryAfterSleep(r.headers['retry-after']))
      )
      // Sprinkle a little jitter so it doesn't all start again
      // at the same time
      sleepTime += Math.random() * 10 * 1000
      // Give it a bit extra when we can be really patient
      if (patient) sleepTime += 30 * 1000

      _rateLimitedDomains.set(hostname, sleepTime + Math.random() * 10 * 1000)
      if (verbose)
        console.log(
          chalk.yellow(
            `Rate limited on ${hostname} (${url}). Sleeping for ${(sleepTime / 1000).toFixed(1)}s`
          )
        )
      await sleep(sleepTime)
      return innerFetch(url, Object.assign({}, config, { retries: retries + 1 }))
    } else {
      _rateLimitedDomains.delete(hostname)
    }

    // Perhaps the server doesn't support HEAD requests.
    // If so, try again with a regular GET.
    if ((r.statusCode === 405 || r.statusCode === 404) && !useGET) {
      return innerFetch(url, Object.assign({}, config, { useGET: true }))
    }

    if (verbose) {
      console.log((r.ok ? chalk.green : chalk.red)(`${r.statusCode} on ${url}`))
    }

    return { ok: r.ok, statusCode: r.statusCode }
  } catch (err) {
    if (err instanceof RequestError) {
      if (verbose) {
        console.log(chalk.yellow(`RequestError (${err.message}) on ${url}`))
      }
      return { ok: false, requestError: err.message }
    }
    throw err
  }
}

// Return the number of milliseconds to sleep, given a `Retry-After`
// header value, which can be either a number of seconds or an HTTP date.
function getRetryAfterSleep(headerValue) {
  if (!headerValue) return 0
  let ms = Math.round(parseFloat(headerValue) * 1000)
  if (isNaN(ms)) {
    ms = Math.max(0, new Date(headerValue) - new Date())
  }
  return ms
}
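// Illustrative examples (not executed):
//
//   getRetryAfterSleep('120')
//   // => 120000 (the seconds form)
//   getRetryAfterSleep('Wed, 01 Jan 2070 00:00:00 GMT')
//   // => milliseconds from now until that date (the HTTP-date form)
//   getRetryAfterSleep(undefined)
//   // => 0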
function checkImageSrc(src, $) {
  const pathname = new URL(src, 'http://example.com').pathname
  if (!pathname.startsWith('/')) {
    return { WARNING: "External images can't be checked" }
  }
  const prefix = pathname.split('/')[1]
  if (prefix in STATIC_PREFIXES) {
    const staticFilePath = path.join(
      STATIC_PREFIXES[prefix],
      pathname.split(path.sep).slice(2).join(path.sep)
    )
    if (!fs.existsSync(staticFilePath)) {
      return { CRITICAL: `Static file not found (${pathname})` }
    }
  } else {
    return { WARNING: `Unrecognized image src prefix (${prefix})` }
  }
}

function summarizeFlaws(flaws) {
  if (flaws.length) {
    console.log(
      chalk.bold(
        `Found ${flaws.length.toLocaleString()} flaw${flaws.length === 1 ? '' : 's'} in total.`
      )
    )
  } else {
    console.log(chalk.green('No flaws found! 💖'))
  }
}

function summarizeCounts(pages) {
  const count = pages.map((page) => page.permalinks.length).reduce((a, b) => a + b, 0)
  console.log(
    `Tested ${count.toLocaleString()} permalinks across ${pages.length.toLocaleString()} pages`
  )
}

// In-place Fisher-Yates shuffle
function shuffle(array) {
  let currentIndex = array.length
  let randomIndex

  // While there remain elements to shuffle...
  while (currentIndex !== 0) {
    // Pick a remaining element...
    randomIndex = Math.floor(Math.random() * currentIndex)
    currentIndex--

    // And swap it with the current element.
    ;[array[currentIndex], array[randomIndex]] = [array[randomIndex], array[currentIndex]]
  }

  return array
}

async function renderInnerHTML(page, permalink) {
  const next = () => {}
  const res = {}

  const pagePath = permalink.href
  const req = {
    path: pagePath,
    language: permalink.languageCode,
    pagePath,
    cookies: {},
  }
  await contextualize(req, res, next)
  await shortVersions(req, res, next)

  const context = Object.assign({}, req.context, { page })
  context.relativePath = page.relativePath
  return await renderContent(page.markdown, context)
}

// Deliberately commented out. Kept temporarily in case it's better.
// async function renderPage(page, permalink) {
//   const next = () => {}
//   const res = {}
//
//   const pagePath = permalink.href
//   const req = {
//     path: pagePath,
//     language: permalink.languageCode,
//     pagePath,
//     cookies: {},
//   }
//   await contextualize(req, res, next)
//
//   const context = Object.assign({}, req.context, { page })
//   return await page._render(context)
// }