const cheerio = require('cheerio') const findPage = require('./find-page') const renderContent = require('./render-content') const rewriteLocalLinks = require('./rewrite-local-links') const getApplicableVersions = require('./get-applicable-versions') const { getPathWithoutLanguage } = require('./path-utils') const { getEnterpriseVersionNumber } = require('./patterns') const { deprecated } = require('./enterprise-server-releases') // internal links will have a language code by the time we're testing them // we also want to capture same-page anchors (#foo) const languageCode = 'en' const internalHrefs = ['/en', '#'] const renderedPageCache = {} const checkedAnchorCache = {} module.exports = async function checkLinks ($, page, context, version, checkedLinkCache = {}) { // run rewriteLocalLinks to version links and add language codes rewriteLocalLinks($, version, languageCode) const brokenLinks = { anchors: [], links: [] } // internal link check for (const href of internalHrefs) { const internalLinks = $(`a[href^="${href}"]`).get() for (const internalLink of internalLinks) { const href = $(internalLink).attr('href') // enable caching so we don't check links more than once // anchor links are cached locally (within this run) since they are specific to the page if (checkedLinkCache[href] || checkedAnchorCache[href]) continue const [link, anchor] = href.split('#') // if anchor only (e.g., #foo), look for heading on same page if (anchor && !link) { // ignore anchors that are autogenerated from headings if (anchor === $(internalLink).parent().attr('id')) continue const matchingHeadings = getMatchingHeadings($, anchor) if (matchingHeadings.length === 0) { brokenLinks.anchors.push({ 'broken same-page anchor': `#${anchor}`, reason: 'heading not found on page' }) } checkedAnchorCache[href] = true continue } checkedLinkCache[href] = true // skip rare hardcoded links to old GHE versions // these paths will always be in the old versioned form // example: /enterprise/11.10.340/admin/articles/upgrading-to-the-latest-release const gheVersionInLink = link.match(getEnterpriseVersionNumber) if (gheVersionInLink && deprecated.includes(gheVersionInLink[1])) continue // look for linked page const linkedPage = findPage(link, context.pages, context.redirects, languageCode) if (!linkedPage) { brokenLinks.links.push({ 'broken link': link, reason: 'linked page not found' }) continue } // finding the linked page isn't enough if it's a github.com page; also need to check versions if (linkedPage.relativePath.startsWith('github')) { const linkedPageVersions = getApplicableVersions(linkedPage.versions, linkedPage.relativePath) if (!linkedPageVersions.includes(version) && $(internalLink).attr('class') !== 'dotcom-only') { brokenLinks.links.push({ 'broken link': link, reason: `${version} not found in linked page versions`, 'linked page': linkedPage.fullPath }) continue } } // don't check anchors on developers content if (linkedPage.relativePath.match(/^(rest|graphql|developers)/)) continue // create a unique string for caching purposes const pathToCache = version + linkedPage.relativePath const anchorToCheck = anchor // if link with anchor (e.g., /some/path#foo), look for heading on linked page if (anchorToCheck) { // either render page or fetch it from cache if we've already rendered it let linkedPageObject if (!renderedPageCache[pathToCache]) { const linkedPageHtml = await renderContent(linkedPage.markdown, context) linkedPageObject = cheerio.load(linkedPageHtml, { xmlMode: true }) renderedPageCache[pathToCache] = linkedPageObject } else { linkedPageObject = renderedPageCache[pathToCache] } const matchingHeadings = getMatchingHeadings(linkedPageObject, anchorToCheck) if (matchingHeadings.length === 0) { if (anchor) { brokenLinks.anchors.push({ 'broken anchor': `#${anchor}`, 'full link': `${getPathWithoutLanguage(link)}#${anchor}`, reason: 'heading not found on linked page', 'linked page': linkedPage.fullPath }) } continue } } } } return { brokenLinks, checkedLinkCache } } // article titles are h1s; headings can be any subsequent level function getMatchingHeadings ($, anchor) { return $(` h2[id="${anchor}"], h3[id="${anchor}"], h4[id="${anchor}"], h5[id="${anchor}"], h6[id="${anchor}"], a[name="${anchor}"] `) }