docs/lib/update-internal-links.js

import fs from 'fs'
import path from 'path'

import { visit } from 'unist-util-visit'
import { fromMarkdown } from 'mdast-util-from-markdown'
import { toMarkdown } from 'mdast-util-to-markdown'
import yaml from 'js-yaml'

import frontmatter from './read-frontmatter.js'
import {
  getPathWithLanguage,
  getPathWithoutLanguage,
  getPathWithoutVersion,
  getVersionStringFromPath,
} from './path-utils.js'
import loadRedirects from './redirects/precompile.js'
import patterns from './patterns.js'
import { loadUnversionedTree, loadPages, loadPageMap } from './page-data.js'
import getRedirect, { splitPathByLanguage } from './get-redirect.js'
import nonEnterpriseDefaultVersion from './non-enterprise-default-version.js'
import { deprecated } from './enterprise-server-releases.js'

// Return a deep copy of `obj`.
//
// Uses the built-in `structuredClone` when it works. If that call throws
// for any reason (for example because the global does not exist on
// Node 16), fall back to a YAML round-trip. The round-trip is safe enough
// because the data being cloned came from YAML deserialization in
// frontmatter() in the first place.
function objectClone(obj) {
  let copy
  try {
    copy = structuredClone(obj)
  } catch {
    copy = yaml.load(yaml.dump(obj))
  }
  return copy
}

// The magic string that, at runtime when the links are rendered, gets
// turned into the actual title of the page being linked to.
const AUTOTITLE = 'AUTOTITLE'

// Default values for the options accepted by `updateInternalLinks()`.
// Whatever the caller passes in is merged on top of these.
const Options = {
  // Replace the text of quoted internal links with AUTOTITLE.
  setAutotitle: false,
  // Rewrite link hrefs (and frontmatter link lists are always checked)
  // to the value computed by getNewHref().
  fixHref: false,
  // Warn about links whose serialized Markdown can't be found in the source.
  verbose: false,
  // Throw an error, instead of just warning, on links that look broken.
  strict: false,
}

// Process every file in `files` and return one result object per file.
//
// Each result is `{ file, ...resultOfUpdateFile }`. The English page tree,
// page map and redirects are loaded once up front and shared, via a
// context object, across all files.
//
// `options` is merged on top of the defaults in `Options`.
// If processing a file throws, the offending file name is logged and the
// original error is re-thrown.
export async function updateInternalLinks(files, options = {}) {
  const opts = { ...Options, ...options }

  const unversionedTree = await loadUnversionedTree(['en'])
  const pageList = await loadPages(unversionedTree, ['en'])
  const pages = await loadPageMap(pageList)
  const redirects = await loadRedirects(pageList)

  const context = {
    pages,
    redirects,
    currentLanguage: 'en',
    userLanguage: 'en',
  }

  const results = []
  for (const file of files) {
    let outcome
    try {
      outcome = await updateFile(file, context, opts)
    } catch (err) {
      // The stack trace alone doesn't say which file was being processed.
      console.warn(`The file it tried to process on exception was: ${file}`)
      throw err
    }
    results.push({ file, ...outcome })
  }

  return results
}

// Read one `.md` or `.yml` file and compute improved versions of its
// frontmatter data and its Markdown body. Nothing is written to disk.
//
// Parameters:
//   file    - path of the file to read.
//   context - object with `pages` (used for page lookups) that is also
//             passed on to getRedirect().
//   opts    - the merged options (`setAutotitle`, `fixHref`, `verbose`,
//             `strict`).
//
// Returns an object with:
//   data, content, rawContent - what was read from the file,
//   newData                   - clone of `data` with corrected link lists,
//   newContent                - `content` with the replacements applied,
//   replacements              - `{ asMarkdown, newAsMarkdown, line, column }`,
//   warnings                  - `{ warning, asMarkdown, line, column }`.
//
// Errors thrown while processing a frontmatter key are re-thrown after the
// key name has been logged.
async function updateFile(file, context, opts) {
  const rawContent = fs.readFileSync(file, 'utf8')
  const { data, content } = frontmatter(rawContent)

  // Since this function can process both `.md` and `.yml` files,
  // when treating a `.md` file, the `data` from `frontmatter(rawContent)`
  // is easy. But when dealing with a file like `data/learning-tracks/foo.yml`
  // then the `frontmatter(rawContent).data` always becomes `{}`.
  // And since the Yaml file might contain arrays of internal linked
  // pathnames, we have to re-read it fully.
  if (file.endsWith('.yml')) {
    Object.assign(data, yaml.load(content))
  }

  let newContent = content
  const ast = fromMarkdown(newContent)

  const replacements = []
  const warnings = []

  // The day we know with confidence that everyone is on Node >=17,
  // we can change this to use `structuredClone` without the polyfill
  // technique.
  const newData = objectClone(data)

  const ANY = Symbol('any')
  const IS_ARRAY = Symbol('is array')

  // This configuration determines which nested things to bother looking
  // into.
  const HAS_LINKS = {
    featuredLinks: ['gettingStarted', 'startHere', 'guideCards', 'popular'],
    introLinks: ANY,
    includeGuides: IS_ARRAY,
  }

  const pathSegments = file.split(path.sep)
  if (
    pathSegments.includes('data') &&
    pathSegments.includes('learning-tracks') &&
    file.endsWith('.yml')
  ) {
    // data/learning-tracks/**/*.yml files are different because the keys
    // are arbitrary but what they might all have in common is a key
    // there called `guides`
    for (const key of Object.keys(data)) {
      HAS_LINKS[key] = ['guides']
    }
  }

  // Does the `seek` configuration say that the list called `name` should
  // be inspected?
  const shouldInspect = (seek, name) =>
    (Array.isArray(seek) && seek.includes(name)) || seek === IS_ARRAY || seek === ANY

  for (const [key, seek] of Object.entries(HAS_LINKS)) {
    if (!(key in data)) {
      continue
    }
    const value = data[key]
    try {
      if (Array.isArray(value)) {
        if (shouldInspect(seek, key)) {
          const better = getNewFrontmatterLinkList(value, context, opts, file, rawContent)
          if (!equalArray(better, value)) {
            newData[key] = better
          }
        }
      } else if (value && typeof value === 'object') {
        // The guard matters: a key with an empty value in the YAML is
        // parsed as `null`, and `Object.entries(null)` throws a TypeError.
        // Such a key has no links to inspect, so it is skipped.
        for (const [group, thing] of Object.entries(value)) {
          if (Array.isArray(thing) && shouldInspect(seek, group)) {
            const better = getNewFrontmatterLinkList(thing, context, opts, file, rawContent)
            if (!equalArray(better, thing)) {
              newData[key][group] = better
            }
          }
        }
      }
    } catch (error) {
      // When in strict mode, if it throws an error that stacktrace will
      // bubble up to the CLI. And the CLI will mention which file it
      // was processing when it failed. But we have a valuable piece of
      // information here about which frontmatter key it was that failed.
      console.warn(`The frontmatter key it processed and failed was '${key}'`)
      throw error
    }
  }

  // Number of lines (the frontmatter) that precede `content` in the raw
  // file. AST positions are relative to `content`, so this is added to
  // report line numbers relative to the whole file.
  const lineOffset = rawContent.replace(content, '').split(/\n/g).length - 1

  // Where, in the raw file, the given AST node starts.
  const locationOf = (node) => ({
    line: node.position.start.line + lineOffset,
    column: node.position.start.column,
  })

  // Record a replacement and apply it to the first occurrence in
  // `newContent`.
  const applyReplacement = (node, asMarkdown, newAsMarkdown) => {
    replacements.push({
      asMarkdown,
      newAsMarkdown,
      ...locationOf(node),
    })
    // Use a replacer function rather than a replacement string. With a
    // string, sequences such as `$$`, `$&`, "$`" and `$'` in the new
    // Markdown would be interpreted as special replacement patterns and
    // corrupt the output.
    newContent = newContent.replace(asMarkdown, () => newAsMarkdown)
  }

  visit(ast, definitionMatcher, (node) => {
    const asMarkdown = toMarkdown(node).trim()
    // E.g. `[foo]: /bar`
    if (content.includes(asMarkdown) && opts.fixHref) {
      let newHref = node.url
      const { label } = node
      const betterHref = getNewHref(newHref, context, opts, file)
      // getNewHref() might return a deliberate `undefined` if the
      // new href value could not be computed for some reason.
      if (betterHref !== undefined) {
        newHref = betterHref
      }
      // NOTE(review): only the label and href are written back here. If a
      // definition ever carries a title (`[foo]: /bar "Title"`), it is
      // presumably dropped by this rewrite. Confirm none exist in content.
      const newAsMarkdown = `[${label}]: ${newHref}`
      if (asMarkdown !== newAsMarkdown) {
        // Something can be improved!
        applyReplacement(node, asMarkdown, newAsMarkdown)
      }
    }
  })

  visit(ast, linkMatcher, (node) => {
    const asMarkdown = toMarkdown(node).trim()
    if (content.includes(asMarkdown)) {
      // The title part of the link might be more Markdown.
      // For example...
      //
      //    [This *is* cool](/articles/link)
      //
      // In that case, for this link node, the title is the combined
      // serialization of `node.children`. But `toMarkdown()` always appends
      // `\n` to the serialized output, hence the `.slice(0, -1)`.
      // Now the title of the above-mentioned example becomes 'This *is* cool'
      // which is unlikely to be an attempt at the title of the document
      // that it links to.
      const title = node.children.map((child) => toMarkdown(child).slice(0, -1)).join('')

      let newTitle = title
      let newHref = node.url

      const hasQuotesAroundLink = content.includes(`"${asMarkdown}`)

      if (opts.setAutotitle) {
        if (hasQuotesAroundLink) {
          /**
           * Note! A lot of internal links are bullet points like:
           *
           *     - [Creating a repository](/articles/create-a-repo)
           *     - [Forking a repository](/articles/fork-a-repo)
           * or
           *     1. [Set your username in Git](/github/getting-started-with-github/setting-your-username-in-git).
           *     1. [Set your commit email address in Git](/articles/setting-your-commit-email-address).
           *
           * Perhaps we could recognize them as such and consider them
           * matches anyway. In particular if the title is made up of
           * a leading capital letter and mostly lower case after that.
           */

          if (title !== AUTOTITLE) {
            newTitle = AUTOTITLE
          }
        } else {
          /**
           * The Markdown link sometimes is written like this:
           *
           *   ["This is the title](/foo/bar)."
           *
           * or...
           *
           *   ["This is the title"](/foo/bar).
           */
          if (node.children && node.children.length > 0 && node.children[0].value) {
            const firstValue = node.children[0].value
            if (singleStartingQuote(firstValue)) {
              warnings.push({
                warning: 'Starts with a single " inside the text',
                asMarkdown,
                ...locationOf(node),
              })
            } else if (isSimpleQuote(firstValue)) {
              warnings.push({
                warning: 'Starts and ends with a " inside the text',
                asMarkdown,
                ...locationOf(node),
              })
            }
          }
        }
      }
      if (opts.fixHref) {
        const betterHref = getNewHref(node.url, context, opts, file)
        // getNewHref() might return a deliberate `undefined` if the
        // new href value could not be computed for some reason.
        if (betterHref !== undefined) {
          newHref = betterHref
        }
      }
      const newAsMarkdown = `[${newTitle}](${newHref})`
      if (asMarkdown !== newAsMarkdown) {
        // Something can be improved!
        applyReplacement(node, asMarkdown, newAsMarkdown)
      }
    } else if (opts.verbose) {
      console.warn(
        `Unable to find link as Markdown ('${asMarkdown}') in the source content (${file})`
      )
    }
  })

  return {
    data,
    content,
    rawContent,
    newContent,
    replacements,
    warnings,
    newData,
  }
}

// Test function for `visit()`: true only for `definition` nodes
// (e.g. `[foo]: /bar`) whose URL is non-empty and starts with `/`.
function definitionMatcher(node) {
  if (node.type !== 'definition' || !node.url) {
    return false
  }
  return node.url.startsWith('/')
}

// Test function for `visit()`: true for `link` nodes that point to an
// internal page this script is able to process.
function linkMatcher(node) {
  if (node.type !== 'link' || !node.url) {
    return false
  }
  const { url } = node

  // Only internal links are of interest.
  if (!url.startsWith('/') && !url.startsWith('./')) {
    return false
  }

  // Sometimes there's a link to view the asset as a separate link.
  // Skip these because they don't ultimately link to an actual Page.
  if (url.startsWith('/assets') || url.startsWith('/public/')) {
    return false
  }

  // If a link uses Liquid we can't process it. It would require full
  // rendering which this script is not doing.
  if (url.includes('{{') || url.includes('{%')) {
    return false
  }

  // Sometimes we link to archived enterprise-server versions. These
  // can never be updated because although they appear to be internal,
  // they are, in a sense, external. For example:
  // See "[This old thing](/enterprise-server@3.1/some/page)".
  // Skip these.
  const version = getVersionStringFromPath(url)
  const isDeprecatedEnterpriseServer =
    version &&
    version.startsWith('enterprise-server@') &&
    deprecated.includes(version.replace('enterprise-server@', ''))
  if (isDeprecatedEnterpriseServer) {
    return false
  }

  // Really old versions like `/enterprise/2.1` don't need to be
  // corrected because they're deliberately pointing to archived
  // versions.
  return !patterns.getEnterpriseVersionNumber.test(url)
}

function getNewFrontmatterLinkList(list, context, opts, file, rawContent) {
  /**
   * Return a new array, the same length as `list`, where every entry that
   * redirects somewhere else has been replaced with its destination.
   *
   * The `list` is expected to all be strings. Sometimes they're like this:
   *
   *   /search-github/searching-on-github/searching-for-repositories
   *
   * Sometimes they're like this:
   *
   *   {% ifversion fpt or ghec or ghes > 3.4 %}/pages/getting-started-with-github-pages{% endif %}
   *
   * In the case of Liquid, we have to temporarily remove it to be able to
   * test the path as a URL.
   *
   * An entry that is neither a known page nor a redirect is kept as is,
   * with a warning, unless `opts.strict` is set, in which case an error
   * is thrown.
   **/

  // Work out what a single entry should become.
  const improve = (entry) => {
    if (/{%\s*else\s*%}/.test(entry)) {
      console.warn(`Skipping frontmatter link with {% else %} in it: ${entry}. (file: ${file})`)
      return entry
    }

    const pure = stripLiquid(entry)
    const asURL = pure.startsWith('/') ? `/en${pure}` : `/en/${pure}`

    // A page that exists exactly as written needs no change.
    if (asURL in context.pages) {
      return entry
    }

    const redirected = getRedirect(asURL, context)
    if (redirected === undefined) {
      const lineNumber = findLineNumber(entry, rawContent)
      const where = lineNumber || 'unknown'
      const msg =
        'A frontmatter link appears to be broken. ' +
        `Neither redirect or a findable page: ${pure}. (file: ${file} line: ${where})`

      if (opts.strict) {
        throw new Error(msg)
      }
      console.warn(`WARNING: ${msg}`)
      return entry
    }

    // Perhaps it just redirected to a specific version
    const destination = getPathWithoutVersion(getPathWithoutLanguage(redirected))
    if (destination === pure) {
      return entry
    }
    // Replace only the path so any surrounding Liquid stays in place.
    return entry.replace(pure, destination)
  }

  const better = []
  for (const entry of list) {
    better.push(improve(entry))
  }
  return better
}

// Try to return the (1-based) number of the line in the raw content that
// `entry` was on, or `null` if no such line can be found.
//
// It's hard to know exactly because the `entry` is, most likely, the
// result of parsing the YAML from the frontmatter. So this is a best
// effort: it looks for the first line that ends with the entry and has a
// space immediately before it.
//
// Lines are split on both `\n` and `\r\n` so that files with Windows line
// endings don't leave a trailing `\r` that defeats the `endsWith` check.
function findLineNumber(entry, rawContent) {
  const lines = rawContent.split(/\r?\n/)
  const index = lines.findIndex((line) => line.endsWith(entry) && line.includes(` ${entry}`))
  return index === -1 ? null : index + 1
}

// An opening `{% ifversion ... %}` tag at the very start of the text.
const liquidStartRex = /^{%-?\s*ifversion .+?\s*%}/
// A closing `{% endif %}` tag at the very end of the text.
const liquidEndRex = /{%-?\s*endif\s*-?%}$/

// Remove a wrapping `{% ifversion ... %}`...`{% endif %}` pair.
//
// For example, given
//
//   {% ifversion ghes%}/foo/bar{%endif %}
//
// this returns
//
//    /foo/bar
//
// Text without any Liquid is returned untouched. Any other use of `{`
// is not supported and throws an error.
function stripLiquid(text) {
  const isWrapped = liquidStartRex.test(text) && liquidEndRex.test(text)
  if (!isWrapped) {
    if (text.includes('{')) {
      throw new Error(`Unsupported Liquid in frontmatter link list (${text})`)
    }
    return text
  }

  const withoutStart = text.replace(liquidStartRex, '')
  const withoutEnd = withoutStart.replace(liquidEndRex, '')
  return withoutEnd.trim()
}

// Shallow array equality: same length and every item in `arr1` is
// strictly equal (`===`) to the item at the same index in `arr2`.
function equalArray(arr1, arr2) {
  if (arr1.length !== arr2.length) {
    return false
  }
  const hasMismatch = arr1.some((item, index) => item !== arr2[index])
  return !hasMismatch
}

// Compute what an internal link's href should be.
//
// Returns the new href: the pathname without a trailing slash, possibly
// replaced by its redirect destination, with the original query string
// and hash appended.
// Returns `undefined` (after logging a warning) when no new href can be
// computed: the link has a hardcoded language, looks broken, or uses the
// old `/enterprise-server/` prefix without a version. With `opts.strict`
// set, those cases throw an error instead.
function getNewHref(href, context, opts, file) {
  const { currentLanguage } = context
  const { hash, search, pathname } = new URL(href, 'https://docs.github.com')
  let newHref = pathname.replace(patterns.trailingSlash, '$1')

  // Before testing if it redirects anywhere, we temporarily
  // pretend it's already prefixed for English (/en)
  const [language, withoutLanguage] = splitPathByLanguage(newHref, currentLanguage)
  if (withoutLanguage !== newHref) {
    // It means the link already had a language in it
    const msg = `Unable to cope with internal links with hardcoded language '${newHref}' (file: ${file})`
    if (opts.strict) {
      throw new Error(msg)
    }
    console.warn(`WARNING: ${msg}`)
    return
  }

  const newHrefWithLanguage = getPathWithLanguage(withoutLanguage, language)
  const redirected = getRedirect(newHrefWithLanguage, context)

  // If it comes back as `undefined` it means it didn't need to be
  // redirected, specifically.
  // Optionally, we could skip this whole step of checking for completely
  // broken internal links because other tools will later check that.
  if (redirected === undefined && !context.pages[newHrefWithLanguage]) {
    // If this happens, it's very possible that it's a broken link
    const msg = `A link appears to be broken. Neither redirect or a findable page '${href}' (${file})`
    if (opts.strict) {
      throw new Error(msg)
    }
    console.warn(`WARNING: ${msg}`)
    return
  }

  if (redirected) {
    // The getRedirect() function will produce a final URL that the user
    // can use, but that means it also injects the language in there.
    // For updating the content statically, we don't want that.
    // Note: It could be an idea to somehow tell getRedirect() to not
    // bother, but perhaps that adds unnecessary complexity to a function
    // that has to work perfectly for runtime.
    const redirectedWithoutLanguage = getPathWithoutLanguage(redirected)

    // Some paths can't be viewed in free-pro-team so the getRedirect()
    // function will inject the version that you're supposed to go to.
    // For example `/enterprise/admin/guides/installation/configuring-a-hostname`
    // redirects to `/enterprise-server@3.7/admin/configuration/configuring-...`
    // (at the time of writing) which is good when you're actually clicking
    // the link but not good when we're trying to update the source
    // content.
    // The `getPathWithoutVersion` function doesn't change the input if
    // the URL passed doesn't appear to have a valid version in it already.
    // I.e. `getPathWithoutVersion('/get-started') === '/get-started'`
    // but `getPathWithoutVersion('/enterprise-server@3.8/get-started') === '/get-started'`
    // But hang on, in some rare cases the content deliberately linked to
    // a specific version. If that's the case, leave it like that.
    // There's another exception! Some links have the `/free-pro-team@latest/`
    // prefix. The `getRedirect()` will always remove that. If that's the case
    // we always want to respect that and put it back in.
    if (withoutLanguage.includes(`/${nonEnterpriseDefaultVersion}/`)) {
      newHref = `/${nonEnterpriseDefaultVersion}${redirectedWithoutLanguage}`
    } else if (withoutLanguage.startsWith('/enterprise-server/')) {
      const msg =
        "Old /enterprise-server/ links that don't include a @version is no longer supported. " +
        'If you see this, manually fix that link to use enterprise-server@latest.'
      if (opts.strict) {
        throw new Error(msg)
      }
      console.warn(msg)
      return
    } else if (withoutLanguage.startsWith('/enterprise-server@latest')) {
      // getRedirect() will always replace `enterprise-server@latest` with
      // whatever the latest number is. E.g. `enterprise-server@3.9`.
      // But we have to "undo" that.
      newHref = `/enterprise-server@latest${getPathWithoutVersion(redirectedWithoutLanguage)}`
    } else if (getPathWithoutVersion(withoutLanguage) !== withoutLanguage) {
      // The original link deliberately named a version; keep the
      // versioned destination.
      newHref = redirectedWithoutLanguage
    } else {
      newHref = getPathWithoutVersion(redirectedWithoutLanguage)
    }
  }

  // `search` and `hash` are empty strings when the href has neither.
  return newHref + search + hash
}

// True if `text` starts with a `"` and that is the only `"` in it.
// For example: `"This is the title`
function singleStartingQuote(text) {
  if (!text.startsWith('"')) {
    return false
  }
  let quoteCount = 0
  for (const character of text) {
    if (character === '"') {
      quoteCount += 1
    }
  }
  return quoteCount === 1
}

// True if `text` is wrapped in a pair of `"` and has no other `"` in it.
// For example: `"This is the title"`
function isSimpleQuote(text) {
  if (!text.startsWith('"') || !text.endsWith('"')) {
    return false
  }
  let quoteCount = 0
  for (const character of text) {
    if (character === '"') {
      quoteCount += 1
    }
  }
  return quoteCount === 2
}