Merge pull request #32252 from github/repo-sync

Repo sync
2025-12-19 18:10:59 -05:00 · 2024-03-27 13:30:10 -04:00
parent 18e7fd470a b141a77a45
commit ae168ac836
7 changed files with 260 additions and 27 deletions
--- a/.github/workflows/count-translation-corruptions.yml
+++ b/.github/workflows/count-translation-corruptions.yml
@@ -0,0 +1,43 @@
 name: Count translation corruptions
 # **What it does**: Generates a summary of Liquid corruptions per language.
 # **Why we have it**: For insights into the state of translations and things we can do to fix them
 # **Who does it impact**: Engineering
 on:
  workflow_dispatch:
  # Also run on pull requests that touch the script, this workflow, or the
  # shared actions/dependencies it relies on.
  pull_request:
    paths:
      - src/languages/scripts/count-translation-corruptions.ts
      - .github/workflows/count-translation-corruptions.yml
      - .github/actions/node-npm-setup/action.yml
      - .github/actions/clone-translations/action.yml
      - 'package**.json'
 permissions:
  contents: read
 jobs:
  count-translation-corruptions:
    if: github.repository == 'github/docs-internal'
    runs-on: ubuntu-20.04-xl
    steps:
      - name: Checkout English repo
        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
        with:
          # NOTE(review): no step in this workflow creates a commit and the
          # job only has `contents: read`, so the usual "use a PAT so the new
          # commit triggers CI" rationale does not apply here.
          # TODO confirm whether a PAT is still required for this checkout.
          token: ${{ secrets.DOCS_BOT_PAT_READPUBLICKEY }}
      # The count script reads translated pages and reusables from disk
      # (see src/languages/scripts/count-translation-corruptions.ts), so the
      # translations have to be cloned before it runs.
      - name: Clone all translations
        uses: ./.github/actions/clone-translations
        with:
          token: ${{ secrets.DOCS_BOT_PAT_READPUBLICKEY }}
      - uses: ./.github/actions/node-npm-setup
      - name: Run count
        run: npm run count-translation-corruptions
--- a/.github/workflows/sme-review-tracking-issue.yml
+++ b/.github/workflows/sme-review-tracking-issue.yml
@@ -13,10 +13,6 @@ on:
    types:
      - labeled
  pull_request:
    types:
      - labeled
 permissions:
  contents: read
--- a/package.json
+++ b/package.json
@@ -22,6 +22,7 @@
    "check-content-type": "node src/workflows/check-content-type.js",
    "check-github-github-links": "node src/links/scripts/check-github-github-links.js",
    "copy-fixture-data": "node src/tests/scripts/copy-fixture-data.js",
    "count-translation-corruptions": "tsx src/languages/scripts/count-translation-corruptions.ts",
    "debug": "cross-env NODE_ENV=development ENABLED_LANGUAGES=en nodemon --inspect src/frame/server.js",
    "delete-orphan-translation-files": "tsx src/workflows/delete-orphan-translation-files.ts",
    "dev": "cross-env npm start",
--- a/src/code-scanning/scripts/generate-code-scanning-query-list.ts
+++ b/src/code-scanning/scripts/generate-code-scanning-query-list.ts
@@ -90,6 +90,12 @@ type Query = {
  autofixSupport: 'none' | 'default'
 }
 type QueryExtended = Query & {
  inDefault: boolean
  inExtended: boolean
  inAutofix: boolean
 }
 const opts = program.opts()
 main(
  {
@@ -162,8 +168,28 @@ async function main(options: Options, language: string) {
    }
  }
-  const entries = Object.values(queries)
+  function decorate(query: Query): QueryExtended {
-  entries.sort((a, b) => a.name.localeCompare(b.name))
+    return {
      ...query,
      inDefault: query.packs.includes('code-scanning'),
      inExtended: query.packs.includes('security-extended'),
      inAutofix: query.autofixSupport === 'default',
    }
  }
  const entries = Object.values(queries).map(decorate)
  // Spec: "Queries that are both in Default and Extended should come first,
  // in alphabetical order. Followed by the queries that are in Extended only."
  entries.sort((a, b) => {
    if (a.inDefault && !b.inDefault) return -1
    else if (!a.inDefault && b.inDefault) return 1
    if (a.inExtended && !b.inExtended) return -1
    else if (!a.inExtended && b.inExtended) return 1
    return a.name.localeCompare(b.name)
  })
  // At the moment, our chosen business logic is that we omit the Autofix
  // column if there are no queries that support it.
@@ -174,7 +200,7 @@ async function main(options: Options, language: string) {
  printQueries(options, entries, includeAutofix)
 }
-function printQueries(options: Options, queries: Query[], includeAutofix: boolean) {
+function printQueries(options: Options, queries: QueryExtended[], includeAutofix: boolean) {
  const markdown = []
  markdown.push('{% rowheaders %}')
  markdown.push('') // blank line
@@ -190,18 +216,9 @@ function printQueries(options: Options, queries: Query[], includeAutofix: boolea
  for (const query of queries) {
    const markdownLink = `[${query.name}](${query.url})`
-    let defaultIcon = notIncludedOcticon
+    const defaultIcon = query.inDefault ? includedOcticon : notIncludedOcticon
-    let extendedIcon = notIncludedOcticon
+    const extendedIcon = query.inExtended ? includedOcticon : notIncludedOcticon
-    let autofixIcon = notIncludedOcticon
+    const autofixIcon = query.inAutofix ? includedOcticon : notIncludedOcticon
    if (query.packs.includes('code-scanning')) {
      defaultIcon = includedOcticon
    }
    if (query.packs.includes('security-extended')) {
      extendedIcon = includedOcticon
    }
    if (query.autofixSupport === 'default') {
      autofixIcon = includedOcticon
    }
    const row = [markdownLink, query.cwes.join(', '), defaultIcon, extendedIcon]
    if (includeAutofix) {
      row.push(autofixIcon)
--- a/src/frame/lib/page-data.js
+++ b/src/frame/lib/page-data.js
@@ -266,13 +266,17 @@ async function translateTree(dir, langObj, enTree) {
 *
 * Order of languages and versions doesn't matter, but order of child page arrays DOES matter (for navigation).
 */
-export async function loadSiteTree(unversionedTree) {
+export async function loadSiteTree(unversionedTree, languagesOnly = []) {
-  const rawTree = Object.assign({}, unversionedTree || (await loadUnversionedTree()))
+  const rawTree = Object.assign({}, unversionedTree || (await loadUnversionedTree(languagesOnly)))
  const siteTree = {}
  const langCodes = (languagesOnly.length && languagesOnly) || Object.keys(languages)
  // For every language...
  await Promise.all(
-    Object.keys(languages).map(async (langCode) => {
+    langCodes.map(async (langCode) => {
      if (!(langCode in rawTree)) {
        throw new Error(`No tree for language ${langCode}`)
      }
      const treePerVersion = {}
      // in every version...
      await Promise.all(
@@ -329,8 +333,12 @@ export async function loadPageList(unversionedTree, languagesOnly = []) {
  const rawTree = unversionedTree || (await loadUnversionedTree(languagesOnly))
  const pageList = []
  const langCodes = (languagesOnly.length && languagesOnly) || Object.keys(languages)
  await Promise.all(
-    ((languagesOnly.length && languagesOnly) || Object.keys(languages)).map(async (langCode) => {
+    langCodes.map(async (langCode) => {
      if (!(langCode in rawTree)) {
        throw new Error(`No tree for language ${langCode}`)
      }
      await addToCollection(rawTree[langCode], pageList)
    }),
  )
--- a/src/frame/lib/warm-server.js
+++ b/src/frame/lib/warm-server.js
@@ -26,8 +26,8 @@ async function warmServer(languagesOnly = []) {
  }
  const unversionedTree = await dog.loadUnversionedTree(languagesOnly)
-  const siteTree = await dog.loadSiteTree(unversionedTree)
+  const siteTree = await dog.loadSiteTree(unversionedTree, languagesOnly)
-  const pageList = await dog.loadPages(unversionedTree)
+  const pageList = await dog.loadPages(unversionedTree, languagesOnly)
  const pageMap = await dog.loadPageMap(pageList)
  const redirects = await dog.loadRedirects(pageList)
@@ -52,12 +52,12 @@ dog.warmServer = statsd.asyncTimer(warmServer, 'warm_server')
 // We only want statistics if the priming needs to occur, so let's wrap the
 // real method and return early [without statistics] whenever possible
-export default async function warmServerWrapper() {
+export default async function warmServerWrapper(languagesOnly = []) {
  // Handle receiving multiple calls to this method from multiple page requests
  // by holding the in-progress Promise and returning it instead of allowing
  // the server to actually load all of the files multiple times.
  if (!promisedWarmServer) {
-    promisedWarmServer = dog.warmServer()
+    promisedWarmServer = dog.warmServer(languagesOnly)
  }
  return promisedWarmServer
 }
--- a/src/languages/scripts/count-translation-corruptions.ts
+++ b/src/languages/scripts/count-translation-corruptions.ts
@@ -0,0 +1,168 @@
 import path from 'path'
 import fs from 'fs'
 import { program } from 'commander'
 import chalk from 'chalk'
 import { TokenizationError } from 'liquidjs'
 import walk from 'walk-sync'
 import { getLiquidTokens } from '@/content-linter/lib/helpers/liquid-utils.js'
 import languages from '@/languages/lib/languages.js'
 import warmServer, { type Site } from '@/frame/lib/warm-server.js'
 import { correctTranslatedContentStrings } from '@/languages/lib/correct-translation-content.js'
 // Command-line interface: zero or more language codes may be passed as
 // positional arguments; `main` receives them as an array.
 program
  .description('Tally the number of liquid corruptions in a translation')
  .argument('[language...]', 'language(s) to compare against')
  .action(main)
 // `main` is an async action handler, so use `parseAsync` (rather than the
 // synchronous `parse`) as Commander requires for async handlers.
 program.parseAsync(process.argv)
 // The page properties this script declares for entries of `site.pageList`.
 // `title`, `shortTitle`, `intro` and `markdown` are the strings that get
 // tokenized as Liquid; `languageCode` is used to pick out the pages that
 // belong to the language being tallied.
 type Page = {
  relativePath: string
  fullPath: string
  title: string
  shortTitle?: string
  intro: string
  markdown: string
  languageCode: string
 }
 // Maps an English reusable's file path (as returned by the directory walk,
 // base path included) to that file's raw content.
 type Reusables = Map<string, string>
 /**
  * Entry point: tally Liquid corruptions for each requested language and
  * print a grand total across all of them.
  *
  * @param languageCodes Language codes given on the command line. When empty,
  *   every language in `languages` except English is checked.
  *
  * Exits the process with status 1 if any requested code is unknown or is
  * 'en'.
  */
 async function main(languageCodes: string[]) {
  const langCodes = languageCodes.length
    ? languageCodes
    : Object.keys(languages).filter((x) => x !== 'en')

  // Validate every requested code up front. Doing this before warming the
  // server means a typo is reported immediately with a clear message,
  // instead of after (or from within) the expensive warm-up, and no partial
  // report is printed for earlier languages before the script aborts.
  for (const languageCode of langCodes) {
    if (!(languageCode in languages)) {
      console.error(chalk.red(`Language ${languageCode} not found`))
      return process.exit(1)
    }
    if (languageCode === 'en') {
      console.error(chalk.red("Can't test in English ('en')"))
      return process.exit(1)
    }
  }

  // English is always included when specific languages are requested; an
  // empty list is passed through as-is when no languages were given.
  const site = await warmServer(languageCodes.length ? ['en', ...langCodes] : [])

  // When checking reusables, we only want to check the files that
  // have an English equivalent.
  const reusables = getReusables()

  const totalErrors = new Map<string, number>()
  for (const languageCode of langCodes) {
    const { errors } = run(languageCode, site, reusables)
    for (const [error, count] of Array.from(errors.entries())) {
      totalErrors.set(error, (totalErrors.get(error) || 0) + count)
    }
  }

  const sumTotal = Array.from(totalErrors.values()).reduce((acc, count) => acc + count, 0)
  console.log('\nGRAND TOTAL ERRORS:', sumTotal)
 }
 /**
  * Read every Markdown file under `data/reusables` (README.md files are
  * ignored) and return a map from file path, base path included, to the
  * file's UTF-8 content.
  */
 function getReusables(): Reusables {
  const markdownPaths = walk('data/reusables', {
    includeBasePath: true,
    globs: ['**/*.md'],
    ignore: ['**/README.md'],
  })
  return new Map(
    markdownPaths.map((filePath): [string, string] => [
      filePath,
      fs.readFileSync(filePath, 'utf8'),
    ]),
  )
 }
 /**
  * Tally Liquid tokenization errors for a single language and print a report.
  *
  * Two sources are checked:
  *  - every page in `site.pageList` whose `languageCode` matches: its title,
  *    shortTitle, intro and markdown strings (empty ones are skipped);
  *  - the translated counterpart of every English reusable, read from
  *    `language.dir` and passed through `correctTranslatedContentStrings`
  *    before tokenizing. Reusables with no translated file are skipped.
  *
  * @param languageCode Code of the language to tally; must be a key of `languages`.
  * @param site The warmed site; only `pageList` is read.
  * @param englishReusables English reusable path -> English content.
  * @returns `{ errors }`, a map from error message to number of occurrences.
  * @throws Rethrows any error that is neither a `TokenizationError` nor a
  *   missing-file (ENOENT) error.
  */
 function run(languageCode: string, site: Site, englishReusables: Reusables) {
  const PADDING = 60
  const language = languages[languageCode as keyof typeof languages]

  console.log(`--- Tallying liquid corruptions in ${languageCode} (${language.name}) ---`)

  const pageList: Page[] = site.pageList
  const errors = new Map<string, number>()
  const wheres = new Map<string, number>()
  const illegalTags = new Map<string, number>()

  function countError(error: TokenizationError, where: string) {
    // `originalError` and `token` are not part of the public typings we use
    // here, hence the cast.
    const raw = error as any
    // Fall back to the error's own message when there is no `originalError`,
    // so that tallying never throws from inside a catch handler.
    // NOTE(review): the fallback message may carry position details, which
    // would make the tally more fine-grained — confirm if that path is hit.
    const errorString: string = raw.originalError?.message ?? error.message
    if (errorString.includes('illegal tag syntax')) {
      const illegalTag = raw.token.content
      illegalTags.set(illegalTag, (illegalTags.get(illegalTag) || 0) + 1)
    }
    errors.set(errorString, (errors.get(errorString) || 0) + 1)
    wheres.set(where, (wheres.get(where) || 0) + 1)
  }

  for (const page of pageList) {
    if (page.languageCode !== languageCode) continue

    const strings: [string, string][] = [
      ['title', page.title],
      ['shortTitle', page.shortTitle || ''],
      ['intro', page.intro || ''],
      ['markdown', page.markdown],
    ]

    for (const [where, string] of strings) {
      // Nothing to tokenize in an empty/missing string.
      if (!string) continue
      try {
        getLiquidTokens(string)
      } catch (error) {
        if (error instanceof TokenizationError) {
          countError(error, where)
        } else {
          throw error
        }
      }
    }
  }

  for (const [relativePath, englishContent] of Array.from(englishReusables.entries())) {
    try {
      const filePath = path.join(language.dir, relativePath)
      const rawContent = fs.readFileSync(filePath, 'utf8')
      const correctedContent = correctTranslatedContentStrings(rawContent, englishContent, {
        code: languageCode,
        relativePath,
      })
      getLiquidTokens(correctedContent)
    } catch (error) {
      if (error instanceof TokenizationError) {
        countError(error, 'reusable')
      } else if (error instanceof Error && error.message.startsWith('ENOENT')) {
        // The reusable has no translated file for this language; skip it.
        continue
      } else {
        throw error
      }
    }
  }

  const flat = sortByCountDescending(errors)
  const sumTotal = flat.reduce((acc, [, count]) => acc + count, 0)

  console.log('\nMost common errors')
  printRows(flat, PADDING)
  console.log(`${'TOTAL:'.padEnd(3 + 1 + PADDING)}`, sumTotal)

  if (sumTotal) {
    console.log('\nMost common places')
    printRows(sortByCountDescending(wheres), PADDING)

    const illegalTagsFlat = sortByCountDescending(illegalTags)
    // Every recorded tag has a count of at least 1, so "any entries" is the
    // same condition as "sum of counts is non-zero".
    if (illegalTagsFlat.length) {
      console.log('\nMost common illegal tags', illegalTagsFlat.length > 10 ? ' (Top 10)' : '')
      printRows(illegalTagsFlat.slice(0, 10), PADDING)
    }
  }
  console.log('\n')

  return { errors }
 }

 // Returns the tally's [label, count] entries ordered from most to least frequent.
 function sortByCountDescending(tally: Map<string, number>): [string, number][] {
  return Array.from(tally.entries()).sort((a, b) => b[1] - a[1])
 }

 // Prints one numbered line per [label, count] row, with the label padded to `padding`.
 function printRows(rows: [string, number][], padding: number) {
  rows.forEach(([label, count], i) => {
    console.log(`${i + 1}.`.padEnd(3), label.padEnd(padding), count)
  })
 }