.github/workflows/count-translation-corruptions.yml (vendored, new file, 43 lines)
@@ -0,0 +1,43 @@
name: Count translation corruptions

# **What it does**: Generates a summary of Liquid corruptions per language.
# **Why we have it**: For insights into the state of translations and things we can do to fix them
# **Who does it impact**: Engineering

on:
  workflow_dispatch:
  pull_request:
    paths:
      - src/languages/scripts/count-translation-corruptions.ts
      - .github/workflows/count-translation-corruptions.yml
      - .github/actions/node-npm-setup/action.yml
      - .github/actions/clone-translations/action.yml
      - 'package**.json'

permissions:
  contents: read

jobs:
  count-translation-corruptions:
    if: github.repository == 'github/docs-internal'
    runs-on: ubuntu-20.04-xl
    steps:
      - name: Checkout English repo
        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
        with:
          # Using a PAT is necessary so that the new commit will trigger the
          # CI in the PR. (Events from GITHUB_TOKEN don't trigger new workflows.)
          token: ${{ secrets.DOCS_BOT_PAT_READPUBLICKEY }}

      # It's important because translations are often a bit behind.
      # So if a translation is a bit behind, it might still be referencing
      # an asset even though none of the English content does.
      - name: Clone all translations
        uses: ./.github/actions/clone-translations
        with:
          token: ${{ secrets.DOCS_BOT_PAT_READPUBLICKEY }}

      - uses: ./.github/actions/node-npm-setup

      - name: Run count
        run: npm run count-translation-corruptions
@@ -13,10 +13,6 @@ on:
    types:
      - labeled

  pull_request:
    types:
      - labeled

permissions:
  contents: read
@@ -22,6 +22,7 @@
    "check-content-type": "node src/workflows/check-content-type.js",
    "check-github-github-links": "node src/links/scripts/check-github-github-links.js",
    "copy-fixture-data": "node src/tests/scripts/copy-fixture-data.js",
+   "count-translation-corruptions": "tsx src/languages/scripts/count-translation-corruptions.ts",
    "debug": "cross-env NODE_ENV=development ENABLED_LANGUAGES=en nodemon --inspect src/frame/server.js",
    "delete-orphan-translation-files": "tsx src/workflows/delete-orphan-translation-files.ts",
    "dev": "cross-env npm start",
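The new `count-translation-corruptions` entry is what the workflow above invokes. Below is a minimal sketch of how the commander setup in the script (shown in full further down) maps CLI arguments onto `main`; the `ja`/`es` invocation and the inline logging are illustrative assumptions, and npm only forwards arguments that follow `--`.

```ts
// A sketch of the script's argument handling, assuming an invocation such as:
//   npm run count-translation-corruptions -- ja es
import { program } from 'commander'

program
  .description('Tally the number of liquid corruptions in a translation')
  .argument('[language...]', 'language(s) to compare against')
  .action((languageCodes: string[]) => {
    // With `-- ja es` forwarded by npm, languageCodes === ['ja', 'es'];
    // with no arguments it is [], and the real script then falls back to
    // every configured non-English language.
    console.log(languageCodes)
  })

program.parse(process.argv)
```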
@@ -90,6 +90,12 @@ type Query = {
  autofixSupport: 'none' | 'default'
}

+type QueryExtended = Query & {
+  inDefault: boolean
+  inExtended: boolean
+  inAutofix: boolean
+}
+
const opts = program.opts()
main(
  {
@@ -162,8 +168,28 @@ async function main(options: Options, language: string) {
    }
  }

-  const entries = Object.values(queries)
-  entries.sort((a, b) => a.name.localeCompare(b.name))
+  function decorate(query: Query): QueryExtended {
+    return {
+      ...query,
+      inDefault: query.packs.includes('code-scanning'),
+      inExtended: query.packs.includes('security-extended'),
+      inAutofix: query.autofixSupport === 'default',
+    }
+  }
+
+  const entries = Object.values(queries).map(decorate)
+
+  // Spec: "Queries that are both in Default and Extended should come first,
+  // in alphabetical order. Followed by the queries that are in Extended only."
+  entries.sort((a, b) => {
+    if (a.inDefault && !b.inDefault) return -1
+    else if (!a.inDefault && b.inDefault) return 1
+
+    if (a.inExtended && !b.inExtended) return -1
+    else if (!a.inExtended && b.inExtended) return 1
+
+    return a.name.localeCompare(b.name)
+  })

  // At the moment, our chosen business logic is that we omit the Autofix
  // column if there are no queries that support it.
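To see what the decorate-and-sort step produces, here is a small self-contained sketch; the sample rows are invented for illustration, and only the fields the comparator reads (`name`, `packs`, `autofixSupport`) matter.

```ts
// Hypothetical sample data to illustrate the decorate-then-sort behavior above.
type Query = { name: string; packs: string[]; autofixSupport: 'none' | 'default' }

const sample: Query[] = [
  { name: 'Zip slip', packs: ['code-scanning', 'security-extended'], autofixSupport: 'default' },
  { name: 'Clear-text logging', packs: ['security-extended'], autofixSupport: 'none' },
  { name: 'Code injection', packs: ['code-scanning', 'security-extended'], autofixSupport: 'none' },
]

const entries = sample.map((q) => ({
  ...q,
  inDefault: q.packs.includes('code-scanning'),
  inExtended: q.packs.includes('security-extended'),
  inAutofix: q.autofixSupport === 'default',
}))

entries.sort((a, b) => {
  if (a.inDefault && !b.inDefault) return -1
  else if (!a.inDefault && b.inDefault) return 1
  if (a.inExtended && !b.inExtended) return -1
  else if (!a.inExtended && b.inExtended) return 1
  return a.name.localeCompare(b.name)
})

// Default+Extended queries first (alphabetical), then Extended-only:
// ['Code injection', 'Zip slip', 'Clear-text logging']
console.log(entries.map((q) => q.name))
```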
@@ -174,7 +200,7 @@ async function main(options: Options, language: string) {
  printQueries(options, entries, includeAutofix)
}

-function printQueries(options: Options, queries: Query[], includeAutofix: boolean) {
+function printQueries(options: Options, queries: QueryExtended[], includeAutofix: boolean) {
  const markdown = []
  markdown.push('{% rowheaders %}')
  markdown.push('') // blank line
@@ -190,18 +216,9 @@ function printQueries(options: Options, queries: Query[], includeAutofix: boolea

  for (const query of queries) {
    const markdownLink = `[${query.name}](${query.url})`
-    let defaultIcon = notIncludedOcticon
-    let extendedIcon = notIncludedOcticon
-    let autofixIcon = notIncludedOcticon
-    if (query.packs.includes('code-scanning')) {
-      defaultIcon = includedOcticon
-    }
-    if (query.packs.includes('security-extended')) {
-      extendedIcon = includedOcticon
-    }
-    if (query.autofixSupport === 'default') {
-      autofixIcon = includedOcticon
-    }
+    const defaultIcon = query.inDefault ? includedOcticon : notIncludedOcticon
+    const extendedIcon = query.inExtended ? includedOcticon : notIncludedOcticon
+    const autofixIcon = query.inAutofix ? includedOcticon : notIncludedOcticon
    const row = [markdownLink, query.cwes.join(', '), defaultIcon, extendedIcon]
    if (includeAutofix) {
      row.push(autofixIcon)
@@ -266,13 +266,17 @@ async function translateTree(dir, langObj, enTree) {
 *
 * Order of languages and versions doesn't matter, but order of child page arrays DOES matter (for navigation).
 */
-export async function loadSiteTree(unversionedTree) {
-  const rawTree = Object.assign({}, unversionedTree || (await loadUnversionedTree()))
+export async function loadSiteTree(unversionedTree, languagesOnly = []) {
+  const rawTree = Object.assign({}, unversionedTree || (await loadUnversionedTree(languagesOnly)))
  const siteTree = {}

+  const langCodes = (languagesOnly.length && languagesOnly) || Object.keys(languages)
  // For every language...
  await Promise.all(
-    Object.keys(languages).map(async (langCode) => {
+    langCodes.map(async (langCode) => {
      if (!(langCode in rawTree)) {
        throw new Error(`No tree for language ${langCode}`)
      }
      const treePerVersion = {}
      // in every version...
      await Promise.all(
@@ -329,8 +333,12 @@ export async function loadPageList(unversionedTree, languagesOnly = []) {
  const rawTree = unversionedTree || (await loadUnversionedTree(languagesOnly))
  const pageList = []

+  const langCodes = (languagesOnly.length && languagesOnly) || Object.keys(languages)
  await Promise.all(
-    ((languagesOnly.length && languagesOnly) || Object.keys(languages)).map(async (langCode) => {
+    langCodes.map(async (langCode) => {
      if (!(langCode in rawTree)) {
        throw new Error(`No tree for language ${langCode}`)
      }
      await addToCollection(rawTree[langCode], pageList)
    }),
  )
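Both loaders now share the same fallback idiom, `(languagesOnly.length && languagesOnly) || Object.keys(languages)`: use the requested subset when one is given, otherwise every configured language. A minimal sketch of the idiom, with a hypothetical `languages` object standing in for the real module:

```ts
// Hypothetical stand-in for the languages module; only the keys matter here.
const languages = { en: {}, es: {}, ja: {}, pt: {} }

function resolveLangCodes(languagesOnly: string[] = []) {
  // An empty array has length 0, which is falsy, so we fall back to every language.
  return (languagesOnly.length && languagesOnly) || Object.keys(languages)
}

console.log(resolveLangCodes())             // ['en', 'es', 'ja', 'pt']
console.log(resolveLangCodes(['ja', 'es'])) // ['ja', 'es']
```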
@@ -26,8 +26,8 @@ async function warmServer(languagesOnly = []) {
  }

  const unversionedTree = await dog.loadUnversionedTree(languagesOnly)
-  const siteTree = await dog.loadSiteTree(unversionedTree)
-  const pageList = await dog.loadPages(unversionedTree)
+  const siteTree = await dog.loadSiteTree(unversionedTree, languagesOnly)
+  const pageList = await dog.loadPages(unversionedTree, languagesOnly)
  const pageMap = await dog.loadPageMap(pageList)
  const redirects = await dog.loadRedirects(pageList)
@@ -52,12 +52,12 @@ dog.warmServer = statsd.asyncTimer(warmServer, 'warm_server')

// We only want statistics if the priming needs to occur, so let's wrap the
// real method and return early [without statistics] whenever possible
-export default async function warmServerWrapper() {
+export default async function warmServerWrapper(languagesOnly = []) {
  // Handle receiving multiple calls to this method from multiple page requests
  // by holding the in-progress Promise and returning it instead of allowing
  // the server to actually load all of the files multiple times.
  if (!promisedWarmServer) {
-    promisedWarmServer = dog.warmServer()
+    promisedWarmServer = dog.warmServer(languagesOnly)
  }
  return promisedWarmServer
}
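One consequence of combining this memoization with the new `languagesOnly` parameter is worth spelling out: only the first caller's argument matters, because later calls get back the Promise that is already being held. A small sketch of that first-caller-wins behavior, with a hypothetical loader standing in for `dog.warmServer`:

```ts
// Hypothetical stand-in demonstrating the memoization in warmServerWrapper above.
type Site = { languages: string[] }

let promisedWarmServer: Promise<Site> | undefined

async function warmServer(languagesOnly: string[] = []): Promise<Site> {
  // Imagine this loads the site tree for the requested languages.
  return { languages: languagesOnly.length ? languagesOnly : ['en', 'es', 'ja'] }
}

function warmServerWrapper(languagesOnly: string[] = []): Promise<Site> {
  if (!promisedWarmServer) {
    promisedWarmServer = warmServer(languagesOnly)
  }
  return promisedWarmServer
}

async function demo() {
  const first = await warmServerWrapper(['en', 'ja']) // primes the cache for en + ja
  const second = await warmServerWrapper()            // returns the same Promise; args ignored
  console.log(first.languages, second.languages)      // ['en', 'ja'] ['en', 'ja']
}

demo()
```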
src/languages/scripts/count-translation-corruptions.ts (new file, 168 lines)
@@ -0,0 +1,168 @@
import path from 'path'
import fs from 'fs'

import { program } from 'commander'
import chalk from 'chalk'
import { TokenizationError } from 'liquidjs'
import walk from 'walk-sync'

import { getLiquidTokens } from '@/content-linter/lib/helpers/liquid-utils.js'
import languages from '@/languages/lib/languages.js'
import warmServer, { type Site } from '@/frame/lib/warm-server.js'
import { correctTranslatedContentStrings } from '@/languages/lib/correct-translation-content.js'

program
  .description('Tally the number of liquid corruptions in a translation')
  .argument('[language...]', 'language(s) to compare against')
  .action(main)
program.parse(process.argv)

type Page = {
  relativePath: string
  fullPath: string
  title: string
  shortTitle?: string
  intro: string
  markdown: string
  languageCode: string
}

type Reusables = Map<string, string>

async function main(languageCodes: string[]) {
  const langCodes = languageCodes.length
    ? languageCodes
    : Object.keys(languages).filter((x) => x !== 'en')
  const site = await warmServer(languageCodes.length ? ['en', ...langCodes] : [])

  // When checking reusables, we only want to check the files that
  // have an English equivalent.
  const reusables = getReusables()

  const totalErrors = new Map<string, number>()

  for (const languageCode of langCodes) {
    if (!(languageCode in languages)) {
      console.error(chalk.red(`Language ${languageCode} not found`))
      return process.exit(1)
    }
    if (languageCode === 'en') {
      console.error(chalk.red("Can't test in English ('en')"))
      return process.exit(1)
    }
    const { errors } = run(languageCode, site, reusables)
    for (const [error, count] of Array.from(errors.entries())) {
      totalErrors.set(error, (totalErrors.get(error) || 0) + count)
    }
  }

  const sumTotal = Array.from(totalErrors.values()).reduce((acc, count) => acc + count, 0)
  console.log('\nGRAND TOTAL ERRORS:', sumTotal)
}

function getReusables(): Reusables {
  const reusables = new Map()
  const files = walk('data/reusables', {
    includeBasePath: true,
    globs: ['**/*.md'],
    ignore: ['**/README.md'],
  })
  for (const file of files) {
    const content = fs.readFileSync(file, 'utf8')
    reusables.set(file, content)
  }
  return reusables
}

function run(languageCode: string, site: Site, englishReusables: Reusables) {
  const PADDING = 60
  const language = languages[languageCode as keyof typeof languages]

  console.log(`--- Tallying liquid corruptions in ${languageCode} (${language.name}) ---`)

  const pageList: Page[] = site.pageList
  const errors = new Map<string, number>()
  const wheres = new Map<string, number>()
  const illegalTags = new Map<string, number>()

  function countError(error: TokenizationError, where: string) {
    const errorString = (error as any).originalError.message as string
    if (errorString.includes('illegal tag syntax')) {
      const illegalTag = (error as any).token.content
      illegalTags.set(illegalTag, (illegalTags.get(illegalTag) || 0) + 1)
    }
    errors.set(errorString, (errors.get(errorString) || 0) + 1)
    wheres.set(where, (wheres.get(where) || 0) + 1)
  }

  for (const page of pageList) {
    if (page.languageCode !== languageCode) continue

    const strings: string[][] = [
      ['title', page.title],
      ['shortTitle', page.shortTitle || ''],
      ['intro', page.intro || ''],
      ['markdown', page.markdown],
    ].filter(([, string]) => Boolean(string))

    for (const [where, string] of strings) {
      try {
        getLiquidTokens(string)
      } catch (error) {
        if (error instanceof TokenizationError) {
          countError(error, where)
        } else {
          throw error
        }
      }
    }
  }

  for (const [relativePath, englishContent] of Array.from(englishReusables.entries())) {
    try {
      const filePath = path.join(language.dir, relativePath)
      const rawContent = fs.readFileSync(filePath, 'utf8')
      const correctedContent = correctTranslatedContentStrings(rawContent, englishContent, {
        code: languageCode,
        relativePath,
      })
      getLiquidTokens(correctedContent)
    } catch (error) {
      if (error instanceof TokenizationError) {
        countError(error, 'reusable')
      } else if (error instanceof Error && error.message.startsWith('ENOENT')) {
        continue
      } else {
        throw error
      }
    }
  }

  const flat = Array.from(errors.entries()).sort((a, b) => b[1] - a[1])
  const sumTotal = flat.reduce((acc, [, count]) => acc + count, 0)

  console.log('\nMost common errors')
  flat.forEach(([error, count], i) => {
    console.log(`${i + 1}.`.padEnd(3), error.padEnd(PADDING), count)
  })
  console.log(`${'TOTAL:'.padEnd(3 + 1 + PADDING)}`, sumTotal)

  if (sumTotal) {
    const whereFlat = Array.from(wheres.entries()).sort((a, b) => b[1] - a[1])
    console.log('\nMost common places')
    whereFlat.forEach(([error, count], i) => {
      console.log(`${i + 1}.`.padEnd(3), error.padEnd(PADDING), count)
    })

    const illegalTagsFlat = Array.from(illegalTags.entries()).sort((a, b) => b[1] - a[1])
    if (illegalTagsFlat.reduce((acc, [, count]) => acc + count, 0)) {
      console.log('\nMost common illegal tags', illegalTagsFlat.length > 10 ? ' (Top 10)' : '')
      illegalTagsFlat.slice(0, 10).forEach(([error, count], i) => {
        console.log(`${i + 1}.`.padEnd(3), error.padEnd(PADDING), count)
      })
    }
  }
  console.log('\n')

  return { errors }
}
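To make "corruption" concrete: the script counts translated strings that no longer tokenize as Liquid, which liquidjs signals by throwing a TokenizationError. A minimal illustration using liquidjs directly rather than the repo's getLiquidTokens wrapper; the corrupted string is hypothetical and the exact error message text may differ.

```ts
import { Liquid, TokenizationError } from 'liquidjs'

const engine = new Liquid()

// A hypothetical corrupted string: the closing %} of a tag was lost in translation.
const corrupted = 'See {% if enterprise  the enterprise notes.'

try {
  engine.parse(corrupted)
} catch (error) {
  if (error instanceof TokenizationError) {
    // This is the kind of error the script tallies per language.
    console.log('Liquid corruption:', error.message)
  } else {
    throw error
  }
}
```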