diff --git a/.github/workflows/translation-health-report.yml b/.github/workflows/translation-health-report.yml
new file mode 100644
index 0000000000..4a1d1127f8
--- /dev/null
+++ b/.github/workflows/translation-health-report.yml
@@ -0,0 +1,139 @@
+name: Translation health report
+
+# **What it does**: Provides errors and summary statistics on rendering translated content.
+# **Why we have it**: To improve our translations by having clearer visibility.
+# **Who does it impact**: Docs engineering, Microsoft translators.
+
+on:
+  workflow_dispatch:
+  schedule:
+    - cron: '20 16 * * *' # Run every day at 16:20 UTC / 8:20 PST
+
+permissions:
+  contents: read
+
+jobs:
+  create-translation-health-report:
+    name: Create translation health report
+    if: github.repository == 'github/docs-internal'
+    runs-on: ubuntu-latest
+    # This sets a maximum execution time of 300 minutes (5 hours)
+    # to prevent the workflow from running longer than necessary.
+    timeout-minutes: 300
+    strategy:
+      fail-fast: false
+      max-parallel: 1
+      matrix:
+        include:
+          - language: es
+            language_dir: translations/es-ES
+            language_repo: github/docs-internal.es-es
+
+          - language: ja
+            language_dir: translations/ja-JP
+            language_repo: github/docs-internal.ja-jp
+
+          - language: pt
+            language_dir: translations/pt-BR
+            language_repo: github/docs-internal.pt-br
+
+          - language: cn
+            language_dir: translations/zh-CN
+            language_repo: github/docs-internal.zh-cn
+
+          # We'll be ready to add the following languages in a future effort.
+
+          # - language: ru
+          #   language_dir: translations/ru-RU
+          #   language_repo: github/docs-internal.ru-ru
+
+          # - language: ko
+          #   language_dir: translations/ko-KR
+          #   language_repo: github/docs-internal.ko-kr
+
+          # - language: fr
+          #   language_dir: translations/fr-FR
+          #   language_repo: github/docs-internal.fr-fr
+
+          # - language: de
+          #   language_dir: translations/de-DE
+          #   language_repo: github/docs-internal.de-de
+
+    steps:
+      - name: Checkout the docs-internal repo
+        uses: actions/checkout@dcd71f646680f2efd8db4afa5ad64fdcba30e748
+
+      - name: Remove all language translations
+        run: |
+          git rm -rf --quiet ${{ matrix.language_dir }}/content
+          git rm -rf --quiet ${{ matrix.language_dir }}/data
+
+      - name: Checkout the language-specific repo
+        uses: actions/checkout@dcd71f646680f2efd8db4afa5ad64fdcba30e748
+        with:
+          repository: ${{ matrix.language_repo }}
+          token: ${{ secrets.DOCUBOT_READORG_REPO_WORKFLOW_SCOPES }}
+          path: ${{ matrix.language_dir }}
+
+      - name: Get language SHA
+        run: |
+          gitref=$(cd ${{ matrix.language_dir }} && git rev-parse --short HEAD)
+          echo "gitref=$gitref" >> "$GITHUB_ENV"
+
+      - name: 'Setup node'
+        uses: actions/setup-node@17f8bd926464a1afa4c6a11669539e9c1ba77048
+        with:
+          node-version: '16.17.0'
+
+      - name: npm ci
+        run: npm ci
+
+      - name: Create translation health report
+        run: |
+          # Without pipefail the pipeline's status is jq's, which would hide a
+          # failure of the node script and let an empty report be uploaded.
+          set -o pipefail
+          translation_health_report=$( \
+            node script/i18n/create-translation-health-report.js \
+              --language ${{ matrix.language }} \
+              --gitref ${{ env.gitref }} \
+              | jq -Rsa .
+          )
+          echo "translation_health_report=$translation_health_report" >> "$GITHUB_ENV"
+
+      - name: Log in to Azure
+        uses: azure/login@1f63701bf3e6892515f1b7ce2d2bf1708b46beaf
+        with:
+          creds: ${{ secrets.PROD_AZURE_CREDENTIALS }}
+
+      # The report contains spaces, so it is quoted to stay a single argument.
+      # `--overwrite true` lets the upload replace a blob that already exists.
+      - name: Upload to Azure blob storage
+        uses: azure/CLI@61bb69d64d613b52663984bf12d6bac8fd7b3cc8
+        with:
+          inlineScript: |
+            az storage blob upload \
+              --name "${{ matrix.language }}-latest.json" \
+              --data "$translation_health_report" \
+              --container-name translation-health-reports \
+              --overwrite true
+            az storage blob upload \
+              --name "${{ matrix.language }}-$(date +%Y-%m-%d).json" \
+              --data "$translation_health_report" \
+              --container-name translation-health-reports \
+              --overwrite true
+
+      - name: Log out from Azure
+        if: always()
+        run: |
+          az logout
+
+      # Emit a notification for the first responder to triage if the workflow failed.
+      - name: Send Slack notification if workflow failed
+        uses: someimportantcompany/github-actions-slack-message@f8d28715e7b8a4717047d23f48c39827cacad340
+        if: failure()
+        with:
+          channel: ${{ secrets.DOCS_ALERTS_SLACK_CHANNEL_ID }}
+          bot-token: ${{ secrets.SLACK_DOCS_BOT_TOKEN }}
+          color: failure
+          text: 'The health report for ${{ matrix.language }} failed.'
diff --git a/script/i18n/create-translation-health-report.js b/script/i18n/create-translation-health-report.js
new file mode 100755
index 0000000000..e9ef2c88bf
--- /dev/null
+++ b/script/i18n/create-translation-health-report.js
@@ -0,0 +1,199 @@
+#!/usr/bin/env node
+
+// [start-readme]
+//
+// Create a list of errors and summary statistics for errors in a particular language.
+//
+// [end-readme]
+
+/* Nota bene:
+  If you are getting more errors all of a sudden, try running this:
+  $ script/i18n/create-translation-health-report.js -l en -r 000
+  If there are any errors, const context = { ... } probably needs more data.
+*/
+
+import { program } from 'commander'
+import fs from 'fs/promises'
+import { pick } from 'lodash-es'
+
+import { loadPages, loadPageMap } from '../../lib/page-data.js'
+import loadSiteData from '../../lib/site-data.js'
+import loadRedirects from '../../lib/redirects/precompile.js'
+import { allVersions, allVersionKeys } from '../../lib/all-versions.js'
+import { languageKeys } from '../../lib/languages.js'
+import { getProductStringFromPath } from '../../lib/path-utils.js'
+
+// Error properties kept in the report. Filtering down the properties makes it
+// easier for translators to get the clearest information on the error.
+// Other fields: Object.getOwnPropertyNames(err)
+const REPORTED_ERROR_FIELDS = ['name', 'message', 'token.content']
+
+program
+  .description('Create a translation health report for one language.')
+  // The <value> placeholders tell commander that each option takes an argument.
+  // Without a placeholder the option is parsed as a boolean flag.
+  .requiredOption('-l, --language <language>', 'The language to health check')
+  .requiredOption('-r, --gitref <gitref>', 'Language repo latest git commit short SHA')
+  .parse(process.argv)
+
+/**
+ * Gather popularity data the search uses to prioritize errors.
+ *
+ * Reads lib/search/popular-pages.json and parses each line as a JSON row.
+ *
+ * @returns {Promise<Object>} Map of each row's `path_article` to its `path_count`.
+ */
+async function fetchPopularityData() {
+  const output = {}
+  const popularPagesRaw = await fs.readFile('lib/search/popular-pages.json', 'utf8')
+  for (const line of popularPagesRaw.split('\n')) {
+    // Blank lines (such as the one after a trailing newline) carry no data
+    if (!line.trim()) continue
+    try {
+      const row = JSON.parse(line)
+      output[row.path_article] = row.path_count
+    } catch {
+      // Deliberately best effort: popularity only affects the sort order of
+      // the report, so a malformed row is skipped instead of failing the run.
+    }
+  }
+  return output
+}
+
+/**
+ * Render one page in every version it applies to and collect the errors.
+ *
+ * @param {Object} page - Page exposing `applicableVersions` and `render()`.
+ * @param {Object} options
+ * @param {string} options.language - Language code for the path and context.
+ * @param {Object} options.data - Site data, spread into the render context.
+ * @param {Object} options.redirects - Passed to the context as `redirects`.
+ * @param {string} options.plainPath - Page path used to build the URL path.
+ * @param {Object} options.pageMap - Passed to the context as `pages`.
+ * @returns {Promise<Object|undefined>} Picked error fields keyed by version,
+ *   or undefined when no version produced an error.
+ */
+async function collectPageErrors(page, { language, data, redirects, plainPath, pageMap }) {
+  // Go through each version...
+  const promises = allVersionKeys
+    .filter((version) => page.applicableVersions.includes(version))
+    .map(async (version) => {
+      // Collect errors, if any
+      const pageVersionErrors = []
+      try {
+        const path = `/${language}/${version}/${plainPath}`
+        // Reference middleware/context.js for data shape
+        const context = {
+          ...data, // needed for all pages
+          currentVersion: version, // needed for all pages
+          currentLanguage: language, // needed for all pages
+          currentPath: path, // needed for all pages
+          currentVersionObj: allVersions[version], // needed for ifversion tag
+          currentProduct: getProductStringFromPath(path), // needed for learning-track on guides pages
+          pages: pageMap, // needed for learning-track on guides pages
+          redirects, // needed for learning-track on guides pages
+        }
+        await page.render(context, pageVersionErrors)
+      } catch (err) {
+        pageVersionErrors.push(err)
+      }
+      if (pageVersionErrors.length) {
+        return [version, pageVersionErrors.map((err) => pick(err, REPORTED_ERROR_FIELDS))]
+      }
+    })
+  const arr = (await Promise.all(promises)).filter(Boolean)
+  if (arr.length) {
+    return Object.fromEntries(arr)
+  }
+}
+
+/**
+ * Count how often each error message occurs across all pages and versions.
+ *
+ * @param {Array<Object>} errors - Entries whose `versions` maps version -> errors.
+ * @returns {Object} Map of error message to number of occurrences.
+ */
+function groupErrors(errors) {
+  // Tally in a Map so that a message matching an Object.prototype member
+  // (for example "constructor") is still counted correctly.
+  const counts = new Map()
+  for (const page of errors) {
+    for (const { message } of Object.values(page.versions).flat()) {
+      counts.set(message, (counts.get(message) ?? 0) + 1)
+    }
+  }
+  return Object.fromEntries(counts)
+}
+
+/**
+ * Build the translation health report for the language given on the command line.
+ *
+ * @returns {Promise<Object>} The report: metadata, per-page errors sorted by
+ *   popularity, errors grouped by message, and site data errors.
+ * @throws {Error} If the language is not one of `languageKeys`.
+ */
+async function createReport() {
+  // Check that the language is valid
+  const { language, gitref } = program.opts()
+  if (!languageKeys.includes(language)) {
+    throw new Error(`Language ${language} is not in ${languageKeys.join()}.`)
+  }
+
+  // Load popularity data to sort errors
+  const popularity = await fetchPopularityData()
+
+  // Load all pages
+  const allPages = await loadPages()
+  const dataErrors = []
+  const data = loadSiteData(dataErrors)[language]
+  const pages = allPages
+    .filter((page) => page.languageCode === language)
+    // Early access pages log to the console, which would show in the report
+    .filter((page) => !page.relativePath.includes('early-access'))
+  const pageMap = await loadPageMap(pages)
+  const redirects = await loadRedirects(pages)
+
+  // Try to render each page
+  const pageErrors = (
+    await Promise.all(
+      pages.map(async (page) => {
+        const plainPath = page.relativePath.replace('/index.md', '').replace('.md', '')
+        const errorsByVersion = await collectPageErrors(page, {
+          language,
+          data,
+          redirects,
+          plainPath,
+          pageMap,
+        })
+        if (errorsByVersion) {
+          return {
+            path: plainPath,
+            popularity: popularity[plainPath] || 0,
+            versions: errorsByVersion,
+          }
+        }
+      })
+    )
+  )
+    .filter(Boolean)
+    // Sort by popularity desc so the translators know what to focus on first
+    .sort((a, b) => b.popularity - a.popularity)
+
+  // Begin an output report
+  const report = {
+    language,
+    gitref,
+    datetime: new Date().toJSON(),
+    totalPages: pages.length,
+    // totalErrorPages should be around en: 0, es: 1043, ja: 1004, pt: 995, cn: 1063
+    totalErrorPages: pageErrors.length,
+    pageErrors,
+    // To group errors by message instead
+    groupedPageErrors: groupErrors(pageErrors),
+    dataErrors: dataErrors.map((err) => pick(err, REPORTED_ERROR_FIELDS)),
+  }
+
+  return report
+}
+
+console.log(JSON.stringify(await createReport(), null, 2))