diff --git a/.github/actions-scripts/msft-create-translation-batch-pr.js b/.github/actions-scripts/msft-create-translation-batch-pr.js index d267146359..1ea57c342d 100755 --- a/.github/actions-scripts/msft-create-translation-batch-pr.js +++ b/.github/actions-scripts/msft-create-translation-batch-pr.js @@ -1,5 +1,6 @@ #!/usr/bin/env node +import fs from 'fs' import github from '@actions/github' const OPTIONS = Object.fromEntries( @@ -32,6 +33,7 @@ const { BASE, HEAD, LANGUAGE, + BODY_FILE, GITHUB_TOKEN, } = OPTIONS const [OWNER, REPO] = GITHUB_REPOSITORY.split('/') @@ -119,7 +121,7 @@ async function main() { title: TITLE, base: BASE, head: HEAD, - body: `New translation batch for ${LANGUAGE}. You can see the log in [\`translations/log/${LANGUAGE}-resets.csv\`](https://github.com/${OWNER}/${REPO}/tree/${HEAD}/translations/log/msft-${LANGUAGE}-resets.csv).`, + body: fs.readFileSync(BODY_FILE, 'utf8'), labels: ['translation-batch', `translation-batch-${LANGUAGE}`], owner: OWNER, repo: REPO, diff --git a/.github/workflows/msft-create-translation-batch-pr.yml b/.github/workflows/msft-create-translation-batch-pr.yml index e897714c7b..c5cf5b4c7f 100644 --- a/.github/workflows/msft-create-translation-batch-pr.yml +++ b/.github/workflows/msft-create-translation-batch-pr.yml @@ -1,4 +1,4 @@ -name: Create translation Batch Pull Request +name: Create translation Batch Pull Request (Microsoft) # **What it does**: # - Creates one pull request per language after running a series of automated checks, @@ -31,48 +31,39 @@ jobs: matrix: include: - language: es - crowdin_language: es-ES language_dir: translations/es-ES language_repo: github/docs-internal.es-es - language: ja - crowdin_language: ja-JP language_dir: translations/ja-JP language_repo: github/docs-internal.ja-jp - language: pt - crowdin_language: pt-BR language_dir: translations/pt-BR language_repo: github/docs-internal.pt-br - language: cn - crowdin_language: zh-CN language_dir: translations/zh-CN language_repo: 
github/docs-internal.zh-cn # We'll be ready to add the following languages in a future effort. # - language: ru - # crowdin_language: ru-RU # language_dir: translations/ru-RU # language_repo: github/docs-internal.ru-ru # - language: ko - # crowdin_language: ko-KR # language_dir: translations/ko-KR # language_repo: github/docs-internal.ko-kr # - language: fr - # crowdin_language: fr-FR # language_dir: translations/fr-FR # language_repo: github/docs-internal.fr-fr # - language: de - # crowdin_language: de-DE # language_dir: translations/de-DE # language_repo: github/docs-internal.de-de - # TODO: replace the branch name steps: - name: Set branch name id: set-branch @@ -109,11 +100,10 @@ jobs: - name: Remove .git from the language-specific repo run: rm -rf ${{ matrix.language_dir }}/.git - # TODO: Rename this step - - name: Commit crowdin sync + - name: Commit translated files run: | git add ${{ matrix.language_dir }} - git commit -m "Add crowdin translations" || echo "Nothing to commit" + git commit -m "Add translations" || echo "Nothing to commit" - name: 'Setup node' uses: actions/setup-node@17f8bd926464a1afa4c6a11669539e9c1ba77048 @@ -122,19 +112,16 @@ jobs: - run: npm ci - # step 6 in docs-engineering/crowdin.md - name: Homogenize frontmatter run: | node script/i18n/homogenize-frontmatter.js git add ${{ matrix.language_dir }} && git commit -m "Run script/i18n/homogenize-frontmatter.js" || echo "Nothing to commit" - # step 7 in docs-engineering/crowdin.md - name: Fix translation errors run: | node script/i18n/fix-translation-errors.js git add ${{ matrix.language_dir }} && git commit -m "Run script/i18n/fix-translation-errors.js" || echo "Nothing to commit" - # step 8b in docs-engineering/crowdin.md - name: Check rendering run: | node script/i18n/lint-translation-files.js --check rendering | tee -a /tmp/batch.log | cat @@ -142,26 +129,18 @@ jobs: - name: Reset files with broken liquid tags run: | - node script/i18n/reset-files-with-broken-liquid-tags.js 
--language=${{ matrix.language }} | tee -a /tmp/batch.log | cat - git add ${{ matrix.language_dir }} && git commit -m "run script/i18n/reset-files-with-broken-liquid-tags.js --language=${{ matrix.language }}" || echo "Nothing to commit" - - # step 5 in docs-engineering/crowdin.md using script from docs-internal#22709 - - name: Reset known broken files - run: | - node script/i18n/reset-known-broken-translation-files.js | tee -a /tmp/batch.log | cat - git add ${{ matrix.language_dir }} && git commit -m "run script/i18n/reset-known-broken-translation-files.js" || echo "Nothing to commit" - env: - GITHUB_TOKEN: ${{ secrets.DOCUBOT_REPO_PAT }} + node script/i18n/msft-reset-files-with-broken-liquid-tags.js --language=${{ matrix.language }} | tee -a /tmp/batch.log | cat + git add ${{ matrix.language_dir }} && git commit -m "run script/i18n/msft-reset-files-with-broken-liquid-tags.js --language=${{ matrix.language }}" || echo "Nothing to commit" - name: Check in CSV report run: | mkdir -p translations/log csvFile=translations/log/msft-${{ matrix.language }}-resets.csv - script/i18n/report-reset-files.js --report-type=csv --language=${{ matrix.language }} --log-file=/tmp/batch.log > $csvFile + script/i18n/msft-report-reset-files.js --report-type=csv --language=${{ matrix.language }} --log-file=/tmp/batch.log > $csvFile git add -f $csvFile && git commit -m "Check in ${{ matrix.language }} CSV report" || echo "Nothing to commit" - name: Write the reported files that were reset to /tmp/pr-body.txt - run: script/i18n/report-reset-files.js --report-type=pull-request-body --language=${{ matrix.language }} --log-file=/tmp/batch.log > /tmp/pr-body.txt + run: script/i18n/msft-report-reset-files.js --report-type=pull-request-body --language=${{ matrix.language }} --log-file=/tmp/batch.log --csv-path=${{ steps.set-branch.outputs.BRANCH_NAME }}/translations/log/msft-${{ matrix.language }}-resets.csv > /tmp/pr-body.txt - name: Push filtered translations run: git push origin ${{ 
// Default link to the GitHub Actions run that produced the batch, assembled
// from the GITHUB_* environment variables. Used when --workflow-url is omitted.
const defaultWorkflowUrl = [
  process.env.GITHUB_SERVER_URL,
  process.env.GITHUB_REPOSITORY,
  'actions/runs',
  process.env.GITHUB_RUN_ID,
].join('/')

// Maps every accepted --report-type value to the function that renders it.
const reportTypes = {
  'pull-request-body': pullRequestBodyReport,
  csv: csvReport,
}

// Each value-taking option needs a `<placeholder>` in its flags string;
// without one commander treats the option as a boolean switch.
program
  .description('Reads a translation batch log and generates a report')
  .requiredOption('--language <language>', 'The language to compare')
  .requiredOption('--log-file <log-file>', 'The batch log file')
  .requiredOption(
    '--report-type <report-type>',
    `The report type, one of: ${Object.keys(reportTypes).join(', ')}`
  )
  .option('--workflow-url <workflow-url>', 'The workflow url', defaultWorkflowUrl)
  .option('--csv-path <csv-path>', 'The path to the CSV file')
  .parse(process.argv)

const options = program.opts()
const language = languages[options.language]
const { logFile, workflowUrl, reportType, csvPath } = options

// Fail with a readable message instead of a TypeError on `language.dir` below.
if (!language) {
  throw new Error(`Language ${options.language} not found`)
}

if (!Object.keys(reportTypes).includes(reportType)) {
  throw new Error(`Invalid report type: ${reportType}`)
}

const logFileContents = fs.readFileSync(logFile, 'utf8')

// Log lines that record a reverted or removed file for the selected language.
const revertLines = logFileContents
  .split('\n')
  .filter((line) => /^-> (?:reverted to English|removed)/.test(line))
  .filter((line) => line.includes(language.dir))

// Full form: "-> <action>: <file> Reason: <reason>".
const entryWithReason = /^-> (?:reverted to English|removed): (.*) Reason: (.*)$/
// Fallback for lines logged without a reason, so they are still reported
// rather than crashing the script on a null match.
const entryWithoutReason = /^-> (?:reverted to English|removed): (.*)$/

// One { file, reason } record per reverted/removed file, sorted by log line.
// The array is copied before sorting so `revertLines` keeps its log order.
const reportEntries = [...revertLines].sort().flatMap((line) => {
  const full = line.match(entryWithReason)
  if (full) {
    return [{ file: full[1], reason: full[2] }]
  }
  const bare = line.match(entryWithoutReason)
  return bare ? [{ file: bare[1].trim(), reason: '' }] : []
})
/**
 * Renders the reset/removed files as CSV text: a `file,reason` header line
 * followed by one row per entry, joined with `\n` (no trailing newline).
 *
 * Fields containing a comma, a double quote or a line break are wrapped in
 * double quotes with embedded quotes doubled, so a path or reason containing
 * those characters cannot shift columns. Plain fields are emitted unchanged.
 *
 * @param {{ file: string, reason: string }[]} [entries=reportEntries]
 *   Rows to render; defaults to the module-level `reportEntries`.
 * @returns {string} The CSV document.
 */
function csvReport(entries = reportEntries) {
  const toCsvField = (value) => {
    // Array#join renders null/undefined as an empty string; keep that behavior.
    const text = value === undefined || value === null ? '' : String(value)
    return /[",\r\n]/.test(text) ? `"${text.replace(/"/g, '""')}"` : text
  }

  const rows = entries.map(({ file, reason }) => [file, reason].map(toCsvField).join(','))

  return ['file,reason', ...rows].join('\n')
}
/**
 * Resets every translated file whose Liquid tags differ from the English
 * source, then removes translated files that have no English counterpart.
 *
 * Reads the target language from the parsed `--language` option.
 *
 * @returns {Promise<void>}
 * @throws {Error} When the language is unknown, or when a comparison fails
 *   for a reason that cannot be attributed to the translated file.
 */
async function main() {
  const options = program.opts()
  const language = languages[options.language]

  if (!language) {
    throw new Error(`Language ${options.language} not found`)
  }

  // languageFiles() returns a pair:
  //   [0]: files that exist both in main and in the language-specific branch
  //   [1]: files that exist only in the language-specific branch, not in main
  const [sharedContentFiles, orphanedContentFiles] = languageFiles(language, 'content')
  const [sharedDataFiles, orphanedDataFiles] = languageFiles(language, 'data')
  const files = [...sharedContentFiles, ...sharedDataFiles]
  const nonexistentFiles = [...orphanedContentFiles, ...orphanedDataFiles]
  const brokenFiles = []

  for (const file of files) {
    // Same path compareLiquidTags() derives for the translated file.
    const translation = `${language.dir}/${file}`

    try {
      // Throws if the Liquid syntax of either file is invalid.
      const comparison = compareLiquidTags(file, language)

      if (comparison.diff.count !== 0) {
        brokenFiles.push(comparison.translation)
      }
    } catch (e) {
      // Only a failure in the translated file can be fixed by resetting it.
      // An error tagged with the English source path, or with no path at all,
      // must not be queued: the reset script only accepts `translations/...`
      // paths, and an undefined entry would break execFileSync.
      if (e.filePath !== translation) {
        throw e
      }
      brokenFiles.push(translation)
    }
  }

  // Both helpers run synchronously (execFileSync), so there is nothing to await.
  resetFiles(brokenFiles)
  deleteFiles(nonexistentFiles)
}
/**
 * Lists the files under `folder` for a language, split by whether an English
 * counterpart exists.
 *
 * @param {{ dir: string }} language Language entry; `dir` is its translation root.
 * @param {string} [folder='content'] Top-level folder to walk.
 * @returns {[string[], string[]]} A pair:
 *   [0] files present in both trees, as `<folder>/<file>`;
 *   [1] files present only in the translation, as `<language.dir>/<folder>/<file>`.
 */
export function languageFiles(language, folder = 'content') {
  const englishFiles = walk(folder, { directories: false })
  const translatedFiles = walk(`${language.dir}/${folder}`, { directories: false })

  const shared = _.intersection(englishFiles, translatedFiles)
  // Translated files that are not included in englishFiles.
  const translationOnly = _.difference(translatedFiles, englishFiles)

  return [
    shared.map((file) => `${folder}/${file}`),
    translationOnly.map((file) => `${language.dir}/${folder}/${file}`),
  ]
}

/**
 * Diffs the non-HTML tokens of an English file against its translation.
 *
 * @param {string} file Path of the English source file.
 * @param {{ dir: string }} language Language entry; `dir` is its translation root.
 * @returns {{ file: string, translation: string, diff: object }} The source
 *   path, the translated path (`<language.dir>/<file>`) and the `Tokens#diff` result.
 * @throws {Error} With a `filePath` property when either file cannot be read or parsed.
 */
export function compareLiquidTags(file, language) {
  const translation = `${language.dir}/${file}`
  const sourceTokens = getTokensFromFile(file).rejectType('html')
  const translatedTokens = getTokensFromFile(translation).rejectType('html')

  return {
    file,
    translation,
    diff: sourceTokens.diff(translatedTokens),
  }
}

/**
 * Tokenizes a template string into its top-level tokens.
 *
 * @param {string} contents Template source.
 * @returns {Tokens}
 */
function getTokens(contents) {
  const tokenizer = new Tokenizer(contents)
  // Tokens.from() builds the subclass instance without spreading every token
  // into the Array constructor's argument list.
  return Tokens.from(tokenizer.readTopLevelTokens())
}

/**
 * Reads a file and tokenizes it.
 *
 * @param {string} filePath File to read (UTF-8).
 * @returns {Tokens}
 * @throws {Error} "Error parsing <filePath>: <message>" with `filePath` set to
 *   the offending path, for read failures as well as tokenizer failures, so
 *   callers can always tell which file was at fault.
 */
export function getTokensFromFile(filePath) {
  try {
    // The read is inside the try so an unreadable file is reported with
    // `filePath` too, not only a tokenizer failure.
    return getTokens(readFileSync(filePath, 'utf8'))
  } catch (e) {
    const error = new Error(`Error parsing ${filePath}: ${e.message}`, { cause: e })
    error.filePath = filePath
    throw error
  }
}

/**
 * An Array of tokenizer tokens with helpers to filter and compare them.
 */
export class Tokens extends Array {
  /**
   * Drops tokens whose constructor name equals `<tagType>Token`, compared
   * case-insensitively (e.g. 'html' drops `HTMLToken` instances).
   *
   * @param {string} tagType Token kind to exclude.
   * @returns {Tokens}
   */
  rejectType(tagType) {
    const rejectedName = `${tagType}Token`.toUpperCase()
    return this.filter((token) => token.constructor.name.toUpperCase() !== rejectedName)
  }

  /**
   * @returns {Tokens} The `getText()` value of every token, in order.
   */
  onlyText() {
    return this.map((token) => token.getText())
  }

  /**
   * Line-diffs this list's token texts against another list's.
   *
   * @param {Tokens} otherTokens Tokens to compare with.
   * @returns {{ count: number, missing: string[], exceeding: string[], output: string }}
   *   `missing` comes from getMissingLines(), `exceeding` from
   *   getExceedingLines(), `count` is their combined length and `output` the
   *   raw diff text (empty when getGitDiff() returns nothing).
   */
  diff(otherTokens) {
    const ownText = this.onlyText().join('\n')
    const otherText = otherTokens.onlyText().join('\n')
    const output = getGitDiff(ownText, otherText)

    if (!output) {
      return { count: 0, missing: [], exceeding: [], output: '' }
    }

    const missing = getMissingLines(output)
    const exceeding = getExceedingLines(output)

    return { count: exceeding.length + missing.length, missing, exceeding, output }
  }
}
--git a/script/i18n/reset-translated-file.js b/script/i18n/reset-translated-file.js index ede553222a..a416ebd9d1 100755 --- a/script/i18n/reset-translated-file.js +++ b/script/i18n/reset-translated-file.js @@ -30,6 +30,7 @@ program '-m, --prefer-main', 'Reset file to the translated file, try using the file from `main` branch first, if not found (usually due to renaming), fall back to English source.' ) + .option('-rm, --remove', 'Remove the translated files altogether') .option('-d, --dry-run', 'Just pretend to reset files') .option('-r, --reason ', 'A reason why the file is getting reset') .parse(process.argv) @@ -44,6 +45,14 @@ const resetToEnglishSource = (translationFilePath) => { 'path argument must be in the format `translations//path/to/file`' ) + if (program.opts().remove) { + if (!dryRun) { + const fullPath = path.join(process.cwd(), translationFilePath) + fs.unlinkSync(fullPath) + } + console.log('-> removed: %s %s', translationFilePath, reasonMessage) + return + } if (!fs.existsSync(translationFilePath)) { return } diff --git a/tests/rendering/server.js b/tests/rendering/server.js index 04206efb86..d2a6d9bf6e 100644 --- a/tests/rendering/server.js +++ b/tests/rendering/server.js @@ -754,7 +754,9 @@ describe('URLs by language', () => { const $ = await getDOM('/ja/site-policy/github-terms/github-terms-of-service') expect($.res.statusCode).toBe(200) // This check is true on either the translated version of the page, or when the title is pending translation and is in English. - expect($('h1')[0].children[0].data).toMatch(/(GitHub利用規約|GitHub Terms of Service)/) + expect($('h1')[0].children[0].data).toMatch( + /(GitHub利用規約|GitHub Terms of Service|GitHub のサービス条件)/ + ) expect($('h2 a[href="#summary"]').length).toBe(1) }) })