1
0
mirror of synced 2025-12-23 21:07:12 -05:00

only load site tree from English and "translate" the others (#32761)

This commit is contained in:
Peter Bengtsson
2022-11-21 13:37:48 +01:00
committed by GitHub
parent f189f9d900
commit 00b7ba1a6b
4 changed files with 163 additions and 181 deletions

View File

@@ -1,34 +1,10 @@
import path from 'path' import path from 'path'
import fs from 'fs/promises' import fs from 'fs/promises'
import languages from './languages.js' import Page from './page.js'
import Page, { FrontmatterErrorsError } from './page.js'
// If you run `export DEBUG_TRANSLATION_FALLBACKS=true` in your terminal, export default async function createTree(originalPath, rootPath) {
// every time a translation file fails to initialize we fall back to English const basePath = rootPath || originalPath
// and write a warning to stdout.
const DEBUG_TRANSLATION_FALLBACKS = Boolean(
JSON.parse(process.env.DEBUG_TRANSLATION_FALLBACKS || 'false')
)
// If you don't want to fall back to English automatically on corrupt
// translation files, set `export THROW_TRANSLATION_ERRORS=true`
const THROW_TRANSLATION_ERRORS = Boolean(
JSON.parse(process.env.THROW_TRANSLATION_ERRORS || 'false')
)
// Memoized `<directory>/content` paths, keyed by the directory they were built from.
const _basePaths = new Map()

/**
 * Return the `content` subdirectory of `directory`, joined with POSIX
 * separators. The result is computed once per distinct `directory` and
 * served from the module-level cache afterwards.
 *
 * @param {string} directory - Root directory of a language.
 * @returns {string} `directory` joined with `content`.
 */
export function getBasePath(directory) {
  const cached = _basePaths.get(directory)
  // Stored values are always strings, so `undefined` can only mean "not cached yet".
  if (cached !== undefined) return cached

  const contentPath = path.posix.join(directory, 'content')
  _basePaths.set(directory, contentPath)
  return contentPath
}
export default async function createTree(originalPath, langObj) {
// This basePath definition is needed both here and in lib/page-data.js because this
// function runs recursively, and the value for originalPath changes on recursive runs.
const basePath = getBasePath(langObj.dir)
// On recursive runs, this is processing page.children items in `/<link>` format. // On recursive runs, this is processing page.children items in `/<link>` format.
// If the path exists as is, assume this is a directory with a child index.md. // If the path exists as is, assume this is a directory with a child index.md.
@@ -44,54 +20,22 @@ export default async function createTree(originalPath, langObj) {
const relativePath = filepath.replace(`${basePath}/`, '') const relativePath = filepath.replace(`${basePath}/`, '')
// Initialize the Page! This is where the file reads happen. // Initialize the Page! This is where the file reads happen.
let page const page = await Page.init({
try { basePath,
page = await Page.init({ relativePath,
basePath, languageCode: 'en',
relativePath, })
languageCode: langObj.code,
})
} catch (err) {
if (
!THROW_TRANSLATION_ERRORS &&
err instanceof FrontmatterErrorsError &&
langObj.code !== 'en'
) {
// Something corrupt in the `.md` file caused it to throw an
// error from reading it in. Let's "gracefully" recover by
// swapping this one out for the English content and pretend it
// exists in this other language.
const englishBasePath = getBasePath(languages.en.dir)
page = await Page.init({
basePath: englishBasePath,
relativePath,
languageCode: langObj.code,
})
if (DEBUG_TRANSLATION_FALLBACKS) {
console.warn(
`Unable to initialized ${path.join(basePath, relativePath)} due to frontmatter errors. ` +
`Will proceed with using ${path.join(englishBasePath, relativePath)} instead.`
)
}
} else {
throw err
}
}
if (!page) { if (!page) {
// Do not throw an error if Early Access is not available. // Do not throw an error if Early Access is not available.
if (relativePath.startsWith('early-access')) { if (relativePath.startsWith('early-access')) {
if (langObj.code === 'en') { console.warn(
console.warn( `${relativePath} could not be turned into a Page, but is ignore because it's early-access`
`${relativePath} could not be turned into a Page, but is ignore because it's early-access` )
)
}
return return
} }
// Do not throw an error if translated page is not available.
if (langObj.code !== 'en') return
throw Error(`Cannot initialize page for ${filepath} in ${langObj.code}`) throw Error(`Cannot initialize page for ${filepath}`)
} }
// Create the root tree object on the first run, and create children recursively. // Create the root tree object on the first run, and create children recursively.
@@ -105,7 +49,7 @@ export default async function createTree(originalPath, langObj) {
item.childPages = ( item.childPages = (
await Promise.all( await Promise.all(
item.page.children.map( item.page.children.map(
async (child) => await createTree(path.posix.join(originalPath, child), langObj) async (child) => await createTree(path.posix.join(originalPath, child), basePath)
) )
) )
).filter(Boolean) ).filter(Boolean)

View File

@@ -2,23 +2,33 @@ import path from 'path'
import languages from './languages.js' import languages from './languages.js'
import { allVersions } from './all-versions.js' import { allVersions } from './all-versions.js'
import createTree, { getBasePath } from './create-tree.js' import createTree from './create-tree.js'
import nonEnterpriseDefaultVersion from './non-enterprise-default-version.js' import nonEnterpriseDefaultVersion from './non-enterprise-default-version.js'
import readFileContents from './read-file-contents.js'
import Page from './page.js' import Page from './page.js'
import frontmatterSchema from './frontmatter.js'
// Reads an environment variable as a JSON literal and coerces the result to a
// boolean. An unset or empty variable counts as `false`.
const parseEnvFlag = (name) => Boolean(JSON.parse(process.env[name] || 'false'))

// Diagnostics switch: run `export DEBUG_TRANSLATION_FALLBACKS=true` in your
// terminal to get a `console.warn` every time a translation has to fall back
// to English.
const DEBUG_TRANSLATION_FALLBACKS = parseEnvFlag('DEBUG_TRANSLATION_FALLBACKS')

// Strictness switch: run `export THROW_TRANSLATION_ERRORS=true` if you'd rather
// have an error thrown than silently fall back to English on corrupt
// translation files.
const THROW_TRANSLATION_ERRORS = parseEnvFlag('THROW_TRANSLATION_ERRORS')
const versions = Object.keys(allVersions) const versions = Object.keys(allVersions)
// These are the exceptions to the rule. class FrontmatterParsingError extends Error {}
// If a URI starts with one of these prefixes, it basically means we don't
// bother to "backfill" a translation in its spot. // Note! As of Nov 2022, the schema says that 'product' is translatable
// For example, `/en/github/site-policy-deprecated/foo` works // which is surprising since only a single page has prose in it.
// only in English and we don't bother making `/ja/github/site-policy-deprecated/foo` const translatableFrontmatterKeys = Object.entries(frontmatterSchema.schema.properties)
// work too. .filter(([, value]) => value.translatable)
const TRANSLATION_DRIFT_EXCEPTIONS = [ .map(([key]) => key)
'github/site-policy-deprecated',
// Early access stuff never has translations.
'early-access',
]
/** /**
* We only need to initialize pages _once per language_ since pages don't change per version. So we do that * We only need to initialize pages _once per language_ since pages don't change per version. So we do that
@@ -30,6 +40,7 @@ export async function loadUnversionedTree(languagesOnly = null) {
throw new Error("'languagesOnly' has to be an array") throw new Error("'languagesOnly' has to be an array")
} }
const unversionedTree = {} const unversionedTree = {}
unversionedTree.en = await createTree(path.join(languages.en.dir, 'content'))
const languagesValues = Object.entries(languages) const languagesValues = Object.entries(languages)
.filter(([language]) => { .filter(([language]) => {
@@ -38,16 +49,135 @@ export async function loadUnversionedTree(languagesOnly = null) {
.map(([, data]) => { .map(([, data]) => {
return data return data
}) })
await Promise.all( await Promise.all(
languagesValues.map(async (langObj) => { languagesValues
const localizedContentPath = path.posix.join(langObj.dir, 'content') .filter((langObj) => langObj.code !== 'en')
unversionedTree[langObj.code] = await createTree(localizedContentPath, langObj) .map(async (langObj) => {
}) const localizedContentPath = path.join(langObj.dir, 'content')
unversionedTree[langObj.code] = await translateTree(
localizedContentPath,
langObj,
unversionedTree.en
)
})
) )
return unversionedTree return unversionedTree
} }
/**
 * Build the translated counterpart of one node of the English tree and
 * recurse into that node's children.
 *
 * The English page is the template: every property is shallow-copied from
 * it, and only the translatable frontmatter keys plus the markdown body are
 * taken from the translation's own file on disk.
 *
 * - If the translated file is missing, or its frontmatter can't be parsed at
 *   all, the English data and markdown are used instead.
 * - If only individual translatable keys have frontmatter errors, just those
 *   keys fall back to their English values.
 *
 * @param {string} dir - Content root of the translation; joined with the
 *   English page's `relativePath` to locate the translated file.
 * @param {object} langObj - Language entry; only `langObj.code` is read here.
 * @param {object} enTree - English tree node (`{ page, childPages }`).
 * @returns {Promise<object>} `{ page }`, plus `childPages` when the page has
 *   `children`.
 * @throws {Error} If `THROW_TRANSLATION_ERRORS` is set and any fallback to
 *   English would have been necessary. Errors from reading the file other
 *   than `ENOENT` are always rethrown.
 */
async function translateTree(dir, langObj, enTree) {
  const item = {}
  const enPage = enTree.page
  // Shallow copy of the English page's own enumerable properties. It is the
  // template for the translated page and the source of every fallback value.
  const { ...enData } = enPage

  const basePath = dir
  const relativePath = enPage.relativePath
  const fullPath = path.join(basePath, relativePath)

  let data
  let content
  try {
    const read = await readFileContents(fullPath)
    // If it worked, great!
    content = read.content
    data = read.data
    const errors = read.errors || []

    if (!data) {
      // If the file's frontmatter Yaml is entirely broken,
      // the result of `readFileContents()` is that you just
      // get an `errors` key. E.g.
      //
      //   errors: [
      //     {
      //       reason: 'invalid frontmatter entry',
      //       message: 'YML parsing error!',
      //       filepath: 'translations/ja-JP/content/get-started/index.md'
      //     }
      //   ]
      //
      // Throw, so this case is handled together with the file not even
      // being present on disk. The message is built from the individual
      // errors because passing the array straight to `Error` would
      // stringify it as "[object Object]".
      const details = errors
        .map((error) => error.message || error.reason)
        .filter(Boolean)
        .join('; ')
      const parsingError = new FrontmatterParsingError(
        details
          ? `its frontmatter could not be parsed (${details})`
          : 'its frontmatter could not be parsed'
      )
      parsingError.errors = errors
      throw parsingError
    }

    for (const { property } of errors) {
      // If any of the errors happened on keys that are considered
      // translatable, we can't accept that and have to fall back to
      // English.
      // For example, if a Japanese page's frontmatter lacks `title`,
      // (which triggers a 'is required' error) you can't include it
      // because you'd have a Page with `{title: undefined}`.
      // The beauty in this is that if the translated content file
      // has something wrong with, say, the `versions` frontmatter key
      // we don't even care because we won't be using it anyway.
      if (translatableFrontmatterKeys.includes(property)) {
        const msg = `frontmatter error on '${property}' (in ${fullPath}) so falling back to English`
        if (DEBUG_TRANSLATION_FALLBACKS) {
          console.warn(msg)
        }
        if (THROW_TRANSLATION_ERRORS) {
          throw new Error(msg)
        }
        data[property] = enData[property]
      }
    }
  } catch (error) {
    const fileIsMissing = error.code === 'ENOENT'
    if (!fileIsMissing && !(error instanceof FrontmatterParsingError)) {
      throw error
    }

    // Either the translation doesn't exist or its frontmatter is unusable.
    // Don't fret, use the English equivalent's data and content.
    data = enData
    content = enPage.markdown

    // Report the actual cause; a file with broken frontmatter *does* exist.
    const reason = fileIsMissing ? 'translation content file does not exist' : error.message
    const msg = `Unable to initialize ${fullPath} because ${reason}.`
    if (DEBUG_TRANSLATION_FALLBACKS) {
      console.warn(msg)
    }
    if (THROW_TRANSLATION_ERRORS) {
      throw new Error(msg)
    }
  }

  const translatedData = Object.fromEntries(
    translatableFrontmatterKeys.map((key) => {
      return [key, data[key]]
    })
  )

  // The "content" isn't a frontmatter key
  translatedData.markdown = content

  item.page = new Page(
    Object.assign(
      {},
      // By default, shallow-copy everything from the English equivalent.
      enData,
      // Overlay with the translations core properties.
      {
        basePath,
        relativePath,
        languageCode: langObj.code,
        fullPath,
      },
      // And the translations translated properties.
      translatedData
    )
  )

  if (item.page.children) {
    item.childPages = await Promise.all(
      enTree.childPages
        .filter((childTree) => {
          // Translations should not get early access pages at all.
          // NOTE(review): `relativePath` appears to be '/'-separated
          // (createTree strips a `${basePath}/` prefix to build it), while
          // `path.sep` is '\' on Windows, so accept either separator here.
          return childTree.page.relativePath.split(/[\\/]/)[0] !== 'early-access'
        })
        .map((childTree) => translateTree(dir, langObj, childTree))
    )
  }

  return item
}
/** /**
* The siteTree is a nested object with pages for every language and version, useful for nav because it * The siteTree is a nested object with pages for every language and version, useful for nav because it
* contains parent, child, and sibling relationships: * contains parent, child, and sibling relationships:
@@ -151,103 +281,11 @@ export function createMapFromArray(pageList) {
} }
export async function loadPageMap(pageList) { export async function loadPageMap(pageList) {
const pages = await correctTranslationOrphans(pageList || (await loadPageList())) const pages = pageList || (await loadPageList())
const pageMap = createMapFromArray(pages) const pageMap = createMapFromArray(pages)
return pageMap return pageMap
} }
/**
 * Reconcile the translated pages with the English ones:
 *
 * - a translated page whose `relativePath` has no English equivalent is dropped;
 * - an English page that a language lacks is added for that language, loaded
 *   from the English base path, unless its path starts with one of the
 *   `TRANSLATION_DRIFT_EXCEPTIONS` prefixes.
 *
 * Note, this function is exported purely for the benefit of the unit tests.
 *
 * @param {Array<object>} pageList - Pages of all languages; `languageCode`
 *   and `relativePath` are read from each.
 * @param {?string} [basePath] - Base path to backfill from. Defaults to the
 *   English content path; unit tests pass their "tests/fixtures" directory.
 * @returns {Promise<Array<object>>} The kept pages in their original order,
 *   followed by the backfilled ones.
 */
export async function correctTranslationOrphans(pageList, basePath = null) {
  // Every relative path that exists in English.
  const englishPaths = new Set()
  for (const page of pageList) {
    if (page.languageCode === 'en') {
      englishPaths.add(page.relativePath)
    }
  }

  // One (initially empty) set of known paths per non-English language.
  // It's important to seed this from *every* configured language rather than
  // from the pages that happen to be present, otherwise a language that is
  // missing pages entirely would never be considered for backfilling.
  const pathsByLanguage = new Map(
    Object.keys(languages)
      .filter((code) => code !== 'en')
      .map((code) => [code, new Set()])
  )

  const englishBasePath = basePath || getBasePath(languages.en.dir)

  // Drop the excess translations. E.g. if an English doc was renamed from
  // `content/foo.md` to `content/bar.md`, `TRANSLATIONS_ROOT/*/content/foo.md`
  // may still linger on disk and must not survive.
  const keptPages = []
  for (const candidate of pageList) {
    // English pages are never considered "excess".
    if (candidate.languageCode !== 'en') {
      if (!englishPaths.has(candidate.relativePath)) continue
      // Remember that this language does have this path.
      pathsByLanguage.get(candidate.languageCode).add(candidate.relativePath)
    }
    keptPages.push(candidate)
  }

  const backfills = []
  for (const relativePath of englishPaths) {
    for (const [languageCode, knownPaths] of pathsByLanguage) {
      if (knownPaths.has(relativePath)) continue

      // The exception check is deliberately done this late: once the
      // translation pipeline has done its job a missing path should be rare,
      // so there's no point checking every single English path up front.
      if (TRANSLATION_DRIFT_EXCEPTIONS.find((prefix) => relativePath.startsWith(prefix))) {
        continue
      }

      // Instances of class Page can't be cloned, so a fresh one is created
      // for this language, pointed at the English base path so that there is
      // something to read. E.g. if `TRANSLATIONS_ROOT/ja-JP/content/foo.md`
      // doesn't exist, pretend it does by using `foo.md` from `content/`.
      backfills.push(
        Page.init({
          basePath: englishBasePath,
          relativePath,
          languageCode,
        })
      )
    }
  }

  return [...keptPages, ...(await Promise.all(backfills))]
}
export default { export default {
loadUnversionedTree, loadUnversionedTree,
loadSiteTree, loadSiteTree,

View File

@@ -67,7 +67,7 @@ class Page {
} }
constructor(opts) { constructor(opts) {
if (opts.frontmatterErrors.length) { if (opts.frontmatterErrors && opts.frontmatterErrors.length) {
throw new FrontmatterErrorsError( throw new FrontmatterErrorsError(
`${opts.frontmatterErrors.length} frontmatter errors trying to load ${opts.fullPath}`, `${opts.frontmatterErrors.length} frontmatter errors trying to load ${opts.fullPath}`,
opts.frontmatterErrors opts.frontmatterErrors

View File

@@ -1,6 +1,6 @@
import { jest } from '@jest/globals' import { jest } from '@jest/globals'
import path from 'path' import path from 'path'
import { loadPages, correctTranslationOrphans } from '../../lib/page-data.js' import { loadPages } from '../../lib/page-data.js'
import libLanguages from '../../lib/languages.js' import libLanguages from '../../lib/languages.js'
import { liquid } from '../../lib/render-content/index.js' import { liquid } from '../../lib/render-content/index.js'
import patterns from '../../lib/patterns.js' import patterns from '../../lib/patterns.js'
@@ -18,7 +18,7 @@ describe('pages module', () => {
let pages let pages
beforeAll(async () => { beforeAll(async () => {
pages = await correctTranslationOrphans(await loadPages()) pages = await loadPages()
}) })
describe('loadPages', () => { describe('loadPages', () => {