1
0
mirror of synced 2025-12-19 18:10:59 -05:00
Files
docs/src/data-directory/scripts/find-orphaned-features/find.ts

327 lines
11 KiB
TypeScript

/**
* This script will loop over all pages, in all languages, and look at
* the following:
*
* 1. `title` in frontmatter
* 2. `intro` in frontmatter
* 3. `shortTitle` in frontmatter (if present)
* 4. the markdown body itself
* 5. The `versions:` frontmatter key (if the page is in English)
*
* Then it will search out the features mentioned based on `data/features/*.yml`
* It will make a Set of these (e.g. `dependabot-grouped-dependencies` and
* `ghas-enablement-webhook`) and one by one pluck them away.
*
* After the pages, it will loop over the variables and reusables in
* English, and do the same search there. Once it's done with the English
* files, it loops over the reusables in the translations (if they exist)
* and does the same search.
*
* Lastly, it will output the remaining features, as relative file paths.
* For example, `data/features/havent-been-used-in-years.yml` so now you
* know that file can be deleted.
*
* NOTE: A lot of translations have corrupted Liquid. So if we can't parse
* the Liquid we fall back to string search. A regex will try to find
* all `{% ifversion ... %}` (and `elsif`) and search for any features
* mentioned inside that as a string.
*
*/
import { strictEqual } from 'node:assert'
import fs from 'fs'
import path from 'path'
import chalk from 'chalk'
import { TokenizationError, TokenKind } from 'liquidjs'
import type { TagToken } from 'liquidjs'
import type { Page } from '@/types'
import warmServer from '@/frame/lib/warm-server'
import { getDeepDataByLanguage } from '@/data-directory/lib/get-data'
import { getLiquidTokens } from '@/content-linter/lib/helpers/liquid-utils'
import languages from '@/languages/lib/languages-server'
import { correctTranslatedContentStrings } from '@/languages/lib/correct-translation-content'
/**
 * Feature names that are filtered out of the candidate set before any
 * searching happens, so they are never reported as orphans.
 */
const EXCEPTIONS: ReadonlySet<string> = new Set<string>([
  // Comes from data/features/placeholder.yml, which is used by tests.
  'placeholder',
])
/** Options accepted by `find`. */
type Options = {
/** Directory that is joined with `<feature>.yml` to build each reported orphan path. */
sourceDirectory: string
/**
* Optional file to write the orphan paths to. A name ending in `.json`
* gets a JSON array (written only when there is at least one orphan);
* any other name gets the paths joined by newlines.
*/
output?: string
/** When true, progress and timing information is printed to stdout. */
verbose?: boolean
}
/**
 * Finds feature files that nothing references any more ("orphans").
 *
 * Every feature name known to the English `features` data (minus
 * `EXCEPTIONS`) starts out as a candidate. `searchAndRemove` then deletes
 * each candidate it sees referenced. Whatever is left is reported as
 * `<sourceDirectory>/<feature>.yml`.
 *
 * Output rules:
 * - With `options.output` ending in `.json`, a JSON array is written, but
 *   only when at least one orphan was found.
 * - With any other `options.output`, the paths are written one per line.
 * - The list is printed to stdout unless `options.output` is set and
 *   `options.verbose` is not.
 *
 * @param options See `Options`.
 */
export async function find(options: Options) {
  const { sourceDirectory } = options

  // An unset variable can never equal 'en', so a separate truthiness
  // check is not needed.
  if (process.env.ENABLED_LANGUAGES === 'en') {
    console.warn(
      chalk.yellow(
        `Only English is enabled. Be careful with the output.
        To include all translations make sure they're available and that
        ENABLED_LANGUAGES is not set or set to 'all'.`.replaceAll(/\s\s+/g, ' '),
      ),
    )
  }

  const site = await warmServer([])

  const features = new Set(
    Object.keys(getDeepDataByLanguage('features', 'en')).filter((f) => !EXCEPTIONS.has(f)),
  )
  if (options.verbose) {
    console.log(`Found ${features.size} features`)
  }

  const pageList: Page[] = site.pageList
  if (options.verbose) {
    console.log(`Searching ${pageList.length.toLocaleString()} pages`)
  }

  const t0 = new Date()
  searchAndRemove(features, pageList, Boolean(options.verbose))
  const t1 = new Date()

  if (options.verbose) {
    const color = features.size === 0 ? chalk.green : chalk.yellow
    console.log(
      color(
        // The `g` flag collapses EVERY newline+indentation run, not just
        // the first one, so the message stays on a single line however
        // many source lines it spans.
        `Searched ${pageList.length.toLocaleString()} pages in ${formatDelta(t0, t1)}.
        And found ${features.size} features remaining (i.e. orphans).`.replace(/\s\s+/g, ' '),
      ),
    )
  }

  const remaining = Array.from(features).map((feature) =>
    path.join(sourceDirectory, `${feature}.yml`),
  )

  if (options.output) {
    if (options.output.endsWith('.json')) {
      // The JSON file is only produced when there is something to report.
      if (remaining.length) {
        fs.writeFileSync(options.output, JSON.stringify(remaining, null, 2))
      }
    } else {
      fs.writeFileSync(options.output, remaining.join('\n'))
    }
    // When writing to a file, stay quiet on stdout unless asked otherwise.
    if (!options.verbose) {
      return
    }
  }

  console.log(chalk.bold(`Orphans found (${remaining.length}):`))
  for (const feature of remaining) {
    console.log(chalk.green(feature))
  }
}
/**
 * Renders the time elapsed between two instants as seconds with one
 * decimal, e.g. `1.5 seconds`.
 *
 * @param t0 Start instant.
 * @param t1 End instant.
 * @returns The difference `t1 - t0` formatted as `<n.n> seconds`.
 */
function formatDelta(t0: Date, t1: Date) {
  const elapsedSeconds = (t1.getTime() - t0.getTime()) / 1000
  return elapsedSeconds.toFixed(1) + ' seconds'
}
/**
 * Deletes from `features` every feature that is referenced by a page, an
 * English variables file, an English reusable, or a translated reusable.
 * The set is mutated in place; whatever is left afterwards was never seen.
 *
 * @param features Candidate feature names; entries are removed as they are found.
 * @param pages Pages (any language) whose body, title, short title and intro are searched.
 * @param verbose Forwarded to `checkString` to enable its logging.
 */
function searchAndRemove(features: Set<string>, pages: Page[], verbose = false) {
  for (const page of pages) {
    const body = page.markdown
    const { languageCode } = page

    // The `versions:` frontmatter key is only ever consulted for English
    // pages, so a stale `versions: some-old-feature` in a translation is
    // irrelevant.
    if (languageCode === 'en') {
      for (const [key, value] of Object.entries(page.versions)) {
        if (key !== 'feature') continue
        features.delete(value)
      }
    }

    const haystack = `\n${body}\n${page.title || ''}\n${page.shortTitle || ''}\n${page.intro || ''}\n`
    checkString(haystack, features, { page, verbose, languageCode })
  }

  const englishDir = languages.en.dir

  // Variables files: only the English copies are searched.
  for (const filePath of getVariableFiles(path.join(englishDir, 'data', 'variables'))) {
    checkString(fs.readFileSync(filePath, 'utf-8'), features, {
      filePath,
      verbose,
      languageCode: 'en',
    })
  }

  // Reusables are shared between languages, and the files on disk in
  // English and in the translations rarely line up exactly: translations
  // never delete files (so they tend to carry extras), and English may
  // already contain files that have not been translated yet.
  // So the English reusables are walked first, recording each relative
  // path together with its content. Those relative paths then decide which
  // translated files to look at, and the English content is what
  // correctTranslatedContentStrings needs alongside the translated text.
  const englishReusables = new Map<string, string>()
  for (const filePath of getReusableFiles(path.join(englishDir, 'data', 'reusables'))) {
    const englishContent = fs.readFileSync(filePath, 'utf-8')
    checkString(englishContent, features, { filePath, verbose, languageCode: 'en' })
    englishReusables.set(path.relative(englishDir, filePath), englishContent)
  }

  const isMissingFile = (error: unknown): boolean =>
    error instanceof Error && 'code' in error && error.code === 'ENOENT'

  // English was handled above, so only the translations remain.
  const translations = Object.values(languages).filter((language) => language.code !== 'en')
  for (const language of translations) {
    for (const [relativePath, englishFileContent] of Array.from(englishReusables)) {
      const filePath = path.join(language.dir, relativePath)
      try {
        const translated = fs.readFileSync(filePath, 'utf-8')
        const corrected = correctTranslatedContentStrings(translated, englishFileContent, {
          code: language.code,
          relativePath,
        })
        checkString(corrected, features, { filePath, verbose, languageCode: language.code })
      } catch (error) {
        // A reusable that is absent from a translation is perfectly
        // normal: most likely it was added to English recently and has
        // not been translated yet. Anything else is a real failure.
        if (!isMissingFile(error)) throw error
      }
    }
  }
}
/**
 * Recursively lists the Markdown files below `root`, skipping any file
 * named `README.md`.
 *
 * @param root Directory to walk.
 * @returns Paths built as `<parent>/<name>`, in directory-listing order,
 *   with the contents of a subdirectory appearing at that subdirectory's position.
 */
export function getReusableFiles(root: string): string[] {
  return fs.readdirSync(root).flatMap((entry) => {
    const entryPath = `${root}/${entry}`
    if (fs.statSync(entryPath).isDirectory()) {
      return getReusableFiles(entryPath)
    }
    const wanted = entry.endsWith('.md') && entry !== 'README.md'
    return wanted ? [entryPath] : []
  })
}
/**
 * Recursively lists the YAML (`.yml`) files below `root`, skipping any
 * file named `README.yml`.
 *
 * @param root Directory to walk.
 * @returns Paths built as `<parent>/<name>`, in directory-listing order,
 *   with the contents of a subdirectory appearing at that subdirectory's position.
 */
export function getVariableFiles(root: string): string[] {
  const collected: string[] = []

  const walk = (directory: string): void => {
    for (const name of fs.readdirSync(directory)) {
      const fullPath = `${directory}/${name}`
      if (fs.statSync(fullPath).isDirectory()) {
        walk(fullPath)
        continue
      }
      if (name !== 'README.yml' && name.endsWith('.yml')) {
        collected.push(fullPath)
      }
    }
  }

  walk(root)
  return collected
}
/**
 * Words inside an `ifversion`/`elsif` tag that are skipped outright
 * instead of being looked up as feature names.
 */
const IGNORE_ARGS = new Set(['or', 'and', 'not', '<', '>', 'ghes', 'fpt', 'ghec', '!=', '='])

/**
 * Removes from `features` every feature named in an `ifversion` or `elsif`
 * Liquid tag found in `string`.
 *
 * If the Liquid cannot be tokenized, the error is re-thrown for English
 * content. For any other language the text is searched with
 * `findByRegex` instead.
 *
 * @param string Text to search.
 * @param features Candidate feature names; mutated in place.
 * @param context `page` or `filePath` identify the source in log output,
 *   `languageCode` decides whether a tokenization failure is fatal, and
 *   `verbose` turns the fallback log line on.
 * @throws Any error that is not a `TokenizationError`, and any
 *   `TokenizationError` when `languageCode` is `'en'`.
 */
function checkString(
  string: string,
  features: Set<string>,
  {
    page,
    filePath,
    languageCode,
    verbose = false,
  }: { page?: Page; filePath?: string; languageCode?: string; verbose?: boolean } = {},
) {
  const isVersionTag = (tag: TagToken) => tag.name === 'ifversion' || tag.name === 'elsif'

  try {
    // `noCache: true` because a LOT of distinct strings pass through here
    // (every page, in every language) and a cache would fill up rapidly.
    const tagTokens = getLiquidTokens(string, { noCache: true }).filter(
      (token): token is TagToken => token.kind === TokenKind.Tag,
    )
    for (const tag of tagTokens) {
      if (!isVersionTag(tag)) continue
      const candidates = tag.args
        .split(/\s+/)
        .filter((arg) => !IGNORE_ARGS.has(arg) && !isFloat(arg))
      for (const candidate of candidates) {
        features.delete(candidate)
      }
    }
  } catch (error) {
    // Unknown failures always propagate. So does broken Liquid in
    // English, where it is a serious error.
    if (!(error instanceof TokenizationError) || languageCode === 'en') throw error

    // A translation may currently contain corrupted Liquid, so fall back
    // to treating the content as a plain string.
    const where = page ? page.fullPath : filePath
    if (verbose) {
      console.log(
        `TokenizationError in ${where}. Treating ${where} as a string and using regex`,
      )
    }
    for (const feature of Array.from(findByRegex(features, string))) {
      features.delete(feature)
    }
  }
}
/**
 * String-based fallback for content whose Liquid cannot be tokenized.
 *
 * Extracts every `{% ifversion ... %}` / `{% elsif ... %}` tag from
 * `string` and reports which of `features` appear inside one of them. A
 * feature counts as present when it is preceded by whitespace and followed
 * by whitespace or `%`, compared case-insensitively.
 *
 * @param features Feature names to look for; not modified.
 * @param string Text to search.
 * @returns A new set holding the features that were found.
 */
function findByRegex(features: Set<string>, string: string) {
  const found = new Set<string>()

  const tags = string.match(/\{%\s*(ifversion|elsif)\s*(.*?)\s*%\}/g) || []
  if (tags.length === 0) return found

  // Build each feature's pattern once, rather than once per tag per
  // feature. The patterns carry no `g` flag, so `test()` keeps no state
  // between calls and reusing them is safe.
  const matchers = Array.from(features, (feature) => ({
    feature,
    regex: new RegExp(`\\s${escapeRegex(feature)}(\\s|%)`, 'i'),
  }))

  for (const tag of tags) {
    for (const { feature, regex } of matchers) {
      if (regex.test(tag)) {
        found.add(feature)
      }
    }
  }
  return found
}
// Import-time sanity check of findByRegex. Only a feature that appears as
// a whole word inside an `ifversion`/`elsif` tag should be reported: a
// feature that is merely part of a longer name, sits inside another kind
// of tag, or appears as bare text must not match.
// `console.assert` only logs on failure; it does not throw.
const test = findByRegex(
  new Set(['placeholder', 'foo-bar']),
  `
placeholder
{%ifversion placeholder-foo or fpt%}
{% elsif not-placeholder %}
{% elsif foo-bar%}
{%endif %}
{% data reusables.enterprise-migration-tool.placeholder-table %}
{% data placeholder %}
`,
)
// A Set stringifies as "[object Set]", so list its members explicitly to
// make a failure message useful.
console.assert(
  test.has('foo-bar'),
  `expected 'foo-bar' to be found; found: ${Array.from(test).join(', ')}`,
)
console.assert(
  !test.has('placeholder'),
  `expected 'placeholder' not to be found; found: ${Array.from(test).join(', ')}`,
)
/**
 * Backslash-escapes the characters that carry special meaning in a regular
 * expression, so the result can be embedded in a `RegExp` source and match
 * the input literally.
 *
 * @param string Raw text.
 * @returns The text with each special character prefixed by a backslash.
 */
function escapeRegex(string: string) {
  const specialCharacters = /[/\-\\^$*+?.()|[\]{}]/g
  return string.replace(specialCharacters, (character) => `\\${character}`)
}
/**
 * Reports whether `x` converts to a number, i.e. whether `Number(x)` is
 * something other than `NaN`.
 *
 * The check is made against `NaN` directly instead of relying on the
 * truthiness of `Number(x) + 1`, which wrongly rejects `-1` because
 * `-1 + 1` is `0`.
 *
 * Note that `Number('')` is `0`, so an empty string is reported as numeric.
 *
 * @param x Value to test, typically one whitespace-separated word from a Liquid tag.
 * @returns `true` when `Number(x)` is not `NaN`.
 */
function isFloat(x: unknown): boolean {
  return !Number.isNaN(Number(x))
}
// Import-time self-checks; these throw if isFloat regresses.
strictEqual(isFloat('1.2'), true)
strictEqual(isFloat('10'), true)
strictEqual(isFloat('-1'), true)
strictEqual(isFloat('notatall'), false)
strictEqual(isFloat('2fa'), false)