1
0
mirror of synced 2025-12-19 18:10:59 -05:00

Script to find orphaned features (#49825)

Co-authored-by: Robert Sese <734194+rsese@users.noreply.github.com>
This commit is contained in:
Peter Bengtsson
2024-03-28 17:15:35 -04:00
committed by GitHub
parent f9c3f93014
commit e11df99f66
3 changed files with 319 additions and 0 deletions

View File

@@ -27,6 +27,7 @@
"delete-orphan-translation-files": "tsx src/workflows/delete-orphan-translation-files.ts", "delete-orphan-translation-files": "tsx src/workflows/delete-orphan-translation-files.ts",
"dev": "cross-env npm start", "dev": "cross-env npm start",
"find-orphaned-assets": "node src/assets/scripts/find-orphaned-assets.js", "find-orphaned-assets": "node src/assets/scripts/find-orphaned-assets.js",
"find-orphaned-features": "tsx src/data-directory/scripts/find-orphaned-features/index.ts",
"find-past-built-pr": "tsx src/workflows/find-past-built-pr.ts", "find-past-built-pr": "tsx src/workflows/find-past-built-pr.ts",
"fixture-dev": "cross-env ROOT=src/fixtures/fixtures npm start", "fixture-dev": "cross-env ROOT=src/fixtures/fixtures npm start",
"fixture-test": "cross-env ROOT=src/fixtures/fixtures npm test -- src/fixtures/tests", "fixture-test": "cross-env ROOT=src/fixtures/fixtures npm test -- src/fixtures/tests",

View File

@@ -0,0 +1,300 @@
/**
* This script will loop over all pages, in all languages, and look at
* the following:
*
* 1. `title` in frontmatter
* 2. `intro` in frontmatter
* 3. `shortTitle` in frontmatter (if present)
* 4. the markdown body itself
* 5. The `versions:` frontmatter key (if the page is in English)
*
* Then it will search out the features mentioned based on `data/features/*.yml`
* It will make a Set of these (e.g. `dependabot-grouped-dependencies` and
* `ghas-enablement-webhook`) and one by one pluck them away.
*
* After the pages, it will loop over the reusables in English, and do the
* same search there. Once it's done the English, it loops over the
* reusables in the translations (if they exist) and does the same search.
*
* Lastly, it will output the remaining features, as relative file paths.
* For example, `data/features/havent-been-used-in-years.yml` so now you
* know that file can be deleted.
*
* NOTE: A lot of translations have corrupted Liquid. So if we can't parse
* the Liquid we fall back to string search. A regex will try to find
* all `{% ifversion ... %}` (and `elsif`) and search for any features
* mentioned inside that as a string.
*
*/
import fs from 'fs'
import path from 'path'
import chalk from 'chalk'
import { TokenizationError } from 'liquidjs'
import warmServer from '@/frame/lib/warm-server.js'
import { getDeepDataByLanguage } from '@/data-directory/lib/get-data.js'
import { getLiquidTokens } from '@/content-linter/lib/helpers/liquid-utils.js'
import languages from '@/languages/lib/languages.js'
import { correctTranslatedContentStrings } from '@/languages/lib/correct-translation-content.js'
type Options = {
sourceDirectory: string
output?: string
verbose?: boolean
}
type Page = {
permalinks: Permalink[]
relativePath: string
fullPath: string
title: string
shortTitle?: string
intro: string
markdown: string
languageCode: string
versions: Record<string, string>
}
type Permalink = {
href: string
languageCode: string
}
export async function find(options: Options) {
const { sourceDirectory } = options
if (process.env.ENABLED_LANGUAGES && process.env.ENABLED_LANGUAGES === 'en') {
console.warn(
chalk.yellow(
`Only English is enabled. Be careful with the output.
To include all translations make sure they're available and that
ENABLED_LANGUAGES is not set or set to 'all'.`.replaceAll(/\s\s+/g, ' '),
),
)
}
const site = await warmServer([])
const features = new Set(Object.keys(getDeepDataByLanguage('features', 'en')))
if (options.verbose) {
console.log(`Found ${features.size} features`)
}
const pageList: Page[] = site.pageList
if (options.verbose) {
console.log(`Searching ${pageList.length.toLocaleString()} pages`)
}
const t0 = new Date()
searchAndRemove(features, pageList, Boolean(options.verbose))
const t1 = new Date()
if (options.verbose) {
const color = features.size === 0 ? chalk.green : chalk.yellow
console.log(
color(
`Searched ${pageList.length.toLocaleString()} pages in ${formatDelta(t0, t1)}.
And found ${features.size} features remaining (i.e. orphans).`.replace(/\s\s+/, ' '),
),
)
}
const remaining = Array.from(features).map((feature) =>
path.join(sourceDirectory, `${feature}.yml`),
)
if (options.output) {
if (options.output.endsWith('.json')) {
fs.writeFileSync(options.output, JSON.stringify(remaining, null, 2))
} else {
fs.writeFileSync(options.output, remaining.join('\n'))
}
if (!options.verbose) {
return
}
}
console.log(chalk.bold(`Orphans found (${remaining.length}):`))
for (const feature of remaining) {
console.log(chalk.green(feature))
}
}
function formatDelta(t0: Date, t1: Date) {
const ms = t1.getTime() - t0.getTime()
return `${(ms / 1000).toFixed(1)} seconds`
}
function searchAndRemove(features: Set<string>, pages: Page[], verbose = false) {
for (const page of pages) {
const content = page.markdown
// We actually never bother looking at the `versions:` frontmatter
// key in translations, so it doesn't matter if the translated
// frontmatter might have `versions: some-old-feature`.
if (page.languageCode === 'en') {
for (const [key, value] of Object.entries(page.versions)) {
if (key === 'feature') {
if (features.has(value)) {
features.delete(value)
}
}
}
}
const combined = `
${content}
${page.title || ''}
${page.shortTitle || ''}
${page.intro || ''}
`
checkString(combined, features, { page, verbose, languageCode: page.languageCode })
}
// Reusables are a bit special, as they are shared between languages.
// There'll always be a slight mismatch between files present on disk
// in English vs. translations.
// The translations never delete files, so there's often excess reusables
// on disk in translations. And the English might be ahead, meaning a file
// has been introduced in English but not yet translated.
// The code below loops over the English reusables, and takes note of the
// their relative paths and content. Then, we re-use the keys of that map
// to know which files, in the translations, to check. And when we read
// them in, we'll need the English equivalent content to be able to
// use the correctTranslatedContentStrings function.
const englishReusables = new Map<string, string>()
for (const filePath of getReusableFiles(path.join(languages.en.dir, 'data', 'reusables'))) {
const relativePath = path.relative(languages.en.dir, filePath)
const fileContent = fs.readFileSync(filePath, 'utf-8')
checkString(fileContent, features, { filePath, verbose, languageCode: 'en' })
englishReusables.set(relativePath, fileContent)
}
for (const language of Object.values(languages)) {
if (language.code === 'en') continue // Already did that in the loop above
for (const [relativePath, englishFileContent] of Array.from(englishReusables.entries())) {
const filePath = path.join(language.dir, relativePath)
try {
const fileContent = fs.readFileSync(filePath, 'utf-8')
const correctedFileContent = correctTranslatedContentStrings(
fileContent,
englishFileContent,
{
code: language.code,
relativePath,
},
)
checkString(correctedFileContent, features, {
filePath,
verbose,
languageCode: language.code,
})
} catch (error) {
if (error instanceof Error && 'code' in error && error.code === 'ENOENT') {
// That a reusable does *not* exist in a translation is
// perfectly expected. It means that English reusable was
// most likely added recently and the translation hasn't been
// translated yet.
continue
}
throw error
}
}
}
}
function getReusableFiles(root: string): string[] {
const here = []
for (const file of fs.readdirSync(root)) {
const filePath = `${root}/${file}`
if (fs.statSync(filePath).isDirectory()) {
here.push(...getReusableFiles(filePath))
} else if (file.endsWith('.md') && file !== 'README.md') {
here.push(filePath)
}
}
return here
}
const IGNORE_ARGS = new Set(['or', 'and', 'not', '<', '>', 'ghes', 'fpt', 'ghec', '!=', '='])
function checkString(
string: string,
features: Set<string>,
{
page,
filePath,
languageCode,
verbose = false,
}: { page?: Page; filePath?: string; languageCode?: string; verbose?: boolean } = {},
) {
try {
for (const token of getLiquidTokens(string)) {
if (token.name === 'ifversion' || token.name === 'elsif') {
for (const arg of token.args.split(/\s+/)) {
if (IGNORE_ARGS.has(arg)) continue
if (isFloat(arg)) continue
if (features.has(arg)) {
features.delete(arg)
}
}
}
}
} catch (error) {
if (error instanceof TokenizationError) {
// If it happens in English, it's a serious error
if (languageCode === 'en') throw error
// The translation might, currently, have corrupted liquid
// So treat it as a string
if (verbose)
console.log(
`TokenizationError in ${page ? page.fullPath : filePath}. Treating ${page ? page.fullPath : filePath} as a string and using regex`,
)
for (const feature of Array.from(findByRegex(features, string))) {
features.delete(feature)
}
} else {
throw error
}
}
}
function findByRegex(features: Set<string>, string: string) {
const found = new Set<string>()
for (const match of string.match(/\{%\s*(ifversion|elsif)\s*(.*?)\s*%\}/g) || []) {
for (const feature of Array.from(features)) {
const regex = new RegExp(`\\s${escapeRegex(feature)}(\\s|%)`, 'i')
if (regex.test(match)) {
found.add(feature)
}
}
}
return found
}
const test = findByRegex(
new Set(['placeholder', 'foo-bar']),
`
placeholder
{%ifversion placeholder-foo or fpt%}
{% elsif not-placeholder %}
{% elsif foo-bar%}
{%endif %}
{% data reusables.enterprise-migration-tool.placeholder-table %}
{% data placeholder %}
`,
)
console.assert(test.has('foo-bar'), test.toString())
console.assert(!test.has('placeholder'), test.toString())
function escapeRegex(string: string) {
return string.replace(/[/\-\\^$*+?.()|[\]{}]/g, '\\$&')
}
function isFloat(x: any) {
return !!(parseFloat(x) + 1)
}

View File

@@ -0,0 +1,18 @@
import { program } from 'commander'
import { find } from './find'
program
.name('find-orphaned-features')
.description(
"Compare what's in data/features/*.yml with what's mentioned in Markdown and frontmatter",
)
program
.command('find')
.description('Figure out what features are not being used')
.option('-s, --source-directory <directory>', 'Source directory', 'data/features')
.option('-o, --output <output-file>', 'Output file')
.option('-v, --verbose', 'Verbose')
.action(find)
program.parse(process.argv)