From 3b16d72ea90a4c037fa98f1b491fcd265502fab9 Mon Sep 17 00:00:00 2001 From: Evan Bonsignori Date: Tue, 30 Jul 2024 11:47:05 -0700 Subject: [PATCH] add reusables helper CLI (#50800) Co-authored-by: Peter Bengtsson Co-authored-by: Peter Bengtsson --- package.json | 1 + src/content-render/scripts/reusables-cli.ts | 71 +++++++ .../scripts/reusables-cli/README.md | 132 ++++++++++++ .../reusables-cli/find/potential-uses.ts | 99 +++++++++ .../scripts/reusables-cli/find/unused.ts | 54 +++++ .../scripts/reusables-cli/find/used.ts | 74 +++++++ .../scripts/reusables-cli/ignore-reusables.ts | 5 + .../scripts/reusables-cli/shared.ts | 196 ++++++++++++++++++ 8 files changed, 632 insertions(+) create mode 100644 src/content-render/scripts/reusables-cli.ts create mode 100644 src/content-render/scripts/reusables-cli/README.md create mode 100644 src/content-render/scripts/reusables-cli/find/potential-uses.ts create mode 100644 src/content-render/scripts/reusables-cli/find/unused.ts create mode 100644 src/content-render/scripts/reusables-cli/find/used.ts create mode 100644 src/content-render/scripts/reusables-cli/ignore-reusables.ts create mode 100644 src/content-render/scripts/reusables-cli/shared.ts diff --git a/package.json b/package.json index ae780e584e..565ddccd33 100644 --- a/package.json +++ b/package.json @@ -58,6 +58,7 @@ "prettier-check": "prettier -c \"**/*.{ts,tsx,js,mjs,scss,yml,yaml}\"", "prevent-pushes-to-main": "node src/workflows/prevent-pushes-to-main.js", "release-banner": "node src/ghes-releases/scripts/release-banner.js", + "reusables": "tsx src/content-render/scripts/reusables-cli.ts", "remove-version-markup": "node src/ghes-releases/scripts/remove-version-markup.js", "rendered-content-link-checker": "tsx src/links/scripts/rendered-content-link-checker.ts", "rendered-content-link-checker-cli": "tsx src/links/scripts/rendered-content-link-checker-cli.ts", diff --git a/src/content-render/scripts/reusables-cli.ts b/src/content-render/scripts/reusables-cli.ts new 
// Usage: npm run reusables -- --help
// Usage: npm run reusables -- find used accounts/create-account.md
// Usage: npm run reusables -- find top-used 5
// Usage: npm run reusables -- find unused
// Usage: npm run reusables -- find potential-uses --similar --threshold 15000

import { Command } from 'commander'
import { findTopUsed, findUsed } from './reusables-cli/find/used'
import { findPotentialUses } from './reusables-cli/find/potential-uses'
import { findUnused } from './reusables-cli/find/unused'

// Default for `find potential-uses --threshold` (only meaningful together with --similar).
const defaultSimilarityThreshold = 10000
// Default for the optional `find top-used` count argument.
const defaultTopUsedCount = 10
const absolutePathDescription = 'Show absolute paths in output instead of relative path to repo'

const program = new Command()

program
  .name('reusables-helper-cli')
  .description('Tools to help with reusable Docs content snippets')

// Every sub-command lives under `find`.
const findCommand = program.command('find')

findCommand
  .command('used')
  .description('Find all content files that use a specific reusable.')
  .argument(
    // The angle brackets mark the argument as required and give it a name in --help.
    '<reusable-path>',
    'Path to the reusable file relative to content/data/reusables, e.g. "accounts/create-account.md".',
  )
  .option('-a, --absolute', absolutePathDescription, false)
  .action(findUsed)

findCommand
  .command('top-used')
  .description('Find the top x most used reusables.')
  .argument(
    '[number-of-most-used-to-find]',
    'Number of most used reusables to find.',
    // Command-line values arrive as strings; convert so the action receives a number,
    // matching the type of the numeric default.
    (value: string) => Number.parseInt(value, 10),
    defaultTopUsedCount,
  )
  .option('-a, --absolute', absolutePathDescription, false)
  .action(findTopUsed)

findCommand
  .command('unused')
  .description(
    'Find all reusables that are not used in any content files. WARNING: This command may take a long time to run.',
  )
  .option('-a, --absolute', absolutePathDescription, false)
  .action(findUnused)

findCommand
  .command('potential-uses')
  .option(
    '-s, --similar',
    'Find files where contents loosely matches a reusable instead of an exact match.',
  )
  .option(
    // Without the <threshold> placeholder the flag would be a boolean switch and
    // parseFloat would never see a value.
    '-t, --threshold <threshold>',
    'Similarity threshold for similar reusables. e.g. 10000. This requires the --similar flag and some experimentation to find a useful value.',
    parseFloat,
    defaultSimilarityThreshold,
  )
  .option('-a, --absolute', absolutePathDescription, false)
  .description(
    'Find all content files that could use any reusables, but do not. WARNING: This command may take a long time to run.',
  )
  .action(findPotentialUses)

program.parse()
+ +In content/billing/managing-billing-for-github-copilot/managing-your-github-copilot-individual-subscription.md on: + Line 35 + +In content/copilot/quickstart.md on: + Line 29 +``` + +### Command: `npm run reusables -- find top-used [number-of-most-used-to-find]` + +Find top X (default 10) most used reusables and the number of times they are used. + +#### Example + +`npm run reusables -- find top-used 5` + +``` +Searching for the top 5 most used reusables... +0/3225 reusables checked... +100/3225 reusables checked... +(etc, etc) +3225/3225 reusables checked... + +Top 5 most used reusables: +#1. 318 uses of data/reusables/repositories/navigate-to-repo.md +#2. 286 uses of data/reusables/profile/access_org.md +#3. 212 uses of data/reusables/enterprise-accounts/access-enterprise.md +#4. 193 uses of data/reusables/profile/org_settings.md +#5. 171 uses of data/reusables/actions/action-checkout.md +``` + +### Command: `npm run reusables -- find unused` + +Find which reusables aren't used in any content files. + +This will take ~10+ minutes to run locally. You will be updated at each 5% interval. + +#### Example + +`npm run reusables -- find unused` + +``` +Searching 6468 files and 3225 reusables... +Progress: 5% done +Progress: 10% done +Progress: 15% done + +... + +Found 111 unused reusables: +data/reusables/actions/action-labeler.md +data/reusables/actions/actions-audit-events-for-enterprise.md +data/reusables/actions/actions-audit-events-workflow.md +data/reusables/actions/cache-no-org-policy.md +data/reusables/actions/configure-runner-group-access.md +... +``` + +### Command: `npm run reusables -- find potential-uses` + +Find which files that reusables might be used in. + +The command does this by searching every `content/` & `data/` file for strings that match every reusable that isn't ignored in `src/content-render/scripts/reusables-cli/ignore-reusables.ts`. 
+ +#### Example + +`npm run reusables -- find potential-uses` + +``` +Searching 6468 files for potential reusable use... +0/3225 reusables checked... +100/3225 reusables checked... +(etc, etc) +3223/3225 reusables checked... + +Found 13 files that could use reusables. + +Reusable data/reusables/actions/action-labeler.md can be used +In content/actions/using-workflows/reusing-workflows.md on: + Line 146 + Line 188 + +(cont.) +``` + +#### Ignoring reusables + +Some reusables might not make sense to "reuse" everywhere they could be reused. For instance, at the time of writing there is a reusable that is just the number "30" which shows up in certain files, but doesn't make sense to be replaced with a reusable. + +In these cases you can skip these reusables from being checked by the `find potential-uses` command by adding their paths to the array in [src/content-render/scripts/reusables-cli/ignore-reusables.ts](./ignore-reusables.ts) + +#### Similarity search + +This may or may not be a useful search. It does a looser search to find places where the reusable _may_ be usable. You can include this type of search with the `-s` flag. You can alter the "threshold" used by the scoring algorithm to show more (lower number) or fewer (higher number) potential results with the `-t` flag. + +The threshold is the minimum similarity score a file must exceed to be reported; the score measures how similar the words in the reusable are to the words in a given article. + +The CLI default threshold is `10000`; `15000` is a good starting point if that returns too many results. You can experiment with a higher/lower number if you aren't getting good results. + +e.g. 
/**
 * Report content/data files that contain the text of a reusable verbatim and
 * could therefore reference the reusable instead.
 *
 * Every content and data file is read into memory once; each reusable that is
 * non-empty and not listed in `reusablesToIgnore` is then searched for in every
 * file. With `similar` set, a word-overlap score is also computed for each
 * reusable/file pair and pairs scoring above `threshold` are reported too.
 *
 * @param similar - Also run the looser similarity search.
 * @param threshold - Minimum similarity score (exclusive) for a pair to be reported.
 * @param absolute - Print absolute paths instead of repo-relative ones.
 */
export function findPotentialUses({
  similar,
  threshold,
  absolute,
}: {
  similar?: boolean
  threshold: number
  absolute: boolean
}) {
  const reusableFiles = getAllReusablesFilePaths()
  const allFilePaths = getAllContentFilePaths()

  const filesThatCouldUseReusable: FilesWithLineNumbers = []
  const filesThatCouldUseReusableSimilar: FilesWithSimilarity = []

  // Read all content & data files into memory
  const allFileContents = allFilePaths.map((filePath) => ({
    filePath,
    fileContents: fs.readFileSync(filePath, 'utf-8'),
  }))

  console.log(`Searching ${allFileContents.length} files for potential reusable use...`)
  if (similar) {
    console.log('Using similarity search, this may take a while...')
  }

  let reusableCount = 0
  for (const reusableFilePath of reusableFiles) {
    // Trim once per reusable; the trimmed text is what gets searched for in every file.
    const reusableText = fs.readFileSync(reusableFilePath, 'utf-8').trim()
    const reusableRelativeFilePath = getRelativeReusablesPath(reusableFilePath)

    if (!reusableText) {
      const printPath = absolute ? reusableFilePath : reusableRelativeFilePath
      console.log(`Skipping empty reusable file: ${printPath}`)
      continue
    }

    // The ignore list is keyed by repo-relative path.
    if (reusablesToIgnore.includes(reusableRelativeFilePath)) {
      continue
    }

    if (reusableCount % 100 === 0) {
      console.log(`${reusableCount}/${reusableFiles.length} reusables checked...`)
    }
    reusableCount += 1

    for (const { filePath, fileContents } of allFileContents) {
      // Skip the reusable file itself
      if (filePath === reusableFilePath) continue

      const indices = findIndicesOfSubstringInString(reusableText, fileContents)
      if (indices.length > 0) {
        // Convert each character offset to a 1-based line number by counting
        // the newlines that precede it.
        const lineNumbers = indices.map((index) => fileContents.slice(0, index).split('\n').length)

        filesThatCouldUseReusable.push({
          filePath,
          lineNumbers,
          reusableFile: reusableFilePath,
        })
      }

      if (similar) {
        const similarityScore = findSimilarSubStringInString(reusableText, fileContents)
        if (similarityScore > threshold) {
          filesThatCouldUseReusableSimilar.push({
            filePath,
            similarityScore,
            reusableFile: reusableFilePath,
          })
        }
      }
    }
  }

  console.log(`${reusableCount}/${reusableFiles.length} reusables checked...`)

  console.log(`\nFound ${filesThatCouldUseReusable.length} files that could use reusables.`)
  // Hand the similarity findings to the printer as well; otherwise the
  // --similar search runs to completion and its results are thrown away.
  printFindsWithLineNumbers(absolute, filesThatCouldUseReusable, filesThatCouldUseReusableSimilar)
}
/**
 * List reusables that no content or data file references.
 *
 * Every content/data file is tokenized for Liquid tags; each `data reusables.*`
 * tag is mapped to the reusable file it names. Reusable files that were never
 * named are printed, one per line.
 *
 * @param absolute - Print absolute paths instead of repo-relative ones.
 */
export function findUnused({ absolute }: { absolute: boolean }) {
  const reusableFilePaths = getAllReusablesFilePaths()
  const allFilePaths = getAllContentFilePaths()

  // Index the reusables by repo-relative path so a reference can be resolved
  // in memory. resolveReusablePath() lists the reusables directory again on
  // every call, which is far too slow to do once per Liquid tag.
  const reusablesByRelativePath = new Map<string, string>()
  for (const reusableFilePath of reusableFilePaths) {
    reusablesByRelativePath.set(getRelativeReusablesPath(reusableFilePath), reusableFilePath)
  }

  const usedReusables = new Set<string>()
  const totalFiles = allFilePaths.length
  let lastLoggedPercent = 0

  console.log(`Searching ${totalFiles} files and ${reusableFilePaths.length} reusables...`)

  for (let i = 0; i < totalFiles; i++) {
    const filePath = allFilePaths[i]
    const fileContents = fs.readFileSync(filePath, 'utf-8')
    const liquidTokens = getLiquidTokens(fileContents)
    for (const token of liquidTokens) {
      const { name } = token
      const args = token.args.trim()
      if (name === 'data' && args.startsWith('reusables.')) {
        // `reusables.a.b` -> `data/reusables/a/b.md`
        const reusableName = path.join('data', ...args.split('.')) + '.md'
        // Special cases where we don't want them to count as reusables. It's an example in a how-to doc
        if (reusableName.includes('foo/bar.md') || reusableName.includes('your-reusable-name.md')) {
          continue
        }
        // Fast path: exact repo-relative match. Only fall back to the slower
        // lookup (which reports an error and exits when nothing matches) on a miss.
        const reusablePath =
          reusablesByRelativePath.get(reusableName) ?? resolveReusablePath(reusableName)
        usedReusables.add(reusablePath)
      }
    }

    // Report progress each time at least another 5% of the files is done.
    const percentDone = Math.floor(((i + 1) / totalFiles) * 100)
    if (percentDone >= lastLoggedPercent + 5) {
      console.log(`Progress: ${percentDone}% done`)
      lastLoggedPercent = percentDone
    }
  }

  const unusedReusables = reusableFilePaths.filter((filePath) => !usedReusables.has(filePath))

  console.log(`\nFound ${unusedReusables.length} unused reusables:`)
  for (const reusableFilePath of unusedReusables) {
    const printReusablePath = absolute
      ? reusableFilePath
      : getRelativeReusablesPath(reusableFilePath)
    console.log(printReusablePath)
  }
}
/**
 * Print every content/data file that references one specific reusable, with
 * the line numbers of each reference.
 *
 * @param reusablePath - Path (or unambiguous partial path) of the reusable; it
 *   is resolved with `resolveReusablePath`.
 * @param absolute - Print absolute paths instead of repo-relative ones.
 */
export function findUsed(reusablePath: string, { absolute }: { absolute: boolean }) {
  const reusableFilePath = resolveReusablePath(reusablePath)
  const reusableLiquidVar = getReusableLiquidString(reusableFilePath)

  const printReusablePath = absolute ? reusableFilePath : getRelativeReusablesPath(reusableFilePath)

  console.log(`Searching for content files that use ${printReusablePath}...`)

  const allFilePaths = getAllContentFilePaths()

  const filesWithReusables: FilesWithLineNumbers = []

  for (const filePath of allFilePaths) {
    // Skip the reusable file itself
    if (filePath === reusableFilePath) continue

    const fileContents = fs.readFileSync(filePath, 'utf-8')

    const indices = getIndicesOfLiquidVariable(reusableLiquidVar, fileContents)
    if (indices.length > 0) {
      // Convert each character offset to a 1-based line number by counting
      // the newlines that precede it.
      const lineNumbers = indices.map((index) => fileContents.slice(0, index).split('\n').length)

      filesWithReusables.push({
        filePath,
        lineNumbers,
      })
    }
  }

  console.log(`\nFound ${filesWithReusables.length} files that use ${printReusablePath}.`)
  printFindsWithLineNumbers(absolute, filesWithReusables)
}

/**
 * Print the most frequently referenced reusables together with their
 * reference counts, most used first.
 *
 * @param numberOfMostUsedToFind - How many reusables to list. A numeric string
 *   is accepted because command-line arguments arrive as strings.
 * @param absolute - Print absolute paths (resolved against the current working
 *   directory) instead of repo-relative ones.
 */
export function findTopUsed(numberOfMostUsedToFind: number, { absolute }: { absolute: boolean }) {
  // Coerce and validate up front: a NaN or negative limit would otherwise make
  // slice() silently return an empty or truncated list.
  const limit = Number(numberOfMostUsedToFind)
  if (!Number.isInteger(limit) || limit < 1) {
    console.error(
      `Expected a positive whole number of reusables to list, got: ${numberOfMostUsedToFind}`,
    )
    process.exit(1)
  }

  const allFilePaths = getAllContentFilePaths()

  const reusableCounts = new Map<string, number>()
  for (const filePath of allFilePaths) {
    const fileContents = fs.readFileSync(filePath, 'utf-8')
    const liquidTokens = getLiquidTokens(fileContents)
    for (const token of liquidTokens) {
      // Trim so the same reusable is not counted under several keys that
      // differ only in surrounding whitespace.
      const reusable = token.args.trim()
      if (token.name === 'data' && reusable.startsWith('reusables.')) {
        reusableCounts.set(reusable, (reusableCounts.get(reusable) ?? 0) + 1)
      }
    }
  }

  const sortedCounts = Array.from(reusableCounts.entries()).sort((a, b) => b[1] - a[1])

  console.log(`\nTop ${limit} most used reusables:`)
  let i = 0
  for (const [reusable, count] of sortedCounts.slice(0, limit)) {
    // `reusables.a.b` -> `data/reusables/a/b.md`
    let printReusablePath = path.join('data', ...reusable.split('.')) + '.md'
    if (absolute) {
      printReusablePath = path.resolve(printReusablePath)
    }
    console.log(`#${`${++i}.`.padEnd(3)} ${count} uses of ${printReusablePath}`)
  }
}
/** Exact-match findings: one entry per file, with the 1-based lines where matches start. */
export type FilesWithLineNumbers = {
  filePath: string
  lineNumbers: number[]
  reusableFile?: string
}[]

/** Similarity findings: one entry per file/reusable pair that scored above the threshold. */
export type FilesWithSimilarity = {
  filePath: string
  similarityScore: number
  reusableFile?: string
}[]

/**
 * Keep only Markdown (`.md`) and YAML (`.yml`) files, dropping any path that
 * ends in `README.md`.
 *
 * @param files - Candidate file paths.
 * @returns The paths that passed the filter, in their original order.
 */
export function filterFiles(files: string[]) {
  return files.filter(
    (filePath) =>
      // The README exclusion must apply to the whole extension test. When it is
      // attached only to the `.yml` alternative it can never be true, and
      // README.md files slip through as content.
      (filePath.endsWith('.md') || filePath.endsWith('.yml')) && !filePath.endsWith('README.md'),
  )
}

/**
 * Collect every Markdown/YAML file under the content directory followed by
 * every one under the data directory.
 *
 * @returns File paths including their base directory.
 */
export function getAllContentFilePaths() {
  const allContentFiles = filterFiles(
    walk(contentDirectory, {
      includeBasePath: true,
      directories: false,
    }),
  )

  const allDataFiles = filterFiles(
    walk(dataDirectory, {
      includeBasePath: true,
      directories: false,
    }),
  )

  return [...allContentFiles, ...allDataFiles]
}

/**
 * Build the Liquid variable that refers to a reusable file, e.g.
 * `<reusables dir>/accounts/create-account.md` -> `reusables.accounts.create-account`.
 *
 * @param reusablePath - Path of a file inside the reusables directory.
 * @returns The dotted `reusables.…` variable name.
 */
export function getReusableLiquidString(reusablePath: string): string {
  const relativePath = path.relative(reusablesDirectory, reusablePath)
  // Strip whatever extension is present rather than assuming a 3-character
  // ".md", and accept either path separator so the result is the same on Windows.
  const extension = path.extname(relativePath)
  const withoutExtension = relativePath.slice(0, relativePath.length - extension.length)
  return `reusables.${withoutExtension.split(/[\\/]/).join('.')}`
}

/**
 * Find where a Liquid `data` tag referencing `liquidVariable` starts in a file.
 *
 * @param liquidVariable - Variable to look for, e.g. `reusables.accounts.create-account`.
 * @param fileContents - Text of the file to scan.
 * @returns The `begin` offset of each matching tag, or an empty array when the
 *   file cannot be tokenized.
 * @throws Any error from tokenizing other than `TokenizationError`.
 */
export function getIndicesOfLiquidVariable(liquidVariable: string, fileContents: string): number[] {
  const indices: number[] = []
  try {
    for (const token of getLiquidTokens(fileContents)) {
      if (token.name === 'data' && token.args.trim() === liquidVariable) {
        indices.push(token.begin)
      }
    }
  } catch (err) {
    // A file with malformed Liquid is treated as containing no references.
    if (err instanceof TokenizationError) return []
    throw err
  }

  return indices
}

// Find the path to a reusable file.
/**
 * Find the path to a reusable file.
 *
 * A full path, a path relative to the reusables directory, or just a file name
 * may be given; the first reusable path that contains it wins as long as it is
 * the only one. `.md` is appended when the input has neither a `.md` nor a
 * `.yml` extension.
 *
 * Prints an error and exits the process with status 1 when no reusable, or
 * more than one, matches.
 *
 * @param reusablePath - Full or partial path of the reusable.
 * @returns The matching path as listed by `getAllReusablesFilePaths`.
 */
export function resolveReusablePath(reusablePath: string): string {
  // Try .md if extension is not provided
  if (!reusablePath.endsWith('.md') && !reusablePath.endsWith('.yml')) {
    reusablePath += '.md'
  }

  // Allow user to just pass the name of the file. If it's not ambiguous, we'll find it.
  const foundPaths = getAllReusablesFilePaths().filter((possiblePath) =>
    possiblePath.includes(reusablePath),
  )

  if (foundPaths.length === 0) {
    console.error(`Reusables file not found: ${reusablePath}`)
    process.exit(1)
  } else if (foundPaths.length === 1) {
    return foundPaths[0]
  } else {
    console.error(`Multiple reusables found by name: ${reusablePath}`)
    for (let i = 0; i < foundPaths.length; i++) {
      console.error(` ${i + 1}: ${getRelativeReusablesPath(foundPaths[i])}`)
    }
    console.error('Please specify which reusable by passing the full path')
    process.exit(1)
  }
}

/**
 * List every Markdown/YAML file under the reusables directory.
 *
 * @returns File paths including their base directory.
 */
export function getAllReusablesFilePaths(): string[] {
  return filterFiles(
    walk(reusablesDirectory, {
      includeBasePath: true,
      directories: false,
    }),
  )
}

/**
 * Find every (possibly overlapping) case-insensitive occurrence of `substr`
 * in `str`.
 *
 * @param substr - Text to look for. An empty string yields no matches.
 * @param str - Text to search in.
 * @returns Start offsets of each occurrence, ascending. Offsets index the
 *   lower-cased copy of `str`, which lines up with `str` itself except for the
 *   rare characters whose lower-case form has a different length.
 */
export function findIndicesOfSubstringInString(substr: string, str: string): number[] {
  // Both sides must be folded to the same case; folding only the haystack
  // means a needle containing any uppercase letter can never match.
  const needle = substr.toLowerCase()
  const haystack = str.toLowerCase()

  const result: number[] = []

  // indexOf('', n) never returns -1 (it clamps to the string length), so an
  // empty needle would keep the loop below running forever.
  if (needle.length === 0) return result

  let idx = haystack.indexOf(needle)

  while (idx !== -1) {
    result.push(idx)
    // Advance by one, not by the needle length, so overlapping matches are kept.
    idx = haystack.indexOf(needle, idx + 1)
  }
  return result
}

/**
 * Score how much the words of `substr` overlap with the words of `str`.
 *
 * Both texts are split into "sentences" on `.` and lower-cased. For every
 * pair (sentence of `substr`, sentence of `str`) the number of words of the
 * former that also occur in the latter is divided by the combined word count
 * of the pair, and the quotients are summed.
 *
 * @param substr - The shorter text, e.g. a reusable.
 * @param str - The text to compare against, e.g. an article.
 * @returns The summed score divided by the number of `substr` sentences,
 *   multiplied by the number of `str` sentences, and rounded. The value is not
 *   bounded to a fixed range.
 */
export function findSimilarSubStringInString(substr: string, str: string) {
  // Take every sentence in the substr, lower case it, and compare it to every sentence in the str to get a similarity score
  const substrSentences = substr.split('.').map((sentence) => sentence.toLowerCase())
  const corpus = str.split('.').map((sentence) => sentence.toLowerCase())

  let similarityScore = 0

  // Find how similar every two strings are based on the words they share
  for (const substrSentence of substrSentences) {
    // Loop-invariant for the inner loop, so split once per reusable sentence.
    const substrTokens = substrSentence.split(' ')
    for (const sentence of corpus) {
      const tokens = sentence.split(' ')

      const sharedWords = substrTokens.filter((token) => tokens.includes(token))

      similarityScore += sharedWords.length / (substrTokens.length + tokens.length)
    }
  }

  // Normalize the similarity score
  return Math.round((similarityScore / substrSentences.length) * corpus.length)
}

/**
 * Print findings to stdout.
 *
 * Exact findings are printed first, one block per file with one line per
 * match. Similarity findings, when given and non-empty, follow under their own
 * heading. An entry that carries a `reusableFile` is introduced with the name
 * of that reusable.
 *
 * @param absolute - Print paths as stored instead of relative to the repo root.
 * @param reusableFindings - Exact matches with line numbers.
 * @param similarityFindings - Optional similarity matches with their scores.
 */
export function printFindsWithLineNumbers(
  absolute: boolean,
  reusableFindings: { filePath: string; lineNumbers: number[]; reusableFile?: string }[],
  similarityFindings?: { filePath: string; similarityScore: number; reusableFile?: string }[],
) {
  for (const { filePath, lineNumbers, reusableFile } of reusableFindings) {
    let printReusablePath = reusableFile
    let printFilePath = filePath
    if (!absolute) {
      // getRelativeReusablesPath returns '' for an empty path, so a missing
      // reusableFile is safe here; it is only printed when present anyway.
      printReusablePath = getRelativeReusablesPath(reusableFile ?? '')
      printFilePath = path.relative(repoRoot, printFilePath)
    }
    if (reusableFile) {
      console.log(`\nReusable ${printReusablePath} can be used`)
      console.log(`In ${printFilePath} on:`)
    } else {
      console.log(`\nIn ${printFilePath} on:`)
    }
    for (const lineNumber of lineNumbers) {
      console.log(` Line ${lineNumber}`)
    }
  }

  if (similarityFindings?.length) {
    console.log('\nFindings using "similar" algorithm:')
    for (const { filePath, similarityScore, reusableFile } of similarityFindings) {
      let printReusablePath = reusableFile
      let printFilePath = filePath
      if (!absolute) {
        printReusablePath = getRelativeReusablesPath(reusableFile ?? '')
        printFilePath = path.relative(repoRoot, printFilePath)
      }
      if (reusableFile) {
        console.log(`\nReusables ${printReusablePath} can be used`)
        console.log(`In ${printFilePath} with similarity score: ${similarityScore}`)
      } else {
        console.log(`\nIn ${printFilePath} with similarity score: ${similarityScore}`)
      }
    }
  }
}

/**
 * Express a reusable's path relative to the repository root.
 *
 * @param reusablePath - Path of a reusable file.
 * @returns The repo-relative path, or `''` when `reusablePath` is empty.
 */
export function getRelativeReusablesPath(reusablePath: string) {
  if (!reusablePath) {
    return ''
  }
  return path.relative(repoRoot, reusablePath)
}