1
0
mirror of synced 2026-01-30 15:01:41 -05:00

add reusables helper CLI (#50800)

Co-authored-by: Peter Bengtsson <mail@peterbe.com>
Co-authored-by: Peter Bengtsson <peterbe@github.com>
This commit is contained in:
Evan Bonsignori
2024-07-30 11:47:05 -07:00
committed by GitHub
parent 74195dbcfd
commit 3b16d72ea9
8 changed files with 632 additions and 0 deletions

View File

@@ -0,0 +1,71 @@
// Usage: npm run reusables -- --help
// Usage: npm run reusables -- find used accounts/create-account.md
// Usage: npm run reusables -- find unused
// Usage: npm run reusables -- find potential-uses
// Usage: npm run reusables -- find top-used
import { Command } from 'commander'
import { findTopUsed, findUsed } from './reusables-cli/find/used'
import { findPotentialUses } from './reusables-cli/find/potential-uses'
import { findUnused } from './reusables-cli/find/unused'
// Default score cutoff for `find potential-uses`; overridable with -t/--threshold.
const defaultSimilarityThreshold = 10000
// Default number of entries for `find top-used` when no count argument is given.
const defaultTopUsedCount = 10
// Help text shared by the -a/--absolute flag of every subcommand.
const absolutePathDescription = 'Show absolute paths in output instead of relative path to repo'
const program = new Command()
program
.name('reusables-helper-cli')
.description('Tools to help with reusable Docs content snippets')
// Every subcommand is registered under `find`, e.g. `npm run reusables -- find used <reusable-path>`.
const findCommand = program.command('find')
// `find used <reusable-path>`: handled by findUsed.
findCommand
.command('used')
.description('Find all content files that use a specific reusable.')
.argument(
'<reusable-path>',
'Path to the reusable file relative to content/data/reusables, e.g. "accounts/create-account.md".',
)
.option('-a --absolute', absolutePathDescription, false)
.action(findUsed)
// `find top-used [count]`: handled by findTopUsed.
// NOTE(review): no parse function is registered for this argument, so a count typed by the
// user presumably reaches findTopUsed as a string rather than a number — confirm.
findCommand
.command('top-used')
.description('Find the top x most used reusables.')
.argument(
'[number-of-most-used-to-find]',
'Number of most used reusables to find.',
defaultTopUsedCount,
)
.option('-a --absolute', absolutePathDescription, false)
.action(findTopUsed)
// `find unused`: handled by findUnused; takes no positional argument.
findCommand
.command('unused')
.description(
'Find all reusables that are not used in any content files. WARNING: This command may take a long time to run.',
)
.option('-a --absolute', absolutePathDescription, false)
.action(findUnused)
// `find potential-uses`: handled by findPotentialUses.
findCommand
.command('potential-uses')
.option(
'-s, --similar',
'Find files where contents loosely matches a reusable instead of an exact match.',
)
// The threshold value is parsed with parseFloat and falls back to defaultSimilarityThreshold.
.option(
'-t, --threshold <number>',
'Similarity threshold for similar reusables. e.g. 10000. This requires the --similar flag and some experimentation to find a useful value.',
parseFloat,
defaultSimilarityThreshold,
)
.option('-a --absolute', absolutePathDescription, false)
.description(
'Find all content files that could use any reusables, but do not. WARNING: This command may take a long time to run.',
)
.action(findPotentialUses)
// Parse the process arguments and dispatch to the matching action.
program.parse()

View File

@@ -0,0 +1,132 @@
# Reusables CLI
Helpful CLI tool for making it easier to work with `data/reusables`.
Helps find where reusables are already used, and where they could be used.
## Usage
`npm run reusables -- --help` to see commands
## Commands:
`npm run reusables --`:
- [`find used <reusable-path>`](#command-npm-run-reusables----find-used-reusable-path)
- [`find top-used [number-of-most-used-to-find]`](#command-npm-run-reusables----find-top-used-number-of-most-used-to-find)
- [`find unused`](#command-npm-run-reusables----find-unused)
- [`find potential-uses`](#command-npm-run-reusables----find-potential-uses)
### Command: `npm run reusables -- find used <reusable-path>`
Find where a specific reusable is used
#### Example
`npm run reusables -- find used copilot/signup-procedure.md`
```
Searching for content files that use data/reusables/copilot/signup-procedure.md...
Found 2 files that use data/reusables/copilot/signup-procedure.md.
In content/billing/managing-billing-for-github-copilot/managing-your-github-copilot-individual-subscription.md on:
Line 35
In content/copilot/quickstart.md on:
Line 29
```
### Command: `npm run reusables -- find top-used [number-of-most-used-to-find]`
Find top X (default 10) most used reusables and the number of times they are used.
#### Example
`npm run reusables -- find top-used 5`
```
Searching for the top 5 most used reusables...
0/3225 reusables checked...
100/3225 reusables checked...
(etc, etc)
3225/3225 reusables checked...
Top 5 most used reusables:
#1. 318 uses of data/reusables/repositories/navigate-to-repo.md
#2. 286 uses of data/reusables/profile/access_org.md
#3. 212 uses of data/reusables/enterprise-accounts/access-enterprise.md
#4. 193 uses of data/reusables/profile/org_settings.md
#5. 171 uses of data/reusables/actions/action-checkout.md
```
### Command: `npm run reusables -- find unused`
Find which reusables aren't used in any content files.
This will take ~10+ minutes to run locally. You will be updated at each 5% interval.
#### Example
`npm run reusables -- find unused`
```
Searching 6468 files and 3225 reusables...
Progress: 5% done
Progress: 10% done
Progress: 15% done
...
Found 111 unused reusables:
data/reusables/actions/action-labeler.md
data/reusables/actions/actions-audit-events-for-enterprise.md
data/reusables/actions/actions-audit-events-workflow.md
data/reusables/actions/cache-no-org-policy.md
data/reusables/actions/configure-runner-group-access.md
...
```
### Command: `npm run reusables -- find potential-uses`
Find files in which existing reusables could be used.
The command does this by searching every `content/` & `data/` file for strings that match every reusable that isn't ignored in `src/content-render/scripts/reusables-cli/ignore-reusables.ts`.
#### Example
`npm run reusables -- find potential-uses`
```
Searching 6468 files for potential reusable use...
0/3225 reusables checked...
100/3225 reusables checked...
(etc, etc)
3223/3225 reusables checked...
Found 13 files that could use reusables.
Reusable data/reusables/actions/action-labeler.md can be used
In content/actions/using-workflows/reusing-workflows.md on:
Line 146
Line 188
(cont.)
```
#### Ignoring reusables
Some reusables might not make sense to "reuse" everywhere they could be reused. For instance, at the time of writing there is a reusable that is just the number "30" which shows up in certain files, but doesn't make sense to be replaced with a reusable.
In these cases you can exclude these reusables from the `find potential-uses` command by adding their paths to the array in [src/content-render/scripts/reusables-cli/ignore-reusables.ts](./ignore-reusables.ts).
#### Similarity search
This may or may not be a useful search. It does a looser search to find places where the reusable _may_ be usable. You can include this type of search with the `-s` flag. A file is reported only when its similarity score is above the threshold, so you can alter the threshold with the `-t` flag to show more (lower number) or fewer (higher number) potential results.
The threshold is a number that finds how similar the words in the reusable are to the words in a given article.
A good starting threshold is `15000` (when `-t` is omitted, the CLI uses `10000`). You can experiment with a higher/lower number if you aren't getting good results.
e.g. `npm run reusables -- find potential-uses -s -t 15000`

View File

@@ -0,0 +1,99 @@
import fs from 'fs'
import {
FilesWithLineNumbers,
FilesWithSimilarity,
findIndicesOfSubstringInString,
findSimilarSubStringInString,
getAllContentFilePaths,
getAllReusablesFilePaths,
getRelativeReusablesPath,
printFindsWithLineNumbers,
} from '../shared'
import { reusablesToIgnore } from '../ignore-reusables'
/**
 * Searches every content and data file for text that duplicates the contents
 * of a reusable, i.e. places where the reusable could be referenced instead.
 *
 * Empty reusables and reusables listed in `reusablesToIgnore` are skipped.
 *
 * @param similar - When true, also score each file with the looser
 *   word-overlap comparison and report files scoring above `threshold`.
 * @param threshold - Minimum (exclusive) similarity score for a file to be
 *   reported by the similarity search.
 * @param absolute - Print absolute paths instead of repo-relative paths.
 */
export function findPotentialUses({
  similar,
  threshold,
  absolute,
}: {
  similar?: boolean
  threshold: number
  absolute: boolean
}) {
  const reusableFiles = getAllReusablesFilePaths()
  const allFilePaths = getAllContentFilePaths()
  const filesThatCouldUseReusable: FilesWithLineNumbers = []
  const filesThatCouldUseReusableSimilar: FilesWithSimilarity = []

  // Read all content & data files into memory once; each one is scanned per reusable.
  const allFileContents = allFilePaths.map((filePath) => ({
    filePath,
    fileContents: fs.readFileSync(filePath, 'utf-8'),
  }))

  console.log(`Searching ${allFileContents.length} files for potential reusable use...`)
  if (similar) {
    console.log('Using similarity search, this may take a while...')
  }

  let reusableCount = 0
  for (const reusableFilePath of reusableFiles) {
    // Trim once up front; surrounding whitespace should not prevent a match.
    const reusableContents = fs.readFileSync(reusableFilePath, 'utf-8').trim()
    const reusableRelativeFilePath = getRelativeReusablesPath(reusableFilePath)

    if (!reusableContents) {
      if (!absolute) {
        console.log(`Skipping empty reusable file: ${reusableRelativeFilePath}`)
      } else {
        console.log(`Skipping empty reusable file: ${reusableFilePath}`)
      }
      continue
    }
    if (reusablesToIgnore.includes(reusableRelativeFilePath)) {
      continue
    }

    if (reusableCount % 100 === 0) {
      console.log(`${reusableCount}/${reusableFiles.length} reusables checked...`)
    }
    reusableCount += 1

    for (const { filePath, fileContents } of allFileContents) {
      // Skip the reusable file itself
      if (filePath === reusableFilePath) continue

      const indices = findIndicesOfSubstringInString(reusableContents, fileContents)
      if (indices.length > 0) {
        // Convert each match offset into a 1-based line number.
        const lineNumbers = indices.map((index) => fileContents.slice(0, index).split('\n').length)
        filesThatCouldUseReusable.push({
          filePath,
          lineNumbers,
          reusableFile: reusableFilePath,
        })
      }

      if (similar) {
        const similarityScore = findSimilarSubStringInString(reusableContents, fileContents)
        if (similarityScore > threshold) {
          filesThatCouldUseReusableSimilar.push({
            filePath,
            similarityScore,
            reusableFile: reusableFilePath,
          })
        }
      }
    }
  }

  console.log(`${reusableCount}/${reusableFiles.length} reusables checked...`)
  console.log(`\nFound ${filesThatCouldUseReusable.length} files that could use reusables.`)
  // Pass the similarity findings too so that `--similar` results are actually reported.
  printFindsWithLineNumbers(absolute, filesThatCouldUseReusable, filesThatCouldUseReusableSimilar)
}

View File

@@ -0,0 +1,54 @@
import fs from 'fs'
import path from 'path'
import { getLiquidTokens } from '@/content-linter/lib/helpers/liquid-utils.js'
import {
getAllContentFilePaths,
getAllReusablesFilePaths,
getRelativeReusablesPath,
resolveReusablePath,
} from '../shared'
/**
 * Prints every reusable file that is not referenced by a
 * `{% data reusables.… %}` tag in any content or data file.
 *
 * Progress is logged each time at least another 5% of the files has been scanned.
 *
 * @param absolute - Print absolute paths instead of repo-relative paths.
 */
export function findUnused({ absolute }: { absolute: boolean }) {
  const reusableFilePaths = getAllReusablesFilePaths()
  const allFilePaths = getAllContentFilePaths()
  const usedReusables = new Set<string>()
  // resolveReusablePath lists the whole reusables directory on every call, so
  // remember each resolution instead of repeating it for every reference.
  const resolvedPaths = new Map<string, string>()
  const totalFiles = allFilePaths.length
  let lastLoggedPercent = 0

  console.log(`Searching ${totalFiles} files and ${reusableFilePaths.length} reusables...`)

  for (let i = 0; i < totalFiles; i++) {
    const filePath = allFilePaths[i]
    const fileContents = fs.readFileSync(filePath, 'utf-8')
    const liquidTokens = getLiquidTokens(fileContents)

    for (const token of liquidTokens) {
      const { name } = token
      // Trim so surrounding whitespace in the tag does not leak into the path.
      const args = token.args.trim()
      if (name !== 'data' || !args.startsWith('reusables.')) continue

      const reusableName = path.join('data', ...args.split('.')) + '.md'
      // Special cases where we don't want them to count as reusables. It's an example in a how-to doc
      if (reusableName.includes('foo/bar.md') || reusableName.includes('your-reusable-name.md')) {
        continue
      }

      let reusablePath = resolvedPaths.get(reusableName)
      if (reusablePath === undefined) {
        reusablePath = resolveReusablePath(reusableName)
        resolvedPaths.set(reusableName, reusablePath)
      }
      usedReusables.add(reusablePath)
    }

    const percentDone = Math.floor(((i + 1) / totalFiles) * 100)
    if (percentDone >= lastLoggedPercent + 5) {
      console.log(`Progress: ${percentDone}% done`)
      lastLoggedPercent = percentDone
    }
  }

  const unusedReusables = reusableFilePaths.filter((filePath) => !usedReusables.has(filePath))
  console.log(`\nFound ${unusedReusables.length} unused reusables:`)
  for (const reusableFilePath of unusedReusables) {
    const printReusablePath = absolute
      ? reusableFilePath
      : getRelativeReusablesPath(reusableFilePath)
    console.log(printReusablePath)
  }
}

View File

@@ -0,0 +1,74 @@
import fs from 'fs'
import path from 'path'
import { getLiquidTokens } from '@/content-linter/lib/helpers/liquid-utils.js'
import {
FilesWithLineNumbers,
getAllContentFilePaths,
getIndicesOfLiquidVariable,
getRelativeReusablesPath,
getReusableLiquidString,
printFindsWithLineNumbers,
resolveReusablePath,
} from '../shared'
/**
 * Lists every content/data file that references the given reusable, together
 * with the line number of each reference.
 *
 * @param reusablePath - Path or file name of the reusable; resolved with resolveReusablePath.
 * @param absolute - Print absolute paths instead of repo-relative paths.
 */
export function findUsed(reusablePath: string, { absolute }: { absolute: boolean }) {
  const targetPath = resolveReusablePath(reusablePath)
  const liquidName = getReusableLiquidString(targetPath)
  const displayPath = absolute ? targetPath : getRelativeReusablesPath(targetPath)
  console.log(`Searching for content files that use ${displayPath}...`)

  // 1-based line number of the character at `offset` within `text`.
  const lineNumberAt = (text: string, offset: number) => text.slice(0, offset).split('\n').length

  const matches: FilesWithLineNumbers = []
  for (const candidatePath of getAllContentFilePaths()) {
    // The reusable cannot be a user of itself.
    if (candidatePath === targetPath) {
      continue
    }
    const text = fs.readFileSync(candidatePath, 'utf-8')
    const offsets = getIndicesOfLiquidVariable(liquidName, text)
    if (offsets.length === 0) {
      continue
    }
    matches.push({
      filePath: candidatePath,
      lineNumbers: offsets.map((offset) => lineNumberAt(text, offset)),
    })
  }

  console.log(`\nFound ${matches.length} files that use ${displayPath}.`)
  printFindsWithLineNumbers(absolute, matches)
}
/**
 * Prints the most frequently referenced reusables and how often each is used.
 *
 * @param numberOfMostUsedToFind - How many reusables to list. A value typed on
 *   the command line may arrive as a string, so it is coerced and validated
 *   here; anything that is not a positive integer is reported and the process
 *   exits with code 1.
 * @param absolute - Print absolute paths instead of repo-relative paths.
 */
export function findTopUsed(numberOfMostUsedToFind: number, { absolute }: { absolute: boolean }) {
  // Validate before scanning any files so a bad argument fails fast.
  const limit = Number(numberOfMostUsedToFind)
  if (!Number.isInteger(limit) || limit < 1) {
    console.error(`Invalid number of most used reusables to find: ${numberOfMostUsedToFind}`)
    process.exit(1)
  }

  const allFilePaths = getAllContentFilePaths()
  const reusableCounts = new Map<string, number>()
  for (const filePath of allFilePaths) {
    const fileContents = fs.readFileSync(filePath, 'utf-8')
    const liquidTokens = getLiquidTokens(fileContents)
    for (const token of liquidTokens) {
      const { name } = token
      // Trim so the same reusable is not counted under two differently padded keys.
      const args = token.args.trim()
      if (name === 'data' && args.startsWith('reusables.')) {
        reusableCounts.set(args, (reusableCounts.get(args) ?? 0) + 1)
      }
    }
  }

  // Most used first.
  const sortedCounts = [...reusableCounts.entries()].sort((a, b) => b[1] - a[1])

  console.log(`\nTop ${limit} most used reusables:`)
  let i = 0
  for (const [reusable, count] of sortedCounts.slice(0, limit)) {
    // Turn the liquid reference "reusables.a.b" back into "data/reusables/a/b.md".
    let printReusablePath = path.join('data', ...reusable.split('.')) + '.md'
    if (absolute) {
      printReusablePath = path.resolve(printReusablePath)
    }
    console.log(`#${`${++i}.`.padEnd(3)} ${count} uses of ${printReusablePath}`)
  }
}

View File

@@ -0,0 +1,5 @@
// List of reusables to ignore when checking for potential uses of reusables
// Make sure paths are relative to the root of the repo
// Each entry is compared for exact equality with a reusable's repo-relative path,
// so it must be written in full, starting with "data/reusables/".
export const reusablesToIgnore = [
'data/reusables/copilot/trial-period.md', // Just a number, so it pops up in unrelated files
]

View File

@@ -0,0 +1,196 @@
import walk from 'walk-sync'
import path from 'path'
import { TokenizationError } from 'liquidjs'
import { getLiquidTokens } from '@/content-linter/lib/helpers/liquid-utils'
// Directory that contains this module, derived from the module URL.
// NOTE(review): URL#pathname is not percent-decoded, so a checkout path containing
// spaces or other escaped characters would presumably resolve incorrectly — confirm.
const __dirname = path.dirname(new URL(import.meta.url).pathname)
// Four directory levels above this module; used as the repository root.
const repoRoot = path.resolve(__dirname, '../../../../')
// Absolute paths of the top-level content/ and data/ directories.
const contentDirectory = path.resolve(__dirname, repoRoot, 'content/')
const dataDirectory = path.resolve(__dirname, repoRoot, 'data/')
// Absolute path of data/reusables/, where the reusable files live.
const reusablesDirectory = path.resolve(dataDirectory, 'reusables/')
/**
 * Exact-match findings: a file, the line numbers of the matches within it and,
 * optionally, the reusable file the matches relate to.
 */
export type FilesWithLineNumbers = {
filePath: string
lineNumbers: number[]
reusableFile?: string
}[]
/**
 * Similarity-search findings: a file, its similarity score and, optionally,
 * the reusable file it was scored against.
 */
export type FilesWithSimilarity = {
filePath: string
similarityScore: number
reusableFile?: string
}[]
/**
 * Keeps only the Markdown (`.md`) and YAML (`.yml`) files, excluding any file
 * whose path ends with `README.md`.
 *
 * @param files - File paths to filter.
 * @returns The matching paths, in their original order.
 */
export function filterFiles(files: string[]) {
  return files.filter((filePath) => {
    // The README exclusion must apply to the whole extension test, not only to
    // the `.yml` half of it, otherwise README.md files are never excluded.
    if (filePath.endsWith('README.md')) return false
    return filePath.endsWith('.md') || filePath.endsWith('.yml')
  })
}
/**
 * Collects the paths of all files under the content directory followed by all
 * files under the data directory, after passing each list through filterFiles.
 */
export function getAllContentFilePaths() {
  // List files only (no directory entries), with the base directory kept in each path.
  const listFiles = (directory: string) =>
    filterFiles(walk(directory, { includeBasePath: true, directories: false }))

  const contentFiles = listFiles(contentDirectory)
  const dataFiles = listFiles(dataDirectory)
  return contentFiles.concat(dataFiles)
}
/**
 * Builds the liquid variable name that content files use to reference a
 * reusable, e.g. `<reusables dir>/accounts/create-account.md` becomes
 * `reusables.accounts.create-account`.
 *
 * @param reusablePath - Path of the reusable file.
 * @returns The dotted name, prefixed with `reusables.`.
 */
export function getReusableLiquidString(reusablePath: string): string {
  const relativePath = path.relative(reusablesDirectory, reusablePath)
  // Strip whatever extension is present instead of assuming a 3-character ".md".
  const extension = path.extname(relativePath)
  const withoutExtension = extension ? relativePath.slice(0, -extension.length) : relativePath
  // path.relative uses the platform separator, so split on that rather than on "/".
  return `reusables.${withoutExtension.split(path.sep).join('.')}`
}
/**
 * Finds every `data` liquid tag in `fileContents` whose trimmed arguments equal
 * `liquidVariable`.
 *
 * @param liquidVariable - Variable name to look for, e.g. `reusables.foo.bar`.
 * @param fileContents - Text to tokenize and search.
 * @returns The `begin` offset of each matching token, or an empty array when
 *   tokenizing the text raises a TokenizationError.
 */
export function getIndicesOfLiquidVariable(liquidVariable: string, fileContents: string): number[] {
  try {
    const offsets: number[] = []
    for (const token of getLiquidTokens(fileContents)) {
      if (token.name !== 'data') continue
      if (token.args.trim() !== liquidVariable) continue
      offsets.push(token.begin)
    }
    return offsets
  } catch (error) {
    // Text that cannot be tokenized is treated as containing no references.
    if (!(error instanceof TokenizationError)) {
      throw error
    }
    return []
  }
}
/**
 * Resolves a user-supplied reusable name or partial path to the path of the
 * one reusable file that contains it.
 *
 * `.md` is appended when the input ends with neither `.md` nor `.yml`. When no
 * file or more than one file matches, an error is printed and the process
 * exits with code 1.
 *
 * @param reusablePath - File name or partial path of the reusable.
 * @returns The path of the single matching reusable file.
 */
export function resolveReusablePath(reusablePath: string): string {
  const hasKnownExtension = reusablePath.endsWith('.md') || reusablePath.endsWith('.yml')
  const searchTerm = hasKnownExtension ? reusablePath : reusablePath + '.md'

  // A bare file name is accepted as long as it identifies exactly one file.
  const candidates = getAllReusablesFilePaths().filter((candidate) =>
    candidate.includes(searchTerm),
  )

  if (candidates.length === 1) {
    return candidates[0]
  }

  if (candidates.length === 0) {
    console.error(`Reusables file not found: ${searchTerm}`)
    process.exit(1)
  }

  console.error(`Multiple reusables found by name: ${searchTerm}`)
  candidates.forEach((candidate, position) => {
    console.error(` ${position + 1}: ${getRelativeReusablesPath(candidate)}`)
  })
  console.error('Please specify which reusable by passing the full path')
  process.exit(1)
}
/**
 * Lists the paths of all files under the reusables directory that pass
 * filterFiles.
 */
export function getAllReusablesFilePaths(): string[] {
  // Files only (no directory entries), with the base directory kept in each path.
  const everyFile = walk(reusablesDirectory, { includeBasePath: true, directories: false })
  return filterFiles(everyFile)
}
/**
 * Finds every case-insensitive occurrence of `substr` in `str`, including
 * overlapping ones.
 *
 * @param substr - Text to search for. An empty string yields no matches.
 * @param str - Text to search in.
 * @returns The start offset of each occurrence, in ascending order.
 */
export function findIndicesOfSubstringInString(substr: string, str: string): number[] {
  // An empty needle matches at every offset and the search would never terminate.
  if (!substr) return []

  // Lower-case BOTH sides; lower-casing only the haystack means a needle
  // containing an uppercase character can never match.
  const needle = substr.toLowerCase()
  const haystack = str.toLowerCase()

  const result: number[] = []
  let idx = haystack.indexOf(needle)
  while (idx !== -1) {
    result.push(idx)
    // Advance by one (not by needle length) so overlapping matches are found.
    idx = haystack.indexOf(needle, idx + 1)
  }
  return result
}
/**
 * Scores how much the words of `substr` overlap with the words of `str`.
 *
 * Both inputs are split into "sentences" on `.` and lower-cased, and each
 * sentence is split into words on single spaces. For every pair of sentences
 * the number of `substr` words that also occur in the other sentence, divided
 * by the combined word count of the pair, is added to a running total. The
 * total is divided by the number of `substr` sentences, multiplied by the
 * number of `str` sentences, and rounded.
 *
 * @param substr - Text whose words are looked for.
 * @param str - Text to compare against.
 * @returns The rounded similarity score.
 */
export function findSimilarSubStringInString(substr: string, str: string) {
  const toWordLists = (text: string) =>
    text.split('.').map((sentence) => sentence.toLowerCase().split(' '))

  const needleSentences = toWordLists(substr)
  const corpusSentences = toWordLists(str).map((words) => ({
    wordCount: words.length,
    vocabulary: new Set(words),
  }))

  // A single accumulator, fed in the same pair order, keeps the floating-point sum stable.
  let total = 0
  needleSentences.forEach((needleWords) => {
    corpusSentences.forEach(({ wordCount, vocabulary }) => {
      let shared = 0
      for (const word of needleWords) {
        if (vocabulary.has(word)) shared += 1
      }
      total += shared / (needleWords.length + wordCount)
    })
  })

  // Normalize the similarity score
  return Math.round((total / needleSentences.length) * corpusSentences.length)
}
/**
 * Prints exact-match findings (file plus line numbers) and, when any are
 * supplied, similarity findings (file plus score).
 *
 * @param absolute - Print paths as given; otherwise print them relative to the repo root.
 * @param reusableFindings - Exact-match findings to print.
 * @param similarityFindings - Optional similarity findings, printed under their own heading.
 */
export function printFindsWithLineNumbers(
  absolute: boolean,
  reusableFindings: { filePath: string; lineNumbers: number[]; reusableFile?: string }[],
  similarityFindings?: { filePath: string; similarityScore: number; reusableFile?: string }[],
) {
  // Display form of a finding's file path and, when present, its reusable path.
  const displayPaths = (filePath: string, reusableFile?: string) =>
    absolute
      ? { shownFile: filePath, shownReusable: reusableFile }
      : {
          shownReusable: getRelativeReusablesPath(reusableFile as string),
          shownFile: path.relative(repoRoot, filePath),
        }

  reusableFindings.forEach(({ filePath, lineNumbers, reusableFile }) => {
    const { shownFile, shownReusable } = displayPaths(filePath, reusableFile)
    if (reusableFile) {
      console.log(`\nReusable ${shownReusable} can be used`)
      console.log(`In ${shownFile} on:`)
    } else {
      console.log(`\nIn ${shownFile} on:`)
    }
    lineNumbers.forEach((lineNumber) => {
      console.log(` Line ${lineNumber}`)
    })
  })

  if (!similarityFindings || similarityFindings.length === 0) {
    return
  }

  console.log('\nFindings using "similar" algorithm:')
  similarityFindings.forEach(({ filePath, similarityScore, reusableFile }) => {
    const { shownFile, shownReusable } = displayPaths(filePath, reusableFile)
    if (reusableFile) {
      console.log(`\nReusables ${shownReusable} can be used`)
      console.log(`In ${shownFile} with similarity score: ${similarityScore}`)
    } else {
      console.log(`\nIn ${shownFile} with similarity score: ${similarityScore}`)
    }
  })
}
/**
 * Converts a reusable's path into a path relative to the repository root.
 *
 * @param reusablePath - Path of the reusable file.
 * @returns The repo-relative path, or an empty string when `reusablePath` is falsy.
 */
export function getRelativeReusablesPath(reusablePath: string) {
  return reusablePath ? path.relative(repoRoot, reusablePath) : ''
}