From f6a95d5b79cc39f0e8d9abbb04da5ff285576dd3 Mon Sep 17 00:00:00 2001 From: Peter Bengtsson Date: Thu, 16 May 2024 08:18:53 -0400 Subject: [PATCH] Script to turn all documents to a JSON file (#50260) Co-authored-by: docs-bot --- .github/workflows/all-documents.yml | 42 ++++++ package.json | 1 + .../scripts/all-documents/cli.ts | 137 ++++++++++++++++++ .../scripts/all-documents/lib.ts | 118 +++++++++++++++ 4 files changed, 298 insertions(+) create mode 100644 .github/workflows/all-documents.yml create mode 100644 src/content-render/scripts/all-documents/cli.ts create mode 100644 src/content-render/scripts/all-documents/lib.ts diff --git a/.github/workflows/all-documents.yml b/.github/workflows/all-documents.yml new file mode 100644 index 0000000000..2ac5c63bdf --- /dev/null +++ b/.github/workflows/all-documents.yml @@ -0,0 +1,42 @@ +name: All documents script + +# **What it does**: Verifies that the all-documents script works. +# **Why we have it**: Code quality and sustainability. +# **Who does it impact**: docs-engineering + +on: + pull_request: + paths: + - 'src/content-render/scripts/all-documents/**' + - 'package*.json' + - .github/workflows/all-documents.yml + +permissions: + contents: read + +jobs: + all-documents-script: + if: github.repository == 'github/docs-internal' + runs-on: ubuntu-latest + steps: + - name: Check out repo + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + + - uses: ./.github/actions/node-npm-setup + + - name: Run all-documents script + env: + NODE_ENV: production + run: | + echo "Help..." + npm run all-documents -- --help + + echo "" + echo "Storing in a file (English only)" + npm run all-documents -- -o all-documents.json -l en + + echo "" + echo "Look at the first 50 lines of the file..." + cat all-documents.json | jq | head -n 50 + + # We're essentially expecting it to not crash and fail. 
diff --git a/package.json b/package.json index 128ef701a8..041b0f6176 100644 --- a/package.json +++ b/package.json @@ -16,6 +16,7 @@ }, "exports": "./src/frame/server.ts", "scripts": { + "all-documents": "tsx src/content-render/scripts/all-documents/cli.ts", "analyze-text": "node src/search/scripts/analyze-text.js", "archive-version": "node --max-old-space-size=8192 src/ghes-releases/scripts/archive-version.js", "audit-log-sync": "tsx src/audit-logs/scripts/sync.ts", diff --git a/src/content-render/scripts/all-documents/cli.ts b/src/content-render/scripts/all-documents/cli.ts new file mode 100644 index 0000000000..9035cde6b4 --- /dev/null +++ b/src/content-render/scripts/all-documents/cli.ts @@ -0,0 +1,137 @@ +/** + * You specify one or more languages and versions, and this script + * will output a JSON file with the metadata needed. + * You run it with: + * + * npm run all-documents -- -o /tmp/all-documents.json + * + * By default, it will do free-pro-team, enterprise-cloud, and whatever + * the latest enterprise-server is. You can specify versions with: --version + * For example: + * + * npm run all-documents -- -v free-pro-team@latest -v ghes-3.12 + * + * By default it will include all languages, but you can specify + * with --language + * + * npm run all-documents -- -l en -l de + * + * For debugging purposes, because there are so *many* documents you can + * apply a filter by URL matching, for example: + * + * npm run all-documents -- -f get-started/using-github + * + * This will only include documents whose URL contains the string + * 'get-started/using-github'. + * + * If you don't specify an output file (the --output flag or -o for short), + * the JSON is written to the default file, all-documents.json. + * + * By default the fields set to include are: title, shortTitle, intro, url. + * You can instead specify only the fields you want. For example: + * + * npm run all-documents -- --field url --field title + * + * Now the JSON will look like this: + * + * ... 
+ * {"title": "Some title", "url": "/some-url"} + * ... + */ + +import { writeFileSync, statSync } from 'fs' + +import { program, Option } from 'commander' + +import { languageKeys } from '@/languages/lib/languages.js' +import { allVersions } from '@/versions/lib/all-versions.js' +import { allDocuments, POSSIBLE_FIELDS, type AllDocument } from './lib' + +// E.g. enterprise-server@3.12, free-pro-team@latest, etc +const fullVersions = Object.keys(allVersions) +const defaultVersions: string[] = [] +const shortAlias = new Map() +for (const [version, info] of Object.entries(allVersions)) { + shortAlias.set(info.openApiVersionName, version) + if (info.hasNumberedReleases) { + if (info.latestRelease === info.currentRelease) { + defaultVersions.push(version) + } + } else { + defaultVersions.push(version) + } +} + +program + .description("Generate a JSON output of all documents' metadata") + .addOption( + new Option('-l, --language ', 'Specific languages(s)').choices(languageKeys), + ) + .addOption( + new Option('-v, --version ', 'Specific version(s)').choices([ + ...fullVersions, + ...shortAlias.keys(), + ]), + ) + .addOption( + new Option('--field ', 'Fields to include for each document (multiple)').choices( + POSSIBLE_FIELDS, + ), + ) + .option('-f, --filter ', 'Only for matched files (most for debugging)') + .option('-o, --output ', 'Output file', 'all-documents.json') + .action(main) + +program.parse(process.argv) + +type Options = { + version?: string[] + language?: string[] + field?: string[] + output: string + filter?: string +} +async function main(options: Options) { + const languages = options.language ? options.language : languageKeys + const versions: string[] = [] + for (const v of options.version || defaultVersions) { + if (shortAlias.has(v)) { + versions.push(shortAlias.get(v)!) 
+ } else { + versions.push(v) + } + } + const filter = options.filter + const fields = options.field || POSSIBLE_FIELDS + + const t0 = new Date() + const documents = await allDocuments({ + languages, + versions, + filter, + fields, + }) + const t1 = new Date() + + const toJson: AllDocument[] = [] + for (const doc of documents) { + const { documents, ...rest } = doc + toJson.push({ + ...rest, + documents, + }) + } + + const toString = JSON.stringify(toJson, null, 2) + const outFile = options.output + writeFileSync(outFile, toString) + const seconds = (t1.getTime() - t0.getTime()) / 1000 + const size = statSync(outFile).size + console.log(`Wrote ${outFile} (${fileSize(size)}). Took ${seconds.toFixed(1)} seconds.`) +} + +const fileSize = (bytes: number) => { + if (bytes > 1024 * 1024) return `${(bytes / (1024 * 1024)).toFixed(1)}Mb` + if (bytes > 1024) return `${(bytes / 1024).toFixed(1)}Kb` + return `${bytes} bytes` +} diff --git a/src/content-render/scripts/all-documents/lib.ts b/src/content-render/scripts/all-documents/lib.ts new file mode 100644 index 0000000000..c7c188250d --- /dev/null +++ b/src/content-render/scripts/all-documents/lib.ts @@ -0,0 +1,118 @@ +import contextualize from '@/frame/middleware/context/context.js' +import features from '@/versions/middleware/features.js' +import shortVersions from '@/versions/middleware/short-versions.js' + +import warmServer from '@/frame/lib/warm-server.js' + +export const POSSIBLE_FIELDS = ['title', 'shortTitle', 'intro', 'url'] + +type Document = { + title?: string + shortTitle?: string | null + intro?: string + url?: string +} + +export type AllDocument = { + version: string + language: string + documents: Document[] +} + +type Permalink = { + languageCode: string + pageVersion: string + title: string + href: string +} + +type Page = { + permalinks: Permalink[] + fullPath: string + title: string + shortTitle?: string + intro: string + languageCode: string + documentType: string + renderProp: (prop: string, context: 
any, opts: any) => Promise +} + +type Options = { + languages: string[] + versions: string[] + fields: string[] + filter?: string +} + +export async function allDocuments(options: Options): Promise { + const { filter, versions, languages, fields } = options + + const site = await warmServer(options.languages) + const pages: Page[] = site.pageList + const allDocuments: AllDocument[] = [] + + type ByVersion = Map + const byLanguage = new Map() + + for (const page of pages) { + if (!languages.includes(page.languageCode)) { + continue + } + + for (const permalink of page.permalinks) { + if (filter && !permalink.href.includes(filter)) { + continue + } + if (!versions.includes(permalink.pageVersion)) { + continue + } + + const next = () => {} + const res = {} + const pagePath = permalink.href + const context: any = {} + const req = { + path: pagePath, + language: permalink.languageCode, + pagePath, + cookies: {}, + query: {}, + context, + } + + await contextualize(req, res, next) + await shortVersions(req, res, next) + req.context.page = page + await features(req, res, next) + + const title = fields.includes('title') + ? await page.renderProp('title', req.context, { textOnly: true }) + : undefined + const shortTitle = fields.includes('shortTitle') + ? page.shortTitle + ? await page.renderProp('shortTitle', req.context, { textOnly: true }) + : null + : undefined + const intro = fields.includes('intro') + ? await page.renderProp('intro', req.context, { textOnly: true }) + : undefined + + const url = fields.includes('url') ? 
permalink.href : undefined + + if (!byLanguage.has(permalink.languageCode)) { + byLanguage.set(permalink.languageCode, new Map()) + } + const byVersion = byLanguage.get(permalink.languageCode) as ByVersion + if (!byVersion.has(permalink.pageVersion)) { + byVersion.set(permalink.pageVersion, []) + } + byVersion.get(permalink.pageVersion)?.push({ title, shortTitle, intro, url }) + } + } + for (const [language, byVersion] of byLanguage) { + for (const [version, documents] of byVersion) { + allDocuments.push({ version, language, documents }) + } + } + return allDocuments +}