diff --git a/.github/workflows/sync-search-elasticsearch.yml b/.github/workflows/sync-search-elasticsearch.yml index 66d797a770..4ca84e0899 100644 --- a/.github/workflows/sync-search-elasticsearch.yml +++ b/.github/workflows/sync-search-elasticsearch.yml @@ -115,13 +115,13 @@ jobs: - name: Check out repo uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - - name: Clone docs-internal.popular-pages + - name: Clone docs-internal-data uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 with: - repository: github/docs-internal.popular-pages + repository: github/docs-internal-data # This works because user `docs-bot` has read access to that private repo. token: ${{ secrets.DOCS_BOT_PAT_READPUBLICKEY }} - path: popular-pages + path: docs-internal-data - name: Clone all translations if: ${{ matrix.language != 'en' }} @@ -170,8 +170,8 @@ jobs: VERSION: ${{ inputs.version }} # The sync-search-index recognizes this env var if you don't - # use the `--popular-pags ` option. - POPULAR_PAGES_JSON: popular-pages/records/popular-pages.json + # use the `--docs-internal-data ` option. + DOCS_INTERNAL_DATA: docs-internal-data run: | mkdir /tmp/records diff --git a/.github/workflows/sync-search-pr.yml b/.github/workflows/sync-search-pr.yml index 2305220569..f7d504a77f 100644 --- a/.github/workflows/sync-search-pr.yml +++ b/.github/workflows/sync-search-pr.yml @@ -42,13 +42,13 @@ jobs: - name: Check out repo uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - - name: Clone docs-internal.popular-pages + - name: Clone docs-internal-data uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 with: - repository: github/docs-internal.popular-pages + repository: github/docs-internal-data # This works because user `docs-bot` has read access to that private repo. token: ${{ secrets.DOCS_BOT_PAT_READPUBLICKEY }} - path: popular-pages + path: docs-internal-data - uses: ./.github/actions/setup-elasticsearch @@ -89,8 +89,8 @@ jobs: THROW_ON_EMPTY: false # The sync-search-index recognizes this env var if you don't - # use the `--popular-pags ` option. - POPULAR_PAGES_JSON: popular-pages/records/popular-pages.json + # use the `--docs-internal-data ` option. + DOCS_INTERNAL_DATA: docs-internal-data run: | mkdir /tmp/records diff --git a/src/search/scripts/build-records.js b/src/search/scripts/build-records.js index 4eae0cc4db..42313c4ee1 100644 --- a/src/search/scripts/build-records.js +++ b/src/search/scripts/build-records.js @@ -38,7 +38,7 @@ export default async function buildRecords( redirects, config = {}, ) { - const { noMarkers, popularPagesFilePath } = config + const { noMarkers, docsInternalDataPath } = config console.log(`\n\nBuilding records for index '${indexName}' (${languages[languageCode].name})`) const records = [] const pages = indexablePages @@ -59,8 +59,8 @@ export default async function buildRecords( return permalink }) - const popularPages = popularPagesFilePath - ? await getPopularPages(popularPagesFilePath, redirects) + const popularPages = docsInternalDataPath + ? await getPopularPages(docsInternalDataPath, redirects, pageVersion, languageCode) : {} console.log('indexable pages', indexablePages.length) diff --git a/src/search/scripts/popular-pages.js b/src/search/scripts/popular-pages.js index 5076838b42..11dac81862 100644 --- a/src/search/scripts/popular-pages.js +++ b/src/search/scripts/popular-pages.js @@ -1,12 +1,30 @@ +import { join } from 'path' +import { existsSync } from 'fs' import fs from 'fs/promises' -export default async function getPopularPages(filePath, redirects) { - const popularPagesRaw = await fs.readFile(filePath, 'utf-8') +export default async function getPopularPages(dirPath, redirects, version, language) { + // The dirPath is the path to the github/docs-internal-data repo. + // We make assumptions about the structure of the repo. In particular, + // the pageviews rollups live in + // `hydro/rollups/pageviews/$language/$versionprefix/rollup.json` + // For example + // `hydro/rollups/pageviews/en/enterprise-cloud/rollup.json` + const versionPrefix = version.split('@')[0] + let filePath = join(dirPath, 'hydro/rollups/pageviews', language, versionPrefix, 'rollup.json') + if (!existsSync(filePath) && language !== 'en') { + console.warn("Trying the rollup for 'en'") + language = 'en' + filePath = join(dirPath, 'hydro/rollups/pageviews', language, versionPrefix, 'rollup.json') + } + if (!existsSync(filePath)) { + throw new Error(`No rollup found for version '${versionPrefix}'. Tried ${filePath}`) + } + const rollupRaw = await fs.readFile(filePath, 'utf-8') // Firt iterate through the array of objects, not making an assumption // that the first one is the biggest one. const all = {} - for (const { path_article: path, path_count: count } of JSON.parse(popularPagesRaw)) { + for (const [path, count] of Object.entries(JSON.parse(rollupRaw))) { if (!path) { // Can happen if the SQL query is, for some unknown reason, finding // a path that is either `null` or an empty string. Treat it as a diff --git a/src/search/scripts/sync-search-indices.js b/src/search/scripts/sync-search-indices.js index 02b8b82044..17e3e13fbc 100755 --- a/src/search/scripts/sync-search-indices.js +++ b/src/search/scripts/sync-search-indices.js @@ -7,7 +7,7 @@ // // [end-readme] -import { existsSync } from 'fs' +import { existsSync, statSync, readdirSync } from 'fs' import assert from 'assert' import { program, Option } from 'commander' @@ -39,7 +39,10 @@ program ) .option('--no-markers', 'Do not print a marker for each parsed document') .option('--filter ', 'Filter to only do pages that match this string') - .option('-p, --popular-pages ', 'Popular pages JSON file (defaults to $POPULAR_PAGES_JSON)') + .option( + '-d, --docs-internal-data ', + 'Path to github/docs-internal-data repo (defaults to $DOCS_INTERNAL_DATA)', + ) .argument('', 'where the indexable files should be written') .parse(process.argv) @@ -88,19 +91,33 @@ async function main(opts, args) { } } - let popularPagesFilePath - const { popularPages } = opts - const { POPULAR_PAGES_JSON } = process.env - if (popularPages) { - if (!existsSync(popularPages)) { - throw new Error(`'${popularPages}' does not exist`) + let docsInternalDataPath + const { docsInternalData } = opts + const { DOCS_INTERNAL_DATA } = process.env + + // Taking care of legacy + if (process.env.POPULAR_PAGES_JSON) { + throw new Error('POPULAR_PAGES_JSON is deprecated. Use DOCS_INTERNAL_DATA instead.') + } + + if (docsInternalData) { + if (!existsSync(docsInternalData)) { + throw new Error(`'${docsInternalData}' does not exist`) } - popularPagesFilePath = popularPages - } else if (POPULAR_PAGES_JSON) { - if (!existsSync(POPULAR_PAGES_JSON)) { - throw new Error(`'${POPULAR_PAGES_JSON}' does not exist`) + docsInternalDataPath = docsInternalData + } else if (DOCS_INTERNAL_DATA) { + if (!existsSync(DOCS_INTERNAL_DATA)) { + throw new Error(`'${DOCS_INTERNAL_DATA}' does not exist`) } - popularPagesFilePath = POPULAR_PAGES_JSON + docsInternalDataPath = DOCS_INTERNAL_DATA + } + if (docsInternalDataPath) { + if (!statSync(docsInternalDataPath).isDirectory()) + throw new Error('docsInternalDataPath must be a directory') + + const files = readdirSync(docsInternalDataPath) + if (!files.includes('hydro')) + throw new Error(`'${docsInternalDataPath}' must contain a 'hydro' directory`) } // A `--version` or `process.env.VERSION` was specified, we need to convert @@ -127,7 +144,7 @@ async function main(opts, args) { const config = { noMarkers: !opts.markers, filter: opts.filter, - popularPagesFilePath, + docsInternalDataPath, } const options = {