get popularities from docs-internal.popular-pages (#35583)
This commit is contained in:
1
.github/CODEOWNERS
vendored
1
.github/CODEOWNERS
vendored
@@ -10,7 +10,6 @@
|
||||
/.github/ @github/docs-engineering
|
||||
/script/ @github/docs-engineering
|
||||
/includes/ @github/docs-engineering
|
||||
/lib/search/popular-pages.json @github/docs-engineering
|
||||
Dockerfile @github/docs-engineering
|
||||
package-lock.json @github/docs-engineering
|
||||
package.json @github/docs-engineering
|
||||
|
||||
12
.github/workflows/sync-search-elasticsearch.yml
vendored
12
.github/workflows/sync-search-elasticsearch.yml
vendored
@@ -99,6 +99,14 @@ jobs:
|
||||
- name: Check out repo
|
||||
uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8
|
||||
|
||||
- name: Clone docs-internal.popular-pages
|
||||
uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8
|
||||
with:
|
||||
repository: github/docs-internal.popular-pages
|
||||
# This works because user `docubot` has read access to that private repo.
|
||||
token: ${{ secrets.DOCUBOT_REPO_PAT }}
|
||||
path: popular-pages
|
||||
|
||||
- name: Clone all translations
|
||||
if: ${{ matrix.language != 'en' }}
|
||||
uses: ./.github/actions/clone-translations
|
||||
@@ -149,6 +157,10 @@ jobs:
|
||||
# the same as not set within the script.
|
||||
VERSION: ${{ github.event.inputs.version }}
|
||||
|
||||
# The sync-search-index recognizes this env var if you don't
|
||||
# use the `--popular-pags <PATH>` option.
|
||||
POPULAR_PAGES_JSON: popular-pages/records/popular-pages.json
|
||||
|
||||
run: |
|
||||
mkdir /tmp/records
|
||||
npm run sync-search-indices -- /tmp/records \
|
||||
|
||||
13
.github/workflows/sync-search-pr.yml
vendored
13
.github/workflows/sync-search-pr.yml
vendored
@@ -10,7 +10,6 @@ on:
|
||||
paths:
|
||||
- 'script/search/**'
|
||||
- 'package*.json'
|
||||
- lib/search/popular-pages.json
|
||||
# Ultimately, for debugging this workflow itself
|
||||
- .github/workflows/sync-search-pr.yml
|
||||
|
||||
@@ -36,6 +35,14 @@ jobs:
|
||||
- name: Check out repo
|
||||
uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8
|
||||
|
||||
- name: Clone docs-internal.popular-pages
|
||||
uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8
|
||||
with:
|
||||
repository: github/docs-internal.popular-pages
|
||||
# This works because user `docubot` has read access to that private repo.
|
||||
token: ${{ secrets.DOCUBOT_REPO_PAT }}
|
||||
path: popular-pages
|
||||
|
||||
- uses: ./.github/actions/setup-elasticsearch
|
||||
|
||||
- uses: ./.github/actions/node-npm-setup
|
||||
@@ -78,6 +85,10 @@ jobs:
|
||||
# let's just accept an empty string instead.
|
||||
THROW_ON_EMPTY: false
|
||||
|
||||
# The sync-search-index recognizes this env var if you don't
|
||||
# use the `--popular-pags <PATH>` option.
|
||||
POPULAR_PAGES_JSON: popular-pages/records/popular-pages.json
|
||||
|
||||
run: |
|
||||
mkdir /tmp/records
|
||||
npm run sync-search-indices -- /tmp/records \
|
||||
|
||||
10
.github/workflows/translation-health-report.yml
vendored
10
.github/workflows/translation-health-report.yml
vendored
@@ -59,6 +59,14 @@ jobs:
|
||||
- name: Checkout the docs-internal repo
|
||||
uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8
|
||||
|
||||
- name: Clone docs-internal.popular-pages
|
||||
uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8
|
||||
with:
|
||||
repository: github/docs-internal.popular-pages
|
||||
# This works because user `docubot` has read access to that private repo.
|
||||
token: ${{ secrets.DOCUBOT_REPO_PAT }}
|
||||
path: popular-pages
|
||||
|
||||
- name: Checkout the language-specific repo
|
||||
uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8
|
||||
with:
|
||||
@@ -74,6 +82,8 @@ jobs:
|
||||
- uses: ./.github/actions/node-npm-setup
|
||||
|
||||
- name: Create translation health report
|
||||
env:
|
||||
POPULAR_PAGES_JSON: popular-pages/records/popular-pages.json
|
||||
run: |
|
||||
node script/i18n/create-translation-health-report.js \
|
||||
--language ${{ matrix.language }} \
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -59,12 +59,12 @@ if (!languageKeys.includes(language)) {
|
||||
|
||||
// Gather popularity data the search uses to prioritize errors
|
||||
const scores = {}
|
||||
const popularPagesRaw = await fs.readFile('lib/search/popular-pages.json', 'utf8')
|
||||
for (const line of popularPagesRaw.split('\n')) {
|
||||
try {
|
||||
const row = JSON.parse(line)
|
||||
const { POPULAR_PAGES_JSON } = process.env
|
||||
if (POPULAR_PAGES_JSON) {
|
||||
const popularPagesRaw = await fs.readFile(POPULAR_PAGES_JSON, 'utf8')
|
||||
for (const row of JSON.parse(popularPagesRaw)) {
|
||||
scores[row.path_article] = row.path_count
|
||||
} catch {}
|
||||
}
|
||||
}
|
||||
|
||||
// Load all pages in language
|
||||
|
||||
@@ -32,7 +32,7 @@ export default async function buildRecords(
|
||||
redirects,
|
||||
config = {}
|
||||
) {
|
||||
const { noMarkers } = config
|
||||
const { noMarkers, popularPagesFilePath } = config
|
||||
console.log(`\n\nBuilding records for index '${indexName}' (${languages[languageCode].name})`)
|
||||
const records = []
|
||||
const pages = indexablePages
|
||||
@@ -53,7 +53,9 @@ export default async function buildRecords(
|
||||
return permalink
|
||||
})
|
||||
|
||||
const popularPages = await getPopularPages(redirects)
|
||||
const popularPages = popularPagesFilePath
|
||||
? await getPopularPages(popularPagesFilePath, redirects)
|
||||
: {}
|
||||
|
||||
console.log('indexable pages', indexablePages.length)
|
||||
console.log('pages in index', pages.length)
|
||||
|
||||
@@ -1,40 +1,49 @@
|
||||
import fs from 'fs/promises'
|
||||
|
||||
const POPULAR_PAGES_JSON = './lib/search/popular-pages.json'
|
||||
export default async function getPopularPages(filePath, redirects) {
|
||||
const popularPagesRaw = await fs.readFile(filePath, 'utf-8')
|
||||
|
||||
export default async function getPopularPages(redirects) {
|
||||
const popularPages = {}
|
||||
try {
|
||||
const popularPagesRaw = await fs.readFile(POPULAR_PAGES_JSON, 'utf-8')
|
||||
let biggestCount = 0
|
||||
for (const line of popularPagesRaw.split('\n')) {
|
||||
if (!line.trim()) continue
|
||||
const { path_article: path, path_count: count } = JSON.parse(line)
|
||||
// The root page or any other potentially dirty record that is empty.
|
||||
if (!path) continue
|
||||
// This is safe because the `POPULAR_PAGES_JSON` always lists the
|
||||
// most popular first.
|
||||
if (!biggestCount) biggestCount = count
|
||||
// Don't bother writing massively long floating point numbers
|
||||
// because reducing it makes the JSON records smaller and we don't
|
||||
// need any more precision than 7 significant figures.
|
||||
const ratio = Number((count / biggestCount).toFixed(7))
|
||||
// The reason we're heeding redirects is because it's very possible
|
||||
// that the `POPULAR_PAGES_JSON` file is older/"staler" than the
|
||||
// content itself.
|
||||
// Imaging our analytics recorded that `/en/foo` had 1,234 pageviews,
|
||||
// and someone goes and... `git mv content/foo content/bar` plus
|
||||
// adding `redirect_from: - /foo` into the front-matter.
|
||||
// Then, by using the redirects first, we can maintain that popularity
|
||||
// by now "pretending" that it's `/en/bar` that has 1,234 pageviews.
|
||||
popularPages[redirects[path] || path] = ratio
|
||||
// Firt iterate through the array of objects, not making an assumption
|
||||
// that the first one is the biggest one.
|
||||
const all = {}
|
||||
for (const { path_article: path, path_count: count } of JSON.parse(popularPagesRaw)) {
|
||||
if (!path) {
|
||||
// Can happen if the SQL query is, for some unknown reason, finding
|
||||
// a path that is either `null` or an empty string. Treat it as a
|
||||
// junk entry and skip it.
|
||||
continue
|
||||
}
|
||||
} catch (error) {
|
||||
if (error.code === 'ENOENT') {
|
||||
console.warn(`The file ${POPULAR_PAGES_JSON} can not be found.`)
|
||||
} else {
|
||||
throw error
|
||||
if (path === 'index') {
|
||||
// That's the home page which doesn't count. It doesn't count because
|
||||
// people don't arrive on that for the information they seek. It's
|
||||
// merely a navigation tool.
|
||||
continue
|
||||
}
|
||||
if (path.startsWith('early-access/')) {
|
||||
// We never index these anyway so their popularity is never relevant.
|
||||
continue
|
||||
}
|
||||
all[path] = count
|
||||
}
|
||||
|
||||
const biggestCount = Math.max(...Object.values(all))
|
||||
const popularPages = {}
|
||||
for (const [path, count] of Object.entries(all)) {
|
||||
// Don't bother writing massively long floating point numbers
|
||||
// because reducing it makes the JSON records smaller and we don't
|
||||
// need any more precision than 7 significant figures.
|
||||
const ratio = Number((count / biggestCount).toFixed(7))
|
||||
|
||||
// The reason we're heeding redirects is because it's possible
|
||||
// that the JSON file is older/"staler" than the
|
||||
// content itself.
|
||||
// Imaging our analytics recorded that `/en/foo` had 1,234 pageviews,
|
||||
// and someone goes and... `git mv content/foo content/bar` plus
|
||||
// adding `redirect_from: - /foo` into the front-matter.
|
||||
// Then, by using the redirects first, we can maintain that popularity
|
||||
// by now "pretending" that it's `/en/bar` that has 1,234 pageviews.
|
||||
popularPages[redirects[path] || path] = ratio
|
||||
}
|
||||
|
||||
return popularPages
|
||||
}
|
||||
|
||||
@@ -7,6 +7,8 @@
|
||||
//
|
||||
// [end-readme]
|
||||
|
||||
import { existsSync } from 'fs'
|
||||
|
||||
import assert from 'assert'
|
||||
import { program, Option } from 'commander'
|
||||
|
||||
@@ -37,6 +39,7 @@ program
|
||||
)
|
||||
.option('--no-markers', 'Do not print a marker for each parsed document')
|
||||
.option('--filter <MATCH>', 'Filter to only do pages that match this string')
|
||||
.option('-p, --popular-pages <PATH>', 'Popular pages JSON file (defaults to $POPULAR_PAGES_JSON)')
|
||||
.argument('<out-directory>', 'where the indexable files should be written')
|
||||
.parse(process.argv)
|
||||
|
||||
@@ -85,6 +88,21 @@ async function main(opts, args) {
|
||||
}
|
||||
}
|
||||
|
||||
let popularPagesFilePath
|
||||
const { popularPages } = opts
|
||||
const { POPULAR_PAGES_JSON } = process.env
|
||||
if (popularPages) {
|
||||
if (!existsSync(popularPages)) {
|
||||
throw new Error(`'${popularPages}' does not exist`)
|
||||
}
|
||||
popularPagesFilePath = popularPages
|
||||
} else if (POPULAR_PAGES_JSON) {
|
||||
if (!existsSync(POPULAR_PAGES_JSON)) {
|
||||
throw new Error(`'${POPULAR_PAGES_JSON}' does not exist`)
|
||||
}
|
||||
popularPagesFilePath = POPULAR_PAGES_JSON
|
||||
}
|
||||
|
||||
// A `--version` or `process.env.VERSION` was specified, we need to convert
|
||||
// it to the long name. I.e. `free-pro-team@latest`. Not `dotcom`.
|
||||
// But it could also have beeb specified as `all` which means that `version`
|
||||
@@ -109,6 +127,7 @@ async function main(opts, args) {
|
||||
const config = {
|
||||
noMarkers: !opts.markers,
|
||||
filter: opts.filter,
|
||||
popularPagesFilePath,
|
||||
}
|
||||
|
||||
const options = {
|
||||
|
||||
@@ -109,7 +109,6 @@ const IGNORE_PATHS = [
|
||||
'**/*.graphql', // E.g. data/graphql/ghec/schema.docs.graphql
|
||||
'package-lock.json', // At the time of writing it's 1.5MB!
|
||||
'.linkinator/full.log', // Only present if you've run linkinator
|
||||
'lib/search/popular-pages.json', // used to build search indexes
|
||||
'tests/**/*.json',
|
||||
'src/**/*.json', // OpenAPI schema files
|
||||
'content/early-access', // Not committed to public repository.
|
||||
|
||||
Reference in New Issue
Block a user