1
0
mirror of synced 2025-12-19 18:10:59 -05:00

get popularities from docs-internal.popular-pages (#35583)

This commit is contained in:
Peter Bengtsson
2023-03-15 15:47:32 -04:00
committed by GitHub
parent 416c7393eb
commit be30059bc5
10 changed files with 103 additions and 1042 deletions

1
.github/CODEOWNERS vendored
View File

@@ -10,7 +10,6 @@
/.github/ @github/docs-engineering
/script/ @github/docs-engineering
/includes/ @github/docs-engineering
/lib/search/popular-pages.json @github/docs-engineering
Dockerfile @github/docs-engineering
package-lock.json @github/docs-engineering
package.json @github/docs-engineering

View File

@@ -99,6 +99,14 @@ jobs:
- name: Check out repo
uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8
- name: Clone docs-internal.popular-pages
uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8
with:
repository: github/docs-internal.popular-pages
# This works because user `docubot` has read access to that private repo.
token: ${{ secrets.DOCUBOT_REPO_PAT }}
path: popular-pages
- name: Clone all translations
if: ${{ matrix.language != 'en' }}
uses: ./.github/actions/clone-translations
@@ -149,6 +157,10 @@ jobs:
# the same as not set within the script.
VERSION: ${{ github.event.inputs.version }}
# The sync-search-index recognizes this env var if you don't
# use the `--popular-pages <PATH>` option.
POPULAR_PAGES_JSON: popular-pages/records/popular-pages.json
run: |
mkdir /tmp/records
npm run sync-search-indices -- /tmp/records \

View File

@@ -10,7 +10,6 @@ on:
paths:
- 'script/search/**'
- 'package*.json'
- lib/search/popular-pages.json
# Ultimately, for debugging this workflow itself
- .github/workflows/sync-search-pr.yml
@@ -36,6 +35,14 @@ jobs:
- name: Check out repo
uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8
- name: Clone docs-internal.popular-pages
uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8
with:
repository: github/docs-internal.popular-pages
# This works because user `docubot` has read access to that private repo.
token: ${{ secrets.DOCUBOT_REPO_PAT }}
path: popular-pages
- uses: ./.github/actions/setup-elasticsearch
- uses: ./.github/actions/node-npm-setup
@@ -78,6 +85,10 @@ jobs:
# let's just accept an empty string instead.
THROW_ON_EMPTY: false
# The sync-search-index recognizes this env var if you don't
# use the `--popular-pages <PATH>` option.
POPULAR_PAGES_JSON: popular-pages/records/popular-pages.json
run: |
mkdir /tmp/records
npm run sync-search-indices -- /tmp/records \

View File

@@ -59,6 +59,14 @@ jobs:
- name: Checkout the docs-internal repo
uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8
- name: Clone docs-internal.popular-pages
uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8
with:
repository: github/docs-internal.popular-pages
# This works because user `docubot` has read access to that private repo.
token: ${{ secrets.DOCUBOT_REPO_PAT }}
path: popular-pages
- name: Checkout the language-specific repo
uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8
with:
@@ -74,6 +82,8 @@ jobs:
- uses: ./.github/actions/node-npm-setup
- name: Create translation health report
env:
POPULAR_PAGES_JSON: popular-pages/records/popular-pages.json
run: |
node script/i18n/create-translation-health-report.js \
--language ${{ matrix.language }} \

File diff suppressed because it is too large Load Diff

View File

@@ -59,12 +59,12 @@ if (!languageKeys.includes(language)) {
// Gather popularity data the search uses to prioritize errors
const scores = {}
const popularPagesRaw = await fs.readFile('lib/search/popular-pages.json', 'utf8')
for (const line of popularPagesRaw.split('\n')) {
try {
const row = JSON.parse(line)
const { POPULAR_PAGES_JSON } = process.env
if (POPULAR_PAGES_JSON) {
const popularPagesRaw = await fs.readFile(POPULAR_PAGES_JSON, 'utf8')
for (const row of JSON.parse(popularPagesRaw)) {
scores[row.path_article] = row.path_count
} catch {}
}
}
// Load all pages in language

View File

@@ -32,7 +32,7 @@ export default async function buildRecords(
redirects,
config = {}
) {
const { noMarkers } = config
const { noMarkers, popularPagesFilePath } = config
console.log(`\n\nBuilding records for index '${indexName}' (${languages[languageCode].name})`)
const records = []
const pages = indexablePages
@@ -53,7 +53,9 @@ export default async function buildRecords(
return permalink
})
const popularPages = await getPopularPages(redirects)
const popularPages = popularPagesFilePath
? await getPopularPages(popularPagesFilePath, redirects)
: {}
console.log('indexable pages', indexablePages.length)
console.log('pages in index', pages.length)

View File

@@ -1,40 +1,49 @@
import fs from 'fs/promises'
const POPULAR_PAGES_JSON = './lib/search/popular-pages.json'
export default async function getPopularPages(filePath, redirects) {
const popularPagesRaw = await fs.readFile(filePath, 'utf-8')
export default async function getPopularPages(redirects) {
const popularPages = {}
try {
const popularPagesRaw = await fs.readFile(POPULAR_PAGES_JSON, 'utf-8')
let biggestCount = 0
for (const line of popularPagesRaw.split('\n')) {
if (!line.trim()) continue
const { path_article: path, path_count: count } = JSON.parse(line)
// The root page or any other potentially dirty record that is empty.
if (!path) continue
// This is safe because the `POPULAR_PAGES_JSON` always lists the
// most popular first.
if (!biggestCount) biggestCount = count
// Don't bother writing massively long floating point numbers
// because reducing it makes the JSON records smaller and we don't
// need any more precision than 7 significant figures.
const ratio = Number((count / biggestCount).toFixed(7))
// The reason we're heeding redirects is because it's very possible
// that the `POPULAR_PAGES_JSON` file is older/"staler" than the
// content itself.
// Imagine our analytics recorded that `/en/foo` had 1,234 pageviews,
// and someone goes and... `git mv content/foo content/bar` plus
// adding `redirect_from: - /foo` into the front-matter.
// Then, by using the redirects first, we can maintain that popularity
// by now "pretending" that it's `/en/bar` that has 1,234 pageviews.
popularPages[redirects[path] || path] = ratio
// First iterate through the array of objects, not making an assumption
// that the first one is the biggest one.
const all = {}
for (const { path_article: path, path_count: count } of JSON.parse(popularPagesRaw)) {
if (!path) {
// Can happen if the SQL query is, for some unknown reason, finding
// a path that is either `null` or an empty string. Treat it as a
// junk entry and skip it.
continue
}
} catch (error) {
if (error.code === 'ENOENT') {
console.warn(`The file ${POPULAR_PAGES_JSON} can not be found.`)
} else {
throw error
if (path === 'index') {
// That's the home page which doesn't count. It doesn't count because
// people don't arrive on that for the information they seek. It's
// merely a navigation tool.
continue
}
if (path.startsWith('early-access/')) {
// We never index these anyway so their popularity is never relevant.
continue
}
all[path] = count
}
const biggestCount = Math.max(...Object.values(all))
const popularPages = {}
for (const [path, count] of Object.entries(all)) {
// Don't bother writing massively long floating point numbers
// because reducing it makes the JSON records smaller and we don't
// need any more precision than 7 significant figures.
const ratio = Number((count / biggestCount).toFixed(7))
// The reason we're heeding redirects is because it's possible
// that the JSON file is older/"staler" than the
// content itself.
// Imagine our analytics recorded that `/en/foo` had 1,234 pageviews,
// and someone goes and... `git mv content/foo content/bar` plus
// adding `redirect_from: - /foo` into the front-matter.
// Then, by using the redirects first, we can maintain that popularity
// by now "pretending" that it's `/en/bar` that has 1,234 pageviews.
popularPages[redirects[path] || path] = ratio
}
return popularPages
}

View File

@@ -7,6 +7,8 @@
//
// [end-readme]
import { existsSync } from 'fs'
import assert from 'assert'
import { program, Option } from 'commander'
@@ -37,6 +39,7 @@ program
)
.option('--no-markers', 'Do not print a marker for each parsed document')
.option('--filter <MATCH>', 'Filter to only do pages that match this string')
.option('-p, --popular-pages <PATH>', 'Popular pages JSON file (defaults to $POPULAR_PAGES_JSON)')
.argument('<out-directory>', 'where the indexable files should be written')
.parse(process.argv)
@@ -85,6 +88,21 @@ async function main(opts, args) {
}
}
let popularPagesFilePath
const { popularPages } = opts
const { POPULAR_PAGES_JSON } = process.env
if (popularPages) {
if (!existsSync(popularPages)) {
throw new Error(`'${popularPages}' does not exist`)
}
popularPagesFilePath = popularPages
} else if (POPULAR_PAGES_JSON) {
if (!existsSync(POPULAR_PAGES_JSON)) {
throw new Error(`'${POPULAR_PAGES_JSON}' does not exist`)
}
popularPagesFilePath = POPULAR_PAGES_JSON
}
// A `--version` or `process.env.VERSION` was specified, we need to convert
// it to the long name. I.e. `free-pro-team@latest`. Not `dotcom`.
// But it could also have been specified as `all` which means that `version`
@@ -109,6 +127,7 @@ async function main(opts, args) {
const config = {
noMarkers: !opts.markers,
filter: opts.filter,
popularPagesFilePath,
}
const options = {

View File

@@ -109,7 +109,6 @@ const IGNORE_PATHS = [
'**/*.graphql', // E.g. data/graphql/ghec/schema.docs.graphql
'package-lock.json', // At the time of writing it's 1.5MB!
'.linkinator/full.log', // Only present if you've run linkinator
'lib/search/popular-pages.json', // used to build search indexes
'tests/**/*.json',
'src/**/*.json', // OpenAPI schema files
'content/early-access', // Not committed to public repository.