docs/script/search/parse-page-sections-into-records.js
James M. Greene 542a459c06 Move script-only dependencies to devDependencies (#19542)
* Move lib/search/sync.js to script/search/sync.js

* Move mdast-util-from-markdown to devDeps

* Move lib/redirects/add-redirect-to-frontmatter.js to script/helpers/

* Move mkdirp to devDeps

* Move linkinator to devDeps

* Move rimraf to devDeps

* Fix script/search/sync.js require paths

* Move lib/search/build-records.js to script/search/

* Move lib/search/find-indexable-pages to script/search/

* Fix require paths for build-records

* Fix require paths for find-indexable-pages

* Move lib/search/algolia-get-remote-index-names.js to script/search/

* Move lib/search/algolia-search-index.js to script/search/

* Move lib/search/lunr-search-index.js to script/search/

* Move lib/search/lunr-get-index-names.js to script/search/

* Fix Lunr search index paths

* Move lib/search/validate-records.js to script/search/

* Move is-url to devDeps

* Move lib/search/algolia-client.js to script/search/

* Move lib/search/parse-page-sections-into-records.js to script/search/

* Move lib/search/rank.js to script/search/

* Fix path to cached-index-names.json file

* Normalize require for fs.promises
2021-05-25 20:44:19 +00:00
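
The require-path fixes listed above follow mechanically from the moves: once a helper lives beside the script that uses it, the relative path shortens. A sketch of the kind of change involved (illustrative paths inferred from the bullets, not the actual diff):

// script/search/sync.js previously had to reach up into lib/search/:
//   const buildRecords = require('../../lib/search/build-records')
// After the move, build-records.js is a sibling module:
const buildRecords = require('./build-records')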

98 lines
2.6 KiB
JavaScript

// This module takes a cheerio page object and divides it into sections,
// using h3 heading elements as section delimiters (a section's content runs
// until the next h2 or h3). The text that follows each heading becomes the
// content of the search record.
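//
// For example (an illustrative sketch, not real docs markup), a page containing
//   <h3 id="about-foo">About foo</h3><p>Foo is ...</p>
//   <h3 id="using-foo">Using foo</h3><p>To use foo ...</p>
// would yield one record per h3, roughly:
//   { objectID: '/en/foo#about-foo', slug: 'about-foo', heading: 'About foo', content: 'Foo is ...', ... }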
const { chain } = require('lodash')
const urlPrefix = 'https://docs.github.com'
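// Headings with these slugs are page furniture rather than content, so they never become their own records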
const ignoredHeadingSlugs = [
  'in-this-article',
  'further-reading'
]
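// maxContentLength caps how much text a single record may hold (applied via .slice() below)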
const { maxContentLength } = require('../../lib/search/config')
module.exports = function parsePageSectionsIntoRecords (href, $) {
  const title = $('h1').text().trim()
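  // Collect the breadcrumb trail and collapse whitespace; slice(0, -1) drops the last crumb, which is the current page itself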
  const breadcrumbsArray = $('nav.breadcrumbs a')
    .map((i, el) => {
      return $(el)
        .text()
        .trim()
        .replace(/\n/g, ' ')
        .replace(/\s+/g, ' ')
    })
    .get()
    .slice(0, -1)
  const breadcrumbs = breadcrumbsArray.join(' / ') || ''
  const metaKeywords = $('meta[name="keywords"]').attr('content')
  const topics = metaKeywords ? metaKeywords.split(',') : []
  const productName = breadcrumbsArray[0] || ''
  topics.push(productName)
  // Also add the product name without the "GitHub " prefix so filter queries can be shorter
  if (productName.includes('GitHub ')) {
    topics.push(productName.replace('GitHub ', ''))
  }
  let records
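  // Candidate section delimiters: h3 headings that have an id and are not in the ignore list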
  const $sections = $('.article-grid-body h3')
    .filter('[id]')
    .filter((i, el) => {
      return !ignoredHeadingSlugs.includes($(el).attr('id'))
    })
  if ($sections.length > 0) {
    records = $sections
      .map((i, el) => {
        const heading = $(el).text().trim()
        const slug = $(el).attr('id')
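        // objectID is the page path plus the heading anchor; it doubles as the dedupe key used at the end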
        const objectID = [href, slug].join('#')
        const url = [urlPrefix, objectID].join('')
        const content = $(el)
          // Platform-specific content is nested in a DIV;
          // GraphQL content is nested in two DIVs
          .nextUntil('h2, h3, div > h2, div > h3, div > div > h2, div > div > h3')
          .map((i, el) => $(el).text())
          .get()
          .join(' ')
          .trim()
          .slice(0, maxContentLength)
        return {
          objectID,
          url,
          slug,
          breadcrumbs,
          heading,
          title,
          content,
          topics
        }
      })
      .get()
  } else {
    // There are no sections. Treat the entire article as the record.
    const objectID = href
    const url = [urlPrefix, objectID].join('')
    const content = $('.article-grid-body p, .article-grid-body ul, .article-grid-body ol, .article-grid-body table')
      .map((i, el) => $(el).text())
      .get()
      .join(' ')
      .trim()
      .slice(0, maxContentLength)
    records = [{
      objectID,
      url,
      breadcrumbs,
      title,
      content,
      topics
    }]
  }
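  // Dedupe on objectID so a repeated heading id cannot produce duplicate records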
  return chain(records)
    .uniqBy('objectID')
    .value()
}
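
// A minimal usage sketch (assumes cheerio is available and `html` holds a rendered docs page):
//
//   const cheerio = require('cheerio')
//   const parsePageSectionsIntoRecords = require('./parse-page-sections-into-records')
//   const $ = cheerio.load(html)
//   const records = parsePageSectionsIntoRecords('/en/some-product/some-article', $)
//
// records is an array of search records, each with objectID, url, breadcrumbs, title, content, and topics.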