#!/usr/bin/env node import { maxContentLength } from '../../lib/search/config.js' // This module takes cheerio page object and divides it into sections // using H1,H2 heading elements as section delimiters. The text // that follows each heading becomes the content of the search record. const ignoredHeadingSlugs = ['in-this-article', 'further-reading', 'prerequisites'] export default function parsePageSectionsIntoRecords(page) { const { href, $, languageCode } = page const title = $('h1').first().text().trim() const breadcrumbsArray = $('[data-search=breadcrumbs] nav.breadcrumbs a') .map((i, el) => { return $(el).text().trim().replace('/', '').replace(/\s+/g, ' ') }) .get() .slice(0, -1) const breadcrumbs = breadcrumbsArray.join(' / ') || '' const metaKeywords = $('meta[name="keywords"]').attr('content') const topics = metaKeywords ? metaKeywords.split(',') : [] const productName = breadcrumbsArray[0] || '' topics.push(productName) // Remove "github" to make filter queries shorter if (productName.includes('GitHub ')) { topics.push(productName.replace('GitHub ', '')) } const objectID = href const rootSelector = '[data-search=article-body]' const $root = $(rootSelector) const $sections = $('h2', $root) .filter('[id]') .filter((i, el) => { return !ignoredHeadingSlugs.includes($(el).attr('id')) }) const headings = $sections .map((i, el) => $(el).text()) .get() .join(' ') .trim() const intro = $('[data-search=lead] p').text().trim() let body = '' // Typical example pages with no `$root` are: // https://docs.github.com/en/code-security/guides or // https://docs.github.com/en/graphql/overview/explorer // // We need to avoid these because if you use `getAllText()` on these // pages, it will extract *everything* from the page, which will // include the side bar and footer. // TODO: Come up a custom solution to extract some text from these // pages that yields some decent content to be searched on, because // when you view these pages in a browser, there's clearly text there. 
if ($root.length > 0) { body = getAllText($, $root) } if (!body && !intro) { console.warn(`${objectID} has no body and no intro.`) } if (languageCode !== 'en' && body.length > maxContentLength) { body = body.slice(0, maxContentLength) } const content = `${intro}\n${body}`.trim() return { objectID, breadcrumbs, title, headings, content, topics, } } function getAllText($, $root) { let text = '' // We need this so we can know if we processed, for example, // a
  // <td> followed by a non-<td> element, because if that's the case, don't use
  // a ' ' to concatenate the texts together but a '\n' instead.
  // That means, given this input:
  //
  //   <p>Bla</p><table><tr><td>Foo</td><td>Bar</td></tr></table><p>Hi again</p>
// // we can produce this outcome: // // 'Bla\nFoo Bar\nHi again' // let previousTagName = '' $('p, h2, h3, td, pre, li', $root).each((i, element) => { const $element = $(element) if (previousTagName === 'td' && element.tagName !== 'td') { text += '\n' } // Because our cheerio selector is all the block level tags, // what you might end up with is, from: // //Text
Code
. // If all HTML was exactly like that, you could omit the