diff --git a/script/search/parse-page-sections-into-records.js b/script/search/parse-page-sections-into-records.js index ed1426b33f..227ffe0e95 100644 --- a/script/search/parse-page-sections-into-records.js +++ b/script/search/parse-page-sections-into-records.js @@ -59,7 +59,7 @@ export default function parsePageSectionsIntoRecords(page) { // pages that yields some decent content to be searched on, because // when you view these pages in a browser, there's clearly text there. if ($root.length > 0) { - body = getAllText($, $root) + body = getAllText($root) } if (!body && !intro) { @@ -85,55 +85,42 @@ export default function parsePageSectionsIntoRecords(page) { } } -function getAllText($, $root) { - let text = '' +function getAllText($root) { + const inlineElements = new Set( + `a,abbr,acronym,audio,b,bdi,bdo,big,br,button,canvas,cite,code,data, + datalist,del,dfn,em,embed,i,iframe,img,input,ins,kbd,label,map,mark, + meter,noscript,object,output,picture,progress,q,ruby,s,samp,script, + select,slot,small,span,strong,sub,sup,svg,template,textarea,time, + tt,u,var,video,wbr` + .split(',') + .map((s) => s.trim()) + ) - // We need this so we can know if we processed, for example, - // a
because if that's the case, don't use - // a ' ' to concatenate the texts together but a '\n' instead. - // That means, given this input: - // - //
Bla
| Foo | Bar |
Hi again
- // - // we can produce this outcome: - // - // 'Bla\nFoo Bar\nHi again' - // - let previousTagName = '' + const walkTree = (node, callback, index = 0, level = 0) => { + callback(node, index, level) + for (let i = 0; i < (node.children || []).length; i++) { + walkTree(node.children[i], callback, i, ++level) + level-- + } + } - $('p, h2, h3, td, pre, li', $root).each((i, element) => { - const $element = $(element) - if (previousTagName === 'td' && element.tagName !== 'td') { - text += '\n' + const fragments = [] + + walkTree($root[0], (element) => { + if (element.name === 'body') return + + if (element.type === 'text') { + const parentElement = element.parent || {} + const previousElement = element.prev || {} + let { data } = element + if (data.trim()) { + if (!inlineElements.has(parentElement.name) && !inlineElements.has(previousElement.name)) { + data = `\n${data}` + } + fragments.push(data) + } } - // Because our cheerio selector is all the block level tags, - // what you might end up with is, from: - // - //Text
Code
. - // If all HTML was exactly like that, you could omit the
This is an introduction to the article.
+This won't be ignored.
+ +GitHub allows you to add as many email addresses to your account as you like. If you set an email address in your local Git configuration, you will need to add it to your account settings in order to connect your commits to your account. For more information about your email address and commits, see "Setting your commit email address."
Text here
+ // + // would become: + // + // Heading\nHeadingText here + // + // So now we make sure it only appears exactly once. + expect(record.content.match(/Changing your primary email address/g).length).toBe(1) + // But note that it would also concatenate the text of the heading + // with the text of the paragraph without any whitespace in between. + expect(record.content.includes('email addressYou can set')).toBeFalsy() + }) })