From e61d71f42f14aa2f2ef8a7190a7e4cb062ad508d Mon Sep 17 00:00:00 2001 From: Peter Bengtsson Date: Wed, 13 Jul 2022 23:29:55 +0200 Subject: [PATCH] HTML to plain text is broken in various places (#29006) * HTML to plain text is broken in various places * remove comment --- .../parse-page-sections-into-records.js | 81 ++++++++----------- ...h-heading-and-paragraph-no-whitespace.html | 23 ++++++ .../fixtures/page-with-multiple-h1s.html | 3 +- .../parse-page-sections-into-records.js | 32 +++++++- 4 files changed, 90 insertions(+), 49 deletions(-) create mode 100644 tests/unit/search/fixtures/page-with-heading-and-paragraph-no-whitespace.html diff --git a/script/search/parse-page-sections-into-records.js b/script/search/parse-page-sections-into-records.js index ed1426b33f..227ffe0e95 100644 --- a/script/search/parse-page-sections-into-records.js +++ b/script/search/parse-page-sections-into-records.js @@ -59,7 +59,7 @@ export default function parsePageSectionsIntoRecords(page) { // pages that yields some decent content to be searched on, because // when you view these pages in a browser, there's clearly text there. if ($root.length > 0) { - body = getAllText($, $root) + body = getAllText($root) } if (!body && !intro) { @@ -85,55 +85,42 @@ export default function parsePageSectionsIntoRecords(page) { } } -function getAllText($, $root) { - let text = '' +function getAllText($root) { + const inlineElements = new Set( + `a,abbr,acronym,audio,b,bdi,bdo,big,br,button,canvas,cite,code,data, + datalist,del,dfn,em,embed,i,iframe,img,input,ins,kbd,label,map,mark, + meter,noscript,object,output,picture,progress,q,ruby,s,samp,script, + select,slot,small,span,strong,sub,sup,svg,template,textarea,time, + tt,u,var,video,wbr` + .split(',') + .map((s) => s.trim()) + ) - // We need this so we can know if we processed, for example, - // a followed by a

because if that's the case, don't use - // a ' ' to concatenate the texts together but a '\n' instead. - // That means, given this input: - // - //

Bla

FooBar

Hi again

- // - // we can produce this outcome: - // - // 'Bla\nFoo Bar\nHi again' - // - let previousTagName = '' + const walkTree = (node, callback, index = 0, level = 0) => { + callback(node, index, level) + for (let i = 0; i < (node.children || []).length; i++) { + walkTree(node.children[i], callback, i, ++level) + level-- + } + } - $('p, h2, h3, td, pre, li', $root).each((i, element) => { - const $element = $(element) - if (previousTagName === 'td' && element.tagName !== 'td') { - text += '\n' + const fragments = [] + + walkTree($root[0], (element) => { + if (element.name === 'body') return + + if (element.type === 'text') { + const parentElement = element.parent || {} + const previousElement = element.prev || {} + let { data } = element + if (data.trim()) { + if (!inlineElements.has(parentElement.name) && !inlineElements.has(previousElement.name)) { + data = `\n${data}` + } + fragments.push(data) + } } - // Because our cheerio selector is all the block level tags, - // what you might end up with is, from: - // - //
  • Text

  • - //
  • Code
  • - // - // ['Text', 'Text', 'Code', 'Code'] - // - // because it will spot both the
  • and the

    . - // If all HTML was exactly like that, you could omit the

  • selector, - // but a lot of HTML is like this: - // - //
  • Bare text
  • - // - // So we need to bail if we're inside a block level element whose parent - // already was a
  • . - if ((element.tagName === 'p' || element.tagName === 'pre') && element.parent.tagName === 'li') { - return - } - text += $element.text() - if (element.tagName === 'td') { - text += ' ' - } else { - text += '\n' - } - previousTagName = element.tagName }) - text = text.trim().replace(/\s*[\r\n]+/g, '\n') - return text + return fragments.join('').trim() } diff --git a/tests/unit/search/fixtures/page-with-heading-and-paragraph-no-whitespace.html b/tests/unit/search/fixtures/page-with-heading-and-paragraph-no-whitespace.html new file mode 100644 index 0000000000..c46b88c799 --- /dev/null +++ b/tests/unit/search/fixtures/page-with-heading-and-paragraph-no-whitespace.html @@ -0,0 +1,23 @@ +
    + +
    + +

    I am the page title

    + +
    +

    This is an introduction to the article.

    +
    + +
    +

    Heading

    + + +
    + +
    diff --git a/tests/unit/search/fixtures/page-with-multiple-h1s.html b/tests/unit/search/fixtures/page-with-multiple-h1s.html index 2e074f243e..d0252a5dc2 100644 --- a/tests/unit/search/fixtures/page-with-multiple-h1s.html +++ b/tests/unit/search/fixtures/page-with-multiple-h1s.html @@ -14,5 +14,6 @@

    A heading 1 inside the body

    -

    This won't be ignored.

    + +

    Managing email preferences

    You can add or change the email addresses associated with your account on GitHub.com. You can also manage emails you receive from GitHub.
    diff --git a/tests/unit/search/parse-page-sections-into-records.js b/tests/unit/search/parse-page-sections-into-records.js index 13141aa2d2..eff36a033d 100644 --- a/tests/unit/search/parse-page-sections-into-records.js +++ b/tests/unit/search/parse-page-sections-into-records.js @@ -1,7 +1,10 @@ import { fileURLToPath } from 'url' import path from 'path' import fs from 'fs/promises' + import cheerio from 'cheerio' +import { expect, test } from '@jest/globals' + import parsePageSectionsIntoRecords from '../../../script/search/parse-page-sections-into-records.js' const __dirname = path.dirname(fileURLToPath(import.meta.url)) @@ -22,6 +25,10 @@ const fixtures = { path.join(__dirname, 'fixtures/page-with-multiple-h1s.html'), 'utf8' ), + pageHeadingParagraphNoWhitespace: await fs.readFile( + path.join(__dirname, 'fixtures/page-with-heading-and-paragraph-no-whitespace.html'), + 'utf8' + ), } describe('search parsePageSectionsIntoRecords module', () => { @@ -40,7 +47,7 @@ describe('search parsePageSectionsIntoRecords module', () => { "In this article\nThis won't be ignored.\nFirst heading\n" + "Here's a paragraph.\nAnd another.\nSecond heading\n" + "Here's a paragraph in the second section.\nAnd another.\n" + - 'Table heading\nPeter Human\n' + + 'Table heading\nPeter\nHuman\n' + 'Bullet\nPoint\nNumbered\nList\n' + "Further reading\nThis won't be ignored.", topics: ['topic1', 'topic2', 'GitHub Actions', 'Actions'], @@ -90,4 +97,27 @@ describe('search parsePageSectionsIntoRecords module', () => { const record = parsePageSectionsIntoRecords({ href, $, languageCode: 'en' }) expect(record.title).toEqual('I am the page title') }) + + test("content doesn't lump headings with paragraphs together", () => { + const html = fixtures.pageHeadingParagraphNoWhitespace + const $ = cheerio.load(html) + const href = '/example/href' + const record = parsePageSectionsIntoRecords({ href, $, languageCode: 'en' }) + + // This is a

    inside the page but it should only appear once. + // We had a bug where the heading would be injected twice. + // E.g. + // + //

    Heading

    Text here

    + // + // would become: + // + // Heading\nHeadingText here + // + // So now we make sure it only appears exactly once. + expect(record.content.match(/Changing your primary email address/g).length).toBe(1) + // But note also that it would also concatenate the text of the heading + // with the text of the paragraph without a whitespace in between. + expect(record.content.includes('email addressYou can set')).toBeFalsy() + }) })