HTML to plain text is broken in various places (#29006)

* HTML to plain text is broken in various places * remove comment
2026-01-07 09:01:31 -05:00 · 2022-07-13 23:29:55 +02:00
parent 97212c5114
commit e61d71f42f
4 changed files with 90 additions and 49 deletions
--- a/tests/unit/search/fixtures/page-with-heading-and-paragraph-no-whitespace.html
+++ b/tests/unit/search/fixtures/page-with-heading-and-paragraph-no-whitespace.html
@@ -0,0 +1,23 @@
+<div data-search="breadcrumbs">
+  <nav class="breadcrumbs">
+    <a href="#">GitHub Actions</a>
+    <a href="#">actions learning path</a>
+    <a href="#">I am the page title</a>
+  </nav>
+</div>
+
+<h1>I am the page title</h1>
+
+<div data-search="lead">
+  <p>This is an introduction to the article.</p>
+</div>
+
+<div data-search="article-body">
+  <h1>Heading</h1>
+
+  <!-- Deliberately no whitespace between tags -->
+  <div><ul><ul><li><div><span><div><a href="foo"><h2>Adding an email address to your GitHub account</h2><p>GitHub, see "<a href="/en/articles/setting-your-commit-email-address">Setting your commit email address</a>."</p></a></div></span></div></li>
+    <li><div><span><div><a href="/"><h2>Changing your primary email address</h2><p>You can change the email address associated with your personal account at any time.</p></a></div></span></div></li>
+   </ul></ul></div>
+
+</div>
--- a/tests/unit/search/fixtures/page-with-multiple-h1s.html
+++ b/tests/unit/search/fixtures/page-with-multiple-h1s.html
--- a/tests/unit/search/parse-page-sections-into-records.js
+++ b/tests/unit/search/parse-page-sections-into-records.js
@@ -1,7 +1,10 @@
 import { fileURLToPath } from 'url'
 import path from 'path'
 import fs from 'fs/promises'
+
 import cheerio from 'cheerio'
+import { expect, test } from '@jest/globals'
+
 import parsePageSectionsIntoRecords from '../../../script/search/parse-page-sections-into-records.js'
 const __dirname = path.dirname(fileURLToPath(import.meta.url))

@@ -22,6 +25,10 @@ const fixtures = {
    path.join(__dirname, 'fixtures/page-with-multiple-h1s.html'),
    'utf8'
  ),
+  pageHeadingParagraphNoWhitespace: await fs.readFile(
+    path.join(__dirname, 'fixtures/page-with-heading-and-paragraph-no-whitespace.html'),
+    'utf8'
+  ),
 }

 describe('search parsePageSectionsIntoRecords module', () => {
@@ -40,7 +47,7 @@ describe('search parsePageSectionsIntoRecords module', () => {
        "In this article\nThis won't be ignored.\nFirst heading\n" +
        "Here's a paragraph.\nAnd another.\nSecond heading\n" +
        "Here's a paragraph in the second section.\nAnd another.\n" +
-        'Table heading\nPeter Human\n' +
+        'Table heading\nPeter\nHuman\n' +
        'Bullet\nPoint\nNumbered\nList\n' +
        "Further reading\nThis won't be ignored.",
      topics: ['topic1', 'topic2', 'GitHub Actions', 'Actions'],
@@ -90,4 +97,27 @@ describe('search parsePageSectionsIntoRecords module', () => {
    const record = parsePageSectionsIntoRecords({ href, $, languageCode: 'en' })
    expect(record.title).toEqual('I am the page title')
  })
+
+  test("content doesn't lump headings with paragraphs together", () => {
+    const html = fixtures.pageHeadingParagraphNoWhitespace
+    const $ = cheerio.load(html)
+    const href = '/example/href'
+    const record = parsePageSectionsIntoRecords({ href, $, languageCode: 'en' })
+
+    // This is a <h2> inside the page but it should only appear once.
+    // We had a bug where the heading would be injected twice.
+    // E.g.
+    //
+    //    <h2>Heading</h2><p>Text here</p>
+    //
+    // would become:
+    //
+    //    Heading\nHeadingText here
+    //
+    // So now we make sure it only appears exactly once.
+    expect(record.content.match(/Changing your primary email address/g).length).toBe(1)
+    // But note also that it would also concatenate the text of the heading
+    // with the text of the paragraph without a whitespace in between.
+    expect(record.content.includes('email addressYou can set')).toBeFalsy()
+  })
 })