From e61d71f42f14aa2f2ef8a7190a7e4cb062ad508d Mon Sep 17 00:00:00 2001
From: Peter Bengtsson <peterbe@github.com>
Date: Wed, 13 Jul 2022 23:29:55 +0200
Subject: [PATCH] HTML to plain text is broken in various places (#29006)

* HTML to plain text is broken in various places

* remove comment
---
 .../parse-page-sections-into-records.js       | 81 ++++++++-----------
 ...h-heading-and-paragraph-no-whitespace.html | 23 ++++++
 .../fixtures/page-with-multiple-h1s.html      |  3 +-
 .../parse-page-sections-into-records.js       | 32 +++++++-
 4 files changed, 90 insertions(+), 49 deletions(-)
 create mode 100644 tests/unit/search/fixtures/page-with-heading-and-paragraph-no-whitespace.html
diff --git a/script/search/parse-page-sections-into-records.js b/script/search/parse-page-sections-into-records.js
index ed1426b33f..227ffe0e95 100644
--- a/script/search/parse-page-sections-into-records.js
+++ b/script/search/parse-page-sections-into-records.js
@@ -59,7 +59,7 @@ export default function parsePageSectionsIntoRecords(page) {
   // pages that yields some decent content to be searched on, because
   // when you view these pages in a browser, there's clearly text there.
   if ($root.length > 0) {
-    body = getAllText($, $root)
+    body = getAllText($root)
   }
 
   if (!body && !intro) {
@@ -85,55 +85,42 @@ export default function parsePageSectionsIntoRecords(page) {
   }
 }
 
-function getAllText($, $root) {
-  let text = ''
+function getAllText($root) { // plain text of everything under $root[0], as one trimmed string
+  const inlineElements = new Set( // tag names treated as inline when placing newlines
+    `a,abbr,acronym,audio,b,bdi,bdo,big,br,button,canvas,cite,code,data,
+    datalist,del,dfn,em,embed,i,iframe,img,input,ins,kbd,label,map,mark,
+    meter,noscript,object,output,picture,progress,q,ruby,s,samp,script,
+    select,slot,small,span,strong,sub,sup,svg,template,textarea,time,
+    tt,u,var,video,wbr`
+      .split(',')
+      .map((s) => s.trim()) // drop the newline/indent left by the wrapped literal
+  )
 
-  // We need this so we can know if we processed, for example,
-  // a <td> followed by a <p> because if that's the case, don't use
-  // a ' ' to concatenate the texts together but a '\n' instead.
-  // That means, given this input:
-  //
-  //    <p>Bla</p><table><tr><td>Foo</td><td>Bar</td></table><p>Hi again</p>
-  //
-  // we can produce this outcome:
-  //
-  //    'Bla\nFoo Bar\nHi again'
-  //
-  let previousTagName = ''
+  const walkTree = (node, callback, index = 0, level = 0) => { // pre-order depth-first walk
+    callback(node, index, level) // index = position among siblings, level = depth from root
+    for (let i = 0; i < (node.children || []).length; i++) { // no children => nothing to visit
+      walkTree(node.children[i], callback, i, ++level) // children sit one level deeper
+      level-- // restore this node's level for the next child
+    }
+  }
 
-  $('p, h2, h3, td, pre, li', $root).each((i, element) => {
-    const $element = $(element)
-    if (previousTagName === 'td' && element.tagName !== 'td') {
-      text += '\n'
+  const fragments = [] // text pieces, in document order
+
+  walkTree($root[0], (element) => { // the index/level arguments are not used by this callback
+    if (element.name === 'body') return // skip <body> itself; its children are still visited
+
+    if (element.type === 'text') { // only text nodes contribute output
+      const parentElement = element.parent || {} // {} fallback so .name is just undefined
+      const previousElement = element.prev || {} // previous sibling node, if any
+      let { data } = element
+      if (data.trim()) { // skip whitespace-only text
+        if (!inlineElements.has(parentElement.name) && !inlineElements.has(previousElement.name)) {
+          data = `\n${data}` // not inside or right after an inline tag: start a new line
+        }
+        fragments.push(data)
+      }
     }
-    // Because our cheerio selector is all the block level tags,
-    // what you might end up with is, from:
-    //
-    //   <li><p>Text</p></li>
-    //   <li><pre>Code</pre></li>
-    //
-    //   ['Text', 'Text', 'Code', 'Code']
-    //
-    // because it will spot both the <li> and the <p>.
-    // If all HTML was exactly like that, you could omit the <li> selector,
-    // but a lot of HTML is like this:
-    //
-    //    <li>Bare text<li>
-    //
-    // So we need to bail if we're inside a block level element whose parent
-    // already was a <li>.
-    if ((element.tagName === 'p' || element.tagName === 'pre') && element.parent.tagName === 'li') {
-      return
-    }
-    text += $element.text()
-    if (element.tagName === 'td') {
-      text += ' '
-    } else {
-      text += '\n'
-    }
-    previousTagName = element.tagName
   })
-  text = text.trim().replace(/\s*[\r\n]+/g, '\n')
 
-  return text
+  return fragments.join('').trim() // newlines come only from the prefixes added above
 }
diff --git a/tests/unit/search/fixtures/page-with-heading-and-paragraph-no-whitespace.html b/tests/unit/search/fixtures/page-with-heading-and-paragraph-no-whitespace.html
new file mode 100644
index 0000000000..c46b88c799
--- /dev/null
+++ b/tests/unit/search/fixtures/page-with-heading-and-paragraph-no-whitespace.html
@@ -0,0 +1,23 @@
+<div data-search="breadcrumbs">
+  <nav class="breadcrumbs">
+    <a href="#">GitHub Actions</a>
+    <a href="#">actions learning path</a>
+    <a href="#">I am the page title</a>
+  </nav>
+</div>
+
+<h1>I am the page title</h1>
+
+<div data-search="lead">
+  <p>This is an introduction to the article.</p>
+</div>
+
+<div data-search="article-body">
+  <h1>Heading</h1>
+
+  <!-- Deliberately no whitespace between tags -->
+  <div><ul><ul><li><div><span><div><a href="foo"><h2>Adding an email address to your GitHub account</h2><p>GitHub, see "<a href="/en/articles/setting-your-commit-email-address">Setting your commit email address</a>."</p></a></div></span></div></li>
+    <li><div><span><div><a href="/"><h2>Changing your primary email address</h2><p>You can change the email address associated with your personal account at any time.</p></a></div></span></div></li>
+   </ul></ul></div>
+
+</div>
diff --git a/tests/unit/search/fixtures/page-with-multiple-h1s.html b/tests/unit/search/fixtures/page-with-multiple-h1s.html
index 2e074f243e..d0252a5dc2 100644
--- a/tests/unit/search/fixtures/page-with-multiple-h1s.html
+++ b/tests/unit/search/fixtures/page-with-multiple-h1s.html
@@ -14,5 +14,6 @@
 
 <div data-search="article-body">
   <h1>A heading 1 inside the body</h1>
-  <p>This won't be ignored.</p>
+
+<div data-search="article-body" class="Box-sc-1gh2r6s-0 fWkkBJ"><div class="d-flex flex-items-baseline flex-justify-between"><h1 class="border-bottom-0">Managing email preferences</h1></div><div class="f2 color-fg-muted mb-3 Lead_container__g1kT8" data-search="lead">You can add or change the email addresses associated with your account on GitHub.com. You can also manage emails you receive from GitHub.</div><div class="border-bottom border-xl-0 pb-4 mb-5 pb-xl-2 mb-xl-2"></div><div class="mt-7"><ul data-testid="table-of-contents" class="list-style-none"><ul class="List__ListBox-sc-1x7olzq-0 iFaQQI"><li tabindex="0" aria-labelledby="react-aria-1029 " class="Item__LiBox-sc-yeql7o-0 cBNrzJ border-bottom"><div data-component="ActionList.Item--DividerContainer" class="Box-sc-1gh2r6s-0 gwyGig"><span id="react-aria-1029" class="Box-sc-1gh2r6s-0 gvhUXE"><div class="mt-2"><a rel="" data-testid="bump-link" class="BumpLink_container__lXyMT no-underline d-block py-1" href="/en/account-and-profile/setting-up-and-managing-your-personal-account-on-github/managing-email-preferences/adding-an-email-address-to-your-github-account"><h2 class="py-1 h4">Adding an email address to your GitHub account</h2><p class="f4 color-fg-muted">GitHub allows you to add as many email addresses to your account as you like. If you set an email address in your local Git configuration, you will need to add it to your account settings in order to connect your commits to your account. 
For more information about your email address and commits, see "<a href="/en/articles/setting-your-commit-email-address">Setting your commit email address</a>."</p></a></div></span></div></li><li tabindex="0" aria-labelledby="react-aria-1032 " class="Item__LiBox-sc-yeql7o-0 cBNrzJ border-bottom"><div data-component="ActionList.Item--DividerContainer" class="Box-sc-1gh2r6s-0 gwyGig"><span id="react-aria-1032" class="Box-sc-1gh2r6s-0 gvhUXE"><div class="mt-2"><a rel="" data-testid="bump-link" class="BumpLink_container__lXyMT no-underline d-block py-1" href="/en/account-and-profile/setting-up-and-managing-your-personal-account-on-github/managing-email-preferences/changing-your-primary-email-address"><h2 class="py-1 h4">Changing your primary email address</h2><p class="f4 color-fg-muted">You can change the email address associated with your personal account at any time.</p></a></div></span></div></li><li tabindex="0" aria-labelledby="react-aria-1035 " class="Item__LiBox-sc-yeql7o-0 cBNrzJ border-bottom"><div data-component="ActionList.Item--DividerContainer" class="Box-sc-1gh2r6s-0 gwyGig"><span id="react-aria-1035" class="Box-sc-1gh2r6s-0 gvhUXE"><div class="mt-2"><a rel="" data-testid="bump-link" class="BumpLink_container__lXyMT no-underline d-block py-1" href="/en/account-and-profile/setting-up-and-managing-your-personal-account-on-github/managing-email-preferences/setting-a-backup-email-address"><h2 class="py-1 h4">Setting a backup email address</h2><p class="f4 color-fg-muted">Use a backup email address as an additional destination for security-relevant account notifications and to securely reset your password if you can no longer access your primary email address.</p></a></div></span></div></li><li tabindex="0" aria-labelledby="react-aria-1038 " class="Item__LiBox-sc-yeql7o-0 cBNrzJ border-bottom"><div data-component="ActionList.Item--DividerContainer" class="Box-sc-1gh2r6s-0 gwyGig"><span id="react-aria-1038" class="Box-sc-1gh2r6s-0 gvhUXE"><div class="mt-2"><a 
rel="" data-testid="bump-link" class="BumpLink_container__lXyMT no-underline d-block py-1" href="/en/account-and-profile/setting-up-and-managing-your-personal-account-on-github/managing-email-preferences/setting-your-commit-email-address"><h2 class="py-1 h4">Setting your commit email address</h2><p class="f4 color-fg-muted">You can set the email address that is used to author commits on GitHub.com and on your computer.</p></a></div></span></div></li><li tabindex="0" aria-labelledby="react-aria-1041 " class="Item__LiBox-sc-yeql7o-0 cBNrzJ border-bottom"><div data-component="ActionList.Item--DividerContainer" class="Box-sc-1gh2r6s-0 gwyGig"><span id="react-aria-1041" class="Box-sc-1gh2r6s-0 gvhUXE"><div class="mt-2"><a rel="" data-testid="bump-link" class="BumpLink_container__lXyMT no-underline d-block py-1" href="/en/account-and-profile/setting-up-and-managing-your-personal-account-on-github/managing-email-preferences/blocking-command-line-pushes-that-expose-your-personal-email-address"><h2 class="py-1 h4">Blocking command line pushes that expose your personal email address</h2><p class="f4 color-fg-muted">If you've chosen to keep your email address private when performing web-based operations, you can also choose to block command line pushes that may expose your personal email address.</p></a></div></span></div></li><li tabindex="0" aria-labelledby="react-aria-1044 " class="Item__LiBox-sc-yeql7o-0 cBNrzJ border-bottom"><div data-component="ActionList.Item--DividerContainer" class="Box-sc-1gh2r6s-0 gwyGig"><span id="react-aria-1044" class="Box-sc-1gh2r6s-0 gvhUXE"><div class="mt-2"><a rel="" data-testid="bump-link" class="BumpLink_container__lXyMT no-underline d-block py-1" href="/en/account-and-profile/setting-up-and-managing-your-personal-account-on-github/managing-email-preferences/remembering-your-github-username-or-email"><h2 class="py-1 h4">Remembering your GitHub username or email</h2><p class="f4 color-fg-muted">Are you signing in to GitHub.com for the first 
time in a while? If so, welcome back! If you can't remember the username for your personal account on GitHub, you can try these methods for remembering it.</p></a></div></span></div></li><li tabindex="0" aria-labelledby="react-aria-1047 " class="Item__LiBox-sc-yeql7o-0 cBNrzJ border-bottom"><div data-component="ActionList.Item--DividerContainer" class="Box-sc-1gh2r6s-0 gwyGig"><span id="react-aria-1047" class="Box-sc-1gh2r6s-0 gvhUXE"><div class="mt-2"><a rel="" data-testid="bump-link" class="BumpLink_container__lXyMT no-underline d-block py-1" href="/en/account-and-profile/setting-up-and-managing-your-personal-account-on-github/managing-email-preferences/types-of-emails-github-sends"><h2 class="py-1 h4">Types of emails GitHub sends</h2><p class="f4 color-fg-muted">There are several types of emails you can receive from GitHub, including notifications, account information, customer research invitations, and marketing communications.</p></a></div></span></div></li><li tabindex="0" aria-labelledby="react-aria-1050 " class="Item__LiBox-sc-yeql7o-0 cBNrzJ border-bottom"><div data-component="ActionList.Item--DividerContainer" class="Box-sc-1gh2r6s-0 gwyGig"><span id="react-aria-1050" class="Box-sc-1gh2r6s-0 gvhUXE"><div class="mt-2"><a rel="" data-testid="bump-link" class="BumpLink_container__lXyMT no-underline d-block py-1" href="/en/account-and-profile/setting-up-and-managing-your-personal-account-on-github/managing-email-preferences/managing-marketing-emails-from-github"><h2 class="py-1 h4">Managing marketing emails from GitHub</h2><p class="f4 color-fg-muted">In addition to notifications and account emails, GitHub occasionally sends marketing emails with news and information about our products. If you unsubscribe from existing marketing emails, you won't be included in future campaigns unless you change your GitHub email settings.</p></a></div></span></div></li></ul></ul></div></div>
 </div>
diff --git a/tests/unit/search/parse-page-sections-into-records.js b/tests/unit/search/parse-page-sections-into-records.js
index 13141aa2d2..eff36a033d 100644
--- a/tests/unit/search/parse-page-sections-into-records.js
+++ b/tests/unit/search/parse-page-sections-into-records.js
@@ -1,7 +1,10 @@
 import { fileURLToPath } from 'url'
 import path from 'path'
 import fs from 'fs/promises'
+
 import cheerio from 'cheerio'
+import { expect, test } from '@jest/globals'
+
 import parsePageSectionsIntoRecords from '../../../script/search/parse-page-sections-into-records.js'
 const __dirname = path.dirname(fileURLToPath(import.meta.url))
 
@@ -22,6 +25,10 @@ const fixtures = {
     path.join(__dirname, 'fixtures/page-with-multiple-h1s.html'),
     'utf8'
   ),
+  pageHeadingParagraphNoWhitespace: await fs.readFile(
+    path.join(__dirname, 'fixtures/page-with-heading-and-paragraph-no-whitespace.html'),
+    'utf8'
+  ),
 }
 
 describe('search parsePageSectionsIntoRecords module', () => {
@@ -40,7 +47,7 @@ describe('search parsePageSectionsIntoRecords module', () => {
         "In this article\nThis won't be ignored.\nFirst heading\n" +
         "Here's a paragraph.\nAnd another.\nSecond heading\n" +
         "Here's a paragraph in the second section.\nAnd another.\n" +
-        'Table heading\nPeter Human\n' +
+        'Table heading\nPeter\nHuman\n' +
         'Bullet\nPoint\nNumbered\nList\n' +
         "Further reading\nThis won't be ignored.",
       topics: ['topic1', 'topic2', 'GitHub Actions', 'Actions'],
@@ -90,4 +97,27 @@ describe('search parsePageSectionsIntoRecords module', () => {
     const record = parsePageSectionsIntoRecords({ href, $, languageCode: 'en' })
     expect(record.title).toEqual('I am the page title')
   })
+
+  test("content doesn't lump headings with paragraphs together", () => {
+    const html = fixtures.pageHeadingParagraphNoWhitespace
+    const $ = cheerio.load(html)
+    const href = '/example/href'
+    const record = parsePageSectionsIntoRecords({ href, $, languageCode: 'en' })
+
+    // This is a <h2> inside the page but it should only appear once.
+    // We had a bug where the heading would be injected twice.
+    // E.g.
+    //
+    //    <h2>Heading</h2><p>Text here</p>
+    //
+    // would become:
+    //
+    //    Heading\nHeadingText here
+    //
+    // So now we make sure it only appears exactly once.
+    expect(record.content.match(/Changing your primary email address/g).length).toBe(1)
+    // But note also that it would also concatenate the text of the heading
+    // with the text of the paragraph without a whitespace in between.
+    expect(record.content.includes('email addressYou can set')).toBeFalsy()
+  })
 })