1
0
mirror of synced 2026-01-07 09:01:31 -05:00

HTML to plain text is broken in various places (#29006)

* HTML to plain text is broken in various places

* remove comment
This commit is contained in:
Peter Bengtsson
2022-07-13 23:29:55 +02:00
committed by GitHub
parent 97212c5114
commit e61d71f42f
4 changed files with 90 additions and 49 deletions

View File

@@ -0,0 +1,23 @@
<!--
Test fixture: a page whose article body has headings immediately followed by
paragraphs with no whitespace between the tags. Presumably this is the
"page-with-heading-and-paragraph-no-whitespace.html" fixture used by the
parsePageSectionsIntoRecords tests; confirm against the test file.
-->
<div data-search="breadcrumbs">
<nav class="breadcrumbs">
<a href="#">GitHub Actions</a>
<a href="#">actions learning path</a>
<a href="#">I am the page title</a>
</nav>
</div>
<h1>I am the page title</h1>
<div data-search="lead">
<p>This is an introduction to the article.</p>
</div>
<div data-search="article-body">
<h1>Heading</h1>
<!-- Deliberately no whitespace between tags -->
<div><ul><ul><li><div><span><div><a href="foo"><h2>Adding an email address to your GitHub account</h2><p>GitHub, see "<a href="/en/articles/setting-your-commit-email-address">Setting your commit email address</a>."</p></a></div></span></div></li>
<li><div><span><div><a href="/"><h2>Changing your primary email address</h2><p>You can change the email address associated with your personal account at any time.</p></a></div></span></div></li>
</ul></ul></div>
</div>

File diff suppressed because one or more lines are too long

View File

@@ -1,7 +1,10 @@
import { fileURLToPath } from 'url'
import path from 'path'
import fs from 'fs/promises'
import cheerio from 'cheerio'
import { expect, test } from '@jest/globals'
import parsePageSectionsIntoRecords from '../../../script/search/parse-page-sections-into-records.js'
const __dirname = path.dirname(fileURLToPath(import.meta.url))
@@ -22,6 +25,10 @@ const fixtures = {
path.join(__dirname, 'fixtures/page-with-multiple-h1s.html'),
'utf8'
),
pageHeadingParagraphNoWhitespace: await fs.readFile(
path.join(__dirname, 'fixtures/page-with-heading-and-paragraph-no-whitespace.html'),
'utf8'
),
}
describe('search parsePageSectionsIntoRecords module', () => {
@@ -40,7 +47,7 @@ describe('search parsePageSectionsIntoRecords module', () => {
"In this article\nThis won't be ignored.\nFirst heading\n" +
"Here's a paragraph.\nAnd another.\nSecond heading\n" +
"Here's a paragraph in the second section.\nAnd another.\n" +
'Table heading\nPeter Human\n' +
'Table heading\nPeter\nHuman\n' +
'Bullet\nPoint\nNumbered\nList\n' +
"Further reading\nThis won't be ignored.",
topics: ['topic1', 'topic2', 'GitHub Actions', 'Actions'],
@@ -90,4 +97,27 @@ describe('search parsePageSectionsIntoRecords module', () => {
const record = parsePageSectionsIntoRecords({ href, $, languageCode: 'en' })
expect(record.title).toEqual('I am the page title')
})
// Regression test for heading/paragraph handling when the HTML has no
// whitespace between tags (uses the `pageHeadingParagraphNoWhitespace` fixture).
test("content doesn't lump headings with paragraphs together", () => {
  const html = fixtures.pageHeadingParagraphNoWhitespace
  const $ = cheerio.load(html)
  const href = '/example/href'
  const record = parsePageSectionsIntoRecords({ href, $, languageCode: 'en' })

  // This is a <h2> inside the page but it should only appear once.
  // We had a bug where the heading would be injected twice.
  // E.g.
  //
  //   <h2>Heading</h2><p>Text here</p>
  //
  // would become:
  //
  //   Heading\nHeadingText here
  //
  // So now we make sure it appears exactly once.
  // `match` returns null when nothing matches; fall back to an empty array so
  // a regression fails with a clear assertion instead of a TypeError.
  const headingMatches = record.content.match(/Changing your primary email address/g) ?? []
  expect(headingMatches).toHaveLength(1)

  // The same bug also concatenated the text of the heading with the text of
  // the paragraph without any whitespace in between. The string checked here
  // must be the end of the fixture's <h2> glued to the start of its <p>
  // ("You can change ..."), otherwise the assertion can never fail.
  expect(record.content).not.toContain('email addressYou can change')
})
})