HTML to plain text is broken in various places (#29006)
* HTML to plain text is broken in various places * remove comment
This commit is contained in:
@@ -0,0 +1,23 @@
|
||||
<div data-search="breadcrumbs">
|
||||
<nav class="breadcrumbs">
|
||||
<a href="#">GitHub Actions</a>
|
||||
<a href="#">actions learning path</a>
|
||||
<a href="#">I am the page title</a>
|
||||
</nav>
|
||||
</div>
|
||||
|
||||
<h1>I am the page title</h1>
|
||||
|
||||
<div data-search="lead">
|
||||
<p>This is an introduction to the article.</p>
|
||||
</div>
|
||||
|
||||
<div data-search="article-body">
|
||||
<h1>Heading</h1>
|
||||
|
||||
<!-- Deliberately no whitespace between tags -->
|
||||
<div><ul><ul><li><div><span><div><a href="foo"><h2>Adding an email address to your GitHub account</h2><p>GitHub, see "<a href="/en/articles/setting-your-commit-email-address">Setting your commit email address</a>."</p></a></div></span></div></li>
|
||||
<li><div><span><div><a href="/"><h2>Changing your primary email address</h2><p>You can change the email address associated with your personal account at any time.</p></a></div></span></div></li>
|
||||
</ul></ul></div>
|
||||
|
||||
</div>
|
||||
File diff suppressed because one or more lines are too long
@@ -1,7 +1,10 @@
|
||||
import { fileURLToPath } from 'url'
|
||||
import path from 'path'
|
||||
import fs from 'fs/promises'
|
||||
|
||||
import cheerio from 'cheerio'
|
||||
import { expect, test } from '@jest/globals'
|
||||
|
||||
import parsePageSectionsIntoRecords from '../../../script/search/parse-page-sections-into-records.js'
|
||||
const __dirname = path.dirname(fileURLToPath(import.meta.url))
|
||||
|
||||
@@ -22,6 +25,10 @@ const fixtures = {
|
||||
path.join(__dirname, 'fixtures/page-with-multiple-h1s.html'),
|
||||
'utf8'
|
||||
),
|
||||
pageHeadingParagraphNoWhitespace: await fs.readFile(
|
||||
path.join(__dirname, 'fixtures/page-with-heading-and-paragraph-no-whitespace.html'),
|
||||
'utf8'
|
||||
),
|
||||
}
|
||||
|
||||
describe('search parsePageSectionsIntoRecords module', () => {
|
||||
@@ -40,7 +47,7 @@ describe('search parsePageSectionsIntoRecords module', () => {
|
||||
"In this article\nThis won't be ignored.\nFirst heading\n" +
|
||||
"Here's a paragraph.\nAnd another.\nSecond heading\n" +
|
||||
"Here's a paragraph in the second section.\nAnd another.\n" +
|
||||
'Table heading\nPeter Human\n' +
|
||||
'Table heading\nPeter\nHuman\n' +
|
||||
'Bullet\nPoint\nNumbered\nList\n' +
|
||||
"Further reading\nThis won't be ignored.",
|
||||
topics: ['topic1', 'topic2', 'GitHub Actions', 'Actions'],
|
||||
@@ -90,4 +97,27 @@ describe('search parsePageSectionsIntoRecords module', () => {
|
||||
const record = parsePageSectionsIntoRecords({ href, $, languageCode: 'en' })
|
||||
expect(record.title).toEqual('I am the page title')
|
||||
})
|
||||
|
||||
// Regression test for HTML-to-plain-text conversion when a heading is
// immediately followed by a paragraph with no whitespace between the tags.
test("content doesn't lump headings with paragraphs together", () => {
  const html = fixtures.pageHeadingParagraphNoWhitespace
  const $ = cheerio.load(html)
  const href = '/example/href'
  const record = parsePageSectionsIntoRecords({ href, $, languageCode: 'en' })

  // This is an <h2> inside the page but it should only appear once.
  // We had a bug where the heading would be injected twice.
  // E.g.
  //
  //    <h2>Heading</h2><p>Text here</p>
  //
  // would become:
  //
  //    Heading\nHeadingText here
  //
  // So now we make sure it appears exactly once.
  // `String.prototype.match` returns null when nothing matches; fall back to
  // an empty array so a missing heading fails the assertion with a readable
  // message instead of throwing a TypeError on `.length`.
  const headingOccurrences = record.content.match(/Changing your primary email address/g) || []
  expect(headingOccurrences).toHaveLength(1)

  // The bug would also concatenate the text of the heading with the text of
  // the paragraph without any whitespace in between. The probe string must
  // match the fixture's actual wording ("...email address" immediately
  // followed by "You can change..."), otherwise this check passes vacuously.
  expect(record.content).not.toContain('email addressYou can change')
})
|
||||
})
|
||||
|
||||
Reference in New Issue
Block a user