From 401d9696a6007f6842c1542c5490b7415cd33d87 Mon Sep 17 00:00:00 2001 From: Peter Bengtsson Date: Mon, 14 Nov 2022 13:34:21 +0100 Subject: [PATCH 1/2] use /api/search/v1 in Search.tsx (#31952) --- components/Search.tsx | 106 +++++++++++++++++++++++------------- data/ui.yml | 1 + middleware/api/search.js | 12 +++- tests/browser/browser.js | 4 +- tests/content/api-search.js | 30 ++++++++++ 5 files changed, 109 insertions(+), 44 deletions(-) diff --git a/components/Search.tsx b/components/Search.tsx index 13bd9e5aad..a314389df4 100644 --- a/components/Search.tsx +++ b/components/Search.tsx @@ -16,24 +16,33 @@ import { Link } from 'components/Link' import styles from './Search.module.scss' -// The search endpoint used prior to using /api/search/legacy was -// just /search, which used middleware/search.js. We are leaving that -// middleware in tact to allow folks that previously used the /search -// endpoint to continue doing so. But, we changed the endpoint used by -// the search input on docs.github.com to use the new /api/search/legacy -// endpoint. -// Eventually, we will deprecate the /search and /api/search/legacy -// endpoints and use the /api/search/v1 endpoint, which has -// a different response JSON format. -const SEARCH_API_ENDPOINT = '/api/search/legacy' +const SEARCH_API_ENDPOINT = '/api/search/v1' -type SearchResult = { +type Hit = { + id: string url: string - breadcrumbs: string title: string - content: string - score: number - popularity: number + breadcrumbs: string + highlights: { + title: Array + content: Array + } + score?: number + popularity?: number + es_url?: string +} + +// Note, the JSON will contain other keys too, but since we don't use +// them in this component, we don't need to specify them. +type Meta = { + found: { + value: number + } +} + +type Data = { + hits: Hit[] + meta: Meta } type Props = { @@ -73,10 +82,14 @@ export function Search({ language, version, query, + // In its current state, can't with confidence know if the user + // initiated a search because of the input debounce or if the + // user has finished typing. + autocomplete: 'true', })}` : null - const { data: results, error: searchError } = useSWR( + const { data, error: searchError } = useSWR( fetchURL, async (url: string) => { const response = await fetch(url) @@ -102,14 +115,14 @@ export function Search({ } ) - const [previousResults, setPreviousResults] = useState() + const [previousData, setPreviousData] = useState() useEffect(() => { - if (results) { - setPreviousResults(results) + if (data) { + setPreviousData(data) } else if (!query) { - setPreviousResults(undefined) + setPreviousData(undefined) } - }, [results, query]) + }, [data, query]) // The `isLoading` boolean will become false every time the useSWR hook // fires off a new XHR. So it toggles from false/true often. @@ -124,7 +137,7 @@ export function Search({ // mean saying "Loading..." is a lie! // That's why we combine them into a final one. We're basically doing // this to favor *NOT* saying "Loading...". - const isLoadingRaw = Boolean(query && !results && !searchError) + const isLoadingRaw = Boolean(query && !data && !searchError) const [isLoadingDebounced] = useDebounce(isLoadingRaw, 500) const isLoading = isLoadingRaw && isLoadingDebounced @@ -216,7 +229,7 @@ export function Search({ isHeaderSearch={isHeaderSearch} isMobileSearch={isMobileSearch} isLoading={isLoading} - results={previousResults} + results={previousData} closeSearch={closeSearch} debug={debug} query={query} @@ -342,7 +355,7 @@ function ShowSearchResults({ isHeaderSearch: boolean isMobileSearch: boolean isLoading: boolean - results: SearchResult[] | undefined + results: Data | undefined closeSearch: () => void debug: boolean query: string @@ -461,11 +474,33 @@ function ShowSearchResults({ {t('search_results_for')}: {query}

- {t('matches_displayed')}: {results.length === 0 ? t('no_results') : results.length} + {results.meta.found.value === 0 ? ( + t('no_results') + ) : ( + + {t('matches_found')}: {results.meta.found.value.toLocaleString()} + + )} + .{' '} + {results.meta.found.value > results.hits.length && ( + + {t('matches_displayed')}:{' '} + {results.meta.found.value === 0 ? t('no_results') : results.hits.length} + + )}

- {results.map(({ url, breadcrumbs, title, content, score, popularity }, index) => { + {results.hits.map((hit, index) => { + const { url, breadcrumbs, title, highlights, score, popularity } = hit + + const contentHTML = + highlights.content && highlights.content.length > 0 + ? highlights.content.join('
') + : '' + const titleHTML = + highlights.title && highlights.title.length > 0 ? highlights.title[0] : title + return (
- {/* Breadcrumbs in search records don't include the page title. These fields may contain elements that we need to render */} - {debug && ( + {debug && score !== undefined && popularity !== undefined && ( score: {score.toFixed(4)} popularity: {popularity.toFixed(4)} @@ -505,13 +533,13 @@ function ShowSearchResults({

{ validate: (v) => v >= 1 && v <= 10, }, { key: 'sort', default_: DEFAULT_SORT, validate: (v) => POSSIBLE_SORTS.includes(v) }, - { key: 'debug', default_: Boolean(process.env.NODE_ENV === 'development' || req.query.debug) }, + { key: 'autocomplete', default_: false, cast: toBoolean }, + { key: 'debug', default_: process.env.NODE_ENV === 'development', cast: toBoolean }, ] const search = {} @@ -221,11 +222,16 @@ const validationMiddleware = (req, res, next) => { return next() } +function toBoolean(value) { + if (value === 'true' || value === '1') return true + return false +} + router.get( '/v1', validationMiddleware, catchMiddlewareError(async function search(req, res) { - const { indexName, query, page, size, debug, sort } = req.search + const { indexName, query, autocomplete, page, size, debug, sort } = req.search // The getSearchResults() function is a mix of preparing the search, // sending & receiving it, and post-processing the response from the @@ -236,7 +242,7 @@ router.get( const tags = ['version:v1', `indexName:${indexName}`] const timed = statsd.asyncTimer(getSearchResults, 'api.search', tags) - const options = { indexName, query, page, size, debug, sort } + const options = { indexName, query, page, size, debug, sort, usePrefixSearch: autocomplete } try { const { meta, hits } = await timed(options) diff --git a/tests/browser/browser.js b/tests/browser/browser.js index 1e04bfd9ae..4fd81dfbe3 100644 --- a/tests/browser/browser.js +++ b/tests/browser/browser.js @@ -91,7 +91,7 @@ describe('browser search', () => { newPage.on('request', (interceptedRequest) => { if ( interceptedRequest.method() === 'GET' && - /api\/search\/legacy\?/i.test(interceptedRequest.url()) + /api\/search\/v1\?/i.test(interceptedRequest.url()) ) { const { searchParams } = new URL(interceptedRequest.url()) expect(searchParams.get('version')).toBe('dotcom') @@ -119,7 +119,7 @@ describe('browser search', () => { newPage.on('request', (interceptedRequest) => { if ( interceptedRequest.method() === 'GET' && - /api\/search\/legacy\?/i.test(interceptedRequest.url()) + /api\/search\/v1\?/i.test(interceptedRequest.url()) ) { const { searchParams } = new URL(interceptedRequest.url()) expect(searchParams.get('version')).toBe('ghae') diff --git a/tests/content/api-search.js b/tests/content/api-search.js index 73c60783a7..cfe8870016 100644 --- a/tests/content/api-search.js +++ b/tests/content/api-search.js @@ -83,6 +83,36 @@ describeIfElasticsearchURL('search v1 middleware', () => { expect(hit.es_url).toBeTruthy() }) + test('search with and without autocomplete on', async () => { + // *Without* autocomplete=true + { + const sp = new URLSearchParams() + sp.set('query', 'sill') + const res = await get('/api/search/v1?' + sp) + expect(res.statusCode).toBe(200) + const results = JSON.parse(res.text) + // Fixtures contains no word called 'sill'. It does contain the term + // 'silly' which, in English, becomes 'silli` when stemmed. + // Because we don't use `&autocomplete=true` this time, we expect + // to find nothing. + expect(results.meta.found.value).toBe(0) + } + + // *With* autocomplete=true + { + const sp = new URLSearchParams() + sp.set('query', 'sill') + sp.set('autocomplete', 'true') + const res = await get('/api/search/v1?' + sp) + expect(res.statusCode).toBe(200) + const results = JSON.parse(res.text) + expect(results.meta.found.value).toBeGreaterThanOrEqual(1) + const hit = results.hits[0] + const contentHighlights = hit.highlights.content + expect(contentHighlights[0]).toMatch('silly') + } + }) + test('find nothing', async () => { const sp = new URLSearchParams() sp.set('query', 'xojixjoiwejhfoiuwehjfioweufhj') From 2f665145716dbbb6174b5ef8fa72513ac38de99d Mon Sep 17 00:00:00 2001 From: Peter Bengtsson Date: Mon, 14 Nov 2022 13:46:36 +0100 Subject: [PATCH 2/2] make highlighting fields configurable (#32328) --- middleware/api/es-search.js | 76 ++++++++++++------- middleware/api/search.js | 32 +++++++- tests/content/api-search.js | 44 +++++++++++ .../github-docs-dotcom-en-records.json | 6 +- 4 files changed, 126 insertions(+), 32 deletions(-) diff --git a/middleware/api/es-search.js b/middleware/api/es-search.js index c6129a97ce..928b9ac7cb 100644 --- a/middleware/api/es-search.js +++ b/middleware/api/es-search.js @@ -1,5 +1,8 @@ import { Client } from '@elastic/elasticsearch' +export const POSSIBLE_HIGHLIGHT_FIELDS = ['title', 'content', 'headings'] +export const DEFAULT_HIGHLIGHT_FIELDS = ['title', 'content'] + const ELASTICSEARCH_URL = process.env.ELASTICSEARCH_URL const isDevMode = process.env.NODE_ENV !== 'production' @@ -37,6 +40,7 @@ export async function getSearchResults({ topics, includeTopics, usePrefixSearch, + highlights, }) { if (topics && !Array.isArray(topics)) { throw new Error("'topics' has to be an array") @@ -72,7 +76,8 @@ export async function getSearchResults({ matchQuery.bool.filter = topicsFilter } - const highlight = getHighlightConfiguration(query) + const highlightFields = highlights || DEFAULT_HIGHLIGHT_FIELDS + const highlight = getHighlightConfiguration(query, highlightFields) const searchQuery = { highlight, @@ -141,7 +146,7 @@ export async function getSearchResults({ // const hitsAll = result.hits // ES >7.11 const hitsAll = result.body // ES <=7.11 - const hits = getHits(hitsAll.hits.hits, { indexName, debug, includeTopics }) + const hits = getHits(hitsAll.hits.hits, { indexName, debug, includeTopics, highlightFields }) const t1 = new Date() const meta = { @@ -345,14 +350,26 @@ function getMatchQueries(query, { usePrefixSearch, fuzzy }) { return matchQueries } -function getHits(hits, { indexName, debug, includeTopics }) { +function getHits(hits, { indexName, debug, includeTopics, highlightFields }) { return hits.map((hit) => { + // Return `hit.highlights[...]` based on the highlight fields requested. + // So if you searched with `&highlights=headings&highlights=content` + // this will become: + // { + // content: [...], + // headings: [...] + // } + // even if there was a match on 'title'. + const hitHighlights = Object.fromEntries( + highlightFields.map((key) => [key, (hit.highlight && hit.highlight[key]) || []]) + ) + const result = { id: hit._id, url: hit._source.url, title: hit._source.title, breadcrumbs: hit._source.breadcrumbs, - highlights: hit.highlight || {}, + highlights: hitHighlights, } if (includeTopics) { result.topics = hit._source.topics || [] @@ -374,31 +391,38 @@ function getHits(hits, { indexName, debug, includeTopics }) { // of highlights of content under each title. If we feel it shows too // many highlights in the search result UI, we can come back here // and change it to something more appropriate. -function getHighlightConfiguration(query) { - return { - pre_tags: [''], - post_tags: [''], - fields: { - title: { - fragment_size: 200, - number_of_fragments: 1, - }, - headings: { fragment_size: 150, number_of_fragments: 2 }, - // The 'no_match_size' is so we can display *something* for the - // preview if there was no highlight match at all within the content. - content: { - fragment_size: 150, - number_of_fragments: 3, - no_match_size: 150, +function getHighlightConfiguration(query, highlights) { + const fields = {} + if (highlights.includes('title')) { + fields.title = { + fragment_size: 200, + number_of_fragments: 1, + } + } + if (highlights.includes('headings')) { + fields.headings = { fragment_size: 150, number_of_fragments: 2 } + } + if (highlights.includes('content')) { + // The 'no_match_size' is so we can display *something* for the + // preview if there was no highlight match at all within the content. + fields.content = { + fragment_size: 150, + number_of_fragments: 3, + no_match_size: 150, - highlight_query: { - match_phrase_prefix: { - content: { - query, - }, + highlight_query: { + match_phrase_prefix: { + content: { + query, }, }, }, - }, + } + } + + return { + pre_tags: [''], + post_tags: [''], + fields, } } diff --git a/middleware/api/search.js b/middleware/api/search.js index 716dee9e06..c0cfdb2c21 100644 --- a/middleware/api/search.js +++ b/middleware/api/search.js @@ -7,7 +7,11 @@ import { allVersions } from '../../lib/all-versions.js' import statsd from '../../lib/statsd.js' import { defaultCacheControl } from '../cache-control.js' import catchMiddlewareError from '../catch-middleware-error.js' -import { getSearchResults } from './es-search.js' +import { + getSearchResults, + POSSIBLE_HIGHLIGHT_FIELDS, + DEFAULT_HIGHLIGHT_FIELDS, +} from './es-search.js' // Used by the legacy search const versions = new Set(Object.values(searchVersions)) @@ -182,6 +186,19 @@ const validationMiddleware = (req, res, next) => { validate: (v) => v >= 1 && v <= 10, }, { key: 'sort', default_: DEFAULT_SORT, validate: (v) => POSSIBLE_SORTS.includes(v) }, + { + key: 'highlights', + default_: DEFAULT_HIGHLIGHT_FIELDS, + cast: (v) => (Array.isArray(v) ? v : [v]), + validate: (v) => { + for (const highlight of v) { + if (!POSSIBLE_HIGHLIGHT_FIELDS.includes(highlight)) { + throw new ValidationError(`highlight value '${highlight}' is not valid`) + } + } + return true + }, + }, { key: 'autocomplete', default_: false, cast: toBoolean }, { key: 'debug', default_: process.env.NODE_ENV === 'development', cast: toBoolean }, ] @@ -231,7 +248,7 @@ router.get( '/v1', validationMiddleware, catchMiddlewareError(async function search(req, res) { - const { indexName, query, autocomplete, page, size, debug, sort } = req.search + const { indexName, query, autocomplete, page, size, debug, sort, highlights } = req.search // The getSearchResults() function is a mix of preparing the search, // sending & receiving it, and post-processing the response from the @@ -242,7 +259,16 @@ router.get( const tags = ['version:v1', `indexName:${indexName}`] const timed = statsd.asyncTimer(getSearchResults, 'api.search', tags) - const options = { indexName, query, page, size, debug, sort, usePrefixSearch: autocomplete } + const options = { + indexName, + query, + page, + size, + debug, + sort, + highlights, + usePrefixSearch: autocomplete, + } try { const { meta, hits } = await timed(options) diff --git a/tests/content/api-search.js b/tests/content/api-search.js index cfe8870016..1b12e40a70 100644 --- a/tests/content/api-search.js +++ b/tests/content/api-search.js @@ -57,8 +57,11 @@ describeIfElasticsearchURL('search v1 middleware', () => { expect(hit.url).toBe('/en/foo') expect(hit.title).toBe('Foo') expect(hit.breadcrumbs).toBe('fooing') + // By default, 'title' and 'content' is included in highlights, + // but not 'headings' expect(hit.highlights.title[0]).toBe('Foo') expect(hit.highlights.content[0]).toMatch('foo') + expect(hit.highlights.headings).toBeUndefined() // Check that it can be cached at the CDN expect(res.headers['set-cookie']).toBeUndefined() @@ -123,6 +126,38 @@ describeIfElasticsearchURL('search v1 middleware', () => { expect(results.meta.found.value).toBe(0) }) + test('configurable highlights', async () => { + const sp = new URLSearchParams() + sp.set('query', 'introduction heading') + sp.append('highlights', 'headings') + sp.append('highlights', 'content') + const res = await get('/api/search/v1?' + sp) + expect(res.statusCode).toBe(200) + const results = JSON.parse(res.text) + expect(results.meta.found.value).toBeGreaterThanOrEqual(1) + for (const hit of results.hits) { + expect(hit.highlights.title).toBeFalsy() + expect(hit.highlights.headings).toBeTruthy() + expect(hit.highlights.content).toBeTruthy() + } + }) + + test('highlights keys matches highlights configuration', async () => { + const sp = new URLSearchParams() + // This will match because it's in the 'content' but not in 'headings' + sp.set('query', 'Fact of life') + sp.set('highlights', 'headings') + const res = await get('/api/search/v1?' + sp) + expect(res.statusCode).toBe(200) + const results = JSON.parse(res.text) + expect(results.meta.found.value).toBeGreaterThanOrEqual(1) + for (const hit of results.hits) { + expect(hit.highlights.headings).toBeTruthy() + expect(hit.highlights.title).toBeFalsy() + expect(hit.highlights.content).toBeFalsy() + } + }) + test('version can be aliased', async () => { const sp = new URLSearchParams() sp.set('query', 'foo') @@ -199,6 +234,15 @@ describeIfElasticsearchURL('search v1 middleware', () => { expect(res.statusCode).toBe(400) expect(JSON.parse(res.text).error).toMatch('sort') } + // unrecognized highlights + { + const sp = new URLSearchParams() + sp.set('query', 'test') + sp.set('highlights', 'neverheardof') + const res = await get('/api/search/v1?' + sp) + expect(res.statusCode).toBe(400) + expect(JSON.parse(res.text).error).toMatch('neverheardof') + } }) test('breadcrumbless records should always return a string', async () => { diff --git a/tests/content/fixtures/search-indexes/github-docs-dotcom-en-records.json b/tests/content/fixtures/search-indexes/github-docs-dotcom-en-records.json index 95aee65198..b7e4f4d245 100644 --- a/tests/content/fixtures/search-indexes/github-docs-dotcom-en-records.json +++ b/tests/content/fixtures/search-indexes/github-docs-dotcom-en-records.json @@ -3,7 +3,7 @@ "objectID": "/en/foo", "breadcrumbs": "fooing", "title": "Foo", - "headings": "", + "headings": "Heading 1", "content": "This is a fixture with the silly foo word.", "topics": ["Test", "Fixture"], "popularity": 0.5 @@ -12,7 +12,7 @@ "objectID": "/en/bar", "breadcrumbs": "baring", "title": "Bar", - "headings": "", + "headings": "Sub-heading This Sub-heading That", "content": "Can't have foo if you don't also have bar.", "topics": ["Test", "Fixture", "Get started"], "popularity": 0.6 @@ -21,7 +21,7 @@ "objectID": "/en/bar", "breadcrumbs": "", "title": "Breadcrumbless", - "headings": "", + "headings": "Introduction", "content": "Not all pages have a breadcrumb. Fact of life.", "topics": [], "popularity": 0.1