1
0
mirror of synced 2025-12-23 03:44:00 -05:00
Files
docs/middleware/api/es-search.js
Peter Bengtsson f8f20605f4 search api with elasticsearch (#29053)
* indexing records into Elasticsearch

* @elastic/elasticsearch@8.2.0

* mv

* fix the code

* pipe

* lfs

* use sha

* change how index names work

* search api with Elasticsearch

* works

* refactor client

* better snowballing

* remove dbg

* wip

* highlights and fuzzy search

* improvements and upgrade

* also upgrade

* wip

* index more explicitly

* better validation

* change max size

* popularity by ordinal ranking instead

* playing

* fix package-lock

* rearranging

* catchMiddlewareError

* fix indexing

* match_phrase_prefix on wrapped search

* refactorings

* escape html in content

* wip

* search by URL

* update lock file

* v1 api

* inroads on jest testing the search

* valid yaml hopefully

* index fixtures into local elasticsearch

* specific versions to index only

* small fixes

* feedbacked

* fix tests

* use the npm script
2022-08-05 19:40:38 +00:00

276 lines
7.7 KiB
JavaScript

import { Client } from '@elastic/elasticsearch'
// Address of the Elasticsearch node to talk to. Falls back to a local
// instance when the ELASTICSEARCH_URL environment variable is unset or empty.
const ELASTICSEARCH_URL = process.env.ELASTICSEARCH_URL || 'http://localhost:9200'
// True for every NODE_ENV value other than exactly 'production'
// (including when NODE_ENV is not set at all).
const isDevMode = process.env.NODE_ENV !== 'production'
// Creates a fresh Elasticsearch client pointed at ELASTICSEARCH_URL.
// Every call constructs a new Client instance.
function getClient() {
  const clientOptions = { node: ELASTICSEARCH_URL }
  return new Client(clientOptions)
}
// Performs the actual Elasticsearch query for the search API.
//
// Takes a single options object:
//   indexName        name of the index to search
//   query            the search text (trimmed before building match clauses)
//   page, size       pagination; the offset is `size * (page - 1)`
//   debug            forwarded to getHits to include extra fields per hit
//   sort             'best' or 'relevance'; anything else throws
//   topics           not supported yet; any truthy value throws
//   includeTopics    also fetch and return each document's `topics`
//   usePrefixSearch  forwarded to getMatchQueries
//
// Resolves to `{ meta, hits }` where `meta` carries the total reported by
// Elasticsearch, timings, and the pagination values that were used.
export async function getSearchResults({
  indexName,
  query,
  page,
  size,
  debug,
  sort,
  topics,
  includeTopics,
  usePrefixSearch,
}) {
  const startedAt = new Date()
  const client = getClient()
  const offset = size * (page - 1)

  // All the text clauses are OR'ed together; each carries its own boost.
  const textQuery = {
    bool: {
      should: getMatchQueries(query.trim(), {
        usePrefixSearch,
        fuzzy: { minLength: 3, maxLength: 20 },
      }),
    },
  }

  if (topics) {
    throw new Error('Not implemented yet')
  }

  // Inclusion list of the stored fields we need back. Asking only for
  // these avoids transferring fields over the network that would never
  // be used here.
  const sourceFields = ['title', 'url', 'breadcrumbs', 'popularity']
  if (includeTopics) {
    sourceFields.push('topics')
  }

  const searchQuery = {
    index: indexName,
    highlight: getHighlightConfiguration(query),
    from: offset,
    size,
    _source_includes: sourceFields,
  }

  switch (sort) {
    case 'best':
      // Sorting by a function score requires wrapping the primary match
      // query in a bool operation. The text score is multiplied by the
      // document's `popularity` (treated as 1.0 when the field is missing).
      searchQuery.query = {
        bool: {
          must: [
            {
              function_score: {
                boost_mode: 'multiply',
                query: textQuery,
                boost: 1.0,
                functions: [
                  {
                    field_value_factor: {
                      field: 'popularity',
                      factor: 1.0,
                      missing: 1.0,
                    },
                  },
                ],
              },
            },
          ],
        },
      }
      break
    case 'relevance':
      // Plain relevance is the default ordering, so the text query is used
      // as-is. A secondary sort on `popularity` would rarely matter since
      // the floating point scores are almost always different.
      searchQuery.query = textQuery
      break
    default:
      throw new Error(`Unrecognized sort enum '${sort}'`)
  }

  const response = await client.search(searchQuery)
  const hits = getHits(response.hits.hits, { indexName, debug, includeTopics })
  const finishedAt = new Date()

  return {
    meta: {
      found: response.hits.total,
      took: {
        query_msec: response.took,
        total_msec: finishedAt.getTime() - startedAt.getTime(),
      },
      page,
      size,
    },
    hits,
  }
}
// Builds the list of Elasticsearch query clauses for a search term.
//
//   query            the (already trimmed) search text
//   usePrefixSearch  use prefix-style matching instead of whole-term matching
//   fuzzy            `{ minLength, maxLength }`; a fuzzy clause on the title
//                    is added only when the query length is strictly
//                    between the two
//
// Returns an array of clause objects, in the order they were added.
function getMatchQueries(query, { usePrefixSearch, fuzzy }) {
  // One clause per searchable field, boosted as [title, headings, content].
  // `textKey` is the property name the strategy expects the text under.
  const perField = (strategy, textKey, [titleBoost, headingsBoost, contentBoost]) => [
    { [strategy]: { title: { boost: titleBoost, [textKey]: query } } },
    { [strategy]: { headings: { boost: headingsBoost, [textKey]: query } } },
    { [strategy]: { content: { boost: contentBoost, [textKey]: query } } },
  ]

  const hasSpace = query.includes(' ')
  const isQuotedPhrase = hasSpace && query.startsWith('"') && query.endsWith('"')
  const clauses = []

  if (hasSpace) {
    // With multiple words, a phrase match is ranked above matches on the
    // individual words. Searching for 'foo bar' should rank
    //   "A common term is foo bar which is often used"
    // above
    //   "Some people use foo"
    //   "Bar is also a common term"
    // A phrase match is no silver bullet though: a title that contains
    // the words separately can still be preferable to a document whose
    // content merely contains the phrase. Hence the phrase boost on
    // 'content' is lower than the plain match boost on 'title'.
    const phraseStrategy = usePrefixSearch ? 'match_phrase_prefix' : 'match_phrase'
    clauses.push(...perField(phraseStrategy, 'query', [20.0, 6.0, 2.0]))
  }

  // A query like `"foo bar"` is phrase-only; everything else is also
  // searched word by word.
  if (!isQuotedPhrase) {
    if (usePrefixSearch && !hasSpace) {
      clauses.push(...perField('prefix', 'value', [10.0, 3.0, 0.5]))
    } else {
      clauses.push(...perField('match', 'query', [10.0, 3.0, 0.5]))
    }
  }

  // Fuzzy matching is skipped for very short and very long queries.
  // It might be worth limiting this to queries without spaces, since
  // something like "githob actions" will overwhelmingly match on the
  // "actions" part through the regular 'match' clauses anyway.
  const { minLength, maxLength } = fuzzy
  if (query.length > minLength && query.length < maxLength) {
    clauses.push({ fuzzy: { title: { value: query } } })
  }

  // A single token without any whitespace might be a URL or a pathname.
  const isSingleToken = query.split(/\s/g).length === 1
  if (isSingleToken) {
    if (query.startsWith('/')) {
      // E.g. `/en/site-policy/github-company-policies`, with any query
      // string or fragment dropped.
      const [beforeSearch] = query.split('?')
      const [pathOnly] = beforeSearch.split('#')
      clauses.push({ match: { url: pathOnly } })
    } else if (query.startsWith('http')) {
      // E.g. `https://docs.github.com/en/some/page?foo=bar` becomes a
      // search on `{url: '/en/some/page'}`.
      let pathname
      try {
        pathname = new URL(query).pathname
      } catch {
        // The `URL` constructor rejected it, so it's deemed *not* a
        // valid URL and no URL clause is added.
      }
      if (pathname) {
        clauses.push({ match: { url: pathname } })
      }
    }
  }

  return clauses
}
// Converts raw Elasticsearch hits into the objects returned by the API.
//
// Every result has `id`, `url`, `title`, `breadcrumbs` and `highlights`.
// `topics` is added when `includeTopics` is truthy. When `debug` is
// truthy, `score` and `popularity` are added too and, in dev mode only,
// an `es_url` pointing at the document on a local Elasticsearch.
function getHits(hits, { indexName, debug, includeTopics }) {
  return hits.map(({ _id, _score, _source, highlight }) => ({
    id: _id,
    url: _source.url,
    title: _source.title,
    breadcrumbs: _source.breadcrumbs || [],
    highlights: highlight || {},
    ...(includeTopics && { topics: _source.topics || [] }),
    ...(debug && {
      score: _score || 0.0,
      popularity: _source.popularity || 0.0,
      ...(isDevMode && { es_url: `http://localhost:9200/${indexName}/_doc/${_id}` }),
    }),
  }))
}
// Builds the `highlight` section of the search request.
//
// The numbers here follow from how the UI presents results: roughly three
// lines (at most) of content highlights under each title. If the search
// result UI ends up showing too many highlights, this is the place to
// tune it.
function getHighlightConfiguration(query) {
  const titleSettings = { fragment_size: 200, number_of_fragments: 1 }
  const headingsSettings = { fragment_size: 150, number_of_fragments: 2 }
  const contentSettings = {
    fragment_size: 150,
    number_of_fragments: 3,
    // Lets the preview display *something* from the content even when
    // nothing in it matched for highlighting.
    no_match_size: 150,
    highlight_query: {
      match_phrase_prefix: {
        content: { query },
      },
    },
  }

  return {
    pre_tags: ['<mark>'],
    post_tags: ['</mark>'],
    fields: {
      title: titleSettings,
      headings: headingsSettings,
      content: contentSettings,
    },
  }
}