1
0
mirror of synced 2025-12-23 11:54:18 -05:00
Files
docs/middleware/api/es-search.js

388 lines
12 KiB
JavaScript

import { Client } from '@elastic/elasticsearch'
// Address of the Elasticsearch node to talk to. Read once, at module load.
const { ELASTICSEARCH_URL } = process.env
// Anything other than an explicit NODE_ENV=production is treated as dev mode.
const isDevMode = !(process.env.NODE_ENV === 'production')
// The one Elasticsearch client shared by every caller in this module.
// Created lazily on the first successful `getClient()` call.
let sharedClient = null

// Returns the Elasticsearch client to use for searching.
//
// The client is created once and then reused. Building a new `Client`
// for every search means every search starts from scratch and nothing
// ever closes the previous instances, so a single long-lived instance
// is kept instead.
//
// Throws an `Error` (on every call) if `$ELASTICSEARCH_URL` is not set.
function getClient() {
  if (!ELASTICSEARCH_URL) {
    // If this was mistakenly not set, it will eventually fail
    // when you use the Client. But `new Client({node: undefined})`
    // won't throw. And the error you get when you actually do try
    // to use that Client instance is cryptic compared to this
    // plain and simple thrown error.
    throw new Error(`$ELASTICSEARCH_URL is not set`)
  }
  if (!sharedClient) {
    sharedClient = new Client({
      node: ELASTICSEARCH_URL,
      // The default is 30,000ms but we noticed that the median time is
      // about 100ms with some occasional searches taking multiple seconds.
      // If a query gets stuck, it's better to (relatively) quickly give up
      // and let the client's default retry behavior (`maxRetries`) try
      // again. So if it takes longer than this, we're banking on that it
      // was just bad luck and that it'll work if we simply try again.
      // See internal issue #2318.
      requestTimeout: 500,
    })
  }
  return sharedClient
}
// The workhorse: builds the Elasticsearch request for one page of search
// results, sends it, and reshapes the response.
//
// Options:
//   indexName        name of the index to search
//   query            the search string (trimmed before matching)
//   page, size       1-based page number and page size
//   debug            forwarded to `getHits`
//   sort             'best' or 'relevance'
//   topics           optional array of topic values to filter on
//   includeTopics    forwarded to `getHits`
//   usePrefixSearch  forwarded to `getMatchQueries`
//
// Resolves to `{ meta, hits }`.
// Throws if `topics` is set but isn't an array, or if `sort` is not one of
// the recognized values.
export async function getSearchResults({
  indexName,
  query,
  page,
  size,
  debug,
  sort,
  topics,
  includeTopics,
  usePrefixSearch,
}) {
  if (topics && !Array.isArray(topics)) {
    throw new Error("'topics' has to be an array")
  }

  const startedAt = Date.now()
  const client = getClient()
  const offset = size * (page - 1)

  const should = getMatchQueries(query.trim(), {
    usePrefixSearch,
    fuzzy: { minLength: 3, maxLength: 20 },
  })

  // Remember, 'topics' is a keyword field, meaning you need
  // to filter by "Webhooks", not "webhooks"
  const filter = (topics || []).map((topic) => ({ term: { topics: topic } }))

  // Only attach `filter` when there's something to filter on.
  const matchQuery = { bool: filter.length ? { should, filter } : { should } }

  // NOTE: No `_source_includes` here because of ES 7.11. Once we're on
  // ES >7.11 an inclusion list ('title', 'url', 'breadcrumbs', 'popularity',
  // plus 'topics' when `includeTopics` is set) could be added to save
  // network by not transmitting stored fields that aren't needed.
  const searchQuery = {
    highlight: getHighlightConfiguration(query),
    from: offset,
    size,
  }

  switch (sort) {
    case 'best':
      // To sort by a function score, you need to wrap the primary
      // match query into a bool operation.
      searchQuery.query = {
        bool: {
          must: [
            {
              function_score: {
                boost_mode: 'multiply',
                query: matchQuery,
                boost: 1.0,
                functions: [
                  {
                    field_value_factor: {
                      field: 'popularity',
                      factor: 1.0,
                      missing: 1.0,
                    },
                  },
                ],
              },
            },
          ],
        },
      }
      break
    case 'relevance':
      // This is the default ordering. A secondary sort on 'popularity' is
      // possible but would rarely matter because the floating point
      // scores are almost always different.
      searchQuery.query = matchQuery
      break
    default:
      throw new Error(`Unrecognized sort enum '${sort}'`)
  }

  // On ES <=7.11 the payload is under `.body`. (On ES >7.11 it would be
  // the result itself.)
  const { body: response } = await client.search({ index: indexName, body: searchQuery })
  const hits = getHits(response.hits.hits, { indexName, debug, includeTopics })

  return {
    meta: {
      found: response.hits.total,
      took: {
        query_msec: response.took,
        total_msec: Date.now() - startedAt,
      },
      page,
      size,
    },
    hits,
  }
}
// Builds the list of query clauses (meant to go into a `bool.should`)
// for the given, already trimmed, search string.
//
//   usePrefixSearch  use the prefix flavor of phrase/term matching
//   fuzzy            `{ minLength, maxLength }`; a fuzzy clause on the title
//                    is added only when the query length is strictly
//                    between the two.
//
// Returns an array of Elasticsearch query clause objects.
function getMatchQueries(query, { usePrefixSearch, fuzzy }) {
  const PHRASE = 10.0
  const AND = 2.5
  const EXPLICIT = 3.5
  // The number doesn't matter so much, just make sure it's boosted low,
  // because we only really want this to come into play if nothing else
  // matches. E.g. a search for `Acions` which wouldn't find anything
  // else anyway.
  const FUZZY = 0.1
  const FIELD_BOOSTS = { title: 4.0, headings: 3.0, content: 1.0 }

  // `[field, boost]` pairs: first the `*_explicit` variant of every field,
  // then the plain fields.
  const plainFields = Object.entries(FIELD_BOOSTS)
  const weightedFields = [
    ...plainFields.map(([name, boost]) => [`${name}_explicit`, EXPLICIT * boost]),
    ...plainFields,
  ]
  const boostOf = Object.fromEntries(weightedFields)

  // One clause of the given type per weighted field.
  const clausesFor = (type, textKey, factor = 1, extra = {}) =>
    weightedFields.map(([field, boost]) => ({
      [type]: { [field]: { boost: boost * factor, [textKey]: query, ...extra } },
    }))

  const clauses = []

  // Knowing that the input is multiple words lets us add phrase queries
  // and `match` queries with the `AND` operator (`OR` is the default).
  const isMultiWordQuery = /[ -]/.test(query)

  if (isMultiWordQuery) {
    // Prioritize a phrase match beyond a regular "match". That means a
    // search for 'foo bar' ranks
    //   "A common term is foo bar which is often used"
    // above
    //   "Some people use foo"
    //   "Bar is also a common term"
    //
    // But a phrase match isn't a panacea. Just because some document's
    // *content* contains "... foo bar ..." we might still prefer matches
    // on a title that contains the words *separately*. This is why a
    // phrase match on 'content' has a lesser boost than a 'match'
    // on 'title'.
    const phraseType = usePrefixSearch ? 'match_phrase_prefix' : 'match_phrase'
    const phraseFields = [
      'title_explicit',
      'title',
      'headings_explicit',
      'headings',
      'content',
      'content_explicit',
    ]
    for (const field of phraseFields) {
      clauses.push({ [phraseType]: { [field]: { boost: boostOf[field] * PHRASE, query } } })
    }
  }

  // Unless the query was something like `"foo bar"`, search on each word.
  const isQuotedPhrase = isMultiWordQuery && query.startsWith('"') && query.endsWith('"')
  if (!isQuotedPhrase) {
    if (isMultiWordQuery) {
      // All the words first (boosted), then any of the words.
      clauses.push(
        ...clausesFor('match', 'query', AND, { operator: 'AND' }),
        ...clausesFor('match', 'query')
      )
    } else if (usePrefixSearch) {
      clauses.push(...clausesFor('prefix', 'value'))
    } else {
      clauses.push(...clausesFor('match', 'query'))
    }
  }

  // Add a fuzzy query if the input is not too short or too long.
  // Might consider only enabling this when there's no space in the query
  // because something like "githob actions" will overwhelmingly
  // match on the "actions" part with the regular 'match' query.
  const { minLength, maxLength } = fuzzy
  if (minLength < query.length && query.length < maxLength) {
    clauses.push({ fuzzy: { title: { value: query, boost: FUZZY } } })
  }

  // If the query is a single token without any whitespace, it might be
  // a pathname or a full URL that someone pasted in.
  if (!/\s/.test(query)) {
    let urlPath
    if (query.startsWith('/')) {
      // E.g. `/en/site-policy/github-company-policies`
      // Anything from the first `?` or `#` onwards is dropped.
      urlPath = query.split(/[?#]/)[0]
    } else if (query.startsWith('http')) {
      // E.g. `https://docs.github.com/en/some/page?foo=bar`
      // will become a search on `{url: '/en/some/page'}`
      try {
        urlPath = new URL(query).pathname
      } catch {
        // The `URL` constructor rejected it, so we can deem it
        // *not* a valid URL and add nothing.
      }
    }
    if (urlPath) {
      clauses.push({ match: { url: urlPath } })
    }
  }

  return clauses
}
// Converts raw Elasticsearch hits into the objects we return to callers.
//
//   includeTopics  also copy the document's `topics` (or `[]` if missing)
//   debug          also include the hit's score and popularity and, in
//                  dev mode only, a localhost URL to the document itself
function getHits(hits, { indexName, debug, includeTopics }) {
  const toResult = ({ _id, _score, _source, highlight }) => {
    const { url, title, breadcrumbs, topics, popularity } = _source
    const entry = {
      id: _id,
      url,
      title,
      breadcrumbs,
      highlights: highlight || {},
    }
    if (includeTopics) {
      entry.topics = topics || []
    }
    if (!debug) {
      return entry
    }
    entry.score = _score || 0.0
    entry.popularity = popularity || 0.0
    if (isDevMode) {
      entry.es_url = `http://localhost:9200/${indexName}/_doc/${_id}`
    }
    return entry
  }
  return hits.map(toResult)
}
// Builds the `highlight` part of the search request.
//
// The numbers here follow from how the UI presents results. For example,
// we want at most about 3 lines of content highlights under each title.
// If the search result UI ends up showing too many highlights, this is
// the place to tune it.
function getHighlightConfiguration(query) {
  const title = { fragment_size: 200, number_of_fragments: 1 }
  const headings = { fragment_size: 150, number_of_fragments: 2 }
  const content = {
    fragment_size: 150,
    number_of_fragments: 3,
    // So we can display *something* for the preview even if there was
    // no highlight match at all within the content.
    no_match_size: 150,
    highlight_query: {
      match_phrase_prefix: { content: { query } },
    },
  }
  return {
    pre_tags: ['<mark>'],
    post_tags: ['</mark>'],
    fields: { title, headings, content },
  }
}