446 lines
14 KiB
JavaScript
446 lines
14 KiB
JavaScript
import { Client } from '@elastic/elasticsearch'
|
|
|
|
// Every field name that may be requested for highlighting.
export const POSSIBLE_HIGHLIGHT_FIELDS = ['title', 'content', 'headings']
// The fields that get highlighted when the caller does not ask for any
// specific ones.
export const DEFAULT_HIGHLIGHT_FIELDS = ['title', 'content']

// Address of the Elasticsearch node, taken from the environment.
// It may be undefined here; `getClient()` is what complains about that.
const { ELASTICSEARCH_URL } = process.env

// Anything that isn't explicitly 'production' counts as development.
const isDevMode = process.env.NODE_ENV !== 'production'
|
|
|
|
// Create an Elasticsearch client for the node in `ELASTICSEARCH_URL`.
// Throws a plain Error if that URL is not configured.
function getClient() {
  if (ELASTICSEARCH_URL) {
    const options = {
      node: ELASTICSEARCH_URL,
      // The client's default timeout is 30,000ms. The median search takes
      // roughly 100-150ms, with the occasional one taking multiple seconds.
      // The default `maxRetries` is 5, which is a sensible number, so when
      // a query gets stuck it's better to give up (relatively) quickly and
      // retry, betting that it was bad luck and the next attempt works.
      // See internal issue #2318.
      requestTimeout: 1000,
    }
    return new Client(options)
  }

  // `new Client({node: undefined})` would not throw by itself; the failure
  // would only surface later, when the client is used, and with a cryptic
  // message. Failing here with a plain and simple error is much clearer.
  throw new Error(`$ELASTICSEARCH_URL is not set`)
}
|
|
|
|
// The function that actually runs the Elasticsearch query.
//
// Takes a single options object:
//   indexName        name of the index to search
//   query            the search term (trimmed before the match clauses
//                    are built)
//   page, size       pagination; `from` is computed as size * (page - 1)
//   debug            passed on to `getHits`
//   sort             'best' or 'relevance'; anything else throws
//   topics           optional array of topics to filter on
//   includeTopics    passed on to `getHits`
//   usePrefixSearch  passed on to `getMatchQueries`
//   highlights       optional list of fields to highlight; falls back to
//                    DEFAULT_HIGHLIGHT_FIELDS
//   include          optional array of strings, passed on to `getHits`
//
// Resolves to `{ meta, hits }`.
export async function getSearchResults({
  indexName,
  query,
  page,
  size,
  debug,
  sort,
  topics,
  includeTopics,
  usePrefixSearch,
  highlights,
  include,
}) {
  if (topics && !Array.isArray(topics)) {
    throw new Error("'topics' has to be an array")
  }
  if (include) {
    if (!Array.isArray(include)) {
      throw new Error("'include' has to be an array")
    }
    if (include.some((value) => typeof value !== 'string')) {
      throw new Error("Every entry in the 'include' must be a string")
    }
  }

  const startedAt = Date.now()
  const client = getClient()
  const from = size * (page - 1)

  const matchQuery = {
    bool: {
      should: getMatchQueries(query.trim(), {
        usePrefixSearch,
        fuzzy: { minLength: 3, maxLength: 20 },
      }),
    },
  }

  // Remember, 'topics' is a keyword field, meaning you need
  // to filter by "Webhooks", not "webhooks".
  if (topics && topics.length) {
    matchQuery.bool.filter = topics.map((topic) => ({ term: { topics: topic } }))
  }

  const highlightFields = highlights || DEFAULT_HIGHLIGHT_FIELDS

  // NOTE: Left out because of ES 7.11. Once we're on ES >7.11 this request
  // can get a `_source_includes` inclusion list ('title', 'url',
  // 'breadcrumbs', 'popularity', plus 'topics' when `includeTopics` is
  // set). We know exactly which source fields are needed, so listing them
  // would avoid transmitting stored fields that are never used.
  const searchQuery = {
    highlight: getHighlightConfiguration(query, highlightFields),
    from,
    size,
  }

  switch (sort) {
    case 'best':
      // To sort by a function score, the primary match query has to be
      // wrapped in a bool operation.
      searchQuery.query = {
        bool: {
          must: [
            {
              function_score: {
                boost_mode: 'multiply',
                query: matchQuery,
                boost: 1.0,
                functions: [
                  {
                    field_value_factor: {
                      field: 'popularity',
                      // modifier: 'log1p',
                      factor: 1.0,
                      // missing: 0.0001,
                      missing: 1.0,
                    },
                  },
                ],
              },
            },
          ],
        },
      }
      break
    case 'relevance':
      // Plain relevance is Elasticsearch's default ordering. A secondary
      // sort on 'popularity' would very rarely matter because the
      // floating point scores are almost always different.
      searchQuery.query = matchQuery
      break
    default:
      throw new Error(`Unrecognized sort enum '${sort}'`)
  }

  const result = await client.search({ index: indexName, body: searchQuery })

  // On ES >7.11 this becomes `result.hits`; on ES <=7.11 it's `result.body`.
  const hitsAll = result.body
  const hits = getHits(hitsAll.hits.hits, {
    indexName,
    debug,
    includeTopics,
    highlightFields,
    include,
  })
  const finishedAt = Date.now()

  return {
    meta: {
      found: hitsAll.hits.total,
      took: {
        query_msec: hitsAll.took,
        total_msec: finishedAt - startedAt,
      },
      page,
      size,
    },
    hits,
  }
}
|
|
|
|
// Build the list of clauses that the caller puts into `bool.should`.
//
//   query            the (already trimmed) search term
//   usePrefixSearch  when truthy, use the `*_prefix` flavors of the phrase
//                    and match queries
//   fuzzy            `{ minLength, maxLength }`; a fuzzy clause on 'title'
//                    is only added when the query length is strictly
//                    between these two numbers
//
// Returns an array of Elasticsearch query clauses.
function getMatchQueries(query, { usePrefixSearch, fuzzy }) {
  const BOOST_PHRASE = 10.0
  const BOOST_TITLE = 4.0
  const BOOST_HEADINGS = 3.0
  const BOOST_CONTENT = 1.0
  const BOOST_AND = 2.5
  const BOOST_EXPLICIT = 3.5
  // The exact number doesn't matter so much, just make sure it's
  // boosted low, because we only really want this to come into
  // play if nothing else matches. E.g. a search for `Acions`
  // which wouldn't find anything else anyway.
  const BOOST_FUZZY = 0.1

  const matchQueries = []

  // Any kind of whitespace separates words, not only the plain space.
  // E.g. a pasted `foo\tbar` is two words. The same test is reused further
  // down to decide whether the query is a single word, so the two
  // decisions can never disagree.
  const hasWhitespace = /\s/.test(query)

  // If the query input is multiple words, it's good to know because you can
  // make the query do `match_phrase` and you can make `match` query
  // with the `AND` operator (`OR` is the default).
  const isMultiWordQuery = hasWhitespace || query.includes('-')

  if (isMultiWordQuery) {
    // If the query contains multiple words, prioritize a "match phrase"
    // query beyond a regular "match" query.
    // Basically, that means if you search for 'foo bar' we'd rather
    // rank:
    //   "A common term is foo bar which is often used"
    // above:
    //   "Some people use foo"
    //   "Bar is also a common term"
    //
    // So that, when all are matched you get this rank:
    //   1. "A common term is foo bar which is often used"
    //   2. "Some people use foo"
    //   3. "Bar is also a common term"
    //
    // But note, a "match phrase" isn't the holy panacea of matches.
    // In particular, just because there exists a document whose *content*
    // contains the phrase "... foo bar ..." we might still prefer the
    // matches on title that contains the words *separately*. This
    // is why a 'match_phrase' on 'content' has a lesser boost
    // than a 'match' on 'title'.
    const matchPhraseStrategy = usePrefixSearch ? 'match_phrase_prefix' : 'match_phrase'
    matchQueries.push(
      {
        [matchPhraseStrategy]: {
          title_explicit: { boost: BOOST_EXPLICIT * BOOST_PHRASE * BOOST_TITLE, query },
        },
      },
      { [matchPhraseStrategy]: { title: { boost: BOOST_PHRASE * BOOST_TITLE, query } } },
      {
        [matchPhraseStrategy]: {
          headings_explicit: { boost: BOOST_EXPLICIT * BOOST_PHRASE * BOOST_HEADINGS, query },
        },
      },
      { [matchPhraseStrategy]: { headings: { boost: BOOST_PHRASE * BOOST_HEADINGS, query } } }
    )
    // If the content is short, it is given a disproportionate advantage
    // in search ranking. For example, our category and map-topic pages
    // often include a list of other document titles but because it's so
    // short it thinks that content is really relevant. This only applies
    // when you use `match_phrase_prefix` which first makes a search on
    // all preceding terms and then manually appends matches on the last word.
    // See https://www.elastic.co/guide/en/elasticsearch/reference/7.17/query-dsl-match-query-phrase-prefix.html#match-phrase-prefix-query-notes
    if (!usePrefixSearch) {
      matchQueries.push(
        { [matchPhraseStrategy]: { content: { boost: BOOST_PHRASE, query } } },
        {
          [matchPhraseStrategy]: {
            content_explicit: { boost: BOOST_EXPLICIT * BOOST_PHRASE, query },
          },
        }
      )
    }
  }

  // Unless the query was something like `"foo bar"` search on each word
  const isQuoted = query.startsWith('"') && query.endsWith('"')
  if (!(isMultiWordQuery && isQuoted)) {
    const matchStrategy = usePrefixSearch ? 'match_bool_prefix' : 'match'
    if (isMultiWordQuery) {
      matchQueries.push(
        {
          [matchStrategy]: {
            title_explicit: {
              boost: BOOST_EXPLICIT * BOOST_TITLE * BOOST_AND,
              query,
              operator: 'AND',
            },
          },
        },
        {
          [matchStrategy]: {
            headings_explicit: {
              boost: BOOST_EXPLICIT * BOOST_HEADINGS * BOOST_AND,
              query,
              operator: 'AND',
            },
          },
        },
        {
          [matchStrategy]: {
            content_explicit: {
              boost: BOOST_EXPLICIT * BOOST_CONTENT * BOOST_AND,
              query,
              operator: 'AND',
            },
          },
        },
        {
          [matchStrategy]: {
            title: { boost: BOOST_TITLE * BOOST_AND, query, operator: 'AND' },
          },
        },
        {
          [matchStrategy]: {
            headings: { boost: BOOST_HEADINGS * BOOST_AND, query, operator: 'AND' },
          },
        },
        {
          [matchStrategy]: {
            content: { boost: BOOST_CONTENT * BOOST_AND, query, operator: 'AND' },
          },
        }
      )
    }
    matchQueries.push(
      { [matchStrategy]: { title_explicit: { boost: BOOST_EXPLICIT * BOOST_TITLE, query } } },
      {
        [matchStrategy]: {
          headings_explicit: { boost: BOOST_EXPLICIT * BOOST_HEADINGS, query },
        },
      },
      {
        [matchStrategy]: { content_explicit: { boost: BOOST_EXPLICIT * BOOST_CONTENT, query } },
      },
      { [matchStrategy]: { title: { boost: BOOST_TITLE, query } } },
      { [matchStrategy]: { headings: { boost: BOOST_HEADINGS, query } } },
      { [matchStrategy]: { content: { boost: BOOST_CONTENT, query } } }
    )
  }

  // Add a fuzzy query if it's not too short or too long. Both bounds are
  // exclusive: a query of exactly `minLength` or `maxLength` characters
  // does not get one.
  // Might consider only enabling this when there's no space in the query
  // because something like "githob actions" will overwhelmingly
  // match on the "actions" part with the regular 'match' query.
  if (query.length > fuzzy.minLength && query.length < fuzzy.maxLength) {
    matchQueries.push({
      fuzzy: {
        title: { value: query, boost: BOOST_FUZZY },
      },
    })
  }

  // If the query is just a single no-space word...
  if (!hasWhitespace) {
    // E.g. someone searched for `/en/site-policy/github-company-policies`
    if (query.startsWith('/')) {
      matchQueries.push({
        match: { url: query.split('?')[0].split('#')[0] },
      })
    } else if (query.startsWith('http')) {
      // E.g. `https://docs.github.com/en/some/page?foo=bar`
      // will become a search on `{url: '/en/some/page'}`
      let pathname
      try {
        pathname = new URL(query).pathname
      } catch {
        // If it can't be parsed by the `URL` constructor we can deem it
        // *not* a valid URL and simply skip the URL clause.
      }
      if (pathname) {
        matchQueries.push({
          match: { url: pathname },
        })
      }
    }
  }

  return matchQueries
}
|
|
|
|
// Convert the raw Elasticsearch hits into the objects we hand back.
//
// Every returned object has `id`, `url`, `title`, `breadcrumbs` and
// `highlights`. On top of that:
//   - `topics` is added when `includeTopics` is truthy
//   - `score` and `popularity` are added when `debug` is truthy (and
//     `es_url` too, in dev mode)
//   - each name listed in `include` is copied over from the hit's source
function getHits(hits, { indexName, debug, includeTopics, highlightFields, include }) {
  const extraFields = include || []

  // The `highlights` object is keyed by the highlight fields that were
  // *requested*, not by whatever happened to match.
  // So if you searched with `&highlights=headings&highlights=content`
  // it becomes:
  //   {
  //     content: [...],
  //     headings: [...]
  //   }
  // even if there was a match on 'title'.
  const pickHighlights = (hit) => {
    const pairs = highlightFields.map((key) => {
      const fragments = hit.highlight && hit.highlight[key]
      return [key, fragments || []]
    })
    return Object.fromEntries(pairs)
  }

  const toResult = (hit) => {
    const source = hit._source
    const result = {
      id: hit._id,
      url: source.url,
      title: source.title,
      breadcrumbs: source.breadcrumbs,
      highlights: pickHighlights(hit),
    }

    if (includeTopics) {
      result.topics = source.topics || []
    }

    if (debug) {
      result.score = hit._score || 0.0
      result.popularity = source.popularity || 0.0
      if (isDevMode) {
        result.es_url = `http://localhost:9200/${indexName}/_doc/${hit._id}`
      }
    }

    for (const field of extraFields) {
      result[field] = hit._source[field]
    }

    return result
  }

  return hits.map((hit) => toResult(hit))
}
|
|
|
|
// Build the `highlight` part of the Elasticsearch request for the
// requested highlight fields.
//
// How it's configured depends on how we use the content in the UI. For
// example, we feel we need about 3 lines (max) of highlights of content
// under each title. If the search result UI ends up showing too many
// highlights, this is the place to change it to something more appropriate.
function getHighlightConfiguration(query, highlights) {
  // Listed in the order the fields appear in the resulting configuration.
  const perField = {
    title: {
      fragment_size: 200,
      number_of_fragments: 1,
    },
    headings: { fragment_size: 150, number_of_fragments: 2 },
    content: {
      fragment_size: 150,
      number_of_fragments: 1,
      // The 'no_match_size' is there so we can display *something* for the
      // preview if there was no highlight match at all within the content.
      no_match_size: 150,

      highlight_query: {
        match_phrase_prefix: {
          content: {
            query,
          },
        },
      },
    },
  }

  const requested = Object.entries(perField).filter(([name]) => highlights.includes(name))

  return {
    pre_tags: ['<mark>'],
    post_tags: ['</mark>'],
    fields: Object.fromEntries(requested),
  }
}
|