1
0
mirror of synced 2025-12-23 11:54:18 -05:00
Files
docs/middleware/api/search.js
2023-02-15 15:48:38 +00:00

356 lines
12 KiB
JavaScript

import express from 'express'
import searchVersions from '../../lib/search/versions.js'
import FailBot from '../../lib/failbot.js'
import languages from '../../lib/languages.js'
import { allVersions } from '../../lib/all-versions.js'
import statsd from '../../lib/statsd.js'
import { searchCacheControl } from '../cache-control.js'
import catchMiddlewareError from '../catch-middleware-error.js'
import { setFastlySurrogateKey } from '../set-fastly-surrogate-key.js'
import {
getSearchResults,
POSSIBLE_HIGHLIGHT_FIELDS,
DEFAULT_HIGHLIGHT_FIELDS,
} from './es-search.js'
// Used by the legacy search: the set of short version names (the values
// of `searchVersions`) that `/legacy` accepts in `?version=...`.
const versions = new Set(Object.values(searchVersions))
// Also used by the legacy search: the set of language codes (the keys of
// `languages`) that `/legacy` accepts in `?language=...`.
const languagesSet = new Set(Object.keys(languages))
// Every search endpoint in this file is registered on this router, which
// is also the module's default export.
const router = express.Router()
// Defaults and upper bounds applied by the validation middleware to the
// `size`, `page` and `sort` query string parameters.
const DEFAULT_SIZE = 10
const MAX_SIZE = 50 // How much you return has a strong impact on performance
const DEFAULT_PAGE = 1
const POSSIBLE_SORTS = ['best', 'relevance']
// The first entry of POSSIBLE_SORTS is what you get when no `sort` is given.
const DEFAULT_SORT = POSSIBLE_SORTS[0]
const MAX_PAGE = 10
// There are some fields you can optionally include in the output.
// These are fields available in Elasticsearch that we don't include in
// the output by default. E.g. `...&include=intro`
// Requesting anything that is not in this list will result in
// a 400 Bad Request.
const V1_ADDITIONAL_INCLUDES = ['intro', 'headings']
// Lookup table from every accepted spelling of a version to the name used
// in the Elasticsearch index.
// A client that sends `...&version=3.5` actually means `ghes-3.5`; that is
// a leftover from how the old search formatted versions. Once all clients
// have been cleaned up this aliasing can go away.
const versionAliases = {}
for (const info of Object.values(allVersions)) {
  const { miscVersionName } = info
  // Numbered releases are addressed by their release number only. Everything
  // else is addressed by its version name or by the index name itself.
  const aliases = info.hasNumberedReleases
    ? [info.currentRelease]
    : [info.version, miscVersionName]
  for (const alias of aliases) {
    versionAliases[alias] = miscVersionName
  }
}
// Maps the short name of every `enterprise-server@...` entry in
// `searchVersions` (e.g. `3.5`) to its `ghes-` prefixed form (`ghes-3.5`).
// Entries for any other product are left out.
const legacyEnterpriseServerVersions = Object.fromEntries(
  Object.entries(searchVersions).flatMap(([fullName, shortName]) =>
    fullName.startsWith('enterprise-server@') ? [[shortName, `ghes-${shortName}`]] : []
  )
)
/**
 * Returns the prefix that goes in front of every Elasticsearch index name.
 *
 * The same logic is mirrored in the scripts that run before the tests;
 * see the `index-test-fixtures` npm script in particular, which is
 * expected to run before CI and before local jest testing.
 *
 * Tests deliberately use a differently prefixed index name than regular
 * operation. That way an engineer can do local manual testing *and*
 * automated testing without having to re-index different content
 * (e.g. fixtures vs. real content) under the same index name.
 *
 * @returns {string} `'tests_'` when NODE_ENV is `test`, otherwise `''`.
 */
function getIndexPrefix() {
  const isTestRun = process.env.NODE_ENV === 'test'
  return isTestRun ? 'tests_' : ''
}
/**
 * Translates a legacy version name into the name used for the index.
 *
 * The old search used `?version=3.5&...`. That was deemed ambiguous, so
 * the index is named with `ghes-3.5` instead.
 *
 * @param {string} version - The version as sent by the client.
 * @returns {string} The mapped name from `legacyEnterpriseServerVersions`
 *   when there is one, otherwise `version` unchanged.
 */
function convertLegacyVersionName(version) {
  const modernName = legacyEnterpriseServerVersions[version]
  if (modernName) return modernName
  return version
}
// GET /legacy
//
// The legacy search endpoint. Query string parameters:
//   query    - the search term; when empty the response is `[]`
//   version  - must be one of the values in `versions`, else 400
//   language - must be one of the values in `languagesSet`, else 400
//   filters  - optional, one or more topics to search within
//   limit    - optional number of results, defaults to 10, capped at 100
//
// Responds with a plain JSON array of results (not an object).
router.get(
  '/legacy',
  catchMiddlewareError(async function legacySearch(req, res) {
    const { query, version, language, filters, limit: limit_ } = req.query

    // `filters` is a single value when given once and an array when
    // repeated. Either way it ends up as the list of topics.
    const topics = []
    if (filters) {
      if (Array.isArray(filters)) {
        topics.push(...filters)
      } else {
        topics.push(filters)
      }
    }

    // Anything that is not a positive integer (missing, junk, zero or
    // negative) falls back to 10, so that a negative number is never
    // forwarded to the search as the result size.
    const parsedLimit = parseInt(limit_, 10)
    const limit = Math.min(parsedLimit > 0 ? parsedLimit : 10, 100)

    if (!versions.has(version)) {
      return res.status(400).json({ error: 'Unrecognized version' })
    }
    if (!languagesSet.has(language)) {
      return res.status(400).json({ error: 'Unrecognized language' })
    }
    if (!query) {
      return res.status(200).json([])
    }

    const indexName = `${getIndexPrefix()}github-docs-${convertLegacyVersionName(
      version
    )}-${language}`

    const hits = []
    const tags = ['version:legacy', `indexName:${indexName}`]
    const timed = statsd.asyncTimer(getSearchResults, 'api.search', tags)
    const options = {
      indexName,
      query,
      page: 1,
      sort: 'best',
      size: limit,
      debug: true,
      includeTopics: true,
      // The legacy search is used as an autocomplete. In other words,
      // a debounce that sends the query before the user has had a
      // chance to fully submit the search. That means if the user
      // sends the query 'google cl' they hope to find 'Google Cloud'
      // even though they didn't type that fully.
      usePrefixSearch: true,
      topics,
    }
    try {
      const { hits: hits_, meta } = await timed(options)
      hits.push(...hits_)
      statsd.timing('api.search.total', meta.took.total_msec, tags)
      statsd.timing('api.search.query', meta.took.query_msec, tags)
    } catch (error) {
      // If we don't catch here, the `catchMiddlewareError()` wrapper
      // will take any thrown error and pass it to `next()`.
      await handleGetSearchResultsError(req, res, error, options)
      return
    }

    // The legacy search just returned an array
    const results = hits.map((hit) => {
      // Prefer the highlighted title over the plain one when there is one
      let title = hit.title
      if (hit.highlights?.title && hit.highlights?.title.length) {
        title = hit.highlights.title[0]
      }
      // The content is made up exclusively of the highlighted fragments
      let content = ''
      if (hit.highlights?.content && hit.highlights?.content.length) {
        content = hit.highlights.content.join('\n')
      }
      return {
        url: hit.url,
        title,
        breadcrumbs: hit.breadcrumbs || '',
        content,
        topics: hit.topics || [],
        popularity: hit.popularity || 0.0,
        score: hit.score,
      }
    })

    // No cache headers in development
    if (process.env.NODE_ENV !== 'development') {
      searchCacheControl(res)
      setFastlySurrogateKey(res, `api-search:${language}`, true)
    }
    res.setHeader('x-search-legacy', 'yes')
    res.status(200).json(results)
  })
)
// Thrown from a parameter's `validate` callback when it wants to supply
// its own error message. The validation middleware catches it and responds
// with a 400 Bad Request whose body carries `err.toString()` and the name
// of the offending key. Any other kind of error is re-thrown.
class ValidationError extends Error {}
/**
 * Express middleware that reads, defaults, casts and validates the search
 * query string parameters.
 *
 * On success the cleaned-up values, plus the computed `indexName`, are put
 * on `req.search` and `next()` is called. On the first invalid parameter
 * it responds with 400 Bad Request and a JSON body describing the problem,
 * and `next()` is not called.
 *
 * Each entry in `params` can have:
 *   key      - name of the query string parameter
 *   default_ - value used when the parameter is missing or blank; a
 *              parameter without a default is required
 *   cast     - converts the value (the default included) before validation
 *   validate - returns falsy, or throws a ValidationError, to reject
 *
 * @param {object} req - Express request; `req.query` is read, `req.search` is set.
 * @param {object} res - Express response, used only for 400 responses.
 * @param {Function} next - Called once all parameters are valid.
 */
const validationMiddleware = (req, res, next) => {
  // Look up `key` on `obj` itself, never on its prototype chain. Without
  // this, a request like `?version=constructor` or `?language=toString`
  // would find an inherited member of Object.prototype and be accepted.
  const hasOwnKey = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key)
  const ownValue = (obj, key) => (hasOwnKey(obj, key) ? obj[key] : undefined)

  const params = [
    { key: 'query' },
    {
      key: 'version',
      default_: 'dotcom',
      validate: (v) => {
        if (ownValue(versionAliases, v) || ownValue(allVersions, v)) return true
        const valid = [...Object.keys(versionAliases), ...Object.keys(allVersions)]
        throw new ValidationError(`'${v}' not in ${valid}`)
      },
    },
    { key: 'language', default_: 'en', validate: (v) => hasOwnKey(languages, v) },
    {
      key: 'size',
      default_: DEFAULT_SIZE,
      cast: (v) => parseInt(v, 10),
      validate: (v) => v >= 0 && v <= MAX_SIZE,
    },
    {
      key: 'page',
      default_: DEFAULT_PAGE,
      cast: (v) => parseInt(v, 10),
      validate: (v) => v >= 1 && v <= MAX_PAGE,
    },
    { key: 'sort', default_: DEFAULT_SORT, validate: (v) => POSSIBLE_SORTS.includes(v) },
    {
      key: 'highlights',
      default_: DEFAULT_HIGHLIGHT_FIELDS,
      cast: (v) => (Array.isArray(v) ? v : [v]),
      validate: (v) => {
        for (const highlight of v) {
          if (!POSSIBLE_HIGHLIGHT_FIELDS.includes(highlight)) {
            throw new ValidationError(`highlight value '${highlight}' is not valid`)
          }
        }
        return true
      },
    },
    { key: 'autocomplete', default_: false, cast: toBoolean },
    { key: 'debug', default_: process.env.NODE_ENV === 'development', cast: toBoolean },
    {
      key: 'include',
      default_: [],
      cast: toArray,
      // Note: At the time of writing this general validator middleware
      // doesn't yet know it's being used by the v1 version.
      // But we don't have any other versions yet so no need to
      // over-engineer this more.
      validate: (values) => values.every((value) => V1_ADDITIONAL_INCLUDES.includes(value)),
    },
  ]

  const search = {}
  for (const { key, default_, cast, validate } of params) {
    let value = req.query[key]
    // A missing value and a whitespace-only string both count as "not given"
    if (!value || (typeof value === 'string' && !value.trim())) {
      if (default_ === undefined) {
        // no value and no default, bad!
        return res.status(400).json({ error: `No truthy value for key '${key}'` })
      }
      value = default_
    }
    if (cast) {
      value = cast(value)
    }
    try {
      if (validate && !validate(value)) {
        return res
          .status(400)
          .json({ error: `Not a valid value (${JSON.stringify(value)}) for key '${key}'` })
      }
    } catch (err) {
      // A validator may throw to provide its own message. Anything that
      // isn't a ValidationError is a genuine bug and must not be hidden.
      if (err instanceof ValidationError) {
        return res.status(400).json({ error: err.toString(), field: key })
      }
      throw err
    }
    search[key] = value
  }

  // The validator for `version` guarantees that one of these two lookups hits
  const version =
    ownValue(versionAliases, search.version) || ownValue(allVersions, search.version).miscVersionName
  search.indexName = `${getIndexPrefix()}github-docs-${version}-${search.language}` // github-docs-ghes-3.5-en
  req.search = search
  return next()
}
/**
 * Casts a query string value to a boolean.
 *
 * Only the strings `'true'` and `'1'` count as true. An actual boolean
 * `true` is passed through as well, because default values are run
 * through the cast too and a default of `true` must not turn into `false`.
 *
 * @param {*} value - The raw value from the query string, or a default.
 * @returns {boolean} `true` for `true`, `'true'` and `'1'`; `false` for anything else.
 */
function toBoolean(value) {
  return value === true || value === 'true' || value === '1'
}
/**
 * Makes sure a value is an array.
 *
 * @param {*} value - Anything.
 * @returns {Array} `value` itself when it already is an array, otherwise
 *   a new one-element array holding `value`.
 */
function toArray(value) {
  if (Array.isArray(value)) return value
  return [value]
}
// GET /v1
//
// The validation middleware runs first, so by the time the handler is
// called `req.search` holds the cleaned-up parameters and the index name.
router.get(
  '/v1',
  validationMiddleware,
  catchMiddlewareError(async function search(req, res) {
    const params = req.search

    // getSearchResults() prepares the search, sends it, receives the
    // response and post-processes it. The timer therefore covers both the
    // work done in Node and the round trip over the network (i.e. to
    // Elasticsearch). Roughly 99.5% of the total is network time, so in
    // practice this measures the network.
    const metricTags = ['version:v1', `indexName:${params.indexName}`]
    const timedSearch = statsd.asyncTimer(getSearchResults, 'api.search', metricTags)

    const searchOptions = {
      indexName: params.indexName,
      query: params.query,
      page: params.page,
      size: params.size,
      debug: params.debug,
      sort: params.sort,
      highlights: params.highlights,
      usePrefixSearch: params.autocomplete,
      include: params.include,
    }

    try {
      const result = await timedSearch(searchOptions)
      const { meta, hits } = result

      statsd.timing('api.search.total', meta.took.total_msec, metricTags)
      statsd.timing('api.search.query', meta.took.query_msec, metricTags)

      const isDevelopment = process.env.NODE_ENV === 'development'
      if (!isDevelopment) {
        searchCacheControl(res)
        setFastlySurrogateKey(res, `api-search:${params.language}`, true)
      }

      // What getSearchResults() returns is exactly the shape of the
      // v1 output, so it's passed on as is.
      res.status(200).json({ meta, hits })
    } catch (error) {
      // getSearchResults() can throw, for example on a 404 inside
      // Elasticsearch. If that isn't captured here, the error propagates
      // to the next middleware.
      await handleGetSearchResultsError(req, res, error, searchOptions)
    }
  })
)
/**
 * Shared error handling for a failed call to `getSearchResults()`.
 *
 * More than one endpoint wraps `getSearchResults()` in a `try {...} catch`,
 * and what they do in the `catch` is the same, so it lives here.
 *
 * In development the error is printed to the console. Otherwise it is
 * reported through FailBot together with the request URL and the search
 * options. In both cases the response is a 500 with the error message.
 *
 * @param {object} req - Express request; only `req.url` is read.
 * @param {object} res - Express response, used to send the 500.
 * @param {Error} error - What `getSearchResults()` threw.
 * @param {object} options - The options `getSearchResults()` was called with.
 * @returns {Promise<void>}
 */
async function handleGetSearchResultsError(req, res, error, options) {
  if (process.env.NODE_ENV === 'development') {
    // Serialize the options; interpolating the object directly would
    // only print "[object Object]".
    console.error(`Error calling getSearchResults(${JSON.stringify(options)})`, error)
  } else {
    const reports = FailBot.report(error, Object.assign({ url: req.url }, options))
    // It might be `undefined` if no backends are configured, which
    // is likely when using production NODE_ENV on your laptop
    // where you might not have an error reporting backend set up.
    if (reports) await Promise.all(reports)
  }
  res.status(500).json({ error: error.message })
}
// Alias for the latest version.
// At the time of writing, the latest version is v1. (July 2022)
router.get('/', function redirectToLatestVersion(req, res) {
  // This router is "self contained", so in here `req.url` is just `/`.
  // `req.originalUrl` is the one that still has the full path.
  const latestVersionUrl = req.originalUrl.replace('/search', '/search/v1')
  res.redirect(307, latestVersionUrl)
})
export default router