import express from 'express'

import searchVersions from '../../lib/search/versions.js'
import FailBot from '../../lib/failbot.js'
import languages from '../../lib/languages.js'
import { allVersions } from '../../lib/all-versions.js'
import { defaultCacheControl } from '../cache-control.js'
import catchMiddlewareError from '../catch-middleware-error.js'
import { getSearchResults, ELASTICSEARCH_URL } from './es-search.js'

// Used by the legacy search
const versions = new Set(Object.values(searchVersions))
const languagesSet = new Set(Object.keys(languages))

const router = express.Router()

const DEFAULT_SIZE = 10
const MAX_SIZE = 50 // How much you return has a strong impact on performance
const DEFAULT_PAGE = 1
const POSSIBLE_SORTS = ['best', 'relevance']
const DEFAULT_SORT = POSSIBLE_SORTS[0]

// Bounds for the `limit` query string parameter of the legacy endpoint.
const LEGACY_DEFAULT_LIMIT = 10
const LEGACY_MAX_LIMIT = 100

/**
 * Whether `key` is an *own* property of `obj`.
 *
 * Used instead of `key in obj` / `obj[key]` when `key` comes from the
 * query string, so that inherited names such as `constructor`,
 * `toString` or `__proto__` are never mistaken for valid values.
 *
 * @param {object} obj - The lookup table.
 * @param {*} key - The (possibly user-supplied) property key.
 * @returns {boolean}
 */
function hasOwn(obj, key) {
  return Object.prototype.hasOwnProperty.call(obj, key)
}

/**
 * Read `obj[key]` but only if it's an own property of `obj`.
 *
 * @param {object} obj - The lookup table.
 * @param {*} key - The (possibly user-supplied) property key.
 * @returns {*} The own value, or `undefined` if `key` is missing or inherited.
 */
function getOwn(obj, key) {
  return hasOwn(obj, key) ? obj[key] : undefined
}

// If someone searches for `...&version=3.5` what they actually mean
// is `ghes-3.5`. This is because of legacy formatting with the old search.
// In some distant future we can clean up any client enough that this
// aliasing won't be necessary.
const versionAliases = {}
Object.values(allVersions).forEach((info) => {
  if (info.hasNumberedReleases) {
    versionAliases[info.currentRelease] = info.miscVersionName
  } else {
    versionAliases[info.version] = info.miscVersionName
    versionAliases[info.miscVersionName] = info.miscVersionName
  }
})

// Maps the short name of every `enterprise-server@...` search version
// to its `ghes-` prefixed equivalent.
const legacyEnterpriseServerVersions = Object.fromEntries(
  Object.entries(searchVersions)
    .filter(([fullName]) => {
      return fullName.startsWith('enterprise-server@')
    })
    .map(([, shortName]) => {
      return [shortName, `ghes-${shortName}`]
    })
)

/**
 * Prefix to put in front of every Elasticsearch index name.
 *
 * @returns {string} `'tests_'` when `NODE_ENV` is `test`, otherwise `''`.
 */
function getIndexPrefix() {
  // This logic is mirrored in the scripts we use before running tests
  // In particular, see the `index-test-fixtures` npm script.
  // That's expected to be run before CI and local jest testing.
  // The reason we have a deliberately different index name (by prefix)
  // for testing compared to regular operation is to make it convenient
  // for engineers working on local manual testing *and* automated
  // testing without having to re-index different content (e.g. fixtures
  // vs real content) on the same index name.
  if (process.env.NODE_ENV === 'test') return 'tests_'

  return ''
}

/**
 * Translate a legacy short Enterprise Server version into its `ghes-` form.
 *
 * @param {string} version - A version already validated against `versions`.
 * @returns {string} The `ghes-...` name if there is one, otherwise `version` as is.
 */
function convertLegacyVersionName(version) {
  // In the olden days we used to use `?version=3.5&...` but we decided
  // that's ambiguous and it should be `ghes-3.5` instead.
  return legacyEnterpriseServerVersions[version] || version
}

/**
 * Middleware that stops the request when `ELASTICSEARCH_URL` isn't set.
 *
 * In production it responds with a plain 500. In any other environment
 * it throws, with a hint about how to configure it locally.
 */
function notConfiguredMiddleware(req, res, next) {
  if (!ELASTICSEARCH_URL) {
    if (process.env.NODE_ENV === 'production') {
      // Temporarily, this is OKish. The Docs Engineering team is
      // currently working on setting up an Elasticsearch cloud
      // instance that we can use. We don't currently have that,
      // but this code is running in production. We just don't want
      // to unnecessarily throw errors when it's actually a known thing.
      return res.status(500).send('ELASTICSEARCH_URL not been set up yet')
    }
    throw new Error(
      'process.env.ELASTICSEARCH_URL is not set. ' +
        "If you're working on this locally, add `ELASTICSEARCH_URL=http://localhost:9200` in your .env file"
    )
  }
  return next()
}

// The legacy search endpoint. Responds with a plain array of results.
router.get(
  '/legacy',
  notConfiguredMiddleware,
  catchMiddlewareError(async function legacySearch(req, res) {
    const { query, version, language, filters, limit: limit_ } = req.query
    if (filters) {
      throw new Error('not implemented yet')
    }

    // Anything that isn't a positive integer (missing, unparsable, zero
    // or negative) falls back to the default rather than being sent
    // to Elasticsearch as a nonsensical `size`.
    const parsedLimit = parseInt(limit_, 10)
    const limit = Math.min(parsedLimit > 0 ? parsedLimit : LEGACY_DEFAULT_LIMIT, LEGACY_MAX_LIMIT)

    if (!versions.has(version)) {
      return res.status(400).json({ error: 'Unrecognized version' })
    }
    if (!languagesSet.has(language)) {
      return res.status(400).json({ error: 'Unrecognized language' })
    }
    // No need to also check `limit` here; it's always at least 1.
    if (!query) {
      return res.status(200).json([])
    }

    const indexName = `${getIndexPrefix()}github-docs-${convertLegacyVersionName(
      version
    )}-${language}`

    const hits = []
    try {
      const searchResults = await getSearchResults({
        indexName,
        query,
        page: 1,
        sort: 'best',
        size: limit,
        debug: true,
        includeTopics: true,
        // The legacy search is used as an autocomplete. In other words,
        // a debounce that sends the query before the user has had a
        // chance to fully submit the search. That means if the user
        // sends the query 'google cl' they hope to find 'Google Cloud'
        // even though they didn't type that fully.
        usePrefixSearch: true,
      })
      hits.push(...searchResults.hits)
    } catch (err) {
      // If we don't catch here, the `catchMiddlewareError()` wrapper
      // will take any thrown error and pass it to `next()`.
      console.error('Error wrapping getSearchResults()', err)
      return res.status(500).json([])
    }

    // The legacy search just returned an array
    const results = hits.map((hit) => {
      // Prefer the highlighted variants when there are any.
      let title = hit.title
      if (hit.highlights?.title?.length) {
        title = hit.highlights.title[0]
      }
      let content = ''
      if (hit.highlights?.content?.length) {
        content = hit.highlights.content.join('\n')
      }
      return {
        url: hit.url,
        title,
        breadcrumbs: hit.breadcrumbs || '',
        content,
        topics: hit.topics || [],
        popularity: hit.popularity || 0.0,
        score: hit.score,
      }
    })
    if (process.env.NODE_ENV !== 'development') {
      defaultCacheControl(res)
    }
    res.setHeader('x-search-legacy', 'yes')

    res.status(200).json(results)
  })
)

// Thrown by a `validate` callback when it wants to supply its own
// error message for the 400 response.
class ValidationError extends Error {}

/**
 * Middleware that validates the query string of a `/v1` search request.
 *
 * For each known key it takes the value from `req.query`, falls back to
 * the default if the value is falsy or blank, applies the optional cast
 * and then the optional validator. Any failure ends the request with
 * a 400 JSON response. On success, the values plus the computed
 * `indexName` are put on `req.search`.
 */
const validationMiddleware = (req, res, next) => {
  const params = [
    { key: 'query' },
    {
      key: 'version',
      default_: 'dotcom',
      validate: (v) => {
        // Own properties only, so that `?version=constructor` and
        // friends are rejected instead of matching `Object.prototype`.
        if (getOwn(versionAliases, v) || getOwn(allVersions, v)) return true
        const valid = [...Object.keys(versionAliases), ...Object.keys(allVersions)]
        throw new ValidationError(`'${v}' not in ${valid}`)
      },
    },
    // Own properties only; `in` would also accept inherited names
    // like `toString`.
    { key: 'language', default_: 'en', validate: (v) => hasOwn(languages, v) },
    {
      key: 'size',
      default_: DEFAULT_SIZE,
      cast: (v) => parseInt(v, 10),
      validate: (v) => v >= 0 && v <= MAX_SIZE,
    },
    {
      key: 'page',
      default_: DEFAULT_PAGE,
      cast: (v) => parseInt(v, 10),
      validate: (v) => v >= 1 && v <= 10,
    },
    { key: 'sort', default_: DEFAULT_SORT, validate: (v) => POSSIBLE_SORTS.includes(v) },
    // NOTE: There's no cast for `debug`, so any non-blank value in the
    // query string (even `debug=false`) is passed on as that string.
    { key: 'debug', default_: Boolean(process.env.NODE_ENV === 'development' || req.query.debug) },
  ]

  const search = {}
  for (const { key, default_, cast, validate } of params) {
    let value = req.query[key]
    if (!value || (typeof value === 'string' && !value.trim())) {
      if (default_ === undefined) {
        // no value and no default, bad!
        return res.status(400).json({ error: `No truthy value for key '${key}'` })
      }
      value = default_
    }
    if (cast) {
      value = cast(value)
    }
    try {
      if (validate && !validate(value)) {
        return res
          .status(400)
          .json({ error: `Not a valid value (${JSON.stringify(value)}) for key '${key}'` })
      }
    } catch (err) {
      if (err instanceof ValidationError) {
        return res.status(400).json({ error: err.toString(), field: key })
      }
      throw err
    }
    search[key] = value
  }

  // The validation above guarantees that at least one of these
  // lookups finds something.
  const version =
    getOwn(versionAliases, search.version) || getOwn(allVersions, search.version).miscVersionName

  search.indexName = `${getIndexPrefix()}github-docs-${version}-${search.language}` // github-docs-ghes-3.5-en

  req.search = search

  return next()
}

// The v1 search endpoint. Responds with `{ meta, hits }`.
router.get(
  '/v1',
  validationMiddleware,
  notConfiguredMiddleware,
  catchMiddlewareError(async function search(req, res) {
    const { indexName, query, page, size, debug, sort } = req.search
    try {
      const { meta, hits } = await getSearchResults({ indexName, query, page, size, debug, sort })

      if (process.env.NODE_ENV !== 'development') {
        // The assumption, at the moment is that searches are never distinguished
        // differently depending on a cookie or a request header.
        // So the only distinguishing key is the request URL.
        // Because of that, it's safe to allow the reverse proxy (a.k.a the CDN)
        // cache and hold on to this.
        defaultCacheControl(res)
      }

      // The v1 version of the output matches perfectly what comes out
      // of the getSearchResults() function.
      res.status(200).json({ meta, hits })
    } catch (error) {
      // If getSearchResults() throws an error that might be 404 inside
      // elasticsearch, if we don't capture that here, it will propagate
      // to the next middleware.
      if (process.env.NODE_ENV === 'development') {
        console.error('Error calling getSearchResults()', error)
      } else {
        const reports = FailBot.report(error, {
          url: req.url,
          indexName,
          query,
          page,
          size,
          debug,
          sort,
        })
        // It might be `undefined` if no backends are configured which
        // is likely when using production NODE_ENV on your laptop
        // where you might not have a HAYSTACK_URL configured.
        if (reports) await Promise.all(reports)
      }
      res.status(500).send(error.message)
    }
  })
)

// Alias for the latest version
router.get('/', (req, res) => {
  // At the time of writing, the latest version is v1. (July 2022)
  // Use `req.originalUrl` because this router is "self contained"
  // which means that `req.url` will be `/` in this context.
  res.redirect(307, req.originalUrl.replace('/search', '/search/v1'))
})

export default router