diff --git a/javascripts/search.js b/javascripts/search.js index f522fcb1c6..c7d49eb1c2 100644 --- a/javascripts/search.js +++ b/javascripts/search.js @@ -287,7 +287,7 @@ function tmplSearchResult ({ url, breadcrumbs, heading, title, content }) { ) } -// Convert em to mark tags in search responses +// Convert mark tags in search responses function markify (text) { const { mark } = tags return text diff --git a/lib/search/compress.js b/lib/search/compress.js new file mode 100644 index 0000000000..7e978463ba --- /dev/null +++ b/lib/search/compress.js @@ -0,0 +1,21 @@ +const { promisify } = require('util') +const zlib = require('zlib') +const brotliCompress = promisify(zlib.brotliCompress) +const brotliDecompress = promisify(zlib.brotliDecompress) + +const options = { + params: { + [zlib.constants.BROTLI_PARAM_MODE]: zlib.constants.BROTLI_MODE_TEXT, + [zlib.constants.BROTLI_PARAM_QUALITY]: 6 + } +} + +module.exports = { + async compress (data) { + return brotliCompress(data, options) + }, + + async decompress (data) { + return brotliDecompress(data, options) + } +} diff --git a/lib/search/indexes/github-docs-2.20-cn-records.json.br b/lib/search/indexes/github-docs-2.20-cn-records.json.br new file mode 100644 index 0000000000..702854e145 Binary files /dev/null and b/lib/search/indexes/github-docs-2.20-cn-records.json.br differ diff --git a/lib/search/indexes/github-docs-2.20-cn.json.br b/lib/search/indexes/github-docs-2.20-cn.json.br new file mode 100644 index 0000000000..f578b5644a Binary files /dev/null and b/lib/search/indexes/github-docs-2.20-cn.json.br differ diff --git a/lib/search/indexes/github-docs-2.20-en-records.json.br b/lib/search/indexes/github-docs-2.20-en-records.json.br new file mode 100644 index 0000000000..81f6a7c6e0 Binary files /dev/null and b/lib/search/indexes/github-docs-2.20-en-records.json.br differ diff --git a/lib/search/indexes/github-docs-2.20-en.json.br b/lib/search/indexes/github-docs-2.20-en.json.br new file mode 100644 index 
0000000000..240a0e852a Binary files /dev/null and b/lib/search/indexes/github-docs-2.20-en.json.br differ diff --git a/lib/search/indexes/github-docs-2.20-ja-records.json.br b/lib/search/indexes/github-docs-2.20-ja-records.json.br new file mode 100644 index 0000000000..d4fc62c072 Binary files /dev/null and b/lib/search/indexes/github-docs-2.20-ja-records.json.br differ diff --git a/lib/search/indexes/github-docs-2.20-ja.json.br b/lib/search/indexes/github-docs-2.20-ja.json.br new file mode 100644 index 0000000000..46a0ea9c9e Binary files /dev/null and b/lib/search/indexes/github-docs-2.20-ja.json.br differ diff --git a/lib/search/indexes/github-docs-2.21-cn-records.json.br b/lib/search/indexes/github-docs-2.21-cn-records.json.br new file mode 100644 index 0000000000..443e5ce6f3 Binary files /dev/null and b/lib/search/indexes/github-docs-2.21-cn-records.json.br differ diff --git a/lib/search/indexes/github-docs-2.21-cn.json.br b/lib/search/indexes/github-docs-2.21-cn.json.br new file mode 100644 index 0000000000..b7e424656f Binary files /dev/null and b/lib/search/indexes/github-docs-2.21-cn.json.br differ diff --git a/lib/search/indexes/github-docs-2.21-en-records.json.br b/lib/search/indexes/github-docs-2.21-en-records.json.br new file mode 100644 index 0000000000..97b48ce4b6 Binary files /dev/null and b/lib/search/indexes/github-docs-2.21-en-records.json.br differ diff --git a/lib/search/indexes/github-docs-2.21-en.json.br b/lib/search/indexes/github-docs-2.21-en.json.br new file mode 100644 index 0000000000..9a379ab14f Binary files /dev/null and b/lib/search/indexes/github-docs-2.21-en.json.br differ diff --git a/lib/search/indexes/github-docs-2.21-es-records.json.br b/lib/search/indexes/github-docs-2.21-es-records.json.br new file mode 100644 index 0000000000..2c71aab8b1 Binary files /dev/null and b/lib/search/indexes/github-docs-2.21-es-records.json.br differ diff --git a/lib/search/indexes/github-docs-2.21-es.json.br 
b/lib/search/indexes/github-docs-2.21-es.json.br new file mode 100644 index 0000000000..9ff15cc32c Binary files /dev/null and b/lib/search/indexes/github-docs-2.21-es.json.br differ diff --git a/lib/search/indexes/github-docs-2.21-ja-records.json.br b/lib/search/indexes/github-docs-2.21-ja-records.json.br new file mode 100644 index 0000000000..2679fd7b83 Binary files /dev/null and b/lib/search/indexes/github-docs-2.21-ja-records.json.br differ diff --git a/lib/search/indexes/github-docs-2.21-ja.json.br b/lib/search/indexes/github-docs-2.21-ja.json.br new file mode 100644 index 0000000000..99c5b39639 Binary files /dev/null and b/lib/search/indexes/github-docs-2.21-ja.json.br differ diff --git a/lib/search/indexes/github-docs-2.22-cn-records.json.br b/lib/search/indexes/github-docs-2.22-cn-records.json.br new file mode 100644 index 0000000000..426a35b94b Binary files /dev/null and b/lib/search/indexes/github-docs-2.22-cn-records.json.br differ diff --git a/lib/search/indexes/github-docs-2.22-cn.json.br b/lib/search/indexes/github-docs-2.22-cn.json.br new file mode 100644 index 0000000000..9ad7a777bc Binary files /dev/null and b/lib/search/indexes/github-docs-2.22-cn.json.br differ diff --git a/lib/search/indexes/github-docs-2.22-en-records.json.br b/lib/search/indexes/github-docs-2.22-en-records.json.br new file mode 100644 index 0000000000..264a058595 Binary files /dev/null and b/lib/search/indexes/github-docs-2.22-en-records.json.br differ diff --git a/lib/search/indexes/github-docs-2.22-en.json.br b/lib/search/indexes/github-docs-2.22-en.json.br new file mode 100644 index 0000000000..a64a406ef0 Binary files /dev/null and b/lib/search/indexes/github-docs-2.22-en.json.br differ diff --git a/lib/search/indexes/github-docs-2.22-es-records.json.br b/lib/search/indexes/github-docs-2.22-es-records.json.br new file mode 100644 index 0000000000..313a3d7045 Binary files /dev/null and b/lib/search/indexes/github-docs-2.22-es-records.json.br differ diff --git 
a/lib/search/indexes/github-docs-2.22-es.json.br b/lib/search/indexes/github-docs-2.22-es.json.br new file mode 100644 index 0000000000..3a1f42cdfa Binary files /dev/null and b/lib/search/indexes/github-docs-2.22-es.json.br differ diff --git a/lib/search/indexes/github-docs-2.22-ja-records.json.br b/lib/search/indexes/github-docs-2.22-ja-records.json.br new file mode 100644 index 0000000000..d7dc86a125 Binary files /dev/null and b/lib/search/indexes/github-docs-2.22-ja-records.json.br differ diff --git a/lib/search/indexes/github-docs-2.22-ja.json.br b/lib/search/indexes/github-docs-2.22-ja.json.br new file mode 100644 index 0000000000..dd821b5c5c Binary files /dev/null and b/lib/search/indexes/github-docs-2.22-ja.json.br differ diff --git a/lib/search/indexes/github-docs-3.0-cn-records.json.br b/lib/search/indexes/github-docs-3.0-cn-records.json.br new file mode 100644 index 0000000000..f4226c231a Binary files /dev/null and b/lib/search/indexes/github-docs-3.0-cn-records.json.br differ diff --git a/lib/search/indexes/github-docs-3.0-cn.json.br b/lib/search/indexes/github-docs-3.0-cn.json.br new file mode 100644 index 0000000000..e806f8fad1 Binary files /dev/null and b/lib/search/indexes/github-docs-3.0-cn.json.br differ diff --git a/lib/search/indexes/github-docs-3.0-en-records.json.br b/lib/search/indexes/github-docs-3.0-en-records.json.br new file mode 100644 index 0000000000..c334cc751c Binary files /dev/null and b/lib/search/indexes/github-docs-3.0-en-records.json.br differ diff --git a/lib/search/indexes/github-docs-3.0-en.json.br b/lib/search/indexes/github-docs-3.0-en.json.br new file mode 100644 index 0000000000..6fd4b0bf99 Binary files /dev/null and b/lib/search/indexes/github-docs-3.0-en.json.br differ diff --git a/lib/search/indexes/github-docs-3.0-es-records.json.br b/lib/search/indexes/github-docs-3.0-es-records.json.br new file mode 100644 index 0000000000..d4fc39d195 Binary files /dev/null and b/lib/search/indexes/github-docs-3.0-es-records.json.br 
differ diff --git a/lib/search/indexes/github-docs-3.0-es.json.br b/lib/search/indexes/github-docs-3.0-es.json.br new file mode 100644 index 0000000000..32b7af5d63 Binary files /dev/null and b/lib/search/indexes/github-docs-3.0-es.json.br differ diff --git a/lib/search/indexes/github-docs-3.0-ja-records.json.br b/lib/search/indexes/github-docs-3.0-ja-records.json.br new file mode 100644 index 0000000000..6bc5577432 Binary files /dev/null and b/lib/search/indexes/github-docs-3.0-ja-records.json.br differ diff --git a/lib/search/indexes/github-docs-3.0-ja.json.br b/lib/search/indexes/github-docs-3.0-ja.json.br new file mode 100644 index 0000000000..a51d6fc4e9 Binary files /dev/null and b/lib/search/indexes/github-docs-3.0-ja.json.br differ diff --git a/lib/search/indexes/github-docs-dotcom-cn-records.json.br b/lib/search/indexes/github-docs-dotcom-cn-records.json.br new file mode 100644 index 0000000000..e765eeca18 Binary files /dev/null and b/lib/search/indexes/github-docs-dotcom-cn-records.json.br differ diff --git a/lib/search/indexes/github-docs-dotcom-cn.json.br b/lib/search/indexes/github-docs-dotcom-cn.json.br new file mode 100644 index 0000000000..97b801b09b Binary files /dev/null and b/lib/search/indexes/github-docs-dotcom-cn.json.br differ diff --git a/lib/search/indexes/github-docs-dotcom-en-records.json.br b/lib/search/indexes/github-docs-dotcom-en-records.json.br new file mode 100644 index 0000000000..3a0f7038a0 Binary files /dev/null and b/lib/search/indexes/github-docs-dotcom-en-records.json.br differ diff --git a/lib/search/indexes/github-docs-dotcom-en.json.br b/lib/search/indexes/github-docs-dotcom-en.json.br new file mode 100644 index 0000000000..16b64d3304 Binary files /dev/null and b/lib/search/indexes/github-docs-dotcom-en.json.br differ diff --git a/lib/search/indexes/github-docs-dotcom-es-records.json.br b/lib/search/indexes/github-docs-dotcom-es-records.json.br new file mode 100644 index 0000000000..c36c64045c Binary files /dev/null and 
b/lib/search/indexes/github-docs-dotcom-es-records.json.br differ diff --git a/lib/search/indexes/github-docs-dotcom-es.json.br b/lib/search/indexes/github-docs-dotcom-es.json.br new file mode 100644 index 0000000000..84ed1508ce Binary files /dev/null and b/lib/search/indexes/github-docs-dotcom-es.json.br differ diff --git a/lib/search/indexes/github-docs-dotcom-ja-records.json.br b/lib/search/indexes/github-docs-dotcom-ja-records.json.br new file mode 100644 index 0000000000..9897ba2d10 Binary files /dev/null and b/lib/search/indexes/github-docs-dotcom-ja-records.json.br differ diff --git a/lib/search/indexes/github-docs-dotcom-ja.json.br b/lib/search/indexes/github-docs-dotcom-ja.json.br new file mode 100644 index 0000000000..cfa1999952 Binary files /dev/null and b/lib/search/indexes/github-docs-dotcom-ja.json.br differ diff --git a/lib/search/indexes/github-docs-ghae-cn-records.json.br b/lib/search/indexes/github-docs-ghae-cn-records.json.br new file mode 100644 index 0000000000..b0a1d0b383 Binary files /dev/null and b/lib/search/indexes/github-docs-ghae-cn-records.json.br differ diff --git a/lib/search/indexes/github-docs-ghae-cn.json.br b/lib/search/indexes/github-docs-ghae-cn.json.br new file mode 100644 index 0000000000..b05db313d3 Binary files /dev/null and b/lib/search/indexes/github-docs-ghae-cn.json.br differ diff --git a/lib/search/indexes/github-docs-ghae-en-records.json.br b/lib/search/indexes/github-docs-ghae-en-records.json.br new file mode 100644 index 0000000000..07764c50f6 Binary files /dev/null and b/lib/search/indexes/github-docs-ghae-en-records.json.br differ diff --git a/lib/search/indexes/github-docs-ghae-en.json.br b/lib/search/indexes/github-docs-ghae-en.json.br new file mode 100644 index 0000000000..102150b927 Binary files /dev/null and b/lib/search/indexes/github-docs-ghae-en.json.br differ diff --git a/lib/search/indexes/github-docs-ghae-ja-records.json.br b/lib/search/indexes/github-docs-ghae-ja-records.json.br new file mode 100644 
index 0000000000..85757d440d Binary files /dev/null and b/lib/search/indexes/github-docs-ghae-ja-records.json.br differ diff --git a/lib/search/indexes/github-docs-ghae-ja.json.br b/lib/search/indexes/github-docs-ghae-ja.json.br new file mode 100644 index 0000000000..919b188bc7 Binary files /dev/null and b/lib/search/indexes/github-docs-ghae-ja.json.br differ diff --git a/lib/search/lunr-get-index-names.js b/lib/search/lunr-get-index-names.js new file mode 100644 index 0000000000..a532e4be73 --- /dev/null +++ b/lib/search/lunr-get-index-names.js @@ -0,0 +1,6 @@ +const fs = require('fs').promises +const path = require('path') + +module.exports = async function getIndexNames () { + return await fs.readdir(path.join(__dirname, 'indexes')) +} diff --git a/lib/search/lunr-search-index.js b/lib/search/lunr-search-index.js new file mode 100644 index 0000000000..05eb88ddc3 --- /dev/null +++ b/lib/search/lunr-search-index.js @@ -0,0 +1,93 @@ +const lunr = require('lunr') +require('lunr-languages/lunr.stemmer.support')(lunr) +require('lunr-languages/tinyseg')(lunr) +require('lunr-languages/lunr.ja')(lunr) +require('lunr-languages/lunr.es')(lunr) +require('lunr-languages/lunr.pt')(lunr) +require('lunr-languages/lunr.de')(lunr) +const fs = require('fs').promises +const path = require('path') +const rank = require('./rank') +const validateRecords = require('./validate-records') +const { compress } = require('./compress') + +module.exports = class LunrIndex { + constructor (name, records) { + this.name = name + + // Add custom rankings + this.records = records.map(record => { + record.customRanking = rank(record) + return record + }) + + this.validate() + + return this + } + + validate () { + return validateRecords(this.name, this.records) + } + + build () { + const language = this.name.split('-').pop() + const records = this.records + + this.index = lunr(function constructIndex () { // No arrow here! 
+ if (['ja', 'es', 'pt', 'de'].includes(language)) { + this.use(lunr[language]) + } + + this.ref('objectID') + this.field('url') + this.field('slug') + this.field('breadcrumbs') + this.field('heading') + this.field('title') + this.field('content') + this.field('customRanking') + + this.metadataWhitelist = ['position'] + + for (const record of records) { + this.add(record) + } + }) + } + + toJSON () { + this.build() + return JSON.stringify(this.index, null, 2) + } + + get recordsObject () { + return Object.fromEntries( + this.records.map(record => [record.objectID, record]) + ) + } + + async write () { + this.build() + + // Write the parsed records + await Promise.resolve(this.recordsObject) + .then(JSON.stringify) + .then(compress) + .then(content => fs.writeFile( + path.posix.join(__dirname, 'indexes', `${this.name}-records.json.br`), + content + // Do not set to 'utf8' + )) + + // Write the index + await Promise.resolve(this.index) + .then(JSON.stringify) + .then(compress) + .then(content => fs.writeFile( + path.posix.join(__dirname, 'indexes', `${this.name}.json.br`), + content + // Do not set to 'utf8' + )) + } +} diff --git a/lib/search/lunr-search.js b/lib/search/lunr-search.js new file mode 100644 index 0000000000..e081206a45 --- /dev/null +++ b/lib/search/lunr-search.js @@ -0,0 +1,81 @@ +const fs = require('fs').promises +const path = require('path') +const lunr = require('lunr') +const { get } = require('lodash') +const { namePrefix } = require('./config') +const { decompress } = require('./compress') + +const LUNR_DIR = './indexes' +const lunrIndexes = new Map() +const lunrRecords = new Map() + +module.exports = async function loadLunrResults ({ version, language, query, limit }) { + const indexName = `${namePrefix}-${version}-${language}` + if (!lunrIndexes.has(indexName) || !lunrRecords.has(indexName)) { + lunrIndexes.set(indexName, await loadLunrIndex(indexName)) + lunrRecords.set(indexName, await loadLunrRecords(indexName)) + } + const results = 
/**
 * Return `record[name]` with every matched token wrapped in `<mark>` tags.
 *
 * @param {object} result - A lunr search result; its `matchData.metadata` maps
 *   each matched term to per-field metadata holding `position` pairs.
 * @param {object} record - The stored record the result refers to.
 * @param {string} name - The record field to render (e.g. `title`).
 * @returns {*} The highlighted text, or the untouched field value when it is
 *   empty/missing or has no matches in this field.
 */
function field (result, record, name) {
  const text = record[name]
  if (!text) return text

  // Gather the [start, length] pairs of every term that matched in this field
  // and convert them to [start, end) ranges ordered by position.
  const ranges = Object.values(result.matchData.metadata)
    .map(fields => get(fields, [name, 'position']))
    .filter(Boolean)
    .flat()
    .map(([start, length]) => [start, start + length])
    .sort((a, b) => a[0] - b[0] || a[1] - b[1])

  // If this field has no token matches, no highlighting
  if (!ranges.length) return text

  // Merge overlapping or repeated ranges so no stretch of text is emitted
  // twice. Ranges that merely touch stay separate.
  const merged = []
  for (const [start, end] of ranges) {
    const last = merged[merged.length - 1]
    if (last && start < last[1]) {
      last[1] = Math.max(last[1], end)
    } else {
      merged.push([start, end])
    }
  }

  // Stitch the text back together: plain gap, highlighted match, ..., plain tail.
  let highlighted = ''
  let cursor = 0
  for (const [start, end] of merged) {
    highlighted += text.slice(cursor, start) + mark(text.slice(start, end))
    cursor = end
  }
  return highlighted + text.slice(cursor)
}

/**
 * Wrap `text` in a `<mark>` element so the client can render the highlight.
 *
 * @param {string} text - The matched text.
 * @returns {string}
 */
function mark (text) {
  return `<mark>${text}</mark>`
}
require('./algolia-get-remote-index-names') const AlgoliaIndex = require('./algolia-search-index') +// Lunr +const LunrIndex = require('./lunr-search-index') +const getLunrIndexNames = require('./lunr-get-index-names') + // Build a search data file for every combination of product version and language // e.g. `github-docs-dotcom-en.json` and `github-docs-2.14-ja.json` module.exports = async function syncSearchIndexes (opts = {}) { @@ -67,22 +71,31 @@ module.exports = async function syncSearchIndexes (opts = {}) { // The page version will be the new version, e.g., free-pro-team@latest, enterprise-server@2.22 const records = await buildRecords(indexName, indexablePages, pageVersion, languageCode) - const index = new AlgoliaIndex(indexName, records) + const index = process.env.USE_LUNR + ? new LunrIndex(indexName, records) + : new AlgoliaIndex(indexName, records) if (opts.dryRun) { const cacheFile = path.join(cacheDir, `${indexName}.json`) fs.writeFileSync(cacheFile, JSON.stringify(index, null, 2)) console.log('wrote dry-run index to disk: ', cacheFile) } else { - await index.syncWithRemote() - console.log('synced index with remote: ', indexName) + if (process.env.USE_LUNR) { + await index.write() + console.log('wrote index to file: ', indexName) + } else { + await index.syncWithRemote() + console.log('synced index with remote: ', indexName) + } } } } // Fetch a list of index names and cache it for tests // to ensure that an index exists for every language and GHE version - const remoteIndexNames = await getRemoteIndexNames() + const remoteIndexNames = process.env.USE_LUNR + ? 
await getLunrIndexNames() + : await getRemoteIndexNames() const cachedIndexNamesFile = path.join(__dirname, './cached-index-names.json') fs.writeFileSync( cachedIndexNamesFile, diff --git a/middleware/search.js b/middleware/search.js index 6e2a7e6f20..25744462c1 100644 --- a/middleware/search.js +++ b/middleware/search.js @@ -1,6 +1,7 @@ const express = require('express') const languages = new Set(Object.keys(require('../lib/languages'))) const versions = require('../lib/search/versions') +const loadLunrResults = require('../lib/search/lunr-search') const loadAlgoliaResults = require('../lib/search/algolia-search') const router = express.Router() @@ -11,8 +12,8 @@ router.get('/', async (req, res) => { 'cache-control': 'private, no-store' }) - const { query, version, language } = req.query - const limit = Math.min(parseInt(req.query.limit, 10) || 10, 100) + const { query, version, language, limit: limit_ } = req.query + const limit = Math.min(parseInt(limit_, 10) || 10, 100) if (!versions.has(version) || !languages.has(language)) { return res.status(400).json([]) } @@ -21,7 +22,9 @@ router.get('/', async (req, res) => { } try { - const results = await loadAlgoliaResults({ version, language, query, limit }) + const results = process.env.USE_LUNR + ? 
await loadLunrResults({ version, language, query, limit }) + : await loadAlgoliaResults({ version, language, query, limit }) return res.status(200).json(results) } catch (err) { console.error(err) diff --git a/package-lock.json b/package-lock.json index d0bb36d553..629b86b299 100644 --- a/package-lock.json +++ b/package-lock.json @@ -17925,6 +17925,16 @@ "yallist": "^4.0.0" } }, + "lunr": { + "version": "2.3.9", + "resolved": "https://registry.npmjs.org/lunr/-/lunr-2.3.9.tgz", + "integrity": "sha512-zTU3DaZaF3Rt9rhN3uBMGQD3dD2/vFQqnvZCDv4dl5iOzq2IZQqTxu90r4E5J+nP70J3ilqVCrbho2eWaeW8Ow==" + }, + "lunr-languages": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/lunr-languages/-/lunr-languages-1.4.0.tgz", + "integrity": "sha512-YWfZDExJN/MJEVE/DbM4AuVRLsqeHi+q3wmECMsWjGIOkd5mr9DUNos7fv8f5do9VLRMYXIzFjn+N4+KPI9pQA==" + }, "macos-release": { "version": "2.3.0", "resolved": "https://registry.npmjs.org/macos-release/-/macos-release-2.3.0.tgz", diff --git a/package.json b/package.json index 5e68103448..8caa408856 100644 --- a/package.json +++ b/package.json @@ -65,6 +65,8 @@ "linkinator": "^2.13.1", "liquid": "^5.1.0", "lodash": "^4.17.19", + "lunr": "^2.3.9", + "lunr-languages": "^1.4.0", "mdast-util-from-markdown": "^0.8.4", "mini-css-extract-plugin": "^0.9.0", "mkdirp": "^1.0.3",