import { fileURLToPath } from 'url'
import path from 'path'
import lunr from 'lunr'
import lunrStemmerSupport from 'lunr-languages/lunr.stemmer.support.js'
import tinyseg from 'lunr-languages/tinyseg.js'
import lunrJa from 'lunr-languages/lunr.ja.js'
import lunrEs from 'lunr-languages/lunr.es.js'
import lunrPt from 'lunr-languages/lunr.pt.js'
import { get } from 'lodash-es'
import readFileAsync from '../readfile-async.js'
import { namePrefix } from './config.js'
import { decompress } from './compress.js'

const __dirname = path.dirname(fileURLToPath(import.meta.url))

// By default, Lunr considers the `-` character to be a word boundary.
// Overriding the term separator to whitespace-only allows hyphens to be
// included in the query.
// If you change this, remember to make it match the indexing separator
// in script/search/lunr-search-index.js so the query is tokenized
// identically to the way it was indexed.
lunr.QueryLexer.termSeparator = /[\s]+/

lunrStemmerSupport(lunr)
tinyseg(lunr)
lunrJa(lunr)
lunrEs(lunr)
lunrPt(lunr)

const LUNR_DIR = './indexes'

const lunrIndexes = new Map()
const lunrRecords = new Map()

// Max size of the `.content` record included in the JSON payload that the
// middleware server will serve.
// The reason we worry about this here, and not in the middleware, is that
// what we're *ultimately* sending is HTML, so we can't let the consumer of
// this module slice it as a regular string: it might cut off an HTML tag
// in the middle.
// As of Oct 2021, with the way the CSS works inside components/Search.tsx,
// roughly 450-650 characters are shown. Let's make sure we stay well
// within that limit. No visual difference, but smaller JSON payloads.
const MAX_CONTENT_LENGTH = 1000

export default async function loadLunrResults({ version, language, query, limit }) {
  const indexName = `${namePrefix}-${version}-${language}`
  if (!lunrIndexes.has(indexName) || !lunrRecords.has(indexName)) {
    lunrIndexes.set(indexName, await loadLunrIndex(indexName))
    lunrRecords.set(indexName, await loadLunrRecords(indexName))
  }
  const index = lunrIndexes.get(indexName)
  const records = lunrRecords.get(indexName)

  const queryLength = query.trim().length

  // A search result's /combined/ score is:
  //
  //   normalizedScore + POPULARITY_FACTOR * record.popularity
  //
  // where "normalizedScore" is the record's Lunr score divided by the
  // highest Lunr score found. That means the record Lunr thinks matches
  // best becomes 1.0.
  //
  // It's the number we sort on. The `record.popularity` is always a
  // number between (and including) 0 and 1.
  // If the Lunr score is, say, 5.0, the popularity is 0.1, and
  // POPULARITY_FACTOR is 10, the combined score is 5.0 + 10 * 0.1 = 6.0.
  // If you make this too large, the Lunr score becomes insignificant and
  // any single match anywhere will always favor the popular documents.
  // The best way to adjust this number is to get a feeling for the kinds
  // of Lunr scores we usually get, and adjust accordingly.
  // Short queries are bound to be very ambiguous, and the more ambiguous
  // the query, the more relevant the popularity is.
  const POPULARITY_FACTOR = queryLength <= 2 ? 25 : queryLength <= 6 ? 10 : 5
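  // For illustration only (hypothetical numbers): with a normalized Lunr
  // score of 0.8 and a popularity of 0.3, the tiers above give:
  //
  //   query 'gh'         (length 2):  0.8 + 25 * 0.3 = 8.3  (popularity dominates)
  //   query 'action'     (length 6):  0.8 + 10 * 0.3 = 3.8
  //   query 'codespaces' (length 10): 0.8 + 5 * 0.3  = 2.3  (relevance dominates)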
  // This number determines how much we favor matches on the title.
  // It's a multiplier. We do two searches: one on the title, one on all
  // other fields. Then we compare all scores, but the scores in the
  // results from the title search get multiplied by this number.
  // The effect is that we favor matches in the title more than we favor
  // matches that were not in the title.
  // If you search for 'foobar' and it appears in the title of one
  // not-so-popular record, but also appears in the content of a
  // very popular record, you want to give the title-matching one a
  // leg up.
  // Note that the Lunr scores from the content are usually much higher
  // than scores on the title. E.g. the word `codespaces` might appear
  // 10 times on a page that is actually about something else. If there's
  // a record whose title includes `codespaces`, it might get a very low
  // Lunr score, but since title matches are generally "better", we
  // want to make sure this number accounts for that.
  const TITLE_FIRST = queryLength <= 2 ? 45 : queryLength <= 6 ? 25 : 10

  // Multiplication bonus given to matches that were made by the search
  // where ALL tokens are required.
  // E.g. you search for 'foo bar' and we have three records:
  //
  //   A) "This foo is very special"
  //   B) "With bar and foo you can't go wrong"
  //   C) "Only bar can save you"
  //
  // Only record (B) is found when it's required to match both 'foo'
  // *and* 'bar'. So you get these scores:
  //
  //   A) score = result.score + popularity
  //   B) score = MATCH_PHRASE * (result.score + popularity)
  //   C) score = result.score + popularity
  //
  // So it's a very powerful multiplier. But that's fine, because a
  // "phrase match" is a very accurate thing.
  const MATCH_PHRASE = 5

  // Imagine that we have 1,000 documents. 100 of them contain the word
  // 'foobar'. Of those 100, we want to display the top 10 "best".
  // But if we only did `index.search('foobar').slice(0, 10)` we
  // would slice prematurely. Instead, we do
  // `index.search('foobar').slice(0, 100)` first, sort those,
  // and in the final step, after any custom sorting, we `.slice(0, 10)`.
  // This number decides how many results to extract from Lunr in the
  // first place, for our custom sorting to work on.
  // This number can be allowed to be pretty big, because we only ever
  // do the more time-consuming highlighting on the `limit` records
  // that we finally return.
  const PRE_LIMIT = 500
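  // Putting the three constants together (hypothetical numbers): for the
  // query 'foo bar' (length 7, so POPULARITY_FACTOR=5 and TITLE_FIRST=10),
  // a record with a normalized score of 1.0 and a popularity of 0.1 gets:
  //
  //   AND-title match:  5 * 10 * (1.0 + 5 * 0.1) = 75
  //   title match:          10 * (1.0 + 5 * 0.1) = 15
  //   all-field match:           (1.0 + 5 * 0.1) = 1.5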
  const titleQuery = query.trim()
  let highestTitleScore = 0.0
  const andTitleResults = []
  // This will turn something like 'foo and bar' into:
  //
  //   [
  //     { str: 'foo', metadata: { position: [Array], index: 0 } },
  //     { str: 'bar', metadata: { position: [Array], index: 1 } }
  //   ]
  //
  // Note how the stopword ('and') gets omitted.
  // It's important to omit the stopwords, because even if the record
  // actually contains the stopword, it won't match otherwise.
  // E.g. you have a record called "Foo And Bar" and you search for
  // {foo AND and AND bar}: it will actually not find anything.
  // But if you change it to {foo AND bar}, it will match "Foo And Bar".
  // The same goes if any other stopwords were used, like
  // "Foo the Bar with for a". That also needs to become an AND-search
  // of {foo AND bar} ...only.
  const titleQueryTokenized = lunr.tokenizer(titleQuery).filter(lunr.stopWordFilter)
  if (titleQueryTokenized.length > 1) {
    andTitleResults.push(
      ...index
        .query((q) => {
          for (const { str } of titleQueryTokenized) {
            q.term(str, { fields: ['title'], presence: lunr.Query.presence.REQUIRED })
          }
        })
        .slice(0, PRE_LIMIT)
        .map((result) => {
          const { popularity } = records[result.ref]
          if (result.score > highestTitleScore) {
            highestTitleScore = result.score
          }
          const score = result.score / highestTitleScore
          return {
            result,
            _score: MATCH_PHRASE * TITLE_FIRST * (score + POPULARITY_FACTOR * (popularity || 0.0)),
          }
        })
    )
  }

  const titleResults = index
    .query((q) => {
      // The objective is to create an OR-query specifically for the 'title',
      // because *we* value matches on that much higher than matches on any
      // other field in our records.
      // But we also want the last word to be treated like a
      // forward-tokenized token (when it's short), i.e. typing "google ku"
      // becomes a search for "google ku*".
      // Note that it's important that we use the `lunr.tokenizer()` function
      // when using the `index.query()` function because, for starters, it
      // will normalize the input.
      // `index.search()` is the higher abstraction that basically does this:
      //
      //   (pseudo code)
      //   Index.prototype.search = function (input) {
      //     lunr.tokenizer(input).forEach((token) => {
      //       Index.query((callback) => {
      //         callback(token)
      //       })
      //     })
      //   }
      //
      // If we didn't use the tokenized form, we'd get different results
      // for searching for "SSH agent" and "ssh AgenT", for example.
      titleQueryTokenized.forEach(({ str }, i) => {
        const isLastToken = i === titleQueryTokenized.length - 1
        const isShort = str.length <= 3
        q.term(str, {
          fields: ['title'],
          wildcard:
            isLastToken && isShort ? lunr.Query.wildcard.TRAILING : lunr.Query.wildcard.NONE,
        })
      })
    })
    .slice(0, PRE_LIMIT)
    .map((result) => {
      const { popularity } = records[result.ref]
      if (result.score > highestTitleScore) {
        highestTitleScore = result.score
      }
      const score = result.score / highestTitleScore
      return {
        result,
        _score: TITLE_FIRST * (score + POPULARITY_FACTOR * (popularity || 0.0)),
      }
    })

  let allQuery = query.trim()
  // Unfortunately, Lunr currently doesn't support phrase matching,
  // so you always end up with 0 results if you search for `"foo bar"`.
  // In this case, it's better to do a search for `foo` and `bar`.
  if (
    allQuery.startsWith('"') &&
    allQuery.endsWith('"') &&
    (allQuery.match(/"/g) || []).length === 2
  ) {
    allQuery = allQuery.slice(1, -1)
  }
  let highestAllScore = 0.0
  const allResults = index
    .search(allQuery)
    .slice(0, PRE_LIMIT)
    .map((result) => {
      const { popularity } = records[result.ref]
      if (result.score > highestAllScore) {
        highestAllScore = result.score
      }
      const score = result.score / highestAllScore
      return {
        result,
        score,
        _score: score + POPULARITY_FACTOR * (popularity || 0.0),
      }
    })
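  // A detail worth spelling out about the merge below: because
  // `andTitleResults` is iterated first, a record that appears in more
  // than one list keeps the match object (and `_score`) from the
  // most-boosted list it appeared in, while its matchData from the other
  // lists is still merged in so that e.g. 'content' positions remain
  // available for highlighting.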
  const _unique = new Set()
  const combinedMatchData = {}
  const results = []
  for (const matches of [andTitleResults, titleResults, allResults]) {
    for (const match of matches) {
      const { result } = match
      // We need to loop over all results (both from title searches and
      // from all-field searches) but we can only keep one per record.
      // Before we do that filtering (i.e. omitting previously kept
      // records), we need to merge all the matchData from each result.
      // That's because the `result.matchData` from the title search
      // will have Lunr match positions for 'title', but the
      // `result.matchData` from the all-field search will have positions
      // for other fields such as 'content' and 'breadcrumbs'.
      combinedMatchData[result.ref] = Object.assign(
        combinedMatchData[result.ref] || {},
        result.matchData
      )
      if (_unique.has(result.ref)) continue
      _unique.add(result.ref)
      results.push(match)
    }
  }

  // Highest score first
  results.sort((a, b) => b._score - a._score)

  // We might have found many more than `limit` matches, and we've taken
  // them all out for our custom sorting. Now that that's done, we apply
  // the highlighting only to the ones we're going to return.
  // The reasoning is that the highlighting work isn't free, and it'd
  // be a waste to do it on results we're not going to return anyway.
  return results.slice(0, limit).map(({ result }) => {
    const record = records[result.ref]
    const matchData = combinedMatchData[result.ref]
    return {
      url: result.ref,
      breadcrumbs: field(matchData, record, 'breadcrumbs'),
      title: field(matchData, record, 'title'),
      content: smartSlice(field(matchData, record, 'content'), MAX_CONTENT_LENGTH),
      // don't highlight the topics array
      topics: record.topics,
      score: result.score,
      popularity: record.popularity || 0.0,
    }
  })
}

async function loadLunrIndex(indexName) {
  const filePath = path.posix.join(__dirname, LUNR_DIR, `${indexName}.json.br`)
  // Do not set to 'utf8' on file reads
  return readFileAsync(filePath).then(decompress).then(JSON.parse).then(lunr.Index.load)
}

async function loadLunrRecords(indexName) {
  const filePath = path.posix.join(__dirname, LUNR_DIR, `${indexName}-records.json.br`)
  // Do not set to 'utf8' on file reads
  return readFileAsync(filePath).then(decompress).then(JSON.parse)
}

// Highlight a match within an attribute field
function field(matchData, record, name) {
  const text = record[name]
  if (!text) return text

  // First, get a list of all the positions of the matching tokens
  const positions = Object.values(matchData.metadata)
    .map((fields) => get(fields, [name, 'position']))
    .filter(Boolean)
    .flat()
    .sort((a, b) => a[0] - b[0])
    .map(([start, length]) => [start, start + length])
    .map(([start, end], i, a) => [i && a[i - 1][1], start, end])

  // If this field has no token matches, no highlighting
  if (!positions.length) return text

  // Highlight the text
  const highlighted = positions
    .map(([prev, start, end], i) => [
      text.slice(prev, start),
      mark(text.slice(start, end)),
      i === positions.length - 1 && text.slice(end),
    ])
    .flat()
    .filter(Boolean)
    .join('')

  // We can't HTML-escape the content until AFTER all the matchData positions
  // have been processed; otherwise the positions would shift.
  // The only HTML that is OK to keep is <mark> and </mark>.
  return highlighted
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/&lt;mark&gt;/g, '<mark>')
    .replace(/&lt;\/mark&gt;/g, '</mark>')
}

function mark(text) {
  return `<mark>${text}</mark>`
}
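// For illustration only (hypothetical matchData): if the query matched the
// token 'ssh' with position [6, 3] in the title 'About SSH agents', then
//
//   field(matchData, record, 'title')
//   // => 'About <mark>SSH</mark> agents'
//
// with any other '&', '<' or '>' characters in the text HTML-escaped.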
// Given a long string, "slice" it in a safe way so as to not chop any
// HTML tags in half.
// The resulting string will be roughly as long as the `length` provided.
// Possibly longer, since we avoid cutting a line in half.
function smartSlice(text, length, needleTag = '<mark>') {
  // If the needleTag isn't present at all, we can dare to use a
  // very basic, crude string slice, because the text won't have any
  // other HTML tags we might cut in half.
  if (!text.includes(needleTag)) {
    return text.slice(0, length)
  }

  // The algorithm is simple: split the text by lines, loop over them,
  // only include them once we've encountered the first needleTag,
  // and bail early once we've buffered enough in the array of lines.
  const lines = []
  let sum = 0
  let started = false
  for (const line of text.split('\n')) {
    if (line.indexOf(needleTag) > -1) started = true
    if (started) {
      lines.push(line)
      sum += line.length
      if (sum > length) {
        break
      }
    }
  }
  return lines.join('\n')
}
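// Example usage (a sketch with hypothetical values; `version` and `language`
// have to match an index built by script/search/lunr-search-index.js and
// compressed into ./indexes):
//
//   const results = await loadLunrResults({
//     version: 'dotcom',
//     language: 'en',
//     query: 'install actions runner',
//     limit: 10,
//   })
//   // => [{ url, breadcrumbs, title, content, topics, score, popularity }, ...]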