diff --git a/quartz/components/scripts/search.inline.ts b/quartz/components/scripts/search.inline.ts
index 093ece275..ead340537 100644
--- a/quartz/components/scripts/search.inline.ts
+++ b/quartz/components/scripts/search.inline.ts
@@ -1,14 +1,7 @@
 import FlexSearch, { DefaultDocumentSearchResults, Id } from "flexsearch"
-import type { ContentDetails } from "../../plugins/emitters/contentIndex"
+import { ContentDetails } from "../../plugins/emitters/contentIndex"
 import { SemanticClient, type SemanticResult } from "./semantic.inline"
-import {
-  registerEscapeHandler,
-  removeAllChildren,
-  highlight,
-  tokenizeTerm,
-  encode,
-  fetchCanonical,
-} from "./util"
+import { registerEscapeHandler, removeAllChildren, fetchCanonical } from "./util"
 import { FullSlug, normalizeRelativeURLs, resolveRelative } from "../../util/path"
 
 interface Item {
@@ -61,6 +54,71 @@ type SimilarityResult = { item: Item; similarity: number }
 let chunkMetadata: Record = {}
 let manifestIds: string[] = []
 
+const contextWindowWords = 30
+const tokenizeTerm = (term: string) => {
+  const tokens = term.split(/\s+/).filter((t) => t.trim() !== "")
+  const tokenLen = tokens.length
+  if (tokenLen > 1) {
+    for (let i = 1; i < tokenLen; i++) {
+      tokens.push(tokens.slice(0, i + 1).join(" "))
+    }
+  }
+
+  return tokens.sort((a, b) => b.length - a.length) // always highlight longest terms first
+}
+
+function highlight(searchTerm: string, text: string, trim?: boolean) {
+  const tokenizedTerms = tokenizeTerm(searchTerm)
+  let tokenizedText = text.split(/\s+/).filter((t) => t !== "")
+
+  let startIndex = 0
+  let endIndex = tokenizedText.length - 1
+  if (trim) {
+    const includesCheck = (tok: string) =>
+      tokenizedTerms.some((term) => tok.toLowerCase().startsWith(term.toLowerCase()))
+    const occurrencesIndices = tokenizedText.map(includesCheck)
+
+    let bestSum = 0
+    let bestIndex = 0
+    for (let i = 0; i < Math.max(tokenizedText.length - contextWindowWords, 0); i++) {
+      const window = occurrencesIndices.slice(i, i + contextWindowWords)
+      const windowSum = window.reduce((total, cur) => total + (cur ? 1 : 0), 0)
+      if (windowSum >= bestSum) {
+        bestSum = windowSum
+        bestIndex = i
+      }
+    }
+
+    startIndex = Math.max(bestIndex - contextWindowWords, 0)
+    endIndex = Math.min(startIndex + 2 * contextWindowWords, tokenizedText.length - 1)
+    tokenizedText = tokenizedText.slice(startIndex, endIndex)
+  }
+
+  const slice = tokenizedText
+    .map((tok) => {
+      // see if this tok is prefixed by any search terms
+      for (const searchTok of tokenizedTerms) {
+        if (tok.toLowerCase().includes(searchTok.toLowerCase())) {
+          const regex = new RegExp(searchTok.toLowerCase(), "gi")
+          return tok.replace(regex, `<span class="highlight">$&</span>`)
+        }
+      }
+      return tok
+    })
+    .join(" ")
+
+  return `${startIndex === 0 ? "" : "..."}${slice}${
+    endIndex === tokenizedText.length - 1 ? "" : "..."
+  }`
+}
+
+// To be used with search and everything else with flexsearch
+const encoder = (str: string) =>
+  str
+    .toLowerCase()
+    .split(/\s+/)
+    .filter((token) => token.length > 0)
+
 /**
  * Get parent document slug for a chunk ID
  */
@@ -125,7 +183,7 @@ function aggregateChunkResults(
 // Initialize the FlexSearch Document instance with the appropriate configuration
 const index = new FlexSearch.Document<Item>({
   tokenize: "forward",
-  encode,
+  encode: encoder,
   document: {
     id: "id",
     tag: "tags",
diff --git a/quartz/components/scripts/util.ts b/quartz/components/scripts/util.ts
index de0681f78..f71790104 100644
--- a/quartz/components/scripts/util.ts
+++ b/quartz/components/scripts/util.ts
@@ -44,68 +44,3 @@ export async function fetchCanonical(url: URL): Promise<Response> {
   const [_, redirect] = text.match(canonicalRegex) ?? []
   return redirect ? fetch(`${new URL(redirect, url)}`) : res
 }
-
-const contextWindowWords = 30
-export const tokenizeTerm = (term: string) => {
-  const tokens = term.split(/\s+/).filter((t) => t.trim() !== "")
-  const tokenLen = tokens.length
-  if (tokenLen > 1) {
-    for (let i = 1; i < tokenLen; i++) {
-      tokens.push(tokens.slice(0, i + 1).join(" "))
-    }
-  }
-
-  return tokens.sort((a, b) => b.length - a.length) // always highlight longest terms first
-}
-
-export function highlight(searchTerm: string, text: string, trim?: boolean) {
-  const tokenizedTerms = tokenizeTerm(searchTerm)
-  let tokenizedText = text.split(/\s+/).filter((t) => t !== "")
-
-  let startIndex = 0
-  let endIndex = tokenizedText.length - 1
-  if (trim) {
-    const includesCheck = (tok: string) =>
-      tokenizedTerms.some((term) => tok.toLowerCase().startsWith(term.toLowerCase()))
-    const occurrencesIndices = tokenizedText.map(includesCheck)
-
-    let bestSum = 0
-    let bestIndex = 0
-    for (let i = 0; i < Math.max(tokenizedText.length - contextWindowWords, 0); i++) {
-      const window = occurrencesIndices.slice(i, i + contextWindowWords)
-      const windowSum = window.reduce((total, cur) => total + (cur ? 1 : 0), 0)
-      if (windowSum >= bestSum) {
-        bestSum = windowSum
-        bestIndex = i
-      }
-    }
-
-    startIndex = Math.max(bestIndex - contextWindowWords, 0)
-    endIndex = Math.min(startIndex + 2 * contextWindowWords, tokenizedText.length - 1)
-    tokenizedText = tokenizedText.slice(startIndex, endIndex)
-  }
-
-  const slice = tokenizedText
-    .map((tok) => {
-      // see if this tok is prefixed by any search terms
-      for (const searchTok of tokenizedTerms) {
-        if (tok.toLowerCase().includes(searchTok.toLowerCase())) {
-          const regex = new RegExp(searchTok.toLowerCase(), "gi")
-          return tok.replace(regex, `<span class="highlight">$&</span>`)
-        }
-      }
-      return tok
-    })
-    .join(" ")
-
-  return `${startIndex === 0 ? "" : "..."}${slice}${
-    endIndex === tokenizedText.length - 1 ? "" : "..."
-  }`
-}
-
-// To be used with search and everything else with flexsearch
-export const encode = (str: string) =>
-  str
-    .toLowerCase()
-    .split(/\s+/)
-    .filter((token) => token.length > 0)
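A quick sketch of what the relocated helpers produce, for anyone sanity-checking the move. These calls are hypothetical (neither `tokenizeTerm` nor `highlight` is exported from `search.inline.ts` after this change) and the sample strings are made up:

```typescript
// Inside search.inline.ts, where both helpers are now module-private.
tokenizeTerm("quick brown fox")
// Prefix phrases are appended, then the list sorts longest-first so
// multi-word matches get highlighted before their individual words:
// ["quick brown fox", "quick brown", "quick", "brown", "fox"]

highlight("fox", "the quick brown fox jumps")
// Every word containing a match is wrapped for CSS styling:
// 'the quick brown <span class="highlight">fox</span> jumps'

// With trim = true, a sliding window of contextWindowWords (30) words finds
// the densest cluster of matches and elides everything else with "...":
const body = [...Array(50).fill("lorem"), "fox", ...Array(50).fill("ipsum")].join(" ")
highlight("fox", body, true)
// => a roughly 60-word excerpt around the match, e.g.
// '...lorem lorem <span class="highlight">fox</span> ipsum ipsum...'
```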
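Likewise, the module-local `encoder` is a straight rename of the old `encode` export with identical behavior (lowercase, split on whitespace, drop empty tokens). A hypothetical call shows the token stream FlexSearch receives through its `encode` option:

```typescript
encoder("The  Quick   Brown Fox")
// => ["the", "quick", "brown", "fox"]
// FlexSearch applies the encode option at both index and query time,
// so casing and run-on whitespace never affect matching.
```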