revert: redundant changes

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
Aaron Pham 2025-10-05 20:00:35 -04:00
parent 68682a8fe3
commit eb8a4cce18
2 changed files with 68 additions and 75 deletions

@@ -1,14 +1,7 @@
 import FlexSearch, { DefaultDocumentSearchResults, Id } from "flexsearch"
-import type { ContentDetails } from "../../plugins/emitters/contentIndex"
+import { ContentDetails } from "../../plugins/emitters/contentIndex"
 import { SemanticClient, type SemanticResult } from "./semantic.inline"
-import {
-  registerEscapeHandler,
-  removeAllChildren,
-  highlight,
-  tokenizeTerm,
-  encode,
-  fetchCanonical,
-} from "./util"
+import { registerEscapeHandler, removeAllChildren, fetchCanonical } from "./util"
 import { FullSlug, normalizeRelativeURLs, resolveRelative } from "../../util/path"
 
 interface Item {
@@ -61,6 +54,71 @@ type SimilarityResult = { item: Item; similarity: number }
 let chunkMetadata: Record<string, { parentSlug: string; chunkId: number }> = {}
 let manifestIds: string[] = []
+
+const contextWindowWords = 30
+
+const tokenizeTerm = (term: string) => {
+  const tokens = term.split(/\s+/).filter((t) => t.trim() !== "")
+  const tokenLen = tokens.length
+  if (tokenLen > 1) {
+    for (let i = 1; i < tokenLen; i++) {
+      tokens.push(tokens.slice(0, i + 1).join(" "))
+    }
+  }
+  return tokens.sort((a, b) => b.length - a.length) // always highlight longest terms first
+}
+
+function highlight(searchTerm: string, text: string, trim?: boolean) {
+  const tokenizedTerms = tokenizeTerm(searchTerm)
+  let tokenizedText = text.split(/\s+/).filter((t) => t !== "")
+
+  let startIndex = 0
+  let endIndex = tokenizedText.length - 1
+  if (trim) {
+    const includesCheck = (tok: string) =>
+      tokenizedTerms.some((term) => tok.toLowerCase().startsWith(term.toLowerCase()))
+    const occurrencesIndices = tokenizedText.map(includesCheck)
+
+    let bestSum = 0
+    let bestIndex = 0
+    for (let i = 0; i < Math.max(tokenizedText.length - contextWindowWords, 0); i++) {
+      const window = occurrencesIndices.slice(i, i + contextWindowWords)
+      const windowSum = window.reduce((total, cur) => total + (cur ? 1 : 0), 0)
+      if (windowSum >= bestSum) {
+        bestSum = windowSum
+        bestIndex = i
+      }
+    }
+
+    startIndex = Math.max(bestIndex - contextWindowWords, 0)
+    endIndex = Math.min(startIndex + 2 * contextWindowWords, tokenizedText.length - 1)
+    tokenizedText = tokenizedText.slice(startIndex, endIndex)
+  }
+
+  const slice = tokenizedText
+    .map((tok) => {
+      // see if this tok is prefixed by any search terms
+      for (const searchTok of tokenizedTerms) {
+        if (tok.toLowerCase().includes(searchTok.toLowerCase())) {
+          const regex = new RegExp(searchTok.toLowerCase(), "gi")
+          return tok.replace(regex, `<span class="highlight">$&</span>`)
+        }
+      }
+      return tok
+    })
+    .join(" ")
+
+  return `${startIndex === 0 ? "" : "..."}${slice}${
+    endIndex === tokenizedText.length - 1 ? "" : "..."
+  }`
+}
+
+// To be used with search and everything else with flexsearch
+const encoder = (str: string) =>
+  str
+    .toLowerCase()
+    .split(/\s+/)
+    .filter((token) => token.length > 0)
+
 /**
  * Get parent document slug for a chunk ID
  */
@@ -125,7 +183,7 @@ function aggregateChunkResults(
 // Initialize the FlexSearch Document instance with the appropriate configuration
 const index = new FlexSearch.Document<Item>({
   tokenize: "forward",
-  encode,
+  encode: encoder,
   document: {
     id: "id",
     tag: "tags",

@@ -44,68 +44,3 @@ export async function fetchCanonical(url: URL): Promise<Response> {
   const [_, redirect] = text.match(canonicalRegex) ?? []
   return redirect ? fetch(`${new URL(redirect, url)}`) : res
 }
-
-const contextWindowWords = 30
-
-export const tokenizeTerm = (term: string) => {
-  const tokens = term.split(/\s+/).filter((t) => t.trim() !== "")
-  const tokenLen = tokens.length
-  if (tokenLen > 1) {
-    for (let i = 1; i < tokenLen; i++) {
-      tokens.push(tokens.slice(0, i + 1).join(" "))
-    }
-  }
-  return tokens.sort((a, b) => b.length - a.length) // always highlight longest terms first
-}
-
-export function highlight(searchTerm: string, text: string, trim?: boolean) {
-  const tokenizedTerms = tokenizeTerm(searchTerm)
-  let tokenizedText = text.split(/\s+/).filter((t) => t !== "")
-
-  let startIndex = 0
-  let endIndex = tokenizedText.length - 1
-  if (trim) {
-    const includesCheck = (tok: string) =>
-      tokenizedTerms.some((term) => tok.toLowerCase().startsWith(term.toLowerCase()))
-    const occurrencesIndices = tokenizedText.map(includesCheck)
-
-    let bestSum = 0
-    let bestIndex = 0
-    for (let i = 0; i < Math.max(tokenizedText.length - contextWindowWords, 0); i++) {
-      const window = occurrencesIndices.slice(i, i + contextWindowWords)
-      const windowSum = window.reduce((total, cur) => total + (cur ? 1 : 0), 0)
-      if (windowSum >= bestSum) {
-        bestSum = windowSum
-        bestIndex = i
-      }
-    }
-
-    startIndex = Math.max(bestIndex - contextWindowWords, 0)
-    endIndex = Math.min(startIndex + 2 * contextWindowWords, tokenizedText.length - 1)
-    tokenizedText = tokenizedText.slice(startIndex, endIndex)
-  }
-
-  const slice = tokenizedText
-    .map((tok) => {
-      // see if this tok is prefixed by any search terms
-      for (const searchTok of tokenizedTerms) {
-        if (tok.toLowerCase().includes(searchTok.toLowerCase())) {
-          const regex = new RegExp(searchTok.toLowerCase(), "gi")
-          return tok.replace(regex, `<span class="highlight">$&</span>`)
-        }
-      }
-      return tok
-    })
-    .join(" ")
-
-  return `${startIndex === 0 ? "" : "..."}${slice}${
-    endIndex === tokenizedText.length - 1 ? "" : "..."
-  }`
-}
-
-// To be used with search and everything else with flexsearch
-export const encode = (str: string) =>
-  str
-    .toLowerCase()
-    .split(/\s+/)
-    .filter((token) => token.length > 0)
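
Net effect of the revert: util.ts no longer exports highlight, tokenizeTerm, or encode; the search script owns private copies instead (with the encoder bound as encode: encoder in the FlexSearch config) and imports only registerEscapeHandler, removeAllChildren, and fetchCanonical from ./util.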