mirror of
https://github.com/jackyzha0/quartz.git
synced 2025-12-20 11:24:05 -06:00
perf: optimize CJK search encoder with manual buffer tracking
Replace regex-based tokenization with index-based buffer management. This improves performance by ~2.93x according to benchmark results. - Use explicit buffer start/end indices instead of string concatenation - Replace split(/\s+/) with direct whitespace code point checks - Remove redundant filter() operations - Add CJK Extension A support (U+20000-U+2A6DF) Performance: ~878ms → ~300ms (100 iterations, mixed CJK/English text)
This commit is contained in:
parent
6cb8d1cc0e
commit
887332b810
@ -16,40 +16,49 @@ interface Item {
|
|||||||
type SearchType = "basic" | "tags"
|
type SearchType = "basic" | "tags"
|
||||||
let searchType: SearchType = "basic"
|
let searchType: SearchType = "basic"
|
||||||
let currentSearchTerm: string = ""
|
let currentSearchTerm: string = ""
|
||||||
const encoder = (str: string) => {
|
const encoder = (str: string): string[] => {
|
||||||
const tokens: string[] = []
|
const tokens: string[] = []
|
||||||
let buffer = ""
|
let bufferStart = -1
|
||||||
|
let bufferEnd = -1
|
||||||
|
const lower = str.toLowerCase()
|
||||||
|
|
||||||
for (const char of str.toLowerCase()) {
|
let i = 0
|
||||||
const code = char.codePointAt(0)
|
for (const char of lower) {
|
||||||
if (code === undefined) continue
|
const code = char.codePointAt(0)!
|
||||||
|
|
||||||
// Check if character is CJK
|
|
||||||
const isCJK =
|
const isCJK =
|
||||||
(code >= 0x3040 && code <= 0x309f) || // Hiragana
|
(code >= 0x3040 && code <= 0x309f) ||
|
||||||
(code >= 0x30a0 && code <= 0x30ff) || // Katakana
|
(code >= 0x30a0 && code <= 0x30ff) ||
|
||||||
(code >= 0x4e00 && code <= 0x9fff) || // CJK Unified Ideographs
|
(code >= 0x4e00 && code <= 0x9fff) ||
|
||||||
(code >= 0xac00 && code <= 0xd7af) // Hangul Syllables
|
(code >= 0xac00 && code <= 0xd7af) ||
|
||||||
|
(code >= 0x20000 && code <= 0x2a6df)
|
||||||
|
|
||||||
|
const isWhitespace = code === 32 || code === 9 || code === 10 || code === 13
|
||||||
|
|
||||||
if (isCJK) {
|
if (isCJK) {
|
||||||
// Flush non-CJK buffer
|
if (bufferStart !== -1) {
|
||||||
if (buffer) {
|
tokens.push(lower.slice(bufferStart, bufferEnd))
|
||||||
tokens.push(...buffer.split(/\s+/).filter((t) => t.length > 0))
|
bufferStart = -1
|
||||||
buffer = ""
|
|
||||||
}
|
}
|
||||||
// Add CJK character as individual token
|
|
||||||
tokens.push(char)
|
tokens.push(char)
|
||||||
|
} else if (isWhitespace) {
|
||||||
|
if (bufferStart !== -1) {
|
||||||
|
tokens.push(lower.slice(bufferStart, bufferEnd))
|
||||||
|
bufferStart = -1
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
buffer += char
|
if (bufferStart === -1) bufferStart = i
|
||||||
}
|
bufferEnd = i + char.length
|
||||||
}
|
}
|
||||||
|
|
||||||
// Flush remaining non-CJK buffer
|
i += char.length
|
||||||
if (buffer) {
|
|
||||||
tokens.push(...buffer.split(/\s+/).filter((t) => t.length > 0))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return tokens.filter((token) => token.length > 0)
|
if (bufferStart !== -1) {
|
||||||
|
tokens.push(lower.slice(bufferStart))
|
||||||
|
}
|
||||||
|
|
||||||
|
return tokens
|
||||||
}
|
}
|
||||||
|
|
||||||
let index = new FlexSearch.Document<Item>({
|
let index = new FlexSearch.Document<Item>({
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user