perf: optimize CJK search encoder with manual buffer tracking

Replace regex-based tokenization with index-based buffer management.
This improves performance by ~2.93x according to benchmark results.

- Use explicit buffer start/end indices instead of string concatenation
- Replace split(/\s+/) with direct whitespace code point checks
- Remove redundant filter() operations
- Add CJK Unified Ideographs Extension B support (U+20000-U+2A6DF)

Performance: ~878ms → ~300ms (100 iterations, mixed CJK/English text)
This commit is contained in:
chiyonn 2025-12-02 08:17:13 +09:00
parent 6cb8d1cc0e
commit 887332b810

View File

@ -16,40 +16,49 @@ interface Item {
type SearchType = "basic" | "tags" type SearchType = "basic" | "tags"
let searchType: SearchType = "basic" let searchType: SearchType = "basic"
let currentSearchTerm: string = "" let currentSearchTerm: string = ""
const encoder = (str: string) => { const encoder = (str: string): string[] => {
const tokens: string[] = [] const tokens: string[] = []
let buffer = "" let bufferStart = -1
let bufferEnd = -1
const lower = str.toLowerCase()
for (const char of str.toLowerCase()) { let i = 0
const code = char.codePointAt(0) for (const char of lower) {
if (code === undefined) continue const code = char.codePointAt(0)!
// Check if character is CJK
const isCJK = const isCJK =
(code >= 0x3040 && code <= 0x309f) || // Hiragana (code >= 0x3040 && code <= 0x309f) ||
(code >= 0x30a0 && code <= 0x30ff) || // Katakana (code >= 0x30a0 && code <= 0x30ff) ||
(code >= 0x4e00 && code <= 0x9fff) || // CJK Unified Ideographs (code >= 0x4e00 && code <= 0x9fff) ||
(code >= 0xac00 && code <= 0xd7af) // Hangul Syllables (code >= 0xac00 && code <= 0xd7af) ||
(code >= 0x20000 && code <= 0x2a6df)
const isWhitespace = code === 32 || code === 9 || code === 10 || code === 13
if (isCJK) { if (isCJK) {
// Flush non-CJK buffer if (bufferStart !== -1) {
if (buffer) { tokens.push(lower.slice(bufferStart, bufferEnd))
tokens.push(...buffer.split(/\s+/).filter((t) => t.length > 0)) bufferStart = -1
buffer = ""
} }
// Add CJK character as individual token
tokens.push(char) tokens.push(char)
} else if (isWhitespace) {
if (bufferStart !== -1) {
tokens.push(lower.slice(bufferStart, bufferEnd))
bufferStart = -1
}
} else { } else {
buffer += char if (bufferStart === -1) bufferStart = i
} bufferEnd = i + char.length
} }
// Flush remaining non-CJK buffer i += char.length
if (buffer) {
tokens.push(...buffer.split(/\s+/).filter((t) => t.length > 0))
} }
return tokens.filter((token) => token.length > 0) if (bufferStart !== -1) {
tokens.push(lower.slice(bufferStart))
}
return tokens
} }
let index = new FlexSearch.Document<Item>({ let index = new FlexSearch.Document<Item>({