From 887332b81031073c980cece851a94b8f26cbcaf5 Mon Sep 17 00:00:00 2001 From: chiyonn Date: Tue, 2 Dec 2025 08:17:13 +0900 Subject: [PATCH] perf: optimize CJK search encoder with manual buffer tracking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace regex-based tokenization with index-based buffer management. This improves performance by ~2.93x according to benchmark results. - Use explicit buffer start/end indices instead of string concatenation - Replace split(/\s+/) with direct whitespace code point checks - Remove redundant filter() operations - Add CJK Extension B support (U+20000-U+2A6DF) Performance: ~878ms → ~300ms (100 iterations, mixed CJK/English text) --- quartz/components/scripts/search.inline.ts | 49 +++++++++++++--------- 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/quartz/components/scripts/search.inline.ts b/quartz/components/scripts/search.inline.ts index 93965a2af..717f17f00 100644 --- a/quartz/components/scripts/search.inline.ts +++ b/quartz/components/scripts/search.inline.ts @@ -16,40 +16,49 @@ interface Item { type SearchType = "basic" | "tags" let searchType: SearchType = "basic" let currentSearchTerm: string = "" -const encoder = (str: string) => { +const encoder = (str: string): string[] => { const tokens: string[] = [] - let buffer = "" + let bufferStart = -1 + let bufferEnd = -1 + const lower = str.toLowerCase() - for (const char of str.toLowerCase()) { - const code = char.codePointAt(0) - if (code === undefined) continue + let i = 0 + for (const char of lower) { + const code = char.codePointAt(0)! 
- // Check if character is CJK const isCJK = - (code >= 0x3040 && code <= 0x309f) || // Hiragana - (code >= 0x30a0 && code <= 0x30ff) || // Katakana - (code >= 0x4e00 && code <= 0x9fff) || // CJK Unified Ideographs - (code >= 0xac00 && code <= 0xd7af) // Hangul Syllables + (code >= 0x3040 && code <= 0x309f) || + (code >= 0x30a0 && code <= 0x30ff) || + (code >= 0x4e00 && code <= 0x9fff) || + (code >= 0xac00 && code <= 0xd7af) || + (code >= 0x20000 && code <= 0x2a6df) + + const isWhitespace = code === 32 || code === 9 || code === 10 || code === 13 if (isCJK) { - // Flush non-CJK buffer - if (buffer) { - tokens.push(...buffer.split(/\s+/).filter((t) => t.length > 0)) - buffer = "" + if (bufferStart !== -1) { + tokens.push(lower.slice(bufferStart, bufferEnd)) + bufferStart = -1 } - // Add CJK character as individual token tokens.push(char) + } else if (isWhitespace) { + if (bufferStart !== -1) { + tokens.push(lower.slice(bufferStart, bufferEnd)) + bufferStart = -1 + } } else { - buffer += char + if (bufferStart === -1) bufferStart = i + bufferEnd = i + char.length } + + i += char.length } - // Flush remaining non-CJK buffer - if (buffer) { - tokens.push(...buffer.split(/\s+/).filter((t) => t.length > 0)) + if (bufferStart !== -1) { + tokens.push(lower.slice(bufferStart)) } - return tokens.filter((token) => token.length > 0) + return tokens } let index = new FlexSearch.Document({