Mirror of https://github.com/jackyzha0/quartz.git, synced 2025-12-20 11:24:05 -06:00
perf: optimize CJK search encoder with manual buffer tracking
Replace regex-based tokenization with index-based buffer management. This improves encoder performance by ~2.93x in benchmarks.

- Use explicit buffer start/end indices instead of string concatenation
- Replace split(/\s+/) with direct whitespace code point checks
- Remove redundant filter() operations
- Add CJK Unified Ideographs Extension B support (U+20000-U+2A6DF)

Performance: ~878ms → ~300ms (100 iterations, mixed CJK/English text)
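
As a rough, self-contained illustration of how such a comparison could be timed (not the author's actual harness), the sketch below restates the before/after tokenization strategies in condensed form. The function names, sample text, and iteration count are placeholders; both variants share the same CJK ranges so the timing isolates the buffering strategy, and absolute numbers will vary by machine and runtime.

const isCJKCode = (code: number): boolean =>
  (code >= 0x3040 && code <= 0x309f) || // Hiragana
  (code >= 0x30a0 && code <= 0x30ff) || // Katakana
  (code >= 0x4e00 && code <= 0x9fff) || // CJK Unified Ideographs
  (code >= 0xac00 && code <= 0xd7af) || // Hangul Syllables
  (code >= 0x20000 && code <= 0x2a6df) // CJK Unified Ideographs Extension B

// Before: accumulate non-CJK characters into a string, then split on a whitespace regex.
const encodeWithRegex = (str: string): string[] => {
  const tokens: string[] = []
  let buffer = ""
  for (const char of str.toLowerCase()) {
    if (isCJKCode(char.codePointAt(0)!)) {
      if (buffer) {
        tokens.push(...buffer.split(/\s+/).filter((t) => t.length > 0))
        buffer = ""
      }
      tokens.push(char)
    } else {
      buffer += char
    }
  }
  if (buffer) tokens.push(...buffer.split(/\s+/).filter((t) => t.length > 0))
  return tokens.filter((t) => t.length > 0)
}

// After: track start/end indices into the lowercased string and slice once per
// token; whitespace is detected by code point instead of a regex.
const encodeWithIndices = (str: string): string[] => {
  const tokens: string[] = []
  const lower = str.toLowerCase()
  let bufferStart = -1
  let bufferEnd = -1
  let i = 0
  for (const char of lower) {
    const code = char.codePointAt(0)!
    const isWhitespace = code === 32 || code === 9 || code === 10 || code === 13
    if (isCJKCode(code)) {
      if (bufferStart !== -1) {
        tokens.push(lower.slice(bufferStart, bufferEnd))
        bufferStart = -1
      }
      tokens.push(char)
    } else if (isWhitespace) {
      if (bufferStart !== -1) {
        tokens.push(lower.slice(bufferStart, bufferEnd))
        bufferStart = -1
      }
    } else {
      if (bufferStart === -1) bufferStart = i
      bufferEnd = i + char.length
    }
    i += char.length
  }
  if (bufferStart !== -1) tokens.push(lower.slice(bufferStart))
  return tokens
}

// Mixed CJK/English sample, repeated so each call does non-trivial work.
const sample = "Quartz は高速な静的サイトジェネレーターです hosting notes 한글 지원".repeat(200)

const time = (encode: (s: string) => string[], iterations = 100): number => {
  const start = performance.now()
  for (let n = 0; n < iterations; n++) encode(sample)
  return performance.now() - start
}

console.log(`regex-based: ${time(encodeWithRegex).toFixed(1)} ms`)
console.log(`index-based: ${time(encodeWithIndices).toFixed(1)} ms`)

The win comes from replacing per-character string concatenation and the /\s+/ split with index bookkeeping and a single slice per token.
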
This commit is contained in:
parent 6cb8d1cc0e
commit 887332b810
@@ -16,40 +16,49 @@ interface Item {
 type SearchType = "basic" | "tags"
 let searchType: SearchType = "basic"
 let currentSearchTerm: string = ""
-const encoder = (str: string) => {
+const encoder = (str: string): string[] => {
   const tokens: string[] = []
-  let buffer = ""
+  let bufferStart = -1
+  let bufferEnd = -1
+  const lower = str.toLowerCase()
 
-  for (const char of str.toLowerCase()) {
-    const code = char.codePointAt(0)
-    if (code === undefined) continue
+  let i = 0
+  for (const char of lower) {
+    const code = char.codePointAt(0)!
 
     // Check if character is CJK
     const isCJK =
-      (code >= 0x3040 && code <= 0x309f) || // Hiragana
-      (code >= 0x30a0 && code <= 0x30ff) || // Katakana
-      (code >= 0x4e00 && code <= 0x9fff) || // CJK Unified Ideographs
-      (code >= 0xac00 && code <= 0xd7af) // Hangul Syllables
+      (code >= 0x3040 && code <= 0x309f) ||
+      (code >= 0x30a0 && code <= 0x30ff) ||
+      (code >= 0x4e00 && code <= 0x9fff) ||
+      (code >= 0xac00 && code <= 0xd7af) ||
+      (code >= 0x20000 && code <= 0x2a6df)
+
+    const isWhitespace = code === 32 || code === 9 || code === 10 || code === 13
 
     if (isCJK) {
       // Flush non-CJK buffer
-      if (buffer) {
-        tokens.push(...buffer.split(/\s+/).filter((t) => t.length > 0))
-        buffer = ""
+      if (bufferStart !== -1) {
+        tokens.push(lower.slice(bufferStart, bufferEnd))
+        bufferStart = -1
       }
       // Add CJK character as individual token
       tokens.push(char)
+    } else if (isWhitespace) {
+      if (bufferStart !== -1) {
+        tokens.push(lower.slice(bufferStart, bufferEnd))
+        bufferStart = -1
+      }
     } else {
-      buffer += char
+      if (bufferStart === -1) bufferStart = i
+      bufferEnd = i + char.length
     }
+    i += char.length
   }
 
-  // Flush remaining non-CJK buffer
-  if (buffer) {
-    tokens.push(...buffer.split(/\s+/).filter((t) => t.length > 0))
+  if (bufferStart !== -1) {
+    tokens.push(lower.slice(bufferStart))
   }
 
-  return tokens.filter((token) => token.length > 0)
+  return tokens
 }
 
 let index = new FlexSearch.Document<Item>({
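
The hunk ends where the encoder is handed to FlexSearch. Below is a minimal wiring sketch, assuming FlexSearch's encode option accepts a custom encoder function; the Item fields and index list are illustrative placeholders, not the file's actual configuration, and the encoder itself is the one defined in the diff above.

import FlexSearch from "flexsearch"

// Illustrative wiring only, not part of this commit. Assumes the `encoder`
// from the diff above is in scope.
declare const encoder: (str: string) => string[]

interface Item {
  id: number
  title: string
  content: string
}

const index = new FlexSearch.Document<Item>({
  // Every indexed field and every query string passes through the custom
  // encoder, so CJK content and CJK queries are split into the same
  // per-character tokens.
  encode: encoder,
  document: {
    id: "id",
    index: ["title", "content"],
  },
})

// A mixed-language query such as "静的サイト generator" is encoded to
// ["静", "的", "サ", "イ", "ト", "generator"] before matching.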