From 887332b81031073c980cece851a94b8f26cbcaf5 Mon Sep 17 00:00:00 2001
From: chiyonn <kud0h.chi4@gmail.com>
Date: Tue, 2 Dec 2025 08:17:13 +0900
Subject: [PATCH] perf: optimize CJK search encoder with manual buffer tracking
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace regex-based tokenization with index-based buffer management.
This improves performance by ~2.93x according to benchmark results.

- Use explicit buffer start/end indices instead of string concatenation
- Replace split(/\s+/) with direct whitespace code point checks
- Remove redundant filter() operations
- Add CJK Unified Ideographs Extension B support (U+20000-U+2A6DF)

Performance: ~878ms → ~300ms (100 iterations, mixed CJK/English text)
---
 quartz/components/scripts/search.inline.ts | 60 ++++++++++++++++++-----
 1 file changed, 47 insertions(+), 13 deletions(-)
diff --git a/quartz/components/scripts/search.inline.ts b/quartz/components/scripts/search.inline.ts
index 93965a2af..717f17f00 100644
--- a/quartz/components/scripts/search.inline.ts
+++ b/quartz/components/scripts/search.inline.ts
@@ -16,40 +16,74 @@ interface Item {
 type SearchType = "basic" | "tags"
 let searchType: SearchType = "basic"
 let currentSearchTerm: string = ""
-const encoder = (str: string) => {
+/**
+ * Splits text into lowercase search tokens: each CJK character is its own token,
+ * and every whitespace-delimited run of other characters is a single token.
+ */
+const encoder = (str: string): string[] => {
   const tokens: string[] = []
-  let buffer = ""
+  // Offsets into `lower` bounding the pending non-CJK token; -1 means none is open
+  let bufferStart = -1
+  let bufferEnd = -1
+  const lower = str.toLowerCase()
 
-  for (const char of str.toLowerCase()) {
-    const code = char.codePointAt(0)
-    if (code === undefined) continue
+  // Offset of `char` within `lower`, in UTF-16 code units (astral characters take 2)
+  let i = 0
+  for (const char of lower) {
+    // String iteration never yields an empty string, so a code point always exists
+    const code = char.codePointAt(0)!
 
     // Check if character is CJK
     const isCJK =
       (code >= 0x3040 && code <= 0x309f) || // Hiragana
       (code >= 0x30a0 && code <= 0x30ff) || // Katakana
       (code >= 0x4e00 && code <= 0x9fff) || // CJK Unified Ideographs
-      (code >= 0xac00 && code <= 0xd7af) // Hangul Syllables
+      (code >= 0xac00 && code <= 0xd7af) || // Hangul Syllables
+      (code >= 0x20000 && code <= 0x2a6df) // CJK Unified Ideographs Extension B
+
+    // Same code points as the regex class \s, so tokens match the old split(/\s+/)
+    const isWhitespace =
+      code === 0x20 ||
+      (code >= 0x09 && code <= 0x0d) ||
+      (code >= 0xa0 &&
+        (code === 0xa0 ||
+          code === 0x1680 ||
+          (code >= 0x2000 && code <= 0x200a) ||
+          code === 0x2028 ||
+          code === 0x2029 ||
+          code === 0x202f ||
+          code === 0x205f ||
+          code === 0x3000 ||
+          code === 0xfeff))
 
     if (isCJK) {
       // Flush non-CJK buffer
-      if (buffer) {
-        tokens.push(...buffer.split(/\s+/).filter((t) => t.length > 0))
-        buffer = ""
+      if (bufferStart !== -1) {
+        tokens.push(lower.slice(bufferStart, bufferEnd))
+        bufferStart = -1
       }
       // Add CJK character as individual token
       tokens.push(char)
+    } else if (isWhitespace) {
+      // Whitespace ends the pending token and is never emitted itself
+      if (bufferStart !== -1) {
+        tokens.push(lower.slice(bufferStart, bufferEnd))
+        bufferStart = -1
+      }
     } else {
-      buffer += char
+      if (bufferStart === -1) bufferStart = i
+      bufferEnd = i + char.length
     }
+
+    i += char.length
   }
 
   // Flush remaining non-CJK buffer
-  if (buffer) {
-    tokens.push(...buffer.split(/\s+/).filter((t) => t.length > 0))
+  if (bufferStart !== -1) {
+    tokens.push(lower.slice(bufferStart))
   }
 
-  return tokens.filter((token) => token.length > 0)
+  return tokens
 }
 
 let index = new FlexSearch.Document<Item>({