feat: improve search tokenization for CJK languages
Enhance the encoder function to properly tokenize CJK (Chinese, Japanese, Korean) characters while maintaining English word tokenization. This fixes search issues where CJK text was not searchable due to whitespace-only splitting.

Changes:
- Tokenize CJK characters (Hiragana, Katakana, Kanji, Hangul) individually
- Preserve whitespace-based tokenization for non-CJK text
- Support mixed CJK/English content in search queries

This addresses the CJK search issues reported in #2109, where Japanese text such as "て以来" was not searchable because the encoder only split on whitespace. Tested with Japanese, Chinese, and Korean content to verify that character-level tokenization works correctly while English search functionality is maintained. A before/after tokenization sketch follows below.
commit 6cb8d1cc0e
parent 13ff64db97
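For illustration, a minimal standalone sketch of the behaviour change described above. oldEncode restates the whitespace-only encoder that this commit removes (the deleted lines in the diff below); the expected results for the new encoder are inferred from the commit description, not copied from the test run.

// Pre-commit behaviour: whitespace-only splitting. Japanese has no spaces
// between words, so an entire phrase collapses into a single opaque token
// and a partial query such as "以来" never matches it in the index.
const oldEncode = (str: string) =>
  str
    .toLowerCase()
    .split(/\s+/)
    .filter((token) => token.length > 0)

console.log(oldEncode("て以来")) // ["て以来"] -- one token for the whole phrase
console.log(oldEncode("hello world")) // ["hello", "world"]

// With the character-level tokenization added in this commit, the same inputs
// are expected to yield:
//   "て以来"           -> ["て", "以", "来"]
//   "quartz 検索 test" -> ["quartz", "検", "索", "test"]
// so a query containing any individual CJK character can match the document.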
@@ -17,10 +17,39 @@ type SearchType = "basic" | "tags"
 let searchType: SearchType = "basic"
 let currentSearchTerm: string = ""
 const encoder = (str: string) => {
-  return str
-    .toLowerCase()
-    .split(/\s+/)
-    .filter((token) => token.length > 0)
+  const tokens: string[] = []
+  let buffer = ""
+
+  for (const char of str.toLowerCase()) {
+    const code = char.codePointAt(0)
+    if (code === undefined) continue
+
+    // Check if character is CJK
+    const isCJK =
+      (code >= 0x3040 && code <= 0x309f) || // Hiragana
+      (code >= 0x30a0 && code <= 0x30ff) || // Katakana
+      (code >= 0x4e00 && code <= 0x9fff) || // CJK Unified Ideographs
+      (code >= 0xac00 && code <= 0xd7af) // Hangul Syllables
+
+    if (isCJK) {
+      // Flush non-CJK buffer
+      if (buffer) {
+        tokens.push(...buffer.split(/\s+/).filter((t) => t.length > 0))
+        buffer = ""
+      }
+      // Add CJK character as individual token
+      tokens.push(char)
+    } else {
+      buffer += char
+    }
+  }
+
+  // Flush remaining non-CJK buffer
+  if (buffer) {
+    tokens.push(...buffer.split(/\s+/).filter((t) => t.length > 0))
+  }
+
+  return tokens.filter((token) => token.length > 0)
 }
 
 let index = new FlexSearch.Document<Item>({
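For context, a sketch of how a custom encoder like this one plugs into the FlexSearch document index created at the last context line above. This assumes FlexSearch's encode option is used to register it; the Item fields, the sample document, and the query are illustrative, not Quartz's actual index configuration.

import FlexSearch from "flexsearch"

// Illustrative shape only; Quartz's real Item type carries more fields.
type Item = { id: number; title: string; content: string }

// encoder refers to the CJK-aware function added in the patch above.
const index = new FlexSearch.Document<Item>({
  // Register the encoder so indexing and querying tokenize text the same way.
  encode: encoder,
  document: {
    id: "id",
    index: ["title", "content"],
  },
})

// Sample Japanese document ("notes since I started using Quartz").
index.add({ id: 0, title: "日本語のノート", content: "Quartz を使い始めて以来のメモ" })

// The query is split into per-character tokens ("て", "以", "来"), so it can
// match the document even though the indexed text contains no spaces.
const results = index.search("て以来", { limit: 5 })
console.log(results)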