From 6cb8d1cc0ea1075841e84eca26d8aff29ba44113 Mon Sep 17 00:00:00 2001 From: chiyonn Date: Sun, 30 Nov 2025 16:49:28 +0900 Subject: [PATCH] feat: improve search tokenization for CJK languages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enhance the encoder function to properly tokenize CJK (Chinese, Japanese, Korean) characters while maintaining English word tokenization. This fixes search issues where CJK text was not searchable due to whitespace-only splitting. Changes: - Tokenize characters in the Hiragana, Katakana, CJK Unified Ideographs, and Hangul Syllables Unicode blocks individually - Preserve whitespace-based tokenization for non-CJK text - Support mixed CJK/English content in search queries This addresses the CJK search issues reported in #2109 where Japanese text like "て以来" was not searchable because the encoder only split on whitespace. Tested with Japanese, Chinese, and Korean content to verify character-level tokenization works correctly while maintaining English search functionality. 
--- quartz/components/scripts/search.inline.ts | 37 +++++++++++++++++++--- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/quartz/components/scripts/search.inline.ts b/quartz/components/scripts/search.inline.ts index 6a84a50e0..93965a2af 100644 --- a/quartz/components/scripts/search.inline.ts +++ b/quartz/components/scripts/search.inline.ts @@ -17,10 +17,39 @@ type SearchType = "basic" | "tags" let searchType: SearchType = "basic" let currentSearchTerm: string = "" const encoder = (str: string) => { - return str - .toLowerCase() - .split(/\s+/) - .filter((token) => token.length > 0) + const tokens: string[] = [] + let buffer = "" + + for (const char of str.toLowerCase()) { + const code = char.codePointAt(0) + if (code === undefined) continue + + // Check if character is CJK + const isCJK = + (code >= 0x3040 && code <= 0x309f) || // Hiragana + (code >= 0x30a0 && code <= 0x30ff) || // Katakana + (code >= 0x4e00 && code <= 0x9fff) || // CJK Unified Ideographs + (code >= 0xac00 && code <= 0xd7af) // Hangul Syllables + + if (isCJK) { + // Flush non-CJK buffer + if (buffer) { + tokens.push(...buffer.split(/\s+/).filter((t) => t.length > 0)) + buffer = "" + } + // Add CJK character as individual token + tokens.push(char) + } else { + buffer += char + } + } + + // Flush remaining non-CJK buffer + if (buffer) { + tokens.push(...buffer.split(/\s+/).filter((t) => t.length > 0)) + } + + return tokens.filter((token) => token.length > 0) } let index = new FlexSearch.Document({