feat: improve search tokenization for CJK languages

Enhance the encoder function to properly tokenize CJK (Chinese, Japanese,
Korean) characters while maintaining English word tokenization. This fixes
search issues where CJK text was not searchable due to whitespace-only
splitting.

Changes:
- Tokenize CJK characters (Hiragana, Katakana, Kanji, Hangul) individually
- Preserve whitespace-based tokenization for non-CJK text
- Support mixed CJK/English content in search queries

This addresses the CJK search issues reported in #2109 where Japanese text
like "て以来" was not searchable because the encoder only split on whitespace.

Tested with Japanese, Chinese, and Korean content to verify character-level
tokenization works correctly while maintaining English search functionality.
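To make the expected behavior concrete, here is a rough sketch of what the new encoder should return for the mixed query from #2109. This is an illustration only (it assumes the encoder from the diff below is in scope), not a test shipped with this commit:

// Illustration only: assumes the CJK-aware encoder from the diff below is in scope.
const tokens = encoder("て以来 example")
// CJK characters become individual tokens; Latin text still splits on whitespace:
// ["て", "以", "来", "example"]
console.log(tokens)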
Author: chiyonn
Date:   2025-11-30 16:49:28 +09:00
Parent: 13ff64db97
Commit: 6cb8d1cc0e

@@ -17,10 +17,39 @@ type SearchType = "basic" | "tags"
 let searchType: SearchType = "basic"
 let currentSearchTerm: string = ""
 const encoder = (str: string) => {
-  return str
-    .toLowerCase()
-    .split(/\s+/)
-    .filter((token) => token.length > 0)
+  const tokens: string[] = []
+  let buffer = ""
+
+  for (const char of str.toLowerCase()) {
+    const code = char.codePointAt(0)
+    if (code === undefined) continue
+
+    // Check if character is CJK
+    const isCJK =
+      (code >= 0x3040 && code <= 0x309f) || // Hiragana
+      (code >= 0x30a0 && code <= 0x30ff) || // Katakana
+      (code >= 0x4e00 && code <= 0x9fff) || // CJK Unified Ideographs
+      (code >= 0xac00 && code <= 0xd7af) // Hangul Syllables
+
+    if (isCJK) {
+      // Flush non-CJK buffer
+      if (buffer) {
+        tokens.push(...buffer.split(/\s+/).filter((t) => t.length > 0))
+        buffer = ""
+      }
+      // Add CJK character as individual token
+      tokens.push(char)
+    } else {
+      buffer += char
+    }
+  }
+
+  // Flush remaining non-CJK buffer
+  if (buffer) {
+    tokens.push(...buffer.split(/\s+/).filter((t) => t.length > 0))
+  }
+
+  return tokens.filter((token) => token.length > 0)
 }
 let index = new FlexSearch.Document<Item>({
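For context, a custom tokenizer like this is normally handed to FlexSearch through the encode option of the Document constructor. The snippet below is a minimal sketch of that wiring, assuming the Item type and encoder from this file; the document field names are placeholders, not the actual configuration from this commit:

// Minimal sketch of wiring a custom encoder into FlexSearch (field names are placeholders).
const sketchIndex = new FlexSearch.Document<Item>({
  encode: encoder, // the CJK-aware tokenizer defined above
  document: {
    id: "id",
    index: ["title", "content"],
  },
})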