feat: improve search tokenization for CJK languages
Enhance the encoder function to properly tokenize CJK (Chinese, Japanese, Korean) characters while maintaining English word tokenization. This fixes search issues where CJK text was not searchable due to whitespace-only splitting.

Changes:
- Tokenize CJK characters (Hiragana, Katakana, Kanji, Hangul) individually
- Preserve whitespace-based tokenization for non-CJK text
- Support mixed CJK/English content in search queries

This addresses the CJK search issues reported in #2109, where Japanese text such as "て以来" was not searchable because the encoder only split on whitespace. Tested with Japanese, Chinese, and Korean content to verify that character-level tokenization works correctly while English search functionality is maintained. A before/after tokenization sketch follows below.
commit 6cb8d1cc0e
parent 13ff64db97
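For illustration, a minimal standalone sketch of the behaviour change described above. oldEncode restates the whitespace-only encoder that this commit removes (the deleted lines in the diff below); the expected results for the new encoder are inferred from the commit description, not copied from the test run.

// Pre-commit behaviour: whitespace-only splitting. Japanese has no spaces
// between words, so an entire phrase collapses into a single opaque token
// and a partial query such as "以来" never matches it in the index.
const oldEncode = (str: string) =>
  str
    .toLowerCase()
    .split(/\s+/)
    .filter((token) => token.length > 0)

console.log(oldEncode("て以来")) // ["て以来"] -- one token for the whole phrase
console.log(oldEncode("hello world")) // ["hello", "world"]

// With the character-level tokenization added in this commit, the same inputs
// are expected to yield:
//   "て以来"           -> ["て", "以", "来"]
//   "quartz 検索 test" -> ["quartz", "検", "索", "test"]
// so a query containing any individual CJK character can match the document.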
@@ -17,10 +17,39 @@ type SearchType = "basic" | "tags"
 let searchType: SearchType = "basic"
 let currentSearchTerm: string = ""
 const encoder = (str: string) => {
-  return str
-    .toLowerCase()
-    .split(/\s+/)
-    .filter((token) => token.length > 0)
+  const tokens: string[] = []
+  let buffer = ""
+
+  for (const char of str.toLowerCase()) {
+    const code = char.codePointAt(0)
+    if (code === undefined) continue
+
+    // Check if character is CJK
+    const isCJK =
+      (code >= 0x3040 && code <= 0x309f) || // Hiragana
+      (code >= 0x30a0 && code <= 0x30ff) || // Katakana
+      (code >= 0x4e00 && code <= 0x9fff) || // CJK Unified Ideographs
+      (code >= 0xac00 && code <= 0xd7af) // Hangul Syllables
+
+    if (isCJK) {
+      // Flush non-CJK buffer
+      if (buffer) {
+        tokens.push(...buffer.split(/\s+/).filter((t) => t.length > 0))
+        buffer = ""
+      }
+      // Add CJK character as individual token
+      tokens.push(char)
+    } else {
+      buffer += char
+    }
+  }
+
+  // Flush remaining non-CJK buffer
+  if (buffer) {
+    tokens.push(...buffer.split(/\s+/).filter((t) => t.length > 0))
+  }
+
+  return tokens.filter((token) => token.length > 0)
 }
 
 let index = new FlexSearch.Document<Item>({
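For context, a sketch of how a custom encoder like this one plugs into the FlexSearch document index created at the last context line above. This assumes FlexSearch's encode option is used to register it; the Item fields, the sample document, and the query are illustrative, not Quartz's actual index configuration.

import FlexSearch from "flexsearch"

// Illustrative shape only; Quartz's real Item type carries more fields.
type Item = { id: number; title: string; content: string }

// encoder refers to the CJK-aware function added in the patch above.
const index = new FlexSearch.Document<Item>({
  // Register the encoder so indexing and querying tokenize text the same way.
  encode: encoder,
  document: {
    id: "id",
    index: ["title", "content"],
  },
})

// Sample Japanese document ("notes since I started using Quartz").
index.add({ id: 0, title: "日本語のノート", content: "Quartz を使い始めて以来のメモ" })

// The query is split into per-character tokens ("て", "以", "来"), so it can
// match the document even though the indexed text contains no spaces.
const results = index.search("て以来", { limit: 5 })
console.log(results)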