Mirror of https://github.com/jackyzha0/quartz.git
test: add comprehensive unit tests for CJK search encoder
Add 21 unit tests covering:
- English word tokenization
- CJK character-level tokenization (Japanese, Korean, Chinese)
- Mixed CJK/English content
- Edge cases

All tests pass, confirming the encoder correctly handles CJK text.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
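As a quick illustration of the behavior under test (a hypothetical input, not one of the committed cases), mixed text interleaves whitespace-delimited, lowercased word tokens with per-character CJK tokens:

encoder("Quartz 検索") // => ["quartz", "検", "索"]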
commit d8f1651748
parent 887332b810
quartz/components/scripts/search.test.ts (new file)
@@ -0,0 +1,163 @@
import test, { describe } from "node:test"
import assert from "node:assert"

// Inline the encoder function from search.inline.ts for testing
const encoder = (str: string): string[] => {
  const tokens: string[] = []
  let bufferStart = -1
  let bufferEnd = -1
  const lower = str.toLowerCase()

  let i = 0
  for (const char of lower) {
    const code = char.codePointAt(0)!

    const isCJK =
      (code >= 0x3040 && code <= 0x309f) || // Hiragana
      (code >= 0x30a0 && code <= 0x30ff) || // Katakana
      (code >= 0x4e00 && code <= 0x9fff) || // CJK Unified Ideographs
      (code >= 0xac00 && code <= 0xd7af) || // Hangul Syllables
      (code >= 0x20000 && code <= 0x2a6df) // CJK Unified Ideographs Extension B

    const isWhitespace = code === 32 || code === 9 || code === 10 || code === 13

    if (isCJK) {
      // Flush any buffered non-CJK word, then emit the CJK character as its own token
      if (bufferStart !== -1) {
        tokens.push(lower.slice(bufferStart, bufferEnd))
        bufferStart = -1
      }
      tokens.push(char)
    } else if (isWhitespace) {
      // Whitespace terminates the current non-CJK token
      if (bufferStart !== -1) {
        tokens.push(lower.slice(bufferStart, bufferEnd))
        bufferStart = -1
      }
    } else {
      // Accumulate non-CJK characters into a word buffer
      if (bufferStart === -1) bufferStart = i
      bufferEnd = i + char.length
    }

    // Advance by the code point's string length so surrogate pairs index correctly
    i += char.length
  }

  // Flush a trailing buffered token, if any
  if (bufferStart !== -1) {
    tokens.push(lower.slice(bufferStart))
  }

  return tokens
}

describe("search encoder", () => {
|
||||
describe("English text", () => {
|
||||
test("should tokenize simple English words", () => {
|
||||
const result = encoder("hello world")
|
||||
assert.deepStrictEqual(result, ["hello", "world"])
|
||||
})
|
||||
|
||||
test("should handle multiple spaces", () => {
|
||||
const result = encoder("hello world")
|
||||
assert.deepStrictEqual(result, ["hello", "world"])
|
||||
})
|
||||
|
||||
test("should handle tabs and newlines", () => {
|
||||
const result = encoder("hello\tworld\ntest")
|
||||
assert.deepStrictEqual(result, ["hello", "world", "test"])
|
||||
})
|
||||
|
||||
test("should lowercase all text", () => {
|
||||
const result = encoder("Hello WORLD Test")
|
||||
assert.deepStrictEqual(result, ["hello", "world", "test"])
|
||||
})
|
||||
})
|
||||
|
||||
describe("CJK text", () => {
|
||||
test("should tokenize Japanese Hiragana character by character", () => {
|
||||
const result = encoder("こんにちは")
|
||||
assert.deepStrictEqual(result, ["こ", "ん", "に", "ち", "は"])
|
||||
})
|
||||
|
||||
test("should tokenize Japanese Katakana character by character", () => {
|
||||
const result = encoder("コントロール")
|
||||
assert.deepStrictEqual(result, ["コ", "ン", "ト", "ロ", "ー", "ル"])
|
||||
})
|
||||
|
||||
test("should tokenize Japanese Kanji character by character", () => {
|
||||
const result = encoder("日本語")
|
||||
assert.deepStrictEqual(result, ["日", "本", "語"])
|
||||
})
|
||||
|
||||
test("should tokenize Korean Hangul character by character", () => {
|
||||
const result = encoder("안녕하세요")
|
||||
assert.deepStrictEqual(result, ["안", "녕", "하", "세", "요"])
|
||||
})
|
||||
|
||||
test("should tokenize Chinese characters character by character", () => {
|
||||
const result = encoder("你好世界")
|
||||
assert.deepStrictEqual(result, ["你", "好", "世", "界"])
|
||||
})
|
||||
|
||||
test("should handle mixed Hiragana/Katakana/Kanji", () => {
|
||||
const result = encoder("て以来")
|
||||
assert.deepStrictEqual(result, ["て", "以", "来"])
|
||||
})
|
||||
})
|
||||
|
||||
describe("Mixed CJK and English", () => {
|
||||
test("should handle Japanese with English words", () => {
|
||||
const result = encoder("hello 世界")
|
||||
assert.deepStrictEqual(result, ["hello", "世", "界"])
|
||||
})
|
||||
|
||||
test("should handle English with Japanese words", () => {
|
||||
const result = encoder("世界 hello world")
|
||||
assert.deepStrictEqual(result, ["世", "界", "hello", "world"])
|
||||
})
|
||||
|
||||
test("should handle complex mixed content", () => {
|
||||
const result = encoder("これはtest文章です")
|
||||
assert.deepStrictEqual(result, ["こ", "れ", "は", "test", "文", "章", "で", "す"])
|
||||
})
|
||||
|
||||
test("should handle mixed Korean and English", () => {
|
||||
const result = encoder("hello 안녕 world")
|
||||
assert.deepStrictEqual(result, ["hello", "안", "녕", "world"])
|
||||
})
|
||||
|
||||
test("should handle mixed Chinese and English", () => {
|
||||
const result = encoder("你好 world")
|
||||
assert.deepStrictEqual(result, ["你", "好", "world"])
|
||||
})
|
||||
})
|
||||
|
||||
describe("Edge cases", () => {
|
||||
test("should handle empty string", () => {
|
||||
const result = encoder("")
|
||||
assert.deepStrictEqual(result, [])
|
||||
})
|
||||
|
||||
test("should handle only whitespace", () => {
|
||||
const result = encoder(" \t\n ")
|
||||
assert.deepStrictEqual(result, [])
|
||||
})
|
||||
|
||||
test("should handle single character", () => {
|
||||
const result = encoder("a")
|
||||
assert.deepStrictEqual(result, ["a"])
|
||||
})
|
||||
|
||||
test("should handle single CJK character", () => {
|
||||
const result = encoder("あ")
|
||||
assert.deepStrictEqual(result, ["あ"])
|
||||
})
|
||||
|
||||
test("should handle CJK with trailing whitespace", () => {
|
||||
const result = encoder("日本語 ")
|
||||
assert.deepStrictEqual(result, ["日", "本", "語"])
|
||||
})
|
||||
|
||||
test("should handle English with trailing whitespace", () => {
|
||||
const result = encoder("hello ")
|
||||
assert.deepStrictEqual(result, ["hello"])
|
||||
})
|
||||
})
|
||||
})
|
||||
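The tests target Node's built-in `node:test` runner. The exact invocation depends on the project's tooling, but with a TypeScript-capable loader such as tsx, something like `npx tsx --test quartz/components/scripts/search.test.ts` should run them.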