From 09228f3f867a2f05b24e5bf1163ab743e3913894 Mon Sep 17 00:00:00 2001
From: sub-hub <70351692+sub-hub@users.noreply.github.com>
Date: Mon, 21 Apr 2025 13:34:01 +0900
Subject: [PATCH] Fix: Correct tokenize flow in tokenizer encode function

---
 src/ts/tokenizer.ts | 56 ++++++++++++++++++++++-----------------------
 1 file changed, 28 insertions(+), 28 deletions(-)

diff --git a/src/ts/tokenizer.ts b/src/ts/tokenizer.ts
index b553ba27..1e433534 100644
--- a/src/ts/tokenizer.ts
+++ b/src/ts/tokenizer.ts
@@ -89,9 +89,7 @@ export async function encode(data:string):Promise<(number[]|Uint32Array|Int32Arr
             default:
                 result = await tikJS(data, 'o200k_base'); break;
         }
-    }
-
-    if(db.aiModel === 'custom' && pluginTokenizer){
+    } else if (db.aiModel === 'custom' && pluginTokenizer) {
         switch(pluginTokenizer){
             case 'mistral':
                 result = await tokenizeWebTokenizers(data, 'mistral'); break;
@@ -120,32 +118,34 @@ export async function encode(data:string):Promise<(number[]|Uint32Array|Int32Arr
         }
     }
 
-    if(modelInfo.tokenizer === LLMTokenizer.NovelList){
-        result = await tokenizeWebTokenizers(data, 'novellist');
-    } else if(modelInfo.tokenizer === LLMTokenizer.Claude){
-        result = await tokenizeWebTokenizers(data, 'claude');
-    } else if(modelInfo.tokenizer === LLMTokenizer.NovelAI){
-        result = await tokenizeWebTokenizers(data, 'novelai');
-    } else if(modelInfo.tokenizer === LLMTokenizer.Mistral){
-        result = await tokenizeWebTokenizers(data, 'mistral');
-    } else if(modelInfo.tokenizer === LLMTokenizer.Llama){
-        result = await tokenizeWebTokenizers(data, 'llama');
-    } else if(modelInfo.tokenizer === LLMTokenizer.Local){
-        result = await tokenizeGGUFModel(data);
-    } else if(modelInfo.tokenizer === LLMTokenizer.tiktokenO200Base){
-        result = await tikJS(data, 'o200k_base');
-    } else if(modelInfo.tokenizer === LLMTokenizer.GoogleCloud && db.googleClaudeTokenizing){
-        result = await tokenizeGoogleCloud(data);
-    } else if(modelInfo.tokenizer === LLMTokenizer.Gemma || modelInfo.tokenizer === LLMTokenizer.GoogleCloud){
-        result = await gemmaTokenize(data);
-    } else if(modelInfo.tokenizer === LLMTokenizer.DeepSeek){
-        result = await tokenizeWebTokenizers(data, 'DeepSeek');
-    } else if(modelInfo.tokenizer === LLMTokenizer.Cohere){
-        result = await tokenizeWebTokenizers(data, 'cohere');
-    } else {
-        result = await tikJS(data);
+    // Fallback
+    if (result === undefined) {
+        if(modelInfo.tokenizer === LLMTokenizer.NovelList){
+            result = await tokenizeWebTokenizers(data, 'novellist');
+        } else if(modelInfo.tokenizer === LLMTokenizer.Claude){
+            result = await tokenizeWebTokenizers(data, 'claude');
+        } else if(modelInfo.tokenizer === LLMTokenizer.NovelAI){
+            result = await tokenizeWebTokenizers(data, 'novelai');
+        } else if(modelInfo.tokenizer === LLMTokenizer.Mistral){
+            result = await tokenizeWebTokenizers(data, 'mistral');
+        } else if(modelInfo.tokenizer === LLMTokenizer.Llama){
+            result = await tokenizeWebTokenizers(data, 'llama');
+        } else if(modelInfo.tokenizer === LLMTokenizer.Local){
+            result = await tokenizeGGUFModel(data);
+        } else if(modelInfo.tokenizer === LLMTokenizer.tiktokenO200Base){
+            result = await tikJS(data, 'o200k_base');
+        } else if(modelInfo.tokenizer === LLMTokenizer.GoogleCloud && db.googleClaudeTokenizing){
+            result = await tokenizeGoogleCloud(data);
+        } else if(modelInfo.tokenizer === LLMTokenizer.Gemma || modelInfo.tokenizer === LLMTokenizer.GoogleCloud){
+            result = await gemmaTokenize(data);
+        } else if(modelInfo.tokenizer === LLMTokenizer.DeepSeek){
+            result = await tokenizeWebTokenizers(data, 'DeepSeek');
+        } else if(modelInfo.tokenizer === LLMTokenizer.Cohere){
+            result = await tokenizeWebTokenizers(data, 'cohere');
+        } else {
+            result = await tikJS(data);
+        }
     }
-
     if(db.useTokenizerCaching){
         encodeCache.set(cacheKey, result);
     }
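
For reference, below is a minimal, self-contained sketch of the control flow this patch establishes in encode(). The Settings interface, the `startsWith('gpt')` condition, and the three stub tokenizers are illustrative placeholders, not the project's real implementations; only the branch structure is taken from the diff. In the old flow the modelInfo-based chain ended in an unconditional else, so it always reassigned result and overwrote whatever the earlier branches had produced; with the else-if chaining and the `result === undefined` guard, the generic chain runs only as a true fallback.

// Sketch only: stand-ins for the real tokenizer calls in src/ts/tokenizer.ts.
interface Settings {
    aiModel: string;
    pluginTokenizer?: string;
}

// Placeholders for tikJS(data, 'o200k_base'), the pluginTokenizer switch,
// and the modelInfo.tokenizer chain, respectively.
const tokenizeOpenAI   = async (data: string) => [...data].map(c => c.charCodeAt(0));
const tokenizeByPlugin = async (data: string) => [...data].map(c => c.charCodeAt(0) + 1);
const tokenizeFallback = async (data: string) => [...data].map(c => c.charCodeAt(0) + 2);

async function encodeFlow(data: string, db: Settings): Promise<number[]> {
    let result: number[] | undefined;

    if (db.aiModel.startsWith('gpt')) {                          // model-specific branch (placeholder condition)
        result = await tokenizeOpenAI(data);
    } else if (db.aiModel === 'custom' && db.pluginTokenizer) {  // chained with else-if, as in the patch
        result = await tokenizeByPlugin(data);
    }

    // Fallback: only reached when no branch above set `result`,
    // mirroring the `if (result === undefined)` guard added by the patch.
    if (result === undefined) {
        result = await tokenizeFallback(data);
    }
    return result;
}

// A custom model with a plugin tokenizer now keeps the plugin's result
// instead of having it overwritten by the fallback chain.
encodeFlow('hi', { aiModel: 'custom', pluginTokenizer: 'mistral' }).then(console.log);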