From 33d8ed45680f52dbd545cdd12076eaff576052d6 Mon Sep 17 00:00:00 2001
From: sub-hub <70351692+sub-hub@users.noreply.github.com>
Date: Mon, 21 Apr 2025 13:27:43 +0900
Subject: [PATCH 1/3] restore tokenizer caching with old-bug version

---
 src/ts/tokenizer.ts | 162 ++++++++++++++++++++++++++------------------
 1 file changed, 97 insertions(+), 65 deletions(-)

diff --git a/src/ts/tokenizer.ts b/src/ts/tokenizer.ts
index e71528a4..b553ba27 100644
--- a/src/ts/tokenizer.ts
+++ b/src/ts/tokenizer.ts
@@ -6,9 +6,27 @@ import { supportsInlayImage } from "./process/files/inlays";
 import { risuChatParser } from "./parser.svelte";
 import { tokenizeGGUFModel } from "./process/models/local";
 import { globalFetch } from "./globalApi.svelte";
-import { getModelInfo, LLMTokenizer } from "./model/modellist";
+import { getModelInfo, LLMTokenizer, type LLMModel } from "./model/modellist";
 import { pluginV2 } from "./plugins/plugins";
 import type { GemmaTokenizer } from "@huggingface/transformers";
+import { LRUMap } from 'mnemonist';
+
+const MAX_CACHE_SIZE = 1500;
+
+const encodeCache = new LRUMap(MAX_CACHE_SIZE);
+
+function getHash(
+    data: string,
+    aiModel: string,
+    customTokenizer: string,
+    currentPluginProvider: string,
+    googleClaudeTokenizing: boolean,
+    modelInfo: LLMModel,
+    pluginTokenizer: string
+): string {
+    const combined = `${data}::${aiModel}::${customTokenizer}::${currentPluginProvider}::${googleClaudeTokenizing ? '1' : '0'}::${modelInfo.tokenizer}::${pluginTokenizer}`;
+    return combined;
+}
 
 
 export const tokenizerList = [
@@ -25,100 +43,114 @@ export const tokenizerList = [
 ] as const
 
 export async function encode(data:string):Promise<(number[]|Uint32Array|Int32Array)>{
-    let db = getDatabase()
+    const db = getDatabase();
+    const modelInfo = getModelInfo(db.aiModel);
+    const pluginTokenizer = pluginV2.providerOptions.get(db.currentPluginProvider)?.tokenizer ?? "none";
+
+    let cacheKey = ''
+    if(db.useTokenizerCaching){
+        cacheKey = getHash(
+            data,
+            db.aiModel,
+            db.customTokenizer,
+            db.currentPluginProvider,
+            db.googleClaudeTokenizing,
+            modelInfo,
+            pluginTokenizer
+        );
+        const cachedResult = encodeCache.get(cacheKey);
+        if (cachedResult !== undefined) {
+            return cachedResult;
+        }
+    }
+
+    let result: number[] | Uint32Array | Int32Array;
+
     if(db.aiModel === 'openrouter' || db.aiModel === 'reverse_proxy'){
         switch(db.customTokenizer){
             case 'mistral':
-                return await tokenizeWebTokenizers(data, 'mistral')
+                result = await tokenizeWebTokenizers(data, 'mistral'); break;
             case 'llama':
-                return await tokenizeWebTokenizers(data, 'llama')
+                result = await tokenizeWebTokenizers(data, 'llama'); break;
             case 'novelai':
-                return await tokenizeWebTokenizers(data, 'novelai')
+                result = await tokenizeWebTokenizers(data, 'novelai'); break;
             case 'claude':
-                return await tokenizeWebTokenizers(data, 'claude')
+                result = await tokenizeWebTokenizers(data, 'claude'); break;
             case 'novellist':
-                return await tokenizeWebTokenizers(data, 'novellist')
+                result = await tokenizeWebTokenizers(data, 'novellist'); break;
             case 'llama3':
-                return await tokenizeWebTokenizers(data, 'llama')
+                result = await tokenizeWebTokenizers(data, 'llama'); break;
             case 'gemma':
-                return await gemmaTokenize(data)
+                result = await gemmaTokenize(data); break;
             case 'cohere':
-                return await tokenizeWebTokenizers(data, 'cohere')
+                result = await tokenizeWebTokenizers(data, 'cohere'); break;
             case 'deepseek':
-                return await tokenizeWebTokenizers(data, 'DeepSeek')
+                result = await tokenizeWebTokenizers(data, 'DeepSeek'); break;
             default:
-                return await tikJS(data, 'o200k_base')
+                result = await tikJS(data, 'o200k_base'); break;
         }
     }
-
-    const modelInfo = getModelInfo(db.aiModel)
-
-    if(db.aiModel === 'custom' && pluginV2.providerOptions.get(db.currentPluginProvider)?.tokenizer){
-        const tokenizer = pluginV2.providerOptions.get(db.currentPluginProvider)?.tokenizer
-        switch(tokenizer){
+
+    if(db.aiModel === 'custom' && pluginTokenizer){
+        switch(pluginTokenizer){
             case 'mistral':
-                return await tokenizeWebTokenizers(data, 'mistral')
+                result = await tokenizeWebTokenizers(data, 'mistral'); break;
             case 'llama':
-                return await tokenizeWebTokenizers(data, 'llama')
+                result = await tokenizeWebTokenizers(data, 'llama'); break;
            case 'novelai':
-                return await tokenizeWebTokenizers(data, 'novelai')
+                result = await tokenizeWebTokenizers(data, 'novelai'); break;
            case 'claude':
-                return await tokenizeWebTokenizers(data, 'claude')
+                result = await tokenizeWebTokenizers(data, 'claude'); break;
            case 'novellist':
-                return await tokenizeWebTokenizers(data, 'novellist')
+                result = await tokenizeWebTokenizers(data, 'novellist'); break;
            case 'llama3':
-                return await tokenizeWebTokenizers(data, 'llama')
+                result = await tokenizeWebTokenizers(data, 'llama'); break;
            case 'gemma':
-                return await gemmaTokenize(data)
+                result = await gemmaTokenize(data); break;
            case 'cohere':
-                return await tokenizeWebTokenizers(data, 'cohere')
+                result = await tokenizeWebTokenizers(data, 'cohere'); break;
            case 'o200k_base':
-                return await tikJS(data, 'o200k_base')
+                result = await tikJS(data, 'o200k_base'); break;
            case 'cl100k_base':
-                return await tikJS(data, 'cl100k_base')
+                result = await tikJS(data, 'cl100k_base'); break;
            case 'custom':
-                return await pluginV2.providerOptions.get(db.currentPluginProvider)?.tokenizerFunc?.(data) ?? [0]
+                result = await pluginV2.providerOptions.get(db.currentPluginProvider)?.tokenizerFunc?.(data) ?? [0]; break;
            default:
-                return await tikJS(data, 'o200k_base')
+                result = await tikJS(data, 'o200k_base'); break;
        }
-    }
-
+    }
+
     if(modelInfo.tokenizer === LLMTokenizer.NovelList){
-        const nv= await tokenizeWebTokenizers(data, 'novellist')
-        return nv
-    }
-    if(modelInfo.tokenizer === LLMTokenizer.Claude){
-        return await tokenizeWebTokenizers(data, 'claude')
-    }
-    if(modelInfo.tokenizer === LLMTokenizer.NovelAI){
-        return await tokenizeWebTokenizers(data, 'novelai')
-    }
-    if(modelInfo.tokenizer === LLMTokenizer.Mistral){
-        return await tokenizeWebTokenizers(data, 'mistral')
-    }
-    if(modelInfo.tokenizer === LLMTokenizer.Llama){
-        return await tokenizeWebTokenizers(data, 'llama')
-    }
-    if(modelInfo.tokenizer === LLMTokenizer.Local){
-        return await tokenizeGGUFModel(data)
-    }
-    if(modelInfo.tokenizer === LLMTokenizer.tiktokenO200Base){
-        return await tikJS(data, 'o200k_base')
-    }
-    if(modelInfo.tokenizer === LLMTokenizer.GoogleCloud && db.googleClaudeTokenizing){
-        return await tokenizeGoogleCloud(data)
-    }
-    if(modelInfo.tokenizer === LLMTokenizer.Gemma || modelInfo.tokenizer === LLMTokenizer.GoogleCloud){
-        return await gemmaTokenize(data)
-    }
-    if(modelInfo.tokenizer === LLMTokenizer.DeepSeek){
-        return await tokenizeWebTokenizers(data, 'DeepSeek')
-    }
-    if(modelInfo.tokenizer === LLMTokenizer.Cohere){
-        return await tokenizeWebTokenizers(data, 'cohere')
+        result = await tokenizeWebTokenizers(data, 'novellist');
+    } else if(modelInfo.tokenizer === LLMTokenizer.Claude){
+        result = await tokenizeWebTokenizers(data, 'claude');
+    } else if(modelInfo.tokenizer === LLMTokenizer.NovelAI){
+        result = await tokenizeWebTokenizers(data, 'novelai');
+    } else if(modelInfo.tokenizer === LLMTokenizer.Mistral){
+        result = await tokenizeWebTokenizers(data, 'mistral');
+    } else if(modelInfo.tokenizer === LLMTokenizer.Llama){
+        result = await tokenizeWebTokenizers(data, 'llama');
+    } else if(modelInfo.tokenizer === LLMTokenizer.Local){
+        result = await tokenizeGGUFModel(data);
+    } else if(modelInfo.tokenizer === LLMTokenizer.tiktokenO200Base){
+        result = await tikJS(data, 'o200k_base');
+    } else if(modelInfo.tokenizer === LLMTokenizer.GoogleCloud && db.googleClaudeTokenizing){
+        result = await tokenizeGoogleCloud(data);
+    } else if(modelInfo.tokenizer === LLMTokenizer.Gemma || modelInfo.tokenizer === LLMTokenizer.GoogleCloud){
+        result = await gemmaTokenize(data);
+    } else if(modelInfo.tokenizer === LLMTokenizer.DeepSeek){
+        result = await tokenizeWebTokenizers(data, 'DeepSeek');
+    } else if(modelInfo.tokenizer === LLMTokenizer.Cohere){
+        result = await tokenizeWebTokenizers(data, 'cohere');
+    } else {
+        result = await tikJS(data);
     }
-    return await tikJS(data)
+
+    if(db.useTokenizerCaching){
+        encodeCache.set(cacheKey, result);
+    }
+
+    return result;
 }
 
 type tokenizerType = 'novellist'|'claude'|'novelai'|'llama'|'mistral'|'llama3'|'gemma'|'cohere'|'googleCloud'|'DeepSeek'
From 09228f3f867a2f05b24e5bf1163ab743e3913894 Mon Sep 17 00:00:00 2001
From: sub-hub <70351692+sub-hub@users.noreply.github.com>
Date: Mon, 21 Apr 2025 13:34:01 +0900
Subject: [PATCH 2/3] Fix: Correct tokenize flow in tokenizer encode function

---
 src/ts/tokenizer.ts | 56 ++++++++++++++++++++++-----------------------
 1 file changed, 28 insertions(+), 28 deletions(-)

diff --git a/src/ts/tokenizer.ts b/src/ts/tokenizer.ts
index b553ba27..1e433534 100644
--- a/src/ts/tokenizer.ts
+++ b/src/ts/tokenizer.ts
@@ -89,9 +89,7 @@ export async function encode(data:string):Promise<(number[]|Uint32Array|Int32Arr
             default:
                 result = await tikJS(data, 'o200k_base'); break;
         }
-    }
-
-    if(db.aiModel === 'custom' && pluginTokenizer){
+    } else if (db.aiModel === 'custom' && pluginTokenizer) {
         switch(pluginTokenizer){
             case 'mistral':
                 result = await tokenizeWebTokenizers(data, 'mistral'); break;
@@ -120,32 +118,34 @@ export async function encode(data:string):Promise<(number[]|Uint32Array|Int32Arr
         }
     }
 
-    if(modelInfo.tokenizer === LLMTokenizer.NovelList){
-        result = await tokenizeWebTokenizers(data, 'novellist');
-    } else if(modelInfo.tokenizer === LLMTokenizer.Claude){
-        result = await tokenizeWebTokenizers(data, 'claude');
-    } else if(modelInfo.tokenizer === LLMTokenizer.NovelAI){
-        result = await tokenizeWebTokenizers(data, 'novelai');
-    } else if(modelInfo.tokenizer === LLMTokenizer.Mistral){
-        result = await tokenizeWebTokenizers(data, 'mistral');
-    } else if(modelInfo.tokenizer === LLMTokenizer.Llama){
-        result = await tokenizeWebTokenizers(data, 'llama');
-    } else if(modelInfo.tokenizer === LLMTokenizer.Local){
-        result = await tokenizeGGUFModel(data);
-    } else if(modelInfo.tokenizer === LLMTokenizer.tiktokenO200Base){
-        result = await tikJS(data, 'o200k_base');
-    } else if(modelInfo.tokenizer === LLMTokenizer.GoogleCloud && db.googleClaudeTokenizing){
-        result = await tokenizeGoogleCloud(data);
-    } else if(modelInfo.tokenizer === LLMTokenizer.Gemma || modelInfo.tokenizer === LLMTokenizer.GoogleCloud){
-        result = await gemmaTokenize(data);
-    } else if(modelInfo.tokenizer === LLMTokenizer.DeepSeek){
-        result = await tokenizeWebTokenizers(data, 'DeepSeek');
-    } else if(modelInfo.tokenizer === LLMTokenizer.Cohere){
-        result = await tokenizeWebTokenizers(data, 'cohere');
-    } else {
-        result = await tikJS(data);
+    // Fallback
+    if (result === undefined) {
+        if(modelInfo.tokenizer === LLMTokenizer.NovelList){
+            result = await tokenizeWebTokenizers(data, 'novellist');
+        } else if(modelInfo.tokenizer === LLMTokenizer.Claude){
+            result = await tokenizeWebTokenizers(data, 'claude');
+        } else if(modelInfo.tokenizer === LLMTokenizer.NovelAI){
+            result = await tokenizeWebTokenizers(data, 'novelai');
+        } else if(modelInfo.tokenizer === LLMTokenizer.Mistral){
+            result = await tokenizeWebTokenizers(data, 'mistral');
+        } else if(modelInfo.tokenizer === LLMTokenizer.Llama){
+            result = await tokenizeWebTokenizers(data, 'llama');
+        } else if(modelInfo.tokenizer === LLMTokenizer.Local){
+            result = await tokenizeGGUFModel(data);
+        } else if(modelInfo.tokenizer === LLMTokenizer.tiktokenO200Base){
+            result = await tikJS(data, 'o200k_base');
+        } else if(modelInfo.tokenizer === LLMTokenizer.GoogleCloud && db.googleClaudeTokenizing){
+            result = await tokenizeGoogleCloud(data);
+        } else if(modelInfo.tokenizer === LLMTokenizer.Gemma || modelInfo.tokenizer === LLMTokenizer.GoogleCloud){
+            result = await gemmaTokenize(data);
+        } else if(modelInfo.tokenizer === LLMTokenizer.DeepSeek){
+            result = await tokenizeWebTokenizers(data, 'DeepSeek');
+        } else if(modelInfo.tokenizer === LLMTokenizer.Cohere){
+            result = await tokenizeWebTokenizers(data, 'cohere');
+        } else {
+            result = await tikJS(data);
+        }
     }
-
     if(db.useTokenizerCaching){
         encodeCache.set(cacheKey, result);
     }
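Reviewer note on PATCH 2 (this is the "old bug" PATCH 1 restored): in the PATCH 1 version, the openrouter/reverse_proxy and custom-plugin switches assigned `result` but control still fell through to the model-based chain, which unconditionally overwrote it. PATCH 2 chains the blocks with `else if` and guards the model-based chain behind `result === undefined`, so it now runs only as a fallback. A stripped-down sketch of the before/after control flow, using hypothetical names:

```ts
// PATCH 1 flow: both blocks run; the second clobbers the first's result.
function brokenFlow(isCustom: boolean): string {
    let result: string | undefined;
    if (isCustom) {
        result = 'from-custom-tokenizer';
    }
    // Not chained with `else`: always executes, discarding the branch above.
    result = 'from-model-default';
    return result;
}

// PATCH 2 flow: the model default only applies when nothing matched earlier.
function fixedFlow(isCustom: boolean): string {
    let result: string | undefined;
    if (isCustom) {
        result = 'from-custom-tokenizer';
    }
    if (result === undefined) {
        result = 'from-model-default';
    }
    return result;
}
```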
From 2a36743cb6966b3a193dbb692bde6acabf547c7f Mon Sep 17 00:00:00 2001
From: sub-hub <70351692+sub-hub@users.noreply.github.com>
Date: Mon, 21 Apr 2025 14:57:19 +0900
Subject: [PATCH 3/3] Fix: Potential problem in tikJS function

---
 src/ts/tokenizer.ts | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/ts/tokenizer.ts b/src/ts/tokenizer.ts
index 1e433534..25fb442d 100644
--- a/src/ts/tokenizer.ts
+++ b/src/ts/tokenizer.ts
@@ -209,6 +209,7 @@ async function gemmaTokenize(text:string) {
 
 async function tikJS(text:string, model='cl100k_base') {
     if(!tikParser || lastTikModel !== model){
+        tikParser?.free()
         if(model === 'cl100k_base'){
             const {Tiktoken} = await import('@dqbd/tiktoken')
             const cl100k_base = await import("@dqbd/tiktoken/encoders/cl100k_base.json");
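Reviewer note on PATCH 3: `tikJS` caches one `Tiktoken` instance at module scope and rebuilds it whenever the requested model changes. `@dqbd/tiktoken`'s `Tiktoken` is WASM-backed, so dropping the old instance without calling its `free()` disposal method leaks the WASM-side memory on every model switch; the one-line fix releases it first. A sketch of the resulting pattern, under these assumptions: the function name is hypothetical, and only the cl100k_base encoder JSON is visible in the diff (o200k_base is implied by the `tikJS(data, 'o200k_base')` calls elsewhere in the file):

```ts
import type { Tiktoken } from '@dqbd/tiktoken';

// Module-level cache, as in tokenizer.ts: one live WASM-backed encoder at a time.
let tikParser: Tiktoken | undefined;
let lastTikModel = '';

async function tikJSSketch(text: string, model = 'cl100k_base'): Promise<Uint32Array> {
    if (!tikParser || lastTikModel !== model) {
        // PATCH 3's fix: release the previous WASM instance before replacing it,
        // otherwise its memory is never reclaimed.
        tikParser?.free();
        const { Tiktoken } = await import('@dqbd/tiktoken');
        const enc = model === 'cl100k_base'
            ? await import('@dqbd/tiktoken/encoders/cl100k_base.json')
            : await import('@dqbd/tiktoken/encoders/o200k_base.json'); // assumed, see note
        tikParser = new Tiktoken(enc.bpe_ranks, enc.special_tokens, enc.pat_str);
        lastTikModel = model;
    }
    return tikParser.encode(text);
}
```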