Add: Caching tokenize (#826)
# PR Checklist
- [ ] Have you checked if it works normally in all models? *Ignore this
if it doesn't use models.*
- [ ] Have you checked if it works normally in all web, local, and node
hosted versions? If it doesn't, have you blocked it in those versions?
- [ ] Have you added type definitions?
# Description
Previous PR #805 had a bug due to incorrect flow of if statements. I
apologize for this.
In this PR, I have further improved the flow of the if statements beyond
the modification I made before.
Please refer to [Fix: Correct tokenize flow in tokenizer encode
function](09228f3f86)
to see how I changed the flow.
The biggest change in this commit is that it resolves the issue of
duplicate tokenization that was occurring,
by adding `Line 122: if (result === undefined) {`.
I believe this change also corrects the error for users who have not
enabled the caching option.
Separately, there is an error in the tikJS function that is independent
of this incorrect flow.
https://github.com/dqbd/tiktoken/issues/35
This commit is contained in:
@@ -6,9 +6,27 @@ import { supportsInlayImage } from "./process/files/inlays";
|
|||||||
import { risuChatParser } from "./parser.svelte";
|
import { risuChatParser } from "./parser.svelte";
|
||||||
import { tokenizeGGUFModel } from "./process/models/local";
|
import { tokenizeGGUFModel } from "./process/models/local";
|
||||||
import { globalFetch } from "./globalApi.svelte";
|
import { globalFetch } from "./globalApi.svelte";
|
||||||
import { getModelInfo, LLMTokenizer } from "./model/modellist";
|
import { getModelInfo, LLMTokenizer, type LLMModel } from "./model/modellist";
|
||||||
import { pluginV2 } from "./plugins/plugins";
|
import { pluginV2 } from "./plugins/plugins";
|
||||||
import type { GemmaTokenizer } from "@huggingface/transformers";
|
import type { GemmaTokenizer } from "@huggingface/transformers";
|
||||||
|
import { LRUMap } from 'mnemonist';
|
||||||
|
|
||||||
|
const MAX_CACHE_SIZE = 1500;
|
||||||
|
|
||||||
|
const encodeCache = new LRUMap<string, number[] | Uint32Array | Int32Array>(MAX_CACHE_SIZE);
|
||||||
|
|
||||||
|
function getHash(
|
||||||
|
data: string,
|
||||||
|
aiModel: string,
|
||||||
|
customTokenizer: string,
|
||||||
|
currentPluginProvider: string,
|
||||||
|
googleClaudeTokenizing: boolean,
|
||||||
|
modelInfo: LLMModel,
|
||||||
|
pluginTokenizer: string
|
||||||
|
): string {
|
||||||
|
const combined = `${data}::${aiModel}::${customTokenizer}::${currentPluginProvider}::${googleClaudeTokenizing ? '1' : '0'}::${modelInfo.tokenizer}::${pluginTokenizer}`;
|
||||||
|
return combined;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
export const tokenizerList = [
|
export const tokenizerList = [
|
||||||
@@ -25,100 +43,114 @@ export const tokenizerList = [
|
|||||||
] as const
|
] as const
|
||||||
|
|
||||||
export async function encode(data:string):Promise<(number[]|Uint32Array|Int32Array)>{
|
export async function encode(data:string):Promise<(number[]|Uint32Array|Int32Array)>{
|
||||||
let db = getDatabase()
|
const db = getDatabase();
|
||||||
|
const modelInfo = getModelInfo(db.aiModel);
|
||||||
|
const pluginTokenizer = pluginV2.providerOptions.get(db.currentPluginProvider)?.tokenizer ?? "none";
|
||||||
|
|
||||||
|
let cacheKey = ''
|
||||||
|
if(db.useTokenizerCaching){
|
||||||
|
cacheKey = getHash(
|
||||||
|
data,
|
||||||
|
db.aiModel,
|
||||||
|
db.customTokenizer,
|
||||||
|
db.currentPluginProvider,
|
||||||
|
db.googleClaudeTokenizing,
|
||||||
|
modelInfo,
|
||||||
|
pluginTokenizer
|
||||||
|
);
|
||||||
|
const cachedResult = encodeCache.get(cacheKey);
|
||||||
|
if (cachedResult !== undefined) {
|
||||||
|
return cachedResult;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let result: number[] | Uint32Array | Int32Array;
|
||||||
|
|
||||||
if(db.aiModel === 'openrouter' || db.aiModel === 'reverse_proxy'){
|
if(db.aiModel === 'openrouter' || db.aiModel === 'reverse_proxy'){
|
||||||
switch(db.customTokenizer){
|
switch(db.customTokenizer){
|
||||||
case 'mistral':
|
case 'mistral':
|
||||||
return await tokenizeWebTokenizers(data, 'mistral')
|
result = await tokenizeWebTokenizers(data, 'mistral'); break;
|
||||||
case 'llama':
|
case 'llama':
|
||||||
return await tokenizeWebTokenizers(data, 'llama')
|
result = await tokenizeWebTokenizers(data, 'llama'); break;
|
||||||
case 'novelai':
|
case 'novelai':
|
||||||
return await tokenizeWebTokenizers(data, 'novelai')
|
result = await tokenizeWebTokenizers(data, 'novelai'); break;
|
||||||
case 'claude':
|
case 'claude':
|
||||||
return await tokenizeWebTokenizers(data, 'claude')
|
result = await tokenizeWebTokenizers(data, 'claude'); break;
|
||||||
case 'novellist':
|
case 'novellist':
|
||||||
return await tokenizeWebTokenizers(data, 'novellist')
|
result = await tokenizeWebTokenizers(data, 'novellist'); break;
|
||||||
case 'llama3':
|
case 'llama3':
|
||||||
return await tokenizeWebTokenizers(data, 'llama')
|
result = await tokenizeWebTokenizers(data, 'llama'); break;
|
||||||
case 'gemma':
|
case 'gemma':
|
||||||
return await gemmaTokenize(data)
|
result = await gemmaTokenize(data); break;
|
||||||
case 'cohere':
|
case 'cohere':
|
||||||
return await tokenizeWebTokenizers(data, 'cohere')
|
result = await tokenizeWebTokenizers(data, 'cohere'); break;
|
||||||
case 'deepseek':
|
case 'deepseek':
|
||||||
return await tokenizeWebTokenizers(data, 'DeepSeek')
|
result = await tokenizeWebTokenizers(data, 'DeepSeek'); break;
|
||||||
default:
|
default:
|
||||||
return await tikJS(data, 'o200k_base')
|
result = await tikJS(data, 'o200k_base'); break;
|
||||||
}
|
}
|
||||||
}
|
} else if (db.aiModel === 'custom' && pluginTokenizer) {
|
||||||
|
switch(pluginTokenizer){
|
||||||
const modelInfo = getModelInfo(db.aiModel)
|
|
||||||
|
|
||||||
if(db.aiModel === 'custom' && pluginV2.providerOptions.get(db.currentPluginProvider)?.tokenizer){
|
|
||||||
const tokenizer = pluginV2.providerOptions.get(db.currentPluginProvider)?.tokenizer
|
|
||||||
switch(tokenizer){
|
|
||||||
case 'mistral':
|
case 'mistral':
|
||||||
return await tokenizeWebTokenizers(data, 'mistral')
|
result = await tokenizeWebTokenizers(data, 'mistral'); break;
|
||||||
case 'llama':
|
case 'llama':
|
||||||
return await tokenizeWebTokenizers(data, 'llama')
|
result = await tokenizeWebTokenizers(data, 'llama'); break;
|
||||||
case 'novelai':
|
case 'novelai':
|
||||||
return await tokenizeWebTokenizers(data, 'novelai')
|
result = await tokenizeWebTokenizers(data, 'novelai'); break;
|
||||||
case 'claude':
|
case 'claude':
|
||||||
return await tokenizeWebTokenizers(data, 'claude')
|
result = await tokenizeWebTokenizers(data, 'claude'); break;
|
||||||
case 'novellist':
|
case 'novellist':
|
||||||
return await tokenizeWebTokenizers(data, 'novellist')
|
result = await tokenizeWebTokenizers(data, 'novellist'); break;
|
||||||
case 'llama3':
|
case 'llama3':
|
||||||
return await tokenizeWebTokenizers(data, 'llama')
|
result = await tokenizeWebTokenizers(data, 'llama'); break;
|
||||||
case 'gemma':
|
case 'gemma':
|
||||||
return await gemmaTokenize(data)
|
result = await gemmaTokenize(data); break;
|
||||||
case 'cohere':
|
case 'cohere':
|
||||||
return await tokenizeWebTokenizers(data, 'cohere')
|
result = await tokenizeWebTokenizers(data, 'cohere'); break;
|
||||||
case 'o200k_base':
|
case 'o200k_base':
|
||||||
return await tikJS(data, 'o200k_base')
|
result = await tikJS(data, 'o200k_base'); break;
|
||||||
case 'cl100k_base':
|
case 'cl100k_base':
|
||||||
return await tikJS(data, 'cl100k_base')
|
result = await tikJS(data, 'cl100k_base'); break;
|
||||||
case 'custom':
|
case 'custom':
|
||||||
return await pluginV2.providerOptions.get(db.currentPluginProvider)?.tokenizerFunc?.(data) ?? [0]
|
result = await pluginV2.providerOptions.get(db.currentPluginProvider)?.tokenizerFunc?.(data) ?? [0]; break;
|
||||||
default:
|
default:
|
||||||
return await tikJS(data, 'o200k_base')
|
result = await tikJS(data, 'o200k_base'); break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fallback
|
||||||
|
if (result === undefined) {
|
||||||
|
if(modelInfo.tokenizer === LLMTokenizer.NovelList){
|
||||||
|
result = await tokenizeWebTokenizers(data, 'novellist');
|
||||||
|
} else if(modelInfo.tokenizer === LLMTokenizer.Claude){
|
||||||
|
result = await tokenizeWebTokenizers(data, 'claude');
|
||||||
|
} else if(modelInfo.tokenizer === LLMTokenizer.NovelAI){
|
||||||
|
result = await tokenizeWebTokenizers(data, 'novelai');
|
||||||
|
} else if(modelInfo.tokenizer === LLMTokenizer.Mistral){
|
||||||
|
result = await tokenizeWebTokenizers(data, 'mistral');
|
||||||
|
} else if(modelInfo.tokenizer === LLMTokenizer.Llama){
|
||||||
|
result = await tokenizeWebTokenizers(data, 'llama');
|
||||||
|
} else if(modelInfo.tokenizer === LLMTokenizer.Local){
|
||||||
|
result = await tokenizeGGUFModel(data);
|
||||||
|
} else if(modelInfo.tokenizer === LLMTokenizer.tiktokenO200Base){
|
||||||
|
result = await tikJS(data, 'o200k_base');
|
||||||
|
} else if(modelInfo.tokenizer === LLMTokenizer.GoogleCloud && db.googleClaudeTokenizing){
|
||||||
|
result = await tokenizeGoogleCloud(data);
|
||||||
|
} else if(modelInfo.tokenizer === LLMTokenizer.Gemma || modelInfo.tokenizer === LLMTokenizer.GoogleCloud){
|
||||||
|
result = await gemmaTokenize(data);
|
||||||
|
} else if(modelInfo.tokenizer === LLMTokenizer.DeepSeek){
|
||||||
|
result = await tokenizeWebTokenizers(data, 'DeepSeek');
|
||||||
|
} else if(modelInfo.tokenizer === LLMTokenizer.Cohere){
|
||||||
|
result = await tokenizeWebTokenizers(data, 'cohere');
|
||||||
|
} else {
|
||||||
|
result = await tikJS(data);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if(db.useTokenizerCaching){
|
||||||
if(modelInfo.tokenizer === LLMTokenizer.NovelList){
|
encodeCache.set(cacheKey, result);
|
||||||
const nv= await tokenizeWebTokenizers(data, 'novellist')
|
|
||||||
return nv
|
|
||||||
}
|
|
||||||
if(modelInfo.tokenizer === LLMTokenizer.Claude){
|
|
||||||
return await tokenizeWebTokenizers(data, 'claude')
|
|
||||||
}
|
|
||||||
if(modelInfo.tokenizer === LLMTokenizer.NovelAI){
|
|
||||||
return await tokenizeWebTokenizers(data, 'novelai')
|
|
||||||
}
|
|
||||||
if(modelInfo.tokenizer === LLMTokenizer.Mistral){
|
|
||||||
return await tokenizeWebTokenizers(data, 'mistral')
|
|
||||||
}
|
|
||||||
if(modelInfo.tokenizer === LLMTokenizer.Llama){
|
|
||||||
return await tokenizeWebTokenizers(data, 'llama')
|
|
||||||
}
|
|
||||||
if(modelInfo.tokenizer === LLMTokenizer.Local){
|
|
||||||
return await tokenizeGGUFModel(data)
|
|
||||||
}
|
|
||||||
if(modelInfo.tokenizer === LLMTokenizer.tiktokenO200Base){
|
|
||||||
return await tikJS(data, 'o200k_base')
|
|
||||||
}
|
|
||||||
if(modelInfo.tokenizer === LLMTokenizer.GoogleCloud && db.googleClaudeTokenizing){
|
|
||||||
return await tokenizeGoogleCloud(data)
|
|
||||||
}
|
|
||||||
if(modelInfo.tokenizer === LLMTokenizer.Gemma || modelInfo.tokenizer === LLMTokenizer.GoogleCloud){
|
|
||||||
return await gemmaTokenize(data)
|
|
||||||
}
|
|
||||||
if(modelInfo.tokenizer === LLMTokenizer.DeepSeek){
|
|
||||||
return await tokenizeWebTokenizers(data, 'DeepSeek')
|
|
||||||
}
|
|
||||||
if(modelInfo.tokenizer === LLMTokenizer.Cohere){
|
|
||||||
return await tokenizeWebTokenizers(data, 'cohere')
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return await tikJS(data)
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
type tokenizerType = 'novellist'|'claude'|'novelai'|'llama'|'mistral'|'llama3'|'gemma'|'cohere'|'googleCloud'|'DeepSeek'
|
type tokenizerType = 'novellist'|'claude'|'novelai'|'llama'|'mistral'|'llama3'|'gemma'|'cohere'|'googleCloud'|'DeepSeek'
|
||||||
@@ -177,6 +209,7 @@ async function gemmaTokenize(text:string) {
|
|||||||
|
|
||||||
async function tikJS(text:string, model='cl100k_base') {
|
async function tikJS(text:string, model='cl100k_base') {
|
||||||
if(!tikParser || lastTikModel !== model){
|
if(!tikParser || lastTikModel !== model){
|
||||||
|
tikParser?.free()
|
||||||
if(model === 'cl100k_base'){
|
if(model === 'cl100k_base'){
|
||||||
const {Tiktoken} = await import('@dqbd/tiktoken')
|
const {Tiktoken} = await import('@dqbd/tiktoken')
|
||||||
const cl100k_base = await import("@dqbd/tiktoken/encoders/cl100k_base.json");
|
const cl100k_base = await import("@dqbd/tiktoken/encoders/cl100k_base.json");
|
||||||
|
|||||||
Reference in New Issue
Block a user