Refactor tokenizer

This commit is contained in:
kwaroran
2024-12-07 06:24:33 +09:00
parent 34b4a1245b
commit 9d8f239250
5 changed files with 172 additions and 70 deletions

View File

@@ -6,7 +6,7 @@ import { supportsInlayImage } from "./process/files/image";
import { risuChatParser } from "./parser.svelte";
import { tokenizeGGUFModel } from "./process/models/local";
import { globalFetch } from "./globalApi.svelte";
import { getModelInfo } from "./model/modellist";
import { getModelInfo, LLMTokenizer } from "./model/modellist";
export const tokenizerList = [
@@ -45,48 +45,37 @@ export async function encode(data:string):Promise<(number[]|Uint32Array|Int32Arr
return await tikJS(data, 'o200k_base')
}
}
if(db.aiModel.startsWith('novellist')){
const modelInfo = getModelInfo(db.aiModel)
if(modelInfo.tokenizer === LLMTokenizer.NovelList){
const nv= await tokenizeWebTokenizers(data, 'novellist')
return nv
}
if(db.aiModel.startsWith('claude')){
if(modelInfo.tokenizer === LLMTokenizer.Claude){
return await tokenizeWebTokenizers(data, 'claude')
}
if(db.aiModel.startsWith('novelai')){
if(modelInfo.tokenizer === LLMTokenizer.NovelAI){
return await tokenizeWebTokenizers(data, 'novelai')
}
if(db.aiModel.startsWith('mistral')){
if(modelInfo.tokenizer === LLMTokenizer.Mistral){
return await tokenizeWebTokenizers(data, 'mistral')
}
if(db.aiModel === 'mancer' ||
db.aiModel === 'textgen_webui' ||
(db.aiModel === 'reverse_proxy' && db.reverseProxyOobaMode)){
if(modelInfo.tokenizer === LLMTokenizer.Llama){
return await tokenizeWebTokenizers(data, 'llama')
}
if(db.aiModel.startsWith('local_')){
if(modelInfo.tokenizer === LLMTokenizer.Local){
return await tokenizeGGUFModel(data)
}
if(db.aiModel === 'ooba'){
if(db.reverseProxyOobaArgs.tokenizer === 'mixtral' || db.reverseProxyOobaArgs.tokenizer === 'mistral'){
return await tokenizeWebTokenizers(data, 'mistral')
}
else if(db.reverseProxyOobaArgs.tokenizer === 'llama'){
return await tokenizeWebTokenizers(data, 'llama')
}
else{
return await tokenizeWebTokenizers(data, 'llama')
}
}
if(db.aiModel.startsWith('gpt4o')){
if(modelInfo.tokenizer === LLMTokenizer.tiktokenO200Base){
return await tikJS(data, 'o200k_base')
}
if(db.aiModel.startsWith('gemini')){
if(db.aiModel.endsWith('-vertex')){
return await tokenizeWebTokenizers(data, 'gemma')
}
if(modelInfo.tokenizer === LLMTokenizer.GoogleCloud && db.googleClaudeTokenizing){
return await tokenizeGoogleCloud(data)
}
if(db.aiModel.startsWith('cohere')){
if(modelInfo.tokenizer === LLMTokenizer.Gemma || modelInfo.tokenizer === LLMTokenizer.GoogleCloud){
return await tokenizeWebTokenizers(data, 'gemma')
}
if(modelInfo.tokenizer === LLMTokenizer.Cohere){
return await tokenizeWebTokenizers(data, 'cohere')
}