From da272d83d8ad57beed0320154edb697abf395f9f Mon Sep 17 00:00:00 2001 From: kwaroran Date: Fri, 19 Apr 2024 13:35:56 +0900 Subject: [PATCH] Add custom tokenizers --- src/lang/en.ts | 1 + src/lib/Setting/Pages/BotSettings.svelte | 14 ++++++++++--- src/ts/storage/database.ts | 2 ++ src/ts/tokenizer.ts | 26 ++++++++++++++++++++++++ 4 files changed, 40 insertions(+), 3 deletions(-) diff --git a/src/lang/en.ts b/src/lang/en.ts index d3ff2005..3f72a604 100644 --- a/src/lang/en.ts +++ b/src/lang/en.ts @@ -576,4 +576,5 @@ export const languageEnglish = { autoContinueMinTokens: "Target Tokens (Auto Continue)", autoContinueChat: "Anti-Incomplete Response (Auto Continue)", removeIncompleteResponse: "Remove Incomplete Sentences", + tokenizer: "Tokenizer", } \ No newline at end of file diff --git a/src/lib/Setting/Pages/BotSettings.svelte b/src/lib/Setting/Pages/BotSettings.svelte index d7c09d67..c201c6d4 100644 --- a/src/lib/Setting/Pages/BotSettings.svelte +++ b/src/lib/Setting/Pages/BotSettings.svelte @@ -5,7 +5,7 @@ import { DataBase } from "src/ts/storage/database"; import { customProviderStore, getCurrentPluginMax } from "src/ts/plugins/plugins"; import { getModelMaxContext, isTauri } from "src/ts/storage/globalApi"; - import { tokenize, tokenizeAccurate } from "src/ts/tokenizer"; + import { tokenize, tokenizeAccurate, tokenizerList } from "src/ts/tokenizer"; import ModelList from "src/lib/UI/ModelList.svelte"; import DropList from "src/lib/SideBars/DropList.svelte"; import { PlusIcon, TrashIcon } from "lucide-svelte"; @@ -242,6 +242,14 @@ {/await} {/if} +{#if $DataBase.aiModel === 'openrouter' || $DataBase.aiModel === 'reverse_proxy'} + {language.tokenizer} + + {#each tokenizerList as entry} + {entry[1]} + {/each} + +{/if} {#if $DataBase.aiModel.startsWith('gpt') || $DataBase.subModel.startsWith('gpt') || $DataBase.aiModel.startsWith('instructgpt') || $DataBase.subModel.startsWith('instructgpt')} OpenAI {language.apiKey} @@ -254,10 +262,10 @@ {/if} {#if $DataBase.aiModel.startsWith('openrouter')} -
+
-
+
{/if} diff --git a/src/ts/storage/database.ts b/src/ts/storage/database.ts index 8a01c062..6abccae4 100644 --- a/src/ts/storage/database.ts +++ b/src/ts/storage/database.ts @@ -390,6 +390,7 @@ export function setDatabase(data:Database){ data.repetition_penalty ??= 1 data.min_p ??= 0 data.top_a ??= 0 + data.customTokenizer ??= 'tik' changeLanguage(data.language) DataBase.set(data) @@ -633,6 +634,7 @@ export interface Database{ autoContinueChat:boolean autoContinueMinTokens:number removeIncompleteResponse:boolean + customTokenizer:string } export interface customscript{ diff --git a/src/ts/tokenizer.ts b/src/ts/tokenizer.ts index 9995c411..f4a3d3ae 100644 --- a/src/ts/tokenizer.ts +++ b/src/ts/tokenizer.ts @@ -7,8 +7,34 @@ import { supportsInlayImage } from "./process/files/image"; import { risuChatParser } from "./parser"; import { tokenizeGGUFModel } from "./process/models/local"; + +export const tokenizerList = [ + ['tik', 'Tiktoken (OpenAI)'], + ['mistral', 'Mistral'], + ['novelai', 'NovelAI'], + ['claude', 'Claude'], + ['llama', 'Llama'], + ['novellist', 'Novellist'], +] as const + async function encode(data:string):Promise<(number[]|Uint32Array|Int32Array)>{ let db = get(DataBase) + if(db.aiModel === 'openrouter' || db.aiModel === 'reverse_proxy'){ + switch(db.customTokenizer){ + case 'mistral': + return await tokenizeWebTokenizers(data, 'mistral') + case 'llama': + return await tokenizeWebTokenizers(data, 'llama') + case 'novelai': + return await tokenizeWebTokenizers(data, 'novelai') + case 'claude': + return await tokenizeWebTokenizers(data, 'claude') + case 'novellist': + return await tokenizeWebTokenizers(data, 'novellist') + default: + return await tikJS(data) + } + } if(db.aiModel.startsWith('novellist')){ const nv= await tokenizeWebTokenizers(data, 'novellist') return nv