Add custom tokenizers

This commit is contained in:
kwaroran
2024-04-19 13:35:56 +09:00
parent b291242830
commit da272d83d8
4 changed files with 40 additions and 3 deletions

View File

@@ -576,4 +576,5 @@ export const languageEnglish = {
autoContinueMinTokens: "Target Tokens (Auto Continue)", autoContinueMinTokens: "Target Tokens (Auto Continue)",
autoContinueChat: "Anti-Incomplete Response (Auto Continue)", autoContinueChat: "Anti-Incomplete Response (Auto Continue)",
removeIncompleteResponse: "Remove Incomplete Sentences", removeIncompleteResponse: "Remove Incomplete Sentences",
tokenizer: "Tokenizer",
} }

View File

@@ -5,7 +5,7 @@
import { DataBase } from "src/ts/storage/database"; import { DataBase } from "src/ts/storage/database";
import { customProviderStore, getCurrentPluginMax } from "src/ts/plugins/plugins"; import { customProviderStore, getCurrentPluginMax } from "src/ts/plugins/plugins";
import { getModelMaxContext, isTauri } from "src/ts/storage/globalApi"; import { getModelMaxContext, isTauri } from "src/ts/storage/globalApi";
import { tokenize, tokenizeAccurate } from "src/ts/tokenizer"; import { tokenize, tokenizeAccurate, tokenizerList } from "src/ts/tokenizer";
import ModelList from "src/lib/UI/ModelList.svelte"; import ModelList from "src/lib/UI/ModelList.svelte";
import DropList from "src/lib/SideBars/DropList.svelte"; import DropList from "src/lib/SideBars/DropList.svelte";
import { PlusIcon, TrashIcon } from "lucide-svelte"; import { PlusIcon, TrashIcon } from "lucide-svelte";
@@ -242,6 +242,14 @@
</SelectInput> </SelectInput>
{/await} {/await}
{/if} {/if}
{#if $DataBase.aiModel === 'openrouter' || $DataBase.aiModel === 'reverse_proxy'}
<span class="text-textcolor">{language.tokenizer}</span>
<SelectInput bind:value={$DataBase.customTokenizer}>
{#each tokenizerList as entry}
<OptionInput value={entry[0]}>{entry[1]}</OptionInput>
{/each}
</SelectInput>
{/if}
{#if $DataBase.aiModel.startsWith('gpt') || $DataBase.subModel.startsWith('gpt') {#if $DataBase.aiModel.startsWith('gpt') || $DataBase.subModel.startsWith('gpt')
|| $DataBase.aiModel.startsWith('instructgpt') || $DataBase.subModel.startsWith('instructgpt')} || $DataBase.aiModel.startsWith('instructgpt') || $DataBase.subModel.startsWith('instructgpt')}
<span class="text-textcolor">OpenAI {language.apiKey} <Help key="oaiapikey"/></span> <span class="text-textcolor">OpenAI {language.apiKey} <Help key="oaiapikey"/></span>
@@ -254,10 +262,10 @@
</div> </div>
{/if} {/if}
{#if $DataBase.aiModel.startsWith('openrouter')} {#if $DataBase.aiModel.startsWith('openrouter')}
<div class="flex items-center"> <div class="flex items-center mb-4">
<Check bind:check={$DataBase.openrouterFallback} name={language.openrouterFallback}/> <Check bind:check={$DataBase.openrouterFallback} name={language.openrouterFallback}/>
</div> </div>
<div class="flex items-center"> <div class="flex items-center mb-4">
<Check bind:check={$DataBase.openrouterMiddleOut} name={language.openrouterMiddleOut}/> <Check bind:check={$DataBase.openrouterMiddleOut} name={language.openrouterMiddleOut}/>
</div> </div>
{/if} {/if}

View File

@@ -390,6 +390,7 @@ export function setDatabase(data:Database){
data.repetition_penalty ??= 1 data.repetition_penalty ??= 1
data.min_p ??= 0 data.min_p ??= 0
data.top_a ??= 0 data.top_a ??= 0
data.customTokenizer ??= 'tik'
changeLanguage(data.language) changeLanguage(data.language)
DataBase.set(data) DataBase.set(data)
@@ -633,6 +634,7 @@ export interface Database{
autoContinueChat:boolean autoContinueChat:boolean
autoContinueMinTokens:number autoContinueMinTokens:number
removeIncompleteResponse:boolean removeIncompleteResponse:boolean
customTokenizer:string
} }
export interface customscript{ export interface customscript{

View File

@@ -7,8 +7,34 @@ import { supportsInlayImage } from "./process/files/image";
import { risuChatParser } from "./parser"; import { risuChatParser } from "./parser";
import { tokenizeGGUFModel } from "./process/models/local"; import { tokenizeGGUFModel } from "./process/models/local";
/**
 * Tokenizers the user may pick for the custom tokenizer setting.
 *
 * Each entry is an `[id, label]` pair: the `id` is the value stored in the
 * database's `customTokenizer` field, and the `label` is the text shown for
 * that option in the settings dropdown.
 *
 * `encode` only consults this setting when `aiModel` is exactly
 * `'openrouter'` or `'reverse_proxy'`; every id other than `'tik'` is passed
 * on to `tokenizeWebTokenizers`, while `'tik'` (and any unrecognized value)
 * falls back to `tikJS`.
 */
export const tokenizerList = [
// 'tik' is also the default assigned to `customTokenizer` in `setDatabase`.
['tik', 'Tiktoken (OpenAI)'],
['mistral', 'Mistral'],
['novelai', 'NovelAI'],
['claude', 'Claude'],
['llama', 'Llama'],
['novellist', 'Novellist'],
// `as const` keeps the ids and labels as readonly literal types.
] as const
async function encode(data:string):Promise<(number[]|Uint32Array|Int32Array)>{ async function encode(data:string):Promise<(number[]|Uint32Array|Int32Array)>{
let db = get(DataBase) let db = get(DataBase)
if(db.aiModel === 'openrouter' || db.aiModel === 'reverse_proxy'){
switch(db.customTokenizer){
case 'mistral':
return await tokenizeWebTokenizers(data, 'mistral')
case 'llama':
return await tokenizeWebTokenizers(data, 'llama')
case 'novelai':
return await tokenizeWebTokenizers(data, 'novelai')
case 'claude':
return await tokenizeWebTokenizers(data, 'claude')
case 'novellist':
return await tokenizeWebTokenizers(data, 'novellist')
default:
return await tikJS(data)
}
}
if(db.aiModel.startsWith('novellist')){ if(db.aiModel.startsWith('novellist')){
const nv= await tokenizeWebTokenizers(data, 'novellist') const nv= await tokenizeWebTokenizers(data, 'novellist')
return nv return nv