[feat] trin & claude tokenizer + novellist support
@@ -1,18 +1,26 @@
 import type { Tiktoken } from "@dqbd/tiktoken";
+import type { Tokenizer } from "@mlc-ai/web-tokenizers";

 import { DataBase, type character } from "./storage/database";
 import { get } from "svelte/store";
 import { tokenizeTransformers } from "./transformers/transformer";
 import type { OpenAIChat } from "./process";

-async function encode(data:string):Promise<(number[]|Uint32Array)>{
+async function encode(data:string):Promise<(number[]|Uint32Array|Int32Array)>{
     let db = get(DataBase)
     if(db.aiModel === 'novellist'){
-        return await tokenizeTransformers('naclbit/trin_tokenizer_v3',data)
+        return await tokenizeWebTokenizers(data, 'novellist')
     }
+    if(db.aiModel.startsWith('claude')){
+        return await tokenizeWebTokenizers(data, 'claude')
+    }
     return await tikJS(data)
 }

+type tokenizerType = 'novellist'|'claude'
+
 let tikParser:Tiktoken = null
+let tokenizersTokenizer:Tokenizer = null
+let tokenizersType:tokenizerType = null

 async function tikJS(text:string) {
     if(!tikParser){
@@ -28,6 +36,26 @@ async function tikJS(text:string) {
     return tikParser.encode(text)
 }

+async function tokenizeWebTokenizers(text:string, type:tokenizerType) {
+    if(type !== tokenizersType || !tokenizersTokenizer){
+        const webTokenizer = await import('@mlc-ai/web-tokenizers')
+        switch(type){
+            case "novellist":
+                tokenizersTokenizer = await webTokenizer.Tokenizer.fromSentencePiece(
+                    await (await fetch("/token/trin/spiece.model")
+                ).arrayBuffer())
+                break
+            case "claude":
+                tokenizersTokenizer = await webTokenizer.Tokenizer.fromJSON(
+                    await (await fetch("/token/claude/claude.json")
+                ).arrayBuffer())
+                break
+        }
+        tokenizersType = type
+    }
+    return (tokenizersTokenizer.encode(text))
+}

 export async function tokenizerChar(char:character) {
     const encoded = await encode(char.name + '\n' + char.firstMessage + '\n' + char.desc)
     return encoded.length
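
Editor's note: a minimal usage sketch of the routing this commit introduces, seen from a caller's side. Assumptions: the file shown here is importable as "./tokenizer" (the diff does not show the file name), and the character literal is a hypothetical object cast to the app's character type, filled only with the three fields tokenizerChar actually reads (name, firstMessage, desc). Which tokenizer runs depends on db.aiModel in the DataBase store: 'novellist' goes through the trin SentencePiece tokenizer, models starting with 'claude' go through the Claude JSON vocabulary, and everything else falls back to tiktoken.

    // Usage sketch (not part of the commit): estimate the prompt cost of a character card.
    // "./tokenizer" is an assumed module path; `character` comes from ./storage/database.
    import { tokenizerChar } from "./tokenizer";
    import type { character } from "./storage/database";

    const demoChar = {
        name: "Aria",
        firstMessage: "Hello there.",
        desc: "A short test character used only for token counting."
    } as character;

    // Routed by db.aiModel: 'novellist' -> trin SentencePiece, 'claude*' -> Claude JSON vocab,
    // anything else -> tiktoken. The result is just the encoded length.
    const tokenCount = await tokenizerChar(demoChar);
    console.log(`character card ~ ${tokenCount} tokens`);

Note that only one web tokenizer is cached at a time (tokenizersTokenizer plus tokenizersType), so alternating between a Claude model and novellist rebuilds the tokenizer on every switch; tiktoken keeps its own separate tikParser cache.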
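
For sanity-checking the tokenizer assets outside the cached helper, the same @mlc-ai/web-tokenizers calls can be exercised directly. This sketch reuses only what the diff already shows: the /token/trin/spiece.model and /token/claude/claude.json paths and the Tokenizer.fromSentencePiece / Tokenizer.fromJSON constructors; the wrapper function and sample string are illustrative. It also shows why encode()'s return type gained Int32Array: web-tokenizers' encode returns an Int32Array, unlike the number[] / Uint32Array produced by the other paths.

    // Sketch: load both web tokenizers directly and compare token counts for a sample string.
    // Assumes the app serves the same asset paths that tokenizeWebTokenizers fetches above.
    import { Tokenizer } from "@mlc-ai/web-tokenizers";

    async function compareTokenCounts(sample: string) {
        const trin = await Tokenizer.fromSentencePiece(
            await (await fetch("/token/trin/spiece.model")).arrayBuffer());
        const claude = await Tokenizer.fromJSON(
            await (await fetch("/token/claude/claude.json")).arrayBuffer());

        // encode() returns an Int32Array, hence the widened return type on encode() above.
        return {
            trin: trin.encode(sample).length,
            claude: claude.encode(sample).length,
        };
    }

    compareTokenCounts("Hello, tokenizer!").then(console.log);

The diff itself loads the package with a dynamic import('@mlc-ai/web-tokenizers'), so the wasm-backed tokenizer is only pulled in when a novellist or Claude model is actually selected; the static import here is just for brevity.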