[feat] accurate tokenizing

This commit is contained in:
kwaroran
2023-11-23 17:52:00 +09:00
parent dab121c9c7
commit 634fe418b4
6 changed files with 116 additions and 16 deletions

View File

@@ -4,6 +4,8 @@ import { DataBase, type character } from "./storage/database";
import { get } from "svelte/store";
import type { OpenAIChat } from "./process";
import { supportsInlayImage } from "./image";
import { risuChatParser } from "./parser";
import type { Proompt } from "./process/proompt";
async function encode(data:string):Promise<(number[]|Uint32Array|Int32Array)>{
let db = get(DataBase)
@@ -85,6 +87,14 @@ export async function tokenize(data:string) {
return encoded.length
}
/**
 * Counts the tokens in `data` after template parsing.
 *
 * Every `{{slot}}` placeholder is removed, the remaining text is passed
 * through `risuChatParser` with the `tokenizeAccurate` option set, and the
 * parsed text is then encoded with `encode`.
 *
 * @param data - Raw text that may contain `{{slot}}` placeholders.
 * @returns The length of the encoded result.
 */
export async function tokenizeAccurate(data:string): Promise<number> {
    // `replace` with a string pattern only removes the first match; a global
    // regex ensures no `{{slot}}` placeholder is left to inflate the count.
    const withoutSlots = data.replace(/\{\{slot\}\}/g, '')
    const parsed = risuChatParser(withoutSlots, {
        tokenizeAccurate: true
    })
    const encoded = await encode(parsed)
    return encoded.length
}
export class ChatTokenizer {
@@ -149,4 +159,4 @@ export class ChatTokenizer {
/**
 * Encodes `data` and returns the encoded result itself rather than its length.
 *
 * @param data - Text to encode.
 * @returns Whatever `encode` resolves to for `data`.
 */
export async function tokenizeNum(data:string) {
    return await encode(data)
}
}