152 lines
4.8 KiB
TypeScript
152 lines
4.8 KiB
TypeScript
import type { Tiktoken } from "@dqbd/tiktoken";
|
|
import type { Tokenizer } from "@mlc-ai/web-tokenizers";
|
|
import { DataBase, type character } from "./storage/database";
|
|
import { get } from "svelte/store";
|
|
import type { OpenAIChat } from "./process";
|
|
import { supportsInlayImage } from "./image";
|
|
|
|
/**
 * Tokenizes `data` with the tokenizer that matches the model currently
 * selected in the database (`aiModel`).
 *
 * Routing, checked in this order:
 * - model id starting with `novellist`, `claude` or `novelai` -> the web
 *   tokenizer of the same name
 * - model id starting with `local_`, or exactly `mancer` / `textgen_webui`
 *   -> the `llama` web tokenizer
 * - anything else -> `tikJS`
 *
 * @param data Text to tokenize.
 * @returns The token ids produced by the selected tokenizer.
 */
async function encode(data:string):Promise<(number[]|Uint32Array|Int32Array)>{
    const model = get(DataBase).aiModel

    // For these families the model-id prefix doubles as the tokenizer name.
    const prefixFamilies = ['novellist', 'claude', 'novelai'] as const
    for(const family of prefixFamilies){
        if(model.startsWith(family)){
            return await tokenizeWebTokenizers(data, family)
        }
    }

    const usesLlama = model.startsWith('local_') || model === 'mancer' || model === 'textgen_webui'
    return usesLlama
        ? await tokenizeWebTokenizers(data, 'llama')
        : await tikJS(data)
}
|
|
|
|
/** Names of the tokenizers that `tokenizeWebTokenizers` knows how to load. */
type tokenizerType =
    | 'novellist'
    | 'claude'
    | 'novelai'
    | 'llama'

/** Cached tiktoken encoder; stays `null` until `tikJS` builds it on first use. */
let tikParser: Tiktoken = null

/** Most recently loaded web tokenizer; `null` until one has been loaded. */
let tokenizersTokenizer: Tokenizer = null

/** The `tokenizerType` that the cached `tokenizersTokenizer` was loaded for. */
let tokenizersType: tokenizerType = null
|
|
|
|
/**
 * Encodes `text` with a tiktoken encoder built from the `cl100k_base`
 * encoder data.
 *
 * The encoder is created on the first call (both the library and the encoder
 * data are loaded with dynamic imports) and then kept in the module-level
 * `tikParser` so later calls reuse it.
 *
 * @param text Text to encode.
 * @returns The token ids returned by the encoder.
 */
async function tikJS(text:string) {
    // Fast path: the encoder was already built by an earlier call.
    if(tikParser){
        return tikParser.encode(text)
    }

    const tiktokenModule = await import('@dqbd/tiktoken')
    const { bpe_ranks, special_tokens, pat_str } = await import("@dqbd/tiktoken/encoders/cl100k_base.json");

    tikParser = new tiktokenModule.Tiktoken(bpe_ranks, special_tokens, pat_str)
    return tikParser.encode(text)
}
|
|
|
|
/**
 * Encodes `text` with one of the `@mlc-ai/web-tokenizers` tokenizers.
 *
 * Only one tokenizer is cached at a time (`tokenizersTokenizer`, tagged with
 * `tokenizersType`). When `type` differs from the cached one, or nothing has
 * been loaded yet, the matching model file is fetched from `/token/...` and a
 * new tokenizer replaces the cache. `claude` is loaded with `fromJSON`; the
 * other types are loaded with `fromSentencePiece`.
 *
 * @param text Text to encode.
 * @param type Which tokenizer to use.
 * @returns The token ids returned by the tokenizer's `encode`.
 * @throws Error when `type` has no known model file or when the model file
 *         request does not succeed (non-2xx HTTP status).
 */
async function tokenizeWebTokenizers(text:string, type:tokenizerType) {
    if(type !== tokenizersType || !tokenizersTokenizer){
        const { Tokenizer: WebTokenizer } = await import('@mlc-ai/web-tokenizers')

        // Static asset holding the model/vocabulary for each tokenizer type.
        const modelPaths: Record<tokenizerType, string> = {
            novellist: "/token/trin/spiece.model",
            claude: "/token/claude/claude.json",
            novelai: "/token/nai/nerdstash_v2.model",
            llama: "/token/llama/llama.model",
        }
        const modelPath = modelPaths[type]
        if(!modelPath){
            // Guards against values that slipped past the type system; without
            // this the previously cached tokenizer would be silently reused.
            throw new Error(`Unknown tokenizer type: ${String(type)}`)
        }

        // fetch() resolves even for 404/500 responses, so the status has to be
        // checked explicitly before the body is treated as model data.
        const response = await fetch(modelPath)
        if(!response.ok){
            throw new Error(`Failed to load tokenizer model "${modelPath}": HTTP ${response.status}`)
        }
        const modelData = await response.arrayBuffer()

        tokenizersTokenizer = type === 'claude'
            ? await WebTokenizer.fromJSON(modelData)
            : await WebTokenizer.fromSentencePiece(modelData)
        // Only tag the cache once loading succeeded, so a failed load never
        // leaves a mismatched type/tokenizer pair behind.
        tokenizersType = type
    }
    return tokenizersTokenizer.encode(text)
}
|
|
|
|
/**
 * Counts the tokens of a character's name, first message and description,
 * joined in that order with newlines and encoded as a single string.
 *
 * @param char Character whose `name`, `firstMessage` and `desc` are counted.
 * @returns Number of tokens in the combined text.
 */
export async function tokenizerChar(char:character) {
    const combinedText = char.name + '\n' + char.firstMessage + '\n' + char.desc
    const tokens = await encode(combinedText)
    return tokens.length
}
|
|
|
|
/**
 * Counts how many tokens `data` encodes to with the currently selected
 * tokenizer.
 *
 * @param data Text to measure.
 * @returns Number of tokens.
 */
export async function tokenize(data:string) {
    return (await encode(data)).length
}
|
|
|
|
|
|
/**
 * Estimates the token cost of individual chat messages.
 *
 * Text messages are encoded with `encode`; messages whose `memo` starts with
 * `inlayImage` are priced from the image dimensions stored in the memo
 * instead of from their text content.
 */
export class ChatTokenizer {

    /** Fixed overhead added to each message's token count. */
    private readonly chatAdditonalTokens:number
    /** When `'name'`, a message's `name` is tokenized and counted as well. */
    private readonly useName:'name'|'noName'

    /**
     * @param chatAdditonalTokens Fixed overhead added per message.
     * @param useName `'name'` to also count the message's `name` field,
     *                `'noName'` to ignore it.
     */
    constructor(chatAdditonalTokens:number, useName:'name'|'noName'){
        this.chatAdditonalTokens = chatAdditonalTokens
        this.useName = useName
    }

    /**
     * Returns the estimated token count for one chat message.
     *
     * @param data Message to measure.
     * @returns For inlay-image messages, the image estimate (see
     *          `tokenizeInlayImage`). Otherwise the token count of
     *          `data.content` plus the fixed overhead, plus the token count
     *          of `data.name` + 1 when names are enabled and a name is set.
     */
    async tokenizeChat(data:OpenAIChat):Promise<number> {
        if(data.memo && data.memo.startsWith('inlayImage')){
            return this.tokenizeInlayImage(data.memo)
        }

        let encoded = (await encode(data.content)).length + this.chatAdditonalTokens
        if(data.name && this.useName ==='name'){
            encoded += (await encode(data.name)).length + 1
        }
        return encoded
    }

    /**
     * Estimates the token cost of an inlay image from its memo string.
     *
     * The memo is split on `-`; the second field is read as the height and
     * the third as the width. When the shorter side exceeds 768 the image is
     * scaled down, keeping its aspect ratio, so the shorter side becomes 768.
     * The scaled image is then counted in 512x512 tiles.
     *
     * NOTE(review): this looks modelled on OpenAI's published image-cost
     * scheme for vision models; that scheme's per-tile cost appears to be
     * larger than the `2` used here, and the `'low'` shortcut skips the
     * per-message overhead. Both are kept as they were; confirm intent.
     *
     * @param memo Memo string of the message (starts with `inlayImage`).
     * @returns The per-message overhead alone when inlay images are not
     *          supported, `87` when `gptVisionQuality` is `'low'`, otherwise
     *          overhead + 2 per tile + 85. If the memo carries no parsable
     *          dimensions, zero tiles are counted.
     */
    private tokenizeInlayImage(memo:string):number {
        const db = get(DataBase)
        if(!supportsInlayImage()){
            return this.chatAdditonalTokens
        }
        if(db.gptVisionQuality === 'low'){
            return 87
        }

        const [, rawHeight, rawWidth] = memo.split('-')
        let height = parseInt(rawHeight, 10)
        let width = parseInt(rawWidth, 10)

        let tileCount = 0
        // A malformed memo would otherwise turn the whole estimate into NaN.
        if(Number.isFinite(height) && Number.isFinite(width)){
            const maxShortSide = 768
            const shortSide = Math.min(height, width)
            if(shortSide > maxShortSide){
                // Scale both sides from the ORIGINAL short side. Multiplying
                // before dividing keeps exact integer results exact, so a side
                // landing on a tile boundary is not rounded up to an extra tile.
                height = (height * maxShortSide) / shortSide
                width = (width * maxShortSide) / shortSide
            }
            tileCount = Math.ceil(width / 512) * Math.ceil(height / 512)
        }

        return this.chatAdditonalTokens + tileCount * 2 + 85
    }
}
|
|
|
|
/**
 * Encodes `data` with the currently selected tokenizer and returns the token
 * ids themselves (not just their count).
 *
 * @param data Text to encode.
 * @returns The token ids produced by `encode`.
 */
export async function tokenizeNum(data:string) {
    return await encode(data)
}
|