[feat] accurate tokenizing

2023-11-23 17:52:00 +09:00
parent dab121c9c7
commit 634fe418b4
6 changed files with 116 additions and 16 deletions
--- a/src/ts/parser.ts
+++ b/src/ts/parser.ts
@@ -11,6 +11,7 @@ import { selectedCharID } from './stores';
 import { calcString } from './process/infunctions';
 import { findCharacterbyId } from './util';
 import { getInlayImage } from './image';
+import { cloneDeep } from 'lodash';

 const convertora = new showdown.Converter({
    simpleLineBreaks: true,
@@ -346,7 +347,14 @@ function wppParser(data:string){


 const rgx = /(?:{{|<)(.+?)(?:}}|>)/gm
-type matcherArg = {chatID:number,db:Database,chara:character|string,rmVar:boolean,var?:{[key:string]:string}}
+type matcherArg = {
+    chatID:number,
+    db:Database,
+    chara:character|string,
+    rmVar:boolean,
+    var?:{[key:string]:string}
+    tokenizeAccurate?:boolean
+}
 const matcher = (p1:string,matcherArg:matcherArg) => {
    if(p1.length > 10000){
        return ''
@@ -390,7 +398,7 @@ const matcher = (p1:string,matcherArg:matcherArg) => {
        case 'bot':{
            let selectedChar = get(selectedCharID)
            let currentChar = db.characters[selectedChar]
-            if(currentChar.type !== 'group'){
+            if(currentChar && currentChar.type !== 'group'){
                return currentChar.name
            }
            if(chara){
@@ -484,6 +492,9 @@ const matcher = (p1:string,matcherArg:matcherArg) => {
            return ''
        }
        case 'time':{
+            if(matcherArg.tokenizeAccurate){
+                return `00:00:00`
+            }
            if(chatID === -1){
                return "[Cannot get time]"
            }
@@ -499,6 +510,9 @@ const matcher = (p1:string,matcherArg:matcherArg) => {
            return date.toLocaleTimeString()
        }
        case 'date':{
+            if(matcherArg.tokenizeAccurate){
+                return `00:00:00`
+            }
            if(chatID === -1){
                return "[Cannot get time]"
            }
@@ -513,6 +527,9 @@ const matcher = (p1:string,matcherArg:matcherArg) => {
            return date.toLocaleDateString()
        }
        case 'idle_duration':{
+            if(matcherArg.tokenizeAccurate){
+                return `00:00:00`
+            }
            if(chatID === -1){
                return "[Cannot get time]"
            }
@@ -622,11 +639,17 @@ const matcher = (p1:string,matcherArg:matcherArg) => {
    if(p1.startsWith('random')){
        if(p1.startsWith('random::')){
            const randomIndex = Math.floor(Math.random() * (arra.length - 1)) + 1
+            if(matcherArg.tokenizeAccurate){
+                return arra[0]
+            }
            return arra[randomIndex]
        }
        else{
            const arr = p1.split(/\:|\,/g)
            const randomIndex = Math.floor(Math.random() * (arr.length - 1)) + 1
+            if(matcherArg.tokenizeAccurate){
+                return arra[0]
+            }
            return arr[randomIndex]
        }
    }
@@ -656,7 +679,7 @@ const smMatcher = (p1:string,matcherArg:matcherArg) => {
        case 'bot':{
            let selectedChar = get(selectedCharID)
            let currentChar = db.characters[selectedChar]
-            if(currentChar.type !== 'group'){
+            if(currentChar && currentChar.type !== 'group'){
                return currentChar.name
            }
            if(chara){
@@ -707,6 +730,7 @@ export function risuChatParser(da:string, arg:{
    chara?:string|character|groupChat
    rmVar?:boolean,
    var?:{[key:string]:string}
+    tokenizeAccurate?:boolean
 } = {}):string{
    const chatID = arg.chatID ?? -1
    const db = arg.db ?? get(DataBase)
@@ -724,6 +748,13 @@ export function risuChatParser(da:string, arg:{
            chara = aChara
        }
    }
+    if(arg.tokenizeAccurate){
+        const db = arg.db ?? get(DataBase)
+        const selchar = chara ?? db.characters[get(selectedCharID)]
+        if(!selchar){
+            chara = 'bot'
+        }
+    }

    
    let pointer = 0;
@@ -731,12 +762,16 @@ export function risuChatParser(da:string, arg:{
    let pf = performance.now()
    let v = new Uint8Array(512)
    let pureMode = false
+    let commentMode = false
+    let commentLatest:string[] = [""]
+    let commentV = new Uint8Array(512)
    const matcherObj = {
        chatID: chatID,
        chara: chara,
        rmVar: arg.rmVar ?? false,
        db: db,
-        var: arg.var ?? null
+        var: arg.var ?? null,
+        tokenizeAccurate: arg.tokenizeAccurate ?? false
    }
    while(pointer < da.length){
        switch(da[pointer]){
@@ -791,6 +826,25 @@ export function risuChatParser(da:string, arg:{
                        pureMode = false
                        break
                    }
+                    case 'Comment':{
+                        if(!commentMode){
+                            commentMode = true
+                            commentLatest = nested.map((f) => f)
+                            if(commentLatest[0].endsWith('\n')){
+                                commentLatest[0] = commentLatest[0].substring(0, commentLatest[0].length - 1)
+                            }
+                            commentV = new Uint8Array(v)
+                        }
+                        break
+                    }
+                    case '/Comment':{
+                        if(commentMode){
+                            nested = commentLatest
+                            v = commentV
+                            commentMode = false
+                        }
+                        break
+                    }
                    default:{
                        const mc = (pureMode) ? null : smMatcher(dat, matcherObj)
                        nested[0] += mc ?? `<${dat}>`
--- a/src/ts/process/proompt.ts
+++ b/src/ts/process/proompt.ts
@@ -1,3 +1,5 @@
+import { tokenizeAccurate } from "../tokenizer";
+
 export type Proompt = ProomptPlain|ProomptTyped|ProomptChat|ProomptAuthorNote;

 export interface ProomptPlain {
@@ -23,4 +25,29 @@ export interface ProomptChat {
    type: 'chat';
    rangeStart: number;
    rangeEnd: number|'end';
+}
+
+export async function tokenizePreset(proompts:Proompt[]){
+    let total = 0
+    for(const proompt of proompts){
+        switch(proompt.type){
+            case 'plain':
+            case 'jailbreak':{
+                total += await tokenizeAccurate(proompt.text)
+                break
+            }
+            case 'persona':
+            case 'description':
+            case 'lorebook':
+            case 'postEverything':
+            case 'authornote':
+            case 'memory':{
+                if(proompt.innerFormat){
+                    total += await tokenizeAccurate(proompt.innerFormat)
+                }
+                break
+            }
+        }
+    }
+    return total
 }
--- a/src/ts/tokenizer.ts
+++ b/src/ts/tokenizer.ts
@@ -4,6 +4,8 @@ import { DataBase, type character } from "./storage/database";
 import { get } from "svelte/store";
 import type { OpenAIChat } from "./process";
 import { supportsInlayImage } from "./image";
+import { risuChatParser } from "./parser";
+import type { Proompt } from "./process/proompt";

 async function encode(data:string):Promise<(number[]|Uint32Array|Int32Array)>{
    let db = get(DataBase)
@@ -85,6 +87,14 @@ export async function tokenize(data:string) {
    return encoded.length
 }

+export async function tokenizeAccurate(data:string) {
+    data = risuChatParser(data.replace('{{slot}}',''), {
+        tokenizeAccurate: true
+    })
+    const encoded = await encode(data)
+    return encoded.length
+}
+

 export class ChatTokenizer {

@@ -149,4 +159,4 @@ export class ChatTokenizer {
 export async function tokenizeNum(data:string) {
    const encoded = await encode(data)
    return encoded
-}
+}