[feat] new tokenizing

This commit is contained in:
kwaroran
2023-06-09 22:27:34 +09:00
parent cfe20c968b
commit dbf5f6b25a
5 changed files with 51 additions and 47 deletions

View File

@@ -2,7 +2,7 @@ import type { OpenAIChat } from ".";
import type { character } from "../storage/database"; import type { character } from "../storage/database";
import { replacePlaceholders } from "../util"; import { replacePlaceholders } from "../util";
export function exampleMessage(char:character):OpenAIChat[]{ export function exampleMessage(char:character, userName:string):OpenAIChat[]{
if(char.exampleMessage === ''){ if(char.exampleMessage === ''){
return [] return []
} }
@@ -34,14 +34,16 @@ export function exampleMessage(char:character):OpenAIChat[]{
add() add()
currentMessage = { currentMessage = {
role: "assistant", role: "assistant",
content: trimed.split(':', 2)[1] content: trimed.split(':', 2)[1],
name: 'example_' + char.name
} }
} }
else if(lowered.startsWith('{{user}}:') || lowered.startsWith('<user>:')){ else if(lowered.startsWith('{{user}}:') || lowered.startsWith('<user>:')){
add() add()
currentMessage = { currentMessage = {
role: "user", role: "user",
content: trimed.split(':', 2)[1] content: trimed.split(':', 2)[1],
name: 'example_' + userName
} }
} }
else{ else{

View File

@@ -1,7 +1,7 @@
import { get, writable } from "svelte/store"; import { get, writable } from "svelte/store";
import { DataBase, setDatabase, type character } from "../storage/database"; import { DataBase, setDatabase, type character } from "../storage/database";
import { CharEmotion, selectedCharID } from "../stores"; import { CharEmotion, selectedCharID } from "../stores";
import { tokenize, tokenizeNum } from "../tokenizer"; import { ChatTokenizer, tokenizeNum } from "../tokenizer";
import { language } from "../../lang"; import { language } from "../../lang";
import { alertError } from "../alert"; import { alertError } from "../alert";
import { loadLoreBookPrompt } from "./lorebook"; import { loadLoreBookPrompt } from "./lorebook";
@@ -15,7 +15,6 @@ import { supaMemory } from "./supaMemory";
import { v4 } from "uuid"; import { v4 } from "uuid";
import { cloneDeep } from "lodash"; import { cloneDeep } from "lodash";
import { groupOrder } from "./group"; import { groupOrder } from "./group";
import { getNameMaxTokens } from "./stringlize";
export interface OpenAIChat{ export interface OpenAIChat{
role: 'system'|'user'|'assistant' role: 'system'|'user'|'assistant'
@@ -69,7 +68,6 @@ export async function sendChat(chatProcessIndex = -1,arg:{chatAdditonalTokens?:n
if(nowChatroom.type === 'group'){ if(nowChatroom.type === 'group'){
if(chatProcessIndex === -1){ if(chatProcessIndex === -1){
const charNames =nowChatroom.characters.map((v) => findCharacterbyIdwithCache(v).name) const charNames =nowChatroom.characters.map((v) => findCharacterbyIdwithCache(v).name)
caculatedChatTokens += await getNameMaxTokens([...charNames, db.username])
const messages = nowChatroom.chats[nowChatroom.chatPage].message const messages = nowChatroom.chats[nowChatroom.chatPage].message
const lastMessage = messages[messages.length-1] const lastMessage = messages[messages.length-1]
@@ -110,14 +108,10 @@ export async function sendChat(chatProcessIndex = -1,arg:{chatAdditonalTokens?:n
} }
else{ else{
currentChar = nowChatroom currentChar = nowChatroom
if(!db.aiModel.startsWith('gpt')){
caculatedChatTokens += await getNameMaxTokens([currentChar.name, db.username])
}
} }
let chatAdditonalTokens = arg.chatAdditonalTokens ?? caculatedChatTokens let chatAdditonalTokens = arg.chatAdditonalTokens ?? caculatedChatTokens
const tokenizer = new ChatTokenizer(chatAdditonalTokens, db.aiModel.startsWith('gpt') ? 'noName' : 'name')
let selectedChat = nowChatroom.chatPage let selectedChat = nowChatroom.chatPage
let currentChat = nowChatroom.chats[selectedChat] let currentChat = nowChatroom.chats[selectedChat]
let maxContextTokens = db.maxContext let maxContextTokens = db.maxContext
@@ -205,17 +199,17 @@ export async function sendChat(chatProcessIndex = -1,arg:{chatAdditonalTokens?:n
}) })
//await tokenize currernt //await tokenize currernt
let currentTokens = (await tokenize(Object.keys(unformated).map((key) => { let currentTokens = 0
return (unformated[key] as OpenAIChat[]).map((d) => {
return d.content for(const key in unformated){
}).join('\n\n') currentTokens += await tokenizer.tokenizeChat(unformated[key])
}).join('\n\n')) + db.maxResponse) + 130 }
const examples = exampleMessage(currentChar) const examples = exampleMessage(currentChar, db.username)
for(const example of examples){ for(const example of examples){
currentTokens += await tokenize(example.content) + chatAdditonalTokens currentTokens += await tokenizer.tokenizeChat(example)
} }
let chats:OpenAIChat[] = examples let chats:OpenAIChat[] = examples
@@ -230,15 +224,14 @@ export async function sendChat(chatProcessIndex = -1,arg:{chatAdditonalTokens?:n
if(nowChatroom.type !== 'group'){ if(nowChatroom.type !== 'group'){
const firstMsg = nowChatroom.firstMsgIndex === -1 ? nowChatroom.firstMessage : nowChatroom.alternateGreetings[nowChatroom.firstMsgIndex] const firstMsg = nowChatroom.firstMsgIndex === -1 ? nowChatroom.firstMessage : nowChatroom.alternateGreetings[nowChatroom.firstMsgIndex]
chats.push({ const chat:OpenAIChat = {
role: 'assistant', role: 'assistant',
content: processScript(currentChar, content: processScript(currentChar,
replacePlaceholders(firstMsg, currentChar.name), replacePlaceholders(firstMsg, currentChar.name),
'editprocess') 'editprocess')
}) }
currentTokens += await tokenize(processScript(currentChar, chats.push(chat)
replacePlaceholders(firstMsg, currentChar.name), currentTokens += await tokenizer.tokenizeChat(chat)
'editprocess'))
} }
const ms = currentChat.message const ms = currentChat.message
@@ -259,17 +252,18 @@ export async function sendChat(chatProcessIndex = -1,arg:{chatAdditonalTokens?:n
if(!msg.chatId){ if(!msg.chatId){
msg.chatId = v4() msg.chatId = v4()
} }
chats.push({ const chat:OpenAIChat = {
role: msg.role === 'user' ? 'user' : 'assistant', role: msg.role === 'user' ? 'user' : 'assistant',
content: formedChat, content: formedChat,
memo: msg.chatId, memo: msg.chatId,
name: name name: name
}) }
currentTokens += (await tokenize(formedChat) + chatAdditonalTokens) chats.push(chat)
currentTokens += await tokenizer.tokenizeChat(chat)
} }
if(nowChatroom.supaMemory && db.supaMemoryType !== 'none'){ if(nowChatroom.supaMemory && db.supaMemoryType !== 'none'){
const sp = await supaMemory(chats, currentTokens, maxContextTokens, currentChat, nowChatroom, chatAdditonalTokens) const sp = await supaMemory(chats, currentTokens, maxContextTokens, currentChat, nowChatroom, tokenizer)
if(sp.error){ if(sp.error){
alertError(sp.error) alertError(sp.error)
return false return false
@@ -287,7 +281,7 @@ export async function sendChat(chatProcessIndex = -1,arg:{chatAdditonalTokens?:n
return false return false
} }
currentTokens -= (await tokenize(chats[0].content) + chatAdditonalTokens) currentTokens -= await tokenizer.tokenizeChat(chats[0])
chats.splice(0, 1) chats.splice(0, 1)
} }
currentChat.lastMemory = chats[0].memo currentChat.lastMemory = chats[0].memo

View File

@@ -53,15 +53,4 @@ export function unstringlizeChat(text:string, formated:OpenAIChat[], char:string
} }
return text return text
}
// Returns the largest token cost of a "name: " speaker prefix among the given
// names, so callers can budget a worst-case per-message overhead.
// NOTE(review): removed by this commit in favor of ChatTokenizer's per-message
// accounting.
export async function getNameMaxTokens(names:string[]){
    let maxCharNameTokens = 0
    for(const name of names){
        // +1 accounts for the separator/newline that follows the prefix
        // in the stringlized chat format — TODO confirm against caller.
        const tokens = await tokenize(name + ': ') + 1
        if(maxCharNameTokens < tokens){
            maxCharNameTokens = tokens
        }
    }
    return maxCharNameTokens
} }

View File

@@ -1,8 +1,7 @@
import { get } from "svelte/store"; import { get } from "svelte/store";
import type { OpenAIChat } from "."; import type { OpenAIChat } from ".";
import { DataBase, type Chat, type character, type groupChat } from "../storage/database"; import { DataBase, type Chat, type character, type groupChat } from "../storage/database";
import { tokenize } from "../tokenizer"; import { tokenize, type ChatTokenizer } from "../tokenizer";
import { findCharacterbyId } from "../util";
import { requestChatData } from "./request"; import { requestChatData } from "./request";
export async function supaMemory( export async function supaMemory(
@@ -11,7 +10,7 @@ export async function supaMemory(
maxContextTokens:number, maxContextTokens:number,
room:Chat, room:Chat,
char:character|groupChat, char:character|groupChat,
chatAdditonalTokens:number tokenizer:ChatTokenizer
): Promise<{ currentTokens: number; chats: OpenAIChat[]; error?:string; memory?:string;lastId?:string}>{ ): Promise<{ currentTokens: number; chats: OpenAIChat[]; error?:string; memory?:string;lastId?:string}>{
const db = get(DataBase) const db = get(DataBase)
@@ -27,7 +26,7 @@ export async function supaMemory(
} }
if(coIndex !== -1){ if(coIndex !== -1){
for(let i=0;i<coIndex;i++){ for(let i=0;i<coIndex;i++){
currentTokens -= (await tokenize(chats[0].content) + chatAdditonalTokens) currentTokens -= await tokenizer.tokenizeChat(chats[0])
chats.splice(0, 1) chats.splice(0, 1)
} }
} }
@@ -53,13 +52,13 @@ export async function supaMemory(
lastId = id lastId = id
break break
} }
currentTokens -= (await tokenize(chats[0].content) + chatAdditonalTokens) currentTokens -= await tokenizer.tokenizeChat(chats[0])
chats.splice(0, 1) chats.splice(0, 1)
i += 1 i += 1
} }
supaMemory = data supaMemory = data
currentTokens += await tokenize(supaMemory) + chatAdditonalTokens currentTokens += await tokenize(supaMemory)
} }
@@ -179,7 +178,7 @@ export async function supaMemory(
} }
continue continue
} }
const tokens = await tokenize(cont.content) + chatAdditonalTokens const tokens = await tokenizer.tokenizeChat(cont)
if((chunkSize + tokens) > maxChunkSize){ if((chunkSize + tokens) > maxChunkSize){
if(stringlizedChat === ''){ if(stringlizedChat === ''){
stringlizedChat += `${cont.role === 'assistant' ? char.type === 'group' ? '' : char.name : db.username}: ${cont.content}\n\n` stringlizedChat += `${cont.role === 'assistant' ? char.type === 'group' ? '' : char.name : db.username}: ${cont.content}\n\n`
@@ -201,7 +200,7 @@ export async function supaMemory(
return result return result
} }
const tokenz = await tokenize(result + '\n\n') + chatAdditonalTokens const tokenz = await tokenize(result + '\n\n')
currentTokens += tokenz currentTokens += tokenz
supaMemory += result.replace(/\n+/g,'\n') + '\n\n' supaMemory += result.replace(/\n+/g,'\n') + '\n\n'
} }

View File

@@ -2,6 +2,7 @@ import type { Tiktoken } from "@dqbd/tiktoken";
import { DataBase, type character } from "./storage/database"; import { DataBase, type character } from "./storage/database";
import { get } from "svelte/store"; import { get } from "svelte/store";
import { tokenizeTransformers } from "./transformers/transformer"; import { tokenizeTransformers } from "./transformers/transformer";
import type { OpenAIChat } from "./process";
async function encode(data:string):Promise<(number[]|Uint32Array)>{ async function encode(data:string):Promise<(number[]|Uint32Array)>{
let db = get(DataBase) let db = get(DataBase)
@@ -37,6 +38,25 @@ export async function tokenize(data:string) {
return encoded.length return encoded.length
} }
export class ChatTokenizer {
private chatAdditonalTokens:number
private useName:'name'|'noName'
constructor(chatAdditonalTokens:number, useName:'name'|'noName'){
this.chatAdditonalTokens = chatAdditonalTokens
this.useName = useName
}
async tokenizeChat(data:OpenAIChat) {
const encoded = (await encode(data.content)).length
+ this.useName === 'name' ? (await encode(data.name)).length : 0 + this.chatAdditonalTokens
return encoded
}
}
export async function tokenizeNum(data:string) { export async function tokenizeNum(data:string) {
const encoded = await encode(data) const encoded = await encode(data)
return encoded return encoded