diff --git a/src/ts/process/files/image.ts b/src/ts/process/files/image.ts index 8cbdbc80..d4f76fd3 100644 --- a/src/ts/process/files/image.ts +++ b/src/ts/process/files/image.ts @@ -87,6 +87,7 @@ export async function getInlayImage(id: string){ export function supportsInlayImage(){ const db = get(DataBase) return db.aiModel.startsWith('gptv') || (db.aiModel === 'reverse_proxy' && db.proxyRequestModel?.startsWith('gptv')) || db.aiModel === 'gemini-pro-vision' + || db.aiModel.startsWith('claude-3') || db.proxyRequestModel?.startsWith('claude-3') } export async function reencodeImage(img:Uint8Array){ diff --git a/src/ts/process/index.ts b/src/ts/process/index.ts index 7d46ce71..4108f730 100644 --- a/src/ts/process/index.ts +++ b/src/ts/process/index.ts @@ -33,6 +33,14 @@ export interface OpenAIChat{ name?:string removable?:boolean attr?:string[] + multimodals?: MultiModal[] +} + +export interface MultiModal{ + type:'image'|'video' + base64:string, + height?:number, + width?:number } export interface OpenAIChatFull extends OpenAIChat{ @@ -568,19 +576,19 @@ export async function sendChat(chatProcessIndex = -1,arg:{chatAdditonalTokens?:n } } + let multimodal:MultiModal[] = [] if(inlays.length > 0){ for(const inlay of inlays){ const inlayName = inlay.replace('{{inlay::', '').replace('}}', '') const inlayData = await getInlayImage(inlayName) if(inlayData){ if(supportsInlayImage()){ - const imgchat = { - role: msg.role === 'user' ? 'user' : 'assistant', - content: inlayData.data, - memo: `inlayImage-${inlayData.height}-${inlayData.width}`, - } as const - chats.push(imgchat) - currentTokens += await tokenizer.tokenizeChat(imgchat) + multimodal.push({ + type: 'image', + base64: inlayData.data, + width: inlayData.width, + height: inlayData.height + }) } } formatedChat = formatedChat.replace(inlay, '') @@ -604,7 +612,11 @@ export async function sendChat(chatProcessIndex = -1,arg:{chatAdditonalTokens?:n role: msg.role === 'user' ? 'user' : 'assistant', content: formatedChat, memo: msg.chatId, - attr: attr + attr: attr, + multimodals: multimodal + } + if(chat.multimodals.length === 0){ + delete chat.multimodals } chats.push(chat) currentTokens += await tokenizer.tokenizeChat(chat) diff --git a/src/ts/process/request.ts b/src/ts/process/request.ts index 55598c00..270b0257 100644 --- a/src/ts/process/request.ts +++ b/src/ts/process/request.ts @@ -1,5 +1,5 @@ import { get } from "svelte/store"; -import type { OpenAIChat, OpenAIChatFull } from "."; +import type { MultiModal, OpenAIChat, OpenAIChatFull } from "."; import { DataBase, setDatabase, type character } from "../storage/database"; import { pluginProcess } from "../plugins/plugins"; import { language } from "../../lang"; @@ -117,6 +117,7 @@ export interface OpenAIChatExtra { name?:string removable?:boolean attr?:string[] + multimodals?:MultiModal[] } @@ -172,33 +173,29 @@ export async function requestChatDataMain(arg:requestDataArgument, model:'model' case 'reverse_proxy':{ let formatedChat:OpenAIChatExtra[] = [] if(db.inlayImage){ - let pendingImages:OpenAIImageContents[] = [] for(let i=0;i 0 && m.role === 'user'){ + let v:OpenAIChatExtra = cloneDeep(m) + let contents:OpenAIContents[] = [] + for(let j=0;j 0 && m.role === 'user'){ - let v:OpenAIChatExtra = cloneDeep(m) - let contents:OpenAIContents[] = pendingImages - contents.push({ - "type": "text", - "text": m.content - }) - v.content = contents - formatedChat.push(v) - pendingImages = [] - } - else{ - formatedChat.push(m) - } + formatedChat.push(m) } } } @@ -218,6 +215,7 @@ export async function requestChatDataMain(arg:requestDataArgument, model:'model' delete formatedChat[i].memo delete formatedChat[i].removable delete formatedChat[i].attr + delete formatedChat[i].multimodals } if(aiModel === 'reverse_proxy' && db.reverseProxyOobaMode && formatedChat[i].role === 'system'){ const cont = formatedChat[i].content @@ -301,6 +299,7 @@ export async function requestChatDataMain(arg:requestDataArgument, model:'model' openrouterRequestModel = await getFreeOpenRouterModel() } + console.log(formatedChat) if(aiModel.startsWith('mistral')){ requestModel = aiModel @@ -1414,20 +1413,106 @@ export async function requestChatDataMain(arg:requestDataArgument, model:'model' } } + interface Claude3TextBlock { + type: 'text', + text: string + } + + interface Claude3ImageBlock { + type: 'image', + source: { + type: 'base64' + media_type: string, + data: string + } + } + + type Claude3ContentBlock = Claude3TextBlock|Claude3ImageBlock + interface Claude3Chat { role: 'user'|'assistant' - content: string + content: string|Claude3ContentBlock[] } let claudeChat: Claude3Chat[] = [] let systemPrompt:string = '' - const addClaudeChat = (chat:Claude3Chat) => { + const addClaudeChat = (chat:{ + role: 'user'|'assistant' + content: string + }, multimodals?:MultiModal[]) => { if(claudeChat.length > 0 && claudeChat[claudeChat.length-1].role === chat.role){ - claudeChat[claudeChat.length-1].content += "\n\n" + chat.content + let content = claudeChat[claudeChat.length-1].content + if(multimodals && multimodals.length > 0 && !Array.isArray(content)){ + content = [{ + type: 'text', + text: content + }] + } + + if(Array.isArray(content)){ + let lastContent = content[content.length-1] + if( lastContent?.type === 'text'){ + lastContent.text += "\n\n" + chat.content + content[content.length-1] = lastContent + } + else{ + content.push({ + type: 'text', + text: chat.content + }) + } + + if(multimodals && multimodals.length > 0){ + for(const modal of multimodals){ + if(modal.type === 'image'){ + const dataurl = modal.base64 + const base64 = dataurl.split(',')[1] + const mediaType = dataurl.split(';')[0].split(':')[1] + + content.unshift({ + type: 'image', + source: { + type: 'base64', + media_type: mediaType, + data: base64 + } + }) + } + } + } + } + else{ + content += "\n\n" + chat.content + } + claudeChat[claudeChat.length-1].content = content } else{ - claudeChat.push(chat) + let formatedChat:Claude3Chat = chat + if(multimodals && multimodals.length > 0){ + formatedChat.content = [{ + type: 'text', + text: chat.content + }] + for(const modal of multimodals){ + if(modal.type === 'image'){ + const dataurl = modal.base64 + const base64 = dataurl.split(',')[1] + const mediaType = dataurl.split(';')[0].split(':')[1] + + formatedChat.content.unshift({ + type: 'image', + source: { + type: 'base64', + media_type: mediaType, + data: base64 + } + }) + } + } + + } + claudeChat.push(formatedChat) } } for(const chat of formated){ @@ -1436,14 +1521,14 @@ export async function requestChatDataMain(arg:requestDataArgument, model:'model' addClaudeChat({ role: 'user', content: chat.content - }) + }, chat.multimodals) break } case 'assistant':{ addClaudeChat({ role: 'assistant', content: chat.content - }) + }, chat.multimodals) break } case 'system':{ @@ -1464,7 +1549,7 @@ export async function requestChatDataMain(arg:requestDataArgument, model:'model' } } } - + console.log(claudeChat) if(claudeChat.length === 0 && systemPrompt === ''){ return { type: 'fail', diff --git a/src/ts/tokenizer.ts b/src/ts/tokenizer.ts index 6665e0af..9995c411 100644 --- a/src/ts/tokenizer.ts +++ b/src/ts/tokenizer.ts @@ -2,7 +2,7 @@ import type { Tiktoken } from "@dqbd/tiktoken"; import type { Tokenizer } from "@mlc-ai/web-tokenizers"; import { DataBase, type character } from "./storage/database"; import { get } from "svelte/store"; -import type { OpenAIChat } from "./process"; +import type { MultiModal, OpenAIChat } from "./process"; import { supportsInlayImage } from "./process/files/image"; import { risuChatParser } from "./parser"; import { tokenizeGGUFModel } from "./process/models/local"; @@ -132,53 +132,56 @@ export class ChatTokenizer { this.useName = useName } async tokenizeChat(data:OpenAIChat) { - if(data.memo && data.memo.startsWith('inlayImage')){ - const db = get(DataBase) - if(!supportsInlayImage()){ - return this.chatAdditonalTokens - } - if(db.gptVisionQuality === 'low'){ - return 87 - } - - let encoded = this.chatAdditonalTokens - const memo = data.memo.split('-') - let height = parseInt(memo[1]) - let width = parseInt(memo[2]) - - if(height === width){ - if(height > 768){ - height = 768 - width = 768 - } - } - else if(height > width){ - if(width > 768){ - width = 768 - height = height * (768 / width) - } - } - else{ - if(height > 768){ - height = 768 - width = width * (768 / height) - } - } - - const chunkSize = Math.ceil(width / 512) * Math.ceil(height / 512) - encoded += chunkSize * 2 - encoded += 85 - - return encoded - } - let encoded = (await encode(data.content)).length + this.chatAdditonalTokens if(data.name && this.useName ==='name'){ encoded += (await encode(data.name)).length + 1 } + if(data.multimodals && data.multimodals.length > 0){ + for(const multimodal of data.multimodals){ + encoded += await this.tokenizeMultiModal(multimodal) + } + } return encoded } + async tokenizeMultiModal(data:MultiModal){ + const db = get(DataBase) + if(!supportsInlayImage()){ + return this.chatAdditonalTokens + } + if(db.gptVisionQuality === 'low'){ + return 87 + } + + let encoded = this.chatAdditonalTokens + let height = data.height ?? 0 + let width = data.width ?? 0 + + if(height === width){ + if(height > 768){ + height = 768 + width = 768 + } + } + else if(height > width){ + if(width > 768){ + width = 768 + height = height * (768 / width) + } + } + else{ + if(height > 768){ + height = 768 + width = width * (768 / height) + } + } + + const chunkSize = Math.ceil(width / 512) * Math.ceil(height / 512) + encoded += chunkSize * 2 + encoded += 85 + + return encoded + } }