From 7d7cef4a691b4a416b0e6640a016c7ab5717c038 Mon Sep 17 00:00:00 2001 From: kwaroran Date: Thu, 13 Mar 2025 14:18:05 +0900 Subject: [PATCH] Add gemini image response --- src/lang/en.ts | 1 + src/lib/Setting/Pages/PromptSettings.svelte | 2 + src/lib/SideBars/DevTool.svelte | 8 +++ src/ts/model/modellist.ts | 2 +- src/ts/parser.svelte.ts | 6 +- src/ts/process/files/inlays.ts | 15 +++- src/ts/process/index.svelte.ts | 16 ++++- src/ts/process/request.ts | 79 +++++++++++++++++---- src/ts/process/stableDiff.ts | 2 +- src/ts/storage/database.svelte.ts | 8 +++ 10 files changed, 114 insertions(+), 25 deletions(-) diff --git a/src/lang/en.ts b/src/lang/en.ts index 9ca9dbdc..fe762dc1 100644 --- a/src/lang/en.ts +++ b/src/lang/en.ts @@ -1072,4 +1072,5 @@ export const languageEnglish = { automaticCachePoint: "Automatic Cache Point", experimentalChatCompression: "Experimental Chat Data Handling", loadingChatData: "Loading Chat Data", + outputImageModal: "Output Image Modal", } diff --git a/src/lib/Setting/Pages/PromptSettings.svelte b/src/lib/Setting/Pages/PromptSettings.svelte index bb32378d..6515987d 100644 --- a/src/lib/Setting/Pages/PromptSettings.svelte +++ b/src/lib/Setting/Pages/PromptSettings.svelte @@ -126,6 +126,8 @@ + + {#if DBState.db.showUnrecommended} diff --git a/src/lib/SideBars/DevTool.svelte b/src/lib/SideBars/DevTool.svelte index a0263a2d..0a1ca717 100644 --- a/src/lib/SideBars/DevTool.svelte +++ b/src/lib/SideBars/DevTool.svelte @@ -81,6 +81,14 @@ md += `> ${modals.length} non-text content(s) included\n` } + if(formated[i].thoughts && formated[i].thoughts.length > 0){ + md += `> ${formated[i].thoughts.length} thought(s) included\n` + } + + if(formated[i].cachePoint){ + md += `> Cache point\n` + } + md += '```\n' + formated[i].content.replaceAll('```', '\\`\\`\\`') + '\n```\n' } $doingChat = false diff --git a/src/ts/model/modellist.ts b/src/ts/model/modellist.ts index eaec07ba..9fb1e68e 100644 --- a/src/ts/model/modellist.ts +++ b/src/ts/model/modellist.ts @@ -969,7 +969,7 @@ export const LLMModels: LLMModel[] = [ id: 'gemini-2.0-flash-exp', provider: LLMProvider.GoogleCloud, format: LLMFormat.GoogleCloud, - flags: [LLMFlags.geminiBlockOff,LLMFlags.hasImageInput, LLMFlags.hasFirstSystemPrompt, LLMFlags.poolSupported, LLMFlags.hasAudioInput, LLMFlags.hasVideoInput, LLMFlags.hasStreaming, LLMFlags.requiresAlternateRole], + flags: [LLMFlags.geminiBlockOff,LLMFlags.hasImageInput, LLMFlags.hasImageOutput, LLMFlags.poolSupported, LLMFlags.hasAudioInput, LLMFlags.hasVideoInput, LLMFlags.hasStreaming, LLMFlags.requiresAlternateRole], parameters: ['temperature', 'top_k', 'top_p', 'presence_penalty', 'frequency_penalty'], tokenizer: LLMTokenizer.GoogleCloud, }, diff --git a/src/ts/parser.svelte.ts b/src/ts/parser.svelte.ts index 75ae99df..6c37b389 100644 --- a/src/ts/parser.svelte.ts +++ b/src/ts/parser.svelte.ts @@ -495,14 +495,14 @@ function trimmer(str:string){ } async function parseInlayAssets(data:string){ - const inlayMatch = data.match(/{{(inlay|inlayed)::(.+?)}}/g) + const inlayMatch = data.match(/{{(inlay|inlayed|inlayeddata)::(.+?)}}/g) if(inlayMatch){ for(const inlay of inlayMatch){ const inlayType = inlay.startsWith('{{inlayed') ? 'inlayed' : 'inlay' const id = inlay.substring(inlay.indexOf('::') + 2, inlay.length - 2) const asset = await getInlayAsset(id) - let prefix = inlayType === 'inlayed' ? `
>` : ''
-            let postfix = inlayType === 'inlayed' ? `</div>\n\n` : ''
+            let prefix = inlayType !== 'inlay' ? `<div>` : ''
+            let postfix = inlayType !== 'inlay' ? `</div>
\n\n` : '' switch(asset?.type){ case 'image': data = data.replace(inlay, `${prefix}${postfix}`) diff --git a/src/ts/process/files/inlays.ts b/src/ts/process/files/inlays.ts index 0d08b9cf..6ac8ee50 100644 --- a/src/ts/process/files/inlays.ts +++ b/src/ts/process/files/inlays.ts @@ -71,7 +71,7 @@ export async function postInlayAsset(img:{ return null } -export async function writeInlayImage(imgObj:HTMLImageElement, arg:{name?:string, ext?:string} = {}) { +export async function writeInlayImage(imgObj:HTMLImageElement, arg:{name?:string, ext?:string, id?:string} = {}) { let drawHeight = 0 let drawWidth = 0 @@ -103,7 +103,7 @@ export async function writeInlayImage(imgObj:HTMLImageElement, arg:{name?:string const dataURI = canvas.toDataURL('image/png') - const imgid = v4() + const imgid = arg.id ?? v4() await inlayStorage.setItem(imgid, { name: arg.name ?? imgid, @@ -132,6 +132,17 @@ export async function getInlayAsset(id: string){ return img } +export async function setInlayAsset(id: string, img:{ + name: string, + data: string, + ext: string, + height: number, + width: number, + type: 'image'|'video'|'audio' +}){ + await inlayStorage.setItem(id, img) +} + export function supportsInlayImage(){ const db = getDatabase() return getModelInfo(db.aiModel).flags.includes(LLMFlags.hasImageInput) diff --git a/src/ts/process/index.svelte.ts b/src/ts/process/index.svelte.ts index 5d859a46..8bda6569 100644 --- a/src/ts/process/index.svelte.ts +++ b/src/ts/process/index.svelte.ts @@ -724,10 +724,19 @@ export async function sendChat(chatProcessIndex = -1,arg:{ } let inlays:string[] = [] if(msg.role === 'char'){ - formatedChat = formatedChat.replace(/{{(inlay|inlayed)::(.+?)}}/g, '') + formatedChat = formatedChat.replace(/{{(inlay|inlayed|inlayeddata)::(.+?)}}/g, ( + match: string, + p1: string, + p2: string + ) => { + if(p2 && p1 === 'inlayeddata'){ + inlays.push(p2) + } + return '' + }) } else{ - const inlayMatch = formatedChat.match(/{{(inlay|inlayed)::(.+?)}}/g) + const inlayMatch = formatedChat.match(/{{(inlay|inlayed|inlayeddata)::(.+?)}}/g) if(inlayMatch){ for(const inlay of inlayMatch){ inlays.push(inlay) @@ -1293,7 +1302,8 @@ export async function sendChat(chatProcessIndex = -1,arg:{ isGroupChat: nowChatroom.type === 'group', bias: {}, continue: arg.continue, - chatId: generationId + chatId: generationId, + imageResponse: DBState.db.outputImageModal }, 'model', abortSignal) let result = '' diff --git a/src/ts/process/request.ts b/src/ts/process/request.ts index 0540e47d..17fcffd3 100644 --- a/src/ts/process/request.ts +++ b/src/ts/process/request.ts @@ -11,7 +11,7 @@ import { risuChatParser } from "../parser.svelte"; import { SignatureV4 } from "@smithy/signature-v4"; import { HttpRequest } from "@smithy/protocol-http"; import { Sha256 } from "@aws-crypto/sha256-js"; -import { supportsInlayImage } from "./files/inlays"; +import { supportsInlayImage, writeInlayImage } from "./files/inlays"; import { Capacitor } from "@capacitor/core"; import { getFreeOpenRouterModel } from "../model/openrouter"; import { runTransformers } from "./transformers"; @@ -42,6 +42,7 @@ interface requestDataArgument{ noMultiGen?:boolean schema?:string extractJson?:string + imageResponse?:boolean } interface RequestDataArgumentExtended extends requestDataArgument{ @@ -374,13 +375,15 @@ export interface OpenAIChatExtra { cachePoint?:boolean } -function reformater(formated:OpenAIChat[],modelInfo:LLMModel){ +export function reformater(formated:OpenAIChat[],modelInfo:LLMModel|LLMFlags[]){ + + const flags = 
Array.isArray(modelInfo) ? modelInfo : modelInfo.flags const db = getDatabase() let systemPrompt:OpenAIChat|null = null - if(!modelInfo.flags.includes(LLMFlags.hasFullSystemPrompt)){ - if(modelInfo.flags.includes(LLMFlags.hasFirstSystemPrompt)){ + if(!flags.includes(LLMFlags.hasFullSystemPrompt)){ + if(flags.includes(LLMFlags.hasFirstSystemPrompt)){ while(formated[0].role === 'system'){ if(systemPrompt){ systemPrompt.content += '\n\n' + formated[0].content @@ -400,7 +403,7 @@ function reformater(formated:OpenAIChat[],modelInfo:LLMModel){ } } - if(modelInfo.flags.includes(LLMFlags.requiresAlternateRole)){ + if(flags.includes(LLMFlags.requiresAlternateRole)){ let newFormated:OpenAIChat[] = [] for(let i=0;i 1){ - const thought = rDatas.splice(rDatas.length-2, 1)[0] - rDatas[rDatas.length-1] = `${thought}\n\n${rDatas.join('\n\n')}` + if(arg.modelInfo.flags.includes(LLMFlags.geminiThinking)){ + const thought = rDatas.splice(rDatas.length-2, 1)[0] + rDatas[rDatas.length-1] = `${thought}\n\n${rDatas.join('\n\n')}` + } + else{ + rDatas[rDatas.length-1] = rDatas.join('\n\n') + } } + control.enqueue({ '0': rDatas[rDatas.length-1], }) @@ -2110,7 +2142,7 @@ async function requestGoogleCloudVertex(arg:RequestDataArgumentExtended):Promise } let rDatas:string[] = [''] - const processDataItem = (data:any) => { + const processDataItem = async (data:any) => { const parts = data?.candidates?.[0]?.content?.parts if(parts){ @@ -2120,7 +2152,21 @@ async function requestGoogleCloudVertex(arg:RequestDataArgumentExtended):Promise rDatas.push('') } - rDatas[rDatas.length-1] += part.text + rDatas[rDatas.length-1] += part.text ?? '' + if(part.inlineData){ + const imgHTML = new Image() + const id = crypto.randomUUID() + imgHTML.src = `data:${part.inlineData.mimeType};base64,${part.inlineData.data}` + console.log('decoding', part.inlineData.mimeType, part.inlineData.data, id) + console.log('writing') + await writeInlayImage(imgHTML, { + id: id + }) + console.log(JSON.stringify(rDatas)) + rDatas[rDatas.length-1] += (`\n{{inlayeddata::${id}}}\n`) + console.log(JSON.stringify(rDatas)) + console.log('done', id) + } } } @@ -2141,10 +2187,10 @@ async function requestGoogleCloudVertex(arg:RequestDataArgumentExtended):Promise // traverse responded data if it contains multipart contents if (typeof (res.data)[Symbol.iterator] === 'function') { for(const data of res.data){ - processDataItem(data) + await processDataItem(data) } } else { - processDataItem(res.data) + await processDataItem(res.data) } if(arg.extractJson && (db.jsonSchemaEnabled || arg.schema)){ @@ -2154,10 +2200,13 @@ async function requestGoogleCloudVertex(arg:RequestDataArgumentExtended):Promise } } - if(rDatas.length > 1){ + if(rDatas.length > 1 && arg.modelInfo.flags.includes(LLMFlags.geminiThinking)){ const thought = rDatas.splice(rDatas.length-2, 1)[0] rDatas[rDatas.length-1] = `${thought}\n\n${rDatas.join('\n\n')}` } + else if(rDatas.length > 1){ + rDatas[rDatas.length-1] = rDatas.join('\n\n') + } return { type: 'success', diff --git a/src/ts/process/stableDiff.ts b/src/ts/process/stableDiff.ts index 3aacbfc1..3b5ce4bf 100644 --- a/src/ts/process/stableDiff.ts +++ b/src/ts/process/stableDiff.ts @@ -3,7 +3,7 @@ import { getDatabase, type character } from "../storage/database.svelte" import { requestChatData } from "./request" import { alertError } from "../alert" import { fetchNative, globalFetch, readImage } from "../globalApi.svelte" -import { CharEmotion } from "../stores.svelte" +import { CharEmotion, DBState } from "../stores.svelte" import type { 
OpenAIChat } from "./index.svelte" import { processZip } from "./processzip" import { keiServerURL } from "../kei/kei" diff --git a/src/ts/storage/database.svelte.ts b/src/ts/storage/database.svelte.ts index 7cbcde53..f1cc5797 100644 --- a/src/ts/storage/database.svelte.ts +++ b/src/ts/storage/database.svelte.ts @@ -928,6 +928,8 @@ export interface Database{ automaticCachePoint: boolean chatCompression: boolean claudeRetrivalCaching: boolean + outputImageModal: boolean + } interface SeparateParameters{ @@ -941,8 +943,11 @@ interface SeparateParameters{ presence_penalty?:number reasoning_effort?:number thinking_tokens?:number + outputImageModal?:boolean } +type OutputModal = 'image'|'audio'|'video' + export interface customscript{ comment: string; in:string @@ -1258,6 +1263,7 @@ export interface botPreset{ regex?:customscript[] reasonEffort?:number thinkingTokens?:number + outputImageModal?:boolean } @@ -1574,6 +1580,7 @@ export function saveCurrentPreset(){ image: pres?.[db.botPresetsId]?.image ?? '', reasonEffort: db.reasoningEffort ?? 0, thinkingTokens: db.thinkingTokens ?? null, + outputImageModal: db.outputImageModal ?? false } db.botPresets = pres setDatabase(db) @@ -1685,6 +1692,7 @@ export function setPreset(db:Database, newPres: botPreset){ db.presetRegex = newPres.regex ?? [] db.reasoningEffort = newPres.reasonEffort ?? 0 db.thinkingTokens = newPres.thinkingTokens ?? null + db.outputImageModal = newPres.outputImageModal ?? false return db }
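
Note (editor's illustration, not part of the patch): a minimal standalone sketch of how the new {{inlayeddata::<id>}} marker emitted by the Gemini handlers is expected to round-trip through the replace() callback added to sendChat above. Only the regex and the 'inlayeddata' branch come from the diff; the sample string, variable names, and the placeholder id are illustrative assumptions.

    // Strip inlay markers from a character message while collecting the ids of
    // 'inlayeddata' markers, mirroring the formatedChat.replace(...) callback
    // added in src/ts/process/index.svelte.ts.
    const sampleMessage = 'Here is the generated picture:\n{{inlayeddata::example-inlay-id}}\n'
    const inlays: string[] = []
    const stripped = sampleMessage.replace(/{{(inlay|inlayed|inlayeddata)::(.+?)}}/g,
        (match: string, p1: string, p2: string) => {
            if (p2 && p1 === 'inlayeddata') {
                inlays.push(p2) // keep only the id portion of the marker
            }
            return '' // the marker text itself is removed from the prompt
        })
    // stripped === 'Here is the generated picture:\n\n'
    // inlays   === ['example-inlay-id']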