Refactor inlay image parsing and add image embedding functionality

kwaroran
2024-03-18 00:05:03 +09:00
parent 1954a79bd0
commit 6e5465356a
4 changed files with 48 additions and 46 deletions

View File

@@ -154,16 +154,13 @@ async function parseAdditionalAssets(data:string, char:simpleCharacterArgument|c
 }
 
 async function parseInlayImages(data:string){
-    const db = get(DataBase)
-    if(db.inlayImage){
-        const inlayMatch = data.match(/{{inlay::(.+?)}}/g)
-        if(inlayMatch){
-            for(const inlay of inlayMatch){
-                const id = inlay.substring(9, inlay.length - 2)
-                const img = await getInlayImage(id)
-                if(img){
-                    data = data.replace(inlay, `<img src="${img.data}"/>`)
-                }
+    const inlayMatch = data.match(/{{inlay::(.+?)}}/g)
+    if(inlayMatch){
+        for(const inlay of inlayMatch){
+            const id = inlay.substring(9, inlay.length - 2)
+            const img = await getInlayImage(id)
+            if(img){
+                data = data.replace(inlay, `<img src="${img.data}"/>`)
             }
         }
     }
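
For reference, the id extraction above relies on the fixed width of the placeholder delimiters: `{{inlay::` is 9 characters and `}}` is 2, so `substring(9, length - 2)` yields the bare asset id. A minimal standalone sketch with a made-up id:

```ts
// Sketch of the slicing used in parseInlayImages; the id "a1b2c3" is invented.
const inlay = '{{inlay::a1b2c3}}'
// '{{inlay::' is 9 characters and '}}' is 2, so this strips both delimiters.
const id = inlay.substring(9, inlay.length - 2)
console.log(id) // "a1b2c3"
```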

View File

@@ -25,6 +25,7 @@ import { sendPeerChar } from "../sync/multiuser";
 import { runInlayScreen } from "./inlayScreen";
 import { runCharacterJS } from "../plugins/embedscript";
 import { addRerolls } from "./prereroll";
+import { runImageEmbedding } from "./transformers";
 
 export interface OpenAIChat{
     role: 'system'|'user'|'assistant'|'function'
@@ -562,16 +563,14 @@ export async function sendChat(chatProcessIndex = -1,arg:{chatAdditonalTokens?:n
                 msg.chatId = v4()
             }
             let inlays:string[] = []
-            if(db.inlayImage){
-                if(msg.role === 'char'){
-                    formatedChat = formatedChat.replace(/{{inlay::(.+?)}}/g, '')
-                }
-                else{
-                    const inlayMatch = formatedChat.match(/{{inlay::(.+?)}}/g)
-                    if(inlayMatch){
-                        for(const inlay of inlayMatch){
-                            inlays.push(inlay)
-                        }
+            if(msg.role === 'char'){
+                formatedChat = formatedChat.replace(/{{inlay::(.+?)}}/g, '')
+            }
+            else{
+                const inlayMatch = formatedChat.match(/{{inlay::(.+?)}}/g)
+                if(inlayMatch){
+                    for(const inlay of inlayMatch){
+                        inlays.push(inlay)
                     }
                 }
             }
@@ -590,6 +589,10 @@ export async function sendChat(chatProcessIndex = -1,arg:{chatAdditonalTokens?:n
                             height: inlayData.height
                         })
                     }
+                    else{
+                        const captionResult = await runImageEmbedding(inlayData.data)
+                        formatedChat += `[${captionResult[0].generated_text}]`
+                    }
                 }
                 formatedChat = formatedChat.replace(inlay, '')
             }
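
In other words, when the selected model cannot accept inline images, each collected inlay is captioned with `runImageEmbedding`, the bracketed caption is appended to the message text, and the placeholder itself is stripped. A rough before/after illustration, with an invented id and caption:

```ts
// Hypothetical illustration of the caption fallback (id and caption text are invented).
const before = 'Look at this: {{inlay::a1b2c3}}'
// After captioning: the placeholder is removed and the caption is appended to the message.
const after  = 'Look at this: [a cat sitting on a window sill]'
```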

View File

@@ -172,35 +172,30 @@ export async function requestChatDataMain(arg:requestDataArgument, model:'model'
         case 'mistral-large-latest':
         case 'reverse_proxy':{
             let formatedChat:OpenAIChatExtra[] = []
-            if(db.inlayImage){
-                for(let i=0;i<formated.length;i++){
-                    const m = formated[i]
-                    if(m.multimodals && m.multimodals.length > 0 && m.role === 'user'){
-                        let v:OpenAIChatExtra = cloneDeep(m)
-                        let contents:OpenAIContents[] = []
-                        for(let j=0;j<m.multimodals.length;j++){
-                            contents.push({
-                                "type": "image",
-                                "image_url": {
-                                    "url": m.multimodals[j].base64,
-                                    "detail": db.gptVisionQuality
-                                }
-                            })
-                        }
-                        contents.push({
-                            "type": "text",
-                            "text": m.content
-                        })
-                        v.content = contents
-                        formatedChat.push(v)
-                    }
-                    else{
-                        formatedChat.push(m)
-                    }
-                }
-            }
-            else{
-                formatedChat = formated
-            }
+            for(let i=0;i<formated.length;i++){
+                const m = formated[i]
+                if(m.multimodals && m.multimodals.length > 0 && m.role === 'user'){
+                    let v:OpenAIChatExtra = cloneDeep(m)
+                    let contents:OpenAIContents[] = []
+                    for(let j=0;j<m.multimodals.length;j++){
+                        contents.push({
+                            "type": "image",
+                            "image_url": {
+                                "url": m.multimodals[j].base64,
+                                "detail": db.gptVisionQuality
+                            }
+                        })
+                    }
+                    contents.push({
+                        "type": "text",
+                        "text": m.content
+                    })
+                    v.content = contents
+                    formatedChat.push(v)
+                }
+                else{
+                    formatedChat.push(m)
+                }
+            }
 
             let oobaSystemPrompts:string[] = []
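
The loop above converts any user message that carries attachments into the OpenAI-style content array: one `image` part per attachment followed by a single `text` part. A hypothetical sketch of one converted message (the base64 value, detail level, and text are placeholders):

```ts
// Sketch of the resulting OpenAIChatExtra shape for one user message with one attachment.
const converted = {
    role: 'user',
    content: [
        {
            "type": "image",
            "image_url": {
                "url": "data:image/png;base64,....",   // m.multimodals[j].base64
                "detail": "high"                        // db.gptVisionQuality
            }
        },
        {
            "type": "text",
            "text": "What is in this picture?"         // m.content
        }
    ]
}
```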

View File

@@ -1,4 +1,4 @@
-import {env, AutoTokenizer, pipeline, type SummarizationOutput, type TextGenerationConfig, type TextGenerationOutput, FeatureExtractionPipeline, TextToAudioPipeline } from '@xenova/transformers';
+import {env, AutoTokenizer, pipeline, type SummarizationOutput, type TextGenerationConfig, type TextGenerationOutput, FeatureExtractionPipeline, TextToAudioPipeline, type ImageToTextOutput } from '@xenova/transformers';
 import { unzip } from 'fflate';
 import { globalFetch, loadAsset, saveAsset } from 'src/ts/storage/globalApi';
 import { selectSingleFile } from 'src/ts/util';
@@ -93,6 +93,13 @@ export const runEmbedding = async (text: string):Promise<Float32Array> => {
     return (result?.data as Float32Array) ?? null;
 }
 
+export const runImageEmbedding = async (dataurl:string) => {
+    await initTransformers()
+    const captioner = await pipeline('image-to-text', 'Xenova/vit-gpt2-image-captioning');
+    const output = await captioner(dataurl)
+    return output as ImageToTextOutput
+}
+
 let synthesizer:TextToAudioPipeline = null
 let lastSynth:string = null
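
`runImageEmbedding` wraps an `@xenova/transformers` image-to-text pipeline (`Xenova/vit-gpt2-image-captioning`); its `ImageToTextOutput` result is an array of `{ generated_text }` entries, which is why the chat code reads `captionResult[0].generated_text`. A minimal usage sketch, assuming it is called from an async context and using a truncated placeholder data URL:

```ts
// Minimal usage sketch of runImageEmbedding; the data URL is a placeholder, not real image data.
const dataurl = 'data:image/png;base64,iVBORw0KGgo...'
const result = await runImageEmbedding(dataurl)
// The image-to-text pipeline returns an array of { generated_text: string } entries.
console.log(result[0].generated_text) // e.g. "a cat sitting on a window sill"
```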