Refactor multimodal and add claude-3 vision support

This commit is contained in:
kwaroran
2024-03-17 23:48:24 +09:00
parent 859069f112
commit dbe1a45317
4 changed files with 180 additions and 79 deletions

View File

@@ -87,6 +87,7 @@ export async function getInlayImage(id: string){
/**
 * Reports whether the currently selected model can receive inlay images.
 *
 * The model families recognised are names starting with `gptv` or
 * `claude-3`, plus the exact name `gemini-pro-vision`. When the reverse proxy
 * is the selected model, the proxied model name is inspected instead; the
 * proxied name is ignored for every other selection so that a leftover proxy
 * setting cannot enable images for an unrelated model.
 *
 * @returns `true` when images may be attached to the request, otherwise `false`.
 */
export function supportsInlayImage(): boolean {
    const db = get(DataBase)
    const visionPrefixes = ['gptv', 'claude-3']
    const isVisionFamily = (model?: string | null): boolean =>
        visionPrefixes.some((prefix) => model?.startsWith(prefix) === true)

    if(db.aiModel === 'reverse_proxy'){
        // Only the model behind the proxy decides; it may be unset.
        return isVisionFamily(db.proxyRequestModel)
    }
    return isVisionFamily(db.aiModel) || db.aiModel === 'gemini-pro-vision'
}
export async function reencodeImage(img:Uint8Array){ export async function reencodeImage(img:Uint8Array){

View File

@@ -33,6 +33,14 @@ export interface OpenAIChat{
name?:string name?:string
removable?:boolean removable?:boolean
attr?:string[] attr?:string[]
multimodals?: MultiModal[]
}
/**
 * One non-text attachment carried alongside a chat message's text content.
 */
export interface MultiModal{
// Media kind of this attachment.
type:'image'|'video'
// Encoded payload. The Claude request builder in this change splits it on ',' and ';',
// so it is presumably a `data:<mime>;base64,<data>` URL — TODO confirm against getInlayImage.
base64:string,
// Image height (presumably pixels); the tokenizer substitutes 0 when it is absent.
height?:number,
// Image width (presumably pixels); the tokenizer substitutes 0 when it is absent.
width?:number
} }
export interface OpenAIChatFull extends OpenAIChat{ export interface OpenAIChatFull extends OpenAIChat{
@@ -568,19 +576,19 @@ export async function sendChat(chatProcessIndex = -1,arg:{chatAdditonalTokens?:n
} }
} }
let multimodal:MultiModal[] = []
if(inlays.length > 0){ if(inlays.length > 0){
for(const inlay of inlays){ for(const inlay of inlays){
const inlayName = inlay.replace('{{inlay::', '').replace('}}', '') const inlayName = inlay.replace('{{inlay::', '').replace('}}', '')
const inlayData = await getInlayImage(inlayName) const inlayData = await getInlayImage(inlayName)
if(inlayData){ if(inlayData){
if(supportsInlayImage()){ if(supportsInlayImage()){
const imgchat = { multimodal.push({
role: msg.role === 'user' ? 'user' : 'assistant', type: 'image',
content: inlayData.data, base64: inlayData.data,
memo: `inlayImage-${inlayData.height}-${inlayData.width}`, width: inlayData.width,
} as const height: inlayData.height
chats.push(imgchat) })
currentTokens += await tokenizer.tokenizeChat(imgchat)
} }
} }
formatedChat = formatedChat.replace(inlay, '') formatedChat = formatedChat.replace(inlay, '')
@@ -604,7 +612,11 @@ export async function sendChat(chatProcessIndex = -1,arg:{chatAdditonalTokens?:n
role: msg.role === 'user' ? 'user' : 'assistant', role: msg.role === 'user' ? 'user' : 'assistant',
content: formatedChat, content: formatedChat,
memo: msg.chatId, memo: msg.chatId,
attr: attr attr: attr,
multimodals: multimodal
}
if(chat.multimodals.length === 0){
delete chat.multimodals
} }
chats.push(chat) chats.push(chat)
currentTokens += await tokenizer.tokenizeChat(chat) currentTokens += await tokenizer.tokenizeChat(chat)

View File

@@ -1,5 +1,5 @@
import { get } from "svelte/store"; import { get } from "svelte/store";
import type { OpenAIChat, OpenAIChatFull } from "."; import type { MultiModal, OpenAIChat, OpenAIChatFull } from ".";
import { DataBase, setDatabase, type character } from "../storage/database"; import { DataBase, setDatabase, type character } from "../storage/database";
import { pluginProcess } from "../plugins/plugins"; import { pluginProcess } from "../plugins/plugins";
import { language } from "../../lang"; import { language } from "../../lang";
@@ -117,6 +117,7 @@ export interface OpenAIChatExtra {
name?:string name?:string
removable?:boolean removable?:boolean
attr?:string[] attr?:string[]
multimodals?:MultiModal[]
} }
@@ -172,33 +173,29 @@ export async function requestChatDataMain(arg:requestDataArgument, model:'model'
case 'reverse_proxy':{ case 'reverse_proxy':{
let formatedChat:OpenAIChatExtra[] = [] let formatedChat:OpenAIChatExtra[] = []
if(db.inlayImage){ if(db.inlayImage){
let pendingImages:OpenAIImageContents[] = []
for(let i=0;i<formated.length;i++){ for(let i=0;i<formated.length;i++){
const m = formated[i] const m = formated[i]
if(m.memo && m.memo.startsWith('inlayImage')){ if(m.multimodals && m.multimodals.length > 0 && m.role === 'user'){
pendingImages.push({ let v:OpenAIChatExtra = cloneDeep(m)
"type": "image", let contents:OpenAIContents[] = []
"image_url": { for(let j=0;j<m.multimodals.length;j++){
"url": m.content, contents.push({
"detail": db.gptVisionQuality "type": "image",
} "image_url": {
"url": m.multimodals[j].base64,
"detail": db.gptVisionQuality
}
})
}
contents.push({
"type": "text",
"text": m.content
}) })
v.content = contents
formatedChat.push(v)
} }
else{ else{
if(pendingImages.length > 0 && m.role === 'user'){ formatedChat.push(m)
let v:OpenAIChatExtra = cloneDeep(m)
let contents:OpenAIContents[] = pendingImages
contents.push({
"type": "text",
"text": m.content
})
v.content = contents
formatedChat.push(v)
pendingImages = []
}
else{
formatedChat.push(m)
}
} }
} }
} }
@@ -218,6 +215,7 @@ export async function requestChatDataMain(arg:requestDataArgument, model:'model'
delete formatedChat[i].memo delete formatedChat[i].memo
delete formatedChat[i].removable delete formatedChat[i].removable
delete formatedChat[i].attr delete formatedChat[i].attr
delete formatedChat[i].multimodals
} }
if(aiModel === 'reverse_proxy' && db.reverseProxyOobaMode && formatedChat[i].role === 'system'){ if(aiModel === 'reverse_proxy' && db.reverseProxyOobaMode && formatedChat[i].role === 'system'){
const cont = formatedChat[i].content const cont = formatedChat[i].content
@@ -301,6 +299,7 @@ export async function requestChatDataMain(arg:requestDataArgument, model:'model'
openrouterRequestModel = await getFreeOpenRouterModel() openrouterRequestModel = await getFreeOpenRouterModel()
} }
console.log(formatedChat)
if(aiModel.startsWith('mistral')){ if(aiModel.startsWith('mistral')){
requestModel = aiModel requestModel = aiModel
@@ -1414,20 +1413,106 @@ export async function requestChatDataMain(arg:requestDataArgument, model:'model'
} }
} }
/** Text segment of a Claude 3 message. */
interface Claude3TextBlock {
    type: 'text',
    text: string
}

/** Inline base64-encoded image segment of a Claude 3 message. */
interface Claude3ImageBlock {
    type: 'image',
    source: {
        type: 'base64'
        media_type: string,
        data: string
    }
}

type Claude3ContentBlock = Claude3TextBlock|Claude3ImageBlock

/** One turn of the conversation; content is plain text or a list of blocks. */
interface Claude3Chat {
    role: 'user'|'assistant'
    content: string|Claude3ContentBlock[]
}

let claudeChat: Claude3Chat[] = []
let systemPrompt:string = ''

/**
 * Converts the image attachments of a message into Claude image blocks,
 * keeping the order in which they were attached. Non-image attachments are
 * ignored.
 */
const toClaudeImageBlocks = (multimodals?:MultiModal[]): Claude3ImageBlock[] => {
    const blocks:Claude3ImageBlock[] = []
    for(const modal of multimodals ?? []){
        if(modal.type !== 'image'){
            continue
        }
        // Payload is split as `data:<media type>;base64,<data>`.
        const dataurl = modal.base64
        const base64 = dataurl.split(',')[1]
        const mediaType = dataurl.split(';')[0].split(':')[1]
        if(!base64 || !mediaType){
            // Without that shape there is no usable payload; skip it rather
            // than emit a block whose fields are undefined.
            continue
        }
        blocks.push({
            type: 'image',
            source: {
                type: 'base64',
                media_type: mediaType,
                data: base64
            }
        })
    }
    return blocks
}

/**
 * Appends a message to `claudeChat`, merging it into the previous entry when
 * both have the same role.
 *
 * Block-list content is kept in the shape `[...images, text]`: merged text is
 * joined with a blank line into the trailing text block, and new images are
 * inserted directly before that block, so images stay in the order they were
 * attached.
 *
 * @param chat role and text of the message to add
 * @param multimodals optional attachments belonging to the message
 */
const addClaudeChat = (chat:{
    role: 'user'|'assistant'
    content: string
}, multimodals?:MultiModal[]) => {
    const hasAttachments = (multimodals?.length ?? 0) > 0
    const images = toClaudeImageBlocks(multimodals)
    const previous = claudeChat[claudeChat.length-1]

    if(previous === undefined || previous.role !== chat.role){
        // New turn: plain string unless attachments require block content.
        claudeChat.push({
            role: chat.role,
            content: hasAttachments
                ? [...images, { type: 'text', text: chat.content }]
                : chat.content
        })
        return
    }

    let content = previous.content
    if(!Array.isArray(content)){
        if(!hasAttachments){
            previous.content = content + "\n\n" + chat.content
            return
        }
        // Upgrade the existing plain text to block form before adding images.
        content = [{
            type: 'text',
            text: content
        }]
    }

    const tail = content[content.length-1]
    if(tail?.type === 'text'){
        tail.text += "\n\n" + chat.content
    }
    else{
        content.push({
            type: 'text',
            text: chat.content
        })
    }
    // The last block is now the text block; new images go right before it.
    content.splice(content.length - 1, 0, ...images)
    previous.content = content
}
for(const chat of formated){ for(const chat of formated){
@@ -1436,14 +1521,14 @@ export async function requestChatDataMain(arg:requestDataArgument, model:'model'
addClaudeChat({ addClaudeChat({
role: 'user', role: 'user',
content: chat.content content: chat.content
}) }, chat.multimodals)
break break
} }
case 'assistant':{ case 'assistant':{
addClaudeChat({ addClaudeChat({
role: 'assistant', role: 'assistant',
content: chat.content content: chat.content
}) }, chat.multimodals)
break break
} }
case 'system':{ case 'system':{
@@ -1464,7 +1549,7 @@ export async function requestChatDataMain(arg:requestDataArgument, model:'model'
} }
} }
} }
console.log(claudeChat)
if(claudeChat.length === 0 && systemPrompt === ''){ if(claudeChat.length === 0 && systemPrompt === ''){
return { return {
type: 'fail', type: 'fail',

View File

@@ -2,7 +2,7 @@ import type { Tiktoken } from "@dqbd/tiktoken";
import type { Tokenizer } from "@mlc-ai/web-tokenizers"; import type { Tokenizer } from "@mlc-ai/web-tokenizers";
import { DataBase, type character } from "./storage/database"; import { DataBase, type character } from "./storage/database";
import { get } from "svelte/store"; import { get } from "svelte/store";
import type { OpenAIChat } from "./process"; import type { MultiModal, OpenAIChat } from "./process";
import { supportsInlayImage } from "./process/files/image"; import { supportsInlayImage } from "./process/files/image";
import { risuChatParser } from "./parser"; import { risuChatParser } from "./parser";
import { tokenizeGGUFModel } from "./process/models/local"; import { tokenizeGGUFModel } from "./process/models/local";
@@ -132,53 +132,56 @@ export class ChatTokenizer {
this.useName = useName this.useName = useName
} }
/**
 * Estimates the token cost of a single chat message.
 *
 * The total is the encoded length of the content plus the per-message
 * overhead, plus the encoded name (and one extra token) when names are
 * tokenized, plus the cost of every attachment on the message.
 *
 * @param data message to measure
 * @returns estimated token count
 */
async tokenizeChat(data:OpenAIChat) {
    const contentTokens = await encode(data.content)
    let total = contentTokens.length + this.chatAdditonalTokens

    if(this.useName === 'name' && data.name){
        const nameTokens = await encode(data.name)
        total += nameTokens.length + 1
    }

    // A message without attachments contributes nothing here.
    for(const attachment of data.multimodals ?? []){
        total += await this.tokenizeMultiModal(attachment)
    }
    return total
}
/**
 * Estimates the token cost of one attachment.
 *
 * Returns only the per-message overhead when the selected model does not
 * support inlay images, and a flat 87 when the vision quality setting is
 * 'low'. Otherwise the image is scaled down so that its shorter side is at
 * most 768 (aspect ratio preserved), and the cost is derived from the number
 * of 512x512 tiles needed to cover it.
 *
 * @param data attachment to measure; a missing width or height counts as 0
 * @returns estimated token count
 */
async tokenizeMultiModal(data:MultiModal){
    const db = get(DataBase)
    if(!supportsInlayImage()){
        return this.chatAdditonalTokens
    }
    if(db.gptVisionQuality === 'low'){
        return 87
    }

    let encoded = this.chatAdditonalTokens
    let height = data.height ?? 0
    let width = data.width ?? 0

    // Scale only when the shorter side exceeds 768. The longer side is
    // computed from the ORIGINAL shorter side before that side is overwritten;
    // assigning 768 first makes the ratio 768/768 and leaves the longer side
    // unscaled. Multiplying before dividing keeps exact results exact.
    if(height > 768 && width > 768){
        if(height >= width){
            height = (height * 768) / width
            width = 768
        }
        else{
            width = (width * 768) / height
            height = 768
        }
    }

    const chunkSize = Math.ceil(width / 512) * Math.ceil(height / 512)
    // NOTE(review): OpenAI's published formula charges 170 tokens per tile;
    // the existing per-tile multiplier is kept unchanged here — confirm intent.
    encoded += chunkSize * 2
    encoded += 85
    return encoded
}
} }