Refactor multimodal and add claude-3 vision support
@@ -87,6 +87,7 @@ export async function getInlayImage(id: string){
 export function supportsInlayImage(){
     const db = get(DataBase)
     return db.aiModel.startsWith('gptv') || (db.aiModel === 'reverse_proxy' && db.proxyRequestModel?.startsWith('gptv')) || db.aiModel === 'gemini-pro-vision'
+        || db.aiModel.startsWith('claude-3') || db.proxyRequestModel?.startsWith('claude-3')
 }
 
 export async function reencodeImage(img:Uint8Array){
@@ -33,6 +33,14 @@ export interface OpenAIChat{
     name?:string
     removable?:boolean
     attr?:string[]
+    multimodals?: MultiModal[]
+}
+
+export interface MultiModal{
+    type:'image'|'video'
+    base64:string,
+    height?:number,
+    width?:number
 }
 
 export interface OpenAIChatFull extends OpenAIChat{
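Note: a minimal sketch (not part of the commit) of a chat message carrying an inline image under the new MultiModal interface above; the sample data URL and dimensions are invented. Later hunks split the base64 field on ',' and ';', so in practice it holds a full data URL rather than bare base64.

    // Sketch only: an OpenAIChat entry with one attached image.
    const exampleMessage: OpenAIChat = {
        role: 'user',
        content: 'What is in this picture?',
        multimodals: [{
            type: 'image',
            base64: 'data:image/png;base64,iVBORw0KGgo...', // invented sample data URL
            width: 1024,
            height: 768
        }]
    }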
@@ -568,19 +576,19 @@ export async function sendChat(chatProcessIndex = -1,arg:{chatAdditonalTokens?:n
             }
         }
 
+        let multimodal:MultiModal[] = []
         if(inlays.length > 0){
            for(const inlay of inlays){
                 const inlayName = inlay.replace('{{inlay::', '').replace('}}', '')
                 const inlayData = await getInlayImage(inlayName)
                 if(inlayData){
                     if(supportsInlayImage()){
-                        const imgchat = {
-                            role: msg.role === 'user' ? 'user' : 'assistant',
-                            content: inlayData.data,
-                            memo: `inlayImage-${inlayData.height}-${inlayData.width}`,
-                        } as const
-                        chats.push(imgchat)
-                        currentTokens += await tokenizer.tokenizeChat(imgchat)
+                        multimodal.push({
+                            type: 'image',
+                            base64: inlayData.data,
+                            width: inlayData.width,
+                            height: inlayData.height
+                        })
                     }
                 }
                 formatedChat = formatedChat.replace(inlay, '')
@@ -604,7 +612,11 @@ export async function sendChat(chatProcessIndex = -1,arg:{chatAdditonalTokens?:n
             role: msg.role === 'user' ? 'user' : 'assistant',
             content: formatedChat,
             memo: msg.chatId,
-            attr: attr
+            attr: attr,
+            multimodals: multimodal
+        }
+        if(chat.multimodals.length === 0){
+            delete chat.multimodals
         }
         chats.push(chat)
         currentTokens += await tokenizer.tokenizeChat(chat)
@@ -1,5 +1,5 @@
 import { get } from "svelte/store";
-import type { OpenAIChat, OpenAIChatFull } from ".";
+import type { MultiModal, OpenAIChat, OpenAIChatFull } from ".";
 import { DataBase, setDatabase, type character } from "../storage/database";
 import { pluginProcess } from "../plugins/plugins";
 import { language } from "../../lang";
@@ -117,6 +117,7 @@ export interface OpenAIChatExtra {
     name?:string
     removable?:boolean
     attr?:string[]
+    multimodals?:MultiModal[]
 }
 
 
@@ -172,33 +173,29 @@ export async function requestChatDataMain(arg:requestDataArgument, model:'model'
         case 'reverse_proxy':{
             let formatedChat:OpenAIChatExtra[] = []
             if(db.inlayImage){
-                let pendingImages:OpenAIImageContents[] = []
                 for(let i=0;i<formated.length;i++){
                     const m = formated[i]
-                    if(m.memo && m.memo.startsWith('inlayImage')){
-                        pendingImages.push({
-                            "type": "image",
-                            "image_url": {
-                                "url": m.content,
-                                "detail": db.gptVisionQuality
-                            }
+                    if(m.multimodals && m.multimodals.length > 0 && m.role === 'user'){
+                        let v:OpenAIChatExtra = cloneDeep(m)
+                        let contents:OpenAIContents[] = []
+                        for(let j=0;j<m.multimodals.length;j++){
+                            contents.push({
+                                "type": "image",
+                                "image_url": {
+                                    "url": m.multimodals[j].base64,
+                                    "detail": db.gptVisionQuality
+                                }
+                            })
+                        }
+                        contents.push({
+                            "type": "text",
+                            "text": m.content
                         })
+                        v.content = contents
+                        formatedChat.push(v)
                     }
                     else{
-                        if(pendingImages.length > 0 && m.role === 'user'){
-                            let v:OpenAIChatExtra = cloneDeep(m)
-                            let contents:OpenAIContents[] = pendingImages
-                            contents.push({
-                                "type": "text",
-                                "text": m.content
-                            })
-                            v.content = contents
-                            formatedChat.push(v)
-                            pendingImages = []
-                        }
-                        else{
-                            formatedChat.push(m)
-                        }
+                        formatedChat.push(m)
                     }
                 }
             }
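Note: a sketch (not part of the commit) of the per-message payload the reverse_proxy branch now builds for a user message with one inline image. OpenAIContents is referenced but not defined in this diff, so the exact object shape below is an assumption based on how it is used; the values are invented.

    // Assumed result of the loop above for a single image plus the message text.
    const v = {
        role: 'user',
        content: [
            {
                "type": "image",
                "image_url": {
                    "url": "data:image/png;base64,iVBORw0KGgo...", // MultiModal.base64 (a data URL)
                    "detail": "low"                                 // db.gptVisionQuality
                }
            },
            {
                "type": "text",
                "text": "What is in this picture?"
            }
        ]
    }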
@@ -218,6 +215,7 @@ export async function requestChatDataMain(arg:requestDataArgument, model:'model'
             delete formatedChat[i].memo
             delete formatedChat[i].removable
             delete formatedChat[i].attr
+            delete formatedChat[i].multimodals
         }
         if(aiModel === 'reverse_proxy' && db.reverseProxyOobaMode && formatedChat[i].role === 'system'){
             const cont = formatedChat[i].content
@@ -301,6 +299,7 @@ export async function requestChatDataMain(arg:requestDataArgument, model:'model'
             openrouterRequestModel = await getFreeOpenRouterModel()
         }
 
+        console.log(formatedChat)
         if(aiModel.startsWith('mistral')){
             requestModel = aiModel
 
@@ -1414,20 +1413,106 @@ export async function requestChatDataMain(arg:requestDataArgument, model:'model'
                 }
             }
 
+            interface Claude3TextBlock {
+                type: 'text',
+                text: string
+            }
+
+            interface Claude3ImageBlock {
+                type: 'image',
+                source: {
+                    type: 'base64'
+                    media_type: string,
+                    data: string
+                }
+            }
+
+            type Claude3ContentBlock = Claude3TextBlock|Claude3ImageBlock
+
             interface Claude3Chat {
                 role: 'user'|'assistant'
-                content: string
+                content: string|Claude3ContentBlock[]
             }
 
             let claudeChat: Claude3Chat[] = []
             let systemPrompt:string = ''
 
-            const addClaudeChat = (chat:Claude3Chat) => {
+            const addClaudeChat = (chat:{
+                role: 'user'|'assistant'
+                content: string
+            }, multimodals?:MultiModal[]) => {
                 if(claudeChat.length > 0 && claudeChat[claudeChat.length-1].role === chat.role){
-                    claudeChat[claudeChat.length-1].content += "\n\n" + chat.content
+                    let content = claudeChat[claudeChat.length-1].content
+                    if(multimodals && multimodals.length > 0 && !Array.isArray(content)){
+                        content = [{
+                            type: 'text',
+                            text: content
+                        }]
+                    }
+
+                    if(Array.isArray(content)){
+                        let lastContent = content[content.length-1]
+                        if( lastContent?.type === 'text'){
+                            lastContent.text += "\n\n" + chat.content
+                            content[content.length-1] = lastContent
+                        }
+                        else{
+                            content.push({
+                                type: 'text',
+                                text: chat.content
+                            })
+                        }
+
+                        if(multimodals && multimodals.length > 0){
+                            for(const modal of multimodals){
+                                if(modal.type === 'image'){
+                                    const dataurl = modal.base64
+                                    const base64 = dataurl.split(',')[1]
+                                    const mediaType = dataurl.split(';')[0].split(':')[1]
+
+                                    content.unshift({
+                                        type: 'image',
+                                        source: {
+                                            type: 'base64',
+                                            media_type: mediaType,
+                                            data: base64
+                                        }
+                                    })
+                                }
+                            }
+                        }
+                    }
+                    else{
+                        content += "\n\n" + chat.content
+                    }
+                    claudeChat[claudeChat.length-1].content = content
                 }
                 else{
-                    claudeChat.push(chat)
+                    let formatedChat:Claude3Chat = chat
+                    if(multimodals && multimodals.length > 0){
+                        formatedChat.content = [{
+                            type: 'text',
+                            text: chat.content
+                        }]
+                        for(const modal of multimodals){
+                            if(modal.type === 'image'){
+                                const dataurl = modal.base64
+                                const base64 = dataurl.split(',')[1]
+                                const mediaType = dataurl.split(';')[0].split(':')[1]
+
+                                formatedChat.content.unshift({
+                                    type: 'image',
+                                    source: {
+                                        type: 'base64',
+                                        media_type: mediaType,
+                                        data: base64
+                                    }
+                                })
+                            }
+                        }
+
+                    }
+                    claudeChat.push(formatedChat)
                 }
             }
             for(const chat of formated){
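Note: addClaudeChat splits each MultiModal.base64 data URL into a media type and a raw base64 payload, then prepends an image block ahead of the text block, which is the content-block layout Anthropic's Messages API expects. A sketch (not part of the commit) with invented sample values:

    const dataurl = 'data:image/png;base64,iVBORw0KGgo...'    // invented sample
    const base64 = dataurl.split(',')[1]                       // 'iVBORw0KGgo...'
    const mediaType = dataurl.split(';')[0].split(':')[1]      // 'image/png'

    // Shape of the entry the new code pushes into claudeChat:
    const entry: Claude3Chat = {
        role: 'user',
        content: [
            { type: 'image', source: { type: 'base64', media_type: mediaType, data: base64 } },
            { type: 'text', text: 'What is in this picture?' }
        ]
    }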
@@ -1436,14 +1521,14 @@ export async function requestChatDataMain(arg:requestDataArgument, model:'model'
                         addClaudeChat({
                             role: 'user',
                             content: chat.content
-                        })
+                        }, chat.multimodals)
                         break
                     }
                     case 'assistant':{
                         addClaudeChat({
                             role: 'assistant',
                             content: chat.content
-                        })
+                        }, chat.multimodals)
                         break
                     }
                     case 'system':{
@@ -1464,7 +1549,7 @@ export async function requestChatDataMain(arg:requestDataArgument, model:'model'
                 }
             }
         }
+        console.log(claudeChat)
         if(claudeChat.length === 0 && systemPrompt === ''){
             return {
                 type: 'fail',
@@ -2,7 +2,7 @@ import type { Tiktoken } from "@dqbd/tiktoken";
 import type { Tokenizer } from "@mlc-ai/web-tokenizers";
 import { DataBase, type character } from "./storage/database";
 import { get } from "svelte/store";
-import type { OpenAIChat } from "./process";
+import type { MultiModal, OpenAIChat } from "./process";
 import { supportsInlayImage } from "./process/files/image";
 import { risuChatParser } from "./parser";
 import { tokenizeGGUFModel } from "./process/models/local";
@@ -132,53 +132,56 @@ export class ChatTokenizer {
         this.useName = useName
     }
     async tokenizeChat(data:OpenAIChat) {
-        if(data.memo && data.memo.startsWith('inlayImage')){
-            const db = get(DataBase)
-            if(!supportsInlayImage()){
-                return this.chatAdditonalTokens
-            }
-            if(db.gptVisionQuality === 'low'){
-                return 87
-            }
-
-            let encoded = this.chatAdditonalTokens
-            const memo = data.memo.split('-')
-            let height = parseInt(memo[1])
-            let width = parseInt(memo[2])
-
-            if(height === width){
-                if(height > 768){
-                    height = 768
-                    width = 768
-                }
-            }
-            else if(height > width){
-                if(width > 768){
-                    width = 768
-                    height = height * (768 / width)
-                }
-            }
-            else{
-                if(height > 768){
-                    height = 768
-                    width = width * (768 / height)
-                }
-            }
-
-            const chunkSize = Math.ceil(width / 512) * Math.ceil(height / 512)
-            encoded += chunkSize * 2
-            encoded += 85
-
-            return encoded
-        }
-
         let encoded = (await encode(data.content)).length + this.chatAdditonalTokens
         if(data.name && this.useName ==='name'){
             encoded += (await encode(data.name)).length + 1
         }
+        if(data.multimodals && data.multimodals.length > 0){
+            for(const multimodal of data.multimodals){
+                encoded += await this.tokenizeMultiModal(multimodal)
+            }
+        }
         return encoded
     }
 
+    async tokenizeMultiModal(data:MultiModal){
+        const db = get(DataBase)
+        if(!supportsInlayImage()){
+            return this.chatAdditonalTokens
+        }
+        if(db.gptVisionQuality === 'low'){
+            return 87
+        }
+
+        let encoded = this.chatAdditonalTokens
+        let height = data.height ?? 0
+        let width = data.width ?? 0
+
+        if(height === width){
+            if(height > 768){
+                height = 768
+                width = 768
+            }
+        }
+        else if(height > width){
+            if(width > 768){
+                width = 768
+                height = height * (768 / width)
+            }
+        }
+        else{
+            if(height > 768){
+                height = 768
+                width = width * (768 / height)
+            }
+        }
+
+        const chunkSize = Math.ceil(width / 512) * Math.ceil(height / 512)
+        encoded += chunkSize * 2
+        encoded += 85
+
+        return encoded
+    }
 
 }
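Note: tokenizeMultiModal reuses the estimate that was previously inlined in tokenizeChat, but keys it off MultiModal.width and MultiModal.height instead of parsing the memo string. A worked example (not part of the commit) of the arithmetic, assuming a 1024x1024 image and a non-'low' vision quality:

    const chatAdditonalTokens = 3            // invented example value
    let width = 768, height = 768            // 1024x1024 is clamped to 768x768 by the branches above
    const chunkSize = Math.ceil(width / 512) * Math.ceil(height / 512)  // 2 * 2 = 4
    const encoded = chatAdditonalTokens + chunkSize * 2 + 85            // 3 + 8 + 85 = 96
    // With db.gptVisionQuality === 'low' the method returns a flat 87 instead.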