Fix Gemini Image Input Functionality (#678)
# PR Checklist

- [ ] Did you check if it works normally in all models? *Ignore this if the change doesn't use models.*
- [x] Did you check if it works normally in all of the web, local, and node-hosted versions? If it doesn't, did you block it in those versions?
- [x] Did you add a type def?

# Description

This PR fixes the image input functionality for the Gemini model.

### Issue with Existing Code

The previous implementation attempted to process images in the following way:

1. In the `OpenAIChat` type, if the `memo` field started with `inlayImage`, the `content` field's value was copied into a variable called `pendingImage`.
2. Later, if the `chat` role was `'user'` and `pendingImage` was not an empty string, the code processed the image.

However, this does not seem to work as expected in its current state.

### Update

I updated the image input handling to align with the method RisuAI already uses for GPT's image input (a simplified sketch of the transformation follows this description). Specifically, the changes include:

1. In `image.ts`, I explicitly specified the `gemini-exp` model.
2. If the `chat` object has a `multimodals` field and the `role` is `user`:
   - I created an array called `geminiParts` to store `GeminiPart` objects.
   - The `chat.content` value is set as the `text` field of the first `GeminiPart` object in the array.
   - I then iterated over `chat.multimodals`, created an object for each `image` type formatted to match the Gemini structure, and added it to the `geminiParts` array.
   - After the iteration, the `geminiParts` array is assigned to the `parts` field of `reformatedChat`.

### Notes

- I removed the previous non-functional code entirely. If this causes any inconvenience or violates any conventions, I sincerely apologize.
- As the final name of the next-generation Gemini model is currently unknown, I restricted the functionality to the **gemini-exp** model in the `image.ts` file for now. This can be updated later once the official name is confirmed.

The Gemini model is currently very widely used, so I kindly request that you review the updated code. If you have any feedback, or if the changes are not acceptable, I completely understand if this PR is rejected.

Thank you for your time and consideration! Let me know if there's anything I can improve or clarify.
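For reference, here is a minimal, self-contained sketch of the transformation described above. The `GeminiPart`/`GeminiChat` shapes, the `ChatLike` input type, and the helper name `toGeminiUserTurn` are simplified stand-ins rather than the project's actual definitions; treat this as an illustration of the approach, not the exact implementation.

```ts
// Simplified stand-ins for the project's types (assumed shapes, not the real defs).
type GeminiPart =
    | { text: string }
    | { inlineData: { mimeType: string; data: string } };

type GeminiChat = { role: 'USER' | 'MODEL'; parts: GeminiPart[] };

interface ChatLike {
    role: 'user' | 'assistant' | 'system';
    content: string;
    multimodals?: { type: string; base64: string }[]; // base64 holds a data URL
}

// Build the Gemini-style parts for a single user turn that carries images.
function toGeminiUserTurn(chat: ChatLike): GeminiChat {
    const geminiParts: GeminiPart[] = [{ text: chat.content }];

    for (const modal of chat.multimodals ?? []) {
        if (modal.type === 'image') {
            // The stored value is a data URL: "data:<mimeType>;base64,<data>"
            const dataurl = modal.base64;
            const base64 = dataurl.split(',')[1];
            const mediaType = dataurl.split(';')[0].split(':')[1];

            geminiParts.push({
                inlineData: {
                    mimeType: mediaType,
                    data: base64,
                },
            });
        }
    }

    return { role: 'USER', parts: geminiParts };
}
```

The diff below applies the same logic inline inside `requestGoogleCloudVertex`, and gates the feature on the `gemini-exp` model prefix in `image.ts`.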
`image.ts`:

```diff
@@ -84,7 +84,7 @@ export async function getInlayImage(id: string){
 
 export function supportsInlayImage(){
     const db = getDatabase()
-    return db.aiModel.startsWith('gptv') || db.aiModel === 'gemini-pro-vision' || db.aiModel.startsWith('claude-3') || db.aiModel.startsWith('gpt4_turbo') || db.aiModel.startsWith('gpt5') || db.aiModel.startsWith('gpt4o') ||
+    return db.aiModel.startsWith('gptv') || db.aiModel === 'gemini-pro-vision' || db.aiModel.startsWith('gemini-exp') || db.aiModel.startsWith('claude-3') || db.aiModel.startsWith('gpt4_turbo') || db.aiModel.startsWith('gpt5') || db.aiModel.startsWith('gpt4o') ||
         (db.aiModel === 'reverse_proxy' && (
             db.proxyRequestModel?.startsWith('gptv') || db.proxyRequestModel === 'gemini-pro-vision' || db.proxyRequestModel?.startsWith('claude-3') || db.proxyRequestModel.startsWith('gpt4_turbo') ||
             db.proxyRequestModel?.startsWith('gpt5') || db.proxyRequestModel?.startsWith('gpt4o') ||
```
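With this change, any model id beginning with `gemini-exp` passes the inlay-image gate. A quick sanity check of the prefix logic (the model ids below are example values, not necessarily ones the app ships with):

```ts
// Hypothetical model ids to illustrate the prefix check used in supportsInlayImage().
const examples = ['gemini-pro-vision', 'gemini-exp-1206', 'gemini-pro'];
for (const id of examples) {
    const supported = id === 'gemini-pro-vision' || id.startsWith('gemini-exp');
    console.log(id, '->', supported); // gemini-pro -> false, the others -> true
}
```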
```diff
@@ -1364,7 +1364,6 @@ async function requestGoogleCloudVertex(arg:RequestDataArgumentExtended):Promise
 
 
     let reformatedChat:GeminiChat[] = []
-    let pendingImage = ''
     let systemPrompt = ''
 
     if(formated[0].role === 'system'){
```
```diff
@@ -1374,10 +1373,7 @@ async function requestGoogleCloudVertex(arg:RequestDataArgumentExtended):Promise
 
     for(let i=0;i<formated.length;i++){
         const chat = formated[i]
-        if(chat.memo && chat.memo.startsWith('inlayImage')){
-            pendingImage = chat.content
-            continue
-        }
         if(i === 0){
             if(chat.role === 'user' || chat.role === 'assistant'){
                 reformatedChat.push({
```
```diff
@@ -1403,7 +1399,34 @@ async function requestGoogleCloudVertex(arg:RequestDataArgumentExtended):Promise
                 chat.role === 'assistant' ? 'MODEL' :
                 chat.role
 
-            if(prevChat.role === qRole){
+            if (chat.multimodals && chat.multimodals.length > 0 && chat.role === "user") {
+                let geminiParts: GeminiPart[] = [];
+
+                geminiParts.push({
+                    text: chat.content,
+                });
+
+                for (const modal of chat.multimodals) {
+                    if (modal.type === "image") {
+                        const dataurl = modal.base64;
+                        const base64 = dataurl.split(",")[1];
+                        const mediaType = dataurl.split(";")[0].split(":")[1];
+
+                        geminiParts.push({
+                            inlineData: {
+                                mimeType: mediaType,
+                                data: base64,
+                            }
+                        });
+                    }
+                }
+
+                reformatedChat.push({
+                    role: "USER",
+                    parts: geminiParts,
+                });
+
+            } else if (prevChat.role === qRole) {
                 reformatedChat[reformatedChat.length-1].parts[0].text += '\n' + chat.content
                 continue
             }
```
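To make the effect of this hunk concrete, this is roughly what one reformatted user turn looks like for a message with a single attached PNG. The text, data URL, and base64 payload below are made-up placeholders:

```ts
// Example input (placeholder values, truncated for readability):
//   chat.content = "What is in this picture?"
//   modal.base64 = "data:image/png;base64,iVBORw0KGgo..."

// Resulting entry pushed onto reformatedChat:
const exampleTurn = {
    role: 'USER',
    parts: [
        { text: 'What is in this picture?' },
        {
            inlineData: {
                mimeType: 'image/png',  // dataurl.split(';')[0].split(':')[1]
                data: 'iVBORw0KGgo...', // dataurl.split(',')[1]
            },
        },
    ],
};
```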
```diff
@@ -1420,36 +1443,7 @@ async function requestGoogleCloudVertex(arg:RequestDataArgumentExtended):Promise
                 })
             }
         }
-        else if(chat.role === 'user' && pendingImage !== ''){
-            //conver image to jpeg so it can be inlined
-            const canv = document.createElement('canvas')
-            const img = new Image()
-            img.src = pendingImage
-            await img.decode()
-            canv.width = img.width
-            canv.height = img.height
-            const ctx = canv.getContext('2d')
-            ctx.drawImage(img, 0, 0)
-            const base64 = canv.toDataURL('image/jpeg').replace(/^data:image\/jpeg;base64,/, "")
-            const mimeType = 'image/jpeg'
-            pendingImage = ''
-            canv.remove()
-            img.remove()
 
-            reformatedChat.push({
-                role: "USER",
-                parts: [
-                    {
-                        text: chat.content,
-                    },
-                    {
-                        inlineData: {
-                            mimeType: mimeType,
-                            data: base64
-                        }
-                    }]
-            })
-        }
         else if(chat.role === 'assistant' || chat.role === 'user'){
             reformatedChat.push({
                 role: chat.role === 'user' ? 'USER' : 'MODEL',
```