feat: Add gemma tokenizer for gemini models
BIN public/token/gemma/tokenizer.model (new file)
Binary file not shown.
@@ -6,6 +6,7 @@ import type { MultiModal, OpenAIChat } from "./process";
 import { supportsInlayImage } from "./process/files/image";
 import { risuChatParser } from "./parser";
 import { tokenizeGGUFModel } from "./process/models/local";
+import { globalFetch } from "./storage/globalApi";
 
 
 export const tokenizerList = [
@@ -78,11 +79,14 @@ export async function encode(data:string):Promise<(number[]|Uint32Array|Int32Arr
     if(db.aiModel.startsWith('gpt4o')){
         return await tikJS(data, 'o200k_base')
     }
+    if(db.aiModel.startsWith('gemini')){
+        return await tokenizeWebTokenizers(data, 'gemma')
+    }
 
     return await tikJS(data)
 }
 
-type tokenizerType = 'novellist'|'claude'|'novelai'|'llama'|'mistral'|'llama3'
+type tokenizerType = 'novellist'|'claude'|'novelai'|'llama'|'mistral'|'llama3'|'gemma'
 
 let tikParser:Tiktoken = null
 let tokenizersTokenizer:Tokenizer = null
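
For reference, the routing this hunk adds to encode() reads as a small prefix-based dispatch. A minimal self-contained sketch of that routing (the pickTokenizer name and the returned labels are illustrative, not part of the commit):

// Illustrative sketch of the model-to-tokenizer routing above.
// The 'gemma' branch is what this commit adds; the rest is unchanged.
type TokenizerKind = 'o200k_base' | 'cl100k_base' | 'gemma'

function pickTokenizer(aiModel: string): TokenizerKind {
    if (aiModel.startsWith('gpt4o')) return 'o200k_base'  // tiktoken, GPT-4o encoding
    if (aiModel.startsWith('gemini')) return 'gemma'      // local gemma SentencePiece model
    return 'cl100k_base'                                  // tiktoken default
}

pickTokenizer('gemini-pro') // => 'gemma'
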
@@ -116,6 +120,31 @@ async function tikJS(text:string, model='cl100k_base') {
     return tikParser.encode(text)
 }
 
+async function geminiTokenizer(text:string) {
+    const db = get(DataBase)
+    const fetchResult = await globalFetch(`https://generativelanguage.googleapis.com/v1beta/${db.aiModel}:countTextTokens`, {
+        "headers": {
+            "content-type": "application/json",
+            "authorization": `Bearer ${db.google.accessToken}`
+        },
+        "body": JSON.stringify({
+            "prompt": {
+                text: text
+            }
+        }),
+        "method": "POST"
+    })
+
+    if(!fetchResult.ok){
+        //fallback to tiktoken
+        return await tikJS(text)
+    }
+
+    const result = fetchResult.data
+
+    return result.tokenCount ?? 0
+}
+
 async function tokenizeWebTokenizers(text:string, type:tokenizerType) {
     if(type !== tokenizersType || !tokenizersTokenizer){
         const webTokenizer = await import('@mlc-ai/web-tokenizers')
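
The new geminiTokenizer() asks Google's countTextTokens endpoint for a token count and falls back to the local tiktoken path when the request fails; in the hunks shown, encode() routes gemini models to the local gemma tokenizer, so this helper appears to be a remote alternative. A standalone sketch of the same call using plain fetch instead of the app's globalFetch wrapper (model and accessToken are assumed inputs; the request and response shapes mirror the diff above):

// Hypothetical standalone version of the remote token count.
// Returns null on failure so the caller can fall back to a local tokenizer.
async function countTextTokensRemote(model: string, accessToken: string, text: string): Promise<number | null> {
    const res = await fetch(`https://generativelanguage.googleapis.com/v1beta/${model}:countTextTokens`, {
        method: 'POST',
        headers: {
            'content-type': 'application/json',
            'authorization': `Bearer ${accessToken}`
        },
        body: JSON.stringify({ prompt: { text } })
    })
    if (!res.ok) return null
    const data = await res.json()
    return data.tokenCount ?? 0
}
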
@@ -151,6 +180,11 @@ async function tokenizeWebTokenizers(text:string, type:tokenizerType) {
             await (await fetch("/token/mistral/tokenizer.model")
             ).arrayBuffer())
             break
+        case 'gemma':
+            tokenizersTokenizer = await webTokenizer.Tokenizer.fromSentencePiece(
+                await (await fetch("/token/gemma/tokenizer.model")
+                ).arrayBuffer())
+            break
 
     }
     tokenizersType = type
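
The 'gemma' case mirrors the existing mistral branch: fetch the SentencePiece model shipped as public/token/gemma/tokenizer.model and build a tokenizer from it with @mlc-ai/web-tokenizers. A minimal sketch of that flow in isolation, assuming a browser context where /token/... is served statically:

import { Tokenizer } from '@mlc-ai/web-tokenizers'

// Load the gemma SentencePiece model and count tokens with it.
// The real code caches the Tokenizer in tokenizersTokenizer and only
// rebuilds it when the requested tokenizer type changes; this sketch
// rebuilds on every call for simplicity.
async function countGemmaTokens(text: string): Promise<number> {
    const bytes = await (await fetch('/token/gemma/tokenizer.model')).arrayBuffer()
    const tokenizer = await Tokenizer.fromSentencePiece(bytes)
    return tokenizer.encode(text).length
}
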