Add LoadingStatusState and improve tokenizer functionality

This commit is contained in:
Kwaroran
2024-12-26 06:00:07 +09:00
parent 575eb0505e
commit fe47f58c61
7 changed files with 836867 additions and 38 deletions

View File

@@ -8,6 +8,7 @@ import { tokenizeGGUFModel } from "./process/models/local";
import { globalFetch } from "./globalApi.svelte";
import { getModelInfo, LLMTokenizer } from "./model/modellist";
import { pluginV2 } from "./plugins/plugins";
import type { GemmaTokenizer } from "@huggingface/transformers";
export const tokenizerList = [
@@ -39,7 +40,7 @@ export async function encode(data:string):Promise<(number[]|Uint32Array|Int32Arr
case 'llama3':
return await tokenizeWebTokenizers(data, 'llama')
case 'gemma':
return await tokenizeWebTokenizers(data, 'gemma')
return await gemmaTokenize(data)
case 'cohere':
return await tokenizeWebTokenizers(data, 'cohere')
default:
@@ -65,7 +66,7 @@ export async function encode(data:string):Promise<(number[]|Uint32Array|Int32Arr
case 'llama3':
return await tokenizeWebTokenizers(data, 'llama')
case 'gemma':
return await tokenizeWebTokenizers(data, 'gemma')
return await gemmaTokenize(data)
case 'cohere':
return await tokenizeWebTokenizers(data, 'cohere')
case 'o200k_base':
@@ -105,7 +106,7 @@ export async function encode(data:string):Promise<(number[]|Uint32Array|Int32Arr
return await tokenizeGoogleCloud(data)
}
if(modelInfo.tokenizer === LLMTokenizer.Gemma || modelInfo.tokenizer === LLMTokenizer.GoogleCloud){
return await tokenizeWebTokenizers(data, 'gemma')
return await gemmaTokenize(data)
}
if(modelInfo.tokenizer === LLMTokenizer.Cohere){
return await tokenizeWebTokenizers(data, 'cohere')
@@ -157,6 +158,17 @@ async function tokenizeGoogleCloud(text:string) {
return new Uint32Array(count)
}
// Lazily-created tokenizer instance, reused by every gemmaTokenize call once built.
let gemmaTokenizer:GemmaTokenizer = null
// In-flight initialisation; lets overlapping first calls share a single download.
let gemmaTokenizerLoading:Promise<GemmaTokenizer> = null

/**
 * Encodes `text` with a `GemmaTokenizer` from `@huggingface/transformers`.
 *
 * The tokenizer is built on first use from a tokenizer JSON fetched over HTTP
 * and then cached in `gemmaTokenizer`. Calls that arrive while the first load
 * is still running await the same load instead of starting another one. If the
 * load fails, the pending state is cleared so a later call can retry.
 *
 * @param text - The string to tokenize.
 * @returns Whatever `GemmaTokenizer.encode` returns for `text`.
 * @throws Error when the tokenizer JSON request does not return a success status.
 */
async function gemmaTokenize(text:string) {
    if(!gemmaTokenizer){
        if(!gemmaTokenizerLoading){
            gemmaTokenizerLoading = (async () => {
                try{
                    // Dynamic import keeps the library out of the initial bundle path.
                    const {GemmaTokenizer} = await import('@huggingface/transformers')
                    // NOTE(review): this loads the llama3 tokenizer JSON for a Gemma
                    // tokenizer — confirm this is the intended asset.
                    const response = await fetch("/token/llama/llama3.json")
                    if(!response.ok){
                        // Fail with a clear message rather than an opaque JSON parse error.
                        throw new Error(`Failed to load tokenizer JSON: ${response.status} ${response.statusText}`)
                    }
                    const created = new GemmaTokenizer(await response.json(), {})
                    // Publish the instance before clearing the pending marker so no
                    // caller can observe "not loaded and not loading" after success.
                    gemmaTokenizer = created
                    return created
                } finally {
                    gemmaTokenizerLoading = null
                }
            })()
        }
        gemmaTokenizer = await gemmaTokenizerLoading
    }
    return gemmaTokenizer.encode(text)
}
async function tikJS(text:string, model='cl100k_base') {
if(!tikParser || lastTikModel !== model){
if(model === 'cl100k_base'){