Add Llama3 tokenizer support

This commit is contained in:
kwaroran
2024-04-23 23:02:27 +09:00
parent baa0fc07f3
commit f46df1af20
2 changed files with 410513 additions and 1 deletions

410504
public/token/llama/llama3.json Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -14,6 +14,7 @@ export const tokenizerList = [
['novelai', 'NovelAI'],
['claude', 'Claude'],
['llama', 'Llama'],
['llama3', 'Llama3'],
['novellist', 'Novellist'],
] as const
@@ -31,6 +32,8 @@ async function encode(data:string):Promise<(number[]|Uint32Array|Int32Array)>{
return await tokenizeWebTokenizers(data, 'claude')
case 'novellist':
return await tokenizeWebTokenizers(data, 'novellist')
case 'llama3':
    // Pass 'llama3' (not 'llama') so tokenizeWebTokenizers takes its
    // 'llama3' branch, which loads /token/llama/llama3.json. Passing
    // 'llama' here would route Llama3 text to the older Llama tokenizer.
    return await tokenizeWebTokenizers(data, 'llama3')
default:
return await tikJS(data)
}
@@ -71,7 +74,7 @@ async function encode(data:string):Promise<(number[]|Uint32Array|Int32Array)>{
return await tikJS(data)
}
type tokenizerType = 'novellist'|'claude'|'novelai'|'llama'|'mistral'
type tokenizerType = 'novellist'|'claude'|'novelai'|'llama'|'mistral'|'llama3'
let tikParser:Tiktoken = null
let tokenizersTokenizer:Tokenizer = null
@@ -105,6 +108,11 @@ async function tokenizeWebTokenizers(text:string, type:tokenizerType) {
await (await fetch("/token/claude/claude.json")
).arrayBuffer())
break
case 'llama3':
tokenizersTokenizer = await webTokenizer.Tokenizer.fromJSON(
await (await fetch("/token/llama/llama3.json")
).arrayBuffer())
break
case 'novelai':
tokenizersTokenizer = await webTokenizer.Tokenizer.fromSentencePiece(
await (await fetch("/token/nai/nerdstash_v2.model")