Add model selection for VitsModel
@@ -1,6 +1,11 @@
import {env, AutoTokenizer, pipeline, VitsModel, type SummarizationOutput, type TextGenerationConfig, type TextGenerationOutput, FeatureExtractionPipeline, TextToAudioPipeline } from '@xenova/transformers';
import {env, AutoTokenizer, pipeline, type SummarizationOutput, type TextGenerationConfig, type TextGenerationOutput, FeatureExtractionPipeline, TextToAudioPipeline } from '@xenova/transformers';
import { unzip } from 'fflate';
import { loadAsset, saveAsset } from 'src/ts/storage/globalApi';
import { selectSingleFile } from 'src/ts/util';
import { v4 } from 'uuid';

env.localModelPath = "https://sv.risuai.xyz/transformers/"
env.localModelPath = "/transformers/"
env.remoteHost = "https://sv.risuai.xyz/transformers/"

export const runTransformers = async (baseText:string, model:string,config:TextGenerationConfig = {}) => {
let text = baseText
@@ -61,11 +66,49 @@ export const runEmbedding = async (text: string):Promise<Float32Array> => {
let synthesizer:TextToAudioPipeline = null
let lastSynth:string = null
export const runVITS = async (text: string, model:string = 'Xenova/mms-tts-eng') => {

export interface OnnxModelFiles {
files: {[key:string]:string},
id: string,
name?: string
}

export const runVITS = async (text: string, modelData:string|OnnxModelFiles = 'Xenova/mms-tts-eng') => {
const {WaveFile} = await import('wavefile')
if((!synthesizer) || (lastSynth !== model)){
lastSynth = model
synthesizer = await pipeline('text-to-speech', model);
if(modelData === null){
return
}
if(typeof modelData === 'string'){
if((!synthesizer) || (lastSynth !== modelData)){
lastSynth = modelData
synthesizer = await pipeline('text-to-speech', modelData);
}
}
else{
if((!synthesizer) || (lastSynth !== modelData.id)){
const files = modelData.files
const keys = Object.keys(files)
for(const key of keys){
const hasCache:boolean = (await (await fetch("/sw/check/", {
headers: {
'x-register-url': encodeURIComponent(key)
}
})).json()).able

if(!hasCache){
await fetch("/sw/register/", {
method: "POST",
body: await loadAsset(files[key]),
headers: {
'x-register-url': encodeURIComponent(key),
'x-no-content-type': 'true'
}
})
}
}
lastSynth = modelData.id
synthesizer = await pipeline('text-to-speech', modelData.id);
}
}
let out = await synthesizer(text, {});
const wav = new WaveFile();
@@ -77,4 +120,52 @@ export const runVITS = async (text: string, model:string = 'Xenova/mms-tts-eng')
sourceNode.connect(audioContext.destination);
sourceNode.start();
});
}
}

export const registerOnnxModel = async ():Promise<OnnxModelFiles> => {
const id = v4().replace(/-/g, '')

const modelFile = await selectSingleFile(['zip'])

if(!modelFile){
return
}

const unziped = await new Promise((res, rej) => {unzip(modelFile.data, {
filter: (file) => {
return file.name.endsWith('.onnx') || file.size < 10_000_000 || file.name.includes('.git')
}
}, (err, unzipped) => {
if(err){
rej(err)
}
else{
res(unzipped)
}
})})

console.log(unziped)

let fileIdMapped:{[key:string]:string} = {}

const keys = Object.keys(unziped)
for(let i = 0; i < keys.length; i++){
const key = keys[i]
const file = unziped[key]
const fid = await saveAsset(file)
let url = key
if(url.startsWith('/')){
url = url.substring(1)
}
url = '/transformers/' + id +'/' + url
fileIdMapped[url] = fid
}

return {
files: fileIdMapped,
name: modelFile.name,
id: id,
}

}
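
Taken together, the hunk above introduces a two-step flow: registerOnnxModel unzips a user-supplied model archive into stored assets keyed by /transformers/<id>/ URLs, and runVITS now accepts either a Hugging Face model id or that OnnxModelFiles descriptor, registering each file with the service worker before building the text-to-speech pipeline. A minimal usage sketch follows; the caller function and its import path are illustrative assumptions, and only registerOnnxModel, runVITS, and OnnxModelFiles come from this diff.

// Illustrative sketch, not part of this commit: register a zipped ONNX model
// once, keep the returned descriptor, and hand it to runVITS for synthesis.
// The import path is assumed to match the second file changed in this diff.
import { registerOnnxModel, runVITS, type OnnxModelFiles } from "./embedding/transformers";

export async function speakWithCustomVits(text: string) {
    // Prompts for a .zip, stores its files as assets, and returns
    // { id, name, files } (or undefined if the user cancels).
    const model: OnnxModelFiles | undefined = await registerOnnxModel();
    if (!model) {
        return;
    }
    // A plain model-id string such as 'Xenova/mms-tts-eng' also works here.
    await runVITS(text, model);
}
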
@@ -5,211 +5,171 @@ import { runTranslator, translateVox } from "../translator/translator";
import { globalFetch } from "../storage/globalApi";
import { language } from "src/lang";
import { getCurrentCharacter, sleep } from "../util";
import { runVITS } from "./embedding/transformers";
import { registerOnnxModel, runVITS } from "./embedding/transformers";

let sourceNode:AudioBufferSourceNode = null

export async function sayTTS(character:character,text:string) {
if(!character){
const v = getCurrentCharacter()
if(v.type === 'group'){
return
}
character = v
}

let db = get(DataBase)
text = text.replace(/\*/g,'')

if(character.ttsReadOnlyQuoted){
const matches = text.match(/"(.*?)"/g)
if(matches && matches.length > 0){
text = matches.map(match => match.slice(1, -1)).join("");
}
else{
text = ''
}
}

switch(character.ttsMode){
case "webspeech":{
if(speechSynthesis && SpeechSynthesisUtterance){
const utterThis = new SpeechSynthesisUtterance(text);
const voices = speechSynthesis.getVoices();
let voiceIndex = 0
for(let i=0;i<voices.length;i++){
if(voices[i].name === character.ttsSpeech){
voiceIndex = i
}
}
utterThis.voice = voices[voiceIndex]
const speak = speechSynthesis.speak(utterThis)
try {
if(!character){
const v = getCurrentCharacter()
if(v.type === 'group'){
return
}
break
character = v
}
case "elevenlab": {
const audioContext = new AudioContext();
const da = await fetch(`https://api.elevenlabs.io/v1/text-to-speech/${character.ttsSpeech}`, {
body: JSON.stringify({
text: text
}),
method: "POST",
headers: {
"Content-Type": "application/json",
'xi-api-key': db.elevenLabKey || undefined
}
})
if(da.status >= 200 && da.status < 300){
const audioBuffer = await audioContext.decodeAudioData(await da.arrayBuffer())
sourceNode = audioContext.createBufferSource();
sourceNode.buffer = audioBuffer;
sourceNode.connect(audioContext.destination);
sourceNode.start();

let db = get(DataBase)
text = text.replace(/\*/g,'')

if(character.ttsReadOnlyQuoted){
const matches = text.match(/"(.*?)"/g)
if(matches && matches.length > 0){
text = matches.map(match => match.slice(1, -1)).join("");
}
else{
alertError(await da.text())
text = ''
}
break
}
case "VOICEVOX": {
const jpText = await translateVox(text)
const audioContext = new AudioContext();
const query = await fetch(`${db.voicevoxUrl}/audio_query?text=${jpText}&speaker=${character.ttsSpeech}`, {
method: 'POST',
headers: { "Content-Type": "application/json"},
})
if (query.status == 200){
const queryJson = await query.json();
const bodyData = {
accent_phrases: queryJson.accent_phrases,
speedScale: character.voicevoxConfig.SPEED_SCALE,
pitchScale: character.voicevoxConfig.PITCH_SCALE,
volumeScale: character.voicevoxConfig.VOLUME_SCALE,
intonationScale: character.voicevoxConfig.INTONATION_SCALE,
prePhonemeLength: queryJson.prePhonemeLength,
postPhonemeLength: queryJson.postPhonemeLength,
outputSamplingRate: queryJson.outputSamplingRate,
outputStereo: queryJson.outputStereo,
kana: queryJson.kana,

switch(character.ttsMode){
case "webspeech":{
if(speechSynthesis && SpeechSynthesisUtterance){
const utterThis = new SpeechSynthesisUtterance(text);
const voices = speechSynthesis.getVoices();
let voiceIndex = 0
for(let i=0;i<voices.length;i++){
if(voices[i].name === character.ttsSpeech){
voiceIndex = i
}
}
utterThis.voice = voices[voiceIndex]
const speak = speechSynthesis.speak(utterThis)
}
const getVoice = await fetch(`${db.voicevoxUrl}/synthesis?speaker=${character.ttsSpeech}`, {
break
}
case "elevenlab": {
const audioContext = new AudioContext();
const da = await fetch(`https://api.elevenlabs.io/v1/text-to-speech/${character.ttsSpeech}`, {
body: JSON.stringify({
text: text
}),
method: "POST",
headers: {
"Content-Type": "application/json",
'xi-api-key': db.elevenLabKey || undefined
}
})
if(da.status >= 200 && da.status < 300){
const audioBuffer = await audioContext.decodeAudioData(await da.arrayBuffer())
sourceNode = audioContext.createBufferSource();
sourceNode.buffer = audioBuffer;
sourceNode.connect(audioContext.destination);
sourceNode.start();
}
else{
alertError(await da.text())
}
break
}
case "VOICEVOX": {
const jpText = await translateVox(text)
const audioContext = new AudioContext();
const query = await fetch(`${db.voicevoxUrl}/audio_query?text=${jpText}&speaker=${character.ttsSpeech}`, {
method: 'POST',
headers: { "Content-Type": "application/json"},
body: JSON.stringify(bodyData),
})
if (getVoice.status == 200 && getVoice.headers.get('content-type') === 'audio/wav'){
const audioBuffer = await audioContext.decodeAudioData(await getVoice.arrayBuffer())
sourceNode = audioContext.createBufferSource();
sourceNode.buffer = audioBuffer;
sourceNode.connect(audioContext.destination);
sourceNode.start();
}
}
break
}
case 'openai':{
const key = db.openAIKey
const res = await globalFetch('https://api.openai.com/v1/audio/speech', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
'Authorization': 'Bearer ' + key,
},
body: {
model: 'tts-1',
input: text,
voice: character.oaiVoice,

},
rawResponse: true,
})
const dat = res.data

if(res.ok){
try {
const audio = Buffer.from(dat).buffer
const audioContext = new AudioContext();
const audioBuffer = await audioContext.decodeAudioData(audio)
sourceNode = audioContext.createBufferSource();
sourceNode.buffer = audioBuffer;
sourceNode.connect(audioContext.destination);
sourceNode.start();
} catch (error) {
alertError(language.errors.httpError + `${error}`)
}
}
else{
if(dat.error && dat.error.message){
alertError((language.errors.httpError + `${dat.error.message}`))
}
else{
alertError((language.errors.httpError + `${Buffer.from(res.data).toString()}`))
}
}
break;

}
case 'novelai': {
const audioContext = new AudioContext();
if(text === ''){
break;
}
const encodedText = encodeURIComponent(text);
const encodedSeed = encodeURIComponent(character.naittsConfig.voice);

const url = `https://api.novelai.net/ai/generate-voice?text=${encodedText}&voice=-1&seed=${encodedSeed}&opus=false&version=${character.naittsConfig.version}`;

const response = await globalFetch(url, {
method: 'GET',
headers: {
"Authorization": "Bearer " + db.NAIApiKey,
},
rawResponse: true
});

if (response.ok) {
const audioBuffer = response.data.buffer;
audioContext.decodeAudioData(audioBuffer, (decodedData) => {
const sourceNode = audioContext.createBufferSource();
sourceNode.buffer = decodedData;
sourceNode.connect(audioContext.destination);
sourceNode.start();
});
} else {
alertError("Error fetching or decoding audio data");
}
break;
}
case 'huggingface': {
while(true){
if(character.hfTTS.language !== 'en'){
text = await runTranslator(text, false, 'en', character.hfTTS.language)
}
const audioContext = new AudioContext();
const response = await fetch(`https://api-inference.huggingface.co/models/${character.hfTTS.model}`, {
method: 'POST',
headers: {
"Authorization": "Bearer " + db.huggingfaceKey,
"Content-Type": "application/json",
},
body: JSON.stringify({
inputs: text,
if (query.status == 200){
const queryJson = await query.json();
const bodyData = {
accent_phrases: queryJson.accent_phrases,
speedScale: character.voicevoxConfig.SPEED_SCALE,
pitchScale: character.voicevoxConfig.PITCH_SCALE,
volumeScale: character.voicevoxConfig.VOLUME_SCALE,
intonationScale: character.voicevoxConfig.INTONATION_SCALE,
prePhonemeLength: queryJson.prePhonemeLength,
postPhonemeLength: queryJson.postPhonemeLength,
outputSamplingRate: queryJson.outputSamplingRate,
outputStereo: queryJson.outputStereo,
kana: queryJson.kana,
}
const getVoice = await fetch(`${db.voicevoxUrl}/synthesis?speaker=${character.ttsSpeech}`, {
method: 'POST',
headers: { "Content-Type": "application/json"},
body: JSON.stringify(bodyData),
})
});

if(response.status === 503 && response.headers.get('content-type') === 'application/json'){
const json = await response.json()
if(json.estimated_time){
await sleep(json.estimated_time * 1000)
continue
if (getVoice.status == 200 && getVoice.headers.get('content-type') === 'audio/wav'){
const audioBuffer = await audioContext.decodeAudioData(await getVoice.arrayBuffer())
sourceNode = audioContext.createBufferSource();
sourceNode.buffer = audioBuffer;
sourceNode.connect(audioContext.destination);
sourceNode.start();
}
}
else if(response.status >= 400){
alertError(language.errors.httpError + `${await response.text()}`)
return
break
}
case 'openai':{
const key = db.openAIKey
const res = await globalFetch('https://api.openai.com/v1/audio/speech', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
'Authorization': 'Bearer ' + key,
},
body: {
model: 'tts-1',
input: text,
voice: character.oaiVoice,

},
rawResponse: true,
})
const dat = res.data

if(res.ok){
try {
const audio = Buffer.from(dat).buffer
const audioContext = new AudioContext();
const audioBuffer = await audioContext.decodeAudioData(audio)
sourceNode = audioContext.createBufferSource();
sourceNode.buffer = audioBuffer;
sourceNode.connect(audioContext.destination);
sourceNode.start();
} catch (error) {
alertError(language.errors.httpError + `${error}`)
}
}
else if (response.status === 200) {
const audioBuffer = await response.arrayBuffer();
else{
if(dat.error && dat.error.message){
alertError((language.errors.httpError + `${dat.error.message}`))
}
else{
alertError((language.errors.httpError + `${Buffer.from(res.data).toString()}`))
}
}
break;

}
case 'novelai': {
const audioContext = new AudioContext();
if(text === ''){
break;
}
const encodedText = encodeURIComponent(text);
const encodedSeed = encodeURIComponent(character.naittsConfig.voice);

const url = `https://api.novelai.net/ai/generate-voice?text=${encodedText}&voice=-1&seed=${encodedSeed}&opus=false&version=${character.naittsConfig.version}`;

const response = await globalFetch(url, {
method: 'GET',
headers: {
"Authorization": "Bearer " + db.NAIApiKey,
},
rawResponse: true
});

if (response.ok) {
const audioBuffer = response.data.buffer;
audioContext.decodeAudioData(audioBuffer, (decodedData) => {
const sourceNode = audioContext.createBufferSource();
sourceNode.buffer = decodedData;
@@ -219,12 +179,56 @@ export async function sayTTS(character:character,text:string) {
} else {
alertError("Error fetching or decoding audio data");
}
return
break;
}
}
case 'vits':{
await runVITS(text)
}
case 'huggingface': {
while(true){
if(character.hfTTS.language !== 'en'){
text = await runTranslator(text, false, 'en', character.hfTTS.language)
}
const audioContext = new AudioContext();
const response = await fetch(`https://api-inference.huggingface.co/models/${character.hfTTS.model}`, {
method: 'POST',
headers: {
"Authorization": "Bearer " + db.huggingfaceKey,
"Content-Type": "application/json",
},
body: JSON.stringify({
inputs: text,
})
});

if(response.status === 503 && response.headers.get('content-type') === 'application/json'){
const json = await response.json()
if(json.estimated_time){
await sleep(json.estimated_time * 1000)
continue
}
}
else if(response.status >= 400){
alertError(language.errors.httpError + `${await response.text()}`)
return
}
else if (response.status === 200) {
const audioBuffer = await response.arrayBuffer();
audioContext.decodeAudioData(audioBuffer, (decodedData) => {
const sourceNode = audioContext.createBufferSource();
sourceNode.buffer = decodedData;
sourceNode.connect(audioContext.destination);
sourceNode.start();
});
} else {
alertError("Error fetching or decoding audio data");
}
return
}
}
case 'vits':{
await runVITS(text, character.vits)
}
}
} catch (error) {
alertError(`TTS Error: ${error}`)
}
}
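
The reworked sayTTS wraps the whole switch in a try/catch and, in the 'vits' case, forwards character.vits to runVITS instead of always using the default English model. A hedged sketch of how that field might be populated from the new registerOnnxModel helper follows; the handler name and the structural type standing in for the real character type are assumptions, not part of this commit.

// Hypothetical settings handler, not part of this commit: one way the
// character.vits value read by the 'vits' case above could be set.
import { registerOnnxModel, type OnnxModelFiles } from "./embedding/transformers";

export async function pickVitsModel(char: { vits?: string | OnnxModelFiles }) {
    // User selects a .zip of ONNX weights; its files are persisted as assets.
    const model = await registerOnnxModel();
    if (model) {
        // runVITS accepts either a model-id string or this OnnxModelFiles
        // object, so storing the descriptor on the character is enough.
        char.vits = model;
    }
}
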