From 0ca4ec3695f34826520d5de6d5b5acfbd9486e67 Mon Sep 17 00:00:00 2001 From: kwaroran Date: Sat, 6 Jan 2024 00:28:06 +0900 Subject: [PATCH] Add VITS support --- package.json | 1 + pnpm-lock.yaml | 9 +++++++ src/lib/SideBars/CharConfig.svelte | 8 ++---- src/ts/process/embedding/transformers.ts | 33 +++++++++++++++++------- src/ts/process/tts.ts | 4 +++ 5 files changed, 39 insertions(+), 16 deletions(-) diff --git a/package.json b/package.json index ddcb0016..9e7361a3 100644 --- a/package.json +++ b/package.json @@ -64,6 +64,7 @@ "tippy.js": "^6.3.7", "uuid": "^9.0.1", "wasmoon": "^1.15.1", + "wavefile": "^11.0.0", "web-streams-polyfill": "^3.2.1" }, "devDependencies": { diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 6988caa4..6169ce9d 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -155,6 +155,9 @@ dependencies: wasmoon: specifier: ^1.15.1 version: 1.15.1 + wavefile: + specifier: ^11.0.0 + version: 11.0.0 web-streams-polyfill: specifier: ^3.2.1 version: 3.2.1 @@ -5675,6 +5678,12 @@ packages: '@types/emscripten': 1.39.9 dev: false + /wavefile@11.0.0: + resolution: {integrity: sha512-/OBiAALgWU24IG7sC84cDO/KfFuvajWc5Uec0oV2zrpOOZZDgGdOwHwgEzOrwh8jkubBk7PtZfQBIcI1OaE5Ng==} + engines: {node: '>=8'} + hasBin: true + dev: false + /web-streams-polyfill@3.2.1: resolution: {integrity: sha512-e0MO3wdXWKrLbL0DgGnUV7WHVuw9OUvL4hjgnPkIeEvESk74gAITi5G606JtZPp39cd8HA9VQzCIvA49LpPN5Q==} engines: {node: '>= 8'} diff --git a/src/lib/SideBars/CharConfig.svelte b/src/lib/SideBars/CharConfig.svelte index 762a782d..dfaa8240 100644 --- a/src/lib/SideBars/CharConfig.svelte +++ b/src/lib/SideBars/CharConfig.svelte @@ -526,6 +526,7 @@ OpenAI NovelAI Huggingface + VITS @@ -625,12 +626,7 @@ Language {/if} - {#if currentChar.data.ttsMode === 'webspeech' || - currentChar.data.ttsMode === 'elevenlab' || - currentChar.data.ttsMode === 'VOICEVOX' || - currentChar.data.ttsMode === 'huggingface' || - currentChar.data.ttsMode === 'openai' || - currentChar.data.ttsMode === 'novelai'} + {#if currentChar.data.ttsMode}
diff --git a/src/ts/process/embedding/transformers.ts b/src/ts/process/embedding/transformers.ts index cf3b1af8..6cd2e49e 100644 --- a/src/ts/process/embedding/transformers.ts +++ b/src/ts/process/embedding/transformers.ts @@ -1,11 +1,11 @@ -import transformers, { AutoTokenizer, pipeline, type SummarizationOutput } from '@xenova/transformers'; +import {env, AutoTokenizer, pipeline, VitsModel, type SummarizationOutput, type TextGenerationConfig, type TextGenerationOutput, FeatureExtractionPipeline, TextToAudioPipeline } from '@xenova/transformers'; -transformers.env.localModelPath = "https://sv.risuai.xyz/transformers/" +env.localModelPath = "https://sv.risuai.xyz/transformers/" -export const runTransformers = async (baseText:string, model:string,config:transformers.TextGenerationConfig = {}) => { +export const runTransformers = async (baseText:string, model:string,config:TextGenerationConfig = {}) => { let text = baseText let generator = await pipeline('text-generation', model); - let output = await generator(text, config) as transformers.TextGenerationOutput + let output = await generator(text, config) as TextGenerationOutput const outputOne = output[0] return outputOne } @@ -16,7 +16,7 @@ export const runSummarizer = async (text: string) => { return v[0].summary_text } -let extractor:transformers.FeatureExtractionPipeline = null +let extractor:FeatureExtractionPipeline = null export const runEmbedding = async (text: string):Promise => { if(!extractor){ extractor = await pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2'); @@ -59,9 +59,22 @@ export const runEmbedding = async (text: string):Promise => { return (result?.data as Float32Array) ?? null; } -export const runTTS = async (text: string) => { - let speaker_embeddings = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/speaker_embeddings.bin'; - let synthesizer = await pipeline('text-to-speech', 'Xenova/speecht5_tts', { local_files_only: true }); - let out = await synthesizer(text, { speaker_embeddings }); - return out +let synthesizer:TextToAudioPipeline = null +let lastSynth:string = null +export const runVITS = async (text: string, model:string = 'Xenova/mms-tts-eng') => { + const {WaveFile} = await import('wavefile') + if((!synthesizer) || (lastSynth !== model)){ + lastSynth = model + synthesizer = await pipeline('text-to-speech', model); + } + let out = await synthesizer(text, {}); + const wav = new WaveFile(); + wav.fromScratch(1, out.sampling_rate, '32f', out.audio); + const audioContext = new AudioContext(); + audioContext.decodeAudioData(wav.toBuffer().buffer, (decodedData) => { + const sourceNode = audioContext.createBufferSource(); + sourceNode.buffer = decodedData; + sourceNode.connect(audioContext.destination); + sourceNode.start(); + }); } \ No newline at end of file diff --git a/src/ts/process/tts.ts b/src/ts/process/tts.ts index 3a8bb41c..8c29f28a 100644 --- a/src/ts/process/tts.ts +++ b/src/ts/process/tts.ts @@ -5,6 +5,7 @@ import { runTranslator, translateVox } from "../translator/translator"; import { globalFetch } from "../storage/globalApi"; import { language } from "src/lang"; import { getCurrentCharacter, sleep } from "../util"; +import { runVITS } from "./embedding/transformers"; let sourceNode:AudioBufferSourceNode = null @@ -221,6 +222,9 @@ export async function sayTTS(character:character,text:string) { return } } + case 'vits':{ + await runVITS(text) + } } }