Add Gpt-SoVITS TTS, change google translate lang to auto (#600)

# PR Checklist
- [ v ] Did you check if it works normally in all models? *ignore this
when it doesn't use models*
- [ v ] Did you check if it works normally in all of web, local and node
hosted versions? If it doesn't, did you block it in those versions?
- [ v ] Did you add a type def?

# Description

Add a connection to the GPT-SoVITS API through an ngrok tunnel.

ttsReadOnlyQuoted now also supports Japanese quotation marks (「」).

Also change the Google Translate API source language to auto (experimental).
This commit is contained in:
kwaroran
2024-08-24 19:56:56 +09:00
committed by GitHub
4 changed files with 212 additions and 14 deletions

View File

@@ -9,7 +9,7 @@
import LoreBook from "./LoreBook/LoreBookSetting.svelte"; import LoreBook from "./LoreBook/LoreBookSetting.svelte";
import { alertConfirm, alertMd, alertNormal, alertSelectChar, alertTOS, showHypaV2Alert } from "../../ts/alert"; import { alertConfirm, alertMd, alertNormal, alertSelectChar, alertTOS, showHypaV2Alert } from "../../ts/alert";
import BarIcon from "./BarIcon.svelte"; import BarIcon from "./BarIcon.svelte";
import { findCharacterbyId, getAuthorNoteDefaultText, parseKeyValue, selectMultipleFile } from "../../ts/util"; import { findCharacterbyId, getAuthorNoteDefaultText, parseKeyValue, selectMultipleFile, selectSingleFile } from "../../ts/util";
import { onDestroy } from "svelte"; import { onDestroy } from "svelte";
import {isEqual} from 'lodash' import {isEqual} from 'lodash'
import Help from "../Others/Help.svelte"; import Help from "../Others/Help.svelte";
@@ -29,9 +29,11 @@
import { updateInlayScreen } from "src/ts/process/inlayScreen"; import { updateInlayScreen } from "src/ts/process/inlayScreen";
import { registerOnnxModel } from "src/ts/process/transformers"; import { registerOnnxModel } from "src/ts/process/transformers";
import MultiLangInput from "../UI/GUI/MultiLangInput.svelte"; import MultiLangInput from "../UI/GUI/MultiLangInput.svelte";
import { applyModule } from "src/ts/process/modules"; import { applyModule } from "src/ts/process/modules";
import { exportRegex, importRegex } from "src/ts/process/scripts"; import { exportRegex, importRegex } from "src/ts/process/scripts";
import Arcodion from "../UI/Arcodion.svelte"; import Arcodion from "../UI/Arcodion.svelte";
import SliderInput from "../UI/GUI/SliderInput.svelte";
let subMenu = 0 let subMenu = 0
@@ -105,6 +107,12 @@
} }
emos = currentChar.data.emotionImages emos = currentChar.data.emotionImages
currentChar = currentChar currentChar = currentChar
// For a character in GPT-SoVITS mode that already has a config, drop the stored
// reference-audio script whenever "use_prompt" is switched off.
if (currentChar.data.ttsMode === 'gptsovits' && (currentChar.data as character).gptSoVitsConfig) {
if (!(currentChar.data as character).gptSoVitsConfig.use_prompt) {
(currentChar.data as character).gptSoVitsConfig.prompt = undefined
}
}
}) })
let assetFileExtensions:string[] = [] let assetFileExtensions:string[] = []
@@ -149,6 +157,27 @@
version: 'v2' version: 'v2'
}; };
} }
// Reactive statement: when the TTS mode is 'gptsovits' and the character has no
// GPT-SoVITS config yet, create one populated with the defaults below.
$: if (currentChar.data.ttsMode === 'gptsovits' && (currentChar.data as character).gptSoVitsConfig === undefined) {
(currentChar.data as character).gptSoVitsConfig = {
url: '',
// NOTE(review): Windows-style placeholder path; presumably meant to be replaced by the user — confirm.
ref_audio_path: 'C:/Users/user/Downloads/GPT-SoVITS-v2-240821',
// No reference clip selected yet.
ref_audio_data: {
fileName: '',
assetId: ''
},
volume: 1.0,
text_lang: 'auto',
// NOTE(review): purpose of this field is not evident from the visible code — confirm it is still needed.
text: 'en',
use_prompt: false,
prompt_lang: 'en',
top_p: 1,
temperature: 0.7,
speed: 1,
top_k: 5,
text_split_method: 'cut0',
};
}
</script> </script>
{#if licensed !== 'private'} {#if licensed !== 'private'}
@@ -678,7 +707,7 @@
{#if currentChar.type === 'character'} {#if currentChar.type === 'character'}
<h2 class="mb-2 text-2xl font-bold mt-2">TTS</h2> <h2 class="mb-2 text-2xl font-bold mt-2">TTS</h2>
<span class="text-textcolor">{language.provider}</span> <span class="text-textcolor">{language.provider}</span>
<SelectInput className="mb-4 mt-2" bind:value={currentChar.data.ttsMode} on:change={() => { <SelectInput className="mb-4 mt-2" bind:value={currentChar.data.ttsMode} on:change={(e) => {
if(currentChar.type === 'character'){ if(currentChar.type === 'character'){
currentChar.data.ttsSpeech = '' currentChar.data.ttsSpeech = ''
} }
@@ -691,6 +720,7 @@
<OptionInput value="novelai">NovelAI</OptionInput> <OptionInput value="novelai">NovelAI</OptionInput>
<OptionInput value="huggingface">Huggingface</OptionInput> <OptionInput value="huggingface">Huggingface</OptionInput>
<OptionInput value="vits">VITS</OptionInput> <OptionInput value="vits">VITS</OptionInput>
<OptionInput value="gptsovits">GPT-SoVITS</OptionInput>
</SelectInput> </SelectInput>
@@ -774,23 +804,20 @@
<OptionInput value="v1">v1</OptionInput> <OptionInput value="v1">v1</OptionInput>
<OptionInput value="v2">v2</OptionInput> <OptionInput value="v2">v2</OptionInput>
</SelectInput> </SelectInput>
{/if} {:else if currentChar.data.ttsMode === 'openai'}
{#if currentChar.data.ttsMode === 'openai'}
<SelectInput className="mb-4 mt-2" bind:value={currentChar.data.oaiVoice}> <SelectInput className="mb-4 mt-2" bind:value={currentChar.data.oaiVoice}>
<OptionInput value="">Unset</OptionInput> <OptionInput value="">Unset</OptionInput>
{#each oaiVoices as voice} {#each oaiVoices as voice}
<OptionInput value={voice}>{voice}</OptionInput> <OptionInput value={voice}>{voice}</OptionInput>
{/each} {/each}
</SelectInput> </SelectInput>
{/if} {:else if currentChar.data.ttsMode === 'huggingface'}
{#if currentChar.data.ttsMode === 'huggingface'}
<span class="text-textcolor">Model</span> <span class="text-textcolor">Model</span>
<TextInput className="mb-4 mt-2" bind:value={currentChar.data.hfTTS.model} /> <TextInput className="mb-4 mt-2" bind:value={currentChar.data.hfTTS.model} />
<span class="text-textcolor">Language</span> <span class="text-textcolor">Language</span>
<TextInput className="mb-4 mt-2" bind:value={currentChar.data.hfTTS.language} placeholder="en" /> <TextInput className="mb-4 mt-2" bind:value={currentChar.data.hfTTS.language} placeholder="en" />
{/if} {:else if currentChar.data.ttsMode === 'vits'}
{#if currentChar.data.ttsMode === 'vits'}
{#if currentChar.data.vits} {#if currentChar.data.vits}
<span class="text-textcolor">{currentChar.data.vits.name ?? 'Unnamed VitsModel'}</span> <span class="text-textcolor">{currentChar.data.vits.name ?? 'Unnamed VitsModel'}</span>
{:else} {:else}
@@ -802,6 +829,99 @@
currentChar.data.vits = model currentChar.data.vits = model
} }
}}>{language.selectModel}</Button> }}>{language.selectModel}</Button>
{:else if currentChar.data.ttsMode === 'gptsovits'}
<span class="text-textcolor">Volume</span>
<SliderInput min={0.0} max={1.0} step={0.01} fixed={2} bind:value={currentChar.data.gptSoVitsConfig.volume}/>
<span class="text-textcolor">URL</span>
<TextInput className="mb-4 mt-2" bind:value={currentChar.data.gptSoVitsConfig.url}/>
<span class="text-textcolor">Reference Audio Path (e.g. C:/Users/user/Downloads/GPT-SoVITS-v2-240821)</span>
<TextInput className="mb-4 mt-2" bind:value={currentChar.data.gptSoVitsConfig.ref_audio_path}/>
<span class="text-textcolor">Reference Audio Data (3~10s audio file)</span>
<Button on:click={async () => {
const audio = await selectSingleFile([
'wav',
'ogg',
'aac'
])
if(!audio){
return null
}
const saveId = await saveAsset(audio.data)
// @ts-expect-error not groupChat
currentChar.data.gptSoVitsConfig.ref_audio_data = {
fileName: audio.name,
assetId: saveId
}
}}
className="h-10">
{#if currentChar.data.gptSoVitsConfig.ref_audio_data.assetId === '' || currentChar.data.gptSoVitsConfig.ref_audio_data.assetId === undefined}
Select File
{:else}
{currentChar.data.gptSoVitsConfig.ref_audio_data.fileName}
{/if}
</Button>
<span class="text-textcolor">Text Language</span>
<SelectInput className="mb-4 mt-2" bind:value={currentChar.data.gptSoVitsConfig.text_lang}>
<OptionInput value="auto">Auto</OptionInput>
<OptionInput value="auto_yue">Auto (Cantonese)</OptionInput>
<OptionInput value="en">English</OptionInput>
<OptionInput value="zh">Chinese</OptionInput>
<OptionInput value="ja">Japanese</OptionInput>
<OptionInput value="yue">Cantonese</OptionInput>
<OptionInput value="ko">Korean</OptionInput>
<OptionInput value="all_zh">All Chinese</OptionInput>
<OptionInput value="all_ja">All Japanese</OptionInput>
<OptionInput value="all_yue">All Cantonese</OptionInput>
<OptionInput value="all_ko">All Korean</OptionInput>
</SelectInput>
<span class="text-textcolor">Use Reference Audio Script</span>
<Check bind:check={currentChar.data.gptSoVitsConfig.use_prompt}/>
{#if currentChar.data.gptSoVitsConfig.use_prompt}
<span class="text-textcolor">Reference Audio Script</span>
<TextAreaInput className="mb-4 mt-2" bind:value={currentChar.data.gptSoVitsConfig.prompt}/>
{/if}
<span class="text-textcolor">Reference Audio Language</span>
<!-- Language of the reference audio script. The "all_*" codes mean the whole text is
     treated as that single language, so they are labelled "All …" here. -->
<SelectInput className="mb-4 mt-2" bind:value={currentChar.data.gptSoVitsConfig.prompt_lang}>
<OptionInput value="auto">Auto</OptionInput>
<OptionInput value="auto_yue">Auto (Cantonese)</OptionInput>
<OptionInput value="en">English</OptionInput>
<OptionInput value="zh">Chinese</OptionInput>
<OptionInput value="ja">Japanese</OptionInput>
<OptionInput value="yue">Cantonese</OptionInput>
<OptionInput value="ko">Korean</OptionInput>
<OptionInput value="all_zh">All Chinese</OptionInput>
<OptionInput value="all_ja">All Japanese</OptionInput>
<OptionInput value="all_yue">All Cantonese</OptionInput>
<OptionInput value="all_ko">All Korean</OptionInput>
</SelectInput>
<span class="text-textcolor">Top P</span>
<SliderInput min={0.0} max={1.0} step={0.05} fixed={2} bind:value={currentChar.data.gptSoVitsConfig.top_p}/>
<span class="text-textcolor">Temperature</span>
<SliderInput min={0.0} max={1.0} step={0.05} fixed={2} bind:value={currentChar.data.gptSoVitsConfig.temperature}/>
<span class="text-textcolor">Speed</span>
<SliderInput min={0.6} max={1.65} step={0.05} fixed={2} bind:value={currentChar.data.gptSoVitsConfig.speed}/>
<span class="text-textcolor">Top K</span>
<SliderInput min={1} max={100} step={1} bind:value={currentChar.data.gptSoVitsConfig.top_k}/>
<span class="text-textcolor">Text Split Method</span>
<SelectInput className="mb-4 mt-2" bind:value={currentChar.data.gptSoVitsConfig.text_split_method}>
<OptionInput value="cut0">Cut 0 (No splitting)</OptionInput>
<OptionInput value="cut1">Cut 1 (Split every 4 sentences)</OptionInput>
<OptionInput value="cut2">Cut 2 (Split every 50 characters)</OptionInput>
<OptionInput value="cut3">Cut 3 (Split by Chinese periods)</OptionInput>
<OptionInput value="cut4">Cut 4 (Split by English periods)</OptionInput>
<OptionInput value="cut5">Cut 5 (Split by various punctuation marks)</OptionInput>
</SelectInput>
{/if} {/if}
{#if currentChar.data.ttsMode} {#if currentChar.data.ttsMode}
<div class="flex items-center mt-2"> <div class="flex items-center mt-2">

View File

@@ -2,7 +2,7 @@ import { get } from "svelte/store";
import { alertError } from "../alert"; import { alertError } from "../alert";
import { DataBase, type character } from "../storage/database"; import { DataBase, type character } from "../storage/database";
import { runTranslator, translateVox } from "../translator/translator"; import { runTranslator, translateVox } from "../translator/translator";
import { globalFetch } from "../storage/globalApi"; import { globalFetch, loadAsset } from "../storage/globalApi";
import { language } from "src/lang"; import { language } from "src/lang";
import { getCurrentCharacter, sleep } from "../util"; import { getCurrentCharacter, sleep } from "../util";
import { registerOnnxModel, runVITS } from "./transformers"; import { registerOnnxModel, runVITS } from "./transformers";
@@ -27,7 +27,7 @@ export async function sayTTS(character:character,text:string) {
text = text.replace(/\*/g,'') text = text.replace(/\*/g,'')
if(character.ttsReadOnlyQuoted){ if(character.ttsReadOnlyQuoted){
const matches = text.match(/"(.*?)"/g) const matches = text.match(/["「](.*?)["」]/g)
if(matches && matches.length > 0){ if(matches && matches.length > 0){
text = matches.map(match => match.slice(1, -1)).join(""); text = matches.map(match => match.slice(1, -1)).join("");
} }
@@ -231,12 +231,71 @@ export async function sayTTS(character:character,text:string) {
case 'vits':{ case 'vits':{
await runVITS(text, character.vits) await runVITS(text, character.vits)
} }
case 'gptsovits':{
    // Synthesizes `text` with a GPT-SoVITS server and plays the returned audio.
    //
    // NOTE(review): the preceding 'vits' case has no `break`, so control falls through
    // into this case after runVITS(). Bail out unless GPT-SoVITS is really the selected
    // and configured mode, instead of crashing on a missing config.
    const config = character.gptSoVitsConfig
    if (character.ttsMode !== 'gptsovits' || !config) {
        break
    }

    // Every config field is optional; fail with a readable message rather than a
    // TypeError or a request to a relative "/tts" URL.
    const refAudio = config.ref_audio_data
    if (!config.url || !refAudio?.assetId) {
        throw new Error('GPT-SoVITS: server URL and reference audio must be set')
    }

    // Base64-encode the stored reference clip. Converting in chunks avoids building the
    // binary string one character at a time.
    const refBytes: Uint8Array = await loadAsset(refAudio.assetId)
    const chunkSize = 0x8000
    let refBinary = ''
    for (let offset = 0; offset < refBytes.length; offset += chunkSize) {
        refBinary += String.fromCharCode(...refBytes.subarray(offset, offset + chunkSize))
    }

    const body = {
        text: text,
        text_lang: config.text_lang,
        ref_audio_path: config.ref_audio_path + '/public/audio/' + refAudio.fileName,
        ref_audio_name: refAudio.fileName,
        ref_audio_data: btoa(refBinary),
        // The reference script is only sent when the user enabled it.
        prompt_text: config.use_prompt ? config.prompt : undefined,
        prompt_lang: config.prompt_lang,
        top_p: config.top_p,
        temperature: config.temperature,
        speed_factor: config.speed,
        top_k: config.top_k,
        text_split_method: config.text_split_method,
        parallel_infer: false,
    }

    const response = await globalFetch(`${config.url}/tts`, {
        method: 'POST',
        headers: {
            "Content-Type": "application/json",
        },
        body: body,
        rawResponse: true,
    })

    if (!response.ok) {
        // On failure the raw response bytes carry the server's error text.
        throw new Error(new TextDecoder().decode(response.data))
    }

    const audioContext = new AudioContext()
    try {
        // Copy into a fresh, exactly-sized ArrayBuffer: a typed-array view's backing
        // buffer can be larger than the view itself.
        const encodedAudio = new Uint8Array(response.data).buffer
        // Awaiting the promise form lets decode failures reach the surrounding catch.
        const decodedAudio = await audioContext.decodeAudioData(encodedAudio)

        const sourceNode = audioContext.createBufferSource()
        sourceNode.buffer = decodedAudio
        const gainNode = audioContext.createGain()
        // `??` keeps an explicit volume of 0 (muted); `||` would turn it into 1.0.
        gainNode.gain.value = config.volume ?? 1.0
        sourceNode.connect(gainNode)
        gainNode.connect(audioContext.destination)
        // Release the audio device once playback finishes.
        sourceNode.onended = () => {
            void audioContext.close()
        }
        sourceNode.start()
    } catch (error) {
        void audioContext.close()
        throw error
    }
    break
}
} }
} catch (error) { } catch (error) {
alertError(`TTS Error: ${error}`) alertError(`TTS Error: ${error}`)
} }
} }
export const oaiVoices = [ export const oaiVoices = [
'alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer' 'alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer'
] ]

View File

@@ -807,6 +807,25 @@ export interface character{
voice?: string voice?: string
version?: string version?: string
} }
// Per-character settings for the GPT-SoVITS TTS provider.
gptSoVitsConfig?:{
// Base URL of the GPT-SoVITS server; synthesis requests are POSTed to `${url}/tts`.
url?:string
// Path prefix; '/public/audio/' plus the reference file name is appended to build the request's ref_audio_path.
ref_audio_path?:string
// Reference clip: the selected file's name and the id of the saved asset holding its bytes.
ref_audio_data?: {
fileName:string
assetId:string
}
// Gain applied to the synthesized audio on playback.
volume?:number
// Language code sent as text_lang for the text being spoken.
text_lang?: "auto" | "auto_yue" | "en" | "zh" | "ja" | "yue" | "ko" | "all_zh" | "all_ja" | "all_yue" | "all_ko"
// NOTE(review): not read by the visible TTS code — confirm whether this field is needed.
text?:string
// When true, `prompt` is sent as the request's prompt_text.
use_prompt?:boolean
// Script (transcript) of the reference audio.
prompt?:string | null
// Language code sent as prompt_lang for the reference audio.
prompt_lang?: "auto" | "auto_yue" | "en" | "zh" | "ja" | "yue" | "ko" | "all_zh" | "all_ja" | "all_yue" | "all_ko"
// Generation parameters forwarded to the request (speed is sent as speed_factor).
top_p?:number
temperature?:number
speed?:number
top_k?:number
// Text splitting strategy forwarded to the request.
text_split_method?: "cut0" | "cut1" | "cut2" | "cut3" | "cut4" | "cut5"
}
supaMemory?:boolean supaMemory?:boolean
additionalAssets?:[string, string, string][] additionalAssets?:[string, string, string][]
ttsReadOnlyQuoted?:boolean ttsReadOnlyQuoted?:boolean

View File

@@ -165,7 +165,7 @@ async function translateMain(text:string, arg:{from:string, to:string, host:stri
} }
const url = `https://${arg.host}/translate_a/single?client=gtx&dt=t&sl=${arg.from}&tl=${arg.to}&q=` + encodeURIComponent(text) const url = `https://${arg.host}/translate_a/single?client=gtx&dt=t&sl=auto&tl=${arg.to}&q=` + encodeURIComponent(text)