feat: Add support for gptSoVitsConfig in TTS processing
This commit is contained in:
@@ -9,7 +9,7 @@
|
|||||||
import LoreBook from "./LoreBook/LoreBookSetting.svelte";
|
import LoreBook from "./LoreBook/LoreBookSetting.svelte";
|
||||||
import { alertConfirm, alertMd, alertNormal, alertSelectChar, alertTOS, showHypaV2Alert } from "../../ts/alert";
|
import { alertConfirm, alertMd, alertNormal, alertSelectChar, alertTOS, showHypaV2Alert } from "../../ts/alert";
|
||||||
import BarIcon from "./BarIcon.svelte";
|
import BarIcon from "./BarIcon.svelte";
|
||||||
import { findCharacterbyId, getAuthorNoteDefaultText, parseKeyValue, selectMultipleFile } from "../../ts/util";
|
import { findCharacterbyId, getAuthorNoteDefaultText, parseKeyValue, selectMultipleFile, selectSingleFile } from "../../ts/util";
|
||||||
import { onDestroy } from "svelte";
|
import { onDestroy } from "svelte";
|
||||||
import {isEqual} from 'lodash'
|
import {isEqual} from 'lodash'
|
||||||
import Help from "../Others/Help.svelte";
|
import Help from "../Others/Help.svelte";
|
||||||
@@ -29,7 +29,8 @@
|
|||||||
import { updateInlayScreen } from "src/ts/process/inlayScreen";
|
import { updateInlayScreen } from "src/ts/process/inlayScreen";
|
||||||
import { registerOnnxModel } from "src/ts/process/transformers";
|
import { registerOnnxModel } from "src/ts/process/transformers";
|
||||||
import MultiLangInput from "../UI/GUI/MultiLangInput.svelte";
|
import MultiLangInput from "../UI/GUI/MultiLangInput.svelte";
|
||||||
import { applyModule } from "src/ts/process/modules";
|
import { applyModule } from "src/ts/process/modules";
|
||||||
|
import SliderInput from "../UI/GUI/SliderInput.svelte";
|
||||||
|
|
||||||
|
|
||||||
let subMenu = 0
|
let subMenu = 0
|
||||||
@@ -103,6 +104,12 @@
|
|||||||
}
|
}
|
||||||
emos = currentChar.data.emotionImages
|
emos = currentChar.data.emotionImages
|
||||||
currentChar = currentChar
|
currentChar = currentChar
|
||||||
|
|
||||||
|
if (currentChar.data.ttsMode === 'gptsovits' && (currentChar.data as character).gptSoVitsConfig) {
|
||||||
|
if (!(currentChar.data as character).gptSoVitsConfig.use_prompt) {
|
||||||
|
(currentChar.data as character).gptSoVitsConfig.prompt = undefined
|
||||||
|
}
|
||||||
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
let assetFileExtensions:string[] = []
|
let assetFileExtensions:string[] = []
|
||||||
@@ -147,6 +154,27 @@
|
|||||||
version: 'v2'
|
version: 'v2'
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
// Reactive statement: when the TTS provider is 'gptsovits' and the character has
// no gptSoVitsConfig yet, install a default configuration object.
$: if (currentChar.data.ttsMode === 'gptsovits' && (currentChar.data as character).gptSoVitsConfig === undefined) {
    // Typed against the interface field so the string-literal unions
    // (text_lang, prompt_lang, text_split_method) are still checked.
    const initialSoVitsConfig: character['gptSoVitsConfig'] = {
        url: '',
        ref_audio_path: 'C:/Users/user/Downloads/GPT-SoVITS-v2-240821',
        ref_audio_data: { fileName: '', assetId: '' },
        volume: 1.0,
        text_lang: 'auto',
        text: 'en',
        use_prompt: false,
        prompt_lang: 'en',
        top_p: 1,
        temperature: 0.7,
        speed: 1,
        top_k: 5,
        text_split_method: 'cut0',
    };
    // Kept as a direct member assignment on currentChar.data, as in the original.
    (currentChar.data as character).gptSoVitsConfig = initialSoVitsConfig;
}
|
||||||
|
|
||||||
</script>
|
</script>
|
||||||
|
|
||||||
{#if licensed !== 'private'}
|
{#if licensed !== 'private'}
|
||||||
@@ -668,7 +696,7 @@
|
|||||||
{#if currentChar.type === 'character'}
|
{#if currentChar.type === 'character'}
|
||||||
<h2 class="mb-2 text-2xl font-bold mt-2">TTS</h2>
|
<h2 class="mb-2 text-2xl font-bold mt-2">TTS</h2>
|
||||||
<span class="text-textcolor">{language.provider}</span>
|
<span class="text-textcolor">{language.provider}</span>
|
||||||
<SelectInput className="mb-4 mt-2" bind:value={currentChar.data.ttsMode} on:change={() => {
|
<SelectInput className="mb-4 mt-2" bind:value={currentChar.data.ttsMode} on:change={(e) => {
|
||||||
if(currentChar.type === 'character'){
|
if(currentChar.type === 'character'){
|
||||||
currentChar.data.ttsSpeech = ''
|
currentChar.data.ttsSpeech = ''
|
||||||
}
|
}
|
||||||
@@ -681,6 +709,7 @@
|
|||||||
<OptionInput value="novelai">NovelAI</OptionInput>
|
<OptionInput value="novelai">NovelAI</OptionInput>
|
||||||
<OptionInput value="huggingface">Huggingface</OptionInput>
|
<OptionInput value="huggingface">Huggingface</OptionInput>
|
||||||
<OptionInput value="vits">VITS</OptionInput>
|
<OptionInput value="vits">VITS</OptionInput>
|
||||||
|
<OptionInput value="gptsovits">GPT-SoVITS</OptionInput>
|
||||||
</SelectInput>
|
</SelectInput>
|
||||||
|
|
||||||
|
|
||||||
@@ -764,23 +793,20 @@
|
|||||||
<OptionInput value="v1">v1</OptionInput>
|
<OptionInput value="v1">v1</OptionInput>
|
||||||
<OptionInput value="v2">v2</OptionInput>
|
<OptionInput value="v2">v2</OptionInput>
|
||||||
</SelectInput>
|
</SelectInput>
|
||||||
{/if}
|
{:else if currentChar.data.ttsMode === 'openai'}
|
||||||
{#if currentChar.data.ttsMode === 'openai'}
|
|
||||||
<SelectInput className="mb-4 mt-2" bind:value={currentChar.data.oaiVoice}>
|
<SelectInput className="mb-4 mt-2" bind:value={currentChar.data.oaiVoice}>
|
||||||
<OptionInput value="">Unset</OptionInput>
|
<OptionInput value="">Unset</OptionInput>
|
||||||
{#each oaiVoices as voice}
|
{#each oaiVoices as voice}
|
||||||
<OptionInput value={voice}>{voice}</OptionInput>
|
<OptionInput value={voice}>{voice}</OptionInput>
|
||||||
{/each}
|
{/each}
|
||||||
</SelectInput>
|
</SelectInput>
|
||||||
{/if}
|
{:else if currentChar.data.ttsMode === 'huggingface'}
|
||||||
{#if currentChar.data.ttsMode === 'huggingface'}
|
|
||||||
<span class="text-textcolor">Model</span>
|
<span class="text-textcolor">Model</span>
|
||||||
<TextInput className="mb-4 mt-2" bind:value={currentChar.data.hfTTS.model} />
|
<TextInput className="mb-4 mt-2" bind:value={currentChar.data.hfTTS.model} />
|
||||||
|
|
||||||
<span class="text-textcolor">Language</span>
|
<span class="text-textcolor">Language</span>
|
||||||
<TextInput className="mb-4 mt-2" bind:value={currentChar.data.hfTTS.language} placeholder="en" />
|
<TextInput className="mb-4 mt-2" bind:value={currentChar.data.hfTTS.language} placeholder="en" />
|
||||||
{/if}
|
{:else if currentChar.data.ttsMode === 'vits'}
|
||||||
{#if currentChar.data.ttsMode === 'vits'}
|
|
||||||
{#if currentChar.data.vits}
|
{#if currentChar.data.vits}
|
||||||
<span class="text-textcolor">{currentChar.data.vits.name ?? 'Unnamed VitsModel'}</span>
|
<span class="text-textcolor">{currentChar.data.vits.name ?? 'Unnamed VitsModel'}</span>
|
||||||
{:else}
|
{:else}
|
||||||
@@ -792,6 +818,99 @@
|
|||||||
currentChar.data.vits = model
|
currentChar.data.vits = model
|
||||||
}
|
}
|
||||||
}}>{language.selectModel}</Button>
|
}}>{language.selectModel}</Button>
|
||||||
|
{:else if currentChar.data.ttsMode === 'gptsovits'}
|
||||||
|
<span class="text-textcolor">Volume</span>
|
||||||
|
<SliderInput min={0.0} max={1.0} step={0.01} fixed={2} bind:value={currentChar.data.gptSoVitsConfig.volume}/>
|
||||||
|
<span class="text-textcolor">URL</span>
|
||||||
|
<TextInput className="mb-4 mt-2" bind:value={currentChar.data.gptSoVitsConfig.url}/>
|
||||||
|
|
||||||
|
<span class="text-textcolor">Reference Audio Path (e.g. C:/Users/user/Downloads/GPT-SoVITS-v2-240821)</span>
|
||||||
|
<TextInput className="mb-4 mt-2" bind:value={currentChar.data.gptSoVitsConfig.ref_audio_path}/>
|
||||||
|
|
||||||
|
<span class="text-textcolor">Reference Audio Data (3~10s audio file)</span>
|
||||||
|
<Button on:click={async () => {
|
||||||
|
const audio = await selectSingleFile([
|
||||||
|
'mp3',
|
||||||
|
'wav',
|
||||||
|
'ogg',
|
||||||
|
])
|
||||||
|
if(!audio){
|
||||||
|
return null
|
||||||
|
}
|
||||||
|
const saveId = await saveAsset(audio.data)
|
||||||
|
// @ts-expect-error not groupChat
|
||||||
|
currentChar.data.gptSoVitsConfig.ref_audio_data = {
|
||||||
|
fileName: audio.name,
|
||||||
|
assetId: saveId
|
||||||
|
}
|
||||||
|
|
||||||
|
}}
|
||||||
|
className="h-10">
|
||||||
|
|
||||||
|
{#if currentChar.data.gptSoVitsConfig.ref_audio_data.assetId === '' || currentChar.data.gptSoVitsConfig.ref_audio_data.assetId === undefined}
|
||||||
|
Select File
|
||||||
|
{:else}
|
||||||
|
{currentChar.data.gptSoVitsConfig.ref_audio_data.fileName}
|
||||||
|
{/if}
|
||||||
|
</Button>
|
||||||
|
<span class="text-textcolor">Text Language</span>
|
||||||
|
<SelectInput className="mb-4 mt-2" bind:value={currentChar.data.gptSoVitsConfig.text_lang}>
|
||||||
|
<OptionInput value="auto">Auto</OptionInput>
|
||||||
|
<OptionInput value="auto_yue">Auto (Cantonese)</OptionInput>
|
||||||
|
<OptionInput value="en">English</OptionInput>
|
||||||
|
<OptionInput value="zh">Chinese</OptionInput>
|
||||||
|
<OptionInput value="ja">Japanese</OptionInput>
|
||||||
|
<OptionInput value="yue">Cantonese</OptionInput>
|
||||||
|
<OptionInput value="ko">Korean</OptionInput>
|
||||||
|
<OptionInput value="all_zh">All Chinese</OptionInput>
|
||||||
|
<OptionInput value="all_ja">All Japanese</OptionInput>
|
||||||
|
<OptionInput value="all_yue">All Cantonese</OptionInput>
|
||||||
|
<OptionInput value="all_ko">All Korean</OptionInput>
|
||||||
|
</SelectInput>
|
||||||
|
|
||||||
|
<span class="text-textcolor">Use Reference Audio Script</span>
|
||||||
|
<Check bind:check={currentChar.data.gptSoVitsConfig.use_prompt}/>
|
||||||
|
|
||||||
|
{#if currentChar.data.gptSoVitsConfig.use_prompt}
|
||||||
|
<span class="text-textcolor">Reference Audio Script</span>
|
||||||
|
<TextAreaInput className="mb-4 mt-2" bind:value={currentChar.data.gptSoVitsConfig.prompt}/>
|
||||||
|
{/if}
|
||||||
|
|
||||||
|
<span class="text-textcolor">Reference Audio Language</span>
|
||||||
|
<SelectInput className="mb-4 mt-2" bind:value={currentChar.data.gptSoVitsConfig.prompt_lang}>
|
||||||
|
<OptionInput value="auto">Auto</OptionInput>
|
||||||
|
<OptionInput value="auto_yue">Auto (Cantonese)</OptionInput>
|
||||||
|
<OptionInput value="en">English</OptionInput>
|
||||||
|
<OptionInput value="zh">Chinese</OptionInput>
|
||||||
|
<OptionInput value="ja">Japanese</OptionInput>
|
||||||
|
<OptionInput value="yue">Cantonese</OptionInput>
|
||||||
|
<OptionInput value="ko">Korean</OptionInput>
|
||||||
|
<OptionInput value="all_zh">English And Chinese</OptionInput>
|
||||||
|
<OptionInput value="all_ja">English And Japanese</OptionInput>
|
||||||
|
<OptionInput value="all_yue">English And Cantonese</OptionInput>
|
||||||
|
<OptionInput value="all_ko">English And Korean</OptionInput>
|
||||||
|
</SelectInput>
|
||||||
|
<span class="text-textcolor">Top P</span>
|
||||||
|
<SliderInput min={0.0} max={1.0} step={0.05} fixed={2} bind:value={currentChar.data.gptSoVitsConfig.top_p}/>
|
||||||
|
|
||||||
|
<span class="text-textcolor">Temperature</span>
|
||||||
|
<SliderInput min={0.0} max={1.0} step={0.05} fixed={2} bind:value={currentChar.data.gptSoVitsConfig.temperature}/>
|
||||||
|
|
||||||
|
<span class="text-textcolor">Speed</span>
|
||||||
|
<SliderInput min={0.6} max={1.65} step={0.05} fixed={2} bind:value={currentChar.data.gptSoVitsConfig.speed}/>
|
||||||
|
|
||||||
|
<span class="text-textcolor">Top K</span>
|
||||||
|
<SliderInput min={1} max={100} step={1} bind:value={currentChar.data.gptSoVitsConfig.top_k}/>
|
||||||
|
|
||||||
|
<span class="text-textcolor">Text Split Method</span>
|
||||||
|
<SelectInput className="mb-4 mt-2" bind:value={currentChar.data.gptSoVitsConfig.text_split_method}>
|
||||||
|
<OptionInput value="cut0">Cut 0 (No splitting)</OptionInput>
|
||||||
|
<OptionInput value="cut1">Cut 1 (Split every 4 sentences)</OptionInput>
|
||||||
|
<OptionInput value="cut2">Cut 2 (Split every 50 characters)</OptionInput>
|
||||||
|
<OptionInput value="cut3">Cut 3 (Split by Chinese periods)</OptionInput>
|
||||||
|
<OptionInput value="cut4">Cut 4 (Split by English periods)</OptionInput>
|
||||||
|
<OptionInput value="cut5">Cut 5 (Split by various punctuation marks)</OptionInput>
|
||||||
|
</SelectInput>
|
||||||
{/if}
|
{/if}
|
||||||
{#if currentChar.data.ttsMode}
|
{#if currentChar.data.ttsMode}
|
||||||
<div class="flex items-center mt-2">
|
<div class="flex items-center mt-2">
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ import { get } from "svelte/store";
|
|||||||
import { alertError } from "../alert";
|
import { alertError } from "../alert";
|
||||||
import { DataBase, type character } from "../storage/database";
|
import { DataBase, type character } from "../storage/database";
|
||||||
import { runTranslator, translateVox } from "../translator/translator";
|
import { runTranslator, translateVox } from "../translator/translator";
|
||||||
import { globalFetch } from "../storage/globalApi";
|
import { globalFetch, loadAsset } from "../storage/globalApi";
|
||||||
import { language } from "src/lang";
|
import { language } from "src/lang";
|
||||||
import { getCurrentCharacter, sleep } from "../util";
|
import { getCurrentCharacter, sleep } from "../util";
|
||||||
import { registerOnnxModel, runVITS } from "./transformers";
|
import { registerOnnxModel, runVITS } from "./transformers";
|
||||||
@@ -27,7 +27,7 @@ export async function sayTTS(character:character,text:string) {
|
|||||||
text = text.replace(/\*/g,'')
|
text = text.replace(/\*/g,'')
|
||||||
|
|
||||||
if(character.ttsReadOnlyQuoted){
|
if(character.ttsReadOnlyQuoted){
|
||||||
const matches = text.match(/"(.*?)"/g)
|
const matches = text.match(/["「](.*?)["」]/g)
|
||||||
if(matches && matches.length > 0){
|
if(matches && matches.length > 0){
|
||||||
text = matches.map(match => match.slice(1, -1)).join("");
|
text = matches.map(match => match.slice(1, -1)).join("");
|
||||||
}
|
}
|
||||||
@@ -231,12 +231,71 @@ export async function sayTTS(character:character,text:string) {
|
|||||||
case 'vits':{
|
case 'vits':{
|
||||||
await runVITS(text, character.vits)
|
await runVITS(text, character.vits)
|
||||||
}
|
}
|
||||||
|
case 'gptsovits':{
    // Synthesizes `text` through a GPT-SoVITS HTTP endpoint (`<url>/tts`) and plays
    // the returned audio through the Web Audio API at the configured volume.
    //
    // Throws when the reference audio is not configured, when the server answers
    // with a non-ok response (the response body becomes the error message), or when
    // the returned audio cannot be decoded.
    //
    // NOTE(review): the preceding `case 'vits'` clause has no `break`, so after
    // runVITS finishes, control falls through into this clause. Confirm and add a
    // `break` there.
    const sovits = character.gptSoVitsConfig;
    const refAudio = sovits?.ref_audio_data;
    if (!sovits || !refAudio?.assetId) {
        // Fail with a readable message instead of a property-access TypeError or
        // a loadAsset('') call when no reference audio has been selected.
        throw new Error('GPT-SoVITS reference audio is not configured');
    }

    // The reference audio is sent inline as base64 next to its server-side path.
    const audio: Uint8Array = await loadAsset(refAudio.assetId);
    const base64Audio = btoa(new Uint8Array(audio).reduce((data, byte) => data + String.fromCharCode(byte), ''));

    const body = {
        text: text,
        text_lang: sovits.text_lang,
        ref_audio_path: sovits.ref_audio_path + '/public/audio/' + refAudio.fileName,
        ref_audio_name: refAudio.fileName,
        ref_audio_data: base64Audio,
        // Only forward the reference script when the user enabled it.
        prompt_text: sovits.use_prompt ? sovits.prompt : undefined,
        prompt_lang: sovits.prompt_lang,
        top_p: sovits.top_p,
        temperature: sovits.temperature,
        speed_factor: sovits.speed,
        top_k: sovits.top_k,
        text_split_method: sovits.text_split_method,
        parallel_infer: false,
    };

    // Tolerate a trailing slash in the configured base URL.
    const baseUrl = (sovits.url ?? '').replace(/\/+$/, '');
    const response = await globalFetch(`${baseUrl}/tts`, {
        method: 'POST',
        headers: {
            "Content-Type": "application/json",
        },
        body: body,
        rawResponse: true,
    });

    if (!response.ok) {
        // TextDecoder is available in browsers and Node, unlike Buffer.
        const errorText = new TextDecoder('utf-8').decode(response.data.buffer);
        throw new Error(errorText);
    }

    const audioContext = new AudioContext();
    try {
        // Awaiting the decode lets a failure reach the surrounding try/catch
        // instead of being lost in a callback.
        const decodedData = await audioContext.decodeAudioData(response.data.buffer);

        const sourceNode = audioContext.createBufferSource();
        sourceNode.buffer = decodedData;

        const gainNode = audioContext.createGain();
        // `??` rather than `||`: a volume of 0 (mute) is a valid slider value.
        gainNode.gain.value = sovits.volume ?? 1.0;

        sourceNode.connect(gainNode);
        gainNode.connect(audioContext.destination);

        // Release the per-call AudioContext once playback has finished.
        sourceNode.onended = () => {
            void audioContext.close();
        };
        sourceNode.start();
    } catch (playbackError) {
        void audioContext.close();
        throw playbackError;
    }
    break;
}
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
alertError(`TTS Error: ${error}`)
|
alertError(`TTS Error: ${error}`)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
export const oaiVoices = [
|
export const oaiVoices = [
|
||||||
'alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer'
|
'alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer'
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -803,16 +803,23 @@ export interface character{
|
|||||||
version?: string
|
version?: string
|
||||||
}
|
}
|
||||||
gptSoVitsConfig?:{
|
gptSoVitsConfig?:{
|
||||||
ref_audio_data?:string
|
url?:string
|
||||||
|
ref_audio_path?:string
|
||||||
|
ref_audio_data?: {
|
||||||
|
fileName:string
|
||||||
|
assetId:string
|
||||||
|
}
|
||||||
|
volume?:number
|
||||||
text_lang?: "auto" | "auto_yue" | "en" | "zh" | "ja" | "yue" | "ko" | "all_zh" | "all_ja" | "all_yue" | "all_ko"
|
text_lang?: "auto" | "auto_yue" | "en" | "zh" | "ja" | "yue" | "ko" | "all_zh" | "all_ja" | "all_yue" | "all_ko"
|
||||||
text?:string
|
text?:string
|
||||||
|
use_prompt?:boolean
|
||||||
prompt?:string | null
|
prompt?:string | null
|
||||||
prompt_lang?:string
|
prompt_lang?: "auto" | "auto_yue" | "en" | "zh" | "ja" | "yue" | "ko" | "all_zh" | "all_ja" | "all_yue" | "all_ko"
|
||||||
top_p?:number
|
top_p?:number
|
||||||
temperature?:number
|
temperature?:number
|
||||||
speed?:number
|
speed?:number
|
||||||
top_k?:number
|
top_k?:number
|
||||||
text_split_method?:string
|
text_split_method?: "cut0" | "cut1" | "cut2" | "cut3" | "cut4" | "cut5"
|
||||||
}
|
}
|
||||||
supaMemory?:boolean
|
supaMemory?:boolean
|
||||||
additionalAssets?:[string, string, string][]
|
additionalAssets?:[string, string, string][]
|
||||||
|
|||||||
Reference in New Issue
Block a user