// risuai/src/ts/process/tts.ts
import { get } from "svelte/store";
import { alertError } from "../alert";
import { DataBase, type character } from "../storage/database";
import { runTranslator, translateVox } from "../translator/translator";
import { globalFetch, loadAsset } from "../storage/globalApi";
import { language } from "src/lang";
import { getCurrentCharacter, sleep } from "../util";
import { registerOnnxModel, runVITS } from "./transformers";
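// Most recently started Web Audio source node, kept at module scope so stopTTS() can interrupt it.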
let sourceNode:AudioBufferSourceNode = null
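/**
 * Speaks `text` using the TTS backend configured on `character`.
 * Falls back to the currently selected character when none is given,
 * and returns early for group chats or empty text.
 * Example (hypothetical call site): await sayTTS(null, 'Hello there')
 */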
export async function sayTTS(character:character,text:string) {
try {
if(!character){
const v = getCurrentCharacter()
if(v.type === 'group'){
return
}
character = v
}
if(!text){
return
}
let db = get(DataBase)
text = text.replace(/\*/g,'')
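// When "read only quoted" is enabled, keep only the text inside "..." or 「...」 quotes.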
if(character.ttsReadOnlyQuoted){
const matches = text.match(/["「](.*?)["」]/g)
if(matches && matches.length > 0){
text = matches.map(match => match.slice(1, -1)).join("");
}
else{
text = ''
}
}
switch(character.ttsMode){
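// Web Speech API: synthesize in the browser with the voice whose name matches character.ttsSpeech.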
case "webspeech":{
if(speechSynthesis && SpeechSynthesisUtterance){
const utterThis = new SpeechSynthesisUtterance(text);
const voices = speechSynthesis.getVoices();
let voiceIndex = 0
for(let i=0;i<voices.length;i++){
if(voices[i].name === character.ttsSpeech){
voiceIndex = i
}
}
utterThis.voice = voices[voiceIndex]
speechSynthesis.speak(utterThis)
}
break
}
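// ElevenLabs API: request multilingual v2 synthesis and play the returned audio through Web Audio.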
case "elevenlab": {
const audioContext = new AudioContext();
const da = await fetch(`https://api.elevenlabs.io/v1/text-to-speech/${character.ttsSpeech}`, {
body: JSON.stringify({
text: text,
model_id: "eleven_multilingual_v2"
}),
method: "POST",
headers: {
"Content-Type": "application/json",
'xi-api-key': db.elevenLabKey || undefined
}
})
if(da.status >= 200 && da.status < 300){
const audioBuffer = await audioContext.decodeAudioData(await da.arrayBuffer())
sourceNode = audioContext.createBufferSource();
sourceNode.buffer = audioBuffer;
sourceNode.connect(audioContext.destination);
sourceNode.start();
}
else{
alertError(await da.text())
}
break
}
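// VOICEVOX engine: translate the text for Japanese synthesis, build an audio query, then synthesize WAV with the character's pacing settings.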
case "VOICEVOX": {
const jpText = await translateVox(text)
const audioContext = new AudioContext();
const query = await fetch(`${db.voicevoxUrl}/audio_query?text=${jpText}&speaker=${character.ttsSpeech}`, {
method: 'POST',
headers: { "Content-Type": "application/json"},
})
if (query.status == 200){
const queryJson = await query.json();
const bodyData = {
accent_phrases: queryJson.accent_phrases,
speedScale: character.voicevoxConfig.SPEED_SCALE,
pitchScale: character.voicevoxConfig.PITCH_SCALE,
volumeScale: character.voicevoxConfig.VOLUME_SCALE,
intonationScale: character.voicevoxConfig.INTONATION_SCALE,
prePhonemeLength: queryJson.prePhonemeLength,
postPhonemeLength: queryJson.postPhonemeLength,
outputSamplingRate: queryJson.outputSamplingRate,
outputStereo: queryJson.outputStereo,
kana: queryJson.kana,
}
const getVoice = await fetch(`${db.voicevoxUrl}/synthesis?speaker=${character.ttsSpeech}`, {
method: 'POST',
headers: { "Content-Type": "application/json"},
body: JSON.stringify(bodyData),
})
if (getVoice.status == 200 && getVoice.headers.get('content-type') === 'audio/wav'){
const audioBuffer = await audioContext.decodeAudioData(await getVoice.arrayBuffer())
sourceNode = audioContext.createBufferSource();
sourceNode.buffer = audioBuffer;
sourceNode.connect(audioContext.destination);
sourceNode.start();
}
}
break
}
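// OpenAI text-to-speech (tts-1): fetch speech for the text with the character's voice and play the decoded bytes.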
case 'openai':{
const key = db.openAIKey
const res = await globalFetch('https://api.openai.com/v1/audio/speech', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
'Authorization': 'Bearer ' + key,
},
body: {
model: 'tts-1',
input: text,
voice: character.oaiVoice,
},
rawResponse: true,
})
const dat = res.data
if(res.ok){
try {
const audio = Buffer.from(dat).buffer
const audioContext = new AudioContext();
const audioBuffer = await audioContext.decodeAudioData(audio)
sourceNode = audioContext.createBufferSource();
sourceNode.buffer = audioBuffer;
sourceNode.connect(audioContext.destination);
sourceNode.start();
} catch (error) {
alertError(language.errors.httpError + `${error}`)
}
}
else{
if(dat.error && dat.error.message){
alertError((language.errors.httpError + `${dat.error.message}`))
}
else{
alertError((language.errors.httpError + `${Buffer.from(res.data).toString()}`))
}
}
break;
}
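// NovelAI voice generation: the configured voice name is passed as the seed parameter of generate-voice.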
case 'novelai': {
const audioContext = new AudioContext();
if(text === ''){
break;
}
const encodedText = encodeURIComponent(text);
const encodedSeed = encodeURIComponent(character.naittsConfig.voice);
const url = `https://api.novelai.net/ai/generate-voice?text=${encodedText}&voice=-1&seed=${encodedSeed}&opus=false&version=${character.naittsConfig.version}`;
const response = await globalFetch(url, {
method: 'GET',
headers: {
"Authorization": "Bearer " + db.NAIApiKey,
},
rawResponse: true
});
if (response.ok) {
const audioBuffer = response.data.buffer;
audioContext.decodeAudioData(audioBuffer, (decodedData) => {
sourceNode = audioContext.createBufferSource();
sourceNode.buffer = decodedData;
sourceNode.connect(audioContext.destination);
sourceNode.start();
});
} else {
alertError("Error fetching or decoding audio data");
}
break;
}
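// Hugging Face Inference API: run the text through the translator when the model's language is not English, then retry while the model is still loading (HTTP 503).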
case 'huggingface': {
if(character.hfTTS.language !== 'en'){
text = await runTranslator(text, false, 'en', character.hfTTS.language)
}
while(true){
const audioContext = new AudioContext();
const response = await fetch(`https://api-inference.huggingface.co/models/${character.hfTTS.model}`, {
method: 'POST',
headers: {
"Authorization": "Bearer " + db.huggingfaceKey,
"Content-Type": "application/json",
},
body: JSON.stringify({
inputs: text,
})
});
if(response.status === 503 && response.headers.get('content-type') === 'application/json'){
const json = await response.json()
if(json.estimated_time){
await sleep(json.estimated_time * 1000)
continue
}
}
else if(response.status >= 400){
alertError(language.errors.httpError + `${await response.text()}`)
return
}
else if (response.status === 200) {
const audioBuffer = await response.arrayBuffer();
audioContext.decodeAudioData(audioBuffer, (decodedData) => {
sourceNode = audioContext.createBufferSource();
sourceNode.buffer = decodedData;
sourceNode.connect(audioContext.destination);
sourceNode.start();
});
} else {
alertError("Error fetching or decoding audio data");
}
return
}
}
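// Local VITS synthesis handled by runVITS in ./transformers.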
case 'vits':{
await runVITS(text, character.vits)
break
}
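// GPT-SoVITS server: send the base64-encoded reference audio (or an auto-resolved path) and prompt settings to /tts, then play the result at the configured volume.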
case 'gptsovits':{
const audioContext = new AudioContext();
const audio: Uint8Array = await loadAsset(character.gptSoVitsConfig.ref_audio_data.assetId);
const base64Audio = btoa(new Uint8Array(audio).reduce((data, byte) => data + String.fromCharCode(byte), ''));
const body = {
text: text,
text_lang: character.gptSoVitsConfig.text_lang,
ref_audio_path: undefined,
ref_audio_name: character.gptSoVitsConfig.ref_audio_data.fileName,
ref_audio_data: base64Audio,
prompt_text: undefined,
prompt_lang: character.gptSoVitsConfig.prompt_lang,
top_p: character.gptSoVitsConfig.top_p,
temperature: character.gptSoVitsConfig.temperature,
speed_factor: character.gptSoVitsConfig.speed,
top_k: character.gptSoVitsConfig.top_k,
text_split_method: character.gptSoVitsConfig.text_split_method,
parallel_infer: true,
// media_type: character.gptSoVitsConfig.ref_audio_data.fileName.split('.')[1],
ref_free: character.gptSoVitsConfig.use_long_audio || !character.gptSoVitsConfig.use_prompt,
}
if (character.gptSoVitsConfig.use_prompt){
body.prompt_text = character.gptSoVitsConfig.prompt
}
if (character.gptSoVitsConfig.use_auto_path){
console.log('auto')
const path = await globalFetch(`${character.gptSoVitsConfig.url}/get_path`, {
method: 'GET',
headers: {
'Content-Type': 'application/json'
},
rawResponse: false,
plainFetchDeforce: true,
})
console.log(path)
if(path.ok){
body.ref_audio_path = path.data.message + '/public/audio/' + character.gptSoVitsConfig.ref_audio_data.fileName
}
else{
throw new Error('Failed to Auto get path')
}
} else {
body.ref_audio_path = character.gptSoVitsConfig.ref_audio_path + '/public/audio/' + character.gptSoVitsConfig.ref_audio_data.fileName
}
console.log(body)
const response = await globalFetch(`${character.gptSoVitsConfig.url}/tts`, {
method: 'POST',
headers: {
'Content-Type': 'application/json'
},
body: body,
rawResponse: true,
})
console.log(response)
if (response.ok) {
const audioBuffer = response.data.buffer;
audioContext.decodeAudioData(audioBuffer, (decodedData) => {
sourceNode = audioContext.createBufferSource();
sourceNode.buffer = decodedData;
const gainNode = audioContext.createGain();
gainNode.gain.value = character.gptSoVitsConfig.volume || 1.0;
sourceNode.connect(gainNode);
gainNode.connect(audioContext.destination);
sourceNode.start();
});
} else {
const textBuffer = response.data.buffer
const text = Buffer.from(textBuffer).toString('utf-8')
throw new Error(text);
}
}
}
} catch (error) {
alertError(`TTS Error: ${error}`)
}
}
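// Voice names accepted by OpenAI's speech endpoint.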
export const oaiVoices = [
'alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer'
]
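// Stops any active playback: the last Web Audio source node and any pending Web Speech utterances.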
export function stopTTS(){
if(sourceNode){
sourceNode.stop()
}
if(speechSynthesis && SpeechSynthesisUtterance){
speechSynthesis.cancel()
}
}
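// Names of the voices available to the Web Speech API.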
export function getWebSpeechTTSVoices() {
return speechSynthesis.getVoices().map(v => {
return v.name
})
}
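// Fetches the voice list for the stored ElevenLabs API key.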
export async function getElevenTTSVoices() {
let db = get(DataBase)
const data = await fetch('https://api.elevenlabs.io/v1/voices', {
headers: {
'xi-api-key': db.elevenLabKey || undefined
}
})
const res = await data.json()
console.log(res)
return res.voices
}
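// Queries the VOICEVOX server for speakers and flattens each speaker's styles into name/id pairs.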
export async function getVOICEVOXVoices() {
const db = get(DataBase);
const speakerData = await fetch(`${db.voicevoxUrl}/speakers`)
const speakerList = await speakerData.json()
const speakersInfo = speakerList.map((speaker) => {
const styles = speaker.styles.map((style) => {
return {name: style.name, id: `${style.id}`}
})
return {name: speaker.name, list: JSON.stringify(styles)}
})
speakersInfo.unshift({ name: "None", list: null})
return speakersInfo;
}
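// Static list of NovelAI voice seeds grouped by gender.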
export async function getNovelAIVoices(){
return [
{
gender: "UNISEX",
voices: ['Anananan']
},
{
gender: "FEMALE",
voices: ['Aini', 'Orea', 'Claea', 'Lim', 'Aurae', 'Naia']
},
{
gender: "MALE",
voices: ['Aulon', 'Elei', 'Ogma', 'Raid', 'Pega', 'Lam']
}
];
}
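// Ensures older character data has a NovelAI TTS config with the default voice.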
export async function FixNAITTS(data:character){
if (data.naittsConfig === undefined){
data.naittsConfig = {
voice: 'Anananan'
} as character['naittsConfig']
}
return data
}