From 5df9e68baee1799a39e312d6440b234020cd72b0 Mon Sep 17 00:00:00 2001 From: Kwaroran Date: Wed, 18 Dec 2024 03:26:02 +0900 Subject: [PATCH] Add o1 and fix some mistakes --- src/lang/en.ts | 3 + src/lib/Playground/PlaygroundImageGen.svelte | 78 ++++---- .../Playground/PlaygroundImageTrans.svelte | 15 ++ src/lib/Playground/PlaygroundSubtitle.svelte | 172 +++++++++++++++--- src/lib/Setting/Pages/BotSettings.svelte | 2 +- src/lib/Setting/Pages/LanguageSettings.svelte | 2 +- src/ts/model/modellist.ts | 26 ++- src/ts/process/request.ts | 11 +- src/ts/process/transformers.ts | 1 - 9 files changed, 233 insertions(+), 77 deletions(-) create mode 100644 src/lib/Playground/PlaygroundImageTrans.svelte diff --git a/src/lang/en.ts b/src/lang/en.ts index 83a8a3f9..dc720fc9 100644 --- a/src/lang/en.ts +++ b/src/lang/en.ts @@ -828,4 +828,7 @@ export const languageEnglish = { subtitlesWarning2: "You must use model with streaming feature to use this feature.", reset: "Reset", assetMaxDifference: "Asset Max Difference", + sourceLanguage: "Source Language", + destinationLanguage: "Destination Language", + noWebGPU: "Your Browser or OS doesn't support WebGPU. this will slow down the performance significantly.", } \ No newline at end of file diff --git a/src/lib/Playground/PlaygroundImageGen.svelte b/src/lib/Playground/PlaygroundImageGen.svelte index 38f1e4bd..059256a8 100644 --- a/src/lib/Playground/PlaygroundImageGen.svelte +++ b/src/lib/Playground/PlaygroundImageGen.svelte @@ -1,34 +1,44 @@ -
-
- - Card - -
-
- -
- -

- Card Title -

- - -

- Lorem ipsum dolor sit amet consectetur adipisicing elit. Voluptates rerum quisquam, temporibus quasi distinctio magnam. -

- - -
- - - - - 5 min read - -
-
-
\ No newline at end of file + + +

{language.imageGeneration}

+ +Prompt + + +Neg. Prompt + + +{#if img} + Generated + Generated +{/if} + + \ No newline at end of file diff --git a/src/lib/Playground/PlaygroundImageTrans.svelte b/src/lib/Playground/PlaygroundImageTrans.svelte new file mode 100644 index 00000000..99a8152e --- /dev/null +++ b/src/lib/Playground/PlaygroundImageTrans.svelte @@ -0,0 +1,15 @@ + + + +{language.destinationLanguage} + + +{language.prompt} + diff --git a/src/lib/Playground/PlaygroundSubtitle.svelte b/src/lib/Playground/PlaygroundSubtitle.svelte index e9504d3f..432d3a40 100644 --- a/src/lib/Playground/PlaygroundSubtitle.svelte +++ b/src/lib/Playground/PlaygroundSubtitle.svelte @@ -14,6 +14,7 @@ import SliderInput from "../UI/GUI/SliderInput.svelte"; import SelectInput from "../UI/GUI/SelectInput.svelte"; import OptionInput from "../UI/GUI/OptionInput.svelte"; + import sendSound from '../../etc/send.mp3' @@ -28,6 +29,39 @@ let vttB64 = $state('') let vobj:TranscribeObj[] = $state([]) let mode = $state('llm') + let sourceLang:string|null = $state(null) + + function getLanguageCodes(){ + let languageCodes:{ + code: string + name: string + }[] = [] + + for(let i=0x41;i<=0x5A;i++){ + for(let j=0x41;j<=0x5A;j++){ + languageCodes.push({ + code: String.fromCharCode(i) + String.fromCharCode(j), + name: '' + }) + } + } + + languageCodes = languageCodes.map(v => { + return { + code: v.code, + name: new Intl.DisplayNames([ + DBState.db.language === 'cn' ? 'zh' : DBState.db.language + ], { + type: 'language', + fallback: 'none' + }).of(v.code) + } + }).filter((a) => { + return a.name + }).sort((a, b) => a.name.localeCompare(b.name)) + + return languageCodes + } @@ -120,6 +154,9 @@ vobj = convertTransToObj(latest) outputText = makeWebVtt(vobj) vttB64 = `data:text/vtt;base64,${Buffer.from(outputText).toString('base64')}` + + const audio = new Audio(sendSound); + audio.play(); } async function runWhisperMode() { @@ -133,13 +170,12 @@ const file = files?.[0] + let requestFile:File = null + if(!file){ outputText = '' return } - - const formData = new FormData() - const videos = [ 'mp4', 'webm', 'mkv', 'avi', 'mov' ] @@ -198,28 +234,100 @@ }) outputText = 'Transcribing audio...\n\n' - formData.append('file', file2) + requestFile = file2 } else{ - formData.append('file', file) + requestFile = file } - formData.append('model', 'whisper-1') - formData.append('response_format', 'vtt') + + if(mode === 'whisperLocal'){ + try { + const {pipeline} = await import('@huggingface/transformers') + let stats:{ + [key:string]:{ + name:string + status:string + file:string + progress?:number + } + } = {} + + const device = ('gpu' in navigator) ? 'webgpu' : 'wasm' + + const transcriber = await pipeline( + "automatic-speech-recognition", + "onnx-community/whisper-large-v3-turbo_timestamped", + { + device: device, + progress_callback: (progress) => { + stats[progress.name + progress.file] = progress + outputText = Object.values(stats).map(v => `${v.name}-${v.file}: ${progress.status} ${v.progress ? `[${v.progress.toFixed(2)}%]` : ''}`).join('\n') + }, + dtype: 'q8' + }, + ); + + const audioContext = new AudioContext() + const audioBuffer = await audioContext.decodeAudioData(await requestFile.arrayBuffer()) + const combined = new Float32Array(audioBuffer.getChannelData(0).length) + for(let j = 0; j < audioBuffer.getChannelData(0).length; j++){ + for(let i = 0; i < audioBuffer.numberOfChannels; i++){ + combined[j] += audioBuffer.getChannelData(i)[j] + } + + if(combined[j] > 1){ + combined[j] = 1 + } + if(combined[j] < -1){ + combined[j] = -1 + } + } + + outputText = ('Transcribing... (This may take a while. Do not close the tab.)') + if(device !== 'webgpu'){ + outputText += `\nYour browser or OS do not support WebGPU, so the transcription may be slower.` + } + await sleep(10) + const res1 = await transcriber(combined, { + return_timestamps: true, + language: sourceLang, + }) + const res2 = Array.isArray(res1) ? res1[0] : res1 + const chunks = res2.chunks + + outputText = 'WEBVTT\n\n' + + for(const chunk of chunks){ + outputText += `${chunk.timestamp[0]} --> ${chunk.timestamp[1]}\n${chunk.text}\n\n` + } + + console.log(outputText) + + } catch (error) { + alertError(JSON.stringify(error)) + outputText = '' + return + } + } + else{ + const formData = new FormData() + formData.append('file', requestFile) + formData.append('model', 'whisper-1') + formData.append('response_format', 'vtt') - const d = await fetch('https://api.openai.com/v1/audio/transcriptions', { - method: 'POST', - headers: { - 'Authorization': `Bearer ${DBState.db.openAIKey}` - }, - body: formData + const d = await fetch('https://api.openai.com/v1/audio/transcriptions', { + method: 'POST', + headers: { + 'Authorization': `Bearer ${DBState.db.openAIKey}` + }, + body: formData - }) + }) + outputText = await d.text() + } - const fileBuffer = await file.arrayBuffer() - - outputText = await d.text() const v = await requestChatData({ formated: [{ @@ -254,9 +362,6 @@ outputText = value[firstKey] } - - console.log(outputText) - if(!outputText.trim().endsWith('```')){ outputText = outputText.trim() + '\n```' } @@ -268,12 +373,14 @@ latest = match[3].trim() } + const fileBuffer = await file.arrayBuffer() outputText = latest vttB64 = `data:text/vtt;base64,${Buffer.from(outputText).toString('base64')}` fileB64 = `data:audio/wav;base64,${Buffer.from(fileBuffer).toString('base64')}` vobj = convertWebVTTtoObj(outputText) - + const audio = new Audio(sendSound); + audio.play(); } @@ -355,7 +462,18 @@

{language.subtitles}

-{language.language} +{#if mode === 'whisperLocal'} + {language.sourceLanguage} + + Auto + {#each getLanguageCodes() as lang} + {lang.name} + {/each} + +{/if} + + +{language.destinationLanguage} {language.prompt} @@ -366,27 +484,31 @@ if(mode === 'llm'){ prompt = LLMModePrompt } - if(mode === 'whisper'){ + if(mode === 'whisper' || mode === 'whisperLocal'){ prompt = WhisperModePrompt } }}> LLM Whisper + Whisper Local -{#if !(modelInfo.flags.includes(LLMFlags.hasAudioInput) && modelInfo.flags.includes(LLMFlags.hasVideoInput))} +{#if !(modelInfo.flags.includes(LLMFlags.hasAudioInput) && modelInfo.flags.includes(LLMFlags.hasVideoInput)) && mode === 'llm'} {language.subtitlesWarning1} {/if} {#if !(modelInfo.flags.includes(LLMFlags.hasStreaming) && DBState.db.useStreaming)} {language.subtitlesWarning2} {/if} +{#if !('gpu' in navigator) && mode === 'whisperLocal'} + {language.noWebGPU} +{/if} {#if !outputText}