diff --git a/src/lang/en.ts b/src/lang/en.ts
index 83a8a3f9..dc720fc9 100644
--- a/src/lang/en.ts
+++ b/src/lang/en.ts
@@ -828,4 +828,7 @@ export const languageEnglish = {
subtitlesWarning2: "You must use model with streaming feature to use this feature.",
reset: "Reset",
assetMaxDifference: "Asset Max Difference",
+ sourceLanguage: "Source Language",
+ destinationLanguage: "Destination Language",
+ noWebGPU: "Your browser or OS doesn't support WebGPU. This will slow down performance significantly.",
}
\ No newline at end of file
diff --git a/src/lib/Playground/PlaygroundImageGen.svelte b/src/lib/Playground/PlaygroundImageGen.svelte
index 38f1e4bd..059256a8 100644
--- a/src/lib/Playground/PlaygroundImageGen.svelte
+++ b/src/lib/Playground/PlaygroundImageGen.svelte
@@ -1,34 +1,44 @@
-
-
-
-
-
-
-
-
-
-
-
- Card Title
-
-
-
-
- Lorem ipsum dolor sit amet consectetur adipisicing elit. Voluptates rerum quisquam, temporibus quasi distinctio magnam.
-
-
-
-
-
- Learn More
-
-
-
-
- 5 min read
-
-
-
-
\ No newline at end of file
+
+
+{language.imageGeneration}
+
+Prompt
+
+
+Neg. Prompt
+
+
+{#if img}
+ Generated
+
+{/if}
+
+
+ {#if generating}
+
+ {:else}
+ Generate
+ {/if}
+
\ No newline at end of file
diff --git a/src/lib/Playground/PlaygroundImageTrans.svelte b/src/lib/Playground/PlaygroundImageTrans.svelte
new file mode 100644
index 00000000..99a8152e
--- /dev/null
+++ b/src/lib/Playground/PlaygroundImageTrans.svelte
@@ -0,0 +1,15 @@
+
+
+
+{language.destinationLanguage}
+
+
+{language.prompt}
+
diff --git a/src/lib/Playground/PlaygroundSubtitle.svelte b/src/lib/Playground/PlaygroundSubtitle.svelte
index e9504d3f..432d3a40 100644
--- a/src/lib/Playground/PlaygroundSubtitle.svelte
+++ b/src/lib/Playground/PlaygroundSubtitle.svelte
@@ -14,6 +14,7 @@
import SliderInput from "../UI/GUI/SliderInput.svelte";
import SelectInput from "../UI/GUI/SelectInput.svelte";
import OptionInput from "../UI/GUI/OptionInput.svelte";
+ import sendSound from '../../etc/send.mp3'
@@ -28,6 +29,39 @@
let vttB64 = $state('')
let vobj:TranscribeObj[] = $state([])
let mode = $state('llm')
+ let sourceLang:string|null = $state(null)
+
+ function getLanguageCodes(){
+ let languageCodes:{
+ code: string
+ name: string
+ }[] = []
+
+ for(let i=0x41;i<=0x5A;i++){
+ for(let j=0x41;j<=0x5A;j++){
+ languageCodes.push({
+ code: String.fromCharCode(i) + String.fromCharCode(j),
+ name: ''
+ })
+ }
+ }
+
+ languageCodes = languageCodes.map(v => {
+ return {
+ code: v.code,
+ name: new Intl.DisplayNames([
+ DBState.db.language === 'cn' ? 'zh' : DBState.db.language
+ ], {
+ type: 'language',
+ fallback: 'none'
+ }).of(v.code)
+ }
+ }).filter((a) => {
+ return a.name
+ }).sort((a, b) => a.name.localeCompare(b.name))
+
+ return languageCodes
+ }
@@ -120,6 +154,9 @@
vobj = convertTransToObj(latest)
outputText = makeWebVtt(vobj)
vttB64 = `data:text/vtt;base64,${Buffer.from(outputText).toString('base64')}`
+
+ const audio = new Audio(sendSound);
+ audio.play();
}
async function runWhisperMode() {
@@ -133,13 +170,12 @@
const file = files?.[0]
+ let requestFile:File = null
+
if(!file){
outputText = ''
return
}
-
- const formData = new FormData()
-
const videos = [
'mp4', 'webm', 'mkv', 'avi', 'mov'
]
@@ -198,28 +234,100 @@
})
outputText = 'Transcribing audio...\n\n'
- formData.append('file', file2)
+ requestFile = file2
}
else{
- formData.append('file', file)
+ requestFile = file
}
- formData.append('model', 'whisper-1')
- formData.append('response_format', 'vtt')
+
+ if(mode === 'whisperLocal'){
+ try {
+ const {pipeline} = await import('@huggingface/transformers')
+ let stats:{
+ [key:string]:{
+ name:string
+ status:string
+ file:string
+ progress?:number
+ }
+ } = {}
+
+ const device = ('gpu' in navigator) ? 'webgpu' : 'wasm'
+
+ const transcriber = await pipeline(
+ "automatic-speech-recognition",
+ "onnx-community/whisper-large-v3-turbo_timestamped",
+ {
+ device: device,
+ progress_callback: (progress) => {
+ stats[progress.name + progress.file] = progress
+ outputText = Object.values(stats).map(v => `${v.name}-${v.file}: ${progress.status} ${v.progress ? `[${v.progress.toFixed(2)}%]` : ''}`).join('\n')
+ },
+ dtype: 'q8'
+ },
+ );
+
+ const audioContext = new AudioContext()
+ const audioBuffer = await audioContext.decodeAudioData(await requestFile.arrayBuffer())
+ const combined = new Float32Array(audioBuffer.getChannelData(0).length)
+ for(let j = 0; j < audioBuffer.getChannelData(0).length; j++){
+ for(let i = 0; i < audioBuffer.numberOfChannels; i++){
+ combined[j] += audioBuffer.getChannelData(i)[j]
+ }
+
+ if(combined[j] > 1){
+ combined[j] = 1
+ }
+ if(combined[j] < -1){
+ combined[j] = -1
+ }
+ }
+
+ outputText = ('Transcribing... (This may take a while. Do not close the tab.)')
+ if(device !== 'webgpu'){
+ outputText += `\nYour browser or OS does not support WebGPU, so the transcription may be slower.`
+ }
+ await sleep(10)
+ const res1 = await transcriber(combined, {
+ return_timestamps: true,
+ language: sourceLang,
+ })
+ const res2 = Array.isArray(res1) ? res1[0] : res1
+ const chunks = res2.chunks
+
+ outputText = 'WEBVTT\n\n'
+
+ for(const chunk of chunks){
+ outputText += `${chunk.timestamp[0]} --> ${chunk.timestamp[1]}\n${chunk.text}\n\n`
+ }
+
+ console.log(outputText)
+
+ } catch (error) {
+ alertError(JSON.stringify(error))
+ outputText = ''
+ return
+ }
+ }
+ else{
+ const formData = new FormData()
+ formData.append('file', requestFile)
+ formData.append('model', 'whisper-1')
+ formData.append('response_format', 'vtt')
- const d = await fetch('https://api.openai.com/v1/audio/transcriptions', {
- method: 'POST',
- headers: {
- 'Authorization': `Bearer ${DBState.db.openAIKey}`
- },
- body: formData
+ const d = await fetch('https://api.openai.com/v1/audio/transcriptions', {
+ method: 'POST',
+ headers: {
+ 'Authorization': `Bearer ${DBState.db.openAIKey}`
+ },
+ body: formData
- })
+ })
+ outputText = await d.text()
+ }
- const fileBuffer = await file.arrayBuffer()
-
- outputText = await d.text()
const v = await requestChatData({
formated: [{
@@ -254,9 +362,6 @@
outputText = value[firstKey]
}
-
- console.log(outputText)
-
if(!outputText.trim().endsWith('```')){
outputText = outputText.trim() + '\n```'
}
@@ -268,12 +373,14 @@
latest = match[3].trim()
}
+ const fileBuffer = await file.arrayBuffer()
outputText = latest
vttB64 = `data:text/vtt;base64,${Buffer.from(outputText).toString('base64')}`
fileB64 = `data:audio/wav;base64,${Buffer.from(fileBuffer).toString('base64')}`
vobj = convertWebVTTtoObj(outputText)
-
+ const audio = new Audio(sendSound);
+ audio.play();
}
@@ -355,7 +462,18 @@
{language.subtitles}
-{language.language}
+{#if mode === 'whisperLocal'}
+ {language.sourceLanguage}
+
+ Auto
+ {#each getLanguageCodes() as lang}
+ {lang.name}
+ {/each}
+
+{/if}
+
+
+{language.destinationLanguage}
{language.prompt}
@@ -366,27 +484,31 @@
if(mode === 'llm'){
prompt = LLMModePrompt
}
- if(mode === 'whisper'){
+ if(mode === 'whisper' || mode === 'whisperLocal'){
prompt = WhisperModePrompt
}
}}>
LLM
Whisper
+ Whisper Local
-{#if !(modelInfo.flags.includes(LLMFlags.hasAudioInput) && modelInfo.flags.includes(LLMFlags.hasVideoInput))}
+{#if !(modelInfo.flags.includes(LLMFlags.hasAudioInput) && modelInfo.flags.includes(LLMFlags.hasVideoInput)) && mode === 'llm'}
{language.subtitlesWarning1}
{/if}
{#if !(modelInfo.flags.includes(LLMFlags.hasStreaming) && DBState.db.useStreaming)}
{language.subtitlesWarning2}
{/if}
+{#if !('gpu' in navigator) && mode === 'whisperLocal'}
+ {language.noWebGPU}
+{/if}
{#if !outputText}
{
if(mode === 'llm'){
runLLMMode()
}
- if(mode === 'whisper'){
+ if(mode === 'whisper' || mode === 'whisperLocal'){
runWhisperMode()
}
}}>
diff --git a/src/lib/Setting/Pages/BotSettings.svelte b/src/lib/Setting/Pages/BotSettings.svelte
index 7f04094a..6da2252b 100644
--- a/src/lib/Setting/Pages/BotSettings.svelte
+++ b/src/lib/Setting/Pages/BotSettings.svelte
@@ -271,7 +271,7 @@
{#if submenu === 1 || submenu === -1}
{language.maxContextSize}
-
+
{language.maxResponseSize}
diff --git a/src/lib/Setting/Pages/LanguageSettings.svelte b/src/lib/Setting/Pages/LanguageSettings.svelte
index 7f313af1..171432bc 100644
--- a/src/lib/Setting/Pages/LanguageSettings.svelte
+++ b/src/lib/Setting/Pages/LanguageSettings.svelte
@@ -121,7 +121,7 @@
{/if}
{#if DBState.db.translatorType === 'google'}
- Translator Input Language
+ {language.sourceLanguage}
Auto
English
diff --git a/src/ts/model/modellist.ts b/src/ts/model/modellist.ts
index 680ac909..1367399f 100644
--- a/src/ts/model/modellist.ts
+++ b/src/ts/model/modellist.ts
@@ -14,7 +14,8 @@ export enum LLMFlags{
requiresAlternateRole,
mustStartWithUserInput,
poolSupported,
- hasVideoInput
+ hasVideoInput,
+ OAICompletionTokens
}
export enum LLMProvider{
@@ -409,7 +410,7 @@ export const LLMModels: LLMModel[] = [
flags: [
LLMFlags.hasImageInput,
LLMFlags.hasFullSystemPrompt,
- LLMFlags.hasStreaming
+ LLMFlags.hasStreaming,
],
parameters: OpenAIParameters,
tokenizer: LLMTokenizer.tiktokenO200Base
@@ -421,8 +422,8 @@ export const LLMModels: LLMModel[] = [
provider: LLMProvider.OpenAI,
format: LLMFormat.OpenAICompatible,
flags: [
- LLMFlags.hasFullSystemPrompt,
- LLMFlags.hasStreaming
+ LLMFlags.hasStreaming,
+ LLMFlags.OAICompletionTokens
],
parameters: OpenAIParameters,
tokenizer: LLMTokenizer.tiktokenO200Base
@@ -434,8 +435,23 @@ export const LLMModels: LLMModel[] = [
provider: LLMProvider.OpenAI,
format: LLMFormat.OpenAICompatible,
flags: [
+ LLMFlags.hasStreaming,
+ LLMFlags.OAICompletionTokens
+ ],
+ parameters: OpenAIParameters,
+ tokenizer: LLMTokenizer.tiktokenO200Base
+ },
+ {
+ id: 'o1',
+ internalID: 'o1',
+ name: 'o1',
+ provider: LLMProvider.OpenAI,
+ format: LLMFormat.OpenAICompatible,
+ flags: [
+ LLMFlags.hasStreaming,
+ LLMFlags.OAICompletionTokens,
LLMFlags.hasFullSystemPrompt,
- LLMFlags.hasStreaming
+ LLMFlags.hasImageInput
],
parameters: OpenAIParameters,
tokenizer: LLMTokenizer.tiktokenO200Base
diff --git a/src/ts/process/request.ts b/src/ts/process/request.ts
index adb27280..1e37df20 100644
--- a/src/ts/process/request.ts
+++ b/src/ts/process/request.ts
@@ -409,15 +409,6 @@ async function requestOpenAI(arg:RequestDataArgumentExtended):Promise${formatedChat[i].content}`
- formatedChat[i].role = 'user'
- }
- }
- }
-
for(let i=0;i => {
const id = v4().replace(/-/g, '')