Add subtitle

2024-12-15 19:33:26 +09:00
parent e2149ced1b
commit dcc48d5cb3
9 changed files with 611 additions and 85 deletions
--- a/src/lib/Playground/PlaygroundMenu.svelte
+++ b/src/lib/Playground/PlaygroundMenu.svelte
@@ -14,6 +14,7 @@
    import PlaygroundParser from "./PlaygroundParser.svelte";
    import ToolConvertion from "./ToolConvertion.svelte";
    import { joinMultiuserRoom } from "src/ts/sync/multiuser";
+  import PlaygroundSubtitle from "./PlaygroundSubtitle.svelte";

    let easterEggTouch = $state(0)

@@ -83,6 +84,11 @@
            }}>
                <h1 class="text-2xl font-bold text-start">Parser</h1>
            </button>
+            <button class="bg-darkbg rounded-md p-6 flex flex-col transition-shadow hover:ring-1" onclick={() => {
+                PlaygroundStore.set(9)
+            }}>
+                <h1 class="text-2xl font-bold text-start">{language.subtitles}</h1>
+            </button>
            <button class="bg-darkbg rounded-md p-6 flex flex-col transition-shadow hover:ring-1" onclick={() => {
                PlaygroundStore.set(101)
            }}>
@@ -139,6 +145,9 @@
            {#if $PlaygroundStore === 8}
                <PlaygroundParser/>
            {/if}  
+            {#if $PlaygroundStore === 9}
+                <PlaygroundSubtitle/>
+            {/if}  
            {#if $PlaygroundStore === 101}
                <ToolConvertion/>
            {/if}  
--- a/src/lib/Playground/PlaygroundSubtitle.svelte
+++ b/src/lib/Playground/PlaygroundSubtitle.svelte
@@ -0,0 +1,443 @@
+
+<script lang="ts">
+    import { language } from "src/lang";
+    import TextInput from "../UI/GUI/TextInput.svelte";
+    import TextAreaInput from "../UI/GUI/TextAreaInput.svelte";
+    import Button from "../UI/GUI/Button.svelte";
+    import { DBState } from "src/ts/stores.svelte";
+    import { getModelInfo, LLMFlags } from "src/ts/model/modellist";
+    import { requestChatData } from "src/ts/process/request";
+    import { selectFileByDom, selectSingleFile, sleep } from "src/ts/util";
+    import { alertError, alertSelect } from "src/ts/alert";
+    import { risuChatParser } from "src/ts/parser.svelte";
+    import { AppendableBuffer, downloadFile, globalFetch } from "src/ts/globalApi.svelte";
+    import SliderInput from "../UI/GUI/SliderInput.svelte";
+    import SelectInput from "../UI/GUI/SelectInput.svelte";
+    import OptionInput from "../UI/GUI/OptionInput.svelte";
+
+
+
+    let LLMModePrompt ="Transcribe and create a caption and timestamp of it, according to the user's audio or video input. inside a markdown code block. (prefix ```webvtt / postfix ```)\n\nFormat\n```\n[TIME] CONTENT\n```\n\nExample\n```\n[00:00] Hildy!\n[00:01] How are you?\n[00:03] Tell me, is the lord of the universe in?\n[00:07] Somebody must've stolen the crown jewels\n```\n\nStep 2. Generate another subtitle, this time, as a translation to {{slot}}, with same format with Step 1., using step 1 as ref.\n\n The translation must be in natural {{slot}}.\n\n Now, start (Hint: media length is {{slot::time}})"
+    let WhisperModePrompt = "```\n{{slot::data}}\n``` Translate the following WEBVTT to natural {{slot}}, with keeping the timestamp and header, inside a markdown code block. (prefix ``` / postfix ```)"
+
+    let selLang = $state(DBState.db.language)
+    let prompt = $state(LLMModePrompt)
+    let modelInfo = $derived(getModelInfo(DBState.db.aiModel))
+    let outputText = $state('')
+    let fileB64 = $state('')
+    let vttB64 = $state('')
+    let vobj:TranscribeObj[] = $state([])
+    let mode = $state('llm')
+
+
+
+    async function runLLMMode() {
+        outputText = 'Loading...\n\n'
+
+        const file = await selectSingleFile([
+            'mp3', 'ogg', 'wav', 'flac',
+            'mp4', 'webm', 'mkv', 'avi', 'mov'  
+        ])
+
+        if(!file){
+            outputText = ''
+            return
+        }
+
+        const videos = [
+            'mp4', 'webm', 'mkv', 'avi', 'mov'
+        ]
+
+        const ext = file.name.split('.').pop()
+
+        fileB64 = `data:${
+            videos.includes(ext) ? 'video' : 'audio'
+        }/${ext};base64,${Buffer.from(file.data).toString('base64')}`
+
+        const media = {
+            type: videos.includes(ext) ? 'video' : 'audio',
+            base64: fileB64,
+        } as const
+
+        let time = ''
+
+        if(prompt.includes('{{slot::time}}')){
+            const video = document.createElement('video')
+            video.src = fileB64
+            video.preload = 'metadata'
+            video.muted = true
+            await video.play()
+            const d = video.duration
+            console.log(d)
+            if(isNaN(d)){
+                time = 'unknown'
+            }else{
+                time = `${Math.floor(d / 60)}:${Math.floor(d % 60)}`
+            }
+            video.pause()
+            video.remove()
+        }
+
+        const v =await requestChatData({
+            formated: [{
+                role: "user",
+                content: risuChatParser(prompt).replace(/{{slot}}/g, selLang).replace(/{{slot::time}}/g, time),
+                multimodals: [media]
+            }],
+            bias: {},
+            useStreaming: true
+        }, 'model')
+
+        if(v.type === 'multiline'){
+            alertError(v.result[0][1])
+            return
+        }
+
+        if(v.type !== 'streaming'){
+            alertError(v.result)
+            return
+        }
+
+        const reader = v.result.getReader()
+
+        while(true){
+            const { done, value } = await reader.read()
+            if(done){
+                break
+            }
+            const firstKey = Object.keys(value)[0]
+
+            outputText = value[firstKey]
+        }
+
+        const extracted = outputText.matchAll(/```(web)?(vtt)?\n(.*?)\n```/gs)
+
+        let latest = ''
+        for(const match of extracted){
+            latest = match[3].trim()
+        }
+
+        vobj = convertTransToObj(latest)
+        outputText = makeWebVtt(vobj)
+        vttB64 = `data:text/vtt;base64,${Buffer.from(outputText).toString('base64')}`
+    }
+
+    async function runWhisperMode() {
+        outputText = 'Loading...\n\n'
+
+        const files = await selectFileByDom([
+            'mp3', 'ogg', 'wav', 'flac',
+            'mp4', 'webm', 'mkv', 'avi', 'mov'  
+
+        ])
+
+        const file = files?.[0]
+
+        if(!file){
+            outputText = ''
+            return
+        }
+
+        const formData = new FormData()
+
+        const videos = [
+            'mp4', 'webm', 'mkv', 'avi', 'mov'
+        ]
+
+        const ext = file.name.split('.').pop()
+        if(videos.includes(ext)){
+            
+
+            //check duration
+            let duration = 0
+            {
+                const video = document.createElement('video')
+                video.src = URL.createObjectURL(file)
+                video.preload = 'metadata'
+                video.muted = true
+                await video.play()
+                const d = video.duration
+                if(isNaN(d)){
+                    alertError('This video does not have a duration')
+                    return
+                }
+                video.pause()
+                video.remove()
+                duration = d
+            }
+
+            outputText = 'Converting video to audio...\n\n'
+            const audioContext = new AudioContext()
+            const audioBuffer = await audioContext.decodeAudioData(await file.arrayBuffer())
+
+            const [left, right] =  [audioBuffer.getChannelData(0), audioBuffer.getChannelData(1)]
+
+            const leftInt16 = new Int16Array(left.length)
+            const rightInt16 = new Int16Array(right.length)
+
+            for(let i = 0; i < left.length; i++){
+                leftInt16[i] = left[i] * 0x7FFF
+                rightInt16[i] = right[i] * 0x7FFF
+            }
+
+            const lamejs = await import('@breezystack/lamejs')
+            const mp3encoder = new lamejs.Mp3Encoder(2, 44100, 128);
+            const enc = new AppendableBuffer()
+
+            for(let pointer = 0; pointer < leftInt16.length; pointer += 1152){
+                enc.append(mp3encoder.encodeBuffer(leftInt16.subarray(pointer, pointer + 1152), rightInt16.subarray(pointer, pointer + 1152)))
+                if(pointer % 115200 === 0){
+                    outputText = `Converting  video to audio... ${(pointer / leftInt16.length * 100).toFixed(2)}%\n`
+                    await sleep(1)
+                }
+            }
+            enc.append(mp3encoder.flush())
+
+            const file2 = new File([enc.buffer], 'audio.mp3', {
+                type: 'audio/mp3'
+            })
+
+            outputText = 'Transcribing audio...\n\n'
+            formData.append('file', file2)
+        }
+        else{
+            formData.append('file', file)
+        }
+
+        formData.append('model', 'whisper-1')
+        formData.append('response_format', 'vtt')
+
+
+        const d = await fetch('https://api.openai.com/v1/audio/transcriptions', {
+            method: 'POST',
+            headers: {
+                'Authorization': `Bearer ${DBState.db.openAIKey}`
+            },
+            body: formData
+
+        })
+
+        const fileBuffer = await file.arrayBuffer()
+
+        outputText = await d.text()
+
+        const v = await requestChatData({
+            formated: [{
+                role: "user",
+                content: risuChatParser(prompt).replace(/{{slot}}/g, selLang).replace(/{{slot::data}}/g, outputText),
+            }],
+            bias: {},
+            useStreaming: true
+        }, 'model')
+
+
+        if(v.type === 'multiline'){
+            alertError(v.result[0][1])
+            return
+        }
+
+        if(v.type !== 'streaming'){
+            alertError(v.result)
+            return
+        }
+
+        console.log("Reading...")
+
+        const reader = v.result.getReader()
+
+        while(true){
+            const { done, value } = await reader.read()
+            if(done){
+                break
+            }
+            const firstKey = Object.keys(value)[0]
+
+            outputText = value[firstKey]
+        }
+
+        console.log(outputText)
+
+        if(!outputText.trim().endsWith('```')){
+            outputText = outputText.trim() + '\n```'
+        }
+
+        const extracted = outputText.matchAll(/```(web)?(vtt)?\n(.*?)\n```/gs)
+
+        let latest = ''
+        for(const match of extracted){
+            latest = match[3].trim()
+        }
+
+        outputText = latest
+        vttB64 = `data:text/vtt;base64,${Buffer.from(outputText).toString('base64')}`
+        fileB64 = `data:audio/wav;base64,${Buffer.from(fileBuffer).toString('base64')}`
+        vobj = convertWebVTTtoObj(outputText)
+
+
+    }
+
+    
+
+
+    type TranscribeObj = {
+        start: string
+        end: string
+        text: string
+    }
+    
+
+    function convertTransToObj(r:string){
+        const lines = r.split('\n').map(v => v.trim()).filter(v => v)
+        const obj:TranscribeObj[] = []
+        for(let i = 0; i < lines.length; i++){
+            const line = lines[i]
+            if(line.startsWith('[')){
+                let [time, ...text] = line.split(']')
+                time = time.slice(1)
+                if(obj.length > 0){
+                    obj[obj.length - 1].end = time + '.000'
+                }
+                obj.push({
+                    start: time + '.000',
+                    end: '',
+                    text: text.join(' ')
+                })
+            }
+        }
+        //rediculously long line
+        obj[obj.length - 1].end = '99:99.000'
+        return obj
+    }
+
+    function convertWebVTTtoObj(r:string){
+        const chunks = r.split('\n\n').map(v => v.trim()).filter(v => v)
+        const obj:TranscribeObj[] = []
+        for(const chunk of chunks){
+            if(chunk.startsWith('WEBVTT')){
+                continue
+            }
+            const [time, ...text] = chunk.split('\n')
+            const [start, end] = time.split(' --> ')
+            obj.push({
+                start: start,
+                end: end,
+                text: text.join('\n')
+            })
+        }
+        return obj
+    }
+
+    function makeWebVtt(obj: TranscribeObj[]){
+        let vtt = 'WEBVTT\n\n'
+
+        for(const line of obj){
+            vtt += `${line.start} --> ${line.end}\n${line.text}\n\n`
+        }
+
+        return vtt
+    }
+
+    function webVttToSrt(){
+        const srt = outputText.replace('WEBVTT', '').trim().split('\n\n').map((v, i) => {
+            const [time, ...text] = v.split('\n')
+            const [start, end] = time.split(' --> ')
+            return `${i + 1}\n${start.replace('.', ',')} --> ${end.replace('.', ',')}\n${text.join('\n')}`
+        })
+        return srt
+    }
+
+    type WaveOptions = {
+        isFloat: boolean
+        numChannels: number
+        sampleRate: number
+    }
+</script>
+
+<h2 class="text-4xl text-textcolor my-6 font-black relative">{language.subtitles}</h2>
+
+<span class="text-textcolor text-lg mt-4">{language.language}</span>
+<TextInput bind:value={selLang} />
+
+<span class="text-textcolor text-lg mt-4">{language.prompt}</span>
+<TextAreaInput bind:value={prompt} />
+
+<span class="text-textcolor text-lg mt-4">{language.type}</span>
+<SelectInput bind:value={mode} onchange={(e) => {
+    if(mode === 'llm'){
+        prompt = LLMModePrompt
+    }
+    if(mode === 'whisper'){
+        prompt = WhisperModePrompt
+    }
+}}>
+    <OptionInput value="llm">LLM</OptionInput>
+    <OptionInput value="whisper">Whisper</OptionInput>
+</SelectInput>
+
+{#if !(modelInfo.flags.includes(LLMFlags.hasAudioInput) && modelInfo.flags.includes(LLMFlags.hasVideoInput))}
+    <span class="text-draculared text-lg mt-4">{language.subtitlesWarning1}</span>
+{/if}
+{#if !(modelInfo.flags.includes(LLMFlags.hasStreaming) && DBState.db.useStreaming)}
+    <span class="text-draculared text-lg mt-4">{language.subtitlesWarning2}</span>
+{/if}
+
+{#if !outputText}
+    <Button className="mt-4" onclick={() => {
+        if(mode === 'llm'){
+            runLLMMode()
+        }
+        if(mode === 'whisper'){
+            runWhisperMode()
+        }
+    }}>
+        {language.run}
+    </Button>
+{:else if vttB64 && fileB64}
+    <details class="mt-4">
+        <pre>{outputText}</pre>
+    </details>
+{:else}
+    <pre>{outputText}</pre>
+{/if}
+
+{#if vttB64 && fileB64}
+    <div class="mt-4">
+        {#key vttB64}
+            <video controls src={fileB64} class="w-full">
+                <track default kind="captions" src={vttB64} srclang="en" />
+            </video>
+        {/key}
+    </div>
+
+    <span class="text-textcolor text-lg mt-4">{language.download}</span>
+
+    <Button className="mt-4" onclick={() => {
+        outputText = ''
+        fileB64 = ''
+        vttB64 = ''
+    }}>
+        {language.reset}
+    </Button>
+
+    <Button className="mt-4" onclick={async () => {
+        const sel = parseInt(await alertSelect([
+            'WebVTT',
+            'SRT'
+        ]))
+        const a = document.createElement('a')
+
+        // WebVTT
+        if(sel === 0){
+            downloadFile('subtitle.vtt', outputText)
+            return
+        }
+
+        // SRT
+        if(sel === 1){
+            downloadFile('subtitle.srt', webVttToSrt().join('\n\n'))
+            return   
+        }
+    }}>
+        {language.download}
+    </Button>
+{/if}