Add subtitle

This commit is contained in:
Kwaroran
2024-12-15 19:33:26 +09:00
parent e2149ced1b
commit dcc48d5cb3
9 changed files with 611 additions and 85 deletions

View File

@@ -14,6 +14,7 @@
import PlaygroundParser from "./PlaygroundParser.svelte";
import ToolConvertion from "./ToolConvertion.svelte";
import { joinMultiuserRoom } from "src/ts/sync/multiuser";
import PlaygroundSubtitle from "./PlaygroundSubtitle.svelte";
let easterEggTouch = $state(0)
@@ -83,6 +84,11 @@
}}>
<h1 class="text-2xl font-bold text-start">Parser</h1>
</button>
<button class="bg-darkbg rounded-md p-6 flex flex-col transition-shadow hover:ring-1" onclick={() => {
PlaygroundStore.set(9)
}}>
<h1 class="text-2xl font-bold text-start">{language.subtitles}</h1>
</button>
<button class="bg-darkbg rounded-md p-6 flex flex-col transition-shadow hover:ring-1" onclick={() => {
PlaygroundStore.set(101)
}}>
@@ -139,6 +145,9 @@
{#if $PlaygroundStore === 8}
<PlaygroundParser/>
{/if}
{#if $PlaygroundStore === 9}
<PlaygroundSubtitle/>
{/if}
{#if $PlaygroundStore === 101}
<ToolConvertion/>
{/if}

View File

@@ -0,0 +1,443 @@
<script lang="ts">
import { language } from "src/lang";
import TextInput from "../UI/GUI/TextInput.svelte";
import TextAreaInput from "../UI/GUI/TextAreaInput.svelte";
import Button from "../UI/GUI/Button.svelte";
import { DBState } from "src/ts/stores.svelte";
import { getModelInfo, LLMFlags } from "src/ts/model/modellist";
import { requestChatData } from "src/ts/process/request";
import { selectFileByDom, selectSingleFile, sleep } from "src/ts/util";
import { alertError, alertSelect } from "src/ts/alert";
import { risuChatParser } from "src/ts/parser.svelte";
import { AppendableBuffer, downloadFile, globalFetch } from "src/ts/globalApi.svelte";
import SliderInput from "../UI/GUI/SliderInput.svelte";
import SelectInput from "../UI/GUI/SelectInput.svelte";
import OptionInput from "../UI/GUI/OptionInput.svelte";
// Default prompt for "llm" mode. runLLMMode replaces {{slot}} with selLang and
// {{slot::time}} with the detected media length before sending it to the model.
let LLMModePrompt ="Transcribe and create a caption and timestamp of it, according to the user's audio or video input. inside a markdown code block. (prefix ```webvtt / postfix ```)\n\nFormat\n```\n[TIME] CONTENT\n```\n\nExample\n```\n[00:00] Hildy!\n[00:01] How are you?\n[00:03] Tell me, is the lord of the universe in?\n[00:07] Somebody must've stolen the crown jewels\n```\n\nStep 2. Generate another subtitle, this time, as a translation to {{slot}}, with same format with Step 1., using step 1 as ref.\n\n The translation must be in natural {{slot}}.\n\n Now, start (Hint: media length is {{slot::time}})"
// Default prompt for "whisper" mode. runWhisperMode replaces {{slot::data}} with the
// transcription text returned by the transcription endpoint and {{slot}} with selLang.
let WhisperModePrompt = "```\n{{slot::data}}\n``` Translate the following WEBVTT to natural {{slot}}, with keeping the timestamp and header, inside a markdown code block. (prefix ``` / postfix ```)"
// Target language substituted into the prompt; starts as the configured DB language value.
let selLang = $state(DBState.db.language)
// Editable prompt bound to the text area; reset to a default when the mode select changes.
let prompt = $state(LLMModePrompt)
// Info for the currently selected model; the template reads its flags to show warnings.
let modelInfo = $derived(getModelInfo(DBState.db.aiModel))
// Progress text while a run is active, then the resulting subtitle text.
// The template shows the Run button only while this is empty.
let outputText = $state('')
// data: URL of the selected media, used as the <video> source in the template.
let fileB64 = $state('')
// data: URL of the generated WebVTT, used as the <track> source in the template.
let vttB64 = $state('')
// Parsed subtitle cues from the most recent run.
let vobj:TranscribeObj[] = $state([])
// Selected pipeline: 'llm' or 'whisper' (the two option values in the template).
let mode = $state('llm')
/**
 * LLM mode: lets the user pick an audio/video file, sends it to the current
 * model as a multimodal input together with the prompt, streams the answer
 * into `outputText`, then converts the last fenced block of the answer into
 * WebVTT (`outputText`, `vttB64`, `vobj`).
 *
 * On any failure the error is shown with `alertError` and `outputText` is
 * cleared, because the template only renders the Run button while
 * `outputText` is empty.
 */
async function runLLMMode() {
    outputText = 'Loading...\n\n'
    try {
        const file = await selectSingleFile([
            'mp3', 'ogg', 'wav', 'flac',
            'mp4', 'webm', 'mkv', 'avi', 'mov'
        ])
        if(!file){
            outputText = ''
            return
        }

        const videos = [
            'mp4', 'webm', 'mkv', 'avi', 'mov'
        ]
        // Lower-case so that e.g. "CLIP.MP4" is still classified as a video.
        const ext = (file.name.split('.').pop() ?? '').toLowerCase()
        const mediaKind = videos.includes(ext) ? 'video' : 'audio'
        fileB64 = `data:${mediaKind}/${ext};base64,${Buffer.from(file.data).toString('base64')}`
        const media = {
            type: mediaKind,
            base64: fileB64,
        } as const

        // The media length is only a hint for the model, so a failure to read
        // it degrades to "unknown" instead of aborting the run.
        let time = ''
        if(prompt.includes('{{slot::time}}')){
            const video = document.createElement('video')
            try {
                video.src = fileB64
                video.preload = 'metadata'
                video.muted = true
                await video.play()
                const d = video.duration
                if(Number.isFinite(d)){
                    const seconds = String(Math.floor(d % 60)).padStart(2, '0')
                    time = `${Math.floor(d / 60)}:${seconds}`
                }else{
                    time = 'unknown'
                }
            } catch {
                time = 'unknown'
            } finally {
                video.pause()
                video.remove()
            }
        }

        const v = await requestChatData({
            formated: [{
                role: "user",
                content: risuChatParser(prompt).replace(/{{slot}}/g, selLang).replace(/{{slot::time}}/g, time),
                multimodals: [media]
            }],
            bias: {},
            useStreaming: true
        }, 'model')

        if(v.type === 'multiline'){
            alertError(v.result[0][1])
            outputText = ''
            return
        }
        if(v.type !== 'streaming'){
            alertError(v.result)
            outputText = ''
            return
        }

        // Each streamed chunk carries the full text so far under its first key.
        const reader = v.result.getReader()
        while(true){
            const { done, value } = await reader.read()
            if(done){
                break
            }
            const firstKey = Object.keys(value)[0]
            outputText = value[firstKey]
        }

        // The prompt asks for two fenced blocks (transcript, then translation);
        // the last one wins.
        const extracted = outputText.matchAll(/```(web)?(vtt)?\n(.*?)\n```/gs)
        let latest = ''
        for(const match of extracted){
            latest = match[3].trim()
        }
        if(!latest){
            alertError('No subtitle block was found in the model output')
            outputText = ''
            return
        }

        vobj = convertTransToObj(latest)
        outputText = makeWebVtt(vobj)
        vttB64 = `data:text/vtt;base64,${Buffer.from(outputText).toString('base64')}`
    } catch (e: unknown) {
        alertError(e instanceof Error ? e.message : String(e))
        outputText = ''
    }
}
/**
 * Whisper mode: lets the user pick an audio/video file, transcribes it to
 * WebVTT through the OpenAI transcription endpoint (video files are first
 * re-encoded to MP3 in the browser), asks the current model to translate the
 * transcript using the prompt, and stores the result in `outputText`,
 * `vttB64`, `fileB64` and `vobj`.
 *
 * On any failure the error is shown with `alertError` and `outputText` is
 * cleared, because the template only renders the Run button while
 * `outputText` is empty.
 */
async function runWhisperMode() {
    outputText = 'Loading...\n\n'
    try {
        const files = await selectFileByDom([
            'mp3', 'ogg', 'wav', 'flac',
            'mp4', 'webm', 'mkv', 'avi', 'mov'
        ])
        const file = files?.[0]
        if(!file){
            outputText = ''
            return
        }

        const formData = new FormData()
        const videos = [
            'mp4', 'webm', 'mkv', 'avi', 'mov'
        ]
        // Lower-case so that e.g. "CLIP.MP4" is still classified as a video.
        const ext = (file.name.split('.').pop() ?? '').toLowerCase()
        const isVideo = videos.includes(ext)

        if(isVideo){
            // Check that the browser can read a duration from the video.
            const video = document.createElement('video')
            const objectUrl = URL.createObjectURL(file)
            let duration = NaN
            try {
                video.src = objectUrl
                video.preload = 'metadata'
                video.muted = true
                await video.play()
                duration = video.duration
            } finally {
                // Always release the element and the object URL, even on failure.
                video.pause()
                video.remove()
                URL.revokeObjectURL(objectUrl)
            }
            if(isNaN(duration)){
                alertError('This video does not have a duration')
                outputText = ''
                return
            }

            outputText = 'Converting video to audio...\n\n'

            // Decode at the same rate the MP3 encoder is configured for;
            // decodeAudioData resamples to the context's sample rate.
            const sampleRate = 44100
            const audioContext = new AudioContext({ sampleRate })
            let audioBuffer: AudioBuffer
            try {
                audioBuffer = await audioContext.decodeAudioData(await file.arrayBuffer())
            } finally {
                await audioContext.close()
            }

            // Mono sources have no channel 1; reuse the left channel for both sides.
            const left = audioBuffer.getChannelData(0)
            const right = audioBuffer.numberOfChannels > 1 ? audioBuffer.getChannelData(1) : left

            // Clamp to [-1, 1] so out-of-range samples do not wrap around in Int16.
            const toInt16 = (samples: Float32Array) => {
                const out = new Int16Array(samples.length)
                for(let i = 0; i < samples.length; i++){
                    out[i] = Math.max(-1, Math.min(1, samples[i])) * 0x7FFF
                }
                return out
            }
            const leftInt16 = toInt16(left)
            const rightInt16 = right === left ? leftInt16 : toInt16(right)

            const lamejs = await import('@breezystack/lamejs')
            const mp3encoder = new lamejs.Mp3Encoder(2, sampleRate, 128);
            const enc = new AppendableBuffer()
            // Encode in 1152-sample frames; every 100 frames, report progress
            // and yield so the UI can repaint.
            for(let pointer = 0; pointer < leftInt16.length; pointer += 1152){
                enc.append(mp3encoder.encodeBuffer(leftInt16.subarray(pointer, pointer + 1152), rightInt16.subarray(pointer, pointer + 1152)))
                if(pointer % 115200 === 0){
                    outputText = `Converting video to audio... ${(pointer / leftInt16.length * 100).toFixed(2)}%\n`
                    await sleep(1)
                }
            }
            enc.append(mp3encoder.flush())

            const file2 = new File([enc.buffer], 'audio.mp3', {
                type: 'audio/mp3'
            })

            outputText = 'Transcribing audio...\n\n'
            formData.append('file', file2)
        }
        else{
            formData.append('file', file)
        }

        formData.append('model', 'whisper-1')
        formData.append('response_format', 'vtt')

        const d = await fetch('https://api.openai.com/v1/audio/transcriptions', {
            method: 'POST',
            headers: {
                'Authorization': `Bearer ${DBState.db.openAIKey}`
            },
            body: formData
        })
        const transcription = await d.text()
        if(!d.ok){
            // Do not feed an API error body to the translation step.
            alertError(transcription)
            outputText = ''
            return
        }
        outputText = transcription

        const v = await requestChatData({
            formated: [{
                role: "user",
                content: risuChatParser(prompt).replace(/{{slot}}/g, selLang).replace(/{{slot::data}}/g, outputText),
            }],
            bias: {},
            useStreaming: true
        }, 'model')

        if(v.type === 'multiline'){
            alertError(v.result[0][1])
            outputText = ''
            return
        }
        if(v.type !== 'streaming'){
            alertError(v.result)
            outputText = ''
            return
        }

        // Each streamed chunk carries the full text so far under its first key.
        const reader = v.result.getReader()
        while(true){
            const { done, value } = await reader.read()
            if(done){
                break
            }
            const firstKey = Object.keys(value)[0]
            outputText = value[firstKey]
        }

        // Tolerate a missing closing fence so the extraction regex can still match.
        if(!outputText.trim().endsWith('```')){
            outputText = outputText.trim() + '\n```'
        }

        const extracted = outputText.matchAll(/```(web)?(vtt)?\n(.*?)\n```/gs)
        let latest = ''
        for(const match of extracted){
            latest = match[3].trim()
        }
        if(!latest){
            alertError('No subtitle block was found in the model output')
            outputText = ''
            return
        }

        const fileBuffer = await file.arrayBuffer()
        // Label the playback data URL with the file's own MIME type when known.
        const mime = file.type || `${isVideo ? 'video' : 'audio'}/${ext}`

        outputText = latest
        vttB64 = `data:text/vtt;base64,${Buffer.from(outputText).toString('base64')}`
        fileB64 = `data:${mime};base64,${Buffer.from(fileBuffer).toString('base64')}`
        vobj = convertWebVTTtoObj(outputText)
    } catch (e: unknown) {
        alertError(e instanceof Error ? e.message : String(e))
        outputText = ''
    }
}
/** A single subtitle cue as produced by the converters in this file. */
interface TranscribeObj {
    /** Cue start timestamp, kept as text. */
    start: string
    /** Cue end timestamp, kept as text. */
    end: string
    /** Cue text; may span several lines. */
    text: string
}
/**
 * Turns a `mm:ss` / `hh:mm:ss` timestamp into a WebVTT timestamp with
 * zero-padded fields (hours are added only when non-zero). Anything that is
 * not purely numeric is passed through with `.000` appended.
 */
function normalizeCueTime(raw: string): string {
    const parts = raw.trim().split(':').map(p => p.trim())
    const numeric = parts.length >= 2 && parts.length <= 3 && parts.every(p => /^\d+$/.test(p))
    if(!numeric){
        return raw + '.000'
    }
    const total = parts.reduce((acc, p) => acc * 60 + Number(p), 0)
    const pad = (n: number) => String(n).padStart(2, '0')
    const hours = Math.floor(total / 3600)
    const minutes = Math.floor((total % 3600) / 60)
    const seconds = total % 60
    return hours > 0
        ? `${pad(hours)}:${pad(minutes)}:${pad(seconds)}.000`
        : `${pad(minutes)}:${pad(seconds)}.000`
}

/**
 * Parses `[TIME] CONTENT` lines into cues.
 *
 * Each cue ends where the next one starts. The final cue gets a far-future
 * end time, since the input carries no end times.
 *
 * @param r Text with one `[TIME] CONTENT` entry per line; other lines are ignored.
 * @returns The parsed cues, or an empty array when no entry is found.
 */
function convertTransToObj(r:string){
    const obj:TranscribeObj[] = []
    for(const rawLine of r.split('\n')){
        const line = rawLine.trim()
        if(!line.startsWith('[')){
            continue
        }
        // Split on the first "]" only, so brackets inside the text survive.
        const close = line.indexOf(']')
        if(close === -1){
            continue
        }
        const time = normalizeCueTime(line.slice(1, close))
        if(obj.length > 0){
            obj[obj.length - 1].end = time
        }
        obj.push({
            start: time,
            end: '',
            text: line.slice(close + 1).trim()
        })
    }
    // Seconds and minutes above 59 are invalid in WebVTT, so the sentinel
    // uses the hours form.
    if(obj.length > 0){
        obj[obj.length - 1].end = '99:59:59.000'
    }
    return obj
}
/**
 * Parses WebVTT text into cues.
 *
 * Blocks without a `-->` timing line (the `WEBVTT` header, NOTE/STYLE blocks)
 * are skipped. A cue identifier line before the timing line and cue settings
 * after the end time are ignored. CRLF line endings are accepted.
 *
 * @param r WebVTT text.
 * @returns The parsed cues in document order.
 */
function convertWebVTTtoObj(r:string){
    const chunks = r
        .replace(/\r\n?/g, '\n')
        .split(/\n\s*\n/)
        .map(v => v.trim())
        .filter(v => v)
    const obj:TranscribeObj[] = []
    for(const chunk of chunks){
        const lines = chunk.split('\n')
        const timingIndex = lines.findIndex(l => l.includes('-->'))
        if(timingIndex === -1){
            continue
        }
        const timing = lines[timingIndex]
        const arrow = timing.indexOf('-->')
        const start = timing.slice(0, arrow).trim()
        // Anything after the end timestamp is cue settings (e.g. "align:start").
        const end = timing.slice(arrow + 3).trim().split(/\s+/)[0]
        obj.push({
            start: start,
            end: end,
            text: lines.slice(timingIndex + 1).join('\n')
        })
    }
    return obj
}
/**
 * Serialises cues to WebVTT text: the `WEBVTT` header followed by one
 * `start --> end` block per cue, each terminated by a blank line.
 *
 * @param obj Cues to serialise, in output order.
 * @returns The WebVTT document as a string.
 */
function makeWebVtt(obj: TranscribeObj[]){
    const cueBlocks = obj.map(({ start, end, text }) => `${start} --> ${end}\n${text}\n\n`)
    return ['WEBVTT\n\n', ...cueBlocks].join('')
}
/**
 * Converts the WebVTT text currently held in `outputText` into SRT entries.
 *
 * Blocks without a `-->` timing line (header, notes, empty text) are skipped
 * and the remaining cues are numbered from 1. Timestamps get a comma as the
 * decimal separator and a leading `00:` when the hours field is absent.
 *
 * @returns One string per SRT entry; the caller joins them with blank lines.
 */
function webVttToSrt(){
    const toSrtTime = (stamp: string) => {
        // Drop any cue settings that follow the timestamp.
        const t = stamp.trim().split(/\s+/)[0].replace('.', ',')
        return t.split(':').length < 3 ? `00:${t}` : t
    }
    const srt: string[] = []
    for(const chunk of outputText.replace(/\r\n?/g, '\n').split(/\n\s*\n/)){
        const lines = chunk.trim().split('\n')
        const timingIndex = lines.findIndex(l => l.includes('-->'))
        if(timingIndex === -1){
            continue
        }
        const timing = lines[timingIndex]
        const arrow = timing.indexOf('-->')
        const start = toSrtTime(timing.slice(0, arrow))
        const end = toSrtTime(timing.slice(arrow + 3))
        const text = lines.slice(timingIndex + 1).join('\n')
        srt.push(`${srt.length + 1}\n${start} --> ${end}\n${text}`)
    }
    return srt
}
/**
 * Options describing a wave audio stream.
 * NOTE(review): not referenced anywhere in the visible part of this file.
 */
interface WaveOptions {
    /** Presumably whether samples are floating point; confirm against the consumer. */
    isFloat: boolean
    /** Number of audio channels. */
    numChannels: number
    /** Sample rate; presumably in Hz. */
    sampleRate: number
}
</script>
<!-- Title, target language, editable prompt and mode selector -->
<h2 class="text-4xl text-textcolor my-6 font-black relative">{language.subtitles}</h2>

<span class="text-textcolor text-lg mt-4">{language.language}</span>
<TextInput bind:value={selLang} />

<span class="text-textcolor text-lg mt-4">{language.prompt}</span>
<TextAreaInput bind:value={prompt} />

<span class="text-textcolor text-lg mt-4">{language.type}</span>
<!-- Switching the mode restores that mode's default prompt -->
<SelectInput
    bind:value={mode}
    onchange={() => {
        if(mode === 'llm'){
            prompt = LLMModePrompt
        }
        else if(mode === 'whisper'){
            prompt = WhisperModePrompt
        }
    }}
>
    <OptionInput value="llm">LLM</OptionInput>
    <OptionInput value="whisper">Whisper</OptionInput>
</SelectInput>

<!-- Shown unless the model has both the audio-input and video-input flags -->
{#if !modelInfo.flags.includes(LLMFlags.hasAudioInput) || !modelInfo.flags.includes(LLMFlags.hasVideoInput)}
    <span class="text-draculared text-lg mt-4">{language.subtitlesWarning1}</span>
{/if}
<!-- Shown unless the model has the streaming flag and streaming is enabled -->
{#if !modelInfo.flags.includes(LLMFlags.hasStreaming) || !DBState.db.useStreaming}
    <span class="text-draculared text-lg mt-4">{language.subtitlesWarning2}</span>
{/if}
<!-- Run button while idle; otherwise the progress/result text
     (collapsed inside <details> once the player is available) -->
{#if !outputText}
    <Button
        className="mt-4"
        onclick={() => {
            if(mode === 'llm'){
                runLLMMode()
            }
            else if(mode === 'whisper'){
                runWhisperMode()
            }
        }}
    >
        {language.run}
    </Button>
{:else if vttB64 && fileB64}
    <details class="mt-4">
        <pre>{outputText}</pre>
    </details>
{:else}
    <pre>{outputText}</pre>
{/if}
<!-- Result panel: media player with the generated captions, plus reset and download -->
{#if vttB64 && fileB64}
    <div class="mt-4">
        <!-- Re-create the player whenever the caption track changes -->
        {#key vttB64}
            <video controls src={fileB64} class="w-full">
                <track default kind="captions" src={vttB64} srclang="en" />
            </video>
        {/key}
    </div>

    <span class="text-textcolor text-lg mt-4">{language.download}</span>

    <!-- Clears every piece of run state so the Run button is shown again -->
    <Button className="mt-4" onclick={() => {
        outputText = ''
        fileB64 = ''
        vttB64 = ''
        vobj = []
    }}>
        {language.reset}
    </Button>

    <Button className="mt-4" onclick={async () => {
        const sel = parseInt(await alertSelect([
            'WebVTT',
            'SRT'
        ]), 10)
        // WebVTT
        if(sel === 0){
            downloadFile('subtitle.vtt', outputText)
            return
        }
        // SRT
        if(sel === 1){
            downloadFile('subtitle.srt', webVttToSrt().join('\n\n'))
            return
        }
    }}>
        {language.download}
    </Button>
{/if}