# vibevoice/app.py

"""
VibeVoice 体验平台 — Liquid Glass 风格
FastAPI 后端 + 纯 HTML 前端
"""

import os
import sys
import json
import torch
import numpy as np
import tempfile
import time
import soundfile as sf
from pathlib import Path
from fastapi import FastAPI, UploadFile, File, Form
from fastapi.responses import FileResponse, HTMLResponse, JSONResponse
from fastapi.staticfiles import StaticFiles
import uvicorn

# Directory next to this file that is pushed to the front of sys.path below;
# presumably it holds the vendored `vibevoice` package imported by the loaders
# (verify against the repository layout).
SOURCE_DIR = Path(__file__).parent / "source"
# NOTE(review): STATIC_DIR is not referenced anywhere else in the visible code.
STATIC_DIR = Path(__file__).parent / "static"
sys.path.insert(0, str(SOURCE_DIR))

app = FastAPI()

# ========== Global state ==========
# Filled on first use by load_asr() / load_tts() with "model" and "processor".
asr_model_cache = {}
tts_model_cache = {}

# Use the MPS backend when torch reports it as available, otherwise the CPU.
DEVICE = "mps" if torch.backends.mps.is_available() else "cpu"
DTYPE = torch.float32


def load_asr():
    """Return the shared ASR bundle, loading it on the first call.

    The bundle is the module-level ``asr_model_cache`` dict. Once populated it
    maps ``"model"`` to the ASR model (moved to ``DEVICE`` and switched to eval
    mode) and ``"processor"`` to its processor. Later calls return the same
    dict without loading anything again.
    """
    if not asr_model_cache:
        # Imported here rather than at module level, so the vibevoice package
        # is only required once a model is actually requested.
        from vibevoice.modular.modeling_vibevoice_asr import VibeVoiceASRForConditionalGeneration
        from vibevoice.processor.vibevoice_asr_processor import VibeVoiceASRProcessor

        checkpoint = "microsoft/VibeVoice-ASR"

        print(f"Loading ASR model to {DEVICE}...")
        asr_processor = VibeVoiceASRProcessor.from_pretrained(checkpoint)
        asr_model = VibeVoiceASRForConditionalGeneration.from_pretrained(
            checkpoint,
            torch_dtype=DTYPE,
            attn_implementation="sdpa",
            trust_remote_code=True,
        ).to(DEVICE)
        asr_model.eval()

        # Only publish to the cache after both pieces loaded successfully.
        asr_model_cache["model"] = asr_model
        asr_model_cache["processor"] = asr_processor
        print("ASR model loaded")

    return asr_model_cache


def load_tts():
    """Return the shared TTS bundle, loading it on the first call.

    The bundle is the module-level ``tts_model_cache`` dict. Once populated it
    maps ``"model"`` to the streaming TTS model (moved to ``DEVICE`` and
    switched to eval mode) and ``"processor"`` to its processor. Later calls
    return the same dict without loading anything again.
    """
    if not tts_model_cache:
        # Imported here rather than at module level, so the vibevoice package
        # is only required once a model is actually requested.
        from vibevoice.modular.modeling_vibevoice_streaming_inference import (
            VibeVoiceStreamingForConditionalGenerationInference,
        )
        from vibevoice.processor.vibevoice_streaming_processor import VibeVoiceStreamingProcessor

        checkpoint = "microsoft/VibeVoice-Realtime-0.5B"

        print(f"Loading TTS model to {DEVICE}...")
        tts_processor = VibeVoiceStreamingProcessor.from_pretrained(checkpoint)
        tts_model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
            checkpoint,
            torch_dtype=DTYPE,
            attn_implementation="sdpa",
        ).to(DEVICE)
        tts_model.eval()

        # Only publish to the cache after both pieces loaded successfully.
        tts_model_cache["model"] = tts_model
        tts_model_cache["processor"] = tts_processor
        print("TTS model loaded")

    return tts_model_cache


@app.post("/api/asr")
async def api_asr(audio: UploadFile = File(...), hotwords: str = Form("")):
    """Transcribe an uploaded audio file with the ASR model.

    Args:
        audio: Uploaded audio. Its bytes are written to a temporary file whose
            path is handed to the processor.
        hotwords: Optional free-text hint forwarded to the processor as
            ``context_info``; blank input is forwarded as ``None``.

    Returns:
        A JSON response. On success it carries ``segments``, ``raw`` (the
        decoded text) and ``time`` (seconds spent in ``model.generate``,
        rounded to one decimal). An empty upload yields ``{"error": ...}``
        with status 400; any failure while processing yields status 500.
    """
    content = await audio.read()
    if not content:
        return JSONResponse({"error": "empty audio"}, status_code=400)

    tmp_path = None
    try:
        # delete=False because the processor reopens the file by path after the
        # handle is closed; the finally clause below removes it.
        # NOTE(review): the suffix is always ".wav" even for other uploads;
        # presumably the processor sniffs the real format - confirm.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            tmp_path = tmp.name
            tmp.write(content)

        asr = load_asr()
        model = asr["model"]
        processor = asr["processor"]

        context_info = hotwords.strip() or None
        inputs = processor(
            audio=tmp_path,
            sampling_rate=None,
            return_tensors="pt",
            add_generation_prompt=True,
            context_info=context_info,
        )
        # Move tensors to the inference device; leave non-tensor entries as-is.
        inputs = {k: v.to(DEVICE) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}

        start_time = time.time()
        with torch.no_grad():
            output_ids = model.generate(
                **inputs,
                max_new_tokens=32768,
                do_sample=False,
                pad_token_id=processor.pad_id,
                eos_token_id=processor.tokenizer.eos_token_id,
            )
        elapsed = time.time() - start_time

        # Drop the prompt tokens so only the newly generated part is decoded.
        input_length = inputs["input_ids"].shape[1]
        generated_ids = output_ids[0, input_length:]
        text = processor.decode(generated_ids, skip_special_tokens=True)

        try:
            segments = processor.post_process_transcription(text)
        except Exception:
            # Best effort: fall back to one untimed segment with the raw text.
            segments = [{"text": text}]

        return JSONResponse({"segments": segments, "raw": text, "time": round(elapsed, 1)})
    except Exception as e:
        import traceback

        traceback.print_exc()
        return JSONResponse({"error": str(e)}, status_code=500)
    finally:
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except FileNotFoundError:
                pass


@app.post("/api/tts")
async def api_tts(text: str = Form(...)):
    """Synthesize speech for ``text`` and return it as a WAV attachment.

    Generation runs in a worker thread that feeds an ``AudioStreamer``. This
    handler drains the stream, concatenates the chunks, clips them to
    [-1, 1] and encodes them in memory as a WAV file at 24000 Hz.

    NOTE(review): draining the stream is a blocking loop inside an ``async``
    handler, so other requests wait until generation finishes.

    Args:
        text: Text to speak; surrounding whitespace is ignored.

    Returns:
        An ``audio/wav`` response named ``vibevoice_tts.wav`` on success, or a
        JSON ``{"error": ...}`` body with status 400 (blank text) or 500 (no
        voice preset, generation failure, or no audio produced).
    """
    import copy
    import io
    import threading
    import traceback

    from fastapi.responses import Response

    text = text.strip()
    if not text:
        return JSONResponse({"error": "empty text"}, status_code=400)

    try:
        tts = load_tts()
        model = tts["model"]
        processor = tts["processor"]

        voices_dir = SOURCE_DIR / "demo" / "voices" / "streaming_model"
        voice_files = list(voices_dir.rglob("*.pt")) if voices_dir.exists() else []
        if not voice_files:
            return JSONResponse({"error": "no voice presets found"}, status_code=500)

        # SECURITY NOTE: weights_only=False unpickles arbitrary objects. This is
        # acceptable only because the presets are local files shipped with the
        # sources; never point this at user-supplied files.
        # The first preset found is used; rglob order is filesystem-dependent.
        prefilled = torch.load(voice_files[0], map_location=DEVICE, weights_only=False)
        processed = processor.process_input_with_cached_prompt(
            text=text,
            cached_prompt=prefilled,
            padding=True,
            return_tensors="pt",
            return_attention_mask=True,
        )
        inputs = {k: v.to(DEVICE) if hasattr(v, "to") else v for k, v in processed.items()}

        from vibevoice.modular.streamer import AudioStreamer

        audio_streamer = AudioStreamer(batch_size=1, stop_signal=None, timeout=None)
        errors = []

        # Rebuild the scheduler from its own config with the sampling settings
        # used here, then fix the number of diffusion steps.
        model.model.noise_scheduler = model.model.noise_scheduler.from_config(
            model.model.noise_scheduler.config,
            algorithm_type="sde-dpmsolver++",
            beta_schedule="squaredcos_cap_v2",
        )
        model.set_ddpm_inference_steps(num_steps=5)

        stop_event = threading.Event()

        def run_gen():
            try:
                model.generate(
                    **inputs,
                    max_new_tokens=None,
                    cfg_scale=1.5,
                    tokenizer=processor.tokenizer,
                    generation_config={"do_sample": False, "temperature": 1.0, "top_p": 1.0},
                    audio_streamer=audio_streamer,
                    stop_check_fn=stop_event.is_set,
                    verbose=False,
                    refresh_negative=True,
                    # Deep copy so generation cannot mutate the loaded preset.
                    all_prefilled_outputs=copy.deepcopy(prefilled),
                )
            except Exception as e:
                errors.append(e)
                # Close the stream so the consumer loop below terminates.
                audio_streamer.end()

        thread = threading.Thread(target=run_gen, daemon=True)
        thread.start()

        audio_chunks = []
        try:
            for chunk in audio_streamer.get_stream(0):
                if torch.is_tensor(chunk):
                    chunk = chunk.detach().cpu().to(torch.float32).numpy()
                else:
                    chunk = np.asarray(chunk, dtype=np.float32)
                if chunk.ndim > 1:
                    chunk = chunk.reshape(-1)
                audio_chunks.append(chunk)
        except Exception:
            # Ask the worker to stop instead of letting it generate unobserved.
            stop_event.set()
            raise

        thread.join()
        if errors:
            return JSONResponse({"error": str(errors[0])}, status_code=500)
        if not audio_chunks:
            # np.concatenate would fail on an empty list with an opaque message.
            return JSONResponse({"error": "no audio generated"}, status_code=500)

        audio = np.clip(np.concatenate(audio_chunks), -1.0, 1.0)

        # Encode in memory: nothing is left on disk after the response is sent.
        buffer = io.BytesIO()
        sf.write(buffer, audio, 24000, format="WAV")
        return Response(
            content=buffer.getvalue(),
            media_type="audio/wav",
            headers={"Content-Disposition": 'attachment; filename="vibevoice_tts.wav"'},
        )

    except Exception as e:
        traceback.print_exc()
        return JSONResponse({"error": str(e)}, status_code=500)


@app.get("/")
def index():
    """Serve the embedded single-page UI stored in ``HTML_PAGE``."""
    page = HTMLResponse(content=HTML_PAGE)
    return page


HTML_PAGE = r"""<!DOCTYPE html>
<html lang="zh-CN">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>VibeVoice</title>
  <style>
    @import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');

    * { margin: 0; padding: 0; box-sizing: border-box; }

    body {
      font-family: 'Inter', -apple-system, sans-serif;
      min-height: 100vh;
      background: url("https://images.unsplash.com/photo-1557682250-33bd709cbe85?w=1920&q=80") center/cover fixed;
      display: flex;
      flex-direction: column;
      align-items: center;
      padding: 2rem;
      color: #fff;
    }

    /* 背景动画叠加层 */
    body::before {
      content: '';
      position: fixed;
      inset: 0;
      background: linear-gradient(135deg,
        rgba(99, 102, 241, 0.15),
        rgba(168, 85, 247, 0.1),
        rgba(236, 72, 153, 0.1));
      z-index: 0;
      animation: shiftGradient 15s ease infinite;
    }
    @keyframes shiftGradient {
      0%, 100% { opacity: 0.6; }
      50% { opacity: 1; }
    }

    /* ===== 液态玻璃四层架构 ===== */
    .liquidGlass-wrapper {
      position: relative;
      overflow: hidden;
      box-shadow: 0 6px 6px rgba(0,0,0,0.2), 0 0 20px rgba(0,0,0,0.1);
      transition: all 0.4s cubic-bezier(0.175, 0.885, 0.32, 2.2);
    }
    .liquidGlass-effect {
      position: absolute; z-index: 0; inset: 0;
      backdrop-filter: blur(3px);
      filter: url(#glass-distortion);
      overflow: hidden;
      isolation: isolate;
    }
    .liquidGlass-tint {
      z-index: 1; position: absolute; inset: 0;
      background: rgba(255, 255, 255, 0.12);
    }
    .liquidGlass-shine {
      position: absolute; inset: 0; z-index: 2; overflow: hidden;
      box-shadow: inset 2px 2px 1px 0 rgba(255,255,255,0.5),
                  inset -1px -1px 1px 1px rgba(255,255,255,0.5);
    }
    .liquidGlass-content {
      position: relative; z-index: 3;
    }

    /* ===== 布局 ===== */
    .container {
      position: relative; z-index: 1;
      max-width: 900px; width: 100%;
    }

    /* Header */
    .header {
      text-align: center;
      margin-bottom: 2rem;
    }
    .header h1 {
      font-size: 3rem; font-weight: 700;
      text-shadow: 0 2px 20px rgba(0,0,0,0.3);
      letter-spacing: -0.02em;
    }
    .header p {
      color: rgba(255,255,255,0.7); margin-top: 0.5rem; font-size: 1rem;
    }
    .badges { display: flex; gap: 0.5rem; justify-content: center; margin-top: 1rem; }
    .badge {
      padding: 0.25rem 0.75rem; border-radius: 2rem; font-size: 0.75rem;
      font-weight: 500; backdrop-filter: blur(10px);
      background: rgba(255,255,255,0.15); border: 1px solid rgba(255,255,255,0.25);
    }

    /* Tabs */
    .tabs {
      display: flex; gap: 0.5rem; margin-bottom: 1.5rem; justify-content: center;
    }
    .tab-btn {
      border-radius: 3rem; padding: 0.7rem 2rem; border: none;
      font-size: 0.95rem; font-weight: 600; cursor: pointer;
      color: rgba(255,255,255,0.6); background: transparent;
      transition: all 0.4s cubic-bezier(0.175, 0.885, 0.32, 2.2);
    }
    .tab-btn.active {
      color: #fff;
    }
    .tab-btn:hover { transform: scale(1.05); }

    /* 玻璃卡片 */
    .glass-card {
      border-radius: 1.8rem;
      margin-bottom: 1.5rem;
    }
    .glass-card .liquidGlass-effect,
    .glass-card .liquidGlass-tint,
    .glass-card .liquidGlass-shine {
      border-radius: 1.8rem;
    }
    .glass-card .liquidGlass-content {
      padding: 2rem;
    }
    .glass-card:hover {
      box-shadow: 0 8px 12px rgba(0,0,0,0.25), 0 0 30px rgba(0,0,0,0.15);
    }

    /* 玻璃 Tab 按钮 */
    .tab-glass {
      border-radius: 3rem;
    }
    .tab-glass .liquidGlass-effect,
    .tab-glass .liquidGlass-tint,
    .tab-glass .liquidGlass-shine {
      border-radius: 3rem;
    }
    .tab-glass .liquidGlass-content {
      padding: 0;
    }

    /* 表单 */
    .form-label {
      font-size: 0.85rem; font-weight: 600; color: rgba(255,255,255,0.8);
      margin-bottom: 0.5rem; display: block;
    }
    .form-group { margin-bottom: 1.2rem; }

    textarea, input[type="text"] {
      width: 100%; padding: 0.8rem 1rem;
      background: rgba(0,0,0,0.2); border: 1px solid rgba(255,255,255,0.15);
      border-radius: 1rem; color: #fff; font-size: 0.9rem;
      font-family: inherit; resize: vertical;
      transition: border-color 0.3s, box-shadow 0.3s;
    }
    textarea:focus, input[type="text"]:focus {
      outline: none;
      border-color: rgba(255,255,255,0.4);
      box-shadow: 0 0 20px rgba(167,139,250,0.15);
    }
    textarea::placeholder, input::placeholder {
      color: rgba(255,255,255,0.3);
    }

    /* 上传区域 */
    .upload-area {
      border: 2px dashed rgba(255,255,255,0.2);
      border-radius: 1rem; padding: 2rem; text-align: center;
      cursor: pointer; transition: all 0.3s ease;
      background: rgba(0,0,0,0.1);
    }
    .upload-area:hover, .upload-area.dragover {
      border-color: rgba(255,255,255,0.5);
      background: rgba(255,255,255,0.05);
    }
    .upload-area .icon { font-size: 2.5rem; margin-bottom: 0.5rem; }
    .upload-area .text { color: rgba(255,255,255,0.5); font-size: 0.85rem; }
    .upload-area .filename {
      color: rgba(255,255,255,0.9); font-weight: 600; margin-top: 0.5rem;
    }

    /* 录音按钮 */
    .record-btn {
      display: inline-flex; align-items: center; gap: 0.4rem;
      padding: 0.5rem 1rem; border-radius: 2rem;
      border: 1px solid rgba(255,255,255,0.2);
      background: rgba(255,255,255,0.08);
      color: rgba(255,255,255,0.7); font-size: 0.85rem;
      cursor: pointer; transition: all 0.3s; margin-top: 0.8rem;
    }
    .record-btn:hover { background: rgba(255,255,255,0.15); }
    .record-btn.recording {
      border-color: #ef4444; color: #ef4444;
      animation: pulse 1.5s ease infinite;
    }
    @keyframes pulse {
      0%, 100% { box-shadow: 0 0 0 0 rgba(239,68,68,0.4); }
      50% { box-shadow: 0 0 0 8px rgba(239,68,68,0); }
    }

    /* 主按钮 — 液态玻璃 */
    .btn-primary {
      border-radius: 3rem; cursor: pointer; border: none;
      width: 100%;
    }
    .btn-primary .liquidGlass-tint {
      background: rgba(255,255,255,0.2);
    }
    .btn-primary .liquidGlass-effect,
    .btn-primary .liquidGlass-tint,
    .btn-primary .liquidGlass-shine {
      border-radius: 3rem;
    }
    .btn-primary .liquidGlass-content {
      padding: 0.9rem 2rem; text-align: center;
      font-weight: 700; font-size: 1rem; color: #fff;
    }
    .btn-primary:hover {
      transform: scale(1.02);
    }
    .btn-primary:active { transform: scale(0.98); }
    .btn-primary.loading .liquidGlass-content::after {
      content: ''; display: inline-block; width: 16px; height: 16px;
      border: 2px solid rgba(255,255,255,0.3);
      border-top-color: #fff; border-radius: 50%;
      margin-left: 8px; vertical-align: middle;
      animation: spin 0.8s linear infinite;
    }
    @keyframes spin { to { transform: rotate(360deg); } }

    /* 结果区域 */
    .result-area {
      background: rgba(0,0,0,0.25); border-radius: 1rem;
      padding: 1.2rem; min-height: 100px;
      font-family: 'SF Mono', 'Fira Code', monospace;
      font-size: 0.85rem; line-height: 1.7;
      color: rgba(255,255,255,0.85);
      white-space: pre-wrap; word-break: break-word;
      max-height: 400px; overflow-y: auto;
    }
    .result-area:empty::after {
      content: '等待识别...';
      color: rgba(255,255,255,0.25);
    }

    /* 音频播放器 */
    .audio-player {
      width: 100%; margin-top: 1rem; border-radius: 1rem;
    }

    .tab-panel { display: none; }
    .tab-panel.active { display: block; }

    /* 提示文字 */
    .hint {
      color: rgba(255,255,255,0.35); font-size: 0.8rem; margin-top: 1rem;
      line-height: 1.6;
    }

    .two-col { display: grid; grid-template-columns: 1fr 1fr; gap: 1.5rem; }
    @media (max-width: 700px) { .two-col { grid-template-columns: 1fr; } }

    /* 底部 */
    .footer {
      text-align: center; color: rgba(255,255,255,0.2);
      font-size: 0.75rem; margin-top: 2rem;
    }
  </style>
</head>
<body>

  <!-- SVG 折射滤镜 -->
  <svg style="display:none">
    <filter id="glass-distortion" x="0%" y="0%" width="100%" height="100%" filterUnits="objectBoundingBox">
      <feTurbulence type="fractalNoise" baseFrequency="0.01 0.01" numOctaves="1" seed="5" result="turbulence"/>
      <feComponentTransfer in="turbulence" result="mapped">
        <feFuncR type="gamma" amplitude="1" exponent="10" offset="0.5"/>
        <feFuncG type="gamma" amplitude="0" exponent="1" offset="0"/>
        <feFuncB type="gamma" amplitude="0" exponent="1" offset="0.5"/>
      </feComponentTransfer>
      <feGaussianBlur in="turbulence" stdDeviation="3" result="softMap"/>
      <feSpecularLighting in="softMap" surfaceScale="5" specularConstant="1" specularExponent="100" lighting-color="white" result="specLight">
        <fePointLight x="-200" y="-200" z="300"/>
      </feSpecularLighting>
      <feComposite in="specLight" operator="arithmetic" k1="0" k2="1" k3="1" k4="0" result="litImage"/>
      <feDisplacementMap in="SourceGraphic" in2="softMap" scale="150" xChannelSelector="R" yChannelSelector="G"/>
    </filter>
  </svg>

  <div class="container">
    <!-- Header -->
    <div class="header">
      <h1>VibeVoice</h1>
      <p>Microsoft 开源语音 AI — 语音识别 & 语音合成</p>
      <div class="badges">
        <span class="badge">Microsoft Research</span>
        <span class="badge">MIT 开源</span>
        <span class="badge">MPS 本地加速</span>
      </div>
    </div>

    <!-- Tabs -->
    <div class="tabs">
      <div class="liquidGlass-wrapper tab-glass" onclick="switchTab('asr')" id="tab-asr">
        <div class="liquidGlass-effect"></div>
        <div class="liquidGlass-tint"></div>
        <div class="liquidGlass-shine"></div>
        <div class="liquidGlass-content">
          <button class="tab-btn active" data-tab="asr">语音识别 ASR</button>
        </div>
      </div>
      <div class="liquidGlass-wrapper tab-glass" onclick="switchTab('tts')" id="tab-tts">
        <div class="liquidGlass-effect"></div>
        <div class="liquidGlass-tint"></div>
        <div class="liquidGlass-shine"></div>
        <div class="liquidGlass-content">
          <button class="tab-btn" data-tab="tts">语音合成 TTS</button>
        </div>
      </div>
    </div>

    <!-- ASR Panel -->
    <div class="tab-panel active" id="panel-asr">
      <div class="two-col">
        <div>
          <div class="liquidGlass-wrapper glass-card">
            <div class="liquidGlass-effect"></div>
            <div class="liquidGlass-tint"></div>
            <div class="liquidGlass-shine"></div>
            <div class="liquidGlass-content">
              <div class="form-group">
                <label class="form-label">上传音频</label>
                <div class="upload-area" id="asr-upload" onclick="document.getElementById('asr-file').click()">
                  <div class="icon">🎵</div>
                  <div class="text">点击或拖拽上传音频文件<br>WAV / MP3 / FLAC / M4A</div>
                  <div class="filename" id="asr-filename"></div>
                </div>
                <input type="file" id="asr-file" accept="audio/*" style="display:none" onchange="onFileSelect(this)">
                <button class="record-btn" id="record-btn" onclick="toggleRecord()">
                  <span>🎙</span> <span id="record-text">录音</span>
                </button>
              </div>
              <div class="form-group">
                <label class="form-label">热词（可选，提升专业术语识别率）</label>
                <input type="text" id="asr-hotwords" placeholder="不当得利, 善意取得, 民法典">
              </div>
              <div class="liquidGlass-wrapper btn-primary" id="asr-btn" onclick="runASR()">
                <div class="liquidGlass-effect"></div>
                <div class="liquidGlass-tint"></div>
                <div class="liquidGlass-shine"></div>
                <div class="liquidGlass-content">开始识别</div>
              </div>
            </div>
          </div>
        </div>
        <div>
          <div class="liquidGlass-wrapper glass-card">
            <div class="liquidGlass-effect"></div>
            <div class="liquidGlass-tint"></div>
            <div class="liquidGlass-shine"></div>
            <div class="liquidGlass-content">
              <label class="form-label">识别结果</label>
              <div class="result-area" id="asr-result"></div>
              <audio id="asr-audio" class="audio-player" controls style="display:none"></audio>
            </div>
          </div>
        </div>
      </div>
    </div>

    <!-- TTS Panel -->
    <div class="tab-panel" id="panel-tts">
      <div class="two-col">
        <div>
          <div class="liquidGlass-wrapper glass-card">
            <div class="liquidGlass-effect"></div>
            <div class="liquidGlass-tint"></div>
            <div class="liquidGlass-shine"></div>
            <div class="liquidGlass-content">
              <div class="form-group">
                <label class="form-label">输入文字</label>
                <textarea id="tts-text" rows="8" placeholder="输入你想转换为语音的文字..."></textarea>
              </div>
              <div class="liquidGlass-wrapper btn-primary" id="tts-btn" onclick="runTTS()">
                <div class="liquidGlass-effect"></div>
                <div class="liquidGlass-tint"></div>
                <div class="liquidGlass-shine"></div>
                <div class="liquidGlass-content">生成语音</div>
              </div>
              <p class="hint">示例：今天我们来讲民法典中关于不当得利的规定。根据民法典第九百八十五条，得利人没有法律根据取得不当利益的，受损失的人可以请求得利人返还取得的利益。</p>
            </div>
          </div>
        </div>
        <div>
          <div class="liquidGlass-wrapper glass-card">
            <div class="liquidGlass-effect"></div>
            <div class="liquidGlass-tint"></div>
            <div class="liquidGlass-shine"></div>
            <div class="liquidGlass-content">
              <label class="form-label">生成结果</label>
              <div class="result-area" id="tts-result">等待生成...</div>
              <audio id="tts-audio" class="audio-player" controls style="display:none"></audio>
            </div>
          </div>
        </div>
      </div>
    </div>

    <div class="footer">VibeVoice by Microsoft Research · 本地部署 · 数据不离开你的电脑</div>
  </div>

  <script>
    // Tab switching: show the panel for `tab` and mark its button as active.
    function switchTab(tab) {
      // Clear the active state from every panel and every tab button first.
      for (const el of document.querySelectorAll('.tab-panel, .tab-btn')) {
        el.classList.remove('active');
      }
      const panel = document.getElementById(`panel-${tab}`);
      const button = document.querySelector(`[data-tab="${tab}"]`);
      panel.classList.add('active');
      button.classList.add('active');
    }

    // File selection
    let selectedFile = null;

    // Make `file` the pending ASR input: show its name and load it into the preview player.
    function setSelectedFile(file) {
      selectedFile = file;
      document.getElementById('asr-filename').textContent = file.name;
      const audio = document.getElementById('asr-audio');
      const previousSrc = audio.src;
      audio.src = URL.createObjectURL(file);
      audio.style.display = 'block';
      // The replaced preview URL is no longer used; revoke it so the browser can free the data.
      if (previousSrc && previousSrc.startsWith('blob:')) URL.revokeObjectURL(previousSrc);
    }

    // Change handler of the hidden file input.
    function onFileSelect(input) {
      if (input.files.length > 0) setSelectedFile(input.files[0]);
    }

    // Drag-and-drop upload
    const uploadArea = document.getElementById('asr-upload');
    uploadArea.addEventListener('dragover', e => { e.preventDefault(); uploadArea.classList.add('dragover'); });
    uploadArea.addEventListener('dragleave', () => uploadArea.classList.remove('dragover'));
    uploadArea.addEventListener('drop', e => {
      e.preventDefault();
      uploadArea.classList.remove('dragover');
      if (e.dataTransfer.files.length > 0) setSelectedFile(e.dataTransfer.files[0]);
    });

    // Recording
    let mediaRecorder = null;
    let recordedChunks = [];

    // Start a microphone recording, or stop the one in progress. When a recording
    // stops it becomes the pending ASR input and is loaded into the preview player.
    async function toggleRecord() {
      const btn = document.getElementById('record-btn');
      const text = document.getElementById('record-text');

      if (mediaRecorder && mediaRecorder.state === 'recording') {
        mediaRecorder.stop();
        btn.classList.remove('recording');
        text.textContent = '录音';
        return;
      }

      let stream = null;
      try {
        stream = await navigator.mediaDevices.getUserMedia({ audio: true });
        const recorder = new MediaRecorder(stream);
        recordedChunks = [];
        recorder.ondataavailable = e => recordedChunks.push(e.data);
        recorder.onstop = () => {
          // MediaRecorder emits a browser-chosen container (WebM, Ogg, MP4, ...), not WAV,
          // so label the blob with the recorder's real MIME type.
          const mime = recorder.mimeType || 'audio/webm';
          const ext = mime.split(';')[0].split('/')[1] || 'webm';
          const blob = new Blob(recordedChunks, { type: mime });
          selectedFile = new File([blob], 'recording.' + ext, { type: mime });
          document.getElementById('asr-filename').textContent = '录音完成';
          const audio = document.getElementById('asr-audio');
          audio.src = URL.createObjectURL(blob);
          audio.style.display = 'block';
          // Release the microphone once the recording is finished.
          stream.getTracks().forEach(t => t.stop());
        };
        recorder.start();
        mediaRecorder = recorder;
        btn.classList.add('recording');
        text.textContent = '停止';
      } catch (e) {
        // If setup failed after the microphone was granted, do not leave it switched on.
        if (stream) stream.getTracks().forEach(t => t.stop());
        alert('无法访问麦克风: ' + e.message);
      }
    }

    // ASR request: upload the selected audio plus hotwords and render the transcription.
    async function runASR() {
      if (!selectedFile) { alert('请先上传或录制音频'); return; }

      const btn = document.getElementById('asr-btn');
      const result = document.getElementById('asr-result');
      // Ignore clicks while a request is already in flight.
      if (btn.classList.contains('loading')) return;
      btn.classList.add('loading');
      result.textContent = '正在加载模型并识别，首次需下载模型（~8GB）...';

      const form = new FormData();
      form.append('audio', selectedFile);
      form.append('hotwords', document.getElementById('asr-hotwords').value);

      // Only null/undefined count as missing: if the backend returns numeric fields,
      // a start time or speaker id of 0 is a real value and must still be shown.
      const shown = v => (v == null ? '' : v);

      try {
        const resp = await fetch('/api/asr', { method: 'POST', body: form });
        const data = await resp.json();

        if (data.error) {
          result.textContent = '错误: ' + data.error;
        } else if (data.segments && data.segments.length > 0) {
          const lines = data.segments.map(s => {
            const start = shown(s.start_time);
            const end = shown(s.end_time);
            const speaker = shown(s.speaker_id);
            const text = s.text || '';
            if (start !== '') return `[${start} → ${end}] 说话人${speaker}: ${text}`;
            return text;
          });
          result.textContent = lines.join('\n') + `\n\n--- 耗时 ${data.time}s ---`;
        } else {
          result.textContent = data.raw || '无结果';
        }
      } catch (e) {
        result.textContent = '请求失败: ' + e.message;
      } finally {
        btn.classList.remove('loading');
      }
    }

    // TTS request: send the text and load the returned WAV into the result player.
    async function runTTS() {
      const text = document.getElementById('tts-text').value;
      if (!text.trim()) { alert('请输入文字'); return; }

      const btn = document.getElementById('tts-btn');
      const result = document.getElementById('tts-result');
      const audio = document.getElementById('tts-audio');
      // Ignore clicks while a request is already in flight.
      if (btn.classList.contains('loading')) return;
      btn.classList.add('loading');
      result.textContent = '正在加载模型并生成语音，首次需下载模型（~2GB）...';

      const form = new FormData();
      form.append('text', text);

      try {
        const resp = await fetch('/api/tts', { method: 'POST', body: form });
        if (resp.ok) {
          const blob = await resp.blob();
          const previousSrc = audio.src;
          audio.src = URL.createObjectURL(blob);
          audio.style.display = 'block';
          // The previous result's URL is no longer used; revoke it so the browser can free it.
          if (previousSrc && previousSrc.startsWith('blob:')) URL.revokeObjectURL(previousSrc);
          // Autoplay may be refused by the browser; the user can still press play,
          // so swallow the rejection instead of leaving it unhandled.
          const playback = audio.play();
          if (playback && typeof playback.catch === 'function') playback.catch(() => {});
          result.textContent = '生成完成，点击播放';
        } else {
          const data = await resp.json();
          result.textContent = '错误: ' + (data.error || '未知错误');
        }
      } catch (e) {
        result.textContent = '请求失败: ' + e.message;
      } finally {
        btn.classList.remove('loading');
      }
    }
  </script>
</body>
</html>"""


if __name__ == "__main__":
    # Host and port keep their original defaults (0.0.0.0:4410) but can be
    # overridden through the environment, e.g. VIBEVOICE_HOST=127.0.0.1 to keep
    # the unauthenticated endpoints off the local network.
    uvicorn.run(
        app,
        host=os.environ.get("VIBEVOICE_HOST", "0.0.0.0"),
        port=int(os.environ.get("VIBEVOICE_PORT", "4410")),
    )