diff --git a/app.py b/app.py
new file mode 100644
index 0000000..2d9152b
--- /dev/null
+++ b/app.py
@@ -0,0 +1,779 @@
+"""
+VibeVoice 体验平台 — Liquid Glass 风格
+FastAPI 后端 + 纯 HTML 前端
+"""
+
+import os
+import sys
+import json
+import torch
+import numpy as np
+import tempfile
+import time
+import soundfile as sf
+from pathlib import Path
+from fastapi import FastAPI, UploadFile, File, Form
+from fastapi.responses import FileResponse, HTMLResponse, JSONResponse
+from fastapi.staticfiles import StaticFiles
+import uvicorn
+
+SOURCE_DIR = Path(__file__).parent / "source"
+STATIC_DIR = Path(__file__).parent / "static"
+sys.path.insert(0, str(SOURCE_DIR))  # lets the function-local `vibevoice` imports resolve from ./source
+
+app = FastAPI()
+
+# ========== Global state ==========
+asr_model_cache = {}  # filled by load_asr() with the keys "model" and "processor"
+tts_model_cache = {}  # filled by load_tts() with the keys "model" and "processor"
+
+DEVICE = "mps" if torch.backends.mps.is_available() else "cpu"  # Apple MPS when available, otherwise CPU
+DTYPE = torch.float32
+
+
+def load_asr():
+    """Return the shared ASR cache, loading the model and processor on first use."""
+    if not asr_model_cache:
+        # Imported here rather than at module top, so vibevoice is only needed on first use.
+        from vibevoice.modular.modeling_vibevoice_asr import VibeVoiceASRForConditionalGeneration
+        from vibevoice.processor.vibevoice_asr_processor import VibeVoiceASRProcessor
+
+        checkpoint = "microsoft/VibeVoice-ASR"
+        print(f"Loading ASR model to {DEVICE}...")
+        asr_processor = VibeVoiceASRProcessor.from_pretrained(checkpoint)
+        asr_model = VibeVoiceASRForConditionalGeneration.from_pretrained(
+            checkpoint,
+            torch_dtype=DTYPE,
+            attn_implementation="sdpa",
+            trust_remote_code=True,
+        ).to(DEVICE)
+        asr_model.eval()
+        asr_model_cache["model"] = asr_model
+        asr_model_cache["processor"] = asr_processor
+        print("ASR model loaded")
+    return asr_model_cache
+
+
+def load_tts():
+    """Return the shared TTS cache, loading the model and processor on first use."""
+    if not tts_model_cache:
+        # Imported here rather than at module top, so vibevoice is only needed on first use.
+        from vibevoice.modular.modeling_vibevoice_streaming_inference import (
+            VibeVoiceStreamingForConditionalGenerationInference,
+        )
+        from vibevoice.processor.vibevoice_streaming_processor import VibeVoiceStreamingProcessor
+
+        checkpoint = "microsoft/VibeVoice-Realtime-0.5B"
+        print(f"Loading TTS model to {DEVICE}...")
+        tts_processor = VibeVoiceStreamingProcessor.from_pretrained(checkpoint)
+        tts_model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
+            checkpoint,
+            torch_dtype=DTYPE,
+            attn_implementation="sdpa",
+        ).to(DEVICE)
+        tts_model.eval()
+        tts_model_cache["model"] = tts_model
+        tts_model_cache["processor"] = tts_processor
+        print("TTS model loaded")
+    return tts_model_cache
+
+
+@app.post("/api/asr")
+async def api_asr(audio: UploadFile = File(...), hotwords: str = Form("")):
+    """Transcribe an uploaded audio file; reply with JSON segments, raw text and elapsed seconds."""
+    # Keep the upload's own extension (the UI offers WAV/MP3/FLAC/M4A) instead of always ".wav".
+    suffix = Path(audio.filename or "").suffix
+    if not suffix[1:].isalnum():
+        suffix = ".wav"
+    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
+        tmp.write(await audio.read())
+
+    try:
+        asr = load_asr()
+        model = asr["model"]
+        processor = asr["processor"]
+        context_info = hotwords.strip() or None  # hotwords are optional
+        inputs = processor(
+            audio=tmp.name,
+            sampling_rate=None,
+            return_tensors="pt",
+            add_generation_prompt=True,
+            context_info=context_info
+        )
+        inputs = {k: v.to(DEVICE) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
+        start_time = time.time()
+        with torch.no_grad():
+            output_ids = model.generate(
+                **inputs,
+                max_new_tokens=32768,
+                do_sample=False,
+                pad_token_id=processor.pad_id,
+                eos_token_id=processor.tokenizer.eos_token_id,
+            )
+        elapsed = time.time() - start_time
+        input_length = inputs['input_ids'].shape[1]
+        generated_ids = output_ids[0, input_length:]  # ids past the input length, i.e. the generated part
+        text = processor.decode(generated_ids, skip_special_tokens=True)
+        try:
+            segments = processor.post_process_transcription(text)
+        except Exception:  # best-effort: fall back to a single plain-text segment
+            segments = [{"text": text}]
+        return JSONResponse({"segments": segments, "raw": text, "time": round(elapsed, 1)})
+    except Exception as e:
+        import traceback
+        traceback.print_exc()  # log the failure server-side, as /api/tts does
+        return JSONResponse({"error": str(e)}, status_code=500)
+    finally:
+        os.unlink(tmp.name)
+
+
+@app.post("/api/tts")
+async def api_tts(text: str = Form(...)):
+    """Synthesize `text` with the first voice preset found; reply with WAV audio or a JSON error."""
+    if not text.strip():
+        return JSONResponse({"error": "empty text"}, status_code=400)
+
+    try:
+        tts = load_tts()
+        model = tts["model"]
+        processor = tts["processor"]
+        voices_dir = SOURCE_DIR / "demo" / "voices" / "streaming_model"
+        voice_files = list(voices_dir.rglob("*.pt")) if voices_dir.exists() else []
+        if not voice_files:
+            return JSONResponse({"error": "no voice presets found"}, status_code=500)
+
+        # NOTE(security): weights_only=False unpickles arbitrary objects; presets must be trusted files.
+        prefilled = torch.load(voice_files[0], map_location=DEVICE, weights_only=False)
+        processed = processor.process_input_with_cached_prompt(
+            text=text.strip(),
+            cached_prompt=prefilled,
+            padding=True,
+            return_tensors="pt",
+            return_attention_mask=True,
+        )
+        inputs = {k: v.to(DEVICE) if hasattr(v, "to") else v for k, v in processed.items()}
+
+        from fastapi.responses import Response
+        from vibevoice.modular.streamer import AudioStreamer
+        import copy, io, threading
+        audio_streamer = AudioStreamer(batch_size=1, stop_signal=None, timeout=None)
+        errors = []
+        model.model.noise_scheduler = model.model.noise_scheduler.from_config(
+            model.model.noise_scheduler.config,
+            algorithm_type="sde-dpmsolver++",
+            beta_schedule="squaredcos_cap_v2",
+        )
+        model.set_ddpm_inference_steps(num_steps=5)
+        stop_event = threading.Event()  # never set in this handler; only handed to generate()
+
+        def run_gen():
+            try:
+                model.generate(
+                    **inputs,
+                    max_new_tokens=None,
+                    cfg_scale=1.5,
+                    tokenizer=processor.tokenizer,
+                    generation_config={"do_sample": False, "temperature": 1.0, "top_p": 1.0},
+                    audio_streamer=audio_streamer,
+                    stop_check_fn=stop_event.is_set,
+                    verbose=False,
+                    refresh_negative=True,
+                    all_prefilled_outputs=copy.deepcopy(prefilled),
+                )
+            except Exception as e:
+                errors.append(e)
+                audio_streamer.end()
+
+        thread = threading.Thread(target=run_gen, daemon=True)
+        thread.start()
+        audio_chunks = []
+        for chunk in audio_streamer.get_stream(0):
+            if torch.is_tensor(chunk):
+                chunk = chunk.detach().cpu().to(torch.float32).numpy()
+            else:
+                chunk = np.asarray(chunk, dtype=np.float32)
+            if chunk.ndim > 1:
+                chunk = chunk.reshape(-1)
+            audio_chunks.append(chunk)
+        thread.join()
+        if errors:
+            return JSONResponse({"error": str(errors[0])}, status_code=500)
+        if not audio_chunks:  # np.concatenate([]) would only raise an opaque ValueError
+            return JSONResponse({"error": "no audio generated"}, status_code=500)
+
+        # Encode the WAV in memory so no temp file is left behind on disk after the request.
+        audio = np.clip(np.concatenate(audio_chunks), -1.0, 1.0)
+        wav = io.BytesIO()
+        sf.write(wav, audio, 24000, format="WAV")
+        disposition = {"Content-Disposition": 'attachment; filename="vibevoice_tts.wav"'}
+        return Response(wav.getvalue(), media_type="audio/wav", headers=disposition)
+    except Exception as e:
+        import traceback
+        traceback.print_exc()
+        return JSONResponse({"error": str(e)}, status_code=500)
+
+
+@app.get("/")
+def index():  # serves the single-page UI held in the HTML_PAGE string defined below
+    return HTMLResponse(HTML_PAGE)
+
+
+HTML_PAGE = r"""<!DOCTYPE html>
+<html lang="zh-CN">
+<head>
+  <meta charset="UTF-8">
+  <meta name="viewport" content="width=device-width, initial-scale=1.0">
+  <title>VibeVoice</title>
+  <style>
+    @import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
+
+    * { margin: 0; padding: 0; box-sizing: border-box; }
+
+    body {
+      font-family: 'Inter', -apple-system, sans-serif;
+      min-height: 100vh;
+      background: url("https://images.unsplash.com/photo-1557682250-33bd709cbe85?w=1920&q=80") center/cover fixed;
+      display: flex;
+      flex-direction: column;
+      align-items: center;
+      padding: 2rem;
+      color: #fff;
+    }
+
+    /* 背景动画叠加层 */
+    body::before {
+      content: '';
+      position: fixed;
+      inset: 0;
+      background: linear-gradient(135deg,
+        rgba(99, 102, 241, 0.15),
+        rgba(168, 85, 247, 0.1),
+        rgba(236, 72, 153, 0.1));
+      z-index: 0;
+      animation: shiftGradient 15s ease infinite;
+    }
+    @keyframes shiftGradient {
+      0%, 100% { opacity: 0.6; }
+      50% { opacity: 1; }
+    }
+
+    /* ===== 液态玻璃四层架构 ===== */
+    .liquidGlass-wrapper {
+      position: relative;
+      overflow: hidden;
+      box-shadow: 0 6px 6px rgba(0,0,0,0.2), 0 0 20px rgba(0,0,0,0.1);
+      transition: all 0.4s cubic-bezier(0.175, 0.885, 0.32, 2.2);
+    }
+    .liquidGlass-effect {
+      position: absolute; z-index: 0; inset: 0;
+      backdrop-filter: blur(3px);
+      filter: url(#glass-distortion);
+      overflow: hidden;
+      isolation: isolate;
+    }
+    .liquidGlass-tint {
+      z-index: 1; position: absolute; inset: 0;
+      background: rgba(255, 255, 255, 0.12);
+    }
+    .liquidGlass-shine {
+      position: absolute; inset: 0; z-index: 2; overflow: hidden;
+      box-shadow: inset 2px 2px 1px 0 rgba(255,255,255,0.5),
+                  inset -1px -1px 1px 1px rgba(255,255,255,0.5);
+    }
+    .liquidGlass-content {
+      position: relative; z-index: 3;
+    }
+
+    /* ===== 布局 ===== */
+    .container {
+      position: relative; z-index: 1;
+      max-width: 900px; width: 100%;
+    }
+
+    /* Header */
+    .header {
+      text-align: center;
+      margin-bottom: 2rem;
+    }
+    .header h1 {
+      font-size: 3rem; font-weight: 700;
+      text-shadow: 0 2px 20px rgba(0,0,0,0.3);
+      letter-spacing: -0.02em;
+    }
+    .header p {
+      color: rgba(255,255,255,0.7); margin-top: 0.5rem; font-size: 1rem;
+    }
+    .badges { display: flex; gap: 0.5rem; justify-content: center; margin-top: 1rem; }
+    .badge {
+      padding: 0.25rem 0.75rem; border-radius: 2rem; font-size: 0.75rem;
+      font-weight: 500; backdrop-filter: blur(10px);
+      background: rgba(255,255,255,0.15); border: 1px solid rgba(255,255,255,0.25);
+    }
+
+    /* Tabs */
+    .tabs {
+      display: flex; gap: 0.5rem; margin-bottom: 1.5rem; justify-content: center;
+    }
+    .tab-btn {
+      border-radius: 3rem; padding: 0.7rem 2rem; border: none;
+      font-size: 0.95rem; font-weight: 600; cursor: pointer;
+      color: rgba(255,255,255,0.6); background: transparent;
+      transition: all 0.4s cubic-bezier(0.175, 0.885, 0.32, 2.2);
+    }
+    .tab-btn.active {
+      color: #fff;
+    }
+    .tab-btn:hover { transform: scale(1.05); }
+
+    /* 玻璃卡片 */
+    .glass-card {
+      border-radius: 1.8rem;
+      margin-bottom: 1.5rem;
+    }
+    .glass-card .liquidGlass-effect,
+    .glass-card .liquidGlass-tint,
+    .glass-card .liquidGlass-shine {
+      border-radius: 1.8rem;
+    }
+    .glass-card .liquidGlass-content {
+      padding: 2rem;
+    }
+    .glass-card:hover {
+      box-shadow: 0 8px 12px rgba(0,0,0,0.25), 0 0 30px rgba(0,0,0,0.15);
+    }
+
+    /* 玻璃 Tab 按钮 */
+    .tab-glass {
+      border-radius: 3rem;
+    }
+    .tab-glass .liquidGlass-effect,
+    .tab-glass .liquidGlass-tint,
+    .tab-glass .liquidGlass-shine {
+      border-radius: 3rem;
+    }
+    .tab-glass .liquidGlass-content {
+      padding: 0;
+    }
+
+    /* 表单 */
+    .form-label {
+      font-size: 0.85rem; font-weight: 600; color: rgba(255,255,255,0.8);
+      margin-bottom: 0.5rem; display: block;
+    }
+    .form-group { margin-bottom: 1.2rem; }
+
+    textarea, input[type="text"] {
+      width: 100%; padding: 0.8rem 1rem;
+      background: rgba(0,0,0,0.2); border: 1px solid rgba(255,255,255,0.15);
+      border-radius: 1rem; color: #fff; font-size: 0.9rem;
+      font-family: inherit; resize: vertical;
+      transition: border-color 0.3s, box-shadow 0.3s;
+    }
+    textarea:focus, input[type="text"]:focus {
+      outline: none;
+      border-color: rgba(255,255,255,0.4);
+      box-shadow: 0 0 20px rgba(167,139,250,0.15);
+    }
+    textarea::placeholder, input::placeholder {
+      color: rgba(255,255,255,0.3);
+    }
+
+    /* 上传区域 */
+    .upload-area {
+      border: 2px dashed rgba(255,255,255,0.2);
+      border-radius: 1rem; padding: 2rem; text-align: center;
+      cursor: pointer; transition: all 0.3s ease;
+      background: rgba(0,0,0,0.1);
+    }
+    .upload-area:hover, .upload-area.dragover {
+      border-color: rgba(255,255,255,0.5);
+      background: rgba(255,255,255,0.05);
+    }
+    .upload-area .icon { font-size: 2.5rem; margin-bottom: 0.5rem; }
+    .upload-area .text { color: rgba(255,255,255,0.5); font-size: 0.85rem; }
+    .upload-area .filename {
+      color: rgba(255,255,255,0.9); font-weight: 600; margin-top: 0.5rem;
+    }
+
+    /* 录音按钮 */
+    .record-btn {
+      display: inline-flex; align-items: center; gap: 0.4rem;
+      padding: 0.5rem 1rem; border-radius: 2rem;
+      border: 1px solid rgba(255,255,255,0.2);
+      background: rgba(255,255,255,0.08);
+      color: rgba(255,255,255,0.7); font-size: 0.85rem;
+      cursor: pointer; transition: all 0.3s; margin-top: 0.8rem;
+    }
+    .record-btn:hover { background: rgba(255,255,255,0.15); }
+    .record-btn.recording {
+      border-color: #ef4444; color: #ef4444;
+      animation: pulse 1.5s ease infinite;
+    }
+    @keyframes pulse {
+      0%, 100% { box-shadow: 0 0 0 0 rgba(239,68,68,0.4); }
+      50% { box-shadow: 0 0 0 8px rgba(239,68,68,0); }
+    }
+
+    /* 主按钮 — 液态玻璃 */
+    .btn-primary {
+      border-radius: 3rem; cursor: pointer; border: none;
+      width: 100%;
+    }
+    .btn-primary .liquidGlass-tint {
+      background: rgba(255,255,255,0.2);
+    }
+    .btn-primary .liquidGlass-effect,
+    .btn-primary .liquidGlass-tint,
+    .btn-primary .liquidGlass-shine {
+      border-radius: 3rem;
+    }
+    .btn-primary .liquidGlass-content {
+      padding: 0.9rem 2rem; text-align: center;
+      font-weight: 700; font-size: 1rem; color: #fff;
+    }
+    .btn-primary:hover {
+      transform: scale(1.02);
+    }
+    .btn-primary:active { transform: scale(0.98); }
+    .btn-primary.loading .liquidGlass-content::after {
+      content: ''; display: inline-block; width: 16px; height: 16px;
+      border: 2px solid rgba(255,255,255,0.3);
+      border-top-color: #fff; border-radius: 50%;
+      margin-left: 8px; vertical-align: middle;
+      animation: spin 0.8s linear infinite;
+    }
+    @keyframes spin { to { transform: rotate(360deg); } }
+
+    /* 结果区域 */
+    .result-area {
+      background: rgba(0,0,0,0.25); border-radius: 1rem;
+      padding: 1.2rem; min-height: 100px;
+      font-family: 'SF Mono', 'Fira Code', monospace;
+      font-size: 0.85rem; line-height: 1.7;
+      color: rgba(255,255,255,0.85);
+      white-space: pre-wrap; word-break: break-word;
+      max-height: 400px; overflow-y: auto;
+    }
+    .result-area:empty::after {
+      content: '等待识别...';
+      color: rgba(255,255,255,0.25);
+    }
+
+    /* 音频播放器 */
+    .audio-player {
+      width: 100%; margin-top: 1rem; border-radius: 1rem;
+    }
+
+    .tab-panel { display: none; }
+    .tab-panel.active { display: block; }
+
+    /* 提示文字 */
+    .hint {
+      color: rgba(255,255,255,0.35); font-size: 0.8rem; margin-top: 1rem;
+      line-height: 1.6;
+    }
+
+    .two-col { display: grid; grid-template-columns: 1fr 1fr; gap: 1.5rem; }
+    @media (max-width: 700px) { .two-col { grid-template-columns: 1fr; } }
+
+    /* 底部 */
+    .footer {
+      text-align: center; color: rgba(255,255,255,0.2);
+      font-size: 0.75rem; margin-top: 2rem;
+    }
+  </style>
+</head>
+<body>
+
+  <!-- SVG 折射滤镜 -->
+  <svg style="display:none">
+    <filter id="glass-distortion" x="0%" y="0%" width="100%" height="100%" filterUnits="objectBoundingBox">
+      <feTurbulence type="fractalNoise" baseFrequency="0.01 0.01" numOctaves="1" seed="5" result="turbulence"/>
+      <feComponentTransfer in="turbulence" result="mapped">
+        <feFuncR type="gamma" amplitude="1" exponent="10" offset="0.5"/>
+        <feFuncG type="gamma" amplitude="0" exponent="1" offset="0"/>
+        <feFuncB type="gamma" amplitude="0" exponent="1" offset="0.5"/>
+      </feComponentTransfer>
+      <feGaussianBlur in="turbulence" stdDeviation="3" result="softMap"/>
+      <feSpecularLighting in="softMap" surfaceScale="5" specularConstant="1" specularExponent="100" lighting-color="white" result="specLight">
+        <fePointLight x="-200" y="-200" z="300"/>
+      </feSpecularLighting>
+      <feComposite in="specLight" operator="arithmetic" k1="0" k2="1" k3="1" k4="0" result="litImage"/>
+      <feDisplacementMap in="SourceGraphic" in2="softMap" scale="150" xChannelSelector="R" yChannelSelector="G"/>
+    </filter>
+  </svg>
+
+  <div class="container">
+    <!-- Header -->
+    <div class="header">
+      <h1>VibeVoice</h1>
+      <p>Microsoft 开源语音 AI — 语音识别 & 语音合成</p>
+      <div class="badges">
+        <span class="badge">Microsoft Research</span>
+        <span class="badge">MIT 开源</span>
+        <span class="badge">MPS 本地加速</span>
+      </div>
+    </div>
+
+    <!-- Tabs -->
+    <div class="tabs">
+      <div class="liquidGlass-wrapper tab-glass" onclick="switchTab('asr')" id="tab-asr">
+        <div class="liquidGlass-effect"></div>
+        <div class="liquidGlass-tint"></div>
+        <div class="liquidGlass-shine"></div>
+        <div class="liquidGlass-content">
+          <button class="tab-btn active" data-tab="asr">语音识别 ASR</button>
+        </div>
+      </div>
+      <div class="liquidGlass-wrapper tab-glass" onclick="switchTab('tts')" id="tab-tts">
+        <div class="liquidGlass-effect"></div>
+        <div class="liquidGlass-tint"></div>
+        <div class="liquidGlass-shine"></div>
+        <div class="liquidGlass-content">
+          <button class="tab-btn" data-tab="tts">语音合成 TTS</button>
+        </div>
+      </div>
+    </div>
+
+    <!-- ASR Panel -->
+    <div class="tab-panel active" id="panel-asr">
+      <div class="two-col">
+        <div>
+          <div class="liquidGlass-wrapper glass-card">
+            <div class="liquidGlass-effect"></div>
+            <div class="liquidGlass-tint"></div>
+            <div class="liquidGlass-shine"></div>
+            <div class="liquidGlass-content">
+              <div class="form-group">
+                <label class="form-label">上传音频</label>
+                <div class="upload-area" id="asr-upload" onclick="document.getElementById('asr-file').click()">
+                  <div class="icon">🎵</div>
+                  <div class="text">点击或拖拽上传音频文件<br>WAV / MP3 / FLAC / M4A</div>
+                  <div class="filename" id="asr-filename"></div>
+                </div>
+                <input type="file" id="asr-file" accept="audio/*" style="display:none" onchange="onFileSelect(this)">
+                <button class="record-btn" id="record-btn" onclick="toggleRecord()">
+                  <span>🎙</span> <span id="record-text">录音</span>
+                </button>
+              </div>
+              <div class="form-group">
+                <label class="form-label">热词（可选，提升专业术语识别率）</label>
+                <input type="text" id="asr-hotwords" placeholder="不当得利, 善意取得, 民法典">
+              </div>
+              <div class="liquidGlass-wrapper btn-primary" id="asr-btn" onclick="runASR()">
+                <div class="liquidGlass-effect"></div>
+                <div class="liquidGlass-tint"></div>
+                <div class="liquidGlass-shine"></div>
+                <div class="liquidGlass-content">开始识别</div>
+              </div>
+            </div>
+          </div>
+        </div>
+        <div>
+          <div class="liquidGlass-wrapper glass-card">
+            <div class="liquidGlass-effect"></div>
+            <div class="liquidGlass-tint"></div>
+            <div class="liquidGlass-shine"></div>
+            <div class="liquidGlass-content">
+              <label class="form-label">识别结果</label>
+              <div class="result-area" id="asr-result"></div>
+              <audio id="asr-audio" class="audio-player" controls style="display:none"></audio>
+            </div>
+          </div>
+        </div>
+      </div>
+    </div>
+
+    <!-- TTS Panel -->
+    <div class="tab-panel" id="panel-tts">
+      <div class="two-col">
+        <div>
+          <div class="liquidGlass-wrapper glass-card">
+            <div class="liquidGlass-effect"></div>
+            <div class="liquidGlass-tint"></div>
+            <div class="liquidGlass-shine"></div>
+            <div class="liquidGlass-content">
+              <div class="form-group">
+                <label class="form-label">输入文字</label>
+                <textarea id="tts-text" rows="8" placeholder="输入你想转换为语音的文字..."></textarea>
+              </div>
+              <div class="liquidGlass-wrapper btn-primary" id="tts-btn" onclick="runTTS()">
+                <div class="liquidGlass-effect"></div>
+                <div class="liquidGlass-tint"></div>
+                <div class="liquidGlass-shine"></div>
+                <div class="liquidGlass-content">生成语音</div>
+              </div>
+              <p class="hint">示例：今天我们来讲民法典中关于不当得利的规定。根据民法典第九百八十五条，得利人没有法律根据取得不当利益的，受损失的人可以请求得利人返还取得的利益。</p>
+            </div>
+          </div>
+        </div>
+        <div>
+          <div class="liquidGlass-wrapper glass-card">
+            <div class="liquidGlass-effect"></div>
+            <div class="liquidGlass-tint"></div>
+            <div class="liquidGlass-shine"></div>
+            <div class="liquidGlass-content">
+              <label class="form-label">生成结果</label>
+              <div class="result-area" id="tts-result">等待生成...</div>
+              <audio id="tts-audio" class="audio-player" controls style="display:none"></audio>
+            </div>
+          </div>
+        </div>
+      </div>
+    </div>
+
+    <div class="footer">VibeVoice by Microsoft Research · 本地部署 · 数据不离开你的电脑</div>
+  </div>
+
+  <script>
+    // Tab switching: clear 'active' everywhere, then activate the panel and button for `tab`
+    function switchTab(tab) {
+      document.querySelectorAll('.tab-panel').forEach(p => p.classList.remove('active'));
+      document.querySelectorAll('.tab-btn').forEach(b => b.classList.remove('active'));
+      document.getElementById('panel-' + tab).classList.add('active');
+      document.querySelector(`[data-tab="${tab}"]`).classList.add('active');
+    }
+
+    // File selection
+    let selectedFile = null;
+
+    // Remember `file` as the pending ASR input, show its name and load it into the preview player.
+    function setSelectedFile(file) {
+      selectedFile = file;
+      document.getElementById('asr-filename').textContent = file.name;
+      const audio = document.getElementById('asr-audio');
+      // Release the previous preview's blob URL so repeated selections do not accumulate.
+      if (audio.src.startsWith('blob:')) URL.revokeObjectURL(audio.src);
+      audio.src = URL.createObjectURL(file);
+      audio.style.display = 'block';
+    }
+
+    // Change handler of the hidden file input.
+    function onFileSelect(input) {
+      if (input.files.length > 0) setSelectedFile(input.files[0]);
+    }
+
+    // Drag-and-drop upload
+    const uploadArea = document.getElementById('asr-upload');
+    uploadArea.addEventListener('dragover', e => { e.preventDefault(); uploadArea.classList.add('dragover'); });
+    uploadArea.addEventListener('dragleave', () => uploadArea.classList.remove('dragover'));
+    uploadArea.addEventListener('drop', e => {
+      e.preventDefault();
+      uploadArea.classList.remove('dragover');
+      if (e.dataTransfer.files.length > 0) setSelectedFile(e.dataTransfer.files[0]);
+    });
+
+    // Microphone recording: first click starts, second click stops and stores the clip as the ASR input
+    let mediaRecorder = null;
+    let recordedChunks = [];
+    async function toggleRecord() {
+      const btn = document.getElementById('record-btn');
+      const text = document.getElementById('record-text');
+      if (mediaRecorder && mediaRecorder.state === 'recording') {
+        mediaRecorder.stop();
+        btn.classList.remove('recording');
+        text.textContent = '录音';
+        return;
+      }
+      try {
+        const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+        mediaRecorder = new MediaRecorder(stream);
+        recordedChunks = [];
+        mediaRecorder.ondataavailable = e => recordedChunks.push(e.data);
+        mediaRecorder.onstop = () => {
+          // MediaRecorder typically emits WebM/Ogg/MP4 rather than WAV, so label the clip with its real type.
+          const type = mediaRecorder.mimeType || 'audio/webm';
+          const blob = new Blob(recordedChunks, { type });
+          selectedFile = new File([blob], 'recording.' + type.split(/[\/;]/)[1], { type });
+          document.getElementById('asr-filename').textContent = '录音完成';
+          const audio = document.getElementById('asr-audio');
+          audio.src = URL.createObjectURL(blob);
+          audio.style.display = 'block';
+          stream.getTracks().forEach(t => t.stop());
+        };
+        mediaRecorder.start();
+        btn.classList.add('recording');
+        text.textContent = '停止';
+      } catch (e) {
+        alert('无法访问麦克风: ' + e.message);
+      }
+    }
+
+    // ASR request: upload the selected audio plus hotwords and render the returned segments
+    async function runASR() {
+      if (!selectedFile) { alert('请先上传或录制音频'); return; }
+
+      const btn = document.getElementById('asr-btn');
+      const result = document.getElementById('asr-result');
+      btn.classList.add('loading');
+      result.textContent = '正在加载模型并识别，首次需下载模型（~8GB）...';
+
+      const form = new FormData();
+      form.append('audio', selectedFile);
+      form.append('hotwords', document.getElementById('asr-hotwords').value);
+
+      try {
+        const resp = await fetch('/api/asr', { method: 'POST', body: form });
+        const data = await resp.json();
+
+        if (data.error) {
+          result.textContent = '错误: ' + data.error;
+        } else if (data.segments && data.segments.length > 0) {
+          const orEmpty = v => (v === undefined || v === null ? '' : v);  // keeps 0, unlike `v || ''`
+          const lines = data.segments.map(s => {
+            const start = orEmpty(s.start_time);
+            const end = orEmpty(s.end_time);
+            const speaker = orEmpty(s.speaker_id);
+            const text = s.text || '';
+            return start !== '' ? `[${start} → ${end}] 说话人${speaker}: ${text}` : text;
+          });
+          result.textContent = lines.join('\n') + `\n\n--- 耗时 ${data.time}s ---`;
+        } else {
+          result.textContent = data.raw || '无结果';
+        }
+      } catch (e) {
+        result.textContent = '请求失败: ' + e.message;
+      } finally {
+        btn.classList.remove('loading');
+      }
+    }
+
+    // TTS request: send the text, then load the returned WAV into the player
+    async function runTTS() {
+      const text = document.getElementById('tts-text').value;
+      if (!text.trim()) { alert('请输入文字'); return; }
+
+      const btn = document.getElementById('tts-btn');
+      const result = document.getElementById('tts-result');
+      const audio = document.getElementById('tts-audio');
+      btn.classList.add('loading');
+      result.textContent = '正在加载模型并生成语音，首次需下载模型（~2GB）...';
+      const form = new FormData();
+      form.append('text', text);
+
+      try {
+        const resp = await fetch('/api/tts', { method: 'POST', body: form });
+        if (resp.ok) {
+          const blob = await resp.blob();
+          if (audio.src.startsWith('blob:')) URL.revokeObjectURL(audio.src);  // free the previous clip
+          audio.src = URL.createObjectURL(blob);
+          audio.style.display = 'block';
+          Promise.resolve(audio.play()).catch(() => {});  // autoplay may be blocked; controls still work
+          result.textContent = '生成完成，点击播放';
+        } else {
+          const data = await resp.json();
+          result.textContent = '错误: ' + (data.error || '未知错误');
+        }
+      } catch (e) {
+        result.textContent = '请求失败: ' + e.message;
+      } finally {
+        btn.classList.remove('loading');
+      }
+    }
+  </script>
+</body>
+</html>"""
+
+
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=4410)  # NOTE(review): 0.0.0.0 listens on all interfaces — confirm LAN exposure is intended
diff --git a/index.html b/index.html
index a49defd..c3946b0 100644
--- a/index.html
+++ b/index.html
@@ -14,33 +14,261 @@
     .container { max-width: 1200px; margin: 0 auto; }
     h1 {
       font-size: 2.5rem; font-weight: 700;
-      background: linear-gradient(135deg, #60a5fa, #a78bfa);
+      background: linear-gradient(135deg, #f97316, #ef4444);
       -webkit-background-clip: text; -webkit-text-fill-color: transparent;
       margin-bottom: 0.5rem;
     }
     .subtitle { color: #888; font-size: 1.1rem; margin-bottom: 2rem; }
+    .badge-row { display: flex; gap: 0.5rem; margin-bottom: 2rem; flex-wrap: wrap; }
+    .badge {
+      display: inline-block; padding: 0.3rem 0.8rem; border-radius: 20px;
+      font-size: 0.8rem; font-weight: 600;
+    }
+    .badge-ms { background: #1a3a5c; color: #60a5fa; }
+    .badge-asr { background: #3c2e1a; color: #fbbf24; }
+    .badge-tts { background: #1a3c2a; color: #4ade80; }
+    .badge-mit { background: #2e1a3c; color: #c4b5fd; }
+
     .card {
       background: #141414; border: 1px solid #222; border-radius: 12px;
       padding: 2rem; margin-bottom: 1.5rem;
     }
-    .card h2 { color: #60a5fa; margin-bottom: 1rem; font-size: 1.3rem; }
-    .card p { line-height: 1.8; color: #aaa; }
+    .card h2 { color: #f97316; margin-bottom: 1rem; font-size: 1.3rem; }
+    .card p, .card li { line-height: 1.8; color: #aaa; }
+    .card ul { list-style: none; padding: 0; }
+    .card ul li::before { content: ""; margin-right: 0.5rem; }
+
+    .grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(340px, 1fr)); gap: 1.5rem; }
+
+    table { width: 100%; border-collapse: collapse; margin-top: 0.5rem; }
+    th, td { text-align: left; padding: 0.7rem 1rem; border-bottom: 1px solid #222; }
+    th { color: #f97316; font-weight: 600; font-size: 0.9rem; }
+    td { color: #aaa; font-size: 0.9rem; }
+
+    .highlight { color: #4ade80; font-weight: 600; }
+    .warn { color: #f87171; font-weight: 600; }
+
+    .model-card {
+      background: #1a1a2e; border: 1px solid #2a2a4e; border-radius: 10px;
+      padding: 1.5rem; text-align: center;
+    }
+    .model-card .icon { font-size: 2.5rem; margin-bottom: 0.5rem; }
+    .model-card .name { font-size: 1.1rem; color: #f97316; font-weight: 700; margin-bottom: 0.3rem; }
+    .model-card .size { font-size: 0.8rem; color: #666; margin-bottom: 0.8rem; }
+    .model-card .desc { font-size: 0.85rem; color: #aaa; line-height: 1.6; text-align: left; }
+
+    .three-col { display: grid; grid-template-columns: repeat(auto-fit, minmax(280px, 1fr)); gap: 1.5rem; margin-bottom: 1.5rem; }
+
+    .use-case {
+      background: #141414; border: 1px solid #2a4e2a; border-radius: 12px;
+      padding: 1.5rem;
+    }
+    .use-case h3 { color: #4ade80; font-size: 1rem; margin-bottom: 0.5rem; }
+    .use-case p { color: #888; font-size: 0.9rem; line-height: 1.6; }
+    .use-case .tag { display: inline-block; background: #1a3c2a; color: #4ade80; padding: 0.2rem 0.5rem; border-radius: 4px; font-size: 0.75rem; margin-top: 0.5rem; }
+
+    .code-block {
+      background: #1a1a2e; border: 1px solid #2a2a4e; border-radius: 8px;
+      padding: 1.2rem; margin-top: 1rem; overflow-x: auto;
+      font-family: "SF Mono", "Fira Code", monospace; font-size: 0.85rem;
+      color: #c4b5fd; line-height: 1.6;
+    }
+    .code-comment { color: #555; }
+
+    .links { display: flex; gap: 1rem; margin-top: 1.5rem; flex-wrap: wrap; }
+    .links a {
+      display: inline-flex; align-items: center; gap: 0.4rem;
+      padding: 0.6rem 1.2rem; border-radius: 8px; text-decoration: none;
+      font-size: 0.9rem; font-weight: 600; transition: opacity 0.2s;
+    }
+    .links a:hover { opacity: 0.8; }
+    .link-gh { background: #1a1a2e; color: #c4b5fd; border: 1px solid #2a2a4e; }
+    .link-hf { background: #1a2e1a; color: #4ade80; border: 1px solid #2a4e2a; }
+    .link-doc { background: #2e2a1a; color: #fbbf24; border: 1px solid #4e3a2a; }
+
+    .verdict {
+      background: linear-gradient(135deg, #1a0a00, #141414);
+      border: 1px solid #f9731633; border-radius: 12px;
+      padding: 2rem; margin-top: 1.5rem; text-align: center;
+    }
+    .verdict h2 { color: #f97316; margin-bottom: 0.5rem; }
+    .verdict p { color: #888; max-width: 600px; margin: 0 auto; }
+
+    footer { text-align: center; color: #333; margin-top: 3rem; font-size: 0.8rem; }
   </style>
 </head>
 <body>
   <div class="container">
-    <h1>VibeVoice 语音AI研究</h1>
-    <p class="subtitle">微软开源语音全家桶，ASR+TTS+实时语音，可用于法考字幕提取</p>
+    <h1>VibeVoice — 语音 AI 全家桶</h1>
+    <p class="subtitle">微软开源 | ASR + TTS + 实时语音 | MIT 许可</p>
 
+    <div class="badge-row">
+      <span class="badge badge-ms">Microsoft Research</span>
+      <span class="badge badge-asr">ASR 语音识别</span>
+      <span class="badge badge-tts">TTS 语音合成</span>
+      <span class="badge badge-mit">MIT 开源</span>
+    </div>
+
+    <!-- 三个模型 -->
+    <div class="three-col">
+      <div class="model-card">
+        <div class="icon">🎙</div>
+        <div class="name">VibeVoice-ASR</div>
+        <div class="size">语音识别模型</div>
+        <div class="desc">
+          <ul style="list-style:none; padding:0;">
+            <li>单次处理 60 分钟音频</li>
+            <li>输出：说话人 + 时间戳 + 内容</li>
+            <li>支持 50+ 语言</li>
+            <li>支持自定义热词</li>
+          </ul>
+        </div>
+      </div>
+      <div class="model-card">
+        <div class="icon">🔊</div>
+        <div class="name">VibeVoice-1.5B</div>
+        <div class="size">15 亿参数 · TTS</div>
+        <div class="desc">
+          <ul style="list-style:none; padding:0;">
+            <li>高质量文字转语音</li>
+            <li>自然语调和韵律</li>
+            <li>多语言支持</li>
+            <li>7.5Hz 超低帧率 token</li>
+          </ul>
+        </div>
+      </div>
+      <div class="model-card">
+        <div class="icon">⚡</div>
+        <div class="name">VibeVoice-Realtime-0.5B</div>
+        <div class="size">5 亿参数 · 实时 TTS</div>
+        <div class="desc">
+          <ul style="list-style:none; padding:0;">
+            <li>流式文字输入</li>
+            <li>首音延迟 ~300ms</li>
+            <li>支持长文本朗读</li>
+            <li>适合实时对话场景</li>
+          </ul>
+        </div>
+      </div>
+    </div>
+
+    <div class="grid">
+      <!-- 技术亮点 -->
+      <div class="card">
+        <h2>核心技术</h2>
+        <table>
+          <tr><th>技术</th><th>说明</th></tr>
+          <tr><td>连续语音 Tokenizer</td><td>声学 + 语义双 Tokenizer，7.5Hz 超低帧率</td></tr>
+          <tr><td>长音频处理</td><td>单次 60 分钟，无需分段</td></tr>
+          <tr><td>说话人分离</td><td>自动识别 Who + When + What</td></tr>
+          <tr><td>流式推理</td><td>边输入文字边生成语音，300ms 首音</td></tr>
+          <tr><td>热词支持</td><td>自定义专业术语提升识别率</td></tr>
+        </table>
+      </div>
+
+      <!-- 对比 -->
+      <div class="card">
+        <h2>vs 同类方案</h2>
+        <table>
+          <tr><th>维度</th><th>Whisper</th><th>ElevenLabs</th><th>VibeVoice</th></tr>
+          <tr><td>ASR</td><td class="highlight">有</td><td class="warn">无</td><td class="highlight">有（更强）</td></tr>
+          <tr><td>TTS</td><td class="warn">无</td><td class="highlight">有</td><td class="highlight">有</td></tr>
+          <tr><td>实时流式</td><td class="warn">无</td><td class="highlight">有</td><td class="highlight">有</td></tr>
+          <tr><td>说话人识别</td><td class="warn">无</td><td class="warn">无</td><td class="highlight">内置</td></tr>
+          <tr><td>长音频</td><td>需分段</td><td>N/A</td><td class="highlight">60分钟单次</td></tr>
+          <tr><td>开源</td><td class="highlight">是</td><td class="warn">否</td><td class="highlight">是（MIT）</td></tr>
+          <tr><td>费用</td><td>免费</td><td class="warn">按量付费</td><td>免费</td></tr>
+        </table>
+      </div>
+    </div>
+
+    <!-- 应用场景 -->
+    <h2 style="color: #f97316; margin: 1.5rem 0 1rem;">我们的应用场景</h2>
+    <div class="three-col">
+      <div class="use-case">
+        <h3>法考视频字幕提取</h3>
+        <p>9,553 个法考视频需要提取字幕。VibeVoice-ASR 单次处理 60 分钟 + 自动时间戳 + 说话人识别，配合法律热词（"不当得利""善意取得"等）可显著提升识别率。</p>
+        <span class="tag">高优先级</span>
+      </div>
+      <div class="use-case">
+        <h3>法海法考 App 语音朗读</h3>
+        <p>用 Realtime-0.5B 为题目和解析生成语音朗读，支持边看题边听讲解，提升学习体验。</p>
+        <span class="tag">中优先级</span>
+      </div>
+      <div class="use-case">
+        <h3>百陶会多语言介绍</h3>
+        <p>用 VibeVoice-1.5B 为产品页面生成中英文语音介绍，50+ 语言支持覆盖海外客户。</p>
+        <span class="tag">低优先级</span>
+      </div>
+    </div>
+
+    <!-- 代码示例 -->
     <div class="card">
-      <h2>概述</h2>
-      <p>待补充研究内容...</p>
+      <h2>ASR 使用示例</h2>
+      <div class="code-block">
+<span class="code-comment"># 安装</span>
+pip install transformers torch
+
+<span class="code-comment"># ASR：语音转文字（带时间戳和说话人）</span>
+from transformers import pipeline
+
+asr = pipeline(
+    "automatic-speech-recognition",
+    model="microsoft/VibeVoice-ASR"
+)
+
+result = asr("lecture_60min.wav")
+<span class="code-comment"># 输出：[{speaker: "A", start: 0.0, end: 3.2, text: "..."},  ...]</span>
+      </div>
     </div>
 
     <div class="card">
-      <h2>核心发现</h2>
-      <p>待补充...</p>
+      <h2>TTS 使用示例</h2>
+      <div class="code-block">
+<span class="code-comment"># 实时 TTS：文字转语音</span>
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model = AutoModelForCausalLM.from_pretrained(
+    "microsoft/VibeVoice-Realtime-0.5B"
+)
+
+<span class="code-comment"># 流式生成，首音 ~300ms</span>
+for audio_chunk in model.generate_stream("今天我们来讲民法典..."):
+    play(audio_chunk)
+      </div>
     </div>
+
+    <!-- 硬件 -->
+    <div class="card">
+      <h2>硬件要求与本机适配</h2>
+      <table>
+        <tr><th>模型</th><th>显存需求</th><th>M2 Max 可运行？</th></tr>
+        <tr><td>VibeVoice-ASR</td><td>~8GB</td><td class="highlight">可以（MPS 加速）</td></tr>
+        <tr><td>VibeVoice-1.5B</td><td>~6GB</td><td class="highlight">可以</td></tr>
+        <tr><td>VibeVoice-Realtime-0.5B</td><td>~2GB</td><td class="highlight">可以</td></tr>
+      </table>
+      <p style="margin-top: 1rem; color: #4ade80; font-size: 0.9rem;">
+        本机 M2 Max 64GB 完全满足所有模型运行要求
+      </p>
+    </div>
+
+    <!-- 评价 -->
+    <div class="verdict">
+      <h2>评价：实用性很高</h2>
+      <p>ASR + TTS + 实时语音三合一开源方案，MIT 许可无商用限制。ASR 的 60 分钟长音频 + 说话人识别是真正的差异化优势。本机 M2 Max 可直接运行，不需要 GPU 服务器。对法考字幕提取项目有直接价值。</p>
+    </div>
+
+    <!-- Links: open in a new tab; noopener withholds window.opener, noreferrer omits the Referer header -->
+    <div class="links">
+      <a href="https://github.com/microsoft/VibeVoice" target="_blank" rel="noopener noreferrer" class="link-gh">GitHub 源码</a>
+      <a href="https://huggingface.co/microsoft/VibeVoice-ASR" target="_blank" rel="noopener noreferrer" class="link-hf">ASR 模型</a>
+      <a href="https://huggingface.co/microsoft/VibeVoice-Realtime-0.5B" target="_blank" rel="noopener noreferrer" class="link-hf">Realtime 模型</a>
+      <a href="https://microsoft.github.io/VibeVoice/" target="_blank" rel="noopener noreferrer" class="link-doc">官方文档</a>
+    </div>
+
+    <footer>
+      研究项目 · 立项日期 2026-03-31 · 源码克隆至 ./source/
+    </footer>
   </div>
 </body>
 </html>
diff --git a/source b/source
new file mode 160000
index 0000000..3c97649
--- /dev/null
+++ b/source
@@ -0,0 +1 @@
+Subproject commit 3c976491d467698f13ebe4f096206812b91270b3