auto-save 2026-04-01 09:03 (+2, ~1)

This commit is contained in:
2026-04-01 09:04:18 +08:00
parent 6ae622c451
commit eeaeaa1e04
3 changed files with 1017 additions and 9 deletions

779
app.py Normal file
View File

@@ -0,0 +1,779 @@
"""
VibeVoice 体验平台 — Liquid Glass 风格
FastAPI 后端 + 纯 HTML 前端
"""
import os
import sys
import json
import torch
import numpy as np
import tempfile
import time
import soundfile as sf
from pathlib import Path
from fastapi import FastAPI, UploadFile, File, Form
from fastapi.responses import FileResponse, HTMLResponse, JSONResponse
from fastapi.staticfiles import StaticFiles
import uvicorn
# "source" folder next to this file; presumably the VibeVoice checkout that
# provides the `vibevoice` package and the demo voice presets.
SOURCE_DIR = Path(__file__).parent / "source"
# "static" folder next to this file (not referenced elsewhere in the visible code).
STATIC_DIR = Path(__file__).parent / "static"
# Put SOURCE_DIR first on sys.path before the function-level `vibevoice.*`
# imports below are executed.
sys.path.insert(0, str(SOURCE_DIR))
app = FastAPI()
# ========== Global state ==========
# Filled by load_asr() / load_tts() with the keys "model" and "processor".
asr_model_cache = {}
tts_model_cache = {}
# Use the MPS backend when torch reports it as available, otherwise the CPU.
DEVICE = "mps" if torch.backends.mps.is_available() else "cpu"
# Passed as `torch_dtype` when either model is loaded.
DTYPE = torch.float32
def load_asr():
    """Return the shared ASR bundle, loading it on first use.

    The bundle is the module-level ``asr_model_cache`` dict. While it is
    empty, the processor and the model are fetched from
    "microsoft/VibeVoice-ASR", the model is moved to ``DEVICE`` and switched
    to eval mode, and both objects are stored under the "model" and
    "processor" keys.

    Returns:
        dict: ``asr_model_cache`` itself (not a copy).
    """
    if not asr_model_cache:
        # Imported here rather than at module top, so the import only runs on
        # the first load.
        from vibevoice.modular.modeling_vibevoice_asr import VibeVoiceASRForConditionalGeneration
        from vibevoice.processor.vibevoice_asr_processor import VibeVoiceASRProcessor

        repo_id = "microsoft/VibeVoice-ASR"
        print(f"Loading ASR model to {DEVICE}...")
        asr_processor = VibeVoiceASRProcessor.from_pretrained(repo_id)
        asr_model = VibeVoiceASRForConditionalGeneration.from_pretrained(
            repo_id,
            torch_dtype=DTYPE,
            attn_implementation="sdpa",
            trust_remote_code=True,
        ).to(DEVICE)
        asr_model.eval()
        asr_model_cache["model"] = asr_model
        asr_model_cache["processor"] = asr_processor
        print("ASR model loaded")
    return asr_model_cache
def load_tts():
    """Return the shared TTS bundle, loading it on first use.

    The bundle is the module-level ``tts_model_cache`` dict. While it is
    empty, the streaming processor and model are fetched from
    "microsoft/VibeVoice-Realtime-0.5B", the model is moved to ``DEVICE`` and
    switched to eval mode, and both objects are stored under the "model" and
    "processor" keys.

    Returns:
        dict: ``tts_model_cache`` itself (not a copy).
    """
    if not tts_model_cache:
        # Imported here rather than at module top, so the import only runs on
        # the first load.
        from vibevoice.modular.modeling_vibevoice_streaming_inference import (
            VibeVoiceStreamingForConditionalGenerationInference,
        )
        from vibevoice.processor.vibevoice_streaming_processor import VibeVoiceStreamingProcessor

        repo_id = "microsoft/VibeVoice-Realtime-0.5B"
        print(f"Loading TTS model to {DEVICE}...")
        tts_processor = VibeVoiceStreamingProcessor.from_pretrained(repo_id)
        tts_model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
            repo_id,
            torch_dtype=DTYPE,
            attn_implementation="sdpa",
        ).to(DEVICE)
        tts_model.eval()
        tts_model_cache["model"] = tts_model
        tts_model_cache["processor"] = tts_processor
        print("TTS model loaded")
    return tts_model_cache
@app.post("/api/asr")
async def api_asr(audio: UploadFile = File(...), hotwords: str = Form("")):
    """Transcribe an uploaded audio file with the cached ASR model.

    The upload is written to a temporary ``.wav``-suffixed file whose path is
    handed to the processor; the file is removed again before returning,
    whether or not transcription succeeded.

    Args:
        audio: The uploaded audio file.
        hotwords: Optional text forwarded to the processor as ``context_info``;
            blank or whitespace-only input is sent as ``None``.

    Returns:
        JSONResponse: ``{"segments", "raw", "time"}`` on success, where "time"
        is the generation wall time in seconds rounded to one decimal, or
        ``{"error": ...}`` with status 500 on any failure.
    """
    # NOTE(review): model.generate() runs synchronously inside this async
    # handler, so the server cannot answer other requests while it is running.
    tmp_path = None
    try:
        # Create and fill the temp file inside the try block so that a failed
        # read/write cannot leave the file behind.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            tmp_path = tmp.name
            tmp.write(await audio.read())

        asr = load_asr()
        model = asr["model"]
        processor = asr["processor"]
        context_info = hotwords.strip() or None
        inputs = processor(
            audio=tmp_path,
            sampling_rate=None,
            return_tensors="pt",
            add_generation_prompt=True,
            context_info=context_info,
        )
        inputs = {
            k: v.to(DEVICE) if isinstance(v, torch.Tensor) else v
            for k, v in inputs.items()
        }
        start_time = time.time()
        with torch.no_grad():
            output_ids = model.generate(
                **inputs,
                max_new_tokens=32768,
                do_sample=False,
                pad_token_id=processor.pad_id,
                eos_token_id=processor.tokenizer.eos_token_id,
            )
        elapsed = time.time() - start_time
        # Keep only the tokens produced after the prompt.
        input_length = inputs["input_ids"].shape[1]
        generated_ids = output_ids[0, input_length:]
        text = processor.decode(generated_ids, skip_special_tokens=True)
        try:
            segments = processor.post_process_transcription(text)
        except Exception:
            # Best effort: fall back to a single segment holding the raw text.
            segments = [{"text": text}]
        return JSONResponse({"segments": segments, "raw": text, "time": round(elapsed, 1)})
    except Exception as e:
        # Log the traceback on the server, as /api/tts does.
        import traceback

        traceback.print_exc()
        return JSONResponse({"error": str(e)}, status_code=500)
    finally:
        if tmp_path is not None:
            os.unlink(tmp_path)
def _flatten_audio_chunk(chunk):
    """Convert one streamed chunk (tensor or array-like) to a 1-D float32 numpy array."""
    if torch.is_tensor(chunk):
        chunk = chunk.detach().cpu().to(torch.float32).numpy()
    else:
        chunk = np.asarray(chunk, dtype=np.float32)
    return chunk.reshape(-1) if chunk.ndim > 1 else chunk


@app.post("/api/tts")
async def api_tts(text: str = Form(...)):
    """Synthesize speech for ``text`` and return it as a WAV download.

    Generation runs in a worker thread that feeds an ``AudioStreamer``; this
    handler drains the stream, concatenates the chunks, clips them to
    [-1, 1] and encodes them as a 24000 Hz WAV file.

    The WAV is encoded in memory instead of in a temporary file, so nothing
    is left on disk after the response is sent.

    Args:
        text: Text to speak; surrounding whitespace is stripped.

    Returns:
        Response: ``audio/wav`` body offered as "vibevoice_tts.wav" on
        success; ``JSONResponse`` ``{"error": ...}`` with status 400 for
        blank text, or 500 when no voice preset exists, generation fails or
        no audio was produced.
    """
    clean_text = text.strip()
    if not clean_text:
        return JSONResponse({"error": "empty text"}, status_code=400)
    try:
        import copy
        import io
        import threading

        from fastapi.responses import Response
        from vibevoice.modular.streamer import AudioStreamer

        tts = load_tts()
        model = tts["model"]
        processor = tts["processor"]

        voices_dir = SOURCE_DIR / "demo" / "voices" / "streaming_model"
        # Sorted so the chosen preset does not depend on filesystem listing order.
        voice_files = sorted(voices_dir.rglob("*.pt")) if voices_dir.exists() else []
        if not voice_files:
            return JSONResponse({"error": "no voice presets found"}, status_code=500)
        # NOTE(review): weights_only=False lets torch.load unpickle arbitrary
        # objects; only acceptable while the preset files are trusted.
        prefilled = torch.load(voice_files[0], map_location=DEVICE, weights_only=False)

        processed = processor.process_input_with_cached_prompt(
            text=clean_text,
            cached_prompt=prefilled,
            padding=True,
            return_tensors="pt",
            return_attention_mask=True,
        )
        inputs = {k: v.to(DEVICE) if hasattr(v, "to") else v for k, v in processed.items()}

        audio_streamer = AudioStreamer(batch_size=1, stop_signal=None, timeout=None)
        errors = []
        model.model.noise_scheduler = model.model.noise_scheduler.from_config(
            model.model.noise_scheduler.config,
            algorithm_type="sde-dpmsolver++",
            beta_schedule="squaredcos_cap_v2",
        )
        model.set_ddpm_inference_steps(num_steps=5)
        stop_event = threading.Event()

        def run_gen():
            try:
                model.generate(
                    **inputs,
                    max_new_tokens=None,
                    cfg_scale=1.5,
                    tokenizer=processor.tokenizer,
                    generation_config={"do_sample": False, "temperature": 1.0, "top_p": 1.0},
                    audio_streamer=audio_streamer,
                    stop_check_fn=stop_event.is_set,
                    verbose=False,
                    refresh_negative=True,
                    all_prefilled_outputs=copy.deepcopy(prefilled),
                )
            except Exception as e:
                errors.append(e)
            finally:
                # Always close the stream so the consumer loop below cannot
                # wait forever, whatever happened inside generate().
                audio_streamer.end()

        thread = threading.Thread(target=run_gen, daemon=True)
        thread.start()

        audio_chunks = []
        try:
            for chunk in audio_streamer.get_stream(0):
                audio_chunks.append(_flatten_audio_chunk(chunk))
        finally:
            # If draining failed, ask the generator to stop before joining;
            # after a normal end of stream this is a no-op.
            stop_event.set()
            thread.join()

        if errors:
            return JSONResponse({"error": str(errors[0])}, status_code=500)
        if not audio_chunks:
            # np.concatenate([]) would raise an unhelpful ValueError.
            return JSONResponse({"error": "no audio generated"}, status_code=500)

        audio = np.clip(np.concatenate(audio_chunks), -1.0, 1.0)
        wav_buffer = io.BytesIO()
        sf.write(wav_buffer, audio, 24000, format="WAV")
        return Response(
            content=wav_buffer.getvalue(),
            media_type="audio/wav",
            headers={"Content-Disposition": 'attachment; filename="vibevoice_tts.wav"'},
        )
    except Exception as e:
        import traceback

        traceback.print_exc()
        return JSONResponse({"error": str(e)}, status_code=500)
@app.get("/")
def index():
    """Serve the single-page UI stored in ``HTML_PAGE``."""
    page = HTMLResponse(content=HTML_PAGE)
    return page
HTML_PAGE = r"""<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>VibeVoice</title>
<style>
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
* { margin: 0; padding: 0; box-sizing: border-box; }
body {
font-family: 'Inter', -apple-system, sans-serif;
min-height: 100vh;
background: url("https://images.unsplash.com/photo-1557682250-33bd709cbe85?w=1920&q=80") center/cover fixed;
display: flex;
flex-direction: column;
align-items: center;
padding: 2rem;
color: #fff;
}
/* 背景动画叠加层 */
body::before {
content: '';
position: fixed;
inset: 0;
background: linear-gradient(135deg,
rgba(99, 102, 241, 0.15),
rgba(168, 85, 247, 0.1),
rgba(236, 72, 153, 0.1));
z-index: 0;
animation: shiftGradient 15s ease infinite;
}
@keyframes shiftGradient {
0%, 100% { opacity: 0.6; }
50% { opacity: 1; }
}
/* ===== 液态玻璃四层架构 ===== */
.liquidGlass-wrapper {
position: relative;
overflow: hidden;
box-shadow: 0 6px 6px rgba(0,0,0,0.2), 0 0 20px rgba(0,0,0,0.1);
transition: all 0.4s cubic-bezier(0.175, 0.885, 0.32, 2.2);
}
.liquidGlass-effect {
position: absolute; z-index: 0; inset: 0;
backdrop-filter: blur(3px);
filter: url(#glass-distortion);
overflow: hidden;
isolation: isolate;
}
.liquidGlass-tint {
z-index: 1; position: absolute; inset: 0;
background: rgba(255, 255, 255, 0.12);
}
.liquidGlass-shine {
position: absolute; inset: 0; z-index: 2; overflow: hidden;
box-shadow: inset 2px 2px 1px 0 rgba(255,255,255,0.5),
inset -1px -1px 1px 1px rgba(255,255,255,0.5);
}
.liquidGlass-content {
position: relative; z-index: 3;
}
/* ===== 布局 ===== */
.container {
position: relative; z-index: 1;
max-width: 900px; width: 100%;
}
/* Header */
.header {
text-align: center;
margin-bottom: 2rem;
}
.header h1 {
font-size: 3rem; font-weight: 700;
text-shadow: 0 2px 20px rgba(0,0,0,0.3);
letter-spacing: -0.02em;
}
.header p {
color: rgba(255,255,255,0.7); margin-top: 0.5rem; font-size: 1rem;
}
.badges { display: flex; gap: 0.5rem; justify-content: center; margin-top: 1rem; }
.badge {
padding: 0.25rem 0.75rem; border-radius: 2rem; font-size: 0.75rem;
font-weight: 500; backdrop-filter: blur(10px);
background: rgba(255,255,255,0.15); border: 1px solid rgba(255,255,255,0.25);
}
/* Tabs */
.tabs {
display: flex; gap: 0.5rem; margin-bottom: 1.5rem; justify-content: center;
}
.tab-btn {
border-radius: 3rem; padding: 0.7rem 2rem; border: none;
font-size: 0.95rem; font-weight: 600; cursor: pointer;
color: rgba(255,255,255,0.6); background: transparent;
transition: all 0.4s cubic-bezier(0.175, 0.885, 0.32, 2.2);
}
.tab-btn.active {
color: #fff;
}
.tab-btn:hover { transform: scale(1.05); }
/* 玻璃卡片 */
.glass-card {
border-radius: 1.8rem;
margin-bottom: 1.5rem;
}
.glass-card .liquidGlass-effect,
.glass-card .liquidGlass-tint,
.glass-card .liquidGlass-shine {
border-radius: 1.8rem;
}
.glass-card .liquidGlass-content {
padding: 2rem;
}
.glass-card:hover {
box-shadow: 0 8px 12px rgba(0,0,0,0.25), 0 0 30px rgba(0,0,0,0.15);
}
/* 玻璃 Tab 按钮 */
.tab-glass {
border-radius: 3rem;
}
.tab-glass .liquidGlass-effect,
.tab-glass .liquidGlass-tint,
.tab-glass .liquidGlass-shine {
border-radius: 3rem;
}
.tab-glass .liquidGlass-content {
padding: 0;
}
/* 表单 */
.form-label {
font-size: 0.85rem; font-weight: 600; color: rgba(255,255,255,0.8);
margin-bottom: 0.5rem; display: block;
}
.form-group { margin-bottom: 1.2rem; }
textarea, input[type="text"] {
width: 100%; padding: 0.8rem 1rem;
background: rgba(0,0,0,0.2); border: 1px solid rgba(255,255,255,0.15);
border-radius: 1rem; color: #fff; font-size: 0.9rem;
font-family: inherit; resize: vertical;
transition: border-color 0.3s, box-shadow 0.3s;
}
textarea:focus, input[type="text"]:focus {
outline: none;
border-color: rgba(255,255,255,0.4);
box-shadow: 0 0 20px rgba(167,139,250,0.15);
}
textarea::placeholder, input::placeholder {
color: rgba(255,255,255,0.3);
}
/* 上传区域 */
.upload-area {
border: 2px dashed rgba(255,255,255,0.2);
border-radius: 1rem; padding: 2rem; text-align: center;
cursor: pointer; transition: all 0.3s ease;
background: rgba(0,0,0,0.1);
}
.upload-area:hover, .upload-area.dragover {
border-color: rgba(255,255,255,0.5);
background: rgba(255,255,255,0.05);
}
.upload-area .icon { font-size: 2.5rem; margin-bottom: 0.5rem; }
.upload-area .text { color: rgba(255,255,255,0.5); font-size: 0.85rem; }
.upload-area .filename {
color: rgba(255,255,255,0.9); font-weight: 600; margin-top: 0.5rem;
}
/* 录音按钮 */
.record-btn {
display: inline-flex; align-items: center; gap: 0.4rem;
padding: 0.5rem 1rem; border-radius: 2rem;
border: 1px solid rgba(255,255,255,0.2);
background: rgba(255,255,255,0.08);
color: rgba(255,255,255,0.7); font-size: 0.85rem;
cursor: pointer; transition: all 0.3s; margin-top: 0.8rem;
}
.record-btn:hover { background: rgba(255,255,255,0.15); }
.record-btn.recording {
border-color: #ef4444; color: #ef4444;
animation: pulse 1.5s ease infinite;
}
@keyframes pulse {
0%, 100% { box-shadow: 0 0 0 0 rgba(239,68,68,0.4); }
50% { box-shadow: 0 0 0 8px rgba(239,68,68,0); }
}
/* 主按钮 — 液态玻璃 */
.btn-primary {
border-radius: 3rem; cursor: pointer; border: none;
width: 100%;
}
.btn-primary .liquidGlass-tint {
background: rgba(255,255,255,0.2);
}
.btn-primary .liquidGlass-effect,
.btn-primary .liquidGlass-tint,
.btn-primary .liquidGlass-shine {
border-radius: 3rem;
}
.btn-primary .liquidGlass-content {
padding: 0.9rem 2rem; text-align: center;
font-weight: 700; font-size: 1rem; color: #fff;
}
.btn-primary:hover {
transform: scale(1.02);
}
.btn-primary:active { transform: scale(0.98); }
.btn-primary.loading .liquidGlass-content::after {
content: ''; display: inline-block; width: 16px; height: 16px;
border: 2px solid rgba(255,255,255,0.3);
border-top-color: #fff; border-radius: 50%;
margin-left: 8px; vertical-align: middle;
animation: spin 0.8s linear infinite;
}
@keyframes spin { to { transform: rotate(360deg); } }
/* 结果区域 */
.result-area {
background: rgba(0,0,0,0.25); border-radius: 1rem;
padding: 1.2rem; min-height: 100px;
font-family: 'SF Mono', 'Fira Code', monospace;
font-size: 0.85rem; line-height: 1.7;
color: rgba(255,255,255,0.85);
white-space: pre-wrap; word-break: break-word;
max-height: 400px; overflow-y: auto;
}
.result-area:empty::after {
content: '等待识别...';
color: rgba(255,255,255,0.25);
}
/* 音频播放器 */
.audio-player {
width: 100%; margin-top: 1rem; border-radius: 1rem;
}
.tab-panel { display: none; }
.tab-panel.active { display: block; }
/* 提示文字 */
.hint {
color: rgba(255,255,255,0.35); font-size: 0.8rem; margin-top: 1rem;
line-height: 1.6;
}
.two-col { display: grid; grid-template-columns: 1fr 1fr; gap: 1.5rem; }
@media (max-width: 700px) { .two-col { grid-template-columns: 1fr; } }
/* 底部 */
.footer {
text-align: center; color: rgba(255,255,255,0.2);
font-size: 0.75rem; margin-top: 2rem;
}
</style>
</head>
<body>
<!-- SVG 折射滤镜 -->
<svg style="display:none">
<filter id="glass-distortion" x="0%" y="0%" width="100%" height="100%" filterUnits="objectBoundingBox">
<feTurbulence type="fractalNoise" baseFrequency="0.01 0.01" numOctaves="1" seed="5" result="turbulence"/>
<feComponentTransfer in="turbulence" result="mapped">
<feFuncR type="gamma" amplitude="1" exponent="10" offset="0.5"/>
<feFuncG type="gamma" amplitude="0" exponent="1" offset="0"/>
<feFuncB type="gamma" amplitude="0" exponent="1" offset="0.5"/>
</feComponentTransfer>
<feGaussianBlur in="turbulence" stdDeviation="3" result="softMap"/>
<feSpecularLighting in="softMap" surfaceScale="5" specularConstant="1" specularExponent="100" lighting-color="white" result="specLight">
<fePointLight x="-200" y="-200" z="300"/>
</feSpecularLighting>
<feComposite in="specLight" operator="arithmetic" k1="0" k2="1" k3="1" k4="0" result="litImage"/>
<feDisplacementMap in="SourceGraphic" in2="softMap" scale="150" xChannelSelector="R" yChannelSelector="G"/>
</filter>
</svg>
<div class="container">
<!-- Header -->
<div class="header">
<h1>VibeVoice</h1>
<p>Microsoft 开源语音 AI — 语音识别 & 语音合成</p>
<div class="badges">
<span class="badge">Microsoft Research</span>
<span class="badge">MIT 开源</span>
<span class="badge">MPS 本地加速</span>
</div>
</div>
<!-- Tabs -->
<div class="tabs">
<div class="liquidGlass-wrapper tab-glass" onclick="switchTab('asr')" id="tab-asr">
<div class="liquidGlass-effect"></div>
<div class="liquidGlass-tint"></div>
<div class="liquidGlass-shine"></div>
<div class="liquidGlass-content">
<button class="tab-btn active" data-tab="asr">语音识别 ASR</button>
</div>
</div>
<div class="liquidGlass-wrapper tab-glass" onclick="switchTab('tts')" id="tab-tts">
<div class="liquidGlass-effect"></div>
<div class="liquidGlass-tint"></div>
<div class="liquidGlass-shine"></div>
<div class="liquidGlass-content">
<button class="tab-btn" data-tab="tts">语音合成 TTS</button>
</div>
</div>
</div>
<!-- ASR Panel -->
<div class="tab-panel active" id="panel-asr">
<div class="two-col">
<div>
<div class="liquidGlass-wrapper glass-card">
<div class="liquidGlass-effect"></div>
<div class="liquidGlass-tint"></div>
<div class="liquidGlass-shine"></div>
<div class="liquidGlass-content">
<div class="form-group">
<label class="form-label">上传音频</label>
<div class="upload-area" id="asr-upload" onclick="document.getElementById('asr-file').click()">
<div class="icon">🎵</div>
<div class="text">点击或拖拽上传音频文件<br>WAV / MP3 / FLAC / M4A</div>
<div class="filename" id="asr-filename"></div>
</div>
<input type="file" id="asr-file" accept="audio/*" style="display:none" onchange="onFileSelect(this)">
<button class="record-btn" id="record-btn" onclick="toggleRecord()">
<span>🎙</span> <span id="record-text">录音</span>
</button>
</div>
<div class="form-group">
<label class="form-label">热词(可选,提升专业术语识别率)</label>
<input type="text" id="asr-hotwords" placeholder="不当得利, 善意取得, 民法典">
</div>
<div class="liquidGlass-wrapper btn-primary" id="asr-btn" onclick="runASR()">
<div class="liquidGlass-effect"></div>
<div class="liquidGlass-tint"></div>
<div class="liquidGlass-shine"></div>
<div class="liquidGlass-content">开始识别</div>
</div>
</div>
</div>
</div>
<div>
<div class="liquidGlass-wrapper glass-card">
<div class="liquidGlass-effect"></div>
<div class="liquidGlass-tint"></div>
<div class="liquidGlass-shine"></div>
<div class="liquidGlass-content">
<label class="form-label">识别结果</label>
<div class="result-area" id="asr-result"></div>
<audio id="asr-audio" class="audio-player" controls style="display:none"></audio>
</div>
</div>
</div>
</div>
</div>
<!-- TTS Panel -->
<div class="tab-panel" id="panel-tts">
<div class="two-col">
<div>
<div class="liquidGlass-wrapper glass-card">
<div class="liquidGlass-effect"></div>
<div class="liquidGlass-tint"></div>
<div class="liquidGlass-shine"></div>
<div class="liquidGlass-content">
<div class="form-group">
<label class="form-label">输入文字</label>
<textarea id="tts-text" rows="8" placeholder="输入你想转换为语音的文字..."></textarea>
</div>
<div class="liquidGlass-wrapper btn-primary" id="tts-btn" onclick="runTTS()">
<div class="liquidGlass-effect"></div>
<div class="liquidGlass-tint"></div>
<div class="liquidGlass-shine"></div>
<div class="liquidGlass-content">生成语音</div>
</div>
<p class="hint">示例:今天我们来讲民法典中关于不当得利的规定。根据民法典第九百八十五条,得利人没有法律根据取得不当利益的,受损失的人可以请求得利人返还取得的利益。</p>
</div>
</div>
</div>
<div>
<div class="liquidGlass-wrapper glass-card">
<div class="liquidGlass-effect"></div>
<div class="liquidGlass-tint"></div>
<div class="liquidGlass-shine"></div>
<div class="liquidGlass-content">
<label class="form-label">生成结果</label>
<div class="result-area" id="tts-result">等待生成...</div>
<audio id="tts-audio" class="audio-player" controls style="display:none"></audio>
</div>
</div>
</div>
</div>
</div>
<div class="footer">VibeVoice by Microsoft Research · 本地部署 · 数据不离开你的电脑</div>
</div>
<script>
// Tab switching: show the panel for `tab` and highlight its button.
function switchTab(tab) {
  for (const panel of document.querySelectorAll('.tab-panel')) {
    panel.classList.remove('active');
  }
  for (const button of document.querySelectorAll('.tab-btn')) {
    button.classList.remove('active');
  }
  const targetPanel = document.getElementById('panel-' + tab);
  targetPanel.classList.add('active');
  const targetButton = document.querySelector(`[data-tab="${tab}"]`);
  targetButton.classList.add('active');
}
// File selection (file picker and drag & drop share one code path).
let selectedFile = null;
// Object URL currently created for the preview of a picked/dropped file.
let selectedFileUrl = null;

// Remember `file` as the pending upload, show its name and load it into the
// preview player.
function setSelectedFile(file) {
  selectedFile = file;
  document.getElementById('asr-filename').textContent = file.name;
  // Release the previous preview URL so repeated selections do not keep
  // every earlier file alive.
  if (selectedFileUrl) {
    URL.revokeObjectURL(selectedFileUrl);
  }
  selectedFileUrl = URL.createObjectURL(file);
  const audio = document.getElementById('asr-audio');
  audio.src = selectedFileUrl;
  audio.style.display = 'block';
}

// Change handler of the hidden <input type="file">.
function onFileSelect(input) {
  if (input.files.length > 0) {
    setSelectedFile(input.files[0]);
  }
}

// Drag & drop upload
const uploadArea = document.getElementById('asr-upload');
uploadArea.addEventListener('dragover', e => {
  // preventDefault is required, otherwise the browser never fires `drop`.
  e.preventDefault();
  uploadArea.classList.add('dragover');
});
uploadArea.addEventListener('dragleave', () => uploadArea.classList.remove('dragover'));
uploadArea.addEventListener('drop', e => {
  e.preventDefault();
  uploadArea.classList.remove('dragover');
  if (e.dataTransfer.files.length > 0) {
    setSelectedFile(e.dataTransfer.files[0]);
  }
});
// Recording
let mediaRecorder = null;
let recordedChunks = [];

// Start a microphone recording, or stop the one in progress. When a
// recording stops it becomes the pending upload (`selectedFile`) and is
// loaded into the preview player.
async function toggleRecord() {
  const btn = document.getElementById('record-btn');
  const text = document.getElementById('record-text');
  if (mediaRecorder && mediaRecorder.state === 'recording') {
    mediaRecorder.stop();
    btn.classList.remove('recording');
    text.textContent = '录音';
    return;
  }
  let stream = null;
  try {
    // mediaDevices is undefined on insecure origins; fail with a readable
    // message instead of a TypeError.
    if (!navigator.mediaDevices || typeof MediaRecorder === 'undefined') {
      throw new Error('当前浏览器或页面环境不支持录音(需要 HTTPS 或 localhost)');
    }
    stream = await navigator.mediaDevices.getUserMedia({ audio: true });
    const recorder = new MediaRecorder(stream);
    const activeStream = stream;
    recordedChunks = [];
    recorder.ondataavailable = e => {
      if (e.data && e.data.size > 0) recordedChunks.push(e.data);
    };
    recorder.onstop = () => {
      // MediaRecorder does not produce WAV; label the data with the
      // container the browser actually used (e.g. audio/webm;codecs=opus).
      const mimeType = recorder.mimeType || 'audio/webm';
      const extension = mimeType.split(';')[0].split('/')[1] || 'webm';
      const blob = new Blob(recordedChunks, { type: mimeType });
      selectedFile = new File([blob], 'recording.' + extension, { type: mimeType });
      document.getElementById('asr-filename').textContent = '录音完成';
      const audio = document.getElementById('asr-audio');
      audio.src = URL.createObjectURL(blob);
      audio.style.display = 'block';
      // Release the microphone.
      activeStream.getTracks().forEach(t => t.stop());
    };
    recorder.start();
    mediaRecorder = recorder;
    btn.classList.add('recording');
    text.textContent = '停止';
  } catch (e) {
    // Do not keep the microphone open when setup failed after permission
    // was granted.
    if (stream) stream.getTracks().forEach(t => t.stop());
    alert('无法访问麦克风: ' + e.message);
  }
}
// ASR request: upload the pending audio plus hotwords to /api/asr and
// render the returned segments (or the error) into the result area.
async function runASR() {
  if (!selectedFile) { alert('请先上传或录制音频'); return; }
  const btn = document.getElementById('asr-btn');
  // Ignore clicks while a request is already running; every request starts
  // a full model inference on the server.
  if (btn.classList.contains('loading')) return;
  const result = document.getElementById('asr-result');
  btn.classList.add('loading');
  result.textContent = '正在加载模型并识别,首次需下载模型(~8GB)...';
  const form = new FormData();
  form.append('audio', selectedFile);
  form.append('hotwords', document.getElementById('asr-hotwords').value);
  // A value of 0 (first timestamp, first speaker) is valid data, so test for
  // presence instead of truthiness.
  const isPresent = v => v !== undefined && v !== null && v !== '';
  try {
    const resp = await fetch('/api/asr', { method: 'POST', body: form });
    const data = await resp.json();
    if (data.error) {
      result.textContent = '错误: ' + data.error;
    } else if (data.segments && data.segments.length > 0) {
      const lines = data.segments.map(s => {
        const text = s.text || '';
        if (!isPresent(s.start_time)) return text;
        const end = isPresent(s.end_time) ? s.end_time : '';
        const speaker = isPresent(s.speaker_id) ? s.speaker_id : '';
        return `[${s.start_time} → ${end}] 说话人${speaker}: ${text}`;
      });
      result.textContent = lines.join('\n') + `\n\n--- 耗时 ${data.time}s ---`;
    } else {
      result.textContent = data.raw || '无结果';
    }
  } catch (e) {
    result.textContent = '请求失败: ' + e.message;
  } finally {
    btn.classList.remove('loading');
  }
}
// TTS request: send the text to /api/tts and load the returned WAV into the
// player.
async function runTTS() {
  const text = document.getElementById('tts-text').value;
  if (!text.trim()) { alert('请输入文字'); return; }
  const btn = document.getElementById('tts-btn');
  // Ignore clicks while a request is already running.
  if (btn.classList.contains('loading')) return;
  const result = document.getElementById('tts-result');
  const audio = document.getElementById('tts-audio');
  btn.classList.add('loading');
  result.textContent = '正在加载模型并生成语音,首次需下载模型(~2GB)...';
  const form = new FormData();
  form.append('text', text);
  try {
    const resp = await fetch('/api/tts', { method: 'POST', body: form });
    if (resp.ok) {
      const blob = await resp.blob();
      // Free the previously generated clip before replacing it.
      const previousUrl = audio.src;
      audio.src = URL.createObjectURL(blob);
      if (previousUrl && previousUrl.startsWith('blob:')) {
        URL.revokeObjectURL(previousUrl);
      }
      audio.style.display = 'block';
      // play() returns a promise that rejects when autoplay is blocked; the
      // message below already tells the user to press play.
      const playback = audio.play();
      if (playback && typeof playback.catch === 'function') {
        playback.catch(() => {});
      }
      result.textContent = '生成完成,点击播放';
    } else {
      // The error body may not be JSON (e.g. when a proxy answers).
      let data = {};
      try {
        data = await resp.json();
      } catch (parseError) {
        data = {};
      }
      result.textContent = '错误: ' + (data.error || '未知错误');
    }
  } catch (e) {
    result.textContent = '请求失败: ' + e.message;
  } finally {
    btn.classList.remove('loading');
  }
}
</script>
</body>
</html>"""
if __name__ == "__main__":
    # Host and port keep their previous values by default and can be
    # overridden through the environment, e.g. VIBEVOICE_HOST=127.0.0.1 to
    # accept connections from this machine only.
    uvicorn.run(
        app,
        host=os.environ.get("VIBEVOICE_HOST", "0.0.0.0"),
        port=int(os.environ.get("VIBEVOICE_PORT", "4410")),
    )

View File

@@ -14,33 +14,261 @@
.container { max-width: 1200px; margin: 0 auto; } .container { max-width: 1200px; margin: 0 auto; }
h1 { h1 {
font-size: 2.5rem; font-weight: 700; font-size: 2.5rem; font-weight: 700;
background: linear-gradient(135deg, #60a5fa, #a78bfa); background: linear-gradient(135deg, #f97316, #ef4444);
-webkit-background-clip: text; -webkit-text-fill-color: transparent; -webkit-background-clip: text; -webkit-text-fill-color: transparent;
margin-bottom: 0.5rem; margin-bottom: 0.5rem;
} }
.subtitle { color: #888; font-size: 1.1rem; margin-bottom: 2rem; } .subtitle { color: #888; font-size: 1.1rem; margin-bottom: 2rem; }
.badge-row { display: flex; gap: 0.5rem; margin-bottom: 2rem; flex-wrap: wrap; }
.badge {
display: inline-block; padding: 0.3rem 0.8rem; border-radius: 20px;
font-size: 0.8rem; font-weight: 600;
}
.badge-ms { background: #1a3a5c; color: #60a5fa; }
.badge-asr { background: #3c2e1a; color: #fbbf24; }
.badge-tts { background: #1a3c2a; color: #4ade80; }
.badge-mit { background: #2e1a3c; color: #c4b5fd; }
.card { .card {
background: #141414; border: 1px solid #222; border-radius: 12px; background: #141414; border: 1px solid #222; border-radius: 12px;
padding: 2rem; margin-bottom: 1.5rem; padding: 2rem; margin-bottom: 1.5rem;
} }
.card h2 { color: #60a5fa; margin-bottom: 1rem; font-size: 1.3rem; } .card h2 { color: #f97316; margin-bottom: 1rem; font-size: 1.3rem; }
.card p { line-height: 1.8; color: #aaa; } .card p, .card li { line-height: 1.8; color: #aaa; }
.card ul { list-style: none; padding: 0; }
.card ul li::before { content: ""; margin-right: 0.5rem; }
.grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(340px, 1fr)); gap: 1.5rem; }
table { width: 100%; border-collapse: collapse; margin-top: 0.5rem; }
th, td { text-align: left; padding: 0.7rem 1rem; border-bottom: 1px solid #222; }
th { color: #f97316; font-weight: 600; font-size: 0.9rem; }
td { color: #aaa; font-size: 0.9rem; }
.highlight { color: #4ade80; font-weight: 600; }
.warn { color: #f87171; font-weight: 600; }
.model-card {
background: #1a1a2e; border: 1px solid #2a2a4e; border-radius: 10px;
padding: 1.5rem; text-align: center;
}
.model-card .icon { font-size: 2.5rem; margin-bottom: 0.5rem; }
.model-card .name { font-size: 1.1rem; color: #f97316; font-weight: 700; margin-bottom: 0.3rem; }
.model-card .size { font-size: 0.8rem; color: #666; margin-bottom: 0.8rem; }
.model-card .desc { font-size: 0.85rem; color: #aaa; line-height: 1.6; text-align: left; }
.three-col { display: grid; grid-template-columns: repeat(auto-fit, minmax(280px, 1fr)); gap: 1.5rem; margin-bottom: 1.5rem; }
.use-case {
background: #141414; border: 1px solid #2a4e2a; border-radius: 12px;
padding: 1.5rem;
}
.use-case h3 { color: #4ade80; font-size: 1rem; margin-bottom: 0.5rem; }
.use-case p { color: #888; font-size: 0.9rem; line-height: 1.6; }
.use-case .tag { display: inline-block; background: #1a3c2a; color: #4ade80; padding: 0.2rem 0.5rem; border-radius: 4px; font-size: 0.75rem; margin-top: 0.5rem; }
.code-block {
background: #1a1a2e; border: 1px solid #2a2a4e; border-radius: 8px;
padding: 1.2rem; margin-top: 1rem; overflow-x: auto;
font-family: "SF Mono", "Fira Code", monospace; font-size: 0.85rem;
color: #c4b5fd; line-height: 1.6;
}
.code-comment { color: #555; }
.links { display: flex; gap: 1rem; margin-top: 1.5rem; flex-wrap: wrap; }
.links a {
display: inline-flex; align-items: center; gap: 0.4rem;
padding: 0.6rem 1.2rem; border-radius: 8px; text-decoration: none;
font-size: 0.9rem; font-weight: 600; transition: opacity 0.2s;
}
.links a:hover { opacity: 0.8; }
.link-gh { background: #1a1a2e; color: #c4b5fd; border: 1px solid #2a2a4e; }
.link-hf { background: #1a2e1a; color: #4ade80; border: 1px solid #2a4e2a; }
.link-doc { background: #2e2a1a; color: #fbbf24; border: 1px solid #4e3a2a; }
.verdict {
background: linear-gradient(135deg, #1a0a00, #141414);
border: 1px solid #f9731633; border-radius: 12px;
padding: 2rem; margin-top: 1.5rem; text-align: center;
}
.verdict h2 { color: #f97316; margin-bottom: 0.5rem; }
.verdict p { color: #888; max-width: 600px; margin: 0 auto; }
footer { text-align: center; color: #333; margin-top: 3rem; font-size: 0.8rem; }
</style> </style>
</head> </head>
<body> <body>
<div class="container"> <div class="container">
<h1>VibeVoice 语音AI研究</h1> <h1>VibeVoice 语音 AI 全家桶</h1>
<p class="subtitle">微软开源语音全家桶ASR+TTS+实时语音,可用于法考字幕提取</p> <p class="subtitle">微软开源 | ASR + TTS + 实时语音 | MIT 许可</p>
<div class="badge-row">
<span class="badge badge-ms">Microsoft Research</span>
<span class="badge badge-asr">ASR 语音识别</span>
<span class="badge badge-tts">TTS 语音合成</span>
<span class="badge badge-mit">MIT 开源</span>
</div>
<!-- 三个模型 -->
<div class="three-col">
<div class="model-card">
<div class="icon">🎙</div>
<div class="name">VibeVoice-ASR</div>
<div class="size">语音识别模型</div>
<div class="desc">
<ul style="list-style:none; padding:0;">
<li>单次处理 60 分钟音频</li>
<li>输出:说话人 + 时间戳 + 内容</li>
<li>支持 50+ 语言</li>
<li>支持自定义热词</li>
</ul>
</div>
</div>
<div class="model-card">
<div class="icon">🔊</div>
<div class="name">VibeVoice-1.5B</div>
<div class="size">15 亿参数 · TTS</div>
<div class="desc">
<ul style="list-style:none; padding:0;">
<li>高质量文字转语音</li>
<li>自然语调和韵律</li>
<li>多语言支持</li>
<li>7.5Hz 超低帧率 token</li>
</ul>
</div>
</div>
<div class="model-card">
<div class="icon"></div>
<div class="name">VibeVoice-Realtime-0.5B</div>
<div class="size">5 亿参数 · 实时 TTS</div>
<div class="desc">
<ul style="list-style:none; padding:0;">
<li>流式文字输入</li>
<li>首音延迟 ~300ms</li>
<li>支持长文本朗读</li>
<li>适合实时对话场景</li>
</ul>
</div>
</div>
</div>
<div class="grid">
<!-- 技术亮点 -->
<div class="card">
<h2>核心技术</h2>
<table>
<tr><th>技术</th><th>说明</th></tr>
<tr><td>连续语音 Tokenizer</td><td>声学 + 语义双 Tokenizer,7.5Hz 超低帧率</td></tr>
<tr><td>长音频处理</td><td>单次 60 分钟,无需分段</td></tr>
<tr><td>说话人分离</td><td>自动识别 Who + When + What</td></tr>
<tr><td>流式推理</td><td>边输入文字边生成语音,300ms 首音</td></tr>
<tr><td>热词支持</td><td>自定义专业术语提升识别率</td></tr>
</table>
</div>
<!-- 对比 -->
<div class="card">
<h2>vs 同类方案</h2>
<table>
<tr><th>维度</th><th>Whisper</th><th>ElevenLabs</th><th>VibeVoice</th></tr>
<tr><td>ASR</td><td class="highlight"></td><td class="warn"></td><td class="highlight">有(更强)</td></tr>
<tr><td>TTS</td><td class="warn"></td><td class="highlight"></td><td class="highlight"></td></tr>
<tr><td>实时流式</td><td class="warn"></td><td class="highlight"></td><td class="highlight"></td></tr>
<tr><td>说话人识别</td><td class="warn"></td><td class="warn"></td><td class="highlight">内置</td></tr>
<tr><td>长音频</td><td>需分段</td><td>N/A</td><td class="highlight">60分钟单次</td></tr>
<tr><td>开源</td><td class="highlight"></td><td class="warn"></td><td class="highlight">MIT</td></tr>
<tr><td>费用</td><td>免费</td><td class="warn">按量付费</td><td>免费</td></tr>
</table>
</div>
</div>
<!-- 应用场景 -->
<h2 style="color: #f97316; margin: 1.5rem 0 1rem;">我们的应用场景</h2>
<div class="three-col">
<div class="use-case">
<h3>法考视频字幕提取</h3>
<p>9,553 个法考视频需要提取字幕。VibeVoice-ASR 单次处理 60 分钟 + 自动时间戳 + 说话人识别,配合法律热词("不当得利""善意取得"等)可显著提升识别率。</p>
<span class="tag">高优先级</span>
</div>
<div class="use-case">
<h3>法海法考 App 语音朗读</h3>
<p>用 Realtime-0.5B 为题目和解析生成语音朗读,支持边看题边听讲解,提升学习体验。</p>
<span class="tag">中优先级</span>
</div>
<div class="use-case">
<h3>百陶会多语言介绍</h3>
<p>用 VibeVoice-1.5B 为产品页面生成中英文语音介绍,50+ 语言支持覆盖海外客户。</p>
<span class="tag">低优先级</span>
</div>
</div>
<!-- 代码示例 -->
<div class="card"> <div class="card">
<h2>概述</h2> <h2>ASR 使用示例</h2>
<p>待补充研究内容...</p> <div class="code-block">
<span class="code-comment"># 安装</span>
pip install transformers torch
<span class="code-comment"># ASR语音转文字带时间戳和说话人</span>
from transformers import pipeline
asr = pipeline(
"automatic-speech-recognition",
model="microsoft/VibeVoice-ASR"
)
result = asr("lecture_60min.wav")
<span class="code-comment"># 输出:[{speaker: "A", start: 0.0, end: 3.2, text: "..."}, ...]</span>
</div>
</div> </div>
<div class="card"> <div class="card">
<h2>核心发现</h2> <h2>TTS 使用示例</h2>
<p>待补充...</p> <div class="code-block">
<span class="code-comment"># 实时 TTS文字转语音</span>
from transformers import AutoModelForCausalLM, AutoTokenizer
model = AutoModelForCausalLM.from_pretrained(
"microsoft/VibeVoice-Realtime-0.5B"
)
<span class="code-comment"># 流式生成,首音 ~300ms</span>
for audio_chunk in model.generate_stream("今天我们来讲民法典..."):
play(audio_chunk)
</div>
</div> </div>
<!-- 硬件 -->
<div class="card">
<h2>硬件要求与本机适配</h2>
<table>
<tr><th>模型</th><th>显存需求</th><th>M2 Max 可运行?</th></tr>
<tr><td>VibeVoice-ASR</td><td>~8GB</td><td class="highlight">可以(MPS 加速)</td></tr>
<tr><td>VibeVoice-1.5B</td><td>~6GB</td><td class="highlight">可以</td></tr>
<tr><td>VibeVoice-Realtime-0.5B</td><td>~2GB</td><td class="highlight">可以</td></tr>
</table>
<p style="margin-top: 1rem; color: #4ade80; font-size: 0.9rem;">
本机 M2 Max 64GB 完全满足所有模型运行要求
</p>
</div>
<!-- 评价 -->
<div class="verdict">
<h2>评价:实用性很高</h2>
<p>ASR + TTS + 实时语音三合一开源方案,MIT 许可无商用限制。ASR 的 60 分钟长音频 + 说话人识别是真正的差异化优势。本机 M2 Max 可直接运行,不需要 GPU 服务器。对法考字幕提取项目有直接价值。</p>
</div>
<!-- 链接 -->
<div class="links">
<a href="https://github.com/microsoft/VibeVoice" target="_blank" class="link-gh">GitHub 源码</a>
<a href="https://huggingface.co/microsoft/VibeVoice-ASR" target="_blank" class="link-hf">ASR 模型</a>
<a href="https://huggingface.co/microsoft/VibeVoice-Realtime-0.5B" target="_blank" class="link-hf">Realtime 模型</a>
<a href="https://microsoft.github.io/VibeVoice/" target="_blank" class="link-doc">官方文档</a>
</div>
<footer>
研究项目 · 立项日期 2026-03-31 · 源码克隆至 ./source/
</footer>
</div> </div>
</body> </body>
</html> </html>

1
source Submodule

Submodule source added at 3c976491d4