auto-save 2026-04-01 09:03 (+2, ~1)
This commit is contained in:
779
app.py
Normal file
779
app.py
Normal file
@@ -0,0 +1,779 @@
|
|||||||
|
"""
|
||||||
|
VibeVoice 体验平台 — Liquid Glass 风格
|
||||||
|
FastAPI 后端 + 纯 HTML 前端
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import json
|
||||||
|
import torch
|
||||||
|
import numpy as np
|
||||||
|
import tempfile
|
||||||
|
import time
|
||||||
|
import soundfile as sf
|
||||||
|
from pathlib import Path
|
||||||
|
from fastapi import FastAPI, UploadFile, File, Form
|
||||||
|
from fastapi.responses import FileResponse, HTMLResponse, JSONResponse
|
||||||
|
from fastapi.staticfiles import StaticFiles
|
||||||
|
import uvicorn
|
||||||
|
|
||||||
|
# Folders that live next to this file.
SOURCE_DIR = Path(__file__).parent.joinpath("source")
STATIC_DIR = Path(__file__).parent.joinpath("static")
# Give source/ top priority on the import path.
# NOTE(review): presumably this is where the `vibevoice` package lives — confirm.
sys.path.insert(0, str(SOURCE_DIR))

app = FastAPI()

# ========== Global state ==========
# Both caches start empty and are filled on first use by load_asr() / load_tts()
# with a "model" and a "processor" entry.
asr_model_cache = dict()
tts_model_cache = dict()

# Use Apple's MPS backend when torch reports it as available, otherwise the CPU.
if torch.backends.mps.is_available():
    DEVICE = "mps"
else:
    DEVICE = "cpu"
DTYPE = torch.float32
|
||||||
|
|
||||||
|
|
||||||
|
def load_asr():
    """Return the ASR cache, loading model and processor on the first call.

    Returns:
        The module-level ``asr_model_cache`` dict. Once populated it holds the
        model under ``"model"`` and its processor under ``"processor"``.
    """
    if not asr_model_cache:
        # Deferred imports: only pulled in when the ASR model is first needed.
        from vibevoice.modular.modeling_vibevoice_asr import (
            VibeVoiceASRForConditionalGeneration,
        )
        from vibevoice.processor.vibevoice_asr_processor import VibeVoiceASRProcessor

        print(f"Loading ASR model to {DEVICE}...")
        asr_processor = VibeVoiceASRProcessor.from_pretrained("microsoft/VibeVoice-ASR")
        asr_model = VibeVoiceASRForConditionalGeneration.from_pretrained(
            "microsoft/VibeVoice-ASR",
            torch_dtype=DTYPE,
            attn_implementation="sdpa",
            trust_remote_code=True,
        ).to(DEVICE)
        asr_model.eval()
        # Insert in the same order as before: model first, then processor.
        asr_model_cache.update(model=asr_model, processor=asr_processor)
        print("ASR model loaded")

    return asr_model_cache
|
||||||
|
|
||||||
|
|
||||||
|
def load_tts():
    """Return the TTS cache, loading model and processor on the first call.

    Returns:
        The module-level ``tts_model_cache`` dict. Once populated it holds the
        model under ``"model"`` and its processor under ``"processor"``.
    """
    if not tts_model_cache:
        # Deferred imports: only pulled in when the TTS model is first needed.
        from vibevoice.modular.modeling_vibevoice_streaming_inference import (
            VibeVoiceStreamingForConditionalGenerationInference,
        )
        from vibevoice.processor.vibevoice_streaming_processor import (
            VibeVoiceStreamingProcessor,
        )

        print(f"Loading TTS model to {DEVICE}...")
        tts_processor = VibeVoiceStreamingProcessor.from_pretrained(
            "microsoft/VibeVoice-Realtime-0.5B"
        )
        tts_model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
            "microsoft/VibeVoice-Realtime-0.5B",
            torch_dtype=DTYPE,
            attn_implementation="sdpa",
        ).to(DEVICE)
        tts_model.eval()
        # Insert in the same order as before: model first, then processor.
        tts_model_cache.update(model=tts_model, processor=tts_processor)
        print("TTS model loaded")

    return tts_model_cache
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/api/asr")
async def api_asr(audio: UploadFile = File(...), hotwords: str = Form("")):
    """Transcribe an uploaded audio file with the cached ASR model.

    Args:
        audio: Uploaded audio file. Its bytes are written to a temporary file
            whose path is handed to the processor.
        hotwords: Optional free text forwarded to the processor as
            ``context_info``; blank input is forwarded as ``None``.

    Returns:
        A ``JSONResponse`` with ``segments``, ``raw`` (decoded text) and
        ``time`` (generation seconds, one decimal) on success, or
        ``{"error": ...}`` with status 500 on any failure.
    """
    import traceback

    # Keep the upload's own extension when it is one of the formats the UI
    # advertises, so the temp file is not mislabelled; otherwise keep ".wav".
    suffix = Path(audio.filename or "").suffix.lower()
    if suffix not in {".wav", ".mp3", ".flac", ".m4a"}:
        suffix = ".wav"

    tmp_path = None
    try:
        content = await audio.read()
        # delete=False because the processor reopens the file by path; the
        # finally-clause below removes it.
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
            tmp_path = tmp.name
            tmp.write(content)

        asr = load_asr()
        model = asr["model"]
        processor = asr["processor"]

        context_info = hotwords.strip() or None
        inputs = processor(
            audio=tmp_path,
            sampling_rate=None,
            return_tensors="pt",
            add_generation_prompt=True,
            context_info=context_info,
        )
        # Move tensors to the inference device; leave non-tensor entries as-is.
        inputs = {
            k: v.to(DEVICE) if isinstance(v, torch.Tensor) else v
            for k, v in inputs.items()
        }

        start_time = time.time()
        with torch.no_grad():
            output_ids = model.generate(
                **inputs,
                max_new_tokens=32768,
                do_sample=False,
                pad_token_id=processor.pad_id,
                eos_token_id=processor.tokenizer.eos_token_id,
            )
        elapsed = time.time() - start_time

        # Drop the prompt tokens; only the newly generated tail is decoded.
        input_length = inputs["input_ids"].shape[1]
        generated_ids = output_ids[0, input_length:]
        text = processor.decode(generated_ids, skip_special_tokens=True)

        try:
            segments = processor.post_process_transcription(text)
        except Exception:
            # Best effort: fall back to a single segment with the raw text.
            segments = [{"text": text}]

        return JSONResponse({"segments": segments, "raw": text, "time": round(elapsed, 1)})
    except Exception as e:
        # Log the full traceback server-side, as the TTS endpoint does.
        traceback.print_exc()
        return JSONResponse({"error": str(e)}, status_code=500)
    finally:
        # Remove the temp file whenever it was created, even on failure.
        if tmp_path is not None and os.path.exists(tmp_path):
            os.unlink(tmp_path)
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/api/tts")
async def api_tts(text: str = Form(...)):
    """Synthesize speech for ``text`` and return it as a WAV download.

    Args:
        text: Text to speak. Leading/trailing whitespace is stripped; blank
            input is rejected with status 400.

    Returns:
        An ``audio/wav`` response named ``vibevoice_tts.wav`` (written at
        24000 Hz) on success, or a ``JSONResponse`` ``{"error": ...}`` with
        status 400 (blank text) or 500 (no voice preset, generation failure,
        no audio produced, or any other exception).
    """
    import copy
    import io
    import threading
    import traceback

    from fastapi.responses import Response

    script = text.strip()
    if not script:
        return JSONResponse({"error": "empty text"}, status_code=400)

    try:
        tts = load_tts()
        model = tts["model"]
        processor = tts["processor"]

        voices_dir = SOURCE_DIR / "demo" / "voices" / "streaming_model"
        # Sorted so the same preset is picked on every request; rglob order
        # is filesystem-dependent.
        voice_files = sorted(voices_dir.rglob("*.pt")) if voices_dir.exists() else []
        if not voice_files:
            return JSONResponse({"error": "no voice presets found"}, status_code=500)

        # NOTE(review): weights_only=False unpickles arbitrary objects. This is
        # only acceptable while presets are trusted local files — never point
        # it at user-supplied data.
        prefilled = torch.load(voice_files[0], map_location=DEVICE, weights_only=False)
        processed = processor.process_input_with_cached_prompt(
            text=script,
            cached_prompt=prefilled,
            padding=True,
            return_tensors="pt",
            return_attention_mask=True,
        )
        # Move anything that supports .to() onto the inference device.
        inputs = {k: v.to(DEVICE) if hasattr(v, "to") else v for k, v in processed.items()}

        from vibevoice.modular.streamer import AudioStreamer

        audio_streamer = AudioStreamer(batch_size=1, stop_signal=None, timeout=None)
        errors = []

        # Rebuild the noise scheduler from its own config with these solver
        # settings, then run five diffusion steps per generation.
        model.model.noise_scheduler = model.model.noise_scheduler.from_config(
            model.model.noise_scheduler.config,
            algorithm_type="sde-dpmsolver++",
            beta_schedule="squaredcos_cap_v2",
        )
        model.set_ddpm_inference_steps(num_steps=5)

        stop_event = threading.Event()

        def run_gen():
            """Run generation on a worker thread, feeding ``audio_streamer``."""
            try:
                model.generate(
                    **inputs,
                    max_new_tokens=None,
                    cfg_scale=1.5,
                    tokenizer=processor.tokenizer,
                    generation_config={"do_sample": False, "temperature": 1.0, "top_p": 1.0},
                    audio_streamer=audio_streamer,
                    stop_check_fn=stop_event.is_set,
                    verbose=False,
                    refresh_negative=True,
                    # Deep copy so generation cannot mutate the loaded preset.
                    all_prefilled_outputs=copy.deepcopy(prefilled),
                )
            except Exception as e:
                errors.append(e)
            finally:
                # Always signal end-of-stream so the consumer loop below cannot
                # block forever (the streamer was created with timeout=None).
                audio_streamer.end()

        thread = threading.Thread(target=run_gen, daemon=True)
        thread.start()

        # Drain the stream, normalising every chunk to a flat float32 array.
        audio_chunks = []
        for chunk in audio_streamer.get_stream(0):
            if torch.is_tensor(chunk):
                chunk = chunk.detach().cpu().to(torch.float32).numpy()
            else:
                chunk = np.asarray(chunk, dtype=np.float32)
            if chunk.ndim > 1:
                chunk = chunk.reshape(-1)
            audio_chunks.append(chunk)

        thread.join()
        if errors:
            return JSONResponse({"error": str(errors[0])}, status_code=500)
        if not audio_chunks:
            # np.concatenate([]) would raise a cryptic ValueError; say what happened.
            return JSONResponse({"error": "no audio generated"}, status_code=500)

        audio = np.clip(np.concatenate(audio_chunks), -1.0, 1.0)
        # Encode in memory instead of leaving an undeleted temp file per request.
        wav_buffer = io.BytesIO()
        sf.write(wav_buffer, audio, 24000, format="WAV")
        return Response(
            content=wav_buffer.getvalue(),
            media_type="audio/wav",
            headers={"Content-Disposition": 'attachment; filename="vibevoice_tts.wav"'},
        )

    except Exception as e:
        traceback.print_exc()
        return JSONResponse({"error": str(e)}, status_code=500)
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/")
def index():
    """Serve the embedded single-page front end stored in ``HTML_PAGE``."""
    page = HTMLResponse(content=HTML_PAGE)
    return page
|
||||||
|
|
||||||
|
|
||||||
|
HTML_PAGE = r"""<!DOCTYPE html>
|
||||||
|
<html lang="zh-CN">
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||||
|
<title>VibeVoice</title>
|
||||||
|
<style>
|
||||||
|
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
|
||||||
|
|
||||||
|
* { margin: 0; padding: 0; box-sizing: border-box; }
|
||||||
|
|
||||||
|
body {
|
||||||
|
font-family: 'Inter', -apple-system, sans-serif;
|
||||||
|
min-height: 100vh;
|
||||||
|
background: url("https://images.unsplash.com/photo-1557682250-33bd709cbe85?w=1920&q=80") center/cover fixed;
|
||||||
|
display: flex;
|
||||||
|
flex-direction: column;
|
||||||
|
align-items: center;
|
||||||
|
padding: 2rem;
|
||||||
|
color: #fff;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* 背景动画叠加层 */
|
||||||
|
body::before {
|
||||||
|
content: '';
|
||||||
|
position: fixed;
|
||||||
|
inset: 0;
|
||||||
|
background: linear-gradient(135deg,
|
||||||
|
rgba(99, 102, 241, 0.15),
|
||||||
|
rgba(168, 85, 247, 0.1),
|
||||||
|
rgba(236, 72, 153, 0.1));
|
||||||
|
z-index: 0;
|
||||||
|
animation: shiftGradient 15s ease infinite;
|
||||||
|
}
|
||||||
|
@keyframes shiftGradient {
|
||||||
|
0%, 100% { opacity: 0.6; }
|
||||||
|
50% { opacity: 1; }
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ===== 液态玻璃四层架构 ===== */
|
||||||
|
.liquidGlass-wrapper {
|
||||||
|
position: relative;
|
||||||
|
overflow: hidden;
|
||||||
|
box-shadow: 0 6px 6px rgba(0,0,0,0.2), 0 0 20px rgba(0,0,0,0.1);
|
||||||
|
transition: all 0.4s cubic-bezier(0.175, 0.885, 0.32, 2.2);
|
||||||
|
}
|
||||||
|
.liquidGlass-effect {
|
||||||
|
position: absolute; z-index: 0; inset: 0;
|
||||||
|
backdrop-filter: blur(3px);
|
||||||
|
filter: url(#glass-distortion);
|
||||||
|
overflow: hidden;
|
||||||
|
isolation: isolate;
|
||||||
|
}
|
||||||
|
.liquidGlass-tint {
|
||||||
|
z-index: 1; position: absolute; inset: 0;
|
||||||
|
background: rgba(255, 255, 255, 0.12);
|
||||||
|
}
|
||||||
|
.liquidGlass-shine {
|
||||||
|
position: absolute; inset: 0; z-index: 2; overflow: hidden;
|
||||||
|
box-shadow: inset 2px 2px 1px 0 rgba(255,255,255,0.5),
|
||||||
|
inset -1px -1px 1px 1px rgba(255,255,255,0.5);
|
||||||
|
}
|
||||||
|
.liquidGlass-content {
|
||||||
|
position: relative; z-index: 3;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ===== 布局 ===== */
|
||||||
|
.container {
|
||||||
|
position: relative; z-index: 1;
|
||||||
|
max-width: 900px; width: 100%;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Header */
|
||||||
|
.header {
|
||||||
|
text-align: center;
|
||||||
|
margin-bottom: 2rem;
|
||||||
|
}
|
||||||
|
.header h1 {
|
||||||
|
font-size: 3rem; font-weight: 700;
|
||||||
|
text-shadow: 0 2px 20px rgba(0,0,0,0.3);
|
||||||
|
letter-spacing: -0.02em;
|
||||||
|
}
|
||||||
|
.header p {
|
||||||
|
color: rgba(255,255,255,0.7); margin-top: 0.5rem; font-size: 1rem;
|
||||||
|
}
|
||||||
|
.badges { display: flex; gap: 0.5rem; justify-content: center; margin-top: 1rem; }
|
||||||
|
.badge {
|
||||||
|
padding: 0.25rem 0.75rem; border-radius: 2rem; font-size: 0.75rem;
|
||||||
|
font-weight: 500; backdrop-filter: blur(10px);
|
||||||
|
background: rgba(255,255,255,0.15); border: 1px solid rgba(255,255,255,0.25);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Tabs */
|
||||||
|
.tabs {
|
||||||
|
display: flex; gap: 0.5rem; margin-bottom: 1.5rem; justify-content: center;
|
||||||
|
}
|
||||||
|
.tab-btn {
|
||||||
|
border-radius: 3rem; padding: 0.7rem 2rem; border: none;
|
||||||
|
font-size: 0.95rem; font-weight: 600; cursor: pointer;
|
||||||
|
color: rgba(255,255,255,0.6); background: transparent;
|
||||||
|
transition: all 0.4s cubic-bezier(0.175, 0.885, 0.32, 2.2);
|
||||||
|
}
|
||||||
|
.tab-btn.active {
|
||||||
|
color: #fff;
|
||||||
|
}
|
||||||
|
.tab-btn:hover { transform: scale(1.05); }
|
||||||
|
|
||||||
|
/* 玻璃卡片 */
|
||||||
|
.glass-card {
|
||||||
|
border-radius: 1.8rem;
|
||||||
|
margin-bottom: 1.5rem;
|
||||||
|
}
|
||||||
|
.glass-card .liquidGlass-effect,
|
||||||
|
.glass-card .liquidGlass-tint,
|
||||||
|
.glass-card .liquidGlass-shine {
|
||||||
|
border-radius: 1.8rem;
|
||||||
|
}
|
||||||
|
.glass-card .liquidGlass-content {
|
||||||
|
padding: 2rem;
|
||||||
|
}
|
||||||
|
.glass-card:hover {
|
||||||
|
box-shadow: 0 8px 12px rgba(0,0,0,0.25), 0 0 30px rgba(0,0,0,0.15);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* 玻璃 Tab 按钮 */
|
||||||
|
.tab-glass {
|
||||||
|
border-radius: 3rem;
|
||||||
|
}
|
||||||
|
.tab-glass .liquidGlass-effect,
|
||||||
|
.tab-glass .liquidGlass-tint,
|
||||||
|
.tab-glass .liquidGlass-shine {
|
||||||
|
border-radius: 3rem;
|
||||||
|
}
|
||||||
|
.tab-glass .liquidGlass-content {
|
||||||
|
padding: 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* 表单 */
|
||||||
|
.form-label {
|
||||||
|
font-size: 0.85rem; font-weight: 600; color: rgba(255,255,255,0.8);
|
||||||
|
margin-bottom: 0.5rem; display: block;
|
||||||
|
}
|
||||||
|
.form-group { margin-bottom: 1.2rem; }
|
||||||
|
|
||||||
|
textarea, input[type="text"] {
|
||||||
|
width: 100%; padding: 0.8rem 1rem;
|
||||||
|
background: rgba(0,0,0,0.2); border: 1px solid rgba(255,255,255,0.15);
|
||||||
|
border-radius: 1rem; color: #fff; font-size: 0.9rem;
|
||||||
|
font-family: inherit; resize: vertical;
|
||||||
|
transition: border-color 0.3s, box-shadow 0.3s;
|
||||||
|
}
|
||||||
|
textarea:focus, input[type="text"]:focus {
|
||||||
|
outline: none;
|
||||||
|
border-color: rgba(255,255,255,0.4);
|
||||||
|
box-shadow: 0 0 20px rgba(167,139,250,0.15);
|
||||||
|
}
|
||||||
|
textarea::placeholder, input::placeholder {
|
||||||
|
color: rgba(255,255,255,0.3);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* 上传区域 */
|
||||||
|
.upload-area {
|
||||||
|
border: 2px dashed rgba(255,255,255,0.2);
|
||||||
|
border-radius: 1rem; padding: 2rem; text-align: center;
|
||||||
|
cursor: pointer; transition: all 0.3s ease;
|
||||||
|
background: rgba(0,0,0,0.1);
|
||||||
|
}
|
||||||
|
.upload-area:hover, .upload-area.dragover {
|
||||||
|
border-color: rgba(255,255,255,0.5);
|
||||||
|
background: rgba(255,255,255,0.05);
|
||||||
|
}
|
||||||
|
.upload-area .icon { font-size: 2.5rem; margin-bottom: 0.5rem; }
|
||||||
|
.upload-area .text { color: rgba(255,255,255,0.5); font-size: 0.85rem; }
|
||||||
|
.upload-area .filename {
|
||||||
|
color: rgba(255,255,255,0.9); font-weight: 600; margin-top: 0.5rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* 录音按钮 */
|
||||||
|
.record-btn {
|
||||||
|
display: inline-flex; align-items: center; gap: 0.4rem;
|
||||||
|
padding: 0.5rem 1rem; border-radius: 2rem;
|
||||||
|
border: 1px solid rgba(255,255,255,0.2);
|
||||||
|
background: rgba(255,255,255,0.08);
|
||||||
|
color: rgba(255,255,255,0.7); font-size: 0.85rem;
|
||||||
|
cursor: pointer; transition: all 0.3s; margin-top: 0.8rem;
|
||||||
|
}
|
||||||
|
.record-btn:hover { background: rgba(255,255,255,0.15); }
|
||||||
|
.record-btn.recording {
|
||||||
|
border-color: #ef4444; color: #ef4444;
|
||||||
|
animation: pulse 1.5s ease infinite;
|
||||||
|
}
|
||||||
|
@keyframes pulse {
|
||||||
|
0%, 100% { box-shadow: 0 0 0 0 rgba(239,68,68,0.4); }
|
||||||
|
50% { box-shadow: 0 0 0 8px rgba(239,68,68,0); }
|
||||||
|
}
|
||||||
|
|
||||||
|
/* 主按钮 — 液态玻璃 */
|
||||||
|
.btn-primary {
|
||||||
|
border-radius: 3rem; cursor: pointer; border: none;
|
||||||
|
width: 100%;
|
||||||
|
}
|
||||||
|
.btn-primary .liquidGlass-tint {
|
||||||
|
background: rgba(255,255,255,0.2);
|
||||||
|
}
|
||||||
|
.btn-primary .liquidGlass-effect,
|
||||||
|
.btn-primary .liquidGlass-tint,
|
||||||
|
.btn-primary .liquidGlass-shine {
|
||||||
|
border-radius: 3rem;
|
||||||
|
}
|
||||||
|
.btn-primary .liquidGlass-content {
|
||||||
|
padding: 0.9rem 2rem; text-align: center;
|
||||||
|
font-weight: 700; font-size: 1rem; color: #fff;
|
||||||
|
}
|
||||||
|
.btn-primary:hover {
|
||||||
|
transform: scale(1.02);
|
||||||
|
}
|
||||||
|
.btn-primary:active { transform: scale(0.98); }
|
||||||
|
.btn-primary.loading .liquidGlass-content::after {
|
||||||
|
content: ''; display: inline-block; width: 16px; height: 16px;
|
||||||
|
border: 2px solid rgba(255,255,255,0.3);
|
||||||
|
border-top-color: #fff; border-radius: 50%;
|
||||||
|
margin-left: 8px; vertical-align: middle;
|
||||||
|
animation: spin 0.8s linear infinite;
|
||||||
|
}
|
||||||
|
@keyframes spin { to { transform: rotate(360deg); } }
|
||||||
|
|
||||||
|
/* 结果区域 */
|
||||||
|
.result-area {
|
||||||
|
background: rgba(0,0,0,0.25); border-radius: 1rem;
|
||||||
|
padding: 1.2rem; min-height: 100px;
|
||||||
|
font-family: 'SF Mono', 'Fira Code', monospace;
|
||||||
|
font-size: 0.85rem; line-height: 1.7;
|
||||||
|
color: rgba(255,255,255,0.85);
|
||||||
|
white-space: pre-wrap; word-break: break-word;
|
||||||
|
max-height: 400px; overflow-y: auto;
|
||||||
|
}
|
||||||
|
.result-area:empty::after {
|
||||||
|
content: '等待识别...';
|
||||||
|
color: rgba(255,255,255,0.25);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* 音频播放器 */
|
||||||
|
.audio-player {
|
||||||
|
width: 100%; margin-top: 1rem; border-radius: 1rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.tab-panel { display: none; }
|
||||||
|
.tab-panel.active { display: block; }
|
||||||
|
|
||||||
|
/* 提示文字 */
|
||||||
|
.hint {
|
||||||
|
color: rgba(255,255,255,0.35); font-size: 0.8rem; margin-top: 1rem;
|
||||||
|
line-height: 1.6;
|
||||||
|
}
|
||||||
|
|
||||||
|
.two-col { display: grid; grid-template-columns: 1fr 1fr; gap: 1.5rem; }
|
||||||
|
@media (max-width: 700px) { .two-col { grid-template-columns: 1fr; } }
|
||||||
|
|
||||||
|
/* 底部 */
|
||||||
|
.footer {
|
||||||
|
text-align: center; color: rgba(255,255,255,0.2);
|
||||||
|
font-size: 0.75rem; margin-top: 2rem;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
|
||||||
|
<!-- SVG 折射滤镜 -->
|
||||||
|
<svg style="display:none">
|
||||||
|
<filter id="glass-distortion" x="0%" y="0%" width="100%" height="100%" filterUnits="objectBoundingBox">
|
||||||
|
<feTurbulence type="fractalNoise" baseFrequency="0.01 0.01" numOctaves="1" seed="5" result="turbulence"/>
|
||||||
|
<feComponentTransfer in="turbulence" result="mapped">
|
||||||
|
<feFuncR type="gamma" amplitude="1" exponent="10" offset="0.5"/>
|
||||||
|
<feFuncG type="gamma" amplitude="0" exponent="1" offset="0"/>
|
||||||
|
<feFuncB type="gamma" amplitude="0" exponent="1" offset="0.5"/>
|
||||||
|
</feComponentTransfer>
|
||||||
|
<feGaussianBlur in="turbulence" stdDeviation="3" result="softMap"/>
|
||||||
|
<feSpecularLighting in="softMap" surfaceScale="5" specularConstant="1" specularExponent="100" lighting-color="white" result="specLight">
|
||||||
|
<fePointLight x="-200" y="-200" z="300"/>
|
||||||
|
</feSpecularLighting>
|
||||||
|
<feComposite in="specLight" operator="arithmetic" k1="0" k2="1" k3="1" k4="0" result="litImage"/>
|
||||||
|
<feDisplacementMap in="SourceGraphic" in2="softMap" scale="150" xChannelSelector="R" yChannelSelector="G"/>
|
||||||
|
</filter>
|
||||||
|
</svg>
|
||||||
|
|
||||||
|
<div class="container">
|
||||||
|
<!-- Header -->
|
||||||
|
<div class="header">
|
||||||
|
<h1>VibeVoice</h1>
|
||||||
|
<p>Microsoft 开源语音 AI — 语音识别 & 语音合成</p>
|
||||||
|
<div class="badges">
|
||||||
|
<span class="badge">Microsoft Research</span>
|
||||||
|
<span class="badge">MIT 开源</span>
|
||||||
|
<span class="badge">MPS 本地加速</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Tabs -->
|
||||||
|
<div class="tabs">
|
||||||
|
<div class="liquidGlass-wrapper tab-glass" onclick="switchTab('asr')" id="tab-asr">
|
||||||
|
<div class="liquidGlass-effect"></div>
|
||||||
|
<div class="liquidGlass-tint"></div>
|
||||||
|
<div class="liquidGlass-shine"></div>
|
||||||
|
<div class="liquidGlass-content">
|
||||||
|
<button class="tab-btn active" data-tab="asr">语音识别 ASR</button>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="liquidGlass-wrapper tab-glass" onclick="switchTab('tts')" id="tab-tts">
|
||||||
|
<div class="liquidGlass-effect"></div>
|
||||||
|
<div class="liquidGlass-tint"></div>
|
||||||
|
<div class="liquidGlass-shine"></div>
|
||||||
|
<div class="liquidGlass-content">
|
||||||
|
<button class="tab-btn" data-tab="tts">语音合成 TTS</button>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- ASR Panel -->
|
||||||
|
<div class="tab-panel active" id="panel-asr">
|
||||||
|
<div class="two-col">
|
||||||
|
<div>
|
||||||
|
<div class="liquidGlass-wrapper glass-card">
|
||||||
|
<div class="liquidGlass-effect"></div>
|
||||||
|
<div class="liquidGlass-tint"></div>
|
||||||
|
<div class="liquidGlass-shine"></div>
|
||||||
|
<div class="liquidGlass-content">
|
||||||
|
<div class="form-group">
|
||||||
|
<label class="form-label">上传音频</label>
|
||||||
|
<div class="upload-area" id="asr-upload" onclick="document.getElementById('asr-file').click()">
|
||||||
|
<div class="icon">🎵</div>
|
||||||
|
<div class="text">点击或拖拽上传音频文件<br>WAV / MP3 / FLAC / M4A</div>
|
||||||
|
<div class="filename" id="asr-filename"></div>
|
||||||
|
</div>
|
||||||
|
<input type="file" id="asr-file" accept="audio/*" style="display:none" onchange="onFileSelect(this)">
|
||||||
|
<button class="record-btn" id="record-btn" onclick="toggleRecord()">
|
||||||
|
<span>🎙</span> <span id="record-text">录音</span>
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
|
<div class="form-group">
|
||||||
|
<label class="form-label">热词(可选,提升专业术语识别率)</label>
|
||||||
|
<input type="text" id="asr-hotwords" placeholder="不当得利, 善意取得, 民法典">
|
||||||
|
</div>
|
||||||
|
<div class="liquidGlass-wrapper btn-primary" id="asr-btn" onclick="runASR()">
|
||||||
|
<div class="liquidGlass-effect"></div>
|
||||||
|
<div class="liquidGlass-tint"></div>
|
||||||
|
<div class="liquidGlass-shine"></div>
|
||||||
|
<div class="liquidGlass-content">开始识别</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<div class="liquidGlass-wrapper glass-card">
|
||||||
|
<div class="liquidGlass-effect"></div>
|
||||||
|
<div class="liquidGlass-tint"></div>
|
||||||
|
<div class="liquidGlass-shine"></div>
|
||||||
|
<div class="liquidGlass-content">
|
||||||
|
<label class="form-label">识别结果</label>
|
||||||
|
<div class="result-area" id="asr-result"></div>
|
||||||
|
<audio id="asr-audio" class="audio-player" controls style="display:none"></audio>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- TTS Panel -->
|
||||||
|
<div class="tab-panel" id="panel-tts">
|
||||||
|
<div class="two-col">
|
||||||
|
<div>
|
||||||
|
<div class="liquidGlass-wrapper glass-card">
|
||||||
|
<div class="liquidGlass-effect"></div>
|
||||||
|
<div class="liquidGlass-tint"></div>
|
||||||
|
<div class="liquidGlass-shine"></div>
|
||||||
|
<div class="liquidGlass-content">
|
||||||
|
<div class="form-group">
|
||||||
|
<label class="form-label">输入文字</label>
|
||||||
|
<textarea id="tts-text" rows="8" placeholder="输入你想转换为语音的文字..."></textarea>
|
||||||
|
</div>
|
||||||
|
<div class="liquidGlass-wrapper btn-primary" id="tts-btn" onclick="runTTS()">
|
||||||
|
<div class="liquidGlass-effect"></div>
|
||||||
|
<div class="liquidGlass-tint"></div>
|
||||||
|
<div class="liquidGlass-shine"></div>
|
||||||
|
<div class="liquidGlass-content">生成语音</div>
|
||||||
|
</div>
|
||||||
|
<p class="hint">示例:今天我们来讲民法典中关于不当得利的规定。根据民法典第九百八十五条,得利人没有法律根据取得不当利益的,受损失的人可以请求得利人返还取得的利益。</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<div class="liquidGlass-wrapper glass-card">
|
||||||
|
<div class="liquidGlass-effect"></div>
|
||||||
|
<div class="liquidGlass-tint"></div>
|
||||||
|
<div class="liquidGlass-shine"></div>
|
||||||
|
<div class="liquidGlass-content">
|
||||||
|
<label class="form-label">生成结果</label>
|
||||||
|
<div class="result-area" id="tts-result">等待生成...</div>
|
||||||
|
<audio id="tts-audio" class="audio-player" controls style="display:none"></audio>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="footer">VibeVoice by Microsoft Research · 本地部署 · 数据不离开你的电脑</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
// Tab 切换
|
||||||
|
function switchTab(tab) {
|
||||||
|
document.querySelectorAll('.tab-panel').forEach(p => p.classList.remove('active'));
|
||||||
|
document.querySelectorAll('.tab-btn').forEach(b => b.classList.remove('active'));
|
||||||
|
document.getElementById('panel-' + tab).classList.add('active');
|
||||||
|
document.querySelector(`[data-tab="${tab}"]`).classList.add('active');
|
||||||
|
}
|
||||||
|
|
||||||
|
// 文件选择
|
||||||
|
let selectedFile = null;
|
||||||
|
function onFileSelect(input) {
|
||||||
|
if (input.files.length > 0) {
|
||||||
|
selectedFile = input.files[0];
|
||||||
|
document.getElementById('asr-filename').textContent = selectedFile.name;
|
||||||
|
// 显示播放器
|
||||||
|
const audio = document.getElementById('asr-audio');
|
||||||
|
audio.src = URL.createObjectURL(selectedFile);
|
||||||
|
audio.style.display = 'block';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 拖拽上传
|
||||||
|
const uploadArea = document.getElementById('asr-upload');
|
||||||
|
uploadArea.addEventListener('dragover', e => { e.preventDefault(); uploadArea.classList.add('dragover'); });
|
||||||
|
uploadArea.addEventListener('dragleave', () => uploadArea.classList.remove('dragover'));
|
||||||
|
uploadArea.addEventListener('drop', e => {
|
||||||
|
e.preventDefault();
|
||||||
|
uploadArea.classList.remove('dragover');
|
||||||
|
if (e.dataTransfer.files.length > 0) {
|
||||||
|
selectedFile = e.dataTransfer.files[0];
|
||||||
|
document.getElementById('asr-filename').textContent = selectedFile.name;
|
||||||
|
const audio = document.getElementById('asr-audio');
|
||||||
|
audio.src = URL.createObjectURL(selectedFile);
|
||||||
|
audio.style.display = 'block';
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// 录音
|
||||||
|
let mediaRecorder = null;
|
||||||
|
let recordedChunks = [];
|
||||||
|
async function toggleRecord() {
|
||||||
|
const btn = document.getElementById('record-btn');
|
||||||
|
const text = document.getElementById('record-text');
|
||||||
|
|
||||||
|
if (mediaRecorder && mediaRecorder.state === 'recording') {
|
||||||
|
mediaRecorder.stop();
|
||||||
|
btn.classList.remove('recording');
|
||||||
|
text.textContent = '录音';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
|
||||||
|
mediaRecorder = new MediaRecorder(stream);
|
||||||
|
recordedChunks = [];
|
||||||
|
mediaRecorder.ondataavailable = e => recordedChunks.push(e.data);
|
||||||
|
mediaRecorder.onstop = () => {
|
||||||
|
const blob = new Blob(recordedChunks, { type: 'audio/wav' });
|
||||||
|
selectedFile = new File([blob], 'recording.wav', { type: 'audio/wav' });
|
||||||
|
document.getElementById('asr-filename').textContent = '录音完成';
|
||||||
|
const audio = document.getElementById('asr-audio');
|
||||||
|
audio.src = URL.createObjectURL(blob);
|
||||||
|
audio.style.display = 'block';
|
||||||
|
stream.getTracks().forEach(t => t.stop());
|
||||||
|
};
|
||||||
|
mediaRecorder.start();
|
||||||
|
btn.classList.add('recording');
|
||||||
|
text.textContent = '停止';
|
||||||
|
} catch (e) {
|
||||||
|
alert('无法访问麦克风: ' + e.message);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ASR 调用
|
||||||
|
async function runASR() {
|
||||||
|
if (!selectedFile) { alert('请先上传或录制音频'); return; }
|
||||||
|
|
||||||
|
const btn = document.getElementById('asr-btn');
|
||||||
|
const result = document.getElementById('asr-result');
|
||||||
|
btn.classList.add('loading');
|
||||||
|
result.textContent = '正在加载模型并识别,首次需下载模型(~8GB)...';
|
||||||
|
|
||||||
|
const form = new FormData();
|
||||||
|
form.append('audio', selectedFile);
|
||||||
|
form.append('hotwords', document.getElementById('asr-hotwords').value);
|
||||||
|
|
||||||
|
try {
|
||||||
|
const resp = await fetch('/api/asr', { method: 'POST', body: form });
|
||||||
|
const data = await resp.json();
|
||||||
|
|
||||||
|
if (data.error) {
|
||||||
|
result.textContent = '错误: ' + data.error;
|
||||||
|
} else if (data.segments && data.segments.length > 0) {
|
||||||
|
const lines = data.segments.map(s => {
|
||||||
|
const start = s.start_time || '';
|
||||||
|
const end = s.end_time || '';
|
||||||
|
const speaker = s.speaker_id || '';
|
||||||
|
const text = s.text || '';
|
||||||
|
if (start) return `[${start} → ${end}] 说话人${speaker}: ${text}`;
|
||||||
|
return text;
|
||||||
|
});
|
||||||
|
result.textContent = lines.join('\n') + `\n\n--- 耗时 ${data.time}s ---`;
|
||||||
|
} else {
|
||||||
|
result.textContent = data.raw || '无结果';
|
||||||
|
}
|
||||||
|
} catch (e) {
|
||||||
|
result.textContent = '请求失败: ' + e.message;
|
||||||
|
} finally {
|
||||||
|
btn.classList.remove('loading');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TTS 调用
|
||||||
|
async function runTTS() {
|
||||||
|
const text = document.getElementById('tts-text').value;
|
||||||
|
if (!text.trim()) { alert('请输入文字'); return; }
|
||||||
|
|
||||||
|
const btn = document.getElementById('tts-btn');
|
||||||
|
const result = document.getElementById('tts-result');
|
||||||
|
const audio = document.getElementById('tts-audio');
|
||||||
|
btn.classList.add('loading');
|
||||||
|
result.textContent = '正在加载模型并生成语音,首次需下载模型(~2GB)...';
|
||||||
|
|
||||||
|
const form = new FormData();
|
||||||
|
form.append('text', text);
|
||||||
|
|
||||||
|
try {
|
||||||
|
const resp = await fetch('/api/tts', { method: 'POST', body: form });
|
||||||
|
if (resp.ok) {
|
||||||
|
const blob = await resp.blob();
|
||||||
|
audio.src = URL.createObjectURL(blob);
|
||||||
|
audio.style.display = 'block';
|
||||||
|
audio.play();
|
||||||
|
result.textContent = '生成完成,点击播放';
|
||||||
|
} else {
|
||||||
|
const data = await resp.json();
|
||||||
|
result.textContent = '错误: ' + (data.error || '未知错误');
|
||||||
|
}
|
||||||
|
} catch (e) {
|
||||||
|
result.textContent = '请求失败: ' + e.message;
|
||||||
|
} finally {
|
||||||
|
btn.classList.remove('loading');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
</script>
|
||||||
|
</body>
|
||||||
|
</html>"""
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: serve the app on all interfaces, port 4410.
    uvicorn.run(app=app, port=4410, host="0.0.0.0")
|
||||||
246
index.html
246
index.html
@@ -14,33 +14,261 @@
|
|||||||
.container { max-width: 1200px; margin: 0 auto; }
|
.container { max-width: 1200px; margin: 0 auto; }
|
||||||
h1 {
|
h1 {
|
||||||
font-size: 2.5rem; font-weight: 700;
|
font-size: 2.5rem; font-weight: 700;
|
||||||
background: linear-gradient(135deg, #60a5fa, #a78bfa);
|
background: linear-gradient(135deg, #f97316, #ef4444);
|
||||||
-webkit-background-clip: text; -webkit-text-fill-color: transparent;
|
-webkit-background-clip: text; -webkit-text-fill-color: transparent;
|
||||||
margin-bottom: 0.5rem;
|
margin-bottom: 0.5rem;
|
||||||
}
|
}
|
||||||
.subtitle { color: #888; font-size: 1.1rem; margin-bottom: 2rem; }
|
.subtitle { color: #888; font-size: 1.1rem; margin-bottom: 2rem; }
|
||||||
|
.badge-row { display: flex; gap: 0.5rem; margin-bottom: 2rem; flex-wrap: wrap; }
|
||||||
|
.badge {
|
||||||
|
display: inline-block; padding: 0.3rem 0.8rem; border-radius: 20px;
|
||||||
|
font-size: 0.8rem; font-weight: 600;
|
||||||
|
}
|
||||||
|
.badge-ms { background: #1a3a5c; color: #60a5fa; }
|
||||||
|
.badge-asr { background: #3c2e1a; color: #fbbf24; }
|
||||||
|
.badge-tts { background: #1a3c2a; color: #4ade80; }
|
||||||
|
.badge-mit { background: #2e1a3c; color: #c4b5fd; }
|
||||||
|
|
||||||
.card {
|
.card {
|
||||||
background: #141414; border: 1px solid #222; border-radius: 12px;
|
background: #141414; border: 1px solid #222; border-radius: 12px;
|
||||||
padding: 2rem; margin-bottom: 1.5rem;
|
padding: 2rem; margin-bottom: 1.5rem;
|
||||||
}
|
}
|
||||||
.card h2 { color: #60a5fa; margin-bottom: 1rem; font-size: 1.3rem; }
|
.card h2 { color: #f97316; margin-bottom: 1rem; font-size: 1.3rem; }
|
||||||
.card p { line-height: 1.8; color: #aaa; }
|
.card p, .card li { line-height: 1.8; color: #aaa; }
|
||||||
|
.card ul { list-style: none; padding: 0; }
|
||||||
|
.card ul li::before { content: ""; margin-right: 0.5rem; }
|
||||||
|
|
||||||
|
.grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(340px, 1fr)); gap: 1.5rem; }
|
||||||
|
|
||||||
|
table { width: 100%; border-collapse: collapse; margin-top: 0.5rem; }
|
||||||
|
th, td { text-align: left; padding: 0.7rem 1rem; border-bottom: 1px solid #222; }
|
||||||
|
th { color: #f97316; font-weight: 600; font-size: 0.9rem; }
|
||||||
|
td { color: #aaa; font-size: 0.9rem; }
|
||||||
|
|
||||||
|
.highlight { color: #4ade80; font-weight: 600; }
|
||||||
|
.warn { color: #f87171; font-weight: 600; }
|
||||||
|
|
||||||
|
.model-card {
|
||||||
|
background: #1a1a2e; border: 1px solid #2a2a4e; border-radius: 10px;
|
||||||
|
padding: 1.5rem; text-align: center;
|
||||||
|
}
|
||||||
|
.model-card .icon { font-size: 2.5rem; margin-bottom: 0.5rem; }
|
||||||
|
.model-card .name { font-size: 1.1rem; color: #f97316; font-weight: 700; margin-bottom: 0.3rem; }
|
||||||
|
.model-card .size { font-size: 0.8rem; color: #666; margin-bottom: 0.8rem; }
|
||||||
|
.model-card .desc { font-size: 0.85rem; color: #aaa; line-height: 1.6; text-align: left; }
|
||||||
|
|
||||||
|
.three-col { display: grid; grid-template-columns: repeat(auto-fit, minmax(280px, 1fr)); gap: 1.5rem; margin-bottom: 1.5rem; }
|
||||||
|
|
||||||
|
.use-case {
|
||||||
|
background: #141414; border: 1px solid #2a4e2a; border-radius: 12px;
|
||||||
|
padding: 1.5rem;
|
||||||
|
}
|
||||||
|
.use-case h3 { color: #4ade80; font-size: 1rem; margin-bottom: 0.5rem; }
|
||||||
|
.use-case p { color: #888; font-size: 0.9rem; line-height: 1.6; }
|
||||||
|
.use-case .tag { display: inline-block; background: #1a3c2a; color: #4ade80; padding: 0.2rem 0.5rem; border-radius: 4px; font-size: 0.75rem; margin-top: 0.5rem; }
|
||||||
|
|
||||||
|
.code-block {
|
||||||
|
background: #1a1a2e; border: 1px solid #2a2a4e; border-radius: 8px;
|
||||||
|
padding: 1.2rem; margin-top: 1rem; overflow-x: auto;
|
||||||
|
font-family: "SF Mono", "Fira Code", monospace; font-size: 0.85rem;
|
||||||
|
color: #c4b5fd; line-height: 1.6;
|
||||||
|
}
|
||||||
|
.code-comment { color: #555; }
|
||||||
|
|
||||||
|
.links { display: flex; gap: 1rem; margin-top: 1.5rem; flex-wrap: wrap; }
|
||||||
|
.links a {
|
||||||
|
display: inline-flex; align-items: center; gap: 0.4rem;
|
||||||
|
padding: 0.6rem 1.2rem; border-radius: 8px; text-decoration: none;
|
||||||
|
font-size: 0.9rem; font-weight: 600; transition: opacity 0.2s;
|
||||||
|
}
|
||||||
|
.links a:hover { opacity: 0.8; }
|
||||||
|
.link-gh { background: #1a1a2e; color: #c4b5fd; border: 1px solid #2a2a4e; }
|
||||||
|
.link-hf { background: #1a2e1a; color: #4ade80; border: 1px solid #2a4e2a; }
|
||||||
|
.link-doc { background: #2e2a1a; color: #fbbf24; border: 1px solid #4e3a2a; }
|
||||||
|
|
||||||
|
.verdict {
|
||||||
|
background: linear-gradient(135deg, #1a0a00, #141414);
|
||||||
|
border: 1px solid #f9731633; border-radius: 12px;
|
||||||
|
padding: 2rem; margin-top: 1.5rem; text-align: center;
|
||||||
|
}
|
||||||
|
.verdict h2 { color: #f97316; margin-bottom: 0.5rem; }
|
||||||
|
.verdict p { color: #888; max-width: 600px; margin: 0 auto; }
|
||||||
|
|
||||||
|
footer { text-align: center; color: #333; margin-top: 3rem; font-size: 0.8rem; }
|
||||||
</style>
|
</style>
|
||||||
</head>
|
</head>
|
||||||
<body>
|
<body>
|
||||||
<div class="container">
|
<div class="container">
|
||||||
<h1>VibeVoice 语音AI研究</h1>
|
<h1>VibeVoice — 语音 AI 全家桶</h1>
|
||||||
<p class="subtitle">微软开源语音全家桶,ASR+TTS+实时语音,可用于法考字幕提取</p>
|
<p class="subtitle">微软开源 | ASR + TTS + 实时语音 | MIT 许可</p>
|
||||||
|
|
||||||
|
<div class="badge-row">
|
||||||
|
<span class="badge badge-ms">Microsoft Research</span>
|
||||||
|
<span class="badge badge-asr">ASR 语音识别</span>
|
||||||
|
<span class="badge badge-tts">TTS 语音合成</span>
|
||||||
|
<span class="badge badge-mit">MIT 开源</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- 三个模型 -->
|
||||||
|
<div class="three-col">
|
||||||
|
<div class="model-card">
|
||||||
|
<div class="icon">🎙</div>
|
||||||
|
<div class="name">VibeVoice-ASR</div>
|
||||||
|
<div class="size">语音识别模型</div>
|
||||||
|
<div class="desc">
|
||||||
|
<ul style="list-style:none; padding:0;">
|
||||||
|
<li>单次处理 60 分钟音频</li>
|
||||||
|
<li>输出:说话人 + 时间戳 + 内容</li>
|
||||||
|
<li>支持 50+ 语言</li>
|
||||||
|
<li>支持自定义热词</li>
|
||||||
|
</ul>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="model-card">
|
||||||
|
<div class="icon">🔊</div>
|
||||||
|
<div class="name">VibeVoice-1.5B</div>
|
||||||
|
<div class="size">15 亿参数 · TTS</div>
|
||||||
|
<div class="desc">
|
||||||
|
<ul style="list-style:none; padding:0;">
|
||||||
|
<li>高质量文字转语音</li>
|
||||||
|
<li>自然语调和韵律</li>
|
||||||
|
<li>多语言支持</li>
|
||||||
|
<li>7.5Hz 超低帧率 token</li>
|
||||||
|
</ul>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="model-card">
|
||||||
|
<div class="icon">⚡</div>
|
||||||
|
<div class="name">VibeVoice-Realtime-0.5B</div>
|
||||||
|
<div class="size">5 亿参数 · 实时 TTS</div>
|
||||||
|
<div class="desc">
|
||||||
|
<ul style="list-style:none; padding:0;">
|
||||||
|
<li>流式文字输入</li>
|
||||||
|
<li>首音延迟 ~300ms</li>
|
||||||
|
<li>支持长文本朗读</li>
|
||||||
|
<li>适合实时对话场景</li>
|
||||||
|
</ul>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="grid">
|
||||||
|
<!-- 技术亮点 -->
|
||||||
|
<div class="card">
|
||||||
|
<h2>核心技术</h2>
|
||||||
|
<table>
|
||||||
|
<tr><th>技术</th><th>说明</th></tr>
|
||||||
|
<tr><td>连续语音 Tokenizer</td><td>声学 + 语义双 Tokenizer,7.5Hz 超低帧率</td></tr>
|
||||||
|
<tr><td>长音频处理</td><td>单次 60 分钟,无需分段</td></tr>
|
||||||
|
<tr><td>说话人分离</td><td>自动识别 Who + When + What</td></tr>
|
||||||
|
<tr><td>流式推理</td><td>边输入文字边生成语音,300ms 首音</td></tr>
|
||||||
|
<tr><td>热词支持</td><td>自定义专业术语提升识别率</td></tr>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- 对比 -->
|
||||||
|
<div class="card">
|
||||||
|
<h2>vs 同类方案</h2>
|
||||||
|
<table>
|
||||||
|
<tr><th>维度</th><th>Whisper</th><th>ElevenLabs</th><th>VibeVoice</th></tr>
|
||||||
|
<tr><td>ASR</td><td class="highlight">有</td><td class="warn">无</td><td class="highlight">有(更强)</td></tr>
|
||||||
|
<tr><td>TTS</td><td class="warn">无</td><td class="highlight">有</td><td class="highlight">有</td></tr>
|
||||||
|
<tr><td>实时流式</td><td class="warn">无</td><td class="highlight">有</td><td class="highlight">有</td></tr>
|
||||||
|
<tr><td>说话人识别</td><td class="warn">无</td><td class="warn">无</td><td class="highlight">内置</td></tr>
|
||||||
|
<tr><td>长音频</td><td>需分段</td><td>N/A</td><td class="highlight">60分钟单次</td></tr>
|
||||||
|
<tr><td>开源</td><td class="highlight">是</td><td class="warn">否</td><td class="highlight">是(MIT)</td></tr>
|
||||||
|
<tr><td>费用</td><td>免费</td><td class="warn">按量付费</td><td>免费</td></tr>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- 应用场景 -->
|
||||||
|
<h2 style="color: #f97316; margin: 1.5rem 0 1rem;">我们的应用场景</h2>
|
||||||
|
<div class="three-col">
|
||||||
|
<div class="use-case">
|
||||||
|
<h3>法考视频字幕提取</h3>
|
||||||
|
<p>9,553 个法考视频需要提取字幕。VibeVoice-ASR 单次处理 60 分钟 + 自动时间戳 + 说话人识别,配合法律热词("不当得利""善意取得"等)可显著提升识别率。</p>
|
||||||
|
<span class="tag">高优先级</span>
|
||||||
|
</div>
|
||||||
|
<div class="use-case">
|
||||||
|
<h3>法海法考 App 语音朗读</h3>
|
||||||
|
<p>用 Realtime-0.5B 为题目和解析生成语音朗读,支持边看题边听讲解,提升学习体验。</p>
|
||||||
|
<span class="tag">中优先级</span>
|
||||||
|
</div>
|
||||||
|
<div class="use-case">
|
||||||
|
<h3>百陶会多语言介绍</h3>
|
||||||
|
<p>用 VibeVoice-1.5B 为产品页面生成中英文语音介绍,50+ 语言支持覆盖海外客户。</p>
|
||||||
|
<span class="tag">低优先级</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- 代码示例 -->
|
||||||
<div class="card">
|
<div class="card">
|
||||||
<h2>概述</h2>
|
<h2>ASR 使用示例</h2>
|
||||||
<p>待补充研究内容...</p>
|
<div class="code-block">
|
||||||
|
<span class="code-comment"># 安装</span>
|
||||||
|
pip install transformers torch
|
||||||
|
|
||||||
|
<span class="code-comment"># ASR:语音转文字(带时间戳和说话人)</span>
|
||||||
|
from transformers import pipeline
|
||||||
|
|
||||||
|
asr = pipeline(
|
||||||
|
"automatic-speech-recognition",
|
||||||
|
model="microsoft/VibeVoice-ASR"
|
||||||
|
)
|
||||||
|
|
||||||
|
result = asr("lecture_60min.wav")
|
||||||
|
<span class="code-comment"># 输出:[{speaker: "A", start: 0.0, end: 3.2, text: "..."}, ...]</span>
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div class="card">
|
<div class="card">
|
||||||
<h2>核心发现</h2>
|
<h2>TTS 使用示例</h2>
|
||||||
<p>待补充...</p>
|
<div class="code-block">
|
||||||
|
<span class="code-comment"># 实时 TTS:文字转语音</span>
|
||||||
|
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||||
|
|
||||||
|
model = AutoModelForCausalLM.from_pretrained(
|
||||||
|
"microsoft/VibeVoice-Realtime-0.5B"
|
||||||
|
)
|
||||||
|
|
||||||
|
<span class="code-comment"># 流式生成,首音 ~300ms</span>
|
||||||
|
for audio_chunk in model.generate_stream("今天我们来讲民法典..."):
|
||||||
|
play(audio_chunk)
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<!-- 硬件 -->
|
||||||
|
<div class="card">
|
||||||
|
<h2>硬件要求与本机适配</h2>
|
||||||
|
<table>
|
||||||
|
<tr><th>模型</th><th>显存需求</th><th>M2 Max 可运行?</th></tr>
|
||||||
|
<tr><td>VibeVoice-ASR</td><td>~8GB</td><td class="highlight">可以(MPS 加速)</td></tr>
|
||||||
|
<tr><td>VibeVoice-1.5B</td><td>~6GB</td><td class="highlight">可以</td></tr>
|
||||||
|
<tr><td>VibeVoice-Realtime-0.5B</td><td>~2GB</td><td class="highlight">可以</td></tr>
|
||||||
|
</table>
|
||||||
|
<p style="margin-top: 1rem; color: #4ade80; font-size: 0.9rem;">
|
||||||
|
本机 M2 Max 64GB 完全满足所有模型运行要求
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- 评价 -->
|
||||||
|
<div class="verdict">
|
||||||
|
<h2>评价:实用性很高</h2>
|
||||||
|
<p>ASR + TTS + 实时语音三合一开源方案,MIT 许可无商用限制。ASR 的 60 分钟长音频 + 说话人识别是真正的差异化优势。本机 M2 Max 可直接运行,不需要 GPU 服务器。对法考字幕提取项目有直接价值。</p>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- 链接 -->
|
||||||
|
<div class="links">
|
||||||
|
<a href="https://github.com/microsoft/VibeVoice" target="_blank" class="link-gh">GitHub 源码</a>
|
||||||
|
<a href="https://huggingface.co/microsoft/VibeVoice-ASR" target="_blank" class="link-hf">ASR 模型</a>
|
||||||
|
<a href="https://huggingface.co/microsoft/VibeVoice-Realtime-0.5B" target="_blank" class="link-hf">Realtime 模型</a>
|
||||||
|
<a href="https://microsoft.github.io/VibeVoice/" target="_blank" class="link-doc">官方文档</a>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<footer>
|
||||||
|
研究项目 · 立项日期 2026-03-31 · 源码克隆至 ./source/
|
||||||
|
</footer>
|
||||||
</div>
|
</div>
|
||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
||||||
|
|||||||
1
source
Submodule
1
source
Submodule
Submodule source added at 3c976491d4
Reference in New Issue
Block a user