"""
VibeVoice demo platform — Liquid Glass style.

FastAPI backend + plain HTML frontend.
"""
|
||
|
||
import os
|
||
import sys
|
||
import json
|
||
import torch
|
||
import numpy as np
|
||
import tempfile
|
||
import time
|
||
import soundfile as sf
|
||
from pathlib import Path
|
||
from fastapi import FastAPI, UploadFile, File, Form
|
||
from fastapi.responses import FileResponse, HTMLResponse, JSONResponse
|
||
from fastapi.staticfiles import StaticFiles
|
||
import uvicorn
|
||
|
||
# Directories that sit next to this file. SOURCE_DIR is put on sys.path below
# and is also where api_tts() looks for voice presets; presumably it holds the
# VibeVoice source checkout — confirm. STATIC_DIR is not referenced by the
# code visible in this file.
SOURCE_DIR = Path(__file__).parent / "source"
STATIC_DIR = Path(__file__).parent / "static"
# Insert at position 0 so the lazy `vibevoice...` imports in the loaders
# search SOURCE_DIR before any other sys.path entry.
sys.path.insert(0, str(SOURCE_DIR))

app = FastAPI()

# ========== Global state ==========
# Process-wide caches; load_asr() / load_tts() fill them on first use with the
# keys "model" and "processor".
asr_model_cache = {}
tts_model_cache = {}

# Use the "mps" device when torch reports it as available, otherwise the CPU.
DEVICE = "mps" if torch.backends.mps.is_available() else "cpu"
DTYPE = torch.float32
|
||
|
||
|
||
def load_asr():
    """Return the shared ASR cache, loading model and processor on first use.

    The cache is the module-level ``asr_model_cache`` dict. Once populated it
    holds the keys ``"model"`` and ``"processor"`` and is returned as-is on
    every later call.
    """
    if not asr_model_cache:
        # Imported lazily: the vibevoice package is only needed once a model
        # is actually requested.
        from vibevoice.modular.modeling_vibevoice_asr import (
            VibeVoiceASRForConditionalGeneration,
        )
        from vibevoice.processor.vibevoice_asr_processor import VibeVoiceASRProcessor

        model_id = "microsoft/VibeVoice-ASR"
        print(f"Loading ASR model to {DEVICE}...")

        asr_processor = VibeVoiceASRProcessor.from_pretrained(model_id)
        asr_model = VibeVoiceASRForConditionalGeneration.from_pretrained(
            model_id,
            torch_dtype=DTYPE,
            attn_implementation="sdpa",
            trust_remote_code=True,
        ).to(DEVICE)
        asr_model.eval()

        asr_model_cache.update(model=asr_model, processor=asr_processor)
        print("ASR model loaded")

    return asr_model_cache
|
||
|
||
|
||
def load_tts():
    """Return the shared TTS cache, loading model and processor on first use.

    The cache is the module-level ``tts_model_cache`` dict. Once populated it
    holds the keys ``"model"`` and ``"processor"`` and is returned as-is on
    every later call.
    """
    if not tts_model_cache:
        # Imported lazily: the vibevoice package is only needed once a model
        # is actually requested.
        from vibevoice.modular.modeling_vibevoice_streaming_inference import (
            VibeVoiceStreamingForConditionalGenerationInference,
        )
        from vibevoice.processor.vibevoice_streaming_processor import (
            VibeVoiceStreamingProcessor,
        )

        model_id = "microsoft/VibeVoice-Realtime-0.5B"
        print(f"Loading TTS model to {DEVICE}...")

        tts_processor = VibeVoiceStreamingProcessor.from_pretrained(model_id)
        tts_model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
            model_id,
            torch_dtype=DTYPE,
            attn_implementation="sdpa",
        ).to(DEVICE)
        tts_model.eval()

        tts_model_cache.update(model=tts_model, processor=tts_processor)
        print("TTS model loaded")

    return tts_model_cache
|
||
|
||
|
||
@app.post("/api/asr")
async def api_asr(audio: UploadFile = File(...), hotwords: str = Form("")):
    """Transcribe an uploaded audio file with the cached ASR model.

    The upload is written to a temporary file because the processor is given
    a path; that file is always removed before the handler returns.

    Args:
        audio: Uploaded audio file (multipart form field ``audio``).
        hotwords: Optional context string passed to the processor as
            ``context_info``; blank input is sent as ``None``.

    Returns:
        JSON ``{"segments", "raw", "time"}`` on success, where ``time`` is the
        wall-clock duration of ``generate`` in seconds rounded to one decimal.
        On failure, JSON ``{"error": ...}`` with status 400 (empty upload) or
        500 (any exception while transcribing).
    """
    content = await audio.read()
    if not content:
        return JSONResponse({"error": "empty audio"}, status_code=400)

    tmp_path = None
    try:
        # Create and fill the temp file inside the try block so it is closed
        # and cleaned up even when the write itself fails.
        # NOTE(review): the suffix is always ".wav" regardless of the uploaded
        # format — confirm the processor detects the format from the content.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            tmp_path = tmp.name
            tmp.write(content)

        asr = load_asr()
        model = asr["model"]
        processor = asr["processor"]

        context_info = hotwords.strip() or None
        inputs = processor(
            audio=tmp_path,
            sampling_rate=None,
            return_tensors="pt",
            add_generation_prompt=True,
            context_info=context_info,
        )
        # Only tensors are moved to the device; other values pass through.
        inputs = {
            k: v.to(DEVICE) if isinstance(v, torch.Tensor) else v
            for k, v in inputs.items()
        }

        # NOTE(review): generate() is a synchronous call inside this
        # coroutine, so it occupies the event loop until it returns.
        start_time = time.time()
        with torch.no_grad():
            output_ids = model.generate(
                **inputs,
                max_new_tokens=32768,
                do_sample=False,
                pad_token_id=processor.pad_id,
                eos_token_id=processor.tokenizer.eos_token_id,
            )
        elapsed = time.time() - start_time

        # Drop the prompt tokens and decode only the newly generated part.
        input_length = inputs["input_ids"].shape[1]
        generated_ids = output_ids[0, input_length:]
        text = processor.decode(generated_ids, skip_special_tokens=True)

        # Best effort: if structured post-processing fails, fall back to a
        # single segment carrying the raw decoded text.
        try:
            segments = processor.post_process_transcription(text)
        except Exception:
            segments = [{"text": text}]

        return JSONResponse(
            {"segments": segments, "raw": text, "time": round(elapsed, 1)}
        )
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)
    finally:
        if tmp_path is not None:
            Path(tmp_path).unlink(missing_ok=True)
|
||
|
||
|
||
@app.post("/api/tts")
async def api_tts(text: str = Form(...)):
    """Synthesize speech for ``text`` and return it as a WAV download.

    The first ``*.pt`` file found under ``source/demo/voices/streaming_model``
    is loaded as the cached voice prompt. Generation runs in a worker thread
    that feeds an ``AudioStreamer``; this handler drains the stream, joins the
    chunks, clips them to [-1.0, 1.0] and encodes them as WAV at 24000 Hz.
    The WAV is built in memory, so no temporary file is left behind.

    Args:
        text: Form field with the text to speak; surrounding whitespace is
            stripped before use.

    Returns:
        An ``audio/wav`` response on success, or JSON ``{"error": ...}`` with
        status 400 (blank text) or 500 (no voice preset, generation failure,
        no audio produced, or any other exception).
    """
    import copy
    import io
    import threading
    import traceback

    from fastapi.responses import Response

    text = text.strip()
    if not text:
        return JSONResponse({"error": "empty text"}, status_code=400)

    try:
        tts = load_tts()
        model = tts["model"]
        processor = tts["processor"]

        voices_dir = SOURCE_DIR / "demo" / "voices" / "streaming_model"
        voice_files = list(voices_dir.rglob("*.pt")) if voices_dir.exists() else []
        if not voice_files:
            return JSONResponse({"error": "no voice presets found"}, status_code=500)

        # NOTE(review): rglob() order is not guaranteed, so which preset is
        # "first" may depend on the filesystem.
        # NOTE(review): weights_only=False allows arbitrary unpickling; this
        # assumes the preset files are trusted local files.
        prefilled = torch.load(voice_files[0], map_location=DEVICE, weights_only=False)
        processed = processor.process_input_with_cached_prompt(
            text=text,
            cached_prompt=prefilled,
            padding=True,
            return_tensors="pt",
            return_attention_mask=True,
        )
        inputs = {
            k: v.to(DEVICE) if hasattr(v, "to") else v for k, v in processed.items()
        }

        from vibevoice.modular.streamer import AudioStreamer

        audio_streamer = AudioStreamer(batch_size=1, stop_signal=None, timeout=None)
        errors = []

        # The scheduler is rebuilt from its own config on every request.
        model.model.noise_scheduler = model.model.noise_scheduler.from_config(
            model.model.noise_scheduler.config,
            algorithm_type="sde-dpmsolver++",
            beta_schedule="squaredcos_cap_v2",
        )
        model.set_ddpm_inference_steps(num_steps=5)

        stop_event = threading.Event()

        def run_gen():
            """Run generation; on failure record the error and end the stream."""
            try:
                model.generate(
                    **inputs,
                    max_new_tokens=None,
                    cfg_scale=1.5,
                    tokenizer=processor.tokenizer,
                    generation_config={"do_sample": False, "temperature": 1.0, "top_p": 1.0},
                    audio_streamer=audio_streamer,
                    stop_check_fn=stop_event.is_set,
                    verbose=False,
                    refresh_negative=True,
                    # Deep copy so generation works on its own copy of the prompt.
                    all_prefilled_outputs=copy.deepcopy(prefilled),
                )
            except Exception as e:
                errors.append(e)
                # End the stream so the consumer loop below does not wait forever.
                audio_streamer.end()

        thread = threading.Thread(target=run_gen, daemon=True)
        thread.start()

        # Collect every streamed chunk as a flat float32 NumPy array.
        audio_chunks = []
        for chunk in audio_streamer.get_stream(0):
            if torch.is_tensor(chunk):
                chunk = chunk.detach().cpu().to(torch.float32).numpy()
            else:
                chunk = np.asarray(chunk, dtype=np.float32)
            if chunk.ndim > 1:
                chunk = chunk.reshape(-1)
            audio_chunks.append(chunk)

        thread.join()
        if errors:
            return JSONResponse({"error": str(errors[0])}, status_code=500)
        if not audio_chunks:
            # np.concatenate([]) would raise a cryptic ValueError; report the
            # actual condition instead.
            return JSONResponse({"error": "no audio generated"}, status_code=500)

        audio = np.clip(np.concatenate(audio_chunks), -1.0, 1.0)

        # Encode in memory; format must be given because there is no filename
        # to infer it from.
        buffer = io.BytesIO()
        sf.write(buffer, audio, 24000, format="WAV")
        return Response(
            content=buffer.getvalue(),
            media_type="audio/wav",
            headers={"Content-Disposition": 'attachment; filename="vibevoice_tts.wav"'},
        )

    except Exception as e:
        traceback.print_exc()
        return JSONResponse({"error": str(e)}, status_code=500)
|
||
|
||
|
||
@app.get("/")
def index():
    """Serve the single-page UI held in the module-level ``HTML_PAGE`` string."""
    page = HTMLResponse(content=HTML_PAGE)
    return page
|
||
|
||
|
||
HTML_PAGE = r"""<!DOCTYPE html>
|
||
<html lang="zh-CN">
|
||
<head>
|
||
<meta charset="UTF-8">
|
||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||
<title>VibeVoice</title>
|
||
<style>
|
||
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
|
||
|
||
* { margin: 0; padding: 0; box-sizing: border-box; }
|
||
|
||
body {
|
||
font-family: 'Inter', -apple-system, sans-serif;
|
||
min-height: 100vh;
|
||
background: url("https://images.unsplash.com/photo-1557682250-33bd709cbe85?w=1920&q=80") center/cover fixed;
|
||
display: flex;
|
||
flex-direction: column;
|
||
align-items: center;
|
||
padding: 2rem;
|
||
color: #fff;
|
||
}
|
||
|
||
/* 背景动画叠加层 */
|
||
body::before {
|
||
content: '';
|
||
position: fixed;
|
||
inset: 0;
|
||
background: linear-gradient(135deg,
|
||
rgba(99, 102, 241, 0.15),
|
||
rgba(168, 85, 247, 0.1),
|
||
rgba(236, 72, 153, 0.1));
|
||
z-index: 0;
|
||
animation: shiftGradient 15s ease infinite;
|
||
}
|
||
@keyframes shiftGradient {
|
||
0%, 100% { opacity: 0.6; }
|
||
50% { opacity: 1; }
|
||
}
|
||
|
||
/* ===== 液态玻璃四层架构 ===== */
|
||
.liquidGlass-wrapper {
|
||
position: relative;
|
||
overflow: hidden;
|
||
box-shadow: 0 6px 6px rgba(0,0,0,0.2), 0 0 20px rgba(0,0,0,0.1);
|
||
transition: all 0.4s cubic-bezier(0.175, 0.885, 0.32, 2.2);
|
||
}
|
||
.liquidGlass-effect {
|
||
position: absolute; z-index: 0; inset: 0;
|
||
backdrop-filter: blur(3px);
|
||
filter: url(#glass-distortion);
|
||
overflow: hidden;
|
||
isolation: isolate;
|
||
}
|
||
.liquidGlass-tint {
|
||
z-index: 1; position: absolute; inset: 0;
|
||
background: rgba(255, 255, 255, 0.12);
|
||
}
|
||
.liquidGlass-shine {
|
||
position: absolute; inset: 0; z-index: 2; overflow: hidden;
|
||
box-shadow: inset 2px 2px 1px 0 rgba(255,255,255,0.5),
|
||
inset -1px -1px 1px 1px rgba(255,255,255,0.5);
|
||
}
|
||
.liquidGlass-content {
|
||
position: relative; z-index: 3;
|
||
}
|
||
|
||
/* ===== 布局 ===== */
|
||
.container {
|
||
position: relative; z-index: 1;
|
||
max-width: 900px; width: 100%;
|
||
}
|
||
|
||
/* Header */
|
||
.header {
|
||
text-align: center;
|
||
margin-bottom: 2rem;
|
||
}
|
||
.header h1 {
|
||
font-size: 3rem; font-weight: 700;
|
||
text-shadow: 0 2px 20px rgba(0,0,0,0.3);
|
||
letter-spacing: -0.02em;
|
||
}
|
||
.header p {
|
||
color: rgba(255,255,255,0.7); margin-top: 0.5rem; font-size: 1rem;
|
||
}
|
||
.badges { display: flex; gap: 0.5rem; justify-content: center; margin-top: 1rem; }
|
||
.badge {
|
||
padding: 0.25rem 0.75rem; border-radius: 2rem; font-size: 0.75rem;
|
||
font-weight: 500; backdrop-filter: blur(10px);
|
||
background: rgba(255,255,255,0.15); border: 1px solid rgba(255,255,255,0.25);
|
||
}
|
||
|
||
/* Tabs */
|
||
.tabs {
|
||
display: flex; gap: 0.5rem; margin-bottom: 1.5rem; justify-content: center;
|
||
}
|
||
.tab-btn {
|
||
border-radius: 3rem; padding: 0.7rem 2rem; border: none;
|
||
font-size: 0.95rem; font-weight: 600; cursor: pointer;
|
||
color: rgba(255,255,255,0.6); background: transparent;
|
||
transition: all 0.4s cubic-bezier(0.175, 0.885, 0.32, 2.2);
|
||
}
|
||
.tab-btn.active {
|
||
color: #fff;
|
||
}
|
||
.tab-btn:hover { transform: scale(1.05); }
|
||
|
||
/* 玻璃卡片 */
|
||
.glass-card {
|
||
border-radius: 1.8rem;
|
||
margin-bottom: 1.5rem;
|
||
}
|
||
.glass-card .liquidGlass-effect,
|
||
.glass-card .liquidGlass-tint,
|
||
.glass-card .liquidGlass-shine {
|
||
border-radius: 1.8rem;
|
||
}
|
||
.glass-card .liquidGlass-content {
|
||
padding: 2rem;
|
||
}
|
||
.glass-card:hover {
|
||
box-shadow: 0 8px 12px rgba(0,0,0,0.25), 0 0 30px rgba(0,0,0,0.15);
|
||
}
|
||
|
||
/* 玻璃 Tab 按钮 */
|
||
.tab-glass {
|
||
border-radius: 3rem;
|
||
}
|
||
.tab-glass .liquidGlass-effect,
|
||
.tab-glass .liquidGlass-tint,
|
||
.tab-glass .liquidGlass-shine {
|
||
border-radius: 3rem;
|
||
}
|
||
.tab-glass .liquidGlass-content {
|
||
padding: 0;
|
||
}
|
||
|
||
/* 表单 */
|
||
.form-label {
|
||
font-size: 0.85rem; font-weight: 600; color: rgba(255,255,255,0.8);
|
||
margin-bottom: 0.5rem; display: block;
|
||
}
|
||
.form-group { margin-bottom: 1.2rem; }
|
||
|
||
textarea, input[type="text"] {
|
||
width: 100%; padding: 0.8rem 1rem;
|
||
background: rgba(0,0,0,0.2); border: 1px solid rgba(255,255,255,0.15);
|
||
border-radius: 1rem; color: #fff; font-size: 0.9rem;
|
||
font-family: inherit; resize: vertical;
|
||
transition: border-color 0.3s, box-shadow 0.3s;
|
||
}
|
||
textarea:focus, input[type="text"]:focus {
|
||
outline: none;
|
||
border-color: rgba(255,255,255,0.4);
|
||
box-shadow: 0 0 20px rgba(167,139,250,0.15);
|
||
}
|
||
textarea::placeholder, input::placeholder {
|
||
color: rgba(255,255,255,0.3);
|
||
}
|
||
|
||
/* 上传区域 */
|
||
.upload-area {
|
||
border: 2px dashed rgba(255,255,255,0.2);
|
||
border-radius: 1rem; padding: 2rem; text-align: center;
|
||
cursor: pointer; transition: all 0.3s ease;
|
||
background: rgba(0,0,0,0.1);
|
||
}
|
||
.upload-area:hover, .upload-area.dragover {
|
||
border-color: rgba(255,255,255,0.5);
|
||
background: rgba(255,255,255,0.05);
|
||
}
|
||
.upload-area .icon { font-size: 2.5rem; margin-bottom: 0.5rem; }
|
||
.upload-area .text { color: rgba(255,255,255,0.5); font-size: 0.85rem; }
|
||
.upload-area .filename {
|
||
color: rgba(255,255,255,0.9); font-weight: 600; margin-top: 0.5rem;
|
||
}
|
||
|
||
/* 录音按钮 */
|
||
.record-btn {
|
||
display: inline-flex; align-items: center; gap: 0.4rem;
|
||
padding: 0.5rem 1rem; border-radius: 2rem;
|
||
border: 1px solid rgba(255,255,255,0.2);
|
||
background: rgba(255,255,255,0.08);
|
||
color: rgba(255,255,255,0.7); font-size: 0.85rem;
|
||
cursor: pointer; transition: all 0.3s; margin-top: 0.8rem;
|
||
}
|
||
.record-btn:hover { background: rgba(255,255,255,0.15); }
|
||
.record-btn.recording {
|
||
border-color: #ef4444; color: #ef4444;
|
||
animation: pulse 1.5s ease infinite;
|
||
}
|
||
@keyframes pulse {
|
||
0%, 100% { box-shadow: 0 0 0 0 rgba(239,68,68,0.4); }
|
||
50% { box-shadow: 0 0 0 8px rgba(239,68,68,0); }
|
||
}
|
||
|
||
/* 主按钮 — 液态玻璃 */
|
||
.btn-primary {
|
||
border-radius: 3rem; cursor: pointer; border: none;
|
||
width: 100%;
|
||
}
|
||
.btn-primary .liquidGlass-tint {
|
||
background: rgba(255,255,255,0.2);
|
||
}
|
||
.btn-primary .liquidGlass-effect,
|
||
.btn-primary .liquidGlass-tint,
|
||
.btn-primary .liquidGlass-shine {
|
||
border-radius: 3rem;
|
||
}
|
||
.btn-primary .liquidGlass-content {
|
||
padding: 0.9rem 2rem; text-align: center;
|
||
font-weight: 700; font-size: 1rem; color: #fff;
|
||
}
|
||
.btn-primary:hover {
|
||
transform: scale(1.02);
|
||
}
|
||
.btn-primary:active { transform: scale(0.98); }
|
||
.btn-primary.loading .liquidGlass-content::after {
|
||
content: ''; display: inline-block; width: 16px; height: 16px;
|
||
border: 2px solid rgba(255,255,255,0.3);
|
||
border-top-color: #fff; border-radius: 50%;
|
||
margin-left: 8px; vertical-align: middle;
|
||
animation: spin 0.8s linear infinite;
|
||
}
|
||
@keyframes spin { to { transform: rotate(360deg); } }
|
||
|
||
/* 结果区域 */
|
||
.result-area {
|
||
background: rgba(0,0,0,0.25); border-radius: 1rem;
|
||
padding: 1.2rem; min-height: 100px;
|
||
font-family: 'SF Mono', 'Fira Code', monospace;
|
||
font-size: 0.85rem; line-height: 1.7;
|
||
color: rgba(255,255,255,0.85);
|
||
white-space: pre-wrap; word-break: break-word;
|
||
max-height: 400px; overflow-y: auto;
|
||
}
|
||
.result-area:empty::after {
|
||
content: '等待识别...';
|
||
color: rgba(255,255,255,0.25);
|
||
}
|
||
|
||
/* 音频播放器 */
|
||
.audio-player {
|
||
width: 100%; margin-top: 1rem; border-radius: 1rem;
|
||
}
|
||
|
||
.tab-panel { display: none; }
|
||
.tab-panel.active { display: block; }
|
||
|
||
/* 提示文字 */
|
||
.hint {
|
||
color: rgba(255,255,255,0.35); font-size: 0.8rem; margin-top: 1rem;
|
||
line-height: 1.6;
|
||
}
|
||
|
||
.two-col { display: grid; grid-template-columns: 1fr 1fr; gap: 1.5rem; }
|
||
@media (max-width: 700px) { .two-col { grid-template-columns: 1fr; } }
|
||
|
||
/* 底部 */
|
||
.footer {
|
||
text-align: center; color: rgba(255,255,255,0.2);
|
||
font-size: 0.75rem; margin-top: 2rem;
|
||
}
|
||
</style>
|
||
</head>
|
||
<body>
|
||
|
||
<!-- SVG 折射滤镜 -->
|
||
<svg style="display:none">
|
||
<filter id="glass-distortion" x="0%" y="0%" width="100%" height="100%" filterUnits="objectBoundingBox">
|
||
<feTurbulence type="fractalNoise" baseFrequency="0.01 0.01" numOctaves="1" seed="5" result="turbulence"/>
|
||
<feComponentTransfer in="turbulence" result="mapped">
|
||
<feFuncR type="gamma" amplitude="1" exponent="10" offset="0.5"/>
|
||
<feFuncG type="gamma" amplitude="0" exponent="1" offset="0"/>
|
||
<feFuncB type="gamma" amplitude="0" exponent="1" offset="0.5"/>
|
||
</feComponentTransfer>
|
||
<feGaussianBlur in="turbulence" stdDeviation="3" result="softMap"/>
|
||
<feSpecularLighting in="softMap" surfaceScale="5" specularConstant="1" specularExponent="100" lighting-color="white" result="specLight">
|
||
<fePointLight x="-200" y="-200" z="300"/>
|
||
</feSpecularLighting>
|
||
<feComposite in="specLight" operator="arithmetic" k1="0" k2="1" k3="1" k4="0" result="litImage"/>
|
||
<feDisplacementMap in="SourceGraphic" in2="softMap" scale="150" xChannelSelector="R" yChannelSelector="G"/>
|
||
</filter>
|
||
</svg>
|
||
|
||
<div class="container">
|
||
<!-- Header -->
|
||
<div class="header">
|
||
<h1>VibeVoice</h1>
|
||
<p>Microsoft 开源语音 AI — 语音识别 & 语音合成</p>
|
||
<div class="badges">
|
||
<span class="badge">Microsoft Research</span>
|
||
<span class="badge">MIT 开源</span>
|
||
<span class="badge">MPS 本地加速</span>
|
||
</div>
|
||
</div>
|
||
|
||
<!-- Tabs -->
|
||
<div class="tabs">
|
||
<div class="liquidGlass-wrapper tab-glass" onclick="switchTab('asr')" id="tab-asr">
|
||
<div class="liquidGlass-effect"></div>
|
||
<div class="liquidGlass-tint"></div>
|
||
<div class="liquidGlass-shine"></div>
|
||
<div class="liquidGlass-content">
|
||
<button class="tab-btn active" data-tab="asr">语音识别 ASR</button>
|
||
</div>
|
||
</div>
|
||
<div class="liquidGlass-wrapper tab-glass" onclick="switchTab('tts')" id="tab-tts">
|
||
<div class="liquidGlass-effect"></div>
|
||
<div class="liquidGlass-tint"></div>
|
||
<div class="liquidGlass-shine"></div>
|
||
<div class="liquidGlass-content">
|
||
<button class="tab-btn" data-tab="tts">语音合成 TTS</button>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
|
||
<!-- ASR Panel -->
|
||
<div class="tab-panel active" id="panel-asr">
|
||
<div class="two-col">
|
||
<div>
|
||
<div class="liquidGlass-wrapper glass-card">
|
||
<div class="liquidGlass-effect"></div>
|
||
<div class="liquidGlass-tint"></div>
|
||
<div class="liquidGlass-shine"></div>
|
||
<div class="liquidGlass-content">
|
||
<div class="form-group">
|
||
<label class="form-label">上传音频</label>
|
||
<div class="upload-area" id="asr-upload" onclick="document.getElementById('asr-file').click()">
|
||
<div class="icon">🎵</div>
|
||
<div class="text">点击或拖拽上传音频文件<br>WAV / MP3 / FLAC / M4A</div>
|
||
<div class="filename" id="asr-filename"></div>
|
||
</div>
|
||
<input type="file" id="asr-file" accept="audio/*" style="display:none" onchange="onFileSelect(this)">
|
||
<button class="record-btn" id="record-btn" onclick="toggleRecord()">
|
||
<span>🎙</span> <span id="record-text">录音</span>
|
||
</button>
|
||
</div>
|
||
<div class="form-group">
|
||
<label class="form-label">热词(可选,提升专业术语识别率)</label>
|
||
<input type="text" id="asr-hotwords" placeholder="不当得利, 善意取得, 民法典">
|
||
</div>
|
||
<div class="liquidGlass-wrapper btn-primary" id="asr-btn" onclick="runASR()">
|
||
<div class="liquidGlass-effect"></div>
|
||
<div class="liquidGlass-tint"></div>
|
||
<div class="liquidGlass-shine"></div>
|
||
<div class="liquidGlass-content">开始识别</div>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
<div>
|
||
<div class="liquidGlass-wrapper glass-card">
|
||
<div class="liquidGlass-effect"></div>
|
||
<div class="liquidGlass-tint"></div>
|
||
<div class="liquidGlass-shine"></div>
|
||
<div class="liquidGlass-content">
|
||
<label class="form-label">识别结果</label>
|
||
<div class="result-area" id="asr-result"></div>
|
||
<audio id="asr-audio" class="audio-player" controls style="display:none"></audio>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
|
||
<!-- TTS Panel -->
|
||
<div class="tab-panel" id="panel-tts">
|
||
<div class="two-col">
|
||
<div>
|
||
<div class="liquidGlass-wrapper glass-card">
|
||
<div class="liquidGlass-effect"></div>
|
||
<div class="liquidGlass-tint"></div>
|
||
<div class="liquidGlass-shine"></div>
|
||
<div class="liquidGlass-content">
|
||
<div class="form-group">
|
||
<label class="form-label">输入文字</label>
|
||
<textarea id="tts-text" rows="8" placeholder="输入你想转换为语音的文字..."></textarea>
|
||
</div>
|
||
<div class="liquidGlass-wrapper btn-primary" id="tts-btn" onclick="runTTS()">
|
||
<div class="liquidGlass-effect"></div>
|
||
<div class="liquidGlass-tint"></div>
|
||
<div class="liquidGlass-shine"></div>
|
||
<div class="liquidGlass-content">生成语音</div>
|
||
</div>
|
||
<p class="hint">示例:今天我们来讲民法典中关于不当得利的规定。根据民法典第九百八十五条,得利人没有法律根据取得不当利益的,受损失的人可以请求得利人返还取得的利益。</p>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
<div>
|
||
<div class="liquidGlass-wrapper glass-card">
|
||
<div class="liquidGlass-effect"></div>
|
||
<div class="liquidGlass-tint"></div>
|
||
<div class="liquidGlass-shine"></div>
|
||
<div class="liquidGlass-content">
|
||
<label class="form-label">生成结果</label>
|
||
<div class="result-area" id="tts-result">等待生成...</div>
|
||
<audio id="tts-audio" class="audio-player" controls style="display:none"></audio>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
|
||
<div class="footer">VibeVoice by Microsoft Research · 本地部署 · 数据不离开你的电脑</div>
|
||
</div>
|
||
|
||
<script>
|
||
// Tab 切换
|
||
function switchTab(tab) {
|
||
document.querySelectorAll('.tab-panel').forEach(p => p.classList.remove('active'));
|
||
document.querySelectorAll('.tab-btn').forEach(b => b.classList.remove('active'));
|
||
document.getElementById('panel-' + tab).classList.add('active');
|
||
document.querySelector(`[data-tab="${tab}"]`).classList.add('active');
|
||
}
|
||
|
||
// 文件选择
|
||
let selectedFile = null;
|
||
function onFileSelect(input) {
|
||
if (input.files.length > 0) {
|
||
selectedFile = input.files[0];
|
||
document.getElementById('asr-filename').textContent = selectedFile.name;
|
||
// 显示播放器
|
||
const audio = document.getElementById('asr-audio');
|
||
audio.src = URL.createObjectURL(selectedFile);
|
||
audio.style.display = 'block';
|
||
}
|
||
}
|
||
|
||
// 拖拽上传
|
||
const uploadArea = document.getElementById('asr-upload');
|
||
uploadArea.addEventListener('dragover', e => { e.preventDefault(); uploadArea.classList.add('dragover'); });
|
||
uploadArea.addEventListener('dragleave', () => uploadArea.classList.remove('dragover'));
|
||
uploadArea.addEventListener('drop', e => {
|
||
e.preventDefault();
|
||
uploadArea.classList.remove('dragover');
|
||
if (e.dataTransfer.files.length > 0) {
|
||
selectedFile = e.dataTransfer.files[0];
|
||
document.getElementById('asr-filename').textContent = selectedFile.name;
|
||
const audio = document.getElementById('asr-audio');
|
||
audio.src = URL.createObjectURL(selectedFile);
|
||
audio.style.display = 'block';
|
||
}
|
||
});
|
||
|
||
// 录音
|
||
let mediaRecorder = null;
|
||
let recordedChunks = [];
|
||
// Start a microphone recording, or stop the one in progress. When recording
// stops, the captured clip becomes `selectedFile` and is loaded into the
// preview player.
async function toggleRecord() {
  const btn = document.getElementById('record-btn');
  const text = document.getElementById('record-text');

  // Second click while recording: stop, and let onstop publish the clip.
  if (mediaRecorder && mediaRecorder.state === 'recording') {
    mediaRecorder.stop();
    btn.classList.remove('recording');
    text.textContent = '录音';
    return;
  }

  try {
    const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
    const recorder = new MediaRecorder(stream);
    mediaRecorder = recorder;
    recordedChunks = [];
    recorder.ondataavailable = e => recordedChunks.push(e.data);
    recorder.onstop = () => {
      // MediaRecorder chooses its own container format, so label the clip
      // with the type it reports instead of claiming it is WAV.
      const mime = recorder.mimeType
        || (recordedChunks[0] && recordedChunks[0].type)
        || 'audio/webm';
      const ext = mime.split(';')[0].split('/')[1] || 'webm';
      const blob = new Blob(recordedChunks, { type: mime });
      selectedFile = new File([blob], 'recording.' + ext, { type: mime });
      document.getElementById('asr-filename').textContent = '录音完成';
      const audio = document.getElementById('asr-audio');
      audio.src = URL.createObjectURL(blob);
      audio.style.display = 'block';
      // Release the microphone once the clip is captured.
      stream.getTracks().forEach(t => t.stop());
    };
    recorder.start();
    btn.classList.add('recording');
    text.textContent = '停止';
  } catch (e) {
    alert('无法访问麦克风: ' + e.message);
  }
}
|
||
|
||
// ASR 调用
|
||
// Upload `selectedFile` (plus optional hotwords) to /api/asr and render the
// returned segments, or the error, into the result area.
async function runASR() {
  if (!selectedFile) { alert('请先上传或录制音频'); return; }

  const btn = document.getElementById('asr-btn');
  const result = document.getElementById('asr-result');
  // The trigger is a plain <div> and cannot be disabled, so use the loading
  // class to ignore clicks while a request is still in flight.
  if (btn.classList.contains('loading')) return;
  btn.classList.add('loading');
  result.textContent = '正在加载模型并识别，首次需下载模型（~8GB）...';

  const form = new FormData();
  form.append('audio', selectedFile);
  form.append('hotwords', document.getElementById('asr-hotwords').value);

  // Map null/undefined to '' while keeping legitimate zero values.
  const orEmpty = v => (v === undefined || v === null) ? '' : v;

  try {
    const resp = await fetch('/api/asr', { method: 'POST', body: form });
    const data = await resp.json();

    if (data.error) {
      result.textContent = '错误: ' + data.error;
    } else if (!resp.ok) {
      // Non-2xx reply that carries no "error" field.
      result.textContent = '错误: HTTP ' + resp.status;
    } else if (data.segments && data.segments.length > 0) {
      const lines = data.segments.map(s => {
        const start = orEmpty(s.start_time);
        const segText = orEmpty(s.text);
        // A start time of 0 is a valid timestamp, so test for presence
        // rather than truthiness.
        if (start === '') return segText;
        const end = orEmpty(s.end_time);
        const speaker = orEmpty(s.speaker_id);
        return `[${start} → ${end}] 说话人${speaker}: ${segText}`;
      });
      result.textContent = lines.join('\n') + `\n\n--- 耗时 ${data.time}s ---`;
    } else {
      result.textContent = data.raw || '无结果';
    }
  } catch (e) {
    result.textContent = '请求失败: ' + e.message;
  } finally {
    btn.classList.remove('loading');
  }
}
|
||
|
||
// TTS 调用
|
||
// Send the textarea content to /api/tts and load the returned WAV into the
// player, or show the error message.
async function runTTS() {
  const text = document.getElementById('tts-text').value;
  if (!text.trim()) { alert('请输入文字'); return; }

  const btn = document.getElementById('tts-btn');
  const result = document.getElementById('tts-result');
  const audio = document.getElementById('tts-audio');
  // The trigger is a plain <div> and cannot be disabled, so use the loading
  // class to ignore clicks while a request is still in flight.
  if (btn.classList.contains('loading')) return;
  btn.classList.add('loading');
  result.textContent = '正在加载模型并生成语音，首次需下载模型（~2GB）...';

  const form = new FormData();
  form.append('text', text);

  try {
    const resp = await fetch('/api/tts', { method: 'POST', body: form });
    if (resp.ok) {
      const blob = await resp.blob();
      // Free the previous clip's blob URL before replacing it.
      if (audio.src.startsWith('blob:')) URL.revokeObjectURL(audio.src);
      audio.src = URL.createObjectURL(blob);
      audio.style.display = 'block';
      result.textContent = '生成完成，点击播放';
      // play() returns a promise that rejects when autoplay is blocked; the
      // visible controls still let the user start playback manually.
      audio.play().catch(() => {});
    } else {
      const data = await resp.json();
      result.textContent = '错误: ' + (data.error || '未知错误');
    }
  } catch (e) {
    result.textContent = '请求失败: ' + e.message;
  } finally {
    btn.classList.remove('loading');
  }
}
|
||
</script>
|
||
</body>
|
||
</html>"""
|
||
|
||
|
||
if __name__ == "__main__":
    # NOTE(review): host "0.0.0.0" binds every network interface, so the API
    # is reachable from other machines on the network — confirm this is
    # intended.
    uvicorn.run(app, port=4410, host="0.0.0.0")
|