diff --git a/app.py b/app.py
new file mode 100644
index 0000000..2d9152b
--- /dev/null
+++ b/app.py
@@ -0,0 +1,779 @@
+"""
+VibeVoice demo platform — Liquid Glass style.
+FastAPI backend + plain HTML frontend.
+"""
+
+import copy
+import os
+import sys
+import json
+import tempfile
+import threading
+import time
+import traceback
+from pathlib import Path
+
+import numpy as np
+import soundfile as sf
+import torch
+import uvicorn
+from fastapi import FastAPI, UploadFile, File, Form
+from fastapi.responses import FileResponse, HTMLResponse, JSONResponse
+from fastapi.staticfiles import StaticFiles
+from starlette.background import BackgroundTask
+
+SOURCE_DIR = Path(__file__).parent / "source"
+STATIC_DIR = Path(__file__).parent / "static"
+# The `vibevoice` package is imported from SOURCE_DIR, so it must be on sys.path.
+sys.path.insert(0, str(SOURCE_DIR))
+
+app = FastAPI()
+
+# ========== Global state ==========
+# Lazily filled caches; once loaded each holds the keys "model" and "processor".
+asr_model_cache = {}
+tts_model_cache = {}
+
+DEVICE = "mps" if torch.backends.mps.is_available() else "cpu"
+DTYPE = torch.float32
+
+# Sample rate (Hz) used when writing the synthesized WAV file.
+TTS_SAMPLE_RATE = 24000
+
+
+def _remove_file(path):
+    """Delete *path*; a file that is already gone is not an error."""
+    try:
+        os.unlink(path)
+    except FileNotFoundError:
+        pass
+
+
+def load_asr():
+    """Return the ASR model/processor cache, loading both on first use.
+
+    Returns:
+        dict: ``asr_model_cache`` with the keys ``"model"`` and ``"processor"``.
+    """
+    if asr_model_cache:
+        return asr_model_cache
+
+    # Imported lazily: the package only resolves after the sys.path tweak above.
+    from vibevoice.modular.modeling_vibevoice_asr import VibeVoiceASRForConditionalGeneration
+    from vibevoice.processor.vibevoice_asr_processor import VibeVoiceASRProcessor
+
+    print(f"Loading ASR model to {DEVICE}...")
+    processor = VibeVoiceASRProcessor.from_pretrained("microsoft/VibeVoice-ASR")
+    model = VibeVoiceASRForConditionalGeneration.from_pretrained(
+        "microsoft/VibeVoice-ASR",
+        torch_dtype=DTYPE,
+        attn_implementation="sdpa",
+        trust_remote_code=True,
+    )
+    model = model.to(DEVICE)
+    model.eval()
+    # Populate the cache only after everything loaded, so a failed load is retried.
+    asr_model_cache["model"] = model
+    asr_model_cache["processor"] = processor
+    print("ASR model loaded")
+    return asr_model_cache
+
+
+def load_tts():
+    """Return the TTS model/processor cache, loading both on first use.
+
+    Returns:
+        dict: ``tts_model_cache`` with the keys ``"model"`` and ``"processor"``.
+    """
+    if tts_model_cache:
+        return tts_model_cache
+
+    # Imported lazily: the package only resolves after the sys.path tweak above.
+    from vibevoice.modular.modeling_vibevoice_streaming_inference import (
+        VibeVoiceStreamingForConditionalGenerationInference,
+    )
+    from vibevoice.processor.vibevoice_streaming_processor import VibeVoiceStreamingProcessor
+
+    print(f"Loading TTS model to {DEVICE}...")
+    processor = VibeVoiceStreamingProcessor.from_pretrained("microsoft/VibeVoice-Realtime-0.5B")
+    model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
+        "microsoft/VibeVoice-Realtime-0.5B",
+        torch_dtype=DTYPE,
+        attn_implementation="sdpa",
+    )
+    model = model.to(DEVICE)
+    model.eval()
+    # Populate the cache only after everything loaded, so a failed load is retried.
+    tts_model_cache["model"] = model
+    tts_model_cache["processor"] = processor
+    print("TTS model loaded")
+    return tts_model_cache
+
+
+@app.post("/api/asr")
+async def api_asr(audio: UploadFile = File(...), hotwords: str = Form("")):
+    """Transcribe an uploaded audio file.
+
+    Args:
+        audio: Uploaded audio; its bytes are spooled to a temporary ``.wav``
+            path that is handed to the ASR processor.
+        hotwords: Optional context string passed to the processor as
+            ``context_info``; blank input is sent as ``None``.
+
+    Returns:
+        JSON ``{"segments", "raw", "time"}`` on success, or
+        ``{"error": ...}`` with status 500 on any failure.
+
+    NOTE: generation is a synchronous call inside this coroutine, so the
+    event loop is blocked while a request is being processed.
+    """
+    tmp_path = None
+    try:
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
+            # Record the path first so `finally` cleans up even if a read/write fails.
+            tmp_path = tmp.name
+            tmp.write(await audio.read())
+
+        asr = load_asr()
+        model = asr["model"]
+        processor = asr["processor"]
+
+        context_info = hotwords.strip() or None
+        inputs = processor(
+            audio=tmp_path,
+            sampling_rate=None,
+            return_tensors="pt",
+            add_generation_prompt=True,
+            context_info=context_info,
+        )
+        inputs = {k: v.to(DEVICE) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
+
+        # perf_counter is monotonic, so the measured duration cannot go negative.
+        start_time = time.perf_counter()
+        with torch.no_grad():
+            output_ids = model.generate(
+                **inputs,
+                max_new_tokens=32768,
+                do_sample=False,
+                pad_token_id=processor.pad_id,
+                eos_token_id=processor.tokenizer.eos_token_id,
+            )
+        elapsed = time.perf_counter() - start_time
+
+        # Drop the prompt tokens; only the newly generated ids are decoded.
+        input_length = inputs["input_ids"].shape[1]
+        generated_ids = output_ids[0, input_length:]
+        text = processor.decode(generated_ids, skip_special_tokens=True)
+
+        # Best effort: if structured post-processing fails, return the raw text
+        # as a single segment instead of failing the request.
+        try:
+            segments = processor.post_process_transcription(text)
+        except Exception:
+            segments = [{"text": text}]
+
+        return JSONResponse({"segments": segments, "raw": text, "time": round(elapsed, 1)})
+    except Exception as e:
+        traceback.print_exc()
+        return JSONResponse({"error": str(e)}, status_code=500)
+    finally:
+        if tmp_path is not None:
+            _remove_file(tmp_path)
+
+
+@app.post("/api/tts")
+async def api_tts(text: str = Form(...)):
+    """Synthesize speech for *text* and return it as a WAV download.
+
+    Args:
+        text: Text to speak; surrounding whitespace is stripped.
+
+    Returns:
+        A ``FileResponse`` (``audio/wav``) on success; JSON ``{"error": ...}``
+        with status 400 for blank text or 500 for any failure.
+
+    NOTE: this coroutine blocks the event loop while audio is generated.
+    """
+    text = text.strip()
+    if not text:
+        return JSONResponse({"error": "empty text"}, status_code=400)
+
+    try:
+        tts = load_tts()
+        model = tts["model"]
+        processor = tts["processor"]
+
+        voices_dir = SOURCE_DIR / "demo" / "voices" / "streaming_model"
+        # Sorted so the chosen preset does not depend on directory listing order.
+        voice_files = sorted(voices_dir.rglob("*.pt")) if voices_dir.exists() else []
+        if not voice_files:
+            return JSONResponse({"error": "no voice presets found"}, status_code=500)
+
+        # NOTE(security): weights_only=False unpickles arbitrary objects. Only
+        # preset files found under SOURCE_DIR are loaded here; never point this
+        # at user-supplied files.
+        prefilled = torch.load(voice_files[0], map_location=DEVICE, weights_only=False)
+        processed = processor.process_input_with_cached_prompt(
+            text=text,
+            cached_prompt=prefilled,
+            padding=True,
+            return_tensors="pt",
+            return_attention_mask=True,
+        )
+        inputs = {k: v.to(DEVICE) if hasattr(v, "to") else v for k, v in processed.items()}
+
+        from vibevoice.modular.streamer import AudioStreamer
+
+        audio_streamer = AudioStreamer(batch_size=1, stop_signal=None, timeout=None)
+        errors = []
+
+        # Rebuild the scheduler from its own config with the overrides below,
+        # then use 5 inference steps.
+        model.model.noise_scheduler = model.model.noise_scheduler.from_config(
+            model.model.noise_scheduler.config,
+            algorithm_type="sde-dpmsolver++",
+            beta_schedule="squaredcos_cap_v2",
+        )
+        model.set_ddpm_inference_steps(num_steps=5)
+
+        stop_event = threading.Event()
+
+        def run_gen():
+            """Run generation; on failure record the error and end the stream."""
+            try:
+                model.generate(
+                    **inputs,
+                    max_new_tokens=None,
+                    cfg_scale=1.5,
+                    tokenizer=processor.tokenizer,
+                    generation_config={"do_sample": False, "temperature": 1.0, "top_p": 1.0},
+                    audio_streamer=audio_streamer,
+                    stop_check_fn=stop_event.is_set,
+                    verbose=False,
+                    refresh_negative=True,
+                    # Deep copy so generation cannot mutate the loaded preset.
+                    all_prefilled_outputs=copy.deepcopy(prefilled),
+                )
+            except Exception as e:
+                errors.append(e)
+                # Unblock the consumer loop below, which waits on the stream.
+                audio_streamer.end()
+
+        thread = threading.Thread(target=run_gen, daemon=True)
+        thread.start()
+
+        # Drain the stream, flattening every chunk to a 1-D float32 array.
+        audio_chunks = []
+        for chunk in audio_streamer.get_stream(0):
+            if torch.is_tensor(chunk):
+                chunk = chunk.detach().cpu().to(torch.float32).numpy()
+            else:
+                chunk = np.asarray(chunk, dtype=np.float32)
+            if chunk.ndim > 1:
+                chunk = chunk.reshape(-1)
+            audio_chunks.append(chunk)
+
+        thread.join()
+        if errors:
+            return JSONResponse({"error": str(errors[0])}, status_code=500)
+        if not audio_chunks:
+            # np.concatenate([]) would raise an opaque ValueError; be explicit.
+            return JSONResponse({"error": "no audio generated"}, status_code=500)
+
+        audio = np.clip(np.concatenate(audio_chunks), -1.0, 1.0)
+
+        # Reserve a path in the default temp dir and close the handle at once;
+        # soundfile reopens the file by name.
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
+            wav_path = tmp.name
+        try:
+            sf.write(wav_path, audio, TTS_SAMPLE_RATE)
+        except Exception:
+            _remove_file(wav_path)
+            raise
+
+        # The background task runs after the response body has been sent, so
+        # the temporary file no longer accumulates on disk per request.
+        return FileResponse(
+            wav_path,
+            media_type="audio/wav",
+            filename="vibevoice_tts.wav",
+            background=BackgroundTask(_remove_file, wav_path),
+        )
+
+    except Exception as e:
+        traceback.print_exc()
+        return JSONResponse({"error": str(e)}, status_code=500)
+
+
+@app.get("/")
+def index():
+    """Serve the embedded single-page frontend."""
+    return HTMLResponse(HTML_PAGE)
+
+
+HTML_PAGE = r""" + + + + + VibeVoice + + + + + + + + + + + + + + + + + + + + + + +
+ +
+

VibeVoice

+

Microsoft 开源语音 AI — 语音识别 & 语音合成

+
+ Microsoft Research + MIT 开源 + MPS 本地加速 +
+
+ + +
+
+
+
+
+
+ +
+
+
+
+
+
+
+ +
+
+
+ + +
+
+
+
+
+
+
+
+
+ +
+
🎵
+
点击或拖拽上传音频文件
WAV / MP3 / FLAC / M4A
+
+
+ + +
+
+ + +
+
+
+
+
+
开始识别
+
+
+
+
+
+
+
+
+
+
+ +
+ +
+
+
+
+
+ + +
+
+
+
+
+
+
+
+
+ + +
+
+
+
+
+
生成语音
+
+

示例:今天我们来讲民法典中关于不当得利的规定。根据民法典第九百八十五条,得利人没有法律根据取得不当利益的,受损失的人可以请求得利人返还取得的利益。

+
+
+
+
+
+
+
+
+
+ +
等待生成...
+ +
+
+
+
+
+ + +
+ + + +""" + + +if __name__ == "__main__": + uvicorn.run(app, host="0.0.0.0", port=4410) diff --git a/index.html b/index.html index a49defd..c3946b0 100644 --- a/index.html +++ b/index.html @@ -14,33 +14,261 @@ .container { max-width: 1200px; margin: 0 auto; } h1 { font-size: 2.5rem; font-weight: 700; - background: linear-gradient(135deg, #60a5fa, #a78bfa); + background: linear-gradient(135deg, #f97316, #ef4444); -webkit-background-clip: text; -webkit-text-fill-color: transparent; margin-bottom: 0.5rem; } .subtitle { color: #888; font-size: 1.1rem; margin-bottom: 2rem; } + .badge-row { display: flex; gap: 0.5rem; margin-bottom: 2rem; flex-wrap: wrap; } + .badge { + display: inline-block; padding: 0.3rem 0.8rem; border-radius: 20px; + font-size: 0.8rem; font-weight: 600; + } + .badge-ms { background: #1a3a5c; color: #60a5fa; } + .badge-asr { background: #3c2e1a; color: #fbbf24; } + .badge-tts { background: #1a3c2a; color: #4ade80; } + .badge-mit { background: #2e1a3c; color: #c4b5fd; } + .card { background: #141414; border: 1px solid #222; border-radius: 12px; padding: 2rem; margin-bottom: 1.5rem; } - .card h2 { color: #60a5fa; margin-bottom: 1rem; font-size: 1.3rem; } - .card p { line-height: 1.8; color: #aaa; } + .card h2 { color: #f97316; margin-bottom: 1rem; font-size: 1.3rem; } + .card p, .card li { line-height: 1.8; color: #aaa; } + .card ul { list-style: none; padding: 0; } + .card ul li::before { content: ""; margin-right: 0.5rem; } + + .grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(340px, 1fr)); gap: 1.5rem; } + + table { width: 100%; border-collapse: collapse; margin-top: 0.5rem; } + th, td { text-align: left; padding: 0.7rem 1rem; border-bottom: 1px solid #222; } + th { color: #f97316; font-weight: 600; font-size: 0.9rem; } + td { color: #aaa; font-size: 0.9rem; } + + .highlight { color: #4ade80; font-weight: 600; } + .warn { color: #f87171; font-weight: 600; } + + .model-card { + background: #1a1a2e; border: 1px solid #2a2a4e; 
border-radius: 10px; + padding: 1.5rem; text-align: center; + } + .model-card .icon { font-size: 2.5rem; margin-bottom: 0.5rem; } + .model-card .name { font-size: 1.1rem; color: #f97316; font-weight: 700; margin-bottom: 0.3rem; } + .model-card .size { font-size: 0.8rem; color: #666; margin-bottom: 0.8rem; } + .model-card .desc { font-size: 0.85rem; color: #aaa; line-height: 1.6; text-align: left; } + + .three-col { display: grid; grid-template-columns: repeat(auto-fit, minmax(280px, 1fr)); gap: 1.5rem; margin-bottom: 1.5rem; } + + .use-case { + background: #141414; border: 1px solid #2a4e2a; border-radius: 12px; + padding: 1.5rem; + } + .use-case h3 { color: #4ade80; font-size: 1rem; margin-bottom: 0.5rem; } + .use-case p { color: #888; font-size: 0.9rem; line-height: 1.6; } + .use-case .tag { display: inline-block; background: #1a3c2a; color: #4ade80; padding: 0.2rem 0.5rem; border-radius: 4px; font-size: 0.75rem; margin-top: 0.5rem; } + + .code-block { + background: #1a1a2e; border: 1px solid #2a2a4e; border-radius: 8px; + padding: 1.2rem; margin-top: 1rem; overflow-x: auto; + font-family: "SF Mono", "Fira Code", monospace; font-size: 0.85rem; + color: #c4b5fd; line-height: 1.6; + } + .code-comment { color: #555; } + + .links { display: flex; gap: 1rem; margin-top: 1.5rem; flex-wrap: wrap; } + .links a { + display: inline-flex; align-items: center; gap: 0.4rem; + padding: 0.6rem 1.2rem; border-radius: 8px; text-decoration: none; + font-size: 0.9rem; font-weight: 600; transition: opacity 0.2s; + } + .links a:hover { opacity: 0.8; } + .link-gh { background: #1a1a2e; color: #c4b5fd; border: 1px solid #2a2a4e; } + .link-hf { background: #1a2e1a; color: #4ade80; border: 1px solid #2a4e2a; } + .link-doc { background: #2e2a1a; color: #fbbf24; border: 1px solid #4e3a2a; } + + .verdict { + background: linear-gradient(135deg, #1a0a00, #141414); + border: 1px solid #f9731633; border-radius: 12px; + padding: 2rem; margin-top: 1.5rem; text-align: center; + } + .verdict h2 { 
color: #f97316; margin-bottom: 0.5rem; } + .verdict p { color: #888; max-width: 600px; margin: 0 auto; } + + footer { text-align: center; color: #333; margin-top: 3rem; font-size: 0.8rem; }
-

VibeVoice 语音AI研究

-

微软开源语音全家桶,ASR+TTS+实时语音,可用于法考字幕提取

+

VibeVoice — 语音 AI 全家桶

+

微软开源 | ASR + TTS + 实时语音 | MIT 许可

+
+ Microsoft Research + ASR 语音识别 + TTS 语音合成 + MIT 开源 +
+ + +
+
+
🎙
+
VibeVoice-ASR
+
语音识别模型
+
+
    +
  • 单次处理 60 分钟音频
  • +
  • 输出:说话人 + 时间戳 + 内容
  • +
  • 支持 50+ 语言
  • +
  • 支持自定义热词
  • +
+
+
+
+
🔊
+
VibeVoice-1.5B
+
15 亿参数 · TTS
+
+
    +
  • 高质量文字转语音
  • +
  • 自然语调和韵律
  • +
  • 多语言支持
  • +
  • 7.5Hz 超低帧率 token
  • +
+
+
+
+
+
VibeVoice-Realtime-0.5B
+
5 亿参数 · 实时 TTS
+
+
    +
  • 流式文字输入
  • +
  • 首音延迟 ~300ms
  • +
  • 支持长文本朗读
  • +
  • 适合实时对话场景
  • +
+
+
+
+ +
+ +
+

核心技术

+ + + + + + + +
技术说明
连续语音 Tokenizer声学 + 语义双 Tokenizer,7.5Hz 超低帧率
长音频处理单次 60 分钟,无需分段
说话人分离自动识别 Who + When + What
流式推理边输入文字边生成语音,300ms 首音
热词支持自定义专业术语提升识别率
+
+ + +
+

vs 同类方案

+ + + + + + + + + +
维度WhisperElevenLabsVibeVoice
ASR有(更强)
TTS
实时流式
说话人识别内置
长音频需分段N/A60分钟单次
开源是(MIT)
费用免费按量付费免费
+
+
+ + +

我们的应用场景

+
+
+

法考视频字幕提取

+

9,553 个法考视频需要提取字幕。VibeVoice-ASR 单次处理 60 分钟 + 自动时间戳 + 说话人识别,配合法律热词("不当得利""善意取得"等)可显著提升识别率。

+ 高优先级 +
+
+

法海法考 App 语音朗读

+

用 Realtime-0.5B 为题目和解析生成语音朗读,支持边看题边听讲解,提升学习体验。

+ 中优先级 +
+
+

百陶会多语言介绍

+

用 VibeVoice-1.5B 为产品页面生成中英文语音介绍,50+ 语言支持覆盖海外客户。

+ 低优先级 +
+
+ +
-

概述

-

待补充研究内容...

+

ASR 使用示例

+
+# 安装 +pip install transformers torch + +# ASR:语音转文字(带时间戳和说话人) +from transformers import pipeline + +asr = pipeline( + "automatic-speech-recognition", + model="microsoft/VibeVoice-ASR" +) + +result = asr("lecture_60min.wav") +# 输出:[{speaker: "A", start: 0.0, end: 3.2, text: "..."}, ...] +
-

核心发现

-

待补充...

+

TTS 使用示例

+
+# 实时 TTS:文字转语音 +from transformers import AutoModelForCausalLM, AutoTokenizer + +model = AutoModelForCausalLM.from_pretrained( + "microsoft/VibeVoice-Realtime-0.5B" +) + +# 流式生成,首音 ~300ms +for audio_chunk in model.generate_stream("今天我们来讲民法典..."): + play(audio_chunk) +
+ + +
+

硬件要求与本机适配

+ + + + + +
模型显存需求M2 Max 可运行?
VibeVoice-ASR~8GB可以(MPS 加速)
VibeVoice-1.5B~6GB可以
VibeVoice-Realtime-0.5B~2GB可以
+

+ 本机 M2 Max 64GB 完全满足所有模型运行要求 +

+
+ + +
+

评价:实用性很高

+

ASR + TTS + 实时语音三合一开源方案,MIT 许可无商用限制。ASR 的 60 分钟长音频 + 说话人识别是真正的差异化优势。本机 M2 Max 可直接运行,不需要 GPU 服务器。对法考字幕提取项目有直接价值。

+
+ + + + +
diff --git a/source b/source new file mode 160000 index 0000000..3c97649 --- /dev/null +++ b/source @@ -0,0 +1 @@ +Subproject commit 3c976491d467698f13ebe4f096206812b91270b3