auto-save 2026-05-14 12:42 (~9)

2026-05-14 12:43:03 +08:00
parent 3733151ae4
commit 2d1a89f03e
9 changed files with 132 additions and 67 deletions
--- a/api/.env.example
+++ b/api/.env.example
@@ -20,6 +20,7 @@ MINIMAX_API_KEY=
 MINIMAX_TTS_BASE_URL=https://api.minimax.io
 MINIMAX_TTS_MODEL=speech-2.8-turbo
 MINIMAX_TTS_VOICE_ID=English_expressive_narrator
+MINIMAX_TTS_VOICE_POOL=English_magnetic_voiced_man,English_Upbeat_Woman,English_MaturePartner

 # Poe 视频 API（优先用于 Seedance / Kling / Veo）
 POE_API_BASE_URL=https://api.poe.com/v1
--- a/api/README.md
+++ b/api/README.md
@@ -1,6 +1,6 @@
 # SKG TK 二创 API

-FastAPI 后端，跑 yt-dlp + ffmpeg + ASR/翻译/英文 SKG 文案改写 + MiniMax 英文配音管线。
+FastAPI 后端，跑 yt-dlp + ffmpeg + ASR/翻译/英文 SKG 产品介绍文案 + MiniMax 英文配音管线。

 ## 启动

@@ -20,7 +20,7 @@ uvicorn main:app --host 127.0.0.1 --port 4291
 - `GET  /health` — 健康检查 + 配置状态
 - `POST /jobs` `{url}` — 创建 job，后台下载源视频，视频就绪后可手动解析或提取音频
 - `GET  /jobs/{id}` — 当前状态 + 产物；若原始音轨已拆出，会返回 `source_audio_url`
- `POST /jobs/{id}/transcribe` — 触发音频提取 + ASR + 翻译 + SKG 英文文案改写；配置 MiniMax 后生成英文配音。前端 Audio 节点提供“提取音频 / 重新提取音频”按钮，可与抽帧并行，不自动触发
+- `POST /jobs/{id}/transcribe` — 触发音频提取 + ASR + 翻译 + SKG 英文产品介绍文案；文案长度按原音频时长估算，配置 MiniMax 后从英文随机音色池生成配音。前端 Audio 节点提供“提取音频 / 重新提取音频”按钮，可与抽帧并行，不自动触发
 - `GET  /jobs/{id}/video.mp4` — 原视频
 - `GET  /jobs/{id}/audio.wav` — 拆轨后的原始音频，供前端底部音频条生成波形
 - `GET  /jobs/{id}/audio-script.mp3` — 英文改写文案的 MiniMax 配音
@@ -35,4 +35,4 @@ uvicorn main:app --host 127.0.0.1 --port 4291
 - `ffmpeg` 系统二进制（拆轨 / 抽帧）
 - `yt-dlp` 系统二进制（也可走 Python 包）
 - OpenAI 兼容 LLM 网关（ASR / 翻译 / 文案改写）；如果 `/audio/transcriptions` 不可用，会用 `ASR_FALLBACK_MODEL` 走 Gemini 多模态音频识别
- MiniMax T2A HTTP（英文改写文案配音，使用 `MINIMAX_API_KEY`；默认音色 `English_expressive_narrator`）
+- MiniMax T2A HTTP（英文产品介绍文案配音，使用 `MINIMAX_API_KEY`；默认随机音色池 `English_magnetic_voiced_man,English_Upbeat_Woman,English_MaturePartner`）
--- a/api/main.py
+++ b/api/main.py
@@ -4,6 +4,7 @@ import asyncio
 import base64
 import json
 import os
+import random
 import shutil
 import subprocess
 import threading
@@ -51,6 +52,16 @@ MINIMAX_TTS_VOICE_ID = os.getenv(
    "MINIMAX_TTS_VOICE_ID",
    "English_expressive_narrator",
 ).strip() or "English_expressive_narrator"
+DEFAULT_MINIMAX_TTS_VOICE_POOL = [
+    "English_magnetic_voiced_man",
+    "English_Upbeat_Woman",
+    "English_MaturePartner",
+]
+MINIMAX_TTS_VOICE_POOL = [
+    v.strip()
+    for v in os.getenv("MINIMAX_TTS_VOICE_POOL", ",".join(DEFAULT_MINIMAX_TTS_VOICE_POOL)).split(",")
+    if v.strip()
+]

 POE_API_BASE_URL = os.getenv("POE_API_BASE_URL", "https://api.poe.com/v1").strip() or "https://api.poe.com/v1"
 POE_API_KEY = os.getenv("POE_API_KEY", "").strip()
@@ -1522,31 +1533,60 @@ def _transcript_join(segments: list[TranscriptSegment], field: Literal["en", "zh
    return "\n".join(lines)


-def _fallback_audio_script(segments: list[TranscriptSegment]) -> str:
-    joined = " ".join((s.en or s.zh).strip() for s in segments if (s.en or s.zh).strip())
-    if not joined:
-        return "Ease into the moment with SKG. Gentle warmth and rhythmic massage help everyday tension feel lighter, cleaner, and easier to leave behind."
+def _voiceover_target_words(target_seconds: float) -> tuple[int, int]:
+    seconds = max(4.0, min(float(target_seconds or 0) or 12.0, 45.0))
+    center = int(round(seconds * 2.35))
+    return max(10, int(center * 0.86)), min(110, max(14, int(center * 1.12)))
+
+
+def _segment_duration(segments: list[TranscriptSegment]) -> float:
+    if not segments:
+        return 0.0
+    start = min((s.start for s in segments), default=0.0)
+    end = max((s.end for s in segments), default=0.0)
+    return max(0.0, end - start)
+
+
+def _fallback_audio_script(segments: list[TranscriptSegment], target_seconds: float = 12.0) -> str:
+    seconds = max(target_seconds, _segment_duration(segments), 4.0)
+    if seconds <= 7:
+        return "Meet SKG: warm massage, easy comfort, and a tiny reset for busy bodies."
+    if seconds <= 13:
+        return (
+            "Meet SKG, your shortcut to a calmer body break. A little warmth, a steady massage rhythm, "
+            "and suddenly your day feels less tight and more yours."
+        )
+    if seconds <= 22:
+        return (
+            "This is SKG: smart massage for the moments your body asks for a pause. Warmth, rhythm, "
+            "and a clean wearable feel turn neck, back, or everyday tension into a softer reset."
+        )
    return (
-        "Let SKG turn a short break into real relief. With soothing warmth and steady massage rhythm, "
-        "everyday tension feels lighter, calmer, and easier to leave behind."
+        "Say hello to SKG, the small reset button your day keeps asking for. From neck and shoulder breaks "
+        "to back, eye, knee, or foot comfort, SKG brings warm, rhythmic massage into everyday routines, "
+        "so winding down feels simple, smart, and a little more fun."
    )


-def _rewrite_audio_script_sync(segments: list[TranscriptSegment]) -> tuple[str, str]:
-    fallback = _fallback_audio_script(segments)
+def _rewrite_audio_script_sync(segments: list[TranscriptSegment], target_seconds: float = 12.0) -> tuple[str, str]:
+    fallback = _fallback_audio_script(segments, target_seconds)
    if not LLM_API_KEY:
        return fallback, "LLM_API_KEY 未配置，使用本地 SKG 模板"
    source_text = _transcript_join(segments, "en")
    source_zh = _transcript_join(segments, "zh")
+    min_words, max_words = _voiceover_target_words(target_seconds)
    prompt = (
        "You are an English short-video voice-over writer for SKG wellness massagers. "
-        "Use the source transcript only for structure, pacing, and emotional hook, then rewrite it into a clean English VO for SKG.\n"
+        "Write a fresh product-introduction VO for SKG. Use the source transcript only as timing and pacing reference; "
+        "do not summarize it unless it helps the rhythm.\n"
        "Rules:\n"
-        "1. Output 28-55 English words, suitable for an 8-18 second TTS voice-over.\n"
-        "2. Make it natural, premium, concise, and ready to read aloud.\n"
+        f"1. Target audio length is about {target_seconds:.1f} seconds. Output {min_words}-{max_words} English words.\n"
+        "2. Make it natural, warm, premium, and a little playful. It should sound like a real creator, not a stiff ad.\n"
        "3. Do not claim medical treatment, cure, pain elimination, or clinical effects.\n"
        "4. Do not copy the original brand, creator, price, platform language, or exact claims.\n"
-        "5. If the source transcript is too thin, write a general SKG relaxation VO.\n"
+        "5. Introduce SKG products directly: smart massage, warmth, rhythm, daily neck/back/eye/knee/foot relaxation.\n"
+        "6. Keep it easy for TTS: short sentences, spoken phrasing, no hashtags, no stage directions, no quotation marks.\n"
+        "7. If the source transcript is thin, ignore it and write a general SKG product intro.\n"
        'Return strict JSON only: {"rewritten_text":"..."}.\n\n'
        f"SKG product context: {AUDIO_PRODUCT_BRIEF}\n\n"
        f"English transcript:\n{source_text or 'None'}\n\n"
@@ -1560,7 +1600,7 @@ def _rewrite_audio_script_sync(segments: list[TranscriptSegment]) -> tuple[str,
                {"role": "user", "content": prompt},
            ],
            response_format={"type": "json_object"},
-            temperature=0.45,
+            temperature=0.72,
            max_tokens=600,
        )
        raw = (resp.choices[0].message.content or "").strip()
@@ -1581,7 +1621,27 @@ def _minimax_tts_url() -> str:
    return f"{MINIMAX_TTS_BASE_URL}/v1/t2a_v2"


-def _minimax_tts_sync(job_id: str, text: str) -> str:
+def _choose_minimax_voice_id() -> str:
+    if MINIMAX_TTS_VOICE_POOL:
+        return random.choice(MINIMAX_TTS_VOICE_POOL)
+    return MINIMAX_TTS_VOICE_ID
+
+
+def _voice_speed_for(voice_id: str, target_seconds: float, text: str) -> float:
+    words = len([w for w in text.replace("\n", " ").split(" ") if w.strip()])
+    estimated_seconds = words / 2.35 if words else target_seconds
+    if target_seconds > 0 and estimated_seconds > target_seconds * 1.12:
+        return 1.06
+    if target_seconds > 0 and estimated_seconds < target_seconds * 0.82:
+        return 0.94
+    if voice_id == "English_MaturePartner":
+        return 0.96
+    if voice_id == "English_Upbeat_Woman":
+        return 1.02
+    return 0.99
+
+
+def _minimax_tts_sync(job_id: str, text: str, voice_id: str, target_seconds: float = 12.0) -> str:
    if not MINIMAX_API_KEY:
        raise RuntimeError("MINIMAX_API_KEY 未配置，未生成配音")
    if not text.strip():
@@ -1593,8 +1653,8 @@ def _minimax_tts_sync(job_id: str, text: str) -> str:
        "language_boost": "English",
        "output_format": "hex",
        "voice_setting": {
-            "voice_id": MINIMAX_TTS_VOICE_ID,
-            "speed": 1,
+            "voice_id": voice_id,
+            "speed": _voice_speed_for(voice_id, target_seconds, text),
            "vol": 1,
            "pitch": 0,
        },
@@ -1628,14 +1688,16 @@ def _minimax_tts_sync(job_id: str, text: str) -> str:
    return f"/jobs/{job_id}/audio-script.mp3"


-def _build_audio_script_sync(job_id: str, segments: list[TranscriptSegment]) -> AudioScript:
+def _build_audio_script_sync(job_id: str, segments: list[TranscriptSegment], target_seconds: float = 12.0) -> AudioScript:
    source_text = _transcript_join(segments, "en")
    source_zh = _transcript_join(segments, "zh")
-    rewritten, rewrite_error = _rewrite_audio_script_sync(segments)
+    duration = max(float(target_seconds or 0), _segment_duration(segments), 4.0)
+    rewritten, rewrite_error = _rewrite_audio_script_sync(segments, duration)
+    selected_voice_id = _choose_minimax_voice_id()
    voice_url = ""
    voice_error = ""
    try:
-        voice_url = _minimax_tts_sync(job_id, rewritten)
+        voice_url = _minimax_tts_sync(job_id, rewritten, selected_voice_id, duration)
    except Exception as e:
        voice_error = str(e)
    # 改写失败时已有本地 SKG 模板兜底，不把它标成用户可见错误；配音失败才需要提示。
@@ -1649,7 +1711,7 @@ def _build_audio_script_sync(job_id: str, segments: list[TranscriptSegment]) ->
        rewrite_model=AUDIO_REWRITE_MODEL,
        voice_provider="minimax",
        voice_model=MINIMAX_TTS_MODEL,
-        voice_id=MINIMAX_TTS_VOICE_ID,
+        voice_id=selected_voice_id,
        voice_url=voice_url,
        error=errors,
        created_at=time.time(),
@@ -1678,6 +1740,7 @@ def pipeline_transcribe(job_id: str, manage_job_status: bool = True) -> None:
            if not wav.exists():
                raise RuntimeError("音频提取完成但找不到 audio.wav")
        update(job, source_audio_url=f"/jobs/{job_id}/audio.wav")
+        target_duration = max(media_duration(wav), float(job.duration or 0), 4.0)

        if not LLM_API_KEY:
            # 无 key 模式：mock 数据
@@ -1701,13 +1764,13 @@ def pipeline_transcribe(job_id: str, manage_job_status: bool = True) -> None:
                    rewrite_model=AUDIO_REWRITE_MODEL,
                    voice_provider="minimax",
                    voice_model=MINIMAX_TTS_MODEL,
-                    voice_id=MINIMAX_TTS_VOICE_ID,
+                    voice_id="random:" + ",".join(MINIMAX_TTS_VOICE_POOL or [MINIMAX_TTS_VOICE_ID]),
                ),
            }
            if manage_job_status:
-                update_kwargs.update(message="ASR mock 完成，生成 SKG 改写文案…", progress=92)
+                update_kwargs.update(message="ASR mock 完成，生成 SKG 英文产品口播…", progress=92)
            update(job, **update_kwargs)
-            audio_script = _build_audio_script_sync(job_id, mock)
+            audio_script = _build_audio_script_sync(job_id, mock, target_duration)
            if manage_job_status:
                update(job, transcript=mock, status="transcribed", progress=100,
                       audio_script=audio_script,
@@ -1728,9 +1791,9 @@ def pipeline_transcribe(job_id: str, manage_job_status: bool = True) -> None:
                    if seg.en.strip()
                ]
            else:
-                raise
+                segments = [{"start": 0.0, "end": target_duration, "text": "Source audio timing reference."}]
        if not segments:
-            raise RuntimeError("ASR 返回 0 段（可能无人声 / 格式问题）")
+            segments = [{"start": 0.0, "end": target_duration, "text": "Source audio timing reference."}]

        # 先把英文段落落到 job 上（让 UI 提前看到，翻译再补 zh）
        en_only = [
@@ -1767,13 +1830,13 @@ def pipeline_transcribe(job_id: str, manage_job_status: bool = True) -> None:
                rewrite_model=AUDIO_REWRITE_MODEL,
                voice_provider="minimax",
                voice_model=MINIMAX_TTS_MODEL,
-                voice_id=MINIMAX_TTS_VOICE_ID,
+                voice_id="random:" + ",".join(MINIMAX_TTS_VOICE_POOL or [MINIMAX_TTS_VOICE_ID]),
            ),
        }
        if manage_job_status:
-            update_kwargs.update(message="翻译完成，生成 SKG 改写文案与 MiniMax 配音…", progress=94)
+            update_kwargs.update(message="翻译完成，生成 SKG 英文产品口播与 MiniMax 配音…", progress=94)
        update(job, **update_kwargs)
-        audio_script = _build_audio_script_sync(job_id, full)
+        audio_script = _build_audio_script_sync(job_id, full, target_duration)
        if manage_job_status:
            update(job, transcript=full, status="transcribed", progress=100,
                   audio_script=audio_script,
@@ -2017,6 +2080,7 @@ def health() -> dict:
            "audio_rewrite": AUDIO_REWRITE_MODEL,
            "minimax_tts": MINIMAX_TTS_MODEL,
            "minimax_voice": MINIMAX_TTS_VOICE_ID,
+            "minimax_voice_pool": MINIMAX_TTS_VOICE_POOL or [MINIMAX_TTS_VOICE_ID],
            "minimax_configured": bool(MINIMAX_API_KEY),
            "video": VIDEO_MODEL,
            "video_aliases": VIDEO_MODEL_ALIASES,
@@ -2216,7 +2280,7 @@ async def trigger_transcribe(job_id: str, bg: BackgroundTasks) -> Job:
        rewrite_model=AUDIO_REWRITE_MODEL,
        voice_provider="minimax",
        voice_model=MINIMAX_TTS_MODEL,
-        voice_id=MINIMAX_TTS_VOICE_ID,
+        voice_id="random:" + ",".join(MINIMAX_TTS_VOICE_POOL or [MINIMAX_TTS_VOICE_ID]),
    )
    if manage_job_status:
        update(job, status="transcribing", progress=max(45, min(job.progress, 70)), error="", message="准备提取音频…", audio_script=audio_payload)