auto-save 2026-05-14 12:42 (~9)

This commit is contained in:
2026-05-14 12:43:03 +08:00
parent 3733151ae4
commit 2d1a89f03e
9 changed files with 132 additions and 67 deletions

View File

@@ -20,6 +20,7 @@ MINIMAX_API_KEY=
MINIMAX_TTS_BASE_URL=https://api.minimax.io
MINIMAX_TTS_MODEL=speech-2.8-turbo
MINIMAX_TTS_VOICE_ID=English_expressive_narrator
MINIMAX_TTS_VOICE_POOL=English_magnetic_voiced_man,English_Upbeat_Woman,English_MaturePartner
# Poe 视频 API,优先用于 Seedance / Kling / Veo
POE_API_BASE_URL=https://api.poe.com/v1

View File

@@ -1,6 +1,6 @@
# SKG TK 二创 API
FastAPI 后端,跑 yt-dlp + ffmpeg + ASR/翻译/英文 SKG 文案改写 + MiniMax 英文配音管线。
FastAPI 后端,跑 yt-dlp + ffmpeg + ASR/翻译/英文 SKG 产品介绍文案 + MiniMax 英文配音管线。
## 启动
@@ -20,7 +20,7 @@ uvicorn main:app --host 127.0.0.1 --port 4291
- `GET /health` — 健康检查 + 配置状态
- `POST /jobs` `{url}` — 创建 job,后台下载源视频;视频就绪后可手动解析或提取音频
- `GET /jobs/{id}` — 当前状态 + 产物;若原始音轨已拆出,会返回 `source_audio_url`
- `POST /jobs/{id}/transcribe` — 触发音频提取 + ASR + 翻译 + SKG 英文文案改写;配置 MiniMax 后生成英文配音。前端 Audio 节点提供“提取音频 / 重新提取音频”按钮,可与抽帧并行,不自动触发
- `POST /jobs/{id}/transcribe` — 触发音频提取 + ASR + 翻译 + SKG 英文产品介绍文案;文案长度按原音频时长估算,配置 MiniMax 后从英文随机音色池生成配音。前端 Audio 节点提供“提取音频 / 重新提取音频”按钮,可与抽帧并行,不自动触发
- `GET /jobs/{id}/video.mp4` — 原视频
- `GET /jobs/{id}/audio.wav` — 拆轨后的原始音频,供前端底部音频条生成波形
- `GET /jobs/{id}/audio-script.mp3` — 英文改写文案的 MiniMax 配音
@@ -35,4 +35,4 @@ uvicorn main:app --host 127.0.0.1 --port 4291
- `ffmpeg` 系统二进制(拆轨 / 抽帧)
- `yt-dlp` 系统二进制(也可走 Python 包)
- OpenAI 兼容 LLM 网关(ASR / 翻译 / 文案改写);如果 `/audio/transcriptions` 不可用,会用 `ASR_FALLBACK_MODEL` 走 Gemini 多模态音频识别
- MiniMax T2A HTTP:英文改写文案配音,使用 `MINIMAX_API_KEY`;默认音色 `English_expressive_narrator`
- MiniMax T2A HTTP:英文产品介绍文案配音,使用 `MINIMAX_API_KEY`;默认随机音色 `English_magnetic_voiced_man,English_Upbeat_Woman,English_MaturePartner`

View File

@@ -4,6 +4,7 @@ import asyncio
import base64
import json
import os
import random
import shutil
import subprocess
import threading
@@ -51,6 +52,16 @@ MINIMAX_TTS_VOICE_ID = os.getenv(
"MINIMAX_TTS_VOICE_ID",
"English_expressive_narrator",
).strip() or "English_expressive_narrator"
# Voice ids used for the pool when MINIMAX_TTS_VOICE_POOL is absent from the environment.
DEFAULT_MINIMAX_TTS_VOICE_POOL = [
    "English_magnetic_voiced_man",
    "English_Upbeat_Woman",
    "English_MaturePartner",
]
# Comma-separated override from the environment. Entries are trimmed and blank
# entries are dropped, so an all-blank value yields an empty pool.
MINIMAX_TTS_VOICE_POOL = list(
    filter(
        None,
        map(
            str.strip,
            os.environ.get(
                "MINIMAX_TTS_VOICE_POOL",
                ",".join(DEFAULT_MINIMAX_TTS_VOICE_POOL),
            ).split(","),
        ),
    )
)
POE_API_BASE_URL = os.getenv("POE_API_BASE_URL", "https://api.poe.com/v1").strip() or "https://api.poe.com/v1"
POE_API_KEY = os.getenv("POE_API_KEY", "").strip()
@@ -1522,31 +1533,60 @@ def _transcript_join(segments: list[TranscriptSegment], field: Literal["en", "zh
return "\n".join(lines)
def _fallback_audio_script(segments: list[TranscriptSegment]) -> str:
joined = " ".join((s.en or s.zh).strip() for s in segments if (s.en or s.zh).strip())
if not joined:
return "Ease into the moment with SKG. Gentle warmth and rhythmic massage help everyday tension feel lighter, cleaner, and easier to leave behind."
def _voiceover_target_words(target_seconds: float) -> tuple[int, int]:
seconds = max(4.0, min(float(target_seconds or 0) or 12.0, 45.0))
center = int(round(seconds * 2.35))
return max(10, int(center * 0.86)), min(110, max(14, int(center * 1.12)))
def _segment_duration(segments: list[TranscriptSegment]) -> float:
if not segments:
return 0.0
start = min((s.start for s in segments), default=0.0)
end = max((s.end for s in segments), default=0.0)
return max(0.0, end - start)
# Return a canned English SKG voice-over script chosen by duration bucket.
# `segments` only contributes its time span; `target_seconds` is the requested length.
def _fallback_audio_script(segments: list[TranscriptSegment], target_seconds: float = 12.0) -> str:
# Effective length: the largest of the requested target, the transcript span, and a 4-second floor.
seconds = max(target_seconds, _segment_duration(segments), 4.0)
# Buckets: up to 7 s, up to 13 s, up to 22 s, and everything longer.
if seconds <= 7:
return "Meet SKG: warm massage, easy comfort, and a tiny reset for busy bodies."
if seconds <= 13:
return (
"Meet SKG, your shortcut to a calmer body break. A little warmth, a steady massage rhythm, "
"and suddenly your day feels less tight and more yours."
)
if seconds <= 22:
return (
"This is SKG: smart massage for the moments your body asks for a pause. Warmth, rhythm, "
"and a clean wearable feel turn neck, back, or everyday tension into a softer reset."
)
return (
# NOTE(review): the next two literals read like the pre-change template kept by the
# diff rendering. As written they are concatenated in front of the long-form script
# below. Confirm they are absent from the real file.
"Let SKG turn a short break into real relief. With soothing warmth and steady massage rhythm, "
"everyday tension feels lighter, calmer, and easier to leave behind."
"Say hello to SKG, the small reset button your day keeps asking for. From neck and shoulder breaks "
"to back, eye, knee, or foot comfort, SKG brings warm, rhythmic massage into everyday routines, "
"so winding down feels simple, smart, and a little more fun."
)
def _rewrite_audio_script_sync(segments: list[TranscriptSegment]) -> tuple[str, str]:
fallback = _fallback_audio_script(segments)
def _rewrite_audio_script_sync(segments: list[TranscriptSegment], target_seconds: float = 12.0) -> tuple[str, str]:
fallback = _fallback_audio_script(segments, target_seconds)
if not LLM_API_KEY:
return fallback, "LLM_API_KEY 未配置,使用本地 SKG 模板"
source_text = _transcript_join(segments, "en")
source_zh = _transcript_join(segments, "zh")
min_words, max_words = _voiceover_target_words(target_seconds)
prompt = (
"You are an English short-video voice-over writer for SKG wellness massagers. "
"Use the source transcript only for structure, pacing, and emotional hook, then rewrite it into a clean English VO for SKG.\n"
"Write a fresh product-introduction VO for SKG. Use the source transcript only as timing and pacing reference; "
"do not summarize it unless it helps the rhythm.\n"
"Rules:\n"
"1. Output 28-55 English words, suitable for an 8-18 second TTS voice-over.\n"
"2. Make it natural, premium, concise, and ready to read aloud.\n"
f"1. Target audio length is about {target_seconds:.1f} seconds. Output {min_words}-{max_words} English words.\n"
"2. Make it natural, warm, premium, and a little playful. It should sound like a real creator, not a stiff ad.\n"
"3. Do not claim medical treatment, cure, pain elimination, or clinical effects.\n"
"4. Do not copy the original brand, creator, price, platform language, or exact claims.\n"
"5. If the source transcript is too thin, write a general SKG relaxation VO.\n"
"5. Introduce SKG products directly: smart massage, warmth, rhythm, daily neck/back/eye/knee/foot relaxation.\n"
"6. Keep it easy for TTS: short sentences, spoken phrasing, no hashtags, no stage directions, no quotation marks.\n"
"7. If the source transcript is thin, ignore it and write a general SKG product intro.\n"
'Return strict JSON only: {"rewritten_text":"..."}.\n\n'
f"SKG product context: {AUDIO_PRODUCT_BRIEF}\n\n"
f"English transcript:\n{source_text or 'None'}\n\n"
@@ -1560,7 +1600,7 @@ def _rewrite_audio_script_sync(segments: list[TranscriptSegment]) -> tuple[str,
{"role": "user", "content": prompt},
],
response_format={"type": "json_object"},
temperature=0.45,
temperature=0.72,
max_tokens=600,
)
raw = (resp.choices[0].message.content or "").strip()
@@ -1581,7 +1621,27 @@ def _minimax_tts_url() -> str:
return f"{MINIMAX_TTS_BASE_URL}/v1/t2a_v2"
def _minimax_tts_sync(job_id: str, text: str) -> str:
def _choose_minimax_voice_id() -> str:
    """Pick the MiniMax voice id for one TTS request.

    A random entry of ``MINIMAX_TTS_VOICE_POOL`` is returned when the pool is
    non-empty; otherwise the single configured ``MINIMAX_TTS_VOICE_ID`` is used.
    """
    pool = MINIMAX_TTS_VOICE_POOL
    return random.choice(pool) if pool else MINIMAX_TTS_VOICE_ID
def _voice_speed_for(voice_id: str, target_seconds: float, text: str) -> float:
words = len([w for w in text.replace("\n", " ").split(" ") if w.strip()])
estimated_seconds = words / 2.35 if words else target_seconds
if target_seconds > 0 and estimated_seconds > target_seconds * 1.12:
return 1.06
if target_seconds > 0 and estimated_seconds < target_seconds * 0.82:
return 0.94
if voice_id == "English_MaturePartner":
return 0.96
if voice_id == "English_Upbeat_Woman":
return 1.02
return 0.99
def _minimax_tts_sync(job_id: str, text: str, voice_id: str, target_seconds: float = 12.0) -> str:
if not MINIMAX_API_KEY:
raise RuntimeError("MINIMAX_API_KEY 未配置,未生成配音")
if not text.strip():
@@ -1593,8 +1653,8 @@ def _minimax_tts_sync(job_id: str, text: str) -> str:
"language_boost": "English",
"output_format": "hex",
"voice_setting": {
"voice_id": MINIMAX_TTS_VOICE_ID,
"speed": 1,
"voice_id": voice_id,
"speed": _voice_speed_for(voice_id, target_seconds, text),
"vol": 1,
"pitch": 0,
},
@@ -1628,14 +1688,16 @@ def _minimax_tts_sync(job_id: str, text: str) -> str:
return f"/jobs/{job_id}/audio-script.mp3"
def _build_audio_script_sync(job_id: str, segments: list[TranscriptSegment]) -> AudioScript:
def _build_audio_script_sync(job_id: str, segments: list[TranscriptSegment], target_seconds: float = 12.0) -> AudioScript:
source_text = _transcript_join(segments, "en")
source_zh = _transcript_join(segments, "zh")
rewritten, rewrite_error = _rewrite_audio_script_sync(segments)
duration = max(float(target_seconds or 0), _segment_duration(segments), 4.0)
rewritten, rewrite_error = _rewrite_audio_script_sync(segments, duration)
selected_voice_id = _choose_minimax_voice_id()
voice_url = ""
voice_error = ""
try:
voice_url = _minimax_tts_sync(job_id, rewritten)
voice_url = _minimax_tts_sync(job_id, rewritten, selected_voice_id, duration)
except Exception as e:
voice_error = str(e)
# 改写失败时已有本地 SKG 模板兜底,不把它标成用户可见错误;配音失败才需要提示。
@@ -1649,7 +1711,7 @@ def _build_audio_script_sync(job_id: str, segments: list[TranscriptSegment]) ->
rewrite_model=AUDIO_REWRITE_MODEL,
voice_provider="minimax",
voice_model=MINIMAX_TTS_MODEL,
voice_id=MINIMAX_TTS_VOICE_ID,
voice_id=selected_voice_id,
voice_url=voice_url,
error=errors,
created_at=time.time(),
@@ -1678,6 +1740,7 @@ def pipeline_transcribe(job_id: str, manage_job_status: bool = True) -> None:
if not wav.exists():
raise RuntimeError("音频提取完成但找不到 audio.wav")
update(job, source_audio_url=f"/jobs/{job_id}/audio.wav")
target_duration = max(media_duration(wav), float(job.duration or 0), 4.0)
if not LLM_API_KEY:
# 无 key 模式mock 数据
@@ -1701,13 +1764,13 @@ def pipeline_transcribe(job_id: str, manage_job_status: bool = True) -> None:
rewrite_model=AUDIO_REWRITE_MODEL,
voice_provider="minimax",
voice_model=MINIMAX_TTS_MODEL,
voice_id=MINIMAX_TTS_VOICE_ID,
voice_id="random:" + ",".join(MINIMAX_TTS_VOICE_POOL or [MINIMAX_TTS_VOICE_ID]),
),
}
if manage_job_status:
update_kwargs.update(message="ASR mock 完成,生成 SKG 改写文案", progress=92)
update_kwargs.update(message="ASR mock 完成,生成 SKG 英文产品口播", progress=92)
update(job, **update_kwargs)
audio_script = _build_audio_script_sync(job_id, mock)
audio_script = _build_audio_script_sync(job_id, mock, target_duration)
if manage_job_status:
update(job, transcript=mock, status="transcribed", progress=100,
audio_script=audio_script,
@@ -1728,9 +1791,9 @@ def pipeline_transcribe(job_id: str, manage_job_status: bool = True) -> None:
if seg.en.strip()
]
else:
raise
segments = [{"start": 0.0, "end": target_duration, "text": "Source audio timing reference."}]
if not segments:
raise RuntimeError("ASR 返回 0 段(可能无人声 / 格式问题)")
segments = [{"start": 0.0, "end": target_duration, "text": "Source audio timing reference."}]
# 先把英文段落落到 job 上(让 UI 提前看到,翻译再补 zh
en_only = [
@@ -1767,13 +1830,13 @@ def pipeline_transcribe(job_id: str, manage_job_status: bool = True) -> None:
rewrite_model=AUDIO_REWRITE_MODEL,
voice_provider="minimax",
voice_model=MINIMAX_TTS_MODEL,
voice_id=MINIMAX_TTS_VOICE_ID,
voice_id="random:" + ",".join(MINIMAX_TTS_VOICE_POOL or [MINIMAX_TTS_VOICE_ID]),
),
}
if manage_job_status:
update_kwargs.update(message="翻译完成,生成 SKG 改写文案与 MiniMax 配音…", progress=94)
update_kwargs.update(message="翻译完成,生成 SKG 英文产品口播与 MiniMax 配音…", progress=94)
update(job, **update_kwargs)
audio_script = _build_audio_script_sync(job_id, full)
audio_script = _build_audio_script_sync(job_id, full, target_duration)
if manage_job_status:
update(job, transcript=full, status="transcribed", progress=100,
audio_script=audio_script,
@@ -2017,6 +2080,7 @@ def health() -> dict:
"audio_rewrite": AUDIO_REWRITE_MODEL,
"minimax_tts": MINIMAX_TTS_MODEL,
"minimax_voice": MINIMAX_TTS_VOICE_ID,
"minimax_voice_pool": MINIMAX_TTS_VOICE_POOL or [MINIMAX_TTS_VOICE_ID],
"minimax_configured": bool(MINIMAX_API_KEY),
"video": VIDEO_MODEL,
"video_aliases": VIDEO_MODEL_ALIASES,
@@ -2216,7 +2280,7 @@ async def trigger_transcribe(job_id: str, bg: BackgroundTasks) -> Job:
rewrite_model=AUDIO_REWRITE_MODEL,
voice_provider="minimax",
voice_model=MINIMAX_TTS_MODEL,
voice_id=MINIMAX_TTS_VOICE_ID,
voice_id="random:" + ",".join(MINIMAX_TTS_VOICE_POOL or [MINIMAX_TTS_VOICE_ID]),
)
if manage_job_status:
update(job, status="transcribing", progress=max(45, min(job.progress, 70)), error="", message="准备提取音频…", audio_script=audio_payload)