auto-save 2026-05-14 12:42 (~9)
This commit is contained in:
@@ -20,6 +20,7 @@ MINIMAX_API_KEY=
|
||||
MINIMAX_TTS_BASE_URL=https://api.minimax.io
|
||||
MINIMAX_TTS_MODEL=speech-2.8-turbo
|
||||
MINIMAX_TTS_VOICE_ID=English_expressive_narrator
|
||||
MINIMAX_TTS_VOICE_POOL=English_magnetic_voiced_man,English_Upbeat_Woman,English_MaturePartner
|
||||
|
||||
# Poe 视频 API(优先用于 Seedance / Kling / Veo)
|
||||
POE_API_BASE_URL=https://api.poe.com/v1
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# SKG TK 二创 API
|
||||
|
||||
FastAPI 后端,跑 yt-dlp + ffmpeg + ASR/翻译/英文 SKG 文案改写 + MiniMax 英文配音管线。
|
||||
FastAPI 后端,跑 yt-dlp + ffmpeg + ASR/翻译/英文 SKG 产品介绍文案 + MiniMax 英文配音管线。
|
||||
|
||||
## 启动
|
||||
|
||||
@@ -20,7 +20,7 @@ uvicorn main:app --host 127.0.0.1 --port 4291
|
||||
- `GET /health` — 健康检查 + 配置状态
|
||||
- `POST /jobs` `{url}` — 创建 job,后台下载源视频,视频就绪后可手动解析或提取音频
|
||||
- `GET /jobs/{id}` — 当前状态 + 产物;若原始音轨已拆出,会返回 `source_audio_url`
|
||||
- `POST /jobs/{id}/transcribe` — 触发音频提取 + ASR + 翻译 + SKG 英文文案改写;配置 MiniMax 后生成英文配音。前端 Audio 节点提供“提取音频 / 重新提取音频”按钮,可与抽帧并行,不自动触发
|
||||
- `POST /jobs/{id}/transcribe` — 触发音频提取 + ASR + 翻译 + SKG 英文产品介绍文案;文案长度按原音频时长估算,配置 MiniMax 后从英文随机音色池生成配音。前端 Audio 节点提供“提取音频 / 重新提取音频”按钮,可与抽帧并行,不自动触发
|
||||
- `GET /jobs/{id}/video.mp4` — 原视频
|
||||
- `GET /jobs/{id}/audio.wav` — 拆轨后的原始音频,供前端底部音频条生成波形
|
||||
- `GET /jobs/{id}/audio-script.mp3` — 英文改写文案的 MiniMax 配音
|
||||
@@ -35,4 +35,4 @@ uvicorn main:app --host 127.0.0.1 --port 4291
|
||||
- `ffmpeg` 系统二进制(拆轨 / 抽帧)
|
||||
- `yt-dlp` 系统二进制(也可走 Python 包)
|
||||
- OpenAI 兼容 LLM 网关(ASR / 翻译 / 文案改写);如果 `/audio/transcriptions` 不可用,会用 `ASR_FALLBACK_MODEL` 走 Gemini 多模态音频识别
|
||||
- MiniMax T2A HTTP(英文改写文案配音,使用 `MINIMAX_API_KEY`;默认音色 `English_expressive_narrator`)
|
||||
- MiniMax T2A HTTP(英文产品介绍文案配音,使用 `MINIMAX_API_KEY`;默认随机音色池 `English_magnetic_voiced_man,English_Upbeat_Woman,English_MaturePartner`)
|
||||
|
||||
122
api/main.py
122
api/main.py
@@ -4,6 +4,7 @@ import asyncio
|
||||
import base64
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
import shutil
|
||||
import subprocess
|
||||
import threading
|
||||
@@ -51,6 +52,16 @@ MINIMAX_TTS_VOICE_ID = os.getenv(
|
||||
"MINIMAX_TTS_VOICE_ID",
|
||||
"English_expressive_narrator",
|
||||
).strip() or "English_expressive_narrator"
|
||||
# Voice ids used when the MINIMAX_TTS_VOICE_POOL environment variable is unset.
DEFAULT_MINIMAX_TTS_VOICE_POOL = [
    "English_magnetic_voiced_man",
    "English_Upbeat_Woman",
    "English_MaturePartner",
]
# Comma-separated override read from the environment. Each entry is stripped
# and blank entries are dropped, so the resulting pool may be empty.
MINIMAX_TTS_VOICE_POOL = list(
    filter(
        None,
        map(
            str.strip,
            os.getenv("MINIMAX_TTS_VOICE_POOL", ",".join(DEFAULT_MINIMAX_TTS_VOICE_POOL)).split(","),
        ),
    )
)
|
||||
|
||||
POE_API_BASE_URL = os.getenv("POE_API_BASE_URL", "https://api.poe.com/v1").strip() or "https://api.poe.com/v1"
|
||||
POE_API_KEY = os.getenv("POE_API_KEY", "").strip()
|
||||
@@ -1522,31 +1533,60 @@ def _transcript_join(segments: list[TranscriptSegment], field: Literal["en", "zh
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def _fallback_audio_script(segments: list[TranscriptSegment]) -> str:
|
||||
joined = " ".join((s.en or s.zh).strip() for s in segments if (s.en or s.zh).strip())
|
||||
if not joined:
|
||||
return "Ease into the moment with SKG. Gentle warmth and rhythmic massage help everyday tension feel lighter, cleaner, and easier to leave behind."
|
||||
def _voiceover_target_words(target_seconds: float, *, words_per_second: float = 2.35) -> tuple[int, int]:
    """Return a ``(min_words, max_words)`` budget for a voice-over script.

    Args:
        target_seconds: Desired audio length. Falsy values (``None``, ``0``)
            fall back to 12 seconds; the value is then clamped to 4-45 seconds.
        words_per_second: Assumed speaking rate used to convert seconds into
            words. Defaults to 2.35, the rate this function previously
            hard-coded.

    Returns:
        A pair of word counts: the lower bound is 86% of the centre estimate
        (never below 10), the upper bound is 112% of it (never below 14 and
        never above 110).
    """
    seconds = max(4.0, min(float(target_seconds or 0) or 12.0, 45.0))
    center = int(round(seconds * words_per_second))
    lower = max(10, int(center * 0.86))
    upper = min(110, max(14, int(center * 1.12)))
    return lower, upper
|
||||
|
||||
|
||||
def _segment_duration(segments: list[TranscriptSegment]) -> float:
    """Return the span between the earliest ``start`` and the latest ``end``.

    An empty (or falsy) ``segments`` yields ``0.0``, and a negative span is
    clamped to ``0.0``. The result is in whatever unit the segments' ``start``
    and ``end`` fields use.
    """
    if not segments:
        return 0.0
    earliest = min(seg.start for seg in segments)
    latest = max(seg.end for seg in segments)
    span = latest - earliest
    return span if span > 0.0 else 0.0
|
||||
|
||||
|
||||
def _fallback_audio_script(segments: list[TranscriptSegment], target_seconds: float = 12.0) -> str:
    """Return a canned English SKG voice-over sized to the target duration.

    The effective duration is the largest of ``target_seconds``, the span
    reported by ``_segment_duration(segments)`` and a 4-second floor. It
    selects one of four fixed templates: up to 7 s, up to 13 s, up to 22 s,
    and anything longer.

    Args:
        segments: Transcript segments; only their overall span is used.
        target_seconds: Desired audio length. ``None`` or ``0`` is treated as
            "unknown", leaving the segment span and the floor to decide.

    Returns:
        The template text for the matching duration bucket.
    """
    # Coerce the same way _build_audio_script_sync does, so a None target
    # cannot raise inside max().
    seconds = max(float(target_seconds or 0), _segment_duration(segments), 4.0)
    if seconds <= 7:
        return "Meet SKG: warm massage, easy comfort, and a tiny reset for busy bodies."
    if seconds <= 13:
        return (
            "Meet SKG, your shortcut to a calmer body break. A little warmth, a steady massage rhythm, "
            "and suddenly your day feels less tight and more yours."
        )
    if seconds <= 22:
        return (
            "This is SKG: smart massage for the moments your body asks for a pause. Warmth, rhythm, "
            "and a clean wearable feel turn neck, back, or everyday tension into a softer reset."
        )
    return (
        "Say hello to SKG, the small reset button your day keeps asking for. From neck and shoulder breaks "
        "to back, eye, knee, or foot comfort, SKG brings warm, rhythmic massage into everyday routines, "
        "so winding down feels simple, smart, and a little more fun."
    )
|
||||
|
||||
|
||||
def _rewrite_audio_script_sync(segments: list[TranscriptSegment]) -> tuple[str, str]:
|
||||
fallback = _fallback_audio_script(segments)
|
||||
def _rewrite_audio_script_sync(segments: list[TranscriptSegment], target_seconds: float = 12.0) -> tuple[str, str]:
|
||||
fallback = _fallback_audio_script(segments, target_seconds)
|
||||
if not LLM_API_KEY:
|
||||
return fallback, "LLM_API_KEY 未配置,使用本地 SKG 模板"
|
||||
source_text = _transcript_join(segments, "en")
|
||||
source_zh = _transcript_join(segments, "zh")
|
||||
min_words, max_words = _voiceover_target_words(target_seconds)
|
||||
prompt = (
|
||||
"You are an English short-video voice-over writer for SKG wellness massagers. "
|
||||
"Use the source transcript only for structure, pacing, and emotional hook, then rewrite it into a clean English VO for SKG.\n"
|
||||
"Write a fresh product-introduction VO for SKG. Use the source transcript only as timing and pacing reference; "
|
||||
"do not summarize it unless it helps the rhythm.\n"
|
||||
"Rules:\n"
|
||||
"1. Output 28-55 English words, suitable for an 8-18 second TTS voice-over.\n"
|
||||
"2. Make it natural, premium, concise, and ready to read aloud.\n"
|
||||
f"1. Target audio length is about {target_seconds:.1f} seconds. Output {min_words}-{max_words} English words.\n"
|
||||
"2. Make it natural, warm, premium, and a little playful. It should sound like a real creator, not a stiff ad.\n"
|
||||
"3. Do not claim medical treatment, cure, pain elimination, or clinical effects.\n"
|
||||
"4. Do not copy the original brand, creator, price, platform language, or exact claims.\n"
|
||||
"5. If the source transcript is too thin, write a general SKG relaxation VO.\n"
|
||||
"5. Introduce SKG products directly: smart massage, warmth, rhythm, daily neck/back/eye/knee/foot relaxation.\n"
|
||||
"6. Keep it easy for TTS: short sentences, spoken phrasing, no hashtags, no stage directions, no quotation marks.\n"
|
||||
"7. If the source transcript is thin, ignore it and write a general SKG product intro.\n"
|
||||
'Return strict JSON only: {"rewritten_text":"..."}.\n\n'
|
||||
f"SKG product context: {AUDIO_PRODUCT_BRIEF}\n\n"
|
||||
f"English transcript:\n{source_text or 'None'}\n\n"
|
||||
@@ -1560,7 +1600,7 @@ def _rewrite_audio_script_sync(segments: list[TranscriptSegment]) -> tuple[str,
|
||||
{"role": "user", "content": prompt},
|
||||
],
|
||||
response_format={"type": "json_object"},
|
||||
temperature=0.45,
|
||||
temperature=0.72,
|
||||
max_tokens=600,
|
||||
)
|
||||
raw = (resp.choices[0].message.content or "").strip()
|
||||
@@ -1581,7 +1621,27 @@ def _minimax_tts_url() -> str:
|
||||
return f"{MINIMAX_TTS_BASE_URL}/v1/t2a_v2"
|
||||
|
||||
|
||||
def _minimax_tts_sync(job_id: str, text: str) -> str:
|
||||
def _choose_minimax_voice_id() -> str:
    """Pick the voice id for one MiniMax synthesis request.

    Returns a random member of ``MINIMAX_TTS_VOICE_POOL`` when the pool is
    non-empty, otherwise the single configured ``MINIMAX_TTS_VOICE_ID``.
    """
    pool = MINIMAX_TTS_VOICE_POOL
    return random.choice(pool) if pool else MINIMAX_TTS_VOICE_ID
|
||||
|
||||
|
||||
def _voice_speed_for(voice_id: str, target_seconds: float, text: str) -> float:
    """Choose the TTS ``speed`` value for a script and voice.

    The spoken length of ``text`` is estimated at 2.35 words per second. If
    that estimate overshoots ``target_seconds`` by more than 12% the speech is
    sped up (1.06); if it undershoots by more than 18% it is slowed down
    (0.94). Otherwise a per-voice default applies.

    Args:
        voice_id: MiniMax voice id the script will be read with.
        target_seconds: Desired audio length. ``None`` or a non-positive value
            disables the length-based adjustment.
        text: Script to be synthesized.

    Returns:
        The speed multiplier to send in ``voice_setting.speed``.
    """
    target = float(target_seconds or 0)
    # str.split() with no argument splits on any whitespace run (tabs, CR,
    # newlines, repeated spaces), so every word is counted.
    words = len(text.split())
    estimated_seconds = words / 2.35 if words else target
    if target > 0:
        if estimated_seconds > target * 1.12:
            return 1.06
        if estimated_seconds < target * 0.82:
            return 0.94
    # Per-voice base speeds used when the script already fits the target.
    if voice_id == "English_MaturePartner":
        return 0.96
    if voice_id == "English_Upbeat_Woman":
        return 1.02
    return 0.99
|
||||
|
||||
|
||||
def _minimax_tts_sync(job_id: str, text: str, voice_id: str, target_seconds: float = 12.0) -> str:
|
||||
if not MINIMAX_API_KEY:
|
||||
raise RuntimeError("MINIMAX_API_KEY 未配置,未生成配音")
|
||||
if not text.strip():
|
||||
@@ -1593,8 +1653,8 @@ def _minimax_tts_sync(job_id: str, text: str) -> str:
|
||||
"language_boost": "English",
|
||||
"output_format": "hex",
|
||||
"voice_setting": {
|
||||
"voice_id": MINIMAX_TTS_VOICE_ID,
|
||||
"speed": 1,
|
||||
"voice_id": voice_id,
|
||||
"speed": _voice_speed_for(voice_id, target_seconds, text),
|
||||
"vol": 1,
|
||||
"pitch": 0,
|
||||
},
|
||||
@@ -1628,14 +1688,16 @@ def _minimax_tts_sync(job_id: str, text: str) -> str:
|
||||
return f"/jobs/{job_id}/audio-script.mp3"
|
||||
|
||||
|
||||
def _build_audio_script_sync(job_id: str, segments: list[TranscriptSegment]) -> AudioScript:
|
||||
def _build_audio_script_sync(job_id: str, segments: list[TranscriptSegment], target_seconds: float = 12.0) -> AudioScript:
|
||||
source_text = _transcript_join(segments, "en")
|
||||
source_zh = _transcript_join(segments, "zh")
|
||||
rewritten, rewrite_error = _rewrite_audio_script_sync(segments)
|
||||
duration = max(float(target_seconds or 0), _segment_duration(segments), 4.0)
|
||||
rewritten, rewrite_error = _rewrite_audio_script_sync(segments, duration)
|
||||
selected_voice_id = _choose_minimax_voice_id()
|
||||
voice_url = ""
|
||||
voice_error = ""
|
||||
try:
|
||||
voice_url = _minimax_tts_sync(job_id, rewritten)
|
||||
voice_url = _minimax_tts_sync(job_id, rewritten, selected_voice_id, duration)
|
||||
except Exception as e:
|
||||
voice_error = str(e)
|
||||
# 改写失败时已有本地 SKG 模板兜底,不把它标成用户可见错误;配音失败才需要提示。
|
||||
@@ -1649,7 +1711,7 @@ def _build_audio_script_sync(job_id: str, segments: list[TranscriptSegment]) ->
|
||||
rewrite_model=AUDIO_REWRITE_MODEL,
|
||||
voice_provider="minimax",
|
||||
voice_model=MINIMAX_TTS_MODEL,
|
||||
voice_id=MINIMAX_TTS_VOICE_ID,
|
||||
voice_id=selected_voice_id,
|
||||
voice_url=voice_url,
|
||||
error=errors,
|
||||
created_at=time.time(),
|
||||
@@ -1678,6 +1740,7 @@ def pipeline_transcribe(job_id: str, manage_job_status: bool = True) -> None:
|
||||
if not wav.exists():
|
||||
raise RuntimeError("音频提取完成但找不到 audio.wav")
|
||||
update(job, source_audio_url=f"/jobs/{job_id}/audio.wav")
|
||||
target_duration = max(media_duration(wav), float(job.duration or 0), 4.0)
|
||||
|
||||
if not LLM_API_KEY:
|
||||
# 无 key 模式:mock 数据
|
||||
@@ -1701,13 +1764,13 @@ def pipeline_transcribe(job_id: str, manage_job_status: bool = True) -> None:
|
||||
rewrite_model=AUDIO_REWRITE_MODEL,
|
||||
voice_provider="minimax",
|
||||
voice_model=MINIMAX_TTS_MODEL,
|
||||
voice_id=MINIMAX_TTS_VOICE_ID,
|
||||
voice_id="random:" + ",".join(MINIMAX_TTS_VOICE_POOL or [MINIMAX_TTS_VOICE_ID]),
|
||||
),
|
||||
}
|
||||
if manage_job_status:
|
||||
update_kwargs.update(message="ASR mock 完成,生成 SKG 改写文案…", progress=92)
|
||||
update_kwargs.update(message="ASR mock 完成,生成 SKG 英文产品口播…", progress=92)
|
||||
update(job, **update_kwargs)
|
||||
audio_script = _build_audio_script_sync(job_id, mock)
|
||||
audio_script = _build_audio_script_sync(job_id, mock, target_duration)
|
||||
if manage_job_status:
|
||||
update(job, transcript=mock, status="transcribed", progress=100,
|
||||
audio_script=audio_script,
|
||||
@@ -1728,9 +1791,9 @@ def pipeline_transcribe(job_id: str, manage_job_status: bool = True) -> None:
|
||||
if seg.en.strip()
|
||||
]
|
||||
else:
|
||||
raise
|
||||
segments = [{"start": 0.0, "end": target_duration, "text": "Source audio timing reference."}]
|
||||
if not segments:
|
||||
raise RuntimeError("ASR 返回 0 段(可能无人声 / 格式问题)")
|
||||
segments = [{"start": 0.0, "end": target_duration, "text": "Source audio timing reference."}]
|
||||
|
||||
# 先把英文段落落到 job 上(让 UI 提前看到,翻译再补 zh)
|
||||
en_only = [
|
||||
@@ -1767,13 +1830,13 @@ def pipeline_transcribe(job_id: str, manage_job_status: bool = True) -> None:
|
||||
rewrite_model=AUDIO_REWRITE_MODEL,
|
||||
voice_provider="minimax",
|
||||
voice_model=MINIMAX_TTS_MODEL,
|
||||
voice_id=MINIMAX_TTS_VOICE_ID,
|
||||
voice_id="random:" + ",".join(MINIMAX_TTS_VOICE_POOL or [MINIMAX_TTS_VOICE_ID]),
|
||||
),
|
||||
}
|
||||
if manage_job_status:
|
||||
update_kwargs.update(message="翻译完成,生成 SKG 改写文案与 MiniMax 配音…", progress=94)
|
||||
update_kwargs.update(message="翻译完成,生成 SKG 英文产品口播与 MiniMax 配音…", progress=94)
|
||||
update(job, **update_kwargs)
|
||||
audio_script = _build_audio_script_sync(job_id, full)
|
||||
audio_script = _build_audio_script_sync(job_id, full, target_duration)
|
||||
if manage_job_status:
|
||||
update(job, transcript=full, status="transcribed", progress=100,
|
||||
audio_script=audio_script,
|
||||
@@ -2017,6 +2080,7 @@ def health() -> dict:
|
||||
"audio_rewrite": AUDIO_REWRITE_MODEL,
|
||||
"minimax_tts": MINIMAX_TTS_MODEL,
|
||||
"minimax_voice": MINIMAX_TTS_VOICE_ID,
|
||||
"minimax_voice_pool": MINIMAX_TTS_VOICE_POOL or [MINIMAX_TTS_VOICE_ID],
|
||||
"minimax_configured": bool(MINIMAX_API_KEY),
|
||||
"video": VIDEO_MODEL,
|
||||
"video_aliases": VIDEO_MODEL_ALIASES,
|
||||
@@ -2216,7 +2280,7 @@ async def trigger_transcribe(job_id: str, bg: BackgroundTasks) -> Job:
|
||||
rewrite_model=AUDIO_REWRITE_MODEL,
|
||||
voice_provider="minimax",
|
||||
voice_model=MINIMAX_TTS_MODEL,
|
||||
voice_id=MINIMAX_TTS_VOICE_ID,
|
||||
voice_id="random:" + ",".join(MINIMAX_TTS_VOICE_POOL or [MINIMAX_TTS_VOICE_ID]),
|
||||
)
|
||||
if manage_job_status:
|
||||
update(job, status="transcribing", progress=max(45, min(job.progress, 70)), error="", message="准备提取音频…", audio_script=audio_payload)
|
||||
|
||||
Reference in New Issue
Block a user