auto-save 2026-05-14 10:20 (~7)
This commit is contained in:
@@ -1,40 +1,5 @@
|
|||||||
{
|
{
|
||||||
"entries": [
|
"entries": [
|
||||||
{
|
|
||||||
"files_changed": 1,
|
|
||||||
"hash": "ad36702",
|
|
||||||
"message": "auto-save 2026-05-13 00:33 (~1)",
|
|
||||||
"ts": "2026-05-13T00:34:03+08:00",
|
|
||||||
"type": "commit"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"files_changed": 1,
|
|
||||||
"hash": "70a88fc",
|
|
||||||
"message": "auto-save 2026-05-13 00:39 (~1)",
|
|
||||||
"ts": "2026-05-13T00:39:38+08:00",
|
|
||||||
"type": "commit"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"files_changed": 1,
|
|
||||||
"hash": "cd8a082",
|
|
||||||
"message": "auto-save 2026-05-13 00:44 (~1)",
|
|
||||||
"ts": "2026-05-13T00:45:12+08:00",
|
|
||||||
"type": "commit"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"files_changed": 1,
|
|
||||||
"hash": "2c48980",
|
|
||||||
"message": "auto-save 2026-05-13 00:50 (~1)",
|
|
||||||
"ts": "2026-05-13T00:50:45+08:00",
|
|
||||||
"type": "commit"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"files_changed": 1,
|
|
||||||
"hash": "bcc4933",
|
|
||||||
"message": "auto-save 2026-05-13 00:56 (~1)",
|
|
||||||
"ts": "2026-05-13T00:56:19+08:00",
|
|
||||||
"type": "commit"
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"files_changed": 1,
|
"files_changed": 1,
|
||||||
"hash": "ffba726",
|
"hash": "ffba726",
|
||||||
@@ -3319,6 +3284,37 @@
|
|||||||
"type": "session-heartbeat",
|
"type": "session-heartbeat",
|
||||||
"message": "Claude 会话活跃 · 最近命令:claude · 5 项未提交变更 · 最近提交:auto-save 2026-05-14 10:08 (~4)",
|
"message": "Claude 会话活跃 · 最近命令:claude · 5 项未提交变更 · 最近提交:auto-save 2026-05-14 10:08 (~4)",
|
||||||
"files_changed": 5
|
"files_changed": 5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ts": "2026-05-14T10:14:43+08:00",
|
||||||
|
"type": "commit",
|
||||||
|
"message": "auto-save 2026-05-14 10:14 (~7)",
|
||||||
|
"hash": "ee32d83",
|
||||||
|
"files_changed": 7
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ts": "2026-05-14T02:14:59Z",
|
||||||
|
"type": "session-end",
|
||||||
|
"message": "Claude 会话结束 · 持续 0 秒 · 最近命令:claude · 1 项未提交变更 · 最近提交:auto-save 2026-05-14 10:14 (~7)",
|
||||||
|
"files_changed": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ts": "2026-05-14T02:14:59Z",
|
||||||
|
"type": "session-end",
|
||||||
|
"message": "Claude 会话结束 · 持续 0 秒 · 最近命令:claude · 1 项未提交变更 · 最近提交:auto-save 2026-05-14 10:14 (~7)",
|
||||||
|
"files_changed": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ts": "2026-05-14T02:16:09Z",
|
||||||
|
"type": "session-heartbeat",
|
||||||
|
"message": "Codex 会话活跃 · 最近命令:codex · 1 项未提交变更 · 最近提交:auto-save 2026-05-14 10:14 (~7)",
|
||||||
|
"files_changed": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ts": "2026-05-14T02:18:38Z",
|
||||||
|
"type": "session-heartbeat",
|
||||||
|
"message": "Codex 会话活跃 · 最近命令:codex · 2 项未提交变更 · 最近提交:auto-save 2026-05-14 10:14 (~7)",
|
||||||
|
"files_changed": 2
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -12,6 +12,14 @@ VIDEO_MODEL_SEEDANCE=seedance-2-fast
|
|||||||
VIDEO_MODEL_KLING=kling-omni
|
VIDEO_MODEL_KLING=kling-omni
|
||||||
VIDEO_MODEL_VEO3=veo-3.1-fast
|
VIDEO_MODEL_VEO3=veo-3.1-fast
|
||||||
|
|
||||||
|
# 音频文案改写 + MiniMax 配音
|
||||||
|
AUDIO_REWRITE_MODEL=gemini-2.5-pro
|
||||||
|
AUDIO_PRODUCT_BRIEF=SKG 智能按摩产品,主打日常肩颈、腰背、眼部、膝盖或足部放松;广告表达要高级、干净、可信,不做医疗疗效承诺。
|
||||||
|
MINIMAX_API_KEY=
|
||||||
|
MINIMAX_TTS_BASE_URL=https://api.minimax.io
|
||||||
|
MINIMAX_TTS_MODEL=speech-2.8-turbo
|
||||||
|
MINIMAX_TTS_VOICE_ID=Chinese (Mandarin)_Reliable_Executive
|
||||||
|
|
||||||
# Poe 视频 API(优先用于 Seedance / Kling / Veo)
|
# Poe 视频 API(优先用于 Seedance / Kling / Veo)
|
||||||
POE_API_BASE_URL=https://api.poe.com/v1
|
POE_API_BASE_URL=https://api.poe.com/v1
|
||||||
POE_API_KEY=
|
POE_API_KEY=
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
# SKG TK 二创 API
|
# SKG TK 二创 API
|
||||||
|
|
||||||
FastAPI 后端,跑 yt-dlp + ffmpeg + Gemini ASR/翻译 管线。
|
FastAPI 后端,跑 yt-dlp + ffmpeg + ASR/翻译/文案改写 + MiniMax 配音管线。
|
||||||
|
|
||||||
## 启动
|
## 启动
|
||||||
|
|
||||||
@@ -18,16 +18,18 @@ uvicorn main:app --port 4291 --reload
|
|||||||
- `GET /health` — 健康检查 + 配置状态
|
- `GET /health` — 健康检查 + 配置状态
|
||||||
- `POST /jobs` `{url}` — 创建 job,后台跑下载/拆轨/抽帧
|
- `POST /jobs` `{url}` — 创建 job,后台跑下载/拆轨/抽帧
|
||||||
- `GET /jobs/{id}` — 当前状态 + 产物
|
- `GET /jobs/{id}` — 当前状态 + 产物
|
||||||
- `POST /jobs/{id}/transcribe` — 触发 Gemini ASR + 翻译
|
- `POST /jobs/{id}/transcribe` — 触发 ASR + 翻译 + SKG 文案改写;配置 MiniMax 后生成配音
|
||||||
- `GET /jobs/{id}/video.mp4` — 原视频
|
- `GET /jobs/{id}/video.mp4` — 原视频
|
||||||
|
- `GET /jobs/{id}/audio-script.mp3` — 改写文案的 MiniMax 配音
|
||||||
- `GET /jobs/{id}/frames/{i}.jpg` — 第 i 张关键帧(0-9)
|
- `GET /jobs/{id}/frames/{i}.jpg` — 第 i 张关键帧(0-9)
|
||||||
|
|
||||||
## Mock 模式
|
## Mock 模式
|
||||||
|
|
||||||
未设 `GEMINI_API_KEY` 时,转录走本地 mock,便于 UI 联调。
|
未设 `LLM_API_KEY` 时,转录走本地 mock,便于 UI 联调;未设 `MINIMAX_API_KEY` 时只生成改写文案,不生成配音文件。
|
||||||
|
|
||||||
## 依赖
|
## 依赖
|
||||||
|
|
||||||
- `ffmpeg` 系统二进制(拆轨 / 抽帧)
|
- `ffmpeg` 系统二进制(拆轨 / 抽帧)
|
||||||
- `yt-dlp` 系统二进制(也可走 Python 包)
|
- `yt-dlp` 系统二进制(也可走 Python 包)
|
||||||
- `google-generativeai` Python(ASR + 翻译)
|
- OpenAI 兼容 LLM 网关(ASR / 翻译 / 文案改写)
|
||||||
|
- MiniMax T2A HTTP(改写文案配音,使用 `MINIMAX_API_KEY`)
|
||||||
|
|||||||
227
api/main.py
227
api/main.py
@@ -12,6 +12,7 @@ from contextlib import asynccontextmanager
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Literal
|
from typing import Literal
|
||||||
|
|
||||||
|
import httpx
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
from fastapi import BackgroundTasks, FastAPI, File, HTTPException, UploadFile
|
from fastapi import BackgroundTasks, FastAPI, File, HTTPException, UploadFile
|
||||||
from fastapi.middleware.cors import CORSMiddleware
|
from fastapi.middleware.cors import CORSMiddleware
|
||||||
@@ -36,6 +37,18 @@ REWRITE_MODEL = os.getenv("REWRITE_MODEL", "gemini-2.5-pro")
|
|||||||
VISION_MODEL = os.getenv("VISION_MODEL", "gemini-2.5-flash")
|
VISION_MODEL = os.getenv("VISION_MODEL", "gemini-2.5-flash")
|
||||||
IMAGE_MODEL = os.getenv("IMAGE_MODEL", "gemini-3-pro-image-preview")
|
IMAGE_MODEL = os.getenv("IMAGE_MODEL", "gemini-3-pro-image-preview")
|
||||||
VIDEO_MODEL = os.getenv("VIDEO_MODEL", "seedance").strip() or "seedance"
|
VIDEO_MODEL = os.getenv("VIDEO_MODEL", "seedance").strip() or "seedance"
|
||||||
|
AUDIO_PRODUCT_BRIEF = os.getenv(
|
||||||
|
"AUDIO_PRODUCT_BRIEF",
|
||||||
|
"SKG 智能按摩产品,主打日常肩颈、腰背、眼部、膝盖或足部放松;广告表达要高级、干净、可信,不做医疗疗效承诺。",
|
||||||
|
).strip()
|
||||||
|
AUDIO_REWRITE_MODEL = os.getenv("AUDIO_REWRITE_MODEL", REWRITE_MODEL).strip() or REWRITE_MODEL
|
||||||
|
MINIMAX_API_KEY = os.getenv("MINIMAX_API_KEY", "").strip()
|
||||||
|
MINIMAX_TTS_BASE_URL = os.getenv("MINIMAX_TTS_BASE_URL", "https://api.minimax.io").strip().rstrip("/")
|
||||||
|
MINIMAX_TTS_MODEL = os.getenv("MINIMAX_TTS_MODEL", "speech-2.8-turbo").strip() or "speech-2.8-turbo"
|
||||||
|
MINIMAX_TTS_VOICE_ID = os.getenv(
|
||||||
|
"MINIMAX_TTS_VOICE_ID",
|
||||||
|
"Chinese (Mandarin)_Reliable_Executive",
|
||||||
|
).strip() or "Chinese (Mandarin)_Reliable_Executive"
|
||||||
|
|
||||||
POE_API_BASE_URL = os.getenv("POE_API_BASE_URL", "https://api.poe.com/v1").strip() or "https://api.poe.com/v1"
|
POE_API_BASE_URL = os.getenv("POE_API_BASE_URL", "https://api.poe.com/v1").strip() or "https://api.poe.com/v1"
|
||||||
POE_API_KEY = os.getenv("POE_API_KEY", "").strip()
|
POE_API_KEY = os.getenv("POE_API_KEY", "").strip()
|
||||||
@@ -337,6 +350,21 @@ class TranscriptSegment(BaseModel):
|
|||||||
zh: str = ""
|
zh: str = ""
|
||||||
|
|
||||||
|
|
||||||
|
class AudioScript(BaseModel):
    # Outcome of turning a job's transcript into an SKG voice-over script and,
    # when MiniMax is configured, a synthesized audio file.
    # (Documented with comments rather than a docstring so the generated
    # schema description stays untouched.)

    # Lifecycle marker. "idle" is the default; pipeline_transcribe publishes
    # "rewriting" while the script is being built, _build_audio_script_sync
    # returns "completed", and the pipeline's error handler stores "failed".
    status: Literal["idle", "rewriting", "completed", "failed"] = "idle"
    # Timestamped English transcript lines (output of _transcript_join(..., "en")).
    source_text: str = ""
    # Timestamped Chinese translation lines (output of _transcript_join(..., "zh")).
    source_zh: str = ""
    # Final voice-over copy: the LLM rewrite, or the local template fallback.
    rewritten_text: str = ""
    # Product description fed to the rewrite prompt (AUDIO_PRODUCT_BRIEF).
    product_brief: str = ""
    # Model name used for the rewrite (AUDIO_REWRITE_MODEL).
    rewrite_model: str = ""
    # TTS vendor label; the visible producers always set "minimax".
    voice_provider: str = ""
    # TTS model name (MINIMAX_TTS_MODEL).
    voice_model: str = ""
    # TTS voice identifier (MINIMAX_TTS_VOICE_ID).
    voice_id: str = ""
    # API path of the generated audio ("/jobs/{id}/audio-script.mp3"),
    # or "" when no audio was produced.
    voice_url: str = ""
    # Rewrite and TTS error messages joined with ";". May be non-empty even
    # when status is "completed", because both steps degrade gracefully.
    error: str = ""
    # time.time() value captured when the record was built; 0.0 if never set.
    created_at: float = 0.0
|
||||||
|
|
||||||
|
|
||||||
class Job(BaseModel):
|
class Job(BaseModel):
|
||||||
id: str
|
id: str
|
||||||
url: str
|
url: str
|
||||||
@@ -349,6 +377,7 @@ class Job(BaseModel):
|
|||||||
height: int = 0
|
height: int = 0
|
||||||
frames: list[KeyFrame] = Field(default_factory=list)
|
frames: list[KeyFrame] = Field(default_factory=list)
|
||||||
transcript: list[TranscriptSegment] = Field(default_factory=list)
|
transcript: list[TranscriptSegment] = Field(default_factory=list)
|
||||||
|
audio_script: AudioScript = Field(default_factory=AudioScript)
|
||||||
storyboard_images: list[StoryboardImage] = Field(default_factory=list)
|
storyboard_images: list[StoryboardImage] = Field(default_factory=list)
|
||||||
generated_videos: list[GeneratedVideo] = Field(default_factory=list)
|
generated_videos: list[GeneratedVideo] = Field(default_factory=list)
|
||||||
error: str = ""
|
error: str = ""
|
||||||
@@ -1351,6 +1380,148 @@ def _translate_sync(segments: list[dict]) -> list[str]:
|
|||||||
return [zh_by_idx.get(i, "") for i in range(len(segments))]
|
return [zh_by_idx.get(i, "") for i in range(len(segments))]
|
||||||
|
|
||||||
|
|
||||||
|
def _transcript_join(segments: list[TranscriptSegment], field: Literal["en", "zh"]) -> str:
    """Render the non-empty segments as newline-separated, timestamped lines.

    Args:
        segments: Transcript segments exposing ``start``, ``end``, ``en`` and ``zh``.
        field: ``"zh"`` selects the Chinese text; any other value selects English.

    Returns:
        One ``[start-end s] text`` line per segment whose selected text is
        non-blank after stripping, joined with ``"\\n"``; ``""`` if none qualify.
    """
    use_zh = field == "zh"
    # Pair every segment with its stripped text once, then drop blank ones.
    candidates = ((seg, (seg.zh if use_zh else seg.en).strip()) for seg in segments)
    return "\n".join(
        f"[{seg.start:.1f}-{seg.end:.1f}s] {body}"
        for seg, body in candidates
        if body
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _fallback_audio_script(segments: list[TranscriptSegment]) -> str:
    """Return a canned SKG voice-over used when no LLM rewrite is available.

    The transcript content itself is never quoted; it only decides which of
    the two templates is returned.

    Args:
        segments: Transcript segments exposing ``zh`` and ``en``.

    Returns:
        The generic relaxation line when every segment is blank, otherwise the
        longer product-oriented template.
    """
    # Prefer the Chinese text, falling back to English, exactly per segment.
    has_spoken_content = any((seg.zh or seg.en).strip() for seg in segments)
    if has_spoken_content:
        return (
            "把日常紧绷交给 SKG。贴合身体需要放松的位置,热敷与按摩节奏自然陪伴,"
            "让每一次短暂休息都更轻松、更有质感。"
        )
    return "日常疲惫不用硬扛。戴上 SKG,让肩颈慢慢放松,跟着呼吸找回轻松状态。"
|
||||||
|
|
||||||
|
|
||||||
|
def _rewrite_audio_script_sync(segments: list[TranscriptSegment]) -> tuple[str, str]:
    """Rewrite the reference transcript into a short SKG voice-over script.

    Blocking call (intended to run in a worker thread). It never raises:
    any failure degrades to the local template from ``_fallback_audio_script``.

    Args:
        segments: Transcript segments with English text and Chinese translation.

    Returns:
        ``(script, error)``. ``script`` is the LLM rewrite, or the local
        template when the LLM is not configured, fails, or yields no usable
        text. ``error`` is ``""`` unless the key is missing or the call/parse
        raised.
    """
    fallback = _fallback_audio_script(segments)
    if not LLM_API_KEY:
        return fallback, "LLM_API_KEY 未配置,使用本地 SKG 模板"
    source_text = _transcript_join(segments, "en")
    source_zh = _transcript_join(segments, "zh")
    prompt = (
        "你是 SKG 短视频口播编导。根据参考视频音频转写,抽取它的表达结构、情绪节奏和可复用卖点,"
        "改写成适合 SKG 按摩/放松产品二创视频的中文口播文案。\n"
        "要求:\n"
        "1. 输出 35-90 个中文字,适合 8-18 秒短视频配音。\n"
        "2. 口语化、干净、高级,能直接给 TTS 朗读。\n"
        "3. 不承诺治疗、治愈、医学疗效,不夸大。\n"
        "4. 不复刻原视频品牌/人物/价格/平台话术,只保留表达结构。\n"
        "5. 如果参考转写信息不足,按产品信息生成通用 SKG 放松口播。\n"
        '严格返回 JSON:{"rewritten_text":"..."}。\n\n'
        f"SKG 产品信息:{AUDIO_PRODUCT_BRIEF}\n\n"
        f"英文转写:\n{source_text or '无'}\n\n"
        f"中文翻译:\n{source_zh or '无'}"
    )
    try:
        resp = llm().chat.completions.create(
            model=AUDIO_REWRITE_MODEL,
            messages=[
                {"role": "system", "content": "只输出合法 JSON,不要解释,不要 markdown。"},
                {"role": "user", "content": prompt},
            ],
            response_format={"type": "json_object"},
            temperature=0.45,
            max_tokens=600,
        )
        raw = (resp.choices[0].message.content or "").strip()
        try:
            data = json.loads(raw)
        except json.JSONDecodeError:
            # Models sometimes wrap the object in a code fence or add a
            # preamble; salvage the outermost {...} span regardless of how
            # the reply starts, and re-raise if there is none.
            import re as _re
            match = _re.search(r"\{[\s\S]*\}", raw)
            if match is None:
                raise
            data = json.loads(match.group(0))
        if not isinstance(data, dict):
            raise ValueError("rewrite response is not a JSON object")
        # Only accept a real string: str(None) would otherwise hand the
        # literal text "None" to the TTS step.
        value = data.get("rewritten_text")
        text = value.strip() if isinstance(value, str) else ""
        return (text or fallback), ""
    except Exception as e:
        # Best-effort boundary: the pipeline must still get a usable script.
        return fallback, f"改写失败,使用本地模板:{e}"
|
||||||
|
|
||||||
|
|
||||||
|
def _minimax_tts_url() -> str:
    """Return the MiniMax T2A endpoint URL derived from ``MINIMAX_TTS_BASE_URL``.

    The configured value is used as-is when it already ends with the endpoint
    path; otherwise the path is appended.
    """
    endpoint_path = "/v1/t2a_v2"
    base = MINIMAX_TTS_BASE_URL
    return base if base.endswith(endpoint_path) else f"{base}{endpoint_path}"
|
||||||
|
|
||||||
|
|
||||||
|
def _minimax_tts_sync(job_id: str, text: str) -> str:
    """Synthesize ``text`` with MiniMax TTS and store it as the job's MP3.

    Blocking HTTP call. The hex-encoded audio from the response is decoded
    and written to ``audio_script.mp3`` inside the job directory.

    Args:
        job_id: Job whose directory receives the audio file.
        text: Script to read aloud; surrounding whitespace is ignored.

    Returns:
        The API path under which the audio is served.

    Raises:
        RuntimeError: Missing API key, blank text, a non-zero provider status
            code, a response without audio, or undecodable hex.
        httpx.HTTPStatusError: Propagated from ``raise_for_status()``.
    """
    if not MINIMAX_API_KEY:
        raise RuntimeError("MINIMAX_API_KEY 未配置,未生成配音")
    script = text.strip()
    if not script:
        raise RuntimeError("改写文案为空,未生成配音")

    voice_setting = {
        "voice_id": MINIMAX_TTS_VOICE_ID,
        "speed": 1,
        "vol": 1,
        "pitch": 0,
    }
    audio_setting = {
        "sample_rate": 32000,
        "bitrate": 128000,
        "format": "mp3",
        "channel": 1,
    }
    request_body = {
        "model": MINIMAX_TTS_MODEL,
        # Hard cap at 9500 characters — presumably to stay under the
        # provider's per-request text limit; confirm against the API docs.
        "text": script[:9500],
        "stream": False,
        "language_boost": "Chinese",
        "output_format": "hex",
        "voice_setting": voice_setting,
        "audio_setting": audio_setting,
    }
    request_headers = {
        "Authorization": f"Bearer {MINIMAX_API_KEY}",
        "Content-Type": "application/json",
    }

    response = httpx.post(
        _minimax_tts_url(),
        headers=request_headers,
        json=request_body,
        timeout=90,
    )
    response.raise_for_status()
    body = response.json()

    # A 2xx response can still carry an application-level failure.
    provider_status = body.get("base_resp") or {}
    if int(provider_status.get("status_code", 0) or 0) != 0:
        raise RuntimeError(provider_status.get("status_msg") or "MiniMax TTS 返回失败")

    hex_audio = ((body.get("data") or {}).get("audio") or "").strip()
    if not hex_audio:
        raise RuntimeError("MiniMax TTS 未返回 audio hex")
    try:
        decoded = bytes.fromhex(hex_audio)
    except ValueError as e:
        raise RuntimeError(f"MiniMax TTS audio hex 无法解析:{e}") from e

    target = job_dir(job_id) / "audio_script.mp3"
    target.write_bytes(decoded)
    return f"/jobs/{job_id}/audio-script.mp3"
|
||||||
|
|
||||||
|
|
||||||
|
def _build_audio_script_sync(job_id: str, segments: list[TranscriptSegment]) -> AudioScript:
    """Produce the job's ``AudioScript``: rewritten copy plus optional voice-over.

    Blocking. Neither step can fail the build: the rewrite falls back to a
    local template, and a TTS failure only leaves ``voice_url`` empty. Both
    error messages are reported through ``error``, joined with ``";"``, while
    ``status`` is always ``"completed"``.

    Args:
        job_id: Job that owns the generated audio file.
        segments: Transcript segments to rewrite.

    Returns:
        A populated ``AudioScript`` stamped with the current time.
    """
    english_lines = _transcript_join(segments, "en")
    chinese_lines = _transcript_join(segments, "zh")
    script, rewrite_problem = _rewrite_audio_script_sync(segments)

    audio_path, voice_problem = "", ""
    try:
        audio_path = _minimax_tts_sync(job_id, script)
    except Exception as e:
        # Voice-over is optional; keep the script and surface the reason.
        voice_problem = str(e)

    problems = ";".join(filter(None, (rewrite_problem, voice_problem)))
    return AudioScript(
        status="completed",
        source_text=english_lines,
        source_zh=chinese_lines,
        rewritten_text=script,
        product_brief=AUDIO_PRODUCT_BRIEF,
        rewrite_model=AUDIO_REWRITE_MODEL,
        voice_provider="minimax",
        voice_model=MINIMAX_TTS_MODEL,
        voice_id=MINIMAX_TTS_VOICE_ID,
        voice_url=audio_path,
        error=problems,
        created_at=time.time(),
    )
|
||||||
|
|
||||||
|
|
||||||
async def pipeline_transcribe(job_id: str) -> None:
|
async def pipeline_transcribe(job_id: str) -> None:
|
||||||
job = JOBS[job_id]
|
job = JOBS[job_id]
|
||||||
d = job_dir(job_id)
|
d = job_dir(job_id)
|
||||||
@@ -1371,7 +1542,25 @@ async def pipeline_transcribe(job_id: str) -> None:
|
|||||||
en="This device looks really sleek and minimal.",
|
en="This device looks really sleek and minimal.",
|
||||||
zh="这个设备看起来非常时尚和简约。"),
|
zh="这个设备看起来非常时尚和简约。"),
|
||||||
]
|
]
|
||||||
|
update(
|
||||||
|
job,
|
||||||
|
transcript=mock,
|
||||||
|
audio_script=AudioScript(
|
||||||
|
status="rewriting",
|
||||||
|
source_text=_transcript_join(mock, "en"),
|
||||||
|
source_zh=_transcript_join(mock, "zh"),
|
||||||
|
product_brief=AUDIO_PRODUCT_BRIEF,
|
||||||
|
rewrite_model=AUDIO_REWRITE_MODEL,
|
||||||
|
voice_provider="minimax",
|
||||||
|
voice_model=MINIMAX_TTS_MODEL,
|
||||||
|
voice_id=MINIMAX_TTS_VOICE_ID,
|
||||||
|
),
|
||||||
|
message="ASR mock 完成,生成 SKG 改写文案…",
|
||||||
|
progress=92,
|
||||||
|
)
|
||||||
|
audio_script = await asyncio.to_thread(_build_audio_script_sync, job_id, mock)
|
||||||
update(job, transcript=mock, status="transcribed", progress=100,
|
update(job, transcript=mock, status="transcribed", progress=100,
|
||||||
|
audio_script=audio_script,
|
||||||
message="转录完成(MOCK · 未设 LLM_API_KEY)")
|
message="转录完成(MOCK · 未设 LLM_API_KEY)")
|
||||||
return
|
return
|
||||||
|
|
||||||
@@ -1403,11 +1592,35 @@ async def pipeline_transcribe(job_id: str) -> None:
|
|||||||
)
|
)
|
||||||
for i, seg in enumerate(en_only)
|
for i, seg in enumerate(en_only)
|
||||||
]
|
]
|
||||||
|
update(
|
||||||
|
job,
|
||||||
|
transcript=full,
|
||||||
|
audio_script=AudioScript(
|
||||||
|
status="rewriting",
|
||||||
|
source_text=_transcript_join(full, "en"),
|
||||||
|
source_zh=_transcript_join(full, "zh"),
|
||||||
|
product_brief=AUDIO_PRODUCT_BRIEF,
|
||||||
|
rewrite_model=AUDIO_REWRITE_MODEL,
|
||||||
|
voice_provider="minimax",
|
||||||
|
voice_model=MINIMAX_TTS_MODEL,
|
||||||
|
voice_id=MINIMAX_TTS_VOICE_ID,
|
||||||
|
),
|
||||||
|
message="翻译完成,生成 SKG 改写文案与 MiniMax 配音…",
|
||||||
|
progress=94,
|
||||||
|
)
|
||||||
|
audio_script = await asyncio.to_thread(_build_audio_script_sync, job_id, full)
|
||||||
update(job, transcript=full, status="transcribed", progress=100,
|
update(job, transcript=full, status="transcribed", progress=100,
|
||||||
|
audio_script=audio_script,
|
||||||
message=f"转录完成 · {len(full)} 段({ASR_MODEL} + {TRANSLATE_MODEL})")
|
message=f"转录完成 · {len(full)} 段({ASR_MODEL} + {TRANSLATE_MODEL})")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
update(job, status="failed", error=str(e), message="转录失败")
|
update(
|
||||||
|
job,
|
||||||
|
status="failed",
|
||||||
|
audio_script=AudioScript(status="failed", error=str(e), created_at=time.time()),
|
||||||
|
error=str(e),
|
||||||
|
message="转录失败",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def _image_edit_call(
|
def _image_edit_call(
|
||||||
@@ -1566,6 +1779,10 @@ def health() -> dict:
|
|||||||
"asr": ASR_MODEL,
|
"asr": ASR_MODEL,
|
||||||
"translate": TRANSLATE_MODEL,
|
"translate": TRANSLATE_MODEL,
|
||||||
"rewrite": REWRITE_MODEL,
|
"rewrite": REWRITE_MODEL,
|
||||||
|
"audio_rewrite": AUDIO_REWRITE_MODEL,
|
||||||
|
"minimax_tts": MINIMAX_TTS_MODEL,
|
||||||
|
"minimax_voice": MINIMAX_TTS_VOICE_ID,
|
||||||
|
"minimax_configured": bool(MINIMAX_API_KEY),
|
||||||
"video": VIDEO_MODEL,
|
"video": VIDEO_MODEL,
|
||||||
"video_aliases": VIDEO_MODEL_ALIASES,
|
"video_aliases": VIDEO_MODEL_ALIASES,
|
||||||
"video_provider": "poe" if video_uses_poe() else ("ark" if video_uses_ark() else "custom"),
|
"video_provider": "poe" if video_uses_poe() else ("ark" if video_uses_ark() else "custom"),
|
||||||
@@ -1765,6 +1982,14 @@ def get_video(job_id: str):
|
|||||||
return FileResponse(p, media_type="video/mp4")
|
return FileResponse(p, media_type="video/mp4")
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/jobs/{job_id}/audio-script.mp3")
def get_audio_script(job_id: str):
    # Serve the synthesized voice-over for a job as audio/mpeg; respond 404
    # when the job directory has no audio_script.mp3.
    audio_path = job_dir(job_id) / "audio_script.mp3"
    if audio_path.exists():
        return FileResponse(audio_path, media_type="audio/mpeg")
    raise HTTPException(404, "audio script not found")
|
||||||
|
|
||||||
|
|
||||||
@app.get("/jobs/{job_id}/frames/{idx}.jpg")
|
@app.get("/jobs/{job_id}/frames/{idx}.jpg")
|
||||||
def get_frame(job_id: str, idx: int):
|
def get_frame(job_id: str, idx: int):
|
||||||
p = job_dir(job_id) / "frames" / f"{idx:03d}.jpg"
|
p = job_dir(job_id) / "frames" / f"{idx:03d}.jpg"
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ import {
|
|||||||
Mic, Languages, FileEdit, Sparkles, Film, FileVideo, Loader2, Plus, Check,
|
Mic, Languages, FileEdit, Sparkles, Film, FileVideo, Loader2, Plus, Check,
|
||||||
ChevronDown, X, LayoutGrid,
|
ChevronDown, X, LayoutGrid,
|
||||||
} from "lucide-react"
|
} from "lucide-react"
|
||||||
import { type Job, type KeyFrame, frameUrl, effectiveFrameUrl, videoUrl, generateImage, selectGenerated, generatedImageUrl } from "@/lib/api"
|
import { type Job, type KeyFrame, frameUrl, effectiveFrameUrl, videoUrl, generateImage, selectGenerated, generatedImageUrl, apiAssetUrl } from "@/lib/api"
|
||||||
import { type NodeData } from "@/components/nodes"
|
import { type NodeData } from "@/components/nodes"
|
||||||
import { FrameLightbox } from "@/components/lightbox"
|
import { FrameLightbox } from "@/components/lightbox"
|
||||||
import { toast } from "sonner"
|
import { toast } from "sonner"
|
||||||
@@ -154,6 +154,8 @@ export const Dashboard = forwardRef<DashboardHandle, Props>(function Dashboard({
|
|||||||
const hasFrames = (job?.frames.length ?? 0) > 0
|
const hasFrames = (job?.frames.length ?? 0) > 0
|
||||||
const hasTranscript = (job?.transcript.length ?? 0) > 0
|
const hasTranscript = (job?.transcript.length ?? 0) > 0
|
||||||
const hasZh = job?.transcript.some((s) => s.zh) ?? false
|
const hasZh = job?.transcript.some((s) => s.zh) ?? false
|
||||||
|
const hasAudioRewrite = !!job?.audio_script?.rewritten_text?.trim()
|
||||||
|
const isAudioRewriting = job?.audio_script?.status === "rewriting"
|
||||||
const isFailed = job?.status === "failed"
|
const isFailed = job?.status === "failed"
|
||||||
|
|
||||||
const colState: Record<string, ColState> = {
|
const colState: Record<string, ColState> = {
|
||||||
@@ -168,7 +170,7 @@ export const Dashboard = forwardRef<DashboardHandle, Props>(function Dashboard({
|
|||||||
keyframe: !job ? "pending" : (isSplitting && !hasFrames) ? "running" : hasFrames ? "done" : isFailed && job.progress >= 50 && job.progress < 70 ? "failed" : "pending",
|
keyframe: !job ? "pending" : (isSplitting && !hasFrames) ? "running" : hasFrames ? "done" : isFailed && job.progress >= 50 && job.progress < 70 ? "failed" : "pending",
|
||||||
asr: !job ? "pending" : job.status === "transcribing" ? "running" : hasTranscript ? "done" : isFailed && job.progress >= 70 ? "failed" : "pending",
|
asr: !job ? "pending" : job.status === "transcribing" ? "running" : hasTranscript ? "done" : isFailed && job.progress >= 70 ? "failed" : "pending",
|
||||||
translate: !job ? "pending" : job.status === "transcribing" ? "running" : hasZh ? "done" : "pending",
|
translate: !job ? "pending" : job.status === "transcribing" ? "running" : hasZh ? "done" : "pending",
|
||||||
rewrite: "pending",
|
rewrite: !job ? "pending" : isAudioRewriting ? "running" : hasAudioRewrite ? "done" : "pending",
|
||||||
imagegen: "pending",
|
imagegen: "pending",
|
||||||
videogen: "pending",
|
videogen: "pending",
|
||||||
compose: "pending",
|
compose: "pending",
|
||||||
@@ -180,7 +182,7 @@ export const Dashboard = forwardRef<DashboardHandle, Props>(function Dashboard({
|
|||||||
keyframe: hasFrames ? `${data.selectedFrames.size}/${job!.frames.length} 选用` : "—",
|
keyframe: hasFrames ? `${data.selectedFrames.size}/${job!.frames.length} 选用` : "—",
|
||||||
asr: hasTranscript ? `${job!.transcript.length} 段` : "—",
|
asr: hasTranscript ? `${job!.transcript.length} 段` : "—",
|
||||||
translate: hasZh ? `${job!.transcript.filter((s) => s.zh).length} 段` : "—",
|
translate: hasZh ? `${job!.transcript.filter((s) => s.zh).length} 段` : "—",
|
||||||
rewrite: "占位",
|
rewrite: hasAudioRewrite ? "已生成" : isAudioRewriting ? "生成中…" : "待文案",
|
||||||
imagegen: data.selectedFrames.size > 0 ? `${data.selectedFrames.size} 帧待编排` : "占位",
|
imagegen: data.selectedFrames.size > 0 ? `${data.selectedFrames.size} 帧待编排` : "占位",
|
||||||
videogen: "占位",
|
videogen: "占位",
|
||||||
compose: "占位",
|
compose: "占位",
|
||||||
@@ -593,16 +595,31 @@ export const Dashboard = forwardRef<DashboardHandle, Props>(function Dashboard({
|
|||||||
{key === "rewrite" && (
|
{key === "rewrite" && (
|
||||||
<>
|
<>
|
||||||
<KanbanCard tone="green" tags={["产品信息"]} title="SKG 产品卖点">
|
<KanbanCard tone="green" tags={["产品信息"]} title="SKG 产品卖点">
|
||||||
<textarea
|
<div className="text-[12px] text-[var(--text-soft)] leading-relaxed">
|
||||||
rows={5}
|
{job?.audio_script?.product_brief || "等待音频转写完成后,按默认 SKG 放松产品卖点生成口播。"}
|
||||||
placeholder="粘贴 SKG 产品关键卖点(占位)"
|
</div>
|
||||||
disabled
|
|
||||||
className="w-full text-[12px] px-2 py-1.5 rounded-md bg-black/30 border border-dashed border-white/10 placeholder:text-[var(--text-faint)] text-[var(--text-strong)] resize-none opacity-70 mt-1"
|
|
||||||
/>
|
|
||||||
</KanbanCard>
|
</KanbanCard>
|
||||||
<KanbanCard tone="green" tags={["模型"]} title="gemini-2.5-pro">
|
<KanbanCard tone="green" tags={["改写"]} title={job?.audio_script?.rewrite_model || "gemini-2.5-pro"}>
|
||||||
<div className="text-[11px] text-[var(--text-soft)]">按英文转录 + 产品信息 → 输出改写中文文案</div>
|
{job?.audio_script?.rewritten_text ? (
|
||||||
<div className="kanban-meta">下一冲刺接入</div>
|
<div className="text-[13px] text-[var(--text-strong)] leading-relaxed">
|
||||||
|
{job.audio_script.rewritten_text}
|
||||||
|
</div>
|
||||||
|
) : (
|
||||||
|
<div className="text-[11px] text-[var(--text-soft)]">
|
||||||
|
{isAudioRewriting ? "正在生成 SKG 口播文案…" : "转录完成后自动生成 SKG 口播文案"}
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
<div className="kanban-meta">ASR + 翻译 + SKG 卖点转化</div>
|
||||||
|
</KanbanCard>
|
||||||
|
<KanbanCard tone="green" tags={["配音"]} title={job?.audio_script?.voice_model || "MiniMax T2A"}>
|
||||||
|
{job?.audio_script?.voice_url ? (
|
||||||
|
<audio controls className="h-8 w-full" src={apiAssetUrl(job.audio_script.voice_url)} />
|
||||||
|
) : (
|
||||||
|
<div className="text-[11px] text-[var(--text-soft)]">
|
||||||
|
{job?.audio_script?.error || "配置 MiniMax 后自动生成配音文件"}
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
<div className="kanban-meta">{job?.audio_script?.voice_id || "Chinese (Mandarin)_Reliable_Executive"}</div>
|
||||||
</KanbanCard>
|
</KanbanCard>
|
||||||
</>
|
</>
|
||||||
)}
|
)}
|
||||||
|
|||||||
@@ -2101,12 +2101,16 @@ export function AudioNode({ data, selected }: any) {
|
|||||||
const d: NodeData = data
|
const d: NodeData = data
|
||||||
const job = d.job
|
const job = d.job
|
||||||
const transcript = job?.transcript ?? []
|
const transcript = job?.transcript ?? []
|
||||||
|
const audioScript = job?.audio_script
|
||||||
|
const rewrittenText = audioScript?.rewritten_text?.trim() ?? ""
|
||||||
|
const voiceUrl = apiAssetUrl(audioScript?.voice_url)
|
||||||
const hasASR = transcript.length > 0
|
const hasASR = transcript.length > 0
|
||||||
|
const isRewriting = audioScript?.status === "rewriting"
|
||||||
const status: NodeStatus = !job
|
const status: NodeStatus = !job
|
||||||
? "pending"
|
? "pending"
|
||||||
: job.status === "transcribing"
|
: job.status === "transcribing" || isRewriting
|
||||||
? "running"
|
? "running"
|
||||||
: hasASR
|
: rewrittenText || hasASR
|
||||||
? "done"
|
? "done"
|
||||||
: "pending"
|
: "pending"
|
||||||
return (
|
return (
|
||||||
@@ -2119,9 +2123,27 @@ export function AudioNode({ data, selected }: any) {
|
|||||||
pinned={d.pinnedNodes?.has("audio")}
|
pinned={d.pinnedNodes?.has("audio")}
|
||||||
onTogglePin={() => d.onToggleNodePin?.("audio")}
|
onTogglePin={() => d.onToggleNodePin?.("audio")}
|
||||||
>
|
>
|
||||||
<div className="text-[11px] text-[var(--text-soft)] leading-snug">
|
<div className="space-y-2 text-[11px] text-[var(--text-soft)] leading-snug">
|
||||||
音轨 → ASR 转录 → 英中翻译 → 接 SKG 卖点改写文案<br />
|
<div>
|
||||||
<span className="text-[var(--text-faint)] font-mono">Gemini 2.5 Flash</span>
|
音轨 → ASR 转录 → 英中翻译 → SKG 口播改写 → MiniMax 配音<br />
|
||||||
|
<span className="text-[var(--text-faint)] font-mono">
|
||||||
|
{audioScript?.rewrite_model || "Gemini 2.5 Pro"} → {audioScript?.voice_model || "MiniMax T2A"}
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
{rewrittenText && (
|
||||||
|
<div className="rounded-md border border-emerald-400/25 bg-emerald-400/10 px-2.5 py-2 text-[11.5px] leading-relaxed text-[var(--text-strong)] break-words">
|
||||||
|
{rewrittenText}
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
{voiceUrl && (
|
||||||
|
<audio controls src={voiceUrl} className="h-7 w-full" />
|
||||||
|
)}
|
||||||
|
{isRewriting && (
|
||||||
|
<div className="text-[10.5px] text-[var(--text-faint)]">正在生成改写文案和配音…</div>
|
||||||
|
)}
|
||||||
|
{audioScript?.error && rewrittenText && !voiceUrl && (
|
||||||
|
<div className="text-[10.5px] text-amber-300/85">配音待生成:{audioScript.error}</div>
|
||||||
|
)}
|
||||||
</div>
|
</div>
|
||||||
</NodeShell>
|
</NodeShell>
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -296,6 +296,21 @@ export interface TranscriptSegment {
|
|||||||
zh: string
|
zh: string
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Voice-over script generated from a job's transcript, plus the optional
 * synthesized audio. Mirrors the backend `AudioScript` model field for field.
 */
export interface AudioScript {
  /** Lifecycle marker; `"rewriting"` while the backend is generating the script. */
  status: "idle" | "rewriting" | "completed" | "failed"
  /** Timestamped English transcript lines the rewrite was based on. */
  source_text: string
  /** Timestamped Chinese translation lines the rewrite was based on. */
  source_zh: string
  /** Final voice-over copy (LLM rewrite or the backend's template fallback). */
  rewritten_text: string
  /** Product description that was fed into the rewrite prompt. */
  product_brief: string
  /** Name of the model used for the rewrite. */
  rewrite_model: string
  /** TTS vendor label (the backend currently sets `"minimax"`). */
  voice_provider: string
  /** Name of the TTS model. */
  voice_model: string
  /** Identifier of the TTS voice. */
  voice_id: string
  /** Backend-relative path of the generated audio; empty when none exists. */
  voice_url: string
  /** Rewrite/TTS error messages joined with `;`; can be set even when status is `"completed"`. */
  error: string
  /** Backend `time.time()` value (Unix seconds) at creation; `0` if unset. */
  created_at: number
}
|
||||||
|
|
||||||
export interface StoryboardImage {
|
export interface StoryboardImage {
|
||||||
ref_id: string
|
ref_id: string
|
||||||
kind: "keyframe" | "cutout" | "asset"
|
kind: "keyframe" | "cutout" | "asset"
|
||||||
@@ -318,6 +333,7 @@ export interface Job {
|
|||||||
height?: number
|
height?: number
|
||||||
frames: KeyFrame[]
|
frames: KeyFrame[]
|
||||||
transcript: TranscriptSegment[]
|
transcript: TranscriptSegment[]
|
||||||
|
audio_script?: AudioScript
|
||||||
storyboard_images?: StoryboardImage[]
|
storyboard_images?: StoryboardImage[]
|
||||||
generated_videos?: GeneratedVideo[]
|
generated_videos?: GeneratedVideo[]
|
||||||
error?: string
|
error?: string
|
||||||
@@ -331,6 +347,10 @@ export interface BackendHealth {
|
|||||||
asr?: string
|
asr?: string
|
||||||
translate?: string
|
translate?: string
|
||||||
rewrite?: string
|
rewrite?: string
|
||||||
|
audio_rewrite?: string
|
||||||
|
minimax_tts?: string
|
||||||
|
minimax_voice?: string
|
||||||
|
minimax_configured?: boolean
|
||||||
video?: string
|
video?: string
|
||||||
video_aliases?: Record<string, string>
|
video_aliases?: Record<string, string>
|
||||||
video_base_url?: string
|
video_base_url?: string
|
||||||
|
|||||||
Reference in New Issue
Block a user