feat: improve subject generation workflow

This commit is contained in:
2026-05-18 17:44:52 +08:00
parent 78bd294d57
commit 1f600ae436
12 changed files with 682 additions and 372 deletions

View File

@@ -56,19 +56,19 @@
- `ASR_TIMEOUT_SECONDS`:远端 ASR / 音频分析单次请求超时,默认 45 秒,避免第一步长时间停在转录中
- `LOCAL_ASR_BIN` / `LOCAL_ASR_MODEL` / `LOCAL_ASR_TIMEOUT_SECONDS`:本机 ASR 兜底,默认使用 `/opt/homebrew/bin/mlx_whisper` + `mlx-community/whisper-tiny`,用于当前 SKG 网关 `/audio/transcriptions` 不可用时生成真实逐句时间轴
- `TRANSLATE_MODEL`:字幕翻译模型,默认 `gemini-2.5-flash`
- `REWRITE_MODEL`通用改写/分镜描述模型,默认 `gemini-2.5-pro`
- `AUDIO_REWRITE_MODEL`后续音频口播改写模型,默认跟随 `REWRITE_MODEL`;当前第一步不默认调用口播改写,只保留原文案和声音分析
- `GPT_TEXT_MODEL`:GPT 文本 / 视觉默认模型,默认 `gpt-4o`;用于兜底修正旧 Gemini 覆盖值
- `REWRITE_MODEL`:通用改写/分镜描述模型,默认 `gpt-4o`;如果旧环境仍写 `gemini-*`,后端会自动改用 `GPT_TEXT_MODEL`
- `VISION_MODEL`:关键帧画面理解模型,默认 `gpt-4o`;如果旧环境仍写 `gemini-*`,后端会自动改用 `GPT_TEXT_MODEL`
- `AUDIO_REWRITE_MODEL`:后续音频口播改写模型,默认跟随 `REWRITE_MODEL`;如果旧环境仍写 `gemini-*`,后端会自动改用 `REWRITE_MODEL`
- `AUDIO_PRODUCT_BRIEF`:音频口播改写时注入的 SKG 产品卖点
- `PRODUCT_VIEW_MODEL`:同一产品素材池的视角标注/自动识别模型;当前按项目要求强制使用 `gpt-image-2`
- `IMAGE_BASE_URL` / `IMAGE_API_KEY` / `IMAGE_MODEL`:OpenAI 兼容生图网关;当前所有生图入口一律强制使用 `gpt-image-2`,不做其他图片模型 fallback
- `GPT_IMAGE_MODEL` / `SUBJECT_ASSET_IMAGE_MODEL` / `SUBJECT_ASSET_IMAGE_MODELS`:保留兼容旧环境变量名,但服务端会强制主体 6 视图和所有其他生图入口都只使用 `gpt-image-2`
- `AI_HTTP_PROXY` / `IMAGE_HTTP_PROXY`:可选的 AI 网关出站代理;本地 launchd 后台进程不一定继承 shell 的 `http_proxy/https_proxy`,如生图报 DNS / ConnectError,可在本地 `api/.env` 配置后重启后端。`/health` 只回传是否配置代理,不回传代理地址。
- `VOICE_PROVIDER`:配音通道,当前固定使用 `azure_openai`
- `YTDLP_COOKIES_FILE` / `YTDLP_COOKIES_FROM_BROWSER`:可选 TikTok 下载登录态;优先使用 cookies 文件,其次读取本机浏览器 cookies。cookies 文件属于敏感登录态,只能放本机或服务器私有路径,不允许入库。
- `VOICE_PROVIDER`:配音通道,服务端固定使用 `azure_openai`;旧环境若写 `minimax` 会被忽略
- `AZURE_OPENAI_BASE_URL` / `AZURE_OPENAI_API_KEY`:微软 Azure OpenAI 协议配音网关;本地未单独配置 Key 时回退复用 `LLM_API_KEY`
- `AZURE_TTS_MODEL` / `AZURE_TTS_VOICE_ID` / `AZURE_TTS_VOICE_POOL` / `AZURE_TTS_PATH`Azure OpenAI TTS 模型、默认音色、音色池和 OpenAI 协议语音路径
- `MINIMAX_API_KEY`MiniMax T2A 配音 Key只能放本地 `api/.env`,不能入库;当前第一步暂不默认调用
- `MINIMAX_TTS_BASE_URL` / `MINIMAX_TTS_MODEL` / `MINIMAX_TTS_VOICE_ID`MiniMax 旧配音端点、模型和兜底音色配置,仅作为保留兼容;当前不作为默认语音通道
- `MINIMAX_TTS_VOICE_POOL`MiniMax 英文随机音色池;当前默认男声 `English_magnetic_voiced_man`、女声 `English_Upbeat_Woman`、成熟声 `English_MaturePartner`,供后续新配音阶段使用
- `AZURE_TTS_MODEL` / `AZURE_TTS_VOICE_ID` / `AZURE_TTS_VOICE_POOL` / `AZURE_TTS_PATH` / `AZURE_TTS_PATHS`:Azure OpenAI TTS 模型、默认音色、音色池和 OpenAI 协议语音路径;后端会按 `AZURE_TTS_PATHS` 依次尝试,便于区分路径不对和整条语音服务不可用
- `POE_API_KEY` / `VIDEO_API_KEY`:视频生成通道 Key,只能放本地环境变量
- `WEB_AUTH_USERNAME` / `WEB_AUTH_PASSWORD` / `WEB_AUTH_SESSION_SECRET`:生产网页登录和会话签名配置;密码和 session secret 只放服务器环境变量,不入库
- `FFMPEG_BIN` / `FFPROBE_BIN`:可选本地媒体二进制路径;本机 Homebrew ffmpeg 动态库损坏时,后端会自动跳过不可用的 PATH 版本并尝试本机静态 ffmpeg 备选,生产仍建议使用系统 ffmpeg/ffprobe

View File

@@ -17,7 +17,9 @@ LOCAL_ASR_BIN=/opt/homebrew/bin/mlx_whisper
LOCAL_ASR_MODEL=mlx-community/whisper-tiny
LOCAL_ASR_TIMEOUT_SECONDS=180
TRANSLATE_MODEL=gemini-2.5-flash
REWRITE_MODEL=gemini-2.5-pro
GPT_TEXT_MODEL=gpt-4o
REWRITE_MODEL=gpt-4o
VISION_MODEL=gpt-4o
PRODUCT_VIEW_MODEL=gpt-image-2
IMAGE_BASE_URL=https://ai.skg.com/ezlink/v1
IMAGE_API_KEY=
@@ -27,6 +29,8 @@ SUBJECT_ASSET_IMAGE_MODEL=gpt-image-2
SUBJECT_ASSET_IMAGE_MODELS=gpt-image-2
# 可选:本地网络需要代理访问 ai.skg.com 时配置;launchd 不一定继承 shell 代理变量。
AI_HTTP_PROXY=
YTDLP_COOKIES_FILE=
YTDLP_COOKIES_FROM_BROWSER=
VIDEO_MODEL=seedance
VIDEO_MODEL_SEEDANCE=seedance-2-fast
VIDEO_MODEL_KLING=kling-omni
@@ -35,6 +39,7 @@ VIDEO_MODEL_VEO3=veo-3.1-fast
# 音频文案改写 + Azure OpenAI 配音
AUDIO_REWRITE_MODEL=gemini-2.5-pro
AUDIO_PRODUCT_BRIEF="SKG 智能按摩产品,主打日常肩颈、腰背、眼部、膝盖或足部放松;广告表达要高级、干净、可信,不做医疗疗效承诺。"
# 语音通道服务端固定为 Azure OpenAI。
VOICE_PROVIDER=azure_openai
AZURE_OPENAI_BASE_URL=https://ai.skg.com/azure
AZURE_OPENAI_API_KEY=
@@ -42,13 +47,7 @@ AZURE_TTS_MODEL=gpt-4o-mini-tts
AZURE_TTS_VOICE_ID=alloy
AZURE_TTS_VOICE_POOL=alloy,verse,shimmer
AZURE_TTS_PATH=/audio/speech
# MiniMax 旧配音通道,保留兼容;默认不走
MINIMAX_API_KEY=
MINIMAX_TTS_BASE_URL=https://api.minimax.io
MINIMAX_TTS_MODEL=speech-2.8-turbo
MINIMAX_TTS_VOICE_ID=English_expressive_narrator
MINIMAX_TTS_VOICE_POOL=English_magnetic_voiced_man,English_Upbeat_Woman,English_MaturePartner
AZURE_TTS_PATHS=/audio/speech,/v1/audio/speech
# Poe 视频 API,优先用于 Seedance / Kling / Veo
POE_API_BASE_URL=https://api.poe.com/v1

View File

@@ -1,6 +1,6 @@
# SKG TK 二创 API
FastAPI 后端,跑 yt-dlp + ffmpeg + ASR/翻译/英文 SKG 产品介绍文案 + MiniMax 英文配音管线。
FastAPI 后端,跑 yt-dlp + ffmpeg + ASR/翻译/英文 SKG 产品介绍文案 + Azure OpenAI 英文配音管线。
## 启动
@@ -9,7 +9,7 @@ cd api
python3 -m venv .venv
source .venv/bin/activate
pip install -r requirements.txt
cp .env.example .env # 按需填 LLM_API_KEY / MINIMAX_API_KEY
cp .env.example .env # 按需填 LLM_API_KEY / AZURE_OPENAI_API_KEY
uvicorn main:app --host 127.0.0.1 --port 4291
```
@@ -20,19 +20,19 @@ uvicorn main:app --host 127.0.0.1 --port 4291
- `GET /health` — 健康检查 + 配置状态
- `POST /jobs` `{url}` — 创建 job,后台下载源视频;视频就绪后可手动解析或提取音频
- `GET /jobs/{id}` — 当前状态 + 产物;若原始音轨已拆出,会返回 `source_audio_url`
- `POST /jobs/{id}/transcribe` — 触发音频提取 + ASR + 翻译 + SKG 英文产品介绍文案;文案长度按原音频时长估算,配置 MiniMax 后从英文随机音色池生成配音。前端 Audio 节点提供“提取音频 / 重新提取音频”按钮,可与抽帧并行,不自动触发
- `POST /jobs/{id}/transcribe` — 触发音频提取 + ASR + 翻译 + SKG 英文产品介绍文案;文案长度按原音频时长估算,配置 Azure OpenAI TTS 后从 Azure 音色池生成配音。前端 Audio 节点提供“提取音频 / 重新提取音频”按钮,可与抽帧并行,不自动触发
- `GET /jobs/{id}/video.mp4` — 原视频
- `GET /jobs/{id}/audio.wav` — 拆轨后的原始音频,供前端底部音频条生成波形
- `GET /jobs/{id}/audio-script.mp3` — 英文改写文案的 MiniMax 配音
- `GET /jobs/{id}/audio-script.mp3` — 英文改写文案的 Azure OpenAI TTS 配音
- `GET /jobs/{id}/frames/{i}.jpg` — 第 i 张关键帧(0-9)
## Mock 模式
未设 `LLM_API_KEY` 时,转录走本地 mock便于 UI 联调;未设 `MINIMAX_API_KEY` 时只生成改写文案,不生成配音文件。
未设 `LLM_API_KEY` 时,转录走本地 mock,便于 UI 联调;未设 `AZURE_OPENAI_API_KEY` 且无法复用 `LLM_API_KEY` 时只生成改写文案,不生成配音文件。
## 依赖
- `ffmpeg` 系统二进制(拆轨 / 抽帧)
- `yt-dlp` 系统二进制(也可走 Python 包)
- OpenAI 兼容 LLM 网关(ASR / 翻译 / 文案改写);如果 `/audio/transcriptions` 不可用,会用 `ASR_FALLBACK_MODEL` 走 Gemini 多模态音频识别
- MiniMax T2A HTTP(英文产品介绍文案配音,使用 `MINIMAX_API_KEY`;默认随机音色池 `English_magnetic_voiced_man,English_Upbeat_Woman,English_MaturePartner`
- Azure OpenAI TTS(英文产品介绍文案配音,使用 `AZURE_OPENAI_API_KEY` 或回退复用 `LLM_API_KEY`;默认音色池 `alloy,verse,shimmer`)

View File

@@ -8,6 +8,7 @@
"name": "运动阳光男",
"folder": "01_运动阳光男",
"description": "运动阳光男透明骨架人角色含正面、左右45度、侧面、背面、半身近景和背部特写参考。",
"prompt_brief": "Athletic sunny male transparent wellness character, young adult energy, lean fit proportions, open and upbeat posture, clean translucent skin shell with visible white skeleton. The character should feel friendly, active, outdoor-sport inspired, bright, healthy, and suitable for premium SKG neck-and-shoulder wearable device ads. Keep neck, collarbone, shoulders, upper back, and cervical spine readable without bulky clothing or props.",
"primary_image": "character-01-front",
"images": [
{
@@ -80,6 +81,7 @@
"name": "都市型男",
"folder": "02_都市型男",
"description": "都市型男透明骨架人角色含正面、左右45度、侧面、背面、半身近景和背部特写参考。",
"prompt_brief": "Urban stylish male transparent wellness character, adult metropolitan feel, clean confident posture, refined proportions, translucent body shell with visible white skeleton. The commercial mood is premium city lifestyle, composed, sharp, and modern, suitable for office or commute-oriented SKG neck-and-shoulder massage ads. Keep shoulder line, side neck, collarbone, and upper back clear for wearable device placement.",
"primary_image": "character-02-front",
"images": [
{
@@ -152,6 +154,7 @@
"name": "优雅白领女",
"folder": "03_优雅白领女",
"description": "优雅白领女透明骨架人角色含正面、左右45度、侧面、背面、半身近景和背部特写参考。",
"prompt_brief": "Elegant professional female transparent wellness character, young adult to adult office-worker mood, slim balanced proportions, calm poised posture, translucent outer body with a clean visible white skeleton. The style should feel premium, gentle, trustworthy, and workplace-friendly for SKG neck-and-shoulder wearable device ads. Keep hair, collars, and accessories from hiding the neck, shoulders, collarbone, upper back, and cervical spine.",
"primary_image": "character-03-front",
"images": [
{
@@ -224,6 +227,7 @@
"name": "运动辣妹",
"folder": "04_运动辣妹",
"description": "运动辣妹透明骨架人角色含正面、左右45度、侧面、背面、半身近景和背部特写参考。",
"prompt_brief": "Sporty confident female transparent wellness character, energetic young adult fitness mood, toned proportions, expressive posture, translucent skin shell with visible white skeleton. The character should feel active, fashionable, bright, and creator-ad friendly while remaining premium and non-horror. Keep the neck, side neck, shoulders, collarbone, upper trapezius, and upper back open and readable for SKG wearable massage device scenes.",
"primary_image": "character-04-front",
"images": [
{
@@ -296,6 +300,7 @@
"name": "绅士大叔",
"folder": "05_绅士大叔",
"description": "绅士大叔透明骨架人角色含正面、左右45度、侧面、背面、半身近景和背部特写参考。",
"prompt_brief": "Mature gentleman transparent wellness character, adult to middle-aged presence without exact age, steady confident posture, slightly stronger build, translucent body shell with a clean visible white skeleton. The commercial mood is calm, trustworthy, premium, and lifestyle-oriented for SKG neck-and-shoulder wearable device ads. Keep collars and styling minimal so the neck, shoulders, upper back, cervical spine, and shoulder blades remain visible.",
"primary_image": "character-05-front",
"images": [
{
@@ -364,4 +369,4 @@
]
}
]
}
}

View File

@@ -52,8 +52,18 @@ LOCAL_ASR_BIN = os.getenv("LOCAL_ASR_BIN", "").strip()
LOCAL_ASR_MODEL = os.getenv("LOCAL_ASR_MODEL", "mlx-community/whisper-tiny").strip() or "mlx-community/whisper-tiny"
LOCAL_ASR_TIMEOUT_SECONDS = max(30, int(os.getenv("LOCAL_ASR_TIMEOUT_SECONDS", "180")))
TRANSLATE_MODEL = os.getenv("TRANSLATE_MODEL", "gemini-2.5-flash")
REWRITE_MODEL = os.getenv("REWRITE_MODEL", "gemini-2.5-pro")
VISION_MODEL = os.getenv("VISION_MODEL", "gemini-2.5-flash")
DEFAULT_GPT_TEXT_MODEL = os.getenv("GPT_TEXT_MODEL", "gpt-4o").strip() or "gpt-4o"
def gpt_model_env(name: str, default: str | None = None) -> str:
value = os.getenv(name, default or DEFAULT_GPT_TEXT_MODEL).strip()
if not value or value.lower().startswith("gemini-"):
return default or DEFAULT_GPT_TEXT_MODEL
return value
REWRITE_MODEL = gpt_model_env("REWRITE_MODEL")
VISION_MODEL = gpt_model_env("VISION_MODEL")
IMAGE_BASE_URL = os.getenv("IMAGE_BASE_URL", LLM_BASE_URL).strip()
IMAGE_API_KEY = os.getenv("IMAGE_API_KEY", LLM_API_KEY).strip()
AI_HTTP_PROXY = (
@@ -77,29 +87,14 @@ PRODUCT_ASSET_MIN_LONG_SIDE = max(512, int(os.getenv("PRODUCT_ASSET_MIN_LONG_SID
PRODUCT_ASSET_MIN_SHORT_SIDE = max(320, int(os.getenv("PRODUCT_ASSET_MIN_SHORT_SIDE", "600")))
PRODUCT_ASSET_JPEG_QUALITY = max(80, min(95, int(os.getenv("PRODUCT_ASSET_JPEG_QUALITY", "92"))))
VIDEO_MODEL = os.getenv("VIDEO_MODEL", "seedance").strip() or "seedance"
YTDLP_COOKIES_FILE = os.getenv("YTDLP_COOKIES_FILE", "").strip()
YTDLP_COOKIES_FROM_BROWSER = os.getenv("YTDLP_COOKIES_FROM_BROWSER", "").strip()
AUDIO_PRODUCT_BRIEF = os.getenv(
"AUDIO_PRODUCT_BRIEF",
"SKG 智能按摩产品,主打日常肩颈、腰背、眼部、膝盖或足部放松;广告表达要高级、干净、可信,不做医疗疗效承诺。",
).strip()
AUDIO_REWRITE_MODEL = os.getenv("AUDIO_REWRITE_MODEL", REWRITE_MODEL).strip() or REWRITE_MODEL
MINIMAX_API_KEY = os.getenv("MINIMAX_API_KEY", "").strip()
MINIMAX_TTS_BASE_URL = os.getenv("MINIMAX_TTS_BASE_URL", "https://api.minimax.io").strip().rstrip("/")
MINIMAX_TTS_MODEL = os.getenv("MINIMAX_TTS_MODEL", "speech-2.8-turbo").strip() or "speech-2.8-turbo"
MINIMAX_TTS_VOICE_ID = os.getenv(
"MINIMAX_TTS_VOICE_ID",
"English_expressive_narrator",
).strip() or "English_expressive_narrator"
DEFAULT_MINIMAX_TTS_VOICE_POOL = [
"English_magnetic_voiced_man",
"English_Upbeat_Woman",
"English_MaturePartner",
]
MINIMAX_TTS_VOICE_POOL = [
v.strip()
for v in os.getenv("MINIMAX_TTS_VOICE_POOL", ",".join(DEFAULT_MINIMAX_TTS_VOICE_POOL)).split(",")
if v.strip()
]
VOICE_PROVIDER = os.getenv("VOICE_PROVIDER", "azure_openai").strip().lower() or "azure_openai"
AUDIO_REWRITE_MODEL = gpt_model_env("AUDIO_REWRITE_MODEL", REWRITE_MODEL)
VOICE_PROVIDER = "azure_openai"
AZURE_OPENAI_BASE_URL = os.getenv("AZURE_OPENAI_BASE_URL", "https://ai.skg.com/azure").strip().rstrip("/")
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY", LLM_API_KEY).strip()
AZURE_TTS_MODEL = os.getenv("AZURE_TTS_MODEL", "gpt-4o-mini-tts").strip() or "gpt-4o-mini-tts"
@@ -111,6 +106,11 @@ AZURE_TTS_VOICE_POOL = [
if v.strip()
]
AZURE_TTS_PATH = os.getenv("AZURE_TTS_PATH", "/audio/speech").strip() or "/audio/speech"
# Ordered candidate TTS paths. The built-in default repeats AZURE_TTS_PATH when it
# equals "/audio/speech", so duplicates are removed here (order preserved) to keep
# the list reported by /health free of repeated entries.
AZURE_TTS_PATHS = list(
    dict.fromkeys(
        p.strip()
        for p in os.getenv("AZURE_TTS_PATHS", f"{AZURE_TTS_PATH},/audio/speech,/v1/audio/speech").split(",")
        if p.strip()
    )
)
POE_API_BASE_URL = os.getenv("POE_API_BASE_URL", "https://api.poe.com/v1").strip() or "https://api.poe.com/v1"
POE_API_KEY = os.getenv("POE_API_KEY", "").strip()
@@ -452,6 +452,7 @@ class CharacterLibraryItem(BaseModel):
name: str
folder: str = ""
description: str = ""
prompt_brief: str = ""
primary_image: str = ""
images: list[CharacterLibraryImage] = Field(default_factory=list)
@@ -477,6 +478,7 @@ class SubjectTemplateItem(BaseModel):
name: str
description: str = ""
note: str = ""
prompt_brief: str = ""
source: Literal["database"] = "database"
source_job_id: str = ""
source_frame_idx: int = -1
@@ -1075,6 +1077,35 @@ def run(cmd: list[str], cwd: Path | None = None) -> str:
return res.stdout
def ytdlp_cookie_args() -> list[str]:
    """Build the yt-dlp command-line arguments that supply login cookies.

    ``YTDLP_COOKIES_FILE`` takes precedence over ``YTDLP_COOKIES_FROM_BROWSER``.

    Returns:
        ``["--cookies", <path>]``, ``["--cookies-from-browser", <browser>]``,
        or an empty list when neither setting is configured.

    Raises:
        RuntimeError: when ``YTDLP_COOKIES_FILE`` is set but the expanded path
            is missing or is a directory.
    """
    if YTDLP_COOKIES_FILE:
        cookies = Path(YTDLP_COOKIES_FILE).expanduser()
        # A directory passes exists() but is not a cookies file, so report it
        # as the same configuration error instead of passing it to yt-dlp.
        if not cookies.exists() or cookies.is_dir():
            raise RuntimeError("TikTok cookies 文件不可用,请检查 YTDLP_COOKIES_FILE 配置。")
        return ["--cookies", str(cookies)]
    if YTDLP_COOKIES_FROM_BROWSER:
        return ["--cookies-from-browser", YTDLP_COOKIES_FROM_BROWSER]
    return []
def normalize_download_error(error: Exception) -> str:
    """Turn a download exception into the error text stored on the job.

    When the message looks like a login / cookies requirement, a hint about
    uploading the file or configuring the yt-dlp cookie settings is placed in
    front of the original text; any other message is returned unchanged.
    """
    raw = str(error)
    text = raw.lower()
    # Each entry is one independent signal that the download needs a login.
    login_signals = (
        "log in for access" in text,
        "login" in text and "cookies" in text,
        "cookies-from-browser" in text,
        "sign in" in text and "tiktok" in text,
    )
    if not any(login_signals):
        return raw
    return (
        "TikTok 下载需要登录态。请上传视频文件,或在后端配置 "
        "YTDLP_COOKIES_FILE / YTDLP_COOKIES_FROM_BROWSER 后重试。"
        f"原始错误:{raw}"
    )
# ---- 启发式选帧工具 ----
import imagehash
import numpy as np
@@ -1728,13 +1759,15 @@ def pipeline_download(job_id: str) -> None:
update(job, status="downloading", message="本地上传 · 跳过下载", progress=15)
else:
update(job, status="downloading", message="yt-dlp 下载中…", progress=5)
run([
cmd = [
"yt-dlp", "-f", "best[ext=mp4]/best",
"-o", str(mp4),
"--no-warnings", "--no-playlist",
"--retries", "3",
*ytdlp_cookie_args(),
job.url,
])
]
run(cmd)
if not mp4.exists():
raise RuntimeError("下载完成但找不到 source.mp4")
@@ -1757,7 +1790,7 @@ def pipeline_download(job_id: str) -> None:
)
except Exception as e:
message = "视频元数据解析失败" if stage == "metadata" else "下载失败"
update(job, status="failed", error=str(e), message=message)
update(job, status="failed", error=normalize_download_error(e), message=message)
def pipeline_analyze(
@@ -1929,7 +1962,7 @@ def analyze_queue_worker() -> None:
ANALYZE_WORKER_RUNNING = False
# ---------- 音频转写 + 翻译 + SKG 改写 + MiniMax 配音 ----------
# ---------- 音频转写 + 翻译 + SKG 改写 + Azure OpenAI 配音 ----------
class TranscriptionUnavailable(RuntimeError):
pass
@@ -2385,18 +2418,6 @@ def _rewrite_audio_script_sync(segments: list[TranscriptSegment], target_seconds
return fallback, f"改写失败,使用本地模板:{e}"
def _minimax_tts_url() -> str:
if MINIMAX_TTS_BASE_URL.endswith("/v1/t2a_v2"):
return MINIMAX_TTS_BASE_URL
return f"{MINIMAX_TTS_BASE_URL}/v1/t2a_v2"
def _choose_minimax_voice_id() -> str:
if MINIMAX_TTS_VOICE_POOL:
return random.choice(MINIMAX_TTS_VOICE_POOL)
return MINIMAX_TTS_VOICE_ID
def _choose_azure_voice_id() -> str:
if AZURE_TTS_VOICE_POOL:
return random.choice(AZURE_TTS_VOICE_POOL)
@@ -2404,9 +2425,7 @@ def _choose_azure_voice_id() -> str:
def _choose_tts_voice_id() -> str:
if VOICE_PROVIDER == "azure_openai":
return _choose_azure_voice_id()
return _choose_minimax_voice_id()
return _choose_azure_voice_id()
def _voice_speed_for(voice_id: str, target_seconds: float, text: str) -> float:
@@ -2423,60 +2442,22 @@ def _voice_speed_for(voice_id: str, target_seconds: float, text: str) -> float:
return 0.99
def _minimax_tts_sync(job_id: str, text: str, voice_id: str, target_seconds: float = 12.0) -> str:
if not MINIMAX_API_KEY:
raise RuntimeError("MINIMAX_API_KEY 未配置,未生成配音")
if not text.strip():
raise RuntimeError("改写文案为空,未生成配音")
payload = {
"model": MINIMAX_TTS_MODEL,
"text": text.strip()[:9500],
"stream": False,
"language_boost": "English",
"output_format": "hex",
"voice_setting": {
"voice_id": voice_id,
"speed": _voice_speed_for(voice_id, target_seconds, text),
"vol": 1,
"pitch": 0,
},
"audio_setting": {
"sample_rate": 32000,
"bitrate": 128000,
"format": "mp3",
"channel": 1,
},
}
resp = httpx.post(
_minimax_tts_url(),
headers={"Authorization": f"Bearer {MINIMAX_API_KEY}", "Content-Type": "application/json"},
json=payload,
timeout=90,
)
resp.raise_for_status()
data = resp.json()
base_resp = data.get("base_resp") or {}
if int(base_resp.get("status_code", 0) or 0) != 0:
raise RuntimeError(base_resp.get("status_msg") or "MiniMax TTS 返回失败")
audio_hex = ((data.get("data") or {}).get("audio") or "").strip()
if not audio_hex:
raise RuntimeError("MiniMax TTS 未返回 audio hex")
try:
audio_bytes = bytes.fromhex(audio_hex)
except ValueError as e:
raise RuntimeError(f"MiniMax TTS audio hex 无法解析:{e}") from e
out = job_dir(job_id) / "audio_script.mp3"
out.write_bytes(audio_bytes)
return f"/jobs/{job_id}/audio-script.mp3"
def _azure_tts_url() -> str:
path = AZURE_TTS_PATH if AZURE_TTS_PATH.startswith("/") else f"/{AZURE_TTS_PATH}"
def _azure_tts_url_for(path_value: str) -> str:
    """Join ``AZURE_OPENAI_BASE_URL`` with one TTS path.

    A leading slash is added to ``path_value`` when it is missing. If the base
    URL already ends with that path, the base URL is returned unchanged.
    """
    suffix = path_value if path_value.startswith("/") else "/" + path_value
    if not AZURE_OPENAI_BASE_URL.endswith(suffix):
        return AZURE_OPENAI_BASE_URL + suffix
    return AZURE_OPENAI_BASE_URL
def _azure_tts_urls() -> list[str]:
    """Return the candidate TTS URLs in configured order, without duplicates.

    Uses ``AZURE_TTS_PATHS`` when it is non-empty, otherwise the single
    ``AZURE_TTS_PATH``.
    """
    candidate_paths = AZURE_TTS_PATHS or [AZURE_TTS_PATH]
    # dict.fromkeys keeps the first occurrence of each URL and preserves order.
    return list(dict.fromkeys(_azure_tts_url_for(item) for item in candidate_paths))
def _azure_openai_tts_sync(job_id: str, text: str, voice_id: str, target_seconds: float = 12.0) -> str:
if not AZURE_OPENAI_API_KEY:
raise RuntimeError("AZURE_OPENAI_API_KEY 或 LLM_API_KEY 未配置,未生成配音")
@@ -2489,18 +2470,32 @@ def _azure_openai_tts_sync(job_id: str, text: str, voice_id: str, target_seconds
"response_format": "mp3",
"speed": _voice_speed_for(voice_id, target_seconds, text),
}
resp = httpx.post(
_azure_tts_url(),
headers={
"Authorization": f"Bearer {AZURE_OPENAI_API_KEY}",
"api-key": AZURE_OPENAI_API_KEY,
"Content-Type": "application/json",
},
json=payload,
timeout=120,
)
headers = {
"Authorization": f"Bearer {AZURE_OPENAI_API_KEY}",
"api-key": AZURE_OPENAI_API_KEY,
"Content-Type": "application/json",
}
resp: httpx.Response | None = None
errors: list[str] = []
with ai_http_client(timeout=120) as client:
for url in _azure_tts_urls():
try:
current = client.post(url, headers=headers, json=payload)
except Exception as e:
errors.append(f"{url}: {type(e).__name__}: {e}")
continue
if current.status_code < 400:
resp = current
break
errors.append(f"{url}: HTTP {current.status_code}: {current.text[:180]}")
if current.status_code not in {404, 405}:
resp = current
break
if resp is None:
raise RuntimeError("Azure OpenAI TTS 不可用;已尝试 " + " | ".join(errors))
if resp.status_code >= 400:
raise RuntimeError(f"Azure OpenAI TTS HTTP {resp.status_code}: {resp.text[:300]}")
detail = " | ".join(errors) or resp.text[:300]
raise RuntimeError(f"Azure OpenAI TTS HTTP {resp.status_code}: {detail[:600]}")
audio_bytes = resp.content
if not audio_bytes:
raise RuntimeError("Azure OpenAI TTS 未返回音频内容")
@@ -2517,9 +2512,7 @@ def _azure_openai_tts_sync(job_id: str, text: str, voice_id: str, target_seconds
def _tts_sync(job_id: str, text: str, voice_id: str, target_seconds: float = 12.0) -> tuple[str, str, str]:
if VOICE_PROVIDER == "azure_openai":
return _azure_openai_tts_sync(job_id, text, voice_id, target_seconds), "azure_openai", AZURE_TTS_MODEL
return _minimax_tts_sync(job_id, text, voice_id, target_seconds), "minimax", MINIMAX_TTS_MODEL
return _azure_openai_tts_sync(job_id, text, voice_id, target_seconds), "azure_openai", AZURE_TTS_MODEL
def _build_audio_script_sync(job_id: str, segments: list[TranscriptSegment], target_seconds: float = 12.0) -> AudioScript:
@@ -2531,8 +2524,8 @@ def _build_audio_script_sync(job_id: str, segments: list[TranscriptSegment], tar
speaker_profile, rhythm_profile = _audio_delivery_profile(segments, duration, selected_voice_id)
voice_url = ""
voice_error = ""
voice_provider = "azure_openai" if VOICE_PROVIDER == "azure_openai" else "minimax"
voice_model = AZURE_TTS_MODEL if voice_provider == "azure_openai" else MINIMAX_TTS_MODEL
voice_provider = "azure_openai"
voice_model = AZURE_TTS_MODEL
try:
voice_url, voice_provider, voice_model = _tts_sync(job_id, rewritten, selected_voice_id, duration)
except Exception as e:
@@ -2944,6 +2937,83 @@ def _image_text_call(
raise RuntimeError(_image_failure_message("image text", max_attempts, last_err, capacity_seen))
def _image_path_to_data_url(path: Path) -> str:
media_type = "image/png" if path.suffix.lower() == ".png" else "image/jpeg"
return f"data:{media_type};base64,{base64.b64encode(path.read_bytes()).decode('ascii')}"
def _vision_brief_from_images(image_paths: list[Path], prompt: str, max_images: int = 8) -> str:
    """Ask the vision model for a JSON description of images and return a text brief.

    Args:
        image_paths: candidate image files; paths that do not exist are dropped.
        prompt: instruction text sent ahead of the images.
        max_images: maximum number of existing images attached to the request.

    Returns:
        The stripped ``brief`` value from the model's JSON answer, or, when it
        is missing, a ``"; "``-joined summary of the known descriptive keys.
        Either form is capped at 1800 characters. An empty string is returned
        when there are no existing images, ``LLM_API_KEY`` is unset, the
        request or JSON parsing fails, or the answer has no usable field.
    """
    paths = [path for path in image_paths if path.exists()][:max_images]
    if not paths:
        return ""
    if not LLM_API_KEY:
        return ""
    content: list[dict] = [{"type": "text", "text": prompt}]
    for path in paths:
        content.append({"type": "image_url", "image_url": {"url": _image_path_to_data_url(path)}})
    try:
        resp = llm().chat.completions.create(
            model=VISION_MODEL,
            messages=[{"role": "user", "content": content}],
            response_format={"type": "json_object"},
            temperature=0.1,
            max_tokens=1400,
        )
        raw = (resp.choices[0].message.content or "").strip()
        if not raw:
            raw = (getattr(resp.choices[0].message, "reasoning_content", "") or "").strip()
        # Keep only the outermost {...} span in case the model wrapped the JSON in extra text.
        match = re.search(r"\{[\s\S]*\}", raw)
        raw = match.group(0) if match else raw
        data = json.loads(raw)
    except Exception as e:
        # Best effort: callers treat an empty brief as "no brief available".
        print(f"[vision brief failed] {e}", flush=True)
        return ""
    if isinstance(data, dict):
        if isinstance(data.get("brief"), str) and data["brief"].strip():
            return data["brief"].strip()[:1800]
        parts: list[str] = []
        # Covers the keys requested by both the source-subject prompt and the
        # subject-template prompt, so neither loses fields when "brief" is absent.
        for key in (
            "gender_presentation", "age_range", "body_proportion", "hair", "skin_tone",
            "material_or_skin", "wardrobe_style", "wardrobe_or_surface_style", "pose_language",
            "camera_visibility", "camera_readability", "commercial_mood",
            "neck_shoulder_readiness", "style_constraints",
        ):
            value = data.get(key)
            if isinstance(value, str) and value.strip():
                parts.append(f"{key.replace('_', ' ')}: {value.strip()}")
        if parts:
            return "; ".join(parts)[:1800]
    return ""
def _describe_source_subject(job_id: str, source_indices: list[int]) -> str:
    """Build a non-identifying visual brief from source keyframes for similar-subject generation.

    The keyframe paths for ``source_indices`` and a fixed instruction prompt are
    passed to ``_vision_brief_from_images`` with at most 8 images; its result
    is returned as-is.
    """
    frame_paths: list[Path] = []
    for frame_idx in source_indices:
        frame_paths.append(_source_frame_path(job_id, frame_idx))
    instructions = "".join([
        "You are preparing a non-identifying character brief for generating a NEW similar but non-identical ad subject. ",
        "Look at these source video keyframes as evidence of one role and style, not as a person to identify. ",
        "Do NOT identify the person, do NOT estimate exact age, do NOT describe biometric identity, and do NOT mention celebrity or real-person likeness. ",
        "Output strict JSON only. Use broad style traits suitable for text-to-image generation.\n",
        "Required keys: gender_presentation, age_range, body_proportion, hair, skin_tone, wardrobe_style, ",
        "pose_language, camera_visibility, commercial_mood, neck_shoulder_readiness, style_constraints, brief.\n",
        "The brief should be 80-140 words and should preserve category, role, energy, camera readability, and commercial atmosphere while explicitly allowing a new non-identical subject.",
    ])
    return _vision_brief_from_images(frame_paths, instructions, max_images=8)
def _describe_subject_template_from_images(name: str, subject_style: str, image_paths: list[Path], note: str = "") -> str:
    """Summarize a saved subject template's views into a reusable text brief.

    The template name, subject style and the first 500 characters of the user
    note are embedded in the instruction prompt, which is sent together with
    up to 10 of the given images through ``_vision_brief_from_images``.
    """
    instructions = "".join((
        f"You are summarizing a saved SKG subject template named '{name}' for future text-to-image generation. ",
        f"Subject style: {subject_style}. User note: {note[:500]}. ",
        "Look at the subject views and describe the reusable creative direction without copying identity or pixels. ",
        "Do NOT identify a person and do NOT describe exact facial identity. ",
        "Output strict JSON only with keys: gender_presentation, age_range, body_proportion, material_or_skin, ",
        "wardrobe_or_surface_style, pose_language, camera_readability, neck_shoulder_readiness, commercial_mood, brief. ",
        "The brief should be 80-140 words and must be useful as a reference character brief for creating a new innovative variation.",
    ))
    return _vision_brief_from_images(image_paths, instructions, max_images=10)
# ---------- API 路由 ----------
class CreateJobReq(BaseModel):
@@ -3130,7 +3200,7 @@ def health() -> dict:
"auth_configured": WEB_AUTH_CONFIGURED,
"base_url": LLM_BASE_URL or "openai-default",
"image_base_url": IMAGE_BASE_URL or LLM_BASE_URL or "openai-default",
"voice_base_url": AZURE_OPENAI_BASE_URL if VOICE_PROVIDER == "azure_openai" else MINIMAX_TTS_BASE_URL,
"voice_base_url": AZURE_OPENAI_BASE_URL,
"models": {
"asr": ASR_MODEL,
"local_asr": LOCAL_ASR_MODEL,
@@ -3147,15 +3217,12 @@ def health() -> dict:
"subject_image": SUBJECT_ASSET_IMAGE_MODEL,
"subject_image_fallbacks": SUBJECT_ASSET_IMAGE_MODELS,
"voice_provider": VOICE_PROVIDER,
"voice_base_url": AZURE_OPENAI_BASE_URL if VOICE_PROVIDER == "azure_openai" else MINIMAX_TTS_BASE_URL,
"voice_tts": AZURE_TTS_MODEL if VOICE_PROVIDER == "azure_openai" else MINIMAX_TTS_MODEL,
"voice_id": AZURE_TTS_VOICE_ID if VOICE_PROVIDER == "azure_openai" else MINIMAX_TTS_VOICE_ID,
"voice_pool": AZURE_TTS_VOICE_POOL if VOICE_PROVIDER == "azure_openai" else (MINIMAX_TTS_VOICE_POOL or [MINIMAX_TTS_VOICE_ID]),
"voice_configured": bool(AZURE_OPENAI_API_KEY) if VOICE_PROVIDER == "azure_openai" else bool(MINIMAX_API_KEY),
"minimax_tts": MINIMAX_TTS_MODEL,
"minimax_voice": MINIMAX_TTS_VOICE_ID,
"minimax_voice_pool": MINIMAX_TTS_VOICE_POOL or [MINIMAX_TTS_VOICE_ID],
"minimax_configured": bool(MINIMAX_API_KEY),
"voice_base_url": AZURE_OPENAI_BASE_URL,
"voice_tts": AZURE_TTS_MODEL,
"voice_tts_paths": AZURE_TTS_PATHS,
"voice_id": AZURE_TTS_VOICE_ID,
"voice_pool": AZURE_TTS_VOICE_POOL,
"voice_configured": bool(AZURE_OPENAI_API_KEY),
"video": VIDEO_MODEL,
"video_aliases": VIDEO_MODEL_ALIASES,
"video_provider": video_provider_name(),
@@ -3225,6 +3292,31 @@ async def create_job(req: CreateJobReq, bg: BackgroundTasks) -> Job:
return job
@app.post("/jobs/{job_id}/download/retry", response_model=Job)
async def retry_job_download(job_id: str, bg: BackgroundTasks) -> Job:
    """Re-queue the source download of an existing URL-based job.

    Raises:
        HTTPException: 404 when the job id is unknown; 409 when the job came
            from an upload, or when its status is downloading, splitting or
            transcribing.
    """
    job = JOBS.get(job_id)
    if not job:
        raise HTTPException(404, "job not found")

    came_from_upload = job.source_kind == "upload" or job.url.startswith("upload://")
    if came_from_upload:
        raise HTTPException(409, "uploaded videos cannot be redownloaded; upload the file again")

    busy_statuses = {"downloading", "splitting", "transcribing"}
    if job.status in busy_statuses:
        raise HTTPException(409, f"job is busy: {job.status}")

    source_video = job_dir(job_id) / "source.mp4"
    # Only a zero-byte leftover is deleted; a non-empty file is left in place.
    if source_video.exists() and source_video.stat().st_size == 0:
        source_video.unlink()

    reset_fields = dict(
        status="downloading",
        progress=1,
        error="",
        message="重新提交下载…",
        video_url="",
    )
    update(job, **reset_fields)
    bg.add_task(pipeline_download, job_id)
    return job
@app.post("/jobs/upload", response_model=Job)
async def create_job_from_upload(bg: BackgroundTasks, file: UploadFile = File(...)) -> Job:
if not file.filename:
@@ -4308,43 +4400,56 @@ def generate_subject_assets(job_id: str, idx: int, element_id: str, req: Generat
source_indices = [idx] + source_indices
source_indices = list(dict.fromkeys(source_indices))[:12]
similar_mode = req.reconstruction_mode == "similar"
character_reference_paths: list[Path] = []
character_reference_clause = ""
template_brief_clause = ""
character_label = ""
subject_template_id = (req.subject_template_id or "").strip()
character_id = (req.character_id or "").strip()
if subject_template_id:
template = find_subject_template_item(subject_template_id)
character_label = template.name
for image in template.images[:10]:
character_reference_paths.append(subject_template_image_file(image.filename))
character_reference_clause = (
f"Selected reusable subject template from database: {template.name}. "
"Use these saved generated subject views as a high-quality creative direction and identity bible only; "
"do not copy pixels, file artifacts, exact pose, labels, or accidental defects. "
"Create a new innovative variation that keeps the same broad subject type, transparent wellness character language, "
"camera readability, shoulder/neck product compatibility, and commercial role. "
template_paths = [subject_template_image_file(image.filename) for image in template.images[:10]]
character_reference_paths.extend(template_paths)
brief = template.prompt_brief.strip() or template.note.strip() or template.description.strip()
if similar_mode and not brief:
brief = _describe_subject_template_from_images(template.name, template.subject_style, template_paths, template.note)
template_brief_clause = (
f"Reference character brief from saved database template '{template.name}': {brief}. "
"Use this as a high-quality creative direction and identity bible only; do not copy a face, exact pose, pixels, file artifacts, labels, or accidental defects. "
"Create a new innovative variation that keeps the same broad subject type, transparent wellness character language, camera readability, shoulder/neck product compatibility, and commercial role. "
if brief else
f"Selected reusable subject template from database: {template.name}. Create a new innovative variation, not a duplicate. "
)
elif character_id:
character = find_character_library_item(character_id)
character_label = character.name
for image in character.images[:7]:
character_reference_paths.append(character_library_file(image.filename))
character_reference_clause = (
f"Selected built-in creative character reference: {character.name}. "
"Use these planned character images as a high-quality creative direction and anatomy/style bible only; "
character_reference_paths.extend(character_library_file(image.filename) for image in character.images[:7])
brief = character.prompt_brief.strip() or character.description.strip()
template_brief_clause = (
f"Reference character brief from built-in creative character '{character.name}': {brief}. "
"Use this planned character brief as a high-quality creative direction and anatomy/style bible only; "
"do not copy the exact face, exact pose, exact silhouette, pixels, or make a duplicate. "
"Create a new innovative variation that keeps the same broad role, transparent wellness character language, "
"camera readability, and shoulder/neck product compatibility. "
"Create a new innovative variation that keeps the same broad role, transparent wellness character language, camera readability, and shoulder/neck product compatibility. "
)
model_src, tmp_focus = _focus_source_for_element(job_id, idx, el)
tmp_focus: Path | None = None
model_src: Path | list[Path] | None = None
frame_reference_paths = [p for p in (_source_frame_path(job_id, i) for i in source_indices) if p.exists()]
if character_reference_paths:
remaining = max(0, 10 - len(character_reference_paths))
model_src = character_reference_paths + frame_reference_paths[:remaining]
elif len(frame_reference_paths) > 1:
model_src = frame_reference_paths[:10]
source_subject_brief = _describe_source_subject(job_id, source_indices) if similar_mode else ""
source_subject_clause = (
f"Source video role brief from selected keyframes: {source_subject_brief}. "
"Use this brief to preserve role category, creator-ad energy, camera readability, and broad styling, while creating a new non-identical subject. "
if source_subject_brief else
"Source video role brief unavailable; create a new non-identical ad subject guided by the user direction, template brief, and requested view. "
)
if not similar_mode:
model_src, tmp_focus = _focus_source_for_element(job_id, idx, el)
if character_reference_paths:
remaining = max(0, 10 - len(character_reference_paths))
model_src = character_reference_paths + frame_reference_paths[:remaining]
elif len(frame_reference_paths) > 1:
model_src = frame_reference_paths[:10]
try:
with Image.open(_source_frame_path(job_id, idx)) as src_im:
@@ -4371,7 +4476,7 @@ def generate_subject_assets(job_id: str, idx: int, element_id: str, req: Generat
)
actor_style_clause = (
"Generate a believable normal commercial video actor, not a transparent or skeleton character. "
"Use the references to understand the source video's casting direction, age range, gender presentation, body proportion, wardrobe category, gesture vocabulary, framing, energy, lighting, and creator-ad style. "
"Use the text briefs to understand the source video's casting direction, age range, gender presentation, body proportion, wardrobe category, gesture vocabulary, framing, energy, lighting, and creator-ad style. "
"Do not recreate the exact person's face, biometric identity, unique likeness, tattoos, scars, logos, watermarks, captions, or platform UI. "
"The output must be a newly designed similar actor that could play the same role in a new ad, with consistent identity across all views. "
if similar_actor
@@ -4386,7 +4491,7 @@ def generate_subject_assets(job_id: str, idx: int, element_id: str, req: Generat
prompt_extra_clause = f"User direction: {prompt_extra[:1200]} " if prompt_extra else ""
identity_lock_clause = (
"Identity lock: these API calls generate one high-definition multi-view pack for ONE single subject, but each individual output file must show only its one requested view. "
"Before rendering, infer one consistent character bible from the reference image(s): gender presentation, age range, body proportions, head shape, face direction cues, material, silhouette, wardrobe/material style, and commercial mood. "
"Before rendering, infer one consistent character bible from the supplied text brief and generation instructions: gender presentation, age range, body proportions, head shape, face direction cues, material, silhouette, wardrobe/material style, and commercial mood. "
"Keep that same character bible unchanged across every generated view in separate files. "
"If user direction requests a gender, age, or style change, apply that one change uniformly to all views; never mix male/female, young/old, or multiple style identities inside the same pack. "
"For transparent humanoids, keep the same transparent skin shell, skeleton proportions, visible spine/rib cage/pelvis/limb bones, and non-horror wellness character style in every view. "
@@ -4427,14 +4532,22 @@ def generate_subject_assets(job_id: str, idx: int, element_id: str, req: Generat
if closeup_view and req.subject_kind == "living"
else "The subject must be complete, centered, full body or full object, head-to-feet visible when applicable, not cropped by the canvas. Make the subject large and readable: it should occupy about 85-95% of the image height with only small margins. "
)
reference_strategy_clause = (
"Text-only generation mode: no source image is attached to this image request. Use only the written source/video/template briefs below as creative constraints. "
"This is intentionally NOT image editing and NOT identity replication. "
+ source_subject_clause
+ template_brief_clause
if similar_mode else
"Use the reference image(s) only as visual evidence; do not crop, cut out, paste, trace, or extract pixels from the source. "
)
prompt = (
f"Use the reference image(s) only as visual evidence; do not crop, cut out, paste, trace, or extract pixels from the source. "
reference_strategy_clause
+
f"Generate one newly rendered {view_prompt} for {target}. "
f"The subject is a {kind_phrase}. If multiple frames are shown, treat them as evidence of one same subject, not multiple subjects. "
f"The subject is a {kind_phrase}. Treat all source evidence as one role and one consistent subject bible, not multiple subjects. "
+ single_view_clause
+ identity_clause
+ identity_lock_clause
+ character_reference_clause
+ neck_product_clause
+ canvas_clause
+ prompt_extra_clause
@@ -4447,7 +4560,16 @@ def generate_subject_assets(job_id: str, idx: int, element_id: str, req: Generat
+ transparent_character_clause
)
try:
img_bytes, _mode = _image_edit_call(model_src, prompt, models=models, fallback_text=False, max_attempts=3, max_side=1280)
if similar_mode:
print(
f"[subject assets] reconstruction_mode=similar endpoint=/images/generations view={view} image_refs=0 model={GPT_IMAGE_MODEL}",
flush=True,
)
img_bytes, _mode = _image_text_call(prompt, models=models, max_attempts=3)
else:
if model_src is None:
raise RuntimeError("subject asset edit reference image missing")
img_bytes, _mode = _image_edit_call(model_src, prompt, models=models, fallback_text=False, max_attempts=3, max_side=1280)
except RuntimeError as e:
raise HTTPException(_image_error_status(e), f"subject asset {view} failed: {e}")
@@ -5026,6 +5148,7 @@ def save_subject_template(job_id: str, req: SaveSubjectTemplateReq) -> SubjectTe
template_dir.mkdir(parents=True, exist_ok=True)
now = _time.time()
images: list[SubjectTemplateImage] = []
saved_image_paths: list[Path] = []
for asset in selected_assets:
src = job_dir(job_id) / "assets" / f"{asset.id}.jpg"
if not src.exists():
@@ -5034,6 +5157,7 @@ def save_subject_template(job_id: str, req: SaveSubjectTemplateReq) -> SubjectTe
filename = f"{template_id}/{image_id}.jpg"
dst = SUBJECT_TEMPLATE_IMAGE_DIR / filename
shutil.copy2(src, dst)
saved_image_paths.append(dst)
images.append(SubjectTemplateImage(
id=image_id,
view=asset.view,
@@ -5053,11 +5177,18 @@ def save_subject_template(job_id: str, req: SaveSubjectTemplateReq) -> SubjectTe
raise HTTPException(404, "subject asset files missing")
primary = next((image.id for image in images if image.view == "front"), images[0].id)
prompt_brief = _describe_subject_template_from_images(
name,
req.subject_style,
saved_image_paths,
req.note.strip(),
) or req.note.strip()
item = SubjectTemplateItem(
id=template_id,
name=name,
description=req.note.strip(),
note=req.note.strip(),
prompt_brief=prompt_brief,
source_job_id=job_id,
source_frame_idx=frame.index,
source_element_id=element.id,

View File

@@ -22,7 +22,9 @@ LLM_API_KEY=
ASR_MODEL=whisper-1
ASR_FALLBACK_MODEL=gemini-2.5-flash
TRANSLATE_MODEL=gemini-2.5-flash
REWRITE_MODEL=gemini-2.5-pro
GPT_TEXT_MODEL=gpt-4o
REWRITE_MODEL=gpt-4o
VISION_MODEL=gpt-4o
PRODUCT_VIEW_MODEL=gpt-image-2
IMAGE_BASE_URL=https://ai.skg.com/ezlink/v1
IMAGE_API_KEY=
@@ -33,9 +35,14 @@ SUBJECT_ASSET_IMAGE_MODELS=gpt-image-2
# Optional outbound proxy for AI gateway calls. Leave blank on normal VPS networking.
AI_HTTP_PROXY=
# Optional TikTok download login state for yt-dlp. Keep cookies files private.
YTDLP_COOKIES_FILE=
YTDLP_COOKIES_FROM_BROWSER=
# Audio rewrite and Azure OpenAI TTS
AUDIO_REWRITE_MODEL=gemini-2.5-pro
AUDIO_PRODUCT_BRIEF="SKG smart massage products for daily neck, shoulder, back, eye, knee, and foot relaxation. Keep claims premium, clean, credible, and non-medical."
# Voice is fixed to Azure OpenAI in the backend.
VOICE_PROVIDER=azure_openai
AZURE_OPENAI_BASE_URL=https://ai.skg.com/azure
AZURE_OPENAI_API_KEY=
@@ -43,13 +50,7 @@ AZURE_TTS_MODEL=gpt-4o-mini-tts
AZURE_TTS_VOICE_ID=alloy
AZURE_TTS_VOICE_POOL=alloy,verse,shimmer
AZURE_TTS_PATH=/audio/speech
# Legacy MiniMax TTS fallback; not the default voice provider.
MINIMAX_API_KEY=
MINIMAX_TTS_BASE_URL=https://api.minimax.io
MINIMAX_TTS_MODEL=speech-2.8-turbo
MINIMAX_TTS_VOICE_ID=English_expressive_narrator
MINIMAX_TTS_VOICE_POOL=English_magnetic_voiced_man,English_Upbeat_Woman,English_MaturePartner
AZURE_TTS_PATHS=/audio/speech,/v1/audio/speech
# Video generation. Use SKG Doubao / Seedance gateway in production.
POE_API_BASE_URL=https://api.poe.com/v1

File diff suppressed because one or more lines are too long

View File

@@ -17,6 +17,7 @@ import { AdRecreationBoard } from "@/components/ad-recreation-board"
import {
addManualFrame, analyzeJob, createJob, getJob, listJobs, uploadJob, deleteJob, deleteFrame, deleteGeneratedImage,
deleteGeneratedVideo, deleteCutout, generateStoryboardVideo, triggerTranscribe, describeFrame, updateStoryboard, copyProductLibraryAsset,
formatJobError, retryJobDownload,
type Job, type ImageRef, type KeyFrame, type ProductFusionShot, type StoryboardScene, type FrameExtractMode, type FrameExtractQuality, type FrameExtractTarget,
} from "@/lib/api"
import { TRANSPARENT_HUMAN_NEGATIVE_PROMPT, TRANSPARENT_HUMAN_VIDEO_PROMPT } from "@/lib/workflow-target"
@@ -569,15 +570,30 @@ export default function Home() {
/**
 * Entry point for the "start analysis" action.
 *
 * - With a non-empty `inputUrl`, a new job is created through `handleSubmit`
 *   and used as the target; otherwise the currently selected `job` is used.
 * - Re-running an existing failed job clears its `audio` / `visual`
 *   auto-trigger markers so both lanes may be triggered again.
 * - A failed job that has no `video_url` is first re-submitted for download
 *   via `retryJobDownload`; if that fails, the error is surfaced and nothing
 *   else happens.
 * - Finally the job id is added to the production set and both production
 *   lanes are kicked off (fire-and-forget).
 */
const handleStartProduction = useCallback(async (inputUrl?: string) => {
  const trimmed = inputUrl?.trim()
  const created = trimmed ? await handleSubmit(trimmed) : undefined
  // `let`, not `const`: the retry branch below swaps in the refreshed job.
  let target = created ?? job
  if (!target) {
    toast.info("先粘贴视频链接或选择一个素材任务")
    return
  }
  if (!created && target.status === "failed") {
    // Forget earlier auto-trigger attempts so the lanes can run again.
    autoTriggeredRef.current.delete(`${target.id}:audio`)
    autoTriggeredRef.current.delete(`${target.id}:visual`)
    if (!target.video_url) {
      // The download itself failed: re-submit it before starting the lanes.
      try {
        target = await retryJobDownload(target.id)
        updateJobInList(target)
        toast.info("已重新提交下载;下载完成后会自动跑音频文案路和视觉抽帧路")
      } catch (e) {
        toast.error("重新下载失败:" + (e instanceof Error ? e.message : String(e)))
        return
      }
    }
  }
  // Capture the id in a const: `target` is a reassigned `let`, so its
  // narrowing is not reliably preserved inside the updater closure on older
  // TypeScript versions.
  const targetId = target.id
  setProductionJobIds((prev) => new Set(prev).add(targetId))
  if (target.video_url) toast.success("已进入并行素材分析:音频文案路和视觉抽帧路会同步推进")
  else toast.success("已进入并行素材分析:下载完成后自动跑音频文案路和视觉抽帧路")
  void startProductionLanesForJob(target)
}, [handleSubmit, job, startProductionLanesForJob, updateJobInList])
useEffect(() => {
if (productionJobIds.size === 0) return
@@ -860,6 +876,9 @@ export default function Home() {
if (job?.status === "downloaded" && prevStatusRef.current !== "downloaded") {
toast.info("视频已下载,音频解析会自动开始;也可以在右侧手动重试", { duration: 6000 })
}
if (job?.status === "failed" && prevStatusRef.current !== "failed") {
toast.error(formatJobError(job.error) || "任务失败", { duration: 10000 })
}
prevStatusRef.current = job?.status ?? null
const TERMINAL: Job["status"][] = ["downloaded", "frames_extracted", "transcribed", "failed"]

View File

@@ -33,6 +33,7 @@ import {
cutoutElement,
deleteSubjectAsset,
effectiveFrameUrl,
formatJobError,
generateSceneAsset,
generateProductAngleAsset,
generateSubjectAssets,
@@ -117,6 +118,8 @@ type AudioStoryboardRow = {
/** Local alias for the shared product-reference state item. */
type ProductRefItem = ProductRefStateItem
/** An image reference extended with the view it shows and a free-text role hint. */
type SubjectPlanningRef = ImageRef & { view: string; roleHint: string }
/**
 * Visual style of the generated subject: a transparent/semi-transparent
 * humanoid with a visible skeleton, or an ordinary commercial-ad actor.
 */
type SubjectStyleMode = "transparent_human" | "source_actor"
/**
 * Where the subject direction comes from: an existing template (built-in
 * character or saved subject template), or the source video's keyframes
 * without any template.
 */
type SubjectMode = "template" | "source_similar"
/** Which subject views to generate: every view, the common subset, or a user-picked list. */
type SubjectViewMode = "all" | "common" | "custom"
/** The `visual_mode` of a storyboard scene with null/undefined excluded. */
type StoryboardVisualMode = NonNullable<StoryboardScene["visual_mode"]>
/** Partial update covering only the planning-related fields of an audio storyboard row. */
type RowPlanPatch = Partial<Pick<AudioStoryboardRow, "visualMode" | "needsProduct" | "needsSubject" | "subjectDescription" | "visualPlan" | "firstFramePlan" | "lastFramePlan" | "productIntegration" | "productPlacement">>
/** Identifiers of the workflow steps. */
type WorkflowStepId = "input" | "source" | "audio" | "visual" | "subject" | "product" | "script" | "scene" | "video"
@@ -156,6 +159,8 @@ const SUBJECT_VIEW_ORDER = [
"back_detail",
]
/**
 * Views generated in the "common" view mode. Also the initial selection for
 * the "custom" mode and the fallback when the custom selection is empty.
 */
const COMMON_SUBJECT_VIEW_VALUES = ["front", "three_quarter_left", "three_quarter_right", "bust_front"]
/** Value sent as `size` in subject-asset generation requests. */
const SUBJECT_ASSET_SIZE = "2048" as const
type ModelTraceSpec = {
@@ -591,9 +596,9 @@ function similarSubjectModelTrace(models: RuntimeModels | undefined, subjectStyl
title: subjectStyle === "transparent_human" ? "相似透明骨架主体" : "相似普通真人主体",
model: subjectImageModelChain(models),
chain: [
"参考策略:未勾选关键帧时使用全部关键帧,勾选后只使用已选关键帧;也可叠加内置形象作为创意参考",
"参考策略:先用视觉模型把关键帧/模板转成非身份化文字 brief生图请求不再上传参考",
`主体类型:${subjectStyle === "transparent_human" ? "透明/半透明皮肤包裹可见白色骨架" : "普通商业广告真人"}`,
`图像生成:${subjectImageModelChain(models)} 逐张生成 10 张高清图,包含全身多视角和肩颈/后背特写`,
`图像生成:${subjectImageModelChain(models)} 走 /images/generations 逐张生成高清图,视图数量由“全部/常用/自定义”决定`,
"身份锁定:整套图必须是同一个主体,性别表现、年龄段、体型、材质和风格保持一致",
],
note: "这是生成类似但创新的主体,不是复制、抠出或复刻源视频人物身份;内置形象也只作为方向参考。",
@@ -1650,6 +1655,9 @@ function MaterialColumn({
onSubmitUrl: () => void
onStartProduction: () => void
}) {
const actionLabel = !url.trim() && job?.status === "failed"
? job.video_url ? "重新解析" : "重新下载"
: "开始分析"
return (
<section className="skg-board-panel flex min-h-0 flex-col gap-3 rounded-lg border border-white/10 bg-white/[0.035] p-3 shadow-2xl">
<header className="shrink-0 border-b border-white/10 pb-3">
@@ -1675,7 +1683,7 @@ function MaterialColumn({
disabled={data.submitting || (!url.trim() && !job)}
className="inline-flex h-10 items-center justify-center rounded-md bg-[#f0ead8] px-3 text-[13px] font-semibold text-black shadow-[0_14px_28px_rgba(0,0,0,0.28)] transition hover:bg-[#fff7df] disabled:cursor-not-allowed disabled:opacity-45"
>
{actionLabel}
</button>
<button
type="button"
@@ -2138,7 +2146,10 @@ function SourceReferenceBuildPanel({
}) {
const [subjectBusy, setSubjectBusy] = useState(false)
const [subjectAssetBusy, setSubjectAssetBusy] = useState<string | null>(null)
const [subjectMode, setSubjectMode] = useState<SubjectMode>("source_similar")
const [subjectStyle, setSubjectStyle] = useState<SubjectStyleMode>("transparent_human")
const [subjectViewMode, setSubjectViewMode] = useState<SubjectViewMode>("all")
const [customSubjectViews, setCustomSubjectViews] = useState<string[]>(COMMON_SUBJECT_VIEW_VALUES)
const [subjectDirection, setSubjectDirection] = useState("")
const [characterLibrary, setCharacterLibrary] = useState<CharacterLibraryItem[]>([])
const [selectedCharacterId, setSelectedCharacterId] = useState("")
@@ -2169,11 +2180,16 @@ function SourceReferenceBuildPanel({
() => subjectTemplateLibrary.find((template) => template.id === selectedSubjectTemplateId) ?? null,
[subjectTemplateLibrary, selectedSubjectTemplateId],
)
const selectedTemplatePrompt = selectedSubjectTemplate
const selectedTemplatePrompt = subjectMode === "template" && selectedSubjectTemplate
? { name: selectedSubjectTemplate.name, sourceLabel: "数据库主体模板" }
: selectedCharacter
: subjectMode === "template" && selectedCharacter
? { name: selectedCharacter.name, sourceLabel: "内置策划形象" }
: null
const selectedSubjectViews = useMemo(() => {
if (subjectViewMode === "common") return COMMON_SUBJECT_VIEW_VALUES
if (subjectViewMode === "custom") return customSubjectViews.length ? customSubjectViews : COMMON_SUBJECT_VIEW_VALUES
return SUBJECT_ASSET_VIEWS.map((view) => view.value)
}, [customSubjectViews, subjectViewMode])
const visibleActorAssets = useMemo(() => {
const latestByView = new Map<string, SubjectAsset>()
for (const asset of actorAssets) {
@@ -2195,14 +2211,18 @@ function SourceReferenceBuildPanel({
: "待抽帧"
const templateSaveHint = visibleActorAssets.length
? templateDraftName.trim()
? "保存后会进入左侧主体模板库,后续任务可直接复用"
? "保存后会进入主体模板库,后续任务可直接复用"
: "先给这套主体命名,再保存到主体模板库"
: "先生成本次主体视图,再决定是否入库"
const templateSourceLabel = selectedSubjectTemplate
const templateSourceLabel = subjectMode === "template" && selectedSubjectTemplate
? `${selectedSubjectTemplate.name} · 数据库模板`
: selectedCharacter
: subjectMode === "template" && selectedCharacter
? `${selectedCharacter.name} · 模板参考`
: "源视频关键帧 · 相似创新"
const templateRequired = subjectMode === "template" && !selectedSubjectTemplate && !selectedCharacter
const generationCtaLabel = subjectMode === "template"
? `用模板生成 ${selectedSubjectViews.length} 张主体视图`
: `从源视频创新生成 ${selectedSubjectViews.length} 张主体视图`
const loadSubjectTemplateLibrary = async (silent = false) => {
setTemplateLibraryBusy(true)
@@ -2239,6 +2259,10 @@ function SourceReferenceBuildPanel({
toast.warning("请先自动抽帧 12 张,或在原版视频上手动补帧。")
return
}
if (templateRequired) {
toast.warning("请先选择一个内置或数据库主体模板。")
return
}
const baseFrame = subjectReferenceFrames[0]
if (!baseFrame) return
setSubjectBusy(true)
@@ -2271,14 +2295,14 @@ function SourceReferenceBuildPanel({
background: "white",
size: SUBJECT_ASSET_SIZE,
source_frame_indices: subjectReferenceFrames.slice(0, 12).map((frame) => frame.index),
views: SUBJECT_ASSET_VIEWS.map((view) => view.value),
character_id: selectedCharacterId,
subject_template_id: selectedSubjectTemplateId,
views: selectedSubjectViews,
character_id: subjectMode === "template" ? selectedCharacterId : "",
subject_template_id: subjectMode === "template" ? selectedSubjectTemplateId : "",
prompt: buildSimilarSubjectPrompt(subjectStyle, subjectDirection, selectedTemplatePrompt),
replace_views: true,
})
onJobUpdate(updated)
toast.success("相似主体 10 张高清白底图已生成")
toast.success(`相似主体 ${selectedSubjectViews.length} 张高清白底图已生成`)
} catch (e) {
toast.error("相似主体重构失败:" + (e instanceof Error ? e.message : String(e)))
} finally {
@@ -2301,8 +2325,8 @@ function SourceReferenceBuildPanel({
size: SUBJECT_ASSET_SIZE,
source_frame_indices: sourceIndices,
views: [asset.view],
character_id: selectedCharacterId,
subject_template_id: selectedSubjectTemplateId,
character_id: subjectMode === "template" ? selectedCharacterId : "",
subject_template_id: subjectMode === "template" ? selectedSubjectTemplateId : "",
prompt: buildSimilarSubjectPrompt(subjectStyle, subjectDirection, selectedTemplatePrompt),
replace_views: true,
})
@@ -2373,207 +2397,234 @@ function SourceReferenceBuildPanel({
</div>
</div>
<div className="rounded-md border border-white/10 bg-black/32 p-2">
<div className="mb-2 grid gap-2 lg:grid-cols-[minmax(360px,1fr)_minmax(300px,0.8fr)]">
<div className="rounded-md border border-white/10 bg-black/28 p-2">
<div className="mb-1.5 flex items-center justify-between gap-2">
<div>
<div className="text-[10.5px] font-semibold text-white/70"></div>
<div className="mt-0.5 text-[9px] text-white/32"></div>
</div>
<button
type="button"
onClick={() => void loadSubjectTemplateLibrary()}
disabled={templateLibraryBusy}
className="inline-flex h-6 items-center gap-1 rounded border border-emerald-200/20 bg-emerald-300/10 px-1.5 text-[9px] font-semibold text-emerald-100/80 transition hover:border-emerald-200/40 disabled:cursor-wait disabled:opacity-50"
>
{templateLibraryBusy ? <Loader2 className="h-3 w-3 animate-spin" /> : <RefreshCw className="h-3 w-3" />}
{subjectTemplateLibrary.length}
</button>
<div className="rounded-md border border-white/10 bg-black/28 p-2.5">
<div className="mb-2 flex flex-wrap items-start justify-between gap-2">
<div>
<div className="text-[11px] font-semibold text-white/72"></div>
<div className="mt-0.5 text-[9.5px] text-white/34"></div>
</div>
<div className="grid grid-cols-[repeat(auto-fill,minmax(86px,1fr))] gap-1.5">
<button
type="button"
onClick={() => void loadSubjectTemplateLibrary()}
disabled={templateLibraryBusy}
className="inline-flex h-7 items-center gap-1 rounded border border-white/10 bg-white/[0.045] px-2 text-[10px] font-semibold text-white/58 transition hover:border-cyan-300/35 hover:text-cyan-100 disabled:cursor-wait disabled:opacity-50"
>
{templateLibraryBusy ? <Loader2 className="h-3 w-3 animate-spin" /> : <RefreshCw className="h-3 w-3" />}
{subjectTemplateLibrary.length}
</button>
</div>
<div className="mb-2 grid gap-1.5 sm:grid-cols-2">
{[
{ value: "template" as const, label: "用模板生成", desc: "从内置形象或数据库模板延展新主体" },
{ value: "source_similar" as const, label: "不用模板(从源视频关键帧创新)", desc: "只读取源视频角色文字特征,不上传参考图做复制" },
].map((item) => (
<button
key={item.value}
type="button"
onClick={() => {
setSelectedCharacterId("")
setSelectedSubjectTemplateId("")
}}
className={`min-h-[58px] rounded-md border px-2 py-1.5 text-left transition ${
!selectedCharacterId && !selectedSubjectTemplateId ? "border-cyan-200/55 bg-cyan-300/12 text-cyan-50" : "border-white/10 bg-black/25 text-white/45 hover:border-white/22 hover:text-white/70"
onClick={() => setSubjectMode(item.value)}
className={`flex min-h-[48px] items-start gap-2 rounded-md border px-2.5 py-2 text-left transition ${
subjectMode === item.value
? "border-cyan-200/65 bg-cyan-300/12 text-cyan-50"
: "border-white/10 bg-black/24 text-white/50 hover:border-cyan-200/30 hover:text-white/78"
}`}
>
<span className="block text-[10.5px] font-semibold"></span>
<span className="mt-1 block text-[9px] leading-tight opacity-70"></span>
<span className="mt-0.5 shrink-0">{subjectMode === item.value ? <Check className="h-3.5 w-3.5" /> : <Circle className="h-3.5 w-3.5" />}</span>
<span className="min-w-0">
<span className="block text-[11px] font-semibold">{item.label}</span>
<span className="mt-0.5 block text-[9.5px] leading-snug opacity-65">{item.desc}</span>
</span>
</button>
))}
</div>
<div className={`transition ${subjectMode === "source_similar" ? "pointer-events-none opacity-38 grayscale" : ""}`}>
<div className="grid grid-cols-[repeat(auto-fill,minmax(120px,1fr))] gap-2">
{subjectTemplateLibrary.map((template) => {
const preview = characterPreviewImage(template)
const active = selectedSubjectTemplateId === template.id
const active = subjectMode === "template" && selectedSubjectTemplateId === template.id
return (
<button
key={template.id}
type="button"
onClick={() => {
setSubjectMode("template")
setSelectedSubjectTemplateId(template.id)
setSelectedCharacterId("")
setSubjectStyle(template.subject_style || "transparent_human")
}}
className={`group flex min-h-[58px] items-center gap-1.5 rounded-md border px-1.5 py-1 text-left transition ${
active ? "border-cyan-200/65 bg-cyan-300/12 text-cyan-50" : "border-white/10 bg-black/25 text-white/50 hover:border-cyan-200/35 hover:text-white/80"
className={`group relative rounded-md border p-1.5 text-left transition ${
active ? "border-cyan-200/75 bg-cyan-300/12 text-cyan-50" : "border-white/10 bg-black/24 text-white/58 hover:border-cyan-200/35 hover:text-white/82"
}`}
>
<span className="h-12 w-9 shrink-0 overflow-hidden rounded border border-white/10 bg-white">
{active ? <span className="absolute right-2 top-2 z-10 rounded-full bg-cyan-200 p-0.5 text-black"><Check className="h-3 w-3" /></span> : null}
<span className="block aspect-[4/5] overflow-hidden rounded border border-white/10 bg-white">
{preview ? <img src={subjectTemplateImageUrl(preview.filename)} alt={template.name} className="h-full w-full object-cover" /> : null}
</span>
<span className="min-w-0">
<span className="block truncate text-[10px] font-semibold">{template.name}</span>
<span className="mt-0.5 block text-[8.5px] opacity-58"> · {template.images.length} </span>
</span>
<span className="mt-1 block truncate text-[10.5px] font-semibold">{template.name}</span>
<span className="mt-0.5 block truncate text-[9px] opacity-58"> · {template.images.length} </span>
</button>
)
})}
{characterLibrary.map((character) => {
const preview = characterPreviewImage(character)
const active = selectedCharacterId === character.id
const active = subjectMode === "template" && selectedCharacterId === character.id
return (
<button
key={character.id}
type="button"
onClick={() => {
setSubjectMode("template")
setSelectedCharacterId(character.id)
setSelectedSubjectTemplateId("")
setSubjectStyle("transparent_human")
}}
className={`group flex min-h-[58px] items-center gap-1.5 rounded-md border px-1.5 py-1 text-left transition ${
active ? "border-emerald-200/65 bg-emerald-300/12 text-emerald-50" : "border-white/10 bg-black/25 text-white/50 hover:border-emerald-200/35 hover:text-white/80"
className={`group relative rounded-md border p-1.5 text-left transition ${
active ? "border-cyan-200/75 bg-cyan-300/12 text-cyan-50" : "border-white/10 bg-black/24 text-white/58 hover:border-cyan-200/35 hover:text-white/82"
}`}
>
<span className="h-12 w-9 shrink-0 overflow-hidden rounded border border-white/10 bg-white">
{active ? <span className="absolute right-2 top-2 z-10 rounded-full bg-cyan-200 p-0.5 text-black"><Check className="h-3 w-3" /></span> : null}
<span className="block aspect-[4/5] overflow-hidden rounded border border-white/10 bg-white">
{preview ? <img src={characterLibraryImageUrl(preview.filename)} alt={character.name} className="h-full w-full object-cover" /> : null}
</span>
<span className="min-w-0">
<span className="block truncate text-[10px] font-semibold">{character.name}</span>
<span className="mt-0.5 block text-[8.5px] opacity-58"> · 7 </span>
</span>
<span className="mt-1 block truncate text-[10.5px] font-semibold">{character.name}</span>
<span className="mt-0.5 block truncate text-[9px] opacity-58"> · {character.images.length} </span>
</button>
)
})}
</div>
{!subjectTemplateLibrary.length ? (
<div className="mt-1.5 rounded border border-dashed border-white/10 px-2 py-1.5 text-[9px] leading-snug text-white/28">
</div>
) : null}
{selectedSubjectTemplate?.images?.length ? (
<div className="mt-1.5 flex gap-1 overflow-x-auto pb-0.5">
{selectedSubjectTemplate.images.slice(0, 10).map((image) => (
<div key={image.id} className="h-12 w-9 shrink-0 overflow-hidden rounded border border-white/10 bg-white" title={image.label}>
<img src={subjectTemplateImageUrl(image.filename)} alt={image.label} className="h-full w-full object-cover" />
</div>
))}
</div>
) : selectedCharacter?.images?.length ? (
<div className="mt-1.5 flex gap-1 overflow-x-auto pb-0.5">
{selectedCharacter.images.slice(0, 7).map((image) => (
<div key={image.id} className="h-12 w-9 shrink-0 overflow-hidden rounded border border-white/10 bg-white" title={image.label}>
<img src={characterLibraryImageUrl(image.filename)} alt={image.label} className="h-full w-full object-cover" />
</div>
))}
</div>
) : null}
</div>
<div className="rounded-md border border-white/10 bg-black/28 p-2">
<div className="mb-1.5 flex flex-wrap items-start justify-between gap-2">
<div>
<div className="flex items-center gap-2 text-[10.5px] font-semibold text-white/70">
<span> / 稿</span>
<ModelTrace trace={similarSubjectModelTrace(runtimeModels, subjectStyle)} compact />
{subjectMode === "template" && (selectedSubjectTemplate?.images?.length || selectedCharacter?.images?.length) ? (
<div className="mt-2 flex gap-1.5 overflow-x-auto pb-0.5">
{(selectedSubjectTemplate?.images ?? selectedCharacter?.images ?? []).slice(0, 10).map((image) => (
<div key={image.id} className="h-16 w-12 shrink-0 overflow-hidden rounded border border-white/10 bg-white" title={image.label}>
<img
src={selectedSubjectTemplate ? subjectTemplateImageUrl(image.filename) : characterLibraryImageUrl(image.filename)}
alt={image.label}
className="h-full w-full object-cover"
/>
</div>
<div className="mt-0.5 text-[9px] text-white/32">{templateSourceLabel} · {visibleActorAssets.length}/{SUBJECT_ASSET_VIEWS.length} </div>
</div>
<span className={`rounded border px-1.5 py-0.5 text-[9px] font-semibold ${
visibleActorAssets.length ? "border-emerald-200/25 bg-emerald-300/10 text-emerald-100/80" : "border-white/10 bg-white/5 text-white/36"
}`}>
{visibleActorAssets.length ? "可命名待入库" : "未生成"}
</span>
</div>
<div className="grid gap-1.5">
<input
value={templateDraftName}
onChange={(event) => setTemplateDraftName(event.target.value)}
placeholder="模板命名:如透明骨架女性 01"
className="h-7 rounded-md border border-white/10 bg-black/35 px-2 text-[10.5px] text-white outline-none placeholder:text-white/28 focus:border-cyan-300/50"
/>
<textarea
value={templateDraftNote}
onChange={(event) => setTemplateDraftNote(event.target.value)}
placeholder="备注:适合什么广告、人物年龄/性别/材质、禁用点"
className="min-h-[46px] resize-none rounded-md border border-white/10 bg-black/35 px-2 py-1.5 text-[10.5px] leading-snug text-white outline-none placeholder:text-white/28 focus:border-cyan-300/50"
/>
<div className="flex items-center justify-between gap-2">
<span className="min-w-0 text-[9px] leading-snug text-white/32">{templateSaveHint}</span>
<button
type="button"
onClick={() => void saveGeneratedSubjectTemplate()}
disabled={!visibleActorAssets.length || !templateDraftName.trim() || templateSaveBusy}
title={!visibleActorAssets.length ? "先生成主体视图" : !templateDraftName.trim() ? "先填写模板名称" : "保存到主体模板库"}
className="inline-flex h-7 shrink-0 items-center justify-center gap-1 rounded-md border border-emerald-200/25 bg-emerald-300/12 px-2 text-[10px] font-semibold text-emerald-50 transition hover:border-emerald-200/45 hover:bg-emerald-300/18 disabled:cursor-not-allowed disabled:border-white/10 disabled:bg-white/6 disabled:text-white/32"
>
{templateSaveBusy ? <Loader2 className="h-3.5 w-3.5 animate-spin" /> : <Plus className="h-3.5 w-3.5" />}
</button>
</div>
))}
</div>
) : null}
<div className="my-2 h-px bg-white/10" />
<div className="grid gap-2 lg:grid-cols-[1fr_1.6fr_auto]">
<input
value={templateDraftName}
onChange={(event) => setTemplateDraftName(event.target.value)}
placeholder={visibleActorAssets.length ? "模板名称" : "生成主体视图后可命名保存"}
className="h-8 rounded-md border border-white/10 bg-black/35 px-2 text-[10.5px] text-white outline-none placeholder:text-white/28 focus:border-cyan-300/50"
/>
<input
value={templateDraftNote}
onChange={(event) => setTemplateDraftNote(event.target.value)}
placeholder="保存为主体模板备注:适用广告、人物风格、禁用点"
className="h-8 rounded-md border border-white/10 bg-black/35 px-2 text-[10.5px] text-white outline-none placeholder:text-white/28 focus:border-cyan-300/50"
/>
<button
type="button"
onClick={() => void saveGeneratedSubjectTemplate()}
disabled={!visibleActorAssets.length || !templateDraftName.trim() || templateSaveBusy}
title={!visibleActorAssets.length ? "先生成主体视图" : !templateDraftName.trim() ? "先填写模板名称" : "保存到主体模板库"}
className="inline-flex h-8 items-center justify-center gap-1 rounded-md border border-white/10 bg-white/[0.055] px-3 text-[10.5px] font-semibold text-white/62 transition hover:border-cyan-300/35 hover:text-cyan-100 disabled:cursor-not-allowed disabled:opacity-35"
>
{templateSaveBusy ? <Loader2 className="h-3.5 w-3.5 animate-spin" /> : <Plus className="h-3.5 w-3.5" />}
</button>
</div>
<div className="mt-1 text-[9px] text-white/32">{templateSaveHint}</div>
</div>
<div>
<div className="mb-1.5 flex flex-wrap items-center justify-between gap-2 text-[10px] text-white/36">
<div className="flex items-center gap-2">
<span></span>
<span className="text-white/28">{referenceCountLabel}</span>
<div className="mt-2 rounded-md border border-white/10 bg-black/28 p-2.5">
<div className="mb-2 flex flex-wrap items-center justify-between gap-2">
<div>
<div className="text-[11px] font-semibold text-white/72"></div>
<div className="mt-0.5 text-[9.5px] text-white/34">{templateSourceLabel}</div>
</div>
<span className="text-[10px] text-white/32"></span>
<ModelTrace trace={similarSubjectModelTrace(runtimeModels, subjectStyle)} compact />
</div>
<div className="mb-1.5 flex flex-wrap items-center justify-end gap-2 text-[10px] text-white/36">
<div className="flex min-w-0 flex-wrap items-center justify-end gap-2">
<div className="flex rounded-md border border-white/10 bg-black/28 p-0.5">
{[
{ value: "transparent_human" as const, label: "透明骨架" },
{ value: "source_actor" as const, label: "普通真人" },
].map((item) => (
<div className="grid gap-2 xl:grid-cols-[auto_auto_minmax(220px,1fr)_auto] xl:items-start">
<div className="flex rounded-md border border-white/10 bg-black/28 p-0.5">
{[
{ value: "transparent_human" as const, label: "透明骨架" },
{ value: "source_actor" as const, label: "真人" },
].map((item) => (
<button
key={item.value}
type="button"
onClick={() => setSubjectStyle(item.value)}
className={`h-8 rounded px-2.5 text-[10.5px] font-semibold transition ${
subjectStyle === item.value ? "bg-white text-black" : "text-white/45 hover:text-white"
}`}
>
{item.label}
</button>
))}
</div>
<div className="flex flex-wrap rounded-md border border-white/10 bg-black/28 p-0.5">
{[
{ value: "all" as const, label: `全部 ${SUBJECT_ASSET_VIEWS.length}` },
{ value: "common" as const, label: `常用 ${COMMON_SUBJECT_VIEW_VALUES.length}` },
{ value: "custom" as const, label: "自定义" },
].map((item) => (
<button
key={item.value}
type="button"
onClick={() => setSubjectViewMode(item.value)}
className={`h-8 rounded px-2.5 text-[10.5px] font-semibold transition ${
subjectViewMode === item.value ? "bg-white text-black" : "text-white/45 hover:text-white"
}`}
>
{item.label}
</button>
))}
</div>
<input
value={subjectDirection}
onChange={(event) => setSubjectDirection(event.target.value)}
placeholder="统一方向:如年轻女性 / 更运动 / 更高级"
className="h-9 rounded-md border border-white/10 bg-black/35 px-2.5 text-[11px] text-white outline-none placeholder:text-white/28 focus:border-cyan-300/50"
/>
<button
type="button"
onClick={() => void generateSimilarActor()}
disabled={!frames.length || subjectBusy || templateRequired || !selectedSubjectViews.length}
className="inline-flex h-9 min-w-[170px] items-center justify-center gap-1 rounded-md bg-white px-3 text-[11px] font-semibold text-black transition hover:bg-white/90 disabled:cursor-not-allowed disabled:opacity-40"
>
{subjectBusy ? <Loader2 className="h-3.5 w-3.5 animate-spin" /> : <Sparkles className="h-3.5 w-3.5" />}
{generationCtaLabel}
</button>
</div>
{subjectViewMode === "custom" ? (
<div className="mt-2 flex flex-wrap gap-1.5">
{SUBJECT_ASSET_VIEWS.map((view) => {
const checked = customSubjectViews.includes(view.value)
return (
<button
key={item.value}
key={view.value}
type="button"
onClick={() => setSubjectStyle(item.value)}
className={`h-6 rounded px-2 text-[10px] font-semibold transition ${
subjectStyle === item.value ? "bg-white text-black" : "text-white/45 hover:text-white"
onClick={() => setCustomSubjectViews((current) =>
current.includes(view.value)
? current.filter((item) => item !== view.value)
: [...current, view.value],
)}
className={`h-7 rounded-md border px-2 text-[10px] font-semibold transition ${
checked ? "border-cyan-200/60 bg-cyan-300/12 text-cyan-50" : "border-white/10 bg-black/24 text-white/45 hover:border-cyan-200/28 hover:text-white/75"
}`}
>
{item.label}
{checked ? "✓ " : ""}{view.label}
</button>
))}
</div>
<input
value={subjectDirection}
onChange={(event) => setSubjectDirection(event.target.value)}
placeholder="统一方向:如年轻女性 / 更运动 / 更高级"
className="h-7 w-[240px] min-w-[180px] rounded-md border border-white/10 bg-black/35 px-2 text-[10.5px] text-white outline-none placeholder:text-white/28 focus:border-cyan-300/50"
/>
<span>{visibleActorAssets.length}/{SUBJECT_ASSET_VIEWS.length}</span>
<button
type="button"
onClick={() => void generateSimilarActor()}
disabled={!frames.length || subjectBusy}
className="inline-flex h-7 items-center justify-center gap-1 rounded-md bg-white px-2 text-[10.5px] font-semibold text-black transition hover:bg-white/90 disabled:cursor-not-allowed disabled:opacity-40"
>
{subjectBusy ? <Loader2 className="h-3.5 w-3.5 animate-spin" /> : <Sparkles className="h-3.5 w-3.5" />}
10
</button>
)
})}
</div>
</div>
) : null}
{visibleActorAssets.length ? (
<div className="flex flex-wrap gap-1.5">
<div className="mt-2 grid grid-cols-[repeat(auto-fill,minmax(96px,1fr))] gap-2">
{visibleActorAssets.map((asset) => {
const busyMode = subjectAssetBusy?.endsWith(asset.id) ? subjectAssetBusy.split(":")[0] : ""
return (
@@ -2584,7 +2635,7 @@ function SourceReferenceBuildPanel({
alt={asset.label || asset.view}
label={asset.label || asset.view || "主体视图预览"}
meta={asset.width && asset.height ? `${asset.width}x${asset.height}` : undefined}
className="aspect-[9/16] w-12 bg-white 2xl:w-14"
className="aspect-[9/16] w-20 bg-white 2xl:w-24"
objectFit="contain"
title={asset.label || asset.view}
actions={[{
@@ -2605,8 +2656,10 @@ function SourceReferenceBuildPanel({
})}
</div>
) : (
<div className="rounded border border-dashed border-white/12 px-2 py-2 text-[10.5px] leading-snug text-white/32">
/
<div className="mt-2 rounded border border-dashed border-white/12 px-2 py-2 text-[10.5px] leading-snug text-white/32">
{subjectMode === "template"
? "先选主体模板,再生成新主体视图;模板只作为文字化创意方向,不再作为强参考图复制。"
: "直接使用关键帧的文字化主体特征生成创新主体;后端不会上传源图给生图端点。"}
</div>
)}
</div>
@@ -4299,6 +4352,7 @@ function MaterialCard({
onDelete?: () => void
}) {
const tone = statusTone(job)
const errorText = formatJobError(job.error)
return (
<button
type="button"
@@ -4320,6 +4374,12 @@ function MaterialCard({
<Metric label="文案" value={job.audio_script?.source_text || job.transcript.length ? "ready" : "-"} compact />
<Metric label="段落" value={`${job.transcript.length}`} compact />
</div>
{job.status === "failed" && errorText && (
<div className="mt-2 flex gap-1.5 rounded-md border border-rose-300/18 bg-rose-500/[0.08] px-2 py-1.5 text-[11px] leading-snug text-rose-100/82">
<AlertTriangle className="mt-0.5 h-3.5 w-3.5 shrink-0" />
<span className="line-clamp-3">{errorText}</span>
</div>
)}
{onDelete && (
<span
role="button"

View File

@@ -641,15 +641,15 @@ export const Dashboard = forwardRef<DashboardHandle, Props>(function Dashboard({
</div>
</KanbanCard>
<KanbanCard tone="green" tags={["配音"]} title={job?.audio_script?.voice_model || "MiniMax T2A"}>
<KanbanCard tone="green" tags={["配音"]} title={job?.audio_script?.voice_model || "Azure OpenAI TTS"}>
{job?.audio_script?.voice_url ? (
<audio controls className="h-8 w-full" src={apiAssetUrl(job.audio_script.voice_url)} />
) : (
<div className="text-[11px] text-[var(--text-soft)]">
{job?.audio_script?.error || "配置 MiniMax 后自动生成配音文件"}
{job?.audio_script?.error || "配置 Azure OpenAI TTS 后自动生成配音文件"}
</div>
)}
<div className="kanban-meta">{job?.audio_script?.voice_id || "random English voice"}</div>
<div className="kanban-meta">{job?.audio_script?.voice_id || "Azure voice"}</div>
</KanbanCard>
</>
)}

View File

@@ -2102,7 +2102,7 @@ export function RewriteNode({ data, selected }: any) {
}
/* ============================================================
5b. AudioNode — 合并 ASR + 翻译 + 改写 + MiniMax 配音
5b. AudioNode — 合并 ASR + 翻译 + 改写 + Azure OpenAI 配音
============================================================ */
export function AudioNode({ data, selected }: any) {
const d: NodeData = data
@@ -2152,9 +2152,9 @@ export function AudioNode({ data, selected }: any) {
}}
>
<div>
/ SKG MiniMax <br />
/ SKG Azure OpenAI <br />
<span className="text-[var(--text-faint)] font-mono">
{audioScript?.rewrite_model || "AUDIO_REWRITE_MODEL"} {audioScript?.voice_model || "MiniMax T2A"}
{audioScript?.rewrite_model || "AUDIO_REWRITE_MODEL"} {audioScript?.voice_model || "Azure OpenAI TTS"}
</span>
</div>
{job && (
@@ -2195,7 +2195,7 @@ export function AudioNode({ data, selected }: any) {
)}
</div>
)}
{voiceUrl && <div className="text-[10.5px] text-emerald-200/85">MiniMax natural English voice ready · </div>}
{voiceUrl && <div className="text-[10.5px] text-emerald-200/85">Azure OpenAI English voice ready · </div>}
{isRewriting && (
<div className="text-[10.5px] text-[var(--text-faint)]"></div>
)}

View File

@@ -172,10 +172,7 @@ export interface RuntimeModels {
voice_id?: string
voice_pool?: string[]
voice_configured?: boolean
minimax_tts?: string
minimax_voice?: string
minimax_voice_pool?: string[]
minimax_configured?: boolean
voice_tts_paths?: string[]
video?: string
video_aliases?: Record<string, string>
video_provider?: string
@@ -559,6 +556,7 @@ export interface CharacterLibraryItem {
name: string
folder: string
description: string
prompt_brief?: string
primary_image: string
images: CharacterLibraryImage[]
}
@@ -584,6 +582,7 @@ export interface SubjectTemplateItem {
name: string
description: string
note: string
prompt_brief?: string
source: "database"
source_job_id: string
source_frame_idx: number
@@ -676,9 +675,7 @@ export interface BackendHealth {
translate?: string
rewrite?: string
audio_rewrite?: string
minimax_tts?: string
minimax_voice?: string
minimax_configured?: boolean
voice_tts_paths?: string[]
video?: string
video_aliases?: Record<string, string>
video_base_url?: string
@@ -692,6 +689,25 @@ export function apiAssetUrl(path?: string | null): string {
return `${API_BASE}${path.startsWith("/") ? "" : "/"}${path}`
}
/**
 * Reports whether an error message looks like a TikTok download that was
 * rejected because the backend has no login cookies.
 *
 * Matching is case-insensitive. The message qualifies when it contains any
 * one of the known marker phrases, or when it mentions both "tiktok" and
 * "cookies" anywhere in the text.
 *
 * @param error - Raw error text; `null`, `undefined` and `""` never match.
 * @returns `true` when the message matches one of the login-required patterns.
 */
export function isRestrictedDownloadError(error?: string | null): boolean {
  const normalized = (error ?? "").toLowerCase()
  // Phrases that are sufficient on their own to classify the error.
  const standaloneMarkers = [
    "tiktok 下载需要登录态",
    "log in for access",
    "cookies-from-browser",
    "ytdlp_cookies_file",
  ]
  if (standaloneMarkers.some((marker) => normalized.includes(marker))) {
    return true
  }
  // Otherwise both words must be present, in any order.
  return normalized.includes("tiktok") && normalized.includes("cookies")
}
/**
 * Converts a raw job error into the text shown to the user.
 *
 * @param error - Raw error text from the job; may be `null` or `undefined`.
 * @returns An empty string when there is no error; a fixed hint (upload an
 *   MP4 or configure the yt-dlp cookie settings) when
 *   `isRestrictedDownloadError` matches; otherwise the original text unchanged.
 */
export function formatJobError(error?: string | null): string {
  if (!error) {
    return ""
  }
  return isRestrictedDownloadError(error)
    ? "这个 TikTok 视频需要登录态。请上传 MP4或让后端配置 YTDLP_COOKIES_FROM_BROWSER / YTDLP_COOKIES_FILE 后重试。"
    : error
}
export async function getHealth(): Promise<BackendHealth> {
const res = await fetch(`${API_BASE}/health`)
if (!res.ok) throw new Error(`health ${res.status}`)
@@ -708,6 +724,15 @@ export async function createJob(tkUrl: string): Promise<Job> {
return res.json()
}
/**
 * Sends a POST to `/jobs/{id}/download/retry` on the API.
 *
 * @param id - Identifier of the job, interpolated into the request path as-is.
 * @returns The parsed JSON response body.
 * @throws The value built by `apiError` (given the HTTP status and response
 *   body) when the response is not OK. If the body cannot be read, an empty
 *   string is passed instead.
 */
export async function retryJobDownload(id: string): Promise<Job> {
  const response = await fetch(`${API_BASE}/jobs/${id}/download/retry`, { method: "POST" })
  if (response.ok) {
    return response.json()
  }
  // Reading the body is best-effort; a failed read must not mask the HTTP error.
  const body = await response.text().catch(() => "")
  throw apiError("retryJobDownload", response.status, body)
}
export async function uploadJob(file: File): Promise<Job> {
const fd = new FormData()
fd.append("file", file)