diff --git a/.memory/worklog.json b/.memory/worklog.json
index 79ed182..b860abc 100644
--- a/.memory/worklog.json
+++ b/.memory/worklog.json
@@ -1,37 +1,5 @@
{
"entries": [
- {
- "files_changed": 1,
- "message": "Codex 会话活跃 · 最近命令:codex · 1 项未提交变更 · 最近提交:auto-save 2026-05-14 17:31 (~1)",
- "ts": "2026-05-14T09:36:15Z",
- "type": "session-heartbeat"
- },
- {
- "files_changed": 1,
- "hash": "c5cc460",
- "message": "auto-save 2026-05-14 17:37 (~1)",
- "ts": "2026-05-14T17:37:45+08:00",
- "type": "commit"
- },
- {
- "files_changed": 1,
- "message": "Codex 会话活跃 · 最近命令:codex · 1 项未提交变更 · 最近提交:auto-save 2026-05-14 17:37 (~1)",
- "ts": "2026-05-14T09:38:43Z",
- "type": "session-heartbeat"
- },
- {
- "files_changed": 1,
- "hash": "43cb9d7",
- "message": "auto-save 2026-05-14 17:43 (~1)",
- "ts": "2026-05-14T17:43:17+08:00",
- "type": "commit"
- },
- {
- "files_changed": 1,
- "message": "Codex 会话活跃 · 最近命令:codex · 1 项未提交变更 · 最近提交:auto-save 2026-05-14 17:43 (~1)",
- "ts": "2026-05-14T09:46:15Z",
- "type": "session-heartbeat"
- },
{
"files_changed": 1,
"message": "Codex 会话活跃 · 最近命令:codex · 1 项未提交变更 · 最近提交:auto-save 2026-05-14 17:43 (~1)",
@@ -3268,6 +3236,39 @@
"message": "auto-save 2026-05-17 12:28 (~4)",
"hash": "08f1837",
"files_changed": 4
+ },
+ {
+ "ts": "2026-05-17T12:33:13+08:00",
+ "type": "commit",
+ "message": "feat: add automatic production start workflow",
+ "hash": "b02bc3f",
+ "files_changed": 7
+ },
+ {
+ "ts": "2026-05-17T04:38:24Z",
+ "type": "session-heartbeat",
+ "message": "Codex 会话活跃 · 最近命令:codex · 分支 main · 1 项未提交变更 · 最近提交:feat: add automatic production start workflow",
+ "files_changed": 1
+ },
+ {
+ "ts": "2026-05-17T12:44:55+08:00",
+ "type": "commit",
+ "message": "auto-save 2026-05-17 12:44 (~5)",
+ "hash": "05e9e59",
+ "files_changed": 5
+ },
+ {
+ "ts": "2026-05-17T04:48:24Z",
+ "type": "session-heartbeat",
+ "message": "Codex 会话活跃 · 最近命令:codex · 分支 main · 7 项未提交变更 · 最近提交:auto-save 2026-05-17 12:44 (~5)",
+ "files_changed": 7
+ },
+ {
+ "ts": "2026-05-17T12:50:17+08:00",
+ "type": "commit",
+ "message": "auto-save 2026-05-17 12:50 (~8)",
+ "hash": "4dc4092",
+ "files_changed": 8
}
]
}
diff --git a/.project.json b/.project.json
index 2911927..4c7ab81 100644
--- a/.project.json
+++ b/.project.json
@@ -27,7 +27,7 @@
"type": "web_login"
}
],
- "description": "SKG 信息流广告快速复刻分镜生产板:粘贴/上传素材后点击开始生产,自动下载、抽帧、解析音频、扫描关键元素并生成分镜初稿;用户在每张分镜卡中选择关键元素生成提取图和 6 视图,审核文案后可单条或全量生成视频候选,完整视频合成暂为待接入入口。",
+ "description": "SKG 信息流广告快速复刻第一步:粘贴 TK 链接或上传视频后点击开始,系统自动下载源视频;下载完成后优先解析原音频,提取原文案/字幕,分析讲话人、语速节奏、背景音乐/环境声/音效。抽帧、分镜、元素生成和视频合成暂保留为后续能力,不作为当前开始流程的默认动作。",
"kind": "app",
"name": "SKG Marketing Studio / SKG 营销内容工作台",
"ownership": "company",
diff --git a/RULES.md b/RULES.md
index 596b700..434a952 100644
--- a/RULES.md
+++ b/RULES.md
@@ -11,7 +11,7 @@
- 详见 `CLAUDE.md` 立项决策段 + `.memory/plan.md` 七步管线拆解
- 风格:`04-Dark-Gallery-Ambient`(路径:`~/Projects/research/20260305-网页风格库/04-Dark-Gallery-Ambient.md`)
- 第一冲刺:步骤 1-4(下载 / 拆轨 / 关键帧 / ASR+翻译)
-- 当前产品方向(2026-05-17 确认):优先做信息流广告快速复刻产出,不再把主界面做成可视化流程节点;主界面为“左侧素材输入列 + 右侧单一分镜生产板块”。用户粘贴链接或上传素材后点击“开始”,系统自动下载、抽帧、解析音频、扫描关键元素并生成分镜初稿;分镜生产板块内,每个分镜从上到下依次包含音频分镜文案、该分镜关键元素 / 抽帧生成、该分镜视频生成。用户在关键元素候选里选择后生成元素提取图和 6 视图,审核分镜规划后可单条生成或“生成全部视频”。完整视频合成入口保留为待接入能力。
+- 当前产品方向(2026-05-17 再确认):先解决信息流广告快速复刻的第一步,不再沿用“开始后自动抽帧、分镜、元素生成、合成”的默认做法。主界面为“左侧素材输入列 + 右侧音频解析工作表”。用户粘贴 TK 链接或上传视频后点击“开始”,系统自动下载源视频;下载完成后优先提取原音频文案/字幕,并分析讲话人、语速节奏、背景音乐/环境声/音效。抽帧、分镜规划、产品融入、元素 6 视图和视频合成暂作为后续能力保留,不在当前第一步自动触发。
## 部署事实
- 平台:VPS `76.13.31.179`(Ubuntu 24.04 / Docker Compose / Coolify Traefik)
@@ -55,11 +55,11 @@
- `ASR_FALLBACK_MODEL`:当当前网关没有 `/audio/transcriptions` 时,用 Gemini 多模态 chat 直接识别 wav,默认 `gemini-2.5-flash`
- `TRANSLATE_MODEL`:字幕翻译模型,默认 `gemini-2.5-flash`
- `REWRITE_MODEL`:通用改写/分镜描述模型,默认 `gemini-2.5-pro`
-- `AUDIO_REWRITE_MODEL`:音频口播改写模型,默认跟随 `REWRITE_MODEL`;当前产物要求按原音频时长输出英文 SKG 产品介绍 voice-over
+- `AUDIO_REWRITE_MODEL`:后续音频口播改写模型,默认跟随 `REWRITE_MODEL`;当前第一步不默认调用口播改写,只保留原文案和声音分析
- `AUDIO_PRODUCT_BRIEF`:音频口播改写时注入的 SKG 产品卖点
-- `MINIMAX_API_KEY`:MiniMax T2A 配音 Key,只能放本地 `api/.env`,不能入库
-- `MINIMAX_TTS_BASE_URL` / `MINIMAX_TTS_MODEL` / `MINIMAX_TTS_VOICE_ID`:MiniMax 配音端点、模型和兜底音色配置
-- `MINIMAX_TTS_VOICE_POOL`:MiniMax 英文随机音色池;当前默认男声 `English_magnetic_voiced_man`、女声 `English_Upbeat_Woman`、成熟声 `English_MaturePartner`
+- `MINIMAX_API_KEY`:MiniMax T2A 配音 Key,只能放本地 `api/.env`,不能入库;当前第一步暂不默认调用
+- `MINIMAX_TTS_BASE_URL` / `MINIMAX_TTS_MODEL` / `MINIMAX_TTS_VOICE_ID`:MiniMax 配音端点、模型和兜底音色配置,供后续新配音阶段使用
+- `MINIMAX_TTS_VOICE_POOL`:MiniMax 英文随机音色池;当前默认男声 `English_magnetic_voiced_man`、女声 `English_Upbeat_Woman`、成熟声 `English_MaturePartner`,供后续新配音阶段使用
- `POE_API_KEY` / `VIDEO_API_KEY`:视频生成通道 Key,只能放本地环境变量
- `WEB_AUTH_USERNAME` / `WEB_AUTH_PASSWORD` / `WEB_AUTH_SESSION_SECRET`:生产网页登录和会话签名配置;密码和 session secret 只放服务器环境变量,不入库
- 生产环境变量:服务器只使用 `deploy/.env.production`,模板为 `deploy/.env.production.example`;真实 Key 不入库
diff --git a/api/main.py b/api/main.py
index 49e4546..b960f1d 100644
--- a/api/main.py
+++ b/api/main.py
@@ -146,7 +146,7 @@ def llm() -> OpenAI:
return _llm_client
# Pipeline 状态:
-# created → downloading → downloaded(停,等用户点解析/提取音频)
+# created → downloading → downloaded(前端“开始”会继续触发音频解析)
# → splitting → frames_extracted
# → transcribing → transcribed | failed
JobStatus = Literal[
@@ -437,6 +437,7 @@ class AudioScript(BaseModel):
rewritten_text: str = ""
speaker_profile: str = ""
rhythm_profile: str = ""
+ background_audio_profile: str = ""
product_brief: str = ""
rewrite_model: str = ""
voice_provider: str = ""
@@ -1392,7 +1393,7 @@ def media_duration(path: Path) -> float:
def pipeline_download(job_id: str) -> None:
- """阶段 1:仅下载(或上传跳过),落 source.mp4,停在 downloaded 等用户点解析/提取音频。"""
+ """阶段 1:仅下载(或上传跳过),落 source.mp4;前端开始流程会在 downloaded 后触发音频解析。"""
job = JOBS[job_id]
d = job_dir(job_id)
try:
@@ -1423,7 +1424,7 @@ def pipeline_download(job_id: str) -> None:
height=int(v_stream["height"]) if v_stream else 0,
progress=25,
error="",
- message=f"视频就绪 · {duration:.1f}s · 等待解析",
+ message=f"视频就绪 · {duration:.1f}s · 等待音频解析",
)
except Exception as e:
update(job, status="failed", error=str(e), message="下载失败")
@@ -1785,6 +1786,91 @@ def _audio_delivery_profile(segments: list[TranscriptSegment], target_seconds: f
return speaker, rhythm
+def _fallback_audio_profile(segments: list[TranscriptSegment], target_seconds: float = 0.0) -> tuple[str, str, str]:
+    """Estimate (speaker, rhythm, background) profile strings from the transcript alone, without calling any model."""
+    duration = max(float(target_seconds or 0), _segment_duration(segments), 0.0)
+    words = sum(len((s.en or "").split()) for s in segments)  # split() treats any whitespace run as one separator
+    sentence_count = sum(1 for s in segments if (s.en or "").strip() or (s.zh or "").strip())  # blank en must not hide a real zh line
+    wpm = int(round(words / max(duration, 1.0) * 60)) if words else 0
+    avg_sentence = duration / sentence_count if sentence_count else 0.0
+    speaker = "检测到短视频口播人声;当前仅能根据转写段落估算,未做声纹克隆。"
+    if duration > 0 and sentence_count:
+        rhythm = f"音频约 {duration:.1f}s,{sentence_count} 个文案段,语速约 {wpm} wpm,平均每段 {avg_sentence:.1f}s。"
+    else:
+        rhythm = "音频节奏信息不足;等待模型返回更完整的语速和停顿分析。"
+    background = "背景音待模型细分;当前已保留原音频文件,可继续用于音乐、人声和环境声判断。"
+    return speaker, rhythm, background
+
+
+def _audio_profile_model_sync(wav: Path, segments: list[TranscriptSegment], target_seconds: float = 0.0) -> tuple[str, str, str]:
+    """Ask ASR_FALLBACK_MODEL (up to two attempts) for speaker/rhythm/background profiles of the WAV; return the transcript-based fallback on failure."""
+    fallback = _fallback_audio_profile(segments, target_seconds)
+    if not LLM_API_KEY or not wav.exists():
+        return fallback
+    transcript = _transcript_join(segments, "en") or _transcript_join(segments, "zh") or "No reliable transcript."
+    try:
+        audio_b64 = base64.b64encode(wav.read_bytes()).decode("ascii")
+    except Exception:
+        return fallback
+    prompt = (
+        "Analyze this short-video audio for an ad recreation workflow. Return strict JSON only, no markdown.\n"
+        "Fields:\n"
+        "- speaker_profile: describe speaker count, likely gender/age range if audible, tone, energy, accent/language, confidence.\n"
+        "- rhythm_profile: describe pacing, pauses, speech density, segment rhythm, and timing pattern.\n"
+        "- background_audio_profile: describe music, background sound, ambience, SFX, loudness relationship to voice, and whether it should be recreated or replaced.\n"
+        "Do not invent an exact identity. If uncertain, state uncertainty.\n\n"
+        f"Known transcript/timestamps:\n{transcript[:5000]}"
+    )
+    last_error: Exception | None = None
+    for attempt in range(2):
+        try:
+            resp = llm().chat.completions.create(
+                model=ASR_FALLBACK_MODEL,
+                messages=[{"role": "user", "content": [
+                    {"type": "text", "text": prompt},
+                    {"type": "input_audio", "input_audio": {"data": audio_b64, "format": "wav"}},
+                ]}],
+                response_format={"type": "json_object"},
+                temperature=0.1,
+                max_tokens=900,
+            )
+            content = (resp.choices[0].message.content or "").strip()
+            if content.startswith("```"):  # tolerate a Markdown code fence around the JSON payload
+                content = content.strip("`").removeprefix("json").strip()
+            data = json.loads(content)
+            speaker = str(data.get("speaker_profile") or "").strip()
+            rhythm = str(data.get("rhythm_profile") or "").strip()
+            background = str(data.get("background_audio_profile") or "").strip()
+            if speaker or rhythm or background:
+                return speaker or fallback[0], rhythm or fallback[1], background or fallback[2]
+            last_error = ValueError("audio profile response had no usable fields")
+        except Exception as e:
+            last_error = e
+            if attempt == 0:
+                time.sleep(1.0)
+    if last_error:
+        print(f"[audio profile fallback] {last_error}", flush=True)
+    return fallback
+
+
+def _build_audio_intake_sync(job_id: str, wav: Path, segments: list[TranscriptSegment], target_seconds: float = 0.0) -> AudioScript:
+    """Build the completed AudioScript for the intake step: original transcript plus analysed audio profiles. job_id is not read here."""
+    text_en, text_zh = _transcript_join(segments, "en"), _transcript_join(segments, "zh")
+    length = max(float(target_seconds or 0), _segment_duration(segments), 0.0)
+    speaker, rhythm, background = _audio_profile_model_sync(wav, segments, length)
+    return AudioScript(
+        status="completed",
+        source_text=text_en,
+        source_zh=text_zh,
+        speaker_profile=speaker,
+        rhythm_profile=rhythm,
+        background_audio_profile=background,
+        product_brief=AUDIO_PRODUCT_BRIEF,
+        rewrite_model=ASR_FALLBACK_MODEL,  # model used for the audio analysis, not a rewrite
+        created_at=time.time(),
+    )
+
+
def _rewrite_audio_script_sync(segments: list[TranscriptSegment], target_seconds: float = 12.0) -> tuple[str, str]:
fallback = _fallback_audio_script(segments, target_seconds)
if not LLM_API_KEY:
@@ -1980,21 +2066,21 @@ def pipeline_transcribe(job_id: str, manage_job_status: bool = True) -> None:
status="rewriting",
source_text=_transcript_join(mock, "en"),
source_zh=_transcript_join(mock, "zh"),
+ speaker_profile="正在分析原音频讲话人和口播节奏…",
+ rhythm_profile="正在按原音频时长、语速和停顿分析口播节奏…",
+ background_audio_profile="正在分析背景音乐、环境声和音效…",
product_brief=AUDIO_PRODUCT_BRIEF,
- rewrite_model=AUDIO_REWRITE_MODEL,
- voice_provider="minimax",
- voice_model=MINIMAX_TTS_MODEL,
- voice_id="random:" + ",".join(MINIMAX_TTS_VOICE_POOL or [MINIMAX_TTS_VOICE_ID]),
+ rewrite_model=ASR_FALLBACK_MODEL,
),
}
if manage_job_status:
- update_kwargs.update(message="ASR mock 完成,生成 SKG 英文产品口播…", progress=92)
+ update_kwargs.update(message="ASR mock 完成,分析声音和背景音…", progress=92)
update(job, **update_kwargs)
- audio_script = _build_audio_script_sync(job_id, mock, target_duration)
+ audio_script = _build_audio_intake_sync(job_id, wav, mock, target_duration)
if manage_job_status:
update(job, transcript=mock, status="transcribed", progress=100,
audio_script=audio_script,
- message="转录完成(MOCK · 未设 LLM_API_KEY)")
+ message="音频解析完成(MOCK · 未设 LLM_API_KEY)")
else:
update(job, transcript=mock, audio_script=audio_script)
return
@@ -2046,21 +2132,21 @@ def pipeline_transcribe(job_id: str, manage_job_status: bool = True) -> None:
status="rewriting",
source_text=_transcript_join(full, "en"),
source_zh=_transcript_join(full, "zh"),
+ speaker_profile="正在分析原音频讲话人和口播节奏…",
+ rhythm_profile="正在按原音频时长、语速和停顿分析口播节奏…",
+ background_audio_profile="正在分析背景音乐、环境声和音效…",
product_brief=AUDIO_PRODUCT_BRIEF,
- rewrite_model=AUDIO_REWRITE_MODEL,
- voice_provider="minimax",
- voice_model=MINIMAX_TTS_MODEL,
- voice_id="random:" + ",".join(MINIMAX_TTS_VOICE_POOL or [MINIMAX_TTS_VOICE_ID]),
+ rewrite_model=ASR_FALLBACK_MODEL,
),
}
if manage_job_status:
- update_kwargs.update(message="翻译完成,生成 SKG 英文产品口播与 MiniMax 配音…", progress=94)
+ update_kwargs.update(message="翻译完成,分析讲话人、节奏和背景音…", progress=94)
update(job, **update_kwargs)
- audio_script = _build_audio_script_sync(job_id, full, target_duration)
+ audio_script = _build_audio_intake_sync(job_id, wav, full, target_duration)
if manage_job_status:
update(job, transcript=full, status="transcribed", progress=100,
audio_script=audio_script,
- message=f"转录完成 · {len(full)} 段({ASR_MODEL} + {TRANSLATE_MODEL})")
+ message=f"音频解析完成 · {len(full)} 段({ASR_MODEL} + {TRANSLATE_MODEL} + {ASR_FALLBACK_MODEL} 音频分析)")
else:
update(job, transcript=full, audio_script=audio_script)
@@ -2498,12 +2584,10 @@ async def trigger_transcribe(job_id: str, bg: BackgroundTasks) -> Job:
audio_payload = AudioScript(
status="rewriting",
speaker_profile="正在分析原音频讲话人和口播节奏…",
- rhythm_profile="正在按原音频时长、语速和停顿生成 SKG 产品配音脚本…",
+ rhythm_profile="正在按原音频时长、语速和停顿分析口播节奏…",
+ background_audio_profile="正在分析背景音乐、环境声和音效…",
product_brief=AUDIO_PRODUCT_BRIEF,
- rewrite_model=AUDIO_REWRITE_MODEL,
- voice_provider="minimax",
- voice_model=MINIMAX_TTS_MODEL,
- voice_id="random:" + ",".join(MINIMAX_TTS_VOICE_POOL or [MINIMAX_TTS_VOICE_ID]),
+ rewrite_model=ASR_FALLBACK_MODEL,
)
if manage_job_status:
update(job, status="transcribing", progress=max(45, min(job.progress, 70)), error="", message="准备提取音频…", audio_script=audio_payload)
diff --git a/docs/source-analysis.html b/docs/source-analysis.html
index 7ce639a..954e3d5 100644
--- a/docs/source-analysis.html
+++ b/docs/source-analysis.html
@@ -485,7 +485,7 @@
这个页面是产品协作地图,不是应用功能页。
它把“你看到的界面、你想改的功能、实际要动的源码、可能影响的数据和接口”放在同一个地方。
- 后续描述需求时,可以直接说“改素材输入列 / 某个分镜卡片 / 某个接口行为”,这样改动范围会更准,也更容易追踪每次变更带来的影响。
+ 后续描述需求时,可以直接说“改素材输入列 / 音频解析工作表 / 某个接口行为”,这样改动范围会更准,也更容易追踪每次变更带来的影响。