diff --git a/.memory/worklog.json b/.memory/worklog.json
index 423c012..6f66e57 100644
--- a/.memory/worklog.json
+++ b/.memory/worklog.json
@@ -1,12 +1,5 @@
{
"entries": [
- {
- "files_changed": 1,
- "hash": "ab6f035",
- "message": "auto-save 2026-05-13 01:42 (~1)",
- "ts": "2026-05-13T01:42:52+08:00",
- "type": "commit"
- },
{
"files_changed": 1,
"hash": "6128084",
@@ -3311,6 +3304,13 @@
"type": "session-heartbeat",
"message": "Codex 会话活跃 · 最近命令:codex · 5 项未提交变更 · 最近提交:auto-save 2026-05-14 10:31 (~4)",
"files_changed": 5
+ },
+ {
+ "ts": "2026-05-14T10:40:12+08:00",
+ "type": "commit",
+ "message": "auto-save 2026-05-14 10:36 (~5)",
+ "hash": "1014114",
+ "files_changed": 5
}
]
}
diff --git a/api/main.py b/api/main.py
index 7860287..4ec88be 100644
--- a/api/main.py
+++ b/api/main.py
@@ -561,7 +561,36 @@ async def lifespan(_: FastAPI):
for p in JOBS_DIR.iterdir():
if p.is_dir() and (p / "state.json").exists():
try:
- JOBS[p.name] = Job.model_validate_json((p / "state.json").read_text())
+ job = Job.model_validate_json((p / "state.json").read_text())
+ source_exists = (p / "source.mp4").exists()
+ if job.status in {"created", "downloading"}:
+ if source_exists:
+ update(job, status="downloaded", progress=25, message="服务重启 · 视频已恢复,可重新解析")
+ else:
+ update(job, status="failed", message="服务重启 · 下载任务已中断,请重新提交")
+ elif job.status == "splitting":
+ update(
+ job,
+ status="frames_extracted" if job.frames else "downloaded",
+ progress=70 if job.frames else 25,
+ message="服务重启 · 上次抽帧已中断,可重新抽帧",
+ )
+ elif job.status == "transcribing":
+ audio_script = job.audio_script
+ if audio_script.status == "rewriting":
+ audio_script = audio_script.model_copy(update={
+ "status": "failed",
+ "error": "服务重启 · 上次音频改写/配音已中断,可重新处理",
+ "created_at": audio_script.created_at or time.time(),
+ })
+ update(
+ job,
+ status="frames_extracted",
+ progress=70,
+ audio_script=audio_script,
+ message="服务重启 · 上次音频处理已中断,可重新处理",
+ )
+ JOBS[p.name] = job
except Exception:
pass
yield
@@ -1122,7 +1151,7 @@ def ffprobe_meta(mp4: Path) -> dict:
return json.loads(out)
-async def pipeline_download(job_id: str) -> None:
+def pipeline_download(job_id: str) -> None:
"""阶段 1:仅下载(或上传跳过),落 source.mp4,停在 downloaded 等用户点解析。"""
job = JOBS[job_id]
d = job_dir(job_id)
@@ -1159,7 +1188,7 @@ async def pipeline_download(job_id: str) -> None:
update(job, status="failed", error=str(e), message="下载失败")
-async def pipeline_analyze(
+def pipeline_analyze(
job_id: str,
frame_count: int = KEYFRAME_COUNT,
target: FrameExtractTarget = "transparent_human",
@@ -1311,7 +1340,7 @@ async def pipeline_analyze(
update(job, status="failed", error=str(e), message="解析失败")
-async def analyze_queue_worker() -> None:
+def analyze_queue_worker() -> None:
global ANALYZE_WORKER_RUNNING
ANALYZE_WORKER_RUNNING = True
try:
@@ -1319,7 +1348,7 @@ async def analyze_queue_worker() -> None:
job_id, frames, target, mode, quality = ANALYZE_QUEUE.pop(0)
if job_id not in JOBS:
continue
- await pipeline_analyze(job_id, frames, target, mode, quality)
+ pipeline_analyze(job_id, frames, target, mode, quality)
if ANALYZE_QUEUE:
for pos, (queued_job_id, *_rest) in enumerate(ANALYZE_QUEUE, start=1):
queued_job = JOBS.get(queued_job_id)
@@ -1984,6 +2013,14 @@ def get_video(job_id: str):
return FileResponse(p, media_type="video/mp4")
+@app.get("/jobs/{job_id}/audio.wav")
+def get_source_audio(job_id: str):
+    """Serve the job's ``audio.wav`` with the ``audio/wav`` media type.
+
+    Raises:
+        HTTPException: 404 ("audio not found") when the file is absent
+            from the job directory.
+    """
+    wav_path = job_dir(job_id) / "audio.wav"
+    if wav_path.exists():
+        return FileResponse(wav_path, media_type="audio/wav")
+    raise HTTPException(404, "audio not found")
+
+
@app.get("/jobs/{job_id}/audio-script.mp3")
def get_audio_script(job_id: str):
p = job_dir(job_id) / "audio_script.mp3"
diff --git a/docs/source-analysis.html b/docs/source-analysis.html
index babcc52..cd876f6 100644
--- a/docs/source-analysis.html
+++ b/docs/source-analysis.html
@@ -572,6 +572,7 @@
web/app/page.tsx | 产品工作台主状态:jobs、activeJobId、按 job 隔离的 selectedFrames/详情面板状态、clipboard、ReactFlow 节点和边;负责打开/找回画布工作面板。 |
web/components/nodes/index.tsx | DAG 节点定义:Input、VisualLab、Audio、Compose,以及画布工作面板 KeyframePanel / VideoFramePanel;旧 Keyframe/Storyboard/VideoGen 组件保留但不再挂主画布。 |
+ web/components/audio-strip.tsx | 底部吸附音频条:可拖拽调整高度;按时间段展示英文、中文翻译和音频波形,并在右侧固定显示 SKG 改写稿和 MiniMax 配音。 |
web/components/lightbox.tsx | 关键帧素材准备面板:清洗、统一主体候选、参考帧网格、六张主体重绘图、每帧去主体场景图、纵向 6 行产品融合镜头工作表和审核。 |
web/components/product-library-picker.tsx | SKG 内置白底产品图库选择器:搜索、品类筛选、预览尺寸,并把库内图片复制为当前 job 的 asset。 |
web/components/storyboard-bar.tsx | 顶部分镜编排条:展示选入编排的关键帧,并作为唯一分镜导航。 |
@@ -587,6 +588,7 @@
api/main.py | FastAPI 单文件后端:状态模型、任务恢复、下载、抽帧、Vision、清洗、元素、分镜、音频文案改写、MiniMax 配音、文件返回。 |
api/product_library/skg-products | 内置 SKG 白底产品图库:manifest.json 记录从桌面产品图筛出的 gallery 白底图,images/ 存 41 张压缩后的参考图。 |
jobs/<jobId>/state.json | 运行时状态文件,不在源码列表里,但刷新恢复依赖它。 |
+ jobs/<jobId>/audio.wav | 拆轨得到的原始音频,底部 Audio Strip 会通过只读接口拉取并在浏览器里解码成波形峰值。 |
jobs/<jobId>/frames | 关键帧 jpg。注意 frame.index 是稳定 ID,不等于数组下标。 |
jobs/<jobId>/cleaned | 清洗后待应用图片。 |
jobs/<jobId>/elements | 元素提取图,多版本命名:idx_elementId_cutoutId.jpg。 |
@@ -599,6 +601,7 @@
web/app/page.tsx
-> ReactFlow 节点:web/components/nodes/index.tsx
-> 主画布:Input → VisualLab / Audio → Compose
+ -> 底部音频条:web/components/audio-strip.tsx(英文 / 中文 / 波形 / 改写稿)
-> 画布内视频抽帧面板:InputNode 单击视频缩略图打开 videoFramePanel
-> 画布内镜头拆解面板:VisualLabNode 打开 keyframePanel,内嵌 web/components/lightbox.tsx
-> 分镜工作台:web/components/storyboard-workbench.tsx(底层保留)
@@ -790,6 +793,7 @@ SubjectAsset {
| 删除输入视频 | DELETE /jobs/{id} | deleteJob | 从任务队列、URL 和磁盘 jobs/<id> 目录移除整个 job,包括源视频、关键帧、元素提取图和生成视频。 |
| 解析视频 | POST /jobs/{id}/analyze?frames=&target=&mode=&quality= | analyzeJob | 拆轨 + 目标化抽关键帧。默认 frames=12;target 支持透明骨架人、综合、清晰主体、转场变化、表情瞬间、动作峰值;当前 UI 默认 transparent_human。透明骨架人目标会先扩大本地候选池,再调用 Vision 按 6 个分数验收;不合格候选自动丢弃并抽下一候选。mode=append 追加新关键帧;quality=auto 根据本机算力和视频时长自动选择快速、精细或极准。多个抽帧请求进入后端队列顺序处理。 |
| 音频文案轨 | POST /jobs/{id}/transcribe | triggerTranscribe | 读取拆轨得到的 audio.wav,先 ASR 得到英文时间戳段落,再翻译中文,随后按 AUDIO_PRODUCT_BRIEF 生成 audio_script.rewritten_text;配置 MINIMAX_API_KEY 后调用 MiniMax T2A 生成 audio_script.voice_url。 |
+ | 原始音频文件 | GET /jobs/{id}/audio.wav | sourceAudioUrl | 返回拆轨得到的 wav;底部 AudioStrip 拉取该文件,用 Web Audio API 解码并计算波形峰值,只读展示,不参与改写。 |
| 改写配音文件 | GET /jobs/{id}/audio-script.mp3 | apiAssetUrl(job.audio_script.voice_url) | 返回 MiniMax T2A 生成的 mp3。没有配置 MiniMax 或生成失败时该文件不存在,但改写文案仍会保存在 audio_script.rewritten_text。 |
| 手动加帧 | POST /jobs/{id}/frames?t= | addManualFrame | 按视频时间戳抽一帧,index 递增但 frames 按 timestamp 排序。 |
| Vision 识别 | POST /frames/{idx}/describe | describeFrame | 写入 frame.description,后续可从 objects 加候选元素。 |
@@ -837,9 +841,9 @@ SubjectAsset {
| Audio / ASR / Rewrite |
- 独立声音文案轨:从 audio.wav 提取原始口播、翻译中文、改写成 SKG 产品语境口播;MiniMax T2A 配置后生成配音 mp3。主画布的 AudioNode 用“改前 · 原音频 / 改后 · SKG 口播”摘要展示,侧栏 Rewrite 展开后显示完整逐段 ASR/翻译、改写稿、产品依据和配音播放器。 |
+ 独立声音文案轨:从 audio.wav 提取原始口播、翻译中文、改写成 SKG 产品语境口播;MiniMax T2A 配置后生成配音 mp3。主画布的 AudioNode 用“改前 · 原音频 / 改后 · SKG 口播”摘要展示;底部 AudioStrip 吸附屏幕底端,可拖拽调整高度,按时间段展示英文、中文翻译和波形;侧栏 Rewrite 展开后显示完整审核视图。 |
不要阻断视觉素材管线。 |
- AudioNode、ASRNode、TranslateNode、RewriteNode、pipeline_transcribe、AudioScript |
+ AudioNode、AudioStrip、ASRNode、TranslateNode、RewriteNode、pipeline_transcribe、AudioScript |
| Video / Compose |
diff --git a/web/app/page.tsx b/web/app/page.tsx
index e66b627..0caff3f 100644
--- a/web/app/page.tsx
+++ b/web/app/page.tsx
@@ -16,6 +16,7 @@ import {
type NodeData,
} from "@/components/nodes"
import { ThemeToggle } from "@/components/theme-toggle"
+import { AudioStrip } from "@/components/audio-strip"
import {
addManualFrame, analyzeJob, createJob, getJob, listJobs, uploadJob, deleteJob, deleteFrame, deleteGeneratedImage,
deleteGeneratedVideo, deleteCutout, generateStoryboardVideo, createProductFusionGuide,
@@ -976,9 +977,10 @@ export default function Home() {
+
-
+
>
diff --git a/web/components/audio-strip.tsx b/web/components/audio-strip.tsx
new file mode 100644
index 0000000..f0bbd12
--- /dev/null
+++ b/web/components/audio-strip.tsx
@@ -0,0 +1,255 @@
+"use client"
+
+import { useEffect, useMemo, useRef, useState, type PointerEvent as ReactPointerEvent } from "react"
+import { ChevronDown, ChevronUp, GripHorizontal, Mic2, Volume2 } from "lucide-react"
+import { apiAssetUrl, sourceAudioUrl, type Job, type TranscriptSegment } from "@/lib/api"
+
+const STORAGE_KEY = "skg.audio-strip.height"
+const MIN_HEIGHT = 132
+const MAX_HEIGHT = 420
+const DEFAULT_HEIGHT = 236
+
+// Restricts `value` to the closed range [min, max].
+function clamp(value: number, min: number, max: number) {
+  const atLeastMin = Math.max(min, value)
+  return Math.min(max, atLeastMin)
+}
+
+// Produces `count` deterministic pseudo-waveform values in 0.18..1, seeded by a
+// rolling hash of `seedText`; used when real peaks are not available.
+function fallbackPeaks(count: number, seedText: string) {
+  // Hash over UTF-16 code units (same units charCodeAt reads), kept below 9973.
+  const seed = seedText
+    .split("")
+    .reduce((acc, unit) => (acc * 31 + unit.charCodeAt(0)) % 9973, 0)
+  return Array.from({ length: count }, (_, i) => {
+    const phase = i + seed
+    const fast = Math.sin(phase * 0.43) * 0.35
+    const slow = Math.sin(phase * 0.11) * 0.25
+    const pulse = (phase % 9) / 18
+    return clamp(0.22 + Math.abs(fast + slow) + pulse, 0.18, 1)
+  })
+}
+
+// Resamples the part of `peaks` that covers [start, end] of a `duration`-long
+// timeline into `count` bars, taking the maximum of each bucket (never below 0.12).
+// Falls back to synthetic peaks when there is no data or the range is empty.
+function slicePeaks(peaks: number[], start: number, end: number, duration: number, count = 56) {
+  const unusable = peaks.length === 0 || duration <= 0 || end <= start
+  if (unusable) return fallbackPeaks(count, `${start}-${end}`)
+
+  const total = peaks.length
+  const from = clamp(Math.floor((start / duration) * total), 0, total - 1)
+  const to = clamp(Math.ceil((end / duration) * total), from + 1, total)
+  const section = peaks.slice(from, to)
+
+  return Array.from({ length: count }, (_, bar) => {
+    const lo = Math.floor((bar / count) * section.length)
+    // Every bar reads at least one sample, even when the section is shorter than `count`.
+    const hi = Math.max(lo + 1, Math.floor(((bar + 1) / count) * section.length))
+    return section.slice(lo, hi).reduce((best, v) => Math.max(best, v), 0.12)
+  })
+}
+
+// Waveform view: maps over `peaks` and emits one child per entry.
+// NOTE(review): the markup that consumes `p`, `i` and `active` is not visible in
+// this view — confirm how `active` (default false) affects the rendering.
+function Waveform({ peaks, active = false }: { peaks: number[]; active?: boolean }) {
+ return (
+
+ {peaks.map((p, i) => (
+
+ ))}
+
+ )
+}
+
+// Card for one transcript segment: shows its start/end times, its 1-based
+// number, the English text when present, and the Chinese translation (with a
+// placeholder while `segment.zh` is empty).
+// `peaks` / `duration` describe the whole track; the card slices out its own range.
+function SegmentCard({
+ segment,
+ peaks,
+ duration,
+}: {
+ segment: TranscriptSegment
+ peaks: number[]
+ duration: number
+}) {
+ // Segment length with a floor of 1.2 so very short segments still get a usable width.
+ const segDuration = Math.max(1.2, segment.end - segment.start)
+ // Width grows with segment length and is kept within 220..520 (presumably px — confirm against the markup).
+ const width = clamp(180 + segDuration * 42, 220, 520)
+ // Peaks restricted to this segment's time range (slicePeaks default bar count).
+ const segPeaks = slicePeaks(peaks, segment.start, segment.end, duration)
+
+ return (
+
+
+
+ {segment.start.toFixed(1)}s -> {segment.end.toFixed(1)}s
+
+
+ #{segment.index + 1}
+
+
+
+ {segment.en && (
+
+
English
+
{segment.en}
+
+ )}
+
+
中文翻译
+
+ {segment.zh || 翻译中...}
+
+
+
+
+ )
+}
+}
+
+// Fetches the audio at `url`, decodes it with the Web Audio API and reduces
+// channel 0 to `targetPeaks` peak values normalized into 0.08..1.
+// Throws when the response is not OK or no AudioContext implementation exists;
+// decoding errors propagate from decodeAudioData.
+async function decodeWaveform(url: string, targetPeaks = 1800) {
+  const res = await fetch(url)
+  if (!res.ok) throw new Error(`audio ${res.status}`)
+  const arrayBuffer = await res.arrayBuffer()
+  const AudioContextClass = window.AudioContext || (window as typeof window & { webkitAudioContext?: typeof AudioContext }).webkitAudioContext
+  if (!AudioContextClass) throw new Error("AudioContext unavailable")
+  const ctx = new AudioContextClass()
+  try {
+    // The downloaded buffer is not reused after decoding, so it is passed
+    // directly instead of duplicating the whole file in memory first.
+    const buffer = await ctx.decodeAudioData(arrayBuffer)
+    const data = buffer.getChannelData(0)
+    const total = data.length
+    // Floor of 0.01 avoids dividing by zero on silent audio.
+    let maxPeak = 0.01
+    const raw: number[] = []
+    for (let i = 0; i < targetPeaks; i++) {
+      // Proportional bucket edges keep peak index linear in time over the
+      // whole clip: no trailing samples are dropped and clips shorter than
+      // `targetPeaks` are stretched rather than padded with silence.
+      const start = Math.floor((i * total) / targetPeaks)
+      const end = Math.min(total, Math.max(start + 1, Math.floor(((i + 1) * total) / targetPeaks)))
+      let peak = 0
+      for (let j = start; j < end; j++) peak = Math.max(peak, Math.abs(data[j] || 0))
+      raw.push(peak)
+      maxPeak = Math.max(maxPeak, peak)
+    }
+    return raw.map((p) => clamp(p / maxPeak, 0.08, 1))
+  } finally {
+    // Release the audio context; a failure to close is not actionable here.
+    void ctx.close().catch(() => {})
+  }
+}
+
+export function AudioStrip({ job }: { job: Job | null }) {
+ const [collapsed, setCollapsed] = useState(false)
+ const [height, setHeight] = useState(DEFAULT_HEIGHT)
+ const [peaks, setPeaks] = useState([])
+ const dragRef = useRef<{ startY: number; startHeight: number } | null>(null)
+ const transcript = job?.transcript ?? []
+ const audioScript = job?.audio_script
+ const voiceUrl = apiAssetUrl(audioScript?.voice_url)
+ const hasAudio = !!job && (transcript.length > 0 || !!audioScript?.rewritten_text || job.status === "transcribing")
+ const duration = useMemo(() => {
+ const lastTranscriptEnd = transcript.reduce((max, s) => Math.max(max, s.end || 0), 0)
+ return Math.max(job?.duration ?? 0, lastTranscriptEnd, 1)
+ }, [job?.duration, transcript])
+
+ useEffect(() => {
+ if (typeof window === "undefined") return
+ const stored = Number(window.localStorage.getItem(STORAGE_KEY) || "")
+ if (Number.isFinite(stored) && stored > 0) setHeight(clamp(stored, MIN_HEIGHT, MAX_HEIGHT))
+ }, [])
+
+ useEffect(() => {
+ let cancelled = false
+ setPeaks([])
+ if (!job?.id || !hasAudio) return
+ decodeWaveform(sourceAudioUrl(job.id))
+ .then((next) => {
+ if (!cancelled) setPeaks(next)
+ })
+ .catch(() => {
+ if (!cancelled) setPeaks(fallbackPeaks(1800, `${job.id}-${transcript.length}`))
+ })
+ return () => {
+ cancelled = true
+ }
+ }, [job?.id, hasAudio, transcript.length])
+
+ if (!hasAudio || !job) return null
+
+  // Starts a resize drag: follows the pointer on `window` until it is released
+  // or cancelled, then stores the resulting height under STORAGE_KEY.
+  const startDrag = (e: ReactPointerEvent) => {
+    e.preventDefault()
+    dragRef.current = { startY: e.clientY, startHeight: height }
+    // `height` seen by these closures is frozen at pointer-down, so the live
+    // value is tracked locally; persisting `height` would save the old size.
+    let latestHeight = height
+    const onMove = (ev: PointerEvent) => {
+      const drag = dragRef.current
+      if (!drag) return
+      // Moving the pointer up (smaller clientY) makes the strip taller.
+      latestHeight = clamp(drag.startHeight + (drag.startY - ev.clientY), MIN_HEIGHT, MAX_HEIGHT)
+      setHeight(latestHeight)
+    }
+    const onEnd = () => {
+      if (dragRef.current) {
+        // Storage can be unavailable or full; losing the preference is non-fatal.
+        try { window.localStorage.setItem(STORAGE_KEY, String(latestHeight)) } catch {}
+      }
+      dragRef.current = null
+      window.removeEventListener("pointermove", onMove)
+      window.removeEventListener("pointerup", onEnd)
+      window.removeEventListener("pointercancel", onEnd)
+    }
+    window.addEventListener("pointermove", onMove)
+    window.addEventListener("pointerup", onEnd)
+    // A cancelled pointer never fires pointerup; without this the listeners leak.
+    window.addEventListener("pointercancel", onEnd)
+  }
+
+ return (
+
+ )
+}
diff --git a/web/lib/api.ts b/web/lib/api.ts
index 5c5e0ed..a2541da 100644
--- a/web/lib/api.ts
+++ b/web/lib/api.ts
@@ -537,6 +537,10 @@ export function videoUrl(jobId: string): string {
return `${API_BASE}/jobs/${jobId}/video.mp4`
}
+/** Builds the URL of a job's `audio.wav` under `API_BASE`. */
+export function sourceAudioUrl(jobId: string): string {
+  const jobRoot = `${API_BASE}/jobs/${jobId}`
+  return `${jobRoot}/audio.wav`
+}
+
export function cleanedFrameUrl(jobId: string, frameIndex: number, bust?: string | number): string {
const u = `${API_BASE}/jobs/${jobId}/frames/${frameIndex}/cleaned.jpg`
return bust ? `${u}?t=${bust}` : u