diff --git a/.memory/worklog.json b/.memory/worklog.json index 423c012..6f66e57 100644 --- a/.memory/worklog.json +++ b/.memory/worklog.json @@ -1,12 +1,5 @@ { "entries": [ - { - "files_changed": 1, - "hash": "ab6f035", - "message": "auto-save 2026-05-13 01:42 (~1)", - "ts": "2026-05-13T01:42:52+08:00", - "type": "commit" - }, { "files_changed": 1, "hash": "6128084", @@ -3311,6 +3304,13 @@ "type": "session-heartbeat", "message": "Codex 会话活跃 · 最近命令:codex · 5 项未提交变更 · 最近提交:auto-save 2026-05-14 10:31 (~4)", "files_changed": 5 + }, + { + "ts": "2026-05-14T10:40:12+08:00", + "type": "commit", + "message": "auto-save 2026-05-14 10:36 (~5)", + "hash": "1014114", + "files_changed": 5 } ] } diff --git a/api/main.py b/api/main.py index 7860287..4ec88be 100644 --- a/api/main.py +++ b/api/main.py @@ -561,7 +561,36 @@ async def lifespan(_: FastAPI): for p in JOBS_DIR.iterdir(): if p.is_dir() and (p / "state.json").exists(): try: - JOBS[p.name] = Job.model_validate_json((p / "state.json").read_text()) + job = Job.model_validate_json((p / "state.json").read_text()) + source_exists = (p / "source.mp4").exists() + if job.status in {"created", "downloading"}: + if source_exists: + update(job, status="downloaded", progress=25, message="服务重启 · 视频已恢复,可重新解析") + else: + update(job, status="failed", message="服务重启 · 下载任务已中断,请重新提交") + elif job.status == "splitting": + update( + job, + status="frames_extracted" if job.frames else "downloaded", + progress=70 if job.frames else 25, + message="服务重启 · 上次抽帧已中断,可重新抽帧", + ) + elif job.status == "transcribing": + audio_script = job.audio_script + if audio_script.status == "rewriting": + audio_script = audio_script.model_copy(update={ + "status": "failed", + "error": "服务重启 · 上次音频改写/配音已中断,可重新处理", + "created_at": audio_script.created_at or time.time(), + }) + update( + job, + status="frames_extracted", + progress=70, + audio_script=audio_script, + message="服务重启 · 上次音频处理已中断,可重新处理", + ) + JOBS[p.name] = job except Exception: pass yield @@ -1122,7 +1151,7 @@ def ffprobe_meta(mp4: Path) -> dict: return json.loads(out) -async def pipeline_download(job_id: str) -> None: +def pipeline_download(job_id: str) -> None: """阶段 1:仅下载(或上传跳过),落 source.mp4,停在 downloaded 等用户点解析。""" job = JOBS[job_id] d = job_dir(job_id) @@ -1159,7 +1188,7 @@ async def pipeline_download(job_id: str) -> None: update(job, status="failed", error=str(e), message="下载失败") -async def pipeline_analyze( +def pipeline_analyze( job_id: str, frame_count: int = KEYFRAME_COUNT, target: FrameExtractTarget = "transparent_human", @@ -1311,7 +1340,7 @@ async def pipeline_analyze( update(job, status="failed", error=str(e), message="解析失败") -async def analyze_queue_worker() -> None: +def analyze_queue_worker() -> None: global ANALYZE_WORKER_RUNNING ANALYZE_WORKER_RUNNING = True try: @@ -1319,7 +1348,7 @@ async def analyze_queue_worker() -> None: job_id, frames, target, mode, quality = ANALYZE_QUEUE.pop(0) if job_id not in JOBS: continue - await pipeline_analyze(job_id, frames, target, mode, quality) + pipeline_analyze(job_id, frames, target, mode, quality) if ANALYZE_QUEUE: for pos, (queued_job_id, *_rest) in enumerate(ANALYZE_QUEUE, start=1): queued_job = JOBS.get(queued_job_id) @@ -1984,6 +2013,14 @@ def get_video(job_id: str): return FileResponse(p, media_type="video/mp4") +@app.get("/jobs/{job_id}/audio.wav") +def get_source_audio(job_id: str): + p = job_dir(job_id) / "audio.wav" + if not p.exists(): + raise HTTPException(404, "audio not found") + return FileResponse(p, media_type="audio/wav") + + @app.get("/jobs/{job_id}/audio-script.mp3") def get_audio_script(job_id: str): p = job_dir(job_id) / "audio_script.mp3" diff --git a/docs/source-analysis.html b/docs/source-analysis.html index babcc52..cd876f6 100644 --- a/docs/source-analysis.html +++ b/docs/source-analysis.html @@ -572,6 +572,7 @@ web/app/page.tsx产品工作台主状态:jobs、activeJobId、按 job 隔离的 selectedFrames/详情面板状态、clipboard、ReactFlow 节点和边;负责打开/找回画布工作面板。 web/components/nodes/index.tsxDAG 节点定义:Input、VisualLab、Audio、Compose,以及画布工作面板 KeyframePanel / VideoFramePanel;旧 Keyframe/Storyboard/VideoGen 组件保留但不再挂主画布。 + web/components/audio-strip.tsx底部吸附音频条:可拖拽调整高度;按时间段展示英文、中文翻译和音频波形,并在右侧固定显示 SKG 改写稿和 MiniMax 配音。 web/components/lightbox.tsx关键帧素材准备面板:清洗、统一主体候选、参考帧网格、六张主体重绘图、每帧去主体场景图、纵向 6 行产品融合镜头工作表和审核。 web/components/product-library-picker.tsxSKG 内置白底产品图库选择器:搜索、品类筛选、预览尺寸,并把库内图片复制为当前 job 的 assetweb/components/storyboard-bar.tsx顶部分镜编排条:展示选入编排的关键帧,并作为唯一分镜导航。 @@ -587,6 +588,7 @@ api/main.pyFastAPI 单文件后端:状态模型、任务恢复、下载、抽帧、Vision、清洗、元素、分镜、音频文案改写、MiniMax 配音、文件返回。 api/product_library/skg-products内置 SKG 白底产品图库:manifest.json 记录从桌面产品图筛出的 gallery 白底图,images/ 存 41 张压缩后的参考图。 jobs/<jobId>/state.json运行时状态文件,不在源码列表里,但刷新恢复依赖它。 + jobs/<jobId>/audio.wav拆轨得到的原始音频,底部 Audio Strip 会通过只读接口拉取并在浏览器里解码成波形峰值。 jobs/<jobId>/frames关键帧 jpg。注意 frame.index 是稳定 ID,不等于数组下标。 jobs/<jobId>/cleaned清洗后待应用图片。 jobs/<jobId>/elements元素提取图,多版本命名:idx_elementId_cutoutId.jpg。 @@ -599,6 +601,7 @@ web/app/page.tsx -> ReactFlow 节点:web/components/nodes/index.tsx -> 主画布:Input → VisualLab / Audio → Compose + -> 底部音频条:web/components/audio-strip.tsx(英文 / 中文 / 波形 / 改写稿) -> 画布内视频抽帧面板:InputNode 单击视频缩略图打开 videoFramePanel -> 画布内镜头拆解面板:VisualLabNode 打开 keyframePanel,内嵌 web/components/lightbox.tsx -> 分镜工作台:web/components/storyboard-workbench.tsx(底层保留) @@ -790,6 +793,7 @@ SubjectAsset { 删除输入视频DELETE /jobs/{id}deleteJob从任务队列、URL 和磁盘 jobs/<id> 目录移除整个 job,包括源视频、关键帧、元素提取图和生成视频。 解析视频POST /jobs/{id}/analyze?frames=&target=&mode=&quality=analyzeJob拆轨 + 目标化抽关键帧。默认 frames=12target 支持透明骨架人、综合、清晰主体、转场变化、表情瞬间、动作峰值;当前 UI 默认 transparent_human。透明骨架人目标会先扩大本地候选池,再调用 Vision 按 6 个分数验收;不合格候选自动丢弃并抽下一候选。mode=append 追加新关键帧;quality=auto 根据本机算力和视频时长自动选择快速、精细或极准。多个抽帧请求进入后端队列顺序处理。 音频文案轨POST /jobs/{id}/transcribetriggerTranscribe读取拆轨得到的 audio.wav,先 ASR 得到英文时间戳段落,再翻译中文,随后按 AUDIO_PRODUCT_BRIEF 生成 audio_script.rewritten_text;配置 MINIMAX_API_KEY 后调用 MiniMax T2A 生成 audio_script.voice_url。 + 原始音频文件GET /jobs/{id}/audio.wavsourceAudioUrl返回拆轨得到的 wav;底部 AudioStrip 拉取该文件,用 Web Audio API 解码并计算波形峰值,只读展示,不参与改写。 改写配音文件GET /jobs/{id}/audio-script.mp3apiAssetUrl(job.audio_script.voice_url)返回 MiniMax T2A 生成的 mp3。没有配置 MiniMax 或生成失败时该文件不存在,但改写文案仍会保存在 audio_script.rewritten_text。 手动加帧POST /jobs/{id}/frames?t=addManualFrame按视频时间戳抽一帧,index 递增但 frames 按 timestamp 排序。 Vision 识别POST /frames/{idx}/describedescribeFrame写入 frame.description,后续可从 objects 加候选元素。 @@ -837,9 +841,9 @@ SubjectAsset { Audio / ASR / Rewrite - 独立声音文案轨:从 audio.wav 提取原始口播、翻译中文、改写成 SKG 产品语境口播;MiniMax T2A 配置后生成配音 mp3。主画布的 AudioNode 用“改前 · 原音频 / 改后 · SKG 口播”摘要展示,侧栏 Rewrite 展开后显示完整逐段 ASR/翻译、改写稿、产品依据和配音播放器。 + 独立声音文案轨:从 audio.wav 提取原始口播、翻译中文、改写成 SKG 产品语境口播;MiniMax T2A 配置后生成配音 mp3。主画布的 AudioNode 用“改前 · 原音频 / 改后 · SKG 口播”摘要展示;底部 AudioStrip 吸附屏幕底端,可拖拽调整高度,按时间段展示英文、中文翻译和波形;侧栏 Rewrite 展开后显示完整审核视图。 不要阻断视觉素材管线。 - AudioNodeASRNodeTranslateNodeRewriteNodepipeline_transcribeAudioScript + AudioNodeAudioStripASRNodeTranslateNodeRewriteNodepipeline_transcribeAudioScript Video / Compose diff --git a/web/app/page.tsx b/web/app/page.tsx index e66b627..0caff3f 100644 --- a/web/app/page.tsx +++ b/web/app/page.tsx @@ -16,6 +16,7 @@ import { type NodeData, } from "@/components/nodes" import { ThemeToggle } from "@/components/theme-toggle" +import { AudioStrip } from "@/components/audio-strip" import { addManualFrame, analyzeJob, createJob, getJob, listJobs, uploadJob, deleteJob, deleteFrame, deleteGeneratedImage, deleteGeneratedVideo, deleteCutout, generateStoryboardVideo, createProductFusionGuide, @@ -976,9 +977,10 @@ export default function Home() { + - + diff --git a/web/components/audio-strip.tsx b/web/components/audio-strip.tsx new file mode 100644 index 0000000..f0bbd12 --- /dev/null +++ b/web/components/audio-strip.tsx @@ -0,0 +1,255 @@ +"use client" + +import { useEffect, useMemo, useRef, useState, type PointerEvent as ReactPointerEvent } from "react" +import { ChevronDown, ChevronUp, GripHorizontal, Mic2, Volume2 } from "lucide-react" +import { apiAssetUrl, sourceAudioUrl, type Job, type TranscriptSegment } from "@/lib/api" + +const STORAGE_KEY = "skg.audio-strip.height" +const MIN_HEIGHT = 132 +const MAX_HEIGHT = 420 +const DEFAULT_HEIGHT = 236 + +function clamp(value: number, min: number, max: number) { + return Math.min(max, Math.max(min, value)) +} + +function fallbackPeaks(count: number, seedText: string) { + let seed = 0 + for (let i = 0; i < seedText.length; i++) seed = (seed * 31 + seedText.charCodeAt(i)) % 9973 + return Array.from({ length: count }, (_, i) => { + const wave = Math.sin((i + seed) * 0.43) * 0.35 + Math.sin((i + seed) * 0.11) * 0.25 + const pulse = ((i + seed) % 9) / 18 + return clamp(0.22 + Math.abs(wave) + pulse, 0.18, 1) + }) +} + +function slicePeaks(peaks: number[], start: number, end: number, duration: number, count = 56) { + if (peaks.length === 0 || duration <= 0 || end <= start) return fallbackPeaks(count, `${start}-${end}`) + const from = clamp(Math.floor((start / duration) * peaks.length), 0, peaks.length - 1) + const to = clamp(Math.ceil((end / duration) * peaks.length), from + 1, peaks.length) + const source = peaks.slice(from, to) + return Array.from({ length: count }, (_, i) => { + const a = Math.floor((i / count) * source.length) + const b = Math.max(a + 1, Math.floor(((i + 1) / count) * source.length)) + return Math.max(...source.slice(a, b), 0.12) + }) +} + +function Waveform({ peaks, active = false }: { peaks: number[]; active?: boolean }) { + return ( +
+ {peaks.map((p, i) => ( +
+ ))} +
+ ) +} + +function SegmentCard({ + segment, + peaks, + duration, +}: { + segment: TranscriptSegment + peaks: number[] + duration: number +}) { + const segDuration = Math.max(1.2, segment.end - segment.start) + const width = clamp(180 + segDuration * 42, 220, 520) + const segPeaks = slicePeaks(peaks, segment.start, segment.end, duration) + + return ( +
+
+ + {segment.start.toFixed(1)}s -> {segment.end.toFixed(1)}s + + + #{segment.index + 1} + +
+
+ {segment.en && ( +
+
English
+

{segment.en}

+
+ )} +
+
中文翻译
+

+ {segment.zh || 翻译中...} +

+
+ +
+
+ ) +} + +async function decodeWaveform(url: string, targetPeaks = 1800) { + const res = await fetch(url) + if (!res.ok) throw new Error(`audio ${res.status}`) + const arrayBuffer = await res.arrayBuffer() + const AudioContextClass = window.AudioContext || (window as typeof window & { webkitAudioContext?: typeof AudioContext }).webkitAudioContext + if (!AudioContextClass) throw new Error("AudioContext unavailable") + const ctx = new AudioContextClass() + try { + const buffer = await ctx.decodeAudioData(arrayBuffer.slice(0)) + const data = buffer.getChannelData(0) + const bucket = Math.max(1, Math.floor(data.length / targetPeaks)) + let maxPeak = 0.01 + const raw: number[] = [] + for (let i = 0; i < targetPeaks; i++) { + const start = i * bucket + const end = Math.min(data.length, start + bucket) + let peak = 0 + for (let j = start; j < end; j++) peak = Math.max(peak, Math.abs(data[j] || 0)) + raw.push(peak) + maxPeak = Math.max(maxPeak, peak) + } + return raw.map((p) => clamp(p / maxPeak, 0.08, 1)) + } finally { + void ctx.close().catch(() => {}) + } +} + +export function AudioStrip({ job }: { job: Job | null }) { + const [collapsed, setCollapsed] = useState(false) + const [height, setHeight] = useState(DEFAULT_HEIGHT) + const [peaks, setPeaks] = useState([]) + const dragRef = useRef<{ startY: number; startHeight: number } | null>(null) + const transcript = job?.transcript ?? [] + const audioScript = job?.audio_script + const voiceUrl = apiAssetUrl(audioScript?.voice_url) + const hasAudio = !!job && (transcript.length > 0 || !!audioScript?.rewritten_text || job.status === "transcribing") + const duration = useMemo(() => { + const lastTranscriptEnd = transcript.reduce((max, s) => Math.max(max, s.end || 0), 0) + return Math.max(job?.duration ?? 0, lastTranscriptEnd, 1) + }, [job?.duration, transcript]) + + useEffect(() => { + if (typeof window === "undefined") return + const stored = Number(window.localStorage.getItem(STORAGE_KEY) || "") + if (Number.isFinite(stored) && stored > 0) setHeight(clamp(stored, MIN_HEIGHT, MAX_HEIGHT)) + }, []) + + useEffect(() => { + let cancelled = false + setPeaks([]) + if (!job?.id || !hasAudio) return + decodeWaveform(sourceAudioUrl(job.id)) + .then((next) => { + if (!cancelled) setPeaks(next) + }) + .catch(() => { + if (!cancelled) setPeaks(fallbackPeaks(1800, `${job.id}-${transcript.length}`)) + }) + return () => { + cancelled = true + } + }, [job?.id, hasAudio, transcript.length]) + + if (!hasAudio || !job) return null + + const startDrag = (e: ReactPointerEvent) => { + e.preventDefault() + dragRef.current = { startY: e.clientY, startHeight: height } + const onMove = (ev: PointerEvent) => { + if (!dragRef.current) return + const next = clamp(dragRef.current.startHeight + (dragRef.current.startY - ev.clientY), MIN_HEIGHT, MAX_HEIGHT) + setHeight(next) + } + const onUp = () => { + if (dragRef.current) { + try { window.localStorage.setItem(STORAGE_KEY, String(height)) } catch {} + } + dragRef.current = null + window.removeEventListener("pointermove", onMove) + window.removeEventListener("pointerup", onUp) + } + window.addEventListener("pointermove", onMove) + window.addEventListener("pointerup", onUp) + } + + return ( + + ) +} diff --git a/web/lib/api.ts b/web/lib/api.ts index 5c5e0ed..a2541da 100644 --- a/web/lib/api.ts +++ b/web/lib/api.ts @@ -537,6 +537,10 @@ export function videoUrl(jobId: string): string { return `${API_BASE}/jobs/${jobId}/video.mp4` } +export function sourceAudioUrl(jobId: string): string { + return `${API_BASE}/jobs/${jobId}/audio.wav` +} + export function cleanedFrameUrl(jobId: string, frameIndex: number, bust?: string | number): string { const u = `${API_BASE}/jobs/${jobId}/frames/${frameIndex}/cleaned.jpg` return bust ? `${u}?t=${bust}` : u