diff --git a/.memory/worklog.json b/.memory/worklog.json index 2ae3f44..0c5319d 100644 --- a/.memory/worklog.json +++ b/.memory/worklog.json @@ -1,19 +1,5 @@ { "entries": [ - { - "files_changed": 1, - "hash": "c8fd985", - "message": "auto-save 2026-05-13 03:54 (~1)", - "ts": "2026-05-13T03:54:21+08:00", - "type": "commit" - }, - { - "files_changed": 1, - "hash": "ffc7437", - "message": "auto-save 2026-05-13 04:00 (~1)", - "ts": "2026-05-13T04:00:13+08:00", - "type": "commit" - }, { "files_changed": 1, "hash": "7a5b09a", @@ -3299,6 +3285,19 @@ "type": "session-heartbeat", "message": "Codex 会话活跃 · 最近命令:codex · 1 项未提交变更 · 最近提交:auto-save 2026-05-14 11:36 (~3)", "files_changed": 1 + }, + { + "ts": "2026-05-14T11:42:06+08:00", + "type": "commit", + "message": "auto-save 2026-05-14 11:41 (~1)", + "hash": "b474d80", + "files_changed": 1 + }, + { + "ts": "2026-05-14T03:46:10Z", + "type": "session-heartbeat", + "message": "Codex 会话活跃 · 最近命令:codex · 6 项未提交变更 · 最近提交:auto-save 2026-05-14 11:41 (~1)", + "files_changed": 6 } ] } diff --git a/api/.env.example b/api/.env.example index 5b6a159..1ed17f6 100644 --- a/api/.env.example +++ b/api/.env.example @@ -19,7 +19,7 @@ AUDIO_PRODUCT_BRIEF="SKG 智能按摩产品,主打日常肩颈、腰背、眼 MINIMAX_API_KEY= MINIMAX_TTS_BASE_URL=https://api.minimax.io MINIMAX_TTS_MODEL=speech-2.8-turbo -MINIMAX_TTS_VOICE_ID="Chinese (Mandarin)_Reliable_Executive" +MINIMAX_TTS_VOICE_ID=English_expressive_narrator # Poe 视频 API(优先用于 Seedance / Kling / Veo) POE_API_BASE_URL=https://api.poe.com/v1 diff --git a/api/main.py b/api/main.py index 3abc6ac..84805b6 100644 --- a/api/main.py +++ b/api/main.py @@ -49,8 +49,8 @@ MINIMAX_TTS_BASE_URL = os.getenv("MINIMAX_TTS_BASE_URL", "https://api.minimax.io MINIMAX_TTS_MODEL = os.getenv("MINIMAX_TTS_MODEL", "speech-2.8-turbo").strip() or "speech-2.8-turbo" MINIMAX_TTS_VOICE_ID = os.getenv( "MINIMAX_TTS_VOICE_ID", - "Chinese (Mandarin)_Reliable_Executive", -).strip() or "Chinese (Mandarin)_Reliable_Executive" + "English_expressive_narrator", +).strip() or "English_expressive_narrator" POE_API_BASE_URL = os.getenv("POE_API_BASE_URL", "https://api.poe.com/v1").strip() or "https://api.poe.com/v1" POE_API_KEY = os.getenv("POE_API_KEY", "").strip() @@ -379,6 +379,7 @@ class Job(BaseModel): duration: float = 0.0 width: int = 0 height: int = 0 + source_audio_url: str = "" frames: list[KeyFrame] = Field(default_factory=list) transcript: list[TranscriptSegment] = Field(default_factory=list) audio_script: AudioScript = Field(default_factory=AudioScript) @@ -400,6 +401,14 @@ def job_dir(job_id: str) -> Path: return d +def source_audio_url_for(job_id: str) -> str: + return f"/jobs/{job_id}/audio.wav" if (JOBS_DIR / job_id / "audio.wav").exists() else "" + + +def job_with_artifacts(job: Job) -> Job: + return job.model_copy(update={"source_audio_url": source_audio_url_for(job.id)}) + + def save_state(job: Job) -> None: (job_dir(job.id) / "state.json").write_text(job.model_dump_json(indent=2)) @@ -1224,7 +1233,7 @@ def pipeline_analyze( wav = d / "audio.wav" if wav.exists(): - update(job, status="splitting", message="复用音轨 · 准备抽帧…", progress=35) + update(job, status="splitting", message="复用音轨 · 准备抽帧…", progress=35, source_audio_url=f"/jobs/{job_id}/audio.wav") else: update(job, status="splitting", message="ffmpeg 拆分音轨…", progress=35) run([ @@ -1232,6 +1241,7 @@ def pipeline_analyze( "-vn", "-ac", "1", "-ar", "16000", "-c:a", "pcm_s16le", str(wav), ]) + update(job, source_audio_url=f"/jobs/{job_id}/audio.wav") n = max(1, min(int(frame_count), 20)) target_label = FRAME_TARGET_LABELS.get(target, FRAME_TARGET_LABELS["balanced"]) duration = max(float(job.duration or 1.0), 0.1) @@ -1497,12 +1507,12 @@ def _transcript_join(segments: list[TranscriptSegment], field: Literal["en", "zh def _fallback_audio_script(segments: list[TranscriptSegment]) -> str: - joined = " ".join((s.zh or s.en).strip() for s in segments if (s.zh or s.en).strip()) + joined = " ".join((s.en or s.zh).strip() for s in segments if (s.en or s.zh).strip()) if not joined: - return "日常疲惫不用硬扛。戴上 SKG,让肩颈慢慢放松,跟着呼吸找回轻松状态。" + return "Ease into the moment with SKG. Gentle warmth and rhythmic massage help everyday tension feel lighter, cleaner, and easier to leave behind." return ( - "把日常紧绷交给 SKG。贴合身体需要放松的位置,热敷与按摩节奏自然陪伴," - "让每一次短暂休息都更轻松、更有质感。" + "Let SKG turn a short break into real relief. With soothing warmth and steady massage rhythm, " + "everyday tension feels lighter, calmer, and easier to leave behind." ) @@ -1513,24 +1523,24 @@ def _rewrite_audio_script_sync(segments: list[TranscriptSegment]) -> tuple[str, source_text = _transcript_join(segments, "en") source_zh = _transcript_join(segments, "zh") prompt = ( - "你是 SKG 短视频口播编导。根据参考视频音频转写,抽取它的表达结构、情绪节奏和可复用卖点," - "改写成适合 SKG 按摩/放松产品二创视频的中文口播文案。\n" - "要求:\n" - "1. 输出 35-90 个中文字,适合 8-18 秒短视频配音。\n" - "2. 口语化、干净、高级,能直接给 TTS 朗读。\n" - "3. 不承诺治疗、治愈、医学疗效,不夸大。\n" - "4. 不复刻原视频品牌/人物/价格/平台话术,只保留表达结构。\n" - "5. 如果参考转写信息不足,按产品信息生成通用 SKG 放松口播。\n" - '严格返回 JSON:{"rewritten_text":"..."}。\n\n' - f"SKG 产品信息:{AUDIO_PRODUCT_BRIEF}\n\n" - f"英文转写:\n{source_text or '无'}\n\n" - f"中文翻译:\n{source_zh or '无'}" + "You are an English short-video voice-over writer for SKG wellness massagers. " + "Use the source transcript only for structure, pacing, and emotional hook, then rewrite it into a clean English VO for SKG.\n" + "Rules:\n" + "1. Output 28-55 English words, suitable for an 8-18 second TTS voice-over.\n" + "2. Make it natural, premium, concise, and ready to read aloud.\n" + "3. Do not claim medical treatment, cure, pain elimination, or clinical effects.\n" + "4. Do not copy the original brand, creator, price, platform language, or exact claims.\n" + "5. If the source transcript is too thin, write a general SKG relaxation VO.\n" + 'Return strict JSON only: {"rewritten_text":"..."}.\n\n' + f"SKG product context: {AUDIO_PRODUCT_BRIEF}\n\n" + f"English transcript:\n{source_text or 'None'}\n\n" + f"Chinese translation for reference:\n{source_zh or 'None'}" ) try: resp = llm().chat.completions.create( model=AUDIO_REWRITE_MODEL, messages=[ - {"role": "system", "content": "只输出合法 JSON,不要解释,不要 markdown。"}, + {"role": "system", "content": "Return valid JSON only. No explanation. No markdown."}, {"role": "user", "content": prompt}, ], response_format={"type": "json_object"}, @@ -1564,7 +1574,7 @@ def _minimax_tts_sync(job_id: str, text: str) -> str: "model": MINIMAX_TTS_MODEL, "text": text.strip()[:9500], "stream": False, - "language_boost": "Chinese", + "language_boost": "English", "output_format": "hex", "voice_setting": { "voice_id": MINIMAX_TTS_VOICE_ID, @@ -1651,6 +1661,7 @@ def pipeline_transcribe(job_id: str, manage_job_status: bool = True) -> None: ]) if not wav.exists(): raise RuntimeError("音频提取完成但找不到 audio.wav") + update(job, source_audio_url=f"/jobs/{job_id}/audio.wav") if not LLM_API_KEY: # 无 key 模式:mock 数据 @@ -2112,7 +2123,7 @@ def get_job(job_id: str) -> Job: job = JOBS.get(job_id) if not job: raise HTTPException(404, "job not found") - return job + return job_with_artifacts(job) @app.delete("/jobs/{job_id}") @@ -2153,7 +2164,7 @@ async def trigger_transcribe(job_id: str, bg: BackgroundTasks) -> Job: update(job, error="", audio_script=audio_payload) if not start_audio_processing(job_id, manage_job_status=manage_job_status): update(job, message="音频已在处理中") - return job + return job_with_artifacts(job) @app.get("/jobs/{job_id}/video.mp4") diff --git a/web/app/page.tsx b/web/app/page.tsx index 5940cce..1bec5be 100644 --- a/web/app/page.tsx +++ b/web/app/page.tsx @@ -100,6 +100,8 @@ export default function Home() { const [jobs, setJobs] = useState([]) const [activeJobId, setActiveJobId] = useState(null) const job = useMemo(() => jobs.find((j) => j.id === activeJobId) ?? null, [jobs, activeJobId]) + const [audioStripJobId, setAudioStripJobId] = useState(null) + const audioStripJob = useMemo(() => jobs.find((j) => j.id === audioStripJobId) ?? null, [jobs, audioStripJobId]) const [submitting, setSubmitting] = useState(false) const [analyzing, setAnalyzing] = useState(false) const [frameTargets, setFrameTargets] = useState>({}) @@ -159,6 +161,10 @@ export default function Home() { const handleSwitchJob = useCallback((id: string) => { setActiveJobId(id) }, []) + const handleOpenAudioStrip = useCallback((jobId?: string) => { + const targetId = jobId ?? activeJobId + if (targetId) setAudioStripJobId(targetId) + }, [activeJobId]) const pollRef = useRef | null>(null) const handleSubmit = useCallback(async (url: string) => { @@ -393,16 +399,13 @@ export default function Home() { const handleTranscribeAudio = useCallback(async (jobId?: string, options?: { silent?: boolean }) => { const targetId = jobId ?? activeJobId if (!targetId) return + setAudioStripJobId(targetId) const target = jobs.find((item) => item.id === targetId) if (!target) return if (!target.video_url) { if (!options?.silent) toast.info("视频导入完成后,可在音频卡片点击提取音频") return } - if (target.status === "splitting") { - if (!options?.silent) toast.info("当前正在抽帧,结束后可重新点击提取音频") - return - } if (target.status === "transcribing" || target.audio_script?.status === "rewriting") { if (!options?.silent) toast.info("音频正在处理中") return @@ -728,9 +731,10 @@ export default function Home() { onCopyImage: handleCopyImage, onGenerateProductFusionVideo: handleGenerateProductFusionVideo, onTranscribeAudio: handleTranscribeAudio, + onOpenAudioStrip: handleOpenAudioStrip, pinnedNodes, onToggleNodePin: handleToggleNodePin, - }), [job, jobs, activeJobId, submitting, analyzing, frameTargets, frameCounts, frameQualities, selectedFrames, expandedFrame, framePanelScale, framePanelPinned, framePanelDock, videoPanelJobId, videoPanelScale, videoPanelDock, handleSubmit, handleUpload, handleAnalyze, handleAnalyzeJob, handleFrameTargetChange, handleFrameCountChange, handleFrameQualityChange, handleToggleFrame, handleOpenFramePanel, handleFramePanelScaleChange, handleCloseExpandedFrame, handleAddManualFrame, handleAddManualFrameForJob, handleOpenVideoPanel, handleVideoPanelScaleChange, handleSwitchJob, updateJobInList, handleDeleteJob, handleDeleteFrame, handleDeleteFrameForJob, handleDeleteGenerated, handleDeleteVideo, handleDeleteCutout, handleOpenStoryboard, handleOpenWorkbench, clipboard, handleCopyImage, handleGenerateProductFusionVideo, handleTranscribeAudio, pinnedNodes, handleToggleNodePin]) + }), [job, jobs, activeJobId, submitting, analyzing, frameTargets, frameCounts, frameQualities, selectedFrames, expandedFrame, framePanelScale, framePanelPinned, framePanelDock, videoPanelJobId, videoPanelScale, videoPanelDock, handleSubmit, handleUpload, handleAnalyze, handleAnalyzeJob, handleFrameTargetChange, handleFrameCountChange, handleFrameQualityChange, handleToggleFrame, handleOpenFramePanel, handleFramePanelScaleChange, handleCloseExpandedFrame, handleAddManualFrame, handleAddManualFrameForJob, handleOpenVideoPanel, handleVideoPanelScaleChange, handleSwitchJob, updateJobInList, handleDeleteJob, handleDeleteFrame, handleDeleteFrameForJob, handleDeleteGenerated, handleDeleteVideo, handleDeleteCutout, handleOpenStoryboard, handleOpenWorkbench, clipboard, handleCopyImage, handleGenerateProductFusionVideo, handleTranscribeAudio, handleOpenAudioStrip, pinnedNodes, handleToggleNodePin]) // 用 useNodesState 让 ReactFlow 自己管位置(避免轮询时重置 drag) const savedSizes = useMemo(() => loadNodeSizes(), []) @@ -1013,7 +1017,7 @@ export default function Home() {
)}
- {clientReady && } + {clientReady && setAudioStripJobId(null)} />} diff --git a/web/components/audio-strip.tsx b/web/components/audio-strip.tsx index 0b2d16f..ab479fd 100644 --- a/web/components/audio-strip.tsx +++ b/web/components/audio-strip.tsx @@ -1,7 +1,7 @@ "use client" import { useEffect, useMemo, useRef, useState, type PointerEvent as ReactPointerEvent } from "react" -import { ChevronDown, ChevronUp, GripHorizontal, Mic2, Volume2 } from "lucide-react" +import { ChevronDown, ChevronUp, GripHorizontal, Mic2, Volume2, X } from "lucide-react" import { apiAssetUrl, sourceAudioUrl, type Job, type TranscriptSegment } from "@/lib/api" const STORAGE_KEY = "skg.audio-strip.height" @@ -58,20 +58,34 @@ function SegmentCard({ segment, peaks, duration, + currentTime, }: { segment: TranscriptSegment peaks: number[] duration: number + currentTime: number }) { const segDuration = Math.max(1.2, segment.end - segment.start) const width = clamp(180 + segDuration * 42, 220, 520) const segPeaks = slicePeaks(peaks, segment.start, segment.end, duration) + const active = currentTime >= segment.start && currentTime <= Math.max(segment.end, segment.start + 0.2) + const pointerPct = active ? clamp(((currentTime - segment.start) / Math.max(0.2, segment.end - segment.start)) * 100, 0, 100) : 0 return (
+ {active && ( +
+ )}
{segment.start.toFixed(1)}s to {segment.end.toFixed(1)}s @@ -93,7 +107,7 @@ function SegmentCard({ {segment.zh || 翻译中...}

- +
) @@ -126,19 +140,33 @@ async function decodeWaveform(url: string, targetPeaks = 1800) { } } -export function AudioStrip({ job }: { job: Job | null }) { +export function AudioStrip({ job, open, onClose }: { job: Job | null; open: boolean; onClose?: () => void }) { const [collapsed, setCollapsed] = useState(false) const [height, setHeight] = useState(DEFAULT_HEIGHT) const [peaks, setPeaks] = useState([]) + const [sourceReady, setSourceReady] = useState(false) + const [audioKey, setAudioKey] = useState(0) + const [currentTime, setCurrentTime] = useState(0) const dragRef = useRef<{ startY: number; startHeight: number } | null>(null) + const audioRef = useRef(null) const transcript = job?.transcript ?? [] const audioScript = job?.audio_script const voiceUrl = apiAssetUrl(audioScript?.voice_url) - const hasAudio = !!job && (transcript.length > 0 || !!audioScript?.rewritten_text || job.status === "transcribing") + const sourceUrl = job ? apiAssetUrl(job.source_audio_url || sourceAudioUrl(job.id)) : "" + const processing = !!job && (job.status === "transcribing" || audioScript?.status === "rewriting") + const activeSegment = transcript.find((segment) => currentTime >= segment.start && currentTime <= Math.max(segment.end, segment.start + 0.2)) const duration = useMemo(() => { const lastTranscriptEnd = transcript.reduce((max, s) => Math.max(max, s.end || 0), 0) - return Math.max(job?.duration ?? 0, lastTranscriptEnd, 1) + const audioDuration = audioRef.current?.duration + return Math.max( + Number.isFinite(audioDuration) ? Number(audioDuration) : 0, + job?.duration ?? 0, + lastTranscriptEnd, + 1, + ) }, [job?.duration, transcript]) + const timelinePeaks = useMemo(() => slicePeaks(peaks, 0, duration, duration, 160), [duration, peaks]) + const pointerPct = clamp((currentTime / duration) * 100, 0, 100) useEffect(() => { if (typeof window === "undefined") return @@ -148,21 +176,38 @@ export function AudioStrip({ job }: { job: Job | null }) { useEffect(() => { let cancelled = false + let timer: ReturnType | null = null + let attempts = 0 setPeaks([]) - if (!job?.id || !hasAudio) return - decodeWaveform(sourceAudioUrl(job.id)) + setSourceReady(false) + setCurrentTime(0) + if (!job?.id || !open) return + setPeaks(fallbackPeaks(1800, `${job.id}-loading`)) + const load = () => { + attempts += 1 + decodeWaveform(sourceUrl) .then((next) => { - if (!cancelled) setPeaks(next) + if (cancelled) return + setPeaks(next) + setSourceReady(true) + setAudioKey((key) => key + 1) }) .catch(() => { - if (!cancelled) setPeaks(fallbackPeaks(1800, `${job.id}-${transcript.length}`)) + if (cancelled) return + setSourceReady(false) + if (attempts < (processing ? 45 : 6)) { + timer = setTimeout(load, 1000) + } }) + } + load() return () => { cancelled = true + if (timer) clearTimeout(timer) } - }, [job?.id, hasAudio, transcript.length]) + }, [job?.id, open, processing, sourceUrl, transcript.length]) - if (!hasAudio || !job) return null + if (!open || !job) return null const startDrag = (e: ReactPointerEvent) => { e.preventDefault() @@ -206,7 +251,7 @@ export function AudioStrip({ job }: { job: Job | null }) { {voiceUrl && (
- MiniMax ready + English VO ready
)} + {onClose && ( + + )} {!collapsed && (
-
- {transcript.length > 0 ? ( -
- {transcript.map((segment) => ( - - ))} +
+
+
+
+ Source audio playback + {activeSegment ? #{activeSegment.index + 1} : null} +
+
+ {currentTime.toFixed(1)}s / {duration.toFixed(1)}s +
+ {sourceReady ? ( +
-
改后 · SKG 口播
+
English VO · SKG rewrite

- {audioScript?.rewritten_text || "等待转录完成后生成适合 SKG 产品视频的口播文案。"} + {audioScript?.rewritten_text || "Waiting for the parsed transcript to become an English SKG voice-over."}

{voiceUrl && (