auto-save 2026-05-14 11:21 (~7)
This commit is contained in:
@@ -1,19 +1,5 @@
|
|||||||
{
|
{
|
||||||
"entries": [
|
"entries": [
|
||||||
{
|
|
||||||
"files_changed": 1,
|
|
||||||
"hash": "989728d",
|
|
||||||
"message": "auto-save 2026-05-13 03:01 (~1)",
|
|
||||||
"ts": "2026-05-13T03:01:19+08:00",
|
|
||||||
"type": "commit"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"files_changed": 1,
|
|
||||||
"hash": "4ae9105",
|
|
||||||
"message": "auto-save 2026-05-13 03:07 (~1)",
|
|
||||||
"ts": "2026-05-13T03:07:14+08:00",
|
|
||||||
"type": "commit"
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"files_changed": 1,
|
"files_changed": 1,
|
||||||
"hash": "06186cb",
|
"hash": "06186cb",
|
||||||
@@ -3304,6 +3290,19 @@
|
|||||||
"type": "session-heartbeat",
|
"type": "session-heartbeat",
|
||||||
"message": "Codex 会话活跃 · 最近命令:codex · 5 项未提交变更 · 最近提交:auto-save 2026-05-14 11:10 (~1)",
|
"message": "Codex 会话活跃 · 最近命令:codex · 5 项未提交变更 · 最近提交:auto-save 2026-05-14 11:10 (~1)",
|
||||||
"files_changed": 5
|
"files_changed": 5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ts": "2026-05-14T11:16:12+08:00",
|
||||||
|
"type": "commit",
|
||||||
|
"message": "auto-save 2026-05-14 11:15 (~5)",
|
||||||
|
"hash": "4127adc",
|
||||||
|
"files_changed": 5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ts": "2026-05-14T03:18:38Z",
|
||||||
|
"type": "session-heartbeat",
|
||||||
|
"message": "Codex 会话活跃 · 最近命令:codex · 4 项未提交变更 · 最近提交:auto-save 2026-05-14 11:15 (~5)",
|
||||||
|
"files_changed": 4
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|||||||
3
RULES.md
3
RULES.md
@@ -33,7 +33,8 @@
|
|||||||
|
|
||||||
## 环境变量
|
## 环境变量
|
||||||
- `LLM_BASE_URL` / `LLM_API_KEY`:OpenAI 兼容网关,用于 ASR、翻译、文案改写、图像等模型调用
|
- `LLM_BASE_URL` / `LLM_API_KEY`:OpenAI 兼容网关,用于 ASR、翻译、文案改写、图像等模型调用
|
||||||
- `ASR_MODEL`:音频转写模型,默认 `whisper-1`
|
- `ASR_MODEL`:OpenAI Audio Transcriptions 音频转写模型,默认 `whisper-1`
|
||||||
|
- `ASR_FALLBACK_MODEL`:当 `/audio/transcriptions` 调用失败时(例如当前网关未提供该入口),改用 Gemini 多模态 chat 直接识别 wav,默认 `gemini-2.5-flash`
|
||||||
- `TRANSLATE_MODEL`:字幕翻译模型,默认 `gemini-2.5-flash`
|
- `TRANSLATE_MODEL`:字幕翻译模型,默认 `gemini-2.5-flash`
|
||||||
- `REWRITE_MODEL`:通用改写/分镜描述模型,默认 `gemini-2.5-pro`
|
- `REWRITE_MODEL`:通用改写/分镜描述模型,默认 `gemini-2.5-pro`
|
||||||
- `AUDIO_REWRITE_MODEL`:音频口播改写模型,默认跟随 `REWRITE_MODEL`
|
- `AUDIO_REWRITE_MODEL`:音频口播改写模型,默认跟随 `REWRITE_MODEL`
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ LLM_API_KEY=
|
|||||||
|
|
||||||
# 模型分工
|
# 模型分工
|
||||||
ASR_MODEL=whisper-1
|
ASR_MODEL=whisper-1
|
||||||
|
ASR_FALLBACK_MODEL=gemini-2.5-flash
|
||||||
TRANSLATE_MODEL=gemini-2.5-flash
|
TRANSLATE_MODEL=gemini-2.5-flash
|
||||||
REWRITE_MODEL=gemini-2.5-pro
|
REWRITE_MODEL=gemini-2.5-pro
|
||||||
IMAGE_MODEL=gemini-3-pro-image-preview
|
IMAGE_MODEL=gemini-3-pro-image-preview
|
||||||
|
|||||||
@@ -34,5 +34,5 @@ uvicorn main:app --host 127.0.0.1 --port 4291
|
|||||||
|
|
||||||
- `ffmpeg` 系统二进制(拆轨 / 抽帧)
|
- `ffmpeg` 系统二进制(拆轨 / 抽帧)
|
||||||
- `yt-dlp` 系统二进制(也可走 Python 包)
|
- `yt-dlp` 系统二进制(也可走 Python 包)
|
||||||
- OpenAI 兼容 LLM 网关(ASR / 翻译 / 文案改写)
|
- OpenAI 兼容 LLM 网关(ASR / 翻译 / 文案改写);如果 `/audio/transcriptions` 不可用,会用 `ASR_FALLBACK_MODEL` 走 Gemini 多模态音频识别
|
||||||
- MiniMax T2A HTTP(改写文案配音,使用 `MINIMAX_API_KEY`)
|
- MiniMax T2A HTTP(改写文案配音,使用 `MINIMAX_API_KEY`)
|
||||||
|
|||||||
104
api/main.py
104
api/main.py
@@ -33,6 +33,7 @@ PRODUCT_LIBRARY_MANIFEST = PRODUCT_LIBRARY_DIR / "manifest.json"
|
|||||||
LLM_BASE_URL = os.getenv("LLM_BASE_URL", "").strip()
|
LLM_BASE_URL = os.getenv("LLM_BASE_URL", "").strip()
|
||||||
LLM_API_KEY = os.getenv("LLM_API_KEY", "").strip()
|
LLM_API_KEY = os.getenv("LLM_API_KEY", "").strip()
|
||||||
ASR_MODEL = os.getenv("ASR_MODEL", "whisper-1")
|
ASR_MODEL = os.getenv("ASR_MODEL", "whisper-1")
|
||||||
|
ASR_FALLBACK_MODEL = os.getenv("ASR_FALLBACK_MODEL", "gemini-2.5-flash").strip() or "gemini-2.5-flash"
|
||||||
TRANSLATE_MODEL = os.getenv("TRANSLATE_MODEL", "gemini-2.5-flash")
|
TRANSLATE_MODEL = os.getenv("TRANSLATE_MODEL", "gemini-2.5-flash")
|
||||||
REWRITE_MODEL = os.getenv("REWRITE_MODEL", "gemini-2.5-pro")
|
REWRITE_MODEL = os.getenv("REWRITE_MODEL", "gemini-2.5-pro")
|
||||||
VISION_MODEL = os.getenv("VISION_MODEL", "gemini-2.5-flash")
|
VISION_MODEL = os.getenv("VISION_MODEL", "gemini-2.5-flash")
|
||||||
@@ -687,8 +688,8 @@ def _resolve_frame_quality(duration: float, quality: FrameExtractQuality) -> Fra
|
|||||||
cores = os.cpu_count() or 4
|
cores = os.cpu_count() or 4
|
||||||
memory_gb = _physical_memory_gb()
|
memory_gb = _physical_memory_gb()
|
||||||
strong_machine = cores >= 10 and (memory_gb == 0.0 or memory_gb >= 32)
|
strong_machine = cores >= 10 and (memory_gb == 0.0 or memory_gb >= 32)
|
||||||
if strong_machine and duration <= 180:
|
# 展示/演示时不能把本机资源打满:auto 最高只到 accurate。
|
||||||
return "ultra"
|
# ultra 保留为手动选择项,不再由 auto 自动命中。
|
||||||
if strong_machine and duration <= 600:
|
if strong_machine and duration <= 600:
|
||||||
return "accurate"
|
return "accurate"
|
||||||
if cores >= 8 and duration <= 240:
|
if cores >= 8 and duration <= 240:
|
||||||
@@ -1157,6 +1158,16 @@ def ffprobe_meta(mp4: Path) -> dict:
|
|||||||
return json.loads(out)
|
return json.loads(out)
|
||||||
|
|
||||||
|
|
||||||
|
def media_duration(path: Path) -> float:
    """Return the duration of *path* in seconds as reported by ffprobe.

    ffprobe is asked for the ``format`` section as JSON and the ``duration``
    field is converted to ``float``. This is a best-effort probe: any failure
    (probe error, unparsable output, missing or empty duration) yields ``0.0``
    instead of raising.
    """
    try:
        probe_cmd = [
            "ffprobe", "-v", "error", "-print_format", "json", "-show_format", str(path),
        ]
        probe = json.loads(run(probe_cmd))
        seconds = probe.get("format", {}).get("duration")
        # A missing, null, empty or zero duration all collapse to 0.0.
        return float(seconds or 0)
    except Exception:
        return 0.0
|
||||||
|
|
||||||
|
|
||||||
def pipeline_download(job_id: str) -> None:
|
def pipeline_download(job_id: str) -> None:
|
||||||
"""阶段 1:仅下载(或上传跳过),落 source.mp4,停在 downloaded 等用户点解析/提取音频。"""
|
"""阶段 1:仅下载(或上传跳过),落 source.mp4,停在 downloaded 等用户点解析/提取音频。"""
|
||||||
job = JOBS[job_id]
|
job = JOBS[job_id]
|
||||||
@@ -1362,21 +1373,83 @@ def analyze_queue_worker() -> None:
|
|||||||
|
|
||||||
# ---------- 音频转写 + 翻译 + SKG 改写 + MiniMax 配音 ----------
|
# ---------- 音频转写 + 翻译 + SKG 改写 + MiniMax 配音 ----------
|
||||||
|
|
||||||
|
def _parse_asr_segments(content: str, duration: float) -> list[dict]:
|
||||||
|
raw = (content or "").strip()
|
||||||
|
if raw.startswith("```"):
|
||||||
|
import re as _re
|
||||||
|
match = _re.search(r"(\[[\s\S]*\]|\{[\s\S]*\})", raw)
|
||||||
|
raw = match.group(0) if match else raw
|
||||||
|
try:
|
||||||
|
data = json.loads(raw)
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
text = raw.strip()
|
||||||
|
return [{"start": 0.0, "end": duration, "text": text}] if text else []
|
||||||
|
if isinstance(data, dict):
|
||||||
|
for key in ("segments", "data", "items", "result"):
|
||||||
|
if isinstance(data.get(key), list):
|
||||||
|
data = data[key]
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
text = str(data.get("text") or data.get("transcript") or "").strip()
|
||||||
|
return [{"start": 0.0, "end": duration, "text": text}] if text else []
|
||||||
|
if not isinstance(data, list):
|
||||||
|
return []
|
||||||
|
segments: list[dict] = []
|
||||||
|
for i, item in enumerate(data):
|
||||||
|
if isinstance(item, str):
|
||||||
|
text = item.strip()
|
||||||
|
start = 0.0 if len(data) == 1 else duration * i / max(1, len(data))
|
||||||
|
end = duration if len(data) == 1 else duration * (i + 1) / max(1, len(data))
|
||||||
|
elif isinstance(item, dict):
|
||||||
|
text = str(item.get("text") or item.get("en") or item.get("transcript") or "").strip()
|
||||||
|
start = float(item.get("start") or item.get("start_time") or 0)
|
||||||
|
end = float(item.get("end") or item.get("end_time") or duration)
|
||||||
|
else:
|
||||||
|
continue
|
||||||
|
if text:
|
||||||
|
segments.append({"start": max(0.0, start), "end": max(start, end), "text": text})
|
||||||
|
return segments
|
||||||
|
|
||||||
|
|
||||||
|
def _transcribe_gemini_sync(wav: Path) -> list[dict]:
    """Transcribe *wav* through a chat completion on ``ASR_FALLBACK_MODEL``.

    The whole file is base64-encoded and sent inline as an ``input_audio``
    part next to a text instruction asking for a JSON list of
    ``{start, end, text}`` segments. The reply text is handed to
    ``_parse_asr_segments`` together with the duration from
    ``media_duration``.
    """
    total_seconds = media_duration(wav)
    encoded_audio = base64.b64encode(wav.read_bytes()).decode("ascii")

    instruction = "".join([
        "Transcribe the attached audio. Return strict JSON only, no markdown. ",
        'Schema: [{"start": 0.0, "end": 1.2, "text": "English transcript"}]. ',
        "Use English for the transcript. If exact timestamps are uncertain, return one segment ",
        f"from 0 to {total_seconds:.2f} seconds.",
    ])
    text_part = {"type": "text", "text": instruction}
    audio_part = {"type": "input_audio", "input_audio": {"data": encoded_audio, "format": "wav"}}

    reply = llm().chat.completions.create(
        model=ASR_FALLBACK_MODEL,
        messages=[{"role": "user", "content": [text_part, audio_part]}],
        temperature=0,
    )
    # The message content may be None; normalise to a stripped string.
    reply_text = (reply.choices[0].message.content or "").strip()
    return _parse_asr_segments(reply_text, total_seconds)
|
||||||
|
|
||||||
|
|
||||||
def _transcribe_sync(wav: Path) -> list[dict]:
    """Transcribe *wav* into ``[{start, end, text}, ...]`` segments.

    The primary path calls the OpenAI-compatible audio transcription endpoint
    with ``ASR_MODEL`` and ``verbose_json`` output. If the gateway returns no
    ``segments`` but does return ``text``, the full text becomes one segment.

    If the primary path raises for any reason (for example the gateway has no
    audio endpoint), the error is logged with its traceback and the request
    is retried through ``_transcribe_gemini_sync``. Errors from the fallback
    propagate to the caller.
    """
    import logging

    try:
        with wav.open("rb") as f:
            resp = llm().audio.transcriptions.create(
                file=(wav.name, f, "audio/wav"),
                model=ASR_MODEL,
                response_format="verbose_json",
                timestamp_granularities=["segment"],
            )
        raw = resp.model_dump() if hasattr(resp, "model_dump") else resp
        segments = raw.get("segments") or []
        # Safety net: the gateway returned no segments, so treat the full text as one.
        if not segments and raw.get("text"):
            segments = [{"start": 0.0, "end": float(raw.get("duration", 0) or 0), "text": raw["text"]}]
        return segments
    except Exception:
        # Keep the best-effort fallback, but record why the primary path
        # failed so that gateway or configuration problems stay diagnosable.
        logging.getLogger(__name__).warning(
            "ASR via %s failed; falling back to %s",
            ASR_MODEL,
            ASR_FALLBACK_MODEL,
            exc_info=True,
        )
        return _transcribe_gemini_sync(wav)
|
||||||
|
|
||||||
|
|
||||||
def _translate_sync(segments: list[dict]) -> list[str]:
|
def _translate_sync(segments: list[dict]) -> list[str]:
|
||||||
@@ -1865,6 +1938,7 @@ def health() -> dict:
|
|||||||
"base_url": LLM_BASE_URL or "openai-default",
|
"base_url": LLM_BASE_URL or "openai-default",
|
||||||
"models": {
|
"models": {
|
||||||
"asr": ASR_MODEL,
|
"asr": ASR_MODEL,
|
||||||
|
"asr_fallback": ASR_FALLBACK_MODEL,
|
||||||
"translate": TRANSLATE_MODEL,
|
"translate": TRANSLATE_MODEL,
|
||||||
"rewrite": REWRITE_MODEL,
|
"rewrite": REWRITE_MODEL,
|
||||||
"audio_rewrite": AUDIO_REWRITE_MODEL,
|
"audio_rewrite": AUDIO_REWRITE_MODEL,
|
||||||
|
|||||||
@@ -791,7 +791,7 @@ SubjectAsset {
|
|||||||
<tr><td>创建任务</td><td><code>POST /jobs</code></td><td><code>createJob</code></td><td>提交 TK 链接,后台开始下载,停在 downloaded 等用户点解析。</td></tr>
|
<tr><td>创建任务</td><td><code>POST /jobs</code></td><td><code>createJob</code></td><td>提交 TK 链接,后台开始下载,停在 downloaded 等用户点解析。</td></tr>
|
||||||
<tr><td>上传视频</td><td><code>POST /jobs/upload</code></td><td><code>uploadJob</code></td><td>保存 source.mp4,然后同样进入下载完成状态。</td></tr>
|
<tr><td>上传视频</td><td><code>POST /jobs/upload</code></td><td><code>uploadJob</code></td><td>保存 source.mp4,然后同样进入下载完成状态。</td></tr>
|
||||||
<tr><td>删除输入视频</td><td><code>DELETE /jobs/{id}</code></td><td><code>deleteJob</code></td><td>从任务队列、URL 和磁盘 <code>jobs/<id></code> 目录移除整个 job,包括源视频、关键帧、元素提取图和生成视频。</td></tr>
|
<tr><td>删除输入视频</td><td><code>DELETE /jobs/{id}</code></td><td><code>deleteJob</code></td><td>从任务队列、URL 和磁盘 <code>jobs/<id></code> 目录移除整个 job,包括源视频、关键帧、元素提取图和生成视频。</td></tr>
|
||||||
<tr><td>解析视频</td><td><code>POST /jobs/{id}/analyze?frames=&target=&mode=&quality=</code></td><td><code>analyzeJob</code></td><td>拆轨 + 目标化抽关键帧。默认 <code>frames=12</code>;<code>target</code> 支持透明骨架人、综合、清晰主体、转场变化、表情瞬间、动作峰值;当前 UI 默认 <code>transparent_human</code>。透明骨架人目标现在只走本地清晰度、中心主体、对比度、画面变化和 pHash 去重,不在抽帧阶段逐帧调用 Vision;<code>mode=append</code> 追加新关键帧;<code>quality=auto</code> 根据本机算力和视频时长自动选择快速、精细或极准。抽帧开始时同步拆出 <code>audio.wav</code> 并启动音频处理线程。多个抽帧请求进入后端队列顺序处理。</td></tr>
|
<tr><td>解析视频</td><td><code>POST /jobs/{id}/analyze?frames=&target=&mode=&quality=</code></td><td><code>analyzeJob</code></td><td>拆轨 + 目标化抽关键帧。默认 <code>frames=12</code>;<code>target</code> 支持透明骨架人、综合、清晰主体、转场变化、表情瞬间、动作峰值;当前 UI 默认 <code>transparent_human</code>。透明骨架人目标现在只走本地清晰度、中心主体、对比度、画面变化和 pHash 去重,不在抽帧阶段逐帧调用 Vision;<code>mode=append</code> 追加新关键帧;<code>quality=auto</code> 为展示友好档,最高只自动选择精细,不会自动上极准;极准保留为手动选择。抽帧开始时同步拆出 <code>audio.wav</code> 并启动音频处理线程。多个抽帧请求进入后端队列顺序处理。</td></tr>
|
||||||
<tr><td>音频文案轨</td><td><code>POST /jobs/{id}/transcribe</code></td><td><code>triggerTranscribe</code></td><td>若尚未拆轨,先从 <code>source.mp4</code> 提取 <code>audio.wav</code>;随后 ASR 得到英文时间戳段落,再翻译中文,并按 <code>AUDIO_PRODUCT_BRIEF</code> 生成 <code>audio_script.rewritten_text</code>;配置 <code>MINIMAX_API_KEY</code> 后调用 MiniMax T2A 生成 <code>audio_script.voice_url</code>。前端不自动触发,用户在 Audio 节点点击“提取音频 / 重新提取音频”即可启动;抽帧中也允许并行触发,忙碌态由 <code>audio_script.status</code> 管理。</td></tr>
|
<tr><td>音频文案轨</td><td><code>POST /jobs/{id}/transcribe</code></td><td><code>triggerTranscribe</code></td><td>若尚未拆轨,先从 <code>source.mp4</code> 提取 <code>audio.wav</code>;随后 ASR 得到英文时间戳段落,再翻译中文,并按 <code>AUDIO_PRODUCT_BRIEF</code> 生成 <code>audio_script.rewritten_text</code>;配置 <code>MINIMAX_API_KEY</code> 后调用 MiniMax T2A 生成 <code>audio_script.voice_url</code>。前端不自动触发,用户在 Audio 节点点击“提取音频 / 重新提取音频”即可启动;抽帧中也允许并行触发,忙碌态由 <code>audio_script.status</code> 管理。</td></tr>
|
||||||
<tr><td>原始音频文件</td><td><code>GET /jobs/{id}/audio.wav</code></td><td><code>sourceAudioUrl</code></td><td>返回拆轨得到的 wav;底部 <code>AudioStrip</code> 拉取该文件,用 Web Audio API 解码并计算波形峰值,只读展示,不参与改写。</td></tr>
|
<tr><td>原始音频文件</td><td><code>GET /jobs/{id}/audio.wav</code></td><td><code>sourceAudioUrl</code></td><td>返回拆轨得到的 wav;底部 <code>AudioStrip</code> 拉取该文件,用 Web Audio API 解码并计算波形峰值,只读展示,不参与改写。</td></tr>
|
||||||
<tr><td>改写配音文件</td><td><code>GET /jobs/{id}/audio-script.mp3</code></td><td><code>apiAssetUrl(job.audio_script.voice_url)</code></td><td>返回 MiniMax T2A 生成的 mp3。没有配置 MiniMax 或生成失败时该文件不存在,但改写文案仍会保存在 <code>audio_script.rewritten_text</code>。</td></tr>
|
<tr><td>改写配音文件</td><td><code>GET /jobs/{id}/audio-script.mp3</code></td><td><code>apiAssetUrl(job.audio_script.voice_url)</code></td><td>返回 MiniMax T2A 生成的 mp3。没有配置 MiniMax 或生成失败时该文件不存在,但改写文案仍会保存在 <code>audio_script.rewritten_text</code>。</td></tr>
|
||||||
@@ -875,7 +875,7 @@ SubjectAsset {
|
|||||||
<div class="card">
|
<div class="card">
|
||||||
<h3>阻塞 / 占位</h3>
|
<h3>阻塞 / 占位</h3>
|
||||||
<ul>
|
<ul>
|
||||||
<li>ASR:仍依赖当前 OpenAI-compatible 音频转写入口;如果该网关 audio endpoint 不通,文案提取仍会失败。</li>
|
<li>ASR:优先走当前 OpenAI-compatible 音频转写入口;如果该入口调用失败(例如网关没有 <code>/audio/transcriptions</code>),自动 fallback 到 <code>ASR_FALLBACK_MODEL</code>(默认 <code>gemini-2.5-flash</code>)的多模态音频识别。</li>
|
||||||
<li>MiniMax:当前接入的是官方 T2A 配音能力,不是 ASR;API Key 只能放本地环境变量,不能写入仓库。</li>
|
<li>MiniMax:当前接入的是官方 T2A 配音能力,不是 ASR;API Key 只能放本地环境变量,不能写入仓库。</li>
|
||||||
<li>Audio Product Brief:默认是通用 SKG 放松产品卖点,后续可改成跟已选产品库条目联动。</li>
|
<li>Audio Product Brief:默认是通用 SKG 放松产品卖点,后续可改成跟已选产品库条目联动。</li>
|
||||||
<li>Video Gen:模型层按业务保留 Seedance / Kling / Veo/Voe 选择;后端已支持 Poe、火山方舟和 SKG 豆包视频网关。Seedance 可通过 <code>VIDEO_API_BASE_URL=https://ai.skg.com/doubao</code> 走 content JSON 异步任务,提交后写入 Video Gen 节点并轮询到完成。</li>
|
<li>Video Gen:模型层按业务保留 Seedance / Kling / Veo/Voe 选择;后端已支持 Poe、火山方舟和 SKG 豆包视频网关。Seedance 可通过 <code>VIDEO_API_BASE_URL=https://ai.skg.com/doubao</code> 走 content JSON 异步任务,提交后写入 Video Gen 节点并轮询到完成。</li>
|
||||||
@@ -918,6 +918,18 @@ SubjectAsset {
|
|||||||
<h2>变更记录</h2>
|
<h2>变更记录</h2>
|
||||||
<p>这个记录不是 git log 的替代品。它记录“产品理解发生了什么变化、影响了哪些源码、你以后描述需求时该怎么说”。后续每次改功能都要补一条。</p>
|
<p>这个记录不是 git log 的替代品。它记录“产品理解发生了什么变化、影响了哪些源码、你以后描述需求时该怎么说”。后续每次改功能都要补一条。</p>
|
||||||
<div class="changelog">
|
<div class="changelog">
|
||||||
|
<article class="change">
|
||||||
|
<header>
|
||||||
|
<h3>2026-05-14 · 本地抽帧改为展示友好算力档</h3>
|
||||||
|
<span class="tag orange">抽帧</span>
|
||||||
|
<span class="tag gray">Audio</span>
|
||||||
|
</header>
|
||||||
|
<div class="body">
|
||||||
|
<p><strong>问题:</strong>透明骨架人目标逐帧调用 Vision 验收会拖慢抽帧;切回本机算力后,如果自动档直接跑最高极准,也可能在展示时占满机器资源。</p>
|
||||||
|
<p><strong>改动:</strong><code>transparent_human</code> 目标保留,但抽帧阶段只走本地扫描、评分、去重和时间覆盖,不再逐帧调用 Vision。<code>quality=auto</code> 最高只自动选择精细;极准仍保留为手动选项。抽帧开始拆出 <code>audio.wav</code> 后会启动独立音频线程,视觉抽帧和音频处理并行,互不标失败。</p>
|
||||||
|
<p><strong>影响:</strong><code>api/main.py</code>、<code>web/components/nodes/index.tsx</code>、<code>docs/source-analysis.html</code>。</p>
|
||||||
|
</div>
|
||||||
|
</article>
|
||||||
<article class="change">
|
<article class="change">
|
||||||
<header>
|
<header>
|
||||||
<h3>2026-05-14 · 修复 ReactFlow Hydration 和后端 reload 卡住</h3>
|
<h3>2026-05-14 · 修复 ReactFlow Hydration 和后端 reload 卡住</h3>
|
||||||
@@ -938,7 +950,7 @@ SubjectAsset {
|
|||||||
</header>
|
</header>
|
||||||
<div class="body">
|
<div class="body">
|
||||||
<p><strong>问题:</strong>等待抽帧完成后自动启动音频,不符合“先把声音文案拿出来审核”的工作流;用户需要在音频卡片上直接触发。</p>
|
<p><strong>问题:</strong>等待抽帧完成后自动启动音频,不符合“先把声音文案拿出来审核”的工作流;用户需要在音频卡片上直接触发。</p>
|
||||||
<p><strong>改动:</strong>移除前端抽帧完成后的自动转写逻辑;<code>AudioNode</code> 保留并固定显示“提取音频 / 重新提取音频”按钮。后端 <code>/transcribe</code> 不再要求 <code>frames_extracted</code>,视频就绪后可直接从 <code>source.mp4</code> 拆出 <code>audio.wav</code>,并继续 ASR、翻译、SKG 改写和 MiniMax 配音;抽帧中触发时不抢主状态,而是用 <code>audio_script.status</code> 表示音频处理中。</p>
|
<p><strong>改动:</strong>移除前端抽帧完成后的自动转写逻辑;<code>AudioNode</code> 保留并固定显示“提取音频 / 重新提取音频”按钮。后端 <code>/transcribe</code> 不再要求 <code>frames_extracted</code>,视频就绪后可直接从 <code>source.mp4</code> 拆出 <code>audio.wav</code>,并继续 ASR、翻译、SKG 改写和 MiniMax 配音;抽帧中触发时不抢主状态,而是用 <code>audio_script.status</code> 表示音频处理中。当当前网关的 <code>whisper-1</code> audio endpoint 返回 404 时,会 fallback 到 Gemini 多模态音频识别。</p>
|
||||||
<p><strong>影响:</strong><code>web/app/page.tsx</code>、<code>web/components/nodes/index.tsx</code>、<code>api/main.py</code>、<code>docs/source-analysis.html</code>。</p>
|
<p><strong>影响:</strong><code>web/app/page.tsx</code>、<code>web/components/nodes/index.tsx</code>、<code>api/main.py</code>、<code>docs/source-analysis.html</code>。</p>
|
||||||
</div>
|
</div>
|
||||||
</article>
|
</article>
|
||||||
@@ -1234,7 +1246,7 @@ SubjectAsset {
|
|||||||
</header>
|
</header>
|
||||||
<div class="body">
|
<div class="body">
|
||||||
<p><strong>问题:</strong>抽帧精度不应该每次都让用户判断;点击一个视频抽帧后,其他视频不应被全局禁用,而应该可以先后排队。另外打开视频抽帧侧边面板后,也应能自动抽帧。</p>
|
<p><strong>问题:</strong>抽帧精度不应该每次都让用户判断;点击一个视频抽帧后,其他视频不应被全局禁用,而应该可以先后排队。另外打开视频抽帧侧边面板后,也应能自动抽帧。</p>
|
||||||
<p><strong>改动:</strong><code>quality</code> 新增 <code>auto</code> 默认值,后端按 CPU 核数、内存和视频时长解析为快速、精细或极准;本机 M2 Max + 64GB 的短视频会自动走极准。后端新增内存队列 <code>ANALYZE_QUEUE</code>,多个 <code>analyze</code> 请求按顺序执行;前端轮询所有运行中的 job,不只轮询当前 active job。<code>VideoFramePanelNode</code> 内也加入同一套自动抽帧工具条。</p>
|
<p><strong>改动:</strong><code>quality</code> 新增 <code>auto</code> 默认值,后端按 CPU 核数、内存和视频时长解析为快速或精细;为了展示稳定,auto 不再自动进入极准,极准仅在用户手动选择时启用。后端新增内存队列 <code>ANALYZE_QUEUE</code>,多个 <code>analyze</code> 请求按顺序执行;前端轮询所有运行中的 job,不只轮询当前 active job。<code>VideoFramePanelNode</code> 内也加入同一套自动抽帧工具条。</p>
|
||||||
<p><strong>影响:</strong><code>api/main.py</code>、<code>web/lib/api.ts</code>、<code>web/app/page.tsx</code>、<code>web/components/nodes/index.tsx</code>、<code>docs/source-analysis.html</code>。队列目前是进程内队列,重启后不会恢复未执行的排队任务。</p>
|
<p><strong>影响:</strong><code>api/main.py</code>、<code>web/lib/api.ts</code>、<code>web/app/page.tsx</code>、<code>web/components/nodes/index.tsx</code>、<code>docs/source-analysis.html</code>。队列目前是进程内队列,重启后不会恢复未执行的排队任务。</p>
|
||||||
</div>
|
</div>
|
||||||
</article>
|
</article>
|
||||||
|
|||||||
@@ -140,7 +140,7 @@ const FRAME_TARGET_OPTIONS: Array<{ value: FrameExtractTarget; label: string; hi
|
|||||||
]
|
]
|
||||||
const FRAME_COUNT_OPTIONS = [12, 8, 5, 3]
|
const FRAME_COUNT_OPTIONS = [12, 8, 5, 3]
|
||||||
const FRAME_QUALITY_OPTIONS: Array<{ value: FrameExtractQuality; label: string; hint: string }> = [
|
const FRAME_QUALITY_OPTIONS: Array<{ value: FrameExtractQuality; label: string; hint: string }> = [
|
||||||
{ value: "auto", label: "自动", hint: "按电脑性能和视频时长自动选择" },
|
{ value: "auto", label: "自动", hint: "展示友好:按电脑性能选择,最高只到精细" },
|
||||||
{ value: "fast", label: "快速", hint: "2fps / 360px,长视频省电" },
|
{ value: "fast", label: "快速", hint: "2fps / 360px,长视频省电" },
|
||||||
{ value: "accurate", label: "精细", hint: "8fps / 720px,M2 Max 轻松可用" },
|
{ value: "accurate", label: "精细", hint: "8fps / 720px,M2 Max 轻松可用" },
|
||||||
{ value: "ultra", label: "极准", hint: "12fps / 960px,本机约 3 秒扫描 1 分钟视频" },
|
{ value: "ultra", label: "极准", hint: "12fps / 960px,本机约 3 秒扫描 1 分钟视频" },
|
||||||
|
|||||||
Reference in New Issue
Block a user