fix: retry transient xai video creation failures
This commit is contained in:
89
api/main.py
89
api/main.py
@@ -451,6 +451,8 @@ VIDEO_STATUS_PATH = os.getenv("VIDEO_STATUS_PATH", DEFAULT_VIDEO_STATUS_PATH).st
|
||||
VIDEO_CONTENT_PATH = os.getenv("VIDEO_CONTENT_PATH", DEFAULT_VIDEO_CONTENT_PATH).strip() or DEFAULT_VIDEO_CONTENT_PATH
|
||||
VIDEO_DURATION_FIELD = os.getenv("VIDEO_DURATION_FIELD", "seconds").strip() or "seconds"
|
||||
VIDEO_POLL_TIMEOUT_SECONDS = max(60, int(os.getenv("VIDEO_POLL_TIMEOUT_SECONDS", "900")))
|
||||
# Number of create attempts per path, read from the environment (default "3")
# and floored at 1. _video_create_attempts applies it only to xAI models.
VIDEO_CREATE_RETRY_ATTEMPTS = max(1, int(os.getenv("VIDEO_CREATE_RETRY_ATTEMPTS", "3")))
|
||||
# Base delay in seconds between create attempts, read from the environment
# (default "2") and floored at 0.5. _video_create_retry_delay doubles it per
# attempt.
VIDEO_CREATE_RETRY_BACKOFF_SECONDS = max(0.5, float(os.getenv("VIDEO_CREATE_RETRY_BACKOFF_SECONDS", "2")))
|
||||
FFMPEG_BIN = os.getenv("FFMPEG_BIN", "").strip()
|
||||
FFPROBE_BIN = os.getenv("FFPROBE_BIN", "").strip()
|
||||
LOCAL_FFMPEG_CANDIDATES = [
|
||||
@@ -6729,6 +6731,8 @@ def health() -> dict:
|
||||
"video_base_url": video_api_base(),
|
||||
"video_configured": bool(video_api_key()),
|
||||
"video_create_paths": VIDEO_CREATE_PATHS,
|
||||
"video_create_retry_attempts": VIDEO_CREATE_RETRY_ATTEMPTS,
|
||||
"video_create_retry_backoff_seconds": VIDEO_CREATE_RETRY_BACKOFF_SECONDS,
|
||||
"xai_video_model": XAI_VIDEO_MODEL,
|
||||
"xai_video_base_url": XAI_VIDEO_API_BASE_URL,
|
||||
"xai_video_configured": bool(video_api_key(XAI_VIDEO_MODEL)),
|
||||
@@ -9067,6 +9071,9 @@ def _video_public_error(raw: object) -> str:
|
||||
"connecterror",
|
||||
"connecttimeout",
|
||||
"readtimeout",
|
||||
"connection reset",
|
||||
"connection aborted",
|
||||
"remote protocol error",
|
||||
"ssl:",
|
||||
"_ssl.c",
|
||||
"handshake",
|
||||
@@ -9124,6 +9131,19 @@ def _video_public_error(raw: object) -> str:
|
||||
if any(token in lower for token in ("timeout", "timed out", "readtimeout", "connecttimeout", "超时")):
|
||||
return "视频生成失败:视频模型响应超时,可能是上游繁忙或网络不稳定。请稍后重试,或缩短时长后再生成。"
|
||||
|
||||
if any(token in lower for token in (
|
||||
"http 500",
|
||||
"http 502",
|
||||
"http 503",
|
||||
"http 504",
|
||||
"internal server error",
|
||||
"bad gateway",
|
||||
"service unavailable",
|
||||
"gateway timeout",
|
||||
"server error",
|
||||
)):
|
||||
return "视频生成失败:视频模型上游服务暂时异常,系统已自动重试但仍未成功。请稍后重新生成;如果持续出现,请联系管理员检查视频网关。"
|
||||
|
||||
if any(token in lower for token in (
|
||||
"name or service not known",
|
||||
"temporary failure in name resolution",
|
||||
@@ -9131,6 +9151,9 @@ def _video_public_error(raw: object) -> str:
|
||||
"connection refused",
|
||||
"network is unreachable",
|
||||
"connecterror",
|
||||
"connection reset",
|
||||
"connection aborted",
|
||||
"remote protocol error",
|
||||
"ssl:",
|
||||
"网络",
|
||||
"dns",
|
||||
@@ -9309,6 +9332,21 @@ def submit_video_create(
|
||||
)
|
||||
|
||||
|
||||
# HTTP status codes from a video create request that cause
# render_storyboard_video to sleep and try again while attempts remain.
_VIDEO_CREATE_RETRY_STATUS_CODES = {408, 409, 425, 429, 500, 502, 503, 504}
|
||||
|
||||
|
||||
def _video_create_attempts(model: str | None) -> int:
    """Return how many times a create request is attempted for ``model``.

    Models for which ``video_uses_xai`` is truthy get
    ``VIDEO_CREATE_RETRY_ATTEMPTS`` tries; every other model gets exactly one.
    """
    if video_uses_xai(model):
        return VIDEO_CREATE_RETRY_ATTEMPTS
    return 1
|
||||
|
||||
|
||||
def _video_create_retry_delay(attempt: int) -> float:
    """Return the pause in seconds to wait after failed attempt ``attempt``.

    The delay is ``VIDEO_CREATE_RETRY_BACKOFF_SECONDS`` for attempt 1 and
    doubles with each further attempt, capped at 20 seconds. Attempt numbers
    below 1 are treated like attempt 1.
    """
    doublings = max(0, attempt - 1)
    uncapped = VIDEO_CREATE_RETRY_BACKOFF_SECONDS * (2 ** doublings)
    return min(20.0, uncapped)
|
||||
|
||||
|
||||
def _video_create_transport_error(exc: Exception) -> bool:
    """Report whether ``exc`` is an httpx transport or timeout exception."""
    transport_types = (httpx.TransportError, httpx.TimeoutException)
    return isinstance(exc, transport_types)
|
||||
|
||||
|
||||
def render_storyboard_video(
|
||||
job_id: str,
|
||||
local_id: str,
|
||||
@@ -9352,22 +9390,43 @@ def render_storyboard_video(
|
||||
create = None
|
||||
create_errors: list[str] = []
|
||||
for create_path in video_create_paths(model):
|
||||
resp = submit_video_create(client, f"{base}{video_path(create_path)}", headers, ref_img, payload, source_ref, prepared_last_img, prepared_product_imgs, primary_role)
|
||||
if video_uses_ark(model) and source_ref and resp.status_code in {400, 422}:
|
||||
create_errors.append(f"{video_path(create_path)} + reference_video -> HTTP {resp.status_code}: {resp.text[:700]}")
|
||||
resp = submit_video_create(client, f"{base}{video_path(create_path)}", headers, ref_img, payload, None, prepared_last_img, prepared_product_imgs, primary_role)
|
||||
if video_uses_ark(model) and prepared_last_img and resp.status_code in {400, 422}:
|
||||
create_errors.append(f"{video_path(create_path)} + last_frame -> HTTP {resp.status_code}: {resp.text[:700]}")
|
||||
resp = submit_video_create(client, f"{base}{video_path(create_path)}", headers, ref_img, payload, None, None, prepared_product_imgs, primary_role)
|
||||
if video_uses_ark(model) and prepared_product_imgs and resp.status_code in {400, 422}:
|
||||
create_errors.append(f"{video_path(create_path)} + product_reference -> HTTP {resp.status_code}: {resp.text[:700]}")
|
||||
resp = submit_video_create(client, f"{base}{video_path(create_path)}", headers, ref_img, payload, None, prepared_last_img, None, primary_role)
|
||||
if resp.status_code < 400:
|
||||
create = resp
|
||||
path = video_path(create_path)
|
||||
url = f"{base}{path}"
|
||||
attempts = _video_create_attempts(model)
|
||||
for attempt in range(1, attempts + 1):
|
||||
try:
|
||||
resp = submit_video_create(client, url, headers, ref_img, payload, source_ref, prepared_last_img, prepared_product_imgs, primary_role)
|
||||
except Exception as exc:
|
||||
create_errors.append(f"{path} attempt {attempt}/{attempts} -> {exc.__class__.__name__}: {str(exc)[:700]}")
|
||||
if attempt < attempts and _video_create_transport_error(exc):
|
||||
delay = _video_create_retry_delay(attempt)
|
||||
print(f"[video create retry] job={job_id} video={local_id} path={path} attempt={attempt}/{attempts} error={str(exc)[:300]} retry_in={delay:.1f}s", flush=True)
|
||||
time.sleep(delay)
|
||||
continue
|
||||
raise
|
||||
if video_uses_ark(model) and source_ref and resp.status_code in {400, 422}:
|
||||
create_errors.append(f"{path} + reference_video -> HTTP {resp.status_code}: {resp.text[:700]}")
|
||||
resp = submit_video_create(client, url, headers, ref_img, payload, None, prepared_last_img, prepared_product_imgs, primary_role)
|
||||
if video_uses_ark(model) and prepared_last_img and resp.status_code in {400, 422}:
|
||||
create_errors.append(f"{path} + last_frame -> HTTP {resp.status_code}: {resp.text[:700]}")
|
||||
resp = submit_video_create(client, url, headers, ref_img, payload, None, None, prepared_product_imgs, primary_role)
|
||||
if video_uses_ark(model) and prepared_product_imgs and resp.status_code in {400, 422}:
|
||||
create_errors.append(f"{path} + product_reference -> HTTP {resp.status_code}: {resp.text[:700]}")
|
||||
resp = submit_video_create(client, url, headers, ref_img, payload, None, prepared_last_img, None, primary_role)
|
||||
if resp.status_code < 400:
|
||||
create = resp
|
||||
break
|
||||
create_errors.append(f"{path} attempt {attempt}/{attempts} -> HTTP {resp.status_code}: {resp.text[:700]}")
|
||||
if resp.status_code in _VIDEO_CREATE_RETRY_STATUS_CODES and attempt < attempts:
|
||||
delay = _video_create_retry_delay(attempt)
|
||||
print(f"[video create retry] job={job_id} video={local_id} path={path} attempt={attempt}/{attempts} http={resp.status_code} retry_in={delay:.1f}s", flush=True)
|
||||
time.sleep(delay)
|
||||
continue
|
||||
if resp.status_code not in {400, 404, 405}:
|
||||
raise RuntimeError(_video_create_failure_message(create_errors))
|
||||
break
|
||||
if create is not None:
|
||||
break
|
||||
create_errors.append(f"{video_path(create_path)} -> HTTP {resp.status_code}: {resp.text[:700]}")
|
||||
if resp.status_code not in {400, 404, 405}:
|
||||
resp.raise_for_status()
|
||||
if create is None:
|
||||
print(f"[video create failed] job={job_id} video={local_id} errors={' | '.join(create_errors)[:1800]}", flush=True)
|
||||
raise RuntimeError(_video_create_failure_message(create_errors))
|
||||
|
||||
Reference in New Issue
Block a user