init repo

This commit is contained in:
2026-04-25 19:25:22 +08:00
commit c7533eada2
50 changed files with 3732 additions and 0 deletions

0
app/graph/__init__.py Normal file
View File

108
app/graph/builder.py Normal file
View File

@@ -0,0 +1,108 @@
"""Graph builder — assembles nodes into an executable report generation graph.
No LangGraph dependency. Pure asyncio with a simple node-runner pattern.
v2: domain-aware, bilingual (translate node added)
"""
from __future__ import annotations
import logging
from typing import Callable, Awaitable
from app.middleware.chain import MiddlewareChain
from .state import ReportState, NodeStatus
from .nodes import (
DecomposeNode,
ParallelResearchNode,
WriteNode,
TranslateNode,
DataNode,
ReviewNode,
FormatNode,
)
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Type of a graph node: an async callable that takes the report state and
# returns the (possibly updated) report state.
NodeFn = Callable[[ReportState], Awaitable[ReportState]]
class ReportGraph:
    """Executable graph for report generation.

    Graph structure (v2):
        decompose → parallel_research → write → translate → data → review →
            ├─ pass   → format → END
            └─ revise → write → translate → data → review → ...

    A run stops at the first node that raises, or that returns with a newly
    set ``state.error`` (a node may report failure that way instead of
    raising). Later nodes are skipped, but the ``after`` middleware hook
    still executes and the state is always returned to the caller.
    """

    def __init__(self, middleware: MiddlewareChain | None = None):
        """Create the node instances.

        Args:
            middleware: Optional chain whose ``before``/``after`` hooks wrap
                the whole run.
        """
        self.middleware = middleware
        self.decompose = DecomposeNode()
        self.parallel_research = ParallelResearchNode()
        self.write = WriteNode()
        self.translate = TranslateNode()
        self.data = DataNode()
        self.review = ReviewNode()
        self.format = FormatNode()

    async def _run_node(self, name: str, node: NodeFn, state: ReportState) -> ReportState:
        """Run a single node with error handling.

        Args:
            name: Node name used in log messages and in the failure record.
            node: The async node callable.
            state: State handed to the node.

        Returns:
            The state returned by the node.

        Raises:
            Exception: Whatever the node raised, after the failure has been
                written to ``state.error`` / ``state.log_node`` and logged.
        """
        try:
            logger.info("[graph] entering node: %s", name)
            state = await node(state)
            logger.info("[graph] completed node: %s", name)
        except Exception as e:
            state.error = f"Node '{name}' failed: {e}"
            state.log_node(name, NodeStatus.FAILED, str(e))
            logger.exception("[graph] node '%s' failed", name)
            raise
        return state

    async def _run_stages(
        self,
        stages: list[tuple[str, NodeFn]],
        state: ReportState,
    ) -> tuple[ReportState, bool]:
        """Run ``stages`` in order and report whether the run may continue.

        Returns:
            ``(state, ok)``. ``ok`` is ``False`` as soon as a node raises or
            returns with an error it set itself; remaining stages are skipped.
        """
        for name, node in stages:
            prior_error = state.error
            try:
                state = await self._run_node(name, node, state)
            except Exception:
                # _run_node already recorded the failure on the state and
                # logged the traceback; here we only stop the pipeline.
                return state, False
            # Compare against the value seen before the node ran so that an
            # error already present on the incoming state does not halt the run.
            if state.error is not None and state.error != prior_error:
                logger.error(
                    "[graph] node '%s' reported an error, halting: %s",
                    name,
                    state.error,
                )
                return state, False
        return state, True

    async def run(self, state: ReportState) -> ReportState:
        """Execute the full graph.

        Args:
            state: Initial report state.

        Returns:
            The final state. On failure ``state.error`` describes what went
            wrong; this method does not re-raise node errors.
        """
        # --- Middleware: before ---
        if self.middleware:
            state = await self.middleware.before(state)

        try:
            # 1-2. Decompose the requirement, then research all tracks.
            state, ok = await self._run_stages(
                [
                    ("decompose", self.decompose),
                    ("parallel_research", self.parallel_research),
                ],
                state,
            )

            # 3-6. Write → Translate → Data → Review (with revision loop)
            while ok:
                state, ok = await self._run_stages(
                    [
                        ("write", self.write),
                        ("translate", self.translate),
                        ("data", self.data),
                        ("review", self.review),
                    ],
                    state,
                )
                if not ok:
                    break
                verdict = state.review.get("verdict", "pass")
                if verdict == "pass" or state.revision_count >= state.max_revisions:
                    if verdict != "pass":
                        logger.warning(
                            "[graph] forcing pass after %s revisions",
                            state.revision_count,
                        )
                    break
                # Revise — loop back to write
                state.revision_count += 1
                logger.info(
                    "[graph] revision %s/%s",
                    state.revision_count,
                    state.max_revisions,
                )

            # 7. Format bilingual output files
            if ok:
                state, ok = await self._run_stages([("format", self.format)], state)
        except Exception as exc:
            # Only reachable for failures outside a node (e.g. a malformed
            # review payload); record them instead of dropping them silently.
            logger.exception("[graph] run aborted outside of a node")
            if state.error is None:
                state.error = f"Graph run failed: {exc}"

        # --- Middleware: after ---
        if self.middleware:
            state = await self.middleware.after(state)
        return state

393
app/graph/nodes.py Normal file
View File

@@ -0,0 +1,393 @@
"""Graph nodes — each node is an async function: ReportState → ReportState.
Node layout (v2 — domain-aware, bilingual):
START
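[decompose] — Lead Agent splits the requirement into parallel research tracks, each tagged with domain + language
[parallel_research] — N sub-agents run in parallel, each using the model best suited to its domain
│ global tracks → Claude/GPT (English)
│ china tracks → DeepSeek/Qwen (Chinese)
[write] — Writer merges the research → produces the primary-language version
[translate] — high-quality translation → produces the other-language version
[data] — Data Agent generates charts/tables
[review] — Reviewer checks the report (both languages)
│ ├─ pass → [format]
│ └─ revise → [write]
[format] — writes the bilingual output files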
END
"""
from __future__ import annotations
import asyncio
import json
import logging
from datetime import datetime
from typing import Any
from app.agents.base import BaseAgent
from app.agents.researcher import ResearcherAgent
from app.agents.writer import WriterAgent
from app.agents.data_agent import DataAgent
from app.agents.reviewer import ReviewerAgent
from app.agents.formatter import FormatterAgent
from app.config import settings
from .state import ReportState, SubtaskResult, NodeStatus, ContentDomain
# Module-level logger, named after this module; shared by all nodes below.
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Node: decompose — Lead Agent decomposes into domain-tagged parallel tracks
# ---------------------------------------------------------------------------
class DecomposeNode:
    """Analyzes requirement and decomposes into domain-aware research tracks.

    The node asks the "lead" agent for a JSON plan and stores it verbatim in
    ``state.decomposition``.
    """

    # System prompt sent with every decomposition request. It is runtime text,
    # so it is kept exactly as the LLM should receive it.
    _SYSTEM_PROMPT = """\
You are a senior consulting partner planning a global industry report.
Your job is to decompose the client's requirement into 2-6 parallel research tracks.
CRITICAL: Each track must be tagged with a content domain and native language:
- domain: "global" → international markets, global competition, technology trends, overseas benchmarks
→ native_language: "en" (English sources are 10-100x richer for global analysis)
- domain: "china" → Chinese domestic market, government policy, local competitors, China-specific data
→ native_language: "zh" (Chinese sources are authoritative for domestic analysis)
The PRINCIPLE: whichever language has the richest professional literature for that topic
should be the native language. The other language version will be translated later.
Output (JSON):
{
"title_en": "English report title",
"title_zh": "中文报告标题",
"report_type": "report type",
"tracks": [
{
"title": "track title (in native language)",
"domain": "global|china",
"native_language": "en|zh",
"focus": "research focus description",
"prompt": "detailed research instructions (MUST be in the native_language)",
"data_needs": ["required data/charts"]
}
],
"synthesis_guide": "How to merge all tracks into a coherent report (bilingual structure notes)",
"methodology": "Analysis methodology"
}"""

    def __init__(self):
        lead = BaseAgent()
        lead.name = "lead"
        lead.model = settings.model_for_domain("reasoning")
        self.agent = lead

    @staticmethod
    def _build_prompt(state: ReportState) -> str:
        """Assemble the user prompt from the request fields on ``state``.

        Empty ``extra_data`` / ``client_context`` are rendered as "(none)".
        """
        sections = [
            "## Client requirement",
            f"{state.requirement}",
            "## Report type",
            f"{state.report_type}",
            "## Additional data",
            f"{state.extra_data or '(none)'}",
            "## Client context",
            f"{state.client_context or '(none)'}",
            "Decompose into parallel research tracks with domain and language tags. Output JSON.",
        ]
        return "\n".join(sections)

    async def __call__(self, state: ReportState) -> ReportState:
        """Request the decomposition plan and store it on the state.

        Returns:
            The same ``state`` with ``decomposition`` set and two entries
            (RUNNING, COMPLETED) appended to the node history.
        """
        node_name = "decompose"
        state.current_node = node_name
        state.log_node(node_name, NodeStatus.RUNNING)

        user_prompt = self._build_prompt(state)
        plan = await self.agent.call_llm_json(user_prompt, system=self._SYSTEM_PROMPT)
        state.decomposition = plan

        track_count = len(plan.get("tracks", []))
        state.log_node(node_name, NodeStatus.COMPLETED, f"{track_count} tracks")
        return state
# ---------------------------------------------------------------------------
# Node: parallel_research — domain-aware parallel execution
# ---------------------------------------------------------------------------
class ParallelResearchNode:
    """Runs research subtasks in parallel, each using the optimal model for its domain.

    Individual track failures are tolerated (the track is stored with
    ``NodeStatus.FAILED``). The node itself reports failure, by setting
    ``state.error`` and logging ``FAILED``, when there are no tracks at all
    or when every track failed, because downstream nodes would otherwise
    work from no research.
    """

    # Upper bound on tracks researched at the same time.
    MAX_CONCURRENT = 5

    @staticmethod
    def _parse_domain(raw: Any) -> ContentDomain:
        """Map a track's ``domain`` value to ``ContentDomain``.

        Unknown or missing values fall back to ``ContentDomain.GLOBAL``.
        """
        try:
            return ContentDomain(raw)
        except ValueError:
            return ContentDomain.GLOBAL

    async def _run_one(self, track: dict[str, Any]) -> SubtaskResult:
        """Research a single track.

        Args:
            track: One entry of ``decomposition["tracks"]``. Reads ``title``,
                ``domain``, ``native_language``, ``prompt`` and ``focus``.

        Returns:
            A ``SubtaskResult`` with status ``COMPLETED`` (``content`` set)
            or ``FAILED`` (``error`` set). Errors raised while researching
            are captured on the result, never propagated.
        """
        # The track comes from LLM-produced JSON, so null fields are possible;
        # normalize them before building the result object.
        title = str(track.get("title") or "")
        domain = self._parse_domain(track.get("domain", "global"))
        native_lang = str(track.get("native_language") or "en")

        result = SubtaskResult(
            description=title,
            domain=domain,
            native_language=native_lang,
        )
        result.status = NodeStatus.RUNNING
        result.started_at = datetime.now()
        try:
            # Select model based on domain
            model = settings.model_for_domain(domain.value)
            agent = ResearcherAgent(model=model, language=native_lang)
            logger.info(
                "[parallel_research] track '%s' → domain=%s, lang=%s, model=%s",
                title,
                domain.value,
                native_lang,
                model,
            )
            research = await agent.run({
                "requirement": track["prompt"],
                "report_type": track.get("focus", ""),
                "extra_data": "",
            })
            result.content = research.get("research", {})
            result.status = NodeStatus.COMPLETED
        except Exception as e:
            result.error = str(e)
            result.status = NodeStatus.FAILED
            logger.exception("Research track '%s' failed", title)
        finally:
            result.completed_at = datetime.now()
        return result

    async def __call__(self, state: ReportState) -> ReportState:
        """Research all tracks of ``state.decomposition`` concurrently.

        Returns:
            The same ``state`` with ``research_results`` filled in. When no
            track exists or none succeeded, ``state.error`` is set.
        """
        state.current_node = "parallel_research"
        state.log_node("parallel_research", NodeStatus.RUNNING)

        tracks = state.decomposition.get("tracks", [])
        if not tracks:
            state.log_node("parallel_research", NodeStatus.FAILED, "no tracks")
            state.error = "Decomposition produced no research tracks"
            return state

        semaphore = asyncio.Semaphore(self.MAX_CONCURRENT)

        async def bounded(track):
            async with semaphore:
                return await self._run_one(track)

        logger.info("[parallel_research] launching %s tracks concurrently", len(tracks))
        results = await asyncio.gather(*(bounded(t) for t in tracks))
        state.research_results = list(results)

        succeeded = sum(1 for r in results if r.status == NodeStatus.COMPLETED)
        domains = {}
        for r in results:
            domains.setdefault(r.domain.value, []).append(r.native_language)
        summary = f"{succeeded}/{len(tracks)} ok, domains={domains}"

        if succeeded == 0:
            # Same treatment as "no tracks": there is nothing to write from.
            first_error = next((r.error for r in results if r.error), "unknown error")
            state.log_node("parallel_research", NodeStatus.FAILED, summary)
            state.error = f"All {len(tracks)} research tracks failed: {first_error}"
            return state

        state.log_node("parallel_research", NodeStatus.COMPLETED, summary)
        return state
# ---------------------------------------------------------------------------
# Node: write — synthesize research into primary-language draft
# ---------------------------------------------------------------------------
class WriteNode:
    """Synthesizes the completed research tracks into the primary-language draft."""

    def __init__(self):
        self.agent = WriterAgent()

    @staticmethod
    def _format_feedback(review: dict[str, Any], revision: int) -> str:
        """Render reviewer issues as a markdown section for the writer.

        Each issue becomes one bullet: ``- [severity] description``, followed
        by `` — Suggestion: ...`` only when a suggestion is present. Missing
        fields are omitted rather than rendered as the text "None".
        """
        lines = [f"\n\n## Review feedback (revision {revision})\n"]
        # "issues" may be absent or null in the reviewer's JSON.
        for issue in review.get("issues") or []:
            if not isinstance(issue, dict):
                # Tolerate a plain-text issue instead of crashing the revision.
                lines.append(f"- {issue}\n")
                continue
            severity = issue.get("severity") or "unspecified"
            description = issue.get("description") or ""
            suggestion = issue.get("suggestion") or ""
            line = f"- [{severity}] {description}"
            if suggestion:
                line += f" — Suggestion: {suggestion}"
            lines.append(line + "\n")
        return "".join(lines)

    async def __call__(self, state: ReportState) -> ReportState:
        """Ask the writer agent for a draft and store it in ``state.draft``.

        Only research results with status ``COMPLETED`` are passed on. From
        the first revision onward the previous review's issues are included
        as ``revision_feedback``.
        """
        state.current_node = "write"
        state.log_node("write", NodeStatus.RUNNING)

        research_merged = [
            {
                "track": r.description,
                "domain": r.domain.value,
                "native_language": r.native_language,
                "findings": r.content,
            }
            for r in state.research_results
            if r.status == NodeStatus.COMPLETED
        ]
        synthesis_guide = state.decomposition.get("synthesis_guide", "")

        review_feedback = ""
        if state.revision_count > 0 and state.review:
            review_feedback = self._format_feedback(state.review, state.revision_count)

        result = await self.agent.run({
            "requirement": state.requirement,
            "research": {
                "title_en": state.decomposition.get("title_en", ""),
                "title_zh": state.decomposition.get("title_zh", ""),
                "methodology": state.decomposition.get("methodology", ""),
                "tracks": research_merged,
                "synthesis_guide": synthesis_guide,
            },
            "revision_feedback": review_feedback,
        })
        state.draft = result.get("draft", {})
        state.log_node("write", NodeStatus.COMPLETED)
        return state
# ---------------------------------------------------------------------------
# Node: translate — produce the other language version
# ---------------------------------------------------------------------------
class TranslateNode:
    """Translates the draft into the other language version.

    The draft's language is detected from its ``title`` (any CJK ideograph
    means Chinese). The translation is skipped when there is no draft or
    when the language it would produce is not listed in
    ``state.output_languages``.
    """

    def __init__(self):
        self.agent = BaseAgent()
        self.agent.name = "translator"
        self.agent.model = settings.model_for_domain("translation")

    async def __call__(self, state: ReportState) -> ReportState:
        """Translate ``state.draft`` and store it in ``state.draft_translated``.

        Returns:
            The same ``state``; ``draft_translated`` is left untouched when
            the node is skipped.
        """
        state.current_node = "translate"
        state.log_node("translate", NodeStatus.RUNNING)
        if not state.draft:
            state.log_node("translate", NodeStatus.COMPLETED, "skipped")
            return state

        # Detect primary language of draft. str() guards against a non-string
        # title coming back from the writer's JSON.
        title = str(state.draft.get("title") or "")
        is_chinese_primary = any('\u4e00' <= c <= '\u9fff' for c in title)
        if is_chinese_primary:
            source_lang, target_lang, target_code = "Chinese", "English", "en"
        else:
            source_lang, target_lang, target_code = "English", "Chinese (Simplified)", "zh"

        # Skip based on the language this translation would actually produce,
        # not on a hard-coded "en": an English draft still needs its Chinese
        # version when "zh" is requested, and needs nothing when only "en" is.
        if target_code not in state.output_languages:
            state.log_node("translate", NodeStatus.COMPLETED, "skipped")
            return state

        draft_json = json.dumps(state.draft, ensure_ascii=False, indent=2)
        system = f"""\
You are a world-class {source_lang}→{target_lang} translator specializing in
consulting and business reports.
Translation principles:
1. ACCURACY over fluency — every data point, percentage, and proper noun must be correct
2. Professional terminology — use standard {target_lang} business/industry terms
3. Preserve structure — keep the exact same JSON structure, only translate text values
4. Cultural adaptation — adjust phrasing for the target audience (not word-for-word)
5. Keep {{{{CHART:...}}}} and {{{{TABLE:...}}}} markers, translate their descriptions
Output the translated JSON with the exact same structure."""
        prompt = f"""\
Translate this consulting report from {source_lang} to {target_lang}.
{draft_json}
Output the translated JSON."""
        translated = await self.agent.call_llm_json(prompt, system=system, max_tokens=8192)
        state.draft_translated = translated
        state.log_node("translate", NodeStatus.COMPLETED,
                       f"{source_lang}→{target_lang}")
        return state
# ---------------------------------------------------------------------------
# Node: data — generate charts and tables
# ---------------------------------------------------------------------------
class DataNode:
    """Asks the data agent for charts/tables that accompany the draft."""

    def __init__(self):
        self.agent = DataAgent()

    async def __call__(self, state: ReportState) -> ReportState:
        """Run the data agent and store its ``data_assets`` on the state.

        The agent receives the current draft and the caller-supplied extra
        data; a response without ``data_assets`` yields an empty dict.
        """
        node_name = "data"
        state.current_node = node_name
        state.log_node(node_name, NodeStatus.RUNNING)

        request = {
            "draft": state.draft,
            "extra_data": state.extra_data,
        }
        outcome = await self.agent.run(request)

        state.data_assets = outcome.get("data_assets", {})
        state.log_node(node_name, NodeStatus.COMPLETED)
        return state
# ---------------------------------------------------------------------------
# Node: review — bilingual quality check
# ---------------------------------------------------------------------------
class ReviewNode:
    """Runs the reviewer agent over both language versions of the draft."""

    def __init__(self):
        self.agent = ReviewerAgent()

    async def __call__(self, state: ReportState) -> ReportState:
        """Store the reviewer's result in ``state.review``.

        The agent is given the primary draft, the translated draft and the
        decomposition (passed under the ``research`` key). The logged detail
        shows the verdict, or ``?`` when the review carries none.
        """
        node_name = "review"
        state.current_node = node_name
        state.log_node(node_name, NodeStatus.RUNNING)

        request = {
            "draft": state.draft,
            "draft_translated": state.draft_translated,
            "research": state.decomposition,
        }
        outcome = await self.agent.run(request)
        state.review = outcome.get("review", {})

        verdict = state.review.get('verdict', '?')
        state.log_node(node_name, NodeStatus.COMPLETED, f"verdict={verdict}")
        return state
# ---------------------------------------------------------------------------
# Node: format — render bilingual output files
# ---------------------------------------------------------------------------
class FormatNode:
    """Renders the primary draft and, when present, the translated draft to files."""

    def __init__(self):
        self.agent = FormatterAgent()

    async def _render_variant(self, state: ReportState, draft: dict[str, Any], subdir: str):
        """Format one draft into ``<output_dir>/<state.id>/<subdir>``.

        Returns:
            The ``generated_files`` reported by the formatter agent, or an
            empty list when the agent reports none.
        """
        outcome = await self.agent.run({
            "draft": draft,
            "data_assets": state.data_assets,
            "output_dir": str(settings.output_dir / state.id / subdir),
            "output_formats": state.output_formats,
        })
        return outcome.get("generated_files", [])

    async def __call__(self, state: ReportState) -> ReportState:
        """Produce the output files and record them in ``state.generated_files``."""
        node_name = "format"
        state.current_node = node_name
        state.log_node(node_name, NodeStatus.RUNNING)

        produced = []
        # Primary version
        produced.extend(await self._render_variant(state, state.draft, "primary"))
        # Translated version (if available); checked only after the primary
        # version has been rendered.
        if state.draft_translated:
            produced.extend(
                await self._render_variant(state, state.draft_translated, "translated")
            )

        state.generated_files = produced
        state.log_node(node_name, NodeStatus.COMPLETED, f"{len(produced)} files")
        return state

104
app/graph/state.py Normal file
View File

@@ -0,0 +1,104 @@
"""Report generation graph state — the shared context that flows through all nodes."""
from __future__ import annotations
import uuid
from datetime import datetime
from enum import Enum
from typing import Any
from pydantic import BaseModel, Field
class NodeStatus(str, Enum):
    """Execution status of a graph node or research subtask.

    Members are also ``str`` instances, so they compare equal to their
    plain string values.
    """

    PENDING = "pending"      # default status of a new SubtaskResult
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
class ContentDomain(str, Enum):
    """Content domain — determines which model and language to use.

    Members are also ``str`` instances, so they compare equal to their
    plain string values.
    """

    GLOBAL = "global"  # International markets, global trends → English-native
    CHINA = "china"  # Chinese market, domestic policy → Chinese-native
    REASONING = "reasoning"  # Synthesis, review, strategy → strongest reasoning
    FAST = "fast"  # Data processing, charts → cost-effective
    TRANSLATION = "translation"  # EN↔ZH translation
class SubtaskResult(BaseModel):
    """Result from a parallel research subtask."""

    # Short random identifier (first 8 hex characters of a UUID4).
    task_id: str = Field(default_factory=lambda: uuid.uuid4().hex[:8])
    description: str = ""
    domain: ContentDomain = ContentDomain.GLOBAL
    native_language: str = "en"  # "en" or "zh" — the original writing language
    status: NodeStatus = NodeStatus.PENDING
    content: dict[str, Any] = Field(default_factory=dict)
    error: str | None = None
    started_at: datetime | None = None
    completed_at: datetime | None = None

    @property
    def duration_ms(self) -> int | None:
        """Elapsed time between start and completion in whole milliseconds.

        Returns ``None`` until both timestamps have been set.
        """
        if not (self.started_at and self.completed_at):
            return None
        elapsed = self.completed_at - self.started_at
        return int(elapsed.total_seconds() * 1000)
class ReportState(BaseModel):
    """Full state for a report generation run.

    This is the single source of truth that all graph nodes read from and write to.
    """

    # Identity
    id: str = Field(default_factory=lambda: uuid.uuid4().hex[:12])
    created_at: datetime = Field(default_factory=datetime.now)

    # --- Input (set once at start) ---
    requirement: str = ""
    report_type: str = "行业分析报告"
    extra_data: str = ""
    output_formats: list[str] = Field(default=["docx"])
    output_languages: list[str] = Field(default=["zh", "en"])  # produce both versions
    template_name: str | None = None
    client_id: str | None = None  # for multi-tenant isolation

    # --- Middleware injections ---
    client_context: str = ""  # injected by ClientContextMiddleware
    memory_facts: list[str] = Field(default_factory=list)  # injected by MemoryMiddleware
    token_budget: int = 120000  # managed by TokenBudgetMiddleware

    # --- Lead Agent output ---
    # e.g. {"tracks": [{"title": "<track title>", "prompt": "<research instructions>"}, ...]}
    decomposition: dict[str, Any] = Field(default_factory=dict)

    # --- Parallel research results ---
    research_results: list[SubtaskResult] = Field(default_factory=list)

    # --- Writer output ---
    draft: dict[str, Any] = Field(default_factory=dict)  # primary language version
    draft_translated: dict[str, Any] = Field(default_factory=dict)  # translated version

    # --- Data Agent output ---
    data_assets: dict[str, Any] = Field(default_factory=dict)

    # --- Reviewer output ---
    review: dict[str, Any] = Field(default_factory=dict)
    revision_count: int = 0
    max_revisions: int = 2

    # --- Formatter output ---
    generated_files: list[str] = Field(default_factory=list)

    # --- Execution tracking ---
    current_node: str = ""
    node_history: list[dict[str, Any]] = Field(default_factory=list)
    error: str | None = None

    def log_node(self, node_name: str, status: NodeStatus, detail: str = ""):
        """Append one timestamped entry to ``node_history``.

        Args:
            node_name: Name of the node the entry is about.
            status: Status to record; stored as its string value.
            detail: Optional free-text detail.
        """
        entry = {
            "node": node_name,
            "status": status.value,
            "detail": detail,
            "timestamp": datetime.now().isoformat(),
        }
        self.node_history.append(entry)