init repo

2026-04-25 19:25:22 +08:00
commit c7533eada2
50 changed files with 3732 additions and 0 deletions
--- a/app/agents/init.py
+++ b/app/agents/init.py
@@ -0,0 +1,15 @@
+from .base import BaseAgent
+from .researcher import ResearcherAgent
+from .writer import WriterAgent
+from .data_agent import DataAgent
+from .reviewer import ReviewerAgent
+from .formatter import FormatterAgent
+
+# Public API of the package: the names a star-import of this package exposes.
+__all__ = [
+    "BaseAgent",
+    "ResearcherAgent",
+    "WriterAgent",
+    "DataAgent",
+    "ReviewerAgent",
+    "FormatterAgent",
+]
--- a/app/agents/base.py
+++ b/app/agents/base.py
@@ -0,0 +1,166 @@
+"""Base agent with LLM calling via litellm."""
+
+from __future__ import annotations
+
+import json
+import logging
+from typing import Any
+
+import litellm
+
+from app.config import settings
+
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+
+# Disable litellm telemetry
+litellm.telemetry = False
+
+
+class BaseAgent:
+    """Base class for all pipeline agents.
+
+    Subclasses override the class attributes below and implement :meth:`run`.
+    LLM access goes through :meth:`call_llm` and :meth:`call_llm_json`, which
+    wrap ``litellm.acompletion``.
+    """
+
+    name: str = "base"  # used as the prefix of this agent's log lines
+    description: str = ""
+    system_prompt: str = ""  # default system message when call_llm gets none
+    model: str = ""  # empty = use default from config
+
+    def __init__(self, model: str | None = None):
+        """Optionally pin this instance to *model* instead of the class default."""
+        if model:
+            self.model = model
+
+    def get_model(self) -> str:
+        """Return the agent's model, falling back to ``settings.llm_model``."""
+        return self.model or settings.llm_model
+
+    async def call_llm(
+        self,
+        prompt: str,
+        *,
+        system: str | None = None,
+        temperature: float = 0.3,
+        max_tokens: int = 4096,
+        response_format: dict | None = None,
+    ) -> str:
+        """Call LLM via litellm. Returns the text response.
+
+        Args:
+            prompt: Content of the user message.
+            system: System message; defaults to ``self.system_prompt``.
+            temperature: Sampling temperature forwarded to litellm.
+            max_tokens: Completion token limit forwarded to litellm.
+            response_format: Optional ``response_format`` forwarded to litellm.
+
+        Returns:
+            The first choice's message content, or ``""`` when the response
+            carries no text content.
+        """
+        messages = []
+        sys_prompt = system or self.system_prompt
+        if sys_prompt:
+            messages.append({"role": "system", "content": sys_prompt})
+        messages.append({"role": "user", "content": prompt})
+
+        model = self.get_model()
+        kwargs: dict[str, Any] = {
+            "model": model,
+            "messages": messages,
+            "temperature": temperature,
+            "max_tokens": max_tokens,
+        }
+        # Credentials / endpoint overrides are only sent when configured.
+        if settings.llm_api_key:
+            kwargs["api_key"] = settings.llm_api_key
+        if settings.llm_api_base:
+            kwargs["api_base"] = settings.llm_api_base
+        if response_format:
+            kwargs["response_format"] = response_format
+
+        logger.info("[%s] calling %s", self.name, model)
+        response = await litellm.acompletion(**kwargs)
+        # The message content can be null in OpenAI-style responses; normalise
+        # it so len() below and the callers' string handling cannot crash.
+        content = response.choices[0].message.content or ""
+        logger.info("[%s] got %d chars", self.name, len(content))
+        return content
+
+    async def call_llm_json(self, prompt: str, **kwargs) -> dict:
+        """Call LLM and parse response as JSON.
+
+        Args:
+            prompt: Content of the user message.
+            **kwargs: Forwarded to :meth:`call_llm` (``system``, ``temperature``,
+                ``max_tokens``).
+
+        Returns:
+            The decoded JSON value (an object is requested from the model).
+
+        Raises:
+            json.JSONDecodeError: If no parsing strategy succeeds.
+        """
+        raw = await self.call_llm(
+            prompt,
+            response_format={"type": "json_object"},
+            **kwargs,
+        )
+        return self._parse_json_text(raw)
+
+    @staticmethod
+    def _strip_code_fence(raw: str) -> str:
+        """Remove a surrounding markdown code fence from *raw*, if present."""
+        text = raw.strip()
+        if text.startswith("```"):
+            # Drop the opening fence line (it may carry a language tag).
+            first_nl = text.find("\n")
+            if first_nl != -1:
+                text = text[first_nl + 1:]
+            if text.endswith("```"):
+                text = text[: text.rfind("```")]
+            text = text.strip()
+        return text
+
+    @staticmethod
+    def _clean_json_string(s: str) -> str:
+        """Escape raw control characters found inside JSON string literals.
+
+        Best-effort repair for model output that contains literal
+        newlines/tabs inside strings; text outside strings is left untouched.
+        """
+        result = []
+        in_string = False
+        escape = False
+        for ch in s:
+            if escape:
+                # Character following a backslash is copied verbatim.
+                result.append(ch)
+                escape = False
+                continue
+            if ch == '\\':
+                result.append(ch)
+                escape = True
+                continue
+            if ch == '"':
+                in_string = not in_string
+                result.append(ch)
+                continue
+            if in_string and ord(ch) < 32:
+                if ch == '\n':
+                    result.append('\\n')
+                elif ch == '\r':
+                    result.append('\\r')
+                elif ch == '\t':
+                    result.append('\\t')
+                else:
+                    result.append(f'\\u{ord(ch):04x}')
+                continue
+            result.append(ch)
+        return ''.join(result)
+
+    @staticmethod
+    def _load_first_object(text: str, start: int) -> tuple[bool, Any]:
+        """Try to decode a balanced ``{...}`` span beginning at *start*.
+
+        Braces inside string literals are ignored so that values such as
+        ``"}"`` do not end the span early.
+
+        Returns:
+            ``(True, value)`` for the first balanced span that decodes,
+            otherwise ``(False, None)``.
+        """
+        depth = 0
+        in_string = False
+        escape = False
+        for i in range(start, len(text)):
+            ch = text[i]
+            if in_string:
+                if escape:
+                    escape = False
+                elif ch == '\\':
+                    escape = True
+                elif ch == '"':
+                    in_string = False
+                continue
+            if ch == '"':
+                in_string = True
+            elif ch == '{':
+                depth += 1
+            elif ch == '}':
+                depth -= 1
+                if depth == 0:
+                    try:
+                        return True, json.loads(text[start:i + 1])
+                    except json.JSONDecodeError:
+                        # Keep scanning: a later closing brace may complete
+                        # a larger span that does decode.
+                        continue
+        return False, None
+
+    @classmethod
+    def _parse_json_text(cls, raw: str) -> Any:
+        """Decode model output as JSON using progressively looser strategies.
+
+        Order: the fence-stripped text as is; the same text with control
+        characters escaped; the first balanced object inside it (the model
+        may have appended commentary); finally the optional ``json_repair``
+        package.
+
+        Raises:
+            json.JSONDecodeError: If the text contains no ``{`` or every
+                strategy fails.
+        """
+        text = cls._strip_code_fence(raw)
+        cleaned = cls._clean_json_string(text)
+
+        for candidate in (text, cleaned):
+            try:
+                return json.loads(candidate)
+            except json.JSONDecodeError:
+                continue
+
+        # Index into the same string that is scanned below; cleaning can
+        # lengthen the text, so offsets taken from ``text`` would not line up.
+        start = cleaned.find('{')
+        if start == -1:
+            raise json.JSONDecodeError("No JSON object found", text, 0)
+
+        found, value = cls._load_first_object(cleaned, start)
+        if found:
+            return value
+
+        # If all else fails, use json_repair library or raise
+        try:
+            import json_repair
+            return json_repair.loads(text)
+        except Exception as err:  # also covers ImportError when not installed
+            raise json.JSONDecodeError(
+                "Failed to parse JSON after multiple attempts", text, 0
+            ) from err
+
+    async def run(self, context: dict[str, Any]) -> dict[str, Any]:
+        """Execute this agent's task. Override in subclasses.
+
+        Args:
+            context: Shared pipeline context (accumulated by previous agents).
+
+        Returns:
+            Dict of new keys to merge into context.
+        """
+        raise NotImplementedError
--- a/app/agents/data_agent.py
+++ b/app/agents/data_agent.py
@@ -0,0 +1,78 @@
+"""Data Agent — processes data, generates chart specs and table data."""
+
+from __future__ import annotations
+
+import json
+from typing import Any
+
+from .base import BaseAgent
+from app.config import settings
+
+
+class DataAgent(BaseAgent):
+    """Agent that turns the draft's chart/table requests into concrete data.
+
+    Reads ``context["draft"]`` and returns the generated specs under the
+    ``"data_assets"`` key.
+    """
+
+    name = "data"
+    description = "处理数据、生成图表规格和表格数据"
+    system_prompt = """\
+你是一位数据分析专家。你的任务是根据报告草稿中标注的图表和表格需求，
+生成具体的数据和图表规格。
+
+输出要求（JSON 格式）：
+{
+  "charts": [
+    {
+      "id": "chart_1",
+      "title": "图表标题",
+      "type": "bar|line|pie|area|scatter",
+      "description": "图表说明",
+      "data": {
+        "labels": ["标签1", "标签2"],
+        "datasets": [
+          {"label": "数据集名", "data": [100, 200]}
+        ]
+      }
+    }
+  ],
+  "tables": [
+    {
+      "id": "table_1",
+      "title": "表格标题",
+      "headers": ["列1", "列2", "列3"],
+      "rows": [["数据1", "数据2", "数据3"]]
+    }
+  ]
+}"""
+
+    def __init__(self):
+        """Use the model that the settings map to the ``"fast"`` domain."""
+        super().__init__(model=settings.model_for_domain("fast"))
+
+    async def run(self, context: dict[str, Any]) -> dict[str, Any]:
+        """Generate chart specs and table data for the draft.
+
+        Args:
+            context: Pipeline context. ``draft`` is required; ``extra_data``
+                is optional source material that is pasted into the prompt.
+
+        Returns:
+            ``{"data_assets": {...}}``. When the draft requests no charts or
+            tables the LLM is not called and both lists are empty.
+
+        Raises:
+            KeyError: If ``context`` has no ``"draft"``.
+        """
+        draft = context["draft"]
+        extra_data = context.get("extra_data", "")
+
+        # Collect chart/table needs from draft. ``or []`` also covers keys
+        # that are present but null, which ``.get(key, [])`` would pass
+        # through as None and crash ``extend``.
+        chart_needs = []
+        table_needs = []
+        for ch in draft.get("chapters") or []:
+            chart_needs.extend(ch.get("charts") or [])
+            table_needs.extend(ch.get("tables") or [])
+
+        if not chart_needs and not table_needs:
+            return {"data_assets": {"charts": [], "tables": []}}
+
+        prompt = f"""\
+## 报告标题
+{draft.get("title", "")}
+
+## 需要生成的图表
+{json.dumps(chart_needs, ensure_ascii=False)}
+
+## 需要生成的表格
+{json.dumps(table_needs, ensure_ascii=False)}
+
+## 补充数据源
+{extra_data if extra_data else "（无额外数据，请根据行业常识生成合理的示例数据）"}
+
+请为以上需求生成具体的图表规格和表格数据。输出 JSON。"""
+
+        result = await self.call_llm_json(prompt)
+        # Keep the same shape as the early return above even if the model
+        # omitted one of the two lists.
+        if isinstance(result, dict):
+            result.setdefault("charts", [])
+            result.setdefault("tables", [])
+        return {"data_assets": result}
--- a/app/agents/formatter.py
+++ b/app/agents/formatter.py
@@ -0,0 +1,669 @@
+"""Formatter Agent — renders final report using Skills toolkit.
+
+Skills integration:
+  - docx: python-docx (baseline) + docx-js via Node.js (rich mode) + OOXML template editing
+  - pptx: html2pptx.js via Node.js (visual slides) + python-pptx fallback
+  - xlsx: openpyxl + recalc.py (formula recalculation via LibreOffice)
+  - pdf:  reportlab with CJK support + fpdf2 fallback
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+import shutil
+import subprocess
+import tempfile
+from pathlib import Path
+from typing import Any
+
+from .base import BaseAgent
+
+logger = logging.getLogger(__name__)
+
+# Skills root
+# NOTE(review): this is a fixed checkout location under the current user's
+# home directory. On a machine without that checkout every probe in
+# _skills_available() returns False — confirm whether the path should be
+# configurable.
+SKILLS_ROOT = Path.home() / "Projects/code/20260119-skills合集/anthropics_skills/skills"
+# One sub-directory per output format.
+DOCX_SKILLS = SKILLS_ROOT / "docx"
+PPTX_SKILLS = SKILLS_ROOT / "pptx"
+XLSX_SKILLS = SKILLS_ROOT / "xlsx"
+PDF_SKILLS = SKILLS_ROOT / "pdf"
+
+
+def _skills_available() -> dict[str, bool]:
+    """Probe the filesystem for each optional skill toolkit.
+
+    Returns:
+        Mapping of skill key to whether its marker file (or, for
+        ``pdf_scripts``, its directory) exists.
+    """
+    unpack_rel = Path("ooxml") / "scripts" / "unpack.py"
+    file_markers = [
+        ("docx_js", DOCX_SKILLS / "docx-js.md"),
+        ("html2pptx", PPTX_SKILLS / "scripts" / "html2pptx.js"),
+        ("recalc", XLSX_SKILLS / "recalc.py"),
+        ("ooxml_docx", DOCX_SKILLS / unpack_rel),
+        ("ooxml_pptx", PPTX_SKILLS / unpack_rel),
+    ]
+    found = {key: marker.exists() for key, marker in file_markers}
+    # The PDF toolkit is detected by its scripts directory, not a single file.
+    found["pdf_scripts"] = (PDF_SKILLS / "scripts").is_dir()
+    return found
+
+
+class FormatterAgent(BaseAgent):
+    name = "formatter"
+    description = "将报告渲染为 docx/pptx/xlsx/pdf，融合 Skills 能力"
+
+    def __init__(self):
+        """Initialise the base agent and record which skill toolkits exist."""
+        super().__init__()
+        self.skills = _skills_available()
+        # Log only the keys whose probe succeeded.
+        enabled = [skill for skill, present in self.skills.items() if present]
+        logger.info(f"[formatter] available skills: {enabled}")
+
+    async def run(self, context: dict[str, Any]) -> dict[str, Any]:
+        """Render the draft into every requested output format.
+
+        Args:
+            context: Pipeline context. Reads ``draft`` (required) plus the
+                optional ``data_assets``, ``output_dir`` (default
+                ``"output"``), ``output_formats`` (default ``["docx"]``) and
+                ``template_path``.
+
+        Returns:
+            ``{"generated_files": [...]}`` with one path string per format
+            that rendered; a format that fails is logged and left out.
+        """
+        draft = context["draft"]
+        assets = context.get("data_assets", {})
+        out_dir = Path(context.get("output_dir", "output"))
+        requested = context.get("output_formats", ["docx"])
+        template = context.get("template_path")  # optional: user-provided template
+
+        out_dir.mkdir(parents=True, exist_ok=True)
+        title = draft.get("title", "报告")
+        produced = []
+
+        for fmt in requested:
+            try:
+                if fmt == "docx":
+                    rendered = await self._render_docx(draft, assets, out_dir, title, template)
+                elif fmt == "pptx":
+                    rendered = await self._render_pptx(draft, assets, out_dir, title)
+                elif fmt == "xlsx":
+                    rendered = await self._render_xlsx(assets, out_dir, title)
+                elif fmt == "pdf":
+                    rendered = await self._render_pdf(draft, assets, out_dir, title)
+                else:
+                    logger.warning(f"Unsupported format: {fmt}")
+                    continue
+                produced.append(str(rendered))
+                logger.info(f"[formatter] generated {rendered}")
+            except Exception:
+                # One broken renderer must not stop the remaining formats.
+                logger.exception(f"[formatter] failed to render {fmt}")
+
+        return {"generated_files": produced}
+
+    # -----------------------------------------------------------------------
+    # DOCX — python-docx baseline + OOXML template editing
+    # -----------------------------------------------------------------------
+
+    async def _render_docx(
+        self, draft: dict, data_assets: dict, output_dir: Path, title: str,
+        template_path: str | None = None,
+    ) -> Path:
+        """Pick the DOCX renderer.
+
+        The template workflow is used only when a template path was given
+        and the OOXML skill was detected; otherwise the python-docx baseline
+        runs.
+        """
+        if not template_path or not self.skills["ooxml_docx"]:
+            return await self._render_docx_baseline(draft, data_assets, output_dir, title)
+        return await self._render_docx_from_template(
+            draft, data_assets, output_dir, title, Path(template_path)
+        )
+
+    async def _render_docx_baseline(
+        self, draft: dict, data_assets: dict, output_dir: Path, title: str
+    ) -> Path:
+        """Build the report with python-docx and save it as ``<title>.docx``.
+
+        Layout: title, optional executive summary, one level-1 heading per
+        chapter, then every table and every chart from *data_assets*. Charts
+        are written as a caption plus one data table per dataset; no chart
+        image is drawn.
+
+        Returns:
+            Path of the saved document inside *output_dir*.
+        """
+        from docx import Document
+        from docx.enum.text import WD_ALIGN_PARAGRAPH
+        from docx.oxml.ns import qn
+        from docx.shared import Pt, RGBColor
+
+        doc = Document()
+
+        # -- Styles --
+        style = doc.styles["Normal"]
+        style.font.name = "微软雅黑"
+        # ``font.name`` does not set the East Asian font slot, which is the
+        # one Word uses for CJK text, so set ``w:eastAsia`` explicitly.
+        style.element.rPr.rFonts.set(qn("w:eastAsia"), "微软雅黑")
+        style.font.size = Pt(11)
+
+        # Title
+        t = doc.add_heading(title, level=0)
+        t.alignment = WD_ALIGN_PARAGRAPH.CENTER
+
+        # Executive summary
+        if summary := draft.get("executive_summary"):
+            doc.add_heading("执行摘要", level=1)
+            p = doc.add_paragraph()
+            run = p.add_run(summary)
+            run.font.size = Pt(11)
+            run.font.color.rgb = RGBColor(0x33, 0x33, 0x33)
+
+        # Chapters
+        for chapter in draft.get("chapters") or []:
+            # A chapter without a title still gets its content rendered
+            # instead of aborting the whole document with a KeyError.
+            doc.add_heading(chapter.get("title") or "", level=1)
+            self._docx_render_markdown(doc, chapter.get("content") or "")
+
+        # Tables from data assets
+        for table_spec in data_assets.get("tables") or []:
+            doc.add_heading(table_spec.get("title", "数据表"), level=2)
+            self._docx_add_table(doc, table_spec)
+
+        # Charts: a caption placeholder followed by the chart data as tables
+        for chart_spec in data_assets.get("charts") or []:
+            doc.add_heading(chart_spec.get("title", "图表"), level=2)
+            desc = chart_spec.get("description", "")
+            chart_type = chart_spec.get("type") or ""  # tolerate a null type
+            doc.add_paragraph(f"[{chart_type.upper()} 图表] {desc}")
+            chart_data = chart_spec.get("data") or {}
+            if labels := chart_data.get("labels"):
+                for ds in chart_data.get("datasets") or []:
+                    # zip() stops at the shorter of labels / data points.
+                    self._docx_add_table(doc, {
+                        "headers": ["项目", ds.get("label", "数据")],
+                        "rows": [
+                            [str(label), str(value)]
+                            for label, value in zip(labels, ds.get("data") or [])
+                        ],
+                    })
+
+        path = output_dir / f"{title}.docx"
+        doc.save(str(path))
+        return path
+
+    async def _render_docx_from_template(
+        self, draft: dict, data_assets: dict, output_dir: Path, title: str,
+        template_path: Path,
+    ) -> Path:
+        """Edit an existing DOCX template using OOXML unpack/edit/pack workflow.
+
+        The template is unpacked into a temporary directory and packed again
+        as ``<title>.docx``. If either script exits non-zero the baseline
+        renderer is used instead.
+
+        Returns:
+            Path of the generated document inside *output_dir*.
+        """
+        unpack_script = DOCX_SKILLS / "ooxml" / "scripts" / "unpack.py"
+        pack_script = DOCX_SKILLS / "ooxml" / "scripts" / "pack.py"
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            work_dir = Path(tmpdir) / "unpacked"
+
+            # Unpack template
+            proc = await asyncio.create_subprocess_exec(
+                "python3", str(unpack_script), str(template_path), str(work_dir),
+                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+            )
+            # communicate() drains the pipes; wait() alone can deadlock once
+            # the child fills a pipe buffer that nobody reads.
+            _, stderr = await proc.communicate()
+
+            if proc.returncode != 0:
+                logger.warning("[formatter] OOXML unpack failed, falling back to baseline")
+                return await self._render_docx_baseline(draft, data_assets, output_dir, title)
+
+            # TODO: edit XML content in work_dir based on draft
+            # For now, just pack back as-is (template passthrough)
+            output_path = output_dir / f"{title}.docx"
+            proc = await asyncio.create_subprocess_exec(
+                "python3", str(pack_script), str(work_dir), str(output_path),
+                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+            )
+            _, stderr = await proc.communicate()
+
+            if proc.returncode != 0:
+                # Do not report a file that the pack step did not produce.
+                logger.warning(
+                    "[formatter] OOXML pack failed (%s), falling back to baseline",
+                    stderr.decode(errors="replace")[:200],
+                )
+                return await self._render_docx_baseline(draft, data_assets, output_dir, title)
+            return output_path
+
+    def _docx_render_markdown(self, doc, content: str):
+        """Convert markdown-ish content to docx paragraphs.
+
+        Blocks are separated by blank lines. Recognised blocks: ``##`` to
+        ``####`` headings, bullet lists starting with ``- `` or ``* ``, and
+        numbered lists starting with ``1. `` or ``1）``. Anything else becomes
+        a plain 11pt paragraph.
+
+        Args:
+            doc: python-docx document to append to.
+            content: Chapter text.
+        """
+        import re
+
+        from docx.shared import Pt
+
+        # Remove exactly one list marker per line. Stripping a character set
+        # with lstrip() would also eat text that merely begins with those
+        # characters (e.g. "**bold**" or "2024 年").
+        bullet_marker = re.compile(r"^\s*[-*]\s+")
+        number_marker = re.compile(r"^\s*\d+[.）)]\s*")
+
+        for block in content.split("\n\n"):
+            block = block.strip()
+            if not block:
+                continue
+            if block.startswith("#### "):
+                doc.add_heading(block[5:], level=4)
+            elif block.startswith("### "):
+                doc.add_heading(block[4:], level=3)
+            elif block.startswith("## "):
+                doc.add_heading(block[3:], level=2)
+            elif block.startswith(("- ", "* ")):
+                # Bullet list
+                for line in block.split("\n"):
+                    item = bullet_marker.sub("", line, count=1).strip()
+                    if item:
+                        doc.add_paragraph(item, style="List Bullet")
+            elif block.startswith(("1. ", "1）")):
+                # Numbered list
+                for line in block.split("\n"):
+                    item = number_marker.sub("", line, count=1).strip()
+                    if item:
+                        doc.add_paragraph(item, style="List Number")
+            else:
+                p = doc.add_paragraph(block)
+                for run in p.runs:
+                    run.font.size = Pt(11)
+
+    def _docx_add_table(self, doc, table_spec: dict):
+        """Add a formatted table to the document.
+
+        Args:
+            doc: python-docx document to append to.
+            table_spec: Dict with ``headers`` (list) and ``rows`` (list of
+                lists). Nothing is added when ``headers`` is empty.
+        """
+        from docx.shared import Pt
+
+        headers = table_spec.get("headers") or []
+        rows = table_spec.get("rows") or []
+        if not headers:
+            return
+
+        n_cols = len(headers)
+        tbl = doc.add_table(rows=1 + len(rows), cols=n_cols)
+        tbl.style = "Light Grid Accent 1"
+
+        # Header row
+        header_cells = tbl.rows[0].cells
+        for i, h in enumerate(headers):
+            cell = header_cells[i]
+            cell.text = str(h)
+            for p in cell.paragraphs:
+                for run in p.runs:
+                    run.font.bold = True
+                    run.font.size = Pt(10)
+
+        # Data rows. The table has exactly n_cols columns, so cells beyond
+        # the header width are dropped instead of raising IndexError.
+        for r_idx, row in enumerate(rows, start=1):
+            row_cells = tbl.rows[r_idx].cells
+            for c_idx, cell_val in enumerate(list(row)[:n_cols]):
+                row_cells[c_idx].text = str(cell_val)
+
+    # -----------------------------------------------------------------------
+    # PPTX — html2pptx.js (rich) or python-pptx (fallback)
+    # -----------------------------------------------------------------------
+
+    async def _render_pptx(
+        self, draft: dict, data_assets: dict, output_dir: Path, title: str
+    ) -> Path:
+        """Render slides with html2pptx when available, else with python-pptx.
+
+        Any failure of the html2pptx path is logged and followed by the
+        python-pptx baseline.
+        """
+        if not self.skills["html2pptx"]:
+            return await self._render_pptx_baseline(draft, data_assets, output_dir, title)
+        try:
+            return await self._render_pptx_html2pptx(draft, data_assets, output_dir, title)
+        except Exception as exc:
+            logger.warning(f"[formatter] html2pptx failed ({exc}), falling back to python-pptx")
+        return await self._render_pptx_baseline(draft, data_assets, output_dir, title)
+
+    async def _render_pptx_html2pptx(
+        self, draft: dict, data_assets: dict, output_dir: Path, title: str
+    ) -> Path:
+        """Generate PPTX using html2pptx.js skill for visual slides.
+
+        One HTML file per slide (title slide plus one per chapter) is written
+        to a temporary directory together with a small Node.js driver script,
+        which is then run with ``node``.
+
+        Returns:
+            Path of the generated ``<title>.pptx`` inside *output_dir*.
+
+        Raises:
+            RuntimeError: If node exits non-zero or no output file appears.
+            KeyError: If a chapter has no ``"title"``.
+        """
+        from html import escape
+
+        output_path = output_dir / f"{title}.pptx"
+        # ``or ""`` keeps a null summary from crashing the slice.
+        summary = (draft.get("executive_summary") or "")[:100]
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            work = Path(tmpdir)
+
+            # Generate HTML slides. All report text is HTML-escaped so that
+            # characters such as "<" or "&" cannot break the markup.
+            slides_html = []
+            # Title slide
+            slides_html.append(f"""<html><body style="width:720pt;height:405pt;display:flex;align-items:center;justify-content:center;flex-direction:column;background:linear-gradient(135deg,#1a1a2e,#16213e);color:white;font-family:sans-serif;">
+<h1 style="font-size:36pt;margin:0;">{escape(title)}</h1>
+<p style="font-size:18pt;color:#aaa;margin-top:20pt;">{escape(summary)}</p>
+</body></html>""")
+
+            # Chapter slides
+            for ch in draft.get("chapters") or []:
+                content_lines = (ch.get("content") or "")[:400].split("\n")
+                bullets = "".join(
+                    f"<li>{escape(line.strip())}</li>"
+                    for line in content_lines if line.strip()
+                )
+                slides_html.append(f"""<html><body style="width:720pt;height:405pt;padding:40pt;font-family:sans-serif;background:#ffffff;">
+<h2 style="font-size:28pt;color:#1a1a2e;border-bottom:2pt solid #e94560;padding-bottom:10pt;">{escape(ch['title'])}</h2>
+<ul style="font-size:14pt;color:#333;line-height:1.8;">{bullets}</ul>
+</body></html>""")
+
+            # Write HTML files
+            for i, markup in enumerate(slides_html):
+                (work / f"slide_{i}.html").write_text(markup, encoding="utf-8")
+
+            # Write conversion script
+            script = work / "convert.js"
+            html2pptx_path = PPTX_SKILLS / "scripts" / "html2pptx.js"
+            slide_files = [f"slide_{i}.html" for i in range(len(slides_html))]
+
+            # json.dumps yields valid JS string literals, so quotes and
+            # backslashes in paths or the title cannot break the script.
+            js_html2pptx = json.dumps(str(html2pptx_path))
+            js_work = json.dumps(str(work))
+            # node runs with cwd=work, so a relative output_dir must be made
+            # absolute or the file would land in the temporary directory.
+            js_output = json.dumps(str(output_path.resolve()))
+
+            script.write_text(f"""\
+const pptxgen = require('pptxgenjs');
+const {{ html2pptx }} = require({js_html2pptx});
+const path = require('path');
+
+async function main() {{
+  const pptx = new pptxgen();
+  pptx.layout = 'LAYOUT_16x9';
+  const files = {json.dumps(slide_files)};
+  for (const f of files) {{
+    await html2pptx(path.join({js_work}, f), pptx);
+  }}
+  await pptx.writeFile({{ fileName: {js_output} }});
+}}
+main().catch(e => {{ console.error(e); process.exit(1); }});
+""", encoding="utf-8")
+
+            proc = await asyncio.create_subprocess_exec(
+                "node", str(script),
+                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+                cwd=str(work),
+            )
+            stdout, stderr = await proc.communicate()
+            if proc.returncode != 0:
+                raise RuntimeError(f"html2pptx failed: {stderr.decode(errors='replace')}")
+
+        if not output_path.exists():
+            raise RuntimeError("html2pptx failed: no output file was written")
+        return output_path
+
+    async def _render_pptx_baseline(
+        self, draft: dict, data_assets: dict, output_dir: Path, title: str
+    ) -> Path:
+        """Build the deck with python-pptx and save it as ``<title>.pptx``.
+
+        Slides: a title slide, one bullet slide per chapter (at most 12
+        lines each) and one slide per table (header plus at most 9 rows).
+
+        Returns:
+            Path of the saved presentation inside *output_dir*.
+        """
+        import re
+
+        from pptx import Presentation
+        from pptx.util import Inches, Pt
+
+        # One leading markdown marker: heading hashes, a bullet, or an
+        # ordered-list number. Stripping a character set with lstrip() would
+        # also eat text that starts with digits or asterisks.
+        md_marker = re.compile(r"^(?:#{1,6}\s+|[-*]\s+|\d+[.）)]\s*)")
+
+        prs = Presentation()
+
+        # Title slide
+        slide = prs.slides.add_slide(prs.slide_layouts[0])
+        slide.shapes.title.text = title
+        if len(slide.placeholders) > 1:
+            slide.placeholders[1].text = (draft.get("executive_summary") or "")[:200]
+
+        # Chapter slides
+        for chapter in draft.get("chapters") or []:
+            slide = prs.slides.add_slide(prs.slide_layouts[1])
+            slide.shapes.title.text = chapter.get("title") or ""
+            body = slide.placeholders[1]
+            tf = body.text_frame
+            tf.clear()
+
+            content = chapter.get("content") or ""
+            lines = [raw.strip() for raw in content.split("\n") if raw.strip()]
+            for idx, line in enumerate(lines[:12]):  # max 12 bullets per slide
+                # clear() leaves one empty paragraph behind; reuse it for the
+                # first bullet so the slide does not start with a blank line.
+                p = tf.paragraphs[0] if idx == 0 else tf.add_paragraph()
+                # Strip markdown markers
+                p.text = md_marker.sub("", line, count=1).strip()
+                p.font.size = Pt(14)
+                p.space_after = Pt(4)
+
+        # Data table slides
+        for table_spec in data_assets.get("tables") or []:
+            # Layout 5 is "Title Only" in python-pptx's default template; the
+            # title placeholder is needed for the assignment on the next line.
+            slide = prs.slides.add_slide(prs.slide_layouts[5])
+            slide.shapes.title.text = table_spec.get("title", "数据表")
+
+            headers = table_spec.get("headers") or []
+            rows = table_spec.get("rows") or []
+            if headers and rows:
+                n_rows = min(len(rows) + 1, 10)  # limit rows per slide
+                n_cols = len(headers)
+                tbl = slide.shapes.add_table(
+                    n_rows, n_cols,
+                    Inches(0.5), Inches(1.5), Inches(9), Inches(4.5)
+                ).table
+
+                for i, h in enumerate(headers):
+                    tbl.cell(0, i).text = str(h)
+                for r_idx, row in enumerate(rows[:n_rows - 1]):
+                    for c_idx, val in enumerate(row[:n_cols]):
+                        tbl.cell(r_idx + 1, c_idx).text = str(val)
+
+        path = output_dir / f"{title}.pptx"
+        prs.save(str(path))
+        return path
+
+    # -----------------------------------------------------------------------
+    # XLSX — openpyxl + recalc.py (formula recalculation)
+    # -----------------------------------------------------------------------
+
+    async def _render_xlsx(
+        self, data_assets: dict, output_dir: Path, title: str
+    ) -> Path:
+        """Write tables and chart data to ``<title>.xlsx`` with openpyxl.
+
+        All tables are stacked on the first sheet ("数据总览"), each with a
+        styled header row. Tables with at least two data rows get a SUM row
+        under every column whose first data cell is numeric. Each chart gets
+        its own sheet with its labels and datasets as columns. When formulas
+        were written and the recalc skill is available, the workbook is
+        recalculated afterwards.
+
+        Returns:
+            Path of the saved workbook inside *output_dir*.
+        """
+        import re
+
+        from openpyxl import Workbook
+        from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
+        from openpyxl.utils import get_column_letter
+
+        # Characters that are not allowed in a worksheet title.
+        invalid_sheet_chars = re.compile(r"[\\/*?:\[\]]")
+
+        wb = Workbook()
+        ws = wb.active
+        ws.title = "数据总览"
+
+        # Professional styling
+        header_font = Font(bold=True, size=11, color="FFFFFF")
+        header_fill = PatternFill(start_color="1A1A2E", end_color="1A1A2E", fill_type="solid")
+        title_font = Font(bold=True, size=14, color="1A1A2E")
+        thin_border = Border(
+            left=Side(style="thin", color="CCCCCC"),
+            right=Side(style="thin", color="CCCCCC"),
+            top=Side(style="thin", color="CCCCCC"),
+            bottom=Side(style="thin", color="CCCCCC"),
+        )
+
+        current_row = 1
+        has_formulas = False
+
+        for table_spec in data_assets.get("tables") or []:
+            # Table title
+            ws.cell(row=current_row, column=1, value=table_spec.get("title", "")).font = title_font
+            current_row += 1
+
+            headers = table_spec.get("headers") or []
+            rows = table_spec.get("rows") or []
+
+            if headers:
+                # Header row with styling
+                for col_idx, h in enumerate(headers, 1):
+                    cell = ws.cell(row=current_row, column=col_idx, value=h)
+                    cell.font = header_font
+                    cell.fill = header_fill
+                    cell.alignment = Alignment(horizontal="center")
+                    cell.border = thin_border
+                current_row += 1
+
+                # Data rows
+                data_start = current_row
+                for row_data in rows:
+                    for col_idx, val in enumerate(row_data, 1):
+                        # Nested lists/dicts cannot be stored in a cell;
+                        # write their text form instead of failing.
+                        if val is not None and not isinstance(val, (str, int, float, bool)):
+                            val = str(val)
+                        cell = ws.cell(row=current_row, column=col_idx, value=val)
+                        cell.border = thin_border
+                        # Try to convert numeric strings
+                        if isinstance(val, str):
+                            try:
+                                cell.value = float(val.replace(",", ""))
+                            except ValueError:
+                                pass  # not a number: keep the original text
+                    current_row += 1
+
+                # Auto-sum row for numeric columns
+                data_end = current_row - 1
+                if data_end > data_start:
+                    for col_idx in range(1, len(headers) + 1):
+                        col_letter = get_column_letter(col_idx)
+                        # A column counts as numeric when its first data cell is.
+                        test_cell = ws.cell(row=data_start, column=col_idx)
+                        if isinstance(test_cell.value, (int, float)):
+                            cell = ws.cell(
+                                row=current_row, column=col_idx,
+                                value=f"=SUM({col_letter}{data_start}:{col_letter}{data_end})"
+                            )
+                            cell.font = Font(bold=True)
+                            cell.border = thin_border
+                            has_formulas = True
+                        elif col_idx == 1:
+                            cell = ws.cell(row=current_row, column=1, value="合计")
+                            cell.font = Font(bold=True)
+                            cell.border = thin_border
+                    current_row += 1
+
+                # Auto-fit column widths
+                for col_idx in range(1, len(headers) + 1):
+                    max_len = max(
+                        len(str(ws.cell(row=r, column=col_idx).value or ""))
+                        for r in range(current_row - len(rows) - 2, current_row)
+                    )
+                    ws.column_dimensions[get_column_letter(col_idx)].width = min(max_len + 4, 30)
+
+            current_row += 2  # gap between tables
+
+        # Chart data sheets
+        for chart_spec in data_assets.get("charts") or []:
+            # Replace characters that are invalid in sheet titles before
+            # truncating to the 31-character limit.
+            raw_title = str(chart_spec.get("title") or "图表")
+            sheet_title = invalid_sheet_chars.sub("_", raw_title)[:31]
+            chart_ws = wb.create_sheet(title=sheet_title)
+            chart_ws.cell(row=1, column=1, value=chart_spec.get("title", "")).font = title_font
+            chart_data = chart_spec.get("data") or {}
+            labels = chart_data.get("labels") or []
+            datasets = chart_data.get("datasets") or []
+
+            # Headers: [项目, 数据集1, 数据集2, ...]
+            chart_ws.cell(row=2, column=1, value="项目").font = Font(bold=True)
+            for ds_idx, ds in enumerate(datasets, 2):
+                chart_ws.cell(row=2, column=ds_idx, value=ds.get("label", "")).font = Font(bold=True)
+
+            for r_idx, label in enumerate(labels, 3):
+                chart_ws.cell(row=r_idx, column=1, value=label)
+                for ds_idx, ds in enumerate(datasets, 2):
+                    data = ds.get("data") or []
+                    # Datasets shorter than the label list leave cells empty.
+                    if r_idx - 3 < len(data):
+                        chart_ws.cell(row=r_idx, column=ds_idx, value=data[r_idx - 3])
+
+        path = output_dir / f"{title}.xlsx"
+        wb.save(str(path))
+
+        # Run recalc.py if we have formulas and the skill is available
+        if has_formulas and self.skills["recalc"]:
+            await self._xlsx_recalc(path)
+
+        return path
+
+    async def _xlsx_recalc(self, path: Path):
+        """Run the Skills ``recalc.py`` script on *path* (best effort).
+
+        The script's stdout is parsed as JSON and its ``status`` logged. A
+        non-zero exit or any exception is logged as a warning and not raised.
+        """
+        script_path = XLSX_SKILLS / "recalc.py"
+        logger.info(f"[formatter] running recalc.py on {path}")
+        try:
+            proc = await asyncio.create_subprocess_exec(
+                "python3", str(script_path), str(path), "30",
+                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+            )
+            out, err = await proc.communicate()
+            if proc.returncode != 0:
+                logger.warning(f"[formatter] recalc.py failed: {err.decode()[:200]}")
+            else:
+                report = json.loads(out.decode())
+                logger.info(f"[formatter] recalc result: {report.get('status')}")
+        except Exception as exc:
+            logger.warning(f"[formatter] recalc.py error: {exc}")
+
+    # -----------------------------------------------------------------------
+    # PDF — reportlab with CJK support + fpdf2 fallback
+    # -----------------------------------------------------------------------
+
+    async def _render_pdf(
+        self, draft: dict, data_assets: dict, output_dir: Path, title: str
+    ) -> Path:
+        """Render the PDF with reportlab, falling back to fpdf2 on any error."""
+        try:
+            pdf_path = await self._render_pdf_reportlab(draft, data_assets, output_dir, title)
+        except Exception as exc:
+            logger.warning(f"[formatter] reportlab failed ({exc}), falling back to fpdf2")
+            pdf_path = await self._render_pdf_fpdf(draft, data_assets, output_dir, title)
+        return pdf_path
+
+    async def _render_pdf_reportlab(
+        self, draft: dict, data_assets: dict, output_dir: Path, title: str
+    ) -> Path:
+        """Generate PDF with reportlab — better CJK support and table rendering."""
+        from reportlab.lib.pagesizes import A4
+        from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
+        from reportlab.lib.units import mm
+        from reportlab.lib import colors
+        from reportlab.platypus import (
+            SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak,
+        )
+        from reportlab.pdfbase import pdfmetrics
+        from reportlab.pdfbase.ttfonts import TTFont
+
+        # Try to register a CJK font
+        cjk_font = "Helvetica"
+        for font_path in [
+            "/System/Library/Fonts/STHeiti Medium.ttc",
+            "/System/Library/Fonts/PingFang.ttc",
+            "/usr/share/fonts/truetype/noto/NotoSansCJK-Regular.ttc",
+        ]:
+            if Path(font_path).exists():
+                try:
+                    pdfmetrics.registerFont(TTFont("CJK", font_path, subfontIndex=0))
+                    cjk_font = "CJK"
+                    break
+                except Exception:
+                    continue
+
+        path = output_dir / f"{title}.pdf"
+        doc = SimpleDocTemplate(str(path), pagesize=A4,
+                                topMargin=25*mm, bottomMargin=25*mm)
+
+        styles = getSampleStyleSheet()
+        styles.add(ParagraphStyle(
+            name="CJKTitle", fontName=cjk_font, fontSize=22,
+            spaceAfter=12, alignment=1,
+        ))
+        styles.add(ParagraphStyle(
+            name="CJKHeading", fontName=cjk_font, fontSize=16,
+            spaceAfter=8, spaceBefore=16, textColor=colors.HexColor("#1a1a2e"),
+        ))
+        styles.add(ParagraphStyle(
+            name="CJKBody", fontName=cjk_font, fontSize=11,
+            spaceAfter=6, leading=16,
+        ))
+
+        elements = []
+
+        # Title
+        elements.append(Paragraph(title, styles["CJKTitle"]))
+        elements.append(Spacer(1, 12))
+
+        # Executive summary
+        if summary := draft.get("executive_summary"):
+            elements.append(Paragraph("执行摘要", styles["CJKHeading"]))
+            elements.append(Paragraph(summary, styles["CJKBody"]))
+            elements.append(Spacer(1, 12))
+
+        # Chapters
+        for chapter in draft.get("chapters", []):
+            elements.append(PageBreak())
+            elements.append(Paragraph(chapter["title"], styles["CJKHeading"]))
+            content = chapter.get("content", "")
+            for para in content.split("\n\n"):
+                para = para.strip()
+                if para:
+                    elements.append(Paragraph(para, styles["CJKBody"]))
+
+        # Tables
+        for table_spec in data_assets.get("tables", []):
+            elements.append(Spacer(1, 12))
+            elements.append(Paragraph(table_spec.get("title", ""), styles["CJKHeading"]))
+            headers = table_spec.get("headers", [])
+            rows = table_spec.get("rows", [])
+            if headers:
+                table_data = [headers] + rows
+                t = Table(table_data)
+                t.setStyle(TableStyle([
+                    ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor("#1a1a2e")),
+                    ("TEXTCOLOR", (0, 0), (-1, 0), colors.white),
+                    ("FONTNAME", (0, 0), (-1, -1), cjk_font),
+                    ("FONTSIZE", (0, 0), (-1, 0), 10),
+                    ("FONTSIZE", (0, 1), (-1, -1), 9),
+                    ("GRID", (0, 0), (-1, -1), 0.5, colors.grey),
+                    ("ALIGN", (0, 0), (-1, -1), "CENTER"),
+                    ("ROWBACKGROUNDS", (0, 1), (-1, -1), [colors.white, colors.HexColor("#f5f5f5")]),
+                ]))
+                elements.append(t)
+
+        doc.build(elements)
+        return path
+
+    async def _render_pdf_fpdf(
+        self, draft: dict, data_assets: dict, output_dir: Path, title: str
+    ) -> Path:
+        """Fallback PDF generation with fpdf2."""
+        from fpdf import FPDF
+
+        pdf = FPDF()
+        pdf.set_auto_page_break(auto=True, margin=15)
+
+        # Try CJK font
+        for font_path in [
+            "/System/Library/Fonts/STHeiti Medium.ttc",
+            "/System/Library/Fonts/PingFang.ttc",
+        ]:
+            if Path(font_path).exists():
+                try:
+                    pdf.add_font("CJK", "", font_path, uni=True)
+                    pdf.set_font("CJK", "", 11)
+                    break
+                except Exception:
+                    pdf.set_font("Helvetica", "", 11)
+        else:
+            pdf.set_font("Helvetica", "", 11)
+
+        pdf.add_page()
+        pdf.set_font_size(24)
+        pdf.cell(0, 20, title, new_x="LMARGIN", new_y="NEXT", align="C")
+
+        pdf.set_font_size(11)
+        if summary := draft.get("executive_summary"):
+            pdf.set_font_size(16)
+            pdf.cell(0, 12, "执行摘要", new_x="LMARGIN", new_y="NEXT")
+            pdf.set_font_size(11)
+            pdf.multi_cell(0, 6, summary)
+
+        for chapter in draft.get("chapters", []):
+            pdf.add_page()
+            pdf.set_font_size(16)
+            pdf.cell(0, 12, chapter["title"], new_x="LMARGIN", new_y="NEXT")
+            pdf.set_font_size(11)
+            pdf.multi_cell(0, 6, chapter.get("content", ""))
+
+        path = output_dir / f"{title}.pdf"
+        pdf.output(str(path))
+        return path
--- a/app/agents/researcher.py
+++ b/app/agents/researcher.py
@@ -0,0 +1,103 @@
+"""Researcher Agent — domain-aware, bilingual research."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from .base import BaseAgent
+from app.config import settings
+
+# English system prompt for the researcher: sets the analyst persona, lists the
+# quality requirements, and spells out the JSON shape the model must return.
+SYSTEM_EN = """\
+You are a senior industry analyst at a top-tier consulting firm.
+Your task is to produce a thorough research brief based on the given instructions.
+
+Requirements:
+1. Be specific — cite concrete data points, market sizes, growth rates, company names
+2. Be structured — organize findings with clear headings and logical flow
+3. Be analytical — don't just list facts, provide insights and implications
+4. Flag data gaps — explicitly note where data is uncertain or unavailable
+
+Output (JSON):
+{
+  "title": "Research brief title",
+  "executive_summary": "2-3 sentence summary of key findings",
+  "sections": [
+    {
+      "heading": "Section heading",
+      "content": "Detailed findings (Markdown)",
+      "data_points": ["key data points extracted"],
+      "sources_quality": "high|medium|low — how confident are you in the data"
+    }
+  ],
+  "data_gaps": ["areas where data is insufficient or uncertain"],
+  "key_insights": ["top 3-5 non-obvious insights"]
+}"""
+
+# Chinese system prompt for the researcher: same persona, requirements and
+# JSON keys as SYSTEM_EN, written in Chinese.
+SYSTEM_ZH = """\
+你是一位顶级咨询公司的资深行业分析师。
+你的任务是根据给定的指令，输出一份深度研究简报。
+
+要求：
+1. 具体——引用具体的数据点、市场规模、增长率、企业名称
+2. 结构化——用清晰的标题和逻辑流组织发现
+3. 有分析深度——不要只罗列事实，要提供洞察和含义
+4. 标注数据缺口——明确指出数据不确定或不可获取的地方
+
+输出（JSON）：
+{
+  "title": "研究简报标题",
+  "executive_summary": "核心发现的2-3句总结",
+  "sections": [
+    {
+      "heading": "章节标题",
+      "content": "详细发现（Markdown格式）",
+      "data_points": ["提取的关键数据点"],
+      "sources_quality": "high|medium|low — 对数据的置信度"
+    }
+  ],
+  "data_gaps": ["数据不充分或不确定的领域"],
+  "key_insights": ["3-5条非显而易见的洞察"]
+}"""
+
+
+class ResearcherAgent(BaseAgent):
+    name = "researcher"
+    description = "域感知研究 — 根据领域选择最优模型和语言"
+
+    def __init__(self, model: str | None = None, language: str = "en"):
+        super().__init__(model=model)
+        self.language = language
+        self.system_prompt = SYSTEM_ZH if language == "zh" else SYSTEM_EN
+
+    async def run(self, context: dict[str, Any]) -> dict[str, Any]:
+        requirement = context["requirement"]
+        report_type = context.get("report_type", "")
+        extra_data = context.get("extra_data", "")
+
+        if self.language == "zh":
+            prompt = f"""\
+## 研究指令
+{requirement}
+
+## 研究方向
+{report_type}
+
+## 补充数据
+{extra_data if extra_data else "（无）"}
+
+请输出研究简报 JSON。"""
+        else:
+            prompt = f"""\
+## Research instructions
+{requirement}
+
+## Research focus
+{report_type}
+
+## Additional data
+{extra_data if extra_data else "(none)"}
+
+Output the research brief as JSON."""
+
+        result = await self.call_llm_json(prompt, max_tokens=6144)
+        return {"research": result}
--- a/app/agents/reviewer.py
+++ b/app/agents/reviewer.py
@@ -0,0 +1,79 @@
+"""Reviewer Agent — bilingual quality check with strongest reasoning model."""
+
+from __future__ import annotations
+
+import json
+from typing import Any
+
+from .base import BaseAgent
+from app.config import settings
+
+
+class ReviewerAgent(BaseAgent):
+    name = "reviewer"
+    description = "双语报告质量审查 — 使用最强推理模型"
+
+    system_prompt = """\
+You are a senior consulting partner reviewing a report before client delivery.
+The report has both Chinese and English versions (or will be translated).
+
+Review dimensions:
+1. **Accuracy** — Are data points, percentages, and claims supported by the research?
+   Cross-check global claims against English research, Chinese claims against Chinese research.
+2. **Logical consistency** — Does the narrative flow? Are there contradictions between chapters?
+3. **Depth of analysis** — Is it consultancy-grade or just surface-level? Would a C-suite exec find it valuable?
+4. **Bilingual quality** — If translated version exists, check for translation artifacts,
+   mistranslated terminology, or cultural mismatches.
+5. **Data gaps honesty** — Are uncertainties acknowledged or are claims fabricated?
+6. **Completeness** — Are any critical aspects of the requirement left unaddressed?
+
+Scoring guide:
+- 90+: Publication-ready
+- 80-89: Minor issues, can pass with notes
+- 70-79: Needs revision (verdict: revise)
+- <70: Significant problems (verdict: reject)
+
+Output (JSON):
+{
+  "overall_score": 85,
+  "verdict": "pass|revise|reject",
+  "issues": [
+    {
+      "severity": "high|medium|low",
+      "chapter": "affected chapter",
+      "dimension": "accuracy|consistency|depth|bilingual|gaps|completeness",
+      "description": "issue description",
+      "suggestion": "specific fix suggestion"
+    }
+  ],
+  "strengths": ["what the report does well"],
+  "summary": "Overall assessment (2-3 sentences)"
+}"""
+
+    def __init__(self):
+        super().__init__(model=settings.model_for_domain("reasoning"))
+
+    async def run(self, context: dict[str, Any]) -> dict[str, Any]:
+        draft = context["draft"]
+        draft_translated = context.get("draft_translated", {})
+        research = context["research"]
+
+        sections = [
+            "## Research Plan (what was asked)",
+            json.dumps(research, ensure_ascii=False, indent=2),
+            "",
+            "## Primary Draft",
+            json.dumps(draft, ensure_ascii=False, indent=2),
+        ]
+
+        if draft_translated:
+            sections.extend([
+                "",
+                "## Translated Version",
+                json.dumps(draft_translated, ensure_ascii=False, indent=2),
+            ])
+
+        prompt = "\n".join(sections) + "\n\nReview the report. Output JSON."
+
+        result = await self.call_llm_json(prompt, max_tokens=4096)
+        return {"review": result}
--- a/app/agents/writer.py
+++ b/app/agents/writer.py
@@ -0,0 +1,86 @@
+"""Writer Agent — synthesizes multilingual research tracks into a cohesive report."""
+
+from __future__ import annotations
+
+import json
+from typing import Any
+
+from .base import BaseAgent
+from app.config import settings
+
+
+class WriterAgent(BaseAgent):
+    name = "writer"
+    description = "汇聚多语言/多领域研究成果，撰写完整报告"
+
+    system_prompt = """\
+You are an expert consulting report writer. Your task is to synthesize research
+findings from MULTIPLE parallel tracks (some in English, some in Chinese) into
+ONE cohesive, professional consulting report.
+
+CRITICAL RULES:
+1. The PRIMARY output language is Chinese (中文) — this is for Chinese clients
+2. For global/international sections, the analysis depth must reflect the English research
+3. For China-specific sections, preserve the precision of Chinese-native research
+4. Maintain professional consulting tone throughout
+5. Every claim should trace back to a research track's findings
+6. Mark chart/table needs: {{CHART:描述}} and {{TABLE:描述}}
+7. If a research track flags "data_gaps", acknowledge uncertainty rather than fabricating
+
+Output (JSON):
+{
+  "title": "报告标题（中文）",
+  "title_en": "Report Title (English)",
+  "chapters": [
+    {
+      "title": "章节标题",
+      "content": "章节正文（Markdown 格式，中文）",
+      "source_tracks": ["引用的研究轨道名称"],
+      "charts": ["图表需求"],
+      "tables": ["表格需求"]
+    }
+  ],
+  "executive_summary": "执行摘要（中文，300-500字）",
+  "executive_summary_en": "Executive Summary (English, 200-400 words)"
+}"""
+
+    def __init__(self):
+        super().__init__(model=settings.model_for_domain("reasoning"))
+
+    async def run(self, context: dict[str, Any]) -> dict[str, Any]:
+        research = context["research"]
+        requirement = context["requirement"]
+        revision_feedback = context.get("revision_feedback", "")
+
+        # Format multi-track, multilingual research
+        tracks_text = ""
+        for track in research.get("tracks", []):
+            lang_tag = f"[{track.get('native_language', '?').upper()}]"
+            domain_tag = f"[{track.get('domain', '?')}]"
+            tracks_text += f"\n### {domain_tag} {lang_tag} {track.get('track', '')}\n"
+            findings = track.get("findings", {})
+            tracks_text += json.dumps(findings, ensure_ascii=False, indent=2)
+
+        synthesis_guide = research.get("synthesis_guide", "")
+
+        prompt = f"""\
+## 原始需求 / Original Requirement
+{requirement}
+
+## 报告标题
+中文：{research.get("title_zh", "")}
+English: {research.get("title_en", "")}
+
+## 写作指导 / Synthesis Guide
+{synthesis_guide}
+
+## 各研究轨道成果 / Research Track Results
+（注意：有些轨道是英文原版 [EN]，有些是中文原版 [ZH]，请综合使用）
+{tracks_text}
+
+{f"## 审稿反馈 / Review Feedback{revision_feedback}" if revision_feedback else ""}
+
+请汇聚以上研究成果，撰写完整的中文报告。输出 JSON。"""
+
+        result = await self.call_llm_json(prompt, max_tokens=8192)
+        return {"draft": result}