init repo

This commit is contained in:
2026-04-25 19:25:22 +08:00
commit c7533eada2
50 changed files with 3732 additions and 0 deletions

15
app/agents/__init__.py Normal file
View File

@@ -0,0 +1,15 @@
from .base import BaseAgent
from .researcher import ResearcherAgent
from .writer import WriterAgent
from .data_agent import DataAgent
from .reviewer import ReviewerAgent
from .formatter import FormatterAgent
# Public names of the agents package: the shared base class plus the five
# concrete agents imported above.
__all__ = [
"BaseAgent",
"ResearcherAgent",
"WriterAgent",
"DataAgent",
"ReviewerAgent",
"FormatterAgent",
]

166
app/agents/base.py Normal file
View File

@@ -0,0 +1,166 @@
"""Base agent with LLM calling via litellm."""
from __future__ import annotations
import json
import logging
from typing import Any
import litellm
from app.config import settings
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Disable litellm telemetry (set once, at import time, for the whole process).
litellm.telemetry = False
class BaseAgent:
    """Base class for all pipeline agents.

    Subclasses set the class attributes below and override :meth:`run`. LLM
    access goes through :meth:`call_llm` / :meth:`call_llm_json`, which wrap
    ``litellm.acompletion``.
    """

    name: str = "base"  # used as the prefix of this agent's log lines
    description: str = ""
    system_prompt: str = ""  # default system message when call_llm gets none
    model: str = ""  # empty = use default from config

    def __init__(self, model: str | None = None):
        """Store ``model`` on the instance when given; otherwise keep the class default."""
        if model:
            self.model = model

    def get_model(self) -> str:
        """Return this agent's model, falling back to ``settings.llm_model`` when unset."""
        return self.model or settings.llm_model

    async def call_llm(
        self,
        prompt: str,
        *,
        system: str | None = None,
        temperature: float = 0.3,
        max_tokens: int = 4096,
        response_format: dict | None = None,
    ) -> str:
        """Call LLM via litellm. Returns the text response.

        Args:
            prompt: Content of the user message.
            system: System message; falls back to ``self.system_prompt``.
                No system message is sent when both are empty.
            temperature: Forwarded to ``litellm.acompletion``.
            max_tokens: Forwarded to ``litellm.acompletion``.
            response_format: Forwarded to ``litellm.acompletion`` when truthy.

        Returns:
            The content of the first choice, or an empty string when the
            response carries no text content.
        """
        messages = []
        sys_prompt = system or self.system_prompt
        if sys_prompt:
            messages.append({"role": "system", "content": sys_prompt})
        messages.append({"role": "user", "content": prompt})

        model = self.get_model()
        kwargs: dict[str, Any] = {
            "model": model,
            "messages": messages,
            "temperature": temperature,
            "max_tokens": max_tokens,
        }
        # Credentials / endpoint / response format are only passed when configured.
        if settings.llm_api_key:
            kwargs["api_key"] = settings.llm_api_key
        if settings.llm_api_base:
            kwargs["api_base"] = settings.llm_api_base
        if response_format:
            kwargs["response_format"] = response_format

        logger.info("[%s] calling %s", self.name, model)
        response = await litellm.acompletion(**kwargs)
        # The message content may be None; normalise it so len() here and the
        # string handling in call_llm_json do not blow up with a TypeError.
        content = response.choices[0].message.content or ""
        logger.info("[%s] got %d chars", self.name, len(content))
        return content

    async def call_llm_json(self, prompt: str, **kwargs) -> dict:
        """Call LLM and parse response as JSON.

        The request is made with ``response_format={"type": "json_object"}``;
        remaining keyword arguments are forwarded to :meth:`call_llm`.

        Raises:
            json.JSONDecodeError: When no parsing strategy succeeds.
        """
        raw = await self.call_llm(
            prompt,
            response_format={"type": "json_object"},
            **kwargs,
        )
        return self._parse_json_lenient(raw)

    @staticmethod
    def _strip_code_fence(text: str) -> str:
        """Remove a surrounding markdown code fence (```lang ... ```) if present."""
        text = text.strip()
        if text.startswith("```"):
            first_nl = text.find("\n")
            if first_nl != -1:
                # Drop the whole opening fence line, including any language tag.
                text = text[first_nl + 1:]
            if text.endswith("```"):
                text = text[: text.rfind("```")]
            text = text.strip()
        return text

    @staticmethod
    def _escape_control_chars(s: str) -> str:
        """Escape raw control characters that appear inside JSON string values.

        Models sometimes emit literal newlines/tabs inside strings, which
        ``json.loads`` rejects. Characters outside strings are left untouched.
        """
        short_escapes = {"\n": "\\n", "\r": "\\r", "\t": "\\t"}
        result = []
        in_string = False
        escape = False
        for ch in s:
            if escape:
                # Character following a backslash: copy verbatim.
                result.append(ch)
                escape = False
            elif ch == "\\":
                result.append(ch)
                escape = True
            elif ch == '"':
                in_string = not in_string
                result.append(ch)
            elif in_string and ord(ch) < 32:
                result.append(short_escapes.get(ch, f"\\u{ord(ch):04x}"))
            else:
                result.append(ch)
        return "".join(result)

    @classmethod
    def _parse_json_lenient(cls, raw: str):
        """Parse model output as JSON, tolerating common formatting mistakes.

        Strategies, in order: the fence-stripped text as is; the same text with
        control characters escaped; the first JSON value starting at the first
        ``{`` (ignores commentary after the object); the optional
        ``json_repair`` package.
        """
        text = cls._strip_code_fence(raw or "")
        cleaned = cls._escape_control_chars(text)
        for candidate in (text, cleaned):
            try:
                return json.loads(candidate)
            except json.JSONDecodeError:
                continue

        start = cleaned.find("{")
        if start == -1:
            raise json.JSONDecodeError("No JSON object found", text, 0)
        try:
            # raw_decode stops at the end of the first complete value, so
            # trailing commentary is ignored and braces inside string values
            # cannot confuse the extraction (unlike manual brace counting).
            obj, _ = json.JSONDecoder().raw_decode(cleaned[start:])
            return obj
        except json.JSONDecodeError:
            pass

        failure = "Failed to parse JSON after multiple attempts"
        try:
            import json_repair
        except ImportError:
            raise json.JSONDecodeError(failure, text, 0) from None
        try:
            return json_repair.loads(text)
        except Exception as exc:
            raise json.JSONDecodeError(failure, text, 0) from exc

    async def run(self, context: dict[str, Any]) -> dict[str, Any]:
        """Execute this agent's task. Override in subclasses.

        Args:
            context: Shared pipeline context (accumulated by previous agents).

        Returns:
            Dict of new keys to merge into context.
        """
        raise NotImplementedError

78
app/agents/data_agent.py Normal file
View File

@@ -0,0 +1,78 @@
"""Data Agent — processes data, generates chart specs and table data."""
from __future__ import annotations
import json
from typing import Any
from .base import BaseAgent
from app.config import settings
class DataAgent(BaseAgent):
    """Agent that turns the draft's chart/table requests into concrete data specs."""

    name = "data"
    description = "处理数据、生成图表规格和表格数据"
    system_prompt = """\
你是一位数据分析专家。你的任务是根据报告草稿中标注的图表和表格需求,
生成具体的数据和图表规格。
输出要求JSON 格式):
{
"charts": [
{
"id": "chart_1",
"title": "图表标题",
"type": "bar|line|pie|area|scatter",
"description": "图表说明",
"data": {
"labels": ["标签1", "标签2"],
"datasets": [
{"label": "数据集名", "data": [100, 200]}
]
}
}
],
"tables": [
{
"id": "table_1",
"title": "表格标题",
"headers": ["列1", "列2", "列3"],
"rows": [["数据1", "数据2", "数据3"]]
}
]
}"""

    def __init__(self):
        """Use the model configured for the "fast" domain."""
        super().__init__(model=settings.model_for_domain("fast"))

    async def run(self, context: dict[str, Any]) -> dict[str, Any]:
        """Generate chart specs and table data for the needs listed in the draft.

        Args:
            context: Pipeline context. Reads ``draft`` (required) and the
                optional ``extra_data``.

        Returns:
            ``{"data_assets": ...}``. When the draft requests no charts or
            tables this is an empty ``charts``/``tables`` structure and no LLM
            call is made; otherwise it is the parsed LLM response.
        """
        draft = context["draft"]
        extra_data = context.get("extra_data", "")

        # Collect chart/table needs from draft. The draft is LLM-produced JSON,
        # so a key may be present but null; treat that the same as missing.
        chart_needs = []
        table_needs = []
        for ch in draft.get("chapters") or []:
            chart_needs.extend(ch.get("charts") or [])
            table_needs.extend(ch.get("tables") or [])
        if not chart_needs and not table_needs:
            return {"data_assets": {"charts": [], "tables": []}}

        title = draft.get("title", "")
        charts_json = json.dumps(chart_needs, ensure_ascii=False)
        tables_json = json.dumps(table_needs, ensure_ascii=False)
        data_source = extra_data if extra_data else "(无额外数据,请根据行业常识生成合理的示例数据)"
        prompt = f"""\
## 报告标题
{title}
## 需要生成的图表
{charts_json}
## 需要生成的表格
{tables_json}
## 补充数据源
{data_source}
请为以上需求生成具体的图表规格和表格数据。输出 JSON。"""
        result = await self.call_llm_json(prompt)
        return {"data_assets": result}

669
app/agents/formatter.py Normal file
View File

@@ -0,0 +1,669 @@
"""Formatter Agent — renders final report using Skills toolkit.
Skills integration:
- docx: python-docx (baseline) + docx-js via Node.js (rich mode) + OOXML template editing
- pptx: html2pptx.js via Node.js (visual slides) + python-pptx fallback
- xlsx: openpyxl + recalc.py (formula recalculation via LibreOffice)
- pdf: reportlab with CJK support + fpdf2 fallback
"""
from __future__ import annotations
import asyncio
import json
import logging
import shutil
import subprocess
import tempfile
from pathlib import Path
from typing import Any
from .base import BaseAgent
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Skills root
# NOTE(review): this is a fixed location under the current user's home
# directory. On a machine without that checkout every probe in
# _skills_available() reports False — confirm whether this should come from
# configuration instead.
SKILLS_ROOT = Path.home() / "Projects/code/20260119-skills合集/anthropics_skills/skills"
# Per-format toolkit directories beneath the skills root.
DOCX_SKILLS = SKILLS_ROOT / "docx"
PPTX_SKILLS = SKILLS_ROOT / "pptx"
XLSX_SKILLS = SKILLS_ROOT / "xlsx"
PDF_SKILLS = SKILLS_ROOT / "pdf"
def _skills_available() -> dict[str, bool]:
    """Probe the skills directories and report which toolkits are present.

    Returns a mapping from capability name to whether its marker file (or,
    for ``pdf_scripts``, its directory) exists on disk.
    """
    ooxml_unpack = Path("ooxml") / "scripts" / "unpack.py"
    marker_files = {
        "docx_js": DOCX_SKILLS / "docx-js.md",
        "html2pptx": PPTX_SKILLS / "scripts" / "html2pptx.js",
        "recalc": XLSX_SKILLS / "recalc.py",
        "ooxml_docx": DOCX_SKILLS / ooxml_unpack,
        "ooxml_pptx": PPTX_SKILLS / ooxml_unpack,
    }
    status = {skill: marker.exists() for skill, marker in marker_files.items()}
    # The PDF toolkit is detected by its scripts directory rather than a file.
    status["pdf_scripts"] = (PDF_SKILLS / "scripts").is_dir()
    return status
class FormatterAgent(BaseAgent):
name = "formatter"
description = "将报告渲染为 docx/pptx/xlsx/pdf融合 Skills 能力"
def __init__(self):
    """Initialise the base agent and record which skill toolkits are present."""
    super().__init__()
    self.skills = _skills_available()
    present = [skill for skill, found in self.skills.items() if found]
    logger.info(f"[formatter] available skills: {present}")
async def run(self, context: dict[str, Any]) -> dict[str, Any]:
    """Render the draft into every requested output format.

    Reads ``draft`` (required) plus the optional ``data_assets``,
    ``output_dir`` (default ``"output"``), ``output_formats`` (default
    ``["docx"]``) and ``template_path`` from ``context``. A failure in one
    format is logged and does not stop the remaining formats.

    Returns:
        ``{"generated_files": [...]}`` with the paths rendered successfully.
    """
    draft = context["draft"]
    assets = context.get("data_assets", {})
    out_dir = Path(context.get("output_dir", "output"))
    wanted = context.get("output_formats", ["docx"])
    template = context.get("template_path")  # optional: user-provided template
    out_dir.mkdir(parents=True, exist_ok=True)
    title = draft.get("title", "报告")

    produced = []
    for fmt in wanted:
        try:
            if fmt == "docx":
                rendered = await self._render_docx(draft, assets, out_dir, title, template)
            elif fmt == "pptx":
                rendered = await self._render_pptx(draft, assets, out_dir, title)
            elif fmt == "xlsx":
                rendered = await self._render_xlsx(assets, out_dir, title)
            elif fmt == "pdf":
                rendered = await self._render_pdf(draft, assets, out_dir, title)
            else:
                logger.warning(f"Unsupported format: {fmt}")
                continue
            produced.append(str(rendered))
            logger.info(f"[formatter] generated {rendered}")
        except Exception:
            # Best effort: keep going with the other formats.
            logger.exception(f"[formatter] failed to render {fmt}")
    return {"generated_files": produced}
# -----------------------------------------------------------------------
# DOCX — python-docx baseline + OOXML template editing
# -----------------------------------------------------------------------
async def _render_docx(
    self, draft: dict, data_assets: dict, output_dir: Path, title: str,
    template_path: str | None = None,
) -> Path:
    """Pick the DOCX renderer.

    The template workflow is used only when a template path is given and the
    ``ooxml_docx`` skill is available; everything else goes to the baseline
    python-docx renderer.
    """
    use_template = bool(template_path) and self.skills["ooxml_docx"]
    if not use_template:
        return await self._render_docx_baseline(draft, data_assets, output_dir, title)
    return await self._render_docx_from_template(
        draft, data_assets, output_dir, title, Path(template_path)
    )
async def _render_docx_baseline(
    self, draft: dict, data_assets: dict, output_dir: Path, title: str
) -> Path:
    """Build the report with python-docx and save it as ``<output_dir>/<title>.docx``.

    Layout: centred title, optional executive summary, one level-1 heading
    per chapter (body rendered by ``_docx_render_markdown``), then every table
    and chart from ``data_assets``. Charts are not drawn: each gets a text
    placeholder plus one two-column table per dataset.

    Args:
        draft: Report draft; reads ``executive_summary`` and ``chapters``.
        data_assets: Reads ``tables`` and ``charts``.
        output_dir: Directory the file is written to.
        title: Document title, also used as the file name stem.

    Returns:
        Path of the saved document.
    """
    from docx import Document
    from docx.enum.text import WD_ALIGN_PARAGRAPH
    from docx.oxml.ns import qn
    from docx.shared import Pt, RGBColor

    doc = Document()

    # -- Styles --
    style = doc.styles["Normal"]
    style.font.name = "微软雅黑"
    # font.name alone does not cover East Asian text; Word picks the font for
    # CJK characters from the w:eastAsia attribute, so set it explicitly.
    style.element.rPr.rFonts.set(qn("w:eastAsia"), "微软雅黑")
    style.font.size = Pt(11)

    # Title
    heading = doc.add_heading(title, level=0)
    heading.alignment = WD_ALIGN_PARAGRAPH.CENTER

    # Executive summary
    if summary := draft.get("executive_summary"):
        doc.add_heading("执行摘要", level=1)
        run = doc.add_paragraph().add_run(summary)
        run.font.size = Pt(11)
        run.font.color.rgb = RGBColor(0x33, 0x33, 0x33)

    # Chapters. A chapter without a title must not abort the whole document.
    for chapter in draft.get("chapters", []):
        doc.add_heading(chapter.get("title", ""), level=1)
        self._docx_render_markdown(doc, chapter.get("content", ""))

    # Tables from data assets
    for table_spec in data_assets.get("tables", []):
        doc.add_heading(table_spec.get("title", "数据表"), level=2)
        self._docx_add_table(doc, table_spec)

    # Chart descriptions as placeholders (no page break is inserted)
    for chart_spec in data_assets.get("charts", []):
        doc.add_heading(chart_spec.get("title", "图表"), level=2)
        desc = chart_spec.get("description", "")
        chart_type = str(chart_spec.get("type") or "")
        doc.add_paragraph(f"[{chart_type.upper()} 图表] {desc}")
        # Render chart data as a table too: one table per dataset.
        chart_data = chart_spec.get("data", {})
        if labels := chart_data.get("labels"):
            for dataset in chart_data.get("datasets", []):
                values = dataset.get("data", [])
                self._docx_add_table(doc, {
                    "headers": ["项目", dataset.get("label", "数据")],
                    "rows": [[str(label), str(value)] for label, value in zip(labels, values)],
                })

    path = output_dir / f"{title}.docx"
    doc.save(str(path))
    return path
async def _render_docx_from_template(
    self, draft: dict, data_assets: dict, output_dir: Path, title: str,
    template_path: Path,
) -> Path:
    """Edit an existing DOCX template using OOXML unpack/edit/pack workflow.

    Runs the skill's ``unpack.py`` and ``pack.py`` scripts via ``python3``.
    If either step exits non-zero the baseline renderer is used instead, so
    the caller always receives a path that was actually written.

    Returns:
        Path of the generated ``<title>.docx`` inside ``output_dir``.
    """
    unpack_script = DOCX_SKILLS / "ooxml" / "scripts" / "unpack.py"
    pack_script = DOCX_SKILLS / "ooxml" / "scripts" / "pack.py"
    with tempfile.TemporaryDirectory() as tmpdir:
        work_dir = Path(tmpdir) / "unpacked"

        # Unpack template. communicate() drains the pipes; wait() alone can
        # deadlock once the child fills the OS pipe buffer.
        proc = await asyncio.create_subprocess_exec(
            "python3", str(unpack_script), str(template_path), str(work_dir),
            stdout=subprocess.PIPE, stderr=subprocess.PIPE,
        )
        await proc.communicate()
        if proc.returncode != 0:
            logger.warning("[formatter] OOXML unpack failed, falling back to baseline")
            return await self._render_docx_baseline(draft, data_assets, output_dir, title)

        # TODO: edit XML content in work_dir based on draft
        # For now, just pack back as-is (template passthrough)
        output_path = output_dir / f"{title}.docx"
        proc = await asyncio.create_subprocess_exec(
            "python3", str(pack_script), str(work_dir), str(output_path),
            stdout=subprocess.PIPE, stderr=subprocess.PIPE,
        )
        _, stderr = await proc.communicate()
        if proc.returncode != 0:
            # Without this check a failed pack would return a path that does not exist.
            logger.warning(
                "[formatter] OOXML pack failed (%s), falling back to baseline",
                stderr.decode(errors="replace")[:200],
            )
            return await self._render_docx_baseline(draft, data_assets, output_dir, title)
        return output_path
def _docx_render_markdown(self, doc, content: str):
"""Convert markdown-ish content to docx paragraphs."""
from docx.shared import Pt
for block in content.split("\n\n"):
block = block.strip()
if not block:
continue
if block.startswith("#### "):
doc.add_heading(block[5:], level=4)
elif block.startswith("### "):
doc.add_heading(block[4:], level=3)
elif block.startswith("## "):
doc.add_heading(block[3:], level=2)
elif block.startswith("- ") or block.startswith("* "):
# Bullet list
for line in block.split("\n"):
line = line.lstrip("- *").strip()
if line:
doc.add_paragraph(line, style="List Bullet")
elif block.startswith("1. ") or block.startswith("1"):
# Numbered list
for line in block.split("\n"):
text = line.lstrip("0123456789.) ").strip()
if text:
doc.add_paragraph(text, style="List Number")
else:
p = doc.add_paragraph(block)
for run in p.runs:
run.font.size = Pt(11)
def _docx_add_table(self, doc, table_spec: dict):
    """Add a formatted table to the document.

    Does nothing when ``table_spec`` has no headers. The table has one bold
    header row plus one row per entry of ``rows``; cells beyond the number of
    headers are dropped because the table has no column for them.

    Args:
        doc: python-docx document to append to.
        table_spec: Mapping with ``headers`` (list) and ``rows`` (list of rows).
    """
    from docx.shared import Pt

    headers = table_spec.get("headers") or []
    rows = table_spec.get("rows") or []
    if not headers:
        return

    n_cols = len(headers)
    tbl = doc.add_table(rows=1 + len(rows), cols=n_cols)
    tbl.style = "Light Grid Accent 1"
    # Fetch the row objects once instead of re-resolving tbl.rows per cell.
    table_rows = list(tbl.rows)

    # Header row
    for cell, header in zip(table_rows[0].cells, headers):
        cell.text = str(header)
        for paragraph in cell.paragraphs:
            for run in paragraph.runs:
                run.font.bold = True
                run.font.size = Pt(10)

    # Data rows. zip() stops at the shorter side, so a row with more values
    # than headers no longer raises IndexError.
    for table_row, row in zip(table_rows[1:], rows):
        for cell, cell_val in zip(table_row.cells, row):
            cell.text = str(cell_val)
# -----------------------------------------------------------------------
# PPTX — html2pptx.js (rich) or python-pptx (fallback)
# -----------------------------------------------------------------------
async def _render_pptx(
    self, draft: dict, data_assets: dict, output_dir: Path, title: str
) -> Path:
    """Render PPTX via html2pptx when that skill is present, else python-pptx.

    Any exception from the html2pptx path is logged as a warning and the
    python-pptx baseline is used instead.
    """
    if not self.skills["html2pptx"]:
        return await self._render_pptx_baseline(draft, data_assets, output_dir, title)
    try:
        return await self._render_pptx_html2pptx(draft, data_assets, output_dir, title)
    except Exception as err:
        logger.warning(f"[formatter] html2pptx failed ({err}), falling back to python-pptx")
    return await self._render_pptx_baseline(draft, data_assets, output_dir, title)
async def _render_pptx_html2pptx(
    self, draft: dict, data_assets: dict, output_dir: Path, title: str
) -> Path:
    """Generate PPTX using html2pptx.js skill for visual slides.

    Writes one HTML file per slide (title slide plus one per chapter) into a
    temporary directory, generates a small Node.js driver script and runs it
    with ``node``. ``data_assets`` is not rendered by this path.

    Returns:
        ``output_dir / "<title>.pptx"``.

    Raises:
        RuntimeError: If node exits non-zero or the output file is missing.
    """
    import html as html_lib

    result_path = output_dir / f"{title}.pptx"
    # node runs with cwd set to the temp dir, so a relative output_dir would
    # be resolved against it and the file deleted with the directory.
    absolute_output = result_path.resolve()
    summary = (draft.get("executive_summary") or "")[:100]

    with tempfile.TemporaryDirectory() as tmpdir:
        work = Path(tmpdir)

        # Generate HTML slides. All report text is escaped: it is model output
        # and may contain <, > or & that would otherwise corrupt the markup.
        slides_html = []
        # Title slide
        slides_html.append(f"""<html><body style="width:720pt;height:405pt;display:flex;align-items:center;justify-content:center;flex-direction:column;background:linear-gradient(135deg,#1a1a2e,#16213e);color:white;font-family:sans-serif;">
<h1 style="font-size:36pt;margin:0;">{html_lib.escape(title)}</h1>
<p style="font-size:18pt;color:#aaa;margin-top:20pt;">{html_lib.escape(summary)}</p>
</body></html>""")
        # Chapter slides
        for ch in draft.get("chapters", []):
            content_lines = (ch.get("content") or "")[:400].split("\n")
            bullets = "".join(
                f"<li>{html_lib.escape(line.strip())}</li>"
                for line in content_lines
                if line.strip()
            )
            chapter_title = html_lib.escape(ch.get("title", ""))
            slides_html.append(f"""<html><body style="width:720pt;height:405pt;padding:40pt;font-family:sans-serif;background:#ffffff;">
<h2 style="font-size:28pt;color:#1a1a2e;border-bottom:2pt solid #e94560;padding-bottom:10pt;">{chapter_title}</h2>
<ul style="font-size:14pt;color:#333;line-height:1.8;">{bullets}</ul>
</body></html>""")

        # Write HTML files
        slide_files = []
        for i, slide_markup in enumerate(slides_html):
            file_name = f"slide_{i}.html"
            (work / file_name).write_text(slide_markup, encoding="utf-8")
            slide_files.append(file_name)

        # Write conversion script. Paths are embedded with json.dumps so that
        # quotes or backslashes in them still form valid JS string literals.
        script = work / "convert.js"
        html2pptx_path = PPTX_SKILLS / "scripts" / "html2pptx.js"
        script.write_text(f"""\
const pptxgen = require('pptxgenjs');
const {{ html2pptx }} = require({json.dumps(str(html2pptx_path))});
const path = require('path');
async function main() {{
const pptx = new pptxgen();
pptx.layout = 'LAYOUT_16x9';
const files = {json.dumps(slide_files)};
for (const f of files) {{
await html2pptx(path.join({json.dumps(str(work))}, f), pptx);
}}
await pptx.writeFile({{ fileName: {json.dumps(str(absolute_output))} }});
}}
main().catch(e => {{ console.error(e); process.exit(1); }});
""", encoding="utf-8")

        proc = await asyncio.create_subprocess_exec(
            "node", str(script),
            stdout=subprocess.PIPE, stderr=subprocess.PIPE,
            cwd=str(work),
        )
        _, stderr = await proc.communicate()
        if proc.returncode != 0:
            raise RuntimeError(f"html2pptx failed: {stderr.decode(errors='replace')}")

    if not absolute_output.exists():
        # Lets the caller fall back instead of reporting a file that is not there.
        raise RuntimeError(f"html2pptx failed: {absolute_output} was not created")
    return result_path
async def _render_pptx_baseline(
    self, draft: dict, data_assets: dict, output_dir: Path, title: str
) -> Path:
    """Render the report with python-pptx and save ``<output_dir>/<title>.pptx``.

    Produces a title slide, one bullet slide per chapter (at most 12 lines)
    and one table slide per entry of ``data_assets["tables"]`` (at most 9 data
    rows). Charts are not rendered.

    Returns:
        Path of the saved presentation.
    """
    import re

    from pptx import Presentation
    from pptx.util import Inches, Pt

    # Leading markdown marker: heading hashes, a bullet, or "N." / "N)".
    # Only the marker is removed; a line starting with a plain number (for
    # example a year) keeps its digits.
    marker = re.compile(r"^(?:#{1,6}\s*|[-*+]\s+|\d+[.)]\s+)")

    prs = Presentation()

    # Title slide
    slide = prs.slides.add_slide(prs.slide_layouts[0])
    slide.shapes.title.text = title
    if len(slide.placeholders) > 1:
        slide.placeholders[1].text = (draft.get("executive_summary") or "")[:200]

    # Chapter slides
    for chapter in draft.get("chapters", []):
        slide = prs.slides.add_slide(prs.slide_layouts[1])
        slide.shapes.title.text = chapter.get("title", "")
        tf = slide.placeholders[1].text_frame
        tf.clear()
        content = chapter.get("content") or ""
        lines = [line.strip() for line in content.split("\n") if line.strip()]
        for idx, line in enumerate(lines[:12]):  # max 12 bullets per slide
            # clear() leaves one empty paragraph behind; reuse it for the first
            # line so the slide does not start with a blank bullet.
            p = tf.paragraphs[0] if idx == 0 else tf.add_paragraph()
            p.text = marker.sub("", line, count=1).strip()
            p.font.size = Pt(14)
            p.space_after = Pt(4)

    # Data table slides
    for table_spec in data_assets.get("tables", []):
        # Layout 5 must provide a title placeholder, since the title is set
        # below (presumably "Title Only" in the default template — confirm).
        slide = prs.slides.add_slide(prs.slide_layouts[5])
        slide.shapes.title.text = table_spec.get("title", "数据表")
        headers = table_spec.get("headers", [])
        rows = table_spec.get("rows", [])
        if headers and rows:
            n_rows = min(len(rows) + 1, 10)  # limit rows per slide
            n_cols = len(headers)
            tbl = slide.shapes.add_table(
                n_rows, n_cols,
                Inches(0.5), Inches(1.5), Inches(9), Inches(4.5)
            ).table
            for i, header in enumerate(headers):
                tbl.cell(0, i).text = str(header)
            for r_idx, row in enumerate(rows[:n_rows - 1]):
                for c_idx, val in enumerate(row[:n_cols]):
                    tbl.cell(r_idx + 1, c_idx).text = str(val)

    path = output_dir / f"{title}.pptx"
    prs.save(str(path))
    return path
# -----------------------------------------------------------------------
# XLSX — openpyxl + recalc.py (formula recalculation)
# -----------------------------------------------------------------------
async def _render_xlsx(
    self, data_assets: dict, output_dir: Path, title: str
) -> Path:
    """Write tables and chart data to ``<output_dir>/<title>.xlsx`` with openpyxl.

    All tables go onto the first sheet, stacked vertically with a two-row gap.
    Tables with at least two data rows get a total row holding ``=SUM(...)``
    formulas for columns whose first data cell is numeric. Each chart gets its
    own sheet containing labels and dataset values. When formulas were written
    and the ``recalc`` skill is available, ``_xlsx_recalc`` is run on the file.

    Returns:
        Path of the saved workbook.
    """
    import re

    from openpyxl import Workbook
    from openpyxl.styles import Alignment, Border, Font, PatternFill, Side
    from openpyxl.utils import get_column_letter

    wb = Workbook()
    ws = wb.active
    ws.title = "数据总览"

    # Professional styling
    header_font = Font(bold=True, size=11, color="FFFFFF")
    header_fill = PatternFill(start_color="1A1A2E", end_color="1A1A2E", fill_type="solid")
    title_font = Font(bold=True, size=14, color="1A1A2E")
    bold_font = Font(bold=True)
    edge = Side(style="thin", color="CCCCCC")
    thin_border = Border(left=edge, right=edge, top=edge, bottom=edge)

    current_row = 1
    has_formulas = False
    # Widest requirement seen so far per column. All tables share one sheet, so
    # a later, narrower table must not shrink a column an earlier table needs.
    col_widths: dict[int, int] = {}

    for table_spec in data_assets.get("tables", []):
        # Table title
        ws.cell(row=current_row, column=1, value=table_spec.get("title", "")).font = title_font
        current_row += 1
        headers = table_spec.get("headers", [])
        rows = table_spec.get("rows", [])
        if headers:
            # Header row with styling
            header_row = current_row
            for col_idx, header in enumerate(headers, 1):
                cell = ws.cell(row=current_row, column=col_idx, value=header)
                cell.font = header_font
                cell.fill = header_fill
                cell.alignment = Alignment(horizontal="center")
                cell.border = thin_border
            current_row += 1

            # Data rows
            data_start = current_row
            for row_data in rows:
                for col_idx, val in enumerate(row_data, 1):
                    cell = ws.cell(row=current_row, column=col_idx, value=val)
                    cell.border = thin_border
                    # Try to convert numeric strings (thousands separators removed)
                    if isinstance(val, str):
                        try:
                            cell.value = float(val.replace(",", ""))
                        except (ValueError, AttributeError):
                            pass
                current_row += 1

            # Auto-sum row for numeric columns
            data_end = current_row - 1
            if data_end > data_start:
                for col_idx in range(1, len(headers) + 1):
                    col_letter = get_column_letter(col_idx)
                    first_value = ws.cell(row=data_start, column=col_idx).value
                    if isinstance(first_value, (int, float)):
                        cell = ws.cell(
                            row=current_row, column=col_idx,
                            value=f"=SUM({col_letter}{data_start}:{col_letter}{data_end})"
                        )
                        cell.font = bold_font
                        cell.border = thin_border
                        has_formulas = True
                    elif col_idx == 1:
                        cell = ws.cell(row=current_row, column=1, value="合计")
                        cell.font = bold_font
                        cell.border = thin_border
                current_row += 1

            # Auto-fit column widths over this table's header, data and total
            # rows (the title row is excluded; it is not part of the grid).
            for col_idx in range(1, len(headers) + 1):
                longest = max(
                    len(str(ws.cell(row=r, column=col_idx).value or ""))
                    for r in range(header_row, current_row)
                )
                col_widths[col_idx] = max(col_widths.get(col_idx, 0), min(longest + 4, 30))
                ws.column_dimensions[get_column_letter(col_idx)].width = col_widths[col_idx]
        current_row += 2  # gap between tables

    # Chart data sheets
    for chart_spec in data_assets.get("charts", []):
        # Worksheet titles may not contain \ / ? * [ ] : and are limited to 31
        # characters; chart titles are free text, so sanitise them first.
        raw_title = str(chart_spec.get("title") or "图表")
        sheet_title = re.sub(r"[\\/?*\[\]:]", "_", raw_title)[:31]
        chart_ws = wb.create_sheet(title=sheet_title)
        chart_ws.cell(row=1, column=1, value=chart_spec.get("title", "")).font = title_font
        chart_data = chart_spec.get("data", {})
        labels = chart_data.get("labels", [])
        datasets = chart_data.get("datasets", [])
        # Headers: [项目, 数据集1, 数据集2, ...]
        chart_ws.cell(row=2, column=1, value="项目").font = bold_font
        for ds_col, dataset in enumerate(datasets, 2):
            chart_ws.cell(row=2, column=ds_col, value=dataset.get("label", "")).font = bold_font
        for offset, label in enumerate(labels):
            sheet_row = offset + 3  # data starts below title and header rows
            chart_ws.cell(row=sheet_row, column=1, value=label)
            for ds_col, dataset in enumerate(datasets, 2):
                values = dataset.get("data", [])
                if offset < len(values):
                    chart_ws.cell(row=sheet_row, column=ds_col, value=values[offset])

    path = output_dir / f"{title}.xlsx"
    wb.save(str(path))
    # Run recalc.py if we have formulas and the skill is available
    if has_formulas and self.skills["recalc"]:
        await self._xlsx_recalc(path)
    return path
async def _xlsx_recalc(self, path: Path):
    """Recalculate formulas using Skills recalc.py (requires LibreOffice).

    Best effort: a non-zero exit code or any exception is logged as a warning
    and never propagated to the caller.
    """
    script_path = XLSX_SKILLS / "recalc.py"
    logger.info(f"[formatter] running recalc.py on {path}")
    try:
        child = await asyncio.create_subprocess_exec(
            "python3", str(script_path), str(path), "30",
            stdout=subprocess.PIPE, stderr=subprocess.PIPE,
        )
        out, err = await child.communicate()
        if child.returncode != 0:
            logger.warning(f"[formatter] recalc.py failed: {err.decode()[:200]}")
        else:
            # On success the script's stdout is parsed as a JSON report.
            report = json.loads(out.decode())
            logger.info(f"[formatter] recalc result: {report.get('status')}")
    except Exception as exc:
        logger.warning(f"[formatter] recalc.py error: {exc}")
# -----------------------------------------------------------------------
# PDF — reportlab with CJK support + fpdf2 fallback
# -----------------------------------------------------------------------
async def _render_pdf(
    self, draft: dict, data_assets: dict, output_dir: Path, title: str
) -> Path:
    """Render the PDF with reportlab, falling back to fpdf2 on any exception."""
    try:
        pdf_path = await self._render_pdf_reportlab(draft, data_assets, output_dir, title)
    except Exception as err:
        logger.warning(f"[formatter] reportlab failed ({err}), falling back to fpdf2")
        pdf_path = await self._render_pdf_fpdf(draft, data_assets, output_dir, title)
    return pdf_path
async def _render_pdf_reportlab(
    self, draft: dict, data_assets: dict, output_dir: Path, title: str
) -> Path:
    """Generate PDF with reportlab — better CJK support and table rendering.

    Writes the title, optional executive summary, one page-broken section per
    chapter and every table from ``data_assets`` to
    ``<output_dir>/<title>.pdf``. Charts are not rendered.

    Returns:
        Path of the generated PDF.
    """
    from xml.sax.saxutils import escape

    from reportlab.lib import colors
    from reportlab.lib.pagesizes import A4
    from reportlab.lib.styles import ParagraphStyle, getSampleStyleSheet
    from reportlab.lib.units import mm
    from reportlab.pdfbase import pdfmetrics
    from reportlab.pdfbase.ttfonts import TTFont
    from reportlab.platypus import (
        PageBreak, Paragraph, SimpleDocTemplate, Spacer, Table, TableStyle,
    )

    # Try to register a CJK font; the first candidate that loads wins and
    # Helvetica remains the fallback when none does.
    cjk_font = "Helvetica"
    for font_path in [
        "/System/Library/Fonts/STHeiti Medium.ttc",
        "/System/Library/Fonts/PingFang.ttc",
        "/usr/share/fonts/truetype/noto/NotoSansCJK-Regular.ttc",
    ]:
        if Path(font_path).exists():
            try:
                pdfmetrics.registerFont(TTFont("CJK", font_path, subfontIndex=0))
                cjk_font = "CJK"
                break
            except Exception:
                continue

    path = output_dir / f"{title}.pdf"
    doc = SimpleDocTemplate(str(path), pagesize=A4,
                            topMargin=25*mm, bottomMargin=25*mm)
    styles = getSampleStyleSheet()
    styles.add(ParagraphStyle(
        name="CJKTitle", fontName=cjk_font, fontSize=22,
        spaceAfter=12, alignment=1,
    ))
    styles.add(ParagraphStyle(
        name="CJKHeading", fontName=cjk_font, fontSize=16,
        spaceAfter=8, spaceBefore=16, textColor=colors.HexColor("#1a1a2e"),
    ))
    styles.add(ParagraphStyle(
        name="CJKBody", fontName=cjk_font, fontSize=11,
        spaceAfter=6, leading=16,
    ))

    def para(text, style_name: str):
        # Paragraph interprets its text as XML-like markup, so report text
        # (which may contain <, > or &) has to be escaped first.
        return Paragraph(escape(str(text)), styles[style_name])

    elements = []
    # Title
    elements.append(para(title, "CJKTitle"))
    elements.append(Spacer(1, 12))

    # Executive summary
    if summary := draft.get("executive_summary"):
        elements.append(para("执行摘要", "CJKHeading"))
        elements.append(para(summary, "CJKBody"))
        elements.append(Spacer(1, 12))

    # Chapters
    for chapter in draft.get("chapters", []):
        elements.append(PageBreak())
        elements.append(para(chapter.get("title", ""), "CJKHeading"))
        content = chapter.get("content") or ""
        for block in content.split("\n\n"):
            block = block.strip()
            if block:
                elements.append(para(block, "CJKBody"))

    # Tables
    for table_spec in data_assets.get("tables", []):
        elements.append(Spacer(1, 12))
        elements.append(para(table_spec.get("title", ""), "CJKHeading"))
        headers = table_spec.get("headers", [])
        rows = table_spec.get("rows", [])
        if headers:
            table = Table([headers] + rows)
            table.setStyle(TableStyle([
                ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor("#1a1a2e")),
                ("TEXTCOLOR", (0, 0), (-1, 0), colors.white),
                ("FONTNAME", (0, 0), (-1, -1), cjk_font),
                ("FONTSIZE", (0, 0), (-1, 0), 10),
                ("FONTSIZE", (0, 1), (-1, -1), 9),
                ("GRID", (0, 0), (-1, -1), 0.5, colors.grey),
                ("ALIGN", (0, 0), (-1, -1), "CENTER"),
                ("ROWBACKGROUNDS", (0, 1), (-1, -1), [colors.white, colors.HexColor("#f5f5f5")]),
            ]))
            elements.append(table)

    doc.build(elements)
    return path
async def _render_pdf_fpdf(
    self, draft: dict, data_assets: dict, output_dir: Path, title: str
) -> Path:
    """Fallback PDF generation with fpdf2.

    Writes the title, optional executive summary and one page per chapter to
    ``<output_dir>/<title>.pdf``. ``data_assets`` is accepted for signature
    parity with the other renderers but is not rendered here.

    Returns:
        Path of the generated PDF.
    """
    from fpdf import FPDF

    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)

    # Try CJK font: the first candidate that exists and loads is used,
    # otherwise Helvetica. The candidate list mirrors the reportlab renderer
    # so Linux hosts are covered too.
    for font_path in (
        "/System/Library/Fonts/STHeiti Medium.ttc",
        "/System/Library/Fonts/PingFang.ttc",
        "/usr/share/fonts/truetype/noto/NotoSansCJK-Regular.ttc",
    ):
        if not Path(font_path).exists():
            continue
        try:
            pdf.add_font("CJK", "", font_path, uni=True)
            pdf.set_font("CJK", "", 11)
        except Exception:
            continue  # unusable font file; try the next candidate
        break
    else:
        pdf.set_font("Helvetica", "", 11)

    pdf.add_page()
    pdf.set_font_size(24)
    pdf.cell(0, 20, title, new_x="LMARGIN", new_y="NEXT", align="C")
    pdf.set_font_size(11)

    if summary := draft.get("executive_summary"):
        pdf.set_font_size(16)
        pdf.cell(0, 12, "执行摘要", new_x="LMARGIN", new_y="NEXT")
        pdf.set_font_size(11)
        pdf.multi_cell(0, 6, summary)

    for chapter in draft.get("chapters", []):
        pdf.add_page()
        pdf.set_font_size(16)
        pdf.cell(0, 12, chapter.get("title", ""), new_x="LMARGIN", new_y="NEXT")
        pdf.set_font_size(11)
        pdf.multi_cell(0, 6, chapter.get("content", ""))

    path = output_dir / f"{title}.pdf"
    pdf.output(str(path))
    return path

103
app/agents/researcher.py Normal file
View File

@@ -0,0 +1,103 @@
"""Researcher Agent — domain-aware, bilingual research."""
from __future__ import annotations
from typing import Any
from .base import BaseAgent
from app.config import settings
# English system prompt. ResearcherAgent uses it for every language other than
# "zh"; it describes the analyst role and the JSON shape the model must return.
SYSTEM_EN = """\
You are a senior industry analyst at a top-tier consulting firm.
Your task is to produce a thorough research brief based on the given instructions.
Requirements:
1. Be specific — cite concrete data points, market sizes, growth rates, company names
2. Be structured — organize findings with clear headings and logical flow
3. Be analytical — don't just list facts, provide insights and implications
4. Flag data gaps — explicitly note where data is uncertain or unavailable
Output (JSON):
{
"title": "Research brief title",
"executive_summary": "2-3 sentence summary of key findings",
"sections": [
{
"heading": "Section heading",
"content": "Detailed findings (Markdown)",
"data_points": ["key data points extracted"],
"sources_quality": "high|medium|low — how confident are you in the data"
}
],
"data_gaps": ["areas where data is insufficient or uncertain"],
"key_insights": ["top 3-5 non-obvious insights"]
}"""
# Chinese system prompt, used by ResearcherAgent when language == "zh". It
# requests the same JSON keys as the English prompt.
SYSTEM_ZH = """\
你是一位顶级咨询公司的资深行业分析师。
你的任务是根据给定的指令,输出一份深度研究简报。
要求:
1. 具体——引用具体的数据点、市场规模、增长率、企业名称
2. 结构化——用清晰的标题和逻辑流组织发现
3. 有分析深度——不要只罗列事实,要提供洞察和含义
4. 标注数据缺口——明确指出数据不确定或不可获取的地方
输出JSON
{
"title": "研究简报标题",
"executive_summary": "核心发现的2-3句总结",
"sections": [
{
"heading": "章节标题",
"content": "详细发现Markdown格式",
"data_points": ["提取的关键数据点"],
"sources_quality": "high|medium|low — 对数据的置信度"
}
],
"data_gaps": ["数据不充分或不确定的领域"],
"key_insights": ["3-5条非显而易见的洞察"]
}"""
class ResearcherAgent(BaseAgent):
    """Research agent that works in English or Chinese.

    The language chosen at construction time selects both the system prompt
    and the wording of the user prompt built in :meth:`run`.
    """

    name = "researcher"
    description = "域感知研究 — 根据领域选择最优模型和语言"

    def __init__(self, model: str | None = None, language: str = "en"):
        """Store the language and pick the matching system prompt ("zh" or English)."""
        super().__init__(model=model)
        self.language = language
        if language == "zh":
            self.system_prompt = SYSTEM_ZH
        else:
            self.system_prompt = SYSTEM_EN

    async def run(self, context: dict[str, Any]) -> dict[str, Any]:
        """Ask the LLM for a research brief and return it under ``research``.

        Reads ``requirement`` (required) and the optional ``report_type`` and
        ``extra_data`` from ``context``.
        """
        requirement = context["requirement"]
        report_type = context.get("report_type", "")
        extra_data = context.get("extra_data", "")

        if self.language == "zh":
            lines = [
                "## 研究指令",
                f"{requirement}",
                "## 研究方向",
                f"{report_type}",
                "## 补充数据",
                f"{extra_data or '(无)'}",
                "请输出研究简报 JSON。",
            ]
        else:
            lines = [
                "## Research instructions",
                f"{requirement}",
                "## Research focus",
                f"{report_type}",
                "## Additional data",
                f"{extra_data or '(none)'}",
                "Output the research brief as JSON.",
            ]
        prompt = "\n".join(lines)
        brief = await self.call_llm_json(prompt, max_tokens=6144)
        return {"research": brief}

79
app/agents/reviewer.py Normal file
View File

@@ -0,0 +1,79 @@
"""Reviewer Agent — bilingual quality check with strongest reasoning model."""
from __future__ import annotations
import json
from typing import Any
from .base import BaseAgent
from app.config import settings
class ReviewerAgent(BaseAgent):
    """Agent that scores a draft against the research it was based on."""

    name = "reviewer"
    description = "双语报告质量审查 — 使用最强推理模型"
    system_prompt = """\
You are a senior consulting partner reviewing a report before client delivery.
The report has both Chinese and English versions (or will be translated).
Review dimensions:
1. **Accuracy** — Are data points, percentages, and claims supported by the research?
Cross-check global claims against English research, Chinese claims against Chinese research.
2. **Logical consistency** — Does the narrative flow? Are there contradictions between chapters?
3. **Depth of analysis** — Is it consultancy-grade or just surface-level? Would a C-suite exec find it valuable?
4. **Bilingual quality** — If translated version exists, check for translation artifacts,
mistranslated terminology, or cultural mismatches.
5. **Data gaps honesty** — Are uncertainties acknowledged or are claims fabricated?
6. **Completeness** — Are any critical aspects of the requirement left unaddressed?
Scoring guide:
- 90+: Publication-ready
- 80-89: Minor issues, can pass with notes
- 70-79: Needs revision (verdict: revise)
- <70: Significant problems (verdict: reject)
Output (JSON):
{
"overall_score": 85,
"verdict": "pass|revise|reject",
"issues": [
{
"severity": "high|medium|low",
"chapter": "affected chapter",
"dimension": "accuracy|consistency|depth|bilingual|gaps|completeness",
"description": "issue description",
"suggestion": "specific fix suggestion"
}
],
"strengths": ["what the report does well"],
"summary": "Overall assessment (2-3 sentences)"
}"""

    def __init__(self):
        """Use the model configured for the "reasoning" domain."""
        super().__init__(model=settings.model_for_domain("reasoning"))

    async def run(self, context: dict[str, Any]) -> dict[str, Any]:
        """Send research, draft and optional translation to the LLM for review.

        Reads ``draft`` and ``research`` (both required) and the optional
        ``draft_translated`` from ``context``; returns the parsed review under
        the ``review`` key.
        """
        draft = context["draft"]
        translated = context.get("draft_translated", {})
        research = context["research"]

        def as_json(obj) -> str:
            return json.dumps(obj, ensure_ascii=False, indent=2)

        parts = ["## Research Plan (what was asked)", as_json(research)]
        parts += ["", "## Primary Draft", as_json(draft)]
        if translated:
            # The translated version is only included when one exists.
            parts += ["", "## Translated Version", as_json(translated)]

        prompt = "\n".join(parts) + "\n\nReview the report. Output JSON."
        review = await self.call_llm_json(prompt, max_tokens=4096)
        return {"review": review}

86
app/agents/writer.py Normal file
View File

@@ -0,0 +1,86 @@
"""Writer Agent — synthesizes multilingual research tracks into a cohesive report."""
from __future__ import annotations
import json
from typing import Any
from .base import BaseAgent
from app.config import settings
class WriterAgent(BaseAgent):
    """Agent that merges the research tracks into one report draft."""

    name = "writer"
    description = "汇聚多语言/多领域研究成果,撰写完整报告"
    system_prompt = """\
You are an expert consulting report writer. Your task is to synthesize research
findings from MULTIPLE parallel tracks (some in English, some in Chinese) into
ONE cohesive, professional consulting report.
CRITICAL RULES:
1. The PRIMARY output language is Chinese (中文) — this is for Chinese clients
2. For global/international sections, the analysis depth must reflect the English research
3. For China-specific sections, preserve the precision of Chinese-native research
4. Maintain professional consulting tone throughout
5. Every claim should trace back to a research track's findings
6. Mark chart/table needs: {{CHART:描述}} and {{TABLE:描述}}
7. If a research track flags "data_gaps", acknowledge uncertainty rather than fabricating
Output (JSON):
{
"title": "报告标题(中文)",
"title_en": "Report Title (English)",
"chapters": [
{
"title": "章节标题",
"content": "章节正文Markdown 格式,中文)",
"source_tracks": ["引用的研究轨道名称"],
"charts": ["图表需求"],
"tables": ["表格需求"]
}
],
"executive_summary": "执行摘要中文300-500字",
"executive_summary_en": "Executive Summary (English, 200-400 words)"
}"""

    def __init__(self):
        """Use the model configured for the "reasoning" domain."""
        super().__init__(model=settings.model_for_domain("reasoning"))

    async def run(self, context: dict[str, Any]) -> dict[str, Any]:
        """Write the report draft from the research tracks.

        Reads ``research`` and ``requirement`` (both required) and the optional
        ``revision_feedback`` from ``context``; returns the parsed LLM output
        under the ``draft`` key.
        """
        research = context["research"]
        requirement = context["requirement"]
        revision_feedback = context.get("revision_feedback", "")

        # Format multi-track, multilingual research: one tagged heading per
        # track followed by its findings as JSON.
        track_parts = []
        for track in research.get("tracks", []):
            # A track may carry an explicit null language/domain; show "?" then.
            lang_tag = f"[{str(track.get('native_language') or '?').upper()}]"
            domain_tag = f"[{track.get('domain') or '?'}]"
            track_parts.append(f"\n### {domain_tag} {lang_tag} {track.get('track', '')}\n")
            track_parts.append(json.dumps(track.get("findings", {}), ensure_ascii=False, indent=2))
        tracks_text = "".join(track_parts)

        synthesis_guide = research.get("synthesis_guide", "")
        title_zh = research.get("title_zh", "")
        title_en = research.get("title_en", "")
        # The heading needs its own line; previously the feedback text was
        # glued directly onto the end of the heading.
        feedback_section = (
            f"## 审稿反馈 / Review Feedback\n{revision_feedback}" if revision_feedback else ""
        )

        prompt = f"""\
## 原始需求 / Original Requirement
{requirement}
## 报告标题
中文:{title_zh}
English: {title_en}
## 写作指导 / Synthesis Guide
{synthesis_guide}
## 各研究轨道成果 / Research Track Results
(注意:有些轨道是英文原版 [EN],有些是中文原版 [ZH],请综合使用)
{tracks_text}
{feedback_section}
请汇聚以上研究成果,撰写完整的中文报告。输出 JSON。"""
        result = await self.call_llm_json(prompt, max_tokens=8192)
        return {"draft": result}