init repo

2026-04-25 19:25:22 +08:00
commit c7533eada2
50 changed files with 3732 additions and 0 deletions
--- a/app/data/sources/__init__.py
+++ b/app/data/sources/__init__.py
--- a/app/data/sources/akshare_source.py
+++ b/app/data/sources/akshare_source.py
@@ -0,0 +1,94 @@
+"""AKShare data source — Chinese macro/industry data via open-source Python library.
+
+Covers: GDP, CPI, PMI, industrial profit, trade balance, and 30+ data categories.
+All data returned as Pandas DataFrames, converted to dicts for standardization.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+from .base import DataSource, DataResult
+
+logger = logging.getLogger(__name__)
+
+# Map common data requests to AKShare function names
+# Keys are short aliases that AKShareSource.fetch looks for as substrings of the
+# lowercased query; values are attribute names looked up on the akshare module.
+# NOTE(review): the function names are not validated here — confirm each one
+# exists in the installed akshare version.
+AKSHARE_ENDPOINTS = {
+    "gdp": "macro_china_gdp",
+    "cpi": "macro_china_cpi_monthly",
+    "ppi": "macro_china_ppi",
+    "pmi": "macro_china_pmi",
+    "industrial_profit": "macro_china_industrial_profit",
+    "trade_balance": "macro_china_trade_balance",
+    "money_supply": "macro_china_money_supply",
+    "fdi": "macro_china_fdi",
+    "real_estate": "macro_china_real_estate",
+    "retail_sales": "macro_china_consumer_goods_retail",
+    "fixed_asset": "macro_china_fai",
+    "unemployment": "macro_china_urban_unemployment",
+    # US macro
+    "us_gdp": "macro_usa_gdp_monthly",
+    "us_cpi": "macro_usa_cpi_monthly",
+    "us_unemployment": "macro_usa_unemployment_rate",
+    # Global
+    "global_gdp": "macro_global_gdp",
+}
+
+
+class AKShareSource(DataSource):
+    """Data source that serves tabular data through the AKShare library.
+
+    A query is mapped to an AKShare function name (either explicitly through
+    the ``endpoint`` keyword or by alias lookup in ``AKSHARE_ENDPOINTS``), the
+    function is called without arguments, and the tail of the resulting
+    DataFrame is returned as plain Python records.
+    """
+
+    name = "akshare"
+    description = "中国宏观经济/行业数据（免费开源，封装统计局等30+数据源）"
+
+    def supports(self, data_type: str, country: str | None = None) -> bool:
+        """Return True for the "macro", "industry" and "general" data types.
+
+        ``country`` is accepted for interface compatibility and is not consulted.
+        """
+        return data_type in ("macro", "industry", "general")
+
+    @staticmethod
+    def _match_endpoint(query: str) -> str | None:
+        """Return the AKShare function name whose alias occurs in ``query``, if any."""
+        query_lower = query.lower()
+        # Try the longest aliases first: "gdp" is a substring of "us_gdp" and
+        # "global_gdp", so a plain in-order scan would always pick the China
+        # endpoint and make the more specific ones unreachable.
+        # sorted() is stable, so equally long aliases keep their table order.
+        for key in sorted(AKSHARE_ENDPOINTS, key=len, reverse=True):
+            if key in query_lower:
+                return AKSHARE_ENDPOINTS[key]
+        return None
+
+    async def fetch(
+        self, query: str, *, data_type: str = "general", country: str | None = None, **kwargs,
+    ) -> DataResult:
+        """Fetch the most recent rows of the AKShare dataset matching ``query``.
+
+        Args:
+            query: Free text searched for a known alias (case-insensitive).
+            data_type: Unused by this source.
+            country: Unused by this source.
+            **kwargs: ``endpoint`` forces a specific AKShare function name and
+                skips alias matching; ``limit`` is the number of trailing rows
+                to return (default 20).
+
+        Returns:
+            A ``DataResult`` whose ``data`` holds ``columns``, ``records``,
+            ``total_rows`` and ``returned_rows``. Failures are never raised;
+            they are reported through ``DataResult.error``.
+        """
+        import asyncio  # local import: this module does not import asyncio at file level
+
+        try:
+            import akshare as ak
+        except ImportError:
+            return DataResult(source=self.name, error="akshare not installed (pip install akshare)")
+
+        # An explicit endpoint wins; otherwise fall back to alias matching.
+        endpoint_name = kwargs.get("endpoint") or self._match_endpoint(query)
+        if not endpoint_name:
+            return DataResult(source=self.name, data=None, error=f"No matching AKShare endpoint for: {query}")
+
+        try:
+            func = getattr(ak, endpoint_name, None)
+            # Reject missing names and non-callable module attributes alike.
+            if not callable(func):
+                return DataResult(source=self.name, error=f"AKShare function not found: {endpoint_name}")
+
+            logger.info("[akshare] calling ak.%s()", endpoint_name)
+            # The AKShare call is synchronous; run it in a worker thread so it
+            # does not stall the event loop while it waits.
+            df = await asyncio.to_thread(func)
+
+            # Keep only the last N rows (the most recent data) and convert
+            # them to plain dicts for serialization.
+            limit = kwargs.get("limit", 20)
+            recent = df.tail(limit)
+
+            return DataResult(
+                source=self.name,
+                data={
+                    "columns": list(recent.columns),
+                    "records": recent.to_dict(orient="records"),
+                    "total_rows": len(df),
+                    "returned_rows": len(recent),
+                },
+                metadata={
+                    "endpoint": endpoint_name,
+                    "description": f"AKShare {endpoint_name}",
+                    "format": "tabular",
+                },
+            )
+        except Exception as e:
+            # Best-effort source: report the failure to the caller instead of
+            # raising, but keep the traceback in the log.
+            logger.warning("[akshare] ak.%s() failed", endpoint_name, exc_info=True)
+            return DataResult(source=self.name, error=f"AKShare call failed: {e}")
--- a/app/data/sources/base.py
+++ b/app/data/sources/base.py
@@ -0,0 +1,34 @@
+"""Base class for data sources."""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from typing import Any
+
+from pydantic import BaseModel, Field
+
+
+class DataResult(BaseModel):
+    """Standardized result from any data source.
+
+    Every field has a default, so a source can build a result from only the
+    fields it needs (for example just ``source`` and ``error``).
+    """
+    # Name of the data source that produced this result.
+    source: str = ""
+    # Payload; its shape is source-specific and not validated (typed Any).
+    data: Any = None
+    # Free-form descriptive information about the payload.
+    metadata: dict[str, Any] = Field(default_factory=dict)
+    # metadata includes: unit, time_range, update_date, confidence, etc.
+    # Human-readable failure description; None when no error was reported.
+    error: str | None = None
+    # NOTE(review): presumably set by a caching layer outside this file — confirm.
+    cached: bool = False
+
+
+class DataSource(ABC):
+    """Interface implemented by every data source.
+
+    Subclasses must provide ``fetch``; they may override ``supports`` to
+    restrict which requests they accept.
+    """
+
+    name: str = "base"
+    description: str = ""
+
+    def supports(self, data_type: str, country: str | None = None) -> bool:
+        """Report whether this source handles the given data type / country.
+
+        The base implementation accepts every request.
+        """
+        return True
+
+    @abstractmethod
+    async def fetch(
+        self,
+        query: str,
+        *,
+        data_type: str = "general",
+        country: str | None = None,
+        **kwargs,
+    ) -> DataResult:
+        """Retrieve data for ``query``; concrete sources must implement this."""
--- a/app/data/sources/gpt_researcher_source.py
+++ b/app/data/sources/gpt_researcher_source.py
@@ -0,0 +1,61 @@
+"""GPT Researcher MCP — deep web research as fallback for any industry.
+
+This is the universal fallback: when structured data sources don't have
+data for a niche/cold industry, deep web research fills the gap.
+
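+Requires GPT Researcher MCP server to be running (already configured in ~/.claude.json).
+For direct API use, the intended approach is to call the MCP tools via a subprocess.
+That is not implemented yet: fetch() currently returns a placeholder research
+request, and the actual MCP call happens at the agent level.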
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+import subprocess
+from typing import Any
+
+from .base import DataSource, DataResult
+
+logger = logging.getLogger(__name__)
+
+
+class GPTResearcherSource(DataSource):
+    """Catch-all source that hands a query over to GPT Researcher.
+
+    No research is executed here: ``fetch`` only packages the query into a
+    request descriptor that an agent can act on.
+    """
+
+    name = "gpt_researcher"
+    description = "Deep web research — universal fallback for any industry/topic"
+
+    def supports(self, data_type: str, country: str | None = None) -> bool:
+        """Accept every data type and country, since this is the fallback source."""
+        return True
+
+    async def fetch(
+        self, query: str, *, data_type: str = "general", country: str | None = None, **kwargs,
+    ) -> DataResult:
+        """Build a research request descriptor for ``query``.
+
+        Args:
+            query: The research question, passed through unchanged.
+            data_type: Echoed into the result metadata.
+            country: Echoed into the result metadata.
+            **kwargs: ``mode`` selects "quick" (default) or "deep" research.
+
+        Returns:
+            A ``DataResult`` with status "ready"; it carries no research
+            output, only the request and a usage note.
+        """
+        research_mode = kwargs.get("mode", "quick")
+
+        # Placeholder for now: the MCP call itself is made at the agent level
+        # once this source is wired into the pipeline.
+        usage_note = (
+            "GPT Researcher MCP is available for deep web research. "
+            "Call via MCP tools: deep_research() or quick_search(). "
+            "This source returns research-ready queries for MCP integration."
+        )
+        payload = {
+            "query": query,
+            "mode": research_mode,
+            "status": "ready",
+            "note": usage_note,
+        }
+        request_info = {
+            "type": "mcp_research_request",
+            "mode": research_mode,
+            "data_type": data_type,
+            "country": country,
+        }
+        return DataResult(source=self.name, data=payload, metadata=request_info)
--- a/app/data/sources/worldbank_source.py
+++ b/app/data/sources/worldbank_source.py
@@ -0,0 +1,104 @@
+"""World Bank Open Data — global macro indicators, 217 economies, free API.
+
+API: https://api.worldbank.org/v2/
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+import httpx
+
+from .base import DataSource, DataResult
+
+logger = logging.getLogger(__name__)
+
+# Root of the World Bank v2 REST API; request URLs are built by appending
+# /country/<code>/indicator/<code> to it.
+BASE_URL = "https://api.worldbank.org/v2"
+
+# Common indicators for consulting reports
+# Keys are short aliases that WorldBankSource.fetch looks for as substrings of
+# the lowercased query; values are World Bank indicator codes.
+INDICATORS = {
+    "gdp": "NY.GDP.MKTP.CD",           # GDP (current US$)
+    "gdp_growth": "NY.GDP.MKTP.KD.ZG",  # GDP growth (annual %)
+    "gdp_per_capita": "NY.GDP.PCAP.CD",  # GDP per capita
+    "population": "SP.POP.TOTL",          # Total population
+    "inflation": "FP.CPI.TOTL.ZG",       # Inflation (CPI %)
+    "trade_pct_gdp": "NE.TRD.GNFS.ZS",  # Trade (% of GDP)
+    "fdi_net": "BX.KLT.DINV.CD.WD",     # FDI net inflows
+    "unemployment": "SL.UEM.TOTL.ZS",    # Unemployment (%)
+    "exports": "NE.EXP.GNFS.CD",         # Exports
+    "imports": "NE.IMP.GNFS.CD",         # Imports
+    "r_and_d": "GB.XPD.RSDV.GD.ZS",     # R&D expenditure (% GDP)
+    "high_tech_exports": "TX.VAL.TECH.MF.ZS",  # High-tech exports (% manufactured)
+}
+
+
+class WorldBankSource(DataSource):
+    """Data source that reads indicator time series from the World Bank API.
+
+    A query is mapped to an indicator code (either explicitly through the
+    ``indicator`` keyword or by alias lookup in ``INDICATORS``) and fetched
+    for one country code, defaulting to the world aggregate.
+    """
+
+    name = "worldbank"
+    description = "World Bank Open Data — 1600+ indicators, 217 economies, free"
+
+    def supports(self, data_type: str, country: str | None = None) -> bool:
+        """Return True for the "macro" and "general" data types.
+
+        ``country`` is accepted for interface compatibility and is not consulted.
+        """
+        return data_type in ("macro", "general")
+
+    @staticmethod
+    def _match_indicator(query: str) -> str | None:
+        """Return the indicator code whose alias occurs in ``query``, if any."""
+        query_lower = query.lower()
+        # Try the longest aliases first: "gdp" is a substring of "gdp_growth",
+        # "gdp_per_capita" and "trade_pct_gdp", so a plain in-order scan would
+        # always resolve those queries to plain GDP.
+        # sorted() is stable, so equally long aliases keep their table order.
+        for key in sorted(INDICATORS, key=len, reverse=True):
+            if key in query_lower:
+                return INDICATORS[key]
+        return None
+
+    async def fetch(
+        self, query: str, *, data_type: str = "general", country: str | None = None, **kwargs,
+    ) -> DataResult:
+        """Fetch a World Bank indicator time series matching ``query``.
+
+        Args:
+            query: Free text searched for a known alias (case-insensitive).
+            data_type: Unused by this source.
+            country: Country code placed in the request URL; "WLD" (World)
+                when omitted.
+            **kwargs: ``indicator`` forces a specific indicator code and skips
+                alias matching; ``per_page`` is the page size requested from
+                the API (default 20).
+
+        Returns:
+            A ``DataResult`` whose ``data`` holds ``indicator``, ``country``
+            and ``records`` (entries with a null value are dropped). Failures
+            are never raised; they are reported through ``DataResult.error``.
+        """
+        indicator_code = kwargs.get("indicator") or self._match_indicator(query)
+        if not indicator_code:
+            # Default to GDP, but leave a trace: the caller asked for
+            # something this source could not identify.
+            logger.warning("[worldbank] no indicator matched %r; defaulting to GDP", query)
+            indicator_code = INDICATORS["gdp"]
+
+        country_code = country or "WLD"  # WLD = World
+        per_page = kwargs.get("per_page", 20)
+
+        url = f"{BASE_URL}/country/{country_code}/indicator/{indicator_code}"
+        params = {
+            "format": "json",
+            "per_page": per_page,
+        }
+
+        try:
+            async with httpx.AsyncClient(timeout=15) as client:
+                resp = await client.get(url, params=params)
+                resp.raise_for_status()
+                data = resp.json()
+
+            # A usable response is a two-element list: [paging info, records].
+            if not isinstance(data, list) or len(data) < 2:
+                # NOTE(review): the API appears to report problems as
+                # [{"message": [{"value": ...}]}] — confirm against the API
+                # docs. Any such detail is appended to the error text.
+                detail = ""
+                if isinstance(data, list) and data and isinstance(data[0], dict):
+                    messages = data[0].get("message") or []
+                    if isinstance(messages, list):
+                        detail = "; ".join(
+                            str(m.get("value", m)) for m in messages if isinstance(m, dict)
+                        )
+                error = f"No data returned: {detail}" if detail else "No data returned"
+                return DataResult(source=self.name, data=None, error=error)
+
+            metadata_raw = data[0] if isinstance(data[0], dict) else {}
+            # The records element can be null when nothing matches; treat it
+            # as an empty series rather than failing on iteration.
+            records = data[1] or []
+
+            # Parse into clean format, skipping entries without a value.
+            clean_records = [
+                {
+                    "year": r["date"],
+                    "value": r["value"],
+                    "country": r["country"]["value"],
+                    "indicator": r["indicator"]["value"],
+                }
+                for r in records
+                if r.get("value") is not None
+            ]
+
+            return DataResult(
+                source=self.name,
+                data={
+                    "indicator": indicator_code,
+                    "country": country_code,
+                    "records": clean_records,
+                },
+                metadata={
+                    "total": metadata_raw.get("total", 0),
+                    "indicator_name": clean_records[0]["indicator"] if clean_records else "",
+                    "format": "timeseries",
+                },
+            )
+        except Exception as e:
+            # Best-effort source: report the failure to the caller instead of
+            # raising, but keep the traceback in the log.
+            logger.warning("[worldbank] request for %s failed", indicator_code, exc_info=True)
+            return DataResult(source=self.name, error=f"World Bank API failed: {e}")