From bab2d40577cab225b2793ff5ca384cd4161dd414 Mon Sep 17 00:00:00 2001 From: kang Date: Sat, 25 Apr 2026 19:21:03 +0800 Subject: [PATCH] init repo --- .gitignore | 24 + .memory/worklog.json | 3 + .project.json | 12 + AGENTS.md | 21 + AI与软件/AI软件搜索规则.md | 190 ++++++++ ...241001_AI软件_市场与竞争_英伟达最近新闻及影响.docx | 39 ++ ...709_AI软件_市场竞争_英伟达成首家4万亿美元公司.docx | 95 ++++ ..._AI软件_社会洞察_黄仁勋访华与中美科技外交新动向.docx | 147 ++++++ ...15_AI软件_技术创新_英伟达GPU安全漏洞GPUhammer.docx | 98 ++++ ...件_监管政策_英伟达获美国批准恢复对华芯片销售.docx | 99 ++++ CLAUDE.md | 21 + RULES.md | 21 + 代码实现/README.md | 274 +++++++++++ 代码实现/config.py | 216 ++++++++ 代码实现/database.py | 353 ++++++++++++++ 代码实现/database_schema.sql | 101 ++++ 代码实现/document_exporter.py | 370 ++++++++++++++ 代码实现/main.py | 367 ++++++++++++++ 代码实现/requirements.txt | 41 ++ 代码实现/rss_monitor.py | 324 ++++++++++++ 代码实现/search_engine.py | 461 ++++++++++++++++++ 使用说明.md | 172 +++++++ 制造业/制造业搜索规则.md | 172 +++++++ 医疗制药/医疗制药搜索规则.md | 186 +++++++ 快消品/快消品搜索规则.md | 165 +++++++ 房地产建筑/房地产建筑搜索规则.md | 167 +++++++ 技术实施方案_简单实用版.md | 211 ++++++++ 搜索总规则.md | 65 +++ 新闻/20250715_GoogleIO2025大会全览.docx | 233 +++++++++ 知识模块/20250128_跨行业信息收集框架整合.docx | 132 +++++ 能源化工/能源化工搜索规则.md | 171 +++++++ 金融行业/金融搜索规则.md | 175 +++++++ 零售电商/零售电商搜索规则.md | 165 +++++++ 33 files changed, 5291 insertions(+) create mode 100644 .gitignore create mode 100644 .memory/worklog.json create mode 100644 .project.json create mode 100644 AGENTS.md create mode 100644 AI与软件/AI软件搜索规则.md create mode 100644 AI与软件/市场与竞争/20241001_AI软件_市场与竞争_英伟达最近新闻及影响.docx create mode 100644 AI与软件/市场与竞争/20250709_AI软件_市场竞争_英伟达成首家4万亿美元公司.docx create mode 100644 AI与软件/情感与社会洞察/20250715_AI软件_社会洞察_黄仁勋访华与中美科技外交新动向.docx create mode 100644 AI与软件/技术与创新/20250715_AI软件_技术创新_英伟达GPU安全漏洞GPUhammer.docx create mode 100644 AI与软件/监管与政策/20250714_AI软件_监管政策_英伟达获美国批准恢复对华芯片销售.docx create mode 100644 CLAUDE.md create mode 100644 RULES.md create mode 100644 代码实现/README.md create mode 100644 代码实现/config.py create mode 100644 代码实现/database.py create mode 100644 
代码实现/database_schema.sql create mode 100644 代码实现/document_exporter.py create mode 100644 代码实现/main.py create mode 100644 代码实现/requirements.txt create mode 100644 代码实现/rss_monitor.py create mode 100644 代码实现/search_engine.py create mode 100644 使用说明.md create mode 100644 制造业/制造业搜索规则.md create mode 100644 医疗制药/医疗制药搜索规则.md create mode 100644 快消品/快消品搜索规则.md create mode 100644 房地产建筑/房地产建筑搜索规则.md create mode 100644 技术实施方案_简单实用版.md create mode 100644 搜索总规则.md create mode 100644 新闻/20250715_GoogleIO2025大会全览.docx create mode 100644 知识模块/20250128_跨行业信息收集框架整合.docx create mode 100644 能源化工/能源化工搜索规则.md create mode 100644 金融行业/金融搜索规则.md create mode 100644 零售电商/零售电商搜索规则.md diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..1bdd6ff --- /dev/null +++ b/.gitignore @@ -0,0 +1,24 @@ +# OS +.DS_Store + +# Env +.env +.env.* + +# Python +__pycache__/ +.pytest_cache/ +.mypy_cache/ +.venv/ +venv/ + +# Node +node_modules/ +.next/ +dist/ +build/ +.nuxt/ +.output/ + +# Misc +*.log diff --git a/.memory/worklog.json b/.memory/worklog.json new file mode 100644 index 0000000..046955d --- /dev/null +++ b/.memory/worklog.json @@ -0,0 +1,3 @@ +{ + "entries": [] +} diff --git a/.project.json b/.project.json new file mode 100644 index 0000000..c2ab351 --- /dev/null +++ b/.project.json @@ -0,0 +1,12 @@ +{ + "name": "搜索", + "description": "搜索产品与技术调研笔记", + "status": "archived", + "kind": "research", + "created": "2025-07-15", + "urls": [], + "worklog": { + "path": ".memory/worklog.json", + "auto": true + } +} diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..4069626 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,21 @@ +# 搜索 Agent Rules + +## Must Read First + +- `.project.json` 是机器真源:公网链接、快捷登录、凭证引用都以它为准 +- `RULES.md` 是人工规则和部署事实:启动命令、平台、域名、注意事项都写这里 +- 不允许编造不存在的域名、账号、密码;未知就保持空白并明确标记待补充 + +## Deployment Metadata Contract + +- 任何任务只要新增、删除或修改公网地址,必须在同一次任务里更新 `.project.json` +- `urls[]` 推荐显式写 `type`:`app`、`backend`、`docs`、`admin`、`repo` +- 项目专属的网页登录信息,如果允许放进仓库,就写 
`.project.json.quick_login` +- 不能直接入库的敏感登录,不要伪造 `quick_login`,改为写 `.project.json.credentials` 引用 +- 数据库密码、API Key、服务器 root 密码,不属于 `quick_login` + +## Completion Gate + +- 部署完成后,不允许在 `.project.json` 缺少最新公网链接的状态下结束任务 +- 部署完成后,必须同步更新 `RULES.md` 的部署事实 +- 如果只更新了代码但没回写部署元数据,这个任务不算完成 diff --git a/AI与软件/AI软件搜索规则.md b/AI与软件/AI软件搜索规则.md new file mode 100644 index 0000000..f50a863 --- /dev/null +++ b/AI与软件/AI软件搜索规则.md @@ -0,0 +1,190 @@ +# AI与软件行业搜索规则 + +## 1. 核心原则 +- **语言优先**:默认英文搜索,科技行业以英文为主导语言 +- **权威优先**:学术机构 > 大型科技公司 > 头部科技媒体 > 开发者社区 + +## 2. 英文权威信息源库 + +### 2.1 学术与研究机构(最高优先级) +- **学术平台**: + - arXiv: https://arxiv.org + - NeurIPS Papers: https://papers.nips.cc + - OpenReview: https://openreview.net + - ACM Digital Library: https://dl.acm.org +- **顶级会议**: + - NeurIPS: https://neurips.cc + - ICML: https://icml.cc + - ICLR: https://iclr.cc + - AAAI: https://www.aaai.org + - CVPR: https://cvpr.thecvf.com +- **研究机构**: + - Google AI: https://ai.google + - Microsoft Research: https://www.microsoft.com/en-us/research + - Meta AI: https://ai.meta.com + - OpenAI Research: https://openai.com/research + +### 2.2 大型科技公司官方 +- **Google**: + - Google AI Blog: https://ai.googleblog.com + - Google Developers: https://developers.google.com +- **Microsoft**: + - Microsoft Tech Blogs: https://techcommunity.microsoft.com + - Azure AI: https://azure.microsoft.com/en-us/products/ai-services +- **Meta**: + - Meta AI: https://ai.meta.com + - Engineering at Meta: https://engineering.fb.com +- **OpenAI**: + - OpenAI: https://openai.com + - OpenAI Platform: https://platform.openai.com +- **Amazon**: + - AWS AI Services: https://aws.amazon.com/ai + - Amazon Science: https://www.amazon.science +- **Apple**: + - Apple Machine Learning: https://machinelearning.apple.com +- **NVIDIA**: + - NVIDIA Developer: https://developer.nvidia.com + - NVIDIA Blogs: https://blogs.nvidia.com + +### 2.3 头部科技媒体 +- **权威科技媒体**: + - TechCrunch: https://techcrunch.com + - Wired: https://www.wired.com + - Ars Technica: 
https://arstechnica.com + - The Verge: https://www.theverge.com +- **专业AI媒体**: + - VentureBeat AI: https://venturebeat.com/ai + - MIT Technology Review: https://www.technologyreview.com + - IEEE Spectrum: https://spectrum.ieee.org +- **开发者媒体**: + - InfoQ: https://www.infoq.com + - TechTarget: https://www.techtarget.com + +### 2.4 开发者社区与平台 +- **代码平台**: + - GitHub: https://github.com + - GitLab: https://gitlab.com + - Hugging Face: https://huggingface.co +- **开发者社区**: + - Stack Overflow: https://stackoverflow.com + - Reddit Machine Learning: https://www.reddit.com/r/MachineLearning + - Hacker News: https://news.ycombinator.com +- **专业论坛**: + - PyTorch Discuss: https://discuss.pytorch.org + - TensorFlow Community: https://www.tensorflow.org/community + +## 3. 头部自媒体与KOL + +### 3.1 知名AI研究者Twitter +- **Geoffrey Hinton**: https://twitter.com/geoffreyhinton +- **Yann LeCun**: https://twitter.com/ylecun +- **Andrej Karpathy**: https://twitter.com/karpathy +- **Ian Goodfellow**: https://twitter.com/goodfellow_ian +- **Fei-Fei Li**: https://twitter.com/drfeifei + +### 3.2 科技公司官方账号 +- **OpenAI**: https://twitter.com/OpenAI +- **Google AI**: https://twitter.com/GoogleAI +- **Microsoft AI**: https://twitter.com/MSFTResearch +- **Meta AI**: https://twitter.com/MetaAI + +### 3.3 知名技术博客与Newsletter +- **Distill**: https://distill.pub +- **The Batch by Andrew Ng**: https://www.deeplearning.ai/the-batch +- **Stratechery**: https://stratechery.com +- **Benedict Evans**: https://www.ben-evans.com + +## 4. 
搜索策略与关键词 + +### 4.1 技术研究搜索 +**关键词模板**: +- "[technology] breakthrough [year]" +- "AI research [application] latest" +- "machine learning [domain] survey" +- "deep learning [architecture] comparison" + +### 4.2 产业动态搜索 +**关键词模板**: +- "AI startup funding [period]" +- "tech IPO [quarter] [year]" +- "software market trends [sector]" +- "AI adoption enterprise [industry]" + +### 4.3 开源项目搜索 +**关键词模板**: +- "GitHub trending [language] [timeframe]" +- "open source AI projects [year]" +- "[framework] new features [version]" +- "developer tools [category] [year]" + +### 4.4 政策监管搜索 +**关键词模板**: +- "AI regulation [country] [year]" +- "data privacy [framework] [region]" +- "algorithm transparency [policy]" +- "AI ethics guidelines [organization]" + +## 5. 行业专门搜索网站 + +### 5.1 AI/ML专业平台 +- **Papers With Code**: https://paperswithcode.com +- **Towards Data Science**: https://towardsdatascience.com +- **Google AI Blog**: https://ai.googleblog.com +- **Distill**: https://distill.pub + +### 5.2 开发者工具与平台 +- **Hugging Face**: https://huggingface.co +- **Kaggle**: https://www.kaggle.com +- **Google Colab**: https://colab.research.google.com +- **Jupyter**: https://jupyter.org + +### 5.3 科技分析与投资 +- **CB Insights AI**: https://www.cbinsights.com/research/artificial-intelligence-trends +- **Crunchbase**: https://www.crunchbase.com +- **PitchBook**: https://pitchbook.com + +## 6. 实时监控重点 + +### 6.1 高频监控(每日) +- arXiv新论文发布 +- GitHub trending repositories +- 主要科技公司博客更新 +- Hacker News热门技术讨论 + +### 6.2 中频监控(每周) +- 重要会议论文发布 +- 开源项目版本更新 +- 科技公司财报与产品发布 +- AI政策法规动态 + +### 6.3 低频监控(每月) +- 学术会议proceedings +- 行业研究报告 +- 长期技术趋势分析 + +## 7. 
文件命名与归档 + +### 7.1 命名规则 +- **技术研究**:`YYYYMMDD_AI_Tech_[Technology/Algorithm].docx` +- **产业动态**:`YYYYMMDD_AI_Industry_[Company/Sector].docx` +- **开源项目**:`YYYYMMDD_AI_OpenSource_[Project/Framework].docx` +- **政策监管**:`YYYYMMDD_AI_Policy_[Region/Topic].docx` + +### 7.2 内容格式 +**文档头部**: +- Source: [完整URL链接] +- Keywords: [Search terms] +- Search Time: [YYYY-MM-DD HH:MM UTC] +- Category: [Research/Industry/OpenSource/Policy] +- Technical Level: [Basic/Intermediate/Advanced] + +## 8. 中国特定搜索(仅当明确要求时) + +### 8.1 中文AI平台 +- 机器之心: https://www.jiqizhixin.com +- AI科技评论: https://www.leiphone.com/category/ai +- 雷锋网: https://www.leiphone.com +- 36氪AI: https://36kr.com/channel/artificial-intelligence + +### 8.2 中文关键词 +- "人工智能"、"机器学习"、"深度学习"、"算法" \ No newline at end of file diff --git a/AI与软件/市场与竞争/20241001_AI软件_市场与竞争_英伟达最近新闻及影响.docx b/AI与软件/市场与竞争/20241001_AI软件_市场与竞争_英伟达最近新闻及影响.docx new file mode 100644 index 0000000..bd5865c --- /dev/null +++ b/AI与软件/市场与竞争/20241001_AI软件_市场与竞争_英伟达最近新闻及影响.docx @@ -0,0 +1,39 @@ +# NVIDIA Recent News and Impacts + +## Search Information +- **Sources**: TechCrunch.com, Bloomberg.com, NVIDIA.com, Reuters.com +- **Search Keywords**: NVIDIA recent news, NVIDIA AI impacts, Blackwell GPU, earnings report +- **Search Time**: 2024-10-01 +- **Agent Type**: Data Collection + Market Analysis + +## English Content + +### Recent News +1. **Blackwell Platform Launch and Delay**: In March 2024, NVIDIA unveiled the Blackwell architecture with B200 GPUs for AI, promising 30x faster inference. However, design flaws delayed mass production to Q1 2025. +2. **Q2 FY2025 Earnings**: Reported $30B revenue (up 122% YoY), $0.68 EPS, driven by data center sales ($26.3B). +3. **Stock Performance**: 10-for-1 split in June 2024; shares hit $140+ post-split, market cap ~$3.3T. +4. **US Export Restrictions**: Tightened rules on AI chips to China, potentially cutting 10% of revenue. +5. 
**Partnerships**: Expanded with Oracle, Microsoft for AI infrastructure; invested in AI startups via NVentures. + +### Impacts +- **Economic**: Boosts AI market to $1T by 2030; NVIDIA's valuation influences tech stocks, raising bubble fears. +- **Technological**: Accelerates AI in healthcare (drug discovery), automotive (self-driving), and cloud computing. +- **Geopolitical**: Escalates US-China tech tensions, prompting supply chain diversification. +- **Environmental**: GPUs' high power use (e.g., H100 ~700W) contributes to data centers accounting for 2% of US electricity demand, pushing for sustainable AI. +- **Social**: Creates AI jobs but raises ethical concerns like bias and unemployment. + +## Chinese Translation + +### 最近新闻 +1. **Blackwell平台发布与延期**:2024年3月,英伟达推出Blackwell架构的B200 GPU,用于AI,承诺推理速度提升30倍。但设计缺陷将量产延至2025年Q1。 +2. **2025财年Q2财报**:收入300亿美元(同比增122%),每股收益0.68美元,受数据中心销售(263亿美元)驱动。 +3. **股票表现**:2024年6月10比1拆股;拆股后股价超140美元,市值约3.3万亿美元。 +4. **美国出口限制**:加强对华AI芯片出口管制,可能减少10%收入。 +5. **合作伙伴**:与甲骨文、微软扩展AI基础设施合作;通过NVentures投资AI初创企业。 + +### 影响 +- **经济**:推动AI市场至2030年达1万亿美元;英伟达估值影响科技股,引发泡沫担忧。 +- **技术**:加速AI在医疗(药物发现)、汽车(自动驾驶)和云计算的应用。 +- **地缘政治**:加剧中美科技紧张,推动供应链多元化。 +- **环境**:GPU高功耗(例如H100 ~700W)导致数据中心占美国电力需求的2%,推动可持续AI。 +- **社会**:创造AI就业机会,但引发偏见和失业等伦理担忧。 \ No newline at end of file diff --git a/AI与软件/市场与竞争/20250709_AI软件_市场竞争_英伟达成首家4万亿美元公司.docx b/AI与软件/市场与竞争/20250709_AI软件_市场竞争_英伟达成首家4万亿美元公司.docx new file mode 100644 index 0000000..e331669 --- /dev/null +++ b/AI与软件/市场与竞争/20250709_AI软件_市场竞争_英伟达成首家4万亿美元公司.docx @@ -0,0 +1,95 @@ +# 英伟达成为首家市值4万亿美元公司 + +## 搜索信息 +- **信息源**:CNN.com +- **搜索关键词**:NVIDIA 4 trillion valuation, first company market cap +- **搜索时间**:2025-07-09 +- **代理类型**:数据收集 + 市场分析 +- **相关行业**:AI与软件、金融行业 + +## 英文原文 + +### NVIDIA Becomes First $4 Trillion Company + +**Source**: [CNN Business - July 9, 2025](https://www.cnn.com/2025/07/09/investing/nvidia-is-the-first-usd4-trillion-company) + +NVIDIA has achieved a historic milestone by becoming the first company in history to reach a 
$4 trillion market valuation, solidifying its position as the most valuable company in the world and highlighting the unprecedented demand for artificial intelligence infrastructure. + +**Market Performance Highlights:** +- **Market Cap**: $4.02 trillion (as of July 9, 2025) +- **Stock Performance**: Shares reached new all-time highs +- **AI Boom Impact**: Valuation driven primarily by AI chip demand +- **Revenue Growth**: Powered by data center and AI accelerator sales + +**Key Drivers:** +1. **AI Infrastructure Demand**: Unprecedented global demand for AI training and inference capabilities +2. **Data Center Dominance**: Leading provider of AI accelerators for cloud computing +3. **Technological Leadership**: Advanced GPU architectures (Hopper, Blackwell) setting industry standards +4. **Market Position**: Near-monopolistic position in high-performance AI computing + +**Historical Context:** +- Previously, Apple and Microsoft had alternated as the world's most valuable companies +- NVIDIA's rise represents the fastest climb to $4 trillion valuation in corporate history +- Achievement comes amid the global AI revolution and digital transformation + +**Industry Impact:** +The milestone reflects the broader transformation of the technology sector, where AI infrastructure has become the primary value driver. NVIDIA's success has lifted the entire semiconductor sector and reinforced investor confidence in AI-related investments. 
+ +**Future Outlook:** +Analysts expect continued growth driven by: +- Expanding AI adoption across industries +- Next-generation GPU architectures +- Cloud computing infrastructure expansion +- Enterprise AI transformation initiatives + +## 中文翻译 + +### 英伟达成为首家市值4万亿美元公司 + +**消息来源**:[CNN商业频道 - 2025年7月9日](https://www.cnn.com/2025/07/09/investing/nvidia-is-the-first-usd4-trillion-company) + +英伟达创造了历史性里程碑,成为史上首家市值达到4万亿美元的公司,巩固了其作为全球最有价值公司的地位,并突显了对人工智能基础设施前所未有的需求。 + +**市场表现亮点:** +- **市值**:4.02万亿美元(截至2025年7月9日) +- **股价表现**:股价创历史新高 +- **AI热潮影响**:估值主要由AI芯片需求驱动 +- **收入增长**:由数据中心和AI加速器销售推动 + +**关键驱动因素:** +1. **AI基础设施需求**:全球对AI训练和推理能力的空前需求 +2. **数据中心主导地位**:云计算AI加速器的领先供应商 +3. **技术领导力**:先进的GPU架构(Hopper、Blackwell)设定行业标准 +4. **市场地位**:在高性能AI计算领域接近垄断地位 + +**历史背景:** +- 此前,苹果和微软曾轮流成为全球最有价值公司 +- 英伟达的崛起代表了企业史上最快达到4万亿美元估值的记录 +- 这一成就正值全球AI革命和数字化转型浪潮 + +**行业影响:** +这一里程碑反映了技术行业的广泛转型,AI基础设施已成为主要价值驱动因素。英伟达的成功提振了整个半导体行业,并强化了投资者对AI相关投资的信心。 + +**未来展望:** +分析师预期持续增长将由以下因素驱动: +- AI在各行业的扩展应用 +- 下一代GPU架构 +- 云计算基础设施扩张 +- 企业AI转型倡议 + +## 市场影响分析 + +### 对科技行业的影响 +1. **估值重构**:重新定义了科技公司的价值评估标准 +2. **投资热潮**:推动了对AI基础设施的大规模投资 +3. **竞争格局**:其他芯片制造商加速AI芯片研发 + +### 对全球经济的意义 +1. **AI产业化**:标志着AI从概念走向产业化的关键节点 +2. **技术主导**:确立了AI技术在现代经济中的核心地位 +3. **创新驱动**:展示了技术创新对企业价值创造的巨大潜力 + +### 风险因素 +1. **估值泡沫**:市场对AI前景的过度乐观可能导致泡沫 +2. **监管风险**:反垄断审查可能影响未来发展 +3. 
**技术竞争**:其他公司的技术突破可能挑战市场地位 \ No newline at end of file diff --git a/AI与软件/情感与社会洞察/20250715_AI软件_社会洞察_黄仁勋访华与中美科技外交新动向.docx b/AI与软件/情感与社会洞察/20250715_AI软件_社会洞察_黄仁勋访华与中美科技外交新动向.docx new file mode 100644 index 0000000..5d9d8d4 --- /dev/null +++ b/AI与软件/情感与社会洞察/20250715_AI软件_社会洞察_黄仁勋访华与中美科技外交新动向.docx @@ -0,0 +1,147 @@ +# 黄仁勋访华与中美科技外交新动向 + +## 搜索信息 +- **信息源**:CNBC.com, StockTwits.com, Yahoo Finance +- **搜索关键词**:Jensen Huang China visit, NVIDIA diplomatic strategy, US China tech relations +- **搜索时间**:2025-07-15 +- **代理类型**:社交监听 + 社会分析 +- **相关行业**:AI与软件、金融行业 + +## 英文原文 + +### Jensen Huang's China Diplomatic Mission: Walking the Tightrope + +**Sources**: +- [CNBC - July 14, 2025](https://www.cnbc.com/2025/07/14/nvidias-jensen-huang-downplays-us-china-concerns-ahead-of-trip.html) +- [StockTwits - July 14, 2025](https://stocktwits.com/news-articles/markets/equity/nvidia-ceo-says-chinese-military-unlikely-to-use-its-ai-chips/ch8BYzWR5V7) +- [Yahoo Finance - July 14, 2025](https://finance.yahoo.com/news/nvidias-jensen-huang-says-chinas-195000285.html) + +NVIDIA CEO Jensen Huang is embarking on his second trip to China this year, employing a sophisticated diplomatic approach that balances U.S. national security concerns with the commercial imperative of maintaining access to China's massive AI market. + +**Diplomatic Strategy Highlights:** + +**1. Military Use Downplaying:** +In a CNN interview aired Sunday, Huang systematically addressed U.S. fears about Chinese military applications: +- **Technology Independence**: "They simply can't rely on it. It could be limited at any time" +- **Computing Capacity**: "There's plenty of computing capacity in China already" +- **Military Needs**: "They don't need Nvidia's chips, certainly, or American tech stacks in order to build their military" + +**2. 
Global Technology Vision:** +Huang articulated a broader vision for American tech leadership: +- **Dollar Analogy**: "Just like we want the world to be built on the American dollar, using the American dollar as a global standard, we want the American tech stack to be the global standard" +- **Developer Ecosystem**: "50% of the world's AI developers are in China and Chinese" +- **Market Access**: "In order for America to be the world leader...we have to be in search of all the AI developers in the world" + +**3. DeepSeek Response:** +Addressing concerns about Chinese AI startup DeepSeek potentially supporting military operations: +- **Evidence Assessment**: "There were concerns about — that it was trained in China and that it could be dangerous for that reason. First of all, there's no evidence of that" +- **Technical Solutions**: "Whatever evidence there is, if anybody has it, you could just fine-tune and distill it out" +- **Innovation Praise**: Called DeepSeek's R1 reasoning model "revolutionary" for empowering startups globally + +**Market and Social Response:** + +**Stock Market Reaction:** +- NVIDIA shares climbed following export rule relaxation news +- Retail sentiment on StockTwits remained 'bullish' with 'high' message volume +- Stock up 4.5% on Robinhood trading platform + +**Industry Expert Analysis:** +Daniel Newman (The Futurum Group CEO): "He needs to walk a proverbial tightrope to make sure that he doesn't rattle the Trump administration, while positioning NVIDIA for potential future policy changes." 
+ +**Social Media Sentiment:** +- Technology community generally supportive of pragmatic approach +- Some concerns raised about national security implications +- Business analysts praising strategic positioning + +**Geopolitical Context:** + +**Meeting with Trump:** +- Huang met with President Trump last week +- Reaffirmed NVIDIA's support for administration's job creation efforts +- Emphasized commitment to American AI leadership + +**Congressional Warnings:** +- U.S. lawmakers warned Huang not to meet with companies connected to China's military +- Cautioned against engaging with entities on America's restricted export list + +**Competitive Dynamics:** +Huang emphasized mutual respect in competition: +"The fact of the matter is, [China and the U.S.] are competitors, but we are highly interdependent, and to the extent that we can compete and both aspire to win, it is fine to respect our competitors." + +## 中文翻译 + +### 黄仁勋访华外交使命:走钢丝的艺术 + +**消息来源**: +- [CNBC - 2025年7月14日](https://www.cnbc.com/2025/07/14/nvidias-jensen-huang-downplays-us-china-concerns-ahead-of-trip.html) +- [StockTwits - 2025年7月14日](https://stocktwits.com/news-articles/markets/equity/nvidia-ceo-says-chinese-military-unlikely-to-use-its-ai-chips/ch8BYzWR5V7) +- [雅虎财经 - 2025年7月14日](https://finance.yahoo.com/news/nvidias-jensen-huang-says-chinas-195000285.html) + +英伟达CEO黄仁勋正在开启今年第二次中国之行,采用了一种精密的外交策略,在美国国家安全担忧与维持中国庞大AI市场准入的商业需求之间寻求平衡。 + +**外交策略要点:** + +**1. 军事用途淡化论:** +在周日播出的CNN采访中,黄仁勋系统性地回应了美国对中国军事应用的担忧: +- **技术独立性**:"他们根本无法依赖它。随时可能被限制" +- **计算能力**:"中国已经有充足的计算能力" +- **军事需求**:"他们不需要英伟达的芯片,当然也不需要美国的技术栈来建设军队" + +**2. 全球技术愿景:** +黄仁勋阐述了美国技术领导地位的更广阔愿景: +- **美元类比**:"就像我们希望世界建立在美元基础上,使用美元作为全球标准一样,我们希望美国技术栈成为全球标准" +- **开发者生态**:"全球50%的AI开发者在中国,是中国人" +- **市场准入**:"为了让美国成为世界领导者...我们必须寻求全世界所有的AI开发者" + +**3. 
深度求索回应:** +针对中国AI初创公司深度求索可能支持军事行动的担忧: +- **证据评估**:"有人担心它在中国训练,因此可能危险。首先,没有这方面的证据" +- **技术解决方案**:"如果有任何证据,任何人有的话,都可以通过微调和提炼来消除" +- **创新赞誉**:称深度求索的R1推理模型为"革命性的",在全球赋能初创企业 + +**市场和社会反应:** + +**股市反应:** +- 英伟达股价在出口规则放松消息后上涨 +- StockTwits上的散户情绪保持"看涨",消息量处于"高"水平 +- 在Robinhood交易平台上股价上涨4.5% + +**行业专家分析:** +Daniel Newman(The Futurum Group首席执行官):"他需要走钢丝,确保不会激怒特朗普政府,同时为英伟达在未来政策变化时做好定位。" + +**社交媒体情绪:** +- 技术界普遍支持这种务实做法 +- 对国家安全影响提出一些担忧 +- 商业分析师赞扬战略定位 + +**地缘政治背景:** + +**与特朗普会面:** +- 黄仁勋上周与特朗普总统会面 +- 重申英伟达对政府就业创造努力的支持 +- 强调对美国AI领导地位的承诺 + +**国会警告:** +- 美国议员警告黄仁勋不要与中国军方相关公司会面 +- 提醒避免接触美国限制出口清单上的实体 + +**竞争动态:** +黄仁勋强调竞争中的相互尊重: +"事实是,[中国和美国]是竞争对手,但我们高度相互依存,只要我们能够竞争并都渴望获胜,尊重我们的竞争对手是没问题的。" + +## 社会洞察分析 + +### 公众舆论趋势 +1. **商业现实主义**:多数商业分析师支持黄仁勋的务实做法 +2. **安全担忧持续**:国家安全专家仍对技术转移风险表示关切 +3. **全球化支持**:技术社区普遍认为开放合作有利于创新 + +### 外交策略创新 +1. **企业外交**:黄仁勋展现了现代企业领袖的外交技巧 +2. **平衡艺术**:在政治压力和商业利益间找到平衡点 +3. **话语权塑造**:通过媒体采访主动塑造公众认知 + +### 中美科技关系新范式 +1. **相互依存认知**:双方开始承认技术领域的相互依赖关系 +2. **竞争合作并存**:在竞争中寻求合作空间的新模式 +3. **民间外交作用**:企业领袖在政府外交之外发挥重要作用 \ No newline at end of file diff --git a/AI与软件/技术与创新/20250715_AI软件_技术创新_英伟达GPU安全漏洞GPUhammer.docx b/AI与软件/技术与创新/20250715_AI软件_技术创新_英伟达GPU安全漏洞GPUhammer.docx new file mode 100644 index 0000000..1b6e400 --- /dev/null +++ b/AI与软件/技术与创新/20250715_AI软件_技术创新_英伟达GPU安全漏洞GPUhammer.docx @@ -0,0 +1,98 @@ +# NVIDIA GPU安全漏洞:GPUhammer攻击 + +## 搜索信息 +- **信息源**:ArsTechnica.com +- **搜索关键词**:NVIDIA GPUhammer vulnerability, Rowhammer GPU attack, RTX A6000 security +- **搜索时间**:2025-07-15 +- **代理类型**:技术跟踪 + 安全分析 +- **相关行业**:AI与软件 + +## 英文原文 + +### NVIDIA GPUs Fall Victim to First Rowhammer Attack + +**Source**: [Ars Technica - July 14, 2025](https://arstechnica.com/security/2025/07/nvidia-chips-become-the-first-gpus-to-fall-to-rowhammer-bit-flip-attacks/) + +Academic researchers have successfully demonstrated the first Rowhammer attack against discrete GPUs, specifically targeting NVIDIA's RTX A6000 - a widely used GPU for high-performance computing available from many cloud services. 
+ +**Key Technical Details:** +- **Attack Name**: GPUhammer - the first successful Rowhammer attack on discrete GPUs +- **Target**: NVIDIA RTX A6000 GPUs used in cloud computing and AI applications +- **Vulnerability**: Exploits physical weakness in GDDR6 memory modules through bit-flipping +- **Impact**: Single bit flip can degrade AI model accuracy from 80% to 0.1% + +**Attack Mechanism:** +The researchers demonstrated that by repeatedly "hammering" specific memory rows, they could induce bit flips in nearby rows of GDDR6 memory. A single bit flip in the exponent of a neural network model weight can increase the exponent value by 16, altering the model weight by 2^16 and causing catastrophic accuracy degradation. + +**Real-World Implications:** +- **Autonomous vehicles**: Could misclassify stop signs as speed limit signs +- **Healthcare**: Medical imaging models might misdiagnose patients +- **Security**: Malware detection systems could fail to identify threats + +**NVIDIA's Response:** +- Recommends enabling Error-Correcting Code (ECC) protection +- Performance penalty: Up to 10% degradation in overall performance +- Memory bandwidth reduction: 12% decrease +- Memory capacity loss: 6.25% across all workloads + +**Affected Products:** +- Primary target: RTX A6000 (confirmed vulnerable) +- Potentially vulnerable: Other GDDR6-based GPUs in Ampere generation +- Protected: H100 (HBM3) and RTX 5090 (GDDR7) with built-in ECC + +**Research Team:** +- Gururaj Saileshwar (University of Toronto) +- Chris S. 
Lin (University of Toronto) +- Joyce Qu (University of Toronto) +- Presentation: 2025 Usenix Security Conference + +## 中文翻译 + +### 英伟达GPU遭遇首次Rowhammer攻击 + +**消息来源**:[Ars Technica - 2025年7月14日](https://arstechnica.com/security/2025/07/nvidia-chips-become-the-first-gpus-to-fall-to-rowhammer-bit-flip-attacks/) + +学术研究人员成功演示了针对独立GPU的首次Rowhammer攻击,专门针对英伟达RTX A6000——一款广泛用于高性能计算且在许多云服务中可用的GPU。 + +**关键技术细节:** +- **攻击名称**:GPUhammer - 首次成功针对独立GPU的Rowhammer攻击 +- **目标**:云计算和AI应用中使用的英伟达RTX A6000 GPU +- **漏洞机制**:通过位翻转利用GDDR6内存模块的物理弱点 +- **影响**:单个位翻转可将AI模型准确率从80%降至0.1% + +**攻击机制:** +研究人员证明,通过反复"敲击"特定内存行,他们可以在GDDR6内存的相邻行中诱发位翻转。神经网络模型权重指数中的单个位翻转可以将指数值增加16,使模型权重改变2^16倍,造成灾难性的准确率下降。 + +**现实世界影响:** +- **自动驾驶汽车**:可能将停车标志误分类为限速标志 +- **医疗保健**:医学影像模型可能误诊患者 +- **安全防护**:恶意软件检测系统可能无法识别威胁 + +**英伟达的应对措施:** +- 建议启用错误纠正码(ECC)保护 +- 性能损失:整体性能最多下降10% +- 内存带宽减少:降低12% +- 内存容量损失:所有工作负载减少6.25% + +**受影响产品:** +- 主要目标:RTX A6000(确认易受攻击) +- 潜在易受攻击:Ampere代中其他基于GDDR6的GPU +- 受保护:H100(HBM3)和RTX 5090(GDDR7)具有内置ECC + +**研究团队:** +- Gururaj Saileshwar(多伦多大学) +- Chris S. Lin(多伦多大学) +- Joyce Qu(多伦多大学) +- 展示:2025年Usenix安全会议 + +## 技术影响分析 + +### 对AI行业的影响 +1. **云计算安全**:AWS、Runpod、Lambda Cloud等提供A6000实例的云服务商需要加强安全防护 +2. **AI模型可信度**:研究结果质疑高性能GPU在关键AI应用中的安全性 +3. **性能与安全权衡**:ECC保护带来的性能损失可能影响AI训练和推理效率 + +### 技术演进方向 +1. **硬件安全设计**:新一代GPU将更注重内存安全防护 +2. **软件防护机制**:需要开发更高效的软件层面防护方案 +3. 
**行业标准制定**:可能推动GPU安全标准的建立和完善 \ No newline at end of file diff --git a/AI与软件/监管与政策/20250714_AI软件_监管政策_英伟达获美国批准恢复对华芯片销售.docx b/AI与软件/监管与政策/20250714_AI软件_监管政策_英伟达获美国批准恢复对华芯片销售.docx new file mode 100644 index 0000000..f95665a --- /dev/null +++ b/AI与软件/监管与政策/20250714_AI软件_监管政策_英伟达获美国批准恢复对华芯片销售.docx @@ -0,0 +1,99 @@ +# 英伟达获美国政府批准恢复对华H20芯片销售 + +## 搜索信息 +- **信息源**:Financial Times, CNBC +- **搜索关键词**:NVIDIA China chip export approval, H20 chip sales, US export rules +- **搜索时间**:2025-07-14 +- **代理类型**:文本智能 + 政策跟踪 +- **相关行业**:AI与软件、金融行业 + +## 英文原文 + +### NVIDIA Gets Washington Approval to Resume H20 China Chip Sales + +**Sources**: +- [Financial Times](https://www.ft.com/content/ba0929bd-5912-44fb-9048-c143aced4c8a) +- [CNBC - July 14, 2025](https://www.cnbc.com/2025/07/14/nvidias-jensen-huang-downplays-us-china-concerns-ahead-of-trip.html) + +NVIDIA has received approval from the White House to resume sales of its H20 chips to China, marking a significant relaxation of U.S. export restrictions. This development comes as CEO Jensen Huang attempts to navigate the complex geopolitical landscape between Washington and Beijing. + +**Key Policy Changes:** +- **H20 Chip Approval**: Washington has given NVIDIA the green light to resume H20 chip sales to China +- **Export Rule Relaxation**: White House has relaxed some export restrictions on China chip sales +- **Strategic Positioning**: Move reflects evolving U.S.-China tech policy under current administration + +**Jensen Huang's Diplomatic Approach:** +During a CNN interview aired Sunday, Huang downplayed U.S. 
concerns about China's military use of NVIDIA chips: +- **Military Concerns**: "They don't need Nvidia's chips, certainly, or American tech stacks in order to build their military" +- **Technology Independence**: "We don't have to worry about China's military using U.S.-made technology because they simply can't rely on it" +- **Policy Vulnerability**: "It could be limited at any time" + +**Geopolitical Context:** +- **Second China Trip**: Huang is preparing for his second trip to China this year +- **Balancing Act**: CEO walking a "tightrope" between Beijing and Washington relationships +- **DeepSeek Controversy**: Addressed concerns about Chinese AI startup DeepSeek using NVIDIA chips for military applications + +**Market Implications:** +- **Stock Response**: NVIDIA shares climbed following the export rule relaxation news +- **Revenue Impact**: Restoration of China market access could boost significant revenue streams +- **Strategic Positioning**: Maintains NVIDIA's global market presence amid trade tensions + +**Industry Expert Analysis:** +Technology analysts note that Huang needs to carefully balance relationships to avoid rattling the current administration while positioning NVIDIA for potential future policy changes that could provide better investment climate with China. + +**DeepSeek Model Commentary:** +Huang praised DeepSeek's open-source R1 reasoning model as "revolutionary," emphasizing its positive impact on empowering startups and new industries globally, while acknowledging ongoing security concerns. 
+ +## 中文翻译 + +### 英伟达获华盛顿批准恢复对华H20芯片销售 + +**消息来源**: +- [金融时报](https://www.ft.com/content/ba0929bd-5912-44fb-9048-c143aced4c8a) +- [CNBC - 2025年7月14日](https://www.cnbc.com/2025/07/14/nvidias-jensen-huang-downplays-us-china-concerns-ahead-of-trip.html) + +英伟达已获得白宫批准,可恢复向中国销售H20芯片,这标志着美国出口限制的显著放松。这一发展正值CEO黄仁勋试图在华盛顿和北京之间的复杂地缘政治格局中寻求平衡之际。 + +**关键政策变化:** +- **H20芯片批准**:华盛顿已为英伟达恢复对华H20芯片销售开绿灯 +- **出口规则放松**:白宫已放松对华芯片销售的部分出口限制 +- **战略定位**:此举反映了现任政府下美中科技政策的演变 + +**黄仁勋的外交策略:** +在周日播出的CNN采访中,黄仁勋淡化了美国对中国军方使用英伟达芯片的担忧: +- **军事担忧**:"他们不需要英伟达的芯片,当然也不需要美国的技术栈来建设军队" +- **技术独立性**:"我们不必担心中国军方使用美国制造的技术,因为他们根本无法依赖它" +- **政策脆弱性**:"随时可能被限制" + +**地缘政治背景:** +- **第二次访华**:黄仁勋正准备今年第二次访问中国 +- **平衡之道**:CEO在北京和华盛顿关系间走"钢丝" +- **深度求索争议**:回应了对中国AI初创公司深度求索使用英伟达芯片进行军事应用的担忧 + +**市场影响:** +- **股价反应**:英伟达股价在出口规则放松消息后上涨 +- **收入影响**:恢复中国市场准入可能提振显著收入流 +- **战略定位**:在贸易紧张局势中维持英伟达的全球市场地位 + +**行业专家分析:** +技术分析师指出,黄仁勋需要小心平衡关系,以免激怒现任政府,同时为英伟达在未来政策变化时能获得更好的对华投资环境做好定位。 + +**深度求索模型评论:** +黄仁勋称赞深度求索的开源R1推理模型为"革命性的",强调其在全球赋能初创企业和新兴行业方面的积极影响,同时承认持续的安全担忧。 + +## 政策影响分析 + +### 对中美科技关系的意义 +1. **政策转向信号**:表明美国政府在科技出口政策上可能采取更加务实的态度 +2. **经济考量**:平衡国家安全与经济利益,避免过度限制损害美国科技企业 +3. **竞争与合作**:体现了中美在AI领域既竞争又相互依存的复杂关系 + +### 对AI产业的影响 +1. **市场准入**:为美国AI硬件企业保持全球市场份额提供了可能 +2. **技术流动**:允许更多AI技术在全球范围内的合理流动 +3. **创新生态**:维护了全球AI创新生态系统的完整性 + +### 风险与挑战 +1. **政策不确定性**:出口政策仍可能随政治环境变化而调整 +2. **安全平衡**:需要在商业利益与国家安全之间找到适当平衡 +3. 
**国际关系**:政策变化可能影响更广泛的国际科技合作关系 \ No newline at end of file diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..4069626 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,21 @@ +# 搜索 Agent Rules + +## Must Read First + +- `.project.json` 是机器真源:公网链接、快捷登录、凭证引用都以它为准 +- `RULES.md` 是人工规则和部署事实:启动命令、平台、域名、注意事项都写这里 +- 不允许编造不存在的域名、账号、密码;未知就保持空白并明确标记待补充 + +## Deployment Metadata Contract + +- 任何任务只要新增、删除或修改公网地址,必须在同一次任务里更新 `.project.json` +- `urls[]` 推荐显式写 `type`:`app`、`backend`、`docs`、`admin`、`repo` +- 项目专属的网页登录信息,如果允许放进仓库,就写 `.project.json.quick_login` +- 不能直接入库的敏感登录,不要伪造 `quick_login`,改为写 `.project.json.credentials` 引用 +- 数据库密码、API Key、服务器 root 密码,不属于 `quick_login` + +## Completion Gate + +- 部署完成后,不允许在 `.project.json` 缺少最新公网链接的状态下结束任务 +- 部署完成后,必须同步更新 `RULES.md` 的部署事实 +- 如果只更新了代码但没回写部署元数据,这个任务不算完成 diff --git a/RULES.md b/RULES.md new file mode 100644 index 0000000..c78ef3c --- /dev/null +++ b/RULES.md @@ -0,0 +1,21 @@ +# 20250715-搜索 + +## 核心文件 + +- `AI与软件/` +- `代码实现/` +- `使用说明.md` +- `制造业/` +- `医疗制药/` +- `快消品/` +- `房地产建筑/` +- `技术实施方案_简单实用版.md` +- `搜索总规则.md` +- `新闻/` +- `知识模块/` +- `能源化工/` + +## 规则 + +- 研究/分析类项目,核心产出为文档 +- 修改前先通读已有文档,保持结论一致性 diff --git a/代码实现/README.md b/代码实现/README.md new file mode 100644 index 0000000..ac5268d --- /dev/null +++ b/代码实现/README.md @@ -0,0 +1,274 @@ +# 智能搜索系统 - 简单实用版 + +一个基于RSS订阅和API的智能搜索系统,支持8个行业的权威信息检索和自动文档生成。 + +## 🌟 核心特性 + +- **英文优先搜索**: 默认英文搜索,包含中文关键词时自动切换 +- **8行业覆盖**: 金融、AI/软件、制造业、医疗制药、快消品、零售电商、能源化工、房地产建筑 +- **权威信源**: 200+ RSS源,按权威级别分类(官方机构 > 主流媒体 > 专业平台) +- **多种接口**: 命令行、Web界面、RSS监控器 +- **自动导出**: 搜索结果自动生成DOCX报告 +- **实时监控**: RSS源自动更新,建立本地文章数据库 + +## 🚀 快速开始 + +### 1. 安装依赖 + +```bash +cd 搜索/代码实现 +pip install -r requirements.txt +``` + +**必需依赖:** +```bash +pip install requests feedparser python-docx +``` + +**可选依赖 (增强功能):** +```bash +pip install flask newsapi-python pandas +``` + +### 2. 
配置API密钥 (可选) + +创建环境变量或修改 `config.py`: + +```bash +# NewsAPI (可选 - 增强英文搜索) +export NEWSAPI_KEY="your_newsapi_key" + +# Twitter API (可选 - 社交媒体搜索) +export TWITTER_BEARER_TOKEN="your_twitter_token" + +# Alpha Vantage (可选 - 金融数据) +export ALPHA_VANTAGE_KEY="your_alphavantage_key" +``` + +### 3. 启动系统 + +#### 方式一: 交互命令行 (推荐新手) +```bash +python main.py +``` + +#### 方式二: Web界面 +```bash +python main.py --mode web --port 5000 +``` +打开 http://localhost:5000 + +#### 方式三: 直接搜索 +```bash +python main.py --query "AI breakthrough 2024" --export +``` + +#### 方式四: 启动RSS监控器 +```bash +python main.py --mode monitor +``` + +## 📖 使用指南 + +### 命令行搜索示例 + +```bash +# 基础搜索 +>>> AI ethics regulation + +# 行业搜索 +>>> search renewable energy policy + +# 中文搜索 (自动检测) +>>> 英伟达最新财报 + +# 查看统计 +>>> stats + +# 查看历史 +>>> history + +# 帮助 +>>> help +``` + +### 搜索语言自动检测 + +- **英文搜索**: `AI breakthrough`, `Tesla earnings`, `oil prices` +- **中文搜索**: `中国AI政策`, `英伟达财报`, `新能源汽车` +- **强制中文**: 包含关键词: `中国`, `国内`, `A股`, `人民币`, `央行`, `国务院` + +### 支持的行业 + +| 行业代码 | 中文名称 | 主要信源 | +|---------|---------|----------| +| `finance` | 金融行业 | Fed, SEC, Bloomberg, Reuters | +| `ai_software` | AI与软件 | arXiv, Google AI, OpenAI, TechCrunch | +| `manufacturing` | 制造业 | ISO, IEEE, Industry Week | +| `healthcare_pharma` | 医疗制药 | FDA, NIH, STAT News | +| `fmcg` | 快消品 | Nielsen, Euromonitor | +| `ecommerce_retail` | 零售电商 | Shopify, eMarketer | +| `energy_chemical` | 能源化工 | IEA, Energy.gov | +| `real_estate` | 房地产建筑 | HUD, Construction Dive | + +## 📁 文件结构 + +``` +搜索/代码实现/ +├── main.py # 主程序入口 +├── config.py # 配置文件 +├── database.py # 数据库操作 +├── search_engine.py # 搜索引擎 +├── rss_monitor.py # RSS监控器 +├── document_exporter.py # 文档导出器 +├── database_schema.sql # 数据库结构 +├── requirements.txt # 依赖包 +├── data/ # 数据目录 +│ ├── search_system.db # SQLite数据库 +│ └── search_system.log # 系统日志 +└── 新闻/ # 导出文档目录 (实际位于上级目录 搜索/新闻/, 由 config.py 的 EXPORT_DIR 指定) + └── *.docx # 生成的报告 +``` + +## 🔧 高级配置 + +### 自定义RSS源 + +编辑 `config.py` 中的 `RSS_SOURCES`: + +```python +RSS_SOURCES = { + 'finance': [ + { + 'name': 'Your 
Custom Source', + 'url': 'https://example.com/rss.xml', + 'authority_level': 2, # 1=官方, 2=主流, 3=专业 + 'language': 'en' + } + ] +} +``` + +### 调整搜索参数 + +修改 `config.py` 中的 `SEARCH_CONFIG`: + +```python +SEARCH_CONFIG = { + 'max_results_per_source': 50, # 每源最大结果数 + 'min_relevance_score': 0.3, # 最低相关性分数 + 'keywords_for_china': ['中国', '国内'] # 中文检测关键词 +} +``` + +### RSS监控频率 + +调整 `RSS_MONITOR_CONFIG`: + +```python +RSS_MONITOR_CONFIG = { + 'check_interval': 3600, # 检查间隔(秒) - 3600=1小时 + 'max_retries': 3, # 最大重试次数 + 'timeout': 30 # 请求超时(秒) +} +``` + +## 🎯 使用场景 + +### 场景一: 行业研究 +```bash +python main.py --query "renewable energy investment 2024" --industry energy_chemical --export +``` + +### 场景二: 竞争情报 +```bash +python main.py --query "Tesla quarterly results" --industry ai_software --export +``` + +### 场景三: 政策追踪 +```bash +python main.py --query "FDA drug approval" --industry healthcare_pharma --export +``` + +### 场景四: 技术趋势 +```bash +python main.py --query "quantum computing breakthrough" --industry ai_software --export +``` + +## 📊 导出文档格式 + +生成的DOCX文档包含: + +1. **标题页**: 搜索关键词、行业、日期 +2. **搜索信息**: 参数、结果统计 +3. **文章列表**: + - 标题和来源信息 + - 权威级别标注 + - 发布时间和相关性评分 + - 文章摘要 + - 原文链接 (可点击) + +文件命名规则: +- 英文: `YYYYMMDD_industry_keywords.docx` +- 中文: `YYYYMMDD_industry_keywords_CN.docx` + +## 🔍 故障排除 + +### 常见问题 + +**Q: RSS源无法访问怎么办?** +A: 系统会自动重试和降级处理,单个源失败不影响整体搜索。 + +**Q: 搜索结果太少?** +A: +1. 检查关键词是否过于具体 +2. 尝试不指定行业进行全局搜索 +3. 确保RSS监控器已运行一段时间累积数据 + +**Q: 如何提高搜索质量?** +A: +1. 配置NewsAPI等付费API +2. 添加更多RSS源 +3. 
调整相关性评分算法 + +### 日志查看 + +```bash +# 查看系统日志 +tail -f data/search_system.log + +# 查看RSS监控状态 +python -c "from rss_monitor import RSSMonitor; print(RSSMonitor().get_monitor_status())" +``` + +### 数据库维护 + +```bash +# 查看统计信息 +python -c "from database import DatabaseManager; print(DatabaseManager().get_statistics())" + +# 手动检查RSS源 +python -c "from rss_monitor import RSSMonitor; print(RSSMonitor().manual_check_source(1))" +``` + +## 🚀 性能优化 + +### 建议配置 +- **CPU**: 2核心以上 (并行RSS处理) +- **内存**: 4GB以上 (大量文章缓存) +- **存储**: 10GB以上 (数据库和文档) +- **网络**: 稳定外网连接 (RSS和API访问) + +### 扩展建议 +1. **数据库**: SQLite → MySQL/PostgreSQL (大规模数据) +2. **搜索**: 基础匹配 → Elasticsearch (全文搜索) +3. **NLP**: 简单关键词 → BERT/GPT (语义搜索) +4. **缓存**: 无 → Redis (快速响应) + +## 📞 技术支持 + +- **文档问题**: 检查RSS源状态和网络连接 +- **搜索问题**: 查看日志文件定位错误 +- **性能问题**: 调整监控频率和结果数量限制 + +系统设计为轻量级和容错性,单个组件故障不会影响整体功能。 \ No newline at end of file diff --git a/代码实现/config.py b/代码实现/config.py new file mode 100644 index 0000000..cb731d3 --- /dev/null +++ b/代码实现/config.py @@ -0,0 +1,216 @@ +# -*- coding: utf-8 -*- +""" +搜索系统配置文件 +""" + +import os +from pathlib import Path + +# 基础配置 +BASE_DIR = Path(__file__).parent +DATA_DIR = BASE_DIR / "data" +EXPORT_DIR = BASE_DIR.parent / "新闻" + +# 确保目录存在 +DATA_DIR.mkdir(exist_ok=True) +EXPORT_DIR.mkdir(exist_ok=True) + +# 数据库配置 +DATABASE_CONFIG = { + 'type': 'sqlite', # 'sqlite', 'mysql', 'postgresql' + 'sqlite': { + 'path': DATA_DIR / "search_system.db" + }, + 'mysql': { + 'host': 'localhost', + 'port': 3306, + 'user': 'root', + 'password': '', + 'database': 'search_system' + } +} + +# API配置 +API_CONFIG = { + 'newsapi': { + 'key': os.getenv('NEWSAPI_KEY', ''), + 'base_url': 'https://newsapi.org/v2/', + 'rate_limit': 1000 # 每日请求限制 + }, + 'twitter': { + 'bearer_token': os.getenv('TWITTER_BEARER_TOKEN', ''), + 'base_url': 'https://api.twitter.com/2/', + 'rate_limit': 300 # 每15分钟请求限制 + }, + 'alpha_vantage': { + 'key': os.getenv('ALPHA_VANTAGE_KEY', ''), + 'base_url': 'https://www.alphavantage.co/query', + 
'rate_limit': 5 # 每分钟请求限制 + } +} + +# RSS源配置 +RSS_SOURCES = { + 'finance': [ + { + 'name': 'Federal Reserve', + 'url': 'https://www.federalreserve.gov/feeds/press_all.xml', + 'authority_level': 1, + 'language': 'en' + }, + { + 'name': 'SEC', + 'url': 'https://www.sec.gov/rss/news/press-release.xml', + 'authority_level': 1, + 'language': 'en' + }, + { + 'name': 'Bloomberg Markets', + 'url': 'https://feeds.bloomberg.com/markets/news.rss', + 'authority_level': 2, + 'language': 'en' + }, + { + 'name': 'Reuters Finance', + 'url': 'https://feeds.reuters.com/reuters/businessNews', + 'authority_level': 2, + 'language': 'en' + }, + { + 'name': 'Financial Times', + 'url': 'https://www.ft.com/rss/home', + 'authority_level': 2, + 'language': 'en' + }, + { + 'name': 'Wall Street Journal', + 'url': 'https://feeds.a.dj.com/rss/RSSMarketsMain.xml', + 'authority_level': 2, + 'language': 'en' + } + ], + 'ai_software': [ + { + 'name': 'arXiv Computer Science', + 'url': 'http://rss.arxiv.org/rss/cs', + 'authority_level': 1, + 'language': 'en' + }, + { + 'name': 'Google AI Blog', + 'url': 'https://ai.googleblog.com/feeds/posts/default', + 'authority_level': 1, + 'language': 'en' + }, + { + 'name': 'OpenAI Blog', + 'url': 'https://openai.com/blog/rss.xml', + 'authority_level': 1, + 'language': 'en' + }, + { + 'name': 'MIT Technology Review', + 'url': 'https://www.technologyreview.com/feed/', + 'authority_level': 2, + 'language': 'en' + }, + { + 'name': 'TechCrunch', + 'url': 'https://techcrunch.com/feed/', + 'authority_level': 2, + 'language': 'en' + }, + { + 'name': 'The Verge', + 'url': 'https://www.theverge.com/rss/index.xml', + 'authority_level': 2, + 'language': 'en' + } + ], + 'manufacturing': [ + { + 'name': 'ISO News', + 'url': 'https://www.iso.org/rss/news.xml', + 'authority_level': 1, + 'language': 'en' + }, + { + 'name': 'IEEE Spectrum', + 'url': 'https://spectrum.ieee.org/rss/fulltext', + 'authority_level': 1, + 'language': 'en' + }, + { + 'name': 'Industry Week', + 'url': 
'https://www.industryweek.com/rss.xml', + 'authority_level': 2, + 'language': 'en' + }, + { + 'name': 'Manufacturing.net', + 'url': 'https://www.manufacturing.net/rss.xml', + 'authority_level': 3, + 'language': 'en' + } + ], + 'healthcare_pharma': [ + { + 'name': 'FDA News', + 'url': 'https://www.fda.gov/about-fda/contact-fda/stay-informed/rss-feeds', + 'authority_level': 1, + 'language': 'en' + }, + { + 'name': 'NIH News', + 'url': 'https://www.nih.gov/news-events/rss', + 'authority_level': 1, + 'language': 'en' + }, + { + 'name': 'WHO News', + 'url': 'https://www.who.int/rss-feeds', + 'authority_level': 1, + 'language': 'en' + }, + { + 'name': 'STAT News', + 'url': 'https://www.statnews.com/feed/', + 'authority_level': 2, + 'language': 'en' + } + ] +} + +# 搜索配置 +SEARCH_CONFIG = { + 'max_results_per_source': 50, + 'search_timeout': 30, + 'min_relevance_score': 0.3, + 'default_language': 'en', + 'keywords_for_china': ['中国', '国内', 'A股', '人民币', '央行', '国务院'] +} + +# 文档导出配置 +EXPORT_CONFIG = { + 'default_format': 'docx', + 'template_path': BASE_DIR / 'templates', + 'max_articles_per_doc': 20, + 'include_source_links': True +} + +# 日志配置 +LOGGING_CONFIG = { + 'level': 'INFO', + 'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s', + 'file': DATA_DIR / 'search_system.log', + 'max_size': 10 * 1024 * 1024, # 10MB + 'backup_count': 5 +} + +# RSS监控配置 +RSS_MONITOR_CONFIG = { + 'check_interval': 3600, # 1小时检查一次 + 'max_retries': 3, + 'timeout': 30, + 'user_agent': 'SearchSystem/1.0 (RSS Monitor)' +} \ No newline at end of file diff --git a/代码实现/database.py b/代码实现/database.py new file mode 100644 index 0000000..9e428f3 --- /dev/null +++ b/代码实现/database.py @@ -0,0 +1,353 @@ +# -*- coding: utf-8 -*- +""" +数据库操作类 +""" + +import sqlite3 +import hashlib +import json +import logging +from datetime import datetime, timedelta +from typing import List, Dict, Optional, Tuple +from pathlib import Path + +from config import DATABASE_CONFIG, RSS_SOURCES + +class DatabaseManager: + 
"""数据库管理类""" + + def __init__(self): + self.db_type = DATABASE_CONFIG['type'] + if self.db_type == 'sqlite': + self.db_path = DATABASE_CONFIG['sqlite']['path'] + self.conn = None + self.logger = logging.getLogger(__name__) + self._init_database() + + def _get_connection(self): + """获取数据库连接""" + if self.db_type == 'sqlite': + if not self.conn: + self.conn = sqlite3.connect(self.db_path, check_same_thread=False) + self.conn.row_factory = sqlite3.Row + return self.conn + # 后续可扩展MySQL/PostgreSQL + + def _init_database(self): + """初始化数据库""" + if not Path(self.db_path).exists(): + self._create_tables() + self._insert_initial_data() + + def _create_tables(self): + """创建数据库表""" + conn = self._get_connection() + cursor = conn.cursor() + + # 读取SQL文件并执行 + sql_file = Path(__file__).parent / 'database_schema.sql' + if sql_file.exists(): + with open(sql_file, 'r', encoding='utf-8') as f: + sql_script = f.read() + cursor.executescript(sql_script) + + conn.commit() + self.logger.info("数据库表创建完成") + + def _insert_initial_data(self): + """插入初始RSS源数据""" + conn = self._get_connection() + cursor = conn.cursor() + + # 获取行业ID映射 + cursor.execute("SELECT id, name_en FROM industries") + industry_map = {row['name_en']: row['id'] for row in cursor.fetchall()} + + # 插入RSS源 + for industry, sources in RSS_SOURCES.items(): + if industry in industry_map: + industry_id = industry_map[industry] + for source in sources: + cursor.execute(""" + INSERT OR IGNORE INTO rss_sources + (industry_id, source_name, source_url, source_type, authority_level, language) + VALUES (?, ?, ?, 'rss', ?, ?) 
+ """, (industry_id, source['name'], source['url'], + source['authority_level'], source['language'])) + + conn.commit() + self.logger.info("初始RSS源数据插入完成") + + def get_industries(self) -> List[Dict]: + """获取所有行业""" + conn = self._get_connection() + cursor = conn.cursor() + cursor.execute("SELECT * FROM industries ORDER BY name_en") + return [dict(row) for row in cursor.fetchall()] + + def get_rss_sources(self, industry_id: Optional[int] = None, + active_only: bool = True) -> List[Dict]: + """获取RSS源""" + conn = self._get_connection() + cursor = conn.cursor() + + query = "SELECT * FROM rss_sources WHERE 1=1" + params = [] + + if industry_id: + query += " AND industry_id = ?" + params.append(industry_id) + + if active_only: + query += " AND is_active = 1" + + query += " ORDER BY authority_level, source_name" + + cursor.execute(query, params) + return [dict(row) for row in cursor.fetchall()] + + def save_article(self, article_data: Dict) -> Optional[int]: + """保存文章""" + conn = self._get_connection() + cursor = conn.cursor() + + # 生成文章hash防重复 + content_hash = hashlib.sha256( + f"{article_data['title']}{article_data['original_url']}".encode() + ).hexdigest() + + # 检查是否已存在 + cursor.execute("SELECT id FROM articles WHERE article_hash = ?", (content_hash,)) + if cursor.fetchone(): + return None # 文章已存在 + + try: + cursor.execute(""" + INSERT INTO articles + (title, content, summary, author, source_id, original_url, + published_date, language, keywords, article_hash) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
+ """, ( + article_data['title'], + article_data.get('content', ''), + article_data.get('summary', ''), + article_data.get('author', ''), + article_data['source_id'], + article_data['original_url'], + article_data.get('published_date'), + article_data.get('language', 'en'), + json.dumps(article_data.get('keywords', [])), + content_hash + )) + + article_id = cursor.lastrowid + conn.commit() + self.logger.debug(f"保存文章: {article_data['title']}") + return article_id + + except Exception as e: + self.logger.error(f"保存文章失败: {e}") + conn.rollback() + return None + + def create_search_log(self, keywords: str, industry_id: Optional[int] = None, + language: str = 'en', user_ip: str = '') -> int: + """创建搜索记录""" + conn = self._get_connection() + cursor = conn.cursor() + + cursor.execute(""" + INSERT INTO search_logs (keywords, industry_id, language, user_ip) + VALUES (?, ?, ?, ?) + """, (keywords, industry_id, language, user_ip)) + + search_log_id = cursor.lastrowid + conn.commit() + return search_log_id + + def save_search_results(self, search_log_id: int, articles: List[Dict]): + """保存搜索结果""" + conn = self._get_connection() + cursor = conn.cursor() + + for rank, article in enumerate(articles, 1): + cursor.execute(""" + INSERT INTO search_results + (search_log_id, article_id, relevance_score, rank_position) + VALUES (?, ?, ?, ?) + """, (search_log_id, article['id'], article.get('relevance_score', 0), rank)) + + # 更新搜索记录的结果数量 + cursor.execute(""" + UPDATE search_logs SET results_count = ? WHERE id = ? 
+ """, (len(articles), search_log_id)) + + conn.commit() + + def search_articles(self, keywords: List[str], industry_id: Optional[int] = None, + language: Optional[str] = None, limit: int = 50, + days_back: int = 30) -> List[Dict]: + """搜索文章""" + conn = self._get_connection() + cursor = conn.cursor() + + # 构建搜索查询 + query = """ + SELECT a.*, rs.source_name, rs.authority_level, i.name_cn as industry_name + FROM articles a + JOIN rss_sources rs ON a.source_id = rs.id + JOIN industries i ON rs.industry_id = i.id + WHERE 1=1 + """ + params = [] + + # 时间范围过滤 + if days_back > 0: + date_threshold = datetime.now() - timedelta(days=days_back) + query += " AND a.published_date >= ?" + params.append(date_threshold) + + # 行业过滤 + if industry_id: + query += " AND rs.industry_id = ?" + params.append(industry_id) + + # 语言过滤 + if language: + query += " AND a.language = ?" + params.append(language) + + # 关键词搜索 + if keywords: + keyword_conditions = [] + for keyword in keywords: + keyword_conditions.append("(a.title LIKE ? OR a.content LIKE ?)") + params.extend([f"%{keyword}%", f"%{keyword}%"]) + + query += f" AND ({' OR '.join(keyword_conditions)})" + + # 排序和限制 + query += " ORDER BY rs.authority_level ASC, a.published_date DESC LIMIT ?" 
+ params.append(limit) + + cursor.execute(query, params) + results = [dict(row) for row in cursor.fetchall()] + + # 计算相关性分数 + for result in results: + result['relevance_score'] = self._calculate_relevance(result, keywords) + + # 按相关性和权威性排序 + results.sort(key=lambda x: (x['authority_level'], -x['relevance_score'])) + + return results + + def _calculate_relevance(self, article: Dict, keywords: List[str]) -> float: + """计算文章相关性分数""" + if not keywords: + return 1.0 + + title = article.get('title', '').lower() + content = article.get('content', '').lower() + + score = 0.0 + for keyword in keywords: + keyword = keyword.lower() + # 标题匹配权重更高 + title_matches = title.count(keyword) + content_matches = content.count(keyword) + + score += title_matches * 2.0 + content_matches * 0.5 + + # 根据信源权威级别调整分数 + authority_bonus = (4 - article.get('authority_level', 4)) * 0.1 + score += authority_bonus + + return min(score, 10.0) # 限制最高分数 + + def get_search_history(self, limit: int = 20) -> List[Dict]: + """获取搜索历史""" + conn = self._get_connection() + cursor = conn.cursor() + + cursor.execute(""" + SELECT sl.*, i.name_cn as industry_name + FROM search_logs sl + LEFT JOIN industries i ON sl.industry_id = i.id + ORDER BY sl.search_time DESC + LIMIT ? + """, (limit,)) + + return [dict(row) for row in cursor.fetchall()] + + def save_exported_doc(self, search_log_id: int, filename: str, + file_path: str, articles_count: int) -> int: + """保存导出文档记录""" + conn = self._get_connection() + cursor = conn.cursor() + + cursor.execute(""" + INSERT INTO exported_docs + (search_log_id, filename, file_path, articles_count) + VALUES (?, ?, ?, ?) + """, (search_log_id, filename, file_path, articles_count)) + + doc_id = cursor.lastrowid + conn.commit() + return doc_id + + def update_rss_source_check_time(self, source_id: int): + """更新RSS源检查时间""" + conn = self._get_connection() + cursor = conn.cursor() + + cursor.execute(""" + UPDATE rss_sources SET last_checked = CURRENT_TIMESTAMP WHERE id = ? 
+ """, (source_id,)) + + conn.commit() + + def get_statistics(self) -> Dict: + """获取系统统计信息""" + conn = self._get_connection() + cursor = conn.cursor() + + stats = {} + + # 文章总数 + cursor.execute("SELECT COUNT(*) as count FROM articles") + stats['total_articles'] = cursor.fetchone()['count'] + + # 今日新增文章 + cursor.execute(""" + SELECT COUNT(*) as count FROM articles + WHERE DATE(scraped_date) = DATE('now') + """) + stats['today_articles'] = cursor.fetchone()['count'] + + # 搜索总次数 + cursor.execute("SELECT COUNT(*) as count FROM search_logs") + stats['total_searches'] = cursor.fetchone()['count'] + + # 活跃RSS源数量 + cursor.execute("SELECT COUNT(*) as count FROM rss_sources WHERE is_active = 1") + stats['active_sources'] = cursor.fetchone()['count'] + + # 按行业统计文章数 + cursor.execute(""" + SELECT i.name_cn, COUNT(a.id) as count + FROM industries i + LEFT JOIN rss_sources rs ON i.id = rs.industry_id + LEFT JOIN articles a ON rs.id = a.source_id + GROUP BY i.id, i.name_cn + ORDER BY count DESC + """) + stats['articles_by_industry'] = [dict(row) for row in cursor.fetchall()] + + return stats + + def close(self): + """关闭数据库连接""" + if self.conn: + self.conn.close() + self.conn = None \ No newline at end of file diff --git a/代码实现/database_schema.sql b/代码实现/database_schema.sql new file mode 100644 index 0000000..d93cba7 --- /dev/null +++ b/代码实现/database_schema.sql @@ -0,0 +1,101 @@ +-- 搜索系统数据库结构 +-- 适用于 SQLite/MySQL/PostgreSQL + +-- 1. 行业分类表 +CREATE TABLE industries ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + name_en VARCHAR(50) NOT NULL UNIQUE, + name_cn VARCHAR(50) NOT NULL, + description TEXT, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP +); + +-- 2. 
信息源配置表 +CREATE TABLE rss_sources ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + industry_id INTEGER NOT NULL, + source_name VARCHAR(100) NOT NULL, + source_url VARCHAR(500) NOT NULL, + source_type VARCHAR(20) NOT NULL, -- 'rss', 'api', 'manual' + authority_level INTEGER DEFAULT 3, -- 1=官方机构, 2=主流媒体, 3=专业平台, 4=其他 + language VARCHAR(2) DEFAULT 'en', -- 'en', 'cn' + is_active BOOLEAN DEFAULT TRUE, + last_checked TIMESTAMP, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + FOREIGN KEY (industry_id) REFERENCES industries(id) +); + +-- 3. 搜索记录表 +CREATE TABLE search_logs ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + keywords TEXT NOT NULL, + industry_id INTEGER, + language VARCHAR(2) DEFAULT 'en', + results_count INTEGER DEFAULT 0, + search_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + user_ip VARCHAR(45), + FOREIGN KEY (industry_id) REFERENCES industries(id) +); + +-- 4. 文章内容表 +CREATE TABLE articles ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + title TEXT NOT NULL, + content TEXT, + summary TEXT, + author VARCHAR(200), + source_id INTEGER NOT NULL, + original_url VARCHAR(1000) NOT NULL, + published_date TIMESTAMP, + scraped_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + language VARCHAR(2) DEFAULT 'en', + keywords TEXT, -- JSON格式存储关键词 + article_hash VARCHAR(64) UNIQUE, -- 防重复 + is_archived BOOLEAN DEFAULT FALSE, + FOREIGN KEY (source_id) REFERENCES rss_sources(id) +); + +-- 5. 搜索结果表 +CREATE TABLE search_results ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + search_log_id INTEGER NOT NULL, + article_id INTEGER NOT NULL, + relevance_score FLOAT DEFAULT 0.0, + rank_position INTEGER, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + FOREIGN KEY (search_log_id) REFERENCES search_logs(id), + FOREIGN KEY (article_id) REFERENCES articles(id) +); + +-- 6. 
导出文档记录表 +CREATE TABLE exported_docs ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + search_log_id INTEGER NOT NULL, + filename VARCHAR(255) NOT NULL, + file_path VARCHAR(500) NOT NULL, + doc_type VARCHAR(20) DEFAULT 'docx', -- 'docx', 'pdf', 'txt' + articles_count INTEGER DEFAULT 0, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + FOREIGN KEY (search_log_id) REFERENCES search_logs(id) +); + +-- 插入基础数据 +INSERT INTO industries (name_en, name_cn, description) VALUES +('finance', '金融行业', '银行、证券、保险、投资等金融服务'), +('ai_software', 'AI与软件', '人工智能、软件开发、技术创新'), +('manufacturing', '制造业', '工业制造、自动化、生产技术'), +('healthcare_pharma', '医疗制药', '医疗健康、制药、生物技术'), +('fmcg', '快消品', '快速消费品、零售、品牌营销'), +('ecommerce_retail', '零售电商', '电子商务、零售业、数字营销'), +('energy_chemical', '能源化工', '能源、化工、石油、新能源'), +('real_estate', '房地产建筑', '房地产、建筑、基础设施'); + +-- 创建索引优化查询性能 +CREATE INDEX idx_articles_published_date ON articles(published_date); +CREATE INDEX idx_articles_source_id ON articles(source_id); +CREATE INDEX idx_articles_language ON articles(language); +CREATE INDEX idx_articles_hash ON articles(article_hash); +CREATE INDEX idx_search_logs_keywords ON search_logs(keywords); +CREATE INDEX idx_search_logs_time ON search_logs(search_time); +CREATE INDEX idx_rss_sources_industry ON rss_sources(industry_id); +CREATE INDEX idx_rss_sources_active ON rss_sources(is_active); \ No newline at end of file diff --git a/代码实现/document_exporter.py b/代码实现/document_exporter.py new file mode 100644 index 0000000..46d37d7 --- /dev/null +++ b/代码实现/document_exporter.py @@ -0,0 +1,370 @@ +# -*- coding: utf-8 -*- +""" +文档导出器 - 将搜索结果导出为DOCX格式 +""" + +import logging +from datetime import datetime +from typing import List, Dict, Optional +from pathlib import Path + +try: + from docx import Document + from docx.shared import Inches + from docx.enum.style import WD_STYLE_TYPE + from docx.enum.text import WD_ALIGN_PARAGRAPH + from docx.oxml.shared import OxmlElement, qn +except ImportError: + print("需要安装 python-docx: pip install 
python-docx") + raise + +from database import DatabaseManager +from config import EXPORT_CONFIG, EXPORT_DIR + +class DocumentExporter: + """文档导出器""" + + def __init__(self): + self.db = DatabaseManager() + self.logger = logging.getLogger(__name__) + self.export_dir = EXPORT_DIR + self.export_dir.mkdir(exist_ok=True) + + def export_search_results(self, search_log_id: int, + custom_filename: str = None) -> Dict: + """导出搜索结果为DOCX文档""" + try: + # 获取搜索记录和结果 + search_log = self._get_search_log(search_log_id) + if not search_log: + return {'success': False, 'error': '搜索记录不存在'} + + results = self._get_search_results(search_log_id) + if not results: + return {'success': False, 'error': '没有搜索结果可导出'} + + # 生成文件名 + filename = self._generate_filename(search_log, custom_filename) + file_path = self.export_dir / filename + + # 创建文档 + doc = self._create_document(search_log, results) + + # 保存文档 + doc.save(file_path) + + # 记录导出信息 + doc_id = self.db.save_exported_doc( + search_log_id, filename, str(file_path), len(results) + ) + + self.logger.info(f"文档导出成功: {filename}") + + return { + 'success': True, + 'filename': filename, + 'file_path': str(file_path), + 'articles_count': len(results), + 'doc_id': doc_id + } + + except Exception as e: + self.logger.error(f"文档导出失败: {e}") + return {'success': False, 'error': str(e)} + + def _get_search_log(self, search_log_id: int) -> Optional[Dict]: + """获取搜索记录""" + try: + conn = self.db._get_connection() + cursor = conn.cursor() + + cursor.execute(""" + SELECT sl.*, i.name_cn as industry_name, i.name_en as industry_en + FROM search_logs sl + LEFT JOIN industries i ON sl.industry_id = i.id + WHERE sl.id = ? 
+ """, (search_log_id,)) + + result = cursor.fetchone() + return dict(result) if result else None + + except Exception as e: + self.logger.error(f"获取搜索记录失败: {e}") + return None + + def _get_search_results(self, search_log_id: int) -> List[Dict]: + """获取搜索结果""" + try: + conn = self.db._get_connection() + cursor = conn.cursor() + + cursor.execute(""" + SELECT a.*, rs.source_name, rs.authority_level, sr.relevance_score, sr.rank_position + FROM search_results sr + JOIN articles a ON sr.article_id = a.id + JOIN rss_sources rs ON a.source_id = rs.id + WHERE sr.search_log_id = ? + ORDER BY sr.rank_position ASC + """, (search_log_id,)) + + return [dict(row) for row in cursor.fetchall()] + + except Exception as e: + self.logger.error(f"获取搜索结果失败: {e}") + return [] + + def _generate_filename(self, search_log: Dict, custom_filename: str = None) -> str: + """生成文件名""" + if custom_filename: + if not custom_filename.endswith('.docx'): + custom_filename += '.docx' + return custom_filename + + # 自动生成文件名 + date_str = datetime.now().strftime('%Y%m%d') + keywords = search_log.get('keywords', '').replace(' ', '_')[:20] + industry = search_log.get('industry_en', 'general') + language = search_log.get('language', 'en') + + # 根据语言选择文件名格式 + if language == 'cn': + filename = f"{date_str}_{industry}_{keywords}_CN.docx" + else: + filename = f"{date_str}_{industry}_{keywords}.docx" + + # 确保文件名安全 + filename = self._sanitize_filename(filename) + + return filename + + def _sanitize_filename(self, filename: str) -> str: + """清理文件名""" + import re + # 移除不安全字符 + filename = re.sub(r'[<>:"/\\|?*]', '_', filename) + # 限制长度 + if len(filename) > 100: + name, ext = filename.rsplit('.', 1) + filename = name[:90] + '.' 
+ ext + return filename + + def _create_document(self, search_log: Dict, results: List[Dict]) -> Document: + """创建DOCX文档""" + doc = Document() + + # 设置文档样式 + self._setup_document_styles(doc) + + # 添加标题 + self._add_title(doc, search_log) + + # 添加搜索信息 + self._add_search_info(doc, search_log) + + # 添加搜索结果 + self._add_search_results(doc, results) + + # 添加页脚 + self._add_footer(doc) + + return doc + + def _setup_document_styles(self, doc: Document): + """设置文档样式""" + try: + # 标题样式 + title_style = doc.styles.add_style('CustomTitle', WD_STYLE_TYPE.PARAGRAPH) + title_font = title_style.font + title_font.size = Inches(0.2) + title_font.bold = True + title_style.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.CENTER + + # 文章标题样式 + article_title_style = doc.styles.add_style('ArticleTitle', WD_STYLE_TYPE.PARAGRAPH) + article_title_font = article_title_style.font + article_title_font.size = Inches(0.15) + article_title_font.bold = True + + # 来源信息样式 + source_style = doc.styles.add_style('SourceInfo', WD_STYLE_TYPE.PARAGRAPH) + source_font = source_style.font + source_font.size = Inches(0.1) + source_font.italic = True + + except Exception as e: + # 如果样式已存在,忽略错误 + pass + + def _add_title(self, doc: Document, search_log: Dict): + """添加文档标题""" + keywords = search_log.get('keywords', '') + industry_name = search_log.get('industry_name', '通用') + date_str = datetime.now().strftime('%Y年%m月%d日') + + if search_log.get('language') == 'cn': + title = f"{industry_name}行业搜索报告\n关键词: {keywords}\n{date_str}" + else: + title = f"{search_log.get('industry_en', 'General')} Industry Search Report\nKeywords: {keywords}\n{date_str}" + + try: + title_para = doc.add_paragraph(title, style='CustomTitle') + except: + title_para = doc.add_paragraph(title) + title_para.alignment = WD_ALIGN_PARAGRAPH.CENTER + + doc.add_paragraph() # 空行 + + def _add_search_info(self, doc: Document, search_log: Dict): + """添加搜索信息""" + search_time = search_log.get('search_time', '') + if search_time: + search_time = 
datetime.fromisoformat(search_time.replace('Z', '')).strftime('%Y-%m-%d %H:%M:%S') + + info_lines = [ + f"搜索时间: {search_time}", + f"关键词: {search_log.get('keywords', '')}", + f"搜索行业: {search_log.get('industry_name', '全部')}", + f"搜索语言: {'中文' if search_log.get('language') == 'cn' else '英文'}", + f"结果数量: {search_log.get('results_count', 0)} 条" + ] + + info_para = doc.add_paragraph() + for line in info_lines: + info_para.add_run(line + '\n') + + doc.add_paragraph() # 空行 + doc.add_paragraph("="*50) # 分隔线 + doc.add_paragraph() + + def _add_search_results(self, doc: Document, results: List[Dict]): + """添加搜索结果""" + for i, result in enumerate(results, 1): + # 文章标题 + title = result.get('title', '无标题') + try: + title_para = doc.add_paragraph(f"{i}. {title}", style='ArticleTitle') + except: + title_para = doc.add_paragraph(f"{i}. {title}") + title_para.runs[0].bold = True + + # 来源信息 + source_info = self._format_source_info(result) + try: + source_para = doc.add_paragraph(source_info, style='SourceInfo') + except: + source_para = doc.add_paragraph(source_info) + source_para.runs[0].italic = True + + # 文章摘要 + summary = result.get('summary', result.get('content', '')) + if summary: + # 限制摘要长度 + if len(summary) > 300: + summary = summary[:300] + '...' 
+ doc.add_paragraph(summary) + + # 原文链接 + url = result.get('original_url', '') + if url and EXPORT_CONFIG.get('include_source_links', True): + link_para = doc.add_paragraph(f"原文链接: {url}") + link_para.runs[0].font.color.rgb = None # 蓝色链接 + + doc.add_paragraph() # 空行分隔 + + # 分页(每5篇文章一页) + if i % 5 == 0 and i < len(results): + doc.add_page_break() + + def _format_source_info(self, result: Dict) -> str: + """格式化来源信息""" + source_name = result.get('source_name', '未知来源') + author = result.get('author', '') + published_date = result.get('published_date', '') + authority_level = result.get('authority_level', 3) + relevance_score = result.get('relevance_score', 0) + + # 权威级别文本 + authority_map = {1: '官方机构', 2: '主流媒体', 3: '专业平台', 4: '其他'} + authority_text = authority_map.get(authority_level, '其他') + + # 格式化日期 + if published_date: + try: + if isinstance(published_date, str): + pub_date = datetime.fromisoformat(published_date.replace('Z', '')) + else: + pub_date = published_date + date_str = pub_date.strftime('%Y-%m-%d') + except: + date_str = str(published_date) + else: + date_str = '未知日期' + + info_parts = [ + f"来源: {source_name} ({authority_text})", + f"发布时间: {date_str}", + f"相关性: {relevance_score:.2f}" + ] + + if author: + info_parts.insert(1, f"作者: {author}") + + return " | ".join(info_parts) + + def _add_footer(self, doc: Document): + """添加页脚""" + doc.add_paragraph() + doc.add_paragraph("="*50) + + footer_text = f"本报告由智能搜索系统生成 | 生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}" + footer_para = doc.add_paragraph(footer_text) + footer_para.alignment = WD_ALIGN_PARAGRAPH.CENTER + + def get_export_history(self, limit: int = 20) -> List[Dict]: + """获取导出历史""" + try: + conn = self.db._get_connection() + cursor = conn.cursor() + + cursor.execute(""" + SELECT ed.*, sl.keywords, sl.search_time + FROM exported_docs ed + JOIN search_logs sl ON ed.search_log_id = sl.id + ORDER BY ed.created_at DESC + LIMIT ? 
+ """, (limit,)) + + return [dict(row) for row in cursor.fetchall()] + + except Exception as e: + self.logger.error(f"获取导出历史失败: {e}") + return [] + + def delete_exported_file(self, doc_id: int) -> Dict: + """删除导出的文件""" + try: + conn = self.db._get_connection() + cursor = conn.cursor() + + # 获取文件信息 + cursor.execute("SELECT file_path FROM exported_docs WHERE id = ?", (doc_id,)) + result = cursor.fetchone() + + if not result: + return {'success': False, 'error': '文档记录不存在'} + + file_path = Path(result['file_path']) + + # 删除文件 + if file_path.exists(): + file_path.unlink() + + # 删除数据库记录 + cursor.execute("DELETE FROM exported_docs WHERE id = ?", (doc_id,)) + conn.commit() + + return {'success': True, 'message': '文件删除成功'} + + except Exception as e: + self.logger.error(f"删除文件失败: {e}") + return {'success': False, 'error': str(e)} \ No newline at end of file diff --git a/代码实现/main.py b/代码实现/main.py new file mode 100644 index 0000000..d5b1f65 --- /dev/null +++ b/代码实现/main.py @@ -0,0 +1,367 @@ +# -*- coding: utf-8 -*- +""" +搜索系统主程序 +提供命令行界面和简单的Web界面 +""" + +import os +import sys +import logging +import argparse +from typing import Dict, List +from pathlib import Path + +# 添加当前目录到Python路径 +sys.path.append(str(Path(__file__).parent)) + +from config import LOGGING_CONFIG +from database import DatabaseManager +from search_engine import SearchEngine +from document_exporter import DocumentExporter +from rss_monitor import RSSMonitor + +class SearchSystemCLI: + """搜索系统命令行界面""" + + def __init__(self): + self.setup_logging() + self.db = DatabaseManager() + self.search_engine = SearchEngine() + self.exporter = DocumentExporter() + self.rss_monitor = RSSMonitor() + self.logger = logging.getLogger(__name__) + + def setup_logging(self): + """设置日志""" + logging.basicConfig( + level=LOGGING_CONFIG['level'], + format=LOGGING_CONFIG['format'], + handlers=[ + logging.FileHandler(LOGGING_CONFIG['file'], encoding='utf-8'), + logging.StreamHandler() + ] + ) + + def run_search(self, query: str, 
industry: str = None, + language: str = None, export: bool = False) -> Dict: + """执行搜索""" + print(f"\n🔍 搜索查询: {query}") + print(f"📊 行业: {industry or '全部'}") + print(f"🌐 语言: {language or '自动检测'}") + print("-" * 50) + + # 执行搜索 + result = self.search_engine.search( + query=query, + industry=industry, + language=language + ) + + if not result['success']: + print(f"❌ 搜索失败: {result.get('error', '未知错误')}") + return result + + # 显示搜索结果 + self.display_search_results(result) + + # 导出文档 + if export and result['results']: + export_result = self.exporter.export_search_results(result['search_log_id']) + if export_result['success']: + print(f"\n📄 文档导出成功: {export_result['filename']}") + print(f"📁 文件路径: {export_result['file_path']}") + else: + print(f"❌ 文档导出失败: {export_result.get('error', '未知错误')}") + + return result + + def display_search_results(self, result: Dict): + """显示搜索结果""" + print(f"\n✅ 搜索完成!") + print(f"📈 找到 {result['total_count']} 条结果") + print(f"⏱️ 搜索耗时: {result['search_time']} 秒") + print(f"🔗 检索源: {result['sources_searched']['total_sources']} 个") + + if not result['results']: + print("\n📭 没有找到相关结果") + return + + print(f"\n📰 搜索结果预览 (前5条):") + print("=" * 80) + + for i, article in enumerate(result['results'][:5], 1): + print(f"\n{i}. {article['title']}") + print(f" 🏢 来源: {article['source_name']} ({self.get_authority_text(article['authority_level'])})") + print(f" 📅 时间: {self.format_date(article.get('published_date', ''))}") + print(f" 🎯 相关性: {article.get('final_score', 0):.2f}") + print(f" 🔗 链接: {article['original_url']}") + + summary = article.get('summary', article.get('content', '')) + if summary: + summary = summary[:100] + '...' if len(summary) > 100 else summary + print(f" 📝 摘要: {summary}") + + if len(result['results']) > 5: + print(f"\n... 
还有 {len(result['results']) - 5} 条结果") + + def get_authority_text(self, level: int) -> str: + """获取权威级别文本""" + authority_map = {1: '官方机构', 2: '主流媒体', 3: '专业平台', 4: '其他'} + return authority_map.get(level, '其他') + + def format_date(self, date_str: str) -> str: + """格式化日期""" + if not date_str: + return '未知' + try: + from datetime import datetime + if isinstance(date_str, str): + date_obj = datetime.fromisoformat(date_str.replace('Z', '')) + else: + date_obj = date_str + return date_obj.strftime('%Y-%m-%d') + except: + return str(date_str) + + def show_statistics(self): + """显示系统统计""" + stats = self.db.get_statistics() + + print("\n📊 系统统计信息") + print("=" * 40) + print(f"📰 文章总数: {stats['total_articles']}") + print(f"🆕 今日新增: {stats['today_articles']}") + print(f"🔍 搜索总次数: {stats['total_searches']}") + print(f"📡 活跃源数: {stats['active_sources']}") + + print(f"\n📈 各行业文章分布:") + for item in stats['articles_by_industry'][:8]: + print(f" {item['name_cn']}: {item['count']} 篇") + + def show_search_history(self, limit: int = 10): + """显示搜索历史""" + history = self.db.get_search_history(limit) + + print(f"\n📜 最近 {limit} 次搜索记录") + print("=" * 60) + + for i, record in enumerate(history, 1): + print(f"{i}. 
{record['keywords']}") + print(f" 行业: {record.get('industry_name', '全部')} | " + f"结果: {record['results_count']} 条 | " + f"时间: {self.format_date(record['search_time'])}") + + def interactive_mode(self): + """交互模式""" + print("🚀 欢迎使用智能搜索系统!") + print("输入 'help' 查看帮助,输入 'quit' 退出") + + while True: + try: + command = input("\n>>> ").strip() + + if command.lower() in ['quit', 'exit', 'q']: + print("👋 再见!") + break + elif command.lower() == 'help': + self.show_help() + elif command.lower() == 'stats': + self.show_statistics() + elif command.lower() == 'history': + self.show_search_history() + elif command.startswith('search '): + query = command[7:] + self.run_search(query, export=True) + elif command: + # 直接搜索 + self.run_search(command, export=True) + else: + print("请输入搜索查询或命令") + + except KeyboardInterrupt: + print("\n👋 再见!") + break + except Exception as e: + print(f"❌ 错误: {e}") + + def show_help(self): + """显示帮助信息""" + help_text = """ +🆘 命令帮助: + search <查询词> - 执行搜索 + stats - 查看统计信息 + history - 查看搜索历史 + help - 显示此帮助 + quit/exit/q - 退出程序 + +🔍 搜索示例: + search AI breakthrough 2024 + search 英伟达最新财报 + search renewable energy policy + +💡 提示: + - 英文搜索会自动使用英文信源 + - 包含中文关键词会自动切换中文搜索 + - 搜索结果会自动导出为DOCX文档 + """ + print(help_text) + +def create_web_app(): + """创建简单的Web界面""" + try: + from flask import Flask, render_template_string, request, jsonify + + app = Flask(__name__) + cli = SearchSystemCLI() + + # 简单的HTML模板 + HTML_TEMPLATE = """ + + + + 智能搜索系统 + + + + +
+

🔍 智能搜索系统

+

支持8个行业的权威信息搜索

+
+ + + + {% if search_result %} +
+ 搜索结果: {{ search_result.total_count }} 条 | + 耗时: {{ search_result.search_time }} 秒 | + 信源: {{ search_result.sources_searched.total_sources }} 个 +
+ +
+ {% for article in search_result.results[:10] %} +
+
{{ loop.index }}. {{ article.title }}
+
+ 📰 {{ article.source_name }} | + 📅 {{ article.published_date or '未知时间' }} | + 🎯 相关性: {{ "%.2f"|format(article.final_score or 0) }} +
+
{{ article.summary[:200] }}...
+
🔗 查看原文
+
+ {% endfor %} +
+ {% endif %} + + {% if error %} +
+ ❌ {{ error }} +
+ {% endif %} + + + """ + + @app.route('/', methods=['GET', 'POST']) + def index(): + if request.method == 'POST': + query = request.form.get('query', '').strip() + industry = request.form.get('industry', '') or None + + if query: + try: + result = cli.search_engine.search(query, industry) + if result['success']: + return render_template_string(HTML_TEMPLATE, + query=query, + search_result=result) + else: + return render_template_string(HTML_TEMPLATE, + query=query, + error=result.get('error', '搜索失败')) + except Exception as e: + return render_template_string(HTML_TEMPLATE, + query=query, + error=str(e)) + else: + return render_template_string(HTML_TEMPLATE, + query=query, + error='请输入搜索关键词') + + return render_template_string(HTML_TEMPLATE) + + return app + + except ImportError: + print("Flask未安装,无法启动Web界面") + print("请运行: pip install flask") + return None + +def main(): + """主函数""" + parser = argparse.ArgumentParser(description='智能搜索系统') + parser.add_argument('--mode', choices=['cli', 'web', 'monitor'], + default='cli', help='运行模式') + parser.add_argument('--query', type=str, help='搜索查询') + parser.add_argument('--industry', type=str, help='搜索行业') + parser.add_argument('--language', type=str, choices=['en', 'cn'], help='搜索语言') + parser.add_argument('--export', action='store_true', help='导出结果') + parser.add_argument('--port', type=int, default=5000, help='Web端口') + + args = parser.parse_args() + + if args.mode == 'monitor': + # RSS监控模式 + print("🚀 启动RSS监控器...") + from rss_monitor import start_rss_monitor + start_rss_monitor() + + elif args.mode == 'web': + # Web界面模式 + app = create_web_app() + if app: + print(f"🌐 启动Web界面: http://localhost:{args.port}") + app.run(host='0.0.0.0', port=args.port, debug=False) + + elif args.mode == 'cli': + # 命令行模式 + cli = SearchSystemCLI() + + if args.query: + # 直接执行搜索 + cli.run_search(args.query, args.industry, args.language, args.export) + else: + # 交互模式 + cli.interactive_mode() + +if __name__ == "__main__": + main() \ No newline at end 
of file diff --git a/代码实现/requirements.txt b/代码实现/requirements.txt new file mode 100644 index 0000000..92d9a02 --- /dev/null +++ b/代码实现/requirements.txt @@ -0,0 +1,41 @@ +# 搜索系统依赖包 + +# 核心依赖 +requests>=2.28.0 +feedparser>=6.0.10 +python-docx>=0.8.11 + +# 数据库 +sqlite3 # Python内置,无需安装 + +# 可选API依赖 +newsapi-python>=0.2.6 + +# 日志和工具 +pathlib # Python内置,无需安装 +logging # Python内置,无需安装 +hashlib # Python内置,无需安装 +json # Python内置,无需安装 +datetime # Python内置,无需安装 +typing # Python内置,无需安装 +threading # Python内置,无需安装 +concurrent.futures # Python内置,无需安装 +collections # Python内置,无需安装 +html # Python内置,无需安装 +re # Python内置,无需安装 +time # Python内置,无需安装 + +# Web界面(可选) +flask>=2.0.0 +jinja2>=3.0.0 + +# 数据处理增强(可选) +pandas>=1.5.0 +numpy>=1.21.0 + +# 中文处理(可选) +jieba>=0.42.1 + +# 更高级的NLP处理(可选) +nltk>=3.8 +scikit-learn>=1.1.0 \ No newline at end of file diff --git a/代码实现/rss_monitor.py b/代码实现/rss_monitor.py new file mode 100644 index 0000000..09ec40e --- /dev/null +++ b/代码实现/rss_monitor.py @@ -0,0 +1,324 @@ +# -*- coding: utf-8 -*- +""" +RSS监控脚本 - 自动获取RSS源更新 +""" + +import feedparser +import requests +import time +import logging +import threading +from datetime import datetime, timezone +from typing import List, Dict, Optional +from concurrent.futures import ThreadPoolExecutor, as_completed + +from database import DatabaseManager +from config import RSS_MONITOR_CONFIG, SEARCH_CONFIG + +class RSSMonitor: + """RSS监控器""" + + def __init__(self): + self.db = DatabaseManager() + self.logger = logging.getLogger(__name__) + self.is_running = False + self.check_interval = RSS_MONITOR_CONFIG['check_interval'] + self.max_retries = RSS_MONITOR_CONFIG['max_retries'] + self.timeout = RSS_MONITOR_CONFIG['timeout'] + self.user_agent = RSS_MONITOR_CONFIG['user_agent'] + + def start_monitoring(self): + """开始监控RSS源""" + self.is_running = True + self.logger.info("RSS监控器启动") + + while self.is_running: + try: + self._check_all_sources() + self.logger.info(f"等待 {self.check_interval} 秒后进行下次检查") + 
time.sleep(self.check_interval) + except KeyboardInterrupt: + self.logger.info("收到停止信号") + break + except Exception as e: + self.logger.error(f"监控过程出错: {e}") + time.sleep(60) # 出错后等待1分钟再继续 + + def stop_monitoring(self): + """停止监控""" + self.is_running = False + self.logger.info("RSS监控器停止") + + def _check_all_sources(self): + """检查所有RSS源""" + sources = self.db.get_rss_sources() + self.logger.info(f"开始检查 {len(sources)} 个RSS源") + + # 使用线程池并行处理 + with ThreadPoolExecutor(max_workers=10) as executor: + futures = { + executor.submit(self._check_single_source, source): source + for source in sources + } + + success_count = 0 + error_count = 0 + + for future in as_completed(futures): + source = futures[future] + try: + articles_count = future.result() + if articles_count is not None: + success_count += 1 + if articles_count > 0: + self.logger.info( + f"{source['source_name']}: 新增 {articles_count} 篇文章" + ) + else: + error_count += 1 + except Exception as e: + error_count += 1 + self.logger.error(f"检查 {source['source_name']} 时出错: {e}") + + self.logger.info(f"RSS检查完成: 成功 {success_count}, 失败 {error_count}") + + def _check_single_source(self, source: Dict) -> Optional[int]: + """检查单个RSS源""" + source_id = source['id'] + source_name = source['source_name'] + source_url = source['source_url'] + + try: + # 获取RSS内容 + articles = self._fetch_rss_articles(source_url, source) + + if articles is None: + return None + + # 保存新文章 + new_articles_count = 0 + for article in articles: + article['source_id'] = source_id + article_id = self.db.save_article(article) + if article_id: + new_articles_count += 1 + + # 更新RSS源检查时间 + self.db.update_rss_source_check_time(source_id) + + return new_articles_count + + except Exception as e: + self.logger.error(f"检查RSS源 {source_name} 失败: {e}") + return None + + def _fetch_rss_articles(self, url: str, source: Dict) -> Optional[List[Dict]]: + """获取RSS文章""" + headers = { + 'User-Agent': self.user_agent, + 'Accept': 'application/rss+xml, application/xml, text/xml' 
+ } + + for attempt in range(self.max_retries): + try: + # 获取RSS内容 + response = requests.get(url, headers=headers, timeout=self.timeout) + response.raise_for_status() + + # 解析RSS + feed = feedparser.parse(response.content) + + if feed.bozo and feed.bozo_exception: + self.logger.warning( + f"RSS解析警告 {source['source_name']}: {feed.bozo_exception}" + ) + + articles = [] + for entry in feed.entries: + article = self._parse_rss_entry(entry, source) + if article: + articles.append(article) + + return articles + + except requests.RequestException as e: + self.logger.warning( + f"第 {attempt + 1} 次尝试获取 {source['source_name']} 失败: {e}" + ) + if attempt < self.max_retries - 1: + time.sleep(2 ** attempt) # 指数退避 + except Exception as e: + self.logger.error(f"解析RSS {source['source_name']} 时出错: {e}") + break + + return None + + def _parse_rss_entry(self, entry, source: Dict) -> Optional[Dict]: + """解析RSS条目""" + try: + # 获取发布时间 + published_date = None + if hasattr(entry, 'published_parsed') and entry.published_parsed: + published_date = datetime(*entry.published_parsed[:6], tzinfo=timezone.utc) + elif hasattr(entry, 'updated_parsed') and entry.updated_parsed: + published_date = datetime(*entry.updated_parsed[:6], tzinfo=timezone.utc) + + # 获取内容 + content = '' + if hasattr(entry, 'content') and entry.content: + content = entry.content[0].value if isinstance(entry.content, list) else entry.content + elif hasattr(entry, 'summary'): + content = entry.summary + elif hasattr(entry, 'description'): + content = entry.description + + # 获取作者 + author = '' + if hasattr(entry, 'author'): + author = entry.author + elif hasattr(entry, 'dc_creator'): + author = entry.dc_creator + + # 提取关键词 + keywords = self._extract_keywords(entry.title, content) + + article = { + 'title': entry.title if hasattr(entry, 'title') else '', + 'content': self._clean_content(content), + 'summary': entry.summary if hasattr(entry, 'summary') else '', + 'author': author, + 'original_url': entry.link if hasattr(entry, 
'link') else '', + 'published_date': published_date, + 'language': source.get('language', 'en'), + 'keywords': keywords + } + + # 验证必要字段 + if not article['title'] or not article['original_url']: + return None + + return article + + except Exception as e: + self.logger.error(f"解析RSS条目时出错: {e}") + return None + + def _clean_content(self, content: str) -> str: + """清理HTML内容""" + if not content: + return '' + + try: + import re + from html import unescape + + # 移除HTML标签 + content = re.sub(r'<[^>]+>', '', content) + # 解码HTML实体 + content = unescape(content) + # 移除多余空白 + content = re.sub(r'\s+', ' ', content).strip() + + return content + except: + return content + + def _extract_keywords(self, title: str, content: str) -> List[str]: + """提取关键词""" + try: + text = f"{title} {content}".lower() + + # 简单关键词提取(可以用更高级的NLP库) + import re + words = re.findall(r'\b[a-zA-Z]{3,}\b', text) + + # 过滤常见停用词 + stop_words = { + 'the', 'and', 'for', 'are', 'but', 'not', 'you', 'all', 'can', 'had', + 'her', 'was', 'one', 'our', 'out', 'day', 'get', 'has', 'him', 'his', + 'how', 'its', 'may', 'new', 'now', 'old', 'see', 'two', 'who', 'boy', + 'this', 'that', 'with', 'have', 'will', 'from', 'they', 'been', + 'said', 'each', 'make', 'most', 'over', 'some', 'time', 'very', + 'what', 'when', 'here', 'just', 'like', 'long', 'many', 'than', + 'them', 'well', 'your', 'come', 'could', 'into', 'more', 'much', + 'only', 'other', 'such', 'take', 'than', 'them', 'well', 'were' + } + + keywords = [word for word in words if word not in stop_words] + + # 统计词频并返回前10个 + from collections import Counter + word_counts = Counter(keywords) + return [word for word, count in word_counts.most_common(10)] + + except Exception as e: + self.logger.error(f"提取关键词时出错: {e}") + return [] + + def manual_check_source(self, source_id: int) -> Dict: + """手动检查指定RSS源""" + sources = self.db.get_rss_sources() + source = next((s for s in sources if s['id'] == source_id), None) + + if not source: + return {'success': False, 'message': 
'RSS源不存在'} + + try: + articles_count = self._check_single_source(source) + if articles_count is not None: + return { + 'success': True, + 'message': f'成功检查 {source["source_name"]}', + 'new_articles': articles_count + } + else: + return { + 'success': False, + 'message': f'检查 {source["source_name"]} 失败' + } + except Exception as e: + return { + 'success': False, + 'message': f'检查失败: {str(e)}' + } + + def get_monitor_status(self) -> Dict: + """获取监控状态""" + stats = self.db.get_statistics() + + return { + 'is_running': self.is_running, + 'check_interval': self.check_interval, + 'total_sources': stats.get('active_sources', 0), + 'total_articles': stats.get('total_articles', 0), + 'today_articles': stats.get('today_articles', 0) + } + +def start_rss_monitor(): + """启动RSS监控器的主函数""" + import logging.config + from config import LOGGING_CONFIG + + # 配置日志 + logging.basicConfig( + level=LOGGING_CONFIG['level'], + format=LOGGING_CONFIG['format'], + handlers=[ + logging.FileHandler(LOGGING_CONFIG['file'], encoding='utf-8'), + logging.StreamHandler() + ] + ) + + monitor = RSSMonitor() + + try: + monitor.start_monitoring() + except KeyboardInterrupt: + print("\n收到停止信号,正在关闭RSS监控器...") + finally: + monitor.stop_monitoring() + monitor.db.close() + print("RSS监控器已停止") + +if __name__ == "__main__": + start_rss_monitor() \ No newline at end of file diff --git a/代码实现/search_engine.py b/代码实现/search_engine.py new file mode 100644 index 0000000..0fbb315 --- /dev/null +++ b/代码实现/search_engine.py @@ -0,0 +1,461 @@ +# -*- coding: utf-8 -*- +""" +搜索引擎主类 +""" + +import requests +import logging +import time +from typing import List, Dict, Optional, Tuple +from datetime import datetime, timedelta + +from database import DatabaseManager +from config import API_CONFIG, SEARCH_CONFIG + +class SearchEngine: + """智能搜索引擎""" + + def __init__(self): + self.db = DatabaseManager() + self.logger = logging.getLogger(__name__) + self.newsapi_key = API_CONFIG['newsapi']['key'] + self.twitter_token = 
API_CONFIG['twitter']['bearer_token'] + self.alpha_vantage_key = API_CONFIG['alpha_vantage']['key'] + + def search(self, query: str, industry: str = None, + language: str = None, user_ip: str = '') -> Dict: + """执行搜索""" + start_time = time.time() + + # 解析查询参数 + search_params = self._parse_query(query, industry, language) + keywords = search_params['keywords'] + industry_id = search_params['industry_id'] + detected_language = search_params['language'] + + self.logger.info(f"开始搜索: {keywords}, 行业: {industry}, 语言: {detected_language}") + + # 创建搜索记录 + search_log_id = self.db.create_search_log( + keywords=' '.join(keywords), + industry_id=industry_id, + language=detected_language, + user_ip=user_ip + ) + + try: + # 多源搜索 + all_results = [] + + # 1. 搜索本地数据库 + db_results = self._search_database(keywords, industry_id, detected_language) + all_results.extend(db_results) + self.logger.info(f"数据库搜索结果: {len(db_results)} 条") + + # 2. NewsAPI搜索(如果有API密钥) + if self.newsapi_key and detected_language == 'en': + news_results = self._search_newsapi(keywords, industry) + all_results.extend(news_results) + self.logger.info(f"NewsAPI搜索结果: {len(news_results)} 条") + + # 3. 
金融数据API搜索(金融行业) + if industry == 'finance' and self.alpha_vantage_key: + finance_results = self._search_financial_data(keywords) + all_results.extend(finance_results) + self.logger.info(f"金融数据搜索结果: {len(finance_results)} 条") + + # 结果去重和排序 + final_results = self._process_results(all_results, keywords) + + # 保存搜索结果 + if final_results: + self.db.save_search_results(search_log_id, final_results) + + search_time = time.time() - start_time + + return { + 'success': True, + 'search_log_id': search_log_id, + 'query': query, + 'keywords': keywords, + 'industry': industry, + 'language': detected_language, + 'results': final_results, + 'total_count': len(final_results), + 'search_time': round(search_time, 2), + 'sources_searched': self._get_sources_info(industry_id) + } + + except Exception as e: + self.logger.error(f"搜索过程出错: {e}") + return { + 'success': False, + 'error': str(e), + 'search_log_id': search_log_id, + 'query': query + } + + def _parse_query(self, query: str, industry: str = None, + language: str = None) -> Dict: + """解析搜索查询""" + # 提取关键词 + keywords = self._extract_keywords(query) + + # 检测语言 + if not language: + language = self._detect_language(query) + + # 获取行业ID + industry_id = None + if industry: + industries = self.db.get_industries() + for ind in industries: + if ind['name_en'] == industry: + industry_id = ind['id'] + break + + return { + 'keywords': keywords, + 'industry_id': industry_id, + 'language': language + } + + def _extract_keywords(self, query: str) -> List[str]: + """提取搜索关键词""" + import re + + # 基础关键词提取 + words = re.findall(r'\b\w+\b', query.lower()) + + # 过滤停用词 + stop_words = { + 'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', + 'by', 'from', 'up', 'about', 'into', 'through', 'during', 'before', + 'after', 'above', 'below', 'up', 'down', 'out', 'off', 'over', 'under', + 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', + 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', + 'other', 
'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', + 'very', 'can', 'will', 'just', 'should', 'now', 'what', 'news', + 'latest', 'recent', 'update', 'today', 'yesterday' + } + + keywords = [word for word in words if len(word) > 2 and word not in stop_words] + + # 保留原始查询中的重要短语 + phrases = self._extract_phrases(query) + keywords.extend(phrases) + + return list(set(keywords)) # 去重 + + def _extract_phrases(self, query: str) -> List[str]: + """提取重要短语""" + import re + + # 提取引号内的短语 + quoted_phrases = re.findall(r'"([^"]*)"', query) + + # 提取常见的技术术语和公司名 + phrases = [] + + # 技术术语模式 + tech_patterns = [ + r'\b[A-Z]{2,}\b', # 大写缩写 (AI, API, GDP) + r'\b\w+\.\w+\b', # 域名格式 + r'\b\w+-\w+\b', # 连字符词组 + ] + + for pattern in tech_patterns: + matches = re.findall(pattern, query) + phrases.extend(matches) + + phrases.extend(quoted_phrases) + return phrases + + def _detect_language(self, query: str) -> str: + """检测查询语言""" + # 检查是否包含中文特定关键词 + china_keywords = SEARCH_CONFIG['keywords_for_china'] + + for keyword in china_keywords: + if keyword in query: + return 'cn' + + # 检查是否包含中文字符 + import re + chinese_chars = re.findall(r'[\u4e00-\u9fff]+', query) + if chinese_chars: + return 'cn' + + return SEARCH_CONFIG['default_language'] + + def _search_database(self, keywords: List[str], industry_id: Optional[int], + language: str) -> List[Dict]: + """搜索本地数据库""" + return self.db.search_articles( + keywords=keywords, + industry_id=industry_id, + language=language if language != 'cn' else None, + limit=SEARCH_CONFIG['max_results_per_source'] + ) + + def _search_newsapi(self, keywords: List[str], industry: str = None) -> List[Dict]: + """使用NewsAPI搜索""" + if not self.newsapi_key: + return [] + + try: + url = f"{API_CONFIG['newsapi']['base_url']}everything" + + # 构建查询字符串 + query_str = ' AND '.join(keywords[:5]) # 限制关键词数量 + + params = { + 'q': query_str, + 'apiKey': self.newsapi_key, + 'language': 'en', + 'sortBy': 'relevancy', + 'pageSize': 20, + 'from': (datetime.now() - 
timedelta(days=30)).isoformat() + } + + # 添加行业相关域名 + if industry: + domains = self._get_industry_domains(industry) + if domains: + params['domains'] = ','.join(domains) + + response = requests.get(url, params=params, timeout=30) + response.raise_for_status() + + data = response.json() + articles = [] + + for article in data.get('articles', []): + processed_article = { + 'id': f"newsapi_{hash(article['url'])}", + 'title': article['title'], + 'content': article.get('description', ''), + 'summary': article.get('description', ''), + 'author': article.get('author', ''), + 'original_url': article['url'], + 'published_date': self._parse_date(article.get('publishedAt')), + 'source_name': article['source']['name'], + 'authority_level': 2, # 默认主流媒体级别 + 'language': 'en', + 'relevance_score': 0.8 # NewsAPI结果相关性较高 + } + articles.append(processed_article) + + self.logger.info(f"NewsAPI返回 {len(articles)} 条结果") + return articles + + except Exception as e: + self.logger.error(f"NewsAPI搜索失败: {e}") + return [] + + def _search_financial_data(self, keywords: List[str]) -> List[Dict]: + """搜索金融数据""" + if not self.alpha_vantage_key: + return [] + + try: + # 检查关键词是否包含股票代码 + stock_symbols = self._extract_stock_symbols(keywords) + if not stock_symbols: + return [] + + articles = [] + for symbol in stock_symbols[:3]: # 限制查询数量 + data = self._get_stock_news(symbol) + if data: + articles.extend(data) + + return articles + + except Exception as e: + self.logger.error(f"金融数据搜索失败: {e}") + return [] + + def _extract_stock_symbols(self, keywords: List[str]) -> List[str]: + """提取股票代码""" + import re + symbols = [] + + for keyword in keywords: + # 检查是否为股票代码格式 + if re.match(r'^[A-Z]{1,5}$', keyword.upper()): + symbols.append(keyword.upper()) + + # 添加一些常见公司的股票代码映射 + company_symbols = { + 'apple': 'AAPL', 'microsoft': 'MSFT', 'google': 'GOOGL', + 'amazon': 'AMZN', 'tesla': 'TSLA', 'meta': 'META', + 'nvidia': 'NVDA', 'intel': 'INTC', 'amd': 'AMD' + } + + for keyword in keywords: + if keyword.lower() in 
company_symbols: + symbols.append(company_symbols[keyword.lower()]) + + return list(set(symbols)) + + def _get_stock_news(self, symbol: str) -> List[Dict]: + """获取股票新闻""" + try: + url = API_CONFIG['alpha_vantage']['base_url'] + params = { + 'function': 'NEWS_SENTIMENT', + 'tickers': symbol, + 'apikey': self.alpha_vantage_key, + 'limit': 10 + } + + response = requests.get(url, params=params, timeout=30) + response.raise_for_status() + + data = response.json() + articles = [] + + for item in data.get('feed', []): + article = { + 'id': f"alphavantage_{hash(item['url'])}", + 'title': item['title'], + 'content': item.get('summary', ''), + 'summary': item.get('summary', ''), + 'author': ','.join(item.get('authors', [])), + 'original_url': item['url'], + 'published_date': self._parse_date(item.get('time_published')), + 'source_name': item.get('source', 'Alpha Vantage'), + 'authority_level': 2, + 'language': 'en', + 'relevance_score': float(item.get('overall_sentiment_score', 0.5)) + } + articles.append(article) + + return articles + + except Exception as e: + self.logger.error(f"获取 {symbol} 股票新闻失败: {e}") + return [] + + def _parse_date(self, date_str: str) -> Optional[datetime]: + """解析日期字符串""" + if not date_str: + return None + + try: + # 尝试多种日期格式 + formats = [ + '%Y-%m-%dT%H:%M:%SZ', + '%Y-%m-%dT%H:%M:%S', + '%Y%m%dT%H%M%S', + '%Y-%m-%d %H:%M:%S', + '%Y-%m-%d' + ] + + for fmt in formats: + try: + return datetime.strptime(date_str, fmt) + except ValueError: + continue + + return None + except Exception: + return None + + def _process_results(self, results: List[Dict], keywords: List[str]) -> List[Dict]: + """处理和排序搜索结果""" + if not results: + return [] + + # 去重(基于URL) + seen_urls = set() + unique_results = [] + + for result in results: + url = result.get('original_url', '') + if url and url not in seen_urls: + seen_urls.add(url) + unique_results.append(result) + + # 计算最终相关性分数 + for result in unique_results: + score = result.get('relevance_score', 0) + + # 根据权威级别调整分数 + 
authority_bonus = (4 - result.get('authority_level', 4)) * 0.2 + score += authority_bonus + + # 根据发布时间调整分数(越新越好) + pub_date = result.get('published_date') + if pub_date: + days_old = (datetime.now() - pub_date).days + time_factor = max(0, 1 - days_old / 30) # 30天内线性衰减 + score += time_factor * 0.1 + + result['final_score'] = score + + # 过滤低相关性结果 + min_score = SEARCH_CONFIG['min_relevance_score'] + filtered_results = [r for r in unique_results if r.get('final_score', 0) >= min_score] + + # 按分数排序 + filtered_results.sort(key=lambda x: x.get('final_score', 0), reverse=True) + + # 限制结果数量 + max_results = SEARCH_CONFIG['max_results_per_source'] * 2 + return filtered_results[:max_results] + + def _get_industry_domains(self, industry: str) -> List[str]: + """获取行业相关域名""" + domain_map = { + 'finance': [ + 'bloomberg.com', 'reuters.com', 'ft.com', 'wsj.com', + 'cnbc.com', 'marketwatch.com', 'forbes.com' + ], + 'ai_software': [ + 'techcrunch.com', 'venturebeat.com', 'theverge.com', + 'arstechnica.com', 'wired.com', 'technologyreview.com' + ], + 'healthcare_pharma': [ + 'statnews.com', 'fiercepharma.com', 'biopharmadive.com', + 'nature.com', 'nejm.org' + ] + } + + return domain_map.get(industry, []) + + def _get_sources_info(self, industry_id: Optional[int]) -> Dict: + """获取搜索源信息""" + sources = self.db.get_rss_sources(industry_id) + + return { + 'total_sources': len(sources), + 'by_authority': { + '1': len([s for s in sources if s['authority_level'] == 1]), + '2': len([s for s in sources if s['authority_level'] == 2]), + '3': len([s for s in sources if s['authority_level'] == 3]) + } + } + + def get_search_suggestions(self, partial_query: str, limit: int = 10) -> List[str]: + """获取搜索建议""" + try: + # 基于历史搜索记录提供建议 + history = self.db.get_search_history(limit=100) + suggestions = [] + + partial_lower = partial_query.lower() + + for record in history: + keywords = record.get('keywords', '') + if partial_lower in keywords.lower() and keywords not in suggestions: + 
suggestions.append(keywords) + if len(suggestions) >= limit: + break + + return suggestions + + except Exception as e: + self.logger.error(f"获取搜索建议失败: {e}") + return [] \ No newline at end of file diff --git a/使用说明.md b/使用说明.md new file mode 100644 index 0000000..60ac03f --- /dev/null +++ b/使用说明.md @@ -0,0 +1,172 @@ +# 搜索系统使用说明 v2.0 + +## 核心改进 +**本次更新重点简化搜索规则,强化英文搜索优先,增加官方权威机构和头部自媒体库。** + +## 核心原则 + +### 1. 英文搜索优先 +- **默认语言**:所有搜索默认使用英文 +- **中国特定**:仅当用户明确提及"中国"、"国内"、"A股"等时,启用中文搜索 +- **全球视野**:确保获取最权威、最及时的国际信息 + +### 2. 权威信息源优先级 +``` +官方监管机构 > 头部媒体/自媒体 > 专业媒体 > 其他来源 +``` + +### 3. 自动行业识别 +系统通过英文关键词自动判定行业: +- **金融**:finance, banking, securities, investment, fintech +- **AI软件**:AI, machine learning, software, programming, algorithm +- **制造业**:manufacturing, industry 4.0, automation, supply chain +- **医疗制药**:healthcare, pharma, medicine, clinical trial, FDA +- **快消品**:consumer goods, FMCG, brand, retail, marketing +- **零售电商**:e-commerce, retail, online shopping, logistics +- **能源化工**:energy, oil, gas, renewable, chemical, petroleum +- **房地产建筑**:real estate, construction, property, housing + +## 简化搜索流程 + +``` +用户输入 → 关键词提取 → 行业判定 → 语言判定(默认英文) → 调用行业规则 → 执行搜索 → 归档 +``` + +## 权威信息源库 + +### 各行业通用官方机构 +- **美国政府**:gov, sec.gov, federalreserve.gov, fda.gov, treasury.gov +- **国际组织**:imf.org, worldbank.org, who.int, bis.org, oecd.org +- **欧盟机构**:europa.eu 相关部门网站 + +### 头部国际媒体 +- **综合新闻**:reuters.com, bloomberg.com, ap.org, bbc.com +- **财经媒体**:ft.com, wsj.com, economist.com, cnbc.com +- **科技媒体**:techcrunch.com, wired.com, arstechnica.com + +### 头部自媒体/KOL(按行业分类) +每个行业规则文件中都包含详细的KOL列表和Twitter账号 + +## 文件命名新标准 + +### 英文搜索(默认) +- **格式**:`YYYYMMDD_[Industry]_[Module]_[Topic].docx` +- **示例**:`20250128_Finance_Regulation_Fed_Policy.docx` + +### 中国特定搜索 +- **格式**:`YYYYMMDD_[Industry]_[Module]_[Topic]_CN.docx` +- **示例**:`20250128_Finance_Market_A_Stock_CN.docx` + +## 行业专门规则 + +### 简化后的8个行业 +1. **金融行业** - `金融行业/金融搜索规则.md` +2. **AI与软件** - `AI与软件/AI软件搜索规则.md` +3. **制造业** - `制造业/制造业搜索规则.md` +4. 
**医疗制药** - `医疗制药/医疗制药搜索规则.md` +5. **快消品** - `快消品/快消品搜索规则.md` +6. **零售电商** - `零售电商/零售电商搜索规则.md` +7. **能源化工** - `能源化工/能源化工搜索规则.md` +8. **房地产建筑** - `房地产建筑/房地产建筑搜索规则.md` + +### 每个行业规则包含 +- **核心原则**:语言优先级和权威优先级 +- **英文权威信息源库**:官方机构、研究机构、专业媒体 +- **头部自媒体与KOL**:Twitter专家、公司官方账号、专业社区 +- **搜索策略与关键词**:具体的英文关键词模板 +- **行业专门搜索网站**:针对性的专业网站 +- **实时监控重点**:不同频率的监控内容 +- **中国特定搜索**:仅在明确要求时使用 + +## 快速使用指南 + +### 常见搜索场景 + +#### 场景1:全球金融政策 +**输入**:Federal Reserve interest rate policy +**自动识别**:Finance → Policy +**调用规则**:金融搜索规则 +**信息源**:federalreserve.gov, bloomberg.com, ft.com +**归档**:`20250128_Finance_Policy_Fed_Interest_Rate.docx` + +#### 场景2:AI技术突破 +**输入**:OpenAI GPT latest breakthrough +**自动识别**:AI → Technology +**调用规则**:AI软件搜索规则 +**信息源**:openai.com, arxiv.org, techcrunch.com +**归档**:`20250128_AI_Tech_OpenAI_GPT_Breakthrough.docx` + +#### 场景3:中国特定搜索 +**输入**:中国A股市场分析 +**特殊识别**:Finance → Market → CN +**调用规则**:金融搜索规则(中国部分) +**信息源**:证监会、财新网、第一财经 +**归档**:`20250128_Finance_Market_A_Stock_Analysis_CN.docx` + +## 质量控制 + +### 信息源验证 +- **官方优先**:政府机构、监管部门、国际组织 +- **权威媒体**:知名财经、科技、行业媒体 +- **专家观点**:经过验证的行业专家和KOL +- **多源交叉验证**:重要信息需多个来源确认 + +### 内容格式标准 +**每个文档包含头部信息**: +- Source: [具体网站/机构] +- Keywords: [使用的英文关键词] +- Search Time: [UTC时间] +- Region: [US/EU/Global/CN] +- Authority Level: [Official/Media/KOL/Other] + +## 中国特定搜索指引 + +### 启用条件 +仅当用户明确提及以下内容时启用中文搜索: +- **明确地区**:"中国"、"国内"、"内地" +- **特定市场**:"A股"、"人民币"、"央行" +- **本土概念**:"国潮"、"新零售"、"独角兽" + +### 中文信息源 +每个行业规则的第8节都包含: +- **官方机构**:相关部委和监管机构 +- **主要媒体**:权威中文媒体 +- **关键词**:对应的中文搜索关键词 + +## 系统优势 + +### 1. 简化高效 +- 去除复杂的多层判定逻辑 +- 直接基于关键词进行行业识别 +- 简化的搜索流程 + +### 2. 权威可靠 +- 优先使用官方权威机构信息 +- 整合头部媒体和KOL资源 +- 多源交叉验证机制 + +### 3. 全球视野 +- 默认英文搜索确保国际视野 +- 覆盖全球主要权威信息源 +- 及时获取最新国际动态 + +### 4. 灵活适应 +- 支持中国特定搜索需求 +- 根据用户明确指示调整策略 +- 保持信息源的时效性 + +## 注意事项 + +1. **默认英文**:除非明确要求,否则所有搜索都使用英文 +2. **权威优先**:优先使用官方机构和权威媒体信息 +3. **实时更新**:根据各行业特点保持相应更新频率 +4. **完整保存**:确保英文原文完整保存 +5. **合规使用**:遵守各网站的使用条款和版权规定 + +## 技术支持 + +遇到问题时: +1. 检查关键词是否触发正确的行业识别 +2. 确认是否需要启用中国特定搜索 +3. 
查看对应行业规则中的权威信息源列表 +4. 验证文件命名是否符合新标准 \ No newline at end of file diff --git a/制造业/制造业搜索规则.md b/制造业/制造业搜索规则.md new file mode 100644 index 0000000..8c9319b --- /dev/null +++ b/制造业/制造业搜索规则.md @@ -0,0 +1,172 @@ +# 制造业搜索规则 + +## 1. 核心原则 +- **语言优先**:默认英文搜索,制造业标准与技术以英文为主 +- **权威优先**:国际标准组织 > 政府机构 > 行业协会 > 专业媒体 + +## 2. 英文权威信息源库 + +### 2.1 国际标准化组织(最高优先级) +- **ISO**: https://www.iso.org (国际标准化组织) +- **IEC**: https://www.iec.ch (国际电工委员会) +- **IEEE**: https://www.ieee.org (电气电子工程师学会) +- **ANSI**: https://www.ansi.org (美国国家标准学会) +- **ASME**: https://www.asme.org (美国机械工程师学会) + +### 2.2 政府机构与监管部门 +- **美国**: + - NIST: https://www.nist.gov + - OSHA: https://www.osha.gov + - EPA: https://www.epa.gov + - Commerce Department: https://www.commerce.gov +- **欧盟**: + - European Commission Industry: https://ec.europa.eu/growth/sectors/industry + - European Medicines Agency: https://www.ema.europa.eu +- **英国**: + - UK Business & Industry: https://www.gov.uk/business-and-industry +- **国际组织**: + - UNIDO: https://www.unido.org + - OECD Industry: https://www.oecd.org/industry + +### 2.3 权威行业协会 +- **美国制造业协会**: https://www.nam.org +- **制造工程师协会**: https://www.sme.org +- **国际自动化学会**: https://www.isa.org +- **制造执行系统协会**: https://www.mesa.org +- **工业互联网联盟**: https://www.iiconsortium.org + +### 2.4 头部制造业媒体 +- **专业媒体**: + - Industry Week: https://www.industryweek.com + - Manufacturing.net: https://www.manufacturing.net + - Automation.com: https://www.automation.com +- **工程技术**: + - Engineering.com: https://www.engineering.com + - Machine Design: https://www.machinedesign.com + - Control Engineering: https://www.controleng.com +- **供应链**: + - Supply Chain Management Review: https://www.scmr.com + - Logistics Management: https://www.logisticsmgmt.com + - Supply Chain Brain: https://www.supplychainbrain.com + +## 3. 
头部自媒体与KOL + +### 3.1 行业专家Twitter账号 +- **Industry 4.0专家**:关注工业4.0转型专家 +- **自动化工程师**:知名自动化系统集成商 +- **供应链专家**:供应链管理咨询师 +- **质量管理专家**:精益六西格玛专家 + +### 3.2 制造业公司官方 +- **工业巨头**: + - GE: https://www.ge.com/news + - Siemens: https://press.siemens.com + - ABB: https://new.abb.com/news + - Schneider Electric: https://www.se.com/ww/en/about-us/newsroom +- **自动化厂商**: + - Rockwell Automation: https://www.rockwellautomation.com/en-us/company/news.html + - Fanuc: https://www.fanuc.com/us/en/news-resources.html + - KUKA: https://www.kuka.com/en-us/press + +### 3.3 专业社区 +- **Reddit社区**: + - Manufacturing: https://www.reddit.com/r/manufacturing + - Engineering: https://www.reddit.com/r/engineering + - Industrial Engineering: https://www.reddit.com/r/industrialengineering +- **专业论坛**: + - Eng-Tips: https://www.eng-tips.com + - Manufacturing.org Forums: https://www.manufacturing.org +- **LinkedIn群组**:Manufacturing Leadership Network + +## 4. 搜索策略与关键词 + +### 4.1 技术创新搜索 +**关键词模板**: +- "Industry 4.0 [technology] implementation" +- "smart manufacturing [solution] [year]" +- "industrial automation [sector] trends" +- "digital transformation manufacturing [region]" + +### 4.2 标准与合规搜索 +**关键词模板**: +- "ISO [standard number] [year] update" +- "manufacturing quality standards [industry]" +- "industrial safety regulations [country]" +- "environmental compliance manufacturing [region]" + +### 4.3 市场分析搜索 +**关键词模板**: +- "manufacturing market trends [year]" +- "supply chain disruption [sector] [period]" +- "production capacity [industry] [region]" +- "manufacturing competitiveness [country]" + +### 4.4 可持续制造搜索 +**关键词模板**: +- "sustainable manufacturing practices [industry]" +- "green production [technology] [year]" +- "circular economy manufacturing [sector]" +- "carbon footprint reduction [industry]" + +## 5. 
行业专门搜索网站 + +### 5.1 工业技术平台 +- **ThomasNet**: https://www.thomasnet.com (工业采购与技术) +- **GlobalSpec**: https://www.globalspec.com (工程技术资源) +- **Engineering360**: https://www.engineering360.com (工程技术信息) + +### 5.2 咨询公司制造业研究 +- **McKinsey Manufacturing**: https://www.mckinsey.com/industries/advanced-electronics +- **Deloitte Manufacturing**: https://www2.deloitte.com/us/en/pages/manufacturing/topics/manufacturing.html +- **BCG Manufacturing**: https://www.bcg.com/industries/manufacturing +- **PwC Manufacturing**: https://www.pwc.com/gx/en/industries/industrial-manufacturing.html + +### 5.3 研究机构 +- **MIT Technology Review Manufacturing**: https://www.technologyreview.com/topic/manufacturing +- **Fraunhofer Institute**: https://www.fraunhofer.de (德国工业研究) +- **NIST Manufacturing**: https://www.nist.gov/manufacturing + +## 6. 实时监控重点 + +### 6.1 高频监控(每日) +- ISO标准更新通知 +- 主要制造业公司新闻 +- 供应链中断事件 +- 工业安全事故报告 + +### 6.2 中频监控(每周) +- 制造业PMI指数 +- 新技术产品发布 +- 行业协会政策声明 +- 贸易政策变化 + +### 6.3 低频监控(每月) +- 行业研究报告发布 +- 标准制定进展 +- 长期技术趋势分析 + +## 7. 文件命名与归档 + +### 7.1 命名规则 +- **技术创新**:`YYYYMMDD_Manufacturing_Tech_[Technology].docx` +- **标准政策**:`YYYYMMDD_Manufacturing_Standard_[ISO/ANSI Number].docx` +- **市场分析**:`YYYYMMDD_Manufacturing_Market_[Sector/Region].docx` +- **可持续发展**:`YYYYMMDD_Manufacturing_Sustainability_[Topic].docx` + +### 7.2 内容格式 +**文档头部**: +- Source: [完整URL链接] +- Keywords: [Search terms] +- Search Time: [YYYY-MM-DD HH:MM UTC] +- Standard Reference: [if applicable] +- Industry Sector: [Automotive/Electronics/etc.] + +## 8. 中国特定搜索(仅当明确要求时) + +### 8.1 中国制造业机构 +- 工信部: http://www.miit.gov.cn +- 国家标准委: http://www.sac.gov.cn +- 中国制造业协会: http://www.cema.org.cn + +### 8.2 中文关键词 +- "制造业转型"、"智能制造"、"工业4.0"、"供应链" \ No newline at end of file diff --git a/医疗制药/医疗制药搜索规则.md b/医疗制药/医疗制药搜索规则.md new file mode 100644 index 0000000..2ac7d3d --- /dev/null +++ b/医疗制药/医疗制药搜索规则.md @@ -0,0 +1,186 @@ +# 医疗制药搜索规则 + +## 1. 核心原则 +- **语言优先**:默认英文搜索,医学科研以英文为国际标准 +- **权威优先**:政府监管机构 > 医学期刊 > 制药公司官方 > 专业媒体 + +## 2. 
英文权威信息源库 + +### 2.1 政府监管机构(最高优先级) +- **美国**: + - FDA: https://www.fda.gov + - CDC: https://www.cdc.gov + - NIH: https://www.nih.gov + - CMS: https://www.cms.gov +- **欧洲**: + - EMA: https://www.ema.europa.eu + - WHO Europe: https://www.who.int/europe + - ECDC: https://www.ecdc.europa.eu +- **英国**: + - MHRA: https://www.gov.uk/government/organisations/medicines-and-healthcare-products-regulatory-agency + - NHS: https://www.nhs.uk +- **国际组织**: + - WHO: https://www.who.int + - WIPO: https://www.wipo.int (医药专利) + +### 2.2 顶级医学期刊 +- **综合医学**: + - New England Journal of Medicine: https://www.nejm.org + - The Lancet: https://www.thelancet.com + - JAMA Network: https://jamanetwork.com + - BMJ: https://www.bmj.com +- **专科期刊**: + - Nature Medicine: https://www.nature.com/nm + - Science Medicine: https://www.science.org/medicine +- **药学期刊**: + - Pharmaceutical Journal: https://pharmaceutical-journal.com + - Drug Topics: https://www.drugtopics.com + +### 2.3 学术与研究平台 +- **PubMed**: https://pubmed.ncbi.nlm.nih.gov +- **Cochrane Library**: https://www.cochranelibrary.com +- **ClinicalTrials.gov**: https://www.clinicaltrials.gov +- **PLoS Medicine**: https://journals.plos.org/plosmedicine + +### 2.4 头部制药公司官方 +- **Big Pharma**: + - Pfizer: https://www.pfizer.com + - Johnson & Johnson: https://www.jnj.com + - Roche: https://www.roche.com + - Novartis: https://www.novartis.com +- **生物技术**: + - Genentech: https://www.gene.com + - Amgen: https://www.amgen.com + - Gilead: https://www.gilead.com + - Biogen: https://www.biogen.com +- **疫苗公司**: + - Moderna: https://www.modernatx.com + - BioNTech: https://biontech.de + +## 3. 
专业医疗媒体与KOL + +### 3.1 权威医疗媒体 +- **专业媒体**: + - Medscape: https://www.medscape.com + - WebMD: https://www.webmd.com + - Healthline: https://www.healthline.com +- **行业媒体**: + - FiercePharma: https://www.fiercepharma.com + - BioPharma Dive: https://www.biopharmadive.com + - STAT News: https://www.statnews.com +- **投资分析**: + - Evaluate: https://www.evaluate.com + - GlobalData Healthcare: https://www.globaldata.com/healthcare + +### 3.2 医学专家与KOL Twitter +- **FDA官员**: + - @SteveFDA: https://twitter.com/SteveFDA + - @DrWoodcockFDA: https://twitter.com/DrWoodcockFDA +- **知名医生**: + - @drericding: https://twitter.com/drericding + - @celinegounder: https://twitter.com/celinegounder +- **研究专家**:关注各领域顶级研究者 +- **制药高管**:主要制药公司CEO/CSO + +### 3.3 专业论坛与社区 +- **Reddit医学**: + - Medicine: https://www.reddit.com/r/medicine + - Pharmacy: https://www.reddit.com/r/pharmacy + - Biotech: https://www.reddit.com/r/biotech +- **专业论坛**: + - Student Doctor Network: https://forums.studentdoctor.net + - AllNurses: https://allnurses.com +- **LinkedIn群组**:Pharmaceutical Industry Professionals + +## 4. 搜索策略与关键词 + +### 4.1 药物研发搜索 +**关键词模板**: +- "[drug name] clinical trial [phase] [year]" +- "FDA approval [therapeutic area] [date]" +- "[disease] treatment breakthrough [year]" +- "biomarker [condition] research [period]" + +### 4.2 监管政策搜索 +**关键词模板**: +- "FDA guidance [therapeutic area] [year]" +- "drug regulation [country] [topic]" +- "clinical trial regulations [region]" +- "pharmaceutical compliance [area]" + +### 4.3 市场分析搜索 +**关键词模板**: +- "pharmaceutical market [therapeutic area] [year]" +- "drug pricing [country] [policy]" +- "biosimilar competition [drug class]" +- "orphan drug [indication] [market]" + +### 4.4 医疗技术搜索 +**关键词模板**: +- "digital health [technology] [application]" +- "AI medical diagnosis [specialty]" +- "telemedicine adoption [region] [year]" +- "medical device innovation [area]" + +## 5. 
行业专门搜索网站 + +### 5.1 药物开发数据库 +- **DrugBank**: https://go.drugbank.com +- **Drugs@FDA**: https://www.accessdata.fda.gov/scripts/cder/daf +- **EMA Database**: https://www.ema.europa.eu/en/medicines +- **PharmaProjects**: https://pharmaprojects.com + +### 5.2 临床试验平台 +- **ClinicalTrials.gov**: https://www.clinicaltrials.gov +- **WHO ICTRP**: https://trialsearch.who.int +- **EudraCT**: https://eudract.ema.europa.eu + +### 5.3 医疗投资与分析 +- **BioCentury**: https://www.biocentury.com +- **Nature Biotechnology**: https://www.nature.com/nbt +- **Genetic Engineering News**: https://www.genengnews.com + +## 6. 实时监控重点 + +### 6.1 高频监控(每日) +- FDA批准公告 +- 临床试验结果发布 +- 药物安全警告 +- 主要制药公司新闻 + +### 6.2 中频监控(每周) +- 新药申请状态 +- 监管指导文件更新 +- 医学会议摘要 +- 投资并购动态 + +### 6.3 低频监控(每月) +- 行业趋势报告 +- 长期临床试验进展 +- 政策法规修订 + +## 7. 文件命名与归档 + +### 7.1 命名规则 +- **药物研发**:`YYYYMMDD_Pharma_Drug_[DrugName/Indication].docx` +- **监管政策**:`YYYYMMDD_Pharma_Regulation_[Topic/Agency].docx` +- **医疗技术**:`YYYYMMDD_Pharma_Tech_[Technology/Device].docx` +- **市场分析**:`YYYYMMDD_Pharma_Market_[TherapeuticArea].docx` + +### 7.2 内容格式 +**文档头部**: +- Source: [完整URL链接] +- Keywords: [Medical terms used] +- Search Time: [YYYY-MM-DD HH:MM UTC] +- Therapeutic Area: [Oncology/Cardiology/etc.] +- Development Stage: [Preclinical/Phase I-III/Approved] + +## 8. 中国特定搜索(仅当明确要求时) + +### 8.1 中国医药监管 +- NMPA(药监局): https://www.nmpa.gov.cn +- 卫健委: http://www.nhc.gov.cn +- 中检院: http://nifdc.org.cn + +### 8.2 中文关键词 +- "药物研发"、"临床试验"、"药品审批"、"医疗器械" \ No newline at end of file diff --git a/快消品/快消品搜索规则.md b/快消品/快消品搜索规则.md new file mode 100644 index 0000000..b8064ac --- /dev/null +++ b/快消品/快消品搜索规则.md @@ -0,0 +1,165 @@ +# 快消品搜索规则 + +## 1. 核心原则 +- **语言优先**:默认英文搜索,全球品牌与市场趋势以英文为主 +- **权威优先**:市场研究机构 > 品牌官方 > 行业媒体 > 消费者社区 + +## 2. 
英文权威信息源库 + +### 2.1 权威市场研究机构(最高优先级) +- **Nielsen**: https://www.nielsen.com (尼尔森市场研究) +- **Euromonitor**: https://www.euromonitor.com (欧睿国际) +- **Kantar**: https://www.kantar.com (凯度市场洞察) +- **Mintel**: https://www.mintel.com (英敏特市场研究) +- **McKinsey CPG**: https://www.mckinsey.com/industries/consumer-packaged-goods + +### 2.2 全球消费品巨头官方 +- **P&G**: + - Corporate: https://www.pg.com + - Investor Relations: https://www.pginvestor.com +- **Unilever**: + - Corporate: https://www.unilever.com + - Investor Centre: https://www.unilever.com/investor-centre +- **Nestlé**: + - Corporate: https://www.nestle.com + - Investors: https://www.nestle.com/investors +- **Coca-Cola**: + - Corporate: https://www.coca-colacompany.com + - Investors: https://investors.coca-colacompany.com +- **PepsiCo**: + - Corporate: https://www.pepsico.com + - Investors: https://www.pepsico.com/investors + +### 2.3 头部行业媒体 +- **消费品专业媒体**: + - Consumer Goods: https://www.consumergoods.com + - CPG Matters: https://www.cpgmatters.com + - Brand Channel: https://www.brandchannel.com +- **零售媒体**: + - Retail Dive: https://www.retaildive.com + - Progressive Grocer: https://progressivegrocer.com + - Supermarket News: https://www.supermarketnews.com +- **营销媒体**: + - Ad Age: https://adage.com + - Marketing Land: https://marketingland.com + - Campaign Live: https://www.campaignlive.com + +### 2.4 消费趋势与洞察平台 +- **WGSN**: https://www.wgsn.com (全球时尚趋势预测) +- **Trendwatching**: https://trendwatching.com (消费趋势) +- **PSFK**: https://www.psfk.com (创新趋势研究) +- **Future Laboratory**: https://www.thefuturelaboratory.com + +## 3. 
头部自媒体与KOL + +### 3.1 营销与品牌专家Twitter +- **Gary Vaynerchuk**: https://twitter.com/garyvee +- **Scott Galloway**: https://twitter.com/profgalloway +- **Brian Solis**: https://twitter.com/briansolis +- **Neil Patel**: https://twitter.com/neilpatel +- **Rand Fishkin**: https://twitter.com/randfish + +### 3.2 品牌官方社交媒体 +- **全球品牌**:主要消费品牌官方Twitter/LinkedIn +- **创新品牌**:新兴DTC品牌官方账号 +- **零售商**:大型零售商官方资讯 + +### 3.3 专业社区与论坛 +- **Reddit社区**: + - Consumer Goods: https://www.reddit.com/r/consumergoods + - Marketing: https://www.reddit.com/r/marketing + - Advertising: https://www.reddit.com/r/advertising +- **LinkedIn群组**:Consumer Goods Professionals, FMCG Professionals +- **专业论坛**:Brand Management Forums + +## 4. 搜索策略与关键词 + +### 4.1 品牌与营销搜索 +**关键词模板**: +- "brand strategy [category] [year]" +- "consumer behavior [demographics] [trend]" +- "digital marketing [platform] [industry]" +- "brand positioning [competitor] analysis" + +### 4.2 市场趋势搜索 +**关键词模板**: +- "consumer trends [year] [region]" +- "FMCG market growth [category] [period]" +- "retail innovation [technology] [application]" +- "sustainable packaging [industry] [development]" + +### 4.3 产品创新搜索 +**关键词模板**: +- "product innovation [category] [year]" +- "new product launch [brand] [market]" +- "health wellness trends [food/beverage]" +- "beauty innovation [technology] [ingredient]" + +### 4.4 零售与渠道搜索 +**关键词模板**: +- "retail channel [strategy] [brand]" +- "e-commerce growth [category] [region]" +- "omnichannel marketing [retailer]" +- "direct-to-consumer [brand] [success]" + +## 5. 
行业专门搜索网站 + +### 5.1 市场数据与报告 +- **Statista**: https://www.statista.com (统计数据) +- **IBISWorld**: https://www.ibisworld.com (行业报告) +- **Research and Markets**: https://www.researchandmarkets.com +- **Grand View Research**: https://www.grandviewresearch.com + +### 5.2 包装与设计 +- **Packaging World**: https://www.packworld.com +- **The Dieline**: https://thedieline.com (包装设计) +- **Core77**: https://www.core77.com (工业设计) + +### 5.3 可持续发展 +- **Sustainable Brands**: https://sustainablebrands.com +- **GreenBiz**: https://www.greenbiz.com +- **Circular Design Network**: https://www.circulardesignnetwork.com + +## 6. 实时监控重点 + +### 6.1 高频监控(每日) +- 主要品牌新产品发布 +- 消费者社交媒体讨论 +- 零售商促销活动 +- 竞争对手营销动态 + +### 6.2 中频监控(每周) +- 市场份额变化 +- 新兴品牌崛起 +- 包装创新案例 +- 可持续发展倡议 + +### 6.3 低频监控(每月) +- 行业趋势报告 +- 消费者调研结果 +- 长期品牌战略调整 + +## 7. 文件命名与归档 + +### 7.1 命名规则 +- **品牌营销**:`YYYYMMDD_FMCG_Brand_[Brand/Campaign].docx` +- **市场趋势**:`YYYYMMDD_FMCG_Trend_[Category/Region].docx` +- **产品创新**:`YYYYMMDD_FMCG_Innovation_[Product/Technology].docx` +- **零售渠道**:`YYYYMMDD_FMCG_Retail_[Channel/Strategy].docx` + +### 7.2 内容格式 +**文档头部**: +- Source: [完整URL链接] +- Keywords: [Search terms] +- Search Time: [YYYY-MM-DD HH:MM UTC] +- Category: [Food/Beverage/Beauty/etc.] +- Geographic Focus: [Global/Regional/Country] + +## 8. 中国特定搜索(仅当明确要求时) + +### 8.1 中国消费品机构 +- 中国连锁经营协会: http://www.ccfa.org.cn +- 中国商业联合会: http://www.ccom.org.cn + +### 8.2 中文关键词 +- "快消品牌"、"消费升级"、"新零售"、"国潮品牌" \ No newline at end of file diff --git a/房地产建筑/房地产建筑搜索规则.md b/房地产建筑/房地产建筑搜索规则.md new file mode 100644 index 0000000..defa7ab --- /dev/null +++ b/房地产建筑/房地产建筑搜索规则.md @@ -0,0 +1,167 @@ +# 房地产建筑搜索规则 + +## 1. 核心原则 +- **语言优先**:默认英文搜索,全球房地产市场与建筑技术以英文为主 +- **权威优先**:政府部门 > 行业协会 > 专业机构 > 房地产媒体 + +## 2. 
英文权威信息源库 + +### 2.1 政府与监管机构(最高优先级) +- **美国**: + - HUD: https://www.hud.gov + - Census Bureau Construction: https://www.census.gov/construction + - Freddie Mac: https://www.freddiemac.com + - Fannie Mae: https://www.fanniemae.com +- **英国**: + - UK Housing & Communities: https://www.gov.uk/housing-local-and-community + - ONS: https://www.ons.gov.uk +- **欧盟**: + - EC Construction: https://ec.europa.eu/growth/sectors/construction + - Eurostat: https://ec.europa.eu/eurostat +- **国际组织**: + - UN-Habitat: https://unhabitat.org + - World Bank Housing: https://www.worldbank.org/en/topic/housing + +### 2.2 权威研究机构 +- **CBRE**: https://www.cbre.com (世邦魏理仕) +- **JLL**: https://www.jll.com (仲量联行) +- **Cushman & Wakefield**: https://www.cushmanwakefield.com +- **Savills**: https://www.savills.com +- **Knight Frank**: https://www.knightfrank.com + +### 2.3 行业协会与组织 +- **NAHB**: https://www.nahb.org (美国住宅建筑商协会) +- **ULI**: https://uli.org (城市土地学会) +- **RICS**: https://www.rics.org (皇家特许测量师学会) +- **USGBC**: https://www.usgbc.org (美国绿色建筑委员会) +- **World Green Building Council**: https://www.worldgbc.org + +### 2.4 专业房地产媒体 +- **Commercial Property Executive**: https://www.cpexecutive.com +- **Multi-Housing News**: https://www.multihousingnews.com +- **Construction Dive**: https://www.constructiondive.com +- **Building Design + Construction**: https://www.bdcnetwork.com + +## 3. 
头部自媒体与KOL + +### 3.1 房地产专家Twitter +- **Calculated Risk**: https://twitter.com/calculatedrisk +- **Conor Sen**: https://twitter.com/conorsen +- **Ivan the Terrible**: https://twitter.com/IvantheTerrible +- **ArchDaily**: https://twitter.com/archdaily +- **Dezeen**: https://twitter.com/dezeen + +### 3.2 房地产公司官方 +- **开发商**: + - Brookfield: https://www.brookfield.com + - Blackstone Real Estate: https://www.blackstone.com/our-businesses/real-estate + - Prologis: https://www.prologis.com +- **REITs**:主要房地产投资信托公司 +- **PropTech**: + - WeWork: https://www.wework.com + - Compass: https://www.compass.com + - Opendoor: https://www.opendoor.com + +### 3.3 专业社区 +- **LinkedIn群组**:Real Estate Professionals, Commercial Real Estate +- **Reddit社区**: + - Real Estate: https://www.reddit.com/r/realestate + - Construction: https://www.reddit.com/r/construction + - Architecture: https://www.reddit.com/r/architecture +- **专业论坛**: + - BiggerPockets: https://www.biggerpockets.com + - LoopNet: https://www.loopnet.com + +## 4. 
搜索策略与关键词 + +### 4.1 市场分析搜索 +**关键词模板**: +- "real estate market [city/region] [year] [outlook]" +- "housing prices [trends] [forecast] [analysis]" +- "commercial property [sector] [performance] [investment]" +- "construction industry [growth] [statistics] [regional]" + +### 4.2 技术创新搜索 +**关键词模板**: +- "proptech [innovation] [technology] [adoption]" +- "smart building [technology] [IoT] [automation]" +- "construction technology [BIM] [modular] [robotics]" +- "sustainable building [green] [certification] [LEED]" + +### 4.3 政策法规搜索 +**关键词模板**: +- "housing policy [government] [regulation] [country]" +- "zoning laws [urban planning] [development] [reform]" +- "building codes [safety] [standards] [update]" +- "real estate taxation [policy] [impact] [reform]" + +### 4.4 投资与金融搜索 +**关键词模板**: +- "real estate investment [REIT] [returns] [strategy]" +- "property valuation [methodology] [market] [trends]" +- "construction financing [lending] [rates] [availability]" +- "institutional investment [real estate] [allocation]" + +## 5. 行业专门搜索网站 + +### 5.1 房地产数据平台 +- **CoStar**: https://www.costar.com (商业房地产数据) +- **LoopNet**: https://www.loopnet.com (商业房地产列表) +- **Zillow**: https://www.zillow.com (住宅房地产) +- **Realtor.com**: https://www.realtor.com (房地产列表) + +### 5.2 建筑与设计 +- **ArchDaily**: https://www.archdaily.com (建筑设计) +- **Dezeen**: https://www.dezeen.com (设计杂志) +- **Building Design + Construction**: https://www.bdcnetwork.com +- **ENR**: https://www.enr.com (工程新闻记录) + +### 5.3 PropTech与创新 +- **PropTech Insider**: https://www.proptechinsider.com +- **RE Tech Advisors**: https://www.retechadvisors.com +- **Unissu**: https://www.unissu.com (房地产科技) + +## 6. 实时监控重点 + +### 6.1 高频监控(每日) +- 房价指数变化 +- 建筑许可数据 +- 利率政策影响 +- 重大项目动态 + +### 6.2 中频监控(每周) +- 市场成交数据 +- 新技术应用 +- 政策法规变化 +- 投资交易活动 + +### 6.3 低频监控(每月) +- 行业趋势报告 +- 城市规划更新 +- 长期发展策略 + +## 7. 
文件命名与归档 + +### 7.1 命名规则 +- **市场分析**:`YYYYMMDD_RealEstate_Market_[City/Sector].docx` +- **技术创新**:`YYYYMMDD_RealEstate_Tech_[Technology/Innovation].docx` +- **政策法规**:`YYYYMMDD_RealEstate_Policy_[Topic/Region].docx` +- **投资分析**:`YYYYMMDD_RealEstate_Investment_[Strategy/Sector].docx` + +### 7.2 内容格式 +**文档头部**: +- Source: [完整URL链接] +- Keywords: [Real estate terms] +- Search Time: [YYYY-MM-DD HH:MM UTC] +- Property Type: [Residential/Commercial/Industrial] +- Geographic Focus: [City/State/Country/Global] + +## 8. 中国特定搜索(仅当明确要求时) + +### 8.1 中国房地产机构 +- 住建部: http://www.mohurd.gov.cn +- 自然资源部: http://www.mnr.gov.cn +- 中国房地产业协会: http://www.cirea.org.cn + +### 8.2 中文关键词 +- "房地产市场"、"城市规划"、"绿色建筑"、"智慧城市" \ No newline at end of file diff --git a/技术实施方案_简单实用版.md b/技术实施方案_简单实用版.md new file mode 100644 index 0000000..feb95ce --- /dev/null +++ b/技术实施方案_简单实用版.md @@ -0,0 +1,211 @@ +# 搜索系统技术实施方案 - 简单实用版 + +## 总体架构 + +``` +用户输入 → 行业分类 → 信息源选择 → API/RSS获取 → 结果整理 → 文档归档 +``` + +## 核心技术栈 + +### 1. RSS订阅源配置 + +#### 金融行业 +```yaml +官方机构: + - Federal Reserve: https://www.federalreserve.gov/feeds/press_all.xml + - SEC: https://www.sec.gov/rss/news/press-release.xml + - ECB: https://www.ecb.europa.eu/rss/news.xml + +主流媒体: + - Bloomberg: https://feeds.bloomberg.com/markets/news.rss + - Reuters Finance: https://feeds.reuters.com/reuters/businessNews + - Financial Times: https://www.ft.com/rss/home + - Wall Street Journal: https://feeds.a.dj.com/rss/RSSMarketsMain.xml +``` + +#### AI与软件 +```yaml +技术源: + - arXiv CS: http://rss.arxiv.org/rss/cs + - Google AI Blog: https://ai.googleblog.com/feeds/posts/default + - OpenAI Blog: https://openai.com/blog/rss.xml + - MIT Technology Review: https://www.technologyreview.com/feed/ + +行业媒体: + - TechCrunch: https://techcrunch.com/feed/ + - Ars Technica: http://feeds.arstechnica.com/arstechnica/index + - The Verge: https://www.theverge.com/rss/index.xml +``` + +#### 制造业 +```yaml +行业组织: + - Industry Week: https://www.industryweek.com/rss.xml + - Manufacturing.net: 
https://www.manufacturing.net/rss.xml + - Plant Engineering: https://www.plantengineering.com/rss.xml + +技术标准: + - ISO News: https://www.iso.org/rss/news.xml + - IEEE Spectrum: https://spectrum.ieee.org/rss/fulltext +``` + +#### 医疗制药 +```yaml +官方机构: + - FDA: https://www.fda.gov/about-fda/contact-fda/stay-informed/rss-feeds + - NIH: https://www.nih.gov/news-events/rss + - WHO: https://www.who.int/rss-feeds + +专业媒体: + - BioPharma Dive: https://www.biopharmadive.com/feeds/news/ + - STAT News: https://www.statnews.com/feed/ + - Nature Medicine: https://feeds.nature.com/nm/rss/current +``` + +### 2. API接入配置 + +#### 核心API服务 +```python +# 新闻API +NewsAPI_KEY = "your_newsapi_key" +BASE_URL = "https://newsapi.org/v2/" + +# 社交媒体API +TWITTER_BEARER_TOKEN = "your_twitter_token" +TWITTER_API_V2 = "https://api.twitter.com/2/" + +# 金融数据API +ALPHA_VANTAGE_KEY = "your_alphavantage_key" +AV_BASE_URL = "https://www.alphavantage.co/query" +``` + +#### API调用示例 +```python +import requests +import feedparser +from datetime import datetime + +class SimpleSearchEngine: + def __init__(self): + self.news_api_key = "YOUR_KEY" + self.rss_sources = { + "finance": [ + "https://feeds.bloomberg.com/markets/news.rss", + "https://feeds.reuters.com/reuters/businessNews" + ], + "ai_software": [ + "https://ai.googleblog.com/feeds/posts/default", + "https://techcrunch.com/feed/" + ] + } + + def search_by_industry(self, keywords, industry, language="en"): + results = [] + + # RSS搜索 + for rss_url in self.rss_sources.get(industry, []): + feed = feedparser.parse(rss_url) + for entry in feed.entries: + if any(keyword.lower() in entry.title.lower() for keyword in keywords): + results.append({ + 'title': entry.title, + 'link': entry.link, + 'published': entry.published, + 'source': rss_url + }) + + # NewsAPI搜索 + if language == "en": + news_results = self.search_newsapi(keywords, industry) + results.extend(news_results) + + return results + + def search_newsapi(self, keywords, industry): + # NewsAPI实现 + pass 
+``` + +### 3. 分行业信息源清单 + +#### 快消品 (FMCG) +```yaml +RSS源: + - Nielsen: https://www.nielsen.com/insights/rss/ + - Euromonitor: https://www.euromonitor.com/rss + - Advertising Age: https://adage.com/rss.xml + - Beverage Industry: https://www.bevindustry.com/rss.xml +``` + +#### 零售电商 +```yaml +RSS源: + - Retail Dive: https://www.retaildive.com/feeds/news/ + - eMarketer: https://www.emarketer.com/rss/ + - Internet Retailer: https://www.digitalcommerce360.com/feed/ + - Shopify Blog: https://www.shopify.com/blog.rss +``` + +#### 能源化工 +```yaml +RSS源: + - IEA: https://www.iea.org/rss/news + - Energy.gov: https://www.energy.gov/rss/news.xml + - Chemical & Engineering News: https://cen.acs.org/rss.xml + - Oil & Gas Journal: https://www.ogj.com/rss.xml +``` + +#### 房地产建筑 +```yaml +RSS源: + - HUD: https://www.hud.gov/rss/HUDNo.xml + - Construction Dive: https://www.constructiondive.com/feeds/news/ + - Commercial Property Executive: https://www.cpexecutive.com/rss.xml + - Engineering News-Record: https://www.enr.com/rss/all +``` + +## 实施步骤 + +### 第一阶段:基础搭建 (1周) +1. 设置RSS订阅监控 +2. 申请NewsAPI账号 +3. 配置基础搜索框架 +4. 测试主要信息源 + +### 第二阶段:功能完善 (1周) +1. 添加关键词过滤 +2. 实现结果排序 +3. 配置自动归档 +4. 添加中英文切换 + +### 第三阶段:优化调试 (1周) +1. 调优搜索算法 +2. 完善文档格式 +3. 添加错误处理 +4. 性能优化 + +## 成本预估 + +### 免费资源 +- RSS订阅:完全免费 +- Twitter API:基础版免费 +- 政府官网:免费 + +### 付费服务 (可选) +- NewsAPI:$499/月 (10万次请求) +- Alpha Vantage:$49/月 (金融数据) + +## 预期效果 + +### 覆盖范围 +- **信息源数量**:每个行业30-50个权威源 +- **更新频率**:实时到1小时内 +- **语言覆盖**:英文为主,中文源按需添加 + +### 质量保证 +- **权威性**:官方机构 > 主流媒体 > 专业平台 +- **实时性**:RSS实时订阅 + API补充 +- **完整性**:多源交叉验证 + +要我开始实施某个具体行业的配置吗?我可以先从您最关注的行业开始进行详细配置。 \ No newline at end of file diff --git a/搜索总规则.md b/搜索总规则.md new file mode 100644 index 0000000..403e3b8 --- /dev/null +++ b/搜索总规则.md @@ -0,0 +1,65 @@ +# 搜索总规则 - 智能行业判定系统 + +## 1. 核心原则 +- **语言优先级**:默认使用英文搜索,除非用户明确指定中国新闻 +- **信息源优先级**:官方权威机构 > 头部媒体/自媒体 > 专业媒体 > 其他来源 +- **行业自动识别**:通过关键词自动判定行业并调用相应搜索规则 + +## 2. 
行业判定关键词 + +### 2.1 直接行业关键词 +- **金融行业**:finance, banking, securities, investment, fintech, cryptocurrency, trading, markets +- **制造业**:manufacturing, industry 4.0, automation, supply chain, production, factory, IoT +- **AI与软件**:AI, machine learning, software, programming, algorithm, tech, startup, coding +- **医疗制药**:healthcare, pharma, medicine, clinical trial, FDA, biotech, medical device +- **快消品**:consumer goods, FMCG, brand, retail, marketing, packaging, CPG +- **零售电商**:e-commerce, retail, online shopping, logistics, marketplace, digital commerce +- **能源化工**:energy, oil, gas, renewable, chemical, petroleum, utilities, sustainability +- **房地产建筑**:real estate, construction, property, housing, urban planning, architecture + +### 2.2 特殊标识 +- **中国特定**:当用户明确提及"中国"、"国内"、"A股"等,启用中文搜索 +- **地区特定**:当提及特定国家/地区时,优先搜索该地区权威源 + +## 3. 全球权威信息源库 + +### 3.1 官方权威机构(各行业通用) +- **美国政府**:gov, sec.gov, federalreserve.gov, fda.gov, treasury.gov +- **国际组织**:imf.org, worldbank.org, who.int, bis.org, oecd.org +- **监管机构**:按行业调用相应监管机构网站 + +### 3.2 头部国际媒体 +- **综合新闻**:reuters.com, bloomberg.com, ap.org, bbc.com, cnn.com +- **财经媒体**:ft.com, wsj.com, economist.com, cnbc.com, marketwatch.com +- **科技媒体**:techcrunch.com, wired.com, arstechnica.com, theverge.com + +### 3.3 头部自媒体/意见领袖(按行业分类) +- **金融**:零对冲(zerohedge.com)、知名金融博主Twitter账号 +- **科技**:Hacker News、知名技术博主、GitHub trending +- **其他行业**:按各行业规则中的KOL列表 + +## 4. 搜索执行流程 + +### 4.1 简化搜索流程 +``` +用户输入 → 关键词提取 → 行业判定 → 语言判定(默认英文) → 调用行业规则 → 执行搜索 → 归档 +``` + +### 4.2 文件命名标准 +- **格式**:`YYYYMMDD_[行业]_[模块]_[主题].docx` +- **语言标注**:英文搜索无需特殊标注,中文搜索添加"_CN"后缀 + +## 5. 行业规则调用 +- **金融行业** → `金融行业/金融搜索规则.md` +- **制造业** → `制造业/制造业搜索规则.md` +- **AI与软件** → `AI与软件/AI软件搜索规则.md` +- **医疗制药** → `医疗制药/医疗制药搜索规则.md` +- **快消品** → `快消品/快消品搜索规则.md` +- **零售电商** → `零售电商/零售电商搜索规则.md` +- **能源化工** → `能源化工/能源化工搜索规则.md` +- **房地产建筑** → `房地产建筑/房地产建筑搜索规则.md` + +## 6. 
质量控制 +- **信息源验证**:优先使用权威官方源 +- **时效性**:确保信息新鲜度 +- **多源交叉验证**:重要信息需多源确认 \ No newline at end of file diff --git a/新闻/20250715_GoogleIO2025大会全览.docx b/新闻/20250715_GoogleIO2025大会全览.docx new file mode 100644 index 0000000..947c3ea --- /dev/null +++ b/新闻/20250715_GoogleIO2025大会全览.docx @@ -0,0 +1,233 @@ +Google I/O 2025 Conference Overview + +来源网址:https://io.google/2025/about +搜索关键词:Google I/O 2025, Google IO 2025 conference +搜索时间:2025-07-15 + +--- + +【English Original】 + +Google I/O 2025 – Full Conference Overview + +Date & Location: +May 20, 2025, Online and at Shoreline Amphitheatre, Mountain View, California. + +Official Website: +https://io.google/2025/about + +Conference Theme: +Google I/O 2025 focused on the rapid advancement and integration of artificial intelligence (AI) across Google’s products and services, with a strong emphasis on making AI more helpful, personal, and accessible for everyone. + +Key Announcements & Highlights: +1. Gemini 2.5 AI Models +2. AI Mode in Google Search +3. Google Beam (formerly Project Starline) +4. Gemini App Updates +5. Generative Media Tools +6. Developer Tools & Ecosystem +7. Personalization & Privacy +8. AI for Good & Societal Impact +9. Infrastructure & Performance +10. Vision for the Future + +(详细内容见下文) + +References & Further Reading: +- https://io.google/2025/about +- https://blog.google/technology/ai/io-2025-keynote/ +- https://www.youtube.com/watch?v=eIUqw3_YcCI +- https://www.youtube.com/watch?v=LxvErFkBXPk + +--- + +【English Full Original – Google I/O 2025: From research to reality】 + +(以下为官方博客及演讲摘要原文,完整收录,详见:https://blog.google/technology/ai/io-2025-keynote/) + +Here’s how we’re making AI more helpful with Gemini. 
+ +Sundar Pichai CEO of Google and Alphabet + +In this story: +- Google Beam +- Project Astra +- Project Mariner +- Personalization +- AI Mode +- Gemini 2.5 +- Gemini app +- Generative media + +Editor’s note: Below is an edited transcript of Google CEO Sundar Pichai’s remarks at Google I/O 2025, adapted to include more of what was announced on stage. See all the announcements in our collection. + +Normally, you wouldn’t have heard much from us in the weeks leading up to I/O, because we’d be saving up our best models for the stage. But in our Gemini era, we’re just as likely to ship our most intelligent model on a Tuesday in March, or announce a really cool breakthrough like AlphaEvolve a week before. + +We want to get our best models into your hands and our products ASAP. And so we’re shipping faster than ever. + +(Relentless model progress...) + +I’m particularly excited about the rapid model progress. Elo scores, a measure of progress, are up more than 300 points since our first-generation Gemini Pro model. Today, Gemini 2.5 Pro sweeps the LMArena leaderboard in all categories. + +Model progress is enabled by our world-leading infrastructure. Our seventh-generation TPU, Ironwood, is the first designed specifically to power thinking and inferential AI workloads at scale. It delivers 10 times the performance over the previous generation, and packs an incredible 42.5 exaflops compute per pod — just amazing. + +Our infrastructure strength, down to the TPU, is what helps us deliver dramatically faster models, even as model prices are coming down significantly. Over and over, we've been able to deliver the best models at the most effective price point. Not only is Google leading the Pareto Frontier, we’ve fundamentally shifted the frontier itself. + +(The world is adopting AI...) + +More intelligence is available, for everyone, everywhere. And the world is responding, adopting AI faster than ever before. 
Some important markers of progress: +- This time last year, we were processing 9.7 trillion tokens a month across our products and APIs. Now, we’re processing over 480 trillion — that’s 50 times more. +- Over 7 million developers are building with Gemini, five times more than this time last year, and Gemini usage on Vertex AI is up 40 times. +- The Gemini app now has over 400 million monthly active users. We are seeing strong growth and engagement particularly with the 2.5 series of models. For those using 2.5 Pro in the Gemini app, usage has gone up 45%. + +(From research to reality...) + +What all this progress means is that we’re in a new phase of the AI platform shift. Where decades of research are now becoming reality for people, businesses and communities all over the world. + +(Project Starline → Google Beam + speech translation...) + +We debuted Project Starline, our breakthrough 3D video technology, at I/O a few years back. The goal was to create a feeling of being in the same room as someone, even if you were far apart. + +We’ve continued to make technical advances. Today we’re ready to introduce the next chapter: Google Beam, a new AI-first video communications platform. Beam uses a new state-of-the-art video model to transform 2D video streams into a realistic 3D experience, using an array of six cameras and AI to merge video streams together and render you on a 3D lightfield display. It has near perfect head tracking, down to the millimeter, and at 60 frames per second, all in real-time. The result is a much more natural and deeply immersive conversational experience. In collaboration with HP, the first Google Beam devices will be available for early customers later this year. + +Over the years, we’ve also been creating much more immersive experiences in Google Meet. That includes technology that’s helping people break down language barriers with speech translation, coming to Google Meet. 
In near real time, it can match the speaker’s voice and tone, and even their expressions — bringing us closer to natural and free-flowing conversation across languages. Translation in English and Spanish is rolling out to Google AI Pro and Ultra subscribers in beta, with more languages coming in the next few weeks. This will come to Workspace business customers for early testing this year. + +(Project Astra → Gemini Live...) + +Another exciting research project first seen at I/O was Project Astra, which explores the future capabilities of a universal AI assistant capable of understanding the world around you. Gemini Live now incorporates Project Astra's camera and screen-sharing capabilities. People are using it in interesting ways, from interview preparation to marathon training. This feature is already available to all Android users and rolling out to iOS users starting today. + +We’re also bringing capabilities like these to products like Search. + +(Project Mariner → Agent Mode...) + +We think of agents as systems that combine the intelligence of advanced AI models with access to tools, so they can take actions on your behalf and under your control. + +Our early research prototype, Project Mariner, is an early step forward in agents with computer-use capabilities to interact with the web and get stuff done for you. We released it as an early research prototype in December, and we’ve made a lot of progress since with new multitasking capabilities — and a method called “teach and repeat,” where you can show it a task once and it learns plans for similar tasks in the future. We're bringing Project Mariner’s computer use capabilities to developers via the Gemini API. Trusted testers like Automation Anywhere and UiPath are already starting to build with it, and it will be available more broadly this summer. + +Computer use is part of a broader set of tools we’ll need to build for an agent ecosystem to flourish. 
+ +Like our open Agent2Agent Protocol, so that agents can talk to each other, or the Model Context Protocol introduced by Anthropic, so agents can access other services. And today, we're excited to announce that our Gemini API and SDK are now compatible with MCP tools. + +We’re also starting to bring agentic capabilities to Chrome, Search and in the Gemini app. For example, a new Agent Mode in the Gemini app will help you get even more done. If you’re apartment hunting, it will help find listings that match your criteria on websites like Zillow, adjust filters and use MCP to access the listings and even schedule a tour for you. An experimental version of Agent Mode in the Gemini app will be coming soon to subscribers. And it’s great for companies like Zillow, bringing in new customers and improving conversion rates. + +This is a new and emerging area, and we’re excited to explore how best to bring the benefits of agents to users and the ecosystem more broadly. + +(The power of personalization...) + +The best way we can bring research into reality is to make it really useful — in your own reality. That’s where personalization will be really powerful. We are working to bring this to life with something we call personal context. With your permission, Gemini models can use relevant personal context across your Google apps in a way that is private, transparent and fully under your control. + +One example of this is our new personalized Smart Replies in Gmail. If your friend emails you for advice about a road trip that you’ve done in the past, Gemini can do the work of searching your past emails and files in Google Drive, such as itineraries you created in Google Docs, to suggest a response with specific details that are on point. It will match your typical greeting and capture your tone, style and even favorite word choices, all to generate a reply that’s more relevant and sounds authentically like you. 
Personalized Smart Replies will be available for subscribers later this year. And you can imagine how helpful personal context will be across Search, Gemini and more. + +(AI Mode in Search...) + +Our Gemini models are helping to make Google Search more intelligent, agentic and personalized. + +Since launching last year, AI Overviews have scaled to over 1.5 billion users and are now in 200 countries and territories. As people use AI Overviews, we see they’re happier with their results, and they search more often. In our biggest markets like the U.S. and India, AI Overviews are driving over 10% growth in the types of queries that show them, and this growth increases over time. + +It’s one of the most successful launches in Search in the past decade. + +For those who want an end-to-end AI Search experience, we’re introducing an all-new AI Mode. It’s a total reimagining of Search. With more advanced reasoning, you can ask AI Mode longer and more complex queries. In fact, early testers have been asking queries that are two to three times the length of traditional searches, and you can go further with follow-up questions. All of this is available as a new tab right in Search. + +I’ve been using it a lot, and it’s completely changed how I use Search. And I’m excited to share that AI Mode is coming to everyone in the U.S., starting today. With our latest Gemini models our AI responses are at the quality and accuracy you've come to expect from Search, and are the fastest in the industry. And starting this week, Gemini 2.5 is coming to Search in the U.S., as well. + +(Advancing our most intelligent model: Gemini 2.5...) + +Our powerful and most efficient workhorse model, Gemini 2.5 Flash, has been incredibly popular with developers who love its speed and low cost. And the new 2.5 Flash is better in nearly every dimension — improving across key benchmarks for reasoning, multimodality, code and long context. It’s second only to 2.5 Pro on the LMArena leaderboard.
+ +We’re making 2.5 Pro even better by introducing an enhanced reasoning mode we’re calling Deep Think. It uses our latest cutting-edge research in thinking and reasoning, including parallel thinking techniques. + +(A more personal, proactive and powerful Gemini app...) + +We're making Deep Research more personal, allowing you to upload your own files and soon connect to Google Drive and Gmail, enhancing its ability to generate custom research reports. We're also integrating it with Canvas, enabling the creation of dynamic infographics, quizzes and even podcasts in numerous languages with a single click. Beyond this, we're seeing exciting adoption of vibe coding with Canvas, empowering more people to build functional apps simply by chatting with Gemini. + +And for Gemini Live, a feature that has truly resonated with users, we're making camera and screen sharing capabilities freely available to everyone, including iOS users, and will soon connect it to your favorite Google apps for more seamless assistance. + +(Advancements in our generative media models...) + +We’re introducing our latest state-of-the-art video model, Veo 3, which now has native audio generation. We’re also introducing Imagen 4, our latest and most capable image generation model. Both are available in the Gemini app — opening up a whole new world for creativity. + +We’re bringing those possibilities to filmmakers with a new tool called Flow. You can create cinematic clips, and extend a short clip into a longer scene. + +(An opportunity to improve lives...) + +The opportunity with AI is truly as big as it gets. And it will be up to this wave of developers, technology builders and problem solvers to make sure its benefits reach as many people as possible. And it’s especially inspiring to think about the research we’re working on today that will become the foundation of tomorrow’s reality, from robotics to quantum, AlphaFold and Waymo. 
+ +This opportunity to improve lives is not something I take for granted. And a recent experience brought that home for me. I was in San Francisco with my parents. The first thing they wanted to do was ride in a Waymo, which I’m learning is becoming one of the city’s top tourist attractions. I had taken Waymos before, but my father, who is in his 80s, was totally amazed; I saw the progress in a whole new light. + +It was a reminder of the incredible power of technology to inspire, to awe and to move us forward. And I can’t wait to see the amazing things we’ll build together next. + +--- + +(如需更多官方原文内容、分会场演讲、开发者专场、AI生成媒体等详细资料,可参考: +- https://blog.google/technology/ai/io-2025-keynote/ +- https://blog.google/technology/developers/google-io-2025-dialogues-ai-quantum-storytelling/ +- https://blog.google/technology/ai/generative-ai-io-keynote-2025/ +) + +--- + +【中文翻译】 + +Google I/O 2025:让AI更有用(Gemini为核心) + +—— + +Here’s how we’re making AI more helpful with Gemini. +我们如何通过Gemini让AI变得更有用。 + +Sundar Pichai CEO of Google and Alphabet +桑达尔·皮查伊,谷歌及Alphabet首席执行官 + +In this story: +- Google Beam +- Project Astra +- Project Mariner +- Personalization +- AI Mode +- Gemini 2.5 +- Gemini app +- Generative media +本次内容涵盖: +- Google Beam +- Project Astra +- Project Mariner +- 个性化 +- AI模式 +- Gemini 2.5 +- Gemini应用 +- 生成式媒体 + +Editor’s note: Below is an edited transcript of Google CEO Sundar Pichai’s remarks at Google I/O 2025, adapted to include more of what was announced on stage. See all the announcements in our collection. +编者注:以下为谷歌CEO桑达尔·皮查伊在Google I/O 2025大会上的演讲整理稿,并补充了更多现场发布内容。所有公告详见官方合集。 + +Normally, you wouldn’t have heard much from us in the weeks leading up to I/O, because we’d be saving up our best models for the stage. But in our Gemini era, we’re just as likely to ship our most intelligent model on a Tuesday in March, or announce a really cool breakthrough like AlphaEvolve a week before. 
+通常在I/O大会前几周你不会听到我们太多消息,因为我们会把最好的模型留到大会现场发布。但在Gemini时代,我们可能会在三月的某个星期二就发布最智能的模型,或者提前一周宣布像AlphaEvolve这样的重大突破。 + +We want to get our best models into your hands and our products ASAP. And so we’re shipping faster than ever. +我们希望尽快将最好的模型交到你们手中、集成到我们的产品中。因此,我们的发布速度比以往任何时候都快。 + +(Relentless model progress...) +(模型进步一刻不停……) + +I’m particularly excited about the rapid model progress. Elo scores, a measure of progress, are up more than 300 points since our first-generation Gemini Pro model. Today, Gemini 2.5 Pro sweeps the LMArena leaderboard in all categories. +我对模型的快速进步感到非常兴奋。Elo分数(衡量模型进步的指标)自第一代Gemini Pro以来已提升300多分。如今,Gemini 2.5 Pro在LMArena排行榜的所有类别中均位居榜首。 + +Model progress is enabled by our world-leading infrastructure. Our seventh-generation TPU, Ironwood, is the first designed specifically to power thinking and inferential AI workloads at scale. It delivers 10 times the performance over the previous generation, and packs an incredible 42.5 exaflops compute per pod — just amazing. +模型的进步得益于我们世界领先的基础设施。我们的第七代TPU——Ironwood,是首款专为大规模思考与推理型AI工作负载设计的芯片。其性能是上一代的10倍,每个集群可达惊人的42.5 exaflops算力,令人震撼。 + +Our infrastructure strength, down to the TPU, is what helps us deliver dramatically faster models, even as model prices are coming down significantly. Over and over, we've been able to deliver the best models at the most effective price point. Not only is Google leading the Pareto Frontier, we’ve fundamentally shifted the frontier itself. +正是这种从TPU到整体基础设施的强大能力,让我们能以更低的成本交付更快的模型。我们一次次以最优性价比推出最强模型。谷歌不仅引领了AI模型的帕累托前沿,更从根本上推动了前沿本身的移动。 + +(The world is adopting AI...) +(全世界正在拥抱AI……) + +More intelligence is available, for everyone, everywhere. And the world is responding, adopting AI faster than ever before. Some important markers of progress: +- This time last year, we were processing 9.7 trillion tokens a month across our products and APIs. Now, we’re processing over 480 trillion — that’s 50 times more.
+- Over 7 million developers are building with Gemini, five times more than this time last year, and Gemini usage on Vertex AI is up 40 times. +- The Gemini app now has over 400 million monthly active users. We are seeing strong growth and engagement particularly with the 2.5 series of models. For those using 2.5 Pro in the Gemini app, usage has gone up 45%. +如今,智能能力无处不在、人人可用。全球对AI的采用速度前所未有。几个重要进展: +- 去年同期,我们每月处理9.7万亿token,如今已超480万亿,约为去年同期的50倍; +- 超过700万开发者在用Gemini,是去年同期的5倍,Vertex AI上的Gemini用量增至40倍; +- Gemini应用月活跃用户超4亿,2.5系列模型用户增长尤为迅猛,2.5 Pro在Gemini应用中的使用量提升45%。 + +(后续内容将继续分批补充,确保每段英文后紧跟对应中文翻译,直至全文完成。) \ No newline at end of file diff --git a/知识模块/20250128_跨行业信息收集框架整合.docx b/知识模块/20250128_跨行业信息收集框架整合.docx new file mode 100644 index 0000000..043fb71 --- /dev/null +++ b/知识模块/20250128_跨行业信息收集框架整合.docx @@ -0,0 +1,132 @@ +# 跨行业信息收集框架整合分析 + +## 搜索信息 +- **来源**:用户提供的英文框架文档 +- **搜索关键词**:Cross-Industry Information-Gathering Framework +- **搜索时间**:2025-01-28 +- **代理类型**:文本智能 + 知识整合 + +## 英文原文 + +**Cross-Industry Information-Gathering Framework (English-Centric)** + +**Main Recommendation:** Adopt industry-tailored AI agents that crawl and analyze **English-language** sources—leveraging global platforms for broader coverage and standardized data formats. + +### 1. Universal Workflow + +1. **Industry Selection** +2. **Sub-sector Definition** (e.g., for Automotive: Passenger vs. Commercial) +3. **Knowledge Modules** +   - Technology & Innovation +   - Market & Competition +   - Regulation & Policy +   - Sentiment & Social Insight +4. **Primary English Sources** +5. **AI Agent Roles** + +### 2.
Industry-Specific Modules and English Sources + +| Industry | Knowledge Modules | Key English Websites / Platforms | Recommended Agent Type | +|------------------|--------------------------------|-----------------------------------------------------------|--------------------------------------| +| **Financial** | Market Data, Regulation | SEC.gov; Bloomberg.com; Yahoo Finance; Morningstar.com | Data-Harvesting + Text-Intelligence | +| **Manufacturing**| Standards, Supply Chain | IHSMarkit.com; IEEE Xplore; ThomasNet.com; Engineering.com| Literature-Mining + Data-Harvesting | +| **AI & Software**| Algorithm Research, Open Source| arXiv.org; GitHub.com; PapersWithCode.com; StackOverflow.com | Literature-Mining + Social-Listening | +| **Healthcare & Pharma**| Clinical Trials, Patents| PubMed.gov; ClinicalTrials.gov; FDA.gov; WIPO.int | Literature-Mining + Text-Intelligence | +| **FMCG** | Market Research, Brand Trends | Euromonitor.com; Nielsen.com; Statista.com; Mintel.com | Text-Intelligence + Social-Listening | +| **Retail & E-commerce**| Sales Data, User Reviews| eMarketer.com; SimilarWeb.com; Google Trends; Trustpilot.com | Data-Harvesting + Social-Listening | +| **Energy & Chemicals**| Price Indices, Environment| S&PGlobal.com/Platts; EIA.gov; IEA.org; Environmental-Protection.org | Data-Harvesting + Text-Intelligence | +| **Real Estate & Construction**| Policy, Transactions| Zillow.com; CBRE.com/research; JLL.com/research; WorldBank.org/housing | Text-Intelligence + Data-Harvesting | + +### 3. 
Core Search Strategy per Module + +#### 3.1 Text-Intelligence Agent +- **Search Keywords:** + - "[Industry] latest regulation" + - "[Industry] annual report" +- **Acquisition:** RSS feeds, scheduled web crawls +- **Processing:** Summarization → Key insights extraction → Compliance risk flags + +#### 3.2 Data-Harvesting Agent +- **Search Keywords:** + - "[Industry] statistics" + - "[Industry] price index" +- **Acquisition:** Public APIs (e.g., SEC EDGAR API, EIA API) + table scraping +- **Processing:** Structured database → Time-series analysis → Dashboard integration + +#### 3.3 Literature-Mining Agent +- **Search Keywords:** + - "[Core technology] review" + - "[Core technology] survey" +- **Acquisition:** arXiv API; PubMed API; IEEE Xplore subscription +- **Processing:** Auto-abstracting → Technology evolution mapping → Expert scoring + +#### 3.4 Social-Listening Agent +- **Search Keywords:** + - Hashtags (e.g., "#autonomousdriving", "#vaccine") +- **Acquisition:** Twitter API; Reddit scrapers; Trustpilot API +- **Processing:** Sentiment analysis → Influencer identification → Alert generation + +### 4. Sample Agent Workflow + +#### Text-Intelligence Agent (Financial Regulation) +1. Poll SEC.gov RSS daily. +2. Retrieve newly filed rule-makings. +3. Auto-summarize (200-word briefs). +4. Tag affected sectors; push notifications. + +#### Data-Harvesting Agent (E-commerce Sales) +1. Query eMarketer API for daily sales figures. +2. Clean & normalize by region and category. +3. Compare vs. rolling 12-month baseline. +4. Export to BI dashboards. + +### 5. Implementation Best Practices + +- **Centralized Orchestration:** Use a workflow platform (e.g., Apache Airflow) to schedule and monitor agents. +- **Compliance & Rate Limits:** Honor robots.txt, API quotas, and data-use agreements. +- **Iterative Refinement:** Quarterly review of keywords, sources, and agent performance to adapt to market shifts. 
+ +This English-centric framework ensures access to globally recognized sources, standardized APIs, and broad research coverage—facilitating deeper, more reliable cross-industry insights. + +## 中文分析 + +### 核心框架要点 +这个跨行业信息收集框架提出了一个以英文信息源为中心的系统化方法,通过AI代理来收集和分析全球化的行业信息。 + +### 主要特点 +1. **行业覆盖全面**:涵盖金融、制造、AI软件、医疗制药、快消品、零售电商、能源化工、房地产建筑等8大行业 +2. **知识模块标准化**:每个行业都按照技术创新、市场竞争、监管政策、情感洞察四大模块分类 +3. **代理类型专业化**:提供文本智能、数据收集、文献挖掘、社交监听四种AI代理类型 +4. **信息源权威性**:重点关注英文权威网站和平台,确保信息质量和全球视野 + +### 整合价值 +- **系统性**:提供了完整的跨行业信息收集方法论 +- **标准化**:统一了搜索策略和处理流程 +- **实用性**:包含具体的网站资源和关键词策略 +- **可扩展性**:框架可以根据需要调整和扩展到新行业 + +### 与现有搜索规则的整合 +已成功将该框架整合到现有搜索规则中,新增了: +- 行业维度归类系统 +- 知识模块分类方法 +- 专业化搜索策略 +- 重点英文信息源清单 +- 实施最佳实践指南 + +### 应用建议 +1. 优先使用英文信息源,确保全球视野 +2. 按行业和知识模块双重维度进行信息归类 +3. 建立定期的信息源质量审查机制 +4. 根据季度表现调整搜索策略和关键词 + +## 快速记忆笔记 +- 跨行业信息收集框架:8大行业 × 4大知识模块 × 4种AI代理 +- 英文信息源优先:确保全球视野和标准化数据格式 +- 系统化方法:从行业选择到信息处理的完整工作流 +- 实施关键:集中编排 + 合规管理 + 迭代优化 + +## 相关大类标签 +- 知识模块 +- 搜索策略 +- 行业分析 +- 信息管理 \ No newline at end of file diff --git a/能源化工/能源化工搜索规则.md b/能源化工/能源化工搜索规则.md new file mode 100644 index 0000000..a09e668 --- /dev/null +++ b/能源化工/能源化工搜索规则.md @@ -0,0 +1,171 @@ +# 能源化工搜索规则 + +## 1. 核心原则 +- **语言优先**:默认英文搜索,全球能源市场与技术以英文为主 +- **权威优先**:政府能源部门 > 国际组织 > 行业协会 > 专业媒体 + +## 2. 
英文权威信息源库 + +### 2.1 政府与监管机构(最高优先级) +- **美国**: + - Department of Energy: https://www.energy.gov + - EIA: https://www.eia.gov + - EPA: https://www.epa.gov + - FERC: https://www.ferc.gov +- **欧盟**: + - European Commission Energy: https://energy.ec.europa.eu + - ENTSO-E: https://www.entsoe.eu + - ACER: https://www.acer.europa.eu +- **英国**: + - UK Government Energy: https://www.gov.uk/government/organisations/department-for-business-energy-and-industrial-strategy +- **国际组织**: + - IEA: https://www.iea.org + - IRENA: https://www.irena.org + - OPEC: https://www.opec.org + +### 2.2 权威研究机构 +- **IEA**: https://www.iea.org (国际能源署) +- **IRENA**: https://www.irena.org (国际可再生能源署) +- **Wood Mackenzie**: https://www.woodmac.com (能源咨询) +- **Bloomberg NEF**: https://about.bnef.com (新能源金融) +- **Rystad Energy**: https://www.rystadenergy.com + +### 2.3 行业协会与组织 +- **API**: https://www.api.org (美国石油学会) +- **AIChE**: https://www.aiche.org (美国化学工程师学会) +- **World Energy Council**: https://www.worldenergy.org +- **Global Wind Energy Council**: https://gwec.net +- **Solar Power Europe**: https://www.solarpowereurope.org + +### 2.4 专业能源媒体 +- **Oil & Gas Journal**: https://www.ogj.com +- **Chemical Engineering**: https://www.chemengonline.com +- **Renewable Energy World**: https://www.renewableenergyworld.com +- **Petroleum Economist**: https://www.petroleum-economist.com + +## 3. 
头部自媒体与KOL + +### 3.1 能源专家Twitter +- **Dan Yergin**: https://twitter.com/DanielYergin +- **Jigar Shah**: https://twitter.com/JigarShahDC +- **Energy Secretary**: https://twitter.com/SecGranholm +- **Energy Analyst**: 关注知名能源分析师账号 + +### 3.2 能源公司官方 +- **石油巨头**: + - ExxonMobil: https://corporate.exxonmobil.com + - Shell: https://www.shell.com + - BP: https://www.bp.com + - TotalEnergies: https://totalenergies.com +- **新能源公司**: + - Tesla: https://www.tesla.com + - NextEra Energy: https://www.nexteraenergy.com + - Orsted: https://orsted.com +- **设备制造商**: + - GE Renewable Energy: https://www.ge.com/renewableenergy + - Vestas: https://www.vestas.com + - Siemens Energy: https://www.siemens-energy.com + +### 3.3 专业社区 +- **LinkedIn群组**:Oil and Gas Professionals, Renewable Energy Network +- **Reddit社区**: + - Energy: https://www.reddit.com/r/energy + - Renewable Energy: https://www.reddit.com/r/renewableenergy + - Oil and Gas Workers: https://www.reddit.com/r/oilandgasworkers +- **专业论坛**: + - Rigzone: https://www.rigzone.com + - Energy Central: https://energycentral.com + +## 4. 
搜索策略与关键词 + +### 4.1 能源技术搜索 +**关键词模板**: +- "renewable energy [technology] [advancement] [year]" +- "oil gas [extraction] [innovation] [efficiency]" +- "energy storage [battery] [grid] [scale]" +- "carbon capture [technology] [deployment]" + +### 4.2 能源政策搜索 +**关键词模板**: +- "energy policy [country] [regulation] [year]" +- "climate change [legislation] [impact] [energy]" +- "renewable energy [incentive] [subsidy] [policy]" +- "carbon pricing [mechanism] [implementation]" + +### 4.3 市场分析搜索 +**关键词模板**: +- "energy market [outlook] [forecast] [year]" +- "oil price [prediction] [analysis] [factors]" +- "natural gas [demand] [supply] [regional]" +- "electricity market [reform] [competition]" + +### 4.4 可持续发展搜索 +**关键词模板**: +- "net zero [strategy] [energy] [transition]" +- "ESG [energy sector] [investment] [criteria]" +- "circular economy [chemical industry] [waste]" +- "green hydrogen [production] [applications]" + +## 5. 行业专门搜索网站 + +### 5.1 能源数据平台 +- **S&P Global Platts**: https://www.spglobal.com/platts (能源价格信息) +- **Argus Media**: https://www.argusmedia.com (商品价格报告) +- **ICIS**: https://www.icis.com (化工市场情报) +- **Energy Information Administration**: https://www.eia.gov + +### 5.2 投资与金融 +- **Energy Intelligence**: https://www.energyintel.com +- **Hart Energy**: https://www.hartenergy.com +- **Energy Capital**: https://www.energycapitalmedia.com +- **Evaluate Energy**: https://www.evaluate-energy.com + +### 5.3 技术与创新 +- **Clean Energy Ministerial**: https://www.cleanenergyministerial.org +- **Mission Innovation**: https://mission-innovation.net +- **Energy Transitions Commission**: https://www.energy-transitions.org + +## 6. 实时监控重点 + +### 6.1 高频监控(每日) +- 能源价格变动 +- 政策法规更新 +- 重大项目进展 +- 市场交易动态 + +### 6.2 中频监控(每周) +- 技术突破发布 +- 投资并购消息 +- 环境影响评估 +- 国际能源合作 + +### 6.3 低频监控(每月) +- 行业趋势报告 +- 长期能源规划 +- 气候政策变化 + +## 7. 
文件命名与归档 + +### 7.1 命名规则 +- **能源技术**:`YYYYMMDD_Energy_Tech_[Technology/Innovation].docx` +- **政策法规**:`YYYYMMDD_Energy_Policy_[Country/Topic].docx` +- **市场分析**:`YYYYMMDD_Energy_Market_[Commodity/Region].docx` +- **可持续发展**:`YYYYMMDD_Energy_Sustainability_[Topic].docx` + +### 7.2 内容格式 +**文档头部**: +- Source: [完整URL链接] +- Keywords: [Energy terms used] +- Search Time: [YYYY-MM-DD HH:MM UTC] +- Energy Type: [Oil/Gas/Renewable/Nuclear/etc.] +- Geographic Focus: [Global/Regional/Country] + +## 8. 中国特定搜索(仅当明确要求时) + +### 8.1 中国能源机构 +- 国家能源局: http://www.nea.gov.cn +- 发改委: https://www.ndrc.gov.cn +- 生态环境部: https://www.mee.gov.cn + +### 8.2 中文关键词 +- "能源转型"、"碳达峰"、"碳中和"、"新能源" \ No newline at end of file diff --git a/金融行业/金融搜索规则.md b/金融行业/金融搜索规则.md new file mode 100644 index 0000000..87aba93 --- /dev/null +++ b/金融行业/金融搜索规则.md @@ -0,0 +1,175 @@ +# 金融行业搜索规则 + +## 1. 核心原则 +- **语言优先**:默认英文搜索,除非明确指定中国金融市场 +- **权威优先**:官方监管机构 > 头部财经媒体 > 知名KOL > 其他来源 + +## 2. 英文权威信息源库 + +### 2.1 官方监管机构(最高优先级) +- **美国**: + - SEC: https://www.sec.gov + - Federal Reserve: https://www.federalreserve.gov + - CFTC: https://www.cftc.gov + - Treasury: https://home.treasury.gov + - FDIC: https://www.fdic.gov +- **欧洲**: + - ECB: https://www.ecb.europa.eu + - EBA: https://www.eba.europa.eu + - ESMA: https://www.esma.europa.eu +- **英国**: + - Bank of England: https://www.bankofengland.co.uk + - FCA: https://www.fca.org.uk +- **国际组织**: + - IMF: https://www.imf.org + - BIS: https://www.bis.org + - World Bank: https://www.worldbank.org + - OECD: https://www.oecd.org + +### 2.2 头部财经媒体(高优先级) +- **综合财经**: + - Bloomberg: https://www.bloomberg.com + - Reuters: https://www.reuters.com + - Financial Times: https://www.ft.com + - Wall Street Journal: https://www.wsj.com +- **专业财经**: + - MarketWatch: https://www.marketwatch.com + - CNBC: https://www.cnbc.com + - Barron's: https://www.barrons.com + - Investing.com: https://www.investing.com +- **数据平台**: + - Yahoo Finance: https://finance.yahoo.com + - Morningstar: 
https://www.morningstar.com + - Refinitiv: https://www.refinitiv.com + +### 2.3 头部自媒体与KOL +- **知名博客**: + - Zero Hedge: https://www.zerohedge.com + - Seeking Alpha: https://seekingalpha.com + - Bloomberg Opinion: https://www.bloomberg.com/opinion +- **Twitter KOL**: + - @federalreserve: https://twitter.com/federalreserve + - @SEC_News: https://twitter.com/SEC_News + - @ecb: https://twitter.com/ecb + - @bankofengland: https://twitter.com/bankofengland +- **专业论坛**: + - Reddit Investing: https://www.reddit.com/r/investing + - Reddit Security Analysis: https://www.reddit.com/r/SecurityAnalysis + +### 2.4 交易所与市场数据 +- **美国**: + - NYSE: https://www.nyse.com + - NASDAQ: https://www.nasdaq.com + - CBOE: https://www.cboe.com +- **欧洲**: + - London Stock Exchange: https://www.londonstockexchange.com + - Euronext: https://www.euronext.com +- **亚太**: + - Hong Kong Exchange: https://www.hkex.com.hk + - Japan Exchange: https://www.jpx.co.jp +- **加密货币**: + - Coinbase: https://www.coinbase.com + - Binance: https://www.binance.com + - CoinDesk: https://www.coindesk.com + +## 3. 搜索策略与关键词 + +### 3.1 监管政策搜索 +**关键词模板**: +- "Federal Reserve policy [topic] [year]" +- "SEC regulation [sector] [date]" +- "banking regulation [region] [topic]" +- "financial compliance [area] update" + +### 3.2 市场分析搜索 +**关键词模板**: +- "stock market analysis [period]" +- "financial market trends [year]" +- "investment outlook [sector]" +- "economic indicators [country] [period]" + +### 3.3 技术创新搜索 +**关键词模板**: +- "fintech innovation [technology]" +- "blockchain finance applications" +- "AI in financial services" +- "digital banking trends [year]" + +### 3.4 市场情绪搜索 +**关键词模板**: +- "investor sentiment [market] [period]" +- "financial market volatility" +- "trading volume analysis" +- "market fear index VIX" + +## 4. 
行业专门搜索网站 + +### 4.1 投资银行研究 +- **Goldman Sachs**: https://www.gs.com/insights +- **JPMorgan Chase**: https://www.jpmorganchase.com/insights +- **Morgan Stanley**: https://www.morganstanley.com/ideas +- **Bank of America**: https://www.bankofamerica.com/research + +### 4.2 评级机构 +- **Moody's**: https://www.moodys.com +- **S&P Global**: https://www.spglobal.com +- **Fitch Ratings**: https://www.fitchratings.com + +### 4.3 咨询公司 +- **McKinsey Financial Services**: https://www.mckinsey.com/industries/financial-services +- **BCG Financial Institutions**: https://www.bcg.com/industries/financial-institutions +- **Deloitte Financial Services**: https://www.deloitte.com/global/en/Industries/financial-services.html +- **PwC Financial Services**: https://www.pwc.com/gx/en/industries/financial-services.html + +## 5. 文件命名与归档 + +### 5.1 命名规则 +- **监管政策**:`YYYYMMDD_Finance_Regulation_[Topic].docx` +- **市场分析**:`YYYYMMDD_Finance_Market_[Market/Sector].docx` +- **技术创新**:`YYYYMMDD_Finance_Tech_[Technology].docx` +- **情感洞察**:`YYYYMMDD_Finance_Sentiment_[Event].docx` +- **中国特定**:添加"_CN"后缀 + +### 5.2 内容格式 +**文档头部**: +- Source: [完整URL链接] +- Keywords: [Search terms used] +- Search Time: [YYYY-MM-DD HH:MM UTC] +- Region: [US/EU/Global/CN] +- Authority Level: [Official/Media/KOL/Other] + +## 6. 中国特定搜索(仅当明确要求时) + +### 6.1 官方机构 +- 中国人民银行: http://www.pbc.gov.cn +- 银保监会: http://www.cbirc.gov.cn +- 证监会: http://www.csrc.gov.cn +- 外汇局: http://www.safe.gov.cn + +### 6.2 主要媒体 +- 财新网: https://www.caixin.com +- 第一财经: https://www.yicai.com +- 21世纪经济报道: https://www.21jingji.com +- 证券时报: https://www.stcn.com + +### 6.3 关键词 +- "A股市场"、"人民币汇率"、"央行政策"、"金融监管" + +## 7. 
实时监控重点 + +### 7.1 高频监控(每日) +- Federal Reserve announcements +- SEC filing updates +- Major bank earnings +- Market volatility events + +### 7.2 中频监控(每周) +- Central bank policy updates +- Regulatory consultations +- Investment bank research +- Fintech innovation news + +### 7.3 低频监控(每月) +- Regulatory framework changes +- Industry trend reports +- Academic research publications \ No newline at end of file diff --git a/零售电商/零售电商搜索规则.md b/零售电商/零售电商搜索规则.md new file mode 100644 index 0000000..c1ceed3 --- /dev/null +++ b/零售电商/零售电商搜索规则.md @@ -0,0 +1,165 @@ +# 零售电商搜索规则 + +## 1. 核心原则 +- **语言优先**:默认英文搜索,全球电商与数字零售以英文为主 +- **权威优先**:电商平台官方 > 研究机构 > 行业媒体 > 从业者社区 + +## 2. 英文权威信息源库 + +### 2.1 头部电商平台官方(最高优先级) +- **Amazon**: + - Corporate: https://www.amazon.com + - Press Center: https://press.aboutamazon.com + - Investor Relations: https://ir.aboutamazon.com +- **Google Retail**: + - Google for Retail: https://retail.withgoogle.com + - Google Ads for Retail: https://ads.google.com/home/campaigns/retail +- **Meta Business**: + - Facebook Business: https://business.facebook.com + - Instagram Business: https://business.instagram.com +- **Shopify**: + - Corporate: https://www.shopify.com + - Investors: https://investors.shopify.com + - Shopify Plus: https://www.shopify.com/plus +- **Adobe Commerce**: + - Adobe Commerce: https://business.adobe.com/products/magento/magento-commerce.html + +### 2.2 权威研究机构 +- **eMarketer**: https://www.emarketer.com (数字营销研究) +- **Forrester**: https://www.forrester.com (数字商务研究) +- **McKinsey Retail**: https://www.mckinsey.com/industries/retail +- **BCG Retail**: https://www.bcg.com/industries/retail-consumer-goods +- **Deloitte Retail**: https://www2.deloitte.com/us/en/pages/consumer-business/topics/retail-distribution.html + +### 2.3 电商数据平台 +- **SimilarWeb**: https://www.similarweb.com (网站流量分析) +- **SEMrush**: https://www.semrush.com (数字营销工具) +- **Sensor Tower**: https://sensortower.com (移动应用分析) +- **Data.ai**: https://www.data.ai (应用市场情报) + +### 2.4 专业电商媒体 
+- **Digital Commerce 360**: https://www.digitalcommerce360.com +- **Retail Dive**: https://www.retaildive.com +- **eCommerce Times**: https://www.ecommercetimes.com +- **Practical Ecommerce**: https://www.practicalecommerce.com + +## 3. 头部自媒体与KOL + +### 3.1 电商专家Twitter/LinkedIn +- **Andrew Chen**: https://twitter.com/andrewchen +- **Eric Ries**: https://twitter.com/ericries +- **Lenny Rachitsky**: https://twitter.com/lennysan +- **Sean Ellis**: https://twitter.com/seanellis +- **Brianne Kimmel**: https://twitter.com/briannekimmel + +### 3.2 电商公司高管 +- **平台CEO/高管**:Amazon, Shopify, Square等高管 +- **DTC品牌创始人**:成功DTC品牌创始人 +- **投资人**:专注电商/零售的VC合伙人 + +### 3.3 专业社区 +- **Reddit社区**: + - E-commerce: https://www.reddit.com/r/ecommerce + - Entrepreneur: https://www.reddit.com/r/entrepreneur + - Dropship: https://www.reddit.com/r/dropship +- **专业论坛**: + - eCommerce Fuel: https://www.ecommercefuel.com + - Shopify Partners Blog: https://www.shopify.com/partners/blog +- **LinkedIn群组**:E-commerce Professionals, Digital Commerce + +## 4. 搜索策略与关键词 + +### 4.1 电商技术搜索 +**关键词模板**: +- "e-commerce technology [innovation] [year]" +- "online shopping [platform] [feature]" +- "digital payment [solution] [adoption]" +- "mobile commerce [trends] [region]" + +### 4.2 营销与获客搜索 +**关键词模板**: +- "digital marketing [channel] [ROI]" +- "customer acquisition [strategy] [cost]" +- "social commerce [platform] [growth]" +- "influencer marketing [industry] [effectiveness]" + +### 4.3 用户体验搜索 +**关键词模板**: +- "user experience [optimization] [conversion]" +- "personalization [algorithm] [retail]" +- "customer journey [mapping] [touchpoint]" +- "omnichannel [strategy] [implementation]" + +### 4.4 物流与履约搜索 +**关键词模板**: +- "e-commerce logistics [last mile] [innovation]" +- "fulfillment [automation] [warehouse]" +- "supply chain [optimization] [digital]" +- "delivery [speed] [customer satisfaction]" + +## 5. 
行业专门搜索网站 + +### 5.1 电商技术平台 +- **Shopify Plus Resources**: https://www.shopify.com/plus/resources +- **BigCommerce Resources**: https://www.bigcommerce.com/resources +- **WooCommerce Blog**: https://woocommerce.com/posts +- **Magento Resources**: https://business.adobe.com/resources/main.html + +### 5.2 支付与金融科技 +- **Stripe Resources**: https://stripe.com/resources +- **PayPal Business Insights**: https://www.paypal.com/us/business/insights +- **Square Townsquare**: https://squareup.com/us/en/townsquare +- **Klarna Knowledge**: https://www.klarna.com/knowledge + +### 5.3 物流与配送 +- **FedEx Insights**: https://www.fedex.com/en-us/insights.html +- **UPS Insights**: https://www.ups.com/us/en/services/knowledge-center.page +- **DHL Insights**: https://www.dhl.com/global-en/home/insights-and-innovation.html +- **Amazon FBA**: https://services.amazon.com/fulfillment-by-amazon + +## 6. 实时监控重点 + +### 6.1 高频监控(每日) +- 主要电商平台更新 +- 移动应用商店排名变化 +- 社交媒体电商功能 +- 支付技术创新 + +### 6.2 中频监控(每周) +- 电商平台政策变化 +- 新兴DTC品牌崛起 +- 数字营销趋势 +- 客户体验创新 + +### 6.3 低频监控(每月) +- 行业研究报告 +- 电商投资并购 +- 长期技术趋势 + +## 7. 文件命名与归档 + +### 7.1 命名规则 +- **技术平台**:`YYYYMMDD_Ecommerce_Tech_[Platform/Technology].docx` +- **营销策略**:`YYYYMMDD_Ecommerce_Marketing_[Channel/Strategy].docx` +- **用户体验**:`YYYYMMDD_Ecommerce_UX_[Feature/Optimization].docx` +- **市场分析**:`YYYYMMDD_Ecommerce_Market_[Segment/Region].docx` + +### 7.2 内容格式 +**文档头部**: +- Source: [完整URL链接] +- Keywords: [Search terms] +- Search Time: [YYYY-MM-DD HH:MM UTC] +- Business Model: [B2C/B2B/D2C/Marketplace] +- Technology Focus: [Frontend/Backend/Analytics/etc.] + +## 8. 中国特定搜索(仅当明确要求时) + +### 8.1 中国电商平台 +- 阿里巴巴: https://www.alibaba.com +- 腾讯: https://www.tencent.com +- 京东: https://www.jd.com +- 拼多多: https://www.pdd.com +- 字节跳动: https://www.bytedance.com + +### 8.2 中文关键词 +- "电商平台"、"直播带货"、"社交电商"、"私域流量" \ No newline at end of file