Files
20250715-66bfff96/代码实现/rss_monitor.py
2026-04-25 19:21:03 +08:00

324 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# -*- coding: utf-8 -*-
"""
RSS监控脚本 - 自动获取RSS源更新
"""
import logging
import re
import threading
import time
from collections import Counter
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timezone
from html import unescape
from typing import List, Dict, Optional

import feedparser
import requests

from config import RSS_MONITOR_CONFIG, SEARCH_CONFIG
from database import DatabaseManager
class RSSMonitor:
    """Poll the configured RSS sources on a fixed interval and store new articles.

    Sources are read through ``DatabaseManager.get_rss_sources``; each polling
    pass fetches the feeds on a thread pool, converts the entries into article
    dicts and passes them to ``DatabaseManager.save_article``.
    """

    # Upper bound on the number of sources fetched concurrently in one pass.
    MAX_WORKERS = 10

    # Pause after an unexpected error in the polling loop before retrying.
    _ERROR_BACKOFF_SECONDS = 60

    # Longest single sleep while waiting between passes; this bounds how long
    # the polling loop takes to notice stop_monitoring().
    _SLEEP_SLICE_SECONDS = 1.0

    # Tags that separate blocks of text: they are replaced by a space so the
    # words on either side are not glued together when markup is stripped.
    _BLOCK_TAG_RE = re.compile(
        r'</?(?:address|article|aside|blockquote|br|dd|div|dl|dt|figcaption|'
        r'figure|footer|h[1-6]|header|hr|li|main|nav|ol|p|pre|section|table|'
        r'tbody|td|tfoot|th|thead|tr|ul)\b[^>]*>',
        re.IGNORECASE,
    )
    _TAG_RE = re.compile(r'<[^>]+>')
    _WHITESPACE_RE = re.compile(r'\s+')
    _WORD_RE = re.compile(r'\b[a-zA-Z]{3,}\b')

    # Common English words that are never reported as keywords.
    _STOP_WORDS = frozenset({
        'the', 'and', 'for', 'are', 'but', 'not', 'you', 'all', 'can', 'had',
        'her', 'was', 'one', 'our', 'out', 'day', 'get', 'has', 'him', 'his',
        'how', 'its', 'may', 'new', 'now', 'old', 'see', 'two', 'who', 'boy',
        'this', 'that', 'with', 'have', 'will', 'from', 'they', 'been',
        'said', 'each', 'make', 'most', 'over', 'some', 'time', 'very',
        'what', 'when', 'here', 'just', 'like', 'long', 'many', 'than',
        'them', 'well', 'your', 'come', 'could', 'into', 'more', 'much',
        'only', 'other', 'such', 'take', 'were',
    })

    def __init__(self):
        """Create the database handle and load settings from RSS_MONITOR_CONFIG."""
        self.db = DatabaseManager()
        self.logger = logging.getLogger(__name__)
        self.is_running = False
        self.check_interval = RSS_MONITOR_CONFIG['check_interval']
        self.max_retries = RSS_MONITOR_CONFIG['max_retries']
        self.timeout = RSS_MONITOR_CONFIG['timeout']
        self.user_agent = RSS_MONITOR_CONFIG['user_agent']

    def start_monitoring(self):
        """Run the polling loop until stop_monitoring() is called or Ctrl-C is hit.

        Each iteration checks every source and then waits ``check_interval``
        seconds. An unexpected error is logged and followed by a one-minute
        pause. Both waits end early once ``is_running`` is cleared.
        """
        self.is_running = True
        self.logger.info("RSS监控器启动")
        while self.is_running:
            try:
                self._check_all_sources()
                self.logger.info(f"等待 {self.check_interval} 秒后进行下次检查")
                self._sleep_while_running(self.check_interval)
            except KeyboardInterrupt:
                self.logger.info("收到停止信号")
                break
            except Exception as e:
                self.logger.error(f"监控过程出错: {e}")
                # Pause before the next attempt so a persistent failure does
                # not turn into a busy loop.
                self._sleep_while_running(self._ERROR_BACKOFF_SECONDS)

    def _sleep_while_running(self, seconds: float) -> None:
        """Sleep up to ``seconds``, returning early once ``is_running`` is False.

        The wait is done in short ``time.sleep`` slices so that a call to
        stop_monitoring() from another thread is honoured within about
        ``_SLEEP_SLICE_SECONDS`` instead of after the whole interval.
        """
        deadline = time.monotonic() + seconds
        while self.is_running:
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            time.sleep(min(self._SLEEP_SLICE_SECONDS, remaining))

    def stop_monitoring(self):
        """Ask the polling loop to exit by clearing ``is_running``."""
        self.is_running = False
        self.logger.info("RSS监控器停止")

    def _check_all_sources(self):
        """Check every RSS source in parallel and log a success/failure summary."""
        sources = self.db.get_rss_sources()
        self.logger.info(f"开始检查 {len(sources)} 个RSS源")

        success_count = 0
        error_count = 0
        # NOTE(review): the worker threads all use self.db; confirm that
        # DatabaseManager is safe to call from several threads at once.
        with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
            futures = {
                executor.submit(self._check_single_source, source): source
                for source in sources
            }
            for future in as_completed(futures):
                source = futures[future]
                try:
                    articles_count = future.result()
                except Exception as e:
                    error_count += 1
                    self.logger.error(f"检查 {source['source_name']} 时出错: {e}")
                    continue
                if articles_count is None:
                    # None is how _check_single_source reports a failure.
                    error_count += 1
                    continue
                success_count += 1
                if articles_count > 0:
                    self.logger.info(
                        f"{source['source_name']}: 新增 {articles_count} 篇文章"
                    )

        self.logger.info(f"RSS检查完成: 成功 {success_count}, 失败 {error_count}")

    def _check_single_source(self, source: Dict) -> Optional[int]:
        """Fetch one source, save its articles and record the check time.

        Args:
            source: Source record; the keys ``id``, ``source_name`` and
                ``source_url`` are read here.

        Returns:
            The number of articles for which ``save_article`` returned a
            truthy id, or None when fetching or saving failed.
        """
        source_id = source['id']
        source_name = source['source_name']
        source_url = source['source_url']
        try:
            articles = self._fetch_rss_articles(source_url, source)
            if articles is None:
                return None

            new_articles_count = 0
            for article in articles:
                article['source_id'] = source_id
                if self.db.save_article(article):
                    new_articles_count += 1

            self.db.update_rss_source_check_time(source_id)
            return new_articles_count
        except Exception as e:
            self.logger.error(f"检查RSS源 {source_name} 失败: {e}")
            return None

    def _fetch_rss_articles(self, url: str, source: Dict) -> Optional[List[Dict]]:
        """Download and parse a feed, retrying failed requests.

        Request errors are retried up to ``max_retries`` times with a
        ``2 ** attempt`` second pause between attempts; any other error
        aborts immediately.

        Args:
            url: Feed URL to download.
            source: Source record, used for log messages and passed on to
                ``_parse_rss_entry``.

        Returns:
            The parsed articles (possibly an empty list), or None when no
            attempt succeeded.
        """
        headers = {
            'User-Agent': self.user_agent,
            'Accept': 'application/rss+xml, application/xml, text/xml'
        }
        for attempt in range(self.max_retries):
            try:
                response = requests.get(url, headers=headers, timeout=self.timeout)
                response.raise_for_status()

                feed = feedparser.parse(response.content)
                # A malformed feed is only reported; whatever entries were
                # parsed are still processed below.
                if feed.bozo and feed.bozo_exception:
                    self.logger.warning(
                        f"RSS解析警告 {source['source_name']}: {feed.bozo_exception}"
                    )

                parsed = (self._parse_rss_entry(entry, source) for entry in feed.entries)
                return [article for article in parsed if article]
            except requests.RequestException as e:
                self.logger.warning(
                    f"{attempt + 1} 次尝试获取 {source['source_name']} 失败: {e}"
                )
                if attempt < self.max_retries - 1:
                    time.sleep(2 ** attempt)  # exponential backoff
            except Exception as e:
                self.logger.error(f"解析RSS {source['source_name']} 时出错: {e}")
                break
        return None

    def _parse_rss_entry(self, entry, source: Dict) -> Optional[Dict]:
        """Convert one feed entry into an article dict.

        Keywords are extracted from the cleaned text rather than from the raw
        markup, so HTML tag, attribute and entity names do not show up as
        keywords.

        Args:
            entry: Feed entry exposing its fields as attributes.
            source: Source record; only ``language`` is read (default 'en').

        Returns:
            The article dict, or None when the entry has no title or link, or
            when parsing it raised.
        """
        try:
            # Check the required fields first so an entry without them is
            # skipped quietly instead of failing part-way through.
            title = getattr(entry, 'title', '')
            link = getattr(entry, 'link', '')
            if not title or not link:
                return None

            # Publication time, falling back to the update time.
            parsed_time = (
                getattr(entry, 'published_parsed', None)
                or getattr(entry, 'updated_parsed', None)
            )
            published_date = None
            if parsed_time:
                published_date = datetime(*parsed_time[:6], tzinfo=timezone.utc)

            # Body: full content when present, otherwise summary/description.
            content_blocks = getattr(entry, 'content', None)
            if content_blocks:
                if isinstance(content_blocks, list):
                    raw_content = content_blocks[0].value
                else:
                    raw_content = content_blocks
            elif hasattr(entry, 'summary'):
                raw_content = entry.summary
            elif hasattr(entry, 'description'):
                raw_content = entry.description
            else:
                raw_content = ''

            if hasattr(entry, 'author'):
                author = entry.author
            elif hasattr(entry, 'dc_creator'):
                author = entry.dc_creator
            else:
                author = ''

            clean_content = self._clean_content(raw_content)
            return {
                'title': title,
                'content': clean_content,
                'summary': getattr(entry, 'summary', ''),
                'author': author,
                'original_url': link,
                'published_date': published_date,
                'language': source.get('language', 'en'),
                'keywords': self._extract_keywords(title, clean_content)
            }
        except Exception as e:
            self.logger.error(f"解析RSS条目时出错: {e}")
            return None

    def _clean_content(self, content: str) -> str:
        """Strip HTML markup, decode entities and collapse whitespace.

        Block-level tags (``<p>``, ``<br>``, ``<li>`` ...) become a space so
        adjacent paragraphs stay separate words; all other tags are removed.

        Args:
            content: Raw entry body.

        Returns:
            The plain text, ``''`` for empty input, or ``content`` unchanged
            when it is not a string.
        """
        if not content:
            return ''
        try:
            text = self._BLOCK_TAG_RE.sub(' ', content)
            text = self._TAG_RE.sub('', text)
            # Entities are decoded after tag removal so that escaped markup
            # such as "&lt;b&gt;" survives as literal text.
            text = unescape(text)
            return self._WHITESPACE_RE.sub(' ', text).strip()
        except TypeError:
            # Best effort: a non-string value is handed back untouched.
            return content

    def _extract_keywords(self, title: str, content: str) -> List[str]:
        """Return up to ten of the most frequent non-stop-words.

        Words are runs of three or more ASCII letters taken from the
        lower-cased title and content.

        Returns:
            Keywords ordered by descending frequency, or an empty list when
            extraction raised.
        """
        try:
            text = f"{title} {content}".lower()
            counts = Counter(
                word
                for word in self._WORD_RE.findall(text)
                if word not in self._STOP_WORDS
            )
            return [word for word, _ in counts.most_common(10)]
        except Exception as e:
            self.logger.error(f"提取关键词时出错: {e}")
            return []

    def manual_check_source(self, source_id: int) -> Dict:
        """Check a single source on demand.

        Args:
            source_id: ``id`` of the source to check.

        Returns:
            A dict with ``success`` and ``message``, plus ``new_articles``
            when the check succeeded.
        """
        sources = self.db.get_rss_sources()
        source = next((s for s in sources if s['id'] == source_id), None)
        if not source:
            return {'success': False, 'message': 'RSS源不存在'}
        try:
            articles_count = self._check_single_source(source)
        except Exception as e:
            return {
                'success': False,
                'message': f'检查失败: {str(e)}'
            }
        if articles_count is None:
            return {
                'success': False,
                'message': f'检查 {source["source_name"]} 失败'
            }
        return {
            'success': True,
            'message': f'成功检查 {source["source_name"]}',
            'new_articles': articles_count
        }

    def get_monitor_status(self) -> Dict:
        """Return the running flag, the interval and counters from the database."""
        stats = self.db.get_statistics()
        return {
            'is_running': self.is_running,
            'check_interval': self.check_interval,
            'total_sources': stats.get('active_sources', 0),
            'total_articles': stats.get('total_articles', 0),
            'today_articles': stats.get('today_articles', 0)
        }
def start_rss_monitor():
    """Entry point: set up logging, run the monitor, and clean up on exit.

    Logging goes both to the file named in ``LOGGING_CONFIG['file']`` and to
    the console. The monitor runs until its loop ends or Ctrl-C is pressed;
    either way the monitor is stopped and its database handle is closed.
    """
    import logging.config
    from config import LOGGING_CONFIG

    # Set up logging before the monitor exists so its start-up is recorded.
    log_level = LOGGING_CONFIG['level']
    log_format = LOGGING_CONFIG['format']
    log_handlers = [
        logging.FileHandler(LOGGING_CONFIG['file'], encoding='utf-8'),
        logging.StreamHandler(),
    ]
    logging.basicConfig(level=log_level, format=log_format, handlers=log_handlers)

    monitor = RSSMonitor()
    try:
        monitor.start_monitoring()
    except KeyboardInterrupt:
        print("\n收到停止信号正在关闭RSS监控器...")
    finally:
        # Runs on normal exit and on any exception raised by the loop.
        monitor.stop_monitoring()
        monitor.db.close()
        print("RSS监控器已停止")
# Start the monitor only when this file is executed as a script, not on import.
if __name__ == "__main__":
    start_rss_monitor()