324 lines
12 KiB
Python
324 lines
12 KiB
Python
# -*- coding: utf-8 -*-
|
||
"""
|
||
RSS监控脚本 - 自动获取RSS源更新
|
||
"""
|
||
|
||
import feedparser
|
||
import requests
|
||
import time
|
||
import logging
|
||
import threading
|
||
from datetime import datetime, timezone
|
||
from typing import List, Dict, Optional
|
||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||
|
||
from database import DatabaseManager
|
||
from config import RSS_MONITOR_CONFIG, SEARCH_CONFIG
|
||
|
||
class RSSMonitor:
|
||
"""RSS监控器"""
|
||
|
||
def __init__(self):
|
||
self.db = DatabaseManager()
|
||
self.logger = logging.getLogger(__name__)
|
||
self.is_running = False
|
||
self.check_interval = RSS_MONITOR_CONFIG['check_interval']
|
||
self.max_retries = RSS_MONITOR_CONFIG['max_retries']
|
||
self.timeout = RSS_MONITOR_CONFIG['timeout']
|
||
self.user_agent = RSS_MONITOR_CONFIG['user_agent']
|
||
|
||
def start_monitoring(self):
|
||
"""开始监控RSS源"""
|
||
self.is_running = True
|
||
self.logger.info("RSS监控器启动")
|
||
|
||
while self.is_running:
|
||
try:
|
||
self._check_all_sources()
|
||
self.logger.info(f"等待 {self.check_interval} 秒后进行下次检查")
|
||
time.sleep(self.check_interval)
|
||
except KeyboardInterrupt:
|
||
self.logger.info("收到停止信号")
|
||
break
|
||
except Exception as e:
|
||
self.logger.error(f"监控过程出错: {e}")
|
||
time.sleep(60) # 出错后等待1分钟再继续
|
||
|
||
def stop_monitoring(self):
|
||
"""停止监控"""
|
||
self.is_running = False
|
||
self.logger.info("RSS监控器停止")
|
||
|
||
def _check_all_sources(self):
|
||
"""检查所有RSS源"""
|
||
sources = self.db.get_rss_sources()
|
||
self.logger.info(f"开始检查 {len(sources)} 个RSS源")
|
||
|
||
# 使用线程池并行处理
|
||
with ThreadPoolExecutor(max_workers=10) as executor:
|
||
futures = {
|
||
executor.submit(self._check_single_source, source): source
|
||
for source in sources
|
||
}
|
||
|
||
success_count = 0
|
||
error_count = 0
|
||
|
||
for future in as_completed(futures):
|
||
source = futures[future]
|
||
try:
|
||
articles_count = future.result()
|
||
if articles_count is not None:
|
||
success_count += 1
|
||
if articles_count > 0:
|
||
self.logger.info(
|
||
f"{source['source_name']}: 新增 {articles_count} 篇文章"
|
||
)
|
||
else:
|
||
error_count += 1
|
||
except Exception as e:
|
||
error_count += 1
|
||
self.logger.error(f"检查 {source['source_name']} 时出错: {e}")
|
||
|
||
self.logger.info(f"RSS检查完成: 成功 {success_count}, 失败 {error_count}")
|
||
|
||
def _check_single_source(self, source: Dict) -> Optional[int]:
|
||
"""检查单个RSS源"""
|
||
source_id = source['id']
|
||
source_name = source['source_name']
|
||
source_url = source['source_url']
|
||
|
||
try:
|
||
# 获取RSS内容
|
||
articles = self._fetch_rss_articles(source_url, source)
|
||
|
||
if articles is None:
|
||
return None
|
||
|
||
# 保存新文章
|
||
new_articles_count = 0
|
||
for article in articles:
|
||
article['source_id'] = source_id
|
||
article_id = self.db.save_article(article)
|
||
if article_id:
|
||
new_articles_count += 1
|
||
|
||
# 更新RSS源检查时间
|
||
self.db.update_rss_source_check_time(source_id)
|
||
|
||
return new_articles_count
|
||
|
||
except Exception as e:
|
||
self.logger.error(f"检查RSS源 {source_name} 失败: {e}")
|
||
return None
|
||
|
||
def _fetch_rss_articles(self, url: str, source: Dict) -> Optional[List[Dict]]:
|
||
"""获取RSS文章"""
|
||
headers = {
|
||
'User-Agent': self.user_agent,
|
||
'Accept': 'application/rss+xml, application/xml, text/xml'
|
||
}
|
||
|
||
for attempt in range(self.max_retries):
|
||
try:
|
||
# 获取RSS内容
|
||
response = requests.get(url, headers=headers, timeout=self.timeout)
|
||
response.raise_for_status()
|
||
|
||
# 解析RSS
|
||
feed = feedparser.parse(response.content)
|
||
|
||
if feed.bozo and feed.bozo_exception:
|
||
self.logger.warning(
|
||
f"RSS解析警告 {source['source_name']}: {feed.bozo_exception}"
|
||
)
|
||
|
||
articles = []
|
||
for entry in feed.entries:
|
||
article = self._parse_rss_entry(entry, source)
|
||
if article:
|
||
articles.append(article)
|
||
|
||
return articles
|
||
|
||
except requests.RequestException as e:
|
||
self.logger.warning(
|
||
f"第 {attempt + 1} 次尝试获取 {source['source_name']} 失败: {e}"
|
||
)
|
||
if attempt < self.max_retries - 1:
|
||
time.sleep(2 ** attempt) # 指数退避
|
||
except Exception as e:
|
||
self.logger.error(f"解析RSS {source['source_name']} 时出错: {e}")
|
||
break
|
||
|
||
return None
|
||
|
||
def _parse_rss_entry(self, entry, source: Dict) -> Optional[Dict]:
|
||
"""解析RSS条目"""
|
||
try:
|
||
# 获取发布时间
|
||
published_date = None
|
||
if hasattr(entry, 'published_parsed') and entry.published_parsed:
|
||
published_date = datetime(*entry.published_parsed[:6], tzinfo=timezone.utc)
|
||
elif hasattr(entry, 'updated_parsed') and entry.updated_parsed:
|
||
published_date = datetime(*entry.updated_parsed[:6], tzinfo=timezone.utc)
|
||
|
||
# 获取内容
|
||
content = ''
|
||
if hasattr(entry, 'content') and entry.content:
|
||
content = entry.content[0].value if isinstance(entry.content, list) else entry.content
|
||
elif hasattr(entry, 'summary'):
|
||
content = entry.summary
|
||
elif hasattr(entry, 'description'):
|
||
content = entry.description
|
||
|
||
# 获取作者
|
||
author = ''
|
||
if hasattr(entry, 'author'):
|
||
author = entry.author
|
||
elif hasattr(entry, 'dc_creator'):
|
||
author = entry.dc_creator
|
||
|
||
# 提取关键词
|
||
keywords = self._extract_keywords(entry.title, content)
|
||
|
||
article = {
|
||
'title': entry.title if hasattr(entry, 'title') else '',
|
||
'content': self._clean_content(content),
|
||
'summary': entry.summary if hasattr(entry, 'summary') else '',
|
||
'author': author,
|
||
'original_url': entry.link if hasattr(entry, 'link') else '',
|
||
'published_date': published_date,
|
||
'language': source.get('language', 'en'),
|
||
'keywords': keywords
|
||
}
|
||
|
||
# 验证必要字段
|
||
if not article['title'] or not article['original_url']:
|
||
return None
|
||
|
||
return article
|
||
|
||
except Exception as e:
|
||
self.logger.error(f"解析RSS条目时出错: {e}")
|
||
return None
|
||
|
||
def _clean_content(self, content: str) -> str:
|
||
"""清理HTML内容"""
|
||
if not content:
|
||
return ''
|
||
|
||
try:
|
||
import re
|
||
from html import unescape
|
||
|
||
# 移除HTML标签
|
||
content = re.sub(r'<[^>]+>', '', content)
|
||
# 解码HTML实体
|
||
content = unescape(content)
|
||
# 移除多余空白
|
||
content = re.sub(r'\s+', ' ', content).strip()
|
||
|
||
return content
|
||
except:
|
||
return content
|
||
|
||
def _extract_keywords(self, title: str, content: str) -> List[str]:
|
||
"""提取关键词"""
|
||
try:
|
||
text = f"{title} {content}".lower()
|
||
|
||
# 简单关键词提取(可以用更高级的NLP库)
|
||
import re
|
||
words = re.findall(r'\b[a-zA-Z]{3,}\b', text)
|
||
|
||
# 过滤常见停用词
|
||
stop_words = {
|
||
'the', 'and', 'for', 'are', 'but', 'not', 'you', 'all', 'can', 'had',
|
||
'her', 'was', 'one', 'our', 'out', 'day', 'get', 'has', 'him', 'his',
|
||
'how', 'its', 'may', 'new', 'now', 'old', 'see', 'two', 'who', 'boy',
|
||
'this', 'that', 'with', 'have', 'will', 'from', 'they', 'been',
|
||
'said', 'each', 'make', 'most', 'over', 'some', 'time', 'very',
|
||
'what', 'when', 'here', 'just', 'like', 'long', 'many', 'than',
|
||
'them', 'well', 'your', 'come', 'could', 'into', 'more', 'much',
|
||
'only', 'other', 'such', 'take', 'than', 'them', 'well', 'were'
|
||
}
|
||
|
||
keywords = [word for word in words if word not in stop_words]
|
||
|
||
# 统计词频并返回前10个
|
||
from collections import Counter
|
||
word_counts = Counter(keywords)
|
||
return [word for word, count in word_counts.most_common(10)]
|
||
|
||
except Exception as e:
|
||
self.logger.error(f"提取关键词时出错: {e}")
|
||
return []
|
||
|
||
def manual_check_source(self, source_id: int) -> Dict:
|
||
"""手动检查指定RSS源"""
|
||
sources = self.db.get_rss_sources()
|
||
source = next((s for s in sources if s['id'] == source_id), None)
|
||
|
||
if not source:
|
||
return {'success': False, 'message': 'RSS源不存在'}
|
||
|
||
try:
|
||
articles_count = self._check_single_source(source)
|
||
if articles_count is not None:
|
||
return {
|
||
'success': True,
|
||
'message': f'成功检查 {source["source_name"]}',
|
||
'new_articles': articles_count
|
||
}
|
||
else:
|
||
return {
|
||
'success': False,
|
||
'message': f'检查 {source["source_name"]} 失败'
|
||
}
|
||
except Exception as e:
|
||
return {
|
||
'success': False,
|
||
'message': f'检查失败: {str(e)}'
|
||
}
|
||
|
||
def get_monitor_status(self) -> Dict:
|
||
"""获取监控状态"""
|
||
stats = self.db.get_statistics()
|
||
|
||
return {
|
||
'is_running': self.is_running,
|
||
'check_interval': self.check_interval,
|
||
'total_sources': stats.get('active_sources', 0),
|
||
'total_articles': stats.get('total_articles', 0),
|
||
'today_articles': stats.get('today_articles', 0)
|
||
}
|
||
|
||
def start_rss_monitor():
|
||
"""启动RSS监控器的主函数"""
|
||
import logging.config
|
||
from config import LOGGING_CONFIG
|
||
|
||
# 配置日志
|
||
logging.basicConfig(
|
||
level=LOGGING_CONFIG['level'],
|
||
format=LOGGING_CONFIG['format'],
|
||
handlers=[
|
||
logging.FileHandler(LOGGING_CONFIG['file'], encoding='utf-8'),
|
||
logging.StreamHandler()
|
||
]
|
||
)
|
||
|
||
monitor = RSSMonitor()
|
||
|
||
try:
|
||
monitor.start_monitoring()
|
||
except KeyboardInterrupt:
|
||
print("\n收到停止信号,正在关闭RSS监控器...")
|
||
finally:
|
||
monitor.stop_monitoring()
|
||
monitor.db.close()
|
||
print("RSS监控器已停止")
|
||
|
||
if __name__ == "__main__":
|
||
start_rss_monitor() |