20250715-66bfff96/代码实现/rss_monitor.py

# -*- coding: utf-8 -*-
"""
RSS监控脚本 - 自动获取RSS源更新
"""

import feedparser
import requests
import time
import logging
import threading
from datetime import datetime, timezone
from typing import List, Dict, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed

from database import DatabaseManager
from config import RSS_MONITOR_CONFIG, SEARCH_CONFIG

class RSSMonitor:
    """RSS监控器"""

    def __init__(self):
        self.db = DatabaseManager()
        self.logger = logging.getLogger(__name__)
        self.is_running = False
        self.check_interval = RSS_MONITOR_CONFIG['check_interval']
        self.max_retries = RSS_MONITOR_CONFIG['max_retries']
        self.timeout = RSS_MONITOR_CONFIG['timeout']
        self.user_agent = RSS_MONITOR_CONFIG['user_agent']

    def start_monitoring(self):
        """开始监控RSS源"""
        self.is_running = True
        self.logger.info("RSS监控器启动")

        while self.is_running:
            try:
                self._check_all_sources()
                self.logger.info(f"等待 {self.check_interval} 秒后进行下次检查")
                time.sleep(self.check_interval)
            except KeyboardInterrupt:
                self.logger.info("收到停止信号")
                break
            except Exception as e:
                self.logger.error(f"监控过程出错: {e}")
                time.sleep(60)  # 出错后等待1分钟再继续

    def stop_monitoring(self):
        """停止监控"""
        self.is_running = False
        self.logger.info("RSS监控器停止")

    def _check_all_sources(self):
        """检查所有RSS源"""
        sources = self.db.get_rss_sources()
        self.logger.info(f"开始检查 {len(sources)} 个RSS源")

        # 使用线程池并行处理
        with ThreadPoolExecutor(max_workers=10) as executor:
            futures = {
                executor.submit(self._check_single_source, source): source
                for source in sources
            }

            success_count = 0
            error_count = 0

            for future in as_completed(futures):
                source = futures[future]
                try:
                    articles_count = future.result()
                    if articles_count is not None:
                        success_count += 1
                        if articles_count > 0:
                            self.logger.info(
                                f"{source['source_name']}: 新增 {articles_count} 篇文章"
                            )
                    else:
                        error_count += 1
                except Exception as e:
                    error_count += 1
                    self.logger.error(f"检查 {source['source_name']} 时出错: {e}")

        self.logger.info(f"RSS检查完成: 成功 {success_count}, 失败 {error_count}")

    def _check_single_source(self, source: Dict) -> Optional[int]:
        """检查单个RSS源"""
        source_id = source['id']
        source_name = source['source_name']
        source_url = source['source_url']

        try:
            # 获取RSS内容
            articles = self._fetch_rss_articles(source_url, source)

            if articles is None:
                return None

            # 保存新文章
            new_articles_count = 0
            for article in articles:
                article['source_id'] = source_id
                article_id = self.db.save_article(article)
                if article_id:
                    new_articles_count += 1

            # 更新RSS源检查时间
            self.db.update_rss_source_check_time(source_id)

            return new_articles_count

        except Exception as e:
            self.logger.error(f"检查RSS源 {source_name} 失败: {e}")
            return None

    def _fetch_rss_articles(self, url: str, source: Dict) -> Optional[List[Dict]]:
        """获取RSS文章"""
        headers = {
            'User-Agent': self.user_agent,
            'Accept': 'application/rss+xml, application/xml, text/xml'
        }

        for attempt in range(self.max_retries):
            try:
                # 获取RSS内容
                response = requests.get(url, headers=headers, timeout=self.timeout)
                response.raise_for_status()

                # 解析RSS
                feed = feedparser.parse(response.content)

                if feed.bozo and feed.bozo_exception:
                    self.logger.warning(
                        f"RSS解析警告 {source['source_name']}: {feed.bozo_exception}"
                    )

                articles = []
                for entry in feed.entries:
                    article = self._parse_rss_entry(entry, source)
                    if article:
                        articles.append(article)

                return articles

            except requests.RequestException as e:
                self.logger.warning(
                    f"第 {attempt + 1} 次尝试获取 {source['source_name']} 失败: {e}"
                )
                if attempt < self.max_retries - 1:
                    time.sleep(2 ** attempt)  # 指数退避
            except Exception as e:
                self.logger.error(f"解析RSS {source['source_name']} 时出错: {e}")
                break

        return None

    def _parse_rss_entry(self, entry, source: Dict) -> Optional[Dict]:
        """解析RSS条目"""
        try:
            # 获取发布时间
            published_date = None
            if hasattr(entry, 'published_parsed') and entry.published_parsed:
                published_date = datetime(*entry.published_parsed[:6], tzinfo=timezone.utc)
            elif hasattr(entry, 'updated_parsed') and entry.updated_parsed:
                published_date = datetime(*entry.updated_parsed[:6], tzinfo=timezone.utc)

            # 获取内容
            content = ''
            if hasattr(entry, 'content') and entry.content:
                content = entry.content[0].value if isinstance(entry.content, list) else entry.content
            elif hasattr(entry, 'summary'):
                content = entry.summary
            elif hasattr(entry, 'description'):
                content = entry.description

            # 获取作者
            author = ''
            if hasattr(entry, 'author'):
                author = entry.author
            elif hasattr(entry, 'dc_creator'):
                author = entry.dc_creator

            # 提取关键词
            keywords = self._extract_keywords(entry.title, content)

            article = {
                'title': entry.title if hasattr(entry, 'title') else '',
                'content': self._clean_content(content),
                'summary': entry.summary if hasattr(entry, 'summary') else '',
                'author': author,
                'original_url': entry.link if hasattr(entry, 'link') else '',
                'published_date': published_date,
                'language': source.get('language', 'en'),
                'keywords': keywords
            }

            # 验证必要字段
            if not article['title'] or not article['original_url']:
                return None

            return article

        except Exception as e:
            self.logger.error(f"解析RSS条目时出错: {e}")
            return None

    def _clean_content(self, content: str) -> str:
        """清理HTML内容"""
        if not content:
            return ''

        try:
            import re
            from html import unescape

            # 移除HTML标签
            content = re.sub(r'<[^>]+>', '', content)
            # 解码HTML实体
            content = unescape(content)
            # 移除多余空白
            content = re.sub(r'\s+', ' ', content).strip()

            return content
        except:
            return content

    def _extract_keywords(self, title: str, content: str) -> List[str]:
        """提取关键词"""
        try:
            text = f"{title} {content}".lower()

            # 简单关键词提取（可以用更高级的NLP库）
            import re
            words = re.findall(r'\b[a-zA-Z]{3,}\b', text)

            # 过滤常见停用词
            stop_words = {
                'the', 'and', 'for', 'are', 'but', 'not', 'you', 'all', 'can', 'had',
                'her', 'was', 'one', 'our', 'out', 'day', 'get', 'has', 'him', 'his',
                'how', 'its', 'may', 'new', 'now', 'old', 'see', 'two', 'who', 'boy',
                'this', 'that', 'with', 'have', 'will', 'from', 'they', 'been',
                'said', 'each', 'make', 'most', 'over', 'some', 'time', 'very',
                'what', 'when', 'here', 'just', 'like', 'long', 'many', 'than',
                'them', 'well', 'your', 'come', 'could', 'into', 'more', 'much',
                'only', 'other', 'such', 'take', 'than', 'them', 'well', 'were'
            }

            keywords = [word for word in words if word not in stop_words]

            # 统计词频并返回前10个
            from collections import Counter
            word_counts = Counter(keywords)
            return [word for word, count in word_counts.most_common(10)]

        except Exception as e:
            self.logger.error(f"提取关键词时出错: {e}")
            return []

    def manual_check_source(self, source_id: int) -> Dict:
        """手动检查指定RSS源"""
        sources = self.db.get_rss_sources()
        source = next((s for s in sources if s['id'] == source_id), None)

        if not source:
            return {'success': False, 'message': 'RSS源不存在'}

        try:
            articles_count = self._check_single_source(source)
            if articles_count is not None:
                return {
                    'success': True,
                    'message': f'成功检查 {source["source_name"]}',
                    'new_articles': articles_count
                }
            else:
                return {
                    'success': False,
                    'message': f'检查 {source["source_name"]} 失败'
                }
        except Exception as e:
            return {
                'success': False,
                'message': f'检查失败: {str(e)}'
            }

    def get_monitor_status(self) -> Dict:
        """获取监控状态"""
        stats = self.db.get_statistics()

        return {
            'is_running': self.is_running,
            'check_interval': self.check_interval,
            'total_sources': stats.get('active_sources', 0),
            'total_articles': stats.get('total_articles', 0),
            'today_articles': stats.get('today_articles', 0)
        }

def start_rss_monitor():
    """启动RSS监控器的主函数"""
    import logging.config
    from config import LOGGING_CONFIG

    # 配置日志
    logging.basicConfig(
        level=LOGGING_CONFIG['level'],
        format=LOGGING_CONFIG['format'],
        handlers=[
            logging.FileHandler(LOGGING_CONFIG['file'], encoding='utf-8'),
            logging.StreamHandler()
        ]
    )

    monitor = RSSMonitor()

    try:
        monitor.start_monitoring()
    except KeyboardInterrupt:
        print("\n收到停止信号，正在关闭RSS监控器...")
    finally:
        monitor.stop_monitoring()
        monitor.db.close()
        print("RSS监控器已停止")

if __name__ == "__main__":
    start_rss_monitor()