# AI-Trend-Scout/src/crawlers/rss_crawler.py
import xml.etree.ElementTree as ET
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
from typing import List

import aiohttp

from src.crawlers.base import ICrawler
from src.crawlers.dto import NewsItemDTO
class RSSCrawler(ICrawler):
    """Crawler that fetches an RSS 2.0 feed over HTTP and maps its
    ``<item>`` elements to :class:`NewsItemDTO` instances.

    Attributes:
        url: Feed URL to fetch.
        source: Human-readable source label stamped onto every DTO.
    """

    def __init__(self, url: str, source: str):
        self.url = url
        self.source = source

    async def fetch_latest(self) -> List[NewsItemDTO]:
        """Download the feed and return its parsed items.

        Returns:
            All ``<item>`` entries found in the feed, in document order.

        Raises:
            aiohttp.ClientResponseError: On a non-2xx HTTP status.
            xml.etree.ElementTree.ParseError: If the response body is not
                well-formed XML.
        """
        async with aiohttp.ClientSession() as session:
            async with session.get(self.url) as response:
                response.raise_for_status()
                xml_data = await response.text()
        # Parse outside the context managers so the connection is released
        # before the CPU-bound XML work starts.
        return self._parse_xml(xml_data)

    def _parse_xml(self, xml_data: str) -> List[NewsItemDTO]:
        """Parse an RSS XML document into DTOs.

        Missing ``title``/``link``/``description`` elements become empty
        strings; a missing or unparseable ``pubDate`` falls back to the
        current UTC time.
        """
        root = ET.fromstring(xml_data)
        items: List[NewsItemDTO] = []
        for item in root.findall('.//item'):
            title = item.findtext('title') or ""
            link = item.findtext('link') or ""
            description = item.findtext('description') or ""
            items.append(
                NewsItemDTO(
                    title=title,
                    url=link,
                    content_text=description,
                    source=self.source,
                    timestamp=self._parse_pub_date(item.findtext('pubDate')),
                )
            )
        return items

    @staticmethod
    def _parse_pub_date(pub_date_str) -> datetime:
        """Parse an RFC 2822 ``pubDate`` string, falling back to now (UTC).

        The fallback is timezone-aware: ``parsedate_to_datetime`` yields
        aware datetimes for typical pubDates (which carry a zone offset),
        and mixing naive and aware timestamps would make the resulting
        DTOs uncomparable/unsortable downstream.
        """
        if pub_date_str:
            try:
                return parsedate_to_datetime(pub_date_str)
            except (TypeError, ValueError):
                # parsedate_to_datetime raises ValueError on malformed
                # input (TypeError on some older Python versions).
                pass
        return datetime.now(timezone.utc)