import xml.etree.ElementTree as ET
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
from typing import List, Optional

import aiohttp

from src.crawlers.base import ICrawler
from src.crawlers.dto import NewsItemDTO


class RSSCrawler(ICrawler):
    """Crawler that downloads an RSS document and converts its items to DTOs.

    Every ``<item>`` element found anywhere in the document becomes one
    ``NewsItemDTO`` tagged with the ``source`` label given at construction.
    """

    def __init__(self, url: str, source: str):
        """Store the feed location and the label attached to produced items.

        Args:
            url: Address of the RSS document to download.
            source: Value copied into the ``source`` field of every item.
        """
        self.url = url
        self.source = source

    async def fetch_latest(self) -> List[NewsItemDTO]:
        """Download the feed at ``self.url`` and return its parsed items.

        Returns:
            One ``NewsItemDTO`` per ``<item>`` element, in document order.

        Raises:
            aiohttp.ClientResponseError: If the server answers with an error
                status (raised by ``response.raise_for_status()``).
            xml.etree.ElementTree.ParseError: If the body is not well-formed
                XML.
        """
        async with aiohttp.ClientSession() as session:
            async with session.get(self.url) as response:
                response.raise_for_status()
                xml_data = await response.text()
        # Parsing happens after the connection is released; it needs no I/O.
        return self._parse_xml(xml_data)

    def _parse_xml(self, xml_data: str) -> List[NewsItemDTO]:
        """Convert an RSS document into a list of ``NewsItemDTO`` objects.

        Missing ``title``, ``link`` or ``description`` elements become empty
        strings. The timestamp is always timezone-aware; see
        ``_parse_pub_date`` for the fallback rules.

        Args:
            xml_data: The complete XML text of the feed.

        Returns:
            One ``NewsItemDTO`` per ``<item>`` element, in document order.

        Raises:
            xml.etree.ElementTree.ParseError: If ``xml_data`` is not
                well-formed XML.
        """
        root = ET.fromstring(xml_data)
        return [
            NewsItemDTO(
                title=item.findtext('title') or "",
                url=item.findtext('link') or "",
                content_text=item.findtext('description') or "",
                source=self.source,
                timestamp=self._parse_pub_date(item.findtext('pubDate')),
            )
            for item in root.findall('.//item')
        ]

    @staticmethod
    def _parse_pub_date(pub_date_str: Optional[str]) -> datetime:
        """Parse an RFC 2822 ``pubDate`` value into an aware datetime.

        Falls back to the current UTC time when the value is missing, empty
        or cannot be parsed, so a single bad date never drops the item.

        Args:
            pub_date_str: Text of the ``<pubDate>`` element, or ``None``.

        Returns:
            A timezone-aware ``datetime``.
        """
        if not pub_date_str:
            return datetime.now(timezone.utc)
        try:
            parsed = parsedate_to_datetime(pub_date_str)
        except (TypeError, ValueError, IndexError, OverflowError):
            # Malformed dates raise ValueError on Python 3.10+ and TypeError
            # on older versions; the other two guard against odd field
            # counts and absurdly large numbers in untrusted feeds.
            return datetime.now(timezone.utc)
        if parsed.tzinfo is None:
            # parsedate_to_datetime returns a naive value for a "-0000" zone
            # (or no zone at all). Tag it as UTC so every timestamp this
            # crawler emits is aware and mutually comparable.
            parsed = parsed.replace(tzinfo=timezone.utc)
        return parsed