# AI-Trend-Scout/src/crawlers/rss_crawler.py
import xml.etree.ElementTree as ET
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
from typing import List

import aiohttp

from src.crawlers.base import ICrawler
from src.crawlers.dto import NewsItemDTO
class RSSCrawler(ICrawler):
    """Crawler that fetches an RSS 2.0 feed over HTTP and maps its
    ``<item>`` elements to :class:`NewsItemDTO` instances.

    Attributes:
        url: Feed URL to fetch.
        source: Human-readable source label stamped onto every DTO.
    """

    def __init__(self, url: str, source: str):
        self.url = url
        self.source = source

    async def fetch_latest(self) -> List[NewsItemDTO]:
        """Download the feed and return its parsed items.

        Returns:
            All ``<item>`` entries found in the feed, in document order.

        Raises:
            aiohttp.ClientResponseError: On a non-2xx HTTP status.
            xml.etree.ElementTree.ParseError: If the response body is not
                well-formed XML.
        """
        async with aiohttp.ClientSession() as session:
            async with session.get(self.url) as response:
                response.raise_for_status()
                xml_data = await response.text()
        # Parse outside the context managers so the connection is released
        # before the CPU-bound XML work starts.
        return self._parse_xml(xml_data)

    def _parse_xml(self, xml_data: str) -> List[NewsItemDTO]:
        """Parse an RSS XML document into DTOs.

        Missing ``title``/``link``/``description`` elements become empty
        strings; a missing or unparseable ``pubDate`` falls back to the
        current UTC time.
        """
        root = ET.fromstring(xml_data)
        items: List[NewsItemDTO] = []
        for item in root.findall('.//item'):
            title = item.findtext('title') or ""
            link = item.findtext('link') or ""
            description = item.findtext('description') or ""
            items.append(
                NewsItemDTO(
                    title=title,
                    url=link,
                    content_text=description,
                    source=self.source,
                    timestamp=self._parse_pub_date(item.findtext('pubDate')),
                )
            )
        return items

    @staticmethod
    def _parse_pub_date(pub_date_str) -> datetime:
        """Parse an RFC 2822 ``pubDate`` string, falling back to now (UTC).

        The fallback is timezone-aware: ``parsedate_to_datetime`` yields
        aware datetimes for typical pubDates (which carry a zone offset),
        and mixing naive and aware timestamps would make the resulting
        DTOs uncomparable/unsortable downstream.
        """
        if pub_date_str:
            try:
                return parsedate_to_datetime(pub_date_str)
            except (TypeError, ValueError):
                # parsedate_to_datetime raises ValueError on malformed
                # input (TypeError on some older Python versions).
                pass
        return datetime.now(timezone.utc)