49 lines
1.6 KiB
Python
49 lines
1.6 KiB
Python
import xml.etree.ElementTree as ET
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
from typing import List

import aiohttp

from src.crawlers.base import ICrawler
from src.crawlers.dto import NewsItemDTO
class RSSCrawler(ICrawler):
    """Crawler that downloads an RSS feed and converts its entries to DTOs."""

    def __init__(self, url: str, source: str):
        """
        Args:
            url: Feed URL to fetch.
            source: Source label attached to every produced item.
        """
        self.url = url
        self.source = source

    async def fetch_latest(self) -> List[NewsItemDTO]:
        """Fetch the feed and return all of its items as DTOs.

        Raises:
            aiohttp.ClientResponseError: On a non-2xx HTTP status.
            xml.etree.ElementTree.ParseError: If the payload is not valid XML.
        """
        async with aiohttp.ClientSession() as session:
            async with session.get(self.url) as response:
                response.raise_for_status()
                xml_data = await response.text()
        # Parse outside the context managers so the connection is released
        # before CPU-bound XML work begins.
        return self._parse_xml(xml_data)

    def _parse_xml(self, xml_data: str) -> List[NewsItemDTO]:
        """Parse RSS XML into DTOs; missing text fields default to ""."""
        root = ET.fromstring(xml_data)
        return [
            NewsItemDTO(
                title=item.findtext('title') or "",
                url=item.findtext('link') or "",
                content_text=item.findtext('description') or "",
                source=self.source,
                timestamp=self._parse_pub_date(item.findtext('pubDate')),
            )
            for item in root.findall('.//item')
        ]

    @staticmethod
    def _parse_pub_date(pub_date_str) -> datetime:
        """Parse an RFC 2822 ``pubDate`` string.

        Falls back to the current UTC time when the value is missing or
        unparseable. The fallback is timezone-aware because
        ``parsedate_to_datetime`` returns aware datetimes for the usual
        RSS dates that carry a zone offset — mixing naive and aware values
        would make later timestamp comparisons raise ``TypeError``.
        """
        if pub_date_str:
            try:
                return parsedate_to_datetime(pub_date_str)
            # parsedate_to_datetime raises ValueError on malformed input
            # (TypeError kept defensively for non-string values).
            except (TypeError, ValueError):
                pass
        return datetime.now(timezone.utc)
|