import aiohttp
from datetime import datetime, timezone
from typing import List

from bs4 import BeautifulSoup
from urllib.parse import urljoin

from .base import ICrawler
from .dto import NewsItemDTO


class SciRateCrawler(ICrawler):
    """Best-effort crawler for the SciRate front page (https://scirate.com/).

    Scrapes the listing of recently scited papers and converts each entry
    into a ``NewsItemDTO``. All network and parse failures are swallowed and
    reported as an empty batch, so callers never have to handle exceptions.
    """

    def __init__(self, url: str = "https://scirate.com/", source: str = "SciRate"):
        """
        Args:
            url: Page to scrape; also used as the base for resolving
                relative paper links.
            source: Label stamped onto every emitted ``NewsItemDTO``.
        """
        self.url = url
        self.source = source

    async def fetch_latest(self) -> List[NewsItemDTO]:
        """Download the SciRate page and return the parsed paper items.

        Returns:
            Parsed items, or ``[]`` on non-200 status, timeout, or any
            network/parse error (deliberate best-effort contract).
        """
        # A browser-like UA; the site may serve different/blocked content to
        # default client UAs.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        async with aiohttp.ClientSession(headers=headers) as session:
            try:
                async with session.get(self.url, timeout=aiohttp.ClientTimeout(total=30)) as response:
                    if response.status != 200:
                        return []
                    html = await response.text()
                    return self.parse_html(html)
            except Exception:
                # Broad catch is intentional: this crawler is best-effort and
                # must never propagate transport or parsing errors upward.
                return []

    def parse_html(self, html: str) -> List[NewsItemDTO]:
        """Parse a SciRate listing page into ``NewsItemDTO`` objects.

        Args:
            html: Raw HTML of the listing page.

        Returns:
            One item per recognizable paper entry; entries without a title
            link are skipped. The timestamp is the crawl time (UTC), since
            the listing itself is the only date signal available here.
        """
        soup = BeautifulSoup(html, "html.parser")
        items = []

        # Assumed SciRate markup: papers in li.paper-list-item or div.paper,
        # with .title a / .authors / .abstract children — TODO confirm
        # against the live page if parsing starts returning empty lists.
        papers = soup.select("li.paper-list-item, div.paper")

        for paper in papers:
            title_el = paper.select_one(".title a")
            if not title_el:
                continue

            title = title_el.get_text(strip=True)
            link = title_el.get("href", "")
            # bs4 Tag.get can return a list for multi-valued attributes;
            # normalize to a single string.
            if isinstance(link, list):
                link = link[0] if link else ""
            # Fix: resolve ALL relative hrefs, not just those starting with
            # "/". A scheme-less link like "arxiv/2101.00001" was previously
            # left unresolved and emitted as a broken relative URL. urljoin
            # leaves absolute http(s) URLs untouched, so previously-correct
            # cases are unchanged.
            if link:
                link = urljoin(self.url, link)

            authors_el = paper.select_one(".authors")
            authors = authors_el.get_text(strip=True) if authors_el else ""

            abstract_el = paper.select_one(".abstract")
            abstract = abstract_el.get_text(strip=True) if abstract_el else ""

            content_text = f"Authors: {authors}\n\n{abstract}"

            items.append(NewsItemDTO(
                title=title,
                url=link or self.url,
                content_text=content_text.strip(),
                source=self.source,
                timestamp=datetime.now(timezone.utc)
            ))

        return items