- Implement crawlers for Microsoft Research, SciRate, and Google Scholar - Use Playwright with stealth for Google Scholar anti-bot mitigation - Update CrawlerFactory to support new research crawler types - Add unit and integration tests for all academic sources with high coverage
66 lines
2.4 KiB
Python
66 lines
2.4 KiB
Python
import aiohttp
|
|
from datetime import datetime, timezone
|
|
from typing import List
|
|
from bs4 import BeautifulSoup
|
|
from urllib.parse import urljoin
|
|
|
|
from .base import ICrawler
|
|
from .dto import NewsItemDTO
|
|
|
|
class SciRateCrawler(ICrawler):
    """Crawler for the SciRate (https://scirate.com/) front-page paper list.

    Downloads the landing page over HTTP and scrapes each paper entry
    (title, authors, abstract) into a ``NewsItemDTO``. The crawler is
    deliberately best-effort: any network or HTTP failure degrades to an
    empty result rather than raising, so one broken source cannot take
    down a whole aggregation run.
    """

    def __init__(
        self,
        url: str = "https://scirate.com/",
        source: str = "SciRate",
        timeout: float = 30.0,
    ):
        """
        Args:
            url: Page to crawl; also used as base for relative paper links.
            source: Source label stamped onto every emitted item.
            timeout: Total request timeout in seconds. Previously this was
                hard-coded to 30; it is now configurable (default unchanged).
        """
        self.url = url
        self.source = source
        self.timeout = timeout

    async def fetch_latest(self) -> List[NewsItemDTO]:
        """Fetch the SciRate front page and return the parsed paper items.

        Returns:
            Parsed items, or an empty list on non-200 status, network
            error, timeout, or parse failure (best-effort by design).
        """
        # Browser-like User-Agent: SciRate serves a different/blocked page
        # to obvious bot agents.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        async with aiohttp.ClientSession(headers=headers) as session:
            try:
                async with session.get(
                    self.url,
                    timeout=aiohttp.ClientTimeout(total=self.timeout),
                ) as response:
                    if response.status != 200:
                        return []
                    html = await response.text()
                    return self.parse_html(html)
            except Exception:
                # Intentional broad catch: DNS errors, timeouts, TLS
                # failures, bad encodings and parse errors all collapse to
                # "no items this round" to keep the crawler best-effort.
                return []

    def parse_html(self, html: str) -> List[NewsItemDTO]:
        """Extract paper entries from a SciRate HTML page.

        Args:
            html: Raw HTML of the SciRate listing page.

        Returns:
            One ``NewsItemDTO`` per paper that has a title link; entries
            without a ``.title a`` element are skipped. Authors and
            abstract are optional and default to empty strings.
        """
        soup = BeautifulSoup(html, "html.parser")
        items = []

        # SciRate papers are typically in li.paper-list-item or div.paper
        papers = soup.select("li.paper-list-item, div.paper")

        for paper in papers:
            title_el = paper.select_one(".title a")
            if not title_el:
                # No title link -> not a usable paper entry.
                continue

            title = title_el.get_text(strip=True)
            link = title_el.get("href", "")
            # BeautifulSoup may return a list for multi-valued attributes;
            # normalize to a single string.
            if isinstance(link, list):
                link = link[0] if link else ""

            # Resolve site-relative paper links against the crawl URL.
            if link and link.startswith("/"):
                link = urljoin(self.url, link)

            authors_el = paper.select_one(".authors")
            authors = authors_el.get_text(strip=True) if authors_el else ""

            abstract_el = paper.select_one(".abstract")
            abstract = abstract_el.get_text(strip=True) if abstract_el else ""

            content_text = f"Authors: {authors}\n\n{abstract}"

            items.append(NewsItemDTO(
                title=title,
                url=link or self.url,  # fall back to the listing page URL
                content_text=content_text.strip(),
                source=self.source,
                timestamp=datetime.now(timezone.utc),  # tz-aware crawl time
            ))

        return items
|