AI-Trend-Scout/src/crawlers/scirate_crawler.py
Artur Mukhamadiev a304ae9cd2 feat(crawler): add academic and research sources
- Implement crawlers for Microsoft Research, SciRate, and Google Scholar
- Use Playwright with stealth for Google Scholar anti-bot mitigation
- Update CrawlerFactory to support new research crawler types
- Add unit and integration tests for all academic sources with high coverage
2026-03-16 00:11:15 +03:00

66 lines
2.4 KiB
Python

import asyncio
from datetime import datetime, timezone
from typing import List
from urllib.parse import urljoin

import aiohttp
from bs4 import BeautifulSoup

from .base import ICrawler
from .dto import NewsItemDTO
class SciRateCrawler(ICrawler):
    """Crawler for the SciRate (https://scirate.com/) landing page.

    Fetches the front page over HTTP and scrapes the listed papers into
    ``NewsItemDTO`` objects. Network failures are treated as best-effort
    (empty result); parse failures are allowed to propagate because they
    indicate a bug or a site redesign that should surface, not be hidden.
    """

    def __init__(self, url: str = "https://scirate.com/", source: str = "SciRate"):
        """
        :param url: page to fetch; also used to resolve relative paper links.
        :param source: label stamped on every emitted item.
        """
        self.url = url
        self.source = source

    async def fetch_latest(self) -> List[NewsItemDTO]:
        """Download the SciRate front page and return parsed paper entries.

        Returns an empty list on network-level problems (timeouts, DNS or
        connection errors, non-200 responses). Unlike a bare
        ``except Exception``, bugs in the HTML parsing are NOT swallowed:
        parsing happens outside the ``try`` block.
        """
        headers = {
            # A desktop browser UA; some sites serve degraded or blocked
            # content to default aiohttp user agents.
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        try:
            async with aiohttp.ClientSession(headers=headers) as session:
                async with session.get(self.url, timeout=aiohttp.ClientTimeout(total=30)) as response:
                    if response.status != 200:
                        return []
                    html = await response.text()
        except (aiohttp.ClientError, asyncio.TimeoutError):
            # Transient network trouble is expected occasionally for a
            # best-effort crawler; degrade to an empty result.
            return []
        return self.parse_html(html)

    def parse_html(self, html: str) -> List[NewsItemDTO]:
        """Extract paper entries from SciRate front-page HTML.

        Each paper contributes one ``NewsItemDTO`` with the paper title,
        an absolute URL (relative hrefs are resolved against ``self.url``),
        and a content body of the form ``"Authors: <authors>\\n\\n<abstract>"``.
        Papers without a ``.title a`` element are skipped.
        """
        soup = BeautifulSoup(html, "html.parser")
        items: List[NewsItemDTO] = []
        seen_links = set()
        # SciRate papers are typically in li.paper-list-item or div.paper.
        # NOTE(review): these two selectors can both match nested elements
        # for the same paper, so entries with a resolvable link are deduped
        # by URL below.
        papers = soup.select("li.paper-list-item, div.paper")
        for paper in papers:
            title_el = paper.select_one(".title a")
            if not title_el:
                continue
            title = title_el.get_text(strip=True)
            link = title_el.get("href", "")
            # bs4 can return multi-valued attributes as a list.
            if isinstance(link, list):
                link = link[0] if link else ""
            if link and link.startswith("/"):
                link = urljoin(self.url, link)
            # Dedupe only when we have a real per-paper link; link-less
            # entries all fall back to self.url and must not collapse.
            if link:
                if link in seen_links:
                    continue
                seen_links.add(link)
            authors_el = paper.select_one(".authors")
            authors = authors_el.get_text(strip=True) if authors_el else ""
            abstract_el = paper.select_one(".abstract")
            abstract = abstract_el.get_text(strip=True) if abstract_el else ""
            content_text = f"Authors: {authors}\n\n{abstract}"
            items.append(NewsItemDTO(
                title=title,
                url=link or self.url,
                content_text=content_text.strip(),
                source=self.source,
                timestamp=datetime.now(timezone.utc)
            ))
        return items