- Implement crawlers for Microsoft Research, SciRate, and Google Scholar - Use Playwright with stealth for Google Scholar anti-bot mitigation - Update CrawlerFactory to support new research crawler types - Add unit and integration tests for all academic sources with high coverage
66 lines
2.4 KiB
Python
66 lines
2.4 KiB
Python
import aiohttp
|
|
from datetime import datetime, timezone
|
|
from typing import List
|
|
from bs4 import BeautifulSoup
|
|
from urllib.parse import urljoin
|
|
|
|
from .base import ICrawler
|
|
from .dto import NewsItemDTO
|
|
|
|
class SciRateCrawler(ICrawler):
    """Crawler for the SciRate (https://scirate.com/) front-page paper list.

    Downloads the landing page over HTTP and scrapes each paper entry
    (title, authors, abstract) into a ``NewsItemDTO``. The crawler is
    deliberately best-effort: any network or HTTP failure degrades to an
    empty result rather than raising, so one broken source cannot take
    down a whole aggregation run.
    """

    def __init__(
        self,
        url: str = "https://scirate.com/",
        source: str = "SciRate",
        timeout: float = 30.0,
    ):
        """
        Args:
            url: Page to crawl; also used as base for relative paper links.
            source: Source label stamped onto every emitted item.
            timeout: Total request timeout in seconds. Previously this was
                hard-coded to 30; it is now configurable (default unchanged).
        """
        self.url = url
        self.source = source
        self.timeout = timeout

    async def fetch_latest(self) -> List[NewsItemDTO]:
        """Fetch the SciRate front page and return the parsed paper items.

        Returns:
            Parsed items, or an empty list on non-200 status, network
            error, timeout, or parse failure (best-effort by design).
        """
        # Browser-like User-Agent: SciRate serves a different/blocked page
        # to obvious bot agents.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        async with aiohttp.ClientSession(headers=headers) as session:
            try:
                async with session.get(
                    self.url,
                    timeout=aiohttp.ClientTimeout(total=self.timeout),
                ) as response:
                    if response.status != 200:
                        return []
                    html = await response.text()
                    return self.parse_html(html)
            except Exception:
                # Intentional broad catch: DNS errors, timeouts, TLS
                # failures, bad encodings and parse errors all collapse to
                # "no items this round" to keep the crawler best-effort.
                return []

    def parse_html(self, html: str) -> List[NewsItemDTO]:
        """Extract paper entries from a SciRate HTML page.

        Args:
            html: Raw HTML of the SciRate listing page.

        Returns:
            One ``NewsItemDTO`` per paper that has a title link; entries
            without a ``.title a`` element are skipped. Authors and
            abstract are optional and default to empty strings.
        """
        soup = BeautifulSoup(html, "html.parser")
        items = []

        # SciRate papers are typically in li.paper-list-item or div.paper
        papers = soup.select("li.paper-list-item, div.paper")

        for paper in papers:
            title_el = paper.select_one(".title a")
            if not title_el:
                # No title link -> not a usable paper entry.
                continue

            title = title_el.get_text(strip=True)
            link = title_el.get("href", "")
            # BeautifulSoup may return a list for multi-valued attributes;
            # normalize to a single string.
            if isinstance(link, list):
                link = link[0] if link else ""

            # Resolve site-relative paper links against the crawl URL.
            if link and link.startswith("/"):
                link = urljoin(self.url, link)

            authors_el = paper.select_one(".authors")
            authors = authors_el.get_text(strip=True) if authors_el else ""

            abstract_el = paper.select_one(".abstract")
            abstract = abstract_el.get_text(strip=True) if abstract_el else ""

            content_text = f"Authors: {authors}\n\n{abstract}"

            items.append(NewsItemDTO(
                title=title,
                url=link or self.url,  # fall back to the listing page URL
                content_text=content_text.strip(),
                source=self.source,
                timestamp=datetime.now(timezone.utc),  # tz-aware crawl time
            ))

        return items
|