feat(crawler): add academic and research sources
- Implement crawlers for Microsoft Research, SciRate, and Google Scholar - Use Playwright with stealth for Google Scholar anti-bot mitigation - Update CrawlerFactory to support new research crawler types - Add unit and integration tests for all academic sources with high coverage
This commit is contained in:
parent
65fccbc614
commit
a304ae9cd2
@ -93,4 +93,18 @@ crawlers:
|
|||||||
source: "Habr Code Quality"
|
source: "Habr Code Quality"
|
||||||
- type: rss
|
- type: rss
|
||||||
url: "https://habr.com/ru/rss/articles/rated100/?fl=ru"
|
url: "https://habr.com/ru/rss/articles/rated100/?fl=ru"
|
||||||
source: "Habr High Ranked"
|
source: "Habr High Ranked"
|
||||||
|
- type: rss
|
||||||
|
url: "https://www.microsoft.com/en-us/research/feed/"
|
||||||
|
source: "Microsoft Research"
|
||||||
|
- type: scirate
|
||||||
|
url: "https://scirate.com/"
|
||||||
|
source: "SciRate"
|
||||||
|
- type: scholar
|
||||||
|
url: "https://scholar.google.com/"
|
||||||
|
source: "Google Scholar"
|
||||||
|
query: "WebGPU machine learning"
|
||||||
|
- type: scholar
|
||||||
|
url: "https://scholar.google.com/"
|
||||||
|
source: "Google Scholar"
|
||||||
|
query: "NPU acceleration"
|
||||||
@ -7,6 +7,9 @@ from src.crawlers.playwright_crawler import PlaywrightCrawler
|
|||||||
from src.crawlers.cppconf_crawler import CppConfCrawler
|
from src.crawlers.cppconf_crawler import CppConfCrawler
|
||||||
from src.crawlers.static_crawler import StaticCrawler
|
from src.crawlers.static_crawler import StaticCrawler
|
||||||
from src.crawlers.skolkovo_crawler import SkolkovoCrawler
|
from src.crawlers.skolkovo_crawler import SkolkovoCrawler
|
||||||
|
from src.crawlers.scirate_crawler import SciRateCrawler
|
||||||
|
from src.crawlers.scholar_crawler import ScholarCrawler
|
||||||
|
from src.crawlers.microsoft_research_crawler import MicrosoftResearchCrawler
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@ -30,7 +33,7 @@ class CrawlerFactory:
|
|||||||
url = item.get('url')
|
url = item.get('url')
|
||||||
source = item.get('source')
|
source = item.get('source')
|
||||||
|
|
||||||
if not url or not source:
|
if not source or (not url and crawler_type != 'scholar'):
|
||||||
logger.warning(f"Missing mandatory fields (url, source) for crawler: {item}")
|
logger.warning(f"Missing mandatory fields (url, source) for crawler: {item}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@ -49,6 +52,13 @@ class CrawlerFactory:
|
|||||||
logger.warning(f"Missing mandatory field 'selector' for static crawler: {item}")
|
logger.warning(f"Missing mandatory field 'selector' for static crawler: {item}")
|
||||||
elif crawler_type == 'skolkovo':
|
elif crawler_type == 'skolkovo':
|
||||||
crawlers.append(SkolkovoCrawler(url=url, source=source))
|
crawlers.append(SkolkovoCrawler(url=url, source=source))
|
||||||
|
elif crawler_type == 'scirate':
|
||||||
|
crawlers.append(SciRateCrawler(url=url, source=source))
|
||||||
|
elif crawler_type == 'scholar':
|
||||||
|
query = item.get('query', 'Artificial Intelligence')
|
||||||
|
crawlers.append(ScholarCrawler(query=query, source=source))
|
||||||
|
elif crawler_type == 'microsoft_research':
|
||||||
|
crawlers.append(MicrosoftResearchCrawler(url=url, source=source))
|
||||||
else:
|
else:
|
||||||
logger.warning(f"Unknown crawler type: {crawler_type}")
|
logger.warning(f"Unknown crawler type: {crawler_type}")
|
||||||
|
|
||||||
|
|||||||
7
src/crawlers/microsoft_research_crawler.py
Normal file
7
src/crawlers/microsoft_research_crawler.py
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
from typing import List
|
||||||
|
from .rss_crawler import RSSCrawler
|
||||||
|
from .dto import NewsItemDTO
|
||||||
|
|
||||||
|
class MicrosoftResearchCrawler(RSSCrawler):
    """RSS crawler preconfigured for the Microsoft Research feed.

    The class adds no behaviour of its own: it only supplies default values
    for the feed URL and the source label and hands them to ``RSSCrawler``.
    """

    def __init__(
        self,
        url: str = "https://www.microsoft.com/en-us/research/feed/",
        source: str = "Microsoft Research",
    ):
        """Initialise the crawler.

        Args:
            url: Feed URL to fetch; defaults to the Microsoft Research feed.
            source: Source label passed to the base crawler.
        """
        super().__init__(url, source)
|
||||||
92
src/crawlers/scholar_crawler.py
Normal file
92
src/crawlers/scholar_crawler.py
Normal file
@ -0,0 +1,92 @@
|
|||||||
|
import logging
|
||||||
|
from typing import List, Optional
|
||||||
|
from playwright.async_api import async_playwright
|
||||||
|
from playwright_stealth import Stealth
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from urllib.parse import urljoin
|
||||||
|
|
||||||
|
from .base import ICrawler
|
||||||
|
from .dto import NewsItemDTO
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
class ScholarCrawler(ICrawler):
    """Crawler that scrapes one Google Scholar results page for a search query.

    The page is loaded in headless Chromium via Playwright with stealth
    patches applied. Each ``.gs_ri`` result that has a title link becomes one
    ``NewsItemDTO`` whose text combines the metadata line, the snippet and
    the citation count.
    """

    def __init__(self, query: str = "Artificial Intelligence", source: str = "Google Scholar"):
        """Store the query and build the search URL.

        Args:
            query: Free-text search query.
            source: Label used as the prefix of every item's ``source`` field.
        """
        # Local import: the module only imports ``urljoin`` at file level.
        from urllib.parse import quote_plus

        self.query = query
        # quote_plus still turns spaces into '+', but also escapes characters
        # such as '&', '#', '+' and non-ASCII text that would otherwise
        # corrupt the query string.
        self.url = f"https://scholar.google.com/scholar?hl=en&q={quote_plus(query)}"
        self.source = source

    async def fetch_latest(self) -> List[NewsItemDTO]:
        """Fetch and parse the results page for ``self.query``.

        Returns:
            The parsed items, or an empty list when a CAPTCHA / bot check is
            detected or when any error occurs (errors are logged, not raised).
        """
        try:
            async with async_playwright() as p:
                browser = await p.chromium.launch(headless=True)
                try:
                    # Use a realistic desktop user agent for the browser context.
                    context = await browser.new_context(
                        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
                    )
                    page = await context.new_page()

                    # Apply stealth patches before navigating.
                    await Stealth().apply_stealth_async(page)

                    logger.info("Navigating to %s", self.url)
                    await page.goto(self.url, wait_until="networkidle", timeout=60000)

                    # Bail out early if the page looks like a bot check.
                    content = await page.content()
                    if "CAPTCHA" in content or "not a robot" in content:
                        logger.warning("Google Scholar CAPTCHA or bot detection triggered")
                        return []

                    news_items = []
                    for res in await page.query_selector_all(".gs_ri"):
                        item = await self._parse_result(res)
                        if item is not None:
                            news_items.append(item)
                    return news_items
                finally:
                    # Always release the browser, including on early return.
                    await browser.close()
        except Exception as e:
            # Best-effort crawler: log with traceback and report no items.
            logger.exception("Error crawling Google Scholar: %s", e)
            return []

    async def _parse_result(self, res) -> Optional[NewsItemDTO]:
        """Convert one ``.gs_ri`` result element into a ``NewsItemDTO``.

        Returns ``None`` when the result has no title link.
        """
        title_el = await res.query_selector(".gs_rt a")
        if not title_el:
            return None

        title = await title_el.inner_text()
        href = await title_el.get_attribute("href")

        # Snippet / abstract.
        snippet_el = await res.query_selector(".gs_rs")
        snippet = await snippet_el.inner_text() if snippet_el else ""

        # Metadata line (authors, journal, year).
        metadata_el = await res.query_selector(".gs_a")
        metadata = await metadata_el.inner_text() if metadata_el else ""

        # The citation count is read from the first footer link containing "Cited by".
        citation_count = "0"
        for link in await res.query_selector_all(".gs_fl a"):
            text = await link.inner_text()
            if "Cited by" in text:
                citation_count = text.replace("Cited by", "").strip()
                break

        content_text = f"{metadata}\n\n{snippet}\n\nCitations: {citation_count}"

        return NewsItemDTO(
            title=title.strip(),
            # Resolve relative hrefs against the search URL; absolute URLs
            # pass through urljoin unchanged.
            url=urljoin(self.url, href) if href else self.url,
            content_text=content_text.strip(),
            source=f"{self.source}: {self.query}",
            timestamp=datetime.now(timezone.utc),
        )
|
||||||
65
src/crawlers/scirate_crawler.py
Normal file
65
src/crawlers/scirate_crawler.py
Normal file
@ -0,0 +1,65 @@
|
|||||||
|
import aiohttp
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from typing import List
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from urllib.parse import urljoin
|
||||||
|
|
||||||
|
from .base import ICrawler
|
||||||
|
from .dto import NewsItemDTO
|
||||||
|
|
||||||
|
class SciRateCrawler(ICrawler):
    """Crawler that downloads a SciRate page and extracts the listed papers."""

    def __init__(self, url: str = "https://scirate.com/", source: str = "SciRate"):
        """Store the page URL and the source label.

        Args:
            url: Page to download; also the base for resolving relative links.
            source: Value written to every item's ``source`` field.
        """
        self.url = url
        self.source = source

    async def fetch_latest(self) -> List[NewsItemDTO]:
        """Download ``self.url`` and parse it.

        Returns:
            The parsed items, or an empty list on a non-200 response or on
            any error. Failures are logged rather than raised.
        """
        # Local import: this module does not import logging at file level.
        import logging

        log = logging.getLogger(__name__)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        async with aiohttp.ClientSession(headers=headers) as session:
            try:
                async with session.get(self.url, timeout=aiohttp.ClientTimeout(total=30)) as response:
                    if response.status != 200:
                        log.warning("SciRate returned HTTP %s for %s", response.status, self.url)
                        return []
                    html = await response.text()
                    return self.parse_html(html)
            except Exception:
                # Best-effort crawler: keep returning [] but leave a trace.
                log.exception("Error crawling SciRate at %s", self.url)
                return []

    def parse_html(self, html: str) -> List[NewsItemDTO]:
        """Extract papers from SciRate HTML.

        Args:
            html: Raw page markup.

        Returns:
            One item per ``li.paper-list-item`` / ``div.paper`` element that
            contains a ``.title a`` link; elements without one are skipped.
        """
        soup = BeautifulSoup(html, "html.parser")
        items = []

        # Papers are matched in either of the two container shapes.
        papers = soup.select("li.paper-list-item, div.paper")

        for paper in papers:
            title_el = paper.select_one(".title a")
            if not title_el:
                continue

            title = title_el.get_text(strip=True)
            link = title_el.get("href", "")
            # BeautifulSoup may return a list for multi-valued attributes.
            if isinstance(link, list):
                link = link[0] if link else ""

            if link:
                # Resolve every relative form (not only those starting with
                # "/"); absolute URLs are returned unchanged by urljoin.
                link = urljoin(self.url, link)

            authors_el = paper.select_one(".authors")
            authors = authors_el.get_text(strip=True) if authors_el else ""

            abstract_el = paper.select_one(".abstract")
            abstract = abstract_el.get_text(strip=True) if abstract_el else ""

            content_text = f"Authors: {authors}\n\n{abstract}"

            items.append(NewsItemDTO(
                title=title,
                url=link or self.url,
                content_text=content_text.strip(),
                source=self.source,
                timestamp=datetime.now(timezone.utc),
            ))

        return items
|
||||||
@ -57,20 +57,22 @@ class StaticCrawler(ICrawler):
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
url = link_el.get('href') if link_el else ""
|
url = link_el.get('href') if link_el else ""
|
||||||
|
if isinstance(url, list):
|
||||||
|
url = url[0] if url else ""
|
||||||
|
|
||||||
if not title or not url:
|
if not title or not url:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Normalize URL
|
# Normalize URL
|
||||||
if url.startswith('/'):
|
if str(url).startswith('/'):
|
||||||
from urllib.parse import urljoin
|
from urllib.parse import urljoin
|
||||||
url = urljoin(self.url, url)
|
url = urljoin(self.url, str(url))
|
||||||
|
|
||||||
content_text = el.get_text(separator=" ", strip=True)
|
content_text = el.get_text(separator=" ", strip=True)
|
||||||
|
|
||||||
items.append(NewsItemDTO(
|
items.append(NewsItemDTO(
|
||||||
title=title,
|
title=title,
|
||||||
url=url,
|
url=str(url),
|
||||||
content_text=content_text,
|
content_text=content_text,
|
||||||
source=self.source,
|
source=self.source,
|
||||||
timestamp=datetime.now(timezone.utc)
|
timestamp=datetime.now(timezone.utc)
|
||||||
|
|||||||
216
tests/crawlers/test_academic_crawlers.py
Normal file
216
tests/crawlers/test_academic_crawlers.py
Normal file
@ -0,0 +1,216 @@
|
|||||||
|
import pytest
|
||||||
|
import aiohttp
|
||||||
|
from unittest.mock import AsyncMock, patch, MagicMock
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from src.crawlers.scirate_crawler import SciRateCrawler
|
||||||
|
from src.crawlers.scholar_crawler import ScholarCrawler
|
||||||
|
from src.crawlers.factory import CrawlerFactory
|
||||||
|
from src.crawlers.dto import NewsItemDTO
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_scirate_crawler_parse_html():
    """parse_html handles both paper container shapes and relative/absolute links."""
    sample_html = """
    <li class="paper-list-item">
        <div class="title"><a href="/arxiv/2403.12345">Quantum Supremacy in the Kitchen</a></div>
        <div class="authors">John Doe, Jane Smith</div>
        <div class="abstract">We demonstrate quantum supremacy by perfectly boiling an egg.</div>
    </li>
    <div class="paper">
        <div class="title"><a href="https://scirate.com/arxiv/2403.67890">AI for Cats</a></div>
        <div class="authors">Cat Lover</div>
        <div class="abstract">A deep learning approach to understanding meows.</div>
    </div>
    """

    parsed = SciRateCrawler().parse_html(sample_html)

    assert len(parsed) == 2
    list_item_paper, div_paper = parsed

    # First paper: <li> container with a relative link.
    assert list_item_paper.title == "Quantum Supremacy in the Kitchen"
    assert "arxiv/2403.12345" in list_item_paper.url
    assert "John Doe, Jane Smith" in list_item_paper.content_text
    assert "boiling an egg" in list_item_paper.content_text
    assert list_item_paper.source == "SciRate"

    # Second paper: <div> container with an absolute link.
    assert div_paper.title == "AI for Cats"
    assert div_paper.url == "https://scirate.com/arxiv/2403.67890"
    assert "Cat Lover" in div_paper.content_text
    assert "meows" in div_paper.content_text
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_scirate_crawler_fetch_latest():
    """A 200 response is parsed into items by fetch_latest."""
    sample_html = """
    <li class="paper-list-item">
        <div class="title"><a href="/arxiv/2403.12345">Quantum Supremacy</a></div>
    </li>
    """
    # Fake HTTP response returned by the patched session.get context manager.
    response = AsyncMock()
    response.status = 200
    response.text.return_value = sample_html

    with patch("aiohttp.ClientSession.get") as mock_get:
        mock_get.return_value.__aenter__.return_value = response
        items = await SciRateCrawler().fetch_latest()

        assert len(items) == 1
        assert items[0].title == "Quantum Supremacy"
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_scirate_crawler_fetch_error():
    """A non-200 response (404) makes fetch_latest return an empty list."""
    not_found = AsyncMock()
    not_found.status = 404

    with patch("aiohttp.ClientSession.get") as mock_get:
        mock_get.return_value.__aenter__.return_value = not_found
        items = await SciRateCrawler().fetch_latest()

        assert items == []
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_scholar_crawler_fetch_latest():
    """One mocked Scholar result becomes one populated item and the browser is closed."""
    crawler = ScholarCrawler(query="WebGPU", source="Scholar")

    # Elements of a single fake search result.
    title_el = AsyncMock()
    title_el.inner_text.return_value = "WebGPU Accelerated ML"
    title_el.get_attribute.return_value = "https://arxiv.org/abs/2403.abc"

    snippet_el = AsyncMock()
    snippet_el.inner_text.return_value = "This paper discusses WebGPU..."

    metadata_el = AsyncMock()
    metadata_el.inner_text.return_value = "J. Smith, 2024 - arxiv.org"

    citation_link = AsyncMock()
    citation_link.inner_text.return_value = "Cited by 15"

    elements_by_selector = {
        ".gs_rt a": title_el,
        ".gs_rs": snippet_el,
        ".gs_a": metadata_el,
    }
    result = AsyncMock()
    # Unknown selectors resolve to None, like a missing element.
    result.query_selector.side_effect = elements_by_selector.get
    result.query_selector_all.return_value = [citation_link]

    # Playwright object chain, built from the page outwards.
    page = AsyncMock()
    page.content.return_value = "<html><body>Results</body></html>"
    page.query_selector_all.return_value = [result]
    context = AsyncMock()
    context.new_page.return_value = page
    browser = AsyncMock()
    browser.new_context.return_value = context
    playwright = AsyncMock()
    playwright.chromium.launch.return_value = browser

    # Stealth() must return an object whose apply_stealth_async is awaitable.
    stealth = MagicMock()
    stealth.apply_stealth_async = AsyncMock()

    with patch("src.crawlers.scholar_crawler.async_playwright") as mock_playwright, \
            patch("src.crawlers.scholar_crawler.Stealth") as mock_stealth_class:
        mock_playwright.return_value.__aenter__.return_value = playwright
        mock_stealth_class.return_value = stealth

        items = await crawler.fetch_latest()

        assert len(items) == 1
        item = items[0]
        assert item.title == "WebGPU Accelerated ML"
        assert item.url == "https://arxiv.org/abs/2403.abc"
        assert "15" in item.content_text
        assert "J. Smith, 2024" in item.content_text
        assert item.source == "Scholar: WebGPU"

        browser.close.assert_called_once()
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_scholar_crawler_captcha_detection():
    """A page containing CAPTCHA text yields no items and still closes the browser."""
    crawler = ScholarCrawler(query="WebGPU", source="Scholar")

    # Playwright object chain; the page content simulates a bot check.
    page = AsyncMock()
    page.content.return_value = "<html><body>Please verify you are not a robot CAPTCHA</body></html>"
    context = AsyncMock()
    context.new_page.return_value = page
    browser = AsyncMock()
    browser.new_context.return_value = context
    playwright = AsyncMock()
    playwright.chromium.launch.return_value = browser

    # Stealth() must return an object whose apply_stealth_async is awaitable.
    stealth = MagicMock()
    stealth.apply_stealth_async = AsyncMock()

    with patch("src.crawlers.scholar_crawler.async_playwright") as mock_playwright, \
            patch("src.crawlers.scholar_crawler.Stealth") as mock_stealth_class:
        mock_playwright.return_value.__aenter__.return_value = playwright
        mock_stealth_class.return_value = stealth

        items = await crawler.fetch_latest()

        assert items == []
        browser.close.assert_called_once()
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_scholar_crawler_error_handling():
    """An exception raised by page.goto is swallowed: no items, browser closed."""
    crawler = ScholarCrawler(query="WebGPU", source="Scholar")

    # Playwright object chain; navigation fails.
    page = AsyncMock()
    page.goto.side_effect = Exception("Browser crash")
    context = AsyncMock()
    context.new_page.return_value = page
    browser = AsyncMock()
    browser.new_context.return_value = context
    playwright = AsyncMock()
    playwright.chromium.launch.return_value = browser

    # Stealth() must return an object whose apply_stealth_async is awaitable.
    stealth = MagicMock()
    stealth.apply_stealth_async = AsyncMock()

    with patch("src.crawlers.scholar_crawler.async_playwright") as mock_playwright, \
            patch("src.crawlers.scholar_crawler.Stealth") as mock_stealth_class:
        mock_playwright.return_value.__aenter__.return_value = playwright
        mock_stealth_class.return_value = stealth

        items = await crawler.fetch_latest()

        assert items == []
        browser.close.assert_called_once()
|
||||||
|
|
||||||
|
def test_factory_registration():
    """The factory builds SciRate and Scholar crawlers from their config entries."""
    config = {
        'crawlers': [
            {'type': 'scirate', 'url': 'https://scirate.com/', 'source': 'SciRate'},
            {'type': 'scholar', 'url': 'https://scholar.google.com/', 'source': 'Scholar', 'query': 'AI'}
        ]
    }

    # No file is read: open() is stubbed and the YAML loader returns the config above.
    with patch("builtins.open", MagicMock()), patch("yaml.safe_load") as mock_yaml:
        mock_yaml.return_value = config
        crawlers = CrawlerFactory.load_from_yaml("fake_path.yml")

        assert len(crawlers) == 2
        scirate, scholar = crawlers
        assert isinstance(scirate, SciRateCrawler)
        assert isinstance(scholar, ScholarCrawler)
        assert scholar.query == 'AI'
|
||||||
@ -4,6 +4,12 @@ from unittest.mock import patch, mock_open
|
|||||||
from src.crawlers.factory import CrawlerFactory
|
from src.crawlers.factory import CrawlerFactory
|
||||||
from src.crawlers.rss_crawler import RSSCrawler
|
from src.crawlers.rss_crawler import RSSCrawler
|
||||||
from src.crawlers.playwright_crawler import PlaywrightCrawler
|
from src.crawlers.playwright_crawler import PlaywrightCrawler
|
||||||
|
from src.crawlers.scirate_crawler import SciRateCrawler
|
||||||
|
from src.crawlers.scholar_crawler import ScholarCrawler
|
||||||
|
from src.crawlers.microsoft_research_crawler import MicrosoftResearchCrawler
|
||||||
|
from src.crawlers.static_crawler import StaticCrawler
|
||||||
|
from src.crawlers.skolkovo_crawler import SkolkovoCrawler
|
||||||
|
from src.crawlers.cppconf_crawler import CppConfCrawler
|
||||||
|
|
||||||
VALID_YAML = """
|
VALID_YAML = """
|
||||||
crawlers:
|
crawlers:
|
||||||
@ -14,6 +20,15 @@ crawlers:
|
|||||||
url: "https://example.com/playwright"
|
url: "https://example.com/playwright"
|
||||||
source: "Example Playwright"
|
source: "Example Playwright"
|
||||||
selector: ".item"
|
selector: ".item"
|
||||||
|
- type: scirate
|
||||||
|
url: "https://scirate.com/"
|
||||||
|
source: "SciRate"
|
||||||
|
- type: scholar
|
||||||
|
query: "AI"
|
||||||
|
source: "Google Scholar"
|
||||||
|
- type: microsoft_research
|
||||||
|
url: "https://example.com/msr"
|
||||||
|
source: "Microsoft Research"
|
||||||
"""
|
"""
|
||||||
|
|
||||||
INVALID_TYPE_YAML = """
|
INVALID_TYPE_YAML = """
|
||||||
@ -45,15 +60,13 @@ def test_load_from_yaml_valid():
|
|||||||
with patch("builtins.open", mock_open(read_data=VALID_YAML)):
|
with patch("builtins.open", mock_open(read_data=VALID_YAML)):
|
||||||
crawlers = CrawlerFactory.load_from_yaml("dummy.yml")
|
crawlers = CrawlerFactory.load_from_yaml("dummy.yml")
|
||||||
|
|
||||||
assert len(crawlers) == 2
|
assert len(crawlers) == 5
|
||||||
assert isinstance(crawlers[0], RSSCrawler)
|
assert isinstance(crawlers[0], RSSCrawler)
|
||||||
assert crawlers[0].url == "https://example.com/rss"
|
|
||||||
assert crawlers[0].source == "Example RSS"
|
|
||||||
|
|
||||||
assert isinstance(crawlers[1], PlaywrightCrawler)
|
assert isinstance(crawlers[1], PlaywrightCrawler)
|
||||||
assert crawlers[1].url == "https://example.com/playwright"
|
assert isinstance(crawlers[2], SciRateCrawler)
|
||||||
assert crawlers[1].source == "Example Playwright"
|
assert isinstance(crawlers[3], ScholarCrawler)
|
||||||
assert crawlers[1].selector == ".item"
|
assert isinstance(crawlers[4], MicrosoftResearchCrawler)
|
||||||
|
|
||||||
|
|
||||||
def test_load_from_yaml_unknown_type():
|
def test_load_from_yaml_unknown_type():
|
||||||
with patch("builtins.open", mock_open(read_data=INVALID_TYPE_YAML)):
|
with patch("builtins.open", mock_open(read_data=INVALID_TYPE_YAML)):
|
||||||
@ -112,8 +125,9 @@ def test_integration_load_actual_config():
|
|||||||
|
|
||||||
# Verify types and mandatory fields for all loaded crawlers
|
# Verify types and mandatory fields for all loaded crawlers
|
||||||
for crawler in crawlers:
|
for crawler in crawlers:
|
||||||
assert isinstance(crawler, (RSSCrawler, PlaywrightCrawler))
|
assert isinstance(crawler, (RSSCrawler, PlaywrightCrawler, StaticCrawler, SkolkovoCrawler, CppConfCrawler, SciRateCrawler, ScholarCrawler, MicrosoftResearchCrawler))
|
||||||
assert crawler.url.startswith("http")
|
if not isinstance(crawler, ScholarCrawler):
|
||||||
|
assert crawler.url.startswith("http")
|
||||||
assert crawler.source
|
assert crawler.source
|
||||||
if isinstance(crawler, PlaywrightCrawler):
|
if isinstance(crawler, PlaywrightCrawler):
|
||||||
# According to src/crawlers.yml, all playwright crawlers currently have selectors
|
# According to src/crawlers.yml, all playwright crawlers currently have selectors
|
||||||
|
|||||||
38
tests/crawlers/test_microsoft_research_crawler.py
Normal file
38
tests/crawlers/test_microsoft_research_crawler.py
Normal file
@ -0,0 +1,38 @@
|
|||||||
|
import pytest
|
||||||
|
from unittest.mock import AsyncMock, patch, MagicMock
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from src.crawlers.microsoft_research_crawler import MicrosoftResearchCrawler
|
||||||
|
from src.crawlers.dto import NewsItemDTO
|
||||||
|
|
||||||
|
# Minimal RSS 2.0 document with one item, used as the mocked feed body.
# 10 Mar 2026 is a Tuesday, so the RFC 822 day name is "Tue".
MOCK_MSR_RSS = """<?xml version="1.0" encoding="UTF-8" ?>
<rss version="2.0">
<channel>
    <title>Microsoft Research</title>
    <item>
        <title>MSR Paper Title</title>
        <link>https://www.microsoft.com/en-us/research/publication/msr-paper/</link>
        <description>MSR Paper Description</description>
        <pubDate>Tue, 10 Mar 2026 10:00:00 GMT</pubDate>
    </item>
</channel>
</rss>
"""


@pytest.mark.asyncio
async def test_microsoft_research_crawler_fetch_latest():
    """The default crawler parses the mocked feed into a single item with a UTC timestamp."""
    crawler = MicrosoftResearchCrawler()

    with patch("aiohttp.ClientSession.get") as mock_get:
        mock_response = AsyncMock()
        mock_response.text.return_value = MOCK_MSR_RSS
        mock_response.status = 200
        # raise_for_status is replaced with a synchronous no-op mock.
        mock_response.raise_for_status = MagicMock()
        mock_get.return_value.__aenter__.return_value = mock_response

        items = await crawler.fetch_latest()

        assert len(items) == 1
        assert items[0].title == "MSR Paper Title"
        assert items[0].url == "https://www.microsoft.com/en-us/research/publication/msr-paper/"
        assert items[0].source == "Microsoft Research"
        assert items[0].timestamp == datetime(2026, 3, 10, 10, 0, tzinfo=timezone.utc)
|
||||||
115
tests/crawlers/test_scholar_crawler.py
Normal file
115
tests/crawlers/test_scholar_crawler.py
Normal file
@ -0,0 +1,115 @@
|
|||||||
|
import pytest
|
||||||
|
from unittest.mock import AsyncMock, patch, MagicMock
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from src.crawlers.scholar_crawler import ScholarCrawler
|
||||||
|
from src.crawlers.dto import NewsItemDTO
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_scholar_crawler_fetch_latest():
    """A mocked result with title, snippet, metadata and citations produces one item."""
    crawler = ScholarCrawler(query="Large Language Models", source="Google Scholar")

    with patch("src.crawlers.scholar_crawler.async_playwright") as mock_playwright, \
            patch("src.crawlers.scholar_crawler.Stealth") as mock_stealth_class:
        # Stealth() must return an object whose apply_stealth_async is awaitable.
        stealth = MagicMock()
        stealth.apply_stealth_async = AsyncMock()
        mock_stealth_class.return_value = stealth

        # Title link of the single fake result.
        title_el = AsyncMock()
        title_el.inner_text.return_value = "LLM Paper Title"
        title_el.get_attribute.return_value = "https://arxiv.org/abs/2401.00001"

        def find_element(selector):
            # Fresh snippet/metadata mocks are created on every lookup.
            return {
                ".gs_rt a": title_el,
                ".gs_rs": AsyncMock(inner_text=AsyncMock(return_value="This is a snippet")),
                ".gs_a": AsyncMock(inner_text=AsyncMock(return_value="Authors et al.")),
            }.get(selector)

        citation_link = AsyncMock()
        citation_link.inner_text.return_value = "Cited by 123"

        result = AsyncMock()
        result.query_selector.side_effect = find_element
        result.query_selector_all.return_value = [citation_link]

        # Playwright object chain; page content avoids the CAPTCHA branch.
        page = AsyncMock()
        page.content.return_value = "<html><body>Results</body></html>"
        page.query_selector_all.return_value = [result]
        context = AsyncMock()
        context.new_page.return_value = page
        browser = AsyncMock()
        browser.new_context.return_value = context
        playwright = AsyncMock()
        playwright.chromium.launch.return_value = browser
        mock_playwright.return_value.__aenter__.return_value = playwright

        items = await crawler.fetch_latest()

        assert len(items) == 1
        assert items[0].title == "LLM Paper Title"
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_scholar_crawler_no_title():
    """A result without a title link is skipped, so no items are returned."""
    crawler = ScholarCrawler()
    # Stealth is patched as well so the test does not run the real
    # playwright_stealth code against a mock page.
    with patch("src.crawlers.scholar_crawler.async_playwright") as mock_playwright, \
            patch("src.crawlers.scholar_crawler.Stealth") as mock_stealth_class:
        mock_stealth = MagicMock()
        mock_stealth.apply_stealth_async = AsyncMock()
        mock_stealth_class.return_value = mock_stealth

        mock_p = AsyncMock()
        mock_playwright.return_value.__aenter__.return_value = mock_p
        mock_browser = AsyncMock()
        mock_p.chromium.launch.return_value = mock_browser
        mock_context = AsyncMock()
        mock_browser.new_context.return_value = mock_context
        mock_page = AsyncMock()
        mock_context.new_page.return_value = mock_page
        mock_page.content.return_value = "<html><body>Results</body></html>"

        # Result item for which every selector lookup finds nothing.
        mock_res = AsyncMock()
        mock_res.query_selector.return_value = None
        mock_page.query_selector_all.return_value = [mock_res]

        items = await crawler.fetch_latest()

        assert len(items) == 0
        # The browser must be released even when nothing was parsed.
        mock_browser.close.assert_called_once()
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_scholar_crawler_exception():
    """A failure while creating the browser context results in an empty list."""
    crawler = ScholarCrawler()

    # The browser launches, but new_context raises.
    browser = AsyncMock()
    browser.new_context.side_effect = Exception("Browser error")
    playwright = AsyncMock()
    playwright.chromium.launch.return_value = browser

    with patch("src.crawlers.scholar_crawler.async_playwright") as mock_playwright:
        mock_playwright.return_value.__aenter__.return_value = playwright

        items = await crawler.fetch_latest()

        assert items == []
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_scholar_crawler_captcha():
    """Page content containing "CAPTCHA" makes the crawler return no items."""
    crawler = ScholarCrawler()
    # Stealth is patched as well so the test does not run the real
    # playwright_stealth code against a mock page.
    with patch("src.crawlers.scholar_crawler.async_playwright") as mock_playwright, \
            patch("src.crawlers.scholar_crawler.Stealth") as mock_stealth_class:
        mock_stealth = MagicMock()
        mock_stealth.apply_stealth_async = AsyncMock()
        mock_stealth_class.return_value = mock_stealth

        mock_p = AsyncMock()
        mock_playwright.return_value.__aenter__.return_value = mock_p
        mock_browser = AsyncMock()
        mock_p.chromium.launch.return_value = mock_browser
        mock_context = AsyncMock()
        mock_browser.new_context.return_value = mock_context
        mock_page = AsyncMock()
        mock_context.new_page.return_value = mock_page

        # Simulate a CAPTCHA page.
        mock_page.content.return_value = "<html><body>Please solve this CAPTCHA</body></html>"

        items = await crawler.fetch_latest()

        assert items == []
        # The early return must still release the browser.
        mock_browser.close.assert_called_once()
|
||||||
90
tests/crawlers/test_scirate_crawler.py
Normal file
90
tests/crawlers/test_scirate_crawler.py
Normal file
@ -0,0 +1,90 @@
|
|||||||
|
import pytest
|
||||||
|
from unittest.mock import AsyncMock, patch, MagicMock
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from src.crawlers.scirate_crawler import SciRateCrawler
|
||||||
|
from src.crawlers.dto import NewsItemDTO
|
||||||
|
|
||||||
|
# Sample SciRate page with one paper per supported container shape.
# NOTE(review): no test in this view reads this constant — confirm whether it is still needed.
MOCK_SCIRATE_HTML = """
<html>
<body>
    <li class="paper-list-item">
        <div class="title">
            <a href="/arxiv/2403.12345">Attention is Really All You Need</a>
        </div>
        <div class="authors">Vaswani et al.</div>
        <div class="abstract">This paper presents a new architecture...</div>
    </li>
    <div class="paper">
        <div class="title">
            <a href="https://example.com/paper2">Another Paper</a>
        </div>
        <div class="authors">Doe and Smith</div>
        <div class="abstract">Abstract of another paper.</div>
    </div>
</body>
</html>
"""


@pytest.mark.asyncio
async def test_scirate_crawler_fetch_latest():
    """fetch_latest skips a paper without a title link and unwraps a list-valued href."""
    crawler = SciRateCrawler("https://scirate.com/", "SciRate")

    # Three list items; the middle one has no <a> inside its title.
    page_html = """
    <html>
    <body>
        <li class="paper-list-item">
            <div class="title"><a href="/arxiv/1">Paper 1</a></div>
        </li>
        <li class="paper-list-item">
            <div class="title">No link here</div>
        </li>
        <li class="paper-list-item">
            <div class="title"><a href="/arxiv/3">Paper 3</a></div>
        </li>
    </body>
    </html>
    """

    response = AsyncMock()
    response.text.return_value = page_html
    response.status = 200

    with patch("aiohttp.ClientSession.get") as mock_get:
        mock_get.return_value.__aenter__.return_value = response

        # BeautifulSoup normally returns a plain string for href, so Tag.get is
        # patched to return a list first, exercising the isinstance(link, list) branch.
        href_values = [["/arxiv/list"], "/arxiv/3"]
        with patch("bs4.element.Tag.get", side_effect=href_values):
            items = await crawler.fetch_latest()

            assert len(items) == 2
            assert items[0].url == "https://scirate.com/arxiv/list"
            assert items[1].url == "https://scirate.com/arxiv/3"
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_scirate_crawler_exception():
    """An exception raised while parsing is swallowed and yields an empty list."""
    crawler = SciRateCrawler()

    response = AsyncMock()
    response.text.return_value = "<html></html>"
    response.status = 200

    with patch("aiohttp.ClientSession.get") as mock_get:
        mock_get.return_value.__aenter__.return_value = response

        # Make parse_html blow up after a successful download.
        parse_failure = Exception("Parsing failed")
        with patch.object(SciRateCrawler, 'parse_html', side_effect=parse_failure):
            items = await crawler.fetch_latest()
            assert items == []
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_scirate_crawler_error():
    """A server error (HTTP 500) makes fetch_latest return an empty list."""
    server_error = AsyncMock()
    server_error.status = 500

    with patch("aiohttp.ClientSession.get") as mock_get:
        mock_get.return_value.__aenter__.return_value = server_error

        items = await SciRateCrawler().fetch_latest()
        assert items == []
|
||||||
Loading…
x
Reference in New Issue
Block a user