- Implement crawlers for Microsoft Research, SciRate, and Google Scholar
- Use Playwright with stealth for Google Scholar anti-bot mitigation
- Update CrawlerFactory to support new research crawler types
- Add unit and integration tests for all academic sources with high coverage
217 lines
8.1 KiB
Python
import pytest
|
|
import aiohttp
|
|
from unittest.mock import AsyncMock, patch, MagicMock
|
|
from datetime import datetime, timezone
|
|
from src.crawlers.scirate_crawler import SciRateCrawler
|
|
from src.crawlers.scholar_crawler import ScholarCrawler
|
|
from src.crawlers.factory import CrawlerFactory
|
|
from src.crawlers.dto import NewsItemDTO
|
|
|
|
@pytest.mark.asyncio
async def test_scirate_crawler_parse_html():
    """parse_html extracts papers from both markup variants SciRate serves.

    The fixture contains one ``li.paper-list-item`` entry (relative URL) and
    one ``div.paper`` entry (absolute URL) so both selector paths are covered.
    """
    crawler = SciRateCrawler()
    sample_html = """
<li class="paper-list-item">
<div class="title"><a href="/arxiv/2403.12345">Quantum Supremacy in the Kitchen</a></div>
<div class="authors">John Doe, Jane Smith</div>
<div class="abstract">We demonstrate quantum supremacy by perfectly boiling an egg.</div>
</li>
<div class="paper">
<div class="title"><a href="https://scirate.com/arxiv/2403.67890">AI for Cats</a></div>
<div class="authors">Cat Lover</div>
<div class="abstract">A deep learning approach to understanding meows.</div>
</div>
"""

    parsed = crawler.parse_html(sample_html)

    assert len(parsed) == 2
    first, second = parsed

    # First entry: relative href, authors and abstract folded into content_text.
    assert first.title == "Quantum Supremacy in the Kitchen"
    assert "arxiv/2403.12345" in first.url
    assert "John Doe, Jane Smith" in first.content_text
    assert "boiling an egg" in first.content_text
    assert first.source == "SciRate"

    # Second entry: absolute URL is kept verbatim.
    assert second.title == "AI for Cats"
    assert second.url == "https://scirate.com/arxiv/2403.67890"
    assert "Cat Lover" in second.content_text
    assert "meows" in second.content_text
|
@pytest.mark.asyncio
async def test_scirate_crawler_fetch_latest():
    """fetch_latest parses the page body when the HTTP response is 200."""
    crawler = SciRateCrawler()
    page_body = """
<li class="paper-list-item">
<div class="title"><a href="/arxiv/2403.12345">Quantum Supremacy</a></div>
</li>
"""

    with patch("aiohttp.ClientSession.get") as get_mock:
        # Async context manager returned by session.get(...) yields this response.
        response = AsyncMock()
        response.status = 200
        response.text.return_value = page_body
        get_mock.return_value.__aenter__.return_value = response

        results = await crawler.fetch_latest()

    assert len(results) == 1
    assert results[0].title == "Quantum Supremacy"
|
|
|
|
@pytest.mark.asyncio
async def test_scirate_crawler_fetch_error():
    """A non-200 status makes fetch_latest return an empty list, not raise."""
    crawler = SciRateCrawler()

    with patch("aiohttp.ClientSession.get") as get_mock:
        # 404 response; body is intentionally left unconfigured — the error
        # path must bail out before reading it.
        response = AsyncMock()
        response.status = 404
        get_mock.return_value.__aenter__.return_value = response

        results = await crawler.fetch_latest()

    assert results == []
|
|
|
|
@pytest.mark.asyncio
async def test_scholar_crawler_fetch_latest():
    """A mocked Playwright result row is converted into one news item.

    Stubs the whole Playwright chain (playwright -> browser -> context ->
    page) plus the Stealth helper, then fakes a single Google Scholar result
    element and checks the DTO fields, including the "Cited by" count and the
    query-suffixed source label. Also verifies the browser is closed.
    """
    crawler = ScholarCrawler(query="WebGPU", source="Scholar")

    with patch("src.crawlers.scholar_crawler.async_playwright") as pw_patch, \
            patch("src.crawlers.scholar_crawler.Stealth") as stealth_patch:

        playwright = AsyncMock()
        pw_patch.return_value.__aenter__.return_value = playwright

        browser = AsyncMock()
        playwright.chromium.launch.return_value = browser

        context = AsyncMock()
        browser.new_context.return_value = context

        page = AsyncMock()
        context.new_page.return_value = page
        page.content.return_value = "<html><body>Results</body></html>"

        # Stealth() instance must expose an awaitable apply_stealth_async.
        stealth = MagicMock()
        stealth.apply_stealth_async = AsyncMock()
        stealth_patch.return_value = stealth

        # One fake search-result row with title/snippet/metadata children.
        row = AsyncMock()

        title_el = AsyncMock()
        title_el.inner_text.return_value = "WebGPU Accelerated ML"
        title_el.get_attribute.return_value = "https://arxiv.org/abs/2403.abc"

        snippet_el = AsyncMock()
        snippet_el.inner_text.return_value = "This paper discusses WebGPU..."

        meta_el = AsyncMock()
        meta_el.inner_text.return_value = "J. Smith, 2024 - arxiv.org"

        cite_link = AsyncMock()
        cite_link.inner_text.return_value = "Cited by 15"

        # Route query_selector calls by CSS selector; unknown selectors -> None.
        children = {
            ".gs_rt a": title_el,
            ".gs_rs": snippet_el,
            ".gs_a": meta_el,
        }
        row.query_selector.side_effect = children.get
        row.query_selector_all.return_value = [cite_link]

        page.query_selector_all.return_value = [row]

        items = await crawler.fetch_latest()

        assert len(items) == 1
        item = items[0]
        assert item.title == "WebGPU Accelerated ML"
        assert item.url == "https://arxiv.org/abs/2403.abc"
        assert "15" in item.content_text
        assert "J. Smith, 2024" in item.content_text
        assert item.source == "Scholar: WebGPU"

        browser.close.assert_called_once()
|
|
|
|
@pytest.mark.asyncio
async def test_scholar_crawler_captcha_detection():
    """When the page content contains a CAPTCHA challenge, no items are
    produced and the browser is still closed cleanly."""
    crawler = ScholarCrawler(query="WebGPU", source="Scholar")

    with patch("src.crawlers.scholar_crawler.async_playwright") as pw_patch, \
            patch("src.crawlers.scholar_crawler.Stealth") as stealth_patch:

        playwright = AsyncMock()
        pw_patch.return_value.__aenter__.return_value = playwright

        browser = AsyncMock()
        playwright.chromium.launch.return_value = browser

        context = AsyncMock()
        browser.new_context.return_value = context

        page = AsyncMock()
        context.new_page.return_value = page

        # Stealth() instance must expose an awaitable apply_stealth_async.
        stealth = MagicMock()
        stealth.apply_stealth_async = AsyncMock()
        stealth_patch.return_value = stealth

        # Page body simulates Google's anti-bot interstitial.
        page.content.return_value = "<html><body>Please verify you are not a robot CAPTCHA</body></html>"

        items = await crawler.fetch_latest()

        assert items == []
        browser.close.assert_called_once()
|
|
|
|
@pytest.mark.asyncio
async def test_scholar_crawler_error_handling():
    """An exception raised during page navigation is swallowed: fetch_latest
    returns [] and the browser is still closed."""
    crawler = ScholarCrawler(query="WebGPU", source="Scholar")

    with patch("src.crawlers.scholar_crawler.async_playwright") as pw_patch, \
            patch("src.crawlers.scholar_crawler.Stealth") as stealth_patch:

        playwright = AsyncMock()
        pw_patch.return_value.__aenter__.return_value = playwright

        browser = AsyncMock()
        playwright.chromium.launch.return_value = browser

        context = AsyncMock()
        browser.new_context.return_value = context

        page = AsyncMock()
        context.new_page.return_value = page

        stealth = MagicMock()
        stealth.apply_stealth_async = AsyncMock()
        stealth_patch.return_value = stealth

        # Navigation blows up mid-crawl.
        page.goto.side_effect = Exception("Browser crash")

        items = await crawler.fetch_latest()

        assert items == []
        browser.close.assert_called_once()
|
|
|
|
def test_factory_registration():
    """CrawlerFactory.load_from_yaml builds the right crawler subclasses for
    the 'scirate' and 'scholar' config types, passing through the query."""
    config = {
        'crawlers': [
            {'type': 'scirate', 'url': 'https://scirate.com/', 'source': 'SciRate'},
            {'type': 'scholar', 'url': 'https://scholar.google.com/', 'source': 'Scholar', 'query': 'AI'},
        ]
    }

    # No real file I/O: open() is stubbed and yaml.safe_load returns the
    # in-memory config above.
    with patch("builtins.open", MagicMock()), \
            patch("yaml.safe_load", return_value=config):
        crawlers = CrawlerFactory.load_from_yaml("fake_path.yml")

    assert len(crawlers) == 2
    assert isinstance(crawlers[0], SciRateCrawler)
    assert isinstance(crawlers[1], ScholarCrawler)
    assert crawlers[1].query == 'AI'