AI-Trend-Scout/tests/crawlers/test_academic_crawlers.py
Artur Mukhamadiev a304ae9cd2 feat(crawler): add academic and research sources
- Implement crawlers for Microsoft Research, SciRate, and Google Scholar
- Use Playwright with stealth for Google Scholar anti-bot mitigation
- Update CrawlerFactory to support new research crawler types
- Add unit and integration tests for all academic sources with high coverage
2026-03-16 00:11:15 +03:00

217 lines
8.1 KiB
Python

import pytest
import aiohttp
from unittest.mock import AsyncMock, patch, MagicMock
from datetime import datetime, timezone
from src.crawlers.scirate_crawler import SciRateCrawler
from src.crawlers.scholar_crawler import ScholarCrawler
from src.crawlers.factory import CrawlerFactory
from src.crawlers.dto import NewsItemDTO
@pytest.mark.asyncio
async def test_scirate_crawler_parse_html():
    """Both SciRate markup variants (list item and paper card) must be parsed."""
    sample_html = """
<li class="paper-list-item">
<div class="title"><a href="/arxiv/2403.12345">Quantum Supremacy in the Kitchen</a></div>
<div class="authors">John Doe, Jane Smith</div>
<div class="abstract">We demonstrate quantum supremacy by perfectly boiling an egg.</div>
</li>
<div class="paper">
<div class="title"><a href="https://scirate.com/arxiv/2403.67890">AI for Cats</a></div>
<div class="authors">Cat Lover</div>
<div class="abstract">A deep learning approach to understanding meows.</div>
</div>
"""
    papers = SciRateCrawler().parse_html(sample_html)

    assert len(papers) == 2
    first, second = papers

    # Relative-href paper: arxiv id must survive into the URL, and authors
    # plus abstract are folded into the content text.
    assert first.title == "Quantum Supremacy in the Kitchen"
    assert "arxiv/2403.12345" in first.url
    assert "John Doe, Jane Smith" in first.content_text
    assert "boiling an egg" in first.content_text
    assert first.source == "SciRate"

    # Absolute-href paper: URL is kept verbatim.
    assert second.title == "AI for Cats"
    assert second.url == "https://scirate.com/arxiv/2403.67890"
    assert "Cat Lover" in second.content_text
    assert "meows" in second.content_text
@pytest.mark.asyncio
async def test_scirate_crawler_fetch_latest():
    """A 200 response body should be fetched and parsed into items."""
    sample_html = """
<li class="paper-list-item">
<div class="title"><a href="/arxiv/2403.12345">Quantum Supremacy</a></div>
</li>
"""
    crawler = SciRateCrawler()
    with patch("aiohttp.ClientSession.get") as mocked_get:
        ok_response = AsyncMock()
        ok_response.status = 200
        ok_response.text.return_value = sample_html
        # `session.get(...)` is used as an async context manager.
        mocked_get.return_value.__aenter__.return_value = ok_response

        results = await crawler.fetch_latest()

    assert len(results) == 1
    assert results[0].title == "Quantum Supremacy"
@pytest.mark.asyncio
async def test_scirate_crawler_fetch_error():
    """A non-200 status must yield an empty list, not an exception."""
    crawler = SciRateCrawler()
    with patch("aiohttp.ClientSession.get") as mocked_get:
        not_found = AsyncMock()
        not_found.status = 404
        mocked_get.return_value.__aenter__.return_value = not_found

        assert await crawler.fetch_latest() == []
@pytest.mark.asyncio
async def test_scholar_crawler_fetch_latest():
    """Happy path: a single Scholar result is converted into one item."""
    crawler = ScholarCrawler(query="WebGPU", source="Scholar")
    with patch("src.crawlers.scholar_crawler.async_playwright") as playwright_patch, \
            patch("src.crawlers.scholar_crawler.Stealth") as stealth_patch:
        # Playwright object graph: playwright -> browser -> context -> page.
        playwright = AsyncMock()
        browser = AsyncMock()
        context = AsyncMock()
        page = AsyncMock()
        playwright_patch.return_value.__aenter__.return_value = playwright
        playwright.chromium.launch.return_value = browser
        browser.new_context.return_value = context
        context.new_page.return_value = page
        page.content.return_value = "<html><body>Results</body></html>"

        # Stealth is instantiated and its async apply method awaited.
        stealth = MagicMock()
        stealth.apply_stealth_async = AsyncMock()
        stealth_patch.return_value = stealth

        # Sub-elements of one search result: title link, snippet, metadata
        # line, and a "Cited by" link.
        title_el = AsyncMock()
        title_el.inner_text.return_value = "WebGPU Accelerated ML"
        title_el.get_attribute.return_value = "https://arxiv.org/abs/2403.abc"
        snippet_el = AsyncMock()
        snippet_el.inner_text.return_value = "This paper discusses WebGPU..."
        meta_el = AsyncMock()
        meta_el.inner_text.return_value = "J. Smith, 2024 - arxiv.org"
        cited_link = AsyncMock()
        cited_link.inner_text.return_value = "Cited by 15"

        result_el = AsyncMock()
        by_selector = {
            ".gs_rt a": title_el,
            ".gs_rs": snippet_el,
            ".gs_a": meta_el,
        }
        result_el.query_selector.side_effect = by_selector.get
        result_el.query_selector_all.return_value = [cited_link]
        page.query_selector_all.return_value = [result_el]

        items = await crawler.fetch_latest()

        assert len(items) == 1
        item = items[0]
        assert item.title == "WebGPU Accelerated ML"
        assert item.url == "https://arxiv.org/abs/2403.abc"
        assert "15" in item.content_text
        assert "J. Smith, 2024" in item.content_text
        assert item.source == "Scholar: WebGPU"
        browser.close.assert_called_once()
@pytest.mark.asyncio
async def test_scholar_crawler_captcha_detection():
    """A CAPTCHA page yields no items and the browser is still closed."""
    crawler = ScholarCrawler(query="WebGPU", source="Scholar")
    with patch("src.crawlers.scholar_crawler.async_playwright") as playwright_patch, \
            patch("src.crawlers.scholar_crawler.Stealth") as stealth_patch:
        playwright = AsyncMock()
        browser = AsyncMock()
        context = AsyncMock()
        page = AsyncMock()
        playwright_patch.return_value.__aenter__.return_value = playwright
        playwright.chromium.launch.return_value = browser
        browser.new_context.return_value = context
        context.new_page.return_value = page

        stealth = MagicMock()
        stealth.apply_stealth_async = AsyncMock()
        stealth_patch.return_value = stealth

        # The rendered page carries the CAPTCHA marker text.
        page.content.return_value = "<html><body>Please verify you are not a robot CAPTCHA</body></html>"

        assert await crawler.fetch_latest() == []
        browser.close.assert_called_once()
@pytest.mark.asyncio
async def test_scholar_crawler_error_handling():
    """A navigation failure yields no items; the browser is still closed."""
    crawler = ScholarCrawler(query="WebGPU", source="Scholar")
    with patch("src.crawlers.scholar_crawler.async_playwright") as playwright_patch, \
            patch("src.crawlers.scholar_crawler.Stealth") as stealth_patch:
        playwright = AsyncMock()
        browser = AsyncMock()
        context = AsyncMock()
        page = AsyncMock()
        playwright_patch.return_value.__aenter__.return_value = playwright
        playwright.chromium.launch.return_value = browser
        browser.new_context.return_value = context
        context.new_page.return_value = page

        stealth = MagicMock()
        stealth.apply_stealth_async = AsyncMock()
        stealth_patch.return_value = stealth

        # page.goto blows up mid-crawl.
        page.goto.side_effect = Exception("Browser crash")

        assert await crawler.fetch_latest() == []
        browser.close.assert_called_once()
def test_factory_registration():
    """The factory must instantiate SciRate and Scholar crawlers from YAML."""
    fake_config = {
        'crawlers': [
            {'type': 'scirate', 'url': 'https://scirate.com/', 'source': 'SciRate'},
            {'type': 'scholar', 'url': 'https://scholar.google.com/', 'source': 'Scholar', 'query': 'AI'}
        ]
    }
    # Stub out both the file read and the YAML parse.
    with patch("builtins.open", MagicMock()), \
            patch("yaml.safe_load", return_value=fake_config):
        crawlers = CrawlerFactory.load_from_yaml("fake_path.yml")

    assert len(crawlers) == 2
    scirate, scholar = crawlers
    assert isinstance(scirate, SciRateCrawler)
    assert isinstance(scholar, ScholarCrawler)
    assert scholar.query == 'AI'