AI-Trend-Scout/tests/crawlers/test_scirate_crawler.py
Artur Mukhamadiev a304ae9cd2 feat(crawler): add academic and research sources
- Implement crawlers for Microsoft Research, SciRate, and Google Scholar
- Use Playwright with stealth for Google Scholar anti-bot mitigation
- Update CrawlerFactory to support new research crawler types
- Add unit and integration tests for all academic sources with high coverage
2026-03-16 00:11:15 +03:00

91 lines
3.1 KiB
Python

import pytest
from unittest.mock import AsyncMock, patch, MagicMock
from datetime import datetime, timezone
from src.crawlers.scirate_crawler import SciRateCrawler
from src.crawlers.dto import NewsItemDTO
# Canned SciRate listing markup: one `li.paper-list-item` entry with a
# relative arXiv href, plus one `div.paper` entry with an absolute URL.
# NOTE(review): not referenced by any test visible in this chunk —
# presumably used further down the file or by a sibling module; confirm
# before removing.
MOCK_SCIRATE_HTML = """
<html>
<body>
<li class="paper-list-item">
<div class="title">
<a href="/arxiv/2403.12345">Attention is Really All You Need</a>
</div>
<div class="authors">Vaswani et al.</div>
<div class="abstract">This paper presents a new architecture...</div>
</li>
<div class="paper">
<div class="title">
<a href="https://example.com/paper2">Another Paper</a>
</div>
<div class="authors">Doe and Smith</div>
<div class="abstract">Abstract of another paper.</div>
</div>
</body>
</html>
"""
@pytest.mark.asyncio
async def test_scirate_crawler_fetch_latest():
    """Happy path for fetch_latest.

    Verifies that entries whose title contains no link are skipped, that
    relative hrefs are resolved against the base URL, and that a
    list-valued href (the ``isinstance(link, list)`` branch) is handled.
    """
    crawler = SciRateCrawler("https://scirate.com/", "SciRate")
    # Fixture: three list items — the middle one has no <a> in its title.
    listing_html = """
<html>
<body>
<li class="paper-list-item">
<div class="title"><a href="/arxiv/1">Paper 1</a></div>
</li>
<li class="paper-list-item">
<div class="title">No link here</div>
</li>
<li class="paper-list-item">
<div class="title"><a href="/arxiv/3">Paper 3</a></div>
</li>
</body>
</html>
"""
    with patch("aiohttp.ClientSession.get") as get_mock:
        fake_response = AsyncMock()
        fake_response.status = 200
        fake_response.text.return_value = listing_html
        get_mock.return_value.__aenter__.return_value = fake_response
        # BS4 never returns a list for an href on its own, so Tag.get is
        # patched directly to fake one for the first linked item.
        href_patch = patch(
            "bs4.element.Tag.get",
            side_effect=[["/arxiv/list"], "/arxiv/3"],
        )
        with href_patch:
            results = await crawler.fetch_latest()
    assert len(results) == 2
    assert results[0].url == "https://scirate.com/arxiv/list"
    assert results[1].url == "https://scirate.com/arxiv/3"
@pytest.mark.asyncio
async def test_scirate_crawler_exception():
    """fetch_latest swallows parsing failures and returns an empty list."""
    crawler = SciRateCrawler()
    with patch("aiohttp.ClientSession.get") as get_mock:
        response_stub = AsyncMock()
        response_stub.status = 200
        response_stub.text.return_value = "<html></html>"
        get_mock.return_value.__aenter__.return_value = response_stub
        # Make parse_html raise so the crawler's error path is exercised.
        failing_parse = patch.object(
            SciRateCrawler,
            "parse_html",
            side_effect=Exception("Parsing failed"),
        )
        with failing_parse:
            result = await crawler.fetch_latest()
    assert result == []
@pytest.mark.asyncio
async def test_scirate_crawler_error():
    """A non-200 HTTP status makes fetch_latest yield an empty list."""
    crawler = SciRateCrawler()
    with patch("aiohttp.ClientSession.get") as get_mock:
        error_response = AsyncMock()
        error_response.status = 500
        get_mock.return_value.__aenter__.return_value = error_response
        assert await crawler.fetch_latest() == []