- Implement crawlers for Microsoft Research, SciRate, and Google Scholar.
- Use Playwright with stealth for Google Scholar anti-bot mitigation.
- Update CrawlerFactory to support new research crawler types.
- Add unit and integration tests for all academic sources with high coverage.

91 lines · 3.1 KiB · Python
import pytest
|
|
from unittest.mock import AsyncMock, patch, MagicMock
|
|
from datetime import datetime, timezone
|
|
from src.crawlers.scirate_crawler import SciRateCrawler
|
|
from src.crawlers.dto import NewsItemDTO
|
|
|
|
# Sample SciRate listing markup. Covers both item container variants the
# parser may encounter (`li.paper-list-item` and `div.paper`) as well as a
# relative arXiv link and an absolute external link.
MOCK_SCIRATE_HTML = """
<html>
<body>
<li class="paper-list-item">
<div class="title">
<a href="/arxiv/2403.12345">Attention is Really All You Need</a>
</div>
<div class="authors">Vaswani et al.</div>
<div class="abstract">This paper presents a new architecture...</div>
</li>
<div class="paper">
<div class="title">
<a href="https://example.com/paper2">Another Paper</a>
</div>
<div class="authors">Doe and Smith</div>
<div class="abstract">Abstract of another paper.</div>
</div>
</body>
</html>
"""
|
|
|
|
@pytest.mark.asyncio
async def test_scirate_crawler_fetch_latest():
    """Happy path for ``fetch_latest``.

    Verifies that entries whose title has no link are skipped, and that a
    list-valued href (forced via mocking) is still resolved to an absolute
    SciRate URL.
    """
    crawler = SciRateCrawler("https://scirate.com/", "SciRate")

    # Three entries: the middle one has no <a> inside its title and must be
    # dropped by the parser.
    page = """
<html>
<body>
<li class="paper-list-item">
<div class="title"><a href="/arxiv/1">Paper 1</a></div>
</li>
<li class="paper-list-item">
<div class="title">No link here</div>
</li>
<li class="paper-list-item">
<div class="title"><a href="/arxiv/3">Paper 3</a></div>
</li>
</body>
</html>
"""

    with patch("aiohttp.ClientSession.get") as mock_get:
        response = AsyncMock()
        response.text.return_value = page
        response.status = 200
        mock_get.return_value.__aenter__.return_value = response

        # BeautifulSoup normally never returns a list from Tag.get("href"),
        # so patch Tag.get to exercise the crawler's
        # `isinstance(link, list)` branch.  NOTE(review): patching
        # bs4.element.Tag.get globally is fragile — the side_effect list
        # must match the exact number of .get calls the parser makes.
        with patch("bs4.element.Tag.get", side_effect=[["/arxiv/list"], "/arxiv/3"]):
            items = await crawler.fetch_latest()

    # Two items survive; both URLs are absolute.
    assert len(items) == 2
    assert items[0].url == "https://scirate.com/arxiv/list"
    assert items[1].url == "https://scirate.com/arxiv/3"
|
|
|
|
@pytest.mark.asyncio
async def test_scirate_crawler_exception():
    """``fetch_latest`` swallows parser exceptions and returns an empty list."""
    crawler = SciRateCrawler()

    with patch("aiohttp.ClientSession.get") as mock_get:
        response = AsyncMock()
        response.text.return_value = "<html></html>"
        response.status = 200
        mock_get.return_value.__aenter__.return_value = response

        # Make parsing blow up so the crawler's error-handling path runs.
        with patch.object(SciRateCrawler, 'parse_html', side_effect=Exception("Parsing failed")):
            assert await crawler.fetch_latest() == []
|
|
|
|
@pytest.mark.asyncio
async def test_scirate_crawler_error():
    """A non-200 HTTP status yields no items from ``fetch_latest``."""
    crawler = SciRateCrawler()

    with patch("aiohttp.ClientSession.get") as mock_get:
        response = AsyncMock()
        response.status = 500
        mock_get.return_value.__aenter__.return_value = response

        assert await crawler.fetch_latest() == []
|