- Implement crawlers for Microsoft Research, SciRate, and Google Scholar
- Use Playwright with stealth for Google Scholar anti-bot mitigation
- Update CrawlerFactory to support new research crawler types
- Add unit and integration tests for all academic sources with high coverage
39 lines
1.4 KiB
Python
39 lines
1.4 KiB
Python
import pytest
|
|
from unittest.mock import AsyncMock, patch, MagicMock
|
|
from datetime import datetime, timezone
|
|
from src.crawlers.microsoft_research_crawler import MicrosoftResearchCrawler
|
|
from src.crawlers.dto import NewsItemDTO
|
|
|
|
# Minimal RSS 2.0 document containing exactly one <item>; the test below
# serves it as the body of the mocked HTTP response.
# NOTE: the day name in <pubDate> must agree with the calendar date
# (10 Mar 2026 is a Tuesday) so that the fixture is a valid RFC 822 date even
# for parsers that validate the weekday.
MOCK_MSR_RSS = """<?xml version="1.0" encoding="UTF-8" ?>
<rss version="2.0">
  <channel>
    <title>Microsoft Research</title>
    <item>
      <title>MSR Paper Title</title>
      <link>https://www.microsoft.com/en-us/research/publication/msr-paper/</link>
      <description>MSR Paper Description</description>
      <pubDate>Tue, 10 Mar 2026 10:00:00 GMT</pubDate>
    </item>
  </channel>
</rss>
"""
|
|
|
|
@pytest.mark.asyncio
async def test_microsoft_research_crawler_fetch_latest():
    """Check that ``fetch_latest`` turns the canned RSS feed into one item.

    ``aiohttp.ClientSession.get`` is patched so that no network traffic
    happens; entering the patched call as an async context manager yields a
    fake response whose ``text()`` coroutine returns ``MOCK_MSR_RSS``.
    The single resulting item is then checked for title, URL, source and a
    timezone-aware UTC timestamp.
    """
    # Fake HTTP response: awaitable ``text()``, plain (non-async)
    # ``raise_for_status`` so a synchronous call does not leave an un-awaited
    # coroutine behind.
    fake_response = AsyncMock()
    fake_response.status = 200
    fake_response.text.return_value = MOCK_MSR_RSS
    fake_response.raise_for_status = MagicMock()

    crawler = MicrosoftResearchCrawler()

    with patch("aiohttp.ClientSession.get") as patched_get:
        # NOTE(review): presumably the crawler uses
        # ``async with session.get(...) as resp`` -- confirm against the
        # crawler implementation.
        patched_get.return_value.__aenter__.return_value = fake_response
        items = await crawler.fetch_latest()

    assert len(items) == 1

    item = items[0]
    expected_timestamp = datetime(2026, 3, 10, 10, 0, tzinfo=timezone.utc)
    assert item.title == "MSR Paper Title"
    assert item.url == "https://www.microsoft.com/en-us/research/publication/msr-paper/"
    assert item.source == "Microsoft Research"
    assert item.timestamp == expected_timestamp
|