AI-Trend-Scout/tests/crawlers/test_microsoft_research_crawler.py
Artur Mukhamadiev a304ae9cd2 feat(crawler): add academic and research sources
- Implement crawlers for Microsoft Research, SciRate, and Google Scholar
- Use Playwright with stealth for Google Scholar anti-bot mitigation
- Update CrawlerFactory to support new research crawler types
- Add unit and integration tests for all academic sources with high coverage
2026-03-16 00:11:15 +03:00

39 lines
1.4 KiB
Python

import pytest
from unittest.mock import AsyncMock, patch, MagicMock
from datetime import datetime, timezone
from src.crawlers.microsoft_research_crawler import MicrosoftResearchCrawler
from src.crawlers.dto import NewsItemDTO
# Minimal single-item RSS 2.0 document served as the mocked HTTP response body.
# pubDate uses the RFC 822 date format; 10 Mar 2026 falls on a Tuesday, so the
# weekday token is "Tue" to keep the stamp self-consistent for strict parsers.
MOCK_MSR_RSS = """<?xml version="1.0" encoding="UTF-8" ?>
<rss version="2.0">
<channel>
<title>Microsoft Research</title>
<item>
<title>MSR Paper Title</title>
<link>https://www.microsoft.com/en-us/research/publication/msr-paper/</link>
<description>MSR Paper Description</description>
<pubDate>Tue, 10 Mar 2026 10:00:00 GMT</pubDate>
</item>
</channel>
</rss>
"""
@pytest.mark.asyncio
async def test_microsoft_research_crawler_fetch_latest():
    """Check that ``fetch_latest`` maps the mocked RSS feed to a single item.

    ``aiohttp.ClientSession.get`` is replaced with a mock whose async context
    manager yields a canned response carrying ``MOCK_MSR_RSS`` as its text.
    The one returned item is then checked field by field.
    """
    # Canned response: awaiting ``.text()`` resolves to the fixture feed.
    fake_response = AsyncMock()
    fake_response.status = 200
    fake_response.text.return_value = MOCK_MSR_RSS
    # Children of an AsyncMock are async by default; a MagicMock keeps
    # ``raise_for_status()`` a plain synchronous call.
    fake_response.raise_for_status = MagicMock()

    crawler = MicrosoftResearchCrawler()
    with patch("aiohttp.ClientSession.get") as mock_get:
        # ``async with session.get(...) as resp`` receives the canned response.
        mock_get.return_value.__aenter__.return_value = fake_response
        items = await crawler.fetch_latest()

    assert len(items) == 1
    first = items[0]
    expected_timestamp = datetime(2026, 3, 10, 10, 0, tzinfo=timezone.utc)
    assert first.title == "MSR Paper Title"
    assert first.url == "https://www.microsoft.com/en-us/research/publication/msr-paper/"
    assert first.source == "Microsoft Research"
    assert first.timestamp == expected_timestamp