AI-Trend-Scout/tests/crawlers/test_scholar_crawler.py
Artur Mukhamadiev a304ae9cd2 feat(crawler): add academic and research sources
- Implement crawlers for Microsoft Research, SciRate, and Google Scholar
- Use Playwright with stealth for Google Scholar anti-bot mitigation
- Update CrawlerFactory to support new research crawler types
- Add unit and integration tests for all academic sources with high coverage
2026-03-16 00:11:15 +03:00

116 lines
4.4 KiB
Python

import pytest
from unittest.mock import AsyncMock, patch, MagicMock
from datetime import datetime, timezone
from src.crawlers.scholar_crawler import ScholarCrawler
from src.crawlers.dto import NewsItemDTO
@pytest.mark.asyncio
async def test_scholar_crawler_fetch_latest():
    """One well-formed search result is returned as a single item with its title.

    Playwright and the stealth helper are both replaced with mocks, so no
    browser is started and no network request is made.
    """
    search_query = "Large Language Models"
    source_name = "Google Scholar"
    crawler = ScholarCrawler(query=search_query, source=source_name)

    with patch("src.crawlers.scholar_crawler.async_playwright") as playwright_factory, \
            patch("src.crawlers.scholar_crawler.Stealth") as stealth_cls:
        # Stealth() yields an object whose apply_stealth_async can be awaited.
        stealth_cls.return_value = MagicMock()
        stealth_cls.return_value.apply_stealth_async = AsyncMock()

        # Title link of the single search result.
        title_link = AsyncMock()
        title_link.inner_text.return_value = "LLM Paper Title"
        title_link.get_attribute.return_value = "https://arxiv.org/abs/2401.00001"

        def select(selector):
            """Return the mocked element for a known selector, else None."""
            if selector == ".gs_rt a":
                return title_link
            if selector == ".gs_rs":
                return AsyncMock(inner_text=AsyncMock(return_value="This is a snippet"))
            if selector == ".gs_a":
                return AsyncMock(inner_text=AsyncMock(return_value="Authors et al."))
            return None

        # Citation link found via query_selector_all on the result.
        cited_by_link = AsyncMock()
        cited_by_link.inner_text.return_value = "Cited by 123"

        result = AsyncMock()
        result.query_selector.side_effect = select
        result.query_selector_all.return_value = [cited_by_link]

        # Page: plain HTML so the crawler's CAPTCHA detection is not triggered.
        page = AsyncMock()
        page.content.return_value = "<html><body>Results</body></html>"
        page.query_selector_all.return_value = [result]

        # Wire page -> context -> browser -> playwright, leaf first.
        context = AsyncMock()
        context.new_page.return_value = page
        browser = AsyncMock()
        browser.new_context.return_value = context
        playwright = AsyncMock()
        playwright.chromium.launch.return_value = browser
        playwright_factory.return_value.__aenter__.return_value = playwright

        fetched = await crawler.fetch_latest()

        assert len(fetched) == 1
        assert fetched[0].title == "LLM Paper Title"
@pytest.mark.asyncio
async def test_scholar_crawler_no_title():
    """A search result without a title link produces no items.

    ``Stealth`` is patched in the same way as in the fetch_latest test so the
    real stealth library never runs against the mocked page. Without the
    patch, an error raised there could make this test pass through the
    crawler's error handling instead of the no-title path.
    """
    crawler = ScholarCrawler()
    with patch("src.crawlers.scholar_crawler.async_playwright") as mock_playwright, \
            patch("src.crawlers.scholar_crawler.Stealth") as mock_stealth_class:
        # Stealth() returns an object with an awaitable apply_stealth_async.
        mock_stealth = MagicMock()
        mock_stealth.apply_stealth_async = AsyncMock()
        mock_stealth_class.return_value = mock_stealth

        mock_p = AsyncMock()
        mock_playwright.return_value.__aenter__.return_value = mock_p
        mock_browser = AsyncMock()
        mock_p.chromium.launch.return_value = mock_browser
        mock_context = AsyncMock()
        mock_browser.new_context.return_value = mock_context
        mock_page = AsyncMock()
        mock_context.new_page.return_value = mock_page
        # Plain HTML so the crawler's CAPTCHA detection is not triggered.
        mock_page.content.return_value = "<html><body>Results</body></html>"

        # Result item without title link: every selector lookup yields None.
        mock_res = AsyncMock()
        mock_res.query_selector.return_value = None
        mock_page.query_selector_all.return_value = [mock_res]

        items = await crawler.fetch_latest()

    assert len(items) == 0
    # Prove the result was actually inspected, i.e. the empty list comes from
    # the missing title and not from an earlier failure.
    mock_res.query_selector.assert_awaited()
@pytest.mark.asyncio
async def test_scholar_crawler_exception():
    """An exception raised while creating the browser context yields an empty list."""
    crawler = ScholarCrawler()
    with patch("src.crawlers.scholar_crawler.async_playwright") as playwright_factory:
        # Browser whose new_context() call fails when awaited.
        failing_browser = AsyncMock()
        failing_browser.new_context.side_effect = Exception("Browser error")

        playwright = AsyncMock()
        playwright.chromium.launch.return_value = failing_browser
        playwright_factory.return_value.__aenter__.return_value = playwright

        fetched = await crawler.fetch_latest()

        assert fetched == []
@pytest.mark.asyncio
async def test_scholar_crawler_captcha():
    """Page HTML containing a CAPTCHA prompt makes ``fetch_latest`` return ``[]``.

    ``Stealth`` is patched in the same way as in the fetch_latest test so the
    real stealth library never runs against the mocked page. Without the
    patch, an error raised there could produce the same empty result through
    the crawler's error handling, and the CAPTCHA branch would go untested.
    """
    crawler = ScholarCrawler()
    with patch("src.crawlers.scholar_crawler.async_playwright") as mock_playwright, \
            patch("src.crawlers.scholar_crawler.Stealth") as mock_stealth_class:
        # Stealth() returns an object with an awaitable apply_stealth_async.
        mock_stealth = MagicMock()
        mock_stealth.apply_stealth_async = AsyncMock()
        mock_stealth_class.return_value = mock_stealth

        mock_p = AsyncMock()
        mock_playwright.return_value.__aenter__.return_value = mock_p
        mock_browser = AsyncMock()
        mock_p.chromium.launch.return_value = mock_browser
        mock_context = AsyncMock()
        mock_browser.new_context.return_value = mock_context
        mock_page = AsyncMock()
        mock_context.new_page.return_value = mock_page

        # Simulate CAPTCHA
        mock_page.content.return_value = "<html><body>Please solve this CAPTCHA</body></html>"

        items = await crawler.fetch_latest()

    assert items == []
    # The page HTML must have been read for the CAPTCHA check to be the
    # reason for the empty result.
    mock_page.content.assert_awaited()