- Implement crawlers for Microsoft Research, SciRate, and Google Scholar - Use Playwright with stealth for Google Scholar anti-bot mitigation - Update CrawlerFactory to support new research crawler types - Add unit and integration tests for all academic sources with high coverage
116 lines
4.4 KiB
Python
116 lines
4.4 KiB
Python
import pytest
|
|
from unittest.mock import AsyncMock, patch, MagicMock
|
|
from datetime import datetime, timezone
|
|
from src.crawlers.scholar_crawler import ScholarCrawler
|
|
from src.crawlers.dto import NewsItemDTO
|
|
|
|
@pytest.mark.asyncio
async def test_scholar_crawler_fetch_latest():
    """Happy path: a single mocked search result is turned into one item.

    Playwright and the Stealth helper are both patched inside the crawler
    module, so no real browser is started. The mocked page exposes exactly
    one result element with a title link, a snippet, an author line and a
    "Cited by" link; the test then checks the number of returned items and
    the title of the first one.
    """
    crawler = ScholarCrawler(query="Large Language Models", source="Google Scholar")

    # --- One fake search result, assembled leaf-first -------------------
    title_link = AsyncMock()
    title_link.inner_text.return_value = "LLM Paper Title"
    title_link.get_attribute.return_value = "https://arxiv.org/abs/2401.00001"

    def _select(selector):
        # The mapping is rebuilt on every lookup, so the snippet and author
        # nodes are fresh mocks each time; unknown selectors give None.
        children = {
            ".gs_rt a": title_link,
            ".gs_rs": AsyncMock(inner_text=AsyncMock(return_value="This is a snippet")),
            ".gs_a": AsyncMock(inner_text=AsyncMock(return_value="Authors et al.")),
        }
        return children.get(selector)

    cited_by_link = AsyncMock()
    cited_by_link.inner_text.return_value = "Cited by 123"

    result_el = AsyncMock()
    result_el.query_selector.side_effect = _select
    result_el.query_selector_all.return_value = [cited_by_link]

    # --- Fake page holding that result ----------------------------------
    page = AsyncMock()
    # Plain results markup; per the original note this keeps the crawler's
    # CAPTCHA detection from triggering.
    page.content.return_value = "<html><body>Results</body></html>"
    page.query_selector_all.return_value = [result_el]

    context = AsyncMock()
    context.new_page.return_value = page

    browser = AsyncMock()
    browser.new_context.return_value = context

    playwright = AsyncMock()
    playwright.chromium.launch.return_value = browser

    with patch("src.crawlers.scholar_crawler.async_playwright") as playwright_factory:
        with patch("src.crawlers.scholar_crawler.Stealth") as stealth_cls:
            # Stealth() returns an object whose apply_stealth_async is awaitable.
            stealth_cls.return_value = MagicMock(apply_stealth_async=AsyncMock())
            # `async with async_playwright() as p` yields the fake driver.
            playwright_factory.return_value.__aenter__.return_value = playwright

            items = await crawler.fetch_latest()

            assert len(items) == 1
            first_item = items[0]
            assert first_item.title == "LLM Paper Title"
|
|
|
|
@pytest.mark.asyncio
async def test_scholar_crawler_no_title():
    """A result element with no title link must not produce an item.

    The only result on the mocked page answers every ``query_selector``
    call with ``None``, and the crawler is expected to return zero items.
    """
    crawler = ScholarCrawler()

    # Result element for which every selector lookup comes back empty.
    untitled_result = AsyncMock()
    untitled_result.query_selector.return_value = None

    page = AsyncMock()
    page.content.return_value = "<html><body>Results</body></html>"
    page.query_selector_all.return_value = [untitled_result]

    context = AsyncMock()
    context.new_page.return_value = page

    browser = AsyncMock()
    browser.new_context.return_value = context

    playwright = AsyncMock()
    playwright.chromium.launch.return_value = browser

    with patch("src.crawlers.scholar_crawler.async_playwright") as playwright_factory:
        # `async with async_playwright() as p` yields the fake driver.
        playwright_factory.return_value.__aenter__.return_value = playwright

        items = await crawler.fetch_latest()
        assert len(items) == 0
|
|
|
|
@pytest.mark.asyncio
async def test_scholar_crawler_exception():
    """A failure while opening a browser context results in an empty list.

    ``browser.new_context`` is rigged to raise; the test asserts that
    ``fetch_latest`` still returns ``[]`` instead of propagating the error.
    """
    crawler = ScholarCrawler()

    browser = AsyncMock()
    # Injected failure: creating the context blows up.
    browser.new_context.side_effect = Exception("Browser error")

    playwright = AsyncMock()
    playwright.chromium.launch.return_value = browser

    with patch("src.crawlers.scholar_crawler.async_playwright") as playwright_factory:
        # `async with async_playwright() as p` yields the fake driver.
        playwright_factory.return_value.__aenter__.return_value = playwright

        items = await crawler.fetch_latest()
        assert items == []
|
|
|
|
@pytest.mark.asyncio
async def test_scholar_crawler_captcha():
    """A page whose HTML mentions a CAPTCHA yields no items.

    The mocked page content contains a CAPTCHA prompt, and the test asserts
    that ``fetch_latest`` returns an empty list in that situation.
    """
    crawler = ScholarCrawler()

    page = AsyncMock()
    # Markup standing in for a CAPTCHA interstitial.
    page.content.return_value = "<html><body>Please solve this CAPTCHA</body></html>"

    context = AsyncMock()
    context.new_page.return_value = page

    browser = AsyncMock()
    browser.new_context.return_value = context

    playwright = AsyncMock()
    playwright.chromium.launch.return_value = browser

    with patch("src.crawlers.scholar_crawler.async_playwright") as playwright_factory:
        # `async with async_playwright() as p` yields the fake driver.
        playwright_factory.return_value.__aenter__.return_value = playwright

        items = await crawler.fetch_latest()
        assert items == []
|