import pytest
from unittest.mock import AsyncMock, patch, MagicMock
from datetime import datetime, timezone

from src.crawlers.scholar_crawler import ScholarCrawler
from src.crawlers.dto import NewsItemDTO


def _mock_browser_stack(mock_playwright):
    """Wire up the standard async-playwright mock chain.

    Configures ``async_playwright() -> chromium.launch() -> new_context()
    -> new_page()`` on *mock_playwright* and returns the
    ``(browser, context, page)`` AsyncMocks so individual tests can tweak
    behavior (page content, result elements, injected errors).
    """
    mock_p = AsyncMock()
    # `async with async_playwright() as p:` resolves through __aenter__.
    mock_playwright.return_value.__aenter__.return_value = mock_p
    browser = AsyncMock()
    mock_p.chromium.launch.return_value = browser
    context = AsyncMock()
    browser.new_context.return_value = context
    page = AsyncMock()
    context.new_page.return_value = page
    return browser, context, page


@pytest.mark.asyncio
async def test_scholar_crawler_fetch_latest():
    """A well-formed result row is parsed into exactly one item."""
    query = "Large Language Models"
    source = "Google Scholar"
    crawler = ScholarCrawler(query=query, source=source)

    with patch("src.crawlers.scholar_crawler.async_playwright") as mock_playwright, \
         patch("src.crawlers.scholar_crawler.Stealth") as mock_stealth_class:
        mock_stealth = MagicMock()
        mock_stealth.apply_stealth_async = AsyncMock()
        mock_stealth_class.return_value = mock_stealth

        _, _, mock_page = _mock_browser_stack(mock_playwright)
        # Mock content to avoid CAPTCHA detection in crawler.
        mock_page.content.return_value = "Results"

        # One mocked Scholar result row.
        mock_res = AsyncMock()
        # Title element with link.
        mock_title_el = AsyncMock()
        mock_title_el.inner_text.return_value = "LLM Paper Title"
        mock_title_el.get_attribute.return_value = "https://arxiv.org/abs/2401.00001"
        # Dispatch sub-element lookups by CSS selector; unknown selectors -> None.
        mock_res.query_selector.side_effect = lambda selector: {
            ".gs_rt a": mock_title_el,
            ".gs_rs": AsyncMock(inner_text=AsyncMock(return_value="This is a snippet")),
            ".gs_a": AsyncMock(inner_text=AsyncMock(return_value="Authors et al.")),
        }.get(selector)

        # Citation count link.
        mock_citation_link = AsyncMock()
        mock_citation_link.inner_text.return_value = "Cited by 123"
        mock_res.query_selector_all.return_value = [mock_citation_link]
        mock_page.query_selector_all.return_value = [mock_res]

        items = await crawler.fetch_latest()

        assert len(items) == 1
        assert items[0].title == "LLM Paper Title"


@pytest.mark.asyncio
async def test_scholar_crawler_no_title():
    """A result row without a title link is skipped entirely."""
    crawler = ScholarCrawler()

    with patch("src.crawlers.scholar_crawler.async_playwright") as mock_playwright:
        _, _, mock_page = _mock_browser_stack(mock_playwright)
        mock_page.content.return_value = "Results"

        # Result item without title link: every selector lookup misses.
        mock_res = AsyncMock()
        mock_res.query_selector.return_value = None
        mock_page.query_selector_all.return_value = [mock_res]

        items = await crawler.fetch_latest()

        assert len(items) == 0


@pytest.mark.asyncio
async def test_scholar_crawler_exception():
    """A browser-level failure is swallowed and yields an empty list."""
    crawler = ScholarCrawler()

    with patch("src.crawlers.scholar_crawler.async_playwright") as mock_playwright:
        mock_browser, _, _ = _mock_browser_stack(mock_playwright)
        # Force exception; side_effect takes precedence over return_value.
        mock_browser.new_context.side_effect = Exception("Browser error")

        items = await crawler.fetch_latest()

        assert items == []


@pytest.mark.asyncio
async def test_scholar_crawler_captcha():
    """A CAPTCHA interstitial aborts the crawl with no items."""
    crawler = ScholarCrawler()

    with patch("src.crawlers.scholar_crawler.async_playwright") as mock_playwright:
        _, _, mock_page = _mock_browser_stack(mock_playwright)
        # Simulate CAPTCHA page content.
        mock_page.content.return_value = "Please solve this CAPTCHA"

        items = await crawler.fetch_latest()

        assert items == []


@pytest.mark.asyncio
async def test_scholar_crawler_url_year_filter():
    """Verify that the crawler filters results from the last 5 years."""
    current_year = datetime.now().year
    expected_year = current_year - 5
    query = "Edge AI"
    crawler = ScholarCrawler(query=query)

    # The URL should include the lower year bound filter.
    assert f"&as_ylo={expected_year}" in crawler.url