"""Unit tests for the SciRate and Google Scholar crawlers and the crawler factory.

Network and browser access are fully mocked: aiohttp sessions via
``unittest.mock.patch`` on ``aiohttp.ClientSession.get``, and Playwright /
playwright-stealth via patches on the ``src.crawlers.scholar_crawler`` module
attributes.
"""
import pytest
import aiohttp
from unittest.mock import AsyncMock, patch, MagicMock
from datetime import datetime, timezone
from src.crawlers.scirate_crawler import SciRateCrawler
from src.crawlers.scholar_crawler import ScholarCrawler
from src.crawlers.factory import CrawlerFactory
from src.crawlers.dto import NewsItemDTO


@pytest.mark.asyncio
async def test_scirate_crawler_parse_html():
    """parse_html extracts title, authors, abstract and arxiv URL per entry."""
    crawler = SciRateCrawler()
    # NOTE(review): this fixture contains no arxiv IDs or markup, yet the
    # assertions below check "arxiv/2403.12345" / ".../arxiv/2403.67890" in the
    # parsed URLs — the sample looks like it lost its HTML tags at some point;
    # confirm against the real SciRate fixture/markup.
    sample_html = """
  • Quantum Supremacy in the Kitchen
    John Doe, Jane Smith
    We demonstrate quantum supremacy by perfectly boiling an egg.
  • AI for Cats
    Cat Lover
    A deep learning approach to understanding meows.
    """
    items = crawler.parse_html(sample_html)
    assert len(items) == 2
    assert items[0].title == "Quantum Supremacy in the Kitchen"
    assert "arxiv/2403.12345" in items[0].url
    assert "John Doe, Jane Smith" in items[0].content_text
    assert "boiling an egg" in items[0].content_text
    assert items[0].source == "SciRate"
    assert items[1].title == "AI for Cats"
    assert items[1].url == "https://scirate.com/arxiv/2403.67890"
    assert "Cat Lover" in items[1].content_text
    assert "meows" in items[1].content_text


@pytest.mark.asyncio
async def test_scirate_crawler_fetch_latest():
    """fetch_latest returns parsed items on HTTP 200; an empty bullet yields no item."""
    crawler = SciRateCrawler()
    sample_html = """
  • Quantum Supremacy
  • """
    with patch("aiohttp.ClientSession.get") as mock_get:
        mock_response = AsyncMock()
        mock_response.status = 200
        # AsyncMock makes .text() awaitable and resolve to sample_html.
        mock_response.text.return_value = sample_html
        # `async with session.get(...)` enters via __aenter__; route it to our response.
        mock_get.return_value.__aenter__.return_value = mock_response
        items = await crawler.fetch_latest()
        assert len(items) == 1
        assert items[0].title == "Quantum Supremacy"


@pytest.mark.asyncio
async def test_scirate_crawler_fetch_error():
    """fetch_latest returns an empty list on a non-200 (404) response."""
    crawler = SciRateCrawler()
    with patch("aiohttp.ClientSession.get") as mock_get:
        mock_response = AsyncMock()
        mock_response.status = 404
        mock_get.return_value.__aenter__.return_value = mock_response
        items = await crawler.fetch_latest()
        assert items == []


@pytest.mark.asyncio
async def test_scholar_crawler_fetch_latest():
    """fetch_latest scrapes a Scholar result (title, url, snippet, metadata, citations)."""
    crawler = ScholarCrawler(query="WebGPU", source="Scholar")
    with patch("src.crawlers.scholar_crawler.async_playwright") as mock_playwright, \
         patch("src.crawlers.scholar_crawler.Stealth") as mock_stealth_class:
        # Playwright chain: async_playwright() -> chromium.launch() -> new_context() -> new_page()
        mock_p = AsyncMock()
        mock_playwright.return_value.__aenter__.return_value = mock_p
        mock_browser = AsyncMock()
        mock_p.chromium.launch.return_value = mock_browser
        mock_context = AsyncMock()
        mock_browser.new_context.return_value = mock_context
        mock_page = AsyncMock()
        mock_context.new_page.return_value = mock_page
        # Page content without CAPTCHA markers, so scraping proceeds.
        mock_page.content.return_value = "Results"
        # Mock Stealth instance and method
        mock_stealth_instance = MagicMock()
        mock_stealth_instance.apply_stealth_async = AsyncMock()
        mock_stealth_class.return_value = mock_stealth_instance
        # Mock result elements
        mock_res = AsyncMock()
        mock_title_el = AsyncMock()
        mock_title_el.inner_text.return_value = "WebGPU Accelerated ML"
        mock_title_el.get_attribute.return_value = "https://arxiv.org/abs/2403.abc"
        mock_snippet_el = AsyncMock()
        mock_snippet_el.inner_text.return_value = "This paper discusses WebGPU..."
        mock_metadata_el = AsyncMock()
        mock_metadata_el.inner_text.return_value = "J. Smith, 2024 - arxiv.org"
        mock_citation_link = AsyncMock()
        mock_citation_link.inner_text.return_value = "Cited by 15"
        # Dispatch query_selector by Scholar's CSS selectors; unknown selectors -> None.
        mock_res.query_selector.side_effect = lambda selector: {
            ".gs_rt a": mock_title_el,
            ".gs_rs": mock_snippet_el,
            ".gs_a": mock_metadata_el
        }.get(selector)
        mock_res.query_selector_all.return_value = [mock_citation_link]
        mock_page.query_selector_all.return_value = [mock_res]
        items = await crawler.fetch_latest()
        assert len(items) == 1
        assert items[0].title == "WebGPU Accelerated ML"
        assert items[0].url == "https://arxiv.org/abs/2403.abc"
        assert "15" in items[0].content_text
        assert "J. Smith, 2024" in items[0].content_text
        assert items[0].source == "Scholar: WebGPU"
        # Browser must be closed even on the happy path.
        mock_browser.close.assert_called_once()


@pytest.mark.asyncio
async def test_scholar_crawler_captcha_detection():
    """fetch_latest bails out (empty list) and closes the browser when a CAPTCHA page is detected."""
    crawler = ScholarCrawler(query="WebGPU", source="Scholar")
    with patch("src.crawlers.scholar_crawler.async_playwright") as mock_playwright, \
         patch("src.crawlers.scholar_crawler.Stealth") as mock_stealth_class:
        mock_p = AsyncMock()
        mock_playwright.return_value.__aenter__.return_value = mock_p
        mock_browser = AsyncMock()
        mock_p.chromium.launch.return_value = mock_browser
        mock_context = AsyncMock()
        mock_browser.new_context.return_value = mock_context
        mock_page = AsyncMock()
        mock_context.new_page.return_value = mock_page
        # Mock Stealth instance and method
        mock_stealth_instance = MagicMock()
        mock_stealth_instance.apply_stealth_async = AsyncMock()
        mock_stealth_class.return_value = mock_stealth_instance
        # Simulate CAPTCHA in content
        mock_page.content.return_value = "Please verify you are not a robot CAPTCHA"
        items = await crawler.fetch_latest()
        assert items == []
        mock_browser.close.assert_called_once()


@pytest.mark.asyncio
async def test_scholar_crawler_error_handling():
    """fetch_latest swallows navigation exceptions, returns [] and still closes the browser."""
    crawler = ScholarCrawler(query="WebGPU", source="Scholar")
    with patch("src.crawlers.scholar_crawler.async_playwright") as mock_playwright, \
         patch("src.crawlers.scholar_crawler.Stealth") as mock_stealth_class:
        mock_p = AsyncMock()
        mock_playwright.return_value.__aenter__.return_value = mock_p
        mock_browser = AsyncMock()
        mock_p.chromium.launch.return_value = mock_browser
        mock_context = AsyncMock()
        mock_browser.new_context.return_value = mock_context
        mock_page = AsyncMock()
        mock_context.new_page.return_value = mock_page
        mock_stealth_instance = MagicMock()
        mock_stealth_instance.apply_stealth_async = AsyncMock()
        mock_stealth_class.return_value = mock_stealth_instance
        # Simulate exception during goto
        mock_page.goto.side_effect = Exception("Browser crash")
        items = await crawler.fetch_latest()
        assert items == []
        mock_browser.close.assert_called_once()


def test_factory_registration():
    """load_from_yaml builds the registered crawler types from a YAML config."""
    # Test if SciRate and Scholar are registered in the factory
    with patch("builtins.open", MagicMock()):
        with patch("yaml.safe_load") as mock_yaml:
            mock_yaml.return_value = {
                'crawlers': [
                    {'type': 'scirate', 'url': 'https://scirate.com/', 'source': 'SciRate'},
                    {'type': 'scholar', 'url': 'https://scholar.google.com/', 'source': 'Scholar', 'query': 'AI'}
                ]
            }
            crawlers = CrawlerFactory.load_from_yaml("fake_path.yml")
            assert len(crawlers) == 2
            assert isinstance(crawlers[0], SciRateCrawler)
            assert isinstance(crawlers[1], ScholarCrawler)
            # Scholar-specific config key must be forwarded to the instance.
            assert crawlers[1].query == 'AI'