"""Tests for ``CrawlerFactory`` and the RSS / Playwright crawler implementations."""

import os
from unittest.mock import AsyncMock, MagicMock, patch

import pytest

from src.crawlers.dto import NewsItemDTO
from src.crawlers.factory import CrawlerFactory
from src.crawlers.playwright_crawler import PlaywrightCrawler
from src.crawlers.rss_crawler import RSSCrawler


def test_crawler_factory_load_real_file():
    """The bundled ``src/crawlers.yml`` yields at least one crawler of each kind.

    The path is relative, so this test has to be run from the directory that
    contains ``src/``.
    """
    # Ensure the file exists
    assert os.path.exists("src/crawlers.yml")

    crawlers = CrawlerFactory.load_from_yaml("src/crawlers.yml")
    assert len(crawlers) > 0

    # Check if we have both types
    crawler_types = [type(crawler) for crawler in crawlers]
    assert RSSCrawler in crawler_types
    assert PlaywrightCrawler in crawler_types


@pytest.mark.asyncio
async def test_rss_crawler_fetch_latest():
    """``RSSCrawler.fetch_latest`` turns a one-item feed into one news item.

    ``aiohttp.ClientSession.get`` is patched, so no network request is made.
    """
    rss_url = "https://example.com/rss"
    source = "Test Source"
    crawler = RSSCrawler(rss_url, source)

    # Minimal RSS 2.0 feed with a single <item>; the assertions below rely on
    # that item's <title> and <link>.
    mock_xml = """
    <rss version="2.0">
      <channel>
        <title>RSS Title</title>
        <item>
          <title>Test News</title>
          <link>https://example.com/news1</link>
          <description>Test Description</description>
          <pubDate>Mon, 01 Jan 2024 00:00:00 +0000</pubDate>
        </item>
      </channel>
    </rss>
    """

    with patch("aiohttp.ClientSession.get") as mock_get:
        mock_response = AsyncMock()
        mock_response.status = 200
        mock_response.text.return_value = mock_xml
        # raise_for_status is called synchronously, so it must not be a coroutine.
        mock_response.raise_for_status = MagicMock()
        # ``session.get(...)`` is used as an async context manager.
        mock_get.return_value.__aenter__.return_value = mock_response

        items = await crawler.fetch_latest()

    assert len(items) == 1
    assert items[0].title == "Test News"
    assert items[0].url == "https://example.com/news1"
    assert items[0].source == source


@pytest.mark.asyncio
async def test_playwright_crawler_fetch_latest():
    """``PlaywrightCrawler.fetch_latest`` builds an item from a matched element.

    The whole Playwright chain (context manager, browser, page, element, link)
    is mocked. The link's relative href ``/news1`` is expected to come back as
    an absolute URL on the crawler's own host.
    """
    url = "https://example.com/news"
    source = "Test Playwright"
    selector = ".news-item"
    crawler = PlaywrightCrawler(url, source, selector)

    with patch("src.crawlers.playwright_crawler.async_playwright") as mock_playwright:
        mock_p = AsyncMock()
        mock_playwright.return_value.__aenter__.return_value = mock_p

        mock_browser = AsyncMock()
        mock_p.chromium.launch.return_value = mock_browser

        mock_page = AsyncMock()
        mock_browser.new_page.return_value = mock_page

        mock_element = AsyncMock()
        mock_element.evaluate.return_value = False  # Not an 'a' tag

        mock_link = AsyncMock()
        mock_link.inner_text.return_value = "Test News"
        mock_link.get_attribute.return_value = "/news1"

        mock_element.query_selector.return_value = mock_link
        mock_page.query_selector_all.return_value = [mock_element]

        items = await crawler.fetch_latest()

    assert len(items) == 1
    assert items[0].title == "Test News"
    assert items[0].url == "https://example.com/news1"
    assert items[0].source == source


def test_crawler_factory_invalid_config(tmp_path):
    """Entries with an unknown type, a missing field, or a non-dict shape are skipped."""
    config_file = tmp_path / "invalid_crawlers.yml"
    # The YAML is kept flush-left so its indentation is exactly what is parsed.
    config_file.write_text(
        """
crawlers:
  - type: unknown
    url: "https://example.com"
    source: "Unknown"
  - type: rss
    url: "https://example.com"
    # missing source
  - not_a_dict
"""
    )

    crawlers = CrawlerFactory.load_from_yaml(str(config_file))
    assert len(crawlers) == 0


def test_crawler_factory_empty_file(tmp_path):
    """An empty YAML file produces no crawlers."""
    config_file = tmp_path / "empty.yml"
    config_file.write_text("")

    crawlers = CrawlerFactory.load_from_yaml(str(config_file))
    assert len(crawlers) == 0