"""Tests for ``CrawlerFactory.load_from_yaml``.

The YAML documents are fed to the factory by patching ``builtins.open``,
and the factory's module-level logger is patched so that emitted warnings
and errors can be asserted on.
"""

import pytest
import yaml
from unittest.mock import patch, mock_open

from src.crawlers.factory import CrawlerFactory
from src.crawlers.rss_crawler import RSSCrawler
from src.crawlers.playwright_crawler import PlaywrightCrawler

# Two well-formed entries, one per supported crawler type.
VALID_YAML = """
crawlers:
  - type: rss
    url: "https://example.com/rss"
    source: "Example RSS"
  - type: playwright
    url: "https://example.com/playwright"
    source: "Example Playwright"
    selector: ".item"
"""

# One entry with an unsupported type followed by one valid RSS entry.
INVALID_TYPE_YAML = """
crawlers:
  - type: unknown
    url: "https://example.com/unknown"
    source: "Unknown"
  - type: rss
    url: "https://example.com/rss"
    source: "Example RSS"
"""

# Not parseable as YAML: a flow sequence sits where a mapping key is expected.
MALFORMED_YAML = """
crawlers:
  - type: rss
    [ missing stuff ]
"""

# First entry lacks ``url``; second entry lacks ``type``.
MISSING_KEYS_YAML = """
crawlers:
  - type: rss
    # url is missing
    source: "Missing URL"
  - url: "https://example.com/no-type"
    source: "Missing Type"
"""

# Dotted path of the logger object used by the factory module.
_LOGGER_TARGET = "src.crawlers.factory.logger"


def _patched_open(text):
    """Return a patcher that makes ``open`` yield *text* as file content."""
    return patch("builtins.open", mock_open(read_data=text))


def test_load_from_yaml_valid():
    """A valid config yields one crawler per entry with fields populated."""
    with _patched_open(VALID_YAML):
        loaded = CrawlerFactory.load_from_yaml("dummy.yml")

        assert len(loaded) == 2
        rss, playwright = loaded

        assert isinstance(rss, RSSCrawler)
        assert rss.url == "https://example.com/rss"
        assert rss.source == "Example RSS"

        assert isinstance(playwright, PlaywrightCrawler)
        assert playwright.url == "https://example.com/playwright"
        assert playwright.source == "Example Playwright"
        assert playwright.selector == ".item"


def test_load_from_yaml_unknown_type():
    """An unknown type is skipped with a warning; valid entries still load."""
    with _patched_open(INVALID_TYPE_YAML), patch(_LOGGER_TARGET) as mock_logger:
        loaded = CrawlerFactory.load_from_yaml("dummy.yml")

        assert len(loaded) == 1
        assert isinstance(loaded[0], RSSCrawler)
        mock_logger.warning.assert_called_with("Unknown crawler type: unknown")


def test_load_from_yaml_malformed():
    """Unparseable YAML produces an empty list and an error log entry."""
    with _patched_open(MALFORMED_YAML), patch(_LOGGER_TARGET) as mock_logger:
        loaded = CrawlerFactory.load_from_yaml("dummy.yml")

        assert loaded == []
        # The parse failure (yaml.ScannerError or similar) should be
        # reported through the error log.
        mock_logger.error.assert_called()


def test_load_from_yaml_missing_keys():
    """Entries lacking ``url`` or ``type`` are dropped, each with a warning."""
    with _patched_open(MISSING_KEYS_YAML), patch(_LOGGER_TARGET) as mock_logger:
        loaded = CrawlerFactory.load_from_yaml("dummy.yml")

        # First item is missing its url  -> skipped with a warning.
        # Second item is missing its type -> warning from the fallback branch.
        assert len(loaded) == 0

        # Collect the first positional argument of every warning call.
        messages = [
            recorded.args[0]
            for recorded in mock_logger.warning.call_args_list
        ]
        assert any("Missing mandatory fields" in text for text in messages)
        assert any("Unknown crawler type: None" in text for text in messages)


def test_load_from_yaml_file_not_found():
    """A path that does not exist yields an empty list and an error log."""
    # ``open`` is deliberately left unpatched: the real filesystem lookup
    # of a non-existent file is what this test exercises.
    with patch(_LOGGER_TARGET) as mock_logger:
        loaded = CrawlerFactory.load_from_yaml("non_existent_file_12345.yml")

        assert loaded == []
        mock_logger.error.assert_called()


def test_load_from_yaml_empty_file():
    """An empty file yields an empty list and a specific warning."""
    with _patched_open(""), patch(_LOGGER_TARGET) as mock_logger:
        loaded = CrawlerFactory.load_from_yaml("empty.yml")

        assert loaded == []
        mock_logger.warning.assert_called_with(
            "Invalid or empty configuration in empty.yml"
        )


def test_integration_load_actual_config():
    """The real ``src/crawlers.yml`` loads without warnings or errors."""
    with patch(_LOGGER_TARGET) as mock_logger:
        loaded = CrawlerFactory.load_from_yaml("src/crawlers.yml")

        assert len(loaded) > 0
        mock_logger.warning.assert_not_called()
        mock_logger.error.assert_not_called()

        # Every loaded crawler must be of a known type and carry the
        # mandatory fields.
        for item in loaded:
            assert isinstance(item, (RSSCrawler, PlaywrightCrawler))
            assert item.url.startswith("http")
            assert item.source
            if isinstance(item, PlaywrightCrawler):
                # According to src/crawlers.yml, all playwright crawlers
                # currently have selectors.
                assert item.selector