# AI-Trend-Scout/tests/crawlers/test_factory.py
#
# Tests for CrawlerFactory configuration loading.
# History: commit 87af585e1b (Artur Mukhamadiev, 2026-03-15) —
# "Refactor crawlers configuration and add new sources":
#   - Move hard-coded crawlers from main.py to crawlers.yml
#   - Use CrawlerFactory to load configuration
#   - Add 9 new sources: C++ Russia, ICRA 2025, Technoprom, INNOPROM,
#     Hannover Messe, RSF, Skolkovo, Horizon Europe, Addmeto

import pytest
import yaml
from unittest.mock import patch, mock_open
from src.crawlers.factory import CrawlerFactory
from src.crawlers.rss_crawler import RSSCrawler
from src.crawlers.playwright_crawler import PlaywrightCrawler
# --- YAML fixtures for CrawlerFactory.load_from_yaml ------------------------
# NOTE(review): sequence items must be indented under "crawlers:" and their
# fields aligned under the dash; without that indentation yaml.safe_load
# cannot produce the two-entry list the tests below assert on.

# Well-formed config: one RSS crawler and one Playwright crawler (with selector).
VALID_YAML = """
crawlers:
  - type: rss
    url: "https://example.com/rss"
    source: "Example RSS"
  - type: playwright
    url: "https://example.com/playwright"
    source: "Example Playwright"
    selector: ".item"
"""

# First entry has an unsupported type; the factory is expected to skip it
# with a warning and still build the valid RSS crawler.
INVALID_TYPE_YAML = """
crawlers:
  - type: unknown
    url: "https://example.com/unknown"
    source: "Unknown"
  - type: rss
    url: "https://example.com/rss"
    source: "Example RSS"
"""

# Syntactically broken YAML: loading must fail and be logged as an error.
MALFORMED_YAML = """
crawlers:
  - type: rss
  [ missing stuff ]
"""

# Entries missing mandatory keys (url in the first, type in the second):
# both should be skipped with warnings.
MISSING_KEYS_YAML = """
crawlers:
  - type: rss
    # url is missing
    source: "Missing URL"
  - url: "https://example.com/no-type"
    source: "Missing Type"
"""
def test_load_from_yaml_valid():
    """A well-formed config yields correctly typed crawlers in file order."""
    with patch("builtins.open", mock_open(read_data=VALID_YAML)):
        loaded = CrawlerFactory.load_from_yaml("dummy.yml")

    assert len(loaded) == 2

    rss, pw = loaded
    assert isinstance(rss, RSSCrawler)
    assert (rss.url, rss.source) == ("https://example.com/rss", "Example RSS")

    assert isinstance(pw, PlaywrightCrawler)
    assert pw.url == "https://example.com/playwright"
    assert pw.source == "Example Playwright"
    assert pw.selector == ".item"
def test_load_from_yaml_unknown_type():
    """Entries with an unrecognized type are skipped and logged as warnings."""
    with patch("builtins.open", mock_open(read_data=INVALID_TYPE_YAML)), \
         patch("src.crawlers.factory.logger") as log:
        loaded = CrawlerFactory.load_from_yaml("dummy.yml")
        # Only the valid RSS entry survives; the unknown one is warned about.
        assert len(loaded) == 1
        assert isinstance(loaded[0], RSSCrawler)
        log.warning.assert_called_with("Unknown crawler type: unknown")
def test_load_from_yaml_malformed():
    """Broken YAML syntax yields an empty list and an error log entry."""
    with patch("builtins.open", mock_open(read_data=MALFORMED_YAML)), \
         patch("src.crawlers.factory.logger") as log:
        assert CrawlerFactory.load_from_yaml("dummy.yml") == []
        # The parse failure (yaml.ScannerError or similar) must be logged.
        assert log.error.called
def test_load_from_yaml_missing_keys():
    """Entries lacking mandatory keys (url, type) are skipped with warnings."""
    with patch("builtins.open", mock_open(read_data=MISSING_KEYS_YAML)), \
         patch("src.crawlers.factory.logger") as log:
        loaded = CrawlerFactory.load_from_yaml("dummy.yml")
        # Missing url -> "Missing mandatory fields" warning;
        # missing type -> falls into the unknown-type branch with type None.
        assert not loaded
        messages = [call.args[0] for call in log.warning.call_args_list]
        assert any("Missing mandatory fields" in msg for msg in messages)
        assert any("Unknown crawler type: None" in msg for msg in messages)
def test_load_from_yaml_file_not_found():
    """A non-existent config path is reported via logger.error and yields []."""
    with patch("src.crawlers.factory.logger") as log:
        # open() is deliberately NOT patched: the real filesystem lookup fails.
        loaded = CrawlerFactory.load_from_yaml("non_existent_file_12345.yml")
    assert loaded == []
    assert log.error.called
def test_load_from_yaml_empty_file():
    """An empty YAML file triggers the invalid-configuration warning."""
    with patch("builtins.open", mock_open(read_data="")), \
         patch("src.crawlers.factory.logger") as log:
        assert CrawlerFactory.load_from_yaml("empty.yml") == []
        log.warning.assert_called_with("Invalid or empty configuration in empty.yml")
def test_integration_load_actual_config():
    """The real src/crawlers.yml loads cleanly: no warnings, no errors."""
    with patch("src.crawlers.factory.logger") as log:
        crawlers = CrawlerFactory.load_from_yaml("src/crawlers.yml")

    assert crawlers, "expected at least one crawler from src/crawlers.yml"
    log.warning.assert_not_called()
    log.error.assert_not_called()

    # Every loaded crawler must be a known type with its mandatory fields set.
    for crawler in crawlers:
        assert isinstance(crawler, (RSSCrawler, PlaywrightCrawler))
        assert crawler.url.startswith("http")
        assert crawler.source
        if isinstance(crawler, PlaywrightCrawler):
            # All playwright entries in src/crawlers.yml currently define a selector.
            assert crawler.selector