- Move hard-coded crawlers from main.py to crawlers.yml
- Use CrawlerFactory to load configuration
- Add 9 new sources: C++ Russia, ICRA 2025, Technoprom, INNOPROM, Hannover Messe, RSF, Skolkovo, Horizon Europe, Addmeto
- Update task list

121 lines · 4.5 KiB · Python
import pytest
|
|
import yaml
|
|
from unittest.mock import patch, mock_open
|
|
from src.crawlers.factory import CrawlerFactory
|
|
from src.crawlers.rss_crawler import RSSCrawler
|
|
from src.crawlers.playwright_crawler import PlaywrightCrawler
|
|
|
|
# Two well-formed crawler configs: one RSS entry and one Playwright entry
# (the latter carries the optional CSS selector).
VALID_YAML = """
crawlers:
  - type: rss
    url: "https://example.com/rss"
    source: "Example RSS"
  - type: playwright
    url: "https://example.com/playwright"
    source: "Example Playwright"
    selector: ".item"
"""

# One entry with an unrecognized crawler type followed by a valid RSS entry:
# the factory should skip the former with a warning and keep the latter.
INVALID_TYPE_YAML = """
crawlers:
  - type: unknown
    url: "https://example.com/unknown"
    source: "Unknown"
  - type: rss
    url: "https://example.com/rss"
    source: "Example RSS"
"""

# Deliberately broken YAML syntax (a stray flow sequence inside a block
# sequence) to exercise the factory's parse-error path.
MALFORMED_YAML = """
crawlers:
  - type: rss
  [ missing stuff ]
"""

# Entries missing mandatory fields: the first lacks `url`, the second lacks
# `type`. The factory should drop both and warn about each.
MISSING_KEYS_YAML = """
crawlers:
  - type: rss
    # url is missing
    source: "Missing URL"
  - url: "https://example.com/no-type"
    source: "Missing Type"
"""

def test_load_from_yaml_valid():
    """A valid config yields correctly-typed, fully-populated crawlers."""
    with patch("builtins.open", mock_open(read_data=VALID_YAML)):
        crawlers = CrawlerFactory.load_from_yaml("dummy.yml")

    assert len(crawlers) == 2
    rss, pw = crawlers

    assert isinstance(rss, RSSCrawler)
    assert rss.url == "https://example.com/rss"
    assert rss.source == "Example RSS"

    assert isinstance(pw, PlaywrightCrawler)
    assert pw.url == "https://example.com/playwright"
    assert pw.source == "Example Playwright"
    assert pw.selector == ".item"

def test_load_from_yaml_unknown_type():
    """Entries with an unrecognized type are skipped with a warning."""
    with patch("builtins.open", mock_open(read_data=INVALID_TYPE_YAML)), \
            patch("src.crawlers.factory.logger") as mock_logger:
        crawlers = CrawlerFactory.load_from_yaml("dummy.yml")

    # Only the valid RSS entry survives.
    assert len(crawlers) == 1
    assert isinstance(crawlers[0], RSSCrawler)
    mock_logger.warning.assert_called_with("Unknown crawler type: unknown")

def test_load_from_yaml_malformed():
    """Malformed YAML produces an empty list and an error log entry."""
    with patch("builtins.open", mock_open(read_data=MALFORMED_YAML)), \
            patch("src.crawlers.factory.logger") as mock_logger:
        crawlers = CrawlerFactory.load_from_yaml("dummy.yml")

    assert crawlers == []
    # Error log should be called due to yaml.ScannerError or similar
    mock_logger.error.assert_called()

def test_load_from_yaml_missing_keys():
    """Entries lacking mandatory keys are dropped, each with its own warning."""
    with patch("builtins.open", mock_open(read_data=MISSING_KEYS_YAML)), \
            patch("src.crawlers.factory.logger") as mock_logger:
        crawlers = CrawlerFactory.load_from_yaml("dummy.yml")

    # First item missing url -> skipped with warning
    # Second item missing type -> warning in else block
    assert len(crawlers) == 0

    # Check for warnings
    warning_messages = [c.args[0] for c in mock_logger.warning.call_args_list]
    assert any("Missing mandatory fields" in msg for msg in warning_messages)
    assert any("Unknown crawler type: None" in msg for msg in warning_messages)

def test_load_from_yaml_file_not_found():
    """A missing config file is reported via logger.error, not raised."""
    with patch("src.crawlers.factory.logger") as mock_logger:
        # No need to patch open here: the path simply does not exist.
        crawlers = CrawlerFactory.load_from_yaml("non_existent_file_12345.yml")
        assert crawlers == []
        mock_logger.error.assert_called()

def test_load_from_yaml_empty_file():
    """An empty config file yields no crawlers plus a specific warning."""
    with patch("builtins.open", mock_open(read_data="")), \
            patch("src.crawlers.factory.logger") as mock_logger:
        crawlers = CrawlerFactory.load_from_yaml("empty.yml")
        assert crawlers == []
        mock_logger.warning.assert_called_with("Invalid or empty configuration in empty.yml")

def test_integration_load_actual_config():
    """The real src/crawlers.yml loads without any warnings or errors."""
    with patch("src.crawlers.factory.logger") as mock_logger:
        crawlers = CrawlerFactory.load_from_yaml("src/crawlers.yml")

    assert len(crawlers) > 0
    mock_logger.warning.assert_not_called()
    mock_logger.error.assert_not_called()

    # Verify types and mandatory fields for all loaded crawlers
    for crawler in crawlers:
        assert isinstance(crawler, (RSSCrawler, PlaywrightCrawler))
        assert crawler.url.startswith("http")
        assert crawler.source
        if isinstance(crawler, PlaywrightCrawler):
            # According to src/crawlers.yml, all playwright crawlers currently have selectors
            assert crawler.selector