AI-Trend-Scout/tests/crawlers/test_factory.py
Artur Mukhamadiev a304ae9cd2 feat(crawler): add academic and research sources
- Implement crawlers for Microsoft Research, SciRate, and Google Scholar
- Use Playwright with stealth for Google Scholar anti-bot mitigation
- Update CrawlerFactory to support new research crawler types
- Add unit and integration tests for all academic sources with high coverage
2026-03-16 00:11:15 +03:00

135 lines
5.1 KiB
Python

import pytest
import yaml
from unittest.mock import patch, mock_open
from src.crawlers.factory import CrawlerFactory
from src.crawlers.rss_crawler import RSSCrawler
from src.crawlers.playwright_crawler import PlaywrightCrawler
from src.crawlers.scirate_crawler import SciRateCrawler
from src.crawlers.scholar_crawler import ScholarCrawler
from src.crawlers.microsoft_research_crawler import MicrosoftResearchCrawler
from src.crawlers.static_crawler import StaticCrawler
from src.crawlers.skolkovo_crawler import SkolkovoCrawler
from src.crawlers.cppconf_crawler import CppConfCrawler
# One entry per supported crawler type; used to verify that CrawlerFactory
# maps each `type` key to the corresponding crawler class.
# NOTE(review): nesting indentation reconstructed — the scraped source had it
# stripped, which would make this YAML unparseable as a list of mappings.
VALID_YAML = """
crawlers:
  - type: rss
    url: "https://example.com/rss"
    source: "Example RSS"
  - type: playwright
    url: "https://example.com/playwright"
    source: "Example Playwright"
    selector: ".item"
  - type: scirate
    url: "https://scirate.com/"
    source: "SciRate"
  - type: scholar
    query: "AI"
    source: "Google Scholar"
  - type: microsoft_research
    url: "https://example.com/msr"
    source: "Microsoft Research"
"""
# First entry uses an unregistered type ("unknown") and must be skipped with a
# warning; the second is a valid rss entry that must still be constructed.
# NOTE(review): nesting indentation reconstructed from the stripped source.
INVALID_TYPE_YAML = """
crawlers:
  - type: unknown
    url: "https://example.com/unknown"
    source: "Unknown"
  - type: rss
    url: "https://example.com/rss"
    source: "Example RSS"
"""
# Deliberately unparseable YAML: a flow-sequence token where a mapping key is
# expected triggers a scanner/parser error inside the factory.
MALFORMED_YAML = """
crawlers:
  - type: rss
  [ missing stuff ]
"""
# Two defective entries: the first lacks its mandatory url, the second lacks a
# type (so the factory sees type=None). Both must be rejected with warnings.
# NOTE(review): nesting indentation reconstructed from the stripped source.
MISSING_KEYS_YAML = """
crawlers:
  - type: rss
    # url is missing
    source: "Missing URL"
  - url: "https://example.com/no-type"
    source: "Missing Type"
"""
def test_load_from_yaml_valid():
    """A well-formed config yields one correctly-typed crawler per entry, in order."""
    with patch("builtins.open", mock_open(read_data=VALID_YAML)):
        crawlers = CrawlerFactory.load_from_yaml("dummy.yml")

    expected_types = [
        RSSCrawler,
        PlaywrightCrawler,
        SciRateCrawler,
        ScholarCrawler,
        MicrosoftResearchCrawler,
    ]
    assert len(crawlers) == len(expected_types)
    for crawler, expected in zip(crawlers, expected_types):
        assert isinstance(crawler, expected)
def test_load_from_yaml_unknown_type():
    """An unrecognized `type` entry is skipped with a warning; valid entries survive."""
    with patch("builtins.open", mock_open(read_data=INVALID_TYPE_YAML)), \
            patch("src.crawlers.factory.logger") as mock_logger:
        crawlers = CrawlerFactory.load_from_yaml("dummy.yml")

    # Only the rss entry should have produced a crawler.
    assert len(crawlers) == 1
    assert isinstance(crawlers[0], RSSCrawler)
    mock_logger.warning.assert_called_with("Unknown crawler type: unknown")
def test_load_from_yaml_malformed():
    """Unparseable YAML produces an empty result and an error log entry."""
    with patch("builtins.open", mock_open(read_data=MALFORMED_YAML)), \
            patch("src.crawlers.factory.logger") as mock_logger:
        result = CrawlerFactory.load_from_yaml("dummy.yml")

    assert result == []
    # The factory is expected to catch the YAML scanner/parser error and log it.
    mock_logger.error.assert_called()
def test_load_from_yaml_missing_keys():
    """Entries missing mandatory keys (url, type) are rejected with warnings."""
    with patch("builtins.open", mock_open(read_data=MISSING_KEYS_YAML)), \
            patch("src.crawlers.factory.logger") as mock_logger:
        crawlers = CrawlerFactory.load_from_yaml("dummy.yml")

    # Neither defective entry should yield a crawler:
    # the first has no url, the second has no type (seen as None).
    assert len(crawlers) == 0

    logged = [call.args[0] for call in mock_logger.warning.call_args_list]
    assert any("Missing mandatory fields" in message for message in logged)
    assert any("Unknown crawler type: None" in message for message in logged)
def test_load_from_yaml_file_not_found():
    """A missing config file is handled gracefully: empty result plus error log."""
    # No patch of builtins.open — the real open raises FileNotFoundError here.
    with patch("src.crawlers.factory.logger") as mock_logger:
        crawlers = CrawlerFactory.load_from_yaml("non_existent_file_12345.yml")

    assert crawlers == []
    mock_logger.error.assert_called()
def test_load_from_yaml_empty_file():
    """An empty config file yields no crawlers and a specific warning."""
    with patch("builtins.open", mock_open(read_data="")), \
            patch("src.crawlers.factory.logger") as mock_logger:
        crawlers = CrawlerFactory.load_from_yaml("empty.yml")

    assert crawlers == []
    mock_logger.warning.assert_called_with("Invalid or empty configuration in empty.yml")
def test_integration_load_actual_config():
    """Integration check: the real src/crawlers.yml loads without warnings or errors.

    Also verifies every loaded crawler is a known type and carries its
    mandatory fields. Requires the repository root as the working directory.
    """
    with patch("src.crawlers.factory.logger") as mock_logger:
        crawlers = CrawlerFactory.load_from_yaml("src/crawlers.yml")

    assert len(crawlers) > 0
    mock_logger.warning.assert_not_called()
    mock_logger.error.assert_not_called()

    known_types = (
        RSSCrawler,
        PlaywrightCrawler,
        StaticCrawler,
        SkolkovoCrawler,
        CppConfCrawler,
        SciRateCrawler,
        ScholarCrawler,
        MicrosoftResearchCrawler,
    )
    for crawler in crawlers:
        assert isinstance(crawler, known_types)
        # ScholarCrawler is query-driven, so the url check applies to the rest.
        if not isinstance(crawler, ScholarCrawler):
            assert crawler.url.startswith("http")
        assert crawler.source
        if isinstance(crawler, PlaywrightCrawler):
            # According to src/crawlers.yml, all playwright crawlers currently
            # define a selector.
            assert crawler.selector