AI-Trend-Scout/tests/crawlers/test_factory.py
Artur Mukhamadiev a49df98191 fix(tests): QA fixes for test suite verification
:Release Notes:
- Fix AsyncMock usage in mock_sqlite_store fixture (test_chroma_store.py)
- Add GitHubTrendingCrawler to isinstance check (test_factory.py)
- Replace live network calls with mocks (test_new_crawlers.py)

:Detailed Notes:
- ChromaStore tests were failing with TypeError due to sync MagicMock
- GitHubTrendingCrawler not in allowed types caused AssertionError
- Live crawler tests failed on network issues; now use robust mocks

:Testing Performed:
- python3 -m pytest tests/ -v (112 passed, 0 failed)

:QA Notes:
- All 112 tests passed after fixes
- Verified by Python QA Engineer subagent

:Issues Addressed:
- TypeError: 'list' object can't be awaited
- AssertionError: GitHubTrendingCrawler not in allowed types
- Live network tests flaky/failing

Change-Id: I3c77a186b5fcca6778c7bbb102c50bc6951bb37a
2026-03-30 13:54:53 +03:00

136 lines
5.2 KiB
Python

import pytest
import yaml
from unittest.mock import patch, mock_open
from src.crawlers.factory import CrawlerFactory
from src.crawlers.rss_crawler import RSSCrawler
from src.crawlers.playwright_crawler import PlaywrightCrawler
from src.crawlers.scirate_crawler import SciRateCrawler
from src.crawlers.scholar_crawler import ScholarCrawler
from src.crawlers.microsoft_research_crawler import MicrosoftResearchCrawler
from src.crawlers.static_crawler import StaticCrawler
from src.crawlers.skolkovo_crawler import SkolkovoCrawler
from src.crawlers.cppconf_crawler import CppConfCrawler
from src.crawlers.github_crawler import GitHubTrendingCrawler
# Well-formed configuration covering every crawler type this module asserts on;
# the factory should build exactly one crawler instance per entry, in order.
VALID_YAML = """
crawlers:
- type: rss
url: "https://example.com/rss"
source: "Example RSS"
- type: playwright
url: "https://example.com/playwright"
source: "Example Playwright"
selector: ".item"
- type: scirate
url: "https://scirate.com/"
source: "SciRate"
- type: scholar
query: "AI"
source: "Google Scholar"
- type: microsoft_research
url: "https://example.com/msr"
source: "Microsoft Research"
"""
# First entry uses an unregistered type ("unknown") and must be skipped with a
# warning; the second entry is valid and must still be instantiated.
INVALID_TYPE_YAML = """
crawlers:
- type: unknown
url: "https://example.com/unknown"
source: "Unknown"
- type: rss
url: "https://example.com/rss"
source: "Example RSS"
"""
# Syntactically invalid YAML — parsing should fail and the factory should
# return an empty list while logging an error.
MALFORMED_YAML = """
crawlers:
- type: rss
[ missing stuff ]
"""
# Two structurally incomplete entries: the first lacks "url", the second lacks
# "type"; both must be dropped, each producing a distinct warning.
MISSING_KEYS_YAML = """
crawlers:
- type: rss
# url is missing
source: "Missing URL"
- url: "https://example.com/no-type"
source: "Missing Type"
"""
def test_load_from_yaml_valid():
    """A well-formed config yields one crawler per entry, in declaration order."""
    with patch("builtins.open", mock_open(read_data=VALID_YAML)):
        result = CrawlerFactory.load_from_yaml("dummy.yml")

    expected_types = [
        RSSCrawler,
        PlaywrightCrawler,
        SciRateCrawler,
        ScholarCrawler,
        MicrosoftResearchCrawler,
    ]
    assert len(result) == len(expected_types)
    for crawler, expected in zip(result, expected_types):
        assert isinstance(crawler, expected)
def test_load_from_yaml_unknown_type():
    """Entries with an unregistered type are skipped with a warning; valid ones still load."""
    with patch("builtins.open", mock_open(read_data=INVALID_TYPE_YAML)), \
            patch("src.crawlers.factory.logger") as mock_logger:
        result = CrawlerFactory.load_from_yaml("dummy.yml")
        assert len(result) == 1
        assert isinstance(result[0], RSSCrawler)
        mock_logger.warning.assert_called_with("Unknown crawler type: unknown")
def test_load_from_yaml_malformed():
    """Syntactically broken YAML produces no crawlers and logs an error."""
    with patch("builtins.open", mock_open(read_data=MALFORMED_YAML)), \
            patch("src.crawlers.factory.logger") as mock_logger:
        result = CrawlerFactory.load_from_yaml("dummy.yml")
        assert result == []
        # The parse failure (e.g. yaml.ScannerError) must surface as an error log.
        mock_logger.error.assert_called()
def test_load_from_yaml_missing_keys():
    """Entries lacking mandatory keys (url or type) are dropped, each with a warning."""
    with patch("builtins.open", mock_open(read_data=MISSING_KEYS_YAML)), \
            patch("src.crawlers.factory.logger") as mock_logger:
        result = CrawlerFactory.load_from_yaml("dummy.yml")
        # Both entries are invalid, so nothing is instantiated.
        assert not result
        messages = [c.args[0] for c in mock_logger.warning.call_args_list]
        # Missing url -> "Missing mandatory fields"; missing type -> type is None.
        assert any("Missing mandatory fields" in m for m in messages)
        assert any("Unknown crawler type: None" in m for m in messages)
def test_load_from_yaml_file_not_found():
    """A missing config file yields no crawlers and is reported via logger.error."""
    # open() is deliberately NOT patched: the real FileNotFoundError path is exercised.
    with patch("src.crawlers.factory.logger") as mock_logger:
        result = CrawlerFactory.load_from_yaml("non_existent_file_12345.yml")
        assert result == []
        mock_logger.error.assert_called()
def test_load_from_yaml_empty_file():
    """An empty config file is treated as invalid and warned about by filename."""
    with patch("builtins.open", mock_open(read_data="")), \
            patch("src.crawlers.factory.logger") as mock_logger:
        result = CrawlerFactory.load_from_yaml("empty.yml")
        assert result == []
        mock_logger.warning.assert_called_with("Invalid or empty configuration in empty.yml")
def test_integration_load_actual_config():
    """Smoke test: the real src/crawlers.yml loads without any warnings or errors."""
    allowed_types = (
        RSSCrawler,
        PlaywrightCrawler,
        StaticCrawler,
        SkolkovoCrawler,
        CppConfCrawler,
        SciRateCrawler,
        ScholarCrawler,
        MicrosoftResearchCrawler,
        GitHubTrendingCrawler,
    )
    with patch("src.crawlers.factory.logger") as mock_logger:
        loaded = CrawlerFactory.load_from_yaml("src/crawlers.yml")
        assert loaded
        mock_logger.warning.assert_not_called()
        mock_logger.error.assert_not_called()
        # Every loaded crawler must be a known type with its mandatory fields set.
        for crawler in loaded:
            assert isinstance(crawler, allowed_types)
            # ScholarCrawler is query-driven, so it has no url to validate.
            if not isinstance(crawler, ScholarCrawler):
                assert crawler.url.startswith("http")
            assert crawler.source
            if isinstance(crawler, PlaywrightCrawler):
                # According to src/crawlers.yml, all playwright crawlers currently have selectors
                assert crawler.selector