"""Tests for CrawlerFactory YAML configuration loading.

Covers the academic-source crawlers:
- Microsoft Research, SciRate, and Google Scholar crawlers
- Playwright with stealth for Google Scholar anti-bot mitigation
- CrawlerFactory support for the new research crawler types
- Unit and integration tests for all academic sources
"""
# Standard library
from unittest.mock import mock_open, patch

# Third-party
import pytest
import yaml

# Project
from src.crawlers.cppconf_crawler import CppConfCrawler
from src.crawlers.factory import CrawlerFactory
from src.crawlers.microsoft_research_crawler import MicrosoftResearchCrawler
from src.crawlers.playwright_crawler import PlaywrightCrawler
from src.crawlers.rss_crawler import RSSCrawler
from src.crawlers.scholar_crawler import ScholarCrawler
from src.crawlers.scirate_crawler import SciRateCrawler
from src.crawlers.skolkovo_crawler import SkolkovoCrawler
from src.crawlers.static_crawler import StaticCrawler
|
# ---------------------------------------------------------------------------
# YAML fixtures fed to CrawlerFactory.load_from_yaml through a mocked open().
# Nesting restored: the flattened form was not valid YAML, so every test
# below would have hit the error path instead of the behavior it asserts.
# ---------------------------------------------------------------------------

# Well-formed config: one entry per supported research crawler type.
VALID_YAML = """
crawlers:
  - type: rss
    url: "https://example.com/rss"
    source: "Example RSS"
  - type: playwright
    url: "https://example.com/playwright"
    source: "Example Playwright"
    selector: ".item"
  - type: scirate
    url: "https://scirate.com/"
    source: "SciRate"
  - type: scholar
    query: "AI"
    source: "Google Scholar"
  - type: microsoft_research
    url: "https://example.com/msr"
    source: "Microsoft Research"
"""

# One unsupported type followed by one valid entry: the factory is
# expected to warn on the unknown type and still build the RSS crawler.
INVALID_TYPE_YAML = """
crawlers:
  - type: unknown
    url: "https://example.com/unknown"
    source: "Unknown"
  - type: rss
    url: "https://example.com/rss"
    source: "Example RSS"
"""

# Deliberately broken YAML: loading should log an error and return [].
MALFORMED_YAML = """
crawlers:
  - type: rss
  [ missing stuff ]
"""

# Entries missing mandatory keys (url / type): both should be skipped
# with warnings rather than constructed.
MISSING_KEYS_YAML = """
crawlers:
  - type: rss
    # url is missing
    source: "Missing URL"
  - url: "https://example.com/no-type"
    source: "Missing Type"
"""
|
|
|
|
def test_load_from_yaml_valid():
    """A config listing every supported type yields one crawler per entry, in order."""
    with patch("builtins.open", mock_open(read_data=VALID_YAML)):
        loaded = CrawlerFactory.load_from_yaml("dummy.yml")

    expected_order = (
        RSSCrawler,
        PlaywrightCrawler,
        SciRateCrawler,
        ScholarCrawler,
        MicrosoftResearchCrawler,
    )
    assert len(loaded) == 5
    # Config order must be preserved by the factory.
    for crawler, crawler_cls in zip(loaded, expected_order):
        assert isinstance(crawler, crawler_cls)
|
|
|
|
|
|
def test_load_from_yaml_unknown_type():
    """An entry with an unrecognised type is skipped with a warning; valid entries survive."""
    with patch("builtins.open", mock_open(read_data=INVALID_TYPE_YAML)), \
            patch("src.crawlers.factory.logger") as mock_logger:
        loaded = CrawlerFactory.load_from_yaml("dummy.yml")

    # Only the rss entry should have produced a crawler.
    assert len(loaded) == 1
    assert isinstance(loaded[0], RSSCrawler)
    mock_logger.warning.assert_called_with("Unknown crawler type: unknown")
|
|
|
|
def test_load_from_yaml_malformed():
    """Syntactically broken YAML logs an error and yields no crawlers."""
    with patch("builtins.open", mock_open(read_data=MALFORMED_YAML)), \
            patch("src.crawlers.factory.logger") as mock_logger:
        loaded = CrawlerFactory.load_from_yaml("dummy.yml")

    assert loaded == []
    # Error log should be called due to yaml.ScannerError or similar
    mock_logger.error.assert_called()
|
|
|
|
def test_load_from_yaml_missing_keys():
    """Entries lacking mandatory keys are all skipped, each with its own warning."""
    with patch("builtins.open", mock_open(read_data=MISSING_KEYS_YAML)), \
            patch("src.crawlers.factory.logger") as mock_logger:
        loaded = CrawlerFactory.load_from_yaml("dummy.yml")

    # First item missing url -> skipped with warning
    # Second item missing type -> warning in else block
    assert len(loaded) == 0

    # Collect every warning message the factory emitted.
    messages = [c.args[0] for c in mock_logger.warning.call_args_list]
    assert any("Missing mandatory fields" in msg for msg in messages)
    assert any("Unknown crawler type: None" in msg for msg in messages)
|
|
|
|
def test_load_from_yaml_file_not_found():
    """A missing config file logs an error and returns an empty list."""
    with patch("src.crawlers.factory.logger") as mock_logger:
        # No open() patch on purpose: the real call must fail on a path
        # that does not exist.
        loaded = CrawlerFactory.load_from_yaml("non_existent_file_12345.yml")
        assert loaded == []
        mock_logger.error.assert_called()
|
|
|
|
def test_load_from_yaml_empty_file():
    """An empty YAML document is reported as an invalid configuration."""
    with patch("builtins.open", mock_open(read_data="")), \
            patch("src.crawlers.factory.logger") as mock_logger:
        loaded = CrawlerFactory.load_from_yaml("empty.yml")
        assert loaded == []
        mock_logger.warning.assert_called_with(
            "Invalid or empty configuration in empty.yml"
        )
|
|
|
|
def test_integration_load_actual_config():
    # This test verifies that the real src/crawlers.yml can be loaded without errors or warnings.
    with patch("src.crawlers.factory.logger") as mock_logger:
        loaded = CrawlerFactory.load_from_yaml("src/crawlers.yml")

    assert len(loaded) > 0
    mock_logger.warning.assert_not_called()
    mock_logger.error.assert_not_called()

    known_types = (
        RSSCrawler,
        PlaywrightCrawler,
        StaticCrawler,
        SkolkovoCrawler,
        CppConfCrawler,
        SciRateCrawler,
        ScholarCrawler,
        MicrosoftResearchCrawler,
    )
    # Verify types and mandatory fields for all loaded crawlers
    for crawler in loaded:
        assert isinstance(crawler, known_types)
        # Scholar crawlers are query-driven, so only the rest carry a url.
        if not isinstance(crawler, ScholarCrawler):
            assert crawler.url.startswith("http")
        assert crawler.source
        if isinstance(crawler, PlaywrightCrawler):
            # According to src/crawlers.yml, all playwright crawlers currently have selectors
            assert crawler.selector