# AI-Trend-Scout/tests/test_crawlers.py
# (110 lines, 3.5 KiB, Python)
import pytest
import os
from unittest.mock import MagicMock, AsyncMock, patch
from src.crawlers.factory import CrawlerFactory
from src.crawlers.rss_crawler import RSSCrawler
from src.crawlers.playwright_crawler import PlaywrightCrawler
from src.crawlers.dto import NewsItemDTO
def test_crawler_factory_load_real_file():
    """The shipped crawlers.yml must load and contain both crawler types."""
    # The real config file has to be present in the repo.
    assert os.path.exists("src/crawlers.yml")

    crawlers = CrawlerFactory.load_from_yaml("src/crawlers.yml")
    assert len(crawlers) > 0

    # Both crawler flavours should be represented among the loaded instances.
    loaded_types = {type(crawler) for crawler in crawlers}
    assert RSSCrawler in loaded_types
    assert PlaywrightCrawler in loaded_types
@pytest.mark.asyncio
async def test_rss_crawler_fetch_latest():
    """RSSCrawler should parse a one-item feed into a single news item."""
    feed_url = "https://example.com/rss"
    source_name = "Test Source"
    crawler = RSSCrawler(feed_url, source_name)

    mock_xml = """<?xml version="1.0" encoding="UTF-8" ?>
<rss version="2.0">
<channel>
<title>RSS Title</title>
<item>
<title>Test News</title>
<link>https://example.com/news1</link>
<description>Test Description</description>
<pubDate>Mon, 01 Jan 2024 00:00:00 +0000</pubDate>
</item>
</channel>
</rss>
"""

    with patch("aiohttp.ClientSession.get") as mock_get:
        fake_response = AsyncMock()
        fake_response.status = 200
        # `await response.text()` resolves to the canned feed body.
        fake_response.text.return_value = mock_xml
        fake_response.raise_for_status = MagicMock()
        # `async with session.get(...)` yields our fake response.
        mock_get.return_value.__aenter__.return_value = fake_response

        items = await crawler.fetch_latest()

    assert len(items) == 1
    assert items[0].title == "Test News"
    assert items[0].url == "https://example.com/news1"
    assert items[0].source == source_name
@pytest.mark.asyncio
async def test_playwright_crawler_fetch_latest():
    """PlaywrightCrawler should turn one matched DOM element into one item."""
    page_url = "https://example.com/news"
    source_name = "Test Playwright"
    css_selector = ".news-item"
    crawler = PlaywrightCrawler(page_url, source_name, css_selector)

    with patch("src.crawlers.playwright_crawler.async_playwright") as mock_playwright:
        # `async with async_playwright() as p` yields this fake driver.
        fake_driver = AsyncMock()
        mock_playwright.return_value.__aenter__.return_value = fake_driver

        fake_browser = AsyncMock()
        fake_driver.chromium.launch.return_value = fake_browser
        fake_page = AsyncMock()
        fake_browser.new_page.return_value = fake_page

        matched_element = AsyncMock()
        matched_element.evaluate.return_value = False  # Not an 'a' tag
        inner_link = AsyncMock()
        inner_link.inner_text.return_value = "Test News"
        inner_link.get_attribute.return_value = "/news1"
        matched_element.query_selector.return_value = inner_link
        fake_page.query_selector_all.return_value = [matched_element]

        items = await crawler.fetch_latest()

    assert len(items) == 1
    assert items[0].title == "Test News"
    # The relative href should be resolved against the crawl URL.
    assert items[0].url == "https://example.com/news1"
    assert items[0].source == source_name
def test_crawler_factory_invalid_config(tmp_path):
    """Malformed crawler entries should be skipped, not raise."""
    # NOTE(review): YAML indentation reconstructed — original formatting was
    # lost; verify the fixture still represents the intended three bad entries.
    config_file = tmp_path / "invalid_crawlers.yml"
    config_file.write_text("""
crawlers:
  - type: unknown
    url: "https://example.com"
    source: "Unknown"
  - type: rss
    url: "https://example.com"
    # missing source
  - not_a_dict
""")

    crawlers = CrawlerFactory.load_from_yaml(str(config_file))
    # Unknown type, missing field, and a non-mapping entry are all rejected.
    assert len(crawlers) == 0
def test_crawler_factory_empty_file(tmp_path):
    """An empty YAML file should produce no crawlers."""
    empty_config = tmp_path / "empty.yml"
    empty_config.write_text("")

    loaded = CrawlerFactory.load_from_yaml(str(empty_config))
    assert len(loaded) == 0