- Implement crawlers for Microsoft Research, SciRate, and Google Scholar.
- Use Playwright with stealth for Google Scholar anti-bot mitigation.
- Update CrawlerFactory to support new research crawler types.
- Add unit and integration tests for all academic sources with high coverage.

91 lines · 3.1 KiB · Python
import pytest
|
|
from unittest.mock import AsyncMock, patch, MagicMock
|
|
from datetime import datetime, timezone
|
|
from src.crawlers.scirate_crawler import SciRateCrawler
|
|
from src.crawlers.dto import NewsItemDTO
|
|
|
|
# Sample SciRate listing markup. Covers both item container variants the
# parser may encounter (`li.paper-list-item` and `div.paper`) as well as a
# relative arXiv link and an absolute external link.
MOCK_SCIRATE_HTML = """
<html>
<body>
<li class="paper-list-item">
<div class="title">
<a href="/arxiv/2403.12345">Attention is Really All You Need</a>
</div>
<div class="authors">Vaswani et al.</div>
<div class="abstract">This paper presents a new architecture...</div>
</li>
<div class="paper">
<div class="title">
<a href="https://example.com/paper2">Another Paper</a>
</div>
<div class="authors">Doe and Smith</div>
<div class="abstract">Abstract of another paper.</div>
</div>
</body>
</html>
"""
|
|
|
|
@pytest.mark.asyncio
async def test_scirate_crawler_fetch_latest():
    """Happy path for ``fetch_latest``.

    Verifies that entries whose title has no link are skipped, and that a
    list-valued href (forced via mocking) is still resolved to an absolute
    SciRate URL.
    """
    crawler = SciRateCrawler("https://scirate.com/", "SciRate")

    # Three entries: the middle one has no <a> inside its title and must be
    # dropped by the parser.
    page = """
<html>
<body>
<li class="paper-list-item">
<div class="title"><a href="/arxiv/1">Paper 1</a></div>
</li>
<li class="paper-list-item">
<div class="title">No link here</div>
</li>
<li class="paper-list-item">
<div class="title"><a href="/arxiv/3">Paper 3</a></div>
</li>
</body>
</html>
"""

    with patch("aiohttp.ClientSession.get") as mock_get:
        response = AsyncMock()
        response.text.return_value = page
        response.status = 200
        mock_get.return_value.__aenter__.return_value = response

        # BeautifulSoup normally never returns a list from Tag.get("href"),
        # so patch Tag.get to exercise the crawler's
        # `isinstance(link, list)` branch.  NOTE(review): patching
        # bs4.element.Tag.get globally is fragile — the side_effect list
        # must match the exact number of .get calls the parser makes.
        with patch("bs4.element.Tag.get", side_effect=[["/arxiv/list"], "/arxiv/3"]):
            items = await crawler.fetch_latest()

    # Two items survive; both URLs are absolute.
    assert len(items) == 2
    assert items[0].url == "https://scirate.com/arxiv/list"
    assert items[1].url == "https://scirate.com/arxiv/3"
|
|
|
|
@pytest.mark.asyncio
async def test_scirate_crawler_exception():
    """``fetch_latest`` swallows parser exceptions and returns an empty list."""
    crawler = SciRateCrawler()

    with patch("aiohttp.ClientSession.get") as mock_get:
        response = AsyncMock()
        response.text.return_value = "<html></html>"
        response.status = 200
        mock_get.return_value.__aenter__.return_value = response

        # Make parsing blow up so the crawler's error-handling path runs.
        with patch.object(SciRateCrawler, 'parse_html', side_effect=Exception("Parsing failed")):
            assert await crawler.fetch_latest() == []
|
|
|
|
@pytest.mark.asyncio
async def test_scirate_crawler_error():
    """A non-200 HTTP status yields no items from ``fetch_latest``."""
    crawler = SciRateCrawler()

    with patch("aiohttp.ClientSession.get") as mock_get:
        response = AsyncMock()
        response.status = 500
        mock_get.return_value.__aenter__.return_value = response

        assert await crawler.fetch_latest() == []
|