import pytest
from unittest.mock import patch, MagicMock
from datetime import datetime, timezone
from src.crawlers.github_crawler import GitHubTrendingCrawler
from src.crawlers.dto import NewsItemDTO
@pytest.fixture
def monthly_html():
    """Minimal GitHub Trending markup for the *monthly* timeframe.

    Contains repo1 (which also trends in the daily fixture, to exercise
    deduplication) and repo2.

    NOTE(review): markup mirrors the public github.com/trending page
    structure (Box-row articles, repo link in an <h2>, description <p>,
    ``itemprop="programmingLanguage"`` span) — confirm it matches the
    selectors used by ``GitHubTrendingCrawler``.
    """
    return """
    <article class="Box-row">
        <h2 class="h3 lh-condensed"><a href="/user/repo1">user / repo1</a></h2>
        <p class="col-9 color-fg-muted my-1 pr-4">Monthly description 1</p>
        <span itemprop="programmingLanguage">Python</span>
    </article>
    <article class="Box-row">
        <h2 class="h3 lh-condensed"><a href="/user/repo2">user / repo2</a></h2>
        <p class="col-9 color-fg-muted my-1 pr-4">Monthly description 2</p>
        <span itemprop="programmingLanguage">JavaScript</span>
    </article>
    """
@pytest.fixture
def weekly_html():
    """Minimal GitHub Trending markup for the *weekly* timeframe (repo3 only).

    NOTE(review): markup mirrors the public github.com/trending page
    structure — confirm it matches the crawler's selectors.
    """
    return """
    <article class="Box-row">
        <h2 class="h3 lh-condensed"><a href="/user/repo3">user / repo3</a></h2>
        <p class="col-9 color-fg-muted my-1 pr-4">Weekly description 3</p>
        <span itemprop="programmingLanguage">Go</span>
    </article>
    """
@pytest.fixture
def daily_html():
    """Minimal GitHub Trending markup for the *daily* timeframe.

    Contains repo1 (shared with the monthly fixture, for the dedup test)
    and repo4 (asserted on by the parsing test).

    NOTE(review): markup mirrors the public github.com/trending page
    structure — confirm it matches the crawler's selectors.
    """
    return """
    <article class="Box-row">
        <h2 class="h3 lh-condensed"><a href="/user/repo1">user / repo1</a></h2>
        <p class="col-9 color-fg-muted my-1 pr-4">Daily description 1</p>
        <span itemprop="programmingLanguage">Python</span>
    </article>
    <article class="Box-row">
        <h2 class="h3 lh-condensed"><a href="/user/repo4">user / repo4</a></h2>
        <p class="col-9 color-fg-muted my-1 pr-4">Daily description 4</p>
        <span itemprop="programmingLanguage">Rust</span>
    </article>
    """
@pytest.mark.asyncio
async def test_github_trending_crawler_fetches_all_timeframes(monthly_html, weekly_html, daily_html):
    """fetch_latest must request the monthly, weekly and daily trending pages."""
    crawler = GitHubTrendingCrawler()
    # Map the query-string marker of each timeframe to its fixture page.
    pages = {
        "since=monthly": monthly_html,
        "since=weekly": weekly_html,
        "since=daily": daily_html,
    }
    with patch("requests.get") as mock_get:
        def fake_get(url, **kwargs):
            response = MagicMock()
            response.status_code = 200
            # Unknown URLs fall back to an empty page.
            response.text = next(
                (html for marker, html in pages.items() if marker in url), ""
            )
            return response

        mock_get.side_effect = fake_get
        await crawler.fetch_latest()

        # Every timeframe URL must have been requested exactly as expected.
        requested = [call.args[0] for call in mock_get.call_args_list]
        for timeframe in ("monthly", "weekly", "daily"):
            assert f"https://github.com/trending?since={timeframe}" in requested
@pytest.mark.asyncio
async def test_github_trending_crawler_parses_html_correctly(daily_html):
    """Entries on the daily trending page are parsed into NewsItemDTOs."""
    crawler = GitHubTrendingCrawler()
    with patch("requests.get") as mock_get:
        fake_response = MagicMock()
        fake_response.status_code = 200
        fake_response.text = daily_html
        mock_get.return_value = fake_response

        # All timeframe fetches receive the same daily page; we only care
        # that repo4 is parsed out of it correctly.
        items = await crawler.fetch_latest()

    repo4 = None
    for item in items:
        if "user/repo4" in item.url:
            repo4 = item
            break

    assert repo4 is not None
    assert repo4.title == "user / repo4"
    assert "Daily description 4" in repo4.content_text
    assert "Rust" in repo4.content_text
    assert repo4.source == "GitHub Trending"
@pytest.mark.asyncio
async def test_github_trending_crawler_deduplication(monthly_html, weekly_html, daily_html):
    """A repo trending in several timeframes yields one deduplicated item.

    repo1 appears in both the monthly and the daily fixtures; the crawler
    must emit exactly one NewsItemDTO for it, and that item must record
    both timeframes it trended in.
    """
    crawler = GitHubTrendingCrawler()
    with patch("requests.get") as mock_get:
        def side_effect(url, **kwargs):
            mock_resp = MagicMock()
            mock_resp.status_code = 200
            if "since=monthly" in url:
                mock_resp.text = monthly_html
            elif "since=weekly" in url:
                mock_resp.text = weekly_html
            elif "since=daily" in url:
                mock_resp.text = daily_html
            else:
                # Fix: without this fallback, .text stays a MagicMock for any
                # unexpected URL and HTML parsing would blow up. This also
                # matches the side_effect in the all-timeframes test.
                mock_resp.text = ""
            return mock_resp

        mock_get.side_effect = side_effect
        results = await crawler.fetch_latest()

    # repo1 appears in monthly and daily -> exactly one deduplicated item.
    repo1_items = [item for item in results if "user/repo1" in item.url]
    assert len(repo1_items) == 1

    # The merged item's content_text (or source) must name both timeframes.
    repo1 = repo1_items[0]
    assert "monthly" in repo1.content_text.lower() or "monthly" in repo1.source.lower()
    assert "daily" in repo1.content_text.lower() or "daily" in repo1.source.lower()
@pytest.mark.asyncio
async def test_github_trending_crawler_handles_errors():
    """Network failures are swallowed: fetch_latest returns an empty list."""
    crawler = GitHubTrendingCrawler()
    # Every requests.get call raises; the crawler must not propagate it.
    with patch("requests.get", side_effect=Exception("Network error")):
        assert await crawler.fetch_latest() == []