"""Unit tests for ``GitHubTrendingCrawler``.

The crawler is expected to:
  * fetch https://github.com/trending for three timeframes
    (``since=monthly``, ``since=weekly``, ``since=daily``),
  * parse repository entries (URL, title, description, language) into
    ``NewsItemDTO`` objects,
  * deduplicate repositories that trend in more than one timeframe while
    recording which timeframes they appeared in,
  * return an empty list when the network request fails.
"""

import pytest
from unittest.mock import patch, MagicMock
from datetime import datetime, timezone

from src.crawlers.github_crawler import GitHubTrendingCrawler
from src.crawlers.dto import NewsItemDTO


def _article(owner_repo: str, description: str, language: str) -> str:
    """Render one repository entry shaped like GitHub's trending page markup.

    NOTE(review): the original fixture HTML in this file was reduced to bare
    text (all tags lost); this reconstruction follows GitHub's trending page
    layout (``Box-row`` articles, repo link inside an ``<h2>``, language in a
    ``span[itemprop=programmingLanguage]``) — confirm it matches the selectors
    ``GitHubTrendingCrawler`` actually parses.
    """
    display_name = owner_repo.replace("/", " / ")
    return (
        '<article class="Box-row">'
        f'<h2 class="h3 lh-condensed"><a href="/{owner_repo}">{display_name}</a></h2>'
        f'<p class="col-9 color-fg-muted my-1 pr-4">{description}</p>'
        f'<span itemprop="programmingLanguage">{language}</span>'
        "</article>"
    )


@pytest.fixture
def monthly_html():
    """Monthly trending page: repo1 (also trends daily) and repo2."""
    return "<html><body>{}{}</body></html>".format(
        _article("user/repo1", "Monthly description 1", "Python"),
        _article("user/repo2", "Monthly description 2", "JavaScript"),
    )


@pytest.fixture
def weekly_html():
    """Weekly trending page: a single repository, repo3."""
    return "<html><body>{}</body></html>".format(
        _article("user/repo3", "Weekly description 3", "Go"),
    )


@pytest.fixture
def daily_html():
    """Daily trending page: repo1 (duplicate of monthly) and repo4."""
    return "<html><body>{}{}</body></html>".format(
        _article("user/repo1", "Daily description 1", "Python"),
        _article("user/repo4", "Daily description 4", "Rust"),
    )


def _mock_response(text: str) -> MagicMock:
    """Build a ``requests``-style response mock: status 200 with *text* body."""
    resp = MagicMock()
    resp.status_code = 200
    resp.text = text
    return resp


def _dispatch_by_timeframe(monthly: str, weekly: str, daily: str):
    """Return a ``requests.get`` side_effect that picks the body by ``since=``.

    Unknown URLs get an empty body so the mock never leaks a bare MagicMock
    as ``.text`` into the parser.
    """

    def side_effect(url, **kwargs):
        if "since=monthly" in url:
            return _mock_response(monthly)
        if "since=weekly" in url:
            return _mock_response(weekly)
        if "since=daily" in url:
            return _mock_response(daily)
        return _mock_response("")

    return side_effect


@pytest.mark.asyncio
async def test_github_trending_crawler_fetches_all_timeframes(
    monthly_html, weekly_html, daily_html
):
    """``fetch_latest`` must request all three trending timeframes."""
    crawler = GitHubTrendingCrawler()
    with patch("requests.get") as mock_get:
        mock_get.side_effect = _dispatch_by_timeframe(
            monthly_html, weekly_html, daily_html
        )

        await crawler.fetch_latest()

        # The URL is expected as the first positional argument of requests.get.
        called_urls = [call.args[0] for call in mock_get.call_args_list]
        assert "https://github.com/trending?since=monthly" in called_urls
        assert "https://github.com/trending?since=weekly" in called_urls
        assert "https://github.com/trending?since=daily" in called_urls


@pytest.mark.asyncio
async def test_github_trending_crawler_parses_html_correctly(daily_html):
    """Parsed items carry the repo URL, title, description, language, source."""
    crawler = GitHubTrendingCrawler()
    with patch("requests.get") as mock_get:
        # Every timeframe returns the daily fixture; we only inspect repo4,
        # which is unique to that fixture, so cross-timeframe duplication of
        # the same payload does not affect the assertions below.
        mock_get.return_value = _mock_response(daily_html)

        results = await crawler.fetch_latest()

        repo4 = next((item for item in results if "user/repo4" in item.url), None)
        assert repo4 is not None
        assert repo4.title == "user / repo4"
        assert "Daily description 4" in repo4.content_text
        assert "Rust" in repo4.content_text
        assert repo4.source == "GitHub Trending"


@pytest.mark.asyncio
async def test_github_trending_crawler_deduplication(
    monthly_html, weekly_html, daily_html
):
    """A repo trending in several timeframes yields exactly one item.

    repo1 appears in both the monthly and the daily fixture; the single
    resulting item must indicate (in ``content_text`` or ``source``) that it
    appeared in both timeframes.
    """
    crawler = GitHubTrendingCrawler()
    with patch("requests.get") as mock_get:
        mock_get.side_effect = _dispatch_by_timeframe(
            monthly_html, weekly_html, daily_html
        )

        results = await crawler.fetch_latest()

        repo1_items = [item for item in results if "user/repo1" in item.url]
        assert len(repo1_items) == 1

        repo1 = repo1_items[0]
        haystacks = (repo1.content_text.lower(), repo1.source.lower())
        assert any("monthly" in text for text in haystacks)
        assert any("daily" in text for text in haystacks)


@pytest.mark.asyncio
async def test_github_trending_crawler_handles_errors():
    """A network failure must be swallowed and reported as an empty result."""
    crawler = GitHubTrendingCrawler()
    with patch("requests.get") as mock_get:
        mock_get.side_effect = Exception("Network error")

        results = await crawler.fetch_latest()

        assert results == []