AI-Trend-Scout/tests/crawlers/test_github_crawler.py
Artur Mukhamadiev ef3faec7f8 Feature: GitHub Trending Scouting
:Release Notes:
- Added a new GitHub Trending crawler that scouts for trending repositories across monthly, weekly, and daily timeframes.

:Detailed Notes:
- Created `GitHubTrendingCrawler` in `src/crawlers/github_crawler.py` to parse github.com/trending HTML.
- Implemented intra-run deduplication: repositories appearing in multiple timeframes (monthly, weekly, daily) are merged into a single item per run to avoid redundant LLM processing.
- Registered the new crawler in `src/crawlers/factory.py` and added it to the configuration file `src/crawlers.yml`.
- Created comprehensive test suite in `tests/crawlers/test_github_crawler.py` to verify fetching, HTML parsing, and deduplication logic using pytest and mocked responses.

:Testing Performed:
- Added unit tests for `GitHubTrendingCrawler` using pytest.
- Verified all tests pass successfully.
- Ensured no duplicate `NewsItemDTO` objects are generated for the same repository URL across different timeframes.

:QA Notes:
- The vector storage (`ChromaStore`) already handles inter-run deduplication by checking `await self.storage.exists(item.url)` before processing, ensuring repositories are only parsed and processed by the AI once even across multiple script executions.

:Issues Addressed:
- Resolves request for adding GitHub trending scouting (Month/Week/Day) with deduplication.

Change-Id: Ifbcde830263264576e4fadb70f09a6e2e12e3016
2026-03-19 21:35:51 +03:00

193 lines
6.8 KiB
Python

import pytest
from unittest.mock import patch, MagicMock
from datetime import datetime, timezone
from src.crawlers.github_crawler import GitHubTrendingCrawler
from src.crawlers.dto import NewsItemDTO
@pytest.fixture
def monthly_html():
    """Snapshot of github.com/trending?since=monthly.

    Contains two repository cards: user/repo1 (Python) and
    user/repo2 (JavaScript), mirroring GitHub's real markup
    (``article.Box-row`` cards with description and language spans).
    """
    return """
<html>
<body>
<article class="Box-row">
<h2 class="h3 lh-condensed">
<a href="/user/repo1">
<span class="text-normal">user / </span> repo1
</a>
</h2>
<p class="col-9 color-fg-muted my-1 pr-4">Monthly description 1</p>
<div class="f6 color-fg-muted mt-2">
<span class="d-inline-block ml-0 mr-3">
<span itemprop="programmingLanguage">Python</span>
</span>
</div>
</article>
<article class="Box-row">
<h2 class="h3 lh-condensed">
<a href="/user/repo2">
<span class="text-normal">user / </span> repo2
</a>
</h2>
<p class="col-9 color-fg-muted my-1 pr-4">Monthly description 2</p>
<div class="f6 color-fg-muted mt-2">
<span class="d-inline-block ml-0 mr-3">
<span itemprop="programmingLanguage">JavaScript</span>
</span>
</div>
</article>
</body>
</html>
"""
@pytest.fixture
def weekly_html():
    """Snapshot of github.com/trending?since=weekly.

    Contains a single repository card, user/repo3 (Go), in GitHub's
    ``article.Box-row`` trending-page markup.
    """
    return """
<html>
<body>
<article class="Box-row">
<h2 class="h3 lh-condensed">
<a href="/user/repo3">
<span class="text-normal">user / </span> repo3
</a>
</h2>
<p class="col-9 color-fg-muted my-1 pr-4">Weekly description 3</p>
<div class="f6 color-fg-muted mt-2">
<span class="d-inline-block ml-0 mr-3">
<span itemprop="programmingLanguage">Go</span>
</span>
</div>
</article>
</body>
</html>
"""
@pytest.fixture
def daily_html():
    """Snapshot of github.com/trending?since=daily.

    Contains user/repo1 (Python) — deliberately overlapping with the
    monthly fixture to exercise deduplication — plus user/repo4 (Rust).
    """
    return """
<html>
<body>
<article class="Box-row">
<h2 class="h3 lh-condensed">
<a href="/user/repo1">
<span class="text-normal">user / </span> repo1
</a>
</h2>
<p class="col-9 color-fg-muted my-1 pr-4">Daily description 1</p>
<div class="f6 color-fg-muted mt-2">
<span class="d-inline-block ml-0 mr-3">
<span itemprop="programmingLanguage">Python</span>
</span>
</div>
</article>
<article class="Box-row">
<h2 class="h3 lh-condensed">
<a href="/user/repo4">
<span class="text-normal">user / </span> repo4
</a>
</h2>
<p class="col-9 color-fg-muted my-1 pr-4">Daily description 4</p>
<div class="f6 color-fg-muted mt-2">
<span class="d-inline-block ml-0 mr-3">
<span itemprop="programmingLanguage">Rust</span>
</span>
</div>
</article>
</body>
</html>
"""
@pytest.mark.asyncio
async def test_github_trending_crawler_fetches_all_timeframes(monthly_html, weekly_html, daily_html):
    """fetch_latest() must request the monthly, weekly, and daily trending pages."""
    crawler = GitHubTrendingCrawler()
    with patch("requests.get") as mock_get:
        # Serve timeframe-specific HTML depending on the requested URL.
        def side_effect(url, **kwargs):
            mock_resp = MagicMock()
            mock_resp.status_code = 200
            if "since=monthly" in url:
                mock_resp.text = monthly_html
            elif "since=weekly" in url:
                mock_resp.text = weekly_html
            elif "since=daily" in url:
                mock_resp.text = daily_html
            else:
                mock_resp.text = ""
            return mock_resp

        mock_get.side_effect = side_effect
        results = await crawler.fetch_latest()

        # Collect the requested URLs; handle both positional and keyword
        # invocation so the test doesn't break on `requests.get(url=...)`.
        called_urls = [
            call.args[0] if call.args else call.kwargs.get("url", "")
            for call in mock_get.call_args_list
        ]
        assert "https://github.com/trending?since=monthly" in called_urls
        assert "https://github.com/trending?since=weekly" in called_urls
        assert "https://github.com/trending?since=daily" in called_urls
        # The fixtures contain repository cards, so the crawler must
        # produce at least one item (previously `results` was unused).
        assert results
@pytest.mark.asyncio
async def test_github_trending_crawler_parses_html_correctly(daily_html):
    """A trending-page card must be parsed into a correct NewsItemDTO."""
    crawler = GitHubTrendingCrawler()
    with patch("requests.get") as mock_get:
        fake_response = MagicMock()
        fake_response.status_code = 200
        fake_response.text = daily_html
        mock_get.return_value = fake_response

        # Every timeframe fetch receives the same daily page; this test
        # only cares that a single card is parsed correctly.
        items = await crawler.fetch_latest()

        # Locate the repo4 card and verify each parsed field.
        matching = [item for item in items if "user/repo4" in item.url]
        repo4 = matching[0] if matching else None
        assert repo4 is not None
        assert repo4.title == "user / repo4"
        assert "Daily description 4" in repo4.content_text
        assert "Rust" in repo4.content_text
        assert repo4.source == "GitHub Trending"
@pytest.mark.asyncio
async def test_github_trending_crawler_deduplication(monthly_html, weekly_html, daily_html):
    """A repo trending in several timeframes must yield one merged item per run."""
    crawler = GitHubTrendingCrawler()
    with patch("requests.get") as mock_get:
        def side_effect(url, **kwargs):
            mock_resp = MagicMock()
            mock_resp.status_code = 200
            if "since=monthly" in url:
                mock_resp.text = monthly_html
            elif "since=weekly" in url:
                mock_resp.text = weekly_html
            elif "since=daily" in url:
                mock_resp.text = daily_html
            else:
                # Without this branch an unmatched URL leaves .text as a
                # MagicMock, not a string, which could silently confuse
                # the HTML parser (the other tests already do this).
                mock_resp.text = ""
            return mock_resp

        mock_get.side_effect = side_effect
        results = await crawler.fetch_latest()

        # repo1 appears in both the monthly and daily fixtures.
        repo1_items = [item for item in results if "user/repo1" in item.url]

        # Exactly one NewsItemDTO should survive intra-run deduplication.
        assert len(repo1_items) == 1

        # The merged item should record both timeframes it appeared in,
        # either in its content_text or its source label.
        repo1 = repo1_items[0]
        assert "monthly" in repo1.content_text.lower() or "monthly" in repo1.source.lower()
        assert "daily" in repo1.content_text.lower() or "daily" in repo1.source.lower()
@pytest.mark.asyncio
async def test_github_trending_crawler_handles_errors():
    """Network failures must be swallowed and produce an empty result list."""
    crawler = GitHubTrendingCrawler()
    # Every requests.get call raises; the crawler should not propagate it.
    with patch("requests.get", side_effect=Exception("Network error")):
        assert await crawler.fetch_latest() == []