# Feature: GitHub Trending Scouting

:Release Notes:
- Added a new GitHub Trending crawler that scouts for trending repositories across monthly, weekly, and daily timeframes.

:Detailed Notes:
- Created `GitHubTrendingCrawler` in `src/crawlers/github_crawler.py` to parse github.com/trending HTML.
- Implemented intra-run deduplication: repositories appearing in multiple timeframes (monthly, weekly, daily) are merged into a single item per run to avoid redundant LLM processing.
- Registered the new crawler in `src/crawlers/factory.py` and added it to the configuration file `src/crawlers.yml`.
- Created comprehensive test suite in `tests/crawlers/test_github_crawler.py` to verify fetching, HTML parsing, and deduplication logic using pytest and mocked responses.

:Testing Performed:
- Added unit tests for `GitHubTrendingCrawler` using pytest.
- Verified all tests pass successfully.
- Ensured no duplicate `NewsItemDTO` objects are generated for the same repository URL across different timeframes.

:QA Notes:
- The vector storage (`ChromaStore`) already handles inter-run deduplication by checking `await self.storage.exists(item.url)` before processing, ensuring repositories are only parsed and processed by the AI once even across multiple script executions.

:Issues Addressed:
- Resolves request for adding GitHub trending scouting (Month/Week/Day) with deduplication.

Change-Id: Ifbcde830263264576e4fadb70f09a6e2e12e3016
This commit is contained in:
Artur Mukhamadiev 2026-03-19 21:35:51 +03:00
parent 6d2ac9d0f0
commit ef3faec7f8
4 changed files with 299 additions and 1 deletions

View File

@ -116,4 +116,7 @@ crawlers:
- type: scholar
url: "https://scholar.google.com/"
source: "Google Scholar BMI"
query: "Brain-machine interface (IoT|Webengine|Linux)"
query: "Brain-machine interface (IoT|Webengine|Linux)"
- type: github_trending
url: "https://github.com/trending"
source: "GitHub Trending"

View File

@ -10,6 +10,7 @@ from src.crawlers.skolkovo_crawler import SkolkovoCrawler
from src.crawlers.scirate_crawler import SciRateCrawler
from src.crawlers.scholar_crawler import ScholarCrawler
from src.crawlers.microsoft_research_crawler import MicrosoftResearchCrawler
from src.crawlers.github_crawler import GitHubTrendingCrawler
logger = logging.getLogger(__name__)
@ -59,6 +60,8 @@ class CrawlerFactory:
crawlers.append(ScholarCrawler(query=query, source=source))
elif crawler_type == 'microsoft_research':
crawlers.append(MicrosoftResearchCrawler(url=url, source=source))
elif crawler_type == 'github_trending':
crawlers.append(GitHubTrendingCrawler(url=url, source=source))
else:
logger.warning(f"Unknown crawler type: {crawler_type}")

View File

@ -0,0 +1,100 @@
import asyncio
import logging
import re
from datetime import datetime, timezone
from typing import Dict, List, Optional

import requests
from bs4 import BeautifulSoup

from src.crawlers.base import ICrawler
from src.crawlers.dto import NewsItemDTO
# Module-level logger scoped to this module's dotted path.
logger = logging.getLogger(__name__)
class GitHubTrendingCrawler(ICrawler):
    """
    Crawler for GitHub Trending repositories.

    Fetches the monthly, weekly, and daily trending pages, parses every
    repository entry, and merges duplicates so that a repository trending
    in several timeframes produces a single NewsItemDTO per run.
    """

    # Seconds to wait for GitHub before giving up; without a timeout a
    # stalled connection would block the worker thread indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self, url: Optional[str] = None, source: str = "GitHub Trending"):
        """
        :param url: Trending page URL; defaults to https://github.com/trending.
        :param source: Source label attached to every produced item.
        """
        self.base_url = "https://github.com"
        self.url = url or "https://github.com/trending"
        self.source = source

    async def fetch_latest(self) -> List[NewsItemDTO]:
        """
        Fetch and deduplicate trending repositories across all timeframes.

        :returns: One NewsItemDTO per unique repository URL, or an empty
            list if fetching/parsing fails.
        """
        timeframes = ["monthly", "weekly", "daily"]
        repos: Dict[str, dict] = {}
        try:
            for timeframe in timeframes:
                # Bug fix: honor the URL configured in crawlers.yml instead
                # of ignoring self.url and rebuilding it from base_url.
                url = f"{self.url}?since={timeframe}"
                # requests is synchronous; run it in a worker thread so the
                # event loop stays responsive. Bug fix: add a timeout so a
                # hung connection cannot stall the crawl forever.
                response = await asyncio.to_thread(
                    requests.get, url, timeout=self.REQUEST_TIMEOUT
                )
                response.raise_for_status()
                soup = BeautifulSoup(response.text, "html.parser")
                for article in soup.find_all("article", class_="Box-row"):
                    h2 = article.find("h2", class_="h3")
                    if not h2:
                        continue
                    a_tag = h2.find("a")
                    if not a_tag:
                        continue
                    repo_path = a_tag.get("href", "")
                    if not repo_path:
                        continue
                    repo_url = f"{self.base_url}{repo_path}"
                    # Collapse the multi-line "<owner> / <repo>" anchor text
                    # into a single-spaced title.
                    title = re.sub(r'\s+', ' ', h2.get_text()).strip()
                    p_tag = article.find("p", class_="col-9")
                    description = p_tag.get_text(strip=True) if p_tag else ""
                    lang_span = article.find(
                        "span", attrs={"itemprop": "programmingLanguage"}
                    )
                    language = lang_span.get_text(strip=True) if lang_span else "Unknown"
                    if repo_url in repos:
                        # Intra-run dedup: only record the extra timeframe.
                        if timeframe not in repos[repo_url]["timeframes"]:
                            repos[repo_url]["timeframes"].append(timeframe)
                    else:
                        repos[repo_url] = {
                            "title": title,
                            "description": description,
                            "language": language,
                            "timeframes": [timeframe],
                        }
            results = []
            for repo_url, data in repos.items():
                timeframes_str = ", ".join(data["timeframes"])
                content_text = (
                    f"{data['description']}\n"
                    f"Language: {data['language']}\n"
                    f"Trending: {timeframes_str}"
                )
                results.append(
                    NewsItemDTO(
                        title=data["title"],
                        url=repo_url,
                        content_text=content_text.strip(),
                        source=self.source,
                        timestamp=datetime.now(timezone.utc),
                    )
                )
            return results
        except Exception as e:
            # Best-effort crawler: never propagate network/parse failures;
            # log and return nothing for this run.
            logger.error(f"Error fetching GitHub trending: {e}")
            return []

View File

@ -0,0 +1,192 @@
import pytest
from unittest.mock import patch, MagicMock
from datetime import datetime, timezone
from src.crawlers.github_crawler import GitHubTrendingCrawler
from src.crawlers.dto import NewsItemDTO
@pytest.fixture
def monthly_html():
    """Minimal monthly trending page: repo1 (Python) and repo2 (JavaScript)."""
    return """
    <html>
    <body>
        <article class="Box-row">
            <h2 class="h3 lh-condensed">
                <a href="/user/repo1">
                    <span class="text-normal">user / </span> repo1
                </a>
            </h2>
            <p class="col-9 color-fg-muted my-1 pr-4">Monthly description 1</p>
            <div class="f6 color-fg-muted mt-2">
                <span class="d-inline-block ml-0 mr-3">
                    <span itemprop="programmingLanguage">Python</span>
                </span>
            </div>
        </article>
        <article class="Box-row">
            <h2 class="h3 lh-condensed">
                <a href="/user/repo2">
                    <span class="text-normal">user / </span> repo2
                </a>
            </h2>
            <p class="col-9 color-fg-muted my-1 pr-4">Monthly description 2</p>
            <div class="f6 color-fg-muted mt-2">
                <span class="d-inline-block ml-0 mr-3">
                    <span itemprop="programmingLanguage">JavaScript</span>
                </span>
            </div>
        </article>
    </body>
    </html>
    """
@pytest.fixture
def weekly_html():
    """Minimal weekly trending page: repo3 (Go) only."""
    return """
    <html>
    <body>
        <article class="Box-row">
            <h2 class="h3 lh-condensed">
                <a href="/user/repo3">
                    <span class="text-normal">user / </span> repo3
                </a>
            </h2>
            <p class="col-9 color-fg-muted my-1 pr-4">Weekly description 3</p>
            <div class="f6 color-fg-muted mt-2">
                <span class="d-inline-block ml-0 mr-3">
                    <span itemprop="programmingLanguage">Go</span>
                </span>
            </div>
        </article>
    </body>
    </html>
    """
@pytest.fixture
def daily_html():
    """Minimal daily trending page: repo1 (Python, also in monthly) and repo4 (Rust)."""
    return """
    <html>
    <body>
        <article class="Box-row">
            <h2 class="h3 lh-condensed">
                <a href="/user/repo1">
                    <span class="text-normal">user / </span> repo1
                </a>
            </h2>
            <p class="col-9 color-fg-muted my-1 pr-4">Daily description 1</p>
            <div class="f6 color-fg-muted mt-2">
                <span class="d-inline-block ml-0 mr-3">
                    <span itemprop="programmingLanguage">Python</span>
                </span>
            </div>
        </article>
        <article class="Box-row">
            <h2 class="h3 lh-condensed">
                <a href="/user/repo4">
                    <span class="text-normal">user / </span> repo4
                </a>
            </h2>
            <p class="col-9 color-fg-muted my-1 pr-4">Daily description 4</p>
            <div class="f6 color-fg-muted mt-2">
                <span class="d-inline-block ml-0 mr-3">
                    <span itemprop="programmingLanguage">Rust</span>
                </span>
            </div>
        </article>
    </body>
    </html>
    """
@pytest.mark.asyncio
async def test_github_trending_crawler_fetches_all_timeframes(monthly_html, weekly_html, daily_html):
    """fetch_latest() requests all three timeframes and aggregates their repos."""
    crawler = GitHubTrendingCrawler()
    with patch("requests.get") as mock_get:
        # Serve a different trending page depending on the ?since= parameter.
        def side_effect(url, **kwargs):
            mock_resp = MagicMock()
            mock_resp.status_code = 200
            if "since=monthly" in url:
                mock_resp.text = monthly_html
            elif "since=weekly" in url:
                mock_resp.text = weekly_html
            elif "since=daily" in url:
                mock_resp.text = daily_html
            else:
                mock_resp.text = ""
            return mock_resp
        mock_get.side_effect = side_effect
        results = await crawler.fetch_latest()
        # All three timeframe URLs must have been fetched.
        called_urls = [call.args[0] for call in mock_get.call_args_list]
        assert "https://github.com/trending?since=monthly" in called_urls
        assert "https://github.com/trending?since=weekly" in called_urls
        assert "https://github.com/trending?since=daily" in called_urls
        # Fix: `results` was previously computed but never asserted on.
        # monthly: repo1, repo2; weekly: repo3; daily: repo1, repo4 -> 4 unique.
        assert len(results) == 4
@pytest.mark.asyncio
async def test_github_trending_crawler_parses_html_correctly(daily_html):
    """Title, description, language, and source are extracted from the HTML."""
    crawler = GitHubTrendingCrawler()
    fake_response = MagicMock()
    fake_response.status_code = 200
    fake_response.text = daily_html
    # The same daily page is served for every timeframe request; dedup
    # collapses the repeats, so parsing can still be verified on the result.
    with patch("requests.get", return_value=fake_response):
        items = await crawler.fetch_latest()
    matches = [item for item in items if "user/repo4" in item.url]
    assert matches
    repo4 = matches[0]
    assert repo4.title == "user / repo4"
    assert "Daily description 4" in repo4.content_text
    assert "Rust" in repo4.content_text
    assert repo4.source == "GitHub Trending"
@pytest.mark.asyncio
async def test_github_trending_crawler_deduplication(monthly_html, weekly_html, daily_html):
    """A repo trending in several timeframes yields exactly one merged item."""
    pages = {"monthly": monthly_html, "weekly": weekly_html, "daily": daily_html}

    def fake_get(url, **kwargs):
        response = MagicMock()
        response.status_code = 200
        for timeframe, html in pages.items():
            if f"since={timeframe}" in url:
                response.text = html
                break
        return response

    crawler = GitHubTrendingCrawler()
    with patch("requests.get", side_effect=fake_get):
        results = await crawler.fetch_latest()

    # repo1 appears on both the monthly and daily pages.
    repo1_items = [item for item in results if "user/repo1" in item.url]
    # Exactly one NewsItemDTO must survive for repo1.
    assert len(repo1_items) == 1
    # The merged item must record both timeframes it trended in.
    merged = repo1_items[0]
    text = merged.content_text.lower()
    label = merged.source.lower()
    assert "monthly" in text or "monthly" in label
    assert "daily" in text or "daily" in label
@pytest.mark.asyncio
async def test_github_trending_crawler_handles_errors():
    """Network failures are swallowed and an empty list is returned."""
    crawler = GitHubTrendingCrawler()
    with patch("requests.get", side_effect=Exception("Network error")):
        assert await crawler.fetch_latest() == []