From ef3faec7f89fc061745c3f22835c8138420ed1da Mon Sep 17 00:00:00 2001 From: Artur Mukhamadiev Date: Thu, 19 Mar 2026 21:35:51 +0300 Subject: [PATCH] #Feature: GitHub Trending Scouting :Release Notes: - Added a new GitHub Trending crawler that scouts for trending repositories across monthly, weekly, and daily timeframes. :Detailed Notes: - Created `GitHubTrendingCrawler` in `src/crawlers/github_crawler.py` to parse github.com/trending HTML. - Implemented intra-run deduplication: repositories appearing in multiple timeframes (monthly, weekly, daily) are merged into a single item per run to avoid redundant LLM processing. - Registered the new crawler in `src/crawlers/factory.py` and added it to the configuration file `src/crawlers.yml`. - Created comprehensive test suite in `tests/crawlers/test_github_crawler.py` to verify fetching, HTML parsing, and deduplication logic using pytest and mocked responses. :Testing Performed: - Added unit tests for `GitHubTrendingCrawler` using pytest. - Verified all tests pass successfully. - Ensured no duplicate `NewsItemDTO` objects are generated for the same repository URL across different timeframes. :QA Notes: - The vector storage (`ChromaStore`) already handles inter-run deduplication by checking `await self.storage.exists(item.url)` before processing, ensuring repositories are only parsed and processed by the AI once even across multiple script executions. :Issues Addressed: - Resolves request for adding GitHub trending scouting (Month/Week/Day) with deduplication. 
Change-Id: Ifbcde830263264576e4fadb70f09a6e2e12e3016 --- src/crawlers.yml | 5 +- src/crawlers/factory.py | 3 + src/crawlers/github_crawler.py | 100 ++++++++++++++ tests/crawlers/test_github_crawler.py | 192 ++++++++++++++++++++++++++ 4 files changed, 299 insertions(+), 1 deletion(-) create mode 100644 src/crawlers/github_crawler.py create mode 100644 tests/crawlers/test_github_crawler.py diff --git a/src/crawlers.yml b/src/crawlers.yml index c0d8ee0..a5813d7 100644 --- a/src/crawlers.yml +++ b/src/crawlers.yml @@ -116,4 +116,7 @@ crawlers: - type: scholar url: "https://scholar.google.com/" source: "Google Scholar BMI" - query: "Brain-machine interface (IoT|Webengine|Linux)" \ No newline at end of file + query: "Brain-machine interface (IoT|Webengine|Linux)" + - type: github_trending + url: "https://github.com/trending" + source: "GitHub Trending" \ No newline at end of file diff --git a/src/crawlers/factory.py b/src/crawlers/factory.py index 67285fa..51e84b6 100644 --- a/src/crawlers/factory.py +++ b/src/crawlers/factory.py @@ -10,6 +10,7 @@ from src.crawlers.skolkovo_crawler import SkolkovoCrawler from src.crawlers.scirate_crawler import SciRateCrawler from src.crawlers.scholar_crawler import ScholarCrawler from src.crawlers.microsoft_research_crawler import MicrosoftResearchCrawler +from src.crawlers.github_crawler import GitHubTrendingCrawler logger = logging.getLogger(__name__) @@ -59,6 +60,8 @@ class CrawlerFactory: crawlers.append(ScholarCrawler(query=query, source=source)) elif crawler_type == 'microsoft_research': crawlers.append(MicrosoftResearchCrawler(url=url, source=source)) + elif crawler_type == 'github_trending': + crawlers.append(GitHubTrendingCrawler(url=url, source=source)) else: logger.warning(f"Unknown crawler type: {crawler_type}") diff --git a/src/crawlers/github_crawler.py b/src/crawlers/github_crawler.py new file mode 100644 index 0000000..966b29f --- /dev/null +++ b/src/crawlers/github_crawler.py @@ -0,0 +1,100 @@ +import logging +import 
import asyncio
import logging
import re
from datetime import datetime, timezone
from typing import Dict, List

import requests
from bs4 import BeautifulSoup

from src.crawlers.base import ICrawler
from src.crawlers.dto import NewsItemDTO

logger = logging.getLogger(__name__)

# requests has no default timeout; without one an unresponsive server
# would hang the crawler (and the whole run) forever.
_REQUEST_TIMEOUT = 30


class GitHubTrendingCrawler(ICrawler):
    """
    Crawler for GitHub Trending repositories.
    Fetches monthly, weekly, and daily trending repositories and merges
    repositories that appear in several timeframes into a single item
    per run (intra-run deduplication keyed by repository URL).
    """

    def __init__(self, url: str = None, source: str = "GitHub Trending"):
        self.base_url = "https://github.com"
        self.url = url or "https://github.com/trending"
        self.source = source

    async def fetch_latest(self) -> List[NewsItemDTO]:
        """Fetch trending repositories across all timeframes, deduplicated.

        Returns:
            One NewsItemDTO per unique repository URL. Its content_text
            carries the description, the primary language, and every
            timeframe the repository trended in. Returns [] when every
            timeframe fails to fetch.
        """
        timeframes = ["monthly", "weekly", "daily"]
        repos: Dict[str, dict] = {}

        for timeframe in timeframes:
            url = f"{self.base_url}/trending?since={timeframe}"
            try:
                # requests is synchronous; run it in a worker thread so
                # the event loop is not blocked while GitHub responds.
                response = await asyncio.to_thread(
                    requests.get, url, timeout=_REQUEST_TIMEOUT
                )
                response.raise_for_status()
            except Exception as e:
                # One failing timeframe must not discard the results
                # already collected from the other timeframes.
                logger.error(f"Error fetching GitHub trending ({timeframe}): {e}")
                continue

            self._merge_page(response.text, timeframe, repos)

        results = []
        for repo_url, data in repos.items():
            timeframes_str = ", ".join(data["timeframes"])
            content_text = (
                f"{data['description']}\n"
                f"Language: {data['language']}\n"
                f"Trending: {timeframes_str}"
            )
            results.append(
                NewsItemDTO(
                    title=data["title"],
                    url=repo_url,
                    content_text=content_text.strip(),
                    source=self.source,
                    timestamp=datetime.now(timezone.utc),
                )
            )
        return results

    def _merge_page(self, html: str, timeframe: str, repos: Dict[str, dict]) -> None:
        """Parse one trending page and merge its repositories into *repos*."""
        soup = BeautifulSoup(html, "html.parser")

        for article in soup.find_all("article", class_="Box-row"):
            h2 = article.find("h2", class_="h3")
            if not h2:
                continue

            a_tag = h2.find("a")
            if not a_tag:
                continue

            repo_path = a_tag.get("href", "")
            if not repo_path:
                continue

            repo_url = f"{self.base_url}{repo_path}"

            # Collapse GitHub's multi-line "owner / name" heading into one line.
            title = re.sub(r'\s+', ' ', h2.get_text()).strip()

            p_tag = article.find("p", class_="col-9")
            description = p_tag.get_text(strip=True) if p_tag else ""

            lang_span = article.find(
                "span", attrs={"itemprop": "programmingLanguage"}
            )
            language = lang_span.get_text(strip=True) if lang_span else "Unknown"

            if repo_url in repos:
                # Same repo trending in another timeframe: record the
                # timeframe once instead of emitting a duplicate item.
                if timeframe not in repos[repo_url]["timeframes"]:
                    repos[repo_url]["timeframes"].append(timeframe)
            else:
                repos[repo_url] = {
                    "title": title,
                    "description": description,
                    "language": language,
                    "timeframes": [timeframe],
                }
@pytest.fixture
def monthly_html():
    """Monthly trending page: repo1 (Python) and repo2 (JavaScript).

    NOTE(review): the original fixture markup was garbled in transit; it is
    reconstructed here to match exactly what GitHubTrendingCrawler parses
    (article.Box-row, h2.h3 > a[href], p.col-9 description,
    span[itemprop=programmingLanguage]) — confirm against a live
    github.com/trending page.
    """
    return """
    <html><body>
    <article class="Box-row">
        <h2 class="h3">
            <a href="/user/repo1">user / repo1</a>
        </h2>
        <p class="col-9">Monthly description 1</p>
        <span itemprop="programmingLanguage">Python</span>
    </article>
    <article class="Box-row">
        <h2 class="h3">
            <a href="/user/repo2">user / repo2</a>
        </h2>
        <p class="col-9">Monthly description 2</p>
        <span itemprop="programmingLanguage">JavaScript</span>
    </article>
    </body></html>
    """


@pytest.fixture
def weekly_html():
    """Weekly trending page: a single repo3 (Go)."""
    return """
    <html><body>
    <article class="Box-row">
        <h2 class="h3">
            <a href="/user/repo3">user / repo3</a>
        </h2>
        <p class="col-9">Weekly description 3</p>
        <span itemprop="programmingLanguage">Go</span>
    </article>
    </body></html>
    """


@pytest.fixture
def daily_html():
    """Daily trending page: repo1 again (overlaps monthly, exercising
    deduplication) plus repo4 (Rust)."""
    return """
    <html><body>
    <article class="Box-row">
        <h2 class="h3">
            <a href="/user/repo1">user / repo1</a>
        </h2>
        <p class="col-9">Daily description 1</p>
        <span itemprop="programmingLanguage">Python</span>
    </article>
    <article class="Box-row">
        <h2 class="h3">
            <a href="/user/repo4">user / repo4</a>
        </h2>
        <p class="col-9">Daily description 4</p>
        <span itemprop="programmingLanguage">Rust</span>
    </article>
    </body></html>
    """
@pytest.mark.asyncio
async def test_github_trending_crawler_fetches_all_timeframes(monthly_html, weekly_html, daily_html):
    """fetch_latest must request the monthly, weekly and daily trending pages."""
    crawler = GitHubTrendingCrawler()

    with patch("requests.get") as mock_get:
        # Configure mock to return different HTML for different URLs
        def side_effect(url, **kwargs):
            mock_resp = MagicMock()
            mock_resp.status_code = 200
            if "since=monthly" in url:
                mock_resp.text = monthly_html
            elif "since=weekly" in url:
                mock_resp.text = weekly_html
            elif "since=daily" in url:
                mock_resp.text = daily_html
            else:
                mock_resp.text = ""
            return mock_resp

        mock_get.side_effect = side_effect

        results = await crawler.fetch_latest()

        # Verify it called all three URLs
        called_urls = [call.args[0] for call in mock_get.call_args_list]
        assert "https://github.com/trending?since=monthly" in called_urls
        assert "https://github.com/trending?since=weekly" in called_urls
        assert "https://github.com/trending?since=daily" in called_urls


@pytest.mark.asyncio
async def test_github_trending_crawler_parses_html_correctly(daily_html):
    """Title, description, language and source must be extracted from the HTML."""
    crawler = GitHubTrendingCrawler()

    with patch("requests.get") as mock_get:
        mock_resp = MagicMock()
        mock_resp.status_code = 200
        mock_resp.text = daily_html
        mock_get.return_value = mock_resp

        # Every timeframe fetch returns the same daily page here; the
        # crawler deduplicates by URL, so each repo still appears once
        # and parsing can be asserted on the merged results.

        results = await crawler.fetch_latest()

        # Check if repo4 is correctly parsed
        repo4 = next((item for item in results if "user/repo4" in item.url), None)
        assert repo4 is not None
        assert repo4.title == "user / repo4"
        assert "Daily description 4" in repo4.content_text
        assert "Rust" in repo4.content_text
        assert repo4.source == "GitHub Trending"


@pytest.mark.asyncio
async def test_github_trending_crawler_deduplication(monthly_html, weekly_html, daily_html):
    """A repo trending in several timeframes must yield exactly one DTO."""
    crawler = GitHubTrendingCrawler()

    with patch("requests.get") as mock_get:
        def side_effect(url, **kwargs):
            mock_resp = MagicMock()
            mock_resp.status_code = 200
            if "since=monthly" in url:
                mock_resp.text = monthly_html
            elif "since=weekly" in url:
                mock_resp.text = weekly_html
            elif "since=daily" in url:
                mock_resp.text = daily_html
            return mock_resp

        mock_get.side_effect = side_effect

        results = await crawler.fetch_latest()

        # repo1 appears in monthly and daily
        repo1_items = [item for item in results if "user/repo1" in item.url]

        # 1. Assert only ONE NewsItemDTO for repo1
        assert len(repo1_items) == 1

        # 2. The merged item must record both timeframes it trended in,
        #    either in its content_text or in its source.
        repo1 = repo1_items[0]
        assert "monthly" in repo1.content_text.lower() or "monthly" in repo1.source.lower()
        assert "daily" in repo1.content_text.lower() or "daily" in repo1.source.lower()


@pytest.mark.asyncio
async def test_github_trending_crawler_handles_errors():
    """Network failures must be swallowed and yield an empty result list."""
    crawler = GitHubTrendingCrawler()

    with patch("requests.get") as mock_get:
        mock_get.side_effect = Exception("Network error")

        results = await crawler.fetch_latest()
        assert results == []