From ef3faec7f89fc061745c3f22835c8138420ed1da Mon Sep 17 00:00:00 2001 From: Artur Mukhamadiev Date: Thu, 19 Mar 2026 21:35:51 +0300 Subject: [PATCH] #Feature: GitHub Trending Scouting :Release Notes: - Added a new GitHub Trending crawler that scouts for trending repositories across monthly, weekly, and daily timeframes. :Detailed Notes: - Created `GitHubTrendingCrawler` in `src/crawlers/github_crawler.py` to parse github.com/trending HTML. - Implemented intra-run deduplication: repositories appearing in multiple timeframes (monthly, weekly, daily) are merged into a single item per run to avoid redundant LLM processing. - Registered the new crawler in `src/crawlers/factory.py` and added it to the configuration file `src/crawlers.yml`. - Created comprehensive test suite in `tests/crawlers/test_github_crawler.py` to verify fetching, HTML parsing, and deduplication logic using pytest and mocked responses. :Testing Performed: - Added unit tests for `GitHubTrendingCrawler` using pytest. - Verified all tests pass successfully. - Ensured no duplicate `NewsItemDTO` objects are generated for the same repository URL across different timeframes. :QA Notes: - The vector storage (`ChromaStore`) already handles inter-run deduplication by checking `await self.storage.exists(item.url)` before processing, ensuring repositories are only parsed and processed by the AI once even across multiple script executions. :Issues Addressed: - Resolves request for adding GitHub trending scouting (Month/Week/Day) with deduplication. 
Change-Id: Ifbcde830263264576e4fadb70f09a6e2e12e3016 --- src/crawlers.yml | 5 +- src/crawlers/factory.py | 3 + src/crawlers/github_crawler.py | 100 ++++++++++++++ tests/crawlers/test_github_crawler.py | 192 ++++++++++++++++++++++++++ 4 files changed, 299 insertions(+), 1 deletion(-) create mode 100644 src/crawlers/github_crawler.py create mode 100644 tests/crawlers/test_github_crawler.py diff --git a/src/crawlers.yml b/src/crawlers.yml index c0d8ee0..a5813d7 100644 --- a/src/crawlers.yml +++ b/src/crawlers.yml @@ -116,4 +116,7 @@ crawlers: - type: scholar url: "https://scholar.google.com/" source: "Google Scholar BMI" - query: "Brain-machine interface (IoT|Webengine|Linux)" \ No newline at end of file + query: "Brain-machine interface (IoT|Webengine|Linux)" + - type: github_trending + url: "https://github.com/trending" + source: "GitHub Trending" \ No newline at end of file diff --git a/src/crawlers/factory.py b/src/crawlers/factory.py index 67285fa..51e84b6 100644 --- a/src/crawlers/factory.py +++ b/src/crawlers/factory.py @@ -10,6 +10,7 @@ from src.crawlers.skolkovo_crawler import SkolkovoCrawler from src.crawlers.scirate_crawler import SciRateCrawler from src.crawlers.scholar_crawler import ScholarCrawler from src.crawlers.microsoft_research_crawler import MicrosoftResearchCrawler +from src.crawlers.github_crawler import GitHubTrendingCrawler logger = logging.getLogger(__name__) @@ -59,6 +60,8 @@ class CrawlerFactory: crawlers.append(ScholarCrawler(query=query, source=source)) elif crawler_type == 'microsoft_research': crawlers.append(MicrosoftResearchCrawler(url=url, source=source)) + elif crawler_type == 'github_trending': + crawlers.append(GitHubTrendingCrawler(url=url, source=source)) else: logger.warning(f"Unknown crawler type: {crawler_type}") diff --git a/src/crawlers/github_crawler.py b/src/crawlers/github_crawler.py new file mode 100644 index 0000000..966b29f --- /dev/null +++ b/src/crawlers/github_crawler.py @@ -0,0 +1,100 @@ +import logging +import 
import asyncio
import logging
import re
from datetime import datetime, timezone
from typing import Dict, List

import requests
from bs4 import BeautifulSoup

from src.crawlers.base import ICrawler
from src.crawlers.dto import NewsItemDTO

logger = logging.getLogger(__name__)

# requests has no default timeout; without one an unresponsive server
# would hang the crawler (and the whole run) forever.
_REQUEST_TIMEOUT = 30


class GitHubTrendingCrawler(ICrawler):
    """
    Crawler for GitHub Trending repositories.
    Fetches monthly, weekly, and daily trending repositories and merges
    repositories that appear in several timeframes into a single item
    per run (intra-run deduplication keyed by repository URL).
    """

    def __init__(self, url: str = None, source: str = "GitHub Trending"):
        self.base_url = "https://github.com"
        self.url = url or "https://github.com/trending"
        self.source = source

    async def fetch_latest(self) -> List[NewsItemDTO]:
        """Fetch trending repositories across all timeframes, deduplicated.

        Returns:
            One NewsItemDTO per unique repository URL. Its content_text
            carries the description, the primary language, and every
            timeframe the repository trended in. Returns [] when every
            timeframe fails to fetch.
        """
        timeframes = ["monthly", "weekly", "daily"]
        repos: Dict[str, dict] = {}

        for timeframe in timeframes:
            url = f"{self.base_url}/trending?since={timeframe}"
            try:
                # requests is synchronous; run it in a worker thread so
                # the event loop is not blocked while GitHub responds.
                response = await asyncio.to_thread(
                    requests.get, url, timeout=_REQUEST_TIMEOUT
                )
                response.raise_for_status()
            except Exception as e:
                # One failing timeframe must not discard the results
                # already collected from the other timeframes.
                logger.error(f"Error fetching GitHub trending ({timeframe}): {e}")
                continue

            self._merge_page(response.text, timeframe, repos)

        results = []
        for repo_url, data in repos.items():
            timeframes_str = ", ".join(data["timeframes"])
            content_text = (
                f"{data['description']}\n"
                f"Language: {data['language']}\n"
                f"Trending: {timeframes_str}"
            )
            results.append(
                NewsItemDTO(
                    title=data["title"],
                    url=repo_url,
                    content_text=content_text.strip(),
                    source=self.source,
                    timestamp=datetime.now(timezone.utc),
                )
            )
        return results

    def _merge_page(self, html: str, timeframe: str, repos: Dict[str, dict]) -> None:
        """Parse one trending page and merge its repositories into *repos*."""
        soup = BeautifulSoup(html, "html.parser")

        for article in soup.find_all("article", class_="Box-row"):
            h2 = article.find("h2", class_="h3")
            if not h2:
                continue

            a_tag = h2.find("a")
            if not a_tag:
                continue

            repo_path = a_tag.get("href", "")
            if not repo_path:
                continue

            repo_url = f"{self.base_url}{repo_path}"

            # Collapse GitHub's multi-line "owner / name" heading into one line.
            title = re.sub(r'\s+', ' ', h2.get_text()).strip()

            p_tag = article.find("p", class_="col-9")
            description = p_tag.get_text(strip=True) if p_tag else ""

            lang_span = article.find(
                "span", attrs={"itemprop": "programmingLanguage"}
            )
            language = lang_span.get_text(strip=True) if lang_span else "Unknown"

            if repo_url in repos:
                # Same repo trending in another timeframe: record the
                # timeframe once instead of emitting a duplicate item.
                if timeframe not in repos[repo_url]["timeframes"]:
                    repos[repo_url]["timeframes"].append(timeframe)
            else:
                repos[repo_url] = {
                    "title": title,
                    "description": description,
                    "language": language,
                    "timeframes": [timeframe],
                }
@pytest.fixture
def monthly_html():
    """Monthly trending page: repo1 (Python) and repo2 (JavaScript).

    NOTE(review): the original fixture markup was garbled in transit; it is
    reconstructed here to match exactly what GitHubTrendingCrawler parses
    (article.Box-row, h2.h3 > a[href], p.col-9 description,
    span[itemprop=programmingLanguage]) — confirm against a live
    github.com/trending page.
    """
    return """
    <html><body>
    <article class="Box-row">
        <h2 class="h3">
            <a href="/user/repo1">user / repo1</a>
        </h2>
        <p class="col-9">Monthly description 1</p>
        <span itemprop="programmingLanguage">Python</span>
    </article>
    <article class="Box-row">
        <h2 class="h3">
            <a href="/user/repo2">user / repo2</a>
        </h2>
        <p class="col-9">Monthly description 2</p>
        <span itemprop="programmingLanguage">JavaScript</span>
    </article>
    </body></html>
    """


@pytest.fixture
def weekly_html():
    """Weekly trending page: a single repo3 (Go)."""
    return """
    <html><body>
    <article class="Box-row">
        <h2 class="h3">
            <a href="/user/repo3">user / repo3</a>
        </h2>
        <p class="col-9">Weekly description 3</p>
        <span itemprop="programmingLanguage">Go</span>
    </article>
    </body></html>
    """


@pytest.fixture
def daily_html():
    """Daily trending page: repo1 again (overlaps monthly, exercising
    deduplication) plus repo4 (Rust)."""
    return """
    <html><body>
    <article class="Box-row">
        <h2 class="h3">
            <a href="/user/repo1">user / repo1</a>
        </h2>
        <p class="col-9">Daily description 1</p>
        <span itemprop="programmingLanguage">Python</span>
    </article>
    <article class="Box-row">
        <h2 class="h3">
            <a href="/user/repo4">user / repo4</a>
        </h2>
        <p class="col-9">Daily description 4</p>
        <span itemprop="programmingLanguage">Rust</span>
    </article>
    </body></html>
    """
@pytest.mark.asyncio
async def test_github_trending_crawler_fetches_all_timeframes(monthly_html, weekly_html, daily_html):
    """fetch_latest must request the monthly, weekly and daily trending pages."""
    crawler = GitHubTrendingCrawler()

    with patch("requests.get") as mock_get:
        # Configure mock to return different HTML for different URLs
        def side_effect(url, **kwargs):
            mock_resp = MagicMock()
            mock_resp.status_code = 200
            if "since=monthly" in url:
                mock_resp.text = monthly_html
            elif "since=weekly" in url:
                mock_resp.text = weekly_html
            elif "since=daily" in url:
                mock_resp.text = daily_html
            else:
                mock_resp.text = ""
            return mock_resp

        mock_get.side_effect = side_effect

        results = await crawler.fetch_latest()

        # Verify it called all three URLs
        called_urls = [call.args[0] for call in mock_get.call_args_list]
        assert "https://github.com/trending?since=monthly" in called_urls
        assert "https://github.com/trending?since=weekly" in called_urls
        assert "https://github.com/trending?since=daily" in called_urls


@pytest.mark.asyncio
async def test_github_trending_crawler_parses_html_correctly(daily_html):
    """Title, description, language and source must be extracted from the HTML."""
    crawler = GitHubTrendingCrawler()

    with patch("requests.get") as mock_get:
        mock_resp = MagicMock()
        mock_resp.status_code = 200
        mock_resp.text = daily_html
        mock_get.return_value = mock_resp

        # Every timeframe fetch returns the same daily page here; the
        # crawler deduplicates by URL, so each repo still appears once
        # and parsing can be asserted on the merged results.

        results = await crawler.fetch_latest()

        # Check if repo4 is correctly parsed
        repo4 = next((item for item in results if "user/repo4" in item.url), None)
        assert repo4 is not None
        assert repo4.title == "user / repo4"
        assert "Daily description 4" in repo4.content_text
        assert "Rust" in repo4.content_text
        assert repo4.source == "GitHub Trending"


@pytest.mark.asyncio
async def test_github_trending_crawler_deduplication(monthly_html, weekly_html, daily_html):
    """A repo trending in several timeframes must yield exactly one DTO."""
    crawler = GitHubTrendingCrawler()

    with patch("requests.get") as mock_get:
        def side_effect(url, **kwargs):
            mock_resp = MagicMock()
            mock_resp.status_code = 200
            if "since=monthly" in url:
                mock_resp.text = monthly_html
            elif "since=weekly" in url:
                mock_resp.text = weekly_html
            elif "since=daily" in url:
                mock_resp.text = daily_html
            return mock_resp

        mock_get.side_effect = side_effect

        results = await crawler.fetch_latest()

        # repo1 appears in monthly and daily
        repo1_items = [item for item in results if "user/repo1" in item.url]

        # 1. Assert only ONE NewsItemDTO for repo1
        assert len(repo1_items) == 1

        # 2. The merged item must record both timeframes it trended in,
        #    either in its content_text or in its source.
        repo1 = repo1_items[0]
        assert "monthly" in repo1.content_text.lower() or "monthly" in repo1.source.lower()
        assert "daily" in repo1.content_text.lower() or "daily" in repo1.source.lower()


@pytest.mark.asyncio
async def test_github_trending_crawler_handles_errors():
    """Network failures must be swallowed and yield an empty result list."""
    crawler = GitHubTrendingCrawler()

    with patch("requests.get") as mock_get:
        mock_get.side_effect = Exception("Network error")

        results = await crawler.fetch_latest()
        assert results == []