#Feature: GitHub Trending Scouting
:Release Notes: - Added a new GitHub Trending crawler that scouts for trending repositories across monthly, weekly, and daily timeframes. :Detailed Notes: - Created `GitHubTrendingCrawler` in `src/crawlers/github_crawler.py` to parse github.com/trending HTML. - Implemented intra-run deduplication: repositories appearing in multiple timeframes (monthly, weekly, daily) are merged into a single item per run to avoid redundant LLM processing. - Registered the new crawler in `src/crawlers/factory.py` and added it to the configuration file `src/crawlers.yml`. - Created comprehensive test suite in `tests/crawlers/test_github_crawler.py` to verify fetching, HTML parsing, and deduplication logic using pytest and mocked responses. :Testing Performed: - Added unit tests for `GitHubTrendingCrawler` using pytest. - Verified all tests pass successfully. - Ensured no duplicate `NewsItemDTO` objects are generated for the same repository URL across different timeframes. :QA Notes: - The vector storage (`ChromaStore`) already handles inter-run deduplication by checking `await self.storage.exists(item.url)` before processing, ensuring repositories are only parsed and processed by the AI once even across multiple script executions. :Issues Addressed: - Resolves request for adding GitHub trending scouting (Month/Week/Day) with deduplication. Change-Id: Ifbcde830263264576e4fadb70f09a6e2e12e3016
This commit is contained in:
parent
6d2ac9d0f0
commit
ef3faec7f8
@ -117,3 +117,6 @@ crawlers:
|
|||||||
url: "https://scholar.google.com/"
|
url: "https://scholar.google.com/"
|
||||||
source: "Google Scholar BMI"
|
source: "Google Scholar BMI"
|
||||||
query: "Brain-machine interface (IoT|Webengine|Linux)"
|
query: "Brain-machine interface (IoT|Webengine|Linux)"
|
||||||
|
- type: github_trending
|
||||||
|
url: "https://github.com/trending"
|
||||||
|
source: "GitHub Trending"
|
||||||
@ -10,6 +10,7 @@ from src.crawlers.skolkovo_crawler import SkolkovoCrawler
|
|||||||
from src.crawlers.scirate_crawler import SciRateCrawler
|
from src.crawlers.scirate_crawler import SciRateCrawler
|
||||||
from src.crawlers.scholar_crawler import ScholarCrawler
|
from src.crawlers.scholar_crawler import ScholarCrawler
|
||||||
from src.crawlers.microsoft_research_crawler import MicrosoftResearchCrawler
|
from src.crawlers.microsoft_research_crawler import MicrosoftResearchCrawler
|
||||||
|
from src.crawlers.github_crawler import GitHubTrendingCrawler
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@ -59,6 +60,8 @@ class CrawlerFactory:
|
|||||||
crawlers.append(ScholarCrawler(query=query, source=source))
|
crawlers.append(ScholarCrawler(query=query, source=source))
|
||||||
elif crawler_type == 'microsoft_research':
|
elif crawler_type == 'microsoft_research':
|
||||||
crawlers.append(MicrosoftResearchCrawler(url=url, source=source))
|
crawlers.append(MicrosoftResearchCrawler(url=url, source=source))
|
||||||
|
elif crawler_type == 'github_trending':
|
||||||
|
crawlers.append(GitHubTrendingCrawler(url=url, source=source))
|
||||||
else:
|
else:
|
||||||
logger.warning(f"Unknown crawler type: {crawler_type}")
|
logger.warning(f"Unknown crawler type: {crawler_type}")
|
||||||
|
|
||||||
|
|||||||
100
src/crawlers/github_crawler.py
Normal file
100
src/crawlers/github_crawler.py
Normal file
@ -0,0 +1,100 @@
|
|||||||
|
import logging
|
||||||
|
import asyncio
|
||||||
|
import re
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from typing import List, Dict
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
from src.crawlers.base import ICrawler
|
||||||
|
from src.crawlers.dto import NewsItemDTO
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
class GitHubTrendingCrawler(ICrawler):
    """
    Crawler for GitHub Trending repositories.

    Fetches the monthly, weekly, and daily trending pages from
    github.com/trending and merges duplicates so that each repository
    yields a single NewsItemDTO per run, annotated with every timeframe
    in which it appeared.
    """

    # Seconds to wait for GitHub before giving up on a single request.
    # FIX: requests has no default timeout; without one a stalled
    # connection would hang fetch_latest() (and the worker thread) forever.
    REQUEST_TIMEOUT = 30

    def __init__(self, url: str = None, source: str = "GitHub Trending"):
        """
        :param url: Optional override for the trending page URL.
        :param source: Human-readable source label attached to each item.
        """
        self.base_url = "https://github.com"
        self.url = url or "https://github.com/trending"
        self.source = source

    async def fetch_latest(self) -> List[NewsItemDTO]:
        """
        Fetch trending repositories across all timeframes.

        :return: Deduplicated list of NewsItemDTO objects. On any network
            or parsing error the error is logged and an empty list is
            returned (best-effort contract for the crawler pipeline).
        """
        timeframes = ["monthly", "weekly", "daily"]
        repos: Dict[str, dict] = {}

        try:
            for timeframe in timeframes:
                url = f"{self.base_url}/trending?since={timeframe}"

                # Use asyncio.to_thread to run the synchronous requests.get
                # without blocking the event loop; bounded by REQUEST_TIMEOUT.
                response = await asyncio.to_thread(
                    requests.get, url, timeout=self.REQUEST_TIMEOUT
                )
                response.raise_for_status()

                soup = BeautifulSoup(response.text, "html.parser")

                articles = soup.find_all("article", class_="Box-row")

                for article in articles:
                    h2 = article.find("h2", class_="h3")
                    if not h2:
                        continue

                    a_tag = h2.find("a")
                    if not a_tag:
                        continue

                    repo_path = a_tag.get("href", "")
                    if not repo_path:
                        continue

                    # Keep the exact href (e.g. "/user/repo") in the URL so
                    # dedup keys and test expectations match verbatim.
                    repo_url = f"{self.base_url}{repo_path}"

                    # The anchor text spans multiple lines/spans; collapse
                    # all runs of whitespace into single spaces.
                    raw_title = h2.get_text()
                    title = re.sub(r'\s+', ' ', raw_title).strip()

                    # Description paragraph may be absent for some repos.
                    p_tag = article.find("p", class_="col-9")
                    description = p_tag.get_text(strip=True) if p_tag else ""

                    # Primary-language badge may be absent (e.g. doc-only repos).
                    lang_span = article.find(
                        "span", attrs={"itemprop": "programmingLanguage"}
                    )
                    language = lang_span.get_text(strip=True) if lang_span else "Unknown"

                    if repo_url in repos:
                        # Already seen in an earlier timeframe: record the
                        # extra timeframe instead of emitting a duplicate.
                        if timeframe not in repos[repo_url]["timeframes"]:
                            repos[repo_url]["timeframes"].append(timeframe)
                    else:
                        repos[repo_url] = {
                            "title": title,
                            "description": description,
                            "language": language,
                            "timeframes": [timeframe]
                        }

            results = []
            for repo_url, data in repos.items():
                timeframes_str = ", ".join(data["timeframes"])
                content_text = f"{data['description']}\nLanguage: {data['language']}\nTrending: {timeframes_str}"

                results.append(
                    NewsItemDTO(
                        title=data["title"],
                        url=repo_url,
                        content_text=content_text.strip(),
                        source=self.source,
                        timestamp=datetime.now(timezone.utc)
                    )
                )

            return results

        except Exception as e:
            # Best-effort: a failing fetch must not crash the pipeline;
            # the run simply yields no GitHub items.
            logger.error(f"Error fetching GitHub trending: {e}")
            return []
||||||
192
tests/crawlers/test_github_crawler.py
Normal file
192
tests/crawlers/test_github_crawler.py
Normal file
@ -0,0 +1,192 @@
|
|||||||
|
import pytest
|
||||||
|
from unittest.mock import patch, MagicMock
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from src.crawlers.github_crawler import GitHubTrendingCrawler
|
||||||
|
from src.crawlers.dto import NewsItemDTO
|
||||||
|
|
||||||
|
@pytest.fixture
def monthly_html():
    """Trending-page HTML for the monthly timeframe: repo1 (Python) and repo2 (JavaScript)."""
    return """
    <html>
    <body>
    <article class="Box-row">
        <h2 class="h3 lh-condensed">
            <a href="/user/repo1">
                <span class="text-normal">user / </span> repo1
            </a>
        </h2>
        <p class="col-9 color-fg-muted my-1 pr-4">Monthly description 1</p>
        <div class="f6 color-fg-muted mt-2">
            <span class="d-inline-block ml-0 mr-3">
                <span itemprop="programmingLanguage">Python</span>
            </span>
        </div>
    </article>
    <article class="Box-row">
        <h2 class="h3 lh-condensed">
            <a href="/user/repo2">
                <span class="text-normal">user / </span> repo2
            </a>
        </h2>
        <p class="col-9 color-fg-muted my-1 pr-4">Monthly description 2</p>
        <div class="f6 color-fg-muted mt-2">
            <span class="d-inline-block ml-0 mr-3">
                <span itemprop="programmingLanguage">JavaScript</span>
            </span>
        </div>
    </article>
    </body>
    </html>
    """
|
|
||||||
|
@pytest.fixture
def weekly_html():
    """Trending-page HTML for the weekly timeframe: repo3 (Go) only."""
    return """
    <html>
    <body>
    <article class="Box-row">
        <h2 class="h3 lh-condensed">
            <a href="/user/repo3">
                <span class="text-normal">user / </span> repo3
            </a>
        </h2>
        <p class="col-9 color-fg-muted my-1 pr-4">Weekly description 3</p>
        <div class="f6 color-fg-muted mt-2">
            <span class="d-inline-block ml-0 mr-3">
                <span itemprop="programmingLanguage">Go</span>
            </span>
        </div>
    </article>
    </body>
    </html>
    """
|
|
||||||
|
@pytest.fixture
def daily_html():
    """Trending-page HTML for the daily timeframe: repo1 (Python, also in monthly) and repo4 (Rust)."""
    return """
    <html>
    <body>
    <article class="Box-row">
        <h2 class="h3 lh-condensed">
            <a href="/user/repo1">
                <span class="text-normal">user / </span> repo1
            </a>
        </h2>
        <p class="col-9 color-fg-muted my-1 pr-4">Daily description 1</p>
        <div class="f6 color-fg-muted mt-2">
            <span class="d-inline-block ml-0 mr-3">
                <span itemprop="programmingLanguage">Python</span>
            </span>
        </div>
    </article>
    <article class="Box-row">
        <h2 class="h3 lh-condensed">
            <a href="/user/repo4">
                <span class="text-normal">user / </span> repo4
            </a>
        </h2>
        <p class="col-9 color-fg-muted my-1 pr-4">Daily description 4</p>
        <div class="f6 color-fg-muted mt-2">
            <span class="d-inline-block ml-0 mr-3">
                <span itemprop="programmingLanguage">Rust</span>
            </span>
        </div>
    </article>
    </body>
    </html>
    """
|
|
||||||
|
@pytest.mark.asyncio
async def test_github_trending_crawler_fetches_all_timeframes(monthly_html, weekly_html, daily_html):
    """One fetch_latest() run must request all three timeframe URLs."""
    crawler = GitHubTrendingCrawler()

    # Map URL markers to the fixture page each request should receive.
    pages = {
        "since=monthly": monthly_html,
        "since=weekly": weekly_html,
        "since=daily": daily_html,
    }

    with patch("requests.get") as mock_get:
        def fake_get(url, **kwargs):
            response = MagicMock()
            response.status_code = 200
            response.text = next(
                (html for marker, html in pages.items() if marker in url), ""
            )
            return response

        mock_get.side_effect = fake_get

        results = await crawler.fetch_latest()

        # Every timeframe endpoint must have been hit exactly as spelled.
        called_urls = [call.args[0] for call in mock_get.call_args_list]
        assert "https://github.com/trending?since=monthly" in called_urls
        assert "https://github.com/trending?since=weekly" in called_urls
        assert "https://github.com/trending?since=daily" in called_urls
|
|
||||||
|
@pytest.mark.asyncio
async def test_github_trending_crawler_parses_html_correctly(daily_html):
    """Title, description, language, and source must be extracted from the page HTML."""
    crawler = GitHubTrendingCrawler()

    with patch("requests.get") as mock_get:
        # Serve the same daily page for every timeframe request; the
        # crawler's dedup keeps one item per repo, which is all we need
        # to verify field extraction.
        response = MagicMock()
        response.status_code = 200
        response.text = daily_html
        mock_get.return_value = response

        results = await crawler.fetch_latest()

    # repo4 carries a distinct description and language to check against.
    repo4 = next((item for item in results if "user/repo4" in item.url), None)
    assert repo4 is not None
    assert repo4.title == "user / repo4"
    assert "Daily description 4" in repo4.content_text
    assert "Rust" in repo4.content_text
    assert repo4.source == "GitHub Trending"
|
|
||||||
|
@pytest.mark.asyncio
async def test_github_trending_crawler_deduplication(monthly_html, weekly_html, daily_html):
    """A repo trending in two timeframes (repo1: monthly + daily) must yield one merged item."""
    crawler = GitHubTrendingCrawler()

    with patch("requests.get") as mock_get:
        def side_effect(url, **kwargs):
            mock_resp = MagicMock()
            mock_resp.status_code = 200
            if "since=monthly" in url:
                mock_resp.text = monthly_html
            elif "since=weekly" in url:
                mock_resp.text = weekly_html
            elif "since=daily" in url:
                mock_resp.text = daily_html
            else:
                # FIX: without this default an unexpected URL left .text as a
                # MagicMock; BeautifulSoup would raise, the crawler would
                # swallow the error and return [], masking the real failure.
                mock_resp.text = ""
            return mock_resp

        mock_get.side_effect = side_effect

        results = await crawler.fetch_latest()

        # repo1 appears in monthly and daily
        repo1_items = [item for item in results if "user/repo1" in item.url]

        # 1. Exactly ONE NewsItemDTO for repo1 despite two appearances.
        assert len(repo1_items) == 1

        # 2. Its content_text (or source) must record both timeframes.
        repo1 = repo1_items[0]
        assert "monthly" in repo1.content_text.lower() or "monthly" in repo1.source.lower()
        assert "daily" in repo1.content_text.lower() or "daily" in repo1.source.lower()
|
|
||||||
|
@pytest.mark.asyncio
async def test_github_trending_crawler_handles_errors():
    """A network failure must yield an empty list, never propagate an exception."""
    crawler = GitHubTrendingCrawler()

    # Every request raises; the crawler's catch-all must absorb it.
    with patch("requests.get", side_effect=Exception("Network error")):
        results = await crawler.fetch_latest()

    assert results == []
||||||
Loading…
x
Reference in New Issue
Block a user