#Feature: GitHub Trending Scouting
:Release Notes: - Added a new GitHub Trending crawler that scouts for trending repositories across monthly, weekly, and daily timeframes. :Detailed Notes: - Created `GitHubTrendingCrawler` in `src/crawlers/github_crawler.py` to parse github.com/trending HTML. - Implemented intra-run deduplication: repositories appearing in multiple timeframes (monthly, weekly, daily) are merged into a single item per run to avoid redundant LLM processing. - Registered the new crawler in `src/crawlers/factory.py` and added it to the configuration file `src/crawlers.yml`. - Created comprehensive test suite in `tests/crawlers/test_github_crawler.py` to verify fetching, HTML parsing, and deduplication logic using pytest and mocked responses. :Testing Performed: - Added unit tests for `GitHubTrendingCrawler` using pytest. - Verified all tests pass successfully. - Ensured no duplicate `NewsItemDTO` objects are generated for the same repository URL across different timeframes. :QA Notes: - The vector storage (`ChromaStore`) already handles inter-run deduplication by checking `await self.storage.exists(item.url)` before processing, ensuring repositories are only parsed and processed by the AI once even across multiple script executions. :Issues Addressed: - Resolves request for adding GitHub trending scouting (Month/Week/Day) with deduplication. Change-Id: Ifbcde830263264576e4fadb70f09a6e2e12e3016
This commit is contained in:
parent
6d2ac9d0f0
commit
ef3faec7f8
@ -117,3 +117,6 @@ crawlers:
|
|||||||
url: "https://scholar.google.com/"
|
url: "https://scholar.google.com/"
|
||||||
source: "Google Scholar BMI"
|
source: "Google Scholar BMI"
|
||||||
query: "Brain-machine interface (IoT|Webengine|Linux)"
|
query: "Brain-machine interface (IoT|Webengine|Linux)"
|
||||||
|
- type: github_trending
|
||||||
|
url: "https://github.com/trending"
|
||||||
|
source: "GitHub Trending"
|
||||||
@ -10,6 +10,7 @@ from src.crawlers.skolkovo_crawler import SkolkovoCrawler
|
|||||||
from src.crawlers.scirate_crawler import SciRateCrawler
|
from src.crawlers.scirate_crawler import SciRateCrawler
|
||||||
from src.crawlers.scholar_crawler import ScholarCrawler
|
from src.crawlers.scholar_crawler import ScholarCrawler
|
||||||
from src.crawlers.microsoft_research_crawler import MicrosoftResearchCrawler
|
from src.crawlers.microsoft_research_crawler import MicrosoftResearchCrawler
|
||||||
|
from src.crawlers.github_crawler import GitHubTrendingCrawler
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@ -59,6 +60,8 @@ class CrawlerFactory:
|
|||||||
crawlers.append(ScholarCrawler(query=query, source=source))
|
crawlers.append(ScholarCrawler(query=query, source=source))
|
||||||
elif crawler_type == 'microsoft_research':
|
elif crawler_type == 'microsoft_research':
|
||||||
crawlers.append(MicrosoftResearchCrawler(url=url, source=source))
|
crawlers.append(MicrosoftResearchCrawler(url=url, source=source))
|
||||||
|
elif crawler_type == 'github_trending':
|
||||||
|
crawlers.append(GitHubTrendingCrawler(url=url, source=source))
|
||||||
else:
|
else:
|
||||||
logger.warning(f"Unknown crawler type: {crawler_type}")
|
logger.warning(f"Unknown crawler type: {crawler_type}")
|
||||||
|
|
||||||
|
|||||||
100
src/crawlers/github_crawler.py
Normal file
100
src/crawlers/github_crawler.py
Normal file
@ -0,0 +1,100 @@
|
|||||||
|
import logging
|
||||||
|
import asyncio
|
||||||
|
import re
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from typing import List, Dict
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
from src.crawlers.base import ICrawler
|
||||||
|
from src.crawlers.dto import NewsItemDTO
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
class GitHubTrendingCrawler(ICrawler):
    """
    Crawler for GitHub Trending repositories.

    Fetches the monthly, weekly, and daily trending pages from
    github.com/trending and merges duplicates so that each repository
    yields a single NewsItemDTO per run, annotated with every timeframe
    in which it appeared.
    """

    # Seconds to wait for GitHub before giving up on a single request.
    # FIX: requests has no default timeout; without one a stalled
    # connection would hang fetch_latest() (and the worker thread) forever.
    REQUEST_TIMEOUT = 30

    def __init__(self, url: str = None, source: str = "GitHub Trending"):
        """
        :param url: Optional override for the trending page URL.
        :param source: Human-readable source label attached to each item.
        """
        self.base_url = "https://github.com"
        self.url = url or "https://github.com/trending"
        self.source = source

    async def fetch_latest(self) -> List[NewsItemDTO]:
        """
        Fetch trending repositories across all timeframes.

        :return: Deduplicated list of NewsItemDTO objects. On any network
            or parsing error the error is logged and an empty list is
            returned (best-effort contract for the crawler pipeline).
        """
        timeframes = ["monthly", "weekly", "daily"]
        repos: Dict[str, dict] = {}

        try:
            for timeframe in timeframes:
                url = f"{self.base_url}/trending?since={timeframe}"

                # Use asyncio.to_thread to run the synchronous requests.get
                # without blocking the event loop; bounded by REQUEST_TIMEOUT.
                response = await asyncio.to_thread(
                    requests.get, url, timeout=self.REQUEST_TIMEOUT
                )
                response.raise_for_status()

                soup = BeautifulSoup(response.text, "html.parser")

                articles = soup.find_all("article", class_="Box-row")

                for article in articles:
                    h2 = article.find("h2", class_="h3")
                    if not h2:
                        continue

                    a_tag = h2.find("a")
                    if not a_tag:
                        continue

                    repo_path = a_tag.get("href", "")
                    if not repo_path:
                        continue

                    # Keep the exact href (e.g. "/user/repo") in the URL so
                    # dedup keys and test expectations match verbatim.
                    repo_url = f"{self.base_url}{repo_path}"

                    # The anchor text spans multiple lines/spans; collapse
                    # all runs of whitespace into single spaces.
                    raw_title = h2.get_text()
                    title = re.sub(r'\s+', ' ', raw_title).strip()

                    # Description paragraph may be absent for some repos.
                    p_tag = article.find("p", class_="col-9")
                    description = p_tag.get_text(strip=True) if p_tag else ""

                    # Primary-language badge may be absent (e.g. doc-only repos).
                    lang_span = article.find(
                        "span", attrs={"itemprop": "programmingLanguage"}
                    )
                    language = lang_span.get_text(strip=True) if lang_span else "Unknown"

                    if repo_url in repos:
                        # Already seen in an earlier timeframe: record the
                        # extra timeframe instead of emitting a duplicate.
                        if timeframe not in repos[repo_url]["timeframes"]:
                            repos[repo_url]["timeframes"].append(timeframe)
                    else:
                        repos[repo_url] = {
                            "title": title,
                            "description": description,
                            "language": language,
                            "timeframes": [timeframe]
                        }

            results = []
            for repo_url, data in repos.items():
                timeframes_str = ", ".join(data["timeframes"])
                content_text = f"{data['description']}\nLanguage: {data['language']}\nTrending: {timeframes_str}"

                results.append(
                    NewsItemDTO(
                        title=data["title"],
                        url=repo_url,
                        content_text=content_text.strip(),
                        source=self.source,
                        timestamp=datetime.now(timezone.utc)
                    )
                )

            return results

        except Exception as e:
            # Best-effort: a failing fetch must not crash the pipeline;
            # the run simply yields no GitHub items.
            logger.error(f"Error fetching GitHub trending: {e}")
            return []
||||||
192
tests/crawlers/test_github_crawler.py
Normal file
192
tests/crawlers/test_github_crawler.py
Normal file
@ -0,0 +1,192 @@
|
|||||||
|
import pytest
|
||||||
|
from unittest.mock import patch, MagicMock
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from src.crawlers.github_crawler import GitHubTrendingCrawler
|
||||||
|
from src.crawlers.dto import NewsItemDTO
|
||||||
|
|
||||||
|
@pytest.fixture
def monthly_html():
    """Trending-page HTML for the monthly timeframe: repo1 (Python) and repo2 (JavaScript)."""
    return """
    <html>
    <body>
    <article class="Box-row">
        <h2 class="h3 lh-condensed">
            <a href="/user/repo1">
                <span class="text-normal">user / </span> repo1
            </a>
        </h2>
        <p class="col-9 color-fg-muted my-1 pr-4">Monthly description 1</p>
        <div class="f6 color-fg-muted mt-2">
            <span class="d-inline-block ml-0 mr-3">
                <span itemprop="programmingLanguage">Python</span>
            </span>
        </div>
    </article>
    <article class="Box-row">
        <h2 class="h3 lh-condensed">
            <a href="/user/repo2">
                <span class="text-normal">user / </span> repo2
            </a>
        </h2>
        <p class="col-9 color-fg-muted my-1 pr-4">Monthly description 2</p>
        <div class="f6 color-fg-muted mt-2">
            <span class="d-inline-block ml-0 mr-3">
                <span itemprop="programmingLanguage">JavaScript</span>
            </span>
        </div>
    </article>
    </body>
    </html>
    """
|
|
||||||
|
@pytest.fixture
def weekly_html():
    """Trending-page HTML for the weekly timeframe: repo3 (Go) only."""
    return """
    <html>
    <body>
    <article class="Box-row">
        <h2 class="h3 lh-condensed">
            <a href="/user/repo3">
                <span class="text-normal">user / </span> repo3
            </a>
        </h2>
        <p class="col-9 color-fg-muted my-1 pr-4">Weekly description 3</p>
        <div class="f6 color-fg-muted mt-2">
            <span class="d-inline-block ml-0 mr-3">
                <span itemprop="programmingLanguage">Go</span>
            </span>
        </div>
    </article>
    </body>
    </html>
    """
|
|
||||||
|
@pytest.fixture
def daily_html():
    """Trending-page HTML for the daily timeframe: repo1 (Python, also in monthly) and repo4 (Rust)."""
    return """
    <html>
    <body>
    <article class="Box-row">
        <h2 class="h3 lh-condensed">
            <a href="/user/repo1">
                <span class="text-normal">user / </span> repo1
            </a>
        </h2>
        <p class="col-9 color-fg-muted my-1 pr-4">Daily description 1</p>
        <div class="f6 color-fg-muted mt-2">
            <span class="d-inline-block ml-0 mr-3">
                <span itemprop="programmingLanguage">Python</span>
            </span>
        </div>
    </article>
    <article class="Box-row">
        <h2 class="h3 lh-condensed">
            <a href="/user/repo4">
                <span class="text-normal">user / </span> repo4
            </a>
        </h2>
        <p class="col-9 color-fg-muted my-1 pr-4">Daily description 4</p>
        <div class="f6 color-fg-muted mt-2">
            <span class="d-inline-block ml-0 mr-3">
                <span itemprop="programmingLanguage">Rust</span>
            </span>
        </div>
    </article>
    </body>
    </html>
    """
|
|
||||||
|
@pytest.mark.asyncio
async def test_github_trending_crawler_fetches_all_timeframes(monthly_html, weekly_html, daily_html):
    """One fetch_latest() run must request all three timeframe URLs."""
    crawler = GitHubTrendingCrawler()

    # Map URL markers to the fixture page each request should receive.
    pages = {
        "since=monthly": monthly_html,
        "since=weekly": weekly_html,
        "since=daily": daily_html,
    }

    with patch("requests.get") as mock_get:
        def fake_get(url, **kwargs):
            response = MagicMock()
            response.status_code = 200
            response.text = next(
                (html for marker, html in pages.items() if marker in url), ""
            )
            return response

        mock_get.side_effect = fake_get

        results = await crawler.fetch_latest()

        # Every timeframe endpoint must have been hit exactly as spelled.
        called_urls = [call.args[0] for call in mock_get.call_args_list]
        assert "https://github.com/trending?since=monthly" in called_urls
        assert "https://github.com/trending?since=weekly" in called_urls
        assert "https://github.com/trending?since=daily" in called_urls
|
|
||||||
|
@pytest.mark.asyncio
async def test_github_trending_crawler_parses_html_correctly(daily_html):
    """Title, description, language, and source must be extracted from the page HTML."""
    crawler = GitHubTrendingCrawler()

    with patch("requests.get") as mock_get:
        # Serve the same daily page for every timeframe request; the
        # crawler's dedup keeps one item per repo, which is all we need
        # to verify field extraction.
        response = MagicMock()
        response.status_code = 200
        response.text = daily_html
        mock_get.return_value = response

        results = await crawler.fetch_latest()

    # repo4 carries a distinct description and language to check against.
    repo4 = next((item for item in results if "user/repo4" in item.url), None)
    assert repo4 is not None
    assert repo4.title == "user / repo4"
    assert "Daily description 4" in repo4.content_text
    assert "Rust" in repo4.content_text
    assert repo4.source == "GitHub Trending"
|
|
||||||
|
@pytest.mark.asyncio
async def test_github_trending_crawler_deduplication(monthly_html, weekly_html, daily_html):
    """A repo trending in two timeframes (repo1: monthly + daily) must yield one merged item."""
    crawler = GitHubTrendingCrawler()

    with patch("requests.get") as mock_get:
        def side_effect(url, **kwargs):
            mock_resp = MagicMock()
            mock_resp.status_code = 200
            if "since=monthly" in url:
                mock_resp.text = monthly_html
            elif "since=weekly" in url:
                mock_resp.text = weekly_html
            elif "since=daily" in url:
                mock_resp.text = daily_html
            else:
                # FIX: without this default an unexpected URL left .text as a
                # MagicMock; BeautifulSoup would raise, the crawler would
                # swallow the error and return [], masking the real failure.
                mock_resp.text = ""
            return mock_resp

        mock_get.side_effect = side_effect

        results = await crawler.fetch_latest()

        # repo1 appears in monthly and daily
        repo1_items = [item for item in results if "user/repo1" in item.url]

        # 1. Exactly ONE NewsItemDTO for repo1 despite two appearances.
        assert len(repo1_items) == 1

        # 2. Its content_text (or source) must record both timeframes.
        repo1 = repo1_items[0]
        assert "monthly" in repo1.content_text.lower() or "monthly" in repo1.source.lower()
        assert "daily" in repo1.content_text.lower() or "daily" in repo1.source.lower()
|
|
||||||
|
@pytest.mark.asyncio
async def test_github_trending_crawler_handles_errors():
    """A network failure must yield an empty list, never propagate an exception."""
    crawler = GitHubTrendingCrawler()

    # Every request raises; the crawler's catch-all must absorb it.
    with patch("requests.get", side_effect=Exception("Network error")):
        results = await crawler.fetch_latest()

    assert results == []
||||||
Loading…
x
Reference in New Issue
Block a user