+ + user / repo1 + +
+Monthly description 1
+diff --git a/src/crawlers.yml b/src/crawlers.yml index c0d8ee0..a5813d7 100644 --- a/src/crawlers.yml +++ b/src/crawlers.yml @@ -116,4 +116,7 @@ crawlers: - type: scholar url: "https://scholar.google.com/" source: "Google Scholar BMI" - query: "Brain-machine interface (IoT|Webengine|Linux)" \ No newline at end of file + query: "Brain-machine interface (IoT|Webengine|Linux)" + - type: github_trending + url: "https://github.com/trending" + source: "GitHub Trending" \ No newline at end of file diff --git a/src/crawlers/factory.py b/src/crawlers/factory.py index 67285fa..51e84b6 100644 --- a/src/crawlers/factory.py +++ b/src/crawlers/factory.py @@ -10,6 +10,7 @@ from src.crawlers.skolkovo_crawler import SkolkovoCrawler from src.crawlers.scirate_crawler import SciRateCrawler from src.crawlers.scholar_crawler import ScholarCrawler from src.crawlers.microsoft_research_crawler import MicrosoftResearchCrawler +from src.crawlers.github_crawler import GitHubTrendingCrawler logger = logging.getLogger(__name__) @@ -59,6 +60,8 @@ class CrawlerFactory: crawlers.append(ScholarCrawler(query=query, source=source)) elif crawler_type == 'microsoft_research': crawlers.append(MicrosoftResearchCrawler(url=url, source=source)) + elif crawler_type == 'github_trending': + crawlers.append(GitHubTrendingCrawler(url=url, source=source)) else: logger.warning(f"Unknown crawler type: {crawler_type}")
diff --git a/src/crawlers/github_crawler.py b/src/crawlers/github_crawler.py
new file mode 100644
index 0000000..966b29f
--- /dev/null
+++ b/src/crawlers/github_crawler.py
@@ -0,0 +1,132 @@
+import logging
+import asyncio
+import re
+from datetime import datetime, timezone
+from typing import Dict, Iterator, List, Optional, Tuple
+
+import requests
+from bs4 import BeautifulSoup
+
+from src.crawlers.base import ICrawler
+from src.crawlers.dto import NewsItemDTO
+
+logger = logging.getLogger(__name__)
+
+
+class GitHubTrendingCrawler(ICrawler):
+    """
+    Crawler for GitHub Trending repositories.
+
+    Fetches the monthly, weekly, and daily trending pages and merges them so
+    that each repository is reported once, tagged with every timeframe in
+    which it appeared.
+    """
+
+    # Values sent as the ``since`` query parameter, in request order.
+    TIMEFRAMES = ("monthly", "weekly", "daily")
+    # Seconds to wait on each HTTP request. requests applies no timeout by
+    # default, so without one a stalled connection would never return.
+    REQUEST_TIMEOUT = 30
+
+    def __init__(self, url: Optional[str] = None, source: str = "GitHub Trending"):
+        """
+        Args:
+            url: Trending page to crawl; falls back to
+                https://github.com/trending when empty or None.
+            source: Source label passed to every NewsItemDTO produced.
+        """
+        self.base_url = "https://github.com"
+        self.url = url or "https://github.com/trending"
+        self.source = source
+
+    async def fetch_latest(self) -> List[NewsItemDTO]:
+        """
+        Fetch every timeframe and return one item per distinct repository.
+
+        Returns:
+            One NewsItemDTO per repository URL, in first-seen order. Any
+            exception is logged and an empty list is returned instead.
+        """
+        repos: Dict[str, dict] = {}
+        # Extend an existing query string rather than starting a second one.
+        separator = "&" if "?" in self.url else "?"
+
+        try:
+            for timeframe in self.TIMEFRAMES:
+                page_url = f"{self.url}{separator}since={timeframe}"
+
+                # requests is synchronous; run it in a worker thread so the
+                # event loop is not blocked while waiting on the network.
+                response = await asyncio.to_thread(
+                    requests.get, page_url, timeout=self.REQUEST_TIMEOUT
+                )
+                response.raise_for_status()
+
+                for repo_url, title, description, language in self._parse_repos(
+                    response.text
+                ):
+                    # First sighting wins for title/description/language;
+                    # later timeframes only extend the timeframe list.
+                    entry = repos.setdefault(
+                        repo_url,
+                        {
+                            "title": title,
+                            "description": description,
+                            "language": language,
+                            "timeframes": [],
+                        },
+                    )
+                    if timeframe not in entry["timeframes"]:
+                        entry["timeframes"].append(timeframe)
+
+            fetched_at = datetime.now(timezone.utc)
+            results = []
+            for repo_url, data in repos.items():
+                timeframes_str = ", ".join(data["timeframes"])
+                content_text = (
+                    f"{data['description']}\n"
+                    f"Language: {data['language']}\n"
+                    f"Trending: {timeframes_str}"
+                )
+                results.append(
+                    NewsItemDTO(
+                        title=data["title"],
+                        url=repo_url,
+                        content_text=content_text.strip(),
+                        source=self.source,
+                        timestamp=fetched_at,
+                    )
+                )
+            return results
+
+        except Exception as e:
+            logger.error(f"Error fetching GitHub trending: {e}")
+            return []
+
+    def _parse_repos(self, html: str) -> Iterator[Tuple[str, str, str, str]]:
+        """Yield (repo_url, title, description, language) per repository row."""
+        soup = BeautifulSoup(html, "html.parser")
+        for article in soup.find_all("article", class_="Box-row"):
+            h2 = article.find("h2", class_="h3")
+            a_tag = h2.find("a") if h2 else None
+            repo_path = a_tag.get("href", "") if a_tag else ""
+            if not repo_path:
+                # Row without a usable repository link; skip it.
+                continue
+
+            # The href is appended unchanged to the GitHub host, so the
+            # resulting URL always contains the exact path from the page.
+            repo_url = f"{self.base_url}{repo_path}"
+
+            # Collapse the whitespace runs inside the heading text.
+            title = re.sub(r"\s+", " ", h2.get_text()).strip()
+
+            p_tag = article.find("p", class_="col-9")
+            description = p_tag.get_text(strip=True) if p_tag else ""
+
+            lang_span = article.find(
+                "span", attrs={"itemprop": "programmingLanguage"}
+            )
+            language = lang_span.get_text(strip=True) if lang_span else "Unknown"
+
+            yield repo_url, title, description, language
diff --git a/tests/crawlers/test_github_crawler.py b/tests/crawlers/test_github_crawler.py new file mode 100644 index 0000000..169b998 --- /dev/null +++ b/tests/crawlers/test_github_crawler.py @@ -0,0 +1,192 @@ +import pytest +from unittest.mock import patch, MagicMock +from datetime import datetime, timezone +from src.crawlers.github_crawler import GitHubTrendingCrawler +from src.crawlers.dto import NewsItemDTO + +@pytest.fixture +def monthly_html(): + return """ + +
+Monthly description 1
+Monthly description 2
+Weekly description 3
+Daily description 1
+Daily description 4
+