Compare commits

...

3 Commits

Author SHA1 Message Date
Artur Mukhamadiev
ef3faec7f8 Feature: GitHub Trending Scouting
:Release Notes:
- Added a new GitHub Trending crawler that scouts for trending repositories across monthly, weekly, and daily timeframes.

:Detailed Notes:
- Created `GitHubTrendingCrawler` in `src/crawlers/github_crawler.py` to parse github.com/trending HTML.
- Implemented intra-run deduplication: repositories appearing in multiple timeframes (monthly, weekly, daily) are merged into a single item per run to avoid redundant LLM processing.
- Registered the new crawler in `src/crawlers/factory.py` and added it to the configuration file `src/crawlers.yml`.
- Created comprehensive test suite in `tests/crawlers/test_github_crawler.py` to verify fetching, HTML parsing, and deduplication logic using pytest and mocked responses.

:Testing Performed:
- Added unit tests for `GitHubTrendingCrawler` using pytest.
- Verified all tests pass successfully.
- Ensured no duplicate `NewsItemDTO` objects are generated for the same repository URL across different timeframes.

:QA Notes:
- The vector storage (`ChromaStore`) already handles inter-run deduplication by checking `await self.storage.exists(item.url)` before processing, ensuring repositories are only parsed and processed by the AI once even across multiple script executions.

:Issues Addressed:
- Resolves request for adding GitHub trending scouting (Month/Week/Day) with deduplication.

Change-Id: Ifbcde830263264576e4fadb70f09a6e2e12e3016
2026-03-19 21:35:51 +03:00
Artur Mukhamadiev
6d2ac9d0f0 Feature: Filter out sources older than 5 years in Google Scholar Crawler
:Release Notes:
- Updated the Google Scholar crawler to automatically filter out results older than 5 years to ensure recent content.

:Detailed Notes:
- Appended `&as_ylo={current_year - 5}` to the search URL in `src/crawlers/scholar_crawler.py` by dynamically calculating the current year via Python's `datetime`.
- Added a new unit test `test_scholar_crawler_url_year_filter` to `tests/crawlers/test_scholar_crawler.py` to verify URL construction.

:Testing Performed:
- Evaluated the crawler test suite and validated that the expected year boundary is properly formatted into the requested URL.
- All 91 automated pytest cases complete successfully.

:QA Notes:
- Verified parameter insertion ensures Google limits queries correctly at the search engine level.

:Issues Addressed:
- Resolves issue where Scholar would return deprecated sources (2005, 2008).

Change-Id: I56ae2fd7369d61494d17520238c3ef66e14436c7
2026-03-19 14:57:33 +03:00
Artur Mukhamadiev
e1c7f47f8f Feature: Add /get_hottest command for exporting top trends
:Release Notes:
- Added a new Telegram command `/get_hottest <number> [format]` to export the top `N` trends as a CSV or Markdown file.

:Detailed Notes:
- Created `ITrendExporter` interface and concrete `CsvTrendExporter` and `MarkdownTrendExporter` implementations for formatting DTOs.
- Updated `src/bot/handlers.py` to include `command_get_hottest_handler` mapping to `/get_hottest`.
- Used `BufferedInputFile` to stream generated files asynchronously directly to Telegram without disk I/O.
- Fixed unrelated pipeline test failures regarding `EphemeralClient` usage with ChromaDB.

:Testing Performed:
- Implemented TDD with `pytest` for parsing parameters, exporting logic, and handling empty DB scenarios.
- Ran the full test suite (90 tests) which completed successfully.

:QA Notes:
- Fully covered the new handler using `pytest-asyncio` and `aiogram` mocked objects.

:Issues Addressed:
- Resolves request to export high-relevance parsed entries.

Change-Id: I25dd90f1e4491ba298682518d835259bffab4190
2026-03-19 14:53:20 +03:00
12 changed files with 682 additions and 9 deletions

73
src/bot/exporters.py Normal file
View File

@ -0,0 +1,73 @@
import abc
import csv
import io
from typing import List
from src.processor.dto import EnrichedNewsItemDTO
class ITrendExporter(abc.ABC):
    """Contract for serializing enriched trend DTOs into a downloadable file."""

    @abc.abstractmethod
    async def export(self, trends: List[EnrichedNewsItemDTO]) -> bytes:
        """Serialize *trends* and return the file contents as bytes."""
        ...
class CsvTrendExporter(ITrendExporter):
    """Render a list of trends as a UTF-8 encoded CSV document."""

    async def export(self, trends: List[EnrichedNewsItemDTO]) -> bytes:
        """Return CSV bytes: one header row, then one row per trend."""
        buffer = io.StringIO()
        writer = csv.writer(buffer)
        writer.writerow([
            "Relevance Score",
            "Name",
            "Link",
            "Category",
            "AI Description",
            "Anomalies Detected",
        ])
        # One row per DTO; the anomalies list is flattened to a single cell.
        writer.writerows(
            [
                item.relevance_score,
                item.title,
                item.url,
                item.category,
                item.summary_ru,
                ", ".join(item.anomalies_detected) if item.anomalies_detected else "",
            ]
            for item in trends
        )
        return buffer.getvalue().encode('utf-8')
class MarkdownTrendExporter(ITrendExporter):
    """Render a list of trends as a Markdown pipe table."""

    async def export(self, trends: List[EnrichedNewsItemDTO]) -> bytes:
        """Return Markdown bytes: header row, separator row, one row per trend."""
        column_names = [
            "Relevance Score",
            "Name",
            "Link",
            "Category",
            "AI Description",
            "Anomalies Detected",
        ]

        def render(cells: List[str]) -> str:
            # Escape pipes and flatten newlines so a cell cannot break the table.
            safe = [str(cell).replace('|', '\\|').replace('\n', ' ') for cell in cells]
            return "| " + " | ".join(safe) + " |\n"

        parts = [render(column_names), render(["---"] * len(column_names))]
        for item in trends:
            parts.append(render([
                str(item.relevance_score),
                item.title,
                item.url,
                item.category,
                item.summary_ru,
                ", ".join(item.anomalies_detected) if item.anomalies_detected else "",
            ]))
        return "".join(parts).encode('utf-8')

View File

@ -5,13 +5,14 @@ from typing import Optional, Callable, Dict, Any, Awaitable
from aiogram import Router, BaseMiddleware, F
from aiogram.filters import CommandStart, Command, CommandObject
from aiogram.types import Message, TelegramObject, InlineKeyboardButton, InlineKeyboardMarkup, CallbackQuery
from aiogram.types import Message, TelegramObject, InlineKeyboardButton, InlineKeyboardMarkup, CallbackQuery, BufferedInputFile
from aiogram.utils.keyboard import InlineKeyboardBuilder
from aiogram.utils.formatting import as_list, as_marked_section, Bold, TextLink
from src.processor.dto import EnrichedNewsItemDTO
from src.processor.base import ILLMProvider
from src.storage.base import IVectorStore
from src.bot.exporters import CsvTrendExporter, MarkdownTrendExporter
class AccessMiddleware(BaseMiddleware):
def __init__(self, allowed_chat_id: str):
@ -140,6 +141,46 @@ def get_router(storage: IVectorStore, processor: ILLMProvider, allowed_chat_id:
await message.answer(f"Top {len(items)} Hottest Trends:", reply_markup=builder.as_markup())
@router.message(Command("get_hottest"))
async def command_get_hottest_handler(message: Message, command: CommandObject) -> None:
    """
    Handle `/get_hottest <number> [format]`.

    Exports the top N ranked trends as a CSV (default) or Markdown ("md")
    document and sends it back as a Telegram file attachment.
    """
    limit = 10
    file_format = "csv"
    if command.args and command.args.strip():
        parts = command.args.strip().split()
        try:
            limit = int(parts[0])
        except ValueError:
            await message.answer("Please provide a valid number, e.g., /get_hottest 10")
            return
        if len(parts) > 1:
            file_format = parts[1].lower()
    # Clamp the requested size to a sane range: a zero/negative limit would
    # otherwise be passed straight to the storage layer, and very large
    # exports are capped at 50.
    limit = max(1, min(limit, 50))
    items = await storage.get_top_ranked(limit=limit)
    if not items:
        await message.answer("No hot trends found yet.")
        return
    if file_format == "md":
        exporter = MarkdownTrendExporter()
        filename = "hottest_trends.md"
    else:
        # Any unrecognized format falls back to CSV.
        exporter = CsvTrendExporter()
        filename = "hottest_trends.csv"
    file_bytes = await exporter.export(items)
    # BufferedInputFile streams the bytes to Telegram without touching disk.
    document = BufferedInputFile(file_bytes, filename=filename)
    await message.answer_document(document=document, caption=f"🔥 Top {len(items)} hottest trends!")
@router.message(Command("search"))
async def command_search_handler(message: Message, command: CommandObject) -> None:
"""

View File

@ -117,3 +117,6 @@ crawlers:
url: "https://scholar.google.com/"
source: "Google Scholar BMI"
query: "Brain-machine interface (IoT|Webengine|Linux)"
- type: github_trending
url: "https://github.com/trending"
source: "GitHub Trending"

View File

@ -10,6 +10,7 @@ from src.crawlers.skolkovo_crawler import SkolkovoCrawler
from src.crawlers.scirate_crawler import SciRateCrawler
from src.crawlers.scholar_crawler import ScholarCrawler
from src.crawlers.microsoft_research_crawler import MicrosoftResearchCrawler
from src.crawlers.github_crawler import GitHubTrendingCrawler
logger = logging.getLogger(__name__)
@ -59,6 +60,8 @@ class CrawlerFactory:
crawlers.append(ScholarCrawler(query=query, source=source))
elif crawler_type == 'microsoft_research':
crawlers.append(MicrosoftResearchCrawler(url=url, source=source))
elif crawler_type == 'github_trending':
crawlers.append(GitHubTrendingCrawler(url=url, source=source))
else:
logger.warning(f"Unknown crawler type: {crawler_type}")

View File

@ -0,0 +1,100 @@
import logging
import asyncio
import re
from datetime import datetime, timezone
from typing import List, Dict
import requests
from bs4 import BeautifulSoup
from src.crawlers.base import ICrawler
from src.crawlers.dto import NewsItemDTO
logger = logging.getLogger(__name__)
class GitHubTrendingCrawler(ICrawler):
    """
    Crawler for GitHub Trending repositories.

    Fetches the monthly, weekly, and daily trending pages and merges
    repositories that appear in several timeframes into a single item
    (intra-run deduplication keyed by repository URL).
    """

    # Seconds to wait for github.com before giving up on one timeframe.
    # The original code called requests.get without a timeout, which can
    # hang the whole crawl run indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self, url: str = None, source: str = "GitHub Trending"):
        self.base_url = "https://github.com"
        self.url = url or "https://github.com/trending"
        self.source = source

    async def fetch_latest(self) -> List[NewsItemDTO]:
        """Return deduplicated NewsItemDTOs for all three trending timeframes."""
        timeframes = ["monthly", "weekly", "daily"]
        repos: Dict[str, dict] = {}
        for timeframe in timeframes:
            url = f"{self.base_url}/trending?since={timeframe}"
            try:
                # requests is synchronous; run it in a worker thread so the
                # event loop is not blocked.
                response = await asyncio.to_thread(
                    requests.get, url, timeout=self.REQUEST_TIMEOUT
                )
                response.raise_for_status()
                self._merge_page(response.text, timeframe, repos)
            except Exception as e:
                # One failing timeframe should not discard results already
                # gathered from the other timeframes.
                logger.error(f"Error fetching GitHub trending: {e}")
                continue
        return self._build_items(repos)

    def _merge_page(self, html: str, timeframe: str, repos: Dict[str, dict]) -> None:
        """Parse one trending page and merge its repositories into *repos*."""
        soup = BeautifulSoup(html, "html.parser")
        for article in soup.find_all("article", class_="Box-row"):
            h2 = article.find("h2", class_="h3")
            if not h2:
                continue
            a_tag = h2.find("a")
            if not a_tag:
                continue
            repo_path = a_tag.get("href", "")
            if not repo_path:
                continue
            repo_url = f"{self.base_url}{repo_path}"
            # Collapse the multi-line anchor text into "owner / name".
            title = re.sub(r'\s+', ' ', h2.get_text()).strip()
            p_tag = article.find("p", class_="col-9")
            description = p_tag.get_text(strip=True) if p_tag else ""
            lang_span = article.find("span", attrs={"itemprop": "programmingLanguage"})
            language = lang_span.get_text(strip=True) if lang_span else "Unknown"
            if repo_url in repos:
                # Intra-run deduplication: just record the extra timeframe.
                if timeframe not in repos[repo_url]["timeframes"]:
                    repos[repo_url]["timeframes"].append(timeframe)
            else:
                repos[repo_url] = {
                    "title": title,
                    "description": description,
                    "language": language,
                    "timeframes": [timeframe],
                }

    def _build_items(self, repos: Dict[str, dict]) -> List[NewsItemDTO]:
        """Convert the merged repository map into NewsItemDTO objects."""
        results = []
        for repo_url, data in repos.items():
            timeframes_str = ", ".join(data["timeframes"])
            content_text = (
                f"{data['description']}\nLanguage: {data['language']}\n"
                f"Trending: {timeframes_str}"
            )
            results.append(
                NewsItemDTO(
                    title=data["title"],
                    url=repo_url,
                    content_text=content_text.strip(),
                    source=self.source,
                    timestamp=datetime.now(timezone.utc),
                )
            )
        return results

View File

@ -13,8 +13,9 @@ logger = logging.getLogger(__name__)
class ScholarCrawler(ICrawler):
def __init__(self, query: str = "Artificial Intelligence", source: str = "Google Scholar"):
"""Store the query/source and build the Scholar search URL (last 5 years only)."""
self.query = query
# Lower bound for publication year: only the last 5 calendar years are kept.
current_year = datetime.now().year
# Google Scholar query URL
# NOTE(review): the next two assignments are the old/new lines of a diff view;
# the second one (with `as_ylo`) is the effective URL in the updated file.
self.url = f"https://scholar.google.com/scholar?hl=en&q={query.replace(' ', '+')}"
self.url = f"https://scholar.google.com/scholar?hl=en&q={query.replace(' ', '+')}&as_ylo={current_year - 5}"
self.source = source
async def fetch_latest(self) -> List[NewsItemDTO]:

View File

@ -0,0 +1,81 @@
import pytest
from datetime import datetime
from src.processor.dto import EnrichedNewsItemDTO
from src.bot.exporters import CsvTrendExporter, MarkdownTrendExporter
@pytest.fixture
def dummy_trends() -> list[EnrichedNewsItemDTO]:
    """Two enriched items: one with anomalies, one without."""
    quantum = EnrichedNewsItemDTO(
        title="Breakthrough in Quantum Computing",
        url="https://example.com/quantum",
        content_text="Scientists achieve a major milestone...",
        source="TechNews",
        timestamp=datetime(2023, 10, 27, 12, 0),
        relevance_score=9,
        summary_ru="Прорыв в квантовых вычислениях...",
        anomalies_detected=["Quantum Supremacy", "New Qubit Design"],
        category="Quantum Computing",
    )
    ai = EnrichedNewsItemDTO(
        title="New AI Model Released",
        url="https://example.com/ai",
        content_text="A new AI model has been released...",
        source="AITimes",
        timestamp=datetime(2023, 10, 27, 13, 0),
        relevance_score=8,
        summary_ru="Выпущен новый ИИ...",
        anomalies_detected=[],
        category="Artificial Intelligence",
    )
    return [quantum, ai]
@pytest.mark.asyncio
async def test_csv_trend_exporter(dummy_trends):
    """CSV export: header plus one row per trend, UTF-8 encoded."""
    exporter = CsvTrendExporter()
    payload = await exporter.export(dummy_trends)
    assert isinstance(payload, bytes)

    rows = payload.decode('utf-8').strip().split('\r\n')
    assert len(rows) == 3  # header + 2 data rows
    assert rows[0] == "Relevance Score,Name,Link,Category,AI Description,Anomalies Detected"

    first, second = rows[1], rows[2]
    for fragment in (
        "9",
        "Breakthrough in Quantum Computing",
        "https://example.com/quantum",
        "Quantum Computing",
        "Прорыв в квантовых вычислениях...",
        # csv quotes the comma-containing anomalies field
        '"Quantum Supremacy, New Qubit Design"',
    ):
        assert fragment in first
    for fragment in (
        "8",
        "New AI Model Released",
        "https://example.com/ai",
        "Artificial Intelligence",
        "Выпущен новый ИИ...",
    ):
        assert fragment in second
    assert "AITimes" not in second  # source column is not exported
@pytest.mark.asyncio
async def test_markdown_trend_exporter(dummy_trends):
    """Markdown export: header row, separator row, then the data rows."""
    exporter = MarkdownTrendExporter()
    payload = await exporter.export(dummy_trends)
    assert isinstance(payload, bytes)

    lines = payload.decode('utf-8').strip().split('\n')
    assert len(lines) == 4  # header + separator + 2 rows
    assert lines[0] == "| Relevance Score | Name | Link | Category | AI Description | Anomalies Detected |"
    assert lines[1] == "| --- | --- | --- | --- | --- | --- |"
    assert lines[2] == (
        "| 9 | Breakthrough in Quantum Computing | https://example.com/quantum "
        "| Quantum Computing | Прорыв в квантовых вычислениях... "
        "| Quantum Supremacy, New Qubit Design |"
    )
    assert lines[3] == (
        "| 8 | New AI Model Released | https://example.com/ai "
        "| Artificial Intelligence | Выпущен новый ИИ... | |"
    )

View File

@ -0,0 +1,170 @@
import pytest
from unittest.mock import AsyncMock, MagicMock, patch
from aiogram.types import Message, BufferedInputFile
from aiogram.filters import CommandObject
from datetime import datetime
from src.bot.handlers import get_router
from src.processor.dto import EnrichedNewsItemDTO
@pytest.fixture
def mock_storage():
    """Async mock standing in for the vector-store dependency."""
    return AsyncMock()
@pytest.fixture
def mock_processor():
    """Mock LLM provider whose `get_info` reports a test model name."""
    provider = MagicMock()
    provider.get_info.return_value = {"model": "test-model"}
    return provider
@pytest.fixture
def allowed_chat_id():
    """Chat id granted access in the router under test."""
    return "123456789"
@pytest.fixture
def router(mock_storage, mock_processor, allowed_chat_id):
    """Router wired with the mocked storage/processor for handler lookup."""
    return get_router(mock_storage, mock_processor, allowed_chat_id)
def get_handler(router, callback_name):
    """Return the registered message-handler callback with the given name."""
    found = next(
        (h.callback for h in router.message.handlers
         if h.callback.__name__ == callback_name),
        None,
    )
    if found is None:
        raise ValueError(f"Handler {callback_name} not found")
    return found
@pytest.fixture
def mock_items():
    """Three enriched items with descending relevance scores (10, 9, 8)."""
    return [
        EnrichedNewsItemDTO(
            title=f"Hot News {i}",
            url=f"https://example.com/{i}",
            content_text=f"Content {i}",
            source="Source",
            timestamp=datetime.now(),
            relevance_score=10 - i,
            summary_ru=f"Сводка {i}",
            anomalies_detected=[],
            category="Tech",
        )
        for i in range(3)
    ]
@pytest.mark.asyncio
async def test_command_get_hottest_handler_no_args(router, mock_storage, allowed_chat_id, mock_items):
    """No arguments: defaults to limit=10 and CSV format."""
    handler = get_handler(router, "command_get_hottest_handler")
    message = AsyncMock()
    message.chat = MagicMock()
    message.chat.id = int(allowed_chat_id)
    mock_storage.get_top_ranked.return_value = mock_items

    command = CommandObject(prefix='/', command='get_hottest', args=None)
    with patch("src.bot.handlers.CsvTrendExporter") as csv_exporter_cls:
        exporter_instance = AsyncMock()
        exporter_instance.export.return_value = b"csv data"
        csv_exporter_cls.return_value = exporter_instance
        await handler(message=message, command=command)

    mock_storage.get_top_ranked.assert_called_once_with(limit=10)
    message.answer_document.assert_called_once()
    _, kwargs = message.answer_document.call_args
    assert "document" in kwargs
    assert isinstance(kwargs["document"], BufferedInputFile)
    assert kwargs["document"].filename == "hottest_trends.csv"
    assert kwargs["caption"] == "🔥 Top 3 hottest trends!"
@pytest.mark.asyncio
async def test_command_get_hottest_handler_invalid_limit(router, mock_storage, allowed_chat_id):
    """A non-numeric limit is rejected with a usage hint; storage untouched."""
    handler = get_handler(router, "command_get_hottest_handler")
    message = AsyncMock()
    message.chat = MagicMock()
    message.chat.id = int(allowed_chat_id)

    command = CommandObject(prefix='/', command='get_hottest', args='abc')
    await handler(message=message, command=command)

    message.answer.assert_called_once_with("Please provide a valid number, e.g., /get_hottest 10")
    mock_storage.get_top_ranked.assert_not_called()
@pytest.mark.asyncio
async def test_command_get_hottest_handler_capped_limit(router, mock_storage, allowed_chat_id, mock_items):
    """Requested limits above 50 are capped at 50."""
    handler = get_handler(router, "command_get_hottest_handler")
    message = AsyncMock()
    message.chat = MagicMock()
    message.chat.id = int(allowed_chat_id)
    mock_storage.get_top_ranked.return_value = mock_items

    command = CommandObject(prefix='/', command='get_hottest', args='100')
    await handler(message=message, command=command)

    mock_storage.get_top_ranked.assert_called_once_with(limit=50)
@pytest.mark.asyncio
async def test_command_get_hottest_handler_custom_limit_md(router, mock_storage, allowed_chat_id, mock_items):
    """Explicit limit plus 'md' format selects the Markdown exporter."""
    handler = get_handler(router, "command_get_hottest_handler")
    message = AsyncMock()
    message.chat = MagicMock()
    message.chat.id = int(allowed_chat_id)
    mock_storage.get_top_ranked.return_value = mock_items

    command = CommandObject(prefix='/', command='get_hottest', args='5 md')
    with patch("src.bot.handlers.MarkdownTrendExporter") as md_exporter_cls:
        exporter_instance = AsyncMock()
        exporter_instance.export.return_value = b"md data"
        md_exporter_cls.return_value = exporter_instance
        await handler(message=message, command=command)

    mock_storage.get_top_ranked.assert_called_once_with(limit=5)
    message.answer_document.assert_called_once()
    _, kwargs = message.answer_document.call_args
    assert kwargs["document"].filename == "hottest_trends.md"
    assert kwargs["caption"] == "🔥 Top 3 hottest trends!"
@pytest.mark.asyncio
async def test_command_get_hottest_handler_no_records(router, mock_storage, allowed_chat_id):
    """An empty store answers with a friendly message and sends no document."""
    handler = get_handler(router, "command_get_hottest_handler")
    message = AsyncMock()
    message.chat = MagicMock()
    message.chat.id = int(allowed_chat_id)
    mock_storage.get_top_ranked.return_value = []

    command = CommandObject(prefix='/', command='get_hottest', args=None)
    await handler(message=message, command=command)

    message.answer.assert_called_once_with("No hot trends found yet.")
    message.answer_document.assert_not_called()

View File

@ -0,0 +1,192 @@
import pytest
from unittest.mock import patch, MagicMock
from datetime import datetime, timezone
from src.crawlers.github_crawler import GitHubTrendingCrawler
from src.crawlers.dto import NewsItemDTO
@pytest.fixture
def monthly_html():
    """Monthly trending page markup: repo1 (Python) and repo2 (JavaScript)."""
    return """
<html>
<body>
<article class="Box-row">
<h2 class="h3 lh-condensed">
<a href="/user/repo1">
<span class="text-normal">user / </span> repo1
</a>
</h2>
<p class="col-9 color-fg-muted my-1 pr-4">Monthly description 1</p>
<div class="f6 color-fg-muted mt-2">
<span class="d-inline-block ml-0 mr-3">
<span itemprop="programmingLanguage">Python</span>
</span>
</div>
</article>
<article class="Box-row">
<h2 class="h3 lh-condensed">
<a href="/user/repo2">
<span class="text-normal">user / </span> repo2
</a>
</h2>
<p class="col-9 color-fg-muted my-1 pr-4">Monthly description 2</p>
<div class="f6 color-fg-muted mt-2">
<span class="d-inline-block ml-0 mr-3">
<span itemprop="programmingLanguage">JavaScript</span>
</span>
</div>
</article>
</body>
</html>
"""
@pytest.fixture
def weekly_html():
    """Weekly trending page markup: only repo3 (Go)."""
    return """
<html>
<body>
<article class="Box-row">
<h2 class="h3 lh-condensed">
<a href="/user/repo3">
<span class="text-normal">user / </span> repo3
</a>
</h2>
<p class="col-9 color-fg-muted my-1 pr-4">Weekly description 3</p>
<div class="f6 color-fg-muted mt-2">
<span class="d-inline-block ml-0 mr-3">
<span itemprop="programmingLanguage">Go</span>
</span>
</div>
</article>
</body>
</html>
"""
@pytest.fixture
def daily_html():
    """Daily trending page markup: repo1 again (overlap) plus repo4 (Rust)."""
    return """
<html>
<body>
<article class="Box-row">
<h2 class="h3 lh-condensed">
<a href="/user/repo1">
<span class="text-normal">user / </span> repo1
</a>
</h2>
<p class="col-9 color-fg-muted my-1 pr-4">Daily description 1</p>
<div class="f6 color-fg-muted mt-2">
<span class="d-inline-block ml-0 mr-3">
<span itemprop="programmingLanguage">Python</span>
</span>
</div>
</article>
<article class="Box-row">
<h2 class="h3 lh-condensed">
<a href="/user/repo4">
<span class="text-normal">user / </span> repo4
</a>
</h2>
<p class="col-9 color-fg-muted my-1 pr-4">Daily description 4</p>
<div class="f6 color-fg-muted mt-2">
<span class="d-inline-block ml-0 mr-3">
<span itemprop="programmingLanguage">Rust</span>
</span>
</div>
</article>
</body>
</html>
"""
@pytest.mark.asyncio
async def test_github_trending_crawler_fetches_all_timeframes(monthly_html, weekly_html, daily_html):
    """All three timeframe URLs are fetched and the pages are merged."""
    crawler = GitHubTrendingCrawler()
    with patch("requests.get") as mock_get:
        # Serve a different fixture page depending on the requested timeframe.
        def side_effect(url, **kwargs):
            mock_resp = MagicMock()
            mock_resp.status_code = 200
            if "since=monthly" in url:
                mock_resp.text = monthly_html
            elif "since=weekly" in url:
                mock_resp.text = weekly_html
            elif "since=daily" in url:
                mock_resp.text = daily_html
            else:
                mock_resp.text = ""
            return mock_resp

        mock_get.side_effect = side_effect
        results = await crawler.fetch_latest()

        # Verify it called all three URLs
        called_urls = [call.args[0] for call in mock_get.call_args_list]
        assert "https://github.com/trending?since=monthly" in called_urls
        assert "https://github.com/trending?since=weekly" in called_urls
        assert "https://github.com/trending?since=daily" in called_urls
        # `results` was previously computed but never asserted: the fixtures
        # contain repo1 (monthly+daily, deduplicated), repo2, repo3 and repo4.
        assert len(results) == 4
@pytest.mark.asyncio
async def test_github_trending_crawler_parses_html_correctly(daily_html):
    """Title, description, language and source are extracted from the HTML."""
    crawler = GitHubTrendingCrawler()
    with patch("requests.get") as mock_get:
        mock_resp = MagicMock()
        mock_resp.status_code = 200
        mock_resp.text = daily_html
        mock_get.return_value = mock_resp

        # Every timeframe request gets the same daily page; parsing is what
        # this test verifies, not the per-timeframe routing.
        results = await crawler.fetch_latest()

    repo4 = next((item for item in results if "user/repo4" in item.url), None)
    assert repo4 is not None
    assert repo4.title == "user / repo4"
    assert "Daily description 4" in repo4.content_text
    assert "Rust" in repo4.content_text
    assert repo4.source == "GitHub Trending"
@pytest.mark.asyncio
async def test_github_trending_crawler_deduplication(monthly_html, weekly_html, daily_html):
    """A repo trending in several timeframes yields exactly one merged item."""
    crawler = GitHubTrendingCrawler()
    with patch("requests.get") as mock_get:
        pages = {"monthly": monthly_html, "weekly": weekly_html, "daily": daily_html}

        def side_effect(url, **kwargs):
            mock_resp = MagicMock()
            mock_resp.status_code = 200
            for timeframe, html in pages.items():
                if f"since={timeframe}" in url:
                    mock_resp.text = html
            return mock_resp

        mock_get.side_effect = side_effect
        results = await crawler.fetch_latest()

    # repo1 appears on both the monthly and daily fixture pages.
    repo1_items = [item for item in results if "user/repo1" in item.url]
    assert len(repo1_items) == 1
    merged = repo1_items[0]
    # The merged item must record both timeframes it trended in.
    assert "monthly" in merged.content_text.lower() or "monthly" in merged.source.lower()
    assert "daily" in merged.content_text.lower() or "daily" in merged.source.lower()
@pytest.mark.asyncio
async def test_github_trending_crawler_handles_errors():
    """Network failures are swallowed and an empty list is returned."""
    crawler = GitHubTrendingCrawler()
    with patch("requests.get", side_effect=Exception("Network error")):
        assert await crawler.fetch_latest() == []

View File

@ -113,3 +113,14 @@ async def test_scholar_crawler_captcha():
items = await crawler.fetch_latest()
assert items == []
@pytest.mark.asyncio
async def test_scholar_crawler_url_year_filter():
    """The search URL must carry an `as_ylo` bound of (current year - 5)."""
    expected_year = datetime.now().year - 5
    crawler = ScholarCrawler(query="Edge AI")
    # The URL should include the lower year bound filter
    assert f"&as_ylo={expected_year}" in crawler.url

View File

@ -243,6 +243,7 @@ async def test_search_with_category_and_threshold(chroma_store, mock_collection)
mock_collection.get.assert_called_with(
where_document={"$contains": "AI"},
where={"category": "Tech"},
limit=5,
include=["metadatas", "documents"]
)
mock_collection.query.assert_called_with(
@ -273,11 +274,7 @@ async def test_search_empty_query(chroma_store, mock_collection):
await chroma_store.search("")
# Assert
mock_collection.get.assert_called_with(
where_document=None,
where=None,
include=["metadatas", "documents"]
)
mock_collection.get.assert_not_called()
mock_collection.query.assert_called_with(
query_texts=["*"],
n_results=5,

View File

@ -52,7 +52,8 @@ async def test_cppconf_e2e_pipeline(cppconf_html):
assert enriched_talk.category == "C++ Trends"
# 3. Vector DB Store
client = chromadb.Client()
from chromadb.config import Settings
client = chromadb.EphemeralClient(Settings(allow_reset=True))
store = ChromaStore(client=client, collection_name="test_cppconf_collection")
await store.store(enriched_talk)