Compare commits
No commits in common. "ef3faec7f89fc061745c3f22835c8138420ed1da" and "ca7407973d17f8ed4dbee78ff0f4bf9e01df970b" have entirely different histories.
ef3faec7f8
...
ca7407973d
@ -1,73 +0,0 @@
|
|||||||
import abc
|
|
||||||
import csv
|
|
||||||
import io
|
|
||||||
from typing import List
|
|
||||||
|
|
||||||
from src.processor.dto import EnrichedNewsItemDTO
|
|
||||||
|
|
||||||
class ITrendExporter(abc.ABC):
    """Interface for objects that serialize enriched trends into a file payload."""

    @abc.abstractmethod
    async def export(self, trends: List[EnrichedNewsItemDTO]) -> bytes:
        """Serialize ``trends`` and return the resulting document as bytes."""
        ...
|
|
||||||
|
|
||||||
class CsvTrendExporter(ITrendExporter):
    """Exporter that renders trends as a UTF-8 encoded CSV document."""

    async def export(self, trends: List[EnrichedNewsItemDTO]) -> bytes:
        """Write a header row followed by one row per trend and return the bytes."""
        buffer = io.StringIO()
        sheet = csv.writer(buffer)

        column_titles = [
            "Relevance Score",
            "Name",
            "Link",
            "Category",
            "AI Description",
            "Anomalies Detected",
        ]
        sheet.writerow(column_titles)

        for item in trends:
            # An empty (or missing) anomaly list is written as an empty cell.
            if item.anomalies_detected:
                anomaly_cell = ", ".join(item.anomalies_detected)
            else:
                anomaly_cell = ""
            sheet.writerow(
                [
                    item.relevance_score,
                    item.title,
                    item.url,
                    item.category,
                    item.summary_ru,
                    anomaly_cell,
                ]
            )

        csv_text = buffer.getvalue()
        return csv_text.encode('utf-8')
|
|
||||||
|
|
||||||
class MarkdownTrendExporter(ITrendExporter):
    """Exporter that renders trends as a UTF-8 encoded Markdown pipe table."""

    async def export(self, trends: List[EnrichedNewsItemDTO]) -> bytes:
        """
        Build a Markdown table with a header, a separator and one row per trend.

        Args:
            trends: Items to export; each contributes its relevance score,
                title, url, category, summary_ru and joined anomalies.

        Returns:
            The table text encoded as UTF-8 bytes.
        """
        output = io.StringIO()

        headers = [
            "Relevance Score",
            "Name",
            "Link",
            "Category",
            "AI Description",
            "Anomalies Detected",
        ]

        def format_row(row_data: List[str]) -> str:
            """Render one table row, keeping every cell on a single line."""
            escaped_data = [
                str(cell)
                # A literal pipe would otherwise be read as a column separator.
                .replace('|', '\\|')
                # Flatten every line-break flavour: CRLF first so it becomes a
                # single space, then lone CR and LF. Previously only LF was
                # handled, so a stray CR could still split the row.
                .replace('\r\n', ' ')
                .replace('\r', ' ')
                .replace('\n', ' ')
                for cell in row_data
            ]
            return "| " + " | ".join(escaped_data) + " |\n"

        output.write(format_row(headers))
        output.write(format_row(["---"] * len(headers)))

        for trend in trends:
            # An empty (or missing) anomaly list becomes an empty cell.
            anomalies = ", ".join(trend.anomalies_detected) if trend.anomalies_detected else ""
            output.write(
                format_row(
                    [
                        str(trend.relevance_score),
                        trend.title,
                        trend.url,
                        trend.category,
                        trend.summary_ru,
                        anomalies,
                    ]
                )
            )

        return output.getvalue().encode('utf-8')
|
|
||||||
@ -5,14 +5,13 @@ from typing import Optional, Callable, Dict, Any, Awaitable
|
|||||||
|
|
||||||
from aiogram import Router, BaseMiddleware, F
|
from aiogram import Router, BaseMiddleware, F
|
||||||
from aiogram.filters import CommandStart, Command, CommandObject
|
from aiogram.filters import CommandStart, Command, CommandObject
|
||||||
from aiogram.types import Message, TelegramObject, InlineKeyboardButton, InlineKeyboardMarkup, CallbackQuery, BufferedInputFile
|
from aiogram.types import Message, TelegramObject, InlineKeyboardButton, InlineKeyboardMarkup, CallbackQuery
|
||||||
from aiogram.utils.keyboard import InlineKeyboardBuilder
|
from aiogram.utils.keyboard import InlineKeyboardBuilder
|
||||||
from aiogram.utils.formatting import as_list, as_marked_section, Bold, TextLink
|
from aiogram.utils.formatting import as_list, as_marked_section, Bold, TextLink
|
||||||
|
|
||||||
from src.processor.dto import EnrichedNewsItemDTO
|
from src.processor.dto import EnrichedNewsItemDTO
|
||||||
from src.processor.base import ILLMProvider
|
from src.processor.base import ILLMProvider
|
||||||
from src.storage.base import IVectorStore
|
from src.storage.base import IVectorStore
|
||||||
from src.bot.exporters import CsvTrendExporter, MarkdownTrendExporter
|
|
||||||
|
|
||||||
class AccessMiddleware(BaseMiddleware):
|
class AccessMiddleware(BaseMiddleware):
|
||||||
def __init__(self, allowed_chat_id: str):
|
def __init__(self, allowed_chat_id: str):
|
||||||
@ -141,46 +140,6 @@ def get_router(storage: IVectorStore, processor: ILLMProvider, allowed_chat_id:
|
|||||||
|
|
||||||
await message.answer(f"Top {len(items)} Hottest Trends:", reply_markup=builder.as_markup())
|
await message.answer(f"Top {len(items)} Hottest Trends:", reply_markup=builder.as_markup())
|
||||||
|
|
||||||
@router.message(Command("get_hottest"))
async def command_get_hottest_handler(message: Message, command: CommandObject) -> None:
    """
    Handle the `/get_hottest [limit] [format]` command.

    Replies with a document containing the top ranked trends. ``limit``
    defaults to 10 and is capped at 50; ``format`` defaults to CSV and
    switches to Markdown when it equals ``md`` (case-insensitive). A limit
    that is not a positive integer is rejected with a usage hint.
    """
    limit = 10
    file_format = "csv"

    # str.split() with no separator also discards surrounding whitespace.
    parts = (command.args or "").split()
    if parts:
        try:
            limit = int(parts[0])
        except ValueError:
            limit = None

        # Zero and negative limits are as meaningless as non-numeric ones;
        # reject them instead of forwarding them to the storage layer.
        if limit is None or limit < 1:
            await message.answer("Please provide a valid number, e.g., /get_hottest 10")
            return

        if len(parts) > 1:
            file_format = parts[1].lower()

    limit = min(limit, 50)

    items = await storage.get_top_ranked(limit=limit)

    if not items:
        await message.answer("No hot trends found yet.")
        return

    # Any format other than "md" falls back to CSV.
    if file_format == "md":
        exporter = MarkdownTrendExporter()
        filename = "hottest_trends.md"
    else:
        exporter = CsvTrendExporter()
        filename = "hottest_trends.csv"

    file_bytes = await exporter.export(items)
    document = BufferedInputFile(file_bytes, filename=filename)

    await message.answer_document(document=document, caption=f"🔥 Top {len(items)} hottest trends!")
|
|
||||||
|
|
||||||
@router.message(Command("search"))
|
@router.message(Command("search"))
|
||||||
async def command_search_handler(message: Message, command: CommandObject) -> None:
|
async def command_search_handler(message: Message, command: CommandObject) -> None:
|
||||||
"""
|
"""
|
||||||
|
|||||||
@ -117,6 +117,3 @@ crawlers:
|
|||||||
url: "https://scholar.google.com/"
|
url: "https://scholar.google.com/"
|
||||||
source: "Google Scholar BMI"
|
source: "Google Scholar BMI"
|
||||||
query: "Brain-machine interface (IoT|Webengine|Linux)"
|
query: "Brain-machine interface (IoT|Webengine|Linux)"
|
||||||
- type: github_trending
|
|
||||||
url: "https://github.com/trending"
|
|
||||||
source: "GitHub Trending"
|
|
||||||
@ -10,7 +10,6 @@ from src.crawlers.skolkovo_crawler import SkolkovoCrawler
|
|||||||
from src.crawlers.scirate_crawler import SciRateCrawler
|
from src.crawlers.scirate_crawler import SciRateCrawler
|
||||||
from src.crawlers.scholar_crawler import ScholarCrawler
|
from src.crawlers.scholar_crawler import ScholarCrawler
|
||||||
from src.crawlers.microsoft_research_crawler import MicrosoftResearchCrawler
|
from src.crawlers.microsoft_research_crawler import MicrosoftResearchCrawler
|
||||||
from src.crawlers.github_crawler import GitHubTrendingCrawler
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@ -60,8 +59,6 @@ class CrawlerFactory:
|
|||||||
crawlers.append(ScholarCrawler(query=query, source=source))
|
crawlers.append(ScholarCrawler(query=query, source=source))
|
||||||
elif crawler_type == 'microsoft_research':
|
elif crawler_type == 'microsoft_research':
|
||||||
crawlers.append(MicrosoftResearchCrawler(url=url, source=source))
|
crawlers.append(MicrosoftResearchCrawler(url=url, source=source))
|
||||||
elif crawler_type == 'github_trending':
|
|
||||||
crawlers.append(GitHubTrendingCrawler(url=url, source=source))
|
|
||||||
else:
|
else:
|
||||||
logger.warning(f"Unknown crawler type: {crawler_type}")
|
logger.warning(f"Unknown crawler type: {crawler_type}")
|
||||||
|
|
||||||
|
|||||||
@ -1,100 +0,0 @@
|
|||||||
import logging
|
|
||||||
import asyncio
|
|
||||||
import re
|
|
||||||
from datetime import datetime, timezone
|
|
||||||
from typing import List, Dict
|
|
||||||
|
|
||||||
import requests
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
|
|
||||||
from src.crawlers.base import ICrawler
|
|
||||||
from src.crawlers.dto import NewsItemDTO
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
class GitHubTrendingCrawler(ICrawler):
    """
    Crawler for GitHub Trending repositories.

    Fetches the monthly, weekly and daily trending pages and merges them, so a
    repository seen in several timeframes yields one item listing all of them.
    """

    # Values sent as the ``since`` query parameter, in fetch order.
    TIMEFRAMES = ("monthly", "weekly", "daily")

    # Seconds before an HTTP request is abandoned. requests applies no timeout
    # by default, so without it a stalled connection would hang the crawler.
    REQUEST_TIMEOUT = 30

    def __init__(self, url: str = None, source: str = "GitHub Trending"):
        """Store the trending page url (defaulting to github.com/trending) and source label."""
        self.base_url = "https://github.com"
        self.url = url or "https://github.com/trending"
        self.source = source

    async def fetch_latest(self) -> List[NewsItemDTO]:
        """
        Fetch all trending timeframes and return one item per repository.

        Returns:
            The deduplicated items, or an empty list if any request or
            parsing step raises (the error is logged).
        """
        repos: Dict[str, dict] = {}

        try:
            for timeframe in self.TIMEFRAMES:
                # NOTE(review): the request URL is built from base_url, not
                # self.url, so a custom ``url`` is stored but not used here.
                url = f"{self.base_url}/trending?since={timeframe}"

                # requests is synchronous; run it in a worker thread so the
                # event loop is not blocked while waiting for GitHub.
                response = await asyncio.to_thread(
                    requests.get, url, timeout=self.REQUEST_TIMEOUT
                )
                response.raise_for_status()

                self._merge_page(response.text, timeframe, repos)

            return self._build_items(repos)

        except Exception as e:
            logger.error(f"Error fetching GitHub trending: {e}")
            return []

    def _merge_page(self, html: str, timeframe: str, repos: Dict[str, dict]) -> None:
        """Parse one trending page and fold its repositories into ``repos``."""
        soup = BeautifulSoup(html, "html.parser")

        for article in soup.find_all("article", class_="Box-row"):
            h2 = article.find("h2", class_="h3")
            a_tag = h2.find("a") if h2 else None
            repo_path = a_tag.get("href", "") if a_tag else ""
            if not repo_path:
                continue

            # The href is appended verbatim so the item URL contains it exactly.
            repo_url = f"{self.base_url}{repo_path}"

            known = repos.get(repo_url)
            if known is not None:
                # Already seen in an earlier timeframe: only record the new one.
                if timeframe not in known["timeframes"]:
                    known["timeframes"].append(timeframe)
                continue

            p_tag = article.find("p", class_="col-9")
            lang_span = article.find("span", attrs={"itemprop": "programmingLanguage"})

            repos[repo_url] = {
                # Collapse the line breaks and padding inside the heading.
                "title": re.sub(r'\s+', ' ', h2.get_text()).strip(),
                "description": p_tag.get_text(strip=True) if p_tag else "",
                "language": lang_span.get_text(strip=True) if lang_span else "Unknown",
                "timeframes": [timeframe],
            }

    def _build_items(self, repos: Dict[str, dict]) -> List[NewsItemDTO]:
        """Convert the merged repository records into NewsItemDTO objects."""
        results = []
        for repo_url, data in repos.items():
            timeframes_str = ", ".join(data["timeframes"])
            content_text = (
                f"{data['description']}\nLanguage: {data['language']}\nTrending: {timeframes_str}"
            )
            results.append(
                NewsItemDTO(
                    title=data["title"],
                    url=repo_url,
                    content_text=content_text.strip(),
                    source=self.source,
                    timestamp=datetime.now(timezone.utc),
                )
            )
        return results
|
|
||||||
@ -13,9 +13,8 @@ logger = logging.getLogger(__name__)
|
|||||||
class ScholarCrawler(ICrawler):
|
class ScholarCrawler(ICrawler):
|
||||||
def __init__(self, query: str = "Artificial Intelligence", source: str = "Google Scholar"):
|
def __init__(self, query: str = "Artificial Intelligence", source: str = "Google Scholar"):
|
||||||
self.query = query
|
self.query = query
|
||||||
current_year = datetime.now().year
|
|
||||||
# Google Scholar query URL
|
# Google Scholar query URL
|
||||||
self.url = f"https://scholar.google.com/scholar?hl=en&q={query.replace(' ', '+')}&as_ylo={current_year - 5}"
|
self.url = f"https://scholar.google.com/scholar?hl=en&q={query.replace(' ', '+')}"
|
||||||
self.source = source
|
self.source = source
|
||||||
|
|
||||||
async def fetch_latest(self) -> List[NewsItemDTO]:
|
async def fetch_latest(self) -> List[NewsItemDTO]:
|
||||||
|
|||||||
@ -1,81 +0,0 @@
|
|||||||
import pytest
|
|
||||||
from datetime import datetime
|
|
||||||
from src.processor.dto import EnrichedNewsItemDTO
|
|
||||||
from src.bot.exporters import CsvTrendExporter, MarkdownTrendExporter
|
|
||||||
|
|
||||||
@pytest.fixture
def dummy_trends() -> list[EnrichedNewsItemDTO]:
    """Return two enriched items: one with anomalies and one without."""
    quantum_item = EnrichedNewsItemDTO(
        title="Breakthrough in Quantum Computing",
        url="https://example.com/quantum",
        content_text="Scientists achieve a major milestone...",
        source="TechNews",
        timestamp=datetime(2023, 10, 27, 12, 0),
        relevance_score=9,
        summary_ru="Прорыв в квантовых вычислениях...",
        anomalies_detected=["Quantum Supremacy", "New Qubit Design"],
        category="Quantum Computing",
    )
    ai_item = EnrichedNewsItemDTO(
        title="New AI Model Released",
        url="https://example.com/ai",
        content_text="A new AI model has been released...",
        source="AITimes",
        timestamp=datetime(2023, 10, 27, 13, 0),
        relevance_score=8,
        summary_ru="Выпущен новый ИИ...",
        anomalies_detected=[],
        category="Artificial Intelligence",
    )
    return [quantum_item, ai_item]
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
async def test_csv_trend_exporter(dummy_trends):
    """The CSV export has a header plus one correctly populated row per trend."""
    payload = await CsvTrendExporter().export(dummy_trends)

    assert isinstance(payload, bytes)
    rows = payload.decode('utf-8').strip().split('\r\n')

    assert len(rows) == 3  # header + 2 rows
    header, quantum_row, ai_row = rows
    assert header == "Relevance Score,Name,Link,Category,AI Description,Anomalies Detected"

    # First data row.
    for fragment in (
        "9",
        "Breakthrough in Quantum Computing",
        "https://example.com/quantum",
        "Quantum Computing",
        "Прорыв в квантовых вычислениях...",
        # The joined anomalies contain a comma, so csv wraps the field in quotes.
        '"Quantum Supremacy, New Qubit Design"',
    ):
        assert fragment in quantum_row

    # Second data row.
    for fragment in (
        "8",
        "New AI Model Released",
        "https://example.com/ai",
        "Artificial Intelligence",
        "Выпущен новый ИИ...",
    ):
        assert fragment in ai_row
    assert "AITimes" not in ai_row  # source is not exported
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
async def test_markdown_trend_exporter(dummy_trends):
    """The Markdown export is a pipe table: header, separator, one row per trend."""
    exporter = MarkdownTrendExporter()
    md_bytes = await exporter.export(dummy_trends)

    assert isinstance(md_bytes, bytes)
    md_str = md_bytes.decode('utf-8')
    lines = md_str.strip().split('\n')

    assert len(lines) == 4  # header + separator + 2 rows

    # Check Header
    assert lines[0] == "| Relevance Score | Name | Link | Category | AI Description | Anomalies Detected |"
    assert lines[1] == "| --- | --- | --- | --- | --- | --- |"

    # Check Row 1
    assert lines[2] == (
        "| 9 | Breakthrough in Quantum Computing | https://example.com/quantum | "
        "Quantum Computing | Прорыв в квантовых вычислениях... | "
        "Quantum Supremacy, New Qubit Design |"
    )

    # Check Row 2. The anomalies cell is empty, and cells are joined with
    # " | " and closed with " |", so the row ends in "|  |" (two spaces).
    assert lines[3] == (
        "| 8 | New AI Model Released | https://example.com/ai | "
        "Artificial Intelligence | Выпущен новый ИИ... |  |"
    )
|
|
||||||
@ -1,170 +0,0 @@
|
|||||||
import pytest
|
|
||||||
from unittest.mock import AsyncMock, MagicMock, patch
|
|
||||||
from aiogram.types import Message, BufferedInputFile
|
|
||||||
from aiogram.filters import CommandObject
|
|
||||||
from datetime import datetime
|
|
||||||
|
|
||||||
from src.bot.handlers import get_router
|
|
||||||
from src.processor.dto import EnrichedNewsItemDTO
|
|
||||||
|
|
||||||
@pytest.fixture
def mock_storage():
    """Provide an ``AsyncMock`` standing in for the storage passed to ``get_router``."""
    storage_double = AsyncMock()
    return storage_double
|
|
||||||
|
|
||||||
@pytest.fixture
def mock_processor():
    """Provide a ``MagicMock`` processor whose ``get_info()`` reports a test model."""
    return MagicMock(**{"get_info.return_value": {"model": "test-model"}})
|
|
||||||
|
|
||||||
@pytest.fixture
def allowed_chat_id():
    """Return the chat id (as a string) that the router is configured to allow."""
    chat_id = "123456789"
    return chat_id
|
|
||||||
|
|
||||||
@pytest.fixture
def router(mock_storage, mock_processor, allowed_chat_id):
    """Build the router under test from the mocked storage and processor."""
    return get_router(
        storage=mock_storage,
        processor=mock_processor,
        allowed_chat_id=allowed_chat_id,
    )
|
|
||||||
|
|
||||||
def get_handler(router, callback_name):
    """Return the first message-handler callback on ``router`` named ``callback_name``.

    Raises:
        ValueError: If no registered message handler has that callback name.
    """
    matches = (
        registered.callback
        for registered in router.message.handlers
        if registered.callback.__name__ == callback_name
    )
    found = next(matches, None)
    if found is None:
        raise ValueError(f"Handler {callback_name} not found")
    return found
|
|
||||||
|
|
||||||
@pytest.fixture
def mock_items():
    """Return three enriched items with relevance scores 10, 9 and 8."""
    items = []
    for index in range(3):
        items.append(
            EnrichedNewsItemDTO(
                title=f"Hot News {index}",
                url=f"https://example.com/{index}",
                content_text=f"Content {index}",
                source="Source",
                timestamp=datetime.now(),
                relevance_score=10 - index,
                summary_ru=f"Сводка {index}",
                anomalies_detected=[],
                category="Tech",
            )
        )
    return items
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
async def test_command_get_hottest_handler_no_args(router, mock_storage, allowed_chat_id, mock_items):
    """
    /get_hottest without arguments requests 10 items and replies with a CSV document.
    """
    # Arrange: a message from the allowed chat and storage returning three trends.
    hottest = get_handler(router, "command_get_hottest_handler")
    incoming = AsyncMock()
    incoming.chat = MagicMock()
    incoming.chat.id = int(allowed_chat_id)
    mock_storage.get_top_ranked.return_value = mock_items
    bare_command = CommandObject(prefix='/', command='get_hottest', args=None)

    # Act: run the handler with the CSV exporter class replaced by a stub.
    with patch("src.bot.handlers.CsvTrendExporter") as exporter_cls:
        exporter_stub = AsyncMock()
        exporter_stub.export.return_value = b"csv data"
        exporter_cls.return_value = exporter_stub

        await hottest(message=incoming, command=bare_command)

    # Assert: default limit used and a CSV document was sent.
    mock_storage.get_top_ranked.assert_called_once_with(limit=10)
    incoming.answer_document.assert_called_once()

    _, sent = incoming.answer_document.call_args
    assert "document" in sent
    attachment = sent["document"]
    assert isinstance(attachment, BufferedInputFile)
    assert attachment.filename == "hottest_trends.csv"
    assert sent["caption"] == "🔥 Top 3 hottest trends!"
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
async def test_command_get_hottest_handler_invalid_limit(router, mock_storage, allowed_chat_id):
    """
    /get_hottest with a non-numeric limit answers with a usage hint and skips storage.
    """
    # Arrange
    hottest = get_handler(router, "command_get_hottest_handler")
    incoming = AsyncMock()
    incoming.chat = MagicMock()
    incoming.chat.id = int(allowed_chat_id)
    bad_command = CommandObject(prefix='/', command='get_hottest', args='abc')

    # Act
    await hottest(message=incoming, command=bad_command)

    # Assert
    incoming.answer.assert_called_once_with("Please provide a valid number, e.g., /get_hottest 10")
    mock_storage.get_top_ranked.assert_not_called()
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
async def test_command_get_hottest_handler_capped_limit(router, mock_storage, allowed_chat_id, mock_items):
    """
    /get_hottest with a limit above 50 queries storage with the cap of 50.
    """
    # Arrange
    hottest = get_handler(router, "command_get_hottest_handler")
    incoming = AsyncMock()
    incoming.chat = MagicMock()
    incoming.chat.id = int(allowed_chat_id)
    mock_storage.get_top_ranked.return_value = mock_items
    oversized_command = CommandObject(prefix='/', command='get_hottest', args='100')

    # Act
    await hottest(message=incoming, command=oversized_command)

    # Assert
    mock_storage.get_top_ranked.assert_called_once_with(limit=50)
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
async def test_command_get_hottest_handler_custom_limit_md(router, mock_storage, allowed_chat_id, mock_items):
    """
    /get_hottest with "5 md" requests five items and replies with a Markdown document.
    """
    # Arrange
    hottest = get_handler(router, "command_get_hottest_handler")
    incoming = AsyncMock()
    incoming.chat = MagicMock()
    incoming.chat.id = int(allowed_chat_id)
    mock_storage.get_top_ranked.return_value = mock_items
    md_command = CommandObject(prefix='/', command='get_hottest', args='5 md')

    # Act: run the handler with the Markdown exporter class replaced by a stub.
    with patch("src.bot.handlers.MarkdownTrendExporter") as exporter_cls:
        exporter_stub = AsyncMock()
        exporter_stub.export.return_value = b"md data"
        exporter_cls.return_value = exporter_stub

        await hottest(message=incoming, command=md_command)

    # Assert
    mock_storage.get_top_ranked.assert_called_once_with(limit=5)
    incoming.answer_document.assert_called_once()

    _, sent = incoming.answer_document.call_args
    assert sent["document"].filename == "hottest_trends.md"
    assert sent["caption"] == "🔥 Top 3 hottest trends!"
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
async def test_command_get_hottest_handler_no_records(router, mock_storage, allowed_chat_id):
    """
    /get_hottest answers with a notice and sends no document when storage is empty.
    """
    # Arrange
    hottest = get_handler(router, "command_get_hottest_handler")
    incoming = AsyncMock()
    incoming.chat = MagicMock()
    incoming.chat.id = int(allowed_chat_id)
    mock_storage.get_top_ranked.return_value = []
    bare_command = CommandObject(prefix='/', command='get_hottest', args=None)

    # Act
    await hottest(message=incoming, command=bare_command)

    # Assert
    incoming.answer.assert_called_once_with("No hot trends found yet.")
    incoming.answer_document.assert_not_called()
|
|
||||||
@ -1,192 +0,0 @@
|
|||||||
import pytest
|
|
||||||
from unittest.mock import patch, MagicMock
|
|
||||||
from datetime import datetime, timezone
|
|
||||||
from src.crawlers.github_crawler import GitHubTrendingCrawler
|
|
||||||
from src.crawlers.dto import NewsItemDTO
|
|
||||||
|
|
||||||
@pytest.fixture
def monthly_html():
    """Trending-page markup listing user/repo1 (Python) and user/repo2 (JavaScript)."""
    return """
    <html>
    <body>
    <article class="Box-row">
        <h2 class="h3 lh-condensed">
            <a href="/user/repo1">
                <span class="text-normal">user / </span> repo1
            </a>
        </h2>
        <p class="col-9 color-fg-muted my-1 pr-4">Monthly description 1</p>
        <div class="f6 color-fg-muted mt-2">
            <span class="d-inline-block ml-0 mr-3">
                <span itemprop="programmingLanguage">Python</span>
            </span>
        </div>
    </article>
    <article class="Box-row">
        <h2 class="h3 lh-condensed">
            <a href="/user/repo2">
                <span class="text-normal">user / </span> repo2
            </a>
        </h2>
        <p class="col-9 color-fg-muted my-1 pr-4">Monthly description 2</p>
        <div class="f6 color-fg-muted mt-2">
            <span class="d-inline-block ml-0 mr-3">
                <span itemprop="programmingLanguage">JavaScript</span>
            </span>
        </div>
    </article>
    </body>
    </html>
    """
|
|
||||||
|
|
||||||
@pytest.fixture
def weekly_html():
    """Trending-page markup listing only user/repo3 (Go)."""
    return """
    <html>
    <body>
    <article class="Box-row">
        <h2 class="h3 lh-condensed">
            <a href="/user/repo3">
                <span class="text-normal">user / </span> repo3
            </a>
        </h2>
        <p class="col-9 color-fg-muted my-1 pr-4">Weekly description 3</p>
        <div class="f6 color-fg-muted mt-2">
            <span class="d-inline-block ml-0 mr-3">
                <span itemprop="programmingLanguage">Go</span>
            </span>
        </div>
    </article>
    </body>
    </html>
    """
|
|
||||||
|
|
||||||
@pytest.fixture
def daily_html():
    """Trending-page markup listing user/repo1 (Python) and user/repo4 (Rust)."""
    return """
    <html>
    <body>
    <article class="Box-row">
        <h2 class="h3 lh-condensed">
            <a href="/user/repo1">
                <span class="text-normal">user / </span> repo1
            </a>
        </h2>
        <p class="col-9 color-fg-muted my-1 pr-4">Daily description 1</p>
        <div class="f6 color-fg-muted mt-2">
            <span class="d-inline-block ml-0 mr-3">
                <span itemprop="programmingLanguage">Python</span>
            </span>
        </div>
    </article>
    <article class="Box-row">
        <h2 class="h3 lh-condensed">
            <a href="/user/repo4">
                <span class="text-normal">user / </span> repo4
            </a>
        </h2>
        <p class="col-9 color-fg-muted my-1 pr-4">Daily description 4</p>
        <div class="f6 color-fg-muted mt-2">
            <span class="d-inline-block ml-0 mr-3">
                <span itemprop="programmingLanguage">Rust</span>
            </span>
        </div>
    </article>
    </body>
    </html>
    """
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
async def test_github_trending_crawler_fetches_all_timeframes(monthly_html, weekly_html, daily_html):
    """The crawler requests the monthly, weekly and daily trending pages."""
    crawler = GitHubTrendingCrawler()
    pages = {
        "since=monthly": monthly_html,
        "since=weekly": weekly_html,
        "since=daily": daily_html,
    }

    def fake_get(url, **kwargs):
        # Serve the page whose query marker appears in the URL, else an empty body.
        reply = MagicMock()
        reply.status_code = 200
        reply.text = next((html for marker, html in pages.items() if marker in url), "")
        return reply

    with patch("requests.get") as mock_get:
        mock_get.side_effect = fake_get

        await crawler.fetch_latest()

        # Every timeframe URL must have been requested.
        requested = [recorded.args[0] for recorded in mock_get.call_args_list]
        for timeframe in ("monthly", "weekly", "daily"):
            assert f"https://github.com/trending?since={timeframe}" in requested
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
async def test_github_trending_crawler_parses_html_correctly(daily_html):
    """Title, description, language and source are extracted from an article."""
    crawler = GitHubTrendingCrawler()

    with patch("requests.get") as mock_get:
        # Every request gets the same daily page; only the parsing matters here.
        reply = MagicMock()
        reply.status_code = 200
        reply.text = daily_html
        mock_get.return_value = reply

        results = await crawler.fetch_latest()

        # Locate repo4 and verify each parsed field.
        matching = [item for item in results if "user/repo4" in item.url]
        repo4 = matching[0] if matching else None
        assert repo4 is not None
        assert repo4.title == "user / repo4"
        assert "Daily description 4" in repo4.content_text
        assert "Rust" in repo4.content_text
        assert repo4.source == "GitHub Trending"
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
async def test_github_trending_crawler_deduplication(monthly_html, weekly_html, daily_html):
    """A repository trending in several timeframes yields one item naming them all."""
    crawler = GitHubTrendingCrawler()
    pages = {
        "since=monthly": monthly_html,
        "since=weekly": weekly_html,
        "since=daily": daily_html,
    }

    def fake_get(url, **kwargs):
        reply = MagicMock()
        reply.status_code = 200
        # Set the body only for a recognised timeframe; otherwise leave the mock as is.
        for marker, html in pages.items():
            if marker in url:
                reply.text = html
                break
        return reply

    with patch("requests.get") as mock_get:
        mock_get.side_effect = fake_get

        results = await crawler.fetch_latest()

        # repo1 is present on both the monthly and the daily page.
        repo1_items = [item for item in results if "user/repo1" in item.url]

        # It must be reported exactly once.
        assert len(repo1_items) == 1

        # Both timeframes must be mentioned in its content text or source.
        repo1 = repo1_items[0]
        searchable = (repo1.content_text.lower(), repo1.source.lower())
        assert any("monthly" in text for text in searchable)
        assert any("daily" in text for text in searchable)
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
async def test_github_trending_crawler_handles_errors():
    """A failing HTTP call makes the crawler return an empty list."""
    crawler = GitHubTrendingCrawler()

    with patch("requests.get", side_effect=Exception("Network error")):
        fetched = await crawler.fetch_latest()

    assert fetched == []
|
|
||||||
@ -113,14 +113,3 @@ async def test_scholar_crawler_captcha():
|
|||||||
|
|
||||||
items = await crawler.fetch_latest()
|
items = await crawler.fetch_latest()
|
||||||
assert items == []
|
assert items == []
|
||||||
|
|
||||||
@pytest.mark.asyncio
async def test_scholar_crawler_url_year_filter():
    """The Scholar URL carries an ``as_ylo`` lower bound five years before the current year."""
    oldest_allowed_year = datetime.now().year - 5
    crawler = ScholarCrawler(query="Edge AI")

    # The lower year bound must be part of the query string.
    assert f"&as_ylo={oldest_allowed_year}" in crawler.url
|
|
||||||
|
|||||||
@ -243,7 +243,6 @@ async def test_search_with_category_and_threshold(chroma_store, mock_collection)
|
|||||||
mock_collection.get.assert_called_with(
|
mock_collection.get.assert_called_with(
|
||||||
where_document={"$contains": "AI"},
|
where_document={"$contains": "AI"},
|
||||||
where={"category": "Tech"},
|
where={"category": "Tech"},
|
||||||
limit=5,
|
|
||||||
include=["metadatas", "documents"]
|
include=["metadatas", "documents"]
|
||||||
)
|
)
|
||||||
mock_collection.query.assert_called_with(
|
mock_collection.query.assert_called_with(
|
||||||
@ -274,7 +273,11 @@ async def test_search_empty_query(chroma_store, mock_collection):
|
|||||||
await chroma_store.search("")
|
await chroma_store.search("")
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
mock_collection.get.assert_not_called()
|
mock_collection.get.assert_called_with(
|
||||||
|
where_document=None,
|
||||||
|
where=None,
|
||||||
|
include=["metadatas", "documents"]
|
||||||
|
)
|
||||||
mock_collection.query.assert_called_with(
|
mock_collection.query.assert_called_with(
|
||||||
query_texts=["*"],
|
query_texts=["*"],
|
||||||
n_results=5,
|
n_results=5,
|
||||||
|
|||||||
@ -52,8 +52,7 @@ async def test_cppconf_e2e_pipeline(cppconf_html):
|
|||||||
assert enriched_talk.category == "C++ Trends"
|
assert enriched_talk.category == "C++ Trends"
|
||||||
|
|
||||||
# 3. Vector DB Store
|
# 3. Vector DB Store
|
||||||
from chromadb.config import Settings
|
client = chromadb.Client()
|
||||||
client = chromadb.EphemeralClient(Settings(allow_reset=True))
|
|
||||||
store = ChromaStore(client=client, collection_name="test_cppconf_collection")
|
store = ChromaStore(client=client, collection_name="test_cppconf_collection")
|
||||||
|
|
||||||
await store.store(enriched_talk)
|
await store.store(enriched_talk)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user