Implement 'Top Ranked' feature and expand Habr sources

This commit is contained in:
Artur Mukhamadiev 2026-03-15 01:32:25 +03:00
parent 019d9161de
commit 9fdb4b35cd
8 changed files with 326 additions and 7 deletions

View File

@ -51,6 +51,7 @@ def get_router(storage: IVectorStore, processor: ILLMProvider, allowed_chat_id:
"/start - Start the bot\n"
"/help - Show this help message\n"
"/latest [category] - Show the latest enriched news trends\n"
"/hottest - Show top 10 ranked hot trends\n"
"/search query - Search for news\n"
"/stats - Show database statistics\n"
"/params - Show LLM processor parameters\n"
@ -93,6 +94,27 @@ def get_router(storage: IVectorStore, processor: ILLMProvider, allowed_chat_id:
await message.answer("Latest news:", reply_markup=builder.as_markup())
@router.message(Command("hottest"))
async def command_hottest_handler(message: Message) -> None:
    """
    Handle the `/hottest` command.

    Requests the ten top ranked items from storage and replies with an inline
    keyboard holding one button per item. When storage returns nothing, a
    plain text notice is sent instead.
    """
    top_items = await storage.get_top_ranked(limit=10)
    if top_items:
        keyboard = InlineKeyboardBuilder()
        for entry in top_items:
            # uuid5 is deterministic, so the same URL always maps to the same id
            # carried in the "detail:" callback payload.
            entry_id = str(uuid.uuid5(uuid.NAMESPACE_URL, entry.url))
            label = f"🔥 [{entry.relevance_score}/10] {entry.title}"
            button = InlineKeyboardButton(text=label, callback_data=f"detail:{entry_id}")
            keyboard.row(button)
        await message.answer("Top 10 Hottest Trends:", reply_markup=keyboard.as_markup())
    else:
        await message.answer("No hot trends found yet.")
@router.message(Command("search"))
async def command_search_handler(message: Message, command: CommandObject) -> None:
"""

View File

@ -8,9 +8,6 @@ crawlers:
- type: rss
url: "https://news.samsung.com/global/rss"
source: "Samsung Newsroom"
- type: rss
url: "https://www.sony.com/en/SonyInfo/News/Service/rss.xml"
source: "Sony Newsroom"
- type: playwright
url: "https://cvpr.thecvf.com/Conferences/2025"
source: "CVPR 2025"
@ -61,10 +58,10 @@ crawlers:
url: "https://форумтехнопром.рф/"
source: "Technoprom-2025"
selector: ".news-item"
- type: playwright
url: "https://www.innoprom.com/en/media/news/"
source: "INNOPROM-2025"
selector: ".news-list__item"
# - type: playwright
# url: "https://www.innoprom.com/en/media/news/"
# source: "INNOPROM-2025"
# selector: ".news-list__item"
- type: playwright
url: "https://www.hannovermesse.de/en/news/news-articles/"
source: "Hannover Messe"
@ -91,3 +88,12 @@ crawlers:
url: "https://t.me/s/addmeto"
source: "Telegram: Addmeto"
selector: ".tgme_widget_message_text"
- type: rss
url: "https://habr.com/ru/rss/hubs/hi/articles/?fl=ru"
source: "Habr HighLoad"
- type: rss
url: "https://habr.com/ru/rss/hubs/complete_code/articles/?fl=ru"
source: "Habr Code Quality"
- type: rss
url: "https://habr.com/ru/rss/articles/rated100/?fl=ru"
source: "Habr High Ranked"

View File

@ -29,3 +29,8 @@ class IVectorStore(ABC):
async def get_stats(self) -> dict[str, int]:
"""Get storage statistics including total count and breakdown by category."""
pass
@abstractmethod
async def get_top_ranked(self, limit: int = 10) -> List[EnrichedNewsItemDTO]:
    """Retrieve top ranked items by relevance score.

    Args:
        limit: Maximum number of items to return (defaults to 10).

    Returns:
        At most ``limit`` enriched news items chosen by relevance score.
        Concrete stores provide the actual lookup; this declaration only
        fixes the signature.
    """
    ...

View File

@ -113,3 +113,20 @@ class ChromaStore(IVectorStore):
stats[key] = stats.get(key, 0) + 1
return stats
async def get_top_ranked(self, limit: int = 10) -> List[EnrichedNewsItemDTO]:
    """Retrieve top ranked items by relevance score.

    Args:
        limit: Maximum number of items to return. Zero or a negative value
            yields an empty list.

    Returns:
        Up to ``limit`` items ordered by ``relevance_score``, highest first.
        Items with equal scores keep the order in which the collection
        returned them (the sort is stable).
    """
    # A negative limit would otherwise turn into a negative slice bound and
    # return "everything except the last N" items; treat any non-positive
    # limit as a request for nothing and skip the collection read entirely.
    if limit <= 0:
        return []

    # Every record's metadata and document is fetched; ranking happens in
    # memory below.
    results = self.collection.get(include=["metadatas", "documents"])
    metadatas = results.get("metadatas") or []
    documents = results.get("documents") or []

    items = [
        self._reconstruct_dto(meta, doc)
        for meta, doc in zip(metadatas, documents)
        if meta  # skip records whose metadata is missing or empty
    ]

    # Sort by relevance_score descending, then keep only the requested head.
    items.sort(key=lambda item: item.relevance_score, reverse=True)
    return items[:limit]

View File

@ -170,6 +170,37 @@ async def test_command_stats_handler(router, mock_storage, allowed_chat_id):
args, kwargs = message.answer.call_args
assert "Database Statistics" in args[0]
@pytest.mark.asyncio
async def test_command_hottest_handler(router, mock_storage, allowed_chat_id, mock_item):
    """/hottest answers with the trends header and a keyboard showing the fire icon."""
    hottest = get_handler(router, "command_hottest_handler")
    mock_storage.get_top_ranked.return_value = [mock_item]

    incoming = AsyncMock()
    incoming.chat.id = int(allowed_chat_id)
    incoming.answer = AsyncMock()

    await hottest(message=incoming)

    # Storage must be queried exactly once for ten items
    mock_storage.get_top_ranked.assert_called_once_with(limit=10)

    # A single reply carrying header text plus the inline keyboard
    incoming.answer.assert_called_once()
    positional, keyword = incoming.answer.call_args
    assert "Top 10 Hottest Trends:" in positional[0]
    assert "reply_markup" in keyword
    assert "🔥" in str(keyword["reply_markup"])
@pytest.mark.asyncio
async def test_command_hottest_handler_empty(router, mock_storage, allowed_chat_id):
    """/hottest answers with the 'nothing found' notice when storage has no items."""
    mock_storage.get_top_ranked.return_value = []

    incoming = AsyncMock()
    incoming.chat.id = int(allowed_chat_id)
    incoming.answer = AsyncMock()

    hottest = get_handler(router, "command_hottest_handler")
    await hottest(message=incoming)

    incoming.answer.assert_called_once_with("No hot trends found yet.")
@pytest.mark.asyncio
async def test_access_middleware_allowed(allowed_chat_id):
middleware = AccessMiddleware(allowed_chat_id)

View File

@ -0,0 +1,102 @@
import uuid
import pytest
from unittest.mock import AsyncMock, MagicMock
from aiogram.types import Message, InlineKeyboardMarkup
from datetime import datetime
from src.bot.handlers import get_router
from src.processor.dto import EnrichedNewsItemDTO
@pytest.fixture
def mock_storage():
    """Provide an AsyncMock used as the storage argument of get_router."""
    storage_double = AsyncMock()
    return storage_double
@pytest.fixture
def mock_processor():
    """Provide a MagicMock processor whose get_info() reports a test model."""
    processor_double = MagicMock()
    processor_double.configure_mock(**{"get_info.return_value": {"model": "test-model"}})
    return processor_double
@pytest.fixture
def allowed_chat_id():
    """Chat id, as a string, handed to get_router as the allowed chat."""
    chat_id = "123456789"
    return chat_id
@pytest.fixture
def router(mock_storage, mock_processor, allowed_chat_id):
    """Build the router under test from the mocked storage, processor and chat id."""
    router_under_test = get_router(mock_storage, mock_processor, allowed_chat_id)
    return router_under_test
def get_handler(router, callback_name):
    """Return the message-handler callback registered on *router* under *callback_name*.

    The router's message handlers are scanned in registration order and the
    first callback whose ``__name__`` matches is returned.

    Raises:
        ValueError: no registered message handler has a callback with that name.
    """
    matching_callbacks = (
        registered.callback
        for registered in router.message.handlers
        if registered.callback.__name__ == callback_name
    )
    found = next(matching_callbacks, None)
    if found is None:
        raise ValueError(f"Handler {callback_name} not found")
    return found
@pytest.mark.asyncio
async def test_command_hottest_handler_success(router, mock_storage, allowed_chat_id):
    """
    /hottest queries get_top_ranked and renders one keyboard button per returned item.
    """
    # Arrange: handler, a fake incoming message, and three items scored 10, 9, 8
    hottest = get_handler(router, "command_hottest_handler")

    incoming = AsyncMock()
    incoming.chat = MagicMock()
    incoming.chat.id = int(allowed_chat_id)
    incoming.answer = AsyncMock()

    ranked_items = []
    for idx in range(3):
        ranked_items.append(
            EnrichedNewsItemDTO(
                title=f"Hot News {idx}",
                url=f"https://example.com/{idx}",
                content_text=f"Content {idx}",
                source="Source",
                timestamp=datetime.now(),
                relevance_score=10-idx,
                summary_ru=f"Сводка {idx}",
                anomalies_detected=[],
                category="Tech",
            )
        )
    mock_storage.get_top_ranked.return_value = ranked_items

    # Act
    await hottest(message=incoming)

    # Assert: one storage query, one reply with header and keyboard
    mock_storage.get_top_ranked.assert_called_once_with(limit=10)
    incoming.answer.assert_called_once()

    positional, keyword = incoming.answer.call_args
    assert "Top 10 Hottest Trends:" in positional[0]
    assert "reply_markup" in keyword
    keyboard = keyword["reply_markup"]
    assert isinstance(keyboard, InlineKeyboardMarkup)

    # Every item gets its own keyboard row
    assert len(keyboard.inline_keyboard) == 3

    # The first button carries the icon, the score and the title
    first_label = keyboard.inline_keyboard[0][0].text
    assert "🔥" in first_label
    assert "[10/10]" in first_label
    assert "Hot News 0" in first_label
@pytest.mark.asyncio
async def test_command_hottest_handler_empty(router, mock_storage, allowed_chat_id):
    """
    /hottest sends the 'nothing found' notice when get_top_ranked returns no items.
    """
    # Arrange: storage has nothing to offer
    mock_storage.get_top_ranked.return_value = []

    incoming = AsyncMock()
    incoming.chat = MagicMock()
    incoming.chat.id = int(allowed_chat_id)
    incoming.answer = AsyncMock()

    hottest = get_handler(router, "command_hottest_handler")

    # Act
    await hottest(message=incoming)

    # Assert: storage was still queried, and only the notice was sent
    mock_storage.get_top_ranked.assert_called_once_with(limit=10)
    incoming.answer.assert_called_once_with("No hot trends found yet.")

View File

@ -2,6 +2,7 @@ import pytest
import pytest_asyncio
import uuid
from datetime import datetime, timezone
from unittest.mock import MagicMock
import chromadb
from chromadb.config import Settings
@ -259,3 +260,43 @@ async def test_search_sorting(chroma_store: ChromaStore):
# Should be sorted 5, 4, 3, 2, 1
scores = [r.relevance_score for r in results]
assert scores == [5, 4, 3, 2, 1]
@pytest.mark.asyncio
async def test_get_top_ranked_mock(chroma_store: ChromaStore):
    """get_top_ranked sorts what collection.get returns and truncates it to the limit."""
    # Arrange: swap the real collection for a mock returning three unsorted records
    fake_collection = MagicMock()
    chroma_store.collection = fake_collection

    records = [("Low", "url1", 2), ("High", "url2", 10), ("Mid", "url3", 7)]
    fake_collection.get.return_value = {
        "metadatas": [
            {
                "title": title,
                "url": url,
                "relevance_score": score,
                "timestamp": "2023-11-01T12:00:00",
            }
            for title, url, score in records
        ],
        "documents": ["doc1", "doc2", "doc3"],
    }

    # Act
    top_two = await chroma_store.get_top_ranked(limit=2)

    # Assert: a single fetch, and only the two best records in descending order
    fake_collection.get.assert_called_once_with(include=["metadatas", "documents"])
    assert len(top_two) == 2

    best, runner_up = top_two[0], top_two[1]
    assert best.title == "High"
    assert best.relevance_score == 10
    assert runner_up.title == "Mid"
    assert runner_up.relevance_score == 7
@pytest.mark.asyncio
async def test_get_top_ranked_empty(chroma_store: ChromaStore):
    """get_top_ranked yields no items when the collection returns empty lists."""
    # Arrange: mocked collection with no metadatas and no documents
    fake_collection = MagicMock()
    fake_collection.get.return_value = {"metadatas": [], "documents": []}
    chroma_store.collection = fake_collection

    # Act
    ranked = await chroma_store.get_top_ranked(limit=10)

    # Assert
    assert len(ranked) == 0

View File

@ -0,0 +1,95 @@
import pytest
import pytest_asyncio
from datetime import datetime, timezone
import chromadb
from chromadb.config import Settings
from src.processor.dto import EnrichedNewsItemDTO
from src.storage.chroma_store import ChromaStore
@pytest_asyncio.fixture
async def chroma_store():
    """Yield a ChromaStore on an in-memory client, resetting the client before and after use."""
    # EphemeralClient keeps everything in memory; allow_reset enables reset()
    settings = Settings(allow_reset=True)
    in_memory_client = chromadb.EphemeralClient(settings)
    in_memory_client.reset()

    yield ChromaStore(client=in_memory_client, collection_name="test_top_ranked_collection")

    # Teardown: wipe whatever the test stored
    in_memory_client.reset()
@pytest.mark.asyncio
async def test_get_top_ranked_sorting(chroma_store: ChromaStore):
    """
    get_top_ranked returns stored items ordered by relevance_score, highest first.
    """
    # Arrange: store five items whose scores are deliberately out of order
    for score in [5, 10, 2, 8, 1]:
        news = EnrichedNewsItemDTO(
            title=f"News {score}",
            url=f"https://example.com/{score}",
            content_text=f"Content for news with score {score}",
            source="Source",
            timestamp=datetime.now(timezone.utc),
            relevance_score=score,
            summary_ru=f"Сводка {score}",
            anomalies_detected=[],
            category="Tech",
        )
        await chroma_store.store(news)

    # Act
    ranked = await chroma_store.get_top_ranked(limit=10)

    # Assert: all five come back, in descending score order
    assert len(ranked) == 5
    returned_scores = [entry.relevance_score for entry in ranked]
    assert returned_scores == [10, 8, 5, 2, 1]

    top, bottom = ranked[0], ranked[-1]
    assert top.title == "News 10"
    assert bottom.title == "News 1"
@pytest.mark.asyncio
async def test_get_top_ranked_limit(chroma_store: ChromaStore):
    """
    get_top_ranked never returns more items than the requested limit.
    """
    # Arrange: ten items scored 1 through 10
    stored_items = []
    for i in range(1, 11):
        stored_items.append(
            EnrichedNewsItemDTO(
                title=f"News {i}",
                url=f"https://example.com/{i}",
                content_text=f"Content {i}",
                source="Source",
                timestamp=datetime.now(timezone.utc),
                relevance_score=i,
                summary_ru=f"Сводка {i}",
                anomalies_detected=[],
                category="Tech",
            )
        )
    for news in stored_items:
        await chroma_store.store(news)

    # Act
    top_five = await chroma_store.get_top_ranked(limit=5)
    top_two = await chroma_store.get_top_ranked(limit=2)

    # Assert: sizes follow the limit; the five best are scores 10 down to 6
    assert len(top_five) == 5
    assert len(top_two) == 2
    assert top_five[0].relevance_score == 10
    assert top_five[4].relevance_score == 6
@pytest.mark.asyncio
async def test_get_top_ranked_empty_store(chroma_store: ChromaStore):
    """
    get_top_ranked returns an empty list when nothing has been stored.
    """
    # Act: query a store that was never written to
    ranked = await chroma_store.get_top_ranked(limit=10)

    # Assert
    assert ranked == []