Implement 'Top Ranked' feature and expand Habr sources

This commit is contained in:
Artur Mukhamadiev 2026-03-15 01:32:25 +03:00
parent 019d9161de
commit 9fdb4b35cd
8 changed files with 326 additions and 7 deletions

View File

@ -51,6 +51,7 @@ def get_router(storage: IVectorStore, processor: ILLMProvider, allowed_chat_id:
"/start - Start the bot\n" "/start - Start the bot\n"
"/help - Show this help message\n" "/help - Show this help message\n"
"/latest [category] - Show the latest enriched news trends\n" "/latest [category] - Show the latest enriched news trends\n"
"/hottest - Show top 10 ranked hot trends\n"
"/search query - Search for news\n" "/search query - Search for news\n"
"/stats - Show database statistics\n" "/stats - Show database statistics\n"
"/params - Show LLM processor parameters\n" "/params - Show LLM processor parameters\n"
@ -93,6 +94,27 @@ def get_router(storage: IVectorStore, processor: ILLMProvider, allowed_chat_id:
await message.answer("Latest news:", reply_markup=builder.as_markup()) await message.answer("Latest news:", reply_markup=builder.as_markup())
@router.message(Command("hottest"))
async def command_hottest_handler(message: Message) -> None:
    """
    Handle the `/hottest` command.

    Requests up to ten top-ranked items from the storage and answers with an
    inline keyboard holding one button per item. When the storage returns
    nothing, a plain "no trends" notice is sent instead.
    """
    top_items = await storage.get_top_ranked(limit=10)
    if top_items:
        keyboard = InlineKeyboardBuilder()
        for entry in top_items:
            # UUID5 derived from the item URL identifies the item in the
            # "detail:" callback payload.
            detail_id = str(uuid.uuid5(uuid.NAMESPACE_URL, entry.url))
            label = f"🔥 [{entry.relevance_score}/10] {entry.title}"
            button = InlineKeyboardButton(
                text=label,
                callback_data=f"detail:{detail_id}",
            )
            keyboard.row(button)
        await message.answer("Top 10 Hottest Trends:", reply_markup=keyboard.as_markup())
    else:
        await message.answer("No hot trends found yet.")
@router.message(Command("search")) @router.message(Command("search"))
async def command_search_handler(message: Message, command: CommandObject) -> None: async def command_search_handler(message: Message, command: CommandObject) -> None:
""" """

View File

@ -8,9 +8,6 @@ crawlers:
- type: rss - type: rss
url: "https://news.samsung.com/global/rss" url: "https://news.samsung.com/global/rss"
source: "Samsung Newsroom" source: "Samsung Newsroom"
- type: rss
url: "https://www.sony.com/en/SonyInfo/News/Service/rss.xml"
source: "Sony Newsroom"
- type: playwright - type: playwright
url: "https://cvpr.thecvf.com/Conferences/2025" url: "https://cvpr.thecvf.com/Conferences/2025"
source: "CVPR 2025" source: "CVPR 2025"
@ -61,10 +58,10 @@ crawlers:
url: "https://форумтехнопром.рф/" url: "https://форумтехнопром.рф/"
source: "Technoprom-2025" source: "Technoprom-2025"
selector: ".news-item" selector: ".news-item"
- type: playwright # - type: playwright
url: "https://www.innoprom.com/en/media/news/" # url: "https://www.innoprom.com/en/media/news/"
source: "INNOPROM-2025" # source: "INNOPROM-2025"
selector: ".news-list__item" # selector: ".news-list__item"
- type: playwright - type: playwright
url: "https://www.hannovermesse.de/en/news/news-articles/" url: "https://www.hannovermesse.de/en/news/news-articles/"
source: "Hannover Messe" source: "Hannover Messe"
@ -91,3 +88,12 @@ crawlers:
url: "https://t.me/s/addmeto" url: "https://t.me/s/addmeto"
source: "Telegram: Addmeto" source: "Telegram: Addmeto"
selector: ".tgme_widget_message_text" selector: ".tgme_widget_message_text"
- type: rss
url: "https://habr.com/ru/rss/hubs/hi/articles/?fl=ru"
source: "Habr HighLoad"
- type: rss
url: "https://habr.com/ru/rss/hubs/complete_code/articles/?fl=ru"
source: "Habr Code Quality"
- type: rss
url: "https://habr.com/ru/rss/articles/rated100/?fl=ru"
source: "Habr High Ranked"

View File

@ -29,3 +29,8 @@ class IVectorStore(ABC):
async def get_stats(self) -> dict[str, int]: async def get_stats(self) -> dict[str, int]:
"""Get storage statistics including total count and breakdown by category.""" """Get storage statistics including total count and breakdown by category."""
pass pass
@abstractmethod
async def get_top_ranked(self, limit: int = 10) -> List[EnrichedNewsItemDTO]:
    """
    Retrieve top ranked items by relevance score.

    Args:
        limit: Maximum number of items to return (10 when omitted).

    Returns:
        A list of enriched news items selected by relevance score; concrete
        stores define how the ranking is carried out.
    """
    ...

View File

@ -113,3 +113,20 @@ class ChromaStore(IVectorStore):
stats[key] = stats.get(key, 0) + 1 stats[key] = stats.get(key, 0) + 1
return stats return stats
async def get_top_ranked(self, limit: int = 10) -> List[EnrichedNewsItemDTO]:
    """Retrieve top ranked items by relevance score.

    Args:
        limit: Maximum number of items to return. Zero or a negative value
            yields an empty list.

    Returns:
        At most ``limit`` items ordered by ``relevance_score`` descending.
        Items with equal scores keep the order in which the collection
        returned them.
    """
    import heapq  # local import so the method does not rely on file-level changes

    # A non-positive limit can never select anything. Without this guard a
    # negative value would reach a slice such as items[:-1] and return
    # almost the entire collection.
    if limit <= 0:
        return []

    # Every record is fetched because the ranking is done here in Python on
    # the reconstructed DTOs.
    results = self.collection.get(include=["metadatas", "documents"])
    metadatas = results.get("metadatas") or []
    documents = results.get("documents") or []

    # Records without metadata cannot be rebuilt into a DTO and are skipped.
    items = [
        self._reconstruct_dto(meta, doc)
        for meta, doc in zip(metadatas, documents)
        if meta
    ]

    # nlargest is documented as equivalent to
    # sorted(items, key=..., reverse=True)[:limit] but does not sort the
    # part of the list that is thrown away.
    return heapq.nlargest(limit, items, key=lambda item: item.relevance_score)

View File

@ -170,6 +170,37 @@ async def test_command_stats_handler(router, mock_storage, allowed_chat_id):
args, kwargs = message.answer.call_args args, kwargs = message.answer.call_args
assert "Database Statistics" in args[0] assert "Database Statistics" in args[0]
@pytest.mark.asyncio
async def test_command_hottest_handler(router, mock_storage, allowed_chat_id, mock_item):
    """/hottest answers with the trends header and a keyboard built from the ranked items."""
    mock_storage.get_top_ranked.return_value = [mock_item]
    incoming = AsyncMock()
    incoming.chat.id = int(allowed_chat_id)
    incoming.answer = AsyncMock()
    hottest = get_handler(router, "command_hottest_handler")

    await hottest(message=incoming)

    mock_storage.get_top_ranked.assert_called_once_with(limit=10)
    incoming.answer.assert_called_once()
    positional, keyword = incoming.answer.call_args
    assert "Top 10 Hottest Trends:" in positional[0]
    assert "reply_markup" in keyword
    assert "🔥" in str(keyword["reply_markup"])
@pytest.mark.asyncio
async def test_command_hottest_handler_empty(router, mock_storage, allowed_chat_id):
    """/hottest answers with the 'no trends' notice when the storage returns nothing."""
    mock_storage.get_top_ranked.return_value = []
    incoming = AsyncMock()
    incoming.chat.id = int(allowed_chat_id)
    incoming.answer = AsyncMock()
    hottest = get_handler(router, "command_hottest_handler")

    await hottest(message=incoming)

    incoming.answer.assert_called_once_with("No hot trends found yet.")
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_access_middleware_allowed(allowed_chat_id): async def test_access_middleware_allowed(allowed_chat_id):
middleware = AccessMiddleware(allowed_chat_id) middleware = AccessMiddleware(allowed_chat_id)

View File

@ -0,0 +1,102 @@
import uuid
import pytest
from unittest.mock import AsyncMock, MagicMock
from aiogram.types import Message, InlineKeyboardMarkup
from datetime import datetime
from src.bot.handlers import get_router
from src.processor.dto import EnrichedNewsItemDTO
@pytest.fixture
def mock_storage():
    """Provide an ``AsyncMock`` that stands in for the storage dependency."""
    storage_double = AsyncMock()
    return storage_double
@pytest.fixture
def mock_processor():
    """Provide a ``MagicMock`` processor whose ``get_info()`` reports a test model."""
    llm_double = MagicMock()
    llm_double.configure_mock(**{"get_info.return_value": {"model": "test-model"}})
    return llm_double
@pytest.fixture
def allowed_chat_id():
    """Provide the chat id (as a string) that the router under test is built with."""
    chat_id = "123456789"
    return chat_id
@pytest.fixture
def router(mock_storage, mock_processor, allowed_chat_id):
    """Build the bot router wired to the mocked storage and processor."""
    return get_router(
        storage=mock_storage,
        processor=mock_processor,
        allowed_chat_id=allowed_chat_id,
    )
def get_handler(router, callback_name):
    """Return the message-handler callback on *router* whose function name is *callback_name*.

    Raises:
        ValueError: If no registered message handler has that name.
    """
    candidates = (
        registered.callback
        for registered in router.message.handlers
        if registered.callback.__name__ == callback_name
    )
    # The first match wins, mirroring a linear scan over the handler list.
    found = next(candidates, None)
    if found is None:
        raise ValueError(f"Handler {callback_name} not found")
    return found
@pytest.mark.asyncio
async def test_command_hottest_handler_success(router, mock_storage, allowed_chat_id):
    """
    Verify that /hottest queries get_top_ranked and renders one button per item.
    """
    # Arrange: three items whose scores are 10, 9 and 8.
    ranked_items = []
    for idx in range(3):
        ranked_items.append(
            EnrichedNewsItemDTO(
                title=f"Hot News {idx}",
                url=f"https://example.com/{idx}",
                content_text=f"Content {idx}",
                source="Source",
                timestamp=datetime.now(),
                relevance_score=10 - idx,
                summary_ru=f"Сводка {idx}",
                anomalies_detected=[],
                category="Tech",
            )
        )
    mock_storage.get_top_ranked.return_value = ranked_items

    incoming = AsyncMock()
    incoming.chat = MagicMock()
    incoming.chat.id = int(allowed_chat_id)
    incoming.answer = AsyncMock()
    hottest = get_handler(router, "command_hottest_handler")

    # Act
    await hottest(message=incoming)

    # Assert: storage queried once and a single reply sent.
    mock_storage.get_top_ranked.assert_called_once_with(limit=10)
    incoming.answer.assert_called_once()
    positional, keyword = incoming.answer.call_args
    assert "Top 10 Hottest Trends:" in positional[0]
    assert "reply_markup" in keyword

    keyboard = keyword["reply_markup"]
    assert isinstance(keyboard, InlineKeyboardMarkup)
    # One keyboard row per returned item.
    assert len(keyboard.inline_keyboard) == 3

    # The first button carries the icon, the score and the title.
    first_label = keyboard.inline_keyboard[0][0].text
    assert "🔥" in first_label
    assert "[10/10]" in first_label
    assert "Hot News 0" in first_label
@pytest.mark.asyncio
async def test_command_hottest_handler_empty(router, mock_storage, allowed_chat_id):
    """
    Verify that /hottest sends the 'no trends' notice when the storage is empty.
    """
    # Arrange
    mock_storage.get_top_ranked.return_value = []
    incoming = AsyncMock()
    incoming.chat = MagicMock()
    incoming.chat.id = int(allowed_chat_id)
    incoming.answer = AsyncMock()
    hottest = get_handler(router, "command_hottest_handler")

    # Act
    await hottest(message=incoming)

    # Assert
    mock_storage.get_top_ranked.assert_called_once_with(limit=10)
    incoming.answer.assert_called_once_with("No hot trends found yet.")

View File

@ -2,6 +2,7 @@ import pytest
import pytest_asyncio import pytest_asyncio
import uuid import uuid
from datetime import datetime, timezone from datetime import datetime, timezone
from unittest.mock import MagicMock
import chromadb import chromadb
from chromadb.config import Settings from chromadb.config import Settings
@ -259,3 +260,43 @@ async def test_search_sorting(chroma_store: ChromaStore):
# Should be sorted 5, 4, 3, 2, 1 # Should be sorted 5, 4, 3, 2, 1
scores = [r.relevance_score for r in results] scores = [r.relevance_score for r in results]
assert scores == [5, 4, 3, 2, 1] assert scores == [5, 4, 3, 2, 1]
@pytest.mark.asyncio
async def test_get_top_ranked_mock(chroma_store: ChromaStore):
    """get_top_ranked orders the records from collection.get by score and applies the limit."""
    # Arrange: swap the real collection for a mock that returns three records.
    stamp = "2023-11-01T12:00:00"
    fake_collection = MagicMock()
    fake_collection.get.return_value = {
        "metadatas": [
            {"title": "Low", "url": "url1", "relevance_score": 2, "timestamp": stamp},
            {"title": "High", "url": "url2", "relevance_score": 10, "timestamp": stamp},
            {"title": "Mid", "url": "url3", "relevance_score": 7, "timestamp": stamp},
        ],
        "documents": ["doc1", "doc2", "doc3"],
    }
    chroma_store.collection = fake_collection

    # Act
    ranked = await chroma_store.get_top_ranked(limit=2)

    # Assert
    fake_collection.get.assert_called_once_with(include=["metadatas", "documents"])
    assert len(ranked) == 2
    best, runner_up = ranked[0], ranked[1]
    assert best.title == "High"
    assert best.relevance_score == 10
    assert runner_up.title == "Mid"
    assert runner_up.relevance_score == 7
@pytest.mark.asyncio
async def test_get_top_ranked_empty(chroma_store: ChromaStore):
    """get_top_ranked returns no items when collection.get yields empty lists."""
    # Arrange
    fake_collection = MagicMock()
    fake_collection.get.return_value = {"metadatas": [], "documents": []}
    chroma_store.collection = fake_collection

    # Act
    ranked = await chroma_store.get_top_ranked(limit=10)

    # Assert
    assert len(ranked) == 0

View File

@ -0,0 +1,95 @@
import pytest
import pytest_asyncio
from datetime import datetime, timezone
import chromadb
from chromadb.config import Settings
from src.processor.dto import EnrichedNewsItemDTO
from src.storage.chroma_store import ChromaStore
@pytest_asyncio.fixture
async def chroma_store():
    """Yield a ChromaStore on an in-memory client, resetting the client before and after use."""
    settings = Settings(allow_reset=True)
    ephemeral = chromadb.EphemeralClient(settings)
    # Start from a clean state.
    ephemeral.reset()
    yield ChromaStore(client=ephemeral, collection_name="test_top_ranked_collection")
    # Reset again once the test has finished.
    ephemeral.reset()
@pytest.mark.asyncio
async def test_get_top_ranked_sorting(chroma_store: ChromaStore):
    """
    Verify that get_top_ranked returns items in descending relevance_score order.
    """
    # Arrange: build items with unordered scores, then store them.
    unordered_scores = [5, 10, 2, 8, 1]
    seeded = []
    for score in unordered_scores:
        seeded.append(
            EnrichedNewsItemDTO(
                title=f"News {score}",
                url=f"https://example.com/{score}",
                content_text=f"Content for news with score {score}",
                source="Source",
                timestamp=datetime.now(timezone.utc),
                relevance_score=score,
                summary_ru=f"Сводка {score}",
                anomalies_detected=[],
                category="Tech",
            )
        )
    for dto in seeded:
        await chroma_store.store(dto)

    # Act
    ranked = await chroma_store.get_top_ranked(limit=10)

    # Assert: all five come back, highest score first.
    assert len(ranked) == 5
    returned_scores = [entry.relevance_score for entry in ranked]
    assert returned_scores == [10, 8, 5, 2, 1]
    assert ranked[0].title == "News 10"
    assert ranked[-1].title == "News 1"
@pytest.mark.asyncio
async def test_get_top_ranked_limit(chroma_store: ChromaStore):
    """
    Verify that get_top_ranked returns no more items than the limit asks for.
    """
    # Arrange: ten items with scores 1 through 10.
    seeded = []
    for idx in range(1, 11):
        seeded.append(
            EnrichedNewsItemDTO(
                title=f"News {idx}",
                url=f"https://example.com/{idx}",
                content_text=f"Content {idx}",
                source="Source",
                timestamp=datetime.now(timezone.utc),
                relevance_score=idx,
                summary_ru=f"Сводка {idx}",
                anomalies_detected=[],
                category="Tech",
            )
        )
    for dto in seeded:
        await chroma_store.store(dto)

    # Act
    top_five = await chroma_store.get_top_ranked(limit=5)
    top_two = await chroma_store.get_top_ranked(limit=2)

    # Assert: sizes match the limits and the five best scores run 10 down to 6.
    assert len(top_five) == 5
    assert len(top_two) == 2
    assert top_five[0].relevance_score == 10
    assert top_five[4].relevance_score == 6
@pytest.mark.asyncio
async def test_get_top_ranked_empty_store(chroma_store: ChromaStore):
    """
    Verify that get_top_ranked yields an empty list when nothing has been stored.
    """
    # Act
    ranked = await chroma_store.get_top_ranked(limit=10)

    # Assert
    assert ranked == []