[perf] stabilization of previous release

This commit is contained in:
Artur Mukhamadiev 2026-03-13 13:23:30 +03:00
parent 9c8e4c7345
commit 4bf7cb4331
13 changed files with 243 additions and 32 deletions

View File

@ -2,12 +2,13 @@ from aiogram import Bot, Dispatcher
from aiogram.client.default import DefaultBotProperties
from src.bot.handlers import get_router
from src.storage.base import IVectorStore
from src.processor.base import ILLMProvider
def setup_bot(token: str, storage: IVectorStore, allowed_chat_id: str) -> tuple[Bot, Dispatcher]:
def setup_bot(token: str, storage: IVectorStore, processor: ILLMProvider, allowed_chat_id: str) -> tuple[Bot, Dispatcher]:
"""
Setup the aiogram Bot and Dispatcher with handlers.
"""
bot = Bot(token=token, default=DefaultBotProperties(parse_mode="HTML"))
dp = Dispatcher()
dp.include_router(get_router(storage, allowed_chat_id))
dp.include_router(get_router(storage, processor, allowed_chat_id))
return bot, dp

View File

@ -10,6 +10,7 @@ from aiogram.utils.keyboard import InlineKeyboardBuilder
from aiogram.utils.formatting import as_list, as_marked_section, Bold, TextLink
from src.processor.dto import EnrichedNewsItemDTO
from src.processor.base import ILLMProvider
from src.storage.base import IVectorStore
class AccessMiddleware(BaseMiddleware):
@ -28,7 +29,7 @@ class AccessMiddleware(BaseMiddleware):
return
return await handler(event, data)
def get_router(storage: IVectorStore, allowed_chat_id: str) -> Router:
def get_router(storage: IVectorStore, processor: ILLMProvider, allowed_chat_id: str) -> Router:
router = Router(name="main_router")
router.message.middleware(AccessMiddleware(allowed_chat_id))
@ -52,9 +53,24 @@ def get_router(storage: IVectorStore, allowed_chat_id: str) -> Router:
"/latest [category] - Show the latest enriched news trends\n"
"/search query - Search for news\n"
"/stats - Show database statistics\n"
"/params - Show LLM processor parameters\n"
)
await message.answer(help_text)
@router.message(Command("params"))
async def command_params_handler(message: Message) -> None:
"""
This handler receives messages with `/params` command
"""
info = processor.get_info()
response = "🤖 <b>LLM Processor Parameters</b>\n\n"
response += f"<b>Model:</b> {html.escape(info.get('model', 'Unknown'))}\n"
response += f"<b>Base URL:</b> {html.escape(info.get('base_url', 'Unknown'))}\n"
response += f"<b>Prompt Summary:</b> {html.escape(info.get('prompt_summary', 'Unknown'))}"
await message.answer(response, parse_mode="HTML")
@router.message(Command("latest"))
async def command_latest_handler(message: Message, command: CommandObject) -> None:
"""
@ -146,12 +162,13 @@ def get_router(storage: IVectorStore, allowed_chat_id: str) -> Router:
This handler receives messages with `/stats` command
"""
stats = await storage.get_stats()
total = stats.get("total", 0)
total = stats.get("total_count", 0)
breakdown = []
for cat, count in stats.items():
if cat != "total":
breakdown.append(f"- {cat}: {count}")
for key, count in stats.items():
if key.startswith("category_"):
cat_name = key.replace("category_", "")
breakdown.append(f"- {cat_name}: {count}")
response = f"📊 <b>Database Statistics</b>\n\nTotal items: {total}\n"
if breakdown:

View File

@ -1,5 +1,6 @@
import aiohttp
import xml.etree.ElementTree as ET
import logging
from datetime import datetime
from email.utils import parsedate_to_datetime
from typing import List
@ -7,17 +8,26 @@ from typing import List
from src.crawlers.base import ICrawler
from src.crawlers.dto import NewsItemDTO
logger = logging.getLogger(__name__)
class RSSCrawler(ICrawler):
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
def __init__(self, url: str, source: str):
self.url = url
self.source = source
async def fetch_latest(self) -> List[NewsItemDTO]:
headers = {"User-Agent": self.USER_AGENT}
try:
async with aiohttp.ClientSession() as session:
async with session.get(self.url) as response:
async with session.get(self.url, headers=headers) as response:
response.raise_for_status()
xml_data = await response.text()
return self._parse_xml(xml_data)
except Exception as e:
logger.error(f"Error fetching RSS from {self.url}: {e}")
return []
def _parse_xml(self, xml_data: str) -> List[NewsItemDTO]:
root = ET.fromstring(xml_data)

View File

@ -9,6 +9,7 @@ from aiogram import Bot, Dispatcher
from src.crawlers.base import ICrawler
from src.crawlers.rss_crawler import RSSCrawler
from src.crawlers.playwright_crawler import PlaywrightCrawler
from src.processor.ollama_provider import OllamaProvider
from src.storage.chroma_store import ChromaStore
from src.notifications.telegram import TelegramNotifier
@ -49,7 +50,20 @@ async def main():
# 1. Initialize Components that do not depend on Bot
crawlers: List[ICrawler] = [
RSSCrawler("https://habr.com/ru/rss/hubs/artificial_intelligence/articles/?fl=ru", source="Habr AI")
RSSCrawler("https://habr.com/ru/rss/hubs/artificial_intelligence/articles/?fl=ru", source="Habr AI"),
RSSCrawler("https://www.nature.com/nature.rss", source="Nature"),
RSSCrawler("https://news.google.com/rss/search?q=WebOS+Chromium+Edge+AI+LGE+SmartTV&hl=en-US&gl=US&ceid=US:en", source="Google News R&D"),
RSSCrawler("https://news.samsung.com/global/rss", source="Samsung Newsroom"),
RSSCrawler("https://www.sony.com/en/SonyInfo/News/Service/rss.xml", source="Sony Newsroom"),
PlaywrightCrawler("https://cvpr.thecvf.com/Conferences/2025", source="CVPR 2025", selector=".conference-news-item"),
PlaywrightCrawler("https://www.ces.tech/news/press-releases.aspx", source="CES 2025", selector=".press-release-item"),
RSSCrawler("https://vc.ru/rss/tech", source="VC.ru Tech"),
RSSCrawler("https://rb.ru/rss/", source="RB.ru"),
RSSCrawler("https://www.science.org/rss/news_current.xml", source="Science News"),
RSSCrawler("https://ufn.ru/en/rss/", source="УФН"),
RSSCrawler("https://www.tadviser.ru/xml/tadviser.xml", source="TAdviser"),
RSSCrawler("https://habr.com/ru/rss/company/yandex/blog/", source="Yandex Tech"),
RSSCrawler("https://blog.google/technology/ai/rss/", source="Google AI Blog"),
]
processor = OllamaProvider()
@ -62,7 +76,7 @@ async def main():
storage = ChromaStore(client=chroma_client)
# 2. Initialize Bot & Dispatcher
bot, dp = setup_bot(bot_token, storage, chat_id)
bot, dp = setup_bot(bot_token, storage, processor, chat_id)
# 3. Initialize Notifier and Orchestrator
notifier = TelegramNotifier(bot, chat_id)

View File

@ -1,9 +1,12 @@
import logging
from typing import List
from src.crawlers.base import ICrawler
from src.processor.base import ILLMProvider
from src.storage.base import IVectorStore
from src.notifications.base import INotificationService
logger = logging.getLogger(__name__)
class TrendScoutService:
def __init__(
self,
@ -19,8 +22,10 @@ class TrendScoutService:
async def run_iteration(self) -> None:
for crawler in self.crawlers:
try:
items = await crawler.fetch_latest()
for item in items:
try:
if await self.storage.exists(item.url):
continue
@ -34,3 +39,7 @@ class TrendScoutService:
# Proactive alerts are currently disabled per user request
# if enriched_item.relevance_score >= 8 or bool(enriched_item.anomalies_detected):
# await self.notifier.send_alert(enriched_item)
except Exception as e:
logger.error(f"Error processing item {item.url}: {e}")
except Exception as e:
logger.error(f"Error running crawler {crawler}: {e}")

View File

@ -6,3 +6,7 @@ class ILLMProvider(ABC):
@abstractmethod
async def analyze(self, news_item: NewsItemDTO) -> EnrichedNewsItemDTO:
pass
@abstractmethod
def get_info(self) -> dict[str, str]:
pass

View File

@ -7,6 +7,15 @@ from src.processor.base import ILLMProvider
from src.processor.dto import EnrichedNewsItemDTO
class OllamaProvider(ILLMProvider):
def get_info(self) -> dict[str, str]:
base_url = os.environ.get('OLLAMA_API_URL', 'http://localhost:11434')
model = os.environ.get('OLLAMA_MODEL', 'gpt-oss:120b-cloud')
return {
"model": model,
"base_url": base_url,
"prompt_summary": "Russian summary, 2 sentences, R&D focus"
}
async def analyze(self, news_item: NewsItemDTO) -> EnrichedNewsItemDTO:
base_url = os.environ.get('OLLAMA_API_URL', 'http://localhost:11434')
url = base_url if base_url.endswith('/api/generate') else f"{base_url.rstrip('/')}/api/generate"
@ -14,6 +23,7 @@ class OllamaProvider(ILLMProvider):
f"Analyze the following article.\nTitle: {news_item.title}\n"
f"Content: {news_item.content_text}\n"
"Return JSON with 'relevance_score' (0-10), 'summary_ru' (string), 'anomalies_detected' (list of strings), and 'category' (string).\n"
"The 'summary_ru' MUST be in Russian and strictly NO MORE than 2 sentences.\n"
"The 'category' must be exactly one of: 'Browsers', 'Edge AI', 'SmartTV', 'Samsung New Technologies', 'Middleware new trends', 'Competitors', 'Other'.\n"
"For 'relevance_score', prioritize and give higher scores to articles related to R&D, Chromium, NPU, and Smart TV operating systems.\n"
"Regarding 'anomalies_detected': only detect factual, conceptual, or industry-related anomalies (e.g., sudden technological shifts, unexpected competitor moves). "

View File

@ -68,6 +68,9 @@ class ChromaStore(IVectorStore):
document = documents[0][idx] if documents and documents[0] else ""
items.append(self._reconstruct_dto(metadata, document))
# Sort items by relevance_score in descending order
items.sort(key=lambda x: x.relevance_score, reverse=True)
return items
def _reconstruct_dto(self, metadata: Mapping[str, Any], document: str) -> EnrichedNewsItemDTO:

View File

@ -35,8 +35,18 @@ def allowed_chat_id():
return "123456789"
@pytest.fixture
def router(mock_storage, allowed_chat_id):
return get_router(mock_storage, allowed_chat_id)
def mock_processor():
processor = MagicMock()
processor.get_info.return_value = {
"model": "test-model",
"base_url": "http://test-url",
"prompt_summary": "Test summary"
}
return processor
@pytest.fixture
def router(mock_storage, mock_processor, allowed_chat_id):
return get_router(mock_storage, mock_processor, allowed_chat_id)
def get_handler(router, callback_name):
for handler in router.message.handlers:
@ -79,6 +89,22 @@ async def test_command_help_handler(router, allowed_chat_id):
assert "/start" in args[0]
assert "/latest" in args[0]
@pytest.mark.asyncio
async def test_command_params_handler(router, mock_processor, allowed_chat_id):
handler = get_handler(router, "command_params_handler")
message = AsyncMock()
message.chat.id = int(allowed_chat_id)
message.answer = AsyncMock()
await handler(message)
mock_processor.get_info.assert_called_once()
message.answer.assert_called_once()
args, kwargs = message.answer.call_args
assert "LLM Processor Parameters" in args[0]
assert "test-model" in args[0]
assert "HTML" in kwargs.get("parse_mode", "")
@pytest.mark.asyncio
async def test_command_latest_handler(router, mock_storage, allowed_chat_id):
handler = get_handler(router, "command_latest_handler")

View File

@ -45,8 +45,8 @@ async def test_rss_crawler_fetch_latest():
# Call the method
items = await crawler.fetch_latest()
# Verify the mock was called with the correct URL
mock_get.assert_called_once_with(url)
# Verify the mock was called with the correct URL and headers
mock_get.assert_called_once_with(url, headers={"User-Agent": RSSCrawler.USER_AGENT})
# Verify the parsing results
assert len(items) == 2
@ -66,3 +66,17 @@ async def test_rss_crawler_fetch_latest():
assert items[1].content_text == "Test Content 2"
assert items[1].source == source
assert items[1].timestamp == datetime(2002, 10, 3, 10, 0, tzinfo=timezone.utc)
@pytest.mark.asyncio
async def test_rss_crawler_fetch_latest_error():
url = "http://error.source.com/rss"
source = "Error Source"
crawler = RSSCrawler(url, source)
with patch("aiohttp.ClientSession.get") as mock_get:
# Simulate an exception during the request
mock_get.side_effect = Exception("Connection error")
items = await crawler.fetch_latest()
assert items == []

View File

@ -100,3 +100,67 @@ async def test_run_iteration():
# Should not alert proactively anymore as per updated requirements
assert notifier_mock.send_alert.call_count == 0
@pytest.mark.asyncio
async def test_run_iteration_crawler_failure():
# Arrange
crawler1 = AsyncMock()
crawler2 = AsyncMock()
processor = AsyncMock()
storage = AsyncMock()
notifier = AsyncMock()
crawler1.fetch_latest.side_effect = Exception("Crawler 1 failed")
crawler2.fetch_latest.return_value = []
service = TrendScoutService(
crawlers=[crawler1, crawler2],
processor=processor,
storage=storage,
notifier=notifier
)
# Act
await service.run_iteration()
# Assert - crawler 1 failed, but it shouldn't stop crawler 2
crawler1.fetch_latest.assert_called_once()
crawler2.fetch_latest.assert_called_once()
@pytest.mark.asyncio
async def test_run_iteration_item_failure():
# Arrange
crawler = AsyncMock()
processor = AsyncMock()
storage = AsyncMock()
notifier = AsyncMock()
item1 = NewsItemDTO(title="T1", url="U1", content_text="C1", source="S1", timestamp=datetime.now())
item2 = NewsItemDTO(title="T2", url="U2", content_text="C2", source="S2", timestamp=datetime.now())
crawler.fetch_latest.return_value = [item1, item2]
storage.exists.return_value = False
# processor.analyze fails for item1
enriched_item2 = EnrichedNewsItemDTO(
**item2.model_dump(),
relevance_score=6,
summary_ru="Summary",
anomalies_detected=[]
)
processor.analyze.side_effect = [Exception("Analyze failed"), enriched_item2]
service = TrendScoutService(
crawlers=[crawler],
processor=processor,
storage=storage,
notifier=notifier
)
# Act
await service.run_iteration()
# Assert - item 1 failed, but it shouldn't stop item 2
assert processor.analyze.call_count == 2
# Only item 2 should be stored
assert storage.store.call_count == 1

View File

@ -106,3 +106,13 @@ async def test_ollama_provider_analyze_markdown_json(sample_news_item):
assert result.anomalies_detected == []
assert result.category == "Browsers"
def test_ollama_provider_get_info():
os.environ['OLLAMA_API_URL'] = 'http://test-url:11434'
os.environ['OLLAMA_MODEL'] = 'test-model'
provider = OllamaProvider()
info = provider.get_info()
assert info["model"] == "test-model"
assert info["base_url"] == "http://test-url:11434"
assert info["prompt_summary"] == "Russian summary, 2 sentences, R&D focus"

View File

@ -230,3 +230,32 @@ async def test_get_stats(chroma_store: ChromaStore):
assert stats["total_count"] == 3
assert stats["category_Tech"] == 2
assert stats["category_Science"] == 1
@pytest.mark.asyncio
async def test_search_sorting(chroma_store: ChromaStore):
# Arrange
items = [
EnrichedNewsItemDTO(
title=f"Title {i}",
url=f"https://example.com/{i}",
content_text=f"Content {i}",
source="Source",
timestamp=datetime.now(timezone.utc),
relevance_score=i,
summary_ru=f"Сводка {i}",
anomalies_detected=[],
category="Tech"
) for i in range(1, 6) # Scores 1 to 5
]
for item in items:
await chroma_store.store(item)
# Act
results = await chroma_store.search("Content", limit=10)
# Assert
assert len(results) == 5
# Should be sorted 5, 4, 3, 2, 1
scores = [r.relevance_score for r in results]
assert scores == [5, 4, 3, 2, 1]