[perf] stabilization of previous release

This commit is contained in:
Artur Mukhamadiev 2026-03-13 13:23:30 +03:00
parent 9c8e4c7345
commit 4bf7cb4331
13 changed files with 243 additions and 32 deletions

View File

@ -2,12 +2,13 @@ from aiogram import Bot, Dispatcher
from aiogram.client.default import DefaultBotProperties from aiogram.client.default import DefaultBotProperties
from src.bot.handlers import get_router from src.bot.handlers import get_router
from src.storage.base import IVectorStore from src.storage.base import IVectorStore
from src.processor.base import ILLMProvider
def setup_bot(token: str, storage: IVectorStore, allowed_chat_id: str) -> tuple[Bot, Dispatcher]: def setup_bot(token: str, storage: IVectorStore, processor: ILLMProvider, allowed_chat_id: str) -> tuple[Bot, Dispatcher]:
""" """
Setup the aiogram Bot and Dispatcher with handlers. Setup the aiogram Bot and Dispatcher with handlers.
""" """
bot = Bot(token=token, default=DefaultBotProperties(parse_mode="HTML")) bot = Bot(token=token, default=DefaultBotProperties(parse_mode="HTML"))
dp = Dispatcher() dp = Dispatcher()
dp.include_router(get_router(storage, allowed_chat_id)) dp.include_router(get_router(storage, processor, allowed_chat_id))
return bot, dp return bot, dp

View File

@ -10,6 +10,7 @@ from aiogram.utils.keyboard import InlineKeyboardBuilder
from aiogram.utils.formatting import as_list, as_marked_section, Bold, TextLink from aiogram.utils.formatting import as_list, as_marked_section, Bold, TextLink
from src.processor.dto import EnrichedNewsItemDTO from src.processor.dto import EnrichedNewsItemDTO
from src.processor.base import ILLMProvider
from src.storage.base import IVectorStore from src.storage.base import IVectorStore
class AccessMiddleware(BaseMiddleware): class AccessMiddleware(BaseMiddleware):
@ -28,7 +29,7 @@ class AccessMiddleware(BaseMiddleware):
return return
return await handler(event, data) return await handler(event, data)
def get_router(storage: IVectorStore, allowed_chat_id: str) -> Router: def get_router(storage: IVectorStore, processor: ILLMProvider, allowed_chat_id: str) -> Router:
router = Router(name="main_router") router = Router(name="main_router")
router.message.middleware(AccessMiddleware(allowed_chat_id)) router.message.middleware(AccessMiddleware(allowed_chat_id))
@ -52,9 +53,24 @@ def get_router(storage: IVectorStore, allowed_chat_id: str) -> Router:
"/latest [category] - Show the latest enriched news trends\n" "/latest [category] - Show the latest enriched news trends\n"
"/search query - Search for news\n" "/search query - Search for news\n"
"/stats - Show database statistics\n" "/stats - Show database statistics\n"
"/params - Show LLM processor parameters\n"
) )
await message.answer(help_text) await message.answer(help_text)
@router.message(Command("params"))
async def command_params_handler(message: Message) -> None:
"""
This handler receives messages with `/params` command
"""
info = processor.get_info()
response = "🤖 <b>LLM Processor Parameters</b>\n\n"
response += f"<b>Model:</b> {html.escape(info.get('model', 'Unknown'))}\n"
response += f"<b>Base URL:</b> {html.escape(info.get('base_url', 'Unknown'))}\n"
response += f"<b>Prompt Summary:</b> {html.escape(info.get('prompt_summary', 'Unknown'))}"
await message.answer(response, parse_mode="HTML")
@router.message(Command("latest")) @router.message(Command("latest"))
async def command_latest_handler(message: Message, command: CommandObject) -> None: async def command_latest_handler(message: Message, command: CommandObject) -> None:
""" """
@ -146,12 +162,13 @@ def get_router(storage: IVectorStore, allowed_chat_id: str) -> Router:
This handler receives messages with `/stats` command This handler receives messages with `/stats` command
""" """
stats = await storage.get_stats() stats = await storage.get_stats()
total = stats.get("total", 0) total = stats.get("total_count", 0)
breakdown = [] breakdown = []
for cat, count in stats.items(): for key, count in stats.items():
if cat != "total": if key.startswith("category_"):
breakdown.append(f"- {cat}: {count}") cat_name = key.replace("category_", "")
breakdown.append(f"- {cat_name}: {count}")
response = f"📊 <b>Database Statistics</b>\n\nTotal items: {total}\n" response = f"📊 <b>Database Statistics</b>\n\nTotal items: {total}\n"
if breakdown: if breakdown:

View File

@ -1,5 +1,6 @@
import aiohttp import aiohttp
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
import logging
from datetime import datetime from datetime import datetime
from email.utils import parsedate_to_datetime from email.utils import parsedate_to_datetime
from typing import List from typing import List
@ -7,17 +8,26 @@ from typing import List
from src.crawlers.base import ICrawler from src.crawlers.base import ICrawler
from src.crawlers.dto import NewsItemDTO from src.crawlers.dto import NewsItemDTO
logger = logging.getLogger(__name__)
class RSSCrawler(ICrawler): class RSSCrawler(ICrawler):
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
def __init__(self, url: str, source: str): def __init__(self, url: str, source: str):
self.url = url self.url = url
self.source = source self.source = source
async def fetch_latest(self) -> List[NewsItemDTO]: async def fetch_latest(self) -> List[NewsItemDTO]:
async with aiohttp.ClientSession() as session: headers = {"User-Agent": self.USER_AGENT}
async with session.get(self.url) as response: try:
response.raise_for_status() async with aiohttp.ClientSession() as session:
xml_data = await response.text() async with session.get(self.url, headers=headers) as response:
return self._parse_xml(xml_data) response.raise_for_status()
xml_data = await response.text()
return self._parse_xml(xml_data)
except Exception as e:
logger.error(f"Error fetching RSS from {self.url}: {e}")
return []
def _parse_xml(self, xml_data: str) -> List[NewsItemDTO]: def _parse_xml(self, xml_data: str) -> List[NewsItemDTO]:
root = ET.fromstring(xml_data) root = ET.fromstring(xml_data)

View File

@ -9,6 +9,7 @@ from aiogram import Bot, Dispatcher
from src.crawlers.base import ICrawler from src.crawlers.base import ICrawler
from src.crawlers.rss_crawler import RSSCrawler from src.crawlers.rss_crawler import RSSCrawler
from src.crawlers.playwright_crawler import PlaywrightCrawler
from src.processor.ollama_provider import OllamaProvider from src.processor.ollama_provider import OllamaProvider
from src.storage.chroma_store import ChromaStore from src.storage.chroma_store import ChromaStore
from src.notifications.telegram import TelegramNotifier from src.notifications.telegram import TelegramNotifier
@ -49,7 +50,20 @@ async def main():
# 1. Initialize Components that do not depend on Bot # 1. Initialize Components that do not depend on Bot
crawlers: List[ICrawler] = [ crawlers: List[ICrawler] = [
RSSCrawler("https://habr.com/ru/rss/hubs/artificial_intelligence/articles/?fl=ru", source="Habr AI") RSSCrawler("https://habr.com/ru/rss/hubs/artificial_intelligence/articles/?fl=ru", source="Habr AI"),
RSSCrawler("https://www.nature.com/nature.rss", source="Nature"),
RSSCrawler("https://news.google.com/rss/search?q=WebOS+Chromium+Edge+AI+LGE+SmartTV&hl=en-US&gl=US&ceid=US:en", source="Google News R&D"),
RSSCrawler("https://news.samsung.com/global/rss", source="Samsung Newsroom"),
RSSCrawler("https://www.sony.com/en/SonyInfo/News/Service/rss.xml", source="Sony Newsroom"),
PlaywrightCrawler("https://cvpr.thecvf.com/Conferences/2025", source="CVPR 2025", selector=".conference-news-item"),
PlaywrightCrawler("https://www.ces.tech/news/press-releases.aspx", source="CES 2025", selector=".press-release-item"),
RSSCrawler("https://vc.ru/rss/tech", source="VC.ru Tech"),
RSSCrawler("https://rb.ru/rss/", source="RB.ru"),
RSSCrawler("https://www.science.org/rss/news_current.xml", source="Science News"),
RSSCrawler("https://ufn.ru/en/rss/", source="УФН"),
RSSCrawler("https://www.tadviser.ru/xml/tadviser.xml", source="TAdviser"),
RSSCrawler("https://habr.com/ru/rss/company/yandex/blog/", source="Yandex Tech"),
RSSCrawler("https://blog.google/technology/ai/rss/", source="Google AI Blog"),
] ]
processor = OllamaProvider() processor = OllamaProvider()
@ -62,7 +76,7 @@ async def main():
storage = ChromaStore(client=chroma_client) storage = ChromaStore(client=chroma_client)
# 2. Initialize Bot & Dispatcher # 2. Initialize Bot & Dispatcher
bot, dp = setup_bot(bot_token, storage, chat_id) bot, dp = setup_bot(bot_token, storage, processor, chat_id)
# 3. Initialize Notifier and Orchestrator # 3. Initialize Notifier and Orchestrator
notifier = TelegramNotifier(bot, chat_id) notifier = TelegramNotifier(bot, chat_id)

View File

@ -1,9 +1,12 @@
import logging
from typing import List from typing import List
from src.crawlers.base import ICrawler from src.crawlers.base import ICrawler
from src.processor.base import ILLMProvider from src.processor.base import ILLMProvider
from src.storage.base import IVectorStore from src.storage.base import IVectorStore
from src.notifications.base import INotificationService from src.notifications.base import INotificationService
logger = logging.getLogger(__name__)
class TrendScoutService: class TrendScoutService:
def __init__( def __init__(
self, self,
@ -19,18 +22,24 @@ class TrendScoutService:
async def run_iteration(self) -> None: async def run_iteration(self) -> None:
for crawler in self.crawlers: for crawler in self.crawlers:
items = await crawler.fetch_latest() try:
for item in items: items = await crawler.fetch_latest()
if await self.storage.exists(item.url): for item in items:
continue try:
if await self.storage.exists(item.url):
continue
enriched_item = await self.processor.analyze(item) enriched_item = await self.processor.analyze(item)
if enriched_item.relevance_score < 5: if enriched_item.relevance_score < 5:
enriched_item.content_text = "" enriched_item.content_text = ""
await self.storage.store(enriched_item) await self.storage.store(enriched_item)
# Proactive alerts are currently disabled per user request # Proactive alerts are currently disabled per user request
# if enriched_item.relevance_score >= 8 or bool(enriched_item.anomalies_detected): # if enriched_item.relevance_score >= 8 or bool(enriched_item.anomalies_detected):
# await self.notifier.send_alert(enriched_item) # await self.notifier.send_alert(enriched_item)
except Exception as e:
logger.error(f"Error processing item {item.url}: {e}")
except Exception as e:
logger.error(f"Error running crawler {crawler}: {e}")

View File

@ -6,3 +6,7 @@ class ILLMProvider(ABC):
@abstractmethod @abstractmethod
async def analyze(self, news_item: NewsItemDTO) -> EnrichedNewsItemDTO: async def analyze(self, news_item: NewsItemDTO) -> EnrichedNewsItemDTO:
pass pass
@abstractmethod
def get_info(self) -> dict[str, str]:
pass

View File

@ -7,6 +7,15 @@ from src.processor.base import ILLMProvider
from src.processor.dto import EnrichedNewsItemDTO from src.processor.dto import EnrichedNewsItemDTO
class OllamaProvider(ILLMProvider): class OllamaProvider(ILLMProvider):
def get_info(self) -> dict[str, str]:
base_url = os.environ.get('OLLAMA_API_URL', 'http://localhost:11434')
model = os.environ.get('OLLAMA_MODEL', 'gpt-oss:120b-cloud')
return {
"model": model,
"base_url": base_url,
"prompt_summary": "Russian summary, 2 sentences, R&D focus"
}
async def analyze(self, news_item: NewsItemDTO) -> EnrichedNewsItemDTO: async def analyze(self, news_item: NewsItemDTO) -> EnrichedNewsItemDTO:
base_url = os.environ.get('OLLAMA_API_URL', 'http://localhost:11434') base_url = os.environ.get('OLLAMA_API_URL', 'http://localhost:11434')
url = base_url if base_url.endswith('/api/generate') else f"{base_url.rstrip('/')}/api/generate" url = base_url if base_url.endswith('/api/generate') else f"{base_url.rstrip('/')}/api/generate"
@ -14,6 +23,7 @@ class OllamaProvider(ILLMProvider):
f"Analyze the following article.\nTitle: {news_item.title}\n" f"Analyze the following article.\nTitle: {news_item.title}\n"
f"Content: {news_item.content_text}\n" f"Content: {news_item.content_text}\n"
"Return JSON with 'relevance_score' (0-10), 'summary_ru' (string), 'anomalies_detected' (list of strings), and 'category' (string).\n" "Return JSON with 'relevance_score' (0-10), 'summary_ru' (string), 'anomalies_detected' (list of strings), and 'category' (string).\n"
"The 'summary_ru' MUST be in Russian and strictly NO MORE than 2 sentences.\n"
"The 'category' must be exactly one of: 'Browsers', 'Edge AI', 'SmartTV', 'Samsung New Technologies', 'Middleware new trends', 'Competitors', 'Other'.\n" "The 'category' must be exactly one of: 'Browsers', 'Edge AI', 'SmartTV', 'Samsung New Technologies', 'Middleware new trends', 'Competitors', 'Other'.\n"
"For 'relevance_score', prioritize and give higher scores to articles related to R&D, Chromium, NPU, and Smart TV operating systems.\n" "For 'relevance_score', prioritize and give higher scores to articles related to R&D, Chromium, NPU, and Smart TV operating systems.\n"
"Regarding 'anomalies_detected': only detect factual, conceptual, or industry-related anomalies (e.g., sudden technological shifts, unexpected competitor moves). " "Regarding 'anomalies_detected': only detect factual, conceptual, or industry-related anomalies (e.g., sudden technological shifts, unexpected competitor moves). "

View File

@ -68,6 +68,9 @@ class ChromaStore(IVectorStore):
document = documents[0][idx] if documents and documents[0] else "" document = documents[0][idx] if documents and documents[0] else ""
items.append(self._reconstruct_dto(metadata, document)) items.append(self._reconstruct_dto(metadata, document))
# Sort items by relevance_score in descending order
items.sort(key=lambda x: x.relevance_score, reverse=True)
return items return items
def _reconstruct_dto(self, metadata: Mapping[str, Any], document: str) -> EnrichedNewsItemDTO: def _reconstruct_dto(self, metadata: Mapping[str, Any], document: str) -> EnrichedNewsItemDTO:

View File

@ -35,8 +35,18 @@ def allowed_chat_id():
return "123456789" return "123456789"
@pytest.fixture @pytest.fixture
def router(mock_storage, allowed_chat_id): def mock_processor():
return get_router(mock_storage, allowed_chat_id) processor = MagicMock()
processor.get_info.return_value = {
"model": "test-model",
"base_url": "http://test-url",
"prompt_summary": "Test summary"
}
return processor
@pytest.fixture
def router(mock_storage, mock_processor, allowed_chat_id):
return get_router(mock_storage, mock_processor, allowed_chat_id)
def get_handler(router, callback_name): def get_handler(router, callback_name):
for handler in router.message.handlers: for handler in router.message.handlers:
@ -79,6 +89,22 @@ async def test_command_help_handler(router, allowed_chat_id):
assert "/start" in args[0] assert "/start" in args[0]
assert "/latest" in args[0] assert "/latest" in args[0]
@pytest.mark.asyncio
async def test_command_params_handler(router, mock_processor, allowed_chat_id):
handler = get_handler(router, "command_params_handler")
message = AsyncMock()
message.chat.id = int(allowed_chat_id)
message.answer = AsyncMock()
await handler(message)
mock_processor.get_info.assert_called_once()
message.answer.assert_called_once()
args, kwargs = message.answer.call_args
assert "LLM Processor Parameters" in args[0]
assert "test-model" in args[0]
assert "HTML" in kwargs.get("parse_mode", "")
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_command_latest_handler(router, mock_storage, allowed_chat_id): async def test_command_latest_handler(router, mock_storage, allowed_chat_id):
handler = get_handler(router, "command_latest_handler") handler = get_handler(router, "command_latest_handler")

View File

@ -45,8 +45,8 @@ async def test_rss_crawler_fetch_latest():
# Call the method # Call the method
items = await crawler.fetch_latest() items = await crawler.fetch_latest()
# Verify the mock was called with the correct URL # Verify the mock was called with the correct URL and headers
mock_get.assert_called_once_with(url) mock_get.assert_called_once_with(url, headers={"User-Agent": RSSCrawler.USER_AGENT})
# Verify the parsing results # Verify the parsing results
assert len(items) == 2 assert len(items) == 2
@ -66,3 +66,17 @@ async def test_rss_crawler_fetch_latest():
assert items[1].content_text == "Test Content 2" assert items[1].content_text == "Test Content 2"
assert items[1].source == source assert items[1].source == source
assert items[1].timestamp == datetime(2002, 10, 3, 10, 0, tzinfo=timezone.utc) assert items[1].timestamp == datetime(2002, 10, 3, 10, 0, tzinfo=timezone.utc)
@pytest.mark.asyncio
async def test_rss_crawler_fetch_latest_error():
url = "http://error.source.com/rss"
source = "Error Source"
crawler = RSSCrawler(url, source)
with patch("aiohttp.ClientSession.get") as mock_get:
# Simulate an exception during the request
mock_get.side_effect = Exception("Connection error")
items = await crawler.fetch_latest()
assert items == []

View File

@ -100,3 +100,67 @@ async def test_run_iteration():
# Should not alert proactively anymore as per updated requirements # Should not alert proactively anymore as per updated requirements
assert notifier_mock.send_alert.call_count == 0 assert notifier_mock.send_alert.call_count == 0
@pytest.mark.asyncio
async def test_run_iteration_crawler_failure():
# Arrange
crawler1 = AsyncMock()
crawler2 = AsyncMock()
processor = AsyncMock()
storage = AsyncMock()
notifier = AsyncMock()
crawler1.fetch_latest.side_effect = Exception("Crawler 1 failed")
crawler2.fetch_latest.return_value = []
service = TrendScoutService(
crawlers=[crawler1, crawler2],
processor=processor,
storage=storage,
notifier=notifier
)
# Act
await service.run_iteration()
# Assert - crawler 1 failed, but it shouldn't stop crawler 2
crawler1.fetch_latest.assert_called_once()
crawler2.fetch_latest.assert_called_once()
@pytest.mark.asyncio
async def test_run_iteration_item_failure():
# Arrange
crawler = AsyncMock()
processor = AsyncMock()
storage = AsyncMock()
notifier = AsyncMock()
item1 = NewsItemDTO(title="T1", url="U1", content_text="C1", source="S1", timestamp=datetime.now())
item2 = NewsItemDTO(title="T2", url="U2", content_text="C2", source="S2", timestamp=datetime.now())
crawler.fetch_latest.return_value = [item1, item2]
storage.exists.return_value = False
# processor.analyze fails for item1
enriched_item2 = EnrichedNewsItemDTO(
**item2.model_dump(),
relevance_score=6,
summary_ru="Summary",
anomalies_detected=[]
)
processor.analyze.side_effect = [Exception("Analyze failed"), enriched_item2]
service = TrendScoutService(
crawlers=[crawler],
processor=processor,
storage=storage,
notifier=notifier
)
# Act
await service.run_iteration()
# Assert - item 1 failed, but it shouldn't stop item 2
assert processor.analyze.call_count == 2
# Only item 2 should be stored
assert storage.store.call_count == 1

View File

@ -106,3 +106,13 @@ async def test_ollama_provider_analyze_markdown_json(sample_news_item):
assert result.anomalies_detected == [] assert result.anomalies_detected == []
assert result.category == "Browsers" assert result.category == "Browsers"
def test_ollama_provider_get_info():
os.environ['OLLAMA_API_URL'] = 'http://test-url:11434'
os.environ['OLLAMA_MODEL'] = 'test-model'
provider = OllamaProvider()
info = provider.get_info()
assert info["model"] == "test-model"
assert info["base_url"] == "http://test-url:11434"
assert info["prompt_summary"] == "Russian summary, 2 sentences, R&D focus"

View File

@ -230,3 +230,32 @@ async def test_get_stats(chroma_store: ChromaStore):
assert stats["total_count"] == 3 assert stats["total_count"] == 3
assert stats["category_Tech"] == 2 assert stats["category_Tech"] == 2
assert stats["category_Science"] == 1 assert stats["category_Science"] == 1
@pytest.mark.asyncio
async def test_search_sorting(chroma_store: ChromaStore):
# Arrange
items = [
EnrichedNewsItemDTO(
title=f"Title {i}",
url=f"https://example.com/{i}",
content_text=f"Content {i}",
source="Source",
timestamp=datetime.now(timezone.utc),
relevance_score=i,
summary_ru=f"Сводка {i}",
anomalies_detected=[],
category="Tech"
) for i in range(1, 6) # Scores 1 to 5
]
for item in items:
await chroma_store.store(item)
# Act
results = await chroma_store.search("Content", limit=10)
# Assert
assert len(results) == 5
# Should be sorted 5, 4, 3, 2, 1
scores = [r.relevance_score for r in results]
assert scores == [5, 4, 3, 2, 1]