[perf] stabilization of previous release

2026-03-13 13:23:30 +03:00 · 2026-03-13 13:23:30 +03:00 · 4bf7cb4331
commit 4bf7cb4331
parent 9c8e4c7345
13 changed files with 243 additions and 32 deletions
--- a/src/bot/bot.py
+++ b/src/bot/bot.py
@ -2,12 +2,13 @@ from aiogram import Bot, Dispatcher
 from aiogram.client.default import DefaultBotProperties
 from src.bot.handlers import get_router
 from src.storage.base import IVectorStore
 from src.processor.base import ILLMProvider
-def setup_bot(token: str, storage: IVectorStore, allowed_chat_id: str) -> tuple[Bot, Dispatcher]:
+def setup_bot(token: str, storage: IVectorStore, processor: ILLMProvider, allowed_chat_id: str) -> tuple[Bot, Dispatcher]:
    """
    Setup the aiogram Bot and Dispatcher with handlers.
    """
    bot = Bot(token=token, default=DefaultBotProperties(parse_mode="HTML"))
    dp = Dispatcher()
-    dp.include_router(get_router(storage, allowed_chat_id))
+    dp.include_router(get_router(storage, processor, allowed_chat_id))
    return bot, dp
--- a/src/bot/handlers.py
+++ b/src/bot/handlers.py
@ -10,6 +10,7 @@ from aiogram.utils.keyboard import InlineKeyboardBuilder
 from aiogram.utils.formatting import as_list, as_marked_section, Bold, TextLink
 from src.processor.dto import EnrichedNewsItemDTO
 from src.processor.base import ILLMProvider
 from src.storage.base import IVectorStore
 class AccessMiddleware(BaseMiddleware):
@ -28,7 +29,7 @@ class AccessMiddleware(BaseMiddleware):
                return
        return await handler(event, data)
-def get_router(storage: IVectorStore, allowed_chat_id: str) -> Router:
+def get_router(storage: IVectorStore, processor: ILLMProvider, allowed_chat_id: str) -> Router:
    router = Router(name="main_router")
    router.message.middleware(AccessMiddleware(allowed_chat_id))
@ -52,9 +53,24 @@ def get_router(storage: IVectorStore, allowed_chat_id: str) -> Router:
            "/latest [category] - Show the latest enriched news trends\n"
            "/search query - Search for news\n"
            "/stats - Show database statistics\n"
            "/params - Show LLM processor parameters\n"
        )
        await message.answer(help_text)
    @router.message(Command("params"))
    async def command_params_handler(message: Message) -> None:
        """
        This handler receives messages with `/params` command
        """
        info = processor.get_info()
        response = "🤖 <b>LLM Processor Parameters</b>\n\n"
        response += f"<b>Model:</b> {html.escape(info.get('model', 'Unknown'))}\n"
        response += f"<b>Base URL:</b> {html.escape(info.get('base_url', 'Unknown'))}\n"
        response += f"<b>Prompt Summary:</b> {html.escape(info.get('prompt_summary', 'Unknown'))}"
        await message.answer(response, parse_mode="HTML")
    @router.message(Command("latest"))
    async def command_latest_handler(message: Message, command: CommandObject) -> None:
        """
@ -146,12 +162,13 @@ def get_router(storage: IVectorStore, allowed_chat_id: str) -> Router:
        This handler receives messages with `/stats` command
        """
        stats = await storage.get_stats()
-        total = stats.get("total", 0)
+        total = stats.get("total_count", 0)
        breakdown = []
-        for cat, count in stats.items():
+        for key, count in stats.items():
-            if cat != "total":
+            if key.startswith("category_"):
-                breakdown.append(f"- {cat}: {count}")
+                cat_name = key.replace("category_", "")
                breakdown.append(f"- {cat_name}: {count}")
        response = f"📊 <b>Database Statistics</b>\n\nTotal items: {total}\n"
        if breakdown:
--- a/src/crawlers/rss_crawler.py
+++ b/src/crawlers/rss_crawler.py
@ -1,5 +1,6 @@
 import aiohttp
 import xml.etree.ElementTree as ET
 import logging
 from datetime import datetime
 from email.utils import parsedate_to_datetime
 from typing import List
@ -7,17 +8,26 @@ from typing import List
 from src.crawlers.base import ICrawler
 from src.crawlers.dto import NewsItemDTO
 logger = logging.getLogger(__name__)
 class RSSCrawler(ICrawler):
    USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    def __init__(self, url: str, source: str):
        self.url = url
        self.source = source
    async def fetch_latest(self) -> List[NewsItemDTO]:
-        async with aiohttp.ClientSession() as session:
+        headers = {"User-Agent": self.USER_AGENT}
-            async with session.get(self.url) as response:
+        try:
-                response.raise_for_status()
+            async with aiohttp.ClientSession() as session:
-                xml_data = await response.text()
+                async with session.get(self.url, headers=headers) as response:
-                return self._parse_xml(xml_data)
+                    response.raise_for_status()
                    xml_data = await response.text()
                    return self._parse_xml(xml_data)
        except Exception as e:
            logger.error(f"Error fetching RSS from {self.url}: {e}")
            return []
    def _parse_xml(self, xml_data: str) -> List[NewsItemDTO]:
        root = ET.fromstring(xml_data)
--- a/src/main.py
+++ b/src/main.py
@ -9,6 +9,7 @@ from aiogram import Bot, Dispatcher
 from src.crawlers.base import ICrawler
 from src.crawlers.rss_crawler import RSSCrawler
 from src.crawlers.playwright_crawler import PlaywrightCrawler
 from src.processor.ollama_provider import OllamaProvider
 from src.storage.chroma_store import ChromaStore
 from src.notifications.telegram import TelegramNotifier
@ -49,7 +50,20 @@ async def main():
    # 1. Initialize Components that do not depend on Bot
    crawlers: List[ICrawler] = [
-        RSSCrawler("https://habr.com/ru/rss/hubs/artificial_intelligence/articles/?fl=ru", source="Habr AI")
+        RSSCrawler("https://habr.com/ru/rss/hubs/artificial_intelligence/articles/?fl=ru", source="Habr AI"),
        RSSCrawler("https://www.nature.com/nature.rss", source="Nature"),
        RSSCrawler("https://news.google.com/rss/search?q=WebOS+Chromium+Edge+AI+LGE+SmartTV&hl=en-US&gl=US&ceid=US:en", source="Google News R&D"),
        RSSCrawler("https://news.samsung.com/global/rss", source="Samsung Newsroom"),
        RSSCrawler("https://www.sony.com/en/SonyInfo/News/Service/rss.xml", source="Sony Newsroom"),
        PlaywrightCrawler("https://cvpr.thecvf.com/Conferences/2025", source="CVPR 2025", selector=".conference-news-item"),
        PlaywrightCrawler("https://www.ces.tech/news/press-releases.aspx", source="CES 2025", selector=".press-release-item"),
        RSSCrawler("https://vc.ru/rss/tech", source="VC.ru Tech"),
        RSSCrawler("https://rb.ru/rss/", source="RB.ru"),
        RSSCrawler("https://www.science.org/rss/news_current.xml", source="Science News"),
        RSSCrawler("https://ufn.ru/en/rss/", source="УФН"),
        RSSCrawler("https://www.tadviser.ru/xml/tadviser.xml", source="TAdviser"),
        RSSCrawler("https://habr.com/ru/rss/company/yandex/blog/", source="Yandex Tech"),
        RSSCrawler("https://blog.google/technology/ai/rss/", source="Google AI Blog"),
    ]
    processor = OllamaProvider()
@ -62,7 +76,7 @@ async def main():
    storage = ChromaStore(client=chroma_client)
    # 2. Initialize Bot & Dispatcher
-    bot, dp = setup_bot(bot_token, storage, chat_id)
+    bot, dp = setup_bot(bot_token, storage, processor, chat_id)
    # 3. Initialize Notifier and Orchestrator
    notifier = TelegramNotifier(bot, chat_id)
--- a/src/orchestrator/service.py
+++ b/src/orchestrator/service.py
@ -1,9 +1,12 @@
 import logging
 from typing import List
 from src.crawlers.base import ICrawler
 from src.processor.base import ILLMProvider
 from src.storage.base import IVectorStore
 from src.notifications.base import INotificationService
 logger = logging.getLogger(__name__)
 class TrendScoutService:
    def __init__(
        self,
@ -19,18 +22,24 @@ class TrendScoutService:
    async def run_iteration(self) -> None:
        for crawler in self.crawlers:
-            items = await crawler.fetch_latest()
+            try:
-            for item in items:
+                items = await crawler.fetch_latest()
-                if await self.storage.exists(item.url):
+                for item in items:
-                    continue
+                    try:
                        if await self.storage.exists(item.url):
                            continue
-                enriched_item = await self.processor.analyze(item)
+                        enriched_item = await self.processor.analyze(item)
-                
+                        
-                if enriched_item.relevance_score < 5:
+                        if enriched_item.relevance_score < 5:
-                    enriched_item.content_text = ""
+                            enriched_item.content_text = ""
-                
+                        
-                await self.storage.store(enriched_item)
+                        await self.storage.store(enriched_item)
-                
+                        
-                # Proactive alerts are currently disabled per user request
+                        # Proactive alerts are currently disabled per user request
-                # if enriched_item.relevance_score >= 8 or bool(enriched_item.anomalies_detected):
+                        # if enriched_item.relevance_score >= 8 or bool(enriched_item.anomalies_detected):
-                #     await self.notifier.send_alert(enriched_item)
+                        #     await self.notifier.send_alert(enriched_item)
                    except Exception as e:
                        logger.error(f"Error processing item {item.url}: {e}")
            except Exception as e:
                logger.error(f"Error running crawler {crawler}: {e}")
--- a/src/processor/base.py
+++ b/src/processor/base.py
@ -6,3 +6,7 @@ class ILLMProvider(ABC):
    @abstractmethod
    async def analyze(self, news_item: NewsItemDTO) -> EnrichedNewsItemDTO:
        pass
    @abstractmethod
    def get_info(self) -> dict[str, str]:
        pass
--- a/src/processor/ollama_provider.py
+++ b/src/processor/ollama_provider.py
@ -7,6 +7,15 @@ from src.processor.base import ILLMProvider
 from src.processor.dto import EnrichedNewsItemDTO
 class OllamaProvider(ILLMProvider):
    def get_info(self) -> dict[str, str]:
        base_url = os.environ.get('OLLAMA_API_URL', 'http://localhost:11434')
        model = os.environ.get('OLLAMA_MODEL', 'gpt-oss:120b-cloud')
        return {
            "model": model,
            "base_url": base_url,
            "prompt_summary": "Russian summary, 2 sentences, R&D focus"
        }
    async def analyze(self, news_item: NewsItemDTO) -> EnrichedNewsItemDTO:
        base_url = os.environ.get('OLLAMA_API_URL', 'http://localhost:11434')
        url = base_url if base_url.endswith('/api/generate') else f"{base_url.rstrip('/')}/api/generate"
@ -14,6 +23,7 @@ class OllamaProvider(ILLMProvider):
            f"Analyze the following article.\nTitle: {news_item.title}\n"
            f"Content: {news_item.content_text}\n"
            "Return JSON with 'relevance_score' (0-10), 'summary_ru' (string), 'anomalies_detected' (list of strings), and 'category' (string).\n"
            "The 'summary_ru' MUST be in Russian and strictly NO MORE than 2 sentences.\n"
            "The 'category' must be exactly one of: 'Browsers', 'Edge AI', 'SmartTV', 'Samsung New Technologies', 'Middleware new trends', 'Competitors', 'Other'.\n"
            "For 'relevance_score', prioritize and give higher scores to articles related to R&D, Chromium, NPU, and Smart TV operating systems.\n"
            "Regarding 'anomalies_detected': only detect factual, conceptual, or industry-related anomalies (e.g., sudden technological shifts, unexpected competitor moves). "
--- a/src/storage/chroma_store.py
+++ b/src/storage/chroma_store.py
@ -68,6 +68,9 @@ class ChromaStore(IVectorStore):
            document = documents[0][idx] if documents and documents[0] else ""
            items.append(self._reconstruct_dto(metadata, document))
        # Sort items by relevance_score in descending order
        items.sort(key=lambda x: x.relevance_score, reverse=True)
        return items
    def _reconstruct_dto(self, metadata: Mapping[str, Any], document: str) -> EnrichedNewsItemDTO:
--- a/tests/bot/test_handlers.py
+++ b/tests/bot/test_handlers.py
@ -35,8 +35,18 @@ def allowed_chat_id():
    return "123456789"
@pytest.fixture
-def router(mock_storage, allowed_chat_id):
+def mock_processor():
-    return get_router(mock_storage, allowed_chat_id)
+    processor = MagicMock()
    processor.get_info.return_value = {
        "model": "test-model",
        "base_url": "http://test-url",
        "prompt_summary": "Test summary"
    }
    return processor
@pytest.fixture
 def router(mock_storage, mock_processor, allowed_chat_id):
    return get_router(mock_storage, mock_processor, allowed_chat_id)
 def get_handler(router, callback_name):
    for handler in router.message.handlers:
@ -79,6 +89,22 @@ async def test_command_help_handler(router, allowed_chat_id):
    assert "/start" in args[0]
    assert "/latest" in args[0]
@pytest.mark.asyncio
 async def test_command_params_handler(router, mock_processor, allowed_chat_id):
    handler = get_handler(router, "command_params_handler")
    message = AsyncMock()
    message.chat.id = int(allowed_chat_id)
    message.answer = AsyncMock()
    await handler(message)
    mock_processor.get_info.assert_called_once()
    message.answer.assert_called_once()
    args, kwargs = message.answer.call_args
    assert "LLM Processor Parameters" in args[0]
    assert "test-model" in args[0]
    assert "HTML" in kwargs.get("parse_mode", "")
@pytest.mark.asyncio
 async def test_command_latest_handler(router, mock_storage, allowed_chat_id):
    handler = get_handler(router, "command_latest_handler")
--- a/tests/crawlers/test_rss_crawler.py
+++ b/tests/crawlers/test_rss_crawler.py
@ -45,8 +45,8 @@ async def test_rss_crawler_fetch_latest():
        # Call the method
        items = await crawler.fetch_latest()
-        # Verify the mock was called with the correct URL
+        # Verify the mock was called with the correct URL and headers
-        mock_get.assert_called_once_with(url)
+        mock_get.assert_called_once_with(url, headers={"User-Agent": RSSCrawler.USER_AGENT})
        # Verify the parsing results
        assert len(items) == 2
@ -66,3 +66,17 @@ async def test_rss_crawler_fetch_latest():
        assert items[1].content_text == "Test Content 2"
        assert items[1].source == source
        assert items[1].timestamp == datetime(2002, 10, 3, 10, 0, tzinfo=timezone.utc)
@pytest.mark.asyncio
 async def test_rss_crawler_fetch_latest_error():
    url = "http://error.source.com/rss"
    source = "Error Source"
    crawler = RSSCrawler(url, source)
    with patch("aiohttp.ClientSession.get") as mock_get:
        # Simulate an exception during the request
        mock_get.side_effect = Exception("Connection error")
        items = await crawler.fetch_latest()
        assert items == []
--- a/tests/orchestrator/test_service.py
+++ b/tests/orchestrator/test_service.py
@ -100,3 +100,67 @@ async def test_run_iteration():
    # Should not alert proactively anymore as per updated requirements
    assert notifier_mock.send_alert.call_count == 0
@pytest.mark.asyncio
 async def test_run_iteration_crawler_failure():
    # Arrange
    crawler1 = AsyncMock()
    crawler2 = AsyncMock()
    processor = AsyncMock()
    storage = AsyncMock()
    notifier = AsyncMock()
    crawler1.fetch_latest.side_effect = Exception("Crawler 1 failed")
    crawler2.fetch_latest.return_value = []
    service = TrendScoutService(
        crawlers=[crawler1, crawler2],
        processor=processor,
        storage=storage,
        notifier=notifier
    )
    # Act
    await service.run_iteration()
    # Assert - crawler 1 failed, but it shouldn't stop crawler 2
    crawler1.fetch_latest.assert_called_once()
    crawler2.fetch_latest.assert_called_once()
@pytest.mark.asyncio
 async def test_run_iteration_item_failure():
    # Arrange
    crawler = AsyncMock()
    processor = AsyncMock()
    storage = AsyncMock()
    notifier = AsyncMock()
    item1 = NewsItemDTO(title="T1", url="U1", content_text="C1", source="S1", timestamp=datetime.now())
    item2 = NewsItemDTO(title="T2", url="U2", content_text="C2", source="S2", timestamp=datetime.now())
    crawler.fetch_latest.return_value = [item1, item2]
    storage.exists.return_value = False
    # processor.analyze fails for item1
    enriched_item2 = EnrichedNewsItemDTO(
        **item2.model_dump(),
        relevance_score=6,
        summary_ru="Summary",
        anomalies_detected=[]
    )
    processor.analyze.side_effect = [Exception("Analyze failed"), enriched_item2]
    service = TrendScoutService(
        crawlers=[crawler],
        processor=processor,
        storage=storage,
        notifier=notifier
    )
    # Act
    await service.run_iteration()
    # Assert - item 1 failed, but it shouldn't stop item 2
    assert processor.analyze.call_count == 2
    # Only item 2 should be stored
    assert storage.store.call_count == 1
--- a/tests/processor/test_ollama_provider.py
+++ b/tests/processor/test_ollama_provider.py
@ -106,3 +106,13 @@ async def test_ollama_provider_analyze_markdown_json(sample_news_item):
        assert result.anomalies_detected == []
        assert result.category == "Browsers"
 def test_ollama_provider_get_info():
    os.environ['OLLAMA_API_URL'] = 'http://test-url:11434'
    os.environ['OLLAMA_MODEL'] = 'test-model'
    provider = OllamaProvider()
    info = provider.get_info()
    assert info["model"] == "test-model"
    assert info["base_url"] == "http://test-url:11434"
    assert info["prompt_summary"] == "Russian summary, 2 sentences, R&D focus"
--- a/tests/storage/test_chroma_store.py
+++ b/tests/storage/test_chroma_store.py
@ -230,3 +230,32 @@ async def test_get_stats(chroma_store: ChromaStore):
    assert stats["total_count"] == 3
    assert stats["category_Tech"] == 2
    assert stats["category_Science"] == 1
@pytest.mark.asyncio
 async def test_search_sorting(chroma_store: ChromaStore):
    # Arrange
    items = [
        EnrichedNewsItemDTO(
            title=f"Title {i}",
            url=f"https://example.com/{i}",
            content_text=f"Content {i}",
            source="Source",
            timestamp=datetime.now(timezone.utc),
            relevance_score=i,
            summary_ru=f"Сводка {i}",
            anomalies_detected=[],
            category="Tech"
        ) for i in range(1, 6) # Scores 1 to 5
    ]
    for item in items:
        await chroma_store.store(item)
    # Act
    results = await chroma_store.search("Content", limit=10)
    # Assert
    assert len(results) == 5
    # Should be sorted 5, 4, 3, 2, 1
    scores = [r.relevance_score for r in results]
    assert scores == [5, 4, 3, 2, 1]