[perf] stabilization of previous release

2026-03-13 13:23:30 +03:00 · 2026-03-13 13:23:30 +03:00 · 4bf7cb4331
commit 4bf7cb4331
parent 9c8e4c7345
13 changed files with 243 additions and 32 deletions
--- a/src/bot/bot.py
+++ b/src/bot/bot.py
@ -2,12 +2,13 @@ from aiogram import Bot, Dispatcher
 from aiogram.client.default import DefaultBotProperties
 from src.bot.handlers import get_router
 from src.storage.base import IVectorStore
+from src.processor.base import ILLMProvider

-def setup_bot(token: str, storage: IVectorStore, allowed_chat_id: str) -> tuple[Bot, Dispatcher]:
+def setup_bot(token: str, storage: IVectorStore, processor: ILLMProvider, allowed_chat_id: str) -> tuple[Bot, Dispatcher]:
    """
    Setup the aiogram Bot and Dispatcher with handlers.
    """
    bot = Bot(token=token, default=DefaultBotProperties(parse_mode="HTML"))
    dp = Dispatcher()
-    dp.include_router(get_router(storage, allowed_chat_id))
+    dp.include_router(get_router(storage, processor, allowed_chat_id))
    return bot, dp
--- a/src/bot/handlers.py
+++ b/src/bot/handlers.py
@ -10,6 +10,7 @@ from aiogram.utils.keyboard import InlineKeyboardBuilder
 from aiogram.utils.formatting import as_list, as_marked_section, Bold, TextLink

 from src.processor.dto import EnrichedNewsItemDTO
+from src.processor.base import ILLMProvider
 from src.storage.base import IVectorStore

 class AccessMiddleware(BaseMiddleware):
@ -28,7 +29,7 @@ class AccessMiddleware(BaseMiddleware):
                return
        return await handler(event, data)

-def get_router(storage: IVectorStore, allowed_chat_id: str) -> Router:
+def get_router(storage: IVectorStore, processor: ILLMProvider, allowed_chat_id: str) -> Router:
    router = Router(name="main_router")
    router.message.middleware(AccessMiddleware(allowed_chat_id))
    
@ -52,9 +53,24 @@ def get_router(storage: IVectorStore, allowed_chat_id: str) -> Router:
            "/latest [category] - Show the latest enriched news trends\n"
            "/search query - Search for news\n"
            "/stats - Show database statistics\n"
+            "/params - Show LLM processor parameters\n"
        )
        await message.answer(help_text)

+    @router.message(Command("params"))
+    async def command_params_handler(message: Message) -> None:
+        """
+        This handler receives messages with `/params` command
+        """
+        info = processor.get_info()
+        
+        response = "🤖 <b>LLM Processor Parameters</b>\n\n"
+        response += f"<b>Model:</b> {html.escape(info.get('model', 'Unknown'))}\n"
+        response += f"<b>Base URL:</b> {html.escape(info.get('base_url', 'Unknown'))}\n"
+        response += f"<b>Prompt Summary:</b> {html.escape(info.get('prompt_summary', 'Unknown'))}"
+        
+        await message.answer(response, parse_mode="HTML")
+
    @router.message(Command("latest"))
    async def command_latest_handler(message: Message, command: CommandObject) -> None:
        """
@ -146,12 +162,13 @@ def get_router(storage: IVectorStore, allowed_chat_id: str) -> Router:
        This handler receives messages with `/stats` command
        """
        stats = await storage.get_stats()
-        total = stats.get("total", 0)
+        total = stats.get("total_count", 0)
        
        breakdown = []
-        for cat, count in stats.items():
-            if cat != "total":
-                breakdown.append(f"- {cat}: {count}")
+        for key, count in stats.items():
+            if key.startswith("category_"):
+                cat_name = key.replace("category_", "")
+                breakdown.append(f"- {cat_name}: {count}")
        
        response = f"📊 <b>Database Statistics</b>\n\nTotal items: {total}\n"
        if breakdown:
--- a/src/crawlers/rss_crawler.py
+++ b/src/crawlers/rss_crawler.py
@ -1,5 +1,6 @@
 import aiohttp
 import xml.etree.ElementTree as ET
+import logging
 from datetime import datetime
 from email.utils import parsedate_to_datetime
 from typing import List
@ -7,17 +8,26 @@ from typing import List
 from src.crawlers.base import ICrawler
 from src.crawlers.dto import NewsItemDTO

+logger = logging.getLogger(__name__)
+
 class RSSCrawler(ICrawler):
+    USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
+
    def __init__(self, url: str, source: str):
        self.url = url
        self.source = source

    async def fetch_latest(self) -> List[NewsItemDTO]:
-        async with aiohttp.ClientSession() as session:
-            async with session.get(self.url) as response:
-                response.raise_for_status()
-                xml_data = await response.text()
-                return self._parse_xml(xml_data)
+        headers = {"User-Agent": self.USER_AGENT}
+        try:
+            async with aiohttp.ClientSession() as session:
+                async with session.get(self.url, headers=headers) as response:
+                    response.raise_for_status()
+                    xml_data = await response.text()
+                    return self._parse_xml(xml_data)
+        except Exception as e:
+            logger.error(f"Error fetching RSS from {self.url}: {e}")
+            return []

    def _parse_xml(self, xml_data: str) -> List[NewsItemDTO]:
        root = ET.fromstring(xml_data)
--- a/src/main.py
+++ b/src/main.py
@ -9,6 +9,7 @@ from aiogram import Bot, Dispatcher

 from src.crawlers.base import ICrawler
 from src.crawlers.rss_crawler import RSSCrawler
+from src.crawlers.playwright_crawler import PlaywrightCrawler
 from src.processor.ollama_provider import OllamaProvider
 from src.storage.chroma_store import ChromaStore
 from src.notifications.telegram import TelegramNotifier
@ -49,7 +50,20 @@ async def main():

    # 1. Initialize Components that do not depend on Bot
    crawlers: List[ICrawler] = [
-        RSSCrawler("https://habr.com/ru/rss/hubs/artificial_intelligence/articles/?fl=ru", source="Habr AI")
+        RSSCrawler("https://habr.com/ru/rss/hubs/artificial_intelligence/articles/?fl=ru", source="Habr AI"),
+        RSSCrawler("https://www.nature.com/nature.rss", source="Nature"),
+        RSSCrawler("https://news.google.com/rss/search?q=WebOS+Chromium+Edge+AI+LGE+SmartTV&hl=en-US&gl=US&ceid=US:en", source="Google News R&D"),
+        RSSCrawler("https://news.samsung.com/global/rss", source="Samsung Newsroom"),
+        RSSCrawler("https://www.sony.com/en/SonyInfo/News/Service/rss.xml", source="Sony Newsroom"),
+        PlaywrightCrawler("https://cvpr.thecvf.com/Conferences/2025", source="CVPR 2025", selector=".conference-news-item"),
+        PlaywrightCrawler("https://www.ces.tech/news/press-releases.aspx", source="CES 2025", selector=".press-release-item"),
+        RSSCrawler("https://vc.ru/rss/tech", source="VC.ru Tech"),
+        RSSCrawler("https://rb.ru/rss/", source="RB.ru"),
+        RSSCrawler("https://www.science.org/rss/news_current.xml", source="Science News"),
+        RSSCrawler("https://ufn.ru/en/rss/", source="УФН"),
+        RSSCrawler("https://www.tadviser.ru/xml/tadviser.xml", source="TAdviser"),
+        RSSCrawler("https://habr.com/ru/rss/company/yandex/blog/", source="Yandex Tech"),
+        RSSCrawler("https://blog.google/technology/ai/rss/", source="Google AI Blog"),
    ]
    
    processor = OllamaProvider()
@ -62,7 +76,7 @@ async def main():
    storage = ChromaStore(client=chroma_client)

    # 2. Initialize Bot & Dispatcher
-    bot, dp = setup_bot(bot_token, storage, chat_id)
+    bot, dp = setup_bot(bot_token, storage, processor, chat_id)
    
    # 3. Initialize Notifier and Orchestrator
    notifier = TelegramNotifier(bot, chat_id)
--- a/src/orchestrator/service.py
+++ b/src/orchestrator/service.py
@ -1,9 +1,12 @@
+import logging
 from typing import List
 from src.crawlers.base import ICrawler
 from src.processor.base import ILLMProvider
 from src.storage.base import IVectorStore
 from src.notifications.base import INotificationService

+logger = logging.getLogger(__name__)
+
 class TrendScoutService:
    def __init__(
        self,
@ -19,18 +22,24 @@ class TrendScoutService:

    async def run_iteration(self) -> None:
        for crawler in self.crawlers:
-            items = await crawler.fetch_latest()
-            for item in items:
-                if await self.storage.exists(item.url):
-                    continue
+            try:
+                items = await crawler.fetch_latest()
+                for item in items:
+                    try:
+                        if await self.storage.exists(item.url):
+                            continue

-                enriched_item = await self.processor.analyze(item)
-                
-                if enriched_item.relevance_score < 5:
-                    enriched_item.content_text = ""
-                
-                await self.storage.store(enriched_item)
-                
-                # Proactive alerts are currently disabled per user request
-                # if enriched_item.relevance_score >= 8 or bool(enriched_item.anomalies_detected):
-                #     await self.notifier.send_alert(enriched_item)
+                        enriched_item = await self.processor.analyze(item)
+                        
+                        if enriched_item.relevance_score < 5:
+                            enriched_item.content_text = ""
+                        
+                        await self.storage.store(enriched_item)
+                        
+                        # Proactive alerts are currently disabled per user request
+                        # if enriched_item.relevance_score >= 8 or bool(enriched_item.anomalies_detected):
+                        #     await self.notifier.send_alert(enriched_item)
+                    except Exception as e:
+                        logger.error(f"Error processing item {item.url}: {e}")
+            except Exception as e:
+                logger.error(f"Error running crawler {crawler}: {e}")
--- a/src/processor/base.py
+++ b/src/processor/base.py
@ -6,3 +6,7 @@ class ILLMProvider(ABC):
    @abstractmethod
    async def analyze(self, news_item: NewsItemDTO) -> EnrichedNewsItemDTO:
        pass
+
+    @abstractmethod
+    def get_info(self) -> dict[str, str]:
+        pass
--- a/src/processor/ollama_provider.py
+++ b/src/processor/ollama_provider.py
@ -7,6 +7,15 @@ from src.processor.base import ILLMProvider
 from src.processor.dto import EnrichedNewsItemDTO

 class OllamaProvider(ILLMProvider):
+    def get_info(self) -> dict[str, str]:
+        base_url = os.environ.get('OLLAMA_API_URL', 'http://localhost:11434')
+        model = os.environ.get('OLLAMA_MODEL', 'gpt-oss:120b-cloud')
+        return {
+            "model": model,
+            "base_url": base_url,
+            "prompt_summary": "Russian summary, 2 sentences, R&D focus"
+        }
+
    async def analyze(self, news_item: NewsItemDTO) -> EnrichedNewsItemDTO:
        base_url = os.environ.get('OLLAMA_API_URL', 'http://localhost:11434')
        url = base_url if base_url.endswith('/api/generate') else f"{base_url.rstrip('/')}/api/generate"
@ -14,6 +23,7 @@ class OllamaProvider(ILLMProvider):
            f"Analyze the following article.\nTitle: {news_item.title}\n"
            f"Content: {news_item.content_text}\n"
            "Return JSON with 'relevance_score' (0-10), 'summary_ru' (string), 'anomalies_detected' (list of strings), and 'category' (string).\n"
+            "The 'summary_ru' MUST be in Russian and strictly NO MORE than 2 sentences.\n"
            "The 'category' must be exactly one of: 'Browsers', 'Edge AI', 'SmartTV', 'Samsung New Technologies', 'Middleware new trends', 'Competitors', 'Other'.\n"
            "For 'relevance_score', prioritize and give higher scores to articles related to R&D, Chromium, NPU, and Smart TV operating systems.\n"
            "Regarding 'anomalies_detected': only detect factual, conceptual, or industry-related anomalies (e.g., sudden technological shifts, unexpected competitor moves). "
--- a/src/storage/chroma_store.py
+++ b/src/storage/chroma_store.py
@ -68,6 +68,9 @@ class ChromaStore(IVectorStore):
            document = documents[0][idx] if documents and documents[0] else ""
            items.append(self._reconstruct_dto(metadata, document))
            
+        # Sort items by relevance_score in descending order
+        items.sort(key=lambda x: x.relevance_score, reverse=True)
+            
        return items

    def _reconstruct_dto(self, metadata: Mapping[str, Any], document: str) -> EnrichedNewsItemDTO:
--- a/tests/bot/test_handlers.py
+++ b/tests/bot/test_handlers.py
@ -35,8 +35,18 @@ def allowed_chat_id():
    return "123456789"

@pytest.fixture
-def router(mock_storage, allowed_chat_id):
-    return get_router(mock_storage, allowed_chat_id)
+def mock_processor():
+    processor = MagicMock()
+    processor.get_info.return_value = {
+        "model": "test-model",
+        "base_url": "http://test-url",
+        "prompt_summary": "Test summary"
+    }
+    return processor
+
+@pytest.fixture
+def router(mock_storage, mock_processor, allowed_chat_id):
+    return get_router(mock_storage, mock_processor, allowed_chat_id)

 def get_handler(router, callback_name):
    for handler in router.message.handlers:
@ -79,6 +89,22 @@ async def test_command_help_handler(router, allowed_chat_id):
    assert "/start" in args[0]
    assert "/latest" in args[0]

+@pytest.mark.asyncio
+async def test_command_params_handler(router, mock_processor, allowed_chat_id):
+    handler = get_handler(router, "command_params_handler")
+    message = AsyncMock()
+    message.chat.id = int(allowed_chat_id)
+    message.answer = AsyncMock()
+    
+    await handler(message)
+    
+    mock_processor.get_info.assert_called_once()
+    message.answer.assert_called_once()
+    args, kwargs = message.answer.call_args
+    assert "LLM Processor Parameters" in args[0]
+    assert "test-model" in args[0]
+    assert "HTML" in kwargs.get("parse_mode", "")
+
@pytest.mark.asyncio
 async def test_command_latest_handler(router, mock_storage, allowed_chat_id):
    handler = get_handler(router, "command_latest_handler")
--- a/tests/crawlers/test_rss_crawler.py
+++ b/tests/crawlers/test_rss_crawler.py
@ -45,8 +45,8 @@ async def test_rss_crawler_fetch_latest():
        # Call the method
        items = await crawler.fetch_latest()

-        # Verify the mock was called with the correct URL
-        mock_get.assert_called_once_with(url)
+        # Verify the mock was called with the correct URL and headers
+        mock_get.assert_called_once_with(url, headers={"User-Agent": RSSCrawler.USER_AGENT})

        # Verify the parsing results
        assert len(items) == 2
@ -66,3 +66,17 @@ async def test_rss_crawler_fetch_latest():
        assert items[1].content_text == "Test Content 2"
        assert items[1].source == source
        assert items[1].timestamp == datetime(2002, 10, 3, 10, 0, tzinfo=timezone.utc)
+
+@pytest.mark.asyncio
+async def test_rss_crawler_fetch_latest_error():
+    url = "http://error.source.com/rss"
+    source = "Error Source"
+    crawler = RSSCrawler(url, source)
+
+    with patch("aiohttp.ClientSession.get") as mock_get:
+        # Simulate an exception during the request
+        mock_get.side_effect = Exception("Connection error")
+
+        items = await crawler.fetch_latest()
+
+        assert items == []
--- a/tests/orchestrator/test_service.py
+++ b/tests/orchestrator/test_service.py
@ -100,3 +100,67 @@ async def test_run_iteration():
    
    # Should not alert proactively anymore as per updated requirements
    assert notifier_mock.send_alert.call_count == 0
+
+@pytest.mark.asyncio
+async def test_run_iteration_crawler_failure():
+    # Arrange
+    crawler1 = AsyncMock()
+    crawler2 = AsyncMock()
+    processor = AsyncMock()
+    storage = AsyncMock()
+    notifier = AsyncMock()
+    
+    crawler1.fetch_latest.side_effect = Exception("Crawler 1 failed")
+    crawler2.fetch_latest.return_value = []
+    
+    service = TrendScoutService(
+        crawlers=[crawler1, crawler2],
+        processor=processor,
+        storage=storage,
+        notifier=notifier
+    )
+    
+    # Act
+    await service.run_iteration()
+    
+    # Assert - crawler 1 failed, but it shouldn't stop crawler 2
+    crawler1.fetch_latest.assert_called_once()
+    crawler2.fetch_latest.assert_called_once()
+
+@pytest.mark.asyncio
+async def test_run_iteration_item_failure():
+    # Arrange
+    crawler = AsyncMock()
+    processor = AsyncMock()
+    storage = AsyncMock()
+    notifier = AsyncMock()
+    
+    item1 = NewsItemDTO(title="T1", url="U1", content_text="C1", source="S1", timestamp=datetime.now())
+    item2 = NewsItemDTO(title="T2", url="U2", content_text="C2", source="S2", timestamp=datetime.now())
+    
+    crawler.fetch_latest.return_value = [item1, item2]
+    storage.exists.return_value = False
+    
+    # processor.analyze fails for item1
+    enriched_item2 = EnrichedNewsItemDTO(
+        **item2.model_dump(),
+        relevance_score=6,
+        summary_ru="Summary",
+        anomalies_detected=[]
+    )
+    processor.analyze.side_effect = [Exception("Analyze failed"), enriched_item2]
+    
+    service = TrendScoutService(
+        crawlers=[crawler],
+        processor=processor,
+        storage=storage,
+        notifier=notifier
+    )
+    
+    # Act
+    await service.run_iteration()
+    
+    # Assert - item 1 failed, but it shouldn't stop item 2
+    assert processor.analyze.call_count == 2
+    # Only item 2 should be stored
+    assert storage.store.call_count == 1
--- a/tests/processor/test_ollama_provider.py
+++ b/tests/processor/test_ollama_provider.py
@ -106,3 +106,13 @@ async def test_ollama_provider_analyze_markdown_json(sample_news_item):
        assert result.anomalies_detected == []
        assert result.category == "Browsers"

+def test_ollama_provider_get_info():
+    os.environ['OLLAMA_API_URL'] = 'http://test-url:11434'
+    os.environ['OLLAMA_MODEL'] = 'test-model'
+    provider = OllamaProvider()
+    info = provider.get_info()
+    
+    assert info["model"] == "test-model"
+    assert info["base_url"] == "http://test-url:11434"
+    assert info["prompt_summary"] == "Russian summary, 2 sentences, R&D focus"
+
--- a/tests/storage/test_chroma_store.py
+++ b/tests/storage/test_chroma_store.py
@ -230,3 +230,32 @@ async def test_get_stats(chroma_store: ChromaStore):
    assert stats["total_count"] == 3
    assert stats["category_Tech"] == 2
    assert stats["category_Science"] == 1
+
+@pytest.mark.asyncio
+async def test_search_sorting(chroma_store: ChromaStore):
+    # Arrange
+    items = [
+        EnrichedNewsItemDTO(
+            title=f"Title {i}",
+            url=f"https://example.com/{i}",
+            content_text=f"Content {i}",
+            source="Source",
+            timestamp=datetime.now(timezone.utc),
+            relevance_score=i,
+            summary_ru=f"Сводка {i}",
+            anomalies_detected=[],
+            category="Tech"
+        ) for i in range(1, 6) # Scores 1 to 5
+    ]
+    
+    for item in items:
+        await chroma_store.store(item)
+    
+    # Act
+    results = await chroma_store.search("Content", limit=10)
+    
+    # Assert
+    assert len(results) == 5
+    # Should be sorted 5, 4, 3, 2, 1
+    scores = [r.relevance_score for r in results]
+    assert scores == [5, 4, 3, 2, 1]