diff --git a/src/bot/bot.py b/src/bot/bot.py index 92fc9e4..eeae995 100644 --- a/src/bot/bot.py +++ b/src/bot/bot.py @@ -2,12 +2,13 @@ from aiogram import Bot, Dispatcher from aiogram.client.default import DefaultBotProperties from src.bot.handlers import get_router from src.storage.base import IVectorStore +from src.processor.base import ILLMProvider -def setup_bot(token: str, storage: IVectorStore, allowed_chat_id: str) -> tuple[Bot, Dispatcher]: +def setup_bot(token: str, storage: IVectorStore, processor: ILLMProvider, allowed_chat_id: str) -> tuple[Bot, Dispatcher]: """ Setup the aiogram Bot and Dispatcher with handlers. """ bot = Bot(token=token, default=DefaultBotProperties(parse_mode="HTML")) dp = Dispatcher() - dp.include_router(get_router(storage, allowed_chat_id)) + dp.include_router(get_router(storage, processor, allowed_chat_id)) return bot, dp diff --git a/src/bot/handlers.py b/src/bot/handlers.py index 2c256c1..23038de 100644 --- a/src/bot/handlers.py +++ b/src/bot/handlers.py @@ -10,6 +10,7 @@ from aiogram.utils.keyboard import InlineKeyboardBuilder from aiogram.utils.formatting import as_list, as_marked_section, Bold, TextLink from src.processor.dto import EnrichedNewsItemDTO +from src.processor.base import ILLMProvider from src.storage.base import IVectorStore class AccessMiddleware(BaseMiddleware): @@ -28,7 +29,7 @@ class AccessMiddleware(BaseMiddleware): return return await handler(event, data) -def get_router(storage: IVectorStore, allowed_chat_id: str) -> Router: +def get_router(storage: IVectorStore, processor: ILLMProvider, allowed_chat_id: str) -> Router: router = Router(name="main_router") router.message.middleware(AccessMiddleware(allowed_chat_id)) @@ -52,9 +53,24 @@ def get_router(storage: IVectorStore, allowed_chat_id: str) -> Router: "/latest [category] - Show the latest enriched news trends\n" "/search query - Search for news\n" "/stats - Show database statistics\n" + "/params - Show LLM processor parameters\n" ) await message.answer(help_text) + @router.message(Command("params")) + async def command_params_handler(message: Message) -> None: + """ + This handler receives messages with `/params` command + """ + info = processor.get_info() + + response = "🤖 LLM Processor Parameters\n\n" + response += f"Model: {html.escape(info.get('model', 'Unknown'))}\n" + response += f"Base URL: {html.escape(info.get('base_url', 'Unknown'))}\n" + response += f"Prompt Summary: {html.escape(info.get('prompt_summary', 'Unknown'))}" + + await message.answer(response, parse_mode="HTML") + @router.message(Command("latest")) async def command_latest_handler(message: Message, command: CommandObject) -> None: """ @@ -146,12 +162,13 @@ def get_router(storage: IVectorStore, allowed_chat_id: str) -> Router: This handler receives messages with `/stats` command """ stats = await storage.get_stats() - total = stats.get("total", 0) + total = stats.get("total_count", 0) breakdown = [] - for cat, count in stats.items(): - if cat != "total": - breakdown.append(f"- {cat}: {count}") + for key, count in stats.items(): + if key.startswith("category_"): + cat_name = key.replace("category_", "") + breakdown.append(f"- {cat_name}: {count}") response = f"📊 Database Statistics\n\nTotal items: {total}\n" if breakdown: diff --git a/src/crawlers/rss_crawler.py b/src/crawlers/rss_crawler.py index f3e4f6a..4a984e1 100644 --- a/src/crawlers/rss_crawler.py +++ b/src/crawlers/rss_crawler.py @@ -1,5 +1,6 @@ import aiohttp import xml.etree.ElementTree as ET +import logging from datetime import datetime from email.utils import parsedate_to_datetime from typing import List @@ -7,17 +8,26 @@ from typing import List from src.crawlers.base import ICrawler from src.crawlers.dto import NewsItemDTO +logger = logging.getLogger(__name__) + class RSSCrawler(ICrawler): + USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" + def __init__(self, url: str, source: str): self.url = url self.source = source async def fetch_latest(self) -> List[NewsItemDTO]: - async with aiohttp.ClientSession() as session: - async with session.get(self.url) as response: - response.raise_for_status() - xml_data = await response.text() - return self._parse_xml(xml_data) + headers = {"User-Agent": self.USER_AGENT} + try: + async with aiohttp.ClientSession() as session: + async with session.get(self.url, headers=headers) as response: + response.raise_for_status() + xml_data = await response.text() + return self._parse_xml(xml_data) + except Exception as e: + logger.error(f"Error fetching RSS from {self.url}: {e}") + return [] def _parse_xml(self, xml_data: str) -> List[NewsItemDTO]: root = ET.fromstring(xml_data) diff --git a/src/main.py b/src/main.py index ad3545b..7891fec 100644 --- a/src/main.py +++ b/src/main.py @@ -9,6 +9,7 @@ from aiogram import Bot, Dispatcher from src.crawlers.base import ICrawler from src.crawlers.rss_crawler import RSSCrawler +from src.crawlers.playwright_crawler import PlaywrightCrawler from src.processor.ollama_provider import OllamaProvider from src.storage.chroma_store import ChromaStore from src.notifications.telegram import TelegramNotifier @@ -49,7 +50,20 @@ async def main(): # 1. Initialize Components that do not depend on Bot crawlers: List[ICrawler] = [ - RSSCrawler("https://habr.com/ru/rss/hubs/artificial_intelligence/articles/?fl=ru", source="Habr AI") + RSSCrawler("https://habr.com/ru/rss/hubs/artificial_intelligence/articles/?fl=ru", source="Habr AI"), + RSSCrawler("https://www.nature.com/nature.rss", source="Nature"), + RSSCrawler("https://news.google.com/rss/search?q=WebOS+Chromium+Edge+AI+LGE+SmartTV&hl=en-US&gl=US&ceid=US:en", source="Google News R&D"), + RSSCrawler("https://news.samsung.com/global/rss", source="Samsung Newsroom"), + RSSCrawler("https://www.sony.com/en/SonyInfo/News/Service/rss.xml", source="Sony Newsroom"), + PlaywrightCrawler("https://cvpr.thecvf.com/Conferences/2025", source="CVPR 2025", selector=".conference-news-item"), + PlaywrightCrawler("https://www.ces.tech/news/press-releases.aspx", source="CES 2025", selector=".press-release-item"), + RSSCrawler("https://vc.ru/rss/tech", source="VC.ru Tech"), + RSSCrawler("https://rb.ru/rss/", source="RB.ru"), + RSSCrawler("https://www.science.org/rss/news_current.xml", source="Science News"), + RSSCrawler("https://ufn.ru/en/rss/", source="УФН"), + RSSCrawler("https://www.tadviser.ru/xml/tadviser.xml", source="TAdviser"), + RSSCrawler("https://habr.com/ru/rss/company/yandex/blog/", source="Yandex Tech"), + RSSCrawler("https://blog.google/technology/ai/rss/", source="Google AI Blog"), ] processor = OllamaProvider() @@ -62,7 +76,7 @@ async def main(): storage = ChromaStore(client=chroma_client) # 2. Initialize Bot & Dispatcher - bot, dp = setup_bot(bot_token, storage, chat_id) + bot, dp = setup_bot(bot_token, storage, processor, chat_id) # 3. Initialize Notifier and Orchestrator notifier = TelegramNotifier(bot, chat_id) diff --git a/src/orchestrator/service.py b/src/orchestrator/service.py index a445ad2..57bc83d 100644 --- a/src/orchestrator/service.py +++ b/src/orchestrator/service.py @@ -1,9 +1,12 @@ +import logging from typing import List from src.crawlers.base import ICrawler from src.processor.base import ILLMProvider from src.storage.base import IVectorStore from src.notifications.base import INotificationService +logger = logging.getLogger(__name__) + class TrendScoutService: def __init__( self, @@ -19,18 +22,24 @@ class TrendScoutService: async def run_iteration(self) -> None: for crawler in self.crawlers: - items = await crawler.fetch_latest() - for item in items: - if await self.storage.exists(item.url): - continue + try: + items = await crawler.fetch_latest() + for item in items: + try: + if await self.storage.exists(item.url): + continue - enriched_item = await self.processor.analyze(item) - - if enriched_item.relevance_score < 5: - enriched_item.content_text = "" - - await self.storage.store(enriched_item) - - # Proactive alerts are currently disabled per user request - # if enriched_item.relevance_score >= 8 or bool(enriched_item.anomalies_detected): - # await self.notifier.send_alert(enriched_item) + enriched_item = await self.processor.analyze(item) + + if enriched_item.relevance_score < 5: + enriched_item.content_text = "" + + await self.storage.store(enriched_item) + + # Proactive alerts are currently disabled per user request + # if enriched_item.relevance_score >= 8 or bool(enriched_item.anomalies_detected): + # await self.notifier.send_alert(enriched_item) + except Exception as e: + logger.error(f"Error processing item {item.url}: {e}") + except Exception as e: + logger.error(f"Error running crawler {crawler}: {e}") diff --git a/src/processor/base.py b/src/processor/base.py index 728f589..368e77d 100644 --- a/src/processor/base.py +++ b/src/processor/base.py @@ -6,3 +6,7 @@ class ILLMProvider(ABC): @abstractmethod async def analyze(self, news_item: NewsItemDTO) -> EnrichedNewsItemDTO: pass + + @abstractmethod + def get_info(self) -> dict[str, str]: + pass diff --git a/src/processor/ollama_provider.py b/src/processor/ollama_provider.py index a056b9b..8f956dd 100644 --- a/src/processor/ollama_provider.py +++ b/src/processor/ollama_provider.py @@ -7,6 +7,15 @@ from src.processor.base import ILLMProvider from src.processor.dto import EnrichedNewsItemDTO class OllamaProvider(ILLMProvider): + def get_info(self) -> dict[str, str]: + base_url = os.environ.get('OLLAMA_API_URL', 'http://localhost:11434') + model = os.environ.get('OLLAMA_MODEL', 'gpt-oss:120b-cloud') + return { + "model": model, + "base_url": base_url, + "prompt_summary": "Russian summary, 2 sentences, R&D focus" + } + async def analyze(self, news_item: NewsItemDTO) -> EnrichedNewsItemDTO: base_url = os.environ.get('OLLAMA_API_URL', 'http://localhost:11434') url = base_url if base_url.endswith('/api/generate') else f"{base_url.rstrip('/')}/api/generate" @@ -14,6 +23,7 @@ class OllamaProvider(ILLMProvider): f"Analyze the following article.\nTitle: {news_item.title}\n" f"Content: {news_item.content_text}\n" "Return JSON with 'relevance_score' (0-10), 'summary_ru' (string), 'anomalies_detected' (list of strings), and 'category' (string).\n" + "The 'summary_ru' MUST be in Russian and strictly NO MORE than 2 sentences.\n" "The 'category' must be exactly one of: 'Browsers', 'Edge AI', 'SmartTV', 'Samsung New Technologies', 'Middleware new trends', 'Competitors', 'Other'.\n" "For 'relevance_score', prioritize and give higher scores to articles related to R&D, Chromium, NPU, and Smart TV operating systems.\n" "Regarding 'anomalies_detected': only detect factual, conceptual, or industry-related anomalies (e.g., sudden technological shifts, unexpected competitor moves). " diff --git a/src/storage/chroma_store.py b/src/storage/chroma_store.py index 05a782e..1c68812 100644 --- a/src/storage/chroma_store.py +++ b/src/storage/chroma_store.py @@ -68,6 +68,9 @@ class ChromaStore(IVectorStore): document = documents[0][idx] if documents and documents[0] else "" items.append(self._reconstruct_dto(metadata, document)) + # Sort items by relevance_score in descending order + items.sort(key=lambda x: x.relevance_score, reverse=True) + return items def _reconstruct_dto(self, metadata: Mapping[str, Any], document: str) -> EnrichedNewsItemDTO: diff --git a/tests/bot/test_handlers.py b/tests/bot/test_handlers.py index b33ac75..39f260b 100644 --- a/tests/bot/test_handlers.py +++ b/tests/bot/test_handlers.py @@ -35,8 +35,18 @@ def allowed_chat_id(): return "123456789" @pytest.fixture -def router(mock_storage, allowed_chat_id): - return get_router(mock_storage, allowed_chat_id) +def mock_processor(): + processor = MagicMock() + processor.get_info.return_value = { + "model": "test-model", + "base_url": "http://test-url", + "prompt_summary": "Test summary" + } + return processor + +@pytest.fixture +def router(mock_storage, mock_processor, allowed_chat_id): + return get_router(mock_storage, mock_processor, allowed_chat_id) def get_handler(router, callback_name): for handler in router.message.handlers: @@ -79,6 +89,22 @@ async def test_command_help_handler(router, allowed_chat_id): assert "/start" in args[0] assert "/latest" in args[0] +@pytest.mark.asyncio +async def test_command_params_handler(router, mock_processor, allowed_chat_id): + handler = get_handler(router, "command_params_handler") + message = AsyncMock() + message.chat.id = int(allowed_chat_id) + message.answer = AsyncMock() + + await handler(message) + + mock_processor.get_info.assert_called_once() + message.answer.assert_called_once() + args, kwargs = message.answer.call_args + assert "LLM Processor Parameters" in args[0] + assert "test-model" in args[0] + assert "HTML" in kwargs.get("parse_mode", "") + @pytest.mark.asyncio async def test_command_latest_handler(router, mock_storage, allowed_chat_id): handler = get_handler(router, "command_latest_handler") diff --git a/tests/crawlers/test_rss_crawler.py b/tests/crawlers/test_rss_crawler.py index 8bba030..42c8b9e 100644 --- a/tests/crawlers/test_rss_crawler.py +++ b/tests/crawlers/test_rss_crawler.py @@ -45,8 +45,8 @@ async def test_rss_crawler_fetch_latest(): # Call the method items = await crawler.fetch_latest() - # Verify the mock was called with the correct URL - mock_get.assert_called_once_with(url) + # Verify the mock was called with the correct URL and headers + mock_get.assert_called_once_with(url, headers={"User-Agent": RSSCrawler.USER_AGENT}) # Verify the parsing results assert len(items) == 2 @@ -66,3 +66,17 @@ async def test_rss_crawler_fetch_latest(): assert items[1].content_text == "Test Content 2" assert items[1].source == source assert items[1].timestamp == datetime(2002, 10, 3, 10, 0, tzinfo=timezone.utc) + +@pytest.mark.asyncio +async def test_rss_crawler_fetch_latest_error(): + url = "http://error.source.com/rss" + source = "Error Source" + crawler = RSSCrawler(url, source) + + with patch("aiohttp.ClientSession.get") as mock_get: + # Simulate an exception during the request + mock_get.side_effect = Exception("Connection error") + + items = await crawler.fetch_latest() + + assert items == [] diff --git a/tests/orchestrator/test_service.py b/tests/orchestrator/test_service.py index 411d9f0..0969941 100644 --- a/tests/orchestrator/test_service.py +++ b/tests/orchestrator/test_service.py @@ -100,3 +100,67 @@ async def test_run_iteration(): # Should not alert proactively anymore as per updated requirements assert notifier_mock.send_alert.call_count == 0 + +@pytest.mark.asyncio +async def test_run_iteration_crawler_failure(): + # Arrange + crawler1 = AsyncMock() + crawler2 = AsyncMock() + processor = AsyncMock() + storage = AsyncMock() + notifier = AsyncMock() + + crawler1.fetch_latest.side_effect = Exception("Crawler 1 failed") + crawler2.fetch_latest.return_value = [] + + service = TrendScoutService( + crawlers=[crawler1, crawler2], + processor=processor, + storage=storage, + notifier=notifier + ) + + # Act + await service.run_iteration() + + # Assert - crawler 1 failed, but it shouldn't stop crawler 2 + crawler1.fetch_latest.assert_called_once() + crawler2.fetch_latest.assert_called_once() + +@pytest.mark.asyncio +async def test_run_iteration_item_failure(): + # Arrange + crawler = AsyncMock() + processor = AsyncMock() + storage = AsyncMock() + notifier = AsyncMock() + + item1 = NewsItemDTO(title="T1", url="U1", content_text="C1", source="S1", timestamp=datetime.now()) + item2 = NewsItemDTO(title="T2", url="U2", content_text="C2", source="S2", timestamp=datetime.now()) + + crawler.fetch_latest.return_value = [item1, item2] + storage.exists.return_value = False + + # processor.analyze fails for item1 + enriched_item2 = EnrichedNewsItemDTO( + **item2.model_dump(), + relevance_score=6, + summary_ru="Summary", + anomalies_detected=[] + ) + processor.analyze.side_effect = [Exception("Analyze failed"), enriched_item2] + + service = TrendScoutService( + crawlers=[crawler], + processor=processor, + storage=storage, + notifier=notifier + ) + + # Act + await service.run_iteration() + + # Assert - item 1 failed, but it shouldn't stop item 2 + assert processor.analyze.call_count == 2 + # Only item 2 should be stored + assert storage.store.call_count == 1 diff --git a/tests/processor/test_ollama_provider.py b/tests/processor/test_ollama_provider.py index 9190175..d75dbe8 100644 --- a/tests/processor/test_ollama_provider.py +++ b/tests/processor/test_ollama_provider.py @@ -106,3 +106,13 @@ async def test_ollama_provider_analyze_markdown_json(sample_news_item): assert result.anomalies_detected == [] assert result.category == "Browsers" +def test_ollama_provider_get_info(): + os.environ['OLLAMA_API_URL'] = 'http://test-url:11434' + os.environ['OLLAMA_MODEL'] = 'test-model' + provider = OllamaProvider() + info = provider.get_info() + + assert info["model"] == "test-model" + assert info["base_url"] == "http://test-url:11434" + assert info["prompt_summary"] == "Russian summary, 2 sentences, R&D focus" + diff --git a/tests/storage/test_chroma_store.py b/tests/storage/test_chroma_store.py index bf3a995..47feef6 100644 --- a/tests/storage/test_chroma_store.py +++ b/tests/storage/test_chroma_store.py @@ -230,3 +230,32 @@ async def test_get_stats(chroma_store: ChromaStore): assert stats["total_count"] == 3 assert stats["category_Tech"] == 2 assert stats["category_Science"] == 1 + +@pytest.mark.asyncio +async def test_search_sorting(chroma_store: ChromaStore): + # Arrange + items = [ + EnrichedNewsItemDTO( + title=f"Title {i}", + url=f"https://example.com/{i}", + content_text=f"Content {i}", + source="Source", + timestamp=datetime.now(timezone.utc), + relevance_score=i, + summary_ru=f"Сводка {i}", + anomalies_detected=[], + category="Tech" + ) for i in range(1, 6) # Scores 1 to 5 + ] + + for item in items: + await chroma_store.store(item) + + # Act + results = await chroma_store.search("Content", limit=10) + + # Assert + assert len(results) == 5 + # Should be sorted 5, 4, 3, 2, 1 + scores = [r.relevance_score for r in results] + assert scores == [5, 4, 3, 2, 1]