diff --git a/src/bot/bot.py b/src/bot/bot.py
index 92fc9e4..eeae995 100644
--- a/src/bot/bot.py
+++ b/src/bot/bot.py
@@ -2,12 +2,13 @@ from aiogram import Bot, Dispatcher
from aiogram.client.default import DefaultBotProperties
from src.bot.handlers import get_router
from src.storage.base import IVectorStore
+from src.processor.base import ILLMProvider
-def setup_bot(token: str, storage: IVectorStore, allowed_chat_id: str) -> tuple[Bot, Dispatcher]:
+def setup_bot(token: str, storage: IVectorStore, processor: ILLMProvider, allowed_chat_id: str) -> tuple[Bot, Dispatcher]:
"""
Setup the aiogram Bot and Dispatcher with handlers.
"""
bot = Bot(token=token, default=DefaultBotProperties(parse_mode="HTML"))
dp = Dispatcher()
- dp.include_router(get_router(storage, allowed_chat_id))
+ dp.include_router(get_router(storage, processor, allowed_chat_id))
return bot, dp
diff --git a/src/bot/handlers.py b/src/bot/handlers.py
index 2c256c1..23038de 100644
--- a/src/bot/handlers.py
+++ b/src/bot/handlers.py
@@ -10,6 +10,7 @@ from aiogram.utils.keyboard import InlineKeyboardBuilder
from aiogram.utils.formatting import as_list, as_marked_section, Bold, TextLink
from src.processor.dto import EnrichedNewsItemDTO
+from src.processor.base import ILLMProvider
from src.storage.base import IVectorStore
class AccessMiddleware(BaseMiddleware):
@@ -28,7 +29,7 @@ class AccessMiddleware(BaseMiddleware):
return
return await handler(event, data)
-def get_router(storage: IVectorStore, allowed_chat_id: str) -> Router:
+def get_router(storage: IVectorStore, processor: ILLMProvider, allowed_chat_id: str) -> Router:
router = Router(name="main_router")
router.message.middleware(AccessMiddleware(allowed_chat_id))
@@ -52,9 +53,24 @@ def get_router(storage: IVectorStore, allowed_chat_id: str) -> Router:
"/latest [category] - Show the latest enriched news trends\n"
"/search query - Search for news\n"
"/stats - Show database statistics\n"
+ "/params - Show LLM processor parameters\n"
)
await message.answer(help_text)
+ @router.message(Command("params"))
+ async def command_params_handler(message: Message) -> None:
+ """
+ This handler receives messages with `/params` command
+ """
+ info = processor.get_info()
+
+ response = "🤖 LLM Processor Parameters\n\n"
+ response += f"Model: {html.escape(info.get('model', 'Unknown'))}\n"
+ response += f"Base URL: {html.escape(info.get('base_url', 'Unknown'))}\n"
+ response += f"Prompt Summary: {html.escape(info.get('prompt_summary', 'Unknown'))}"
+
+ await message.answer(response, parse_mode="HTML")
+
@router.message(Command("latest"))
async def command_latest_handler(message: Message, command: CommandObject) -> None:
"""
@@ -146,12 +162,13 @@ def get_router(storage: IVectorStore, allowed_chat_id: str) -> Router:
This handler receives messages with `/stats` command
"""
stats = await storage.get_stats()
- total = stats.get("total", 0)
+ total = stats.get("total_count", 0)
breakdown = []
- for cat, count in stats.items():
- if cat != "total":
- breakdown.append(f"- {cat}: {count}")
+ for key, count in stats.items():
+ if key.startswith("category_"):
+ cat_name = key.replace("category_", "")
+ breakdown.append(f"- {cat_name}: {count}")
response = f"📊 Database Statistics\n\nTotal items: {total}\n"
if breakdown:
diff --git a/src/crawlers/rss_crawler.py b/src/crawlers/rss_crawler.py
index f3e4f6a..4a984e1 100644
--- a/src/crawlers/rss_crawler.py
+++ b/src/crawlers/rss_crawler.py
@@ -1,5 +1,6 @@
import aiohttp
import xml.etree.ElementTree as ET
+import logging
from datetime import datetime
from email.utils import parsedate_to_datetime
from typing import List
@@ -7,17 +8,26 @@ from typing import List
from src.crawlers.base import ICrawler
from src.crawlers.dto import NewsItemDTO
+logger = logging.getLogger(__name__)
+
class RSSCrawler(ICrawler):
+ USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
+
def __init__(self, url: str, source: str):
self.url = url
self.source = source
async def fetch_latest(self) -> List[NewsItemDTO]:
- async with aiohttp.ClientSession() as session:
- async with session.get(self.url) as response:
- response.raise_for_status()
- xml_data = await response.text()
- return self._parse_xml(xml_data)
+ headers = {"User-Agent": self.USER_AGENT}
+ try:
+ async with aiohttp.ClientSession() as session:
+ async with session.get(self.url, headers=headers) as response:
+ response.raise_for_status()
+ xml_data = await response.text()
+ return self._parse_xml(xml_data)
+ except Exception as e:
+            logger.error("Error fetching RSS from %s: %s", self.url, e)
+ return []
def _parse_xml(self, xml_data: str) -> List[NewsItemDTO]:
root = ET.fromstring(xml_data)
diff --git a/src/main.py b/src/main.py
index ad3545b..7891fec 100644
--- a/src/main.py
+++ b/src/main.py
@@ -9,6 +9,7 @@ from aiogram import Bot, Dispatcher
from src.crawlers.base import ICrawler
from src.crawlers.rss_crawler import RSSCrawler
+from src.crawlers.playwright_crawler import PlaywrightCrawler
from src.processor.ollama_provider import OllamaProvider
from src.storage.chroma_store import ChromaStore
from src.notifications.telegram import TelegramNotifier
@@ -49,7 +50,20 @@ async def main():
# 1. Initialize Components that do not depend on Bot
crawlers: List[ICrawler] = [
- RSSCrawler("https://habr.com/ru/rss/hubs/artificial_intelligence/articles/?fl=ru", source="Habr AI")
+ RSSCrawler("https://habr.com/ru/rss/hubs/artificial_intelligence/articles/?fl=ru", source="Habr AI"),
+ RSSCrawler("https://www.nature.com/nature.rss", source="Nature"),
+ RSSCrawler("https://news.google.com/rss/search?q=WebOS+Chromium+Edge+AI+LGE+SmartTV&hl=en-US&gl=US&ceid=US:en", source="Google News R&D"),
+ RSSCrawler("https://news.samsung.com/global/rss", source="Samsung Newsroom"),
+ RSSCrawler("https://www.sony.com/en/SonyInfo/News/Service/rss.xml", source="Sony Newsroom"),
+ PlaywrightCrawler("https://cvpr.thecvf.com/Conferences/2025", source="CVPR 2025", selector=".conference-news-item"),
+ PlaywrightCrawler("https://www.ces.tech/news/press-releases.aspx", source="CES 2025", selector=".press-release-item"),
+ RSSCrawler("https://vc.ru/rss/tech", source="VC.ru Tech"),
+ RSSCrawler("https://rb.ru/rss/", source="RB.ru"),
+ RSSCrawler("https://www.science.org/rss/news_current.xml", source="Science News"),
+ RSSCrawler("https://ufn.ru/en/rss/", source="УФН"),
+ RSSCrawler("https://www.tadviser.ru/xml/tadviser.xml", source="TAdviser"),
+ RSSCrawler("https://habr.com/ru/rss/company/yandex/blog/", source="Yandex Tech"),
+ RSSCrawler("https://blog.google/technology/ai/rss/", source="Google AI Blog"),
]
processor = OllamaProvider()
@@ -62,7 +76,7 @@ async def main():
storage = ChromaStore(client=chroma_client)
# 2. Initialize Bot & Dispatcher
- bot, dp = setup_bot(bot_token, storage, chat_id)
+ bot, dp = setup_bot(bot_token, storage, processor, chat_id)
# 3. Initialize Notifier and Orchestrator
notifier = TelegramNotifier(bot, chat_id)
diff --git a/src/orchestrator/service.py b/src/orchestrator/service.py
index a445ad2..57bc83d 100644
--- a/src/orchestrator/service.py
+++ b/src/orchestrator/service.py
@@ -1,9 +1,12 @@
+import logging
from typing import List
from src.crawlers.base import ICrawler
from src.processor.base import ILLMProvider
from src.storage.base import IVectorStore
from src.notifications.base import INotificationService
+logger = logging.getLogger(__name__)
+
class TrendScoutService:
def __init__(
self,
@@ -19,18 +22,24 @@ class TrendScoutService:
async def run_iteration(self) -> None:
for crawler in self.crawlers:
- items = await crawler.fetch_latest()
- for item in items:
- if await self.storage.exists(item.url):
- continue
+ try:
+ items = await crawler.fetch_latest()
+ for item in items:
+ try:
+ if await self.storage.exists(item.url):
+ continue
- enriched_item = await self.processor.analyze(item)
-
- if enriched_item.relevance_score < 5:
- enriched_item.content_text = ""
-
- await self.storage.store(enriched_item)
-
- # Proactive alerts are currently disabled per user request
- # if enriched_item.relevance_score >= 8 or bool(enriched_item.anomalies_detected):
- # await self.notifier.send_alert(enriched_item)
+ enriched_item = await self.processor.analyze(item)
+
+ if enriched_item.relevance_score < 5:
+ enriched_item.content_text = ""
+
+ await self.storage.store(enriched_item)
+
+ # Proactive alerts are currently disabled per user request
+ # if enriched_item.relevance_score >= 8 or bool(enriched_item.anomalies_detected):
+ # await self.notifier.send_alert(enriched_item)
+ except Exception as e:
+                        logger.error("Error processing item %s: %s", item.url, e)
+ except Exception as e:
+                logger.error("Error running crawler %s: %s", crawler, e)
diff --git a/src/processor/base.py b/src/processor/base.py
index 728f589..368e77d 100644
--- a/src/processor/base.py
+++ b/src/processor/base.py
@@ -6,3 +6,7 @@ class ILLMProvider(ABC):
@abstractmethod
async def analyze(self, news_item: NewsItemDTO) -> EnrichedNewsItemDTO:
pass
+
+ @abstractmethod
+ def get_info(self) -> dict[str, str]:
+ pass
diff --git a/src/processor/ollama_provider.py b/src/processor/ollama_provider.py
index a056b9b..8f956dd 100644
--- a/src/processor/ollama_provider.py
+++ b/src/processor/ollama_provider.py
@@ -7,6 +7,15 @@ from src.processor.base import ILLMProvider
from src.processor.dto import EnrichedNewsItemDTO
class OllamaProvider(ILLMProvider):
+ def get_info(self) -> dict[str, str]:
+ base_url = os.environ.get('OLLAMA_API_URL', 'http://localhost:11434')
+ model = os.environ.get('OLLAMA_MODEL', 'gpt-oss:120b-cloud')
+ return {
+ "model": model,
+ "base_url": base_url,
+ "prompt_summary": "Russian summary, 2 sentences, R&D focus"
+ }
+
async def analyze(self, news_item: NewsItemDTO) -> EnrichedNewsItemDTO:
base_url = os.environ.get('OLLAMA_API_URL', 'http://localhost:11434')
url = base_url if base_url.endswith('/api/generate') else f"{base_url.rstrip('/')}/api/generate"
@@ -14,6 +23,7 @@ class OllamaProvider(ILLMProvider):
f"Analyze the following article.\nTitle: {news_item.title}\n"
f"Content: {news_item.content_text}\n"
"Return JSON with 'relevance_score' (0-10), 'summary_ru' (string), 'anomalies_detected' (list of strings), and 'category' (string).\n"
+ "The 'summary_ru' MUST be in Russian and strictly NO MORE than 2 sentences.\n"
"The 'category' must be exactly one of: 'Browsers', 'Edge AI', 'SmartTV', 'Samsung New Technologies', 'Middleware new trends', 'Competitors', 'Other'.\n"
"For 'relevance_score', prioritize and give higher scores to articles related to R&D, Chromium, NPU, and Smart TV operating systems.\n"
"Regarding 'anomalies_detected': only detect factual, conceptual, or industry-related anomalies (e.g., sudden technological shifts, unexpected competitor moves). "
diff --git a/src/storage/chroma_store.py b/src/storage/chroma_store.py
index 05a782e..1c68812 100644
--- a/src/storage/chroma_store.py
+++ b/src/storage/chroma_store.py
@@ -68,6 +68,9 @@ class ChromaStore(IVectorStore):
document = documents[0][idx] if documents and documents[0] else ""
items.append(self._reconstruct_dto(metadata, document))
+ # Sort items by relevance_score in descending order
+ items.sort(key=lambda x: x.relevance_score, reverse=True)
+
return items
def _reconstruct_dto(self, metadata: Mapping[str, Any], document: str) -> EnrichedNewsItemDTO:
diff --git a/tests/bot/test_handlers.py b/tests/bot/test_handlers.py
index b33ac75..39f260b 100644
--- a/tests/bot/test_handlers.py
+++ b/tests/bot/test_handlers.py
@@ -35,8 +35,18 @@ def allowed_chat_id():
return "123456789"
@pytest.fixture
-def router(mock_storage, allowed_chat_id):
- return get_router(mock_storage, allowed_chat_id)
+def mock_processor():
+ processor = MagicMock()
+ processor.get_info.return_value = {
+ "model": "test-model",
+ "base_url": "http://test-url",
+ "prompt_summary": "Test summary"
+ }
+ return processor
+
+@pytest.fixture
+def router(mock_storage, mock_processor, allowed_chat_id):
+ return get_router(mock_storage, mock_processor, allowed_chat_id)
def get_handler(router, callback_name):
for handler in router.message.handlers:
@@ -79,6 +89,22 @@ async def test_command_help_handler(router, allowed_chat_id):
assert "/start" in args[0]
assert "/latest" in args[0]
+@pytest.mark.asyncio
+async def test_command_params_handler(router, mock_processor, allowed_chat_id):
+ handler = get_handler(router, "command_params_handler")
+ message = AsyncMock()
+ message.chat.id = int(allowed_chat_id)
+ message.answer = AsyncMock()
+
+ await handler(message)
+
+ mock_processor.get_info.assert_called_once()
+ message.answer.assert_called_once()
+ args, kwargs = message.answer.call_args
+ assert "LLM Processor Parameters" in args[0]
+ assert "test-model" in args[0]
+ assert "HTML" in kwargs.get("parse_mode", "")
+
@pytest.mark.asyncio
async def test_command_latest_handler(router, mock_storage, allowed_chat_id):
handler = get_handler(router, "command_latest_handler")
diff --git a/tests/crawlers/test_rss_crawler.py b/tests/crawlers/test_rss_crawler.py
index 8bba030..42c8b9e 100644
--- a/tests/crawlers/test_rss_crawler.py
+++ b/tests/crawlers/test_rss_crawler.py
@@ -45,8 +45,8 @@ async def test_rss_crawler_fetch_latest():
# Call the method
items = await crawler.fetch_latest()
- # Verify the mock was called with the correct URL
- mock_get.assert_called_once_with(url)
+ # Verify the mock was called with the correct URL and headers
+ mock_get.assert_called_once_with(url, headers={"User-Agent": RSSCrawler.USER_AGENT})
# Verify the parsing results
assert len(items) == 2
@@ -66,3 +66,17 @@ async def test_rss_crawler_fetch_latest():
assert items[1].content_text == "Test Content 2"
assert items[1].source == source
assert items[1].timestamp == datetime(2002, 10, 3, 10, 0, tzinfo=timezone.utc)
+
+@pytest.mark.asyncio
+async def test_rss_crawler_fetch_latest_error():
+ url = "http://error.source.com/rss"
+ source = "Error Source"
+ crawler = RSSCrawler(url, source)
+
+ with patch("aiohttp.ClientSession.get") as mock_get:
+ # Simulate an exception during the request
+ mock_get.side_effect = Exception("Connection error")
+
+ items = await crawler.fetch_latest()
+
+ assert items == []
diff --git a/tests/orchestrator/test_service.py b/tests/orchestrator/test_service.py
index 411d9f0..0969941 100644
--- a/tests/orchestrator/test_service.py
+++ b/tests/orchestrator/test_service.py
@@ -100,3 +100,67 @@ async def test_run_iteration():
# Should not alert proactively anymore as per updated requirements
assert notifier_mock.send_alert.call_count == 0
+
+@pytest.mark.asyncio
+async def test_run_iteration_crawler_failure():
+ # Arrange
+ crawler1 = AsyncMock()
+ crawler2 = AsyncMock()
+ processor = AsyncMock()
+ storage = AsyncMock()
+ notifier = AsyncMock()
+
+ crawler1.fetch_latest.side_effect = Exception("Crawler 1 failed")
+ crawler2.fetch_latest.return_value = []
+
+ service = TrendScoutService(
+ crawlers=[crawler1, crawler2],
+ processor=processor,
+ storage=storage,
+ notifier=notifier
+ )
+
+ # Act
+ await service.run_iteration()
+
+ # Assert - crawler 1 failed, but it shouldn't stop crawler 2
+ crawler1.fetch_latest.assert_called_once()
+ crawler2.fetch_latest.assert_called_once()
+
+@pytest.mark.asyncio
+async def test_run_iteration_item_failure():
+ # Arrange
+ crawler = AsyncMock()
+ processor = AsyncMock()
+ storage = AsyncMock()
+ notifier = AsyncMock()
+
+ item1 = NewsItemDTO(title="T1", url="U1", content_text="C1", source="S1", timestamp=datetime.now())
+ item2 = NewsItemDTO(title="T2", url="U2", content_text="C2", source="S2", timestamp=datetime.now())
+
+ crawler.fetch_latest.return_value = [item1, item2]
+ storage.exists.return_value = False
+
+ # processor.analyze fails for item1
+ enriched_item2 = EnrichedNewsItemDTO(
+ **item2.model_dump(),
+ relevance_score=6,
+ summary_ru="Summary",
+ anomalies_detected=[]
+ )
+ processor.analyze.side_effect = [Exception("Analyze failed"), enriched_item2]
+
+ service = TrendScoutService(
+ crawlers=[crawler],
+ processor=processor,
+ storage=storage,
+ notifier=notifier
+ )
+
+ # Act
+ await service.run_iteration()
+
+ # Assert - item 1 failed, but it shouldn't stop item 2
+ assert processor.analyze.call_count == 2
+ # Only item 2 should be stored
+ assert storage.store.call_count == 1
diff --git a/tests/processor/test_ollama_provider.py b/tests/processor/test_ollama_provider.py
index 9190175..d75dbe8 100644
--- a/tests/processor/test_ollama_provider.py
+++ b/tests/processor/test_ollama_provider.py
@@ -106,3 +106,13 @@ async def test_ollama_provider_analyze_markdown_json(sample_news_item):
assert result.anomalies_detected == []
assert result.category == "Browsers"
+def test_ollama_provider_get_info(monkeypatch):
+    monkeypatch.setenv('OLLAMA_API_URL', 'http://test-url:11434')
+    monkeypatch.setenv('OLLAMA_MODEL', 'test-model')
+ provider = OllamaProvider()
+ info = provider.get_info()
+
+ assert info["model"] == "test-model"
+ assert info["base_url"] == "http://test-url:11434"
+ assert info["prompt_summary"] == "Russian summary, 2 sentences, R&D focus"
+
diff --git a/tests/storage/test_chroma_store.py b/tests/storage/test_chroma_store.py
index bf3a995..47feef6 100644
--- a/tests/storage/test_chroma_store.py
+++ b/tests/storage/test_chroma_store.py
@@ -230,3 +230,32 @@ async def test_get_stats(chroma_store: ChromaStore):
assert stats["total_count"] == 3
assert stats["category_Tech"] == 2
assert stats["category_Science"] == 1
+
+@pytest.mark.asyncio
+async def test_search_sorting(chroma_store: ChromaStore):
+ # Arrange
+ items = [
+ EnrichedNewsItemDTO(
+ title=f"Title {i}",
+ url=f"https://example.com/{i}",
+ content_text=f"Content {i}",
+ source="Source",
+ timestamp=datetime.now(timezone.utc),
+ relevance_score=i,
+ summary_ru=f"Сводка {i}",
+ anomalies_detected=[],
+ category="Tech"
+ ) for i in range(1, 6) # Scores 1 to 5
+ ]
+
+ for item in items:
+ await chroma_store.store(item)
+
+ # Act
+ results = await chroma_store.search("Content", limit=10)
+
+ # Assert
+ assert len(results) == 5
+ # Should be sorted 5, 4, 3, 2, 1
+ scores = [r.relevance_score for r in results]
+ assert scores == [5, 4, 3, 2, 1]