[perf] stabilization of previous release
This commit is contained in:
parent
9c8e4c7345
commit
4bf7cb4331
@ -2,12 +2,13 @@ from aiogram import Bot, Dispatcher
|
||||
from aiogram.client.default import DefaultBotProperties
|
||||
from src.bot.handlers import get_router
|
||||
from src.storage.base import IVectorStore
|
||||
from src.processor.base import ILLMProvider
|
||||
|
||||
def setup_bot(token: str, storage: IVectorStore, allowed_chat_id: str) -> tuple[Bot, Dispatcher]:
|
||||
def setup_bot(token: str, storage: IVectorStore, processor: ILLMProvider, allowed_chat_id: str) -> tuple[Bot, Dispatcher]:
|
||||
"""
|
||||
Setup the aiogram Bot and Dispatcher with handlers.
|
||||
"""
|
||||
bot = Bot(token=token, default=DefaultBotProperties(parse_mode="HTML"))
|
||||
dp = Dispatcher()
|
||||
dp.include_router(get_router(storage, allowed_chat_id))
|
||||
dp.include_router(get_router(storage, processor, allowed_chat_id))
|
||||
return bot, dp
|
||||
|
||||
@ -10,6 +10,7 @@ from aiogram.utils.keyboard import InlineKeyboardBuilder
|
||||
from aiogram.utils.formatting import as_list, as_marked_section, Bold, TextLink
|
||||
|
||||
from src.processor.dto import EnrichedNewsItemDTO
|
||||
from src.processor.base import ILLMProvider
|
||||
from src.storage.base import IVectorStore
|
||||
|
||||
class AccessMiddleware(BaseMiddleware):
|
||||
@ -28,7 +29,7 @@ class AccessMiddleware(BaseMiddleware):
|
||||
return
|
||||
return await handler(event, data)
|
||||
|
||||
def get_router(storage: IVectorStore, allowed_chat_id: str) -> Router:
|
||||
def get_router(storage: IVectorStore, processor: ILLMProvider, allowed_chat_id: str) -> Router:
|
||||
router = Router(name="main_router")
|
||||
router.message.middleware(AccessMiddleware(allowed_chat_id))
|
||||
|
||||
@ -52,9 +53,24 @@ def get_router(storage: IVectorStore, allowed_chat_id: str) -> Router:
|
||||
"/latest [category] - Show the latest enriched news trends\n"
|
||||
"/search query - Search for news\n"
|
||||
"/stats - Show database statistics\n"
|
||||
"/params - Show LLM processor parameters\n"
|
||||
)
|
||||
await message.answer(help_text)
|
||||
|
||||
@router.message(Command("params"))
|
||||
async def command_params_handler(message: Message) -> None:
|
||||
"""
|
||||
This handler receives messages with `/params` command
|
||||
"""
|
||||
info = processor.get_info()
|
||||
|
||||
response = "🤖 <b>LLM Processor Parameters</b>\n\n"
|
||||
response += f"<b>Model:</b> {html.escape(info.get('model', 'Unknown'))}\n"
|
||||
response += f"<b>Base URL:</b> {html.escape(info.get('base_url', 'Unknown'))}\n"
|
||||
response += f"<b>Prompt Summary:</b> {html.escape(info.get('prompt_summary', 'Unknown'))}"
|
||||
|
||||
await message.answer(response, parse_mode="HTML")
|
||||
|
||||
@router.message(Command("latest"))
|
||||
async def command_latest_handler(message: Message, command: CommandObject) -> None:
|
||||
"""
|
||||
@ -146,12 +162,13 @@ def get_router(storage: IVectorStore, allowed_chat_id: str) -> Router:
|
||||
This handler receives messages with `/stats` command
|
||||
"""
|
||||
stats = await storage.get_stats()
|
||||
total = stats.get("total", 0)
|
||||
total = stats.get("total_count", 0)
|
||||
|
||||
breakdown = []
|
||||
for cat, count in stats.items():
|
||||
if cat != "total":
|
||||
breakdown.append(f"- {cat}: {count}")
|
||||
for key, count in stats.items():
|
||||
if key.startswith("category_"):
|
||||
cat_name = key.replace("category_", "")
|
||||
breakdown.append(f"- {cat_name}: {count}")
|
||||
|
||||
response = f"📊 <b>Database Statistics</b>\n\nTotal items: {total}\n"
|
||||
if breakdown:
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
import aiohttp
|
||||
import xml.etree.ElementTree as ET
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from email.utils import parsedate_to_datetime
|
||||
from typing import List
|
||||
@ -7,17 +8,26 @@ from typing import List
|
||||
from src.crawlers.base import ICrawler
|
||||
from src.crawlers.dto import NewsItemDTO
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class RSSCrawler(ICrawler):
|
||||
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
||||
|
||||
def __init__(self, url: str, source: str):
|
||||
self.url = url
|
||||
self.source = source
|
||||
|
||||
async def fetch_latest(self) -> List[NewsItemDTO]:
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.get(self.url) as response:
|
||||
response.raise_for_status()
|
||||
xml_data = await response.text()
|
||||
return self._parse_xml(xml_data)
|
||||
headers = {"User-Agent": self.USER_AGENT}
|
||||
try:
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.get(self.url, headers=headers) as response:
|
||||
response.raise_for_status()
|
||||
xml_data = await response.text()
|
||||
return self._parse_xml(xml_data)
|
||||
except Exception as e:
|
||||
logger.error(f"Error fetching RSS from {self.url}: {e}")
|
||||
return []
|
||||
|
||||
def _parse_xml(self, xml_data: str) -> List[NewsItemDTO]:
|
||||
root = ET.fromstring(xml_data)
|
||||
|
||||
18
src/main.py
18
src/main.py
@ -9,6 +9,7 @@ from aiogram import Bot, Dispatcher
|
||||
|
||||
from src.crawlers.base import ICrawler
|
||||
from src.crawlers.rss_crawler import RSSCrawler
|
||||
from src.crawlers.playwright_crawler import PlaywrightCrawler
|
||||
from src.processor.ollama_provider import OllamaProvider
|
||||
from src.storage.chroma_store import ChromaStore
|
||||
from src.notifications.telegram import TelegramNotifier
|
||||
@ -49,7 +50,20 @@ async def main():
|
||||
|
||||
# 1. Initialize Components that do not depend on Bot
|
||||
crawlers: List[ICrawler] = [
|
||||
RSSCrawler("https://habr.com/ru/rss/hubs/artificial_intelligence/articles/?fl=ru", source="Habr AI")
|
||||
RSSCrawler("https://habr.com/ru/rss/hubs/artificial_intelligence/articles/?fl=ru", source="Habr AI"),
|
||||
RSSCrawler("https://www.nature.com/nature.rss", source="Nature"),
|
||||
RSSCrawler("https://news.google.com/rss/search?q=WebOS+Chromium+Edge+AI+LGE+SmartTV&hl=en-US&gl=US&ceid=US:en", source="Google News R&D"),
|
||||
RSSCrawler("https://news.samsung.com/global/rss", source="Samsung Newsroom"),
|
||||
RSSCrawler("https://www.sony.com/en/SonyInfo/News/Service/rss.xml", source="Sony Newsroom"),
|
||||
PlaywrightCrawler("https://cvpr.thecvf.com/Conferences/2025", source="CVPR 2025", selector=".conference-news-item"),
|
||||
PlaywrightCrawler("https://www.ces.tech/news/press-releases.aspx", source="CES 2025", selector=".press-release-item"),
|
||||
RSSCrawler("https://vc.ru/rss/tech", source="VC.ru Tech"),
|
||||
RSSCrawler("https://rb.ru/rss/", source="RB.ru"),
|
||||
RSSCrawler("https://www.science.org/rss/news_current.xml", source="Science News"),
|
||||
RSSCrawler("https://ufn.ru/en/rss/", source="УФН"),
|
||||
RSSCrawler("https://www.tadviser.ru/xml/tadviser.xml", source="TAdviser"),
|
||||
RSSCrawler("https://habr.com/ru/rss/company/yandex/blog/", source="Yandex Tech"),
|
||||
RSSCrawler("https://blog.google/technology/ai/rss/", source="Google AI Blog"),
|
||||
]
|
||||
|
||||
processor = OllamaProvider()
|
||||
@ -62,7 +76,7 @@ async def main():
|
||||
storage = ChromaStore(client=chroma_client)
|
||||
|
||||
# 2. Initialize Bot & Dispatcher
|
||||
bot, dp = setup_bot(bot_token, storage, chat_id)
|
||||
bot, dp = setup_bot(bot_token, storage, processor, chat_id)
|
||||
|
||||
# 3. Initialize Notifier and Orchestrator
|
||||
notifier = TelegramNotifier(bot, chat_id)
|
||||
|
||||
@ -1,9 +1,12 @@
|
||||
import logging
|
||||
from typing import List
|
||||
from src.crawlers.base import ICrawler
|
||||
from src.processor.base import ILLMProvider
|
||||
from src.storage.base import IVectorStore
|
||||
from src.notifications.base import INotificationService
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class TrendScoutService:
|
||||
def __init__(
|
||||
self,
|
||||
@ -19,18 +22,24 @@ class TrendScoutService:
|
||||
|
||||
async def run_iteration(self) -> None:
|
||||
for crawler in self.crawlers:
|
||||
items = await crawler.fetch_latest()
|
||||
for item in items:
|
||||
if await self.storage.exists(item.url):
|
||||
continue
|
||||
try:
|
||||
items = await crawler.fetch_latest()
|
||||
for item in items:
|
||||
try:
|
||||
if await self.storage.exists(item.url):
|
||||
continue
|
||||
|
||||
enriched_item = await self.processor.analyze(item)
|
||||
|
||||
if enriched_item.relevance_score < 5:
|
||||
enriched_item.content_text = ""
|
||||
|
||||
await self.storage.store(enriched_item)
|
||||
|
||||
# Proactive alerts are currently disabled per user request
|
||||
# if enriched_item.relevance_score >= 8 or bool(enriched_item.anomalies_detected):
|
||||
# await self.notifier.send_alert(enriched_item)
|
||||
enriched_item = await self.processor.analyze(item)
|
||||
|
||||
if enriched_item.relevance_score < 5:
|
||||
enriched_item.content_text = ""
|
||||
|
||||
await self.storage.store(enriched_item)
|
||||
|
||||
# Proactive alerts are currently disabled per user request
|
||||
# if enriched_item.relevance_score >= 8 or bool(enriched_item.anomalies_detected):
|
||||
# await self.notifier.send_alert(enriched_item)
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing item {item.url}: {e}")
|
||||
except Exception as e:
|
||||
logger.error(f"Error running crawler {crawler}: {e}")
|
||||
|
||||
@ -6,3 +6,7 @@ class ILLMProvider(ABC):
|
||||
@abstractmethod
|
||||
async def analyze(self, news_item: NewsItemDTO) -> EnrichedNewsItemDTO:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_info(self) -> dict[str, str]:
|
||||
pass
|
||||
|
||||
@ -7,6 +7,15 @@ from src.processor.base import ILLMProvider
|
||||
from src.processor.dto import EnrichedNewsItemDTO
|
||||
|
||||
class OllamaProvider(ILLMProvider):
|
||||
def get_info(self) -> dict[str, str]:
|
||||
base_url = os.environ.get('OLLAMA_API_URL', 'http://localhost:11434')
|
||||
model = os.environ.get('OLLAMA_MODEL', 'gpt-oss:120b-cloud')
|
||||
return {
|
||||
"model": model,
|
||||
"base_url": base_url,
|
||||
"prompt_summary": "Russian summary, 2 sentences, R&D focus"
|
||||
}
|
||||
|
||||
async def analyze(self, news_item: NewsItemDTO) -> EnrichedNewsItemDTO:
|
||||
base_url = os.environ.get('OLLAMA_API_URL', 'http://localhost:11434')
|
||||
url = base_url if base_url.endswith('/api/generate') else f"{base_url.rstrip('/')}/api/generate"
|
||||
@ -14,6 +23,7 @@ class OllamaProvider(ILLMProvider):
|
||||
f"Analyze the following article.\nTitle: {news_item.title}\n"
|
||||
f"Content: {news_item.content_text}\n"
|
||||
"Return JSON with 'relevance_score' (0-10), 'summary_ru' (string), 'anomalies_detected' (list of strings), and 'category' (string).\n"
|
||||
"The 'summary_ru' MUST be in Russian and strictly NO MORE than 2 sentences.\n"
|
||||
"The 'category' must be exactly one of: 'Browsers', 'Edge AI', 'SmartTV', 'Samsung New Technologies', 'Middleware new trends', 'Competitors', 'Other'.\n"
|
||||
"For 'relevance_score', prioritize and give higher scores to articles related to R&D, Chromium, NPU, and Smart TV operating systems.\n"
|
||||
"Regarding 'anomalies_detected': only detect factual, conceptual, or industry-related anomalies (e.g., sudden technological shifts, unexpected competitor moves). "
|
||||
|
||||
@ -68,6 +68,9 @@ class ChromaStore(IVectorStore):
|
||||
document = documents[0][idx] if documents and documents[0] else ""
|
||||
items.append(self._reconstruct_dto(metadata, document))
|
||||
|
||||
# Sort items by relevance_score in descending order
|
||||
items.sort(key=lambda x: x.relevance_score, reverse=True)
|
||||
|
||||
return items
|
||||
|
||||
def _reconstruct_dto(self, metadata: Mapping[str, Any], document: str) -> EnrichedNewsItemDTO:
|
||||
|
||||
@ -35,8 +35,18 @@ def allowed_chat_id():
|
||||
return "123456789"
|
||||
|
||||
@pytest.fixture
|
||||
def router(mock_storage, allowed_chat_id):
|
||||
return get_router(mock_storage, allowed_chat_id)
|
||||
def mock_processor():
|
||||
processor = MagicMock()
|
||||
processor.get_info.return_value = {
|
||||
"model": "test-model",
|
||||
"base_url": "http://test-url",
|
||||
"prompt_summary": "Test summary"
|
||||
}
|
||||
return processor
|
||||
|
||||
@pytest.fixture
|
||||
def router(mock_storage, mock_processor, allowed_chat_id):
|
||||
return get_router(mock_storage, mock_processor, allowed_chat_id)
|
||||
|
||||
def get_handler(router, callback_name):
|
||||
for handler in router.message.handlers:
|
||||
@ -79,6 +89,22 @@ async def test_command_help_handler(router, allowed_chat_id):
|
||||
assert "/start" in args[0]
|
||||
assert "/latest" in args[0]
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_command_params_handler(router, mock_processor, allowed_chat_id):
|
||||
handler = get_handler(router, "command_params_handler")
|
||||
message = AsyncMock()
|
||||
message.chat.id = int(allowed_chat_id)
|
||||
message.answer = AsyncMock()
|
||||
|
||||
await handler(message)
|
||||
|
||||
mock_processor.get_info.assert_called_once()
|
||||
message.answer.assert_called_once()
|
||||
args, kwargs = message.answer.call_args
|
||||
assert "LLM Processor Parameters" in args[0]
|
||||
assert "test-model" in args[0]
|
||||
assert "HTML" in kwargs.get("parse_mode", "")
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_command_latest_handler(router, mock_storage, allowed_chat_id):
|
||||
handler = get_handler(router, "command_latest_handler")
|
||||
|
||||
@ -45,8 +45,8 @@ async def test_rss_crawler_fetch_latest():
|
||||
# Call the method
|
||||
items = await crawler.fetch_latest()
|
||||
|
||||
# Verify the mock was called with the correct URL
|
||||
mock_get.assert_called_once_with(url)
|
||||
# Verify the mock was called with the correct URL and headers
|
||||
mock_get.assert_called_once_with(url, headers={"User-Agent": RSSCrawler.USER_AGENT})
|
||||
|
||||
# Verify the parsing results
|
||||
assert len(items) == 2
|
||||
@ -66,3 +66,17 @@ async def test_rss_crawler_fetch_latest():
|
||||
assert items[1].content_text == "Test Content 2"
|
||||
assert items[1].source == source
|
||||
assert items[1].timestamp == datetime(2002, 10, 3, 10, 0, tzinfo=timezone.utc)
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_rss_crawler_fetch_latest_error():
|
||||
url = "http://error.source.com/rss"
|
||||
source = "Error Source"
|
||||
crawler = RSSCrawler(url, source)
|
||||
|
||||
with patch("aiohttp.ClientSession.get") as mock_get:
|
||||
# Simulate an exception during the request
|
||||
mock_get.side_effect = Exception("Connection error")
|
||||
|
||||
items = await crawler.fetch_latest()
|
||||
|
||||
assert items == []
|
||||
|
||||
@ -100,3 +100,67 @@ async def test_run_iteration():
|
||||
|
||||
# Should not alert proactively anymore as per updated requirements
|
||||
assert notifier_mock.send_alert.call_count == 0
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_run_iteration_crawler_failure():
|
||||
# Arrange
|
||||
crawler1 = AsyncMock()
|
||||
crawler2 = AsyncMock()
|
||||
processor = AsyncMock()
|
||||
storage = AsyncMock()
|
||||
notifier = AsyncMock()
|
||||
|
||||
crawler1.fetch_latest.side_effect = Exception("Crawler 1 failed")
|
||||
crawler2.fetch_latest.return_value = []
|
||||
|
||||
service = TrendScoutService(
|
||||
crawlers=[crawler1, crawler2],
|
||||
processor=processor,
|
||||
storage=storage,
|
||||
notifier=notifier
|
||||
)
|
||||
|
||||
# Act
|
||||
await service.run_iteration()
|
||||
|
||||
# Assert - crawler 1 failed, but it shouldn't stop crawler 2
|
||||
crawler1.fetch_latest.assert_called_once()
|
||||
crawler2.fetch_latest.assert_called_once()
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_run_iteration_item_failure():
|
||||
# Arrange
|
||||
crawler = AsyncMock()
|
||||
processor = AsyncMock()
|
||||
storage = AsyncMock()
|
||||
notifier = AsyncMock()
|
||||
|
||||
item1 = NewsItemDTO(title="T1", url="U1", content_text="C1", source="S1", timestamp=datetime.now())
|
||||
item2 = NewsItemDTO(title="T2", url="U2", content_text="C2", source="S2", timestamp=datetime.now())
|
||||
|
||||
crawler.fetch_latest.return_value = [item1, item2]
|
||||
storage.exists.return_value = False
|
||||
|
||||
# processor.analyze fails for item1
|
||||
enriched_item2 = EnrichedNewsItemDTO(
|
||||
**item2.model_dump(),
|
||||
relevance_score=6,
|
||||
summary_ru="Summary",
|
||||
anomalies_detected=[]
|
||||
)
|
||||
processor.analyze.side_effect = [Exception("Analyze failed"), enriched_item2]
|
||||
|
||||
service = TrendScoutService(
|
||||
crawlers=[crawler],
|
||||
processor=processor,
|
||||
storage=storage,
|
||||
notifier=notifier
|
||||
)
|
||||
|
||||
# Act
|
||||
await service.run_iteration()
|
||||
|
||||
# Assert - item 1 failed, but it shouldn't stop item 2
|
||||
assert processor.analyze.call_count == 2
|
||||
# Only item 2 should be stored
|
||||
assert storage.store.call_count == 1
|
||||
|
||||
@ -106,3 +106,13 @@ async def test_ollama_provider_analyze_markdown_json(sample_news_item):
|
||||
assert result.anomalies_detected == []
|
||||
assert result.category == "Browsers"
|
||||
|
||||
def test_ollama_provider_get_info():
|
||||
os.environ['OLLAMA_API_URL'] = 'http://test-url:11434'
|
||||
os.environ['OLLAMA_MODEL'] = 'test-model'
|
||||
provider = OllamaProvider()
|
||||
info = provider.get_info()
|
||||
|
||||
assert info["model"] == "test-model"
|
||||
assert info["base_url"] == "http://test-url:11434"
|
||||
assert info["prompt_summary"] == "Russian summary, 2 sentences, R&D focus"
|
||||
|
||||
|
||||
@ -230,3 +230,32 @@ async def test_get_stats(chroma_store: ChromaStore):
|
||||
assert stats["total_count"] == 3
|
||||
assert stats["category_Tech"] == 2
|
||||
assert stats["category_Science"] == 1
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_search_sorting(chroma_store: ChromaStore):
|
||||
# Arrange
|
||||
items = [
|
||||
EnrichedNewsItemDTO(
|
||||
title=f"Title {i}",
|
||||
url=f"https://example.com/{i}",
|
||||
content_text=f"Content {i}",
|
||||
source="Source",
|
||||
timestamp=datetime.now(timezone.utc),
|
||||
relevance_score=i,
|
||||
summary_ru=f"Сводка {i}",
|
||||
anomalies_detected=[],
|
||||
category="Tech"
|
||||
) for i in range(1, 6) # Scores 1 to 5
|
||||
]
|
||||
|
||||
for item in items:
|
||||
await chroma_store.store(item)
|
||||
|
||||
# Act
|
||||
results = await chroma_store.search("Content", limit=10)
|
||||
|
||||
# Assert
|
||||
assert len(results) == 5
|
||||
# Should be sorted 5, 4, 3, 2, 1
|
||||
scores = [r.relevance_score for r in results]
|
||||
assert scores == [5, 4, 3, 2, 1]
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user