commit 5f093075f75cba2364cc1b69e8521f834c500876 Author: Artur Mukhamadiev Date: Fri Mar 13 11:48:37 2026 +0300 [ai] mvp generated by gemini diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e15106e --- /dev/null +++ b/.gitignore @@ -0,0 +1,216 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[codz] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py.cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +# Pipfile.lock + +# UV +# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 
+# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# uv.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +# poetry.lock +# poetry.toml + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python. +# https://pdm-project.org/en/latest/usage/project/#working-with-version-control +# pdm.lock +# pdm.toml +.pdm-python +.pdm-build/ + +# pixi +# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control. +# pixi.lock +# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one +# in the .venv directory. It is recommended not to include this directory in version control. +.pixi + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# Redis +*.rdb +*.aof +*.pid + +# RabbitMQ +mnesia/ +rabbitmq/ +rabbitmq-data/ + +# ActiveMQ +activemq-data/ + +# SageMath parsed files +*.sage.py + +# Environments +.env +.envrc +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +# .idea/ + +# Abstra +# Abstra is an AI-powered process automation framework. +# Ignore directories containing user credentials, local state, and settings. +# Learn more at https://abstra.io/docs +.abstra/ + +# Visual Studio Code +# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore +# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore +# and can be added to the global gitignore or merged into this file. 
However, if you prefer, +# you could uncomment the following to ignore the entire vscode folder +# .vscode/ + +# Ruff stuff: +.ruff_cache/ + +# PyPI configuration file +.pypirc + +# Marimo +marimo/_static/ +marimo/_lsp/ +__marimo__/ + +# Streamlit +.streamlit/secrets.toml diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..7255980 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,61 @@ +# Agents Architecture and Tasks +This document outlines the architecture, responsibilities, and tasks for the subagents working on the **"Trend-Scout AI"** Telegram bot. + +## Core Principles +- **Test-Driven Development (TDD):** Write tests before implementing features. Use `pytest`. +- **SOLID Principles:** Ensure code is modular, maintainable, and follows Single Responsibility, Open/Closed, Liskov Substitution, Interface Segregation, and Dependency Inversion principles. +- **Asynchronous I/O:** Use `asyncio` for network requests, database operations, and bot handling. + +## 1. Crawler Agent (Data Collection) +**Responsibility:** Collect data from various sources (RSS feeds, HTML parsing for protected sites like Samsung/Sony Newsroom). +**Inputs:** +- Target URLs and source types (RSS, HTML). +- Configuration for Scrapy/Playwright. +**Outputs:** +- Standardized DTOs (Data Transfer Objects) containing: `title`, `url`, `content_text`, `source`, `timestamp`. +**Tasks:** +1. Setup TDD environment for crawlers (mocking HTTP responses). +2. Implement RSS parser for standard sources (e.g., Nature, Habr). +3. Implement HTML parser (Playwright/Scrapy) for complex/protected sources. +4. Ensure SRP: Crawlers only fetch and parse, returning raw text data. + +## 2. AI Processor Agent (NLP & LLM) +**Responsibility:** Analyze collected data using the Ollama API (`gpt-oss:120b-cloud` model). +**Inputs:** +- Standardized DTOs from Crawler Agent. +- Prompts for relevance scoring (0-10) and summarization in Russian. 
+- Keywords for anomaly detection (e.g., "WebGPU", "NPU acceleration", "Edge AI"). +**Outputs:** +- Enriched DTOs containing: `relevance_score`, `summary_ru`, `anomalies_detected`. +**Tasks:** +1. Setup TDD with mocked Ollama API responses. +2. Implement an `ILLMProvider` interface (Dependency Inversion). +3. Implement the concrete Ollama provider. +4. Create prompt templates for relevance, summarization, and anomaly detection. + +## 3. Vector Storage Agent (Database) +**Responsibility:** Store and retrieve processed data using a Vector Database (ChromaDB). +**Inputs:** +- Enriched DTOs from AI Processor Agent. +- Search queries. +**Outputs:** +- Stored records. +- Search results (similar news/trends). +**Tasks:** +1. Setup TDD for in-memory ChromaDB operations. +2. Define interfaces for `IVectorStore` (Interface Segregation). +3. Implement embedding generation (using a lightweight local model or Ollama). +4. Implement store and semantic search functionalities. + +## 4. Telegram Bot Agent (Frontend) +**Responsibility:** Handle user interactions, display summaries, and send alerts for anomalies. +**Inputs:** +- Telegram updates. +- Enriched DTOs (for alerts). +**Outputs:** +- Formatted Telegram messages. +**Tasks:** +1. Setup TDD for `aiogram` handlers (mocking Telegram API). +2. Implement `/start`, `/help`, and `/latest` commands. +3. Implement a notification service for high-relevance news and anomalies. +4. Ensure OCP: Routers should easily accept new commands without modifying core logic. 
# src/bot/bot.py — wiring of the aiogram Bot and Dispatcher.
from aiogram import Bot, Dispatcher
from aiogram.client.default import DefaultBotProperties
from src.bot.handlers import router

def setup_bot(token: str) -> tuple[Bot, Dispatcher]:
    """Build the Bot (HTML parse mode by default) and a Dispatcher with the main router attached."""
    configured_bot = Bot(token=token, default=DefaultBotProperties(parse_mode="HTML"))
    dispatcher = Dispatcher()
    dispatcher.include_router(router)
    return configured_bot, dispatcher


# src/bot/handlers.py — user-facing Telegram command handlers.
from datetime import datetime
import html
from aiogram import Router
from aiogram.filters import CommandStart, Command
from aiogram.types import Message
from aiogram.utils.formatting import as_list, as_marked_section, Bold, TextLink

from src.processor.dto import EnrichedNewsItemDTO

router = Router(name="main_router")


@router.message(CommandStart())
async def command_start_handler(message: Message) -> None:
    """Greet the user on /start; the display name is HTML-escaped for HTML parse mode."""
    sender = message.from_user
    user_name = html.escape(sender.full_name) if sender else 'user'
    await message.answer(f"Welcome to Trend-Scout AI, {user_name}!")


@router.message(Command("help"))
async def command_help_handler(message: Message) -> None:
    """Reply to /help with the static list of supported commands."""
    await message.answer(
        "Available commands:\n"
        "/start - Start the bot\n"
        "/help - Show this help message\n"
        "/latest - Show the latest enriched news trends\n"
    )


@router.message(Command("latest"))
async def command_latest_handler(message: Message) -> None:
    """Reply to /latest with a hard-coded sample EnrichedNewsItemDTO (placeholder data)."""
    mock_item = EnrichedNewsItemDTO(
        title="Breakthrough in Quantum Computing",
        url="https://example.com/quantum-breakthrough",
        content_text="Researchers have achieved a new milestone in quantum supremacy...",
        source="Example Domain News",
        timestamp=datetime.now(),
        relevance_score=9,
        summary_ru="Исследователи достигли нового рубежа в квантовом превосходстве.",
        anomalies_detected=["Quantum supremacy", "Room temperature superconductors"],
    )

    # All user-visible text is escaped because the reply is sent in HTML mode.
    escaped_title = html.escape(mock_item.title)
    escaped_source = html.escape(mock_item.source)
    escaped_summary = html.escape(mock_item.summary_ru)
    anomalies_text = ", ".join(html.escape(a) for a in mock_item.anomalies_detected)
    # NOTE(review): escaped but never embedded in the message below — an
    # <a href="..."> anchor around "Read more" may have been lost; confirm
    # against the intended message markup.
    escaped_url = html.escape(mock_item.url)

    await message.answer(
        f"🌟 {escaped_title}\n\n"
        f"Source: {escaped_source}\n"
        f"Relevance Score: {mock_item.relevance_score}/10\n"
        f"Summary: {escaped_summary}\n"
        f"Anomalies Detected: {anomalies_text}\n\n"
        f"Read more",
        parse_mode="HTML",
    )


# src/crawlers/base.py — crawler interface.
from abc import ABC, abstractmethod
from typing import List
from .dto import NewsItemDTO

class ICrawler(ABC):
    """A source-specific crawler that fetches and parses the newest items."""

    @abstractmethod
    async def fetch_latest(self) -> List[NewsItemDTO]:
        pass


# src/crawlers/dto.py — raw crawl result DTO.
from pydantic import BaseModel

class NewsItemDTO(BaseModel):
    # Raw, unanalyzed article data as produced by any crawler.
    title: str
    url: str
    content_text: str
    source: str
    timestamp: datetime


# src/crawlers/rss_crawler.py — imports (continued on the next section).
import aiohttp
import xml.etree.ElementTree as ET
from datetime import datetime
from email.utils import parsedate_to_datetime
from typing import List

from src.crawlers.base import ICrawler
from src.crawlers.dto import NewsItemDTO

class RSSCrawler(ICrawler):
    """Fetch an RSS feed over HTTP and map its <item> entries to NewsItemDTO."""

    def __init__(self, url: str, source: str):
        self.url = url          # feed URL to poll
        self.source = source    # human-readable source label stamped on DTOs

    async def fetch_latest(self) -> List[NewsItemDTO]:
        """Download the feed body (raising on HTTP errors) and parse it into DTOs."""
        async with aiohttp.ClientSession() as session:
            async with session.get(self.url) as response:
                response.raise_for_status()
                xml_data = await response.text()
        return self._parse_xml(xml_data)

    def _parse_xml(self, xml_data: str) -> List[NewsItemDTO]:
        """Convert raw RSS XML into DTOs; missing fields become empty strings."""
        root = ET.fromstring(xml_data)
        parsed: List[NewsItemDTO] = []
        for entry in root.findall('.//item'):
            parsed.append(
                NewsItemDTO(
                    title=entry.findtext('title') or "",
                    url=entry.findtext('link') or "",
                    content_text=entry.findtext('description') or "",
                    source=self.source,
                    timestamp=self._entry_timestamp(entry.findtext('pubDate')),
                )
            )
        return parsed

    @staticmethod
    def _entry_timestamp(pub_date_str) -> datetime:
        # RFC 2822 dates are the RSS norm; a missing or unparsable pubDate
        # degrades to "now" rather than failing the whole feed.
        if pub_date_str:
            try:
                return parsedate_to_datetime(pub_date_str)
            except Exception:
                return datetime.now()
        return datetime.now()


# src/main.py — application entry point: imports and logging setup.
import asyncio
import os
import logging
from typing import List
from dotenv import load_dotenv
import chromadb

from aiogram import Bot, Dispatcher

from src.crawlers.base import ICrawler
from src.crawlers.rss_crawler import RSSCrawler
from src.processor.ollama_provider import OllamaProvider
from src.storage.chroma_store import ChromaStore
from src.notifications.telegram import TelegramNotifier
from src.orchestrator.service import TrendScoutService
from src.bot.bot import setup_bot

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
async def background_task(orchestrator: TrendScoutService, interval: int = 3600):
    """Run the orchestrator forever, sleeping `interval` seconds between passes.

    A failure inside one iteration is logged and swallowed so a single bad
    run does not kill the periodic loop; cancellation still propagates
    because asyncio.CancelledError is not an Exception subclass.
    """
    while True:
        logger.info("Starting new TrendScout iteration...")
        try:
            await orchestrator.run_iteration()
            logger.info("Iteration completed successfully.")
        except Exception as e:
            logger.error(f"Error during iteration: {e}", exc_info=True)

        logger.info(f"Sleeping for {interval} seconds before next iteration.")
        await asyncio.sleep(interval)

async def main():
    """Load config, assemble the pipeline, start the crawl loop and bot polling."""
    load_dotenv()

    # Load configuration from the environment (.env supported via dotenv).
    bot_token = os.getenv("TELEGRAM_BOT_TOKEN")
    chat_id = os.getenv("TELEGRAM_CHAT_ID", "")
    ollama_url = os.getenv("OLLAMA_API_URL", "http://localhost:11434/api/generate")
    chroma_db_path = os.getenv("CHROMA_DB_PATH", "./chroma_db")

    if not bot_token:
        logger.error("TELEGRAM_BOT_TOKEN is missing!")
        return

    if not chat_id or chat_id == "YOUR_CHAT_ID_HERE":
        logger.warning("TELEGRAM_CHAT_ID is missing or not set. Notifications will fail.")

    # 1. Initialize Bot & Dispatcher
    bot, dp = setup_bot(bot_token)

    # 2. Initialize Components
    crawlers: List[ICrawler] = [
        RSSCrawler("https://habr.com/ru/rss/hubs/artificial_intelligence/articles/?fl=ru", source="Habr AI")
    ]

    processor = OllamaProvider()

    # Persistent storage when a path is configured; ephemeral in-memory otherwise.
    if chroma_db_path:
        chroma_client = chromadb.PersistentClient(path=chroma_db_path)
    else:
        chroma_client = chromadb.Client()

    storage = ChromaStore(client=chroma_client)
    notifier = TelegramNotifier(bot, chat_id)

    orchestrator = TrendScoutService(
        crawlers=crawlers,
        processor=processor,
        storage=storage,
        notifier=notifier
    )

    # 3. Start tasks
    logger.info("Starting TrendScout AI Bot and Background Task...")

    # Create the background crawl task alongside the (blocking) polling loop.
    bg_task = asyncio.create_task(background_task(orchestrator, interval=3600))

    try:
        await dp.start_polling(bot)
    finally:
        # FIX: the task was previously cancelled but never awaited, so its
        # cancellation was never consumed and cleanup was not guaranteed to
        # run before the loop shut down. Await it and swallow the expected
        # CancelledError.
        bg_task.cancel()
        try:
            await bg_task
        except asyncio.CancelledError:
            pass
        logger.info("Shutting down...")

if __name__ == "__main__":
    asyncio.run(main())


# src/notifications/base.py — notification interface.
from abc import ABC, abstractmethod
from src.processor.dto import EnrichedNewsItemDTO

class INotificationService(ABC):
    @abstractmethod
    async def send_alert(self, item: EnrichedNewsItemDTO) -> None:
        """Send an alert about an enriched news item."""
        pass


# src/notifications/telegram.py — Telegram-backed notification service.
from typing import Union
import aiogram
import html

class TelegramNotifier(INotificationService):
    """Pushes enriched-item alerts into a single configured Telegram chat."""

    def __init__(self, bot: aiogram.Bot, chat_id: Union[str, int]):
        self.bot = bot
        self.chat_id = chat_id

    async def send_alert(self, item: EnrichedNewsItemDTO) -> None:
        """Format the item as an HTML message and send it to the chat."""
        title = html.escape(item.title)
        summary = html.escape(item.summary_ru)
        anomalies_list = [html.escape(a) for a in item.anomalies_detected] if item.anomalies_detected else []
        anomalies_text = ", ".join(anomalies_list) if anomalies_list else "None"
        # NOTE(review): escaped but not referenced in the message below — a
        # <a href="..."> anchor around "Source" may have been lost; verify the
        # intended markup.
        url = html.escape(item.url)

        formatted_text = (
            f"{title}\n\n"
            f"Relevance: {item.relevance_score}/10\n"
            f"Anomalies: {anomalies_text}\n\n"
            f"{summary}\n\n"
            f"Source"
        )
        await self.bot.send_message(chat_id=self.chat_id, text=formatted_text, parse_mode="HTML")
# src/orchestrator/service.py — the crawl -> analyze -> store -> alert pipeline.
# (Interface types ICrawler / ILLMProvider / IVectorStore / INotificationService
#  are imported at the module top; annotations below use forward-reference strings.)

class TrendScoutService:
    """Coordinates crawlers, the LLM processor, vector storage and notifications."""

    def __init__(
        self,
        crawlers: "List[ICrawler]",
        processor: "ILLMProvider",
        storage: "IVectorStore",
        notifier: "INotificationService",
    ):
        self.crawlers = crawlers
        self.processor = processor
        self.storage = storage
        self.notifier = notifier

    async def run_iteration(self) -> None:
        """One full pass: fetch from every crawler, enrich, persist, and alert.

        An alert is sent when an item scores >= 8 relevance or carries any
        detected anomaly.
        """
        for source_crawler in self.crawlers:
            fetched = await source_crawler.fetch_latest()
            for raw_item in fetched:
                enriched = await self.processor.analyze(raw_item)
                await self.storage.store(enriched)
                if enriched.relevance_score >= 8 or enriched.anomalies_detected:
                    await self.notifier.send_alert(enriched)


# src/processor/base.py — LLM provider interface.

class ILLMProvider(ABC):
    """Anything that can enrich a raw NewsItemDTO into an EnrichedNewsItemDTO."""

    @abstractmethod
    async def analyze(self, news_item: "NewsItemDTO") -> "EnrichedNewsItemDTO":
        pass
# src/processor/dto.py — enriched DTO produced by the AI processor.
from typing import List
from pydantic import Field
from src.crawlers.dto import NewsItemDTO

class EnrichedNewsItemDTO(NewsItemDTO):
    """A NewsItemDTO augmented with LLM-derived analysis fields."""

    relevance_score: int = Field(ge=0, le=10)  # validated into [0, 10]
    summary_ru: str                            # Russian-language summary from the LLM
    anomalies_detected: List[str]              # keyword anomalies flagged by the LLM


# src/processor/ollama_provider.py — ILLMProvider backed by Ollama /api/generate.
import os
import json
import aiohttp
from src.processor.base import ILLMProvider

class OllamaProvider(ILLMProvider):
    """Enrich news items by prompting an Ollama-hosted model for JSON analysis.

    Configuration via environment variables:
      - OLLAMA_API_URL: base URL or full /api/generate endpoint
        (default http://localhost:11434)
      - OLLAMA_MODEL: model name (default gpt-oss:120b-cloud)
    """

    async def analyze(self, news_item: NewsItemDTO) -> EnrichedNewsItemDTO:
        """Send the article to Ollama and return an enriched DTO.

        HTTP failures raise (via raise_for_status); malformed or out-of-range
        model output degrades gracefully instead of raising.
        """
        base_url = os.environ.get('OLLAMA_API_URL', 'http://localhost:11434')
        # Accept either a bare base URL or an already-complete endpoint URL.
        url = base_url if base_url.endswith('/api/generate') else f"{base_url.rstrip('/')}/api/generate"
        prompt = (
            f"Analyze the following article.\nTitle: {news_item.title}\n"
            f"Content: {news_item.content_text}\n"
            "Return JSON with 'relevance_score' (0-10), 'summary_ru' (string), and 'anomalies_detected' (list of strings)."
        )
        payload = {
            "model": os.environ.get('OLLAMA_MODEL', 'gpt-oss:120b-cloud'),
            "prompt": prompt,
            "stream": False,
            "format": "json"
        }

        async with aiohttp.ClientSession() as session:
            async with session.post(url, json=payload) as response:
                response.raise_for_status()
                data = await response.json()

        # Ollama returns the generated text under the 'response' key.
        generated_text = data.get('response', '') or "{}"
        parsed_json = self._parse_llm_json(generated_text)

        anomalies = parsed_json.get('anomalies_detected', [])
        if not isinstance(anomalies, list):
            anomalies = []  # tolerate a scalar/absent field from the model

        return EnrichedNewsItemDTO(
            title=news_item.title,
            url=news_item.url,
            content_text=news_item.content_text,
            source=news_item.source,
            timestamp=news_item.timestamp,
            # FIX: clamp so an out-of-range model score (e.g. 11) cannot trip
            # the DTO's ge=0/le=10 validation and abort the whole pipeline run.
            relevance_score=self._clamp_score(parsed_json.get('relevance_score', 0)),
            summary_ru=parsed_json.get('summary_ru', ''),
            anomalies_detected=anomalies
        )

    @staticmethod
    def _strip_code_fence(text: str) -> str:
        """Remove a surrounding ``` / ```json markdown fence, if present."""
        cleaned = text.strip()
        if cleaned.startswith("```json"):
            cleaned = cleaned[7:]
        elif cleaned.startswith("```"):
            cleaned = cleaned[3:]
        if cleaned.endswith("```"):
            cleaned = cleaned[:-3]
        return cleaned.strip()

    @classmethod
    def _parse_llm_json(cls, generated_text: str) -> dict:
        """Parse the model's JSON reply; fall back to an error stub on bad JSON."""
        try:
            parsed = json.loads(cls._strip_code_fence(generated_text))
            return parsed if isinstance(parsed, dict) else {}
        except json.JSONDecodeError:
            return {
                "relevance_score": 0,
                "summary_ru": "Error parsing LLM response: " + generated_text,
                "anomalies_detected": []
            }

    @staticmethod
    def _clamp_score(raw) -> int:
        """Coerce the model's score to an int clamped into [0, 10]; 0 on junk."""
        try:
            score = int(raw)
        except (TypeError, ValueError):
            return 0
        return max(0, min(10, score))
# src/storage/base.py — vector store interface.
from abc import ABC, abstractmethod
from typing import List
from src.processor.dto import EnrichedNewsItemDTO

class IVectorStore(ABC):
    """Abstract interface for a Vector Store."""

    @abstractmethod
    async def store(self, item: EnrichedNewsItemDTO) -> None:
        """Store an item in the vector database."""
        pass

    @abstractmethod
    async def search(self, query: str, limit: int = 5) -> List[EnrichedNewsItemDTO]:
        """Search for items in the vector database."""
        pass


# src/storage/chroma_store.py — ChromaDB-backed IVectorStore implementation.
import uuid
from datetime import datetime

import chromadb
from chromadb.api import ClientAPI

from src.storage.base import IVectorStore

class ChromaStore(IVectorStore):
    """Persist enriched news items in a Chroma collection and query them semantically."""

    def __init__(self, client: ClientAPI, collection_name: str = "news_collection"):
        self.client = client
        self.collection = self.client.get_or_create_collection(name=collection_name)

    async def store(self, item: EnrichedNewsItemDTO) -> None:
        """Upsert one item; the URL-derived UUIDv5 id makes re-stores idempotent."""
        record_id = str(uuid.uuid5(uuid.NAMESPACE_URL, item.url))

        # Chroma metadata values must be str/int/float/bool, so the anomaly
        # list is flattened to a comma-separated string.
        # NOTE(review): an anomaly text containing a comma would split
        # incorrectly on read-back — confirm that cannot occur upstream.
        flat_anomalies = ",".join(item.anomalies_detected) if item.anomalies_detected else ""

        self.collection.upsert(
            ids=[record_id],
            documents=[item.content_text],
            metadatas=[{
                "title": item.title,
                "url": item.url,
                "source": item.source,
                "timestamp": item.timestamp.isoformat(),
                "relevance_score": item.relevance_score,
                "summary_ru": item.summary_ru,
                "anomalies_detected": flat_anomalies,
            }]
        )

    async def search(self, query: str, limit: int = 5) -> List[EnrichedNewsItemDTO]:
        """Semantic search; returns up to `limit` DTOs rebuilt from metadata."""
        results = self.collection.query(
            query_texts=[query],
            n_results=limit
        )

        metadatas = results.get('metadatas')
        if not metadatas or not metadatas[0]:
            return []

        documents = results.get('documents')
        found: List[EnrichedNewsItemDTO] = []
        for idx, metadata in enumerate(metadatas[0]):
            if metadata is None:
                continue
            document = documents[0][idx] if documents and documents[0] else ""
            found.append(self._to_dto(metadata, document))
        return found

    @staticmethod
    def _to_dto(metadata, document) -> EnrichedNewsItemDTO:
        # Reverse of store(): rebuild the DTO from flattened metadata values.
        raw_anomalies = str(metadata.get("anomalies_detected", ""))
        pieces = raw_anomalies.split(",") if raw_anomalies else []
        anomalies = [part.strip() for part in pieces if part.strip()]
        return EnrichedNewsItemDTO(
            title=str(metadata.get("title", "")),
            url=str(metadata.get("url", "")),
            content_text=str(document),
            source=str(metadata.get("source", "")),
            timestamp=datetime.fromisoformat(str(metadata.get("timestamp", ""))),
            relevance_score=int(float(str(metadata.get("relevance_score", 0)))),
            summary_ru=str(metadata.get("summary_ru", "")),
            anomalies_detected=anomalies
        )


# tests/bot/test_handlers.py — handler unit tests against mocked aiogram objects.
import pytest
from unittest.mock import AsyncMock, MagicMock
from aiogram.types import Message

from src.bot.handlers import command_start_handler, command_help_handler, command_latest_handler

@pytest.mark.asyncio
async def test_command_start_handler():
    """/start greets the user by name."""
    message = AsyncMock()
    message.from_user = MagicMock()
    message.from_user.full_name = "Test User"
    message.answer = AsyncMock()

    await command_start_handler(message)

    message.answer.assert_called_once()
    args, kwargs = message.answer.call_args
    assert "Welcome" in args[0] or "Trend-Scout" in args[0]

@pytest.mark.asyncio
async def test_command_help_handler():
    """/help lists the supported commands."""
    message = AsyncMock()
    message.answer = AsyncMock()

    await command_help_handler(message)

    message.answer.assert_called_once()
    args, kwargs = message.answer.call_args
    assert "/start" in args[0]
    assert "/latest" in args[0]

@pytest.mark.asyncio
async def test_command_latest_handler():
    """/latest renders the mock enriched item with score, anomalies and source."""
    message = AsyncMock()
    message.answer = AsyncMock()

    await command_latest_handler(message)

    message.answer.assert_called_once()
    args, kwargs = message.answer.call_args
    response_text = args[0]

    assert "Relevance:" in response_text or "Score:" in response_text or "10" in response_text
    assert "Anomalies:" in response_text or "detected" in response_text.lower()
    assert "Example Domain" in response_text
# tests/crawlers/test_rss_crawler.py — RSSCrawler tests with a mocked HTTP layer.
import pytest
from unittest.mock import AsyncMock, patch, MagicMock
from datetime import datetime, timezone

from src.crawlers.rss_crawler import RSSCrawler
from src.crawlers.dto import NewsItemDTO

# FIX: as committed, this literal had no XML markup at all, so
# ET.fromstring() could never parse it and the test could not pass.
# Reconstructed as a minimal valid RSS 2.0 document matching the
# channel/item values asserted below.
MOCK_RSS = """<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
  <channel>
    <title>Mock Source</title>
    <link>http://mock.source.com</link>
    <description>Mock Source Description</description>
    <item>
      <title>Test Title 1</title>
      <link>http://mock.source.com/1</link>
      <description>Test Content 1</description>
      <pubDate>Wed, 02 Oct 2002 08:00:00 GMT</pubDate>
    </item>
    <item>
      <title>Test Title 2</title>
      <link>http://mock.source.com/2</link>
      <description>Test Content 2</description>
      <pubDate>Thu, 03 Oct 2002 10:00:00 GMT</pubDate>
    </item>
  </channel>
</rss>
"""

@pytest.mark.asyncio
async def test_rss_crawler_fetch_latest():
    """fetch_latest downloads the feed and maps both <item>s to DTOs."""
    url = "http://mock.source.com/rss"
    source = "Mock Source"
    crawler = RSSCrawler(url, source)

    with patch("aiohttp.ClientSession.get") as mock_get:
        # Async mock for the response object returned by session.get(...)
        mock_response = AsyncMock()
        mock_response.text.return_value = MOCK_RSS
        mock_response.raise_for_status = MagicMock()

        # Wire up the 'async with session.get(...)' context manager protocol
        mock_get.return_value.__aenter__.return_value = mock_response

        items = await crawler.fetch_latest()

        mock_get.assert_called_once_with(url)

    assert len(items) == 2

    # First item
    assert isinstance(items[0], NewsItemDTO)
    assert items[0].title == "Test Title 1"
    assert items[0].url == "http://mock.source.com/1"
    assert items[0].content_text == "Test Content 1"
    assert items[0].source == source
    assert items[0].timestamp == datetime(2002, 10, 2, 8, 0, tzinfo=timezone.utc)

    # Second item
    assert isinstance(items[1], NewsItemDTO)
    assert items[1].title == "Test Title 2"
    assert items[1].url == "http://mock.source.com/2"
    assert items[1].content_text == "Test Content 2"
    assert items[1].source == source
    assert items[1].timestamp == datetime(2002, 10, 3, 10, 0, tzinfo=timezone.utc)


# tests/notifications/test_telegram_notifier.py — TelegramNotifier formatting test.
from src.processor.dto import EnrichedNewsItemDTO
from src.notifications.telegram import TelegramNotifier

@pytest.mark.asyncio
async def test_telegram_notifier_sends_message():
    """send_alert posts one HTML message containing title, score and anomalies."""
    # Arrange
    bot_mock = AsyncMock()
    chat_id = "12345"
    notifier = TelegramNotifier(bot=bot_mock, chat_id=chat_id)

    item = EnrichedNewsItemDTO(
        title="Test Title",
        url="http://example.com",
        content_text="Sample content",
        source="Test Source",
        timestamp=datetime.now(),
        relevance_score=9,
        summary_ru="Тестовое саммари",
        anomalies_detected=["Test Anomaly"]
    )

    # Act
    await notifier.send_alert(item)

    # Assert
    bot_mock.send_message.assert_called_once()
    kwargs = bot_mock.send_message.call_args.kwargs
    assert kwargs["chat_id"] == chat_id
    assert kwargs["parse_mode"] == "HTML"
    assert "Test Title" in kwargs["text"]
    assert "9/10" in kwargs["text"]
    assert "Test Anomaly" in kwargs["text"]
# tests/orchestrator/test_service.py — TrendScoutService alerting-logic test.
@pytest.mark.asyncio
async def test_run_iteration():
    """One iteration stores every item but alerts only on high relevance or anomalies."""
    crawler_mock = AsyncMock()
    processor_mock = AsyncMock()
    storage_mock = AsyncMock()
    notifier_mock = AsyncMock()

    now = datetime.now()
    news_item = NewsItemDTO(
        title="Test Title",
        url="http://example.com",
        content_text="Sample text",
        source="Source",
        timestamp=now
    )
    base_fields = news_item.model_dump()

    # Three analysis outcomes: alert-by-score, alert-by-anomaly, no alert.
    high_relevance_item = EnrichedNewsItemDTO(
        **base_fields, relevance_score=8, summary_ru="Summary", anomalies_detected=[]
    )
    anomaly_item = EnrichedNewsItemDTO(
        **base_fields, relevance_score=5, summary_ru="Summary", anomalies_detected=["Anomaly"]
    )
    low_relevance_item = EnrichedNewsItemDTO(
        **base_fields, relevance_score=5, summary_ru="Summary", anomalies_detected=[]
    )

    crawler_mock.fetch_latest.return_value = [news_item, news_item, news_item]
    processor_mock.analyze.side_effect = [high_relevance_item, anomaly_item, low_relevance_item]

    service = TrendScoutService(
        crawlers=[crawler_mock],
        processor=processor_mock,
        storage=storage_mock,
        notifier=notifier_mock
    )

    await service.run_iteration()

    crawler_mock.fetch_latest.assert_called_once()
    assert processor_mock.analyze.call_count == 3
    assert storage_mock.store.call_count == 3
    # Alerts: one for relevance >= 8, one for the anomaly — never for the third.
    assert notifier_mock.send_alert.call_count == 2


# tests/processor/test_ollama_provider.py — OllamaProvider tests with a stub HTTP stack.
import os
from src.processor.ollama_provider import OllamaProvider

@pytest.fixture
def sample_news_item():
    """A minimal raw DTO to feed into analyze()."""
    return NewsItemDTO(
        title="Test News",
        url="http://example.com",
        content_text="This is a test article about AI and NPU acceleration.",
        source="Test Source",
        timestamp=datetime.now()
    )

def create_mock_session(mock_response_json):
    """Build a stand-in for aiohttp.ClientSession whose post() yields the given JSON."""

    class _StubResponse:
        async def __aenter__(self):
            return self

        async def __aexit__(self, exc_type, exc_val, exc_tb):
            pass

        async def json(self):
            return mock_response_json

        def raise_for_status(self):
            pass  # always "200 OK"

    class _StubSession:
        async def __aenter__(self):
            return self

        async def __aexit__(self, exc_type, exc_val, exc_tb):
            pass

        def post(self, url, **kwargs):
            return _StubResponse()

    return _StubSession()

@pytest.mark.asyncio
async def test_ollama_provider_analyze_success(sample_news_item):
    """A well-formed JSON reply maps straight onto the enriched DTO fields."""
    os.environ['OLLAMA_API_URL'] = 'http://localhost:11434/api/generate'
    stub_payload = {
        "response": '{"relevance_score": 8, "summary_ru": "Тестовая статья про ИИ.", "anomalies_detected": ["NPU acceleration"]}'
    }

    provider = OllamaProvider()
    with patch('aiohttp.ClientSession', return_value=create_mock_session(stub_payload)):
        result = await provider.analyze(sample_news_item)

    assert isinstance(result, EnrichedNewsItemDTO)
    assert result.title == "Test News"
    assert result.relevance_score == 8
    assert result.summary_ru == "Тестовая статья про ИИ."
    assert result.anomalies_detected == ["NPU acceleration"]

@pytest.mark.asyncio
async def test_ollama_provider_analyze_empty_response(sample_news_item):
    """An empty model reply degrades to neutral defaults."""
    os.environ['OLLAMA_API_URL'] = 'http://localhost:11434/api/generate'
    stub_payload = {"response": ""}

    provider = OllamaProvider()
    with patch('aiohttp.ClientSession', return_value=create_mock_session(stub_payload)):
        result = await provider.analyze(sample_news_item)

    assert isinstance(result, EnrichedNewsItemDTO)
    assert result.relevance_score == 0
    assert result.summary_ru == ""
    assert result.anomalies_detected == []

@pytest.mark.asyncio
async def test_ollama_provider_analyze_malformed_json(sample_news_item):
    """Unparsable JSON is surfaced in summary_ru rather than raising."""
    os.environ['OLLAMA_API_URL'] = 'http://localhost:11434/api/generate'
    stub_payload = {"response": "{ invalid json"}

    provider = OllamaProvider()
    with patch('aiohttp.ClientSession', return_value=create_mock_session(stub_payload)):
        result = await provider.analyze(sample_news_item)

    assert isinstance(result, EnrichedNewsItemDTO)
    assert result.relevance_score == 0
    assert "Error parsing LLM response" in result.summary_ru
    assert result.anomalies_detected == []

@pytest.mark.asyncio
async def test_ollama_provider_analyze_markdown_json(sample_news_item):
    """A ```json fenced reply is unwrapped before parsing."""
    os.environ['OLLAMA_API_URL'] = 'http://localhost:11434/api/generate'
    stub_payload = {
        "response": "```json\n{\"relevance_score\": 5, \"summary_ru\": \"Markdown test\", \"anomalies_detected\": []}\n```"
    }

    provider = OllamaProvider()
    with patch('aiohttp.ClientSession', return_value=create_mock_session(stub_payload)):
        result = await provider.analyze(sample_news_item)

    assert isinstance(result, EnrichedNewsItemDTO)
    assert result.relevance_score == 5
    assert result.summary_ru == "Markdown test"
    assert result.anomalies_detected == []
0000000..aba71ad --- /dev/null +++ b/tests/storage/test_chroma_store.py @@ -0,0 +1,116 @@ +import pytest +import pytest_asyncio +from datetime import datetime, timezone +import chromadb +from chromadb.config import Settings + +from src.processor.dto import EnrichedNewsItemDTO +from src.storage.chroma_store import ChromaStore + +@pytest_asyncio.fixture +async def chroma_store(): + # Use EphemeralClient for in-memory testing + client = chromadb.EphemeralClient(Settings(allow_reset=True)) + client.reset() + store = ChromaStore(client=client, collection_name="test_collection") + yield store + client.reset() + +@pytest.mark.asyncio +async def test_store_and_search(chroma_store: ChromaStore): + # 1. Arrange + item1 = EnrichedNewsItemDTO( + title="Apple announces new M4 chip", + url="https://example.com/apple-m4", + content_text="Apple has announced its newest M4 chip for next generation Macs. This processor brings massive AI improvements.", + source="TechNews", + timestamp=datetime(2023, 11, 1, 12, 0, tzinfo=timezone.utc), + relevance_score=9, + summary_ru="Apple анонсировала новый чип M4.", + anomalies_detected=["NPU acceleration"] + ) + + item2 = EnrichedNewsItemDTO( + title="Local bakery makes giant bread", + url="https://example.com/giant-bread", + content_text="A bakery in town just baked the world's largest loaf of bread, weighing over 1000 pounds.", + source="LocalNews", + timestamp=datetime(2023, 11, 2, 10, 0, tzinfo=timezone.utc), + relevance_score=2, + summary_ru="Местная пекарня испекла гигантский хлеб.", + anomalies_detected=[] + ) + + item3 = EnrichedNewsItemDTO( + title="NVIDIA reveals RTX 5090 with WebGPU support", + url="https://example.com/nvidia-rtx-5090", + content_text="NVIDIA's new RTX 5090 GPU fully accelerates WebGPU workloads for advanced edge AI applications.", + source="GPUWeekly", + timestamp=datetime(2023, 11, 3, 14, 0, tzinfo=timezone.utc), + relevance_score=10, + summary_ru="NVIDIA представила RTX 5090 с поддержкой WebGPU.", + 
anomalies_detected=["WebGPU", "Edge AI"] + ) + + # 2. Act + await chroma_store.store(item1) + await chroma_store.store(item2) + await chroma_store.store(item3) + + # Search for AI and chip related news + search_results = await chroma_store.search("AI processor and GPU", limit=2) + + # 3. Assert + assert len(search_results) == 2 + + # Expected: The Apple M4 chip and NVIDIA RTX 5090 are highly relevant to AI/GPU + titles = [res.title for res in search_results] + assert "NVIDIA reveals RTX 5090 with WebGPU support" in titles + assert "Apple announces new M4 chip" in titles + assert "Local bakery makes giant bread" not in titles + + # Check if properties are correctly restored for one of the items + for res in search_results: + if "NVIDIA" in res.title: + assert res.relevance_score == 10 + assert "WebGPU" in res.anomalies_detected + assert "Edge AI" in res.anomalies_detected + assert "NVIDIA's new RTX 5090" in res.content_text + assert res.source == "GPUWeekly" + +@pytest.mark.asyncio +async def test_search_empty_store(chroma_store: ChromaStore): + results = await chroma_store.search("test query", limit=5) + assert len(results) == 0 + +@pytest.mark.asyncio +async def test_store_upsert(chroma_store: ChromaStore): + item1 = EnrichedNewsItemDTO( + title="Apple announces new M4 chip", + url="https://example.com/apple-m4", + content_text="Apple has announced its newest M4 chip for next generation Macs.", + source="TechNews", + timestamp=datetime(2023, 11, 1, 12, 0, tzinfo=timezone.utc), + relevance_score=9, + summary_ru="Apple анонсировала новый чип M4.", + anomalies_detected=["NPU acceleration"] + ) + + # Store first time + await chroma_store.store(item1) + results = await chroma_store.search("Apple", limit=5) + assert len(results) == 1 + assert results[0].relevance_score == 9 + + # Modify item and store again (same URL, should upsert) + item1_updated = item1.model_copy() + item1_updated.relevance_score = 10 + item1_updated.summary_ru = "Apple анонсировала чип M4. 
Обновлено." + + await chroma_store.store(item1_updated) + results_updated = await chroma_store.search("Apple", limit=5) + + # Should still be 1 item, but updated + assert len(results_updated) == 1 + assert results_updated[0].relevance_score == 10 + assert results_updated[0].summary_ru == "Apple анонсировала чип M4. Обновлено."