diff --git a/README.md b/README.md new file mode 100644 index 0000000..03cd368 --- /dev/null +++ b/README.md @@ -0,0 +1,109 @@ +# Trend-Scout AI + +**Trend-Scout AI** is an intelligent Telegram bot designed for automated monitoring, analysis, and summarization of technological trends. It was developed to support R&D activities (specifically within the context of LG Electronics R&D Lab in St. Petersburg) by scanning the environment for emerging technologies, competitive benchmarks, and scientific breakthroughs. + +## 🚀 Key Features + +- **Automated Multi-Source Crawling:** Monitors RSS feeds, scientific journals (Nature, Science), IT conferences (CES, CVPR), and corporate newsrooms using Playwright and Scrapy. +- **AI-Powered Analysis:** Utilizes LLMs (via Ollama API) to evaluate the relevance of news articles based on specific R&D landscapes (e.g., WebOS, Chromium, Edge AI). +- **Russian Summarization:** Automatically generates concise summaries in Russian for quick review. +- **Anomaly Detection:** Alerts users when there is a significant surge in mentions of specific technologies (e.g., "WebGPU", "NPU acceleration"). +- **Semantic Search:** Employs a vector database (ChromaDB) to allow searching for trends and news by meaning rather than just keywords. +- **Telegram Interface:** Simple and effective interaction via Telegram for receiving alerts and querying the latest trends. + +## 🏗 Architecture + +The project follows a modular, agent-based architecture designed around SOLID principles and asynchronous I/O: + +1. **Crawler Agent:** Responsible for fetching and parsing data from various sources into standardized DTOs. +2. **AI Processor Agent:** Enriches data by scoring relevance, summarizing content, and detecting technological anomalies using LLMs. +3. **Vector Storage Agent:** Manages persistent storage and semantic retrieval using ChromaDB. +4. 
**Telegram Bot Agent:** Handles user interaction, command processing (`/start`, `/latest`, `/help`), and notification delivery. +5. **Orchestrator:** Coordinates the flow between crawling, processing, and storage in periodic background iterations. + +## 🛠 Tech Stack + +- **Language:** Python 3.12+ +- **Frameworks:** `aiogram` (Telegram Bot), `playwright` (Web Crawling), `pydantic` (Data Validation) +- **Database:** `ChromaDB` (Vector Store) +- **AI/LLM:** `Ollama` (local or cloud models) +- **Testing:** `pytest`, `pytest-asyncio` +- **Environment:** Docker-ready, `.env` for configuration + +## 📋 Prerequisites + +- Python 3.12 or higher +- [Ollama](https://ollama.ai/) installed and running (for AI processing) +- Playwright browsers installed (`playwright install chromium`) + +## ⚙️ Installation & Setup + +1. **Clone the repository:** + ```bash + git clone https://github.com/your-repo/trend-scout-ai.git + cd trend-scout-ai + ``` + +2. **Create and activate a virtual environment:** + ```bash + python -m venv venv + source venv/bin/activate # On Windows: venv\Scripts\activate + ``` + +3. **Install dependencies:** + ```bash + pip install -r requirements.txt + playwright install chromium + ``` + +4. **Configure environment variables:** + Create a `.env` file in the root directory: + ```env + TELEGRAM_BOT_TOKEN=your_bot_token_here + TELEGRAM_CHAT_ID=your_chat_id_here + OLLAMA_API_URL=http://localhost:11434/api/generate + CHROMA_DB_PATH=./chroma_db + ``` + +## 🏃 Usage + +### Start the Bot and Background Crawler +To run the full system (bot + periodic crawler): +```bash +python -m src.main +``` + +### Run Manual Update +To trigger a manual crawl and update of the vector store: +```bash +python update_chroma_store.py +``` + +## 🧪 Testing + +The project maintains high test coverage following TDD principles. 
+ +Run all tests: +```bash +pytest +``` + +Run specific test categories: +```bash +pytest tests/crawlers/ +pytest tests/processor/ +pytest tests/storage/ +``` + +## 📂 Project Structure + +- `src/`: Core application logic. + - `bot/`: Telegram bot handlers and setup. + - `crawlers/`: Web scraping modules and factory. + - `processor/`: LLM integration and prompt logic. + - `storage/`: Vector database operations. + - `orchestrator/`: Main service coordination. +- `tests/`: Comprehensive test suite. +- `docs/`: Architecture Decision Records (ADR) and methodology. +- `chroma_db/`: Persistent vector storage (local). +- `requirements.txt`: Python dependencies. diff --git a/src/processor/ollama_provider.py b/src/processor/ollama_provider.py index 52f6f34..95ac536 100644 --- a/src/processor/ollama_provider.py +++ b/src/processor/ollama_provider.py @@ -32,29 +32,49 @@ class OllamaProvider(ILLMProvider): "3. 'anomalies_detected' (list of strings): Any bleeding-edge tech, controversial topics, or Rust comparisons.\n" "4. 'category' (string): Must be exactly 'C++ Trends'.\n" ) + elif "Scholar" in news_item.source or news_item.source == "SciRate" or "arxiv" in news_item.url.lower(): + prompt = ( + "Act as a Senior Research Scientist and Strategic Tech Scout. Analyze this academic research abstract.\n\n" + f"Title: {news_item.title}\nContent: {news_item.content_text}\n\n" + "Return a JSON object strictly with these keys:\n" + "1. 'relevance_score' (integer 0-10): Score the potential impact on industrial R&D (Edge AI, NPU acceleration, WebEngines).\n" + "2. 'summary_ru' (string): A technical summary in Russian (2-3 sentences). Explain the methodology, core innovation, and practical relevance.\n" + "3. 'anomalies_detected' (list of strings): Identify if this is State-of-the-art (SOTA) improvement, a paradigm shift, or unexpected results.\n" + "4. 
'category' (string): Must be exactly 'Academic/SOTA'.\n\n" + "SCORING GUIDELINES:\n" + "- 9-10: SOTA breakthrough in NPU/AI efficiency, new web standards, or major SWE tool improvements.\n" + "- 7-8: Solid research with clear application in SmartTV, IoT, or Browsers.\n" + "- 4-6: Theoretical work with distant industrial application.\n" + "- 0-3: Out of scope (e.g., pure medicine, social sciences, or consumer electronics reviews).\n" + ) else: prompt = ( "Act as a Strategic Tech Scout for an R&D department specializing in WebEngine (Chromium) extensions, " "cross-platform porting, Middleware platform solutions, and System Tools (SWE) for developers.\n\n" - f"Analyze the following article.\nTitle: {news_item.title}\nContent: {news_item.content_text}\n\n" + f"Analyze the following article or research abstract.\nTitle: {news_item.title}\nContent: {news_item.content_text}\n\n" "Return a JSON object with: 'relevance_score' (integer 0-10), 'summary_ru' (string), " "'anomalies_detected' (list of strings), and 'category' (string).\n\n" "OUTPUT RULES:\n" - "1. 'summary_ru': MUST be in Russian and strictly NO MORE than 2 sentences. Focus on the technological or business value for an R&D team.\n" - "2. 'category': Must be exactly one of: 'Browsers', 'Edge AI', 'SmartTV', 'Samsung New Technologies', 'Middleware new trends', 'Competitors', 'Other'.\n\n" + "1. 'summary_ru': MUST be in Russian and strictly NO MORE than 2-3 sentences. " + "For academic/research papers, provide a technical summary of the methodology, key findings, and potential R&D application. " + "Focus on the technological or business value for an R&D team.\n" + "2. 
'category': Must be exactly one of: 'Browsers', 'Edge AI', 'SmartTV', 'Samsung New Technologies', 'Middleware new trends', 'Competitors', 'Academic/SOTA', 'Other'.\n\n" "SCORING LOGIC ('relevance_score'):\n" - "- Score 9-10 (Core R&D): Breakthroughs in web rendering engines, new cross-platform porting frameworks, Edge AI/NPU integration at the middleware level, or disruptive software developer tools (SWE).\n" - "- Score 7-8 (Ecosystem): Technologies highly applicable to Automotive Content Platforms, IoT ecosystems (like LG LUPA), or major SmartTV OS updates.\n" + "- Score 9-10 (Core R&D): Breakthroughs in web rendering engines, new cross-platform porting frameworks, " + "Edge AI/NPU integration at the middleware level, State-of-the-art (SOTA) research in AI/ML/NPU acceleration, " + "or disruptive software developer tools (SWE).\n" + "- Score 7-8 (Ecosystem): Technologies highly applicable to Automotive Content Platforms, IoT ecosystems, " + "or major SmartTV OS updates. Peer-reviewed research with clear industrial application or architectural improvements.\n" "- Score 3-6 (Peripheral): General news in Robotics, Medical Displays, or HVAC.\n" "- Score 0 (Excluded): Pure Audio/Acoustic technologies, or consumer-level updates about standalone laptops.\n\n" "ANOMALY DETECTION ('anomalies_detected'):\n" - "Do not just summarize. Look for strategic or architectural disruptions. Examples of valid anomalies: " + "Do not just summarize. Look for strategic or architectural disruptions. Examples: " "a competitor abandoning a proprietary OS for Chromium, sudden new industry standards in IoT/Middleware, " - "or unexpected convergence of WebTech with hardware (e.g., Medical/Automotive). " + "unexpected convergence of WebTech with hardware, or research that significantly outperforms current SOTA. " "Ignore technical text formatting issues. Return an empty list [] if no strategic anomalies are found." 
) payload = { diff --git a/tests/processor/test_ollama_provider.py b/tests/processor/test_ollama_provider.py index d75dbe8..4b7e0a3 100644 --- a/tests/processor/test_ollama_provider.py +++ b/tests/processor/test_ollama_provider.py @@ -106,6 +106,28 @@ async def test_ollama_provider_analyze_markdown_json(sample_news_item): assert result.anomalies_detected == [] assert result.category == "Browsers" +@pytest.mark.asyncio +async def test_ollama_provider_academic_content(): + os.environ['OLLAMA_API_URL'] = 'http://localhost:11434/api/generate' + academic_item = NewsItemDTO( + title="Attention Is All You Need", + url="https://arxiv.org/abs/1706.03762", + content_text="The dominant sequence transduction models...", + source="ArXiv", + timestamp=datetime.now() + ) + mock_response_json = { + "response": '{"relevance_score": 10, "summary_ru": "Революционная архитектура Transformer.", "anomalies_detected": ["SOTA"], "category": "Academic/SOTA"}' + } + + provider = OllamaProvider() + with patch('aiohttp.ClientSession', return_value=create_mock_session(mock_response_json)): + result = await provider.analyze(academic_item) + + assert result.relevance_score == 10 + assert result.category == "Academic/SOTA" + assert "Transformer" in result.summary_ru + def test_ollama_provider_get_info(): os.environ['OLLAMA_API_URL'] = 'http://test-url:11434' os.environ['OLLAMA_MODEL'] = 'test-model'