feat(crawlers): implement specialized CppConf crawler and AI analysis
- Added CppConfCrawler using aiohttp and regex to parse Next.js JSON data, skipping the Playwright bottleneck. - Added C++ specific prompts to OllamaProvider for trend analysis (identifying C++26, memory safety, coroutines). - Created offline pytest fixtures and TDD unit tests for the parser. - Created end-to-end pipeline test mapping Crawler -> AI Processor -> Vector DB.
This commit is contained in:
parent
a0eeba0918
commit
a363ca41cf
@ -46,10 +46,9 @@ crawlers:
|
||||
- type: rss
|
||||
url: "https://blog.google/products-and-platforms/platforms/android/rss/"
|
||||
source: "Google Android Blog"
|
||||
- type: playwright
|
||||
- type: cppconf
|
||||
url: "https://cppconf.ru/en/talks/"
|
||||
source: "C++ Russia"
|
||||
selector: "div.talk-item"  # NOTE(review): unused — CppConfCrawler parses the __NEXT_DATA__ JSON and takes no selector; consider removing
|
||||
- type: playwright
|
||||
url: "https://2025.ieee-icra.org/media/"
|
||||
source: "ICRA 2025"
|
||||
|
||||
106
src/crawlers/cppconf_crawler.py
Normal file
106
src/crawlers/cppconf_crawler.py
Normal file
@ -0,0 +1,106 @@
|
||||
import json
|
||||
import re
|
||||
import asyncio
|
||||
from datetime import datetime, timezone
|
||||
from typing import List
|
||||
import aiohttp
|
||||
|
||||
from .base import ICrawler
|
||||
from .dto import NewsItemDTO
|
||||
|
||||
class CppConfNextJsParser:
    """Extract talk entries from the ``__NEXT_DATA__`` JSON blob embedded in
    the server-rendered cppconf.ru talks page.

    Parsing the inlined Next.js payload avoids driving a headless browser
    for a page whose data already ships with the initial HTML response.
    """

    # Compiled once at class-definition time instead of per call.
    # Non-greedy match strips individual HTML tags.
    _TAG_RE = re.compile(r"<.*?>")
    # Anchors the Next.js data island. DOTALL lets the payload span
    # newlines; minified JSON is usually one line, but this is free
    # robustness against pretty-printed output.
    _NEXT_DATA_RE = re.compile(
        r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
        re.DOTALL,
    )

    def _clean_html(self, raw_html: str) -> str:
        """Return *raw_html* with tags stripped and whitespace collapsed."""
        if not raw_html:
            return ""
        # Tags are replaced by a space so adjacent words do not fuse.
        text = self._TAG_RE.sub(" ", raw_html)
        return " ".join(text.split())

    @staticmethod
    def _localized(value, default: str = "") -> str:
        """Pick English text from a Next.js i18n dict, falling back to
        Russian, then *default*.

        Tolerates ``None`` and non-dict values: the feed carries JSON
        ``null`` for fields that were never filled in, which previously
        raised ``AttributeError`` on the chained ``.get`` calls.
        """
        if not isinstance(value, dict):
            return default
        return value.get("en") or value.get("ru") or default

    def parse_talks(self, html: str) -> "List[NewsItemDTO]":
        """Parse the talks page HTML into ``NewsItemDTO`` items.

        Returns an empty list when the Next.js payload is missing or
        malformed rather than raising, so one bad fetch cannot abort a
        whole crawl cycle.
        """
        match = self._NEXT_DATA_RE.search(html)
        if not match:
            return []

        try:
            data = json.loads(match.group(1))
            talks_by_day = data.get("props", {}).get("pageProps", {}).get("talksByDay", [])
        except (json.JSONDecodeError, KeyError, TypeError):
            return []

        talks = []
        for day in talks_by_day:
            if "talks" not in day:
                continue

            for row in day["talks"]:
                # Each row is [time_slot, [talk, ...]]; anything shorter
                # carries no talk payload.
                if len(row) < 2:
                    continue

                for talk in row[1]:
                    # Service talks are breaks/announcements, not content.
                    if talk.get("isServiceTalk", False) or not talk.get("name"):
                        continue

                    title = self._localized(talk["name"], "Unknown Title")
                    url = f"https://cppconf.ru/en/talks/{talk.get('id', '')}/"

                    # Timestamp: prefer the explicit talk time; fall back
                    # to "now" so downstream ordering still works.
                    time_str = talk.get("time") or talk.get("talkStartTime")
                    timestamp = datetime.now(timezone.utc)
                    if time_str:
                        try:
                            # Format usually "2026-05-07T09:00:00Z".
                            timestamp = datetime.fromisoformat(time_str.replace("Z", "+00:00"))
                        except ValueError:
                            pass  # keep the "now" fallback on odd formats

                    # Text content: en-with-ru-fallback is now applied
                    # consistently (an empty "en" string no longer blocks
                    # the Russian text, matching the title's behavior).
                    parts = (
                        self._clean_html(self._localized(talk.get("shortDescription"))),
                        self._clean_html(self._localized(talk.get("longDescription"))),
                    )
                    desc = " ".join(p for p in parts if p)

                    # Speakers: collect localized names, skipping entries
                    # without one.
                    speakers = [
                        name
                        for name in (
                            self._localized(speaker.get("name"))
                            for speaker in talk.get("speakers", [])
                        )
                        if name
                    ]

                    speaker_str = f"Speakers: {', '.join(speakers)}. " if speakers else ""
                    content_text = f"{speaker_str}{desc}".strip()

                    # Only keep talks with decent content.
                    if not content_text:
                        content_text = "No description available."

                    talks.append(
                        NewsItemDTO(
                            title=title,
                            url=url,
                            content_text=content_text,
                            source="cppconf",
                            timestamp=timestamp,
                        )
                    )

        return talks
|
||||
|
||||
class CppConfCrawler(ICrawler):
    """Crawler for cppconf.ru talk listings.

    Fetches the talks page over plain HTTP and delegates extraction to
    :class:`CppConfNextJsParser`; no headless browser is needed because
    the data is embedded in the server-rendered HTML.
    """

    # Hard cap on a single page fetch so a stalled server cannot hang
    # the whole crawl cycle.
    _REQUEST_TIMEOUT_S = 30

    def __init__(self, url: str, source: str = "cppconf"):
        self.url = url          # talks page URL, e.g. https://cppconf.ru/en/talks/
        self.source = source    # label stamped onto every emitted item
        self.parser = CppConfNextJsParser()

    async def fetch_latest(self) -> "List[NewsItemDTO]":
        """Fetch and parse the talks page.

        Returns an empty list on non-200 responses, timeouts, or network
        errors instead of raising, matching the parser's fail-soft policy
        (previously an aiohttp.ClientError would crash the crawl).
        """
        timeout = aiohttp.ClientTimeout(total=self._REQUEST_TIMEOUT_S)
        try:
            async with aiohttp.ClientSession(timeout=timeout) as session:
                async with session.get(self.url) as response:
                    if response.status != 200:
                        return []
                    html = await response.text()
        except (aiohttp.ClientError, asyncio.TimeoutError):
            # Transient network failure: skip this cycle rather than
            # aborting the crawler scheduler.
            return []

        talks = self.parser.parse_talks(html)
        # Re-label items with the configured source name (the parser
        # stamps a generic "cppconf").
        for talk in talks:
            talk.source = self.source
        return talks
|
||||
@ -4,6 +4,7 @@ from typing import List
|
||||
from src.crawlers.base import ICrawler
|
||||
from src.crawlers.rss_crawler import RSSCrawler
|
||||
from src.crawlers.playwright_crawler import PlaywrightCrawler
|
||||
from src.crawlers.cppconf_crawler import CppConfCrawler
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@ -36,6 +37,8 @@ class CrawlerFactory:
|
||||
elif crawler_type == 'playwright':
|
||||
selector = item.get('selector')
|
||||
crawlers.append(PlaywrightCrawler(url=url, source=source, selector=selector))
|
||||
elif crawler_type == 'cppconf':
|
||||
crawlers.append(CppConfCrawler(url=url, source=source))
|
||||
else:
|
||||
logger.warning(f"Unknown crawler type: {crawler_type}")
|
||||
|
||||
|
||||
@ -21,30 +21,42 @@ class OllamaProvider(ILLMProvider):
|
||||
base_url = os.environ.get('OLLAMA_API_URL', 'http://localhost:11434')
|
||||
url = base_url if base_url.endswith(
|
||||
'/api/generate') else f"{base_url.rstrip('/')}/api/generate"
|
||||
prompt = (
|
||||
"Act as a Strategic Tech Scout for an R&D department specializing in WebEngine (Chromium) extensions, "
|
||||
"cross-platform porting, Middleware platform solutions, and System Tools (SWE) for developers.\n\n"
|
||||
f"Analyze the following article.\nTitle: {news_item.title}\nContent: {news_item.content_text}\n\n"
|
||||
|
||||
"Return a JSON object with: 'relevance_score' (integer 0-10), 'summary_ru' (string), "
|
||||
"'anomalies_detected' (list of strings), and 'category' (string).\n\n"
|
||||
|
||||
"OUTPUT RULES:\n"
|
||||
"1. 'summary_ru': MUST be in Russian and strictly NO MORE than 2 sentences. Focus on the technological or business value for an R&D team.\n"
|
||||
"2. 'category': Must be exactly one of: 'Browsers', 'Edge AI', 'SmartTV', 'Samsung New Technologies', 'Middleware new trends', 'Competitors', 'Other'.\n\n"
|
||||
|
||||
"SCORING LOGIC ('relevance_score'):\n"
|
||||
"- Score 9-10 (Core R&D): Breakthroughs in web rendering engines, new cross-platform porting frameworks, Edge AI/NPU integration at the middleware level, or disruptive software developer tools (SWE).\n"
|
||||
"- Score 7-8 (Ecosystem): Technologies highly applicable to Automotive Content Platforms, IoT ecosystems (like LG LUPA), or major SmartTV OS updates.\n"
|
||||
"- Score 3-6 (Peripheral): General news in Robotics, Medical Displays, or HVAC.\n"
|
||||
"- Score 0 (Excluded): Pure Audio/Acoustic technologies, or consumer-level updates about standalone laptops.\n\n"
|
||||
|
||||
"ANOMALY DETECTION ('anomalies_detected'):\n"
|
||||
"Do not just summarize. Look for strategic or architectural disruptions. Examples of valid anomalies: "
|
||||
"a competitor abandoning a proprietary OS for Chromium, sudden new industry standards in IoT/Middleware, "
|
||||
"or unexpected convergence of WebTech with hardware (e.g., Medical/Automotive). "
|
||||
"Ignore technical text formatting issues. Return an empty list [] if no strategic anomalies are found."
|
||||
)
|
||||
if news_item.source in ["C++ Russia", "cppconf"]:
|
||||
prompt = (
|
||||
"Analyze this C++ conference talk abstract. Extract the primary C++ trends discussed "
|
||||
"(e.g., C++20/26 concepts, memory safety, coroutines, heterogeneous computing).\n\n"
|
||||
f"Title: {news_item.title}\nContent: {news_item.content_text}\n\n"
|
||||
"Return a JSON object strictly with these keys:\n"
|
||||
"1. 'relevance_score' (integer 0-10): Indicate its importance to the modern C++ ecosystem.\n"
|
||||
"2. 'summary_ru' (string): A concise 2-sentence summary in Russian.\n"
|
||||
"3. 'anomalies_detected' (list of strings): Any bleeding-edge tech, controversial topics, or Rust comparisons.\n"
|
||||
"4. 'category' (string): Must be exactly 'C++ Trends'.\n"
|
||||
)
|
||||
else:
|
||||
prompt = (
|
||||
"Act as a Strategic Tech Scout for an R&D department specializing in WebEngine (Chromium) extensions, "
|
||||
"cross-platform porting, Middleware platform solutions, and System Tools (SWE) for developers.\n\n"
|
||||
f"Analyze the following article.\nTitle: {news_item.title}\nContent: {news_item.content_text}\n\n"
|
||||
|
||||
"Return a JSON object with: 'relevance_score' (integer 0-10), 'summary_ru' (string), "
|
||||
"'anomalies_detected' (list of strings), and 'category' (string).\n\n"
|
||||
|
||||
"OUTPUT RULES:\n"
|
||||
"1. 'summary_ru': MUST be in Russian and strictly NO MORE than 2 sentences. Focus on the technological or business value for an R&D team.\n"
|
||||
"2. 'category': Must be exactly one of: 'Browsers', 'Edge AI', 'SmartTV', 'Samsung New Technologies', 'Middleware new trends', 'Competitors', 'Other'.\n\n"
|
||||
|
||||
"SCORING LOGIC ('relevance_score'):\n"
|
||||
"- Score 9-10 (Core R&D): Breakthroughs in web rendering engines, new cross-platform porting frameworks, Edge AI/NPU integration at the middleware level, or disruptive software developer tools (SWE).\n"
|
||||
"- Score 7-8 (Ecosystem): Technologies highly applicable to Automotive Content Platforms, IoT ecosystems (like LG LUPA), or major SmartTV OS updates.\n"
|
||||
"- Score 3-6 (Peripheral): General news in Robotics, Medical Displays, or HVAC.\n"
|
||||
"- Score 0 (Excluded): Pure Audio/Acoustic technologies, or consumer-level updates about standalone laptops.\n\n"
|
||||
|
||||
"ANOMALY DETECTION ('anomalies_detected'):\n"
|
||||
"Do not just summarize. Look for strategic or architectural disruptions. Examples of valid anomalies: "
|
||||
"a competitor abandoning a proprietary OS for Chromium, sudden new industry standards in IoT/Middleware, "
|
||||
"or unexpected convergence of WebTech with hardware (e.g., Medical/Automotive). "
|
||||
"Ignore technical text formatting issues. Return an empty list [] if no strategic anomalies are found."
|
||||
)
|
||||
payload = {
|
||||
"model": os.environ.get('OLLAMA_MODEL', 'gpt-oss:120b-cloud'),
|
||||
"prompt": prompt,
|
||||
|
||||
23
tests/crawlers/test_cppconf.py
Normal file
23
tests/crawlers/test_cppconf.py
Normal file
@ -0,0 +1,23 @@
|
||||
import pytest
|
||||
from datetime import datetime
|
||||
from src.crawlers.cppconf_crawler import CppConfNextJsParser
|
||||
from src.crawlers.dto import NewsItemDTO
|
||||
|
||||
@pytest.fixture
def cppconf_html():
    """Load the offline HTML snapshot of the cppconf talks page."""
    fixture_path = "tests/fixtures/cppconf/talks.html"
    with open(fixture_path, "r", encoding="utf-8") as handle:
        return handle.read()
|
||||
|
||||
def test_cppconf_parser(cppconf_html):
    """Offline TDD check: the fixture page must yield well-formed DTOs."""
    extracted = CppConfNextJsParser().parse_talks(cppconf_html)

    assert len(extracted) > 0, "Should extract at least one talk"

    sample = extracted[0]
    assert isinstance(sample, NewsItemDTO)
    # Every field the downstream pipeline relies on must be populated.
    assert sample.title
    assert sample.url.startswith("https://cppconf.ru/en/talks/")
    assert sample.content_text
    assert sample.source == "cppconf"
    assert isinstance(sample.timestamp, datetime)
|
||||
1
tests/fixtures/cppconf/talks.html
vendored
Normal file
1
tests/fixtures/cppconf/talks.html
vendored
Normal file
File diff suppressed because one or more lines are too long
68
tests/test_cppconf_pipeline.py
Normal file
68
tests/test_cppconf_pipeline.py
Normal file
@ -0,0 +1,68 @@
|
||||
import pytest
|
||||
import chromadb
|
||||
from unittest.mock import AsyncMock, patch
|
||||
from src.crawlers.cppconf_crawler import CppConfCrawler
|
||||
from src.processor.ollama_provider import OllamaProvider
|
||||
from src.storage.chroma_store import ChromaStore
|
||||
|
||||
@pytest.fixture
def cppconf_html():
    """Offline snapshot of the cppconf talks page, captured for replay."""
    handle = open("tests/fixtures/cppconf/talks.html", "r", encoding="utf-8")
    try:
        return handle.read()
    finally:
        handle.close()
|
||||
|
||||
@pytest.mark.asyncio
async def test_cppconf_e2e_pipeline(cppconf_html):
    """End-to-end pipeline check: Crawler -> AI Processor -> Vector DB.

    Both network hops (the page fetch and the Ollama call) are mocked;
    only the Chroma store runs for real, in-memory.
    """
    # 1. Mock Crawler fetch
    crawler = CppConfCrawler(url="https://cppconf.ru/en/talks/", source="C++ Russia")

    with patch("aiohttp.ClientSession.get") as mock_get:
        mock_response = AsyncMock()
        mock_response.status = 200
        mock_response.text.return_value = cppconf_html
        # session.get(...) is used as an async context manager, so the
        # fake response must be reachable via __aenter__.
        mock_get.return_value.__aenter__.return_value = mock_response

        talks = await crawler.fetch_latest()

        assert len(talks) > 0
        talk = talks[0]
        # The crawler must override the parser's generic "cppconf" label
        # with the configured source name.
        assert talk.source == "C++ Russia"
        assert "https://cppconf.ru/en/talks/" in talk.url

    # 2. Mock AI Processor
    provider = OllamaProvider()

    # Canned LLM verdict exercising the C++-specific prompt contract
    # (category must be exactly 'C++ Trends').
    mock_llm_response = {
        "relevance_score": 9,
        "summary_ru": "Этот доклад обсуждает новые фичи C++26 и их влияние на производительность. Показаны примеры использования концептов и корутин.",
        "anomalies_detected": ["Сравнение производительности с Rust"],
        "category": "C++ Trends"
    }

    with patch("aiohttp.ClientSession.post") as mock_post:
        mock_llm_post_response = AsyncMock()
        # NOTE(review): aiohttp's raise_for_status is a *sync* method;
        # an AsyncMock here returns an un-awaited coroutine. Harmless
        # only if the provider never inspects its result — confirm.
        mock_llm_post_response.raise_for_status = AsyncMock()
        import json
        # Ollama wraps the model output as a JSON string under "response".
        mock_llm_post_response.json.return_value = {"response": json.dumps(mock_llm_response)}
        mock_post.return_value.__aenter__.return_value = mock_llm_post_response

        enriched_talk = await provider.analyze(talk)

    assert enriched_talk.relevance_score == 9
    assert "Rust" in enriched_talk.anomalies_detected[0]
    assert enriched_talk.category == "C++ Trends"

    # 3. Vector DB Store (real, in-memory Chroma client)
    client = chromadb.Client()
    store = ChromaStore(client=client, collection_name="test_cppconf_collection")

    await store.store(enriched_talk)

    # Verify it exists (dedup key is the item URL).
    exists = await store.exists(enriched_talk.url)
    assert exists is True

    # Semantic search should surface the stored talk with its metadata intact.
    results = await store.search("C++26 features", limit=1)
    assert len(results) == 1
    assert results[0].relevance_score == 9
    assert results[0].url == enriched_talk.url
|
||||
Loading…
x
Reference in New Issue
Block a user