feat(crawlers): implement specialized CppConf crawler and AI analysis
- Added CppConfCrawler using aiohttp and regex to parse Next.js JSON data, skipping the Playwright bottleneck. - Added C++-specific prompts to OllamaProvider for trend analysis (identifying C++26, memory safety, coroutines). - Created offline pytest fixtures and TDD unit tests for the parser. - Created an end-to-end pipeline test mapping Crawler -> AI Processor -> Vector DB.
This commit is contained in:
parent
a0eeba0918
commit
a363ca41cf
@ -46,10 +46,9 @@ crawlers:
|
|||||||
- type: rss
|
- type: rss
|
||||||
url: "https://blog.google/products-and-platforms/platforms/android/rss/"
|
url: "https://blog.google/products-and-platforms/platforms/android/rss/"
|
||||||
source: "Google Android Blog"
|
source: "Google Android Blog"
|
||||||
- type: playwright
|
- type: cppconf
|
||||||
url: "https://cppconf.ru/en/talks/"
|
url: "https://cppconf.ru/en/talks/"
|
||||||
source: "C++ Russia"
|
source: "C++ Russia"
|
||||||
selector: "div.talk-item"
|
|
||||||
- type: playwright
|
- type: playwright
|
||||||
url: "https://2025.ieee-icra.org/media/"
|
url: "https://2025.ieee-icra.org/media/"
|
||||||
source: "ICRA 2025"
|
source: "ICRA 2025"
|
||||||
|
|||||||
106
src/crawlers/cppconf_crawler.py
Normal file
106
src/crawlers/cppconf_crawler.py
Normal file
@ -0,0 +1,106 @@
|
|||||||
|
import json
|
||||||
|
import re
|
||||||
|
import asyncio
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from typing import List
|
||||||
|
import aiohttp
|
||||||
|
|
||||||
|
from .base import ICrawler
|
||||||
|
from .dto import NewsItemDTO
|
||||||
|
|
||||||
|
class CppConfNextJsParser:
    """Parses talks out of the Next.js ``__NEXT_DATA__`` JSON embedded in cppconf.ru pages.

    Reading the server-rendered JSON directly avoids driving a headless
    browser (Playwright) just to recover data already present in the HTML.
    """

    # Compiled once at class level instead of on every _clean_html call.
    _TAG_RE = re.compile(r'<.*?>')

    def _clean_html(self, raw_html: str) -> str:
        """Strip HTML tags from *raw_html* and collapse all whitespace runs."""
        if not raw_html:
            return ""
        # Replace tags with a space so words separated only by tags don't fuse.
        cleantext = self._TAG_RE.sub(' ', raw_html)
        return ' '.join(cleantext.split())

    def _localized(self, field) -> str:
        """Return the English value of an {'en': ..., 'ru': ...} mapping, falling back to Russian.

        Tolerates the field being absent or explicitly null in the JSON
        (``talk.get(key)`` then yields None, which ``.get`` would choke on).
        """
        field = field or {}
        return field.get("en") or field.get("ru") or ""

    def parse_talks(self, html: str) -> List[NewsItemDTO]:
        """Extract all non-service talks from a cppconf.ru schedule page.

        Returns an empty list (never raises) when the ``__NEXT_DATA__``
        script tag or the expected JSON structure is missing.
        """
        match = re.search(
            r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
            html,
        )
        if not match:
            return []

        try:
            data = json.loads(match.group(1))
            talks_by_day = data.get("props", {}).get("pageProps", {}).get("talksByDay", [])
        except (json.JSONDecodeError, KeyError, TypeError):
            return []

        talks: List[NewsItemDTO] = []
        for day in talks_by_day:
            if "talks" not in day:
                continue
            for row in day["talks"]:
                # row[0] is the time-slot info; row[1] holds the talks at that time.
                if len(row) < 2:
                    continue
                for talk in row[1]:
                    # Skip breaks/announcements and entries without a name.
                    if talk.get("isServiceTalk", False) or not talk.get("name"):
                        continue
                    talks.append(self._build_item(talk))

        return talks

    def _build_item(self, talk: dict) -> NewsItemDTO:
        """Convert one raw talk dict into a NewsItemDTO."""
        title = self._localized(talk["name"]) or "Unknown Title"
        url = f"https://cppconf.ru/en/talks/{talk.get('id', '')}/"

        # Prefer the talk's own start time; default to "now" if absent/unparsable.
        time_str = talk.get("time") or talk.get("talkStartTime")
        timestamp = datetime.now(timezone.utc)
        if time_str:
            try:
                # Typical format: "2026-05-07T09:00:00Z"
                timestamp = datetime.fromisoformat(time_str.replace("Z", "+00:00"))
            except ValueError:
                pass

        # Descriptions may be missing or explicitly null; _localized handles both.
        short_desc = self._clean_html(self._localized(talk.get("shortDescription")))
        long_desc = self._clean_html(self._localized(talk.get("longDescription")))
        desc = f"{short_desc} {long_desc}"

        speakers = []
        for speaker in talk.get("speakers", []):
            name = self._localized(speaker.get("name"))
            if name:
                speakers.append(name)

        speaker_str = f"Speakers: {', '.join(speakers)}. " if speakers else ""
        content_text = f"{speaker_str}{desc}".strip()
        if not content_text:
            content_text = "No description available."

        return NewsItemDTO(
            title=title,
            url=url,
            content_text=content_text,
            source="cppconf",
            timestamp=timestamp,
        )
|
||||||
|
|
||||||
|
class CppConfCrawler(ICrawler):
    """Fetches the cppconf.ru talks page over plain HTTP and parses the embedded JSON.

    Unlike the Playwright-based crawlers, this one needs no browser: the
    schedule lives in the page's ``__NEXT_DATA__`` script tag.
    """

    def __init__(self, url: str, source: str = "cppconf"):
        self.url = url
        self.source = source
        self.parser = CppConfNextJsParser()

    async def fetch_latest(self) -> List[NewsItemDTO]:
        """Download the schedule page and return its talks tagged with this crawler's source.

        Returns an empty list on any non-200 HTTP status.
        """
        async with aiohttp.ClientSession() as session:
            async with session.get(self.url) as response:
                if response.status != 200:
                    return []
                page_html = await response.text()
                items = self.parser.parse_talks(page_html)
                # The parser tags items "cppconf" by default; re-tag with the
                # source name this crawler was configured with.
                for item in items:
                    item.source = self.source
                return items
|
||||||
@ -4,6 +4,7 @@ from typing import List
|
|||||||
from src.crawlers.base import ICrawler
|
from src.crawlers.base import ICrawler
|
||||||
from src.crawlers.rss_crawler import RSSCrawler
|
from src.crawlers.rss_crawler import RSSCrawler
|
||||||
from src.crawlers.playwright_crawler import PlaywrightCrawler
|
from src.crawlers.playwright_crawler import PlaywrightCrawler
|
||||||
|
from src.crawlers.cppconf_crawler import CppConfCrawler
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@ -36,6 +37,8 @@ class CrawlerFactory:
|
|||||||
elif crawler_type == 'playwright':
|
elif crawler_type == 'playwright':
|
||||||
selector = item.get('selector')
|
selector = item.get('selector')
|
||||||
crawlers.append(PlaywrightCrawler(url=url, source=source, selector=selector))
|
crawlers.append(PlaywrightCrawler(url=url, source=source, selector=selector))
|
||||||
|
elif crawler_type == 'cppconf':
|
||||||
|
crawlers.append(CppConfCrawler(url=url, source=source))
|
||||||
else:
|
else:
|
||||||
logger.warning(f"Unknown crawler type: {crawler_type}")
|
logger.warning(f"Unknown crawler type: {crawler_type}")
|
||||||
|
|
||||||
|
|||||||
@ -21,30 +21,42 @@ class OllamaProvider(ILLMProvider):
|
|||||||
base_url = os.environ.get('OLLAMA_API_URL', 'http://localhost:11434')
|
base_url = os.environ.get('OLLAMA_API_URL', 'http://localhost:11434')
|
||||||
url = base_url if base_url.endswith(
|
url = base_url if base_url.endswith(
|
||||||
'/api/generate') else f"{base_url.rstrip('/')}/api/generate"
|
'/api/generate') else f"{base_url.rstrip('/')}/api/generate"
|
||||||
prompt = (
|
if news_item.source in ["C++ Russia", "cppconf"]:
|
||||||
"Act as a Strategic Tech Scout for an R&D department specializing in WebEngine (Chromium) extensions, "
|
prompt = (
|
||||||
"cross-platform porting, Middleware platform solutions, and System Tools (SWE) for developers.\n\n"
|
"Analyze this C++ conference talk abstract. Extract the primary C++ trends discussed "
|
||||||
f"Analyze the following article.\nTitle: {news_item.title}\nContent: {news_item.content_text}\n\n"
|
"(e.g., C++20/26 concepts, memory safety, coroutines, heterogeneous computing).\n\n"
|
||||||
|
f"Title: {news_item.title}\nContent: {news_item.content_text}\n\n"
|
||||||
|
"Return a JSON object strictly with these keys:\n"
|
||||||
|
"1. 'relevance_score' (integer 0-10): Indicate its importance to the modern C++ ecosystem.\n"
|
||||||
|
"2. 'summary_ru' (string): A concise 2-sentence summary in Russian.\n"
|
||||||
|
"3. 'anomalies_detected' (list of strings): Any bleeding-edge tech, controversial topics, or Rust comparisons.\n"
|
||||||
|
"4. 'category' (string): Must be exactly 'C++ Trends'.\n"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
prompt = (
|
||||||
|
"Act as a Strategic Tech Scout for an R&D department specializing in WebEngine (Chromium) extensions, "
|
||||||
|
"cross-platform porting, Middleware platform solutions, and System Tools (SWE) for developers.\n\n"
|
||||||
|
f"Analyze the following article.\nTitle: {news_item.title}\nContent: {news_item.content_text}\n\n"
|
||||||
|
|
||||||
"Return a JSON object with: 'relevance_score' (integer 0-10), 'summary_ru' (string), "
|
"Return a JSON object with: 'relevance_score' (integer 0-10), 'summary_ru' (string), "
|
||||||
"'anomalies_detected' (list of strings), and 'category' (string).\n\n"
|
"'anomalies_detected' (list of strings), and 'category' (string).\n\n"
|
||||||
|
|
||||||
"OUTPUT RULES:\n"
|
"OUTPUT RULES:\n"
|
||||||
"1. 'summary_ru': MUST be in Russian and strictly NO MORE than 2 sentences. Focus on the technological or business value for an R&D team.\n"
|
"1. 'summary_ru': MUST be in Russian and strictly NO MORE than 2 sentences. Focus on the technological or business value for an R&D team.\n"
|
||||||
"2. 'category': Must be exactly one of: 'Browsers', 'Edge AI', 'SmartTV', 'Samsung New Technologies', 'Middleware new trends', 'Competitors', 'Other'.\n\n"
|
"2. 'category': Must be exactly one of: 'Browsers', 'Edge AI', 'SmartTV', 'Samsung New Technologies', 'Middleware new trends', 'Competitors', 'Other'.\n\n"
|
||||||
|
|
||||||
"SCORING LOGIC ('relevance_score'):\n"
|
"SCORING LOGIC ('relevance_score'):\n"
|
||||||
"- Score 9-10 (Core R&D): Breakthroughs in web rendering engines, new cross-platform porting frameworks, Edge AI/NPU integration at the middleware level, or disruptive software developer tools (SWE).\n"
|
"- Score 9-10 (Core R&D): Breakthroughs in web rendering engines, new cross-platform porting frameworks, Edge AI/NPU integration at the middleware level, or disruptive software developer tools (SWE).\n"
|
||||||
"- Score 7-8 (Ecosystem): Technologies highly applicable to Automotive Content Platforms, IoT ecosystems (like LG LUPA), or major SmartTV OS updates.\n"
|
"- Score 7-8 (Ecosystem): Technologies highly applicable to Automotive Content Platforms, IoT ecosystems (like LG LUPA), or major SmartTV OS updates.\n"
|
||||||
"- Score 3-6 (Peripheral): General news in Robotics, Medical Displays, or HVAC.\n"
|
"- Score 3-6 (Peripheral): General news in Robotics, Medical Displays, or HVAC.\n"
|
||||||
"- Score 0 (Excluded): Pure Audio/Acoustic technologies, or consumer-level updates about standalone laptops.\n\n"
|
"- Score 0 (Excluded): Pure Audio/Acoustic technologies, or consumer-level updates about standalone laptops.\n\n"
|
||||||
|
|
||||||
"ANOMALY DETECTION ('anomalies_detected'):\n"
|
"ANOMALY DETECTION ('anomalies_detected'):\n"
|
||||||
"Do not just summarize. Look for strategic or architectural disruptions. Examples of valid anomalies: "
|
"Do not just summarize. Look for strategic or architectural disruptions. Examples of valid anomalies: "
|
||||||
"a competitor abandoning a proprietary OS for Chromium, sudden new industry standards in IoT/Middleware, "
|
"a competitor abandoning a proprietary OS for Chromium, sudden new industry standards in IoT/Middleware, "
|
||||||
"or unexpected convergence of WebTech with hardware (e.g., Medical/Automotive). "
|
"or unexpected convergence of WebTech with hardware (e.g., Medical/Automotive). "
|
||||||
"Ignore technical text formatting issues. Return an empty list [] if no strategic anomalies are found."
|
"Ignore technical text formatting issues. Return an empty list [] if no strategic anomalies are found."
|
||||||
)
|
)
|
||||||
payload = {
|
payload = {
|
||||||
"model": os.environ.get('OLLAMA_MODEL', 'gpt-oss:120b-cloud'),
|
"model": os.environ.get('OLLAMA_MODEL', 'gpt-oss:120b-cloud'),
|
||||||
"prompt": prompt,
|
"prompt": prompt,
|
||||||
|
|||||||
23
tests/crawlers/test_cppconf.py
Normal file
23
tests/crawlers/test_cppconf.py
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
import pytest
|
||||||
|
from datetime import datetime
|
||||||
|
from src.crawlers.cppconf_crawler import CppConfNextJsParser
|
||||||
|
from src.crawlers.dto import NewsItemDTO
|
||||||
|
|
||||||
|
@pytest.fixture
def cppconf_html():
    """Offline HTML snapshot of the cppconf.ru talks page (contains the __NEXT_DATA__ JSON)."""
    with open("tests/fixtures/cppconf/talks.html", "r", encoding="utf-8") as f:
        return f.read()
|
||||||
|
|
||||||
|
def test_cppconf_parser(cppconf_html):
    """TDD unit test: the parser extracts well-formed NewsItemDTOs from the offline fixture."""
    parser = CppConfNextJsParser()
    talks = parser.parse_talks(cppconf_html)

    assert len(talks) > 0, "Should extract at least one talk"

    first_talk = talks[0]
    assert isinstance(first_talk, NewsItemDTO)
    assert len(first_talk.title) > 0
    # URLs are built from the English talks base plus the talk id.
    assert first_talk.url.startswith("https://cppconf.ru/en/talks/")
    assert len(first_talk.content_text) > 0
    assert first_talk.source == "cppconf"
    assert isinstance(first_talk.timestamp, datetime)
|
||||||
1
tests/fixtures/cppconf/talks.html
vendored
Normal file
1
tests/fixtures/cppconf/talks.html
vendored
Normal file
File diff suppressed because one or more lines are too long
68
tests/test_cppconf_pipeline.py
Normal file
68
tests/test_cppconf_pipeline.py
Normal file
@ -0,0 +1,68 @@
|
|||||||
|
import pytest
|
||||||
|
import chromadb
|
||||||
|
from unittest.mock import AsyncMock, patch
|
||||||
|
from src.crawlers.cppconf_crawler import CppConfCrawler
|
||||||
|
from src.processor.ollama_provider import OllamaProvider
|
||||||
|
from src.storage.chroma_store import ChromaStore
|
||||||
|
|
||||||
|
@pytest.fixture
def cppconf_html():
    """Offline HTML snapshot of the cppconf.ru talks page (contains the __NEXT_DATA__ JSON)."""
    with open("tests/fixtures/cppconf/talks.html", "r", encoding="utf-8") as f:
        return f.read()
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_cppconf_e2e_pipeline(cppconf_html):
    """End-to-end pipeline test: Crawler -> AI Processor -> Vector DB, fully offline.

    All network I/O (aiohttp GET to cppconf.ru, POST to Ollama) is mocked;
    ChromaDB runs as an in-memory client.
    """
    import json  # hoisted here instead of mid-function between mock statements

    # 1. Crawler: serve the offline fixture instead of hitting cppconf.ru.
    crawler = CppConfCrawler(url="https://cppconf.ru/en/talks/", source="C++ Russia")

    with patch("aiohttp.ClientSession.get") as mock_get:
        mock_response = AsyncMock()
        mock_response.status = 200
        mock_response.text.return_value = cppconf_html
        mock_get.return_value.__aenter__.return_value = mock_response

        talks = await crawler.fetch_latest()

    assert len(talks) > 0
    talk = talks[0]
    # The configured source name must override the parser's default tag.
    assert talk.source == "C++ Russia"
    assert "https://cppconf.ru/en/talks/" in talk.url

    # 2. AI Processor: stub the Ollama /api/generate response.
    provider = OllamaProvider()

    mock_llm_response = {
        "relevance_score": 9,
        "summary_ru": "Этот доклад обсуждает новые фичи C++26 и их влияние на производительность. Показаны примеры использования концептов и корутин.",
        "anomalies_detected": ["Сравнение производительности с Rust"],
        "category": "C++ Trends"
    }

    with patch("aiohttp.ClientSession.post") as mock_post:
        mock_llm_post_response = AsyncMock()
        mock_llm_post_response.raise_for_status = AsyncMock()
        mock_llm_post_response.json.return_value = {"response": json.dumps(mock_llm_response)}
        mock_post.return_value.__aenter__.return_value = mock_llm_post_response

        enriched_talk = await provider.analyze(talk)

    assert enriched_talk.relevance_score == 9
    assert "Rust" in enriched_talk.anomalies_detected[0]
    assert enriched_talk.category == "C++ Trends"

    # 3. Vector DB: in-memory Chroma client with a test-only collection.
    client = chromadb.Client()
    store = ChromaStore(client=client, collection_name="test_cppconf_collection")

    await store.store(enriched_talk)

    # Stored item must be findable by exact URL and by semantic search.
    exists = await store.exists(enriched_talk.url)
    assert exists is True

    results = await store.search("C++26 features", limit=1)
    assert len(results) == 1
    assert results[0].relevance_score == 9
    assert results[0].url == enriched_talk.url
|
||||||
Loading…
x
Reference in New Issue
Block a user