diff --git a/src/crawlers.yml b/src/crawlers.yml
index e4fe92b..85449d4 100644
--- a/src/crawlers.yml
+++ b/src/crawlers.yml
@@ -46,10 +46,9 @@ crawlers:
- type: rss
url: "https://blog.google/products-and-platforms/platforms/android/rss/"
source: "Google Android Blog"
- - type: playwright
+ - type: cppconf
url: "https://cppconf.ru/en/talks/"
source: "C++ Russia"
- selector: "div.talk-item"
- type: playwright
url: "https://2025.ieee-icra.org/media/"
source: "ICRA 2025"
diff --git a/src/crawlers/cppconf_crawler.py b/src/crawlers/cppconf_crawler.py
new file mode 100644
index 0000000..41882d8
--- /dev/null
+++ b/src/crawlers/cppconf_crawler.py
@@ -0,0 +1,106 @@
+import json
+import re
+import asyncio
+from datetime import datetime, timezone
+from typing import List
+import aiohttp
+
+from .base import ICrawler
+from .dto import NewsItemDTO
+
+class CppConfNextJsParser:
+ def _clean_html(self, raw_html: str) -> str:
+ if not raw_html:
+ return ""
+ # Remove html tags
+ cleanr = re.compile('<.*?>')
+ cleantext = re.sub(cleanr, ' ', raw_html)
+ # Remove extra whitespace
+ return ' '.join(cleantext.split())
+
+ def parse_talks(self, html: str) -> List[NewsItemDTO]:
+        match = re.search(r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>', html, re.DOTALL)
+ if not match:
+ return []
+
+ try:
+ data = json.loads(match.group(1))
+ talks_by_day = data.get("props", {}).get("pageProps", {}).get("talksByDay", [])
+ except (json.JSONDecodeError, KeyError, TypeError):
+ return []
+
+ talks = []
+ for day in talks_by_day:
+ if "talks" not in day:
+ continue
+
+ for row in day["talks"]:
+ if len(row) < 2:
+ continue
+
+ # row[1] contains the actual list of talks happening at that time
+ for talk in row[1]:
+ if talk.get("isServiceTalk", False) or not talk.get("name"):
+ continue
+
+ title = talk["name"].get("en") or talk["name"].get("ru", "Unknown Title")
+ url = f"https://cppconf.ru/en/talks/{talk.get('id', '')}/"
+
+ # timestamp
+ time_str = talk.get("time") or talk.get("talkStartTime")
+ timestamp = datetime.now(timezone.utc)
+ if time_str:
+ try:
+ # format usually "2026-05-07T09:00:00Z"
+ timestamp = datetime.fromisoformat(time_str.replace("Z", "+00:00"))
+ except ValueError:
+ pass
+
+ # text content
+                    short_desc = (talk.get("shortDescription") or {}).get("en") or (talk.get("shortDescription") or {}).get("ru", "")
+                    long_desc = (talk.get("longDescription") or {}).get("en") or (talk.get("longDescription") or {}).get("ru", "")
+
+ desc = self._clean_html(short_desc) + " " + self._clean_html(long_desc)
+
+ # speakers
+ speakers = []
+ for speaker in talk.get("speakers", []):
+ name = speaker.get("name", {}).get("en") or speaker.get("name", {}).get("ru")
+ if name:
+ speakers.append(name)
+
+ speaker_str = f"Speakers: {', '.join(speakers)}. " if speakers else ""
+ content_text = f"{speaker_str}{desc}".strip()
+
+ # only keep talks with decent content
+ if not content_text:
+ content_text = "No description available."
+
+ talks.append(
+ NewsItemDTO(
+ title=title,
+ url=url,
+ content_text=content_text,
+ source="cppconf",
+ timestamp=timestamp
+ )
+ )
+
+ return talks
+
+class CppConfCrawler(ICrawler):
+ def __init__(self, url: str, source: str = "cppconf"):
+ self.url = url
+ self.source = source
+ self.parser = CppConfNextJsParser()
+
+ async def fetch_latest(self) -> List[NewsItemDTO]:
+ async with aiohttp.ClientSession() as session:
+ async with session.get(self.url) as response:
+ if response.status != 200:
+ return []
+ html = await response.text()
+ talks = self.parser.parse_talks(html)
+ for talk in talks:
+ talk.source = self.source
+ return talks
diff --git a/src/crawlers/factory.py b/src/crawlers/factory.py
index 4aa6a91..07c92c8 100644
--- a/src/crawlers/factory.py
+++ b/src/crawlers/factory.py
@@ -4,6 +4,7 @@ from typing import List
from src.crawlers.base import ICrawler
from src.crawlers.rss_crawler import RSSCrawler
from src.crawlers.playwright_crawler import PlaywrightCrawler
+from src.crawlers.cppconf_crawler import CppConfCrawler
logger = logging.getLogger(__name__)
@@ -36,6 +37,8 @@ class CrawlerFactory:
elif crawler_type == 'playwright':
selector = item.get('selector')
crawlers.append(PlaywrightCrawler(url=url, source=source, selector=selector))
+ elif crawler_type == 'cppconf':
+ crawlers.append(CppConfCrawler(url=url, source=source))
else:
logger.warning(f"Unknown crawler type: {crawler_type}")
diff --git a/src/processor/ollama_provider.py b/src/processor/ollama_provider.py
index 8797a71..52f6f34 100644
--- a/src/processor/ollama_provider.py
+++ b/src/processor/ollama_provider.py
@@ -21,30 +21,42 @@ class OllamaProvider(ILLMProvider):
base_url = os.environ.get('OLLAMA_API_URL', 'http://localhost:11434')
url = base_url if base_url.endswith(
'/api/generate') else f"{base_url.rstrip('/')}/api/generate"
- prompt = (
- "Act as a Strategic Tech Scout for an R&D department specializing in WebEngine (Chromium) extensions, "
- "cross-platform porting, Middleware platform solutions, and System Tools (SWE) for developers.\n\n"
- f"Analyze the following article.\nTitle: {news_item.title}\nContent: {news_item.content_text}\n\n"
-
- "Return a JSON object with: 'relevance_score' (integer 0-10), 'summary_ru' (string), "
- "'anomalies_detected' (list of strings), and 'category' (string).\n\n"
-
- "OUTPUT RULES:\n"
- "1. 'summary_ru': MUST be in Russian and strictly NO MORE than 2 sentences. Focus on the technological or business value for an R&D team.\n"
- "2. 'category': Must be exactly one of: 'Browsers', 'Edge AI', 'SmartTV', 'Samsung New Technologies', 'Middleware new trends', 'Competitors', 'Other'.\n\n"
-
- "SCORING LOGIC ('relevance_score'):\n"
- "- Score 9-10 (Core R&D): Breakthroughs in web rendering engines, new cross-platform porting frameworks, Edge AI/NPU integration at the middleware level, or disruptive software developer tools (SWE).\n"
- "- Score 7-8 (Ecosystem): Technologies highly applicable to Automotive Content Platforms, IoT ecosystems (like LG LUPA), or major SmartTV OS updates.\n"
- "- Score 3-6 (Peripheral): General news in Robotics, Medical Displays, or HVAC.\n"
- "- Score 0 (Excluded): Pure Audio/Acoustic technologies, or consumer-level updates about standalone laptops.\n\n"
-
- "ANOMALY DETECTION ('anomalies_detected'):\n"
- "Do not just summarize. Look for strategic or architectural disruptions. Examples of valid anomalies: "
- "a competitor abandoning a proprietary OS for Chromium, sudden new industry standards in IoT/Middleware, "
- "or unexpected convergence of WebTech with hardware (e.g., Medical/Automotive). "
- "Ignore technical text formatting issues. Return an empty list [] if no strategic anomalies are found."
- )
+ if news_item.source in ["C++ Russia", "cppconf"]:
+ prompt = (
+ "Analyze this C++ conference talk abstract. Extract the primary C++ trends discussed "
+ "(e.g., C++20/26 concepts, memory safety, coroutines, heterogeneous computing).\n\n"
+ f"Title: {news_item.title}\nContent: {news_item.content_text}\n\n"
+ "Return a JSON object strictly with these keys:\n"
+ "1. 'relevance_score' (integer 0-10): Indicate its importance to the modern C++ ecosystem.\n"
+ "2. 'summary_ru' (string): A concise 2-sentence summary in Russian.\n"
+ "3. 'anomalies_detected' (list of strings): Any bleeding-edge tech, controversial topics, or Rust comparisons.\n"
+ "4. 'category' (string): Must be exactly 'C++ Trends'.\n"
+ )
+ else:
+ prompt = (
+ "Act as a Strategic Tech Scout for an R&D department specializing in WebEngine (Chromium) extensions, "
+ "cross-platform porting, Middleware platform solutions, and System Tools (SWE) for developers.\n\n"
+ f"Analyze the following article.\nTitle: {news_item.title}\nContent: {news_item.content_text}\n\n"
+
+ "Return a JSON object with: 'relevance_score' (integer 0-10), 'summary_ru' (string), "
+ "'anomalies_detected' (list of strings), and 'category' (string).\n\n"
+
+ "OUTPUT RULES:\n"
+ "1. 'summary_ru': MUST be in Russian and strictly NO MORE than 2 sentences. Focus on the technological or business value for an R&D team.\n"
+ "2. 'category': Must be exactly one of: 'Browsers', 'Edge AI', 'SmartTV', 'Samsung New Technologies', 'Middleware new trends', 'Competitors', 'Other'.\n\n"
+
+ "SCORING LOGIC ('relevance_score'):\n"
+ "- Score 9-10 (Core R&D): Breakthroughs in web rendering engines, new cross-platform porting frameworks, Edge AI/NPU integration at the middleware level, or disruptive software developer tools (SWE).\n"
+ "- Score 7-8 (Ecosystem): Technologies highly applicable to Automotive Content Platforms, IoT ecosystems (like LG LUPA), or major SmartTV OS updates.\n"
+ "- Score 3-6 (Peripheral): General news in Robotics, Medical Displays, or HVAC.\n"
+ "- Score 0 (Excluded): Pure Audio/Acoustic technologies, or consumer-level updates about standalone laptops.\n\n"
+
+ "ANOMALY DETECTION ('anomalies_detected'):\n"
+ "Do not just summarize. Look for strategic or architectural disruptions. Examples of valid anomalies: "
+ "a competitor abandoning a proprietary OS for Chromium, sudden new industry standards in IoT/Middleware, "
+ "or unexpected convergence of WebTech with hardware (e.g., Medical/Automotive). "
+ "Ignore technical text formatting issues. Return an empty list [] if no strategic anomalies are found."
+ )
payload = {
"model": os.environ.get('OLLAMA_MODEL', 'gpt-oss:120b-cloud'),
"prompt": prompt,
diff --git a/tests/crawlers/test_cppconf.py b/tests/crawlers/test_cppconf.py
new file mode 100644
index 0000000..54ee5bf
--- /dev/null
+++ b/tests/crawlers/test_cppconf.py
@@ -0,0 +1,23 @@
+import pytest
+from datetime import datetime
+from src.crawlers.cppconf_crawler import CppConfNextJsParser
+from src.crawlers.dto import NewsItemDTO
+
+@pytest.fixture
+def cppconf_html():
+ with open("tests/fixtures/cppconf/talks.html", "r", encoding="utf-8") as f:
+ return f.read()
+
+def test_cppconf_parser(cppconf_html):
+ parser = CppConfNextJsParser()
+ talks = parser.parse_talks(cppconf_html)
+
+ assert len(talks) > 0, "Should extract at least one talk"
+
+ first_talk = talks[0]
+ assert isinstance(first_talk, NewsItemDTO)
+ assert len(first_talk.title) > 0
+ assert first_talk.url.startswith("https://cppconf.ru/en/talks/")
+ assert len(first_talk.content_text) > 0
+ assert first_talk.source == "cppconf"
+ assert isinstance(first_talk.timestamp, datetime)
diff --git a/tests/fixtures/cppconf/talks.html b/tests/fixtures/cppconf/talks.html
new file mode 100644
index 0000000..24646de
--- /dev/null
+++ b/tests/fixtures/cppconf/talks.html
@@ -0,0 +1 @@
+<!DOCTYPE html><html><head><title>C++ Russia 2026 | Schedule | Conference for C++ developers</title></head><body><h1>Program is filling up</h1><p>New talks are published weekly. Follow updates or secure your ticket early.</p><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{"talksByDay":[{"talks":[["09:00",[{"id":"reg-0","name":{"en":"Registration"},"isServiceTalk":true},{"id":"modern-cpp26","name":{"en":"Modern C++26 Features","ru":"Новые возможности C++26"},"isServiceTalk":false,"time":"2026-05-07T09:00:00Z","shortDescription":{"en":"<p>An overview of upcoming C++26 features.</p>"},"longDescription":{"en":"<p>Deep dive into reflection, contracts and coroutines.</p>"},"speakers":[{"name":{"en":"Ivan Petrov"},"company":"Techplatform of Yandex City Services"},{"name":{"en":"Anna Smirnova"},"company":"Higher School of Economics in St. Petersburg"}]}]]]}]}}}</script></body></html>
\ No newline at end of file
diff --git a/tests/test_cppconf_pipeline.py b/tests/test_cppconf_pipeline.py
new file mode 100644
index 0000000..e8a9722
--- /dev/null
+++ b/tests/test_cppconf_pipeline.py
@@ -0,0 +1,68 @@
+import pytest
+import chromadb
+from unittest.mock import AsyncMock, patch
+from src.crawlers.cppconf_crawler import CppConfCrawler
+from src.processor.ollama_provider import OllamaProvider
+from src.storage.chroma_store import ChromaStore
+
+@pytest.fixture
+def cppconf_html():
+ with open("tests/fixtures/cppconf/talks.html", "r", encoding="utf-8") as f:
+ return f.read()
+
+@pytest.mark.asyncio
+async def test_cppconf_e2e_pipeline(cppconf_html):
+ # 1. Mock Crawler fetch
+ crawler = CppConfCrawler(url="https://cppconf.ru/en/talks/", source="C++ Russia")
+
+ with patch("aiohttp.ClientSession.get") as mock_get:
+ mock_response = AsyncMock()
+ mock_response.status = 200
+ mock_response.text.return_value = cppconf_html
+ mock_get.return_value.__aenter__.return_value = mock_response
+
+ talks = await crawler.fetch_latest()
+
+ assert len(talks) > 0
+ talk = talks[0]
+ assert talk.source == "C++ Russia"
+ assert "https://cppconf.ru/en/talks/" in talk.url
+
+ # 2. Mock AI Processor
+ provider = OllamaProvider()
+
+ mock_llm_response = {
+ "relevance_score": 9,
+ "summary_ru": "Этот доклад обсуждает новые фичи C++26 и их влияние на производительность. Показаны примеры использования концептов и корутин.",
+ "anomalies_detected": ["Сравнение производительности с Rust"],
+ "category": "C++ Trends"
+ }
+
+ with patch("aiohttp.ClientSession.post") as mock_post:
+ mock_llm_post_response = AsyncMock()
+ mock_llm_post_response.raise_for_status = AsyncMock()
+ import json
+ mock_llm_post_response.json.return_value = {"response": json.dumps(mock_llm_response)}
+ mock_post.return_value.__aenter__.return_value = mock_llm_post_response
+
+ enriched_talk = await provider.analyze(talk)
+
+ assert enriched_talk.relevance_score == 9
+ assert "Rust" in enriched_talk.anomalies_detected[0]
+ assert enriched_talk.category == "C++ Trends"
+
+ # 3. Vector DB Store
+ client = chromadb.Client()
+ store = ChromaStore(client=client, collection_name="test_cppconf_collection")
+
+ await store.store(enriched_talk)
+
+ # Verify it exists
+ exists = await store.exists(enriched_talk.url)
+ assert exists is True
+
+ # Search
+ results = await store.search("C++26 features", limit=1)
+ assert len(results) == 1
+ assert results[0].relevance_score == 9
+ assert results[0].url == enriched_talk.url
\ No newline at end of file