feat(crawlers): implement specialized CppConf crawler and AI analysis

- Added CppConfCrawler using aiohttp and regex to parse Next.js JSON data, skipping the Playwright bottleneck.
- Added C++ specific prompts to OllamaProvider for trend analysis (identifying C++26, memory safety, coroutines).
- Created offline pytest fixtures and TDD unit tests for the parser.
- Created an end-to-end pipeline test covering the Crawler -> AI Processor -> Vector DB flow.
This commit is contained in:
Artur Mukhamadiev 2026-03-15 20:34:28 +03:00
parent a0eeba0918
commit a363ca41cf
7 changed files with 238 additions and 26 deletions

View File

@ -46,10 +46,9 @@ crawlers:
- type: rss
url: "https://blog.google/products-and-platforms/platforms/android/rss/"
source: "Google Android Blog"
- type: playwright
- type: cppconf
url: "https://cppconf.ru/en/talks/"
source: "C++ Russia"
selector: "div.talk-item"
- type: playwright
url: "https://2025.ieee-icra.org/media/"
source: "ICRA 2025"

View File

@ -0,0 +1,106 @@
import json
import re
import asyncio
from datetime import datetime, timezone
from typing import List
import aiohttp
from .base import ICrawler
from .dto import NewsItemDTO
class CppConfNextJsParser:
    """Extracts conference talks from the ``__NEXT_DATA__`` JSON blob that
    Next.js embeds in cppconf.ru pages, so no browser rendering is needed.
    """

    # Compiled once: any HTML tag, non-greedy (used to strip markup).
    _TAG_RE = re.compile(r"<.*?>")
    # Compiled once: the Next.js bootstrap JSON script element. DOTALL lets
    # the payload span multiple lines (a pretty-printed JSON blob would
    # otherwise silently fail to match).
    _NEXT_DATA_RE = re.compile(
        r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
        re.DOTALL,
    )

    @staticmethod
    def _localized(localized: dict, default: str = "") -> str:
        """Pick the English value, falling back to Russian, then *default*."""
        return localized.get("en") or localized.get("ru") or default

    def _clean_html(self, raw_html: str) -> str:
        """Strip HTML tags and collapse whitespace runs to single spaces.

        Returns "" for falsy input (None or empty string).
        """
        if not raw_html:
            return ""
        text = self._TAG_RE.sub(' ', raw_html)
        return ' '.join(text.split())

    def _parse_timestamp(self, time_str) -> datetime:
        """Parse an ISO-8601 timestamp; a trailing 'Z' suffix is tolerated.

        Falls back to the current UTC time when the value is missing or
        malformed, so every DTO always carries an aware datetime.
        """
        if time_str:
            try:
                # Typical format: "2026-05-07T09:00:00Z".
                return datetime.fromisoformat(time_str.replace("Z", "+00:00"))
            except ValueError:
                pass
        return datetime.now(timezone.utc)

    def _build_item(self, talk: dict) -> NewsItemDTO:
        """Map one raw talk dict onto a NewsItemDTO.

        Assumes localized fields (name, descriptions, speaker names) are
        {"en": ..., "ru": ...} dicts — TODO confirm against live payloads.
        """
        title = self._localized(talk.get("name") or {}, "Unknown Title")
        url = f"https://cppconf.ru/en/talks/{talk.get('id', '')}/"
        timestamp = self._parse_timestamp(talk.get("time") or talk.get("talkStartTime"))
        # "or {}" guards against explicit JSON nulls for these keys.
        short_desc = self._localized(talk.get("shortDescription") or {})
        long_desc = self._localized(talk.get("longDescription") or {})
        desc = self._clean_html(short_desc) + " " + self._clean_html(long_desc)
        speakers = [
            name
            for name in (
                self._localized(speaker.get("name") or {})
                for speaker in talk.get("speakers", [])
            )
            if name
        ]
        speaker_str = f"Speakers: {', '.join(speakers)}. " if speakers else ""
        # Never emit an empty body: downstream analysis/embedding expects text.
        content_text = f"{speaker_str}{desc}".strip() or "No description available."
        return NewsItemDTO(
            title=title,
            url=url,
            content_text=content_text,
            source="cppconf",
            timestamp=timestamp,
        )

    def parse_talks(self, html: str) -> List[NewsItemDTO]:
        """Parse the talks page HTML into NewsItemDTO objects.

        Returns [] on any structural failure (no __NEXT_DATA__ script,
        invalid JSON, unexpected shape) rather than raising — crawling
        is best-effort.
        """
        match = self._NEXT_DATA_RE.search(html)
        if not match:
            return []
        try:
            data = json.loads(match.group(1))
            talks_by_day = data.get("props", {}).get("pageProps", {}).get("talksByDay", [])
        except (json.JSONDecodeError, KeyError, TypeError):
            return []
        talks: List[NewsItemDTO] = []
        for day in talks_by_day:
            for row in day.get("talks", []):
                # Each row is (time-slot, [talks at that slot]); anything
                # shorter carries no talk list.
                if len(row) < 2:
                    continue
                for talk in row[1]:
                    # Skip service slots (breaks etc.) and unnamed entries.
                    if talk.get("isServiceTalk", False) or not talk.get("name"):
                        continue
                    talks.append(self._build_item(talk))
        return talks
class CppConfCrawler(ICrawler):
    """Crawler for cppconf.ru talk listings.

    The talk data is embedded in the page's Next.js ``__NEXT_DATA__``
    payload, so a plain HTTP GET is enough — no Playwright/browser is
    needed. Extraction is delegated to CppConfNextJsParser.
    """

    def __init__(self, url: str, source: str = "cppconf"):
        # url: the talks listing page; source: label stamped on every item.
        self.url = url
        self.source = source
        self.parser = CppConfNextJsParser()

    async def fetch_latest(self) -> List[NewsItemDTO]:
        """Download and parse the talks page.

        Best-effort: network errors, timeouts and non-200 responses yield
        an empty list instead of raising, so one broken source cannot
        abort the whole crawl run.
        """
        # Hard cap on the whole request so a stalled server can't hang us.
        timeout = aiohttp.ClientTimeout(total=30)
        try:
            async with aiohttp.ClientSession(timeout=timeout) as session:
                async with session.get(self.url) as response:
                    if response.status != 200:
                        return []
                    html = await response.text()
        except (aiohttp.ClientError, asyncio.TimeoutError):
            return []
        talks = self.parser.parse_talks(html)
        # Override the parser's default "cppconf" label with the configured one.
        for talk in talks:
            talk.source = self.source
        return talks

View File

@ -4,6 +4,7 @@ from typing import List
from src.crawlers.base import ICrawler
from src.crawlers.rss_crawler import RSSCrawler
from src.crawlers.playwright_crawler import PlaywrightCrawler
from src.crawlers.cppconf_crawler import CppConfCrawler
logger = logging.getLogger(__name__)
@ -36,6 +37,8 @@ class CrawlerFactory:
elif crawler_type == 'playwright':
selector = item.get('selector')
crawlers.append(PlaywrightCrawler(url=url, source=source, selector=selector))
elif crawler_type == 'cppconf':
crawlers.append(CppConfCrawler(url=url, source=source))
else:
logger.warning(f"Unknown crawler type: {crawler_type}")

View File

@ -21,30 +21,42 @@ class OllamaProvider(ILLMProvider):
base_url = os.environ.get('OLLAMA_API_URL', 'http://localhost:11434')
url = base_url if base_url.endswith(
'/api/generate') else f"{base_url.rstrip('/')}/api/generate"
prompt = (
"Act as a Strategic Tech Scout for an R&D department specializing in WebEngine (Chromium) extensions, "
"cross-platform porting, Middleware platform solutions, and System Tools (SWE) for developers.\n\n"
f"Analyze the following article.\nTitle: {news_item.title}\nContent: {news_item.content_text}\n\n"
"Return a JSON object with: 'relevance_score' (integer 0-10), 'summary_ru' (string), "
"'anomalies_detected' (list of strings), and 'category' (string).\n\n"
"OUTPUT RULES:\n"
"1. 'summary_ru': MUST be in Russian and strictly NO MORE than 2 sentences. Focus on the technological or business value for an R&D team.\n"
"2. 'category': Must be exactly one of: 'Browsers', 'Edge AI', 'SmartTV', 'Samsung New Technologies', 'Middleware new trends', 'Competitors', 'Other'.\n\n"
"SCORING LOGIC ('relevance_score'):\n"
"- Score 9-10 (Core R&D): Breakthroughs in web rendering engines, new cross-platform porting frameworks, Edge AI/NPU integration at the middleware level, or disruptive software developer tools (SWE).\n"
"- Score 7-8 (Ecosystem): Technologies highly applicable to Automotive Content Platforms, IoT ecosystems (like LG LUPA), or major SmartTV OS updates.\n"
"- Score 3-6 (Peripheral): General news in Robotics, Medical Displays, or HVAC.\n"
"- Score 0 (Excluded): Pure Audio/Acoustic technologies, or consumer-level updates about standalone laptops.\n\n"
"ANOMALY DETECTION ('anomalies_detected'):\n"
"Do not just summarize. Look for strategic or architectural disruptions. Examples of valid anomalies: "
"a competitor abandoning a proprietary OS for Chromium, sudden new industry standards in IoT/Middleware, "
"or unexpected convergence of WebTech with hardware (e.g., Medical/Automotive). "
"Ignore technical text formatting issues. Return an empty list [] if no strategic anomalies are found."
)
if news_item.source in ["C++ Russia", "cppconf"]:
prompt = (
"Analyze this C++ conference talk abstract. Extract the primary C++ trends discussed "
"(e.g., C++20/26 concepts, memory safety, coroutines, heterogeneous computing).\n\n"
f"Title: {news_item.title}\nContent: {news_item.content_text}\n\n"
"Return a JSON object strictly with these keys:\n"
"1. 'relevance_score' (integer 0-10): Indicate its importance to the modern C++ ecosystem.\n"
"2. 'summary_ru' (string): A concise 2-sentence summary in Russian.\n"
"3. 'anomalies_detected' (list of strings): Any bleeding-edge tech, controversial topics, or Rust comparisons.\n"
"4. 'category' (string): Must be exactly 'C++ Trends'.\n"
)
else:
prompt = (
"Act as a Strategic Tech Scout for an R&D department specializing in WebEngine (Chromium) extensions, "
"cross-platform porting, Middleware platform solutions, and System Tools (SWE) for developers.\n\n"
f"Analyze the following article.\nTitle: {news_item.title}\nContent: {news_item.content_text}\n\n"
"Return a JSON object with: 'relevance_score' (integer 0-10), 'summary_ru' (string), "
"'anomalies_detected' (list of strings), and 'category' (string).\n\n"
"OUTPUT RULES:\n"
"1. 'summary_ru': MUST be in Russian and strictly NO MORE than 2 sentences. Focus on the technological or business value for an R&D team.\n"
"2. 'category': Must be exactly one of: 'Browsers', 'Edge AI', 'SmartTV', 'Samsung New Technologies', 'Middleware new trends', 'Competitors', 'Other'.\n\n"
"SCORING LOGIC ('relevance_score'):\n"
"- Score 9-10 (Core R&D): Breakthroughs in web rendering engines, new cross-platform porting frameworks, Edge AI/NPU integration at the middleware level, or disruptive software developer tools (SWE).\n"
"- Score 7-8 (Ecosystem): Technologies highly applicable to Automotive Content Platforms, IoT ecosystems (like LG LUPA), or major SmartTV OS updates.\n"
"- Score 3-6 (Peripheral): General news in Robotics, Medical Displays, or HVAC.\n"
"- Score 0 (Excluded): Pure Audio/Acoustic technologies, or consumer-level updates about standalone laptops.\n\n"
"ANOMALY DETECTION ('anomalies_detected'):\n"
"Do not just summarize. Look for strategic or architectural disruptions. Examples of valid anomalies: "
"a competitor abandoning a proprietary OS for Chromium, sudden new industry standards in IoT/Middleware, "
"or unexpected convergence of WebTech with hardware (e.g., Medical/Automotive). "
"Ignore technical text formatting issues. Return an empty list [] if no strategic anomalies are found."
)
payload = {
"model": os.environ.get('OLLAMA_MODEL', 'gpt-oss:120b-cloud'),
"prompt": prompt,

View File

@ -0,0 +1,23 @@
import pytest
from datetime import datetime
from src.crawlers.cppconf_crawler import CppConfNextJsParser
from src.crawlers.dto import NewsItemDTO
@pytest.fixture
def cppconf_html():
    """Saved HTML of the cppconf.ru talks page, used as an offline fixture."""
    fixture_path = "tests/fixtures/cppconf/talks.html"
    with open(fixture_path, "r", encoding="utf-8") as handle:
        return handle.read()
def test_cppconf_parser(cppconf_html):
    """The parser should turn the saved talks page into populated DTOs."""
    talks = CppConfNextJsParser().parse_talks(cppconf_html)
    assert talks, "Should extract at least one talk"
    talk = talks[0]
    assert isinstance(talk, NewsItemDTO)
    assert isinstance(talk.timestamp, datetime)
    assert talk.source == "cppconf"
    assert talk.title
    assert talk.url.startswith("https://cppconf.ru/en/talks/")
    assert talk.content_text

1
tests/fixtures/cppconf/talks.html vendored Normal file

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,68 @@
import pytest
import chromadb
from unittest.mock import AsyncMock, patch
from src.crawlers.cppconf_crawler import CppConfCrawler
from src.processor.ollama_provider import OllamaProvider
from src.storage.chroma_store import ChromaStore
@pytest.fixture
def cppconf_html():
    """Saved HTML of the cppconf.ru talks page, used as an offline fixture."""
    fixture_path = "tests/fixtures/cppconf/talks.html"
    with open(fixture_path, "r", encoding="utf-8") as handle:
        return handle.read()
@pytest.mark.asyncio
async def test_cppconf_e2e_pipeline(cppconf_html):
    """End-to-end pipeline check: Crawler -> AI Processor -> Vector DB.

    Both network boundaries are mocked (aiohttp GET for the crawler page,
    aiohttp POST for the Ollama API); ChromaDB runs in-process, so the
    whole test executes offline against the saved HTML fixture.
    """
    # 1. Mock Crawler fetch: patch the HTTP layer so fetch_latest() parses
    # the offline fixture instead of hitting cppconf.ru.
    crawler = CppConfCrawler(url="https://cppconf.ru/en/talks/", source="C++ Russia")
    with patch("aiohttp.ClientSession.get") as mock_get:
        mock_response = AsyncMock()
        mock_response.status = 200
        mock_response.text.return_value = cppconf_html
        # session.get(...) is used as an async context manager, so the
        # mocked response must be reachable through __aenter__.
        mock_get.return_value.__aenter__.return_value = mock_response
        talks = await crawler.fetch_latest()
    assert len(talks) > 0
    talk = talks[0]
    # The crawler must re-stamp items with the configured source label.
    assert talk.source == "C++ Russia"
    assert "https://cppconf.ru/en/talks/" in talk.url
    # 2. Mock AI Processor: canned Ollama answer in the shape the
    # C++-specific prompt requests (category fixed to 'C++ Trends').
    provider = OllamaProvider()
    mock_llm_response = {
        "relevance_score": 9,
        "summary_ru": "Этот доклад обсуждает новые фичи C++26 и их влияние на производительность. Показаны примеры использования концептов и корутин.",
        "anomalies_detected": ["Сравнение производительности с Rust"],
        "category": "C++ Trends"
    }
    with patch("aiohttp.ClientSession.post") as mock_post:
        mock_llm_post_response = AsyncMock()
        mock_llm_post_response.raise_for_status = AsyncMock()
        import json
        # Ollama wraps the model output as a JSON string under "response".
        mock_llm_post_response.json.return_value = {"response": json.dumps(mock_llm_response)}
        mock_post.return_value.__aenter__.return_value = mock_llm_post_response
        enriched_talk = await provider.analyze(talk)
    # The provider must copy the LLM fields onto the enriched item.
    assert enriched_talk.relevance_score == 9
    assert "Rust" in enriched_talk.anomalies_detected[0]
    assert enriched_talk.category == "C++ Trends"
    # 3. Vector DB Store: ephemeral in-memory Chroma client.
    client = chromadb.Client()
    store = ChromaStore(client=client, collection_name="test_cppconf_collection")
    await store.store(enriched_talk)
    # Verify it exists
    exists = await store.exists(enriched_talk.url)
    assert exists is True
    # Search
    results = await store.search("C++26 features", limit=1)
    assert len(results) == 1
    assert results[0].relevance_score == 9
    assert results[0].url == enriched_talk.url