feat(crawlers): implement specialized CppConf crawler and AI analysis
- Added CppConfCrawler using aiohttp and regex to parse Next.js JSON data, skipping the Playwright bottleneck. - Added C++ specific prompts to OllamaProvider for trend analysis (identifying C++26, memory safety, coroutines). - Created offline pytest fixtures and TDD unit tests for the parser. - Created end-to-end pipeline test mapping Crawler -> AI Processor -> Vector DB.
This commit is contained in:
parent
a0eeba0918
commit
a363ca41cf
@ -46,10 +46,9 @@ crawlers:
|
||||
- type: rss
|
||||
url: "https://blog.google/products-and-platforms/platforms/android/rss/"
|
||||
source: "Google Android Blog"
|
||||
- type: playwright
|
||||
- type: cppconf
|
||||
url: "https://cppconf.ru/en/talks/"
|
||||
source: "C++ Russia"
|
||||
selector: "div.talk-item"  # NOTE(review): unused — CppConfCrawler parses the __NEXT_DATA__ JSON and takes no selector; consider removing
|
||||
- type: playwright
|
||||
url: "https://2025.ieee-icra.org/media/"
|
||||
source: "ICRA 2025"
|
||||
|
||||
106
src/crawlers/cppconf_crawler.py
Normal file
106
src/crawlers/cppconf_crawler.py
Normal file
@ -0,0 +1,106 @@
|
||||
import json
|
||||
import re
|
||||
import asyncio
|
||||
from datetime import datetime, timezone
|
||||
from typing import List
|
||||
import aiohttp
|
||||
|
||||
from .base import ICrawler
|
||||
from .dto import NewsItemDTO
|
||||
|
||||
class CppConfNextJsParser:
    """Extract talk entries from the ``__NEXT_DATA__`` JSON blob embedded in
    the server-rendered cppconf.ru talks page.

    Parsing the inlined Next.js payload avoids driving a headless browser
    for a page whose data already ships with the initial HTML response.
    """

    # Compiled once at class-definition time instead of per call.
    # Non-greedy match strips individual HTML tags.
    _TAG_RE = re.compile(r"<.*?>")
    # Anchors the Next.js data island. DOTALL lets the payload span
    # newlines; minified JSON is usually one line, but this is free
    # robustness against pretty-printed output.
    _NEXT_DATA_RE = re.compile(
        r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
        re.DOTALL,
    )

    def _clean_html(self, raw_html: str) -> str:
        """Return *raw_html* with tags stripped and whitespace collapsed."""
        if not raw_html:
            return ""
        # Tags are replaced by a space so adjacent words do not fuse.
        text = self._TAG_RE.sub(" ", raw_html)
        return " ".join(text.split())

    @staticmethod
    def _localized(value, default: str = "") -> str:
        """Pick English text from a Next.js i18n dict, falling back to
        Russian, then *default*.

        Tolerates ``None`` and non-dict values: the feed carries JSON
        ``null`` for fields that were never filled in, which previously
        raised ``AttributeError`` on the chained ``.get`` calls.
        """
        if not isinstance(value, dict):
            return default
        return value.get("en") or value.get("ru") or default

    def parse_talks(self, html: str) -> "List[NewsItemDTO]":
        """Parse the talks page HTML into ``NewsItemDTO`` items.

        Returns an empty list when the Next.js payload is missing or
        malformed rather than raising, so one bad fetch cannot abort a
        whole crawl cycle.
        """
        match = self._NEXT_DATA_RE.search(html)
        if not match:
            return []

        try:
            data = json.loads(match.group(1))
            talks_by_day = data.get("props", {}).get("pageProps", {}).get("talksByDay", [])
        except (json.JSONDecodeError, KeyError, TypeError):
            return []

        talks = []
        for day in talks_by_day:
            if "talks" not in day:
                continue

            for row in day["talks"]:
                # Each row is [time_slot, [talk, ...]]; anything shorter
                # carries no talk payload.
                if len(row) < 2:
                    continue

                for talk in row[1]:
                    # Service talks are breaks/announcements, not content.
                    if talk.get("isServiceTalk", False) or not talk.get("name"):
                        continue

                    title = self._localized(talk["name"], "Unknown Title")
                    url = f"https://cppconf.ru/en/talks/{talk.get('id', '')}/"

                    # Timestamp: prefer the explicit talk time; fall back
                    # to "now" so downstream ordering still works.
                    time_str = talk.get("time") or talk.get("talkStartTime")
                    timestamp = datetime.now(timezone.utc)
                    if time_str:
                        try:
                            # Format usually "2026-05-07T09:00:00Z".
                            timestamp = datetime.fromisoformat(time_str.replace("Z", "+00:00"))
                        except ValueError:
                            pass  # keep the "now" fallback on odd formats

                    # Text content: en-with-ru-fallback is now applied
                    # consistently (an empty "en" string no longer blocks
                    # the Russian text, matching the title's behavior).
                    parts = (
                        self._clean_html(self._localized(talk.get("shortDescription"))),
                        self._clean_html(self._localized(talk.get("longDescription"))),
                    )
                    desc = " ".join(p for p in parts if p)

                    # Speakers: collect localized names, skipping entries
                    # without one.
                    speakers = [
                        name
                        for name in (
                            self._localized(speaker.get("name"))
                            for speaker in talk.get("speakers", [])
                        )
                        if name
                    ]

                    speaker_str = f"Speakers: {', '.join(speakers)}. " if speakers else ""
                    content_text = f"{speaker_str}{desc}".strip()

                    # Only keep talks with decent content.
                    if not content_text:
                        content_text = "No description available."

                    talks.append(
                        NewsItemDTO(
                            title=title,
                            url=url,
                            content_text=content_text,
                            source="cppconf",
                            timestamp=timestamp,
                        )
                    )

        return talks
|
||||
|
||||
class CppConfCrawler(ICrawler):
    """Crawler for cppconf.ru talk listings.

    Fetches the talks page over plain HTTP and delegates extraction to
    :class:`CppConfNextJsParser`; no headless browser is needed because
    the data is embedded in the server-rendered HTML.
    """

    # Hard cap on a single page fetch so a stalled server cannot hang
    # the whole crawl cycle.
    _REQUEST_TIMEOUT_S = 30

    def __init__(self, url: str, source: str = "cppconf"):
        self.url = url          # talks page URL, e.g. https://cppconf.ru/en/talks/
        self.source = source    # label stamped onto every emitted item
        self.parser = CppConfNextJsParser()

    async def fetch_latest(self) -> "List[NewsItemDTO]":
        """Fetch and parse the talks page.

        Returns an empty list on non-200 responses, timeouts, or network
        errors instead of raising, matching the parser's fail-soft policy
        (previously an aiohttp.ClientError would crash the crawl).
        """
        timeout = aiohttp.ClientTimeout(total=self._REQUEST_TIMEOUT_S)
        try:
            async with aiohttp.ClientSession(timeout=timeout) as session:
                async with session.get(self.url) as response:
                    if response.status != 200:
                        return []
                    html = await response.text()
        except (aiohttp.ClientError, asyncio.TimeoutError):
            # Transient network failure: skip this cycle rather than
            # aborting the crawler scheduler.
            return []

        talks = self.parser.parse_talks(html)
        # Re-label items with the configured source name (the parser
        # stamps a generic "cppconf").
        for talk in talks:
            talk.source = self.source
        return talks
|
||||
@ -4,6 +4,7 @@ from typing import List
|
||||
from src.crawlers.base import ICrawler
|
||||
from src.crawlers.rss_crawler import RSSCrawler
|
||||
from src.crawlers.playwright_crawler import PlaywrightCrawler
|
||||
from src.crawlers.cppconf_crawler import CppConfCrawler
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@ -36,6 +37,8 @@ class CrawlerFactory:
|
||||
elif crawler_type == 'playwright':
|
||||
selector = item.get('selector')
|
||||
crawlers.append(PlaywrightCrawler(url=url, source=source, selector=selector))
|
||||
elif crawler_type == 'cppconf':
|
||||
crawlers.append(CppConfCrawler(url=url, source=source))
|
||||
else:
|
||||
logger.warning(f"Unknown crawler type: {crawler_type}")
|
||||
|
||||
|
||||
@ -21,30 +21,42 @@ class OllamaProvider(ILLMProvider):
|
||||
base_url = os.environ.get('OLLAMA_API_URL', 'http://localhost:11434')
|
||||
url = base_url if base_url.endswith(
|
||||
'/api/generate') else f"{base_url.rstrip('/')}/api/generate"
|
||||
prompt = (
|
||||
"Act as a Strategic Tech Scout for an R&D department specializing in WebEngine (Chromium) extensions, "
|
||||
"cross-platform porting, Middleware platform solutions, and System Tools (SWE) for developers.\n\n"
|
||||
f"Analyze the following article.\nTitle: {news_item.title}\nContent: {news_item.content_text}\n\n"
|
||||
|
||||
"Return a JSON object with: 'relevance_score' (integer 0-10), 'summary_ru' (string), "
|
||||
"'anomalies_detected' (list of strings), and 'category' (string).\n\n"
|
||||
|
||||
"OUTPUT RULES:\n"
|
||||
"1. 'summary_ru': MUST be in Russian and strictly NO MORE than 2 sentences. Focus on the technological or business value for an R&D team.\n"
|
||||
"2. 'category': Must be exactly one of: 'Browsers', 'Edge AI', 'SmartTV', 'Samsung New Technologies', 'Middleware new trends', 'Competitors', 'Other'.\n\n"
|
||||
|
||||
"SCORING LOGIC ('relevance_score'):\n"
|
||||
"- Score 9-10 (Core R&D): Breakthroughs in web rendering engines, new cross-platform porting frameworks, Edge AI/NPU integration at the middleware level, or disruptive software developer tools (SWE).\n"
|
||||
"- Score 7-8 (Ecosystem): Technologies highly applicable to Automotive Content Platforms, IoT ecosystems (like LG LUPA), or major SmartTV OS updates.\n"
|
||||
"- Score 3-6 (Peripheral): General news in Robotics, Medical Displays, or HVAC.\n"
|
||||
"- Score 0 (Excluded): Pure Audio/Acoustic technologies, or consumer-level updates about standalone laptops.\n\n"
|
||||
|
||||
"ANOMALY DETECTION ('anomalies_detected'):\n"
|
||||
"Do not just summarize. Look for strategic or architectural disruptions. Examples of valid anomalies: "
|
||||
"a competitor abandoning a proprietary OS for Chromium, sudden new industry standards in IoT/Middleware, "
|
||||
"or unexpected convergence of WebTech with hardware (e.g., Medical/Automotive). "
|
||||
"Ignore technical text formatting issues. Return an empty list [] if no strategic anomalies are found."
|
||||
)
|
||||
if news_item.source in ["C++ Russia", "cppconf"]:
|
||||
prompt = (
|
||||
"Analyze this C++ conference talk abstract. Extract the primary C++ trends discussed "
|
||||
"(e.g., C++20/26 concepts, memory safety, coroutines, heterogeneous computing).\n\n"
|
||||
f"Title: {news_item.title}\nContent: {news_item.content_text}\n\n"
|
||||
"Return a JSON object strictly with these keys:\n"
|
||||
"1. 'relevance_score' (integer 0-10): Indicate its importance to the modern C++ ecosystem.\n"
|
||||
"2. 'summary_ru' (string): A concise 2-sentence summary in Russian.\n"
|
||||
"3. 'anomalies_detected' (list of strings): Any bleeding-edge tech, controversial topics, or Rust comparisons.\n"
|
||||
"4. 'category' (string): Must be exactly 'C++ Trends'.\n"
|
||||
)
|
||||
else:
|
||||
prompt = (
|
||||
"Act as a Strategic Tech Scout for an R&D department specializing in WebEngine (Chromium) extensions, "
|
||||
"cross-platform porting, Middleware platform solutions, and System Tools (SWE) for developers.\n\n"
|
||||
f"Analyze the following article.\nTitle: {news_item.title}\nContent: {news_item.content_text}\n\n"
|
||||
|
||||
"Return a JSON object with: 'relevance_score' (integer 0-10), 'summary_ru' (string), "
|
||||
"'anomalies_detected' (list of strings), and 'category' (string).\n\n"
|
||||
|
||||
"OUTPUT RULES:\n"
|
||||
"1. 'summary_ru': MUST be in Russian and strictly NO MORE than 2 sentences. Focus on the technological or business value for an R&D team.\n"
|
||||
"2. 'category': Must be exactly one of: 'Browsers', 'Edge AI', 'SmartTV', 'Samsung New Technologies', 'Middleware new trends', 'Competitors', 'Other'.\n\n"
|
||||
|
||||
"SCORING LOGIC ('relevance_score'):\n"
|
||||
"- Score 9-10 (Core R&D): Breakthroughs in web rendering engines, new cross-platform porting frameworks, Edge AI/NPU integration at the middleware level, or disruptive software developer tools (SWE).\n"
|
||||
"- Score 7-8 (Ecosystem): Technologies highly applicable to Automotive Content Platforms, IoT ecosystems (like LG LUPA), or major SmartTV OS updates.\n"
|
||||
"- Score 3-6 (Peripheral): General news in Robotics, Medical Displays, or HVAC.\n"
|
||||
"- Score 0 (Excluded): Pure Audio/Acoustic technologies, or consumer-level updates about standalone laptops.\n\n"
|
||||
|
||||
"ANOMALY DETECTION ('anomalies_detected'):\n"
|
||||
"Do not just summarize. Look for strategic or architectural disruptions. Examples of valid anomalies: "
|
||||
"a competitor abandoning a proprietary OS for Chromium, sudden new industry standards in IoT/Middleware, "
|
||||
"or unexpected convergence of WebTech with hardware (e.g., Medical/Automotive). "
|
||||
"Ignore technical text formatting issues. Return an empty list [] if no strategic anomalies are found."
|
||||
)
|
||||
payload = {
|
||||
"model": os.environ.get('OLLAMA_MODEL', 'gpt-oss:120b-cloud'),
|
||||
"prompt": prompt,
|
||||
|
||||
23
tests/crawlers/test_cppconf.py
Normal file
23
tests/crawlers/test_cppconf.py
Normal file
@ -0,0 +1,23 @@
|
||||
import pytest
|
||||
from datetime import datetime
|
||||
from src.crawlers.cppconf_crawler import CppConfNextJsParser
|
||||
from src.crawlers.dto import NewsItemDTO
|
||||
|
||||
@pytest.fixture
def cppconf_html():
    """Load the offline HTML snapshot of the cppconf talks page."""
    fixture_path = "tests/fixtures/cppconf/talks.html"
    with open(fixture_path, "r", encoding="utf-8") as handle:
        return handle.read()
|
||||
|
||||
def test_cppconf_parser(cppconf_html):
    """Offline TDD check: the fixture page must yield well-formed DTOs."""
    extracted = CppConfNextJsParser().parse_talks(cppconf_html)

    assert len(extracted) > 0, "Should extract at least one talk"

    sample = extracted[0]
    assert isinstance(sample, NewsItemDTO)
    # Every field the downstream pipeline relies on must be populated.
    assert sample.title
    assert sample.url.startswith("https://cppconf.ru/en/talks/")
    assert sample.content_text
    assert sample.source == "cppconf"
    assert isinstance(sample.timestamp, datetime)
|
||||
1
tests/fixtures/cppconf/talks.html
vendored
Normal file
1
tests/fixtures/cppconf/talks.html
vendored
Normal file
File diff suppressed because one or more lines are too long
68
tests/test_cppconf_pipeline.py
Normal file
68
tests/test_cppconf_pipeline.py
Normal file
@ -0,0 +1,68 @@
|
||||
import pytest
|
||||
import chromadb
|
||||
from unittest.mock import AsyncMock, patch
|
||||
from src.crawlers.cppconf_crawler import CppConfCrawler
|
||||
from src.processor.ollama_provider import OllamaProvider
|
||||
from src.storage.chroma_store import ChromaStore
|
||||
|
||||
@pytest.fixture
def cppconf_html():
    """Offline snapshot of the cppconf talks page, captured for replay."""
    handle = open("tests/fixtures/cppconf/talks.html", "r", encoding="utf-8")
    try:
        return handle.read()
    finally:
        handle.close()
|
||||
|
||||
@pytest.mark.asyncio
async def test_cppconf_e2e_pipeline(cppconf_html):
    """End-to-end pipeline check: Crawler -> AI Processor -> Vector DB.

    Both network hops (the page fetch and the Ollama call) are mocked;
    only the Chroma store runs for real, in-memory.
    """
    # 1. Mock Crawler fetch
    crawler = CppConfCrawler(url="https://cppconf.ru/en/talks/", source="C++ Russia")

    with patch("aiohttp.ClientSession.get") as mock_get:
        mock_response = AsyncMock()
        mock_response.status = 200
        mock_response.text.return_value = cppconf_html
        # session.get(...) is used as an async context manager, so the
        # fake response must be reachable via __aenter__.
        mock_get.return_value.__aenter__.return_value = mock_response

        talks = await crawler.fetch_latest()

        assert len(talks) > 0
        talk = talks[0]
        # The crawler must override the parser's generic "cppconf" label
        # with the configured source name.
        assert talk.source == "C++ Russia"
        assert "https://cppconf.ru/en/talks/" in talk.url

    # 2. Mock AI Processor
    provider = OllamaProvider()

    # Canned LLM verdict exercising the C++-specific prompt contract
    # (category must be exactly 'C++ Trends').
    mock_llm_response = {
        "relevance_score": 9,
        "summary_ru": "Этот доклад обсуждает новые фичи C++26 и их влияние на производительность. Показаны примеры использования концептов и корутин.",
        "anomalies_detected": ["Сравнение производительности с Rust"],
        "category": "C++ Trends"
    }

    with patch("aiohttp.ClientSession.post") as mock_post:
        mock_llm_post_response = AsyncMock()
        # NOTE(review): aiohttp's raise_for_status is a *sync* method;
        # an AsyncMock here returns an un-awaited coroutine. Harmless
        # only if the provider never inspects its result — confirm.
        mock_llm_post_response.raise_for_status = AsyncMock()
        import json
        # Ollama wraps the model output as a JSON string under "response".
        mock_llm_post_response.json.return_value = {"response": json.dumps(mock_llm_response)}
        mock_post.return_value.__aenter__.return_value = mock_llm_post_response

        enriched_talk = await provider.analyze(talk)

    assert enriched_talk.relevance_score == 9
    assert "Rust" in enriched_talk.anomalies_detected[0]
    assert enriched_talk.category == "C++ Trends"

    # 3. Vector DB Store (real, in-memory Chroma client)
    client = chromadb.Client()
    store = ChromaStore(client=client, collection_name="test_cppconf_collection")

    await store.store(enriched_talk)

    # Verify it exists (dedup key is the item URL).
    exists = await store.exists(enriched_talk.url)
    assert exists is True

    # Semantic search should surface the stored talk with its metadata intact.
    results = await store.search("C++26 features", limit=1)
    assert len(results) == 1
    assert results[0].relevance_score == 9
    assert results[0].url == enriched_talk.url
|
||||
Loading…
x
Reference in New Issue
Block a user