feat(crawlers): implement specialized CppConf crawler and AI analysis
- Added CppConfCrawler using aiohttp and regex to parse Next.js JSON data, skipping the Playwright bottleneck. - Added C++-specific prompts to OllamaProvider for trend analysis (identifying C++26, memory safety, coroutines). - Created offline pytest fixtures and TDD unit tests for the parser. - Created an end-to-end pipeline test mapping Crawler -> AI Processor -> Vector DB.
This commit is contained in:
parent
a0eeba0918
commit
a363ca41cf
@ -46,10 +46,9 @@ crawlers:
|
|||||||
- type: rss
|
- type: rss
|
||||||
url: "https://blog.google/products-and-platforms/platforms/android/rss/"
|
url: "https://blog.google/products-and-platforms/platforms/android/rss/"
|
||||||
source: "Google Android Blog"
|
source: "Google Android Blog"
|
||||||
- type: playwright
|
- type: cppconf
|
||||||
url: "https://cppconf.ru/en/talks/"
|
url: "https://cppconf.ru/en/talks/"
|
||||||
source: "C++ Russia"
|
source: "C++ Russia"
|
||||||
selector: "div.talk-item"
|
|
||||||
- type: playwright
|
- type: playwright
|
||||||
url: "https://2025.ieee-icra.org/media/"
|
url: "https://2025.ieee-icra.org/media/"
|
||||||
source: "ICRA 2025"
|
source: "ICRA 2025"
|
||||||
|
|||||||
106
src/crawlers/cppconf_crawler.py
Normal file
106
src/crawlers/cppconf_crawler.py
Normal file
@ -0,0 +1,106 @@
|
|||||||
|
import json
|
||||||
|
import re
|
||||||
|
import asyncio
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from typing import List
|
||||||
|
import aiohttp
|
||||||
|
|
||||||
|
from .base import ICrawler
|
||||||
|
from .dto import NewsItemDTO
|
||||||
|
|
||||||
|
class CppConfNextJsParser:
    """Parses talks out of the Next.js ``__NEXT_DATA__`` JSON embedded in cppconf.ru pages.

    Reading the server-rendered JSON directly avoids driving a headless
    browser (Playwright) just to recover data already present in the HTML.
    """

    # Compiled once at class level instead of on every _clean_html call.
    _TAG_RE = re.compile(r'<.*?>')

    def _clean_html(self, raw_html: str) -> str:
        """Strip HTML tags from *raw_html* and collapse all whitespace runs."""
        if not raw_html:
            return ""
        # Replace tags with a space so words separated only by tags don't fuse.
        cleantext = self._TAG_RE.sub(' ', raw_html)
        return ' '.join(cleantext.split())

    def _localized(self, field) -> str:
        """Return the English value of an {'en': ..., 'ru': ...} mapping, falling back to Russian.

        Tolerates the field being absent or explicitly null in the JSON
        (``talk.get(key)`` then yields None, which ``.get`` would choke on).
        """
        field = field or {}
        return field.get("en") or field.get("ru") or ""

    def parse_talks(self, html: str) -> List[NewsItemDTO]:
        """Extract all non-service talks from a cppconf.ru schedule page.

        Returns an empty list (never raises) when the ``__NEXT_DATA__``
        script tag or the expected JSON structure is missing.
        """
        match = re.search(
            r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
            html,
        )
        if not match:
            return []

        try:
            data = json.loads(match.group(1))
            talks_by_day = data.get("props", {}).get("pageProps", {}).get("talksByDay", [])
        except (json.JSONDecodeError, KeyError, TypeError):
            return []

        talks: List[NewsItemDTO] = []
        for day in talks_by_day:
            if "talks" not in day:
                continue
            for row in day["talks"]:
                # row[0] is the time-slot info; row[1] holds the talks at that time.
                if len(row) < 2:
                    continue
                for talk in row[1]:
                    # Skip breaks/announcements and entries without a name.
                    if talk.get("isServiceTalk", False) or not talk.get("name"):
                        continue
                    talks.append(self._build_item(talk))

        return talks

    def _build_item(self, talk: dict) -> NewsItemDTO:
        """Convert one raw talk dict into a NewsItemDTO."""
        title = self._localized(talk["name"]) or "Unknown Title"
        url = f"https://cppconf.ru/en/talks/{talk.get('id', '')}/"

        # Prefer the talk's own start time; default to "now" if absent/unparsable.
        time_str = talk.get("time") or talk.get("talkStartTime")
        timestamp = datetime.now(timezone.utc)
        if time_str:
            try:
                # Typical format: "2026-05-07T09:00:00Z"
                timestamp = datetime.fromisoformat(time_str.replace("Z", "+00:00"))
            except ValueError:
                pass

        # Descriptions may be missing or explicitly null; _localized handles both.
        short_desc = self._clean_html(self._localized(talk.get("shortDescription")))
        long_desc = self._clean_html(self._localized(talk.get("longDescription")))
        desc = f"{short_desc} {long_desc}"

        speakers = []
        for speaker in talk.get("speakers", []):
            name = self._localized(speaker.get("name"))
            if name:
                speakers.append(name)

        speaker_str = f"Speakers: {', '.join(speakers)}. " if speakers else ""
        content_text = f"{speaker_str}{desc}".strip()
        if not content_text:
            content_text = "No description available."

        return NewsItemDTO(
            title=title,
            url=url,
            content_text=content_text,
            source="cppconf",
            timestamp=timestamp,
        )
|
||||||
|
|
||||||
|
class CppConfCrawler(ICrawler):
    """Fetches the cppconf.ru talks page over plain HTTP and parses the embedded JSON.

    Unlike the Playwright-based crawlers, this one needs no browser: the
    schedule lives in the page's ``__NEXT_DATA__`` script tag.
    """

    def __init__(self, url: str, source: str = "cppconf"):
        self.url = url
        self.source = source
        self.parser = CppConfNextJsParser()

    async def fetch_latest(self) -> List[NewsItemDTO]:
        """Download the schedule page and return its talks tagged with this crawler's source.

        Returns an empty list on any non-200 HTTP status.
        """
        async with aiohttp.ClientSession() as session:
            async with session.get(self.url) as response:
                if response.status != 200:
                    return []
                page_html = await response.text()
                items = self.parser.parse_talks(page_html)
                # The parser tags items "cppconf" by default; re-tag with the
                # source name this crawler was configured with.
                for item in items:
                    item.source = self.source
                return items
|
||||||
@ -4,6 +4,7 @@ from typing import List
|
|||||||
from src.crawlers.base import ICrawler
|
from src.crawlers.base import ICrawler
|
||||||
from src.crawlers.rss_crawler import RSSCrawler
|
from src.crawlers.rss_crawler import RSSCrawler
|
||||||
from src.crawlers.playwright_crawler import PlaywrightCrawler
|
from src.crawlers.playwright_crawler import PlaywrightCrawler
|
||||||
|
from src.crawlers.cppconf_crawler import CppConfCrawler
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@ -36,6 +37,8 @@ class CrawlerFactory:
|
|||||||
elif crawler_type == 'playwright':
|
elif crawler_type == 'playwright':
|
||||||
selector = item.get('selector')
|
selector = item.get('selector')
|
||||||
crawlers.append(PlaywrightCrawler(url=url, source=source, selector=selector))
|
crawlers.append(PlaywrightCrawler(url=url, source=source, selector=selector))
|
||||||
|
elif crawler_type == 'cppconf':
|
||||||
|
crawlers.append(CppConfCrawler(url=url, source=source))
|
||||||
else:
|
else:
|
||||||
logger.warning(f"Unknown crawler type: {crawler_type}")
|
logger.warning(f"Unknown crawler type: {crawler_type}")
|
||||||
|
|
||||||
|
|||||||
@ -21,30 +21,42 @@ class OllamaProvider(ILLMProvider):
|
|||||||
base_url = os.environ.get('OLLAMA_API_URL', 'http://localhost:11434')
|
base_url = os.environ.get('OLLAMA_API_URL', 'http://localhost:11434')
|
||||||
url = base_url if base_url.endswith(
|
url = base_url if base_url.endswith(
|
||||||
'/api/generate') else f"{base_url.rstrip('/')}/api/generate"
|
'/api/generate') else f"{base_url.rstrip('/')}/api/generate"
|
||||||
prompt = (
|
if news_item.source in ["C++ Russia", "cppconf"]:
|
||||||
"Act as a Strategic Tech Scout for an R&D department specializing in WebEngine (Chromium) extensions, "
|
prompt = (
|
||||||
"cross-platform porting, Middleware platform solutions, and System Tools (SWE) for developers.\n\n"
|
"Analyze this C++ conference talk abstract. Extract the primary C++ trends discussed "
|
||||||
f"Analyze the following article.\nTitle: {news_item.title}\nContent: {news_item.content_text}\n\n"
|
"(e.g., C++20/26 concepts, memory safety, coroutines, heterogeneous computing).\n\n"
|
||||||
|
f"Title: {news_item.title}\nContent: {news_item.content_text}\n\n"
|
||||||
|
"Return a JSON object strictly with these keys:\n"
|
||||||
|
"1. 'relevance_score' (integer 0-10): Indicate its importance to the modern C++ ecosystem.\n"
|
||||||
|
"2. 'summary_ru' (string): A concise 2-sentence summary in Russian.\n"
|
||||||
|
"3. 'anomalies_detected' (list of strings): Any bleeding-edge tech, controversial topics, or Rust comparisons.\n"
|
||||||
|
"4. 'category' (string): Must be exactly 'C++ Trends'.\n"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
prompt = (
|
||||||
|
"Act as a Strategic Tech Scout for an R&D department specializing in WebEngine (Chromium) extensions, "
|
||||||
|
"cross-platform porting, Middleware platform solutions, and System Tools (SWE) for developers.\n\n"
|
||||||
|
f"Analyze the following article.\nTitle: {news_item.title}\nContent: {news_item.content_text}\n\n"
|
||||||
|
|
||||||
"Return a JSON object with: 'relevance_score' (integer 0-10), 'summary_ru' (string), "
|
"Return a JSON object with: 'relevance_score' (integer 0-10), 'summary_ru' (string), "
|
||||||
"'anomalies_detected' (list of strings), and 'category' (string).\n\n"
|
"'anomalies_detected' (list of strings), and 'category' (string).\n\n"
|
||||||
|
|
||||||
"OUTPUT RULES:\n"
|
"OUTPUT RULES:\n"
|
||||||
"1. 'summary_ru': MUST be in Russian and strictly NO MORE than 2 sentences. Focus on the technological or business value for an R&D team.\n"
|
"1. 'summary_ru': MUST be in Russian and strictly NO MORE than 2 sentences. Focus on the technological or business value for an R&D team.\n"
|
||||||
"2. 'category': Must be exactly one of: 'Browsers', 'Edge AI', 'SmartTV', 'Samsung New Technologies', 'Middleware new trends', 'Competitors', 'Other'.\n\n"
|
"2. 'category': Must be exactly one of: 'Browsers', 'Edge AI', 'SmartTV', 'Samsung New Technologies', 'Middleware new trends', 'Competitors', 'Other'.\n\n"
|
||||||
|
|
||||||
"SCORING LOGIC ('relevance_score'):\n"
|
"SCORING LOGIC ('relevance_score'):\n"
|
||||||
"- Score 9-10 (Core R&D): Breakthroughs in web rendering engines, new cross-platform porting frameworks, Edge AI/NPU integration at the middleware level, or disruptive software developer tools (SWE).\n"
|
"- Score 9-10 (Core R&D): Breakthroughs in web rendering engines, new cross-platform porting frameworks, Edge AI/NPU integration at the middleware level, or disruptive software developer tools (SWE).\n"
|
||||||
"- Score 7-8 (Ecosystem): Technologies highly applicable to Automotive Content Platforms, IoT ecosystems (like LG LUPA), or major SmartTV OS updates.\n"
|
"- Score 7-8 (Ecosystem): Technologies highly applicable to Automotive Content Platforms, IoT ecosystems (like LG LUPA), or major SmartTV OS updates.\n"
|
||||||
"- Score 3-6 (Peripheral): General news in Robotics, Medical Displays, or HVAC.\n"
|
"- Score 3-6 (Peripheral): General news in Robotics, Medical Displays, or HVAC.\n"
|
||||||
"- Score 0 (Excluded): Pure Audio/Acoustic technologies, or consumer-level updates about standalone laptops.\n\n"
|
"- Score 0 (Excluded): Pure Audio/Acoustic technologies, or consumer-level updates about standalone laptops.\n\n"
|
||||||
|
|
||||||
"ANOMALY DETECTION ('anomalies_detected'):\n"
|
"ANOMALY DETECTION ('anomalies_detected'):\n"
|
||||||
"Do not just summarize. Look for strategic or architectural disruptions. Examples of valid anomalies: "
|
"Do not just summarize. Look for strategic or architectural disruptions. Examples of valid anomalies: "
|
||||||
"a competitor abandoning a proprietary OS for Chromium, sudden new industry standards in IoT/Middleware, "
|
"a competitor abandoning a proprietary OS for Chromium, sudden new industry standards in IoT/Middleware, "
|
||||||
"or unexpected convergence of WebTech with hardware (e.g., Medical/Automotive). "
|
"or unexpected convergence of WebTech with hardware (e.g., Medical/Automotive). "
|
||||||
"Ignore technical text formatting issues. Return an empty list [] if no strategic anomalies are found."
|
"Ignore technical text formatting issues. Return an empty list [] if no strategic anomalies are found."
|
||||||
)
|
)
|
||||||
payload = {
|
payload = {
|
||||||
"model": os.environ.get('OLLAMA_MODEL', 'gpt-oss:120b-cloud'),
|
"model": os.environ.get('OLLAMA_MODEL', 'gpt-oss:120b-cloud'),
|
||||||
"prompt": prompt,
|
"prompt": prompt,
|
||||||
|
|||||||
23
tests/crawlers/test_cppconf.py
Normal file
23
tests/crawlers/test_cppconf.py
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
import pytest
|
||||||
|
from datetime import datetime
|
||||||
|
from src.crawlers.cppconf_crawler import CppConfNextJsParser
|
||||||
|
from src.crawlers.dto import NewsItemDTO
|
||||||
|
|
||||||
|
@pytest.fixture
def cppconf_html():
    """Offline HTML snapshot of the cppconf.ru talks page (contains the __NEXT_DATA__ JSON)."""
    with open("tests/fixtures/cppconf/talks.html", "r", encoding="utf-8") as f:
        return f.read()
|
||||||
|
|
||||||
|
def test_cppconf_parser(cppconf_html):
    """TDD unit test: the parser extracts well-formed NewsItemDTOs from the offline fixture."""
    parser = CppConfNextJsParser()
    talks = parser.parse_talks(cppconf_html)

    assert len(talks) > 0, "Should extract at least one talk"

    first_talk = talks[0]
    assert isinstance(first_talk, NewsItemDTO)
    assert len(first_talk.title) > 0
    # URLs are built from the English talks base plus the talk id.
    assert first_talk.url.startswith("https://cppconf.ru/en/talks/")
    assert len(first_talk.content_text) > 0
    assert first_talk.source == "cppconf"
    assert isinstance(first_talk.timestamp, datetime)
|
||||||
1
tests/fixtures/cppconf/talks.html
vendored
Normal file
1
tests/fixtures/cppconf/talks.html
vendored
Normal file
File diff suppressed because one or more lines are too long
68
tests/test_cppconf_pipeline.py
Normal file
68
tests/test_cppconf_pipeline.py
Normal file
@ -0,0 +1,68 @@
|
|||||||
|
import pytest
|
||||||
|
import chromadb
|
||||||
|
from unittest.mock import AsyncMock, patch
|
||||||
|
from src.crawlers.cppconf_crawler import CppConfCrawler
|
||||||
|
from src.processor.ollama_provider import OllamaProvider
|
||||||
|
from src.storage.chroma_store import ChromaStore
|
||||||
|
|
||||||
|
@pytest.fixture
def cppconf_html():
    """Offline HTML snapshot of the cppconf.ru talks page (contains the __NEXT_DATA__ JSON)."""
    with open("tests/fixtures/cppconf/talks.html", "r", encoding="utf-8") as f:
        return f.read()
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_cppconf_e2e_pipeline(cppconf_html):
    """End-to-end pipeline test: Crawler -> AI Processor -> Vector DB, fully offline.

    All network I/O (aiohttp GET to cppconf.ru, POST to Ollama) is mocked;
    ChromaDB runs as an in-memory client.
    """
    import json  # hoisted here instead of mid-function between mock statements

    # 1. Crawler: serve the offline fixture instead of hitting cppconf.ru.
    crawler = CppConfCrawler(url="https://cppconf.ru/en/talks/", source="C++ Russia")

    with patch("aiohttp.ClientSession.get") as mock_get:
        mock_response = AsyncMock()
        mock_response.status = 200
        mock_response.text.return_value = cppconf_html
        mock_get.return_value.__aenter__.return_value = mock_response

        talks = await crawler.fetch_latest()

    assert len(talks) > 0
    talk = talks[0]
    # The configured source name must override the parser's default tag.
    assert talk.source == "C++ Russia"
    assert "https://cppconf.ru/en/talks/" in talk.url

    # 2. AI Processor: stub the Ollama /api/generate response.
    provider = OllamaProvider()

    mock_llm_response = {
        "relevance_score": 9,
        "summary_ru": "Этот доклад обсуждает новые фичи C++26 и их влияние на производительность. Показаны примеры использования концептов и корутин.",
        "anomalies_detected": ["Сравнение производительности с Rust"],
        "category": "C++ Trends"
    }

    with patch("aiohttp.ClientSession.post") as mock_post:
        mock_llm_post_response = AsyncMock()
        mock_llm_post_response.raise_for_status = AsyncMock()
        mock_llm_post_response.json.return_value = {"response": json.dumps(mock_llm_response)}
        mock_post.return_value.__aenter__.return_value = mock_llm_post_response

        enriched_talk = await provider.analyze(talk)

    assert enriched_talk.relevance_score == 9
    assert "Rust" in enriched_talk.anomalies_detected[0]
    assert enriched_talk.category == "C++ Trends"

    # 3. Vector DB: in-memory Chroma client with a test-only collection.
    client = chromadb.Client()
    store = ChromaStore(client=client, collection_name="test_cppconf_collection")

    await store.store(enriched_talk)

    # Stored item must be findable by exact URL and by semantic search.
    exists = await store.exists(enriched_talk.url)
    assert exists is True

    results = await store.search("C++26 features", limit=1)
    assert len(results) == 1
    assert results[0].relevance_score == 9
    assert results[0].url == enriched_talk.url
|
||||||
Loading…
x
Reference in New Issue
Block a user