diff --git a/src/crawlers.yml b/src/crawlers.yml index e4fe92b..85449d4 100644 --- a/src/crawlers.yml +++ b/src/crawlers.yml @@ -46,10 +46,9 @@ crawlers: - type: rss url: "https://blog.google/products-and-platforms/platforms/android/rss/" source: "Google Android Blog" - - type: playwright + - type: cppconf url: "https://cppconf.ru/en/talks/" source: "C++ Russia" - selector: "div.talk-item" - type: playwright url: "https://2025.ieee-icra.org/media/" source: "ICRA 2025" diff --git a/src/crawlers/cppconf_crawler.py b/src/crawlers/cppconf_crawler.py new file mode 100644 index 0000000..41882d8 --- /dev/null +++ b/src/crawlers/cppconf_crawler.py @@ -0,0 +1,106 @@ +import json +import re +import asyncio +from datetime import datetime, timezone +from typing import List +import aiohttp + +from .base import ICrawler +from .dto import NewsItemDTO + +class CppConfNextJsParser: + def _clean_html(self, raw_html: str) -> str: + if not raw_html: + return "" + # Remove html tags + cleanr = re.compile('<.*?>') + cleantext = re.sub(cleanr, ' ', raw_html) + # Remove extra whitespace + return ' '.join(cleantext.split()) + + def parse_talks(self, html: str) -> List[NewsItemDTO]: + match = re.search(r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>', html) + if not match: + return [] + + try: + data = json.loads(match.group(1)) + talks_by_day = data.get("props", {}).get("pageProps", {}).get("talksByDay", []) + except (json.JSONDecodeError, KeyError, TypeError): + return [] + + talks = [] + for day in talks_by_day: + if "talks" not in day: + continue + + for row in day["talks"]: + if len(row) < 2: + continue + + # row[1] contains the actual list of talks happening at that time + for talk in row[1]: + if talk.get("isServiceTalk", False) or not talk.get("name"): + continue + + title = talk["name"].get("en") or talk["name"].get("ru", "Unknown Title") + url = f"https://cppconf.ru/en/talks/{talk.get('id', '')}/" + + # timestamp + time_str = talk.get("time") or talk.get("talkStartTime") + timestamp = datetime.now(timezone.utc) + if
time_str: + try: + # format usually "2026-05-07T09:00:00Z" + timestamp = datetime.fromisoformat(time_str.replace("Z", "+00:00")) + except ValueError: + pass + + # text content + short_desc = talk.get("shortDescription", {}).get("en", talk.get("shortDescription", {}).get("ru", "")) + long_desc = talk.get("longDescription", {}).get("en", talk.get("longDescription", {}).get("ru", "")) + + desc = self._clean_html(short_desc) + " " + self._clean_html(long_desc) + + # speakers + speakers = [] + for speaker in talk.get("speakers", []): + name = speaker.get("name", {}).get("en") or speaker.get("name", {}).get("ru") + if name: + speakers.append(name) + + speaker_str = f"Speakers: {', '.join(speakers)}. " if speakers else "" + content_text = f"{speaker_str}{desc}".strip() + + # only keep talks with decent content + if not content_text: + content_text = "No description available." + + talks.append( + NewsItemDTO( + title=title, + url=url, + content_text=content_text, + source="cppconf", + timestamp=timestamp + ) + ) + + return talks + +class CppConfCrawler(ICrawler): + def __init__(self, url: str, source: str = "cppconf"): + self.url = url + self.source = source + self.parser = CppConfNextJsParser() + + async def fetch_latest(self) -> List[NewsItemDTO]: + async with aiohttp.ClientSession() as session: + async with session.get(self.url) as response: + if response.status != 200: + return [] + html = await response.text() + talks = self.parser.parse_talks(html) + for talk in talks: + talk.source = self.source + return talks diff --git a/src/crawlers/factory.py b/src/crawlers/factory.py index 4aa6a91..07c92c8 100644 --- a/src/crawlers/factory.py +++ b/src/crawlers/factory.py @@ -4,6 +4,7 @@ from typing import List from src.crawlers.base import ICrawler from src.crawlers.rss_crawler import RSSCrawler from src.crawlers.playwright_crawler import PlaywrightCrawler +from src.crawlers.cppconf_crawler import CppConfCrawler logger = logging.getLogger(__name__) @@ -36,6 +37,8 @@ class 
CrawlerFactory: elif crawler_type == 'playwright': selector = item.get('selector') crawlers.append(PlaywrightCrawler(url=url, source=source, selector=selector)) + elif crawler_type == 'cppconf': + crawlers.append(CppConfCrawler(url=url, source=source)) else: logger.warning(f"Unknown crawler type: {crawler_type}") diff --git a/src/processor/ollama_provider.py b/src/processor/ollama_provider.py index 8797a71..52f6f34 100644 --- a/src/processor/ollama_provider.py +++ b/src/processor/ollama_provider.py @@ -21,30 +21,42 @@ class OllamaProvider(ILLMProvider): base_url = os.environ.get('OLLAMA_API_URL', 'http://localhost:11434') url = base_url if base_url.endswith( '/api/generate') else f"{base_url.rstrip('/')}/api/generate" - prompt = ( - "Act as a Strategic Tech Scout for an R&D department specializing in WebEngine (Chromium) extensions, " - "cross-platform porting, Middleware platform solutions, and System Tools (SWE) for developers.\n\n" - f"Analyze the following article.\nTitle: {news_item.title}\nContent: {news_item.content_text}\n\n" - - "Return a JSON object with: 'relevance_score' (integer 0-10), 'summary_ru' (string), " - "'anomalies_detected' (list of strings), and 'category' (string).\n\n" - - "OUTPUT RULES:\n" - "1. 'summary_ru': MUST be in Russian and strictly NO MORE than 2 sentences. Focus on the technological or business value for an R&D team.\n" - "2. 
'category': Must be exactly one of: 'Browsers', 'Edge AI', 'SmartTV', 'Samsung New Technologies', 'Middleware new trends', 'Competitors', 'Other'.\n\n" - - "SCORING LOGIC ('relevance_score'):\n" - "- Score 9-10 (Core R&D): Breakthroughs in web rendering engines, new cross-platform porting frameworks, Edge AI/NPU integration at the middleware level, or disruptive software developer tools (SWE).\n" - "- Score 7-8 (Ecosystem): Technologies highly applicable to Automotive Content Platforms, IoT ecosystems (like LG LUPA), or major SmartTV OS updates.\n" - "- Score 3-6 (Peripheral): General news in Robotics, Medical Displays, or HVAC.\n" - "- Score 0 (Excluded): Pure Audio/Acoustic technologies, or consumer-level updates about standalone laptops.\n\n" - - "ANOMALY DETECTION ('anomalies_detected'):\n" - "Do not just summarize. Look for strategic or architectural disruptions. Examples of valid anomalies: " - "a competitor abandoning a proprietary OS for Chromium, sudden new industry standards in IoT/Middleware, " - "or unexpected convergence of WebTech with hardware (e.g., Medical/Automotive). " - "Ignore technical text formatting issues. Return an empty list [] if no strategic anomalies are found." - ) + if news_item.source in ["C++ Russia", "cppconf"]: + prompt = ( + "Analyze this C++ conference talk abstract. Extract the primary C++ trends discussed " + "(e.g., C++20/26 concepts, memory safety, coroutines, heterogeneous computing).\n\n" + f"Title: {news_item.title}\nContent: {news_item.content_text}\n\n" + "Return a JSON object strictly with these keys:\n" + "1. 'relevance_score' (integer 0-10): Indicate its importance to the modern C++ ecosystem.\n" + "2. 'summary_ru' (string): A concise 2-sentence summary in Russian.\n" + "3. 'anomalies_detected' (list of strings): Any bleeding-edge tech, controversial topics, or Rust comparisons.\n" + "4. 
'category' (string): Must be exactly 'C++ Trends'.\n" + ) + else: + prompt = ( + "Act as a Strategic Tech Scout for an R&D department specializing in WebEngine (Chromium) extensions, " + "cross-platform porting, Middleware platform solutions, and System Tools (SWE) for developers.\n\n" + f"Analyze the following article.\nTitle: {news_item.title}\nContent: {news_item.content_text}\n\n" + + "Return a JSON object with: 'relevance_score' (integer 0-10), 'summary_ru' (string), " + "'anomalies_detected' (list of strings), and 'category' (string).\n\n" + + "OUTPUT RULES:\n" + "1. 'summary_ru': MUST be in Russian and strictly NO MORE than 2 sentences. Focus on the technological or business value for an R&D team.\n" + "2. 'category': Must be exactly one of: 'Browsers', 'Edge AI', 'SmartTV', 'Samsung New Technologies', 'Middleware new trends', 'Competitors', 'Other'.\n\n" + + "SCORING LOGIC ('relevance_score'):\n" + "- Score 9-10 (Core R&D): Breakthroughs in web rendering engines, new cross-platform porting frameworks, Edge AI/NPU integration at the middleware level, or disruptive software developer tools (SWE).\n" + "- Score 7-8 (Ecosystem): Technologies highly applicable to Automotive Content Platforms, IoT ecosystems (like LG LUPA), or major SmartTV OS updates.\n" + "- Score 3-6 (Peripheral): General news in Robotics, Medical Displays, or HVAC.\n" + "- Score 0 (Excluded): Pure Audio/Acoustic technologies, or consumer-level updates about standalone laptops.\n\n" + + "ANOMALY DETECTION ('anomalies_detected'):\n" + "Do not just summarize. Look for strategic or architectural disruptions. Examples of valid anomalies: " + "a competitor abandoning a proprietary OS for Chromium, sudden new industry standards in IoT/Middleware, " + "or unexpected convergence of WebTech with hardware (e.g., Medical/Automotive). " + "Ignore technical text formatting issues. Return an empty list [] if no strategic anomalies are found." 
+ ) payload = { "model": os.environ.get('OLLAMA_MODEL', 'gpt-oss:120b-cloud'), "prompt": prompt, diff --git a/tests/crawlers/test_cppconf.py b/tests/crawlers/test_cppconf.py new file mode 100644 index 0000000..54ee5bf --- /dev/null +++ b/tests/crawlers/test_cppconf.py @@ -0,0 +1,23 @@ +import pytest +from datetime import datetime +from src.crawlers.cppconf_crawler import CppConfNextJsParser +from src.crawlers.dto import NewsItemDTO + +@pytest.fixture +def cppconf_html(): + with open("tests/fixtures/cppconf/talks.html", "r", encoding="utf-8") as f: + return f.read() + +def test_cppconf_parser(cppconf_html): + parser = CppConfNextJsParser() + talks = parser.parse_talks(cppconf_html) + + assert len(talks) > 0, "Should extract at least one talk" + + first_talk = talks[0] + assert isinstance(first_talk, NewsItemDTO) + assert len(first_talk.title) > 0 + assert first_talk.url.startswith("https://cppconf.ru/en/talks/") + assert len(first_talk.content_text) > 0 + assert first_talk.source == "cppconf" + assert isinstance(first_talk.timestamp, datetime) diff --git a/tests/fixtures/cppconf/talks.html b/tests/fixtures/cppconf/talks.html new file mode 100644 index 0000000..24646de --- /dev/null +++ b/tests/fixtures/cppconf/talks.html @@ -0,0 +1 @@ +C++ Russia 2026 | Schedule | Conference for C++ developers

Schedule

The time in the program is for your time zone .

Program is filling up

Program is filling up

New talks are published weekly. Follow updates or secure your ticket early.

  1. May 7. Online

  2. May 16. Offline + online

  3. May 17. Offline + online

\ No newline at end of file diff --git a/tests/test_cppconf_pipeline.py b/tests/test_cppconf_pipeline.py new file mode 100644 index 0000000..e8a9722 --- /dev/null +++ b/tests/test_cppconf_pipeline.py @@ -0,0 +1,68 @@ +import pytest +import chromadb +from unittest.mock import AsyncMock, patch +from src.crawlers.cppconf_crawler import CppConfCrawler +from src.processor.ollama_provider import OllamaProvider +from src.storage.chroma_store import ChromaStore + +@pytest.fixture +def cppconf_html(): + with open("tests/fixtures/cppconf/talks.html", "r", encoding="utf-8") as f: + return f.read() + +@pytest.mark.asyncio +async def test_cppconf_e2e_pipeline(cppconf_html): + # 1. Mock Crawler fetch + crawler = CppConfCrawler(url="https://cppconf.ru/en/talks/", source="C++ Russia") + + with patch("aiohttp.ClientSession.get") as mock_get: + mock_response = AsyncMock() + mock_response.status = 200 + mock_response.text.return_value = cppconf_html + mock_get.return_value.__aenter__.return_value = mock_response + + talks = await crawler.fetch_latest() + + assert len(talks) > 0 + talk = talks[0] + assert talk.source == "C++ Russia" + assert "https://cppconf.ru/en/talks/" in talk.url + + # 2. Mock AI Processor + provider = OllamaProvider() + + mock_llm_response = { + "relevance_score": 9, + "summary_ru": "Этот доклад обсуждает новые фичи C++26 и их влияние на производительность. 
Показаны примеры использования концептов и корутин.", + "anomalies_detected": ["Сравнение производительности с Rust"], + "category": "C++ Trends" + } + + with patch("aiohttp.ClientSession.post") as mock_post: + mock_llm_post_response = AsyncMock() + mock_llm_post_response.raise_for_status = AsyncMock() + import json + mock_llm_post_response.json.return_value = {"response": json.dumps(mock_llm_response)} + mock_post.return_value.__aenter__.return_value = mock_llm_post_response + + enriched_talk = await provider.analyze(talk) + + assert enriched_talk.relevance_score == 9 + assert "Rust" in enriched_talk.anomalies_detected[0] + assert enriched_talk.category == "C++ Trends" + + # 3. Vector DB Store + client = chromadb.Client() + store = ChromaStore(client=client, collection_name="test_cppconf_collection") + + await store.store(enriched_talk) + + # Verify it exists + exists = await store.exists(enriched_talk.url) + assert exists is True + + # Search + results = await store.search("C++26 features", limit=1) + assert len(results) == 1 + assert results[0].relevance_score == 9 + assert results[0].url == enriched_talk.url \ No newline at end of file