- Added CppConfCrawler using aiohttp and regex to parse Next.js JSON data, skipping the Playwright bottleneck. - Added C++ specific prompts to OllamaProvider for trend analysis (identifying C++26, memory safety, coroutines). - Created offline pytest fixtures and TDD unit tests for the parser. - Created end-to-end pipeline test mapping Crawler -> AI Processor -> Vector DB.
24 lines
798 B
Python
import pytest
|
|
from datetime import datetime
|
|
from src.crawlers.cppconf_crawler import CppConfNextJsParser
|
|
from src.crawlers.dto import NewsItemDTO
|
|
|
|
@pytest.fixture
def cppconf_html():
    """Return the saved CppConf talks page used as an offline parsing fixture."""
    fixture_path = "tests/fixtures/cppconf/talks.html"
    with open(fixture_path, "r", encoding="utf-8") as handle:
        page_source = handle.read()
    return page_source
|
def test_cppconf_parser(cppconf_html):
    """Parsing the offline fixture must yield at least one well-formed NewsItemDTO."""
    extracted = CppConfNextJsParser().parse_talks(cppconf_html)

    assert len(extracted) > 0, "Should extract at least one talk"

    # Spot-check the first item against the DTO contract.
    talk = extracted[0]
    assert isinstance(talk, NewsItemDTO)
    assert len(talk.title) > 0
    assert talk.url.startswith("https://cppconf.ru/en/talks/")
    assert len(talk.content_text) > 0
    assert talk.source == "cppconf"
    assert isinstance(talk.timestamp, datetime)