"""Tests for ChromaStore, run against an in-memory (ephemeral) Chroma client."""

import uuid
from datetime import datetime, timezone

import chromadb
import pytest
import pytest_asyncio
from chromadb.config import Settings

from src.processor.dto import EnrichedNewsItemDTO
from src.storage.chroma_store import ChromaStore


@pytest_asyncio.fixture
async def chroma_store():
    """Yield a ChromaStore backed by a freshly reset in-memory client.

    The client is reset both before the store is created and after the test
    finishes, so no data is carried over between tests.
    """
    # Use EphemeralClient for in-memory testing; allow_reset is needed for
    # the reset() calls below.
    client = chromadb.EphemeralClient(Settings(allow_reset=True))
    client.reset()
    store = ChromaStore(client=client, collection_name="test_collection")
    yield store
    client.reset()


@pytest.mark.asyncio
async def test_store_and_search(chroma_store: ChromaStore):
    """Store three items and check which two a topical query returns."""
    # 1. Arrange
    item1 = EnrichedNewsItemDTO(
        title="Apple announces new M4 chip",
        url="https://example.com/apple-m4",
        content_text="Apple has announced its newest M4 chip for next generation Macs. This processor brings massive AI improvements.",
        source="TechNews",
        timestamp=datetime(2023, 11, 1, 12, 0, tzinfo=timezone.utc),
        relevance_score=9,
        summary_ru="Apple анонсировала новый чип M4.",
        anomalies_detected=["NPU acceleration"],
        category="Competitors",
    )
    item2 = EnrichedNewsItemDTO(
        title="Local bakery makes giant bread",
        url="https://example.com/giant-bread",
        content_text="A bakery in town just baked the world's largest loaf of bread, weighing over 1000 pounds.",
        source="LocalNews",
        timestamp=datetime(2023, 11, 2, 10, 0, tzinfo=timezone.utc),
        relevance_score=2,
        summary_ru="Местная пекарня испекла гигантский хлеб.",
        anomalies_detected=[],
        category="Other",
    )
    item3 = EnrichedNewsItemDTO(
        title="NVIDIA reveals RTX 5090 with WebGPU support",
        url="https://example.com/nvidia-rtx-5090",
        content_text="NVIDIA's new RTX 5090 GPU fully accelerates WebGPU workloads for advanced edge AI applications.",
        source="GPUWeekly",
        timestamp=datetime(2023, 11, 3, 14, 0, tzinfo=timezone.utc),
        relevance_score=10,
        summary_ru="NVIDIA представила RTX 5090 с поддержкой WebGPU.",
        anomalies_detected=["WebGPU", "Edge AI"],
        category="Edge AI",
    )

    # 2. Act
    await chroma_store.store(item1)
    await chroma_store.store(item2)
    await chroma_store.store(item3)

    # Search for AI and chip related news
    search_results = await chroma_store.search("AI processor and GPU", limit=2)

    # 3. Assert
    assert len(search_results) == 2

    # Expected: The Apple M4 chip and NVIDIA RTX 5090 are highly relevant to AI/GPU
    titles = [res.title for res in search_results]
    assert "NVIDIA reveals RTX 5090 with WebGPU support" in titles
    assert "Apple announces new M4 chip" in titles
    assert "Local bakery makes giant bread" not in titles

    # Check that properties are correctly restored for one of the items.
    # The title assertion above guarantees the NVIDIA item is present, so the
    # lookup cannot come up empty and the checks below always run.
    nvidia = next(res for res in search_results if "NVIDIA" in res.title)
    assert nvidia.relevance_score == 10
    assert "WebGPU" in nvidia.anomalies_detected
    assert "Edge AI" in nvidia.anomalies_detected
    assert "NVIDIA's new RTX 5090" in nvidia.content_text
    assert nvidia.source == "GPUWeekly"
    assert nvidia.category == "Edge AI"


@pytest.mark.asyncio
async def test_search_empty_store(chroma_store: ChromaStore):
    """Searching a store with nothing in it returns no results."""
    results = await chroma_store.search("test query", limit=5)
    assert len(results) == 0


@pytest.mark.asyncio
async def test_store_upsert(chroma_store: ChromaStore):
    """Storing an item twice under the same URL updates it instead of duplicating."""
    item1 = EnrichedNewsItemDTO(
        title="Apple announces new M4 chip",
        url="https://example.com/apple-m4",
        content_text="Apple has announced its newest M4 chip for next generation Macs.",
        source="TechNews",
        timestamp=datetime(2023, 11, 1, 12, 0, tzinfo=timezone.utc),
        relevance_score=9,
        summary_ru="Apple анонсировала новый чип M4.",
        anomalies_detected=["NPU acceleration"],
        category="Competitors",
    )

    # Store first time
    await chroma_store.store(item1)

    results = await chroma_store.search("Apple", limit=5)
    assert len(results) == 1
    assert results[0].relevance_score == 9

    # Modify item and store again (same URL, should upsert)
    item1_updated = item1.model_copy()
    item1_updated.relevance_score = 10
    item1_updated.summary_ru = "Apple анонсировала чип M4. Обновлено."

    await chroma_store.store(item1_updated)

    results_updated = await chroma_store.search("Apple", limit=5)

    # Should still be 1 item, but updated
    assert len(results_updated) == 1
    assert results_updated[0].relevance_score == 10
    assert results_updated[0].summary_ru == "Apple анонсировала чип M4. Обновлено."


@pytest.mark.asyncio
async def test_exists(chroma_store: ChromaStore):
    """exists(url) is falsy before the item is stored and truthy afterwards."""
    url = "https://example.com/unique-news-123"

    # Check that it doesn't exist initially
    assert not await chroma_store.exists(url)

    item = EnrichedNewsItemDTO(
        title="Test Title",
        url=url,
        content_text="Test content",
        source="TestSource",
        timestamp=datetime(2023, 11, 1, 12, 0, tzinfo=timezone.utc),
        relevance_score=5,
        summary_ru="Тест",
        anomalies_detected=[],
        category="Other",
    )

    await chroma_store.store(item)

    # Check that it exists now
    assert await chroma_store.exists(url)


@pytest.mark.asyncio
async def test_get_by_id(chroma_store: ChromaStore):
    """get_by_id returns the stored item for its id and None for an unknown id.

    The id is computed here as uuid5(NAMESPACE_URL, url); the test assumes
    ChromaStore derives document ids the same way.
    """
    # 1. Arrange
    url = "https://example.com/get-by-id-test"
    doc_id = str(uuid.uuid5(uuid.NAMESPACE_URL, url))

    item = EnrichedNewsItemDTO(
        title="ID Test Title",
        url=url,
        content_text="ID Test Content",
        source="IDTestSource",
        timestamp=datetime(2023, 11, 1, 12, 0, tzinfo=timezone.utc),
        relevance_score=7,
        summary_ru="Тест по ID",
        anomalies_detected=["TestAnomaly"],
        category="Testing",
    )

    # 2. Act
    await chroma_store.store(item)

    # Try to retrieve by ID
    retrieved_item = await chroma_store.get_by_id(doc_id)

    # Try to retrieve non-existent ID
    none_item = await chroma_store.get_by_id("non-existent-id")

    # 3. Assert
    assert retrieved_item is not None
    assert retrieved_item.title == "ID Test Title"
    assert retrieved_item.url == url
    assert retrieved_item.relevance_score == 7
    assert "TestAnomaly" in retrieved_item.anomalies_detected
    assert retrieved_item.category == "Testing"

    assert none_item is None


@pytest.mark.asyncio
async def test_get_stats(chroma_store: ChromaStore):
    """get_stats reports the total item count and a count per category."""
    # 1. Arrange
    item1 = EnrichedNewsItemDTO(
        title="Title 1",
        url="https://example.com/1",
        content_text="Content 1",
        source="Source 1",
        timestamp=datetime.now(timezone.utc),
        relevance_score=5,
        summary_ru="Сводка 1",
        anomalies_detected=[],
        category="Tech",
    )
    item2 = EnrichedNewsItemDTO(
        title="Title 2",
        url="https://example.com/2",
        content_text="Content 2",
        source="Source 2",
        timestamp=datetime.now(timezone.utc),
        relevance_score=5,
        summary_ru="Сводка 2",
        anomalies_detected=[],
        category="Tech",
    )
    item3 = EnrichedNewsItemDTO(
        title="Title 3",
        url="https://example.com/3",
        content_text="Content 3",
        source="Source 3",
        timestamp=datetime.now(timezone.utc),
        relevance_score=5,
        summary_ru="Сводка 3",
        anomalies_detected=[],
        category="Science",
    )

    # 2. Act
    await chroma_store.store(item1)
    await chroma_store.store(item2)
    await chroma_store.store(item3)

    stats = await chroma_store.get_stats()

    # 3. Assert
    assert stats["total_count"] == 3
    assert stats["category_Tech"] == 2
    assert stats["category_Science"] == 1


@pytest.mark.asyncio
async def test_search_sorting(chroma_store: ChromaStore):
    """Search results are expected in descending relevance_score order."""
    # Arrange
    items = [
        EnrichedNewsItemDTO(
            title=f"Title {i}",
            url=f"https://example.com/{i}",
            content_text=f"Content {i}",
            source="Source",
            timestamp=datetime.now(timezone.utc),
            relevance_score=i,
            summary_ru=f"Сводка {i}",
            anomalies_detected=[],
            category="Tech",
        )
        for i in range(1, 6)  # Scores 1 to 5
    ]

    for item in items:
        await chroma_store.store(item)

    # Act
    results = await chroma_store.search("Content", limit=10)

    # Assert
    assert len(results) == 5

    # Should be sorted 5, 4, 3, 2, 1
    scores = [r.relevance_score for r in results]
    assert scores == [5, 4, 3, 2, 1]