AI-Trend-Scout/tests/storage/test_chroma_store.py

262 lines
8.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import pytest
import pytest_asyncio
import uuid
from datetime import datetime, timezone
import chromadb
from chromadb.config import Settings
from src.processor.dto import EnrichedNewsItemDTO
from src.storage.chroma_store import ChromaStore
@pytest_asyncio.fixture
async def chroma_store():
    """Yield a ChromaStore bound to a freshly reset in-memory Chroma client.

    The client is reset once before the store is created and once more
    after the consuming test has finished with it.
    """
    # EphemeralClient is used for in-memory testing; allow_reset=True is
    # passed so that reset() can be called on it.
    in_memory_client = chromadb.EphemeralClient(Settings(allow_reset=True))
    in_memory_client.reset()
    yield ChromaStore(client=in_memory_client, collection_name="test_collection")
    # Teardown: wipe whatever the test wrote.
    in_memory_client.reset()
@pytest.mark.asyncio
async def test_store_and_search(chroma_store: ChromaStore):
    """Store three items and check a two-result search for AI/GPU content.

    The Apple and NVIDIA items are expected back, the bakery item is not,
    and the NVIDIA result must carry the fields it was stored with.
    """
    # Arrange: two AI/hardware stories and one unrelated story.
    apple_item = EnrichedNewsItemDTO(
        title="Apple announces new M4 chip",
        url="https://example.com/apple-m4",
        content_text="Apple has announced its newest M4 chip for next generation Macs. This processor brings massive AI improvements.",
        source="TechNews",
        timestamp=datetime(2023, 11, 1, 12, 0, tzinfo=timezone.utc),
        relevance_score=9,
        summary_ru="Apple анонсировала новый чип M4.",
        anomalies_detected=["NPU acceleration"],
        category="Competitors",
    )
    bakery_item = EnrichedNewsItemDTO(
        title="Local bakery makes giant bread",
        url="https://example.com/giant-bread",
        content_text="A bakery in town just baked the world's largest loaf of bread, weighing over 1000 pounds.",
        source="LocalNews",
        timestamp=datetime(2023, 11, 2, 10, 0, tzinfo=timezone.utc),
        relevance_score=2,
        summary_ru="Местная пекарня испекла гигантский хлеб.",
        anomalies_detected=[],
        category="Other",
    )
    nvidia_item = EnrichedNewsItemDTO(
        title="NVIDIA reveals RTX 5090 with WebGPU support",
        url="https://example.com/nvidia-rtx-5090",
        content_text="NVIDIA's new RTX 5090 GPU fully accelerates WebGPU workloads for advanced edge AI applications.",
        source="GPUWeekly",
        timestamp=datetime(2023, 11, 3, 14, 0, tzinfo=timezone.utc),
        relevance_score=10,
        summary_ru="NVIDIA представила RTX 5090 с поддержкой WebGPU.",
        anomalies_detected=["WebGPU", "Edge AI"],
        category="Edge AI",
    )

    # Act: persist the items in order, then query for AI and chip related news.
    for dto in (apple_item, bakery_item, nvidia_item):
        await chroma_store.store(dto)
    hits = await chroma_store.search("AI processor and GPU", limit=2)

    # Assert: exactly the two AI/GPU stories come back.
    assert len(hits) == 2
    returned_titles = [hit.title for hit in hits]
    assert "NVIDIA reveals RTX 5090 with WebGPU support" in returned_titles
    assert "Apple announces new M4 chip" in returned_titles
    assert "Local bakery makes giant bread" not in returned_titles

    # The NVIDIA result must have had its properties restored correctly.
    nvidia_hits = [hit for hit in hits if "NVIDIA" in hit.title]
    for hit in nvidia_hits:
        assert hit.relevance_score == 10
        assert "WebGPU" in hit.anomalies_detected
        assert "Edge AI" in hit.anomalies_detected
        assert "NVIDIA's new RTX 5090" in hit.content_text
        assert hit.source == "GPUWeekly"
        assert hit.category == "Edge AI"
@pytest.mark.asyncio
async def test_search_empty_store(chroma_store: ChromaStore):
    """Searching a store that has had nothing stored yields zero results."""
    hits = await chroma_store.search("test query", limit=5)
    assert len(hits) == 0
@pytest.mark.asyncio
async def test_store_upsert(chroma_store: ChromaStore):
    """Storing an item twice under the same URL updates it instead of duplicating it."""
    original = EnrichedNewsItemDTO(
        title="Apple announces new M4 chip",
        url="https://example.com/apple-m4",
        content_text="Apple has announced its newest M4 chip for next generation Macs.",
        source="TechNews",
        timestamp=datetime(2023, 11, 1, 12, 0, tzinfo=timezone.utc),
        relevance_score=9,
        summary_ru="Apple анонсировала новый чип M4.",
        anomalies_detected=["NPU acceleration"],
        category="Competitors",
    )

    # First write: a single entry carrying the initial score.
    await chroma_store.store(original)
    first_hits = await chroma_store.search("Apple", limit=5)
    assert len(first_hits) == 1
    assert first_hits[0].relevance_score == 9

    # Second write: same URL, changed score and summary.
    revised = original.model_copy()
    revised.relevance_score = 10
    revised.summary_ru = "Apple анонсировала чип M4. Обновлено."
    await chroma_store.store(revised)

    # Still exactly one entry, now reflecting the revised values.
    second_hits = await chroma_store.search("Apple", limit=5)
    assert len(second_hits) == 1
    only_hit = second_hits[0]
    assert only_hit.relevance_score == 10
    assert only_hit.summary_ru == "Apple анонсировала чип M4. Обновлено."
@pytest.mark.asyncio
async def test_exists(chroma_store: ChromaStore):
    """exists() is falsy for a URL before it is stored and truthy afterwards."""
    target_url = "https://example.com/unique-news-123"

    # Nothing has been stored yet, so the URL must not be reported.
    present_before = await chroma_store.exists(target_url)
    assert not present_before

    news_item = EnrichedNewsItemDTO(
        title="Test Title",
        url=target_url,
        content_text="Test content",
        source="TestSource",
        timestamp=datetime(2023, 11, 1, 12, 0, tzinfo=timezone.utc),
        relevance_score=5,
        summary_ru="Тест",
        anomalies_detected=[],
        category="Other",
    )
    await chroma_store.store(news_item)

    # After storing, the same URL must be reported as present.
    present_after = await chroma_store.exists(target_url)
    assert present_after
@pytest.mark.asyncio
async def test_get_by_id(chroma_store: ChromaStore):
    """get_by_id() returns the stored item for its id and None for an unknown id."""
    # Arrange: the lookup id is the UUIDv5 (URL namespace) of the item's URL.
    # NOTE(review): presumably this mirrors how ChromaStore derives ids; confirm.
    item_url = "https://example.com/get-by-id-test"
    expected_id = str(uuid.uuid5(uuid.NAMESPACE_URL, item_url))
    stored_item = EnrichedNewsItemDTO(
        title="ID Test Title",
        url=item_url,
        content_text="ID Test Content",
        source="IDTestSource",
        timestamp=datetime(2023, 11, 1, 12, 0, tzinfo=timezone.utc),
        relevance_score=7,
        summary_ru="Тест по ID",
        anomalies_detected=["TestAnomaly"],
        category="Testing",
    )

    # Act: store the item, then look up a known id and an unknown one.
    await chroma_store.store(stored_item)
    found = await chroma_store.get_by_id(expected_id)
    missing = await chroma_store.get_by_id("non-existent-id")

    # Assert: the known id returns the item with its fields intact.
    assert found is not None
    assert found.title == "ID Test Title"
    assert found.url == item_url
    assert found.relevance_score == 7
    assert "TestAnomaly" in found.anomalies_detected
    assert found.category == "Testing"
    # The unknown id returns nothing.
    assert missing is None
@pytest.mark.asyncio
async def test_get_stats(chroma_store: ChromaStore):
    """get_stats() reports the total item count and a per-category count."""
    # Arrange: items 1 and 2 are "Tech", item 3 is "Science".
    news_items = [
        EnrichedNewsItemDTO(
            title=f"Title {number}",
            url=f"https://example.com/{number}",
            content_text=f"Content {number}",
            source=f"Source {number}",
            timestamp=datetime.now(timezone.utc),
            relevance_score=5,
            summary_ru=f"Сводка {number}",
            anomalies_detected=[],
            category=category,
        )
        for number, category in enumerate(("Tech", "Tech", "Science"), start=1)
    ]

    # Act
    for news_item in news_items:
        await chroma_store.store(news_item)
    stats = await chroma_store.get_stats()

    # Assert
    expected_counts = {
        "total_count": 3,
        "category_Tech": 2,
        "category_Science": 1,
    }
    for key, expected in expected_counts.items():
        assert stats[key] == expected
@pytest.mark.asyncio
async def test_search_sorting(chroma_store: ChromaStore):
    """search() returns results ordered by relevance_score, highest first."""
    # Arrange: five items whose relevance scores run from 1 to 5.
    news_items = []
    for score in range(1, 6):
        news_items.append(
            EnrichedNewsItemDTO(
                title=f"Title {score}",
                url=f"https://example.com/{score}",
                content_text=f"Content {score}",
                source="Source",
                timestamp=datetime.now(timezone.utc),
                relevance_score=score,
                summary_ru=f"Сводка {score}",
                anomalies_detected=[],
                category="Tech",
            )
        )
    for news_item in news_items:
        await chroma_store.store(news_item)

    # Act
    hits = await chroma_store.search("Content", limit=10)

    # Assert: all five come back, in descending score order (5, 4, 3, 2, 1).
    assert len(hits) == 5
    returned_scores = [hit.relevance_score for hit in hits]
    assert returned_scores == list(range(5, 0, -1))