"""Unit tests for ``ChromaStore``.

The Chroma client/collection (and, in the second half, the optional SQLite
store) are replaced with mocks, so these tests only pin how ``ChromaStore``
calls its collaborators and how it maps their return values to DTOs.
"""

import asyncio
import uuid
from datetime import datetime, timezone
from typing import Any, Dict
from unittest.mock import AsyncMock, MagicMock, patch

import pytest

from src.processor.dto import EnrichedNewsItemDTO
from src.storage.chroma_store import ChromaStore


@pytest.fixture
def mock_client():
    """Return a stand-in for the Chroma client."""
    return MagicMock()


@pytest.fixture
def mock_collection():
    """Return a stand-in for the Chroma collection."""
    return MagicMock()


@pytest.fixture
def chroma_store(mock_client, mock_collection):
    """Build a ``ChromaStore`` whose client hands out ``mock_collection``."""
    mock_client.get_or_create_collection.return_value = mock_collection
    return ChromaStore(client=mock_client, collection_name="test_collection")


@pytest.mark.asyncio
async def test_store(chroma_store, mock_collection):
    """store() upserts once, keyed by the UUIDv5 of the item URL."""
    # Arrange
    item = EnrichedNewsItemDTO(
        title="Test Title",
        url="https://example.com/test",
        content_text="Test Content",
        source="Test Source",
        timestamp=datetime(2023, 1, 1, tzinfo=timezone.utc),
        relevance_score=8,
        summary_ru="Тест",
        category="Tech",
        anomalies_detected=["A1", "A2"],
    )
    doc_id = str(uuid.uuid5(uuid.NAMESPACE_URL, item.url))

    # Act
    await chroma_store.store(item)

    # Assert
    mock_collection.upsert.assert_called_once()
    # Only keyword arguments are inspected; positional ones are ignored.
    _, kwargs = mock_collection.upsert.call_args
    assert kwargs['ids'] == [doc_id]
    assert kwargs['documents'] == ["Test Content"]
    assert kwargs['metadatas'][0]['title'] == "Test Title"
    assert kwargs['metadatas'][0]['category'] == "Tech"
    # The anomaly list is expected to be flattened to a comma-joined string.
    assert kwargs['metadatas'][0]['anomalies_detected'] == "A1,A2"


@pytest.mark.asyncio
async def test_get_by_id_found(chroma_store, mock_collection):
    """get_by_id() turns the returned metadata/document into a DTO."""
    # Arrange
    item_id = "some-id"
    mock_collection.get.return_value = {
        "metadatas": [{
            "title": "Title",
            "url": "https://url.com",
            "source": "Source",
            "timestamp": "2023-01-01T00:00:00",
            "relevance_score": 5.0,
            "summary_ru": "Сводка",
            "category": "Cat",
            "anomalies_detected": "A1",
        }],
        "documents": ["Content"],
    }

    # Act
    result = await chroma_store.get_by_id(item_id)

    # Assert
    assert result is not None
    assert result.title == "Title"
    assert result.content_text == "Content"
    assert result.anomalies_detected == ["A1"]
    mock_collection.get.assert_called_once_with(ids=[item_id])


@pytest.mark.asyncio
async def test_get_by_id_not_found(chroma_store, mock_collection):
    """get_by_id() returns None when the collection yields no rows."""
    # Arrange
    mock_collection.get.return_value = {"metadatas": [], "documents": []}

    # Act
    result = await chroma_store.get_by_id("none")

    # Assert
    assert result is None


@pytest.mark.asyncio
async def test_exists(chroma_store, mock_collection):
    """exists() looks the URL up by its UUIDv5-derived document id."""
    # Arrange
    url = "https://example.com"
    doc_id = str(uuid.uuid5(uuid.NAMESPACE_URL, url))
    mock_collection.get.return_value = {"ids": [doc_id]}

    # Act
    exists = await chroma_store.exists(url)

    # Assert
    assert exists is True
    mock_collection.get.assert_called_once_with(ids=[doc_id])


@pytest.mark.asyncio
async def test_get_stats(chroma_store, mock_collection):
    """get_stats() counts per category and tolerates None/partial metadata."""
    # Arrange
    mock_collection.get.return_value = {
        "metadatas": [
            {"category": "Tech"},
            {"category": "Tech"},
            {"category": "Science"},
            None,
            {"other": "data"},
        ]
    }

    # Act
    stats = await chroma_store.get_stats()

    # Assert
    assert stats["total_count"] == 5
    assert stats["category_Tech"] == 2
    assert stats["category_Science"] == 1
    assert stats["category_Uncategorized"] == 1  # for the dict without category


@pytest.mark.asyncio
async def test_get_latest(chroma_store, mock_collection):
    """get_latest() returns newest-first and filters by category."""
    # Arrange
    mock_collection.get.return_value = {
        "metadatas": [
            {"title": "Old", "timestamp": "2023-01-01T00:00:00", "url": "u1", "relevance_score": 1},
            {"title": "New", "timestamp": "2023-01-02T00:00:00", "url": "u2", "relevance_score": 1},
        ],
        "documents": ["doc1", "doc2"],
    }

    # Act
    results = await chroma_store.get_latest(limit=10, category="Tech")

    # Assert
    assert len(results) == 2
    assert results[0].title == "New"
    assert results[1].title == "Old"
    mock_collection.get.assert_called_once_with(
        include=["metadatas", "documents"],
        where={"category": "Tech"},
    )


@pytest.mark.asyncio
async def test_get_top_ranked(chroma_store, mock_collection):
    """get_top_ranked() returns the highest relevance first, capped by limit."""
    # Arrange
    mock_collection.get.return_value = {
        "metadatas": [
            {"title": "Low", "timestamp": "2023-01-01T00:00:00", "url": "u1", "relevance_score": 2},
            {"title": "High", "timestamp": "2023-01-01T00:00:00", "url": "u2", "relevance_score": 10},
        ],
        "documents": ["doc1", "doc2"],
    }

    # Act
    results = await chroma_store.get_top_ranked(limit=1, category="Tech")

    # Assert
    assert len(results) == 1
    assert results[0].title == "High"
    mock_collection.get.assert_called_once_with(
        include=["metadatas", "documents"],
        where={"category": "Tech"},
    )


@pytest.mark.asyncio
async def test_search_hybrid_exact_match_fills_limit(chroma_store, mock_collection):
    """search() skips the semantic query when exact matches fill the limit."""
    # Arrange
    query = "Apple"
    mock_collection.get.return_value = {
        "metadatas": [
            {"title": "Apple M4", "url": "u1", "timestamp": "2023-01-01T00:00:00", "relevance_score": 10},
            {"title": "Apple Vision", "url": "u2", "timestamp": "2023-01-01T00:00:00", "relevance_score": 9},
        ],
        "documents": ["doc1", "doc2"],
    }

    # Act
    results = await chroma_store.search(query, limit=2)

    # Assert
    assert len(results) == 2
    assert results[0].title == "Apple M4"
    assert results[1].title == "Apple Vision"
    mock_collection.get.assert_called_once()
    mock_collection.query.assert_not_called()


@pytest.mark.asyncio
async def test_search_hybrid_falls_back_to_semantic(chroma_store, mock_collection):
    """search() tops up with semantic hits and drops duplicates."""
    # Arrange
    query = "Apple"
    # Exact match finds 1 item
    mock_collection.get.return_value = {
        "metadatas": [
            {"title": "Apple M4", "url": "u1", "timestamp": "2023-01-01T00:00:00", "relevance_score": 10},
        ],
        "documents": ["doc1"],
    }
    # Semantic match finds more items, including the same one
    mock_collection.query.return_value = {
        "metadatas": [[
            {"title": "Apple M4", "url": "u1", "timestamp": "2023-01-01T00:00:00", "relevance_score": 10},
            {"title": "M3 Chip", "url": "u2", "timestamp": "2023-01-01T00:00:00", "relevance_score": 8},
        ]],
        "documents": [["doc1", "doc2"]],
        "distances": [[0.1, 0.5]],
    }

    # Act
    results = await chroma_store.search(query, limit=2)

    # Assert
    assert len(results) == 2
    assert results[0].title == "Apple M4"
    assert results[1].title == "M3 Chip"
    assert mock_collection.get.called
    assert mock_collection.query.called


@pytest.mark.asyncio
async def test_search_with_category_and_threshold(chroma_store, mock_collection):
    """search() forwards the category filter and applies the distance threshold."""
    # Arrange
    query = "AI"
    mock_collection.get.return_value = {"metadatas": [], "documents": []}
    mock_collection.query.return_value = {
        "metadatas": [[
            {"title": "Good match", "url": "u1", "timestamp": "2023-01-01T00:00:00", "relevance_score": 10},
            {"title": "Bad match", "url": "u2", "timestamp": "2023-01-01T00:00:00", "relevance_score": 5},
        ]],
        "documents": [["doc1", "doc2"]],
        "distances": [[0.2, 0.8]],
    }

    # Act
    results = await chroma_store.search(query, limit=5, category="Tech", threshold=0.5)

    # Assert
    # Only the hit with distance 0.2 is expected to survive threshold=0.5.
    assert len(results) == 1
    assert results[0].title == "Good match"
    mock_collection.get.assert_called_with(
        where_document={"$contains": "AI"},
        where={"category": "Tech"},
        limit=5,
        include=["metadatas", "documents"],
    )
    mock_collection.query.assert_called_with(
        query_texts=["AI"],
        n_results=5,
        where={"category": "Tech"},
    )


@pytest.mark.asyncio
async def test_search_exception_handling(chroma_store, mock_collection):
    """search() returns an empty list when both backends raise."""
    # Arrange
    mock_collection.get.side_effect = Exception("Get failed")
    mock_collection.query.side_effect = Exception("Query failed")

    # Act
    results = await chroma_store.search("query")

    # Assert
    assert results == []  # Should not crash


@pytest.mark.asyncio
async def test_search_empty_query(chroma_store, mock_collection):
    """search("") skips the exact-match lookup and queries with "*"."""
    # Arrange
    mock_collection.get.return_value = {"metadatas": [], "documents": []}
    mock_collection.query.return_value = {"metadatas": [[]], "documents": [[]], "distances": [[]]}

    # Act
    await chroma_store.search("")

    # Assert
    mock_collection.get.assert_not_called()
    mock_collection.query.assert_called_with(
        query_texts=["*"],
        n_results=5,
        where=None,
    )


# =============================================================================
# Tests for SQLiteStore integration
# =============================================================================

@pytest.fixture
def mock_sqlite_store():
    """Return an async stand-in for the SQLite store."""
    return AsyncMock()


@pytest.fixture
def chroma_store_with_sqlite(mock_client, mock_collection, mock_sqlite_store):
    """Build a ``ChromaStore`` that is also given ``mock_sqlite_store``."""
    mock_client.get_or_create_collection.return_value = mock_collection
    return ChromaStore(
        client=mock_client,
        collection_name="test_collection",
        sqlite_store=mock_sqlite_store,
    )


@pytest.mark.asyncio
async def test_get_latest_delegates_to_sqlite_store(chroma_store_with_sqlite, mock_sqlite_store):
    """get_latest() delegates to the SQLite store and converts rows to DTOs."""
    # Arrange
    ts = datetime(2024, 1, 15, tzinfo=timezone.utc)
    mock_sqlite_store.get_latest.return_value = [
        {
            "title": "Latest1", "url": "u1", "content_text": "c1", "source": "s1",
            "timestamp": ts, "relevance_score": 8, "summary_ru": "sum1",
            "category": "Tech", "anomalies_detected": ["A1"],
        },
        {
            "title": "Latest2", "url": "u2", "content_text": "c2", "source": "s2",
            "timestamp": ts, "relevance_score": 7, "summary_ru": "sum2",
            "category": "Tech", "anomalies_detected": [],
        },
    ]

    # Act
    results = await chroma_store_with_sqlite.get_latest(limit=10, category="Tech")

    # Assert
    mock_sqlite_store.get_latest.assert_called_once_with(limit=10, category="Tech")
    assert len(results) == 2
    assert results[0].title == "Latest1"
    assert results[0].relevance_score == 8
    assert results[0].anomalies_detected == ["A1"]
    assert results[1].title == "Latest2"
    assert results[1].anomalies_detected == []


@pytest.mark.asyncio
async def test_get_latest_fallback_when_no_sqlite_store(chroma_store, mock_collection):
    """get_latest() reads from the collection when no SQLite store is set."""
    # Arrange
    mock_collection.get.return_value = {
        "metadatas": [
            {
                "title": "Chroma Latest", "timestamp": "2024-01-01T00:00:00", "url": "u1",
                "relevance_score": 5, "source": "src", "category": "Tech",
            },
        ],
        "documents": ["content"],
    }

    # Act
    results = await chroma_store.get_latest(limit=10)

    # Assert
    assert len(results) == 1
    assert results[0].title == "Chroma Latest"


@pytest.mark.asyncio
async def test_get_top_ranked_delegates_to_sqlite_store(chroma_store_with_sqlite, mock_sqlite_store):
    """get_top_ranked() delegates to the SQLite store with category=None."""
    # Arrange
    ts = datetime(2024, 1, 15, tzinfo=timezone.utc)
    mock_sqlite_store.get_top_ranked.return_value = [
        {
            "title": "Top1", "url": "u1", "content_text": "c1", "source": "s1",
            "timestamp": ts, "relevance_score": 10, "summary_ru": "sum1",
            "category": "Tech", "anomalies_detected": [],
        },
        {
            "title": "Top2", "url": "u2", "content_text": "c2", "source": "s2",
            "timestamp": ts, "relevance_score": 9, "summary_ru": "sum2",
            "category": "Tech", "anomalies_detected": ["A2"],
        },
    ]

    # Act
    results = await chroma_store_with_sqlite.get_top_ranked(limit=5)

    # Assert
    mock_sqlite_store.get_top_ranked.assert_called_once_with(limit=5, category=None)
    assert len(results) == 2
    assert results[0].title == "Top1"
    assert results[0].relevance_score == 10
    assert results[1].title == "Top2"
    assert results[1].anomalies_detected == ["A2"]


@pytest.mark.asyncio
async def test_get_top_ranked_fallback_when_no_sqlite_store(chroma_store, mock_collection):
    """get_top_ranked() reads from the collection when no SQLite store is set."""
    # Arrange
    mock_collection.get.return_value = {
        "metadatas": [
            {
                "title": "Chroma Top", "timestamp": "2024-01-01T00:00:00", "url": "u1",
                "relevance_score": 10, "source": "src", "category": "Tech",
            },
        ],
        "documents": ["content"],
    }

    # Act
    results = await chroma_store.get_top_ranked(limit=10)

    # Assert
    assert len(results) == 1
    assert results[0].title == "Chroma Top"


@pytest.mark.asyncio
async def test_get_stats_delegates_to_sqlite_store(chroma_store_with_sqlite, mock_sqlite_store):
    """get_stats() delegates to the SQLite store and flattens category counts."""
    # Arrange
    mock_sqlite_store.get_stats.return_value = {
        "total_count": 100,
        "category_counts": {"Tech": 60, "Science": 40},
        "source_counts": {"src1": 70, "src2": 30},
        "anomaly_counts": {"A1": 15, "A2": 5},
        "last_updated": datetime(2024, 1, 15, tzinfo=timezone.utc),
    }

    # Act
    stats = await chroma_store_with_sqlite.get_stats()

    # Assert
    mock_sqlite_store.get_stats.assert_called_once_with(use_cache=True)
    assert stats["total_count"] == 100
    assert stats["category_Tech"] == 60
    assert stats["category_Science"] == 40


@pytest.mark.asyncio
async def test_get_stats_fallback_when_no_sqlite_store(chroma_store, mock_collection):
    """get_stats() counts from collection metadata when no SQLite store is set."""
    # Arrange
    mock_collection.get.return_value = {
        "metadatas": [
            {"category": "Tech"},
            {"category": "Tech"},
            {"category": "Science"},
        ]
    }

    # Act
    stats = await chroma_store.get_stats()

    # Assert
    assert stats["total_count"] == 3
    assert stats["category_Tech"] == 2
    assert stats["category_Science"] == 1


@pytest.mark.asyncio
async def test_dict_to_dto_handles_integer_timestamp():
    """_dict_to_dto() accepts an integer Unix timestamp."""
    # Arrange
    store = ChromaStore(client=MagicMock(), collection_name="test")
    item_dict = {
        "title": "Test",
        "url": "http://test.com",
        "content_text": "Content",
        "source": "Source",
        "timestamp": 1705312800,  # Unix timestamp as int
        "relevance_score": 7,
        "summary_ru": "Summary",
        "category": "Tech",
        "anomalies_detected": ["A1", "A2"],
    }

    # Act
    dto = store._dict_to_dto(item_dict)

    # Assert
    assert dto.timestamp.year == 2024
    assert dto.timestamp.month == 1
    assert dto.anomalies_detected == ["A1", "A2"]


@pytest.mark.asyncio
async def test_dict_to_dto_handles_string_timestamp():
    """_dict_to_dto() accepts an ISO-8601 timestamp string."""
    # Arrange
    store = ChromaStore(client=MagicMock(), collection_name="test")
    item_dict = {
        "title": "Test",
        "url": "http://test.com",
        "content_text": "Content",
        "source": "Source",
        "timestamp": "2024-01-15T12:00:00",
        "relevance_score": 7,
        "summary_ru": "Summary",
        "category": "Tech",
        "anomalies_detected": [],
    }

    # Act
    dto = store._dict_to_dto(item_dict)

    # Assert
    assert dto.timestamp.year == 2024
    assert dto.timestamp.month == 1
    assert dto.timestamp.day == 15


@pytest.mark.asyncio
async def test_dict_to_dto_handles_string_anomalies():
    """_dict_to_dto() splits a comma-joined anomalies string into a list."""
    # Arrange
    store = ChromaStore(client=MagicMock(), collection_name="test")
    item_dict = {
        "title": "Test",
        "url": "http://test.com",
        "content_text": "Content",
        "source": "Source",
        "timestamp": datetime(2024, 1, 15, tzinfo=timezone.utc),
        "relevance_score": 7,
        "summary_ru": "Summary",
        "category": "Tech",
        "anomalies_detected": "A1,A2,A3",  # String instead of list
    }

    # Act
    dto = store._dict_to_dto(item_dict)

    # Assert
    assert dto.anomalies_detected == ["A1", "A2", "A3"]


@pytest.mark.asyncio
async def test_dict_to_dto_handles_empty_anomalies():
    """_dict_to_dto() maps a None anomalies value to an empty list."""
    # Arrange
    store = ChromaStore(client=MagicMock(), collection_name="test")
    item_dict = {
        "title": "Test",
        "url": "http://test.com",
        "content_text": "Content",
        "source": "Source",
        "timestamp": datetime(2024, 1, 15, tzinfo=timezone.utc),
        "relevance_score": 7,
        "summary_ru": "Summary",
        "category": "Tech",
        "anomalies_detected": None,
    }

    # Act
    dto = store._dict_to_dto(item_dict)

    # Assert
    assert dto.anomalies_detected == []