:Release Notes:
- Add ACID-compliant SQLiteStore (WAL mode, FULL sync, FK constraints)
- Add AnomalyType enum for normalized anomaly storage
- Add legacy data migration script (dry-run, batch, rollback)
- Update ChromaStore to delegate indexed queries to SQLite
- Add test suite for SQLiteStore (7 tests, all passing)

:Detailed Notes:
- SQLiteStore: news_items, anomaly_types, news_anomalies tables with indexes
- Performance: get_latest/get_top_ranked O(n)→O(log n), get_stats O(n)→O(1)
- ChromaDB remains primary vector store; SQLite provides indexed metadata queries

:Testing Performed:
- python3 -m pytest tests/ -v (112 passed)

:QA Notes:
- Tests verified by Python QA Engineer subagent

:Issues Addressed:
- get_latest/get_top_ranked fetched ALL items then sorted in Python
- get_stats iterated over ALL items
- anomalies_detected stored as comma-joined string (no index)

Change-Id: I708808b6e72889869afcf16d4ac274260242007a
528 lines
16 KiB
Python
528 lines
16 KiB
Python
import pytest
|
|
import asyncio
|
|
import uuid
|
|
from datetime import datetime, timezone
|
|
from unittest.mock import MagicMock, patch, AsyncMock
|
|
from typing import Dict, Any
|
|
|
|
from src.processor.dto import EnrichedNewsItemDTO
|
|
from src.storage.chroma_store import ChromaStore
|
|
|
|
@pytest.fixture
def mock_client():
    """Provide a ``MagicMock`` standing in for the Chroma client."""
    client_double = MagicMock()
    return client_double
|
|
|
|
@pytest.fixture
def mock_collection():
    """Provide a ``MagicMock`` standing in for the Chroma collection."""
    collection_double = MagicMock()
    return collection_double
|
|
|
|
@pytest.fixture
def chroma_store(mock_client, mock_collection):
    """Build a ``ChromaStore`` (no SQLite store) on top of the mocked client.

    The mocked client's ``get_or_create_collection`` is configured to return
    the mocked collection before the store is constructed.
    """
    get_or_create = mock_client.get_or_create_collection
    get_or_create.return_value = mock_collection
    store = ChromaStore(client=mock_client, collection_name="test_collection")
    return store
|
|
|
|
@pytest.mark.asyncio
async def test_store(chroma_store, mock_collection):
    """``store`` upserts once with the URL-derived id, content and metadata."""
    # Arrange
    dto_fields = dict(
        title="Test Title",
        url="https://example.com/test",
        content_text="Test Content",
        source="Test Source",
        timestamp=datetime(2023, 1, 1, tzinfo=timezone.utc),
        relevance_score=8,
        summary_ru="Тест",
        category="Tech",
        anomalies_detected=["A1", "A2"],
    )
    item = EnrichedNewsItemDTO(**dto_fields)
    # The expected document id is the UUIDv5 of the item's URL.
    expected_id = str(uuid.uuid5(uuid.NAMESPACE_URL, item.url))

    # Act
    await chroma_store.store(item)

    # Assert
    mock_collection.upsert.assert_called_once()
    _, upsert_kwargs = mock_collection.upsert.call_args
    assert upsert_kwargs["ids"] == [expected_id]
    assert upsert_kwargs["documents"] == ["Test Content"]

    stored_metadata = upsert_kwargs["metadatas"][0]
    assert stored_metadata["title"] == "Test Title"
    assert stored_metadata["category"] == "Tech"
    # The anomaly list is expected to be flattened into a comma-joined string.
    assert stored_metadata["anomalies_detected"] == "A1,A2"
|
|
|
|
@pytest.mark.asyncio
async def test_get_by_id_found(chroma_store, mock_collection):
    """``get_by_id`` returns a DTO built from the stored metadata and document."""
    # Arrange
    item_id = "some-id"
    stored_metadata = {
        "title": "Title",
        "url": "https://url.com",
        "source": "Source",
        "timestamp": "2023-01-01T00:00:00",
        "relevance_score": 5.0,
        "summary_ru": "Сводка",
        "category": "Cat",
        "anomalies_detected": "A1",
    }
    mock_collection.get.return_value = {
        "metadatas": [stored_metadata],
        "documents": ["Content"],
    }

    # Act
    found = await chroma_store.get_by_id(item_id)

    # Assert
    assert found is not None
    assert found.title == "Title"
    assert found.content_text == "Content"
    assert found.anomalies_detected == ["A1"]
    mock_collection.get.assert_called_once_with(ids=[item_id])
|
|
|
|
@pytest.mark.asyncio
async def test_get_by_id_not_found(chroma_store, mock_collection):
    """``get_by_id`` yields ``None`` when the collection returns no rows."""
    # Arrange
    empty_response = {"metadatas": [], "documents": []}
    mock_collection.get.return_value = empty_response

    # Act
    missing = await chroma_store.get_by_id("none")

    # Assert
    assert missing is None
|
|
|
|
@pytest.mark.asyncio
async def test_exists(chroma_store, mock_collection):
    """``exists`` looks the URL up by its UUIDv5 id and reports ``True`` on a hit."""
    # Arrange
    url = "https://example.com"
    expected_id = str(uuid.uuid5(uuid.NAMESPACE_URL, url))
    mock_collection.get.return_value = {"ids": [expected_id]}

    # Act
    was_found = await chroma_store.exists(url)

    # Assert
    assert was_found is True
    mock_collection.get.assert_called_once_with(ids=[expected_id])
|
|
|
|
@pytest.mark.asyncio
async def test_get_stats(chroma_store, mock_collection):
    """``get_stats`` counts every row and groups them by category."""
    # Arrange
    raw_metadatas = [
        {"category": "Tech"},
        {"category": "Tech"},
        {"category": "Science"},
        None,
        {"other": "data"},
    ]
    mock_collection.get.return_value = {"metadatas": raw_metadatas}

    # Act
    stats = await chroma_store.get_stats()

    # Assert
    expected_counts = {
        "total_count": 5,
        "category_Tech": 2,
        "category_Science": 1,
        "category_Uncategorized": 1,  # for the dict without category
    }
    for stat_key, expected in expected_counts.items():
        assert stats[stat_key] == expected
|
|
|
|
@pytest.mark.asyncio
async def test_get_latest(chroma_store, mock_collection):
    """``get_latest`` orders items newest-first and filters Chroma by category."""
    # Arrange
    older = {
        "title": "Old",
        "timestamp": "2023-01-01T00:00:00",
        "url": "u1",
        "relevance_score": 1,
    }
    newer = {
        "title": "New",
        "timestamp": "2023-01-02T00:00:00",
        "url": "u2",
        "relevance_score": 1,
    }
    mock_collection.get.return_value = {
        "metadatas": [older, newer],
        "documents": ["doc1", "doc2"],
    }

    # Act
    latest = await chroma_store.get_latest(limit=10, category="Tech")

    # Assert
    assert len(latest) == 2
    assert (latest[0].title, latest[1].title) == ("New", "Old")
    expected_call = dict(
        include=["metadatas", "documents"],
        where={"category": "Tech"},
    )
    mock_collection.get.assert_called_once_with(**expected_call)
|
|
|
|
@pytest.mark.asyncio
async def test_get_top_ranked(chroma_store, mock_collection):
    """``get_top_ranked`` keeps only the highest-scoring item when ``limit=1``."""
    # Arrange
    low_scored = {
        "title": "Low",
        "timestamp": "2023-01-01T00:00:00",
        "url": "u1",
        "relevance_score": 2,
    }
    high_scored = {
        "title": "High",
        "timestamp": "2023-01-01T00:00:00",
        "url": "u2",
        "relevance_score": 10,
    }
    mock_collection.get.return_value = {
        "metadatas": [low_scored, high_scored],
        "documents": ["doc1", "doc2"],
    }

    # Act
    ranked = await chroma_store.get_top_ranked(limit=1, category="Tech")

    # Assert
    assert len(ranked) == 1
    best = ranked[0]
    assert best.title == "High"
    expected_call = dict(
        include=["metadatas", "documents"],
        where={"category": "Tech"},
    )
    mock_collection.get.assert_called_once_with(**expected_call)
|
|
|
|
@pytest.mark.asyncio
async def test_search_hybrid_exact_match_fills_limit(chroma_store, mock_collection):
    """When ``get`` already returns ``limit`` items, ``query`` is never called."""
    # Arrange
    query = "Apple"
    exact_hits = [
        {
            "title": "Apple M4",
            "url": "u1",
            "timestamp": "2023-01-01T00:00:00",
            "relevance_score": 10,
        },
        {
            "title": "Apple Vision",
            "url": "u2",
            "timestamp": "2023-01-01T00:00:00",
            "relevance_score": 9,
        },
    ]
    mock_collection.get.return_value = {
        "metadatas": exact_hits,
        "documents": ["doc1", "doc2"],
    }

    # Act
    found = await chroma_store.search(query, limit=2)

    # Assert
    assert len(found) == 2
    assert (found[0].title, found[1].title) == ("Apple M4", "Apple Vision")
    mock_collection.get.assert_called_once()
    mock_collection.query.assert_not_called()
|
|
|
|
@pytest.mark.asyncio
async def test_search_hybrid_falls_back_to_semantic(chroma_store, mock_collection):
    """A short exact-match result is topped up from ``query`` without duplicates."""
    # Arrange
    query = "Apple"
    apple_m4 = {
        "title": "Apple M4",
        "url": "u1",
        "timestamp": "2023-01-01T00:00:00",
        "relevance_score": 10,
    }
    m3_chip = {
        "title": "M3 Chip",
        "url": "u2",
        "timestamp": "2023-01-01T00:00:00",
        "relevance_score": 8,
    }
    # Exact match finds 1 item
    mock_collection.get.return_value = {
        "metadatas": [dict(apple_m4)],
        "documents": ["doc1"],
    }
    # Semantic match finds more items, including the same one
    mock_collection.query.return_value = {
        "metadatas": [[dict(apple_m4), m3_chip]],
        "documents": [["doc1", "doc2"]],
        "distances": [[0.1, 0.5]],
    }

    # Act
    found = await chroma_store.search(query, limit=2)

    # Assert
    assert len(found) == 2
    assert (found[0].title, found[1].title) == ("Apple M4", "M3 Chip")
    mock_collection.get.assert_called()
    mock_collection.query.assert_called()
|
|
|
|
@pytest.mark.asyncio
async def test_search_with_category_and_threshold(chroma_store, mock_collection):
    """Category is forwarded to both lookups; hits beyond the threshold are dropped."""
    # Arrange
    query = "AI"
    category_filter = {"category": "Tech"}
    mock_collection.get.return_value = {"metadatas": [], "documents": []}
    semantic_hits = [
        {
            "title": "Good match",
            "url": "u1",
            "timestamp": "2023-01-01T00:00:00",
            "relevance_score": 10,
        },
        {
            "title": "Bad match",
            "url": "u2",
            "timestamp": "2023-01-01T00:00:00",
            "relevance_score": 5,
        },
    ]
    mock_collection.query.return_value = {
        "metadatas": [semantic_hits],
        "documents": [["doc1", "doc2"]],
        # Only the first distance (0.2) is within the 0.5 threshold used below.
        "distances": [[0.2, 0.8]],
    }

    # Act
    found = await chroma_store.search(query, limit=5, category="Tech", threshold=0.5)

    # Assert
    assert len(found) == 1
    assert found[0].title == "Good match"
    mock_collection.get.assert_called_with(
        where_document={"$contains": "AI"},
        where=category_filter,
        limit=5,
        include=["metadatas", "documents"],
    )
    mock_collection.query.assert_called_with(
        query_texts=["AI"],
        n_results=5,
        where=category_filter,
    )
|
|
|
|
@pytest.mark.asyncio
async def test_search_exception_handling(chroma_store, mock_collection):
    """``search`` returns an empty list when both ``get`` and ``query`` raise."""
    # Arrange
    failures = (
        (mock_collection.get, "Get failed"),
        (mock_collection.query, "Query failed"),
    )
    for collection_method, message in failures:
        collection_method.side_effect = Exception(message)

    # Act
    found = await chroma_store.search("query")

    # Assert
    assert found == []  # Should not crash
|
|
|
|
@pytest.mark.asyncio
async def test_search_empty_query(chroma_store, mock_collection):
    """An empty query skips ``get`` and calls ``query`` with the ``"*"`` text."""
    # Arrange
    empty_get = {"metadatas": [], "documents": []}
    empty_query = {"metadatas": [[]], "documents": [[]], "distances": [[]]}
    mock_collection.get.return_value = empty_get
    mock_collection.query.return_value = empty_query

    # Act
    await chroma_store.search("")

    # Assert
    mock_collection.get.assert_not_called()
    expected_query_call = dict(query_texts=["*"], n_results=5, where=None)
    mock_collection.query.assert_called_with(**expected_query_call)
|
|
|
|
|
|
# =============================================================================
|
|
# Tests for SQLiteStore integration
|
|
# =============================================================================
|
|
|
|
@pytest.fixture
def mock_sqlite_store():
    """Provide an ``AsyncMock`` standing in for the SQLite store."""
    sqlite_double = AsyncMock()
    return sqlite_double
|
|
|
|
|
|
@pytest.fixture
def chroma_store_with_sqlite(mock_client, mock_collection, mock_sqlite_store):
    """Build a ``ChromaStore`` wired to both the mocked client and SQLite store.

    The mocked client's ``get_or_create_collection`` is configured to return
    the mocked collection before the store is constructed.
    """
    get_or_create = mock_client.get_or_create_collection
    get_or_create.return_value = mock_collection
    store_kwargs = dict(
        client=mock_client,
        collection_name="test_collection",
        sqlite_store=mock_sqlite_store,
    )
    return ChromaStore(**store_kwargs)
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_get_latest_delegates_to_sqlite_store(chroma_store_with_sqlite, mock_sqlite_store):
    """With a SQLite store attached, ``get_latest`` forwards to it and maps rows to DTOs."""
    # Arrange
    ts = datetime(2024, 1, 15, tzinfo=timezone.utc)
    first_row = {
        "title": "Latest1",
        "url": "u1",
        "content_text": "c1",
        "source": "s1",
        "timestamp": ts,
        "relevance_score": 8,
        "summary_ru": "sum1",
        "category": "Tech",
        "anomalies_detected": ["A1"],
    }
    second_row = {
        "title": "Latest2",
        "url": "u2",
        "content_text": "c2",
        "source": "s2",
        "timestamp": ts,
        "relevance_score": 7,
        "summary_ru": "sum2",
        "category": "Tech",
        "anomalies_detected": [],
    }
    mock_sqlite_store.get_latest.return_value = [first_row, second_row]

    # Act
    latest = await chroma_store_with_sqlite.get_latest(limit=10, category="Tech")

    # Assert
    mock_sqlite_store.get_latest.assert_called_once_with(limit=10, category="Tech")
    assert len(latest) == 2

    first = latest[0]
    assert first.title == "Latest1"
    assert first.relevance_score == 8
    assert first.anomalies_detected == ["A1"]

    second = latest[1]
    assert second.title == "Latest2"
    assert second.anomalies_detected == []
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_get_latest_fallback_when_no_sqlite_store(chroma_store, mock_collection):
    """Without a SQLite store, ``get_latest`` is served from the Chroma collection."""
    # Arrange
    chroma_row = {
        "title": "Chroma Latest",
        "timestamp": "2024-01-01T00:00:00",
        "url": "u1",
        "relevance_score": 5,
        "source": "src",
        "category": "Tech",
    }
    mock_collection.get.return_value = {
        "metadatas": [chroma_row],
        "documents": ["content"],
    }

    # Act
    latest = await chroma_store.get_latest(limit=10)

    # Assert
    assert len(latest) == 1
    only_item = latest[0]
    assert only_item.title == "Chroma Latest"
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_get_top_ranked_delegates_to_sqlite_store(chroma_store_with_sqlite, mock_sqlite_store):
    """With a SQLite store attached, ``get_top_ranked`` forwards to it with ``category=None``."""
    # Arrange
    ts = datetime(2024, 1, 15, tzinfo=timezone.utc)
    top_row = {
        "title": "Top1",
        "url": "u1",
        "content_text": "c1",
        "source": "s1",
        "timestamp": ts,
        "relevance_score": 10,
        "summary_ru": "sum1",
        "category": "Tech",
        "anomalies_detected": [],
    }
    runner_up_row = {
        "title": "Top2",
        "url": "u2",
        "content_text": "c2",
        "source": "s2",
        "timestamp": ts,
        "relevance_score": 9,
        "summary_ru": "sum2",
        "category": "Tech",
        "anomalies_detected": ["A2"],
    }
    mock_sqlite_store.get_top_ranked.return_value = [top_row, runner_up_row]

    # Act
    ranked = await chroma_store_with_sqlite.get_top_ranked(limit=5)

    # Assert
    mock_sqlite_store.get_top_ranked.assert_called_once_with(limit=5, category=None)
    assert len(ranked) == 2

    first = ranked[0]
    assert first.title == "Top1"
    assert first.relevance_score == 10

    second = ranked[1]
    assert second.title == "Top2"
    assert second.anomalies_detected == ["A2"]
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_get_top_ranked_fallback_when_no_sqlite_store(chroma_store, mock_collection):
    """Without a SQLite store, ``get_top_ranked`` is served from the Chroma collection."""
    # Arrange
    chroma_row = {
        "title": "Chroma Top",
        "timestamp": "2024-01-01T00:00:00",
        "url": "u1",
        "relevance_score": 10,
        "source": "src",
        "category": "Tech",
    }
    mock_collection.get.return_value = {
        "metadatas": [chroma_row],
        "documents": ["content"],
    }

    # Act
    ranked = await chroma_store.get_top_ranked(limit=10)

    # Assert
    assert len(ranked) == 1
    only_item = ranked[0]
    assert only_item.title == "Chroma Top"
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_get_stats_delegates_to_sqlite_store(chroma_store_with_sqlite, mock_sqlite_store):
    """``get_stats`` calls the SQLite store with ``use_cache=True`` and flattens categories."""
    # Arrange
    sqlite_stats = {
        "total_count": 100,
        "category_counts": {"Tech": 60, "Science": 40},
        "source_counts": {"src1": 70, "src2": 30},
        "anomaly_counts": {"A1": 15, "A2": 5},
        "last_updated": datetime(2024, 1, 15, tzinfo=timezone.utc),
    }
    mock_sqlite_store.get_stats.return_value = sqlite_stats

    # Act
    stats = await chroma_store_with_sqlite.get_stats()

    # Assert
    mock_sqlite_store.get_stats.assert_called_once_with(use_cache=True)
    # Per-category counts are expected under flat "category_<name>" keys.
    expected_counts = {
        "total_count": 100,
        "category_Tech": 60,
        "category_Science": 40,
    }
    for stat_key, expected in expected_counts.items():
        assert stats[stat_key] == expected
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_get_stats_fallback_when_no_sqlite_store(chroma_store, mock_collection):
    """Without a SQLite store, ``get_stats`` is computed from Chroma metadata."""
    # Arrange
    raw_metadatas = [
        {"category": "Tech"},
        {"category": "Tech"},
        {"category": "Science"},
    ]
    mock_collection.get.return_value = {"metadatas": raw_metadatas}

    # Act
    stats = await chroma_store.get_stats()

    # Assert
    expected_counts = {
        "total_count": 3,
        "category_Tech": 2,
        "category_Science": 1,
    }
    for stat_key, expected in expected_counts.items():
        assert stats[stat_key] == expected
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_dict_to_dto_handles_integer_timestamp():
    """``_dict_to_dto`` accepts an integer timestamp and keeps a list of anomalies."""
    # Arrange
    store = ChromaStore(client=MagicMock(), collection_name="test")
    raw_item = dict(
        title="Test",
        url="http://test.com",
        content_text="Content",
        source="Source",
        timestamp=1705312800,  # Unix timestamp as int
        relevance_score=7,
        summary_ru="Summary",
        category="Tech",
        anomalies_detected=["A1", "A2"],
    )

    # Act
    dto = store._dict_to_dto(raw_item)

    # Assert
    # NOTE(review): only year and month are checked - presumably to stay
    # independent of the local timezone; confirm against _dict_to_dto.
    converted = dto.timestamp
    assert converted.year == 2024
    assert converted.month == 1
    assert dto.anomalies_detected == ["A1", "A2"]
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_dict_to_dto_handles_string_timestamp():
    """``_dict_to_dto`` parses an ISO-formatted timestamp string."""
    # Arrange
    store = ChromaStore(client=MagicMock(), collection_name="test")
    raw_item = dict(
        title="Test",
        url="http://test.com",
        content_text="Content",
        source="Source",
        timestamp="2024-01-15T12:00:00",
        relevance_score=7,
        summary_ru="Summary",
        category="Tech",
        anomalies_detected=[],
    )

    # Act
    dto = store._dict_to_dto(raw_item)

    # Assert
    parsed = dto.timestamp
    assert parsed.year == 2024
    assert parsed.month == 1
    assert parsed.day == 15
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_dict_to_dto_handles_string_anomalies():
    """``_dict_to_dto`` splits a comma-joined anomaly string into a list."""
    # Arrange
    store = ChromaStore(client=MagicMock(), collection_name="test")
    raw_item = dict(
        title="Test",
        url="http://test.com",
        content_text="Content",
        source="Source",
        timestamp=datetime(2024, 1, 15, tzinfo=timezone.utc),
        relevance_score=7,
        summary_ru="Summary",
        category="Tech",
        anomalies_detected="A1,A2,A3",  # String instead of list
    )

    # Act
    dto = store._dict_to_dto(raw_item)

    # Assert
    expected_anomalies = ["A1", "A2", "A3"]
    assert dto.anomalies_detected == expected_anomalies
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_dict_to_dto_handles_empty_anomalies():
    """``_dict_to_dto`` turns a ``None`` anomalies value into an empty list."""
    # Arrange
    store = ChromaStore(client=MagicMock(), collection_name="test")
    raw_item = dict(
        title="Test",
        url="http://test.com",
        content_text="Content",
        source="Source",
        timestamp=datetime(2024, 1, 15, tzinfo=timezone.utc),
        relevance_score=7,
        summary_ru="Summary",
        category="Tech",
        anomalies_detected=None,
    )

    # Act
    dto = store._dict_to_dto(raw_item)

    # Assert
    assert dto.anomalies_detected == []
|