feat(database): SQLite shadow database for indexed queries

:Release Notes:
- Add ACID-compliant SQLiteStore (WAL mode, FULL sync, FK constraints)
- Add AnomalyType enum for normalized anomaly storage
- Add legacy data migration script (dry-run, batch, rollback)
- Update ChromaStore to delegate indexed queries to SQLite
- Add test suite for SQLiteStore (7 tests, all passing)

:Detailed Notes:
- SQLiteStore: news_items, anomaly_types, news_anomalies tables with indexes
- Performance: get_latest/get_top_ranked O(n)→O(log n), get_stats O(n)→O(1)
- ChromaDB remains primary vector store; SQLite provides indexed metadata queries

:Testing Performed:
- python3 -m pytest tests/ -v (112 passed)

:QA Notes:
- Tests verified by Python QA Engineer subagent

:Issues Addressed:
- get_latest/get_top_ranked fetched ALL items then sorted in Python
- get_stats iterated over ALL items
- anomalies_detected stored as comma-joined string (no index)

Change-Id: I708808b6e72889869afcf16d4ac274260242007a
This commit is contained in:
Artur Mukhamadiev 2026-03-30 13:00:08 +03:00
parent ef3faec7f8
commit f4ae73bdae
7 changed files with 3673 additions and 3 deletions

1709
docs/MIGRATION_PLAN.md Normal file

File diff suppressed because it is too large Load Diff

1011
scripts/migrate_legacy_data.py Executable file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,43 @@
from enum import Enum
class AnomalyType(str, Enum):
    """Closed set of technology-anomaly labels used for normalized storage.

    Inherits from ``str`` so members compare and serialize as plain strings.
    """

    WEBGPU = "WebGPU"
    NPU_ACCELERATION = "NPU acceleration"
    EDGE_AI = "Edge AI"
    QUANTUM_COMPUTING = "Quantum computing"
    NEUROMORPHIC = "Neuromorphic computing"
    SPATIAL_COMPUTING = "Spatial computing"
    UNKNOWN = "Unknown"

    @classmethod
    def from_string(cls, value: str) -> "AnomalyType":
        """Map a free-form string to a member.

        Matching order: exact case-insensitive match first, then a
        substring match (the input contained within a member's value).
        Anything unmatched maps to UNKNOWN.
        """
        normalized = value.strip().lower()
        # Bug fix: the empty string is a substring of every member value,
        # so an empty/whitespace-only input previously matched the first
        # member (WEBGPU) instead of UNKNOWN.
        if not normalized:
            return cls.UNKNOWN
        for member in cls:
            if member.value.lower() == normalized:
                return member
        for member in cls:
            if normalized in member.value.lower():
                return member
        return cls.UNKNOWN

    @property
    def description(self) -> str:
        """Human-readable description of this anomaly type."""
        descriptions = {
            "WebGPU": "WebGPU graphics API or GPU compute",
            "NPU acceleration": "Neural Processing Unit hardware",
            "Edge AI": "Edge computing with AI",
            "Quantum computing": "Quantum computing technology",
            "Neuromorphic computing": "Neuromorphic processor architecture",
            "Spatial computing": "Spatial computing and AR/VR",
            "Unknown": "Unrecognized anomaly type",
        }
        return descriptions.get(self.value, "")
def normalize_anomaly_list(anomalies: list[str]) -> list[AnomalyType]:
    """Map each raw anomaly string onto its AnomalyType member."""
    return list(map(AnomalyType.from_string, anomalies))
def is_valid_anomaly(value: str) -> bool:
    """Return True when *value* maps onto a known (non-UNKNOWN) anomaly type."""
    mapped = AnomalyType.from_string(value)
    return mapped is not AnomalyType.UNKNOWN

View File

@ -1,8 +1,12 @@
import uuid
import asyncio
import logging
from typing import TYPE_CHECKING, Dict
from typing import List, Optional, Mapping, Any
from datetime import datetime
if TYPE_CHECKING:
from src.storage.sqlite_store import SQLiteStore
from datetime import datetime, timezone
import chromadb
from chromadb.api import ClientAPI
@ -13,9 +17,15 @@ from src.processor.dto import EnrichedNewsItemDTO
logger = logging.getLogger(__name__)
class ChromaStore(IVectorStore):
def __init__(self, client: ClientAPI, collection_name: str = "news_collection"):
    def __init__(
        self,
        client: ClientAPI,
        collection_name: str = "news_collection",
        sqlite_store: Optional["SQLiteStore"] = None
    ):
        """Create the store on top of a ChromaDB client.

        Args:
            client: ChromaDB client API.
            collection_name: Collection to create or reuse.
            sqlite_store: Optional SQLite shadow store; when provided,
                the indexed queries (get_latest, get_top_ranked,
                get_stats) are delegated to it instead of scanning the
                Chroma collection.
        """
        self.client = client
        self.collection = self.client.get_or_create_collection(name=collection_name)
        self.sqlite_store = sqlite_store
async def store(self, item: EnrichedNewsItemDTO) -> None:
# Create a deterministic UUID based on the URL
@ -138,12 +148,45 @@ class ChromaStore(IVectorStore):
anomalies_detected=anomalies
)
def _dict_to_dto(self, item_dict: Dict[str, Any]) -> EnrichedNewsItemDTO:
"""Convert a dict (from SQLite row) back to EnrichedNewsItemDTO."""
anomalies = item_dict.get("anomalies_detected", []) or []
if isinstance(anomalies, str):
anomalies = [a.strip() for a in anomalies.split(",") if a.strip()]
timestamp = item_dict.get("timestamp")
if isinstance(timestamp, (int, float)):
timestamp = datetime.fromtimestamp(timestamp, tz=timezone.utc)
elif isinstance(timestamp, str):
timestamp = datetime.fromisoformat(timestamp)
else:
timestamp = datetime.now(timezone.utc)
return EnrichedNewsItemDTO(
title=str(item_dict.get("title", "")),
url=str(item_dict.get("url", "")),
content_text=str(item_dict.get("content_text", "")),
source=str(item_dict.get("source", "")),
timestamp=timestamp,
relevance_score=int(float(str(item_dict.get("relevance_score", 0)))),
summary_ru=str(item_dict.get("summary_ru", "")),
category=str(item_dict.get("category", "")),
anomalies_detected=anomalies if isinstance(anomalies, list) else []
)
async def exists(self, url: str) -> bool:
doc_id = str(uuid.uuid5(uuid.NAMESPACE_URL, url))
result = await asyncio.to_thread(self.collection.get, ids=[doc_id])
return len(result.get("ids", [])) > 0
async def get_stats(self) -> dict[str, int]:
if self.sqlite_store is not None:
stats = await self.sqlite_store.get_stats(use_cache=True)
return {
"total_count": stats["total_count"],
**{f"category_{k}": v for k, v in stats["category_counts"].items()}
}
results = await asyncio.to_thread(self.collection.get, include=["metadatas"])
metadatas = results.get("metadatas")
if metadatas is None:
@ -163,6 +206,10 @@ class ChromaStore(IVectorStore):
return stats
async def get_latest(self, limit: int = 10, category: Optional[str] = None) -> List[EnrichedNewsItemDTO]:
if self.sqlite_store is not None:
items_dict = await self.sqlite_store.get_latest(limit=limit, category=category)
return [self._dict_to_dto(item) for item in items_dict]
where: Any = {"category": category} if category else None
results = await asyncio.to_thread(
self.collection.get,
@ -186,6 +233,10 @@ class ChromaStore(IVectorStore):
return items[:limit]
async def get_top_ranked(self, limit: int = 10, category: Optional[str] = None) -> List[EnrichedNewsItemDTO]:
if self.sqlite_store is not None:
items_dict = await self.sqlite_store.get_top_ranked(limit=limit, category=category)
return [self._dict_to_dto(item) for item in items_dict]
where: Any = {"category": category} if category else None
results = await asyncio.to_thread(
self.collection.get,

305
src/storage/sqlite_store.py Normal file
View File

@ -0,0 +1,305 @@
import asyncio
import json
import sqlite3
import uuid
from contextlib import contextmanager
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

from src.processor.anomaly_types import AnomalyType, normalize_anomaly_list
from src.processor.dto import EnrichedNewsItemDTO
@dataclass
class StatsCache:
    """In-memory snapshot of aggregate statistics.

    Dict fields default to fresh, independent empty dicts via
    ``field(default_factory=dict)`` (the dataclass idiom). Explicitly
    passing ``None`` is still coerced to ``{}`` in ``__post_init__`` for
    backward compatibility with the previous Optional-based defaults.
    """
    total_count: int = 0
    category_counts: Optional[Dict[str, int]] = field(default_factory=dict)
    source_counts: Optional[Dict[str, int]] = field(default_factory=dict)
    anomaly_counts: Optional[Dict[str, int]] = field(default_factory=dict)
    last_updated: Optional[datetime] = None

    def __post_init__(self):
        # Backward compatibility: an explicit None still means "empty".
        if self.category_counts is None:
            self.category_counts = {}
        if self.source_counts is None:
            self.source_counts = {}
        if self.anomaly_counts is None:
            self.anomaly_counts = {}
class SQLiteStore:
    """ACID-compliant SQLite shadow store for indexed metadata queries.

    ChromaDB remains the primary vector store; this store mirrors news
    metadata into SQLite so that latest/top-ranked/stats queries hit
    B-tree indexes instead of scanning every item. Anomalies are
    normalized into an ``anomaly_types`` table plus a ``news_anomalies``
    junction table.

    NOTE(review): the public methods are ``async`` for interface symmetry
    with ChromaStore, but the SQLite calls themselves run synchronously
    on the event loop (no ``asyncio.to_thread``). Acceptable for a small
    local database — confirm and offload if query latency grows.
    """

    def __init__(self, db_path: Path):
        """Open (or create) the database at *db_path* and ensure the schema exists."""
        self.db_path = db_path
        self._init_schema()
        # Process-local aggregate cache; invalidated on every write.
        self._stats_cache: Optional[StatsCache] = None

    def _init_schema(self):
        """Idempotently create all tables and indexes.

        ``journal_mode=WAL`` is set only here because it is persistent
        (recorded in the database file). Per-connection PRAGMAs
        (foreign_keys, synchronous) are applied in ``_get_connection``.
        """
        with self._get_connection() as conn:
            conn.execute("PRAGMA journal_mode=WAL")
            conn.execute("""
                CREATE TABLE IF NOT EXISTS news_items (
                    id TEXT PRIMARY KEY,
                    title TEXT NOT NULL,
                    url TEXT UNIQUE NOT NULL,
                    source TEXT NOT NULL,
                    timestamp INTEGER NOT NULL,
                    relevance_score INTEGER NOT NULL CHECK (relevance_score >= 0 AND relevance_score <= 10),
                    summary_ru TEXT,
                    category TEXT NOT NULL DEFAULT '',
                    content_text TEXT,
                    created_at INTEGER DEFAULT (unixepoch())
                )
            """)
            # Indexes backing get_latest / get_top_ranked and the
            # category/source filters and aggregations.
            conn.execute("CREATE INDEX IF NOT EXISTS idx_news_timestamp ON news_items(timestamp DESC)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_news_relevance ON news_items(relevance_score DESC)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_news_category ON news_items(category)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_news_source ON news_items(source)")
            conn.execute("""
                CREATE TABLE IF NOT EXISTS anomaly_types (
                    id TEXT PRIMARY KEY,
                    name TEXT UNIQUE NOT NULL,
                    description TEXT
                )
            """)
            conn.execute("""
                CREATE TABLE IF NOT EXISTS news_anomalies (
                    news_id TEXT NOT NULL,
                    anomaly_id TEXT NOT NULL,
                    detected_at INTEGER DEFAULT (unixepoch()),
                    PRIMARY KEY (news_id, anomaly_id),
                    FOREIGN KEY (news_id) REFERENCES news_items(id) ON DELETE CASCADE,
                    FOREIGN KEY (anomaly_id) REFERENCES anomaly_types(id)
                )
            """)
            conn.execute("CREATE INDEX IF NOT EXISTS idx_news_anomalies_news ON news_anomalies(news_id)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_news_anomalies_anomaly ON news_anomalies(anomaly_id)")
            # Durable stats snapshot table. NOTE(review): nothing in this
            # module reads or writes it yet — presumably reserved for
            # persisting stats across restarts; confirm before removing.
            conn.execute("""
                CREATE TABLE IF NOT EXISTS stats_cache (
                    key TEXT PRIMARY KEY DEFAULT 'main',
                    total_count INTEGER DEFAULT 0,
                    category_counts TEXT DEFAULT '{}',
                    source_counts TEXT DEFAULT '{}',
                    anomaly_counts TEXT DEFAULT '{}',
                    last_updated INTEGER
                )
            """)
            conn.execute("""
                CREATE TABLE IF NOT EXISTS schema_version (
                    version INTEGER PRIMARY KEY,
                    applied_at INTEGER DEFAULT (unixepoch())
                )
            """)
            conn.commit()

    @contextmanager
    def _get_connection(self):
        """Yield a configured sqlite3 connection, always closing it on exit.

        Bug fix: ``foreign_keys`` and ``synchronous`` are per-connection
        PRAGMAs in SQLite. They were previously set only on the
        schema-initialization connection, so foreign-key constraints were
        silently NOT enforced on any later connection. They are now
        (re)applied to every connection this store opens.
        """
        conn = sqlite3.connect(self.db_path, timeout=30.0)
        conn.row_factory = sqlite3.Row
        conn.execute("PRAGMA foreign_keys=ON")
        conn.execute("PRAGMA synchronous=FULL")
        try:
            yield conn
        finally:
            conn.close()

    def _row_to_dict(self, row: sqlite3.Row) -> Dict[str, Any]:
        """Materialize a sqlite3.Row as a plain dict."""
        return dict(row)

    async def store_with_anomalies(self, item: EnrichedNewsItemDTO, anomaly_types: List[AnomalyType]) -> str:
        """Atomically upsert *item* plus its anomaly junction records.

        Returns the deterministic document id (UUIDv5 of the URL), the
        same id scheme ChromaStore uses, so the two stores stay aligned.

        Note: with foreign keys enforced, the ``INSERT OR REPLACE`` on
        news_items cascades deletion of any previous junction rows for
        this item, so re-storing refreshes its anomaly links.
        """
        doc_id = str(uuid.uuid5(uuid.NAMESPACE_URL, item.url))
        timestamp_int = int(item.timestamp.timestamp())
        with self._get_connection() as conn:
            try:
                # BEGIN IMMEDIATE takes the write lock up front so the
                # item and all junction rows commit as one transaction.
                conn.execute("BEGIN IMMEDIATE")
                conn.execute("""
                    INSERT OR REPLACE INTO news_items
                    (id, title, url, source, timestamp, relevance_score, summary_ru, category, content_text)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
                """, (
                    doc_id,
                    item.title,
                    item.url,
                    item.source,
                    timestamp_int,
                    item.relevance_score,
                    item.summary_ru,
                    item.category,
                    item.content_text
                ))
                for anomaly in anomaly_types:
                    # Anomaly ids are deterministic UUIDv5s of the value.
                    anomaly_id = str(uuid.uuid5(uuid.NAMESPACE_URL, anomaly.value))
                    conn.execute("""
                        INSERT OR IGNORE INTO anomaly_types (id, name, description)
                        VALUES (?, ?, ?)
                    """, (anomaly_id, anomaly.value, anomaly.description))
                    conn.execute("""
                        INSERT OR IGNORE INTO news_anomalies (news_id, anomaly_id)
                        VALUES (?, ?)
                    """, (doc_id, anomaly_id))
                conn.execute("COMMIT")
            except Exception:
                conn.execute("ROLLBACK")
                raise
        self._invalidate_cache()
        return doc_id

    async def get_by_id(self, item_id: str) -> Optional[Dict[str, Any]]:
        """Return one item (with anomaly names) as a dict, or None."""
        with self._get_connection() as conn:
            row = conn.execute("""
                SELECT * FROM news_items WHERE id = ?
            """, (item_id,)).fetchone()
            if not row:
                return None
            result = self._row_to_dict(row)
            result["timestamp"] = datetime.fromtimestamp(result["timestamp"], tz=timezone.utc)
            anomaly_rows = conn.execute("""
                SELECT a.name FROM news_anomalies na
                JOIN anomaly_types a ON na.anomaly_id = a.id
                WHERE na.news_id = ?
            """, (item_id,)).fetchall()
            result["anomalies_detected"] = [r["name"] for r in anomaly_rows]
            return result

    async def exists(self, url: str) -> bool:
        """True when an item derived from *url* is stored (primary-key lookup)."""
        doc_id = str(uuid.uuid5(uuid.NAMESPACE_URL, url))
        with self._get_connection() as conn:
            row = conn.execute("SELECT 1 FROM news_items WHERE id = ?", (doc_id,)).fetchone()
            return row is not None

    async def get_latest(self, limit: int = 10, category: Optional[str] = None, offset: int = 0) -> List[Dict[str, Any]]:
        """Newest items first (indexed on timestamp), with optional category filter."""
        with self._get_connection() as conn:
            if category:
                rows = conn.execute("""
                    SELECT * FROM news_items
                    WHERE category = ?
                    ORDER BY timestamp DESC
                    LIMIT ? OFFSET ?
                """, (category, limit, offset)).fetchall()
            else:
                rows = conn.execute("""
                    SELECT * FROM news_items
                    ORDER BY timestamp DESC
                    LIMIT ? OFFSET ?
                """, (limit, offset)).fetchall()
            results = []
            for row in rows:
                item = self._row_to_dict(row)
                item["timestamp"] = datetime.fromtimestamp(item["timestamp"], tz=timezone.utc)
                results.append(item)
            return results

    async def get_top_ranked(self, limit: int = 10, category: Optional[str] = None, offset: int = 0) -> List[Dict[str, Any]]:
        """Highest relevance first (timestamp as tie-breaker), optional category filter."""
        with self._get_connection() as conn:
            if category:
                rows = conn.execute("""
                    SELECT * FROM news_items
                    WHERE category = ?
                    ORDER BY relevance_score DESC, timestamp DESC
                    LIMIT ? OFFSET ?
                """, (category, limit, offset)).fetchall()
            else:
                rows = conn.execute("""
                    SELECT * FROM news_items
                    ORDER BY relevance_score DESC, timestamp DESC
                    LIMIT ? OFFSET ?
                """, (limit, offset)).fetchall()
            results = []
            for row in rows:
                item = self._row_to_dict(row)
                item["timestamp"] = datetime.fromtimestamp(item["timestamp"], tz=timezone.utc)
                results.append(item)
            return results

    async def get_stats(self, use_cache: bool = True) -> Dict[str, Any]:
        """Aggregate counts (total, per-category, per-source, per-anomaly).

        With ``use_cache=True`` a previously computed in-memory snapshot
        is returned if available; the snapshot is invalidated on every
        ``store_with_anomalies`` call.
        """
        if use_cache and self._stats_cache is not None:
            return {
                "total_count": self._stats_cache.total_count,
                "category_counts": self._stats_cache.category_counts,
                "source_counts": self._stats_cache.source_counts,
                "anomaly_counts": self._stats_cache.anomaly_counts,
                "last_updated": self._stats_cache.last_updated
            }
        with self._get_connection() as conn:
            total = conn.execute("SELECT COUNT(*) FROM news_items").fetchone()[0]
            category_rows = conn.execute("""
                SELECT category, COUNT(*) as count FROM news_items GROUP BY category
            """).fetchall()
            category_counts = {r["category"]: r["count"] for r in category_rows}
            source_rows = conn.execute("""
                SELECT source, COUNT(*) as count FROM news_items GROUP BY source
            """).fetchall()
            source_counts = {r["source"]: r["count"] for r in source_rows}
            anomaly_rows = conn.execute("""
                SELECT a.name, COUNT(*) as count
                FROM news_anomalies na
                JOIN anomaly_types a ON na.anomaly_id = a.id
                GROUP BY a.name
            """).fetchall()
            anomaly_counts = {r["name"]: r["count"] for r in anomaly_rows}
        result = {
            "total_count": total,
            "category_counts": category_counts,
            "source_counts": source_counts,
            "anomaly_counts": anomaly_counts,
            "last_updated": datetime.now(timezone.utc)
        }
        self._stats_cache = StatsCache(
            total_count=total,
            category_counts=category_counts,
            source_counts=source_counts,
            anomaly_counts=anomaly_counts,
            last_updated=datetime.now(timezone.utc)
        )
        return result

    def _invalidate_cache(self):
        """Drop the in-memory stats snapshot (called after every write)."""
        self._stats_cache = None

    async def invalidate_cache(self):
        """Async-facing wrapper around ``_invalidate_cache``."""
        self._invalidate_cache()

    async def count_all(self) -> int:
        """Total number of stored news items."""
        with self._get_connection() as conn:
            return conn.execute("SELECT COUNT(*) FROM news_items").fetchone()[0]

    async def get_all_items_for_migration(self, batch_size: int = 100) -> List[Dict[str, Any]]:
        """Return every stored item, with anomaly names, for migration scripts.

        ``batch_size`` is kept for interface compatibility; all rows are
        currently returned in a single list.
        """
        with self._get_connection() as conn:
            rows = conn.execute("""
                SELECT * FROM news_items ORDER BY created_at
            """).fetchall()
            # One JOIN for all anomaly links instead of a query per item
            # (removes the N+1 pattern of the previous implementation).
            anomaly_rows = conn.execute("""
                SELECT na.news_id, a.name
                FROM news_anomalies na
                JOIN anomaly_types a ON na.anomaly_id = a.id
            """).fetchall()
            anomalies_by_news: Dict[str, List[str]] = {}
            for r in anomaly_rows:
                anomalies_by_news.setdefault(r["news_id"], []).append(r["name"])
            results = []
            for row in rows:
                item = self._row_to_dict(row)
                item["timestamp"] = datetime.fromtimestamp(item["timestamp"], tz=timezone.utc)
                item["anomalies_detected"] = anomalies_by_news.get(item["id"], [])
                results.append(item)
            return results

View File

@ -2,7 +2,7 @@ import pytest
import asyncio
import uuid
from datetime import datetime, timezone
from unittest.mock import MagicMock, patch
from unittest.mock import MagicMock, patch, AsyncMock
from typing import Dict, Any
from src.processor.dto import EnrichedNewsItemDTO
@ -280,3 +280,248 @@ async def test_search_empty_query(chroma_store, mock_collection):
n_results=5,
where=None
)
# =============================================================================
# Tests for SQLiteStore integration
# =============================================================================
@pytest.fixture
def mock_sqlite_store():
    """An AsyncMock standing in for SQLiteStore."""
    store = AsyncMock()
    return store
@pytest.fixture
def chroma_store_with_sqlite(mock_client, mock_collection, mock_sqlite_store):
    """ChromaStore wired to a mocked client and the mocked SQLite store."""
    mock_client.get_or_create_collection.return_value = mock_collection
    store = ChromaStore(
        client=mock_client,
        collection_name="test_collection",
        sqlite_store=mock_sqlite_store,
    )
    return store
@pytest.mark.asyncio
async def test_get_latest_delegates_to_sqlite_store(chroma_store_with_sqlite, mock_sqlite_store):
    """get_latest must delegate to SQLiteStore and convert row dicts to DTOs."""
    # Arrange
    ts = datetime(2024, 1, 15, tzinfo=timezone.utc)
    mock_sqlite_store.get_latest.return_value = [
        {"title": "Latest1", "url": "u1", "content_text": "c1", "source": "s1",
         "timestamp": ts, "relevance_score": 8, "summary_ru": "sum1",
         "category": "Tech", "anomalies_detected": ["A1"]},
        {"title": "Latest2", "url": "u2", "content_text": "c2", "source": "s2",
         "timestamp": ts, "relevance_score": 7, "summary_ru": "sum2",
         "category": "Tech", "anomalies_detected": []},
    ]
    # Act
    results = await chroma_store_with_sqlite.get_latest(limit=10, category="Tech")
    # Assert: the exact limit/category arguments were passed through.
    mock_sqlite_store.get_latest.assert_called_once_with(limit=10, category="Tech")
    assert len(results) == 2
    assert results[0].title == "Latest1"
    assert results[0].relevance_score == 8
    assert results[0].anomalies_detected == ["A1"]
    assert results[1].title == "Latest2"
    assert results[1].anomalies_detected == []
@pytest.mark.asyncio
async def test_get_latest_fallback_when_no_sqlite_store(chroma_store, mock_collection):
    """Without a SQLite store, get_latest falls back to the Chroma collection."""
    # Arrange
    mock_collection.get.return_value = {
        "metadatas": [
            {"title": "Chroma Latest", "timestamp": "2024-01-01T00:00:00", "url": "u1",
             "relevance_score": 5, "source": "src", "category": "Tech"},
        ],
        "documents": ["content"]
    }
    # Act
    results = await chroma_store.get_latest(limit=10)
    # Assert
    assert len(results) == 1
    assert results[0].title == "Chroma Latest"
@pytest.mark.asyncio
async def test_get_top_ranked_delegates_to_sqlite_store(chroma_store_with_sqlite, mock_sqlite_store):
    """get_top_ranked must delegate to SQLiteStore and convert dicts to DTOs."""
    # Arrange
    ts = datetime(2024, 1, 15, tzinfo=timezone.utc)
    mock_sqlite_store.get_top_ranked.return_value = [
        {"title": "Top1", "url": "u1", "content_text": "c1", "source": "s1",
         "timestamp": ts, "relevance_score": 10, "summary_ru": "sum1",
         "category": "Tech", "anomalies_detected": []},
        {"title": "Top2", "url": "u2", "content_text": "c2", "source": "s2",
         "timestamp": ts, "relevance_score": 9, "summary_ru": "sum2",
         "category": "Tech", "anomalies_detected": ["A2"]},
    ]
    # Act
    results = await chroma_store_with_sqlite.get_top_ranked(limit=5)
    # Assert: category defaults to None and is forwarded explicitly.
    mock_sqlite_store.get_top_ranked.assert_called_once_with(limit=5, category=None)
    assert len(results) == 2
    assert results[0].title == "Top1"
    assert results[0].relevance_score == 10
    assert results[1].title == "Top2"
    assert results[1].anomalies_detected == ["A2"]
@pytest.mark.asyncio
async def test_get_top_ranked_fallback_when_no_sqlite_store(chroma_store, mock_collection):
    """Without a SQLite store, get_top_ranked falls back to the Chroma collection."""
    # Arrange
    mock_collection.get.return_value = {
        "metadatas": [
            {"title": "Chroma Top", "timestamp": "2024-01-01T00:00:00", "url": "u1",
             "relevance_score": 10, "source": "src", "category": "Tech"},
        ],
        "documents": ["content"]
    }
    # Act
    results = await chroma_store.get_top_ranked(limit=10)
    # Assert
    assert len(results) == 1
    assert results[0].title == "Chroma Top"
@pytest.mark.asyncio
async def test_get_stats_delegates_to_sqlite_store(chroma_store_with_sqlite, mock_sqlite_store):
    """get_stats must delegate to SQLiteStore and flatten category counts.

    ChromaStore flattens ``category_counts`` into ``category_<name>``
    keys alongside ``total_count``.
    """
    # Arrange
    mock_sqlite_store.get_stats.return_value = {
        "total_count": 100,
        "category_counts": {"Tech": 60, "Science": 40},
        "source_counts": {"src1": 70, "src2": 30},
        "anomaly_counts": {"A1": 15, "A2": 5},
        "last_updated": datetime(2024, 1, 15, tzinfo=timezone.utc)
    }
    # Act
    stats = await chroma_store_with_sqlite.get_stats()
    # Assert
    mock_sqlite_store.get_stats.assert_called_once_with(use_cache=True)
    assert stats["total_count"] == 100
    assert stats["category_Tech"] == 60
    assert stats["category_Science"] == 40
@pytest.mark.asyncio
async def test_get_stats_fallback_when_no_sqlite_store(chroma_store, mock_collection):
    """Without a SQLite store, get_stats aggregates over Chroma metadatas."""
    # Arrange
    mock_collection.get.return_value = {
        "metadatas": [
            {"category": "Tech"},
            {"category": "Tech"},
            {"category": "Science"},
        ]
    }
    # Act
    stats = await chroma_store.get_stats()
    # Assert
    assert stats["total_count"] == 3
    assert stats["category_Tech"] == 2
    assert stats["category_Science"] == 1
@pytest.mark.asyncio
async def test_dict_to_dto_handles_integer_timestamp():
    """_dict_to_dto converts a unix-epoch int into a datetime."""
    store = ChromaStore(client=MagicMock(), collection_name="test")
    payload = {
        "title": "Test",
        "url": "http://test.com",
        "content_text": "Content",
        "source": "Source",
        "timestamp": 1705312800,  # 2024-01-15 as unix seconds
        "relevance_score": 7,
        "summary_ru": "Summary",
        "category": "Tech",
        "anomalies_detected": ["A1", "A2"],
    }
    dto = store._dict_to_dto(payload)
    assert (dto.timestamp.year, dto.timestamp.month) == (2024, 1)
    assert dto.anomalies_detected == ["A1", "A2"]
@pytest.mark.asyncio
async def test_dict_to_dto_handles_string_timestamp():
    """_dict_to_dto parses an ISO-8601 timestamp string."""
    store = ChromaStore(client=MagicMock(), collection_name="test")
    payload = {
        "title": "Test",
        "url": "http://test.com",
        "content_text": "Content",
        "source": "Source",
        "timestamp": "2024-01-15T12:00:00",
        "relevance_score": 7,
        "summary_ru": "Summary",
        "category": "Tech",
        "anomalies_detected": [],
    }
    dto = store._dict_to_dto(payload)
    assert (dto.timestamp.year, dto.timestamp.month, dto.timestamp.day) == (2024, 1, 15)
@pytest.mark.asyncio
async def test_dict_to_dto_handles_string_anomalies():
    """A legacy comma-joined anomaly string is split into a list."""
    store = ChromaStore(client=MagicMock(), collection_name="test")
    payload = {
        "title": "Test",
        "url": "http://test.com",
        "content_text": "Content",
        "source": "Source",
        "timestamp": datetime(2024, 1, 15, tzinfo=timezone.utc),
        "relevance_score": 7,
        "summary_ru": "Summary",
        "category": "Tech",
        "anomalies_detected": "A1,A2,A3",  # string instead of a list
    }
    dto = store._dict_to_dto(payload)
    assert dto.anomalies_detected == ["A1", "A2", "A3"]
@pytest.mark.asyncio
async def test_dict_to_dto_handles_empty_anomalies():
    """A None anomalies field is normalized to an empty list."""
    store = ChromaStore(client=MagicMock(), collection_name="test")
    payload = {
        "title": "Test",
        "url": "http://test.com",
        "content_text": "Content",
        "source": "Source",
        "timestamp": datetime(2024, 1, 15, tzinfo=timezone.utc),
        "relevance_score": 7,
        "summary_ru": "Summary",
        "category": "Tech",
        "anomalies_detected": None,
    }
    dto = store._dict_to_dto(payload)
    assert dto.anomalies_detected == []

View File

@ -0,0 +1,306 @@
import pytest
import sqlite3
import uuid
import asyncio
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Dict, Any
from contextlib import contextmanager
from unittest.mock import patch, MagicMock
from src.storage.sqlite_store import SQLiteStore, StatsCache
from src.processor.dto import EnrichedNewsItemDTO
from src.processor.anomaly_types import AnomalyType
import tempfile
import os
@pytest.fixture
def sqlite_store():
    """SQLiteStore backed by a throwaway on-disk database.

    A real temp file (not :memory:) is used so multiple connections can
    see the same data; the file is removed after each test.
    """
    handle, raw_path = tempfile.mkstemp()
    os.close(handle)
    db_path = Path(raw_path)
    yield SQLiteStore(db_path)
    if db_path.exists():
        try:
            os.remove(db_path)
        except PermissionError:
            pass
@pytest.mark.asyncio
async def test_store_with_anomalies(sqlite_store):
    """1. Test store_with_anomalies - verify atomic writes with anomaly junction records"""
    # Minimal valid DTO carrying one known anomaly.
    item = EnrichedNewsItemDTO(
        title="Test News Title",
        url="https://example.com/test-news-1",
        content_text="Sample content for the news item.",
        source="Test Source",
        timestamp=datetime.now(timezone.utc),
        relevance_score=8,
        summary_ru="Тестовая сводка",
        category="Technology",
        anomalies_detected=["WebGPU"]
    )
    anomalies = [AnomalyType.WEBGPU]
    doc_id = await sqlite_store.store_with_anomalies(item, anomalies)
    assert doc_id is not None
    # Verify records in database
    with sqlite_store._get_connection() as conn:
        news_row = conn.execute("SELECT * FROM news_items WHERE id = ?", (doc_id,)).fetchone()
        assert news_row is not None
        assert news_row["title"] == "Test News Title"
        assert news_row["category"] == "Technology"
        # Check anomaly_types table
        anomaly_type_row = conn.execute(
            "SELECT * FROM anomaly_types WHERE name = ?", (AnomalyType.WEBGPU.value,)
        ).fetchone()
        assert anomaly_type_row is not None
        assert anomaly_type_row["name"] == "WebGPU"
        # Check news_anomalies junction table
        junction_row = conn.execute(
            "SELECT * FROM news_anomalies WHERE news_id = ? AND anomaly_id = ?",
            (doc_id, anomaly_type_row["id"])
        ).fetchone()
        assert junction_row is not None
@pytest.mark.asyncio
async def test_get_by_id(sqlite_store):
    """2. Test get_by_id - verify proper reconstruction of DTO with anomalies"""
    item_timestamp = datetime(2023, 1, 1, 12, 0, 0, tzinfo=timezone.utc)
    item = EnrichedNewsItemDTO(
        title="DTO Reconstruction Test",
        url="https://example.com/reconstruct",
        content_text="Detailed content here.",
        source="Source B",
        timestamp=item_timestamp,
        relevance_score=7,
        summary_ru="Сводка для теста",
        category="Science",
        anomalies_detected=["WebGPU", "Edge AI"]
    )
    anomalies = [AnomalyType.WEBGPU, AnomalyType.EDGE_AI]
    doc_id = await sqlite_store.store_with_anomalies(item, anomalies)
    result = await sqlite_store.get_by_id(doc_id)
    assert result is not None
    assert result["id"] == doc_id
    assert result["title"] == item.title
    assert result["url"] == item.url
    assert result["relevance_score"] == item.relevance_score
    assert result["category"] == item.category
    # Anomaly names come via the junction/types JOIN; order is unspecified.
    assert "WebGPU" in result["anomalies_detected"]
    assert "Edge AI" in result["anomalies_detected"]
    assert len(result["anomalies_detected"]) == 2
    # Check timestamp reconstruction
    # result["timestamp"] is expected to be a datetime object
    assert result["timestamp"].replace(tzinfo=timezone.utc) == item_timestamp
@pytest.mark.asyncio
async def test_exists(sqlite_store):
    """3. Test exists - verify URL-based existence check"""
    url = "https://example.com/exists-test"
    # Unknown before anything is written.
    assert await sqlite_store.exists(url) is False
    item = EnrichedNewsItemDTO(
        title="Exists Test",
        url=url,
        content_text="...",
        source="Source",
        timestamp=datetime.now(timezone.utc),
        relevance_score=1,
        summary_ru="...",
        category="Cat",
        anomalies_detected=[]
    )
    await sqlite_store.store_with_anomalies(item, [])
    # Known once stored.
    assert await sqlite_store.exists(url) is True
    # A different URL is still unknown.
    assert await sqlite_store.exists("https://example.com/does-not-exist") is False
@pytest.mark.asyncio
async def test_get_latest(sqlite_store):
    """4. Test get_latest - verify indexed timestamp ordering with pagination"""
    base_time = datetime(2023, 1, 1, tzinfo=timezone.utc).timestamp()
    # Store 5 items with increasing timestamps
    for i in range(5):
        item = EnrichedNewsItemDTO(
            title=f"News Item {i}",
            url=f"https://example.com/news-{i}",
            content_text="...",
            source="Source",
            timestamp=datetime.fromtimestamp(base_time + i, tz=timezone.utc),
            relevance_score=5,
            summary_ru="...",
            category="Tech" if i % 2 == 0 else "Science",
            anomalies_detected=[]
        )
        await sqlite_store.store_with_anomalies(item, [])
    # Latest items (descending timestamp)
    latest = await sqlite_store.get_latest(limit=10)
    assert len(latest) == 5
    assert latest[0]["title"] == "News Item 4"
    assert latest[-1]["title"] == "News Item 0"
    # Pagination: limit 2, offset 1
    paged = await sqlite_store.get_latest(limit=2, offset=1)
    assert len(paged) == 2
    assert paged[0]["title"] == "News Item 3"
    assert paged[1]["title"] == "News Item 2"
    # Category filter (odd indices above were stored as "Science")
    science_only = await sqlite_store.get_latest(category="Science")
    assert len(science_only) == 2
    for item in science_only:
        assert item["category"] == "Science"
@pytest.mark.asyncio
async def test_get_top_ranked(sqlite_store):
    """5. Test get_top_ranked - verify indexed relevance ordering with pagination"""
    for i in range(5):
        item = EnrichedNewsItemDTO(
            title=f"Ranked Item {i}",
            url=f"https://example.com/ranked-{i}",
            content_text="...",
            source="Source",
            timestamp=datetime.now(timezone.utc),
            relevance_score=i * 2,  # Scores: 0, 2, 4, 6, 8
            summary_ru="...",
            category="Tech",
            anomalies_detected=[]
        )
        await sqlite_store.store_with_anomalies(item, [])
    # Top ranked items (descending score)
    top = await sqlite_store.get_top_ranked(limit=10)
    assert len(top) == 5
    assert top[0]["relevance_score"] == 8
    assert top[-1]["relevance_score"] == 0
    # Pagination: limit 2, offset 1
    paged = await sqlite_store.get_top_ranked(limit=2, offset=1)
    assert len(paged) == 2
    assert paged[0]["relevance_score"] == 6
    assert paged[1]["relevance_score"] == 4
@pytest.mark.asyncio
async def test_get_stats(sqlite_store):
    """6. Test get_stats - verify correct aggregation with cache"""
    item1 = EnrichedNewsItemDTO(
        title="N1", url="u1", content_text="C1", source="S1",
        timestamp=datetime.now(timezone.utc), relevance_score=5,
        summary_ru="S1", category="CatA", anomalies_detected=["WebGPU"]
    )
    item2 = EnrichedNewsItemDTO(
        title="N2", url="u2", content_text="C2", source="S2",
        timestamp=datetime.now(timezone.utc), relevance_score=5,
        summary_ru="S2", category="CatA", anomalies_detected=["WebGPU", "Edge AI"]
    )
    await sqlite_store.store_with_anomalies(item1, [AnomalyType.WEBGPU])
    await sqlite_store.store_with_anomalies(item2, [AnomalyType.WEBGPU, AnomalyType.EDGE_AI])
    # Initial stats (bypass cache to read directly from the database)
    stats = await sqlite_store.get_stats(use_cache=False)
    assert stats["total_count"] == 2
    assert stats["category_counts"]["CatA"] == 2
    assert stats["source_counts"]["S1"] == 1
    assert stats["source_counts"]["S2"] == 1
    assert stats["anomaly_counts"]["WebGPU"] == 2
    assert stats["anomaly_counts"]["Edge AI"] == 1
    # Verify cache works: manually add item to DB without invalidating cache
    with sqlite_store._get_connection() as conn:
        conn.execute(
            "INSERT INTO news_items (id, title, url, source, timestamp, relevance_score, category) VALUES (?,?,?,?,?,?,?)",
            ("cached-id", "cached", "cached-url", "cached-source", 0, 0, "CatB")
        )
        conn.commit()
    # Should still return cached stats if use_cache=True
    cached_stats = await sqlite_store.get_stats(use_cache=True)
    assert cached_stats["total_count"] == 2
    assert "CatB" not in cached_stats["category_counts"]
    # Should return new stats if use_cache=False
    new_stats = await sqlite_store.get_stats(use_cache=False)
    assert new_stats["total_count"] == 3
    assert new_stats["category_counts"]["CatB"] == 1
@pytest.mark.asyncio
async def test_acid_compliance(sqlite_store):
    """7. Test ACID compliance - verify transactions rollback on error"""
    item = EnrichedNewsItemDTO(
        title="Atomic Test",
        url="https://example.com/atomic",
        content_text="Should not be saved",
        source="Source",
        timestamp=datetime.now(timezone.utc),
        relevance_score=5,
        summary_ru="Summary",
        category="Tech",
        anomalies_detected=["WebGPU"]
    )
    # Use a mock to simulate an error mid-transaction
    # We want it to fail after news_items is inserted but before anomalies are finished
    original_get_conn = sqlite_store._get_connection

    @contextmanager
    def mocked_get_connection():
        with original_get_conn() as conn:
            # We can't patch conn.execute directly, so we'll wrap the connection
            # but still allow it to behave like a connection for the rest.
            # Since SQLiteStore uses conn.execute directly on the yielded connection,
            # we can return a proxy object.
            class ConnProxy:
                def __init__(self, real_conn):
                    self.real_conn = real_conn

                def execute(self, sql, *args):
                    # Fail on any statement touching the anomaly tables,
                    # i.e. after the news_items INSERT has succeeded.
                    if "anomaly_types" in sql or "news_anomalies" in sql:
                        raise sqlite3.Error("Simulated database failure")
                    return self.real_conn.execute(sql, *args)

                def commit(self): return self.real_conn.commit()
                def rollback(self): return self.real_conn.rollback()
                def __getattr__(self, name): return getattr(self.real_conn, name)
                # The connection is used in context blocks
                def __enter__(self): return self
                def __exit__(self, *args): pass
            yield ConnProxy(conn)

    # Patch the _get_connection method of the store
    with patch.object(sqlite_store, '_get_connection', side_effect=mocked_get_connection):
        with pytest.raises(sqlite3.Error, match="Simulated database failure"):
            await sqlite_store.store_with_anomalies(item, [AnomalyType.WEBGPU])
    # If ACID works, the news_items table should still be empty
    assert await sqlite_store.count_all() == 0
    with sqlite_store._get_connection() as conn:
        count = conn.execute("SELECT COUNT(*) FROM news_items").fetchone()[0]
        assert count == 0