From b232298c6143a1ad3695db1f5a8faf2fb79024d5 Mon Sep 17 00:00:00 2001
From: Felipe Cardoso <felipe.cardoso@hotmail.it>
Date: Mon, 5 Jan 2026 15:38:52 +0100
Subject: [PATCH] feat(memory): add memory consolidation task and switch
 `source_episode_ids` to JSON

- Added `memory_consolidation` to the expected task modules and the `__all__` assertion in the Celery config tests.
- Updated `source_episode_ids` in the `Fact` model to use the generic JSON type for cross-database compatibility, and dropped the explicit `ix_facts_subject` index as redundant.
- Revised the `0005_add_memory_system_tables` migration to use JSONB instead of ARRAY, changing the server default from `{}` to `[]`.
- Raised test concurrency for the `test-cov` Makefile target from `-n 16` to `-n 20` for improved test performance.
---
 backend/Makefile                                  |  2 +-
 .../versions/0005_add_memory_system_tables.py     |  5 +++--
 backend/app/models/memory/fact.py                 | 15 +++++----------
 backend/tests/tasks/test_celery_config.py         | 10 +++++++++-
 4 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/backend/Makefile b/backend/Makefile
index 87681fe..079604c 100644
--- a/backend/Makefile
+++ b/backend/Makefile
@@ -80,7 +80,7 @@ test:
 
 test-cov:
 	@echo "🧪 Running tests with coverage..."
-	@IS_TEST=True PYTHONPATH=. uv run pytest --cov=app --cov-report=term-missing --cov-report=html -n 16
+	@IS_TEST=True PYTHONPATH=. uv run pytest --cov=app --cov-report=term-missing --cov-report=html -n 20
 	@echo "📊 Coverage report generated in htmlcov/index.html"
 
 # ============================================================================
diff --git a/backend/app/alembic/versions/0005_add_memory_system_tables.py b/backend/app/alembic/versions/0005_add_memory_system_tables.py
index e4cc106..ee0f7ed 100644
--- a/backend/app/alembic/versions/0005_add_memory_system_tables.py
+++ b/backend/app/alembic/versions/0005_add_memory_system_tables.py
@@ -247,11 +247,12 @@ def upgrade() -> None:
         sa.Column("predicate", sa.String(255), nullable=False),
         sa.Column("object", sa.Text(), nullable=False),
         sa.Column("confidence", sa.Float(), nullable=False, server_default="0.8"),
+        # Source episode IDs stored as a JSONB array of UUID strings (the ORM model maps this column with the generic JSON type)
         sa.Column(
             "source_episode_ids",
-            postgresql.ARRAY(postgresql.UUID(as_uuid=True)),
+            postgresql.JSONB(astext_type=sa.Text()),
             nullable=False,
-            server_default="{}",
+            server_default="[]",
         ),
         sa.Column("first_learned", sa.DateTime(timezone=True), nullable=False),
         sa.Column("last_reinforced", sa.DateTime(timezone=True), nullable=False),
diff --git a/backend/app/models/memory/fact.py b/backend/app/models/memory/fact.py
index 81b7045..59aeb55 100644
--- a/backend/app/models/memory/fact.py
+++ b/backend/app/models/memory/fact.py
@@ -18,11 +18,9 @@ from sqlalchemy import (
     Text,
     text,
 )
-from sqlalchemy.dialects.postgresql import (
-    ARRAY,
-    UUID as PGUUID,
-)
+from sqlalchemy.dialects.postgresql import UUID as PGUUID
 from sqlalchemy.orm import relationship
+from sqlalchemy.types import JSON
 
 from app.models.base import Base, TimestampMixin, UUIDMixin
 
@@ -63,10 +61,8 @@ class Fact(Base, UUIDMixin, TimestampMixin):
     # Confidence score (0.0 to 1.0)
     confidence = Column(Float, nullable=False, default=0.8, index=True)
 
-    # Source tracking: which episodes contributed to this fact
-    source_episode_ids: Column[list] = Column(
-        ARRAY(PGUUID(as_uuid=True)), default=list, nullable=False
-    )
+    # Source tracking: which episodes contributed to this fact (stored as JSON array of UUID strings)
+    source_episode_ids: Column[list] = Column(JSON, default=list, nullable=False)
 
     # Learning history
     first_learned = Column(DateTime(timezone=True), nullable=False)
@@ -94,8 +90,7 @@ class Fact(Base, UUIDMixin, TimestampMixin):
         Index("ix_facts_subject_predicate", "subject", "predicate"),
         Index("ix_facts_project_subject", "project_id", "subject"),
         Index("ix_facts_confidence_time", "confidence", "last_reinforced"),
-        # For finding facts by entity (subject or object)
-        Index("ix_facts_subject", "subject"),
+        # Note: `subject` already has index=True on its Column definition, so no explicit index is needed here
         # Data integrity constraints
         CheckConstraint(
             "confidence >= 0.0 AND confidence <= 1.0",
diff --git a/backend/tests/tasks/test_celery_config.py b/backend/tests/tasks/test_celery_config.py
index 11fd579..abe1bd9 100644
--- a/backend/tests/tasks/test_celery_config.py
+++ b/backend/tests/tasks/test_celery_config.py
@@ -304,10 +304,18 @@ class TestTaskModuleExports:
         assert hasattr(tasks, "sync")
         assert hasattr(tasks, "workflow")
         assert hasattr(tasks, "cost")
+        assert hasattr(tasks, "memory_consolidation")
 
     def test_tasks_all_attribute_is_correct(self):
         """Test that __all__ contains all expected module names."""
         from app import tasks
 
-        expected_modules = ["agent", "git", "sync", "workflow", "cost"]
+        expected_modules = [
+            "agent",
+            "git",
+            "sync",
+            "workflow",
+            "cost",
+            "memory_consolidation",
+        ]
         assert set(tasks.__all__) == set(expected_modules)