# app/services/memory/semantic/extraction.py
"""
Fact Extraction from Episodes.

Provides utilities for extracting semantic facts (subject-predicate-object
triples) from episodic memories and other text sources.
"""

import logging
import re
from dataclasses import dataclass, field
from typing import Any, ClassVar

from app.services.memory.types import Episode, FactCreate, Outcome

logger = logging.getLogger(__name__)


@dataclass
class ExtractionContext:
    """Context for fact extraction."""

    # NOTE: project_id and source_episode_id are carried on the context but
    # are not read by the extractor in this module.
    project_id: Any | None = None
    source_episode_id: Any | None = None
    # Facts with confidence strictly below this value are dropped.
    min_confidence: float = 0.5
    # Upper bound on the number of facts returned per extraction call.
    max_facts_per_source: int = 10


@dataclass
class ExtractedFact:
    """A fact extracted from text before storage."""

    subject: str
    predicate: str
    object: str
    confidence: float
    source_text: str = ""
    metadata: dict[str, Any] = field(default_factory=dict)

    def to_fact_create(
        self,
        project_id: Any | None = None,
        source_episode_ids: list[Any] | None = None,
    ) -> FactCreate:
        """
        Convert to FactCreate for storage.

        Only the triple and its confidence are forwarded; ``source_text`` and
        ``metadata`` are not passed to ``FactCreate``.

        Args:
            project_id: Optional project identifier to attach to the fact
            source_episode_ids: Optional source episode identifiers; a
                missing or empty value is stored as an empty list

        Returns:
            FactCreate populated from this extracted fact
        """
        return FactCreate(
            subject=self.subject,
            predicate=self.predicate,
            object=self.object,
            confidence=self.confidence,
            project_id=project_id,
            source_episode_ids=source_episode_ids or [],
        )


class FactExtractor:
    """
    Extracts facts from episodes and text.

    This is a rule-based extractor. In production, this would be replaced
    or augmented with LLM-based extraction for better accuracy.
    """

    # Common predicates we can detect
    PREDICATE_PATTERNS: ClassVar[dict[str, str]] = {
        "uses": r"(?:uses?|using|utilizes?)",
        "requires": r"(?:requires?|needs?|depends?\s+on)",
        "is_a": r"(?:is\s+a|is\s+an|are\s+a|are)",
        "has": r"(?:has|have|contains?)",
        "part_of": r"(?:part\s+of|belongs?\s+to|member\s+of)",
        "causes": r"(?:causes?|leads?\s+to|results?\s+in)",
        "prevents": r"(?:prevents?|avoids?|stops?)",
        "solves": r"(?:solves?|fixes?|resolves?)",
    }

    # Sentence delimiter used by the free-text extractor.
    _SENTENCE_SPLIT_RE: ClassVar[re.Pattern[str]] = re.compile(r"[.!?]+")

    # Lesson shapes: "When X, do Y" / "If X, do Y", "Always X", "Never X".
    _CONDITIONAL_RE: ClassVar[re.Pattern[str]] = re.compile(
        r"(?:when|if)\s+(.+?),\s*(.+)", re.IGNORECASE
    )
    _ALWAYS_RE: ClassVar[re.Pattern[str]] = re.compile(
        r"(?:always)\s+(.+)", re.IGNORECASE
    )
    _NEVER_RE: ClassVar[re.Pattern[str]] = re.compile(
        r"(?:never|avoid)\s+(.+)", re.IGNORECASE
    )

    def __init__(self) -> None:
        """Initialize extractor by compiling the predicate patterns."""
        # Anchor every pattern on word boundaries. Without them "use" matches
        # inside "because", "are" inside "software", "has" inside "hash", and
        # "is a" swallows the first letter of "is an X" leaving "n X" as the
        # object. The extra non-capturing group keeps the anchors correct even
        # if a pattern is a bare alternation.
        self._compiled_patterns = {
            pred: re.compile(rf"\b(?:{pattern})\b", re.IGNORECASE)
            for pred, pattern in self.PREDICATE_PATTERNS.items()
        }

    @staticmethod
    def _filter_and_limit(
        facts: list[ExtractedFact],
        ctx: ExtractionContext,
    ) -> list[ExtractedFact]:
        """Drop facts below ``ctx.min_confidence``, then cap the list length."""
        kept = [f for f in facts if f.confidence >= ctx.min_confidence]
        return kept[: ctx.max_facts_per_source]

    def extract_from_episode(
        self,
        episode: Episode,
        context: ExtractionContext | None = None,
    ) -> list[ExtractedFact]:
        """
        Extract facts from an episode.

        Facts are collected, in order, from the task description, each lesson
        learned, and the outcome details; the combined list is then filtered
        by confidence and truncated to the configured maximum.

        Args:
            episode: Episode to extract from
            context: Optional extraction context

        Returns:
            List of extracted facts
        """
        ctx = context or ExtractionContext()
        facts: list[ExtractedFact] = []

        # Extract from task description
        facts.extend(
            self._extract_from_text(
                episode.task_description,
                source_prefix=episode.task_type,
            )
        )

        # Extract from lessons learned (tolerate a missing/None collection)
        for lesson in episode.lessons_learned or []:
            facts.extend(self._extract_from_lesson(lesson, episode))

        # Extract outcome-based facts
        facts.extend(self._extract_outcome_facts(episode))

        # Limit and filter
        facts = self._filter_and_limit(facts, ctx)

        # Lazy %-formatting: the message is only built if DEBUG is enabled.
        logger.debug("Extracted %s facts from episode %s", len(facts), episode.id)
        return facts

    def _extract_from_text(
        self,
        text: str,
        source_prefix: str = "",
    ) -> list[ExtractedFact]:
        """
        Extract facts from free-form text using pattern matching.

        The text is split into sentences on ``.``, ``!`` and ``?``. For each
        sentence the predicate patterns are tried in ``PREDICATE_PATTERNS``
        order; the first pattern that matches decides the sentence, with the
        text before the match as subject and the text after it as object.

        Args:
            text: Text to scan; empty or shorter than 10 characters yields
                no facts
            source_prefix: Accepted for interface compatibility; currently
                not used by the extraction logic

        Returns:
            At most one fact per sentence, each with confidence 0.6
        """
        facts: list[ExtractedFact] = []

        if not text or len(text) < 10:
            return facts

        # Split into sentences
        for raw_sentence in self._SENTENCE_SPLIT_RE.split(text):
            sentence = raw_sentence.strip()
            if len(sentence) < 10:
                continue

            # Try to match predicate patterns
            for predicate, pattern in self._compiled_patterns.items():
                match = pattern.search(sentence)
                if not match:
                    continue

                # Extract subject (text before predicate)
                subject = sentence[: match.start()].strip()
                # Extract object (text after predicate)
                obj = sentence[match.end() :].strip()

                if len(subject) > 2 and len(obj) > 2:
                    facts.append(
                        ExtractedFact(
                            subject=subject[:200],  # Limit length
                            predicate=predicate,
                            object=obj[:500],
                            confidence=0.6,  # Medium confidence for pattern matching
                            source_text=sentence,
                        )
                    )
                # The first matching predicate settles the sentence, even when
                # its subject/object were too short to yield a fact.
                break

        return facts

    def _extract_from_lesson(
        self,
        lesson: str,
        episode: Episode,
    ) -> list[ExtractedFact]:
        """
        Extract facts from a lesson learned.

        Every sufficiently long lesson yields a ``lesson_learned`` fact keyed
        on the episode's task type. Additional facts are emitted when the
        lesson starts with "when/if ..., ..." (``requires_action``),
        "always ..." (``best_practice``) or "never/avoid ..."
        (``anti_pattern``).

        Args:
            lesson: Lesson text; empty or shorter than 10 characters yields
                no facts
            episode: Episode the lesson belongs to

        Returns:
            List of extracted facts for this lesson
        """
        facts: list[ExtractedFact] = []

        if not lesson or len(lesson) < 10:
            return facts

        # Lessons are typically in the form "Always do X" or "Never do Y"
        # or "When X, do Y"

        # Direct lesson fact
        facts.append(
            ExtractedFact(
                subject=episode.task_type,
                predicate="lesson_learned",
                object=lesson,
                confidence=0.8,  # High confidence for explicit lessons
                source_text=lesson,
                metadata={"outcome": episode.outcome.value},
            )
        )

        # The shape patterns are anchored at the start of the string, so match
        # against the stripped lesson; leading whitespace must not hide them.
        candidate = lesson.strip()

        # Extract conditional patterns
        conditional_match = self._CONDITIONAL_RE.match(candidate)
        if conditional_match:
            condition, action = conditional_match.groups()
            facts.append(
                ExtractedFact(
                    subject=condition.strip(),
                    predicate="requires_action",
                    object=action.strip(),
                    confidence=0.7,
                    source_text=lesson,
                )
            )

        # Extract "always/never" patterns
        always_match = self._ALWAYS_RE.match(candidate)
        if always_match:
            facts.append(
                ExtractedFact(
                    subject=episode.task_type,
                    predicate="best_practice",
                    object=always_match.group(1).strip(),
                    confidence=0.85,
                    source_text=lesson,
                )
            )

        never_match = self._NEVER_RE.match(candidate)
        if never_match:
            facts.append(
                ExtractedFact(
                    subject=episode.task_type,
                    predicate="anti_pattern",
                    object=never_match.group(1).strip(),
                    confidence=0.85,
                    source_text=lesson,
                )
            )

        return facts

    def _extract_outcome_facts(
        self,
        episode: Episode,
    ) -> list[ExtractedFact]:
        """
        Extract facts based on episode outcome.

        A successful episode with outcome details yields a
        ``successful_approach`` fact; a failed one yields a
        ``known_failure_mode`` fact. Other outcomes, or missing details,
        yield nothing.

        Args:
            episode: Episode whose outcome is inspected

        Returns:
            A list containing zero or one fact
        """
        facts: list[ExtractedFact] = []

        # Create fact based on outcome
        if episode.outcome == Outcome.SUCCESS:
            if episode.outcome_details:
                facts.append(
                    ExtractedFact(
                        subject=episode.task_type,
                        predicate="successful_approach",
                        object=episode.outcome_details[:500],
                        confidence=0.75,
                        source_text=episode.outcome_details,
                    )
                )
        elif episode.outcome == Outcome.FAILURE:
            if episode.outcome_details:
                facts.append(
                    ExtractedFact(
                        subject=episode.task_type,
                        predicate="known_failure_mode",
                        object=episode.outcome_details[:500],
                        confidence=0.8,  # High confidence for failures
                        source_text=episode.outcome_details,
                    )
                )

        return facts

    def extract_from_text(
        self,
        text: str,
        context: ExtractionContext | None = None,
    ) -> list[ExtractedFact]:
        """
        Extract facts from arbitrary text.

        Args:
            text: Text to extract from
            context: Optional extraction context

        Returns:
            List of extracted facts
        """
        ctx = context or ExtractionContext()

        facts = self._extract_from_text(text)

        # Filter by confidence, then cap the count
        return self._filter_and_limit(facts, ctx)


# Singleton extractor instance
_extractor: FactExtractor | None = None


def get_fact_extractor() -> FactExtractor:
    """Get the singleton fact extractor instance, creating it on first use."""
    global _extractor
    if _extractor is None:
        _extractor = FactExtractor()
    return _extractor