src.core.error_monitoring module#

Marcus Error Monitoring and Correlation System.

Provides comprehensive error tracking, pattern analysis, and correlation capabilities for autonomous agent environments.

class src.core.error_monitoring.AlertSeverity[source]#

Bases: Enum

Alert severity levels.

INFO = 'info'#
WARNING = 'warning'#
ERROR = 'error'#
CRITICAL = 'critical'#
class src.core.error_monitoring.ErrorMetrics[source]#

Bases: object

Error metrics for monitoring.

total_errors: int = 0#
errors_by_type: Dict[str, int]#
errors_by_severity: Dict[str, int]#
errors_by_category: Dict[str, int]#
errors_by_agent: Dict[str, int]#
errors_by_operation: Dict[str, int]#
retryable_errors: int = 0#
critical_errors: int = 0#
error_rate_per_minute: float = 0.0#
last_updated: datetime#
__init__(total_errors=0, errors_by_type=<factory>, errors_by_severity=<factory>, errors_by_category=<factory>, errors_by_agent=<factory>, errors_by_operation=<factory>, retryable_errors=0, critical_errors=0, error_rate_per_minute=0.0, last_updated=<factory>)#
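Parameters:
  • total_errors (int)

  • errors_by_type (Dict[str, int])

  • errors_by_severity (Dict[str, int])

  • errors_by_category (Dict[str, int])

  • errors_by_agent (Dict[str, int])

  • errors_by_operation (Dict[str, int])

  • retryable_errors (int)

  • critical_errors (int)

  • error_rate_per_minute (float)

  • last_updated (datetime)

Return type:

None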

class src.core.error_monitoring.ErrorPattern[source]#

Bases: object

Detected error pattern.

pattern_id: str#
pattern_type: str#
description: str#
frequency: int#
first_seen: datetime#
last_seen: datetime#
affected_agents: Set[str]#
affected_operations: Set[str]#
severity: ErrorSeverity = ErrorSeverity.MEDIUM#
sample_errors: List[str]#
__init__(pattern_id, pattern_type, description, frequency, first_seen, last_seen, affected_agents=<factory>, affected_operations=<factory>, severity=ErrorSeverity.MEDIUM, sample_errors=<factory>)#
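Parameters:
  • pattern_id (str)

  • pattern_type (str)

  • description (str)

  • frequency (int)

  • first_seen (datetime)

  • last_seen (datetime)

  • affected_agents (Set[str])

  • affected_operations (Set[str])

  • severity (ErrorSeverity)

  • sample_errors (List[str])

Return type:

None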

class src.core.error_monitoring.CorrelationGroup[source]#

Bases: object

Group of correlated errors.

group_id: str#
correlation_key: str#
errors: List[str]#
start_time: datetime#
end_time: datetime | None = None#
pattern: str | None = None#
root_cause: str | None = None#
__init__(group_id, correlation_key, errors=<factory>, start_time=<factory>, end_time=None, pattern=None, root_cause=None)#
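Parameters:
  • group_id (str)

  • correlation_key (str)

  • errors (List[str])

  • start_time (datetime)

  • end_time (datetime | None)

  • pattern (str | None)

  • root_cause (str | None)

Return type:

None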

class src.core.error_monitoring.ErrorMonitor[source]#

Bases: object

Comprehensive error monitoring system.

Tracks error patterns, provides real-time metrics, and enables proactive issue detection for autonomous agents.

__init__(storage_path='logs/error_monitoring.json', metrics_window_minutes=60, pattern_detection_enabled=True, correlation_timeout_minutes=30)[source]#
Parameters:
  • storage_path (str)

  • metrics_window_minutes (int)

  • pattern_detection_enabled (bool)

  • correlation_timeout_minutes (int)

error_history: deque[Dict[str, Any]]#
error_index: Dict[str, Dict[str, Any]]#
metrics_history: List[ErrorMetrics]#
detected_patterns: Dict[str, ErrorPattern]#
correlation_groups: Dict[str, CorrelationGroup]#
active_correlations: Dict[str, str]#
alert_callbacks: List[Callable[[ErrorPattern], None]]#
record_error(error)[source]#

Record an error for monitoring and analysis.

Return type:

None

Parameters:

error (MarcusBaseError)

add_alert_callback(callback)[source]#

Add callback for pattern alerts.

Return type:

None

Parameters:

callback (Callable[[ErrorPattern], None])

get_current_metrics()[source]#

Get current error metrics.

Return type:

ErrorMetrics

get_metrics_history(hours=24)[source]#

Get metrics history for specified hours.

Return type:

List[ErrorMetrics]

Parameters:

hours (int)

get_detected_patterns(active_only=True)[source]#

Get detected error patterns.

Return type:

List[ErrorPattern]

Parameters:

active_only (bool)

get_correlation_groups(active_only=True)[source]#

Get error correlation groups.

Return type:

List[CorrelationGroup]

Parameters:

active_only (bool)

get_error_details(correlation_id)[source]#

Get detailed information about a specific error.

Return type:

Optional[Dict[str, Any]]

Parameters:

correlation_id (str)

search_errors(error_type=None, agent_id=None, operation=None, severity=None, hours=24)[source]#

Search errors with specified criteria.

Return type:

List[Dict[str, Any]]

Parameters:
  • error_type (str | None)

  • agent_id (str | None)

  • operation (str | None)

  • severity (str | None)

  • hours (int)

generate_health_report()[source]#

Generate comprehensive health report.

Return type:

Dict[str, Any]

async start_monitoring()[source]#

Start background monitoring tasks.

Return type:

None

async stop_monitoring()[source]#

Stop background monitoring tasks.

Return type:

None

src.core.error_monitoring.setup_error_monitoring(storage_path='logs/error_monitoring.json', enable_patterns=True, alert_callback=None)[source]#

Set up global error monitoring.

Return type:

ErrorMonitor

Parameters:
  • storage_path (str)

  • enable_patterns (bool)

  • alert_callback (optional; defaults to None)

src.core.error_monitoring.record_error_for_monitoring(error)[source]#

Record error in global monitor.

Return type:

None

Parameters:

error (MarcusBaseError)

src.core.error_monitoring.get_error_health_status()[source]#

Get current error health status.

Return type:

Dict[str, Any]