""" ProcessMonitorThread — watches a running game server process. Responsibilities: 1. Detect when the process exits unexpectedly (crash). 2. On crash: update server status to "crashed" in DB, emit a crash event. 3. If auto_restart is enabled on the server record: trigger restart. 4. Respect max_restarts — if exceeded, leave server in "crashed" state. Poll interval: 5 seconds. """ from __future__ import annotations import logging import queue from core.dal.event_repository import EventRepository from core.dal.server_repository import ServerRepository from core.threads.base_thread import BaseServerThread logger = logging.getLogger(__name__) _POLL_INTERVAL = 5.0 class ProcessMonitorThread(BaseServerThread): """ Monitors the OS process for a running game server. Args: server_id: Database server ID. process_manager: ProcessManager singleton (injected). broadcast_queue: Optional queue.Queue for crash notifications. """ def __init__( self, server_id: int, process_manager, broadcast_queue=None, ) -> None: super().__init__(server_id, "ProcessMonitor") self._process_manager = process_manager self._broadcast_queue = broadcast_queue # ── Main loop ── def _run_loop(self) -> None: self._stop_event.wait(timeout=_POLL_INTERVAL) if self._stop_event.is_set(): return if not self._process_manager.is_running(self.server_id): self._handle_unexpected_exit() # After handling, stop this monitor — the server is no longer running self._fatal_error = True # ── Crash handling ── def _handle_unexpected_exit(self) -> None: if self._db is None: return server_repo = ServerRepository(self._db) event_repo = EventRepository(self._db) server = server_repo.get_by_id(self.server_id) if server is None: return # Only treat as crash if the server was supposed to be running if server["status"] not in ("running", "starting"): return logger.warning( "[%s] Server %d process exited unexpectedly (status was '%s')", self.name, self.server_id, server["status"], ) # Increment crash counter server_repo.increment_restart_count(self.server_id) restart_count = server["restart_count"] + 1 max_restarts = server.get("max_restarts", 3) # Record crash event event_repo.insert( server_id=self.server_id, event_type="crash", detail={"restart_count": restart_count}, ) should_restart = ( server.get("auto_restart", False) and restart_count <= max_restarts ) if should_restart: server_repo.update_status(self.server_id, "restarting") event_repo.insert( server_id=self.server_id, event_type="restart_scheduled", detail={"attempt": restart_count, "max": max_restarts}, ) else: server_repo.update_status(self.server_id, "crashed") if restart_count > max_restarts: event_repo.insert( server_id=self.server_id, event_type="restart_limit_reached", detail={"restart_count": restart_count, "max_restarts": max_restarts}, ) try: self._db.commit() except Exception as exc: logger.error("[%s] DB commit failed during crash handling: %s", self.name, exc) self._db.rollback() if self._broadcast_queue is not None: try: self._broadcast_queue.put_nowait({ "type": "server_status", "server_id": self.server_id, "data": { "status": "restarting" if should_restart else "crashed", "restart_count": restart_count, }, }) except queue.Full: logger.debug("[%s] Broadcast queue full, dropping server_status event", self.name) # Trigger actual restart outside DB work if should_restart: self._trigger_restart() def _trigger_restart(self) -> None: """ Calls ServerService.start() to restart the server. This is safe to call from a background thread. """ try: from database import get_thread_db from core.servers.service import ServerService db = get_thread_db() try: service = ServerService(db) service.start(self.server_id) except Exception as exc: logger.error("[%s] Auto-restart start() failed: %s", self.name, exc, exc_info=True) finally: try: db.close() except Exception as exc: logger.debug("[%s] Error closing restart DB connection: %s", self.name, exc) except Exception as exc: logger.error("[%s] Auto-restart failed: %s", self.name, exc, exc_info=True)