feat: implement full backend + frontend server detail, settings, and create server pages
Backend: - Complete FastAPI backend with 42+ REST endpoints (auth, servers, config, players, bans, missions, mods, games, system) - Game adapter architecture with Arma 3 as first-class adapter - WebSocket real-time events for status, metrics, logs, players - Background thread system (process monitor, metrics, log tail, RCon poller) - Fernet encryption for sensitive config fields at rest - JWT auth with admin/viewer roles, bcrypt password hashing - SQLite with WAL mode, parameterized queries, migration system - APScheduler cleanup jobs for logs, metrics, events Frontend: - Server Detail page with 7 tabs (overview, config, players, bans, missions, mods, logs) - Settings page with password change and admin user management - Create Server wizard (4-step; known bug: silent validation failure) - New hooks: useServerDetail, useAuth, useGames - New components: ServerHeader, ConfigEditor, PlayerTable, BanTable, MissionList, ModList, LogViewer, PasswordChange, UserManager - WebSocket onEvent callback for real-time log accumulation - 120 unit tests passing (Vitest + React Testing Library) Docs: - Added .gitignore, CLAUDE.md, README.md - Updated FRONTEND.md, ARCHITECTURE.md with current implementation state - Added .env.example for backend configuration Known issues: - Create Server form: "Next" buttons don't validate before advancing, causing silent submit failure when fields are invalid - Config sub-tabs need UX redesign for non-technical users
This commit is contained in:
3
backend/core/threads/__init__.py
Normal file
3
backend/core/threads/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
# Public package surface for core.threads — only the registry is exported;
# the individual thread classes are internal and imported directly where needed.
from core.threads.thread_registry import ThreadRegistry

__all__ = ["ThreadRegistry"]
|
||||
123
backend/core/threads/base_thread.py
Normal file
123
backend/core/threads/base_thread.py
Normal file
@@ -0,0 +1,123 @@
|
||||
"""
|
||||
BaseServerThread — base class for all per-server background threads.
|
||||
|
||||
Rules every subclass MUST follow:
|
||||
- Call super().__init__(server_id, name) in __init__
|
||||
- Implement _run_loop() — called repeatedly until _stop_event is set
|
||||
- Do NOT override run() directly
|
||||
- Use self._db for all database operations — it is a thread-local connection
|
||||
- Call self._close_db() in your finally block if you open additional connections
|
||||
- Exceptions raised from _run_loop() are caught, logged, and the loop continues
|
||||
unless the exception is a fatal error — set self._fatal_error = True to stop
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import threading
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
from database import get_thread_db
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_EXCEPTION_BACKOFF_BASE = 2.0
|
||||
_EXCEPTION_BACKOFF_MAX = 60.0
|
||||
_EXCEPTION_BACKOFF_MULTIPLIER = 2.0
|
||||
|
||||
|
||||
class BaseServerThread(ABC, threading.Thread):
    """
    Abstract base for all per-server background threads.

    Subclasses implement _run_loop(). This base class handles:
    - Stop event signaling
    - Thread-local DB connection lifecycle
    - Exception backoff to prevent tight crash loops
    - Structured logging with server_id context
    """

    def __init__(self, server_id: int, name: str) -> None:
        """
        Args:
            server_id: Database ID of the server this thread serves.
            name: Thread-type label; the final thread name becomes
                "<name>-server-<server_id>".
        """
        # daemon=True so a hung worker can never block interpreter shutdown
        super().__init__(name=f"{name}-server-{server_id}", daemon=True)
        self.server_id = server_id
        self._stop_event = threading.Event()
        self._fatal_error = False
        self._db = None  # thread-local DB connection, opened in run()
        self._exception_count = 0

    # ── Public API ──

    def stop(self) -> None:
        """Signal the thread to stop. Does not block."""
        self._stop_event.set()

    def stop_and_join(self, timeout: float = 5.0) -> None:
        """Signal stop and wait up to *timeout* seconds for the thread to exit."""
        self._stop_event.set()
        self.join(timeout=timeout)
        if self.is_alive():
            # join() timing out is silent by default — surface it for diagnostics
            logger.warning("[%s] Did not stop within %.1fs", self.name, timeout)

    @property
    def is_stopping(self) -> bool:
        """True once stop() has been requested."""
        return self._stop_event.is_set()

    # ── Thread entry point ──

    def run(self) -> None:
        """Thread main: setup, supervised loop with backoff, guaranteed teardown.

        Do NOT override in subclasses — implement _run_loop() instead.
        """
        logger.info("[%s] Starting", self.name)
        backoff = _EXCEPTION_BACKOFF_BASE

        try:
            self._db = get_thread_db()
            self._on_start()

            while not self._stop_event.is_set() and not self._fatal_error:
                try:
                    self._run_loop()
                    # A clean iteration resets the crash-loop backoff
                    backoff = _EXCEPTION_BACKOFF_BASE
                    self._exception_count = 0
                except Exception as exc:
                    self._exception_count += 1
                    logger.error(
                        "[%s] Unhandled exception in _run_loop (count=%d): %s",
                        self.name, self._exception_count, exc, exc_info=True,
                    )
                    if self._fatal_error:
                        break
                    # Exponential backoff so a persistently failing loop
                    # cannot spin at full speed
                    self._stop_event.wait(timeout=backoff)
                    backoff = min(backoff * _EXCEPTION_BACKOFF_MULTIPLIER, _EXCEPTION_BACKOFF_MAX)

        except Exception as exc:
            logger.critical("[%s] Fatal error in thread setup: %s", self.name, exc, exc_info=True)
        finally:
            # FIX: an exception escaping _on_stop() previously skipped
            # _close_db() and the "Stopped" log line, leaking the thread-local
            # connection. Teardown steps are now independent of each other.
            try:
                self._on_stop()
            except Exception as exc:
                logger.error("[%s] Error in _on_stop: %s", self.name, exc, exc_info=True)
            self._close_db()
            logger.info("[%s] Stopped", self.name)

    # ── Hooks for subclasses ──

    def _on_start(self) -> None:
        """Called once before the loop starts. Override for setup."""

    def _on_stop(self) -> None:
        """Called once after the loop ends. Override for cleanup."""

    @abstractmethod
    def _run_loop(self) -> None:
        """
        Implement the thread's work here.
        Called repeatedly until stop() is called or _fatal_error is set.
        Should block for a short period (sleep or wait) to avoid busy-looping.
        """

    # ── Internal helpers ──

    def _close_db(self) -> None:
        # Best-effort close; connection errors at shutdown are not actionable
        if self._db is not None:
            try:
                self._db.close()
            except Exception as exc:
                logger.debug("[%s] Error closing DB connection: %s", self.name, exc)
            self._db = None

    def _sleep(self, seconds: float) -> None:
        """Interruptible sleep — wakes up early if stop() is called."""
        self._stop_event.wait(timeout=seconds)
|
||||
167
backend/core/threads/log_tail.py
Normal file
167
backend/core/threads/log_tail.py
Normal file
@@ -0,0 +1,167 @@
|
||||
"""
|
||||
LogTailThread — tails a server's log file, parses lines via LogParser,
|
||||
and persists parsed entries to the logs table.
|
||||
|
||||
Design notes:
|
||||
- Opens the log file in text mode with errors="replace" to handle encoding issues
|
||||
- Detects log rotation by checking if the inode changes (Unix) or file shrinks (Windows)
|
||||
- On rotation: closes old handle, reopens from position 0
|
||||
- Flushes inserts in batches of up to LOG_BATCH_SIZE per loop iteration
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
import queue
|
||||
from pathlib import Path
|
||||
from typing import Callable, Optional
|
||||
|
||||
from core.dal.log_repository import LogRepository
|
||||
from core.threads.base_thread import BaseServerThread
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_LOG_BATCH_SIZE = 50
|
||||
_POLL_INTERVAL = 1.0
|
||||
_REOPEN_DELAY = 2.0
|
||||
|
||||
|
||||
class LogTailThread(BaseServerThread):
    """
    Tails a log file for a specific server.

    Args:
        server_id: The database server ID.
        log_path: Absolute path to the log file to tail.
        log_parser: LogParser adapter instance for this game type.
        broadcast_queue: Optional queue.Queue to push parsed events to BroadcastThread.
    """

    def __init__(
        self,
        server_id: int,
        log_path: str,
        log_parser,
        broadcast_queue=None,
    ) -> None:
        super().__init__(server_id, "LogTail")
        self._log_path = log_path
        self._log_parser = log_parser
        self._broadcast_queue = broadcast_queue
        self._file_handle = None
        self._last_inode = None  # inode at open time — rotation detection on Unix
        self._last_size = 0      # last observed size — Windows rotation fallback

    # ── Lifecycle ──

    def _on_start(self) -> None:
        # Initial open tails from EOF so historical lines are not re-ingested
        self._open_log_file()

    def _on_stop(self) -> None:
        self._close_file()

    # ── Main loop ──

    def _run_loop(self) -> None:
        """One tail iteration: handle missing file / rotation, then read a batch."""
        if self._file_handle is None:
            # File may not exist yet (server still starting) — retry periodically
            self._stop_event.wait(timeout=_POLL_INTERVAL)
            self._open_log_file()
            return

        if self._detect_rotation():
            logger.info("[%s] Log rotation detected, reopening", self.name)
            self._close_file()
            self._stop_event.wait(timeout=_REOPEN_DELAY)
            # FIX: per the module design note, a rotated file must be read from
            # position 0 — previously it was reopened seeked to EOF, silently
            # skipping everything written to the new file before the reopen.
            self._open_log_file(seek_to_end=False)
            return

        lines_read = 0
        entries_to_insert = []

        while lines_read < _LOG_BATCH_SIZE:
            pos = self._file_handle.tell()
            line = self._file_handle.readline()
            if not line:
                break
            if not line.endswith("\n"):
                # FIX: a line without a terminator is still being written by the
                # game process — rewind so it is parsed whole on the next pass
                # instead of as a truncated fragment.
                self._file_handle.seek(pos)
                break
            lines_read += 1
            line = line.rstrip("\n").rstrip("\r")
            if not line:
                continue

            parsed = self._log_parser.parse_line(line)
            if parsed is not None:
                entries_to_insert.append(parsed)

        if entries_to_insert and self._db is not None:
            log_repo = LogRepository(self._db)
            for entry in entries_to_insert:
                log_repo.insert(server_id=self.server_id, entry=entry)
            try:
                self._db.commit()
            except Exception as exc:
                logger.error("[%s] DB commit failed: %s", self.name, exc)
                self._db.rollback()

        if self._broadcast_queue is not None:
            for entry in entries_to_insert:
                try:
                    self._broadcast_queue.put_nowait({
                        "type": "log",
                        "server_id": self.server_id,
                        "data": entry,
                    })
                except queue.Full:
                    logger.debug("[%s] Broadcast queue full, dropping log event", self.name)

        if lines_read == 0:
            # Nothing new — idle-wait instead of busy-polling
            self._stop_event.wait(timeout=_POLL_INTERVAL)

    # ── File management ──

    def _open_log_file(self, seek_to_end: bool = True) -> None:
        """Open the log file and record inode/size for rotation detection.

        Args:
            seek_to_end: When True (default) start tailing at EOF; rotation
                recovery passes False to read the new file from the beginning.
        """
        if not os.path.exists(self._log_path):
            # Stay quiet — the main loop retries every poll interval
            return
        try:
            self._file_handle = open(
                self._log_path, "r", encoding="utf-8", errors="replace"
            )
            if seek_to_end:
                self._file_handle.seek(0, 2)
            self._last_size = self._file_handle.tell()
            stat = os.stat(self._log_path)
            self._last_inode = getattr(stat, "st_ino", None)
            logger.debug("[%s] Opened log file: %s", self.name, self._log_path)
        except OSError as exc:
            logger.warning("[%s] Cannot open log file %s: %s", self.name, self._log_path, exc)
            self._file_handle = None

    def _close_file(self) -> None:
        if self._file_handle is not None:
            try:
                self._file_handle.close()
            except OSError as exc:
                logger.debug("[%s] Error closing log file: %s", self.name, exc)
            self._file_handle = None
        self._last_inode = None
        self._last_size = 0

    def _detect_rotation(self) -> bool:
        """Returns True if the log file has been rotated."""
        try:
            stat = os.stat(self._log_path)
        except OSError:
            # File vanished — treat as rotation so the loop attempts a reopen
            return True

        # Unix: an inode change means a new file replaced the old path
        current_inode = getattr(stat, "st_ino", None)
        if current_inode is not None and self._last_inode is not None:
            if current_inode != self._last_inode:
                return True

        # Windows fallback: file shrunk below our current read position
        current_size = stat.st_size
        if self._file_handle is not None:
            current_pos = self._file_handle.tell()
            if current_size < current_pos:
                return True
        self._last_size = current_size

        return False
|
||||
118
backend/core/threads/metrics_collector.py
Normal file
118
backend/core/threads/metrics_collector.py
Normal file
@@ -0,0 +1,118 @@
|
||||
"""
|
||||
MetricsCollectorThread — collects CPU and memory usage for a server process
|
||||
and persists to the metrics table every COLLECTION_INTERVAL seconds.
|
||||
|
||||
Uses psutil to inspect the process identified by ProcessManager.get_pid().
|
||||
If the process is not running, the thread sleeps and retries.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import queue
|
||||
|
||||
import psutil
|
||||
|
||||
from core.dal.metrics_repository import MetricsRepository
|
||||
from core.threads.base_thread import BaseServerThread
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_COLLECTION_INTERVAL = 10.0
|
||||
_RETENTION_DAYS = 1
|
||||
|
||||
|
||||
class MetricsCollectorThread(BaseServerThread):
    """
    Collects process metrics for a running game server.

    Args:
        server_id: Database server ID.
        process_manager: ProcessManager singleton instance.
        broadcast_queue: Optional queue.Queue for real-time metric pushes.
    """

    def __init__(
        self,
        server_id: int,
        process_manager,
        broadcast_queue=None,
    ) -> None:
        super().__init__(server_id, "MetricsCollector")
        self._process_manager = process_manager
        self._broadcast_queue = broadcast_queue
        self._psutil_process = None  # cached psutil.Process handle, reused across samples
        self._samples_since_cleanup = 0
        self._cleanup_every = 360  # ~1 hour at 10s intervals

    # ── Main loop ──

    def _run_loop(self) -> None:
        """One sampling cycle: resolve PID, wait one interval, sample, persist."""
        pid = self._process_manager.get_pid(self.server_id)
        if pid is None:
            # Server not running — drop the cached handle and retry later
            self._psutil_process = None
            self._stop_event.wait(timeout=_COLLECTION_INTERVAL)
            return

        # Reuse or create psutil.Process handle
        if self._psutil_process is None or self._psutil_process.pid != pid:
            try:
                self._psutil_process = psutil.Process(pid)
                # Prime cpu_percent: the first interval=None call always
                # returns 0.0, so take one throwaway sample now
                self._psutil_process.cpu_percent(interval=None)
            except (psutil.NoSuchProcess, psutil.AccessDenied) as exc:
                # FIX: AccessDenied (e.g. PID reused by a privileged process)
                # previously escaped this branch and tripped the base-class
                # crash backoff; treat it like "not sampleable yet" instead.
                logger.debug("[%s] Cannot attach to process %d: %s", self.name, pid, exc)
                self._psutil_process = None
                self._stop_event.wait(timeout=_COLLECTION_INTERVAL)
                return

        self._stop_event.wait(timeout=_COLLECTION_INTERVAL)

        if self._stop_event.is_set():
            return

        try:
            cpu_pct = self._psutil_process.cpu_percent(interval=None)
            mem_info = self._psutil_process.memory_info()
            mem_mb = round(mem_info.rss / (1024 * 1024), 2)
        except psutil.NoSuchProcess:
            logger.info("[%s] Process %d no longer exists", self.name, pid)
            self._psutil_process = None
            return
        except psutil.AccessDenied as exc:
            logger.warning("[%s] Access denied reading process %d: %s", self.name, pid, exc)
            return

        if self._db is None:
            return

        metrics_repo = MetricsRepository(self._db)
        metrics_repo.insert(
            server_id=self.server_id,
            cpu_percent=cpu_pct,
            ram_mb=mem_mb,
        )
        try:
            self._db.commit()
        except Exception as exc:
            logger.error("[%s] DB commit failed: %s", self.name, exc)
            self._db.rollback()
            return

        if self._broadcast_queue is not None:
            try:
                self._broadcast_queue.put_nowait({
                    "type": "metrics",
                    "server_id": self.server_id,
                    "data": {"cpu_percent": cpu_pct, "memory_mb": mem_mb},
                })
            except queue.Full:
                logger.debug("[%s] Broadcast queue full, dropping metrics event", self.name)

        # Periodic cleanup so the metrics table stays bounded
        self._samples_since_cleanup += 1
        if self._samples_since_cleanup >= self._cleanup_every:
            self._samples_since_cleanup = 0
            try:
                metrics_repo.cleanup_old(server_id=self.server_id, retention_days=_RETENTION_DAYS)
                self._db.commit()
            except Exception as exc:
                logger.warning("[%s] Cleanup failed: %s", self.name, exc)
                self._db.rollback()
|
||||
158
backend/core/threads/process_monitor.py
Normal file
158
backend/core/threads/process_monitor.py
Normal file
@@ -0,0 +1,158 @@
|
||||
"""
|
||||
ProcessMonitorThread — watches a running game server process.
|
||||
|
||||
Responsibilities:
|
||||
1. Detect when the process exits unexpectedly (crash).
|
||||
2. On crash: update server status to "crashed" in DB, emit a crash event.
|
||||
3. If auto_restart is enabled on the server record: trigger restart.
|
||||
4. Respect max_restarts — if exceeded, leave server in "crashed" state.
|
||||
|
||||
Poll interval: 5 seconds.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import queue
|
||||
|
||||
from core.dal.event_repository import EventRepository
|
||||
from core.dal.server_repository import ServerRepository
|
||||
from core.threads.base_thread import BaseServerThread
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_POLL_INTERVAL = 5.0
|
||||
|
||||
|
||||
class ProcessMonitorThread(BaseServerThread):
    """
    Monitors the OS process for a running game server.

    Detects unexpected process exits, records a crash event, and either
    schedules an auto-restart or marks the server "crashed" (see
    _handle_unexpected_exit for the exact ordering of DB and broadcast work).

    Args:
        server_id: Database server ID.
        process_manager: ProcessManager singleton (injected).
        broadcast_queue: Optional queue.Queue for crash notifications.
    """

    def __init__(
        self,
        server_id: int,
        process_manager,
        broadcast_queue=None,
    ) -> None:
        super().__init__(server_id, "ProcessMonitor")
        self._process_manager = process_manager
        self._broadcast_queue = broadcast_queue

    # ── Main loop ──

    def _run_loop(self) -> None:
        # Wait first, then check — so the check happens at most once per
        # _POLL_INTERVAL and a stop request wakes us immediately.
        self._stop_event.wait(timeout=_POLL_INTERVAL)

        if self._stop_event.is_set():
            return

        if not self._process_manager.is_running(self.server_id):
            self._handle_unexpected_exit()
            # After handling, stop this monitor — the server is no longer running
            self._fatal_error = True

    # ── Crash handling ──

    def _handle_unexpected_exit(self) -> None:
        """Record the crash, decide on auto-restart, notify listeners.

        Order matters: status/event rows are written and committed before the
        broadcast, and the actual restart is triggered last, outside DB work.
        """
        if self._db is None:
            return

        server_repo = ServerRepository(self._db)
        event_repo = EventRepository(self._db)

        server = server_repo.get_by_id(self.server_id)
        if server is None:
            return

        # Only treat as crash if the server was supposed to be running
        if server["status"] not in ("running", "starting"):
            return

        logger.warning(
            "[%s] Server %d process exited unexpectedly (status was '%s')",
            self.name, self.server_id, server["status"],
        )

        # Increment crash counter
        server_repo.increment_restart_count(self.server_id)
        # NOTE(review): restart_count is derived from the row read *before*
        # the increment above — consistent only if this thread is the sole
        # writer of that column; confirm against ServerService.
        restart_count = server["restart_count"] + 1
        max_restarts = server.get("max_restarts", 3)

        # Record crash event
        event_repo.insert(
            server_id=self.server_id,
            event_type="crash",
            detail={"restart_count": restart_count},
        )

        should_restart = (
            server.get("auto_restart", False)
            and restart_count <= max_restarts
        )

        if should_restart:
            server_repo.update_status(self.server_id, "restarting")
            event_repo.insert(
                server_id=self.server_id,
                event_type="restart_scheduled",
                detail={"attempt": restart_count, "max": max_restarts},
            )
        else:
            server_repo.update_status(self.server_id, "crashed")
            if restart_count > max_restarts:
                event_repo.insert(
                    server_id=self.server_id,
                    event_type="restart_limit_reached",
                    detail={"restart_count": restart_count, "max_restarts": max_restarts},
                )

        try:
            self._db.commit()
        except Exception as exc:
            logger.error("[%s] DB commit failed during crash handling: %s", self.name, exc)
            self._db.rollback()

        # Broadcast after commit so listeners never see uncommitted state
        if self._broadcast_queue is not None:
            try:
                self._broadcast_queue.put_nowait({
                    "type": "server_status",
                    "server_id": self.server_id,
                    "data": {
                        "status": "restarting" if should_restart else "crashed",
                        "restart_count": restart_count,
                    },
                })
            except queue.Full:
                logger.debug("[%s] Broadcast queue full, dropping server_status event", self.name)

        # Trigger actual restart outside DB work
        if should_restart:
            self._trigger_restart()

    def _trigger_restart(self) -> None:
        """
        Calls ServerService.start() to restart the server.
        This is safe to call from a background thread.
        """
        # NOTE(review): if ServerService.start() joins/stops this very monitor
        # thread while restarting, this call could deadlock — verify the
        # registry's stop path against this code.
        try:
            # Imported lazily — presumably to avoid a circular import at
            # module load time; confirm before hoisting to the top of the file.
            from database import get_thread_db
            from core.servers.service import ServerService

            # Fresh connection: self._db belongs to this thread's loop and
            # the restart path manages its own connection lifetime.
            db = get_thread_db()
            try:
                service = ServerService(db)
                service.start(self.server_id)
            except Exception as exc:
                logger.error("[%s] Auto-restart start() failed: %s", self.name, exc, exc_info=True)
            finally:
                try:
                    db.close()
                except Exception as exc:
                    logger.debug("[%s] Error closing restart DB connection: %s", self.name, exc)
        except Exception as exc:
            logger.error("[%s] Auto-restart failed: %s", self.name, exc, exc_info=True)
|
||||
169
backend/core/threads/remote_admin_poller.py
Normal file
169
backend/core/threads/remote_admin_poller.py
Normal file
@@ -0,0 +1,169 @@
|
||||
"""
|
||||
RemoteAdminPollerThread — polls the game server's remote admin interface
|
||||
(e.g. BattlEye RCon for Arma3) to sync the player list.
|
||||
|
||||
Design notes:
|
||||
- Uses the RemoteAdminClient protocol injected at construction time
|
||||
- Reconnects automatically on disconnect with exponential backoff
|
||||
- Persists current player list to players table via PlayerRepository
|
||||
- Emits player_join / player_leave events via EventRepository
|
||||
- Pushes player list updates to broadcast_queue if provided
|
||||
|
||||
Poll interval: 30 seconds.
|
||||
Reconnect backoff: 5s -> 10s -> 20s -> 40s -> 60s (cap).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import queue
|
||||
|
||||
from core.dal.event_repository import EventRepository
|
||||
from core.dal.player_repository import PlayerRepository
|
||||
from core.threads.base_thread import BaseServerThread
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_POLL_INTERVAL = 30.0
|
||||
_RECONNECT_BACKOFF_BASE = 5.0
|
||||
_RECONNECT_BACKOFF_MAX = 60.0
|
||||
_RECONNECT_BACKOFF_MULT = 2.0
|
||||
|
||||
|
||||
class RemoteAdminPollerThread(BaseServerThread):
    """
    Polls the remote admin interface for a game server.

    Args:
        server_id: Database server ID.
        remote_admin_client: Connected RemoteAdminClient instance.
        broadcast_queue: Optional queue.Queue for player list pushes.
    """

    def __init__(
        self,
        server_id: int,
        remote_admin_client,
        broadcast_queue=None,
    ) -> None:
        super().__init__(server_id, "RemoteAdminPoller")
        self._client = remote_admin_client
        self._broadcast_queue = broadcast_queue
        self._connected = False
        self._reconnect_backoff = _RECONNECT_BACKOFF_BASE
        self._known_players: dict[str, dict] = {}  # player_uid -> player data

    # ── Lifecycle ──

    def _on_stop(self) -> None:
        # Best-effort disconnect so the game server drops our admin session
        if self._connected and self._client is not None:
            try:
                self._client.disconnect()
            except Exception as exc:
                logger.debug("[%s] Error disconnecting remote admin on stop: %s", self.name, exc)
            self._connected = False

    # ── Main loop ──

    def _run_loop(self) -> None:
        """One poll cycle: (re)connect if needed, then fetch and sync players."""
        if not self._connected:
            self._attempt_connect()
            return

        self._stop_event.wait(timeout=_POLL_INTERVAL)

        if self._stop_event.is_set():
            return

        try:
            players = self._client.get_players()
            # A successful poll resets the reconnect backoff
            self._reconnect_backoff = _RECONNECT_BACKOFF_BASE
            self._sync_players(players)
        except Exception as exc:
            logger.warning("[%s] Poll failed: %s — will reconnect", self.name, exc)
            self._connected = False
            try:
                if self._client is not None:
                    self._client.disconnect()
            except Exception as exc:
                logger.debug("[%s] Error disconnecting after poll failure: %s", self.name, exc)

    # ── Connection management ──

    def _attempt_connect(self) -> None:
        """Try to connect; on failure back off exponentially before the next try."""
        try:
            # FIX: was a conditional *expression* used as a statement
            # (`self._client.connect() if hasattr(...) else None`) — legal but
            # misleading. Clients without a connect() method are treated as
            # already connected, same as before.
            if hasattr(self._client, "connect"):
                self._client.connect()
            self._connected = True
            self._reconnect_backoff = _RECONNECT_BACKOFF_BASE
            logger.info("[%s] Connected to remote admin", self.name)
        except Exception as exc:
            logger.warning(
                "[%s] Connection failed: %s — retrying in %.1fs",
                self.name, exc, self._reconnect_backoff,
            )
            self._stop_event.wait(timeout=self._reconnect_backoff)
            self._reconnect_backoff = min(
                self._reconnect_backoff * _RECONNECT_BACKOFF_MULT,
                _RECONNECT_BACKOFF_MAX,
            )

    # ── Player sync ──

    def _sync_players(self, current_players: list[dict]) -> None:
        """
        Diff current_players against self._known_players.
        Insert join events for new players, leave events for departed ones.
        Upsert all current players in the DB.

        Each player dict must have at least: slot_id, name (other fields optional).
        """
        if self._db is None:
            return

        player_repo = PlayerRepository(self._db)
        event_repo = EventRepository(self._db)

        # Build uid sets for diffing — use slot_id as key.
        # NOTE(review): slot ids can be reused by the game server; a leave+join
        # of different players on the same slot between polls would go
        # undetected here — consider keying on a stable player uid if the
        # client exposes one.
        current_slots = {str(p.get("slot_id", i)): p for i, p in enumerate(current_players)}
        current_keys = set(current_slots.keys())
        known_keys = set(self._known_players.keys())

        joined = current_keys - known_keys
        left = known_keys - current_keys

        for slot_key, player in current_slots.items():
            player_repo.upsert(server_id=self.server_id, player=player)
            if slot_key in joined:
                event_repo.insert(
                    server_id=self.server_id,
                    event_type="player_join",
                    detail={"name": player.get("name", ""), "slot": slot_key},
                )
                logger.debug("[%s] Player joined: %s (slot %s)", self.name, player.get("name"), slot_key)

        for slot_key in left:
            departed = self._known_players[slot_key]
            event_repo.insert(
                server_id=self.server_id,
                event_type="player_leave",
                detail={"name": departed.get("name", ""), "slot": slot_key},
            )
            logger.debug("[%s] Player left: %s (slot %s)", self.name, departed.get("name"), slot_key)

        try:
            self._db.commit()
        except Exception as exc:
            logger.error("[%s] DB commit failed during player sync: %s", self.name, exc)
            self._db.rollback()

        # Update known players
        self._known_players = current_slots

        if self._broadcast_queue is not None:
            try:
                self._broadcast_queue.put_nowait({
                    "type": "players",
                    "server_id": self.server_id,
                    "data": current_players,
                })
            except queue.Full:
                logger.debug("[%s] Broadcast queue full, dropping players event", self.name)
|
||||
257
backend/core/threads/thread_registry.py
Normal file
257
backend/core/threads/thread_registry.py
Normal file
@@ -0,0 +1,257 @@
|
||||
"""
|
||||
ThreadRegistry — manages the lifecycle of all per-server background threads.
|
||||
|
||||
One instance is created at app startup and stored in app.state.thread_registry.
|
||||
Also provides class-level methods for convenience (called from ServerService).
|
||||
|
||||
Thread set per server:
|
||||
- LogTailThread (started if adapter has "log_parser" capability and log_path is known)
|
||||
- MetricsCollectorThread (always started)
|
||||
- ProcessMonitorThread (always started)
|
||||
- RemoteAdminPollerThread (started only if adapter has "remote_admin" capability)
|
||||
|
||||
Key methods:
|
||||
start_server_threads(server_id, db) — start all threads for a server
|
||||
stop_server_threads(server_id) — stop all threads for a server
|
||||
reattach_server_threads(server_id, db) — re-attach threads without restarting process
|
||||
stop_all() — called at app shutdown
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import queue
|
||||
|
||||
from adapters.registry import GameAdapterRegistry
|
||||
from core.dal.config_repository import ConfigRepository
|
||||
from core.dal.server_repository import ServerRepository
|
||||
from core.threads.log_tail import LogTailThread
|
||||
from core.threads.metrics_collector import MetricsCollectorThread
|
||||
from core.threads.process_monitor import ProcessMonitorThread
|
||||
from core.threads.remote_admin_poller import RemoteAdminPollerThread
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Module-level singleton for convenience (used by ServerService)
|
||||
_instance: ThreadRegistry | None = None
|
||||
|
||||
|
||||
class ThreadRegistry:
|
||||
"""
|
||||
Manages all background threads for all running servers.
|
||||
"""
|
||||
|
||||
    def __init__(
        self,
        process_manager,
        adapter_registry: GameAdapterRegistry | None = None,
        global_broadcast_queue: queue.Queue | None = None,
    ) -> None:
        """
        Args:
            process_manager: ProcessManager used to query/start server processes.
            adapter_registry: Game adapter lookup; defaults to the
                GameAdapterRegistry class itself when not injected.
            global_broadcast_queue: Shared queue for real-time events; a bounded
                queue is created when none is provided.
        """
        self._process_manager = process_manager
        # Fall back to the class itself when no instance is injected (tests
        # can pass a fake registry here)
        self._adapter_registry = adapter_registry or GameAdapterRegistry
        # Bounded so slow event consumers cannot grow memory without limit
        self._broadcast_queue = global_broadcast_queue or queue.Queue(maxsize=1000)
        self._bundles: dict[int, dict] = {}  # server_id -> thread bundle
|
||||
|
||||
# ── Class-level convenience API ──
|
||||
|
||||
    @classmethod
    def _get_instance(cls) -> "ThreadRegistry | None":
        # Returns the module-level singleton, or None if set_instance() was
        # never called (e.g. in unit tests) — callers must handle None.
        return _instance
|
||||
|
||||
    @classmethod
    def set_instance(cls, registry: "ThreadRegistry") -> None:
        """Install the module-level singleton used by the class-level API.

        Called once at app startup after the registry is constructed.
        """
        global _instance
        _instance = registry
|
||||
|
||||
@classmethod
|
||||
def start_server_threads(cls, server_id: int, db) -> None:
|
||||
"""Class-level convenience — starts threads for a server using the singleton."""
|
||||
registry = cls._get_instance()
|
||||
if registry is not None:
|
||||
registry._start_server_threads(server_id, db)
|
||||
|
||||
@classmethod
|
||||
def stop_server_threads(cls, server_id: int) -> None:
|
||||
"""Class-level convenience — stops threads for a server using the singleton."""
|
||||
registry = cls._get_instance()
|
||||
if registry is not None:
|
||||
registry._stop_server_threads(server_id)
|
||||
|
||||
@classmethod
|
||||
def reattach_server_threads(cls, server_id: int, db) -> None:
|
||||
"""Class-level convenience — re-attaches threads for a recovered server."""
|
||||
registry = cls._get_instance()
|
||||
if registry is not None:
|
||||
registry._reattach_server_threads(server_id, db)
|
||||
|
||||
@classmethod
|
||||
def stop_all(cls) -> None:
|
||||
"""Class-level convenience — stops all threads."""
|
||||
registry = cls._get_instance()
|
||||
if registry is not None:
|
||||
registry._stop_all()
|
||||
|
||||
# ── Instance methods ──
|
||||
|
||||
def _start_server_threads(self, server_id: int, db) -> None:
|
||||
if server_id in self._bundles:
|
||||
logger.warning(
|
||||
"ThreadRegistry: threads already exist for server %d — stopping first",
|
||||
server_id,
|
||||
)
|
||||
self._stop_server_threads(server_id)
|
||||
|
||||
bundle = self._build_bundle(server_id, db)
|
||||
self._bundles[server_id] = bundle
|
||||
self._start_bundle(server_id, bundle)
|
||||
|
||||
def _stop_server_threads(self, server_id: int) -> None:
|
||||
bundle = self._bundles.pop(server_id, None)
|
||||
if bundle is None:
|
||||
return
|
||||
self._stop_bundle(server_id, bundle)
|
||||
|
||||
    def _reattach_server_threads(self, server_id: int, db) -> None:
        # Used after an app restart when the game process survived: rebuilds
        # the monitoring threads around the existing process. Delegates to
        # _start_server_threads, which also tears down any stale bundle first.
        logger.info("ThreadRegistry: reattaching threads for server %d", server_id)
        self._start_server_threads(server_id, db)
|
||||
|
||||
def _stop_all(self) -> None:
|
||||
server_ids = list(self._bundles.keys())
|
||||
for server_id in server_ids:
|
||||
self._stop_server_threads(server_id)
|
||||
logger.info("ThreadRegistry: all threads stopped")
|
||||
|
||||
def get_thread_count(self, server_id: int) -> int:
|
||||
"""Returns the number of running threads for a server."""
|
||||
bundle = self._bundles.get(server_id)
|
||||
if bundle is None:
|
||||
return 0
|
||||
return sum(
|
||||
1
|
||||
for key in ("log_tail", "metrics", "monitor", "rcon_poller")
|
||||
if bundle.get(key) is not None and bundle[key].is_alive()
|
||||
)
|
||||
|
||||
    # ── Bundle construction ──
    def _build_bundle(self, server_id: int, db) -> dict:
        """Reads server + config data from DB and constructs (but does not start) the thread bundle.

        The returned dict always has the four slots ``log_tail``, ``metrics``,
        ``monitor`` and ``rcon_poller``; slots whose preconditions are not met
        stay ``None``.  ``monitor`` and ``metrics`` are always populated; the
        other two depend on adapter capabilities and available config.

        Raises:
            ValueError: if no server row exists for *server_id*.
        """
        server_repo = ServerRepository(db)
        config_repo = ConfigRepository(db)

        server = server_repo.get_by_id(server_id)
        if server is None:
            raise ValueError(f"Server {server_id} not found in database")

        game_type = server["game_type"]
        adapter = self._adapter_registry.get(game_type)

        # Log path: read from config if present, else use adapter default
        log_path = None
        if adapter.has_capability("log_parser"):
            log_parser = adapter.get_log_parser()
            # Try to resolve log path via the adapter's log file resolver
            from core.utils.file_utils import get_server_dir
            server_dir = get_server_dir(server_id)
            if server_dir.exists():
                resolver = log_parser.get_log_file_resolver(server_id)
                resolved = resolver(server_dir)
                if resolved is not None:
                    log_path = str(resolved)

        # All four slots start empty so callers can rely on the keys existing.
        bundle: dict = {
            "log_tail": None,
            "metrics": None,
            "monitor": None,
            "rcon_poller": None,
        }

        # Always: ProcessMonitorThread
        bundle["monitor"] = ProcessMonitorThread(
            server_id=server_id,
            process_manager=self._process_manager,
            broadcast_queue=self._broadcast_queue,
        )

        # Always: MetricsCollectorThread
        bundle["metrics"] = MetricsCollectorThread(
            server_id=server_id,
            process_manager=self._process_manager,
            broadcast_queue=self._broadcast_queue,
        )

        # Conditional: LogTailThread
        # (only when the adapter parses logs AND a concrete log file was resolved)
        if log_path and adapter.has_capability("log_parser"):
            log_parser = adapter.get_log_parser()
            bundle["log_tail"] = LogTailThread(
                server_id=server_id,
                log_path=log_path,
                log_parser=log_parser,
                broadcast_queue=self._broadcast_queue,
            )

        # Conditional: RemoteAdminPollerThread
        if adapter.has_capability("remote_admin"):
            remote_admin = adapter.get_remote_admin()
            if remote_admin is not None:
                # Get RCon password from config
                rcon_password = self._get_remote_admin_password(server_id, config_repo)
                if rcon_password:
                    try:
                        # Fall back to game_port + 1 when no explicit rcon_port is
                        # set.  NOTE(review): a stored rcon_port of 0 also triggers
                        # the fallback because of the `or` — confirm that is intended.
                        rcon_port = server.get("rcon_port") or server.get("game_port", 0) + 1
                        client = remote_admin.create_client(
                            host="127.0.0.1",
                            port=rcon_port,
                            password=rcon_password,
                        )
                        bundle["rcon_poller"] = RemoteAdminPollerThread(
                            server_id=server_id,
                            remote_admin_client=client,
                            broadcast_queue=self._broadcast_queue,
                        )
                    except Exception as exc:
                        # Best-effort: a failed RCon client leaves the slot None
                        # rather than aborting the whole bundle.
                        logger.warning(
                            "ThreadRegistry: could not create RCon client for server %d: %s",
                            server_id, exc,
                        )

        return bundle
def _start_bundle(self, server_id: int, bundle: dict) -> None:
|
||||
started = []
|
||||
for key in ("monitor", "metrics", "log_tail", "rcon_poller"):
|
||||
thread = bundle.get(key)
|
||||
if thread is not None:
|
||||
thread.start()
|
||||
started.append(key)
|
||||
logger.info("ThreadRegistry: started threads for server %d: %s", server_id, started)
|
||||
|
||||
def _stop_bundle(self, server_id: int, bundle: dict) -> None:
|
||||
for key in ("rcon_poller", "log_tail", "metrics", "monitor"):
|
||||
thread = bundle.get(key)
|
||||
if thread is not None and thread.is_alive():
|
||||
thread.stop_and_join(timeout=5.0)
|
||||
logger.info("ThreadRegistry: stopped all threads for server %d", server_id)
|
||||
|
||||
    # ── Helpers ──
def _get_remote_admin_password(
|
||||
self, server_id: int, config_repo: ConfigRepository
|
||||
) -> str | None:
|
||||
"""Read the RCon password from the rcon config section."""
|
||||
# Need to decrypt sensitive fields
|
||||
from adapters.registry import GameAdapterRegistry
|
||||
try:
|
||||
server = ServerRepository(config_repo._db).get_by_id(server_id)
|
||||
if server is None:
|
||||
return None
|
||||
adapter = self._adapter_registry.get(server["game_type"])
|
||||
config_gen = adapter.get_config_generator()
|
||||
sensitive = config_gen.get_sensitive_fields("rcon") if "rcon" in config_gen.get_sections() else []
|
||||
except Exception as exc:
|
||||
logger.debug("Could not determine sensitive fields for RCon config: %s", exc)
|
||||
sensitive = []
|
||||
|
||||
rcon_section = config_repo.get_section(server_id, "rcon", sensitive)
|
||||
if rcon_section is None:
|
||||
return None
|
||||
return rcon_section.get("password") or None
|
||||
Reference in New Issue
Block a user