"""
arma_modlist_tools.parser
~~~~~~~~~~~~~~~~~~~~~~~~~
Parse Arma 3 Launcher mod preset HTML files (.html exported from the launcher)
into plain Python dicts / lists suitable for JSON serialisation.

Typical usage::

    from arma_modlist_tools.parser import parse_modlist_html, parse_modlist_dir

    # single file
    preset = parse_modlist_html("modlist_html/my_preset.html")

    # whole folder
    presets = parse_modlist_dir("modlist_html")
"""
from __future__ import annotations
import re
import xml.etree.ElementTree as ET
from pathlib import Path
# ---------------------------------------------------------------------------
# Public types (plain dicts — keep it dependency-free)
# ---------------------------------------------------------------------------
# ModEntry:
#   name     : str          display name from the launcher
#   source   : "steam" | "local" | "unknown"
#   url      : str | None   full workshop / local path URL
#   steam_id : str | None   numeric workshop item ID extracted from the URL
#
# Preset:
#   preset_name : str            stem of the source filename
#   source_file : str            basename of the source filename
#   mod_count   : int
#   mods        : list[ModEntry]
# ---------------------------------------------------------------------------
# Low-level helpers
# ---------------------------------------------------------------------------
_STEAM_ID_RE = re.compile(r"[?&]id=(\d+)")
def _extract_steam_id(url: str) -> str | None:
"""Return the numeric workshop item ID from a Steam URL, or None."""
m = _STEAM_ID_RE.search(url)
return m.group(1) if m else None
def _source_from_class(css_class: str) -> str:
"""Map a span CSS class to a source label."""
if "from-steam" in css_class:
return "steam"
if "from-local" in css_class:
return "local"
return "unknown"
# ---------------------------------------------------------------------------
# Core parsing
# ---------------------------------------------------------------------------
def parse_mod_entry(tr_element: ET.Element) -> dict | None:
    """
    Convert one ``<tr>`` mod row into a mod dict.

    Each direct child of the row is inspected:

    * a child whose ``data-type`` is ``DisplayName`` supplies the name (its
      leading text, stripped);
    * in any other child, every ``span`` whose class contains ``from-`` sets
      the source, and every ``a`` with ``data-type="Link"`` and a non-empty
      ``href`` sets the URL and the Steam ID derived from it.

    When several candidates exist, the last one encountered wins.

    :param tr_element: The row element to parse.
    :returns: Dict with keys ``name``, ``source``, ``url`` and ``steam_id``,
        or ``None`` if the row has no ``DisplayName`` cell.
    """
    entry: dict = {
        "name": None,
        "source": "unknown",
        "url": None,
        "steam_id": None,
    }

    for cell in tr_element:
        if cell.get("data-type") == "DisplayName":
            entry["name"] = (cell.text or "").strip()
            # The name cell is not scanned for source markers or links.
            continue

        for marker in cell.iter("span"):
            classes = marker.get("class", "")
            if "from-" in classes:
                entry["source"] = _source_from_class(classes)

        for anchor in cell.iter("a"):
            if anchor.get("data-type") != "Link":
                continue
            target = (anchor.get("href") or "").strip()
            if not target:
                continue
            entry["url"] = target
            entry["steam_id"] = _extract_steam_id(target)

    # An empty name ("") is still a name; only a missing cell rejects the row.
    return entry if entry["name"] is not None else None
def parse_modlist_html(filepath: str | Path) -> dict:
    """
    Read a launcher preset file and build the corresponding preset dict.

    Only ``<tr>`` elements whose ``data-type`` is ``ModContainer`` are
    considered; rows that :func:`parse_mod_entry` rejects are dropped.

    :param filepath: Path to the ``.html`` preset file.
    :returns: Dict with keys ``preset_name`` (filename stem), ``source_file``
        (filename), ``mod_count`` and ``mods`` (list of mod entry dicts).
    :raises FileNotFoundError: If *filepath* does not exist.
    :raises ET.ParseError: If the file is not well-formed XML.
    """
    source = Path(filepath)
    document = ET.parse(source).getroot()

    containers = (
        row for row in document.iter("tr")
        if row.get("data-type") == "ModContainer"
    )
    candidates = (parse_mod_entry(row) for row in containers)
    mods = [entry for entry in candidates if entry is not None]

    return {
        "preset_name": source.stem,
        "source_file": source.name,
        "mod_count": len(mods),
        "mods": mods,
    }
def parse_modlist_dir(directory: str | Path) -> list[dict]:
    """
    Parse all ``.html`` preset files in *directory* and return a list of
    preset dicts (one per file, sorted by path).

    Only regular files are parsed; a sub-directory whose name happens to end
    in ``.html`` is skipped rather than handed to the XML parser.

    :param directory: Folder containing ``.html`` preset files.
    :returns: List of preset dicts as returned by :func:`parse_modlist_html`.
        Empty if the folder contains no matching files.
    :raises NotADirectoryError: If *directory* does not exist or is not a dir.
    """
    folder = Path(directory)
    if not folder.is_dir():
        raise NotADirectoryError(f"Not a directory: {folder}")

    # glob() matches on the name alone, so filter out non-files explicitly.
    preset_files = sorted(p for p in folder.glob("*.html") if p.is_file())
    return [parse_modlist_html(p) for p in preset_files]