""" arma_modlist_tools.parser ~~~~~~~~~~~~~~~~~~~~~~~~~ Parse Arma 3 Launcher mod preset HTML files (.html exported from the launcher) into plain Python dicts / lists suitable for JSON serialisation. Typical usage:: from arma_modlist_tools.parser import parse_modlist_html, parse_modlist_dir # single file preset = parse_modlist_html("modlist_html/my_preset.html") # whole folder presets = parse_modlist_dir("modlist_html") """ from __future__ import annotations import re import xml.etree.ElementTree as ET from pathlib import Path # --------------------------------------------------------------------------- # Public types (plain dicts — keep it dependency-free) # --------------------------------------------------------------------------- # ModEntry: # name : str display name from the launcher # source : "steam" | "local" | "unknown" # url : str | None full workshop / local path URL # steam_id : str | None numeric workshop item ID extracted from the URL # Preset: # preset_name : str stem of the source filename # source_file : str basename of the source filename # mod_count : int # mods : list[ModEntry] # --------------------------------------------------------------------------- # Low-level helpers # --------------------------------------------------------------------------- _STEAM_ID_RE = re.compile(r"[?&]id=(\d+)") def _extract_steam_id(url: str) -> str | None: """Return the numeric workshop item ID from a Steam URL, or None.""" m = _STEAM_ID_RE.search(url) return m.group(1) if m else None def _source_from_class(css_class: str) -> str: """Map a span CSS class to a source label.""" if "from-steam" in css_class: return "steam" if "from-local" in css_class: return "local" return "unknown" # --------------------------------------------------------------------------- # Core parsing # --------------------------------------------------------------------------- def parse_mod_entry(tr_element: ET.Element) -> dict | None: """ Parse a single ```` element into a mod dict. Returns ``None`` if the element does not contain a display name (i.e. it is not a valid mod row). """ name: str | None = None source: str = "unknown" url: str | None = None steam_id: str | None = None for td in tr_element: dtype = td.get("data-type") if dtype == "DisplayName": name = (td.text or "").strip() continue for span in td.iter("span"): css = span.get("class", "") if "from-" in css: source = _source_from_class(css) for a in td.iter("a"): if a.get("data-type") == "Link": href = (a.get("href") or "").strip() if href: url = href steam_id = _extract_steam_id(href) if name is None: return None return {"name": name, "source": source, "url": url, "steam_id": steam_id} def parse_modlist_html(filepath: str | Path) -> dict: """ Parse an Arma 3 Launcher preset HTML file and return a preset dict. :param filepath: Path to the ``.html`` preset file. :returns: Dict with keys ``preset_name``, ``source_file``, ``mod_count``, and ``mods`` (list of mod entry dicts). :raises FileNotFoundError: If *filepath* does not exist. :raises ET.ParseError: If the file is not valid XML/HTML. """ path = Path(filepath) tree = ET.parse(path) root = tree.getroot() mods = [] for tr in root.iter("tr"): if tr.get("data-type") != "ModContainer": continue entry = parse_mod_entry(tr) if entry is not None: mods.append(entry) return { "preset_name": path.stem, "source_file": path.name, "mod_count": len(mods), "mods": mods, } def parse_modlist_dir(directory: str | Path) -> list[dict]: """ Parse all ``.html`` preset files in *directory* and return a list of preset dicts (one per file, sorted by filename). :param directory: Folder containing ``.html`` preset files. :returns: List of preset dicts as returned by :func:`parse_modlist_html`. :raises NotADirectoryError: If *directory* does not exist or is not a dir. """ d = Path(directory) if not d.is_dir(): raise NotADirectoryError(f"Not a directory: {d}") return [parse_modlist_html(f) for f in sorted(d.glob("*.html"))]