Skip to content

CLI

The current experiment surface is validate → launch/submit → inspect. Validation is pure YAML/Pydantic checking; launch runs one typed ExperimentConfig in-process; submit wraps that launch in an sbatch script; inspect reads the run manifest and event log. See chassis-invariants.md for the four architectural properties (drift resistance, MLflow as state store, reproduction contract, render purity).

| Stage | Command | Module |
| --- | --- | --- |
| config | `graphids exp config <yaml>` | `config` |
| launch | `graphids exp launch <yaml>` | `launch` |
| submit | `graphids exp submit <yaml>` | `submit` |
| status | `graphids exp status <run>` | `status` |
| manifest | `graphids exp manifest <run>` | `manifest` |
| results | `graphids exp results` | `results` |

There is no separate ops CLI surface. The run config itself carries the stage and the stage-specific payload, so fit / test / extract / analyze all go through graphids.exp.runtime.launch_run and the run manifest records the exact config that launched.

app.py owns the root Typer app + shared option types. __main__.py imports each submodule to register commands.

graphids.cli

cli

app

Typer app + root callback. Login-node safe: no torch/model imports here.

Owns the structlog configuration because this module is the login-node entry point and therefore runs first; compute-side code also imports `configure_logging` from here.

configure_logging cached

configure_logging() -> None

structlog → JSON sync stderr with SLURM env auto-attached. Idempotent.

Source code in graphids/cli/app.py
@functools.cache
def configure_logging() -> None:
    """Configure structlog to render events as JSON lines on stderr.

    The processor chain adds the log level, an ISO-formatted timestamp and
    formatted exception info, runs ``_slurm_context`` (defined elsewhere in
    this module; presumably attaches SLURM environment fields), and finally
    renders the event dict as JSON.

    Idempotent: ``functools.cache`` on a zero-argument function means the
    body runs at most once per process; later calls are no-ops.
    """
    # Function-scope import keeps this block self-contained.
    import sys

    structlog.configure(
        processors=[
            structlog.stdlib.add_log_level,
            structlog.processors.TimeStamper(fmt="iso"),
            structlog.processors.format_exc_info,
            _slurm_context,
            structlog.processors.JSONRenderer(),
        ],
        # structlog's default PrintLoggerFactory writes to stdout, which would
        # interleave log lines with the JSON the CLI commands print there.
        # Route logs to stderr explicitly, as the documented contract states.
        logger_factory=structlog.PrintLoggerFactory(file=sys.stderr),
        cache_logger_on_first_use=True,
    )

exp

New experiment-facing CLI.

This is the first replacement surface for the old row/submit mental model.

config

config(path: Annotated[Path, Argument(help='YAML config file to validate as ExperimentConfig')]) -> None

Load a YAML config through OmegaConf and validate it as ExperimentConfig.

Source code in graphids/cli/exp.py
@exp_app.command("config")
def config(
    path: Annotated[Path, typer.Argument(help="YAML config file to validate as ExperimentConfig")],
) -> None:
    """Validate a YAML file as ``ExperimentConfig`` and echo the parsed result.

    The file is loaded with ``ExperimentConfig.from_yaml``; the resulting
    model is dumped in JSON mode and printed to the console.
    """
    validated = ExperimentConfig.from_yaml(path)
    as_json = validated.model_dump(mode="json")
    console.print_json(data=as_json)

launch

launch(path: Annotated[Path, Argument(help='YAML experiment config')]) -> None

Launch one experiment config through the new primitive surface.

Source code in graphids/cli/exp.py
@exp_app.command("launch")
def launch(
    path: Annotated[Path, typer.Argument(help="YAML experiment config")],
) -> None:
    """Build a run from one experiment YAML and launch it via ``launch_run``.

    If ``launch_run`` returns something other than ``None``, it is printed as
    JSON: dataclass results are expanded with ``asdict``; anything else is
    wrapped as ``{"result": str(result)}``.
    """
    experiment = ExperimentConfig.from_yaml(path)
    outcome = launch_run(
        experiment.build_run(
            name=experiment.experiment_name,
            stage=experiment.stage,
            config=experiment.config,
        )
    )
    # Nothing to report when the launch produced no result object.
    if outcome is None:
        return
    if is_dataclass(outcome):
        payload = asdict(outcome)
    else:
        payload = {"result": str(outcome)}
    console.print_json(data=payload)

manifest

manifest(run_dir: Annotated[Path, Argument(help='Run directory to inspect')]) -> None

Dump the manifest JSON for a run.

Source code in graphids/cli/exp.py
@exp_app.command("manifest")
def manifest(
    run_dir: Annotated[Path, typer.Argument(help="Run directory to inspect")],
) -> None:
    """Print the manifest of the run stored in ``run_dir`` as JSON.

    Raises:
        typer.BadParameter: if ``load_manifest`` returns ``None`` for ``run_dir``.
    """
    loaded = load_manifest(run_dir)
    if loaded is not None:
        console.print_json(data=loaded.model_dump(mode="json"))
        return
    raise typer.BadParameter(f"no manifest found in {run_dir}")

results

results(view: Annotated[str, Option('--view', '-v', help='Result view in configs/result_views.yml')] = 'fusion', dataset: Annotated[list[str] | None, Option('--dataset', '-d', help='Dataset to query; repeat for multiple datasets')] = None, variant: Annotated[list[str] | None, Option('--variant', help='Variant to include; repeat for multiple variants')] = None, all_runs: Annotated[bool, Option('--all', help='Show all matching runs, not latest per variant')] = False, tracking_uri: Annotated[str | None, Option('--tracking-uri', help='Override MLflow tracking URI')] = None, output_format: Annotated[str, Option('--format', help='table or json')] = 'table') -> None

Query configured MLflow result views.

Source code in graphids/cli/exp.py
@exp_app.command("results")
def results(
    view: Annotated[str, typer.Option("--view", "-v", help="Result view in configs/result_views.yml")] = "fusion",
    dataset: Annotated[
        list[str] | None,
        typer.Option("--dataset", "-d", help="Dataset to query; repeat for multiple datasets"),
    ] = None,
    variant: Annotated[
        list[str] | None,
        typer.Option("--variant", help="Variant to include; repeat for multiple variants"),
    ] = None,
    all_runs: Annotated[bool, typer.Option("--all", help="Show all matching runs, not latest per variant")] = False,
    tracking_uri: Annotated[str | None, typer.Option("--tracking-uri", help="Override MLflow tracking URI")] = None,
    output_format: Annotated[str, typer.Option("--format", help="table or json")] = "table",
) -> None:
    """Query configured MLflow result views.

    Rows come from ``query_result_view`` and are ordered by ``sort_rows``.
    With ``--format json`` they are printed via ``result_rows_as_json``;
    with ``--format table`` (the default) they are rendered as a table whose
    columns are dataset/variant/status/run_id followed by the metric names.

    When no ``--dataset`` is given, a built-in default list of datasets is
    queried. ``--all`` disables the "latest per variant" filter.

    Raises:
        typer.BadParameter: if ``--format`` is neither ``table`` nor ``json``.
    """
    # Validate the cheap, purely local option first so a typo in --format
    # fails immediately instead of after a full MLflow query.
    if output_format not in ("table", "json"):
        raise typer.BadParameter("--format must be table or json")

    if tracking_uri:
        mlflow.set_tracking_uri(tracking_uri)
    datasets = dataset or ["hcrl_sa", "set_01", "set_02", "set_03", "set_04"]
    rows = sort_rows(
        query_result_view(
            view=view,
            datasets=datasets,
            variants=variant,
            latest=not all_runs,
        )
    )
    if output_format == "json":
        console.print_json(data=result_rows_as_json(rows))
        return

    base_cols = ["dataset", "variant", "status", "run_id"]
    # Use the ordered union of metric names across all rows, so a metric that
    # is absent from the first row is still shown for the rows that have it.
    metric_cols = list(dict.fromkeys(name for row in rows for name in row.metrics))
    columns = [*base_cols, *metric_cols]

    table = Table(title=f"{view} results", show_lines=False)
    for col in columns:
        table.add_column(col)
    for row in rows:
        payload = row.flat()
        cells = []
        for col in columns:
            value = payload.get(col)
            if value is None:
                # Missing values (e.g. a metric this run never logged).
                cells.append("n/a")
            elif isinstance(value, float):
                cells.append(f"{value:.4f}")
            else:
                cells.append(str(value))
        table.add_row(*cells)
    console.print(table)

status

status(run_dir: Annotated[Path, Argument(help='Run directory to inspect')]) -> None

Print manifest + latest event summary for one run.

Source code in graphids/cli/exp.py
@exp_app.command("status")
def status(
    run_dir: Annotated[Path, typer.Argument(help="Run directory to inspect")],
) -> None:
    """Render a two-column field/value table summarizing one run.

    The summary comes from ``summarize_run(run_dir)``. Empty ``last_event`` /
    ``error`` values and missing ``git_sha`` / ``run_id`` extras are shown
    as an em dash.

    Raises:
        typer.BadParameter: if ``summarize_run`` returns ``None`` for ``run_dir``.
    """
    info = summarize_run(run_dir)
    if info is None:
        raise typer.BadParameter(f"no manifest found in {run_dir}")

    placeholder = "—"
    # (field label, displayed value) pairs, in display order.
    entries = [
        ("name", info.name),
        ("stage", info.stage),
        ("status", info.status),
        ("last_event", info.last_event or placeholder),
        ("error", info.error or placeholder),
        ("run_dir", info.run_dir),
        ("git_sha", info.extra.get("git_sha", placeholder)),
        ("run_id", info.extra.get("run_id", placeholder)),
    ]

    report = Table(title="run status", show_lines=False)
    for heading in ("field", "value"):
        report.add_column(heading)
    for label, shown in entries:
        report.add_row(label, shown)
    console.print(report)

submit

submit(path: Annotated[Path, Argument(help='YAML experiment config')], cluster: Annotated[str | None, Option('--cluster', '-C', help='SLURM cluster override')] = None, partition: Annotated[str | None, Option('--partition', '-p', help='SLURM partition override')] = None, time_limit: Annotated[str | None, Option('--time', '-t', help='SLURM walltime override')] = None, gres: Annotated[str | None, Option('--gres', help='SLURM gres override')] = None, dry_run: Annotated[bool, Option('--dry-run', help='Print the sbatch script without submitting')] = False) -> None

Submit one experiment YAML as a SLURM batch job.

Source code in graphids/cli/exp.py
@exp_app.command("submit")
def submit(
    path: Annotated[Path, typer.Argument(help="YAML experiment config")],
    cluster: Annotated[str | None, typer.Option("--cluster", "-C", help="SLURM cluster override")] = None,
    partition: Annotated[str | None, typer.Option("--partition", "-p", help="SLURM partition override")] = None,
    time_limit: Annotated[str | None, typer.Option("--time", "-t", help="SLURM walltime override")] = None,
    gres: Annotated[str | None, typer.Option("--gres", help="SLURM gres override")] = None,
    dry_run: Annotated[bool, typer.Option("--dry-run", help="Print the sbatch script without submitting")] = False,
) -> None:
    """Hand one experiment YAML to ``submit_experiment`` as a SLURM batch job.

    The optional cluster / partition / walltime / gres overrides are passed
    straight through. With ``--dry-run`` only the script text on the result
    is echoed (no trailing newline added); otherwise a JSON summary with the
    job id, script path, command and stripped stdout is printed.
    """
    experiment = ExperimentConfig.from_yaml(path)
    submission = submit_experiment(
        experiment,
        path,
        cluster=cluster,
        partition=partition,
        time_limit=time_limit,
        gres=gres,
        dry_run=dry_run,
    )
    if not dry_run:
        report = {
            "job_id": submission.job_id,
            "script_path": str(submission.script_path),
            "command": list(submission.command),
            "stdout": submission.stdout.strip(),
        }
        console.print_json(data=report)
        return
    typer.echo(submission.script, nl=False)