Source code for srdatalog.ir.codegen.cuda.build.compiler_ninja

'''Ninja + PCH compile orchestrator.

Emits a `build.ninja` in the cache dir that:
  1. Precompiles `srdatalog.h` into a host/device PCH pair once per build
     when PCH is enabled (`use_pch=True`; off by default, see
     `emit_build_ninja`). srdatalog.h pulls in boost/hana/mp11/RMM/spdlog —
     ~4s per TU to parse cold, so one PCH saves ~(N-1) * 4s on N-shard
     projects like doop.
  2. Compiles every project `.cpp`, adding `-include-pch` flags for the
     PCH pair when it was built.
  3. Links the resulting objects + extra_sources into a shared library.

Invokes ninja via the `ninja` PyPI wheel so we don't need a system
binary — `pip install srdatalog` pulls the ninja wheel as a transitive
dep (~500 KB).

Contract matches `compile_jit_project` in `compiler.py`:
  - Input: `project_result` dict from `cache.write_jit_project`, plus a
    `CompilerConfig`.
  - Output: `BuildResult` with compile + link results.

The ThreadPoolExecutor orchestrator in `compiler.py` remains as a
fallback (env `SRDATALOG_JIT_NO_NINJA=1` or `use_ninja=False`) for
contributors without ninja installed or for debugging a single-TU
compile path.
'''
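
# A minimal usage sketch of the contract above (illustrative only; the exact
# arguments to `cache.write_jit_project` and the loader on the last line are
# placeholders, not part of this module):
#
#   from srdatalog.ir.codegen.cuda.build import cache, compiler_ninja
#
#   project = cache.write_jit_project(...)               # -> JitProjectLayout
#   result = compiler_ninja.compile_jit_project_ninja(project)
#   load_shared_library(result.artifact)                 # hypothetical loader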

from __future__ import annotations

import os
import shutil
import subprocess
import time
from pathlib import Path

from srdatalog.ir.codegen.cuda.build.cache import JitProjectLayout
from srdatalog.ir.codegen.cuda.build.compiler import (
  BuildResult,
  CompilerConfig,
  CompileResult,
  _artifact_name,
  _base_cxx_flags,
)

# ---------------------------------------------------------------------------
# build.ninja emission
# ---------------------------------------------------------------------------


def _find_srdatalog_h(config: CompilerConfig) -> str | None:
  '''Locate `srdatalog.h` on the runtime include paths.'''
  for p in config.include_paths:
    candidate = os.path.join(p, "srdatalog.h")
    if os.path.isfile(candidate):
      return candidate
  return None


def _ninja_escape(s: str) -> str:
  '''Escape characters that ninja treats specially ($, :, space in
  build paths). Most of our paths don't contain these, but be safe.'''
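  # e.g. "a b:c$d" -> "a$ b$:c$$d" (illustrative path; ninja reads the
  # escaped form back as the literal path "a b:c$d").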
  return s.replace("$", "$$").replace(":", "$:").replace(" ", "$ ")


def _join_flags(flags: list[str]) -> str:
  '''Join flags for a ninja `command =` line. Ninja hands the command to
  a shell verbatim, so a plain space-join is enough; the rare flag that
  itself contains a space (usually a path) is double-quoted so the shell
  keeps it as one argument.'''
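  # e.g. ["-O2", "-I/opt/cuda math libs/include"] joins to
  # '-O2 "-I/opt/cuda math libs/include"' (illustrative path; flags
  # without spaces pass through unchanged).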
  out = []
  for f in flags:
    if " " in f:
      out.append(f'"{f}"')
    else:
      out.append(f)
  return " ".join(out)


def emit_build_ninja(
  project_result: JitProjectLayout,
  config: CompilerConfig,
  *,
  use_pch: bool = False,
  use_ccache: bool | None = None,
) -> str:
  '''Write `<cache_dir>/build.ninja` from `project_result` + `config`.

  Returns the absolute path to the emitted ninja file.

  Args:
    use_pch: opt-in split host/device PCH. Disabled by default because
      clang's CUDA + PCH pipeline is fragile on our runtime headers
      (ptxas chokes when `-Xclang -emit-pch` runs with `--cuda-*-only`
      on headers that transitively pull CUDA intrinsics via
      `gpu/search.h`). Keep the code path so future runtime-header
      cleanups can flip it on with `use_pch=True`.
    use_ccache: prepend `ccache` to the compile command when it's on
      PATH. Defaults to True iff `ccache` is found. Warm rebuilds after
      `rm -rf build/` go from ~97s → ~5s on doop with ccache. Override
      via `SRDATALOG_JIT_NO_CCACHE=1`.
  '''
  # Absolute paths so `ninja -C <cache_dir>` can resolve inputs no
  # matter what the caller's cwd is.
  project_dir = os.path.abspath(str(project_result["dir"]))
  main_cpp = os.path.abspath(str(project_result["main"]))
  batches = [os.path.abspath(str(b)) for b in project_result["batches"]]

  output_dir = os.path.abspath(config.output_dir) if config.output_dir else project_dir
  os.makedirs(output_dir, exist_ok=True)
  artifact = _artifact_name(output_dir, config.shared)
  cxx = config.resolved_cxx()

  # Common flags — same set the ThreadPoolExecutor path uses.
  cxx_flags = _base_cxx_flags(config)

  # Split PCH approach for clang CUDA mode: build a PCH pair (one for
  # the host pass, one for the device pass) from the same stub `.cu`.
  # clang's CUDA two-pass compile needs each pass's AST to match the
  # PCH's recorded target; feeding a single host PCH to both passes
  # fails with a target-mismatch diagnostic, and a combined PCH built
  # with plain `-x cuda` trips ptxas on PCH bytes. The split approach
  # is what clang actually supports — host-pass uses host PCH,
  # device-pass uses device PCH.
  pch_header: str | None = None
  pch_host_obj = ""
  pch_device_obj = ""
  pch_stub_path = ""
  pch_include_clause = ""
  if use_pch:
    pch_header = _find_srdatalog_h(config)
    if pch_header is None:
      # srdatalog.h not found on the include paths: silently fall back
      # to a PCH-less build. We can't raise here; callers rely on this
      # function being pure file emission.
      use_pch = False
    else:
      pch_stub_path = os.path.join(output_dir, "_pch_stub.cu")
      with open(pch_stub_path, "w") as f:
        f.write(
          '// Auto-generated stub for split host/device PCH of srdatalog.h\n'
          f'#include "{pch_header}"\n'
        )

  lines: list[str] = []
  lines.append("# Auto-generated by srdatalog.ir.codegen.cuda.build.compiler_ninja")
  lines.append(f"# Generated for project dir: {project_dir}")
  lines.append("")

  # ccache detection — transparent speedup for warm rebuilds. The
  # compiler command becomes `ccache clang++ ...`, which is all ccache
  # needs to cache the .o file content-addressed by the source+flags.
  if use_ccache is None:
    use_ccache = (
      os.environ.get("SRDATALOG_JIT_NO_CCACHE", "") != "1"
      and shutil.which("ccache") is not None
    )
  cc_prefix = "ccache " if use_ccache else ""

  lines.append(f"cxx = {cc_prefix}{cxx}")
  lines.append(f"cxx_flags = {_join_flags(cxx_flags)}")
  # Link flags live on one line too. Ninja passes them verbatim to the shell.
  link_flags_list = list(config.link_flags)
  libs_list = [f"-l{lib}" for lib in config.libs]
  lines.append(f"link_flags = {_join_flags(link_flags_list + libs_list)}")
  lines.append(f"extra_sources = {_join_flags(list(config.extra_sources))}")
  lines.append("")

  if use_pch:
    pch_host_obj = os.path.join(output_dir, "srdatalog.host.pch")
    pch_device_obj = os.path.join(output_dir, "srdatalog.device.pch")
    # `--cuda-host-only` → runs only the host pass, no nvptx/ptxas
    # `--cuda-device-only` + `--cuda-gpu-arch` → runs only the device pass
    # Both use `-x cuda` so clang interprets the `.cu` stub as CUDA. The
    # `-Xclang -emit-pch` forces PCH emission for either pass.
    lines.append("rule pch_host")
    lines.append("  command = $cxx $cxx_flags --cuda-host-only -Xclang -emit-pch -c $in -o $out")
    lines.append("  description = PCH-HOST $out")
    lines.append("")
    lines.append("rule pch_device")
    lines.append("  command = $cxx $cxx_flags --cuda-device-only -Xclang -emit-pch -c $in -o $out")
    lines.append("  description = PCH-DEVICE $out")
    lines.append("")
    pch_include_clause = (
      f" -include-pch {_ninja_escape(pch_host_obj)} -include-pch {_ninja_escape(pch_device_obj)}"
    )

  # Two compile rules:
  #   `cxx_host_only` — `-x cuda --cuda-host-only`. Runs ONE pass (host)
  #                     instead of the default two (host + device), so it is
  #                     ~50% faster per TU because the device pass (which
  #                     also re-parses srdatalog.h) is skipped. Originally
  #                     meant for TUs that don't DEFINE `__global__` kernels
  #                     (main.cpp, runner shards); emitted but currently
  #                     unused — see the build-statement comment below.
  #   `cxx`           — full two-pass CUDA compile for TUs that host actual
  #                     __global__ kernel definitions (jit_batch_*.cpp).
  lines.append("rule cxx_host_only")
  lines.append(f"  command = $cxx $cxx_flags --cuda-host-only{pch_include_clause} -c $in -o $out")
  lines.append("  description = CXX-HOST $out")
  lines.append("")
  lines.append("rule cxx")
  lines.append(f"  command = $cxx $cxx_flags{pch_include_clause} -c $in -o $out")
  lines.append("  description = CXX $out")
  lines.append("")

  lines.append("rule link")
  if config.shared:
    lines.append("  command = $cxx -shared -o $out $in $extra_sources $link_flags")
  else:
    lines.append("  command = $cxx -o $out $in $extra_sources $link_flags")
  lines.append("  description = LINK $out")
  lines.append("")

  # Build statements.
  if use_pch:
    assert pch_header is not None  # use_pch=True implies pch_header was found
    host_line = (
      f"build {_ninja_escape(pch_host_obj)}: pch_host "
      f"{_ninja_escape(pch_stub_path)} | {_ninja_escape(pch_header)}"
    )
    device_line = (
      f"build {_ninja_escape(pch_device_obj)}: pch_device "
      f"{_ninja_escape(pch_stub_path)} | {_ninja_escape(pch_header)}"
    )
    lines.append(host_line)
    lines.append(device_line)
    lines.append("")

  sources = [main_cpp] + batches
  object_paths: list[str] = []
  for src in sources:
    obj = os.path.join(output_dir, Path(src).stem + ".o")
    object_paths.append(obj)
    # Every TU goes through the full two-pass CUDA compile. An earlier
    # version of this code compiled main.cpp with `--cuda-host-only` to
    # halve compile time, on the theory that main only references
    # kernels. That was wrong: main.cpp triggers template instantiations
    # of thrust/cub helpers (e.g. set_difference, unique, scan) that no
    # other TU also instantiates with the same template arguments.
    # Host-only compilation silently dropped their device side, so the
    # final `.so` was missing ~113 kernels relative to the Nim build
    # and `cuKernelGetFunction` returned INVALID_HANDLE at runtime when
    # the fixpoint tried to launch them (observed on doop/batik_interned
    # after step 8). Matching Nim's pipeline — full CUDA on every TU —
    # is the correct behavior.
    rule = "cxx"
    line = f"build {_ninja_escape(obj)}: {rule} {_ninja_escape(src)}"
    if use_pch:
      # Order-only deps (||) — PCH files must exist before the TU
      # compiles, but changes to the PCH don't force recompile of
      # the .o (ninja already tracks -include-pch via input).
      line += f" || {_ninja_escape(pch_host_obj)} {_ninja_escape(pch_device_obj)}"
    lines.append(line)
  lines.append("")

  lines.append(
    f"build {_ninja_escape(artifact)}: link "
    + " ".join(_ninja_escape(o) for o in object_paths)
  )
  lines.append("")
  lines.append(f"default {_ninja_escape(artifact)}")
  lines.append("")

  ninja_path = os.path.join(output_dir, "build.ninja")
  with open(ninja_path, "w") as f:
    f.write("\n".join(lines))
  return ninja_path
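
# For orientation, the emitted build.ninja has roughly this shape (a sketch
# only: one batch shard, use_pch=False, no ccache; real paths are absolute,
# flag values come from CompilerConfig, and the artifact name comes from
# `_artifact_name`):
#
#   cxx = <config.resolved_cxx()>
#   cxx_flags = ...
#   link_flags = ...
#   extra_sources =
#
#   rule cxx_host_only
#     command = $cxx $cxx_flags --cuda-host-only -c $in -o $out
#     description = CXX-HOST $out
#
#   rule cxx
#     command = $cxx $cxx_flags -c $in -o $out
#     description = CXX $out
#
#   rule link
#     command = $cxx -shared -o $out $in $extra_sources $link_flags
#     description = LINK $out
#
#   build <cache_dir>/main.o: cxx <cache_dir>/main.cpp
#   build <cache_dir>/jit_batch_0.o: cxx <cache_dir>/jit_batch_0.cpp
#
#   build <artifact>: link <cache_dir>/main.o <cache_dir>/jit_batch_0.o
#
#   default <artifact>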


# ---------------------------------------------------------------------------
# Invocation
# ---------------------------------------------------------------------------


def _locate_ninja_binary() -> str:
  '''Prefer the `ninja` PyPI wheel's binary (installed alongside
  srdatalog), fall back to any ninja on PATH.'''
  try:
    import ninja as _ninja_pkg  # type: ignore[import-not-found]

    candidate = os.path.join(_ninja_pkg.BIN_DIR, "ninja")
    if os.path.isfile(candidate):
      return candidate
  except ImportError:
    pass
  sys_ninja = shutil.which("ninja")
  if sys_ninja:
    return sys_ninja
  raise RuntimeError(
    "ninja not found. Install via `pip install ninja` or disable "
    "the ninja backend with use_ninja=False / SRDATALOG_JIT_NO_NINJA=1."
  )


def compile_jit_project_ninja(
  project_result: JitProjectLayout,
  config: CompilerConfig | None = None,
  *,
  use_pch: bool = False,
) -> BuildResult:
  '''Drop-in replacement for `compile_jit_project` that builds through
  ninja (and, when `use_pch=True`, the split PCH).

  Returns the same BuildResult shape so callers don't need to change.

  Compile-failure reporting: on non-zero ninja exit, we return a single
  synthetic CompileResult holding ninja's captured output — we don't
  parse per-TU diagnostics out of ninja's stream (ninja already shows
  them verbatim to stderr). Callers can still read `.stderr` + exit
  code.
  '''
  config = config or CompilerConfig()
  ninja_path = emit_build_ninja(project_result, config, use_pch=use_pch)
  ninja_bin = _locate_ninja_binary()
  jobs = config.resolved_jobs()

  cmd = [
    ninja_bin,
    "-C", os.path.dirname(ninja_path),
    "-f", os.path.basename(ninja_path),
    f"-j{jobs}",
  ]
  start = time.perf_counter()
  proc = subprocess.run(cmd, capture_output=True, text=True, check=False)
  elapsed = time.perf_counter() - start

  # Mirror emit_build_ninja's path normalisation so `artifact` names the
  # file ninja actually linked.
  output_dir = os.path.abspath(config.output_dir or str(project_result["dir"]))
  artifact = _artifact_name(output_dir, config.shared)

  # Build a BuildResult that looks like the ThreadPoolExecutor one. We
  # fold every compile into one pseudo-CompileResult (ninja printed
  # diagnostics already); the link result mirrors ninja's exit code.
  synthesized_compile = CompileResult(
    command=cmd,
    output=ninja_path,
    returncode=proc.returncode,
    stdout=proc.stdout,
    stderr=proc.stderr,
    elapsed_sec=elapsed,
  )
  link_result = None
  if proc.returncode == 0:
    link_result = CompileResult(
      command=cmd,
      output=artifact,
      returncode=0,
      stdout="",
      stderr="",
      elapsed_sec=0.0,
    )
  return BuildResult(
    artifact=artifact if proc.returncode == 0 else "",
    compile_results=[synthesized_compile],
    link_result=link_result,
    elapsed_sec=elapsed,
  )
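
# On a failed build, the synthetic CompileResult carries ninja's merged
# output. A caller-side sketch (hypothetical error handling, not part of
# this module's API):
#
#   result = compile_jit_project_ninja(project)
#   if result.link_result is None:
#       raise RuntimeError(
#           f"JIT build failed (exit {result.compile_results[0].returncode}):\n"
#           f"{result.compile_results[0].stderr}"
#       )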