Kernels
TaehyunKim github-actions[bot] wyldecat Claude Opus 4.6 (1M context) committed on
Commit
10848ab
·
unverified ·
1 Parent(s): 67f7e11

draft commit for cpu_offload (#23)

Browse files

* draft commit for cpu_offload

* draft commit for cpu_offload [skip-build]

* claude is british

* Add built binary [skip-build]

* fix load, save and add test

* Add built binary [skip-build]

* Fix yapf formatting for pre-commit CI [skip-build]

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: WyldeCat <skan1543@gmail.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. build/torch210-cxx11-cu126-x86_64-linux/_ops.py +3 -3
  2. build/torch210-cxx11-cu126-x86_64-linux/{_optimizer_7aef62f_dirty.abi3.so → _optimizer_5b58933_dirty.abi3.so} +1 -1
  3. build/torch210-cxx11-cu126-x86_64-linux/adamw.py +154 -37
  4. build/torch210-cxx11-cu126-x86_64-linux/core.py +134 -31
  5. build/torch210-cxx11-cu126-x86_64-linux/cpu_offload.py +188 -0
  6. build/torch210-cxx11-cu126-x86_64-linux/distributed/utils.py +0 -6
  7. build/torch210-cxx11-cu126-x86_64-linux/matmul_transpose_triton.py +11 -10
  8. build/torch210-cxx11-cu126-x86_64-linux/muon.py +573 -99
  9. build/torch210-cxx11-cu126-x86_64-linux/newton_schulz.py +206 -20
  10. build/torch210-cxx11-cu126-x86_64-linux/pipeline.py +158 -80
  11. build/torch210-cxx11-cu126-x86_64-linux/qk_clip.py +15 -9
  12. build/torch210-cxx11-cu128-x86_64-linux/_ops.py +3 -3
  13. build/torch210-cxx11-cu128-x86_64-linux/{_optimizer_7aef62f_dirty.abi3.so → _optimizer_5b58933_dirty.abi3.so} +1 -1
  14. build/torch210-cxx11-cu128-x86_64-linux/adamw.py +154 -37
  15. build/torch210-cxx11-cu128-x86_64-linux/core.py +134 -31
  16. build/torch210-cxx11-cu128-x86_64-linux/cpu_offload.py +188 -0
  17. build/torch210-cxx11-cu128-x86_64-linux/distributed/utils.py +0 -6
  18. build/torch210-cxx11-cu128-x86_64-linux/matmul_transpose_triton.py +11 -10
  19. build/torch210-cxx11-cu128-x86_64-linux/muon.py +573 -99
  20. build/torch210-cxx11-cu128-x86_64-linux/newton_schulz.py +206 -20
  21. build/torch210-cxx11-cu128-x86_64-linux/pipeline.py +158 -80
  22. build/torch210-cxx11-cu128-x86_64-linux/qk_clip.py +15 -9
  23. build/torch210-cxx11-cu130-x86_64-linux/_ops.py +3 -3
  24. build/torch210-cxx11-cu130-x86_64-linux/{_optimizer_7aef62f_dirty.abi3.so → _optimizer_5b58933_dirty.abi3.so} +1 -1
  25. build/torch210-cxx11-cu130-x86_64-linux/adamw.py +154 -37
  26. build/torch210-cxx11-cu130-x86_64-linux/core.py +134 -31
  27. build/torch210-cxx11-cu130-x86_64-linux/cpu_offload.py +188 -0
  28. build/torch210-cxx11-cu130-x86_64-linux/distributed/utils.py +0 -6
  29. build/torch210-cxx11-cu130-x86_64-linux/matmul_transpose_triton.py +11 -10
  30. build/torch210-cxx11-cu130-x86_64-linux/muon.py +573 -99
  31. build/torch210-cxx11-cu130-x86_64-linux/newton_schulz.py +206 -20
  32. build/torch210-cxx11-cu130-x86_64-linux/pipeline.py +158 -80
  33. build/torch210-cxx11-cu130-x86_64-linux/qk_clip.py +15 -9
  34. build/torch210-cxx11-rocm70-x86_64-linux/_ops.py +3 -3
  35. build/torch210-cxx11-rocm70-x86_64-linux/{_optimizer_7aef62f_dirty.abi3.so → _optimizer_5b58933_dirty.abi3.so} +1 -1
  36. build/torch210-cxx11-rocm70-x86_64-linux/adamw.py +154 -37
  37. build/torch210-cxx11-rocm70-x86_64-linux/core.py +134 -31
  38. build/torch210-cxx11-rocm70-x86_64-linux/cpu_offload.py +188 -0
  39. build/torch210-cxx11-rocm70-x86_64-linux/distributed/utils.py +0 -6
  40. build/torch210-cxx11-rocm70-x86_64-linux/matmul_transpose_triton.py +11 -10
  41. build/torch210-cxx11-rocm70-x86_64-linux/muon.py +573 -99
  42. build/torch210-cxx11-rocm70-x86_64-linux/newton_schulz.py +206 -20
  43. build/torch210-cxx11-rocm70-x86_64-linux/pipeline.py +158 -80
  44. build/torch210-cxx11-rocm70-x86_64-linux/qk_clip.py +15 -9
  45. build/torch210-cxx11-rocm71-x86_64-linux/_ops.py +3 -3
  46. build/torch210-cxx11-rocm71-x86_64-linux/{_optimizer_7aef62f_dirty.abi3.so → _optimizer_5b58933_dirty.abi3.so} +1 -1
  47. build/torch210-cxx11-rocm71-x86_64-linux/adamw.py +154 -37
  48. build/torch210-cxx11-rocm71-x86_64-linux/core.py +134 -31
  49. build/torch210-cxx11-rocm71-x86_64-linux/cpu_offload.py +188 -0
  50. build/torch210-cxx11-rocm71-x86_64-linux/distributed/utils.py +0 -6
build/torch210-cxx11-cu126-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _optimizer_7aef62f_dirty
3
- ops = torch.ops._optimizer_7aef62f_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_optimizer_7aef62f_dirty::{op_name}"
 
1
  import torch
2
+ from . import _optimizer_5b58933_dirty
3
+ ops = torch.ops._optimizer_5b58933_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_optimizer_5b58933_dirty::{op_name}"
build/torch210-cxx11-cu126-x86_64-linux/{_optimizer_7aef62f_dirty.abi3.so → _optimizer_5b58933_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f095be87ff6185010a3cff4175abbde0b2e50fe1e435dc1db4eaf5bf1f6199ca
3
  size 1940944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90ace47a61519aefe759810c803789e7f91e6949ca0b04fc177e311709976334
3
  size 1940944
build/torch210-cxx11-cu126-x86_64-linux/adamw.py CHANGED
@@ -1,8 +1,12 @@
 
1
  from collections import defaultdict
2
  from typing import cast
3
 
4
  import torch
5
  from torch.distributed.tensor import DTensor
 
 
 
6
 
7
 
8
  def fused_adamw(
@@ -72,54 +76,72 @@ def fused_adamw(
72
  )
73
 
74
 
75
- def step_adamw_params(optimizer_state, params, group):
76
- """Run fused AdamW on a list of parameters sharing the same placement.
 
77
 
78
- Args:
79
- optimizer_state: The optimizer's state dict (self.state in Muon).
80
- params: List of parameters to update.
81
- group: Parameter group dict with lr, adamw_betas, adamw_eps, weight_decay.
82
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  params_with_grads = []
84
  grads = []
85
  moment1 = []
86
  moment2 = []
87
- max_exp_avg_sqs = []
88
  state_steps = []
89
- lr = group["lr"]
90
- beta1, beta2 = group["adamw_betas"]
91
- eps = group["adamw_eps"]
92
- weight_decay = group["weight_decay"]
93
 
94
  for p in params:
95
  g = p.grad
96
  if g is None:
97
  continue
98
  state = optimizer_state[p]
99
- params_with_grads.append(p)
100
- grads.append(g)
101
  if "step" not in state:
102
- state["step"] = (torch.zeros((),
103
- dtype=torch.float32,
104
- device=p.device))
105
  state["moment1"] = torch.zeros_like(g)
106
  state["moment2"] = torch.zeros_like(g)
107
- moment1.append(state["moment1"])
108
- moment2.append(state["moment2"])
109
  if not isinstance(state["step"], torch.Tensor):
110
- step_tensor = torch.tensor(state["step"],
111
- dtype=torch.float32,
112
- device=p.device)
113
- else:
114
- step_tensor = state["step"]
115
- state_steps.append(step_tensor)
 
 
 
 
 
 
116
 
117
  fused_adamw(
118
  params_with_grads,
119
  grads,
120
  moment1,
121
  moment2,
122
- max_exp_avg_sqs,
123
  state_steps,
124
  amsgrad=False,
125
  beta1=beta1,
@@ -131,24 +153,119 @@ def step_adamw_params(optimizer_state, params, group):
131
  )
132
 
133
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  def step_adamw(optimizer_state, group):
135
  """Dispatch AdamW step, grouping parameters by type and placement.
136
 
 
 
 
137
  Args:
138
  optimizer_state: The optimizer's state dict (self.state in Muon).
139
  group: Parameter group dict.
140
  """
141
  params = group["params"]
 
142
 
143
- # group params with its type and placement
144
- placement_to_params: dict[tuple, list[torch.Tensor]] = defaultdict(list)
145
- for p in params:
146
- match p:
147
- case DTensor():
148
- placement_to_params[tuple([p.placements,
149
- p.device_mesh])].append(p)
150
- case torch.Tensor():
151
- placement_to_params[tuple([torch.Tensor, None])].append(p)
152
-
153
- for group_params in placement_to_params.values():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  step_adamw_params(optimizer_state, group_params, group)
 
1
+ import logging
2
  from collections import defaultdict
3
  from typing import cast
4
 
5
  import torch
6
  from torch.distributed.tensor import DTensor
7
+ from torch.profiler import record_function
8
+
9
+ logger = logging.getLogger(__name__)
10
 
11
 
12
  def fused_adamw(
 
76
  )
77
 
78
 
79
+ def _to_local(t):
80
+ """Unwrap DTensor to local tensor for fused ops."""
81
+ return t._local_tensor if isinstance(t, DTensor) else t
82
 
83
+
84
+ # ---------------------------------------------------------------------------
85
+ # Caches for eliminating per-step Python overhead.
86
+ #
87
+ # Placement grouping and tensor list assembly are identical every step
88
+ # (params don't change placement, moment/step tensors are the same objects
89
+ # after initialisation). We cache them keyed by id() of the param list
90
+ # stored in param_groups (stable across steps).
91
+ #
92
+ # Only gradients change each step and must be collected fresh.
93
+ # ---------------------------------------------------------------------------
94
+
95
+ # id(group["params"]) → dict[placement_key, list[param]]
96
+ _placement_cache: dict[int, dict[tuple, list]] = {}
97
+
98
+ # id(placement_group_list) → (params_local, moment1, moment2, state_steps)
99
+ _tensor_cache: dict[int, tuple[list, list, list, list]] = {}
100
+
101
+
102
+ def _step_adamw_params_slow(optimizer_state, params, group):
103
+ """Uncached fallback for the rare case where some params lack grads."""
104
  params_with_grads = []
105
  grads = []
106
  moment1 = []
107
  moment2 = []
 
108
  state_steps = []
 
 
 
 
109
 
110
  for p in params:
111
  g = p.grad
112
  if g is None:
113
  continue
114
  state = optimizer_state[p]
115
+ params_with_grads.append(_to_local(p))
116
+ grads.append(_to_local(g))
117
  if "step" not in state:
118
+ state["step"] = torch.zeros((),
119
+ dtype=torch.float32,
120
+ device=p.device)
121
  state["moment1"] = torch.zeros_like(g)
122
  state["moment2"] = torch.zeros_like(g)
123
+ moment1.append(_to_local(state["moment1"]))
124
+ moment2.append(_to_local(state["moment2"]))
125
  if not isinstance(state["step"], torch.Tensor):
126
+ state["step"] = torch.tensor(state["step"],
127
+ dtype=torch.float32,
128
+ device=p.device)
129
+ state_steps.append(state["step"])
130
+
131
+ if not params_with_grads:
132
+ return
133
+
134
+ lr = group["lr"]
135
+ beta1, beta2 = group["adamw_betas"]
136
+ eps = group["adamw_eps"]
137
+ weight_decay = group["weight_decay"]
138
 
139
  fused_adamw(
140
  params_with_grads,
141
  grads,
142
  moment1,
143
  moment2,
144
+ [],
145
  state_steps,
146
  amsgrad=False,
147
  beta1=beta1,
 
153
  )
154
 
155
 
156
def step_adamw_params(optimizer_state, params, group):
    """Run fused AdamW on a list of parameters sharing the same placement.

    The first invocation builds and caches the static tensor lists (local
    params, both moments, step counters); later invocations reuse them and
    only re-collect gradients, which are the sole per-step inputs.

    Args:
        optimizer_state: The optimizer's state dict (self.state in Muon).
        params: List of parameters to update.
        group: Parameter group dict with lr, adamw_betas, adamw_eps, weight_decay.
    """
    # Gradients are the only per-step data; gather them first.
    with record_function("adamw::collect_grads"):
        grads = []
        for param in params:
            if param.grad is None:
                # Uncommon: delegate to the uncached path, which skips
                # grad-less params individually.
                _step_adamw_params_slow(optimizer_state, params, group)
                return
            grads.append(_to_local(param.grad))

    cache_key = id(params)
    if cache_key not in _tensor_cache:
        with record_function("adamw::init_tensor_cache"):
            locals_ = []
            m1 = []
            m2 = []
            steps = []

            for param in params:
                state = optimizer_state[param]
                locals_.append(_to_local(param))
                if "step" not in state:
                    # Fresh parameter: initialise step counter and moments.
                    state["step"] = torch.zeros((),
                                                dtype=torch.float32,
                                                device=param.device)
                    state["moment1"] = torch.zeros_like(param.grad)
                    state["moment2"] = torch.zeros_like(param.grad)
                m1.append(_to_local(state["moment1"]))
                m2.append(_to_local(state["moment2"]))
                if not isinstance(state["step"], torch.Tensor):
                    # Legacy state may carry "step" as a plain number.
                    state["step"] = torch.tensor(state["step"],
                                                 dtype=torch.float32,
                                                 device=param.device)
                steps.append(state["step"])

            _tensor_cache[cache_key] = (locals_, m1, m2, steps)

    params_local, moment1, moment2, state_steps = _tensor_cache[cache_key]

    beta1, beta2 = group["adamw_betas"]

    with record_function("adamw::fused_adamw"):
        fused_adamw(
            params_local,
            grads,
            moment1,
            moment2,
            [],
            state_steps,
            amsgrad=False,
            beta1=beta1,
            beta2=beta2,
            lr=group["lr"],
            weight_decay=group["weight_decay"],
            eps=group["adamw_eps"],
            maximize=False,
        )
229
+
230
+
231
def step_adamw(optimizer_state, group):
    """Dispatch an AdamW step, batching parameters by type and placement.

    The type/placement partition is computed once per param list and cached,
    because placements are static across training steps.

    Args:
        optimizer_state: The optimizer's state dict (self.state in Muon).
        group: Parameter group dict.
    """
    params = group["params"]
    cache_key = id(params)

    if cache_key not in _placement_cache:
        with record_function("adamw::group_by_placement"):
            buckets: dict[tuple, list[torch.Tensor]] = defaultdict(list)
            for p in params:
                grad_shape = p.grad.shape if p.grad is not None else None
                # DTensor must be tested first: it subclasses torch.Tensor.
                if isinstance(p, DTensor):
                    logger.debug(
                        "[AdamW] DTensor param: shape=%s, placements=%s, "
                        "mesh=%s, grad=%s", p.shape, p.placements,
                        p.device_mesh.mesh_dim_names, grad_shape)
                    buckets[(p.placements, p.device_mesh)].append(p)
                elif isinstance(p, torch.Tensor):
                    logger.debug("[AdamW] plain param: shape=%s, grad=%s",
                                 p.shape, grad_shape)
                    buckets[(torch.Tensor, None)].append(p)

            logger.debug("[AdamW] %d placement groups, %d total params",
                         len(buckets), len(params))

            _placement_cache[cache_key] = dict(buckets)

    for bucket in _placement_cache[cache_key].values():
        step_adamw_params(optimizer_state, bucket, group)
build/torch210-cxx11-cu126-x86_64-linux/core.py CHANGED
@@ -1,11 +1,25 @@
 
1
  import math
2
  from dataclasses import dataclass
 
3
 
4
  import torch
5
- import torch.distributed as dist
6
  from torch.distributed import ProcessGroup
7
  from torch.distributed.tensor import DTensor
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
  @dataclass
11
  class _muon_state:
@@ -17,26 +31,71 @@ class _muon_state:
17
  qk_clip_state: torch.Tensor | None = None
18
 
19
 
20
- def update_g(optimizer_state, p, g, group, momentum):
21
- """Apply momentum update to gradient.
 
 
 
 
 
 
22
 
23
- Args:
24
- optimizer_state: The optimizer's state dict (self.state in Muon).
25
- p: Parameter tensor.
26
- g: Gradient tensor.
27
- group: Parameter group dict.
28
- momentum: Momentum coefficient.
29
 
30
- Returns:
31
- Momentum-updated gradient tensor.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  """
33
- state = optimizer_state[p]
34
- buf = state.setdefault("momentum_buffer", torch.zeros_like(g))
35
- torch.add(g, buf, alpha=momentum, out=buf)
36
- if group["nesterov"]:
37
- g.add_(buf, alpha=momentum)
38
- return g
39
- return buf
 
 
 
 
 
 
 
 
 
 
 
40
 
41
 
42
  def update_p(p, u, lr, adjusted_lr, weight_decay):
@@ -49,14 +108,13 @@ def update_p(p, u, lr, adjusted_lr, weight_decay):
49
  adjusted_lr: Size-adjusted learning rate.
50
  weight_decay: Weight decay coefficient.
51
  """
52
- if isinstance(p, torch.nn.Parameter):
53
- # apply weight decay
54
- p.data.mul_(1 - lr * weight_decay)
55
- # apply update
56
- p.data.add_(u, alpha=-adjusted_lr)
57
- else:
58
- p.mul_(1 - lr * weight_decay)
59
- p.add_(u, alpha=-adjusted_lr)
60
 
61
 
62
  def adjust_lr_for_muon(lr, param_shape):
@@ -77,14 +135,55 @@ def adjust_lr_for_muon(lr, param_shape):
77
  return adjusted_lr
78
 
79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  def default_is_muon(name, x, expert_keys=None):
81
- skip_keys = ["embed_tokens", "lm_head", "tok_embeddings", "output"]
82
- if any(key in name for key in skip_keys):
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  return False
84
  effective_ndim = x.ndim
85
- if expert_keys and any(key in name for key in expert_keys):
 
86
  effective_ndim -= 1
87
- return effective_ndim >= 2
 
 
 
 
 
88
 
89
 
90
  def get_default_muon_param_groups(model, is_muon_func=None, expert_keys=None):
@@ -92,7 +191,7 @@ def get_default_muon_param_groups(model, is_muon_func=None, expert_keys=None):
92
  is_muon_func = lambda n, x: default_is_muon(n, x, expert_keys)
93
 
94
  muon_params, muon_names = [], []
95
- non_muon_params = []
96
 
97
  for n, p in model.named_parameters():
98
  if not p.requires_grad:
@@ -102,6 +201,10 @@ def get_default_muon_param_groups(model, is_muon_func=None, expert_keys=None):
102
  muon_names.append(n)
103
  else:
104
  non_muon_params.append(p)
 
 
 
 
105
 
106
  return [
107
  {
 
1
+ import logging
2
  import math
3
  from dataclasses import dataclass
4
+ from typing import List
5
 
6
  import torch
 
7
  from torch.distributed import ProcessGroup
8
  from torch.distributed.tensor import DTensor
9
 
10
+ # torch.compile wraps modules as OptimizedModule, inserting "_orig_mod" into
11
+ # parameter FQNs. Activation checkpointing similarly inserts
12
+ # "_checkpoint_wrapped_module". Strip these so name-based matching (skip_keys,
13
+ # expert_keys, QK layer parsing) works regardless of wrapper nesting.
14
+ _WRAPPER_PARTS = frozenset({"_orig_mod", "_checkpoint_wrapped_module"})
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
def normalize_fqn(name: str) -> str:
    """Drop torch.compile / activation-checkpoint wrapper segments from an FQN.

    Keeps every dotted component of *name* that is not a known wrapper
    component (see ``_WRAPPER_PARTS``), so name-based matching behaves the
    same whether or not the module is wrapped.
    """
    kept = [part for part in name.split(".") if part not in _WRAPPER_PARTS]
    return ".".join(kept)
22
+
23
 
24
  @dataclass
25
  class _muon_state:
 
31
  qk_clip_state: torch.Tensor | None = None
32
 
33
 
34
+ def _batch_momentum(
35
+ grads: List[torch.Tensor],
36
+ momentum_bufs: List[torch.Tensor],
37
+ momentum: torch.Tensor,
38
+ ) -> None:
39
+ """Batched momentum update (no nesterov)."""
40
+ torch._foreach_mul_(momentum_bufs, momentum)
41
+ torch._foreach_add_(momentum_bufs, grads)
42
 
 
 
 
 
 
 
43
 
44
+ def _batch_momentum_nesterov(
45
+ grads: List[torch.Tensor],
46
+ momentum_bufs: List[torch.Tensor],
47
+ momentum: torch.Tensor,
48
+ ) -> None:
49
+ """Batched momentum update with nesterov correction."""
50
+ torch._foreach_mul_(momentum_bufs, momentum)
51
+ torch._foreach_add_(momentum_bufs, grads)
52
+ nesterov_terms = torch._foreach_mul(momentum_bufs, momentum)
53
+ torch._foreach_add_(grads, nesterov_terms)
54
+
55
+
56
+ _compiled_momentum: dict[bool, callable] = {}
57
+ _use_momentum_compile = True
58
+
59
+
60
def set_momentum_compile(enabled: bool):
    """Enable or disable torch.compile for the batched momentum kernels.

    Args:
        enabled: When True, ``batch_pre_ortho`` routes through the compiled
            variants; when False, the eager implementations are used.
    """
    global _use_momentum_compile
    _use_momentum_compile = enabled
64
+
65
+
66
def batch_pre_ortho(
    grads: List[torch.Tensor],
    momentum_bufs: List[torch.Tensor],
    momentum: torch.Tensor,
    nesterov: bool,
) -> None:
    """Run the batched momentum update that precedes orthogonalization.

    Mirrors dion's ``muon_update_pre_orthogonalize``.
    Inputs must be plain CUDA tensors (not DTensor).
    Modifies ``momentum_bufs`` and (for nesterov) ``grads`` in-place.

    When compile is enabled, the nesterov and plain variants are compiled
    and cached separately so the boolean branch never causes a graph break.
    """
    if nesterov:
        impl = _batch_momentum_nesterov
    else:
        impl = _batch_momentum
    if _use_momentum_compile:
        compiled = _compiled_momentum.get(nesterov)
        if compiled is None:
            compiled = torch.compile(impl)
            _compiled_momentum[nesterov] = compiled
        impl = compiled
    impl(grads, momentum_bufs, momentum)
87
+
88
+
89
+ def _update_p_impl(p_data, u_data, lr, adjusted_lr, weight_decay):
90
+ """Weight-decay + update on plain tensors.
91
+
92
+ Not compiled: per-param @torch.compile caused ~0.25ms TorchDynamo cache
93
+ lookup per call × 256+ params = massive overhead. The pipeline path uses
94
+ batched _foreach_* ops instead; this function remains for base() and
95
+ distributed_muon().
96
+ """
97
+ p_data.mul_(1 - lr * weight_decay)
98
+ p_data.add_(u_data, alpha=-adjusted_lr)
99
 
100
 
101
  def update_p(p, u, lr, adjusted_lr, weight_decay):
 
108
  adjusted_lr: Size-adjusted learning rate.
109
  weight_decay: Weight decay coefficient.
110
  """
111
+ # Unwrap Parameter -> underlying data tensor.
112
+ p_data = p.data if isinstance(p, torch.nn.Parameter) else p
113
+ # Unwrap DTensor -> local CUDA tensor for compiled kernel.
114
+ if isinstance(p_data, DTensor):
115
+ p_data = p_data._local_tensor
116
+ u_data = u._local_tensor if isinstance(u, DTensor) else u
117
+ _update_p_impl(p_data, u_data, lr, adjusted_lr, weight_decay)
 
118
 
119
 
120
  def adjust_lr_for_muon(lr, param_shape):
 
135
  return adjusted_lr
136
 
137
 
138
+ def _match_key(parts, key):
139
+ """Check if key matches as contiguous components in parts.
140
+
141
+ Single-component keys (e.g. "experts") match any single component.
142
+ Multi-component keys (e.g. "experts.w1") match as a contiguous subsequence.
143
+ """
144
+ key_parts = key.split(".")
145
+ key_len = len(key_parts)
146
+ if key_len == 1:
147
+ return key in parts
148
+ return any(parts[i:i + key_len] == key_parts
149
+ for i in range(len(parts) - key_len + 1))
150
+
151
+
152
def is_expert_param(name, expert_keys):
    """Return True if *name* matches any key in *expert_keys*.

    Matching is component-wise on the wrapper-stripped FQN, so keys match
    whole dotted path segments rather than arbitrary substrings.
    """
    if not expert_keys:
        return False
    components = normalize_fqn(name).split(".")
    for key in expert_keys:
        if _match_key(components, key):
            return True
    return False
158
+
159
+
160
def default_is_muon(name, x, expert_keys=None):
    """Decide whether parameter *name* / tensor *x* is optimized by Muon.

    Embedding, head, and other special modules are always routed to AdamW.
    Expert parameters have one leading (expert-stack) dimension discounted
    before the >=2-D check that gates Muon eligibility.
    """
    normalized = normalize_fqn(name)
    parts = normalized.split(".")
    skip_keys = [
        "embed_tokens",
        "lm_head",
        "tok_embeddings",
        "output",
        "mhc_attn",
        "mhc_ffn",
        "lambda_proj",
    ]
    # Component-level match: a skip key must equal a full dotted segment.
    for key in skip_keys:
        if key in parts:
            logger.info(
                "[is_muon] %s (orig: %s): skip (matched skip_key), ndim=%d",
                normalized, name, x.ndim)
            return False
    is_expert = is_expert_param(name, expert_keys)
    # Experts stack weights along a leading dim; ignore it for the 2-D test.
    effective_ndim = x.ndim - 1 if is_expert else x.ndim
    result = effective_ndim >= 2
    logger.info(
        "[is_muon] %s (orig: %s): ndim=%d, expert=%s, effective_ndim=%d → %s",
        normalized, name, x.ndim, is_expert, effective_ndim,
        "Muon" if result else "AdamW")
    return result
187
 
188
 
189
  def get_default_muon_param_groups(model, is_muon_func=None, expert_keys=None):
 
191
  is_muon_func = lambda n, x: default_is_muon(n, x, expert_keys)
192
 
193
  muon_params, muon_names = [], []
194
+ non_muon_params, non_muon_names = [], []
195
 
196
  for n, p in model.named_parameters():
197
  if not p.requires_grad:
 
201
  muon_names.append(n)
202
  else:
203
  non_muon_params.append(p)
204
+ non_muon_names.append(n)
205
+
206
+ logger.info("[param_groups] expert_keys=%s, Muon=%d, AdamW=%d",
207
+ expert_keys, len(muon_names), len(non_muon_names))
208
 
209
  return [
210
  {
build/torch210-cxx11-cu126-x86_64-linux/cpu_offload.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """CPU offloading for optimizer states.
2
+
3
+ Manages a pinned CPU memory pool and async CUDA streams to offload
4
+ optimizer state tensors (momentum buffers, Adam moments) to CPU between
5
+ optimizer steps, freeing GPU memory.
6
+
7
+ All tracked tensors are packed into a single flat pinned CPU buffer
8
+ (per dtype). D2H and H2D copies are performed per-tensor directly
9
+ between individual GPU tensors and their slice of the CPU flat buffer
10
+ — no GPU staging buffer is allocated, so there is **no temporary GPU
11
+ memory spike** during offload or reload.
12
+
13
+ Individual tensor storages are freed after offload via
14
+ ``untyped_storage().resize_(0)``, preserving tensor identity so
15
+ downstream caches remain valid.
16
+ """
17
+
18
+ import logging
19
+ from collections import defaultdict
20
+
21
+ import torch
22
+ from torch.distributed.tensor import DTensor
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+
27
+ class CPUOffloadPool:
28
+ """Pinned CPU memory pool for async optimizer state offloading.
29
+
30
+ Tracked tensors are grouped by dtype. Each group gets a single flat
31
+ pinned CPU buffer. D2H / H2D copies are per-tensor (into slices of
32
+ the flat buffer) to avoid allocating a GPU staging buffer.
33
+ """
34
+
35
+ def __init__(self):
36
+ self._managed: list[torch.Tensor] = []
37
+ self._storage_nbytes: dict[int, int] = {} # id(t) → bytes
38
+
39
+ # Per-dtype group: populated on first offload.
40
+ # dtype → dict with keys:
41
+ # "indices" : list[int] managed-list indices
42
+ # "offsets" : list[tuple[int,int]] (start, numel) in flat buf
43
+ # "total" : int total numel
44
+ # "cpu_flat" : Tensor pinned CPU buffer
45
+ self._groups: dict[torch.dtype, dict] = {}
46
+
47
+ self._offload_stream: torch.cuda.Stream | None = None
48
+ self._device: torch.device | None = None
49
+ self._initialized: bool = False
50
+ self._logged: bool = False
51
+
52
+ # ------------------------------------------------------------------
53
+ @staticmethod
54
+ def _local(t: torch.Tensor) -> torch.Tensor:
55
+ """Unwrap DTensor to its local CUDA tensor."""
56
+ return t._local_tensor if isinstance(t, DTensor) else t
57
+
58
+ def _ensure_stream(self):
59
+ if self._offload_stream is None:
60
+ self._offload_stream = torch.cuda.Stream(device=self._device)
61
+
62
+ # ------------------------------------------------------------------
63
+ def track(self, tensor: torch.Tensor):
64
+ """Register a GPU tensor for CPU offloading. Idempotent."""
65
+ tid = id(tensor)
66
+ if tid in self._storage_nbytes:
67
+ return
68
+ local = self._local(tensor)
69
+ if self._device is None:
70
+ self._device = local.device
71
+ self._storage_nbytes[tid] = local.untyped_storage().size()
72
+ self._managed.append(tensor)
73
+
74
+ # ------------------------------------------------------------------
75
+ def _init_buffers(self):
76
+ """Build per-dtype flat buffers on first offload."""
77
+ # Group managed tensors by dtype.
78
+ dtype_map: dict[torch.dtype, list[tuple[int, int]]] = defaultdict(list)
79
+ for idx, t in enumerate(self._managed):
80
+ local = self._local(t)
81
+ dtype_map[local.dtype].append((idx, local.numel()))
82
+
83
+ total_cpu_bytes = 0
84
+ for dtype, entries in dtype_map.items():
85
+ offsets: list[tuple[int, int]] = []
86
+ indices: list[int] = []
87
+ off = 0
88
+ for idx, n in entries:
89
+ indices.append(idx)
90
+ offsets.append((off, n))
91
+ off += n
92
+ cpu_flat = torch.empty(off, dtype=dtype, device="cpu", pin_memory=True)
93
+ self._groups[dtype] = {
94
+ "indices": indices,
95
+ "offsets": offsets,
96
+ "total": off,
97
+ "cpu_flat": cpu_flat,
98
+ }
99
+ total_cpu_bytes += off * cpu_flat.element_size()
100
+
101
+ self._initialized = True
102
+ logger.info(
103
+ "[CPUOffload] Pool initialized: %d tensors, %d dtype group(s), "
104
+ "%.2f MB pinned CPU memory",
105
+ len(self._managed),
106
+ len(self._groups),
107
+ total_cpu_bytes / (1024**2),
108
+ )
109
+
110
+ # ------------------------------------------------------------------
111
+ def offload(self):
112
+ """Per-tensor async D2H into CPU flat buffer, then free GPU storage."""
113
+ if not self._managed:
114
+ return
115
+ if not self._initialized:
116
+ self._init_buffers()
117
+ self._ensure_stream()
118
+
119
+ # Offload stream waits for compute to finish.
120
+ compute_event = torch.cuda.current_stream(
121
+ self._device).record_event()
122
+ self._offload_stream.wait_event(compute_event)
123
+
124
+ offloaded_bytes = 0
125
+
126
+ # Per-tensor D2H copies directly into CPU flat buffer slices.
127
+ # No GPU staging buffer → no temporary GPU memory spike.
128
+ with torch.cuda.stream(self._offload_stream):
129
+ for dtype, grp in self._groups.items():
130
+ indices = grp["indices"]
131
+ offsets = grp["offsets"]
132
+ cpu_flat = grp["cpu_flat"]
133
+
134
+ for i, mgd_idx in enumerate(indices):
135
+ local = self._local(self._managed[mgd_idx])
136
+ off, n = offsets[i]
137
+ cpu_flat[off:off + n].copy_(
138
+ local.reshape(-1), non_blocking=True)
139
+
140
+ offloaded_bytes += grp["total"] * cpu_flat.element_size()
141
+
142
+ # Wait for all D2H copies to land, then free GPU storage.
143
+ self._offload_stream.synchronize()
144
+ for t in self._managed:
145
+ self._local(t).untyped_storage().resize_(0)
146
+
147
+ if not self._logged:
148
+ logger.info("[CPUOffload] Offloaded %.2f MB (GPU → CPU)",
149
+ offloaded_bytes / (1024**2))
150
+
151
+ # ------------------------------------------------------------------
152
+ def reload(self):
153
+ """Per-tensor H2D from CPU flat buffer on the default stream.
154
+
155
+ Runs on the current (default) CUDA stream to avoid stream
156
+ interaction issues with the parallel Muon pipeline. Since
157
+ pinned CPU memory is the source, the copies overlap with
158
+ GPU idle time between steps.
159
+ """
160
+ if not self._managed or not self._initialized:
161
+ return
162
+
163
+ reloaded_bytes = 0
164
+
165
+ # Re-allocate all GPU storages first.
166
+ for t in self._managed:
167
+ local = self._local(t)
168
+ local.untyped_storage().resize_(self._storage_nbytes[id(t)])
169
+
170
+ # Per-tensor H2D copies from CPU flat buffer slices.
171
+ # non_blocking=True with pinned source allows DMA overlap.
172
+ for dtype, grp in self._groups.items():
173
+ indices = grp["indices"]
174
+ offsets = grp["offsets"]
175
+ cpu_flat = grp["cpu_flat"]
176
+
177
+ for i, mgd_idx in enumerate(indices):
178
+ local = self._local(self._managed[mgd_idx])
179
+ off, n = offsets[i]
180
+ local.reshape(-1).copy_(
181
+ cpu_flat[off:off + n], non_blocking=True)
182
+
183
+ reloaded_bytes += grp["total"] * cpu_flat.element_size()
184
+
185
+ if not self._logged:
186
+ logger.info("[CPUOffload] Reloaded %.2f MB (CPU → GPU)",
187
+ reloaded_bytes / (1024**2))
188
+ self._logged = True
build/torch210-cxx11-cu126-x86_64-linux/distributed/utils.py CHANGED
@@ -72,12 +72,6 @@ def get_slices_of_dtensor(
72
  else:
73
  curr_size = target.size()[shard_dim]
74
 
75
- if curr_size % num_chunks != 0:
76
- raise NotImplementedError(
77
- f"Dimension size {curr_size} is not divisible "
78
- f"by number of ranks {num_chunks} for shard "
79
- f"placement on dim {shard_dim}. (shape: {target.shape})")
80
-
81
  # Compute indices for this level of sharding
82
  if isinstance(placement, _StridedShard):
83
  _shard_size, offsets = _StridedShard.local_shard_size_and_offset(
 
72
  else:
73
  curr_size = target.size()[shard_dim]
74
 
 
 
 
 
 
 
75
  # Compute indices for this level of sharding
76
  if isinstance(placement, _StridedShard):
77
  _shard_size, offsets = _StridedShard.local_shard_size_and_offset(
build/torch210-cxx11-cu126-x86_64-linux/matmul_transpose_triton.py CHANGED
@@ -43,6 +43,7 @@ def get_autotune_config():
43
  @triton.autotune(
44
  configs=get_autotune_config(),
45
  key=['M', 'K'],
 
46
  )
47
  @triton.jit
48
  def mmt_kernel(x, y, M, K, stride_xm, stride_xk, stride_ym, stride_yn,
@@ -102,16 +103,10 @@ def mmt_kernel(x, y, M, K, stride_xm, stride_xk, stride_ym, stride_yn,
102
  tl.store(ct_ptrs, tl.permute(c, (1, 0)), mask=ct_mask)
103
 
104
 
105
- def matmul_transpose_assign(d_in, d_out):
106
- assert d_in.is_cuda, "Input `d_in` must be a CUDA tensor"
107
- assert d_out.is_cuda, "Input `d_out` must be a CUDA tensor"
108
- assert d_in.device == d_out.device, "Inputs `d_in` and `d_out` must be on the same CUDA device"
109
- assert d_in.dtype == d_out.dtype, "Inputs must have the same data type"
110
- assert d_in.ndim == 2, "Input `d_in` must be a 2D tensor"
111
- assert d_out.ndim == 2, "Input `d_out` must be a 2D tensor"
112
- assert d_in.size(0) == d_out.size(0) == d_out.size(0), \
113
- "First dimension of `d_in` must match first and second dimension of `d_out`"
114
-
115
  d_in = d_in.contiguous()
116
  M, K = d_in.shape
117
  grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(
@@ -119,3 +114,9 @@ def matmul_transpose_assign(d_in, d_out):
119
  with torch.cuda.device(d_in.device.index):
120
  mmt_kernel[grid](d_in, d_out, M, K, d_in.stride(0), d_in.stride(1),
121
  d_out.stride(0), d_out.stride(1))
 
 
 
 
 
 
 
43
  @triton.autotune(
44
  configs=get_autotune_config(),
45
  key=['M', 'K'],
46
+ restore_value=['y'],
47
  )
48
  @triton.jit
49
  def mmt_kernel(x, y, M, K, stride_xm, stride_xk, stride_ym, stride_yn,
 
103
  tl.store(ct_ptrs, tl.permute(c, (1, 0)), mask=ct_mask)
104
 
105
 
106
+ @torch.library.custom_op("muon::matmul_transpose_assign",
107
+ mutates_args=("d_out", ))
108
+ def matmul_transpose_assign(d_in: torch.Tensor, d_out: torch.Tensor) -> None:
109
+ """Compute d_out = d_in @ d_in.T using an optimized Triton kernel."""
 
 
 
 
 
 
110
  d_in = d_in.contiguous()
111
  M, K = d_in.shape
112
  grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(
 
114
  with torch.cuda.device(d_in.device.index):
115
  mmt_kernel[grid](d_in, d_out, M, K, d_in.stride(0), d_in.stride(1),
116
  d_out.stride(0), d_out.stride(1))
117
+
118
+
119
@matmul_transpose_assign.register_fake
def _(d_in: torch.Tensor, d_out: torch.Tensor) -> None:
    """Meta ("fake") implementation used for tracing / torch.compile.

    Nothing has to be computed here: ``d_out`` is pre-allocated by the
    caller and its in-place mutation is already declared through
    ``mutates_args`` on the real custom op.
    """
    return None
build/torch210-cxx11-cu126-x86_64-linux/muon.py CHANGED
@@ -10,13 +10,16 @@ from torch.profiler import record_function
10
 
11
  from .adamw import step_adamw
12
  from .async_utils import run_pipeline
13
- from .core import (_muon_state, adjust_lr_for_muon,
14
- get_default_muon_param_groups, update_g, update_p)
 
15
  from .distributed.utils import (_is_shard, construct_shard_mesh,
16
  get_slices_of_dtensor)
17
  from .newton_schulz import (COMM_DTYPE, DEFAULT_CHUNK_SIZE_RATIO,
18
- _zeropower_via_newtonschulz5)
19
- from .pipeline import muon_chunk_pipeline
 
 
20
  from .qk_clip import compute_scales, get_qk_clip_info, qk_clip
21
 
22
  logger = logging.getLogger(__name__)
@@ -45,9 +48,21 @@ def _expand_expert_params(names, params, expert_keys):
45
  expanded_params = []
46
 
47
  for n, p in zip(names, params):
48
- is_expert = expert_keys and any(key in n for key in expert_keys)
49
  is_dtensor = isinstance(p.data, DTensor)
50
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  if not is_expert:
52
  assert p.data.ndim <= 2, (
53
  f"Param {n} has ndim={p.data.ndim} but does not match "
@@ -168,7 +183,6 @@ class Muon(torch.optim.Optimizer):
168
  Use shard ranks * DEFAULT_CHUNK_SIZE_RATIO when -1 is specified.
169
  use_distributed_muon: Use distributed muon by Liu et al. (2024).
170
  For testing purpose only.
171
- small_param_numel_threshold: Threshold for classifying parameters as small and falling back to distributed Muon
172
  expert_keys: List of strings to identify expert-parallel parameters.
173
  If any key appears in a parameter's name, its outermost
174
  dimension is treated as the expert dimension and expanded
@@ -193,8 +207,8 @@ class Muon(torch.optim.Optimizer):
193
  warmup_step=5,
194
  chunk_size=-1,
195
  use_distributed_muon=False,
196
- small_param_numel_threshold=65536,
197
- expert_keys=None):
198
  defaults = dict(
199
  lr=lr,
200
  weight_decay=weight_decay,
@@ -228,8 +242,12 @@ class Muon(torch.optim.Optimizer):
228
  self.warmup_step = warmup_step
229
  self.chunk_size = chunk_size
230
  self.use_distributed_muon = use_distributed_muon
231
- self.small_param_numel_threshold = small_param_numel_threshold
232
  self.expert_keys = expert_keys
 
 
 
 
 
233
 
234
  def _calc_flops(self, G, steps):
235
  assert len(G.shape) == 2
@@ -333,8 +351,8 @@ class Muon(torch.optim.Optimizer):
333
  if g is None:
334
  continue
335
 
336
- u = _zeropower_via_newtonschulz5(g.to(COMM_DTYPE),
337
- steps=group["ns_steps"])
338
 
339
  adjusted_lr = adjust_lr_for_muon(lr, p.shape)
340
  update_p(p, u, lr, adjusted_lr, weight_decay)
@@ -355,52 +373,269 @@ class Muon(torch.optim.Optimizer):
355
  weight_decay: float,
356
  qk_logits: list[torch.Tensor | DTensor] | None,
357
  ):
358
- """ Implementation of Distributed Muon by Liu et al. """
359
 
360
- # Momentum is already applied by _step_muon before this method.
361
- for n, p in zip(names, params):
362
- g = p.grad
363
- if g is None:
364
- continue
365
-
366
- # Gather G
367
- if isinstance(p.data, DTensor):
368
- g_full = g.full_tensor()
369
- p_full = p.data.full_tensor()
370
- else:
371
- g_full = g
372
- p_full = p
373
-
374
- u_full = _zeropower_via_newtonschulz5(g_full.to(COMM_DTYPE),
375
- steps=group["ns_steps"])
376
-
377
- adjusted_lr = adjust_lr_for_muon(lr, p_full.shape)
378
- update_p(p_full, u_full, lr, adjusted_lr, weight_decay)
379
 
380
- qk_clip_state = get_qk_clip_info(self.clip_config, n, qk_logits)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
381
 
382
- scales_full = compute_scales(
383
- p_full, qk_clip_state) if qk_clip_state is not None else None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
384
 
385
- if scales_full is not None:
386
- qk_clip(p_full, scales_full, qk_clip_state.head_dim)
 
 
387
 
388
- if isinstance(p.data, DTensor):
389
- ndims = len(p.device_mesh.mesh.shape)
390
- p_replicate = DTensor.from_local(
391
- p_full,
392
- device_mesh=p.device_mesh,
393
- placements=[Replicate() for _ in range(ndims)],
394
- )
395
 
396
- p_sharded = p_replicate.redistribute(
397
- device_mesh=p.device_mesh,
398
- placements=p.placements,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
399
  )
400
 
401
- p.copy_(p_sharded)
402
 
403
- def parallel(self, names, params, group, lr, weight_decay, qk_logits):
 
 
 
 
 
 
 
404
  """
405
  Perform a parallel optimization step using Muon.
406
 
@@ -409,31 +644,23 @@ class Muon(torch.optim.Optimizer):
409
  interleaves multiple chunks so that communication and computation
410
  overlap across chunks (the same overlap previously achieved by the
411
  warmup + main-loop index scheduling).
 
 
 
 
412
  """
413
 
414
  # Momentum is already applied by _step_muon before this method.
415
 
416
- param_to_state, ordered_params = self.init_state_and_assign_params(
417
- names, params, group, qk_logits)
418
-
419
- # Compute local rank for this group's shard process group.
420
- shard_pg = param_to_state[id(ordered_params[0])].process_group
421
- rank = dist.get_rank(group=shard_pg)
422
-
423
- if self.chunk_size == -1:
424
- shard_ranks = dist.get_world_size(param_to_state[id(
425
- ordered_params[0])].process_group)
426
- chunk_size = shard_ranks * DEFAULT_CHUNK_SIZE_RATIO
427
- elif self.chunk_size > 0:
428
- chunk_size = self.chunk_size
429
- else:
430
- raise ValueError("chunk_size must be -1 or a positive integer.")
431
 
432
  def pipelines():
 
433
  for start in range(0, len(ordered_params), chunk_size):
434
  chunk = ordered_params[start:start + chunk_size]
435
  if chunk:
436
- yield muon_chunk_pipeline(
437
  params=chunk,
438
  param_to_state=param_to_state,
439
  rank=rank,
@@ -442,9 +669,11 @@ class Muon(torch.optim.Optimizer):
442
  weight_decay=weight_decay,
443
  none_grad=group["none_grad"],
444
  )
 
 
 
 
445
 
446
- with record_function("muon::barrier"):
447
- dist.barrier()
448
  with record_function("muon::pipeline"):
449
  run_pipeline(pipelines(), max_concurrent=self.warmup_step + 1)
450
 
@@ -456,16 +685,152 @@ class Muon(torch.optim.Optimizer):
456
  names = group["names"]
457
 
458
  # Apply momentum to all params before routing/expansion.
 
459
  with record_function("muon::momentum"):
460
- for n, p in zip(names, params):
461
- g = p.grad
462
- if g is None:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
463
  continue
464
- g = update_g(self.state, p, g, group, momentum)
465
- p.grad = g
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
466
 
467
  # Expand expert params by splitting on dim 0.
468
- names, params = _expand_expert_params(names, params, self.expert_keys)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
469
 
470
  param_dtensors = []
471
  name_dtensors = []
@@ -473,10 +838,10 @@ class Muon(torch.optim.Optimizer):
473
  param_tensors = []
474
  name_tensors = []
475
 
476
- param_dtensors_small = []
477
- name_dtensors_small = []
478
-
479
  if self.use_distributed_muon:
 
480
  self.distributed_muon(names=names,
481
  params=params,
482
  group=group,
@@ -485,8 +850,6 @@ class Muon(torch.optim.Optimizer):
485
  qk_logits=qk_logits)
486
  return
487
 
488
- # For simplicity, we use distributed Muon for small parameters
489
- # whose number of elements is below a threshold.
490
  for n, p in zip(names, params):
491
  if p is None or p.grad is None:
492
  continue
@@ -494,23 +857,28 @@ class Muon(torch.optim.Optimizer):
494
  if all(
495
  isinstance(placement, Replicate)
496
  for placement in p.placements):
 
 
 
497
  param_tensors.append(p)
498
  name_tensors.append(n)
499
- elif p.data.numel() <= self.small_param_numel_threshold:
500
- param_dtensors_small.append(p)
501
- name_dtensors_small.append(n)
502
  else:
 
 
 
 
503
  param_dtensors.append(p)
504
  name_dtensors.append(n)
505
  elif isinstance(p.data, torch.Tensor):
 
 
506
  param_tensors.append(p)
507
  name_tensors.append(n)
508
  else:
509
  raise TypeError(f"Unsupported parameter type: {type(p.data)}")
510
 
511
- logger.debug(
512
- f"[Muon] {len(param_dtensors)} DTensors, {len(param_tensors)} Tensors, "
513
- f"{len(param_dtensors_small)} Small DTensors")
514
 
515
  def group_dtensors(dtensors, names):
516
  # To support different placements, we group parameters by placements
@@ -526,21 +894,6 @@ class Muon(torch.optim.Optimizer):
526
  p.device_mesh])][1].append(p)
527
  return placement_to_params
528
 
529
- if len(param_dtensors_small) > 0:
530
- if not dist.is_initialized():
531
- raise RuntimeError(
532
- "Parallel Muon requires torch.distributed to be initialized."
533
- )
534
-
535
- self.distributed_muon(
536
- params=param_dtensors_small,
537
- names=name_dtensors_small,
538
- group=group,
539
- lr=lr,
540
- weight_decay=weight_decay,
541
- qk_logits=qk_logits,
542
- )
543
-
544
  if len(param_dtensors) > 0:
545
  if not dist.is_initialized():
546
  raise RuntimeError(
@@ -548,7 +901,26 @@ class Muon(torch.optim.Optimizer):
548
  )
549
 
550
  dtensor_group = group_dtensors(param_dtensors, name_dtensors)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
551
  for _, (names, params) in dtensor_group.items():
 
 
552
  self.parallel(
553
  names,
554
  params,
@@ -556,7 +928,10 @@ class Muon(torch.optim.Optimizer):
556
  lr=lr,
557
  weight_decay=weight_decay,
558
  qk_logits=qk_logits,
 
559
  )
 
 
560
 
561
  if len(param_tensors) > 0:
562
  self.base(
@@ -568,6 +943,33 @@ class Muon(torch.optim.Optimizer):
568
  qk_logits=qk_logits,
569
  )
570
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
571
  @torch.no_grad
572
  def step(self, closure=None, qk_logits=None):
573
  """Perform a single optimization step.
@@ -585,10 +987,82 @@ class Muon(torch.optim.Optimizer):
585
  with torch.enable_grad():
586
  loss = closure()
587
 
588
- for group in self.param_groups:
 
 
 
 
 
 
 
589
  if group["use_muon"]:
 
 
590
  self._step_muon(group, qk_logits=qk_logits)
591
  else:
 
 
 
592
  step_adamw(self.state, group)
593
 
 
 
 
 
 
 
 
594
  return loss
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
  from .adamw import step_adamw
12
  from .async_utils import run_pipeline
13
+ from .core import (_muon_state, adjust_lr_for_muon, batch_pre_ortho,
14
+ get_default_muon_param_groups, is_expert_param, update_p)
15
+ from .cpu_offload import CPUOffloadPool
16
  from .distributed.utils import (_is_shard, construct_shard_mesh,
17
  get_slices_of_dtensor)
18
  from .newton_schulz import (COMM_DTYPE, DEFAULT_CHUNK_SIZE_RATIO,
19
+ _zeropower_via_newtonschulz5,
20
+ zeropower_via_newtonschulz5,
21
+ zeropower_via_newtonschulz5_batched)
22
+ from .pipeline import muon_chunk_pipeline, prelaunch_first_gather
23
  from .qk_clip import compute_scales, get_qk_clip_info, qk_clip
24
 
25
  logger = logging.getLogger(__name__)
 
48
  expanded_params = []
49
 
50
  for n, p in zip(names, params):
51
+ is_expert = is_expert_param(n, expert_keys)
52
  is_dtensor = isinstance(p.data, DTensor)
53
 
54
+ if is_expert:
55
+ if is_dtensor:
56
+ logger.debug(
57
+ "[expand_expert] %s: expert DTensor, shape=%s, "
58
+ "placements=%s, mesh=%s, local_shape=%s", n, p.shape,
59
+ p.placements, p.device_mesh.mesh_dim_names,
60
+ p.to_local().shape)
61
+ else:
62
+ logger.debug(
63
+ "[expand_expert] %s: expert plain tensor, shape=%s", n,
64
+ p.data.shape)
65
+
66
  if not is_expert:
67
  assert p.data.ndim <= 2, (
68
  f"Param {n} has ndim={p.data.ndim} but does not match "
 
183
  Use shard ranks * DEFAULT_CHUNK_SIZE_RATIO when -1 is specified.
184
  use_distributed_muon: Use distributed muon by Liu et al. (2024).
185
  For testing purpose only.
 
186
  expert_keys: List of strings to identify expert-parallel parameters.
187
  If any key appears in a parameter's name, its outermost
188
  dimension is treated as the expert dimension and expanded
 
207
  warmup_step=5,
208
  chunk_size=-1,
209
  use_distributed_muon=False,
210
+ expert_keys=None,
211
+ cpu_offload=False):
212
  defaults = dict(
213
  lr=lr,
214
  weight_decay=weight_decay,
 
242
  self.warmup_step = warmup_step
243
  self.chunk_size = chunk_size
244
  self.use_distributed_muon = use_distributed_muon
 
245
  self.expert_keys = expert_keys
246
+ self.cpu_offload = cpu_offload
247
+ self._cpu_offload_pool = CPUOffloadPool() if cpu_offload else None
248
+ self._offload_initialized = False
249
+ self._parallel_cache: dict[tuple[str, ...], dict] = {}
250
+ self._expert_expand_cache: dict[tuple[int, ...], dict] = {}
251
 
252
  def _calc_flops(self, G, steps):
253
  assert len(G.shape) == 2
 
351
  if g is None:
352
  continue
353
 
354
+ u = zeropower_via_newtonschulz5(g.to(COMM_DTYPE),
355
+ steps=group["ns_steps"])
356
 
357
  adjusted_lr = adjust_lr_for_muon(lr, p.shape)
358
  update_p(p, u, lr, adjusted_lr, weight_decay)
 
373
  weight_decay: float,
374
  qk_logits: list[torch.Tensor | DTensor] | None,
375
  ):
376
+ """Batched Distributed Muon for testing/correctness verification only.
377
 
378
+ Uses all-gather to reconstruct full tensors, computes Newton-Schulz on
379
+ the full grad, then slices back to local shards. This is simpler but
380
+ slower than the parallel pipeline (all2all) path, so it serves as a
381
+ reference implementation for verifying correctness.
382
+ """
383
+ with record_function("distributed_muon"):
384
+ # Momentum is already applied by _step_muon before this method.
385
+ ns_steps = group["ns_steps"]
 
 
 
 
 
 
 
 
 
 
 
386
 
387
+ # Separate plain tensors (no communication) from DTensors.
388
+ plain_names, plain_params = [], []
389
+ dtensor_names, dtensor_params = [], []
390
+ for n, p in zip(names, params):
391
+ if p.grad is None:
392
+ continue
393
+ if isinstance(p.data, DTensor):
394
+ dtensor_names.append(n)
395
+ dtensor_params.append(p)
396
+ else:
397
+ plain_names.append(n)
398
+ plain_params.append(p)
399
+
400
+ # Process plain tensors per-param (no communication).
401
+ for n, p in zip(plain_names, plain_params):
402
+ u = _zeropower_via_newtonschulz5(p.grad.to(COMM_DTYPE),
403
+ steps=ns_steps)
404
+ adjusted_lr = adjust_lr_for_muon(lr, p.shape)
405
+ update_p(p, u, lr, adjusted_lr, weight_decay)
406
+
407
+ qk_clip_state = get_qk_clip_info(self.clip_config, n,
408
+ qk_logits)
409
+ scales_full = compute_scales(
410
+ p, qk_clip_state) if qk_clip_state is not None else None
411
+ if scales_full is not None:
412
+ qk_clip(p, scales_full, qk_clip_state.head_dim)
413
+
414
+ if not dtensor_params:
415
+ return
416
+
417
+ # Group DTensors by (placements, mesh) for batched all-gather.
418
+ placement_groups: dict[tuple,
419
+ tuple[list,
420
+ list]] = defaultdict(lambda: ([], []))
421
+ for n, p in zip(dtensor_names, dtensor_params):
422
+ key = (p.placements, p.device_mesh)
423
+ placement_groups[key][0].append(n)
424
+ placement_groups[key][1].append(p)
425
+
426
+ logger.info(
427
+ "distributed_muon: %d placement groups, %d total dtensors",
428
+ len(placement_groups), len(dtensor_params))
429
+
430
+ for (placements, mesh), (grp_names,
431
+ grp_params) in placement_groups.items():
432
+ shard_mesh, shard_pg, shard_placements = construct_shard_mesh(
433
+ placements, mesh)
434
+ rank = dist.get_rank(shard_pg)
435
+ world_size = dist.get_world_size(shard_pg)
436
+
437
+ logger.info(" group: %d params, placements=%s, world_size=%d",
438
+ len(grp_params), placements, world_size)
439
+
440
+ # Separate params that can be batched (all shard dims evenly
441
+ # divisible) from those needing per-param full_tensor
442
+ # (e.g. MoE gate weights with fewer rows than shard ranks).
443
+ # all_gather_into_tensor requires equal buffer sizes across
444
+ # ranks, so uneven splits must use DTensor full_tensor().
445
+ batch_names, batch_params = [], []
446
+ single_names, single_params = [], []
447
+ for n, p in zip(grp_names, grp_params):
448
+ even = all(p.shape[pl.dim] %
449
+ shard_mesh.mesh.shape[dim_idx] == 0
450
+ for dim_idx, pl in enumerate(shard_placements))
451
+ if even:
452
+ batch_names.append(n)
453
+ batch_params.append(p)
454
+ else:
455
+ single_names.append(n)
456
+ single_params.append(p)
457
+
458
+ # Process uneven-split params per-param via full_tensor().
459
+ for n, p in zip(single_names, single_params):
460
+ with record_function("distributed_muon::newton_schulz"):
461
+ g_full = p.grad.full_tensor().to(COMM_DTYPE)
462
+ u_full = _zeropower_via_newtonschulz5(g_full,
463
+ steps=ns_steps)
464
+ del g_full
465
+ with record_function("distributed_muon::update"):
466
+ adjusted_lr = adjust_lr_for_muon(lr, p.shape)
467
+ p._local_tensor.mul_(1 - lr * weight_decay)
468
+ local_indices = get_slices_of_dtensor(
469
+ p, rank, shard_mesh, shard_placements)
470
+ u_local = u_full[local_indices]
471
+ p._local_tensor.add_(u_local, alpha=-adjusted_lr)
472
+ del u_full
473
+
474
+ qk_clip_state = get_qk_clip_info(
475
+ self.clip_config, n, qk_logits)
476
+ scales_full = compute_scales(
477
+ p, qk_clip_state
478
+ ) if qk_clip_state is not None else None
479
+ if scales_full is not None:
480
+ ratio = p.shape[0] // scales_full.shape[0]
481
+ idx0 = local_indices[0]
482
+ if isinstance(idx0, slice):
483
+ start = idx0.start or 0
484
+ idx0 = torch.arange(start,
485
+ idx0.stop,
486
+ device=scales_full.device)
487
+ row_scales = scales_full[idx0 // ratio]
488
+ p._local_tensor.mul_(row_scales.view(-1, 1))
489
+
490
+ if not batch_params:
491
+ continue
492
 
493
+ logger.info(" batched=%d, single=%d", len(batch_params),
494
+ len(single_params))
495
+
496
+ # Concat all local grad shards into a single flat buffer.
497
+ with record_function("distributed_muon::gather"):
498
+ grad_locals = [
499
+ p.grad.to_local().to(COMM_DTYPE).flatten()
500
+ for p in batch_params
501
+ ]
502
+ numels = [g.numel() for g in grad_locals]
503
+ grad_concat = torch.cat(grad_locals)
504
+ del grad_locals
505
+
506
+ # Single all-gather (replaces N separate full_tensor).
507
+ grad_gathered = torch.empty(
508
+ grad_concat.numel() * world_size,
509
+ dtype=COMM_DTYPE,
510
+ device="cuda",
511
+ )
512
+ dist.all_gather_into_tensor(grad_gathered,
513
+ grad_concat,
514
+ group=shard_pg)
515
+
516
+ total_numel = grad_concat.numel()
517
+ del grad_concat
518
+
519
+ # Precompute per-param offsets within the concat buffer.
520
+ offsets = []
521
+ off = 0
522
+ for ne in numels:
523
+ offsets.append(off)
524
+ off += ne
525
+
526
+ # Per-param: reconstruct full grad → NS → local update.
527
+ for i, (n, p) in enumerate(zip(batch_names, batch_params)):
528
+ with record_function("distributed_muon::newton_schulz"):
529
+ g_full = torch.empty(p.shape,
530
+ dtype=COMM_DTYPE,
531
+ device="cuda")
532
+ for r in range(world_size):
533
+ r_start = r * total_numel + offsets[i]
534
+ shard = grad_gathered[r_start:r_start + numels[i]]
535
+ indices = get_slices_of_dtensor(
536
+ p, r, shard_mesh, shard_placements)
537
+ g_full[indices] = shard.reshape(
538
+ g_full[indices].shape)
539
+
540
+ u_full = _zeropower_via_newtonschulz5(g_full,
541
+ steps=ns_steps)
542
+ del g_full
543
+
544
+ with record_function("distributed_muon::update"):
545
+ adjusted_lr = adjust_lr_for_muon(lr, p.shape)
546
+ p._local_tensor.mul_(1 - lr * weight_decay)
547
+ local_indices = get_slices_of_dtensor(
548
+ p, rank, shard_mesh, shard_placements)
549
+ u_local = u_full[local_indices]
550
+ p._local_tensor.add_(u_local, alpha=-adjusted_lr)
551
+ del u_full
552
+
553
+ qk_clip_state = get_qk_clip_info(
554
+ self.clip_config, n, qk_logits)
555
+ scales_full = compute_scales(
556
+ p, qk_clip_state
557
+ ) if qk_clip_state is not None else None
558
+ if scales_full is not None:
559
+ ratio = p.shape[0] // scales_full.shape[0]
560
+ idx0 = local_indices[0]
561
+ if isinstance(idx0, slice):
562
+ start = idx0.start or 0
563
+ idx0 = torch.arange(start,
564
+ idx0.stop,
565
+ device=scales_full.device)
566
+ row_scales = scales_full[idx0 // ratio]
567
+ p._local_tensor.mul_(row_scales.view(-1, 1))
568
+
569
+ def _setup_parallel(self, names, params, group, qk_logits):
570
+ """Compute (or retrieve cached) parallel pipeline metadata.
571
+
572
+ Returns:
573
+ (ordered_params, param_to_state, rank, chunk_size)
574
+ """
575
+ cache_key = tuple(names)
576
 
577
+ if cache_key not in self._parallel_cache:
578
+ # First call: compute metadata and populate cache.
579
+ param_to_state, ordered_params = self.init_state_and_assign_params(
580
+ names, params, group, qk_logits)
581
 
582
+ shard_pg = param_to_state[id(ordered_params[0])].process_group
583
+ rank = dist.get_rank(group=shard_pg)
 
 
 
 
 
584
 
585
+ if self.chunk_size == -1:
586
+ shard_ranks = dist.get_world_size(shard_pg)
587
+ chunk_size = shard_ranks * DEFAULT_CHUNK_SIZE_RATIO
588
+ elif self.chunk_size > 0:
589
+ chunk_size = self.chunk_size
590
+ else:
591
+ raise ValueError(
592
+ "chunk_size must be -1 or a positive integer.")
593
+
594
+ ordered_names = [
595
+ param_to_state[id(p)].name for p in ordered_params
596
+ ]
597
+ name_to_state = {
598
+ param_to_state[id(p)].name: param_to_state[id(p)]
599
+ for p in ordered_params
600
+ }
601
+ self._parallel_cache[cache_key] = {
602
+ 'ordered_names': ordered_names,
603
+ 'name_to_state': name_to_state,
604
+ 'rank': rank,
605
+ 'chunk_size': chunk_size,
606
+ }
607
+ else:
608
+ # Cached path: rebuild param_to_state with current id(p) keys.
609
+ cache = self._parallel_cache[cache_key]
610
+ rank = cache['rank']
611
+ chunk_size = cache['chunk_size']
612
+
613
+ name_to_param = dict(zip(names, params))
614
+ ordered_params = [name_to_param[n] for n in cache['ordered_names']]
615
+
616
+ param_to_state = {}
617
+ for p, n in zip(ordered_params, cache['ordered_names']):
618
+ cached_state = cache['name_to_state'][n]
619
+ param_to_state[id(p)] = _muon_state(
620
+ worker_rank=cached_state.worker_rank,
621
+ process_group=cached_state.process_group,
622
+ rank_indices=cached_state.rank_indices,
623
+ rank_numels=cached_state.rank_numels,
624
+ name=n,
625
+ qk_clip_state=get_qk_clip_info(self.clip_config, n,
626
+ qk_logits),
627
  )
628
 
629
+ return ordered_params, param_to_state, rank, chunk_size
630
 
631
+ def parallel(self,
632
+ names,
633
+ params,
634
+ group,
635
+ lr,
636
+ weight_decay,
637
+ qk_logits,
638
+ prelaunch_gather=None):
639
  """
640
  Perform a parallel optimization step using Muon.
641
 
 
644
  interleaves multiple chunks so that communication and computation
645
  overlap across chunks (the same overlap previously achieved by the
646
  warmup + main-loop index scheduling).
647
+
648
+ If ``prelaunch_gather`` is provided, it is passed to the first
649
+ chunk's generator to skip re-launching the already in-flight
650
+ A2A gather.
651
  """
652
 
653
  # Momentum is already applied by _step_muon before this method.
654
 
655
+ ordered_params, param_to_state, rank, chunk_size = (
656
+ self._setup_parallel(names, params, group, qk_logits))
 
 
 
 
 
 
 
 
 
 
 
 
 
657
 
658
  def pipelines():
659
+ first = True
660
  for start in range(0, len(ordered_params), chunk_size):
661
  chunk = ordered_params[start:start + chunk_size]
662
  if chunk:
663
+ kwargs = dict(
664
  params=chunk,
665
  param_to_state=param_to_state,
666
  rank=rank,
 
669
  weight_decay=weight_decay,
670
  none_grad=group["none_grad"],
671
  )
672
+ if first and prelaunch_gather is not None:
673
+ kwargs['prelaunch_gather'] = prelaunch_gather
674
+ first = False
675
+ yield muon_chunk_pipeline(**kwargs)
676
 
 
 
677
  with record_function("muon::pipeline"):
678
  run_pipeline(pipelines(), max_concurrent=self.warmup_step + 1)
679
 
 
685
  names = group["names"]
686
 
687
  # Apply momentum to all params before routing/expansion.
688
+ # Batched using _foreach_* ops (compiled, fullgraph=True).
689
  with record_function("muon::momentum"):
690
+ active_params = [p for p in params if p.grad is not None]
691
+ if active_params:
692
+ # Ensure momentum buffers exist (avoid zeros_like when already present).
693
+ for p in active_params:
694
+ if "momentum_buffer" not in self.state[p]:
695
+ self.state[p]["momentum_buffer"] = torch.zeros_like(
696
+ p.grad)
697
+
698
+ # Extract local tensors for compiled batch function.
699
+ local_grads = [
700
+ p.grad._local_tensor
701
+ if isinstance(p.grad, DTensor) else p.grad
702
+ for p in active_params
703
+ ]
704
+ local_bufs = [
705
+ self.state[p]["momentum_buffer"]._local_tensor
706
+ if isinstance(self.state[p]["momentum_buffer"], DTensor)
707
+ else self.state[p]["momentum_buffer"]
708
+ for p in active_params
709
+ ]
710
+
711
+ # Wrap momentum as tensor for torch.compile.
712
+ batch_pre_ortho(local_grads, local_bufs,
713
+ torch.tensor(momentum), group["nesterov"])
714
+
715
+ # For non-nesterov, the result is the momentum buffer.
716
+ if not group["nesterov"]:
717
+ for p in active_params:
718
+ p.grad = self.state[p]["momentum_buffer"]
719
+
720
+ # Identify batched experts for deferred NS.
721
+ # Detection is cheap (condition checks only); actual NS compute is
722
+ # deferred so it can overlap with the first chunk's A2A gather.
723
+ deferred_expert_work = []
724
+ if self.expert_keys:
725
+ batched_expert_indices = []
726
+ for i, (n, p) in enumerate(zip(names, params)):
727
+ if not (is_expert_param(n, self.expert_keys)
728
+ and p.grad is not None):
729
  continue
730
+ # Eligible: plain tensor, or DTensor with no non-dim-0 shards.
731
+ if isinstance(p.data, DTensor):
732
+ has_tp = any(
733
+ _is_shard(pl) and pl.dim != 0 for pl in p.placements)
734
+ if has_tp:
735
+ continue
736
+ batched_expert_indices.append(i)
737
+
738
+ if batched_expert_indices:
739
+ # Save refs for deferred NS; free grads from param list.
740
+ for i in batched_expert_indices:
741
+ p = params[i]
742
+ g = p.grad
743
+ local_g = (g._local_tensor
744
+ if isinstance(g, DTensor) else g)
745
+ local_data = (p.data._local_tensor if isinstance(
746
+ p.data, DTensor) else p.data)
747
+ deferred_expert_work.append((local_data, local_g))
748
+ p.grad = None
749
+
750
+ # Remove batched experts from lists before expansion.
751
+ keep = sorted(
752
+ set(range(len(params))) - set(batched_expert_indices))
753
+ names = [names[i] for i in keep]
754
+ params = [params[i] for i in keep]
755
+
756
+ def _run_deferred_expert_ns():
757
+ """Execute deferred batched expert NS."""
758
+ if not deferred_expert_work:
759
+ return
760
+ with record_function("muon::batched_expert_ns"):
761
+ ns_steps = group["ns_steps"]
762
+ for local_data, local_g in deferred_expert_work:
763
+ u = zeropower_via_newtonschulz5_batched(
764
+ local_g.to(COMM_DTYPE), steps=ns_steps)
765
+ adjusted_lr = adjust_lr_for_muon(lr, local_g.shape[1:])
766
+ local_data.mul_(1 - lr * weight_decay)
767
+ local_data.add_(u, alpha=-adjusted_lr)
768
 
769
  # Expand expert params by splitting on dim 0.
770
+ logger.debug("[_step_muon] before expand: %d params, expert_keys=%s",
771
+ len(params), self.expert_keys)
772
+ if self.expert_keys:
773
+ cache_key = tuple(id(p) for p in params)
774
+ cache = self._expert_expand_cache.get(cache_key)
775
+
776
+ if cache is None:
777
+ # Cold path: full expansion + build cache metadata.
778
+ exp_names, exp_params = _expand_expert_params(
779
+ names, params, self.expert_keys)
780
+
781
+ # Build per-expert-group info for hot-path grad updates.
782
+ grad_info = []
783
+ exp_idx = 0
784
+ for orig_idx, (n, p) in enumerate(zip(names, params)):
785
+ if not is_expert_param(n, self.expert_keys):
786
+ exp_idx += 1
787
+ continue
788
+
789
+ is_dt = isinstance(p.data, DTensor)
790
+ num_experts = (p.to_local() if is_dt else p.data).shape[0]
791
+
792
+ # Detect TP mesh from the first expanded expert param.
793
+ tp_mesh = None
794
+ tp_pls = None
795
+ sample = exp_params[exp_idx]
796
+ if isinstance(sample.data, DTensor):
797
+ tp_mesh = sample.data.device_mesh
798
+ tp_pls = list(sample.data.placements)
799
+
800
+ grad_info.append((orig_idx, num_experts, exp_idx, is_dt,
801
+ tp_mesh, tp_pls))
802
+ exp_idx += num_experts
803
+
804
+ self._expert_expand_cache[cache_key] = {
805
+ 'names': exp_names,
806
+ 'params': exp_params,
807
+ 'grad_info': grad_info,
808
+ }
809
+ names, params = exp_names, exp_params
810
+ else:
811
+ # Hot path: reuse cached params, only update expert grads.
812
+ for (orig_idx, num_experts, exp_start, is_dt, tp_mesh,
813
+ tp_pls) in cache['grad_info']:
814
+ p = params[orig_idx]
815
+ g = p.grad
816
+ local_grad = (g.to_local()
817
+ if is_dt and isinstance(g, DTensor) else g)
818
+ for i in range(num_experts):
819
+ expert_p = cache['params'][exp_start + i]
820
+ sg = local_grad[i]
821
+ if tp_mesh is not None:
822
+ expert_p.grad = DTensor.from_local(
823
+ sg, device_mesh=tp_mesh, placements=tp_pls)
824
+ else:
825
+ expert_p.grad = sg
826
+ p.grad = None
827
+
828
+ names = cache['names']
829
+ params = cache['params']
830
+ else:
831
+ names, params = _expand_expert_params(names, params,
832
+ self.expert_keys)
833
+ logger.debug("[_step_muon] after expand: %d params", len(params))
834
 
835
  param_dtensors = []
836
  name_dtensors = []
 
838
  param_tensors = []
839
  name_tensors = []
840
 
841
+ # distributed_muon is a reference implementation for testing only.
842
+ # The parallel pipeline (all2all) path below is the production path.
 
843
  if self.use_distributed_muon:
844
+ _run_deferred_expert_ns()
845
  self.distributed_muon(names=names,
846
  params=params,
847
  group=group,
 
850
  qk_logits=qk_logits)
851
  return
852
 
 
 
853
  for n, p in zip(names, params):
854
  if p is None or p.grad is None:
855
  continue
 
857
  if all(
858
  isinstance(placement, Replicate)
859
  for placement in p.placements):
860
+ logger.debug(
861
+ "[route] %s → base (DTensor all-Replicate), "
862
+ "shape=%s, placements=%s", n, p.shape, p.placements)
863
  param_tensors.append(p)
864
  name_tensors.append(n)
 
 
 
865
  else:
866
+ logger.debug(
867
+ "[route] %s → parallel (DTensor), shape=%s, "
868
+ "placements=%s, mesh=%s", n, p.shape, p.placements,
869
+ p.device_mesh.mesh_dim_names)
870
  param_dtensors.append(p)
871
  name_dtensors.append(n)
872
  elif isinstance(p.data, torch.Tensor):
873
+ logger.debug("[route] %s → base (plain tensor), shape=%s", n,
874
+ p.data.shape)
875
  param_tensors.append(p)
876
  name_tensors.append(n)
877
  else:
878
  raise TypeError(f"Unsupported parameter type: {type(p.data)}")
879
 
880
+ logger.debug(f"[Muon] {len(param_dtensors)} DTensors → parallel, "
881
+ f"{len(param_tensors)} Tensors → base")
 
882
 
883
  def group_dtensors(dtensors, names):
884
  # To support different placements, we group parameters by placements
 
894
  p.device_mesh])][1].append(p)
895
  return placement_to_params
896
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
897
  if len(param_dtensors) > 0:
898
  if not dist.is_initialized():
899
  raise RuntimeError(
 
901
  )
902
 
903
  dtensor_group = group_dtensors(param_dtensors, name_dtensors)
904
+
905
+ # Pre-launch the first chunk's A2A gather so that the NCCL
906
+ # communication overlaps with the (deferred) batched expert NS
907
+ # compute on the default CUDA stream.
908
+ prelaunch = None
909
+ if deferred_expert_work:
910
+ first_names, first_params = next(iter(dtensor_group.values()))
911
+ ordered, pts, rnk, csz = self._setup_parallel(
912
+ first_names, first_params, group, qk_logits)
913
+ first_chunk = ordered[:csz]
914
+ if first_chunk:
915
+ prelaunch = prelaunch_first_gather(first_chunk, pts, rnk,
916
+ group["none_grad"])
917
+
918
+ _run_deferred_expert_ns()
919
+
920
+ first_group = True
921
  for _, (names, params) in dtensor_group.items():
922
+ pg = prelaunch if first_group else None
923
+ first_group = False
924
  self.parallel(
925
  names,
926
  params,
 
928
  lr=lr,
929
  weight_decay=weight_decay,
930
  qk_logits=qk_logits,
931
+ prelaunch_gather=pg,
932
  )
933
+ else:
934
+ _run_deferred_expert_ns()
935
 
936
  if len(param_tensors) > 0:
937
  self.base(
 
943
  qk_logits=qk_logits,
944
  )
945
 
946
+ def _register_states_for_offload(self):
947
+ """Register all optimizer state tensors with the CPU offload pool.
948
+
949
+ Called once after the first step when states have been lazily created.
950
+ Offloads all param states (momentum buffers for Muon, moment1/moment2
951
+ for AdamW) to free GPU memory between steps.
952
+ """
953
+ pool = self._cpu_offload_pool
954
+ tracked = 0
955
+ for group in self.param_groups:
956
+ for p in group["params"]:
957
+ if p not in self.state:
958
+ continue
959
+ state = self.state[p]
960
+ if group.get("use_muon", False):
961
+ if "momentum_buffer" in state:
962
+ pool.track(state["momentum_buffer"])
963
+ tracked += 1
964
+ else:
965
+ if "moment1" in state:
966
+ pool.track(state["moment1"])
967
+ if "moment2" in state:
968
+ pool.track(state["moment2"])
969
+ tracked += 1
970
+ logger.info("[CPUOffload] Registered %d param states for offload",
971
+ tracked)
972
+
973
  @torch.no_grad
974
  def step(self, closure=None, qk_logits=None):
975
  """Perform a single optimization step.
 
987
  with torch.enable_grad():
988
  loss = closure()
989
 
990
+ # H2D: reload optimizer states from CPU before computation.
991
+ if self.cpu_offload and self._offload_initialized:
992
+ self._cpu_offload_pool.reload()
993
+
994
+ logger.debug("[Muon.step] expert_keys=%s, %d param groups",
995
+ self.expert_keys, len(self.param_groups))
996
+
997
+ for i, group in enumerate(self.param_groups):
998
  if group["use_muon"]:
999
+ logger.debug("[Muon.step] group %d: use_muon=True, %d params",
1000
+ i, len(group["params"]))
1001
  self._step_muon(group, qk_logits=qk_logits)
1002
  else:
1003
+ logger.debug(
1004
+ "[Muon.step] group %d: use_muon=False (AdamW), %d params",
1005
+ i, len(group["params"]))
1006
  step_adamw(self.state, group)
1007
 
1008
+ # D2H: offload optimizer states to CPU after computation.
1009
+ if self.cpu_offload:
1010
+ if not self._offload_initialized:
1011
+ self._register_states_for_offload()
1012
+ self._offload_initialized = True
1013
+ self._cpu_offload_pool.offload()
1014
+
1015
  return loss
1016
+
1017
+ # ------------------------------------------------------------------
1018
+ # Checkpoint support for cpu_offload
1019
+ # ------------------------------------------------------------------
1020
+
1021
+ def state_dict(self) -> dict:
1022
+ """Return optimizer state dict, reloading offloaded states first.
1023
+
1024
+ When ``cpu_offload=True``, optimizer state tensors have their GPU
1025
+ storage freed (``resize_(0)``) between steps. We reload them,
1026
+ snapshot the state dict, then re-offload so the optimizer stays
1027
+ in the expected post-step state. The returned dict holds cloned
1028
+ tensors so they remain valid after the re-offload frees the
1029
+ originals' GPU storage.
1030
+ """
1031
+ if self.cpu_offload and self._offload_initialized:
1032
+ self._cpu_offload_pool.reload()
1033
+ torch.cuda.current_stream().synchronize()
1034
+ sd = super().state_dict()
1035
+ if self.cpu_offload and self._offload_initialized:
1036
+ # Clone state tensors so the returned dict survives re-offload
1037
+ # (which frees GPU storage on the originals via resize_(0)).
1038
+ for k in sd["state"]:
1039
+ sd["state"][k] = {
1040
+ sk: sv.clone() if isinstance(sv, torch.Tensor) else sv
1041
+ for sk, sv in sd["state"][k].items()
1042
+ }
1043
+ self._cpu_offload_pool.offload()
1044
+ return sd
1045
+
1046
+ def load_state_dict(self, state_dict: dict) -> None:
1047
+ """Load optimizer state dict, then offload states if needed.
1048
+
1049
+ After ``super().load_state_dict()`` populates GPU tensors, we
1050
+ re-register them with the offload pool and offload to CPU so the
1051
+ optimizer is in the same post-step state (GPU storage freed).
1052
+ """
1053
+ # If states were offloaded, reload first so storage sizes are
1054
+ # correct for super().load_state_dict() to overwrite.
1055
+ if self.cpu_offload and self._offload_initialized:
1056
+ self._cpu_offload_pool.reload()
1057
+ torch.cuda.current_stream().synchronize()
1058
+
1059
+ super().load_state_dict(state_dict)
1060
+
1061
+ if self.cpu_offload:
1062
+ # Re-create the offload pool since state tensors may be new
1063
+ # objects after load_state_dict.
1064
+ self._cpu_offload_pool = CPUOffloadPool()
1065
+ self._offload_initialized = False
1066
+ self._register_states_for_offload()
1067
+ self._offload_initialized = True
1068
+ self._cpu_offload_pool.offload()
build/torch210-cxx11-cu126-x86_64-linux/newton_schulz.py CHANGED
@@ -1,3 +1,7 @@
 
 
 
 
1
  import torch
2
 
3
  from .matmul_transpose_triton import matmul_transpose_assign
@@ -6,21 +10,134 @@ COMM_DTYPE = torch.bfloat16
6
  DEFAULT_CHUNK_SIZE_RATIO = 4
7
 
8
 
9
- # This code snippet is a modified version adapted from the following GitHub repositories:
10
- # https://github.com/KellerJordan/Muon/blob/master/muon.py
11
- # Muon's Newton–Schulz iteration causes high variance in singular values
12
- # Idea: give each iteration its own 3 coefficients and optimize them via gradient descent.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  @torch.no_grad()
14
- # matmul_transpose_assign from : https://github.com/nil0x9/flash-muon
15
  def _zeropower_via_newtonschulz5(G, steps):
16
  """
17
- Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
18
- quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
19
- of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
20
- zero even beyond the point where the iteration no longer converges all the way to one everywhere
21
- on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
22
- where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
23
- performance at all relative to UV^T, where USV^T = G is the SVD.
 
 
 
 
 
 
 
24
  """
25
  assert len(G.shape) == 2
26
  assert G.dtype == COMM_DTYPE
@@ -28,18 +145,14 @@ def _zeropower_via_newtonschulz5(G, steps):
28
 
29
  if G.size(0) > G.size(1):
30
  X = X.T
31
- # Ensure spectral norm is at most 1
32
  X = X / (X.norm() + 1e-7)
 
 
33
  buf1 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
34
  buf2 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
35
  # Perform the NS iterations
36
- for a, b, c in [
37
- (4.0848, -6.8946, 2.9270),
38
- (3.9505, -6.3029, 2.6377),
39
- (3.7418, -5.5913, 2.3037),
40
- (2.8769, -3.1427, 1.2046),
41
- (2.8366, -3.0525, 1.2012),
42
- ]:
43
  matmul_transpose_assign(X, buf1)
44
  matmul_transpose_assign(buf1, buf2)
45
  buf1.mul_(b).add_(buf2, alpha=c)
@@ -47,4 +160,77 @@ def _zeropower_via_newtonschulz5(G, steps):
47
 
48
  if G.size(0) > G.size(1):
49
  X = X.T
 
50
  return X
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from itertools import repeat
2
+ from math import inf, sqrt
3
+
4
+ import numpy as np
5
  import torch
6
 
7
  from .matmul_transpose_triton import matmul_transpose_assign
 
10
  DEFAULT_CHUNK_SIZE_RATIO = 4
11
 
12
 
13
+ def _optimal_quintic(l, u, max_iter=1000):
14
+ """
15
+ Use the simplified Remez algorithm to find the optimal odd quintic approximant
16
+ to the constant function x -> 1 over the interval [l, u].
17
+
18
+ Returns (a, b, c) for p(x) = ax + bx^3 + cx^5 that minimizes the maximum
19
+ approximation error max_{x in [l,u]} |p(x) - 1|. Iterates by updating the
20
+ two interior equioscillation nodes q, r until convergence. Returns the
21
+ closed-form equioscillating solution when l ≈ u.
22
+
23
+ Raises ValueError if any intermediate value (a, b, c, E, q, r) is non-finite
24
+ (NaN or inf). Raises RuntimeError if convergence is not reached within
25
+ max_iter iterations.
26
+ """
27
+ assert 0 <= l <= u
28
+ if 1 - 5e-6 <= l / u:
29
+ return (15 / 8) / u, (-10 / 8) / (u**3), (3 / 8) / (u**5)
30
+ q = (3 * l + u) / 4
31
+ r = (l + 3 * u) / 4
32
+ E = inf
33
+ for _ in range(max_iter):
34
+ old_E = E
35
+ LHS = np.array([
36
+ [l, l**3, l**5, 1],
37
+ [q, q**3, q**5, -1],
38
+ [r, r**3, r**5, 1],
39
+ [u, u**3, u**5, -1],
40
+ ])
41
+ a, b, c, E = np.linalg.solve(LHS, np.ones(4))
42
+ if not np.all(np.isfinite([a, b, c, E])):
43
+ raise ValueError(f"_optimal_quintic: non-finite solve result "
44
+ f"a={a}, b={b}, c={c}, E={E}")
45
+ q, r = np.sqrt(
46
+ (-3 * b + np.array([-1, 1]) * sqrt(9 * b**2 - 20 * a * c)) /
47
+ (10 * c))
48
+ if not np.all(np.isfinite([q, r])):
49
+ raise ValueError(
50
+ f"_optimal_quintic: non-finite node update q={q}, r={r}")
51
+ if abs(old_E - E) <= 1e-15:
52
+ break
53
+ else:
54
+ raise RuntimeError(
55
+ f"_optimal_quintic: did not converge after {max_iter} iterations")
56
+ return float(a), float(b), float(c)
57
+
58
+
59
+ def _optimal_composition(l, num_iters, safety_factor_eps=0, cushion=0):
60
+ """
61
+ Compute the Polar Express coefficient series for `num_iters` quintic iterations.
62
+
63
+ Builds a sequence of per-step optimal odd quintic coefficients (a, b, c) that
64
+ compose to map singular values from [l, 1] toward 1. At each step:
65
+ 1. Solves `_optimal_quintic` on [max(l, cushion*u), u]. The `cushion`
66
+ prevents near-zero singular values from stalling by raising the effective
67
+ lower bound; if it is active (cushion*u > l), the coefficients are
68
+ rescaled so that p(l) and p(u) are centered around 1 w.r.t. the true [l, u].
69
+ 2. Deflates the coefficients by (1 + safety_factor_eps)^degree for all but the
70
+ last iteration, providing numerical headroom at the cost of a slightly slower
71
+ final convergence step.
72
+ 3. Advances the interval: l <- p(l), u <- 2 - p(l) (by symmetry of p around 1).
73
+
74
+ Returns a list of (a, b, c) tuples, one per iteration.
75
+
76
+ Reference: Amsel et al., "The Polar Express: Optimal Matrix Sign Methods and
77
+ Their Application to the Muon Algorithm", https://arxiv.org/abs/2505.16932
78
+ """
79
+ u = 1
80
+ assert 0 <= l <= u
81
+ safety_factor = 1 + safety_factor_eps
82
+ coefficients = []
83
+ for iter in range(num_iters):
84
+ a, b, c = _optimal_quintic(max(l, cushion * u), u)
85
+ if cushion * u > l:
86
+ pl = a * l + b * l**3 + c * l**5
87
+ pu = a * u + b * u**3 + c * u**5
88
+ rescaler = 2 / (pl + pu)
89
+ a *= rescaler
90
+ b *= rescaler
91
+ c *= rescaler
92
+ if iter < num_iters - 1:
93
+ a /= safety_factor
94
+ b /= safety_factor**3
95
+ c /= safety_factor**5
96
+ coefficients.append((a, b, c))
97
+ l = a * l + b * l**3 + c * l**5
98
+ u = 2 - l
99
+ return coefficients
100
+
101
+
102
+ # Precomputed Polar Express coefficients (a, b, c) for 10 quintic Newton-Schulz
103
+ # iterations. Each tuple is the minimax-optimal (Remez/equioscillation) odd quintic
104
+ # approximant to x->1 over the current singular-value interval, computed once at
105
+ # import time and reused across all optimizer steps.
106
+ #
107
+ # Contrast with the former hardcoded NS coefficients (5 fixed tuples):
108
+ # - Former: empirically tuned to maximize slope at zero; did not converge
109
+ # singular values to 1, yielding US'V^T with S' ~ Uniform(0.5, 1.5) instead
110
+ # of the true polar factor UV^T.
111
+ # - Polar Express: analytically optimal per step, adapting to the shrinking
112
+ # singular-value interval [l, u] as iterations progress; converges all
113
+ # singular values to 1, producing the exact polar factor UV^T.
114
+ _coeffs_list = _optimal_composition(l=1e-3,
115
+ num_iters=10,
116
+ safety_factor_eps=1e-2,
117
+ cushion=0.02)
118
+
119
+
120
+ # This code is adapted from:
121
+ # KellerJordan/Muon (https://github.com/KellerJordan/Muon/blob/master/muon.py)
122
+ # NoahAmsel/PolarExpress (https://github.com/NoahAmsel/PolarExpress)
123
+ # matmul_transpose_assign kernel from nil0x9/flash-muon (https://github.com/nil0x9/flash-muon)
124
  @torch.no_grad()
 
125
  def _zeropower_via_newtonschulz5(G, steps):
126
  """
127
+ Compute the polar factor of G via the Polar Express method.
128
+
129
+ Applies `steps` quintic iterations X <- aX + bX^3 + cX^5, where (a, b, c)
130
+ are the Polar Express coefficients from `_coeffs_list`. Each step is the
131
+ optimal odd quintic approximant to x -> 1 over the current singular-value
132
+ interval, minimizing the maximum approximation error (Remez / minimax criterion).
133
+ The composition maps singular values from [l, 1] to near 1, producing the
134
+ polar factor (orthogonal factor in the polar decomposition G = UP).
135
+
136
+ `_coeffs_list` is precomputed for 10 iterations (l=1e-3, safety_factor_eps=1e-2,
137
+ cushion=0.02). If `steps` exceeds 10, the final coefficient set is repeated.
138
+
139
+ Reference: Amsel et al., "The Polar Express: Optimal Matrix Sign Methods and
140
+ Their Application to the Muon Algorithm", https://arxiv.org/abs/2505.16932
141
  """
142
  assert len(G.shape) == 2
143
  assert G.dtype == COMM_DTYPE
 
145
 
146
  if G.size(0) > G.size(1):
147
  X = X.T
148
+
149
  X = X / (X.norm() + 1e-7)
150
+ hs = _coeffs_list[:steps] + list(
151
+ repeat(_coeffs_list[-1], steps - len(_coeffs_list)))
152
  buf1 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
153
  buf2 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
154
  # Perform the NS iterations
155
+ for a, b, c in hs:
 
 
 
 
 
 
156
  matmul_transpose_assign(X, buf1)
157
  matmul_transpose_assign(buf1, buf2)
158
  buf1.mul_(b).add_(buf2, alpha=c)
 
160
 
161
  if G.size(0) > G.size(1):
162
  X = X.T
163
+
164
  return X
165
+
166
+
167
+ @torch.no_grad()
168
+ def _zeropower_via_newtonschulz5_batched(G, steps):
169
+ """Batched polar factor computation for 3D (E, out, in) tensors.
170
+
171
+ Same algorithm as ``_zeropower_via_newtonschulz5`` but uses
172
+ ``torch.bmm`` / ``torch.baddbmm`` instead of the 2D Triton kernel,
173
+ processing all E expert matrices in a single batched call.
174
+ """
175
+ assert len(G.shape) == 3
176
+ assert G.dtype == COMM_DTYPE
177
+ X = G
178
+
179
+ if G.size(1) > G.size(2):
180
+ X = X.transpose(-2, -1)
181
+
182
+ # Per-expert Frobenius norm.
183
+ X = X / (X.norm(dim=(-2, -1), keepdim=True) + 1e-7)
184
+
185
+ hs = _coeffs_list[:steps] + list(
186
+ repeat(_coeffs_list[-1], steps - len(_coeffs_list)))
187
+ for a, b, c in hs:
188
+ buf1 = torch.bmm(X, X.transpose(-2, -1))
189
+ buf2 = torch.bmm(buf1, buf1.transpose(-2, -1))
190
+ buf1.mul_(b).add_(buf2, alpha=c)
191
+ X = torch.baddbmm(X, buf1, X, alpha=1.0, beta=a)
192
+
193
+ if G.size(1) > G.size(2):
194
+ X = X.transpose(-2, -1)
195
+
196
+ return X
197
+
198
+
199
+ _ns_per_shape: dict[tuple[int, ...], callable] = {}
200
+ _use_compile = True
201
+
202
+
203
+ def set_ns_compile(enabled: bool):
204
+ """Toggle torch.compile for Newton-Schulz iteration."""
205
+ global _use_compile
206
+ _use_compile = enabled
207
+
208
+
209
+ def zeropower_via_newtonschulz5(G, steps=5):
210
+ if not _use_compile:
211
+ return _zeropower_via_newtonschulz5(G, steps)
212
+ key = G.shape
213
+ if key not in _ns_per_shape:
214
+ _ns_per_shape[key] = torch.compile(_zeropower_via_newtonschulz5,
215
+ options={
216
+ "triton.cudagraphs": True,
217
+ "shape_padding": False
218
+ })
219
+ torch.compiler.cudagraph_mark_step_begin()
220
+ return _ns_per_shape[key](G, steps).clone()
221
+
222
+
223
+ def zeropower_via_newtonschulz5_batched(G, steps=5):
224
+ """Compile-cached batched Newton-Schulz for 3D expert tensors."""
225
+ if not _use_compile:
226
+ return _zeropower_via_newtonschulz5_batched(G, steps)
227
+ key = G.shape
228
+ if key not in _ns_per_shape:
229
+ _ns_per_shape[key] = torch.compile(
230
+ _zeropower_via_newtonschulz5_batched,
231
+ options={
232
+ "triton.cudagraphs": True,
233
+ "shape_padding": False
234
+ })
235
+ torch.compiler.cudagraph_mark_step_begin()
236
+ return _ns_per_shape[key](G, steps).clone()
build/torch210-cxx11-cu126-x86_64-linux/pipeline.py CHANGED
@@ -6,8 +6,8 @@ import torch.distributed as dist
6
  from torch.distributed.tensor import DTensor
7
  from torch.profiler import record_function
8
 
9
- from .core import _muon_state, adjust_lr_for_muon, update_p
10
- from .newton_schulz import COMM_DTYPE, _zeropower_via_newtonschulz5
11
  from .qk_clip import compute_scales
12
 
13
  logger = logging.getLogger(__name__)
@@ -45,26 +45,33 @@ def _launch_gather(
45
  else:
46
  gathered_grads[id(p)] = None
47
 
48
- # Build send buffer
49
- per_dst: list[list[torch.Tensor]] = [[] for _ in range(num_ranks)]
50
  send_counts = [0] * num_ranks
51
-
52
  for p in params:
53
  state = param_to_state[id(p)]
54
- dst = state.worker_rank
55
- assert dst < num_ranks
56
- shard_elems = state.rank_numels[rank]
57
- g = p.grad
58
- g = g.to_local().to(COMM_DTYPE).contiguous()
59
- assert g.numel() == shard_elems
60
- per_dst[dst].append(g.view(-1))
61
- send_counts[dst] += shard_elems
62
-
63
- assert any(
64
- len(v) > 0 for v in
65
- per_dst), "At least one destination rank must receive a sharded tensor"
66
- per_dst_flat = [t for dst in per_dst for t in dst]
67
- send_buf = torch.cat(per_dst_flat, dim=0)
 
 
 
 
 
 
 
 
68
 
69
  # Build recv buffer
70
  recv_counts = [0] * num_ranks
@@ -120,7 +127,8 @@ def _complete_gather(
120
 
121
  shard_view = gathered_grads[id(p)][indices]
122
  n = shard_view.numel()
123
- assert n > 0
 
124
 
125
  sg = recv_buf.narrow(0, off + inner_off, n)
126
  sg = sg.reshape(shard_view.shape)
@@ -143,7 +151,7 @@ def _compute_ns(
143
  """
144
  computed_us: dict[int, torch.Tensor | None] = {}
145
  for p in owned_params:
146
- u = _zeropower_via_newtonschulz5(gathered_grads[id(p)], ns_steps)
147
  gathered_grads[id(p)] = None # free gathered grad
148
  computed_us[id(p)] = u
149
  return computed_us
@@ -163,46 +171,47 @@ def _launch_scatter(
163
  Returns:
164
  work: Async operation handle.
165
  recv_buf: Flat receive buffer (needed by ``_complete_scatter``).
166
- scattered_us: ``{id(p): empty_local_tensor}`` for all params.
 
167
  recv_counts: Per-source-rank element counts.
168
  """
169
- # Allocate scattered-u buffers
 
 
 
170
  scattered_us: dict[int, torch.Tensor] = {}
171
  for p in params:
172
- scattered_us[id(p)] = torch.empty_like(p.to_local(), dtype=COMM_DTYPE)
 
 
173
 
174
- # Build send buffer (from computed_us on owner ranks)
175
- per_dst: list[list[torch.Tensor]] = [[] for _ in range(num_ranks)]
176
  send_counts = [0] * num_ranks
177
-
178
  if owned_params:
179
  for p in owned_params:
180
  state = param_to_state[id(p)]
181
-
182
- assert computed_us[id(p)] is not None
183
- u_full = computed_us[id(p)].to(COMM_DTYPE).contiguous()
184
-
185
- total_sent = 0
186
  for dst_rank in range(num_ranks):
187
- indices = state.rank_indices[dst_rank]
188
- su = u_full[indices].flatten()
189
-
190
- n = su.numel()
191
- assert n > 0
192
 
193
- per_dst[dst_rank].append(su)
194
- send_counts[dst_rank] += n
195
- total_sent += n
196
-
197
- assert total_sent == u_full.numel()
198
-
199
- lengths = [len(v) for v in per_dst]
200
- if all(l > 0 for l in lengths):
201
- assert all(
202
- l == lengths[0] for l in lengths
203
- ), "All destination ranks must have the same number of sharded tensor"
204
- per_dst_flat = [t for dst in per_dst for t in dst]
205
- send_buf = torch.cat(per_dst_flat, dim=0)
 
 
 
 
 
206
  else:
207
  send_buf = torch.empty(0, dtype=COMM_DTYPE, device="cuda")
208
 
@@ -218,7 +227,6 @@ def _launch_scatter(
218
  recv_counts[src] = total
219
 
220
  recv_total = sum(recv_counts)
221
- assert recv_total > 0
222
  recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")
223
 
224
  # Launch async all-to-all
@@ -242,7 +250,13 @@ def _complete_scatter(
242
  rank: int,
243
  scattered_us: dict[int, torch.Tensor],
244
  ) -> None:
245
- """Copy recv buffer into scattered_us (in-place)."""
 
 
 
 
 
 
246
  off = 0
247
  for src in range(len(recv_counts)):
248
  block = recv_counts[src]
@@ -255,11 +269,11 @@ def _complete_scatter(
255
  if state.worker_rank != src:
256
  continue
257
  n = state.rank_numels[rank]
258
- assert n > 0
 
259
 
260
- flat_local = recv_buf.narrow(0, off + inner_off,
261
- n).view_as(p.to_local())
262
- scattered_us[id(p)].copy_(flat_local)
263
 
264
  inner_off += n
265
 
@@ -275,23 +289,40 @@ def _update_params(
275
  lr: float,
276
  weight_decay: float,
277
  ) -> None:
278
- """Apply weight decay, Muon update, and optional QK clipping."""
279
- for p in params:
280
- state = param_to_state[id(p)]
281
- u_dtensor = DTensor.from_local(
282
- scattered_us[id(p)],
283
- placements=p.placements,
284
- device_mesh=p.device_mesh,
285
- )
286
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
287
  adjusted_lr = adjust_lr_for_muon(lr, p.shape)
288
- update_p(p, u_dtensor, lr, adjusted_lr, weight_decay)
 
 
 
289
 
290
- # QK clipping applied directly on the local tensor to
291
- # avoid DTensor sharding-propagation issues with _StridedShard.
292
- scales_full = compute_scales(
293
- p,
294
- state.qk_clip_state) if state.qk_clip_state is not None else None
 
 
 
 
 
295
  if scales_full is not None:
296
  ratio = p.shape[0] // scales_full.shape[0]
297
  idx0 = state.rank_indices[rank][0]
@@ -304,6 +335,45 @@ def _update_params(
304
  p._local_tensor.mul_(row_scales.view(-1, 1))
305
 
306
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
307
  # ======================================================================
308
  # Main generator – thin orchestrator that wires stages together.
309
  # ======================================================================
@@ -318,6 +388,7 @@ def muon_chunk_pipeline(
318
  lr: float,
319
  weight_decay: float,
320
  none_grad: bool,
 
321
  ) -> Generator[None, None, None]:
322
  """Process one chunk of parameters through the full Muon pipeline.
323
 
@@ -334,9 +405,12 @@ def muon_chunk_pipeline(
334
  runs concurrently on the NCCL stream — no separate ``comm_stream``
335
  is required.
336
 
 
 
 
337
  Yields exactly **2** times:
338
 
339
- 1. After launching async all-to-all gather.
340
  2. After launching async all-to-all scatter.
341
  """
342
  process_group = param_to_state[id(params[0])].process_group
@@ -345,15 +419,19 @@ def muon_chunk_pipeline(
345
  p for p in params if param_to_state[id(p)].worker_rank == rank
346
  ]
347
 
348
- # Stages 1-2: launch async gather.
349
- with record_function("muon::launch_gather"):
350
- work, recv_buf, gathered_grads, recv_counts = _launch_gather(
351
- params, owned_params, param_to_state, rank, num_ranks,
352
- process_group)
353
-
354
- if none_grad:
355
- for p in params:
356
- p.grad = None
 
 
 
 
357
 
358
  yield # --- YIELD 1: other chunks can launch their gather ---
359
 
 
6
  from torch.distributed.tensor import DTensor
7
  from torch.profiler import record_function
8
 
9
+ from .core import _muon_state, adjust_lr_for_muon
10
+ from .newton_schulz import COMM_DTYPE, zeropower_via_newtonschulz5
11
  from .qk_clip import compute_scales
12
 
13
  logger = logging.getLogger(__name__)
 
45
  else:
46
  gathered_grads[id(p)] = None
47
 
48
+ # Build send buffer – batch grad copies via torch.cat
49
+ # (1-2 fused kernels vs N individual narrow().copy_() calls).
50
  send_counts = [0] * num_ranks
 
51
  for p in params:
52
  state = param_to_state[id(p)]
53
+ send_counts[state.worker_rank] += state.rank_numels[rank]
54
+
55
+ total_send = sum(send_counts)
56
+ if total_send > 0:
57
+ # Group grad slices by destination rank in a single pass.
58
+ dst_to_grads = [[] for _ in range(num_ranks)]
59
+ for p in params:
60
+ state = param_to_state[id(p)]
61
+ n = state.rank_numels[rank]
62
+ if n > 0:
63
+ g = p.grad.to_local()
64
+ dst_to_grads[state.worker_rank].append(g.reshape(-1))
65
+
66
+ # Flatten in dst order and cat once.
67
+ all_slices = []
68
+ for dst in range(num_ranks):
69
+ all_slices.extend(dst_to_grads[dst])
70
+ send_buf = torch.cat(all_slices)
71
+ if send_buf.dtype != COMM_DTYPE:
72
+ send_buf = send_buf.to(COMM_DTYPE)
73
+ else:
74
+ send_buf = torch.empty(0, dtype=COMM_DTYPE, device="cuda")
75
 
76
  # Build recv buffer
77
  recv_counts = [0] * num_ranks
 
127
 
128
  shard_view = gathered_grads[id(p)][indices]
129
  n = shard_view.numel()
130
+ if n == 0:
131
+ continue
132
 
133
  sg = recv_buf.narrow(0, off + inner_off, n)
134
  sg = sg.reshape(shard_view.shape)
 
151
  """
152
  computed_us: dict[int, torch.Tensor | None] = {}
153
  for p in owned_params:
154
+ u = zeropower_via_newtonschulz5(gathered_grads[id(p)], ns_steps)
155
  gathered_grads[id(p)] = None # free gathered grad
156
  computed_us[id(p)] = u
157
  return computed_us
 
171
  Returns:
172
  work: Async operation handle.
173
  recv_buf: Flat receive buffer (needed by ``_complete_scatter``).
174
+ scattered_us: Empty dict, populated by ``_complete_scatter`` with
175
+ zero-copy views into ``recv_buf``.
176
  recv_counts: Per-source-rank element counts.
177
  """
178
+ # scattered_us is populated by _complete_scatter with zero-copy views
179
+ # into recv_buf, avoiding N empty_like allocations + N copy_ calls.
180
+ # Pre-seed entries for params whose local shard is empty (rank_numels == 0)
181
+ # so _update_params can iterate all params without KeyError.
182
  scattered_us: dict[int, torch.Tensor] = {}
183
  for p in params:
184
+ if param_to_state[id(p)].rank_numels[rank] == 0:
185
+ scattered_us[id(p)] = torch.empty_like(p.to_local(),
186
+ dtype=COMM_DTYPE)
187
 
188
+ # Build send buffer batch via torch.cat
189
+ # (1 fused kernel vs N*num_ranks individual narrow().copy_() calls).
190
  send_counts = [0] * num_ranks
 
191
  if owned_params:
192
  for p in owned_params:
193
  state = param_to_state[id(p)]
 
 
 
 
 
194
  for dst_rank in range(num_ranks):
195
+ send_counts[dst_rank] += state.rank_numels[dst_rank]
 
 
 
 
196
 
197
+ total_send = sum(send_counts)
198
+ if total_send > 0:
199
+ # Cache u_full conversions to avoid redundant .to() per dst_rank.
200
+ u_fulls = {}
201
+ for p in owned_params:
202
+ u_fulls[id(p)] = computed_us[id(p)].to(COMM_DTYPE).contiguous()
203
+
204
+ # Collect slices in dst order (matches all-to-all send layout).
205
+ all_slices = []
206
+ for dst_rank in range(num_ranks):
207
+ for p in owned_params:
208
+ state = param_to_state[id(p)]
209
+ su = u_fulls[id(p)][state.rank_indices[dst_rank]].flatten()
210
+ if su.numel() > 0:
211
+ all_slices.append(su)
212
+
213
+ send_buf = torch.cat(all_slices) if all_slices else torch.empty(
214
+ 0, dtype=COMM_DTYPE, device="cuda")
215
  else:
216
  send_buf = torch.empty(0, dtype=COMM_DTYPE, device="cuda")
217
 
 
227
  recv_counts[src] = total
228
 
229
  recv_total = sum(recv_counts)
 
230
  recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")
231
 
232
  # Launch async all-to-all
 
250
  rank: int,
251
  scattered_us: dict[int, torch.Tensor],
252
  ) -> None:
253
+ """Populate scattered_us with zero-copy views into recv_buf.
254
+
255
+ Instead of pre-allocating tensors and copying, we assign views directly
256
+ from ``recv_buf``. This eliminates N ``empty_like`` + N ``copy_`` calls.
257
+ The underlying storage of ``recv_buf`` is kept alive through the views
258
+ until ``scattered_us`` is cleared after ``_update_params``.
259
+ """
260
  off = 0
261
  for src in range(len(recv_counts)):
262
  block = recv_counts[src]
 
269
  if state.worker_rank != src:
270
  continue
271
  n = state.rank_numels[rank]
272
+ if n == 0:
273
+ continue
274
 
275
+ scattered_us[id(p)] = recv_buf.narrow(0, off + inner_off,
276
+ n).view_as(p.to_local())
 
277
 
278
  inner_off += n
279
 
 
289
  lr: float,
290
  weight_decay: float,
291
  ) -> None:
292
+ """Apply weight decay, Muon update, and optional QK clipping.
 
 
 
 
 
 
 
293
 
294
+ Uses batched ``_foreach_mul_`` for weight decay and batched
295
+ ``_foreach_add_`` for the Muon update, grouping parameters by
296
+ adjusted_lr to minimize kernel launches while preserving float32
297
+ precision for the alpha scaling.
298
+ """
299
+ if not params:
300
+ return
301
+
302
+ # Batched weight decay: p *= (1 - lr * wd) — single fused kernel.
303
+ p_locals = [p._local_tensor for p in params]
304
+ torch._foreach_mul_(p_locals, 1.0 - lr * weight_decay)
305
+
306
+ # Group params by adjusted_lr so _foreach_add_ can use a single
307
+ # alpha per group (preserves float32 precision for alpha scaling).
308
+ lr_groups: dict[float, tuple[list, list]] = {}
309
+ for p in params:
310
  adjusted_lr = adjust_lr_for_muon(lr, p.shape)
311
+ if adjusted_lr not in lr_groups:
312
+ lr_groups[adjusted_lr] = ([], [])
313
+ lr_groups[adjusted_lr][0].append(p._local_tensor)
314
+ lr_groups[adjusted_lr][1].append(scattered_us[id(p)])
315
 
316
+ for adjusted_lr, (p_group, u_group) in lr_groups.items():
317
+ torch._foreach_add_(p_group, u_group, alpha=-adjusted_lr)
318
+
319
+ # QK clipping – applied directly on the local tensor to
320
+ # avoid DTensor sharding-propagation issues with _StridedShard.
321
+ for p in params:
322
+ state = param_to_state[id(p)]
323
+ if state.qk_clip_state is None:
324
+ continue
325
+ scales_full = compute_scales(p, state.qk_clip_state)
326
  if scales_full is not None:
327
  ratio = p.shape[0] // scales_full.shape[0]
328
  idx0 = state.rank_indices[rank][0]
 
335
  p._local_tensor.mul_(row_scales.view(-1, 1))
336
 
337
 
338
+ # ======================================================================
339
+ # Pre-launch helper for overlapping first chunk's gather with other work.
340
+ # ======================================================================
341
+
342
+
343
+ @torch.no_grad()
344
+ def prelaunch_first_gather(
345
+ params: list[DTensor],
346
+ param_to_state: dict[int, _muon_state],
347
+ rank: int,
348
+ none_grad: bool,
349
+ ) -> tuple[dist.Work, torch.Tensor, dict[int, torch.Tensor | None], list[int]]:
350
+ """Launch the first chunk's A2A gather early for overlap with other compute.
351
+
352
+ Call this *before* expensive GPU work (e.g. batched expert NS) so that
353
+ the NCCL all-to-all runs concurrently on the NCCL stream while the
354
+ default stream executes compute.
355
+
356
+ Returns the same 4-tuple that ``_launch_gather`` produces, which should
357
+ be passed as ``prelaunch_gather`` to :func:`muon_chunk_pipeline`.
358
+ """
359
+ process_group = param_to_state[id(params[0])].process_group
360
+ num_ranks = dist.get_world_size(group=process_group)
361
+ owned_params = [
362
+ p for p in params if param_to_state[id(p)].worker_rank == rank
363
+ ]
364
+
365
+ with record_function("muon::prelaunch_gather"):
366
+ work, recv_buf, gathered_grads, recv_counts = _launch_gather(
367
+ params, owned_params, param_to_state, rank, num_ranks,
368
+ process_group)
369
+
370
+ if none_grad:
371
+ for p in params:
372
+ p.grad = None
373
+
374
+ return work, recv_buf, gathered_grads, recv_counts
375
+
376
+
377
  # ======================================================================
378
  # Main generator – thin orchestrator that wires stages together.
379
  # ======================================================================
 
388
  lr: float,
389
  weight_decay: float,
390
  none_grad: bool,
391
+ prelaunch_gather: tuple | None = None,
392
  ) -> Generator[None, None, None]:
393
  """Process one chunk of parameters through the full Muon pipeline.
394
 
 
405
  runs concurrently on the NCCL stream — no separate ``comm_stream``
406
  is required.
407
 
408
+ If ``prelaunch_gather`` is provided, the gather was already launched
409
+ by :func:`prelaunch_first_gather` and we skip launching it again.
410
+
411
  Yields exactly **2** times:
412
 
413
+ 1. After launching async all-to-all gather (or immediately if pre-launched).
414
  2. After launching async all-to-all scatter.
415
  """
416
  process_group = param_to_state[id(params[0])].process_group
 
419
  p for p in params if param_to_state[id(p)].worker_rank == rank
420
  ]
421
 
422
+ if prelaunch_gather is not None:
423
+ # Gather was pre-launched; none_grad already handled by caller.
424
+ work, recv_buf, gathered_grads, recv_counts = prelaunch_gather
425
+ else:
426
+ # Normal path: launch async gather.
427
+ with record_function("muon::launch_gather"):
428
+ work, recv_buf, gathered_grads, recv_counts = _launch_gather(
429
+ params, owned_params, param_to_state, rank, num_ranks,
430
+ process_group)
431
+
432
+ if none_grad:
433
+ for p in params:
434
+ p.grad = None
435
 
436
  yield # --- YIELD 1: other chunks can launch their gather ---
437
 
build/torch210-cxx11-cu126-x86_64-linux/qk_clip.py CHANGED
@@ -5,6 +5,8 @@ from dataclasses import dataclass
5
  import torch
6
  from torch.distributed.tensor import DTensor
7
 
 
 
8
  logger = logging.getLogger(__name__)
9
 
10
 
@@ -23,7 +25,7 @@ def parse_qk_layer(name: str) -> tuple[str | None, int]:
23
  'model.7.attn.k_proj.weight' -> ('k_proj', 7)
24
  'model.4.attn.v_proj.weight' -> (None, -1)
25
  """
26
- parts = name.split('.')
27
  if len(parts) < 3:
28
  return None, -1
29
 
@@ -100,23 +102,27 @@ def compute_scales(p, qk_clip_state):
100
  threshold = qk_clip_state.threshold
101
  logit = qk_clip_state.logit
102
 
103
- H_global = p.shape[0] // head_dim
104
- scales_full = torch.ones(H_global, device=p.data.device)
105
- scaling = 0
106
-
107
  for logit_idx, head_idx in enumerate(indices):
108
  v_ele = float(logit[logit_idx])
109
  if v_ele > threshold:
110
  new_scale = math.sqrt(threshold / v_ele)
111
- if new_scale < scales_full[head_idx]:
112
- scales_full[head_idx] = new_scale
113
  logger.info(
114
  f"[{kind}] Head {head_idx} exceeded threshold "
115
  f"(value={v_ele:.4f}, threshold={threshold:.4f}) -> applying scale={new_scale:.4f}"
116
  )
117
- scaling += 1
118
 
119
- return scales_full if scaling > 0 else None
 
 
 
 
 
 
 
120
 
121
 
122
  def qk_clip(p, scales, head_dim):
 
5
  import torch
6
  from torch.distributed.tensor import DTensor
7
 
8
+ from .core import normalize_fqn
9
+
10
  logger = logging.getLogger(__name__)
11
 
12
 
 
25
  'model.7.attn.k_proj.weight' -> ('k_proj', 7)
26
  'model.4.attn.v_proj.weight' -> (None, -1)
27
  """
28
+ parts = normalize_fqn(name).split('.')
29
  if len(parts) < 3:
30
  return None, -1
31
 
 
102
  threshold = qk_clip_state.threshold
103
  logit = qk_clip_state.logit
104
 
105
+ # Check if any head exceeds threshold before allocating.
106
+ head_scales = {}
 
 
107
  for logit_idx, head_idx in enumerate(indices):
108
  v_ele = float(logit[logit_idx])
109
  if v_ele > threshold:
110
  new_scale = math.sqrt(threshold / v_ele)
111
+ if head_idx not in head_scales or new_scale < head_scales[head_idx]:
112
+ head_scales[head_idx] = new_scale
113
  logger.info(
114
  f"[{kind}] Head {head_idx} exceeded threshold "
115
  f"(value={v_ele:.4f}, threshold={threshold:.4f}) -> applying scale={new_scale:.4f}"
116
  )
 
117
 
118
+ if not head_scales:
119
+ return None
120
+
121
+ H_global = p.shape[0] // head_dim
122
+ scales_full = torch.ones(H_global, device=p.data.device)
123
+ for head_idx, scale in head_scales.items():
124
+ scales_full[head_idx] = scale
125
+ return scales_full
126
 
127
 
128
  def qk_clip(p, scales, head_dim):
build/torch210-cxx11-cu128-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _optimizer_7aef62f_dirty
3
- ops = torch.ops._optimizer_7aef62f_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_optimizer_7aef62f_dirty::{op_name}"
 
1
  import torch
2
+ from . import _optimizer_5b58933_dirty
3
+ ops = torch.ops._optimizer_5b58933_dirty
4
 
5
def add_op_namespace_prefix(op_name: str):
    """Return *op_name* qualified with this extension's op namespace."""
    return "_optimizer_5b58933_dirty::" + op_name
build/torch210-cxx11-cu128-x86_64-linux/{_optimizer_7aef62f_dirty.abi3.so → _optimizer_5b58933_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4919c48c77c6223dbf668f1461bcec175ef1bd6ea4cec8c2509de12ca7200a62
3
  size 2004144
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1abfa69cd254e0000246a074c0bfa53c2e72bb53cc5fa8216275295cd021c57a
3
  size 2004144
build/torch210-cxx11-cu128-x86_64-linux/adamw.py CHANGED
@@ -1,8 +1,12 @@
 
1
  from collections import defaultdict
2
  from typing import cast
3
 
4
  import torch
5
  from torch.distributed.tensor import DTensor
 
 
 
6
 
7
 
8
  def fused_adamw(
@@ -72,54 +76,72 @@ def fused_adamw(
72
  )
73
 
74
 
75
- def step_adamw_params(optimizer_state, params, group):
76
- """Run fused AdamW on a list of parameters sharing the same placement.
 
77
 
78
- Args:
79
- optimizer_state: The optimizer's state dict (self.state in Muon).
80
- params: List of parameters to update.
81
- group: Parameter group dict with lr, adamw_betas, adamw_eps, weight_decay.
82
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  params_with_grads = []
84
  grads = []
85
  moment1 = []
86
  moment2 = []
87
- max_exp_avg_sqs = []
88
  state_steps = []
89
- lr = group["lr"]
90
- beta1, beta2 = group["adamw_betas"]
91
- eps = group["adamw_eps"]
92
- weight_decay = group["weight_decay"]
93
 
94
  for p in params:
95
  g = p.grad
96
  if g is None:
97
  continue
98
  state = optimizer_state[p]
99
- params_with_grads.append(p)
100
- grads.append(g)
101
  if "step" not in state:
102
- state["step"] = (torch.zeros((),
103
- dtype=torch.float32,
104
- device=p.device))
105
  state["moment1"] = torch.zeros_like(g)
106
  state["moment2"] = torch.zeros_like(g)
107
- moment1.append(state["moment1"])
108
- moment2.append(state["moment2"])
109
  if not isinstance(state["step"], torch.Tensor):
110
- step_tensor = torch.tensor(state["step"],
111
- dtype=torch.float32,
112
- device=p.device)
113
- else:
114
- step_tensor = state["step"]
115
- state_steps.append(step_tensor)
 
 
 
 
 
 
116
 
117
  fused_adamw(
118
  params_with_grads,
119
  grads,
120
  moment1,
121
  moment2,
122
- max_exp_avg_sqs,
123
  state_steps,
124
  amsgrad=False,
125
  beta1=beta1,
@@ -131,24 +153,119 @@ def step_adamw_params(optimizer_state, params, group):
131
  )
132
 
133
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  def step_adamw(optimizer_state, group):
135
  """Dispatch AdamW step, grouping parameters by type and placement.
136
 
 
 
 
137
  Args:
138
  optimizer_state: The optimizer's state dict (self.state in Muon).
139
  group: Parameter group dict.
140
  """
141
  params = group["params"]
 
142
 
143
- # group params with its type and placement
144
- placement_to_params: dict[tuple, list[torch.Tensor]] = defaultdict(list)
145
- for p in params:
146
- match p:
147
- case DTensor():
148
- placement_to_params[tuple([p.placements,
149
- p.device_mesh])].append(p)
150
- case torch.Tensor():
151
- placement_to_params[tuple([torch.Tensor, None])].append(p)
152
-
153
- for group_params in placement_to_params.values():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  step_adamw_params(optimizer_state, group_params, group)
 
1
+ import logging
2
  from collections import defaultdict
3
  from typing import cast
4
 
5
  import torch
6
  from torch.distributed.tensor import DTensor
7
+ from torch.profiler import record_function
8
+
9
+ logger = logging.getLogger(__name__)
10
 
11
 
12
  def fused_adamw(
 
76
  )
77
 
78
 
79
+ def _to_local(t):
80
+ """Unwrap DTensor to local tensor for fused ops."""
81
+ return t._local_tensor if isinstance(t, DTensor) else t
82
 
83
+
84
+ # ---------------------------------------------------------------------------
85
+ # Caches for eliminating per-step Python overhead.
86
+ #
87
+ # Placement grouping and tensor list assembly are identical every step
88
+ # (params don't change placement, moment/step tensors are the same objects
89
+ # after initialisation). We cache them keyed by id() of the param list
90
+ # stored in param_groups (stable across steps).
91
+ #
92
+ # Only gradients change each step and must be collected fresh.
93
+ # ---------------------------------------------------------------------------
94
+
95
+ # id(group["params"]) → dict[placement_key, list[param]]
96
+ _placement_cache: dict[int, dict[tuple, list]] = {}
97
+
98
+ # id(placement_group_list) → (params_local, moment1, moment2, state_steps)
99
+ _tensor_cache: dict[int, tuple[list, list, list, list]] = {}
100
+
101
+
102
+ def _step_adamw_params_slow(optimizer_state, params, group):
103
+ """Uncached fallback for the rare case where some params lack grads."""
104
  params_with_grads = []
105
  grads = []
106
  moment1 = []
107
  moment2 = []
 
108
  state_steps = []
 
 
 
 
109
 
110
  for p in params:
111
  g = p.grad
112
  if g is None:
113
  continue
114
  state = optimizer_state[p]
115
+ params_with_grads.append(_to_local(p))
116
+ grads.append(_to_local(g))
117
  if "step" not in state:
118
+ state["step"] = torch.zeros((),
119
+ dtype=torch.float32,
120
+ device=p.device)
121
  state["moment1"] = torch.zeros_like(g)
122
  state["moment2"] = torch.zeros_like(g)
123
+ moment1.append(_to_local(state["moment1"]))
124
+ moment2.append(_to_local(state["moment2"]))
125
  if not isinstance(state["step"], torch.Tensor):
126
+ state["step"] = torch.tensor(state["step"],
127
+ dtype=torch.float32,
128
+ device=p.device)
129
+ state_steps.append(state["step"])
130
+
131
+ if not params_with_grads:
132
+ return
133
+
134
+ lr = group["lr"]
135
+ beta1, beta2 = group["adamw_betas"]
136
+ eps = group["adamw_eps"]
137
+ weight_decay = group["weight_decay"]
138
 
139
  fused_adamw(
140
  params_with_grads,
141
  grads,
142
  moment1,
143
  moment2,
144
+ [],
145
  state_steps,
146
  amsgrad=False,
147
  beta1=beta1,
 
153
  )
154
 
155
 
156
def step_adamw_params(optimizer_state, params, group):
    """Run fused AdamW on a list of parameters sharing the same placement.

    After the first call, cached tensor lists (params_local, moment1,
    moment2, state_steps) are reused — only gradients are collected fresh.

    Args:
        optimizer_state: The optimizer's state dict (self.state in Muon).
        params: List of parameters to update.
        group: Parameter group dict with lr, adamw_betas, adamw_eps, weight_decay.
    """
    # Collect grads — the only thing that changes each step.
    with record_function("adamw::collect_grads"):
        grads = []
        for p in params:
            g = p.grad
            if g is None:
                # Rare: fall back to slow path that filters per-param.
                _step_adamw_params_slow(optimizer_state, params, group)
                return
            grads.append(_to_local(g))

    # NOTE(review): cache key is id() of the params list object. Safe only
    # while that list is kept alive (e.g. by the optimizer's param_groups);
    # if it were freed and its id reused, a stale cache entry would be
    # returned — confirm no caller rebuilds the params list between steps.
    tensor_key = id(params)
    if tensor_key not in _tensor_cache:
        with record_function("adamw::init_tensor_cache"):
            params_local = []
            moment1 = []
            moment2 = []
            state_steps = []

            for p in params:
                state = optimizer_state[p]
                params_local.append(_to_local(p))
                if "step" not in state:
                    # Lazy state init: fp32 scalar step plus two moment
                    # buffers shaped like the gradient.
                    state["step"] = torch.zeros((),
                                                dtype=torch.float32,
                                                device=p.device)
                    state["moment1"] = torch.zeros_like(p.grad)
                    state["moment2"] = torch.zeros_like(p.grad)
                moment1.append(_to_local(state["moment1"]))
                moment2.append(_to_local(state["moment2"]))
                if not isinstance(state["step"], torch.Tensor):
                    # Checkpoint-loaded scalar step → promote to a tensor
                    # so the fused kernel can update it in place.
                    state["step"] = torch.tensor(state["step"],
                                                 dtype=torch.float32,
                                                 device=p.device)
                state_steps.append(state["step"])

            _tensor_cache[tensor_key] = (params_local, moment1, moment2,
                                         state_steps)

    params_local, moment1, moment2, state_steps = _tensor_cache[tensor_key]

    # Hyperparameters are re-read every step so schedulers that mutate the
    # group dict take effect immediately despite the tensor caching above.
    lr = group["lr"]
    beta1, beta2 = group["adamw_betas"]
    eps = group["adamw_eps"]
    weight_decay = group["weight_decay"]

    with record_function("adamw::fused_adamw"):
        fused_adamw(
            params_local,
            grads,
            moment1,
            moment2,
            [],
            state_steps,
            amsgrad=False,
            beta1=beta1,
            beta2=beta2,
            lr=lr,
            weight_decay=weight_decay,
            eps=eps,
            maximize=False,
        )
229
+
230
+
231
def step_adamw(optimizer_state, group):
    """Dispatch AdamW step, grouping parameters by type and placement.

    Placement grouping is cached after the first call since params never
    change their placement between steps.

    Args:
        optimizer_state: The optimizer's state dict (self.state in Muon).
        group: Parameter group dict.
    """
    params = group["params"]
    # Cache key is the identity of the params list (held by param_groups),
    # so the grouping pass below runs once per optimizer lifetime.
    placement_key = id(params)

    if placement_key not in _placement_cache:
        with record_function("adamw::group_by_placement"):
            placement_to_params: dict[tuple,
                                      list[torch.Tensor]] = defaultdict(list)
            for p in params:
                # NOTE(review): the DTensor arm must stay before the plain
                # torch.Tensor arm — a DTensor would presumably also match
                # the Tensor pattern; confirm ordering if arms are edited.
                match p:
                    case DTensor():
                        logger.debug(
                            "[AdamW] DTensor param: shape=%s, placements=%s, "
                            "mesh=%s, grad=%s", p.shape, p.placements,
                            p.device_mesh.mesh_dim_names,
                            p.grad.shape if p.grad is not None else None)
                        placement_to_params[tuple(
                            [p.placements, p.device_mesh])].append(p)
                    case torch.Tensor():
                        logger.debug(
                            "[AdamW] plain param: shape=%s, grad=%s", p.shape,
                            p.grad.shape if p.grad is not None else None)
                        placement_to_params[tuple([torch.Tensor,
                                                   None])].append(p)

            logger.debug("[AdamW] %d placement groups, %d total params",
                         len(placement_to_params), len(params))

            _placement_cache[placement_key] = dict(placement_to_params)

    for group_params in _placement_cache[placement_key].values():
        step_adamw_params(optimizer_state, group_params, group)
build/torch210-cxx11-cu128-x86_64-linux/core.py CHANGED
@@ -1,11 +1,25 @@
 
1
  import math
2
  from dataclasses import dataclass
 
3
 
4
  import torch
5
- import torch.distributed as dist
6
  from torch.distributed import ProcessGroup
7
  from torch.distributed.tensor import DTensor
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
  @dataclass
11
  class _muon_state:
@@ -17,26 +31,71 @@ class _muon_state:
17
  qk_clip_state: torch.Tensor | None = None
18
 
19
 
20
- def update_g(optimizer_state, p, g, group, momentum):
21
- """Apply momentum update to gradient.
 
 
 
 
 
 
22
 
23
- Args:
24
- optimizer_state: The optimizer's state dict (self.state in Muon).
25
- p: Parameter tensor.
26
- g: Gradient tensor.
27
- group: Parameter group dict.
28
- momentum: Momentum coefficient.
29
 
30
- Returns:
31
- Momentum-updated gradient tensor.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  """
33
- state = optimizer_state[p]
34
- buf = state.setdefault("momentum_buffer", torch.zeros_like(g))
35
- torch.add(g, buf, alpha=momentum, out=buf)
36
- if group["nesterov"]:
37
- g.add_(buf, alpha=momentum)
38
- return g
39
- return buf
 
 
 
 
 
 
 
 
 
 
 
40
 
41
 
42
  def update_p(p, u, lr, adjusted_lr, weight_decay):
@@ -49,14 +108,13 @@ def update_p(p, u, lr, adjusted_lr, weight_decay):
49
  adjusted_lr: Size-adjusted learning rate.
50
  weight_decay: Weight decay coefficient.
51
  """
52
- if isinstance(p, torch.nn.Parameter):
53
- # apply weight decay
54
- p.data.mul_(1 - lr * weight_decay)
55
- # apply update
56
- p.data.add_(u, alpha=-adjusted_lr)
57
- else:
58
- p.mul_(1 - lr * weight_decay)
59
- p.add_(u, alpha=-adjusted_lr)
60
 
61
 
62
  def adjust_lr_for_muon(lr, param_shape):
@@ -77,14 +135,55 @@ def adjust_lr_for_muon(lr, param_shape):
77
  return adjusted_lr
78
 
79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  def default_is_muon(name, x, expert_keys=None):
81
- skip_keys = ["embed_tokens", "lm_head", "tok_embeddings", "output"]
82
- if any(key in name for key in skip_keys):
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  return False
84
  effective_ndim = x.ndim
85
- if expert_keys and any(key in name for key in expert_keys):
 
86
  effective_ndim -= 1
87
- return effective_ndim >= 2
 
 
 
 
 
88
 
89
 
90
  def get_default_muon_param_groups(model, is_muon_func=None, expert_keys=None):
@@ -92,7 +191,7 @@ def get_default_muon_param_groups(model, is_muon_func=None, expert_keys=None):
92
  is_muon_func = lambda n, x: default_is_muon(n, x, expert_keys)
93
 
94
  muon_params, muon_names = [], []
95
- non_muon_params = []
96
 
97
  for n, p in model.named_parameters():
98
  if not p.requires_grad:
@@ -102,6 +201,10 @@ def get_default_muon_param_groups(model, is_muon_func=None, expert_keys=None):
102
  muon_names.append(n)
103
  else:
104
  non_muon_params.append(p)
 
 
 
 
105
 
106
  return [
107
  {
 
1
+ import logging
2
  import math
3
  from dataclasses import dataclass
4
+ from typing import List
5
 
6
  import torch
 
7
  from torch.distributed import ProcessGroup
8
  from torch.distributed.tensor import DTensor
9
 
10
# torch.compile wraps modules in OptimizedModule and injects "_orig_mod" into
# parameter FQNs; activation checkpointing likewise injects
# "_checkpoint_wrapped_module". These synthetic components are stripped so
# that name-based matching (skip_keys, expert_keys, QK layer parsing) is
# unaffected by wrapper nesting.
_WRAPPER_PARTS = frozenset({"_orig_mod", "_checkpoint_wrapped_module"})

logger = logging.getLogger(__name__)


def normalize_fqn(name: str) -> str:
    """Strip torch.compile / checkpoint wrapper components from a parameter FQN."""
    kept = [component for component in name.split(".")
            if component not in _WRAPPER_PARTS]
    return ".".join(kept)
22
+
23
 
24
  @dataclass
25
  class _muon_state:
 
31
  qk_clip_state: torch.Tensor | None = None
32
 
33
 
34
def _batch_momentum(
    grads: List[torch.Tensor],
    momentum_bufs: List[torch.Tensor],
    momentum: torch.Tensor,
) -> None:
    """Batched momentum update (no nesterov).

    In-place: each buffer becomes ``buf * momentum + grad``. ``grads``
    are only read here.
    """
    torch._foreach_mul_(momentum_bufs, momentum)
    torch._foreach_add_(momentum_bufs, grads)
42
 
 
 
 
 
 
 
43
 
44
def _batch_momentum_nesterov(
    grads: List[torch.Tensor],
    momentum_bufs: List[torch.Tensor],
    momentum: torch.Tensor,
) -> None:
    """Batched momentum update with nesterov correction.

    In-place on both lists: buffers become ``buf * momentum + grad`` and
    grads become ``grad + momentum * buf`` (nesterov look-ahead).
    """
    torch._foreach_mul_(momentum_bufs, momentum)
    torch._foreach_add_(momentum_bufs, grads)
    # Out-of-place mul: the buffers must keep buf*momentum+grad; only the
    # grads receive the extra look-ahead term.
    nesterov_terms = torch._foreach_mul(momentum_bufs, momentum)
    torch._foreach_add_(grads, nesterov_terms)
54
+
55
+
56
+ _compiled_momentum: dict[bool, callable] = {}
57
+ _use_momentum_compile = True
58
+
59
+
60
def set_momentum_compile(enabled: bool):
    """Toggle torch.compile for batched momentum.

    Affects subsequent ``batch_pre_ortho`` calls; already-compiled
    variants stay cached in ``_compiled_momentum`` but are bypassed
    while disabled.
    """
    global _use_momentum_compile
    _use_momentum_compile = enabled
64
+
65
+
66
def batch_pre_ortho(
    grads: List[torch.Tensor],
    momentum_bufs: List[torch.Tensor],
    momentum: torch.Tensor,
    nesterov: bool,
) -> None:
    """Batched momentum update on lists of plain tensors.

    Mirrors dion's ``muon_update_pre_orthogonalize``.
    Inputs must be plain CUDA tensors (not DTensor).
    Modifies ``momentum_bufs`` and (for nesterov) ``grads`` in-place.

    When compile is enabled, uses separately compiled functions for
    nesterov=True/False to avoid graph breaks from the branch.
    """
    if nesterov:
        impl = _batch_momentum_nesterov
    else:
        impl = _batch_momentum
    if _use_momentum_compile:
        compiled = _compiled_momentum.get(nesterov)
        if compiled is None:
            compiled = torch.compile(impl)
            _compiled_momentum[nesterov] = compiled
        impl = compiled
    impl(grads, momentum_bufs, momentum)
87
+
88
+
89
+ def _update_p_impl(p_data, u_data, lr, adjusted_lr, weight_decay):
90
+ """Weight-decay + update on plain tensors.
91
+
92
+ Not compiled: per-param @torch.compile caused ~0.25ms TorchDynamo cache
93
+ lookup per call × 256+ params = massive overhead. The pipeline path uses
94
+ batched _foreach_* ops instead; this function remains for base() and
95
+ distributed_muon().
96
+ """
97
+ p_data.mul_(1 - lr * weight_decay)
98
+ p_data.add_(u_data, alpha=-adjusted_lr)
99
 
100
 
101
  def update_p(p, u, lr, adjusted_lr, weight_decay):
 
108
  adjusted_lr: Size-adjusted learning rate.
109
  weight_decay: Weight decay coefficient.
110
  """
111
+ # Unwrap Parameter -> underlying data tensor.
112
+ p_data = p.data if isinstance(p, torch.nn.Parameter) else p
113
+ # Unwrap DTensor -> local CUDA tensor for compiled kernel.
114
+ if isinstance(p_data, DTensor):
115
+ p_data = p_data._local_tensor
116
+ u_data = u._local_tensor if isinstance(u, DTensor) else u
117
+ _update_p_impl(p_data, u_data, lr, adjusted_lr, weight_decay)
 
118
 
119
 
120
  def adjust_lr_for_muon(lr, param_shape):
 
135
  return adjusted_lr
136
 
137
 
138
+ def _match_key(parts, key):
139
+ """Check if key matches as contiguous components in parts.
140
+
141
+ Single-component keys (e.g. "experts") match any single component.
142
+ Multi-component keys (e.g. "experts.w1") match as a contiguous subsequence.
143
+ """
144
+ key_parts = key.split(".")
145
+ key_len = len(key_parts)
146
+ if key_len == 1:
147
+ return key in parts
148
+ return any(parts[i:i + key_len] == key_parts
149
+ for i in range(len(parts) - key_len + 1))
150
+
151
+
152
def is_expert_param(name, expert_keys):
    """Check if a parameter name matches any expert key (component-level)."""
    if not expert_keys:
        return False
    components = normalize_fqn(name).split(".")
    for key in expert_keys:
        if _match_key(components, key):
            return True
    return False
158
+
159
+
160
def default_is_muon(name, x, expert_keys=None):
    """Decide whether parameter *name*/*x* is routed to Muon (vs AdamW).

    Embedding/head-style parameters are excluded by name; expert params
    have their leading (expert) dim discounted before the >=2-D check.
    """
    normalized = normalize_fqn(name)
    parts = normalized.split(".")
    skip_keys = {
        "embed_tokens",
        "lm_head",
        "tok_embeddings",
        "output",
        "mhc_attn",
        "mhc_ffn",
        "lambda_proj",
    }
    if not skip_keys.isdisjoint(parts):
        logger.info(
            "[is_muon] %s (orig: %s): skip (matched skip_key), ndim=%d",
            normalized, name, x.ndim)
        return False
    is_expert = is_expert_param(name, expert_keys)
    effective_ndim = x.ndim - 1 if is_expert else x.ndim
    result = effective_ndim >= 2
    logger.info(
        "[is_muon] %s (orig: %s): ndim=%d, expert=%s, effective_ndim=%d → %s",
        normalized, name, x.ndim, is_expert, effective_ndim,
        "Muon" if result else "AdamW")
    return result
187
 
188
 
189
  def get_default_muon_param_groups(model, is_muon_func=None, expert_keys=None):
 
191
  is_muon_func = lambda n, x: default_is_muon(n, x, expert_keys)
192
 
193
  muon_params, muon_names = [], []
194
+ non_muon_params, non_muon_names = [], []
195
 
196
  for n, p in model.named_parameters():
197
  if not p.requires_grad:
 
201
  muon_names.append(n)
202
  else:
203
  non_muon_params.append(p)
204
+ non_muon_names.append(n)
205
+
206
+ logger.info("[param_groups] expert_keys=%s, Muon=%d, AdamW=%d",
207
+ expert_keys, len(muon_names), len(non_muon_names))
208
 
209
  return [
210
  {
build/torch210-cxx11-cu128-x86_64-linux/cpu_offload.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """CPU offloading for optimizer states.
2
+
3
+ Manages a pinned CPU memory pool and async CUDA streams to offload
4
+ optimizer state tensors (momentum buffers, Adam moments) to CPU between
5
+ optimizer steps, freeing GPU memory.
6
+
7
+ All tracked tensors are packed into a single flat pinned CPU buffer
8
+ (per dtype). D2H and H2D copies are performed per-tensor directly
9
+ between individual GPU tensors and their slice of the CPU flat buffer
10
+ — no GPU staging buffer is allocated, so there is **no temporary GPU
11
+ memory spike** during offload or reload.
12
+
13
+ Individual tensor storages are freed after offload via
14
+ ``untyped_storage().resize_(0)``, preserving tensor identity so
15
+ downstream caches remain valid.
16
+ """
17
+
18
+ import logging
19
+ from collections import defaultdict
20
+
21
+ import torch
22
+ from torch.distributed.tensor import DTensor
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+
27
+ class CPUOffloadPool:
28
+ """Pinned CPU memory pool for async optimizer state offloading.
29
+
30
+ Tracked tensors are grouped by dtype. Each group gets a single flat
31
+ pinned CPU buffer. D2H / H2D copies are per-tensor (into slices of
32
+ the flat buffer) to avoid allocating a GPU staging buffer.
33
+ """
34
+
35
+ def __init__(self):
36
+ self._managed: list[torch.Tensor] = []
37
+ self._storage_nbytes: dict[int, int] = {} # id(t) → bytes
38
+
39
+ # Per-dtype group: populated on first offload.
40
+ # dtype → dict with keys:
41
+ # "indices" : list[int] managed-list indices
42
+ # "offsets" : list[tuple[int,int]] (start, numel) in flat buf
43
+ # "total" : int total numel
44
+ # "cpu_flat" : Tensor pinned CPU buffer
45
+ self._groups: dict[torch.dtype, dict] = {}
46
+
47
+ self._offload_stream: torch.cuda.Stream | None = None
48
+ self._device: torch.device | None = None
49
+ self._initialized: bool = False
50
+ self._logged: bool = False
51
+
52
+ # ------------------------------------------------------------------
53
+ @staticmethod
54
+ def _local(t: torch.Tensor) -> torch.Tensor:
55
+ """Unwrap DTensor to its local CUDA tensor."""
56
+ return t._local_tensor if isinstance(t, DTensor) else t
57
+
58
+ def _ensure_stream(self):
59
+ if self._offload_stream is None:
60
+ self._offload_stream = torch.cuda.Stream(device=self._device)
61
+
62
+ # ------------------------------------------------------------------
63
+ def track(self, tensor: torch.Tensor):
64
+ """Register a GPU tensor for CPU offloading. Idempotent."""
65
+ tid = id(tensor)
66
+ if tid in self._storage_nbytes:
67
+ return
68
+ local = self._local(tensor)
69
+ if self._device is None:
70
+ self._device = local.device
71
+ self._storage_nbytes[tid] = local.untyped_storage().size()
72
+ self._managed.append(tensor)
73
+
74
+ # ------------------------------------------------------------------
75
+ def _init_buffers(self):
76
+ """Build per-dtype flat buffers on first offload."""
77
+ # Group managed tensors by dtype.
78
+ dtype_map: dict[torch.dtype, list[tuple[int, int]]] = defaultdict(list)
79
+ for idx, t in enumerate(self._managed):
80
+ local = self._local(t)
81
+ dtype_map[local.dtype].append((idx, local.numel()))
82
+
83
+ total_cpu_bytes = 0
84
+ for dtype, entries in dtype_map.items():
85
+ offsets: list[tuple[int, int]] = []
86
+ indices: list[int] = []
87
+ off = 0
88
+ for idx, n in entries:
89
+ indices.append(idx)
90
+ offsets.append((off, n))
91
+ off += n
92
+ cpu_flat = torch.empty(off, dtype=dtype, device="cpu", pin_memory=True)
93
+ self._groups[dtype] = {
94
+ "indices": indices,
95
+ "offsets": offsets,
96
+ "total": off,
97
+ "cpu_flat": cpu_flat,
98
+ }
99
+ total_cpu_bytes += off * cpu_flat.element_size()
100
+
101
+ self._initialized = True
102
+ logger.info(
103
+ "[CPUOffload] Pool initialized: %d tensors, %d dtype group(s), "
104
+ "%.2f MB pinned CPU memory",
105
+ len(self._managed),
106
+ len(self._groups),
107
+ total_cpu_bytes / (1024**2),
108
+ )
109
+
110
+ # ------------------------------------------------------------------
111
+ def offload(self):
112
+ """Per-tensor async D2H into CPU flat buffer, then free GPU storage."""
113
+ if not self._managed:
114
+ return
115
+ if not self._initialized:
116
+ self._init_buffers()
117
+ self._ensure_stream()
118
+
119
+ # Offload stream waits for compute to finish.
120
+ compute_event = torch.cuda.current_stream(
121
+ self._device).record_event()
122
+ self._offload_stream.wait_event(compute_event)
123
+
124
+ offloaded_bytes = 0
125
+
126
+ # Per-tensor D2H copies directly into CPU flat buffer slices.
127
+ # No GPU staging buffer → no temporary GPU memory spike.
128
+ with torch.cuda.stream(self._offload_stream):
129
+ for dtype, grp in self._groups.items():
130
+ indices = grp["indices"]
131
+ offsets = grp["offsets"]
132
+ cpu_flat = grp["cpu_flat"]
133
+
134
+ for i, mgd_idx in enumerate(indices):
135
+ local = self._local(self._managed[mgd_idx])
136
+ off, n = offsets[i]
137
+ cpu_flat[off:off + n].copy_(
138
+ local.reshape(-1), non_blocking=True)
139
+
140
+ offloaded_bytes += grp["total"] * cpu_flat.element_size()
141
+
142
+ # Wait for all D2H copies to land, then free GPU storage.
143
+ self._offload_stream.synchronize()
144
+ for t in self._managed:
145
+ self._local(t).untyped_storage().resize_(0)
146
+
147
+ if not self._logged:
148
+ logger.info("[CPUOffload] Offloaded %.2f MB (GPU → CPU)",
149
+ offloaded_bytes / (1024**2))
150
+
151
+ # ------------------------------------------------------------------
152
+ def reload(self):
153
+ """Per-tensor H2D from CPU flat buffer on the default stream.
154
+
155
+ Runs on the current (default) CUDA stream to avoid stream
156
+ interaction issues with the parallel Muon pipeline. Since
157
+ pinned CPU memory is the source, the copies overlap with
158
+ GPU idle time between steps.
159
+ """
160
+ if not self._managed or not self._initialized:
161
+ return
162
+
163
+ reloaded_bytes = 0
164
+
165
+ # Re-allocate all GPU storages first.
166
+ for t in self._managed:
167
+ local = self._local(t)
168
+ local.untyped_storage().resize_(self._storage_nbytes[id(t)])
169
+
170
+ # Per-tensor H2D copies from CPU flat buffer slices.
171
+ # non_blocking=True with pinned source allows DMA overlap.
172
+ for dtype, grp in self._groups.items():
173
+ indices = grp["indices"]
174
+ offsets = grp["offsets"]
175
+ cpu_flat = grp["cpu_flat"]
176
+
177
+ for i, mgd_idx in enumerate(indices):
178
+ local = self._local(self._managed[mgd_idx])
179
+ off, n = offsets[i]
180
+ local.reshape(-1).copy_(
181
+ cpu_flat[off:off + n], non_blocking=True)
182
+
183
+ reloaded_bytes += grp["total"] * cpu_flat.element_size()
184
+
185
+ if not self._logged:
186
+ logger.info("[CPUOffload] Reloaded %.2f MB (CPU → GPU)",
187
+ reloaded_bytes / (1024**2))
188
+ self._logged = True
build/torch210-cxx11-cu128-x86_64-linux/distributed/utils.py CHANGED
@@ -72,12 +72,6 @@ def get_slices_of_dtensor(
72
  else:
73
  curr_size = target.size()[shard_dim]
74
 
75
- if curr_size % num_chunks != 0:
76
- raise NotImplementedError(
77
- f"Dimension size {curr_size} is not divisible "
78
- f"by number of ranks {num_chunks} for shard "
79
- f"placement on dim {shard_dim}. (shape: {target.shape})")
80
-
81
  # Compute indices for this level of sharding
82
  if isinstance(placement, _StridedShard):
83
  _shard_size, offsets = _StridedShard.local_shard_size_and_offset(
 
72
  else:
73
  curr_size = target.size()[shard_dim]
74
 
 
 
 
 
 
 
75
  # Compute indices for this level of sharding
76
  if isinstance(placement, _StridedShard):
77
  _shard_size, offsets = _StridedShard.local_shard_size_and_offset(
build/torch210-cxx11-cu128-x86_64-linux/matmul_transpose_triton.py CHANGED
@@ -43,6 +43,7 @@ def get_autotune_config():
43
  @triton.autotune(
44
  configs=get_autotune_config(),
45
  key=['M', 'K'],
 
46
  )
47
  @triton.jit
48
  def mmt_kernel(x, y, M, K, stride_xm, stride_xk, stride_ym, stride_yn,
@@ -102,16 +103,10 @@ def mmt_kernel(x, y, M, K, stride_xm, stride_xk, stride_ym, stride_yn,
102
  tl.store(ct_ptrs, tl.permute(c, (1, 0)), mask=ct_mask)
103
 
104
 
105
- def matmul_transpose_assign(d_in, d_out):
106
- assert d_in.is_cuda, "Input `d_in` must be a CUDA tensor"
107
- assert d_out.is_cuda, "Input `d_out` must be a CUDA tensor"
108
- assert d_in.device == d_out.device, "Inputs `d_in` and `d_out` must be on the same CUDA device"
109
- assert d_in.dtype == d_out.dtype, "Inputs must have the same data type"
110
- assert d_in.ndim == 2, "Input `d_in` must be a 2D tensor"
111
- assert d_out.ndim == 2, "Input `d_out` must be a 2D tensor"
112
- assert d_in.size(0) == d_out.size(0) == d_out.size(0), \
113
- "First dimension of `d_in` must match first and second dimension of `d_out`"
114
-
115
  d_in = d_in.contiguous()
116
  M, K = d_in.shape
117
  grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(
@@ -119,3 +114,9 @@ def matmul_transpose_assign(d_in, d_out):
119
  with torch.cuda.device(d_in.device.index):
120
  mmt_kernel[grid](d_in, d_out, M, K, d_in.stride(0), d_in.stride(1),
121
  d_out.stride(0), d_out.stride(1))
 
 
 
 
 
 
 
43
  @triton.autotune(
44
  configs=get_autotune_config(),
45
  key=['M', 'K'],
46
+ restore_value=['y'],
47
  )
48
  @triton.jit
49
  def mmt_kernel(x, y, M, K, stride_xm, stride_xk, stride_ym, stride_yn,
 
103
  tl.store(ct_ptrs, tl.permute(c, (1, 0)), mask=ct_mask)
104
 
105
 
106
+ @torch.library.custom_op("muon::matmul_transpose_assign",
107
+ mutates_args=("d_out", ))
108
+ def matmul_transpose_assign(d_in: torch.Tensor, d_out: torch.Tensor) -> None:
109
+ """Compute d_out = d_in @ d_in.T using an optimized Triton kernel."""
 
 
 
 
 
 
110
  d_in = d_in.contiguous()
111
  M, K = d_in.shape
112
  grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(
 
114
  with torch.cuda.device(d_in.device.index):
115
  mmt_kernel[grid](d_in, d_out, M, K, d_in.stride(0), d_in.stride(1),
116
  d_out.stride(0), d_out.stride(1))
117
+
118
+
119
+ @matmul_transpose_assign.register_fake
120
+ def _(d_in: torch.Tensor, d_out: torch.Tensor) -> None:
121
+ """FakeTensor impl: d_out is already allocated, mutation is declared."""
122
+ pass
build/torch210-cxx11-cu128-x86_64-linux/muon.py CHANGED
@@ -10,13 +10,16 @@ from torch.profiler import record_function
10
 
11
  from .adamw import step_adamw
12
  from .async_utils import run_pipeline
13
- from .core import (_muon_state, adjust_lr_for_muon,
14
- get_default_muon_param_groups, update_g, update_p)
 
15
  from .distributed.utils import (_is_shard, construct_shard_mesh,
16
  get_slices_of_dtensor)
17
  from .newton_schulz import (COMM_DTYPE, DEFAULT_CHUNK_SIZE_RATIO,
18
- _zeropower_via_newtonschulz5)
19
- from .pipeline import muon_chunk_pipeline
 
 
20
  from .qk_clip import compute_scales, get_qk_clip_info, qk_clip
21
 
22
  logger = logging.getLogger(__name__)
@@ -45,9 +48,21 @@ def _expand_expert_params(names, params, expert_keys):
45
  expanded_params = []
46
 
47
  for n, p in zip(names, params):
48
- is_expert = expert_keys and any(key in n for key in expert_keys)
49
  is_dtensor = isinstance(p.data, DTensor)
50
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  if not is_expert:
52
  assert p.data.ndim <= 2, (
53
  f"Param {n} has ndim={p.data.ndim} but does not match "
@@ -168,7 +183,6 @@ class Muon(torch.optim.Optimizer):
168
  Use shard ranks * DEFAULT_CHUNK_SIZE_RATIO when -1 is specified.
169
  use_distributed_muon: Use distributed muon by Liu et al. (2024).
170
  For testing purpose only.
171
- small_param_numel_threshold: Threshold for classifying parameters as small and falling back to distributed Muon
172
  expert_keys: List of strings to identify expert-parallel parameters.
173
  If any key appears in a parameter's name, its outermost
174
  dimension is treated as the expert dimension and expanded
@@ -193,8 +207,8 @@ class Muon(torch.optim.Optimizer):
193
  warmup_step=5,
194
  chunk_size=-1,
195
  use_distributed_muon=False,
196
- small_param_numel_threshold=65536,
197
- expert_keys=None):
198
  defaults = dict(
199
  lr=lr,
200
  weight_decay=weight_decay,
@@ -228,8 +242,12 @@ class Muon(torch.optim.Optimizer):
228
  self.warmup_step = warmup_step
229
  self.chunk_size = chunk_size
230
  self.use_distributed_muon = use_distributed_muon
231
- self.small_param_numel_threshold = small_param_numel_threshold
232
  self.expert_keys = expert_keys
 
 
 
 
 
233
 
234
  def _calc_flops(self, G, steps):
235
  assert len(G.shape) == 2
@@ -333,8 +351,8 @@ class Muon(torch.optim.Optimizer):
333
  if g is None:
334
  continue
335
 
336
- u = _zeropower_via_newtonschulz5(g.to(COMM_DTYPE),
337
- steps=group["ns_steps"])
338
 
339
  adjusted_lr = adjust_lr_for_muon(lr, p.shape)
340
  update_p(p, u, lr, adjusted_lr, weight_decay)
@@ -355,52 +373,269 @@ class Muon(torch.optim.Optimizer):
355
  weight_decay: float,
356
  qk_logits: list[torch.Tensor | DTensor] | None,
357
  ):
358
- """ Implementation of Distributed Muon by Liu et al. """
359
 
360
- # Momentum is already applied by _step_muon before this method.
361
- for n, p in zip(names, params):
362
- g = p.grad
363
- if g is None:
364
- continue
365
-
366
- # Gather G
367
- if isinstance(p.data, DTensor):
368
- g_full = g.full_tensor()
369
- p_full = p.data.full_tensor()
370
- else:
371
- g_full = g
372
- p_full = p
373
-
374
- u_full = _zeropower_via_newtonschulz5(g_full.to(COMM_DTYPE),
375
- steps=group["ns_steps"])
376
-
377
- adjusted_lr = adjust_lr_for_muon(lr, p_full.shape)
378
- update_p(p_full, u_full, lr, adjusted_lr, weight_decay)
379
 
380
- qk_clip_state = get_qk_clip_info(self.clip_config, n, qk_logits)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
381
 
382
- scales_full = compute_scales(
383
- p_full, qk_clip_state) if qk_clip_state is not None else None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
384
 
385
- if scales_full is not None:
386
- qk_clip(p_full, scales_full, qk_clip_state.head_dim)
 
 
387
 
388
- if isinstance(p.data, DTensor):
389
- ndims = len(p.device_mesh.mesh.shape)
390
- p_replicate = DTensor.from_local(
391
- p_full,
392
- device_mesh=p.device_mesh,
393
- placements=[Replicate() for _ in range(ndims)],
394
- )
395
 
396
- p_sharded = p_replicate.redistribute(
397
- device_mesh=p.device_mesh,
398
- placements=p.placements,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
399
  )
400
 
401
- p.copy_(p_sharded)
402
 
403
- def parallel(self, names, params, group, lr, weight_decay, qk_logits):
 
 
 
 
 
 
 
404
  """
405
  Perform a parallel optimization step using Muon.
406
 
@@ -409,31 +644,23 @@ class Muon(torch.optim.Optimizer):
409
  interleaves multiple chunks so that communication and computation
410
  overlap across chunks (the same overlap previously achieved by the
411
  warmup + main-loop index scheduling).
 
 
 
 
412
  """
413
 
414
  # Momentum is already applied by _step_muon before this method.
415
 
416
- param_to_state, ordered_params = self.init_state_and_assign_params(
417
- names, params, group, qk_logits)
418
-
419
- # Compute local rank for this group's shard process group.
420
- shard_pg = param_to_state[id(ordered_params[0])].process_group
421
- rank = dist.get_rank(group=shard_pg)
422
-
423
- if self.chunk_size == -1:
424
- shard_ranks = dist.get_world_size(param_to_state[id(
425
- ordered_params[0])].process_group)
426
- chunk_size = shard_ranks * DEFAULT_CHUNK_SIZE_RATIO
427
- elif self.chunk_size > 0:
428
- chunk_size = self.chunk_size
429
- else:
430
- raise ValueError("chunk_size must be -1 or a positive integer.")
431
 
432
  def pipelines():
 
433
  for start in range(0, len(ordered_params), chunk_size):
434
  chunk = ordered_params[start:start + chunk_size]
435
  if chunk:
436
- yield muon_chunk_pipeline(
437
  params=chunk,
438
  param_to_state=param_to_state,
439
  rank=rank,
@@ -442,9 +669,11 @@ class Muon(torch.optim.Optimizer):
442
  weight_decay=weight_decay,
443
  none_grad=group["none_grad"],
444
  )
 
 
 
 
445
 
446
- with record_function("muon::barrier"):
447
- dist.barrier()
448
  with record_function("muon::pipeline"):
449
  run_pipeline(pipelines(), max_concurrent=self.warmup_step + 1)
450
 
@@ -456,16 +685,152 @@ class Muon(torch.optim.Optimizer):
456
  names = group["names"]
457
 
458
  # Apply momentum to all params before routing/expansion.
 
459
  with record_function("muon::momentum"):
460
- for n, p in zip(names, params):
461
- g = p.grad
462
- if g is None:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
463
  continue
464
- g = update_g(self.state, p, g, group, momentum)
465
- p.grad = g
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
466
 
467
  # Expand expert params by splitting on dim 0.
468
- names, params = _expand_expert_params(names, params, self.expert_keys)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
469
 
470
  param_dtensors = []
471
  name_dtensors = []
@@ -473,10 +838,10 @@ class Muon(torch.optim.Optimizer):
473
  param_tensors = []
474
  name_tensors = []
475
 
476
- param_dtensors_small = []
477
- name_dtensors_small = []
478
-
479
  if self.use_distributed_muon:
 
480
  self.distributed_muon(names=names,
481
  params=params,
482
  group=group,
@@ -485,8 +850,6 @@ class Muon(torch.optim.Optimizer):
485
  qk_logits=qk_logits)
486
  return
487
 
488
- # For simplicity, we use distributed Muon for small parameters
489
- # whose number of elements is below a threshold.
490
  for n, p in zip(names, params):
491
  if p is None or p.grad is None:
492
  continue
@@ -494,23 +857,28 @@ class Muon(torch.optim.Optimizer):
494
  if all(
495
  isinstance(placement, Replicate)
496
  for placement in p.placements):
 
 
 
497
  param_tensors.append(p)
498
  name_tensors.append(n)
499
- elif p.data.numel() <= self.small_param_numel_threshold:
500
- param_dtensors_small.append(p)
501
- name_dtensors_small.append(n)
502
  else:
 
 
 
 
503
  param_dtensors.append(p)
504
  name_dtensors.append(n)
505
  elif isinstance(p.data, torch.Tensor):
 
 
506
  param_tensors.append(p)
507
  name_tensors.append(n)
508
  else:
509
  raise TypeError(f"Unsupported parameter type: {type(p.data)}")
510
 
511
- logger.debug(
512
- f"[Muon] {len(param_dtensors)} DTensors, {len(param_tensors)} Tensors, "
513
- f"{len(param_dtensors_small)} Small DTensors")
514
 
515
  def group_dtensors(dtensors, names):
516
  # To support different placements, we group parameters by placements
@@ -526,21 +894,6 @@ class Muon(torch.optim.Optimizer):
526
  p.device_mesh])][1].append(p)
527
  return placement_to_params
528
 
529
- if len(param_dtensors_small) > 0:
530
- if not dist.is_initialized():
531
- raise RuntimeError(
532
- "Parallel Muon requires torch.distributed to be initialized."
533
- )
534
-
535
- self.distributed_muon(
536
- params=param_dtensors_small,
537
- names=name_dtensors_small,
538
- group=group,
539
- lr=lr,
540
- weight_decay=weight_decay,
541
- qk_logits=qk_logits,
542
- )
543
-
544
  if len(param_dtensors) > 0:
545
  if not dist.is_initialized():
546
  raise RuntimeError(
@@ -548,7 +901,26 @@ class Muon(torch.optim.Optimizer):
548
  )
549
 
550
  dtensor_group = group_dtensors(param_dtensors, name_dtensors)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
551
  for _, (names, params) in dtensor_group.items():
 
 
552
  self.parallel(
553
  names,
554
  params,
@@ -556,7 +928,10 @@ class Muon(torch.optim.Optimizer):
556
  lr=lr,
557
  weight_decay=weight_decay,
558
  qk_logits=qk_logits,
 
559
  )
 
 
560
 
561
  if len(param_tensors) > 0:
562
  self.base(
@@ -568,6 +943,33 @@ class Muon(torch.optim.Optimizer):
568
  qk_logits=qk_logits,
569
  )
570
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
571
  @torch.no_grad
572
  def step(self, closure=None, qk_logits=None):
573
  """Perform a single optimization step.
@@ -585,10 +987,82 @@ class Muon(torch.optim.Optimizer):
585
  with torch.enable_grad():
586
  loss = closure()
587
 
588
- for group in self.param_groups:
 
 
 
 
 
 
 
589
  if group["use_muon"]:
 
 
590
  self._step_muon(group, qk_logits=qk_logits)
591
  else:
 
 
 
592
  step_adamw(self.state, group)
593
 
 
 
 
 
 
 
 
594
  return loss
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
  from .adamw import step_adamw
12
  from .async_utils import run_pipeline
13
+ from .core import (_muon_state, adjust_lr_for_muon, batch_pre_ortho,
14
+ get_default_muon_param_groups, is_expert_param, update_p)
15
+ from .cpu_offload import CPUOffloadPool
16
  from .distributed.utils import (_is_shard, construct_shard_mesh,
17
  get_slices_of_dtensor)
18
  from .newton_schulz import (COMM_DTYPE, DEFAULT_CHUNK_SIZE_RATIO,
19
+ _zeropower_via_newtonschulz5,
20
+ zeropower_via_newtonschulz5,
21
+ zeropower_via_newtonschulz5_batched)
22
+ from .pipeline import muon_chunk_pipeline, prelaunch_first_gather
23
  from .qk_clip import compute_scales, get_qk_clip_info, qk_clip
24
 
25
  logger = logging.getLogger(__name__)
 
48
  expanded_params = []
49
 
50
  for n, p in zip(names, params):
51
+ is_expert = is_expert_param(n, expert_keys)
52
  is_dtensor = isinstance(p.data, DTensor)
53
 
54
+ if is_expert:
55
+ if is_dtensor:
56
+ logger.debug(
57
+ "[expand_expert] %s: expert DTensor, shape=%s, "
58
+ "placements=%s, mesh=%s, local_shape=%s", n, p.shape,
59
+ p.placements, p.device_mesh.mesh_dim_names,
60
+ p.to_local().shape)
61
+ else:
62
+ logger.debug(
63
+ "[expand_expert] %s: expert plain tensor, shape=%s", n,
64
+ p.data.shape)
65
+
66
  if not is_expert:
67
  assert p.data.ndim <= 2, (
68
  f"Param {n} has ndim={p.data.ndim} but does not match "
 
183
  Use shard ranks * DEFAULT_CHUNK_SIZE_RATIO when -1 is specified.
184
  use_distributed_muon: Use distributed muon by Liu et al. (2024).
185
  For testing purpose only.
 
186
  expert_keys: List of strings to identify expert-parallel parameters.
187
  If any key appears in a parameter's name, its outermost
188
  dimension is treated as the expert dimension and expanded
 
207
  warmup_step=5,
208
  chunk_size=-1,
209
  use_distributed_muon=False,
210
+ expert_keys=None,
211
+ cpu_offload=False):
212
  defaults = dict(
213
  lr=lr,
214
  weight_decay=weight_decay,
 
242
  self.warmup_step = warmup_step
243
  self.chunk_size = chunk_size
244
  self.use_distributed_muon = use_distributed_muon
 
245
  self.expert_keys = expert_keys
246
+ self.cpu_offload = cpu_offload
247
+ self._cpu_offload_pool = CPUOffloadPool() if cpu_offload else None
248
+ self._offload_initialized = False
249
+ self._parallel_cache: dict[tuple[str, ...], dict] = {}
250
+ self._expert_expand_cache: dict[tuple[int, ...], dict] = {}
251
 
252
  def _calc_flops(self, G, steps):
253
  assert len(G.shape) == 2
 
351
  if g is None:
352
  continue
353
 
354
+ u = zeropower_via_newtonschulz5(g.to(COMM_DTYPE),
355
+ steps=group["ns_steps"])
356
 
357
  adjusted_lr = adjust_lr_for_muon(lr, p.shape)
358
  update_p(p, u, lr, adjusted_lr, weight_decay)
 
373
  weight_decay: float,
374
  qk_logits: list[torch.Tensor | DTensor] | None,
375
  ):
376
+ """Batched Distributed Muon for testing/correctness verification only.
377
 
378
+ Uses all-gather to reconstruct full tensors, computes Newton-Schulz on
379
+ the full grad, then slices back to local shards. This is simpler but
380
+ slower than the parallel pipeline (all2all) path, so it serves as a
381
+ reference implementation for verifying correctness.
382
+ """
383
+ with record_function("distributed_muon"):
384
+ # Momentum is already applied by _step_muon before this method.
385
+ ns_steps = group["ns_steps"]
 
 
 
 
 
 
 
 
 
 
 
386
 
387
+ # Separate plain tensors (no communication) from DTensors.
388
+ plain_names, plain_params = [], []
389
+ dtensor_names, dtensor_params = [], []
390
+ for n, p in zip(names, params):
391
+ if p.grad is None:
392
+ continue
393
+ if isinstance(p.data, DTensor):
394
+ dtensor_names.append(n)
395
+ dtensor_params.append(p)
396
+ else:
397
+ plain_names.append(n)
398
+ plain_params.append(p)
399
+
400
+ # Process plain tensors per-param (no communication).
401
+ for n, p in zip(plain_names, plain_params):
402
+ u = _zeropower_via_newtonschulz5(p.grad.to(COMM_DTYPE),
403
+ steps=ns_steps)
404
+ adjusted_lr = adjust_lr_for_muon(lr, p.shape)
405
+ update_p(p, u, lr, adjusted_lr, weight_decay)
406
+
407
+ qk_clip_state = get_qk_clip_info(self.clip_config, n,
408
+ qk_logits)
409
+ scales_full = compute_scales(
410
+ p, qk_clip_state) if qk_clip_state is not None else None
411
+ if scales_full is not None:
412
+ qk_clip(p, scales_full, qk_clip_state.head_dim)
413
+
414
+ if not dtensor_params:
415
+ return
416
+
417
+ # Group DTensors by (placements, mesh) for batched all-gather.
418
+ placement_groups: dict[tuple,
419
+ tuple[list,
420
+ list]] = defaultdict(lambda: ([], []))
421
+ for n, p in zip(dtensor_names, dtensor_params):
422
+ key = (p.placements, p.device_mesh)
423
+ placement_groups[key][0].append(n)
424
+ placement_groups[key][1].append(p)
425
+
426
+ logger.info(
427
+ "distributed_muon: %d placement groups, %d total dtensors",
428
+ len(placement_groups), len(dtensor_params))
429
+
430
+ for (placements, mesh), (grp_names,
431
+ grp_params) in placement_groups.items():
432
+ shard_mesh, shard_pg, shard_placements = construct_shard_mesh(
433
+ placements, mesh)
434
+ rank = dist.get_rank(shard_pg)
435
+ world_size = dist.get_world_size(shard_pg)
436
+
437
+ logger.info(" group: %d params, placements=%s, world_size=%d",
438
+ len(grp_params), placements, world_size)
439
+
440
+ # Separate params that can be batched (all shard dims evenly
441
+ # divisible) from those needing per-param full_tensor
442
+ # (e.g. MoE gate weights with fewer rows than shard ranks).
443
+ # all_gather_into_tensor requires equal buffer sizes across
444
+ # ranks, so uneven splits must use DTensor full_tensor().
445
+ batch_names, batch_params = [], []
446
+ single_names, single_params = [], []
447
+ for n, p in zip(grp_names, grp_params):
448
+ even = all(p.shape[pl.dim] %
449
+ shard_mesh.mesh.shape[dim_idx] == 0
450
+ for dim_idx, pl in enumerate(shard_placements))
451
+ if even:
452
+ batch_names.append(n)
453
+ batch_params.append(p)
454
+ else:
455
+ single_names.append(n)
456
+ single_params.append(p)
457
+
458
+ # Process uneven-split params per-param via full_tensor().
459
+ for n, p in zip(single_names, single_params):
460
+ with record_function("distributed_muon::newton_schulz"):
461
+ g_full = p.grad.full_tensor().to(COMM_DTYPE)
462
+ u_full = _zeropower_via_newtonschulz5(g_full,
463
+ steps=ns_steps)
464
+ del g_full
465
+ with record_function("distributed_muon::update"):
466
+ adjusted_lr = adjust_lr_for_muon(lr, p.shape)
467
+ p._local_tensor.mul_(1 - lr * weight_decay)
468
+ local_indices = get_slices_of_dtensor(
469
+ p, rank, shard_mesh, shard_placements)
470
+ u_local = u_full[local_indices]
471
+ p._local_tensor.add_(u_local, alpha=-adjusted_lr)
472
+ del u_full
473
+
474
+ qk_clip_state = get_qk_clip_info(
475
+ self.clip_config, n, qk_logits)
476
+ scales_full = compute_scales(
477
+ p, qk_clip_state
478
+ ) if qk_clip_state is not None else None
479
+ if scales_full is not None:
480
+ ratio = p.shape[0] // scales_full.shape[0]
481
+ idx0 = local_indices[0]
482
+ if isinstance(idx0, slice):
483
+ start = idx0.start or 0
484
+ idx0 = torch.arange(start,
485
+ idx0.stop,
486
+ device=scales_full.device)
487
+ row_scales = scales_full[idx0 // ratio]
488
+ p._local_tensor.mul_(row_scales.view(-1, 1))
489
+
490
+ if not batch_params:
491
+ continue
492
 
493
+ logger.info(" batched=%d, single=%d", len(batch_params),
494
+ len(single_params))
495
+
496
+ # Concat all local grad shards into a single flat buffer.
497
+ with record_function("distributed_muon::gather"):
498
+ grad_locals = [
499
+ p.grad.to_local().to(COMM_DTYPE).flatten()
500
+ for p in batch_params
501
+ ]
502
+ numels = [g.numel() for g in grad_locals]
503
+ grad_concat = torch.cat(grad_locals)
504
+ del grad_locals
505
+
506
+ # Single all-gather (replaces N separate full_tensor).
507
+ grad_gathered = torch.empty(
508
+ grad_concat.numel() * world_size,
509
+ dtype=COMM_DTYPE,
510
+ device="cuda",
511
+ )
512
+ dist.all_gather_into_tensor(grad_gathered,
513
+ grad_concat,
514
+ group=shard_pg)
515
+
516
+ total_numel = grad_concat.numel()
517
+ del grad_concat
518
+
519
+ # Precompute per-param offsets within the concat buffer.
520
+ offsets = []
521
+ off = 0
522
+ for ne in numels:
523
+ offsets.append(off)
524
+ off += ne
525
+
526
+ # Per-param: reconstruct full grad → NS → local update.
527
+ for i, (n, p) in enumerate(zip(batch_names, batch_params)):
528
+ with record_function("distributed_muon::newton_schulz"):
529
+ g_full = torch.empty(p.shape,
530
+ dtype=COMM_DTYPE,
531
+ device="cuda")
532
+ for r in range(world_size):
533
+ r_start = r * total_numel + offsets[i]
534
+ shard = grad_gathered[r_start:r_start + numels[i]]
535
+ indices = get_slices_of_dtensor(
536
+ p, r, shard_mesh, shard_placements)
537
+ g_full[indices] = shard.reshape(
538
+ g_full[indices].shape)
539
+
540
+ u_full = _zeropower_via_newtonschulz5(g_full,
541
+ steps=ns_steps)
542
+ del g_full
543
+
544
+ with record_function("distributed_muon::update"):
545
+ adjusted_lr = adjust_lr_for_muon(lr, p.shape)
546
+ p._local_tensor.mul_(1 - lr * weight_decay)
547
+ local_indices = get_slices_of_dtensor(
548
+ p, rank, shard_mesh, shard_placements)
549
+ u_local = u_full[local_indices]
550
+ p._local_tensor.add_(u_local, alpha=-adjusted_lr)
551
+ del u_full
552
+
553
+ qk_clip_state = get_qk_clip_info(
554
+ self.clip_config, n, qk_logits)
555
+ scales_full = compute_scales(
556
+ p, qk_clip_state
557
+ ) if qk_clip_state is not None else None
558
+ if scales_full is not None:
559
+ ratio = p.shape[0] // scales_full.shape[0]
560
+ idx0 = local_indices[0]
561
+ if isinstance(idx0, slice):
562
+ start = idx0.start or 0
563
+ idx0 = torch.arange(start,
564
+ idx0.stop,
565
+ device=scales_full.device)
566
+ row_scales = scales_full[idx0 // ratio]
567
+ p._local_tensor.mul_(row_scales.view(-1, 1))
568
+
569
+ def _setup_parallel(self, names, params, group, qk_logits):
570
+ """Compute (or retrieve cached) parallel pipeline metadata.
571
+
572
+ Returns:
573
+ (ordered_params, param_to_state, rank, chunk_size)
574
+ """
575
+ cache_key = tuple(names)
576
 
577
+ if cache_key not in self._parallel_cache:
578
+ # First call: compute metadata and populate cache.
579
+ param_to_state, ordered_params = self.init_state_and_assign_params(
580
+ names, params, group, qk_logits)
581
 
582
+ shard_pg = param_to_state[id(ordered_params[0])].process_group
583
+ rank = dist.get_rank(group=shard_pg)
 
 
 
 
 
584
 
585
+ if self.chunk_size == -1:
586
+ shard_ranks = dist.get_world_size(shard_pg)
587
+ chunk_size = shard_ranks * DEFAULT_CHUNK_SIZE_RATIO
588
+ elif self.chunk_size > 0:
589
+ chunk_size = self.chunk_size
590
+ else:
591
+ raise ValueError(
592
+ "chunk_size must be -1 or a positive integer.")
593
+
594
+ ordered_names = [
595
+ param_to_state[id(p)].name for p in ordered_params
596
+ ]
597
+ name_to_state = {
598
+ param_to_state[id(p)].name: param_to_state[id(p)]
599
+ for p in ordered_params
600
+ }
601
+ self._parallel_cache[cache_key] = {
602
+ 'ordered_names': ordered_names,
603
+ 'name_to_state': name_to_state,
604
+ 'rank': rank,
605
+ 'chunk_size': chunk_size,
606
+ }
607
+ else:
608
+ # Cached path: rebuild param_to_state with current id(p) keys.
609
+ cache = self._parallel_cache[cache_key]
610
+ rank = cache['rank']
611
+ chunk_size = cache['chunk_size']
612
+
613
+ name_to_param = dict(zip(names, params))
614
+ ordered_params = [name_to_param[n] for n in cache['ordered_names']]
615
+
616
+ param_to_state = {}
617
+ for p, n in zip(ordered_params, cache['ordered_names']):
618
+ cached_state = cache['name_to_state'][n]
619
+ param_to_state[id(p)] = _muon_state(
620
+ worker_rank=cached_state.worker_rank,
621
+ process_group=cached_state.process_group,
622
+ rank_indices=cached_state.rank_indices,
623
+ rank_numels=cached_state.rank_numels,
624
+ name=n,
625
+ qk_clip_state=get_qk_clip_info(self.clip_config, n,
626
+ qk_logits),
627
  )
628
 
629
+ return ordered_params, param_to_state, rank, chunk_size
630
 
631
+ def parallel(self,
632
+ names,
633
+ params,
634
+ group,
635
+ lr,
636
+ weight_decay,
637
+ qk_logits,
638
+ prelaunch_gather=None):
639
  """
640
  Perform a parallel optimization step using Muon.
641
 
 
644
  interleaves multiple chunks so that communication and computation
645
  overlap across chunks (the same overlap previously achieved by the
646
  warmup + main-loop index scheduling).
647
+
648
+ If ``prelaunch_gather`` is provided, it is passed to the first
649
+ chunk's generator to skip re-launching the already in-flight
650
+ A2A gather.
651
  """
652
 
653
  # Momentum is already applied by _step_muon before this method.
654
 
655
+ ordered_params, param_to_state, rank, chunk_size = (
656
+ self._setup_parallel(names, params, group, qk_logits))
 
 
 
 
 
 
 
 
 
 
 
 
 
657
 
658
  def pipelines():
659
+ first = True
660
  for start in range(0, len(ordered_params), chunk_size):
661
  chunk = ordered_params[start:start + chunk_size]
662
  if chunk:
663
+ kwargs = dict(
664
  params=chunk,
665
  param_to_state=param_to_state,
666
  rank=rank,
 
669
  weight_decay=weight_decay,
670
  none_grad=group["none_grad"],
671
  )
672
+ if first and prelaunch_gather is not None:
673
+ kwargs['prelaunch_gather'] = prelaunch_gather
674
+ first = False
675
+ yield muon_chunk_pipeline(**kwargs)
676
 
 
 
677
  with record_function("muon::pipeline"):
678
  run_pipeline(pipelines(), max_concurrent=self.warmup_step + 1)
679
 
 
685
  names = group["names"]
686
 
687
  # Apply momentum to all params before routing/expansion.
688
+ # Batched using _foreach_* ops (compiled, fullgraph=True).
689
  with record_function("muon::momentum"):
690
+ active_params = [p for p in params if p.grad is not None]
691
+ if active_params:
692
+ # Ensure momentum buffers exist (avoid zeros_like when already present).
693
+ for p in active_params:
694
+ if "momentum_buffer" not in self.state[p]:
695
+ self.state[p]["momentum_buffer"] = torch.zeros_like(
696
+ p.grad)
697
+
698
+ # Extract local tensors for compiled batch function.
699
+ local_grads = [
700
+ p.grad._local_tensor
701
+ if isinstance(p.grad, DTensor) else p.grad
702
+ for p in active_params
703
+ ]
704
+ local_bufs = [
705
+ self.state[p]["momentum_buffer"]._local_tensor
706
+ if isinstance(self.state[p]["momentum_buffer"], DTensor)
707
+ else self.state[p]["momentum_buffer"]
708
+ for p in active_params
709
+ ]
710
+
711
+ # Wrap momentum as tensor for torch.compile.
712
+ batch_pre_ortho(local_grads, local_bufs,
713
+ torch.tensor(momentum), group["nesterov"])
714
+
715
+ # For non-nesterov, the result is the momentum buffer.
716
+ if not group["nesterov"]:
717
+ for p in active_params:
718
+ p.grad = self.state[p]["momentum_buffer"]
719
+
720
+ # Identify batched experts for deferred NS.
721
+ # Detection is cheap (condition checks only); actual NS compute is
722
+ # deferred so it can overlap with the first chunk's A2A gather.
723
+ deferred_expert_work = []
724
+ if self.expert_keys:
725
+ batched_expert_indices = []
726
+ for i, (n, p) in enumerate(zip(names, params)):
727
+ if not (is_expert_param(n, self.expert_keys)
728
+ and p.grad is not None):
729
  continue
730
+ # Eligible: plain tensor, or DTensor with no non-dim-0 shards.
731
+ if isinstance(p.data, DTensor):
732
+ has_tp = any(
733
+ _is_shard(pl) and pl.dim != 0 for pl in p.placements)
734
+ if has_tp:
735
+ continue
736
+ batched_expert_indices.append(i)
737
+
738
+ if batched_expert_indices:
739
+ # Save refs for deferred NS; free grads from param list.
740
+ for i in batched_expert_indices:
741
+ p = params[i]
742
+ g = p.grad
743
+ local_g = (g._local_tensor
744
+ if isinstance(g, DTensor) else g)
745
+ local_data = (p.data._local_tensor if isinstance(
746
+ p.data, DTensor) else p.data)
747
+ deferred_expert_work.append((local_data, local_g))
748
+ p.grad = None
749
+
750
+ # Remove batched experts from lists before expansion.
751
+ keep = sorted(
752
+ set(range(len(params))) - set(batched_expert_indices))
753
+ names = [names[i] for i in keep]
754
+ params = [params[i] for i in keep]
755
+
756
+ def _run_deferred_expert_ns():
757
+ """Execute deferred batched expert NS."""
758
+ if not deferred_expert_work:
759
+ return
760
+ with record_function("muon::batched_expert_ns"):
761
+ ns_steps = group["ns_steps"]
762
+ for local_data, local_g in deferred_expert_work:
763
+ u = zeropower_via_newtonschulz5_batched(
764
+ local_g.to(COMM_DTYPE), steps=ns_steps)
765
+ adjusted_lr = adjust_lr_for_muon(lr, local_g.shape[1:])
766
+ local_data.mul_(1 - lr * weight_decay)
767
+ local_data.add_(u, alpha=-adjusted_lr)
768
 
769
  # Expand expert params by splitting on dim 0.
770
+ logger.debug("[_step_muon] before expand: %d params, expert_keys=%s",
771
+ len(params), self.expert_keys)
772
+ if self.expert_keys:
773
+ cache_key = tuple(id(p) for p in params)
774
+ cache = self._expert_expand_cache.get(cache_key)
775
+
776
+ if cache is None:
777
+ # Cold path: full expansion + build cache metadata.
778
+ exp_names, exp_params = _expand_expert_params(
779
+ names, params, self.expert_keys)
780
+
781
+ # Build per-expert-group info for hot-path grad updates.
782
+ grad_info = []
783
+ exp_idx = 0
784
+ for orig_idx, (n, p) in enumerate(zip(names, params)):
785
+ if not is_expert_param(n, self.expert_keys):
786
+ exp_idx += 1
787
+ continue
788
+
789
+ is_dt = isinstance(p.data, DTensor)
790
+ num_experts = (p.to_local() if is_dt else p.data).shape[0]
791
+
792
+ # Detect TP mesh from the first expanded expert param.
793
+ tp_mesh = None
794
+ tp_pls = None
795
+ sample = exp_params[exp_idx]
796
+ if isinstance(sample.data, DTensor):
797
+ tp_mesh = sample.data.device_mesh
798
+ tp_pls = list(sample.data.placements)
799
+
800
+ grad_info.append((orig_idx, num_experts, exp_idx, is_dt,
801
+ tp_mesh, tp_pls))
802
+ exp_idx += num_experts
803
+
804
+ self._expert_expand_cache[cache_key] = {
805
+ 'names': exp_names,
806
+ 'params': exp_params,
807
+ 'grad_info': grad_info,
808
+ }
809
+ names, params = exp_names, exp_params
810
+ else:
811
+ # Hot path: reuse cached params, only update expert grads.
812
+ for (orig_idx, num_experts, exp_start, is_dt, tp_mesh,
813
+ tp_pls) in cache['grad_info']:
814
+ p = params[orig_idx]
815
+ g = p.grad
816
+ local_grad = (g.to_local()
817
+ if is_dt and isinstance(g, DTensor) else g)
818
+ for i in range(num_experts):
819
+ expert_p = cache['params'][exp_start + i]
820
+ sg = local_grad[i]
821
+ if tp_mesh is not None:
822
+ expert_p.grad = DTensor.from_local(
823
+ sg, device_mesh=tp_mesh, placements=tp_pls)
824
+ else:
825
+ expert_p.grad = sg
826
+ p.grad = None
827
+
828
+ names = cache['names']
829
+ params = cache['params']
830
+ else:
831
+ names, params = _expand_expert_params(names, params,
832
+ self.expert_keys)
833
+ logger.debug("[_step_muon] after expand: %d params", len(params))
834
 
835
  param_dtensors = []
836
  name_dtensors = []
 
838
  param_tensors = []
839
  name_tensors = []
840
 
841
+ # distributed_muon is a reference implementation for testing only.
842
+ # The parallel pipeline (all2all) path below is the production path.
 
843
  if self.use_distributed_muon:
844
+ _run_deferred_expert_ns()
845
  self.distributed_muon(names=names,
846
  params=params,
847
  group=group,
 
850
  qk_logits=qk_logits)
851
  return
852
 
 
 
853
  for n, p in zip(names, params):
854
  if p is None or p.grad is None:
855
  continue
 
857
  if all(
858
  isinstance(placement, Replicate)
859
  for placement in p.placements):
860
+ logger.debug(
861
+ "[route] %s → base (DTensor all-Replicate), "
862
+ "shape=%s, placements=%s", n, p.shape, p.placements)
863
  param_tensors.append(p)
864
  name_tensors.append(n)
 
 
 
865
  else:
866
+ logger.debug(
867
+ "[route] %s → parallel (DTensor), shape=%s, "
868
+ "placements=%s, mesh=%s", n, p.shape, p.placements,
869
+ p.device_mesh.mesh_dim_names)
870
  param_dtensors.append(p)
871
  name_dtensors.append(n)
872
  elif isinstance(p.data, torch.Tensor):
873
+ logger.debug("[route] %s → base (plain tensor), shape=%s", n,
874
+ p.data.shape)
875
  param_tensors.append(p)
876
  name_tensors.append(n)
877
  else:
878
  raise TypeError(f"Unsupported parameter type: {type(p.data)}")
879
 
880
+ logger.debug(f"[Muon] {len(param_dtensors)} DTensors → parallel, "
881
+ f"{len(param_tensors)} Tensors → base")
 
882
 
883
  def group_dtensors(dtensors, names):
884
  # To support different placements, we group parameters by placements
 
894
  p.device_mesh])][1].append(p)
895
  return placement_to_params
896
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
897
  if len(param_dtensors) > 0:
898
  if not dist.is_initialized():
899
  raise RuntimeError(
 
901
  )
902
 
903
  dtensor_group = group_dtensors(param_dtensors, name_dtensors)
904
+
905
+ # Pre-launch the first chunk's A2A gather so that the NCCL
906
+ # communication overlaps with the (deferred) batched expert NS
907
+ # compute on the default CUDA stream.
908
+ prelaunch = None
909
+ if deferred_expert_work:
910
+ first_names, first_params = next(iter(dtensor_group.values()))
911
+ ordered, pts, rnk, csz = self._setup_parallel(
912
+ first_names, first_params, group, qk_logits)
913
+ first_chunk = ordered[:csz]
914
+ if first_chunk:
915
+ prelaunch = prelaunch_first_gather(first_chunk, pts, rnk,
916
+ group["none_grad"])
917
+
918
+ _run_deferred_expert_ns()
919
+
920
+ first_group = True
921
  for _, (names, params) in dtensor_group.items():
922
+ pg = prelaunch if first_group else None
923
+ first_group = False
924
  self.parallel(
925
  names,
926
  params,
 
928
  lr=lr,
929
  weight_decay=weight_decay,
930
  qk_logits=qk_logits,
931
+ prelaunch_gather=pg,
932
  )
933
+ else:
934
+ _run_deferred_expert_ns()
935
 
936
  if len(param_tensors) > 0:
937
  self.base(
 
943
  qk_logits=qk_logits,
944
  )
945
 
946
+ def _register_states_for_offload(self):
947
+ """Register all optimizer state tensors with the CPU offload pool.
948
+
949
+ Called once after the first step when states have been lazily created.
950
+ Offloads all param states (momentum buffers for Muon, moment1/moment2
951
+ for AdamW) to free GPU memory between steps.
952
+ """
953
+ pool = self._cpu_offload_pool
954
+ tracked = 0
955
+ for group in self.param_groups:
956
+ for p in group["params"]:
957
+ if p not in self.state:
958
+ continue
959
+ state = self.state[p]
960
+ if group.get("use_muon", False):
961
+ if "momentum_buffer" in state:
962
+ pool.track(state["momentum_buffer"])
963
+ tracked += 1
964
+ else:
965
+ if "moment1" in state:
966
+ pool.track(state["moment1"])
967
+ if "moment2" in state:
968
+ pool.track(state["moment2"])
969
+ tracked += 1
970
+ logger.info("[CPUOffload] Registered %d param states for offload",
971
+ tracked)
972
+
973
  @torch.no_grad
974
  def step(self, closure=None, qk_logits=None):
975
  """Perform a single optimization step.
 
987
  with torch.enable_grad():
988
  loss = closure()
989
 
990
+ # H2D: reload optimizer states from CPU before computation.
991
+ if self.cpu_offload and self._offload_initialized:
992
+ self._cpu_offload_pool.reload()
993
+
994
+ logger.debug("[Muon.step] expert_keys=%s, %d param groups",
995
+ self.expert_keys, len(self.param_groups))
996
+
997
+ for i, group in enumerate(self.param_groups):
998
  if group["use_muon"]:
999
+ logger.debug("[Muon.step] group %d: use_muon=True, %d params",
1000
+ i, len(group["params"]))
1001
  self._step_muon(group, qk_logits=qk_logits)
1002
  else:
1003
+ logger.debug(
1004
+ "[Muon.step] group %d: use_muon=False (AdamW), %d params",
1005
+ i, len(group["params"]))
1006
  step_adamw(self.state, group)
1007
 
1008
+ # D2H: offload optimizer states to CPU after computation.
1009
+ if self.cpu_offload:
1010
+ if not self._offload_initialized:
1011
+ self._register_states_for_offload()
1012
+ self._offload_initialized = True
1013
+ self._cpu_offload_pool.offload()
1014
+
1015
  return loss
1016
+
1017
+ # ------------------------------------------------------------------
1018
+ # Checkpoint support for cpu_offload
1019
+ # ------------------------------------------------------------------
1020
+
1021
+ def state_dict(self) -> dict:
1022
+ """Return optimizer state dict, reloading offloaded states first.
1023
+
1024
+ When ``cpu_offload=True``, optimizer state tensors have their GPU
1025
+ storage freed (``resize_(0)``) between steps. We reload them,
1026
+ snapshot the state dict, then re-offload so the optimizer stays
1027
+ in the expected post-step state. The returned dict holds cloned
1028
+ tensors so they remain valid after the re-offload frees the
1029
+ originals' GPU storage.
1030
+ """
1031
+ if self.cpu_offload and self._offload_initialized:
1032
+ self._cpu_offload_pool.reload()
1033
+ torch.cuda.current_stream().synchronize()
1034
+ sd = super().state_dict()
1035
+ if self.cpu_offload and self._offload_initialized:
1036
+ # Clone state tensors so the returned dict survives re-offload
1037
+ # (which frees GPU storage on the originals via resize_(0)).
1038
+ for k in sd["state"]:
1039
+ sd["state"][k] = {
1040
+ sk: sv.clone() if isinstance(sv, torch.Tensor) else sv
1041
+ for sk, sv in sd["state"][k].items()
1042
+ }
1043
+ self._cpu_offload_pool.offload()
1044
+ return sd
1045
+
1046
+ def load_state_dict(self, state_dict: dict) -> None:
1047
+ """Load optimizer state dict, then offload states if needed.
1048
+
1049
+ After ``super().load_state_dict()`` populates GPU tensors, we
1050
+ re-register them with the offload pool and offload to CPU so the
1051
+ optimizer is in the same post-step state (GPU storage freed).
1052
+ """
1053
+ # If states were offloaded, reload first so storage sizes are
1054
+ # correct for super().load_state_dict() to overwrite.
1055
+ if self.cpu_offload and self._offload_initialized:
1056
+ self._cpu_offload_pool.reload()
1057
+ torch.cuda.current_stream().synchronize()
1058
+
1059
+ super().load_state_dict(state_dict)
1060
+
1061
+ if self.cpu_offload:
1062
+ # Re-create the offload pool since state tensors may be new
1063
+ # objects after load_state_dict.
1064
+ self._cpu_offload_pool = CPUOffloadPool()
1065
+ self._offload_initialized = False
1066
+ self._register_states_for_offload()
1067
+ self._offload_initialized = True
1068
+ self._cpu_offload_pool.offload()
build/torch210-cxx11-cu128-x86_64-linux/newton_schulz.py CHANGED
@@ -1,3 +1,7 @@
 
 
 
 
1
  import torch
2
 
3
  from .matmul_transpose_triton import matmul_transpose_assign
@@ -6,21 +10,134 @@ COMM_DTYPE = torch.bfloat16
6
  DEFAULT_CHUNK_SIZE_RATIO = 4
7
 
8
 
9
- # This code snippet is a modified version adapted from the following GitHub repositories:
10
- # https://github.com/KellerJordan/Muon/blob/master/muon.py
11
- # Muon's Newton–Schulz iteration causes high variance in singular values
12
- # Idea: give each iteration its own 3 coefficients and optimize them via gradient descent.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  @torch.no_grad()
14
- # matmul_transpose_assign from : https://github.com/nil0x9/flash-muon
15
  def _zeropower_via_newtonschulz5(G, steps):
16
  """
17
- Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
18
- quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
19
- of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
20
- zero even beyond the point where the iteration no longer converges all the way to one everywhere
21
- on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
22
- where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
23
- performance at all relative to UV^T, where USV^T = G is the SVD.
 
 
 
 
 
 
 
24
  """
25
  assert len(G.shape) == 2
26
  assert G.dtype == COMM_DTYPE
@@ -28,18 +145,14 @@ def _zeropower_via_newtonschulz5(G, steps):
28
 
29
  if G.size(0) > G.size(1):
30
  X = X.T
31
- # Ensure spectral norm is at most 1
32
  X = X / (X.norm() + 1e-7)
 
 
33
  buf1 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
34
  buf2 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
35
  # Perform the NS iterations
36
- for a, b, c in [
37
- (4.0848, -6.8946, 2.9270),
38
- (3.9505, -6.3029, 2.6377),
39
- (3.7418, -5.5913, 2.3037),
40
- (2.8769, -3.1427, 1.2046),
41
- (2.8366, -3.0525, 1.2012),
42
- ]:
43
  matmul_transpose_assign(X, buf1)
44
  matmul_transpose_assign(buf1, buf2)
45
  buf1.mul_(b).add_(buf2, alpha=c)
@@ -47,4 +160,77 @@ def _zeropower_via_newtonschulz5(G, steps):
47
 
48
  if G.size(0) > G.size(1):
49
  X = X.T
 
50
  return X
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from itertools import repeat
2
+ from math import inf, sqrt
3
+
4
+ import numpy as np
5
  import torch
6
 
7
  from .matmul_transpose_triton import matmul_transpose_assign
 
10
  DEFAULT_CHUNK_SIZE_RATIO = 4
11
 
12
 
13
+ def _optimal_quintic(l, u, max_iter=1000):
14
+ """
15
+ Use the simplified Remez algorithm to find the optimal odd quintic approximant
16
+ to the constant function x -> 1 over the interval [l, u].
17
+
18
+ Returns (a, b, c) for p(x) = ax + bx^3 + cx^5 that minimizes the maximum
19
+ approximation error max_{x in [l,u]} |p(x) - 1|. Iterates by updating the
20
+ two interior equioscillation nodes q, r until convergence. Returns the
21
+ closed-form equioscillating solution when l ≈ u.
22
+
23
+ Raises ValueError if any intermediate value (a, b, c, E, q, r) is non-finite
24
+ (NaN or inf). Raises RuntimeError if convergence is not reached within
25
+ max_iter iterations.
26
+ """
27
+ assert 0 <= l <= u
28
+ if 1 - 5e-6 <= l / u:
29
+ return (15 / 8) / u, (-10 / 8) / (u**3), (3 / 8) / (u**5)
30
+ q = (3 * l + u) / 4
31
+ r = (l + 3 * u) / 4
32
+ E = inf
33
+ for _ in range(max_iter):
34
+ old_E = E
35
+ LHS = np.array([
36
+ [l, l**3, l**5, 1],
37
+ [q, q**3, q**5, -1],
38
+ [r, r**3, r**5, 1],
39
+ [u, u**3, u**5, -1],
40
+ ])
41
+ a, b, c, E = np.linalg.solve(LHS, np.ones(4))
42
+ if not np.all(np.isfinite([a, b, c, E])):
43
+ raise ValueError(f"_optimal_quintic: non-finite solve result "
44
+ f"a={a}, b={b}, c={c}, E={E}")
45
+ q, r = np.sqrt(
46
+ (-3 * b + np.array([-1, 1]) * sqrt(9 * b**2 - 20 * a * c)) /
47
+ (10 * c))
48
+ if not np.all(np.isfinite([q, r])):
49
+ raise ValueError(
50
+ f"_optimal_quintic: non-finite node update q={q}, r={r}")
51
+ if abs(old_E - E) <= 1e-15:
52
+ break
53
+ else:
54
+ raise RuntimeError(
55
+ f"_optimal_quintic: did not converge after {max_iter} iterations")
56
+ return float(a), float(b), float(c)
57
+
58
+
59
def _optimal_composition(l, num_iters, safety_factor_eps=0, cushion=0):
    """
    Compute the Polar Express coefficient series for `num_iters` quintic iterations.

    Builds a sequence of per-step optimal odd quintic coefficients (a, b, c) that
    compose to map singular values from [l, 1] toward 1. At each step:
      1. Solves `_optimal_quintic` on [max(l, cushion*u), u]. The `cushion`
         prevents near-zero singular values from stalling by raising the effective
         lower bound; if it is active (cushion*u > l), the coefficients are
         rescaled so that p(l) and p(u) are centered around 1 w.r.t. the true [l, u].
      2. Deflates the coefficients by (1 + safety_factor_eps)^degree for all but the
         last iteration, providing numerical headroom at the cost of a slightly slower
         final convergence step.
      3. Advances the interval: l <- p(l), u <- 2 - p(l) (by symmetry of p around 1).

    Returns a list of (a, b, c) tuples, one per iteration.

    Reference: Amsel et al., "The Polar Express: Optimal Matrix Sign Methods and
    Their Application to the Muon Algorithm", https://arxiv.org/abs/2505.16932
    """
    u = 1
    assert 0 <= l <= u
    safety_factor = 1 + safety_factor_eps
    coefficients = []
    # NOTE: loop variable renamed from `iter` to avoid shadowing the builtin.
    for step_idx in range(num_iters):
        a, b, c = _optimal_quintic(max(l, cushion * u), u)
        if cushion * u > l:
            # Cushion was active: recenter so p(l) and p(u) straddle 1
            # with respect to the true interval [l, u].
            pl = a * l + b * l**3 + c * l**5
            pu = a * u + b * u**3 + c * u**5
            rescaler = 2 / (pl + pu)
            a *= rescaler
            b *= rescaler
            c *= rescaler
        if step_idx < num_iters - 1:
            # Deflate for numerical headroom on all but the final step.
            a /= safety_factor
            b /= safety_factor**3
            c /= safety_factor**5
        coefficients.append((a, b, c))
        # Advance the singular-value interval: p is symmetric around 1.
        l = a * l + b * l**3 + c * l**5
        u = 2 - l
    return coefficients
100
+
101
+
102
# Polar Express coefficient schedule, precomputed once at import time for
# 10 quintic Newton-Schulz iterations and reused across all optimizer steps.
# Each (a, b, c) tuple is the minimax-optimal (Remez / equioscillation) odd
# quintic approximant to x -> 1 over that step's singular-value interval.
#
# Compared with the previous 5 hardcoded NS coefficient tuples:
#   - the old constants were tuned to maximize slope at zero and never drove
#     singular values all the way to 1, yielding US'V^T with
#     S' ~ Uniform(0.5, 1.5) rather than the true polar factor;
#   - the Polar Express schedule is analytically optimal per step, adapts to
#     the shrinking interval [l, u], and converges all singular values to 1,
#     producing the exact polar factor UV^T.
_coeffs_list = _optimal_composition(l=1e-3,
                                    num_iters=10,
                                    safety_factor_eps=1e-2,
                                    cushion=0.02)
118
+
119
+
120
+ # This code is adapted from:
121
+ # KellerJordan/Muon (https://github.com/KellerJordan/Muon/blob/master/muon.py)
122
+ # NoahAmsel/PolarExpress (https://github.com/NoahAmsel/PolarExpress)
123
+ # matmul_transpose_assign kernel from nil0x9/flash-muon (https://github.com/nil0x9/flash-muon)
124
  @torch.no_grad()
 
125
  def _zeropower_via_newtonschulz5(G, steps):
126
  """
127
+ Compute the polar factor of G via the Polar Express method.
128
+
129
+ Applies `steps` quintic iterations X <- aX + bX^3 + cX^5, where (a, b, c)
130
+ are the Polar Express coefficients from `_coeffs_list`. Each step is the
131
+ optimal odd quintic approximant to x -> 1 over the current singular-value
132
+ interval, minimizing the maximum approximation error (Remez / minimax criterion).
133
+ The composition maps singular values from [l, 1] to near 1, producing the
134
+ polar factor (orthogonal factor in the polar decomposition G = UP).
135
+
136
+ `_coeffs_list` is precomputed for 10 iterations (l=1e-3, safety_factor_eps=1e-2,
137
+ cushion=0.02). If `steps` exceeds 10, the final coefficient set is repeated.
138
+
139
+ Reference: Amsel et al., "The Polar Express: Optimal Matrix Sign Methods and
140
+ Their Application to the Muon Algorithm", https://arxiv.org/abs/2505.16932
141
  """
142
  assert len(G.shape) == 2
143
  assert G.dtype == COMM_DTYPE
 
145
 
146
  if G.size(0) > G.size(1):
147
  X = X.T
148
+
149
  X = X / (X.norm() + 1e-7)
150
+ hs = _coeffs_list[:steps] + list(
151
+ repeat(_coeffs_list[-1], steps - len(_coeffs_list)))
152
  buf1 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
153
  buf2 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
154
  # Perform the NS iterations
155
+ for a, b, c in hs:
 
 
 
 
 
 
156
  matmul_transpose_assign(X, buf1)
157
  matmul_transpose_assign(buf1, buf2)
158
  buf1.mul_(b).add_(buf2, alpha=c)
 
160
 
161
  if G.size(0) > G.size(1):
162
  X = X.T
163
+
164
  return X
165
+
166
+
167
@torch.no_grad()
def _zeropower_via_newtonschulz5_batched(G, steps):
    """Batched polar-factor computation for 3D (E, out, in) tensors.

    Mirrors ``_zeropower_via_newtonschulz5`` but pushes every expert matrix
    through a single ``torch.bmm`` / ``torch.baddbmm`` call per iteration
    instead of the 2D Triton kernel.
    """
    assert len(G.shape) == 3
    assert G.dtype == COMM_DTYPE

    wide = G.size(1) > G.size(2)
    X = G.transpose(-2, -1) if wide else G

    # Normalize each expert matrix by its own Frobenius norm.
    X = X / (X.norm(dim=(-2, -1), keepdim=True) + 1e-7)

    # Extend the precomputed schedule by repeating the final coefficient
    # tuple when more steps are requested than were precomputed.
    schedule = _coeffs_list[:steps] + list(
        repeat(_coeffs_list[-1], steps - len(_coeffs_list)))
    for a, b, c in schedule:
        gram = torch.bmm(X, X.transpose(-2, -1))
        gram_sq = torch.bmm(gram, gram.transpose(-2, -1))
        gram.mul_(b).add_(gram_sq, alpha=c)
        X = torch.baddbmm(X, gram, X, alpha=1.0, beta=a)

    return X.transpose(-2, -1) if wide else X
197
+
198
+
199
# Per-input-shape cache of torch.compile'd NS kernels, shared by the 2D and
# batched entry points (2D and 3D shape keys never collide), plus a global
# toggle for disabling compilation altogether.
_ns_per_shape: dict[tuple[int, ...], callable] = {}
_use_compile = True


def set_ns_compile(enabled: bool):
    """Enable or disable torch.compile for the Newton-Schulz iteration."""
    global _use_compile
    _use_compile = enabled
207
+
208
+
209
def zeropower_via_newtonschulz5(G, steps=5):
    """Newton-Schulz entry point, optionally via a per-shape compiled kernel."""
    if not _use_compile:
        return _zeropower_via_newtonschulz5(G, steps)
    shape_key = G.shape
    fn = _ns_per_shape.get(shape_key)
    if fn is None:
        fn = torch.compile(_zeropower_via_newtonschulz5,
                           options={
                               "triton.cudagraphs": True,
                               "shape_padding": False
                           })
        _ns_per_shape[shape_key] = fn
    torch.compiler.cudagraph_mark_step_begin()
    # Clone: cudagraph outputs are reused buffers; hand callers a stable copy.
    return fn(G, steps).clone()
221
+
222
+
223
def zeropower_via_newtonschulz5_batched(G, steps=5):
    """Compile-cached batched Newton-Schulz for 3D expert tensors."""
    if not _use_compile:
        return _zeropower_via_newtonschulz5_batched(G, steps)
    shape_key = G.shape
    fn = _ns_per_shape.get(shape_key)
    if fn is None:
        fn = torch.compile(_zeropower_via_newtonschulz5_batched,
                           options={
                               "triton.cudagraphs": True,
                               "shape_padding": False
                           })
        _ns_per_shape[shape_key] = fn
    torch.compiler.cudagraph_mark_step_begin()
    # Clone: cudagraph outputs are reused buffers; hand callers a stable copy.
    return fn(G, steps).clone()
build/torch210-cxx11-cu128-x86_64-linux/pipeline.py CHANGED
@@ -6,8 +6,8 @@ import torch.distributed as dist
6
  from torch.distributed.tensor import DTensor
7
  from torch.profiler import record_function
8
 
9
- from .core import _muon_state, adjust_lr_for_muon, update_p
10
- from .newton_schulz import COMM_DTYPE, _zeropower_via_newtonschulz5
11
  from .qk_clip import compute_scales
12
 
13
  logger = logging.getLogger(__name__)
@@ -45,26 +45,33 @@ def _launch_gather(
45
  else:
46
  gathered_grads[id(p)] = None
47
 
48
- # Build send buffer
49
- per_dst: list[list[torch.Tensor]] = [[] for _ in range(num_ranks)]
50
  send_counts = [0] * num_ranks
51
-
52
  for p in params:
53
  state = param_to_state[id(p)]
54
- dst = state.worker_rank
55
- assert dst < num_ranks
56
- shard_elems = state.rank_numels[rank]
57
- g = p.grad
58
- g = g.to_local().to(COMM_DTYPE).contiguous()
59
- assert g.numel() == shard_elems
60
- per_dst[dst].append(g.view(-1))
61
- send_counts[dst] += shard_elems
62
-
63
- assert any(
64
- len(v) > 0 for v in
65
- per_dst), "At least one destination rank must receive a sharded tensor"
66
- per_dst_flat = [t for dst in per_dst for t in dst]
67
- send_buf = torch.cat(per_dst_flat, dim=0)
 
 
 
 
 
 
 
 
68
 
69
  # Build recv buffer
70
  recv_counts = [0] * num_ranks
@@ -120,7 +127,8 @@ def _complete_gather(
120
 
121
  shard_view = gathered_grads[id(p)][indices]
122
  n = shard_view.numel()
123
- assert n > 0
 
124
 
125
  sg = recv_buf.narrow(0, off + inner_off, n)
126
  sg = sg.reshape(shard_view.shape)
@@ -143,7 +151,7 @@ def _compute_ns(
143
  """
144
  computed_us: dict[int, torch.Tensor | None] = {}
145
  for p in owned_params:
146
- u = _zeropower_via_newtonschulz5(gathered_grads[id(p)], ns_steps)
147
  gathered_grads[id(p)] = None # free gathered grad
148
  computed_us[id(p)] = u
149
  return computed_us
@@ -163,46 +171,47 @@ def _launch_scatter(
163
  Returns:
164
  work: Async operation handle.
165
  recv_buf: Flat receive buffer (needed by ``_complete_scatter``).
166
- scattered_us: ``{id(p): empty_local_tensor}`` for all params.
 
167
  recv_counts: Per-source-rank element counts.
168
  """
169
- # Allocate scattered-u buffers
 
 
 
170
  scattered_us: dict[int, torch.Tensor] = {}
171
  for p in params:
172
- scattered_us[id(p)] = torch.empty_like(p.to_local(), dtype=COMM_DTYPE)
 
 
173
 
174
- # Build send buffer (from computed_us on owner ranks)
175
- per_dst: list[list[torch.Tensor]] = [[] for _ in range(num_ranks)]
176
  send_counts = [0] * num_ranks
177
-
178
  if owned_params:
179
  for p in owned_params:
180
  state = param_to_state[id(p)]
181
-
182
- assert computed_us[id(p)] is not None
183
- u_full = computed_us[id(p)].to(COMM_DTYPE).contiguous()
184
-
185
- total_sent = 0
186
  for dst_rank in range(num_ranks):
187
- indices = state.rank_indices[dst_rank]
188
- su = u_full[indices].flatten()
189
-
190
- n = su.numel()
191
- assert n > 0
192
 
193
- per_dst[dst_rank].append(su)
194
- send_counts[dst_rank] += n
195
- total_sent += n
196
-
197
- assert total_sent == u_full.numel()
198
-
199
- lengths = [len(v) for v in per_dst]
200
- if all(l > 0 for l in lengths):
201
- assert all(
202
- l == lengths[0] for l in lengths
203
- ), "All destination ranks must have the same number of sharded tensor"
204
- per_dst_flat = [t for dst in per_dst for t in dst]
205
- send_buf = torch.cat(per_dst_flat, dim=0)
 
 
 
 
 
206
  else:
207
  send_buf = torch.empty(0, dtype=COMM_DTYPE, device="cuda")
208
 
@@ -218,7 +227,6 @@ def _launch_scatter(
218
  recv_counts[src] = total
219
 
220
  recv_total = sum(recv_counts)
221
- assert recv_total > 0
222
  recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")
223
 
224
  # Launch async all-to-all
@@ -242,7 +250,13 @@ def _complete_scatter(
242
  rank: int,
243
  scattered_us: dict[int, torch.Tensor],
244
  ) -> None:
245
- """Copy recv buffer into scattered_us (in-place)."""
 
 
 
 
 
 
246
  off = 0
247
  for src in range(len(recv_counts)):
248
  block = recv_counts[src]
@@ -255,11 +269,11 @@ def _complete_scatter(
255
  if state.worker_rank != src:
256
  continue
257
  n = state.rank_numels[rank]
258
- assert n > 0
 
259
 
260
- flat_local = recv_buf.narrow(0, off + inner_off,
261
- n).view_as(p.to_local())
262
- scattered_us[id(p)].copy_(flat_local)
263
 
264
  inner_off += n
265
 
@@ -275,23 +289,40 @@ def _update_params(
275
  lr: float,
276
  weight_decay: float,
277
  ) -> None:
278
- """Apply weight decay, Muon update, and optional QK clipping."""
279
- for p in params:
280
- state = param_to_state[id(p)]
281
- u_dtensor = DTensor.from_local(
282
- scattered_us[id(p)],
283
- placements=p.placements,
284
- device_mesh=p.device_mesh,
285
- )
286
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
287
  adjusted_lr = adjust_lr_for_muon(lr, p.shape)
288
- update_p(p, u_dtensor, lr, adjusted_lr, weight_decay)
 
 
 
289
 
290
- # QK clipping applied directly on the local tensor to
291
- # avoid DTensor sharding-propagation issues with _StridedShard.
292
- scales_full = compute_scales(
293
- p,
294
- state.qk_clip_state) if state.qk_clip_state is not None else None
 
 
 
 
 
295
  if scales_full is not None:
296
  ratio = p.shape[0] // scales_full.shape[0]
297
  idx0 = state.rank_indices[rank][0]
@@ -304,6 +335,45 @@ def _update_params(
304
  p._local_tensor.mul_(row_scales.view(-1, 1))
305
 
306
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
307
  # ======================================================================
308
  # Main generator – thin orchestrator that wires stages together.
309
  # ======================================================================
@@ -318,6 +388,7 @@ def muon_chunk_pipeline(
318
  lr: float,
319
  weight_decay: float,
320
  none_grad: bool,
 
321
  ) -> Generator[None, None, None]:
322
  """Process one chunk of parameters through the full Muon pipeline.
323
 
@@ -334,9 +405,12 @@ def muon_chunk_pipeline(
334
  runs concurrently on the NCCL stream — no separate ``comm_stream``
335
  is required.
336
 
 
 
 
337
  Yields exactly **2** times:
338
 
339
- 1. After launching async all-to-all gather.
340
  2. After launching async all-to-all scatter.
341
  """
342
  process_group = param_to_state[id(params[0])].process_group
@@ -345,15 +419,19 @@ def muon_chunk_pipeline(
345
  p for p in params if param_to_state[id(p)].worker_rank == rank
346
  ]
347
 
348
- # Stages 1-2: launch async gather.
349
- with record_function("muon::launch_gather"):
350
- work, recv_buf, gathered_grads, recv_counts = _launch_gather(
351
- params, owned_params, param_to_state, rank, num_ranks,
352
- process_group)
353
-
354
- if none_grad:
355
- for p in params:
356
- p.grad = None
 
 
 
 
357
 
358
  yield # --- YIELD 1: other chunks can launch their gather ---
359
 
 
6
  from torch.distributed.tensor import DTensor
7
  from torch.profiler import record_function
8
 
9
+ from .core import _muon_state, adjust_lr_for_muon
10
+ from .newton_schulz import COMM_DTYPE, zeropower_via_newtonschulz5
11
  from .qk_clip import compute_scales
12
 
13
  logger = logging.getLogger(__name__)
 
45
  else:
46
  gathered_grads[id(p)] = None
47
 
48
+ # Build send buffer – batch grad copies via torch.cat
49
+ # (1-2 fused kernels vs N individual narrow().copy_() calls).
50
  send_counts = [0] * num_ranks
 
51
  for p in params:
52
  state = param_to_state[id(p)]
53
+ send_counts[state.worker_rank] += state.rank_numels[rank]
54
+
55
+ total_send = sum(send_counts)
56
+ if total_send > 0:
57
+ # Group grad slices by destination rank in a single pass.
58
+ dst_to_grads = [[] for _ in range(num_ranks)]
59
+ for p in params:
60
+ state = param_to_state[id(p)]
61
+ n = state.rank_numels[rank]
62
+ if n > 0:
63
+ g = p.grad.to_local()
64
+ dst_to_grads[state.worker_rank].append(g.reshape(-1))
65
+
66
+ # Flatten in dst order and cat once.
67
+ all_slices = []
68
+ for dst in range(num_ranks):
69
+ all_slices.extend(dst_to_grads[dst])
70
+ send_buf = torch.cat(all_slices)
71
+ if send_buf.dtype != COMM_DTYPE:
72
+ send_buf = send_buf.to(COMM_DTYPE)
73
+ else:
74
+ send_buf = torch.empty(0, dtype=COMM_DTYPE, device="cuda")
75
 
76
  # Build recv buffer
77
  recv_counts = [0] * num_ranks
 
127
 
128
  shard_view = gathered_grads[id(p)][indices]
129
  n = shard_view.numel()
130
+ if n == 0:
131
+ continue
132
 
133
  sg = recv_buf.narrow(0, off + inner_off, n)
134
  sg = sg.reshape(shard_view.shape)
 
151
  """
152
  computed_us: dict[int, torch.Tensor | None] = {}
153
  for p in owned_params:
154
+ u = zeropower_via_newtonschulz5(gathered_grads[id(p)], ns_steps)
155
  gathered_grads[id(p)] = None # free gathered grad
156
  computed_us[id(p)] = u
157
  return computed_us
 
171
  Returns:
172
  work: Async operation handle.
173
  recv_buf: Flat receive buffer (needed by ``_complete_scatter``).
174
+ scattered_us: Empty dict, populated by ``_complete_scatter`` with
175
+ zero-copy views into ``recv_buf``.
176
  recv_counts: Per-source-rank element counts.
177
  """
178
+ # scattered_us is populated by _complete_scatter with zero-copy views
179
+ # into recv_buf, avoiding N empty_like allocations + N copy_ calls.
180
+ # Pre-seed entries for params whose local shard is empty (rank_numels == 0)
181
+ # so _update_params can iterate all params without KeyError.
182
  scattered_us: dict[int, torch.Tensor] = {}
183
  for p in params:
184
+ if param_to_state[id(p)].rank_numels[rank] == 0:
185
+ scattered_us[id(p)] = torch.empty_like(p.to_local(),
186
+ dtype=COMM_DTYPE)
187
 
188
+ # Build send buffer batch via torch.cat
189
+ # (1 fused kernel vs N*num_ranks individual narrow().copy_() calls).
190
  send_counts = [0] * num_ranks
 
191
  if owned_params:
192
  for p in owned_params:
193
  state = param_to_state[id(p)]
 
 
 
 
 
194
  for dst_rank in range(num_ranks):
195
+ send_counts[dst_rank] += state.rank_numels[dst_rank]
 
 
 
 
196
 
197
+ total_send = sum(send_counts)
198
+ if total_send > 0:
199
+ # Cache u_full conversions to avoid redundant .to() per dst_rank.
200
+ u_fulls = {}
201
+ for p in owned_params:
202
+ u_fulls[id(p)] = computed_us[id(p)].to(COMM_DTYPE).contiguous()
203
+
204
+ # Collect slices in dst order (matches all-to-all send layout).
205
+ all_slices = []
206
+ for dst_rank in range(num_ranks):
207
+ for p in owned_params:
208
+ state = param_to_state[id(p)]
209
+ su = u_fulls[id(p)][state.rank_indices[dst_rank]].flatten()
210
+ if su.numel() > 0:
211
+ all_slices.append(su)
212
+
213
+ send_buf = torch.cat(all_slices) if all_slices else torch.empty(
214
+ 0, dtype=COMM_DTYPE, device="cuda")
215
  else:
216
  send_buf = torch.empty(0, dtype=COMM_DTYPE, device="cuda")
217
 
 
227
  recv_counts[src] = total
228
 
229
  recv_total = sum(recv_counts)
 
230
  recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")
231
 
232
  # Launch async all-to-all
 
250
  rank: int,
251
  scattered_us: dict[int, torch.Tensor],
252
  ) -> None:
253
+ """Populate scattered_us with zero-copy views into recv_buf.
254
+
255
+ Instead of pre-allocating tensors and copying, we assign views directly
256
+ from ``recv_buf``. This eliminates N ``empty_like`` + N ``copy_`` calls.
257
+ The underlying storage of ``recv_buf`` is kept alive through the views
258
+ until ``scattered_us`` is cleared after ``_update_params``.
259
+ """
260
  off = 0
261
  for src in range(len(recv_counts)):
262
  block = recv_counts[src]
 
269
  if state.worker_rank != src:
270
  continue
271
  n = state.rank_numels[rank]
272
+ if n == 0:
273
+ continue
274
 
275
+ scattered_us[id(p)] = recv_buf.narrow(0, off + inner_off,
276
+ n).view_as(p.to_local())
 
277
 
278
  inner_off += n
279
 
 
289
  lr: float,
290
  weight_decay: float,
291
  ) -> None:
292
+ """Apply weight decay, Muon update, and optional QK clipping.
 
 
 
 
 
 
 
293
 
294
+ Uses batched ``_foreach_mul_`` for weight decay and batched
295
+ ``_foreach_add_`` for the Muon update, grouping parameters by
296
+ adjusted_lr to minimize kernel launches while preserving float32
297
+ precision for the alpha scaling.
298
+ """
299
+ if not params:
300
+ return
301
+
302
+ # Batched weight decay: p *= (1 - lr * wd) — single fused kernel.
303
+ p_locals = [p._local_tensor for p in params]
304
+ torch._foreach_mul_(p_locals, 1.0 - lr * weight_decay)
305
+
306
+ # Group params by adjusted_lr so _foreach_add_ can use a single
307
+ # alpha per group (preserves float32 precision for alpha scaling).
308
+ lr_groups: dict[float, tuple[list, list]] = {}
309
+ for p in params:
310
  adjusted_lr = adjust_lr_for_muon(lr, p.shape)
311
+ if adjusted_lr not in lr_groups:
312
+ lr_groups[adjusted_lr] = ([], [])
313
+ lr_groups[adjusted_lr][0].append(p._local_tensor)
314
+ lr_groups[adjusted_lr][1].append(scattered_us[id(p)])
315
 
316
+ for adjusted_lr, (p_group, u_group) in lr_groups.items():
317
+ torch._foreach_add_(p_group, u_group, alpha=-adjusted_lr)
318
+
319
+ # QK clipping – applied directly on the local tensor to
320
+ # avoid DTensor sharding-propagation issues with _StridedShard.
321
+ for p in params:
322
+ state = param_to_state[id(p)]
323
+ if state.qk_clip_state is None:
324
+ continue
325
+ scales_full = compute_scales(p, state.qk_clip_state)
326
  if scales_full is not None:
327
  ratio = p.shape[0] // scales_full.shape[0]
328
  idx0 = state.rank_indices[rank][0]
 
335
  p._local_tensor.mul_(row_scales.view(-1, 1))
336
 
337
 
338
+ # ======================================================================
339
+ # Pre-launch helper for overlapping first chunk's gather with other work.
340
+ # ======================================================================
341
+
342
+
343
@torch.no_grad()
def prelaunch_first_gather(
    params: list[DTensor],
    param_to_state: dict[int, _muon_state],
    rank: int,
    none_grad: bool,
) -> tuple[dist.Work, torch.Tensor, dict[int, torch.Tensor | None], list[int]]:
    """Kick off the first chunk's A2A gather ahead of heavy GPU compute.

    Call this *before* expensive default-stream work (e.g. the batched
    expert NS) so the NCCL all-to-all overlaps with compute on the NCCL
    stream.

    Returns the same 4-tuple ``_launch_gather`` produces; hand it to
    :func:`muon_chunk_pipeline` as ``prelaunch_gather``.
    """
    pg = param_to_state[id(params[0])].process_group
    world_size = dist.get_world_size(group=pg)
    owned = [p for p in params if param_to_state[id(p)].worker_rank == rank]

    with record_function("muon::prelaunch_gather"):
        result = _launch_gather(params, owned, param_to_state, rank,
                                world_size, pg)

    if none_grad:
        for p in params:
            p.grad = None

    return result
375
+
376
+
377
  # ======================================================================
378
  # Main generator – thin orchestrator that wires stages together.
379
  # ======================================================================
 
388
  lr: float,
389
  weight_decay: float,
390
  none_grad: bool,
391
+ prelaunch_gather: tuple | None = None,
392
  ) -> Generator[None, None, None]:
393
  """Process one chunk of parameters through the full Muon pipeline.
394
 
 
405
  runs concurrently on the NCCL stream — no separate ``comm_stream``
406
  is required.
407
 
408
+ If ``prelaunch_gather`` is provided, the gather was already launched
409
+ by :func:`prelaunch_first_gather` and we skip launching it again.
410
+
411
  Yields exactly **2** times:
412
 
413
+ 1. After launching async all-to-all gather (or immediately if pre-launched).
414
  2. After launching async all-to-all scatter.
415
  """
416
  process_group = param_to_state[id(params[0])].process_group
 
419
  p for p in params if param_to_state[id(p)].worker_rank == rank
420
  ]
421
 
422
+ if prelaunch_gather is not None:
423
+ # Gather was pre-launched; none_grad already handled by caller.
424
+ work, recv_buf, gathered_grads, recv_counts = prelaunch_gather
425
+ else:
426
+ # Normal path: launch async gather.
427
+ with record_function("muon::launch_gather"):
428
+ work, recv_buf, gathered_grads, recv_counts = _launch_gather(
429
+ params, owned_params, param_to_state, rank, num_ranks,
430
+ process_group)
431
+
432
+ if none_grad:
433
+ for p in params:
434
+ p.grad = None
435
 
436
  yield # --- YIELD 1: other chunks can launch their gather ---
437
 
build/torch210-cxx11-cu128-x86_64-linux/qk_clip.py CHANGED
@@ -5,6 +5,8 @@ from dataclasses import dataclass
5
  import torch
6
  from torch.distributed.tensor import DTensor
7
 
 
 
8
  logger = logging.getLogger(__name__)
9
 
10
 
@@ -23,7 +25,7 @@ def parse_qk_layer(name: str) -> tuple[str | None, int]:
23
  'model.7.attn.k_proj.weight' -> ('k_proj', 7)
24
  'model.4.attn.v_proj.weight' -> (None, -1)
25
  """
26
- parts = name.split('.')
27
  if len(parts) < 3:
28
  return None, -1
29
 
@@ -100,23 +102,27 @@ def compute_scales(p, qk_clip_state):
100
  threshold = qk_clip_state.threshold
101
  logit = qk_clip_state.logit
102
 
103
- H_global = p.shape[0] // head_dim
104
- scales_full = torch.ones(H_global, device=p.data.device)
105
- scaling = 0
106
-
107
  for logit_idx, head_idx in enumerate(indices):
108
  v_ele = float(logit[logit_idx])
109
  if v_ele > threshold:
110
  new_scale = math.sqrt(threshold / v_ele)
111
- if new_scale < scales_full[head_idx]:
112
- scales_full[head_idx] = new_scale
113
  logger.info(
114
  f"[{kind}] Head {head_idx} exceeded threshold "
115
  f"(value={v_ele:.4f}, threshold={threshold:.4f}) -> applying scale={new_scale:.4f}"
116
  )
117
- scaling += 1
118
 
119
- return scales_full if scaling > 0 else None
 
 
 
 
 
 
 
120
 
121
 
122
  def qk_clip(p, scales, head_dim):
 
5
  import torch
6
  from torch.distributed.tensor import DTensor
7
 
8
+ from .core import normalize_fqn
9
+
10
  logger = logging.getLogger(__name__)
11
 
12
 
 
25
  'model.7.attn.k_proj.weight' -> ('k_proj', 7)
26
  'model.4.attn.v_proj.weight' -> (None, -1)
27
  """
28
+ parts = normalize_fqn(name).split('.')
29
  if len(parts) < 3:
30
  return None, -1
31
 
 
102
  threshold = qk_clip_state.threshold
103
  logit = qk_clip_state.logit
104
 
105
+ # Check if any head exceeds threshold before allocating.
106
+ head_scales = {}
 
 
107
  for logit_idx, head_idx in enumerate(indices):
108
  v_ele = float(logit[logit_idx])
109
  if v_ele > threshold:
110
  new_scale = math.sqrt(threshold / v_ele)
111
+ if head_idx not in head_scales or new_scale < head_scales[head_idx]:
112
+ head_scales[head_idx] = new_scale
113
  logger.info(
114
  f"[{kind}] Head {head_idx} exceeded threshold "
115
  f"(value={v_ele:.4f}, threshold={threshold:.4f}) -> applying scale={new_scale:.4f}"
116
  )
 
117
 
118
+ if not head_scales:
119
+ return None
120
+
121
+ H_global = p.shape[0] // head_dim
122
+ scales_full = torch.ones(H_global, device=p.data.device)
123
+ for head_idx, scale in head_scales.items():
124
+ scales_full[head_idx] = scale
125
+ return scales_full
126
 
127
 
128
  def qk_clip(p, scales, head_dim):
build/torch210-cxx11-cu130-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _optimizer_7aef62f_dirty
3
- ops = torch.ops._optimizer_7aef62f_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_optimizer_7aef62f_dirty::{op_name}"
 
1
  import torch
2
+ from . import _optimizer_5b58933_dirty
3
+ ops = torch.ops._optimizer_5b58933_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_optimizer_5b58933_dirty::{op_name}"
build/torch210-cxx11-cu130-x86_64-linux/{_optimizer_7aef62f_dirty.abi3.so → _optimizer_5b58933_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b9c7bb12bc030d4959e880a959b39ea07eb03e16175d7cf03829f9860f52525d
3
  size 2004728
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6869cfabdf45c7092d251846b3099287f8bccd5c5ebe7edf1a5fd21436324349
3
  size 2004728
build/torch210-cxx11-cu130-x86_64-linux/adamw.py CHANGED
@@ -1,8 +1,12 @@
 
1
  from collections import defaultdict
2
  from typing import cast
3
 
4
  import torch
5
  from torch.distributed.tensor import DTensor
 
 
 
6
 
7
 
8
  def fused_adamw(
@@ -72,54 +76,72 @@ def fused_adamw(
72
  )
73
 
74
 
75
- def step_adamw_params(optimizer_state, params, group):
76
- """Run fused AdamW on a list of parameters sharing the same placement.
 
77
 
78
- Args:
79
- optimizer_state: The optimizer's state dict (self.state in Muon).
80
- params: List of parameters to update.
81
- group: Parameter group dict with lr, adamw_betas, adamw_eps, weight_decay.
82
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  params_with_grads = []
84
  grads = []
85
  moment1 = []
86
  moment2 = []
87
- max_exp_avg_sqs = []
88
  state_steps = []
89
- lr = group["lr"]
90
- beta1, beta2 = group["adamw_betas"]
91
- eps = group["adamw_eps"]
92
- weight_decay = group["weight_decay"]
93
 
94
  for p in params:
95
  g = p.grad
96
  if g is None:
97
  continue
98
  state = optimizer_state[p]
99
- params_with_grads.append(p)
100
- grads.append(g)
101
  if "step" not in state:
102
- state["step"] = (torch.zeros((),
103
- dtype=torch.float32,
104
- device=p.device))
105
  state["moment1"] = torch.zeros_like(g)
106
  state["moment2"] = torch.zeros_like(g)
107
- moment1.append(state["moment1"])
108
- moment2.append(state["moment2"])
109
  if not isinstance(state["step"], torch.Tensor):
110
- step_tensor = torch.tensor(state["step"],
111
- dtype=torch.float32,
112
- device=p.device)
113
- else:
114
- step_tensor = state["step"]
115
- state_steps.append(step_tensor)
 
 
 
 
 
 
116
 
117
  fused_adamw(
118
  params_with_grads,
119
  grads,
120
  moment1,
121
  moment2,
122
- max_exp_avg_sqs,
123
  state_steps,
124
  amsgrad=False,
125
  beta1=beta1,
@@ -131,24 +153,119 @@ def step_adamw_params(optimizer_state, params, group):
131
  )
132
 
133
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  def step_adamw(optimizer_state, group):
135
  """Dispatch AdamW step, grouping parameters by type and placement.
136
 
 
 
 
137
  Args:
138
  optimizer_state: The optimizer's state dict (self.state in Muon).
139
  group: Parameter group dict.
140
  """
141
  params = group["params"]
 
142
 
143
- # group params with its type and placement
144
- placement_to_params: dict[tuple, list[torch.Tensor]] = defaultdict(list)
145
- for p in params:
146
- match p:
147
- case DTensor():
148
- placement_to_params[tuple([p.placements,
149
- p.device_mesh])].append(p)
150
- case torch.Tensor():
151
- placement_to_params[tuple([torch.Tensor, None])].append(p)
152
-
153
- for group_params in placement_to_params.values():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  step_adamw_params(optimizer_state, group_params, group)
 
1
+ import logging
2
  from collections import defaultdict
3
  from typing import cast
4
 
5
  import torch
6
  from torch.distributed.tensor import DTensor
7
+ from torch.profiler import record_function
8
+
9
+ logger = logging.getLogger(__name__)
10
 
11
 
12
  def fused_adamw(
 
76
  )
77
 
78
 
79
+ def _to_local(t):
80
+ """Unwrap DTensor to local tensor for fused ops."""
81
+ return t._local_tensor if isinstance(t, DTensor) else t
82
 
83
+
84
+ # ---------------------------------------------------------------------------
85
+ # Caches for eliminating per-step Python overhead.
86
+ #
87
+ # Placement grouping and tensor list assembly are identical every step
88
+ # (params don't change placement, moment/step tensors are the same objects
89
+ # after initialisation). We cache them keyed by id() of the param list
90
+ # stored in param_groups (stable across steps).
91
+ #
92
+ # Only gradients change each step and must be collected fresh.
93
+ # ---------------------------------------------------------------------------
94
+
95
+ # id(group["params"]) → dict[placement_key, list[param]]
96
+ _placement_cache: dict[int, dict[tuple, list]] = {}
97
+
98
+ # id(placement_group_list) → (params_local, moment1, moment2, state_steps)
99
+ _tensor_cache: dict[int, tuple[list, list, list, list]] = {}
100
+
101
+
102
+ def _step_adamw_params_slow(optimizer_state, params, group):
103
+ """Uncached fallback for the rare case where some params lack grads."""
104
  params_with_grads = []
105
  grads = []
106
  moment1 = []
107
  moment2 = []
 
108
  state_steps = []
 
 
 
 
109
 
110
  for p in params:
111
  g = p.grad
112
  if g is None:
113
  continue
114
  state = optimizer_state[p]
115
+ params_with_grads.append(_to_local(p))
116
+ grads.append(_to_local(g))
117
  if "step" not in state:
118
+ state["step"] = torch.zeros((),
119
+ dtype=torch.float32,
120
+ device=p.device)
121
  state["moment1"] = torch.zeros_like(g)
122
  state["moment2"] = torch.zeros_like(g)
123
+ moment1.append(_to_local(state["moment1"]))
124
+ moment2.append(_to_local(state["moment2"]))
125
  if not isinstance(state["step"], torch.Tensor):
126
+ state["step"] = torch.tensor(state["step"],
127
+ dtype=torch.float32,
128
+ device=p.device)
129
+ state_steps.append(state["step"])
130
+
131
+ if not params_with_grads:
132
+ return
133
+
134
+ lr = group["lr"]
135
+ beta1, beta2 = group["adamw_betas"]
136
+ eps = group["adamw_eps"]
137
+ weight_decay = group["weight_decay"]
138
 
139
  fused_adamw(
140
  params_with_grads,
141
  grads,
142
  moment1,
143
  moment2,
144
+ [],
145
  state_steps,
146
  amsgrad=False,
147
  beta1=beta1,
 
153
  )
154
 
155
 
156
+ def step_adamw_params(optimizer_state, params, group):
157
+ """Run fused AdamW on a list of parameters sharing the same placement.
158
+
159
+ After the first call, cached tensor lists (params_local, moment1,
160
+ moment2, state_steps) are reused — only gradients are collected fresh.
161
+
162
+ Args:
163
+ optimizer_state: The optimizer's state dict (self.state in Muon).
164
+ params: List of parameters to update.
165
+ group: Parameter group dict with lr, adamw_betas, adamw_eps, weight_decay.
166
+ """
167
+ # Collect grads — the only thing that changes each step.
168
+ with record_function("adamw::collect_grads"):
169
+ grads = []
170
+ for p in params:
171
+ g = p.grad
172
+ if g is None:
173
+ # Rare: fall back to slow path that filters per-param.
174
+ _step_adamw_params_slow(optimizer_state, params, group)
175
+ return
176
+ grads.append(_to_local(g))
177
+
178
+ tensor_key = id(params)
179
+ if tensor_key not in _tensor_cache:
180
+ with record_function("adamw::init_tensor_cache"):
181
+ params_local = []
182
+ moment1 = []
183
+ moment2 = []
184
+ state_steps = []
185
+
186
+ for p in params:
187
+ state = optimizer_state[p]
188
+ params_local.append(_to_local(p))
189
+ if "step" not in state:
190
+ state["step"] = torch.zeros((),
191
+ dtype=torch.float32,
192
+ device=p.device)
193
+ state["moment1"] = torch.zeros_like(p.grad)
194
+ state["moment2"] = torch.zeros_like(p.grad)
195
+ moment1.append(_to_local(state["moment1"]))
196
+ moment2.append(_to_local(state["moment2"]))
197
+ if not isinstance(state["step"], torch.Tensor):
198
+ state["step"] = torch.tensor(state["step"],
199
+ dtype=torch.float32,
200
+ device=p.device)
201
+ state_steps.append(state["step"])
202
+
203
+ _tensor_cache[tensor_key] = (params_local, moment1, moment2,
204
+ state_steps)
205
+
206
+ params_local, moment1, moment2, state_steps = _tensor_cache[tensor_key]
207
+
208
+ lr = group["lr"]
209
+ beta1, beta2 = group["adamw_betas"]
210
+ eps = group["adamw_eps"]
211
+ weight_decay = group["weight_decay"]
212
+
213
+ with record_function("adamw::fused_adamw"):
214
+ fused_adamw(
215
+ params_local,
216
+ grads,
217
+ moment1,
218
+ moment2,
219
+ [],
220
+ state_steps,
221
+ amsgrad=False,
222
+ beta1=beta1,
223
+ beta2=beta2,
224
+ lr=lr,
225
+ weight_decay=weight_decay,
226
+ eps=eps,
227
+ maximize=False,
228
+ )
229
+
230
+
231
  def step_adamw(optimizer_state, group):
232
  """Dispatch AdamW step, grouping parameters by type and placement.
233
 
234
+ Placement grouping is cached after the first call since params never
235
+ change their placement between steps.
236
+
237
  Args:
238
  optimizer_state: The optimizer's state dict (self.state in Muon).
239
  group: Parameter group dict.
240
  """
241
  params = group["params"]
242
+ placement_key = id(params)
243
 
244
+ if placement_key not in _placement_cache:
245
+ with record_function("adamw::group_by_placement"):
246
+ placement_to_params: dict[tuple,
247
+ list[torch.Tensor]] = defaultdict(list)
248
+ for p in params:
249
+ match p:
250
+ case DTensor():
251
+ logger.debug(
252
+ "[AdamW] DTensor param: shape=%s, placements=%s, "
253
+ "mesh=%s, grad=%s", p.shape, p.placements,
254
+ p.device_mesh.mesh_dim_names,
255
+ p.grad.shape if p.grad is not None else None)
256
+ placement_to_params[tuple(
257
+ [p.placements, p.device_mesh])].append(p)
258
+ case torch.Tensor():
259
+ logger.debug(
260
+ "[AdamW] plain param: shape=%s, grad=%s", p.shape,
261
+ p.grad.shape if p.grad is not None else None)
262
+ placement_to_params[tuple([torch.Tensor,
263
+ None])].append(p)
264
+
265
+ logger.debug("[AdamW] %d placement groups, %d total params",
266
+ len(placement_to_params), len(params))
267
+
268
+ _placement_cache[placement_key] = dict(placement_to_params)
269
+
270
+ for group_params in _placement_cache[placement_key].values():
271
  step_adamw_params(optimizer_state, group_params, group)
build/torch210-cxx11-cu130-x86_64-linux/core.py CHANGED
@@ -1,11 +1,25 @@
 
1
  import math
2
  from dataclasses import dataclass
 
3
 
4
  import torch
5
- import torch.distributed as dist
6
  from torch.distributed import ProcessGroup
7
  from torch.distributed.tensor import DTensor
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
  @dataclass
11
  class _muon_state:
@@ -17,26 +31,71 @@ class _muon_state:
17
  qk_clip_state: torch.Tensor | None = None
18
 
19
 
20
- def update_g(optimizer_state, p, g, group, momentum):
21
- """Apply momentum update to gradient.
 
 
 
 
 
 
22
 
23
- Args:
24
- optimizer_state: The optimizer's state dict (self.state in Muon).
25
- p: Parameter tensor.
26
- g: Gradient tensor.
27
- group: Parameter group dict.
28
- momentum: Momentum coefficient.
29
 
30
- Returns:
31
- Momentum-updated gradient tensor.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  """
33
- state = optimizer_state[p]
34
- buf = state.setdefault("momentum_buffer", torch.zeros_like(g))
35
- torch.add(g, buf, alpha=momentum, out=buf)
36
- if group["nesterov"]:
37
- g.add_(buf, alpha=momentum)
38
- return g
39
- return buf
 
 
 
 
 
 
 
 
 
 
 
40
 
41
 
42
  def update_p(p, u, lr, adjusted_lr, weight_decay):
@@ -49,14 +108,13 @@ def update_p(p, u, lr, adjusted_lr, weight_decay):
49
  adjusted_lr: Size-adjusted learning rate.
50
  weight_decay: Weight decay coefficient.
51
  """
52
- if isinstance(p, torch.nn.Parameter):
53
- # apply weight decay
54
- p.data.mul_(1 - lr * weight_decay)
55
- # apply update
56
- p.data.add_(u, alpha=-adjusted_lr)
57
- else:
58
- p.mul_(1 - lr * weight_decay)
59
- p.add_(u, alpha=-adjusted_lr)
60
 
61
 
62
  def adjust_lr_for_muon(lr, param_shape):
@@ -77,14 +135,55 @@ def adjust_lr_for_muon(lr, param_shape):
77
  return adjusted_lr
78
 
79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  def default_is_muon(name, x, expert_keys=None):
81
- skip_keys = ["embed_tokens", "lm_head", "tok_embeddings", "output"]
82
- if any(key in name for key in skip_keys):
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  return False
84
  effective_ndim = x.ndim
85
- if expert_keys and any(key in name for key in expert_keys):
 
86
  effective_ndim -= 1
87
- return effective_ndim >= 2
 
 
 
 
 
88
 
89
 
90
  def get_default_muon_param_groups(model, is_muon_func=None, expert_keys=None):
@@ -92,7 +191,7 @@ def get_default_muon_param_groups(model, is_muon_func=None, expert_keys=None):
92
  is_muon_func = lambda n, x: default_is_muon(n, x, expert_keys)
93
 
94
  muon_params, muon_names = [], []
95
- non_muon_params = []
96
 
97
  for n, p in model.named_parameters():
98
  if not p.requires_grad:
@@ -102,6 +201,10 @@ def get_default_muon_param_groups(model, is_muon_func=None, expert_keys=None):
102
  muon_names.append(n)
103
  else:
104
  non_muon_params.append(p)
 
 
 
 
105
 
106
  return [
107
  {
 
1
+ import logging
2
  import math
3
  from dataclasses import dataclass
4
+ from typing import List
5
 
6
  import torch
 
7
  from torch.distributed import ProcessGroup
8
  from torch.distributed.tensor import DTensor
9
 
10
+ # torch.compile wraps modules as OptimizedModule, inserting "_orig_mod" into
11
+ # parameter FQNs. Activation checkpointing similarly inserts
12
+ # "_checkpoint_wrapped_module". Strip these so name-based matching (skip_keys,
13
+ # expert_keys, QK layer parsing) works regardless of wrapper nesting.
14
+ _WRAPPER_PARTS = frozenset({"_orig_mod", "_checkpoint_wrapped_module"})
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
+ def normalize_fqn(name: str) -> str:
20
+ """Strip torch.compile / checkpoint wrapper components from a parameter FQN."""
21
+ return ".".join(p for p in name.split(".") if p not in _WRAPPER_PARTS)
22
+
23
 
24
  @dataclass
25
  class _muon_state:
 
31
  qk_clip_state: torch.Tensor | None = None
32
 
33
 
34
+ def _batch_momentum(
35
+ grads: List[torch.Tensor],
36
+ momentum_bufs: List[torch.Tensor],
37
+ momentum: torch.Tensor,
38
+ ) -> None:
39
+ """Batched momentum update (no nesterov)."""
40
+ torch._foreach_mul_(momentum_bufs, momentum)
41
+ torch._foreach_add_(momentum_bufs, grads)
42
 
 
 
 
 
 
 
43
 
44
+ def _batch_momentum_nesterov(
45
+ grads: List[torch.Tensor],
46
+ momentum_bufs: List[torch.Tensor],
47
+ momentum: torch.Tensor,
48
+ ) -> None:
49
+ """Batched momentum update with nesterov correction."""
50
+ torch._foreach_mul_(momentum_bufs, momentum)
51
+ torch._foreach_add_(momentum_bufs, grads)
52
+ nesterov_terms = torch._foreach_mul(momentum_bufs, momentum)
53
+ torch._foreach_add_(grads, nesterov_terms)
54
+
55
+
56
+ _compiled_momentum: dict[bool, callable] = {}
57
+ _use_momentum_compile = True
58
+
59
+
60
+ def set_momentum_compile(enabled: bool):
61
+ """Toggle torch.compile for batched momentum."""
62
+ global _use_momentum_compile
63
+ _use_momentum_compile = enabled
64
+
65
+
66
+ def batch_pre_ortho(
67
+ grads: List[torch.Tensor],
68
+ momentum_bufs: List[torch.Tensor],
69
+ momentum: torch.Tensor,
70
+ nesterov: bool,
71
+ ) -> None:
72
+ """Batched momentum update on lists of plain tensors.
73
+
74
+ Mirrors dion's ``muon_update_pre_orthogonalize``.
75
+ Inputs must be plain CUDA tensors (not DTensor).
76
+ Modifies ``momentum_bufs`` and (for nesterov) ``grads`` in-place.
77
+
78
+ When compile is enabled, uses separately compiled functions for
79
+ nesterov=True/False to avoid graph breaks from the branch.
80
  """
81
+ fn = _batch_momentum_nesterov if nesterov else _batch_momentum
82
+ if _use_momentum_compile:
83
+ if nesterov not in _compiled_momentum:
84
+ _compiled_momentum[nesterov] = torch.compile(fn)
85
+ fn = _compiled_momentum[nesterov]
86
+ fn(grads, momentum_bufs, momentum)
87
+
88
+
89
+ def _update_p_impl(p_data, u_data, lr, adjusted_lr, weight_decay):
90
+ """Weight-decay + update on plain tensors.
91
+
92
+ Not compiled: per-param @torch.compile caused ~0.25ms TorchDynamo cache
93
+ lookup per call × 256+ params = massive overhead. The pipeline path uses
94
+ batched _foreach_* ops instead; this function remains for base() and
95
+ distributed_muon().
96
+ """
97
+ p_data.mul_(1 - lr * weight_decay)
98
+ p_data.add_(u_data, alpha=-adjusted_lr)
99
 
100
 
101
  def update_p(p, u, lr, adjusted_lr, weight_decay):
 
108
  adjusted_lr: Size-adjusted learning rate.
109
  weight_decay: Weight decay coefficient.
110
  """
111
+ # Unwrap Parameter -> underlying data tensor.
112
+ p_data = p.data if isinstance(p, torch.nn.Parameter) else p
113
+ # Unwrap DTensor -> local CUDA tensor for compiled kernel.
114
+ if isinstance(p_data, DTensor):
115
+ p_data = p_data._local_tensor
116
+ u_data = u._local_tensor if isinstance(u, DTensor) else u
117
+ _update_p_impl(p_data, u_data, lr, adjusted_lr, weight_decay)
 
118
 
119
 
120
  def adjust_lr_for_muon(lr, param_shape):
 
135
  return adjusted_lr
136
 
137
 
138
+ def _match_key(parts, key):
139
+ """Check if key matches as contiguous components in parts.
140
+
141
+ Single-component keys (e.g. "experts") match any single component.
142
+ Multi-component keys (e.g. "experts.w1") match as a contiguous subsequence.
143
+ """
144
+ key_parts = key.split(".")
145
+ key_len = len(key_parts)
146
+ if key_len == 1:
147
+ return key in parts
148
+ return any(parts[i:i + key_len] == key_parts
149
+ for i in range(len(parts) - key_len + 1))
150
+
151
+
152
+ def is_expert_param(name, expert_keys):
153
+ """Check if a parameter name matches any expert key (component-level)."""
154
+ if not expert_keys:
155
+ return False
156
+ parts = normalize_fqn(name).split(".")
157
+ return any(_match_key(parts, key) for key in expert_keys)
158
+
159
+
160
  def default_is_muon(name, x, expert_keys=None):
161
+ normalized = normalize_fqn(name)
162
+ parts = normalized.split(".")
163
+ skip_keys = [
164
+ "embed_tokens",
165
+ "lm_head",
166
+ "tok_embeddings",
167
+ "output",
168
+ "mhc_attn",
169
+ "mhc_ffn",
170
+ "lambda_proj",
171
+ ]
172
+ if any(key in parts for key in skip_keys):
173
+ logger.info(
174
+ "[is_muon] %s (orig: %s): skip (matched skip_key), ndim=%d",
175
+ normalized, name, x.ndim)
176
  return False
177
  effective_ndim = x.ndim
178
+ is_expert = is_expert_param(name, expert_keys)
179
+ if is_expert:
180
  effective_ndim -= 1
181
+ result = effective_ndim >= 2
182
+ logger.info(
183
+ "[is_muon] %s (orig: %s): ndim=%d, expert=%s, effective_ndim=%d → %s",
184
+ normalized, name, x.ndim, is_expert, effective_ndim,
185
+ "Muon" if result else "AdamW")
186
+ return result
187
 
188
 
189
  def get_default_muon_param_groups(model, is_muon_func=None, expert_keys=None):
 
191
  is_muon_func = lambda n, x: default_is_muon(n, x, expert_keys)
192
 
193
  muon_params, muon_names = [], []
194
+ non_muon_params, non_muon_names = [], []
195
 
196
  for n, p in model.named_parameters():
197
  if not p.requires_grad:
 
201
  muon_names.append(n)
202
  else:
203
  non_muon_params.append(p)
204
+ non_muon_names.append(n)
205
+
206
+ logger.info("[param_groups] expert_keys=%s, Muon=%d, AdamW=%d",
207
+ expert_keys, len(muon_names), len(non_muon_names))
208
 
209
  return [
210
  {
build/torch210-cxx11-cu130-x86_64-linux/cpu_offload.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """CPU offloading for optimizer states.
2
+
3
+ Manages a pinned CPU memory pool and async CUDA streams to offload
4
+ optimizer state tensors (momentum buffers, Adam moments) to CPU between
5
+ optimizer steps, freeing GPU memory.
6
+
7
+ All tracked tensors are packed into a single flat pinned CPU buffer
8
+ (per dtype). D2H and H2D copies are performed per-tensor directly
9
+ between individual GPU tensors and their slice of the CPU flat buffer
10
+ — no GPU staging buffer is allocated, so there is **no temporary GPU
11
+ memory spike** during offload or reload.
12
+
13
+ Individual tensor storages are freed after offload via
14
+ ``untyped_storage().resize_(0)``, preserving tensor identity so
15
+ downstream caches remain valid.
16
+ """
17
+
18
+ import logging
19
+ from collections import defaultdict
20
+
21
+ import torch
22
+ from torch.distributed.tensor import DTensor
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+
27
+ class CPUOffloadPool:
28
+ """Pinned CPU memory pool for async optimizer state offloading.
29
+
30
+ Tracked tensors are grouped by dtype. Each group gets a single flat
31
+ pinned CPU buffer. D2H / H2D copies are per-tensor (into slices of
32
+ the flat buffer) to avoid allocating a GPU staging buffer.
33
+ """
34
+
35
+ def __init__(self):
36
+ self._managed: list[torch.Tensor] = []
37
+ self._storage_nbytes: dict[int, int] = {} # id(t) → bytes
38
+
39
+ # Per-dtype group: populated on first offload.
40
+ # dtype → dict with keys:
41
+ # "indices" : list[int] managed-list indices
42
+ # "offsets" : list[tuple[int,int]] (start, numel) in flat buf
43
+ # "total" : int total numel
44
+ # "cpu_flat" : Tensor pinned CPU buffer
45
+ self._groups: dict[torch.dtype, dict] = {}
46
+
47
+ self._offload_stream: torch.cuda.Stream | None = None
48
+ self._device: torch.device | None = None
49
+ self._initialized: bool = False
50
+ self._logged: bool = False
51
+
52
+ # ------------------------------------------------------------------
53
+ @staticmethod
54
+ def _local(t: torch.Tensor) -> torch.Tensor:
55
+ """Unwrap DTensor to its local CUDA tensor."""
56
+ return t._local_tensor if isinstance(t, DTensor) else t
57
+
58
+ def _ensure_stream(self):
59
+ if self._offload_stream is None:
60
+ self._offload_stream = torch.cuda.Stream(device=self._device)
61
+
62
+ # ------------------------------------------------------------------
63
+ def track(self, tensor: torch.Tensor):
64
+ """Register a GPU tensor for CPU offloading. Idempotent."""
65
+ tid = id(tensor)
66
+ if tid in self._storage_nbytes:
67
+ return
68
+ local = self._local(tensor)
69
+ if self._device is None:
70
+ self._device = local.device
71
+ self._storage_nbytes[tid] = local.untyped_storage().size()
72
+ self._managed.append(tensor)
73
+
74
+ # ------------------------------------------------------------------
75
+ def _init_buffers(self):
76
+ """Build per-dtype flat buffers on first offload."""
77
+ # Group managed tensors by dtype.
78
+ dtype_map: dict[torch.dtype, list[tuple[int, int]]] = defaultdict(list)
79
+ for idx, t in enumerate(self._managed):
80
+ local = self._local(t)
81
+ dtype_map[local.dtype].append((idx, local.numel()))
82
+
83
+ total_cpu_bytes = 0
84
+ for dtype, entries in dtype_map.items():
85
+ offsets: list[tuple[int, int]] = []
86
+ indices: list[int] = []
87
+ off = 0
88
+ for idx, n in entries:
89
+ indices.append(idx)
90
+ offsets.append((off, n))
91
+ off += n
92
+ cpu_flat = torch.empty(off, dtype=dtype, device="cpu", pin_memory=True)
93
+ self._groups[dtype] = {
94
+ "indices": indices,
95
+ "offsets": offsets,
96
+ "total": off,
97
+ "cpu_flat": cpu_flat,
98
+ }
99
+ total_cpu_bytes += off * cpu_flat.element_size()
100
+
101
+ self._initialized = True
102
+ logger.info(
103
+ "[CPUOffload] Pool initialized: %d tensors, %d dtype group(s), "
104
+ "%.2f MB pinned CPU memory",
105
+ len(self._managed),
106
+ len(self._groups),
107
+ total_cpu_bytes / (1024**2),
108
+ )
109
+
110
+ # ------------------------------------------------------------------
111
+ def offload(self):
112
+ """Per-tensor async D2H into CPU flat buffer, then free GPU storage."""
113
+ if not self._managed:
114
+ return
115
+ if not self._initialized:
116
+ self._init_buffers()
117
+ self._ensure_stream()
118
+
119
+ # Offload stream waits for compute to finish.
120
+ compute_event = torch.cuda.current_stream(
121
+ self._device).record_event()
122
+ self._offload_stream.wait_event(compute_event)
123
+
124
+ offloaded_bytes = 0
125
+
126
+ # Per-tensor D2H copies directly into CPU flat buffer slices.
127
+ # No GPU staging buffer → no temporary GPU memory spike.
128
+ with torch.cuda.stream(self._offload_stream):
129
+ for dtype, grp in self._groups.items():
130
+ indices = grp["indices"]
131
+ offsets = grp["offsets"]
132
+ cpu_flat = grp["cpu_flat"]
133
+
134
+ for i, mgd_idx in enumerate(indices):
135
+ local = self._local(self._managed[mgd_idx])
136
+ off, n = offsets[i]
137
+ cpu_flat[off:off + n].copy_(
138
+ local.reshape(-1), non_blocking=True)
139
+
140
+ offloaded_bytes += grp["total"] * cpu_flat.element_size()
141
+
142
+ # Wait for all D2H copies to land, then free GPU storage.
143
+ self._offload_stream.synchronize()
144
+ for t in self._managed:
145
+ self._local(t).untyped_storage().resize_(0)
146
+
147
+ if not self._logged:
148
+ logger.info("[CPUOffload] Offloaded %.2f MB (GPU → CPU)",
149
+ offloaded_bytes / (1024**2))
150
+
151
+ # ------------------------------------------------------------------
152
+ def reload(self):
153
+ """Per-tensor H2D from CPU flat buffer on the default stream.
154
+
155
+ Runs on the current (default) CUDA stream to avoid stream
156
+ interaction issues with the parallel Muon pipeline. Since
157
+ pinned CPU memory is the source, the copies overlap with
158
+ GPU idle time between steps.
159
+ """
160
+ if not self._managed or not self._initialized:
161
+ return
162
+
163
+ reloaded_bytes = 0
164
+
165
+ # Re-allocate all GPU storages first.
166
+ for t in self._managed:
167
+ local = self._local(t)
168
+ local.untyped_storage().resize_(self._storage_nbytes[id(t)])
169
+
170
+ # Per-tensor H2D copies from CPU flat buffer slices.
171
+ # non_blocking=True with pinned source allows DMA overlap.
172
+ for dtype, grp in self._groups.items():
173
+ indices = grp["indices"]
174
+ offsets = grp["offsets"]
175
+ cpu_flat = grp["cpu_flat"]
176
+
177
+ for i, mgd_idx in enumerate(indices):
178
+ local = self._local(self._managed[mgd_idx])
179
+ off, n = offsets[i]
180
+ local.reshape(-1).copy_(
181
+ cpu_flat[off:off + n], non_blocking=True)
182
+
183
+ reloaded_bytes += grp["total"] * cpu_flat.element_size()
184
+
185
+ if not self._logged:
186
+ logger.info("[CPUOffload] Reloaded %.2f MB (CPU → GPU)",
187
+ reloaded_bytes / (1024**2))
188
+ self._logged = True
build/torch210-cxx11-cu130-x86_64-linux/distributed/utils.py CHANGED
@@ -72,12 +72,6 @@ def get_slices_of_dtensor(
72
  else:
73
  curr_size = target.size()[shard_dim]
74
 
75
- if curr_size % num_chunks != 0:
76
- raise NotImplementedError(
77
- f"Dimension size {curr_size} is not divisible "
78
- f"by number of ranks {num_chunks} for shard "
79
- f"placement on dim {shard_dim}. (shape: {target.shape})")
80
-
81
  # Compute indices for this level of sharding
82
  if isinstance(placement, _StridedShard):
83
  _shard_size, offsets = _StridedShard.local_shard_size_and_offset(
 
72
  else:
73
  curr_size = target.size()[shard_dim]
74
 
 
 
 
 
 
 
75
  # Compute indices for this level of sharding
76
  if isinstance(placement, _StridedShard):
77
  _shard_size, offsets = _StridedShard.local_shard_size_and_offset(
build/torch210-cxx11-cu130-x86_64-linux/matmul_transpose_triton.py CHANGED
@@ -43,6 +43,7 @@ def get_autotune_config():
43
  @triton.autotune(
44
  configs=get_autotune_config(),
45
  key=['M', 'K'],
 
46
  )
47
  @triton.jit
48
  def mmt_kernel(x, y, M, K, stride_xm, stride_xk, stride_ym, stride_yn,
@@ -102,16 +103,10 @@ def mmt_kernel(x, y, M, K, stride_xm, stride_xk, stride_ym, stride_yn,
102
  tl.store(ct_ptrs, tl.permute(c, (1, 0)), mask=ct_mask)
103
 
104
 
105
- def matmul_transpose_assign(d_in, d_out):
106
- assert d_in.is_cuda, "Input `d_in` must be a CUDA tensor"
107
- assert d_out.is_cuda, "Input `d_out` must be a CUDA tensor"
108
- assert d_in.device == d_out.device, "Inputs `d_in` and `d_out` must be on the same CUDA device"
109
- assert d_in.dtype == d_out.dtype, "Inputs must have the same data type"
110
- assert d_in.ndim == 2, "Input `d_in` must be a 2D tensor"
111
- assert d_out.ndim == 2, "Input `d_out` must be a 2D tensor"
112
- assert d_in.size(0) == d_out.size(0) == d_out.size(0), \
113
- "First dimension of `d_in` must match first and second dimension of `d_out`"
114
-
115
  d_in = d_in.contiguous()
116
  M, K = d_in.shape
117
  grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(
@@ -119,3 +114,9 @@ def matmul_transpose_assign(d_in, d_out):
119
  with torch.cuda.device(d_in.device.index):
120
  mmt_kernel[grid](d_in, d_out, M, K, d_in.stride(0), d_in.stride(1),
121
  d_out.stride(0), d_out.stride(1))
 
 
 
 
 
 
 
43
  @triton.autotune(
44
  configs=get_autotune_config(),
45
  key=['M', 'K'],
46
+ restore_value=['y'],
47
  )
48
  @triton.jit
49
  def mmt_kernel(x, y, M, K, stride_xm, stride_xk, stride_ym, stride_yn,
 
103
  tl.store(ct_ptrs, tl.permute(c, (1, 0)), mask=ct_mask)
104
 
105
 
106
+ @torch.library.custom_op("muon::matmul_transpose_assign",
107
+ mutates_args=("d_out", ))
108
+ def matmul_transpose_assign(d_in: torch.Tensor, d_out: torch.Tensor) -> None:
109
+ """Compute d_out = d_in @ d_in.T using an optimized Triton kernel."""
 
 
 
 
 
 
110
  d_in = d_in.contiguous()
111
  M, K = d_in.shape
112
  grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(
 
114
  with torch.cuda.device(d_in.device.index):
115
  mmt_kernel[grid](d_in, d_out, M, K, d_in.stride(0), d_in.stride(1),
116
  d_out.stride(0), d_out.stride(1))
117
+
118
+
119
+ @matmul_transpose_assign.register_fake
120
+ def _(d_in: torch.Tensor, d_out: torch.Tensor) -> None:
121
+ """FakeTensor impl: d_out is already allocated, mutation is declared."""
122
+ pass
build/torch210-cxx11-cu130-x86_64-linux/muon.py CHANGED
@@ -10,13 +10,16 @@ from torch.profiler import record_function
10
 
11
  from .adamw import step_adamw
12
  from .async_utils import run_pipeline
13
- from .core import (_muon_state, adjust_lr_for_muon,
14
- get_default_muon_param_groups, update_g, update_p)
 
15
  from .distributed.utils import (_is_shard, construct_shard_mesh,
16
  get_slices_of_dtensor)
17
  from .newton_schulz import (COMM_DTYPE, DEFAULT_CHUNK_SIZE_RATIO,
18
- _zeropower_via_newtonschulz5)
19
- from .pipeline import muon_chunk_pipeline
 
 
20
  from .qk_clip import compute_scales, get_qk_clip_info, qk_clip
21
 
22
  logger = logging.getLogger(__name__)
@@ -45,9 +48,21 @@ def _expand_expert_params(names, params, expert_keys):
45
  expanded_params = []
46
 
47
  for n, p in zip(names, params):
48
- is_expert = expert_keys and any(key in n for key in expert_keys)
49
  is_dtensor = isinstance(p.data, DTensor)
50
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  if not is_expert:
52
  assert p.data.ndim <= 2, (
53
  f"Param {n} has ndim={p.data.ndim} but does not match "
@@ -168,7 +183,6 @@ class Muon(torch.optim.Optimizer):
168
  Use shard ranks * DEFAULT_CHUNK_SIZE_RATIO when -1 is specified.
169
  use_distributed_muon: Use distributed muon by Liu et al. (2024).
170
  For testing purpose only.
171
- small_param_numel_threshold: Threshold for classifying parameters as small and falling back to distributed Muon
172
  expert_keys: List of strings to identify expert-parallel parameters.
173
  If any key appears in a parameter's name, its outermost
174
  dimension is treated as the expert dimension and expanded
@@ -193,8 +207,8 @@ class Muon(torch.optim.Optimizer):
193
  warmup_step=5,
194
  chunk_size=-1,
195
  use_distributed_muon=False,
196
- small_param_numel_threshold=65536,
197
- expert_keys=None):
198
  defaults = dict(
199
  lr=lr,
200
  weight_decay=weight_decay,
@@ -228,8 +242,12 @@ class Muon(torch.optim.Optimizer):
228
  self.warmup_step = warmup_step
229
  self.chunk_size = chunk_size
230
  self.use_distributed_muon = use_distributed_muon
231
- self.small_param_numel_threshold = small_param_numel_threshold
232
  self.expert_keys = expert_keys
 
 
 
 
 
233
 
234
  def _calc_flops(self, G, steps):
235
  assert len(G.shape) == 2
@@ -333,8 +351,8 @@ class Muon(torch.optim.Optimizer):
333
  if g is None:
334
  continue
335
 
336
- u = _zeropower_via_newtonschulz5(g.to(COMM_DTYPE),
337
- steps=group["ns_steps"])
338
 
339
  adjusted_lr = adjust_lr_for_muon(lr, p.shape)
340
  update_p(p, u, lr, adjusted_lr, weight_decay)
@@ -355,52 +373,269 @@ class Muon(torch.optim.Optimizer):
355
  weight_decay: float,
356
  qk_logits: list[torch.Tensor | DTensor] | None,
357
  ):
358
- """ Implementation of Distributed Muon by Liu et al. """
359
 
360
- # Momentum is already applied by _step_muon before this method.
361
- for n, p in zip(names, params):
362
- g = p.grad
363
- if g is None:
364
- continue
365
-
366
- # Gather G
367
- if isinstance(p.data, DTensor):
368
- g_full = g.full_tensor()
369
- p_full = p.data.full_tensor()
370
- else:
371
- g_full = g
372
- p_full = p
373
-
374
- u_full = _zeropower_via_newtonschulz5(g_full.to(COMM_DTYPE),
375
- steps=group["ns_steps"])
376
-
377
- adjusted_lr = adjust_lr_for_muon(lr, p_full.shape)
378
- update_p(p_full, u_full, lr, adjusted_lr, weight_decay)
379
 
380
- qk_clip_state = get_qk_clip_info(self.clip_config, n, qk_logits)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
381
 
382
- scales_full = compute_scales(
383
- p_full, qk_clip_state) if qk_clip_state is not None else None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
384
 
385
- if scales_full is not None:
386
- qk_clip(p_full, scales_full, qk_clip_state.head_dim)
 
 
387
 
388
- if isinstance(p.data, DTensor):
389
- ndims = len(p.device_mesh.mesh.shape)
390
- p_replicate = DTensor.from_local(
391
- p_full,
392
- device_mesh=p.device_mesh,
393
- placements=[Replicate() for _ in range(ndims)],
394
- )
395
 
396
- p_sharded = p_replicate.redistribute(
397
- device_mesh=p.device_mesh,
398
- placements=p.placements,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
399
  )
400
 
401
- p.copy_(p_sharded)
402
 
403
- def parallel(self, names, params, group, lr, weight_decay, qk_logits):
 
 
 
 
 
 
 
404
  """
405
  Perform a parallel optimization step using Muon.
406
 
@@ -409,31 +644,23 @@ class Muon(torch.optim.Optimizer):
409
  interleaves multiple chunks so that communication and computation
410
  overlap across chunks (the same overlap previously achieved by the
411
  warmup + main-loop index scheduling).
 
 
 
 
412
  """
413
 
414
  # Momentum is already applied by _step_muon before this method.
415
 
416
- param_to_state, ordered_params = self.init_state_and_assign_params(
417
- names, params, group, qk_logits)
418
-
419
- # Compute local rank for this group's shard process group.
420
- shard_pg = param_to_state[id(ordered_params[0])].process_group
421
- rank = dist.get_rank(group=shard_pg)
422
-
423
- if self.chunk_size == -1:
424
- shard_ranks = dist.get_world_size(param_to_state[id(
425
- ordered_params[0])].process_group)
426
- chunk_size = shard_ranks * DEFAULT_CHUNK_SIZE_RATIO
427
- elif self.chunk_size > 0:
428
- chunk_size = self.chunk_size
429
- else:
430
- raise ValueError("chunk_size must be -1 or a positive integer.")
431
 
432
  def pipelines():
 
433
  for start in range(0, len(ordered_params), chunk_size):
434
  chunk = ordered_params[start:start + chunk_size]
435
  if chunk:
436
- yield muon_chunk_pipeline(
437
  params=chunk,
438
  param_to_state=param_to_state,
439
  rank=rank,
@@ -442,9 +669,11 @@ class Muon(torch.optim.Optimizer):
442
  weight_decay=weight_decay,
443
  none_grad=group["none_grad"],
444
  )
 
 
 
 
445
 
446
- with record_function("muon::barrier"):
447
- dist.barrier()
448
  with record_function("muon::pipeline"):
449
  run_pipeline(pipelines(), max_concurrent=self.warmup_step + 1)
450
 
@@ -456,16 +685,152 @@ class Muon(torch.optim.Optimizer):
456
  names = group["names"]
457
 
458
  # Apply momentum to all params before routing/expansion.
 
459
  with record_function("muon::momentum"):
460
- for n, p in zip(names, params):
461
- g = p.grad
462
- if g is None:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
463
  continue
464
- g = update_g(self.state, p, g, group, momentum)
465
- p.grad = g
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
466
 
467
  # Expand expert params by splitting on dim 0.
468
- names, params = _expand_expert_params(names, params, self.expert_keys)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
469
 
470
  param_dtensors = []
471
  name_dtensors = []
@@ -473,10 +838,10 @@ class Muon(torch.optim.Optimizer):
473
  param_tensors = []
474
  name_tensors = []
475
 
476
- param_dtensors_small = []
477
- name_dtensors_small = []
478
-
479
  if self.use_distributed_muon:
 
480
  self.distributed_muon(names=names,
481
  params=params,
482
  group=group,
@@ -485,8 +850,6 @@ class Muon(torch.optim.Optimizer):
485
  qk_logits=qk_logits)
486
  return
487
 
488
- # For simplicity, we use distributed Muon for small parameters
489
- # whose number of elements is below a threshold.
490
  for n, p in zip(names, params):
491
  if p is None or p.grad is None:
492
  continue
@@ -494,23 +857,28 @@ class Muon(torch.optim.Optimizer):
494
  if all(
495
  isinstance(placement, Replicate)
496
  for placement in p.placements):
 
 
 
497
  param_tensors.append(p)
498
  name_tensors.append(n)
499
- elif p.data.numel() <= self.small_param_numel_threshold:
500
- param_dtensors_small.append(p)
501
- name_dtensors_small.append(n)
502
  else:
 
 
 
 
503
  param_dtensors.append(p)
504
  name_dtensors.append(n)
505
  elif isinstance(p.data, torch.Tensor):
 
 
506
  param_tensors.append(p)
507
  name_tensors.append(n)
508
  else:
509
  raise TypeError(f"Unsupported parameter type: {type(p.data)}")
510
 
511
- logger.debug(
512
- f"[Muon] {len(param_dtensors)} DTensors, {len(param_tensors)} Tensors, "
513
- f"{len(param_dtensors_small)} Small DTensors")
514
 
515
  def group_dtensors(dtensors, names):
516
  # To support different placements, we group parameters by placements
@@ -526,21 +894,6 @@ class Muon(torch.optim.Optimizer):
526
  p.device_mesh])][1].append(p)
527
  return placement_to_params
528
 
529
- if len(param_dtensors_small) > 0:
530
- if not dist.is_initialized():
531
- raise RuntimeError(
532
- "Parallel Muon requires torch.distributed to be initialized."
533
- )
534
-
535
- self.distributed_muon(
536
- params=param_dtensors_small,
537
- names=name_dtensors_small,
538
- group=group,
539
- lr=lr,
540
- weight_decay=weight_decay,
541
- qk_logits=qk_logits,
542
- )
543
-
544
  if len(param_dtensors) > 0:
545
  if not dist.is_initialized():
546
  raise RuntimeError(
@@ -548,7 +901,26 @@ class Muon(torch.optim.Optimizer):
548
  )
549
 
550
  dtensor_group = group_dtensors(param_dtensors, name_dtensors)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
551
  for _, (names, params) in dtensor_group.items():
 
 
552
  self.parallel(
553
  names,
554
  params,
@@ -556,7 +928,10 @@ class Muon(torch.optim.Optimizer):
556
  lr=lr,
557
  weight_decay=weight_decay,
558
  qk_logits=qk_logits,
 
559
  )
 
 
560
 
561
  if len(param_tensors) > 0:
562
  self.base(
@@ -568,6 +943,33 @@ class Muon(torch.optim.Optimizer):
568
  qk_logits=qk_logits,
569
  )
570
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
571
  @torch.no_grad
572
  def step(self, closure=None, qk_logits=None):
573
  """Perform a single optimization step.
@@ -585,10 +987,82 @@ class Muon(torch.optim.Optimizer):
585
  with torch.enable_grad():
586
  loss = closure()
587
 
588
- for group in self.param_groups:
 
 
 
 
 
 
 
589
  if group["use_muon"]:
 
 
590
  self._step_muon(group, qk_logits=qk_logits)
591
  else:
 
 
 
592
  step_adamw(self.state, group)
593
 
 
 
 
 
 
 
 
594
  return loss
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
  from .adamw import step_adamw
12
  from .async_utils import run_pipeline
13
+ from .core import (_muon_state, adjust_lr_for_muon, batch_pre_ortho,
14
+ get_default_muon_param_groups, is_expert_param, update_p)
15
+ from .cpu_offload import CPUOffloadPool
16
  from .distributed.utils import (_is_shard, construct_shard_mesh,
17
  get_slices_of_dtensor)
18
  from .newton_schulz import (COMM_DTYPE, DEFAULT_CHUNK_SIZE_RATIO,
19
+ _zeropower_via_newtonschulz5,
20
+ zeropower_via_newtonschulz5,
21
+ zeropower_via_newtonschulz5_batched)
22
+ from .pipeline import muon_chunk_pipeline, prelaunch_first_gather
23
  from .qk_clip import compute_scales, get_qk_clip_info, qk_clip
24
 
25
  logger = logging.getLogger(__name__)
 
48
  expanded_params = []
49
 
50
  for n, p in zip(names, params):
51
+ is_expert = is_expert_param(n, expert_keys)
52
  is_dtensor = isinstance(p.data, DTensor)
53
 
54
+ if is_expert:
55
+ if is_dtensor:
56
+ logger.debug(
57
+ "[expand_expert] %s: expert DTensor, shape=%s, "
58
+ "placements=%s, mesh=%s, local_shape=%s", n, p.shape,
59
+ p.placements, p.device_mesh.mesh_dim_names,
60
+ p.to_local().shape)
61
+ else:
62
+ logger.debug(
63
+ "[expand_expert] %s: expert plain tensor, shape=%s", n,
64
+ p.data.shape)
65
+
66
  if not is_expert:
67
  assert p.data.ndim <= 2, (
68
  f"Param {n} has ndim={p.data.ndim} but does not match "
 
183
  Use shard ranks * DEFAULT_CHUNK_SIZE_RATIO when -1 is specified.
184
  use_distributed_muon: Use distributed muon by Liu et al. (2024).
185
  For testing purpose only.
 
186
  expert_keys: List of strings to identify expert-parallel parameters.
187
  If any key appears in a parameter's name, its outermost
188
  dimension is treated as the expert dimension and expanded
 
207
  warmup_step=5,
208
  chunk_size=-1,
209
  use_distributed_muon=False,
210
+ expert_keys=None,
211
+ cpu_offload=False):
212
  defaults = dict(
213
  lr=lr,
214
  weight_decay=weight_decay,
 
242
  self.warmup_step = warmup_step
243
  self.chunk_size = chunk_size
244
  self.use_distributed_muon = use_distributed_muon
 
245
  self.expert_keys = expert_keys
246
+ self.cpu_offload = cpu_offload
247
+ self._cpu_offload_pool = CPUOffloadPool() if cpu_offload else None
248
+ self._offload_initialized = False
249
+ self._parallel_cache: dict[tuple[str, ...], dict] = {}
250
+ self._expert_expand_cache: dict[tuple[int, ...], dict] = {}
251
 
252
  def _calc_flops(self, G, steps):
253
  assert len(G.shape) == 2
 
351
  if g is None:
352
  continue
353
 
354
+ u = zeropower_via_newtonschulz5(g.to(COMM_DTYPE),
355
+ steps=group["ns_steps"])
356
 
357
  adjusted_lr = adjust_lr_for_muon(lr, p.shape)
358
  update_p(p, u, lr, adjusted_lr, weight_decay)
 
373
  weight_decay: float,
374
  qk_logits: list[torch.Tensor | DTensor] | None,
375
  ):
376
+ """Batched Distributed Muon for testing/correctness verification only.
377
 
378
+ Uses all-gather to reconstruct full tensors, computes Newton-Schulz on
379
+ the full grad, then slices back to local shards. This is simpler but
380
+ slower than the parallel pipeline (all2all) path, so it serves as a
381
+ reference implementation for verifying correctness.
382
+ """
383
+ with record_function("distributed_muon"):
384
+ # Momentum is already applied by _step_muon before this method.
385
+ ns_steps = group["ns_steps"]
 
 
 
 
 
 
 
 
 
 
 
386
 
387
+ # Separate plain tensors (no communication) from DTensors.
388
+ plain_names, plain_params = [], []
389
+ dtensor_names, dtensor_params = [], []
390
+ for n, p in zip(names, params):
391
+ if p.grad is None:
392
+ continue
393
+ if isinstance(p.data, DTensor):
394
+ dtensor_names.append(n)
395
+ dtensor_params.append(p)
396
+ else:
397
+ plain_names.append(n)
398
+ plain_params.append(p)
399
+
400
+ # Process plain tensors per-param (no communication).
401
+ for n, p in zip(plain_names, plain_params):
402
+ u = _zeropower_via_newtonschulz5(p.grad.to(COMM_DTYPE),
403
+ steps=ns_steps)
404
+ adjusted_lr = adjust_lr_for_muon(lr, p.shape)
405
+ update_p(p, u, lr, adjusted_lr, weight_decay)
406
+
407
+ qk_clip_state = get_qk_clip_info(self.clip_config, n,
408
+ qk_logits)
409
+ scales_full = compute_scales(
410
+ p, qk_clip_state) if qk_clip_state is not None else None
411
+ if scales_full is not None:
412
+ qk_clip(p, scales_full, qk_clip_state.head_dim)
413
+
414
+ if not dtensor_params:
415
+ return
416
+
417
+ # Group DTensors by (placements, mesh) for batched all-gather.
418
+ placement_groups: dict[tuple,
419
+ tuple[list,
420
+ list]] = defaultdict(lambda: ([], []))
421
+ for n, p in zip(dtensor_names, dtensor_params):
422
+ key = (p.placements, p.device_mesh)
423
+ placement_groups[key][0].append(n)
424
+ placement_groups[key][1].append(p)
425
+
426
+ logger.info(
427
+ "distributed_muon: %d placement groups, %d total dtensors",
428
+ len(placement_groups), len(dtensor_params))
429
+
430
+ for (placements, mesh), (grp_names,
431
+ grp_params) in placement_groups.items():
432
+ shard_mesh, shard_pg, shard_placements = construct_shard_mesh(
433
+ placements, mesh)
434
+ rank = dist.get_rank(shard_pg)
435
+ world_size = dist.get_world_size(shard_pg)
436
+
437
+ logger.info(" group: %d params, placements=%s, world_size=%d",
438
+ len(grp_params), placements, world_size)
439
+
440
+ # Separate params that can be batched (all shard dims evenly
441
+ # divisible) from those needing per-param full_tensor
442
+ # (e.g. MoE gate weights with fewer rows than shard ranks).
443
+ # all_gather_into_tensor requires equal buffer sizes across
444
+ # ranks, so uneven splits must use DTensor full_tensor().
445
+ batch_names, batch_params = [], []
446
+ single_names, single_params = [], []
447
+ for n, p in zip(grp_names, grp_params):
448
+ even = all(p.shape[pl.dim] %
449
+ shard_mesh.mesh.shape[dim_idx] == 0
450
+ for dim_idx, pl in enumerate(shard_placements))
451
+ if even:
452
+ batch_names.append(n)
453
+ batch_params.append(p)
454
+ else:
455
+ single_names.append(n)
456
+ single_params.append(p)
457
+
458
+ # Process uneven-split params per-param via full_tensor().
459
+ for n, p in zip(single_names, single_params):
460
+ with record_function("distributed_muon::newton_schulz"):
461
+ g_full = p.grad.full_tensor().to(COMM_DTYPE)
462
+ u_full = _zeropower_via_newtonschulz5(g_full,
463
+ steps=ns_steps)
464
+ del g_full
465
+ with record_function("distributed_muon::update"):
466
+ adjusted_lr = adjust_lr_for_muon(lr, p.shape)
467
+ p._local_tensor.mul_(1 - lr * weight_decay)
468
+ local_indices = get_slices_of_dtensor(
469
+ p, rank, shard_mesh, shard_placements)
470
+ u_local = u_full[local_indices]
471
+ p._local_tensor.add_(u_local, alpha=-adjusted_lr)
472
+ del u_full
473
+
474
+ qk_clip_state = get_qk_clip_info(
475
+ self.clip_config, n, qk_logits)
476
+ scales_full = compute_scales(
477
+ p, qk_clip_state
478
+ ) if qk_clip_state is not None else None
479
+ if scales_full is not None:
480
+ ratio = p.shape[0] // scales_full.shape[0]
481
+ idx0 = local_indices[0]
482
+ if isinstance(idx0, slice):
483
+ start = idx0.start or 0
484
+ idx0 = torch.arange(start,
485
+ idx0.stop,
486
+ device=scales_full.device)
487
+ row_scales = scales_full[idx0 // ratio]
488
+ p._local_tensor.mul_(row_scales.view(-1, 1))
489
+
490
+ if not batch_params:
491
+ continue
492
 
493
+ logger.info(" batched=%d, single=%d", len(batch_params),
494
+ len(single_params))
495
+
496
+ # Concat all local grad shards into a single flat buffer.
497
+ with record_function("distributed_muon::gather"):
498
+ grad_locals = [
499
+ p.grad.to_local().to(COMM_DTYPE).flatten()
500
+ for p in batch_params
501
+ ]
502
+ numels = [g.numel() for g in grad_locals]
503
+ grad_concat = torch.cat(grad_locals)
504
+ del grad_locals
505
+
506
+ # Single all-gather (replaces N separate full_tensor).
507
+ grad_gathered = torch.empty(
508
+ grad_concat.numel() * world_size,
509
+ dtype=COMM_DTYPE,
510
+ device="cuda",
511
+ )
512
+ dist.all_gather_into_tensor(grad_gathered,
513
+ grad_concat,
514
+ group=shard_pg)
515
+
516
+ total_numel = grad_concat.numel()
517
+ del grad_concat
518
+
519
+ # Precompute per-param offsets within the concat buffer.
520
+ offsets = []
521
+ off = 0
522
+ for ne in numels:
523
+ offsets.append(off)
524
+ off += ne
525
+
526
+ # Per-param: reconstruct full grad → NS → local update.
527
+ for i, (n, p) in enumerate(zip(batch_names, batch_params)):
528
+ with record_function("distributed_muon::newton_schulz"):
529
+ g_full = torch.empty(p.shape,
530
+ dtype=COMM_DTYPE,
531
+ device="cuda")
532
+ for r in range(world_size):
533
+ r_start = r * total_numel + offsets[i]
534
+ shard = grad_gathered[r_start:r_start + numels[i]]
535
+ indices = get_slices_of_dtensor(
536
+ p, r, shard_mesh, shard_placements)
537
+ g_full[indices] = shard.reshape(
538
+ g_full[indices].shape)
539
+
540
+ u_full = _zeropower_via_newtonschulz5(g_full,
541
+ steps=ns_steps)
542
+ del g_full
543
+
544
+ with record_function("distributed_muon::update"):
545
+ adjusted_lr = adjust_lr_for_muon(lr, p.shape)
546
+ p._local_tensor.mul_(1 - lr * weight_decay)
547
+ local_indices = get_slices_of_dtensor(
548
+ p, rank, shard_mesh, shard_placements)
549
+ u_local = u_full[local_indices]
550
+ p._local_tensor.add_(u_local, alpha=-adjusted_lr)
551
+ del u_full
552
+
553
+ qk_clip_state = get_qk_clip_info(
554
+ self.clip_config, n, qk_logits)
555
+ scales_full = compute_scales(
556
+ p, qk_clip_state
557
+ ) if qk_clip_state is not None else None
558
+ if scales_full is not None:
559
+ ratio = p.shape[0] // scales_full.shape[0]
560
+ idx0 = local_indices[0]
561
+ if isinstance(idx0, slice):
562
+ start = idx0.start or 0
563
+ idx0 = torch.arange(start,
564
+ idx0.stop,
565
+ device=scales_full.device)
566
+ row_scales = scales_full[idx0 // ratio]
567
+ p._local_tensor.mul_(row_scales.view(-1, 1))
568
+
569
+ def _setup_parallel(self, names, params, group, qk_logits):
570
+ """Compute (or retrieve cached) parallel pipeline metadata.
571
+
572
+ Returns:
573
+ (ordered_params, param_to_state, rank, chunk_size)
574
+ """
575
+ cache_key = tuple(names)
576
 
577
+ if cache_key not in self._parallel_cache:
578
+ # First call: compute metadata and populate cache.
579
+ param_to_state, ordered_params = self.init_state_and_assign_params(
580
+ names, params, group, qk_logits)
581
 
582
+ shard_pg = param_to_state[id(ordered_params[0])].process_group
583
+ rank = dist.get_rank(group=shard_pg)
 
 
 
 
 
584
 
585
+ if self.chunk_size == -1:
586
+ shard_ranks = dist.get_world_size(shard_pg)
587
+ chunk_size = shard_ranks * DEFAULT_CHUNK_SIZE_RATIO
588
+ elif self.chunk_size > 0:
589
+ chunk_size = self.chunk_size
590
+ else:
591
+ raise ValueError(
592
+ "chunk_size must be -1 or a positive integer.")
593
+
594
+ ordered_names = [
595
+ param_to_state[id(p)].name for p in ordered_params
596
+ ]
597
+ name_to_state = {
598
+ param_to_state[id(p)].name: param_to_state[id(p)]
599
+ for p in ordered_params
600
+ }
601
+ self._parallel_cache[cache_key] = {
602
+ 'ordered_names': ordered_names,
603
+ 'name_to_state': name_to_state,
604
+ 'rank': rank,
605
+ 'chunk_size': chunk_size,
606
+ }
607
+ else:
608
+ # Cached path: rebuild param_to_state with current id(p) keys.
609
+ cache = self._parallel_cache[cache_key]
610
+ rank = cache['rank']
611
+ chunk_size = cache['chunk_size']
612
+
613
+ name_to_param = dict(zip(names, params))
614
+ ordered_params = [name_to_param[n] for n in cache['ordered_names']]
615
+
616
+ param_to_state = {}
617
+ for p, n in zip(ordered_params, cache['ordered_names']):
618
+ cached_state = cache['name_to_state'][n]
619
+ param_to_state[id(p)] = _muon_state(
620
+ worker_rank=cached_state.worker_rank,
621
+ process_group=cached_state.process_group,
622
+ rank_indices=cached_state.rank_indices,
623
+ rank_numels=cached_state.rank_numels,
624
+ name=n,
625
+ qk_clip_state=get_qk_clip_info(self.clip_config, n,
626
+ qk_logits),
627
  )
628
 
629
+ return ordered_params, param_to_state, rank, chunk_size
630
 
631
+ def parallel(self,
632
+ names,
633
+ params,
634
+ group,
635
+ lr,
636
+ weight_decay,
637
+ qk_logits,
638
+ prelaunch_gather=None):
639
  """
640
  Perform a parallel optimization step using Muon.
641
 
 
644
  interleaves multiple chunks so that communication and computation
645
  overlap across chunks (the same overlap previously achieved by the
646
  warmup + main-loop index scheduling).
647
+
648
+ If ``prelaunch_gather`` is provided, it is passed to the first
649
+ chunk's generator to skip re-launching the already in-flight
650
+ A2A gather.
651
  """
652
 
653
  # Momentum is already applied by _step_muon before this method.
654
 
655
+ ordered_params, param_to_state, rank, chunk_size = (
656
+ self._setup_parallel(names, params, group, qk_logits))
 
 
 
 
 
 
 
 
 
 
 
 
 
657
 
658
  def pipelines():
659
+ first = True
660
  for start in range(0, len(ordered_params), chunk_size):
661
  chunk = ordered_params[start:start + chunk_size]
662
  if chunk:
663
+ kwargs = dict(
664
  params=chunk,
665
  param_to_state=param_to_state,
666
  rank=rank,
 
669
  weight_decay=weight_decay,
670
  none_grad=group["none_grad"],
671
  )
672
+ if first and prelaunch_gather is not None:
673
+ kwargs['prelaunch_gather'] = prelaunch_gather
674
+ first = False
675
+ yield muon_chunk_pipeline(**kwargs)
676
 
 
 
677
  with record_function("muon::pipeline"):
678
  run_pipeline(pipelines(), max_concurrent=self.warmup_step + 1)
679
 
 
685
  names = group["names"]
686
 
687
  # Apply momentum to all params before routing/expansion.
688
+ # Batched using _foreach_* ops (compiled, fullgraph=True).
689
  with record_function("muon::momentum"):
690
+ active_params = [p for p in params if p.grad is not None]
691
+ if active_params:
692
+ # Ensure momentum buffers exist (avoid zeros_like when already present).
693
+ for p in active_params:
694
+ if "momentum_buffer" not in self.state[p]:
695
+ self.state[p]["momentum_buffer"] = torch.zeros_like(
696
+ p.grad)
697
+
698
+ # Extract local tensors for compiled batch function.
699
+ local_grads = [
700
+ p.grad._local_tensor
701
+ if isinstance(p.grad, DTensor) else p.grad
702
+ for p in active_params
703
+ ]
704
+ local_bufs = [
705
+ self.state[p]["momentum_buffer"]._local_tensor
706
+ if isinstance(self.state[p]["momentum_buffer"], DTensor)
707
+ else self.state[p]["momentum_buffer"]
708
+ for p in active_params
709
+ ]
710
+
711
+ # Wrap momentum as tensor for torch.compile.
712
+ batch_pre_ortho(local_grads, local_bufs,
713
+ torch.tensor(momentum), group["nesterov"])
714
+
715
+ # For non-nesterov, the result is the momentum buffer.
716
+ if not group["nesterov"]:
717
+ for p in active_params:
718
+ p.grad = self.state[p]["momentum_buffer"]
719
+
720
+ # Identify batched experts for deferred NS.
721
+ # Detection is cheap (condition checks only); actual NS compute is
722
+ # deferred so it can overlap with the first chunk's A2A gather.
723
+ deferred_expert_work = []
724
+ if self.expert_keys:
725
+ batched_expert_indices = []
726
+ for i, (n, p) in enumerate(zip(names, params)):
727
+ if not (is_expert_param(n, self.expert_keys)
728
+ and p.grad is not None):
729
  continue
730
+ # Eligible: plain tensor, or DTensor with no non-dim-0 shards.
731
+ if isinstance(p.data, DTensor):
732
+ has_tp = any(
733
+ _is_shard(pl) and pl.dim != 0 for pl in p.placements)
734
+ if has_tp:
735
+ continue
736
+ batched_expert_indices.append(i)
737
+
738
+ if batched_expert_indices:
739
+ # Save refs for deferred NS; free grads from param list.
740
+ for i in batched_expert_indices:
741
+ p = params[i]
742
+ g = p.grad
743
+ local_g = (g._local_tensor
744
+ if isinstance(g, DTensor) else g)
745
+ local_data = (p.data._local_tensor if isinstance(
746
+ p.data, DTensor) else p.data)
747
+ deferred_expert_work.append((local_data, local_g))
748
+ p.grad = None
749
+
750
+ # Remove batched experts from lists before expansion.
751
+ keep = sorted(
752
+ set(range(len(params))) - set(batched_expert_indices))
753
+ names = [names[i] for i in keep]
754
+ params = [params[i] for i in keep]
755
+
756
+ def _run_deferred_expert_ns():
757
+ """Execute deferred batched expert NS."""
758
+ if not deferred_expert_work:
759
+ return
760
+ with record_function("muon::batched_expert_ns"):
761
+ ns_steps = group["ns_steps"]
762
+ for local_data, local_g in deferred_expert_work:
763
+ u = zeropower_via_newtonschulz5_batched(
764
+ local_g.to(COMM_DTYPE), steps=ns_steps)
765
+ adjusted_lr = adjust_lr_for_muon(lr, local_g.shape[1:])
766
+ local_data.mul_(1 - lr * weight_decay)
767
+ local_data.add_(u, alpha=-adjusted_lr)
768
 
769
  # Expand expert params by splitting on dim 0.
770
+ logger.debug("[_step_muon] before expand: %d params, expert_keys=%s",
771
+ len(params), self.expert_keys)
772
+ if self.expert_keys:
773
+ cache_key = tuple(id(p) for p in params)
774
+ cache = self._expert_expand_cache.get(cache_key)
775
+
776
+ if cache is None:
777
+ # Cold path: full expansion + build cache metadata.
778
+ exp_names, exp_params = _expand_expert_params(
779
+ names, params, self.expert_keys)
780
+
781
+ # Build per-expert-group info for hot-path grad updates.
782
+ grad_info = []
783
+ exp_idx = 0
784
+ for orig_idx, (n, p) in enumerate(zip(names, params)):
785
+ if not is_expert_param(n, self.expert_keys):
786
+ exp_idx += 1
787
+ continue
788
+
789
+ is_dt = isinstance(p.data, DTensor)
790
+ num_experts = (p.to_local() if is_dt else p.data).shape[0]
791
+
792
+ # Detect TP mesh from the first expanded expert param.
793
+ tp_mesh = None
794
+ tp_pls = None
795
+ sample = exp_params[exp_idx]
796
+ if isinstance(sample.data, DTensor):
797
+ tp_mesh = sample.data.device_mesh
798
+ tp_pls = list(sample.data.placements)
799
+
800
+ grad_info.append((orig_idx, num_experts, exp_idx, is_dt,
801
+ tp_mesh, tp_pls))
802
+ exp_idx += num_experts
803
+
804
+ self._expert_expand_cache[cache_key] = {
805
+ 'names': exp_names,
806
+ 'params': exp_params,
807
+ 'grad_info': grad_info,
808
+ }
809
+ names, params = exp_names, exp_params
810
+ else:
811
+ # Hot path: reuse cached params, only update expert grads.
812
+ for (orig_idx, num_experts, exp_start, is_dt, tp_mesh,
813
+ tp_pls) in cache['grad_info']:
814
+ p = params[orig_idx]
815
+ g = p.grad
816
+ local_grad = (g.to_local()
817
+ if is_dt and isinstance(g, DTensor) else g)
818
+ for i in range(num_experts):
819
+ expert_p = cache['params'][exp_start + i]
820
+ sg = local_grad[i]
821
+ if tp_mesh is not None:
822
+ expert_p.grad = DTensor.from_local(
823
+ sg, device_mesh=tp_mesh, placements=tp_pls)
824
+ else:
825
+ expert_p.grad = sg
826
+ p.grad = None
827
+
828
+ names = cache['names']
829
+ params = cache['params']
830
+ else:
831
+ names, params = _expand_expert_params(names, params,
832
+ self.expert_keys)
833
+ logger.debug("[_step_muon] after expand: %d params", len(params))
834
 
835
  param_dtensors = []
836
  name_dtensors = []
 
838
  param_tensors = []
839
  name_tensors = []
840
 
841
+ # distributed_muon is a reference implementation for testing only.
842
+ # The parallel pipeline (all2all) path below is the production path.
 
843
  if self.use_distributed_muon:
844
+ _run_deferred_expert_ns()
845
  self.distributed_muon(names=names,
846
  params=params,
847
  group=group,
 
850
  qk_logits=qk_logits)
851
  return
852
 
 
 
853
  for n, p in zip(names, params):
854
  if p is None or p.grad is None:
855
  continue
 
857
  if all(
858
  isinstance(placement, Replicate)
859
  for placement in p.placements):
860
+ logger.debug(
861
+ "[route] %s → base (DTensor all-Replicate), "
862
+ "shape=%s, placements=%s", n, p.shape, p.placements)
863
  param_tensors.append(p)
864
  name_tensors.append(n)
 
 
 
865
  else:
866
+ logger.debug(
867
+ "[route] %s → parallel (DTensor), shape=%s, "
868
+ "placements=%s, mesh=%s", n, p.shape, p.placements,
869
+ p.device_mesh.mesh_dim_names)
870
  param_dtensors.append(p)
871
  name_dtensors.append(n)
872
  elif isinstance(p.data, torch.Tensor):
873
+ logger.debug("[route] %s → base (plain tensor), shape=%s", n,
874
+ p.data.shape)
875
  param_tensors.append(p)
876
  name_tensors.append(n)
877
  else:
878
  raise TypeError(f"Unsupported parameter type: {type(p.data)}")
879
 
880
+ logger.debug(f"[Muon] {len(param_dtensors)} DTensors → parallel, "
881
+ f"{len(param_tensors)} Tensors → base")
 
882
 
883
  def group_dtensors(dtensors, names):
884
  # To support different placements, we group parameters by placements
 
894
  p.device_mesh])][1].append(p)
895
  return placement_to_params
896
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
897
  if len(param_dtensors) > 0:
898
  if not dist.is_initialized():
899
  raise RuntimeError(
 
901
  )
902
 
903
  dtensor_group = group_dtensors(param_dtensors, name_dtensors)
904
+
905
+ # Pre-launch the first chunk's A2A gather so that the NCCL
906
+ # communication overlaps with the (deferred) batched expert NS
907
+ # compute on the default CUDA stream.
908
+ prelaunch = None
909
+ if deferred_expert_work:
910
+ first_names, first_params = next(iter(dtensor_group.values()))
911
+ ordered, pts, rnk, csz = self._setup_parallel(
912
+ first_names, first_params, group, qk_logits)
913
+ first_chunk = ordered[:csz]
914
+ if first_chunk:
915
+ prelaunch = prelaunch_first_gather(first_chunk, pts, rnk,
916
+ group["none_grad"])
917
+
918
+ _run_deferred_expert_ns()
919
+
920
+ first_group = True
921
  for _, (names, params) in dtensor_group.items():
922
+ pg = prelaunch if first_group else None
923
+ first_group = False
924
  self.parallel(
925
  names,
926
  params,
 
928
  lr=lr,
929
  weight_decay=weight_decay,
930
  qk_logits=qk_logits,
931
+ prelaunch_gather=pg,
932
  )
933
+ else:
934
+ _run_deferred_expert_ns()
935
 
936
  if len(param_tensors) > 0:
937
  self.base(
 
943
  qk_logits=qk_logits,
944
  )
945
 
946
+ def _register_states_for_offload(self):
947
+ """Register all optimizer state tensors with the CPU offload pool.
948
+
949
+ Called once after the first step when states have been lazily created.
950
+ Offloads all param states (momentum buffers for Muon, moment1/moment2
951
+ for AdamW) to free GPU memory between steps.
952
+ """
953
+ pool = self._cpu_offload_pool
954
+ tracked = 0
955
+ for group in self.param_groups:
956
+ for p in group["params"]:
957
+ if p not in self.state:
958
+ continue
959
+ state = self.state[p]
960
+ if group.get("use_muon", False):
961
+ if "momentum_buffer" in state:
962
+ pool.track(state["momentum_buffer"])
963
+ tracked += 1
964
+ else:
965
+ if "moment1" in state:
966
+ pool.track(state["moment1"])
967
+ if "moment2" in state:
968
+ pool.track(state["moment2"])
969
+ tracked += 1
970
+ logger.info("[CPUOffload] Registered %d param states for offload",
971
+ tracked)
972
+
973
  @torch.no_grad
974
  def step(self, closure=None, qk_logits=None):
975
  """Perform a single optimization step.
 
987
  with torch.enable_grad():
988
  loss = closure()
989
 
990
+ # H2D: reload optimizer states from CPU before computation.
991
+ if self.cpu_offload and self._offload_initialized:
992
+ self._cpu_offload_pool.reload()
993
+
994
+ logger.debug("[Muon.step] expert_keys=%s, %d param groups",
995
+ self.expert_keys, len(self.param_groups))
996
+
997
+ for i, group in enumerate(self.param_groups):
998
  if group["use_muon"]:
999
+ logger.debug("[Muon.step] group %d: use_muon=True, %d params",
1000
+ i, len(group["params"]))
1001
  self._step_muon(group, qk_logits=qk_logits)
1002
  else:
1003
+ logger.debug(
1004
+ "[Muon.step] group %d: use_muon=False (AdamW), %d params",
1005
+ i, len(group["params"]))
1006
  step_adamw(self.state, group)
1007
 
1008
+ # D2H: offload optimizer states to CPU after computation.
1009
+ if self.cpu_offload:
1010
+ if not self._offload_initialized:
1011
+ self._register_states_for_offload()
1012
+ self._offload_initialized = True
1013
+ self._cpu_offload_pool.offload()
1014
+
1015
  return loss
1016
+
1017
+ # ------------------------------------------------------------------
1018
+ # Checkpoint support for cpu_offload
1019
+ # ------------------------------------------------------------------
1020
+
1021
+ def state_dict(self) -> dict:
1022
+ """Return optimizer state dict, reloading offloaded states first.
1023
+
1024
+ When ``cpu_offload=True``, optimizer state tensors have their GPU
1025
+ storage freed (``resize_(0)``) between steps. We reload them,
1026
+ snapshot the state dict, then re-offload so the optimizer stays
1027
+ in the expected post-step state. The returned dict holds cloned
1028
+ tensors so they remain valid after the re-offload frees the
1029
+ originals' GPU storage.
1030
+ """
1031
+ if self.cpu_offload and self._offload_initialized:
1032
+ self._cpu_offload_pool.reload()
1033
+ torch.cuda.current_stream().synchronize()
1034
+ sd = super().state_dict()
1035
+ if self.cpu_offload and self._offload_initialized:
1036
+ # Clone state tensors so the returned dict survives re-offload
1037
+ # (which frees GPU storage on the originals via resize_(0)).
1038
+ for k in sd["state"]:
1039
+ sd["state"][k] = {
1040
+ sk: sv.clone() if isinstance(sv, torch.Tensor) else sv
1041
+ for sk, sv in sd["state"][k].items()
1042
+ }
1043
+ self._cpu_offload_pool.offload()
1044
+ return sd
1045
+
1046
+ def load_state_dict(self, state_dict: dict) -> None:
1047
+ """Load optimizer state dict, then offload states if needed.
1048
+
1049
+ After ``super().load_state_dict()`` populates GPU tensors, we
1050
+ re-register them with the offload pool and offload to CPU so the
1051
+ optimizer is in the same post-step state (GPU storage freed).
1052
+ """
1053
+ # If states were offloaded, reload first so storage sizes are
1054
+ # correct for super().load_state_dict() to overwrite.
1055
+ if self.cpu_offload and self._offload_initialized:
1056
+ self._cpu_offload_pool.reload()
1057
+ torch.cuda.current_stream().synchronize()
1058
+
1059
+ super().load_state_dict(state_dict)
1060
+
1061
+ if self.cpu_offload:
1062
+ # Re-create the offload pool since state tensors may be new
1063
+ # objects after load_state_dict.
1064
+ self._cpu_offload_pool = CPUOffloadPool()
1065
+ self._offload_initialized = False
1066
+ self._register_states_for_offload()
1067
+ self._offload_initialized = True
1068
+ self._cpu_offload_pool.offload()
build/torch210-cxx11-cu130-x86_64-linux/newton_schulz.py CHANGED
@@ -1,3 +1,7 @@
 
 
 
 
1
  import torch
2
 
3
  from .matmul_transpose_triton import matmul_transpose_assign
@@ -6,21 +10,134 @@ COMM_DTYPE = torch.bfloat16
6
  DEFAULT_CHUNK_SIZE_RATIO = 4
7
 
8
 
9
- # This code snippet is a modified version adapted from the following GitHub repositories:
10
- # https://github.com/KellerJordan/Muon/blob/master/muon.py
11
- # Muon's Newton–Schulz iteration causes high variance in singular values
12
- # Idea: give each iteration its own 3 coefficients and optimize them via gradient descent.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  @torch.no_grad()
14
- # matmul_transpose_assign from : https://github.com/nil0x9/flash-muon
15
  def _zeropower_via_newtonschulz5(G, steps):
16
  """
17
- Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
18
- quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
19
- of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
20
- zero even beyond the point where the iteration no longer converges all the way to one everywhere
21
- on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
22
- where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
23
- performance at all relative to UV^T, where USV^T = G is the SVD.
 
 
 
 
 
 
 
24
  """
25
  assert len(G.shape) == 2
26
  assert G.dtype == COMM_DTYPE
@@ -28,18 +145,14 @@ def _zeropower_via_newtonschulz5(G, steps):
28
 
29
  if G.size(0) > G.size(1):
30
  X = X.T
31
- # Ensure spectral norm is at most 1
32
  X = X / (X.norm() + 1e-7)
 
 
33
  buf1 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
34
  buf2 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
35
  # Perform the NS iterations
36
- for a, b, c in [
37
- (4.0848, -6.8946, 2.9270),
38
- (3.9505, -6.3029, 2.6377),
39
- (3.7418, -5.5913, 2.3037),
40
- (2.8769, -3.1427, 1.2046),
41
- (2.8366, -3.0525, 1.2012),
42
- ]:
43
  matmul_transpose_assign(X, buf1)
44
  matmul_transpose_assign(buf1, buf2)
45
  buf1.mul_(b).add_(buf2, alpha=c)
@@ -47,4 +160,77 @@ def _zeropower_via_newtonschulz5(G, steps):
47
 
48
  if G.size(0) > G.size(1):
49
  X = X.T
 
50
  return X
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from itertools import repeat
2
+ from math import inf, sqrt
3
+
4
+ import numpy as np
5
  import torch
6
 
7
  from .matmul_transpose_triton import matmul_transpose_assign
 
10
  DEFAULT_CHUNK_SIZE_RATIO = 4
11
 
12
 
13
+ def _optimal_quintic(l, u, max_iter=1000):
14
+ """
15
+ Use the simplified Remez algorithm to find the optimal odd quintic approximant
16
+ to the constant function x -> 1 over the interval [l, u].
17
+
18
+ Returns (a, b, c) for p(x) = ax + bx^3 + cx^5 that minimizes the maximum
19
+ approximation error max_{x in [l,u]} |p(x) - 1|. Iterates by updating the
20
+ two interior equioscillation nodes q, r until convergence. Returns the
21
+ closed-form equioscillating solution when l ≈ u.
22
+
23
+ Raises ValueError if any intermediate value (a, b, c, E, q, r) is non-finite
24
+ (NaN or inf). Raises RuntimeError if convergence is not reached within
25
+ max_iter iterations.
26
+ """
27
+ assert 0 <= l <= u
28
+ if 1 - 5e-6 <= l / u:
29
+ return (15 / 8) / u, (-10 / 8) / (u**3), (3 / 8) / (u**5)
30
+ q = (3 * l + u) / 4
31
+ r = (l + 3 * u) / 4
32
+ E = inf
33
+ for _ in range(max_iter):
34
+ old_E = E
35
+ LHS = np.array([
36
+ [l, l**3, l**5, 1],
37
+ [q, q**3, q**5, -1],
38
+ [r, r**3, r**5, 1],
39
+ [u, u**3, u**5, -1],
40
+ ])
41
+ a, b, c, E = np.linalg.solve(LHS, np.ones(4))
42
+ if not np.all(np.isfinite([a, b, c, E])):
43
+ raise ValueError(f"_optimal_quintic: non-finite solve result "
44
+ f"a={a}, b={b}, c={c}, E={E}")
45
+ q, r = np.sqrt(
46
+ (-3 * b + np.array([-1, 1]) * sqrt(9 * b**2 - 20 * a * c)) /
47
+ (10 * c))
48
+ if not np.all(np.isfinite([q, r])):
49
+ raise ValueError(
50
+ f"_optimal_quintic: non-finite node update q={q}, r={r}")
51
+ if abs(old_E - E) <= 1e-15:
52
+ break
53
+ else:
54
+ raise RuntimeError(
55
+ f"_optimal_quintic: did not converge after {max_iter} iterations")
56
+ return float(a), float(b), float(c)
57
+
58
+
59
def _optimal_composition(l, num_iters, safety_factor_eps=0, cushion=0):
    """
    Compute the Polar Express coefficient series for `num_iters` quintic iterations.

    Builds a sequence of per-step optimal odd quintic coefficients (a, b, c) that
    compose to map singular values from [l, 1] toward 1. At each step:
    1. Solves `_optimal_quintic` on [max(l, cushion*u), u]. The `cushion`
       prevents near-zero singular values from stalling by raising the effective
       lower bound; if it is active (cushion*u > l), the coefficients are
       rescaled so that p(l) and p(u) are centered around 1 w.r.t. the true [l, u].
    2. Deflates the coefficients by (1 + safety_factor_eps)^degree for all but the
       last iteration, providing numerical headroom at the cost of a slightly slower
       final convergence step.
    3. Advances the interval: l <- p(l), u <- 2 - p(l) (by symmetry of p around 1).

    Returns a list of (a, b, c) tuples, one per iteration.

    Reference: Amsel et al., "The Polar Express: Optimal Matrix Sign Methods and
    Their Application to the Muon Algorithm", https://arxiv.org/abs/2505.16932
    """
    u = 1
    assert 0 <= l <= u
    safety_factor = 1 + safety_factor_eps
    coefficients = []
    # Loop variable renamed from `iter` — shadowing the builtin made any
    # use of iter() inside this scope a latent bug.
    for step_idx in range(num_iters):
        a, b, c = _optimal_quintic(max(l, cushion * u), u)
        if cushion * u > l:
            # Cushion active: re-center so p(l) + p(u) == 2 on the true
            # interval [l, u].
            pl = a * l + b * l**3 + c * l**5
            pu = a * u + b * u**3 + c * u**5
            rescaler = 2 / (pl + pu)
            a *= rescaler
            b *= rescaler
            c *= rescaler
        if step_idx < num_iters - 1:
            # Deflate by safety_factor^degree for numerical headroom.
            a /= safety_factor
            b /= safety_factor**3
            c /= safety_factor**5
        coefficients.append((a, b, c))
        l = a * l + b * l**3 + c * l**5
        u = 2 - l
    return coefficients
100
+
101
+
102
+ # Precomputed Polar Express coefficients (a, b, c) for 10 quintic Newton-Schulz
103
+ # iterations. Each tuple is the minimax-optimal (Remez/equioscillation) odd quintic
104
+ # approximant to x->1 over the current singular-value interval, computed once at
105
+ # import time and reused across all optimizer steps.
106
+ #
107
+ # Contrast with the former hardcoded NS coefficients (5 fixed tuples):
108
+ # - Former: empirically tuned to maximize slope at zero; did not converge
109
+ # singular values to 1, yielding US'V^T with S' ~ Uniform(0.5, 1.5) instead
110
+ # of the true polar factor UV^T.
111
+ # - Polar Express: analytically optimal per step, adapting to the shrinking
112
+ # singular-value interval [l, u] as iterations progress; converges all
113
+ # singular values to 1, producing the exact polar factor UV^T.
114
+ _coeffs_list = _optimal_composition(l=1e-3,
115
+ num_iters=10,
116
+ safety_factor_eps=1e-2,
117
+ cushion=0.02)
118
+
119
+
120
+ # This code is adapted from:
121
+ # KellerJordan/Muon (https://github.com/KellerJordan/Muon/blob/master/muon.py)
122
+ # NoahAmsel/PolarExpress (https://github.com/NoahAmsel/PolarExpress)
123
+ # matmul_transpose_assign kernel from nil0x9/flash-muon (https://github.com/nil0x9/flash-muon)
124
  @torch.no_grad()
 
125
  def _zeropower_via_newtonschulz5(G, steps):
126
  """
127
+ Compute the polar factor of G via the Polar Express method.
128
+
129
+ Applies `steps` quintic iterations X <- aX + bX^3 + cX^5, where (a, b, c)
130
+ are the Polar Express coefficients from `_coeffs_list`. Each step is the
131
+ optimal odd quintic approximant to x -> 1 over the current singular-value
132
+ interval, minimizing the maximum approximation error (Remez / minimax criterion).
133
+ The composition maps singular values from [l, 1] to near 1, producing the
134
+ polar factor (orthogonal factor in the polar decomposition G = UP).
135
+
136
+ `_coeffs_list` is precomputed for 10 iterations (l=1e-3, safety_factor_eps=1e-2,
137
+ cushion=0.02). If `steps` exceeds 10, the final coefficient set is repeated.
138
+
139
+ Reference: Amsel et al., "The Polar Express: Optimal Matrix Sign Methods and
140
+ Their Application to the Muon Algorithm", https://arxiv.org/abs/2505.16932
141
  """
142
  assert len(G.shape) == 2
143
  assert G.dtype == COMM_DTYPE
 
145
 
146
  if G.size(0) > G.size(1):
147
  X = X.T
148
+
149
  X = X / (X.norm() + 1e-7)
150
+ hs = _coeffs_list[:steps] + list(
151
+ repeat(_coeffs_list[-1], steps - len(_coeffs_list)))
152
  buf1 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
153
  buf2 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
154
  # Perform the NS iterations
155
+ for a, b, c in hs:
 
 
 
 
 
 
156
  matmul_transpose_assign(X, buf1)
157
  matmul_transpose_assign(buf1, buf2)
158
  buf1.mul_(b).add_(buf2, alpha=c)
 
160
 
161
  if G.size(0) > G.size(1):
162
  X = X.T
163
+
164
  return X
165
+
166
+
167
@torch.no_grad()
def _zeropower_via_newtonschulz5_batched(G, steps):
    """Batched polar-factor computation for a stack of matrices (E, out, in).

    Mirrors ``_zeropower_via_newtonschulz5`` but replaces the 2D Triton
    kernel with ``torch.bmm`` / ``torch.baddbmm`` so all E expert matrices
    are processed by a single batched call per operation.
    """
    assert len(G.shape) == 3
    assert G.dtype == COMM_DTYPE

    # Work on the wide orientation; restore at the end.
    tall = G.size(1) > G.size(2)
    X = G.transpose(-2, -1) if tall else G

    # Normalize each expert matrix by its own Frobenius norm.
    X = X / (X.norm(dim=(-2, -1), keepdim=True) + 1e-7)

    # Coefficient schedule: precomputed Polar Express tuples, repeating the
    # final tuple when more steps are requested than were precomputed.
    last = len(_coeffs_list) - 1
    schedule = [_coeffs_list[min(i, last)] for i in range(steps)]
    for a, b, c in schedule:
        # gram = X X^T; gram_sq = (X X^T)^2 (gram is symmetric).
        gram = torch.bmm(X, X.transpose(-2, -1))
        gram_sq = torch.bmm(gram, gram.transpose(-2, -1))
        gram.mul_(b).add_(gram_sq, alpha=c)
        # X <- a*X + (b*gram + c*gram_sq) @ X  ==  aX + bX^3 + cX^5.
        X = torch.baddbmm(X, gram, X, alpha=1.0, beta=a)

    return X.transpose(-2, -1) if tall else X
197
+
198
+
199
+ _ns_per_shape: dict[tuple[int, ...], callable] = {}
200
+ _use_compile = True
201
+
202
+
203
def set_ns_compile(enabled: bool):
    """Globally enable or disable torch.compile wrapping of the NS iteration."""
    global _use_compile
    _use_compile = enabled
207
+
208
+
209
def zeropower_via_newtonschulz5(G, steps=5):
    """Compile-cached Newton-Schulz polar-factor computation for 2D tensors.

    Falls back to the eager ``_zeropower_via_newtonschulz5`` when
    compilation has been disabled via ``set_ns_compile(False)``; otherwise
    compiles the eager implementation once per input shape and reuses the
    cached compiled callable across steps.

    Args:
        G: 2D gradient tensor (expected in ``COMM_DTYPE``, enforced by the
           wrapped implementation).
        steps: Number of quintic NS iterations to apply.

    Returns:
        Tensor of the same shape as ``G`` holding its approximate polar factor.
    """
    if not _use_compile:
        return _zeropower_via_newtonschulz5(G, steps)
    # One compiled artifact per shape — presumably because cudagraph capture
    # is shape-specialized ("triton.cudagraphs": True below); verify against
    # torch.compile docs if this cache is ever keyed differently.
    key = G.shape
    if key not in _ns_per_shape:
        _ns_per_shape[key] = torch.compile(_zeropower_via_newtonschulz5,
                                           options={
                                               "triton.cudagraphs": True,
                                               "shape_padding": False
                                           })
    # Mark a cudagraph step boundary, and .clone() the result so the caller
    # holds memory that is not owned/reused by the captured graph.
    torch.compiler.cudagraph_mark_step_begin()
    return _ns_per_shape[key](G, steps).clone()
221
+
222
+
223
def zeropower_via_newtonschulz5_batched(G, steps=5):
    """Compile-cached batched Newton-Schulz for 3D expert tensors."""
    if not _use_compile:
        return _zeropower_via_newtonschulz5_batched(G, steps)
    shape_key = G.shape
    # Cache one compiled callable per input shape.
    compiled = _ns_per_shape.get(shape_key)
    if compiled is None:
        compiled = torch.compile(_zeropower_via_newtonschulz5_batched,
                                 options={
                                     "triton.cudagraphs": True,
                                     "shape_padding": False
                                 })
        _ns_per_shape[shape_key] = compiled
    torch.compiler.cudagraph_mark_step_begin()
    return compiled(G, steps).clone()
build/torch210-cxx11-cu130-x86_64-linux/pipeline.py CHANGED
@@ -6,8 +6,8 @@ import torch.distributed as dist
6
  from torch.distributed.tensor import DTensor
7
  from torch.profiler import record_function
8
 
9
- from .core import _muon_state, adjust_lr_for_muon, update_p
10
- from .newton_schulz import COMM_DTYPE, _zeropower_via_newtonschulz5
11
  from .qk_clip import compute_scales
12
 
13
  logger = logging.getLogger(__name__)
@@ -45,26 +45,33 @@ def _launch_gather(
45
  else:
46
  gathered_grads[id(p)] = None
47
 
48
- # Build send buffer
49
- per_dst: list[list[torch.Tensor]] = [[] for _ in range(num_ranks)]
50
  send_counts = [0] * num_ranks
51
-
52
  for p in params:
53
  state = param_to_state[id(p)]
54
- dst = state.worker_rank
55
- assert dst < num_ranks
56
- shard_elems = state.rank_numels[rank]
57
- g = p.grad
58
- g = g.to_local().to(COMM_DTYPE).contiguous()
59
- assert g.numel() == shard_elems
60
- per_dst[dst].append(g.view(-1))
61
- send_counts[dst] += shard_elems
62
-
63
- assert any(
64
- len(v) > 0 for v in
65
- per_dst), "At least one destination rank must receive a sharded tensor"
66
- per_dst_flat = [t for dst in per_dst for t in dst]
67
- send_buf = torch.cat(per_dst_flat, dim=0)
 
 
 
 
 
 
 
 
68
 
69
  # Build recv buffer
70
  recv_counts = [0] * num_ranks
@@ -120,7 +127,8 @@ def _complete_gather(
120
 
121
  shard_view = gathered_grads[id(p)][indices]
122
  n = shard_view.numel()
123
- assert n > 0
 
124
 
125
  sg = recv_buf.narrow(0, off + inner_off, n)
126
  sg = sg.reshape(shard_view.shape)
@@ -143,7 +151,7 @@ def _compute_ns(
143
  """
144
  computed_us: dict[int, torch.Tensor | None] = {}
145
  for p in owned_params:
146
- u = _zeropower_via_newtonschulz5(gathered_grads[id(p)], ns_steps)
147
  gathered_grads[id(p)] = None # free gathered grad
148
  computed_us[id(p)] = u
149
  return computed_us
@@ -163,46 +171,47 @@ def _launch_scatter(
163
  Returns:
164
  work: Async operation handle.
165
  recv_buf: Flat receive buffer (needed by ``_complete_scatter``).
166
- scattered_us: ``{id(p): empty_local_tensor}`` for all params.
 
167
  recv_counts: Per-source-rank element counts.
168
  """
169
- # Allocate scattered-u buffers
 
 
 
170
  scattered_us: dict[int, torch.Tensor] = {}
171
  for p in params:
172
- scattered_us[id(p)] = torch.empty_like(p.to_local(), dtype=COMM_DTYPE)
 
 
173
 
174
- # Build send buffer (from computed_us on owner ranks)
175
- per_dst: list[list[torch.Tensor]] = [[] for _ in range(num_ranks)]
176
  send_counts = [0] * num_ranks
177
-
178
  if owned_params:
179
  for p in owned_params:
180
  state = param_to_state[id(p)]
181
-
182
- assert computed_us[id(p)] is not None
183
- u_full = computed_us[id(p)].to(COMM_DTYPE).contiguous()
184
-
185
- total_sent = 0
186
  for dst_rank in range(num_ranks):
187
- indices = state.rank_indices[dst_rank]
188
- su = u_full[indices].flatten()
189
-
190
- n = su.numel()
191
- assert n > 0
192
 
193
- per_dst[dst_rank].append(su)
194
- send_counts[dst_rank] += n
195
- total_sent += n
196
-
197
- assert total_sent == u_full.numel()
198
-
199
- lengths = [len(v) for v in per_dst]
200
- if all(l > 0 for l in lengths):
201
- assert all(
202
- l == lengths[0] for l in lengths
203
- ), "All destination ranks must have the same number of sharded tensor"
204
- per_dst_flat = [t for dst in per_dst for t in dst]
205
- send_buf = torch.cat(per_dst_flat, dim=0)
 
 
 
 
 
206
  else:
207
  send_buf = torch.empty(0, dtype=COMM_DTYPE, device="cuda")
208
 
@@ -218,7 +227,6 @@ def _launch_scatter(
218
  recv_counts[src] = total
219
 
220
  recv_total = sum(recv_counts)
221
- assert recv_total > 0
222
  recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")
223
 
224
  # Launch async all-to-all
@@ -242,7 +250,13 @@ def _complete_scatter(
242
  rank: int,
243
  scattered_us: dict[int, torch.Tensor],
244
  ) -> None:
245
- """Copy recv buffer into scattered_us (in-place)."""
 
 
 
 
 
 
246
  off = 0
247
  for src in range(len(recv_counts)):
248
  block = recv_counts[src]
@@ -255,11 +269,11 @@ def _complete_scatter(
255
  if state.worker_rank != src:
256
  continue
257
  n = state.rank_numels[rank]
258
- assert n > 0
 
259
 
260
- flat_local = recv_buf.narrow(0, off + inner_off,
261
- n).view_as(p.to_local())
262
- scattered_us[id(p)].copy_(flat_local)
263
 
264
  inner_off += n
265
 
@@ -275,23 +289,40 @@ def _update_params(
275
  lr: float,
276
  weight_decay: float,
277
  ) -> None:
278
- """Apply weight decay, Muon update, and optional QK clipping."""
279
- for p in params:
280
- state = param_to_state[id(p)]
281
- u_dtensor = DTensor.from_local(
282
- scattered_us[id(p)],
283
- placements=p.placements,
284
- device_mesh=p.device_mesh,
285
- )
286
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
287
  adjusted_lr = adjust_lr_for_muon(lr, p.shape)
288
- update_p(p, u_dtensor, lr, adjusted_lr, weight_decay)
 
 
 
289
 
290
- # QK clipping applied directly on the local tensor to
291
- # avoid DTensor sharding-propagation issues with _StridedShard.
292
- scales_full = compute_scales(
293
- p,
294
- state.qk_clip_state) if state.qk_clip_state is not None else None
 
 
 
 
 
295
  if scales_full is not None:
296
  ratio = p.shape[0] // scales_full.shape[0]
297
  idx0 = state.rank_indices[rank][0]
@@ -304,6 +335,45 @@ def _update_params(
304
  p._local_tensor.mul_(row_scales.view(-1, 1))
305
 
306
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
307
  # ======================================================================
308
  # Main generator – thin orchestrator that wires stages together.
309
  # ======================================================================
@@ -318,6 +388,7 @@ def muon_chunk_pipeline(
318
  lr: float,
319
  weight_decay: float,
320
  none_grad: bool,
 
321
  ) -> Generator[None, None, None]:
322
  """Process one chunk of parameters through the full Muon pipeline.
323
 
@@ -334,9 +405,12 @@ def muon_chunk_pipeline(
334
  runs concurrently on the NCCL stream — no separate ``comm_stream``
335
  is required.
336
 
 
 
 
337
  Yields exactly **2** times:
338
 
339
- 1. After launching async all-to-all gather.
340
  2. After launching async all-to-all scatter.
341
  """
342
  process_group = param_to_state[id(params[0])].process_group
@@ -345,15 +419,19 @@ def muon_chunk_pipeline(
345
  p for p in params if param_to_state[id(p)].worker_rank == rank
346
  ]
347
 
348
- # Stages 1-2: launch async gather.
349
- with record_function("muon::launch_gather"):
350
- work, recv_buf, gathered_grads, recv_counts = _launch_gather(
351
- params, owned_params, param_to_state, rank, num_ranks,
352
- process_group)
353
-
354
- if none_grad:
355
- for p in params:
356
- p.grad = None
 
 
 
 
357
 
358
  yield # --- YIELD 1: other chunks can launch their gather ---
359
 
 
6
  from torch.distributed.tensor import DTensor
7
  from torch.profiler import record_function
8
 
9
+ from .core import _muon_state, adjust_lr_for_muon
10
+ from .newton_schulz import COMM_DTYPE, zeropower_via_newtonschulz5
11
  from .qk_clip import compute_scales
12
 
13
  logger = logging.getLogger(__name__)
 
45
  else:
46
  gathered_grads[id(p)] = None
47
 
48
+ # Build send buffer – batch grad copies via torch.cat
49
+ # (1-2 fused kernels vs N individual narrow().copy_() calls).
50
  send_counts = [0] * num_ranks
 
51
  for p in params:
52
  state = param_to_state[id(p)]
53
+ send_counts[state.worker_rank] += state.rank_numels[rank]
54
+
55
+ total_send = sum(send_counts)
56
+ if total_send > 0:
57
+ # Group grad slices by destination rank in a single pass.
58
+ dst_to_grads = [[] for _ in range(num_ranks)]
59
+ for p in params:
60
+ state = param_to_state[id(p)]
61
+ n = state.rank_numels[rank]
62
+ if n > 0:
63
+ g = p.grad.to_local()
64
+ dst_to_grads[state.worker_rank].append(g.reshape(-1))
65
+
66
+ # Flatten in dst order and cat once.
67
+ all_slices = []
68
+ for dst in range(num_ranks):
69
+ all_slices.extend(dst_to_grads[dst])
70
+ send_buf = torch.cat(all_slices)
71
+ if send_buf.dtype != COMM_DTYPE:
72
+ send_buf = send_buf.to(COMM_DTYPE)
73
+ else:
74
+ send_buf = torch.empty(0, dtype=COMM_DTYPE, device="cuda")
75
 
76
  # Build recv buffer
77
  recv_counts = [0] * num_ranks
 
127
 
128
  shard_view = gathered_grads[id(p)][indices]
129
  n = shard_view.numel()
130
+ if n == 0:
131
+ continue
132
 
133
  sg = recv_buf.narrow(0, off + inner_off, n)
134
  sg = sg.reshape(shard_view.shape)
 
151
  """
152
  computed_us: dict[int, torch.Tensor | None] = {}
153
  for p in owned_params:
154
+ u = zeropower_via_newtonschulz5(gathered_grads[id(p)], ns_steps)
155
  gathered_grads[id(p)] = None # free gathered grad
156
  computed_us[id(p)] = u
157
  return computed_us
 
171
  Returns:
172
  work: Async operation handle.
173
  recv_buf: Flat receive buffer (needed by ``_complete_scatter``).
174
+ scattered_us: Empty dict, populated by ``_complete_scatter`` with
175
+ zero-copy views into ``recv_buf``.
176
  recv_counts: Per-source-rank element counts.
177
  """
178
+ # scattered_us is populated by _complete_scatter with zero-copy views
179
+ # into recv_buf, avoiding N empty_like allocations + N copy_ calls.
180
+ # Pre-seed entries for params whose local shard is empty (rank_numels == 0)
181
+ # so _update_params can iterate all params without KeyError.
182
  scattered_us: dict[int, torch.Tensor] = {}
183
  for p in params:
184
+ if param_to_state[id(p)].rank_numels[rank] == 0:
185
+ scattered_us[id(p)] = torch.empty_like(p.to_local(),
186
+ dtype=COMM_DTYPE)
187
 
188
+ # Build send buffer batch via torch.cat
189
+ # (1 fused kernel vs N*num_ranks individual narrow().copy_() calls).
190
  send_counts = [0] * num_ranks
 
191
  if owned_params:
192
  for p in owned_params:
193
  state = param_to_state[id(p)]
 
 
 
 
 
194
  for dst_rank in range(num_ranks):
195
+ send_counts[dst_rank] += state.rank_numels[dst_rank]
 
 
 
 
196
 
197
+ total_send = sum(send_counts)
198
+ if total_send > 0:
199
+ # Cache u_full conversions to avoid redundant .to() per dst_rank.
200
+ u_fulls = {}
201
+ for p in owned_params:
202
+ u_fulls[id(p)] = computed_us[id(p)].to(COMM_DTYPE).contiguous()
203
+
204
+ # Collect slices in dst order (matches all-to-all send layout).
205
+ all_slices = []
206
+ for dst_rank in range(num_ranks):
207
+ for p in owned_params:
208
+ state = param_to_state[id(p)]
209
+ su = u_fulls[id(p)][state.rank_indices[dst_rank]].flatten()
210
+ if su.numel() > 0:
211
+ all_slices.append(su)
212
+
213
+ send_buf = torch.cat(all_slices) if all_slices else torch.empty(
214
+ 0, dtype=COMM_DTYPE, device="cuda")
215
  else:
216
  send_buf = torch.empty(0, dtype=COMM_DTYPE, device="cuda")
217
 
 
227
  recv_counts[src] = total
228
 
229
  recv_total = sum(recv_counts)
 
230
  recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")
231
 
232
  # Launch async all-to-all
 
250
  rank: int,
251
  scattered_us: dict[int, torch.Tensor],
252
  ) -> None:
253
+ """Populate scattered_us with zero-copy views into recv_buf.
254
+
255
+ Instead of pre-allocating tensors and copying, we assign views directly
256
+ from ``recv_buf``. This eliminates N ``empty_like`` + N ``copy_`` calls.
257
+ The underlying storage of ``recv_buf`` is kept alive through the views
258
+ until ``scattered_us`` is cleared after ``_update_params``.
259
+ """
260
  off = 0
261
  for src in range(len(recv_counts)):
262
  block = recv_counts[src]
 
269
  if state.worker_rank != src:
270
  continue
271
  n = state.rank_numels[rank]
272
+ if n == 0:
273
+ continue
274
 
275
+ scattered_us[id(p)] = recv_buf.narrow(0, off + inner_off,
276
+ n).view_as(p.to_local())
 
277
 
278
  inner_off += n
279
 
 
289
  lr: float,
290
  weight_decay: float,
291
  ) -> None:
292
+ """Apply weight decay, Muon update, and optional QK clipping.
 
 
 
 
 
 
 
293
 
294
+ Uses batched ``_foreach_mul_`` for weight decay and batched
295
+ ``_foreach_add_`` for the Muon update, grouping parameters by
296
+ adjusted_lr to minimize kernel launches while preserving float32
297
+ precision for the alpha scaling.
298
+ """
299
+ if not params:
300
+ return
301
+
302
+ # Batched weight decay: p *= (1 - lr * wd) — single fused kernel.
303
+ p_locals = [p._local_tensor for p in params]
304
+ torch._foreach_mul_(p_locals, 1.0 - lr * weight_decay)
305
+
306
+ # Group params by adjusted_lr so _foreach_add_ can use a single
307
+ # alpha per group (preserves float32 precision for alpha scaling).
308
+ lr_groups: dict[float, tuple[list, list]] = {}
309
+ for p in params:
310
  adjusted_lr = adjust_lr_for_muon(lr, p.shape)
311
+ if adjusted_lr not in lr_groups:
312
+ lr_groups[adjusted_lr] = ([], [])
313
+ lr_groups[adjusted_lr][0].append(p._local_tensor)
314
+ lr_groups[adjusted_lr][1].append(scattered_us[id(p)])
315
 
316
+ for adjusted_lr, (p_group, u_group) in lr_groups.items():
317
+ torch._foreach_add_(p_group, u_group, alpha=-adjusted_lr)
318
+
319
+ # QK clipping – applied directly on the local tensor to
320
+ # avoid DTensor sharding-propagation issues with _StridedShard.
321
+ for p in params:
322
+ state = param_to_state[id(p)]
323
+ if state.qk_clip_state is None:
324
+ continue
325
+ scales_full = compute_scales(p, state.qk_clip_state)
326
  if scales_full is not None:
327
  ratio = p.shape[0] // scales_full.shape[0]
328
  idx0 = state.rank_indices[rank][0]
 
335
  p._local_tensor.mul_(row_scales.view(-1, 1))
336
 
337
 
338
+ # ======================================================================
339
+ # Pre-launch helper for overlapping first chunk's gather with other work.
340
+ # ======================================================================
341
+
342
+
343
@torch.no_grad()
def prelaunch_first_gather(
    params: list[DTensor],
    param_to_state: dict[int, _muon_state],
    rank: int,
    none_grad: bool,
) -> tuple[dist.Work, torch.Tensor, dict[int, torch.Tensor | None], list[int]]:
    """Launch the first chunk's A2A gather early for overlap with other compute.

    Call this *before* expensive GPU work (e.g. batched expert NS) so that
    the NCCL all-to-all runs concurrently on the NCCL stream while the
    default stream executes compute.

    Returns the same 4-tuple that ``_launch_gather`` produces, which should
    be passed as ``prelaunch_gather`` to :func:`muon_chunk_pipeline`.
    """
    pg = param_to_state[id(params[0])].process_group
    world_size = dist.get_world_size(group=pg)

    # Params whose NS computation this rank owns.
    owners = []
    for p in params:
        if param_to_state[id(p)].worker_rank == rank:
            owners.append(p)

    with record_function("muon::prelaunch_gather"):
        handle, recv_buf, gathered_grads, recv_counts = _launch_gather(
            params, owners, param_to_state, rank, world_size, pg)

    if none_grad:
        for p in params:
            p.grad = None

    return handle, recv_buf, gathered_grads, recv_counts
375
+
376
+
377
  # ======================================================================
378
  # Main generator – thin orchestrator that wires stages together.
379
  # ======================================================================
 
388
  lr: float,
389
  weight_decay: float,
390
  none_grad: bool,
391
+ prelaunch_gather: tuple | None = None,
392
  ) -> Generator[None, None, None]:
393
  """Process one chunk of parameters through the full Muon pipeline.
394
 
 
405
  runs concurrently on the NCCL stream — no separate ``comm_stream``
406
  is required.
407
 
408
+ If ``prelaunch_gather`` is provided, the gather was already launched
409
+ by :func:`prelaunch_first_gather` and we skip launching it again.
410
+
411
  Yields exactly **2** times:
412
 
413
+ 1. After launching async all-to-all gather (or immediately if pre-launched).
414
  2. After launching async all-to-all scatter.
415
  """
416
  process_group = param_to_state[id(params[0])].process_group
 
419
  p for p in params if param_to_state[id(p)].worker_rank == rank
420
  ]
421
 
422
+ if prelaunch_gather is not None:
423
+ # Gather was pre-launched; none_grad already handled by caller.
424
+ work, recv_buf, gathered_grads, recv_counts = prelaunch_gather
425
+ else:
426
+ # Normal path: launch async gather.
427
+ with record_function("muon::launch_gather"):
428
+ work, recv_buf, gathered_grads, recv_counts = _launch_gather(
429
+ params, owned_params, param_to_state, rank, num_ranks,
430
+ process_group)
431
+
432
+ if none_grad:
433
+ for p in params:
434
+ p.grad = None
435
 
436
  yield # --- YIELD 1: other chunks can launch their gather ---
437
 
build/torch210-cxx11-cu130-x86_64-linux/qk_clip.py CHANGED
@@ -5,6 +5,8 @@ from dataclasses import dataclass
5
  import torch
6
  from torch.distributed.tensor import DTensor
7
 
 
 
8
  logger = logging.getLogger(__name__)
9
 
10
 
@@ -23,7 +25,7 @@ def parse_qk_layer(name: str) -> tuple[str | None, int]:
23
  'model.7.attn.k_proj.weight' -> ('k_proj', 7)
24
  'model.4.attn.v_proj.weight' -> (None, -1)
25
  """
26
- parts = name.split('.')
27
  if len(parts) < 3:
28
  return None, -1
29
 
@@ -100,23 +102,27 @@ def compute_scales(p, qk_clip_state):
100
  threshold = qk_clip_state.threshold
101
  logit = qk_clip_state.logit
102
 
103
- H_global = p.shape[0] // head_dim
104
- scales_full = torch.ones(H_global, device=p.data.device)
105
- scaling = 0
106
-
107
  for logit_idx, head_idx in enumerate(indices):
108
  v_ele = float(logit[logit_idx])
109
  if v_ele > threshold:
110
  new_scale = math.sqrt(threshold / v_ele)
111
- if new_scale < scales_full[head_idx]:
112
- scales_full[head_idx] = new_scale
113
  logger.info(
114
  f"[{kind}] Head {head_idx} exceeded threshold "
115
  f"(value={v_ele:.4f}, threshold={threshold:.4f}) -> applying scale={new_scale:.4f}"
116
  )
117
- scaling += 1
118
 
119
- return scales_full if scaling > 0 else None
 
 
 
 
 
 
 
120
 
121
 
122
  def qk_clip(p, scales, head_dim):
 
5
  import torch
6
  from torch.distributed.tensor import DTensor
7
 
8
+ from .core import normalize_fqn
9
+
10
  logger = logging.getLogger(__name__)
11
 
12
 
 
25
  'model.7.attn.k_proj.weight' -> ('k_proj', 7)
26
  'model.4.attn.v_proj.weight' -> (None, -1)
27
  """
28
+ parts = normalize_fqn(name).split('.')
29
  if len(parts) < 3:
30
  return None, -1
31
 
 
102
  threshold = qk_clip_state.threshold
103
  logit = qk_clip_state.logit
104
 
105
+ # Check if any head exceeds threshold before allocating.
106
+ head_scales = {}
 
 
107
  for logit_idx, head_idx in enumerate(indices):
108
  v_ele = float(logit[logit_idx])
109
  if v_ele > threshold:
110
  new_scale = math.sqrt(threshold / v_ele)
111
+ if head_idx not in head_scales or new_scale < head_scales[head_idx]:
112
+ head_scales[head_idx] = new_scale
113
  logger.info(
114
  f"[{kind}] Head {head_idx} exceeded threshold "
115
  f"(value={v_ele:.4f}, threshold={threshold:.4f}) -> applying scale={new_scale:.4f}"
116
  )
 
117
 
118
+ if not head_scales:
119
+ return None
120
+
121
+ H_global = p.shape[0] // head_dim
122
+ scales_full = torch.ones(H_global, device=p.data.device)
123
+ for head_idx, scale in head_scales.items():
124
+ scales_full[head_idx] = scale
125
+ return scales_full
126
 
127
 
128
  def qk_clip(p, scales, head_dim):
build/torch210-cxx11-rocm70-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _optimizer_7aef62f_dirty
3
- ops = torch.ops._optimizer_7aef62f_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_optimizer_7aef62f_dirty::{op_name}"
 
1
  import torch
2
+ from . import _optimizer_5b58933_dirty
3
+ ops = torch.ops._optimizer_5b58933_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_optimizer_5b58933_dirty::{op_name}"
build/torch210-cxx11-rocm70-x86_64-linux/{_optimizer_7aef62f_dirty.abi3.so → _optimizer_5b58933_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:00e9d9e1c2306badb97c3b8f2454a47d6335a302101a38c804ad3c7b075168cc
3
  size 1866400
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0102e10121a43f6d5d59a23f2c0e21d88469cc4597d84f7d48b64b0fabfeacdb
3
  size 1866400
build/torch210-cxx11-rocm70-x86_64-linux/adamw.py CHANGED
@@ -1,8 +1,12 @@
 
1
  from collections import defaultdict
2
  from typing import cast
3
 
4
  import torch
5
  from torch.distributed.tensor import DTensor
 
 
 
6
 
7
 
8
  def fused_adamw(
@@ -72,54 +76,72 @@ def fused_adamw(
72
  )
73
 
74
 
75
- def step_adamw_params(optimizer_state, params, group):
76
- """Run fused AdamW on a list of parameters sharing the same placement.
 
77
 
78
- Args:
79
- optimizer_state: The optimizer's state dict (self.state in Muon).
80
- params: List of parameters to update.
81
- group: Parameter group dict with lr, adamw_betas, adamw_eps, weight_decay.
82
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  params_with_grads = []
84
  grads = []
85
  moment1 = []
86
  moment2 = []
87
- max_exp_avg_sqs = []
88
  state_steps = []
89
- lr = group["lr"]
90
- beta1, beta2 = group["adamw_betas"]
91
- eps = group["adamw_eps"]
92
- weight_decay = group["weight_decay"]
93
 
94
  for p in params:
95
  g = p.grad
96
  if g is None:
97
  continue
98
  state = optimizer_state[p]
99
- params_with_grads.append(p)
100
- grads.append(g)
101
  if "step" not in state:
102
- state["step"] = (torch.zeros((),
103
- dtype=torch.float32,
104
- device=p.device))
105
  state["moment1"] = torch.zeros_like(g)
106
  state["moment2"] = torch.zeros_like(g)
107
- moment1.append(state["moment1"])
108
- moment2.append(state["moment2"])
109
  if not isinstance(state["step"], torch.Tensor):
110
- step_tensor = torch.tensor(state["step"],
111
- dtype=torch.float32,
112
- device=p.device)
113
- else:
114
- step_tensor = state["step"]
115
- state_steps.append(step_tensor)
 
 
 
 
 
 
116
 
117
  fused_adamw(
118
  params_with_grads,
119
  grads,
120
  moment1,
121
  moment2,
122
- max_exp_avg_sqs,
123
  state_steps,
124
  amsgrad=False,
125
  beta1=beta1,
@@ -131,24 +153,119 @@ def step_adamw_params(optimizer_state, params, group):
131
  )
132
 
133
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  def step_adamw(optimizer_state, group):
135
  """Dispatch AdamW step, grouping parameters by type and placement.
136
 
 
 
 
137
  Args:
138
  optimizer_state: The optimizer's state dict (self.state in Muon).
139
  group: Parameter group dict.
140
  """
141
  params = group["params"]
 
142
 
143
- # group params with its type and placement
144
- placement_to_params: dict[tuple, list[torch.Tensor]] = defaultdict(list)
145
- for p in params:
146
- match p:
147
- case DTensor():
148
- placement_to_params[tuple([p.placements,
149
- p.device_mesh])].append(p)
150
- case torch.Tensor():
151
- placement_to_params[tuple([torch.Tensor, None])].append(p)
152
-
153
- for group_params in placement_to_params.values():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  step_adamw_params(optimizer_state, group_params, group)
 
1
+ import logging
2
  from collections import defaultdict
3
  from typing import cast
4
 
5
  import torch
6
  from torch.distributed.tensor import DTensor
7
+ from torch.profiler import record_function
8
+
9
+ logger = logging.getLogger(__name__)
10
 
11
 
12
  def fused_adamw(
 
76
  )
77
 
78
 
79
+ def _to_local(t):
80
+ """Unwrap DTensor to local tensor for fused ops."""
81
+ return t._local_tensor if isinstance(t, DTensor) else t
82
 
83
+
84
+ # ---------------------------------------------------------------------------
85
+ # Caches for eliminating per-step Python overhead.
86
+ #
87
+ # Placement grouping and tensor list assembly are identical every step
88
+ # (params don't change placement, moment/step tensors are the same objects
89
+ # after initialisation). We cache them keyed by id() of the param list
90
+ # stored in param_groups (stable across steps).
91
+ #
92
+ # Only gradients change each step and must be collected fresh.
93
+ # ---------------------------------------------------------------------------
94
+
95
+ # id(group["params"]) → dict[placement_key, list[param]]
96
+ _placement_cache: dict[int, dict[tuple, list]] = {}
97
+
98
+ # id(placement_group_list) → (params_local, moment1, moment2, state_steps)
99
+ _tensor_cache: dict[int, tuple[list, list, list, list]] = {}
100
+
101
+
102
+ def _step_adamw_params_slow(optimizer_state, params, group):
103
+ """Uncached fallback for the rare case where some params lack grads."""
104
  params_with_grads = []
105
  grads = []
106
  moment1 = []
107
  moment2 = []
 
108
  state_steps = []
 
 
 
 
109
 
110
  for p in params:
111
  g = p.grad
112
  if g is None:
113
  continue
114
  state = optimizer_state[p]
115
+ params_with_grads.append(_to_local(p))
116
+ grads.append(_to_local(g))
117
  if "step" not in state:
118
+ state["step"] = torch.zeros((),
119
+ dtype=torch.float32,
120
+ device=p.device)
121
  state["moment1"] = torch.zeros_like(g)
122
  state["moment2"] = torch.zeros_like(g)
123
+ moment1.append(_to_local(state["moment1"]))
124
+ moment2.append(_to_local(state["moment2"]))
125
  if not isinstance(state["step"], torch.Tensor):
126
+ state["step"] = torch.tensor(state["step"],
127
+ dtype=torch.float32,
128
+ device=p.device)
129
+ state_steps.append(state["step"])
130
+
131
+ if not params_with_grads:
132
+ return
133
+
134
+ lr = group["lr"]
135
+ beta1, beta2 = group["adamw_betas"]
136
+ eps = group["adamw_eps"]
137
+ weight_decay = group["weight_decay"]
138
 
139
  fused_adamw(
140
  params_with_grads,
141
  grads,
142
  moment1,
143
  moment2,
144
+ [],
145
  state_steps,
146
  amsgrad=False,
147
  beta1=beta1,
 
153
  )
154
 
155
 
156
+ def step_adamw_params(optimizer_state, params, group):
157
+ """Run fused AdamW on a list of parameters sharing the same placement.
158
+
159
+ After the first call, cached tensor lists (params_local, moment1,
160
+ moment2, state_steps) are reused — only gradients are collected fresh.
161
+
162
+ Args:
163
+ optimizer_state: The optimizer's state dict (self.state in Muon).
164
+ params: List of parameters to update.
165
+ group: Parameter group dict with lr, adamw_betas, adamw_eps, weight_decay.
166
+ """
167
+ # Collect grads — the only thing that changes each step.
168
+ with record_function("adamw::collect_grads"):
169
+ grads = []
170
+ for p in params:
171
+ g = p.grad
172
+ if g is None:
173
+ # Rare: fall back to slow path that filters per-param.
174
+ _step_adamw_params_slow(optimizer_state, params, group)
175
+ return
176
+ grads.append(_to_local(g))
177
+
178
+ tensor_key = id(params)
179
+ if tensor_key not in _tensor_cache:
180
+ with record_function("adamw::init_tensor_cache"):
181
+ params_local = []
182
+ moment1 = []
183
+ moment2 = []
184
+ state_steps = []
185
+
186
+ for p in params:
187
+ state = optimizer_state[p]
188
+ params_local.append(_to_local(p))
189
+ if "step" not in state:
190
+ state["step"] = torch.zeros((),
191
+ dtype=torch.float32,
192
+ device=p.device)
193
+ state["moment1"] = torch.zeros_like(p.grad)
194
+ state["moment2"] = torch.zeros_like(p.grad)
195
+ moment1.append(_to_local(state["moment1"]))
196
+ moment2.append(_to_local(state["moment2"]))
197
+ if not isinstance(state["step"], torch.Tensor):
198
+ state["step"] = torch.tensor(state["step"],
199
+ dtype=torch.float32,
200
+ device=p.device)
201
+ state_steps.append(state["step"])
202
+
203
+ _tensor_cache[tensor_key] = (params_local, moment1, moment2,
204
+ state_steps)
205
+
206
+ params_local, moment1, moment2, state_steps = _tensor_cache[tensor_key]
207
+
208
+ lr = group["lr"]
209
+ beta1, beta2 = group["adamw_betas"]
210
+ eps = group["adamw_eps"]
211
+ weight_decay = group["weight_decay"]
212
+
213
+ with record_function("adamw::fused_adamw"):
214
+ fused_adamw(
215
+ params_local,
216
+ grads,
217
+ moment1,
218
+ moment2,
219
+ [],
220
+ state_steps,
221
+ amsgrad=False,
222
+ beta1=beta1,
223
+ beta2=beta2,
224
+ lr=lr,
225
+ weight_decay=weight_decay,
226
+ eps=eps,
227
+ maximize=False,
228
+ )
229
+
230
+
231
  def step_adamw(optimizer_state, group):
232
  """Dispatch AdamW step, grouping parameters by type and placement.
233
 
234
+ Placement grouping is cached after the first call since params never
235
+ change their placement between steps.
236
+
237
  Args:
238
  optimizer_state: The optimizer's state dict (self.state in Muon).
239
  group: Parameter group dict.
240
  """
241
  params = group["params"]
242
+ placement_key = id(params)
243
 
244
+ if placement_key not in _placement_cache:
245
+ with record_function("adamw::group_by_placement"):
246
+ placement_to_params: dict[tuple,
247
+ list[torch.Tensor]] = defaultdict(list)
248
+ for p in params:
249
+ match p:
250
+ case DTensor():
251
+ logger.debug(
252
+ "[AdamW] DTensor param: shape=%s, placements=%s, "
253
+ "mesh=%s, grad=%s", p.shape, p.placements,
254
+ p.device_mesh.mesh_dim_names,
255
+ p.grad.shape if p.grad is not None else None)
256
+ placement_to_params[tuple(
257
+ [p.placements, p.device_mesh])].append(p)
258
+ case torch.Tensor():
259
+ logger.debug(
260
+ "[AdamW] plain param: shape=%s, grad=%s", p.shape,
261
+ p.grad.shape if p.grad is not None else None)
262
+ placement_to_params[tuple([torch.Tensor,
263
+ None])].append(p)
264
+
265
+ logger.debug("[AdamW] %d placement groups, %d total params",
266
+ len(placement_to_params), len(params))
267
+
268
+ _placement_cache[placement_key] = dict(placement_to_params)
269
+
270
+ for group_params in _placement_cache[placement_key].values():
271
  step_adamw_params(optimizer_state, group_params, group)
build/torch210-cxx11-rocm70-x86_64-linux/core.py CHANGED
@@ -1,11 +1,25 @@
 
1
  import math
2
  from dataclasses import dataclass
 
3
 
4
  import torch
5
- import torch.distributed as dist
6
  from torch.distributed import ProcessGroup
7
  from torch.distributed.tensor import DTensor
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
  @dataclass
11
  class _muon_state:
@@ -17,26 +31,71 @@ class _muon_state:
17
  qk_clip_state: torch.Tensor | None = None
18
 
19
 
20
- def update_g(optimizer_state, p, g, group, momentum):
21
- """Apply momentum update to gradient.
 
 
 
 
 
 
22
 
23
- Args:
24
- optimizer_state: The optimizer's state dict (self.state in Muon).
25
- p: Parameter tensor.
26
- g: Gradient tensor.
27
- group: Parameter group dict.
28
- momentum: Momentum coefficient.
29
 
30
- Returns:
31
- Momentum-updated gradient tensor.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  """
33
- state = optimizer_state[p]
34
- buf = state.setdefault("momentum_buffer", torch.zeros_like(g))
35
- torch.add(g, buf, alpha=momentum, out=buf)
36
- if group["nesterov"]:
37
- g.add_(buf, alpha=momentum)
38
- return g
39
- return buf
 
 
 
 
 
 
 
 
 
 
 
40
 
41
 
42
  def update_p(p, u, lr, adjusted_lr, weight_decay):
@@ -49,14 +108,13 @@ def update_p(p, u, lr, adjusted_lr, weight_decay):
49
  adjusted_lr: Size-adjusted learning rate.
50
  weight_decay: Weight decay coefficient.
51
  """
52
- if isinstance(p, torch.nn.Parameter):
53
- # apply weight decay
54
- p.data.mul_(1 - lr * weight_decay)
55
- # apply update
56
- p.data.add_(u, alpha=-adjusted_lr)
57
- else:
58
- p.mul_(1 - lr * weight_decay)
59
- p.add_(u, alpha=-adjusted_lr)
60
 
61
 
62
  def adjust_lr_for_muon(lr, param_shape):
@@ -77,14 +135,55 @@ def adjust_lr_for_muon(lr, param_shape):
77
  return adjusted_lr
78
 
79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  def default_is_muon(name, x, expert_keys=None):
81
- skip_keys = ["embed_tokens", "lm_head", "tok_embeddings", "output"]
82
- if any(key in name for key in skip_keys):
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  return False
84
  effective_ndim = x.ndim
85
- if expert_keys and any(key in name for key in expert_keys):
 
86
  effective_ndim -= 1
87
- return effective_ndim >= 2
 
 
 
 
 
88
 
89
 
90
  def get_default_muon_param_groups(model, is_muon_func=None, expert_keys=None):
@@ -92,7 +191,7 @@ def get_default_muon_param_groups(model, is_muon_func=None, expert_keys=None):
92
  is_muon_func = lambda n, x: default_is_muon(n, x, expert_keys)
93
 
94
  muon_params, muon_names = [], []
95
- non_muon_params = []
96
 
97
  for n, p in model.named_parameters():
98
  if not p.requires_grad:
@@ -102,6 +201,10 @@ def get_default_muon_param_groups(model, is_muon_func=None, expert_keys=None):
102
  muon_names.append(n)
103
  else:
104
  non_muon_params.append(p)
 
 
 
 
105
 
106
  return [
107
  {
 
1
+ import logging
2
  import math
3
  from dataclasses import dataclass
4
+ from typing import List
5
 
6
  import torch
 
7
  from torch.distributed import ProcessGroup
8
  from torch.distributed.tensor import DTensor
9
 
10
+ # torch.compile wraps modules as OptimizedModule, inserting "_orig_mod" into
11
+ # parameter FQNs. Activation checkpointing similarly inserts
12
+ # "_checkpoint_wrapped_module". Strip these so name-based matching (skip_keys,
13
+ # expert_keys, QK layer parsing) works regardless of wrapper nesting.
14
+ _WRAPPER_PARTS = frozenset({"_orig_mod", "_checkpoint_wrapped_module"})
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
+ def normalize_fqn(name: str) -> str:
20
+ """Strip torch.compile / checkpoint wrapper components from a parameter FQN."""
21
+ return ".".join(p for p in name.split(".") if p not in _WRAPPER_PARTS)
22
+
23
 
24
  @dataclass
25
  class _muon_state:
 
31
  qk_clip_state: torch.Tensor | None = None
32
 
33
 
34
+ def _batch_momentum(
35
+ grads: List[torch.Tensor],
36
+ momentum_bufs: List[torch.Tensor],
37
+ momentum: torch.Tensor,
38
+ ) -> None:
39
+ """Batched momentum update (no nesterov)."""
40
+ torch._foreach_mul_(momentum_bufs, momentum)
41
+ torch._foreach_add_(momentum_bufs, grads)
42
 
 
 
 
 
 
 
43
 
44
+ def _batch_momentum_nesterov(
45
+ grads: List[torch.Tensor],
46
+ momentum_bufs: List[torch.Tensor],
47
+ momentum: torch.Tensor,
48
+ ) -> None:
49
+ """Batched momentum update with nesterov correction."""
50
+ torch._foreach_mul_(momentum_bufs, momentum)
51
+ torch._foreach_add_(momentum_bufs, grads)
52
+ nesterov_terms = torch._foreach_mul(momentum_bufs, momentum)
53
+ torch._foreach_add_(grads, nesterov_terms)
54
+
55
+
56
+ _compiled_momentum: dict[bool, callable] = {}
57
+ _use_momentum_compile = True
58
+
59
+
60
+ def set_momentum_compile(enabled: bool):
61
+ """Toggle torch.compile for batched momentum."""
62
+ global _use_momentum_compile
63
+ _use_momentum_compile = enabled
64
+
65
+
66
+ def batch_pre_ortho(
67
+ grads: List[torch.Tensor],
68
+ momentum_bufs: List[torch.Tensor],
69
+ momentum: torch.Tensor,
70
+ nesterov: bool,
71
+ ) -> None:
72
+ """Batched momentum update on lists of plain tensors.
73
+
74
+ Mirrors dion's ``muon_update_pre_orthogonalize``.
75
+ Inputs must be plain CUDA tensors (not DTensor).
76
+ Modifies ``momentum_bufs`` and (for nesterov) ``grads`` in-place.
77
+
78
+ When compile is enabled, uses separately compiled functions for
79
+ nesterov=True/False to avoid graph breaks from the branch.
80
  """
81
+ fn = _batch_momentum_nesterov if nesterov else _batch_momentum
82
+ if _use_momentum_compile:
83
+ if nesterov not in _compiled_momentum:
84
+ _compiled_momentum[nesterov] = torch.compile(fn)
85
+ fn = _compiled_momentum[nesterov]
86
+ fn(grads, momentum_bufs, momentum)
87
+
88
+
89
+ def _update_p_impl(p_data, u_data, lr, adjusted_lr, weight_decay):
90
+ """Weight-decay + update on plain tensors.
91
+
92
+ Not compiled: per-param @torch.compile caused ~0.25ms TorchDynamo cache
93
+ lookup per call × 256+ params = massive overhead. The pipeline path uses
94
+ batched _foreach_* ops instead; this function remains for base() and
95
+ distributed_muon().
96
+ """
97
+ p_data.mul_(1 - lr * weight_decay)
98
+ p_data.add_(u_data, alpha=-adjusted_lr)
99
 
100
 
101
  def update_p(p, u, lr, adjusted_lr, weight_decay):
 
108
  adjusted_lr: Size-adjusted learning rate.
109
  weight_decay: Weight decay coefficient.
110
  """
111
+ # Unwrap Parameter -> underlying data tensor.
112
+ p_data = p.data if isinstance(p, torch.nn.Parameter) else p
113
+ # Unwrap DTensor -> local CUDA tensor for compiled kernel.
114
+ if isinstance(p_data, DTensor):
115
+ p_data = p_data._local_tensor
116
+ u_data = u._local_tensor if isinstance(u, DTensor) else u
117
+ _update_p_impl(p_data, u_data, lr, adjusted_lr, weight_decay)
 
118
 
119
 
120
  def adjust_lr_for_muon(lr, param_shape):
 
135
  return adjusted_lr
136
 
137
 
138
+ def _match_key(parts, key):
139
+ """Check if key matches as contiguous components in parts.
140
+
141
+ Single-component keys (e.g. "experts") match any single component.
142
+ Multi-component keys (e.g. "experts.w1") match as a contiguous subsequence.
143
+ """
144
+ key_parts = key.split(".")
145
+ key_len = len(key_parts)
146
+ if key_len == 1:
147
+ return key in parts
148
+ return any(parts[i:i + key_len] == key_parts
149
+ for i in range(len(parts) - key_len + 1))
150
+
151
+
152
+ def is_expert_param(name, expert_keys):
153
+ """Check if a parameter name matches any expert key (component-level)."""
154
+ if not expert_keys:
155
+ return False
156
+ parts = normalize_fqn(name).split(".")
157
+ return any(_match_key(parts, key) for key in expert_keys)
158
+
159
+
160
  def default_is_muon(name, x, expert_keys=None):
161
+ normalized = normalize_fqn(name)
162
+ parts = normalized.split(".")
163
+ skip_keys = [
164
+ "embed_tokens",
165
+ "lm_head",
166
+ "tok_embeddings",
167
+ "output",
168
+ "mhc_attn",
169
+ "mhc_ffn",
170
+ "lambda_proj",
171
+ ]
172
+ if any(key in parts for key in skip_keys):
173
+ logger.info(
174
+ "[is_muon] %s (orig: %s): skip (matched skip_key), ndim=%d",
175
+ normalized, name, x.ndim)
176
  return False
177
  effective_ndim = x.ndim
178
+ is_expert = is_expert_param(name, expert_keys)
179
+ if is_expert:
180
  effective_ndim -= 1
181
+ result = effective_ndim >= 2
182
+ logger.info(
183
+ "[is_muon] %s (orig: %s): ndim=%d, expert=%s, effective_ndim=%d → %s",
184
+ normalized, name, x.ndim, is_expert, effective_ndim,
185
+ "Muon" if result else "AdamW")
186
+ return result
187
 
188
 
189
  def get_default_muon_param_groups(model, is_muon_func=None, expert_keys=None):
 
191
  is_muon_func = lambda n, x: default_is_muon(n, x, expert_keys)
192
 
193
  muon_params, muon_names = [], []
194
+ non_muon_params, non_muon_names = [], []
195
 
196
  for n, p in model.named_parameters():
197
  if not p.requires_grad:
 
201
  muon_names.append(n)
202
  else:
203
  non_muon_params.append(p)
204
+ non_muon_names.append(n)
205
+
206
+ logger.info("[param_groups] expert_keys=%s, Muon=%d, AdamW=%d",
207
+ expert_keys, len(muon_names), len(non_muon_names))
208
 
209
  return [
210
  {
build/torch210-cxx11-rocm70-x86_64-linux/cpu_offload.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """CPU offloading for optimizer states.
2
+
3
+ Manages a pinned CPU memory pool and async CUDA streams to offload
4
+ optimizer state tensors (momentum buffers, Adam moments) to CPU between
5
+ optimizer steps, freeing GPU memory.
6
+
7
+ All tracked tensors are packed into a single flat pinned CPU buffer
8
+ (per dtype). D2H and H2D copies are performed per-tensor directly
9
+ between individual GPU tensors and their slice of the CPU flat buffer
10
+ — no GPU staging buffer is allocated, so there is **no temporary GPU
11
+ memory spike** during offload or reload.
12
+
13
+ Individual tensor storages are freed after offload via
14
+ ``untyped_storage().resize_(0)``, preserving tensor identity so
15
+ downstream caches remain valid.
16
+ """
17
+
18
+ import logging
19
+ from collections import defaultdict
20
+
21
+ import torch
22
+ from torch.distributed.tensor import DTensor
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+
27
+ class CPUOffloadPool:
28
+ """Pinned CPU memory pool for async optimizer state offloading.
29
+
30
+ Tracked tensors are grouped by dtype. Each group gets a single flat
31
+ pinned CPU buffer. D2H / H2D copies are per-tensor (into slices of
32
+ the flat buffer) to avoid allocating a GPU staging buffer.
33
+ """
34
+
35
+ def __init__(self):
36
+ self._managed: list[torch.Tensor] = []
37
+ self._storage_nbytes: dict[int, int] = {} # id(t) → bytes
38
+
39
+ # Per-dtype group: populated on first offload.
40
+ # dtype → dict with keys:
41
+ # "indices" : list[int] managed-list indices
42
+ # "offsets" : list[tuple[int,int]] (start, numel) in flat buf
43
+ # "total" : int total numel
44
+ # "cpu_flat" : Tensor pinned CPU buffer
45
+ self._groups: dict[torch.dtype, dict] = {}
46
+
47
+ self._offload_stream: torch.cuda.Stream | None = None
48
+ self._device: torch.device | None = None
49
+ self._initialized: bool = False
50
+ self._logged: bool = False
51
+
52
+ # ------------------------------------------------------------------
53
+ @staticmethod
54
+ def _local(t: torch.Tensor) -> torch.Tensor:
55
+ """Unwrap DTensor to its local CUDA tensor."""
56
+ return t._local_tensor if isinstance(t, DTensor) else t
57
+
58
+ def _ensure_stream(self):
59
+ if self._offload_stream is None:
60
+ self._offload_stream = torch.cuda.Stream(device=self._device)
61
+
62
+ # ------------------------------------------------------------------
63
+ def track(self, tensor: torch.Tensor):
64
+ """Register a GPU tensor for CPU offloading. Idempotent."""
65
+ tid = id(tensor)
66
+ if tid in self._storage_nbytes:
67
+ return
68
+ local = self._local(tensor)
69
+ if self._device is None:
70
+ self._device = local.device
71
+ self._storage_nbytes[tid] = local.untyped_storage().size()
72
+ self._managed.append(tensor)
73
+
74
+ # ------------------------------------------------------------------
75
+ def _init_buffers(self):
76
+ """Build per-dtype flat buffers on first offload."""
77
+ # Group managed tensors by dtype.
78
+ dtype_map: dict[torch.dtype, list[tuple[int, int]]] = defaultdict(list)
79
+ for idx, t in enumerate(self._managed):
80
+ local = self._local(t)
81
+ dtype_map[local.dtype].append((idx, local.numel()))
82
+
83
+ total_cpu_bytes = 0
84
+ for dtype, entries in dtype_map.items():
85
+ offsets: list[tuple[int, int]] = []
86
+ indices: list[int] = []
87
+ off = 0
88
+ for idx, n in entries:
89
+ indices.append(idx)
90
+ offsets.append((off, n))
91
+ off += n
92
+ cpu_flat = torch.empty(off, dtype=dtype, device="cpu", pin_memory=True)
93
+ self._groups[dtype] = {
94
+ "indices": indices,
95
+ "offsets": offsets,
96
+ "total": off,
97
+ "cpu_flat": cpu_flat,
98
+ }
99
+ total_cpu_bytes += off * cpu_flat.element_size()
100
+
101
+ self._initialized = True
102
+ logger.info(
103
+ "[CPUOffload] Pool initialized: %d tensors, %d dtype group(s), "
104
+ "%.2f MB pinned CPU memory",
105
+ len(self._managed),
106
+ len(self._groups),
107
+ total_cpu_bytes / (1024**2),
108
+ )
109
+
110
+ # ------------------------------------------------------------------
111
+ def offload(self):
112
+ """Per-tensor async D2H into CPU flat buffer, then free GPU storage."""
113
+ if not self._managed:
114
+ return
115
+ if not self._initialized:
116
+ self._init_buffers()
117
+ self._ensure_stream()
118
+
119
+ # Offload stream waits for compute to finish.
120
+ compute_event = torch.cuda.current_stream(
121
+ self._device).record_event()
122
+ self._offload_stream.wait_event(compute_event)
123
+
124
+ offloaded_bytes = 0
125
+
126
+ # Per-tensor D2H copies directly into CPU flat buffer slices.
127
+ # No GPU staging buffer → no temporary GPU memory spike.
128
+ with torch.cuda.stream(self._offload_stream):
129
+ for dtype, grp in self._groups.items():
130
+ indices = grp["indices"]
131
+ offsets = grp["offsets"]
132
+ cpu_flat = grp["cpu_flat"]
133
+
134
+ for i, mgd_idx in enumerate(indices):
135
+ local = self._local(self._managed[mgd_idx])
136
+ off, n = offsets[i]
137
+ cpu_flat[off:off + n].copy_(
138
+ local.reshape(-1), non_blocking=True)
139
+
140
+ offloaded_bytes += grp["total"] * cpu_flat.element_size()
141
+
142
+ # Wait for all D2H copies to land, then free GPU storage.
143
+ self._offload_stream.synchronize()
144
+ for t in self._managed:
145
+ self._local(t).untyped_storage().resize_(0)
146
+
147
+ if not self._logged:
148
+ logger.info("[CPUOffload] Offloaded %.2f MB (GPU → CPU)",
149
+ offloaded_bytes / (1024**2))
150
+
151
+ # ------------------------------------------------------------------
152
+ def reload(self):
153
+ """Per-tensor H2D from CPU flat buffer on the default stream.
154
+
155
+ Runs on the current (default) CUDA stream to avoid stream
156
+ interaction issues with the parallel Muon pipeline. Since
157
+ pinned CPU memory is the source, the copies overlap with
158
+ GPU idle time between steps.
159
+ """
160
+ if not self._managed or not self._initialized:
161
+ return
162
+
163
+ reloaded_bytes = 0
164
+
165
+ # Re-allocate all GPU storages first.
166
+ for t in self._managed:
167
+ local = self._local(t)
168
+ local.untyped_storage().resize_(self._storage_nbytes[id(t)])
169
+
170
+ # Per-tensor H2D copies from CPU flat buffer slices.
171
+ # non_blocking=True with pinned source allows DMA overlap.
172
+ for dtype, grp in self._groups.items():
173
+ indices = grp["indices"]
174
+ offsets = grp["offsets"]
175
+ cpu_flat = grp["cpu_flat"]
176
+
177
+ for i, mgd_idx in enumerate(indices):
178
+ local = self._local(self._managed[mgd_idx])
179
+ off, n = offsets[i]
180
+ local.reshape(-1).copy_(
181
+ cpu_flat[off:off + n], non_blocking=True)
182
+
183
+ reloaded_bytes += grp["total"] * cpu_flat.element_size()
184
+
185
+ if not self._logged:
186
+ logger.info("[CPUOffload] Reloaded %.2f MB (CPU → GPU)",
187
+ reloaded_bytes / (1024**2))
188
+ self._logged = True
build/torch210-cxx11-rocm70-x86_64-linux/distributed/utils.py CHANGED
@@ -72,12 +72,6 @@ def get_slices_of_dtensor(
72
  else:
73
  curr_size = target.size()[shard_dim]
74
 
75
- if curr_size % num_chunks != 0:
76
- raise NotImplementedError(
77
- f"Dimension size {curr_size} is not divisible "
78
- f"by number of ranks {num_chunks} for shard "
79
- f"placement on dim {shard_dim}. (shape: {target.shape})")
80
-
81
  # Compute indices for this level of sharding
82
  if isinstance(placement, _StridedShard):
83
  _shard_size, offsets = _StridedShard.local_shard_size_and_offset(
 
72
  else:
73
  curr_size = target.size()[shard_dim]
74
 
 
 
 
 
 
 
75
  # Compute indices for this level of sharding
76
  if isinstance(placement, _StridedShard):
77
  _shard_size, offsets = _StridedShard.local_shard_size_and_offset(
build/torch210-cxx11-rocm70-x86_64-linux/matmul_transpose_triton.py CHANGED
@@ -43,6 +43,7 @@ def get_autotune_config():
43
  @triton.autotune(
44
  configs=get_autotune_config(),
45
  key=['M', 'K'],
 
46
  )
47
  @triton.jit
48
  def mmt_kernel(x, y, M, K, stride_xm, stride_xk, stride_ym, stride_yn,
@@ -102,16 +103,10 @@ def mmt_kernel(x, y, M, K, stride_xm, stride_xk, stride_ym, stride_yn,
102
  tl.store(ct_ptrs, tl.permute(c, (1, 0)), mask=ct_mask)
103
 
104
 
105
- def matmul_transpose_assign(d_in, d_out):
106
- assert d_in.is_cuda, "Input `d_in` must be a CUDA tensor"
107
- assert d_out.is_cuda, "Input `d_out` must be a CUDA tensor"
108
- assert d_in.device == d_out.device, "Inputs `d_in` and `d_out` must be on the same CUDA device"
109
- assert d_in.dtype == d_out.dtype, "Inputs must have the same data type"
110
- assert d_in.ndim == 2, "Input `d_in` must be a 2D tensor"
111
- assert d_out.ndim == 2, "Input `d_out` must be a 2D tensor"
112
- assert d_in.size(0) == d_out.size(0) == d_out.size(0), \
113
- "First dimension of `d_in` must match first and second dimension of `d_out`"
114
-
115
  d_in = d_in.contiguous()
116
  M, K = d_in.shape
117
  grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(
@@ -119,3 +114,9 @@ def matmul_transpose_assign(d_in, d_out):
119
  with torch.cuda.device(d_in.device.index):
120
  mmt_kernel[grid](d_in, d_out, M, K, d_in.stride(0), d_in.stride(1),
121
  d_out.stride(0), d_out.stride(1))
 
 
 
 
 
 
 
43
  @triton.autotune(
44
  configs=get_autotune_config(),
45
  key=['M', 'K'],
46
+ restore_value=['y'],
47
  )
48
  @triton.jit
49
  def mmt_kernel(x, y, M, K, stride_xm, stride_xk, stride_ym, stride_yn,
 
103
  tl.store(ct_ptrs, tl.permute(c, (1, 0)), mask=ct_mask)
104
 
105
 
106
+ @torch.library.custom_op("muon::matmul_transpose_assign",
107
+ mutates_args=("d_out", ))
108
+ def matmul_transpose_assign(d_in: torch.Tensor, d_out: torch.Tensor) -> None:
109
+ """Compute d_out = d_in @ d_in.T using an optimized Triton kernel."""
 
 
 
 
 
 
110
  d_in = d_in.contiguous()
111
  M, K = d_in.shape
112
  grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(
 
114
  with torch.cuda.device(d_in.device.index):
115
  mmt_kernel[grid](d_in, d_out, M, K, d_in.stride(0), d_in.stride(1),
116
  d_out.stride(0), d_out.stride(1))
117
+
118
+
119
+ @matmul_transpose_assign.register_fake
120
+ def _(d_in: torch.Tensor, d_out: torch.Tensor) -> None:
121
+ """FakeTensor impl: d_out is already allocated, mutation is declared."""
122
+ pass
build/torch210-cxx11-rocm70-x86_64-linux/muon.py CHANGED
@@ -10,13 +10,16 @@ from torch.profiler import record_function
10
 
11
  from .adamw import step_adamw
12
  from .async_utils import run_pipeline
13
- from .core import (_muon_state, adjust_lr_for_muon,
14
- get_default_muon_param_groups, update_g, update_p)
 
15
  from .distributed.utils import (_is_shard, construct_shard_mesh,
16
  get_slices_of_dtensor)
17
  from .newton_schulz import (COMM_DTYPE, DEFAULT_CHUNK_SIZE_RATIO,
18
- _zeropower_via_newtonschulz5)
19
- from .pipeline import muon_chunk_pipeline
 
 
20
  from .qk_clip import compute_scales, get_qk_clip_info, qk_clip
21
 
22
  logger = logging.getLogger(__name__)
@@ -45,9 +48,21 @@ def _expand_expert_params(names, params, expert_keys):
45
  expanded_params = []
46
 
47
  for n, p in zip(names, params):
48
- is_expert = expert_keys and any(key in n for key in expert_keys)
49
  is_dtensor = isinstance(p.data, DTensor)
50
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  if not is_expert:
52
  assert p.data.ndim <= 2, (
53
  f"Param {n} has ndim={p.data.ndim} but does not match "
@@ -168,7 +183,6 @@ class Muon(torch.optim.Optimizer):
168
  Use shard ranks * DEFAULT_CHUNK_SIZE_RATIO when -1 is specified.
169
  use_distributed_muon: Use distributed muon by Liu et al. (2024).
170
  For testing purpose only.
171
- small_param_numel_threshold: Threshold for classifying parameters as small and falling back to distributed Muon
172
  expert_keys: List of strings to identify expert-parallel parameters.
173
  If any key appears in a parameter's name, its outermost
174
  dimension is treated as the expert dimension and expanded
@@ -193,8 +207,8 @@ class Muon(torch.optim.Optimizer):
193
  warmup_step=5,
194
  chunk_size=-1,
195
  use_distributed_muon=False,
196
- small_param_numel_threshold=65536,
197
- expert_keys=None):
198
  defaults = dict(
199
  lr=lr,
200
  weight_decay=weight_decay,
@@ -228,8 +242,12 @@ class Muon(torch.optim.Optimizer):
228
  self.warmup_step = warmup_step
229
  self.chunk_size = chunk_size
230
  self.use_distributed_muon = use_distributed_muon
231
- self.small_param_numel_threshold = small_param_numel_threshold
232
  self.expert_keys = expert_keys
 
 
 
 
 
233
 
234
  def _calc_flops(self, G, steps):
235
  assert len(G.shape) == 2
@@ -333,8 +351,8 @@ class Muon(torch.optim.Optimizer):
333
  if g is None:
334
  continue
335
 
336
- u = _zeropower_via_newtonschulz5(g.to(COMM_DTYPE),
337
- steps=group["ns_steps"])
338
 
339
  adjusted_lr = adjust_lr_for_muon(lr, p.shape)
340
  update_p(p, u, lr, adjusted_lr, weight_decay)
@@ -355,52 +373,269 @@ class Muon(torch.optim.Optimizer):
355
  weight_decay: float,
356
  qk_logits: list[torch.Tensor | DTensor] | None,
357
  ):
358
- """ Implementation of Distributed Muon by Liu et al. """
359
 
360
- # Momentum is already applied by _step_muon before this method.
361
- for n, p in zip(names, params):
362
- g = p.grad
363
- if g is None:
364
- continue
365
-
366
- # Gather G
367
- if isinstance(p.data, DTensor):
368
- g_full = g.full_tensor()
369
- p_full = p.data.full_tensor()
370
- else:
371
- g_full = g
372
- p_full = p
373
-
374
- u_full = _zeropower_via_newtonschulz5(g_full.to(COMM_DTYPE),
375
- steps=group["ns_steps"])
376
-
377
- adjusted_lr = adjust_lr_for_muon(lr, p_full.shape)
378
- update_p(p_full, u_full, lr, adjusted_lr, weight_decay)
379
 
380
- qk_clip_state = get_qk_clip_info(self.clip_config, n, qk_logits)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
381
 
382
- scales_full = compute_scales(
383
- p_full, qk_clip_state) if qk_clip_state is not None else None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
384
 
385
- if scales_full is not None:
386
- qk_clip(p_full, scales_full, qk_clip_state.head_dim)
 
 
387
 
388
- if isinstance(p.data, DTensor):
389
- ndims = len(p.device_mesh.mesh.shape)
390
- p_replicate = DTensor.from_local(
391
- p_full,
392
- device_mesh=p.device_mesh,
393
- placements=[Replicate() for _ in range(ndims)],
394
- )
395
 
396
- p_sharded = p_replicate.redistribute(
397
- device_mesh=p.device_mesh,
398
- placements=p.placements,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
399
  )
400
 
401
- p.copy_(p_sharded)
402
 
403
- def parallel(self, names, params, group, lr, weight_decay, qk_logits):
 
 
 
 
 
 
 
404
  """
405
  Perform a parallel optimization step using Muon.
406
 
@@ -409,31 +644,23 @@ class Muon(torch.optim.Optimizer):
409
  interleaves multiple chunks so that communication and computation
410
  overlap across chunks (the same overlap previously achieved by the
411
  warmup + main-loop index scheduling).
 
 
 
 
412
  """
413
 
414
  # Momentum is already applied by _step_muon before this method.
415
 
416
- param_to_state, ordered_params = self.init_state_and_assign_params(
417
- names, params, group, qk_logits)
418
-
419
- # Compute local rank for this group's shard process group.
420
- shard_pg = param_to_state[id(ordered_params[0])].process_group
421
- rank = dist.get_rank(group=shard_pg)
422
-
423
- if self.chunk_size == -1:
424
- shard_ranks = dist.get_world_size(param_to_state[id(
425
- ordered_params[0])].process_group)
426
- chunk_size = shard_ranks * DEFAULT_CHUNK_SIZE_RATIO
427
- elif self.chunk_size > 0:
428
- chunk_size = self.chunk_size
429
- else:
430
- raise ValueError("chunk_size must be -1 or a positive integer.")
431
 
432
  def pipelines():
 
433
  for start in range(0, len(ordered_params), chunk_size):
434
  chunk = ordered_params[start:start + chunk_size]
435
  if chunk:
436
- yield muon_chunk_pipeline(
437
  params=chunk,
438
  param_to_state=param_to_state,
439
  rank=rank,
@@ -442,9 +669,11 @@ class Muon(torch.optim.Optimizer):
442
  weight_decay=weight_decay,
443
  none_grad=group["none_grad"],
444
  )
 
 
 
 
445
 
446
- with record_function("muon::barrier"):
447
- dist.barrier()
448
  with record_function("muon::pipeline"):
449
  run_pipeline(pipelines(), max_concurrent=self.warmup_step + 1)
450
 
@@ -456,16 +685,152 @@ class Muon(torch.optim.Optimizer):
456
  names = group["names"]
457
 
458
  # Apply momentum to all params before routing/expansion.
 
459
  with record_function("muon::momentum"):
460
- for n, p in zip(names, params):
461
- g = p.grad
462
- if g is None:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
463
  continue
464
- g = update_g(self.state, p, g, group, momentum)
465
- p.grad = g
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
466
 
467
  # Expand expert params by splitting on dim 0.
468
- names, params = _expand_expert_params(names, params, self.expert_keys)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
469
 
470
  param_dtensors = []
471
  name_dtensors = []
@@ -473,10 +838,10 @@ class Muon(torch.optim.Optimizer):
473
  param_tensors = []
474
  name_tensors = []
475
 
476
- param_dtensors_small = []
477
- name_dtensors_small = []
478
-
479
  if self.use_distributed_muon:
 
480
  self.distributed_muon(names=names,
481
  params=params,
482
  group=group,
@@ -485,8 +850,6 @@ class Muon(torch.optim.Optimizer):
485
  qk_logits=qk_logits)
486
  return
487
 
488
- # For simplicity, we use distributed Muon for small parameters
489
- # whose number of elements is below a threshold.
490
  for n, p in zip(names, params):
491
  if p is None or p.grad is None:
492
  continue
@@ -494,23 +857,28 @@ class Muon(torch.optim.Optimizer):
494
  if all(
495
  isinstance(placement, Replicate)
496
  for placement in p.placements):
 
 
 
497
  param_tensors.append(p)
498
  name_tensors.append(n)
499
- elif p.data.numel() <= self.small_param_numel_threshold:
500
- param_dtensors_small.append(p)
501
- name_dtensors_small.append(n)
502
  else:
 
 
 
 
503
  param_dtensors.append(p)
504
  name_dtensors.append(n)
505
  elif isinstance(p.data, torch.Tensor):
 
 
506
  param_tensors.append(p)
507
  name_tensors.append(n)
508
  else:
509
  raise TypeError(f"Unsupported parameter type: {type(p.data)}")
510
 
511
- logger.debug(
512
- f"[Muon] {len(param_dtensors)} DTensors, {len(param_tensors)} Tensors, "
513
- f"{len(param_dtensors_small)} Small DTensors")
514
 
515
  def group_dtensors(dtensors, names):
516
  # To support different placements, we group parameters by placements
@@ -526,21 +894,6 @@ class Muon(torch.optim.Optimizer):
526
  p.device_mesh])][1].append(p)
527
  return placement_to_params
528
 
529
- if len(param_dtensors_small) > 0:
530
- if not dist.is_initialized():
531
- raise RuntimeError(
532
- "Parallel Muon requires torch.distributed to be initialized."
533
- )
534
-
535
- self.distributed_muon(
536
- params=param_dtensors_small,
537
- names=name_dtensors_small,
538
- group=group,
539
- lr=lr,
540
- weight_decay=weight_decay,
541
- qk_logits=qk_logits,
542
- )
543
-
544
  if len(param_dtensors) > 0:
545
  if not dist.is_initialized():
546
  raise RuntimeError(
@@ -548,7 +901,26 @@ class Muon(torch.optim.Optimizer):
548
  )
549
 
550
  dtensor_group = group_dtensors(param_dtensors, name_dtensors)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
551
  for _, (names, params) in dtensor_group.items():
 
 
552
  self.parallel(
553
  names,
554
  params,
@@ -556,7 +928,10 @@ class Muon(torch.optim.Optimizer):
556
  lr=lr,
557
  weight_decay=weight_decay,
558
  qk_logits=qk_logits,
 
559
  )
 
 
560
 
561
  if len(param_tensors) > 0:
562
  self.base(
@@ -568,6 +943,33 @@ class Muon(torch.optim.Optimizer):
568
  qk_logits=qk_logits,
569
  )
570
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
571
  @torch.no_grad
572
  def step(self, closure=None, qk_logits=None):
573
  """Perform a single optimization step.
@@ -585,10 +987,82 @@ class Muon(torch.optim.Optimizer):
585
  with torch.enable_grad():
586
  loss = closure()
587
 
588
- for group in self.param_groups:
 
 
 
 
 
 
 
589
  if group["use_muon"]:
 
 
590
  self._step_muon(group, qk_logits=qk_logits)
591
  else:
 
 
 
592
  step_adamw(self.state, group)
593
 
 
 
 
 
 
 
 
594
  return loss
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
  from .adamw import step_adamw
12
  from .async_utils import run_pipeline
13
+ from .core import (_muon_state, adjust_lr_for_muon, batch_pre_ortho,
14
+ get_default_muon_param_groups, is_expert_param, update_p)
15
+ from .cpu_offload import CPUOffloadPool
16
  from .distributed.utils import (_is_shard, construct_shard_mesh,
17
  get_slices_of_dtensor)
18
  from .newton_schulz import (COMM_DTYPE, DEFAULT_CHUNK_SIZE_RATIO,
19
+ _zeropower_via_newtonschulz5,
20
+ zeropower_via_newtonschulz5,
21
+ zeropower_via_newtonschulz5_batched)
22
+ from .pipeline import muon_chunk_pipeline, prelaunch_first_gather
23
  from .qk_clip import compute_scales, get_qk_clip_info, qk_clip
24
 
25
  logger = logging.getLogger(__name__)
 
48
  expanded_params = []
49
 
50
  for n, p in zip(names, params):
51
+ is_expert = is_expert_param(n, expert_keys)
52
  is_dtensor = isinstance(p.data, DTensor)
53
 
54
+ if is_expert:
55
+ if is_dtensor:
56
+ logger.debug(
57
+ "[expand_expert] %s: expert DTensor, shape=%s, "
58
+ "placements=%s, mesh=%s, local_shape=%s", n, p.shape,
59
+ p.placements, p.device_mesh.mesh_dim_names,
60
+ p.to_local().shape)
61
+ else:
62
+ logger.debug(
63
+ "[expand_expert] %s: expert plain tensor, shape=%s", n,
64
+ p.data.shape)
65
+
66
  if not is_expert:
67
  assert p.data.ndim <= 2, (
68
  f"Param {n} has ndim={p.data.ndim} but does not match "
 
183
  Use shard ranks * DEFAULT_CHUNK_SIZE_RATIO when -1 is specified.
184
  use_distributed_muon: Use distributed muon by Liu et al. (2024).
185
  For testing purpose only.
 
186
  expert_keys: List of strings to identify expert-parallel parameters.
187
  If any key appears in a parameter's name, its outermost
188
  dimension is treated as the expert dimension and expanded
 
207
  warmup_step=5,
208
  chunk_size=-1,
209
  use_distributed_muon=False,
210
+ expert_keys=None,
211
+ cpu_offload=False):
212
  defaults = dict(
213
  lr=lr,
214
  weight_decay=weight_decay,
 
242
  self.warmup_step = warmup_step
243
  self.chunk_size = chunk_size
244
  self.use_distributed_muon = use_distributed_muon
 
245
  self.expert_keys = expert_keys
246
+ self.cpu_offload = cpu_offload
247
+ self._cpu_offload_pool = CPUOffloadPool() if cpu_offload else None
248
+ self._offload_initialized = False
249
+ self._parallel_cache: dict[tuple[str, ...], dict] = {}
250
+ self._expert_expand_cache: dict[tuple[int, ...], dict] = {}
251
 
252
  def _calc_flops(self, G, steps):
253
  assert len(G.shape) == 2
 
351
  if g is None:
352
  continue
353
 
354
+ u = zeropower_via_newtonschulz5(g.to(COMM_DTYPE),
355
+ steps=group["ns_steps"])
356
 
357
  adjusted_lr = adjust_lr_for_muon(lr, p.shape)
358
  update_p(p, u, lr, adjusted_lr, weight_decay)
 
373
  weight_decay: float,
374
  qk_logits: list[torch.Tensor | DTensor] | None,
375
  ):
376
+ """Batched Distributed Muon for testing/correctness verification only.
377
 
378
+ Uses all-gather to reconstruct full tensors, computes Newton-Schulz on
379
+ the full grad, then slices back to local shards. This is simpler but
380
+ slower than the parallel pipeline (all2all) path, so it serves as a
381
+ reference implementation for verifying correctness.
382
+ """
383
+ with record_function("distributed_muon"):
384
+ # Momentum is already applied by _step_muon before this method.
385
+ ns_steps = group["ns_steps"]
 
 
 
 
 
 
 
 
 
 
 
386
 
387
+ # Separate plain tensors (no communication) from DTensors.
388
+ plain_names, plain_params = [], []
389
+ dtensor_names, dtensor_params = [], []
390
+ for n, p in zip(names, params):
391
+ if p.grad is None:
392
+ continue
393
+ if isinstance(p.data, DTensor):
394
+ dtensor_names.append(n)
395
+ dtensor_params.append(p)
396
+ else:
397
+ plain_names.append(n)
398
+ plain_params.append(p)
399
+
400
+ # Process plain tensors per-param (no communication).
401
+ for n, p in zip(plain_names, plain_params):
402
+ u = _zeropower_via_newtonschulz5(p.grad.to(COMM_DTYPE),
403
+ steps=ns_steps)
404
+ adjusted_lr = adjust_lr_for_muon(lr, p.shape)
405
+ update_p(p, u, lr, adjusted_lr, weight_decay)
406
+
407
+ qk_clip_state = get_qk_clip_info(self.clip_config, n,
408
+ qk_logits)
409
+ scales_full = compute_scales(
410
+ p, qk_clip_state) if qk_clip_state is not None else None
411
+ if scales_full is not None:
412
+ qk_clip(p, scales_full, qk_clip_state.head_dim)
413
+
414
+ if not dtensor_params:
415
+ return
416
+
417
+ # Group DTensors by (placements, mesh) for batched all-gather.
418
+ placement_groups: dict[tuple,
419
+ tuple[list,
420
+ list]] = defaultdict(lambda: ([], []))
421
+ for n, p in zip(dtensor_names, dtensor_params):
422
+ key = (p.placements, p.device_mesh)
423
+ placement_groups[key][0].append(n)
424
+ placement_groups[key][1].append(p)
425
+
426
+ logger.info(
427
+ "distributed_muon: %d placement groups, %d total dtensors",
428
+ len(placement_groups), len(dtensor_params))
429
+
430
+ for (placements, mesh), (grp_names,
431
+ grp_params) in placement_groups.items():
432
+ shard_mesh, shard_pg, shard_placements = construct_shard_mesh(
433
+ placements, mesh)
434
+ rank = dist.get_rank(shard_pg)
435
+ world_size = dist.get_world_size(shard_pg)
436
+
437
+ logger.info(" group: %d params, placements=%s, world_size=%d",
438
+ len(grp_params), placements, world_size)
439
+
440
+ # Separate params that can be batched (all shard dims evenly
441
+ # divisible) from those needing per-param full_tensor
442
+ # (e.g. MoE gate weights with fewer rows than shard ranks).
443
+ # all_gather_into_tensor requires equal buffer sizes across
444
+ # ranks, so uneven splits must use DTensor full_tensor().
445
+ batch_names, batch_params = [], []
446
+ single_names, single_params = [], []
447
+ for n, p in zip(grp_names, grp_params):
448
+ even = all(p.shape[pl.dim] %
449
+ shard_mesh.mesh.shape[dim_idx] == 0
450
+ for dim_idx, pl in enumerate(shard_placements))
451
+ if even:
452
+ batch_names.append(n)
453
+ batch_params.append(p)
454
+ else:
455
+ single_names.append(n)
456
+ single_params.append(p)
457
+
458
+ # Process uneven-split params per-param via full_tensor().
459
+ for n, p in zip(single_names, single_params):
460
+ with record_function("distributed_muon::newton_schulz"):
461
+ g_full = p.grad.full_tensor().to(COMM_DTYPE)
462
+ u_full = _zeropower_via_newtonschulz5(g_full,
463
+ steps=ns_steps)
464
+ del g_full
465
+ with record_function("distributed_muon::update"):
466
+ adjusted_lr = adjust_lr_for_muon(lr, p.shape)
467
+ p._local_tensor.mul_(1 - lr * weight_decay)
468
+ local_indices = get_slices_of_dtensor(
469
+ p, rank, shard_mesh, shard_placements)
470
+ u_local = u_full[local_indices]
471
+ p._local_tensor.add_(u_local, alpha=-adjusted_lr)
472
+ del u_full
473
+
474
+ qk_clip_state = get_qk_clip_info(
475
+ self.clip_config, n, qk_logits)
476
+ scales_full = compute_scales(
477
+ p, qk_clip_state
478
+ ) if qk_clip_state is not None else None
479
+ if scales_full is not None:
480
+ ratio = p.shape[0] // scales_full.shape[0]
481
+ idx0 = local_indices[0]
482
+ if isinstance(idx0, slice):
483
+ start = idx0.start or 0
484
+ idx0 = torch.arange(start,
485
+ idx0.stop,
486
+ device=scales_full.device)
487
+ row_scales = scales_full[idx0 // ratio]
488
+ p._local_tensor.mul_(row_scales.view(-1, 1))
489
+
490
+ if not batch_params:
491
+ continue
492
 
493
+ logger.info(" batched=%d, single=%d", len(batch_params),
494
+ len(single_params))
495
+
496
+ # Concat all local grad shards into a single flat buffer.
497
+ with record_function("distributed_muon::gather"):
498
+ grad_locals = [
499
+ p.grad.to_local().to(COMM_DTYPE).flatten()
500
+ for p in batch_params
501
+ ]
502
+ numels = [g.numel() for g in grad_locals]
503
+ grad_concat = torch.cat(grad_locals)
504
+ del grad_locals
505
+
506
+ # Single all-gather (replaces N separate full_tensor).
507
+ grad_gathered = torch.empty(
508
+ grad_concat.numel() * world_size,
509
+ dtype=COMM_DTYPE,
510
+ device="cuda",
511
+ )
512
+ dist.all_gather_into_tensor(grad_gathered,
513
+ grad_concat,
514
+ group=shard_pg)
515
+
516
+ total_numel = grad_concat.numel()
517
+ del grad_concat
518
+
519
+ # Precompute per-param offsets within the concat buffer.
520
+ offsets = []
521
+ off = 0
522
+ for ne in numels:
523
+ offsets.append(off)
524
+ off += ne
525
+
526
+ # Per-param: reconstruct full grad → NS → local update.
527
+ for i, (n, p) in enumerate(zip(batch_names, batch_params)):
528
+ with record_function("distributed_muon::newton_schulz"):
529
+ g_full = torch.empty(p.shape,
530
+ dtype=COMM_DTYPE,
531
+ device="cuda")
532
+ for r in range(world_size):
533
+ r_start = r * total_numel + offsets[i]
534
+ shard = grad_gathered[r_start:r_start + numels[i]]
535
+ indices = get_slices_of_dtensor(
536
+ p, r, shard_mesh, shard_placements)
537
+ g_full[indices] = shard.reshape(
538
+ g_full[indices].shape)
539
+
540
+ u_full = _zeropower_via_newtonschulz5(g_full,
541
+ steps=ns_steps)
542
+ del g_full
543
+
544
+ with record_function("distributed_muon::update"):
545
+ adjusted_lr = adjust_lr_for_muon(lr, p.shape)
546
+ p._local_tensor.mul_(1 - lr * weight_decay)
547
+ local_indices = get_slices_of_dtensor(
548
+ p, rank, shard_mesh, shard_placements)
549
+ u_local = u_full[local_indices]
550
+ p._local_tensor.add_(u_local, alpha=-adjusted_lr)
551
+ del u_full
552
+
553
+ qk_clip_state = get_qk_clip_info(
554
+ self.clip_config, n, qk_logits)
555
+ scales_full = compute_scales(
556
+ p, qk_clip_state
557
+ ) if qk_clip_state is not None else None
558
+ if scales_full is not None:
559
+ ratio = p.shape[0] // scales_full.shape[0]
560
+ idx0 = local_indices[0]
561
+ if isinstance(idx0, slice):
562
+ start = idx0.start or 0
563
+ idx0 = torch.arange(start,
564
+ idx0.stop,
565
+ device=scales_full.device)
566
+ row_scales = scales_full[idx0 // ratio]
567
+ p._local_tensor.mul_(row_scales.view(-1, 1))
568
+
569
+ def _setup_parallel(self, names, params, group, qk_logits):
570
+ """Compute (or retrieve cached) parallel pipeline metadata.
571
+
572
+ Returns:
573
+ (ordered_params, param_to_state, rank, chunk_size)
574
+ """
575
+ cache_key = tuple(names)
576
 
577
+ if cache_key not in self._parallel_cache:
578
+ # First call: compute metadata and populate cache.
579
+ param_to_state, ordered_params = self.init_state_and_assign_params(
580
+ names, params, group, qk_logits)
581
 
582
+ shard_pg = param_to_state[id(ordered_params[0])].process_group
583
+ rank = dist.get_rank(group=shard_pg)
 
 
 
 
 
584
 
585
+ if self.chunk_size == -1:
586
+ shard_ranks = dist.get_world_size(shard_pg)
587
+ chunk_size = shard_ranks * DEFAULT_CHUNK_SIZE_RATIO
588
+ elif self.chunk_size > 0:
589
+ chunk_size = self.chunk_size
590
+ else:
591
+ raise ValueError(
592
+ "chunk_size must be -1 or a positive integer.")
593
+
594
+ ordered_names = [
595
+ param_to_state[id(p)].name for p in ordered_params
596
+ ]
597
+ name_to_state = {
598
+ param_to_state[id(p)].name: param_to_state[id(p)]
599
+ for p in ordered_params
600
+ }
601
+ self._parallel_cache[cache_key] = {
602
+ 'ordered_names': ordered_names,
603
+ 'name_to_state': name_to_state,
604
+ 'rank': rank,
605
+ 'chunk_size': chunk_size,
606
+ }
607
+ else:
608
+ # Cached path: rebuild param_to_state with current id(p) keys.
609
+ cache = self._parallel_cache[cache_key]
610
+ rank = cache['rank']
611
+ chunk_size = cache['chunk_size']
612
+
613
+ name_to_param = dict(zip(names, params))
614
+ ordered_params = [name_to_param[n] for n in cache['ordered_names']]
615
+
616
+ param_to_state = {}
617
+ for p, n in zip(ordered_params, cache['ordered_names']):
618
+ cached_state = cache['name_to_state'][n]
619
+ param_to_state[id(p)] = _muon_state(
620
+ worker_rank=cached_state.worker_rank,
621
+ process_group=cached_state.process_group,
622
+ rank_indices=cached_state.rank_indices,
623
+ rank_numels=cached_state.rank_numels,
624
+ name=n,
625
+ qk_clip_state=get_qk_clip_info(self.clip_config, n,
626
+ qk_logits),
627
  )
628
 
629
+ return ordered_params, param_to_state, rank, chunk_size
630
 
631
+ def parallel(self,
632
+ names,
633
+ params,
634
+ group,
635
+ lr,
636
+ weight_decay,
637
+ qk_logits,
638
+ prelaunch_gather=None):
639
  """
640
  Perform a parallel optimization step using Muon.
641
 
 
644
  interleaves multiple chunks so that communication and computation
645
  overlap across chunks (the same overlap previously achieved by the
646
  warmup + main-loop index scheduling).
647
+
648
+ If ``prelaunch_gather`` is provided, it is passed to the first
649
+ chunk's generator to skip re-launching the already in-flight
650
+ A2A gather.
651
  """
652
 
653
  # Momentum is already applied by _step_muon before this method.
654
 
655
+ ordered_params, param_to_state, rank, chunk_size = (
656
+ self._setup_parallel(names, params, group, qk_logits))
 
 
 
 
 
 
 
 
 
 
 
 
 
657
 
658
  def pipelines():
659
+ first = True
660
  for start in range(0, len(ordered_params), chunk_size):
661
  chunk = ordered_params[start:start + chunk_size]
662
  if chunk:
663
+ kwargs = dict(
664
  params=chunk,
665
  param_to_state=param_to_state,
666
  rank=rank,
 
669
  weight_decay=weight_decay,
670
  none_grad=group["none_grad"],
671
  )
672
+ if first and prelaunch_gather is not None:
673
+ kwargs['prelaunch_gather'] = prelaunch_gather
674
+ first = False
675
+ yield muon_chunk_pipeline(**kwargs)
676
 
 
 
677
  with record_function("muon::pipeline"):
678
  run_pipeline(pipelines(), max_concurrent=self.warmup_step + 1)
679
 
 
685
  names = group["names"]
686
 
687
  # Apply momentum to all params before routing/expansion.
688
+ # Batched using _foreach_* ops (compiled, fullgraph=True).
689
  with record_function("muon::momentum"):
690
+ active_params = [p for p in params if p.grad is not None]
691
+ if active_params:
692
+ # Ensure momentum buffers exist (avoid zeros_like when already present).
693
+ for p in active_params:
694
+ if "momentum_buffer" not in self.state[p]:
695
+ self.state[p]["momentum_buffer"] = torch.zeros_like(
696
+ p.grad)
697
+
698
+ # Extract local tensors for compiled batch function.
699
+ local_grads = [
700
+ p.grad._local_tensor
701
+ if isinstance(p.grad, DTensor) else p.grad
702
+ for p in active_params
703
+ ]
704
+ local_bufs = [
705
+ self.state[p]["momentum_buffer"]._local_tensor
706
+ if isinstance(self.state[p]["momentum_buffer"], DTensor)
707
+ else self.state[p]["momentum_buffer"]
708
+ for p in active_params
709
+ ]
710
+
711
+ # Wrap momentum as tensor for torch.compile.
712
+ batch_pre_ortho(local_grads, local_bufs,
713
+ torch.tensor(momentum), group["nesterov"])
714
+
715
+ # For non-nesterov, the result is the momentum buffer.
716
+ if not group["nesterov"]:
717
+ for p in active_params:
718
+ p.grad = self.state[p]["momentum_buffer"]
719
+
720
+ # Identify batched experts for deferred NS.
721
+ # Detection is cheap (condition checks only); actual NS compute is
722
+ # deferred so it can overlap with the first chunk's A2A gather.
723
+ deferred_expert_work = []
724
+ if self.expert_keys:
725
+ batched_expert_indices = []
726
+ for i, (n, p) in enumerate(zip(names, params)):
727
+ if not (is_expert_param(n, self.expert_keys)
728
+ and p.grad is not None):
729
  continue
730
+ # Eligible: plain tensor, or DTensor with no non-dim-0 shards.
731
+ if isinstance(p.data, DTensor):
732
+ has_tp = any(
733
+ _is_shard(pl) and pl.dim != 0 for pl in p.placements)
734
+ if has_tp:
735
+ continue
736
+ batched_expert_indices.append(i)
737
+
738
+ if batched_expert_indices:
739
+ # Save refs for deferred NS; free grads from param list.
740
+ for i in batched_expert_indices:
741
+ p = params[i]
742
+ g = p.grad
743
+ local_g = (g._local_tensor
744
+ if isinstance(g, DTensor) else g)
745
+ local_data = (p.data._local_tensor if isinstance(
746
+ p.data, DTensor) else p.data)
747
+ deferred_expert_work.append((local_data, local_g))
748
+ p.grad = None
749
+
750
+ # Remove batched experts from lists before expansion.
751
+ keep = sorted(
752
+ set(range(len(params))) - set(batched_expert_indices))
753
+ names = [names[i] for i in keep]
754
+ params = [params[i] for i in keep]
755
+
756
+ def _run_deferred_expert_ns():
757
+ """Execute deferred batched expert NS."""
758
+ if not deferred_expert_work:
759
+ return
760
+ with record_function("muon::batched_expert_ns"):
761
+ ns_steps = group["ns_steps"]
762
+ for local_data, local_g in deferred_expert_work:
763
+ u = zeropower_via_newtonschulz5_batched(
764
+ local_g.to(COMM_DTYPE), steps=ns_steps)
765
+ adjusted_lr = adjust_lr_for_muon(lr, local_g.shape[1:])
766
+ local_data.mul_(1 - lr * weight_decay)
767
+ local_data.add_(u, alpha=-adjusted_lr)
768
 
769
  # Expand expert params by splitting on dim 0.
770
+ logger.debug("[_step_muon] before expand: %d params, expert_keys=%s",
771
+ len(params), self.expert_keys)
772
+ if self.expert_keys:
773
+ cache_key = tuple(id(p) for p in params)
774
+ cache = self._expert_expand_cache.get(cache_key)
775
+
776
+ if cache is None:
777
+ # Cold path: full expansion + build cache metadata.
778
+ exp_names, exp_params = _expand_expert_params(
779
+ names, params, self.expert_keys)
780
+
781
+ # Build per-expert-group info for hot-path grad updates.
782
+ grad_info = []
783
+ exp_idx = 0
784
+ for orig_idx, (n, p) in enumerate(zip(names, params)):
785
+ if not is_expert_param(n, self.expert_keys):
786
+ exp_idx += 1
787
+ continue
788
+
789
+ is_dt = isinstance(p.data, DTensor)
790
+ num_experts = (p.to_local() if is_dt else p.data).shape[0]
791
+
792
+ # Detect TP mesh from the first expanded expert param.
793
+ tp_mesh = None
794
+ tp_pls = None
795
+ sample = exp_params[exp_idx]
796
+ if isinstance(sample.data, DTensor):
797
+ tp_mesh = sample.data.device_mesh
798
+ tp_pls = list(sample.data.placements)
799
+
800
+ grad_info.append((orig_idx, num_experts, exp_idx, is_dt,
801
+ tp_mesh, tp_pls))
802
+ exp_idx += num_experts
803
+
804
+ self._expert_expand_cache[cache_key] = {
805
+ 'names': exp_names,
806
+ 'params': exp_params,
807
+ 'grad_info': grad_info,
808
+ }
809
+ names, params = exp_names, exp_params
810
+ else:
811
+ # Hot path: reuse cached params, only update expert grads.
812
+ for (orig_idx, num_experts, exp_start, is_dt, tp_mesh,
813
+ tp_pls) in cache['grad_info']:
814
+ p = params[orig_idx]
815
+ g = p.grad
816
+ local_grad = (g.to_local()
817
+ if is_dt and isinstance(g, DTensor) else g)
818
+ for i in range(num_experts):
819
+ expert_p = cache['params'][exp_start + i]
820
+ sg = local_grad[i]
821
+ if tp_mesh is not None:
822
+ expert_p.grad = DTensor.from_local(
823
+ sg, device_mesh=tp_mesh, placements=tp_pls)
824
+ else:
825
+ expert_p.grad = sg
826
+ p.grad = None
827
+
828
+ names = cache['names']
829
+ params = cache['params']
830
+ else:
831
+ names, params = _expand_expert_params(names, params,
832
+ self.expert_keys)
833
+ logger.debug("[_step_muon] after expand: %d params", len(params))
834
 
835
  param_dtensors = []
836
  name_dtensors = []
 
838
  param_tensors = []
839
  name_tensors = []
840
 
841
+ # distributed_muon is a reference implementation for testing only.
842
+ # The parallel pipeline (all2all) path below is the production path.
 
843
  if self.use_distributed_muon:
844
+ _run_deferred_expert_ns()
845
  self.distributed_muon(names=names,
846
  params=params,
847
  group=group,
 
850
  qk_logits=qk_logits)
851
  return
852
 
 
 
853
  for n, p in zip(names, params):
854
  if p is None or p.grad is None:
855
  continue
 
857
  if all(
858
  isinstance(placement, Replicate)
859
  for placement in p.placements):
860
+ logger.debug(
861
+ "[route] %s → base (DTensor all-Replicate), "
862
+ "shape=%s, placements=%s", n, p.shape, p.placements)
863
  param_tensors.append(p)
864
  name_tensors.append(n)
 
 
 
865
  else:
866
+ logger.debug(
867
+ "[route] %s → parallel (DTensor), shape=%s, "
868
+ "placements=%s, mesh=%s", n, p.shape, p.placements,
869
+ p.device_mesh.mesh_dim_names)
870
  param_dtensors.append(p)
871
  name_dtensors.append(n)
872
  elif isinstance(p.data, torch.Tensor):
873
+ logger.debug("[route] %s → base (plain tensor), shape=%s", n,
874
+ p.data.shape)
875
  param_tensors.append(p)
876
  name_tensors.append(n)
877
  else:
878
  raise TypeError(f"Unsupported parameter type: {type(p.data)}")
879
 
880
+ logger.debug(f"[Muon] {len(param_dtensors)} DTensors → parallel, "
881
+ f"{len(param_tensors)} Tensors → base")
 
882
 
883
  def group_dtensors(dtensors, names):
884
  # To support different placements, we group parameters by placements
 
894
  p.device_mesh])][1].append(p)
895
  return placement_to_params
896
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
897
  if len(param_dtensors) > 0:
898
  if not dist.is_initialized():
899
  raise RuntimeError(
 
901
  )
902
 
903
  dtensor_group = group_dtensors(param_dtensors, name_dtensors)
904
+
905
+ # Pre-launch the first chunk's A2A gather so that the NCCL
906
+ # communication overlaps with the (deferred) batched expert NS
907
+ # compute on the default CUDA stream.
908
+ prelaunch = None
909
+ if deferred_expert_work:
910
+ first_names, first_params = next(iter(dtensor_group.values()))
911
+ ordered, pts, rnk, csz = self._setup_parallel(
912
+ first_names, first_params, group, qk_logits)
913
+ first_chunk = ordered[:csz]
914
+ if first_chunk:
915
+ prelaunch = prelaunch_first_gather(first_chunk, pts, rnk,
916
+ group["none_grad"])
917
+
918
+ _run_deferred_expert_ns()
919
+
920
+ first_group = True
921
  for _, (names, params) in dtensor_group.items():
922
+ pg = prelaunch if first_group else None
923
+ first_group = False
924
  self.parallel(
925
  names,
926
  params,
 
928
  lr=lr,
929
  weight_decay=weight_decay,
930
  qk_logits=qk_logits,
931
+ prelaunch_gather=pg,
932
  )
933
+ else:
934
+ _run_deferred_expert_ns()
935
 
936
  if len(param_tensors) > 0:
937
  self.base(
 
943
  qk_logits=qk_logits,
944
  )
945
 
946
def _register_states_for_offload(self):
    """Register all optimizer state tensors with the CPU offload pool.

    Called once after the first step, when the per-param states have been
    lazily created. Registers every state tensor (the momentum buffer for
    Muon params; moment1/moment2 for AdamW params) with the pool so they
    can be offloaded to CPU between steps, freeing GPU memory.

    Fix vs. original: ``tracked`` is only incremented when at least one
    state tensor was actually registered for the param, so the log line
    reports the true number of offloaded param states (the original
    counted every non-Muon param even when it had no moment tensors).
    """
    pool = self._cpu_offload_pool
    tracked = 0
    for group in self.param_groups:
        for p in group["params"]:
            if p not in self.state:
                continue
            state = self.state[p]
            if group.get("use_muon", False):
                # Muon keeps a single momentum buffer per param.
                if "momentum_buffer" in state:
                    pool.track(state["momentum_buffer"])
                    tracked += 1
            else:
                # AdamW keeps first/second moment estimates.
                registered = False
                if "moment1" in state:
                    pool.track(state["moment1"])
                    registered = True
                if "moment2" in state:
                    pool.track(state["moment2"])
                    registered = True
                if registered:
                    tracked += 1
    logger.info("[CPUOffload] Registered %d param states for offload",
                tracked)
+
973
  @torch.no_grad
974
  def step(self, closure=None, qk_logits=None):
975
  """Perform a single optimization step.
 
987
  with torch.enable_grad():
988
  loss = closure()
989
 
990
+ # H2D: reload optimizer states from CPU before computation.
991
+ if self.cpu_offload and self._offload_initialized:
992
+ self._cpu_offload_pool.reload()
993
+
994
+ logger.debug("[Muon.step] expert_keys=%s, %d param groups",
995
+ self.expert_keys, len(self.param_groups))
996
+
997
+ for i, group in enumerate(self.param_groups):
998
  if group["use_muon"]:
999
+ logger.debug("[Muon.step] group %d: use_muon=True, %d params",
1000
+ i, len(group["params"]))
1001
  self._step_muon(group, qk_logits=qk_logits)
1002
  else:
1003
+ logger.debug(
1004
+ "[Muon.step] group %d: use_muon=False (AdamW), %d params",
1005
+ i, len(group["params"]))
1006
  step_adamw(self.state, group)
1007
 
1008
+ # D2H: offload optimizer states to CPU after computation.
1009
+ if self.cpu_offload:
1010
+ if not self._offload_initialized:
1011
+ self._register_states_for_offload()
1012
+ self._offload_initialized = True
1013
+ self._cpu_offload_pool.offload()
1014
+
1015
  return loss
1016
+
1017
+ # ------------------------------------------------------------------
1018
+ # Checkpoint support for cpu_offload
1019
+ # ------------------------------------------------------------------
1020
+
1021
def state_dict(self) -> dict:
    """Snapshot the optimizer state, transparently handling CPU offload.

    With ``cpu_offload`` enabled and initialized, state tensors have had
    their GPU storage freed (``resize_(0)``) between steps. We reload
    them, take the snapshot, deep-copy every tensor entry so the snapshot
    survives the subsequent re-offload (which frees the originals' GPU
    storage again), then offload so the optimizer returns to its expected
    post-step state.
    """
    offloaded = self.cpu_offload and self._offload_initialized
    if offloaded:
        self._cpu_offload_pool.reload()
        torch.cuda.current_stream().synchronize()

    sd = super().state_dict()

    if offloaded:
        # Clone tensor entries: re-offloading below frees the GPU
        # storage of the originals via resize_(0).
        for key, entry in sd["state"].items():
            cloned = {}
            for sk, sv in entry.items():
                cloned[sk] = sv.clone() if isinstance(sv,
                                                      torch.Tensor) else sv
            sd["state"][key] = cloned
        self._cpu_offload_pool.offload()

    return sd
1045
+
1046
def load_state_dict(self, state_dict: dict) -> None:
    """Restore the optimizer state, then re-establish CPU offload.

    When states were previously offloaded, reload them first so each
    tensor has real storage for ``super().load_state_dict()`` to copy
    into. Afterwards, rebuild the offload pool around the (possibly
    brand-new) state tensors and offload them, restoring the post-step
    invariant that GPU storage is freed between steps.
    """
    was_offloaded = self.cpu_offload and self._offload_initialized
    if was_offloaded:
        # Storage was freed via resize_(0); restore it so the parent
        # class can overwrite the values in place.
        self._cpu_offload_pool.reload()
        torch.cuda.current_stream().synchronize()

    super().load_state_dict(state_dict)

    if self.cpu_offload:
        # State tensors may be new objects after loading, so the old
        # pool's tracked references are stale: start from a fresh pool.
        self._cpu_offload_pool = CPUOffloadPool()
        self._offload_initialized = False
        self._register_states_for_offload()
        self._offload_initialized = True
        self._cpu_offload_pool.offload()
build/torch210-cxx11-rocm70-x86_64-linux/newton_schulz.py CHANGED
@@ -1,3 +1,7 @@
 
 
 
 
1
  import torch
2
 
3
  from .matmul_transpose_triton import matmul_transpose_assign
@@ -6,21 +10,134 @@ COMM_DTYPE = torch.bfloat16
6
  DEFAULT_CHUNK_SIZE_RATIO = 4
7
 
8
 
9
- # This code snippet is a modified version adapted from the following GitHub repositories:
10
- # https://github.com/KellerJordan/Muon/blob/master/muon.py
11
- # Muon's Newton–Schulz iteration causes high variance in singular values
12
- # Idea: give each iteration its own 3 coefficients and optimize them via gradient descent.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  @torch.no_grad()
14
- # matmul_transpose_assign from : https://github.com/nil0x9/flash-muon
15
  def _zeropower_via_newtonschulz5(G, steps):
16
  """
17
- Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
18
- quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
19
- of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
20
- zero even beyond the point where the iteration no longer converges all the way to one everywhere
21
- on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
22
- where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
23
- performance at all relative to UV^T, where USV^T = G is the SVD.
 
 
 
 
 
 
 
24
  """
25
  assert len(G.shape) == 2
26
  assert G.dtype == COMM_DTYPE
@@ -28,18 +145,14 @@ def _zeropower_via_newtonschulz5(G, steps):
28
 
29
  if G.size(0) > G.size(1):
30
  X = X.T
31
- # Ensure spectral norm is at most 1
32
  X = X / (X.norm() + 1e-7)
 
 
33
  buf1 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
34
  buf2 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
35
  # Perform the NS iterations
36
- for a, b, c in [
37
- (4.0848, -6.8946, 2.9270),
38
- (3.9505, -6.3029, 2.6377),
39
- (3.7418, -5.5913, 2.3037),
40
- (2.8769, -3.1427, 1.2046),
41
- (2.8366, -3.0525, 1.2012),
42
- ]:
43
  matmul_transpose_assign(X, buf1)
44
  matmul_transpose_assign(buf1, buf2)
45
  buf1.mul_(b).add_(buf2, alpha=c)
@@ -47,4 +160,77 @@ def _zeropower_via_newtonschulz5(G, steps):
47
 
48
  if G.size(0) > G.size(1):
49
  X = X.T
 
50
  return X
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from itertools import repeat
2
+ from math import inf, sqrt
3
+
4
+ import numpy as np
5
  import torch
6
 
7
  from .matmul_transpose_triton import matmul_transpose_assign
 
10
  DEFAULT_CHUNK_SIZE_RATIO = 4
11
 
12
 
13
+ def _optimal_quintic(l, u, max_iter=1000):
14
+ """
15
+ Use the simplified Remez algorithm to find the optimal odd quintic approximant
16
+ to the constant function x -> 1 over the interval [l, u].
17
+
18
+ Returns (a, b, c) for p(x) = ax + bx^3 + cx^5 that minimizes the maximum
19
+ approximation error max_{x in [l,u]} |p(x) - 1|. Iterates by updating the
20
+ two interior equioscillation nodes q, r until convergence. Returns the
21
+ closed-form equioscillating solution when l ≈ u.
22
+
23
+ Raises ValueError if any intermediate value (a, b, c, E, q, r) is non-finite
24
+ (NaN or inf). Raises RuntimeError if convergence is not reached within
25
+ max_iter iterations.
26
+ """
27
+ assert 0 <= l <= u
28
+ if 1 - 5e-6 <= l / u:
29
+ return (15 / 8) / u, (-10 / 8) / (u**3), (3 / 8) / (u**5)
30
+ q = (3 * l + u) / 4
31
+ r = (l + 3 * u) / 4
32
+ E = inf
33
+ for _ in range(max_iter):
34
+ old_E = E
35
+ LHS = np.array([
36
+ [l, l**3, l**5, 1],
37
+ [q, q**3, q**5, -1],
38
+ [r, r**3, r**5, 1],
39
+ [u, u**3, u**5, -1],
40
+ ])
41
+ a, b, c, E = np.linalg.solve(LHS, np.ones(4))
42
+ if not np.all(np.isfinite([a, b, c, E])):
43
+ raise ValueError(f"_optimal_quintic: non-finite solve result "
44
+ f"a={a}, b={b}, c={c}, E={E}")
45
+ q, r = np.sqrt(
46
+ (-3 * b + np.array([-1, 1]) * sqrt(9 * b**2 - 20 * a * c)) /
47
+ (10 * c))
48
+ if not np.all(np.isfinite([q, r])):
49
+ raise ValueError(
50
+ f"_optimal_quintic: non-finite node update q={q}, r={r}")
51
+ if abs(old_E - E) <= 1e-15:
52
+ break
53
+ else:
54
+ raise RuntimeError(
55
+ f"_optimal_quintic: did not converge after {max_iter} iterations")
56
+ return float(a), float(b), float(c)
57
+
58
+
59
def _optimal_composition(l, num_iters, safety_factor_eps=0, cushion=0):
    """Compute the Polar Express coefficient series for `num_iters` quintic iterations.

    Builds a sequence of per-step optimal odd quintic coefficients (a, b, c)
    that compose to map singular values from [l, 1] toward 1. At each step:
      1. Solves `_optimal_quintic` on [max(l, cushion*u), u]. The `cushion`
         prevents near-zero singular values from stalling by raising the
         effective lower bound; if it is active (cushion*u > l), the
         coefficients are rescaled so that p(l) and p(u) are centered
         around 1 w.r.t. the true [l, u].
      2. Deflates the coefficients by (1 + safety_factor_eps)^degree for all
         but the last iteration, providing numerical headroom at the cost of
         a slightly slower final convergence step.
      3. Advances the interval: l <- p(l), u <- 2 - p(l) (by symmetry of p
         around 1).

    Returns a list of (a, b, c) tuples, one per iteration.

    Reference: Amsel et al., "The Polar Express: Optimal Matrix Sign Methods
    and Their Application to the Muon Algorithm",
    https://arxiv.org/abs/2505.16932
    """
    u = 1
    assert 0 <= l <= u
    safety_factor = 1 + safety_factor_eps
    coefficients = []
    # `step_idx` (was `iter`): avoid shadowing the builtin `iter`.
    for step_idx in range(num_iters):
        a, b, c = _optimal_quintic(max(l, cushion * u), u)
        if cushion * u > l:
            # Cushion was active: re-center p so p(l) and p(u) straddle 1
            # on the true interval [l, u].
            pl = a * l + b * l**3 + c * l**5
            pu = a * u + b * u**3 + c * u**5
            rescaler = 2 / (pl + pu)
            a *= rescaler
            b *= rescaler
            c *= rescaler
        if step_idx < num_iters - 1:
            # Deflate for numerical headroom (degree-matched scaling).
            a /= safety_factor
            b /= safety_factor**3
            c /= safety_factor**5
        coefficients.append((a, b, c))
        l = a * l + b * l**3 + c * l**5
        u = 2 - l
    return coefficients
100
+
101
+
102
+ # Precomputed Polar Express coefficients (a, b, c) for 10 quintic Newton-Schulz
103
+ # iterations. Each tuple is the minimax-optimal (Remez/equioscillation) odd quintic
104
+ # approximant to x->1 over the current singular-value interval, computed once at
105
+ # import time and reused across all optimizer steps.
106
+ #
107
+ # Contrast with the former hardcoded NS coefficients (5 fixed tuples):
108
+ # - Former: empirically tuned to maximize slope at zero; did not converge
109
+ # singular values to 1, yielding US'V^T with S' ~ Uniform(0.5, 1.5) instead
110
+ # of the true polar factor UV^T.
111
+ # - Polar Express: analytically optimal per step, adapting to the shrinking
112
+ # singular-value interval [l, u] as iterations progress; converges all
113
+ # singular values to 1, producing the exact polar factor UV^T.
114
+ _coeffs_list = _optimal_composition(l=1e-3,
115
+ num_iters=10,
116
+ safety_factor_eps=1e-2,
117
+ cushion=0.02)
118
+
119
+
120
+ # This code is adapted from:
121
+ # KellerJordan/Muon (https://github.com/KellerJordan/Muon/blob/master/muon.py)
122
+ # NoahAmsel/PolarExpress (https://github.com/NoahAmsel/PolarExpress)
123
+ # matmul_transpose_assign kernel from nil0x9/flash-muon (https://github.com/nil0x9/flash-muon)
124
  @torch.no_grad()
 
125
  def _zeropower_via_newtonschulz5(G, steps):
126
  """
127
+ Compute the polar factor of G via the Polar Express method.
128
+
129
+ Applies `steps` quintic iterations X <- aX + bX^3 + cX^5, where (a, b, c)
130
+ are the Polar Express coefficients from `_coeffs_list`. Each step is the
131
+ optimal odd quintic approximant to x -> 1 over the current singular-value
132
+ interval, minimizing the maximum approximation error (Remez / minimax criterion).
133
+ The composition maps singular values from [l, 1] to near 1, producing the
134
+ polar factor (orthogonal factor in the polar decomposition G = UP).
135
+
136
+ `_coeffs_list` is precomputed for 10 iterations (l=1e-3, safety_factor_eps=1e-2,
137
+ cushion=0.02). If `steps` exceeds 10, the final coefficient set is repeated.
138
+
139
+ Reference: Amsel et al., "The Polar Express: Optimal Matrix Sign Methods and
140
+ Their Application to the Muon Algorithm", https://arxiv.org/abs/2505.16932
141
  """
142
  assert len(G.shape) == 2
143
  assert G.dtype == COMM_DTYPE
 
145
 
146
  if G.size(0) > G.size(1):
147
  X = X.T
148
+
149
  X = X / (X.norm() + 1e-7)
150
+ hs = _coeffs_list[:steps] + list(
151
+ repeat(_coeffs_list[-1], steps - len(_coeffs_list)))
152
  buf1 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
153
  buf2 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
154
  # Perform the NS iterations
155
+ for a, b, c in hs:
 
 
 
 
 
 
156
  matmul_transpose_assign(X, buf1)
157
  matmul_transpose_assign(buf1, buf2)
158
  buf1.mul_(b).add_(buf2, alpha=c)
 
160
 
161
  if G.size(0) > G.size(1):
162
  X = X.T
163
+
164
  return X
165
+
166
+
167
@torch.no_grad()
def _zeropower_via_newtonschulz5_batched(G, steps):
    """Batched polar-factor computation for 3D (E, out, in) tensors.

    Same algorithm as ``_zeropower_via_newtonschulz5`` but built on
    ``torch.bmm`` / ``torch.baddbmm`` instead of the 2D Triton kernel,
    so all E expert matrices are processed in one batched call per step.
    """
    assert len(G.shape) == 3
    assert G.dtype == COMM_DTYPE
    X = G

    tall = G.size(1) > G.size(2)
    if tall:
        X = X.transpose(-2, -1)

    # Normalize each expert matrix by its own Frobenius norm.
    X = X / (X.norm(dim=(-2, -1), keepdim=True) + 1e-7)

    # Coefficient schedule: truncate to `steps`, padding with the final
    # tuple when more iterations are requested than precomputed.
    schedule = list(_coeffs_list[:steps])
    while len(schedule) < steps:
        schedule.append(_coeffs_list[-1])

    for a, b, c in schedule:
        gram = torch.bmm(X, X.transpose(-2, -1))  # A = X X^T
        gram_sq = torch.bmm(gram, gram.transpose(-2, -1))  # A^2
        gram.mul_(b).add_(gram_sq, alpha=c)  # bA + cA^2
        # X <- aX + (bA + cA^2) X
        X = torch.baddbmm(X, gram, X, alpha=1.0, beta=a)

    if tall:
        X = X.transpose(-2, -1)

    return X
+
198
+
199
+ _ns_per_shape: dict[tuple[int, ...], callable] = {}
200
+ _use_compile = True
201
+
202
+
203
def set_ns_compile(enabled: bool):
    """Enable or disable ``torch.compile`` wrapping of the NS iteration.

    Only affects future calls; already-compiled variants remain cached
    in ``_ns_per_shape``.
    """
    global _use_compile
    _use_compile = enabled
207
+
208
+
209
def zeropower_via_newtonschulz5(G, steps=5):
    """Shape-cached entry point for the 2D Newton-Schulz iteration.

    With compilation disabled, dispatch straight to the eager
    implementation. Otherwise fetch (or build and cache) a
    ``torch.compile``d variant keyed on ``G.shape``, mark a CUDA-graph
    step boundary, and clone the result (CUDA-graph outputs may be
    reused across steps).
    """
    if not _use_compile:
        return _zeropower_via_newtonschulz5(G, steps)

    compiled = _ns_per_shape.get(G.shape)
    if compiled is None:
        compiled = torch.compile(_zeropower_via_newtonschulz5,
                                 options={
                                     "triton.cudagraphs": True,
                                     "shape_padding": False
                                 })
        _ns_per_shape[G.shape] = compiled
    torch.compiler.cudagraph_mark_step_begin()
    return compiled(G, steps).clone()
221
+
222
+
223
def zeropower_via_newtonschulz5_batched(G, steps=5):
    """Compile-cached batched Newton-Schulz for 3D expert tensors.

    Mirrors ``zeropower_via_newtonschulz5`` but dispatches to the
    batched implementation; compiled variants are cached per input
    shape in ``_ns_per_shape``.
    """
    if not _use_compile:
        return _zeropower_via_newtonschulz5_batched(G, steps)

    compiled = _ns_per_shape.get(G.shape)
    if compiled is None:
        compiled = torch.compile(_zeropower_via_newtonschulz5_batched,
                                 options={
                                     "triton.cudagraphs": True,
                                     "shape_padding": False
                                 })
        _ns_per_shape[G.shape] = compiled
    torch.compiler.cudagraph_mark_step_begin()
    return compiled(G, steps).clone()
build/torch210-cxx11-rocm70-x86_64-linux/pipeline.py CHANGED
@@ -6,8 +6,8 @@ import torch.distributed as dist
6
  from torch.distributed.tensor import DTensor
7
  from torch.profiler import record_function
8
 
9
- from .core import _muon_state, adjust_lr_for_muon, update_p
10
- from .newton_schulz import COMM_DTYPE, _zeropower_via_newtonschulz5
11
  from .qk_clip import compute_scales
12
 
13
  logger = logging.getLogger(__name__)
@@ -45,26 +45,33 @@ def _launch_gather(
45
  else:
46
  gathered_grads[id(p)] = None
47
 
48
- # Build send buffer
49
- per_dst: list[list[torch.Tensor]] = [[] for _ in range(num_ranks)]
50
  send_counts = [0] * num_ranks
51
-
52
  for p in params:
53
  state = param_to_state[id(p)]
54
- dst = state.worker_rank
55
- assert dst < num_ranks
56
- shard_elems = state.rank_numels[rank]
57
- g = p.grad
58
- g = g.to_local().to(COMM_DTYPE).contiguous()
59
- assert g.numel() == shard_elems
60
- per_dst[dst].append(g.view(-1))
61
- send_counts[dst] += shard_elems
62
-
63
- assert any(
64
- len(v) > 0 for v in
65
- per_dst), "At least one destination rank must receive a sharded tensor"
66
- per_dst_flat = [t for dst in per_dst for t in dst]
67
- send_buf = torch.cat(per_dst_flat, dim=0)
 
 
 
 
 
 
 
 
68
 
69
  # Build recv buffer
70
  recv_counts = [0] * num_ranks
@@ -120,7 +127,8 @@ def _complete_gather(
120
 
121
  shard_view = gathered_grads[id(p)][indices]
122
  n = shard_view.numel()
123
- assert n > 0
 
124
 
125
  sg = recv_buf.narrow(0, off + inner_off, n)
126
  sg = sg.reshape(shard_view.shape)
@@ -143,7 +151,7 @@ def _compute_ns(
143
  """
144
  computed_us: dict[int, torch.Tensor | None] = {}
145
  for p in owned_params:
146
- u = _zeropower_via_newtonschulz5(gathered_grads[id(p)], ns_steps)
147
  gathered_grads[id(p)] = None # free gathered grad
148
  computed_us[id(p)] = u
149
  return computed_us
@@ -163,46 +171,47 @@ def _launch_scatter(
163
  Returns:
164
  work: Async operation handle.
165
  recv_buf: Flat receive buffer (needed by ``_complete_scatter``).
166
- scattered_us: ``{id(p): empty_local_tensor}`` for all params.
 
167
  recv_counts: Per-source-rank element counts.
168
  """
169
- # Allocate scattered-u buffers
 
 
 
170
  scattered_us: dict[int, torch.Tensor] = {}
171
  for p in params:
172
- scattered_us[id(p)] = torch.empty_like(p.to_local(), dtype=COMM_DTYPE)
 
 
173
 
174
- # Build send buffer (from computed_us on owner ranks)
175
- per_dst: list[list[torch.Tensor]] = [[] for _ in range(num_ranks)]
176
  send_counts = [0] * num_ranks
177
-
178
  if owned_params:
179
  for p in owned_params:
180
  state = param_to_state[id(p)]
181
-
182
- assert computed_us[id(p)] is not None
183
- u_full = computed_us[id(p)].to(COMM_DTYPE).contiguous()
184
-
185
- total_sent = 0
186
  for dst_rank in range(num_ranks):
187
- indices = state.rank_indices[dst_rank]
188
- su = u_full[indices].flatten()
189
-
190
- n = su.numel()
191
- assert n > 0
192
 
193
- per_dst[dst_rank].append(su)
194
- send_counts[dst_rank] += n
195
- total_sent += n
196
-
197
- assert total_sent == u_full.numel()
198
-
199
- lengths = [len(v) for v in per_dst]
200
- if all(l > 0 for l in lengths):
201
- assert all(
202
- l == lengths[0] for l in lengths
203
- ), "All destination ranks must have the same number of sharded tensor"
204
- per_dst_flat = [t for dst in per_dst for t in dst]
205
- send_buf = torch.cat(per_dst_flat, dim=0)
 
 
 
 
 
206
  else:
207
  send_buf = torch.empty(0, dtype=COMM_DTYPE, device="cuda")
208
 
@@ -218,7 +227,6 @@ def _launch_scatter(
218
  recv_counts[src] = total
219
 
220
  recv_total = sum(recv_counts)
221
- assert recv_total > 0
222
  recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")
223
 
224
  # Launch async all-to-all
@@ -242,7 +250,13 @@ def _complete_scatter(
242
  rank: int,
243
  scattered_us: dict[int, torch.Tensor],
244
  ) -> None:
245
- """Copy recv buffer into scattered_us (in-place)."""
 
 
 
 
 
 
246
  off = 0
247
  for src in range(len(recv_counts)):
248
  block = recv_counts[src]
@@ -255,11 +269,11 @@ def _complete_scatter(
255
  if state.worker_rank != src:
256
  continue
257
  n = state.rank_numels[rank]
258
- assert n > 0
 
259
 
260
- flat_local = recv_buf.narrow(0, off + inner_off,
261
- n).view_as(p.to_local())
262
- scattered_us[id(p)].copy_(flat_local)
263
 
264
  inner_off += n
265
 
@@ -275,23 +289,40 @@ def _update_params(
275
  lr: float,
276
  weight_decay: float,
277
  ) -> None:
278
- """Apply weight decay, Muon update, and optional QK clipping."""
279
- for p in params:
280
- state = param_to_state[id(p)]
281
- u_dtensor = DTensor.from_local(
282
- scattered_us[id(p)],
283
- placements=p.placements,
284
- device_mesh=p.device_mesh,
285
- )
286
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
287
  adjusted_lr = adjust_lr_for_muon(lr, p.shape)
288
- update_p(p, u_dtensor, lr, adjusted_lr, weight_decay)
 
 
 
289
 
290
- # QK clipping applied directly on the local tensor to
291
- # avoid DTensor sharding-propagation issues with _StridedShard.
292
- scales_full = compute_scales(
293
- p,
294
- state.qk_clip_state) if state.qk_clip_state is not None else None
 
 
 
 
 
295
  if scales_full is not None:
296
  ratio = p.shape[0] // scales_full.shape[0]
297
  idx0 = state.rank_indices[rank][0]
@@ -304,6 +335,45 @@ def _update_params(
304
  p._local_tensor.mul_(row_scales.view(-1, 1))
305
 
306
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
307
  # ======================================================================
308
  # Main generator – thin orchestrator that wires stages together.
309
  # ======================================================================
@@ -318,6 +388,7 @@ def muon_chunk_pipeline(
318
  lr: float,
319
  weight_decay: float,
320
  none_grad: bool,
 
321
  ) -> Generator[None, None, None]:
322
  """Process one chunk of parameters through the full Muon pipeline.
323
 
@@ -334,9 +405,12 @@ def muon_chunk_pipeline(
334
  runs concurrently on the NCCL stream — no separate ``comm_stream``
335
  is required.
336
 
 
 
 
337
  Yields exactly **2** times:
338
 
339
- 1. After launching async all-to-all gather.
340
  2. After launching async all-to-all scatter.
341
  """
342
  process_group = param_to_state[id(params[0])].process_group
@@ -345,15 +419,19 @@ def muon_chunk_pipeline(
345
  p for p in params if param_to_state[id(p)].worker_rank == rank
346
  ]
347
 
348
- # Stages 1-2: launch async gather.
349
- with record_function("muon::launch_gather"):
350
- work, recv_buf, gathered_grads, recv_counts = _launch_gather(
351
- params, owned_params, param_to_state, rank, num_ranks,
352
- process_group)
353
-
354
- if none_grad:
355
- for p in params:
356
- p.grad = None
 
 
 
 
357
 
358
  yield # --- YIELD 1: other chunks can launch their gather ---
359
 
 
6
  from torch.distributed.tensor import DTensor
7
  from torch.profiler import record_function
8
 
9
+ from .core import _muon_state, adjust_lr_for_muon
10
+ from .newton_schulz import COMM_DTYPE, zeropower_via_newtonschulz5
11
  from .qk_clip import compute_scales
12
 
13
  logger = logging.getLogger(__name__)
 
45
  else:
46
  gathered_grads[id(p)] = None
47
 
48
+ # Build send buffer – batch grad copies via torch.cat
49
+ # (1-2 fused kernels vs N individual narrow().copy_() calls).
50
  send_counts = [0] * num_ranks
 
51
  for p in params:
52
  state = param_to_state[id(p)]
53
+ send_counts[state.worker_rank] += state.rank_numels[rank]
54
+
55
+ total_send = sum(send_counts)
56
+ if total_send > 0:
57
+ # Group grad slices by destination rank in a single pass.
58
+ dst_to_grads = [[] for _ in range(num_ranks)]
59
+ for p in params:
60
+ state = param_to_state[id(p)]
61
+ n = state.rank_numels[rank]
62
+ if n > 0:
63
+ g = p.grad.to_local()
64
+ dst_to_grads[state.worker_rank].append(g.reshape(-1))
65
+
66
+ # Flatten in dst order and cat once.
67
+ all_slices = []
68
+ for dst in range(num_ranks):
69
+ all_slices.extend(dst_to_grads[dst])
70
+ send_buf = torch.cat(all_slices)
71
+ if send_buf.dtype != COMM_DTYPE:
72
+ send_buf = send_buf.to(COMM_DTYPE)
73
+ else:
74
+ send_buf = torch.empty(0, dtype=COMM_DTYPE, device="cuda")
75
 
76
  # Build recv buffer
77
  recv_counts = [0] * num_ranks
 
127
 
128
  shard_view = gathered_grads[id(p)][indices]
129
  n = shard_view.numel()
130
+ if n == 0:
131
+ continue
132
 
133
  sg = recv_buf.narrow(0, off + inner_off, n)
134
  sg = sg.reshape(shard_view.shape)
 
151
  """
152
  computed_us: dict[int, torch.Tensor | None] = {}
153
  for p in owned_params:
154
+ u = zeropower_via_newtonschulz5(gathered_grads[id(p)], ns_steps)
155
  gathered_grads[id(p)] = None # free gathered grad
156
  computed_us[id(p)] = u
157
  return computed_us
 
171
  Returns:
172
  work: Async operation handle.
173
  recv_buf: Flat receive buffer (needed by ``_complete_scatter``).
174
+ scattered_us: Empty dict, populated by ``_complete_scatter`` with
175
+ zero-copy views into ``recv_buf``.
176
  recv_counts: Per-source-rank element counts.
177
  """
178
+ # scattered_us is populated by _complete_scatter with zero-copy views
179
+ # into recv_buf, avoiding N empty_like allocations + N copy_ calls.
180
+ # Pre-seed entries for params whose local shard is empty (rank_numels == 0)
181
+ # so _update_params can iterate all params without KeyError.
182
  scattered_us: dict[int, torch.Tensor] = {}
183
  for p in params:
184
+ if param_to_state[id(p)].rank_numels[rank] == 0:
185
+ scattered_us[id(p)] = torch.empty_like(p.to_local(),
186
+ dtype=COMM_DTYPE)
187
 
188
+ # Build send buffer batch via torch.cat
189
+ # (1 fused kernel vs N*num_ranks individual narrow().copy_() calls).
190
  send_counts = [0] * num_ranks
 
191
  if owned_params:
192
  for p in owned_params:
193
  state = param_to_state[id(p)]
 
 
 
 
 
194
  for dst_rank in range(num_ranks):
195
+ send_counts[dst_rank] += state.rank_numels[dst_rank]
 
 
 
 
196
 
197
+ total_send = sum(send_counts)
198
+ if total_send > 0:
199
+ # Cache u_full conversions to avoid redundant .to() per dst_rank.
200
+ u_fulls = {}
201
+ for p in owned_params:
202
+ u_fulls[id(p)] = computed_us[id(p)].to(COMM_DTYPE).contiguous()
203
+
204
+ # Collect slices in dst order (matches all-to-all send layout).
205
+ all_slices = []
206
+ for dst_rank in range(num_ranks):
207
+ for p in owned_params:
208
+ state = param_to_state[id(p)]
209
+ su = u_fulls[id(p)][state.rank_indices[dst_rank]].flatten()
210
+ if su.numel() > 0:
211
+ all_slices.append(su)
212
+
213
+ send_buf = torch.cat(all_slices) if all_slices else torch.empty(
214
+ 0, dtype=COMM_DTYPE, device="cuda")
215
  else:
216
  send_buf = torch.empty(0, dtype=COMM_DTYPE, device="cuda")
217
 
 
227
  recv_counts[src] = total
228
 
229
  recv_total = sum(recv_counts)
 
230
  recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")
231
 
232
  # Launch async all-to-all
 
250
  rank: int,
251
  scattered_us: dict[int, torch.Tensor],
252
  ) -> None:
253
+ """Populate scattered_us with zero-copy views into recv_buf.
254
+
255
+ Instead of pre-allocating tensors and copying, we assign views directly
256
+ from ``recv_buf``. This eliminates N ``empty_like`` + N ``copy_`` calls.
257
+ The underlying storage of ``recv_buf`` is kept alive through the views
258
+ until ``scattered_us`` is cleared after ``_update_params``.
259
+ """
260
  off = 0
261
  for src in range(len(recv_counts)):
262
  block = recv_counts[src]
 
269
  if state.worker_rank != src:
270
  continue
271
  n = state.rank_numels[rank]
272
+ if n == 0:
273
+ continue
274
 
275
+ scattered_us[id(p)] = recv_buf.narrow(0, off + inner_off,
276
+ n).view_as(p.to_local())
 
277
 
278
  inner_off += n
279
 
 
289
  lr: float,
290
  weight_decay: float,
291
  ) -> None:
292
+ """Apply weight decay, Muon update, and optional QK clipping.
 
 
 
 
 
 
 
293
 
294
+ Uses batched ``_foreach_mul_`` for weight decay and batched
295
+ ``_foreach_add_`` for the Muon update, grouping parameters by
296
+ adjusted_lr to minimize kernel launches while preserving float32
297
+ precision for the alpha scaling.
298
+ """
299
+ if not params:
300
+ return
301
+
302
+ # Batched weight decay: p *= (1 - lr * wd) — single fused kernel.
303
+ p_locals = [p._local_tensor for p in params]
304
+ torch._foreach_mul_(p_locals, 1.0 - lr * weight_decay)
305
+
306
+ # Group params by adjusted_lr so _foreach_add_ can use a single
307
+ # alpha per group (preserves float32 precision for alpha scaling).
308
+ lr_groups: dict[float, tuple[list, list]] = {}
309
+ for p in params:
310
  adjusted_lr = adjust_lr_for_muon(lr, p.shape)
311
+ if adjusted_lr not in lr_groups:
312
+ lr_groups[adjusted_lr] = ([], [])
313
+ lr_groups[adjusted_lr][0].append(p._local_tensor)
314
+ lr_groups[adjusted_lr][1].append(scattered_us[id(p)])
315
 
316
+ for adjusted_lr, (p_group, u_group) in lr_groups.items():
317
+ torch._foreach_add_(p_group, u_group, alpha=-adjusted_lr)
318
+
319
+ # QK clipping – applied directly on the local tensor to
320
+ # avoid DTensor sharding-propagation issues with _StridedShard.
321
+ for p in params:
322
+ state = param_to_state[id(p)]
323
+ if state.qk_clip_state is None:
324
+ continue
325
+ scales_full = compute_scales(p, state.qk_clip_state)
326
  if scales_full is not None:
327
  ratio = p.shape[0] // scales_full.shape[0]
328
  idx0 = state.rank_indices[rank][0]
 
335
  p._local_tensor.mul_(row_scales.view(-1, 1))
336
 
337
 
338
+ # ======================================================================
339
+ # Pre-launch helper for overlapping first chunk's gather with other work.
340
+ # ======================================================================
341
+
342
+
343
+ @torch.no_grad()
344
+ def prelaunch_first_gather(
345
+ params: list[DTensor],
346
+ param_to_state: dict[int, _muon_state],
347
+ rank: int,
348
+ none_grad: bool,
349
+ ) -> tuple[dist.Work, torch.Tensor, dict[int, torch.Tensor | None], list[int]]:
350
+ """Launch the first chunk's A2A gather early for overlap with other compute.
351
+
352
+ Call this *before* expensive GPU work (e.g. batched expert NS) so that
353
+ the NCCL all-to-all runs concurrently on the NCCL stream while the
354
+ default stream executes compute.
355
+
356
+ Returns the same 4-tuple that ``_launch_gather`` produces, which should
357
+ be passed as ``prelaunch_gather`` to :func:`muon_chunk_pipeline`.
358
+ """
359
+ process_group = param_to_state[id(params[0])].process_group
360
+ num_ranks = dist.get_world_size(group=process_group)
361
+ owned_params = [
362
+ p for p in params if param_to_state[id(p)].worker_rank == rank
363
+ ]
364
+
365
+ with record_function("muon::prelaunch_gather"):
366
+ work, recv_buf, gathered_grads, recv_counts = _launch_gather(
367
+ params, owned_params, param_to_state, rank, num_ranks,
368
+ process_group)
369
+
370
+ if none_grad:
371
+ for p in params:
372
+ p.grad = None
373
+
374
+ return work, recv_buf, gathered_grads, recv_counts
375
+
376
+
377
  # ======================================================================
378
  # Main generator – thin orchestrator that wires stages together.
379
  # ======================================================================
 
388
  lr: float,
389
  weight_decay: float,
390
  none_grad: bool,
391
+ prelaunch_gather: tuple | None = None,
392
  ) -> Generator[None, None, None]:
393
  """Process one chunk of parameters through the full Muon pipeline.
394
 
 
405
  runs concurrently on the NCCL stream — no separate ``comm_stream``
406
  is required.
407
 
408
+ If ``prelaunch_gather`` is provided, the gather was already launched
409
+ by :func:`prelaunch_first_gather` and we skip launching it again.
410
+
411
  Yields exactly **2** times:
412
 
413
+ 1. After launching async all-to-all gather (or immediately if pre-launched).
414
  2. After launching async all-to-all scatter.
415
  """
416
  process_group = param_to_state[id(params[0])].process_group
 
419
  p for p in params if param_to_state[id(p)].worker_rank == rank
420
  ]
421
 
422
+ if prelaunch_gather is not None:
423
+ # Gather was pre-launched; none_grad already handled by caller.
424
+ work, recv_buf, gathered_grads, recv_counts = prelaunch_gather
425
+ else:
426
+ # Normal path: launch async gather.
427
+ with record_function("muon::launch_gather"):
428
+ work, recv_buf, gathered_grads, recv_counts = _launch_gather(
429
+ params, owned_params, param_to_state, rank, num_ranks,
430
+ process_group)
431
+
432
+ if none_grad:
433
+ for p in params:
434
+ p.grad = None
435
 
436
  yield # --- YIELD 1: other chunks can launch their gather ---
437
 
build/torch210-cxx11-rocm70-x86_64-linux/qk_clip.py CHANGED
@@ -5,6 +5,8 @@ from dataclasses import dataclass
5
  import torch
6
  from torch.distributed.tensor import DTensor
7
 
 
 
8
  logger = logging.getLogger(__name__)
9
 
10
 
@@ -23,7 +25,7 @@ def parse_qk_layer(name: str) -> tuple[str | None, int]:
23
  'model.7.attn.k_proj.weight' -> ('k_proj', 7)
24
  'model.4.attn.v_proj.weight' -> (None, -1)
25
  """
26
- parts = name.split('.')
27
  if len(parts) < 3:
28
  return None, -1
29
 
@@ -100,23 +102,27 @@ def compute_scales(p, qk_clip_state):
100
  threshold = qk_clip_state.threshold
101
  logit = qk_clip_state.logit
102
 
103
- H_global = p.shape[0] // head_dim
104
- scales_full = torch.ones(H_global, device=p.data.device)
105
- scaling = 0
106
-
107
  for logit_idx, head_idx in enumerate(indices):
108
  v_ele = float(logit[logit_idx])
109
  if v_ele > threshold:
110
  new_scale = math.sqrt(threshold / v_ele)
111
- if new_scale < scales_full[head_idx]:
112
- scales_full[head_idx] = new_scale
113
  logger.info(
114
  f"[{kind}] Head {head_idx} exceeded threshold "
115
  f"(value={v_ele:.4f}, threshold={threshold:.4f}) -> applying scale={new_scale:.4f}"
116
  )
117
- scaling += 1
118
 
119
- return scales_full if scaling > 0 else None
 
 
 
 
 
 
 
120
 
121
 
122
  def qk_clip(p, scales, head_dim):
 
5
  import torch
6
  from torch.distributed.tensor import DTensor
7
 
8
+ from .core import normalize_fqn
9
+
10
  logger = logging.getLogger(__name__)
11
 
12
 
 
25
  'model.7.attn.k_proj.weight' -> ('k_proj', 7)
26
  'model.4.attn.v_proj.weight' -> (None, -1)
27
  """
28
+ parts = normalize_fqn(name).split('.')
29
  if len(parts) < 3:
30
  return None, -1
31
 
 
102
  threshold = qk_clip_state.threshold
103
  logit = qk_clip_state.logit
104
 
105
+ # Check if any head exceeds threshold before allocating.
106
+ head_scales = {}
 
 
107
  for logit_idx, head_idx in enumerate(indices):
108
  v_ele = float(logit[logit_idx])
109
  if v_ele > threshold:
110
  new_scale = math.sqrt(threshold / v_ele)
111
+ if head_idx not in head_scales or new_scale < head_scales[head_idx]:
112
+ head_scales[head_idx] = new_scale
113
  logger.info(
114
  f"[{kind}] Head {head_idx} exceeded threshold "
115
  f"(value={v_ele:.4f}, threshold={threshold:.4f}) -> applying scale={new_scale:.4f}"
116
  )
 
117
 
118
+ if not head_scales:
119
+ return None
120
+
121
+ H_global = p.shape[0] // head_dim
122
+ scales_full = torch.ones(H_global, device=p.data.device)
123
+ for head_idx, scale in head_scales.items():
124
+ scales_full[head_idx] = scale
125
+ return scales_full
126
 
127
 
128
  def qk_clip(p, scales, head_dim):
build/torch210-cxx11-rocm71-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _optimizer_7aef62f_dirty
3
- ops = torch.ops._optimizer_7aef62f_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_optimizer_7aef62f_dirty::{op_name}"
 
1
  import torch
2
+ from . import _optimizer_5b58933_dirty
3
+ ops = torch.ops._optimizer_5b58933_dirty
4
 
5
def add_op_namespace_prefix(op_name: str):
    """Return *op_name* qualified with this extension's op namespace."""
    namespace = "_optimizer_5b58933_dirty"
    return f"{namespace}::{op_name}"
build/torch210-cxx11-rocm71-x86_64-linux/{_optimizer_7aef62f_dirty.abi3.so → _optimizer_5b58933_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e67022789ddd9296552fc5ab4075ce96b8b00b75bce057c707e5b5076bbde734
3
  size 1866112
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f41709878a4def27b12f4f9a4f5b767027fb33141e775f64ad04d434fcbe33d9
3
  size 1866112
build/torch210-cxx11-rocm71-x86_64-linux/adamw.py CHANGED
@@ -1,8 +1,12 @@
 
1
  from collections import defaultdict
2
  from typing import cast
3
 
4
  import torch
5
  from torch.distributed.tensor import DTensor
 
 
 
6
 
7
 
8
  def fused_adamw(
@@ -72,54 +76,72 @@ def fused_adamw(
72
  )
73
 
74
 
75
- def step_adamw_params(optimizer_state, params, group):
76
- """Run fused AdamW on a list of parameters sharing the same placement.
 
77
 
78
- Args:
79
- optimizer_state: The optimizer's state dict (self.state in Muon).
80
- params: List of parameters to update.
81
- group: Parameter group dict with lr, adamw_betas, adamw_eps, weight_decay.
82
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  params_with_grads = []
84
  grads = []
85
  moment1 = []
86
  moment2 = []
87
- max_exp_avg_sqs = []
88
  state_steps = []
89
- lr = group["lr"]
90
- beta1, beta2 = group["adamw_betas"]
91
- eps = group["adamw_eps"]
92
- weight_decay = group["weight_decay"]
93
 
94
  for p in params:
95
  g = p.grad
96
  if g is None:
97
  continue
98
  state = optimizer_state[p]
99
- params_with_grads.append(p)
100
- grads.append(g)
101
  if "step" not in state:
102
- state["step"] = (torch.zeros((),
103
- dtype=torch.float32,
104
- device=p.device))
105
  state["moment1"] = torch.zeros_like(g)
106
  state["moment2"] = torch.zeros_like(g)
107
- moment1.append(state["moment1"])
108
- moment2.append(state["moment2"])
109
  if not isinstance(state["step"], torch.Tensor):
110
- step_tensor = torch.tensor(state["step"],
111
- dtype=torch.float32,
112
- device=p.device)
113
- else:
114
- step_tensor = state["step"]
115
- state_steps.append(step_tensor)
 
 
 
 
 
 
116
 
117
  fused_adamw(
118
  params_with_grads,
119
  grads,
120
  moment1,
121
  moment2,
122
- max_exp_avg_sqs,
123
  state_steps,
124
  amsgrad=False,
125
  beta1=beta1,
@@ -131,24 +153,119 @@ def step_adamw_params(optimizer_state, params, group):
131
  )
132
 
133
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  def step_adamw(optimizer_state, group):
135
  """Dispatch AdamW step, grouping parameters by type and placement.
136
 
 
 
 
137
  Args:
138
  optimizer_state: The optimizer's state dict (self.state in Muon).
139
  group: Parameter group dict.
140
  """
141
  params = group["params"]
 
142
 
143
- # group params with its type and placement
144
- placement_to_params: dict[tuple, list[torch.Tensor]] = defaultdict(list)
145
- for p in params:
146
- match p:
147
- case DTensor():
148
- placement_to_params[tuple([p.placements,
149
- p.device_mesh])].append(p)
150
- case torch.Tensor():
151
- placement_to_params[tuple([torch.Tensor, None])].append(p)
152
-
153
- for group_params in placement_to_params.values():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  step_adamw_params(optimizer_state, group_params, group)
 
1
+ import logging
2
  from collections import defaultdict
3
  from typing import cast
4
 
5
  import torch
6
  from torch.distributed.tensor import DTensor
7
+ from torch.profiler import record_function
8
+
9
+ logger = logging.getLogger(__name__)
10
 
11
 
12
  def fused_adamw(
 
76
  )
77
 
78
 
79
+ def _to_local(t):
80
+ """Unwrap DTensor to local tensor for fused ops."""
81
+ return t._local_tensor if isinstance(t, DTensor) else t
82
 
83
+
84
+ # ---------------------------------------------------------------------------
85
+ # Caches for eliminating per-step Python overhead.
86
+ #
87
+ # Placement grouping and tensor list assembly are identical every step
88
+ # (params don't change placement, moment/step tensors are the same objects
89
+ # after initialisation). We cache them keyed by id() of the param list
90
+ # stored in param_groups (stable across steps).
91
+ #
92
+ # Only gradients change each step and must be collected fresh.
93
+ # ---------------------------------------------------------------------------
94
+
95
+ # id(group["params"]) → dict[placement_key, list[param]]
96
+ _placement_cache: dict[int, dict[tuple, list]] = {}
97
+
98
+ # id(placement_group_list) → (params_local, moment1, moment2, state_steps)
99
+ _tensor_cache: dict[int, tuple[list, list, list, list]] = {}
100
+
101
+
102
def _step_adamw_params_slow(optimizer_state, params, group):
    """Uncached fallback for the rare case where some params lack grads.

    Skips params whose ``.grad`` is None, lazily initialises per-param
    state ("step", "moment1", "moment2"), and runs one fused AdamW call
    over the surviving params.

    Args:
        optimizer_state: The optimizer's state dict (self.state in Muon).
        params: Parameters to update; some may have ``grad=None``.
        group: Parameter group dict with lr, adamw_betas, adamw_eps,
            weight_decay.
    """
    params_with_grads = []
    grads = []
    moment1 = []
    moment2 = []
    state_steps = []

    for p in params:
        g = p.grad
        if g is None:
            continue
        state = optimizer_state[p]
        # DTensors are unwrapped to local shards for the fused kernel.
        params_with_grads.append(_to_local(p))
        grads.append(_to_local(g))
        if "step" not in state:
            # Lazy state init on the first step for this param.
            state["step"] = torch.zeros((),
                                        dtype=torch.float32,
                                        device=p.device)
            state["moment1"] = torch.zeros_like(g)
            state["moment2"] = torch.zeros_like(g)
        moment1.append(_to_local(state["moment1"]))
        moment2.append(_to_local(state["moment2"]))
        if not isinstance(state["step"], torch.Tensor):
            # Older checkpoints may store "step" as a plain Python number.
            state["step"] = torch.tensor(state["step"],
                                         dtype=torch.float32,
                                         device=p.device)
        state_steps.append(state["step"])

    # Nothing to do when every param was grad-less.
    if not params_with_grads:
        return

    lr = group["lr"]
    beta1, beta2 = group["adamw_betas"]
    eps = group["adamw_eps"]
    weight_decay = group["weight_decay"]

    fused_adamw(
        params_with_grads,
        grads,
        moment1,
        moment2,
        [],  # max_exp_avg_sqs is unused since amsgrad=False
        state_steps,
        amsgrad=False,
        beta1=beta1,
        beta2=beta2,
        lr=lr,
        weight_decay=weight_decay,
        eps=eps,
        maximize=False,
    )
154
 
155
 
156
def step_adamw_params(optimizer_state, params, group):
    """Run fused AdamW on a list of parameters sharing the same placement.

    After the first call, cached tensor lists (params_local, moment1,
    moment2, state_steps) are reused — only gradients are collected fresh.

    Args:
        optimizer_state: The optimizer's state dict (self.state in Muon).
        params: List of parameters to update.
        group: Parameter group dict with lr, adamw_betas, adamw_eps, weight_decay.
    """
    # Collect grads — the only thing that changes each step.
    with record_function("adamw::collect_grads"):
        grads = []
        for p in params:
            g = p.grad
            if g is None:
                # Rare: fall back to slow path that filters per-param.
                _step_adamw_params_slow(optimizer_state, params, group)
                return
            grads.append(_to_local(g))

    # NOTE(review): cache is keyed by id(params); assumes the list object
    # is stable across steps and never garbage-collected — confirm.
    tensor_key = id(params)
    if tensor_key not in _tensor_cache:
        with record_function("adamw::init_tensor_cache"):
            params_local = []
            moment1 = []
            moment2 = []
            state_steps = []

            for p in params:
                state = optimizer_state[p]
                params_local.append(_to_local(p))
                if "step" not in state:
                    # Lazy per-param state init; p.grad is non-None here
                    # (the grad collection above bailed out otherwise).
                    state["step"] = torch.zeros((),
                                                dtype=torch.float32,
                                                device=p.device)
                    state["moment1"] = torch.zeros_like(p.grad)
                    state["moment2"] = torch.zeros_like(p.grad)
                moment1.append(_to_local(state["moment1"]))
                moment2.append(_to_local(state["moment2"]))
                if not isinstance(state["step"], torch.Tensor):
                    # Older checkpoints may store "step" as a Python number.
                    state["step"] = torch.tensor(state["step"],
                                                 dtype=torch.float32,
                                                 device=p.device)
                state_steps.append(state["step"])

            _tensor_cache[tensor_key] = (params_local, moment1, moment2,
                                         state_steps)

    params_local, moment1, moment2, state_steps = _tensor_cache[tensor_key]

    lr = group["lr"]
    beta1, beta2 = group["adamw_betas"]
    eps = group["adamw_eps"]
    weight_decay = group["weight_decay"]

    with record_function("adamw::fused_adamw"):
        fused_adamw(
            params_local,
            grads,
            moment1,
            moment2,
            [],  # max_exp_avg_sqs is unused since amsgrad=False
            state_steps,
            amsgrad=False,
            beta1=beta1,
            beta2=beta2,
            lr=lr,
            weight_decay=weight_decay,
            eps=eps,
            maximize=False,
        )
229
+
230
+
231
def step_adamw(optimizer_state, group):
    """Dispatch AdamW step, grouping parameters by type and placement.

    Placement grouping is cached after the first call since params never
    change their placement between steps.

    Args:
        optimizer_state: The optimizer's state dict (self.state in Muon).
        group: Parameter group dict.
    """
    params = group["params"]
    # NOTE(review): keyed by id(params); assumes group["params"] stays the
    # same list object for the optimizer's lifetime — confirm.
    placement_key = id(params)

    if placement_key not in _placement_cache:
        with record_function("adamw::group_by_placement"):
            # DTensors are bucketed by (placements, device_mesh); plain
            # tensors all share one bucket keyed (torch.Tensor, None).
            placement_to_params: dict[tuple,
                                      list[torch.Tensor]] = defaultdict(list)
            for p in params:
                match p:
                    case DTensor():
                        logger.debug(
                            "[AdamW] DTensor param: shape=%s, placements=%s, "
                            "mesh=%s, grad=%s", p.shape, p.placements,
                            p.device_mesh.mesh_dim_names,
                            p.grad.shape if p.grad is not None else None)
                        placement_to_params[tuple(
                            [p.placements, p.device_mesh])].append(p)
                    case torch.Tensor():
                        logger.debug(
                            "[AdamW] plain param: shape=%s, grad=%s", p.shape,
                            p.grad.shape if p.grad is not None else None)
                        placement_to_params[tuple([torch.Tensor,
                                                   None])].append(p)

            logger.debug("[AdamW] %d placement groups, %d total params",
                         len(placement_to_params), len(params))

            _placement_cache[placement_key] = dict(placement_to_params)

    for group_params in _placement_cache[placement_key].values():
        step_adamw_params(optimizer_state, group_params, group)
build/torch210-cxx11-rocm71-x86_64-linux/core.py CHANGED
@@ -1,11 +1,25 @@
 
1
  import math
2
  from dataclasses import dataclass
 
3
 
4
  import torch
5
- import torch.distributed as dist
6
  from torch.distributed import ProcessGroup
7
  from torch.distributed.tensor import DTensor
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
  @dataclass
11
  class _muon_state:
@@ -17,26 +31,71 @@ class _muon_state:
17
  qk_clip_state: torch.Tensor | None = None
18
 
19
 
20
- def update_g(optimizer_state, p, g, group, momentum):
21
- """Apply momentum update to gradient.
 
 
 
 
 
 
22
 
23
- Args:
24
- optimizer_state: The optimizer's state dict (self.state in Muon).
25
- p: Parameter tensor.
26
- g: Gradient tensor.
27
- group: Parameter group dict.
28
- momentum: Momentum coefficient.
29
 
30
- Returns:
31
- Momentum-updated gradient tensor.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  """
33
- state = optimizer_state[p]
34
- buf = state.setdefault("momentum_buffer", torch.zeros_like(g))
35
- torch.add(g, buf, alpha=momentum, out=buf)
36
- if group["nesterov"]:
37
- g.add_(buf, alpha=momentum)
38
- return g
39
- return buf
 
 
 
 
 
 
 
 
 
 
 
40
 
41
 
42
  def update_p(p, u, lr, adjusted_lr, weight_decay):
@@ -49,14 +108,13 @@ def update_p(p, u, lr, adjusted_lr, weight_decay):
49
  adjusted_lr: Size-adjusted learning rate.
50
  weight_decay: Weight decay coefficient.
51
  """
52
- if isinstance(p, torch.nn.Parameter):
53
- # apply weight decay
54
- p.data.mul_(1 - lr * weight_decay)
55
- # apply update
56
- p.data.add_(u, alpha=-adjusted_lr)
57
- else:
58
- p.mul_(1 - lr * weight_decay)
59
- p.add_(u, alpha=-adjusted_lr)
60
 
61
 
62
  def adjust_lr_for_muon(lr, param_shape):
@@ -77,14 +135,55 @@ def adjust_lr_for_muon(lr, param_shape):
77
  return adjusted_lr
78
 
79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  def default_is_muon(name, x, expert_keys=None):
81
- skip_keys = ["embed_tokens", "lm_head", "tok_embeddings", "output"]
82
- if any(key in name for key in skip_keys):
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  return False
84
  effective_ndim = x.ndim
85
- if expert_keys and any(key in name for key in expert_keys):
 
86
  effective_ndim -= 1
87
- return effective_ndim >= 2
 
 
 
 
 
88
 
89
 
90
  def get_default_muon_param_groups(model, is_muon_func=None, expert_keys=None):
@@ -92,7 +191,7 @@ def get_default_muon_param_groups(model, is_muon_func=None, expert_keys=None):
92
  is_muon_func = lambda n, x: default_is_muon(n, x, expert_keys)
93
 
94
  muon_params, muon_names = [], []
95
- non_muon_params = []
96
 
97
  for n, p in model.named_parameters():
98
  if not p.requires_grad:
@@ -102,6 +201,10 @@ def get_default_muon_param_groups(model, is_muon_func=None, expert_keys=None):
102
  muon_names.append(n)
103
  else:
104
  non_muon_params.append(p)
 
 
 
 
105
 
106
  return [
107
  {
 
1
+ import logging
2
  import math
3
  from dataclasses import dataclass
4
+ from typing import List
5
 
6
  import torch
 
7
  from torch.distributed import ProcessGroup
8
  from torch.distributed.tensor import DTensor
9
 
10
# torch.compile wraps modules as OptimizedModule, inserting "_orig_mod" into
# parameter FQNs. Activation checkpointing similarly inserts
# "_checkpoint_wrapped_module". Strip these so name-based matching (skip_keys,
# expert_keys, QK layer parsing) works regardless of wrapper nesting.
_WRAPPER_PARTS = frozenset({"_orig_mod", "_checkpoint_wrapped_module"})

logger = logging.getLogger(__name__)


def normalize_fqn(name: str) -> str:
    """Drop torch.compile / checkpoint wrapper components from a parameter FQN."""
    kept = [part for part in name.split(".") if part not in _WRAPPER_PARTS]
    return ".".join(kept)
22
+
23
 
24
  @dataclass
25
  class _muon_state:
 
31
  qk_clip_state: torch.Tensor | None = None
32
 
33
 
34
def _batch_momentum(
    grads: List[torch.Tensor],
    momentum_bufs: List[torch.Tensor],
    momentum: torch.Tensor,
) -> None:
    """Batched momentum update (no nesterov).

    In-place: for each pair, buf = buf * momentum + grad, using
    ``_foreach_`` ops so all tensors are updated in one batched launch.
    """
    torch._foreach_mul_(momentum_bufs, momentum)
    torch._foreach_add_(momentum_bufs, grads)
42
 
 
 
 
 
 
 
43
 
44
def _batch_momentum_nesterov(
    grads: List[torch.Tensor],
    momentum_bufs: List[torch.Tensor],
    momentum: torch.Tensor,
) -> None:
    """Batched momentum update with nesterov correction.

    In-place: buf = buf * momentum + grad, then grad += buf * momentum
    (the nesterov look-ahead). Both input lists are mutated.
    """
    torch._foreach_mul_(momentum_bufs, momentum)
    torch._foreach_add_(momentum_bufs, grads)
    nesterov_terms = torch._foreach_mul(momentum_bufs, momentum)
    torch._foreach_add_(grads, nesterov_terms)
54
+
55
+
56
# Lazily populated: nesterov flag → torch.compile'd momentum kernel.
_compiled_momentum: dict[bool, callable] = {}
# Module-level switch read by batch_pre_ortho(); toggled via
# set_momentum_compile().
_use_momentum_compile = True


def set_momentum_compile(enabled: bool):
    """Toggle torch.compile for batched momentum."""
    global _use_momentum_compile
    _use_momentum_compile = enabled
64
+
65
+
66
def batch_pre_ortho(
    grads: List[torch.Tensor],
    momentum_bufs: List[torch.Tensor],
    momentum: torch.Tensor,
    nesterov: bool,
) -> None:
    """Batched momentum update on lists of plain tensors.

    Mirrors dion's ``muon_update_pre_orthogonalize``.
    Inputs must be plain CUDA tensors (not DTensor).
    Modifies ``momentum_bufs`` and (for nesterov) ``grads`` in-place.

    When compile is enabled, uses separately compiled functions for
    nesterov=True/False to avoid graph breaks from the branch.
    """
    fn = _batch_momentum_nesterov if nesterov else _batch_momentum
    if _use_momentum_compile:
        # Compile each variant once and memoise, keyed by the nesterov flag.
        if nesterov not in _compiled_momentum:
            _compiled_momentum[nesterov] = torch.compile(fn)
        fn = _compiled_momentum[nesterov]
    fn(grads, momentum_bufs, momentum)
87
+
88
+
89
+ def _update_p_impl(p_data, u_data, lr, adjusted_lr, weight_decay):
90
+ """Weight-decay + update on plain tensors.
91
+
92
+ Not compiled: per-param @torch.compile caused ~0.25ms TorchDynamo cache
93
+ lookup per call × 256+ params = massive overhead. The pipeline path uses
94
+ batched _foreach_* ops instead; this function remains for base() and
95
+ distributed_muon().
96
+ """
97
+ p_data.mul_(1 - lr * weight_decay)
98
+ p_data.add_(u_data, alpha=-adjusted_lr)
99
 
100
 
101
  def update_p(p, u, lr, adjusted_lr, weight_decay):
 
108
  adjusted_lr: Size-adjusted learning rate.
109
  weight_decay: Weight decay coefficient.
110
  """
111
+ # Unwrap Parameter -> underlying data tensor.
112
+ p_data = p.data if isinstance(p, torch.nn.Parameter) else p
113
+ # Unwrap DTensor -> local CUDA tensor for compiled kernel.
114
+ if isinstance(p_data, DTensor):
115
+ p_data = p_data._local_tensor
116
+ u_data = u._local_tensor if isinstance(u, DTensor) else u
117
+ _update_p_impl(p_data, u_data, lr, adjusted_lr, weight_decay)
 
118
 
119
 
120
  def adjust_lr_for_muon(lr, param_shape):
 
135
  return adjusted_lr
136
 
137
 
138
+ def _match_key(parts, key):
139
+ """Check if key matches as contiguous components in parts.
140
+
141
+ Single-component keys (e.g. "experts") match any single component.
142
+ Multi-component keys (e.g. "experts.w1") match as a contiguous subsequence.
143
+ """
144
+ key_parts = key.split(".")
145
+ key_len = len(key_parts)
146
+ if key_len == 1:
147
+ return key in parts
148
+ return any(parts[i:i + key_len] == key_parts
149
+ for i in range(len(parts) - key_len + 1))
150
+
151
+
152
def is_expert_param(name, expert_keys):
    """Return True when *name* matches any expert key at component level."""
    if not expert_keys:
        return False
    components = normalize_fqn(name).split(".")
    for key in expert_keys:
        if _match_key(components, key):
            return True
    return False
158
+
159
+
160
def default_is_muon(name, x, expert_keys=None):
    """Decide whether parameter *name*/*x* should be optimised with Muon.

    Parameters whose FQN contains a skip key (embeddings, output heads and
    a few special projections) always go to AdamW. Everything else uses
    Muon when its effective ndim is >= 2; expert params (matched via
    ``expert_keys``) lose one dim from the count.

    Args:
        name: Parameter FQN; wrapper components are stripped before matching.
        x: The parameter tensor.
        expert_keys: Optional component keys marking expert (MoE) params.

    Returns:
        True when the parameter should be handled by Muon.
    """
    normalized = normalize_fqn(name)
    parts = normalized.split(".")
    skip_keys = [
        "embed_tokens",
        "lm_head",
        "tok_embeddings",
        "output",
        "mhc_attn",
        "mhc_ffn",
        "lambda_proj",
    ]
    # Component-level match (exact path component, not substring).
    if any(key in parts for key in skip_keys):
        logger.info(
            "[is_muon] %s (orig: %s): skip (matched skip_key), ndim=%d",
            normalized, name, x.ndim)
        return False
    effective_ndim = x.ndim
    is_expert = is_expert_param(name, expert_keys)
    if is_expert:
        # Expert tensors stack per-expert matrices along dim 0.
        effective_ndim -= 1
    result = effective_ndim >= 2
    logger.info(
        "[is_muon] %s (orig: %s): ndim=%d, expert=%s, effective_ndim=%d → %s",
        normalized, name, x.ndim, is_expert, effective_ndim,
        "Muon" if result else "AdamW")
    return result
187
 
188
 
189
  def get_default_muon_param_groups(model, is_muon_func=None, expert_keys=None):
 
191
  is_muon_func = lambda n, x: default_is_muon(n, x, expert_keys)
192
 
193
  muon_params, muon_names = [], []
194
+ non_muon_params, non_muon_names = [], []
195
 
196
  for n, p in model.named_parameters():
197
  if not p.requires_grad:
 
201
  muon_names.append(n)
202
  else:
203
  non_muon_params.append(p)
204
+ non_muon_names.append(n)
205
+
206
+ logger.info("[param_groups] expert_keys=%s, Muon=%d, AdamW=%d",
207
+ expert_keys, len(muon_names), len(non_muon_names))
208
 
209
  return [
210
  {
build/torch210-cxx11-rocm71-x86_64-linux/cpu_offload.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """CPU offloading for optimizer states.
2
+
3
+ Manages a pinned CPU memory pool and async CUDA streams to offload
4
+ optimizer state tensors (momentum buffers, Adam moments) to CPU between
5
+ optimizer steps, freeing GPU memory.
6
+
7
+ All tracked tensors are packed into a single flat pinned CPU buffer
8
+ (per dtype). D2H and H2D copies are performed per-tensor directly
9
+ between individual GPU tensors and their slice of the CPU flat buffer
10
+ — no GPU staging buffer is allocated, so there is **no temporary GPU
11
+ memory spike** during offload or reload.
12
+
13
+ Individual tensor storages are freed after offload via
14
+ ``untyped_storage().resize_(0)``, preserving tensor identity so
15
+ downstream caches remain valid.
16
+ """
17
+
18
+ import logging
19
+ from collections import defaultdict
20
+
21
+ import torch
22
+ from torch.distributed.tensor import DTensor
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+
27
class CPUOffloadPool:
    """Pinned CPU memory pool for async optimizer state offloading.

    Tracked tensors are grouped by dtype. Each group gets a single flat
    pinned CPU buffer. D2H / H2D copies are per-tensor (into slices of
    the flat buffer) to avoid allocating a GPU staging buffer.
    """

    def __init__(self):
        # Tensors registered via track(), in registration order.
        self._managed: list[torch.Tensor] = []
        self._storage_nbytes: dict[int, int] = {}  # id(t) → bytes

        # Per-dtype group: populated on first offload.
        # dtype → dict with keys:
        #   "indices" : list[int] managed-list indices
        #   "offsets" : list[tuple[int,int]] (start, numel) in flat buf
        #   "total"   : int total numel
        #   "cpu_flat" : Tensor pinned CPU buffer
        self._groups: dict[torch.dtype, dict] = {}

        # Dedicated stream for D2H copies; created lazily on first offload.
        self._offload_stream: torch.cuda.Stream | None = None
        self._device: torch.device | None = None
        self._initialized: bool = False
        self._logged: bool = False

    # ------------------------------------------------------------------
    @staticmethod
    def _local(t: torch.Tensor) -> torch.Tensor:
        """Unwrap DTensor to its local CUDA tensor."""
        return t._local_tensor if isinstance(t, DTensor) else t

    def _ensure_stream(self):
        # Lazily create the copy stream on the device of the first tracked
        # tensor.
        if self._offload_stream is None:
            self._offload_stream = torch.cuda.Stream(device=self._device)

    # ------------------------------------------------------------------
    def track(self, tensor: torch.Tensor):
        """Register a GPU tensor for CPU offloading. Idempotent."""
        tid = id(tensor)
        if tid in self._storage_nbytes:
            return
        local = self._local(tensor)
        if self._device is None:
            self._device = local.device
        # Remember the storage size so reload() can re-allocate it after
        # offload() shrinks the storage to zero bytes.
        self._storage_nbytes[tid] = local.untyped_storage().size()
        self._managed.append(tensor)

    # ------------------------------------------------------------------
    def _init_buffers(self):
        """Build per-dtype flat buffers on first offload."""
        # Group managed tensors by dtype.
        dtype_map: dict[torch.dtype, list[tuple[int, int]]] = defaultdict(list)
        for idx, t in enumerate(self._managed):
            local = self._local(t)
            dtype_map[local.dtype].append((idx, local.numel()))

        total_cpu_bytes = 0
        for dtype, entries in dtype_map.items():
            offsets: list[tuple[int, int]] = []
            indices: list[int] = []
            off = 0
            for idx, n in entries:
                indices.append(idx)
                offsets.append((off, n))
                off += n
            cpu_flat = torch.empty(off, dtype=dtype, device="cpu", pin_memory=True)
            self._groups[dtype] = {
                "indices": indices,
                "offsets": offsets,
                "total": off,
                "cpu_flat": cpu_flat,
            }
            total_cpu_bytes += off * cpu_flat.element_size()

        self._initialized = True
        logger.info(
            "[CPUOffload] Pool initialized: %d tensors, %d dtype group(s), "
            "%.2f MB pinned CPU memory",
            len(self._managed),
            len(self._groups),
            total_cpu_bytes / (1024**2),
        )

    # ------------------------------------------------------------------
    def offload(self):
        """Per-tensor async D2H into CPU flat buffer, then free GPU storage."""
        if not self._managed:
            return
        if not self._initialized:
            self._init_buffers()
        self._ensure_stream()

        # Offload stream waits for compute to finish.
        compute_event = torch.cuda.current_stream(
            self._device).record_event()
        self._offload_stream.wait_event(compute_event)

        offloaded_bytes = 0

        # Per-tensor D2H copies directly into CPU flat buffer slices.
        # No GPU staging buffer → no temporary GPU memory spike.
        with torch.cuda.stream(self._offload_stream):
            for dtype, grp in self._groups.items():
                indices = grp["indices"]
                offsets = grp["offsets"]
                cpu_flat = grp["cpu_flat"]

                for i, mgd_idx in enumerate(indices):
                    local = self._local(self._managed[mgd_idx])
                    off, n = offsets[i]
                    # NOTE(review): reshape(-1) copies for non-contiguous
                    # tensors; looks like tracked buffers are contiguous —
                    # confirm.
                    cpu_flat[off:off + n].copy_(
                        local.reshape(-1), non_blocking=True)

                offloaded_bytes += grp["total"] * cpu_flat.element_size()

        # Wait for all D2H copies to land, then free GPU storage.
        # Ordering matters: resizing before the copies complete would
        # release memory still being read.
        self._offload_stream.synchronize()
        for t in self._managed:
            self._local(t).untyped_storage().resize_(0)

        if not self._logged:
            logger.info("[CPUOffload] Offloaded %.2f MB (GPU → CPU)",
                        offloaded_bytes / (1024**2))

    # ------------------------------------------------------------------
    def reload(self):
        """Per-tensor H2D from CPU flat buffer on the default stream.

        Runs on the current (default) CUDA stream to avoid stream
        interaction issues with the parallel Muon pipeline. Since
        pinned CPU memory is the source, the copies overlap with
        GPU idle time between steps.
        """
        if not self._managed or not self._initialized:
            return

        reloaded_bytes = 0

        # Re-allocate all GPU storages first.
        for t in self._managed:
            local = self._local(t)
            local.untyped_storage().resize_(self._storage_nbytes[id(t)])

        # Per-tensor H2D copies from CPU flat buffer slices.
        # non_blocking=True with pinned source allows DMA overlap.
        for dtype, grp in self._groups.items():
            indices = grp["indices"]
            offsets = grp["offsets"]
            cpu_flat = grp["cpu_flat"]

            for i, mgd_idx in enumerate(indices):
                local = self._local(self._managed[mgd_idx])
                off, n = offsets[i]
                # NOTE(review): if local were non-contiguous, reshape(-1)
                # would return a copy and this in-place write would be
                # lost — confirm buffers stay contiguous.
                local.reshape(-1).copy_(
                    cpu_flat[off:off + n], non_blocking=True)

            reloaded_bytes += grp["total"] * cpu_flat.element_size()

        if not self._logged:
            logger.info("[CPUOffload] Reloaded %.2f MB (CPU → GPU)",
                        reloaded_bytes / (1024**2))
            self._logged = True
build/torch210-cxx11-rocm71-x86_64-linux/distributed/utils.py CHANGED
@@ -72,12 +72,6 @@ def get_slices_of_dtensor(
72
  else:
73
  curr_size = target.size()[shard_dim]
74
 
75
- if curr_size % num_chunks != 0:
76
- raise NotImplementedError(
77
- f"Dimension size {curr_size} is not divisible "
78
- f"by number of ranks {num_chunks} for shard "
79
- f"placement on dim {shard_dim}. (shape: {target.shape})")
80
-
81
  # Compute indices for this level of sharding
82
  if isinstance(placement, _StridedShard):
83
  _shard_size, offsets = _StridedShard.local_shard_size_and_offset(
 
72
  else:
73
  curr_size = target.size()[shard_dim]
74
 
 
 
 
 
 
 
75
  # Compute indices for this level of sharding
76
  if isinstance(placement, _StridedShard):
77
  _shard_size, offsets = _StridedShard.local_shard_size_and_offset(