# Licensed under the TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://github.com/Tencent-Hunyuan/HunyuanImage-3.0/blob/main/LICENSE
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

import math
import random
import re
import time
import warnings
from dataclasses import dataclass
from typing import List, Union, Optional, Dict, Any, Tuple, Callable, TYPE_CHECKING
from datetime import datetime

import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from PIL import Image
from einops import rearrange
from torch import Tensor
from torch import nn
from torch.cuda import nvtx

from transformers.activations import ACT2FN
from transformers.cache_utils import Cache, StaticCache
from transformers.generation.logits_process import LogitsProcessor, LogitsProcessorList
from transformers.generation.stopping_criteria import StoppingCriteriaList
from transformers.generation.streamers import TextStreamer
from transformers.generation.utils import GenerationMixin, GenerationConfig, ALL_CACHE_NAMES
from transformers.modeling_outputs import (
    BaseModelOutputWithPast,
    CausalLMOutputWithPast,
)
from transformers.modeling_utils import PreTrainedModel
from transformers.utils import (
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    is_flash_attn_2_available,
    logging,
)

try:
    import flashinfer
except Exception as e:
    flashinfer = None

#from .autoencoder_kl_3d import AutoencoderKLConv3D
from .autoencoder_kl_3d import AutoencoderKLConv3D_Dist, AutoencoderKLConv3D
from .configuration_hunyuan_image_3 import HunyuanImage3Config
from .hunyuan_image_3_pipeline import HunyuanImage3Text2ImagePipeline, FlowMatchDiscreteScheduler
from .image_processor import HunyuanImage3ImageProcessor
from .siglip2 import Siglip2VisionTransformer, LightProjector
from .tokenization_hunyuan_image_3 import HunyuanImage3TokenizerFast, ImageInfo, ImageTensor, CondImage
from .system_prompt import get_system_prompt

from .cache_utils import TaylorCacheContainer, CacheWithFreqsContainer

if TYPE_CHECKING:
    from transformers.generation.streamers import BaseStreamer

logger = logging.get_logger(__name__)


if is_flash_attn_2_available():
    from flash_attn import flash_attn_func

# Type aliases
BatchRaggedImages = Union[torch.Tensor, List[Union[torch.Tensor, List[torch.Tensor]]]]
BatchRaggedTensor = Union[torch.Tensor, List[torch.Tensor]]
InputImage = Optional[Union[Image.Image, str, bytes]]


def get_device(tensor: BatchRaggedImages):
    if isinstance(tensor, torch.Tensor):
        return tensor.device
    elif isinstance(tensor, list):
        return get_device(tensor[0])
    else:
        raise ValueError(f"Unsupported type for get_device: {type(tensor)}")


_CONFIG_FOR_DOC = "HunyuanImage3Config"

Hunyuan_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`HunyuanImage3Config`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# =======================================================
#     Helper Functions
# =======================================================

def default(val, d):
    return val if val is not None else d


def to_device(data, device):
    if device is None:
        return data
    if isinstance(data, torch.Tensor):
        return data.to(device)
    elif isinstance(data, list):
        return [to_device(x, device) for x in data]
    else:
        return data


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


def real_batched_index_select(t, dim, idx):
    """ index_select for batched index and batched t """
    assert t.ndim >= 2 and idx.ndim >= 2, f"{t.ndim=} {idx.ndim=}"
    assert len(t) == len(idx), f"{len(t)=} != {len(idx)=}"
    return torch.stack([torch.index_select(t[i], dim - 1, idx[i]) for i in range(len(t))])


# =======================================================
#     Module Functions
# =======================================================

def timestep_embedding(t, dim, max_period=10000):
    """
    Create sinusoidal timestep embeddings.

    Args:
        t (torch.Tensor): a 1-D Tensor of N indices, one per batch element. These may be fractional.
        dim (int): the dimension of the output.
        max_period (int): controls the minimum frequency of the embeddings.

    Returns:
        embedding (torch.Tensor): An (N, D) Tensor of positional embeddings.

    .. ref_link: https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
    """
    half = dim // 2
    freqs = torch.exp(
        -math.log(max_period)
        * torch.arange(start=0, end=half, dtype=torch.float32)
        / half
    ).to(device=t.device)
    args = t[:, None].float() * freqs[None]
    embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
    if dim % 2:
        embedding = torch.cat(
            [embedding, torch.zeros_like(embedding[:, :1])], dim=-1
        )
    return embedding


def conv_nd(dims, *args, **kwargs):
    """
    Create a 1D, 2D, or 3D convolution module.
    """
    if dims == 1:
        return nn.Conv1d(*args, **kwargs)
    elif dims == 2:
        return nn.Conv2d(*args, **kwargs)
    elif dims == 3:
        return nn.Conv3d(*args, **kwargs)
    raise ValueError(f"unsupported dimensions: {dims}")


def linear(*args, **kwargs):
    """
    Create a linear module.
    """
    return nn.Linear(*args, **kwargs)


def avg_pool_nd(dims, *args, **kwargs):
    """
    Create a 1D, 2D, or 3D average pooling module.
    """
    if dims == 1:
        return nn.AvgPool1d(*args, **kwargs)
    elif dims == 2:
        return nn.AvgPool2d(*args, **kwargs)
    elif dims == 3:
        return nn.AvgPool3d(*args, **kwargs)
    raise ValueError(f"unsupported dimensions: {dims}")


def zero_module(module):
    """
    Zero out the parameters of a module and return it.
    """
    for p in module.parameters():
        p.detach().zero_()
    return module


def normalization(channels, **kwargs):
    """
    Make a standard normalization layer.

    :param channels: number of input channels.
    :return: a nn.Module for normalization.
    """
    return nn.GroupNorm(32, channels, **kwargs)


def topkgating(
        logits: Tensor,
        topk: int,
        group_limited_greedy: bool = False,
        n_group: int = None,
        topk_group: int = None,
        norm_topk_prob: bool = True,
        routed_scaling_factor: float = 1.0,
        capacity_factor: float = 1.0,
        drop_tokens: bool = False,
):
    logits = logits.float()
    gates = F.softmax(logits, dim=1)

    if group_limited_greedy:
        group_shape = list(gates.shape[:-1]) + [n_group, gates.shape[-1] // n_group]
        group_scores = (
            gates.reshape(group_shape).max(dim=-1).values
        )  # [n, n_group]
        group_idx = torch.topk(
            group_scores, topk_group, dim=-1, sorted=False
        )[
            1
        ]  # [n, top_k_group]
        group_mask = torch.zeros_like(group_scores)  # [n, n_group]
        group_mask.scatter_(1, group_idx, 1)  # [n, n_group]
        score_mask = (
            group_mask.unsqueeze(-1)
            .expand(
                group_shape
            )
            .reshape(list(gates.shape))
        )  # [n, e]
        gates = gates.masked_fill(~score_mask.bool(), 0.0)

    num_experts = int(gates.shape[1])
    # Top-k router probability and corresponding expert indices for each token.
    # Shape: [tokens_per_group, num_selected_experts].
    expert_gate, expert_index = torch.topk(gates, topk)
    expert_mask = F.one_hot(expert_index, num_experts)
    # For a given token, determine if it was routed to a given expert.
    # Shape: [tokens_per_group, num_experts]
    expert_mask_aux = expert_mask.max(dim=-2)[0]
    tokens_per_group_and_expert = torch.mean(expert_mask_aux.float(), dim=-2)
    router_prob_per_group_and_expert = torch.mean(gates.float(), dim=-2)
    l_aux = num_experts ** 2 * torch.mean(tokens_per_group_and_expert * router_prob_per_group_and_expert)

    if drop_tokens:
        expert_capacity = int(max(topk, topk * gates.shape[0] // gates.shape[1]) * capacity_factor)
    else:
        expert_index_flat = expert_index.flatten()
        tokens_per_expert = torch.bincount(expert_index_flat, minlength=num_experts)
        expert_capacity = torch.max(tokens_per_expert).item()

    if norm_topk_prob and topk > 1:
        gates_s = torch.clamp(
            torch.matmul(expert_mask.float(), gates.unsqueeze(-1)).sum(dim=1), min=torch.finfo(gates.dtype).eps
        )
        router_probs = gates / gates_s
    else:
        router_probs = gates * routed_scaling_factor
    # Make num_selected_experts the leading axis to ensure that top-1 choices
    # have priority over top-2 choices, which have priority over top-3 choices,
    # etc.
    expert_index = torch.transpose(expert_index, 0, 1)
    # Shape: [num_selected_experts * tokens_per_group]
    expert_index = expert_index.reshape(-1)

    # Create mask out of indices.
    # Shape: [tokens_per_group * num_selected_experts, num_experts].
    expert_mask = F.one_hot(expert_index, num_experts).to(torch.int32)
    exp_counts = torch.sum(expert_mask, dim=0).detach()

    # Experts have a fixed capacity that we cannot exceed. A token's priority
    # within the expert's buffer is given by the masked, cumulative capacity of
    # its target expert.
    # Shape: [tokens_per_group * num_selected_experts, num_experts].
    token_priority = torch.cumsum(expert_mask, dim=0) * expert_mask - 1
    # Shape: [num_selected_experts, tokens_per_group, num_experts].
    token_priority = token_priority.reshape((topk, -1, num_experts))
    # Shape: [tokens_per_group, num_selected_experts, num_experts].
    token_priority = torch.transpose(token_priority, 0, 1)
    # For each token, across all selected experts, select the only non-negative
    # (unmasked) priority. Now, for group G routing to expert E, token T has
    # non-negative priority (i.e. token_priority[G,T,E] >= 0) if and only if E
    # is its targeted expert.
    # Shape: [tokens_per_group, num_experts].
    token_priority = torch.max(token_priority, dim=1)[0]

    # Token T can only be routed to expert E if its priority is positive and
    # less than the expert capacity. One-hot matrix will ignore indices outside
    # the range [0, expert_capacity).
    # Shape: [tokens_per_group, num_experts, expert_capacity].
    valid_mask = torch.logical_and(token_priority >= 0, token_priority < expert_capacity)
    token_priority = torch.masked_fill(token_priority, ~valid_mask, 0)
    dispatch_mask = F.one_hot(token_priority, expert_capacity).to(torch.bool)
    valid_mask = valid_mask.unsqueeze(-1).expand(-1, -1, expert_capacity)
    dispatch_mask = torch.masked_fill(dispatch_mask, ~valid_mask, 0)

    # The combine array will be used for combining expert outputs, scaled by the
    # router probabilities. Shape: [num_groups, tokens_per_group, num_experts,
    # expert_capacity].
    combine_weights = torch.einsum("...te,...tec->...tec", router_probs, dispatch_mask)
    exp_counts_capacity = torch.sum(dispatch_mask)
    exp_capacity_rate = exp_counts_capacity / (logits.shape[0] * topk)

    return [l_aux, exp_capacity_rate], combine_weights, dispatch_mask, exp_counts


# =======================================================
#     Multi-Dimensional RoPE
# =======================================================

def _to_tuple(x, dim=2):
    if isinstance(x, int):
        return (x,) * dim
    elif len(x) == dim:
        return x
    else:
        raise ValueError(f"Expected length {dim} or int, but got {x}")


def get_meshgrid_nd(start, *args, dim=2, device="cpu"):
    """
    Get n-D meshgrid with start, stop and num.

    Args:
        start (int or tuple): If len(args) == 0, start is num; If len(args) == 1, start is start, args[0] is stop,
            step is 1; If len(args) == 2, start is start, args[0] is stop, args[1] is num. For n-dim, start/stop/num
            should be int or n-tuple. If n-tuple is provided, the meshgrid will be stacked following the dim order in
            n-tuples.
        *args: See above.
        dim (int): Dimension of the meshgrid. Defaults to 2.

    Returns:
        grid (np.ndarray): [dim, ...]
    """
    if len(args) == 0:
        # start is grid_size
        num = _to_tuple(start, dim=dim)
        start = (0,) * dim
        stop = num
    elif len(args) == 1:
        # start is start, args[0] is stop, step is 1
        start = _to_tuple(start, dim=dim)
        stop = _to_tuple(args[0], dim=dim)
        num = [stop[i] - start[i] for i in range(dim)]
        # assert num are all integers
        num_int = [int(x) for x in num]
        assert (torch.tensor(num) == torch.tensor(num_int)).all(), f"num should be int, but got {num}"
        num = num_int
    elif len(args) == 2:
        # start is start, args[0] is stop, args[1] is num
        start = _to_tuple(start, dim=dim)       # Left-Top       eg: 12,0
        stop = _to_tuple(args[0], dim=dim)      # Right-Bottom   eg: 20,32
        num = _to_tuple(args[1], dim=dim)       # Target Size    eg: 32,124
    else:
        raise ValueError(f"len(args) should be 0, 1 or 2, but got {len(args)}")

    # PyTorch implement of np.linspace(start[i], stop[i], num[i], endpoint=False)
    axis_grid = []
    for i in range(dim):
        a, b, n = start[i], stop[i], num[i]
        g = torch.linspace(a, b, n + 1, dtype=torch.float32, device=device)[:n]
        axis_grid.append(g)
    grid = torch.meshgrid(*axis_grid, indexing="ij")   # dim x [H, W]
    grid = torch.stack(grid, dim=0)     # [dim, H, W]

    return grid


def build_2d_rope(
        seq_len: int, n_elem: int, image_infos: Optional[List[Tuple[slice, Tuple[int, int]]]] = None,
        device: Optional[torch.device] = None, base: int = 10000, base_rescale_factor: float = 1.0,
        return_all_pos: bool = False,
):
    """
    Reference: https://kexue.fm/archives/10352

    Start from 1, we have
        beta_y = L + (wh - h)/2
        beta_x = L + (wh - w)/2

    Returns
    -------
    cos: torch.Tensor with shape of [seq_len, n_elem]
    sin: torch.Tensor with shape of [seq_len, n_elem]
    """
    assert n_elem % 4 == 0, f"n_elem must be divisible by 4, but got {n_elem}."

    # theta
    if base_rescale_factor != 1.0:
        base *= base_rescale_factor ** (n_elem / (n_elem - 2))
    theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, device=device).float() / n_elem))
    theta = theta.reshape(1, n_elem // 4, 2)    # [1, half_d, 2]

    # position indices
    if image_infos is None:
        image_infos = []

    image_infos_list = [image_infos]
    sample_seq_lens = [seq_len]

    # Prepare position indices for each sample
    x_sections = []
    y_sections = []
    for sample_id, sample_image_infos in enumerate(image_infos_list):
        last_pos = 0
        for sec_slice, (h, w) in sample_image_infos:
            L = sec_slice.start   # start from 0, so image_slice.start is just L
            # previous text
            if last_pos < L:
                y_sections.append(torch.arange(last_pos, L, device=device))
                x_sections.append(torch.arange(last_pos, L, device=device))
            elif h is None:
                # Interleave data has overlapped positions for <boi> <size> <ratio> <timestep> <eoi> tokens.
                y_sections.append(torch.arange(sec_slice.start, sec_slice.stop, device=device))
                x_sections.append(torch.arange(sec_slice.start, sec_slice.stop, device=device))
                continue
            else:
                # Interleave data has overlapped positions for noised image and the successive clean image,
                # leading to last_pos (= last text end L + noise w * h) > L (last text end L).
                pass
            # current image
            beta_y = L + (w * h - h) / 2
            beta_x = L + (w * h - w) / 2
            grid = get_meshgrid_nd((beta_y, beta_x), (beta_y + h, beta_x + w), device=device)  # [2, h, w]
            grid = grid.reshape(2, -1)  # (y, x)
            y_sections.append(grid[0])
            x_sections.append(grid[1])
            # step
            last_pos = L + w * h
        # final text
        y_sections.append(torch.arange(last_pos, sample_seq_lens[sample_id], device=device))
        x_sections.append(torch.arange(last_pos, sample_seq_lens[sample_id], device=device))

    x_pos = torch.cat(x_sections).long()
    y_pos = torch.cat(y_sections).long()
    # If there are overlap positions, we need to remove them.
    x_pos = x_pos[:seq_len]
    y_pos = y_pos[:seq_len]
    all_pos = torch.stack((y_pos, x_pos), dim=1).unsqueeze(1).to(device)    # [seq_len, 1, 2]

    # calc rope
    idx_theta = (all_pos * theta).reshape(all_pos.shape[0], n_elem // 2).repeat(1, 2)

    cos = torch.cos(idx_theta)
    sin = torch.sin(idx_theta)

    if return_all_pos:
        return cos, sin, all_pos

    return cos, sin


def build_batch_2d_rope(
        seq_len: int, n_elem: int, image_infos: Optional[List[List[Tuple[slice, Tuple[int, int]]]]] = None,
        device: Optional[torch.device] = None, base: int = 10000, base_rescale_factor: float = 1.0,
        return_all_pos: bool = False,
):
    cos_list, sin_list, all_pos_list = [], [], []
    if image_infos is None:
        image_infos = [None]
    for i, image_info in enumerate(image_infos):
        res = build_2d_rope(
            seq_len, n_elem, image_infos=image_info, device=device,
            base=base, base_rescale_factor=base_rescale_factor,
            return_all_pos=return_all_pos,
        )
        if return_all_pos:
            cos, sin, all_pos = res
        else:
            cos, sin = res
            all_pos = None
        cos_list.append(cos)
        sin_list.append(sin)
        all_pos_list.append(all_pos)

    stacked_cos = torch.stack(cos_list, dim=0)
    stacked_sin = torch.stack(sin_list, dim=0)

    if return_all_pos:
        return stacked_cos, stacked_sin, all_pos_list

    return stacked_cos, stacked_sin


def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2:]
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`):
            The position indices of the tokens corresponding to the query and key tensors. For example, this can be
            used to pass shifted position ids when working with a KV-cache.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    if position_ids is not None:
        cos = cos[position_ids]
        sin = sin[position_ids]

    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed


# =======================================================
#     Modules for Image Generation
# =======================================================

class TimestepEmbedder(nn.Module):
    """
    Embeds scalar timesteps into vector representations.
    """
    def __init__(self,
                 hidden_size,
                 act_layer=nn.GELU,
                 frequency_embedding_size=256,
                 max_period=10000,
                 out_size=None,
                 dtype=None,
                 device=None
                 ):
        factory_kwargs = {'dtype': dtype, 'device': device}
        super().__init__()
        self.frequency_embedding_size = frequency_embedding_size
        self.max_period = max_period
        if out_size is None:
            out_size = hidden_size

        self.mlp = nn.Sequential(
            nn.Linear(frequency_embedding_size, hidden_size, bias=True, **factory_kwargs),
            act_layer(),
            nn.Linear(hidden_size, out_size, bias=True, **factory_kwargs),
        )
        nn.init.normal_(self.mlp[0].weight, std=0.02)
        nn.init.normal_(self.mlp[2].weight, std=0.02)

    def forward(self, t):
        t_freq = timestep_embedding(t, self.frequency_embedding_size, self.max_period).type(self.mlp[0].weight.dtype)
        t_emb = self.mlp(t_freq)
        return t_emb


class Upsample(nn.Module):
    """
    An upsampling layer with an optional convolution.

    :param channels: channels in the inputs and outputs.
    :param use_conv: a bool determining if a convolution is applied.
    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
                 upsampling occurs in the inner-two dimensions.
    """

    def __init__(self, channels, use_conv, dims=2, out_channels=None, device=None, dtype=None):
        factory_kwargs = {'device': device, 'dtype': dtype}
        super().__init__()
        self.channels = channels
        self.out_channels = out_channels or channels
        self.use_conv = use_conv
        self.dims = dims
        if use_conv:
            self.conv = conv_nd(dims, self.channels, self.out_channels, 3, padding=1, **factory_kwargs)

    def forward(self, x):
        assert x.shape[1] == self.channels
        if self.dims == 3:
            x = F.interpolate(
                x, (x.shape[2], x.shape[3] * 2, x.shape[4] * 2), mode="nearest"
            )
        else:
            x = F.interpolate(x, scale_factor=2, mode="nearest")
        if self.use_conv:
            x = self.conv(x)
        return x


class Downsample(nn.Module):
    """
    A downsampling layer with an optional convolution.

    :param channels: channels in the inputs and outputs.
    :param use_conv: a bool determining if a convolution is applied.
    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
                 downsampling occurs in the inner-two dimensions.
    """

    def __init__(self, channels, use_conv, dims=2, out_channels=None, device=None, dtype=None):
        factory_kwargs = {'device': device, 'dtype': dtype}
        super().__init__()
        self.channels = channels
        self.out_channels = out_channels or channels
        self.use_conv = use_conv
        self.dims = dims
        stride = 2 if dims != 3 else (1, 2, 2)
        if use_conv:
            self.op = conv_nd(
                dims, self.channels, self.out_channels, 3, stride=stride, padding=1, **factory_kwargs
            )
        else:
            assert self.channels == self.out_channels
            self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride)

    def forward(self, x):
        assert x.shape[1] == self.channels
        return self.op(x)


class ResBlock(nn.Module):
    """
    A residual block that can optionally change the number of channels.

    :param in_channels: the number of input channels.
    :param emb_channels: the number of timestep embedding channels.
    :param dropout: the rate of dropout.
    :param out_channels: if specified, the number of out channels.
    :param use_conv: if True and out_channels is specified, use a spatial
        convolution instead of a smaller 1x1 convolution to change the
        channels in the skip connection.
    :param dims: determines if the signal is 1D, 2D, or 3D.
    :param up: if True, use this block for upsampling.
    :param down: if True, use this block for downsampling.
    """

    def __init__(
        self,
        in_channels,
        emb_channels,
        out_channels=None,
        dropout=0.0,
        use_conv=False,
        dims=2,
        up=False,
        down=False,
        device=None,
        dtype=None,
    ):
        factory_kwargs = {'dtype': dtype, 'device': device}
        super().__init__()
        self.in_channels = in_channels
        self.dropout = dropout
        self.out_channels = out_channels or self.in_channels
        self.use_conv = use_conv

        self.in_layers = nn.Sequential(
            normalization(self.in_channels, **factory_kwargs),
            nn.SiLU(),
            conv_nd(dims, self.in_channels, self.out_channels, 3, padding=1, **factory_kwargs),
        )

        self.updown = up or down

        if up:
            self.h_upd = Upsample(self.in_channels, False, dims, **factory_kwargs)
            self.x_upd = Upsample(self.in_channels, False, dims, **factory_kwargs)
        elif down:
            self.h_upd = Downsample(self.in_channels, False, dims, **factory_kwargs)
            self.x_upd = Downsample(self.in_channels, False, dims, **factory_kwargs)
        else:
            self.h_upd = self.x_upd = nn.Identity()

        self.emb_layers = nn.Sequential(
            nn.SiLU(),
            linear(emb_channels, 2 * self.out_channels, **factory_kwargs)
        )

        self.out_layers = nn.Sequential(
            normalization(self.out_channels, **factory_kwargs),
            nn.SiLU(),
            nn.Dropout(p=dropout),
            zero_module(
                conv_nd(dims, self.out_channels, self.out_channels, 3, padding=1, **factory_kwargs)
            ),
        )

        if self.out_channels == self.in_channels:
            self.skip_connection = nn.Identity()
        elif use_conv:
            self.skip_connection = conv_nd(
                dims, self.in_channels, self.out_channels, 3, padding=1, **factory_kwargs
            )
        else:
            self.skip_connection = conv_nd(dims, self.in_channels, self.out_channels, 1, **factory_kwargs)

    def forward(self, x, emb):
        if self.updown:
            in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]
            h = in_rest(x)
            h = self.h_upd(h)
            x = self.x_upd(x)
            h = in_conv(h)
        else:
            h = self.in_layers(x)

        emb_out = self.emb_layers(emb)
        while len(emb_out.shape) < len(h.shape):
            emb_out = emb_out[..., None]

        # Adaptive Group Normalization
        out_norm, out_rest = self.out_layers[0], self.out_layers[1:]
        scale, shift = torch.chunk(emb_out, 2, dim=1)
        h = out_norm(h) * (1. + scale) + shift
        h = out_rest(h)

        return self.skip_connection(x) + h


class UNetDown(nn.Module):
    """
    patch_size: one of [1, 2 ,4 ,8]
    in_channels: vae latent dim
    hidden_channels: hidden dim for reducing parameters
    out_channels: transformer model dim
    """
    def __init__(self, patch_size, in_channels, emb_channels, hidden_channels, out_channels,
                 dropout=0.0, device=None, dtype=None):
        factory_kwargs = {'dtype': dtype, 'device': device}
        super().__init__()

        self.patch_size = patch_size
        assert self.patch_size in [1, 2, 4, 8]

        self.model = nn.ModuleList(
            [conv_nd(
                2,
                in_channels=in_channels,
                out_channels=hidden_channels,
                kernel_size=3,
                padding=1,
                **factory_kwargs
            )]
        )

        if self.patch_size == 1:
            self.model.append(ResBlock(
                in_channels=hidden_channels,
                emb_channels=emb_channels,
                out_channels=out_channels,
                dropout=dropout,
                **factory_kwargs
            ))
        else:
            for i in range(self.patch_size // 2):
                self.model.append(ResBlock(
                    in_channels=hidden_channels,
                    emb_channels=emb_channels,
                    out_channels=hidden_channels if (i + 1) * 2 != self.patch_size else out_channels,
                    dropout=dropout,
                    down=True,
                    **factory_kwargs
                ))

    def forward(self, x, t):
        assert x.shape[2] % self.patch_size == 0 and x.shape[3] % self.patch_size == 0
        for module in self.model:
            if isinstance(module, ResBlock):
                x = module(x, t)
            else:
                x = module(x)
        _, _, token_h, token_w = x.shape
        x = rearrange(x, 'b c h w -> b (h w) c')
        return x, token_h, token_w


class UNetUp(nn.Module):
    """
    patch_size: one of [1, 2 ,4 ,8]
    in_channels: transformer model dim
    hidden_channels: hidden dim for reducing parameters
    out_channels: vae latent dim
    """
    def __init__(self, patch_size, in_channels, emb_channels, hidden_channels, out_channels,
                 dropout=0.0, device=None, dtype=None, out_norm=False):
        factory_kwargs = {'dtype': dtype, 'device': device}
        super().__init__()

        self.patch_size = patch_size
        assert self.patch_size in [1, 2, 4, 8]

        self.model = nn.ModuleList()

        if self.patch_size == 1:
            self.model.append(ResBlock(
                in_channels=in_channels,
                emb_channels=emb_channels,
                out_channels=hidden_channels,
                dropout=dropout,
                **factory_kwargs
            ))
        else:
            for i in range(self.patch_size // 2):
                self.model.append(ResBlock(
                    in_channels=in_channels if i == 0 else hidden_channels,
                    emb_channels=emb_channels,
                    out_channels=hidden_channels,
                    dropout=dropout,
                    up=True,
                    **factory_kwargs
                ))

        if out_norm:
            self.model.append(nn.Sequential(
                normalization(hidden_channels, **factory_kwargs),
                nn.SiLU(),
                conv_nd(
                    2,
                    in_channels=hidden_channels,
                    out_channels=out_channels,
                    kernel_size=3,
                    padding=1,
                    **factory_kwargs
                ),
            ))
        else:
            self.model.append(conv_nd(
                2,
                in_channels=hidden_channels,
                out_channels=out_channels,
                kernel_size=3,
                padding=1,
                **factory_kwargs
            ))

    # batch_size, seq_len, model_dim
    def forward(self, x, t, token_h, token_w):
        x = rearrange(x, 'b (h w) c -> b c h w', h=token_h, w=token_w)
        for module in self.model:
            if isinstance(module, ResBlock):
                x = module(x, t)
            else:
                x = module(x)
        return x


# =======================================================
#     Modules for Transformer Backbone
# =======================================================

@dataclass
class CausalMMOutputWithPast(CausalLMOutputWithPast):
    diffusion_prediction: Optional[torch.Tensor] = None


class HunyuanStaticCache(StaticCache):
    """
    A custom static cache for multi-modal models that supports dynamic extension of the cache
    and inplace updates of the cache.

    This cache supports batch cache_position updates.
    """
    def __init__(self, *args, **kwargs):
        self.dynamic = kwargs.pop("dynamic", False)
        super().__init__(*args, **kwargs)

    def update(
        self,
        key_states: torch.Tensor,
        value_states: torch.Tensor,
        layer_idx: int,
        cache_kwargs: Optional[Dict[str, Any]] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.
        It is VERY important to index using a tensor, otherwise you introduce a copy to the device.

        Parameters:
            key_states (`torch.Tensor`):
                The new key states to cache.
            value_states (`torch.Tensor`):
                The new value states to cache.
            layer_idx (`int`):
                The index of the layer to cache the states for.
            cache_kwargs (`Dict[str, Any]`, `optional`):
                Additional arguments for the cache subclass. The `StaticCache` needs the `cache_position` input
                to know how where to write in the cache.

        Return:
            A tuple containing the updated key and value states.
        """
        cache_position = cache_kwargs.get("cache_position")
        if self.layers[layer_idx].keys is None:
            self.layers[layer_idx].lazy_initialization(key_states)
        k_out = self.layers[layer_idx].keys
        v_out = self.layers[layer_idx].values

        if cache_position is None:
            k_out.copy_(key_states)
            v_out.copy_(value_states)
        else:
            # Note: here we use `tensor.index_copy_(dim, index, tensor)` that is equivalent to
            # `tensor[:, :, index] = tensor`, but the first one is compile-friendly and it does explicitly an in-place
            # operation, that avoids copies and uses less memory.
            if cache_position.dim() == 1:
                k_out.index_copy_(2, cache_position, key_states)
                v_out.index_copy_(2, cache_position, value_states)

                if self.dynamic:
                    end = cache_position[-1].item() + 1
                    k_out = k_out[:, :, :end]
                    v_out = v_out[:, :, :end]
            else:
                assert cache_position.dim() == 2, f"multiple batch dims not yet {cache_position.shape=}"
                batch_size, idx_size = cache_position.shape
                assert batch_size == k_out.size(0)
                assert batch_size == v_out.size(0)
                assert batch_size == key_states.size(0)
                assert batch_size == value_states.size(0)
                for i in range(batch_size):
                    unbatched_dim = 1
                    k_out[i].index_copy_(unbatched_dim, cache_position[i], key_states[i])
                    v_out[i].index_copy_(unbatched_dim, cache_position[i], value_states[i])

                if self.dynamic:
                    assert len(cache_position) == 1
                    end = cache_position[0, -1].item() + 1
                    k_out = k_out[:, :, :end]
                    v_out = v_out[:, :, :end]

        return k_out, v_out


class CachedRoPE(object):
    """ A 2D RoPE is determined by rope_image_info and seq_len. """

    def __init__(self, config):
        self.config = config
        self.cos_cache = None
        self.sin_cache = None
        self.seq_len = None
        self.rope_image_info = None

    def __call__(self, seq_len, device, rope_image_info=None, position_ids=None):
        """ Get cached RoPE for given seq_len and rope_image_info.
        If cache miss, compute and cache it.

        Args:
            seq_len (int): The sequence length.
            device (torch.device): The device to store the RoPE.
            rope_image_info (list): The rope image info. list of lists of (slice, (height, width)) tuples.
            position_ids (torch.Tensor): The input positions.

        Returns:
            The RoPE cos and sin tensors.
        """
        if (self.seq_len != seq_len) or (rope_image_info is not None and self.rope_image_info != rope_image_info):
            # Cache miss, compute RoPE
            if self.config.rope_type in ["2d", "default"]:
                self.cos_cache, self.sin_cache = build_batch_2d_rope(
                    image_infos=rope_image_info,
                    seq_len=seq_len,
                    n_elem=self.config.attention_head_dim,
                    device=device,
                    base=self.config.rope_theta,
                )
            else:
                raise NotImplementedError(f"rope_type `{self.config.rope_type}` not supported")
        else:
            # hit cache
            pass

        if position_ids is None:
            # Typically for training
            cos, sin = self.cos_cache, self.sin_cache
        else:
            # Typically for inference
            assert position_ids.dim() == 2, f"{position_ids.shape=}"
            head_size = self.cos_cache.size(-1)
            cos = torch.gather(self.cos_cache, dim=1, index=position_ids.unsqueeze(-1).expand(-1, -1, head_size))
            sin = torch.gather(self.sin_cache, dim=1, index=position_ids.unsqueeze(-1).expand(-1, -1, head_size))

        return cos, sin


class HunyuanRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6, cast_weight_fp32=False):
        """
        HunyuanRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps
        self.cast_weight_fp32 = cast_weight_fp32

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        if self.cast_weight_fp32:
            return (self.weight.float() * hidden_states).to(input_dtype)
        else:
            return self.weight * hidden_states.to(input_dtype)


class HunyuanMLP(nn.Module):
    def __init__(self, config: HunyuanImage3Config, layer_idx=None, is_shared_mlp=False, is_moe=False):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.hidden_size = config.hidden_size
        self.hidden_act = config.hidden_act

        self.intermediate_size = config.intermediate_size
        if is_shared_mlp or is_moe:
            # 如果是 moe 的话，优先用 moe_intermediate_size
            if config.moe_intermediate_size is not None:
                self.intermediate_size = config.moe_intermediate_size \
                    if isinstance(config.moe_intermediate_size, int) else config.moe_intermediate_size[layer_idx]

            if is_shared_mlp:
                num_shared_expert = config.num_shared_expert \
                    if isinstance(config.num_shared_expert, int) else config.num_shared_expert[layer_idx]
                self.intermediate_size *= num_shared_expert

        self.act_fn = ACT2FN[config.hidden_act]
        if self.hidden_act == "silu":
            self.intermediate_size *= 2  # SwiGLU
            self.gate_and_up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
            self.down_proj = nn.Linear(self.intermediate_size // 2, self.hidden_size, bias=config.mlp_bias)
        elif self.hidden_act == "gelu":
            self.gate_and_up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
            self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
        else:
            assert False, "other hidden_act are not supported"

    def forward(self, x):
        if self.hidden_act == "silu":
            gate_and_up_proj = self.gate_and_up_proj(x)
            x1, x2 = gate_and_up_proj.chunk(2, dim=-1)
            down_proj = self.down_proj(x1 * self.act_fn(x2))
            return down_proj
        elif self.hidden_act == "gelu":
            intermediate = self.gate_and_up_proj(x)
            intermediate = self.act_fn(intermediate)
            output = self.down_proj(intermediate)
            return output
        else:
            assert False, "other hidden_act are not supported"


class HunyuanTopKGate(nn.Module):
    def __init__(self, config: HunyuanImage3Config, layer_idx: Optional[int] = None):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.moe_topk = config.moe_topk if isinstance(config.moe_topk, int) else config.moe_topk[layer_idx]
        self.drop_tokens = config.moe_drop_tokens
        self.min_capacity = 8
        self.random_routing_dropped_token = config.moe_random_routing_dropped_token
        num_experts = config.num_experts if isinstance(config.num_experts, int) else config.num_experts[layer_idx]
        self.wg = nn.Linear(config.hidden_size, num_experts, bias=False, dtype=torch.float32)

        # DeepSeek gating args
        self.routed_scaling_factor = config.routed_scaling_factor
        self.n_group = config.n_group
        self.topk_group = config.topk_group
        self.norm_topk_prob = config.norm_topk_prob
        self.group_limited_greedy = config.group_limited_greedy

    def forward(self, hidden_states, topk_impl='default'):
        bsz, seq_len, hidden_size = hidden_states.shape
        hidden_states = hidden_states.reshape(-1, hidden_size)
        if self.wg.weight.dtype == torch.float32:
            hidden_states = hidden_states.float()
        logits = self.wg(hidden_states)
        if topk_impl == 'default':
            gate_output = topkgating(logits, self.moe_topk, group_limited_greedy=self.group_limited_greedy,
                                     n_group=self.n_group, topk_group=self.topk_group,
                                     norm_topk_prob=self.norm_topk_prob,
                                     routed_scaling_factor=self.routed_scaling_factor,
                                     capacity_factor=self.config.capacity_factor,
                                     drop_tokens=self.drop_tokens)
        elif topk_impl == 'easy':
            gate_output = self.easy_topk(logits, self.moe_topk)
        else:
            raise ValueError(f"Unsupported topk_impl: {topk_impl}")

        return gate_output

    @staticmethod
    def easy_topk(logits, moe_topk):
        gates = F.softmax(logits, dim=1)
        topk_weight_1, expert_index = torch.topk(gates, moe_topk)
        weight_sums = topk_weight_1.sum(dim=1, keepdim=True)
        weight_sums = torch.clamp(weight_sums, min=1e-8)
        topk_weight = topk_weight_1 / weight_sums

        return topk_weight, expert_index


class HunyuanMoE(nn.Module):
    def __init__(self, config: HunyuanImage3Config, layer_idx: Optional[int] = None):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.moe_topk = config.moe_topk if isinstance(config.moe_topk, int) else config.moe_topk[layer_idx]
        self.num_experts = config.num_experts if isinstance(config.num_experts, int) else config.num_experts[layer_idx]
        if config.use_mixed_mlp_moe:
            self.shared_mlp = HunyuanMLP(config, layer_idx=layer_idx, is_shared_mlp=True)
        self.gate = HunyuanTopKGate(config, layer_idx=layer_idx)
        self.experts = nn.ModuleList(
            [HunyuanMLP(config, layer_idx=layer_idx, is_shared_mlp=False, is_moe=True) for _ in range(self.num_experts)]
        )

        self._moe_impl = config.moe_impl
        # For FlashInfer
        self.moe_weight = None
        self.moe_weight_2 = None
        self._weights_initialized = False

    @property
    def moe_impl(self):
        return self._moe_impl

    @moe_impl.setter
    def moe_impl(self, value):
        self._moe_impl = value
        if self._moe_impl == "flashinfer":
            assert flashinfer is not None, "When using fused_moe, flashinfer must be installed."

    def forward(self, hidden_states):
        torch.cuda.set_device(hidden_states.device.index)
        bsz, seq_len, hidden_size = hidden_states.shape
        input_hidden_states = hidden_states

        if self.config.use_mixed_mlp_moe:
            hidden_states_mlp = self.shared_mlp(hidden_states)

        reshaped_input = hidden_states.reshape(-1, hidden_size) # [bsz*seq_len, hidden_size]

        with nvtx.range("MoE"):
            if self._moe_impl == "flashinfer":
                # Get expert weights
                if not self._weights_initialized:
                    self._initialize_weights_on_device(hidden_states.device)
                topk_weight, topk_index = self.gate(hidden_states, topk_impl='easy')

                combined_output = torch.zeros_like(reshaped_input)
                _ = flashinfer.fused_moe.cutlass_fused_moe(     # noqa
                    reshaped_input.contiguous(),
                    topk_index.to(torch.int).contiguous(),
                    topk_weight.to(torch.float).contiguous(),
                    self.moe_weight,
                    self.moe_weight_2,
                    torch.bfloat16,
                    output=combined_output,
                    quant_scales=None,
                )
                combined_output = combined_output.reshape(bsz, seq_len, hidden_size)
            else:
                # DeepSeekMoE implementation
                # Reference: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat/blob/main/modeling_deepseek.py#L375
                with torch.autocast('cuda', enabled=False):
                    topk_weights, topk_idx = self.gate(hidden_states, topk_impl='easy')
                # Cast back to the input dtype
                topk_weights = topk_weights.to(hidden_states.dtype)

                # Flatten for easier indexing
                flat_topk_idx = topk_idx.view(-1)
                hidden_states_flat = input_hidden_states.view(-1, hidden_size)    # (bsz * seq_len, hidden_size)
                hidden_states_repeated = hidden_states_flat.repeat_interleave(self.moe_topk, dim=0)  # (bsz * seq_len * k, hidden_size)

                # Forward through experts
                expert_outputs = torch.zeros_like(hidden_states_repeated, dtype=hidden_states_repeated.dtype, device=hidden_states_repeated.device)
                for i in range(self.num_experts):
                    expert_mask = (flat_topk_idx == i)
                    selected_inputs = hidden_states_repeated[expert_mask]
                    expert_output = self.experts[i](selected_inputs)    # compatible with zero tensor
                    expert_outputs[expert_mask] = expert_output

                # Weighted sum of expert outputs
                combined_output = (expert_outputs.view(
                    bsz * seq_len, self.moe_topk, hidden_size) * topk_weights.unsqueeze(-1)).sum(dim=1)  # (bsz * seq_len, hidden_size)
                combined_output = combined_output.to(hidden_states.dtype).view(bsz, seq_len, hidden_size)

        if self.config.use_mixed_mlp_moe:
            output = hidden_states_mlp + combined_output    # noqa
        else:
            output = combined_output

        return output

    def _initialize_weights_on_device(self, device):
        expert_weights_gate_up = []
        expert_weights_down = []

        for expert in self.experts:
            expert.to(device)
            expert_weights_gate_up.append(expert.gate_and_up_proj.weight.to(device))
            expert_weights_down.append(expert.down_proj.weight.to(device))

        self.moe_weight = torch.stack(expert_weights_gate_up).contiguous()
        self.moe_weight_2 = torch.stack(expert_weights_down).contiguous()
        # empty the expert weights
        for expert in self.experts:
            expert.gate_and_up_proj.weight.data = torch.empty(0, device=device)
            if expert.gate_and_up_proj.bias is not None:
                expert.gate_and_up_proj.bias.data = torch.empty(0, device=device)
            expert.down_proj.weight.data = torch.empty(0, device=device)
            if expert.down_proj.bias is not None:
                expert.down_proj.bias.data = torch.empty(0, device=device)

        self._weights_initialized = True


class HunyuanImage3SDPAAttention(nn.Module):
    """PyTorch SDPA attention implementation using torch.nn.functional.scaled_dot_product_attention"""

    def __init__(self, config: HunyuanImage3Config, layer_idx: int):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.attention_type = 'self'

        self.attention_dropout = config.attention_dropout
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        # self.head_dim = self.hidden_size // self.num_heads
        self.head_dim: int = config.attention_head_dim
        self.num_key_value_heads = config.num_key_value_heads if config.num_key_value_heads else self.num_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.is_causal = True
        self.use_qk_norm = config.use_qk_norm
        self.use_rotary_pos_emb = config.use_rotary_pos_emb
        self.hidden_size_q = self.head_dim * self.num_heads
        self.hidden_size_kv = self.head_dim * self.num_key_value_heads

        # define layers
        self.qkv_proj = nn.Linear(
            self.hidden_size,
            self.hidden_size_q + 2 * self.hidden_size_kv,
            bias=config.attention_bias
        )
        self.o_proj = nn.Linear(self.hidden_size_q, self.hidden_size, bias=config.attention_bias)

        if self.use_qk_norm:
            self.query_layernorm = HunyuanRMSNorm(self.head_dim, eps=config.rms_norm_eps)
            self.key_layernorm = HunyuanRMSNorm(self.head_dim, eps=config.rms_norm_eps)

        if self.use_rotary_pos_emb:
            self._init_rope()

    def _init_rope(self):
        scaling_type = self.config.rope_scaling["type"]
        if scaling_type == "custom":
            # Using custom rotary embedding
            self.rotary_emb = None
        else:
            raise ValueError(f"Unknown RoPE scaling type {scaling_type}")

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return tensor.reshape(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
            self,
            hidden_states: torch.Tensor,
            attention_mask: Optional[torch.Tensor] = None,
            position_ids: Optional[torch.LongTensor] = None,
            past_key_value: Optional[Cache] = None,
            output_attentions: bool = False,
            use_cache: Optional[bool] = False,
            custom_pos_emb: Optional[Tuple[torch.FloatTensor]] = None,
            **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Cache]]:
        if output_attentions:
            raise NotImplementedError(
                'HunyuanImage3Model is using HunyuanImage3SDPAAttention,'
                'but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`.'
            )

        bsz, q_len, _ = hidden_states.size()

        qkv_states = self.qkv_proj(hidden_states)
        qkv_states = qkv_states.reshape(bsz, q_len, self.num_key_value_heads, self.num_key_value_groups + 2,
                                        self.head_dim)
        query_states, key_states, value_states = torch.split(qkv_states, [self.num_key_value_groups, 1, 1], dim=3)

        query_states = query_states.reshape(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.reshape(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.reshape(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        if self.use_rotary_pos_emb:
            cos, sin = custom_pos_emb
            query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if self.use_qk_norm:
            query_states = self.query_layernorm(query_states)
            key_states = self.key_layernorm(key_states)

        query_states = query_states.to(value_states.dtype)
        key_states = key_states.to(value_states.dtype)

        if past_key_value is not None:
            cache_kwargs = {"cache_position": position_ids}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
            query_states = query_states.to(key_states.dtype)

        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with
        # custom attn_mask,
        # Reference: https://github.com/pytorch/pytorch/issues/112577.
        if query_states.device.type == "cuda" and attention_mask is not None:
            query_states = query_states.contiguous()
            key_states = key_states.contiguous()
            value_states = value_states.contiguous()
        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query_states, key_states, value_states, attn_mask=attention_mask, dropout_p=0.0
        )
        attn_output = attn_output.transpose(1, 2).contiguous()

        attn_output = attn_output.reshape(bsz, q_len, -1)

        attn_output = self.o_proj(attn_output)

        return attn_output, None, past_key_value


Hunyuan_ATTENTION_CLASSES = {
    "eager": HunyuanImage3SDPAAttention,
    "sdpa": HunyuanImage3SDPAAttention,
}


class HunyuanImage3DecoderLayer(nn.Module):
    def __init__(self, config: HunyuanImage3Config, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.layer_idx = layer_idx

        attn_impl = config._attn_implementation     # noqa
        if attn_impl in Hunyuan_ATTENTION_CLASSES:
            self.self_attn = Hunyuan_ATTENTION_CLASSES[attn_impl](config=config, layer_idx=layer_idx)
        else:
            raise ValueError(f"Unsupported attention implementation: {attn_impl}")

        if ((isinstance(config.num_experts, int) and config.num_experts > 1) or (
                isinstance(config.num_experts, list) and max(
                config.num_experts) > 1)) and layer_idx >= config.moe_layer_num_skipped:
            self.mlp = HunyuanMoE(config, layer_idx=layer_idx)
        else:
            self.mlp = HunyuanMLP(config, layer_idx=layer_idx, is_shared_mlp=False, is_moe=False)
        if config.norm_type == 'hf_rms' or config.norm_type == 'rms':
            self.input_layernorm = HunyuanRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
            self.post_attention_layernorm = HunyuanRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        elif config.norm_type == 'fused' or config.norm_type == 'torch_nn':
            self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps)
            self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps)
        else:
            assert False, "other norm_type are not supported"

    def forward(
            self,
            hidden_states: torch.Tensor,
            attention_mask: Optional[torch.Tensor] = None,
            position_ids: Optional[torch.LongTensor] = None,
            past_key_value: Optional[Tuple[torch.Tensor]] = None,
            output_attentions: Optional[bool] = False,
            use_cache: Optional[bool] = False,
            custom_pos_emb: Optional[Tuple[torch.FloatTensor]] = None,
            **kwargs,
    ) -> Tuple[torch.FloatTensor | Any]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            position_ids (`torch.LongTensor`, *optional*):
                Indices of positions of each input sequence tokens in the position embeddings.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            custom_pos_emb (`Tuple[torch.FloatTensor]`, *optional*): custom position embedding for rotary
                position embedding
        """
        if "padding_mask" in kwargs:
            warnings.warn(
                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use "
                "`attention_mask` instead.`"
            )
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
        # Self Attention
        hidden_states, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            custom_pos_emb=custom_pos_emb,
            **kwargs,
        )
        hidden_states = residual + hidden_states
        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)

        hidden_states = residual + hidden_states
        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        if use_cache:
            outputs += (present_key_value,)

        return outputs


@add_start_docstrings(
    "The bare Hunyuan Image 3 Model outputting raw hidden-states without any specific head on top.",
    Hunyuan_START_DOCSTRING,
)
class HunyuanImage3PreTrainedModel(PreTrainedModel):
    config_class = HunyuanImage3Config
    base_model_prefix = ""
    supports_gradient_checkpointing = True
    _no_split_modules = ["HunyuanImage3DecoderLayer"]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_cache_class = True

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()


Hunyuan_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
            `past_key_values`).

            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`.

            [What are position IDs?](../glossary#position-ids)
        past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.

            Two formats are allowed:
            - a [`~cache_utils.Cache`] instance;
            - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
            shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
            cache format.

            The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
            legacy cache format will be returned.

            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
            of shape `(batch_size, sequence_length)`.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare Hunyuan Model outputting raw hidden-states without any specific head on top.",
    Hunyuan_START_DOCSTRING,
)
class HunyuanImage3Model(HunyuanImage3PreTrainedModel):
    def __init__(self, config: HunyuanImage3Config):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size
        self.add_classification_head = config.add_classification_head
        self.wte = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList(
            [HunyuanImage3DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        if not config.add_classification_head:
            self.ln_f = HunyuanRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        # Initialize weights and apply final processing
        self.post_init()

        self.shared_tensor = None

    @add_start_docstrings_to_model_forward(Hunyuan_INPUTS_DOCSTRING)
    def forward(
            self,
            input_ids: torch.LongTensor = None,
            attention_mask: Optional[torch.Tensor] = None,
            position_ids: Optional[torch.LongTensor] = None,
            past_key_values: Optional[List[torch.FloatTensor]] = None,
            inputs_embeds: Optional[torch.FloatTensor] = None,
            use_cache: Optional[bool] = None,
            output_attentions: Optional[bool] = None,
            output_hidden_states: Optional[bool] = None,
            return_dict: Optional[bool] = None,
            custom_pos_emb: Optional[Tuple[torch.FloatTensor]] = None,
            mode: str = "gen_text",
            first_step: Optional[bool] = None,
            post_token_len: int = None,
            num_image_tokens: int = None,
            gen_timestep_scatter_index: Optional[torch.Tensor] = None,
            num_special_tokens: int = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if inputs_embeds is None:
            inputs_embeds = self.wte(input_ids)

        # embed positions
        hidden_states = inputs_embeds

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = None

        for layer_idx, decoder_layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                custom_pos_emb=custom_pos_emb,
                mode=mode,
                first_step=first_step,
            )

            hidden_states = layer_outputs[0]
            if use_cache:
                next_decoder_cache = layer_outputs[2 if output_attentions else 1]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        if not self.add_classification_head:
            # Do ln_f outside of the model for compatibility with image generation.
            pass
            # hidden_states = self.ln_f(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = None
        if use_cache:
            next_cache = next_decoder_cache
        if not return_dict:
            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )


class HunyuanImage3ForCausalMM(HunyuanImage3PreTrainedModel, GenerationMixin):
    def __init__(self, config: HunyuanImage3Config, skip_load_module:set[str]={}, use_dist_vae=False, wgt_path=""):
        """HunyuanImage3ForCausalMM

        Args:
            config (HunyuanImage3Config): model config to initialize the model
            skip_load_module (set[str], optional):
                modules to skip loading, used for vllm inference. Defaults to {}.

        Raises:
            ValueError: if config is invalid
        """
        super().__init__(config)
        self.config = config
        self._tokenizer: Optional[HunyuanImage3TokenizerFast] = None

        #self.generation_config = GenerationConfig.from_model_config(config)

        # Initialize image preprocessor (for conditional images)
        self.image_processor = HunyuanImage3ImageProcessor(config)

        if 'all' in skip_load_module:
            skip_load_module = {
                'vae',
                'vit',
                'timestep_emb',
                'patch_embed',
                'time_embed',
                'final_layer',
                'time_embed_2',
                'transformers',
            }
        if 'vae' not in skip_load_module:
            # vae and gen_image pipeline
            if not use_dist_vae:
                self.vae = AutoencoderKLConv3D.from_config(config.vae)
                self.vae_dtype = getattr(torch, config.vae_dtype)
                self.vae_autocast_dtype = getattr(torch, config.vae_autocast_dtype)
                self.vae = self.vae.eval()
                for param in self.vae.parameters():
                    param.requires_grad = False  #
            else:
                self.vae = AutoencoderKLConv3D_Dist.from_config(config.vae)
                self.vae_dtype = getattr(torch, config.vae_dtype)
                self.vae_autocast_dtype = getattr(torch, config.vae_autocast_dtype)
                self.vae.create_dist(wgt_path, config.vae)
        self._pipeline = None

        if 'vit' not in skip_load_module:
            # vit
            self.vision_model = Siglip2VisionTransformer(config.vit)
            self.vision_aligner = LightProjector(config.vit_aligner)

        if 'timestep_emb' not in skip_load_module:
            # image generation related
            self.timestep_emb = TimestepEmbedder(hidden_size=config.hidden_size)

        if self.config.cfg_distilled:
            self.guidance_emb = TimestepEmbedder(hidden_size=config.hidden_size)
        if self.config.use_meanflow: 
            self.timestep_r_emb = TimestepEmbedder(hidden_size=config.hidden_size)

        if config.img_proj_type == "unet":
            if 'patch_embed' not in skip_load_module:
                self.patch_embed = UNetDown(
                    patch_size=config.patch_size,
                    emb_channels=config.hidden_size,
                    in_channels=config.vae["latent_channels"],
                    hidden_channels=config.patch_embed_hidden_dim,
                    out_channels=config.hidden_size,
                )
            if 'time_embed' not in skip_load_module:
                self.time_embed = TimestepEmbedder(hidden_size=config.hidden_size)

            if 'final_layer' not in skip_load_module:
                self.final_layer = UNetUp(
                    patch_size=config.patch_size,
                    emb_channels=config.hidden_size,
                    in_channels=config.hidden_size,
                    hidden_channels=config.patch_embed_hidden_dim,
                    out_channels=config.vae["latent_channels"],
                    out_norm=True,
                )
            if 'time_embed_2' not in skip_load_module:
                self.time_embed_2 = TimestepEmbedder(hidden_size=config.hidden_size)
        else:
            raise ValueError(f"Unknown img_proj_type {config.img_proj_type}")

        if 'transformers' not in skip_load_module:
            # transformer backbone
            self.model = HunyuanImage3Model(config)
            # linear head
            self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        self.pad_id = config.pad_id
        self.vocab_size = config.vocab_size
      
        # Taylor Cache
        self.use_taylor_cache = False

        self.num_image_tokens = None
        self.num_special_tokens = None
        # Initialize cached rope, supporting automatic cache update
        self.cached_rope = CachedRoPE(config)

        # Initialize weights and apply final processing
        self.post_init()

    @classmethod
    def from_config(cls, config: HunyuanImage3Config,  skip_load_module:set[str]={}):
        return cls(config, skip_load_module=skip_load_module)

    @property
    def tokenizer(self):
        if self._tokenizer is None:
            raise ValueError("Attribute `tokenizer` has not been initialized yet. Please set it first.")
        return self._tokenizer

    def load_tokenizer(self, tokenizer):
        self._tokenizer = HunyuanImage3TokenizerFast.from_pretrained(tokenizer, model_version=self.config.model_version)

    @property
    def pipeline(self):
        if self._pipeline is None:
            self.scheduler = FlowMatchDiscreteScheduler(
                shift=self.generation_config.flow_shift, reverse=True, solver="euler",
            )
            self._pipeline = HunyuanImage3Text2ImagePipeline(
                model=self, scheduler=self.scheduler, vae=self.vae,
            )
        return self._pipeline

    def instantiate_vae_image_tokens(
            self,
            hidden_states: torch.Tensor,
            timesteps: BatchRaggedTensor,
            images: BatchRaggedImages,
            image_mask: torch.Tensor,
            guidance: torch.Tensor = None,
            timesteps_r: torch.Tensor = None,
    ):
        """
        Instantiate the VAE image embeddings into the input embedding sequence.

        Args:
            hidden_states: input sequence, (batch_size, seq_len, n_embd)
            images: BatchRaggedImages
                images can be a 4-D tensor, or a list of 4-D tensors, or a list of lists of 3-D tensors.
            timesteps: BatchRaggedTensor
                ts can be a 1-D tensor, or a list of 1-D tensors
            image_mask: (batch_size, seq_len)
        """
        if hidden_states is None:
            # Only for inference in non-first step image generation
            t_emb = self.time_embed(timesteps)
            image_emb = self.patch_embed(images, t_emb)[0]
            timestep_emb = self.timestep_emb(timesteps).reshape(images.size(0), -1, self.config.hidden_size)
            cat_list = [timestep_emb, image_emb]
            
            if guidance is not None:
                guidance_src = self.guidance_emb(guidance.reshape(-1))    # (bsz * n, n_embd)
                guidance_emb = guidance_src.reshape(images.size(0), -1, self.config.hidden_size)
            if timesteps_r is not None:
                timesteps_r_src = self.timestep_r_emb(timesteps_r.reshape(-1))    # (bsz * n, n_embd)
                timesteps_r_emb = timesteps_r_src.reshape(images.size(0), -1, self.config.hidden_size)

            if guidance is not None and timesteps_r is not None:
                cat_list = [timestep_emb, guidance_emb, timesteps_r_emb, image_emb]
            elif guidance is not None:
                cat_list = [timestep_emb, guidance_emb, image_emb]
            elif timesteps_r is not None:
                cat_list = [timestep_emb, timesteps_r_emb, image_emb]
            hidden_states = torch.cat(cat_list, dim=1)
            return hidden_states

        bsz, seqlen, n_embd = hidden_states.shape
        assert isinstance(images, (torch.Tensor, list)), f"images should be BatchRaggedImages, got {type(images)}"

        if isinstance(images, torch.Tensor):
            assert images.ndim == 4, f"images should be a 4-D tensor, got {images.ndim}-D tensor"
            assert isinstance(timesteps, torch.Tensor), f"timesteps should be 1-D tensor, got {type(timesteps)}"

            bsz, seqlen, n_embd = hidden_states.shape
            index = torch.arange(seqlen, device=hidden_states.device).unsqueeze(0).repeat(bsz, 1)   # (bsz, seqlen)
            t_emb = self.time_embed(timesteps)     # (bsz, n_embd)
            image_seq, token_h, token_w = self.patch_embed(images, t_emb)   # (bsz, num_patches, n_embd)
            image_scatter_index = index.masked_select(image_mask.bool()).reshape(bsz, -1)   # (bsz, num_patches)
            hidden_states.scatter_(
                dim=1,
                index=image_scatter_index.unsqueeze(-1).repeat(1, 1, n_embd),
                src=image_seq,
            )

        else:   # list
            index = torch.arange(seqlen, device=hidden_states.device).unsqueeze(0).repeat(bsz, 1)   # (bsz, seqlen)
            for i, (image_i, t_i) in enumerate(zip(images, timesteps)):
                t_i_emb = self.time_embed(t_i)      # (n_i, n_embd)

                if isinstance(image_i, torch.Tensor):
                    image_i_seq, _, _ = self.patch_embed(image_i, t_i_emb)  # (n_i, num_patches, n_embd)

                elif isinstance(image_i, list):
                    image_i_seq_list = []
                    for j in range(len(image_i)):
                        image_ij = image_i[j].unsqueeze(0)
                        assert image_ij.ndim == 4, \
                            f"image_ij should have size of (1, C, H, W), got {list(image_ij.size())}"
                        image_i_seq_j = self.patch_embed(image_ij, t_i_emb[j:j + 1])[0]  # (1, num_patches, n_embd)
                        image_i_seq_list.append(image_i_seq_j)
                    image_i_seq = torch.cat(image_i_seq_list, dim=1)    # (1, Σj num_patches_j, n_embd)

                else:
                    raise TypeError(f"image_i should be a torch.Tensor or a list, got {type(image_i)}")

                image_i_index = index[i:i + 1].masked_select(image_mask[i:i + 1].bool()).reshape(1, -1)  # (1, img_seqlen)
                hidden_states[i:i + 1].scatter_(
                    dim=1,
                    index=image_i_index.unsqueeze(-1).repeat(1, 1, n_embd),
                    src=image_i_seq.reshape(1, -1, n_embd),  # (1, img_seqlen, n_embd)
                )

        return hidden_states

    def _forward_vision_encoder(self, images, **image_kwargs):
        image_embeds = self.vision_model(images, **image_kwargs).last_hidden_state
        image_embeds = self.vision_aligner(image_embeds)

        return image_embeds

    def instantiate_vit_image_tokens(
            self,
            hidden_states: torch.Tensor,
            images: torch.Tensor | list[torch.Tensor],
            image_masks: torch.Tensor,
            image_kwargs: dict[str, torch.Tensor],
    ):
        """
        Encode images using vision encoder(vit), and then instantiate the image embeddings into
        the input embedding sequence.

        Args:
            hidden_states (torch.Tensor): input sequence, (bsz, seqlen, n_embd)
            images (torch.Tensor | list[torch.Tensor]): images can be a 3-D or 4-D tensor, or a list of tensors.
            image_masks (torch.Tensor): mask for the images, (bsz, seqlen)
            image_kwargs (dict[str, torch.Tensor]): additional keyword arguments for the image encoder

        Returns:
            Instantiated input sequence
        """
        bsz, seqlen, n_embd = hidden_states.shape
        index = torch.arange(seqlen, device=hidden_states.device).unsqueeze(0).repeat(bsz, 1)

        if isinstance(images, torch.Tensor):
            assert images.ndim in [3, 4, 5], f"images should be a 3-D, 4-D, or 5-D tensor, got {images.ndim}-D tensor."
            if images.ndim in [4, 5]:
                bsz, n = images.shape[:2]
                images = images.view(bsz * n, *images.shape[2:])
                image_kwargs = image_kwargs if image_kwargs is not None else {}
                for k, v in image_kwargs.items():
                    image_kwargs[k] = v.reshape(bsz * n, *v.shape[2:])
            else:
                n = 1
            image_embeds = self._forward_vision_encoder(images, **image_kwargs)
            image_seqlen = image_embeds.size(1)

            image_scatter_index = index.masked_select(image_masks.bool()).reshape(bsz, -1)
            hidden_states.scatter_(
                dim=1,
                index=image_scatter_index.unsqueeze(-1).repeat(1, 1, n_embd),
                src=image_embeds.reshape(bsz, n * image_seqlen, n_embd),
            )

        elif isinstance(images, list):
            for i, (image, image_mask) in enumerate(zip(images, image_masks)):
                cur_kwargs = {k: v[i] for k, v in image_kwargs.items()} if image_kwargs is not None else {}
                image_embed = self._forward_vision_encoder(image, **cur_kwargs)
                n, image_seqlen, n_embd = image_embed.shape
                image_embed = image_embed.reshape(n * image_seqlen, n_embd)

                image_scatter_index = index[i:i+1].masked_select(image_mask.bool()).reshape(1, -1)
                hidden_states[i:i+1].scatter_(
                    dim=1,
                    index=image_scatter_index.unsqueeze(-1).repeat(1, 1, n_embd),
                    src=image_embed.reshape(1, -1, n_embd),
                )
        else:
            raise ValueError(f"und_images should be Tensor or List, but got {type(images)}")

        return hidden_states

    def instantiate_continuous_tokens(
            self,
            hidden_states: torch.Tensor,
            timesteps: Optional[BatchRaggedTensor] = None,
            timesteps_index: Optional[BatchRaggedTensor] = None,
    ):
        bsz, seqlen, n_embd = hidden_states.shape

        if isinstance(timesteps, list):
            for i, timestep in enumerate(timesteps):
                timestep_src = self.timestep_emb(timestep)  # (n, n_embd)
                hidden_states[i:i+1].scatter_(
                    dim=1,
                    index=timesteps_index[i].unsqueeze(0).unsqueeze(-1).repeat(1, 1, n_embd),
                    src=timestep_src.reshape(1, -1, n_embd),
                )
        else:
            timesteps_src = self.timestep_emb(timesteps.reshape(-1))    # (bsz * n, n_embd)
            hidden_states.scatter_(
                dim=1,
                index=timesteps_index.unsqueeze(-1).repeat(1, 1, n_embd),
                src=timesteps_src.reshape(bsz, -1, n_embd),
            )

        return hidden_states

    def instantiate_guidance_tokens(
            self,
            hidden_states: torch.Tensor,
            guidance: Optional[BatchRaggedTensor] = None,
            guidance_index: Optional[BatchRaggedTensor] = None,
    ):
        bsz, seqlen, n_embd = hidden_states.shape

        guidance_src = self.guidance_emb(guidance.reshape(-1))    # (bsz * n, n_embd)
        hidden_states.scatter_(
            dim=1,
            index=guidance_index.unsqueeze(-1).repeat(1, 1, n_embd),
            src=guidance_src.reshape(bsz, -1, n_embd),
        )

        return hidden_states


    def instantiate_timestep_r_tokens(
            self,
            hidden_states: torch.Tensor,
            timesteps_r: Optional[BatchRaggedTensor] = None,
            timesteps_r_index: Optional[BatchRaggedTensor] = None,
    ):
        bsz, seqlen, n_embd = hidden_states.shape

        if isinstance(timesteps_r, list):
            for i, timestep_r in enumerate(timesteps_r):
                timestep_r_src = self.timestep_r_emb(timestep_r)  # (n, n_embd)
                hidden_states[i:i+1].scatter_(
                    dim=1,
                    index=timesteps_r_index[i].unsqueeze(0).unsqueeze(-1).repeat(1, 1, n_embd),
                    src=timestep_r_src.reshape(1, -1, n_embd),
                )
        else:
            timesteps_r_src = self.timestep_r_emb(timesteps_r.reshape(-1))    # (bsz * n, n_embd)
            hidden_states.scatter_(
                dim=1,
                index=timesteps_r_index.unsqueeze(-1).repeat(1, 1, n_embd),
                src=timesteps_r_src.reshape(bsz, -1, n_embd),
            )

        return hidden_states

    def get_image_tokens_hw(self, images: BatchRaggedImages):
        assert isinstance(images, (torch.Tensor, list)), f"images should be BatchRaggedImages, got {type(images)}"
        if isinstance(images, torch.Tensor):
            token_h = images.shape[-2] // self.config.patch_size
            token_w = images.shape[-1] // self.config.patch_size
        else:
            token_h, token_w = [], []
            for image_i in images:
                assert isinstance(image_i, (torch.Tensor, list)), \
                    f"image_i should be a tensor or a list of tensors, got {type(image_i)}"
                if isinstance(image_i, torch.Tensor):
                    token_h.append(image_i.shape[-2] // self.config.patch_size)
                    token_w.append(image_i.shape[-1] // self.config.patch_size)
                else:
                    token_h.append([])
                    token_w.append([])
                    for j in range(len(image_i)):
                        token_h[-1].append(image_i[j].shape[-2] // self.config.patch_size)
                        token_w[-1].append(image_i[j].shape[-1] // self.config.patch_size)
        return token_h, token_w

    def ragged_final_layer(self, hidden_states, image_mask, timesteps, token_h, token_w, first_step=None):
        n_embd = hidden_states.size(-1)
        if isinstance(timesteps, torch.Tensor):
            # Only one target image.
            t_emb = self.time_embed_2(timesteps)
            if first_step is False:
                # only for gen_image non-first-step inference
                image_output = hidden_states[:, self.num_special_tokens:, :]
            else:   # first_step is True or None
                image_output = hidden_states.masked_select(
                    image_mask.unsqueeze(-1).bool()).reshape(-1, token_h * token_w, n_embd)
            pred = self.final_layer(image_output, t_emb, token_h, token_w)
        else:
            # Multiple target images(interleave data).
            # In this case, each line of the image_mask may contain different number of Trues, leading
            # the `reshape(batch_size, ...)` is not possible.
            sections = image_mask.sum(1).tolist()
            image_output = hidden_states.masked_select(
                image_mask.unsqueeze(-1).bool()).reshape(-1, n_embd).split(sections)
            pred = []
            for image_output_i, t_i, token_h_i, token_w_i in zip(image_output, timesteps, token_h, token_w):
                t_emb_i = self.time_embed_2(t_i)
                if isinstance(token_h_i, int):
                    image_output_i = image_output_i.reshape(-1, token_h_i * token_w_i, n_embd)
                    pred_i = self.final_layer(image_output_i, t_emb_i, token_h_i, token_w_i)
                    pred.append(pred_i)
                else:
                    subsections = [token_h_ij * token_w_ij for token_h_ij, token_w_ij in zip(token_h_i, token_w_i)]
                    image_output_i = image_output_i.split(subsections)
                    pred_i = []
                    for j, image_output_ij in enumerate(image_output_i):
                        pred_ij = self.final_layer(image_output_ij[None], t_emb_i[j:j+1], token_h_i[j], token_w_i[j])
                        pred_i.append(pred_ij)
                    pred.append(pred_i)
        return pred

    @staticmethod
    def _check_inputs(cond, target, check_list):
        if cond:
            for name, item in check_list:
                assert item is not None, f"`{name}` should be provided when `{target}`."

    @add_start_docstrings_to_model_forward(Hunyuan_INPUTS_DOCSTRING)
    def forward(
            self,
            input_ids: Optional[torch.Tensor] = None,  # bsz x seqlen
            attention_mask: Optional[torch.Tensor] = None,  # bsz x 1 x seqlen x seqlen
            rope_image_info: Optional[list[list[tuple[slice, tuple[int, int]]]]] = None,
            return_dict: bool = True,
            # for gen images
            images: Optional[BatchRaggedImages] = None,  # bsz x c x h x w, or bsz x (n_i x (c x h_ij x w_ij))
            image_mask: Optional[torch.Tensor] = None,  # bsz x seqlen
            timesteps: Optional[BatchRaggedTensor] = None,  # bsz, or bsz x (n_i)
            timesteps_index: Optional[BatchRaggedTensor] = None,  # bsz x k, or bsz x (k_i)
            timesteps_r: Optional[BatchRaggedTensor] = None,  # bsz, or bsz x (n_i)
            timesteps_r_index: Optional[BatchRaggedTensor] = None,  # bsz x k, or bsz x (k_i)
            guidance: Optional[BatchRaggedTensor] = None,  # bsz, or bsz x (n_i)
            guidance_index: Optional[BatchRaggedTensor] = None,  # bsz x k, or bsz x (k_i)
            # for cond images
            cond_vae_images: Optional[BatchRaggedImages] = None,  # bsz x c x h x w, or bsz x (m_i x (c x h_ij x w_ij))
            cond_vae_image_mask: Optional[torch.Tensor] = None,  # bsz x seqlen
            cond_timesteps: Optional[BatchRaggedTensor] = None,  # bsz, or bsz x (m_i)
            cond_timesteps_index: Optional[BatchRaggedTensor] = None,
            cond_vit_images: Optional[BatchRaggedImages] = None,
            cond_vit_image_mask: Optional[torch.Tensor] = None,
            cond_vit_image_kwargs: Optional[dict[str, Any]] = None,
            # only for inference
            position_ids: Optional[torch.Tensor] = None,  # bsz x seq_len-1, used for KVCache
            past_key_values: Optional[HunyuanStaticCache] = None,
            mode: Optional[str] = None,
            first_step: Optional[bool] = None,
            use_cache: Optional[bool] = None,
            output_attentions: Optional[bool] = None,
            output_hidden_states: Optional[bool] = None,
            cache_dic = None,
            gen_timestep_scatter_index: Optional[torch.Tensor] = None,
    ) -> Union[Tuple, CausalMMOutputWithPast]:
        

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # Sanity Check of Inputs
        self._check_inputs(mode == "gen_image", "in `gen_image` mode", [
            ("images", images), ("timesteps", timesteps),
        ])
        self._check_inputs(mode == "gen_image" and first_step, "in `gen_image` mode at the first step", [
            ("image_mask", image_mask), ("timesteps_index", timesteps_index),
        ])
        self._check_inputs(cond_vae_images is not None, "`cond_vae_images` is provided", [
            ("cond_timesteps", cond_timesteps), ("cond_vae_image_mask", cond_vae_image_mask),
            ("cond_timesteps_index", cond_timesteps_index),
        ])
        self._check_inputs(cond_vit_images is not None, "`cond_vit_images` is provided", [
            ("cond_vit_image_mask", cond_vit_image_mask),
        ])
        if input_ids is None and images is None:
            raise ValueError("Either input_ids or images should be provided.")
        if input_ids is not None:
            device = input_ids.device
        else:
            device = get_device(images)
        if self.training:
            seqlen = input_ids.size(1)
        else:
            # For inference, we always set seqlen to maximum length to simplify the rope cache handling
            seqlen = self.config.max_position_embeddings
        assert self.config.max_position_embeddings >= seqlen, (
            f"Cannot forward sequence of length {seqlen}, "
            f"max position embeddings is only {self.config.max_position_embeddings}, "
            f"try set --max-position-embeddings to a larger value."
        )

        # Calculate multimodal 2d rope
        cos, sin = self.cached_rope(
            seqlen, device, rope_image_info=rope_image_info, position_ids=position_ids,
        )
        # === Map token ids to embeddings ===
        if input_ids is not None:
            hidden_states = self.model.wte(input_ids)  # (bsz, seqlen, n_embd)
        else:
            hidden_states = None  # only for non-first step inference of the image generation
         
        # === Input layers ===
        if images is not None:
            if self.config.cfg_distilled and input_ids is None:
                hidden_states = self.instantiate_vae_image_tokens(hidden_states, timesteps, images, image_mask, guidance, timesteps_r)
            else:
                hidden_states = self.instantiate_vae_image_tokens(hidden_states, timesteps, images, image_mask)

        if cond_vae_images is not None:
            hidden_states = self.instantiate_vae_image_tokens(hidden_states, cond_timesteps, cond_vae_images,
                                                              cond_vae_image_mask)

        if cond_vit_images is not None:
            hidden_states = self.instantiate_vit_image_tokens(hidden_states, cond_vit_images, cond_vit_image_mask,
                                                              cond_vit_image_kwargs)
        if timesteps_index is not None:
            hidden_states = self.instantiate_continuous_tokens(hidden_states, timesteps, timesteps_index)

        # guidance token
        if guidance_index is not None:
            hidden_states = self.instantiate_guidance_tokens(hidden_states, guidance, guidance_index)

        # timestep r token
        if timesteps_r_index is not None:
            hidden_states = self.instantiate_timestep_r_tokens(hidden_states, timesteps_r, timesteps_r_index)

        if cond_timesteps_index is not None:
            hidden_states = self.instantiate_continuous_tokens(hidden_states, cond_timesteps, cond_timesteps_index)
        if mode == "gen_text":
            first_step = True
        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        if not self.use_taylor_cache:
            outputs = self.model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_values=past_key_values,
                inputs_embeds=hidden_states,
                use_cache=use_cache,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                custom_pos_emb=(cos, sin),
                mode=mode,
                first_step=first_step,
                post_token_len = self.post_token_len,
                num_image_tokens = self.num_image_tokens,
                gen_timestep_scatter_index = gen_timestep_scatter_index,
                num_special_tokens = self.num_special_tokens,
            )
            hidden_states = outputs[0]
        else:
            if not hasattr(self.model, "taylor_cache"):
                self.model.taylor_cache = CacheWithFreqsContainer(cache_dic['max_order'])
            if not hasattr(self.model, "counter"):
                self.model.counter = 0

            full_computation = (cache_dic['current_step'] == 0) \
                or (self.model.counter == cache_dic['cache_interval'] -1) \
                or (cache_dic['enable_first_enhance'] and cache_dic['current_step'] < cache_dic['first_enhance_steps']) \
                or (cache_dic['enable_tailing_enhance'] and cache_dic['current_step'] >= cache_dic['num_steps'] - cache_dic['tailing_enhance_steps'])
            if not hasattr(self.model, "last_full_computation_step"):
                self.model.last_full_computation_step = 0
            if full_computation:
                self.model.counter = 0
                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    position_ids=position_ids,
                    past_key_values=past_key_values,
                    inputs_embeds=hidden_states,
                    use_cache=use_cache,
                    output_attentions=output_attentions,
                    output_hidden_states=output_hidden_states,
                    return_dict=return_dict,
                    custom_pos_emb=(cos, sin),
                    mode=mode,
                    first_step=first_step,
                    post_token_len = self.post_token_len,
                    num_image_tokens = self.num_image_tokens,
                    gen_timestep_scatter_index = gen_timestep_scatter_index,
                    num_special_tokens = self.num_special_tokens,
                )
                hidden_states = outputs[0]

                if cache_dic['enable_first_enhance'] and (cache_dic['current_step'] < (cache_dic['first_enhance_steps']-1)):
                    pass
                else:
                    self.model.taylor_cache.derivatives_computation(hidden_states, distance = cache_dic['current_step'] - self.model.last_full_computation_step, low_freqs_order=cache_dic['low_freqs_order'], high_freqs_order=cache_dic['high_freqs_order'])

                self.model.last_full_computation_step = cache_dic['current_step']
                self.model.taylor_cache.last_past_key_values = outputs.past_key_values  
            else:
                self.model.counter += 1
                hidden_states = self.model.taylor_cache.taylor_formula(distance = self.model.counter)
                outputs = BaseModelOutputWithPast(
                    last_hidden_state=hidden_states,
                    past_key_values=self.model.taylor_cache.last_past_key_values,
                    hidden_states=None,
                    attentions=None,
                ) 
            if cache_dic['current_step'] == cache_dic['num_steps'] - 1:
                self.model.taylor_cache.clear_derivatives()


        # === Output layers ===
        # -- image tokens
        if images is not None:
            token_h, token_w = self.get_image_tokens_hw(images)
            hidden_states = hidden_states.to(device=get_device(images))
            diff_pred = self.ragged_final_layer(
                hidden_states, image_mask, timesteps, token_h, token_w, first_step)
        else:
            diff_pred = None
        # -- text tokens
        if input_ids is None or mode == "gen_image":
            logits = None
        else:
            hidden_states = self.model.ln_f(hidden_states)
            logits = self.lm_head(hidden_states)  # (bsz, seqlen, vocab_size)
        # -- for inference
        if not return_dict:
            return (logits.float(),) + outputs[1:] + (diff_pred,)
        return CausalMMOutputWithPast(
            logits=logits.float() if logits is not None else None,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            diffusion_prediction=diff_pred,
        )

    @staticmethod
    def check_inputs(prompt=None, image=None, message_list=None):
        if prompt is None and message_list is None:
            raise ValueError("Either `prompt` or `message_list` should be provided.")
        if prompt is not None and message_list is not None:
            raise ValueError("`prompt` and `message_list` cannot be provided at the same time.")
        if message_list is not None:
            if not isinstance(message_list, list):
                raise ValueError(f"`message_list` should be a list of messages, but got {type(message_list)}.")
            assert len(message_list) > 0, "`message_list` should be a non-empty list."
            for message in message_list:
                assert isinstance(message, list) or isinstance(message, dict), \
                    f"Each message should be a list of dicts or a dict, but got {type(message)}."
        if image is not None:
            error_msg = \
                "`image` should be a PIL Image, a string path, a base64 string, bytes, or a list of them, but got {}."
            if isinstance(image, list):
                for im in image:
                    assert isinstance(im, (Image.Image, str, bytes)), error_msg.format(type(im))
            else:
                assert isinstance(image, (Image.Image, str, bytes)), error_msg.format(type(image))

    @staticmethod
    def _validate_and_batchify_text(text, name, check_batch_size=None):
        if text is None:
            return text
        assert isinstance(text, str) or isinstance(text, list), \
            f"Input `{name}` should be a string or a list of strings, but got {type(text)}."
        if isinstance(text, str):
            text = [text]
        #assert len(text) > 0 and all(isinstance(p, str) and len(p) > 0 for p in text), \
        #    f"Input `{name}` should be a non-empty list of non-empty strings, got {text}."
        if check_batch_size is not None:
            assert len(text) == check_batch_size, \
                f"Input `{name}` should have the same batch size as other inputs({check_batch_size}), got {len(text)}."
        return text

    @staticmethod
    def _validate_and_batchify_image(image, name, check_batch_size=None):
        if image is None:
            return image
        assert isinstance(image, (InputImage, list)), \
            f"Input `{name}` should be a image or a list of images, but got {type(image)}."
        if not isinstance(image, list):
            image = [image]
        batch_image_list = [image] if not isinstance(image[0], list) else image
        for image_list in batch_image_list:
            assert all(isinstance(im, InputImage) for im in image_list), \
                (f"Each item in `{name}` should be a PIL Image, a string path, a base64 string, or bytes, "
                 f"got {[type(im) for im in image_list]}.")
        if check_batch_size is not None:
            assert len(batch_image_list) == check_batch_size, \
                f"Input `{name}` should have the same batch size as other inputs({check_batch_size})"
        return batch_image_list

    @staticmethod
    def prepare_seed(seed, batch_size):
        if isinstance(seed, torch.Tensor):
            seed = seed.tolist()
        if seed is None:
            seeds = [random.randint(0, 10_000_000) for _ in range(batch_size)]
        elif isinstance(seed, int):
            seeds = [seed for _ in range(batch_size)]
        elif isinstance(seed, (list, tuple)):
            if len(seed) == batch_size:
                seeds = [int(seed[i]) for i in range(batch_size)]
            else:
                raise ValueError(f"Length of seed must be equal to the batch_size({batch_size}), got {seed}.")
        else:
            raise ValueError(f"Seed must be an integer, a list of integers, or None, got {seed}.")
        return seeds

    def build_batch_rope_image_info(self, output, sections):
        # Rope 1D. No need to build rope_image_info
        if self.config.rope_type == "default":
            return None

        # Rope 2D
        assert self.config.rope_type == "2d", \
            f"Rope type {self.config.rope_type} not supported by method 'build_batch_rope_image_info'."
        rope_image_info = []
        for image_slices, sections_i in zip(output.all_image_slices, sections):
            rope_2d_image_slices = []
            rope_2d_image_shapes = []
            image_idx = 0

            for section in sections_i:
                if section['type'] in ["gen_image", "cond_vae_image", "cond_vit_image"]:
                    assert image_idx < len(image_slices), \
                        f"Image index {image_idx} out of range for image slices with length {len(image_slices)}."
                    rope_2d_image_slices.append(image_slices[image_idx])
                    rope_2d_image_shapes.append((section['token_height'], section['token_width']))
                    image_idx += 1

                elif section['type'] == "cond_joint_image":
                    assert image_idx + 1 < len(image_slices), \
                        f"Image index {image_idx + 1} out of range for image slices with length {len(image_slices)}."
                    assert len(section['token_height']) == len(section['token_width']), \
                        (f"token_height and token_width should have the same length, "
                         f"but got {len(section['token_height'])} and {len(section['token_width'])}")

                    if self.image_processor.cond_token_attn_type in ["full", "joint_full"]:
                        rope_2d_image_slices.extend([image_slices[image_idx], image_slices[image_idx + 1]])
                        rope_2d_image_shapes.extend(list(zip(section['token_height'], section['token_width'])))
                    elif self.image_processor.cond_token_attn_type == "full_causal":
                        rope_2d_image_slices.append(image_slices[image_idx])
                        rope_2d_image_shapes.append((section['token_height'][0], section['token_width'][0]))
                    elif self.image_processor.cond_token_attn_type == "causal":
                        pass
                    else:
                        raise NotImplementedError(
                            f"cond_token_attn_type {self.image_processor.cond_token_attn_type} not supported "
                            f"by method 'build_batch_rope_image_info'."
                        )
                    image_idx += 2

            rope_image_info.append(list(zip(rope_2d_image_slices, rope_2d_image_shapes)))

        return rope_image_info

    def vae_encode(self, image, cfg_factor=1, generator=None):
        config = self.vae.config

        with torch.autocast(
                device_type="cuda", dtype=self.vae_autocast_dtype,  # noqa
                enabled=self.vae_autocast_dtype != torch.float32
        ):
            vae_encode_result = self.vae.encode(image)
            if isinstance(vae_encode_result, torch.Tensor):
                latents = vae_encode_result
            else:
                latents = vae_encode_result.latent_dist.sample(generator)
            if hasattr(config, 'shift_factor') and config.shift_factor:
                latents.sub_(config.shift_factor)
            if hasattr(config, 'scaling_factor') and config.scaling_factor:
                latents.mul_(config.scaling_factor)

        if hasattr(self.vae, "ffactor_temporal"):
            assert latents.shape[2] == 1, "latents should have shape [B, C, T, H, W] and T should be 1"
            latents = latents.squeeze(2)

        # Here we always use t=0 to declare it is a clean conditional image
        t = torch.zeros((latents.shape[0],))

        if cfg_factor > 1:
            t = t.repeat(cfg_factor)
            latents = latents.repeat(cfg_factor, 1, 1, 1)

        return t, latents

    def _encode_cond_image(
            self,
            batch_cond_images: list[list[Union[ImageTensor, CondImage]]],
            cfg_factor: int = 1,
            generator=None,
    ):
        if batch_cond_images is None or len(batch_cond_images[0]) == 0:
            return None, None, None

        first_image = batch_cond_images[0][0]

        # 1. If vae_image presents
        if first_image.section_type in ["cond_vae_image", "cond_joint_image"]:
            # VAE encode one by one, as we assume cond images have different sizes
            batch_cond_vae_images, batch_cond_t = [], []
            for cond_images in batch_cond_images:
                cond_vae_image_list, cond_t_list = [], []
                for cond_image in cond_images:
                    vae_image = (
                        cond_image.vae_image
                        if cond_image.section_type == "cond_joint_image"
                        else cond_image
                    )
                    cond_t_, cond_vae_image_ = self.vae_encode(
                        vae_image[None].to(self.device),
                        generator=generator,
                    )
                    cond_vae_image_list.append(cond_vae_image_.squeeze(0))
                    cond_t_list.append(cond_t_)
                batch_cond_vae_images.append(cond_vae_image_list)
                batch_cond_t.append(cond_t_list)

            # If only one cond image for each sample and all have the same size, we can batch them together
            # In this case, cond_vae_images is a 4-D tensor.
            if all([len(items) == 1 for items in batch_cond_vae_images]) and all(
                    items[0].shape == batch_cond_vae_images[0][0].shape for items in batch_cond_vae_images):
                cond_vae_images = torch.stack([items[0] for items in batch_cond_vae_images], dim=0)
                cond_t = torch.cat([items[0] for items in batch_cond_t], dim=0)
                if cfg_factor > 1:
                    cond_t = cond_t.repeat(cfg_factor)
                    cond_vae_images = cond_vae_images.repeat(cfg_factor, 1, 1, 1)
            else:
                # In this case, cond_vae_images is a list of 4-D tensors or a list of lists of 3-D tensors.
                cond_t = [torch.cat(item, dim=0) for item in batch_cond_t]
                cond_vae_images = []
                for items in batch_cond_vae_images:
                    if all(items[0].shape == item.shape for item in items):
                        cond_vae_images.append(torch.stack(items, dim=0))
                    else:
                        cond_vae_images.append(items)
                if cfg_factor > 1:
                    cond_t = cond_t * cfg_factor
                    cond_vae_images = cond_vae_images * cfg_factor

        else:
            cond_vae_images = None
            cond_t = None

        # 2. If vit_image presents
        if first_image.section_type in ["cond_vit_image", "cond_joint_image"]:
            cond_vit_images = []
            for cond_images in batch_cond_images:
                cond_vit_image_list = []
                for cond_image in cond_images:
                    vit_image = (
                        cond_image.vit_image
                        if cond_image.section_type == "cond_joint_image"
                        else cond_image
                    )
                    cond_vit_image_list.append(vit_image)
                # Here we force convert the tensor to dtype
                cond_vit_images.append(
                    torch.stack(cond_vit_image_list, dim=0).to(dtype=self.dtype)
                )

            if cfg_factor > 1:
                cond_vit_images = cond_vit_images * cfg_factor

        else:
            cond_vit_images = None

        return cond_vae_images, cond_t, cond_vit_images

    @staticmethod
    def _prepare_vit_image_kwargs(batch_cond_images, cfg_factor):
        if batch_cond_images is None or len(batch_cond_images[0]) == 0:
            return None
        first_image = batch_cond_images[0][0]
        if first_image.section_type == "cond_joint_image":
            vit_image = first_image.vit_image
        else:
            vit_image = first_image
        if not hasattr(vit_image, "vision_encoder_kwargs") or len(vit_image.vision_encoder_kwargs) == 0:
            return None

        # Pack vit kwargs. Siglip2-so requires spatial_shapes and attention_mask for inference.
        cond_vit_image_kwargs = {"spatial_shapes": [], "attention_mask": []}
        for cond_images in batch_cond_images:
            cond_vit_image_kwargs["spatial_shapes"].append(
                torch.stack([
                    cond_image.vit_image.vision_encoder_kwargs["spatial_shapes"]
                    for cond_image in cond_images
                ]))
            cond_vit_image_kwargs["attention_mask"].append(
                torch.stack([
                    cond_image.vit_image.vision_encoder_kwargs["pixel_attention_mask"]
                    for cond_image in cond_images
                ]))
        if cfg_factor > 1:
            cond_vit_image_kwargs["spatial_shapes"] = cond_vit_image_kwargs["spatial_shapes"] * cfg_factor
            cond_vit_image_kwargs["attention_mask"] = cond_vit_image_kwargs["attention_mask"] * cfg_factor
        return cond_vit_image_kwargs

    @torch.no_grad()
    def prepare_message_list(
            self,
            message_list,
            cond_images: list[CondImage] = None,
            gen_image_info: ImageInfo = None,
    ):
        """ Convert a batch message list of OpenAI style to the internal format. """
        inner_message_list = []
        image_idx = 0
        for message in message_list:
            content = message["content"]
            if isinstance(content, str):
                inner_message_list.append(dict(role=message["role"], type="text", content=content))
            elif isinstance(content, list):
                for item in content:
                    if item["type"] == "text":
                        inner_message_list.append(dict(role=message["role"], type="text", content=item['text']))
                    elif item["type"] == "image":
                        if all(key not in item for key in ["image", "url", "path", "base64"]):
                            continue
                        assert cond_images is not None and image_idx < len(cond_images), \
                            f"Image index {image_idx} out of range for cond images with length {len(cond_images)}."
                        image = cond_images[image_idx]
                        inner_message_list.append(dict(role="assistant", type=image.section_type, content=image.i))
                        image_idx += 1
                    else:
                        raise NotImplementedError(f"Message content type {item['type']} not supported.")
            else:
                raise ValueError(f"Message content should be str or list, but got {type(content)}.")

        if gen_image_info is not None:
            inner_message_list.append(dict(role="assistant", type="gen_image", content=gen_image_info))

        return inner_message_list

    def preprocess_inputs(
            self,
            prompt: str | list[str] = None,
            image: InputImage | list[InputImage] = None,
            cot_text=None,
            message_list=None,
            cfg_factor=1,
            bot_task='auto',
            system_prompt=None,
            max_new_tokens=None,
            mode="gen_text",
            image_size="auto",
            infer_align_image_size=False,
            device=None,
            **kwargs,
    ):
        # 1. Sanity check
        self.check_inputs(prompt, image, message_list)

        # 2. Format inputs
        batch_message_list = message_list
        batch_prompt = prompt
        batch_cot_text = cot_text
        batch_system_prompt = system_prompt

        #   -- 2.1 message_list
        batch_cond_images = kwargs.get('batch_cond_images', None)
        if batch_message_list is not None:
            if isinstance(batch_message_list[0], dict):
                batch_message_list = [batch_message_list]
            batch_size = len(batch_message_list)

            # Multiple cond images are allowed.
            if batch_cond_images is None:
                batch_cond_images = [
                    self.image_processor.build_cond_images(
                        message_list=message_list_,
                        infer_align_image_size=infer_align_image_size,
                    )
                    for message_list_ in batch_message_list
                ]
            if mode == "gen_image":
                batch_gen_image_info = [
                    self.image_processor.build_gen_image_info(image_size, add_guidance_token=self.config.cfg_distilled, add_timestep_r_token=self.config.use_meanflow) for _ in range(batch_size)
                ]
            else:
                batch_gen_image_info = [None] * batch_size
            # Convert OpenAI message list into inner message list
            batch_message_list = [
                self.prepare_message_list(message_list_, cond_images, gen_image_info)
                for message_list_, cond_images, gen_image_info in zip(
                    batch_message_list, batch_cond_images, batch_gen_image_info
                )
            ]

        #   -- 2.2 Prompt, image, cot text, system prompt
        else:
            batch_prompt = self._validate_and_batchify_text(batch_prompt, 'prompt')
            batch_size = len(batch_prompt)

            batch_cot_text = self._validate_and_batchify_text(batch_cot_text, 'cot_text', batch_size)
            batch_system_prompt = self._validate_and_batchify_text(batch_system_prompt, 'system_prompt', batch_size)

            batch_image_list = self._validate_and_batchify_image(image, 'image', batch_size)
            if batch_cond_images is None:
                batch_cond_images = [
                    self.image_processor.build_cond_images(
                        image_list=image_list,
                        infer_align_image_size=infer_align_image_size
                    )
                    for image_list in batch_image_list
                ] if batch_image_list is not None else None

            if mode == "gen_image":
                batch_gen_image_info = [
                    self.image_processor.build_gen_image_info(image_size, add_guidance_token=self.config.cfg_distilled, add_timestep_r_token=self.config.use_meanflow) for _ in range(batch_size)
                ]
            else:
                batch_gen_image_info = [None] * batch_size

        # Apply batched prompt or batched message_list to build input sequence with associated info.
        # If `drop_think` enabled, always drop <tool_call> parts in the context.
        drop_think = kwargs.get('drop_think', getattr(self.generation_config, 'drop_think', False))
        out = self._tokenizer.apply_chat_template(
            batch_prompt=batch_prompt,
            batch_message_list=batch_message_list,
            mode=mode,
            batch_gen_image_info=batch_gen_image_info,
            batch_cond_images=batch_cond_images,
            batch_system_prompt=batch_system_prompt,
            batch_cot_text=batch_cot_text,
            max_length=kwargs.get('max_length', self.generation_config.max_length),
            bot_task=bot_task,
            image_base_size=(
                None if mode == "gen_text" and bot_task == "auto" else self.image_processor.vae_reso_group.base_size
            ),
            sequence_template=getattr(self.generation_config, 'sequence_template', 'pretrain'),
            cfg_factor=cfg_factor,
            drop_think=drop_think,
        )
        out['batch_size'] = batch_size
        out['batch_cond_images'] = batch_cond_images
        out['batch_gen_image_info'] = batch_gen_image_info

        # 8. Define stop tokens by tasks
        tkw = self._tokenizer
        if bot_task == "auto":
            stop_token_id = dict(
                auto=self._tokenizer.conversation.stop_token_ids,
            )
        else:
            if image_size == "auto":
                extra_auto_stops = [tkw.ratio_token_id(i) for i in range(33)]
            else:
                extra_auto_stops = [tkw.boi_token_id]
            stop_token_id = dict(
                auto=self._tokenizer.conversation.stop_token_ids + extra_auto_stops,
                recaption=[tkw.end_of_recaption_token_id],
                think=[tkw.end_of_think_token_id, tkw.end_of_recaption_token_id],
                img_ratio=extra_auto_stops,
            )
        out['stop_token_id'] = stop_token_id

        return out

    def prepare_model_inputs(
            self,
            prompt: str | list[str] = None,
            image: InputImage | list[InputImage] = None,
            mode="gen_text",
            system_prompt=None,
            cot_text=None,
            image_size="auto",
            message_list=None,
            device=None,
            max_new_tokens=None,
            **kwargs,
    ):
        device = default(device, self.device)

        # 1. apply chat template
        cfg_factor = {"gen_text": 1, "gen_image": 2}
        if self.config.cfg_distilled:
            cfg_factor["gen_image"] = 1

        bot_task = kwargs.pop("bot_task", "auto")

        out = kwargs.pop('tokenizer_output', None)
        if out is None:
            out = self.preprocess_inputs(
                prompt=prompt,
                image=image,
                mode=mode,
                system_prompt=system_prompt,
                cot_text=cot_text,
                image_size=image_size,
                message_list=message_list,
                cfg_factor=cfg_factor[mode],
                bot_task=bot_task,
                **kwargs,
            )
        output, sections = out['output'], out['sections']

        batch_size = out['batch_size']
        batch_cond_images = out['batch_cond_images']
        batch_gen_image_info = out['batch_gen_image_info']
        stop_token_id = out['stop_token_id']
        #if batch_gen_image_info[0] is not None:
        #    print("batch_gen_image_info image_token_length:", batch_gen_image_info[0].image_token_length)
        #   -- 2.3 seed
        seeds = self.prepare_seed(seed=kwargs.get('seed'), batch_size=batch_size)
        generator = [torch.Generator(self.device).manual_seed(seed) for seed in seeds]

        # 4. Encode conditional images
        cond_vae_images, cond_timesteps, cond_vit_images = self._encode_cond_image(
            batch_cond_images, cfg_factor[mode], generator=generator,
        )
        cond_vit_image_kwargs = self._prepare_vit_image_kwargs(batch_cond_images, cfg_factor[mode])

        # 5. Build position embeddings
        rope_image_info = self.build_batch_rope_image_info(output, sections)

        # 6. Build kv cache
        if mode == "gen_image":
            # Image generation will not extend sequence length, using token length as max_cache_len is enough.
            max_cache_len = output.tokens.shape[1]
        else:
            max_cache_len = output.tokens.shape[1] + default(max_new_tokens, self.generation_config.max_length)
        cache = HunyuanStaticCache(
            config=self.config,
            max_batch_size=batch_size * cfg_factor[mode],
            max_cache_len=max_cache_len,
            dtype=self.dtype,
            dynamic=mode == "gen_text",
        )

        # 7. Build position ids
        batch_position_ids = torch.arange(
            0, output.tokens.shape[1], dtype=torch.long, device=device)[None].expand(
            batch_size * cfg_factor[mode], -1)  # use expand to share indices to save memory

        # 8. Define stop tokens by tasks
        tkw = self._tokenizer
        if mode == "gen_image":
            eos_token_id = None  # don't need to define eos_token_id for image generation
        else:
            if bot_task == "auto":
                stop_token_id = dict(
                    auto=self._tokenizer.conversation.stop_token_ids,
                )
            else:
                if image_size == "auto":
                    extra_auto_stops = tkw.get_all_ratio_token_ids()
                else:
                    extra_auto_stops = [tkw.boi_token_id]
                stop_token_id = dict(
                    auto=self._tokenizer.conversation.stop_token_ids + extra_auto_stops,
                    recaption=[tkw.end_of_recaption_token_id],
                    think=[tkw.end_of_think_token_id, tkw.end_of_recaption_token_id],
                    img_ratio=extra_auto_stops,
                )
            eos_token_id = stop_token_id[bot_task]

        # 9. Build model input kwargs
        model_input_kwargs = dict(
            input_ids=output.tokens.to(device),
            position_ids=batch_position_ids,
            past_key_values=cache,
            mode=mode,
            rope_image_info=rope_image_info,
            image_mask=to_device(output.gen_image_mask, device),
            timesteps_index=to_device(output.gen_timestep_scatter_index, device),
            guidance_index=to_device(output.guidance_scatter_index, device),
            timesteps_r_index=to_device(output.gen_timestep_r_scatter_index, device),
            cond_vae_images=to_device(cond_vae_images, device),
            cond_vae_image_mask=to_device(output.vae_image_mask, device),
            cond_timesteps=to_device(cond_timesteps, device),
            cond_timesteps_index=to_device(output.cond_timestep_scatter_index, device),
            cond_vit_images=to_device(cond_vit_images, device),
            cond_vit_image_mask=to_device(output.vit_image_mask, device),
            cond_vit_image_kwargs=to_device(cond_vit_image_kwargs, device),
            # for inner usage
            tokenizer_output=output,
            batch_gen_image_info=batch_gen_image_info,
            generator=generator,
            batch_cond_images=batch_cond_images,
            # generation config
            eos_token_id=eos_token_id,
            max_new_tokens=max_new_tokens,
            gen_timestep_scatter_index=to_device(output.gen_timestep_scatter_index, device),
        )

        return model_input_kwargs

    def _prepare_attention_mask_for_generation(
            self,
            inputs_tensor: torch.Tensor,
            generation_config: GenerationConfig,
            model_kwargs: dict[str, Any],
    ) -> Optional[torch.Tensor]:
        # create `4d` bool attention mask (b, 1, seqlen, seqlen) using this implementation to bypass the 2d requirement
        # in the `transformers.generation_utils.GenerationMixin.generate`.
        # This implementation can handle sequences with text and image modalities, where text tokens use causal
        # attention and image tokens use full attention.
        bsz, seq_len = inputs_tensor.shape
        tokenizer_output = model_kwargs["tokenizer_output"]
        batch_full_attn_slices = [
            self.image_processor.prepare_full_attn_slices(tokenizer_output, i)
            for i in range(bsz)
        ]
        #if len(batch_full_attn_slices[0]) == 0:
        #    return None

        attention_mask = torch.ones(seq_len, seq_len, dtype=torch.bool, device=inputs_tensor.device).tril(
            diagonal=0).repeat(bsz, 1, 1)
        for i in range(bsz):
            for j, image_slice in enumerate(batch_full_attn_slices[i]):
                attention_mask[i, image_slice, image_slice] = True
        attention_mask = attention_mask.unsqueeze(1)
        return attention_mask

    def prepare_inputs_for_generation(
        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None,
            tokenizer_output=None, batch_gen_image_info=None, batch_cond_images=None,
            infer_align_image_size=False, generator=None, **kwargs
    ):
        position_ids = kwargs.get("position_ids")
        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            assert position_ids is not None, "position_ids must be provided in kwargs."
            if input_ids is not None and input_ids.shape[1] != position_ids.shape[1]:    # in decode steps
                input_ids = torch.gather(input_ids, dim=1, index=position_ids)
            model_inputs = {"input_ids": input_ids}

        model_inputs.update(
            {
                "attention_mask": attention_mask,
                "position_ids": position_ids,
                "past_key_values": past_key_values,
                # "use_cache": kwargs.get("use_cache"),
                "rope_image_info": kwargs["rope_image_info"],
                "mode": kwargs["mode"],
                "images": kwargs.get("images"),
                "image_mask": kwargs.get("image_mask"),
                "timesteps": kwargs.get("timesteps"),
                "timesteps_index": kwargs.get("timesteps_index"),
                "timesteps_r": kwargs.get("timesteps_r"),
                "timesteps_r_index": kwargs.get("timesteps_r_index"),
                "guidance": kwargs.get("guidance"),
                "guidance_index": kwargs.get("guidance_index"),
                "cond_vae_images": kwargs.get("cond_vae_images"),
                "cond_vae_image_mask": kwargs.get("cond_vae_image_mask"),
                "cond_timesteps": kwargs.get("cond_timesteps"),
                "cond_timesteps_index": kwargs.get("cond_timesteps_index"),
                "cond_vit_images": kwargs.get("cond_vit_images"),
                "cond_vit_image_mask": kwargs.get("cond_vit_image_mask"),
                "cond_vit_image_kwargs": kwargs.get("cond_vit_image_kwargs"),
                "cache_dic":  kwargs.get("cache_dic"),
                "gen_timestep_scatter_index": kwargs.get("gen_timestep_scatter_index"),
            }
        )

        return model_inputs

    def _update_model_kwargs_for_generation(
            self,
            outputs: ModelOutput,
            model_kwargs: dict[str, Any],
            is_encoder_decoder: bool = False,
            num_new_tokens: int = 1,
    ) -> dict[str, Any]:
        """ This function is run after each step of model forward. It updates model kwargs for next forward step.
        """
        mode = model_kwargs["mode"]

        updated_model_kwargs = {
            "mode": mode,
            "rope_image_info": model_kwargs["rope_image_info"],
        }

        # update past_key_values keeping its naming used in model code
        for possible_cache_name in ALL_CACHE_NAMES:
            if possible_cache_name in outputs:
                # TODO (joao): remove output/input mismatch when these old models (xlnet, reformer) are deprecated
                if possible_cache_name in ("past_buckets_states", "mems"):
                    cache_name = "past_key_values"
                else:
                    cache_name = possible_cache_name
                updated_model_kwargs[cache_name] = getattr(outputs, possible_cache_name)
                break

        if "tokenizer_output" in model_kwargs:
            # After prefill step
            if mode == "gen_text":
                # When enable batching, we use right padding, which requires a real_pos to index the valid
                # end position of the sequence. If tokenizer_output in model_kwargs, it means we are in the
                # prefill step of generation.
                real_pos = to_device(model_kwargs["tokenizer_output"].real_pos, self.device)
                updated_model_kwargs["position_ids"] = real_pos
            else:
                # inputs_pos
                image_mask = model_kwargs["image_mask"]
                bsz, seq_len = image_mask.shape
                index = torch.arange(seq_len, device=image_mask.device).unsqueeze(0).repeat(bsz, 1)
                position_ids = index.masked_select(image_mask.bool()).reshape(bsz, -1)
                timestep_position_ids = \
                    index[torch.arange(bsz), model_kwargs["timesteps_index"][:, -1]].unsqueeze(-1)
                pos_cat_list = [timestep_position_ids, ]
                if self.config.cfg_distilled:
                    guidance_position_ids = index[torch.arange(bsz), model_kwargs["guidance_index"][:, -1]].unsqueeze(-1)
                    pos_cat_list.append(guidance_position_ids)
                if self.config.use_meanflow:
                    timestep_r_position_ids = index[torch.arange(bsz), model_kwargs["timesteps_r_index"][:, -1]].unsqueeze(-1)
                    pos_cat_list.append(timestep_r_position_ids)
                pos_cat_list.append(position_ids)
                updated_model_kwargs["position_ids"] = torch.cat(pos_cat_list, dim=1)

                # attention mask
                mask_list = []
                for attention_mask_i, position_ids_i in zip(
                        model_kwargs["attention_mask"], updated_model_kwargs["position_ids"]):
                    mask_list.append(torch.index_select(attention_mask_i, dim=1, index=position_ids_i.reshape(-1)))
                attention_mask = torch.stack(mask_list, dim=0)
                updated_model_kwargs["attention_mask"] = attention_mask
                updated_model_kwargs["gen_timestep_scatter_index"] = model_kwargs["gen_timestep_scatter_index"]
        else:
            # After decode steps
            if mode == "gen_text":
                # Now we are in the decode steps.
                updated_model_kwargs["position_ids"] = model_kwargs["position_ids"] + 1
                # Remove attention mask to use full attention of 1 x seqlen in decode steps
            else:
                updated_model_kwargs["position_ids"] = model_kwargs["position_ids"]
                updated_model_kwargs["attention_mask"] = model_kwargs["attention_mask"]
                updated_model_kwargs["gen_timestep_scatter_index"] = model_kwargs["gen_timestep_scatter_index"]
        return updated_model_kwargs

    class _StageTransitionLogitsProcessor(LogitsProcessor):
        def __init__(self, stage_transitions: list[tuple[int, list[int]]], batch_size: int):
            self.transition_map = {stop_id: list(append_ids) for stop_id, append_ids in stage_transitions}
            self.pending_tokens = [[] for _ in range(batch_size)]
            self.completed = [set() for _ in range(batch_size)]

        def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tensor:
            batch_size = input_ids.shape[0]
            last_tokens = input_ids[:, -1]
            device = scores.device
            min_score = torch.finfo(scores.dtype).min

            for i in range(batch_size):
                last_token = last_tokens[i].item()

                # Consume pending tokens if the last token matches the head.
                if self.pending_tokens[i] and last_token == self.pending_tokens[i][0]:
                    self.pending_tokens[i].pop(0)

                # If pending tokens remain, force the next token.
                if self.pending_tokens[i]:
                    scores[i].fill_(min_score)
                    scores[i, self.pending_tokens[i][0]] = 0
                    continue

                # Trigger stage transition if needed.
                if last_token in self.transition_map and last_token not in self.completed[i]:
                    self.completed[i].add(last_token)
                    next_tokens = self.transition_map[last_token]
                    if next_tokens:
                        self.pending_tokens[i] = list(next_tokens)
                        scores[i].fill_(min_score)
                        scores[i, self.pending_tokens[i][0]] = 0

                scores[i] = scores[i].to(device)

            return scores

    class _ConditionalSliceVocabLogitsProcessor(LogitsProcessor):
        def __init__(
            self,
            trigger_token_ids: list[int],
            vocab_start: int,
            vocab_end: int,
            other_slices: Optional[list[tuple[int, int]]] = None,
            force_greedy: bool = False,
        ):
            self.trigger_token_ids = set(trigger_token_ids)
            self.vocab_start = vocab_start
            self.vocab_end = vocab_end
            self.other_slices = other_slices or []
            self.force_greedy = force_greedy

        def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tensor:
            last_tokens = input_ids[:, -1]
            min_score = torch.finfo(scores.dtype).min
            for i in range(scores.size(0)):
                if last_tokens[i].item() not in self.trigger_token_ids:
                    continue
                original_scores = scores[i].clone()
                scores[i].fill_(min_score)
                scores[i, self.vocab_start:self.vocab_end] = original_scores[self.vocab_start:self.vocab_end]
                for start, end in self.other_slices:
                    scores[i, start:end] = original_scores[start:end]
                if self.force_greedy:
                    max_token_id = scores[i].argmax().item()
                    scores[i].fill_(min_score)
                    scores[i, max_token_id] = 0
            return scores

    def _get_ratio_index_from_token(self, ratio_token_id: int, tokenizer) -> int:
        if hasattr(tokenizer, "get_all_ratio_token_ids"):
            ratio_token_ids = tokenizer.get_all_ratio_token_ids()
            try:
                ratio_index = ratio_token_ids.index(ratio_token_id)
            except ValueError as exc:
                raise ValueError(f"Unknown ratio token id {ratio_token_id}") from exc
        else:
            ratio_index = ratio_token_id - tokenizer.ratio_token_id(0)
        if ratio_index < 0 or ratio_index >= len(self.image_processor.vae_reso_group):
            raise ValueError(f"ratio_index {ratio_index} out of range for vae_reso_group")
        return ratio_index

    @torch.no_grad()
    def generate(
            self,
            inputs: Optional[torch.Tensor] = None,
            generation_config: Optional[GenerationConfig] = None,
            logits_processor: Optional[LogitsProcessorList] = None,
            stopping_criteria: Optional[StoppingCriteriaList] = None,
            prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], list[int]]] = None,
            synced_gpus: Optional[bool] = None,
            assistant_model: Optional["PreTrainedModel"] = None,
            streamer: Optional["BaseStreamer"] = None,
            negative_prompt_ids: Optional[torch.Tensor] = None,
            negative_prompt_attention_mask: Optional[torch.Tensor] = None,
            use_model_defaults: Optional[bool] = None,
            generator: Optional[list[torch.Generator]] = None,
            decode_text: bool = False,
            verbose: int = 0,
            stage_transitions: Optional[list[tuple[int, list[int]]]] = None,
            final_stop_tokens: Optional[list[int]] = None,
            **kwargs,
    ):
        gen_config = default(generation_config, self.generation_config)
        mode = kwargs.get("mode", "gen_text")
        output = kwargs["tokenizer_output"]
        indices = torch.where(output.tokens[0] == self._tokenizer.encode("<img>")[0])[0]
        if indices.shape[0] > 0:
            last_idx = indices[-1]
            self.post_token_len = int(output.tokens[0].shape[0] - 1 - last_idx)
        else:
            self.post_token_len = None
        # Log info
        if verbose >= 1:
            context = self._tokenizer.decode(output.tokens[0], skip_special_tokens=False)
            # Replace <img><img>...<img> with [<img>]{number}
            img_token = self._tokenizer.get_img_token()
            context = re.sub(f"({img_token})+", lambda m: f"[{img_token}]{{{len(m.group(0)) // 5}}}", context)
            info_list = [
                ("token shape", output.tokens.shape),
                ("context[0]", context),
            ]
            if mode == "gen_image":
                if generator is not None:
                    info_list.extend([
                        ("seed", [g.initial_seed() for g in generator]),
                    ])
                info_list.extend([
                    ("image_size",
                     [f"{info.image_height}x{info.image_width}" for info in kwargs["batch_gen_image_info"]]),
                    ("infer_steps", gen_config.diff_infer_steps),
                    ("guidance_scale", gen_config.diff_guidance_scale),
                    ("flow_shift", gen_config.flow_shift),
                ])
            else:
                info_list.extend([
                    ("do_sample", kwargs.get("do_sample", gen_config.do_sample)),
                    ("max_new_tokens", kwargs.get("max_new_tokens", gen_config.max_new_tokens)),
                    ("top_k", kwargs.get("top_k", gen_config.top_k)),
                    ("top_p", kwargs.get("top_p", gen_config.top_p)),
                    ("temperature", kwargs.get("temperature", gen_config.temperature)),
                    ("repetition_penalty", kwargs.get("repetition_penalty", gen_config.repetition_penalty)),
                ])
            max_key_len = max(len(k) for k, _ in info_list)
            info_str = "=" * 50 + \
                       "\nModel input info:\n" + \
                       "\n".join([f"    {k.rjust(max_key_len)}: {v}" for k, v in info_list]) + \
                       "\n--------------------------------------------------"
            print(info_str, flush=True)
            start_time = time.time()

        if mode == "gen_text":
            if verbose >= 2 and streamer is None:
                streamer = TextStreamer(self._tokenizer, skip_prompt=True, skip_special_tokens=False)   # noqa

            with torch.autocast(device_type="cuda", dtype=self.dtype, enabled=self.dtype != torch.float32):
                if stage_transitions is not None:
                    if final_stop_tokens is None:
                        raise ValueError("`final_stop_tokens` must be provided when `stage_transitions` is set.")
                    if logits_processor is None:
                        logits_processor = LogitsProcessorList()
                    elif not isinstance(logits_processor, LogitsProcessorList):
                        logits_processor = LogitsProcessorList(logits_processor)
                    input_ids = kwargs.get("input_ids")
                    if input_ids is None:
                        raise ValueError("`input_ids` must be provided for multi-stage generation.")
                    logits_processor.append(
                        self._StageTransitionLogitsProcessor(stage_transitions, input_ids.shape[0])
                    )
                    kwargs["eos_token_id"] = final_stop_tokens

                samples = super().generate(
                    inputs=inputs,
                    generation_config=gen_config,
                    logits_processor=logits_processor,
                    stopping_criteria=stopping_criteria,
                    prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
                    synced_gpus=synced_gpus,
                    assistant_model=assistant_model,
                    streamer=streamer,
                    negative_prompt_ids=negative_prompt_ids,
                    negative_prompt_attention_mask=negative_prompt_attention_mask,
                    use_model_defaults=use_model_defaults,
                    **kwargs,
                )
                if decode_text:
                    samples = self.decode_text(samples, input_length=kwargs["input_ids"].shape[1])

        elif mode == "gen_image":
            batch_gen_image_info: list[ImageInfo] = kwargs.get("batch_gen_image_info")
            if batch_gen_image_info is None:
                raise ValueError("`batch_gen_image_info` should be provided when `mode` is `gen_image`.")
            self.num_image_tokens = (batch_gen_image_info[0].image_token_length) 
            #                       + (1 if batch_gen_image_info[0].add_timestep_token else 0)
            #                       + (1 if batch_gen_image_info[0].add_guidance_token else 0) )
            self.num_special_tokens = ((1 if batch_gen_image_info[0].add_timestep_token else 0) + 
                                       (1 if batch_gen_image_info[0].add_guidance_token else 0) +
                                       (1 if batch_gen_image_info[0].add_timestep_r_token else 0) )
            results = self.pipeline(
                batch_size=len(batch_gen_image_info),
                image_size=[batch_gen_image_info[0].image_height, batch_gen_image_info[0].image_width],
                num_inference_steps=gen_config.diff_infer_steps,
                guidance_scale=gen_config.diff_guidance_scale,
                generator=generator,
                meanflow=self.config.use_meanflow,
                model_kwargs=kwargs,
                cfg_distilled = self.config.cfg_distilled,
            )
            samples = results[0]

        else:
            raise ValueError(f"Unknown mode {mode}, only `gen_text` and `gen_image` are supported.")

        if verbose >= 1:
            end_time = time.time()
            print(f"Generation completed in {end_time - start_time:.2f} seconds.", flush=True)  # noqa

        return samples

    def decode_text(self, output: torch.Tensor, input_length: int = None):
        if output.ndim == 2:
            assert output.size(0) == 1, "Batch decoding is not supported yet."
            return [self.decode_text(output_i, input_length) for output_i in output]
        elif output.ndim == 1:
            if input_length is not None:
                output = output[input_length:]
            text = self._tokenizer.decode(output)
            return text
        else:
            raise ValueError(f"output should be 1D or 2D tensor, but got {output.ndim}D tensor.")

    @torch.no_grad()
    def generate_image(
            self,
            prompt=None,
            image=None,
            message_list=None,
            seed=None,
            image_size="auto",
            use_system_prompt=None,
            system_prompt=None,
            bot_task=None,
            infer_align_image_size=False,
            use_taylor_cache=False,
            taylor_cache_interval=None,
            taylor_cache_order=None,
            taylor_cache_enable_first_enhance=None,
            taylor_cache_first_enhance_steps=None,
            taylor_cache_enable_tailing_enhance=None,
            taylor_cache_tailing_enhance_steps=None,
            taylor_cache_low_freqs_order=None,
            taylor_cache_high_freqs_order=None,
            **kwargs,
    ):
        max_new_tokens = kwargs.pop("max_new_tokens", 2048)
        cot_text = kwargs.pop("cot_text", None)

        use_system_prompt = default(use_system_prompt, self.generation_config.use_system_prompt)
        bot_task = default(bot_task, self.generation_config.bot_task)
        system_prompt = get_system_prompt(use_system_prompt, bot_task, system_prompt)
        system_prompt = system_prompt.strip() if system_prompt is not None else ""

        self.taylor_cache_interval = taylor_cache_interval
        self.taylor_cache_order = taylor_cache_order
        self.taylor_cache_enable_first_enhance = taylor_cache_enable_first_enhance
        self.taylor_cache_first_enhance_steps = taylor_cache_first_enhance_steps
        self.taylor_cache_enable_tailing_enhance = taylor_cache_enable_tailing_enhance
        self.taylor_cache_tailing_enhance_steps = taylor_cache_tailing_enhance_steps
        self.taylor_cache_low_freqs_order = taylor_cache_low_freqs_order 
        self.taylor_cache_high_freqs_order = taylor_cache_high_freqs_order
        self.use_taylor_cache = False

        batch_cond_images_cache = None
        tkw = self._tokenizer
        need_ratio = image_size == "auto" or bot_task == "img_ratio"
        if bot_task in ["think", "recaption", "think_recaption"]:
            first_bot_task = bot_task.split("_")[0]
            stage_transitions = []

            if first_bot_task == "think" and "recaption" in bot_task:
                stage_transitions.append(
                    (tkw.end_of_think_token_id, [tkw.convert_tokens_to_ids(tkw.recaption_token)])
                )

            if need_ratio:
                answer_prefix_tokens = []
                if getattr(self.generation_config, "sequence_template", "pretrain") == "instruct":
                    answer_prefix_tokens = [tkw.convert_tokens_to_ids(tkw.answer_token)]
                image_base_size = self.image_processor.vae_reso_group.base_size
                if "recaption" in bot_task:
                    transition_id = tkw.end_of_recaption_token_id
                else:
                    transition_id = tkw.end_of_think_token_id
                stage_transitions.append(
                    (transition_id, answer_prefix_tokens + [tkw.boi_token_id, tkw.size_token_id(image_base_size)])
                )
                final_stop_tokens = list(range(tkw.start_ratio_token_id, tkw.end_ratio_token_id + 1))
                for start, end in getattr(tkw, "ratio_token_other_slices", []):
                    final_stop_tokens.extend(range(start, end))
            else:
                if "recaption" in bot_task:
                    final_stop_tokens = [tkw.end_of_recaption_token_id]
                else:
                    final_stop_tokens = [tkw.end_of_think_token_id, tkw.end_of_recaption_token_id]
                    
            model_inputs = self.prepare_model_inputs(
                prompt=prompt, image=image, message_list=message_list, system_prompt=system_prompt,
                max_new_tokens=max_new_tokens, mode="gen_text", bot_task=first_bot_task,
                batch_cond_images=batch_cond_images_cache, infer_align_image_size=infer_align_image_size,
            )
            batch_cond_images_cache = model_inputs['batch_cond_images']
            logits_processor = None
            if need_ratio:
                image_base_size = self.image_processor.vae_reso_group.base_size
                logits_processor = LogitsProcessorList([
                    self._ConditionalSliceVocabLogitsProcessor(
                        trigger_token_ids=[tkw.size_token_id(image_base_size)],
                        vocab_start=tkw.start_ratio_token_id,
                        vocab_end=tkw.end_ratio_token_id + 1,
                        other_slices=getattr(tkw, "ratio_token_other_slices", []),
                        force_greedy=True,
                    )
                ])

            input_length = model_inputs["input_ids"].shape[1]
            if stage_transitions:
                outputs = self.generate(
                    **model_inputs,
                    decode_text=False,
                    stage_transitions=stage_transitions,
                    final_stop_tokens=final_stop_tokens,
                    logits_processor=logits_processor,
                    **kwargs,
                )
            else:
                outputs = self.generate(**model_inputs, decode_text=False, logits_processor=logits_processor, **kwargs)
             
            generated_tokens = outputs[:, input_length:]
            if "recaption" in bot_task:
                end_token_id = tkw.end_of_recaption_token_id
            else:
                end_token_id = tkw.end_of_think_token_id
            end_positions = (generated_tokens[0] == end_token_id).nonzero(as_tuple=False)
            if end_positions.numel() > 0:
                end_pos = end_positions[0].item()
                cot_tokens = generated_tokens[0, :end_pos + 1]
            else:
                cot_tokens = generated_tokens[0]
            cot_text_gen = self._tokenizer.decode(cot_tokens)

            if first_bot_task == "think":
                cot_text = [tkw.think_token + cot_text_gen]
            else:
                cot_text = [tkw.recaption_token + cot_text_gen]

            if self.generation_config.drop_think and tkw.think_token in cot_text[0]:
                if tkw.recaption_token in cot_text[0]:
                    recaption_part = cot_text[0].split(tkw.recaption_token)[1]
                    if tkw.end_of_recaption_token in recaption_part:
                        recaption_part = recaption_part.split(tkw.end_of_recaption_token)[0]
                    cot_text = [tkw.recaption_token + recaption_part + tkw.end_of_recaption_token]

                    if system_prompt:
                        system_prompt = get_system_prompt("en_recaption", bot_task)

            if need_ratio:
                ratio_token_id = outputs[0, -1].item()  # get the original ratio index from the generated tokens
                ratio_index = self._get_ratio_index_from_token(ratio_token_id, tkw)
                reso = self.image_processor.vae_reso_group[ratio_index]
                image_size = reso.height, reso.width

        elif need_ratio:
            self.image_processor.build_img_ratio_slice_logits_proc(self.tokenizer)
            model_inputs = self.prepare_model_inputs(
                prompt=prompt, image=image, cot_text=cot_text, message_list=message_list, max_new_tokens=1,
                system_prompt=system_prompt, seed=seed, mode="gen_text", bot_task="img_ratio",
                batch_cond_images=batch_cond_images_cache, infer_align_image_size=infer_align_image_size,
            )
            batch_cond_images_cache = model_inputs['batch_cond_images']
            outputs = self.generate(**model_inputs, do_sample=False, logits_processor=self.image_processor.img_ratio_slice_logits_processor, **kwargs)
            ratio_index = outputs[0, -1].item()
            reso = self.image_processor.vae_reso_group[ratio_index]
            image_size = reso.height, reso.width

        # Generate image
        self.use_taylor_cache = use_taylor_cache
        model_inputs = self.prepare_model_inputs(
            prompt=prompt, image=image, cot_text=cot_text, message_list=message_list, system_prompt=system_prompt,
            seed=seed, image_size=image_size, mode="gen_image", batch_cond_images=batch_cond_images_cache,
            infer_align_image_size=infer_align_image_size,
        )
        batch_cond_images_cache = model_inputs['batch_cond_images']
        outputs = self.generate(**model_inputs, **kwargs)
        self.image_processor.postprocess_outputs(
            outputs,
            batch_cond_images=batch_cond_images_cache,
            infer_align_image_size=infer_align_image_size,
        )
        return cot_text, outputs


__all__ = [
    "HunyuanImage3ForCausalMM",
    "HunyuanImage3Model",
    "HunyuanImage3PreTrainedModel",
    "TimestepEmbedder",
    "UNetDown",
    "UNetUp"
    "CachedRoPE",
    "apply_rotary_pos_emb",
    "build_batch_2d_rope",
]