| { |
| "architectures": [ |
| "MossAudioTokenizerModel" |
| ], |
| "auto_map": { |
| "AutoConfig": "configuration_moss_audio_tokenizer.MossAudioTokenizerConfig", |
| "AutoModel": "modeling_moss_audio_tokenizer.MossAudioTokenizerModel" |
| }, |
| "causal_transformer_context_duration": 10, |
| "code_dim": 768, |
| "decoder_kwargs": [ |
| { |
| "causal": true, |
| "conv_layout": true, |
| "d_model": 1280, |
| "dim_feedforward": 5120, |
| "gating": "none", |
| "input_dimension": 768, |
| "layer_scale": 0.01, |
| "max_period": 10000, |
| "module_type": "Transformer", |
| "norm": "layer_norm", |
| "num_heads": 20, |
| "num_layers": 32, |
| "output_dimension": 1280, |
| "positional_embedding": "rope" |
| }, |
| { |
| "module_type": "PatchedPretransform", |
| "patch_size": 2 |
| }, |
| { |
| "causal": true, |
| "conv_layout": true, |
| "d_model": 768, |
| "dim_feedforward": 3072, |
| "gating": "none", |
| "input_dimension": 640, |
| "layer_scale": 0.01, |
| "max_period": 10000, |
| "module_type": "Transformer", |
| "norm": "layer_norm", |
| "num_heads": 12, |
| "num_layers": 12, |
| "output_dimension": 768, |
| "positional_embedding": "rope" |
| }, |
| { |
| "module_type": "PatchedPretransform", |
| "patch_size": 2 |
| }, |
| { |
| "causal": true, |
| "conv_layout": true, |
| "d_model": 768, |
| "dim_feedforward": 3072, |
| "gating": "none", |
| "input_dimension": 384, |
| "layer_scale": 0.01, |
| "max_period": 10000, |
| "module_type": "Transformer", |
| "norm": "layer_norm", |
| "num_heads": 12, |
| "num_layers": 12, |
| "output_dimension": 768, |
| "positional_embedding": "rope" |
| }, |
| { |
| "module_type": "PatchedPretransform", |
| "patch_size": 2 |
| }, |
| { |
| "causal": true, |
| "conv_layout": true, |
| "d_model": 768, |
| "dim_feedforward": 3072, |
| "gating": "none", |
| "input_dimension": 384, |
| "layer_scale": 0.01, |
| "max_period": 10000, |
| "module_type": "Transformer", |
| "norm": "layer_norm", |
| "num_heads": 12, |
| "num_layers": 12, |
| "output_dimension": 240, |
| "positional_embedding": "rope" |
| }, |
| { |
| "module_type": "PatchedPretransform", |
| "patch_size": 240 |
| } |
| ], |
| "downsample_rate": 1920, |
| "dtype": "float32", |
| "encoder_kwargs": [ |
| { |
| "module_type": "PatchedPretransform", |
| "patch_size": 240 |
| }, |
| { |
| "causal": true, |
| "conv_layout": true, |
| "d_model": 768, |
| "dim_feedforward": 3072, |
| "gating": "none", |
| "input_dimension": 240, |
| "layer_scale": 0.01, |
| "max_period": 10000, |
| "module_type": "Transformer", |
| "norm": "layer_norm", |
| "num_heads": 12, |
| "num_layers": 12, |
| "output_dimension": 384, |
| "positional_embedding": "rope" |
| }, |
| { |
| "module_type": "PatchedPretransform", |
| "patch_size": 2 |
| }, |
| { |
| "causal": true, |
| "conv_layout": true, |
| "d_model": 768, |
| "dim_feedforward": 3072, |
| "gating": "none", |
| "input_dimension": 768, |
| "layer_scale": 0.01, |
| "max_period": 10000, |
| "module_type": "Transformer", |
| "norm": "layer_norm", |
| "num_heads": 12, |
| "num_layers": 12, |
| "output_dimension": 384, |
| "positional_embedding": "rope" |
| }, |
| { |
| "module_type": "PatchedPretransform", |
| "patch_size": 2 |
| }, |
| { |
| "causal": true, |
| "conv_layout": true, |
| "d_model": 768, |
| "dim_feedforward": 3072, |
| "gating": "none", |
| "input_dimension": 768, |
| "layer_scale": 0.01, |
| "max_period": 10000, |
| "module_type": "Transformer", |
| "norm": "layer_norm", |
| "num_heads": 12, |
| "num_layers": 12, |
| "output_dimension": 640, |
| "positional_embedding": "rope" |
| }, |
| { |
| "module_type": "PatchedPretransform", |
| "patch_size": 2 |
| }, |
| { |
| "causal": true, |
| "conv_layout": true, |
| "d_model": 1280, |
| "dim_feedforward": 5120, |
| "gating": "none", |
| "input_dimension": 1280, |
| "layer_scale": 0.01, |
| "max_period": 10000, |
| "module_type": "Transformer", |
| "norm": "layer_norm", |
| "num_heads": 20, |
| "num_layers": 32, |
| "output_dimension": 768, |
| "positional_embedding": "rope" |
| } |
| ], |
| "model_type": "moss-audio-tokenizer", |
| "quantizer_kwargs": { |
| "codebook_dim": 8, |
| "codebook_size": 1024, |
| "input_dim": 768, |
| "num_quantizers": 32, |
| "output_dim": 768, |
| "quantizer_type": "rlfq", |
| "rvq_dim": 512 |
| }, |
| "quantizer_type": "rlfq", |
| "reversed_decoder_kwargs": [ |
| { |
| "module_type": "PatchedPretransform", |
| "patch_size": 240 |
| }, |
| { |
| "causal": true, |
| "conv_layout": true, |
| "d_model": 768, |
| "dim_feedforward": 3072, |
| "gating": "none", |
| "input_dimension": 240, |
| "layer_scale": 0.01, |
| "max_period": 10000, |
| "module_type": "Transformer", |
| "norm": "layer_norm", |
| "num_heads": 12, |
| "num_layers": 12, |
| "output_dimension": 384, |
| "positional_embedding": "rope" |
| }, |
| { |
| "module_type": "PatchedPretransform", |
| "patch_size": 2 |
| }, |
| { |
| "causal": true, |
| "conv_layout": true, |
| "d_model": 768, |
| "dim_feedforward": 3072, |
| "gating": "none", |
| "input_dimension": 768, |
| "layer_scale": 0.01, |
| "max_period": 10000, |
| "module_type": "Transformer", |
| "norm": "layer_norm", |
| "num_heads": 12, |
| "num_layers": 12, |
| "output_dimension": 384, |
| "positional_embedding": "rope" |
| }, |
| { |
| "module_type": "PatchedPretransform", |
| "patch_size": 2 |
| }, |
| { |
| "causal": true, |
| "conv_layout": true, |
| "d_model": 768, |
| "dim_feedforward": 3072, |
| "gating": "none", |
| "input_dimension": 768, |
| "layer_scale": 0.01, |
| "max_period": 10000, |
| "module_type": "Transformer", |
| "norm": "layer_norm", |
| "num_heads": 12, |
| "num_layers": 12, |
| "output_dimension": 640, |
| "positional_embedding": "rope" |
| }, |
| { |
| "module_type": "PatchedPretransform", |
| "patch_size": 2 |
| }, |
| { |
| "causal": true, |
| "conv_layout": true, |
| "d_model": 1280, |
| "dim_feedforward": 5120, |
| "gating": "none", |
| "input_dimension": 1280, |
| "layer_scale": 0.01, |
| "max_period": 10000, |
| "module_type": "Transformer", |
| "norm": "layer_norm", |
| "num_heads": 20, |
| "num_layers": 32, |
| "output_dimension": 768, |
| "positional_embedding": "rope" |
| } |
| ], |
| "sample_rate": 24000, |
| "sampling_rate": 24000, |
| "transformers_version": "4.56.0.dev0", |
| "version": "4.26.1.a" |
| } |
|
|