| cfg_file: | |
| precision: 'bf16-mixed' # options: '16-mixed' or 'bf16-mixed' | |
| min_dur: 80 | |
| max_dur: 240 | |
| sr: 48000 | |
| pretrained_path: ${dynamic_path:???/songbloom_full_240s.pt} | |
| continue_checkpoint: | |
| train_dataset: | |
| lyric_processor: phoneme | |
| prompt_len: 10 | |
| vae: | |
| vae_cfg: ${dynamic_path:???/stable_audio_1920_vae.json} | |
| vae_ckpt: ${dynamic_path:???/autoencoder_music_dsp1920.ckpt} | |
| sr: ${sr} | |
| model: | |
| block_size: 16 | |
| latent_dim: 64 | |
| dim: 1536 | |
| num_heads: 24 | |
| lm_layers: 36 | |
| diff_layers: 12 | |
| num_pitch: 16384 | |
| time_cond_type: prepend | |
| timestep_features_dim: 256 | |
| diffusion_objective: rectified_flow | |
| timestep_sampler: logit_normal | |
| backend: llama | |
| rotary_base_val: 40000 | |
| init_std: 0.02 | |
| h_dropout: 0.05 | |
| condition_provider_cfg: | |
| prompt_wav: | |
| type: audio_tokenizer_wrapper | |
| output_dim: ${model.dim} | |
| audio_tokenizer: | |
| max_len: 250 # 25.0 (presumably frames per second) * 10 s prompt; cf. prompt_len: 10 | |
| lyrics: | |
| type: phoneme_tokenizer | |
| output_dim: ${model.dim} | |
| vocab_list: ${load_yaml:${dynamic_path:???/vocab_g2p.yaml}} | |
| max_len: 1000 | |
| max_sentence_per_structure: 50 | |
| mode: sum | |
| cfg_dropout: 0.1 | |
| attribute_dropout: | |
| text: | |
| lyrics: 0. | |
| wav: | |
| prompt_wav: 0.1 | |
| fuser_cfg: | |
| cross_attention_pos_emb: false | |
| cross_attention_pos_emb_scale: 1 | |
| sum: [] | |
| prepend: [lyrics, prompt_wav] | |
| cross: [] | |
| input_interpolate: [] | |
| inference: | |
| cfg_coef: 1.5 | |
| temp: 0.9 | |
| diff_temp: 0.9 | |
| top_k: 100 | |
| penalty_repeat: True | |
| penalty_window: 50 | |
| steps: 36 | |
| dit_cfg_type: h |