Upload tiny DeepseekV3ForCausalLM

#2 opened by qgallouedec (HF Staff)
Files changed (3)
  1. config.json +29 -6
  2. generation_config.json +1 -1
  3. model.safetensors +2 -2
config.json CHANGED
@@ -7,16 +7,18 @@
   "bos_token_id": 0,
   "dtype": "bfloat16",
   "eos_token_id": 1,
+  "ep_size": 1,
   "first_k_dense_replace": 3,
   "head_dim": 64,
   "hidden_act": "silu",
-  "hidden_size": 8,
+  "hidden_size": 64,
   "initializer_range": 0.02,
-  "intermediate_size": 32,
+  "intermediate_size": 64,
   "kv_lora_rank": 512,
-  "max_position_embeddings": 4096,
+  "max_position_embeddings": 163840,
   "model_type": "deepseek_v3",
   "moe_intermediate_size": 2048,
+  "moe_layer_freq": 1,
   "n_group": 8,
   "n_routed_experts": 256,
   "n_shared_experts": 1,
@@ -25,20 +27,41 @@
   "num_experts_per_tok": 8,
   "num_hidden_layers": 2,
   "num_key_value_heads": 2,
+  "num_nextn_predict_layers": 1,
   "pretraining_tp": 1,
   "q_lora_rank": 1536,
   "qk_head_dim": 192,
   "qk_nope_head_dim": 128,
   "qk_rope_head_dim": 64,
+  "quantization_config": {
+    "activation_scheme": "dynamic",
+    "modules_to_not_convert": null,
+    "quant_method": "fp8",
+    "weight_block_size": [
+      32,
+      32
+    ]
+  },
   "rms_norm_eps": 1e-06,
   "rope_interleave": true,
-  "rope_scaling": null,
+  "rope_scaling": {
+    "beta_fast": 32.0,
+    "beta_slow": 1.0,
+    "factor": 40.0,
+    "mscale": 1.0,
+    "mscale_all_dim": 1.0,
+    "original_max_position_embeddings": 4096,
+    "rope_type": "yarn",
+    "type": "yarn"
+  },
   "rope_theta": 10000.0,
   "routed_scaling_factor": 2.5,
+  "scoring_func": "sigmoid",
   "tie_word_embeddings": false,
   "topk_group": 4,
-  "transformers_version": "4.57.3",
+  "topk_method": "noaux_tc",
+  "transformers_version": "4.56.2",
   "use_cache": true,
   "v_head_dim": 128,
-  "vocab_size": 128815
+  "vocab_size": 129280
 }
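The new config turns the placeholder into a closer miniature of the real DeepSeek-V3 setup: YaRN rope scaling (the 163840-token window is exactly the 4096-token pretraining window times the scaling factor of 40), block-wise FP8 quantization metadata (32×32 weight blocks), and the sigmoid/noaux_tc router settings. A minimal sketch, assuming a transformers install that ships `DeepseekV3Config` (the diff pins 4.56.2), of rebuilding the updated config locally to sanity-check those values:

```python
from transformers import DeepseekV3Config

# Values copied from the updated config.json; anything not passed here
# keeps the DeepseekV3Config defaults.
cfg = DeepseekV3Config(
    hidden_size=64,
    intermediate_size=64,
    num_hidden_layers=2,
    num_key_value_heads=2,
    vocab_size=129280,
    max_position_embeddings=163840,
    first_k_dense_replace=3,
    n_routed_experts=256,
    num_experts_per_tok=8,
    rope_scaling={
        "rope_type": "yarn",
        "factor": 40.0,
        "original_max_position_embeddings": 4096,
        "beta_fast": 32.0,
        "beta_slow": 1.0,
        "mscale": 1.0,
        "mscale_all_dim": 1.0,
    },
)

# YaRN stretches the pretraining context window by `factor`:
# 4096 * 40 = 163840, matching max_position_embeddings above.
scaling = cfg.rope_scaling
assert (
    scaling["original_max_position_embeddings"] * int(scaling["factor"])
    == cfg.max_position_embeddings
)
```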
generation_config.json CHANGED
@@ -5,5 +5,5 @@
   "eos_token_id": 1,
   "temperature": 0.6,
   "top_p": 0.95,
-  "transformers_version": "4.57.3"
+  "transformers_version": "4.56.2"
 }
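The generation settings themselves are unchanged; only the pinned transformers version moves. For reference, a hedged sketch of building the same sampling setup by hand (`do_sample=True` is an assumption, not in the diff):

```python
from transformers import GenerationConfig

# Sampling parameters from generation_config.json. do_sample=True is an
# assumption added here, since temperature and top_p are ignored under
# greedy decoding.
gen_cfg = GenerationConfig(
    eos_token_id=1,
    temperature=0.6,
    top_p=0.95,
    do_sample=True,
)
print(gen_cfg)
```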
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1bb900594a8201ed0be43c96f9d402137954b73d19c677167a24e4af1440be95
-size 11036056
+oid sha256:1d3fbce5e2dd79a4eb21e3c846d08f28cb4599735a658fa778565fbc84f86221
+size 36892336
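The weights change is only visible through the Git LFS pointer, which records the blob's SHA-256 and byte size; the file grows from ~11 MB to ~37 MB, consistent with the larger hidden size and vocabulary. A minimal sketch for checking a downloaded copy against the new pointer (the local path is an assumption):

```python
import hashlib
import os

# Expected values from the new LFS pointer above.
EXPECTED_OID = "1d3fbce5e2dd79a4eb21e3c846d08f28cb4599735a658fa778565fbc84f86221"
EXPECTED_SIZE = 36_892_336

path = "model.safetensors"  # local download path (assumption)

assert os.path.getsize(path) == EXPECTED_SIZE, "size mismatch"

sha = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha.update(chunk)
assert sha.hexdigest() == EXPECTED_OID, "sha256 mismatch"
print("pointer matches")
```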