AI-Hypercomputer · JamesDeng42 · Jun 2, 2026 · gagika · Jun 2, 2026
@@ -718,6 +718,40 @@
     vocab_size=151936,
 )
 
+qwen3_30b_a3b_base_config = transformers.Qwen3MoeConfig(  # HF config served for the "qwen3-30b-a3b-base" key
+    architectures=["Qwen3MoeForCausalLM"],
+    attention_bias=False,
+    attention_dropout=0.0,
+    bos_token_id=151643,
+    decoder_sparse_step=1,
+    eos_token_id=151645,  # NOTE(review): confirm against the base checkpoint's upstream config.json
+    head_dim=128,
+    hidden_act="silu",
+    hidden_size=2048,
+    initializer_range=0.02,
+    intermediate_size=6144,
+    max_position_embeddings=32768,  # was 262144; base model card lists a 32,768-token context - confirm upstream
+    max_window_layers=48,
+    model_type="qwen3_moe",
+    moe_intermediate_size=768,
+    norm_topk_prob=True,
+    num_attention_heads=32,
+    num_experts=128,
+    num_experts_per_tok=8,
+    num_hidden_layers=48,
+    num_key_value_heads=4,
+    output_router_logits=False,
+    rms_norm_eps=1e-06,
+    rope_scaling=None,
+    rope_theta=1000000,
+    router_aux_loss_coef=0.001,
+    sliding_window=None,
+    tie_word_embeddings=False,
+    torch_dtype="bfloat16",
+    use_cache=True,
+    vocab_size=151936,
+)
+
 qwen3_235b_a22b_thinking_2507_config = transformers.Qwen3MoeConfig(
     architectures=["Qwen3MoeForCausalLM"],
     attention_bias=False,
@@ -1579,7 +1613,7 @@ def __init__(self, **kwargs):
     "llama3.1-70b": llama31_70b_config,
     "llama3.1-405b": llama31_405b_config,
     "qwen3-30b-a3b": qwen3_30b_a3b_thinking_2507_config,
-    "qwen3-30b-a3b-base": qwen3_30b_a3b_thinking_2507_config,
+    "qwen3-30b-a3b-base": qwen3_30b_a3b_base_config,
     "qwen3-235b-a22b": qwen3_235b_a22b_thinking_2507_config,
     "qwen3-480b-a35b": qwen3_coder_480b_a35b_config,
     "deepseek2-16b": deepseek2_16b_config,

@@ -1131,6 +1131,7 @@ def MIXTRAL_HF_WEIGHTS_TO_SHAPE(config):
     "llama3.1-70b": LLAMA31_HF_WEIGHTS_TO_SHAPE,
     "llama3.1-405b": LLAMA31_HF_WEIGHTS_TO_SHAPE,
     "qwen3-30b-a3b": QWEN_HF_WEIGHTS_TO_SHAPE,
+    "qwen3-30b-a3b-base": QWEN_HF_WEIGHTS_TO_SHAPE,
     "qwen3-235b-a22b": QWEN_HF_WEIGHTS_TO_SHAPE,
     "qwen3-480b-a35b": QWEN_HF_WEIGHTS_TO_SHAPE,
     "deepseek2-16b": DEEPSEEK_HF_WEIGHTS_TO_SHAPE,

@@ -34,7 +34,7 @@ base_moe_mlp_dim: 768
 norm_topk_prob: true
 
 # RoPE Settings
-rope_max_timescale: 10_000_000
+rope_max_timescale: 1_000_000
 
 # General Model Settings
 enable_dropout: false