From 659d5b1c8282bc227adb7d7d70eaaa83a65f3cdc Mon Sep 17 00:00:00 2001 From: yujiedeng Date: Tue, 2 Jun 2026 17:58:03 +0000 Subject: [PATCH] fix: set rope_max_timescale to 1M for qwen3-30b-a3b-base and update HF configuration/shape mappings --- .../utils/hf_model_configs.py | 36 ++++++++++++++++++- .../checkpoint_conversion/utils/hf_shape.py | 1 + .../configs/models/qwen3-30b-a3b-base.yml | 2 +- 3 files changed, 37 insertions(+), 2 deletions(-) diff --git a/src/maxtext/checkpoint_conversion/utils/hf_model_configs.py b/src/maxtext/checkpoint_conversion/utils/hf_model_configs.py index 7bf36731df..a4b26cf65e 100644 --- a/src/maxtext/checkpoint_conversion/utils/hf_model_configs.py +++ b/src/maxtext/checkpoint_conversion/utils/hf_model_configs.py @@ -718,6 +718,40 @@ vocab_size=151936, ) +qwen3_30b_a3b_base_config = transformers.Qwen3MoeConfig( + architectures=["Qwen3MoeForCausalLM"], + attention_bias=False, + attention_dropout=0.0, + bos_token_id=151643, + decoder_sparse_step=1, + eos_token_id=151645, + head_dim=128, + hidden_act="silu", + hidden_size=2048, + initializer_range=0.02, + intermediate_size=6144, + max_position_embeddings=262144, + max_window_layers=48, + model_type="qwen3_moe", + moe_intermediate_size=768, + norm_topk_prob=True, + num_attention_heads=32, + num_experts=128, + num_experts_per_tok=8, + num_hidden_layers=48, + num_key_value_heads=4, + output_router_logits=False, + rms_norm_eps=1e-06, + rope_scaling=None, + rope_theta=1000000, + router_aux_loss_coef=0.001, + sliding_window=None, + tie_word_embeddings=False, + torch_dtype="bfloat16", + use_cache=True, + vocab_size=151936, +) + qwen3_235b_a22b_thinking_2507_config = transformers.Qwen3MoeConfig( architectures=["Qwen3MoeForCausalLM"], attention_bias=False, @@ -1579,7 +1613,7 @@ def __init__(self, **kwargs): "llama3.1-70b": llama31_70b_config, "llama3.1-405b": llama31_405b_config, "qwen3-30b-a3b": qwen3_30b_a3b_thinking_2507_config, - "qwen3-30b-a3b-base": qwen3_30b_a3b_thinking_2507_config, + "qwen3-30b-a3b-base": qwen3_30b_a3b_base_config, "qwen3-235b-a22b": qwen3_235b_a22b_thinking_2507_config, "qwen3-480b-a35b": qwen3_coder_480b_a35b_config, "deepseek2-16b": deepseek2_16b_config, diff --git a/src/maxtext/checkpoint_conversion/utils/hf_shape.py b/src/maxtext/checkpoint_conversion/utils/hf_shape.py index 8589af8092..35e0ea5a99 100644 --- a/src/maxtext/checkpoint_conversion/utils/hf_shape.py +++ b/src/maxtext/checkpoint_conversion/utils/hf_shape.py @@ -1131,6 +1131,7 @@ def MIXTRAL_HF_WEIGHTS_TO_SHAPE(config): "llama3.1-70b": LLAMA31_HF_WEIGHTS_TO_SHAPE, "llama3.1-405b": LLAMA31_HF_WEIGHTS_TO_SHAPE, "qwen3-30b-a3b": QWEN_HF_WEIGHTS_TO_SHAPE, + "qwen3-30b-a3b-base": QWEN_HF_WEIGHTS_TO_SHAPE, "qwen3-235b-a22b": QWEN_HF_WEIGHTS_TO_SHAPE, "qwen3-480b-a35b": QWEN_HF_WEIGHTS_TO_SHAPE, "deepseek2-16b": DEEPSEEK_HF_WEIGHTS_TO_SHAPE, diff --git a/src/maxtext/configs/models/qwen3-30b-a3b-base.yml b/src/maxtext/configs/models/qwen3-30b-a3b-base.yml index 723e4ed6d2..06b3b50729 100644 --- a/src/maxtext/configs/models/qwen3-30b-a3b-base.yml +++ b/src/maxtext/configs/models/qwen3-30b-a3b-base.yml @@ -34,7 +34,7 @@ base_moe_mlp_dim: 768 norm_topk_prob: true # RoPE Settings -rope_max_timescale: 10_000_000 +rope_max_timescale: 1_000_000 # General Model Settings enable_dropout: false