From ff40b116526a3ef14f5425703bbb187bd0ccf2f4 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 10 Jun 2026 23:15:43 +0000
Subject: [PATCH 1/5] Initial plan


From 862be328ec68313ce82c18c3e0671f310dd894c1 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 10 Jun 2026 23:41:51 +0000
Subject: [PATCH 2/5] Add public SentencePieceTokenizer.Create(vocab) and
 CreateFromTokenizerJson APIs

Co-authored-by: ericstj <8918108+ericstj@users.noreply.github.com>
---
 .../Model/SentencePieceBaseModel.cs           |  44 ++++
 .../Model/SentencePieceTokenizer.cs           | 238 ++++++++++++++++++
 .../Model/SentencePieceUnigramModel.cs        | 138 ++++++++++
 .../UnigramTests.cs                           | 103 ++++++++
 4 files changed, 523 insertions(+)
diff --git a/src/Microsoft.ML.Tokenizers/Model/SentencePieceBaseModel.cs b/src/Microsoft.ML.Tokenizers/Model/SentencePieceBaseModel.cs
index 5bd204f501..885f541c5b 100644
--- a/src/Microsoft.ML.Tokenizers/Model/SentencePieceBaseModel.cs
+++ b/src/Microsoft.ML.Tokenizers/Model/SentencePieceBaseModel.cs
@@ -59,6 +59,50 @@ internal SentencePieceBaseModel(ModelProto modelProto, bool addBos = false, bool
                                 specialTokens);
         }
 
+        internal SentencePieceBaseModel(
+            bool addBos, bool addEos,
+            string bosToken, int bosId,
+            string eosToken, int eosId,
+            string unkToken, int unkId,
+            bool addDummyPrefix, bool escapeWhiteSpaces,
+            bool treatWhitespaceAsSuffix, bool byteFallback,
+            ReadOnlySpan<byte> precompiledCharsmap, bool removeExtraWhitespaces,
+            IReadOnlyDictionary<string, int>? specialTokens)
+        {
+            // Token strings are stored as given; negative ids are clamped to 0.
+            BeginningOfSentenceToken = bosToken;
+            EndOfSentenceToken = eosToken;
+            UnknownToken = unkToken;
+            BeginningOfSentenceId = Math.Max(0, bosId);
+            EndOfSentenceId = Math.Max(0, eosId);
+            UnknownId = Math.Max(0, unkId);
+            AddBeginningOfSentence = addBos;
+            AddEndOfSentence = addEos;
+            AddDummyPrefix = addDummyPrefix;
+            EscapeWhiteSpaces = escapeWhiteSpaces;
+            TreatWhitespaceAsSuffix = treatWhitespaceAsSuffix;
+            ByteFallback = byteFallback;
+            SpecialTokens = specialTokens;
+
+            // Index the special tokens by text and by id, and build one alternation regex that matches any of them.
+            if (specialTokens is { Count: > 0 })
+            {
+                InternalSpecialTokens = new Dictionary<StringSpanOrdinalKey, int>();
+                SpecialTokensReverse = new Dictionary<int, string>();
+                foreach (KeyValuePair<string, int> pair in specialTokens)
+                {
+                    InternalSpecialTokens.Add(new StringSpanOrdinalKey(pair.Key), pair.Value);
+                    SpecialTokensReverse.Add(pair.Value, pair.Key);
+                }
+
+                string alternation = string.Join("|", specialTokens.Keys.Select(Regex.Escape));
+                SpecialTokensRegex = new Regex(alternation, RegexOptions.Compiled);
+            }
+
+            Normalizer = new SentencePieceNormalizer(precompiledCharsmap, removeExtraWhitespaces, addDummyPrefix,
+                                                     escapeWhiteSpaces, treatWhitespaceAsSuffix, specialTokens);
+        }
+
         internal Regex? SpecialTokensRegex { get; }
 
         internal Dictionary<StringSpanOrdinalKey, int>? InternalSpecialTokens { get; }
diff --git a/src/Microsoft.ML.Tokenizers/Model/SentencePieceTokenizer.cs b/src/Microsoft.ML.Tokenizers/Model/SentencePieceTokenizer.cs
index cb945d24fa..2176fba0fa 100644
--- a/src/Microsoft.ML.Tokenizers/Model/SentencePieceTokenizer.cs
+++ b/src/Microsoft.ML.Tokenizers/Model/SentencePieceTokenizer.cs
@@ -7,6 +7,7 @@
 using System.Buffers;
 using System.Collections.Generic;
 using System.IO;
+using System.Text.Json;
 
 namespace Microsoft.ML.Tokenizers
 {
@@ -30,6 +31,11 @@ internal SentencePieceTokenizer(ModelProto modelProto, bool addBos, bool addEos,
             };
         }
 
+        /// <summary>Initializes a tokenizer that wraps an already-constructed model.</summary>
+        /// <param name="model">The model instance to store in this tokenizer.</param>
+        private SentencePieceTokenizer(SentencePieceBaseModel model)
+            => _model = model;
+
         /// <summary>
         /// The special tokens.
         /// </summary>
@@ -457,5 +463,237 @@ public static SentencePieceTokenizer Create(
 
             return new SentencePieceTokenizer(modelProto, addBeginningOfSentence, addEndOfSentence, specialTokens);
         }
+
+        /// <summary>
+        /// Creates a Unigram <see cref="SentencePieceTokenizer"/> from an in-memory vocabulary of (piece, score) pairs.
+        /// </summary>
+        /// <param name="vocab">
+        /// The vocabulary as an ordered sequence of (piece, score) pairs. The position of each pair
+        /// in the sequence determines its token ID.
+        /// </param>
+        /// <param name="unkId">The index (token ID) of the unknown token in <paramref name="vocab"/>.</param>
+        /// <param name="addBeginningOfSentence">Whether to emit the beginning-of-sentence token during encoding.</param>
+        /// <param name="addEndOfSentence">Whether to emit the end-of-sentence token during encoding.</param>
+        /// <param name="precompiledCharsMap">
+        /// Optional precompiled character normalization map (as found in the SentencePiece <c>normalizer_spec.precompiled_charsmap</c>
+        /// field or in the Hugging Face <c>tokenizer.json</c> <c>normalizer.precompiled_charsmap</c> property).
+        /// Pass <see langword="default"/> to skip precompiled normalization.
+        /// </param>
+        /// <param name="addDummyPrefix">Whether to prepend the dummy whitespace prefix character (U+2581) at the start of the input.</param>
+        /// <param name="escapeWhiteSpaces">Whether to replace spaces with the dummy whitespace character (U+2581) during normalization.</param>
+        /// <param name="treatWhitespaceAsSuffix">Whether to emit the U+2581 character at the end of the last token rather than the beginning of the first token.</param>
+        /// <param name="specialTokens">Additional special tokens to recognize, supplied as a mapping of token string to token ID.</param>
+        /// <returns>A new <see cref="SentencePieceTokenizer"/> instance.</returns>
+        /// <remarks>
+        /// The beginning-of-sentence and end-of-sentence token IDs are auto-detected by looking for pieces
+        /// named <c>&lt;s&gt;</c> and <c>&lt;/s&gt;</c> in <paramref name="vocab"/>. If not found, positions 1 and 2
+        /// are used as fallbacks (the SentencePiece convention). Similarly, a <c>&lt;pad&gt;</c> piece is
+        /// detected automatically if present.
+        /// <para>
+        /// When creating the tokenizer, ensure that the vocabulary is sourced from a trusted provider.
+        /// </para>
+        /// </remarks>
+        public static SentencePieceTokenizer Create(
+            IEnumerable<(string Piece, float Score)> vocab,
+            int unkId,
+            bool addBeginningOfSentence = true,
+            bool addEndOfSentence = false,
+            ReadOnlySpan<byte> precompiledCharsMap = default,
+            bool addDummyPrefix = true,
+            bool escapeWhiteSpaces = true,
+            bool treatWhitespaceAsSuffix = false,
+            IReadOnlyDictionary<string, int>? specialTokens = null)
+        {
+            // Token IDs are positions in the sequence, so the model needs indexed access: reuse the
+            // caller's list when it already provides that, otherwise copy the sequence into a new list.
+            IReadOnlyList<(string Piece, float Score)> indexedPieces = vocab switch
+            {
+                null => throw new ArgumentNullException(nameof(vocab)),
+                IReadOnlyList<(string Piece, float Score)> alreadyIndexed => alreadyIndexed,
+                _ => new List<(string Piece, float Score)>(vocab),
+            };
+
+            // Extra-whitespace removal is always enabled on this path; there is no parameter for it.
+            return new SentencePieceTokenizer(new SentencePieceUnigramModel(
+                indexedPieces, unkId, addBeginningOfSentence, addEndOfSentence,
+                precompiledCharsMap, addDummyPrefix, escapeWhiteSpaces,
+                treatWhitespaceAsSuffix, removeExtraWhitespaces: true, specialTokens));
+        }
+
+        /// <summary>
+        /// Creates a Unigram <see cref="SentencePieceTokenizer"/> by parsing a Hugging Face <c>tokenizer.json</c>
+        /// that contains a Unigram model (<c>model.type == "Unigram"</c>).
+        /// </summary>
+        /// <param name="tokenizerJsonStream">A stream containing the UTF-8-encoded <c>tokenizer.json</c> content.</param>
+        /// <param name="addBeginningOfSentence">Whether to emit the beginning-of-sentence token during encoding.</param>
+        /// <param name="addEndOfSentence">Whether to emit the end-of-sentence token during encoding.</param>
+        /// <param name="specialTokens">Additional special tokens to recognize, supplied as a mapping of token string to token ID.</param>
+        /// <returns>A new <see cref="SentencePieceTokenizer"/> instance.</returns>
+        /// <exception cref="ArgumentNullException"><paramref name="tokenizerJsonStream"/> is <see langword="null"/>.</exception>
+        /// <exception cref="InvalidDataException">A required field is missing or has an unexpected JSON type, or <c>model.type</c> is not <c>Unigram</c>.</exception>
+        /// <remarks>
+        /// The following fields are read from the JSON:
+        /// <list type="bullet">
+        ///   <item><description><c>model.vocab</c> — array of <c>[piece, score]</c> pairs (required).</description></item>
+        ///   <item><description><c>model.unk_id</c> — integer index of the unknown token (required; <c>null</c> is rejected).</description></item>
+        ///   <item><description><c>normalizer.precompiled_charsmap</c> (base64) — normalization map; also searched inside a <c>Sequence</c> normalizer. A <c>null</c> normalizer is ignored.</description></item>
+        ///   <item><description><c>pre_tokenizer</c> of type <c>Metaspace</c> — <c>add_prefix_space</c> and <c>replacement</c>; also searched inside a <c>Sequence</c> pre-tokenizer. A <c>null</c> pre-tokenizer is ignored.</description></item>
+        /// </list>
+        /// <para>
+        /// When creating the tokenizer, ensure that the JSON stream is sourced from a trusted provider.
+        /// </para>
+        /// </remarks>
+        public static SentencePieceTokenizer CreateFromTokenizerJson(
+            Stream tokenizerJsonStream,
+            bool addBeginningOfSentence = true,
+            bool addEndOfSentence = false,
+            IReadOnlyDictionary<string, int>? specialTokens = null)
+        {
+            if (tokenizerJsonStream is null)
+            {
+                throw new ArgumentNullException(nameof(tokenizerJsonStream));
+            }
+
+            using JsonDocument doc = JsonDocument.Parse(tokenizerJsonStream);
+            JsonElement root = doc.RootElement;
+
+            // JsonElement.TryGetProperty throws on non-object elements, so check the kind before probing properties.
+            if (root.ValueKind != JsonValueKind.Object || !root.TryGetProperty("model", out JsonElement modelElement) ||
+                modelElement.ValueKind != JsonValueKind.Object)
+            {
+                throw new InvalidDataException("The tokenizer.json does not contain a 'model' property.");
+            }
+
+            if (modelElement.TryGetProperty("type", out JsonElement modelTypeElement) &&
+                (modelTypeElement.ValueKind != JsonValueKind.String || !string.Equals(modelTypeElement.GetString(), "Unigram", StringComparison.OrdinalIgnoreCase)))
+            {
+                throw new InvalidDataException($"Expected model type 'Unigram' but found '{modelTypeElement}'.");
+            }
+
+            // "unk_id" may be serialized as null (no unknown token); report that, and any non-integer, as invalid data.
+            if (!modelElement.TryGetProperty("unk_id", out JsonElement unkIdElement) ||
+                unkIdElement.ValueKind != JsonValueKind.Number || !unkIdElement.TryGetInt32(out int unkId))
+            {
+                throw new InvalidDataException("The tokenizer.json model does not contain a valid 'unk_id' property.");
+            }
+
+            if (!modelElement.TryGetProperty("vocab", out JsonElement vocabElement) ||
+                vocabElement.ValueKind != JsonValueKind.Array)
+            {
+                throw new InvalidDataException("The tokenizer.json model does not contain a valid 'vocab' array.");
+            }
+
+            List<(string Piece, float Score)> vocab = new List<(string Piece, float Score)>(vocabElement.GetArrayLength());
+            foreach (JsonElement entry in vocabElement.EnumerateArray())
+            {
+                if (entry.ValueKind != JsonValueKind.Array || entry.GetArrayLength() < 2 ||
+                    entry[0].ValueKind is not (JsonValueKind.String or JsonValueKind.Null) || entry[1].ValueKind != JsonValueKind.Number)
+                {
+                    throw new InvalidDataException("Each entry in 'model.vocab' must be a [piece, score] array.");
+                }
+
+                string? piece = entry[0].GetString();
+                if (piece is null)
+                {
+                    throw new InvalidDataException("A piece string in 'model.vocab' is null.");
+                }
+
+                vocab.Add((piece, entry[1].GetSingle()));
+            }
+
+            // Extract normalizer settings; a JSON null (or otherwise non-object) normalizer leaves nothing to extract.
+            byte[]? precompiledCharsMap = null;
+            bool addDummyPrefix = true;
+            if (root.TryGetProperty("normalizer", out JsonElement normalizerElement) && normalizerElement.ValueKind == JsonValueKind.Object)
+            {
+                precompiledCharsMap = ExtractPrecompiledCharsMap(normalizerElement);
+            }
+
+            // Extract pre_tokenizer settings; a JSON null (or otherwise non-object) pre_tokenizer keeps the defaults.
+            bool escapeWhiteSpaces = true;
+            bool treatWhitespaceAsSuffix = false;
+            if (root.TryGetProperty("pre_tokenizer", out JsonElement preTokenizerElement) && preTokenizerElement.ValueKind == JsonValueKind.Object)
+            {
+                ExtractMetaspaceSettings(preTokenizerElement, ref addDummyPrefix, ref escapeWhiteSpaces, ref treatWhitespaceAsSuffix);
+            }
+
+            return new SentencePieceTokenizer(new SentencePieceUnigramModel(
+                vocab, unkId, addBeginningOfSentence, addEndOfSentence, precompiledCharsMap,
+                addDummyPrefix, escapeWhiteSpaces, treatWhitespaceAsSuffix, removeExtraWhitespaces: true, specialTokens));
+        }
+
+        private static byte[]? ExtractPrecompiledCharsMap(JsonElement normalizer)
+        {
+            if (!normalizer.TryGetProperty("type", out JsonElement typeEl))
+            {
+                return null;
+            }
+
+            string? type = typeEl.GetString();
+            if (string.Equals(type, "Precompiled", StringComparison.OrdinalIgnoreCase))
+            {
+                if (normalizer.TryGetProperty("precompiled_charsmap", out JsonElement mapEl))
+                {
+                    string? base64 = mapEl.GetString();
+                    if (base64 is not null)
+                    {
+                        return Convert.FromBase64String(base64);
+                    }
+                }
+            }
+            else if (string.Equals(type, "Sequence", StringComparison.OrdinalIgnoreCase) &&
+                     normalizer.TryGetProperty("normalizers", out JsonElement normalizersEl) &&
+                     normalizersEl.ValueKind == JsonValueKind.Array)
+            {
+                foreach (JsonElement inner in normalizersEl.EnumerateArray())
+                {
+                    byte[]? result = ExtractPrecompiledCharsMap(inner);
+                    if (result is not null)
+                    {
+                        return result;
+                    }
+                }
+            }
+
+            return null;
+        }
+
+        private static void ExtractMetaspaceSettings(JsonElement preTokenizer, ref bool addDummyPrefix, ref bool escapeWhiteSpaces, ref bool treatWhitespaceAsSuffix)
+        {
+            if (!preTokenizer.TryGetProperty("type", out JsonElement typeEl))
+            {
+                return;
+            }
+
+            string? type = typeEl.GetString();
+            if (string.Equals(type, "Metaspace", StringComparison.OrdinalIgnoreCase))
+            {
+                if (preTokenizer.TryGetProperty("add_prefix_space", out JsonElement addPrefixEl))
+                {
+                    addDummyPrefix = addPrefixEl.GetBoolean();
+                }
+
+                if (preTokenizer.TryGetProperty("replacement", out JsonElement replacementEl))
+                {
+                    string? replacement = replacementEl.GetString();
+                    escapeWhiteSpaces = replacement == "\u2581"; // U+2581 LOWER ONE EIGHTH BLOCK (▁)
+                }
+
+                if (preTokenizer.TryGetProperty("prepend_scheme", out JsonElement prependSchemeEl))
+                {
+                    string? scheme = prependSchemeEl.GetString();
+                    treatWhitespaceAsSuffix = string.Equals(scheme, "last", StringComparison.OrdinalIgnoreCase);
+                }
+            }
+            else if (string.Equals(type, "Sequence", StringComparison.OrdinalIgnoreCase) &&
+                     preTokenizer.TryGetProperty("pretokenizers", out JsonElement preTokenizersEl) &&
+                     preTokenizersEl.ValueKind == JsonValueKind.Array)
+            {
+                foreach (JsonElement inner in preTokenizersEl.EnumerateArray())
+                {
+                    ExtractMetaspaceSettings(inner, ref addDummyPrefix, ref escapeWhiteSpaces, ref treatWhitespaceAsSuffix);
+                }
+            }
+        }
     }
 }
diff --git a/src/Microsoft.ML.Tokenizers/Model/SentencePieceUnigramModel.cs b/src/Microsoft.ML.Tokenizers/Model/SentencePieceUnigramModel.cs
index 3714206cf0..adc9f09f13 100644
--- a/src/Microsoft.ML.Tokenizers/Model/SentencePieceUnigramModel.cs
+++ b/src/Microsoft.ML.Tokenizers/Model/SentencePieceUnigramModel.cs
@@ -93,6 +93,144 @@ public SentencePieceUnigramModel(ModelProto modelProto, bool addBos, bool addEos
             }
         }
 
+        // Constructor that builds a Unigram model directly from a list of (piece, score) pairs.
+        // BOS, EOS, and PAD tokens are identified by their names ("<s>", "</s>", "<pad>") in the vocab;
+        // if not found by name, SentencePiece-conventional positions (1, 2, none) are used as fallbacks.
+        internal SentencePieceUnigramModel(
+            IReadOnlyList<(string Piece, float Score)> pieces,
+            int unkId,
+            bool addBos,
+            bool addEos,
+            ReadOnlySpan<byte> precompiledCharsmap,
+            bool addDummyPrefix,
+            bool escapeWhiteSpaces,
+            bool treatWhitespaceAsSuffix,
+            bool removeExtraWhitespaces,
+            IReadOnlyDictionary<string, int>? specialTokens)
+            : this(pieces, unkId, addBos, addEos, precompiledCharsmap, addDummyPrefix, escapeWhiteSpaces,
+                   treatWhitespaceAsSuffix, removeExtraWhitespaces, specialTokens,
+                   FindSpecialTokenId(pieces, "<s>", 1),
+                   FindSpecialTokenId(pieces, "</s>", 2),
+                   FindSpecialTokenId(pieces, "<pad>", -1))
+        {
+        }
+
+        private SentencePieceUnigramModel(
+            IReadOnlyList<(string Piece, float Score)> pieces,
+            int unkId,
+            bool addBos,
+            bool addEos,
+            ReadOnlySpan<byte> precompiledCharsmap,
+            bool addDummyPrefix,
+            bool escapeWhiteSpaces,
+            bool treatWhitespaceAsSuffix,
+            bool removeExtraWhitespaces,
+            IReadOnlyDictionary<string, int>? specialTokens,
+            int bosId, int eosId, int padId)
+            : base(addBos, addEos,
+                   bosId >= 0 && bosId < GetPieceCount(pieces) ? pieces[bosId].Piece : "<s>", bosId,
+                   eosId >= 0 && eosId < GetPieceCount(pieces) ? pieces[eosId].Piece : "</s>", eosId,
+                   GetPieceAtIndex(pieces, unkId), unkId,
+                   addDummyPrefix, escapeWhiteSpaces, treatWhitespaceAsSuffix, byteFallback: false,
+                   precompiledCharsmap, removeExtraWhitespaces, specialTokens)
+        {
+            Debug.Assert(pieces is not null);
+
+            _vocab = new SortedDictionary<string, int>(OrdinalUtf8StringComparer.Instance);
+            _vocabReverse = new (string Piece, float Score, ModelProto.Types.SentencePiece.Types.Type Type)[pieces!.Count];
+            _minScore = float.MaxValue;
+            _maxScore = float.MinValue;
+
+            for (int i = 0; i < pieces.Count; i++)
+            {
+                var (piece, score) = pieces[i];
+                if (i == unkId)
+                {
+                    _vocabReverse[i] = (piece, score, ModelProto.Types.SentencePiece.Types.Type.Unknown);
+                }
+                else if (i == bosId || i == eosId || (padId >= 0 && i == padId))
+                {
+                    _vocabReverse[i] = (piece, score, ModelProto.Types.SentencePiece.Types.Type.Control);
+                }
+                else
+                {
+                    _vocabReverse[i] = (piece, score, ModelProto.Types.SentencePiece.Types.Type.Normal);
+                    _vocab.Add(piece, i);
+                    _minScore = Math.Min(_minScore, score);
+                    _maxScore = Math.Max(_maxScore, score);
+                }
+            }
+
+            ByteCodeToIdOffset = _vocab.TryGetValue("<0x00>", out int id) ? id : MaxByteId;
+            OneByteUtf8EncodingMaxId = ByteCodeToIdOffset + 0x7F;
+            MaxIdByteFallbackId = ByteCodeToIdOffset + 0xFF;
+
+            _trie = new DoubleArrayTrie(_vocab);
+
+            // Add special tokens to vocab after trie is built.
+            string unkToken = pieces[unkId].Piece;
+            _vocab[unkToken] = unkId;
+            _vocabReverse[unkId] = (unkToken, 0f, ModelProto.Types.SentencePiece.Types.Type.Unknown);
+
+            if (bosId >= 0 && bosId < pieces.Count)
+            {
+                string bos = pieces[bosId].Piece;
+                _vocab[bos] = bosId;
+                _vocabReverse[bosId] = (bos, 0f, ModelProto.Types.SentencePiece.Types.Type.Control);
+            }
+
+            if (eosId >= 0 && eosId < pieces.Count)
+            {
+                string eos = pieces[eosId].Piece;
+                _vocab[eos] = eosId;
+                _vocabReverse[eosId] = (eos, 0f, ModelProto.Types.SentencePiece.Types.Type.Control);
+            }
+
+            if (padId >= 0 && padId < pieces.Count)
+            {
+                string pad = pieces[padId].Piece;
+                _vocab[pad] = padId;
+                _vocabReverse[padId] = (pad, 0f, ModelProto.Types.SentencePiece.Types.Type.Control);
+            }
+        }
+
+        private static int GetPieceCount(IReadOnlyList<(string Piece, float Score)>? pieces)
+            => pieces?.Count ?? 0;
+
+        private static string GetPieceAtIndex(IReadOnlyList<(string Piece, float Score)>? pieces, int index)
+        {
+            if (pieces is null)
+            {
+                throw new ArgumentNullException("vocab");
+            }
+
+            if ((uint)index >= (uint)pieces.Count)
+            {
+                throw new ArgumentOutOfRangeException("unkId", "unkId must be a valid index in the vocabulary.");
+            }
+
+            return pieces[index].Piece;
+        }
+
+        // Finds a special token by name; falls back to defaultId if not found (returns -1 if defaultId is out of range).
+        private static int FindSpecialTokenId(IReadOnlyList<(string Piece, float Score)>? pieces, string tokenName, int defaultId)
+        {
+            if (pieces is null)
+            {
+                return defaultId;
+            }
+
+            for (int i = 0; i < pieces.Count; i++)
+            {
+                if (pieces[i].Piece == tokenName)
+                {
+                    return i;
+                }
+            }
+
+            return defaultId >= 0 && defaultId < pieces.Count ? defaultId : -1;
+        }
+
         public override IReadOnlyDictionary<string, int> Vocabulary => new ReadOnlyDictionary<string, int>(_vocab);
 
         public int MaxIdByteFallbackId { get; }
diff --git a/test/Microsoft.ML.Tokenizers.Tests/UnigramTests.cs b/test/Microsoft.ML.Tokenizers.Tests/UnigramTests.cs
index ca671ddebe..5e0d6765b2 100644
--- a/test/Microsoft.ML.Tokenizers.Tests/UnigramTests.cs
+++ b/test/Microsoft.ML.Tokenizers.Tests/UnigramTests.cs
@@ -562,5 +562,108 @@ public void SpecialTokensTest()
             Assert.Equal("</s>", _unigramTokenizer.EndOfSentenceToken);
             Assert.Equal(2, _unigramTokenizer.EndOfSentenceId);
         }
+
+        [Fact]
+        public void CreateFromVocabTest()
+        {
+            // Build a minimal synthetic Unigram vocab: <unk>=0, <s>=1, </s>=2, then normal tokens
+            var vocab = new List<(string Piece, float Score)>
+            {
+                ("<unk>", 0f),
+                ("<s>",   0f),
+                ("</s>",  0f),
+                ("▁Hello", -1f),
+                (",",      -2f),
+                ("▁world", -3f),
+                ("!",      -4f),
+            };
+
+            SentencePieceTokenizer tokenizer = SentencePieceTokenizer.Create(
+                vocab, unkId: 0, addBeginningOfSentence: false, addEndOfSentence: false);
+
+            Assert.Equal("<unk>", tokenizer.UnknownToken);
+            Assert.Equal(0, tokenizer.UnknownId);
+            Assert.Equal("<s>", tokenizer.BeginningOfSentenceToken);
+            Assert.Equal(1, tokenizer.BeginningOfSentenceId);
+            Assert.Equal("</s>", tokenizer.EndOfSentenceToken);
+            Assert.Equal(2, tokenizer.EndOfSentenceId);
+
+            IReadOnlyList<int> ids = tokenizer.EncodeToIds("Hello, world!", addBeginningOfSentence: false, addEndOfSentence: false);
+            Assert.Equal(new[] { 3, 4, 5, 6 }, ids);
+
+            string decoded = tokenizer.Decode(ids, considerSpecialTokens: false);
+            Assert.Equal("Hello, world!", decoded);
+        }
+
+        [Fact]
+        public void CreateFromVocabNullTest()
+        {
+            Assert.Throws<ArgumentNullException>(() =>
+                SentencePieceTokenizer.Create((IEnumerable<(string Piece, float Score)>)null!, unkId: 0));
+        }
+
+        [Fact]
+        public void CreateFromVocabInvalidUnkIdTest()
+        {
+            var vocab = new List<(string Piece, float Score)> { ("a", 0f) };
+            Assert.Throws<ArgumentOutOfRangeException>(() =>
+                SentencePieceTokenizer.Create(vocab, unkId: 5));
+        }
+
+        [Fact]
+        public void CreateFromTokenizerJsonTest()
+        {
+            using Stream jsonStream = File.OpenRead(Path.Combine("Paraphrase-multilingual-MiniLM-L12-v2", "tokenizer.json"));
+            SentencePieceTokenizer jsonTokenizer = SentencePieceTokenizer.CreateFromTokenizerJson(
+                jsonStream, addBeginningOfSentence: false, addEndOfSentence: false);
+
+            // The tokenizer.json vocab has <s>=0, <pad>=1, </s>=2, <unk>=3, then normal tokens
+            // (shifted +1 relative to .model which has <unk>=0, <s>=1, </s>=2)
+            Assert.Equal("<unk>", jsonTokenizer.UnknownToken);
+            Assert.Equal(3, jsonTokenizer.UnknownId);
+            Assert.Equal("<s>", jsonTokenizer.BeginningOfSentenceToken);
+            Assert.Equal(0, jsonTokenizer.BeginningOfSentenceId);
+            Assert.Equal("</s>", jsonTokenizer.EndOfSentenceToken);
+            Assert.Equal(2, jsonTokenizer.EndOfSentenceId);
+
+            // Pieces produced should match the .model tokenizer; IDs are shifted by +1
+            IReadOnlyList<EncodedToken> jsonTokens = jsonTokenizer.EncodeToTokens("Hello, world!", out _, addBeginningOfSentence: false, addEndOfSentence: false);
+            IReadOnlyList<EncodedToken> modelTokens = _unigramTokenizer.EncodeToTokens("Hello, world!", out _, addBeginningOfSentence: false, addEndOfSentence: false);
+
+            Assert.Equal(modelTokens.Count, jsonTokens.Count);
+            for (int i = 0; i < modelTokens.Count; i++)
+            {
+                Assert.Equal(modelTokens[i].Value, jsonTokens[i].Value);
+                // JSON IDs are offset by 1 from the .model IDs for normal tokens
+                Assert.Equal(modelTokens[i].Id + 1, jsonTokens[i].Id);
+            }
+        }
+
+        [Fact]
+        public void CreateFromTokenizerJsonNullStreamTest()
+        {
+            Assert.Throws<ArgumentNullException>(() =>
+                SentencePieceTokenizer.CreateFromTokenizerJson(null!));
+        }
+
+        [Fact]
+        public void CreateFromTokenizerJsonNormalizationTest()
+        {
+            // Verify that the JSON tokenizer applies the precompiled charsmap normalization
+            // (same normalization as the .model tokenizer)
+            using Stream jsonStream = File.OpenRead(Path.Combine("Paraphrase-multilingual-MiniLM-L12-v2", "tokenizer.json"));
+            SentencePieceTokenizer jsonTokenizer = SentencePieceTokenizer.CreateFromTokenizerJson(
+                jsonStream, addBeginningOfSentence: false, addEndOfSentence: false);
+
+            // "㍻" normalizes to "平成" via the precompiled charsmap (NFKC normalization)
+            IReadOnlyList<int> jsonIds = jsonTokenizer.EncodeToIds("㍻", addBeginningOfSentence: false, addEndOfSentence: false);
+            IReadOnlyList<int> modelIds = _unigramTokenizer.EncodeToIds("㍻", addBeginningOfSentence: false, addEndOfSentence: false);
+
+            Assert.Equal(modelIds.Count, jsonIds.Count);
+            for (int i = 0; i < modelIds.Count; i++)
+            {
+                Assert.Equal(modelIds[i] + 1, jsonIds[i]);
+            }
+        }
     }
 }

From 112ee2b87f5efde8a09fe487bca1c862e46b4ba2 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 11 Jun 2026 00:50:01 +0000
Subject: [PATCH 3/5] Fix BOS/EOS positional fallback, normalizer type
 validation, and prepend_scheme handling

Co-authored-by: ericstj <8918108+ericstj@users.noreply.github.com>
---
 .../Model/SentencePieceTokenizer.cs           | 30 +++++++++++++++----
 .../Model/SentencePieceUnigramModel.cs        | 16 +++++-----
 2 files changed, 32 insertions(+), 14 deletions(-)

diff --git a/src/Microsoft.ML.Tokenizers/Model/SentencePieceTokenizer.cs b/src/Microsoft.ML.Tokenizers/Model/SentencePieceTokenizer.cs
index 2176fba0fa..e570f75378 100644
--- a/src/Microsoft.ML.Tokenizers/Model/SentencePieceTokenizer.cs
+++ b/src/Microsoft.ML.Tokenizers/Model/SentencePieceTokenizer.cs
@@ -640,22 +640,36 @@ public static SentencePieceTokenizer CreateFromTokenizerJson(
                         return Convert.FromBase64String(base64);
                     }
                 }
+                return null;
             }
             else if (string.Equals(type, "Sequence", StringComparison.OrdinalIgnoreCase) &&
                      normalizer.TryGetProperty("normalizers", out JsonElement normalizersEl) &&
                      normalizersEl.ValueKind == JsonValueKind.Array)
             {
+                byte[]? result = null;
                 foreach (JsonElement inner in normalizersEl.EnumerateArray())
                 {
-                    byte[]? result = ExtractPrecompiledCharsMap(inner);
-                    if (result is not null)
+                    if (!inner.TryGetProperty("type", out JsonElement innerTypeEl))
+                    {
+                        continue;
+                    }
+
+                    string? innerType = innerTypeEl.GetString();
+                    if (string.Equals(innerType, "Precompiled", StringComparison.OrdinalIgnoreCase))
+                    {
+                        result = ExtractPrecompiledCharsMap(inner);
+                    }
+                    else
                     {
-                        return result;
+                        throw new NotSupportedException($"Normalizer type '{innerType}' in Sequence is not supported. Only 'Precompiled' normalizer is supported.");
                     }
                 }
+                return result;
+            }
+            else
+            {
+                throw new NotSupportedException($"Normalizer type '{type}' is not supported. Only 'Precompiled' and 'Sequence' normalizers are supported.");
             }
-
-            return null;
         }
 
         private static void ExtractMetaspaceSettings(JsonElement preTokenizer, ref bool addDummyPrefix, ref bool escapeWhiteSpaces, ref bool treatWhitespaceAsSuffix)
@@ -682,7 +696,11 @@ private static void ExtractMetaspaceSettings(JsonElement preTokenizer, ref bool
                 if (preTokenizer.TryGetProperty("prepend_scheme", out JsonElement prependSchemeEl))
                 {
                     string? scheme = prependSchemeEl.GetString();
-                    treatWhitespaceAsSuffix = string.Equals(scheme, "last", StringComparison.OrdinalIgnoreCase);
+                    // "never" suppresses the dummy prefix; "always"/"first" keep the default (true)
+                    if (string.Equals(scheme, "never", StringComparison.OrdinalIgnoreCase))
+                    {
+                        addDummyPrefix = false;
+                    }
                 }
             }
             else if (string.Equals(type, "Sequence", StringComparison.OrdinalIgnoreCase) &&
diff --git a/src/Microsoft.ML.Tokenizers/Model/SentencePieceUnigramModel.cs b/src/Microsoft.ML.Tokenizers/Model/SentencePieceUnigramModel.cs
index adc9f09f13..4e865504be 100644
--- a/src/Microsoft.ML.Tokenizers/Model/SentencePieceUnigramModel.cs
+++ b/src/Microsoft.ML.Tokenizers/Model/SentencePieceUnigramModel.cs
@@ -95,7 +95,7 @@ public SentencePieceUnigramModel(ModelProto modelProto, bool addBos, bool addEos
 
         // Constructor that builds a Unigram model directly from a list of (piece, score) pairs.
         // BOS, EOS, and PAD tokens are identified by their names ("<s>", "</s>", "<pad>") in the vocab;
-        // if not found by name, SentencePiece-conventional positions (1, 2, none) are used as fallbacks.
+        // if not found by name, they are treated as absent (id = -1) to avoid misidentifying real pieces.
         internal SentencePieceUnigramModel(
             IReadOnlyList<(string Piece, float Score)> pieces,
             int unkId,
@@ -109,9 +109,9 @@ internal SentencePieceUnigramModel(
             IReadOnlyDictionary<string, int>? specialTokens)
             : this(pieces, unkId, addBos, addEos, precompiledCharsmap, addDummyPrefix, escapeWhiteSpaces,
                    treatWhitespaceAsSuffix, removeExtraWhitespaces, specialTokens,
-                   FindSpecialTokenId(pieces, "<s>", 1),
-                   FindSpecialTokenId(pieces, "</s>", 2),
-                   FindSpecialTokenId(pieces, "<pad>", -1))
+                   FindSpecialTokenId(pieces, "<s>"),
+                   FindSpecialTokenId(pieces, "</s>"),
+                   FindSpecialTokenId(pieces, "<pad>"))
         {
         }
 
@@ -212,12 +212,12 @@ private static string GetPieceAtIndex(IReadOnlyList<(string Piece, float Score)>
             return pieces[index].Piece;
         }
 
-        // Finds a special token by name; falls back to defaultId if not found (returns -1 if defaultId is out of range).
-        private static int FindSpecialTokenId(IReadOnlyList<(string Piece, float Score)>? pieces, string tokenName, int defaultId)
+        // Finds a special token by name; returns -1 if not found.
+        private static int FindSpecialTokenId(IReadOnlyList<(string Piece, float Score)>? pieces, string tokenName)
         {
             if (pieces is null)
             {
-                return defaultId;
+                return -1;
             }
 
             for (int i = 0; i < pieces.Count; i++)
@@ -228,7 +228,7 @@ private static int FindSpecialTokenId(IReadOnlyList<(string Piece, float Score)>
                 }
             }
 
-            return defaultId >= 0 && defaultId < pieces.Count ? defaultId : -1;
+            return -1;
         }
 
         public override IReadOnlyDictionary<string, int> Vocabulary => new ReadOnlyDictionary<string, int>(_vocab);

From 7e32c05ec38457e08554ffccc193627bcaa40fb5 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 11 Jun 2026 01:22:50 +0000
Subject: [PATCH 4/5] Fix null normalizer guard, recursive Sequence support,
 BOS/EOS validation, and add tests

Co-authored-by: ericstj <8918108+ericstj@users.noreply.github.com>
---
 .../Model/SentencePieceTokenizer.cs           | 15 ++--
 .../Model/SentencePieceUnigramModel.cs        | 32 ++++++-
 .../UnigramTests.cs                           | 84 +++++++++++++++++++
 3 files changed, 119 insertions(+), 12 deletions(-)

diff --git a/src/Microsoft.ML.Tokenizers/Model/SentencePieceTokenizer.cs b/src/Microsoft.ML.Tokenizers/Model/SentencePieceTokenizer.cs
index e570f75378..9768826425 100644
--- a/src/Microsoft.ML.Tokenizers/Model/SentencePieceTokenizer.cs
+++ b/src/Microsoft.ML.Tokenizers/Model/SentencePieceTokenizer.cs
@@ -601,7 +601,8 @@ public static SentencePieceTokenizer CreateFromTokenizerJson(
             byte[]? precompiledCharsMap = null;
             bool addDummyPrefix = true;
             bool removeExtraWhitespaces = true;
-            if (root.TryGetProperty("normalizer", out JsonElement normalizerElement))
+            if (root.TryGetProperty("normalizer", out JsonElement normalizerElement) &&
+                normalizerElement.ValueKind == JsonValueKind.Object)
             {
                 precompiledCharsMap = ExtractPrecompiledCharsMap(normalizerElement);
             }
@@ -649,19 +650,15 @@ public static SentencePieceTokenizer CreateFromTokenizerJson(
                 byte[]? result = null;
                 foreach (JsonElement inner in normalizersEl.EnumerateArray())
                 {
-                    if (!inner.TryGetProperty("type", out JsonElement innerTypeEl))
+                    if (inner.ValueKind != JsonValueKind.Object)
                     {
                         continue;
                     }
 
-                    string? innerType = innerTypeEl.GetString();
-                    if (string.Equals(innerType, "Precompiled", StringComparison.OrdinalIgnoreCase))
+                    byte[]? innerResult = ExtractPrecompiledCharsMap(inner);
+                    if (innerResult is not null)
                     {
-                        result = ExtractPrecompiledCharsMap(inner);
-                    }
-                    else
-                    {
-                        throw new NotSupportedException($"Normalizer type '{innerType}' in Sequence is not supported. Only 'Precompiled' normalizer is supported.");
+                        result = innerResult;
                     }
                 }
                 return result;
diff --git a/src/Microsoft.ML.Tokenizers/Model/SentencePieceUnigramModel.cs b/src/Microsoft.ML.Tokenizers/Model/SentencePieceUnigramModel.cs
index 4e865504be..c876327009 100644
--- a/src/Microsoft.ML.Tokenizers/Model/SentencePieceUnigramModel.cs
+++ b/src/Microsoft.ML.Tokenizers/Model/SentencePieceUnigramModel.cs
@@ -109,7 +109,7 @@ internal SentencePieceUnigramModel(
             IReadOnlyDictionary<string, int>? specialTokens)
             : this(pieces, unkId, addBos, addEos, precompiledCharsmap, addDummyPrefix, escapeWhiteSpaces,
                    treatWhitespaceAsSuffix, removeExtraWhitespaces, specialTokens,
-                   FindSpecialTokenId(pieces, "<s>"),
+                   FindSpecialTokenId(ValidateVocab(pieces, unkId), "<s>"),
                    FindSpecialTokenId(pieces, "</s>"),
                    FindSpecialTokenId(pieces, "<pad>"))
         {
@@ -128,8 +128,8 @@ private SentencePieceUnigramModel(
             IReadOnlyDictionary<string, int>? specialTokens,
             int bosId, int eosId, int padId)
             : base(addBos, addEos,
-                   bosId >= 0 && bosId < GetPieceCount(pieces) ? pieces[bosId].Piece : "<s>", bosId,
-                   eosId >= 0 && eosId < GetPieceCount(pieces) ? pieces[eosId].Piece : "</s>", eosId,
+                   bosId >= 0 && bosId < GetPieceCount(pieces) ? pieces[bosId].Piece : "<s>", CheckSpecialId(addBos, bosId, "addBeginningOfSentence"),
+                   eosId >= 0 && eosId < GetPieceCount(pieces) ? pieces[eosId].Piece : "</s>", CheckSpecialId(addEos, eosId, "addEndOfSentence"),
                    GetPieceAtIndex(pieces, unkId), unkId,
                    addDummyPrefix, escapeWhiteSpaces, treatWhitespaceAsSuffix, byteFallback: false,
                    precompiledCharsmap, removeExtraWhitespaces, specialTokens)
@@ -212,6 +212,23 @@ private static string GetPieceAtIndex(IReadOnlyList<(string Piece, float Score)>
             return pieces[index].Piece;
         }
 
+        // Throws if pieces is null or unkId is not a valid index into pieces; otherwise returns pieces unchanged so the check can run inline in a constructor initializer.
+        private static IReadOnlyList<(string Piece, float Score)> ValidateVocab(
+            IReadOnlyList<(string Piece, float Score)>? pieces, int unkId)
+        {
+            if (pieces is null)
+            {
+                throw new ArgumentNullException("vocab");
+            }
+
+            if ((uint)unkId >= (uint)pieces.Count)
+            {
+                throw new ArgumentOutOfRangeException("unkId", "unkId must be a valid index in the vocabulary.");
+            }
+
+            return pieces;
+        }
+
         // Finds a special token by name; returns -1 if not found.
         private static int FindSpecialTokenId(IReadOnlyList<(string Piece, float Score)>? pieces, string tokenName)
         {
@@ -231,6 +248,15 @@ private static int FindSpecialTokenId(IReadOnlyList<(string Piece, float Score)>
             return -1;
         }
 
+        private static int CheckSpecialId(bool required, int id, string paramName)
+        {
+            if (required && id < 0)
+            {
+                throw new ArgumentException($"The vocabulary does not contain the required special token.", paramName);
+            }
+            return id;
+        }
+
         public override IReadOnlyDictionary<string, int> Vocabulary => new ReadOnlyDictionary<string, int>(_vocab);
 
         public int MaxIdByteFallbackId { get; }
diff --git a/test/Microsoft.ML.Tokenizers.Tests/UnigramTests.cs b/test/Microsoft.ML.Tokenizers.Tests/UnigramTests.cs
index 5e0d6765b2..2d7a63373a 100644
--- a/test/Microsoft.ML.Tokenizers.Tests/UnigramTests.cs
+++ b/test/Microsoft.ML.Tokenizers.Tests/UnigramTests.cs
@@ -665,5 +665,89 @@ public void CreateFromTokenizerJsonNormalizationTest()
                 Assert.Equal(modelIds[i] + 1, jsonIds[i]);
             }
         }
+
+        [Fact]
+        public void CreateFromVocabNoSpecialTokensTest()
+        {
+            // Vocab without <s>/<pad>/</s> — resembles bge-m3/potion layout.
+            // Verify that real pieces (e.g. ",") are not marked Control and remain encodable.
+            var vocab = new List<(string Piece, float Score)>
+            {
+                ("[PAD]", 0f),   // 0
+                ("[UNK]", 0f),   // 1
+                (",",     -1f),  // 2
+                ("▁Hello", -2f), // 3
+                ("▁world", -3f), // 4
+                ("!",     -4f),  // 5
+            };
+
+            SentencePieceTokenizer tokenizer = SentencePieceTokenizer.Create(
+                vocab, unkId: 1, addBeginningOfSentence: false, addEndOfSentence: false);
+
+            // "," must be in the vocabulary and encodable (not silently dropped as Control)
+            IReadOnlyList<int> ids = tokenizer.EncodeToIds("Hello, world!", addBeginningOfSentence: false, addEndOfSentence: false);
+            Assert.Contains(2, ids); // id 2 is ","
+        }
+
+        [Fact]
+        public void CreateFromVocabBosRequiredButAbsentTest()
+        {
+            // Vocab without <s>: addBeginningOfSentence:true should throw rather than emit index 0.
+            var vocab = new List<(string Piece, float Score)>
+            {
+                ("[UNK]", 0f),
+                ("▁Hello", -1f),
+            };
+
+            Assert.Throws<ArgumentException>(() =>
+                SentencePieceTokenizer.Create(vocab, unkId: 0, addBeginningOfSentence: true));
+        }
+
+        [Fact]
+        public void CreateFromTokenizerJsonUnsupportedNormalizerTest()
+        {
+            // A Sequence normalizer containing a non-Precompiled step should throw NotSupportedException.
+            string json = """
+                {
+                  "model": {
+                    "type": "Unigram",
+                    "unk_id": 0,
+                    "vocab": [["<unk>", 0.0], ["a", -1.0]]
+                  },
+                  "normalizer": {
+                    "type": "Sequence",
+                    "normalizers": [
+                      { "type": "Precompiled", "precompiled_charsmap": "" },
+                      { "type": "Replace", "pattern": " ", "content": "_" }
+                    ]
+                  }
+                }
+                """;
+
+            using Stream stream = new System.IO.MemoryStream(System.Text.Encoding.UTF8.GetBytes(json));
+            Assert.Throws<NotSupportedException>(() =>
+                SentencePieceTokenizer.CreateFromTokenizerJson(stream, addBeginningOfSentence: false));
+        }
+
+        [Fact]
+        public void CreateFromTokenizerJsonNullNormalizerTest()
+        {
+            // A null normalizer value in JSON should not throw.
+            string json = """
+                {
+                  "model": {
+                    "type": "Unigram",
+                    "unk_id": 0,
+                    "vocab": [["<unk>", 0.0], ["a", -1.0]]
+                  },
+                  "normalizer": null
+                }
+                """;
+
+            using Stream stream = new System.IO.MemoryStream(System.Text.Encoding.UTF8.GetBytes(json));
+            SentencePieceTokenizer tokenizer = SentencePieceTokenizer.CreateFromTokenizerJson(
+                stream, addBeginningOfSentence: false);
+            Assert.NotNull(tokenizer);
+        }
     }
 }

From 057cd4648e1d766dc5aaa71555e0aab2fb9aefb1 Mon Sep 17 00:00:00 2001
From: Eric StJohn <ericstj@microsoft.com>
Date: Fri, 12 Jun 2026 12:06:01 -0700
Subject: [PATCH 5/5] Generalize SentencePiece special tokens to prefix/suffix
 lists with post-processor support

Support Hugging Face tokenizer.json post-processor templates that wrap a sequence with
multiple special tokens (XLNet/mBART/NLLB styles) by generalizing the single BOS/EOS
handling into ordered prefix/suffix special-token lists, applied consistently across the
encode/count/index paths.

CreateFromTokenizerJson now:
- parses post_processor (TemplateProcessing, RobertaProcessing, BertProcessing, Sequence)
  to resolve the prefix/suffix wrapping, resolving ids via special_tokens -> added_tokens -> vocab.
- reads model.byte_fallback and exposes byteFallback on Create.
- treats added_tokens (special) as the authoritative special-token source.
- tolerates unmodeled sibling steps inside a Sequence normalizer.
- deduces remove_extra_whitespaces from the normalizer Strip/Replace(" {2,}") steps instead
  of hardcoding it, matching the Hugging Face fast-tokenizer runtime.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../Model/SentencePieceBaseModel.cs           |   6 +-
 .../Model/SentencePieceTokenizer.cs           | 372 +++++++++++++++++-
 .../Model/SentencePieceUnigramModel.cs        | 207 +++++++---
 .../UnigramTests.cs                           | 318 ++++++++++++++-
 4 files changed, 839 insertions(+), 64 deletions(-)

diff --git a/src/Microsoft.ML.Tokenizers/Model/SentencePieceBaseModel.cs b/src/Microsoft.ML.Tokenizers/Model/SentencePieceBaseModel.cs
index 885f541c5b..8786eeb9ac 100644
--- a/src/Microsoft.ML.Tokenizers/Model/SentencePieceBaseModel.cs
+++ b/src/Microsoft.ML.Tokenizers/Model/SentencePieceBaseModel.cs
@@ -72,11 +72,11 @@ internal SentencePieceBaseModel(
             AddBeginningOfSentence = addBos;
             AddEndOfSentence = addEos;
             BeginningOfSentenceToken = bosToken;
-            BeginningOfSentenceId = Math.Max(0, bosId);
+            BeginningOfSentenceId = bosId;
             EndOfSentenceToken = eosToken;
-            EndOfSentenceId = Math.Max(0, eosId);
+            EndOfSentenceId = eosId;
             UnknownToken = unkToken;
-            UnknownId = Math.Max(0, unkId);
+            UnknownId = unkId;
             AddDummyPrefix = addDummyPrefix;
             EscapeWhiteSpaces = escapeWhiteSpaces;
             TreatWhitespaceAsSuffix = treatWhitespaceAsSuffix;
diff --git a/src/Microsoft.ML.Tokenizers/Model/SentencePieceTokenizer.cs b/src/Microsoft.ML.Tokenizers/Model/SentencePieceTokenizer.cs
index 9768826425..8ef032682d 100644
--- a/src/Microsoft.ML.Tokenizers/Model/SentencePieceTokenizer.cs
+++ b/src/Microsoft.ML.Tokenizers/Model/SentencePieceTokenizer.cs
@@ -482,13 +482,15 @@ public static SentencePieceTokenizer Create(
         /// <param name="addDummyPrefix">Whether to prepend the dummy whitespace prefix character (U+2581) at the start of the input.</param>
         /// <param name="escapeWhiteSpaces">Whether to replace spaces with the dummy whitespace character (U+2581) during normalization.</param>
         /// <param name="treatWhitespaceAsSuffix">Whether to emit the U+2581 character at the end of the last token rather than the beginning of the first token.</param>
+        /// <param name="byteFallback">Whether unknown characters are decomposed into UTF-8 byte pieces (<c>&lt;0x00&gt;</c>..<c>&lt;0xFF&gt;</c>) instead of the unknown token.</param>
         /// <param name="specialTokens">Additional special tokens to recognize, supplied as a mapping of token string to token ID.</param>
         /// <returns>A new <see cref="SentencePieceTokenizer"/> instance.</returns>
         /// <remarks>
         /// The beginning-of-sentence and end-of-sentence token IDs are auto-detected by looking for pieces
-        /// named <c>&lt;s&gt;</c> and <c>&lt;/s&gt;</c> in <paramref name="vocab"/>. If not found, positions 1 and 2
-        /// are used as fallbacks (the SentencePiece convention). Similarly, a <c>&lt;pad&gt;</c> piece is
-        /// detected automatically if present.
+        /// named <c>&lt;s&gt;</c> and <c>&lt;/s&gt;</c> in <paramref name="vocab"/>. If a piece is not found it is
+        /// treated as absent; requesting <paramref name="addBeginningOfSentence"/> or <paramref name="addEndOfSentence"/>
+        /// when the corresponding piece is absent throws an <see cref="ArgumentException"/>. A <c>&lt;pad&gt;</c> piece
+        /// is likewise detected automatically when present.
         /// <para>
         /// When creating the tokenizer, ensure that the vocabulary is sourced from a trusted provider.
         /// </para>
@@ -502,6 +504,7 @@ public static SentencePieceTokenizer Create(
             bool addDummyPrefix = true,
             bool escapeWhiteSpaces = true,
             bool treatWhitespaceAsSuffix = false,
+            bool byteFallback = false,
             IReadOnlyDictionary<string, int>? specialTokens = null)
         {
             if (vocab is null)
@@ -515,7 +518,7 @@ public static SentencePieceTokenizer Create(
             SentencePieceUnigramModel model = new SentencePieceUnigramModel(
                 pieces, unkId, addBeginningOfSentence, addEndOfSentence,
                 precompiledCharsMap, addDummyPrefix, escapeWhiteSpaces,
-                treatWhitespaceAsSuffix, removeExtraWhitespaces: true, specialTokens);
+                treatWhitespaceAsSuffix, removeExtraWhitespaces: true, byteFallback, specialTokens);
 
             return new SentencePieceTokenizer(model);
         }
@@ -534,10 +537,18 @@ public static SentencePieceTokenizer Create(
         /// <list type="bullet">
         ///   <item><description><c>model.vocab</c> — array of <c>[piece, score]</c> pairs (required).</description></item>
         ///   <item><description><c>model.unk_id</c> — index of the unknown token (required).</description></item>
+        ///   <item><description><c>model.byte_fallback</c> — whether unknown characters fall back to UTF-8 byte pieces.</description></item>
+        ///   <item><description><c>added_tokens</c> — special tokens (those with <c>"special": true</c>) and their IDs.</description></item>
         ///   <item><description><c>normalizer.precompiled_charsmap</c> (base64) — normalization map; also searched inside a <c>Sequence</c> normalizer.</description></item>
         ///   <item><description><c>pre_tokenizer</c> of type <c>Metaspace</c> — <c>add_prefix_space</c> and <c>replacement</c>; also searched inside a <c>Sequence</c> pre-tokenizer.</description></item>
+        ///   <item><description><c>post_processor</c> (<c>TemplateProcessing</c>, <c>RobertaProcessing</c>, <c>BertProcessing</c>, or a <c>Sequence</c> of these) — the special tokens that wrap a single sequence, gated by <paramref name="addBeginningOfSentence"/> (prefix) and <paramref name="addEndOfSentence"/> (suffix).</description></item>
         /// </list>
         /// <para>
+        /// <c>remove_extra_whitespaces</c> has no direct flag in <c>tokenizer.json</c>; it is deduced from the normalizer's whitespace-collapsing steps and
+        /// is <see langword="false"/> when they are absent. Pair-sequence templates and per-token <c>type_id</c>s are not applied. Templates that
+        /// place a special token in the middle of the sequence are rejected with <see cref="NotSupportedException"/>.
+        /// </para>
+        /// <para>
         /// When creating the tokenizer, ensure that the JSON stream is sourced from a trusted provider.
         /// </para>
         /// </remarks>
@@ -561,8 +572,17 @@ public static SentencePieceTokenizer CreateFromTokenizerJson(
                 throw new InvalidDataException("The tokenizer.json does not contain a 'model' property.");
             }
 
-            if (modelElement.TryGetProperty("type", out JsonElement modelTypeElement) &&
-                !string.Equals(modelTypeElement.GetString(), "Unigram", StringComparison.OrdinalIgnoreCase))
+            if (modelElement.ValueKind != JsonValueKind.Object)
+            {
+                throw new InvalidDataException("The tokenizer.json 'model' property must be a JSON object.");
+            }
+
+            if (!modelElement.TryGetProperty("type", out JsonElement modelTypeElement))
+            {
+                throw new InvalidDataException("The tokenizer.json model does not contain a 'type' property; this factory only supports 'Unigram' models.");
+            }
+
+            if (!string.Equals(modelTypeElement.GetString(), "Unigram", StringComparison.OrdinalIgnoreCase))
             {
                 throw new InvalidDataException($"Expected model type 'Unigram' but found '{modelTypeElement.GetString()}'.");
             }
@@ -574,6 +594,9 @@ public static SentencePieceTokenizer CreateFromTokenizerJson(
 
             int unkId = unkIdElement.GetInt32();
 
+            bool byteFallback = modelElement.TryGetProperty("byte_fallback", out JsonElement byteFallbackElement) &&
+                                byteFallbackElement.ValueKind == JsonValueKind.True;
+
             if (!modelElement.TryGetProperty("vocab", out JsonElement vocabElement) ||
                 vocabElement.ValueKind != JsonValueKind.Array)
             {
@@ -600,29 +623,286 @@ public static SentencePieceTokenizer CreateFromTokenizerJson(
             // Extract normalizer settings
             byte[]? precompiledCharsMap = null;
             bool addDummyPrefix = true;
-            bool removeExtraWhitespaces = true;
+            // HF tokenizer.json has no remove_extra_whitespaces flag; SpmConverter encodes that behavior as
+            // explicit normalizer steps (a right-Strip plus a Replace collapsing runs of spaces). Deduce it from
+            // those steps, defaulting to false when absent to match the HF fast-tokenizer runtime.
+            bool removeExtraWhitespaces = false;
             if (root.TryGetProperty("normalizer", out JsonElement normalizerElement) &&
                 normalizerElement.ValueKind == JsonValueKind.Object)
             {
                 precompiledCharsMap = ExtractPrecompiledCharsMap(normalizerElement);
+                removeExtraWhitespaces = NormalizerCollapsesWhitespace(normalizerElement);
             }
 
             // Extract pre_tokenizer settings
             bool escapeWhiteSpaces = true;
             bool treatWhitespaceAsSuffix = false;
-            if (root.TryGetProperty("pre_tokenizer", out JsonElement preTokenizerElement))
+            if (root.TryGetProperty("pre_tokenizer", out JsonElement preTokenizerElement) &&
+                preTokenizerElement.ValueKind == JsonValueKind.Object)
             {
                 ExtractMetaspaceSettings(preTokenizerElement, ref addDummyPrefix, ref escapeWhiteSpaces, ref treatWhitespaceAsSuffix);
             }
 
+            // Merge the special tokens declared in added_tokens (authoritative source for their IDs) with any
+            // caller-supplied special tokens; the caller's entries win on conflict.
+            Dictionary<string, int> mergedSpecialTokens = ParseAddedTokens(root);
+            if (specialTokens is not null)
+            {
+                foreach (var kvp in specialTokens)
+                {
+                    mergedSpecialTokens[kvp.Key] = kvp.Value;
+                }
+            }
+
+            // Resolve the prefix/suffix special-token wrapping from the post_processor (if present), falling back
+            // to the SentencePiece-conventional <s>/</s> names otherwise.
+            ResolvePostProcessorAffixes(root, vocab, mergedSpecialTokens,
+                out List<(int Id, string Token)> prefixTokens, out List<(int Id, string Token)> suffixTokens);
+
+            // Ensure every wrapping token is registered as a special token so it is classified Control and round-trips on decode; this overwrites any same-named entry from the merge above.
+            foreach (var (id, token) in prefixTokens)
+            {
+                mergedSpecialTokens[token] = id;
+            }
+            foreach (var (id, token) in suffixTokens)
+            {
+                mergedSpecialTokens[token] = id;
+            }
+
+            int padId = mergedSpecialTokens.TryGetValue("<pad>", out int p) ? p : FindPieceId(vocab, "<pad>");
+
             SentencePieceUnigramModel model = new SentencePieceUnigramModel(
                 vocab, unkId, addBeginningOfSentence, addEndOfSentence,
                 precompiledCharsMap is not null ? precompiledCharsMap.AsSpan() : default,
-                addDummyPrefix, escapeWhiteSpaces, treatWhitespaceAsSuffix, removeExtraWhitespaces, specialTokens);
+                addDummyPrefix, escapeWhiteSpaces, treatWhitespaceAsSuffix, removeExtraWhitespaces, byteFallback,
+                mergedSpecialTokens.Count > 0 ? mergedSpecialTokens : null,
+                prefixTokens, suffixTokens, padId);
 
             return new SentencePieceTokenizer(model);
         }
 
+        // Reads the special tokens (those marked "special": true) from the top-level added_tokens array.
+        private static Dictionary<string, int> ParseAddedTokens(JsonElement root)
+        {
+            Dictionary<string, int> result = new Dictionary<string, int>();
+            if (!root.TryGetProperty("added_tokens", out JsonElement addedTokens) || addedTokens.ValueKind != JsonValueKind.Array)
+            {
+                return result;
+            }
+
+            foreach (JsonElement entry in addedTokens.EnumerateArray())
+            {
+                if (entry.ValueKind != JsonValueKind.Object)
+                {
+                    continue;
+                }
+
+                if (!entry.TryGetProperty("special", out JsonElement specialElement) || specialElement.ValueKind != JsonValueKind.True)
+                {
+                    continue;
+                }
+
+                if (entry.TryGetProperty("content", out JsonElement contentElement) &&
+                    entry.TryGetProperty("id", out JsonElement idElement) &&
+                    contentElement.GetString() is string content)
+                {
+                    result[content] = idElement.GetInt32();
+                }
+            }
+
+            return result;
+        }
+
+        // Resolves the ordered prefix/suffix special tokens that wrap an encoded sequence, from the post_processor.
+        private static void ResolvePostProcessorAffixes(
+            JsonElement root,
+            IReadOnlyList<(string Piece, float Score)> vocab,
+            IReadOnlyDictionary<string, int> specialTokens,
+            out List<(int Id, string Token)> prefixTokens,
+            out List<(int Id, string Token)> suffixTokens)
+        {
+            prefixTokens = new List<(int Id, string Token)>();
+            suffixTokens = new List<(int Id, string Token)>();
+
+            if (root.TryGetProperty("post_processor", out JsonElement postProcessor) &&
+                postProcessor.ValueKind == JsonValueKind.Object)
+            {
+                ProcessPostProcessor(postProcessor, vocab, specialTokens, prefixTokens, suffixTokens);
+                return;
+            }
+
+            // No post_processor: fall back to the SentencePiece-conventional names.
+            AddAffixToken(prefixTokens, "<s>", vocab, specialTokens, required: false);
+            AddAffixToken(suffixTokens, "</s>", vocab, specialTokens, required: false);
+        }
+
+        private static void ProcessPostProcessor(
+            JsonElement postProcessor,
+            IReadOnlyList<(string Piece, float Score)> vocab,
+            IReadOnlyDictionary<string, int> specialTokens,
+            List<(int Id, string Token)> prefixTokens,
+            List<(int Id, string Token)> suffixTokens)
+        {
+            string? type = postProcessor.TryGetProperty("type", out JsonElement typeEl) ? typeEl.GetString() : null;
+
+            switch (type)
+            {
+                case "TemplateProcessing":
+                    ProcessTemplate(postProcessor, vocab, specialTokens, prefixTokens, suffixTokens);
+                    break;
+
+                case "RobertaProcessing":
+                    AddProcessorAffix(postProcessor, "cls", prefixTokens, vocab, specialTokens);
+                    AddProcessorAffix(postProcessor, "sep", suffixTokens, vocab, specialTokens);
+                    break;
+
+                case "BertProcessing":
+                    AddProcessorAffix(postProcessor, "cls", prefixTokens, vocab, specialTokens);
+                    AddProcessorAffix(postProcessor, "sep", suffixTokens, vocab, specialTokens);
+                    break;
+
+                case "Sequence":
+                    if (postProcessor.TryGetProperty("processors", out JsonElement processors) && processors.ValueKind == JsonValueKind.Array)
+                    {
+                        foreach (JsonElement inner in processors.EnumerateArray())
+                        {
+                            if (inner.ValueKind == JsonValueKind.Object)
+                            {
+                                ProcessPostProcessor(inner, vocab, specialTokens, prefixTokens, suffixTokens);
+                            }
+                        }
+                    }
+                    break;
+
+                default:
+                    // ByteLevel and other processors do not contribute special-token wrapping; ignore them.
+                    break;
+            }
+        }
+
+        // Splits a TemplateProcessing "single" template into the special tokens before (prefix) and after (suffix) its one Sequence placeholder.
+        private static void ProcessTemplate(
+            JsonElement postProcessor,
+            IReadOnlyList<(string Piece, float Score)> vocab,
+            IReadOnlyDictionary<string, int> specialTokens,
+            List<(int Id, string Token)> prefixTokens,
+            List<(int Id, string Token)> suffixTokens)
+        {
+            if (!postProcessor.TryGetProperty("single", out JsonElement single) || single.ValueKind != JsonValueKind.Array)
+            {
+                return;
+            }
+
+            // Optional per-template id table; ResolveTemplateTokenId consults it before specialTokens and vocab.
+            JsonElement? ppSpecialTokens = postProcessor.TryGetProperty("special_tokens", out JsonElement st) && st.ValueKind == JsonValueKind.Object ? st : (JsonElement?)null;
+
+            bool seenSequence = false;
+            foreach (JsonElement item in single.EnumerateArray())
+            {
+                if (item.ValueKind != JsonValueKind.Object)
+                {
+                    continue;
+                }
+
+                if (item.TryGetProperty("Sequence", out _))
+                {
+                    if (seenSequence)
+                    {
+                        throw new NotSupportedException("tokenizer.json post_processor templates with more than one sequence are not supported.");
+                    }
+
+                    seenSequence = true;
+                }
+                else if (item.TryGetProperty("SpecialToken", out JsonElement specialToken) && specialToken.ValueKind == JsonValueKind.Object &&
+                         specialToken.TryGetProperty("id", out JsonElement idEl) && idEl.ValueKind == JsonValueKind.String && idEl.GetString() is string tokenName)
+                {
+                    // Kinds are checked above so a malformed entry is skipped instead of surfacing InvalidOperationException.
+                    int id = ResolveTemplateTokenId(tokenName, ppSpecialTokens, specialTokens, vocab);
+                    (seenSequence ? suffixTokens : prefixTokens).Add((id, tokenName));
+                }
+            }
+
+            if (!seenSequence)
+            {
+                throw new NotSupportedException("tokenizer.json post_processor template does not contain a sequence placeholder.");
+            }
+        }
+
+        private static int ResolveTemplateTokenId(
+            string tokenName,
+            JsonElement? ppSpecialTokens,
+            IReadOnlyDictionary<string, int> specialTokens,
+            IReadOnlyList<(string Piece, float Score)> vocab)
+        {
+            // Lookup order: the template's "special_tokens" ids, then specialTokens, then the vocab. Kinds are checked
+            // before each accessor because JsonElement throws InvalidOperationException on a mismatched ValueKind.
+            if (ppSpecialTokens is JsonElement st && st.TryGetProperty(tokenName, out JsonElement entry) && entry.ValueKind == JsonValueKind.Object &&
+                entry.TryGetProperty("ids", out JsonElement ids) && ids.ValueKind == JsonValueKind.Array && ids.GetArrayLength() > 0 &&
+                ids[0].ValueKind == JsonValueKind.Number && ids[0].TryGetInt32(out int templateId))
+            {
+                return templateId;
+            }
+
+            if (specialTokens.TryGetValue(tokenName, out int specialId))
+            {
+                return specialId;
+            }
+
+            int vocabId = FindPieceId(vocab, tokenName);
+            if (vocabId < 0)
+            {
+                throw new InvalidDataException($"The tokenizer.json post_processor references special token '{tokenName}' that is not present in the vocabulary.");
+            }
+
+            return vocabId;
+        }
+
+        private static void AddProcessorAffix(
+            JsonElement postProcessor,
+            string property,
+            List<(int Id, string Token)> target,
+            IReadOnlyList<(string Piece, float Score)> vocab,
+            IReadOnlyDictionary<string, int> specialTokens)
+        {
+            // Roberta/Bert processors store cls/sep as [token, id] arrays; any other shape or element kind is skipped rather than thrown on.
+            if (postProcessor.TryGetProperty(property, out JsonElement el) && el.ValueKind == JsonValueKind.Array && el.GetArrayLength() >= 2 &&
+                el[0].ValueKind == JsonValueKind.String && el[0].GetString() is string token && el[1].ValueKind == JsonValueKind.Number && el[1].TryGetInt32(out int id))
+            {
+                target.Add((id, token));
+            }
+        }
+
+        private static void AddAffixToken(
+            List<(int Id, string Token)> target,
+            string tokenName,
+            IReadOnlyList<(string Piece, float Score)> vocab,
+            IReadOnlyDictionary<string, int> specialTokens,
+            bool required)
+        {
+            int id = specialTokens.TryGetValue(tokenName, out int mappedId) ? mappedId : FindPieceId(vocab, tokenName); // special-token map wins over a vocab match
+            if (id < 0 && required)
+            {
+                throw new InvalidDataException($"The tokenizer.json does not contain the required special token '{tokenName}'.");
+            }
+            else if (id >= 0)
+            {
+                target.Add((id, tokenName));
+            }
+        }
+
+        private static int FindPieceId(IReadOnlyList<(string Piece, float Score)> vocab, string token)
+        {
+            // Ordinal linear scan; the index of the first matching piece doubles as its id.
+            int index = 0;
+            while (index < vocab.Count && vocab[index].Piece != token)
+            {
+                index++;
+            }
+
+            // Running off the end means the token is not a vocabulary piece.
+            return index < vocab.Count ? index : -1;
+        }
+
         private static byte[]? ExtractPrecompiledCharsMap(JsonElement normalizer)
         {
             if (!normalizer.TryGetProperty("type", out JsonElement typeEl))
@@ -647,6 +927,8 @@ public static SentencePieceTokenizer CreateFromTokenizerJson(
                      normalizer.TryGetProperty("normalizers", out JsonElement normalizersEl) &&
                      normalizersEl.ValueKind == JsonValueKind.Array)
             {
+                // A Sequence may legitimately interleave the precompiled map with other steps (Nmt, Replace, ...).
+                // Extract the precompiled map and ignore the steps we don't model rather than failing the load.
                 byte[]? result = null;
                 foreach (JsonElement inner in normalizersEl.EnumerateArray())
                 {
@@ -663,9 +945,77 @@ public static SentencePieceTokenizer CreateFromTokenizerJson(
                 }
                 return result;
             }
-            else
+
+            // Other normalizer types (Nmt, Replace, Lowercase, ...) carry no precompiled map; treat as absent.
+            return null;
+        }
+
+        // Detects whether the normalizer collapses extra whitespace, i.e. SentencePiece's remove_extra_whitespaces.
+        // HF's SpmConverter emits this as a right-Strip plus a Replace of a runs-of-spaces Regex (" {2,}") -> "▁"; either step alone counts as a match.
+        private static bool NormalizerCollapsesWhitespace(JsonElement normalizer)
+        {
+            if (normalizer.ValueKind != JsonValueKind.Object || !normalizer.TryGetProperty("type", out JsonElement typeEl) || typeEl.ValueKind != JsonValueKind.String)
+            {
+                return false;
+            }
+
+            string? type = typeEl.GetString();
+
+            if (string.Equals(type, "Strip", StringComparison.OrdinalIgnoreCase))
+            {
+                // A right-Strip removes trailing whitespace; an absent strip_right is treated the same as true.
+                return !normalizer.TryGetProperty("strip_right", out JsonElement stripRight) || stripRight.ValueKind != JsonValueKind.False;
+            }
+
+            if (string.Equals(type, "Replace", StringComparison.OrdinalIgnoreCase))
+            {
+                return ReplaceCollapsesSpaces(normalizer);
+            }
+
+            if (string.Equals(type, "Sequence", StringComparison.OrdinalIgnoreCase) &&
+                normalizer.TryGetProperty("normalizers", out JsonElement normalizersEl) &&
+                normalizersEl.ValueKind == JsonValueKind.Array)
+            {
+                foreach (JsonElement inner in normalizersEl.EnumerateArray())
+                {
+                    if (NormalizerCollapsesWhitespace(inner))
+                    {
+                        return true;
+                    }
+                }
+            }
+
+            return false;
+        }
+
+        // True only for a Replace whose Regex matches runs of two-or-more spaces, not a single-space Metaspace Replace.
+        private static bool ReplaceCollapsesSpaces(JsonElement replace)
+        {
+            if (!replace.TryGetProperty("pattern", out JsonElement patternEl) ||
+                patternEl.ValueKind != JsonValueKind.Object ||
+                !patternEl.TryGetProperty("Regex", out JsonElement regexEl))
+            {
+                return false;
+            }
+
+            string? pattern = regexEl.GetString();
+            if (pattern is null)
+            {
+                return false;
+            }
+
+            // Do not trim: HF's canonical patterns " {2,}" and " +" carry a significant leading space.
+            switch (pattern)
             {
-                throw new NotSupportedException($"Normalizer type '{type}' is not supported. Only 'Precompiled' and 'Sequence' normalizers are supported.");
+                case " {2,}":
+                case " +":
+                case "[ ]+":
+                case "[ ]{2,}":
+                case "\\s+":
+                case "\\s{2,}":
+                    return true;
+                default:
+                    return false;
             }
         }
 
diff --git a/src/Microsoft.ML.Tokenizers/Model/SentencePieceUnigramModel.cs b/src/Microsoft.ML.Tokenizers/Model/SentencePieceUnigramModel.cs
index c876327009..5ecb08c69b 100644
--- a/src/Microsoft.ML.Tokenizers/Model/SentencePieceUnigramModel.cs
+++ b/src/Microsoft.ML.Tokenizers/Model/SentencePieceUnigramModel.cs
@@ -22,6 +22,8 @@ internal sealed class SentencePieceUnigramModel : SentencePieceBaseModel
         private readonly DoubleArrayTrie _trie;
         private readonly float _minScore;
         private readonly float _maxScore;
+        private readonly (int Id, string Token)[] _prefixTokens;
+        private readonly (int Id, string Token)[] _suffixTokens;
         private const float UnkPenalty = 10.0f;
 
         public SentencePieceUnigramModel(ModelProto modelProto, bool addBos, bool addEos, IReadOnlyDictionary<string, int>? specialTokens = null) : base(modelProto, addBos, addEos, specialTokens)
@@ -91,6 +93,9 @@ public SentencePieceUnigramModel(ModelProto modelProto, bool addBos, bool addEos
                 _vocab[modelProto.TrainerSpec.PadPiece] = modelProto.TrainerSpec.PadId;
                 _vocabReverse[modelProto.TrainerSpec.PadId] = (modelProto.TrainerSpec.PadPiece, 0f, ModelProto.Types.SentencePiece.Types.Type.Control);
             }
+
+            _prefixTokens = DefaultAffix(BeginningOfSentenceId, BeginningOfSentenceToken);
+            _suffixTokens = DefaultAffix(EndOfSentenceId, EndOfSentenceToken);
         }
 
         // Constructor that builds a Unigram model directly from a list of (piece, score) pairs.
@@ -106,12 +111,37 @@ internal SentencePieceUnigramModel(
             bool escapeWhiteSpaces,
             bool treatWhitespaceAsSuffix,
             bool removeExtraWhitespaces,
+            bool byteFallback,
             IReadOnlyDictionary<string, int>? specialTokens)
             : this(pieces, unkId, addBos, addEos, precompiledCharsmap, addDummyPrefix, escapeWhiteSpaces,
-                   treatWhitespaceAsSuffix, removeExtraWhitespaces, specialTokens,
-                   FindSpecialTokenId(ValidateVocab(pieces, unkId), "<s>"),
-                   FindSpecialTokenId(pieces, "</s>"),
-                   FindSpecialTokenId(pieces, "<pad>"))
+                   treatWhitespaceAsSuffix, removeExtraWhitespaces, byteFallback, specialTokens,
+                   CheckSpecialId(addBos, FindSpecialTokenId(ValidateVocab(pieces, unkId), "<s>"), "addBeginningOfSentence"),
+                   CheckSpecialId(addEos, FindSpecialTokenId(pieces, "</s>"), "addEndOfSentence"),
+                   FindSpecialTokenId(pieces, "<pad>"), prefixTokens: null, suffixTokens: null)
+        {
+        }
+
+        // Constructor that builds a Unigram model with explicit prefix/suffix special-token lists, for example
+        // resolved from a tokenizer.json post_processor template. addBeginningOfSentence gates the prefix list
+        // and addEndOfSentence gates the suffix list; an empty list is allowed (no tokens are emitted).
+        internal SentencePieceUnigramModel(
+            IReadOnlyList<(string Piece, float Score)> pieces,
+            int unkId,
+            bool addBos,
+            bool addEos,
+            ReadOnlySpan<byte> precompiledCharsmap,
+            bool addDummyPrefix,
+            bool escapeWhiteSpaces,
+            bool treatWhitespaceAsSuffix,
+            bool removeExtraWhitespaces,
+            bool byteFallback,
+            IReadOnlyDictionary<string, int>? specialTokens,
+            IReadOnlyList<(int Id, string Token)> prefixTokens,
+            IReadOnlyList<(int Id, string Token)> suffixTokens,
+            int padId)
+            : this(pieces, unkId, addBos, addEos, precompiledCharsmap, addDummyPrefix, escapeWhiteSpaces,
+                   treatWhitespaceAsSuffix, removeExtraWhitespaces, byteFallback, specialTokens,
+                   FirstId(prefixTokens), FirstId(suffixTokens), padId, prefixTokens, suffixTokens)
         {
         }
 
@@ -125,13 +155,16 @@ private SentencePieceUnigramModel(
             bool escapeWhiteSpaces,
             bool treatWhitespaceAsSuffix,
             bool removeExtraWhitespaces,
+            bool byteFallback,
             IReadOnlyDictionary<string, int>? specialTokens,
-            int bosId, int eosId, int padId)
+            int bosId, int eosId, int padId,
+            IReadOnlyList<(int Id, string Token)>? prefixTokens,
+            IReadOnlyList<(int Id, string Token)>? suffixTokens)
             : base(addBos, addEos,
-                   bosId >= 0 && bosId < GetPieceCount(pieces) ? pieces[bosId].Piece : "<s>", CheckSpecialId(addBos, bosId, "addBeginningOfSentence"),
-                   eosId >= 0 && eosId < GetPieceCount(pieces) ? pieces[eosId].Piece : "</s>", CheckSpecialId(addEos, eosId, "addEndOfSentence"),
+                   bosId >= 0 && bosId < GetPieceCount(pieces) ? pieces[bosId].Piece : "<s>", bosId,
+                   eosId >= 0 && eosId < GetPieceCount(pieces) ? pieces[eosId].Piece : "</s>", eosId,
                    GetPieceAtIndex(pieces, unkId), unkId,
-                   addDummyPrefix, escapeWhiteSpaces, treatWhitespaceAsSuffix, byteFallback: false,
+                   addDummyPrefix, escapeWhiteSpaces, treatWhitespaceAsSuffix, byteFallback,
                    precompiledCharsmap, removeExtraWhitespaces, specialTokens)
         {
             Debug.Assert(pieces is not null);
@@ -141,6 +174,20 @@ private SentencePieceUnigramModel(
             _minScore = float.MaxValue;
             _maxScore = float.MinValue;
 
+            // Control tokens (BOS/EOS/PAD plus any caller- or added_tokens-supplied special tokens) are kept
+            // out of the trie so normal segmentation never produces them; they are re-inserted afterwards.
+            HashSet<int> controlIds = new HashSet<int>();
+            AddControlId(controlIds, bosId);
+            AddControlId(controlIds, eosId);
+            AddControlId(controlIds, padId);
+            if (specialTokens is not null)
+            {
+                foreach (int specialId in specialTokens.Values)
+                {
+                    AddControlId(controlIds, specialId);
+                }
+            }
+
             for (int i = 0; i < pieces.Count; i++)
             {
                 var (piece, score) = pieces[i];
@@ -148,7 +195,7 @@ private SentencePieceUnigramModel(
                 {
                     _vocabReverse[i] = (piece, score, ModelProto.Types.SentencePiece.Types.Type.Unknown);
                 }
-                else if (i == bosId || i == eosId || (padId >= 0 && i == padId))
+                else if (controlIds.Contains(i))
                 {
                     _vocabReverse[i] = (piece, score, ModelProto.Types.SentencePiece.Types.Type.Control);
                 }
@@ -167,30 +214,67 @@ private SentencePieceUnigramModel(
 
             _trie = new DoubleArrayTrie(_vocab);
 
-            // Add special tokens to vocab after trie is built.
+            // Re-insert special tokens into the vocab maps after the trie is built so they map like regular tokens.
             string unkToken = pieces[unkId].Piece;
             _vocab[unkToken] = unkId;
             _vocabReverse[unkId] = (unkToken, 0f, ModelProto.Types.SentencePiece.Types.Type.Unknown);
 
-            if (bosId >= 0 && bosId < pieces.Count)
+            foreach (int controlId in controlIds)
             {
-                string bos = pieces[bosId].Piece;
-                _vocab[bos] = bosId;
-                _vocabReverse[bosId] = (bos, 0f, ModelProto.Types.SentencePiece.Types.Type.Control);
+                if (controlId == unkId)
+                {
+                    continue; // unk is classified Unknown above; don't downgrade it to Control.
+                }
+
+                if (controlId >= 0 && controlId < pieces.Count)
+                {
+                    string piece = pieces[controlId].Piece;
+                    _vocab[piece] = controlId;
+                    _vocabReverse[controlId] = (piece, 0f, ModelProto.Types.SentencePiece.Types.Type.Control);
+                }
+            }
+
+            _prefixTokens = prefixTokens is not null ? ToAffixArray(prefixTokens) : DefaultAffix(BeginningOfSentenceId, BeginningOfSentenceToken);
+            _suffixTokens = suffixTokens is not null ? ToAffixArray(suffixTokens) : DefaultAffix(EndOfSentenceId, EndOfSentenceToken);
+        }
+
+        private static (int Id, string Token)[] DefaultAffix(int id, string token)
+            => id < 0 ? Array.Empty<(int, string)>() : new[] { (id, token) };
+
+        private static (int Id, string Token)[] ToAffixArray(IReadOnlyList<(int Id, string Token)> tokens)
+        {
+            var array = new (int Id, string Token)[tokens.Count];
+            for (int i = 0; i < tokens.Count; i++)
+            {
+                array[i] = tokens[i];
             }
 
-            if (eosId >= 0 && eosId < pieces.Count)
+            return array;
+        }
+
+        private static int FirstId(IReadOnlyList<(int Id, string Token)> tokens) => tokens.Count == 0 ? -1 : tokens[0].Id;
+
+        private void AddPrefixTokens(List<EncodedToken> tokens)
+        {
+            foreach (var (id, token) in _prefixTokens)
             {
-                string eos = pieces[eosId].Piece;
-                _vocab[eos] = eosId;
-                _vocabReverse[eosId] = (eos, 0f, ModelProto.Types.SentencePiece.Types.Type.Control);
+                tokens.Add(new EncodedToken(id, token, new Range(0, 0)));
             }
+        }
 
-            if (padId >= 0 && padId < pieces.Count)
+        private void AddSuffixTokens(List<EncodedToken> tokens, int offset)
+        {
+            foreach (var (id, token) in _suffixTokens)
             {
-                string pad = pieces[padId].Piece;
-                _vocab[pad] = padId;
-                _vocabReverse[padId] = (pad, 0f, ModelProto.Types.SentencePiece.Types.Type.Control);
+                tokens.Add(new EncodedToken(id, token, new Range(offset, offset)));
+            }
+        }
+
+        private static void AddControlId(HashSet<int> set, int id)
+        {
+            if (id >= 0)
+            {
+                set.Add(id);
             }
         }
 
@@ -382,7 +466,7 @@ private void EncodeToTokensWithSpecialTokens(
 
             if (addBeginningOfSentence)
             {
-                tokens.Add(new EncodedToken(BeginningOfSentenceId, BeginningOfSentenceToken, new Range(0, 0)));
+                AddPrefixTokens(tokens);
             }
 
             int currentOffset = 0;
@@ -414,7 +498,7 @@ private void EncodeToTokensWithSpecialTokens(
 
             if (addEndOfSentence)
             {
-                tokens.Add(new EncodedToken(EndOfSentenceId, EndOfSentenceToken, new Range(progressOffset, progressOffset)));
+                AddSuffixTokens(tokens, progressOffset);
             }
 
             normalizedText = normalizedString.AsSpan().Slice(0, normalizedStringIndex).ToString();
@@ -432,7 +516,7 @@ private void EncodeToTokensWithoutSpecialTokens(
         {
             if (addBeginningOfSentence)
             {
-                tokens.Add(new EncodedToken(BeginningOfSentenceId, BeginningOfSentenceToken, new Range(0, 0)));
+                AddPrefixTokens(tokens);
             }
 
             int progressOffset = 0;
@@ -442,7 +526,7 @@ private void EncodeToTokensWithoutSpecialTokens(
 
             if (addEndOfSentence)
             {
-                tokens.Add(new EncodedToken(EndOfSentenceId, EndOfSentenceToken, new Range(progressOffset, progressOffset)));
+                AddSuffixTokens(tokens, progressOffset);
             }
 
             normalizedText = normalizedString.AsSpan().Slice(0, normalizedStringIndex).ToString();
@@ -735,12 +819,15 @@ public override IReadOnlyList<int> EncodeToIds(
 
             if (addBeginningOfSentence)
             {
-                ids.Add(BeginningOfSentenceId);
-                if (maxTokenCount == 1)
+                foreach (var (id, _) in _prefixTokens)
                 {
-                    normalizedText = null;
-                    charsConsumed = 0;
-                    return ids; // done. no more space for anything else.
+                    ids.Add(id);
+                    if (ids.Count >= maxTokenCount)
+                    {
+                        normalizedText = null;
+                        charsConsumed = 0;
+                        return ids; // done. no more space for anything else.
+                    }
                 }
             }
 
@@ -759,9 +846,17 @@ public override IReadOnlyList<int> EncodeToIds(
                 EncodeToIdsWithoutSpecialTokens(textToEncode, considerNormalization, ids, buffer, ref normalizedString, out normalizedText, out charsConsumed, maxTokenCount);
             }
 
-            if (addEndOfSentence && ids.Count < maxTokenCount)
+            if (addEndOfSentence)
             {
-                ids.Add(EndOfSentenceId);
+                foreach (var (id, _) in _suffixTokens)
+                {
+                    if (ids.Count >= maxTokenCount)
+                    {
+                        break;
+                    }
+
+                    ids.Add(id);
+                }
             }
 
             if (normalizedString is not null)
@@ -1124,13 +1219,15 @@ public override int CountTokens(
 
             if (addBeginningOfSentence)
             {
-                tokenCount++;
-
-                if (maxTokenCount == 1)
+                foreach (var _ in _prefixTokens)
                 {
-                    normalizedText = null;
-                    charsConsumed = 0;
-                    return tokenCount;
+                    tokenCount++;
+                    if (tokenCount >= maxTokenCount)
+                    {
+                        normalizedText = null;
+                        charsConsumed = 0;
+                        return tokenCount;
+                    }
                 }
             }
 
@@ -1151,7 +1248,15 @@ public override int CountTokens(
 
             if (addEndOfSentence && tokenCount < maxTokenCount)
             {
-                tokenCount++;
+                foreach (var _ in _suffixTokens)
+                {
+                    if (tokenCount >= maxTokenCount)
+                    {
+                        break;
+                    }
+
+                    tokenCount++;
+                }
             }
 
             if (normalizedString is not null)
@@ -1392,12 +1497,14 @@ public override int GetIndexByTokenCountFromEnd(
 
             if (addEndOfSentence)
             {
-                tokenCount++;
-
-                if (maxTokenCount == 1)
+                foreach (var _ in _suffixTokens)
                 {
-                    normalizedText = null;
-                    return textToEncode.Length;
+                    tokenCount++;
+                    if (tokenCount >= maxTokenCount)
+                    {
+                        normalizedText = null;
+                        return textToEncode.Length;
+                    }
                 }
             }
 
@@ -1420,7 +1527,15 @@ public override int GetIndexByTokenCountFromEnd(
 
             if (addBeginningOfSentence && tokenCount < maxTokenCount)
             {
-                tokenCount++;
+                foreach (var _ in _prefixTokens)
+                {
+                    if (tokenCount >= maxTokenCount)
+                    {
+                        break;
+                    }
+
+                    tokenCount++;
+                }
             }
 
             ArrayPool<int>.Shared.Return(buffer);
diff --git a/test/Microsoft.ML.Tokenizers.Tests/UnigramTests.cs b/test/Microsoft.ML.Tokenizers.Tests/UnigramTests.cs
index 2d7a63373a..53343aa8d5 100644
--- a/test/Microsoft.ML.Tokenizers.Tests/UnigramTests.cs
+++ b/test/Microsoft.ML.Tokenizers.Tests/UnigramTests.cs
@@ -704,9 +704,11 @@ public void CreateFromVocabBosRequiredButAbsentTest()
         }
 
         [Fact]
-        public void CreateFromTokenizerJsonUnsupportedNormalizerTest()
+        public void CreateFromTokenizerJsonSequenceNormalizerWithExtraStepsTest()
         {
-            // A Sequence normalizer containing a non-Precompiled step should throw NotSupportedException.
+            // A Sequence normalizer that interleaves the precompiled map with other steps (e.g. Replace)
+            // is common in real tokenizers; the precompiled map is extracted and the other steps are ignored
+            // rather than failing the load.
             string json = """
                 {
                   "model": {
@@ -717,6 +719,7 @@ public void CreateFromTokenizerJsonUnsupportedNormalizerTest()
                   "normalizer": {
                     "type": "Sequence",
                     "normalizers": [
+                      { "type": "Nmt" },
                       { "type": "Precompiled", "precompiled_charsmap": "" },
                       { "type": "Replace", "pattern": " ", "content": "_" }
                     ]
@@ -725,8 +728,91 @@ public void CreateFromTokenizerJsonUnsupportedNormalizerTest()
                 """;
 
             using Stream stream = new System.IO.MemoryStream(System.Text.Encoding.UTF8.GetBytes(json));
-            Assert.Throws<NotSupportedException>(() =>
-                SentencePieceTokenizer.CreateFromTokenizerJson(stream, addBeginningOfSentence: false));
+            SentencePieceTokenizer tokenizer = SentencePieceTokenizer.CreateFromTokenizerJson(stream, addBeginningOfSentence: false);
+            Assert.NotNull(tokenizer);
+        }
+
+        // Vocab shared by the remove_extra_whitespaces deduction tests; "▁" is its own piece so a preserved
+        // extra space surfaces as an extra token.
+        private const string WhitespaceDeductionVocab =
+            "\"vocab\": [[\"<unk>\", 0.0], [\"\u2581a\", -1.0], [\"\u2581b\", -1.0], [\"\u2581\", -3.0], [\"a\", -10.0], [\"b\", -10.0]]";
+
+        [Fact]
+        public void CreateFromTokenizerJsonDeducesRemoveExtraWhitespacesFromReplaceStep()
+        {
+            // HF expresses remove_extra_whitespaces as Strip + Replace(" {2,}" -> "▁"). The collapsing Replace on
+            // its own (no sibling Strip) must turn whitespace collapsing on, so "a  b" yields exactly two pieces.
+            string json = $$"""
+                {
+                  "normalizer": {
+                    "type": "Sequence",
+                    "normalizers": [
+                      { "type": "Replace", "pattern": { "Regex": " {2,}" }, "content": "\u2581" }
+                    ]
+                  },
+                  "pre_tokenizer": { "type": "Metaspace", "replacement": "\u2581", "add_prefix_space": true },
+                  "model": {
+                    "type": "Unigram",
+                    "unk_id": 0,
+                    {{WhitespaceDeductionVocab}}
+                  }
+                }
+                """;
+
+            byte[] utf8 = System.Text.Encoding.UTF8.GetBytes(json);
+            using Stream stream = new System.IO.MemoryStream(utf8);
+            SentencePieceTokenizer tokenizer = SentencePieceTokenizer.CreateFromTokenizerJson(stream, addBeginningOfSentence: false, addEndOfSentence: false);
+            Assert.Equal(2, tokenizer.CountTokens("a  b", addBeginningOfSentence: false, addEndOfSentence: false));
+        }
+
+        [Fact]
+        public void CreateFromTokenizerJsonDeducesRemoveExtraWhitespacesFromStripStep()
+        {
+            // A lone right-Strip (no Replace step) is also taken as the collapsing marker.
+            string json = $$"""
+                {
+                  "normalizer": {
+                    "type": "Sequence",
+                    "normalizers": [
+                      { "type": "Strip", "strip_left": false, "strip_right": true }
+                    ]
+                  },
+                  "pre_tokenizer": { "type": "Metaspace", "replacement": "\u2581", "add_prefix_space": true },
+                  "model": {
+                    "type": "Unigram",
+                    "unk_id": 0,
+                    {{WhitespaceDeductionVocab}}
+                  }
+                }
+                """;
+
+            using Stream stream = new System.IO.MemoryStream(System.Text.Encoding.UTF8.GetBytes(json));
+            SentencePieceTokenizer tokenizer = SentencePieceTokenizer.CreateFromTokenizerJson(stream, addBeginningOfSentence: false, addEndOfSentence: false);
+            int pieceCount = tokenizer.CountTokens("a  b", addBeginningOfSentence: false, addEndOfSentence: false);
+            Assert.Equal(2, pieceCount);
+        }
+
+        [Fact]
+        public void CreateFromTokenizerJsonNoCollapseStepPreservesExtraWhitespace()
+        {
+            // With no Strip/Replace collapsing step (e.g. older bare-Precompiled files) remove_extra_whitespaces is
+            // deduced as false, matching the HF fast-tokenizer runtime, so the extra space survives as its own token.
+            string json = $$"""
+                {
+                  "normalizer": { "type": "Precompiled", "precompiled_charsmap": "" },
+                  "pre_tokenizer": { "type": "Metaspace", "replacement": "\u2581", "add_prefix_space": true },
+                  "model": {
+                    "type": "Unigram",
+                    "unk_id": 0,
+                    {{WhitespaceDeductionVocab}}
+                  }
+                }
+                """;
+
+            byte[] utf8 = System.Text.Encoding.UTF8.GetBytes(json);
+            using Stream stream = new System.IO.MemoryStream(utf8);
+            SentencePieceTokenizer tokenizer = SentencePieceTokenizer.CreateFromTokenizerJson(stream, addBeginningOfSentence: false, addEndOfSentence: false);
+            Assert.Equal(3, tokenizer.CountTokens("a  b", addBeginningOfSentence: false, addEndOfSentence: false));
         }
 
         [Fact]
@@ -749,5 +835,229 @@ public void CreateFromTokenizerJsonNullNormalizerTest()
                 stream, addBeginningOfSentence: false);
             Assert.NotNull(tokenizer);
         }
+
+        [Fact]
+        public void CreateFromVocabAbsentBosNotDecodedAsIdZeroTest()
+        {
+            // The vocab has no <s>/</s> pieces. With both add flags off, the BOS/EOS ids must stay absent (-1)
+            // rather than being clamped to 0, so that id 0 still decodes as its real piece (<unk>).
+            var vocab = new List<(string Piece, float Score)>
+            {
+                ("<unk>", 0f),   // id 0 (passed as unkId below)
+                ("▁Hello", -1f), // id 1
+                ("▁world", -2f), // id 2
+            };
+
+            SentencePieceTokenizer tokenizer = SentencePieceTokenizer.Create(
+                vocab, unkId: 0, addBeginningOfSentence: false, addEndOfSentence: false);
+
+            Assert.Equal(-1, tokenizer.BeginningOfSentenceId);
+            Assert.Equal(-1, tokenizer.EndOfSentenceId);
+
+            // id 0 is <unk>, not BOS; decoding it with considerSpecialTokens must yield the unk piece.
+            string decoded = tokenizer.Decode(new[] { 0 }, considerSpecialTokens: true);
+            Assert.Equal("<unk>", decoded);
+        }
+
+        [Fact]
+        public void CreateFromTokenizerJsonMissingModelTypeTest()
+        {
+            string json = """
+                {
+                  "model": {
+                    "unk_id": 0,
+                    "vocab": [["<unk>", 0.0], ["a", -1.0]]
+                  }
+                }
+                """;
+            // The "model" object above has no "type" member; creation is expected to throw InvalidDataException.
+            using Stream stream = new System.IO.MemoryStream(System.Text.Encoding.UTF8.GetBytes(json));
+            Assert.Throws<InvalidDataException>(() =>
+                SentencePieceTokenizer.CreateFromTokenizerJson(stream, addBeginningOfSentence: false));
+        }
+
+        [Fact]
+        public void CreateFromTokenizerJsonNonUnigramModelTypeTest()
+        {
+            string json = """
+                {
+                  "model": {
+                    "type": "BPE",
+                    "unk_id": 0,
+                    "vocab": [["<unk>", 0.0], ["a", -1.0]]
+                  }
+                }
+                """;
+            // The model "type" above is "BPE" rather than "Unigram"; creation is expected to throw InvalidDataException.
+            using Stream stream = new System.IO.MemoryStream(System.Text.Encoding.UTF8.GetBytes(json));
+            Assert.Throws<InvalidDataException>(() =>
+                SentencePieceTokenizer.CreateFromTokenizerJson(stream, addBeginningOfSentence: false));
+        }
+
+        [Fact]
+        public void CreateFromTokenizerJsonNullModelTest()
+        {
+            string json = """
+                {
+                  "model": null
+                }
+                """;
+            // "model" is an explicit JSON null above; creation is expected to throw InvalidDataException.
+            using Stream stream = new System.IO.MemoryStream(System.Text.Encoding.UTF8.GetBytes(json));
+            Assert.Throws<InvalidDataException>(() =>
+                SentencePieceTokenizer.CreateFromTokenizerJson(stream, addBeginningOfSentence: false));
+        }
+
+        [Fact]
+        public void CreateFromTokenizerJsonNullPreTokenizerTest()
+        {
+            // An explicit JSON null for "pre_tokenizer" must be accepted: creation should not throw.
+            string json = """
+                {
+                  "model": {
+                    "type": "Unigram",
+                    "unk_id": 0,
+                    "vocab": [["<unk>", 0.0], ["a", -1.0]]
+                  },
+                  "pre_tokenizer": null
+                }
+                """;
+
+            using Stream stream = new System.IO.MemoryStream(System.Text.Encoding.UTF8.GetBytes(json));
+            SentencePieceTokenizer tokenizer = SentencePieceTokenizer.CreateFromTokenizerJson(
+                stream, addBeginningOfSentence: false);
+            Assert.NotNull(tokenizer); // reaching this line means creation did not throw
+        }
+
+        [Fact]
+        public void CreateFromTokenizerJsonTemplateMultiTokenSuffixTest()
+        {
+            // XLNet-style template: the sequence is followed by two special tokens (<sep> then <cls>); EOS maps to the first (<sep>).
+            string json = """
+                {
+                  "model": {
+                    "type": "Unigram",
+                    "unk_id": 1,
+                    "vocab": [["<sep>", 0.0], ["<unk>", 0.0], ["<cls>", 0.0], ["a", -1.0], ["b", -2.0]]
+                  },
+                  "added_tokens": [
+                    { "id": 0, "content": "<sep>", "special": true },
+                    { "id": 1, "content": "<unk>", "special": true },
+                    { "id": 2, "content": "<cls>", "special": true }
+                  ],
+                  "pre_tokenizer": { "type": "Metaspace", "add_prefix_space": false, "replacement": "_" },
+                  "post_processor": {
+                    "type": "TemplateProcessing",
+                    "single": [
+                      { "Sequence": { "id": "A", "type_id": 0 } },
+                      { "SpecialToken": { "id": "<sep>", "type_id": 0 } },
+                      { "SpecialToken": { "id": "<cls>", "type_id": 0 } }
+                    ],
+                    "special_tokens": {
+                      "<sep>": { "id": "<sep>", "ids": [0], "tokens": ["<sep>"] },
+                      "<cls>": { "id": "<cls>", "ids": [2], "tokens": ["<cls>"] }
+                    }
+                  }
+                }
+                """;
+
+            using Stream stream = new System.IO.MemoryStream(System.Text.Encoding.UTF8.GetBytes(json));
+            SentencePieceTokenizer tokenizer = SentencePieceTokenizer.CreateFromTokenizerJson(stream, addBeginningOfSentence: false, addEndOfSentence: false);
+
+            Assert.Equal(0, tokenizer.EndOfSentenceId); // id 0 is <sep>, the first special token after the sequence
+            Assert.Equal("<sep>", tokenizer.EndOfSentenceToken);
+
+            IReadOnlyList<int> withSuffix = tokenizer.EncodeToIds("a", addBeginningOfSentence: false, addEndOfSentence: true);
+            Assert.Equal(new[] { 3, 0, 2 }, withSuffix); // "a" (3), then the full suffix: <sep> (0) and <cls> (2)
+
+            IReadOnlyList<int> withoutSuffix = tokenizer.EncodeToIds("a", addBeginningOfSentence: false, addEndOfSentence: false);
+            Assert.Equal(new[] { 3 }, withoutSuffix); // no suffix tokens when addEndOfSentence is false
+
+            Assert.Equal("a", tokenizer.Decode(withSuffix, considerSpecialTokens: false)); // both suffix tokens are dropped on decode
+        }
+
+        [Fact]
+        public void CreateFromTokenizerJsonRobertaProcessingTest()
+        {
+            // RobertaProcessing wraps the sequence with cls (<s>) in front and sep (</s>) at the end; they should surface as BOS/EOS.
+            string json = """
+                {
+                  "model": {
+                    "type": "Unigram",
+                    "unk_id": 1,
+                    "vocab": [["<s>", 0.0], ["<unk>", 0.0], ["</s>", 0.0], ["a", -1.0]]
+                  },
+                  "added_tokens": [
+                    { "id": 0, "content": "<s>", "special": true },
+                    { "id": 2, "content": "</s>", "special": true }
+                  ],
+                  "pre_tokenizer": { "type": "Metaspace", "add_prefix_space": false, "replacement": "_" },
+                  "post_processor": { "type": "RobertaProcessing", "sep": ["</s>", 2], "cls": ["<s>", 0] }
+                }
+                """;
+
+            using Stream stream = new System.IO.MemoryStream(System.Text.Encoding.UTF8.GetBytes(json));
+            SentencePieceTokenizer tokenizer = SentencePieceTokenizer.CreateFromTokenizerJson(stream, addBeginningOfSentence: false, addEndOfSentence: false);
+
+            Assert.Equal(0, tokenizer.BeginningOfSentenceId);
+            Assert.Equal("<s>", tokenizer.BeginningOfSentenceToken);
+            Assert.Equal(2, tokenizer.EndOfSentenceId);
+            Assert.Equal("</s>", tokenizer.EndOfSentenceToken);
+
+            IReadOnlyList<int> ids = tokenizer.EncodeToIds("a", addBeginningOfSentence: true, addEndOfSentence: true);
+            Assert.Equal(new[] { 0, 3, 2 }, ids); // <s> (0), "a" (3), </s> (2)
+        }
+
+        [Fact]
+        public void CreateFromTokenizerJsonAddedTokenRecognizedTest()
+        {
+            // A special token from added_tokens that is not <s>/</s>/<pad> (here <extra>, id 4) must still be recognized as atomic.
+            string json = """
+                {
+                  "model": {
+                    "type": "Unigram",
+                    "unk_id": 1,
+                    "vocab": [["<s>", 0.0], ["<unk>", 0.0], ["</s>", 0.0], ["a", -1.0], ["<extra>", -5.0]]
+                  },
+                  "added_tokens": [
+                    { "id": 4, "content": "<extra>", "special": true }
+                  ],
+                  "pre_tokenizer": { "type": "Metaspace", "add_prefix_space": false, "replacement": "_" }
+                }
+                """;
+
+            using Stream stream = new System.IO.MemoryStream(System.Text.Encoding.UTF8.GetBytes(json));
+            SentencePieceTokenizer tokenizer = SentencePieceTokenizer.CreateFromTokenizerJson(stream, addBeginningOfSentence: false, addEndOfSentence: false);
+
+            IReadOnlyList<int> ids = tokenizer.EncodeToIds("a<extra>a", addBeginningOfSentence: false, addEndOfSentence: false);
+            Assert.Equal(new[] { 3, 4, 3 }, ids); // "a" (3), <extra> (4) as a single id, "a" (3)
+        }
+
+        [Fact]
+        public void CreateFromTokenizerJsonTemplateMultiSequenceThrowsTest()
+        {
+            // The "single" template below lists two Sequence placeholders (A and B); that cannot be represented and must be rejected.
+            string json = """
+                {
+                  "model": {
+                    "type": "Unigram",
+                    "unk_id": 0,
+                    "vocab": [["<unk>", 0.0], ["a", -1.0]]
+                  },
+                  "post_processor": {
+                    "type": "TemplateProcessing",
+                    "single": [
+                      { "Sequence": { "id": "A", "type_id": 0 } },
+                      { "Sequence": { "id": "B", "type_id": 0 } }
+                    ],
+                    "special_tokens": {}
+                  }
+                }
+                """;
+
+            using Stream stream = new System.IO.MemoryStream(System.Text.Encoding.UTF8.GetBytes(json));
+            Assert.Throws<NotSupportedException>(() =>
+                SentencePieceTokenizer.CreateFromTokenizerJson(stream, addBeginningOfSentence: false, addEndOfSentence: false));
+        }
     }
 }