Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 44 additions & 0 deletions src/Microsoft.ML.Tokenizers/Model/SentencePieceBaseModel.cs
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,50 @@ internal SentencePieceBaseModel(ModelProto modelProto, bool addBos = false, bool
specialTokens);
}

/// <summary>
/// Initializes the model from explicitly supplied settings instead of a <c>ModelProto</c>:
/// stores the sentence-marker configuration, builds the special-token lookup tables and regex,
/// and creates the <see cref="SentencePieceNormalizer"/>.
/// </summary>
/// <param name="addBos">Stored in <c>AddBeginningOfSentence</c>.</param>
/// <param name="addEos">Stored in <c>AddEndOfSentence</c>.</param>
/// <param name="bosToken">Stored in <c>BeginningOfSentenceToken</c>.</param>
/// <param name="bosId">Stored in <c>BeginningOfSentenceId</c>.</param>
/// <param name="eosToken">Stored in <c>EndOfSentenceToken</c>.</param>
/// <param name="eosId">Stored in <c>EndOfSentenceId</c>.</param>
/// <param name="unkToken">Stored in <c>UnknownToken</c>.</param>
/// <param name="unkId">Stored in <c>UnknownId</c>.</param>
/// <param name="addDummyPrefix">Stored in <c>AddDummyPrefix</c> and forwarded to the normalizer.</param>
/// <param name="escapeWhiteSpaces">Stored in <c>EscapeWhiteSpaces</c> and forwarded to the normalizer.</param>
/// <param name="treatWhitespaceAsSuffix">Stored in <c>TreatWhitespaceAsSuffix</c> and forwarded to the normalizer.</param>
/// <param name="byteFallback">Stored in <c>ByteFallback</c>.</param>
/// <param name="precompiledCharsmap">Forwarded to the normalizer constructor.</param>
/// <param name="removeExtraWhitespaces">Forwarded to the normalizer constructor.</param>
/// <param name="specialTokens">
/// Optional map from special-token text to id. Stored in <c>SpecialTokens</c> as-is; when it is
/// <see langword="null"/> or empty, the derived lookup tables and the regex are not assigned here.
/// </param>
/// <exception cref="ArgumentException">
/// Thrown by <see cref="Dictionary{TKey, TValue}.Add"/> when two entries of
/// <paramref name="specialTokens"/> share the same id, because the reverse map is keyed by id.
/// </exception>
internal SentencePieceBaseModel(
    bool addBos, bool addEos,
    string bosToken, int bosId,
    string eosToken, int eosId,
    string unkToken, int unkId,
    bool addDummyPrefix, bool escapeWhiteSpaces,
    bool treatWhitespaceAsSuffix, bool byteFallback,
    ReadOnlySpan<byte> precompiledCharsmap, bool removeExtraWhitespaces,
    IReadOnlyDictionary<string, int>? specialTokens)
{
    AddBeginningOfSentence = addBos;
    AddEndOfSentence = addEos;
    BeginningOfSentenceToken = bosToken;
    BeginningOfSentenceId = bosId;
    EndOfSentenceToken = eosToken;
    EndOfSentenceId = eosId;
    UnknownToken = unkToken;
    UnknownId = unkId;
    AddDummyPrefix = addDummyPrefix;
    EscapeWhiteSpaces = escapeWhiteSpaces;
    TreatWhitespaceAsSuffix = treatWhitespaceAsSuffix;
    ByteFallback = byteFallback;
    SpecialTokens = specialTokens;

    if (specialTokens is not null && specialTokens.Count > 0)
    {
        // Both tables end up with exactly one entry per special token, so size them up front.
        InternalSpecialTokens = new Dictionary<StringSpanOrdinalKey, int>(specialTokens.Count);
        SpecialTokensReverse = new Dictionary<int, string>(specialTokens.Count);

        foreach (var item in specialTokens)
        {
            // Forward map (token text -> id) and reverse map (id -> token text).
            InternalSpecialTokens.Add(new StringSpanOrdinalKey(item.Key), item.Value);
            SpecialTokensReverse.Add(item.Value, item.Key);
        }

        // Alternation of the literal token strings; each one is escaped so that regex
        // metacharacters inside a token are matched verbatim.
        SpecialTokensRegex = new Regex(string.Join("|", specialTokens.Keys.Select(s => Regex.Escape(s))), RegexOptions.Compiled);
    }

    Normalizer = new SentencePieceNormalizer(
        precompiledCharsmap, removeExtraWhitespaces,
        addDummyPrefix, escapeWhiteSpaces,
        treatWhitespaceAsSuffix, specialTokens);
}

/// <summary>
/// Regex matching any of the special-token strings. The constructor that takes explicit settings
/// assigns it only when a non-empty special-token dictionary is supplied, as an alternation of the
/// escaped token texts; otherwise that constructor leaves it <see langword="null"/>.
/// </summary>
internal Regex? SpecialTokensRegex { get; }

/// <summary>
/// Special-token text (wrapped in <see cref="StringSpanOrdinalKey"/>) mapped to its id. The
/// constructor that takes explicit settings populates it only when a non-empty special-token
/// dictionary is supplied; otherwise that constructor leaves it <see langword="null"/>.
/// </summary>
internal Dictionary<StringSpanOrdinalKey, int>? InternalSpecialTokens { get; }
Expand Down
Loading
Loading