55 lines
1.2 KiB
Go
55 lines
1.2 KiB
Go
package tokenizer
|
|
|
|
import (
|
|
"strconv"
|
|
"strings"
|
|
)
|
|
|
|
// Decode converts token IDs back to text
|
|
func (t *Tokenizer) Decode(ids []int32) string {
|
|
var sb strings.Builder
|
|
|
|
for _, id := range ids {
|
|
if int(id) >= len(t.vocab.Values) {
|
|
continue
|
|
}
|
|
|
|
token := t.vocab.Values[id]
|
|
|
|
switch t.typ {
|
|
case TokenizerSentencePiece:
|
|
// SentencePiece style: replace ▁ with space, decode byte tokens
|
|
token = strings.ReplaceAll(token, "▁", " ")
|
|
// Handle byte fallback tokens like <0x0D>
|
|
if len(token) == 6 && token[0] == '<' && token[1] == '0' && token[2] == 'x' && token[5] == '>' {
|
|
if v, err := strconv.ParseUint(token[3:5], 16, 8); err == nil {
|
|
sb.WriteByte(byte(v))
|
|
continue
|
|
}
|
|
}
|
|
sb.WriteString(token)
|
|
default:
|
|
// GPT-2 BPE style: decode byte-level encoding
|
|
for _, r := range token {
|
|
switch {
|
|
case r == 0x0100:
|
|
// Mirror GGML tokenizer behavior for NULL byte.
|
|
// 0x00 is omitted during decode.
|
|
continue
|
|
case r == 0x0143:
|
|
r = 0x00ad
|
|
case r > 0x0100 && r <= 0x0120:
|
|
r = r - 0x0100
|
|
case r > 0x0120 && r <= 0x0142:
|
|
r = r - 0x00a2
|
|
}
|
|
|
|
// Write as byte, not UTF-8 encoded rune
|
|
sb.WriteByte(byte(r))
|
|
}
|
|
}
|
|
}
|
|
|
|
return sb.String()
|
|
}
|