Meta Llama 3 文本编码为 token
flyfish
tiktoken 是 OpenAI 开源的快速 BPE(Byte Pair Encoding)分词器,这里用在 Meta Llama 3 上。它的主要功能有两个:将文本编码为 token,以及将 token 解码回文本;编码和解码都基于 BPE 这种子词(subword)分割方法。
参考网址
https://github.com/openai/tiktoken
https://github.com/karpathy/minbpe
什么是BPE(Byte Pair Encoding)?
BPE(Byte Pair Encoding)是一种用于文本分词的子词(subword)分割算法。它从单个字节(或字符)出发,反复把语料中出现频率最高的相邻符号对合并成一个新符号并加入词汇表,直到词汇表达到预设大小。这样既能用有限的词汇表表示任意文本,又能让常见的词或词片段只占用很少的 token,从而更高效地处理和表示文本数据。
BPE在tiktoken中的应用
简单的应用
import tiktoken
# Load the GPT-2 encoding
enc = tiktoken.get_encoding("gpt2")
# Sample text
text = "This is an example text."
# Encode the text into tokens
tokens = enc.encode(text)
print(f"Encoded tokens: {tokens}")
# Decode the tokens back into the original text
decoded_text = enc.decode(tokens)
print(f"Decoded text: {decoded_text}")
Meta Llama 3的使用方式 - load_tiktoken_bpe函数
在tiktoken库中,BPE用于将文本编码成模型可以处理的tokens。load_tiktoken_bpe函数从模型文件中加载BPE词表,即每个token(字节串)到其合并优先级(rank)的映射(下面代码中的 mergeable_ranks);tiktoken.Encoding 再结合切分正则(pat_str)和特殊token表,把文本分解成子词单位。
代码示例
import os
from logging import getLogger
from pathlib import Path
from typing import (
AbstractSet,
cast,
Collection,
Dict,
Iterator,
List,
Literal,
Sequence,
TypedDict,
Union,
)
import tiktoken
from tiktoken.load import load_tiktoken_bpe
# Module-level logger, named after this module.
logger = getLogger(__name__)

# The speaker roles a chat message may carry.
Role = Literal["system", "user", "assistant"]


class Message(TypedDict):
    """A single chat message: the speaker's role and the message text."""

    role: Role
    content: str


# A dialog is an ordered sequence of messages.
Dialog = Sequence[Message]
class Tokenizer:
    """
    Tokenizing and encoding/decoding text using the Tiktoken tokenizer.

    Wraps a ``tiktoken.Encoding`` built from a BPE rank file plus a fixed
    table of 256 special tokens whose ids follow the base vocabulary.
    """

    # Maps each special-token string to its id; filled in by __init__.
    special_tokens: Dict[str, int]

    # Total number of special tokens appended after the base vocabulary.
    num_reserved_special_tokens = 256

    # Pre-tokenization regex handed to tiktoken; text is split by this
    # pattern before BPE merges are applied.
    pat_str = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"  # noqa: E501

    def __init__(self, model_path: str):
        """
        Initializes the Tokenizer with a Tiktoken model.

        Args:
            model_path (str): The path to the Tiktoken model file.

        Raises:
            AssertionError: If ``model_path`` is not an existing file.
        """
        assert os.path.isfile(model_path), model_path

        mergeable_ranks = load_tiktoken_bpe(model_path)
        num_base_tokens = len(mergeable_ranks)

        # 10 explicitly listed tokens + reserved tokens 5..250 (246 entries)
        # = 256 special tokens in total (num_reserved_special_tokens).
        special_tokens = [
            "<|begin_of_text|>",
            "<|end_of_text|>",
            "<|reserved_special_token_0|>",
            "<|reserved_special_token_1|>",
            "<|reserved_special_token_2|>",
            "<|reserved_special_token_3|>",
            "<|start_header_id|>",
            "<|end_header_id|>",
            "<|reserved_special_token_4|>",
            "<|eot_id|>",  # end of turn
        ] + [
            f"<|reserved_special_token_{i}|>"
            for i in range(5, self.num_reserved_special_tokens - 5)
        ]

        # Special-token ids start right after the last base-vocabulary id.
        self.special_tokens = {
            token: num_base_tokens + i for i, token in enumerate(special_tokens)
        }
        self.model = tiktoken.Encoding(
            name=Path(model_path).name,
            pat_str=self.pat_str,
            mergeable_ranks=mergeable_ranks,
            special_tokens=self.special_tokens,
        )
        logger.info(f"Reloaded tiktoken model from {model_path}")

        self.n_words: int = self.model.n_vocab
        # BOS / EOS token IDs
        self.bos_id: int = self.special_tokens["<|begin_of_text|>"]
        self.eos_id: int = self.special_tokens["<|end_of_text|>"]
        # NOTE(review): -1 presumably means "no padding token" - confirm with callers.
        self.pad_id: int = -1
        # Ids that mark the end of a sequence or of a turn.
        self.stop_tokens = {
            self.special_tokens["<|end_of_text|>"],
            self.special_tokens["<|eot_id|>"],
        }
        logger.info(
            f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
        )

    def encode(
        self,
        s: str,
        *,
        bos: bool,
        eos: bool,
        # The default set is never mutated here; it is only forwarded to tiktoken.
        allowed_special: Union[Literal["all"], AbstractSet[str]] = set(),
        disallowed_special: Union[Literal["all"], Collection[str]] = (),
    ) -> List[int]:
        """
        Encodes a string into a list of token IDs.

        Args:
            s (str): The text to encode.
            bos (bool): Whether to prepend the beginning-of-text token id.
            eos (bool): Whether to append the end-of-text token id.
            allowed_special: Forwarded unchanged to ``self.model.encode``.
            disallowed_special: Forwarded unchanged to ``self.model.encode``.

        Returns:
            List[int]: The token IDs, with BOS/EOS ids added as requested.

        Raises:
            AssertionError: If ``s`` is not exactly of type ``str``.
        """
        assert type(s) is str

        # The input is fed to tiktoken in windows of at most this many
        # characters (presumably to stay within tiktoken's input limits).
        TIKTOKEN_MAX_ENCODE_CHARS = 400_000
        # Within each window, runs of consecutive whitespace or consecutive
        # non-whitespace characters longer than this are split further.
        MAX_NO_WHITESPACES_CHARS = 25_000

        # Lazily produce the pieces so the whole input is never duplicated.
        substrs = (
            substr
            for i in range(0, len(s), TIKTOKEN_MAX_ENCODE_CHARS)
            for substr in self._split_whitespaces_or_nonwhitespaces(
                s[i : i + TIKTOKEN_MAX_ENCODE_CHARS], MAX_NO_WHITESPACES_CHARS
            )
        )

        # Seed the list with BOS up front instead of an O(n) insert(0, ...)
        # after all tokens have been collected.
        t: List[int] = [self.bos_id] if bos else []
        for substr in substrs:
            t.extend(
                self.model.encode(
                    substr,
                    allowed_special=allowed_special,
                    disallowed_special=disallowed_special,
                )
            )
        if eos:
            t.append(self.eos_id)
        return t

    def decode(self, t: Sequence[int]) -> str:
        """
        Decodes a sequence of token IDs into a string.

        Args:
            t (Sequence[int]): The token IDs to decode.

        Returns:
            str: The decoded text.
        """
        # The cast only satisfies the type checker; no copy is made.
        return self.model.decode(cast(List[int], t))

    @staticmethod
    def _split_whitespaces_or_nonwhitespaces(
        s: str, max_consecutive_slice_len: int
    ) -> Iterator[str]:
        """
        Splits ``s`` so that no yielded piece contains a run of more than
        ``max_consecutive_slice_len`` consecutive whitespace characters or
        consecutive non-whitespace characters.

        Concatenating the yielded pieces reproduces ``s``. An empty input
        yields a single empty string.
        """
        current_slice_len = 0
        current_slice_is_space = s[0].isspace() if s else False
        slice_start = 0

        for i, ch in enumerate(s):
            is_now_space = ch.isspace()

            if current_slice_is_space ^ is_now_space:
                # Character class flipped: a new run starts at this character.
                current_slice_len = 1
                current_slice_is_space = is_now_space
            else:
                current_slice_len += 1
                if current_slice_len > max_consecutive_slice_len:
                    # The run grew too long: cut just before this character.
                    yield s[slice_start:i]
                    slice_start = i
                    current_slice_len = 1
        yield s[slice_start:]
model_path = "Meta-Llama-3-8B-Instruct/tokenizer.model"
tokenizer = Tokenizer(model_path)

# Encode a sentence with the BOS/EOS markers added.
sentence = "This is a test sentence."
encoded_ids = tokenizer.encode(sentence, bos=True, eos=True)
print(encoded_ids)

# Decode a list of ids (including the BOS/EOS ids) back to text.
ids_to_decode = [128000, 2028, 374, 264, 1296, 11914, 13, 128001]
print(tokenizer.decode(ids_to_decode))
输出
[128000, 2028, 374, 264, 1296, 11914, 13, 128001]
<|begin_of_text|>This is a test sentence.<|end_of_text|>
再测试一个
# Same round trip with a sentence that contains an apostrophe.
sentence = "This is Ji'nan in the winter"
encoded_ids = tokenizer.encode(sentence, bos=True, eos=True)
print(encoded_ids)

ids_to_decode = [128000, 2028, 374, 55551, 6, 19285, 304, 279, 12688, 128001]
print(tokenizer.decode(ids_to_decode))
输出
[128000, 2028, 374, 55551, 6, 19285, 304, 279, 12688, 128001]
<|begin_of_text|>This is Ji'nan in the winter<|end_of_text|>
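在这个例子中,load_tiktoken_bpe函数加载了预训练的BPE词表(mergeable_ranks),Tokenizer 再用它和特殊token表构造 tiktoken.Encoding,由后者将输入的文本分割成tokens;之后,这些tokens可以被解码回原文本。开启 bos/eos 时,编码结果的首尾会分别加上 <|begin_of_text|>(128000)和 <|end_of_text|>(128001)。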
扩展
import tiktoken
cl100k_base = tiktoken.get_encoding("cl100k_base")

# Start from the base encoding's special tokens and add the two chat markers.
extended_special_tokens = dict(cl100k_base._special_tokens)
extended_special_tokens.update(
    {
        "<|im_start|>": 100264,
        "<|im_end|>": 100265,
    }
)

# In production, load the arguments directly instead of accessing private attributes
# See openai_public.py for examples of arguments for specific encodings
enc = tiktoken.Encoding(
    # If you're changing the set of special tokens, make sure to use a different name
    # It should be clear from the name what behaviour to expect.
    name="cl100k_im",
    pat_str=cl100k_base._pat_str,
    mergeable_ranks=cl100k_base._mergeable_ranks,
    special_tokens=extended_special_tokens,
)
print(enc)  # <Encoding 'cl100k_im'>