mirror of
https://github.com/comfyanonymous/ComfyUI.git
synced 2025-08-02 23:14:49 +08:00
Rename LLAMATokenizer to SPieceTokenizer.
This commit is contained in:
22
comfy/text_encoders/spiece_tokenizer.py
Normal file
22
comfy/text_encoders/spiece_tokenizer.py
Normal file
@@ -0,0 +1,22 @@
|
||||
import os
|
||||
|
||||
class SPieceTokenizer:
    """Thin wrapper around a ``sentencepiece.SentencePieceProcessor``.

    Exposes the small subset of a tokenizer interface used by callers:
    ``from_pretrained``, ``get_vocab`` and ``__call__``. Every encoded
    sequence has the model's end-of-sequence id appended.
    """

    @classmethod
    def from_pretrained(cls, path):
        """Build a tokenizer from the SentencePiece model file at *path*.

        Implemented as a classmethod so that calling it on a subclass
        returns an instance of that subclass rather than the base class.
        """
        return cls(path)

    def __init__(self, tokenizer_path):
        """Load the SentencePiece model stored at *tokenizer_path*."""
        # Imported here rather than at module level, so importing this
        # module does not require sentencepiece until a tokenizer is built.
        import sentencepiece

        self.tokenizer = sentencepiece.SentencePieceProcessor(model_file=tokenizer_path)
        # Id appended to every encoded sequence by __call__.
        self.end = self.tokenizer.eos_id()

    def get_vocab(self):
        """Return a dict mapping every piece string to its integer id."""
        sp = self.tokenizer
        return {sp.id_to_piece(i): i for i in range(sp.get_piece_size())}

    def __call__(self, string):
        """Encode *string* and return ``{"input_ids": ids}``.

        ``ids`` is the encoder output followed by the end-of-sequence id.
        """
        ids = self.tokenizer.encode(string)
        ids += [self.end]
        return {"input_ids": ids}
|
Reference in New Issue
Block a user