Skip to content

Commit

Permalink
Fix error: Chinese characters cannot be displayed correctly (#1342)
Browse files Browse the repository at this point in the history
  • Loading branch information
wjunLu authored Nov 7, 2024
1 parent 170581a commit 743e6f3
Showing 1 changed file with 8 additions and 1 deletion.
9 changes: 8 additions & 1 deletion torchchat/usages/openai_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -388,6 +388,8 @@ def callback(x, *, done_generating=False):

device_sync(device=self.builder_args.device)

buffer = []
ILLEGAL_CHAR = '\ufffd'
# Process each token, metrics tuple yielded by Generator.generate.
for y, _ in self.generate(
model=self.model,
Expand All @@ -413,10 +415,15 @@ def callback(x, *, done_generating=False):
break

y = y.view(-1)
buffer.append(y.item())
# Decode the token ids to a string. A period token id is prepended before decoding and the first character of the decoded text is dropped afterwards.
content = "".join(
self.tokenizer.decode([self.tokenizer.encode(".")[0]] + y.tolist())[1:]
self.tokenizer.decode([self.tokenizer.encode(".")[0]] + buffer)[1:]
)
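# If the decoded text contains ILLEGAL_CHAR (U+FFFD, the Unicode replacement character), do not emit it yet:
# the token ids stay in the buffer and are decoded again together with the next token.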
if ILLEGAL_CHAR in content:
continue
buffer.clear()

# Package the sequence into a CompletionChunkResponse and yield it.
chunk_delta = ChunkDelta(
Expand Down

0 comments on commit 743e6f3

Please sign in to comment.