Fix random_mask_tokenize when the text is long
Without this patch, the function crashes with a TypeError whenever a text is longer than the context length, because it tries to index a plain Python list with a list of indices. See https://colab.research.google.com/drive/1SHBAUEnI1dNJmXQPUqZekFqXm7xrwH65?usp=sharing
bryant1410 authored Oct 18, 2023
1 parent e7b39e4 commit 07ab8ae
Showing 1 changed file with 2 additions and 2 deletions.
4 changes: 2 additions & 2 deletions src/open_clip/tokenizer.py
@@ -250,7 +250,7 @@ def random_mask_tokenize(texts: Union[str, List[str]], context_length: int = 77)
         if len(tokens) > context_length - 2:  # 2 for sot and eot token
             indices = np.random.permutation(len(tokens)).tolist()
             indices = indices[:context_length - 2]
-            tokens = tokens[indices]
+            tokens = [tokens[i] for i in indices]
         tokens = [sot_token,] + tokens + [eot_token,]
         result[i, :len(tokens)] = torch.tensor(tokens)

@@ -350,4 +350,4 @@ def get_order(x):
         tokens[-1] = eot_token
         result[i, :len(tokens)] = torch.tensor(tokens)

-    return result
\ No newline at end of file
+    return result
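
For context on the first hunk: tokens is a plain Python list of token ids, and unlike a NumPy array, a Python list cannot be indexed with a list of indices, so the old line raised a TypeError. The snippet below is a minimal standalone sketch of the failure and the fix, not code from the repository; the token ids and context length are made up for illustration.

import numpy as np

tokens = [320, 1125, 9458, 2368, 512, 771, 643]  # placeholder token ids
context_length = 5

# Keep a random subset of context_length - 2 positions
# (2 slots are reserved for the sot and eot tokens).
indices = np.random.permutation(len(tokens)).tolist()
indices = indices[:context_length - 2]

# Old line: indexing a list with a list raises
# "TypeError: list indices must be integers or slices, not list".
# tokens = tokens[indices]

# Fixed line: select the kept token ids one by one.
tokens = [tokens[i] for i in indices]
print(tokens)  # 3 randomly chosen token ids

The second hunk only changes the trailing newline at the end of the file; the returned tensor is unchanged.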
