Skip to content

Commit

Permalink
Merge pull request #2 from petergoldstein/feature/add_github_actions_ci
Browse files Browse the repository at this point in the history
Add CI with GitHub Actions
  • Loading branch information
IAPark authored Mar 28, 2023
2 parents 7bc51a0 + af2250b commit 9bc513c
Show file tree
Hide file tree
Showing 12 changed files with 118 additions and 69 deletions.
37 changes: 37 additions & 0 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
name: build
on: [push, pull_request]
jobs:
test:
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest]
ruby: [3.2, 3.1, "3.0"]
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v3
- uses: actions/cache@v3
with:
path: |
~/.cargo/registry
~/.cargo/git
tmp
key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
- uses: ruby/setup-ruby@v1
with:
ruby-version: ${{ matrix.ruby }}
bundler-cache: true
- run: bundle exec rake compile
- run: bundle exec rake spec
lint:
strategy:
matrix:
os: [ubuntu-latest]
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v3
- uses: ruby/setup-ruby@v1
with:
ruby-version: 3.1
bundler-cache: true
- run: bundle exec rake standard
4 changes: 2 additions & 2 deletions .github/workflows/cross_compile.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ jobs:
matrix:
platform: ${{ fromJSON(needs.ci-data.outputs.result).supported-ruby-platforms }}
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3

- uses: ruby/setup-ruby@v1
with:
Expand All @@ -42,4 +42,4 @@ jobs:
- uses: actions/upload-artifact@v2
with:
name: cross-gem
path: ${{ steps.cross-gem.outputs.gem-path }}
path: ${{ steps.cross-gem.outputs.gem-path }}
2 changes: 1 addition & 1 deletion Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,6 @@ gem "rb_sys"
gem "rspec", "~> 3.0"

gem "standard", "~> 1.3"
gem 'pry', '~> 0.14.2'
gem "pry", "~> 0.14.2"

gem "yard-doctest", "~> 0.1.17"
2 changes: 2 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ GEM

PLATFORMS
arm64-darwin-22
x86_64-darwin-22
x86_64-linux

DEPENDENCIES
pry (~> 0.14.2)
Expand Down
11 changes: 11 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,17 @@ To install this gem onto your local machine, run `bundle exec rake install`. To

Bug reports and pull requests are welcome on GitHub at https://github.com/IAPark/tiktoken_ruby.

To get started with development:

```sh
git clone https://github.com/IAPark/tiktoken_ruby.git
cd tiktoken_ruby
bundle install
bundle exec rake compile
bundle exec rake spec
```


## License

The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
1 change: 0 additions & 1 deletion Rakefile
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ end

RSpec::Core::RakeTask.new(:spec)


task :native, [:platform] do |_t, platform:|
sh "bundle", "exec", "rb-sys-dock", "--platform", platform, "--build"
end
Expand Down
2 changes: 1 addition & 1 deletion doctest_helper.rb
Original file line number Diff line number Diff line change
@@ -1 +1 @@
require 'lib/tiktoken_ruby.rb'
require "lib/tiktoken_ruby"
17 changes: 8 additions & 9 deletions lib/tiktoken_ruby.rb
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# frozen_string_literal: true

require_relative "tiktoken_ruby/version"
require_relative "tiktoken_ruby/encoding.rb"
require_relative "tiktoken_ruby/encoding"

begin
RUBY_VERSION =~ /(\d+\.\d+)/
Expand All @@ -12,7 +12,6 @@

module Tiktoken
class << self

# Returns an encoding by name. If the encoding is not already loaded it will be loaded; otherwise
# it will reuse the instance of that type that was previously loaded
# @param name [Symbol|String] The name of the encoding to load
Expand All @@ -34,7 +33,7 @@ def get_encoding(name)
# enc = Tiktoken.encoding_for_model("gpt-4")
# enc.encode("hello world").length #=> 2
def encoding_for_model(model_name)
for prefix in PREFIX_MODELS
PREFIX_MODELS.each do |prefix|
if model_name.to_s.start_with?("#{prefix}-")
model_name = prefix
break
Expand Down Expand Up @@ -65,7 +64,7 @@ def list_model_names
:r50k_base,
:p50k_base,
:p50k_edit,
:cl100k_base,
:cl100k_base
]

# taken from the python library here https://github.com/openai/tiktoken/blob/main/tiktoken/model.py#L13-L53
Expand All @@ -80,10 +79,10 @@ def list_model_names
"text-curie-001": "r50k_base",
"text-babbage-001": "r50k_base",
"text-ada-001": "r50k_base",
"davinci": "r50k_base",
"curie": "r50k_base",
"babbage": "r50k_base",
"ada": "r50k_base",
davinci: "r50k_base",
curie: "r50k_base",
babbage: "r50k_base",
ada: "r50k_base",
# code
"code-davinci-002": "p50k_base",
"code-davinci-001": "p50k_base",
Expand All @@ -106,7 +105,7 @@ def list_model_names
"text-search-babbage-doc-001": "r50k_base",
"text-search-ada-doc-001": "r50k_base",
"code-search-babbage-code-001": "r50k_base",
"code-search-ada-code-001": "r50k_base",
"code-search-ada-code-001": "r50k_base"
}

# these are models that have versioned variants which are otherwise identical
Expand Down
95 changes: 48 additions & 47 deletions lib/tiktoken_ruby/encoding.rb
Original file line number Diff line number Diff line change
@@ -1,51 +1,52 @@
# frozen_string_literal: true

class Tiktoken::Encoding
attr_reader :name

# This returns a new Tiktoken::Encoding instance for the requested encoding
# @param encoding [Symbol] The name of the encoding to load
# @return [Tiktoken::Encoding] The encoding instance
def self.for_name(encoding)
Tiktoken::Encoding.new(Tiktoken::BpeFactory.send(encoding.to_sym), encoding.to_sym)
end

# This returns a Tiktoken::Encoding instance for the requested encoding
# It will reuse an existing encoding if it's already been loaded
# @param encoding [Symbol] The name of the encoding to load
# @return [Tiktoken::Encoding] The encoding instance
def self.for_name_cached(encoding)
@encodings ||= {}
@encodings[encoding.to_sym] ||= Tiktoken::Encoding.for_name(encoding)
end

# Encodes the text as a list of integer tokens. This encoding will encode special non text tokens
# basically it's unescaped
# @param text [String] The text to encode
# @return [Array<Integer>] The encoded tokens
def encode_ordinary(text)
@ext_base_bpe.encode_ordinary(text)
end

# Encodes the text as a list of integer tokens. This encoding will treat special non text tokens
# as text unless they're in the allowed_special array. It's basically like the text was escaped
# @param text [String] The text to encode
# @param allowed_special [Array<String>] An array of special tokens to allow
# @return [Array<Integer>] The encoded tokens
def encode(text, allowed_special: [])
@ext_base_bpe.encode(text, allowed_special)
end

# Decodes the tokens back into text
# @param tokens [Array<Integer>] The tokens to decode
# @return [String] The decoded text
def decode(tokens)
@ext_base_bpe.decode(tokens)
end

private
def initialize(ext_base_bpe, name)
@ext_base_bpe = ext_base_bpe
@name = name
end
attr_reader :name

# This returns a new Tiktoken::Encoding instance for the requested encoding
# @param encoding [Symbol] The name of the encoding to load
# @return [Tiktoken::Encoding] The encoding instance
def self.for_name(encoding)
Tiktoken::Encoding.new(Tiktoken::BpeFactory.send(encoding.to_sym), encoding.to_sym)
end

# This returns a Tiktoken::Encoding instance for the requested encoding
# It will reuse an existing encoding if it's already been loaded
# @param encoding [Symbol] The name of the encoding to load
# @return [Tiktoken::Encoding] The encoding instance
def self.for_name_cached(encoding)
@encodings ||= {}
@encodings[encoding.to_sym] ||= Tiktoken::Encoding.for_name(encoding)
end

# Encodes the text as a list of integer tokens. This encoding will encode special non text tokens
# basically it's unescaped
# @param text [String] The text to encode
# @return [Array<Integer>] The encoded tokens
def encode_ordinary(text)
@ext_base_bpe.encode_ordinary(text)
end

# Encodes the text as a list of integer tokens. This encoding will treat special non text tokens
# as text unless they're in the allowed_special array. It's basically like the text was escaped
# @param text [String] The text to encode
# @param allowed_special [Array<String>] An array of special tokens to allow
# @return [Array<Integer>] The encoded tokens
def encode(text, allowed_special: [])
@ext_base_bpe.encode(text, allowed_special)
end

# Decodes the tokens back into text
# @param tokens [Array<Integer>] The tokens to decode
# @return [String] The decoded text
def decode(tokens)
@ext_base_bpe.decode(tokens)
end

private

def initialize(ext_base_bpe, name)
@ext_base_bpe = ext_base_bpe
@name = name
end
end
2 changes: 1 addition & 1 deletion spec/spec_helper.rb
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# frozen_string_literal: true

require "tiktoken_ruby"
require 'pry'
require "pry"

RSpec.configure do |config|
# Enable flags like --only-failures and --next-failure
Expand Down
4 changes: 2 additions & 2 deletions spec/tiktoken_ruby_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,14 @@
end

it "can get an encoding for a model" do
expect(Tiktoken.encoding_for_model('gpt-3.5-turbo')).to be_a(Tiktoken::Encoding)
expect(Tiktoken.encoding_for_model("gpt-3.5-turbo")).to be_a(Tiktoken::Encoding)
end

it "lists available encodings" do
expect(Tiktoken.list_encoding_names).to be_a(Array)
end

for encoding_name in Tiktoken.list_encoding_names
Tiktoken.list_encoding_names.each do |encoding_name|
describe "Encoding #{encoding_name}" do
let(:encoding) { Tiktoken.get_encoding(encoding_name) }
describe Tiktoken::Encoding do
Expand Down
10 changes: 5 additions & 5 deletions tiktoken_ruby.gemspec
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@ Gem::Specification.new do |spec|
spec.email = ["[email protected]"]

spec.summary = "Ruby wrapper for Tiktoken"
spec.description = "An unofficial Ruby wrapper for Tiktoken, " +
"a BPE tokenizer written by and used by OpenAI. It can be used to " +
"count the number of tokens in text before sending it to OpenAI APIs."
spec.description = "An unofficial Ruby wrapper for Tiktoken, " \
"a BPE tokenizer written by and used by OpenAI. It can be used to " \
"count the number of tokens in text before sending it to OpenAI APIs."

spec.homepage = "https://github.com/IAPark/tiktoken_ruby"
spec.license = "MIT"
Expand All @@ -22,8 +22,8 @@ Gem::Specification.new do |spec|
spec.metadata["homepage_uri"] = spec.homepage
spec.metadata["source_code_uri"] = "https://github.com/IAPark/tiktoken_ruby"
spec.metadata["documentation_uri"] = "https://rubydoc.info/github/IAPark/tiktoken_ruby/main"
#spec.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."

# spec.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."

# Specify which files should be added to the gem when it is released.
# The `git ls-files -z` loads the files in the RubyGem that have been added into git.
Expand Down

0 comments on commit 9bc513c

Please sign in to comment.