Merge pull request #2 from petergoldstein/feature/add_github_actions_ci

Add CI with GitHub Actions
IAPark · Mar 28, 2023 · 9bc513c
2 parents 7bc51a0 + af2250b
commit 9bc513c
Show file tree

Hide file tree

Showing 12 changed files with 118 additions and 69 deletions.
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -0,0 +1,37 @@
+name: build
+on: [push, pull_request]
+jobs:
+  test:
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest]
+        ruby: [3.2, 3.1, "3.0"]
+    runs-on: ${{ matrix.os }}
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/cache@v3
+        with:
+          path: |
+            ~/.cargo/registry
+            ~/.cargo/git
+            tmp
+          key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
+      - uses: ruby/setup-ruby@v1
+        with:
+          ruby-version: ${{ matrix.ruby }}
+          bundler-cache: true
+      - run: bundle exec rake compile
+      - run: bundle exec rake spec
+  lint:
+    strategy:
+      matrix:
+        os: [ubuntu-latest]
+    runs-on: ${{ matrix.os }}
+    steps:
+      - uses: actions/checkout@v3
+      - uses: ruby/setup-ruby@v1
+        with:
+          ruby-version: 3.1
+          bundler-cache: true
+      - run: bundle exec rake standard
diff --git a/.github/workflows/cross_compile.yml b/.github/workflows/cross_compile.yml
@@ -27,7 +27,7 @@ jobs:
       matrix:
         platform: ${{ fromJSON(needs.ci-data.outputs.result).supported-ruby-platforms }}
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
 
       - uses: ruby/setup-ruby@v1
         with:
@@ -42,4 +42,4 @@ jobs:
       - uses: actions/upload-artifact@v2
         with:
           name: cross-gem
-          path: ${{ steps.cross-gem.outputs.gem-path }}
+          path: ${{ steps.cross-gem.outputs.gem-path }}
diff --git a/Gemfile b/Gemfile
@@ -13,6 +13,6 @@ gem "rb_sys"
 gem "rspec", "~> 3.0"
 
 gem "standard", "~> 1.3"
-gem 'pry', '~> 0.14.2'
+gem "pry", "~> 0.14.2"
 
 gem "yard-doctest", "~> 0.1.17"
diff --git a/Gemfile.lock b/Gemfile.lock
@@ -69,6 +69,8 @@ GEM
 
 PLATFORMS
   arm64-darwin-22
+  x86_64-darwin-22
+  x86_64-linux
 
 DEPENDENCIES
   pry (~> 0.14.2)

diff --git a/README.md b/README.md
@@ -43,6 +43,17 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
 
 Bug reports and pull requests are welcome on GitHub at https://github.com/iapark/tiktoken_ruby.
 
+To get started with development:
+
+```sh
+git clone https://github.com/IAPark/tiktoken_ruby.git
+cd tiktoken_ruby
+bundle install
+bundle exec rake compile
+bundle exec rake spec
+```
+
+
 ## License
 
 The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
diff --git a/Rakefile b/Rakefile
@@ -14,7 +14,6 @@ end
 
 RSpec::Core::RakeTask.new(:spec)
 
-
 task :native, [:platform] do |_t, platform:|
   sh "bundle", "exec", "rb-sys-dock", "--platform", platform, "--build"
 end

diff --git a/doctest_helper.rb b/doctest_helper.rb
@@ -1 +1 @@
-require 'lib/tiktoken_ruby.rb'
+require "lib/tiktoken_ruby"
diff --git a/lib/tiktoken_ruby.rb b/lib/tiktoken_ruby.rb
@@ -1,7 +1,7 @@
 # frozen_string_literal: true
 
 require_relative "tiktoken_ruby/version"
-require_relative "tiktoken_ruby/encoding.rb"
+require_relative "tiktoken_ruby/encoding"
 
 begin
   RUBY_VERSION =~ /(\d+\.\d+)/
@@ -12,7 +12,6 @@
 
 module Tiktoken
   class << self
-
     # Returns an encoding by name. If the encoding is not already loaded it will be loaded, but otherwise
     # it will reuse the instance of that type that was previously loaded
     # @param name [Symbol|String] The name of the encoding to load
@@ -34,7 +33,7 @@ def get_encoding(name)
     #   enc = Tiktoken.encoding_for_model("gpt-4")
     #   enc.encode("hello world").length #=> 2
     def encoding_for_model(model_name)
-      for prefix in PREFIX_MODELS
+      PREFIX_MODELS.each do |prefix|
         if model_name.to_s.start_with?("#{prefix}-")
           model_name = prefix
           break
@@ -65,7 +64,7 @@ def list_model_names
       :r50k_base,
       :p50k_base,
       :p50k_edit,
-      :cl100k_base,
+      :cl100k_base
     ]
 
     # taken from the Python library here: https://github.com/openai/tiktoken/blob/main/tiktoken/model.py#L13-L53
@@ -80,10 +79,10 @@ def list_model_names
       "text-curie-001": "r50k_base",
       "text-babbage-001": "r50k_base",
       "text-ada-001": "r50k_base",
-      "davinci": "r50k_base",
-      "curie": "r50k_base",
-      "babbage": "r50k_base",
-      "ada": "r50k_base",
+      davinci: "r50k_base",
+      curie: "r50k_base",
+      babbage: "r50k_base",
+      ada: "r50k_base",
       # code
       "code-davinci-002": "p50k_base",
       "code-davinci-001": "p50k_base",
@@ -106,7 +105,7 @@ def list_model_names
       "text-search-babbage-doc-001": "r50k_base",
       "text-search-ada-doc-001": "r50k_base",
       "code-search-babbage-code-001": "r50k_base",
-      "code-search-ada-code-001": "r50k_base",
+      "code-search-ada-code-001": "r50k_base"
     }
 
     # these are models that have versioned variants that are otherwise identical

diff --git a/lib/tiktoken_ruby/encoding.rb b/lib/tiktoken_ruby/encoding.rb
@@ -1,51 +1,52 @@
 # frozen_string_literal: true
 
 class Tiktoken::Encoding
-    attr_reader :name
-
-    # This returns a new Tiktoken::Encoding instance for the requested encoding
-    # @param encoding [Symbol] The name of the encoding to load
-    # @return [Tiktoken::Encoding] The encoding instance
-    def self.for_name(encoding)
-        Tiktoken::Encoding.new(Tiktoken::BpeFactory.send(encoding.to_sym), encoding.to_sym)
-    end
-
-    # This returns a Tiktoken::Encoding instance for the requested encoding
-    # It will reuse an existing encoding if it's already been loaded
-    # @param encoding [Symbol] The name of the encoding to load
-    # @return [Tiktoken::Encoding] The encoding instance
-    def self.for_name_cached(encoding)
-        @encodings ||= {}
-        @encodings[encoding.to_sym] ||= Tiktoken::Encoding.for_name(encoding)
-    end
-
-    # Encodes the text as a list of integer tokens. This encoding will encode special non text tokens
-    # basically it's unescaped
-    # @param text [String] The text to encode
-    # @return [Array<Integer>] The encoded tokens
-    def encode_ordinary(text)
-        @ext_base_bpe.encode_ordinary(text)
-    end
-
-    # Encodes the text as a list of integer tokens. This encoding will treat special non text tokens
-    # as text unless they're in the allowed_special array. It's basically like the text was escaped
-    # @param text [String] The text to encode
-    # @param allowed_special [Array<String>] An array of special tokens to allow
-    # @return [Array<Integer>] The encoded tokens
-    def encode(text, allowed_special: [])
-        @ext_base_bpe.encode(text, allowed_special)
-    end
-
-    # Decodes the tokens back into text
-    # @param tokens [Array<Integer>] The tokens to decode
-    # @return [String] The decoded text
-    def decode(tokens)
-        @ext_base_bpe.decode(tokens)
-    end
-
-    private
-    def initialize(ext_base_bpe, name)
-        @ext_base_bpe = ext_base_bpe
-        @name = name
-    end
+  attr_reader :name
+
+  # This returns a new Tiktoken::Encoding instance for the requested encoding
+  # @param encoding [Symbol] The name of the encoding to load
+  # @return [Tiktoken::Encoding] The encoding instance
+  def self.for_name(encoding)
+    Tiktoken::Encoding.new(Tiktoken::BpeFactory.send(encoding.to_sym), encoding.to_sym)
+  end
+
+  # This returns a Tiktoken::Encoding instance for the requested encoding
+  # It will reuse an existing encoding if it's already been loaded
+  # @param encoding [Symbol] The name of the encoding to load
+  # @return [Tiktoken::Encoding] The encoding instance
+  def self.for_name_cached(encoding)
+    @encodings ||= {}
+    @encodings[encoding.to_sym] ||= Tiktoken::Encoding.for_name(encoding)
+  end
+
+  # Encodes the text as a list of integer tokens. This encoding will encode special non-text tokens;
+  # basically, it's unescaped
+  # @param text [String] The text to encode
+  # @return [Array<Integer>] The encoded tokens
+  def encode_ordinary(text)
+    @ext_base_bpe.encode_ordinary(text)
+  end
+
+  # Encodes the text as a list of integer tokens. This encoding will treat special non-text tokens
+  # as text unless they're in the allowed_special array. It's basically as if the text were escaped
+  # @param text [String] The text to encode
+  # @param allowed_special [Array<String>] An array of special tokens to allow
+  # @return [Array<Integer>] The encoded tokens
+  def encode(text, allowed_special: [])
+    @ext_base_bpe.encode(text, allowed_special)
+  end
+
+  # Decodes the tokens back into text
+  # @param tokens [Array<Integer>] The tokens to decode
+  # @return [String] The decoded text
+  def decode(tokens)
+    @ext_base_bpe.decode(tokens)
+  end
+
+  private
+
+  def initialize(ext_base_bpe, name)
+    @ext_base_bpe = ext_base_bpe
+    @name = name
+  end
 end
diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb
@@ -1,7 +1,7 @@
 # frozen_string_literal: true
 
 require "tiktoken_ruby"
-require 'pry'
+require "pry"
 
 RSpec.configure do |config|
   # Enable flags like --only-failures and --next-failure

diff --git a/spec/tiktoken_ruby_spec.rb b/spec/tiktoken_ruby_spec.rb
@@ -10,14 +10,14 @@
   end
 
   it "can get an encoding for a model" do
-    expect(Tiktoken.encoding_for_model('gpt-3.5-turbo')).to be_a(Tiktoken::Encoding)
+    expect(Tiktoken.encoding_for_model("gpt-3.5-turbo")).to be_a(Tiktoken::Encoding)
   end
 
   it "lists available encodings" do
     expect(Tiktoken.list_encoding_names).to be_a(Array)
   end
 
-  for encoding_name in Tiktoken.list_encoding_names
+  Tiktoken.list_encoding_names.each do |encoding_name|
     describe "Encoding #{encoding_name}" do
       let(:encoding) { Tiktoken.get_encoding(encoding_name) }
       describe Tiktoken::Encoding do

diff --git a/tiktoken_ruby.gemspec b/tiktoken_ruby.gemspec
@@ -9,9 +9,9 @@ Gem::Specification.new do |spec|
   spec.email = ["[email protected]"]
 
   spec.summary = "Ruby wrapper for Tiktoken"
-  spec.description = "An unofficial Ruby wrapper for Tiktoken, " +
-  "a BPE tokenizer written by and used by OpenAI. It can be used to " + 
-  "count the number of tokens in text before sending it to OpenAI APIs."
+  spec.description = "An unofficial Ruby wrapper for Tiktoken, " \
+    "a BPE tokenizer written by and used by OpenAI. It can be used to " \
+    "count the number of tokens in text before sending it to OpenAI APIs."
 
   spec.homepage = "https://github.com/IAPark/tiktoken_ruby"
   spec.license = "MIT"
@@ -22,8 +22,8 @@ Gem::Specification.new do |spec|
   spec.metadata["homepage_uri"] = spec.homepage
   spec.metadata["source_code_uri"] = "https://github.com/IAPark/tiktoken_ruby"
   spec.metadata["documentation_uri"] = "https://rubydoc.info/github/IAPark/tiktoken_ruby/main"
-  
-  #spec.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."
+
+  # spec.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."
 
   # Specify which files should be added to the gem when it is released.
   # The `git ls-files -z` loads the files in the RubyGem that have been added into git.