diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index aa71e74..6dd7578 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -2,12 +2,29 @@ name: build on: [push, pull_request] jobs: build: - if: "!contains(github.event.head_commit.message, '[skip ci]')" + strategy: + fail-fast: false + matrix: + include: + - ruby: 3.4 + gemfile: Gemfile + - ruby: 3.3 + gemfile: gemfiles/activerecord72.gemfile + - ruby: 3.2 + gemfile: gemfiles/activerecord71.gemfile + - ruby: 3.1 + gemfile: gemfiles/activerecord70.gemfile runs-on: ubuntu-latest + env: + BUNDLE_GEMFILE: ${{ matrix.gemfile }} steps: - - uses: actions/checkout@v2 - - uses: ruby/setup-ruby@v1 - with: - ruby-version: 2.7 - bundler-cache: true - - run: bundle exec rake test + - uses: actions/checkout@v4 + - uses: ruby/setup-ruby@v1 + with: + ruby-version: ${{ matrix.ruby }} + bundler-cache: true + - uses: actions/cache@v4 + with: + path: ~/.cache/disco + key: disco-v2 + - run: bundle exec rake test diff --git a/CHANGELOG.md b/CHANGELOG.md index 614fe42..adb24a2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,76 @@ +## 0.5.1 (2024-12-29) + +- Removed dependency on `base64` gem for serialization + +## 0.5.0 (2024-10-22) + +- Changed dataset directory to match XDG Base Directory Specification +- Dropped support for marshal serialization +- Dropped support for Ruby < 3.1 and Rails < 7 + +## 0.4.2 (2024-06-24) + +- Removed dependency on `csv` gem for `load_movielens` + +## 0.4.1 (2024-05-23) + +- Reduced memory for `item_recs` and `similar_users` + +## 0.4.0 (2023-01-30) + +- Fixed issue with `has_recommended` and inheritance with Rails < 6.1 +- Deprecated marshal serialization +- Dropped support for Ruby < 2.7 and Rails < 6 + +## 0.3.2 (2022-09-26) + +- Fixed issue when `fit` is called multiple times + +## 0.3.1 (2022-07-10) + +- Added support for JSON serialization + +## 0.3.0 (2022-03-22) + +- Changed `item_id` to `user_id` for `similar_users` +- Changed warning to an error when `value` passed to `fit` +- Changed to use Faiss over NGT for `optimize_item_recs` and `optimize_similar_users` when both are installed +- Removed dependency on `wilson_score` gem for `top_items` +- Dropped support for Ruby < 2.6 + +## 0.2.9 (2022-03-22) + +- Fixed error with `load_movielens` + +## 0.2.8 (2022-03-13) + +- Fixed error with `top_items` with all same rating + +## 0.2.7 (2021-08-06) + +- Added warning for `value` + +## 0.2.6 (2021-02-24) + +- Improved performance +- Improved `inspect` method +- Fixed issue with `similar_users` and `item_recs` returning the original user/item +- Fixed error with `fit` after loading + +## 0.2.5 (2021-02-20) + +- Added `top_items` method +- Added `optimize_similar_users` method +- Added support for Faiss for `optimize_item_recs` and `optimize_similar_users` methods +- Added `rmse` method +- Improved performance + +## 0.2.4 (2021-02-15) + +- Added `user_ids` and `item_ids` methods +- Added `user_id` argument to `user_factors` +- Added `item_id` argument to `item_factors` + ## 0.2.3 (2020-11-28) - Added `predict` method diff --git a/Gemfile b/Gemfile index 9d4d5b4..cc16f1d 100644 --- a/Gemfile +++ b/Gemfile @@ -4,8 +4,10 @@ gemspec gem "rake" gem "minitest", ">= 5" -gem "activerecord" +gem "activerecord", "~> 8.0.0" gem "sqlite3" gem "daru" +gem "matrix" # for daru gem "rover-df" gem "ngt", ">= 0.3.0" +gem "faiss" diff --git a/LICENSE.txt b/LICENSE.txt index e2e1d2a..55abd58 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,4 +1,4 @@ -Copyright (c) 2019-2020 Andrew Kane 
+Copyright (c) 2019-2024 Andrew Kane MIT License diff --git a/README.md b/README.md index e9309ed..4c46d7d 100644 --- a/README.md +++ b/README.md @@ -6,14 +6,14 @@ - Works with explicit and implicit feedback - Uses high-performance matrix factorization -[![Build Status](https://github.com/ankane/disco/workflows/build/badge.svg?branch=master)](https://github.com/ankane/disco/actions) +[![Build Status](https://github.com/ankane/disco/actions/workflows/build.yml/badge.svg)](https://github.com/ankane/disco/actions) ## Installation Add this line to your application’s Gemfile: ```ruby -gem 'disco' +gem "disco" ``` ## Getting Started @@ -35,24 +35,24 @@ recommender.fit([ > IDs can be integers, strings, or any other data type -If users don’t rate items directly (for instance, they’re purchasing items or reading posts), this is known as implicit feedback. Leave out the rating, or use a value like number of purchases, number of page views, or time spent on page: +If users don’t rate items directly (for instance, they’re purchasing items or reading posts), this is known as implicit feedback. Leave out the rating. ```ruby recommender.fit([ - {user_id: 1, item_id: 1, value: 1}, - {user_id: 2, item_id: 1, value: 1} + {user_id: 1, item_id: 1}, + {user_id: 2, item_id: 1} ]) ``` -> Use `value` instead of rating for implicit feedback +> Each `user_id`/`item_id` combination should only appear once -Get user-based (user-item) recommendations - “users like you also liked” +Get user-based recommendations - “users like you also liked” ```ruby recommender.user_recs(user_id) ``` -Get item-based (item-item) recommendations - “users who liked this item also liked” +Get item-based recommendations - “users who liked this item also liked” ```ruby recommender.item_recs(item_id) @@ -99,18 +99,13 @@ recommender.item_recs("Star Wars (1977)") [Ahoy](https://github.com/ankane/ahoy) is a great source for implicit feedback ```ruby -views = Ahoy::Event. - where(name: "Viewed post"). - group(:user_id). - group("properties->>'post_id'"). # postgres syntax - count +views = Ahoy::Event.where(name: "Viewed post").group(:user_id).group_prop(:post_id).count data = - views.map do |(user_id, post_id), count| + views.map do |(user_id, post_id), _| { user_id: user_id, - item_id: post_id, - value: count + item_id: post_id } end ``` @@ -181,26 +176,26 @@ user.update_recommended_products_v2(recs) user.recommended_products_v2 ``` -For Rails < 6, speed up inserts by adding [activerecord-import](https://github.com/zdennis/activerecord-import) to your app. - ## Storing Recommenders If you’d prefer to perform recommendations on-the-fly, store the recommender ```ruby -bin = Marshal.dump(recommender) -File.binwrite("recommender.bin", bin) +json = recommender.to_json +File.write("recommender.json", json) ``` -> You can save it to a file, database, or any other storage system +The serialized recommender includes user activity from the training data (to avoid recommending previously rated items), so be sure to protect it. You can save it to a file, database, or any other storage system, or use a tool like [Trove](https://github.com/ankane/trove). Also, user and item IDs should be integers or strings for this. Load a recommender ```ruby -bin = File.binread("recommender.bin") -recommender = Marshal.load(bin) +json = File.read("recommender.json") +recommender = Disco::Recommender.load_json(json) ``` +Alternatively, you can store only the factors and use a library like [Neighbor](https://github.com/ankane/neighbor). 
See the [examples](https://github.com/ankane/neighbor/tree/master/examples/disco). + ## Algorithms Disco uses high-performance matrix factorization. @@ -226,7 +221,7 @@ recommender.fit(data, validation_set: validation_set) ## Cold Start -Collaborative filtering suffers from the [cold start problem](https://www.yuspify.com/blog/cold-start-problem-recommender-systems/). It’s unable to make good recommendations without data on a user or item, which is problematic for new users and items. +Collaborative filtering suffers from the [cold start problem](https://en.wikipedia.org/wiki/Cold_start_(recommender_systems)). It’s unable to make good recommendations without data on a user or item, which is problematic for new users and items. ```ruby recommender.user_recs(new_user_id) # returns empty array @@ -234,8 +229,18 @@ recommender.user_recs(new_user_id) # returns empty array There are a number of ways to deal with this, but here are some common ones: -- For user-based recommendations, show new users the most popular items. -- For item-based recommendations, make content-based recommendations with a gem like [tf-idf-similarity](https://github.com/jpmckinney/tf-idf-similarity). +- For user-based recommendations, show new users the most popular items +- For item-based recommendations, make content-based recommendations with a gem like [tf-idf-similarity](https://github.com/jpmckinney/tf-idf-similarity) + +Get top items with: + +```ruby +recommender = Disco::Recommender.new(top_items: true) +recommender.fit(data) +recommender.top_items +``` + +This uses [Wilson score](https://www.evanmiller.org/how-not-to-sort-by-average-rating.html) for explicit feedback and item frequency for implicit feedback. ## Data @@ -257,45 +262,65 @@ Or a Daru data frame Daru::DataFrame.from_csv("ratings.csv") ``` -## Faster Similarity +## Performance -If you have a large number of users/items, you can use an approximate nearest neighbors library like [NGT](https://github.com/ankane/ngt) to speed up item-based recommendations and similar users. +If you have a large number of users or items, you can use an approximate nearest neighbors library like [Faiss](https://github.com/ankane/faiss) to improve the performance of certain methods. Add this line to your application’s Gemfile: ```ruby -gem 'ngt', '>= 0.3.0' +gem "faiss" +``` + +Speed up the `user_recs` method with: + +```ruby +recommender.optimize_user_recs ``` -Speed up item-based recommendations with: +Speed up the `item_recs` method with: ```ruby -model.optimize_item_recs +recommender.optimize_item_recs ``` -Speed up similar users with: +Speed up the `similar_users` method with: ```ruby -model.optimize_similar_users +recommender.optimize_similar_users ``` -This should be called after fitting or loading the model. +This should be called after fitting or loading the recommender. 
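For reference, a minimal end-to-end sketch of this flow, using the bundled MovieLens helper from the Examples section (the `factors` value is illustrative):

```ruby
require "disco"

# train on the bundled MovieLens 100K dataset
data = Disco.load_movielens
recommender = Disco::Recommender.new(factors: 20)
recommender.fit(data)

# build the index once, after fitting (or loading)
recommender.optimize_item_recs

# subsequent lookups go through the index
recommender.item_recs("Star Wars (1977)", count: 5)
```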
## Reference +Get ids + +```ruby +recommender.user_ids +recommender.item_ids +``` + Get the global mean ```ruby recommender.global_mean ``` -Get the factors +Get factors ```ruby recommender.user_factors recommender.item_factors ``` +Get factors for specific users and items + +```ruby +recommender.user_factors(user_id) +recommender.item_factors(item_id) +``` + ## Credits Thanks to: diff --git a/Rakefile b/Rakefile index 35dafa0..34beef8 100644 --- a/Rakefile +++ b/Rakefile @@ -5,5 +5,57 @@ task default: :test Rake::TestTask.new do |t| t.libs << "test" t.pattern = "test/**/*_test.rb" - t.warning = false + t.warning = false # for daru +end + +# TODO use benchmark-ips +def benchmark_user_recs(name, recommender) + ms = Benchmark.realtime do + recommender.user_ids.each do |user_id| + recommender.user_recs(user_id) + end + end + puts "%-8s %f" % [name, ms] +end + +# TODO use benchmark-ips +def benchmark_item_recs(name, recommender) + ms = Benchmark.realtime do + recommender.item_ids.each do |item_id| + recommender.item_recs(item_id) + end + end + puts "%-8s %f" % [name, ms] +end + +namespace :benchmark do + task :user_recs do + require "bundler/setup" + Bundler.require + require "benchmark" + + data = Disco.load_movielens + recommender = Disco::Recommender.new + recommender.fit(data) + + benchmark_user_recs("none", recommender) + recommender.optimize_user_recs + benchmark_user_recs("faiss", recommender) + end + + task :item_recs do + require "bundler/setup" + Bundler.require + require "benchmark" + + data = Disco.load_movielens + recommender = Disco::Recommender.new + recommender.fit(data) + + benchmark_item_recs("none", recommender) + recommender.optimize_item_recs(library: "ngt") + benchmark_item_recs("ngt", recommender) + recommender.optimize_item_recs(library: "faiss") + benchmark_item_recs("faiss", recommender) + end end diff --git a/disco.gemspec b/disco.gemspec index d851258..110565b 100644 --- a/disco.gemspec +++ b/disco.gemspec @@ -8,13 +8,13 @@ Gem::Specification.new do |spec| spec.license = "MIT" spec.author = "Andrew Kane" - spec.email = "andrew@chartkick.com" + spec.email = "andrew@ankane.org" spec.files = Dir["*.{md,txt}", "{app,lib}/**/*"] spec.require_path = "lib" - spec.required_ruby_version = ">= 2.4" + spec.required_ruby_version = ">= 3.1" - spec.add_dependency "libmf", ">= 0.2.0" - spec.add_dependency "numo-narray" + spec.add_dependency "libmf", ">= 0.4" + spec.add_dependency "numo-narray", ">= 0.9.2" end diff --git a/gemfiles/activerecord70.gemfile b/gemfiles/activerecord70.gemfile new file mode 100644 index 0000000..1ff6fe4 --- /dev/null +++ b/gemfiles/activerecord70.gemfile @@ -0,0 +1,13 @@ +source "https://rubygems.org" + +gemspec path: ".." + +gem "rake" +gem "minitest", ">= 5" +gem "activerecord", "~> 7.0.0" +gem "sqlite3", "< 2" +gem "daru" +gem "matrix" # for daru +gem "rover-df" +gem "ngt", ">= 0.3.0" +gem "faiss" diff --git a/gemfiles/activerecord71.gemfile b/gemfiles/activerecord71.gemfile new file mode 100644 index 0000000..adb0692 --- /dev/null +++ b/gemfiles/activerecord71.gemfile @@ -0,0 +1,13 @@ +source "https://rubygems.org" + +gemspec path: ".." 
+ +gem "rake" +gem "minitest", ">= 5" +gem "activerecord", "~> 7.1.0" +gem "sqlite3", "< 2" +gem "daru" +gem "matrix" # for daru +gem "rover-df" +gem "ngt", ">= 0.3.0" +gem "faiss" diff --git a/gemfiles/activerecord72.gemfile b/gemfiles/activerecord72.gemfile new file mode 100644 index 0000000..bad3348 --- /dev/null +++ b/gemfiles/activerecord72.gemfile @@ -0,0 +1,13 @@ +source "https://rubygems.org" + +gemspec path: ".." + +gem "rake" +gem "minitest", ">= 5" +gem "activerecord", "~> 7.2.0" +gem "sqlite3" +gem "daru" +gem "matrix" # for daru +gem "rover-df" +gem "ngt", ">= 0.3.0" +gem "faiss" diff --git a/lib/disco.rb b/lib/disco.rb index 5e36e1c..f18d7ee 100644 --- a/lib/disco.rb +++ b/lib/disco.rb @@ -2,18 +2,14 @@ require "libmf" require "numo/narray" -# stdlib -require "csv" -require "fileutils" -require "net/http" - # modules -require "disco/data" -require "disco/recommender" -require "disco/version" +require_relative "disco/data" +require_relative "disco/metrics" +require_relative "disco/recommender" +require_relative "disco/version" # integrations -require "disco/engine" if defined?(Rails) +require_relative "disco/engine" if defined?(Rails) module Disco class Error < StandardError; end @@ -23,7 +19,7 @@ class Error < StandardError; end if defined?(ActiveSupport.on_load) ActiveSupport.on_load(:active_record) do - require "disco/model" + require_relative "disco/model" extend Disco::Model end end diff --git a/lib/disco/data.rb b/lib/disco/data.rb index 451972c..e5ea269 100644 --- a/lib/disco/data.rb +++ b/lib/disco/data.rb @@ -1,21 +1,20 @@ module Disco module Data def load_movielens - item_path = download_file("ml-100k/u.item", "http://files.grouplens.org/datasets/movielens/ml-100k/u.item", + item_path = download_file("ml-100k/u.item", "https://files.grouplens.org/datasets/movielens/ml-100k/u.item", file_hash: "553841ebc7de3a0fd0d6b62a204ea30c1e651aacfb2814c7a6584ac52f2c5701") - data_path = download_file("ml-100k/u.data", "http://files.grouplens.org/datasets/movielens/ml-100k/u.data", + data_path = download_file("ml-100k/u.data", "https://files.grouplens.org/datasets/movielens/ml-100k/u.data", file_hash: "06416e597f82b7342361e41163890c81036900f418ad91315590814211dca490") - # convert u.item to utf-8 - movies_str = File.read(item_path).encode("UTF-8", "binary", invalid: :replace, undef: :replace, replace: "") - movies = {} - CSV.parse(movies_str, col_sep: "|") do |row| + File.foreach(item_path) do |line| + row = line.encode("UTF-8", "ISO-8859-1").split("|") movies[row[0]] = row[1] end data = [] - CSV.foreach(data_path, col_sep: "\t") do |row| + File.foreach(data_path) do |line| + row = line.split("\t") data << { user_id: row[0].to_i, item_id: movies[row[1]], @@ -29,9 +28,13 @@ def load_movielens private def download_file(fname, origin, file_hash:) - # TODO handle this better - raise "No HOME" unless ENV["HOME"] - dest = "#{ENV["HOME"]}/.disco/#{fname}" + require "digest" + require "fileutils" + require "net/http" + require "tmpdir" + + cache_home = ENV["XDG_CACHE_HOME"] || "#{ENV.fetch("HOME")}/.cache" + dest = "#{cache_home}/disco/#{fname}" FileUtils.mkdir_p(File.dirname(dest)) return dest if File.exist?(dest) diff --git a/lib/disco/metrics.rb b/lib/disco/metrics.rb new file mode 100644 index 0000000..dfcf0b5 --- /dev/null +++ b/lib/disco/metrics.rb @@ -0,0 +1,10 @@ +module Disco + module Metrics + class << self + def rmse(act, exp) + raise ArgumentError, "Size mismatch" if act.size != exp.size + Math.sqrt(act.zip(exp).sum { |a, e| (a - e)**2 } / act.size.to_f) + end + end + end +end 
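The new `Disco::Metrics.rmse` helper pairs naturally with `predict`; a short usage sketch against a held-out set (the `recommender` and `validation_set` variables here are illustrative):

```ruby
# compare predicted ratings with actual ratings on a validation set
actual = validation_set.map { |v| v[:rating] }
predicted = recommender.predict(validation_set)
Disco::Metrics.rmse(actual, predicted)
```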
diff --git a/lib/disco/model.rb b/lib/disco/model.rb index 2a29d35..1d79886 100644 --- a/lib/disco/model.rb +++ b/lib/disco/model.rb @@ -1,7 +1,12 @@ module Disco module Model def has_recommended(name, class_name: nil) + if ActiveRecord::VERSION::MAJOR < 7 + raise Disco::Error, "Requires Active Record 7+" + end + class_name ||= name.to_s.singularize.camelize + subject_type = model_name.name class_eval do unless reflect_on_association(:recommendations) @@ -12,21 +17,13 @@ def has_recommended(name, class_name: nil) define_method("update_recommended_#{name}") do |items| now = Time.now - items = items.map { |item| {subject_type: model_name.name, subject_id: id, item_type: class_name, item_id: item[:item_id], context: name, score: item[:score], created_at: now, updated_at: now} } + items = items.map { |item| {subject_type: subject_type, subject_id: id, item_type: class_name, item_id: item.fetch(:item_id), context: name, score: item.fetch(:score), created_at: now, updated_at: now} } self.class.transaction do recommendations.where(context: name).delete_all if items.any? - if recommendations.respond_to?(:insert_all!) - # Rails 6 - recommendations.insert_all!(items) - elsif recommendations.respond_to?(:bulk_import!) - # activerecord-import - recommendations.bulk_import!(items, validate: false) - else - recommendations.create!([items]) - end + recommendations.insert_all!(items) end end end diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index cdea4e0..3c3a2ff 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -1,46 +1,78 @@ module Disco class Recommender - attr_reader :global_mean, :item_factors, :user_factors + attr_reader :global_mean - def initialize(factors: 8, epochs: 20, verbose: nil) + def initialize(factors: 8, epochs: 20, verbose: nil, top_items: false) @factors = factors @epochs = epochs @verbose = verbose + @user_map = {} + @item_map = {} + @top_items = top_items end def fit(train_set, validation_set: nil) train_set = to_dataset(train_set) validation_set = to_dataset(validation_set) if validation_set + check_training_set(train_set) + + # TODO option to set in initializer to avoid pass + # could also just check first few values + # but may be confusing if they are all missing and later ones aren't @implicit = !train_set.any? { |v| v[:rating] } + if @implicit && train_set.any? { |v| v[:value] } + raise ArgumentError, "Passing `:value` with implicit feedback has no effect on recommendations and should be removed. Earlier versions of the library incorrectly stated this was used." + end + + # TODO improve performance + # (catch exception instead of checking ahead of time) unless @implicit - ratings = train_set.map { |o| o[:rating] } - check_ratings(ratings) - @min_rating = ratings.min - @max_rating = ratings.max + check_ratings(train_set) if validation_set - check_ratings(validation_set.map { |o| o[:rating] }) + check_ratings(validation_set) end end - check_training_set(train_set) - create_maps(train_set) - + @user_map = {} + @item_map = {} @rated = Hash.new { |hash, key| hash[key] = {} } input = [] - value_key = @implicit ? :value : :rating train_set.each do |v| - u = @user_map[v[:user_id]] - i = @item_map[v[:item_id]] + # update maps and build matrix in single pass + u = (@user_map[v[:user_id]] ||= @user_map.size) + i = (@item_map[v[:item_id]] ||= @item_map.size) @rated[u][i] = true # explicit will always have a value due to check_ratings - input << [u, i, v[value_key] || 1] + input << [u, i, @implicit ? 
1 : v[:rating]] end @rated.default = nil + # much more efficient than checking every value in another pass + raise ArgumentError, "Missing user_id" if @user_map.key?(nil) + raise ArgumentError, "Missing item_id" if @item_map.key?(nil) + + # TODO improve performance + unless @implicit + @min_rating, @max_rating = train_set.minmax_by { |o| o[:rating] }.map { |o| o[:rating] } + else + @min_rating = nil + @max_rating = nil + end + + if @top_items + @item_count = Array.new(@item_map.size, 0) + @item_sum = Array.new(@item_map.size, 0.0) + train_set.each do |v| + i = @item_map[v[:item_id]] + @item_count[i] += 1 + @item_sum[i] += (@implicit ? 1 : v[:rating]) + end + end + eval_set = nil if validation_set eval_set = [] @@ -52,7 +84,7 @@ def fit(train_set, validation_set: nil) u ||= -1 i ||= -1 - eval_set << [u, i, v[value_key] || 1] + eval_set << [u, i, @implicit ? 1 : v[:rating]] end end @@ -67,8 +99,12 @@ def fit(train_set, validation_set: nil) @user_factors = model.p_factors(format: :numo) @item_factors = model.q_factors(format: :numo) - @user_index = nil - @item_index = nil + @user_norms = nil + @item_norms = nil + + @user_recs_index = nil + @similar_users_index = nil + @similar_items_index = nil end # generates a prediction even if a user has already rated the item @@ -95,71 +131,213 @@ def user_recs(user_id, count: 5, item_ids: nil) u = @user_map[user_id] if u - predictions = @item_factors.inner(@user_factors[u, true]) - - predictions = - @item_map.keys.zip(predictions).map do |item_id, pred| - {item_id: item_id, score: pred} - end + rated = item_ids ? {} : @rated[u] if item_ids - idx = item_ids.map { |i| @item_map[i] }.compact - predictions = predictions.values_at(*idx) + ids = Numo::NArray.cast(item_ids.map { |i| @item_map[i] }.compact) + return [] if ids.size == 0 + + predictions = @item_factors[ids, true].inner(@user_factors[u, true]) + indexes = predictions.sort_index.reverse + indexes = indexes[0...[count + rated.size, indexes.size].min] if count + predictions = predictions[indexes] + ids = ids[indexes] + elsif @user_recs_index && count + predictions, ids = @user_recs_index.search(@user_factors[u, true].expand_dims(0), count + rated.size).map { |v| v[0, true] } else - @rated[u].keys.sort_by { |v| -v }.each do |i| - predictions.delete_at(i) - end + predictions = @item_factors.inner(@user_factors[u, true]) + indexes = predictions.sort_index.reverse # reverse just creates view + indexes = indexes[0...[count + rated.size, indexes.size].min] if count + predictions = predictions[indexes] + ids = indexes end - predictions.sort_by! 
{ |pred| -pred[:score] } # already sorted by id - predictions = predictions.first(count) if count && !item_ids + predictions.inplace.clip(@min_rating, @max_rating) if @min_rating - # clamp *after* sorting - # also, only needed for returned predictions - if @min_rating - predictions.each do |pred| - pred[:score] = pred[:score].clamp(@min_rating, @max_rating) - end - end + keys = @item_map.keys + result = [] + ids.each_with_index do |item_id, i| + next if rated[item_id] - predictions + result << {item_id: keys[item_id], score: predictions[i]} + break if result.size == count + end + result + elsif @top_items + top_items(count: count) else - # no items if user is unknown - # TODO maybe most popular items [] end end - def optimize_similar_items + def similar_items(item_id, count: 5) check_fit - @item_index = create_index(@item_factors) + similar(item_id, :item_id, @item_map, @item_factors, item_norms, count, @similar_items_index) end - alias_method :optimize_item_recs, :optimize_similar_items + alias_method :item_recs, :similar_items - def optimize_similar_users + def similar_users(user_id, count: 5) check_fit - @user_index = create_index(@user_factors) + similar(user_id, :user_id, @user_map, @user_factors, user_norms, count, @similar_users_index) end - def similar_items(item_id, count: 5) + def top_items(count: 5) check_fit - similar(item_id, @item_map, @item_factors, item_norms, count, @item_index) + raise "top_items not computed" unless @top_items + + if @implicit + scores = Numo::UInt64.cast(@item_count) + else + min_rating = @min_rating + + # TODO remove temp fix + min_rating -= 1 if @min_rating == @max_rating + + # wilson score with continuity correction + # https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval#Wilson_score_interval_with_continuity_correction + z = 1.96 # 95% confidence + range = @max_rating - @min_rating + n = Numo::DFloat.cast(@item_count) + phat = (Numo::DFloat.cast(@item_sum) - (min_rating * n)) / range / n + phat = (phat - (1 / (2 * n))).clip(0, nil) # continuity correction + scores = (phat + z**2 / (2 * n) - z * Numo::DFloat::Math.sqrt((phat * (1 - phat) + z**2 / (4 * n)) / n)) / (1 + z**2 / n) + scores = scores * range + min_rating + end + + indexes = scores.sort_index.reverse + indexes = indexes[0...[count, indexes.size].min] if count + scores = scores[indexes] + + keys = @item_map.keys + indexes.size.times.map do |i| + {item_id: keys[indexes[i]], score: scores[i]} + end end - alias_method :item_recs, :similar_items - def similar_users(user_id, count: 5) + def user_ids + @user_map.keys + end + + def item_ids + @item_map.keys + end + + def user_factors(user_id = nil) + if user_id + u = @user_map[user_id] + @user_factors[u, true] if u + else + @user_factors + end + end + + def item_factors(item_id = nil) + if item_id + i = @item_map[item_id] + @item_factors[i, true] if i + else + @item_factors + end + end + + def optimize_user_recs + check_fit + @user_recs_index = create_index(item_factors, library: "faiss") + end + + def optimize_similar_items(library: nil) + check_fit + @similar_items_index = create_index(@item_factors / item_norms.expand_dims(1), library: library) + end + alias_method :optimize_item_recs, :optimize_similar_items + + def optimize_similar_users(library: nil) check_fit - similar(user_id, @user_map, @user_factors, user_norms, count, @user_index) + @similar_users_index = create_index(@user_factors / user_norms.expand_dims(1), library: library) + end + + def inspect + to_s # for now + end + + def to_json + require "json" + + obj = { + 
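+        # user_ids, item_ids, and rated are written in @user_map / @item_map insertion order
+        # factor matrices are stored as Base64-encoded ("m0") binary to keep the JSON compact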
implicit: @implicit, + user_ids: @user_map.keys, + item_ids: @item_map.keys, + rated: @user_map.map { |_, u| (@rated[u] || {}).keys }, + global_mean: @global_mean, + user_factors: [@user_factors.to_binary].pack("m0"), + item_factors: [@item_factors.to_binary].pack("m0"), + factors: @factors, + epochs: @epochs, + verbose: @verbose + } + + unless @implicit + obj[:min_rating] = @min_rating + obj[:max_rating] = @max_rating + end + + if @top_items + obj[:item_count] = @item_count + obj[:item_sum] = @item_sum + end + + JSON.generate(obj) + end + + def self.load_json(json) + require "json" + + obj = JSON.parse(json) + + recommender = new + recommender.send(:json_load, obj) + recommender end private - def create_index(factors) - require "ngt" + # factors should already be normalized for similar users/items + def create_index(factors, library:) + library ||= defined?(Ngt) && !defined?(Faiss) ? "ngt" : "faiss" + + case library + when "faiss" + require "faiss" + + # inner product is cosine similarity with normalized vectors + # https://github.com/facebookresearch/faiss/issues/95 + # + # TODO add option for index type + # https://github.com/facebookresearch/faiss/wiki/Faiss-indexes + # index = Faiss::IndexHNSWFlat.new(factors.shape[1], 32, :inner_product) + index = Faiss::IndexFlatIP.new(factors.shape[1]) - index = Ngt::Index.new(factors.shape[1], distance_type: "Cosine") - index.batch_insert(factors) - index + # ids are from 0...total + # https://github.com/facebookresearch/faiss/blob/96b740abedffc8f67389f29c2a180913941534c6/faiss/Index.h#L89 + index.add(factors) + + index + when "ngt" + require "ngt" + + # could speed up search with normalized cosine + # https://github.com/yahoojapan/NGT/issues/36 + index = Ngt::Index.new(factors.shape[1], distance_type: "Cosine") + + # NGT normalizes so could call create_index without normalized factors + # but keep code simple for now + ids = index.batch_insert(factors) + raise "Unexpected ids. Please report a bug." if ids.first != 1 || ids.last != factors.shape[0] + + index + else + raise ArgumentError, "Invalid library: #{library}" + end end def user_norms @@ -176,58 +354,52 @@ def norms(factors) norms end - def similar(id, map, factors, norms, count, index) + def similar(id, key, map, factors, norms, count, index) i = map[id] - if i + + if i && factors.shape[0] > 1 if index && count - keys = map.keys - result = index.search(factors[i, true], size: count + 1)[1..-1] - result.map do |v| - { - # ids from batch_insert start at 1 instead of 0 - item_id: keys[v[:id] - 1], - # convert cosine distance to cosine similarity - score: 1 - v[:distance] - } + norm_factors = factors[i, true] / norms[i] + if defined?(Faiss) && index.is_a?(Faiss::Index) + predictions, ids = index.search(norm_factors.expand_dims(0), count + 1).map { |v| v.to_a[0] } + else + result = index.search(norm_factors, size: count + 1) + # ids from batch_insert start at 1 instead of 0 + ids = result.map { |v| v[:id] - 1 } + # convert cosine distance to cosine similarity + predictions = result.map { |v| 1 - v[:distance] } end else - predictions = factors.dot(factors[i, true]) / norms - - predictions = - map.keys.zip(predictions).map do |item_id, pred| - {item_id: item_id, score: pred} - end - - max_score = predictions.delete_at(i)[:score] - predictions.sort_by! 
{ |pred| -pred[:score] } # already sorted by id - predictions = predictions.first(count) if count - # divide by max score to get cosine similarity - # only need to do for returned records - predictions.each { |pred| pred[:score] /= max_score } - predictions + predictions = factors.inner(factors[i, true]) / (norms * norms[i]) + indexes = predictions.sort_index.reverse + indexes = indexes[0...[count + 1, indexes.size].min] if count + predictions = predictions[indexes] + ids = indexes end - else - [] - end - end - def create_maps(train_set) - user_ids = train_set.map { |v| v[:user_id] }.uniq.sort - item_ids = train_set.map { |v| v[:item_id] }.uniq.sort + keys = map.keys - raise ArgumentError, "Missing user_id" if user_ids.any?(&:nil?) - raise ArgumentError, "Missing item_id" if item_ids.any?(&:nil?) + result = [] + # items can have the same score + # so original item may not be at index 0 + ids.each_with_index do |id, j| + next if id == i - @user_map = user_ids.zip(user_ids.size.times).to_h - @item_map = item_ids.zip(item_ids.size.times).to_h + result << {key => keys[id], score: predictions[j]} + break if result.size == count + end + result + else + [] + end end def check_ratings(ratings) - unless ratings.all? { |r| !r.nil? } - raise ArgumentError, "Missing ratings" + unless ratings.all? { |r| !r[:rating].nil? } + raise ArgumentError, "Missing rating" end - unless ratings.all? { |r| r.is_a?(Numeric) } - raise ArgumentError, "Ratings must be numeric" + unless ratings.all? { |r| r[:rating].is_a?(Numeric) } + raise ArgumentError, "Rating must be numeric" end end @@ -258,37 +430,27 @@ def to_dataset(dataset) end end - def marshal_dump - obj = { - implicit: @implicit, - user_map: @user_map, - item_map: @item_map, - rated: @rated, - global_mean: @global_mean, - user_factors: @user_factors, - item_factors: @item_factors - } + def json_load(obj) + @implicit = obj["implicit"] + @user_map = obj["user_ids"].map.with_index.to_h + @item_map = obj["item_ids"].map.with_index.to_h + @rated = obj["rated"].map.with_index.to_h { |r, i| [i, r.to_h { |v| [v, true] }] } + @global_mean = obj["global_mean"].to_f + @factors = obj["factors"].to_i + @user_factors = Numo::SFloat.from_binary(obj["user_factors"].unpack1("m0"), [@user_map.size, @factors]) + @item_factors = Numo::SFloat.from_binary(obj["item_factors"].unpack1("m0"), [@item_map.size, @factors]) + @epochs = obj["epochs"].to_i + @verbose = obj["verbose"] unless @implicit - obj[:min_rating] = @min_rating - obj[:max_rating] = @max_rating + @min_rating = obj["min_rating"] + @max_rating = obj["max_rating"] end - obj - end - - def marshal_load(obj) - @implicit = obj[:implicit] - @user_map = obj[:user_map] - @item_map = obj[:item_map] - @rated = obj[:rated] - @global_mean = obj[:global_mean] - @user_factors = obj[:user_factors] - @item_factors = obj[:item_factors] - - unless @implicit - @min_rating = obj[:min_rating] - @max_rating = obj[:max_rating] + @top_items = obj.key?("item_count") + if @top_items + @item_count = obj["item_count"] + @item_sum = obj["item_sum"] end end end diff --git a/lib/disco/version.rb b/lib/disco/version.rb index 3c82993..75158dd 100644 --- a/lib/disco/version.rb +++ b/lib/disco/version.rb @@ -1,3 +1,3 @@ module Disco - VERSION = "0.2.3" + VERSION = "0.5.1" end diff --git a/test/gemfiles/activerecord50.gemfile b/test/gemfiles/activerecord50.gemfile deleted file mode 100644 index 4c2ad1d..0000000 --- a/test/gemfiles/activerecord50.gemfile +++ /dev/null @@ -1,6 +0,0 @@ -source "https://rubygems.org" - -gemspec path: "../../" - -gem 
"activerecord", "~> 5.0.0" -gem "sqlite3", "~> 1.3.0" diff --git a/test/gemfiles/activerecord51.gemfile b/test/gemfiles/activerecord51.gemfile deleted file mode 100644 index 11e5fb2..0000000 --- a/test/gemfiles/activerecord51.gemfile +++ /dev/null @@ -1,5 +0,0 @@ -source "https://rubygems.org" - -gemspec path: "../../" - -gem "activerecord", "~> 5.1.0" diff --git a/test/gemfiles/activerecord52.gemfile b/test/gemfiles/activerecord52.gemfile deleted file mode 100644 index 5e35e61..0000000 --- a/test/gemfiles/activerecord52.gemfile +++ /dev/null @@ -1,5 +0,0 @@ -source "https://rubygems.org" - -gemspec path: "../../" - -gem "activerecord", "~> 5.2.0" diff --git a/test/metrics_test.rb b/test/metrics_test.rb new file mode 100644 index 0000000..ebfc602 --- /dev/null +++ b/test/metrics_test.rb @@ -0,0 +1,7 @@ +require_relative "test_helper" + +class MetricsTest < Minitest::Test + def test_rmse + assert_in_delta 2, Disco::Metrics.rmse([0, 0, 0, 1, 1], [0, 2, 4, 1, 1]) + end +end diff --git a/test/model_test.rb b/test/model_test.rb index 9e41faf..2cfe072 100644 --- a/test/model_test.rb +++ b/test/model_test.rb @@ -1,7 +1,7 @@ require_relative "test_helper" class ModelTest < Minitest::Test - def test_works + def test_recommendations user = User.create! products = Product.create!([{name: "Product A"}, {name: "Product B"}].shuffle) user.update_recommended_products([ @@ -12,4 +12,16 @@ def test_works assert_equal products, user.recommended_products.to_a assert_equal [], user.recommended_products_v2.to_a end + + def test_inheritance + user = AdminUser.create! + products = Product.create!([{name: "Product A"}, {name: "Product B"}].shuffle) + user.update_recommended_products([ + {item_id: products.first.id, score: 1}, + {item_id: products.last.id, score: 0.5} + ].shuffle) + assert_equal products.size, user.recommendations.count + assert_equal products, user.recommended_products.to_a + assert_equal [], user.recommended_products_v2.to_a + end end diff --git a/test/optimize_test.rb b/test/optimize_test.rb new file mode 100644 index 0000000..c4f707d --- /dev/null +++ b/test/optimize_test.rb @@ -0,0 +1,114 @@ +require_relative "test_helper" + +class OptimizeTest < Minitest::Test + def setup + skip "Not available on Windows" if windows? 
+ end + + def test_optimize_user_recs + data = Disco.load_movielens + recommender = Disco::Recommender.new(factors: 20) + recommender.fit(data) + + original_recs = recommender.user_recs(1) + + recommender.optimize_user_recs + + recs = recommender.user_recs(1) + assert_equal original_recs.map { |v| v[:item_id] }, recs.map { |v| v[:item_id] } + original_recs.zip(recs).each do |exp, act| + assert_in_delta exp[:score], act[:score] + end + assert_equal 5, recs.size + end + + def test_optimize_item_recs + data = Disco.load_movielens + recommender = Disco::Recommender.new(factors: 20) + recommender.fit(data) + + original_recs = recommender.item_recs("Star Wars (1977)") + + recommender.optimize_item_recs(library: "faiss") + + recs = recommender.item_recs("Star Wars (1977)") + assert_equal original_recs.map { |v| v[:item_id] }, recs.map { |v| v[:item_id] } + original_recs.zip(recs).each do |exp, act| + assert_in_delta exp[:score], act[:score] + end + assert_equal 5, recs.size + + item_ids = recs.map { |r| r[:item_id] } + assert_includes item_ids, "Empire Strikes Back, The (1980)" + assert_includes item_ids, "Return of the Jedi (1983)" + + assert_in_delta 0.9972, recs.first[:score], 0.01 + end + + def test_optimize_similar_users + data = Disco.load_movielens + recommender = Disco::Recommender.new(factors: 20) + recommender.fit(data) + + original_recs = recommender.similar_users(1) + + recommender.optimize_similar_users(library: "faiss") + + recs = recommender.similar_users(1) + + assert_equal original_recs.map { |v| v[:user_id] }, recs.map { |v| v[:user_id] } + original_recs.zip(recs).each do |exp, act| + assert_in_delta exp[:score], act[:score] + end + assert_equal 5, recs.size + end + + def test_optimize_item_recs_ngt + data = Disco.load_movielens + recommender = Disco::Recommender.new(factors: 20) + recommender.fit(data) + + original_recs = recommender.item_recs("Star Wars (1977)") + + recommender.optimize_item_recs(library: "ngt") + + recs = recommender.item_recs("Star Wars (1977)") + assert_equal original_recs.map { |v| v[:item_id] }, recs.map { |v| v[:item_id] } + original_recs.zip(recs).each do |exp, act| + assert_in_delta exp[:score], act[:score] + end + assert_equal 5, recs.size + + item_ids = recs.map { |r| r[:item_id] } + assert_includes item_ids, "Empire Strikes Back, The (1980)" + assert_includes item_ids, "Return of the Jedi (1983)" + + assert_in_delta 0.9972, recs.first[:score], 0.01 + end + + def test_optimize_similar_users_ngt + data = Disco.load_movielens + recommender = Disco::Recommender.new(factors: 20) + recommender.fit(data) + + original_recs = recommender.similar_users(1, count: 10) + + recommender.optimize_similar_users(library: "ngt") + + recs = recommender.similar_users(1, count: 10) + + # won't match exactly due to ANN + matching_ids = original_recs.map { |v| v[:user_id] } & recs.map { |v| v[:user_id] } + assert_includes 8..10, matching_ids.size + matching_ids.each do |user_id| + exp = original_recs.find { |v| v[:user_id] == user_id } + act = recs.find { |v| v[:user_id] == user_id } + assert_in_delta exp[:score], act[:score] + end + assert_equal 10, recs.size + end + + def windows? + Gem.win_platform? 
+ end +end diff --git a/test/recommender_test.rb b/test/recommender_test.rb index b4890f6..67e0b4e 100644 --- a/test/recommender_test.rb +++ b/test/recommender_test.rb @@ -6,19 +6,12 @@ def test_explicit recommender = Disco::Recommender.new(factors: 20) recommender.fit(data) - path = "#{Dir.mktmpdir}/recommender.bin" - - dump = Marshal.dump(recommender) - File.binwrite(path, dump) - - dump = File.binread(path) - recommender = Marshal.load(dump) + dump = recommender.to_json + recommender = Disco::Recommender.load_json(dump) assert_equal [1664, 20], recommender.item_factors.shape assert_equal [943, 20], recommender.user_factors.shape - - expected = data.map { |v| v[:rating] }.sum / data.size.to_f - assert_in_delta expected, recommender.global_mean + assert_in_delta 3.52986, recommender.global_mean recs = recommender.item_recs("Star Wars (1977)") assert_equal 5, recs.size @@ -26,8 +19,19 @@ def test_explicit item_ids = recs.map { |r| r[:item_id] } assert_includes item_ids, "Empire Strikes Back, The (1980)" assert_includes item_ids, "Return of the Jedi (1983)" + refute_includes item_ids, "Star Wars (1977)" assert_in_delta 0.9972, recs.first[:score], 0.01 + + assert_equal (1664 - data.select { |v| v[:user_id] == 1 }.map { |v| v[:item_id] }.uniq.size), recommender.user_recs(1, count: nil).size + assert_equal 1663, recommender.item_recs("Star Wars (1977)", count: nil).size + assert_equal 942, recommender.similar_users(1, count: nil).size + + assert recommender.inspect.size < 50 + assert recommender.to_s.size < 50 + + # fit after loading + recommender.fit(data.first(5)) end def test_implicit @@ -37,21 +41,18 @@ def test_implicit recommender = Disco::Recommender.new(factors: 20) recommender.fit(data) - path = "#{Dir.mktmpdir}/recommender.bin" - - dump = Marshal.dump(recommender) - File.binwrite(path, dump) - - dump = File.binread(path) - recommender = Marshal.load(dump) + dump = recommender.to_json + recommender = Disco::Recommender.load_json(dump) assert_equal [1664, 20], recommender.item_factors.shape assert_equal [943, 20], recommender.user_factors.shape assert_equal 0, recommender.global_mean - recs = recommender.item_recs("Star Wars (1977)", count: 10).map { |r| r[:item_id] } - assert_includes recs, "Empire Strikes Back, The (1980)" - assert_includes recs, "Return of the Jedi (1983)" + recs = recommender.item_recs("Star Wars (1977)", count: 10) + item_ids = recs.map { |r| r[:item_id] } + assert_includes item_ids, "Empire Strikes Back, The (1980)" + assert_includes item_ids, "Return of the Jedi (1983)" + refute_includes item_ids, "Star Wars (1977)" end def test_examples @@ -65,8 +66,8 @@ def test_examples recommender = Disco::Recommender.new recommender.fit([ - {user_id: 1, item_id: 1, value: 1}, - {user_id: 2, item_id: 1, value: 2} + {user_id: 1, item_id: 1}, + {user_id: 2, item_id: 1} ]) recommender.user_recs(1) recommender.item_recs(1) @@ -89,6 +90,99 @@ def test_rated assert_equal ["A", "B"], recommender.user_recs(2).map { |r| r[:item_id] }.sort end + def test_item_recs_same_score + data = [{user_id: 1, item_id: "A"}, {user_id: 1, item_id: "B"}, {user_id: 2, item_id: "C"}] + recommender = Disco::Recommender.new(factors: 50) + recommender.fit(data) + assert_equal ["B", "C"], recommender.item_recs("A").map { |r| r[:item_id] } + end + + def test_similar_users + data = Disco.load_movielens + recommender = Disco::Recommender.new(factors: 20) + recommender.fit(data) + + refute_empty recommender.similar_users(data.first[:user_id]) + assert_empty recommender.similar_users("missing") + end + + def 
test_top_items_explicit + data = Disco.load_movielens + recommender = Disco::Recommender.new(factors: 20, top_items: true) + recommender.fit(data) + top_items = recommender.top_items + assert_equal top_items, recommender.user_recs("unknown") + + recommender = Disco::Recommender.load_json(recommender.to_json) + assert_equal top_items, recommender.top_items + assert_equal top_items, recommender.user_recs("unknown") + end + + def test_top_items_implicit + data = Disco.load_movielens + data.each { |v| v.delete(:rating) } + recommender = Disco::Recommender.new(factors: 20, top_items: true) + recommender.fit(data) + top_items = recommender.top_items + assert_equal top_items, recommender.user_recs("unknown") + + recommender = Disco::Recommender.load_json(recommender.to_json) + assert_equal top_items, recommender.top_items + assert_equal top_items, recommender.user_recs("unknown") + end + + def test_top_items_not_computed + data = Disco.load_movielens + recommender = Disco::Recommender.new(factors: 20) + recommender.fit(data.first(5)) + error = assert_raises do + recommender.top_items + end + assert_equal "top_items not computed", error.message + end + + def test_top_items_no_range + data = [ + {user_id: 1, item_id: "A", rating: 5}, + {user_id: 1, item_id: "B", rating: 5}, + {user_id: 2, item_id: "B", rating: 5} + ] + recommender = Disco::Recommender.new(factors: 20, top_items: true) + recommender.fit(data) + assert_equal ["B", "A"], recommender.top_items.map { |r| r[:item_id] } + end + + def test_ids + data = [ + {user_id: 1, item_id: "A"}, + {user_id: 1, item_id: "B"}, + {user_id: 2, item_id: "B"} + ] + recommender = Disco::Recommender.new + recommender.fit(data) + assert_equal [1, 2], recommender.user_ids + assert_equal ["A", "B"], recommender.item_ids + end + + def test_factors + data = [ + {user_id: 1, item_id: "A"}, + {user_id: 1, item_id: "B"}, + {user_id: 2, item_id: "B"} + ] + recommender = Disco::Recommender.new(factors: 20) + recommender.fit(data) + + assert_equal [2, 20], recommender.user_factors.shape + assert_equal [2, 20], recommender.item_factors.shape + + assert_equal [20], recommender.user_factors(1).shape + assert_equal [20], recommender.item_factors("A").shape + + assert_nil recommender.user_factors(3) + assert_nil recommender.item_factors("C") + end + def test_validation_set_explicit data = Disco.load_movielens train_set = data.first(80000) @@ -131,15 +225,21 @@ def test_user_recs_new_item {user_id: 1, item_id: 1, rating: 5}, {user_id: 2, item_id: 1, rating: 3} ]) - assert_empty [], recommender.user_recs(1, item_ids: [1000]) + assert_empty recommender.user_recs(1, item_ids: [1000]) end - # TODO better test (need deterministic output) def test_predict data = Disco.load_movielens - recommender = Disco::Recommender.new(factors: 20) - recommender.fit(data) - assert_kind_of Array, recommender.predict(data.first(5)) + data.shuffle!(random: Random.new(1)) + + train_set = data.first(80000) + valid_set = data.last(20000) + + recommender = Disco::Recommender.new(factors: 20, verbose: false) + recommender.fit(train_set, validation_set: valid_set) + + predictions = recommender.predict(valid_set) + assert_in_delta 0.91, Disco::Metrics.rmse(valid_set.map { |v| v[:rating] }, predictions), 0.01 end def test_predict_new_user @@ -192,12 +292,52 @@ def test_missing_item_id assert_equal "Missing item_id", error.message end + def test_missing_rating + recommender = Disco::Recommender.new + error = assert_raises ArgumentError do + recommender.fit([{user_id: 1, item_id: 1, rating: 5}, {user_id: 1, 
item_id: 2}]) + end + assert_equal "Missing rating", error.message + end + + def test_missing_rating_validation_set + recommender = Disco::Recommender.new + error = assert_raises ArgumentError do + recommender.fit([{user_id: 1, item_id: 1, rating: 5}], validation_set: [{user_id: 1, item_id: 2}]) + end + assert_equal "Missing rating", error.message + end + + def test_invalid_rating + recommender = Disco::Recommender.new + error = assert_raises ArgumentError do + recommender.fit([{user_id: 1, item_id: 1, rating: "invalid"}]) + end + assert_equal "Rating must be numeric", error.message + end + + def test_invalid_rating_validation_set + recommender = Disco::Recommender.new + error = assert_raises ArgumentError do + recommender.fit([{user_id: 1, item_id: 1, rating: 5}], validation_set: [{user_id: 1, item_id: 1, rating: "invalid"}]) + end + assert_equal "Rating must be numeric", error.message + end + + def test_value + recommender = Disco::Recommender.new + error = assert_raises(ArgumentError) do + recommender.fit([{user_id: 1, item_id: 1, value: 5}]) + end + assert_match "Passing `:value` with implicit feedback has no effect on recommendations", error.message + end + def test_multiple_user_item skip # no error for now train_set = [ {user_id: 1, item_id: 2, rating: 1}, - {user_id: 1, item_id: 2, rating: 2}, + {user_id: 1, item_id: 2, rating: 2} ] recommender = Disco::Recommender.new error = assert_raises ArgumentError do @@ -214,6 +354,15 @@ def test_not_fit assert_equal "Not fit", error.message end + def test_fit_multiple + recommender = Disco::Recommender.new + recommender.fit([{user_id: 1, item_id: 1, rating: 5}]) + recommender.fit([{user_id: 2, item_id: 2}]) + assert_equal [2], recommender.user_ids + assert_equal [2], recommender.item_ids + assert_operator recommender.predict([{user_id: 2, item_id: 2}])[0], :<, 1.0 + end + def test_rover movielens = Disco.load_movielens @@ -247,23 +396,4 @@ def test_daru # original data frame not modified assert_equal ["user_id", "item_id", "rating"], data.vectors.to_a end - - def test_optimize_similar_items - skip "NGT not available on Windows" if Gem.win_platform? 
- - data = Disco.load_movielens - recommender = Disco::Recommender.new(factors: 20) - recommender.fit(data) - - recommender.optimize_similar_items - - recs = recommender.item_recs("Star Wars (1977)") - assert_equal 5, recs.size - - item_ids = recs.map { |r| r[:item_id] } - assert_includes item_ids, "Empire Strikes Back, The (1980)" - assert_includes item_ids, "Return of the Jedi (1983)" - - assert_in_delta 0.9972, recs.first[:score], 0.01 - end end diff --git a/test/support/active_record.rb b/test/support/active_record.rb index 0c97c1d..d50a348 100644 --- a/test/support/active_record.rb +++ b/test/support/active_record.rb @@ -7,20 +7,23 @@ # migrations ActiveRecord::Base.establish_connection adapter: "sqlite3", database: ":memory:" -ActiveRecord::Migration.create_table :users do |t| - t.string :name -end - -ActiveRecord::Migration.create_table :products do |t| - t.string :name -end - -ActiveRecord::Migration.create_table :disco_recommendations do |t| - t.references :subject, polymorphic: true - t.references :item, polymorphic: true - t.float :score - t.string :context - t.timestamps +ActiveRecord::Schema.define do + create_table :users do |t| + t.string :name + t.string :type + end + + create_table :products do |t| + t.string :name + end + + create_table :disco_recommendations do |t| + t.references :subject, polymorphic: true + t.references :item, polymorphic: true + t.float :score + t.string :context + t.timestamps + end end class User < ActiveRecord::Base @@ -28,6 +31,9 @@ class User < ActiveRecord::Base has_recommended :products_v2, class_name: "Product" end +class AdminUser < User +end + class Product < ActiveRecord::Base end diff --git a/test/test_helper.rb b/test/test_helper.rb index 419a47e..1da70ae 100644 --- a/test/test_helper.rb +++ b/test/test_helper.rb @@ -3,7 +3,6 @@ Bundler.require(:default) require "minitest/autorun" require "minitest/pride" -require "csv" require "daru" require "rover"