forked from yegor256/0pdd
Showing 9 changed files with 4,019 additions and 3 deletions.
Neural Net (in Ruby)
===

#### Data
The puzzle data is pre-processed and available in *~/data/proper_pdd_data_regression.csv*. In this file, the first row is the column header, the first column is the id of the repo the puzzle belongs to, and the last column is the output variable (*y*).
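A minimal sketch of loading a file with this layout (header row, repo id first, target last), using Ruby's standard `CSV` library — the helper name and return shape are illustrative, not part of the project:

```ruby
require 'csv'

# Load puzzle data laid out as described above: the first row is a
# header, the first column is the repo id, and the last column is y.
def load_regression_data(path)
  rows = CSV.read(path)
  rows.shift # drop the header row
  repo_ids = rows.map(&:first)
  x = rows.map { |r| r[1..-2].map(&:to_f) } # feature columns
  y = rows.map { |r| r[-1].to_f }           # target column
  [repo_ids, x, y]
end
```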
#### Model
The neural network uses gradient descent to optimize its weights. The weights are stored in *~/data/weights.marshal* after training and loaded on subsequent runs. To retrain the model, delete *~/data/weights.marshal*.
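The save/load cycle described above can be sketched with Ruby's built-in `Marshal`; the weight structure shown here is a hypothetical nested array, not the model's actual layout:

```ruby
# Persist weights to disk after training so later runs can skip
# training and simply reload them.
def save_weights(weights, path)
  File.binwrite(path, Marshal.dump(weights))
end

def load_weights(path)
  Marshal.load(File.binread(path))
end
```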
##### Training
Run the following command to train the model or test it on a random dataset:
```sh
ruby model/model.rb
```
```ruby
require 'slop'
require 'rainbow'
require 'active_support/core_ext/hash'
require 'nokogiri'
require_relative 'nn'

DATA_FNAME = File.join(File.dirname(__FILE__), 'data/proper_pdd_data_regression.csv')
WEIGHTS_FNAME = File.join(File.dirname(__FILE__), 'data/weights.marshal')

def split_data(x_data, y_data)
  n = x_data.length
  train_test_split = 0.8 # fraction of the data used for training
  train_size = (train_test_split * n).round
  x_train = x_data.slice(0, train_size)
  y_train = y_data.slice(0, train_size)
  x_test = x_data.slice(train_size, n)
  y_test = y_data.slice(train_size, n)
  [x_train, y_train, x_test, y_test]
end

# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
def run_model(inputs)
  inputs = [inputs] if inputs.is_a?(Hash)
  rows = File.readlines(DATA_FNAME).map { |l| l.chomp.split(',') }
  rows.slice!(0) # remove the CSV header row
  rows = rows.transpose[1..].transpose # drop the first column (repo id)
  rows.shuffle! # shuffle the data
  x_data = rows.map { |row| row[0..-2].map(&:to_f) } # feature vectors
  y_data = rows.map { |row| row[-1..].map(&:to_f) } # target values
  x_train, y_train, x_test, y_test = split_data(x_data, y_data)
  epsilon = 1e-1 # tolerance for counting a prediction as correct
  mse = lambda { |actual, ideal|
    errors = actual.zip(ideal).map { |a, i| a - i }
    (errors.inject(0) { |sum, err| sum + err**2 }) / errors.length.to_f
  }
  error_rate = ->(errors, total) { ((errors / total.to_f) * 100).round }
  prediction_success = ->(actual, ideal) { actual >= (ideal - epsilon) && actual <= (ideal + epsilon) }
  run_test = lambda { |nn, test_inputs, expected_outputs|
    success = 0
    failure = 0
    errsum = 0
    test_inputs.each.with_index do |input, i|
      output = nn.run input
      prediction_success.call(output[0], expected_outputs[i][0]) ? success += 1 : failure += 1
      errsum += mse.call(output, expected_outputs[i])
    end
    [success, failure, errsum / test_inputs.length.to_f]
  }

  # Build a 4-layer network: `input_size` input neurons, two hidden layers
  # of 20 and 10 neurons, and 1 output neuron. Bias neurons are added to
  # the input and hidden layers automatically; no need to specify them.
  input_size = x_train.first.length
  nn = NeuralNet.new [input_size, 20, 10, 1]

  if File.file?(WEIGHTS_FNAME)
    puts "\nLoading existing model weights..."
    nn.load WEIGHTS_FNAME
    puts "\nSuccessfully loaded model weights..."
  else
    puts 'Testing the untrained network...'
    success, failure, avg_mse = run_test.call(nn, x_test, y_test)
    puts "Untrained classification success: #{success}, failure: #{failure}
      (classification error: #{error_rate.call(failure, x_test.length)}%, mse: #{(avg_mse * 100).round(2)}%)"

    puts "\nTraining the network...\n\n"
    t1 = Time.now
    result = nn.train(x_train, y_train, error_threshold: 0.01,
                                        max_iterations: 1000,
                                        log_every: 20)
    puts "\nDone training the network: #{result[:iterations]} iterations,
      #{(result[:error] * 100).round(2)}% mse, #{(Time.now - t1).round(1)}s"
    puts "\nSaving the model weights..."
    nn.save WEIGHTS_FNAME
    puts "\nSuccessfully saved model weights..."
  end

  puts "\nTesting the trained network..."
  success, failure, avg_mse = run_test.call(nn, x_test, y_test)
  puts "Trained classification success: #{success}, failure: #{failure}
    (classification error: #{error_rate.call(failure, x_test.length)}%, mse: #{(avg_mse * 100).round(2)}%)"

  # Rank puzzle indices from the smallest estimate to the largest;
  # puzzles without an estimate sort last.
  estimates = inputs.map { |p| p['estimate'] ? p['estimate'].to_i : Float::INFINITY }
  estimates.map.with_index.sort.map(&:last)
end
# rubocop:enable Metrics/AbcSize, Metrics/MethodLength

def process_input(input_path)
  raise 'Please provide a valid input path' unless input_path
  doc = Nokogiri::XML(File.read(input_path))
  puzzles = Hash.from_xml(doc.to_s)['puzzles']
  puzzles = puzzles.nil? ? [] : puzzles['puzzle']
  puzzles.nil? ? [] : puzzles
end

begin
  args = []
  args += ARGV

  begin
    opts = Slop.parse(args, strict: true, help: true) do |o|
      o.banner = 'Usage: model.rb [options]'
      o.string '-p', '--puzzles', 'Puzzles file'
      o.string '-f', '--file', 'Result file'
    end
  rescue Slop::Error => e
    raise StandardError, "#{e.message}, use -p and -f"
  end

  raise 'both -p and -f are mandatory, try --help for more information' if !opts[:file] || !opts[:puzzles]

  ranks = run_model(process_input(opts[:puzzles]))
  File.open(opts[:file], 'w') { |f| f.write(ranks.join(' ')) }
  puts 'Successfully wrote output to file'
rescue StandardError => e
  puts "#{Rainbow('ERROR').red} (#{e.class.name}): #{e.message}"
  exit(255)
end
```
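The ranking step at the end of `run_model` (ordering puzzle indices from the smallest estimate to the largest, i.e. an "argsort") can be exercised in isolation; the helper name here is illustrative:

```ruby
# Return puzzle indices ordered from the smallest estimate to the
# largest: pair each value with its index, sort by value, keep indices.
def rank_by_estimate(estimates)
  estimates.map.with_index.sort.map(&:last)
end
```

For example, estimates `[5, 1, 3]` yield the ranking `[1, 2, 0]`: puzzle 1 has the smallest estimate, then puzzle 2, then puzzle 0.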