-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgarbage_detector.rb
45 lines (36 loc) · 1.15 KB
/
garbage_detector.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
require 'optparse'
require 'fileutils'
require 'wordtriez'
require 'wordtree'
require 'benchmark'
require 'json'
# Command-line option defaults, filled in by the parser below.
# NOTE(review): :output is never read anywhere in this script — confirm
# whether it is still needed.
options = { output: "index.json" }

# NOTE(review): this global is not referenced anywhere else in this script;
# it is kept only so the script's global state is unchanged — confirm before
# removing.
$t = Wordtriez.new

# Parse ARGV in place (parse! removes the switches it recognises).
OptionParser.new do |opts|
  # Derive the name from how the script was invoked so the usage line cannot
  # drift from the real file name (it used to advertise "compare.rb").
  opts.banner = "Usage: #{File.basename($PROGRAM_NAME)} [options]"

  opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
    options[:verbose] = v
  end

  # Path to a text file listing the input files, one per line.
  opts.on("-f", "--files FILELIST", "Read files from FILELIST.") do |path|
    options[:files] = path
  end

  # Long-only switch: no short form is declared, so no placeholder string is
  # passed for it.
  opts.on("--chdir DIR", "Change working dir to DIR before processing") do |path|
    options[:chdir] = path
  end
end.parse!
# Report, for every file named in the list given with -f/--files, how many
# "common trigrams" WordTree finds relative to the size of the text.
#
# Output is one tab-separated line per file on stdout:
#   index, timestamp, common count, total count, ratio, path
# Files that WordTree::Text.clean rejects with ArgumentError get an "(error)"
# line with the count and ratio columns left blank.

# Fail early with a readable message; without this a missing -f surfaced as a
# TypeError from File.read(nil).
unless options[:files]
  abort "#{File.basename($PROGRAM_NAME)}: missing required option -f/--files FILELIST"
end

# One path per line. chomp also strips a trailing "\r" from CRLF lists, and
# blank lines are skipped so they are not treated as (unopenable) paths.
files = File.read(options[:files]).lines.map(&:chomp).reject(&:empty?)

# Paths in the list are resolved relative to --chdir DIR when it is given.
files = files.map { |f| File.join(options[:chdir], f) } if options[:chdir]

files.each_with_index do |path, i|
  time = Time.now.strftime("%H:%M:%S.%L")

  # Read as UTF-8 and replace invalid byte sequences so later string
  # operations do not raise on malformed input.
  text = File.open(path, "r:UTF-8", &:read).scrub

  # Presumably the number of overlapping character trigrams in the text
  # (size - 2) — TODO confirm against WordTree. Clamped at zero so files
  # shorter than three characters do not yield a negative total.
  total_count = [text.size - 2, 0].max

  begin
    # NOTE(review): the return value is discarded, so this only has an effect
    # if clean mutates its argument in place — confirm against the wordtree gem.
    WordTree::Text.clean(text)
  rescue ArgumentError
    puts "#{i+1}\t#{time}\t\t#{total_count}\t\t#{path}\t(error)"
    next
  end

  common_count = WordTree::Text.common_trigrams(text)

  # Avoid NaN/Infinity in the ratio column when there is nothing to count.
  ratio = total_count.zero? ? 0.0 : common_count.to_f / total_count

  puts "#{i+1}\t#{time}\t#{common_count}\t#{total_count}\t#{ratio}\t#{path}\t"
  # Flush per file so progress is visible when stdout is piped or redirected.
  $stdout.flush
end