reimplement levenshtein as a fallback

This commit is contained in:
Ben Balter 2014-10-07 19:07:09 -04:00
parent 3e6a2ed2b3
commit 526b84343d
14 changed files with 211 additions and 47 deletions

View File

@ -6,6 +6,7 @@ license = Licensee::Project.new(Dir.pwd).license_file
# Print a short report for the detected license file: the matched license
# title (or 'no license' when nothing matched), the confidence figure and the
# matcher class that produced the result. Prints "Unknown" when `license`
# itself is falsy.
if license
  matched = license.match
  title = matched ? matched.meta['title'] : 'no license'
  puts "License: #{title}"
  puts "Confidence: #{license.confidence}%"
  puts "Method: #{license.matcher.class}"
else
  puts "Unknown"
end

View File

@ -1,19 +1,30 @@
require 'yaml'
require 'rugged'
require 'levenshtein'
require_relative "licensee/license"
require_relative "licensee/licenses"
require_relative "licensee/license_file"
require_relative "licensee/project"
require_relative "licensee/matcher"
require_relative "licensee/matchers/git"
require_relative "licensee/matchers/levenshtein"
class Licensee
CONFIDENCE_THRESHOLD = 90
def self.licenses
Licensee::Licenses.list
end
class << self
def self.license(path)
Licensee::Project.new(path).license
def licenses
Licensee::Licenses.list
end
def license(path)
Licensee::Project.new(path).license
end
def matchers
[Licensee::GitMatcher, Licensee::LevenshteinMatcher]
end
end
end

View File

@ -24,16 +24,16 @@ class Licensee
nil
end
def length
@length ||= body.length
end
def body
@body ||= parts[2]
end
alias_method :to_s, :body
alias_method :text, :body
# Returns the license body lower-cased, with every newline replaced by a
# single space and leading/trailing whitespace stripped.
#
# The result is memoized in an instance variable named after this method
# (it previously reused the name @content_normalized).
def body_normalized
  @body_normalized ||= body.downcase.gsub("\n", " ").strip
end
def hashsig
@hashsig ||= Rugged::Blob::HashSignature.new(
body, Rugged::Blob::HashSignature::WHITESPACE_SMART)

View File

@ -7,43 +7,40 @@ class Licensee
blob.hashsig(Rugged::Blob::HashSignature::WHITESPACE_SMART)
end
def contents
@contents ||= blob.content
def content
@contents ||= begin
blob.content
end
end
alias_method :to_s, :contents
alias_method :content, :contents
alias_method :to_s, :content
alias_method :contents, :content
def content_wrapped
def length
@length ||= blob.size
end
def matches
@matches ||= Licensee::Licenses.list.map { |l| [l, calculate_similarity(l)] }.select { |l,sim| sim > 0 }
# Returns the file content lower-cased, with each newline turned into a space
# and surrounding whitespace removed. Computed once and cached.
def content_normalized
  return @content_normalized if @content_normalized
  lowered = content.downcase
  @content_normalized = lowered.tr("\n", " ").strip
end
def match_info
@match_info ||= matches.max_by { |license, similarity| similarity }
end
def match
match_info ? match_info[0] : nil
end
def confidence
match_info ? match_info[1] : nil
end
alias_method :similarity, :confidence
# Returns, as a String, the diff between this file's blob and the body of the
# matched license, or nil when there is no match.
#
# options - Hash forwarded to blob.diff; :reverse => true is always merged in
#           (the caller's hash is not mutated).
def diff(options={})
  diff_options = options.merge(:reverse => true)
  return unless match
  blob.diff(match.body, diff_options).to_s
end
private
# Returns the first matcher, in the order given by Licensee.matchers, that
# produces a match for this file, or nil when none of them does.
#
# Each matcher class is instantiated with this file; #match is then tried one
# matcher at a time and the search stops at the first hit. The outcome
# (including nil) is memoized.
#
# Note: this must not be written as `each { ...; break m if m.match }`,
# because `each` returns the array of matcher classes when no `break` fires,
# which would be cached as if it were a matcher.
def matcher
  return @matcher if defined?(@matcher)
  candidates = Licensee.matchers.map { |matcher_class| matcher_class.new(self) }
  @matcher = candidates.find { |candidate| candidate.match }
end
# Pulled out for easier testing
def calculate_similarity(other)
blob.similarity(other.hashsig)
# Returns the license found by the selected matcher, or nil when no matcher
# is available. A non-nil result is cached.
def match
  return unless matcher
  @match ||= matcher.match
end
# Returns the confidence reported by the selected matcher, or 0 when no
# matcher is available (the same default Matcher#confidence uses), instead of
# raising NoMethodError on nil. The value is cached.
def confidence
  @confidence ||= matcher ? matcher.confidence : 0
end
end
end

27
lib/licensee/matcher.rb Normal file
View File

@ -0,0 +1,27 @@
class Licensee
  # Base class for license matchers. On its own it never matches anything:
  # it reports no candidates, no match and a confidence of 0. Subclasses
  # override #matches, #match and #confidence.
  class Matcher
    # The file object this matcher was built for.
    attr_reader :file

    class << self
      # Shortcut for new(file).match.
      def match(file)
        new(file).match
      end
    end

    def initialize(file)
      @file = file
    end

    # Candidate matches; none in the base class.
    def matches
      []
    end

    # The matched license; always nil in the base class.
    def match
      nil
    end

    # Confidence of the match; always 0 in the base class.
    def confidence
      0
    end
    alias similarity confidence
  end
end

View File

@ -0,0 +1,30 @@
class Licensee
  # Matcher that scores the file's blob against the hash signature of every
  # known license and accepts the best score if it clears
  # Licensee::CONFIDENCE_THRESHOLD.
  class GitMatcher < Matcher
    # Returns [license, score] pairs for every known license whose score is
    # greater than zero. Memoized.
    def matches
      @matches ||= Licensee::Licenses.list.map { |l| [l, similarity(l)] }.select { |l,sim| sim > 0 }
    end

    # Returns the best-scoring license, or nil when nothing clears the
    # threshold.
    def match
      match_info ? match_info[0] : nil
    end

    # Returns the score of the best match, or nil when nothing clears the
    # threshold.
    def confidence
      match_info ? match_info[1] : nil
    end

    private

    # Pulled out for easier testing
    def similarity(other)
      file.blob.similarity(other.hashsig)
    end

    # Returns the highest-scoring [license, score] pair when its score is
    # strictly above the threshold, otherwise nil. Also nil when no license
    # scored above zero at all (max_by returns nil for an empty list, so it
    # must be checked before indexing).
    def match_info
      @match_info ||= begin
        best = matches.max_by { |_license, score| score }
        best if best && best[1] > Licensee::CONFIDENCE_THRESHOLD
      end
    end
  end
end

View File

@ -0,0 +1,42 @@
class Licensee
  # Matcher that compares the normalized file content with each candidate
  # license's normalized body using Levenshtein edit distance.
  class LevenshteinMatcher < Matcher
    # Returns the first candidate license whose similarity to the file is at
    # least Licensee::CONFIDENCE_THRESHOLD, or nil when none qualifies.
    #
    # The result (including nil) is memoized so that #confidence and repeated
    # callers do not recompute edit distances.
    def match
      return @match if defined?(@match)
      @match = potential_licenses.find do |license|
        similarity(license) >= Licensee::CONFIDENCE_THRESHOLD
      end
    end

    # Returns the licenses whose normalized length differs from the file's by
    # no more than #max_delta, ordered from the smallest length difference to
    # the largest so the closest candidates are compared first.
    def potential_licenses
      @potential_licenses ||= begin
        candidates = Licensee::Licenses.list.select { |license| length_delta(license) <= max_delta }
        candidates.sort_by { |license| length_delta(license) }
      end
    end

    # Absolute difference in length between the normalized file content and
    # the normalized body of +license+.
    def length_delta(license)
      (file.content_normalized.length - license.body_normalized.length).abs
    end

    # Largest length difference a license may have and still be considered:
    # the normalized file length multiplied by CONFIDENCE_THRESHOLD / 100.
    # NOTE(review): with a threshold of 90 this admits licenses differing by
    # up to 90% of the file length; confirm whether (100 - threshold) was
    # intended before tightening.
    def max_delta
      @max_delta ||= (file.content_normalized.length * (Licensee::CONFIDENCE_THRESHOLD.to_f / 100.to_f ))
    end

    # Similarity of the matched license, or 0 when there is no match.
    def confidence
      @confidence ||= match ? similarity(match) : 0
    end

    private

    # Length of the normalized file content, as a Float.
    def length
      file.content_normalized.length.to_f
    end

    # Share of the file's normalized length, scaled to 100, that is left
    # after subtracting the edit distance to +license+. Can be negative when
    # the distance exceeds the file length.
    def similarity(license)
      100 * (length - distance(license)) / length
    end

    # Levenshtein distance between the normalized file content and the
    # normalized license body, as a Float.
    def distance(license)
      Levenshtein.distance(file.content_normalized, license.body_normalized).to_f
    end
  end
end

View File

@ -16,6 +16,7 @@ Gem::Specification.new do |gem|
gem.executables << 'licensee'
gem.add_dependency('rugged', '~> 0.21.1b2')
gem.add_dependency('levenshtein-ffi', '~> 1.1')
gem.add_development_dependency('pry', '~> 0.9')
gem.add_development_dependency('shoulda', '~> 3.5')
gem.add_development_dependency('rake', '~> 10.3')

View File

@ -60,7 +60,7 @@ def verify_license_file(license, chaos = false, wrap=false)
actual = license_file.match
assert actual, "No match for #{expected}."
assert_equal expected, actual.name, "expeceted #{expected} but got #{actual.name} for .match. Matches: #{license_file.matches}"
assert_equal expected, actual.name, "expeceted #{expected} but got #{actual.name} for .match. Confidence: #{license_file.confidence}. Method: #{license_file.matcher.class}"
end
def wrap(text, line_width=80)

View File

@ -0,0 +1,22 @@
require 'helper'
# Exercises Licensee::GitMatcher against a license file built from the MIT
# license text.
class TestLicenseeGitMatcher < Minitest::Test
  def setup
    mit_path = Licensee::Licenses.find("mit").path
    @mit = Licensee::LicenseFile.new(FakeBlob.new(license_from_path(mit_path)))
  end

  should "match the license" do
    matched = Licensee::GitMatcher.match(@mit)
    assert_equal "mit", matched.name
  end

  should "know the match confidence" do
    matcher = Licensee::GitMatcher.new(@mit)
    assert_equal 94, matcher.confidence
  end

  should "know the matches" do
    matcher = Licensee::GitMatcher.new(@mit)
    assert_equal 1, matcher.matches.size
  end
end

View File

@ -0,0 +1,34 @@
require 'helper'
# Exercises Licensee::LevenshteinMatcher against a license file built from
# the MIT license text.
class TestLicenseeLevenshteinMatcher < Minitest::Test
  def setup
    mit_path = Licensee::Licenses.find("mit").path
    @mit = Licensee::LicenseFile.new(FakeBlob.new(license_from_path(mit_path)))
  end

  should "match the license" do
    matched = Licensee::LevenshteinMatcher.match(@mit)
    assert_equal "mit", matched.name
  end

  should "know the match confidence" do
    confidence = Licensee::LevenshteinMatcher.new(@mit).confidence
    assert confidence > 98, "#{confidence} < 98"
  end

  should "calculate max delta" do
    matcher = Licensee::LevenshteinMatcher.new(@mit)
    assert_equal 968.4, matcher.max_delta
  end

  should "calculate length delta" do
    isc = Licensee::Licenses.find("isc")
    mit = Licensee::Licenses.find("mit")
    assert_equal 2, Licensee::LevenshteinMatcher.new(@mit).length_delta(mit)
    assert_equal 336, Licensee::LevenshteinMatcher.new(@mit).length_delta(isc)
  end

  should "round up potential licenses" do
    candidates = Licensee::LevenshteinMatcher.new(@mit).potential_licenses
    assert_equal 5, candidates.size
  end
end

View File

@ -8,7 +8,6 @@ class TestLicenseeLicense < Minitest::Test
should "read the license body" do
assert @license.body
assert @license.length > 0
assert @license.text =~ /MIT/
end

View File

@ -14,17 +14,6 @@ class TestLicenseeLicenseFile < Minitest::Test
assert @file.contents =~ /MIT/
end
should "known the file length" do
assert_equal 1077, @file.length
end
should "calculate similiarty" do
actual = @file.send(:calculate_similarity, @mit)
assert actual > Licensee::CONFIDENCE_THRESHOLD, "expected #{actual} to be > 90% for MIT"
actual = @file.send(:calculate_similarity, @gpl)
assert actual < 1, "expected #{actual} to be < 1% for GPL"
end
should "match the license" do
assert_equal "mit", @file.match.name
end
@ -33,4 +22,8 @@ class TestLicenseeLicenseFile < Minitest::Test
expected = "-Copyright (c) [year] [fullname]\n+Copyright (c) 2014 Ben Balter"
assert @file.diff.include?(expected)
end
should "calculate confidence" do
assert_equal 94, @file.confidence
end
end

View File

@ -0,0 +1,7 @@
require 'helper'
# The base matcher never matches anything, even when handed a nil file.
class TestLicenseeMatcher < Minitest::Test
  should "match the license without raising an error" do
    result = Licensee::Matcher.match(nil)
    assert_nil result
  end
end