From 2e2b2c74c5207ddb00d201e21a7a64ec84332f66 Mon Sep 17 00:00:00 2001 From: Ben Balter Date: Fri, 10 Oct 2014 16:42:34 -0400 Subject: [PATCH] comment all the things --- lib/licensee.rb | 6 ++- lib/licensee/license.rb | 16 ++++++-- lib/licensee/license_file.rb | 6 +++ lib/licensee/licenses.rb | 39 ++++++++++++-------- lib/licensee/matcher.rb | 11 +++--- lib/licensee/matchers/exact_matcher.rb | 6 +-- lib/licensee/matchers/git_matcher.rb | 8 ++-- lib/licensee/matchers/levenshtein_matcher.rb | 15 +++++++- lib/licensee/project.rb | 17 +++++---- test/test_licensee_exact_matcher.rb | 4 -- test/test_licensee_git_matcher.rb | 4 -- test/test_licensee_licenses.rb | 4 +- 12 files changed, 83 insertions(+), 53 deletions(-) diff --git a/lib/licensee.rb b/lib/licensee.rb index 198c417..3a2b77e 100644 --- a/lib/licensee.rb +++ b/lib/licensee.rb @@ -12,14 +12,18 @@ require_relative "licensee/matchers/git_matcher" require_relative "licensee/matchers/levenshtein_matcher" class Licensee + + # Over what percent is a match considered a match CONFIDENCE_THRESHOLD = 90 class << self + # Returns an array of Licensee::License instances def licenses - Licensee::Licenses.list + @licenses ||= Licensee::Licenses.list end + # Returns the license for a given git repo def license(path) Licensee::Project.new(path).license end diff --git a/lib/licensee/license.rb b/lib/licensee/license.rb index 461e9e6..a476925 100644 --- a/lib/licensee/license.rb +++ b/lib/licensee/license.rb @@ -6,34 +6,36 @@ class Licensee @name=name.downcase end + # Path to vendored license file on disk def path @path ||= File.expand_path "#{@name}.txt", Licensee::Licenses.base end + # Raw content of license file, including YAML front matter def content @content ||= File.open(path).read end - def parts - @parts ||= content.match(/^(---\n.*\n---)?(.*)/m).to_a - end - + # License metadata from YAML front matter def meta @meta ||= front_matter = YAML.load(parts[1]) if parts[1] rescue nil end + # The license body (e.g., contents - 
frontmatter) def body @body ||= parts[2] end alias_method :to_s, :body alias_method :text, :body + # License body with all whitespace replaced with a single space def body_normalized @content_normalized ||= body.downcase.gsub(/\s+/, " ").strip end + # Git-computed hash signature for the license file def hashsig @hashsig ||= Rugged::Blob::HashSignature.new( body, Rugged::Blob::HashSignature::WHITESPACE_SMART) @@ -42,5 +44,11 @@ class Licensee def inspect "#" end + + private + + def parts + @parts ||= content.match(/^(---\n.*\n---)?(.*)/m).to_a + end end end diff --git a/lib/licensee/license_file.rb b/lib/licensee/license_file.rb index ec0d083..9959cb5 100644 --- a/lib/licensee/license_file.rb +++ b/lib/licensee/license_file.rb @@ -7,6 +7,7 @@ class Licensee blob.hashsig(Rugged::Blob::HashSignature::WHITESPACE_SMART) end + # Raw file contents def content @contents ||= begin blob.content @@ -15,23 +16,28 @@ class Licensee alias_method :to_s, :content alias_method :contents, :content + # File content with all whitespace replaced with a single space def content_normalized @content_normalized ||= content.downcase.gsub(/\s+/, " ").strip end + # Computes a diff between known license and project license def diff(options={}) options = options.merge(:reverse => true) blob.diff(match.body, options).to_s if match end + # Determines which matching strategy to use, returns an instance of that matcher def matcher @matcher ||= Licensee.matchers.map { |m| m.new(self) }.find { |m| m.match } end + # Returns a Licensee::License instance of the matched license def match @match ||= matcher.match if matcher end + # Returns the percent confidence in the match def confidence @condience ||= matcher.confidence if matcher end diff --git a/lib/licensee/licenses.rb b/lib/licensee/licenses.rb index 2e99e87..425fe0f 100644 --- a/lib/licensee/licenses.rb +++ b/lib/licensee/licenses.rb @@ -1,6 +1,29 @@ class Licensee class Licenses class << self + + # Returns an array of Licensee::License 
instances + def list + @licenses ||= begin + licenses = [] + names.each { |name| licenses.push License.new(name) } + licenses + end + end + + # Given a license name, attempt to return a matching Licensee::License instance + def find(name) + list.find { |l| l.name.downcase == name.downcase } + end + + # Path to vendored licenses + def base + @base ||= File.expand_path "../../vendor/choosealicense.com/_licenses", File.dirname(__FILE__) + end + + private + + # Returns a list of potential license names, as vendored def names @names ||= begin names = Dir.entries(base) @@ -10,22 +33,6 @@ class Licensee end end - def list - @licenses ||= begin - licenses = [] - names.each { |name| licenses.push License.new(name) } - licenses - end - end - - def base - @base ||= File.expand_path "../../vendor/choosealicense.com/_licenses", File.dirname(__FILE__) - end - - def find(name) - name = name.downcase - list.find { |l| l.name.downcase == name } - end end end end diff --git a/lib/licensee/matcher.rb b/lib/licensee/matcher.rb index bf880b6..b971cc6 100644 --- a/lib/licensee/matcher.rb +++ b/lib/licensee/matcher.rb @@ -1,3 +1,9 @@ +# Abstract class to describe different matching strategies +# Must respond to: +# - match +# - confidence +# +# Can assume file will be a Licensee::LicenseFile instance class Licensee class Matcher attr_reader :file @@ -10,10 +16,6 @@ class Licensee @file = file end - def matches - [] - end - def match nil end @@ -22,6 +24,5 @@ class Licensee 0 end alias_method :similarity, :confidence - end end diff --git a/lib/licensee/matchers/exact_matcher.rb b/lib/licensee/matchers/exact_matcher.rb index fcc66cb..95e2921 100644 --- a/lib/licensee/matchers/exact_matcher.rb +++ b/lib/licensee/matchers/exact_matcher.rb @@ -1,11 +1,7 @@ class Licensee class ExactMatcher < Matcher - def matches - [match] - end - def match - Licensee::Licenses.list.find { |l| l.body_normalized == file.content_normalized } + Licensee.licenses.find { |l| l.body_normalized == 
file.content_normalized } end def confidence diff --git a/lib/licensee/matchers/git_matcher.rb b/lib/licensee/matchers/git_matcher.rb index f9bbcd9..a5984ac 100644 --- a/lib/licensee/matchers/git_matcher.rb +++ b/lib/licensee/matchers/git_matcher.rb @@ -1,10 +1,6 @@ class Licensee class GitMatcher < Matcher - def matches - @matches ||= Licensee::Licenses.list.map { |l| [l, similarity(l)] }.select { |l,sim| sim > 0 } - end - def match match_info[0] unless match_info.nil? end @@ -15,6 +11,10 @@ class Licensee private + def matches + @matches ||= Licensee.licenses.map { |l| [l, similarity(l)] }.select { |l,sim| sim > 0 } + end + def similarity(other) file.blob.similarity(other.hashsig) end diff --git a/lib/licensee/matchers/levenshtein_matcher.rb b/lib/licensee/matchers/levenshtein_matcher.rb index af3ea34..6489f9f 100644 --- a/lib/licensee/matchers/levenshtein_matcher.rb +++ b/lib/licensee/matchers/levenshtein_matcher.rb @@ -1,40 +1,53 @@ class Licensee class LevenshteinMatcher < Matcher + # Return the first potential license that is more similar than the confidence threshold def match @match ||= potential_licenses.find do |license| similarity(license) >= Licensee::CONFIDENCE_THRESHOLD end end + # Sort all licenses, in descending order, by difference in length to the file + # Difference in lengths cannot exceed the file's length * the confidence threshold / 100 def potential_licenses @potential_licenses ||= begin - Licensee::Licenses.list.select { |license| length_delta(license) <= max_delta }.sort_by { |l| length_delta(l) }.reverse + Licensee.licenses.select { |license| length_delta(license) <= max_delta }.sort_by { |l| length_delta(l) }.reverse end end + # Calculate the difference between the file length and a given license's length def length_delta(license) (file_length - license.body_normalized.length).abs end + # Maximum possible difference between file length and license length + # for a license to be a potential license to be matched def max_delta @max_delta 
||= (file_length * (Licensee::CONFIDENCE_THRESHOLD.to_f / 100.to_f )) end + # Confidence that the matched license is a match def confidence @confidence ||= match ? similarity(match) : 0 end private + # Length of the file, normalized to strip whitespace def file_length @file_length ||= file.content_normalized.length.to_f end + # Calculate percent similarity between file and potential license def similarity(license) 100 * (file_length - distance(license)) / file_length end + # Calculate the levenshtein distance between file and license + # Note: We used content/body normalized because white space and capitalization + # isn't legally significant in this context. Fewer characters lets levenshtein + # work faster. As long as they both undergo the same transformation, should match. def distance(license) Levenshtein.distance(license.body_normalized, file.content_normalized).to_f end diff --git a/lib/licensee/project.rb b/lib/licensee/project.rb index 24bf382..83e6363 100644 --- a/lib/licensee/project.rb +++ b/lib/licensee/project.rb @@ -2,7 +2,8 @@ class Licensee class Project attr_reader :repository - VALID_FILENAMES = %w[ + # Array of file names to look for potential license files, in order + LICENSE_FILENAMES = %w[ LICENSE LICENSE.txt LICENSE.md @@ -10,6 +11,10 @@ class Licensee COPYING ] + # Initializes a new project + # + # path_or_repo - path to git repo or Rugged::Repository instance + # revision - revision ref, if any def initialize(path_or_repo, revision = nil) if path_or_repo.kind_of? Rugged::Repository @repository = path_or_repo @@ -20,22 +25,20 @@ class Licensee @revision = revision end + # Detects the license file, if any + # Returns a Licensee::LicenseFile instance def license_file return @license_file if defined? @license_file commit = @revision ? @repository.lookup(@revision) : @repository.last_commit - license_blob = commit.tree.each_blob { |blob| break blob if VALID_FILENAMES.include? 
blob[:name] } - + license_blob = commit.tree.each_blob { |blob| break blob if LICENSE_FILENAMES.include? blob[:name] } @license_file = if license_blob LicenseFile.new(@repository.lookup(license_blob[:oid])) end end - def matches - @matches ||= license_file.matches if license_file - end - + # Returns the matching Licensee::License instance if a license can be detected def license @license ||= license_file.match if license_file end diff --git a/test/test_licensee_exact_matcher.rb b/test/test_licensee_exact_matcher.rb index 48ce52f..fba6bd9 100644 --- a/test/test_licensee_exact_matcher.rb +++ b/test/test_licensee_exact_matcher.rb @@ -15,8 +15,4 @@ class TestLicenseeExactMatcher < Minitest::Test should "know the match confidence" do assert_equal 100, Licensee::ExactMatcher.new(@mit).confidence end - - should "know the matches" do - assert_equal 1, Licensee::ExactMatcher.new(@mit).matches.size - end end diff --git a/test/test_licensee_git_matcher.rb b/test/test_licensee_git_matcher.rb index 70a297d..53c2bf3 100644 --- a/test/test_licensee_git_matcher.rb +++ b/test/test_licensee_git_matcher.rb @@ -15,8 +15,4 @@ class TestLicenseeGitMatcher < Minitest::Test should "know the match confidence" do assert_equal 94, Licensee::GitMatcher.new(@mit).confidence end - - should "know the matches" do - assert_equal 1, Licensee::GitMatcher.new(@mit).matches.size - end end diff --git a/test/test_licensee_licenses.rb b/test/test_licensee_licenses.rb index 4c3780e..8ea6645 100644 --- a/test/test_licensee_licenses.rb +++ b/test/test_licensee_licenses.rb @@ -3,8 +3,8 @@ require 'helper' class TestLicenseeLicenses < Minitest::Test should "know license names" do - assert_equal Array, Licensee::Licenses.names.class - assert_equal 15, Licensee::Licenses.names.size + assert_equal Array, Licensee::Licenses.send(:names).class + assert_equal 15, Licensee::Licenses.send(:names).size end should "load the licenses" do