comment all the things

This commit is contained in:
Ben Balter 2014-10-10 16:42:34 -04:00
parent 8e230f4151
commit 2e2b2c74c5
12 changed files with 83 additions and 53 deletions

View File

@ -12,14 +12,18 @@ require_relative "licensee/matchers/git_matcher"
require_relative "licensee/matchers/levenshtein_matcher"
class Licensee
# Over what percent is a match considered a match
CONFIDENCE_THRESHOLD = 90
class << self
# Returns an array of Licensee::License instances.
# The value assigned on the last line is memoized in @licenses.
def licenses
# NOTE(review): the result of this bare call is discarded; it looks like the
# pre-change line of the diff sitting next to its memoized replacement — confirm.
Licensee::Licenses.list
@licenses ||= Licensee::Licenses.list
end
# Detects the license of the git repo at the given path.
#
# path - handed straight to Licensee::Project.new
#
# Returns whatever Licensee::Project#license returns for that project.
def license(path)
  project = Licensee::Project.new(path)
  project.license
end

View File

@ -6,34 +6,36 @@ class Licensee
@name=name.downcase
end
# Absolute path of this license's vendored text file: "<name>.txt" resolved
# against Licensee::Licenses.base. Memoized.
def path
  return @path if @path

  filename = "#{@name}.txt"
  @path = File.expand_path(filename, Licensee::Licenses.base)
end
# Raw content of license file, including YAML front matter.
#
# Uses File.read so the file handle is closed as soon as the read finishes,
# instead of being left open until the File object is garbage collected.
# Memoized in @content.
def content
  @content ||= File.read(path)
end
# Splits the raw content into [whole match, front matter or nil, body].
# The front matter group is optional, and its `.*` is greedy under the /m
# flag, so it runs to the last "\n---" found in the content. Memoized.
def parts
@parts ||= content.match(/^(---\n.*\n---)?(.*)/m).to_a
end
# License metadata parsed from the YAML front matter.
#
# Returns the loaded front matter, or nil when there is no front matter or
# when loading it raises (best effort: the error is deliberately swallowed).
def meta
  return unless parts[1]

  @meta ||= YAML.load(parts[1])
rescue StandardError
  nil
end
# The license body: the third element of `parts`, i.e. the content with the
# front matter removed. Cached in @body once it is truthy.
def body
  return @body if @body

  @body = parts[2]
end
alias_method :to_s, :body
alias_method :text, :body
# License body lower-cased, with every run of whitespace collapsed to a
# single space and leading/trailing whitespace stripped.
#
# Memoized in @body_normalized (previously stored under the unrelated name
# @content_normalized).
def body_normalized
  @body_normalized ||= body.downcase.gsub(/\s+/, " ").strip
end
# Git-computed hash signature for the license file
def hashsig
@hashsig ||= Rugged::Blob::HashSignature.new(
body, Rugged::Blob::HashSignature::WHITESPACE_SMART)
@ -42,5 +44,11 @@ class Licensee
def inspect
"#<Licensee::License name=\"#{name}\">"
end
private
# Splits the raw content into [whole match, front matter or nil, body].
#
# The front matter group is lazy (`.*?`) so it stops at the first closing
# "---" line instead of running to the last "\n---" anywhere in the license
# text, and \A pins it to the very start of the content. Memoized.
def parts
  @parts ||= content.match(/\A(---\n.*?\n---)?(.*)/m).to_a
end
end
end

View File

@ -7,6 +7,7 @@ class Licensee
blob.hashsig(Rugged::Blob::HashSignature::WHITESPACE_SMART)
end
# Raw file contents
def content
@contents ||= begin
blob.content
@ -15,23 +16,28 @@ class Licensee
alias_method :to_s, :content
alias_method :contents, :content
# File content lower-cased, with every run of whitespace collapsed to one
# space and leading/trailing whitespace removed. Memoized.
def content_normalized
  return @content_normalized if @content_normalized

  lowered = content.downcase
  @content_normalized = lowered.gsub(/\s+/, " ").strip
end
# Computes a diff between the known license and the project license.
#
# options - Hash of options handed to blob.diff; :reverse is always set
#           to true on the copy that is passed along.
#
# Returns the diff converted with to_s, or nil when there is no match.
def diff(options={})
  reversed = options.merge(:reverse => true)
  return unless match

  blob.diff(match.body, reversed).to_s
end
# Determines which matching strategy to use: builds one instance of every
# class in Licensee.matchers around this file and keeps the first whose
# #match is truthy. Returns an instance of that matcher (memoized), or nil.
def matcher
  return @matcher if @matcher

  candidates = Licensee.matchers.map { |klass| klass.new(self) }
  @matcher = candidates.find(&:match)
end
# Returns the Licensee::License instance of the matched license, as reported
# by the selected matcher, or nil when no matcher applies.
def match
  return unless matcher

  @match ||= matcher.match
end
# Returns the percent confidence of the match, as reported by the selected
# matcher, or nil when no matcher applies.
#
# Memoized in @confidence (the variable was previously misspelled
# @condience).
def confidence
  @confidence ||= matcher.confidence if matcher
end

View File

@ -1,6 +1,29 @@
class Licensee
class Licenses
class << self
# Returns an array of Licensee::License instances, one per entry in `names`.
# Built once and memoized in @licenses.
def list
  @licenses ||= names.map { |name| License.new(name) }
end
# Given a license name, attempt to return a matching Licensee::License
# instance. The comparison ignores case.
#
# name - String name of the license to look up
#
# Returns the first license in `list` whose name matches, or nil.
def find(name)
  # Downcase once up front instead of once per license inside the block.
  wanted = name.downcase
  list.find { |license| license.name.downcase == wanted }
end
# Path to the vendored licenses directory, resolved relative to the
# directory containing this file. Memoized.
def base
  return @base if @base

  here = File.dirname(__FILE__)
  @base = File.expand_path("../../vendor/choosealicense.com/_licenses", here)
end
private
# Returns a list of potential license names, as vendored
def names
@names ||= begin
names = Dir.entries(base)
@ -10,22 +33,6 @@ class Licensee
end
end
# Returns an array of License instances, one per entry in `names`.
# Built once and memoized in @licenses.
def list
  @licenses ||= names.map { |name| License.new(name) }
end
# Path to the vendored licenses directory, resolved relative to the
# directory containing this file. Memoized.
def base
  return @base if @base

  here = File.dirname(__FILE__)
  @base = File.expand_path("../../vendor/choosealicense.com/_licenses", here)
end
# Looks up a license by name, ignoring case.
#
# name - String name of the license to look up
#
# Returns the first license in `list` whose name matches, or nil.
def find(name)
  wanted = name.downcase
  list.find { |license| license.name.downcase == wanted }
end
end
end
end

View File

@ -1,3 +1,9 @@
# Abstract class to describe different matching strategies
# Must respond to:
# - match
# - confidence
#
# Can assume file will be a Licensee::LicenseFile instance
class Licensee
class Matcher
attr_reader :file
@ -10,10 +16,6 @@ class Licensee
@file = file
end
# Default implementation: returns an empty array.
def matches
[]
end
# Default implementation: returns nil, i.e. no match.
def match
nil
end
@ -22,6 +24,5 @@ class Licensee
0
end
alias_method :similarity, :confidence
end
end

View File

@ -1,11 +1,7 @@
class Licensee
class ExactMatcher < Matcher
# Wraps the result of `match` in a one-element array.
def matches
[match]
end
# Returns the first license whose normalized body equals the file's
# normalized content, or nil when none does.
def match
# NOTE(review): the result of this first lookup is discarded; it appears to be
# the pre-change line of the diff next to its replacement below — confirm.
Licensee::Licenses.list.find { |l| l.body_normalized == file.content_normalized }
Licensee.licenses.find { |l| l.body_normalized == file.content_normalized }
end
def confidence

View File

@ -1,10 +1,6 @@
class Licensee
class GitMatcher < Matcher
# Pairs every license in Licensee::Licenses.list with its similarity to the
# file and keeps only the pairs whose similarity is above zero. Memoized.
def matches
  return @matches if @matches

  scored = Licensee::Licenses.list.map { |license| [license, similarity(license)] }
  @matches = scored.select { |_license, score| score > 0 }
end
# Returns the first element of match_info, or nil when match_info is nil.
def match
  match_info.nil? ? nil : match_info[0]
end
@ -15,6 +11,10 @@ class Licensee
private
# Pairs every license in Licensee.licenses with its similarity to the file
# and keeps only the pairs whose similarity is above zero. Memoized.
def matches
  return @matches if @matches

  scored = Licensee.licenses.map { |license| [license, similarity(license)] }
  @matches = scored.select { |_license, score| score > 0 }
end
# Similarity between the file's blob and the given license, obtained by
# passing the license's hash signature to the blob's #similarity.
#
# other - object responding to #hashsig
def similarity(other)
  blob = file.blob
  blob.similarity(other.hashsig)
end

View File

@ -1,40 +1,53 @@
class Licensee
class LevenshteinMatcher < Matcher
# Returns the first potential license whose similarity to the file is at
# least Licensee::CONFIDENCE_THRESHOLD, or nil when none qualifies.
# Memoized once a license is found.
def match
  return @match if @match

  @match = potential_licenses.find { |candidate| similarity(candidate) >= Licensee::CONFIDENCE_THRESHOLD }
end
# Licenses whose length differs from the file's length by no more than
# max_delta, sorted in descending order of that length difference. Memoized.
def potential_licenses
@potential_licenses ||= begin
# NOTE(review): the result of this first expression is discarded; it appears to
# be the pre-change line of the diff next to its replacement below — confirm.
Licensee::Licenses.list.select { |license| length_delta(license) <= max_delta }.sort_by { |l| length_delta(l) }.reverse
Licensee.licenses.select { |license| length_delta(license) <= max_delta }.sort_by { |l| length_delta(l) }.reverse
end
end
# Absolute difference between the file's length and the length of the given
# license's normalized body.
def length_delta(license)
  difference = file_length - license.body_normalized.length
  difference.abs
end
# Maximum difference between the file length and a license's length for
# that license to still be worth comparing.
#
# A license only matches when its similarity reaches CONFIDENCE_THRESHOLD,
# i.e. when its edit distance is at most (100 - threshold) percent of the
# file length, and an edit distance is never smaller than the difference in
# lengths. The bound is therefore the inverse of the threshold; scaling by
# the threshold itself (90%) let nearly every license through the filter.
# Memoized.
def max_delta
  @max_delta ||= file_length * (100 - Licensee::CONFIDENCE_THRESHOLD) / 100.0
end
# Confidence that the matched license is a match: the similarity of the
# matched license, or 0 when nothing matched. Memoized.
def confidence
  return @confidence if @confidence

  @confidence = if match
                  similarity(match)
                else
                  0
                end
end
private
# Length of the file's normalized content, as a Float. Memoized.
def file_length
  return @file_length if @file_length

  normalized = file.content_normalized
  @file_length = normalized.length.to_f
end
# Percent similarity between the file and a potential license: the share of
# the file's length that remains after subtracting the edit distance,
# multiplied by 100.
def similarity(license)
  unchanged = file_length - distance(license)
  100 * unchanged / file_length
end
# Levenshtein distance between the license and the file, as a Float.
# Both sides are compared in their normalized form: whitespace and
# capitalization aren't legally significant in this context, fewer
# characters lets Levenshtein work faster, and since both undergo the same
# transformation they should still match.
def distance(license)
  edits = Levenshtein.distance(license.body_normalized, file.content_normalized)
  edits.to_f
end

View File

@ -2,7 +2,8 @@ class Licensee
class Project
attr_reader :repository
VALID_FILENAMES = %w[
# Array of file names to look for potential license files, in order
LICENSE_FILENAMES = %w[
LICENSE
LICENSE.txt
LICENSE.md
@ -10,6 +11,10 @@ class Licensee
COPYING
]
# Initializes a new project
#
# path_or_repo - path to git repo or Rugged::Repository instance
# revision - revision ref, if any
def initialize(path_or_repo, revision = nil)
if path_or_repo.kind_of? Rugged::Repository
@repository = path_or_repo
@ -20,22 +25,20 @@ class Licensee
@revision = revision
end
# Detects the license file, if any
# Returns a Licensee::LicenseFile instance
def license_file
# Memoize with `defined?` so that a nil result is cached as well.
return @license_file if defined? @license_file
# Look up the given revision when one was set, otherwise use the last commit.
commit = @revision ? @repository.lookup(@revision) : @repository.last_commit
# NOTE(review): the next two lines differ only in the constant they consult and
# the second overwrites the first; they look like the pre- and post-change
# lines of the diff — confirm which constant is actually defined.
license_blob = commit.tree.each_blob { |blob| break blob if VALID_FILENAMES.include? blob[:name] }
license_blob = commit.tree.each_blob { |blob| break blob if LICENSE_FILENAMES.include? blob[:name] }
# Wrap the blob's object in a LicenseFile; the `if` without an else leaves
# @license_file nil when license_blob is falsy.
@license_file = if license_blob
LicenseFile.new(@repository.lookup(license_blob[:oid]))
end
end
# Returns the license file's matches, or nil when no license file exists.
def matches
  return unless license_file

  @matches ||= license_file.matches
end
# Returns the license file's match (a Licensee::License instance when a
# license can be detected), or nil when there is no license file.
def license
  return unless license_file

  @license ||= license_file.match
end

View File

@ -15,8 +15,4 @@ class TestLicenseeExactMatcher < Minitest::Test
should "know the match confidence" do
assert_equal 100, Licensee::ExactMatcher.new(@mit).confidence
end
should "know the matches" do
assert_equal 1, Licensee::ExactMatcher.new(@mit).matches.size
end
end

View File

@ -15,8 +15,4 @@ class TestLicenseeGitMatcher < Minitest::Test
should "know the match confidence" do
assert_equal 94, Licensee::GitMatcher.new(@mit).confidence
end
should "know the matches" do
assert_equal 1, Licensee::GitMatcher.new(@mit).matches.size
end
end

View File

@ -3,8 +3,8 @@ require 'helper'
class TestLicenseeLicenses < Minitest::Test
should "know license names" do
assert_equal Array, Licensee::Licenses.names.class
assert_equal 15, Licensee::Licenses.names.size
assert_equal Array, Licensee::Licenses.send(:names).class
assert_equal 15, Licensee::Licenses.send(:names).size
end
should "load the licenses" do