push CC false positive logic into the dice matcher

This commit is contained in:
Ben Balter 2017-02-21 15:49:57 -05:00
parent 19a84890b6
commit c8f0ea23c0
No known key found for this signature in database
GPG Key ID: DBB67C246AD356C4
7 changed files with 97 additions and 18 deletions

View File

@ -30,19 +30,6 @@ module Licensee
(length - other.length).abs
end
# Modify similarity to avoid known false positives in the context of
# particular licenses, e.g. https://github.com/benbalter/licensee/issues/116
# Wraps `similarity(other)`. For a license whose key starts with 'cc-by', a
# score at or above Licensee.confidence_threshold is replaced by
# threshold - 1 when the other file's content contains 'NonCommercial' or
# 'NoDeriv'. In every other case the raw similarity is returned unchanged.
def license_similarity(other)
  score = similarity(other)
  return score unless key.start_with?('cc-by')
  return score unless score >= Licensee.confidence_threshold

  restricted_variant =
    %w[NonCommercial NoDeriv].any? { |marker| other.content.include?(marker) }
  restricted_variant ? Licensee.confidence_threshold - 1 : score
end
# Given another license or project file, calculates the similarity
# as a percentage of words in common
def similarity(other)

View File

@ -111,6 +111,11 @@ module Licensee
key == 'gpl-2.0' || key == 'gpl-3.0'
end
# Is this license a Creative Commons license?
# Decided purely from the license key: true when it begins with 'cc-'.
def creative_commons?
  cc_prefix = 'cc-'
  key.start_with?(cc_prefix)
end
# The license body (e.g., contents - frontmatter)
def content
@content ||= parts[2] if parts && parts[2]

View File

@ -18,20 +18,29 @@ module Licensee
end
# Licenses that may be a match for this file.
# To avoid false positives:
#
# 1. Creative Commons licenses cannot be matched against license files
# that begin with the title of a non-open source CC license variant
# 2. The percentage change in file length may not exceed the inverse
# of the confidence threshold
# Memoized list of licenses worth comparing against `file`.
#
# Starts from Licensee.licenses(hidden: true) and keeps a license only when:
#   * it is not a Creative Commons license being checked against a file that
#     reports `potential_false_positive?`, and
#   * it has a wordset and its length delta to the file does not exceed its
#     own max_delta.
def potential_licenses
  @potential_licenses ||= Licensee.licenses(hidden: true).select do |license|
    # Rule out the CC false positive first so no length comparison is made
    # for a license that is going to be rejected anyway.
    next false if license.creative_commons? && file.potential_false_positive?

    license.wordset && license.length_delta(file) <= license.max_delta
  end
end
# Memoized list of [license, similarity] pairs, one per potential license,
# ordered from the highest similarity to the lowest. Each score comes from
# `license.similarity(file)`.
def licenses_by_similiarity
  @licenses_by_similiarity ||= begin
    scored = potential_licenses.map do |license|
      [license, license.similarity(file)]
    end
    scored.sort_by { |_, similarity| similarity }.reverse
  end
end

View File

@ -29,6 +29,12 @@ module Licensee
// => 0.0 # Catch all
}.freeze
# CC-NC and CC-ND are not open source licenses and should always be
# detected as the "other" license
# The pattern is anchored to the start of the text, is case-insensitive, and
# allows an optional "Creative Commons " prefix before the license title.
# "NoDerivs" is accepted alongside "NoDerivatives" because CC 3.0 and earlier
# titled the ND variants "Attribution-NoDerivs".
CC_FALSE_POSITIVE_REGEX = /
  \A(creative\ commons\ )?Attribution-(NonCommercial|NoDerivatives|NoDerivs)
/xi.freeze
# Matcher classes that may be applied to this file, listed as
# Copyright, Exact, Dice.
def possible_matchers
  [
    Matchers::Copyright,
    Matchers::Exact,
    Matchers::Dice
  ]
end
@ -38,6 +44,11 @@ module Licensee
matches[0].strip if matches
end
# Is this file likely to result in a creative commons false positive?
# Matches the whitespace-stripped content against CC_FALSE_POSITIVE_REGEX.
# Returns the match index (truthy) on a hit and nil otherwise.
def potential_false_positive?
  leading_text = content.strip
  CC_FALSE_POSITIVE_REGEX =~ leading_text
end
# Looks up the score for a filename: takes the first pair in FILENAME_REGEXES
# whose pattern matches `filename` and returns that pair's value.
def self.name_score(filename)
  matching_pair = FILENAME_REGEXES.find do |pattern, _score|
    filename =~ pattern
  end
  matching_pair.last
end

View File

@ -168,6 +168,11 @@ RSpec.describe Licensee::License do
expect(mit).to_not be_gpl
expect(gpl).to be_gpl
end
# `creative_commons?` must be false for the `gpl` license and true for the
# `cc_by` license (both helpers are defined outside this excerpt).
it 'knows if a license is CC' do
expect(gpl).to_not be_creative_commons
expect(cc_by).to be_creative_commons
end
end
context 'content' do

View File

@ -2,6 +2,8 @@ RSpec.describe Licensee::Matchers::Dice do
# Licenses looked up by key for use in the examples.
let(:mit) { Licensee::License.find('mit') }
let(:gpl) { Licensee::License.find('gpl-3.0') }
let(:agpl) { Licensee::License.find('agpl-3.0') }
let(:cc_by) { Licensee::License.find('cc-by-4.0') }
let(:cc_by_sa) { Licensee::License.find('cc-by-sa-4.0') }
# Default file content is derived from the GPL-3.0 body via
# `sub_copyright_info` (a helper defined outside this excerpt); nested
# contexts override `content` to test other texts.
let(:content) { sub_copyright_info(gpl.content) }
let(:file) { Licensee::Project::LicenseFile.new(content, 'LICENSE.txt') }
# The matcher under test, built around the license file above.
subject { described_class.new(file) }
@ -55,4 +57,28 @@ RSpec.describe Licensee::Matchers::Dice do
expect(subject.confidence).to eql(0)
end
end
# Checks how Creative Commons texts are handled: the CC-BY text itself versus
# the LICENSE file of the 'cc-by-nd' fixture.
context 'CC false positive' do
context 'CC-BY' do
# Uses the normalized CC-BY license text as the file content.
let(:content) { cc_by.content_normalized }
it 'matches' do
expect(content).to be_detected_as(cc_by)
end
end
context 'CC-ND' do
# Reads the LICENSE file from the 'cc-by-nd' fixture project
# (`fixture_path` is a helper defined outside this excerpt).
let(:project_path) { fixture_path('cc-by-nd') }
let(:license_path) { File.expand_path('LICENSE', project_path) }
let(:content) { File.read(license_path) }
it "doesn't match" do
# Neither CC license may be detected, and the matcher must report no match,
# no candidate matches, and zero confidence.
expect(content).to_not be_detected_as(cc_by)
expect(content).to_not be_detected_as(cc_by_sa)
expect(subject.match).to be_nil
expect(subject.matches).to be_empty
expect(subject.confidence).to eql(0)
end
end
end
end

View File

@ -119,4 +119,40 @@ RSpec.describe Licensee::Project::LicenseFile do
end
end
end
# Exercises CC_FALSE_POSITIVE_REGEX and `potential_false_positive?` against
# the default subject and against NonCommercial / NoDerivatives titles, with
# and without the "Creative Commons" prefix.
context 'CC false positives' do
  let(:regex) { Licensee::Project::LicenseFile::CC_FALSE_POSITIVE_REGEX }

  # `subject` comes from the enclosing example group, outside this excerpt.
  it "knows MIT isn't a potential false positive" do
    expect(subject.content).to_not match(regex)
    expect(subject).to_not be_a_potential_false_positive
  end

  # Title that carries the optional "Creative Commons" prefix.
  context 'a CC false positive with creative commons in the title' do
    let(:content) { 'Creative Commons Attribution-NonCommercial 4.0' }

    it "knows it's a potential false positive" do
      expect(subject.content).to match(regex)
      expect(subject).to be_a_potential_false_positive
    end
  end

  # Title that starts directly with "Attribution-".
  context 'a CC false positive without creative commons in the title' do
    let(:content) { 'Attribution-NonCommercial 4.0 International' }

    it "knows it's a potential false positive" do
      expect(subject.content).to match(regex)
      expect(subject).to be_a_potential_false_positive
    end
  end

  context 'CC-BY-ND' do
    let(:content) { 'Attribution-NoDerivatives 4.0 International' }

    it "knows it's a potential false positive" do
      expect(subject.content).to match(regex)
      expect(subject).to be_a_potential_false_positive
    end
  end
end
end