mirror of https://github.com/licensee/licensee.git
push CC false positive logic into the dice matcher
This commit is contained in:
parent
19a84890b6
commit
c8f0ea23c0
|
@ -30,19 +30,6 @@ module Licensee
|
|||
(length - other.length).abs
|
||||
end
|
||||
|
||||
# Modify similarity to avoid known false positives in the context of particular
|
||||
# licenses, e.g. https://github.com/benbalter/licensee/issues/116
# Returns similarity(other) unchanged, except that a license whose key starts
# with 'cc-by' and whose score reaches Licensee.confidence_threshold is reported
# as threshold - 1 when other.content mentions 'NonCommercial' or 'NoDeriv'.
|
||||
def license_similarity(other)
|
||||
s = similarity(other)
|
||||
if key.start_with?('cc-by') &&
|
||||
s >= Licensee.confidence_threshold &&
|
||||
(other.content.include?('NonCommercial') ||
|
||||
other.content.include?('NoDeriv'))
|
||||
# Report a score one below the threshold instead of the real score
return Licensee.confidence_threshold - 1
|
||||
end
|
||||
s
|
||||
end
|
||||
|
||||
# Given another license or project file, calculates the similarity
|
||||
# as a percentage of words in common
|
||||
def similarity(other)
|
||||
|
|
|
@ -111,6 +111,11 @@ module Licensee
|
|||
key == 'gpl-2.0' || key == 'gpl-3.0'
|
||||
end
|
||||
|
||||
# Is this license a Creative Commons license?
# True when the license key begins with 'cc-'.
|
||||
def creative_commons?
|
||||
key.start_with?('cc-')
|
||||
end
|
||||
|
||||
# The license body (e.g., contents - frontmatter)
|
||||
def content
|
||||
@content ||= parts[2] if parts && parts[2]
|
||||
|
|
|
@ -18,20 +18,29 @@ module Licensee
|
|||
end
|
||||
|
||||
# Licenses that may be a match for this file.
|
||||
# To avoid false positives, the percentage change in file length
|
||||
# may not exceed the inverse of the confidence threshold
|
||||
# To avoid false positives:
|
||||
#
|
||||
# 1. Creative Commons licenses cannot be matched against license files
|
||||
# that begin with the title of a non-open source CC license variant
|
||||
# 2. The percentage change in file length may not exceed the inverse
|
||||
# of the confidence threshold
# The selection is computed once and cached in @potential_licenses.
|
||||
def potential_licenses
|
||||
@potential_licenses ||= begin
|
||||
Licensee.licenses(hidden: true).select do |license|
|
||||
# NOTE(review): this is a diff view without +/- markers; the next statement
# looks like the pre-change version of the if/else below - confirm upstream.
license.wordset && license.length_delta(file) <= license.max_delta
|
||||
# Reject every CC license outright when the file is flagged as a
# potential CC false positive
if license.creative_commons? && file.potential_false_positive?
|
||||
false
|
||||
else
|
||||
# Otherwise keep licenses that have a wordset and whose length delta
# against the file is within the license's max_delta
license.wordset && license.length_delta(file) <= license.max_delta
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
# Potential licenses paired with their similarity to the file, as
# [license, similarity] arrays ordered from highest to lowest similarity.
# The result is cached in @licenses_by_similiarity.
def licenses_by_similiarity
|
||||
@licenses_by_similiarity ||= begin
|
||||
# NOTE(review): this is a diff view without +/- markers; the assignment using
# license_similarity looks like the pre-change version of the one using
# similarity that follows it - confirm upstream.
licenses =
|
||||
potential_licenses.map { |l| [l, l.license_similarity(file)] }
|
||||
licenses = potential_licenses.map do |license|
|
||||
[license, license.similarity(file)]
|
||||
end
|
||||
# sort_by is ascending, so reverse to put the best score first
licenses.sort_by { |_, similarity| similarity }.reverse
|
||||
end
|
||||
end
|
||||
|
|
|
@ -29,6 +29,12 @@ module Licensee
|
|||
// => 0.0 # Catch all
|
||||
}.freeze
|
||||
|
||||
# CC-NC and CC-ND are not open source licenses and should always be
|
||||
# detected as the "other" license
# The pattern is anchored to the start of the string (\A) and accepts an
# optional "creative commons " prefix before "Attribution-NonCommercial" or
# "Attribution-NoDerivatives". The x flag ignores unescaped whitespace in the
# pattern (hence the escaped spaces); the i flag makes it case-insensitive.
|
||||
CC_FALSE_POSITIVE_REGEX = /
|
||||
\A(creative\ commons\ )?Attribution-(NonCommercial|NoDerivatives)
|
||||
/xi
|
||||
|
||||
# Matcher classes applicable to this file: Copyright, Exact and Dice,
# returned in that order.
def possible_matchers
|
||||
[Matchers::Copyright, Matchers::Exact, Matchers::Dice]
|
||||
end
|
||||
|
@ -38,6 +44,11 @@ module Licensee
|
|||
matches[0].strip if matches
|
||||
end
|
||||
|
||||
# Is this file likely to result in a Creative Commons false positive?
# Returns the match index (truthy) when the stripped content matches
# CC_FALSE_POSITIVE_REGEX, and nil otherwise.
|
||||
def potential_false_positive?
|
||||
content.strip =~ CC_FALSE_POSITIVE_REGEX
|
||||
end
|
||||
|
||||
# Returns the value paired with the first FILENAME_REGEXES entry whose regex
# matches filename.
# NOTE(review): find returns nil when nothing matches, which would make [1]
# raise; presumably a catch-all entry in FILENAME_REGEXES prevents that - confirm.
def self.name_score(filename)
|
||||
FILENAME_REGEXES.find { |regex, _| filename =~ regex }[1]
|
||||
end
|
||||
|
|
|
@ -168,6 +168,11 @@ RSpec.describe Licensee::License do
|
|||
expect(mit).to_not be_gpl
|
||||
expect(gpl).to be_gpl
|
||||
end
|
||||
|
||||
# Exercises License#creative_commons? through RSpec's be_creative_commons
# predicate matcher; gpl and cc_by are defined outside this hunk.
it 'knows if a license is CC' do
|
||||
expect(gpl).to_not be_creative_commons
|
||||
expect(cc_by).to be_creative_commons
|
||||
end
|
||||
end
|
||||
|
||||
context 'content' do
|
||||
|
|
|
@ -2,6 +2,8 @@ RSpec.describe Licensee::Matchers::Dice do
|
|||
let(:mit) { Licensee::License.find('mit') }
|
||||
let(:gpl) { Licensee::License.find('gpl-3.0') }
|
||||
let(:agpl) { Licensee::License.find('agpl-3.0') }
|
||||
let(:cc_by) { Licensee::License.find('cc-by-4.0') }
|
||||
let(:cc_by_sa) { Licensee::License.find('cc-by-sa-4.0') }
|
||||
let(:content) { sub_copyright_info(gpl.content) }
|
||||
let(:file) { Licensee::Project::LicenseFile.new(content, 'LICENSE.txt') }
|
||||
subject { described_class.new(file) }
|
||||
|
@ -55,4 +57,28 @@ RSpec.describe Licensee::Matchers::Dice do
|
|||
expect(subject.confidence).to eql(0)
|
||||
end
|
||||
end
|
||||
|
||||
# Checks that the Dice matcher still detects CC-BY itself but yields no match
# for a CC-BY-ND license file.
context 'CC false positive' do
|
||||
context 'CC-BY' do
|
||||
# The file content is CC-BY's own normalized text
let(:content) { cc_by.content_normalized }
|
||||
|
||||
it 'matches' do
|
||||
expect(content).to be_detected_as(cc_by)
|
||||
end
|
||||
end
|
||||
|
||||
context 'CC-ND' do
|
||||
# The file content is read from the LICENSE file of the 'cc-by-nd' fixture
let(:project_path) { fixture_path('cc-by-nd') }
|
||||
let(:license_path) { File.expand_path('LICENSE', project_path) }
|
||||
let(:content) { File.read(license_path) }
|
||||
|
||||
it "doesn't match" do
|
||||
# Neither open source CC variant may be detected ...
expect(content).to_not be_detected_as(cc_by)
|
||||
expect(content).to_not be_detected_as(cc_by_sa)
|
||||
# ... and the matcher itself must report no match at all
expect(subject.match).to be_nil
|
||||
expect(subject.matches).to be_empty
|
||||
expect(subject.confidence).to eql(0)
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -119,4 +119,40 @@ RSpec.describe Licensee::Project::LicenseFile do
|
|||
end
|
||||
end
|
||||
end
|
||||
|
||||
context 'CC false positives' do
|
||||
let(:regex) { Licensee::Project::LicenseFile::CC_FALSE_POSITIVE_REGEX }
|
||||
|
||||
it "knows MIT isn't a potential false positive" do
|
||||
expect(subject.content).to_not match(regex)
|
||||
expect(subject).to_not be_a_potential_false_positive
|
||||
end
|
||||
|
||||
context 'a CC false positive without creative commons in the title' do
|
||||
let(:content) { 'Creative Commons Attribution-NonCommercial 4.0' }
|
||||
|
||||
it "knows it's a potential false positive" do
|
||||
expect(subject.content).to match(regex)
|
||||
expect(subject).to be_a_potential_false_positive
|
||||
end
|
||||
end
|
||||
|
||||
context 'a CC false positive without creative commons in the title' do
|
||||
let(:content) { 'Attribution-NonCommercial 4.0 International' }
|
||||
|
||||
it "knows it's a potential false positive" do
|
||||
expect(subject.content).to match(regex)
|
||||
expect(subject).to be_a_potential_false_positive
|
||||
end
|
||||
end
|
||||
|
||||
context 'CC-BY-ND' do
|
||||
let(:content) { 'Attribution-NoDerivatives 4.0 International' }
|
||||
|
||||
it "knows it's a potential false positive" do
|
||||
expect(subject.content).to match(regex)
|
||||
expect(subject).to be_a_potential_false_positive
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
Loading…
Reference in New Issue