fix normalization errors

This commit is contained in:
Ben Balter 2018-11-02 11:43:42 -07:00
parent 998aeece2e
commit cc7b1350a9
No known key found for this signature in database
GPG Key ID: DBB67C246AD356C4
7 changed files with 115 additions and 73 deletions

View File

@ -8,27 +8,26 @@ module Licensee
END_OF_TERMS_REGEX = /^[\s#*_]*end of terms and conditions\s*$/i END_OF_TERMS_REGEX = /^[\s#*_]*end of terms and conditions\s*$/i
ALT_TITLE_REGEX = License::ALT_TITLE_REGEX ALT_TITLE_REGEX = License::ALT_TITLE_REGEX
REGEXES = { REGEXES = {
hrs: /^\s*[=\-\*][=\-\*]{2,}\s*/, hrs: /^\s*[=\-\*]{3,}\s*$/,
all_rights_reserved: /#{START_REGEX}all rights reserved\.?$/i, all_rights_reserved: /#{START_REGEX}all rights reserved\.?$/i,
whitespace: /\s+/, whitespace: /\s+/,
markdown_headings: /#{START_REGEX}#+/, markdown_headings: /#{START_REGEX}#+/,
version: /#{START_REGEX}version.*$/i, version: /#{START_REGEX}version.*$/i,
markup: /(?:[_*~`]+.*?[_*~`]+|^\s*[>-]|\[.*?\]\(.*?\))/, span_markup: /[_*~]+(.*?)[_*~]+/,
link_markup: /\[(.+?)\]\(.+?\)/,
block_markup: /^\s*>/,
border_markup: /^[\*-](.*?)[\*-]$/,
url: %r{#{START_REGEX}https?://[^ ]+\n}, url: %r{#{START_REGEX}https?://[^ ]+\n},
bullet: /\n\n\s*(?:[*-]|\(?[\da-z]{1,2}[)\.])\s+/i, bullet: /\n\n\s*(?:[*-]|\(?[\da-z]{1,2}[)\.])\s+/i,
developed_by: /#{START_REGEX}developed by:.*?\n\n/im, developed_by: /#{START_REGEX}developed by:.*?\n\n/im,
quote_begin: /[`'"‘“]/, quote_begin: /[`'"‘“]/,
quote_end: /['"’”]/ quote_end: /[`'"’”]/
}.freeze }.freeze
NORMALIZATIONS = { NORMALIZATIONS = {
lists: { from: /^\s*(\d\.|\*)/, to: '-' }, lists: { from: /^\s*(?:\d\.|\*)\s+([^\n])/, to: '- \1' },
https: { from: /http:/, to: 'https:' }, https: { from: /http:/, to: 'https:' },
ampersands: { from: '&', to: 'and' }, ampersands: { from: '&', to: 'and' },
dashes: { from: /[—–-]+/, to: '-' }, dashes: { from: /(?<!^)([—–-]+)(?!$)/, to: '-' },
copyright: {
from: /(?:copyright\ )?#{Matchers::Copyright::COPYRIGHT_SYMBOLS}/,
to: 'copyright'
},
quotes: { quotes: {
from: /#{REGEXES[:quote_begin]}+([\w -]*?\w)#{REGEXES[:quote_end]}+/, from: /#{REGEXES[:quote_begin]}+([\w -]*?\w)#{REGEXES[:quote_end]}+/,
to: '"\1"' to: '"\1"'
@ -82,7 +81,8 @@ module Licensee
'owner' => 'holder' 'owner' => 'holder'
}.freeze }.freeze
STRIP_METHODS = %i[ STRIP_METHODS = %i[
hrs markdown_headings borders markup title version url copyright hrs markdown_headings borders title version url copyright
block_markup span_markup link_markup
all_rights_reserved developed_by end_of_terms whitespace all_rights_reserved developed_by end_of_terms whitespace
].freeze ].freeze
@ -131,7 +131,7 @@ module Licensee
def content_without_title_and_version def content_without_title_and_version
@content_without_title_and_version ||= begin @content_without_title_and_version ||= begin
@_content = nil @_content = nil
%w[markdown_headings hrs title version].each { |op| strip(op) } %i[hrs markdown_headings title version].each { |op| strip(op) }
_content _content
end end
end end
@ -186,19 +186,21 @@ module Licensee
end end
def self.title_regex def self.title_regex
licenses = Licensee::License.all(hidden: true, psuedo: false) @title_regex ||= begin
titles = licenses.map(&:title_regex) licenses = Licensee::License.all(hidden: true, psuedo: false)
titles = licenses.map(&:title_regex)
# Title regex must include the version to support matching within # Title regex must include the version to support matching within
# families, but for sake of normalization, we can be less strict # families, but for sake of normalization, we can be less strict
without_versions = licenses.map do |license| without_versions = licenses.map do |license|
next if license.title == license.name_without_version next if license.title == license.name_without_version
Regexp.new Regexp.escape(license.name_without_version), 'i' Regexp.new Regexp.escape(license.name_without_version), 'i'
end
titles.concat(without_versions.compact)
/#{START_REGEX}\(?(?:the )?#{Regexp.union titles}.*?$/i
end end
titles.concat(without_versions.compact)
/#{START_REGEX}\(?(the )?#{Regexp.union titles}.*$/i
end end
private private
@ -213,10 +215,10 @@ module Licensee
return unless _content return unless _content
if regex_or_sym.is_a?(Symbol) if regex_or_sym.is_a?(Symbol)
if REGEXES[regex_or_sym] if respond_to?("strip_#{regex_or_sym}", true)
regex_or_sym = REGEXES[regex_or_sym]
elsif respond_to?("strip_#{regex_or_sym}", true)
return send("strip_#{regex_or_sym}") return send("strip_#{regex_or_sym}")
elsif REGEXES[regex_or_sym]
regex_or_sym = REGEXES[regex_or_sym]
else else
raise ArgumentError, "#{regex_or_sym} is an invalid regex reference" raise ArgumentError, "#{regex_or_sym} is an invalid regex reference"
end end
@ -225,12 +227,6 @@ module Licensee
@_content = _content.gsub(regex_or_sym, ' ').squeeze(' ').strip @_content = _content.gsub(regex_or_sym, ' ').squeeze(' ').strip
end end
STRIP_METHODS.each do |sym|
define_method "strip_#{sym}" do
strip(sym)
end
end
def strip_title def strip_title
while _content =~ ContentHelper.title_regex while _content =~ ContentHelper.title_regex
strip(ContentHelper.title_regex) strip(ContentHelper.title_regex)
@ -238,7 +234,7 @@ module Licensee
end end
def strip_borders def strip_borders
normalize(/^[\*-](.*?)\*$/, '\1') normalize(REGEXES[:border_markup], '\1')
end end
def strip_copyright def strip_copyright
@ -251,15 +247,18 @@ module Licensee
@_content = body @_content = body
end end
NORMALIZATIONS.each do |key, _op| def strip_span_markup
define_method("normalize_#{key}") do normalize(REGEXES[:span_markup], '\1')
normalize(key) end
end
def strip_link_markup
normalize(REGEXES[:link_markup], '\1')
end end
def normalize(from_or_key, to = nil) def normalize(from_or_key, to = nil)
operation = { from: from_or_key, to: to } if to operation = { from: from_or_key, to: to } if to
operation ||= NORMALIZATIONS[from_or_key] operation ||= NORMALIZATIONS[from_or_key]
if operation if operation
@_content = _content.gsub operation[:from], operation[:to] @_content = _content.gsub operation[:from], operation[:to]
elsif respond_to?("normalize_#{from_or_key}", true) elsif respond_to?("normalize_#{from_or_key}", true)

View File

@ -55,7 +55,6 @@ module Licensee
def attribution def attribution
@attribution ||= begin @attribution ||= begin
return unless copyright? || license.content =~ /\[fullname\]/ return unless copyright? || license.content =~ /\[fullname\]/
matches = Matchers::Copyright::REGEX matches = Matchers::Copyright::REGEX
.match(content_without_title_and_version) .match(content_without_title_and_version)
matches[0] if matches matches[0] if matches

View File

@ -1,36 +1,36 @@
{ {
"upl-1.0": "093b8b048dec7bc685c9ee6a5afffa4a1d148c02", "upl-1.0": "093b8b048dec7bc685c9ee6a5afffa4a1d148c02",
"ofl-1.1": "b6eb018d65c3ef1aecd29a99ad4653b47b34323d", "ofl-1.1": "1fb0563aa1250e18a6948afde286edc95761f461",
"lgpl-3.0": "fde363437aa287dddb4305dbbe1a59c41e98ea2b", "lgpl-3.0": "bdb3c042bd84f914eacfbe4977c5e58352745809",
"agpl-3.0": "8184105f82a05296bd50332643c3a3cc4067ba54", "agpl-3.0": "d445855a1f169b12cbee97d320c2e3522d053016",
"gpl-2.0": "f5e7151e1cd9830c0caf577bc747d7736f36658e", "gpl-2.0": "3becd209e8ed8039656c1debe01dd17b9a79208f",
"cc-by-4.0": "df16a2377ab7ea49e9bf80a8f3344e38121671f4", "cc-by-4.0": "899872bc08626e6cf154dcf9e08ff0de82c9b3db",
"ms-rl": "eb5794a2e90d1b83fd19e46d6790b2b66b8b857b", "ms-rl": "402bf344e506a8d10175c1e516b396c060ffd823",
"wtfpl": "dfa47b27c85780af7b2ddb2c30bdd7808e3060d7", "wtfpl": "f8544c074f203d86cdcb24082fedfb2cf2fe411a",
"osl-3.0": "39c1c650cb46ecc34c72693eb5ba967611ac1fed", "osl-3.0": "ab241ef932d3ac038e8ed62c860e9eba051ae7a0",
"bsl-1.0": "ca8f916d00c234719956e932061f192abb2d5bf9", "bsl-1.0": "ca8f916d00c234719956e932061f192abb2d5bf9",
"lgpl-2.1": "cc67fdbf1313fa11056c01a573c0287fab61bc17", "lgpl-2.1": "91e779a787786276618f58d6e396a5e64a981805",
"mpl-2.0": "b0285063c18aea6fae6a146882193f0d3de6dc44", "mpl-2.0": "b4db668fa7573bfdcae74eb51eafc961034f0a61",
"isc": "d168f98624be864548b2bbf4f198fdbf702d6743", "isc": "d168f98624be864548b2bbf4f198fdbf702d6743",
"cc0-1.0": "d76b663aad99ce405c971acd22cfbe23bfd29378", "cc0-1.0": "ec5027313ed11fea202060f6958ac25b086d6dcb",
"bsd-3-clause-clear": "6cd7a95b9e5f0e866b07b46fcfebb70f1c42994f", "bsd-3-clause-clear": "251d4599b622d2a87b2c4bb21dfacd438c048466",
"gpl-3.0": "39d041443ec3f4f2aa13e1fa2e9aef7d4356a04b", "gpl-3.0": "b22f1b1f953a38a8a11686587b98831858d6468b",
"unlicense": "86c75861af1b9b9e0573b190dcb2c2cdbbee7037", "unlicense": "86c75861af1b9b9e0573b190dcb2c2cdbbee7037",
"bsd-2-clause": "8c6525f4700252c313825f1f85acd04cd7c30394", "bsd-2-clause": "59f0099ff04225daf184db3fe55e478256133b1a",
"artistic-2.0": "68cc5c6eb6563437200308f227d36af5ba32f9be", "artistic-2.0": "a2ff6e7fb76e51bda9a5350c759a824f206049d1",
"zlib": "4768246ef0140435f718039efc0a11ef437e58fa", "zlib": "8d43f632a4884e70c72a1ac5926fc87f98305490",
"lppl-1.3c": "7025cef767e2d508bde52922c28e6c0ec7831230", "lppl-1.3c": "60961652297042d28bb689c17fac47eca7348d16",
"epl-2.0": "d858a8a6f0dfcc337acd93e3e791957d60f790b2", "epl-2.0": "b57663bc9c3f41446a8cd3f0050149221a58fe66",
"mit": "d64f3bb4282a97b37454b5bb96a8a264a3363dc3", "mit": "d64f3bb4282a97b37454b5bb96a8a264a3363dc3",
"postgresql": "87550a6bb3409db00d8552b2ac07d373ea56a024", "postgresql": "87550a6bb3409db00d8552b2ac07d373ea56a024",
"afl-3.0": "c564c5cf16eb650c6ee784d71b90818bbbc5d3ae", "afl-3.0": "4702ff33018a2874510beeef5916d6e8629cdc32",
"ncsa": "58a1d83992144038eab133b4af8a31ddbc575b56", "ncsa": "04c052b69de47ab0641068657a14632cdf9aa48d",
"cc-by-sa-4.0": "145990c59e69fa6f691008c30994c909d865caa5", "cc-by-sa-4.0": "d11590d97684231d5358252e0cc97373d62ec4f1",
"bsd-3-clause": "78f89f12ad4369a2dc932076182946195f1fdb04", "bsd-3-clause": "fa22c672927af9c7334874561198799cbf4bdf31",
"epl-1.0": "0e1bc53f3b94e1b1e0d9e2eb565df10e6800e60d", "epl-1.0": "e306464a81ab0e6688653c6509245b451637172c",
"ms-pl": "e72c4981307230d82983f1a3272d30c7c9fa37e1", "ms-pl": "c900293d66a241e54f7817367a8f32f7f94e12ff",
"ecl-2.0": "8669b2b35e243e378a99d8ceee2c05f6ce3603b9", "ecl-2.0": "58e7f645bfa1c5ccca7e2c37e626b3487e4d9d1b",
"eupl-1.2": "bab4a863ebdbd2f2f30bc333fe4635dc038136d1", "eupl-1.2": "f122f96b9f1a56e4806a89cb1cc6ca2bb956f3e5",
"apache-2.0": "1dd463ea99a5cd7537b8230e05c9af07b6cc582f", "apache-2.0": "ab3901051663cb8ee5dea9ebdff406ad136910e3",
"eupl-1.1": "26d0bb98b95d434f861b73cb8194b5620e945d94" "eupl-1.1": "873e30dbc5f75d076d7aecb6ceb84fb6bb765452"
} }

View File

@ -10,7 +10,7 @@ RSpec.describe 'detect command' do
let(:stdout) { output[0] } let(:stdout) { output[0] }
let(:stderr) { output[1] } let(:stderr) { output[1] }
let(:status) { output[2] } let(:status) { output[2] }
let(:hash) { 'd64f3bb4282a97b37454b5bb96a8a264a3363dc3' } let(:hash) { license_hashes['mit'] }
let(:expected) do let(:expected) do
{ {
'License' => 'MIT', 'License' => 'MIT',

View File

@ -91,7 +91,9 @@ RSpec.describe Licensee::ContentHelper do
borders: '* Foo *', borders: '* Foo *',
title: "The MIT License\nfoo", title: "The MIT License\nfoo",
copyright: "The MIT License\nCopyright 2018 Ben Balter\nFoo", copyright: "The MIT License\nCopyright 2018 Ben Balter\nFoo",
end_of_terms: "Foo\nend of terms and conditions\nbar" end_of_terms: "Foo\nend of terms and conditions\nbar",
block_markup: "> Foo",
link_markup: "[Foo](http://exmaple.com)"
}.each do |field, fixture| }.each do |field, fixture|
context "#strip_#{field}" do context "#strip_#{field}" do
let(:content) { fixture } let(:content) { fixture }
@ -102,12 +104,11 @@ RSpec.describe Licensee::ContentHelper do
end end
end end
context 'markup' do context "span markup" do
let(:content) { "> foo\n_foo_ [bar](#baz) ~foo~ `bar` *baz*" } let(:content) { '_foo_ *foo* **foo** ~foo~'}
it 'strips markup' do it "strips span markup" do
skip 'failing' expect(normalized_content).to eql('foo foo foo foo')
expect(normalized_content).to eql('foo foo bar foo bar baz')
end end
end end
end end
@ -177,6 +178,46 @@ RSpec.describe Licensee::ContentHelper do
end end
context 'normalizing' do context 'normalizing' do
# The :https normalization rewrites "http:" to "https:", so an http URL is
# expected to come back with the https scheme.
context 'https' do
  let(:content) { 'http://example.com' }

  it 'normalizes URL protocols' do
    expect(subject.content_normalized).to eql('https://example.com')
  end
end
# The :ampersands normalization replaces "&" with the word "and".
context 'ampersands' do
  let(:content) { 'Foo & Bar' }

  it 'normalizes ampersands' do
    expect(subject.content_normalized).to eql('foo and bar')
  end
end
# Numbered ("1.") and starred ("*") list markers are both expected to be
# rewritten to a "- " bullet.
context 'lists' do
  # Double quotes are required here for the "\n" escape.
  let(:content) { "1. Foo\n * Bar" }

  it 'normalizes lists' do
    expect(subject.content_normalized).to eql('- foo - bar')
  end
end
# Hyphens and em dashes inside a line are expected to collapse to a plain "-".
context 'dashes' do
  let(:content) { 'Foo-Bar—baz-buzz' }

  it 'normalizes dashes' do
    expect(subject.content_normalized).to eql('foo-bar-baz-buzz')
  end
end
# Backtick, straight, and curly quote pairs around a word are expected to be
# rewritten to straight double quotes.
context 'quotes' do
  # NOTE(review): `d` is unquoted in this input, yet the expectation below
  # has it as "d". Confirm whether the fixture was meant to read ‘d’
  # (curly single quotes) before relying on this example.
  let(:content) { "`a` 'b' \"c\" d “e”" }

  it 'normalizes quotes' do
    expect(subject.content_normalized).to eql('"a" "b" "c" "d" "e"')
  end
end
it 'strips formatting from the MPL' do it 'strips formatting from the MPL' do
license = Licensee::License.find('mpl-2.0') license = Licensee::License.find('mpl-2.0')
expect(license.content_normalized).to_not include('* *') expect(license.content_normalized).to_not include('* *')

View File

@ -134,3 +134,7 @@ RSpec::Matchers.define :be_detected_as do |expected|
diffable diffable
end end
# Parses the license-hashes.json fixture and memoizes the result, so the
# fixture is read and parsed at most once per receiver.
#
# @return [Hash] the parsed JSON; callers index it by license key
#   (e.g. 'mit') to obtain the expected content hash.
def license_hashes
  @license_hashes ||= JSON.parse(fixture_contents('license-hashes.json'))
end

View File

@ -5,7 +5,6 @@ RSpec.describe 'vendored licenses' do
end end
let(:detected_license) { license_file.license if license_file } let(:detected_license) { license_file.license if license_file }
let(:wtfpl) { Licensee::License.find('wtfpl') } let(:wtfpl) { Licensee::License.find('wtfpl') }
let(:expected_hashes) { JSON.parse(fixture_contents('license-hashes.json')) }
Licensee.licenses(hidden: true).each do |license| Licensee.licenses(hidden: true).each do |license|
next if license.pseudo_license? next if license.pseudo_license?
@ -14,7 +13,7 @@ RSpec.describe 'vendored licenses' do
context "the #{license.name} license" do context "the #{license.name} license" do
let(:content_with_copyright) { sub_copyright_info(license) } let(:content_with_copyright) { sub_copyright_info(license) }
let(:content) { content_with_copyright } let(:content) { content_with_copyright }
let(:expected_hash) { expected_hashes[license.key] } let(:expected_hash) { license_hashes[license.key] }
let(:hash_change_msg) do let(:hash_change_msg) do
msg = 'Did you update a vendored license? Run script/hash-licenses. ' msg = 'Did you update a vendored license? Run script/hash-licenses. '
msg << 'Changes in license hashes must be a MINOR (or MAJOR) bump.' msg << 'Changes in license hashes must be a MINOR (or MAJOR) bump.'