From ee6e66426b417722dac7d0d1ea85eaee0ccb6bf2 Mon Sep 17 00:00:00 2001
From: Ben Balter
Date: Wed, 20 Dec 2017 14:17:46 -0500
Subject: [PATCH] strip markup in ContentHelper#content_normalized

---
 lib/licensee/content_helper.rb | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/lib/licensee/content_helper.rb b/lib/licensee/content_helper.rb
index 674ecec..0cd472d 100644
--- a/lib/licensee/content_helper.rb
+++ b/lib/licensee/content_helper.rb
@@ -4,13 +4,14 @@ require 'digest'
 module Licensee
   module ContentHelper
     DIGEST = Digest::SHA1
-    END_OF_TERMS_REGEX = /^\s*end of terms and conditions\s*$/i
+    END_OF_TERMS_REGEX = /^[\s#]*end of terms and conditions\s*$/i
     HR_REGEX = /[=\-\*][=\-\*\s]{3,}/
     ALT_TITLE_REGEX = License::ALT_TITLE_REGEX
     ALL_RIGHTS_RESERVED_REGEX = /\Aall rights reserved\.?$/i
     WHITESPACE_REGEX = /\s+/
     MARKDOWN_HEADING_REGEX = /\A\s*#+/
     VERSION_REGEX = /\Aversion.*$/i
+    MARKUP_REGEX = /[^\w'\.\-]+/
 
     # A set of each word in the license, without duplicates
     def wordset
@@ -77,6 +78,7 @@ module Licensee
         end
         string = strip_all_rights_reserved(string)
         string, _partition, _instructions = string.partition(END_OF_TERMS_REGEX)
+        string = strip_markup(string)
         strip_whitespace(string)
       end
 
@@ -155,6 +157,10 @@ module Licensee
       strip(string, ALL_RIGHTS_RESERVED_REGEX)
     end
 
+    def strip_markup(string)
+      strip(string, MARKUP_REGEX)
+    end
+
     def strip(string, regex)
       string.gsub(regex, ' ').squeeze(' ').strip
     end