Replace dissimilar crate with difflib (#2322)

This also inserts the expected text if it's missing at the very
beginning of the provided text.
This commit is contained in:
RumovZ 2023-01-16 00:49:34 +01:00 committed by GitHub
parent 604e0f46e1
commit 1be30573e1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 118 additions and 76 deletions

14
Cargo.lock generated
View File

@ -92,7 +92,7 @@ dependencies = [
"convert_case 0.6.0",
"criterion",
"csv",
"dissimilar",
"difflib",
"env_logger",
"flate2",
"fluent",
@ -822,6 +822,12 @@ dependencies = [
"cipher 0.4.3",
]
[[package]]
name = "difflib"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6184e33543162437515c2e2b48714794e37845ec9851711914eec9d308f6ebe8"
[[package]]
name = "digest"
version = "0.10.6"
@ -844,12 +850,6 @@ dependencies = [
"syn",
]
[[package]]
name = "dissimilar"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8c97b9233581d84b8e1e689cdd3a47b6f69770084fc246e86a7f78b0d9c1d4a5"
[[package]]
name = "doc-comment"
version = "0.3.3"

View File

@ -57,7 +57,7 @@ bytes = "1.3.0"
chrono = { version = "0.4.19", default-features = false, features = ["std", "clock"] }
coarsetime = "0.1.22"
convert_case = "0.6.0"
dissimilar = "1.0.4"
difflib = "0.4.0"
flate2 = "1.0.25"
fluent = "0.16.0"
fluent-bundle = "0.15.2"

View File

@ -3,7 +3,8 @@
use std::borrow::Cow;
use dissimilar::Chunk;
use difflib::sequencematcher::{Opcode, SequenceMatcher};
use itertools::Itertools;
use lazy_static::lazy_static;
use regex::Regex;
use unic_ucd_category::GeneralCategory;
@ -29,57 +30,55 @@ lazy_static! {
}
struct DiffContext {
expected: String,
provided: String,
expected: Vec<char>,
provided: Vec<char>,
}
impl DiffContext {
fn new(expected: &str, provided: &str) -> Self {
DiffContext {
expected: prepare_expected(expected),
provided: prepare_provided(provided),
provided: prepare_provided(provided).chars().collect_vec(),
expected: prepare_expected(expected).chars().collect_vec(),
}
}
fn to_tokens(&self) -> DiffOutput<'_> {
let chunks = dissimilar::diff(&self.provided, &self.expected);
fn slice_expected(&self, opcode: &Opcode) -> String {
self.expected[opcode.second_start..opcode.second_end]
.iter()
.cloned()
.collect()
}
fn slice_provided(&self, opcode: &Opcode) -> String {
self.provided[opcode.first_start..opcode.first_end]
.iter()
.cloned()
.collect()
}
fn to_tokens(&self) -> DiffOutput {
let mut matcher = SequenceMatcher::new(&self.provided, &self.expected);
let opcodes = matcher.get_opcodes();
let mut provided = vec![];
let mut expected = vec![];
for chunk in chunks {
match chunk {
Chunk::Equal(text) => {
provided.push(DiffToken {
kind: DiffTokenKind::Good,
text: text.into(),
});
expected.push(DiffToken {
kind: DiffTokenKind::Good,
text: text.into(),
});
for opcode in opcodes {
match opcode.tag.as_str() {
"equal" => {
provided.push(DiffToken::good(self.slice_provided(&opcode)));
expected.push(DiffToken::good(self.slice_expected(&opcode)));
}
Chunk::Delete(text) => {
provided.push(DiffToken {
kind: DiffTokenKind::Bad,
text: text.into(),
});
"delete" => {
provided.push(DiffToken::bad(self.slice_provided(&opcode)));
}
Chunk::Insert(text) => {
// If the preceding text was correct, indicate text was missing
if provided
.last()
.map(|v| v.kind == DiffTokenKind::Good)
.unwrap_or_default()
{
provided.push(DiffToken {
kind: DiffTokenKind::Missing,
text: text.into(),
});
}
expected.push(DiffToken {
kind: DiffTokenKind::Missing,
text: text.into(),
});
"insert" => {
provided.push(DiffToken::missing(self.slice_expected(&opcode)));
expected.push(DiffToken::missing(self.slice_expected(&opcode)));
}
"replace" => {
provided.push(DiffToken::bad(self.slice_provided(&opcode)));
expected.push(DiffToken::missing(self.slice_expected(&opcode)));
}
_ => unreachable!(),
}
}
DiffOutput { provided, expected }
@ -123,15 +122,38 @@ enum DiffTokenKind {
}
#[derive(Debug, PartialEq, Eq)]
struct DiffToken<'a> {
struct DiffToken {
kind: DiffTokenKind,
text: Cow<'a, str>,
text: String,
}
impl DiffToken {
fn bad(text: String) -> Self {
Self {
kind: DiffTokenKind::Bad,
text,
}
}
fn good(text: String) -> Self {
Self {
kind: DiffTokenKind::Good,
text,
}
}
fn missing(text: String) -> Self {
Self {
kind: DiffTokenKind::Missing,
text,
}
}
}
#[derive(Debug, PartialEq, Eq)]
struct DiffOutput<'a> {
provided: Vec<DiffToken<'a>>,
expected: Vec<DiffToken<'a>>,
struct DiffOutput {
provided: Vec<DiffToken>,
expected: Vec<DiffToken>,
}
pub fn compare_answer(expected: &str, provided: &str) -> String {
@ -168,18 +190,18 @@ fn with_isolated_leading_mark(text: &str) -> Cow<str> {
#[cfg(test)]
mod test {
use DiffTokenKind::*;
use super::*;
macro_rules! token {
($kind:ident, $text:expr) => {
DiffToken {
kind: $kind,
text: $text.into(),
macro_rules! token_factory {
($name:ident) => {
fn $name(text: &str) -> DiffToken {
DiffToken::$name(String::from(text))
}
};
}
token_factory!(bad);
token_factory!(good);
token_factory!(missing);
#[test]
fn tokens() {
@ -188,25 +210,25 @@ mod test {
assert_eq!(
output.provided,
vec![
token!(Bad, "y"),
token!(Good, " ahora q"),
token!(Bad, "e"),
token!(Good, " vamos"),
token!(Missing, " "),
token!(Good, "a hacer"),
token!(Missing, "?"),
bad("y"),
good(" ahora q"),
bad("e"),
good(" vamos"),
missing(" "),
good("a hacer"),
missing("?"),
]
);
assert_eq!(
output.expected,
vec![
token!(Missing, "¿Y"),
token!(Good, " ahora q"),
token!(Missing, ""),
token!(Good, " vamos"),
token!(Missing, " "),
token!(Good, "a hacer"),
token!(Missing, "?"),
missing("¿Y"),
good(" ahora q"),
missing(""),
good(" vamos"),
missing(" "),
good("a hacer"),
missing("?"),
]
);
}
@ -215,17 +237,37 @@ mod test {
fn html_and_media() {
let ctx = DiffContext::new("[sound:foo.mp3]<b>1</b> &nbsp;2", "1 2");
// the spacing is handled by wrapping html output in white-space: pre-wrap
assert_eq!(ctx.to_tokens().expected, &[token!(Good, "1 2")]);
assert_eq!(ctx.to_tokens().expected, &[good("1 2")]);
}
#[test]
fn missed_chars_only_shown_in_provided_when_after_good() {
let ctx = DiffContext::new("1", "23");
assert_eq!(ctx.to_tokens().provided, &[token!(Bad, "23")]);
assert_eq!(ctx.to_tokens().provided, &[bad("23")]);
let ctx = DiffContext::new("12", "1");
assert_eq!(ctx.to_tokens().provided, &[good("1"), missing("2"),]);
}
#[test]
fn handles_certain_unicode_as_expected() {
// this was not parsed as expected with dissimilar 1.0.4
let ctx = DiffContext::new("쓰다듬다", "스다뜸다");
assert_eq!(
ctx.to_tokens().provided,
&[token!(Good, "1"), token!(Missing, "2"),]
&[bad(""), good(""), bad(""), good(""),]
);
}
#[test]
fn does_not_panic_with_certain_unicode() {
// this was causing a panic with dissimilar 1.0.4
let ctx = DiffContext::new(
"Сущность должна быть ответственна только за одно дело",
concat!(
"Single responsibility Сущность выполняет только одну задачу.",
"Повод для изменения сущности только один."
),
);
ctx.to_tokens();
}
}