radiff2: drop MODE_DIST (#16986)

MODE_DIST (-s) is the same as MODE_DIST_LEVENSTEIN (-ss) but much slower.
Drop it. Make MODE_DIST_MYERS (-sss, faster than MODE_DIST_LEVENSTEIN) take its place (-s).

The original comment was incorrect (-s is not Eugene W. Myers' algorithm). It is correct now.

* drop buggy Levenshtein and rename the original
* fix tests

Co-authored-by: eagleoflqj and Maskray
This commit is contained in:
Fangrui Song 2021-01-24 13:17:25 -08:00 committed by GitHub
parent 2e1d0579d7
commit 0d4d8c083f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 23 additions and 252 deletions

View File

@ -67,7 +67,7 @@ R_API char *r_diff_buffers_to_string(RDiff *d, const ut8 *a, int la, const ut8 *
R_API int r_diff_set_callback(RDiff *d, RDiffCallback callback, void *user);
R_API bool r_diff_buffers_distance(RDiff *d, const ut8 *a, ut32 la, const ut8 *b, ut32 lb, ut32 *distance, double *similarity);
R_API bool r_diff_buffers_distance_myers(RDiff *diff, const ut8 *a, ut32 la, const ut8 *b, ut32 lb, ut32 *distance, double *similarity);
R_API bool r_diff_buffers_distance_levenstein(RDiff *d, const ut8 *a, ut32 la, const ut8 *b, ut32 lb, ut32 *distance, double *similarity);
R_API bool r_diff_buffers_distance_levenshtein(RDiff *d, const ut8 *a, ut32 la, const ut8 *b, ut32 lb, ut32 *distance, double *similarity);
R_API char *r_diff_buffers_unified(RDiff *d, const ut8 *a, int la, const ut8 *b, int lb);
/* static method !??! */
R_API int r_diff_lines(const char *file1, const char *sa, int la, const char *file2, const char *sb, int lb);

View File

@ -7,9 +7,8 @@ enum {
MODE_DIFF,
MODE_DIFF_STRS,
MODE_DIFF_IMPORTS,
MODE_DIST,
MODE_DIST_MYERS,
MODE_DIST_LEVENSTEIN,
MODE_DIST_LEVENSHTEIN,
MODE_CODE,
MODE_GRAPH,
MODE_COLS,
@ -17,15 +16,15 @@ enum {
};
enum {
GRAPH_DEFAULT_MODE,
GRAPH_SDB_MODE,
GRAPH_JSON_MODE,
GRAPH_JSON_DIS_MODE,
GRAPH_TINY_MODE,
GRAPH_INTERACTIVE_MODE,
GRAPH_DOT_MODE,
GRAPH_STAR_MODE,
GRAPH_GML_MODE
GRAPH_DEFAULT_MODE,
GRAPH_SDB_MODE,
GRAPH_JSON_MODE,
GRAPH_JSON_DIS_MODE,
GRAPH_TINY_MODE,
GRAPH_INTERACTIVE_MODE,
GRAPH_DOT_MODE,
GRAPH_STAR_MODE,
GRAPH_GML_MODE
};
typedef struct {
@ -1040,12 +1039,10 @@ R_API int r_main_radiff2(int argc, const char **argv) {
case 'h':
return show_help (1);
case 's':
if (ro.mode == MODE_DIST) {
ro.mode = MODE_DIST_LEVENSTEIN;
} else if (ro.mode == MODE_DIST_LEVENSTEIN) {
ro.mode = MODE_DIST_MYERS;
if (ro.mode == MODE_DIST_MYERS) {
ro.mode = MODE_DIST_LEVENSHTEIN;
} else {
ro.mode = MODE_DIST;
ro.mode = MODE_DIST_MYERS;
}
break;
case 'S':
@ -1279,19 +1276,16 @@ R_API int r_main_radiff2(int argc, const char **argv) {
}
r_diff_free (d);
break;
case MODE_DIST:
case MODE_DIST_MYERS:
case MODE_DIST_LEVENSTEIN:
case MODE_DIST_LEVENSHTEIN:
{
RDiff *d = r_diff_new ();
if (d) {
d->verbose = ro.verbose;
if (ro.mode == MODE_DIST_LEVENSTEIN) {
d->type = 'l';
} else if (ro.mode == MODE_DIST_MYERS) {
if (ro.mode == MODE_DIST_MYERS) {
d->type = 'm';
} else {
d->type = 0;
d->type = 'l';
}
r_diff_buffers_distance (d, bufa, (ut32)sza, bufb, (ut32)szb, &ro.count, &sim);
r_diff_free (d);

View File

@ -167,219 +167,6 @@ R_API int r_diff_buffers(RDiff *d, const ut8 *a, ut32 la, const ut8 *b, ut32 lb)
: r_diff_buffers_static (d, a, la, b, lb);
}
// Banded Levenshtein edit distance between two byte buffers: only a window of
// cells around the diagonal is evaluated for each byte of the longer buffer.
//
// d          - optional; only d->verbose is read (progress is printed with eprintf)
// a, la      - first buffer and its length
// b, lb      - second buffer and its length
// distance   - optional out: the computed edit distance
// similarity - optional out: 1 - distance / max (la, lb)
//
// Returns false when the (a && b) precondition is rejected by r_return_val_if_fail
// or when one of the two work arrays cannot be allocated; returns true otherwise.
//
// NOTE(review): the windowing heuristics below (start/stop/extendStart/extendStop)
// are not proven to always cover the optimal path; results should be cross-checked
// against a plain O(N*M) implementation before being trusted.
R_API bool r_diff_buffers_distance_levenstein(RDiff *d, const ut8 *a, ut32 la, const ut8 *b, ut32 lb, ut32 *distance, double *similarity) {
r_return_val_if_fail (a && b, false);
// d may be NULL, in which case no progress output is produced
const bool verbose = d? d->verbose: false;
/*
More memory efficient version of Levenshtein Distance from:
https://en.wikipedia.org/wiki/Levenshtein_distance
http://www.codeproject.com/Articles/13525/Fast-memory-efficient-Levenshtein-algorithm
ObM..
8/July/2016 - More time efficient Levenshtein Distance. Now runs in about O(N*sum(MDistance)) instead of O(NM)
In real world testing the speedups for similar files are immense. Processing of
radiff2 -sV routerA/firmware_extract/bin/httpd routerB/firmware_extract/bin/httpd
reduced from 28 hours to about 13 minutes.
*/
int i, j;
// aBufPtr/aLen always describe the longer input, bBufPtr/bLen the shorter one
const ut8 *aBufPtr;
const ut8 *bBufPtr;
ut32 aLen;
ut32 bLen;
// temp pointer will be used to switch v0 and v1 after processing the inner loop.
int *temp;
// v0 is the previous result column, v1 the column currently being filled
int *v0, *v1;
// We need these variables outside the context of the loops as they need to
// survive multiple loop iterations.
// start and stop bound the inner loop.
// colMin tells us the current 'best' edit distance.
// extendStop & extendStart are used when we get 'double up' edge conditions
// that require us to keep some more data.
int start = 0;
int stop = 0;
int smallest;
int colMin = 0;
int extendStop = 0;
int extendStart = 0;
// we could move cost into the 'i' loop, but it is read after the inner loop ends.
int cost = 0;
// loops can get very big; it is only reported in the verbose summary and is
// kept for debugging and optimisation testing.
ut64 loops = 0;
// We need the longest file to be 'A' because our optimisation tries to stop and start
// around the diagonal.
//   AAAAAAA
// B *
// B  *
// B   *____
// if we have them the other way around and we terminate on the diagonal, we won't have
// inspected all the bytes of file B..
//   AAAA
// B *
// B  *
// B   *
// B    *
// B    ?
if (la < lb) {
aBufPtr = b;
bBufPtr = a;
aLen = lb;
bLen = la;
} else {
aBufPtr = a;
bBufPtr = b;
aLen = la;
bLen = lb;
}
stop = bLen;
// Preliminary tests
// one or both buffers empty? The distance is then the length of the other buffer.
if (aLen == 0 || bLen == 0) {
if (distance) {
*distance = R_MAX (aLen, bLen);
}
if (similarity) {
// both empty counts as identical, otherwise nothing is shared
*similarity = aLen == bLen? 1.0: 0.0;
}
return true;
}
// If the files are the same size and are identical, then we have matching files
if (aLen == bLen && !memcmp (aBufPtr, bBufPtr, aLen)) {
if (distance) {
*distance = 0;
}
if (similarity) {
*similarity = 1.0;
}
return true;
}
// Only calloc if we have to do some processing
// calloc v0 & v1 and check they initialised
// NOTE(review): the messages below say "bytes" but print an element count
// (bLen + 3), not a byte size; they also lack a trailing newline.
v0 = (int*) calloc ((bLen + 3), sizeof (int));
if (!v0) {
eprintf ("Error: cannot allocate %i bytes.", bLen + 3);
return false;
}
v1 = (int*) calloc ((bLen + 3), sizeof (int));
if (!v1) {
eprintf ("Error: cannot allocate %i bytes", 2 * (bLen + 3));
free (v0);
return false;
}
// initialise v0 and v1.
// With the optimisation we strictly only need to initialise v0[0..2]=0..2 & v1[0] = 1;
for (i = 0; i < bLen + 1 ; i++) {
v0[i] = i;
v1[i] = i + 1;
}
// Outer loop = the length of the longest input file.
for (i = 0; i < aLen; i++) {
// We're going to stop the inner loop at:
// bLen (so we don't run off the end of our array)
// or 'two below the diagonal' PLUS any extension we need for 'double up' edge values
// (see extendStop for logic)
stop = R_MIN ((i + extendStop + 2), bLen);
// We need a value in the result column (v1[start]).
// If you look at the loop below, we need it because we look at v1[j] as one of the
// potential shortest edit distances.
// In all cases where the edit distance can't 'reach',
// the value of v1[start] simply increments.
// NOTE(review): start is int while bLen is ut32; if ut32 is unsigned, a negative
// start is compared as a very large value here and ends the loop early - confirm.
if (start > bLen) {
break;
}
v1[start] = v0[start] + 1;
// need to have a bigger number in colMin than we'll ever encounter in the inner loop
colMin = aLen;
// Inner loop does all the work:
// NOTE(review): when stop == bLen the last iteration reads bBufPtr[bLen], one past
// the bLen bytes described by the caller - confirm the buffers are padded.
for (j = start; j <= stop; j++) {
loops++;
// The main levenshtein comparison:
cost = (aBufPtr[i] == bBufPtr[j]) ? 0 : 1;
smallest = R_MIN ((v1[j] + 1), (v0[j + 1] + 1));
smallest = R_MIN (smallest, (v0[j] + cost));
// populate the next two entries in v1.
// only really required if this is the last loop.
// the guard keeps the two writes below inside the bLen + 3 allocated entries
if (j + 2 > bLen + 3) {
break;
}
v1[j + 1] = smallest;
v1[j + 2] = smallest + 1;
// If we have seen a smaller number, it's the new column Minimum
colMin = R_MIN ((colMin), (smallest));
}
// We're going to start at i+1 next iteration
// The column minimum is the current edit distance
// This distance is the minimum 'search width' from the optimal 'i' diagonal
// The extendStart picks up an edge case where we have a match on the first iteration
// We update extendStart after we've set start for the next iteration.
start = i + 1 - colMin - extendStart;
// If the last processed entry is a match, AND
// the current byte in 'a' and the previous processed entry in 'b' aren't a match
// then we need to extend our search below the optimal 'i' diagonal, because we'll
// have a vertical double up condition in our last two values of the results column.
// j-2 is used because j++ increments prior to loop exit in the processing loop above.
if (!cost && aBufPtr[i] != bBufPtr[j - 2]) {
extendStop ++;
}
// If new start would be a match then we have a horizontal 'double up'
// which means we need to keep an extra row of data
// so don't increment the start counter this time, BUT keep
// extendStart up our sleeves for next iteration.
if (i + 1 < aLen && start < bLen && aBufPtr[i + 1] == bBufPtr[start]) {
start --;
extendStart ++;
}
// Switch v0 and v1 pointers via temp pointer
temp = v0;
v0 = v1;
v1 = temp;
// Print a processing update every 10K iterations of the outer loop
if (verbose && i % 10000==0) {
eprintf ("\rProcessing %d of %d\r", i, aLen);
}
}
// Clean up output on loop exit (purely aesthetic)
if (verbose) {
eprintf ("\rProcessing %d of %d (loops=%"PFMT64d")\n", i, aLen,loops);
}
if (distance) {
// the final distance is the last entry we processed in the inner loop.
// v0 is used instead of v1 because we switched the pointers before exiting the outer loop
*distance = v0[stop];
}
if (similarity) {
// normalise the distance by the longer length and invert it into a similarity
double diff = (double) (v0[stop]) / (double) (R_MAX (aLen, bLen));
*similarity = (double)1 - diff;
}
free (v0);
free (v1);
return true;
}
// Eugene W. Myers' O(ND) diff algorithm
// Returns edit distance with costs: insertion=1, deletion=1, no substitution
R_API bool r_diff_buffers_distance_myers(RDiff *diff, const ut8 *a, ut32 la, const ut8 *b, ut32 lb, ut32 *distance, double *similarity) {
@ -437,7 +224,7 @@ out:
return true;
}
R_API bool r_diff_buffers_distance_original(RDiff *diff, const ut8 *a, ut32 la, const ut8 *b, ut32 lb, ut32 *distance, double *similarity) {
R_API bool r_diff_buffers_distance_levenshtein(RDiff *diff, const ut8 *a, ut32 la, const ut8 *b, ut32 lb, ut32 *distance, double *similarity) {
if (!a || !b) {
return false;
}
@ -499,12 +286,11 @@ R_API bool r_diff_buffers_distance(RDiff *d, const ut8 *a, ut32 la, const ut8 *b
case 'm':
return r_diff_buffers_distance_myers (d, a, la, b, lb, distance, similarity);
case 'l':
return r_diff_buffers_distance_levenstein (d, a, la, b, lb, distance, similarity);
default:
break;
}
}
return r_diff_buffers_distance_original (d, a, la, b, lb, distance, similarity);
return r_diff_buffers_distance_levenshtein (d, a, la, b, lb, distance, similarity);
}
// Use NeedlemanWunsch to diffchar.

View File

@ -54,9 +54,9 @@ Show two column hexdump diffing.
.It Fl X
Show two column hexII diffing.
.It Fl s
Calculate text distance from two files.
Compute edit distance (no substitution, Eugene W. Myers' O(ND) diff algorithm) between two files.
.It Fl ss
Same as before but using the Levenstein algorithm (faster but sometimes buggy)
Compute Levenshtein edit distance (substitution is allowed, O(N^2)) between two files.
.It Fl S Ar [name, namelen, dist, size, ...]
Specify which column of the code diffing algo use for diffing
.It Fl t Ar 0\-100

View File

@ -32,23 +32,14 @@ bool test_r_diff_buffers_distance(void) {
int i;
// Levenshtein edit distance (deletion/insertion/substitution)
diff->type = '\0';
diff->type = 'l';
for (i = 0; tests[i].a; i++) {
size_t la = strlen ((const char *)tests[i].a), lb = strlen ((const char *)tests[i].b);
r_diff_buffers_distance (diff, tests[i].a, la, tests[i].b, lb, &distance, NULL);
snprintf (msg, sizeof msg, "original %s/%s distance", tests[i].a, tests[i].b);
snprintf (msg, sizeof msg, "levenshtein %s/%s distance", tests[i].a, tests[i].b);
mu_assert_eq (distance, tests[i].dis_distance, msg);
}
// Broken r_diff_buffers_distance_levenshtein, uncomment and see why it is incorrect
// diff->type = 'l';
// for (i = 0; i < R_ARRAY_SIZE (tests); i++) {
// size_t la = strlen (tests[i].a), lb = strlen ((const char *)tests[i].b);
// r_diff_buffers_distance (diff, tests[i].a, la, tests[i].b, lb, &distance, NULL);
// snprintf (msg, sizeof msg, "levenshtein %s/%s distance", tests[i].a, tests[i].b);
// mu_assert_eq (distance, tests[i].dis_distance, msg);
// }
// Eugene W. Myers' O(ND) diff algorithm, deletion/insertion edit distance
diff->type = 'm';
for (i = 0; tests[i].a; i++) {