Line data Source code
1 : /**
2 : * GPL blah blah, see below for history
3 : */
4 :
5 : #ifndef DIFFENGINE_H
6 : #define DIFFENGINE_H
7 :
8 : #include <vector>
9 : #include <map>
10 : #include <set>
11 : #include <utility>
12 : #include <algorithm>
13 : #include <cassert>
14 : #include <string>
15 : #include <numeric>
16 :
17 : #include "IntSet.h"
18 : #include "Word.h"
19 :
20 : namespace wikidiff2 {
21 :
22 : #ifdef DEBUG_MOVED_LINES
23 : inline void debugLog(const char *fmt, ...) {
24 : static FILE *f = nullptr;
25 : char ch[2048];
26 : va_list ap;
27 : va_start(ap, fmt);
28 : vsnprintf(ch, sizeof(ch), fmt, ap);
29 : va_end(ap);
30 : if(!f) f = fopen("/tmp/wd2-debug-log.log", "a");
31 : if(!f) throw std::runtime_error("failed to open debug log...");
32 : fwrite(ch, 1, strlen(ch), f);
33 : fflush(f);
34 : };
35 : #else
36 : #define debugLog(...)
37 : #endif
38 :
39 : /**
40 : * Options to be passed to the various constructors
41 : */
42 : struct DiffConfig {
43 : /**
44 : * Complexity is the product of input sizes, after identical leading and
45 : * trailing parts are removed. This is supposed to be proportional to
46 : * worst-case runtime. If the complexity is larger than this limit, trivial
47 : * output will be produced, and the bailout flag will be set in the Diff.
48 : */
49 : long long bailoutComplexity;
50 : };
51 :
52 : /**
53 : * Diff operation
54 : *
55 : * from and to are vectors containing pointers to the objects passed in from_lines and to_lines
56 : *
57 : * op is one of the following
58 : * copy: A sequence of lines (in from and to) which are the same in both files.
59 : * del: A sequence of lines (in from) which were in the first file but not the second.
60 : * add: A sequence of lines (in to) which were in the second file but not the first.
61 : * change: A sequence of lines which are different between the two files. Lines from the
62 : * first file are in from, lines from the second are in to. The two vectors need
63 : * not be the same length.
64 : */
65 : template<typename T>
66 : class DiffOp
67 : {
68 : public:
69 : typedef std::vector<const T*, WD2_ALLOCATOR<const T*> > PointerVector;
70 3997 : DiffOp(int op_, const PointerVector & from_, const PointerVector & to_)
71 3997 : : op(op_), from(from_), to(to_) {}
72 :
73 : enum {copy, del, add, change};
74 : int op;
75 : PointerVector from;
76 : PointerVector to;
77 : };
78 :
79 : /**
80 : * Basic diff template class. After construction, edits will contain a vector of DiffOpTemplate
81 : * objects representing the diff
82 : */
83 : template<typename T>
84 : class Diff
85 : {
86 : public:
87 : typedef std::vector<T, WD2_ALLOCATOR<T> > ValueVector;
88 : typedef std::vector<DiffOp<T>, WD2_ALLOCATOR<DiffOp<T>> > DiffOpVector;
89 :
90 47 : Diff() {}
91 :
92 : Diff(const DiffConfig & config, const ValueVector & from_lines, const ValueVector & to_lines);
93 :
94 4236 : void add_edit(const DiffOp<T> & edit) {
95 4236 : edits.push_back(edit);
96 4236 : }
97 :
98 11921 : size_t size() const {
99 11921 : return edits.size();
100 : }
101 :
102 303 : DiffOp<T> & operator[](int i) {
103 303 : return edits[i];
104 : }
105 :
106 21891 : const DiffOp<T> & operator[](int i) const {
107 21891 : return edits[i];
108 : }
109 :
110 47 : void swap(Diff<T> & other) {
111 47 : edits.swap(other.edits);
112 47 : }
113 :
114 : DiffOpVector edits;
115 : bool bailout = false;
116 : };
117 :
118 : /**
119 : * Class used internally by Diff to actually compute the diffs.
120 : *
121 : * The algorithm used here is mostly lifted from the perl module
122 : * Algorithm::Diff (version 1.06) by Ned Konz, which is available at:
123 : * http://www.perl.com/CPAN/authors/id/N/NE/NEDKONZ/Algorithm-Diff-1.06.zip
124 : *
125 : * More ideas are taken from:
126 : * http://www.ics.uci.edu/~eppstein/161/960229.html
127 : *
128 : * Some ideas are (and a bit of code) are from from analyze.c, from GNU
129 : * diffutils-2.7, which can be found at:
130 : * ftp://gnudist.gnu.org/pub/gnu/diffutils/diffutils-2.7.tar.gz
131 : *
132 : * This implementation is largely due to Geoffrey T. Dairiki, who wrote this
133 : * diff engine for phpwiki 1-3.3. It was then adopted by MediaWiki.
134 : *
135 : * Finally, it was ported to C++ by Tim Starling in February 2006
136 : *
137 : * @access private
138 : */
139 :
140 : template<typename T>
141 : class DiffEngine
142 : {
143 : public:
144 : // Vectors
145 : typedef std::vector<bool> BoolVector; // skip the allocator here to get the specialisation
146 : typedef std::vector<const T*, WD2_ALLOCATOR<const T*> > PointerVector;
147 : typedef std::vector<T, WD2_ALLOCATOR<T> > ValueVector;
148 : typedef std::vector<int, WD2_ALLOCATOR<int> > IntVector;
149 : typedef std::vector<std::pair<int, int>, WD2_ALLOCATOR<std::pair<int, int> > > IntPairVector;
150 :
151 : // Maps
152 : typedef std::map<T, IntVector, std::less<T>, WD2_ALLOCATOR<std::pair<const T, IntVector>>> MatchesMap;
153 :
154 : // Sets
155 : typedef std::set<T, std::less<T>, WD2_ALLOCATOR<T> > ValueSet;
156 :
157 698 : DiffEngine(const DiffConfig & config_)
158 698 : : config(config_), done(false)
159 698 : {}
160 :
161 : void clear();
162 : void diff(const ValueVector & from_lines,
163 : const ValueVector & to_lines, Diff<T> & diff);
164 : int lcs_pos (int ypos);
165 : void compareseq (int xoff, int xlim, int yoff, int ylim);
166 : void shift_boundaries (const ValueVector & lines, BoolVector & changed,
167 : const BoolVector & other_changed);
168 : protected:
169 : int diag (int xoff, int xlim, int yoff, int ylim, int nchunks,
170 : IntPairVector & seps);
171 :
172 : DiffConfig config;
173 : BoolVector xchanged, ychanged;
174 : PointerVector xv, yv;
175 : IntVector xind, yind;
176 : IntVector seq;
177 : IntSet in_seq;
178 : int lcs;
179 : bool done;
180 : enum {MAX_CHUNKS=8};
181 : };
182 :
183 : //-----------------------------------------------------------------------------
184 : // DiffEngine implementation
185 : //-----------------------------------------------------------------------------
186 : template<typename T>
187 0 : void DiffEngine<T>::clear()
188 : {
189 0 : xchanged.clear();
190 0 : ychanged.clear();
191 0 : xv.clear();
192 0 : yv.clear();
193 0 : xind.clear();
194 0 : yind.clear();
195 0 : seq.clear();
196 0 : in_seq.clear();
197 0 : done = false;
198 0 : }
199 :
200 : template<typename T>
201 698 : void DiffEngine<T>::diff (const ValueVector & from_lines,
202 : const ValueVector & to_lines, Diff<T> & diff)
203 : {
204 698 : int n_from = (int)from_lines.size();
205 698 : int n_to = (int)to_lines.size();
206 :
207 : // If this diff engine has been used before for a diff, clear the member variables
208 698 : if (done) {
209 0 : clear();
210 : }
211 698 : xchanged.resize(n_from);
212 698 : ychanged.resize(n_to);
213 698 : seq.resize(std::max(n_from, n_to) + 1);
214 :
215 : // Skip leading common lines.
216 : int skip, endskip;
217 8880 : for (skip = 0; skip < n_from && skip < n_to; skip++) {
218 8764 : if (from_lines[skip] != to_lines[skip])
219 582 : break;
220 8182 : xchanged[skip] = ychanged[skip] = false;
221 : }
222 : // Skip trailing common lines.
223 698 : int xi = n_from, yi = n_to;
224 7628 : for (endskip = 0; --xi > skip && --yi > skip; endskip++) {
225 7345 : if (from_lines[xi] != to_lines[yi])
226 415 : break;
227 6930 : xchanged[xi] = ychanged[yi] = false;
228 : }
229 :
230 1396 : long long complexity = (long long)(n_from - skip - endskip)
231 698 : * (n_to - skip - endskip);
232 :
233 : // If too complex, just output "whole left side replaced with right"
234 698 : if (config.bailoutComplexity > 0 && complexity > config.bailoutComplexity) {
235 6 : PointerVector del;
236 3 : PointerVector add;
237 :
238 26012 : for (xi = 0; xi < n_from; xi++) {
239 26009 : del.push_back(&from_lines[xi]);
240 : }
241 24012 : for (yi = 0; yi < n_to; yi++) {
242 24009 : add.push_back(&to_lines[yi]);
243 : }
244 3 : diff.add_edit(DiffOp<T>(DiffOp<T>::change, del, add));
245 3 : diff.bailout = true;
246 :
247 3 : done = true;
248 3 : return;
249 : }
250 :
251 : // Ignore lines which do not exist in both files.
252 1390 : ValueSet xhash, yhash;
253 11719 : for (xi = skip; xi < n_from - endskip; xi++) {
254 11024 : xhash.insert(from_lines[xi]);
255 : }
256 :
257 16678 : for (yi = skip; yi < n_to - endskip; yi++) {
258 15983 : const T & line = to_lines[yi];
259 15983 : if ( (ychanged[yi] = (xhash.find(line) == xhash.end())) )
260 10487 : continue;
261 5496 : yhash.insert(line);
262 5496 : yv.push_back(&line);
263 5496 : yind.push_back(yi);
264 : }
265 11719 : for (xi = skip; xi < n_from - endskip; xi++) {
266 11024 : const T & line = from_lines[xi];
267 11024 : if ( (xchanged[xi] = (yhash.find(line) == yhash.end())) )
268 6536 : continue;
269 4488 : xv.push_back(&line);
270 4488 : xind.push_back(xi);
271 : }
272 :
273 : // Find the LCS.
274 695 : compareseq(0, xv.size(), 0, yv.size());
275 :
276 : // Merge edits when possible
277 695 : shift_boundaries(from_lines, xchanged, ychanged);
278 695 : shift_boundaries(to_lines, ychanged, xchanged);
279 :
280 : // Compute the edit operations.
281 695 : xi = yi = 0;
282 2971 : while (xi < n_from || yi < n_to) {
283 : assert(yi < n_to || xchanged[xi]);
284 : assert(xi < n_from || ychanged[yi]);
285 :
286 : // Skip matching "snake".
287 4552 : PointerVector del;
288 4552 : PointerVector add;
289 4552 : PointerVector empty;
290 20418 : while (xi < n_from && yi < n_to && !xchanged[xi] && !ychanged[yi]) {
291 18142 : del.push_back(&from_lines[xi]);
292 18142 : add.push_back(&to_lines[yi]);
293 18142 : ++xi;
294 18142 : ++yi;
295 : }
296 2276 : if (del.size()) {
297 1829 : diff.add_edit(DiffOp<T>(DiffOp<T>::copy, del, add));
298 1829 : del.clear();
299 1829 : add.clear();
300 : }
301 :
302 : // Find deletes & adds.
303 10268 : while (xi < n_from && xchanged[xi])
304 7992 : del.push_back(&from_lines[xi++]);
305 :
306 15227 : while (yi < n_to && ychanged[yi])
307 12951 : add.push_back(&to_lines[yi++]);
308 :
309 2276 : if (del.size() && add.size()) {
310 1102 : diff.add_edit(DiffOp<T>(DiffOp<T>::change, del, add));
311 1174 : } else if (del.size())
312 425 : diff.add_edit(DiffOp<T>(DiffOp<T>::del, del, empty));
313 749 : else if (add.size())
314 467 : diff.add_edit(DiffOp<T>(DiffOp<T>::add, empty, add));
315 : }
316 :
317 695 : done = true;
318 : }
319 :
320 : /* Divide the Largest Common Subsequence (LCS) of the sequences
321 : * [XOFF, XLIM) and [YOFF, YLIM) into NCHUNKS approximately equally
322 : * sized segments.
323 : *
324 : * Returns (LCS, SEPS). LCS is the length of the LCS. SEPS is an
325 : * array of NCHUNKS+1 (X, Y) indexes giving the diving points between
326 : * sub sequences. The first sub-sequence is contained in [X0, X1),
327 : * [Y0, Y1), the second in [X1, X2), [Y1, Y2) and so on. Note
328 : * that (X0, Y0) == (XOFF, YOFF) and
329 : * (X[NCHUNKS], Y[NCHUNKS]) == (XLIM, YLIM).
330 : *
331 : * This function assumes that the first lines of the specified portions
332 : * of the two files do not match, and likewise that the last lines do not
333 : * match. The caller must trim matching lines from the beginning and end
334 : * of the portions it is going to specify.
335 : */
336 : template <typename T>
337 421 : int DiffEngine<T>::diag (int xoff, int xlim, int yoff, int ylim, int nchunks,
338 : IntPairVector & seps)
339 : {
340 : using std::swap;
341 : using std::make_pair;
342 : using std::copy;
343 421 : bool flip = false;
344 842 : MatchesMap ymatches;
345 :
346 421 : if (xlim - xoff > ylim - yoff) {
347 : // Things seems faster (I'm not sure I understand why)
348 : // when the shortest sequence in X.
349 143 : flip = true;
350 143 : swap(xoff, yoff);
351 143 : swap(xlim, ylim);
352 : }
353 :
354 421 : if (flip)
355 3663 : for (int i = ylim - 1; i >= yoff; i--)
356 3520 : ymatches[*xv[i]].push_back(i);
357 : else
358 4430 : for (int i = ylim - 1; i >= yoff; i--)
359 4152 : ymatches[*yv[i]].push_back(i);
360 :
361 421 : int nlines = ylim - yoff;
362 421 : lcs = 0;
363 421 : seq[0] = yoff - 1;
364 421 : in_seq.clear();
365 :
366 : // 2-d array, line major, chunk minor
367 421 : IntVector ymids(nlines * nchunks);
368 :
369 421 : int numer = xlim - xoff + nchunks - 1;
370 421 : int x = xoff, x1, y1;
371 2305 : for (int chunk = 0; chunk < nchunks; chunk++) {
372 1884 : if (chunk > 0)
373 12000 : for (int i = 0; i <= lcs; i++)
374 10537 : ymids.at(i * nchunks + chunk-1) = seq[i];
375 :
376 1884 : x1 = xoff + (int)((numer + (xlim-xoff)*chunk) / nchunks);
377 5484 : for ( ; x < x1; x++) {
378 3600 : const T & line = flip ? *yv[x] : *xv[x];
379 3600 : typename MatchesMap::iterator iter = ymatches.find(line);
380 3600 : if (iter == ymatches.end())
381 288 : continue;
382 3312 : IntVector * pMatches = &(iter->second);
383 3312 : IntVector::iterator y;
384 3312 : int k = 0;
385 :
386 3512 : for (y = pMatches->begin(); y != pMatches->end(); ++y) {
387 3427 : if (!in_seq.contains(*y)) {
388 3227 : k = lcs_pos(*y);
389 : assert(k > 0);
390 3227 : copy(ymids.begin() + (k-1) * nchunks, ymids.begin() + k * nchunks,
391 3227 : ymids.begin() + k * nchunks);
392 3227 : ++y;
393 3227 : break;
394 : }
395 : }
396 65905 : for ( ; y != pMatches->end(); ++y) {
397 62593 : if (*y > seq[k-1]) {
398 : assert(*y < seq[k]);
399 : // Optimization: this is a common case:
400 : // next match is just replacing previous match.
401 36525 : in_seq.erase(seq[k]);
402 36525 : seq[k] = *y;
403 36525 : in_seq.insert(*y);
404 26068 : } else if (!in_seq.contains(*y)) {
405 5752 : k = lcs_pos(*y);
406 : assert(k > 0);
407 5752 : copy(ymids.begin() + (k-1) * nchunks, ymids.begin() + k * nchunks,
408 11504 : ymids.begin() + k * nchunks);
409 : }
410 : }
411 : }
412 : }
413 :
414 421 : seps.clear();
415 421 : seps.resize(nchunks + 1);
416 :
417 564 : seps[0] = flip ? make_pair(yoff, xoff) : make_pair(xoff, yoff);
418 421 : IntVector::iterator ymid = ymids.begin() + lcs * nchunks;
419 1884 : for (int n = 0; n < nchunks - 1; n++) {
420 1463 : x1 = xoff + (numer + (xlim - xoff) * n) / nchunks;
421 1463 : y1 = ymid[n] + 1;
422 1982 : seps[n+1] = flip ? make_pair(y1, x1) : make_pair(x1, y1);
423 : }
424 564 : seps[nchunks] = flip ? make_pair(ylim, xlim) : make_pair(xlim, ylim);
425 842 : return lcs;
426 : }
427 :
428 : template <typename T>
429 8979 : int DiffEngine<T>::lcs_pos (int ypos) {
430 8979 : int end = lcs;
431 8979 : if (end == 0 || ypos > seq[end]) {
432 2564 : seq[++lcs] = ypos;
433 2564 : in_seq.insert(ypos);
434 2564 : return lcs;
435 : }
436 :
437 6415 : int beg = 1;
438 39247 : while (beg < end) {
439 32832 : int mid = (beg + end) / 2;
440 32832 : if ( ypos > seq[mid] )
441 19011 : beg = mid + 1;
442 : else
443 13821 : end = mid;
444 : }
445 :
446 : assert(ypos != seq[end]);
447 :
448 6415 : in_seq.erase(seq[end]);
449 6415 : seq[end] = ypos;
450 6415 : in_seq.insert(ypos);
451 6415 : return end;
452 : }
453 :
454 : /* Find LCS of two sequences.
455 : *
456 : * The results are recorded in the vectors {x,y}changed[], by
457 : * storing a 1 in the element for each line that is an insertion
458 : * or deletion (ie. is not in the LCS).
459 : *
460 : * The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1.
461 : *
462 : * Note that XLIM, YLIM are exclusive bounds.
463 : * All line numbers are origin-0 and discarded lines are not counted.
464 : */
465 : template <typename T>
466 2452 : void DiffEngine<T>::compareseq (int xoff, int xlim, int yoff, int ylim) {
467 : using std::pair;
468 :
469 4904 : IntPairVector seps;
470 : int lcs;
471 :
472 : // Slide down the bottom initial diagonal.
473 4203 : while (xoff < xlim && yoff < ylim && *xv[xoff] == *yv[yoff]) {
474 1751 : ++xoff;
475 1751 : ++yoff;
476 : }
477 :
478 : // Slide up the top initial diagonal.
479 3733 : while (xlim > xoff && ylim > yoff && *xv[xlim - 1] == *yv[ylim - 1]) {
480 1281 : --xlim;
481 1281 : --ylim;
482 : }
483 :
484 2452 : if (xoff == xlim || yoff == ylim)
485 2031 : lcs = 0;
486 : else {
487 : // This is ad hoc but seems to work well.
488 : //nchunks = sqrt(min(xlim - xoff, ylim - yoff) / 2.5);
489 : //nchunks = max(2,min(8,(int)nchunks));
490 421 : int nchunks = std::min(MAX_CHUNKS-1, std::min(xlim - xoff, ylim - yoff)) + 1;
491 421 : lcs = diag(xoff, xlim, yoff, ylim, nchunks, seps);
492 : }
493 :
494 2452 : if (lcs == 0) {
495 : // X and Y sequences have no common subsequence:
496 : // mark all changed.
497 4556 : while (yoff < ylim)
498 2464 : ychanged[yind[yoff++]] = true;
499 3548 : while (xoff < xlim)
500 1456 : xchanged[xind[xoff++]] = true;
501 : } else {
502 : // Use the partitions to split this problem into subproblems.
503 360 : IntPairVector::iterator pt1, pt2;
504 360 : pt1 = pt2 = seps.begin();
505 2117 : while (++pt2 != seps.end()) {
506 1757 : compareseq (pt1->first, pt2->first, pt1->second, pt2->second);
507 1757 : pt1 = pt2;
508 : }
509 : }
510 2452 : }
511 :
512 : /* Adjust inserts/deletes of identical lines to join changes
513 : * as much as possible.
514 : *
515 : * We do something when a run of changed lines include a
516 : * line at one end and has an excluded, identical line at the other.
517 : * We are free to choose which identical line is included.
518 : * `compareseq' usually chooses the one at the beginning,
519 : * but usually it is cleaner to consider the following identical line
520 : * to be the "change".
521 : *
522 : * This is extracted verbatim from analyze.c (GNU diffutils-2.7).
523 : */
524 : template <typename T>
525 1390 : void DiffEngine<T>::shift_boundaries (const ValueVector & lines, BoolVector & changed,
526 : const BoolVector & other_changed)
527 : {
528 1390 : int i = 0;
529 1390 : int j = 0;
530 :
531 1390 : int len = (int)lines.size();
532 1390 : int other_len = (int)other_changed.size();
533 :
534 3110 : while (1) {
535 : /*
536 : * Scan forwards to find beginning of another run of changes.
537 : * Also keep track of the corresponding point in the other file.
538 : *
539 : * Throughout this code, i and j are adjusted together so that
540 : * the first i elements of changed and the first j elements
541 : * of other_changed both contain the same number of zeros
542 : * (unchanged lines).
543 : * Furthermore, j is always kept so that j == other_len or
544 : * other_changed[j] == false.
545 : */
546 13010 : while (j < other_len && other_changed[j])
547 8510 : j++;
548 :
549 40876 : while (i < len && ! changed[i]) {
550 36376 : i++; j++;
551 48822 : while (j < other_len && other_changed[j])
552 12446 : j++;
553 : }
554 :
555 4500 : if (i == len)
556 1390 : break;
557 :
558 3110 : int start = i, runlength, corresponding;
559 :
560 : // Find the end of this run of changes.
561 20931 : while (++i < len && changed[i])
562 17821 : continue;
563 :
564 21 : do {
565 : /*
566 : * Record the length of this run of changes, so that
567 : * we can later determine whether the run has grown.
568 : */
569 3131 : runlength = i - start;
570 :
571 : /*
572 : * Move the changed region back, so long as the
573 : * previous unchanged line matches the last changed one.
574 : * This merges with previous changed regions.
575 : */
576 3287 : while (start > 0 && lines[start - 1] == lines[i - 1]) {
577 156 : changed[--start] = true;
578 156 : changed[--i] = false;
579 173 : while (start > 0 && changed[start - 1])
580 17 : start--;
581 389 : while (other_changed[--j])
582 233 : continue;
583 : }
584 :
585 : /*
586 : * Set CORRESPONDING to the end of the changed run, at the last
587 : * point where it corresponds to a changed run in the other file.
588 : * CORRESPONDING == LEN means no such point has been found.
589 : */
590 3131 : corresponding = j < other_len ? i : len;
591 :
592 : /*
593 : * Move the changed region forward, so long as the
594 : * first changed line matches the following unchanged one.
595 : * This merges with following changed regions.
596 : * Do this second, so that if there are no merges,
597 : * the changed region is moved forward as far as possible.
598 : */
599 3328 : while (i < len && lines[start] == lines[i]) {
600 197 : changed[start++] = false;
601 197 : changed[i++] = true;
602 209 : while (i < len && changed[i])
603 12 : i++;
604 :
605 197 : j++;
606 197 : if (j < other_len && other_changed[j]) {
607 42 : corresponding = i;
608 262 : while (j < other_len && other_changed[j])
609 220 : j++;
610 : }
611 : }
612 3131 : } while (runlength != i - start);
613 :
614 : /*
615 : * If possible, move the fully-merged run of changes
616 : * back to a corresponding run in the other file.
617 : */
618 3243 : while (corresponding < i) {
619 133 : changed[--start] = 1;
620 133 : changed[--i] = 0;
621 133 : while (other_changed[--j])
622 0 : continue;
623 : }
624 : }
625 1390 : }
626 : //-----------------------------------------------------------------------------
627 : // Diff implementation
628 : //-----------------------------------------------------------------------------
629 :
630 : template<typename T>
631 698 : Diff<T>::Diff(const DiffConfig& config, const ValueVector & from_lines, const ValueVector & to_lines)
632 : {
633 1396 : DiffEngine<T> engine(config);
634 698 : engine.diff(from_lines, to_lines, *this);
635 698 : }
636 :
637 : } // namespace wikidiff2
638 :
639 : #endif
|