1 rizwank 1.1 package Algorithm::Diff;
2 use strict;
3 use vars qw($VERSION @EXPORT_OK @ISA @EXPORT);
4 use integer; # see below in _replaceNextLargerWith() for mod to make
5 # if you don't use this
6 require Exporter;
7 @ISA = qw(Exporter);
8 @EXPORT = qw();
9 @EXPORT_OK = qw(LCS diff traverse_sequences traverse_balanced sdiff);
10 $VERSION = sprintf('%d.%02d', (q$Revision: 1.2 $ =~ /\d+/g));
11
12 # McIlroy-Hunt diff algorithm
13 # Adapted from the Smalltalk code of Mario I. Wolczko, <mario@wolczko.com>
14 # by Ned Konz, perl@bike-nomad.com
15
16 =head1 NAME
17
18 Algorithm::Diff - Compute `intelligent' differences between two files / lists
19
20 =head1 SYNOPSIS
21
22 rizwank 1.1 use Algorithm::Diff qw(diff sdiff LCS traverse_sequences
23 traverse_balanced);
24
25 @lcs = LCS( \@seq1, \@seq2 );
26
27 @lcs = LCS( \@seq1, \@seq2, $key_generation_function );
28
29 $lcsref = LCS( \@seq1, \@seq2 );
30
31 $lcsref = LCS( \@seq1, \@seq2, $key_generation_function );
32
33 @diffs = diff( \@seq1, \@seq2 );
34
35 @diffs = diff( \@seq1, \@seq2, $key_generation_function );
36
37 @sdiffs = sdiff( \@seq1, \@seq2 );
38
39 @sdiffs = sdiff( \@seq1, \@seq2, $key_generation_function );
40
41 traverse_sequences( \@seq1, \@seq2,
42 { MATCH => $callback,
43 rizwank 1.1 DISCARD_A => $callback,
44 DISCARD_B => $callback,
45 } );
46
47 traverse_sequences( \@seq1, \@seq2,
48 { MATCH => $callback,
49 DISCARD_A => $callback,
50 DISCARD_B => $callback,
51 },
52 $key_generation_function );
53
54 traverse_balanced( \@seq1, \@seq2,
55 { MATCH => $callback,
56 DISCARD_A => $callback,
57 DISCARD_B => $callback,
58 CHANGE => $callback,
59 } );
60
61 =head1 INTRODUCTION
62
63 (by Mark-Jason Dominus)
64 rizwank 1.1
65 I once read an article written by the authors of C<diff>; they said
66 that they hard worked very hard on the algorithm until they found the
67 right one.
68
69 I think what they ended up using (and I hope someone will correct me,
70 because I am not very confident about this) was the `longest common
71 subsequence' method. in the LCS problem, you have two sequences of
72 items:
73
74 a b c d f g h j q z
75
76 a b c d e f g i j k r x y z
77
78 and you want to find the longest sequence of items that is present in
79 both original sequences in the same order. That is, you want to find
80 a new sequence I<S> which can be obtained from the first sequence by
81 deleting some items, and from the secend sequence by deleting other
82 items. You also want I<S> to be as long as possible. In this case
83 I<S> is
84
85 rizwank 1.1 a b c d f g j z
86
87 From there it's only a small step to get diff-like output:
88
89 e h i k q r x y
90 + - + + - + + +
91
92 This module solves the LCS problem. It also includes a canned
93 function to generate C<diff>-like output.
94
95 It might seem from the example above that the LCS of two sequences is
96 always pretty obvious, but that's not always the case, especially when
97 the two sequences have many repeated elements. For example, consider
98
99 a x b y c z p d q
100 a b c a x b y c z
101
102 A naive approach might start by matching up the C<a> and C<b> that
103 appear at the beginning of each sequence, like this:
104
105 a x b y c z p d q
106 rizwank 1.1 a b c a b y c z
107
108 This finds the common subsequence C<a b c z>. But actually, the LCS
109 is C<a x b y c z>:
110
111 a x b y c z p d q
112 a b c a x b y c z
113
114 =head1 USAGE
115
116 This module provides three exportable functions, which we'll deal with in
117 ascending order of difficulty: C<LCS>,
118 C<diff>, C<sdiff>, C<traverse_sequences>, and C<traverse_balanced>.
119
120 =head2 C<LCS>
121
122 Given references to two lists of items, LCS returns an array containing their
123 longest common subsequence. In scalar context, it returns a reference to
124 such a list.
125
126 @lcs = LCS( \@seq1, \@seq2 );
127 rizwank 1.1 $lcsref = LCS( \@seq1, \@seq2 );
128
129 C<LCS> may be passed an optional third parameter; this is a CODE
130 reference to a key generation function. See L</KEY GENERATION
131 FUNCTIONS>.
132
133 @lcs = LCS( \@seq1, \@seq2, $keyGen );
134 $lcsref = LCS( \@seq1, \@seq2, $keyGen );
135
136 Additional parameters, if any, will be passed to the key generation
137 routine.
138
139 =head2 C<diff>
140
141 @diffs = diff( \@seq1, \@seq2 );
142 $diffs_ref = diff( \@seq1, \@seq2 );
143
144 C<diff> computes the smallest set of additions and deletions necessary
145 to turn the first sequence into the second, and returns a description
146 of these changes. The description is a list of I<hunks>; each hunk
147 represents a contiguous section of items which should be added,
148 rizwank 1.1 deleted, or replaced. The return value of C<diff> is a list of
149 hunks, or, in scalar context, a reference to such a list.
150
151 Here is an example: The diff of the following two sequences:
152
153 a b c e h j l m n p
154 b c d e f j k l m r s t
155
156 Result:
157
158 [
159 [ [ '-', 0, 'a' ] ],
160
161 [ [ '+', 2, 'd' ] ],
162
163 [ [ '-', 4, 'h' ] ,
164 [ '+', 4, 'f' ] ],
165
166 [ [ '+', 6, 'k' ] ],
167
168 [ [ '-', 8, 'n' ],
169 rizwank 1.1 [ '-', 9, 'p' ],
170 [ '+', 9, 'r' ],
171 [ '+', 10, 's' ],
172 [ '+', 11, 't' ],
173 ]
174 ]
175
176 There are five hunks here. The first hunk says that the C<a> at
177 position 0 of the first sequence should be deleted (C<->). The second
178 hunk says that the C<d> at position 2 of the second sequence should
179 be inserted (C<+>). The third hunk says that the C<h> at position 4
180 of the first sequence should be removed and replaced with the C<f>
181 from position 4 of the second sequence. The other two hunks similarly.
182
183 C<diff> may be passed an optional third parameter; this is a CODE
184 reference to a key generation function. See L</KEY GENERATION
185 FUNCTIONS>.
186
187 Additional parameters, if any, will be passed to the key generation
188 routine.
189
190 rizwank 1.1 =head2 C<sdiff>
191
192 @sdiffs = sdiff( \@seq1, \@seq2 );
193 $sdiffs_ref = sdiff( \@seq1, \@seq2 );
194
195 C<sdiff> computes all necessary components to show two sequences
196 and their minimized differences side by side, just like the
197 Unix-utility I<sdiff> does:
198
199 same same
200 before | after
201 old < -
202 - > new
203
204 It returns a list of array refs, each pointing to an array of
205 display instructions. In scalar context it returns a reference
206 to such a list.
207
208 Display instructions consist of three elements: A modifier indicator
209 (C<+>: Element added, C<->: Element removed, C<u>: Element unmodified,
210 C<c>: Element changed) and the value of the old and new elements, to
211 rizwank 1.1 be displayed side by side.
212
213 An C<sdiff> of the following two sequences:
214
215 a b c e h j l m n p
216 b c d e f j k l m r s t
217
218 results in
219
220 [ [ '-', 'a', '' ],
221 [ 'u', 'b', 'b' ],
222 [ 'u', 'c', 'c' ],
223 [ '+', '', 'd' ],
224 [ 'u', 'e', 'e' ],
225 [ 'c', 'h', 'f' ],
226 [ 'u', 'j', 'j' ],
227 [ '+', '', 'k' ],
228 [ 'u', 'l', 'l' ],
229 [ 'u', 'm', 'm' ],
230 [ 'c', 'n', 'r' ],
231 [ 'c', 'p', 's' ],
232 rizwank 1.1 [ '+', '', 't' ] ]
233
234 C<sdiff> may be passed an optional third parameter; this is a CODE
235 reference to a key generation function. See L</KEY GENERATION
236 FUNCTIONS>.
237
238 Additional parameters, if any, will be passed to the key generation
239 routine.
240
241 =head2 C<traverse_sequences>
242
243 C<traverse_sequences> is the most general facility provided by this
244 module; C<diff> and C<LCS> are implemented as calls to it.
245
246 Imagine that there are two arrows. Arrow A points to an element of sequence A,
247 and arrow B points to an element of the sequence B. Initially, the arrows
248 point to the first elements of the respective sequences. C<traverse_sequences>
249 will advance the arrows through the sequences one element at a time, calling an
250 appropriate user-specified callback function before each advance. It
251 willadvance the arrows in such a way that if there are equal elements C<$A[$i]>
252 and C<$B[$j]> which are equal and which are part of the LCS, there will be
253 rizwank 1.1 some moment during the execution of C<traverse_sequences> when arrow A is
254 pointing to C<$A[$i]> and arrow B is pointing to C<$B[$j]>. When this happens,
255 C<traverse_sequences> will call the C<MATCH> callback function and then it will
256 advance both arrows.
257
258 Otherwise, one of the arrows is pointing to an element of its sequence that is
259 not part of the LCS. C<traverse_sequences> will advance that arrow and will
260 call the C<DISCARD_A> or the C<DISCARD_B> callback, depending on which arrow it
261 advanced. If both arrows point to elements that are not part of the LCS, then
262 C<traverse_sequences> will advance one of them and call the appropriate
263 callback, but it is not specified which it will call.
264
265 The arguments to C<traverse_sequences> are the two sequences to traverse, and a
266 hash which specifies the callback functions, like this:
267
268 traverse_sequences( \@seq1, \@seq2,
269 { MATCH => $callback_1,
270 DISCARD_A => $callback_2,
271 DISCARD_B => $callback_3,
272 } );
273
274 rizwank 1.1 Callbacks for MATCH, DISCARD_A, and DISCARD_B are invoked with at least the
275 indices of the two arrows as their arguments. They are not expected to return
276 any values. If a callback is omitted from the table, it is not called.
277
278 Callbacks for A_FINISHED and B_FINISHED are invoked with at least the
279 corresponding index in A or B.
280
281 If arrow A reaches the end of its sequence, before arrow B does,
282 C<traverse_sequences> will call the C<A_FINISHED> callback when it advances
283 arrow B, if there is such a function; if not it will call C<DISCARD_B> instead.
284 Similarly if arrow B finishes first. C<traverse_sequences> returns when both
285 arrows are at the ends of their respective sequences. It returns true on
286 success and false on failure. At present there is no way to fail.
287
288 C<traverse_sequences> may be passed an optional fourth parameter; this is a
289 CODE reference to a key generation function. See L</KEY GENERATION FUNCTIONS>.
290
291 Additional parameters, if any, will be passed to the key generation function.
292
293 =head2 C<traverse_balanced>
294
295 rizwank 1.1 C<traverse_balanced> is an alternative to C<traverse_sequences>. It
296 uses a different algorithm to iterate through the entries in the
297 computed LCS. Instead of sticking to one side and showing element changes
298 as insertions and deletions only, it will jump back and forth between
299 the two sequences and report I<changes> occurring as deletions on one
300 side followed immediatly by an insertion on the other side.
301
302 In addition to the
303 C<DISCARD_A>,
304 C<DISCARD_B>, and
305 C<MATCH>
306 callbacks supported by C<traverse_sequences>, C<traverse_balanced> supports
307 a C<CHANGE> callback indicating that one element got C<replaced> by another:
308
309 traverse_sequences( \@seq1, \@seq2,
310 { MATCH => $callback_1,
311 DISCARD_A => $callback_2,
312 DISCARD_B => $callback_3,
313 CHANGE => $callback_4,
314 } );
315
316 rizwank 1.1 If no C<CHANGE> callback is specified, C<traverse_balanced>
317 will map C<CHANGE> events to C<DISCARD_A> and C<DISCARD_B> actions,
318 therefore resulting in a similar behaviour as C<traverse_sequences>
319 with different order of events.
320
321 C<traverse_balanced> might be a bit slower than C<traverse_sequences>,
322 noticable only while processing huge amounts of data.
323
324 The C<sdiff> function of this module
325 is implemented as call to C<traverse_balanced>.
326
327 =head1 KEY GENERATION FUNCTIONS
328
329 C<diff>, C<LCS>, and C<traverse_sequences> accept an optional last parameter.
330 This is a CODE reference to a key generating (hashing) function that should
331 return a string that uniquely identifies a given element. It should be the
332 case that if two elements are to be considered equal, their keys should be the
333 same (and the other way around). If no key generation function is provided,
334 the key will be the element as a string.
335
336 By default, comparisons will use "eq" and elements will be turned into keys
337 rizwank 1.1 using the default stringizing operator '""'.
338
339 Where this is important is when you're comparing something other than strings.
340 If it is the case that you have multiple different objects that should be
341 considered to be equal, you should supply a key generation function. Otherwise,
342 you have to make sure that your arrays contain unique references.
343
344 For instance, consider this example:
345
346 package Person;
347
348 sub new
349 {
350 my $package = shift;
351 return bless { name => '', ssn => '', @_ }, $package;
352 }
353
354 sub clone
355 {
356 my $old = shift;
357 my $new = bless { %$old }, ref($old);
358 rizwank 1.1 }
359
360 sub hash
361 {
362 return shift()->{'ssn'};
363 }
364
365 my $person1 = Person->new( name => 'Joe', ssn => '123-45-6789' );
366 my $person2 = Person->new( name => 'Mary', ssn => '123-47-0000' );
367 my $person3 = Person->new( name => 'Pete', ssn => '999-45-2222' );
368 my $person4 = Person->new( name => 'Peggy', ssn => '123-45-9999' );
369 my $person5 = Person->new( name => 'Frank', ssn => '000-45-9999' );
370
371 If you did this:
372
373 my $array1 = [ $person1, $person2, $person4 ];
374 my $array2 = [ $person1, $person3, $person4, $person5 ];
375 Algorithm::Diff::diff( $array1, $array2 );
376
377 everything would work out OK (each of the objects would be converted
378 into a string like "Person=HASH(0x82425b0)" for comparison).
379 rizwank 1.1
380 But if you did this:
381
382 my $array1 = [ $person1, $person2, $person4 ];
383 my $array2 = [ $person1, $person3, $person4->clone(), $person5 ];
384 Algorithm::Diff::diff( $array1, $array2 );
385
386 $person4 and $person4->clone() (which have the same name and SSN)
387 would be seen as different objects. If you wanted them to be considered
388 equivalent, you would have to pass in a key generation function:
389
390 my $array1 = [ $person1, $person2, $person4 ];
391 my $array2 = [ $person1, $person3, $person4->clone(), $person5 ];
392 Algorithm::Diff::diff( $array1, $array2, \&Person::hash );
393
394 This would use the 'ssn' field in each Person as a comparison key, and
395 so would consider $person4 and $person4->clone() as equal.
396
397 You may also pass additional parameters to the key generation function
398 if you wish.
399
400 rizwank 1.1 =head1 AUTHOR
401
402 This version by Ned Konz, perl@bike-nomad.com
403
404 =head1 LICENSE
405
406 Copyright (c) 2000-2002 Ned Konz. All rights reserved.
407 This program is free software;
408 you can redistribute it and/or modify it under the same terms
409 as Perl itself.
410
411 =head1 CREDITS
412
413 Versions through 0.59 (and much of this documentation) were written by:
414
415 Mark-Jason Dominus, mjd-perl-diff@plover.com
416
417 This version borrows the documentation and names of the routines
418 from Mark-Jason's, but has all new code in Diff.pm.
419
420 This code was adapted from the Smalltalk code of
421 rizwank 1.1 Mario Wolczko <mario@wolczko.com>, which is available at
422 ftp://st.cs.uiuc.edu/pub/Smalltalk/MANCHESTER/manchester/4.0/diff.st
423
424 C<sdiff> and C<traverse_balanced> were written by Mike Schilli
425 <m@perlmeister.com>.
426
427 The algorithm is that described in
428 I<A Fast Algorithm for Computing Longest Common Subsequences>,
429 CACM, vol.20, no.5, pp.350-353, May 1977, with a few
430 minor improvements to improve the speed.
431
432 =cut
433
434 # Create a hash that maps each element of $aCollection to the set of positions
435 # it occupies in $aCollection, restricted to the elements within the range of
436 # indexes specified by $start and $end.
437 # The fourth parameter is a subroutine reference that will be called to
438 # generate a string to use as a key.
439 # Additional parameters, if any, will be passed to this subroutine.
440 #
441 # my $hashRef = _withPositionsOfInInterval( \@array, $start, $end, $keyGen );
442 rizwank 1.1
443 sub _withPositionsOfInInterval
444 {
445 my $aCollection = shift; # array ref
446 my $start = shift;
447 my $end = shift;
448 my $keyGen = shift;
449 my %d;
450 my $index;
451 for ( $index = $start ; $index <= $end ; $index++ )
452 {
453 my $element = $aCollection->[$index];
454 my $key = &$keyGen( $element, @_ );
455 if ( exists( $d{$key} ) )
456 {
457 unshift ( @{ $d{$key} }, $index );
458 }
459 else
460 {
461 $d{$key} = [$index];
462 }
463 rizwank 1.1 }
464 return wantarray ? %d : \%d;
465 }
466
467 # Find the place at which aValue would normally be inserted into the array. If
468 # that place is already occupied by aValue, do nothing, and return undef. If
469 # the place does not exist (i.e., it is off the end of the array), add it to
470 # the end, otherwise replace the element at that point with aValue.
471 # It is assumed that the array's values are numeric.
472 # This is where the bulk (75%) of the time is spent in this module, so try to
473 # make it fast!
474
475 sub _replaceNextLargerWith
476 {
477 my ( $array, $aValue, $high ) = @_;
478 $high ||= $#$array;
479
480 # off the end?
481 if ( $high == -1 || $aValue > $array->[-1] )
482 {
483 push ( @$array, $aValue );
484 rizwank 1.1 return $high + 1;
485 }
486
487 # binary search for insertion point...
488 my $low = 0;
489 my $index;
490 my $found;
491 while ( $low <= $high )
492 {
493 $index = ( $high + $low ) / 2;
494
495 # $index = int(( $high + $low ) / 2); # without 'use integer'
496 $found = $array->[$index];
497
498 if ( $aValue == $found )
499 {
500 return undef;
501 }
502 elsif ( $aValue > $found )
503 {
504 $low = $index + 1;
505 rizwank 1.1 }
506 else
507 {
508 $high = $index - 1;
509 }
510 }
511
512 # now insertion point is in $low.
513 $array->[$low] = $aValue; # overwrite next larger
514 return $low;
515 }
516
517 # This method computes the longest common subsequence in $a and $b.
518
519 # Result is array or ref, whose contents is such that
520 # $a->[ $i ] == $b->[ $result[ $i ] ]
521 # foreach $i in ( 0 .. $#result ) if $result[ $i ] is defined.
522
523 # An additional argument may be passed; this is a hash or key generating
524 # function that should return a string that uniquely identifies the given
525 # element. It should be the case that if the key is the same, the elements
526 rizwank 1.1 # will compare the same. If this parameter is undef or missing, the key
527 # will be the element as a string.
528
529 # By default, comparisons will use "eq" and elements will be turned into keys
530 # using the default stringizing operator '""'.
531
532 # Additional parameters, if any, will be passed to the key generation routine.
533
534 sub _longestCommonSubsequence
535 {
536 my $a = shift; # array ref
537 my $b = shift; # array ref
538 my $keyGen = shift; # code ref
539 my $compare; # code ref
540
541 # set up code refs
542 # Note that these are optimized.
543 if ( !defined($keyGen) ) # optimize for strings
544 {
545 $keyGen = sub { $_[0] };
546 $compare = sub { my ( $a, $b ) = @_; $a eq $b };
547 rizwank 1.1 }
548 else
549 {
550 $compare = sub {
551 my $a = shift;
552 my $b = shift;
553 &$keyGen( $a, @_ ) eq &$keyGen( $b, @_ );
554 };
555 }
556
557 my ( $aStart, $aFinish, $bStart, $bFinish, $matchVector ) =
558 ( 0, $#$a, 0, $#$b, [] );
559
560 # First we prune off any common elements at the beginning
561 while ( $aStart <= $aFinish
562 and $bStart <= $bFinish
563 and &$compare( $a->[$aStart], $b->[$bStart], @_ ) )
564 {
565 $matchVector->[ $aStart++ ] = $bStart++;
566 }
567
568 rizwank 1.1 # now the end
569 while ( $aStart <= $aFinish
570 and $bStart <= $bFinish
571 and &$compare( $a->[$aFinish], $b->[$bFinish], @_ ) )
572 {
573 $matchVector->[ $aFinish-- ] = $bFinish--;
574 }
575
576 # Now compute the equivalence classes of positions of elements
577 my $bMatches =
578 _withPositionsOfInInterval( $b, $bStart, $bFinish, $keyGen, @_ );
579 my $thresh = [];
580 my $links = [];
581
582 my ( $i, $ai, $j, $k );
583 for ( $i = $aStart ; $i <= $aFinish ; $i++ )
584 {
585 $ai = &$keyGen( $a->[$i], @_ );
586 if ( exists( $bMatches->{$ai} ) )
587 {
588 $k = 0;
589 rizwank 1.1 for $j ( @{ $bMatches->{$ai} } )
590 {
591
592 # optimization: most of the time this will be true
593 if ( $k and $thresh->[$k] > $j and $thresh->[ $k - 1 ] < $j )
594 {
595 $thresh->[$k] = $j;
596 }
597 else
598 {
599 $k = _replaceNextLargerWith( $thresh, $j, $k );
600 }
601
602 # oddly, it's faster to always test this (CPU cache?).
603 if ( defined($k) )
604 {
605 $links->[$k] =
606 [ ( $k ? $links->[ $k - 1 ] : undef ), $i, $j ];
607 }
608 }
609 }
610 rizwank 1.1 }
611
612 if (@$thresh)
613 {
614 for ( my $link = $links->[$#$thresh] ; $link ; $link = $link->[0] )
615 {
616 $matchVector->[ $link->[1] ] = $link->[2];
617 }
618 }
619
620 return wantarray ? @$matchVector : $matchVector;
621 }
622
623 sub traverse_sequences
624 {
625 my $a = shift; # array ref
626 my $b = shift; # array ref
627 my $callbacks = shift || {};
628 my $keyGen = shift;
629 my $matchCallback = $callbacks->{'MATCH'} || sub { };
630 my $discardACallback = $callbacks->{'DISCARD_A'} || sub { };
631 rizwank 1.1 my $finishedACallback = $callbacks->{'A_FINISHED'};
632 my $discardBCallback = $callbacks->{'DISCARD_B'} || sub { };
633 my $finishedBCallback = $callbacks->{'B_FINISHED'};
634 my $matchVector = _longestCommonSubsequence( $a, $b, $keyGen, @_ );
635
636 # Process all the lines in @$matchVector
637 my $lastA = $#$a;
638 my $lastB = $#$b;
639 my $bi = 0;
640 my $ai;
641
642 for ( $ai = 0 ; $ai <= $#$matchVector ; $ai++ )
643 {
644 my $bLine = $matchVector->[$ai];
645 if ( defined($bLine) ) # matched
646 {
647 &$discardBCallback( $ai, $bi++, @_ ) while $bi < $bLine;
648 &$matchCallback( $ai, $bi++, @_ );
649 }
650 else
651 {
652 rizwank 1.1 &$discardACallback( $ai, $bi, @_ );
653 }
654 }
655
656 # The last entry (if any) processed was a match.
657 # $ai and $bi point just past the last matching lines in their sequences.
658
659 while ( $ai <= $lastA or $bi <= $lastB )
660 {
661
662 # last A?
663 if ( $ai == $lastA + 1 and $bi <= $lastB )
664 {
665 if ( defined($finishedACallback) )
666 {
667 &$finishedACallback( $lastA, @_ );
668 $finishedACallback = undef;
669 }
670 else
671 {
672 &$discardBCallback( $ai, $bi++, @_ ) while $bi <= $lastB;
673 rizwank 1.1 }
674 }
675
676 # last B?
677 if ( $bi == $lastB + 1 and $ai <= $lastA )
678 {
679 if ( defined($finishedBCallback) )
680 {
681 &$finishedBCallback( $lastB, @_ );
682 $finishedBCallback = undef;
683 }
684 else
685 {
686 &$discardACallback( $ai++, $bi, @_ ) while $ai <= $lastA;
687 }
688 }
689
690 &$discardACallback( $ai++, $bi, @_ ) if $ai <= $lastA;
691 &$discardBCallback( $ai, $bi++, @_ ) if $bi <= $lastB;
692 }
693
694 rizwank 1.1 return 1;
695 }
696
697 sub traverse_balanced
698 {
699 my $a = shift; # array ref
700 my $b = shift; # array ref
701 my $callbacks = shift || {};
702 my $keyGen = shift;
703 my $matchCallback = $callbacks->{'MATCH'} || sub { };
704 my $discardACallback = $callbacks->{'DISCARD_A'} || sub { };
705 my $discardBCallback = $callbacks->{'DISCARD_B'} || sub { };
706 my $changeCallback = $callbacks->{'CHANGE'};
707 my $matchVector = _longestCommonSubsequence( $a, $b, $keyGen, @_ );
708
709 # Process all the lines in match vector
710 my $lastA = $#$a;
711 my $lastB = $#$b;
712 my $bi = 0;
713 my $ai = 0;
714 my $ma = -1;
715 rizwank 1.1 my $mb;
716
717 while (1)
718 {
719
720 # Find next match indices $ma and $mb
721 do { $ma++ } while ( $ma <= $#$matchVector && !defined $matchVector->[$ma] );
722
723 last if $ma > $#$matchVector; # end of matchVector?
724 $mb = $matchVector->[$ma];
725
726 # Proceed with discard a/b or change events until
727 # next match
728 while ( $ai < $ma || $bi < $mb )
729 {
730
731 if ( $ai < $ma && $bi < $mb )
732 {
733
734 # Change
735 if ( defined $changeCallback )
736 rizwank 1.1 {
737 &$changeCallback( $ai++, $bi++, @_ );
738 }
739 else
740 {
741 &$discardACallback( $ai++, $bi, @_ );
742 &$discardBCallback( $ai, $bi++, @_ );
743 }
744 }
745 elsif ( $ai < $ma )
746 {
747 &$discardACallback( $ai++, $bi, @_ );
748 }
749 else
750 {
751
752 # $bi < $mb
753 &$discardBCallback( $ai, $bi++, @_ );
754 }
755 }
756
757 rizwank 1.1 # Match
758 &$matchCallback( $ai++, $bi++, @_ );
759 }
760
761 while ( $ai <= $lastA || $bi <= $lastB )
762 {
763 if ( $ai <= $lastA && $bi <= $lastB )
764 {
765
766 # Change
767 if ( defined $changeCallback )
768 {
769 &$changeCallback( $ai++, $bi++, @_ );
770 }
771 else
772 {
773 &$discardACallback( $ai++, $bi, @_ );
774 &$discardBCallback( $ai, $bi++, @_ );
775 }
776 }
777 elsif ( $ai <= $lastA )
778 rizwank 1.1 {
779 &$discardACallback( $ai++, $bi, @_ );
780 }
781 else
782 {
783
784 # $bi <= $lastB
785 &$discardBCallback( $ai, $bi++, @_ );
786 }
787 }
788
789 return 1;
790 }
791
792 sub LCS
793 {
794 my $a = shift; # array ref
795 my $matchVector = _longestCommonSubsequence( $a, @_ );
796 my @retval;
797 my $i;
798 for ( $i = 0 ; $i <= $#$matchVector ; $i++ )
799 rizwank 1.1 {
800 if ( defined( $matchVector->[$i] ) )
801 {
802 push ( @retval, $a->[$i] );
803 }
804 }
805 return wantarray ? @retval : \@retval;
806 }
807
808 sub diff
809 {
810 my $a = shift; # array ref
811 my $b = shift; # array ref
812 my $retval = [];
813 my $hunk = [];
814 my $discard = sub { push ( @$hunk, [ '-', $_[0], $a->[ $_[0] ] ] ) };
815 my $add = sub { push ( @$hunk, [ '+', $_[1], $b->[ $_[1] ] ] ) };
816 my $match = sub { push ( @$retval, $hunk ) if scalar(@$hunk); $hunk = [] };
817 traverse_sequences( $a, $b,
818 { MATCH => $match, DISCARD_A => $discard, DISCARD_B => $add }, @_ );
819 &$match();
820 rizwank 1.1 return wantarray ? @$retval : $retval;
821 }
822
823 sub sdiff
824 {
825 my $a = shift; # array ref
826 my $b = shift; # array ref
827 my $retval = [];
828 my $discard = sub { push ( @$retval, [ '-', $a->[ $_[0] ], "" ] ) };
829 my $add = sub { push ( @$retval, [ '+', "", $b->[ $_[1] ] ] ) };
830 my $change = sub {
831 push ( @$retval, [ 'c', $a->[ $_[0] ], $b->[ $_[1] ] ] );
832 };
833 my $match = sub {
834 push ( @$retval, [ 'u', $a->[ $_[0] ], $b->[ $_[1] ] ] );
835 };
836 traverse_balanced(
837 $a,
838 $b,
839 {
840 MATCH => $match,
841 rizwank 1.1 DISCARD_A => $discard,
842 DISCARD_B => $add,
843 CHANGE => $change,
844 },
845 @_
846 );
847 return wantarray ? @$retval : $retval;
848 }
849
850 1;
|