Branch data Line data Source code
1 : : /*
2 : : ** Copyright (c) 2007 D. Richard Hipp
3 : : **
4 : : ** This program is free software; you can redistribute it and/or
5 : : ** modify it under the terms of the Simplified BSD License (also
6 : : ** known as the "2-Clause License" or "FreeBSD License".)
7 : :
8 : : ** This program is distributed in the hope that it will be useful,
9 : : ** but without any warranty; without even the implied warranty of
10 : : ** merchantability or fitness for a particular purpose.
11 : : **
12 : : ** Author contact information:
13 : : ** drh@hwaci.com
14 : : ** http://www.hwaci.com/drh/
15 : : **
16 : : *******************************************************************************
17 : : **
18 : : ** This file contains code used to compute a "diff" between two
19 : : ** text files.
20 : : */
21 : : #include <sys/types.h>
22 : :
23 : : #include <string.h>
24 : : #include <stdlib.h>
25 : :
26 : : #include "private/utils.h"
27 : : #include "xmalloc.h"
28 : :
29 : : /*
30 : : ** Maximum length of a line in a text file, in bytes. (2**13 = 8192 bytes)
31 : : */
32 : : #define LENGTH_MASK_SZ 13
33 : : #define LENGTH_MASK ((1<<LENGTH_MASK_SZ)-1)
34 : :
35 : : /*
36 : : ** Information about each line of a file being diffed.
37 : : **
38 : : ** The lower LENGTH_MASK_SZ bits of the hash (DLine.h) are the length
39 : : ** of the line. If any line is longer than LENGTH_MASK characters,
40 : : ** the file is considered binary.
41 : : */
42 : : typedef struct DLine DLine;
43 : : struct DLine {
44 : : const char *z; /* The text of the line */
45 : : unsigned int h; /* Hash of the line */
46 : : unsigned short indent; /* Indent of the line. Only !=0 with -w/-Z option */
47 : : unsigned short n; /* number of bytes */
48 : : unsigned int iNext; /* 1+(Index of next line with same the same hash) */
49 : :
50 : : /* an array of DLine elements serves two purposes. The fields
51 : : ** above are one per line of input text. But each entry is also
52 : : ** a bucket in a hash table, as follows: */
53 : : unsigned int iHash; /* 1+(first entry in the hash chain) */
54 : : };
55 : :
56 : : /*
57 : : ** Length of a dline
58 : : */
59 : : #define LENGTH(X) ((X)->n)
60 : :
61 : : /*
62 : : ** A context for running a raw diff.
63 : : **
64 : : ** The aEdit[] array describes the raw diff. Each triple of integers in
65 : : ** aEdit[] means:
66 : : **
67 : : ** (1) COPY: Number of lines aFrom and aTo have in common
68 : : ** (2) DELETE: Number of lines found only in aFrom
69 : : ** (3) INSERT: Number of lines found only in aTo
70 : : **
71 : : ** The triples repeat until all lines of both aFrom and aTo are accounted
72 : : ** for.
73 : : */
74 : : typedef struct DContext DContext;
75 : : struct DContext {
76 : : int *aEdit; /* Array of copy/delete/insert triples */
77 : : int nEdit; /* Number of integers (3x num of triples) in aEdit[] */
78 : : int nEditAlloc; /* Space allocated for aEdit[] */
79 : : DLine *aFrom; /* File on left side of the diff */
80 : : int nFrom; /* Number of lines in aFrom[] */
81 : : DLine *aTo; /* File on right side of the diff */
82 : : int nTo; /* Number of lines in aTo[] */
83 : : int (*same_fn)(const DLine *, const DLine *); /* Function to be used for comparing */
84 : : };
85 : :
86 : : /*
87 : : ** Return an array of DLine objects containing a pointer to the
88 : : ** start of each line and a hash of that line. The lower
89 : : ** bits of the hash store the length of each line.
90 : : **
91 : : ** Trailing whitespace is removed from each line. 2010-08-20: Not any
92 : : ** more. If trailing whitespace is ignored, the "patch" command gets
93 : : ** confused by the diff output. Ticket [a9f7b23c2e376af5b0e5b]
94 : : **
95 : : ** Return 0 if the file is binary or contains a line that is
96 : : ** too long.
97 : : **
98 : : ** Profiling show that in most cases this routine consumes the bulk of
99 : : ** the CPU time on a diff.
100 : : */
101 : 32 : static DLine *break_into_lines(char *z, int *pnLine){
102 : : int nLine, i, j, k, s, x;
103 : : unsigned int h, h2;
104 : : DLine *a;
105 : :
106 : 32 : int n = strlen(z);
107 : : /* Count the number of lines. Allocate space to hold
108 : : ** the returned array.
109 : : */
110 [ + + ]: 488 : for(i=j=0, nLine=1; i<n; i++, j++){
111 : 456 : int c = z[i];
112 [ + - ]: 456 : if( c==0 ){
113 : 0 : return 0;
114 : : }
115 [ + + + + ]: 456 : if( c=='\n' && z[i+1]!=0 ){
116 : 35 : nLine++;
117 [ - + ]: 35 : if( j>LENGTH_MASK ){
118 : 0 : return 0;
119 : : }
120 : 35 : j = 0;
121 : 35 : }
122 : 456 : }
123 [ - + ]: 32 : if( j>LENGTH_MASK ){
124 : 0 : return 0;
125 : : }
126 : 32 : a = xcalloc(nLine, sizeof(a[0]) );
127 [ + - ]: 32 : if( n==0 ){
128 : 0 : *pnLine = 0;
129 : 0 : return a;
130 : : }
131 : :
132 : : /* Fill in the array */
133 [ + + ]: 99 : for(i=0; i<nLine; i++){
134 [ + + + + : 467 : for(j=0; z[j] && z[j]!='\n'; j++){}
+ + ]
135 : 67 : a[i].z = z;
136 : 67 : k = j;
137 : 67 : a[i].n = k;
138 : 67 : s = 0;
139 [ + + ]: 467 : for(h=0, x=s; x<k; x++){
140 : 400 : h = h ^ (h<<2) ^ z[x];
141 : 400 : }
142 : 67 : a[i].indent = s;
143 : 67 : a[i].h = h = (h<<LENGTH_MASK_SZ) | (k-s);
144 : 67 : h2 = h % nLine;
145 : 67 : a[i].iNext = a[h2].iHash;
146 : 67 : a[h2].iHash = i+1;
147 : 67 : z += j+1;
148 : 67 : }
149 : :
150 : : /* Return results */
151 : 32 : *pnLine = nLine;
152 : 32 : return a;
153 : 32 : }
154 : :
155 : : /*
156 : : ** Return true if two DLine elements are identical.
157 : : */
158 : 46 : static int same_dline(const DLine *pA, const DLine *pB){
159 [ + + - + ]: 46 : return pA->h==pB->h && memcmp(pA->z,pB->z, pA->h&LENGTH_MASK)==0;
160 : : }
161 : :
162 : :
163 : : /*
164 : : ** Minimum of two values
165 : : */
166 [ # # ]: 0 : static int minInt(int a, int b){ return a<b ? a : b; }
167 : :
168 : : /*
169 : : ** Compute the optimal longest common subsequence (LCS) using an
170 : : ** exhaustive search. This version of the LCS is only used for
171 : : ** shorter input strings since runtime is O(N*N) where N is the
172 : : ** input string length.
173 : : */
174 : 4 : static void optimalLCS(
175 : : DContext *p, /* Two files being compared */
176 : : int iS1, int iE1, /* Range of lines in p->aFrom[] */
177 : : int iS2, int iE2, /* Range of lines in p->aTo[] */
178 : : int *piSX, int *piEX, /* Write p->aFrom[] common segment here */
179 : : int *piSY, int *piEY /* Write p->aTo[] common segment here */
180 : : ){
181 : 4 : int mxLength = 0; /* Length of longest common subsequence */
182 : : int i, j; /* Loop counters */
183 : : int k; /* Length of a candidate subsequence */
184 : 4 : int iSXb = iS1; /* Best match so far */
185 : 4 : int iSYb = iS2; /* Best match so far */
186 : :
187 [ + + ]: 8 : for(i=iS1; i<iE1-mxLength; i++){
188 [ + + ]: 8 : for(j=iS2; j<iE2-mxLength; j++){
189 [ - + ]: 4 : if( !p->same_fn(&p->aFrom[i], &p->aTo[j]) ) continue;
190 [ # # # # ]: 0 : if( mxLength && !p->same_fn(&p->aFrom[i+mxLength], &p->aTo[j+mxLength]) ){
191 : 0 : continue;
192 : : }
193 : 0 : k = 1;
194 [ # # # # : 0 : while( i+k<iE1 && j+k<iE2 && p->same_fn(&p->aFrom[i+k],&p->aTo[j+k]) ){
# # # # ]
195 : 0 : k++;
196 : : }
197 [ # # ]: 0 : if( k>mxLength ){
198 : 0 : iSXb = i;
199 : 0 : iSYb = j;
200 : 0 : mxLength = k;
201 : 0 : }
202 : 0 : }
203 : 4 : }
204 : 4 : *piSX = iSXb;
205 : 4 : *piEX = iSXb + mxLength;
206 : 4 : *piSY = iSYb;
207 : 4 : *piEY = iSYb + mxLength;
208 : 4 : }
209 : :
210 : : /*
211 : : ** Compare two blocks of text on lines iS1 through iE1-1 of the aFrom[]
212 : : ** file and lines iS2 through iE2-1 of the aTo[] file. Locate a sequence
213 : : ** of lines in these two blocks that are exactly the same. Return
214 : : ** the bounds of the matching sequence.
215 : : **
216 : : ** If there are two or more possible answers of the same length, the
217 : : ** returned sequence should be the one closest to the center of the
218 : : ** input range.
219 : : **
220 : : ** Ideally, the common sequence should be the longest possible common
221 : : ** sequence. However, an exact computation of LCS is O(N*N) which is
222 : : ** way too slow for larger files. So this routine uses an O(N)
223 : : ** heuristic approximation based on hashing that usually works about
224 : : ** as well. But if the O(N) algorithm doesn't get a good solution
225 : : ** and N is not too large, we fall back to an exact solution by
226 : : ** calling optimalLCS().
227 : : */
228 : 4 : static void longestCommonSequence(
229 : : DContext *p, /* Two files being compared */
230 : : int iS1, int iE1, /* Range of lines in p->aFrom[] */
231 : : int iS2, int iE2, /* Range of lines in p->aTo[] */
232 : : int *piSX, int *piEX, /* Write p->aFrom[] common segment here */
233 : : int *piSY, int *piEY /* Write p->aTo[] common segment here */
234 : : ){
235 : : int i, j, k; /* Loop counters */
236 : : int n; /* Loop limit */
237 : : DLine *pA, *pB; /* Pointers to lines */
238 : : int iSX, iSY, iEX, iEY; /* Current match */
239 : 4 : int skew = 0; /* How lopsided is the match */
240 : 4 : int dist = 0; /* Distance of match from center */
241 : : int mid; /* Center of the span */
242 : : int iSXb, iSYb, iEXb, iEYb; /* Best match so far */
243 : : int iSXp, iSYp, iEXp, iEYp; /* Previous match */
244 : : int64_t bestScore; /* Best score so far */
245 : : int64_t score; /* Score for current candidate LCS */
246 : : int span; /* combined width of the input sequences */
247 : :
248 : 4 : span = (iE1 - iS1) + (iE2 - iS2);
249 : 4 : bestScore = -10000;
250 : 4 : iSXb = iSXp = iS1;
251 : 4 : iEXb = iEXp = iS1;
252 : 4 : iSYb = iSYp = iS2;
253 : 4 : iEYb = iEYp = iS2;
254 : 4 : mid = (iE1 + iS1)/2;
255 [ + + ]: 8 : for(i=iS1; i<iE1; i++){
256 : 4 : int limit = 0;
257 : 4 : j = p->aTo[p->aFrom[i].h % p->nTo].iHash;
258 [ + + ]: 12 : while( j>0
259 [ + + + + : 8 : && (j-1<iS2 || j>=iE2 || !p->same_fn(&p->aFrom[i], &p->aTo[j-1]))
+ - # # ]
260 : : ){
261 [ - + ]: 4 : if( limit++ > 10 ){
262 : 0 : j = 0;
263 : 0 : break;
264 : : }
265 : 4 : j = p->aTo[j-1].iNext;
266 : : }
267 [ - + ]: 4 : if( j==0 ) continue;
268 [ # # # # : 0 : if( i<iEXb && j>=iSYb && j<iEYb ) continue;
# # ]
269 [ # # # # : 0 : if( i<iEXp && j>=iSYp && j<iEYp ) continue;
# # ]
270 : 0 : iSX = i;
271 : 0 : iSY = j-1;
272 : 0 : pA = &p->aFrom[iSX-1];
273 : 0 : pB = &p->aTo[iSY-1];
274 : 0 : n = minInt(iSX-iS1, iSY-iS2);
275 [ # # # # : 0 : for(k=0; k<n && p->same_fn(pA,pB); k++, pA--, pB--){}
# # ]
276 : 0 : iSX -= k;
277 : 0 : iSY -= k;
278 : 0 : iEX = i+1;
279 : 0 : iEY = j;
280 : 0 : pA = &p->aFrom[iEX];
281 : 0 : pB = &p->aTo[iEY];
282 : 0 : n = minInt(iE1-iEX, iE2-iEY);
283 [ # # # # : 0 : for(k=0; k<n && p->same_fn(pA,pB); k++, pA++, pB++){}
# # ]
284 : 0 : iEX += k;
285 : 0 : iEY += k;
286 : 0 : skew = (iSX-iS1) - (iSY-iS2);
287 [ # # ]: 0 : if( skew<0 ) skew = -skew;
288 : 0 : dist = (iSX+iEX)/2 - mid;
289 [ # # ]: 0 : if( dist<0 ) dist = -dist;
290 : 0 : score = (iEX - iSX)*(int64_t)span - (skew + dist);
291 [ # # ]: 0 : if( score>bestScore ){
292 : 0 : bestScore = score;
293 : 0 : iSXb = iSX;
294 : 0 : iSYb = iSY;
295 : 0 : iEXb = iEX;
296 : 0 : iEYb = iEY;
297 [ # # ]: 0 : }else if( iEX>iEXp ){
298 : 0 : iSXp = iSX;
299 : 0 : iSYp = iSY;
300 : 0 : iEXp = iEX;
301 : 0 : iEYp = iEY;
302 : 0 : }
303 : 0 : }
304 [ + - - + ]: 4 : if( iSXb==iEXb && (iE1-iS1)*(iE2-iS2)<400 ){
305 : : /* If no common sequence is found using the hashing heuristic and
306 : : ** the input is not too big, use the expensive exact solution */
307 : 4 : optimalLCS(p, iS1, iE1, iS2, iE2, piSX, piEX, piSY, piEY);
308 : 4 : }else{
309 : 0 : *piSX = iSXb;
310 : 0 : *piSY = iSYb;
311 : 0 : *piEX = iEXb;
312 : 0 : *piEY = iEYb;
313 : : }
314 : 4 : }
315 : :
316 : : /*
317 : : ** Expand the size of aEdit[] array to hold at least nEdit elements.
318 : : */
319 : 32 : static void expandEdit(DContext *p, int nEdit){
320 : 32 : p->aEdit = xrealloc(p->aEdit, nEdit*sizeof(int));
321 : 32 : p->nEditAlloc = nEdit;
322 : 32 : }
323 : :
324 : : /*
325 : : ** Append a new COPY/DELETE/INSERT triple.
326 : : */
327 : 30 : static void appendTriple(DContext *p, int nCopy, int nDel, int nIns){
328 : : /* printf("APPEND %d/%d/%d\n", nCopy, nDel, nIns); */
329 [ + + ]: 30 : if( p->nEdit>=3 ){
330 [ + + ]: 14 : if( p->aEdit[p->nEdit-1]==0 ){
331 [ - + ]: 11 : if( p->aEdit[p->nEdit-2]==0 ){
332 : 11 : p->aEdit[p->nEdit-3] += nCopy;
333 : 11 : p->aEdit[p->nEdit-2] += nDel;
334 : 11 : p->aEdit[p->nEdit-1] += nIns;
335 : 11 : return;
336 : : }
337 [ # # ]: 0 : if( nCopy==0 ){
338 : 0 : p->aEdit[p->nEdit-2] += nDel;
339 : 0 : p->aEdit[p->nEdit-1] += nIns;
340 : 0 : return;
341 : : }
342 : 0 : }
343 [ - + # # ]: 3 : if( nCopy==0 && nDel==0 ){
344 : 0 : p->aEdit[p->nEdit-1] += nIns;
345 : 0 : return;
346 : : }
347 : 3 : }
348 [ + + ]: 19 : if( p->nEdit+3>p->nEditAlloc ){
349 : 16 : expandEdit(p, p->nEdit*2 + 15);
350 [ + - ]: 16 : if( p->aEdit==0 ) return;
351 : 16 : }
352 : 19 : p->aEdit[p->nEdit++] = nCopy;
353 : 19 : p->aEdit[p->nEdit++] = nDel;
354 : 19 : p->aEdit[p->nEdit++] = nIns;
355 : 30 : }
356 : :
357 : : /*
358 : : ** Do a single step in the difference. Compute a sequence of
359 : : ** copy/delete/insert steps that will convert lines iS1 through iE1-1 of
360 : : ** the input into lines iS2 through iE2-1 of the output and write
361 : : ** that sequence into the difference context.
362 : : **
363 : : ** The algorithm is to find a block of common text near the middle
364 : : ** of the two segments being diffed. Then recursively compute
365 : : ** differences on the blocks before and after that common segment.
366 : : ** Special cases apply if either input segment is empty or if the
367 : : ** two segments have no text in common.
368 : : */
369 : 16 : static void diff_step(DContext *p, int iS1, int iE1, int iS2, int iE2){
370 : : int iSX, iEX, iSY, iEY;
371 : :
372 [ + + ]: 16 : if( iE1<=iS1 ){
373 : : /* The first segment is empty */
374 [ + + ]: 11 : if( iE2>iS2 ){
375 : 8 : appendTriple(p, 0, 0, iE2-iS2);
376 : 8 : }
377 : 11 : return;
378 : : }
379 [ + + ]: 5 : if( iE2<=iS2 ){
380 : : /* The second segment is empty */
381 : 1 : appendTriple(p, 0, iE1-iS1, 0);
382 : 1 : return;
383 : : }
384 : :
385 : : /* Find the longest matching segment between the two sequences */
386 : 4 : longestCommonSequence(p, iS1, iE1, iS2, iE2, &iSX, &iEX, &iSY, &iEY);
387 : :
388 [ - + ]: 4 : if( iEX>iSX ){
389 : : /* A common segment has been found.
390 : : ** Recursively diff either side of the matching segment */
391 : 0 : diff_step(p, iS1, iSX, iS2, iSY);
392 [ # # ]: 0 : if( iEX>iSX ){
393 : 0 : appendTriple(p, iEX - iSX, 0, 0);
394 : 0 : }
395 : 0 : diff_step(p, iEX, iE1, iEY, iE2);
396 : 0 : }else{
397 : : /* The two segments have nothing in common. Delete the first then
398 : : ** insert the second. */
399 : 4 : appendTriple(p, 0, iE1-iS1, iE2-iS2);
400 : : }
401 : 16 : }
402 : :
403 : : /*
404 : : ** Compute the differences between two files already loaded into
405 : : ** the DContext structure.
406 : : **
407 : : ** A divide and conquer technique is used. We look for a large
408 : : ** block of common text that is in the middle of both files. Then
409 : : ** compute the difference on those parts of the file before and
410 : : ** after the common block. This technique is fast, but it does
411 : : ** not necessarily generate the minimum difference set. On the
412 : : ** other hand, we do not need a minimum difference set, only one
413 : : ** that makes sense to human readers, which this algorithm does.
414 : : **
415 : : ** Any common text at the beginning and end of the two files is
416 : : ** removed before starting the divide-and-conquer algorithm.
417 : : */
418 : 16 : static void diff_all(DContext *p){
419 : : int mnE, iS, iE1, iE2;
420 : :
421 : : /* Carve off the common header and footer */
422 : 16 : iE1 = p->nFrom;
423 : 16 : iE2 = p->nTo;
424 [ + + - + : 26 : while( iE1>0 && iE2>0 && p->same_fn(&p->aFrom[iE1-1], &p->aTo[iE2-1]) ){
+ + + + ]
425 : 10 : iE1--;
426 : 10 : iE2--;
427 : : }
428 [ + + ]: 16 : mnE = iE1<iE2 ? iE1 : iE2;
429 [ + + + + : 31 : for(iS=0; iS<mnE && p->same_fn(&p->aFrom[iS],&p->aTo[iS]); iS++){}
+ + ]
430 : :
431 : : /* do the difference */
432 [ + + ]: 16 : if( iS>0 ){
433 : 11 : appendTriple(p, iS, 0, 0);
434 : 11 : }
435 : 16 : diff_step(p, iS, iE1, iS, iE2);
436 [ + + ]: 16 : if( iE1<p->nFrom ){
437 : 6 : appendTriple(p, p->nFrom - iE1, 0, 0);
438 : 6 : }
439 : :
440 : : /* Terminate the COPY/DELETE/INSERT triples with three zeros */
441 : 16 : expandEdit(p, p->nEdit+3);
442 [ - + ]: 16 : if( p->aEdit ){
443 : 16 : p->aEdit[p->nEdit++] = 0;
444 : 16 : p->aEdit[p->nEdit++] = 0;
445 : 16 : p->aEdit[p->nEdit++] = 0;
446 : 16 : }
447 : 16 : }
448 : :
449 : : /*
450 : : ** Generate a report of the differences between files pA and pB.
451 : : ** If pOut is not NULL then a unified diff is appended there. It
452 : : ** is assumed that pOut has already been initialized. If pOut is
453 : : ** NULL, then a pointer to an array of integers is returned.
454 : : ** The integers come in triples. For each triple,
455 : : ** the elements are the number of lines copied, the number of
456 : : ** lines deleted, and the number of lines inserted. The vector
457 : : ** is terminated by a triple of all zeros.
458 : : **
459 : : ** This diff utility does not work on binary files. If a binary
460 : : ** file is encountered, 0 is returned and pOut is written with
461 : : ** text "cannot compute difference between binary files".
462 : : */
463 : : int *
464 : 16 : text_diff(
465 : : char *pA, /* FROM file */
466 : : char *pB /* TO file */
467 : : ){
468 : : DContext c;
469 : :
470 : : /* Prepare the input files */
471 : 16 : memset(&c, 0, sizeof(c));
472 : 16 : c.same_fn = same_dline;
473 : 16 : c.aFrom = break_into_lines(pA, &c.nFrom);
474 : 16 : c.aTo = break_into_lines(pB, &c.nTo);
475 [ + - - + ]: 16 : if( c.aFrom==0 || c.aTo==0 ){
476 : 0 : free(c.aFrom);
477 : 0 : free(c.aTo);
478 : 0 : return 0;
479 : : }
480 : :
481 : : /* Compute the difference */
482 : 16 : diff_all(&c);
483 : : /* If a context diff is not requested, then return the
484 : : ** array of COPY/DELETE/INSERT triples.
485 : : */
486 : 16 : free(c.aFrom);
487 : 16 : free(c.aTo);
488 : 16 : return c.aEdit;
489 : 16 : }
|