diffreg.c (8861B)
1 #include <u.h> 2 #include <libc.h> 3 #include <bio.h> 4 #include "diff.h" 5 6 /* diff - differential file comparison 7 * 8 * Uses an algorithm due to Harold Stone, which finds 9 * a pair of longest identical subsequences in the two 10 * files. 11 * 12 * The major goal is to generate the match vector J. 13 * J[i] is the index of the line in file1 corresponding 14 * to line i file0. J[i] = 0 if there is no 15 * such line in file1. 16 * 17 * Lines are hashed so as to work in core. All potential 18 * matches are located by sorting the lines of each file 19 * on the hash (called value). In particular, this 20 * collects the equivalence classes in file1 together. 21 * Subroutine equiv replaces the value of each line in 22 * file0 by the index of the first element of its 23 * matching equivalence in (the reordered) file1. 24 * To save space equiv squeezes file1 into a single 25 * array member in which the equivalence classes 26 * are simply concatenated, except that their first 27 * members are flagged by changing sign. 28 * 29 * Next the indices that point into member are unsorted into 30 * array class according to the original order of file0. 31 * 32 * The cleverness lies in routine stone. This marches 33 * through the lines of file0, developing a vector klist 34 * of "k-candidates". At step i a k-candidate is a matched 35 * pair of lines x,y (x in file0 y in file1) such that 36 * there is a common subsequence of lenght k 37 * between the first i lines of file0 and the first y 38 * lines of file1, but there is no such subsequence for 39 * any smaller y. x is the earliest possible mate to y 40 * that occurs in such a subsequence. 41 * 42 * Whenever any of the members of the equivalence class of 43 * lines in file1 matable to a line in file0 has serial number 44 * less than the y of some k-candidate, that k-candidate 45 * with the smallest such y is replaced. The new 46 * k-candidate is chained (via pred) to the current 47 * k-1 candidate so that the actual subsequence can 48 * be recovered. When a member has serial number greater 49 * that the y of all k-candidates, the klist is extended. 50 * At the end, the longest subsequence is pulled out 51 * and placed in the array J by unravel. 52 * 53 * With J in hand, the matches there recorded are 54 * check'ed against reality to assure that no spurious 55 * matches have crept in due to hashing. If they have, 56 * they are broken, and "jackpot " is recorded--a harmless 57 * matter except that a true match for a spuriously 58 * mated line may now be unnecessarily reported as a change. 59 * 60 * Much of the complexity of the program comes simply 61 * from trying to minimize core utilization and 62 * maximize the range of doable problems by dynamically 63 * allocating what is needed and reusing what is not. 64 * The core requirements for problems larger than somewhat 65 * are (in words) 2*length(file0) + length(file1) + 66 * 3*(number of k-candidates installed), typically about 67 * 6n words for files of length n. 68 */ 69 70 #define class diffclass 71 72 /* TIDY THIS UP */ 73 struct cand { 74 int x; 75 int y; 76 int pred; 77 } cand; 78 struct line { 79 int serial; 80 int value; 81 } *file[2], line; 82 int len[2]; 83 int binary; 84 struct line *sfile[2]; /*shortened by pruning common prefix and suffix*/ 85 int slen[2]; 86 int pref, suff; /*length of prefix and suffix*/ 87 int *class; /*will be overlaid on file[0]*/ 88 int *member; /*will be overlaid on file[1]*/ 89 int *klist; /*will be overlaid on file[0] after class*/ 90 struct cand *clist; /* merely a free storage pot for candidates */ 91 int clen; 92 int *J; /*will be overlaid on class*/ 93 long *ixold; /*will be overlaid on klist*/ 94 long *ixnew; /*will be overlaid on file[1]*/ 95 /* END OF SOME TIDYING */ 96 97 static void 98 sort(struct line *a, int n) /*shellsort CACM #201*/ 99 { 100 int m; 101 struct line *ai, *aim, *j, *k; 102 struct line w; 103 int i; 104 105 m = 0; 106 for (i = 1; i <= n; i *= 2) 107 m = 2*i - 1; 108 for (m /= 2; m != 0; m /= 2) { 109 k = a+(n-m); 110 for (j = a+1; j <= k; j++) { 111 ai = j; 112 aim = ai+m; 113 do { 114 if (aim->value > ai->value || 115 aim->value == ai->value && 116 aim->serial > ai->serial) 117 break; 118 w = *ai; 119 *ai = *aim; 120 *aim = w; 121 122 aim = ai; 123 ai -= m; 124 } while (ai > a && aim >= ai); 125 } 126 } 127 } 128 129 static void 130 unsort(struct line *f, int l, int *b) 131 { 132 int *a; 133 int i; 134 135 a = MALLOC(int, (l+1)); 136 for(i=1;i<=l;i++) 137 a[f[i].serial] = f[i].value; 138 for(i=1;i<=l;i++) 139 b[i] = a[i]; 140 FREE(a); 141 } 142 143 static void 144 prune(void) 145 { 146 int i,j; 147 148 for(pref=0;pref<len[0]&&pref<len[1]&& 149 file[0][pref+1].value==file[1][pref+1].value; 150 pref++ ) ; 151 for(suff=0;suff<len[0]-pref&&suff<len[1]-pref&& 152 file[0][len[0]-suff].value==file[1][len[1]-suff].value; 153 suff++) ; 154 for(j=0;j<2;j++) { 155 sfile[j] = file[j]+pref; 156 slen[j] = len[j]-pref-suff; 157 for(i=0;i<=slen[j];i++) 158 sfile[j][i].serial = i; 159 } 160 } 161 162 static void 163 equiv(struct line *a, int n, struct line *b, int m, int *c) 164 { 165 int i, j; 166 167 i = j = 1; 168 while(i<=n && j<=m) { 169 if(a[i].value < b[j].value) 170 a[i++].value = 0; 171 else if(a[i].value == b[j].value) 172 a[i++].value = j; 173 else 174 j++; 175 } 176 while(i <= n) 177 a[i++].value = 0; 178 b[m+1].value = 0; 179 j = 0; 180 while(++j <= m) { 181 c[j] = -b[j].serial; 182 while(b[j+1].value == b[j].value) { 183 j++; 184 c[j] = b[j].serial; 185 } 186 } 187 c[j] = -1; 188 } 189 190 static int 191 newcand(int x, int y, int pred) 192 { 193 struct cand *q; 194 195 clist = REALLOC(clist, struct cand, (clen+1)); 196 q = clist + clen; 197 q->x = x; 198 q->y = y; 199 q->pred = pred; 200 return clen++; 201 } 202 203 static int 204 search(int *c, int k, int y) 205 { 206 int i, j, l; 207 int t; 208 209 if(clist[c[k]].y < y) /*quick look for typical case*/ 210 return k+1; 211 i = 0; 212 j = k+1; 213 while((l=(i+j)/2) > i) { 214 t = clist[c[l]].y; 215 if(t > y) 216 j = l; 217 else if(t < y) 218 i = l; 219 else 220 return l; 221 } 222 return l+1; 223 } 224 225 static int 226 stone(int *a, int n, int *b, int *c) 227 { 228 int i, k,y; 229 int j, l; 230 int oldc, tc; 231 int oldl; 232 233 k = 0; 234 c[0] = newcand(0,0,0); 235 for(i=1; i<=n; i++) { 236 j = a[i]; 237 if(j==0) 238 continue; 239 y = -b[j]; 240 oldl = 0; 241 oldc = c[0]; 242 do { 243 if(y <= clist[oldc].y) 244 continue; 245 l = search(c, k, y); 246 if(l!=oldl+1) 247 oldc = c[l-1]; 248 if(l<=k) { 249 if(clist[c[l]].y <= y) 250 continue; 251 tc = c[l]; 252 c[l] = newcand(i,y,oldc); 253 oldc = tc; 254 oldl = l; 255 } else { 256 c[l] = newcand(i,y,oldc); 257 k++; 258 break; 259 } 260 } while((y=b[++j]) > 0); 261 } 262 return k; 263 } 264 265 static void 266 unravel(int p) 267 { 268 int i; 269 struct cand *q; 270 271 for(i=0; i<=len[0]; i++) { 272 if (i <= pref) 273 J[i] = i; 274 else if (i > len[0]-suff) 275 J[i] = i+len[1]-len[0]; 276 else 277 J[i] = 0; 278 } 279 for(q=clist+p;q->y!=0;q=clist+q->pred) 280 J[q->x+pref] = q->y+pref; 281 } 282 283 static void 284 output(void) 285 { 286 int m, i0, i1, j0, j1; 287 288 m = len[0]; 289 J[0] = 0; 290 J[m+1] = len[1]+1; 291 if (mode != 'e') { 292 for (i0 = 1; i0 <= m; i0 = i1+1) { 293 while (i0 <= m && J[i0] == J[i0-1]+1) 294 i0++; 295 j0 = J[i0-1]+1; 296 i1 = i0-1; 297 while (i1 < m && J[i1+1] == 0) 298 i1++; 299 j1 = J[i1+1]-1; 300 J[i1] = j1; 301 change(i0, i1, j0, j1); 302 } 303 } 304 else { 305 for (i0 = m; i0 >= 1; i0 = i1-1) { 306 while (i0 >= 1 && J[i0] == J[i0+1]-1 && J[i0]) 307 i0--; 308 j0 = J[i0+1]-1; 309 i1 = i0+1; 310 while (i1 > 1 && J[i1-1] == 0) 311 i1--; 312 j1 = J[i1-1]+1; 313 J[i1] = j1; 314 change(i1 , i0, j1, j0); 315 } 316 } 317 if (m == 0) 318 change(1, 0, 1, len[1]); 319 flushchanges(); 320 } 321 322 #define BUF 4096 323 static int 324 cmp(Biobuf* b1, Biobuf* b2) 325 { 326 int n; 327 uchar buf1[BUF], buf2[BUF]; 328 int f1, f2; 329 vlong nc = 1; 330 uchar *b1s, *b1e, *b2s, *b2e; 331 332 f1 = Bfildes(b1); 333 f2 = Bfildes(b2); 334 seek(f1, 0, 0); 335 seek(f2, 0, 0); 336 b1s = b1e = buf1; 337 b2s = b2e = buf2; 338 for(;;){ 339 if(b1s >= b1e){ 340 if(b1s >= &buf1[BUF]) 341 b1s = buf1; 342 n = read(f1, b1s, &buf1[BUF] - b1s); 343 b1e = b1s + n; 344 } 345 if(b2s >= b2e){ 346 if(b2s >= &buf2[BUF]) 347 b2s = buf2; 348 n = read(f2, b2s, &buf2[BUF] - b2s); 349 b2e = b2s + n; 350 } 351 n = b2e - b2s; 352 if(n > b1e - b1s) 353 n = b1e - b1s; 354 if(n <= 0) 355 break; 356 if(memcmp((void *)b1s, (void *)b2s, n) != 0){ 357 return 1; 358 } 359 nc += n; 360 b1s += n; 361 b2s += n; 362 } 363 if(b1e - b1s == b2e - b2s) 364 return 0; 365 return 1; 366 } 367 368 void 369 diffreg(char *f, char *t) 370 { 371 Biobuf *b0, *b1; 372 int k; 373 374 binary = 0; 375 b0 = prepare(0, f); 376 if (!b0) 377 return; 378 b1 = prepare(1, t); 379 if (!b1) { 380 FREE(file[0]); 381 Bterm(b0); 382 return; 383 } 384 if (binary){ 385 /* could use b0 and b1 but this is simpler. */ 386 if (cmp(b0, b1)) 387 print("binary files %s %s differ\n", f, t); 388 Bterm(b0); 389 Bterm(b1); 390 return; 391 } 392 clen = 0; 393 prune(); 394 sort(sfile[0], slen[0]); 395 sort(sfile[1], slen[1]); 396 397 member = (int *)file[1]; 398 equiv(sfile[0], slen[0], sfile[1], slen[1], member); 399 member = REALLOC(member, int, slen[1]+2); 400 401 class = (int *)file[0]; 402 unsort(sfile[0], slen[0], class); 403 class = REALLOC(class, int, slen[0]+2); 404 405 klist = MALLOC(int, slen[0]+2); 406 clist = MALLOC(struct cand, 1); 407 k = stone(class, slen[0], member, klist); 408 FREE(member); 409 FREE(class); 410 411 J = MALLOC(int, len[0]+2); 412 unravel(klist[k]); 413 FREE(clist); 414 FREE(klist); 415 416 ixold = MALLOC(long, len[0]+2); 417 ixnew = MALLOC(long, len[1]+2); 418 Bseek(b0, 0, 0); Bseek(b1, 0, 0); 419 check(b0, b1); 420 output(); 421 FREE(J); FREE(ixold); FREE(ixnew); 422 Bterm(b0); Bterm(b1); /* ++++ */ 423 }