plan9port

fork of plan9port with libvec, libstr and libsdb
Log | Files | Refs | README | LICENSE

fixarenas.c (40556B)


      1 /*
      2  * Check and fix an arena partition.
      3  *
      4  * This is a lot grittier than the rest of Venti because
      5  * it can't just give up if a byte here or there is wrong.
      6  *
      7  * The rule here (hopefully followed!) is that block corruption
      8  * only ever has a local effect -- there are no blocks that you
      9  * can wipe out that will cause large portions of
     10  * uncorrupted data blocks to be useless.
     11  */
     12 
     13 #include "stdinc.h"
     14 #include "dat.h"
     15 #include "fns.h"
     16 #include "whack.h"
     17 
     18 #define ROUNDUP(x,n)		(((x)+(n)-1)&~((n)-1))
     19 
     20 #pragma varargck type "z" uvlong
     21 #pragma varargck type "z" vlong
     22 #pragma varargck type "t" uint
     23 
/*
 * Size units used throughout this file, plus the length
 * requested from pagein when only a header needs to be examined.
 */
enum
{
	K = 1024,
	M = 1024*1024,
	G = 1024*1024*1024,

	Block = 4096,	/* length used for small header probes via pagein */
};
     32 
int debugsha1;

int verbose;		/* non-zero prints extra detail; >1 also dumps raw bytes in showdiffs */
Part *part;		/* partition accessed through readpart and writepart */
char *file;
char *basename;		/* if set, loadarenabasics builds arena names from it */
char *dumpbase;
int fix;		/* non-zero lets pageout write modified buffers back to disk */
int badreads;		/* incremented by readdisk for each sector it cannot read */
int unseal;
uchar zero[MaxDiskBlock];	/* not written in the code visible here; used as an all-zero block for memcmp */

Arena lastarena;
ArenaPart ap;		/* arena partition geometry; filled in by guessgeometry */
uvlong arenasize;	/* arena spacing; 0 makes guessgeometry infer it */
int nbadread;
int nbad;
uvlong partend;		/* readdisk and pagein fill with 0xFB at or beyond this offset */
void checkarena(vlong, int);
     52 
     53 void
     54 usage(void)
     55 {
     56 	fprint(2, "usage: fixarenas [-fv] [-a arenasize] [-b blocksize] file [ranges]\n");
     57 	threadexitsall(0);
     58 }
     59 
     60 /*
     61  * Format number in simplest way that is okay with unittoull.
     62  */
     63 static int
     64 zfmt(Fmt *fmt)
     65 {
     66 	vlong x;
     67 
     68 	x = va_arg(fmt->args, vlong);
     69 	if(x == 0)
     70 		return fmtstrcpy(fmt, "0");
     71 	if(x%G == 0)
     72 		return fmtprint(fmt, "%lldG", x/G);
     73 	if(x%M == 0)
     74 		return fmtprint(fmt, "%lldM", x/M);
     75 	if(x%K == 0)
     76 		return fmtprint(fmt, "%lldK", x/K);
     77 	return fmtprint(fmt, "%lld", x);
     78 }
     79 
     80 /*
     81  * Format time like ctime without newline.
     82  */
     83 static int
     84 tfmt(Fmt *fmt)
     85 {
     86 	uint t;
     87 	char buf[30];
     88 
     89 	t = va_arg(fmt->args, uint);
     90 	strcpy(buf, ctime(t));
     91 	buf[28] = 0;
     92 	return fmtstrcpy(fmt, buf);
     93 }
     94 
     95 /*
     96  * Coalesce messages about unreadable sectors into larger ranges.
     97  * bad(0, 0) flushes the buffer.
     98  */
     99 static void
    100 bad(char *msg, vlong o, int len)
    101 {
    102 	static vlong lb0, lb1;
    103 	static char *lmsg;
    104 
    105 	if(msg == nil)
    106 		msg = lmsg;
    107 	if(o == -1){
    108 		lmsg = nil;
    109 		lb0 = 0;
    110 		lb1 = 0;
    111 		return;
    112 	}
    113 	if(lb1 != o || (msg && lmsg && strcmp(msg, lmsg) != 0)){
    114 		if(lb0 != lb1)
    115 			print("%s %#llux+%#llux (%,lld+%,lld)\n",
    116 				lmsg, lb0, lb1-lb0, lb0, lb1-lb0);
    117 		lb0 = o;
    118 	}
    119 	lmsg = msg;
    120 	lb1 = o+len;
    121 }
    122 
    123 /*
    124  * Read in the len bytes of data at the offset.  If can't for whatever reason,
    125  * fill it with garbage but print an error.
    126  */
    127 static uchar*
    128 readdisk(uchar *buf, vlong offset, int len)
    129 {
    130 	int i, j, k, n;
    131 
    132 	if(offset >= partend){
    133 		memset(buf, 0xFB, len);
    134 		return buf;
    135 	}
    136 
    137 	if(offset+len > partend){
    138 		memset(buf, 0xFB, len);
    139 		len = partend - offset;
    140 	}
    141 
    142 	if(readpart(part, offset, buf, len) >= 0)
    143 		return buf;
    144 
    145 	/*
    146 	 * The read failed.  Clear the buffer to nonsense, and
    147 	 * then try reading in smaller pieces.  If that fails,
    148 	 * read in even smaller pieces.  And so on down to sectors.
    149 	 */
    150 	memset(buf, 0xFD, len);
    151 	for(i=0; i<len; i+=64*K){
    152 		n = 64*K;
    153 		if(i+n > len)
    154 			n = len-i;
    155 		if(readpart(part, offset+i, buf+i, n) >= 0)
    156 			continue;
    157 		for(j=i; j<len && j<i+64*K; j+=4*K){
    158 			n = 4*K;
    159 			if(j+n > len)
    160 				n = len-j;
    161 			if(readpart(part, offset+j, buf+j, n) >= 0)
    162 				continue;
    163 			for(k=j; k<len && k<j+4*K; k+=512){
    164 				if(readpart(part, offset+k, buf+k, 512) >= 0)
    165 					continue;
    166 				bad("disk read failed at", k, 512);
    167 				badreads++;
    168 			}
    169 		}
    170 	}
    171 	bad(nil, 0, 0);
    172 	return buf;
    173 }
    174 
/*
 * Buffer to support running SHA1 hash of the disk.
 */
typedef struct Shabuf Shabuf;
struct Shabuf
{
	int fd;			/* if > 0, sbupdate also copies the hashed bytes to this file */
	vlong offset;		/* disk offset of the next byte to be hashed */
	DigestState state;	/* running SHA1 state */
	int rollback;		/* non-zero: keep periodic state snapshots in hist */
	vlong r0;		/* disk offset at which hashing started */
	DigestState *hist;	/* snapshots of state, one per 4M hashed since r0 */
	int nhist;		/* number of entries allocated in hist */
};
    189 
    190 void
    191 sbdebug(Shabuf *sb, char *file)
    192 {
    193 	int fd;
    194 
    195 	if(sb->fd > 0){
    196 		close(sb->fd);
    197 		sb->fd = 0;
    198 	}
    199 	if((fd = create(file, OWRITE, 0666)) < 0)
    200 		return;
    201 	if(fd == 0){
    202 		fd = dup(fd, -1);
    203 		close(0);
    204 	}
    205 	sb->fd = fd;
    206 }
    207 
/*
 * Feed the len bytes at p, which came from disk offset `offset`,
 * into the running hash.  Only the part of the data starting at
 * sb->offset is used; data that does not contain sb->offset is ignored.
 * When sb->rollback is set, the hash state is saved each time
 * a multiple of 4M bytes past sb->r0 has been hashed.
 */
void
sbupdate(Shabuf *sb, uchar *p, vlong offset, int len)
{
	int n, x;
	vlong o;

	/* first update with rollback enabled: start the history here */
	if(sb->rollback && !sb->hist){
		sb->r0 = offset;
		sb->nhist = 1;
		sb->hist = vtmalloc(sb->nhist*sizeof *sb->hist);
		memset(sb->hist, 0, sizeof sb->hist[0]);
	}
	if(sb->r0 == 0)
		sb->r0 = offset;

	/* nothing to do unless the next byte to hash lies inside this data */
	if(sb->offset < offset || sb->offset >= offset+len){
		if(0) print("sbupdate %p %#llux+%d but offset=%#llux\n",
			p, offset, len, sb->offset);
		return;
	}
	/* skip the x leading bytes that have already been hashed */
	x = sb->offset - offset;
	if(0) print("sbupdate %p %#llux+%d skip %d\n",
		sb, offset, len, x);
	if(x){
		p += x;
		offset += x;
		len -= x;
	}
	assert(sb->offset == offset);

	/* mirror the hashed bytes to the debug file, positioned relative to r0 */
	if(sb->fd > 0)
		pwrite(sb->fd, p, len, offset - sb->r0);

	if(!sb->rollback){
		sha1(p, len, nil, &sb->state);
		sb->offset += len;
		return;
	}

	/* save state every 4M so we can roll back quickly */
	o = offset - sb->r0;
	while(len > 0){
		/* hash only up to the next 4M boundary (relative to r0) */
		n = 4*M - o%(4*M);
		if(n > len)
			n = len;
		sha1(p, n, nil, &sb->state);
		sb->offset += n;
		o += n;
		p += n;
		len -= n;
		if(o%(4*M) == 0){
			/* on a boundary: record the state in slot o/4M, growing hist as needed */
			x = o/(4*M);
			if(x >= sb->nhist){
				if(x != sb->nhist)
					print("oops! x=%d nhist=%d\n", x, sb->nhist);
				sb->nhist += 32;
				sb->hist = vtrealloc(sb->hist, sb->nhist*sizeof *sb->hist);
			}
			sb->hist[x] = sb->state;
		}
	}
}
    270 
    271 void
    272 sbdiskhash(Shabuf *sb, vlong eoffset)
    273 {
    274 	static uchar dbuf[4*M];
    275 	int n;
    276 
    277 	while(sb->offset < eoffset){
    278 		n = sizeof dbuf;
    279 		if(sb->offset+n > eoffset)
    280 			n = eoffset - sb->offset;
    281 		readdisk(dbuf, sb->offset, n);
    282 		sbupdate(sb, dbuf, sb->offset, n);
    283 	}
    284 }
    285 
    286 void
    287 sbrollback(Shabuf *sb, vlong offset)
    288 {
    289 	int x;
    290 	vlong o;
    291 	Dir d;
    292 
    293 	if(!sb->rollback || !sb->r0){
    294 		print("cannot rollback sha\n");
    295 		return;
    296 	}
    297 	if(offset >= sb->offset)
    298 		return;
    299 	o = offset - sb->r0;
    300 	x = o/(4*M);
    301 	if(x >= sb->nhist){
    302 		print("cannot rollback sha\n");
    303 		return;
    304 	}
    305 	sb->state = sb->hist[x];
    306 	sb->offset = sb->r0 + x*4*M;
    307 	assert(sb->offset <= offset);
    308 
    309 	if(sb->fd > 0){
    310 		nulldir(&d);
    311 		d.length = sb->offset - sb->r0;
    312 		dirfwstat(sb->fd, &d);
    313 	}
    314 }
    315 
    316 void
    317 sbscore(Shabuf *sb, uchar *score)
    318 {
    319 	if(sb->hist){
    320 		free(sb->hist);
    321 		sb->hist = nil;
    322 	}
    323 	sha1(nil, 0, score, &sb->state);
    324 }
    325 
    326 /*
    327  * If we're fixing arenas, then editing this memory edits the disk!
    328  * It will be written back out as new data is paged in.
    329  */
    330 uchar buf[4*M];
    331 uchar sbuf[4*M];
    332 vlong bufoffset;
    333 int buflen;
    334 
    335 static void pageout(void);
    336 static uchar*
    337 pagein(vlong offset, int len)
    338 {
    339 	pageout();
    340 	if(offset >= partend){
    341 		memset(buf, 0xFB, sizeof buf);
    342 		return buf;
    343 	}
    344 
    345 	if(offset+len > partend){
    346 		memset(buf, 0xFB, sizeof buf);
    347 		len = partend - offset;
    348 	}
    349 	bufoffset = offset;
    350 	buflen = len;
    351 	readdisk(buf, offset, len);
    352 	memmove(sbuf, buf, len);
    353 	return buf;
    354 }
    355 
    356 static void
    357 pageout(void)
    358 {
    359 	if(buflen==0 || !fix || memcmp(buf, sbuf, buflen) == 0){
    360 		buflen = 0;
    361 		return;
    362 	}
    363 	if(writepart(part, bufoffset, buf, buflen) < 0)
    364 		print("disk write failed at %#llux+%#ux (%,lld+%,d)\n",
    365 			bufoffset, buflen, bufoffset, buflen);
    366 	buflen = 0;
    367 }
    368 
    369 static void
    370 zerorange(vlong offset, int len)
    371 {
    372 	int i;
    373 	vlong ooff;
    374 	int olen;
    375 	enum { MinBlock = 4*K, MaxBlock = 8*K };
    376 
    377 	if(0)
    378 	if(bufoffset <= offset && offset+len <= bufoffset+buflen){
    379 		memset(buf+(offset-bufoffset), 0, len);
    380 		return;
    381 	}
    382 
    383 	ooff = bufoffset;
    384 	olen = buflen;
    385 
    386 	i = offset%MinBlock;
    387 	if(i+len < MaxBlock){
    388 		pagein(offset-i, (len+MinBlock-1)&~(MinBlock-1));
    389 		memset(buf+i, 0, len);
    390 	}else{
    391 		pagein(offset-i, MaxBlock);
    392 		memset(buf+i, 0, MaxBlock-i);
    393 		offset += MaxBlock-i;
    394 		len -= MaxBlock-i;
    395 		while(len >= MaxBlock){
    396 			pagein(offset, MaxBlock);
    397 			memset(buf, 0, MaxBlock);
    398 			offset += MaxBlock;
    399 			len -= MaxBlock;
    400 		}
    401 		pagein(offset, (len+MinBlock-1)&~(MinBlock-1));
    402 		memset(buf, 0, len);
    403 	}
    404 	pagein(ooff, olen);
    405 }
    406 
    407 /*
    408  * read/write integers
    409  *
    410 static void
    411 p16(uchar *p, u16int u)
    412 {
    413 	p[0] = (u>>8) & 0xFF;
    414 	p[1] = u & 0xFF;
    415 }
    416 */
    417 
    418 static u16int
    419 u16(uchar *p)
    420 {
    421 	return (p[0]<<8)|p[1];
    422 }
    423 
    424 static void
    425 p32(uchar *p, u32int u)
    426 {
    427 	p[0] = (u>>24) & 0xFF;
    428 	p[1] = (u>>16) & 0xFF;
    429 	p[2] = (u>>8) & 0xFF;
    430 	p[3] = u & 0xFF;
    431 }
    432 
    433 static u32int
    434 u32(uchar *p)
    435 {
    436 	return (p[0]<<24)|(p[1]<<16)|(p[2]<<8)|p[3];
    437 }
    438 
    439 /*
    440 static void
    441 p64(uchar *p, u64int u)
    442 {
    443 	p32(p, u>>32);
    444 	p32(p, u);
    445 }
    446 */
    447 
    448 static u64int
    449 u64(uchar *p)
    450 {
    451 	return ((u64int)u32(p)<<32) | u32(p+4);
    452 }
    453 
    454 static int
    455 vlongcmp(const void *va, const void *vb)
    456 {
    457 	vlong a, b;
    458 
    459 	a = *(vlong*)va;
    460 	b = *(vlong*)vb;
    461 	if(a < b)
    462 		return -1;
    463 	if(b > a)
    464 		return 1;
    465 	return 0;
    466 }
    467 
    468 /* D and S are in draw.h */
    469 #define D VD
    470 #define S VS
    471 
/*
 * Display-format flags, OR'ed into Info.len above the low 16 bits.
 * showdiffs switches on the combined value to decide how a
 * mismatched field is printed.
 */
enum
{
	D = 0x10000,	/* printed with %,ud (4 bytes) or %,lld (8 bytes) */
	Z = 0x20000,	/* printed with %z */
	S = 0x30000,	/* printed as a string */
	T = 0x40000,	/* printed with %t */
	N = 0xFFFF	/* mask that extracts the field's byte length */
};

/*
 * One field of an on-disk structure, as described to showdiffs.
 * Tables of these are terminated by an entry whose length is zero.
 */
typedef struct Info Info;
struct Info
{
	int len;	/* byte length, optionally OR'ed with one of D, Z, S, T */
	char *name;	/* field name used in the printed report */
};
    486 
/*
 * Field layout that checkarenas passes to showdiffs when the
 * arena partition superblock on disk differs from the expected one.
 */
Info partinfo[] = {
	4,	"magic",
	D|4,	"version",
	Z|4,	"blocksize",
	4,	"arenabase",
	0
};
    494 
/*
 * Field layout in the form showdiffs expects; going by the name,
 * presumably the version 4 arena header (its use is not in view here).
 */
Info headinfo4[] = {
	4,	"magic",
	D|4,	"version",
	S|ANameSize,	"name",
	Z|4,	"blocksize",
	Z|8,	"size",
	0
};
    503 
/*
 * Same fields as headinfo4 followed by a clumpmagic field;
 * going by the name, presumably the version 5 arena header.
 */
Info headinfo5[] = {
	4,	"magic",
	D|4,	"version",
	S|ANameSize,	"name",
	Z|4,	"blocksize",
	Z|8,	"size",
	4,	"clumpmagic",
	0
};
    513 
/*
 * Field layout in the form showdiffs expects; going by the name,
 * presumably the version 4 arena tail (its use is not in view here).
 */
Info tailinfo4[] = {
	4,	"magic",
	D|4,	"version",
	S|ANameSize,	"name",
	D|4,	"clumps",
	D|4,	"cclumps",
	T|4,	"ctime",
	T|4,	"wtime",
	D|8,	"used",
	D|8,	"uncsize",
	1,	"sealed",
	0
};
    527 
/*
 * The tailinfo4 fields followed by an extension byte and a
 * second set of statistics with "mem." names.
 */
Info tailinfo4a[] = {
	/* tailinfo 4 */
	4,	"magic",
	D|4,	"version",
	S|ANameSize,	"name",
	D|4,	"clumps",
	D|4,	"cclumps",
	T|4,	"ctime",
	T|4,	"wtime",
	D|8,	"used",
	D|8,	"uncsize",
	1,	"sealed",

	/* mem stats */
	1,	"extension",
	D|4,	"mem.clumps",
	D|4,	"mem.cclumps",
	D|8,	"mem.used",
	D|8,	"mem.uncsize",
	1,	"mem.sealed",
	0
};
    550 
/*
 * Like tailinfo4 but with a clumpmagic field after wtime;
 * going by the name, presumably the version 5 arena tail.
 */
Info tailinfo5[] = {
	4,	"magic",
	D|4,	"version",
	S|ANameSize,	"name",
	D|4,	"clumps",
	D|4,	"cclumps",
	T|4,	"ctime",
	T|4,	"wtime",
	4,	"clumpmagic",
	D|8,	"used",
	D|8,	"uncsize",
	1,	"sealed",
	0
};
    565 
/*
 * The tailinfo5 fields followed by an extension byte and a
 * second set of statistics with "mem." names.
 */
Info tailinfo5a[] = {
	/* tailinfo 5 */
	4,	"magic",
	D|4,	"version",
	S|ANameSize,	"name",
	D|4,	"clumps",
	D|4,	"cclumps",
	T|4,	"ctime",
	T|4,	"wtime",
	4,	"clumpmagic",
	D|8,	"used",
	D|8,	"uncsize",
	1,	"sealed",

	/* mem stats */
	1,	"extension",
	D|4,	"mem.clumps",
	D|4,	"mem.cclumps",
	D|8,	"mem.used",
	D|8,	"mem.uncsize",
	1,	"mem.sealed",
	0
};
    589 
    590 void
    591 showdiffs(uchar *want, uchar *have, int len, Info *info)
    592 {
    593 	int n;
    594 
    595 	while(len > 0 && (n=info->len&N) > 0){
    596 		if(memcmp(have, want, n) != 0){
    597 			switch(info->len){
    598 			case 1:
    599 				print("\t%s: correct=%d disk=%d\n",
    600 					info->name, *want, *have);
    601 				break;
    602 			case 4:
    603 				print("\t%s: correct=%#ux disk=%#ux\n",
    604 					info->name, u32(want), u32(have));
    605 				break;
    606 			case D|4:
    607 				print("\t%s: correct=%,ud disk=%,ud\n",
    608 					info->name, u32(want), u32(have));
    609 				break;
    610 			case T|4:
    611 				print("\t%s: correct=%t\n\t\tdisk=%t\n",
    612 					info->name, u32(want), u32(have));
    613 				break;
    614 			case Z|4:
    615 				print("\t%s: correct=%z disk=%z\n",
    616 					info->name, (uvlong)u32(want), (uvlong)u32(have));
    617 				break;
    618 			case D|8:
    619 				print("\t%s: correct=%,lld disk=%,lld\n",
    620 					info->name, u64(want), u64(have));
    621 				break;
    622 			case Z|8:
    623 				print("\t%s: correct=%z disk=%z\n",
    624 					info->name, u64(want), u64(have));
    625 				break;
    626 			case S|ANameSize:
    627 				print("\t%s: correct=%s disk=%.*s\n",
    628 					info->name, (char*)want,
    629 					utfnlen((char*)have, ANameSize-1),
    630 					(char*)have);
    631 				break;
    632 			default:
    633 				print("\t%s: correct=%.*H disk=%.*H\n",
    634 					info->name, n, want, n, have);
    635 				break;
    636 			}
    637 		}
    638 		have += n;
    639 		want += n;
    640 		len -= n;
    641 		info++;
    642 	}
    643 	if(len > 0 && memcmp(have, want, len) != 0){
    644 		if(memcmp(want, zero, len) != 0)
    645 			print("!!\textra want data in showdiffs (bug in fixarenas)\n");
    646 		else
    647 			print("\tnon-zero data on disk after structure\n");
    648 		if(verbose > 1){
    649 			print("want: %.*H\n", len, want);
    650 			print("have: %.*H\n", len, have);
    651 		}
    652 	}
    653 }
    654 
    655 /*
    656  * Does part begin with an arena?
    657  */
    658 int
    659 isonearena(void)
    660 {
    661 	return u32(pagein(0, Block)) == ArenaHeadMagic;
    662 }
    663 
/* arena table sizes tried, in order, when looking for the arena base */
static int tabsizes[] = { 16*1024, 64*1024, 512*1024, 768*1024, };
/*
 * Poke around on the disk to guess what the ArenaPart numbers are.
 *
 * Fills in ap (version, blocksize, arenabase, tabbase, tabsize) and
 * arenasize.  If both arenasize and ap.blocksize are already non-zero,
 * the disk scan is skipped.  Values that were non-zero on entry are
 * kept even when the scan suggests something else.
 * Calls sysfatal if too few arena heads/tails are found or the
 * resulting block size is unusable.
 */
void
guessgeometry(void)
{
	int i, j, n, bestn, ndiff, nhead, ntail;
	uchar *p, *ep, *sp;
	u64int diff[100], head[20], tail[20];
	u64int offset, bestdiff;

	ap.version = ArenaPartVersion;

	if(arenasize == 0 || ap.blocksize == 0){
		/*
		 * The ArenaPart block at offset PartBlank may be corrupt or just wrong.
		 * Instead, look for the individual arena headers and tails, which there
		 * are many of, and once we've seen enough, infer the spacing.
		 *
		 * Of course, nothing in the file format requires that arenas be evenly
		 * spaced, but fmtarenas always does that for us.
		 */
		nhead = 0;
		ntail = 0;
		/* scan 4M at a time, testing for a magic number at every K boundary */
		for(offset=PartBlank; offset<partend; offset+=4*M){
			p = pagein(offset, 4*M);
			for(sp=p, ep=p+4*M; p<ep; p+=K){
				if(u32(p) == ArenaHeadMagic && nhead < nelem(head)){
					if(verbose)
						print("arena head at %#llx\n", offset+(p-sp));
					head[nhead++] = offset+(p-sp);
				}
				if(u32(p) == ArenaMagic && ntail < nelem(tail)){
					tail[ntail++] = offset+(p-sp);
					if(verbose)
						print("arena tail at %#llx\n", offset+(p-sp));
				}
			}
			/* stop early once both sample arrays are full */
			if(nhead == nelem(head) && ntail == nelem(tail))
				break;
		}
		if(nhead < 3 && ntail < 3)
			sysfatal("too few intact arenas: %d heads, %d tails", nhead, ntail);

		/*
		 * Arena size is likely the most common
		 * inter-head or inter-tail spacing.
		 */
		ndiff = 0;
		for(i=1; i<nhead; i++)
			diff[ndiff++] = head[i] - head[i-1];
		for(i=1; i<ntail; i++)
			diff[ndiff++] = tail[i] - tail[i-1];
		qsort(diff, ndiff, sizeof diff[0], vlongcmp);
		/* find the longest run of equal values in the sorted array */
		bestn = 0;
		bestdiff = 0;
		for(i=1, n=1; i<=ndiff; i++, n++){
			if(i==ndiff || diff[i] != diff[i-1]){
				if(n > bestn){
					bestn = n;
					bestdiff = diff[i-1];
				}
				n = 0;
			}
		}
		print("arena size likely %z (%d of %d)\n", bestdiff, bestn, ndiff);
		if(arenasize != 0 && arenasize != bestdiff)
			print("using user-specified size %z instead\n", arenasize);
		else
			arenasize = bestdiff;

		/*
		 * The arena tail for an arena is arenasize-blocksize from the head.
		 */
		ndiff = 0;
		/* pair each tail with the head whose arena it falls inside */
		for(i=j=0; i<nhead && j<ntail; ){
			if(tail[j] < head[i]){
				j++;
				continue;
			}
			if(tail[j] < head[i]+arenasize){
				diff[ndiff++] = head[i]+arenasize - tail[j];
				j++;
				continue;
			}
			i++;
		}
		if(ndiff < 3)
			sysfatal("too few intact arenas: %d head, tail pairs", ndiff);
		qsort(diff, ndiff, sizeof diff[0], vlongcmp);
		/* most common value again */
		bestn = 0;
		bestdiff = 0;
		for(i=1, n=1; i<=ndiff; i++, n++){
			if(i==ndiff || diff[i] != diff[i-1]){
				if(n > bestn){
					bestn = n;
					bestdiff = diff[i-1];
				}
				n = 0;
			}
		}
		print("block size likely %z (%d of %d)\n", bestdiff, bestn, ndiff);
		if(ap.blocksize != 0 && ap.blocksize != bestdiff)
			print("using user-specified size %z instead\n", (vlong)ap.blocksize);
		else
			ap.blocksize = bestdiff;
		if(ap.blocksize == 0 || ap.blocksize&(ap.blocksize-1))
			sysfatal("block size not a power of two");
		if(ap.blocksize > MaxDiskBlock)
			sysfatal("block size too big (max=%d)", MaxDiskBlock);

		/*
		 * Use head/tail information to deduce arena base.
		 */
		ndiff = 0;
		/* every head, and every tail plus one block, should be the base modulo arenasize */
		for(i=0; i<nhead; i++)
			diff[ndiff++] = head[i]%arenasize;
		for(i=0; i<ntail; i++)
			diff[ndiff++] = (tail[i]+ap.blocksize)%arenasize;
		qsort(diff, ndiff, sizeof diff[0], vlongcmp);
		bestn = 0;
		bestdiff = 0;
		for(i=1, n=1; i<=ndiff; i++, n++){
			if(i==ndiff || diff[i] != diff[i-1]){
				if(n > bestn){
					bestn = n;
					bestdiff = diff[i-1];
				}
				n = 0;
			}
		}
		ap.arenabase = bestdiff;
	}

	ap.tabbase = ROUNDUP(PartBlank+HeadSize, ap.blocksize);
	/*
	 * XXX pick up table, check arenabase.
	 * XXX pick up table, record base name.
	 */

	/*
	 * Somewhat standard computation.
	 * Fmtarenas used to use 64k tab, now uses 512k tab.
	 */
	if(ap.arenabase == 0){
		print("trying standard arena bases...\n");
		/* take the first candidate that has an arena head; else the last one tried stays */
		for(i=0; i<nelem(tabsizes); i++){
			ap.arenabase = ROUNDUP(PartBlank+HeadSize+tabsizes[i], ap.blocksize);
			p = pagein(ap.arenabase, Block);
			if(u32(p) == ArenaHeadMagic)
				break;
		}
	}
	p = pagein(ap.arenabase, Block);
	print("arena base likely %z%s\n", (vlong)ap.arenabase,
		u32(p)!=ArenaHeadMagic ? " (but no arena head there)" : "");

	ap.tabsize = ap.arenabase - ap.tabbase;
}
    824 
/*
 * Check the arena partition blocks and then the arenas listed in range.
 *
 * range is nil (check every arena), "none" (check no arenas), or a
 * comma-separated list of arena numbers and lo-hi ranges in which
 * either end may be omitted.  The range string is modified in place
 * while it is parsed.  Malformed items are reported and skipped.
 */
void
checkarenas(char *range)
{
	char *s, *t;
	int i, lo, hi, narena;
	uchar dbuf[HeadSize];
	uchar *p;

	guessgeometry();

	/* ignore any partial block at the end of the partition */
	partend -= partend%ap.blocksize;

	/* build the superblock we expect and compare it with the one on disk */
	memset(dbuf, 0, sizeof dbuf);
	packarenapart(&ap, dbuf);
	p = pagein(PartBlank, Block);
	if(memcmp(p, dbuf, HeadSize) != 0){
		print("on-disk arena part superblock incorrect\n");
		showdiffs(dbuf, p, HeadSize, partinfo);
	}
	/* p is the page buffer, so this edit is written out by pageout when fix is set */
	memmove(p, dbuf, HeadSize);

	/* count arenas, including a final short one */
	narena = (partend-ap.arenabase + arenasize-1)/arenasize;
	if(range == nil){
		for(i=0; i<narena; i++)
			checkarena(ap.arenabase+(vlong)i*arenasize, i);
	}else if(strcmp(range, "none") == 0){
		/* nothing */
	}else{
		/* parse, e.g., -4,8-9,10- */
		for(s=range; *s; s=t){
			/* cut off the current item; t is where the next one starts */
			t = strchr(s, ',');
			if(t)
				*t++ = 0;
			else
				t = s+strlen(s);
			/* a missing low end means 0 */
			if(*s == '-')
				lo = 0;
			else
				lo = strtol(s, &s, 0);
			hi = lo;
			if(*s == '-'){
				s++;
				/* a missing high end means the last arena */
				if(*s == 0)
					hi = narena-1;
				else
					hi = strtol(s, &s, 0);
			}
			if(*s != 0){
				print("bad arena range: %s\n", s);
				continue;
			}
			for(i=lo; i<=hi; i++)
				checkarena(ap.arenabase+(vlong)i*arenasize, i);
		}
	}
}
    884 
/*
 * Is there a clump here at p?
 *
 * Parses a clump header at p into cl and verifies the data that
 * follows by recomputing its score (uncompressing first if needed).
 * On success stores the clump's magic number in *pmagic and returns
 * the total number of bytes the clump occupies (header plus data).
 * Returns 0 if the bytes at p do not form a valid clump; cl may
 * have been partly filled in by then.
 */
static int
isclump(uchar *p, Clump *cl, u32int *pmagic)
{
	int n;
	u32int magic;
	uchar score[VtScoreSize], *bp;
	Unwhack uw;
	uchar ubuf[70*1024];

	bp = p;
	magic = u32(p);
	if(magic == 0)
		return 0;
	p += U32Size;

	cl->info.type = vtfromdisktype(*p);
	if(cl->info.type == 0xFF)
		return 0;
	p++;
	cl->info.size = u16(p);
	p += U16Size;
	cl->info.uncsize = u16(p);
	/* stored size can never exceed the uncompressed size */
	if(cl->info.size > cl->info.uncsize)
		return 0;
	p += U16Size;
	scorecp(cl->info.score, p);
	p += VtScoreSize;
	cl->encoding = *p;
	p++;
	cl->creator = u32(p);
	p += U32Size;
	cl->time = u32(p);
	p += U32Size;

	/* p now points at the clump data; check it against the recorded score */
	switch(cl->encoding){
	case ClumpENone:
		if(cl->info.size != cl->info.uncsize)
			return 0;
		scoremem(score, p, cl->info.size);
		if(scorecmp(score, cl->info.score) != 0)
			return 0;
		break;
	case ClumpECompress:
		if(cl->info.size >= cl->info.uncsize)
			return 0;
		unwhackinit(&uw);
		n = unwhack(&uw, ubuf, cl->info.uncsize, p, cl->info.size);
		if(n != cl->info.uncsize)
			return 0;
		scoremem(score, ubuf, cl->info.uncsize);
		if(scorecmp(score, cl->info.score) != 0)
			return 0;
		break;
	default:
		return 0;
	}
	p += cl->info.size;

	/* it all worked out in the end */
	*pmagic = magic;
	return p - bp;
}
    950 
/*
 * All ClumpInfos seen in this arena.
 * Kept in binary tree so we can look up by score.
 *
 * Entries live in the cibuf array in the order they were added;
 * the tree links are indices into cibuf, with -1 meaning no child.
 */
typedef struct Cit Cit;
struct Cit
{
	int left;	/* index of subtree searched when this entry's score compares greater, or -1 */
	int right;	/* index of subtree searched when this entry's score compares less, or -1 */
	vlong corrupt;	/* non-zero for entries added by addcicorrupt; those are kept out of the tree */
	ClumpInfo ci;
};
Cit *cibuf;		/* the entries */
int ciroot;		/* index of the tree root in cibuf, or -1 if the tree is empty */
int ncibuf, mcibuf;	/* entries in use, entries allocated */
    966 
    967 void
    968 resetcibuf(void)
    969 {
    970 	ncibuf = 0;
    971 	ciroot = -1;
    972 }
    973 
    974 int*
    975 ltreewalk(int *p, uchar *score)
    976 {
    977 	int i;
    978 
    979 	for(;;){
    980 		if(*p == -1)
    981 			return p;
    982 		i = scorecmp(cibuf[*p].ci.score, score);
    983 		if(i == 0)
    984 			return p;
    985 		if(i < 0)
    986 			p = &cibuf[*p].right;
    987 		else
    988 			p = &cibuf[*p].left;
    989 	}
    990 }
    991 
    992 void
    993 addcibuf(ClumpInfo *ci, vlong corrupt)
    994 {
    995 	Cit *cit;
    996 
    997 	if(ncibuf == mcibuf){
    998 		mcibuf += 131072;
    999 		cibuf = vtrealloc(cibuf, mcibuf*sizeof cibuf[0]);
   1000 	}
   1001 	cit = &cibuf[ncibuf];
   1002 	cit->ci = *ci;
   1003 	cit->left = -1;
   1004 	cit->right = -1;
   1005 	cit->corrupt = corrupt;
   1006 	if(!corrupt)
   1007 		*ltreewalk(&ciroot, ci->score) = ncibuf;
   1008 	ncibuf++;
   1009 }
   1010 
   1011 void
   1012 addcicorrupt(vlong len)
   1013 {
   1014 	static ClumpInfo zci;
   1015 
   1016 	addcibuf(&zci, len);
   1017 }
   1018 
   1019 int
   1020 haveclump(uchar *score)
   1021 {
   1022 	int i;
   1023 	int p;
   1024 
   1025 	p = ciroot;
   1026 	for(;;){
   1027 		if(p == -1)
   1028 			return 0;
   1029 		i = scorecmp(cibuf[p].ci.score, score);
   1030 		if(i == 0)
   1031 			return 1;
   1032 		if(i < 0)
   1033 			p = cibuf[p].right;
   1034 		else
   1035 			p = cibuf[p].left;
   1036 	}
   1037 }
   1038 
   1039 int
   1040 matchci(ClumpInfo *ci, uchar *p)
   1041 {
   1042 	if(ci->type != vtfromdisktype(p[0]))
   1043 		return 0;
   1044 	if(ci->size != u16(p+1))
   1045 		return 0;
   1046 	if(ci->uncsize != u16(p+3))
   1047 		return 0;
   1048 	if(scorecmp(ci->score, p+5) != 0)
   1049 		return 0;
   1050 	return 1;
   1051 }
   1052 
   1053 int
   1054 sealedarena(uchar *p, int blocksize)
   1055 {
   1056 	int v, n;
   1057 
   1058 	v = u32(p+4);
   1059 	switch(v){
   1060 	default:
   1061 		return 0;
   1062 	case ArenaVersion4:
   1063 		n = ArenaSize4;
   1064 		break;
   1065 	case ArenaVersion5:
   1066 		n = ArenaSize5;
   1067 		break;
   1068 	}
   1069 	if(p[n-1] != 1){
   1070 		print("arena tail says not sealed\n");
   1071 		return 0;
   1072 	}
   1073 	if(memcmp(p+n, zero, blocksize-VtScoreSize-n) != 0){
   1074 		print("arena tail followed by non-zero data\n");
   1075 		return 0;
   1076 	}
   1077 	if(memcmp(p+blocksize-VtScoreSize, zero, VtScoreSize) == 0){
   1078 		print("arena score zero\n");
   1079 		return 0;
   1080 	}
   1081 	return 1;
   1082 }
   1083 
   1084 int
   1085 okayname(char *name, int n)
   1086 {
   1087 	char buf[20];
   1088 
   1089 	if(nameok(name) < 0)
   1090 		return 0;
   1091 	sprint(buf, "%d", n);
   1092 	if(n == 0)
   1093 		buf[0] = 0;
   1094 	if(strlen(name) < strlen(buf)
   1095 	|| strcmp(name+strlen(name)-strlen(buf), buf) != 0)
   1096 		return 0;
   1097 	return 1;
   1098 }
   1099 
   1100 int
   1101 clumpinfocmp(ClumpInfo *a, ClumpInfo *b)
   1102 {
   1103 	if(a->type != b->type)
   1104 		return a->type - b->type;
   1105 	if(a->size != b->size)
   1106 		return a->size - b->size;
   1107 	if(a->uncsize != b->uncsize)
   1108 		return a->uncsize - b->uncsize;
   1109 	return scorecmp(a->score, b->score);
   1110 }
   1111 
   1112 ClumpInfo*
   1113 loadci(vlong offset, Arena *arena, int nci)
   1114 {
   1115 	int i, j, per;
   1116 	uchar *p, *sp;
   1117 	ClumpInfo *bci, *ci;
   1118 
   1119 	per = arena->blocksize/ClumpInfoSize;
   1120 	bci = vtmalloc(nci*sizeof bci[0]);
   1121 	ci = bci;
   1122 	offset += arena->size - arena->blocksize;
   1123 	p = sp = nil;
   1124 	for(i=0; i<nci; i+=per){
   1125 		if(p == sp){
   1126 			sp = pagein(offset-4*M, 4*M);
   1127 			p = sp+4*M;
   1128 		}
   1129 		p -= arena->blocksize;
   1130 		offset -= arena->blocksize;
   1131 		for(j=0; j<per && i+j<nci; j++)
   1132 			unpackclumpinfo(ci++, p+j*ClumpInfoSize);
   1133 	}
   1134 	return bci;
   1135 }
   1136 
   1137 vlong
   1138 writeci(vlong offset, Arena *arena, ClumpInfo *ci, int nci)
   1139 {
   1140 	int i, j, per;
   1141 	uchar *p, *sp;
   1142 
   1143 	per = arena->blocksize/ClumpInfoSize;
   1144 	offset += arena->size - arena->blocksize;
   1145 	p = sp = nil;
   1146 	for(i=0; i<nci; i+=per){
   1147 		if(p == sp){
   1148 			sp = pagein(offset-4*M, 4*M);
   1149 			p = sp+4*M;
   1150 		}
   1151 		p -= arena->blocksize;
   1152 		offset -= arena->blocksize;
   1153 		memset(p, 0, arena->blocksize);
   1154 		for(j=0; j<per && i+j<nci; j++)
   1155 			packclumpinfo(ci++, p+j*ClumpInfoSize);
   1156 	}
   1157 	pageout();
   1158 	return offset;
   1159 }
   1160 
/*
 * Fill in the basic fields of head and arena (size, blocksize,
 * version, clumpmagic, name) for arena number anum, which starts
 * at disk offset offset0.  Sizes come from the guessed geometry;
 * version, clump magic and name are taken from the on-disk head and
 * tail blocks when those unpack successfully.  The clump count and
 * sealed flag of the on-disk tail are also copied into arena.
 * Calls sysfatal if no arena name can be determined.
 */
void
loadarenabasics(vlong offset0, int anum, ArenaHead *head, Arena *arena)
{
	char dname[ANameSize];
	static char lastbase[ANameSize];	/* base name kept from the previous call */
	uchar *p;
	Arena oarena;
	ArenaHead ohead;

	/*
	 * Fmtarenas makes all arenas the same size
	 * except the last, which may be smaller.
	 * It uses the same block size for arenas as for
	 * the arena partition blocks.
	 */
	arena->size = arenasize;
	if(offset0+arena->size > partend)
		arena->size = partend - offset0;
	head->size = arena->size;

	arena->blocksize = ap.blocksize;
	head->blocksize = arena->blocksize;

	/*
	 * Look for clump magic and name in head/tail blocks.
	 * All the other info we will reconstruct just in case.
	 */
	p = pagein(offset0, arena->blocksize);
	memset(&ohead, 0, sizeof ohead);
	if(unpackarenahead(&ohead, p) >= 0){
		head->version = ohead.version;
		head->clumpmagic = ohead.clumpmagic;
		if(okayname(ohead.name, anum))
			strcpy(head->name, ohead.name);
	}

	/* the tail is the last block of the arena */
	p = pagein(offset0+arena->size-arena->blocksize,
		arena->blocksize);
	memset(&oarena, 0, sizeof oarena);
	if(unpackarena(&oarena, p) >= 0){
		arena->version = oarena.version;
		arena->clumpmagic = oarena.clumpmagic;
		if(okayname(oarena.name, anum))
			strcpy(arena->name, oarena.name);
		arena->diskstats.clumps = oarena.diskstats.clumps;
print("old arena: sealed=%d\n", oarena.diskstats.sealed);
		arena->diskstats.sealed = oarena.diskstats.sealed;
	}

	/* Head trumps arena. */
	if(head->version){
		arena->version = head->version;
		arena->clumpmagic = head->clumpmagic;
	}
	if(arena->version == 0)
		arena->version = ArenaVersion5;
	/*
	 * Name precedence: the global basename, then the base name kept
	 * from the previous call, then the name in the head block, then
	 * whatever the tail block supplied.
	 */
	if(basename){
		if(anum == -1)
			snprint(arena->name, ANameSize, "%s", basename);
		else
			snprint(arena->name, ANameSize, "%s%d", basename, anum);
	}else if(lastbase[0])
		snprint(arena->name, ANameSize, "%s%d", lastbase, anum);
	else if(head->name[0])
		strcpy(arena->name, head->name);
	else if(arena->name[0] == 0)
		sysfatal("cannot determine base name for arena; use -n");
	/* remember the base: the name with as many trailing characters removed as anum prints to */
	strcpy(lastbase, arena->name);
	sprint(dname, "%d", anum);
	/* NOTE(review): assumes the name is at least as long as dname and ends with it -- confirm for anum == -1 */
	lastbase[strlen(lastbase)-strlen(dname)] = 0;

	/* Was working in arena, now copy to head. */
	head->version = arena->version;
	memmove(head->name, arena->name, sizeof head->name);
	head->blocksize = arena->blocksize;
	head->size = arena->size;
}
   1238 
   1239 void
   1240 shahead(Shabuf *sb, vlong offset0, ArenaHead *head)
   1241 {
   1242 	uchar headbuf[MaxDiskBlock];
   1243 
   1244 	sb->offset = offset0;
   1245 	memset(headbuf, 0, sizeof headbuf);
   1246 	packarenahead(head, headbuf);
   1247 	sbupdate(sb, headbuf, offset0, head->blocksize);
   1248 }
   1249 
   1250 u32int
   1251 newclumpmagic(int version)
   1252 {
   1253 	u32int m;
   1254 
   1255 	if(version == ArenaVersion4)
   1256 		return _ClumpMagic;
   1257 	do{
   1258 		m = fastrand();
   1259 	}while(m==0 || m == _ClumpMagic);
   1260 	return m;
   1261 }
   1262 
   1263 /*
   1264  * Poke around in the arena to find the clump data
   1265  * and compute the relevant statistics.
   1266  */
   1267 void
   1268 guessarena(vlong offset0, int anum, ArenaHead *head, Arena *arena,
   1269 	uchar *oldscore, uchar *score)
   1270 {
   1271 	uchar dbuf[MaxDiskBlock];
   1272 	int needtozero, clumps, nb1, nb2, minclumps;
   1273 	int inbad, n, ncib, printed, sealing, smart;
   1274 	u32int magic;
   1275 	uchar *sp, *ep, *p;
   1276 	vlong boffset, eoffset, lastclumpend, leaked;
   1277 	vlong offset, toffset, totalcorrupt, v;
   1278 	Clump cl;
   1279 	ClumpInfo *bci, *ci, *eci, *xci;
   1280 	Cit *bcit, *cit, *ecit;
   1281 	Shabuf oldsha, newsha;
   1282 
   1283 	/*
   1284 	 * We expect to find an arena, with data, between offset
   1285 	 * and offset+arenasize.  With any luck, the data starts at
   1286 	 * offset+ap.blocksize.  The blocks have variable size and
   1287 	 * aren't padded at all, which doesn't give us any alignment
   1288 	 * constraints.  The blocks are compressed or high entropy,
   1289 	 * but the headers are pretty low entropy (except the score):
   1290 	 *
   1291 	 *	type[1] (range 0 thru 9, 13)
   1292 	 *	size[2]
   1293 	 *	uncsize[2] (<= size)
   1294 	 *
   1295 	 * so we can look for these.  We check the scores as we go,
   1296 	 * so we can't make any wrong turns.  If we find ourselves
   1297 	 * in a dead end, scan forward looking for a new start.
   1298 	 */
   1299 
   1300 	resetcibuf();
   1301 	memset(head, 0, sizeof *head);
   1302 	memset(arena, 0, sizeof *arena);
   1303 	memset(oldscore, 0, VtScoreSize);
   1304 	memset(score, 0, VtScoreSize);
   1305 	memset(&oldsha, 0, sizeof oldsha);
   1306 	memset(&newsha, 0, sizeof newsha);
   1307 	newsha.rollback = 1;
   1308 
   1309 	if(0){
   1310 		sbdebug(&oldsha, "old.sha");
   1311 		sbdebug(&newsha, "new.sha");
   1312 	}
   1313 
   1314 	loadarenabasics(offset0, anum, head, arena);
   1315 
   1316 	/* start the clump hunt */
   1317 
   1318 	clumps = 0;
   1319 	totalcorrupt = 0;
   1320 	sealing = 1;
   1321 	boffset = offset0 + arena->blocksize;
   1322 	offset = boffset;
   1323 	eoffset = offset0+arena->size - arena->blocksize;
   1324 	toffset = eoffset;
   1325 	sp = pagein(offset0, 4*M);
   1326 
   1327 	if(arena->diskstats.sealed){
   1328 		oldsha.offset = offset0;
   1329 		sbupdate(&oldsha, sp, offset0, 4*M);
   1330 	}
   1331 	ep = sp+4*M;
   1332 	p = sp + (boffset - offset0);
   1333 	ncib = arena->blocksize / ClumpInfoSize;	/* ci per block in index */
   1334 	lastclumpend = offset;
   1335 	nbad = 0;
   1336 	inbad = 0;
   1337 	needtozero = 0;
   1338 	minclumps = 0;
   1339 	while(offset < eoffset){
   1340 		/*
   1341 		 * Shift buffer if we're running out of room.
   1342 		 */
   1343 		if(p+70*K >= ep){
   1344 			/*
   1345 			 * Start the post SHA1 buffer.   By now we should know the
   1346 			 * clumpmagic and arena version, so we can create a
   1347 			 * correct head block to get things going.
   1348 			 */
   1349 			if(sealing && fix && newsha.offset == 0){
   1350 				newsha.offset = offset0;
   1351 				if(arena->clumpmagic == 0){
   1352 					if(arena->version == 0)
   1353 						arena->version = ArenaVersion5;
   1354 					arena->clumpmagic = newclumpmagic(arena->version);
   1355 				}
   1356 				head->clumpmagic = arena->clumpmagic;
   1357 				shahead(&newsha, offset0, head);
   1358 			}
   1359 			n = 4*M-256*K;
   1360 			if(sealing && fix){
   1361 				sbdiskhash(&newsha, bufoffset);
   1362 				sbupdate(&newsha, buf, bufoffset, 4*M-256*K);
   1363 			}
   1364 			pagein(bufoffset+n, 4*M);
   1365 			p -= n;
   1366 			if(arena->diskstats.sealed)
   1367 				sbupdate(&oldsha, buf, bufoffset, 4*M);
   1368 		}
   1369 
   1370 		/*
   1371 		 * Check for a clump at p, which is at offset in the disk.
   1372 		 * Duplicate clumps happen in corrupted disks
   1373 		 * (the same pattern gets written many times in a row)
   1374 		 * and should never happen during regular use.
   1375 		 */
   1376 		magic = 0;
   1377 		if((n = isclump(p, &cl, &magic)) > 0){
   1378 			/*
   1379 			 * If we were in the middle of some corrupted data,
   1380 			 * flush a warning about it and then add any clump
   1381 			 * info blocks as necessary.
   1382 			 */
   1383 			if(inbad){
   1384 				inbad = 0;
   1385 				v = offset-lastclumpend;
   1386 				if(needtozero){
   1387 					zerorange(lastclumpend, v);
   1388 					sbrollback(&newsha, lastclumpend);
   1389 					print("corrupt clump data - %#llux+%#llux (%,llud bytes)\n",
   1390 						lastclumpend, v, v);
   1391 				}
   1392 				addcicorrupt(v);
   1393 				totalcorrupt += v;
   1394 				nb1 = (minclumps+ncib-1)/ncib;
   1395 				minclumps += (v+ClumpSize+VtMaxLumpSize-1)/(ClumpSize+VtMaxLumpSize);
   1396 				nb2 = (minclumps+ncib-1)/ncib;
   1397 				eoffset -= (nb2-nb1)*arena->blocksize;
   1398 			}
   1399 
   1400 			if(haveclump(cl.info.score))
   1401 				print("warning: duplicate clump %d %V at %#llux+%#d\n", cl.info.type, cl.info.score, offset, n);
   1402 
   1403 			/*
   1404 			 * If clumps use different magic numbers, we don't care.
   1405 			 * We'll just use the first one we find and make the others
   1406 			 * follow suit.
   1407 			 */
   1408 			if(arena->clumpmagic == 0){
   1409 				print("clump type %d size %d score %V magic %x\n",
   1410 					cl.info.type, cl.info.size, cl.info.score, magic);
   1411 				arena->clumpmagic = magic;
   1412 				if(magic == _ClumpMagic)
   1413 					arena->version = ArenaVersion4;
   1414 				else
   1415 					arena->version = ArenaVersion5;
   1416 			}
   1417 			if(magic != arena->clumpmagic)
   1418 				p32(p, arena->clumpmagic);
   1419 			if(clumps == 0)
   1420 				arena->ctime = cl.time;
   1421 
   1422 			/*
   1423 			 * Record the clump, update arena stats,
   1424 			 * grow clump info blocks if needed.
   1425 			 */
   1426 			if(verbose > 1)
   1427 				print("\tclump %d: %d %V at %#llux+%#ux (%d)\n",
   1428 					clumps, cl.info.type, cl.info.score, offset, n, n);
   1429 			addcibuf(&cl.info, 0);
   1430 			if(minclumps%ncib == 0)
   1431 				eoffset -= arena->blocksize;
   1432 			minclumps++;
   1433 			clumps++;
   1434 			if(cl.encoding != ClumpENone)
   1435 				arena->diskstats.cclumps++;
   1436 			arena->diskstats.uncsize += cl.info.uncsize;
   1437 			arena->wtime = cl.time;
   1438 
   1439 			/*
   1440 			 * Move to next clump.
   1441 			 */
   1442 			offset += n;
   1443 			p += n;
   1444 			lastclumpend = offset;
   1445 		}else{
   1446 			/*
   1447 			 * Overwrite malformed clump data with zeros later.
   1448 			 * For now, just record whether it needs to be overwritten.
   1449 			 * Bad regions must be of size at least ClumpSize.
   1450 			 * Postponing the overwriting keeps us from writing past
   1451 			 * the end of the arena data (which might be directory data)
   1452 			 * with zeros.
   1453 			 */
   1454 			if(!inbad){
   1455 				inbad = 1;
   1456 				needtozero = 0;
   1457 				if(memcmp(p, zero, ClumpSize) != 0)
   1458 					needtozero = 1;
   1459 				p += ClumpSize;
   1460 				offset += ClumpSize;
   1461 				nbad++;
   1462 			}else{
   1463 				if(*p != 0)
   1464 					needtozero = 1;
   1465 				p++;
   1466 				offset++;
   1467 			}
   1468 		}
   1469 	}
   1470 	pageout();
   1471 
   1472 	if(verbose)
   1473 		print("readable clumps: %d; min. directory entries: %d\n",
   1474 			clumps, minclumps);
   1475 	arena->diskstats.used = lastclumpend - boffset;
   1476 	leaked = eoffset - lastclumpend;
   1477 	if(verbose)
   1478 		print("used from %#llux to %#llux = %,lld (%,lld unused)\n",
   1479 			boffset, lastclumpend, arena->diskstats.used, leaked);
   1480 
   1481 	/*
   1482 	 * Finish the SHA1 of the old data.
   1483 	 */
   1484 	if(arena->diskstats.sealed){
   1485 		sbdiskhash(&oldsha, toffset);
   1486 		readdisk(dbuf, toffset, arena->blocksize);
   1487 		scorecp(dbuf+arena->blocksize-VtScoreSize, zero);
   1488 		sbupdate(&oldsha, dbuf, toffset, arena->blocksize);
   1489 		sbscore(&oldsha, oldscore);
   1490 	}
   1491 
   1492 	/*
   1493 	 * If we still don't know the clump magic, the arena
   1494 	 * must be empty.  It still needs a value, so make
   1495 	 * something up.
   1496 	 */
   1497 	if(arena->version == 0)
   1498 		arena->version = ArenaVersion5;
   1499 	if(arena->clumpmagic == 0){
   1500 		if(arena->version == ArenaVersion4)
   1501 			arena->clumpmagic = _ClumpMagic;
   1502 		else{
   1503 			do
   1504 				arena->clumpmagic = fastrand();
   1505 			while(arena->clumpmagic==_ClumpMagic
   1506 				||arena->clumpmagic==0);
   1507 		}
   1508 		head->clumpmagic = arena->clumpmagic;
   1509 	}
   1510 
   1511 	/*
   1512 	 * Guess at number of clumpinfo blocks to load.
   1513 	 * If we guess high, it's no big deal.  If we guess low,
   1514 	 * we'll be forced into rewriting the whole directory.
   1515 	 * Still not such a big deal.
   1516 	 */
   1517 	if(clumps == 0 || arena->diskstats.used == totalcorrupt)
   1518 		goto Nocib;
   1519 	if(clumps < arena->diskstats.clumps)
   1520 		clumps = arena->diskstats.clumps;
   1521 	if(clumps < ncibuf)
   1522 		clumps = ncibuf;
   1523 	clumps += totalcorrupt/
   1524 		((arena->diskstats.used - totalcorrupt)/clumps);
   1525 	clumps += totalcorrupt/2000;
   1526 	if(clumps < minclumps)
   1527 		clumps = minclumps;
   1528 	clumps += ncib-1;
   1529 	clumps -= clumps%ncib;
   1530 
   1531 	/*
   1532 	 * Can't write into the actual data.
   1533 	 */
   1534 	v = offset0 + arena->size - arena->blocksize;
   1535 	v -= (clumps+ncib-1)/ncib * arena->blocksize;
   1536 	if(v < lastclumpend){
   1537 		v = offset0 + arena->size - arena->blocksize;
   1538 		clumps = (v-lastclumpend)/arena->blocksize * ncib;
   1539 	}
   1540 
   1541 	if(clumps < minclumps)
   1542 		print("cannot happen?\n");
   1543 
   1544 	/*
   1545 	 * Check clumpinfo blocks against directory we created.
   1546 	 * The tricky part is handling the corrupt sections of arena.
   1547 	 * If possible, we remark just the affected directory entries
   1548 	 * rather than slide everything down.
   1549 	 *
   1550 	 * Allocate clumps+1 blocks and check that we don't need
   1551 	 * the last one at the end.
   1552 	 */
   1553 	bci = loadci(offset0, arena, clumps+1);
   1554 	eci = bci+clumps+1;
   1555 	bcit = cibuf;
   1556 	ecit = cibuf+ncibuf;
   1557 
   1558 	smart = 0;	/* Somehow the smart code doesn't do corrupt clumps right. */
   1559 Again:
   1560 	nbad = 0;
   1561 	ci = bci;
   1562 	for(cit=bcit; cit<ecit && ci<eci; cit++){
   1563 		if(cit->corrupt){
   1564 			vlong n, m;
   1565 			if(smart){
   1566 				/*
   1567 				 * If we can, just mark existing entries as corrupt.
   1568 				 */
   1569 				n = cit->corrupt;
   1570 				for(xci=ci; n>0 && xci<eci; xci++)
   1571 					n -= ClumpSize+xci->size;
   1572 				if(n > 0 || xci >= eci)
   1573 					goto Dumb;
   1574 				printed = 0;
   1575 				for(; ci<xci; ci++){
   1576 					if(verbose && ci->type != VtCorruptType){
   1577 						if(!printed){
   1578 							print("marking directory %d-%d as corrupt\n",
   1579 								(int)(ci-bci), (int)(xci-bci));
   1580 							printed = 1;
   1581 						}
   1582 						print("\ttype=%d size=%d uncsize=%d score=%V\n",
   1583 							ci->type, ci->size, ci->uncsize, ci->score);
   1584 					}
   1585 					ci->type = VtCorruptType;
   1586 				}
   1587 			}else{
   1588 			Dumb:
   1589 				print("\trewriting clump directory\n");
   1590 				/*
   1591 				 * Otherwise, blaze a new trail.
   1592 				 */
   1593 				n = cit->corrupt;
   1594 				while(n > 0 && ci < eci){
   1595 					if(n < ClumpSize)
   1596 						sysfatal("bad math in clump corrupt");
   1597 					if(n <= VtMaxLumpSize+ClumpSize)
   1598 						m = n;
   1599 					else{
   1600 						m = VtMaxLumpSize+ClumpSize;
   1601 						if(n-m < ClumpSize)
   1602 							m -= ClumpSize;
   1603 					}
   1604 					ci->type = VtCorruptType;
   1605 					ci->size = m-ClumpSize;
   1606 					ci->uncsize = m-ClumpSize;
   1607 					memset(ci->score, 0, VtScoreSize);
   1608 					ci++;
   1609 					n -= m;
   1610 				}
   1611 			}
   1612 			continue;
   1613 		}
   1614 		if(clumpinfocmp(&cit->ci, ci) != 0){
   1615 			if(verbose && (smart || verbose>1)){
   1616 				print("clumpinfo %d\n", (int)(ci-bci));
   1617 				print("\twant: %d %d %d %V\n",
   1618 					cit->ci.type, cit->ci.size,
   1619 					cit->ci.uncsize, cit->ci.score);
   1620 				print("\thave: %d %d %d %V\n",
   1621 					ci->type, ci->size,
   1622 					ci->uncsize, ci->score);
   1623 			}
   1624 			*ci = cit->ci;
   1625 			nbad++;
   1626 		}
   1627 		ci++;
   1628 	}
   1629 	if(ci >= eci || cit < ecit){
   1630 		print("ran out of space editing existing directory; rewriting\n");
   1631 		print("# eci %ld ci %ld ecit %ld cit %ld\n", eci-bci, ci-bci, ecit-bcit, cit-bcit);
   1632 		assert(smart);	/* can't happen second time thru */
   1633 		smart = 0;
   1634 		goto Again;
   1635 	}
   1636 
   1637 	assert(ci <= eci);
   1638 	arena->diskstats.clumps = ci-bci;
   1639 	eoffset = writeci(offset0, arena, bci, ci-bci);
   1640 	if(sealing && fix)
   1641 		sbrollback(&newsha, v);
   1642 print("eoffset=%lld lastclumpend=%lld diff=%lld unseal=%d\n", eoffset, lastclumpend, eoffset-lastclumpend, unseal);
   1643 	if(lastclumpend > eoffset)
   1644 		print("arena directory overwrote blocks!  cannot happen!\n");
   1645 	free(bci);
   1646 	if(smart && nbad)
   1647 		print("arena directory has %d bad or missing entries\n", nbad);
   1648 Nocib:
   1649 	if(eoffset - lastclumpend > 64*1024 && (!arena->diskstats.sealed || unseal)){
   1650 		if(arena->diskstats.sealed)
   1651 			print("unsealing arena\n");
   1652 		sealing = 0;
   1653 		memset(oldscore, 0, VtScoreSize);
   1654 	}
   1655 
   1656 	/*
   1657 	 * Finish the SHA1 of the new data - only meaningful
   1658 	 * if we've been writing to disk (`fix').
   1659 	 */
   1660 	arena->diskstats.sealed = sealing;
   1661 	arena->memstats = arena->diskstats;
   1662 	if(sealing && fix){
   1663 		uchar tbuf[MaxDiskBlock];
   1664 
   1665 		sbdiskhash(&newsha, toffset);
   1666 		memset(tbuf, 0, sizeof tbuf);
   1667 		packarena(arena, tbuf);
   1668 		sbupdate(&newsha, tbuf, toffset, arena->blocksize);
   1669 		sbscore(&newsha, score);
   1670 	}
   1671 }
   1672 
   1673 void
   1674 dumparena(vlong offset, int anum, Arena *arena)
   1675 {
   1676 	char buf[1000];
   1677 	vlong o, e;
   1678 	int fd, n;
   1679 
   1680 	snprint(buf, sizeof buf, "%s.%d", dumpbase, anum);
   1681 	if((fd = create(buf, OWRITE, 0666)) < 0){
   1682 		fprint(2, "create %s: %r\n", buf);
   1683 		return;
   1684 	}
   1685 	e = offset+arena->size;
   1686 	for(o=offset; o<e; o+=n){
   1687 		n = 4*M;
   1688 		if(o+n > e)
   1689 			n = e-o;
   1690 		if(pwrite(fd, pagein(o, n), n, o-offset) != n){
   1691 			fprint(2, "write %s at %#llux: %r\n", buf, o-offset);
   1692 			return;
   1693 		}
   1694 	}
   1695 }
   1696 
   1697 void
   1698 checkarena(vlong offset, int anum)
   1699 {
   1700 	uchar dbuf[MaxDiskBlock];
   1701 	uchar *p, oldscore[VtScoreSize], score[VtScoreSize];
   1702 	Arena arena, oarena;
   1703 	ArenaHead head;
   1704 	Info *fmt, *fmta;
   1705 	int sz;
   1706 
   1707 	print("# arena %d: offset %#llux\n", anum, offset);
   1708 
   1709 	if(offset >= partend){
   1710 		print("arena offset out of bounds\n");
   1711 		return;
   1712 	}
   1713 
   1714 	guessarena(offset, anum, &head, &arena, oldscore, score);
   1715 
   1716 	if(verbose){
   1717 		print("#\tversion=%d name=%s blocksize=%d size=%z",
   1718 			head.version, head.name, head.blocksize, head.size);
   1719 		if(head.clumpmagic)
   1720 			print(" clumpmagic=%#.8ux", head.clumpmagic);
   1721 		print("\n#\tclumps=%d cclumps=%d used=%,lld uncsize=%,lld\n",
   1722 			arena.diskstats.clumps, arena.diskstats.cclumps,
   1723 			arena.diskstats.used, arena.diskstats.uncsize);
   1724 		print("#\tctime=%t\n", arena.ctime);
   1725 		print("#\twtime=%t\n", arena.wtime);
   1726 		if(arena.diskstats.sealed)
   1727 			print("#\tsealed score=%V\n", score);
   1728 	}
   1729 
   1730 	if(dumpbase){
   1731 		dumparena(offset, anum, &arena);
   1732 		return;
   1733 	}
   1734 
   1735 	memset(dbuf, 0, sizeof dbuf);
   1736 	packarenahead(&head, dbuf);
   1737 	p = pagein(offset, arena.blocksize);
   1738 	if(memcmp(dbuf, p, arena.blocksize) != 0){
   1739 		print("on-disk arena header incorrect\n");
   1740 		showdiffs(dbuf, p, arena.blocksize,
   1741 			arena.version==ArenaVersion4 ? headinfo4 : headinfo5);
   1742 	}
   1743 	memmove(p, dbuf, arena.blocksize);
   1744 
   1745 	memset(dbuf, 0, sizeof dbuf);
   1746 	packarena(&arena, dbuf);
   1747 	if(arena.diskstats.sealed)
   1748 		scorecp(dbuf+arena.blocksize-VtScoreSize, score);
   1749 	p = pagein(offset+arena.size-arena.blocksize, arena.blocksize);
   1750 	memset(&oarena, 0, sizeof oarena);
   1751 	unpackarena(&oarena, p);
   1752 	if(arena.version == ArenaVersion4){
   1753 		sz = ArenaSize4;
   1754 		fmt = tailinfo4;
   1755 		fmta = tailinfo4a;
   1756 	}else{
   1757 		sz = ArenaSize5;
   1758 		fmt = tailinfo5;
   1759 		fmta = tailinfo5a;
   1760 	}
   1761 	if(p[sz] == 1){
   1762 		fmt = fmta;
   1763 		if(oarena.diskstats.sealed){
   1764 			/*
   1765 			 * some arenas were sealed with the extension
   1766 			 * before we adopted the convention that if it didn't
   1767 			 * add new information it gets dropped.
   1768 			 */
   1769 			_packarena(&arena, dbuf, 1);
   1770 		}
   1771 	}
   1772 	if(memcmp(dbuf, p, arena.blocksize-VtScoreSize) != 0){
   1773 		print("on-disk arena tail incorrect\n");
   1774 		showdiffs(dbuf, p, arena.blocksize-VtScoreSize, fmt);
   1775 	}
   1776 	if(arena.diskstats.sealed){
   1777 		if(oarena.diskstats.sealed)
   1778 		if(scorecmp(p+arena.blocksize-VtScoreSize, oldscore) != 0){
   1779 			print("on-disk arena seal score incorrect\n");
   1780 			print("\tcorrect=%V\n", oldscore);
   1781 			print("\t   disk=%V\n", p+arena.blocksize-VtScoreSize);
   1782 		}
   1783 		if(fix && scorecmp(p+arena.blocksize-VtScoreSize, score) != 0){
   1784 			print("%ssealing arena%s: %V\n",
   1785 				oarena.diskstats.sealed ? "re" : "",
   1786 				scorecmp(oldscore, score) == 0 ?
   1787 					"" : " after changes", score);
   1788 		}
   1789 	}
   1790 	memmove(p, dbuf, arena.blocksize);
   1791 
   1792 	pageout();
   1793 }
   1794 
   1795 AMapN*
   1796 buildamap(void)
   1797 {
   1798 	uchar *p;
   1799 	vlong o;
   1800 	ArenaHead h;
   1801 	AMapN *an;
   1802 	AMap *m;
   1803 
   1804 	an = vtmallocz(sizeof *an);
   1805 	for(o=ap.arenabase; o<partend; o+=arenasize){
   1806 		p = pagein(o, Block);
   1807 		if(unpackarenahead(&h, p) >= 0){
   1808 			an->map = vtrealloc(an->map, (an->n+1)*sizeof an->map[0]);
   1809 			m = &an->map[an->n++];
   1810 			m->start = o;
   1811 			m->stop = o+h.size;
   1812 			strcpy(m->name, h.name);
   1813 		}
   1814 	}
   1815 	return an;
   1816 }
   1817 
   1818 void
   1819 checkmap(void)
   1820 {
   1821 	char *s;
   1822 	uchar *p;
   1823 	int i, len;
   1824 	AMapN *an;
   1825 	Fmt fmt;
   1826 
   1827 	an = buildamap();
   1828 	fmtstrinit(&fmt);
   1829 	fmtprint(&fmt, "%ud\n", an->n);
   1830 	for(i=0; i<an->n; i++)
   1831 		fmtprint(&fmt, "%s\t%lld\t%lld\n",
   1832 			an->map[i].name, an->map[i].start, an->map[i].stop);
   1833 	s = fmtstrflush(&fmt);
   1834 	len = strlen(s);
   1835 	if(len > ap.tabsize){
   1836 		print("arena partition map too long: need %z bytes have %z\n",
   1837 			(vlong)len, (vlong)ap.tabsize);
   1838 		len = ap.tabsize;
   1839 	}
   1840 
   1841 	if(ap.tabsize >= 4*M){	/* can't happen - max arenas is 2000 */
   1842 		print("arena partition map *way* too long\n");
   1843 		return;
   1844 	}
   1845 
   1846 	p = pagein(ap.tabbase, ap.tabsize);
   1847 	if(memcmp(p, s, len) != 0){
   1848 		print("arena partition map incorrect; rewriting.\n");
   1849 		memmove(p, s, len);
   1850 	}
   1851 	pageout();
   1852 }
   1853 
/*
 * Stack size for the main thread -- presumably picked up by the
 * thread library at startup (TODO confirm).  guessarena and
 * checkarena each keep MaxDiskBlock-sized buffers on the stack.
 */
int mainstacksize = 512*1024;
   1855 
   1856 void
   1857 threadmain(int argc, char **argv)
   1858 {
   1859 	int mode;
   1860 
   1861 	mode = OREAD;
   1862 	readonly = 1;
   1863 	ARGBEGIN{
   1864 	case 'U':
   1865 		unseal = 1;
   1866 		break;
   1867 	case 'a':
   1868 		arenasize = unittoull(EARGF(usage()));
   1869 		break;
   1870 	case 'b':
   1871 		ap.blocksize = unittoull(EARGF(usage()));
   1872 		break;
   1873 	case 'f':
   1874 		fix = 1;
   1875 		mode = ORDWR;
   1876 		readonly = 0;
   1877 		break;
   1878 	case 'n':
   1879 		basename = EARGF(usage());
   1880 		break;
   1881 	case 'v':
   1882 		verbose++;
   1883 		break;
   1884 	case 'x':
   1885 		dumpbase = EARGF(usage());
   1886 		break;
   1887 	default:
   1888 		usage();
   1889 	}ARGEND
   1890 
   1891 	if(argc != 1 && argc != 2)
   1892 		usage();
   1893 
   1894 	file = argv[0];
   1895 
   1896 	ventifmtinstall();
   1897 	fmtinstall('z', zfmt);
   1898 	fmtinstall('t', tfmt);
   1899 	quotefmtinstall();
   1900 
   1901 	part = initpart(file, mode|ODIRECT);
   1902 	if(part == nil)
   1903 		sysfatal("can't open %s: %r", file);
   1904 	partend = part->size;
   1905 
   1906 	if(isonearena()){
   1907 		checkarena(0, -1);
   1908 		threadexitsall(nil);
   1909 	}
   1910 	checkarenas(argc > 1 ? argv[1] : nil);
   1911 	checkmap();
   1912 	threadexitsall(nil);
   1913 }